summaryrefslogtreecommitdiff
path: root/storage/maria
diff options
context:
space:
mode:
Diffstat (limited to 'storage/maria')
-rw-r--r--storage/maria/CMakeLists.txt84
-rw-r--r--storage/maria/Makefile.am202
-rw-r--r--storage/maria/compat_aliases.cc245
-rw-r--r--storage/maria/compat_aliases.h27
-rw-r--r--storage/maria/file_formats.txt71
-rw-r--r--storage/maria/ft_maria.c48
-rw-r--r--storage/maria/ha_maria.cc3686
-rw-r--r--storage/maria/ha_maria.h197
-rw-r--r--storage/maria/lockman.c786
-rw-r--r--storage/maria/lockman.h76
-rw-r--r--storage/maria/ma_bitmap.c2910
-rw-r--r--storage/maria/ma_blockrec.c7404
-rw-r--r--storage/maria/ma_blockrec.h290
-rw-r--r--storage/maria/ma_cache.c107
-rw-r--r--storage/maria/ma_changed.c33
-rw-r--r--storage/maria/ma_check.c6805
-rw-r--r--storage/maria/ma_check_standalone.h104
-rw-r--r--storage/maria/ma_checkpoint.c1196
-rw-r--r--storage/maria/ma_checkpoint.h92
-rw-r--r--storage/maria/ma_checksum.c89
-rw-r--r--storage/maria/ma_close.c208
-rw-r--r--storage/maria/ma_commit.c129
-rw-r--r--storage/maria/ma_commit.h18
-rw-r--r--storage/maria/ma_control_file.c607
-rw-r--r--storage/maria/ma_control_file.h74
-rw-r--r--storage/maria/ma_create.c1419
-rw-r--r--storage/maria/ma_dbug.c201
-rw-r--r--storage/maria/ma_delete.c1650
-rw-r--r--storage/maria/ma_delete_all.c192
-rw-r--r--storage/maria/ma_delete_table.c107
-rw-r--r--storage/maria/ma_dynrec.c2042
-rw-r--r--storage/maria/ma_extra.c637
-rw-r--r--storage/maria/ma_ft_boolean_search.c1042
-rw-r--r--storage/maria/ma_ft_eval.c254
-rw-r--r--storage/maria/ma_ft_eval.h41
-rw-r--r--storage/maria/ma_ft_nlq_search.c380
-rw-r--r--storage/maria/ma_ft_parser.c417
-rw-r--r--storage/maria/ma_ft_stem.c18
-rw-r--r--storage/maria/ma_ft_test1.c317
-rw-r--r--storage/maria/ma_ft_test1.h420
-rw-r--r--storage/maria/ma_ft_update.c379
-rw-r--r--storage/maria/ma_ftdefs.h156
-rw-r--r--storage/maria/ma_fulltext.h27
-rw-r--r--storage/maria/ma_info.c142
-rw-r--r--storage/maria/ma_init.c184
-rw-r--r--storage/maria/ma_key.c775
-rw-r--r--storage/maria/ma_key_recover.c1432
-rw-r--r--storage/maria/ma_key_recover.h122
-rw-r--r--storage/maria/ma_keycache.c164
-rw-r--r--storage/maria/ma_locking.c554
-rw-r--r--storage/maria/ma_loghandler.c9316
-rw-r--r--storage/maria/ma_loghandler.h506
-rw-r--r--storage/maria/ma_loghandler_lsn.h111
-rw-r--r--storage/maria/ma_open.c1945
-rw-r--r--storage/maria/ma_packrec.c1723
-rw-r--r--storage/maria/ma_page.c619
-rw-r--r--storage/maria/ma_pagecache.c5104
-rw-r--r--storage/maria/ma_pagecache.h325
-rw-r--r--storage/maria/ma_pagecaches.c104
-rw-r--r--storage/maria/ma_pagecrc.c378
-rw-r--r--storage/maria/ma_panic.c140
-rw-r--r--storage/maria/ma_preload.c116
-rw-r--r--storage/maria/ma_range.c312
-rw-r--r--storage/maria/ma_recovery.c3755
-rw-r--r--storage/maria/ma_recovery.h33
-rw-r--r--storage/maria/ma_recovery_util.c146
-rw-r--r--storage/maria/ma_recovery_util.h37
-rw-r--r--storage/maria/ma_rename.c135
-rw-r--r--storage/maria/ma_rfirst.c26
-rw-r--r--storage/maria/ma_rkey.c215
-rw-r--r--storage/maria/ma_rlast.c26
-rw-r--r--storage/maria/ma_rnext.c130
-rw-r--r--storage/maria/ma_rnext_same.c113
-rw-r--r--storage/maria/ma_rprev.c86
-rw-r--r--storage/maria/ma_rrnd.c44
-rw-r--r--storage/maria/ma_rsame.c78
-rw-r--r--storage/maria/ma_rsamepos.c63
-rw-r--r--storage/maria/ma_rt_index.c1343
-rw-r--r--storage/maria/ma_rt_index.h46
-rw-r--r--storage/maria/ma_rt_key.c120
-rw-r--r--storage/maria/ma_rt_key.h31
-rw-r--r--storage/maria/ma_rt_mbr.c818
-rw-r--r--storage/maria/ma_rt_mbr.h40
-rw-r--r--storage/maria/ma_rt_split.c554
-rw-r--r--storage/maria/ma_rt_test.c692
-rw-r--r--storage/maria/ma_scan.c74
-rw-r--r--storage/maria/ma_search.c2397
-rw-r--r--storage/maria/ma_servicethread.c134
-rw-r--r--storage/maria/ma_servicethread.h22
-rw-r--r--storage/maria/ma_sort.c1077
-rw-r--r--storage/maria/ma_sp_defs.h48
-rw-r--r--storage/maria/ma_sp_key.c305
-rw-r--r--storage/maria/ma_sp_test.c568
-rw-r--r--storage/maria/ma_state.c795
-rw-r--r--storage/maria/ma_state.h86
-rw-r--r--storage/maria/ma_static.c109
-rw-r--r--storage/maria/ma_statrec.c302
-rw-r--r--storage/maria/ma_test1.c899
-rw-r--r--storage/maria/ma_test2.c1246
-rw-r--r--storage/maria/ma_test3.c501
-rw-r--r--storage/maria/ma_test_all.res14
-rwxr-xr-xstorage/maria/ma_test_all.sh19
-rw-r--r--storage/maria/ma_test_big.sh22
-rwxr-xr-xstorage/maria/ma_test_force_start.pl238
-rwxr-xr-xstorage/maria/ma_test_recovery8
-rw-r--r--storage/maria/ma_unique.c244
-rw-r--r--storage/maria/ma_update.c253
-rw-r--r--storage/maria/ma_write.c2461
-rw-r--r--storage/maria/maria_chk.c2008
-rw-r--r--storage/maria/maria_def.h1267
-rw-r--r--storage/maria/maria_ftdump.c282
-rw-r--r--storage/maria/maria_pack.c3234
-rw-r--r--storage/maria/maria_read_log.c308
-rwxr-xr-xstorage/maria/maria_rename.sh17
-rw-r--r--storage/maria/plug.in19
-rw-r--r--storage/maria/tablockman.c674
-rw-r--r--storage/maria/tablockman.h87
-rwxr-xr-xstorage/maria/test_pack10
-rw-r--r--storage/maria/trnman.c979
-rw-r--r--storage/maria/trnman.h67
-rw-r--r--storage/maria/trnman_public.h85
-rw-r--r--storage/maria/unittest/CMakeLists.txt95
-rw-r--r--storage/maria/unittest/Makefile.am115
-rw-r--r--storage/maria/unittest/lockman-t.c308
-rw-r--r--storage/maria/unittest/lockman1-t.c334
-rw-r--r--storage/maria/unittest/lockman2-t.c361
-rw-r--r--storage/maria/unittest/ma_control_file-t.c592
-rw-r--r--storage/maria/unittest/ma_loghandler_examples.c65
-rw-r--r--storage/maria/unittest/ma_maria_log_cleanup.c64
-rw-r--r--storage/maria/unittest/ma_pagecache_consist.c498
-rw-r--r--storage/maria/unittest/ma_pagecache_rwconsist.c362
-rw-r--r--storage/maria/unittest/ma_pagecache_rwconsist2.c358
-rw-r--r--storage/maria/unittest/ma_pagecache_single.c853
-rwxr-xr-xstorage/maria/unittest/ma_test_all-t710
-rw-r--r--storage/maria/unittest/ma_test_loghandler-t.c661
-rw-r--r--storage/maria/unittest/ma_test_loghandler_first_lsn-t.c160
-rw-r--r--storage/maria/unittest/ma_test_loghandler_max_lsn-t.c156
-rw-r--r--storage/maria/unittest/ma_test_loghandler_multigroup-t.c746
-rw-r--r--storage/maria/unittest/ma_test_loghandler_multithread-t.c556
-rw-r--r--storage/maria/unittest/ma_test_loghandler_noflush-t.c146
-rw-r--r--storage/maria/unittest/ma_test_loghandler_nologs-t.c195
-rw-r--r--storage/maria/unittest/ma_test_loghandler_pagecache-t.c200
-rw-r--r--storage/maria/unittest/ma_test_loghandler_purge-t.c192
-rw-r--r--storage/maria/unittest/ma_test_recovery.expected1578
-rwxr-xr-xstorage/maria/unittest/ma_test_recovery.pl481
-rw-r--r--storage/maria/unittest/sequence_storage.c110
-rw-r--r--storage/maria/unittest/sequence_storage.h28
-rw-r--r--storage/maria/unittest/test_file.c118
-rw-r--r--storage/maria/unittest/test_file.h29
-rw-r--r--storage/maria/unittest/trnman-t.c175
150 files changed, 98895 insertions, 0 deletions
diff --git a/storage/maria/CMakeLists.txt b/storage/maria/CMakeLists.txt
new file mode 100644
index 00000000000..7b5b190bd57
--- /dev/null
+++ b/storage/maria/CMakeLists.txt
@@ -0,0 +1,84 @@
+# Copyright (C) 2007 MySQL AB
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+SET(ARIA_SOURCES ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c
+ ma_rnext.c ma_rnext_same.c
+ ma_search.c ma_page.c ma_key_recover.c ma_key.c
+ ma_locking.c ma_state.c
+ ma_rrnd.c ma_scan.c ma_cache.c
+ ma_statrec.c ma_packrec.c ma_dynrec.c
+ ma_blockrec.c ma_bitmap.c
+ ma_update.c ma_write.c ma_unique.c
+ ma_delete.c
+ ma_rprev.c ma_rfirst.c ma_rlast.c ma_rsame.c
+ ma_rsamepos.c ma_panic.c ma_close.c ma_create.c
+ ma_range.c ma_dbug.c ma_checksum.c
+ ma_changed.c ma_static.c ma_delete_all.c
+ ma_delete_table.c ma_rename.c ma_check.c
+ ma_keycache.c ma_preload.c ma_ft_parser.c
+ ma_ft_update.c ma_ft_boolean_search.c
+ ma_ft_nlq_search.c ft_maria.c ma_sort.c
+ ha_maria.cc trnman.c lockman.c tablockman.c
+ ma_rt_index.c ma_rt_key.c ma_rt_mbr.c ma_rt_split.c
+ ma_sp_key.c ma_control_file.c ma_loghandler.c
+ ma_pagecache.c ma_pagecaches.c compat_aliases.cc compat_aliases.h
+ ma_checkpoint.c ma_recovery.c ma_commit.c ma_pagecrc.c
+ ha_maria.h maria_def.h ma_recovery_util.c ma_servicethread.c
+)
+
+MYSQL_ADD_PLUGIN(aria ${ARIA_SOURCES}
+ STORAGE_ENGINE
+ MANDATORY
+ RECOMPILE_FOR_EMBEDDED)
+
+TARGET_LINK_LIBRARIES(aria myisam)
+
+MYSQL_ADD_EXECUTABLE(aria_ftdump maria_ftdump.c)
+TARGET_LINK_LIBRARIES(aria_ftdump aria)
+
+MYSQL_ADD_EXECUTABLE(aria_chk maria_chk.c)
+TARGET_LINK_LIBRARIES(aria_chk aria)
+
+MYSQL_ADD_EXECUTABLE(aria_read_log maria_read_log.c)
+TARGET_LINK_LIBRARIES(aria_read_log aria)
+
+MYSQL_ADD_EXECUTABLE(aria_dump_log ma_loghandler.c unittest/ma_loghandler_examples.c)
+TARGET_LINK_LIBRARIES(aria_dump_log aria)
+SET_TARGET_PROPERTIES(aria_dump_log PROPERTIES COMPILE_FLAGS "-DMARIA_DUMP_LOG")
+
+MYSQL_ADD_EXECUTABLE(aria_pack maria_pack.c)
+TARGET_LINK_LIBRARIES(aria_pack aria)
+
+IF(WITH_UNIT_TESTS AND FALSE)
+ ADD_EXECUTABLE(ma_test1 ma_test1.c)
+ TARGET_LINK_LIBRARIES(ma_test1 aria)
+
+ ADD_EXECUTABLE(ma_test2 ma_test2.c)
+ TARGET_LINK_LIBRARIES(ma_test2 aria)
+
+ ADD_EXECUTABLE(ma_test3 ma_test3.c)
+ TARGET_LINK_LIBRARIES(ma_test3 aria)
+
+ ADD_EXECUTABLE(ma_rt_test ma_rt_test.c)
+ TARGET_LINK_LIBRARIES(ma_rt_test aria)
+
+ ADD_EXECUTABLE(ma_sp_test ma_sp_test.c)
+ TARGET_LINK_LIBRARIES(ma_sp_test aria)
+ENDIF()
+
+IF (MSVC)
+ SET_TARGET_PROPERTIES(aria_chk aria_pack PROPERTIES LINK_FLAGS "setargv.obj")
+ENDIF()
+
diff --git a/storage/maria/Makefile.am b/storage/maria/Makefile.am
new file mode 100644
index 00000000000..a83063a0226
--- /dev/null
+++ b/storage/maria/Makefile.am
@@ -0,0 +1,202 @@
+# Copyright (C) 2000-2008 MySQL AB
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+MYSQLDATAdir = $(localstatedir)
+MYSQLSHAREdir = $(pkgdatadir)
+MYSQLBASEdir= $(prefix)
+MYSQLLIBdir= $(pkglibdir)
+INCLUDES = -I$(top_srcdir)/include -I$(top_builddir)/include \
+ -I$(top_srcdir)/regex \
+ -I$(top_srcdir)/sql \
+ -I$(srcdir)
+WRAPLIBS=
+
+LDADD =
+
+DEFS = @DEFS@
+
+# "." is needed first because tests in unittest need libaria
+SUBDIRS = . unittest
+
+EXTRA_DIST = ma_test_all.sh ma_test_all.res ma_test_big.sh \
+ ma_ft_stem.c CMakeLists.txt plug.in ma_test_recovery
+pkgdata_DATA =
+pkglib_LIBRARIES = libaria.a
+bin_PROGRAMS = aria_chk aria_pack aria_ftdump aria_read_log \
+ aria_dump_log
+aria_chk_DEPENDENCIES= $(LIBRARIES)
+# Only reason to link with libmyisam.a here is that it's where some fulltext
+# pieces are (but soon we'll remove fulltext dependencies from Aria).
+# For now, it imposes that storage/myisam be built before storage/maria.
+aria_chk_SOURCES= maria_chk.c
+aria_chk_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+aria_pack_SOURCES= maria_pack.c
+aria_pack_DEPENDENCIES=$(LIBRARIES)
+aria_pack_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+aria_read_log_SOURCES= maria_read_log.c
+aria_read_log_DEPENDENCIES=$(LIBRARIES)
+aria_read_log_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+aria_dump_log_DEPENDENCIES=$(LIBRARIES) ma_loghandler.c
+aria_dump_log_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+aria_dump_log_SOURCES= ma_loghandler.c unittest/ma_loghandler_examples.c
+aria_dump_log_CPPFLAGS= -DMARIA_DUMP_LOG
+noinst_PROGRAMS = ma_test1 ma_test2 ma_test3 ma_rt_test ma_sp_test
+noinst_HEADERS = maria_def.h ma_rt_index.h ma_rt_key.h ma_rt_mbr.h \
+ ma_sp_defs.h ma_fulltext.h ma_ftdefs.h ma_ft_test1.h \
+ ma_ft_eval.h trnman.h lockman.h tablockman.h \
+ ma_control_file.h ha_maria.h ma_blockrec.h \
+ ma_loghandler.h ma_loghandler_lsn.h ma_pagecache.h \
+ ma_checkpoint.h ma_recovery.h ma_commit.h ma_state.h \
+ trnman_public.h ma_check_standalone.h \
+ ma_key_recover.h ma_recovery_util.h \
+ ma_servicethread.h compat_aliases.h
+ma_test1_DEPENDENCIES= $(LIBRARIES)
+ma_test1_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_test2_DEPENDENCIES= $(LIBRARIES)
+ma_test2_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_test3_DEPENDENCIES= $(LIBRARIES)
+ma_test3_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+#ma_ft_test1_DEPENDENCIES= $(LIBRARIES)
+#ma_ft_eval_DEPENDENCIES= $(LIBRARIES)
+aria_ftdump_SOURCES= maria_ftdump.c
+aria_ftdump_DEPENDENCIES= $(LIBRARIES)
+aria_ftdump_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_rt_test_DEPENDENCIES= $(LIBRARIES)
+ma_rt_test_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_sp_test_DEPENDENCIES= $(LIBRARIES)
+ma_sp_test_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+libaria_a_SOURCES = ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c \
+ ma_rnext.c ma_rnext_same.c \
+ ma_search.c ma_page.c ma_key_recover.c ma_key.c \
+ ma_locking.c ma_state.c \
+ ma_rrnd.c ma_scan.c ma_cache.c \
+ ma_statrec.c ma_packrec.c ma_dynrec.c \
+ ma_blockrec.c ma_bitmap.c \
+ ma_update.c ma_write.c ma_unique.c \
+ ma_delete.c \
+ ma_rprev.c ma_rfirst.c ma_rlast.c ma_rsame.c \
+ ma_rsamepos.c ma_panic.c ma_close.c ma_create.c\
+ ma_range.c ma_dbug.c ma_checksum.c \
+ ma_changed.c ma_static.c ma_delete_all.c \
+ ma_delete_table.c ma_rename.c ma_check.c \
+ ma_keycache.c ma_preload.c ma_ft_parser.c \
+ ma_ft_update.c ma_ft_boolean_search.c \
+ ma_ft_nlq_search.c ft_maria.c ma_sort.c \
+ trnman.c lockman.c tablockman.c \
+ ma_rt_index.c ma_rt_key.c ma_rt_mbr.c ma_rt_split.c \
+ ma_sp_key.c ma_control_file.c ma_loghandler.c \
+ ma_pagecache.c ma_pagecaches.c \
+ ma_checkpoint.c ma_recovery.c ma_commit.c \
+ ma_pagecrc.c ma_recovery_util.c \
+ ha_maria.cc compat_aliases.cc ma_servicethread.c
+CLEANFILES = test?.MA? FT?.MA? isam.log ma_test_all ma_rt_test.MA? sp_test.MA? aria_log_control aria_log.0000*
+
+SUFFIXES = .sh
+
+.sh:
+ @RM@ -f $@ $@-t
+ @SED@ \
+ -e 's!@''bindir''@!$(bindir)!g' \
+ -e 's!@''scriptdir''@!$(bindir)!g' \
+ -e 's!@''prefix''@!$(prefix)!g' \
+ -e 's!@''datadir''@!$(datadir)!g' \
+ -e 's!@''localstatedir''@!$(localstatedir)!g' \
+ -e 's!@''libexecdir''@!$(libexecdir)!g' \
+ -e 's!@''CC''@!@CC@!'\
+ -e 's!@''CXX''@!@CXX@!'\
+ -e 's!@''GXX''@!@GXX@!'\
+ -e 's!@''PERL''@!@PERL@!' \
+ -e 's!@''CFLAGS''@!@SAVE_CFLAGS@!'\
+ -e 's!@''CXXFLAGS''@!@SAVE_CXXFLAGS@!'\
+ -e 's!@''LDFLAGS''@!@SAVE_LDFLAGS@!'\
+ -e 's!@''VERSION''@!@VERSION@!' \
+ -e 's!@''MYSQL_SERVER_SUFFIX''@!@MYSQL_SERVER_SUFFIX@!' \
+ -e 's!@''COMPILATION_COMMENT''@!@COMPILATION_COMMENT@!' \
+ -e 's!@''MACHINE_TYPE''@!@MACHINE_TYPE@!' \
+ -e 's!@''HOSTNAME''@!@HOSTNAME@!' \
+ -e 's!@''SYSTEM_TYPE''@!@SYSTEM_TYPE@!' \
+ -e 's!@''CHECK_PID''@!@CHECK_PID@!' \
+ -e 's!@''FIND_PROC''@!@FIND_PROC@!' \
+ -e 's!@''MYSQLD_DEFAULT_SWITCHES''@!@MYSQLD_DEFAULT_SWITCHES@!' \
+ -e 's!@''MYSQL_UNIX_ADDR''@!@MYSQL_UNIX_ADDR@!' \
+ -e 's!@''TARGET_LINUX''@!@TARGET_LINUX@!' \
+ -e "s!@""CONF_COMMAND""@!@CONF_COMMAND@!" \
+ -e 's!@''MYSQLD_USER''@!@MYSQLD_USER@!' \
+ -e 's!@''sysconfdir''@!@sysconfdir@!' \
+ -e 's!@''SHORT_MYSQL_INTRO''@!@SHORT_MYSQL_INTRO@!' \
+ -e 's!@''SHARED_LIB_VERSION''@!@SHARED_LIB_VERSION@!' \
+ -e 's!@''MYSQL_BASE_VERSION''@!@MYSQL_BASE_VERSION@!' \
+ -e 's!@''MYSQL_NO_DASH_VERSION''@!@MYSQL_NO_DASH_VERSION@!' \
+ -e 's!@''MYSQL_TCP_PORT''@!@MYSQL_TCP_PORT@!' \
+ -e 's!@''PERL_DBI_VERSION''@!@PERL_DBI_VERSION@!' \
+ -e 's!@''PERL_DBD_VERSION''@!@PERL_DBD_VERSION@!' \
+ -e 's!@''PERL_DATA_DUMPER''@!@PERL_DATA_DUMPER@!' \
+ $< > $@-t
+ @CHMOD@ +x $@-t
+ @MV@ $@-t $@
+
+tags:
+ etags *.h *.c *.cc
+
+unittests = unittest
+
+test:
+ perl $(top_srcdir)/unittest/unit.pl run $(unittests)
+
+test-verbose:
+ HARNESS_VERBOSE=1 perl $(top_srcdir)/unittest/unit.pl run $(unittests)
+
+# Don't update the files from bitkeeper
+%::SCCS/s.%
diff --git a/storage/maria/compat_aliases.cc b/storage/maria/compat_aliases.cc
new file mode 100644
index 00000000000..2d3c67d69a7
--- /dev/null
+++ b/storage/maria/compat_aliases.cc
@@ -0,0 +1,245 @@
+/* Copyright (C) 2010 Monty Program Ab
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ compatibility aliases for system and static variables
+*/
+#include <my_global.h>
+#include <maria.h>
+#include <mysql/plugin.h>
+#include "ma_loghandler.h"
+#include "compat_aliases.h"
+
+ulong block_size_alias;
+static MYSQL_SYSVAR_ULONG(block_size, block_size_alias,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Deprecated, use --aria-block-size instead", 0, 0,
+ MARIA_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH,
+ MARIA_MAX_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH);
+
+ulong checkpoint_interval_alias;
+static MYSQL_SYSVAR_ULONG(checkpoint_interval, checkpoint_interval_alias,
+ PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-checkpoint-interval instead",
+ NULL, NULL, 30, 0, UINT_MAX, 1);
+
+ulong force_start_after_recovery_failures_alias;
+static MYSQL_SYSVAR_ULONG(force_start_after_recovery_failures, force_start_after_recovery_failures_alias,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Deprecated, use --aria-force-start-after-recovery-failures instead",
+ NULL, NULL, 0, 0, UINT_MAX8, 1);
+
+my_bool page_checksum_alias;
+static MYSQL_SYSVAR_BOOL(page_checksum, page_checksum_alias, 0,
+ "Deprecated, use --aria-page-checksum instead", 0, 0, 1);
+
+char *log_dir_path_alias;
+static MYSQL_SYSVAR_STR(log_dir_path, log_dir_path_alias,
+ PLUGIN_VAR_NOSYSVAR | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Deprecated, use --aria-log-dir-path instead",
+ NULL, NULL, mysql_real_data_home);
+
+ulong log_file_size_alias;
+static MYSQL_SYSVAR_ULONG(log_file_size, log_file_size_alias,
+ PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-log-file-size instead",
+ NULL, NULL, TRANSLOG_FILE_SIZE,
+ TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
+
+ulong group_commit_alias;
+static MYSQL_SYSVAR_ENUM(group_commit, group_commit_alias,
+ PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-group-commit instead",
+ NULL, NULL,
+ TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+ulong group_commit_interval_alias;
+static MYSQL_SYSVAR_ULONG(group_commit_interval, group_commit_interval_alias,
+ PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-group-commit-interval instead",
+ NULL, NULL, 0, 0, UINT_MAX, 1);
+
+ulong log_purge_type_alias;
+static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type_alias,
+ PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-log-purge-type instead",
+ NULL, NULL, TRANSLOG_PURGE_IMMIDIATE,
+ &maria_translog_purge_type_typelib);
+
+ulonglong max_sort_file_size_alias;
+static MYSQL_SYSVAR_ULONGLONG(max_sort_file_size, max_sort_file_size_alias,
+ PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-max-temp-length instead",
+ 0, 0, MAX_FILE_SIZE, 0, MAX_FILE_SIZE, 1024*1024);
+
+ulong pagecache_age_threshold_alias;
+static MYSQL_SYSVAR_ULONG(pagecache_age_threshold, pagecache_age_threshold_alias,
+ PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-pagecache-age-threshold instead",
+ 0, 0, 300, 100, ~0L, 100);
+
+ulonglong pagecache_buffer_size_alias;
+static MYSQL_SYSVAR_ULONGLONG(pagecache_buffer_size, pagecache_buffer_size_alias,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Deprecated, use --aria-pagecache-buffer-size instead",
+ 0, 0, KEY_CACHE_SIZE, MALLOC_OVERHEAD, ~0UL, IO_SIZE);
+
+ulong pagecache_division_limit_alias;
+static MYSQL_SYSVAR_ULONG(pagecache_division_limit, pagecache_division_limit_alias,
+ PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-pagecache-division-limit instead",
+ 0, 0, 100, 1, 100, 1);
+
+ulong recover_alias;
+static MYSQL_SYSVAR_ENUM(recover, recover_alias, PLUGIN_VAR_OPCMDARG,
+ "Deprecated, use --aria-recover instead",
+ NULL, NULL, HA_RECOVER_DEFAULT, &maria_recover_typelib);
+
+ulong repair_threads_alias;
+static MYSQL_THDVAR_ULONG(repair_threads, PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-repair-threads instead",
+ 0, 0, 1, 1, ~0L, 1);
+
+ulong sort_buffer_size_alias;
+static MYSQL_THDVAR_ULONG(sort_buffer_size, PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-sort-buffer-size instead",
+ 0, 0, 128L*1024L*1024L, 4, ~0L, 1);
+
+ulong stats_method_alias;
+static MYSQL_THDVAR_ENUM(stats_method, PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-stats-method instead",
+ 0, 0, 0, &maria_stats_method_typelib);
+
+ulong sync_log_dir_alias;
+static MYSQL_SYSVAR_ENUM(sync_log_dir, sync_log_dir_alias,
+ PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use --aria-sync-log-dir instead",
+ NULL, NULL, TRANSLOG_SYNC_DIR_NEWFILE,
+ &maria_sync_log_dir_typelib);
+
+my_bool used_for_temp_tables_alias= 1;
+static MYSQL_SYSVAR_BOOL(used_for_temp_tables,
+ used_for_temp_tables_alias, PLUGIN_VAR_READONLY | PLUGIN_VAR_NOCMDOPT,
+ NULL, 0, 0, 1);
+
+static struct st_mysql_show_var status_variables_aliases[]= {
+ {"Maria", (char*) &status_variables, SHOW_ARRAY},
+ {NullS, NullS, SHOW_LONG}
+};
+
+/*
+ There is one problem with aliases for command-line options.
+ Plugin initialization works like this
+
+ for all plugins:
+ prepare command-line options
+ initialize command-line option variables to the default values
+ parse command line, assign values as necessary
+
+ for all plugins:
+ call the plugin initialization function
+
+ it means, we cannot have maria* and aria* command-line options to use
+ the same underlying variables - because after assigning maria* values,
+ MySQL will put there default values again preparing for parsing aria*
+ values. So, maria* values will be lost.
+
+ So, we create separate set of variables for maria* options,
+ and take both values into account in ha_maria_init().
+
+ When the command line was parsed, we patch maria* options
+ to use the same variables as aria* options so that
+ set @@maria_some_var would have the same value as @@aria_some_var
+ without forcing us to copy the values around all the time.
+*/
+
+static struct st_mysql_sys_var* system_variables_aliases[]= {
+ MYSQL_SYSVAR(block_size),
+ MYSQL_SYSVAR(checkpoint_interval),
+ MYSQL_SYSVAR(force_start_after_recovery_failures),
+ MYSQL_SYSVAR(group_commit),
+ MYSQL_SYSVAR(group_commit_interval),
+ MYSQL_SYSVAR(log_dir_path),
+ MYSQL_SYSVAR(log_file_size),
+ MYSQL_SYSVAR(log_purge_type),
+ MYSQL_SYSVAR(max_sort_file_size),
+ MYSQL_SYSVAR(page_checksum),
+ MYSQL_SYSVAR(pagecache_age_threshold),
+ MYSQL_SYSVAR(pagecache_buffer_size),
+ MYSQL_SYSVAR(pagecache_division_limit),
+ MYSQL_SYSVAR(recover),
+ MYSQL_SYSVAR(repair_threads),
+ MYSQL_SYSVAR(sort_buffer_size),
+ MYSQL_SYSVAR(stats_method),
+ MYSQL_SYSVAR(sync_log_dir),
+ MYSQL_SYSVAR(used_for_temp_tables),
+ NULL
+};
+
+#define COPY_SYSVAR(name) \
+ memcpy(&MYSQL_SYSVAR_NAME(name), system_variables[i++], \
+ sizeof(MYSQL_SYSVAR_NAME(name))); \
+ if (name ## _alias != MYSQL_SYSVAR_NAME(name).def_val && \
+ *MYSQL_SYSVAR_NAME(name).value == MYSQL_SYSVAR_NAME(name).def_val) \
+ *MYSQL_SYSVAR_NAME(name).value= name ## _alias;
+
+#define COPY_THDVAR(name) \
+ name ## _alias= THDVAR(0, name); \
+ memcpy(&MYSQL_SYSVAR_NAME(name), system_variables[i++], \
+ sizeof(MYSQL_SYSVAR_NAME(name))); \
+ if (name ## _alias != MYSQL_SYSVAR_NAME(name).def_val && \
+ THDVAR(0, name) == MYSQL_SYSVAR_NAME(name).def_val) \
+ THDVAR(0, name)= name ## _alias;
+
+void copy_variable_aliases()
+{
+ int i= 0;
+ COPY_SYSVAR(block_size);
+ COPY_SYSVAR(checkpoint_interval);
+ COPY_SYSVAR(force_start_after_recovery_failures);
+ COPY_SYSVAR(group_commit);
+ COPY_SYSVAR(group_commit_interval);
+ COPY_SYSVAR(log_dir_path);
+ COPY_SYSVAR(log_file_size);
+ COPY_SYSVAR(log_purge_type);
+ COPY_SYSVAR(max_sort_file_size);
+ COPY_SYSVAR(page_checksum);
+ COPY_SYSVAR(pagecache_age_threshold);
+ COPY_SYSVAR(pagecache_buffer_size);
+ COPY_SYSVAR(pagecache_division_limit);
+ COPY_SYSVAR(recover);
+ COPY_THDVAR(repair_threads);
+ COPY_THDVAR(sort_buffer_size);
+ COPY_THDVAR(stats_method);
+ COPY_SYSVAR(sync_log_dir);
+ COPY_SYSVAR(used_for_temp_tables);
+}
+
+struct st_maria_plugin compat_aliases= {
+ MYSQL_DAEMON_PLUGIN,
+ &maria_storage_engine,
+ "Maria",
+ "Monty Program Ab",
+ "Compatibility aliases for the Aria engine",
+ PLUGIN_LICENSE_GPL,
+ NULL,
+ NULL,
+ 0x0105,
+ status_variables_aliases,
+ system_variables_aliases,
+ "1.5",
+ MariaDB_PLUGIN_MATURITY_GAMMA
+};
+
diff --git a/storage/maria/compat_aliases.h b/storage/maria/compat_aliases.h
new file mode 100644
index 00000000000..46a4da74eec
--- /dev/null
+++ b/storage/maria/compat_aliases.h
@@ -0,0 +1,27 @@
+/* Copyright (C) 2010 Monty Program Ab
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+extern struct st_maria_plugin compat_aliases;
+extern char mysql_real_data_home[FN_REFLEN];
+extern TYPELIB maria_recover_typelib;
+extern TYPELIB maria_stats_method_typelib;
+extern TYPELIB maria_translog_purge_type_typelib;
+extern TYPELIB maria_sync_log_dir_typelib;
+extern TYPELIB maria_group_commit_typelib;
+extern struct st_mysql_storage_engine maria_storage_engine;
+extern my_bool use_maria_for_temp_tables;
+extern struct st_mysql_sys_var* system_variables[];
+extern st_mysql_show_var status_variables[];
+void copy_variable_aliases();
diff --git a/storage/maria/file_formats.txt b/storage/maria/file_formats.txt
new file mode 100644
index 00000000000..927e8ad985e
--- /dev/null
+++ b/storage/maria/file_formats.txt
@@ -0,0 +1,71 @@
+#
+# This should contain a description of the file format for most Maria files
+#
+
+# Description of the header in the index file
+
+Header, 24 bytes
+
+Pos Length
+
+0 4 file_version
+4 2 options
+6 2 header_length
+8 2 state_info_length
+10 2 base_info_length
+12 2 base_pos
+14 2 key_parts
+16 2 unique_key_parts
+18 1 keys
+19 1 uniques
+20 1 language
+21 1 fulltext_keys
+22 1 data_file_type
+23 1 org_data_file_type
+
+
+Status part
+
+24 2 open_count
+26 2 state_changed
+28 7 create_rename_lsn
+ 7 is_of_horizon
+ 7 skip_redo_lsn
+ 8 state.records
+ 8 state->state.del
+ 8 state->split
+ 8 state->dellink
+ 8 state->first_bitmap_with_space
+ 8 state->state.key_file_length
+ 8 state->state.data_file_length
+ 8 state->state.empty
+ 8 state->state.key_empty
+ 8 state->auto_increment
+ 8 state->state.checksum
+ 4 state->process
+ 4 state->unique
+ 4 state->status
+ 4 state->update_count
+
+ 1 state->sortkey
+ 1 reserved
+
+for each key
+ 8 state->key_root[i]
+
+ 8 state->key_del
+ 4 state->sec_index_changed
+ 4 state->sec_index_used
+ 4 state->version
+ 8 state->key_map
+ 8 state->create_time
+ 8 state->recover_time
+ 8 state->check_time
+ 8 state->records_at_analyze
+
+for each key
+ 4 reserved
+
+for each key part
+ 8 state->rec_per_key_part[i]
+ 4 state->nulls_per_key_part[i]
diff --git a/storage/maria/ft_maria.c b/storage/maria/ft_maria.c
new file mode 100644
index 00000000000..b1b24592593
--- /dev/null
+++ b/storage/maria/ft_maria.c
@@ -0,0 +1,48 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/*
+ This function is for interface functions between fulltext and maria
+*/
+
+#include "ma_ftdefs.h"
+
+FT_INFO *maria_ft_init_search(uint flags, void *info, uint keynr,
+ uchar *query, size_t query_len,
+ CHARSET_INFO *cs, uchar *record)
+{
+ FT_INFO *res;
+ if (flags & FT_BOOL)
+ res= maria_ft_init_boolean_search((MARIA_HA *) info, keynr, query,
+ query_len, cs);
+ else
+ res= maria_ft_init_nlq_search((MARIA_HA *) info, keynr, query, query_len,
+ flags, record);
+ return res;
+}
+
+const struct _ft_vft _ma_ft_vft_nlq = {
+ maria_ft_nlq_read_next, maria_ft_nlq_find_relevance,
+ maria_ft_nlq_close_search, maria_ft_nlq_get_relevance,
+ maria_ft_nlq_reinit_search
+};
+const struct _ft_vft _ma_ft_vft_boolean = {
+ maria_ft_boolean_read_next, maria_ft_boolean_find_relevance,
+ maria_ft_boolean_close_search, maria_ft_boolean_get_relevance,
+ maria_ft_boolean_reinit_search
+};
+
diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
new file mode 100644
index 00000000000..27958285a2e
--- /dev/null
+++ b/storage/maria/ha_maria.cc
@@ -0,0 +1,3686 @@
+/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+ Copyright (C) 2008-2009 Sun Microsystems, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation // gcc: Class implementation
+#endif
+
+#define MYSQL_SERVER 1
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+#include <m_ctype.h>
+#include <my_dir.h>
+#include <myisampack.h>
+#include <my_bit.h>
+#include "ha_maria.h"
+#include "trnman_public.h"
+#include "trnman.h"
+#include "compat_aliases.h"
+
+C_MODE_START
+#include "maria_def.h"
+#include "ma_rt_index.h"
+#include "ma_blockrec.h"
+#include "ma_checkpoint.h"
+#include "ma_recovery.h"
+C_MODE_END
+
+/*
+ Note that in future versions, only *transactional* Maria tables can
+ rollback, so this flag should be up or down conditionally.
+*/
+#ifdef MARIA_CANNOT_ROLLBACK
+#define CANNOT_ROLLBACK_FLAG HA_NO_TRANSACTIONS
+#define trans_register_ha(A, B, C) do { /* nothing */ } while(0)
+#else
+#define CANNOT_ROLLBACK_FLAG 0
+#endif
+#define THD_TRN (*(TRN **)thd_ha_data(thd, maria_hton))
+
+ulong pagecache_division_limit, pagecache_age_threshold;
+ulonglong pagecache_buffer_size;
+
+/**
+ As the auto-repair is initiated when opened from the SQL layer
+ (open_unireg_entry(), check_and_repair()), it does not happen when Maria's
+ Recovery internally opens the table to apply log records to it, which is
+ good. It would happen only after Recovery, if the table is still
+ corrupted.
+*/
+ulong maria_recover_options= HA_RECOVER_NONE;
+handlerton *maria_hton;
+
+/* bits in maria_recover_options */
+const char *maria_recover_names[]=
+{
+ /*
+ Compared to MyISAM, "default" was renamed to "normal" as it collided with
+ SET var=default which sets to the var's default i.e. what happens when the
+ var is not set i.e. HA_RECOVER_NONE.
+ Another change is that OFF is used to disable, not ""; this is to have OFF
+ display in SHOW VARIABLES which is better than "".
+ */
+ "OFF", "NORMAL", "BACKUP", "FORCE", "QUICK", NullS
+};
+TYPELIB maria_recover_typelib=
+{
+ array_elements(maria_recover_names) - 1, "",
+ maria_recover_names, NULL
+};
+
+const char *maria_stats_method_names[]=
+{
+ "nulls_unequal", "nulls_equal",
+ "nulls_ignored", NullS
+};
+TYPELIB maria_stats_method_typelib=
+{
+ array_elements(maria_stats_method_names) - 1, "",
+ maria_stats_method_names, NULL
+};
+
+/* transactions log purge mode */
+const char *maria_translog_purge_type_names[]=
+{
+ "immediate", "external", "at_flush", NullS
+};
+TYPELIB maria_translog_purge_type_typelib=
+{
+ array_elements(maria_translog_purge_type_names) - 1, "",
+ maria_translog_purge_type_names, NULL
+};
+
+/* transactional log directory sync */
+const char *maria_sync_log_dir_names[]=
+{
+ "NEVER", "NEWFILE", "ALWAYS", NullS
+};
+TYPELIB maria_sync_log_dir_typelib=
+{
+ array_elements(maria_sync_log_dir_names) - 1, "",
+ maria_sync_log_dir_names, NULL
+};
+
+/* transactional log group commit */
+const char *maria_group_commit_names[]=
+{
+ "none", "hard", "soft", NullS
+};
+TYPELIB maria_group_commit_typelib=
+{
+ array_elements(maria_group_commit_names) - 1, "",
+ maria_group_commit_names, NULL
+};
+
+/** Interval between background checkpoints in seconds */
+static ulong checkpoint_interval;
+static void update_checkpoint_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+/** After that many consecutive recovery failures, remove logs */
+static ulong force_start_after_recovery_failures;
+static void update_log_file_size(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+
+static MYSQL_SYSVAR_ULONG(block_size, maria_block_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Block size to be used for Aria index pages.", 0, 0,
+ MARIA_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH,
+ MARIA_MAX_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH);
+
+static MYSQL_SYSVAR_ULONG(checkpoint_interval, checkpoint_interval,
+ PLUGIN_VAR_RQCMDARG,
+ "Interval between automatic checkpoints, in seconds; 0 means"
+ " 'no automatic checkpoints' which makes sense only for testing.",
+ NULL, update_checkpoint_interval, 30, 0, UINT_MAX, 1);
+
+static MYSQL_SYSVAR_ULONG(force_start_after_recovery_failures,
+ force_start_after_recovery_failures,
+ /*
+ Read-only because setting it on the fly has no useful effect,
+ should be set on command-line.
+ */
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of consecutive log recovery failures after which logs will be"
+ " automatically deleted to cure the problem; 0 (the default) disables"
+ " the feature.", NULL, NULL, 0, 0, UINT_MAX8, 1);
+
+static MYSQL_SYSVAR_BOOL(page_checksum, maria_page_checksums, 0,
+ "Maintain page checksums (can be overridden per table "
+ "with PAGE_CHECKSUM clause in CREATE TABLE)", 0, 0, 1);
+
+/* It is only command line argument */
+static MYSQL_SYSVAR_STR(log_dir_path, maria_data_root,
+ PLUGIN_VAR_NOSYSVAR | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Path to the directory where to store transactional log",
+ NULL, NULL, mysql_real_data_home);
+
+
+static MYSQL_SYSVAR_ULONG(log_file_size, log_file_size,
+ PLUGIN_VAR_RQCMDARG,
+ "Limit for transaction log size",
+ NULL, update_log_file_size, TRANSLOG_FILE_SIZE,
+ TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
+
+static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies Aria group commit mode. "
+ "Possible values are \"none\" (no group commit), "
+ "\"hard\" (with waiting to actual commit), "
+ "\"soft\" (no wait for commit (DANGEROUS!!!))",
+ NULL, update_maria_group_commit,
+ TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval,
+ PLUGIN_VAR_RQCMDARG,
+ "Interval between commite in microseconds (1/1000000c)."
+ " 0 stands for no waiting"
+ " for other threads to come and do a commit in \"hard\" mode and no"
+ " sync()/commit at all in \"soft\" mode. Option has only an effect"
+ " if aria_group_commit is used",
+ NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1);
+
+static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies how Aria transactional log will be purged. "
+ "Possible values of name are \"immediate\", \"external\" "
+ "and \"at_flush\"",
+ NULL, NULL, TRANSLOG_PURGE_IMMIDIATE,
+ &maria_translog_purge_type_typelib);
+
+static MYSQL_SYSVAR_ULONGLONG(max_sort_file_size,
+ maria_max_temp_length, PLUGIN_VAR_RQCMDARG,
+ "Don't use the fast sort index method to created index if the "
+ "temporary file would get bigger than this.",
+ 0, 0, MAX_FILE_SIZE & ~(1*MB-1), 0, MAX_FILE_SIZE, 1*MB);
+
+static MYSQL_SYSVAR_ULONG(pagecache_age_threshold,
+ pagecache_age_threshold, PLUGIN_VAR_RQCMDARG,
+ "This characterizes the number of hits a hot block has to be untouched "
+ "until it is considered aged enough to be downgraded to a warm block. "
+ "This specifies the percentage ratio of that number of hits to the "
+ "total number of blocks in the page cache.", 0, 0,
+ 300, 100, ~0L, 100);
+
+static MYSQL_SYSVAR_ULONGLONG(pagecache_buffer_size, pagecache_buffer_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The size of the buffer used for index blocks for Aria tables. "
+ "Increase this to get better index handling (for all reads and "
+ "multiple writes) to as much as you can afford.", 0, 0,
+ KEY_CACHE_SIZE, 0, ~(ulong) 0, 1);
+
+static MYSQL_SYSVAR_ULONG(pagecache_division_limit, pagecache_division_limit,
+ PLUGIN_VAR_RQCMDARG,
+ "The minimum percentage of warm blocks in key cache", 0, 0,
+ 100, 1, 100, 1);
+
+static MYSQL_SYSVAR_ENUM(recover, maria_recover_options, PLUGIN_VAR_OPCMDARG,
+ "Specifies how corrupted tables should be automatically repaired."
+ " Possible values are \"NORMAL\" (the default), \"BACKUP\", \"FORCE\","
+ " \"QUICK\", or \"OFF\" which is like not using the option.",
+ NULL, NULL, HA_RECOVER_DEFAULT, &maria_recover_typelib);
+
+static MYSQL_THDVAR_ULONG(repair_threads, PLUGIN_VAR_RQCMDARG,
+ "Number of threads to use when repairing Aria tables. The value of 1 "
+ "disables parallel repair.",
+ 0, 0, 1, 1, ~0L, 1);
+
+static MYSQL_THDVAR_ULONG(sort_buffer_size, PLUGIN_VAR_RQCMDARG,
+ "The buffer that is allocated when sorting the index when doing a "
+ "REPAIR or when creating indexes with CREATE INDEX or ALTER TABLE.",
+ 0, 0, 128L*1024L*1024L, 4, ~0L, 1);
+
+static MYSQL_THDVAR_ENUM(stats_method, PLUGIN_VAR_RQCMDARG,
+ "Specifies how Aria index statistics collection code should treat "
+ "NULLs. Possible values are \"nulls_unequal\", \"nulls_equal\", "
+ "and \"nulls_ignored\".", 0, 0, 0, &maria_stats_method_typelib);
+
+static MYSQL_SYSVAR_ENUM(sync_log_dir, sync_log_dir, PLUGIN_VAR_RQCMDARG,
+ "Controls syncing directory after log file growth and new file "
+ "creation. Possible values are \"never\", \"newfile\" and "
+ "\"always\").", NULL, NULL, TRANSLOG_SYNC_DIR_NEWFILE,
+ &maria_sync_log_dir_typelib);
+
+#ifdef USE_MARIA_FOR_TMP_TABLES
+#define USE_MARIA_FOR_TMP_TABLES_VAL 1
+#else
+#define USE_MARIA_FOR_TMP_TABLES_VAL 0
+#endif
+my_bool use_maria_for_temp_tables= USE_MARIA_FOR_TMP_TABLES_VAL;
+
+static MYSQL_SYSVAR_BOOL(used_for_temp_tables,
+ use_maria_for_temp_tables, PLUGIN_VAR_READONLY | PLUGIN_VAR_NOCMDOPT,
+ "Whether temporary tables should be MyISAM or Aria", 0, 0,
+ 1);
+
+/*****************************************************************************
+** MARIA tables
+*****************************************************************************/
+
+static handler *maria_create_handler(handlerton *hton,
+ TABLE_SHARE * table,
+ MEM_ROOT *mem_root)
+{
+ return new (mem_root) ha_maria(hton, table);
+}
+
+
+// collect errors printed by maria_check routines
+
+static void _ma_check_print_msg(HA_CHECK *param, const char *msg_type,
+ const char *fmt, va_list args)
+{
+ THD *thd= (THD *) param->thd;
+ Protocol *protocol= thd->protocol;
+ uint length, msg_length;
+ char msgbuf[HA_MAX_MSG_BUF];
+ char name[NAME_LEN * 2 + 2];
+
+ msg_length= my_vsnprintf(msgbuf, sizeof(msgbuf), fmt, args);
+ msgbuf[sizeof(msgbuf) - 1]= 0; // healthy paranoia
+
+ DBUG_PRINT(msg_type, ("message: %s", msgbuf));
+
+ if (!thd->vio_ok())
+ {
+ sql_print_error(fmt, args);
+ return;
+ }
+
+ if (param->testflag &
+ (T_CREATE_MISSING_KEYS | T_SAFE_REPAIR | T_AUTO_REPAIR))
+ {
+ my_message(ER_NOT_KEYFILE, msgbuf, MYF(MY_WME));
+ return;
+ }
+ length= (uint) (strxmov(name, param->db_name, ".", param->table_name,
+ NullS) - name);
+ /*
+ TODO: switch from protocol to push_warning here. The main reason we didn't
+ do it yet is parallel repair. Due to the following trace:
+ ma_check_print_msg/push_warning/sql_alloc/my_pthread_getspecific_ptr.
+
+ Also we likely need to lock mutex here (in both cases with protocol and
+ push_warning).
+ */
+ protocol->prepare_for_resend();
+ protocol->store(name, length, system_charset_info);
+ protocol->store(param->op_name, system_charset_info);
+ protocol->store(msg_type, system_charset_info);
+ protocol->store(msgbuf, msg_length, system_charset_info);
+ if (protocol->write())
+ sql_print_error("Failed on my_net_write, writing to stderr instead: %s\n",
+ msgbuf);
+ return;
+}
+
+
+/*
+ Convert TABLE object to Maria key and column definition
+
+ SYNOPSIS
+ table2maria()
+ table_arg in TABLE object.
+ keydef_out out Maria key definition.
+ recinfo_out out Maria column definition.
+ records_out out Number of fields.
+
+ DESCRIPTION
+ This function will allocate and initialize Maria key and column
+ definition for further use in ma_create or for a check for underlying
+ table conformance in merge engine.
+
+ The caller needs to free *recinfo_out after use. Since *recinfo_out
+ and *keydef_out are allocated with a my_multi_malloc, *keydef_out
+ is freed automatically when *recinfo_out is freed.
+
+ RETURN VALUE
+ 0 OK
+ # error code
+*/
+
+static int table2maria(TABLE *table_arg, data_file_type row_type,
+ MARIA_KEYDEF **keydef_out,
+ MARIA_COLUMNDEF **recinfo_out, uint *records_out,
+ MARIA_CREATE_INFO *create_info)
+{
+ uint i, j, recpos, minpos, fieldpos, temp_length, length;
+ enum ha_base_keytype type= HA_KEYTYPE_BINARY;
+ uchar *record;
+ KEY *pos;
+ MARIA_KEYDEF *keydef;
+ MARIA_COLUMNDEF *recinfo, *recinfo_pos;
+ HA_KEYSEG *keyseg;
+ TABLE_SHARE *share= table_arg->s;
+ uint options= share->db_options_in_use;
+ DBUG_ENTER("table2maria");
+
+ if (row_type == BLOCK_RECORD)
+ options|= HA_OPTION_PACK_RECORD;
+
+ if (!(my_multi_malloc(MYF(MY_WME),
+ recinfo_out, (share->fields * 2 + 2) * sizeof(MARIA_COLUMNDEF),
+ keydef_out, share->keys * sizeof(MARIA_KEYDEF),
+ &keyseg,
+ (share->key_parts + share->keys) * sizeof(HA_KEYSEG),
+ NullS)))
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM); /* purecov: inspected */
+ keydef= *keydef_out;
+ recinfo= *recinfo_out;
+ pos= table_arg->key_info;
+ for (i= 0; i < share->keys; i++, pos++)
+ {
+ keydef[i].flag= (uint16) (pos->flags & (HA_NOSAME | HA_FULLTEXT |
+ HA_SPATIAL));
+ keydef[i].key_alg= pos->algorithm == HA_KEY_ALG_UNDEF ?
+ (pos->flags & HA_SPATIAL ? HA_KEY_ALG_RTREE : HA_KEY_ALG_BTREE) :
+ pos->algorithm;
+ keydef[i].block_length= pos->block_size;
+ keydef[i].seg= keyseg;
+ keydef[i].keysegs= pos->key_parts;
+ for (j= 0; j < pos->key_parts; j++)
+ {
+ Field *field= pos->key_part[j].field;
+ type= field->key_type();
+ keydef[i].seg[j].flag= pos->key_part[j].key_part_flag;
+
+ if (options & HA_OPTION_PACK_KEYS ||
+ (pos->flags & (HA_PACK_KEY | HA_BINARY_PACK_KEY |
+ HA_SPACE_PACK_USED)))
+ {
+ if (pos->key_part[j].length > 8 &&
+ (type == HA_KEYTYPE_TEXT ||
+ type == HA_KEYTYPE_NUM ||
+ (type == HA_KEYTYPE_BINARY && !field->zero_pack())))
+ {
+ /* No blobs here */
+ if (j == 0)
+ keydef[i].flag|= HA_PACK_KEY;
+ if (!(field->flags & ZEROFILL_FLAG) &&
+ (field->type() == MYSQL_TYPE_STRING ||
+ field->type() == MYSQL_TYPE_VAR_STRING ||
+ ((int) (pos->key_part[j].length - field->decimals())) >= 4))
+ keydef[i].seg[j].flag|= HA_SPACE_PACK;
+ }
+ else if (j == 0 && (!(pos->flags & HA_NOSAME) || pos->key_length > 16))
+ keydef[i].flag|= HA_BINARY_PACK_KEY;
+ }
+ keydef[i].seg[j].type= (int) type;
+ keydef[i].seg[j].start= pos->key_part[j].offset;
+ keydef[i].seg[j].length= pos->key_part[j].length;
+ keydef[i].seg[j].bit_start= keydef[i].seg[j].bit_end=
+ keydef[i].seg[j].bit_length= 0;
+ keydef[i].seg[j].bit_pos= 0;
+ keydef[i].seg[j].language= field->charset()->number;
+
+ if (field->null_ptr)
+ {
+ keydef[i].seg[j].null_bit= field->null_bit;
+ keydef[i].seg[j].null_pos= (uint) (field->null_ptr-
+ (uchar*) table_arg->record[0]);
+ }
+ else
+ {
+ keydef[i].seg[j].null_bit= 0;
+ keydef[i].seg[j].null_pos= 0;
+ }
+ if (field->type() == MYSQL_TYPE_BLOB ||
+ field->type() == MYSQL_TYPE_GEOMETRY)
+ {
+ keydef[i].seg[j].flag|= HA_BLOB_PART;
+ /* save number of bytes used to pack length */
+ keydef[i].seg[j].bit_start= (uint) (field->pack_length() -
+ share->blob_ptr_size);
+ }
+ else if (field->type() == MYSQL_TYPE_BIT)
+ {
+ keydef[i].seg[j].bit_length= ((Field_bit *) field)->bit_len;
+ keydef[i].seg[j].bit_start= ((Field_bit *) field)->bit_ofs;
+ keydef[i].seg[j].bit_pos= (uint) (((Field_bit *) field)->bit_ptr -
+ (uchar*) table_arg->record[0]);
+ }
+ }
+ keyseg+= pos->key_parts;
+ }
+ if (table_arg->found_next_number_field)
+ keydef[share->next_number_index].flag|= HA_AUTO_KEY;
+ record= table_arg->record[0];
+ recpos= 0;
+ recinfo_pos= recinfo;
+ create_info->null_bytes= table_arg->s->null_bytes;
+
+ while (recpos < (uint) share->stored_rec_length)
+ {
+ Field **field, *found= 0;
+ minpos= share->reclength;
+ length= 0;
+
+ for (field= table_arg->field; *field; field++)
+ {
+ if ((fieldpos= (*field)->offset(record)) >= recpos &&
+ fieldpos <= minpos)
+ {
+ /* skip null fields */
+ if (!(temp_length= (*field)->pack_length_in_rec()))
+ continue; /* Skip null-fields */
+ if (! found || fieldpos < minpos ||
+ (fieldpos == minpos && temp_length < length))
+ {
+ minpos= fieldpos;
+ found= *field;
+ length= temp_length;
+ }
+ }
+ }
+ DBUG_PRINT("loop", ("found: 0x%lx recpos: %d minpos: %d length: %d",
+ (long) found, recpos, minpos, length));
+ if (!found)
+ break;
+
+ if (found->flags & BLOB_FLAG)
+ recinfo_pos->type= FIELD_BLOB;
+ else if (found->type() == MYSQL_TYPE_VARCHAR)
+ recinfo_pos->type= FIELD_VARCHAR;
+ else if (!(options & HA_OPTION_PACK_RECORD) ||
+ (found->zero_pack() && (found->flags & PRI_KEY_FLAG)))
+ recinfo_pos->type= FIELD_NORMAL;
+ else if (found->zero_pack())
+ recinfo_pos->type= FIELD_SKIP_ZERO;
+ else
+ recinfo_pos->type= ((length <= 3 ||
+ (found->flags & ZEROFILL_FLAG)) ?
+ FIELD_NORMAL :
+ found->type() == MYSQL_TYPE_STRING ||
+ found->type() == MYSQL_TYPE_VAR_STRING ?
+ FIELD_SKIP_ENDSPACE :
+ FIELD_SKIP_PRESPACE);
+ if (found->null_ptr)
+ {
+ recinfo_pos->null_bit= found->null_bit;
+ recinfo_pos->null_pos= (uint) (found->null_ptr -
+ (uchar*) table_arg->record[0]);
+ }
+ else
+ {
+ recinfo_pos->null_bit= 0;
+ recinfo_pos->null_pos= 0;
+ }
+ (recinfo_pos++)->length= (uint16) length;
+ recpos= minpos + length;
+ DBUG_PRINT("loop", ("length: %d type: %d",
+ recinfo_pos[-1].length,recinfo_pos[-1].type));
+ }
+ *records_out= (uint) (recinfo_pos - recinfo);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Check for underlying table conformance
+
+ SYNOPSIS
+ maria_check_definition()
+ t1_keyinfo in First table key definition
+ t1_recinfo in First table record definition
+ t1_keys in Number of keys in first table
+ t1_recs in Number of records in first table
+ t2_keyinfo in Second table key definition
+ t2_recinfo in Second table record definition
+ t2_keys in Number of keys in second table
+ t2_recs in Number of records in second table
+ strict in Strict check switch
+
+ DESCRIPTION
+ This function compares two Maria definitions. By intention it was done
+ to compare merge table definition against underlying table definition.
+ It may also be used to compare dot-frm and MAI definitions of Maria
+ table as well to compare different Maria table definitions.
+
+ For merge table it is not required that number of keys in merge table
+ must exactly match number of keys in underlying table. When calling this
+ function for underlying table conformance check, 'strict' flag must be
+ set to false, and converted merge definition must be passed as t1_*.
+
+ Otherwise 'strict' flag must be set to 1 and it is not required to pass
+ converted dot-frm definition as t1_*.
+
+ RETURN VALUE
+ 0 - Equal definitions.
+ 1 - Different definitions.
+
+ TODO
+ - compare FULLTEXT keys;
+ - compare SPATIAL keys;
+ - compare FIELD_SKIP_ZERO which is converted to FIELD_NORMAL correctly
+ (should be correctly detected in table2maria).
+*/
+
+int maria_check_definition(MARIA_KEYDEF *t1_keyinfo,
+ MARIA_COLUMNDEF *t1_recinfo,
+ uint t1_keys, uint t1_recs,
+ MARIA_KEYDEF *t2_keyinfo,
+ MARIA_COLUMNDEF *t2_recinfo,
+ uint t2_keys, uint t2_recs, bool strict)
+{
+ uint i, j;
+ DBUG_ENTER("maria_check_definition");
+ if ((strict ? t1_keys != t2_keys : t1_keys > t2_keys))
+ {
+ DBUG_PRINT("error", ("Number of keys differs: t1_keys=%u, t2_keys=%u",
+ t1_keys, t2_keys));
+ DBUG_RETURN(1);
+ }
+ if (t1_recs != t2_recs)
+ {
+ DBUG_PRINT("error", ("Number of recs differs: t1_recs=%u, t2_recs=%u",
+ t1_recs, t2_recs));
+ DBUG_RETURN(1);
+ }
+ for (i= 0; i < t1_keys; i++)
+ {
+ HA_KEYSEG *t1_keysegs= t1_keyinfo[i].seg;
+ HA_KEYSEG *t2_keysegs= t2_keyinfo[i].seg;
+ if (t1_keyinfo[i].flag & HA_FULLTEXT && t2_keyinfo[i].flag & HA_FULLTEXT)
+ continue;
+ else if (t1_keyinfo[i].flag & HA_FULLTEXT ||
+ t2_keyinfo[i].flag & HA_FULLTEXT)
+ {
+ DBUG_PRINT("error", ("Key %d has different definition", i));
+ DBUG_PRINT("error", ("t1_fulltext= %d, t2_fulltext=%d",
+ test(t1_keyinfo[i].flag & HA_FULLTEXT),
+ test(t2_keyinfo[i].flag & HA_FULLTEXT)));
+ DBUG_RETURN(1);
+ }
+ if (t1_keyinfo[i].flag & HA_SPATIAL && t2_keyinfo[i].flag & HA_SPATIAL)
+ continue;
+ else if (t1_keyinfo[i].flag & HA_SPATIAL ||
+ t2_keyinfo[i].flag & HA_SPATIAL)
+ {
+ DBUG_PRINT("error", ("Key %d has different definition", i));
+ DBUG_PRINT("error", ("t1_spatial= %d, t2_spatial=%d",
+ test(t1_keyinfo[i].flag & HA_SPATIAL),
+ test(t2_keyinfo[i].flag & HA_SPATIAL)));
+ DBUG_RETURN(1);
+ }
+ if (t1_keyinfo[i].keysegs != t2_keyinfo[i].keysegs ||
+ t1_keyinfo[i].key_alg != t2_keyinfo[i].key_alg)
+ {
+ DBUG_PRINT("error", ("Key %d has different definition", i));
+ DBUG_PRINT("error", ("t1_keysegs=%d, t1_key_alg=%d",
+ t1_keyinfo[i].keysegs, t1_keyinfo[i].key_alg));
+ DBUG_PRINT("error", ("t2_keysegs=%d, t2_key_alg=%d",
+ t2_keyinfo[i].keysegs, t2_keyinfo[i].key_alg));
+ DBUG_RETURN(1);
+ }
+ for (j= t1_keyinfo[i].keysegs; j--;)
+ {
+ uint8 t1_keysegs_j__type= t1_keysegs[j].type;
+ /*
+ Table migration from 4.1 to 5.1. In 5.1 a *TEXT key part is
+ always HA_KEYTYPE_VARTEXT2. In 4.1 we had only the equivalent of
+ HA_KEYTYPE_VARTEXT1. Since we treat both the same on MyISAM
+ level, we can ignore a mismatch between these types.
+ */
+ if ((t1_keysegs[j].flag & HA_BLOB_PART) &&
+ (t2_keysegs[j].flag & HA_BLOB_PART))
+ {
+ if ((t1_keysegs_j__type == HA_KEYTYPE_VARTEXT2) &&
+ (t2_keysegs[j].type == HA_KEYTYPE_VARTEXT1))
+ t1_keysegs_j__type= HA_KEYTYPE_VARTEXT1; /* purecov: tested */
+ else if ((t1_keysegs_j__type == HA_KEYTYPE_VARBINARY2) &&
+ (t2_keysegs[j].type == HA_KEYTYPE_VARBINARY1))
+ t1_keysegs_j__type= HA_KEYTYPE_VARBINARY1; /* purecov: inspected */
+ }
+
+ if (t1_keysegs_j__type != t2_keysegs[j].type ||
+ t1_keysegs[j].language != t2_keysegs[j].language ||
+ t1_keysegs[j].null_bit != t2_keysegs[j].null_bit ||
+ t1_keysegs[j].length != t2_keysegs[j].length)
+ {
+ DBUG_PRINT("error", ("Key segment %d (key %d) has different "
+ "definition", j, i));
+ DBUG_PRINT("error", ("t1_type=%d, t1_language=%d, t1_null_bit=%d, "
+ "t1_length=%d",
+ t1_keysegs[j].type, t1_keysegs[j].language,
+ t1_keysegs[j].null_bit, t1_keysegs[j].length));
+ DBUG_PRINT("error", ("t2_type=%d, t2_language=%d, t2_null_bit=%d, "
+ "t2_length=%d",
+ t2_keysegs[j].type, t2_keysegs[j].language,
+ t2_keysegs[j].null_bit, t2_keysegs[j].length));
+
+ DBUG_RETURN(1);
+ }
+ }
+ }
+
+ for (i= 0; i < t1_recs; i++)
+ {
+ MARIA_COLUMNDEF *t1_rec= &t1_recinfo[i];
+ MARIA_COLUMNDEF *t2_rec= &t2_recinfo[i];
+ /*
+ FIELD_SKIP_ZERO can be changed to FIELD_NORMAL in maria_create,
+ see NOTE1 in ma_create.c
+ */
+ if ((t1_rec->type != t2_rec->type &&
+ !(t1_rec->type == (int) FIELD_SKIP_ZERO &&
+ t1_rec->length == 1 &&
+ t2_rec->type == (int) FIELD_NORMAL)) ||
+ t1_rec->length != t2_rec->length ||
+ t1_rec->null_bit != t2_rec->null_bit)
+ {
+ DBUG_PRINT("error", ("Field %d has different definition", i));
+ DBUG_PRINT("error", ("t1_type=%d, t1_length=%d, t1_null_bit=%d",
+ t1_rec->type, t1_rec->length, t1_rec->null_bit));
+ DBUG_PRINT("error", ("t2_type=%d, t2_length=%d, t2_null_bit=%d",
+ t2_rec->type, t2_rec->length, t2_rec->null_bit));
+ DBUG_RETURN(1);
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+extern "C" {
+
+int _ma_killed_ptr(HA_CHECK *param)
+{
+ return thd_killed((THD*)param->thd);
+}
+
+
+void _ma_check_print_error(HA_CHECK *param, const char *fmt, ...)
+{
+ va_list args;
+ DBUG_ENTER("_ma_check_print_error");
+ param->error_printed |= 1;
+ param->out_flag |= O_DATA_LOST;
+ va_start(args, fmt);
+ _ma_check_print_msg(param, "error", fmt, args);
+ va_end(args);
+ DBUG_VOID_RETURN;
+}
+
+
+void _ma_check_print_info(HA_CHECK *param, const char *fmt, ...)
+{
+ va_list args;
+ DBUG_ENTER("_ma_check_print_info");
+ va_start(args, fmt);
+ _ma_check_print_msg(param, "info", fmt, args);
+ va_end(args);
+ DBUG_VOID_RETURN;
+}
+
+
+void _ma_check_print_warning(HA_CHECK *param, const char *fmt, ...)
+{
+ va_list args;
+ DBUG_ENTER("_ma_check_print_warning");
+ param->warning_printed= 1;
+ param->out_flag |= O_DATA_LOST;
+ va_start(args, fmt);
+ _ma_check_print_msg(param, "warning", fmt, args);
+ va_end(args);
+ DBUG_VOID_RETURN;
+}
+
+/*
+ Create a transaction object
+
+ SYNOPSIS
+ info Maria handler
+
+ RETURN
+ 0 ok
+ # Error number (HA_ERR_OUT_OF_MEM)
+*/
+
+static int maria_create_trn_for_mysql(MARIA_HA *info)
+{
+ THD *thd= (THD*) info->external_ptr;
+ TRN *trn= THD_TRN;
+ DBUG_ENTER("maria_create_trn_for_mysql");
+
+ if (!trn) /* no transaction yet - open it now */
+ {
+ trn= trnman_new_trn(& thd->transaction.wt);
+ if (unlikely(!trn))
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+ THD_TRN= trn;
+ if (thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+ trans_register_ha(thd, TRUE, maria_hton);
+ }
+ _ma_set_trn_for_table(info, trn);
+ if (!trnman_increment_locked_tables(trn))
+ {
+ trans_register_ha(thd, FALSE, maria_hton);
+ trnman_new_statement(trn);
+ }
+#ifdef EXTRA_DEBUG
+ if (info->lock_type == F_WRLCK &&
+ ! (trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED))
+ {
+ trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED |
+ TRN_STATE_TABLES_CAN_CHANGE);
+ (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY,
+ (uchar*) thd->query(),
+ thd->query_length());
+ }
+ else
+ {
+ DBUG_PRINT("info", ("lock_type: %d trnman_flags: %u",
+ info->lock_type, trnman_get_flags(trn)));
+ }
+
+#endif
+ DBUG_RETURN(0);
+}
+
+} /* extern "C" */
+
+/**
+ Transactional table doing bulk insert with one single UNDO
+ (UNDO_BULK_INSERT) and with repair.
+*/
+#define BULK_INSERT_SINGLE_UNDO_AND_REPAIR 1
+/**
+ Transactional table doing bulk insert with one single UNDO
+ (UNDO_BULK_INSERT) and without repair.
+*/
+#define BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR 2
+/**
+ None of BULK_INSERT_SINGLE_UNDO_AND_REPAIR and
+ BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR.
+*/
+#define BULK_INSERT_NONE 0
+
+ha_maria::ha_maria(handlerton *hton, TABLE_SHARE *table_arg):
+handler(hton, table_arg), file(0),
+int_table_flags(HA_NULL_IN_KEY | HA_CAN_FULLTEXT | HA_CAN_SQL_HANDLER |
+ HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
+ HA_DUPLICATE_POS | HA_CAN_INDEX_BLOBS | HA_AUTO_PART_KEY |
+ HA_FILE_BASED | HA_CAN_GEOMETRY | CANNOT_ROLLBACK_FLAG |
+ HA_CAN_BIT_FIELD | HA_CAN_RTREEKEYS |
+ HA_HAS_RECORDS | HA_STATS_RECORDS_IS_EXACT),
+can_enable_indexes(1), bulk_insert_single_undo(BULK_INSERT_NONE)
+{}
+
+
+handler *ha_maria::clone(MEM_ROOT *mem_root)
+{
+ ha_maria *new_handler= static_cast <ha_maria *>(handler::clone(mem_root));
+ if (new_handler)
+ {
+ new_handler->file->state= file->state;
+ /* maria_create_trn_for_mysql() is never called for clone() tables */
+ new_handler->file->trn= file->trn;
+ }
+ return new_handler;
+}
+
+
+static const char *ha_maria_exts[]=
+{
+ MARIA_NAME_IEXT,
+ MARIA_NAME_DEXT,
+ NullS
+};
+
+
+const char **ha_maria::bas_ext() const
+{
+ return ha_maria_exts;
+}
+
+
+const char *ha_maria::index_type(uint key_number)
+{
+ return ((table->key_info[key_number].flags & HA_FULLTEXT) ?
+ "FULLTEXT" :
+ (table->key_info[key_number].flags & HA_SPATIAL) ?
+ "SPATIAL" :
+ (table->key_info[key_number].algorithm == HA_KEY_ALG_RTREE) ?
+ "RTREE" : "BTREE");
+}
+
+
+double ha_maria::scan_time()
+{
+ if (file->s->data_file_type == BLOCK_RECORD)
+ return ulonglong2double(stats.data_file_length - file->s->block_size) / max(file->s->block_size / 2, IO_SIZE) + 2;
+ return handler::scan_time();
+}
+
+/*
+ We need to be able to store at least two keys on an index page as the
+ splitting algorithms depends on this. (With only one key on a page
+ we also can't use any compression, which may make the index file much
+ larger)
+ We use HA_MAX_KEY_BUFF as this is a stack restriction imposed by the
+ handler interface.
+
+ We also need to reserve place for a record pointer (8) and 3 bytes
+ per key segment to store the length of the segment + possible null bytes.
+ These extra bytes are required here so that maria_create() will surely
+ accept any keys created with the returned key data storage length.
+*/
+
+uint ha_maria::max_supported_key_length() const
+{
+ uint tmp= (maria_max_key_length() - 8 - HA_MAX_KEY_SEG*3);
+ return min(HA_MAX_KEY_BUFF, tmp);
+}
+
+
+#ifdef HAVE_REPLICATION
+int ha_maria::net_read_dump(NET * net)
+{
+ int data_fd= file->dfile.file;
+ int error= 0;
+
+ my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME));
+ for (;;)
+ {
+ ulong packet_len= my_net_read(net);
+ if (!packet_len)
+ break; // end of file
+ if (packet_len == packet_error)
+ {
+ sql_print_error("ha_maria::net_read_dump - read error ");
+ error= -1;
+ goto err;
+ }
+ if (my_write(data_fd, (uchar *) net->read_pos, (uint) packet_len,
+ MYF(MY_WME | MY_FNABP)))
+ {
+ error= errno;
+ goto err;
+ }
+ }
+err:
+ return error;
+}
+
+
+int ha_maria::dump(THD * thd, int fd)
+{
+ MARIA_SHARE *share= file->s;
+ NET *net= &thd->net;
+ uint block_size= share->block_size;
+ my_off_t bytes_to_read= share->state.state.data_file_length;
+ int data_fd= file->dfile.file;
+ uchar *buf= (uchar *) my_malloc(block_size, MYF(MY_WME));
+ if (!buf)
+ return ENOMEM;
+
+ int error= 0;
+ my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME));
+ for (; bytes_to_read > 0;)
+ {
+ size_t bytes= my_read(data_fd, buf, block_size, MYF(MY_WME));
+ if (bytes == MY_FILE_ERROR)
+ {
+ error= errno;
+ goto err;
+ }
+
+ if (fd >= 0)
+ {
+ if (my_write(fd, buf, bytes, MYF(MY_WME | MY_FNABP)))
+ {
+ error= errno ? errno : EPIPE;
+ goto err;
+ }
+ }
+ else
+ {
+ if (my_net_write(net, buf, bytes))
+ {
+ error= errno ? errno : EPIPE;
+ goto err;
+ }
+ }
+ bytes_to_read -= bytes;
+ }
+
+ if (fd < 0)
+ {
+ if (my_net_write(net, (uchar*) "", 0))
+ error= errno ? errno : EPIPE;
+ net_flush(net);
+ }
+
+err:
+ my_free((uchar*) buf, MYF(0));
+ return error;
+}
+#endif /* HAVE_REPLICATION */
+
+ /* Name is here without an extension */
+
+int ha_maria::open(const char *name, int mode, uint test_if_locked)
+{
+ uint i;
+
+#ifdef NOT_USED
+ /*
+ If the user wants to have memory mapped data files, add an
+ open_flag. Do not memory map temporary tables because they are
+ expected to be inserted and thus extended a lot. Memory mapping is
+ efficient for files that keep their size, but very inefficient for
+ growing files. Using an open_flag instead of calling ma_extra(...
+ HA_EXTRA_MMAP ...) after maria_open() has the advantage that the
+ mapping is not repeated for every open, but just done on the initial
+ open, when the Maria share is created. Every time the server
+ requires to open a new instance of a table it calls this method. We
+ will always supply HA_OPEN_MMAP for a permanent table. However, the
+ Maria storage engine will ignore this flag if this is a secondary
+ open of a table that is in use by other threads already (if the
+ Maria share exists already).
+ */
+ if (!(test_if_locked & HA_OPEN_TMP_TABLE) && opt_maria_use_mmap)
+ test_if_locked|= HA_OPEN_MMAP;
+#endif
+
+ if (unlikely(maria_recover_options != HA_RECOVER_NONE))
+ {
+ /* user asked to trigger a repair if table was not properly closed */
+ test_if_locked|= HA_OPEN_ABORT_IF_CRASHED;
+ }
+
+ if (!(file= maria_open(name, mode, test_if_locked | HA_OPEN_FROM_SQL_LAYER)))
+ return (my_errno ? my_errno : -1);
+
+ file->s->chst_invalidator= query_cache_invalidate_by_MyISAM_filename_ref;
+
+ if (test_if_locked & (HA_OPEN_IGNORE_IF_LOCKED | HA_OPEN_TMP_TABLE))
+ VOID(maria_extra(file, HA_EXTRA_NO_WAIT_LOCK, 0));
+
+ info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
+ if (!(test_if_locked & HA_OPEN_WAIT_IF_LOCKED))
+ VOID(maria_extra(file, HA_EXTRA_WAIT_LOCK, 0));
+ if ((data_file_type= file->s->data_file_type) != STATIC_RECORD)
+ int_table_flags |= HA_REC_NOT_IN_SEQ;
+ if (!file->s->base.born_transactional)
+ {
+ /*
+ INSERT DELAYED cannot work with transactional tables (because it cannot
+ stand up to "when client gets ok the data is safe on disk": the record
+ may not even be inserted). In the future, we could enable it back (as a
+ client doing INSERT DELAYED knows the specificities; but we then should
+ make sure to regularly commit in the delayed_insert thread).
+ */
+ int_table_flags|= HA_CAN_INSERT_DELAYED;
+ }
+ if (file->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
+ int_table_flags |= HA_HAS_NEW_CHECKSUM;
+
+ for (i= 0; i < table->s->keys; i++)
+ {
+ plugin_ref parser= table->key_info[i].parser;
+ if (table->key_info[i].flags & HA_USES_PARSER)
+ file->s->keyinfo[i].parser=
+ (struct st_mysql_ftparser *)plugin_decl(parser)->info;
+ table->key_info[i].block_size= file->s->keyinfo[i].block_length;
+ }
+ my_errno= 0;
+ return my_errno;
+}
+
+
+/* Close the underlying Maria handle; a no-op when already closed. */
+int ha_maria::close(void)
+{
+  MARIA_HA *handle= file;
+  if (handle == NULL)
+    return 0;                                   /* nothing to close */
+  file= 0;                                      /* detach before closing */
+  return maria_close(handle);
+}
+
+
+/* Insert one row; fills in auto-set TIMESTAMP and auto_increment first. */
+int ha_maria::write_row(uchar * buf)
+{
+  ha_statistic_increment(&SSV::ha_write_count);
+
+  /* Auto-set the timestamp column on insert, when the table has one */
+  if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
+    table->timestamp_field->set_time();
+
+  /*
+    When the table has an auto_increment column and this is a full new
+    row (buf is record[0]), compute the next sequence value now.
+  */
+  if (table->next_number_field && buf == table->record[0])
+  {
+    int res;
+    if ((res= update_auto_increment()))
+      return res;
+  }
+  return maria_write(file, buf);
+}
+
+
+/**
+  CHECK TABLE implementation.
+
+  @return HA_ADMIN_OK, HA_ADMIN_ALREADY_DONE, HA_ADMIN_CORRUPT or
+          HA_ADMIN_INTERNAL_ERROR
+*/
+int ha_maria::check(THD * thd, HA_CHECK_OPT * check_opt)
+{
+  int error;
+  HA_CHECK *param_ptr;
+
+  /*
+    Validate 'file' and the allocation BEFORE dereferencing either.
+    The previous code read file->s and file->trn before the NULL test,
+    and tested '!&param', which is always false for a reference and so
+    never detected an out-of-memory result from thd->alloc().
+  */
+  if (!file || !(param_ptr= (HA_CHECK*) thd->alloc(sizeof(HA_CHECK))))
+    return HA_ADMIN_INTERNAL_ERROR;
+
+  HA_CHECK &param= *param_ptr;
+  MARIA_SHARE *share= file->s;
+  const char *old_proc_info= thd_proc_info(thd, "Checking table");
+  TRN *old_trn= file->trn;
+
+  maria_chk_init(&param);
+  param.thd= thd;
+  param.op_name= "check";
+  param.db_name= table->s->db.str;
+  param.table_name= table->alias;
+  param.testflag= check_opt->flags | T_CHECK | T_SILENT;
+  param.stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method);
+
+  if (!(table->db_stat & HA_READ_ONLY))
+    param.testflag |= T_STATISTICS;
+  param.using_global_keycache= 1;
+
+  /*
+    Fast paths: nothing to do for CHECK ... CHANGED on an unchanged and
+    closed table, or for CHECK ... FAST when no other instance is open.
+  */
+  if (!maria_is_crashed(file) &&
+      (((param.testflag & T_CHECK_ONLY_CHANGED) &&
+        !(share->state.changed & (STATE_CHANGED | STATE_CRASHED |
+                                  STATE_CRASHED_ON_REPAIR |
+                                  STATE_IN_REPAIR)) &&
+        share->state.open_count == 0) ||
+       ((param.testflag & T_FAST) && (share->state.open_count ==
+                                      (uint) (share->global_changed ? 1 :
+                                              0)))))
+  {
+    /* Restore proc_info (the old code leaked "Checking table" here) */
+    thd_proc_info(thd, old_proc_info);
+    return HA_ADMIN_ALREADY_DONE;
+  }
+
+  maria_chk_init_for_check(&param, file);
+  (void) maria_chk_status(&param, file);        // Not fatal
+  error= maria_chk_size(&param, file);
+  if (!error)
+    error|= maria_chk_del(&param, file, param.testflag);
+  if (!error)
+    error= maria_chk_key(&param, file);
+  if (!error)
+  {
+    /* Data scan: needed for non-quick checks of packed rows or EXTENDED */
+    if ((!(param.testflag & T_QUICK) &&
+         ((share->options &
+           (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) ||
+          (param.testflag & (T_EXTEND | T_MEDIUM)))) || maria_is_crashed(file))
+    {
+      ulonglong old_testflag= param.testflag;
+      param.testflag |= T_MEDIUM;
+      if (!(error= init_io_cache(&param.read_cache, file->dfile.file,
+                                 my_default_record_cache_size, READ_CACHE,
+                                 share->pack.header_length, 1, MYF(MY_WME))))
+      {
+        error= maria_chk_data_link(&param, file,
+                                   test(param.testflag & T_EXTEND));
+        end_io_cache(&(param.read_cache));
+      }
+      param.testflag= old_testflag;
+    }
+  }
+  if (!error)
+  {
+    /* Check passed: clear crashed flags and persist fresh statistics */
+    if ((share->state.changed & (STATE_CHANGED |
+                                 STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR |
+                                 STATE_CRASHED | STATE_NOT_ANALYZED)) ||
+        (param.testflag & T_STATISTICS) || maria_is_crashed(file))
+    {
+      file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+      pthread_mutex_lock(&share->intern_lock);
+      DBUG_PRINT("info", ("Reseting crashed state"));
+      share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
+                               STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR);
+      if (!(table->db_stat & HA_READ_ONLY))
+        error= maria_update_state_info(&param, file,
+                                       UPDATE_TIME | UPDATE_OPEN_COUNT |
+                                       UPDATE_STAT);
+      pthread_mutex_unlock(&share->intern_lock);
+      info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE |
+           HA_STATUS_CONST);
+    }
+  }
+  else if (!maria_is_crashed(file) && !thd->killed)
+  {
+    /* Check found corruption: mark the table crashed */
+    maria_mark_crashed(file);
+    file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+  }
+
+  /* Reset trn, that may have been set by repair */
+  _ma_set_trn_for_table(file, old_trn);
+  thd_proc_info(thd, old_proc_info);
+  return error ? HA_ADMIN_CORRUPT : HA_ADMIN_OK;
+}
+
+
+/*
+ Analyze the key distribution in the table
+ As the table may be only locked for read, we have to take into account that
+ two threads may do an analyze at the same time!
+*/
+
+/**
+  ANALYZE TABLE: refresh key distribution statistics.
+
+  As the table may be only locked for read, two threads may run an
+  analyze at the same time; state updates are done under intern_lock.
+
+  @return HA_ADMIN_OK, HA_ADMIN_ALREADY_DONE, HA_ADMIN_CORRUPT or
+          HA_ADMIN_INTERNAL_ERROR
+*/
+int ha_maria::analyze(THD *thd, HA_CHECK_OPT * check_opt)
+{
+  int error= 0;
+  HA_CHECK *param_ptr;
+
+  /*
+    Guard before dereferencing: the old code read file->s with no
+    '!file' test, and '!&param' on a reference is always false, so an
+    allocation failure in thd->alloc() was never detected.
+  */
+  if (!file || !(param_ptr= (HA_CHECK*) thd->alloc(sizeof(HA_CHECK))))
+    return HA_ADMIN_INTERNAL_ERROR;
+
+  HA_CHECK &param= *param_ptr;
+  MARIA_SHARE *share= file->s;
+
+  maria_chk_init(&param);
+  param.thd= thd;
+  param.op_name= "analyze";
+  param.db_name= table->s->db.str;
+  param.table_name= table->alias;
+  param.testflag= (T_FAST | T_CHECK | T_SILENT | T_STATISTICS |
+                   T_DONT_CHECK_CHECKSUM);
+  param.using_global_keycache= 1;
+  param.stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method);
+
+  /* Nothing to do if statistics are already up to date */
+  if (!(share->state.changed & STATE_NOT_ANALYZED))
+    return HA_ADMIN_ALREADY_DONE;
+
+  error= maria_chk_key(&param, file);
+  if (!error)
+  {
+    pthread_mutex_lock(&share->intern_lock);
+    error= maria_update_state_info(&param, file, UPDATE_STAT);
+    pthread_mutex_unlock(&share->intern_lock);
+  }
+  else if (!maria_is_crashed(file) && !thd->killed)
+    maria_mark_crashed(file);
+  return error ? HA_ADMIN_CORRUPT : HA_ADMIN_OK;
+}
+
+
+/**
+  RESTORE TABLE: copy the data file back from the backup directory and
+  rebuild the indexes with a quick repair.
+
+  @return HA_ADMIN_* status code
+*/
+int ha_maria::restore(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  HA_CHECK_OPT tmp_check_opt;
+  char *backup_dir= thd->lex->backup_dir;
+  char src_path[FN_REFLEN], dst_path[FN_REFLEN];
+  char table_name[FN_REFLEN];
+  int error;
+  const char *errmsg;
+  DBUG_ENTER("restore");
+
+  VOID(tablename_to_filename(table->s->table_name.str, table_name,
+                             sizeof(table_name)));
+
+  /* Source is <backup_dir>/<table>.<data extension> */
+  if (fn_format_relative_to_data_home(src_path, table_name, backup_dir,
+                                      MARIA_NAME_DEXT))
+    DBUG_RETURN(HA_ADMIN_INVALID);
+
+  strxmov(dst_path, table->s->normalized_path.str, MARIA_NAME_DEXT, NullS);
+  if (my_copy(src_path, dst_path, MYF(MY_WME)))
+  {
+    error= HA_ADMIN_FAILED;
+    errmsg= "Failed in my_copy (Error %d)";
+    goto err;
+  }
+
+  /* Rebuild the index file from the restored data */
+  tmp_check_opt.init();
+  tmp_check_opt.flags |= T_VERY_SILENT | T_CALC_CHECKSUM | T_QUICK;
+  DBUG_RETURN(repair(thd, &tmp_check_opt));
+
+err:
+  {
+    /*
+      Don't allocate param on stack here as this may be huge and it's
+      also allocated by repair()
+    */
+    HA_CHECK *param;
+    if (!(param= (HA_CHECK*) my_malloc(sizeof(*param), MYF(MY_WME | MY_FAE))))
+      DBUG_RETURN(error);
+    maria_chk_init(param);
+    param->thd= thd;
+    param->op_name= "restore";
+    param->db_name= table->s->db.str;
+    param->table_name= table->s->table_name.str;
+    param->testflag= 0;
+    /* errmsg contains a %d placeholder filled with my_errno */
+    _ma_check_print_error(param, errmsg, my_errno);
+    my_free(param, MYF(0));
+    DBUG_RETURN(error);
+  }
+}
+
+
+/**
+  BACKUP TABLE: copy the .frm and data file into the backup directory.
+
+  @return HA_ADMIN_* status code
+*/
+int ha_maria::backup(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  char *backup_dir= thd->lex->backup_dir;
+  char src_path[FN_REFLEN], dst_path[FN_REFLEN];
+  char table_name[FN_REFLEN];
+  int error;
+  const char *errmsg;
+  DBUG_ENTER("ha_maria::backup");
+
+  VOID(tablename_to_filename(table->s->table_name.str, table_name,
+                             sizeof(table_name)));
+
+  if (fn_format_relative_to_data_home(dst_path, table_name, backup_dir,
+                                      reg_ext))
+  {
+    errmsg= "Failed in fn_format() for .frm file (errno: %d)";
+    error= HA_ADMIN_INVALID;
+    goto err;
+  }
+
+  /* Copy the table definition (.frm) first */
+  strxmov(src_path, table->s->normalized_path.str, reg_ext, NullS);
+  if (my_copy(src_path, dst_path,
+              MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_DONT_OVERWRITE_FILE)))
+  {
+    error= HA_ADMIN_FAILED;
+    errmsg= "Failed copying .frm file (errno: %d)";
+    goto err;
+  }
+
+  /* Change extension */
+  if (fn_format_relative_to_data_home(dst_path, table_name, backup_dir,
+                                      MARIA_NAME_DEXT))
+  {
+    errmsg= "Failed in fn_format() for .MYD file (errno: %d)";
+    error= HA_ADMIN_INVALID;
+    goto err;
+  }
+
+  /* Flush pending data to disk before copying the data file */
+  strxmov(src_path, table->s->normalized_path.str, MARIA_NAME_DEXT, NullS);
+  if (_ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_FORCE_WRITE,
+                            FLUSH_KEEP))
+  {
+    error= HA_ADMIN_FAILED;
+    errmsg= "Failed in flush (Error %d)";
+    goto err;
+  }
+  if (my_copy(src_path, dst_path,
+              MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_DONT_OVERWRITE_FILE)))
+  {
+    errmsg= "Failed copying .MYD file (errno: %d)";
+    error= HA_ADMIN_FAILED;
+    goto err;
+  }
+  DBUG_RETURN(HA_ADMIN_OK);
+
+err:
+  {
+    /*
+      Allocate via a pointer so thd->alloc() failure is detectable: the
+      old '!&param' test on a reference was always false. Also use
+      DBUG_RETURN here -- a bare 'return' inside a DBUG_ENTER scope
+      corrupts the DBUG call stack in debug builds.
+    */
+    HA_CHECK *param;
+    if (!(param= (HA_CHECK*) thd->alloc(sizeof(HA_CHECK))))
+      DBUG_RETURN(HA_ADMIN_INTERNAL_ERROR);
+
+    maria_chk_init(param);
+    param->thd= thd;
+    param->op_name= "backup";
+    param->db_name= table->s->db.str;
+    param->table_name= table->s->table_name.str;
+    param->testflag= 0;
+    /* errmsg contains a %d placeholder filled with my_errno */
+    _ma_check_print_error(param, errmsg, my_errno);
+    DBUG_RETURN(error);
+  }
+}
+
+
+/**
+  REPAIR TABLE entry point: choose a repair strategy and retry with
+  progressively safer/slower methods until one succeeds.
+
+  @return HA_ADMIN_* status code (from the internal repair())
+*/
+int ha_maria::repair(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  int error;
+  HA_CHECK *param_ptr;
+  ha_rows start_records;
+
+  /*
+    '!&param' on a reference is always false and never caught a failed
+    thd->alloc(); test the raw pointer instead, together with 'file'.
+  */
+  if (!file || !(param_ptr= (HA_CHECK*) thd->alloc(sizeof(HA_CHECK))))
+    return HA_ADMIN_INTERNAL_ERROR;
+
+  HA_CHECK &param= *param_ptr;
+  maria_chk_init(&param);
+  param.thd= thd;
+  param.op_name= "repair";
+  /* EXTENDED forces the slow row-by-row repair; otherwise repair by sort */
+  param.testflag= ((check_opt->flags & ~(T_EXTEND)) |
+                   T_SILENT | T_FORCE_CREATE | T_CALC_CHECKSUM |
+                   (check_opt->flags & T_EXTEND ? T_REP : T_REP_BY_SORT));
+  param.sort_buffer_length= THDVAR(thd, sort_buffer_size);
+  start_records= file->state->records;
+  while ((error= repair(thd, &param, 0)) && param.retry_repair)
+  {
+    param.retry_repair= 0;
+    if (test_all_bits(param.testflag,
+                      (uint) (T_RETRY_WITHOUT_QUICK | T_QUICK)))
+    {
+      param.testflag&= ~(T_RETRY_WITHOUT_QUICK | T_QUICK);
+      /* Ensure we don't loose any rows when retrying without quick */
+      param.testflag|= T_SAFE_REPAIR;
+      if (thd->vio_ok())
+        _ma_check_print_info(&param, "Retrying repair without quick");
+      else
+        sql_print_information("Retrying repair of: '%s' without quick",
+                              table->s->path.str);
+      continue;
+    }
+    param.testflag &= ~T_QUICK;
+    if ((param.testflag & T_REP_BY_SORT))
+    {
+      /* Sort-based repair failed: fall back to key-cache repair */
+      param.testflag= (param.testflag & ~T_REP_BY_SORT) | T_REP;
+      sql_print_information("Retrying repair of: '%s' with keycache",
+                            table->s->path.str);
+      continue;
+    }
+    break;
+  }
+  if (!error && start_records != file->state->records &&
+      !(check_opt->flags & T_VERY_SILENT))
+  {
+    char llbuff[22], llbuff2[22];
+    sql_print_information("Found %s of %s rows when repairing '%s'",
+                          llstr(file->state->records, llbuff),
+                          llstr(start_records, llbuff2),
+                          table->s->path.str);
+  }
+  return error;
+}
+
+/**
+  Zerofill unused parts of the table files (also clears the MOVED state).
+
+  @return 0 on success, non-zero maria_zerofill() error otherwise
+*/
+int ha_maria::zerofill(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  int error;
+  HA_CHECK *param_ptr;
+  TRN *old_trn;
+  MARIA_SHARE *share;
+
+  /*
+    Check 'file' and the allocation before any dereference: the old
+    code read file->s ahead of the '!file' test, and '!&param' on a
+    reference is always false.
+  */
+  if (!file || !(param_ptr= (HA_CHECK*) thd->alloc(sizeof(HA_CHECK))))
+    return HA_ADMIN_INTERNAL_ERROR;
+
+  HA_CHECK &param= *param_ptr;
+  share= file->s;
+  old_trn= file->trn;
+  maria_chk_init(&param);
+  param.thd= thd;
+  param.op_name= "zerofill";
+  param.testflag= check_opt->flags | T_SILENT | T_ZEROFILL;
+  param.sort_buffer_length= THDVAR(thd, sort_buffer_size);
+  error=maria_zerofill(&param, file, share->open_file_name.str);
+
+  /* Reset trn, that may have been set by repair */
+  _ma_set_trn_for_table(file, old_trn);
+
+  if (!error)
+  {
+    /* Persist the new state under the share's lock */
+    pthread_mutex_lock(&share->intern_lock);
+    maria_update_state_info(&param, file, UPDATE_TIME | UPDATE_OPEN_COUNT);
+    pthread_mutex_unlock(&share->intern_lock);
+  }
+  return error;
+}
+
+/**
+  OPTIMIZE TABLE: repair by sort + statistics + index sorting, with a
+  fallback to plain repair if the sort-based pass fails.
+
+  @return HA_ADMIN_* status code (from the internal repair())
+*/
+int ha_maria::optimize(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  int error;
+  HA_CHECK *param_ptr;
+
+  /*
+    '!&param' on a reference is always false; test the pointer returned
+    by thd->alloc() so an out-of-memory condition is actually caught.
+  */
+  if (!file || !(param_ptr= (HA_CHECK*) thd->alloc(sizeof(HA_CHECK))))
+    return HA_ADMIN_INTERNAL_ERROR;
+
+  HA_CHECK &param= *param_ptr;
+  maria_chk_init(&param);
+  param.thd= thd;
+  param.op_name= "optimize";
+  param.testflag= (check_opt->flags | T_SILENT | T_FORCE_CREATE |
+                   T_REP_BY_SORT | T_STATISTICS | T_SORT_INDEX);
+  param.sort_buffer_length= THDVAR(thd, sort_buffer_size);
+  if ((error= repair(thd, &param, 1)) && param.retry_repair)
+  {
+    sql_print_warning("Warning: Optimize table got errno %d on %s.%s, retrying",
+                      my_errno, param.db_name, param.table_name);
+    /* Retry without the sort-based method */
+    param.testflag &= ~T_REP_BY_SORT;
+    error= repair(thd, &param, 1);
+  }
+
+  return error;
+}
+
+
+/**
+  Internal worker for repair/optimize/enable_indexes.
+
+  @param thd          Connection thread
+  @param param        Prepared HA_CHECK with testflag describing the work
+  @param do_optimize  Non-zero when called from optimize(): only repair
+                      if the table actually needs it
+
+  @return HA_ADMIN_OK, HA_ADMIN_ALREADY_DONE or HA_ADMIN_FAILED
+*/
+int ha_maria::repair(THD *thd, HA_CHECK *param, bool do_optimize)
+{
+  int error= 0;
+  ulonglong local_testflag= param->testflag;
+  bool optimize_done= !do_optimize, statistics_done= 0;
+  const char *old_proc_info= thd->proc_info;
+  char fixed_name[FN_REFLEN];
+  MARIA_SHARE *share= file->s;
+  ha_rows rows= file->state->records;
+  TRN *old_trn= file->trn;
+  DBUG_ENTER("ha_maria::repair");
+
+  /*
+    Normally this method is entered with a properly opened table. If the
+    repair fails, it can be repeated with more elaborate options. Under
+    special circumstances it can happen that a repair fails so that it
+    closed the data file and cannot re-open it. In this case file->dfile
+    is set to -1. We must not try another repair without an open data
+    file. (Bug #25289)
+  */
+  if (file->dfile.file == -1)
+  {
+    sql_print_information("Retrying repair of: '%s' failed. "
+                          "Please try REPAIR EXTENDED or aria_chk",
+                          table->s->path.str);
+    DBUG_RETURN(HA_ADMIN_FAILED);
+  }
+
+  /*
+    If transactions was not enabled for a transactional table then
+    file->s->status is not up to date. This is needed for repair_by_sort
+    to work
+  */
+  if (share->base.born_transactional && !share->now_transactional)
+    _ma_copy_nontrans_state_information(file);
+
+  param->db_name= table->s->db.str;
+  param->table_name= table->alias;
+  param->tmpfile_createflag= O_RDWR | O_TRUNC;
+  param->using_global_keycache= 1;
+  param->thd= thd;
+  param->tmpdir= &mysql_tmpdir_list;
+  param->out_flag= 0;
+  strmov(fixed_name, share->open_file_name.str);
+
+  // Don't lock tables if we have used LOCK TABLE
+  if (!thd->locked_tables &&
+      maria_lock_database(file, table->s->tmp_table ? F_EXTRA_LCK : F_WRLCK))
+  {
+    _ma_check_print_error(param, ER(ER_CANT_LOCK), my_errno);
+    DBUG_RETURN(HA_ADMIN_FAILED);
+  }
+
+  /*
+    For optimize: only rebuild when there are deleted/unoptimized rows
+    or keys. For plain repair: always rebuild.
+  */
+  if (!do_optimize ||
+      (((share->data_file_type == BLOCK_RECORD) ?
+        (share->state.changed & STATE_NOT_OPTIMIZED_ROWS) :
+        (file->state->del ||
+         share->state.split != file->state->records)) &&
+       (!(param->testflag & T_QUICK) ||
+        (share->state.changed & (STATE_NOT_OPTIMIZED_KEYS |
+                                 STATE_NOT_OPTIMIZED_ROWS)))))
+  {
+    ulonglong key_map= ((local_testflag & T_CREATE_MISSING_KEYS) ?
+                        maria_get_mask_all_keys_active(share->base.keys) :
+                        share->state.key_map);
+    ulonglong save_testflag= param->testflag;
+    if (maria_test_if_sort_rep(file, file->state->records, key_map, 0) &&
+        (local_testflag & T_REP_BY_SORT))
+    {
+      /* Sort-based repair rebuilds statistics as a side effect */
+      local_testflag |= T_STATISTICS;
+      param->testflag |= T_STATISTICS;           // We get this for free
+      statistics_done= 1;
+      /* TODO: Remove BLOCK_RECORD test when parallel works with blocks */
+      if (THDVAR(thd,repair_threads) > 1 &&
+          share->data_file_type != BLOCK_RECORD)
+      {
+        char buf[40];
+        /* TODO: respect maria_repair_threads variable */
+        my_snprintf(buf, 40, "Repair with %d threads", my_count_bits(key_map));
+        thd_proc_info(thd, buf);
+        param->testflag|= T_REP_PARALLEL;
+        error= maria_repair_parallel(param, file, fixed_name,
+                                     test(param->testflag & T_QUICK));
+        /* to reset proc_info, as it was pointing to local buffer */
+        thd_proc_info(thd, "Repair done");
+      }
+      else
+      {
+        thd_proc_info(thd, "Repair by sorting");
+        param->testflag|= T_REP_BY_SORT;
+        error= maria_repair_by_sort(param, file, fixed_name,
+                                    test(param->testflag & T_QUICK));
+      }
+    }
+    else
+    {
+      thd_proc_info(thd, "Repair with keycache");
+      param->testflag &= ~(T_REP_BY_SORT | T_REP_PARALLEL);
+      error= maria_repair(param, file, fixed_name,
+                          test(param->testflag & T_QUICK));
+    }
+    /* Keep T_RETRY_WITHOUT_QUICK so the caller's retry loop can see it */
+    param->testflag= save_testflag | (param->testflag & T_RETRY_WITHOUT_QUICK);
+    optimize_done= 1;
+  }
+  if (!error)
+  {
+    if ((local_testflag & T_SORT_INDEX) &&
+        (share->state.changed & STATE_NOT_SORTED_PAGES))
+    {
+      optimize_done= 1;
+      thd_proc_info(thd, "Sorting index");
+      error= maria_sort_index(param, file, fixed_name);
+    }
+    if (!statistics_done && (local_testflag & T_STATISTICS))
+    {
+      if (share->state.changed & STATE_NOT_ANALYZED)
+      {
+        optimize_done= 1;
+        thd_proc_info(thd, "Analyzing");
+        error= maria_chk_key(param, file);
+      }
+      else
+        local_testflag &= ~T_STATISTICS;        // Don't update statistics
+    }
+  }
+  thd_proc_info(thd, "Saving state");
+  pthread_mutex_lock(&share->intern_lock);
+  if (!error)
+  {
+    if ((share->state.changed & STATE_CHANGED) || maria_is_crashed(file))
+    {
+      DBUG_PRINT("info", ("Reseting crashed state"));
+      share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
+                               STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR);
+      file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+    }
+    /*
+      repair updates share->state.state. Ensure that file->state is up to date
+    */
+    if (file->state != &share->state.state)
+      *file->state= share->state.state;
+    if (share->base.auto_key)
+      _ma_update_auto_increment_key(param, file, 1);
+    if (optimize_done)
+      error= maria_update_state_info(param, file,
+                                     UPDATE_TIME | UPDATE_OPEN_COUNT |
+                                     (local_testflag &
+                                      T_STATISTICS ? UPDATE_STAT : 0));
+    info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE |
+         HA_STATUS_CONST, 0);
+    if (rows != file->state->records && !(param->testflag & T_VERY_SILENT))
+    {
+      char llbuff[22], llbuff2[22];
+      _ma_check_print_warning(param, "Number of rows changed from %s to %s",
+                              llstr(rows, llbuff),
+                              llstr(file->state->records, llbuff2));
+      /* Abort if warning was converted to error */
+      if (current_thd->is_error())
+        error= 1;
+    }
+  }
+  else
+  {
+    maria_mark_crashed_on_repair(file);
+    file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+    maria_update_state_info(param, file, 0);
+  }
+  pthread_mutex_unlock(&share->intern_lock);
+  thd_proc_info(thd, old_proc_info);
+  if (!thd->locked_tables)
+    maria_lock_database(file, F_UNLCK);
+
+  /* Reset trn, that may have been set by repair */
+  _ma_set_trn_for_table(file, old_trn);
+  /* A successful repair must also be logged to the transaction log */
+  error= error ? HA_ADMIN_FAILED :
+    (optimize_done ?
+     (write_log_record_for_repair(param, file) ? HA_ADMIN_FAILED :
+      HA_ADMIN_OK) : HA_ADMIN_ALREADY_DONE);
+  DBUG_RETURN(error);
+}
+
+
+/*
+ Assign table indexes to a specific key cache.
+*/
+
+/*
+  Assign this table's indexes to a specific page cache.
+
+  NOTE: the whole implementation is compiled out ('#if 0'); the method
+  currently always reports HA_ADMIN_NOT_IMPLEMENTED. The dead code is
+  kept as a template for when multiple page caches are supported.
+*/
+int ha_maria::assign_to_keycache(THD * thd, HA_CHECK_OPT *check_opt)
+{
+#if 0 && NOT_IMPLEMENTED
+  PAGECACHE *new_pagecache= check_opt->pagecache;
+  const char *errmsg= 0;
+  int error= HA_ADMIN_OK;
+  ulonglong map;
+  TABLE_LIST *table_list= table->pos_in_table_list;
+  DBUG_ENTER("ha_maria::assign_to_keycache");
+
+
+  table->keys_in_use_for_query.clear_all();
+
+  if (table_list->process_index_hints(table))
+    DBUG_RETURN(HA_ADMIN_FAILED);
+  map= ~(ulonglong) 0;
+  if (!table->keys_in_use_for_query.is_clear_all())
+    /* use all keys if there's no list specified by the user through hints */
+    map= table->keys_in_use_for_query.to_ulonglong();
+
+  if ((error= maria_assign_to_pagecache(file, map, new_pagecache)))
+  {
+    char buf[STRING_BUFFER_USUAL_SIZE];
+    my_snprintf(buf, sizeof(buf),
+                "Failed to flush to index file (errno: %d)", error);
+    errmsg= buf;
+    error= HA_ADMIN_CORRUPT;
+  }
+
+  if (error != HA_ADMIN_OK)
+  {
+    /* Send error to user */
+    HA_CHECK &param= *(HA_CHECK*) thd->alloc(sizeof(param));
+    if (!&param)
+      return HA_ADMIN_INTERNAL_ERROR;
+
+    maria_chk_init(&param);
+    param.thd= thd;
+    param.op_name= "assign_to_keycache";
+    param.db_name= table->s->db.str;
+    param.table_name= table->s->table_name.str;
+    param.testflag= 0;
+    _ma_check_print_error(&param, errmsg);
+  }
+  DBUG_RETURN(error);
+#else
+  return HA_ADMIN_NOT_IMPLEMENTED;
+#endif
+}
+
+
+/*
+ Preload pages of the index file for a table into the key cache.
+*/
+
+/**
+  Preload pages of the index file for this table into the page cache.
+
+  @return HA_ADMIN_OK, HA_ADMIN_FAILED or HA_ADMIN_INTERNAL_ERROR
+*/
+int ha_maria::preload_keys(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  ulonglong map;
+  TABLE_LIST *table_list= table->pos_in_table_list;
+
+  DBUG_ENTER("ha_maria::preload_keys");
+
+  table->keys_in_use_for_query.clear_all();
+
+  if (table_list->process_index_hints(table))
+    DBUG_RETURN(HA_ADMIN_FAILED);
+
+  map= ~(ulonglong) 0;
+  /* Check validity of the index references */
+  if (!table->keys_in_use_for_query.is_clear_all())
+    /* use all keys if there's no list specified by the user through hints */
+    map= table->keys_in_use_for_query.to_ulonglong();
+
+  maria_extra(file, HA_EXTRA_PRELOAD_BUFFER_SIZE,
+              (void*) &thd->variables.preload_buff_size);
+
+  int error;
+
+  if ((error= maria_preload(file, map, table_list->ignore_leaves)))
+  {
+    char buf[MYSQL_ERRMSG_SIZE+20];
+    const char *errmsg;
+
+    switch (error) {
+    case HA_ERR_NON_UNIQUE_BLOCK_SIZE:
+      errmsg= "Indexes use different block sizes";
+      break;
+    case HA_ERR_OUT_OF_MEM:
+      errmsg= "Failed to allocate buffer";
+      break;
+    default:
+      my_snprintf(buf, sizeof(buf),
+                  "Failed to read from index file (errno: %d)", my_errno);
+      errmsg= buf;
+    }
+
+    /*
+      Allocate via a pointer: the old '!&param' test on a reference was
+      always false, so a failed thd->alloc() was never detected. Also use
+      DBUG_RETURN -- a bare 'return' inside a DBUG_ENTER scope corrupts
+      the DBUG call stack in debug builds.
+    */
+    HA_CHECK *param_ptr;
+    if (!(param_ptr= (HA_CHECK*) thd->alloc(sizeof(HA_CHECK))))
+      DBUG_RETURN(HA_ADMIN_INTERNAL_ERROR);
+    HA_CHECK &param= *param_ptr;
+
+    maria_chk_init(&param);
+    param.thd= thd;
+    param.op_name= "preload_keys";
+    param.db_name= table->s->db.str;
+    param.table_name= table->s->table_name.str;
+    param.testflag= 0;
+    _ma_check_print_error(&param, "%s", errmsg);
+    DBUG_RETURN(HA_ADMIN_FAILED);
+  }
+  DBUG_RETURN(HA_ADMIN_OK);
+}
+
+
+/*
+ Disable indexes, making it persistent if requested.
+
+ SYNOPSIS
+ disable_indexes()
+ mode mode of operation:
+ HA_KEY_SWITCH_NONUNIQ disable all non-unique keys
+ HA_KEY_SWITCH_ALL disable all keys
+ HA_KEY_SWITCH_NONUNIQ_SAVE dis. non-uni. and make persistent
+ HA_KEY_SWITCH_ALL_SAVE dis. all keys and make persistent
+
+ IMPLEMENTATION
+ HA_KEY_SWITCH_NONUNIQ is not implemented.
+ HA_KEY_SWITCH_ALL_SAVE is not implemented.
+
+ RETURN
+ 0 ok
+ HA_ERR_WRONG_COMMAND mode not implemented.
+*/
+
+int ha_maria::disable_indexes(uint mode)
+{
+  int result;
+
+  switch (mode) {
+  case HA_KEY_SWITCH_ALL:
+    /* call a storage engine function to switch the key map */
+    result= maria_disable_indexes(file);
+    break;
+  case HA_KEY_SWITCH_NONUNIQ_SAVE:
+    maria_extra(file, HA_EXTRA_NO_KEYS, 0);
+    info(HA_STATUS_CONST);                      // Read new key info
+    result= 0;
+    break;
+  default:
+    /* mode not implemented */
+    result= HA_ERR_WRONG_COMMAND;
+    break;
+  }
+  return result;
+}
+
+
+/*
+ Enable indexes, making it persistent if requested.
+
+ SYNOPSIS
+ enable_indexes()
+ mode mode of operation:
+ HA_KEY_SWITCH_NONUNIQ enable all non-unique keys
+ HA_KEY_SWITCH_ALL enable all keys
+ HA_KEY_SWITCH_NONUNIQ_SAVE en. non-uni. and make persistent
+ HA_KEY_SWITCH_ALL_SAVE en. all keys and make persistent
+
+ DESCRIPTION
+ Enable indexes, which might have been disabled by disable_index() before.
+ The modes without _SAVE work only if both data and indexes are empty,
+ since the MARIA repair would enable them persistently.
+ To be sure in these cases, call handler::delete_all_rows() before.
+
+ IMPLEMENTATION
+ HA_KEY_SWITCH_NONUNIQ is not implemented.
+ HA_KEY_SWITCH_ALL_SAVE is not implemented.
+
+ RETURN
+ 0 ok
+ !=0 Error, among others:
+ HA_ERR_CRASHED data or index is non-empty. Delete all rows and retry.
+ HA_ERR_WRONG_COMMAND mode not implemented.
+*/
+
+/* See the SYNOPSIS comment above for the mode contract. */
+int ha_maria::enable_indexes(uint mode)
+{
+  int error;
+  DBUG_PRINT("info", ("ha_maria::enable_indexes mode: %d", mode));
+  if (maria_is_all_keys_active(file->s->state.key_map, file->s->base.keys))
+  {
+    /* All indexes are enabled already. */
+    return 0;
+  }
+
+  if (mode == HA_KEY_SWITCH_ALL)
+  {
+    error= maria_enable_indexes(file);
+    /*
+      Do not try to repair on error,
+      as this could make the enabled state persistent,
+      but mode==HA_KEY_SWITCH_ALL forbids it.
+    */
+  }
+  else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE)
+  {
+    THD *thd= current_thd;
+    HA_CHECK *param_ptr;
+    /*
+      Allocate through a pointer so a NULL result from thd->alloc() is
+      actually detectable: the old '!&param' test on a reference was
+      always false (and binding a reference to *(HA_CHECK*)NULL is
+      undefined behavior).
+    */
+    if (!(param_ptr= (HA_CHECK*) thd->alloc(sizeof(HA_CHECK))))
+      return HA_ADMIN_INTERNAL_ERROR;
+    HA_CHECK &param= *param_ptr;
+
+    const char *save_proc_info= thd_proc_info(thd, "Creating index");
+
+    /* Rebuild the missing (non-unique) keys via repair-by-sort */
+    maria_chk_init(&param);
+    param.op_name= "recreating_index";
+    param.testflag= (T_SILENT | T_REP_BY_SORT | T_QUICK |
+                     T_CREATE_MISSING_KEYS | T_SAFE_REPAIR);
+    if (bulk_insert_single_undo == BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR)
+    {
+      bulk_insert_single_undo= BULK_INSERT_SINGLE_UNDO_AND_REPAIR;
+      /*
+        Don't bump create_rename_lsn, because UNDO_BULK_INSERT
+        should not be skipped in case of crash during repair.
+      */
+      param.testflag|= T_NO_CREATE_RENAME_LSN;
+    }
+    param.myf_rw &= ~MY_WAIT_IF_FULL;
+    param.sort_buffer_length= THDVAR(thd,sort_buffer_size);
+    param.stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method);
+    param.tmpdir= &mysql_tmpdir_list;
+    if ((error= (repair(thd, &param, 0) != HA_ADMIN_OK)) && param.retry_repair)
+    {
+      sql_print_warning("Warning: Enabling keys got errno %d on %s.%s, "
+                        "retrying",
+                        my_errno, param.db_name, param.table_name);
+      /* This should never fail normally */
+      DBUG_ASSERT(thd->killed != 0);
+      /* Repairing by sort failed. Now try standard repair method. */
+      param.testflag &= ~T_REP_BY_SORT;
+      error= (repair(thd, &param, 0) != HA_ADMIN_OK);
+      /*
+        If the standard repair succeeded, clear all error messages which
+        might have been set by the first repair. They can still be seen
+        with SHOW WARNINGS then.
+      */
+      if (!error)
+        thd->clear_error();
+    }
+    info(HA_STATUS_CONST);
+    thd_proc_info(thd, save_proc_info);
+  }
+  else
+  {
+    /* mode not implemented */
+    error= HA_ERR_WRONG_COMMAND;
+  }
+  /* Debug hooks to exercise crash recovery around index enabling */
+  DBUG_EXECUTE_IF("maria_flush_whole_log",
+                  {
+                    DBUG_PRINT("maria_flush_whole_log", ("now"));
+                    translog_flush(translog_get_horizon());
+                  });
+  DBUG_EXECUTE_IF("maria_crash_enable_index",
+                  {
+                    DBUG_PRINT("maria_crash_enable_index", ("now"));
+                    DBUG_ABORT();
+                  });
+  return error;
+}
+
+
+/*
+ Test if indexes are disabled.
+
+
+ SYNOPSIS
+ indexes_are_disabled()
+ no parameters
+
+
+ RETURN
+ 0 indexes are not disabled
+ 1 all indexes are disabled
+ [2 non-unique indexes are disabled - NOT YET IMPLEMENTED]
+*/
+
+int ha_maria::indexes_are_disabled(void)
+{
+  /* Delegate entirely to the storage layer */
+  int state= maria_indexes_are_disabled(file);
+  return state;
+}
+
+
+/*
+ prepare for a many-rows insert operation
+ e.g. - disable indexes (if they can be recreated fast) or
+ activate special bulk-insert optimizations
+
+ SYNOPSIS
+ start_bulk_insert(rows)
+ rows Rows to be inserted
+ 0 if we don't know
+
+ NOTICE
+ Do not forget to call end_bulk_insert() later!
+*/
+
+/*
+  Prepare for a many-rows insert: enable the row write cache and, for an
+  empty table under an exclusive lock, disable non-unique indexes so they
+  can be rebuilt by repair in end_bulk_insert().
+*/
+void ha_maria::start_bulk_insert(ha_rows rows)
+{
+  DBUG_ENTER("ha_maria::start_bulk_insert");
+  THD *thd= current_thd;
+  /* Cache size: bounded by read_buff_size and the estimated data volume */
+  ulong size= min(thd->variables.read_buff_size,
+                  (ulong) (table->s->avg_row_length * rows));
+  MARIA_SHARE *share= file->s;
+  DBUG_PRINT("info", ("start_bulk_insert: rows %lu size %lu",
+                      (ulong) rows, size));
+
+  /* don't enable row cache if too few rows */
+  if (!rows || (rows > MARIA_MIN_ROWS_TO_USE_WRITE_CACHE))
+    maria_extra(file, HA_EXTRA_WRITE_CACHE, (void*) &size);
+
+  /* Remember whether a later repair may re-enable all keys */
+  can_enable_indexes= (maria_is_all_keys_active(share->state.key_map,
+                                                share->base.keys));
+  bulk_insert_single_undo= BULK_INSERT_NONE;
+
+  if (!(specialflag & SPECIAL_SAFE_MODE))
+  {
+    /*
+      Only disable old index if the table was empty and we are inserting
+      a lot of rows.
+      We should not do this for only a few rows as this is slower and
+      we don't want to update the key statistics based of only a few rows.
+      Index file rebuild requires an exclusive lock, so if versioning is on
+      don't do it (see how ha_maria::store_lock() tries to predict repair).
+      We can repair index only if we have an exclusive (TL_WRITE) lock. To
+      see if table is empty, we shouldn't rely on the old records' count from
+      our transaction's start (if that old count is 0 but now there are
+      records in the table, we would wrongly destroy them).
+      So we need to look at share->state.state.records.
+      As a safety net for now, we don't remove the test of
+      file->state->records, because there is uncertainty on what will happen
+      during repair if the two states disagree.
+    */
+    if ((file->state->records == 0) &&
+        (share->state.state.records == 0) && can_enable_indexes &&
+        (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES) &&
+        (file->lock.type == TL_WRITE))
+    {
+      /**
+        @todo for a single-row INSERT SELECT, we will go into repair, which
+        is more costly (flushes, syncs) than a row write.
+      */
+      maria_disable_non_unique_index(file, rows);
+      if (share->now_transactional)
+      {
+        bulk_insert_single_undo= BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR;
+        write_log_record_for_bulk_insert(file);
+        _ma_tmp_disable_logging_for_table(file, TRUE);
+        /*
+          Pages currently in the page cache have type PAGECACHE_LSN_PAGE, we
+          are not allowed to overwrite them with PAGECACHE_PLAIN_PAGE, so
+          throw them away. It is not losing data, because we just wrote and
+          forced an UNDO which will for sure empty the table if we crash. The
+          upcoming unique-key insertions however need a proper index, so we
+          cannot leave the corrupted on-disk index file, thus we truncate it.
+        */
+        maria_delete_all_rows(file);
+      }
+    }
+    else if (!file->bulk_insert &&
+             (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT))
+    {
+      /* Moderate batch: use the in-memory bulk-insert key tree instead */
+      maria_init_bulk_insert(file, thd->variables.bulk_insert_buff_size, rows);
+    }
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+ end special bulk-insert optimizations,
+ which have been activated by start_bulk_insert().
+
+ SYNOPSIS
+ end_bulk_insert()
+ no arguments
+
+ RETURN
+ 0 OK
+ != 0 Error
+*/
+
+/*
+  Finish a bulk-insert batch started by start_bulk_insert(): drop the
+  write cache, rebuild any disabled indexes and re-enable logging.
+*/
+int ha_maria::end_bulk_insert()
+{
+  int first_error;
+  DBUG_ENTER("ha_maria::end_bulk_insert");
+
+  maria_end_bulk_insert(file);
+  first_error= maria_extra(file, HA_EXTRA_NO_CACHE, 0);
+  /* Rebuild indexes only if the cache flush succeeded */
+  if (!first_error && can_enable_indexes && !file->s->deleting)
+    first_error= enable_indexes(HA_KEY_SWITCH_NONUNIQ_SAVE);
+
+  if (bulk_insert_single_undo != BULK_INSERT_NONE)
+  {
+    DBUG_ASSERT(can_enable_indexes);
+    /*
+      Table was transactional just before start_bulk_insert().
+      No need to flush pages if we did a repair (which already flushed).
+    */
+    first_error|=
+      _ma_reenable_logging_for_table(file,
+                                     bulk_insert_single_undo ==
+                                     BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR);
+  }
+  DBUG_RETURN(first_error);
+}
+
+
+/*
+  Automatic check-and-repair, run at open when the table looks crashed
+  or moved. Returns 0 on success, 1 on failure (bool per handler API).
+*/
+bool ha_maria::check_and_repair(THD *thd)
+{
+  int error, crashed;
+  LEX_STRING old_query;
+  HA_CHECK_OPT check_opt;
+  DBUG_ENTER("ha_maria::check_and_repair");
+
+  check_opt.init();
+
+  error= 1;
+  /* A table that was only moved (not crashed) just needs zerofilling */
+  if ((file->s->state.changed &
+       (STATE_CRASHED | STATE_CRASHED_ON_REPAIR | STATE_MOVED)) ==
+      STATE_MOVED)
+  {
+    sql_print_information("Zerofilling moved table: '%s'",
+                          table->s->path.str);
+    if (!(error= zerofill(thd, &check_opt)))
+      DBUG_RETURN(0);
+  }
+
+  /*
+    if we got this far - the table is crashed.
+    but don't auto-repair if maria_recover_options is not set
+  */
+  if (!maria_recover_options)
+    DBUG_RETURN(error);
+
+  error= 0;
+  check_opt.flags= T_MEDIUM | T_AUTO_REPAIR;
+  // Don't use quick if deleted rows
+  if (!file->state->del && (maria_recover_options & HA_RECOVER_QUICK))
+    check_opt.flags |= T_QUICK;
+
+  /* Show the table name as the query in the processlist while repairing */
+  old_query= thd->query_string;
+  pthread_mutex_lock(&LOCK_thread_count);
+  thd->query_string= table->s->table_name;
+  pthread_mutex_unlock(&LOCK_thread_count);
+
+  if (!(crashed= maria_is_crashed(file)))
+  {
+    sql_print_warning("Checking table: '%s'", table->s->path.str);
+    crashed= check(thd, &check_opt);
+  }
+
+  if (crashed)
+  {
+    sql_print_warning("Recovering table: '%s'", table->s->path.str);
+    check_opt.flags=
+      ((maria_recover_options & HA_RECOVER_BACKUP ? T_BACKUP_DATA : 0) |
+       (maria_recover_options & HA_RECOVER_FORCE ? 0 : T_SAFE_REPAIR) |
+       T_AUTO_REPAIR);
+    if (repair(thd, &check_opt))
+      error= 1;
+  }
+  /* Restore the original query string for the processlist */
+  pthread_mutex_lock(&LOCK_thread_count);
+  thd->query_string= old_query;
+  pthread_mutex_unlock(&LOCK_thread_count);
+  DBUG_RETURN(error);
+}
+
+
+/* A table counts as crashed when marked so, moved, or left open with
+   external locking disabled. */
+bool ha_maria::is_crashed() const
+{
+  if (file->s->state.changed & (STATE_CRASHED | STATE_MOVED))
+    return 1;
+  return (my_disable_locking && file->s->state.open_count);
+}
+
+/*
+  Until Maria fully supports row versioning, reject statements that would
+  modify rows while holding a TL_WRITE_CONCURRENT_INSERT (versioned) lock:
+  raises ER_CHECK_NOT_IMPLEMENTED and makes the caller return 1.
+*/
+#define CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING(msg) \
+  do { \
+    if (file->lock.type == TL_WRITE_CONCURRENT_INSERT) \
+    { \
+      my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), msg); \
+      return 1; \
+    } \
+  } while(0)
+
+/* Update one row; refuses to run under a concurrent-insert write lock. */
+int ha_maria::update_row(const uchar * old_data, uchar * new_data)
+{
+  CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING("UPDATE in WRITE CONCURRENT");
+  ha_statistic_increment(&SSV::ha_update_count);
+  /* Auto-set the timestamp column on update, when the table has one */
+  if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
+    table->timestamp_field->set_time();
+  int result= maria_update(file, old_data, new_data);
+  return result;
+}
+
+
+/*
+  Delete one row via maria_delete().  Like update_row(), rejected under
+  WRITE CONCURRENT locks until versioning supports it.
+*/
+int ha_maria::delete_row(const uchar * buf)
+{
+ CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING("DELETE in WRITE CONCURRENT");
+ ha_statistic_increment(&SSV::ha_delete_count);
+ return maria_delete(file, buf);
+}
+
+C_MODE_START
+
+/*
+  Index Condition Pushdown callback invoked from the Maria key-read layer.
+  'arg' is the ha_maria handler that registered the pushed condition.
+  Returns ICP_OUT_OF_RANGE once the key passes end_range, otherwise
+  evaluates the pushed condition on the current index entry.
+*/
+ICP_RESULT index_cond_func_maria(void *arg)
+{
+ ha_maria *h= (ha_maria*)arg;
+ if (h->end_range)
+ {
+ if (h->compare_key2(h->end_range) > 0)
+ return ICP_OUT_OF_RANGE; /* caller should return HA_ERR_END_OF_FILE already */
+ }
+ return h->pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
+}
+
+C_MODE_END
+
+/* Positioned read on the active index using a partial key (keypart_map). */
+int ha_maria::index_read_map(uchar * buf, const uchar * key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag)
+{
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_key_count);
+ int error= maria_rkey(file, buf, active_index, key, keypart_map, find_flag);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+/* Same as index_read_map() but on an explicitly given index number. */
+int ha_maria::index_read_idx_map(uchar * buf, uint index, const uchar * key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag)
+{
+ ha_statistic_increment(&SSV::ha_read_key_count);
+ int error= maria_rkey(file, buf, index, key, keypart_map, find_flag);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+/* Read the last row matching the key prefix (HA_READ_PREFIX_LAST). */
+int ha_maria::index_read_last_map(uchar * buf, const uchar * key,
+ key_part_map keypart_map)
+{
+ DBUG_ENTER("ha_maria::index_read_last_map");
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_key_count);
+ int error= maria_rkey(file, buf, active_index, key, keypart_map,
+ HA_READ_PREFIX_LAST);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ DBUG_RETURN(error);
+}
+
+
+/* Step forward to the next entry of the active index. */
+int ha_maria::index_next(uchar * buf)
+{
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_next_count);
+ int error= maria_rnext(file, buf, active_index);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+/* Step backward to the previous entry of the active index. */
+int ha_maria::index_prev(uchar * buf)
+{
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_prev_count);
+ int error= maria_rprev(file, buf, active_index);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+/* Position on the first entry of the active index. */
+int ha_maria::index_first(uchar * buf)
+{
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_first_count);
+ int error= maria_rfirst(file, buf, active_index);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+/* Position on the last entry of the active index. */
+int ha_maria::index_last(uchar * buf)
+{
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_last_count);
+ int error= maria_rlast(file, buf, active_index);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+/*
+  Read the next row with the same key value.  The key/length arguments are
+  unused because maria_rnext_same() remembers the current key internally.
+  Rows flagged HA_ERR_RECORD_DELETED are skipped in a loop (see TODO).
+*/
+int ha_maria::index_next_same(uchar * buf,
+ const uchar *key __attribute__ ((unused)),
+ uint length __attribute__ ((unused)))
+{
+ int error;
+ DBUG_ASSERT(inited == INDEX);
+ ha_statistic_increment(&SSV::ha_read_next_count);
+ /*
+ TODO: Delete this loop in Maria 1.5 as versioning will ensure this never
+ happens
+ */
+ do
+ {
+ error= maria_rnext_same(file,buf);
+ } while (error == HA_ERR_RECORD_DELETED);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+/*
+  Begin an index scan on index 'idx'.  If an index condition was pushed
+  down for this index, install the ICP callback.  'sorted' is unused here.
+*/
+int ha_maria::index_init(uint idx, bool sorted)
+{
+ active_index=idx;
+ if (pushed_idx_cond_keyno == idx)
+ ma_set_index_cond_func(file, index_cond_func_maria, this);
+ return 0;
+}
+
+
+/*
+  End the index scan: clear the ICP callback, drop the in-range flag and
+  close any multi-range-read context.
+*/
+int ha_maria::index_end()
+{
+ active_index=MAX_KEY;
+ ma_set_index_cond_func(file, NULL, 0);
+ in_range_check_pushed_down= FALSE;
+ ds_mrr.dsmrr_close();
+ return 0;
+}
+
+
+/*
+  Prepare a table scan.  For a full scan, initialize the Maria scan state;
+  otherwise just reset the handler (frees cached buffers).
+*/
+int ha_maria::rnd_init(bool scan)
+{
+ if (scan)
+ return maria_scan_init(file);
+ return maria_reset(file); // Free buffers
+}
+
+
+/* End a table scan; closes MRR context and any active Maria scan. */
+int ha_maria::rnd_end()
+{
+ ds_mrr.dsmrr_close();
+ /* Safe to call even if we don't have started a scan */
+ maria_scan_end(file);
+ return 0;
+}
+
+
+/* Fetch the next row of a full table scan into 'buf'. */
+int ha_maria::rnd_next(uchar *buf)
+{
+ ha_statistic_increment(&SSV::ha_read_rnd_next_count);
+ int error= maria_scan(file, buf);
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+/* Save the current scan position into this->remember_pos. */
+int ha_maria::remember_rnd_pos()
+{
+ return (*file->s->scan_remember_pos)(file, &remember_pos);
+}
+
+
+/* Restore a previously remembered scan position, then read the next row. */
+int ha_maria::restart_rnd_next(uchar *buf)
+{
+ (*file->s->scan_restore_pos)(file, remember_pos);
+ return rnd_next(buf);
+}
+
+
+/* Random-position read: 'pos' is a row reference stored by position(). */
+int ha_maria::rnd_pos(uchar *buf, uchar *pos)
+{
+ ha_statistic_increment(&SSV::ha_read_rnd_count);
+ int error= maria_rrnd(file, buf, my_get_ptr(pos, ref_length));
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+/* Store the current row's position into 'ref' for later rnd_pos() reads. */
+void ha_maria::position(const uchar *record)
+{
+ my_off_t row_position= maria_position(file);
+ my_store_ptr(ref, ref_length, row_position);
+}
+
+
+/* Convenience overload: lock the share mutex unless this is a tmp table. */
+int ha_maria::info(uint flag)
+{
+ return info(flag, table->s->tmp_table == NO_TMP_TABLE);
+}
+
+/*
+  Fill the handler statistics from maria_status().  'flag' selects which
+  groups (HA_STATUS_VARIABLE / CONST / ERRKEY) are refreshed;
+  'lock_table_share' guards the updates of the shared TABLE_SHARE fields.
+*/
+int ha_maria::info(uint flag, my_bool lock_table_share)
+{
+ MARIA_INFO maria_info;
+ char name_buff[FN_REFLEN];
+
+ (void) maria_status(file, &maria_info, flag);
+ if (flag & HA_STATUS_VARIABLE)
+ {
+ stats.records= maria_info.records;
+ stats.deleted= maria_info.deleted;
+ stats.data_file_length= maria_info.data_file_length;
+ stats.index_file_length= maria_info.index_file_length;
+ stats.delete_length= maria_info.delete_length;
+ stats.check_time= maria_info.check_time;
+ stats.mean_rec_length= maria_info.mean_reclength;
+ }
+ if (flag & HA_STATUS_CONST)
+ {
+ TABLE_SHARE *share= table->s;
+ stats.max_data_file_length= maria_info.max_data_file_length;
+ stats.max_index_file_length= maria_info.max_index_file_length;
+ stats.create_time= maria_info.create_time;
+ ref_length= maria_info.reflength;
+ share->db_options_in_use= maria_info.options;
+ stats.block_size= maria_block_size;
+ stats.mrr_length_per_rec= maria_info.reflength + 8; // 8 = max(sizeof(void *))
+
+ /* Update share */
+ if (lock_table_share)
+ pthread_mutex_lock(&share->mutex);
+ share->keys_in_use.set_prefix(share->keys);
+ share->keys_in_use.intersect_extended(maria_info.key_map);
+ share->keys_for_keyread.intersect(share->keys_in_use);
+ share->db_record_offset= maria_info.record_offset;
+ if (share->key_parts)
+ {
+ /* Copy rec_per_key statistics, rounding double -> ulong. */
+ ulong *to= table->key_info[0].rec_per_key, *end;
+ double *from= maria_info.rec_per_key;
+ for (end= to+ share->key_parts ; to < end ; to++, from++)
+ *to= (ulong) (*from + 0.5);
+ }
+ if (lock_table_share)
+ pthread_mutex_unlock(&share->mutex);
+
+ /*
+ Set data_file_name and index_file_name to point at the symlink value
+ if table is symlinked (Ie; Real name is not same as generated name)
+ */
+ data_file_name= index_file_name= 0;
+ fn_format(name_buff, file->s->open_file_name.str, "", MARIA_NAME_DEXT,
+ MY_APPEND_EXT | MY_UNPACK_FILENAME);
+ if (strcmp(name_buff, maria_info.data_file_name))
+ data_file_name =maria_info.data_file_name;
+ fn_format(name_buff, file->s->open_file_name.str, "", MARIA_NAME_IEXT,
+ MY_APPEND_EXT | MY_UNPACK_FILENAME);
+ if (strcmp(name_buff, maria_info.index_file_name))
+ index_file_name=maria_info.index_file_name;
+ }
+ if (flag & HA_STATUS_ERRKEY)
+ {
+ errkey= maria_info.errkey;
+ my_store_ptr(dup_ref, ref_length, maria_info.dup_key_pos);
+ }
+ /* Faster to always update, than to do it based on flag */
+ stats.update_time= maria_info.update_time;
+ stats.auto_increment_value= maria_info.auto_increment;
+
+ return 0;
+}
+
+
+/*
+  Forward an HA_EXTRA_* hint to maria_extra().  For PREPARE_FOR_DROP/RENAME
+  on transactional tables, file->trn is temporarily set (see comment below)
+  and restored before returning.
+*/
+int ha_maria::extra(enum ha_extra_function operation)
+{
+ int tmp;
+ TRN *old_trn= file->trn;
+ if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_KEYREAD)
+ return 0;
+#ifdef NOT_USED
+ if (operation == HA_EXTRA_MMAP && !opt_maria_use_mmap)
+ return 0;
+#endif
+
+ /*
+ We have to set file->trn here because in some cases we call
+ extern_lock(F_UNLOCK) (which resets file->trn) followed by maria_close()
+ without calling commit/rollback in between. If file->trn is not set
+ we can't remove file->share from the transaction list in the extra() call.
+
+ table->in_use is not set in the case this is a done as part of closefrm()
+ as part of drop table.
+ */
+
+ if (file->s->now_transactional && !file->trn && table->in_use &&
+ (operation == HA_EXTRA_PREPARE_FOR_DROP ||
+ operation == HA_EXTRA_PREPARE_FOR_RENAME))
+ {
+ THD *thd= table->in_use;
+ TRN *trn= THD_TRN;
+ _ma_set_trn_for_table(file, trn);
+ }
+ tmp= maria_extra(file, operation, 0);
+ file->trn= old_trn; // Reset trn if was used
+ return tmp;
+}
+
+/*
+  Reset the handler between statements: drop any pushed index condition,
+  close the MRR context, clear the statement-logged flag on the transaction
+  and free Maria-side buffers via maria_reset().
+*/
+int ha_maria::reset(void)
+{
+ pushed_idx_cond= NULL;
+ pushed_idx_cond_keyno= MAX_KEY;
+ ma_set_index_cond_func(file, NULL, 0);
+ ds_mrr.dsmrr_close();
+ if (file->trn)
+ {
+ /* Next statement is a new statement. Ensure it's logged */
+ trnman_set_flags(file->trn,
+ trnman_get_flags(file->trn) & ~TRN_STATE_INFO_LOGGED);
+ }
+ return maria_reset(file);
+}
+
+/* To be used with WRITE_CACHE and EXTRA_CACHE */
+
+/* Like extra(), but passes a cache size argument down to maria_extra(). */
+int ha_maria::extra_opt(enum ha_extra_function operation, ulong cache_size)
+{
+ if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_WRITE_CACHE)
+ return 0;
+ return maria_extra(file, operation, (void*) &cache_size);
+}
+
+
+/*
+  TRUNCATE-style fast delete of all rows.  Refused (HA_ERR_WRONG_COMMAND,
+  so the server falls back to row-by-row delete) when the table is
+  transactional and we are inside a multi-statement transaction or under
+  LOCK TABLES, because the bulk delete could not be rolled back.
+*/
+int ha_maria::delete_all_rows()
+{
+ THD *thd= current_thd;
+ (void) translog_log_debug_info(file->trn, LOGREC_DEBUG_INFO_QUERY,
+ (uchar*) thd->query(), thd->query_length());
+ if (file->s->now_transactional &&
+ ((table->in_use->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) ||
+ table->in_use->locked_tables))
+ {
+ /*
+ We are not in autocommit mode or user have done LOCK TABLES.
+ We must do the delete row by row to be able to rollback the command
+ */
+ return HA_ERR_WRONG_COMMAND;
+ }
+ return maria_delete_all_rows(file);
+}
+
+
+/* Drop the table files, logging the query text to the translog first. */
+int ha_maria::delete_table(const char *name)
+{
+ THD *thd= current_thd;
+ (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+ (uchar*) thd->query(), thd->query_length());
+ return maria_delete_table(name);
+}
+
+
+/* This is mainly for temporary tables, so no logging necessary */
+
+/* Close the handler, then remove the table files (errors ignored). */
+void ha_maria::drop_table(const char *name)
+{
+ (void) close();
+ (void) maria_delete_table(name);
+}
+
+
+/**
+  Acquire/release the statement-level lock on the table.
+
+  For transactional (born_transactional) tables this also maintains the
+  per-transaction locked-table count, disables/re-enables logging for
+  tables used with thd->transaction.on == 0, and on the final unlock of an
+  autocommit transaction commits it (MARIA_CANNOT_ROLLBACK build) or rolls
+  it back.  Finally delegates to maria_lock_database(); tmp tables use
+  F_EXTRA_LCK instead of a real lock.
+
+  @param thd        connection doing the (un)lock
+  @param lock_type  F_RDLCK / F_WRLCK, or F_UNLCK to release
+  @return 0 on success, non-zero on error
+*/
+int ha_maria::external_lock(THD *thd, int lock_type)
+{
+ DBUG_ENTER("ha_maria::external_lock");
+ /*
+ We don't test now_transactional because it may vary between lock/unlock
+ and thus confuse our reference counting.
+ It is critical to skip non-transactional tables: user-visible temporary
+ tables get an external_lock() when read/written for the first time, but no
+ corresponding unlock (they just stay locked and are later dropped while
+ locked); if a tmp table was transactional, "SELECT FROM non_tmp, tmp"
+ would never commit as its "locked_tables" count would stay 1.
+ When Maria has has_transactions()==TRUE, open_temporary_table()
+ (sql_base.cc) will use TRANSACTIONAL_TMP_TABLE and thus the
+ external_lock(F_UNLCK) will happen and we can then allow the user to
+ create transactional temporary tables.
+ */
+ if (file->s->base.born_transactional)
+ {
+ /* Transactional table */
+ if (lock_type != F_UNLCK)
+ {
+ file->external_ptr= thd; // For maria_register_trn()
+
+ if (!file->s->lock_key_trees) // If we don't use versioning
+ {
+ /*
+ We come here in the following cases:
+ - The table is a temporary table
+ - It's a table which is crash safe but not yet versioned, for
+ example a table with fulltext or rtree keys
+
+ Set the current state to point to save_state so that the
+ block_format code don't count the same record twice.
+ Copy also the current state. This may have been wrong if the
+ same file was used several times in the last statement
+ */
+ file->state= file->state_start;
+ *file->state= file->s->state.state;
+ }
+
+ if (file->trn)
+ {
+ /* This can only happen with tables created with clone() */
+ DBUG_ASSERT(cloned);
+ trnman_increment_locked_tables(file->trn);
+ }
+
+ if (!thd->transaction.on)
+ {
+ /*
+ No need to log REDOs/UNDOs. If this is an internal temporary table
+ which will be renamed to a permanent table (like in ALTER TABLE),
+ the rename happens after unlocking so will be durable (and the table
+ will get its create_rename_lsn).
+ Note: if we wanted to enable users to have an old backup and apply
+ tons of archived logs to roll-forward, we could then not disable
+ REDOs/UNDOs in this case.
+ */
+ DBUG_PRINT("info", ("Disabling logging for table"));
+ _ma_tmp_disable_logging_for_table(file, TRUE);
+ }
+ }
+ else
+ {
+ TRN *trn= THD_TRN;
+ /* End of transaction */
+
+ /*
+ We always re-enable, don't rely on thd->transaction.on as it is
+ sometimes reset to true after unlocking (see mysql_truncate() for a
+ partitioned table based on Maria).
+ Note that we can come here without having an exclusive lock on the
+ table, for example in this case:
+ external_lock(F_(WR|RD)LCK); thr_lock() which fails due to lock
+ abortion; external_lock(F_UNLCK). Fortunately, the re-enabling happens
+ only if we were the thread which disabled logging.
+ */
+ if (_ma_reenable_logging_for_table(file, TRUE))
+ DBUG_RETURN(1);
+ /** @todo zero file->trn also in commit and rollback */
+ _ma_set_trn_for_table(file, NULL); // Safety
+ /*
+ Ensure that file->state points to the current number of rows. This
+ is needed if someone calls maria_info() without first doing an
+ external lock of the table
+ */
+ file->state= &file->s->state.state;
+ if (trn)
+ {
+ DBUG_PRINT("info",
+ ("locked_tables: %u", trnman_has_locked_tables(trn)));
+ if (trnman_has_locked_tables(trn) &&
+ !trnman_decrement_locked_tables(trn))
+ {
+ /*
+ OK should not have been sent to client yet (ACID).
+ This is a bit excessive, ACID requires this only if there are some
+ changes to commit (rollback shouldn't be tested).
+ */
+ DBUG_ASSERT(!thd->main_da.is_sent ||
+ thd->killed == THD::KILL_CONNECTION);
+ /* autocommit ? rollback a transaction */
+#ifdef MARIA_CANNOT_ROLLBACK
+ if (ma_commit(trn))
+ DBUG_RETURN(1);
+ THD_TRN= 0;
+#else
+ if (!(thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
+ {
+ trnman_rollback_trn(trn);
+ DBUG_PRINT("info", ("THD_TRN set to 0x0"));
+ THD_TRN= 0;
+ }
+#endif
+ }
+ trnman_set_flags(trn, trnman_get_flags(trn) & ~ TRN_STATE_INFO_LOGGED);
+ }
+ }
+ } /* if transactional table */
+ DBUG_RETURN(maria_lock_database(file, !table->s->tmp_table ?
+ lock_type : ((lock_type == F_UNLCK) ?
+ F_UNLCK : F_EXTRA_LCK)));
+}
+
+/*
+  Called at the start of each statement on an already externally-locked
+  table (e.g. under LOCK TABLES).  Re-binds file->trn to the connection's
+  current transaction (a previous implicit commit may have left a stale
+  pointer) and starts a new statement in the transaction manager.
+*/
+int ha_maria::start_stmt(THD *thd, thr_lock_type lock_type)
+{
+ TRN *trn;
+ if (file->s->base.born_transactional)
+ {
+ trn= THD_TRN;
+ DBUG_ASSERT(trn); // this may be called only after external_lock()
+ DBUG_ASSERT(trnman_has_locked_tables(trn));
+ DBUG_ASSERT(lock_type != TL_UNLOCK);
+ DBUG_ASSERT(file->trn == trn);
+
+ /*
+ If there was an implicit commit under this LOCK TABLES by a previous
+ statement (like a DDL), at least if that previous statement was about a
+ different ha_maria than 'this' then this->file->trn is a stale
+ pointer. We fix it:
+ */
+ _ma_set_trn_for_table(file, trn);
+ /*
+ As external_lock() was already called, don't increment locked_tables.
+ Note that we call the function below possibly several times when
+ statement starts (once per table). This is ok as long as that function
+ does cheap operations. Otherwise, we will need to do it only on first
+ call to start_stmt().
+ */
+ trnman_new_statement(trn);
+
+#ifdef EXTRA_DEBUG
+ if (!(trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED) &&
+ trnman_get_flags(trn) & TRN_STATE_TABLES_CAN_CHANGE)
+ {
+ trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED);
+ (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY,
+ (uchar*) thd->query(),
+ thd->query_length());
+ }
+#endif
+ }
+ return 0;
+}
+
+
+/**
+  Performs an implicit commit of the Maria transaction and creates a new
+  one.
+
+  This can be considered a hack. When Maria loses HA_NO_TRANSACTIONS it will
+  be participant in the connection's transaction and so the implicit commits
+  (ha_commit()) (like in end_active_trans()) will do the implicit commit
+  without need to call this function which can then be removed.
+
+  @param thd THD object
+  @param new_trn if a new transaction should be created; a new
+                 transaction is not needed when we know that the
+                 tables will be unlocked very soon.
+*/
+
+int ha_maria::implicit_commit(THD *thd, bool new_trn)
+{
+#ifndef MARIA_CANNOT_ROLLBACK
+#error this method should be removed
+#endif
+ TRN *trn;
+ int error= 0;
+ TABLE *table;
+ DBUG_ENTER("ha_maria::implicit_commit");
+ if (!new_trn && thd->locked_tables)
+ {
+ /*
+ "we are under LOCK TABLES" <=> "we shouldn't commit".
+ As thd->locked_tables is true, we are either under LOCK TABLES, or in
+ prelocking; prelocking can be under LOCK TABLES, or not (and in this
+ latter case only we should commit).
+ Note that we come here only at the end of the top statement
+ (dispatch_command()), we are never committing inside a sub-statement./
+ */
+ enum prelocked_mode_type prelocked_mode= thd->prelocked_mode;
+ if ((prelocked_mode == NON_PRELOCKED) ||
+ (prelocked_mode == PRELOCKED_UNDER_LOCK_TABLES))
+ {
+ DBUG_PRINT("info", ("locked_tables, skipping"));
+ DBUG_RETURN(0);
+ }
+ }
+ if ((trn= THD_TRN) != NULL)
+ {
+ uint locked_tables= trnman_has_locked_tables(trn);
+ /* Commit the old transaction; its locked-table count is carried over. */
+ if (unlikely(ma_commit(trn)))
+ error= 1;
+ if (!new_trn)
+ {
+ THD_TRN= NULL;
+ goto end;
+ }
+ /*
+ We need to create a new transaction and put it in THD_TRN. Indeed,
+ tables may be under LOCK TABLES, and so they will start the next
+ statement assuming they have a trn (see ha_maria::start_stmt()).
+ */
+ trn= trnman_new_trn(& thd->transaction.wt);
+ /* This is just a commit, tables stay locked if they were: */
+ trnman_reset_locked_tables(trn, locked_tables);
+ THD_TRN= trn;
+ if (unlikely(trn == NULL))
+ error= HA_ERR_OUT_OF_MEM;
+
+ /*
+ Move all locked tables to the new transaction
+ We must do it here as otherwise file->thd and file->state may be
+ stale pointers. We can't do this in start_stmt() as we don't know
+ when we should call _ma_setup_live_state() and in some cases, like
+ in check table, we use the table without calling start_stmt().
+ */
+ for (table=thd->open_tables; table ; table=table->next)
+ {
+ if (table->db_stat && table->file->ht == maria_hton)
+ {
+ MARIA_HA *handler= ((ha_maria*) table->file)->file;
+ if (handler->s->base.born_transactional)
+ {
+ _ma_set_trn_for_table(handler, trn);
+ /* If handler uses versioning */
+ if (handler->s->lock_key_trees)
+ {
+ if (_ma_setup_live_state(handler))
+ error= HA_ERR_OUT_OF_MEM;
+ }
+ }
+ }
+ }
+ }
+end:
+ DBUG_RETURN(error);
+}
+
+
+/*
+  Register this table's lock in the statement's lock array, possibly
+  downgrading/upgrading the requested lock type:
+  - reads by non-SELECT statements under statement-based binlogging become
+    TL_READ_NO_INSERT (to keep binlog order deterministic);
+  - TL_WRITE_CONCURRENT_INSERT falls back to TL_WRITE in the cases listed
+    in the comment below.
+*/
+THR_LOCK_DATA **ha_maria::store_lock(THD *thd,
+ THR_LOCK_DATA **to,
+ enum thr_lock_type lock_type)
+{
+ /* Test if we can fix test below */
+ DBUG_ASSERT(lock_type != TL_UNLOCK &&
+ (lock_type == TL_IGNORE || file->lock.type == TL_UNLOCK));
+ if (lock_type != TL_IGNORE && file->lock.type == TL_UNLOCK)
+ {
+ const enum enum_sql_command sql_command= thd->lex->sql_command;
+ /*
+ We have to disable concurrent inserts for INSERT ... SELECT or
+ INSERT/UPDATE/DELETE with sub queries if we are using statement based
+ logging. We take the safe route here and disable this for all commands
+ that only does reading that are not SELECT.
+ */
+ if (lock_type <= TL_READ_HIGH_PRIORITY &&
+ !thd->current_stmt_binlog_row_based &&
+ (sql_command != SQLCOM_SELECT &&
+ sql_command != SQLCOM_LOCK_TABLES) &&
+ (thd->options & OPTION_BIN_LOG) &&
+ mysql_bin_log.is_open())
+ lock_type= TL_READ_NO_INSERT;
+ else if (lock_type == TL_WRITE_CONCURRENT_INSERT)
+ {
+ const enum enum_duplicates duplicates= thd->lex->duplicates;
+ /*
+ Explanation for the 3 conditions below, in order:
+
+ - Bulk insert may use repair, which will cause problems if other
+ threads try to read/insert to the table: disable versioning.
+ Note that our read of file->state->records is incorrect, as such
+ variable may have changed when we come to start_bulk_insert() (worse
+ case: we see != 0 so allow versioning, start_bulk_insert() sees 0 and
+ uses repair). This is prevented because start_bulk_insert() will not
+ try repair if we enabled versioning.
+ - INSERT SELECT ON DUPLICATE KEY UPDATE comes here with
+ TL_WRITE_CONCURRENT_INSERT but shouldn't because it can do
+ update/delete of a row and versioning doesn't support that
+ - same for LOAD DATA CONCURRENT REPLACE.
+ */
+ if ((file->state->records == 0) ||
+ (sql_command == SQLCOM_INSERT_SELECT && duplicates == DUP_UPDATE) ||
+ (sql_command == SQLCOM_LOAD && duplicates == DUP_REPLACE))
+ lock_type= TL_WRITE;
+ }
+ file->lock.type= lock_type;
+ }
+ *to++= &file->lock;
+ return to;
+}
+
+
+/*
+  Fill CREATE TABLE info (for SHOW CREATE TABLE / ALTER) from the live
+  handler state: auto_increment, symlinked file names, row format and the
+  effective page-checksum setting.
+*/
+void ha_maria::update_create_info(HA_CREATE_INFO *create_info)
+{
+ ha_maria::info(HA_STATUS_AUTO | HA_STATUS_CONST);
+ if (!(create_info->used_fields & HA_CREATE_USED_AUTO))
+ {
+ create_info->auto_increment_value= stats.auto_increment_value;
+ }
+ create_info->data_file_name= data_file_name;
+ create_info->index_file_name= index_file_name;
+ /* We need to restore the row type as Maria can change it */
+ if (create_info->row_type != ROW_TYPE_DEFAULT &&
+ !(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT))
+ create_info->row_type= get_row_type();
+ /*
+ Show always page checksums, as this can be forced with
+ maria_page_checksums variable
+ */
+ if (create_info->page_checksum == HA_CHOICE_UNDEF)
+ create_info->page_checksum=
+ (file->s->options & HA_OPTION_PAGE_CHECKSUM) ? HA_CHOICE_YES :
+ HA_CHOICE_NO;
+}
+
+
+/* Map the Maria data-file type onto the SQL-level row_type enum. */
+enum row_type ha_maria::get_row_type() const
+{
+ switch (file->s->data_file_type) {
+ case STATIC_RECORD: return ROW_TYPE_FIXED;
+ case DYNAMIC_RECORD: return ROW_TYPE_DYNAMIC;
+ case BLOCK_RECORD: return ROW_TYPE_PAGE;
+ case COMPRESSED_RECORD: return ROW_TYPE_COMPRESSED;
+ default: return ROW_TYPE_NOT_USED;
+ }
+}
+
+
+/*
+  Inverse of get_row_type(): choose the Maria data-file type for a CREATE.
+  TRANSACTIONAL=1 forces BLOCK_RECORD; otherwise FIXED/DYNAMIC map directly
+  and everything else (PAGE, DEFAULT, ...) becomes BLOCK_RECORD.
+*/
+static enum data_file_type maria_row_type(HA_CREATE_INFO *info)
+{
+ if (info->transactional == HA_CHOICE_YES)
+ return BLOCK_RECORD;
+ switch (info->row_type) {
+ case ROW_TYPE_FIXED: return STATIC_RECORD;
+ case ROW_TYPE_DYNAMIC: return DYNAMIC_RECORD;
+ default: return BLOCK_RECORD;
+ }
+}
+
+
+/**
+  Create the table files: translate the server's TABLE definition into
+  Maria key/column definitions (table2maria()), build MARIA_CREATE_INFO
+  and the HA_CREATE_* flag word, then call maria_create().
+
+  @param name            path of the table to create (without extension)
+  @param table_arg       server-side table definition
+  @param ha_create_info  CREATE TABLE options
+  @return 0 on success, non-zero error code otherwise
+*/
+int ha_maria::create(const char *name, register TABLE *table_arg,
+ HA_CREATE_INFO *ha_create_info)
+{
+ int error;
+ uint create_flags= 0, record_count, i;
+ char buff[FN_REFLEN];
+ MARIA_KEYDEF *keydef;
+ MARIA_COLUMNDEF *recinfo;
+ MARIA_CREATE_INFO create_info;
+ TABLE_SHARE *share= table_arg->s;
+ uint options= share->db_options_in_use;
+ enum data_file_type row_type;
+ THD *thd= current_thd;
+ DBUG_ENTER("ha_maria::create");
+
+ /* Any fulltext key with a parser plugin needs the SQL layer to re-open. */
+ for (i= 0; i < share->keys; i++)
+ {
+ if (table_arg->key_info[i].flags & HA_USES_PARSER)
+ {
+ create_flags|= HA_CREATE_RELIES_ON_SQL_LAYER;
+ break;
+ }
+ }
+ /* Note: BLOCK_RECORD is used if table is transactional */
+ row_type= maria_row_type(ha_create_info);
+ if (ha_create_info->transactional == HA_CHOICE_YES &&
+ ha_create_info->row_type != ROW_TYPE_PAGE &&
+ ha_create_info->row_type != ROW_TYPE_NOT_USED &&
+ ha_create_info->row_type != ROW_TYPE_DEFAULT)
+ push_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "Row format set to PAGE because of TRANSACTIONAL=1 option");
+
+ bzero((char*) &create_info, sizeof(create_info));
+ if ((error= table2maria(table_arg, row_type, &keydef, &recinfo,
+ &record_count, &create_info)))
+ DBUG_RETURN(error); /* purecov: inspected */
+ create_info.max_rows= share->max_rows;
+ create_info.reloc_rows= share->min_rows;
+ create_info.with_auto_increment= share->next_number_key_offset == 0;
+ create_info.auto_increment= (ha_create_info->auto_increment_value ?
+ ha_create_info->auto_increment_value -1 :
+ (ulonglong) 0);
+ create_info.data_file_length= ((ulonglong) share->max_rows *
+ share->avg_row_length);
+ create_info.data_file_name= ha_create_info->data_file_name;
+ create_info.index_file_name= ha_create_info->index_file_name;
+ create_info.language= share->table_charset->number;
+
+ /*
+ Table is transactional:
+ - If the user specify that table is transactional (in this case
+ row type is forced to BLOCK_RECORD)
+ - If they specify BLOCK_RECORD without specifying transactional behaviour
+
+ Shouldn't this test be pushed down to maria_create()? Because currently,
+ ma_test1 -T crashes: it creates a table with DYNAMIC_RECORD but has
+ born_transactional==1, which confuses some recovery-related code.
+ */
+ create_info.transactional= (row_type == BLOCK_RECORD &&
+ ha_create_info->transactional != HA_CHOICE_NO);
+
+ if (ha_create_info->options & HA_LEX_CREATE_TMP_TABLE)
+ create_flags|= HA_CREATE_TMP_TABLE;
+ if (ha_create_info->options & HA_CREATE_KEEP_FILES)
+ create_flags|= HA_CREATE_KEEP_FILES;
+ if (options & HA_OPTION_PACK_RECORD)
+ create_flags|= HA_PACK_RECORD;
+ if (options & HA_OPTION_CHECKSUM)
+ create_flags|= HA_CREATE_CHECKSUM;
+ if (options & HA_OPTION_DELAY_KEY_WRITE)
+ create_flags|= HA_CREATE_DELAY_KEY_WRITE;
+ if ((ha_create_info->page_checksum == HA_CHOICE_UNDEF &&
+ maria_page_checksums) ||
+ ha_create_info->page_checksum == HA_CHOICE_YES)
+ create_flags|= HA_CREATE_PAGE_CHECKSUM;
+
+ (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+ (uchar*) thd->query(), thd->query_length());
+
+ /* TODO: Check that the following fn_format is really needed */
+ error=
+ maria_create(fn_format(buff, name, "", "",
+ MY_UNPACK_FILENAME | MY_APPEND_EXT),
+ row_type, share->keys, keydef,
+ record_count, recinfo,
+ 0, (MARIA_UNIQUEDEF *) 0,
+ &create_info, create_flags);
+
+ my_free((uchar*) recinfo, MYF(0));
+ DBUG_RETURN(error);
+}
+
+
+/* Rename the table files, logging the query text to the translog first. */
+int ha_maria::rename_table(const char *from, const char *to)
+{
+ THD *thd= current_thd;
+ (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+ (uchar*) thd->query(), thd->query_length());
+ return maria_rename(from, to);
+}
+
+
+/*
+  Reserve auto_increment values.  When the auto_increment column is the
+  first key part, the cached counter is used and the whole remaining range
+  is reserved (table-level lock).  Otherwise the maximum existing value for
+  the current key prefix is looked up with a PREFIX_LAST key read and only
+  one value is reserved (see trailing comment).
+*/
+void ha_maria::get_auto_increment(ulonglong offset, ulonglong increment,
+ ulonglong nb_desired_values,
+ ulonglong *first_value,
+ ulonglong *nb_reserved_values)
+{
+ ulonglong nr;
+ int error;
+ uchar key[HA_MAX_KEY_LENGTH];
+
+ if (!table->s->next_number_key_offset)
+ { // Autoincrement at key-start
+ ha_maria::info(HA_STATUS_AUTO);
+ *first_value= stats.auto_increment_value;
+ /* Maria has only table-level lock for now, so reserves to +inf */
+ *nb_reserved_values= ULONGLONG_MAX;
+ return;
+ }
+
+ /* it's safe to call the following if bulk_insert isn't on */
+ maria_flush_bulk_insert(file, table->s->next_number_index);
+
+ (void) extra(HA_EXTRA_KEYREAD);
+ key_copy(key, table->record[0],
+ table->key_info + table->s->next_number_index,
+ table->s->next_number_key_offset);
+ error= maria_rkey(file, table->record[1], (int) table->s->next_number_index,
+ key, make_prev_keypart_map(table->s->next_number_keypart),
+ HA_READ_PREFIX_LAST);
+ if (error)
+ nr= 1;
+ else
+ {
+ /* Get data from record[1] */
+ nr= ((ulonglong) table->next_number_field->
+ val_int_offset(table->s->rec_buff_length) + 1);
+ }
+ extra(HA_EXTRA_NO_KEYREAD);
+ *first_value= nr;
+ /*
+ MySQL needs to call us for next row: assume we are inserting ("a",null)
+ here, we return 3, and next this statement will want to insert ("b",null):
+ there is no reason why ("b",3+1) would be the good row to insert: maybe it
+ already exists, maybe 3+1 is too large...
+ */
+ *nb_reserved_values= 1;
+}
+
+
+/*
+  Find out how many rows there is in the given range
+
+  SYNOPSIS
+    records_in_range()
+    inx Index to use
+    min_key Start of range. Null pointer if from first key
+    max_key End of range. Null pointer if to last key
+
+  NOTES
+    min_key.flag can have one of the following values:
+      HA_READ_KEY_EXACT Include the key in the range
+      HA_READ_AFTER_KEY Don't include key in range
+
+    max_key.flag can have one of the following values:
+      HA_READ_BEFORE_KEY Don't include key in range
+      HA_READ_AFTER_KEY Include all 'end_key' values in the range
+
+  RETURN
+    HA_POS_ERROR Something is wrong with the index tree.
+    0 There is no matching keys in the given range
+    number > 0 There is approximately 'number' matching rows in
+    the range.
+*/
+
+/* Thin wrapper; the estimate itself is computed by maria_records_in_range(). */
+ha_rows ha_maria::records_in_range(uint inx, key_range *min_key,
+ key_range *max_key)
+{
+ return (ha_rows) maria_records_in_range(file, (int) inx, min_key, max_key);
+}
+
+
+/*
+  Read the next row of a fulltext search result via the FT handler's
+  read_next() method.  Returns -1 if no fulltext search is active.
+*/
+int ha_maria::ft_read(uchar * buf)
+{
+ int error;
+
+ if (!ft_handler)
+ return -1;
+
+ thread_safe_increment(table->in_use->status_var.ha_read_next_count,
+ &LOCK_status); // why ?
+
+ error= ft_handler->please->read_next(ft_handler, (char*) buf);
+
+ table->status= error ? STATUS_NOT_FOUND : 0;
+ return error;
+}
+
+
+/* Return the live table checksum maintained in the handler state. */
+uint ha_maria::checksum() const
+{
+ return (uint) file->state->checksum;
+}
+
+
+/*
+  Decide whether ALTER TABLE can reuse the existing data files.
+  Returns COMPATIBLE_DATA_NO if anything affecting the on-disk format
+  changed (auto_increment, file symlinks, row format, pack lengths,
+  checksum or delay-key-write options), otherwise COMPATIBLE_DATA_YES.
+*/
+bool ha_maria::check_if_incompatible_data(HA_CREATE_INFO *create_info,
+ uint table_changes)
+{
+ DBUG_ENTER("check_if_incompatible_data");
+ uint options= table->s->db_options_in_use;
+
+ if (create_info->auto_increment_value != stats.auto_increment_value ||
+ create_info->data_file_name != data_file_name ||
+ create_info->index_file_name != index_file_name ||
+ (maria_row_type(create_info) != data_file_type &&
+ create_info->row_type != ROW_TYPE_DEFAULT) ||
+ table_changes == IS_EQUAL_NO ||
+ (table_changes & IS_EQUAL_PACK_LENGTH)) // Not implemented yet
+ DBUG_RETURN(COMPATIBLE_DATA_NO);
+
+ if ((options & (HA_OPTION_CHECKSUM |
+ HA_OPTION_DELAY_KEY_WRITE)) !=
+ (create_info->table_options & (HA_OPTION_CHECKSUM |
+ HA_OPTION_DELAY_KEY_WRITE)))
+ DBUG_RETURN(COMPATIBLE_DATA_NO);
+ DBUG_RETURN(COMPATIBLE_DATA_YES);
+}
+
+
+/*
+  handlerton panic hook: flush everything.  If no background checkpointing
+  is configured, run a full checkpoint first, then maria_panic().
+*/
+static int maria_hton_panic(handlerton *hton, ha_panic_function flag)
+{
+ /* If no background checkpoints, we need to do one now */
+ return ((checkpoint_interval == 0) ?
+ ma_checkpoint_execute(CHECKPOINT_FULL, FALSE) : 0) | maria_panic(flag);
+}
+
+
+/*
+  handlerton commit hook.  Commits the Maria transaction when 'all' is set
+  or autocommit is active; otherwise this is just the end of a statement.
+  NOTE(review): trn is dereferenced without a NULL check — presumably the
+  server only calls this when THD_TRN is set; verify against callers.
+*/
+static int maria_commit(handlerton *hton __attribute__ ((unused)),
+ THD *thd, bool all)
+{
+ TRN *trn= THD_TRN;
+ DBUG_ENTER("maria_commit");
+ trnman_reset_locked_tables(trn, 0);
+ trnman_set_flags(trn, trnman_get_flags(trn) & ~TRN_STATE_INFO_LOGGED);
+
+ /* statement or transaction ? */
+ if ((thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && !all)
+ DBUG_RETURN(0); // end of statement
+ DBUG_PRINT("info", ("THD_TRN set to 0x0"));
+ THD_TRN= 0;
+ DBUG_RETURN(ma_commit(trn)); // end of transaction
+}
+
+
+/*
+  handlerton rollback hook.  Rolls back only the statement when inside a
+  multi-statement transaction and 'all' is false; otherwise rolls back the
+  whole transaction and clears THD_TRN.
+  NOTE(review): like maria_commit(), trn is used without a NULL check —
+  confirm the server guarantees THD_TRN here.
+*/
+static int maria_rollback(handlerton *hton __attribute__ ((unused)),
+ THD *thd, bool all)
+{
+ TRN *trn= THD_TRN;
+ DBUG_ENTER("maria_rollback");
+ trnman_reset_locked_tables(trn, 0);
+ /* statement or transaction ? */
+ if ((thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && !all)
+ {
+ trnman_rollback_statement(trn);
+ DBUG_RETURN(0); // end of statement
+ }
+ DBUG_PRINT("info", ("THD_TRN set to 0x0"));
+ THD_TRN= 0;
+ DBUG_RETURN(trnman_rollback_trn(trn) ?
+ HA_ERR_OUT_OF_MEM : 0); // end of transaction
+}
+
+
+
+/**
+  @brief flush log handler
+
+  @param hton maria handlerton (unused)
+
+  @retval FALSE OK
+  @retval TRUE Error
+*/
+
+/* Implemented as a purge of no-longer-needed translog files. */
+bool maria_flush_logs(handlerton *hton)
+{
+ return test(translog_purge_at_flush());
+}
+
+
+/* Buffer size for one "Size ... ; filename" status line. */
+#define SHOW_MSG_LEN (FN_REFLEN + 20)
+/**
+  @brief show status handler
+
+  @param hton maria handlerton
+  @param thd thread handler
+  @param print print function
+  @param stat type of status
+*/
+
+bool maria_show_status(handlerton *hton,
+ THD *thd,
+ stat_print_fn *print,
+ enum ha_stat_type stat)
+{
+ const LEX_STRING *engine_name= hton_name(hton);
+ switch (stat) {
+ case HA_ENGINE_LOGS:
+ {
+ /* List every translog file with its size and whether it is still needed. */
+ TRANSLOG_ADDRESS horizon= translog_get_horizon();
+ uint32 last_file= LSN_FILE_NO(horizon);
+ uint32 first_needed= translog_get_first_needed_file();
+ uint32 first_file= translog_get_first_file(horizon);
+ uint32 i;
+ const char unknown[]= "unknown";
+ const char needed[]= "in use";
+ const char unneeded[]= "free";
+ char path[FN_REFLEN];
+
+ if (first_file == 0)
+ {
+ const char error[]= "error";
+ print(thd, engine_name->str, engine_name->length,
+ STRING_WITH_LEN(""), error, sizeof(error) - 1);
+ break;
+ }
+
+ for (i= first_file; i <= last_file; i++)
+ {
+ char *file;
+ const char *status;
+ uint length, status_len;
+ MY_STAT stat_buff, *stat;
+ const char error[]= "can't stat";
+ char object[SHOW_MSG_LEN];
+ file= translog_filename_by_fileno(i, path);
+ if (!(stat= my_stat(file, &stat_buff, MYF(0))))
+ {
+ status= error;
+ status_len= sizeof(error) - 1;
+ length= my_snprintf(object, SHOW_MSG_LEN, "Size unknown ; %s", file);
+ }
+ else
+ {
+ /* first_needed == 0 means the purge horizon is not known yet. */
+ if (first_needed == 0)
+ {
+ status= unknown;
+ status_len= sizeof(unknown) - 1;
+ }
+ else if (i < first_needed)
+ {
+ status= unneeded;
+ status_len= sizeof(unneeded) - 1;
+ }
+ else
+ {
+ status= needed;
+ status_len= sizeof(needed) - 1;
+ }
+ length= my_snprintf(object, SHOW_MSG_LEN, "Size %12lu ; %s",
+ (ulong) stat->st_size, file);
+ }
+
+ print(thd, engine_name->str, engine_name->length,
+ object, length, status, status_len);
+ }
+ break;
+ }
+ case HA_ENGINE_STATUS:
+ case HA_ENGINE_MUTEX:
+ default:
+ break;
+ }
+ return 0;
+}
+
+
+/**
+ Callback to delete all logs in directory. This is lower-level than other
+ functions in ma_loghandler.c which delete logs, as it does not rely on
+ translog_init() having been called first.
+
+ @param directory directory where file is
+ @param filename base name of the file to delete
+
+ @return result of my_delete(): 0 on success, non-zero on failure
+ (my_delete itself reports the error via MY_WME)
+*/
+
+static my_bool translog_callback_delete_all(const char *directory,
+ const char *filename)
+{
+ char complete_name[FN_REFLEN];
+ fn_format(complete_name, filename, directory, "", MYF(MY_UNPACK_FILENAME));
+ return my_delete(complete_name, MYF(MY_WME));
+}
+
+
+/**
+ Helper function for option aria-force-start-after-recovery-failures.
+ Deletes logs if too many failures. Otherwise, increments the counter of
+ failures in the control file.
+ Notice how this has to be called _before_ translog_init() (if log is
+ corrupted, translog_init() might crash the server, so we need to remove logs
+ before).
+
+ @param log_dir directory where logs to be deleted are
+
+ @return 0 on success, non-zero on error
+*/
+
+static int mark_recovery_start(const char* log_dir)
+{
+ int res;
+ DBUG_ENTER("mark_recovery_start");
+ if (unlikely(maria_recover_options == HA_RECOVER_NONE))
+ ma_message_no_user(ME_JUST_WARNING, "Please consider using option"
+ " --aria-recover[=...] to automatically check and"
+ " repair tables when logs are removed by option"
+ " --aria-force-start-after-recovery-failures=#");
+ if (recovery_failures >= force_start_after_recovery_failures)
+ {
+ /*
+ Remove logs which cause the problem; keep control file which has
+ critical info like uuid, max_trid (removing control file may make
+ correct tables look corrupted!).
+ */
+ char msg[100];
+ res= translog_walk_filenames(log_dir, &translog_callback_delete_all);
+ my_snprintf(msg, sizeof(msg),
+ "%s logs after %u consecutive failures of"
+ " recovery from logs",
+ (res ? "failed to remove some" : "removed all"),
+ recovery_failures);
+ ma_message_no_user((res ? 0 : ME_JUST_WARNING), msg);
+ }
+ else
+ /* not yet over the limit: just record one more recovery attempt */
+ res= ma_control_file_write_and_force(last_checkpoint_lsn, last_logno,
+ max_trid_in_control_file,
+ recovery_failures + 1);
+ DBUG_RETURN(res);
+}
+
+
+/**
+ Helper function for option aria-force-start-after-recovery-failures.
+ Records in the control file that recovery was a success, so that it's not
+ counted for aria-force-start-after-recovery-failures.
+
+ @return 0 on success, non-zero on error from the control file write
+*/
+
+static int mark_recovery_success(void)
+{
+ /* success of recovery, reset recovery_failures: */
+ int res;
+ DBUG_ENTER("mark_recovery_success");
+ res= ma_control_file_write_and_force(last_checkpoint_lsn, last_logno,
+ max_trid_in_control_file, 0);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Return 1 if table has changed during the current transaction
+*/
+
+bool ha_maria::is_changed() const
+{
+ /* 'changed' is tracked in the per-handler state of the open table */
+ return file->state->changed;
+}
+
+
+/**
+ Plugin initialization: fills in the handlerton, opens the control file,
+ initializes page caches and the transaction log, and runs recovery.
+ The || chain below relies on short-circuiting: each step only runs if
+ all previous steps succeeded.
+*/
+static int ha_maria_init(void *p)
+{
+ int res;
+ copy_variable_aliases();
+ const char *log_dir= maria_data_root;
+ maria_hton= (handlerton *)p;
+ maria_hton->state= SHOW_OPTION_YES;
+ maria_hton->db_type= DB_TYPE_UNKNOWN;
+ maria_hton->create= maria_create_handler;
+ maria_hton->panic= maria_hton_panic;
+ maria_hton->commit= maria_commit;
+ maria_hton->rollback= maria_rollback;
+ maria_hton->flush_logs= maria_flush_logs;
+ maria_hton->show_status= maria_show_status;
+ /* TODO: decide if we support Maria being used for log tables */
+ maria_hton->flags= HTON_CAN_RECREATE | HTON_SUPPORT_LOG_TABLES;
+ bzero(maria_log_pagecache, sizeof(*maria_log_pagecache));
+ maria_tmpdir= &mysql_tmpdir_list; /* For REDO */
+ /*
+ NOTE(review): init_pagecache() is negated unlike the other calls —
+ presumably it returns a non-zero block count on success and 0 on
+ failure; confirm against its definition.
+ */
+ res= maria_upgrade() || maria_init() || ma_control_file_open(TRUE, TRUE) ||
+ ((force_start_after_recovery_failures != 0) &&
+ mark_recovery_start(log_dir)) ||
+ !init_pagecache(maria_pagecache,
+ (size_t) pagecache_buffer_size, pagecache_division_limit,
+ pagecache_age_threshold, maria_block_size, 0) ||
+ !init_pagecache(maria_log_pagecache,
+ TRANSLOG_PAGECACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE, 0) ||
+ translog_init(maria_data_root, log_file_size,
+ MYSQL_VERSION_ID, server_id, maria_log_pagecache,
+ TRANSLOG_DEFAULT_FLAGS, 0) ||
+ maria_recovery_from_log() ||
+ ((force_start_after_recovery_failures != 0 ||
+ maria_recovery_changed_data) && mark_recovery_success()) ||
+ ma_checkpoint_init(checkpoint_interval);
+ maria_multi_threaded= maria_in_ha_maria= TRUE;
+ maria_create_trn_hook= maria_create_trn_for_mysql;
+
+#if defined(HAVE_REALPATH) && !defined(HAVE_valgrind) && !defined(HAVE_BROKEN_REALPATH)
+ /* We can only test for sub paths if my_symlink.c is using realpath */
+ maria_test_invalid_symlink= test_if_data_home_dir;
+#endif
+ /* on any failure, unregister the handlerton so the engine stays disabled */
+ if (res)
+ maria_hton= 0;
+ return res ? HA_ERR_INITIALIZATION : 0;
+}
+
+
+#ifdef HAVE_QUERY_CACHE
+/**
+ @brief Register a named table with a call back function to the query cache.
+
+ @param thd The thread handle
+ @param table_key A pointer to the table name in the table cache
+ @param key_length The length of the table name
+ @param[out] engine_callback The pointer to the storage engine call back
+ function, currently 0
+ @param[out] engine_data Engine data will be set to 0.
+
+ @note Despite the name of this function, it is used to check each statement
+ before it is cached and not to register a table or callback function.
+
+ @see handler::register_query_cache_table
+
+ @return The error code. The engine_data and engine_callback will be set to 0.
+ @retval TRUE Success
+ @retval FALSE An error occurred
+*/
+
+my_bool ha_maria::register_query_cache_table(THD *thd, char *table_name,
+ uint table_name_len,
+ qc_engine_callback
+ *engine_callback,
+ ulonglong *engine_data)
+{
+ ulonglong actual_data_file_length;
+ ulonglong current_data_file_length;
+ DBUG_ENTER("ha_maria::register_query_cache_table");
+
+ /*
+ No call back function is needed to determine if a cached statement
+ is valid or not.
+ */
+ *engine_callback= 0;
+
+ /*
+ No engine data is needed.
+ */
+ *engine_data= 0;
+
+ /*
+ Use DBUG_RETURN (not a plain return) so the DBUG call stack opened by
+ DBUG_ENTER above stays balanced in debug builds.
+ */
+ if (file->s->now_transactional && file->s->have_versioning)
+ DBUG_RETURN(file->trn->trid >= file->s->state.last_change_trn);
+
+ /*
+ If a concurrent INSERT has happened just before the currently processed
+ SELECT statement, the total size of the table is unknown.
+
+ To determine if the table size is known, the current thread's snap shot of
+ the table size with the actual table size are compared.
+
+ If the table size is unknown the SELECT statement can't be cached.
+ */
+
+ /*
+ POSIX visibility rules specify that "2. Whatever memory values a
+ thread can see when it unlocks a mutex <...> can also be seen by any
+ thread that later locks the same mutex". In this particular case,
+ concurrent insert thread had modified the data_file_length in
+ MYISAM_SHARE before it has unlocked (or even locked)
+ structure_guard_mutex. So, here we're guaranteed to see at least that
+ value after we've locked the same mutex. We can see a later value
+ (modified by some other thread) though, but it's ok, as we only want
+ to know if the variable was changed, the actual new value doesn't matter
+ */
+ actual_data_file_length= file->s->state.state.data_file_length;
+ current_data_file_length= file->state->data_file_length;
+
+ /* Return whether is ok to try to cache current statement. */
+ DBUG_RETURN(!(file->s->non_transactional_concurrent_insert &&
+ current_data_file_length != actual_data_file_length));
+}
+#endif
+
+/*
+ System variables exported by the plugin (the MYSQL_SYSVAR definitions
+ live elsewhere in this file); the array must be NULL-terminated.
+*/
+struct st_mysql_sys_var* system_variables[]= {
+ MYSQL_SYSVAR(block_size),
+ MYSQL_SYSVAR(checkpoint_interval),
+ MYSQL_SYSVAR(force_start_after_recovery_failures),
+ MYSQL_SYSVAR(group_commit),
+ MYSQL_SYSVAR(group_commit_interval),
+ MYSQL_SYSVAR(log_dir_path),
+ MYSQL_SYSVAR(log_file_size),
+ MYSQL_SYSVAR(log_purge_type),
+ MYSQL_SYSVAR(max_sort_file_size),
+ MYSQL_SYSVAR(page_checksum),
+ MYSQL_SYSVAR(pagecache_age_threshold),
+ MYSQL_SYSVAR(pagecache_buffer_size),
+ MYSQL_SYSVAR(pagecache_division_limit),
+ MYSQL_SYSVAR(recover),
+ MYSQL_SYSVAR(repair_threads),
+ MYSQL_SYSVAR(sort_buffer_size),
+ MYSQL_SYSVAR(stats_method),
+ MYSQL_SYSVAR(sync_log_dir),
+ MYSQL_SYSVAR(used_for_temp_tables),
+ NULL
+};
+
+
+/**
+ @brief Updates the checkpoint interval and restarts the background thread.
+
+ Stops the checkpoint background thread, stores the new value into the
+ variable, and restarts the thread with the new interval.
+*/
+
+static void update_checkpoint_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ma_checkpoint_end();
+ /* assignment inside the call: store the new value, then use it */
+ ma_checkpoint_init(*(ulong *)var_ptr= (ulong)(*(long *)save));
+}
+
+/**
+ @brief Updates group commit mode
+
+ Switching the mode is done in two phases: first the effects of the old
+ mode are turned off, the log is synced, and only then the new mode is
+ enabled. Statement order matters here.
+*/
+
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong value= (ulong)*((long *)var_ptr);
+ DBUG_ENTER("update_maria_group_commit");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu rate %lu",
+ value, (ulong)(*(long *)save),
+ maria_group_commit_interval));
+ /* old value: disable whatever the previous mode had enabled */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(FALSE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(FALSE);
+ if (maria_group_commit_interval)
+ translog_soft_sync_end();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ /* store the new value, then sync the log before enabling the new mode */
+ value= *(ulong *)var_ptr= (ulong)(*(long *)save);
+ translog_sync();
+ /* new value: enable the newly selected mode */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(TRUE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(TRUE);
+ /* variable change made under global lock so we can just read it */
+ if (maria_group_commit_interval)
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Updates group commit interval
+
+ In SOFT mode the soft-sync background activity has to be stopped and
+ restarted around the interval change; in NONE/HARD modes only the
+ stored value and the translog setting are updated.
+*/
+
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong new_value= (ulong)*((long *)save);
+ ulong *value_ptr= (ulong*) var_ptr;
+ DBUG_ENTER("update_maria_group_commit_interval");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu group commit %lu",
+ *value_ptr, new_value, maria_group_commit));
+
+ /* variable change made under global lock so we can just read it */
+ switch (maria_group_commit) {
+ case TRANSLOG_GCOMMIT_NONE:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ /* stop soft sync if it was running with the old non-zero interval */
+ if (*value_ptr)
+ translog_soft_sync_end();
+ translog_set_group_commit_interval(new_value);
+ /* restart soft sync only if the new interval is non-zero */
+ if ((*value_ptr= new_value))
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Updates the transaction log file limit.
+
+ Applies the new size to the translog first, then stores it into the
+ system variable.
+*/
+
+static void update_log_file_size(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ uint32 size= (uint32)((ulong)(*(long *)save));
+ translog_set_file_size(size);
+ *(ulong *)var_ptr= size;
+}
+
+
+/*
+ Engine status counters; grouped under the "Aria" SHOW_ARRAY entry
+ below, so they appear with an "Aria_" prefix in SHOW STATUS.
+*/
+SHOW_VAR status_variables[]= {
+ {"pagecache_blocks_not_flushed", (char*) &maria_pagecache_var.global_blocks_changed, SHOW_LONG_NOFLUSH},
+ {"pagecache_blocks_unused", (char*) &maria_pagecache_var.blocks_unused, SHOW_LONG_NOFLUSH},
+ {"pagecache_blocks_used", (char*) &maria_pagecache_var.blocks_used, SHOW_LONG_NOFLUSH},
+ {"pagecache_read_requests", (char*) &maria_pagecache_var.global_cache_r_requests, SHOW_LONGLONG},
+ {"pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG},
+ {"pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG},
+ {"pagecache_writes", (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG},
+ {"transaction_log_syncs", (char*) &translog_syncs, SHOW_LONGLONG},
+ {NullS, NullS, SHOW_LONG}
+};
+
+static struct st_mysql_show_var aria_status_variables[]= {
+ {"Aria", (char*) &status_variables, SHOW_ARRAY},
+ {NullS, NullS, SHOW_LONG}
+};
+
+/****************************************************************************
+ * Maria MRR implementation: use DS-MRR
+ ***************************************************************************/
+
+/** Delegate MRR initialization to the shared DS-MRR implementation. */
+int ha_maria::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+ uint n_ranges, uint mode,
+ HANDLER_BUFFER *buf)
+{
+ return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
+}
+
+/** Fetch the next row from the DS-MRR scan started by multi_range_read_init. */
+int ha_maria::multi_range_read_next(char **range_info)
+{
+ return ds_mrr.dsmrr_next(range_info);
+}
+
+/** Cost/row estimate for an MRR scan over a known list of ranges. */
+ha_rows ha_maria::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+ void *seq_init_param,
+ uint n_ranges, uint *bufsz,
+ uint *flags, COST_VECT *cost)
+{
+ /*
+ This call is here because there is no location where this->table would
+ already be known.
+ TODO: consider moving it into some per-query initialization call.
+ */
+ ds_mrr.init(this, table);
+ return ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges, bufsz,
+ flags, cost);
+}
+
+/** Cost/row estimate for an MRR scan when only range counts are known. */
+ha_rows ha_maria::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+ uint *bufsz, uint *flags,
+ COST_VECT *cost)
+{
+ /* see the comment in multi_range_read_info_const about this init call */
+ ds_mrr.init(this, table);
+ return ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost);
+}
+
+/* MyISAM MRR implementation ends */
+
+
+/* Index condition pushdown implementation*/
+
+
+/**
+ Accept an index condition for pushdown. Returns NULL, meaning the whole
+ condition was accepted (nothing is left for the server to re-check).
+ The condition function is only installed in the storage layer when the
+ pushed index is the currently active one.
+*/
+Item *ha_maria::idx_cond_push(uint keyno_arg, Item* idx_cond_arg)
+{
+ pushed_idx_cond_keyno= keyno_arg;
+ pushed_idx_cond= idx_cond_arg;
+ in_range_check_pushed_down= TRUE;
+ if (active_index == pushed_idx_cond_keyno)
+ ma_set_index_cond_func(file, index_cond_func_maria, this);
+ return NULL;
+}
+
+
+
+
+struct st_mysql_storage_engine maria_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+/*
+ Plugin descriptor for the "Aria" engine; compat_aliases (see
+ compat_aliases.cc) presumably provides the old "maria"-prefixed names.
+*/
+maria_declare_plugin(aria)
+compat_aliases,
+{
+ MYSQL_STORAGE_ENGINE_PLUGIN,
+ &maria_storage_engine,
+ "Aria",
+ "Monty Program Ab",
+ "Crash-safe tables with MyISAM heritage",
+ PLUGIN_LICENSE_GPL,
+ ha_maria_init, /* Plugin Init */
+ NULL, /* Plugin Deinit */
+ 0x0105, /* 1.5 */
+ aria_status_variables, /* status variables */
+ system_variables, /* system variables */
+ "1.5", /* string version */
+ MariaDB_PLUGIN_MATURITY_GAMMA /* maturity */
+}
+maria_declare_plugin_end;
diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h
new file mode 100644
index 00000000000..605ad1d3a20
--- /dev/null
+++ b/storage/maria/ha_maria.h
@@ -0,0 +1,197 @@
+/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef HA_MARIA_INCLUDED
+#define HA_MARIA_INCLUDED
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface /* gcc class implementation */
+#endif
+
+/* class for the maria handler */
+
+#include <maria.h>
+
+#define HA_RECOVER_NONE 0 /* No automatic recover */
+#define HA_RECOVER_DEFAULT 1 /* Automatic recover active */
+#define HA_RECOVER_BACKUP 2 /* Make a backupfile on recover */
+#define HA_RECOVER_FORCE 4 /* Recover even if we loose rows */
+#define HA_RECOVER_QUICK 8 /* Don't check rows in data file */
+
+C_MODE_START
+ICP_RESULT index_cond_func_maria(void *arg);
+C_MODE_END
+
+extern ulong maria_sort_buffer_size;
+extern TYPELIB maria_recover_typelib;
+extern ulong maria_recover_options;
+
+/**
+ Handler ("table cursor") class for the Aria engine. Each instance wraps
+ one open MARIA_HA handle from the storage layer (see <maria.h>).
+*/
+class ha_maria :public handler
+{
+ MARIA_HA *file; /* underlying storage-layer table handle */
+ ulonglong int_table_flags; /* cached flags returned by table_flags() */
+ MARIA_RECORD_POS remember_pos; /* presumably saved by remember_rnd_pos() for restart_rnd_next() — verify */
+ char *data_file_name, *index_file_name;
+ enum data_file_type data_file_type;
+ bool can_enable_indexes;
+ /**
+ If a transactional table is doing bulk insert with a single
+ UNDO_BULK_INSERT with/without repair.
+ */
+ uint8 bulk_insert_single_undo;
+ int repair(THD * thd, HA_CHECK *param, bool optimize);
+ int zerofill(THD * thd, HA_CHECK_OPT *check_opt);
+
+public:
+ ha_maria(handlerton *hton, TABLE_SHARE * table_arg);
+ ~ha_maria() {}
+ handler *clone(MEM_ROOT *mem_root);
+ const char *table_type() const
+ { return "Aria"; }
+ const char *index_type(uint key_number);
+ const char **bas_ext() const;
+ ulonglong table_flags() const
+ { return int_table_flags; }
+ /* fulltext keys support no flags; ordinary keys support full navigation */
+ ulong index_flags(uint inx, uint part, bool all_parts) const
+ {
+ return ((table_share->key_info[inx].algorithm == HA_KEY_ALG_FULLTEXT) ?
+ 0 : HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE |
+ HA_READ_ORDER | HA_KEYREAD_ONLY | HA_DO_INDEX_COND_PUSHDOWN);
+ }
+ uint max_supported_keys() const
+ { return MARIA_MAX_KEY; }
+ uint max_supported_key_length() const;
+ uint max_supported_key_part_length() const
+ { return max_supported_key_length(); }
+ enum row_type get_row_type() const;
+ uint checksum() const;
+ virtual double scan_time();
+
+ int open(const char *name, int mode, uint test_if_locked);
+ int close(void);
+ int write_row(uchar * buf);
+ int update_row(const uchar * old_data, uchar * new_data);
+ int delete_row(const uchar * buf);
+ int index_read_map(uchar * buf, const uchar * key, key_part_map keypart_map,
+ enum ha_rkey_function find_flag);
+ int index_read_idx_map(uchar * buf, uint idx, const uchar * key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag);
+ int index_read_last_map(uchar * buf, const uchar * key,
+ key_part_map keypart_map);
+ int index_next(uchar * buf);
+ int index_prev(uchar * buf);
+ int index_first(uchar * buf);
+ int index_last(uchar * buf);
+ int index_next_same(uchar * buf, const uchar * key, uint keylen);
+ /* fulltext search: ft_handler must be set up (by ft_init_ext) first */
+ int ft_init()
+ {
+ if (!ft_handler)
+ return 1;
+ ft_handler->please->reinit_search(ft_handler);
+ return 0;
+ }
+ FT_INFO *ft_init_ext(uint flags, uint inx, String * key)
+ {
+ return maria_ft_init_search(flags, file, inx,
+ (uchar *) key->ptr(), key->length(),
+ key->charset(), table->record[0]);
+ }
+ int ft_read(uchar * buf);
+ int index_init(uint idx, bool sorted);
+ int index_end();
+ int rnd_init(bool scan);
+ int rnd_end(void);
+ int rnd_next(uchar * buf);
+ int rnd_pos(uchar * buf, uchar * pos);
+ int remember_rnd_pos();
+ int restart_rnd_next(uchar * buf);
+ void position(const uchar * record);
+ int info(uint);
+ int info(uint, my_bool);
+ int extra(enum ha_extra_function operation);
+ int extra_opt(enum ha_extra_function operation, ulong cache_size);
+ int reset(void);
+ int external_lock(THD * thd, int lock_type);
+ int start_stmt(THD *thd, thr_lock_type lock_type);
+ int delete_all_rows(void);
+ int disable_indexes(uint mode);
+ int enable_indexes(uint mode);
+ int indexes_are_disabled(void);
+ void start_bulk_insert(ha_rows rows);
+ int end_bulk_insert();
+ ha_rows records_in_range(uint inx, key_range * min_key, key_range * max_key);
+ void update_create_info(HA_CREATE_INFO * create_info);
+ int create(const char *name, TABLE * form, HA_CREATE_INFO * create_info);
+ THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to,
+ enum thr_lock_type lock_type);
+ virtual void get_auto_increment(ulonglong offset, ulonglong increment,
+ ulonglong nb_desired_values,
+ ulonglong *first_value,
+ ulonglong *nb_reserved_values);
+ int rename_table(const char *from, const char *to);
+ int delete_table(const char *name);
+ void drop_table(const char *name);
+ int check(THD * thd, HA_CHECK_OPT * check_opt);
+ int analyze(THD * thd, HA_CHECK_OPT * check_opt);
+ int repair(THD * thd, HA_CHECK_OPT * check_opt);
+ bool check_and_repair(THD * thd);
+ bool is_crashed() const;
+ bool is_changed() const;
+ bool auto_repair() const { return maria_recover_options != HA_RECOVER_NONE; }
+ int optimize(THD * thd, HA_CHECK_OPT * check_opt);
+ int restore(THD * thd, HA_CHECK_OPT * check_opt);
+ int backup(THD * thd, HA_CHECK_OPT * check_opt);
+ int assign_to_keycache(THD * thd, HA_CHECK_OPT * check_opt);
+ int preload_keys(THD * thd, HA_CHECK_OPT * check_opt);
+ bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes);
+ bool check_if_supported_virtual_columns(void) { return TRUE;}
+#ifdef HAVE_REPLICATION
+ int dump(THD * thd, int fd);
+ int net_read_dump(NET * net);
+#endif
+#ifdef HAVE_QUERY_CACHE
+ my_bool register_query_cache_table(THD *thd, char *table_key,
+ uint key_length,
+ qc_engine_callback
+ *engine_callback,
+ ulonglong *engine_data);
+#endif
+ /* expose the raw storage-layer handle (used by engine-internal code) */
+ MARIA_HA *file_ptr(void)
+ {
+ return file;
+ }
+ static int implicit_commit(THD *thd, bool new_trn);
+ /**
+ * Multi Range Read interface
+ */
+ int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+ uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+ int multi_range_read_next(char **range_info);
+ ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+ void *seq_init_param,
+ uint n_ranges, uint *bufsz,
+ uint *flags, COST_VECT *cost);
+ ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+ uint *bufsz, uint *flags, COST_VECT *cost);
+
+ /* Index condition pushdown implementation */
+ Item *idx_cond_push(uint keyno, Item* idx_cond);
+private:
+ DsMrr_impl ds_mrr;
+ friend ICP_RESULT index_cond_func_maria(void *arg);
+};
+
+#endif /* HA_MARIA_INCLUDED */
diff --git a/storage/maria/lockman.c b/storage/maria/lockman.c
new file mode 100644
index 00000000000..d6d4dcd44e6
--- /dev/null
+++ b/storage/maria/lockman.c
@@ -0,0 +1,786 @@
+/* QQ: TODO - allocate everything from dynarrays !!! (benchmark) */
+/* QQ: TODO instant duration locks */
+/* QQ: #warning automatically place S instead of LS if possible */
+
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Generic Lock Manager
+
+ Lock manager handles locks on "resources", a resource must be uniquely
+ identified by a 64-bit number. Lock manager itself does not imply
+ anything about the nature of a resource - it can be a row, a table, a
+ database, or just anything.
+
+ Locks belong to "lock owners". A Lock owner is uniquely identified by a
+ 16-bit number. A function loid2lo must be provided by the application
+ that takes such a number as an argument and returns a LOCK_OWNER
+ structure.
+
+ Lock levels are completely defined by three tables. Lock compatibility
+ matrix specifies which locks can be held at the same time on a resource.
+ Lock combining matrix specifies what lock level has the same behaviour as
+ a pair of two locks of given levels. getlock_result matrix simplifies
+ intention locking and lock escalation for an application, basically it
+ defines which locks are intention locks and which locks are "loose"
+ locks. It is only used to provide better diagnostics for the
+ application, lock manager itself does not differentiate between normal,
+ intention, and loose locks.
+
+ Internally lock manager is based on a lock-free hash, see lf_hash.c for
+ details. All locks are stored in a hash, with a resource id as a search
+ key, so all locks for the same resource will be considered collisions and
+ will be put in a one (lock-free) linked list. The main lock-handling
+ logic is in the inner loop that searches for a lock in such a linked
+ list - lockfind().
+
+ This works as follows. Locks generally are added to the end of the list
+ (with one exception, see below). When scanning the list it is always
+ possible to determine what locks are granted (active) and what locks are
+ waiting - first lock is obviously active, the second is active if it's
+ compatible with the first, and so on, a lock is active if it's compatible
+ with all previous locks and all locks before it are also active.
+ To calculate the "compatible with all previous locks" all locks are
+ accumulated in prev_lock variable using lock_combining_matrix.
+
+ Lock upgrades: when a thread that has a lock on a given resource,
+ requests a new lock on the same resource and the old lock is not enough
+ to satisfy new lock requirements (which is defined by
+ lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock is
+ placed in the list. Depending on other locks it is immediately active or
+ it will wait for other locks. Here's an exception to "locks are added
+ to the end" rule - upgraded locks are added after the last active lock
+ but before all waiting locks. Old lock (the one we upgraded from) is
+ not removed from the list, indeed it may be needed if the new lock was
+ in a savepoint that gets rolled back. So old lock is marked as "ignored"
+ (IGNORE_ME flag). New lock gets an UPGRADED flag.
+
+ Loose locks add an important exception to the above. Loose locks do not
+ always commute with other locks. In the list IX-LS both locks are active,
+ while in the LS-IX list only the first lock is active. This creates a
+ problem in lock upgrades. If the list was IX-LS and the owner of the
+ first lock wants to place LS lock (which can be immediately granted), the
+ IX lock is upgraded to LSIX and the list becomes IX-LS-LSIX, which,
+ according to the lock compatibility matrix means that the last lock is
+ waiting - of course it all happened because IX and LS were swapped and
+ they don't commute. To work around this there's ACTIVE flag which is set
+ in every lock that never waited (was placed active), and this flag
+ overrides "compatible with all previous locks" rule.
+
+ When a lock is placed to the end of the list it's either compatible with
+ all locks and all locks are active - new lock becomes active at once, or
+ it conflicts with some of the locks, in this case in the 'blocker'
+ variable a conflicting lock is returned and the calling thread waits on a
+ pthread condition in the LOCK_OWNER structure of the owner of the
+ conflicting lock. Or a new lock is compatible with all locks, but some
+ existing locks are not compatible with each other (example: request IS,
+ when the list is S-IX) - that is not all locks are active. In this case a
+ first waiting lock is returned in the 'blocker' variable, lockman_getlock()
+ notices that a "blocker" does not conflict with the requested lock, and
+ "dereferences" it, to find the lock that it's waiting on. The calling
+ thread than begins to wait on the same lock.
+
+ To better support table-row relations where one needs to lock the table
+ with an intention lock before locking the row, extended diagnostics is
+ provided. When an intention lock (presumably on a table) is granted,
+ lockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row,
+ perhaps the thread already has a normal lock on this table),
+ GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual),
+ GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check
+ whether it's possible to lock the row, but no need to lock it - perhaps
+ the thread has a loose lock on this table). This is defined by
+ getlock_result[] table.
+*/
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_bit.h>
+#include <lf.h>
+#include "lockman.h"
+
+/*
+ Lock compatibility matrix.
+
+ It's asymmetric. Read it as "Somebody has the lock <value in the row
+ label>, can I set the lock <value in the column label> ?"
+
+ ') Though you can take LS lock while somebody has S lock, it makes no
+ sense - it's simpler to take S lock too.
+
+ 1 - compatible
+ 0 - incompatible
+ -1 - "impossible", so that we can assert the impossibility.
+*/
+/* rows: lock already held by somebody; columns: lock being requested */
+static int lock_compatibility_matrix[10][10]=
+{ /* N S X IS IX SIX LS LX SLX LSIX */
+ { -1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
+ { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* S */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* X */
+ { -1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, /* IS */
+ { -1, 0, 0, 1, 1, 0, 1, 1, 0, 1 }, /* IX */
+ { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 }, /* SIX */
+ { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* LS */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* LX */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* SLX */
+ { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 } /* LSIX */
+};
+
+/*
+ Lock combining matrix.
+
+ It's symmetric. Read it as "what lock level L is identical to the
+ set of two locks A and B"
+
+ One should never get N from it, we assert the impossibility
+*/
+/* symmetric: entry [A][B] is the single lock equivalent to holding A and B */
+static enum lockman_lock_type lock_combining_matrix[10][10]=
+{/* N S X IS IX SIX LS LX SLX LSIX */
+ { N, S, X, IS, IX, SIX, S, SLX, SLX, SIX}, /* N */
+ { S, S, X, S, SIX, SIX, S, SLX, SLX, SIX}, /* S */
+ { X, X, X, X, X, X, X, X, X, X}, /* X */
+ { IS, S, X, IS, IX, SIX, LS, LX, SLX, LSIX}, /* IS */
+ { IX, SIX, X, IX, IX, SIX, LSIX, LX, SLX, LSIX}, /* IX */
+ { SIX, SIX, X, SIX, SIX, SIX, SIX, SLX, SLX, SIX}, /* SIX */
+ { LS, S, X, LS, LSIX, SIX, LS, LX, SLX, LSIX}, /* LS */
+ { LX, SLX, X, LX, LX, SLX, LX, LX, SLX, LX}, /* LX */
+ { SLX, SLX, X, SLX, SLX, SLX, SLX, SLX, SLX, SLX}, /* SLX */
+ { LSIX, SIX, X, LSIX, LSIX, SIX, LSIX, LX, SLX, LSIX} /* LSIX */
+};
+
+/* result bits describing the state found for a lock request, plus masks */
+#define REPEAT_ONCE_MORE 0
+#define OK_TO_PLACE_THE_LOCK 1
+#define OK_TO_PLACE_THE_REQUEST 2
+#define ALREADY_HAVE_THE_LOCK 4
+#define ALREADY_HAVE_THE_REQUEST 8
+#define PLACE_NEW_DISABLE_OLD 16
+#define REQUEST_NEW_DISABLE_OLD 32
+#define RESOURCE_WAS_UNLOCKED 64
+
+#define NEED_TO_WAIT (OK_TO_PLACE_THE_REQUEST | ALREADY_HAVE_THE_REQUEST |\
+ REQUEST_NEW_DISABLE_OLD)
+#define ALREADY_HAVE (ALREADY_HAVE_THE_LOCK | ALREADY_HAVE_THE_REQUEST)
+#define LOCK_UPGRADE (PLACE_NEW_DISABLE_OLD | REQUEST_NEW_DISABLE_OLD)
+
+
+/*
+ the return codes for lockman_getlock
+
+ It's asymmetric. Read it as "I have the lock <value in the row label>,
+ what value should be returned for <value in the column label> ?"
+
+ 0 means impossible combination (assert!)
+
+ Defines below help to preserve the table structure.
+ I/L/A values are self explanatory
+ x means the combination is possible (assert should not crash)
+ but it cannot happen in row locks, only in table locks (S,X),
+ or lock escalations (LS,LX)
+*/
+/* short aliases keep the table readable; #undef'ed right after */
+#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
+#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+#define A GOT_THE_LOCK
+#define x GOT_THE_LOCK
+static enum lockman_getlock_result getlock_result[10][10]=
+{/* N S X IS IX SIX LS LX SLX LSIX */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* N */
+ { 0, x, 0, A, 0, 0, x, 0, 0, 0}, /* S */
+ { 0, x, x, A, A, 0, x, x, 0, 0}, /* X */
+ { 0, 0, 0, I, 0, 0, 0, 0, 0, 0}, /* IS */
+ { 0, 0, 0, I, I, 0, 0, 0, 0, 0}, /* IX */
+ { 0, x, 0, A, I, 0, x, 0, 0, 0}, /* SIX */
+ { 0, 0, 0, L, 0, 0, x, 0, 0, 0}, /* LS */
+ { 0, 0, 0, L, L, 0, x, x, 0, 0}, /* LX */
+ { 0, x, 0, A, L, 0, x, x, 0, 0}, /* SLX */
+ { 0, 0, 0, L, I, 0, x, 0, 0, 0} /* LSIX */
+};
+#undef I
+#undef L
+#undef A
+#undef x
+
+LF_REQUIRE_PINS(4)
+
+/* One lock element in the lock-free per-resource list */
+typedef struct lockman_lock {
+ uint64 resource;
+ struct lockman_lock *lonext; /* next lock of the same lock owner */
+ intptr volatile link; /* next in list; low bit marks deletion */
+ uint32 hashnr;
+ /* QQ: TODO - remove hashnr from LOCK */
+ uint16 loid;
+ uchar lock; /* sizeof(uchar) <= sizeof(enum) */
+ uchar flags;
+} LOCK;
+
+/* LOCK::flags bits (see file header comment for their semantics) */
+#define IGNORE_ME 1
+#define UPGRADED 2
+#define ACTIVE 4
+
+/* list-traversal state filled in by lockfind() */
+typedef struct {
+ intptr volatile *prev;
+ LOCK *curr, *next;
+ LOCK *blocker, *upgrade_from;
+} CURSOR;
+
+/* strip / test the deletion mark kept in the low bit of a link */
+#define PTR(V) (LOCK *)((V) & (~(intptr)1))
+#define DELETED(V) ((V) & 1)
+
+/*
+ NOTE
+ cursor is positioned in either case
+ pins[0..3] are used, they are NOT removed on return
+*/
static int lockfind(LOCK * volatile *head, LOCK *node,
                    CURSOR *cursor, LF_PINS *pins)
{
  uint32 hashnr, cur_hashnr;
  uint64 resource, cur_resource;
  intptr cur_link;
  my_bool cur_active, compatible, upgrading, prev_active;
  enum lockman_lock_type lock, prev_lock, cur_lock;
  uint16 loid, cur_loid;
  int cur_flags, flags;

  hashnr= node->hashnr;
  resource= node->resource;
  lock= node->lock;
  loid= node->loid;
  flags= node->flags;

retry:
  cursor->prev= (intptr *)head;
  prev_lock= N;            /* running combination of all locks seen so far */
  cur_active= TRUE;
  compatible= TRUE;
  upgrading= FALSE;
  cursor->blocker= cursor->upgrade_from= 0;
  _lf_unpin(pins, 3);      /* release pin of a blocker from a previous pass */
  /* pin the head element; re-read until pointer and pin agree */
  do {
    cursor->curr= PTR(*cursor->prev);
    _lf_pin(pins, 1, cursor->curr);
  } while(*cursor->prev != (intptr)cursor->curr && LF_BACKOFF);
  for (;;)
  {
    if (!cursor->curr)
      break;
    /* pin 'next' the same way before it can be dereferenced safely */
    do {
      cur_link= cursor->curr->link;
      cursor->next= PTR(cur_link);
      _lf_pin(pins, 0, cursor->next);
    } while (cur_link != cursor->curr->link && LF_BACKOFF);
    cur_hashnr= cursor->curr->hashnr;
    cur_resource= cursor->curr->resource;
    cur_lock= cursor->curr->lock;
    cur_loid= cursor->curr->loid;
    cur_flags= cursor->curr->flags;
    if (*cursor->prev != (intptr)cursor->curr)
    {
      /* the list changed under our feet - restart from the head */
      (void)LF_BACKOFF;
      goto retry;
    }
    if (!DELETED(cur_link))
    {
      if (cur_hashnr > hashnr ||
          (cur_hashnr == hashnr && cur_resource >= resource))
      {
        if (cur_hashnr > hashnr || cur_resource > resource)
          break;               /* walked past the resource: no (more) locks */
        /* ok, we have a lock for this resource */
        DBUG_ASSERT(lock_compatibility_matrix[prev_lock][cur_lock] >= 0);
        DBUG_ASSERT(lock_compatibility_matrix[cur_lock][lock] >= 0);
        if ((cur_flags & IGNORE_ME) && ! (flags & IGNORE_ME))
        {
          DBUG_ASSERT(cur_active);
          if (cur_loid == loid)
            cursor->upgrade_from= cursor->curr;
        }
        else
        {
          prev_active= cur_active;
          if (cur_flags & ACTIVE)
            DBUG_ASSERT(prev_active == TRUE);
          else
            cur_active&= lock_compatibility_matrix[prev_lock][cur_lock];
          if (upgrading && !cur_active /*&& !(cur_flags & UPGRADED)*/)
            break;
          if (prev_active && !cur_active)
          {
            /* first waiting lock in the chain - remember it as blocker */
            cursor->blocker= cursor->curr;
            _lf_pin(pins, 3, cursor->curr);
          }
          if (cur_loid == loid)
          {
            /* we already have a lock on this resource */
            DBUG_ASSERT(lock_combining_matrix[cur_lock][lock] != N);
            DBUG_ASSERT(!upgrading || (flags & IGNORE_ME));
            if (lock_combining_matrix[cur_lock][lock] == cur_lock)
            {
              /* new lock is compatible */
              if (cur_active)
              {
                cursor->blocker= cursor->curr;   /* loose-locks! */
                _lf_unpin(pins, 3); /* loose-locks! */
                return ALREADY_HAVE_THE_LOCK;
              }
              else
                return ALREADY_HAVE_THE_REQUEST;
            }
            /* not compatible, upgrading */
            upgrading= TRUE;
            cursor->upgrade_from= cursor->curr;
          }
          else
          {
            if (!lock_compatibility_matrix[cur_lock][lock])
            {
              compatible= FALSE;
              cursor->blocker= cursor->curr;
              _lf_pin(pins, 3, cursor->curr);
            }
          }
          prev_lock= lock_combining_matrix[prev_lock][cur_lock];
          DBUG_ASSERT(prev_lock != N);
        }
      }
      cursor->prev= &(cursor->curr->link);
      _lf_pin(pins, 2, cursor->curr);
    }
    else
    {
      /* 'curr' is marked deleted - help the deleter unlink it */
      if (my_atomic_casptr((void **)cursor->prev,
                           (void **)(char*) &cursor->curr, cursor->next))
        _lf_alloc_free(pins, cursor->curr);
      else
      {
        (void)LF_BACKOFF;
        goto retry;
      }
    }
    cursor->curr= cursor->next;
    _lf_pin(pins, 1, cursor->curr);
  }
  /*
    either the end of lock list - no more locks for this resource,
    or upgrading and the end of active lock list
  */
  if (upgrading)
  {
    if (compatible /*&& prev_active*/)
      return PLACE_NEW_DISABLE_OLD;
    else
      return REQUEST_NEW_DISABLE_OLD;
  }
  if (cur_active && compatible)
  {
    /*
      either no locks for this resource or all are compatible.
      ok to place the lock in any case.
    */
    return prev_lock == N ? RESOURCE_WAS_UNLOCKED
                          : OK_TO_PLACE_THE_LOCK;
  }
  /* we have a lock conflict. ok to place a lock request. And wait */
  return OK_TO_PLACE_THE_REQUEST;
}
+
+/*
+ NOTE
+ it uses pins[0..3], on return pins 0..2 are removed, pin 3 (blocker) stays
+*/
static int lockinsert(LOCK * volatile *head, LOCK *node, LF_PINS *pins,
                      LOCK **blocker)
{
  CURSOR cursor;
  int res;

  do
  {
    res= lockfind(head, node, &cursor, pins);
    DBUG_ASSERT(res != ALREADY_HAVE_THE_REQUEST);
    if (!(res & ALREADY_HAVE))
    {
      if (res & LOCK_UPGRADE)
      {
        /* combine the new lock with the one we already hold */
        node->flags|= UPGRADED;
        node->lock= lock_combining_matrix[cursor.upgrade_from->lock][node->lock];
      }
      if (!(res & NEED_TO_WAIT))
        node->flags|= ACTIVE;      /* granted immediately, not a request */
      node->link= (intptr)cursor.curr;
      DBUG_ASSERT(node->link != (intptr)node);
      DBUG_ASSERT(cursor.prev != &node->link);
      /* try to splice 'node' in; on CAS failure undo ACTIVE and rescan */
      if (!my_atomic_casptr((void **)cursor.prev,
                            (void **)(char*) &cursor.curr, node))
      {
        res= REPEAT_ONCE_MORE;
        node->flags&= ~ACTIVE;
      }
      if (res & LOCK_UPGRADE)
        cursor.upgrade_from->flags|= IGNORE_ME;
      /*
        QQ: is this OK ? if a reader has already read upgrade_from,
        it may find it conflicting with node :(
        - see the last test from test_lockman_simple()
      */
    }

  } while (res == REPEAT_ONCE_MORE);
  _lf_unpin(pins, 0);
  _lf_unpin(pins, 1);
  _lf_unpin(pins, 2);
  /*
    note that blocker is not necessarily pinned here (when it's == curr).
    this is ok as in such a case it's either a dummy node for
    initialize_bucket() and dummy nodes don't need pinning,
    or it's a lock of the same transaction for lockman_getlock,
    and it cannot be removed by another thread
  */
  *blocker= cursor.blocker;
  return res;
}
+
+/*
+ NOTE
+ it uses pins[0..3], on return pins 0..2 are removed, pin 3 (blocker) stays
+*/
+static int lockpeek(LOCK * volatile *head, LOCK *node, LF_PINS *pins,
+ LOCK **blocker)
+{
+ CURSOR cursor;
+ int res;
+
+ res= lockfind(head, node, &cursor, pins);
+
+ _lf_unpin(pins, 0);
+ _lf_unpin(pins, 1);
+ _lf_unpin(pins, 2);
+ if (blocker)
+ *blocker= cursor.blocker;
+ return res;
+}
+
+/*
+ NOTE
+ it uses pins[0..3], on return all pins are removed.
+
+ One _must_ have the lock (or request) to call this
+*/
static int lockdelete(LOCK * volatile *head, LOCK *node, LF_PINS *pins)
{
  CURSOR cursor;
  int res;

  do
  {
    res= lockfind(head, node, &cursor, pins);
    DBUG_ASSERT(res & ALREADY_HAVE);

    /* the lock we upgraded from is no longer needed, stop ignoring it */
    if (cursor.upgrade_from)
      cursor.upgrade_from->flags&= ~IGNORE_ME;

    /*
      XXX this does not work with savepoints, as old lock is left ignored.
      It cannot be unignored, as would basically mean moving the lock back
      in the lock chain (from upgraded). And the latter is not allowed -
      because it breaks list scanning. So old ignored lock must be deleted,
      new - same - lock must be installed right after the lock we're deleting,
      then we can delete. Good news is - this is only required when rolling
      back a savepoint.
    */
    /* two-phase delete: 1) mark our own link deleted (set lowest bit), ... */
    if (my_atomic_casptr((void **)(char*)&(cursor.curr->link),
                         (void **)(char*)&cursor.next, 1+(char *)cursor.next))
    {
      /* ... 2) unlink from the predecessor; if that fails, a rescan helps */
      if (my_atomic_casptr((void **)cursor.prev,
                           (void **)(char*)&cursor.curr, cursor.next))
        _lf_alloc_free(pins, cursor.curr);
      else
        lockfind(head, node, &cursor, pins);
    }
    else
    {
      res= REPEAT_ONCE_MORE;
      if (cursor.upgrade_from)
        cursor.upgrade_from->flags|= IGNORE_ME; /* undo step 0 and retry */
    }
  } while (res == REPEAT_ONCE_MORE);
  _lf_unpin(pins, 0);
  _lf_unpin(pins, 1);
  _lf_unpin(pins, 2);
  _lf_unpin(pins, 3);
  return res;
}
+
+void lockman_init(LOCKMAN *lm, loid_to_lo_func *func, uint timeout)
+{
+ lf_alloc_init(&lm->alloc, sizeof(LOCK), offsetof(LOCK, lonext));
+ lf_dynarray_init(&lm->array, sizeof(LOCK **));
+ lm->size= 1;
+ lm->count= 0;
+ lm->loid_to_lo= func;
+ lm->lock_timeout= timeout;
+}
+
+void lockman_destroy(LOCKMAN *lm)
+{
+ LOCK *el= *(LOCK **)_lf_dynarray_lvalue(&lm->array, 0);
+ while (el)
+ {
+ intptr next= el->link;
+ if (el->hashnr & 1)
+ lf_alloc_direct_free(&lm->alloc, el);
+ else
+ my_free((void *)el, MYF(0));
+ el= (LOCK *)next;
+ }
+ lf_alloc_destroy(&lm->alloc);
+ lf_dynarray_destroy(&lm->array);
+}
+
+/* TODO: optimize it */
+#define MAX_LOAD 1
+
static void initialize_bucket(LOCKMAN *lm, LOCK * volatile *node,
                              uint bucket, LF_PINS *pins)
{
  int res;
  uint parent= my_clear_highest_bit(bucket);
  /*
    NOTE(review): the my_malloc() result is dereferenced below without a
    NULL check - a failed allocation would crash; confirm whether OOM is
    guaranteed impossible here or handled elsewhere.
  */
  LOCK *dummy= (LOCK *)my_malloc(sizeof(LOCK), MYF(MY_WME));
  LOCK **tmp= 0, *cur;
  LOCK * volatile *el= _lf_dynarray_lvalue(&lm->array, parent);

  /* recursively make sure the parent bucket exists first */
  if (*el == NULL && bucket)
    initialize_bucket(lm, el, parent, pins);
  dummy->hashnr= my_reverse_bits(bucket);  /* even value = dummy node */
  dummy->loid= 0;
  dummy->lock= X; /* doesn't matter, in fact */
  dummy->resource= 0;
  dummy->flags= 0;
  res= lockinsert(el, dummy, pins, &cur);
  DBUG_ASSERT(res & (ALREADY_HAVE_THE_LOCK | RESOURCE_WAS_UNLOCKED));
  if (res & ALREADY_HAVE_THE_LOCK)
  {
    /* another thread inserted a dummy for this bucket first - use theirs */
    my_free((void *)dummy, MYF(0));
    dummy= cur;
  }
  my_atomic_casptr((void **)node, (void **)(char*) &tmp, dummy);
}
+
+static inline uint calc_hash(uint64 resource)
+{
+ const uchar *pos= (uchar *)&resource;
+ ulong nr1= 1, nr2= 4, i;
+ for (i= 0; i < sizeof(resource) ; i++, pos++)
+ {
+ nr1^= (ulong) ((((uint) nr1 & 63)+nr2) * ((uint)*pos)) + (nr1 << 8);
+ nr2+= 3;
+ }
+ return nr1 & INT_MAX32;
+}
+
+/*
+ RETURN
+ see enum lockman_getlock_result
+ NOTE
+ uses pins[0..3], they're removed on return
+*/
enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo,
                                            uint64 resource,
                                            enum lockman_lock_type lock)
{
  int res;
  uint csize, bucket, hashnr;
  LOCK *node, * volatile *el, *blocker;
  LF_PINS *pins= lo->pins;
  enum lockman_lock_type old_lock;

  DBUG_ASSERT(lo->loid);
  lf_rwlock_by_pins(pins);
  node= (LOCK *)_lf_alloc_new(pins);
  node->flags= 0;
  node->lock= lock;
  node->loid= lo->loid;
  node->resource= resource;
  hashnr= calc_hash(resource);
  bucket= hashnr % lm->size;
  el= _lf_dynarray_lvalue(&lm->array, bucket);
  if (*el == NULL)
    initialize_bucket(lm, el, bucket, pins);
  node->hashnr= my_reverse_bits(hashnr) | 1;  /* odd hashnr = real lock */
  res= lockinsert(el, node, pins, &blocker);
  if (res & ALREADY_HAVE)
  {
    int r;
    old_lock= blocker->lock;
    _lf_alloc_free(pins, node);  /* an equivalent lock is already in place */
    lf_rwunlock_by_pins(pins);
    r= getlock_result[old_lock][lock];
    DBUG_ASSERT(r);
    return r;
  }
  /* a new value was added to the hash */
  csize= lm->size;
  if ((my_atomic_add32(&lm->count, 1)+1.0) / csize > MAX_LOAD)
    my_atomic_cas32(&lm->size, (int*) &csize, csize*2);
  node->lonext= lo->all_locks;
  lo->all_locks= node;
  /* wait loop: re-evaluate with lockpeek() every time we are woken up */
  for ( ; res & NEED_TO_WAIT; res= lockpeek(el, node, pins, &blocker))
  {
    LOCK_OWNER *wait_for_lo;
    ulonglong deadline;
    struct timespec timeout;

    _lf_assert_pin(pins, 3); /* blocker must be pinned here */
    wait_for_lo= lm->loid_to_lo(blocker->loid);

    /*
      now, this is tricky. blocker is not necessarily a LOCK
      we're waiting for. If it's compatible with what we want,
      then we're waiting for a lock that blocker is waiting for
      (see two places where blocker is set in lockfind)
      In the latter case, let's "dereference" it
    */
    if (lock_compatibility_matrix[blocker->lock][lock])
    {
      blocker= wait_for_lo->all_locks;
      _lf_pin(pins, 3, blocker);
      if (blocker != wait_for_lo->all_locks)
        continue;
      wait_for_lo= wait_for_lo->waiting_for;
    }

    /*
      note that the blocker transaction may have ended by now,
      its LOCK_OWNER and short id were reused, so 'wait_for_lo' may point
      to an unrelated - albeit valid - LOCK_OWNER
    */
    if (!wait_for_lo)
      continue;

    lo->waiting_for= wait_for_lo;
    lf_rwunlock_by_pins(pins);

    /*
      We lock a mutex - it may belong to a wrong LOCK_OWNER, but it must
      belong to _some_ LOCK_OWNER. It means, we can never free() a LOCK_OWNER,
      if there're other active LOCK_OWNERs.
    */
    /* QQ: race condition here */
    pthread_mutex_lock(wait_for_lo->mutex);
    if (DELETED(blocker->link))
    {
      /*
        blocker transaction was ended, or a savepoint that owned
        the lock was rolled back. Either way - the lock was removed
      */
      pthread_mutex_unlock(wait_for_lo->mutex);
      lf_rwlock_by_pins(pins);
      continue;
    }

    /* yuck. waiting */
    deadline= my_getsystime() + lm->lock_timeout * 10000;
    set_timespec_nsec(timeout,lm->lock_timeout * 1000000);
    do
    {
      pthread_cond_timedwait(wait_for_lo->cond, wait_for_lo->mutex, &timeout);
    } while (!DELETED(blocker->link) && my_getsystime() < deadline);
    pthread_mutex_unlock(wait_for_lo->mutex);
    lf_rwlock_by_pins(pins);
    if (!DELETED(blocker->link))
    {
      /*
        timeout.
        note that we _don't_ release the lock request here.
        Instead we're relying on the caller to abort the transaction,
        and release all locks at once - see lockman_release_locks()
      */
      _lf_unpin(pins, 3);
      lf_rwunlock_by_pins(pins);
      return DIDNT_GET_THE_LOCK;
    }
  }
  lo->waiting_for= 0;
  _lf_assert_unpin(pins, 3); /* unpin should not be needed */
  lf_rwunlock_by_pins(pins);
  return getlock_result[lock][lock];
}
+
+/*
+  RETURN
+    0 - all locks of this owner were released
+        (the "1 - didn't (not found)" case of the original design is
+        never actually returned by the current implementation)
+  NOTE
+    see lockdelete() for pin usage notes
+*/
+int lockman_release_locks(LOCKMAN *lm, LOCK_OWNER *lo)
+{
+ LOCK * volatile *el, *node, *next;
+ uint bucket;
+ LF_PINS *pins= lo->pins;
+
+ pthread_mutex_lock(lo->mutex);
+ lf_rwlock_by_pins(pins);
+ for (node= lo->all_locks; node; node= next)
+ {
+ next= node->lonext;
+ bucket= calc_hash(node->resource) % lm->size;
+ el= _lf_dynarray_lvalue(&lm->array, bucket);
+ if (*el == NULL)
+ initialize_bucket(lm, el, bucket, pins);
+ lockdelete(el, node, pins);
+ my_atomic_add32(&lm->count, -1);
+ }
+ lf_rwunlock_by_pins(pins);
+ lo->all_locks= 0;
+ /* now signal all waiters */
+ pthread_cond_broadcast(lo->cond);
+ pthread_mutex_unlock(lo->mutex);
+ return 0;
+}
+
+#ifdef MY_LF_EXTRA_DEBUG
+static const char *lock2str[]=
+{ "N", "S", "X", "IS", "IX", "SIX", "LS", "LX", "SLX", "LSIX" };
+/*
+ NOTE
+ the function below is NOT thread-safe !!!
+*/
+void print_lockhash(LOCKMAN *lm)
+{
+ LOCK *el= *(LOCK **)_lf_dynarray_lvalue(&lm->array, 0);
+ printf("hash: size %u count %u\n", lm->size, lm->count);
+ while (el)
+ {
+ intptr next= el->link;
+ if (el->hashnr & 1)
+ {
+ printf("0x%08lx { resource %lu, loid %u, lock %s",
+ (long) el->hashnr, (ulong) el->resource, el->loid,
+ lock2str[el->lock]);
+ if (el->flags & IGNORE_ME) printf(" IGNORE_ME");
+ if (el->flags & UPGRADED) printf(" UPGRADED");
+ if (el->flags & ACTIVE) printf(" ACTIVE");
+ if (DELETED(next)) printf(" ***DELETED***");
+ printf("}\n");
+ }
+ else
+ {
+ /*printf("0x%08x { dummy }\n", el->hashnr);*/
+ DBUG_ASSERT(el->resource == 0 && el->loid == 0 && el->lock == X);
+ }
+ el= PTR(next);
+ }
+}
+#endif
diff --git a/storage/maria/lockman.h b/storage/maria/lockman.h
new file mode 100644
index 00000000000..82ab483896f
--- /dev/null
+++ b/storage/maria/lockman.h
@@ -0,0 +1,76 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
#ifndef _lockman_h
#define _lockman_h

/*
  Lock levels:
  ^^^^^^^^^^^

  N    - "no lock", not a lock, used sometimes internally to simplify the code
  S    - Shared
  X    - eXclusive
  IS   - Intention Shared
  IX   - Intention eXclusive
  SIX  - Shared + Intention eXclusive
  LS   - Loose Shared
  LX   - Loose eXclusive
  SLX  - Shared + Loose eXclusive
  LSIX - Loose Shared + Intention eXclusive
*/
enum lockman_lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST };

struct lockman_lock;

typedef struct st_lock_owner LOCK_OWNER;
struct st_lock_owner {
  LF_PINS *pins; /* must be allocated from lockman's pinbox */
  struct lockman_lock *all_locks; /* a LIFO */
  LOCK_OWNER *waiting_for; /* owner whose lock this one is blocked on */
  pthread_cond_t *cond; /* transactions waiting for this, wait on 'cond' */
  pthread_mutex_t *mutex; /* mutex is required to use 'cond' */
  uint16 loid; /* short lock-owner id, mapped back via loid_to_lo_func */
};

typedef LOCK_OWNER *loid_to_lo_func(uint16);
typedef struct {
  LF_DYNARRAY array; /* hash itself */
  LF_ALLOCATOR alloc; /* allocator for elements */
  int32 volatile size; /* size of array */
  int32 volatile count; /* number of elements in the hash */
  uint lock_timeout; /* in milliseconds (see lockman_getlock's wait loop) */
  loid_to_lo_func *loid_to_lo;
} LOCKMAN;
/* zero so that callers can test lockman_getlock()'s result with '!' */
#define DIDNT_GET_THE_LOCK 0
enum lockman_getlock_result {
  NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT,
  GOT_THE_LOCK,
  GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE,
  GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
};

void lockman_init(LOCKMAN *, loid_to_lo_func *, uint);
void lockman_destroy(LOCKMAN *);
enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo,
                                            uint64 resource,
                                            enum lockman_lock_type lock);
int lockman_release_locks(LOCKMAN *, LOCK_OWNER *);

/*
  NOTE(review): print_lockhash() is defined in lockman.c under
  MY_LF_EXTRA_DEBUG, but declared here under EXTRA_DEBUG - the two guards
  look inconsistent; confirm which macro is intended.
*/
#ifdef EXTRA_DEBUG
void print_lockhash(LOCKMAN *lm);
#endif

#endif
diff --git a/storage/maria/ma_bitmap.c b/storage/maria/ma_bitmap.c
new file mode 100644
index 00000000000..c0763b0612d
--- /dev/null
+++ b/storage/maria/ma_bitmap.c
@@ -0,0 +1,2910 @@
+/* Copyright (C) 2007 Michael Widenius
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Bitmap handling (for records in block)
+
+ The data file starts with a bitmap page, followed by as many data
+ pages as the bitmap can cover. After this there is a new bitmap page
+ and more data pages etc.
+
+ The bitmap code assumes there is always an active bitmap page and thus
+ that there is at least one bitmap page in the file
+
+ Structure of bitmap page:
+
+ Fixed size records (to be implemented later):
+
+ 2 bits are used to indicate:
+
+ 0 Empty
+ 1 0-75 % full (at least room for 2 records)
+ 2 75-100 % full (at least room for one record)
+ 3 100 % full (no more room for records)
+
+ Assuming 8K pages, this will allow us to map:
+ 8192 (bytes per page) * 4 (pages mapped per byte) * 8192 (page size)= 256M
+
+ (For Maria this will be 7*4 * 8192 = 224K smaller because of LSN)
+
+ Note that for fixed size rows, we can't add more columns without doing
+ a full reorganization of the table. The user can always force a dynamic
+ size row format by specifying ROW_FORMAT=dynamic.
+
+
+ Dynamic size records:
+
+ 3 bits are used to indicate Bytes free in 8K page
+
+ 0 Empty page 8176 (head or tail)
+ 1 0-30 % full (at least room for 3 records) 5724
+ 2 30-60 % full (at least room for 2 records) 3271
+ 3 60-90 % full (at least room for one record) 818
+ 4 100 % full (no more room for records) 0
+ 5 Tail page, 0-40 % full 4906
+ 6 Tail page, 40-80 % full 1636
+ 7 Full tail page or full blob page 0
+
+ Assuming 8K pages, this will allow us to map:
+ 8192 (bytes per page) * 8 bits/byte / 3 bits/page * 8192 (page size)= 170.7M
+
+  Note that values 1-3 may be adjusted for each individual table based on
+  'min record length'.  Tail pages are for overflow data which can be of
+  any size and thus doesn't have to be adjusted for different tables.
+  If we add more columns to the table, some of the originally calculated
+  'cut off' points may not be optimal, but they shouldn't be 'drastically
+  wrong'.
+
+  When allocating data from the bitmap, we are trying to do it in a
+  'best fit' manner. Blobs and varchar blocks are given out in large
+  contiguous extents to allow fast access to these. Before allowing a
+  row to 'flow over' to other blocks, we will compact the page and use
+  all space on it. If there are many rows in the page, we will ensure
+  there are *LEFT_TO_GROW_ON_SPLIT* bytes left on the page to allow other
+  rows to grow.
+
+ The bitmap format allows us to extend the row file in big chunks, if needed.
+
+ When calculating the size for a packed row, we will calculate the following
+ things separately:
+ - Row header + null_bits + empty_bits fixed size segments etc.
+ - Size of all char/varchar fields
+ - Size of each blob field
+
+ The bitmap handler will get all the above information and return
+ either one page or a set of pages to put the different parts.
+
+ Bitmaps are read on demand in response to insert/delete/update operations.
+ The following bitmap pointers will be cached and stored on disk on close:
+ - Current insert_bitmap; When inserting new data we will first try to
+ fill this one.
+ - First bitmap which is not completely full. This is updated when we
+ free data with an update or delete.
+
+ While flushing out bitmaps, we will cache the status of the bitmap in memory
+ to avoid having to read a bitmap for insert of new data that will not
+ be of any use
+ - Total empty space
+ - Largest number of continuous pages
+
+  Bitmap ONLY goes to disk in the following scenarios
+  - The file is closed (and we flush all changes to disk)
+  - On checkpoint
+  (Ie: When we do a checkpoint, we have to ensure that all bitmaps are
+  put on disk even if they are not in the page cache).
+  - When explicitly requested (for example on backup or after recovery,
+    to simplify things)
+
+ The flow of writing a row is that:
+ - Lock the bitmap
+ - Decide which data pages we will write to
+ - Mark them full in the bitmap page so that other threads do not try to
+ use the same data pages as us
+ - We unlock the bitmap
+ - Write the data pages
+ - Lock the bitmap
+ - Correct the bitmap page with the true final occupation of the data
+ pages (that is, we marked pages full but when we are done we realize
+ we didn't fill them)
+ - Unlock the bitmap.
+*/
+
+#include "maria_def.h"
+#include "ma_blockrec.h"
+
+#define FULL_HEAD_PAGE 4
+#define FULL_TAIL_PAGE 7
+
+/*#define WRONG_BITMAP_FLUSH 1*/ /*define only for provoking bugs*/
+#undef WRONG_BITMAP_FLUSH
+
+static my_bool _ma_read_bitmap_page(MARIA_HA *info,
+ MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page);
+static my_bool _ma_bitmap_create_missing(MARIA_HA *info,
+ MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page);
+
+/* Write bitmap page to key cache */
+
static inline my_bool write_changed_bitmap(MARIA_SHARE *share,
                                           MARIA_FILE_BITMAP *bitmap)
{
  DBUG_ENTER("write_changed_bitmap");
  DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size);
  DBUG_ASSERT(bitmap->file.write_callback != 0);
  DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable));

  /*
    Mark that a bitmap page has been written to page cache and we have
    to flush it during checkpoint.
  */
  bitmap->changed_not_flushed= 1;

  if ((bitmap->non_flushable == 0)
#ifdef WRONG_BITMAP_FLUSH
      || 1
#endif
      )
  {
    /* nobody holds the bitmap non-flushable: plain unpinned write */
    my_bool res= pagecache_write(share->pagecache,
                                 &bitmap->file, bitmap->page, 0,
                                 bitmap->map, PAGECACHE_PLAIN_PAGE,
                                 PAGECACHE_LOCK_LEFT_UNLOCKED,
                                 PAGECACHE_PIN_LEFT_UNPINNED,
                                 PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE);
    DBUG_RETURN(res);
  }
  else
  {
    /*
      write with PAGECACHE_PIN and remember the link, so the page can be
      unpinned later (see _ma_bitmap_unpin_all)
    */
    MARIA_PINNED_PAGE page_link;
    int res= pagecache_write(share->pagecache,
                             &bitmap->file, bitmap->page, 0,
                             bitmap->map, PAGECACHE_PLAIN_PAGE,
                             PAGECACHE_LOCK_LEFT_UNLOCKED, PAGECACHE_PIN,
                             PAGECACHE_WRITE_DELAY, &page_link.link,
                             LSN_IMPOSSIBLE);
    page_link.unlock= PAGECACHE_LOCK_LEFT_UNLOCKED;
    page_link.changed= 1;
    push_dynamic(&bitmap->pinned_pages, (void*) &page_link);
    DBUG_RETURN(res);
  }
}
+
+/*
+ Initialize bitmap variables in share
+
+ SYNOPSIS
+ _ma_bitmap_init()
+ share Share handler
+ file data file handler
+
+ NOTES
+ This is called the first time a file is opened.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_bitmap_init(MARIA_SHARE *share, File file)
+{
+ uint aligned_bit_blocks;
+ uint max_page_size;
+ MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+ uint size= share->block_size;
+#ifndef DBUG_OFF
+ /* We want to have a copy of the bitmap to be able to print differences */
+ size*= 2;
+#endif
+
+ if (((bitmap->map= (uchar*) my_malloc(size, MYF(MY_WME))) == NULL) ||
+ my_init_dynamic_array(&bitmap->pinned_pages,
+ sizeof(MARIA_PINNED_PAGE), 1, 1))
+ return 1;
+
+ bitmap->block_size= share->block_size;
+ bitmap->file.file= file;
+ _ma_bitmap_set_pagecache_callbacks(&bitmap->file, share);
+
+ /* Size needs to be aligned on 6 */
+ aligned_bit_blocks= (share->block_size - PAGE_SUFFIX_SIZE) / 6;
+ bitmap->total_size= aligned_bit_blocks * 6;
+ /*
+ In each 6 bytes, we have 6*8/3 = 16 pages covered
+ The +1 is to add the bitmap page, as this doesn't have to be covered
+ */
+ bitmap->pages_covered= aligned_bit_blocks * 16 + 1;
+ bitmap->flush_all_requested= 0;
+ bitmap->non_flushable= 0;
+
+ /* Update size for bits */
+ /* TODO; Make this dependent of the row size */
+ max_page_size= share->block_size - PAGE_OVERHEAD_SIZE + DIR_ENTRY_SIZE;
+ bitmap->sizes[0]= max_page_size; /* Empty page */
+ bitmap->sizes[1]= max_page_size - max_page_size * 30 / 100;
+ bitmap->sizes[2]= max_page_size - max_page_size * 60 / 100;
+ bitmap->sizes[3]= max_page_size - max_page_size * 90 / 100;
+ bitmap->sizes[4]= 0; /* Full page */
+ bitmap->sizes[5]= max_page_size - max_page_size * 40 / 100;
+ bitmap->sizes[6]= max_page_size - max_page_size * 80 / 100;
+ bitmap->sizes[7]= 0;
+
+ pthread_mutex_init(&share->bitmap.bitmap_lock, MY_MUTEX_INIT_SLOW);
+ pthread_cond_init(&share->bitmap.bitmap_cond, 0);
+
+ _ma_bitmap_reset_cache(share);
+
+ if (share->state.first_bitmap_with_space == ~(pgcache_page_no_t) 0)
+ {
+ /* Start scanning for free space from start of file */
+ share->state.first_bitmap_with_space = 0;
+ }
+ return 0;
+}
+
+
+/*
+ Free data allocated by _ma_bitmap_init
+
+ SYNOPSIS
+ _ma_bitmap_end()
+ share Share handler
+*/
+
+my_bool _ma_bitmap_end(MARIA_SHARE *share)
+{
+ my_bool res= _ma_bitmap_flush(share);
+ safe_mutex_assert_owner(&share->close_lock);
+ pthread_mutex_destroy(&share->bitmap.bitmap_lock);
+ pthread_cond_destroy(&share->bitmap.bitmap_cond);
+ delete_dynamic(&share->bitmap.pinned_pages);
+ my_free(share->bitmap.map, MYF(MY_ALLOW_ZERO_PTR));
+ share->bitmap.map= 0;
+ return res;
+}
+
+
+/*
+ Send updated bitmap to the page cache
+
+ SYNOPSIS
+ _ma_bitmap_flush()
+ share Share handler
+
+ NOTES
+ In the future, _ma_bitmap_flush() will be called to flush changes don't
+ by this thread (ie, checking the changed flag is ok). The reason we
+ check it again in the mutex is that if someone else did a flush at the
+ same time, we don't have to do the write.
+ This is also ok for _ma_scan_init_block_record() which does not want to
+ miss rows: it cares only for committed rows, that is, rows for which there
+ was a commit before our transaction started; as commit and transaction's
+ start are protected by the same LOCK_trn_list mutex, we see memory at
+ least as new as at other transaction's commit time, so if the committed
+ rows caused bitmap->changed to be true, we see it; if we see 0 it really
+ means a flush happened since then. So, it's ok to read without bitmap's
+ mutex.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_bitmap_flush(MARIA_SHARE *share)
+{
+ my_bool res= 0;
+ DBUG_ENTER("_ma_bitmap_flush");
+ if (share->bitmap.changed)
+ {
+ pthread_mutex_lock(&share->bitmap.bitmap_lock);
+ if (share->bitmap.changed)
+ {
+ res= write_changed_bitmap(share, &share->bitmap);
+ share->bitmap.changed= 0;
+ }
+ pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+ }
+ DBUG_RETURN(res);
+}
+
+
+/**
+ Dirty-page filtering criteria for bitmap pages
+
+ @param type Page's type
+ @param pageno Page's number
+ @param rec_lsn Page's rec_lsn
+ @param arg pages_covered of bitmap
+*/
+
+static enum pagecache_flush_filter_result
+filter_flush_bitmap_pages(enum pagecache_page_type type
+ __attribute__ ((unused)),
+ pgcache_page_no_t pageno,
+ LSN rec_lsn __attribute__ ((unused)),
+ void *arg)
+{
+ return ((pageno % (*(ulong*)arg)) == 0);
+}
+
+
+/**
+ Flushes current bitmap page to the pagecache, and then all bitmap pages
+ from pagecache to the file. Used by Checkpoint.
+
+ @param share Table's share
+*/
+
my_bool _ma_bitmap_flush_all(MARIA_SHARE *share)
{
  my_bool res= 0;
  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
  DBUG_ENTER("_ma_bitmap_flush_all");
  pthread_mutex_lock(&bitmap->bitmap_lock);
  if (bitmap->changed || bitmap->changed_not_flushed)
  {
    bitmap->flush_all_requested++;
#ifndef WRONG_BITMAP_FLUSH
    /* wait until no thread holds the bitmap in a non-flushable state */
    while (bitmap->non_flushable > 0)
    {
      DBUG_PRINT("info", ("waiting for bitmap to be flushable"));
      pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock);
    }
#endif
    DBUG_ASSERT(bitmap->flush_all_requested == 1);
    /*
      Bitmap is in a flushable state: its contents in memory are reflected by
      log records (complete REDO-UNDO groups) and all bitmap pages are
      unpinned. We keep the mutex to preserve this situation, and flush to the
      file.
    */
    if (bitmap->changed)
    {
      bitmap->changed= FALSE;
      res= write_changed_bitmap(share, bitmap);
    }
    /*
      We do NOT use FLUSH_KEEP_LAZY because we must be sure that bitmap
      pages have been flushed. That's a condition of correctness of
      Recovery: data pages may have been all flushed, if we write the
      checkpoint record Recovery will start from after their REDOs. If
      bitmap page was not flushed, as the REDOs about it will be skipped, it
      will wrongly not be recovered. If bitmap pages had a rec_lsn it would
      be different.
      There should be no pinned pages as bitmap->non_flushable==0.
    */
    if (flush_pagecache_blocks_with_filter(share->pagecache,
                                           &bitmap->file, FLUSH_KEEP,
                                           filter_flush_bitmap_pages,
                                           &bitmap->pages_covered) &
        PCFLUSH_PINNED_AND_ERROR)
      res= TRUE;
    bitmap->changed_not_flushed= FALSE;
    bitmap->flush_all_requested--;
    /*
      Some well-behaved threads may be waiting for flush_all_requested to
      become false, wake them up.
    */
    DBUG_PRINT("info", ("bitmap flusher waking up others"));
    pthread_cond_broadcast(&bitmap->bitmap_cond);
  }
  pthread_mutex_unlock(&bitmap->bitmap_lock);
  DBUG_RETURN(res);
}
+
+
+/**
+ @brief Lock bitmap from being used by another thread
+
+ @fn _ma_bitmap_lock()
+ @param share Table's share
+
+ @notes
+ This is a temporary solution for allowing someone to delete an inserted
+ duplicate-key row while someone else is doing concurrent inserts.
+ This is ok for now as duplicate key errors are not that common.
+
+ In the future we will add locks for row-pages to ensure two threads doesn't
+ work at the same time on the same page.
+*/
+
+void _ma_bitmap_lock(MARIA_SHARE *share)
+{
+ MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+ DBUG_ENTER("_ma_bitmap_lock");
+
+ if (!share->now_transactional)
+ DBUG_VOID_RETURN;
+
+ pthread_mutex_lock(&bitmap->bitmap_lock);
+ bitmap->flush_all_requested++;
+ while (bitmap->non_flushable)
+ {
+ DBUG_PRINT("info", ("waiting for bitmap to be flushable"));
+ pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock);
+ }
+ /*
+ Ensure that _ma_bitmap_flush_all() and _ma_bitmap_lock() are blocked.
+ ma_bitmap_flushable() is blocked thanks to 'flush_all_requested'.
+ */
+ bitmap->non_flushable= 1;
+ pthread_mutex_unlock(&bitmap->bitmap_lock);
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Unlock bitmap after _ma_bitmap_lock()
+
+ @fn _ma_bitmap_unlock()
+ @param share Table's share
+*/
+
+void _ma_bitmap_unlock(MARIA_SHARE *share)
+{
+ MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+ DBUG_ENTER("_ma_bitmap_unlock");
+
+ if (!share->now_transactional)
+ DBUG_VOID_RETURN;
+ DBUG_ASSERT(bitmap->flush_all_requested > 0 && bitmap->non_flushable == 1);
+
+ pthread_mutex_lock(&bitmap->bitmap_lock);
+ bitmap->flush_all_requested--;
+ bitmap->non_flushable= 0;
+ pthread_mutex_unlock(&bitmap->bitmap_lock);
+ pthread_cond_broadcast(&bitmap->bitmap_cond);
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Unpin all pinned bitmap pages
+
+ @param share Table's share
+
+ @return Operation status
+ @retval 0 ok
+
+ @note This unpins pages pinned by other threads.
+*/
+
+static void _ma_bitmap_unpin_all(MARIA_SHARE *share)
+{
+ MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+ MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*)
+ dynamic_array_ptr(&bitmap->pinned_pages, 0));
+ MARIA_PINNED_PAGE *pinned_page= page_link + bitmap->pinned_pages.elements;
+ DBUG_ENTER("_ma_bitmap_unpin_all");
+ DBUG_PRINT("info", ("pinned: %u", bitmap->pinned_pages.elements));
+ while (pinned_page-- != page_link)
+ pagecache_unlock_by_link(share->pagecache, pinned_page->link,
+ pinned_page->unlock, PAGECACHE_UNPIN,
+ LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, TRUE, TRUE);
+ bitmap->pinned_pages.elements= 0;
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Intialize bitmap in memory to a zero bitmap
+
+ SYNOPSIS
+ _ma_bitmap_delete_all()
+ share Share handler
+
+ NOTES
+ This is called on maria_delete_all_rows (truncate data file).
+*/
+
+void _ma_bitmap_delete_all(MARIA_SHARE *share)
+{
+ MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+ DBUG_ENTER("_ma_bitmap_delete_all");
+ if (bitmap->map) /* Not in create */
+ {
+ bzero(bitmap->map, bitmap->block_size);
+ bitmap->changed= 1;
+ bitmap->page= 0;
+ bitmap->used_size= bitmap->total_size;
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Reset bitmap caches
+
+ @fn _ma_bitmap_reset_cache()
+ @param share Maria share
+
+ @notes
+ This is called after we have swapped file descriptors and we want
+ bitmap to forget all cached information
+*/
+
+void _ma_bitmap_reset_cache(MARIA_SHARE *share)
+{
+ MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+
+ if (bitmap->map) /* If using bitmap */
+ {
+ /* Forget changes in current bitmap page */
+ bitmap->changed= 0;
+
+ /*
+ We can't read a page yet, as in some case we don't have an active
+ page cache yet.
+ Pretend we have a dummy, full and not changed bitmap page in memory.
+ */
+ bitmap->page= ~(ulonglong) 0;
+ bitmap->used_size= bitmap->total_size;
+ bfill(bitmap->map, share->block_size, 255);
+#ifndef DBUG_OFF
+ memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size);
+#endif
+ }
+}
+
+
+/*
+ Return bitmap pattern for the smallest head block that can hold 'size'
+
+ SYNOPSIS
+ size_to_head_pattern()
+ bitmap Bitmap
+ size Requested size
+
+ RETURN
+ 0-3 For a description of the bitmap sizes, see the header
+*/
+
+static uint size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+ if (size <= bitmap->sizes[3])
+ return 3;
+ if (size <= bitmap->sizes[2])
+ return 2;
+ if (size <= bitmap->sizes[1])
+ return 1;
+ DBUG_ASSERT(size <= bitmap->sizes[0]);
+ return 0;
+}
+
+
+/*
+ Return bitmap pattern for head block where there is size bytes free
+
+ SYNOPSIS
+ _ma_free_size_to_head_pattern()
+ bitmap Bitmap
+ size Requested size
+
+ RETURN
+ 0-4 (Possible bitmap patterns for head block)
+*/
+
+uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+ if (size < bitmap->sizes[3])
+ return 4;
+ if (size < bitmap->sizes[2])
+ return 3;
+ if (size < bitmap->sizes[1])
+ return 2;
+ return (size < bitmap->sizes[0]) ? 1 : 0;
+}
+
+
+/*
+ Return bitmap pattern for the smallest tail block that can hold 'size'
+
+ SYNOPSIS
+ size_to_tail_pattern()
+ bitmap Bitmap
+ size Requested size
+
+ RETURN
+ 0, 5 or 6 For a description of the bitmap sizes, see the header
+*/
+
+static uint size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+ if (size <= bitmap->sizes[6])
+ return 6;
+ if (size <= bitmap->sizes[5])
+ return 5;
+ DBUG_ASSERT(size <= bitmap->sizes[0]);
+ return 0;
+}
+
+
+/*
+ Return bitmap pattern for tail block where there is size bytes free
+
+ SYNOPSIS
+ free_size_to_tail_pattern()
+ bitmap Bitmap
+ size Requested size
+
+ RETURN
+ 0, 5, 6, 7 For a description of the bitmap sizes, see the header
+*/
+
+static uint free_size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+ if (size >= bitmap->sizes[0])
+ return 0; /* Revert to empty page */
+ if (size < bitmap->sizes[6])
+ return 7;
+ if (size < bitmap->sizes[5])
+ return 6;
+ return 5;
+}
+
+
+/*
+ Return size guranteed to be available on a page
+
+ SYNOPSIS
+ pattern_to_head_size()
+ bitmap Bitmap
+ pattern Pattern (0-7)
+
+ RETURN
+ 0 - block_size
+*/
+
+static inline uint pattern_to_size(MARIA_FILE_BITMAP *bitmap, uint pattern)
+{
+ DBUG_ASSERT(pattern <= 7);
+ return bitmap->sizes[pattern];
+}
+
+
+/*
+ Print bitmap for debugging
+
+ SYNOPSIS
+ _ma_print_bitmap()
+ bitmap Bitmap to print
+
+ IMPLEMENTATION
+ Prints all changed bits since last call to _ma_print_bitmap().
+ This is done by having a copy of the last bitmap in
+ bitmap->map+bitmap->block_size.
+*/
+
+#ifndef DBUG_OFF
+
+/* Human-readable names for the 8 possible 3-bit bitmap patterns (0-7) */
+const char *bits_to_txt[]=
+{
+  "empty", "00-30% full", "30-60% full", "60-90% full", "full",
+  "tail 00-40 % full", "tail 40-80 % full", "tail/blob full"
+};
+
+/*
+  Print all bitmap patterns changed since the last call, by diffing the
+  live map against the shadow copy stored at bitmap->map+block_size.
+*/
+static void _ma_print_bitmap_changes(MARIA_FILE_BITMAP *bitmap)
+{
+  uchar *pos, *end, *org_pos;
+  ulong page;
+  DBUG_ENTER("_ma_print_bitmap_changes");
+
+  end= bitmap->map + bitmap->used_size;
+  DBUG_LOCK_FILE;
+  fprintf(DBUG_FILE,"\nBitmap page changes at page: %lu  bitmap: 0x%lx\n",
+          (ulong) bitmap->page, (long) bitmap->map);
+
+  /* +1: the first data page follows the bitmap page itself */
+  page= (ulong) bitmap->page+1;
+  for (pos= bitmap->map, org_pos= bitmap->map + bitmap->block_size ;
+       pos < end ;
+       pos+= 6, org_pos+= 6)
+  {
+    ulonglong bits= uint6korr(pos);    /* 6 bytes = 6*8/3= 16 patterns */
+    ulonglong org_bits= uint6korr(org_pos);
+    uint i;
+
+    /*
+      Test if there is any changes in the next 16 bitmaps (to not have to
+      loop through all bits if we know they are the same)
+    */
+    if (bits != org_bits)
+    {
+      for (i= 0; i < 16 ; i++, bits>>= 3, org_bits>>= 3)
+      {
+        if ((bits & 7) != (org_bits & 7))
+          fprintf(DBUG_FILE, "Page: %8lu  %s -> %s\n", page+i,
+                  bits_to_txt[org_bits & 7], bits_to_txt[bits & 7]);
+      }
+    }
+    page+= 16;
+  }
+  fputc('\n', DBUG_FILE);
+  DBUG_UNLOCK_FILE;
+  /* Refresh the shadow copy so the next call only shows new changes */
+  memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Print content of one bitmap page ('data') for debugging.
+
+  @param bitmap  Bitmap handler (used only for total_size)
+  @param data    The bitmap page content to dump
+  @param page    Page number of the bitmap page itself
+*/
+
+void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data,
+                      pgcache_page_no_t page)
+{
+  uchar *pos, *end;
+  char llbuff[22];
+
+  /*
+    Removed dead store "end= bitmap->map + bitmap->used_size": 'end' was
+    unconditionally reassigned in the for-init below before any use, and
+    the dead value misleadingly referenced bitmap->map although this
+    function dumps 'data'.
+  */
+  DBUG_LOCK_FILE;
+  fprintf(DBUG_FILE,"\nDump of bitmap page at %s\n", llstr(page, llbuff));
+
+  page++;                                       /* Skip bitmap page */
+  for (pos= data, end= pos + bitmap->total_size;
+       pos < end ;
+       pos+= 6)
+  {
+    ulonglong bits= uint6korr(pos);    /* 6 bytes = 6*8/3= 16 patterns */
+
+    /*
+      Skip the inner loop when all 16 patterns are empty; only non-zero
+      patterns are printed.
+    */
+    if (bits)
+    {
+      uint i;
+      for (i= 0; i < 16 ; i++, bits>>= 3)
+      {
+        if (bits & 7)
+          fprintf(DBUG_FILE, "Page: %8s  %s\n", llstr(page+i, llbuff),
+                  bits_to_txt[bits & 7]);
+      }
+    }
+    page+= 16;
+  }
+  fputc('\n', DBUG_FILE);
+  DBUG_UNLOCK_FILE;
+}
+
+#endif /* DBUG_OFF */
+
+
+/***************************************************************************
+ Reading & writing bitmap pages
+***************************************************************************/
+
+/*
+ Read a given bitmap page
+
+ SYNOPSIS
+ _ma_read_bitmap_page()
+ info Maria handler
+ bitmap Bitmap handler
+ page Page to read
+
+ TODO
+ Update 'bitmap->used_size' to real size of used bitmap
+
+ NOTE
+ We don't always have share->bitmap.bitmap_lock here
+ (when called from_ma_check_bitmap_data() for example).
+
+ RETURN
+ 0 ok
+ 1 error (Error writing old bitmap or reading bitmap page)
+*/
+
+static my_bool _ma_read_bitmap_page(MARIA_HA *info,
+ MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page)
+{
+ MARIA_SHARE *share= info->s;
+ my_bool res;
+ DBUG_ENTER("_ma_read_bitmap_page");
+ DBUG_ASSERT(page % bitmap->pages_covered == 0);
+ DBUG_ASSERT(!bitmap->changed);
+
+ bitmap->page= page;
+ if (((page + 1) * bitmap->block_size) > share->state.state.data_file_length)
+ {
+ /* Inexistent or half-created page */
+ res= _ma_bitmap_create_missing(info, bitmap, page);
+ DBUG_RETURN(res);
+ }
+ bitmap->used_size= bitmap->total_size;
+ DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size);
+ res= pagecache_read(share->pagecache,
+ &bitmap->file, page, 0,
+ bitmap->map, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == NULL;
+
+ /*
+ We can't check maria_bitmap_marker here as if the bitmap page
+ previously had a true checksum and the user switched mode to not checksum
+ this may have any value, except maria_normal_page_marker.
+
+ Using maria_normal_page_marker gives us a protection against bugs
+ when running without any checksums.
+ */
+
+#ifndef DBUG_OFF
+ if (!res)
+ memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size);
+#endif
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Change to another bitmap page
+
+ SYNOPSIS
+ _ma_change_bitmap_page()
+ info Maria handler
+ bitmap Bitmap handler
+ page Bitmap page to read
+
+ NOTES
+ If old bitmap was changed, write it out before reading new one
+ We return empty bitmap if page is outside of file size
+
+ RETURN
+ 0 ok
+ 1 error (Error writing old bitmap or reading bitmap page)
+*/
+
+static my_bool _ma_change_bitmap_page(MARIA_HA *info,
+ MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page)
+{
+ DBUG_ENTER("_ma_change_bitmap_page");
+
+ if (bitmap->changed)
+ {
+ if (write_changed_bitmap(info->s, bitmap))
+ DBUG_RETURN(1);
+ bitmap->changed= 0;
+ }
+ DBUG_RETURN(_ma_read_bitmap_page(info, bitmap, page));
+}
+
+
+/*
+ Read next suitable bitmap
+
+ SYNOPSIS
+ move_to_next_bitmap()
+ bitmap Bitmap handle
+
+ NOTES
+ The found bitmap may be full, so calling function may need to call this
+ repeatedly until it finds enough space.
+
+ TODO
+ Add cache of bitmaps to not read something that is not usable
+
+ RETURN
+ 0 ok
+ 1 error (either couldn't save old bitmap or read new one)
+*/
+
+static my_bool move_to_next_bitmap(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap)
+{
+ pgcache_page_no_t page= bitmap->page;
+ MARIA_STATE_INFO *state= &info->s->state;
+ DBUG_ENTER("move_to_next_bitmap");
+
+ if (state->first_bitmap_with_space != ~(ulonglong) 0 &&
+ state->first_bitmap_with_space != page)
+ {
+ page= state->first_bitmap_with_space;
+ state->first_bitmap_with_space= ~(ulonglong) 0;
+ }
+ else
+ page+= bitmap->pages_covered;
+ DBUG_RETURN(_ma_change_bitmap_page(info, bitmap, page));
+}
+
+
+/****************************************************************************
+ Allocate data in bitmaps
+****************************************************************************/
+
+/*
+ Store data in 'block' and mark the place used in the bitmap
+
+ SYNOPSIS
+ fill_block()
+ bitmap Bitmap handle
+ block Store data about what we found
+ best_data Pointer to best 6 uchar aligned area in bitmap->map
+ best_pos Which bit in *best_data the area starts
+ 0 = first bit pattern, 1 second bit pattern etc
+ best_bits The original value of the bits at best_pos
+ fill_pattern Bitmap pattern to store in best_data[best_pos]
+
+ NOTES
+ We mark all pages to be 'TAIL's, which means that
+ block->page_count is really a row position inside the page.
+*/
+
+static void fill_block(MARIA_FILE_BITMAP *bitmap,
+ MARIA_BITMAP_BLOCK *block,
+ uchar *best_data, uint best_pos, uint best_bits,
+ uint fill_pattern)
+{
+ uint page, offset, tmp;
+ uchar *data;
+ DBUG_ENTER("fill_block");
+
+ /* For each 6 bytes we have 6*8/3= 16 patterns */
+ page= ((uint) (best_data - bitmap->map)) / 6 * 16 + best_pos;
+ DBUG_ASSERT(page + 1 < bitmap->pages_covered);
+ block->page= bitmap->page + 1 + page;
+ block->page_count= TAIL_PAGE_COUNT_MARKER;
+ block->empty_space= pattern_to_size(bitmap, best_bits);
+ block->sub_blocks= 0;
+ block->org_bitmap_value= best_bits;
+ block->used= BLOCKUSED_TAIL; /* See _ma_bitmap_release_unused() */
+
+ /*
+ Mark place used by reading/writing 2 bytes at a time to handle
+ bitmaps in overlapping bytes
+ */
+ best_pos*= 3;
+ data= best_data+ best_pos / 8;
+ offset= best_pos & 7;
+ tmp= uint2korr(data);
+
+ /* we turn off the 3 bits and replace them with fill_pattern */
+ tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset);
+ int2store(data, tmp);
+ bitmap->changed= 1;
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Allocate data for head block
+
+ SYNOPSIS
+ allocate_head()
+ bitmap bitmap
+ size Size of data region we need to store
+ block Store found information here
+
+ IMPLEMENTATION
+ Find the best-fit page to put a region of 'size'
+ This is defined as the first page of the set of pages
+ with the smallest free space that can hold 'size'.
+
+ RETURN
+ 0 ok (block is updated)
+ 1 error (no space in bitmap; block is not touched)
+*/
+
+
+static my_bool allocate_head(MARIA_FILE_BITMAP *bitmap, uint size,
+ MARIA_BITMAP_BLOCK *block)
+{
+ uint min_bits= size_to_head_pattern(bitmap, size);
+ uchar *data= bitmap->map, *end= data + bitmap->used_size;
+ uchar *best_data= 0;
+ uint best_bits= (uint) -1, best_pos;
+ DBUG_ENTER("allocate_head");
+
+ LINT_INIT(best_pos);
+ DBUG_ASSERT(size <= FULL_PAGE_SIZE(bitmap->block_size));
+
+ for (; data < end; data+= 6)
+ {
+ ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */
+ uint i;
+
+ /*
+ Skip common patterns
+ We can skip empty pages (if we already found a match) or
+ anything matching the following pattern as this will be either
+ a full page or a tail page
+ */
+ if ((!bits && best_data) ||
+ ((bits & LL(04444444444444444)) == LL(04444444444444444)))
+ continue;
+ for (i= 0; i < 16 ; i++, bits >>= 3)
+ {
+ uint pattern= (uint) (bits & 7);
+ if (pattern <= min_bits)
+ {
+ /* There is enough space here */
+ if ((int) pattern > (int) best_bits)
+ {
+ /*
+ There is more than enough space here and it's better than what
+ we have found so far. Remember it, as we will choose it if we
+ don't find anything in this bitmap page.
+ */
+ best_bits= pattern;
+ best_data= data;
+ best_pos= i;
+ if (pattern == min_bits)
+ goto found; /* Best possible match */
+ }
+ }
+ }
+ }
+ if (!best_data) /* Found no place */
+ {
+ if (data >= bitmap->map + bitmap->total_size)
+ DBUG_RETURN(1); /* No space in bitmap */
+ /* Allocate data at end of bitmap */
+ bitmap->used_size+= 6;
+ set_if_smaller(bitmap->used_size, bitmap->total_size);
+ best_data= data;
+ best_pos= best_bits= 0;
+ }
+
+found:
+ fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_HEAD_PAGE);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Allocate data for tail block
+
+ SYNOPSIS
+ allocate_tail()
+ bitmap bitmap
+ size Size of block we need to find
+ block Store found information here
+
+ RETURN
+ 0 ok (block is updated)
+ 1 error (no space in bitmap; block is not touched)
+*/
+
+
+static my_bool allocate_tail(MARIA_FILE_BITMAP *bitmap, uint size,
+ MARIA_BITMAP_BLOCK *block)
+{
+ uint min_bits= size_to_tail_pattern(bitmap, size);
+ uchar *data= bitmap->map, *end= data + bitmap->used_size;
+ uchar *best_data= 0;
+ uint best_bits= (uint) -1, best_pos;
+ DBUG_ENTER("allocate_tail");
+ DBUG_PRINT("enter", ("size: %u", size));
+
+ LINT_INIT(best_pos);
+ /*
+ We have to add DIR_ENTRY_SIZE here as this is not part of the data size
+ See call to allocate_tail() in find_tail().
+ */
+ DBUG_ASSERT(size <= MAX_TAIL_SIZE(bitmap->block_size) + DIR_ENTRY_SIZE);
+
+ for (; data < end; data += 6)
+ {
+ ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */
+ uint i;
+
+ /*
+ Skip common patterns
+ We can skip empty pages (if we already found a match) or
+ the following patterns: 1-4 (head pages, not suitable for tail) or
+ 7 (full tail page). See 'Dynamic size records' comment at start of file.
+
+ At the moment we only skip full head and tail pages (ie, all bits are
+ set) as this is easy to detect with one simple test and is a
+ quite common case if we have blobs.
+ */
+
+ if ((!bits && best_data) || bits == LL(0xffffffffffff) ||
+ bits == LL(04444444444444444))
+ continue;
+ for (i= 0; i < 16; i++, bits >>= 3)
+ {
+ uint pattern= (uint) (bits & 7);
+ if (pattern <= min_bits && (!pattern || pattern >= 5))
+ {
+ if ((int) pattern > (int) best_bits)
+ {
+ best_bits= pattern;
+ best_data= data;
+ best_pos= i;
+ if (pattern == min_bits)
+ goto found; /* Can't be better */
+ }
+ }
+ }
+ }
+ if (!best_data)
+ {
+ if (data >= bitmap->map + bitmap->total_size)
+ DBUG_RETURN(1);
+ /* Allocate data at end of bitmap */
+ best_data= data;
+ bitmap->used_size+= 6;
+ set_if_smaller(bitmap->used_size, bitmap->total_size);
+ best_pos= best_bits= 0;
+ }
+
+found:
+ fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_TAIL_PAGE);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Allocate data for full blocks
+
+ SYNOPSIS
+ allocate_full_pages()
+ bitmap bitmap
+ pages_needed Total size in pages (bitmap->total_size) we would like to have
+ block Store found information here
+ full_page 1 if we are not allowed to split extent
+
+ IMPLEMENTATION
+ We will return the smallest area >= size. If there is no such
+ block, we will return the biggest area that satisfies
+ area_size >= min(BLOB_SEGMENT_MIN_SIZE*full_page_size, size)
+
+ To speed up searches, we will only consider areas that has at least 16 free
+ pages starting on an even boundary. When finding such an area, we will
+ extend it with all previous and following free pages. This will ensure
+ we don't get holes between areas
+
+ RETURN
+ # Blocks used
+ 0 error (no space in bitmap; block is not touched)
+*/
+
+static ulong allocate_full_pages(MARIA_FILE_BITMAP *bitmap,
+ ulong pages_needed,
+ MARIA_BITMAP_BLOCK *block, my_bool full_page)
+{
+ uchar *data= bitmap->map, *data_end= data + bitmap->used_size;
+ uchar *page_end= data + bitmap->total_size;
+ uchar *best_data= 0;
+ uint min_size;
+ uint best_area_size, best_prefix_area_size, best_suffix_area_size;
+ uint page, size;
+ ulonglong best_prefix_bits;
+ DBUG_ENTER("allocate_full_pages");
+ DBUG_PRINT("enter", ("pages_needed: %lu", pages_needed));
+
+ /* Following variables are only used if best_data is set */
+ LINT_INIT(best_prefix_bits);
+ LINT_INIT(best_prefix_area_size);
+ LINT_INIT(best_suffix_area_size);
+
+ min_size= pages_needed;
+ if (!full_page && min_size > BLOB_SEGMENT_MIN_SIZE)
+ min_size= BLOB_SEGMENT_MIN_SIZE;
+ best_area_size= ~(uint) 0;
+
+ for (; data < page_end; data+= 6)
+ {
+ ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */
+ uchar *data_start;
+ ulonglong prefix_bits= 0;
+ uint area_size, prefix_area_size, suffix_area_size;
+
+ /* Find area with at least 16 free pages */
+ if (bits)
+ continue;
+ data_start= data;
+ /* Find size of area */
+ for (data+=6 ; data < data_end ; data+= 6)
+ {
+ if ((bits= uint6korr(data)))
+ break;
+ }
+ area_size= (uint) (data - data_start) / 6 * 16;
+ if (area_size >= best_area_size)
+ continue;
+ prefix_area_size= suffix_area_size= 0;
+ if (!bits)
+ {
+ /*
+ End of page; All the rest of the bits on page are part of area
+ This is needed because bitmap->used_size only covers the set bits
+ in the bitmap.
+ */
+ area_size+= (uint) (page_end - data) / 6 * 16;
+ if (area_size >= best_area_size)
+ break;
+ data= page_end;
+ }
+ else
+ {
+ /* Add bits at end of page */
+ for (; !(bits & 7); bits >>= 3)
+ suffix_area_size++;
+ area_size+= suffix_area_size;
+ }
+ if (data_start != bitmap->map)
+ {
+ /* Add bits before page */
+ bits= prefix_bits= uint6korr(data_start - 6);
+ DBUG_ASSERT(bits != 0);
+ /* 111 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 */
+ if (!(bits & LL(07000000000000000)))
+ {
+ data_start-= 6;
+ do
+ {
+ prefix_area_size++;
+ bits<<= 3;
+ } while (!(bits & LL(07000000000000000)));
+ area_size+= prefix_area_size;
+ /* Calculate offset to page from data_start */
+ prefix_area_size= 16 - prefix_area_size;
+ }
+ }
+ if (area_size >= min_size && area_size <= best_area_size)
+ {
+ best_data= data_start;
+ best_area_size= area_size;
+ best_prefix_bits= prefix_bits;
+ best_prefix_area_size= prefix_area_size;
+ best_suffix_area_size= suffix_area_size;
+
+ /* Prefer to put data in biggest possible area */
+ if (area_size <= pages_needed)
+ min_size= area_size;
+ else
+ min_size= pages_needed;
+ }
+ }
+ if (!best_data)
+ DBUG_RETURN(0); /* No room on page */
+
+ /*
+ Now allocate min(pages_needed, area_size), starting from
+ best_start + best_prefix_area_size
+ */
+ if (best_area_size > pages_needed)
+ best_area_size= pages_needed;
+
+ /* For each 6 bytes we have 6*8/3= 16 patterns */
+ page= ((uint) (best_data - bitmap->map) * 8) / 3 + best_prefix_area_size;
+ block->page= bitmap->page + 1 + page;
+ block->page_count= best_area_size;
+ block->empty_space= 0;
+ block->sub_blocks= 0;
+ block->org_bitmap_value= 0;
+ block->used= 0;
+ DBUG_ASSERT(page + best_area_size < bitmap->pages_covered);
+ DBUG_PRINT("info", ("page: %lu page_count: %u",
+ (ulong) block->page, block->page_count));
+
+ if (best_prefix_area_size)
+ {
+ ulonglong tmp;
+ /* Convert offset back to bits */
+ best_prefix_area_size= 16 - best_prefix_area_size;
+ if (best_area_size < best_prefix_area_size)
+ {
+ tmp= (LL(1) << best_area_size*3) - 1;
+ best_area_size= best_prefix_area_size; /* for easy end test */
+ }
+ else
+ tmp= (LL(1) << best_prefix_area_size*3) - 1;
+ tmp<<= (16 - best_prefix_area_size) * 3;
+ DBUG_ASSERT((best_prefix_bits & tmp) == 0);
+ best_prefix_bits|= tmp;
+ int6store(best_data, best_prefix_bits);
+ if (!(best_area_size-= best_prefix_area_size))
+ {
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+ DBUG_RETURN(block->page_count);
+ }
+ best_data+= 6;
+ }
+ best_area_size*= 3; /* Bits to set */
+ size= best_area_size/8; /* Bytes to set */
+ bfill(best_data, size, 255);
+ best_data+= size;
+ if ((best_area_size-= size * 8))
+ {
+ /* fill last uchar */
+ *best_data|= (uchar) ((1 << best_area_size) -1);
+ best_data++;
+ }
+ if (data_end < best_data)
+ {
+ bitmap->used_size= (uint) (best_data - bitmap->map);
+ DBUG_ASSERT(bitmap->used_size <= bitmap->total_size);
+ }
+ bitmap->changed= 1;
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+ DBUG_RETURN(block->page_count);
+}
+
+
+/****************************************************************************
+ Find right bitmaps where to store data
+****************************************************************************/
+
+/*
+ Find right bitmap and position for head block
+
+ SYNOPSIS
+ find_head()
+ info Maria handler
+ length Size of data region we need store
+ position Position in bitmap_blocks where to store the
+ information for the head block.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool find_head(MARIA_HA *info, uint length, uint position)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ MARIA_BITMAP_BLOCK *block;
+ /*
+ There is always place for the head block in bitmap_blocks as these are
+ preallocated at _ma_init_block_record().
+ */
+ block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *);
+
+ /*
+ We need to have DIRENTRY_SIZE here to take into account that we may
+ need an extra directory entry for the row
+ */
+ while (allocate_head(bitmap, length + DIR_ENTRY_SIZE, block))
+ if (move_to_next_bitmap(info, bitmap))
+ return 1;
+ return 0;
+}
+
+
+/*
+ Find right bitmap and position for tail
+
+ SYNOPSIS
+ find_tail()
+ info Maria handler
+ length Size of data region we need store
+ position Position in bitmap_blocks where to store the
+ information for the head block.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool find_tail(MARIA_HA *info, uint length, uint position)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ MARIA_BITMAP_BLOCK *block;
+ DBUG_ENTER("find_tail");
+ DBUG_ASSERT(length <= info->s->block_size - PAGE_OVERHEAD_SIZE);
+
+ /* Needed, as there is no error checking in dynamic_element */
+ if (allocate_dynamic(&info->bitmap_blocks, position))
+ DBUG_RETURN(1);
+ block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *);
+
+ /*
+ We have to add DIR_ENTRY_SIZE to ensure we have space for the tail and
+ it's directroy entry on the page
+ */
+ while (allocate_tail(bitmap, length + DIR_ENTRY_SIZE, block))
+ if (move_to_next_bitmap(info, bitmap))
+ DBUG_RETURN(1);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Find right bitmap and position for full blocks in one extent
+
+ SYNOPSIS
+ find_mid()
+ info Maria handler.
+ pages How many pages to allocate.
+ position Position in bitmap_blocks where to store the
+ information for the head block.
+ NOTES
+ This is used to allocate the main extent after the 'head' block
+ (Ie, the middle part of the head-middle-tail entry)
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool find_mid(MARIA_HA *info, ulong pages, uint position)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ MARIA_BITMAP_BLOCK *block;
+ block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *);
+
+ while (!allocate_full_pages(bitmap, pages, block, 1))
+ {
+ if (move_to_next_bitmap(info, bitmap))
+ return 1;
+ }
+ return 0;
+}
+
+
+/*
+ Find right bitmap and position for putting a blob
+
+ SYNOPSIS
+ find_blob()
+ info Maria handler.
+ length Length of the blob
+
+ NOTES
+ The extents are stored last in info->bitmap_blocks
+
+ IMPLEMENTATION
+ Allocate all full pages for the block + optionally one tail
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool find_blob(MARIA_HA *info, ulong length)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ uint full_page_size= FULL_PAGE_SIZE(info->s->block_size);
+ ulong pages;
+ uint rest_length, used;
+ uint first_block_pos;
+ MARIA_BITMAP_BLOCK *first_block= 0;
+ DBUG_ENTER("find_blob");
+ DBUG_PRINT("enter", ("length: %lu", length));
+ LINT_INIT(first_block_pos);
+
+ pages= length / full_page_size;
+ rest_length= (uint) (length - pages * full_page_size);
+ if (rest_length >= MAX_TAIL_SIZE(info->s->block_size))
+ {
+ pages++;
+ rest_length= 0;
+ }
+
+ first_block_pos= info->bitmap_blocks.elements;
+ if (pages)
+ {
+ MARIA_BITMAP_BLOCK *block;
+ if (allocate_dynamic(&info->bitmap_blocks,
+ info->bitmap_blocks.elements +
+ pages / BLOB_SEGMENT_MIN_SIZE + 2))
+ DBUG_RETURN(1);
+ block= dynamic_element(&info->bitmap_blocks, info->bitmap_blocks.elements,
+ MARIA_BITMAP_BLOCK*);
+ do
+ {
+ /*
+ We use 0x3fff here as the two upmost bits are reserved for
+ TAIL_BIT and START_EXTENT_BIT
+ */
+ used= allocate_full_pages(bitmap,
+ (pages >= 0x3fff ? 0x3fff : (uint) pages),
+ block, 0);
+ if (!used)
+ {
+ if (move_to_next_bitmap(info, bitmap))
+ DBUG_RETURN(1);
+ }
+ else
+ {
+ pages-= used;
+ info->bitmap_blocks.elements++;
+ block++;
+ }
+ } while (pages != 0);
+ }
+ if (rest_length && find_tail(info, rest_length,
+ info->bitmap_blocks.elements++))
+ DBUG_RETURN(1);
+ first_block= dynamic_element(&info->bitmap_blocks, first_block_pos,
+ MARIA_BITMAP_BLOCK*);
+ first_block->sub_blocks= info->bitmap_blocks.elements - first_block_pos;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Find pages to put ALL blobs
+
+ SYNOPSIS
+ allocate_blobs()
+ info Maria handler
+ row Information of what is in the row (from calc_record_size())
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool allocate_blobs(MARIA_HA *info, MARIA_ROW *row)
+{
+ ulong *length, *end;
+ uint elements;
+ /*
+ Reserve size for:
+ head block
+ one extent
+ tail block
+ */
+ elements= info->bitmap_blocks.elements;
+ for (length= row->blob_lengths, end= length + info->s->base.blobs;
+ length < end; length++)
+ {
+ if (*length && find_blob(info, *length))
+ return 1;
+ }
+ row->extents_count= (info->bitmap_blocks.elements - elements);
+ return 0;
+}
+
+
+/*
+ Store in the bitmap the new size for a head page
+
+ SYNOPSIS
+ use_head()
+ info Maria handler
+ page Page number to update
+ (Note that caller guarantees this is in the active
+ bitmap)
+ size How much free space is left on the page
+ block_position In which info->bitmap_block we have the
+ information about the head block.
+
+ NOTES
+ This is used on update where we are updating an existing head page
+*/
+
+static void use_head(MARIA_HA *info, pgcache_page_no_t page, uint size,
+ uint block_position)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ MARIA_BITMAP_BLOCK *block;
+ uchar *data;
+ uint offset, tmp, offset_page;
+ DBUG_ENTER("use_head");
+
+ DBUG_ASSERT(page % bitmap->pages_covered);
+
+ block= dynamic_element(&info->bitmap_blocks, block_position,
+ MARIA_BITMAP_BLOCK*);
+ block->page= page;
+ block->page_count= 1 + TAIL_BIT;
+ block->empty_space= size;
+ block->used= BLOCKUSED_TAIL;
+
+ /*
+ Mark place used by reading/writing 2 bytes at a time to handle
+ bitmaps in overlapping bytes
+ */
+ offset_page= (uint) (page - bitmap->page - 1) * 3;
+ offset= offset_page & 7;
+ data= bitmap->map + offset_page / 8;
+ tmp= uint2korr(data);
+ block->org_bitmap_value= (tmp >> offset) & 7;
+ tmp= (tmp & ~(7 << offset)) | (FULL_HEAD_PAGE << offset);
+ int2store(data, tmp);
+ bitmap->changed= 1;
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Find out where to split the row (ie, what goes in head, middle, tail etc)
+
+ SYNOPSIS
+ find_where_to_split_row()
+ share Maria share
+ row Information of what is in the row (from calc_record_size())
+ extents_length Number of bytes needed to store all extents
+ split_size Free size on the page (The head length must be less
+ than this)
+
+ RETURN
+ row_length for the head block.
+*/
+
+static uint find_where_to_split_row(MARIA_SHARE *share, MARIA_ROW *row,
+ uint extents_length, uint split_size)
+{
+ uint *lengths, *lengths_end;
+ /*
+ Ensure we have the minimum required space on head page:
+ - Header + length of field lengths (row->min_length)
+ - Number of extents
+ - One extent
+ */
+ uint row_length= (row->min_length +
+ size_to_store_key_length(extents_length) +
+ ROW_EXTENT_SIZE);
+ DBUG_ASSERT(row_length < split_size);
+ /*
+ Store first in all_field_lengths the different parts that are written
+ to the row. This needs to be in same order as in
+ ma_block_rec.c::write_block_record()
+ */
+ row->null_field_lengths[-3]= extents_length;
+ row->null_field_lengths[-2]= share->base.fixed_not_null_fields_length;
+ row->null_field_lengths[-1]= row->field_lengths_length;
+ for (lengths= row->null_field_lengths - EXTRA_LENGTH_FIELDS,
+ lengths_end= (lengths + share->base.pack_fields - share->base.blobs +
+ EXTRA_LENGTH_FIELDS); lengths < lengths_end; lengths++)
+ {
+ if (row_length + *lengths > split_size)
+ break;
+ row_length+= *lengths;
+ }
+ return row_length;
+}
+
+
+/*
+ Find where to write the middle parts of the row and the tail
+
+ SYNOPSIS
+ write_rest_of_head()
+ info Maria handler
+ position Position in bitmap_blocks. Is 0 for rows that needs
+ full blocks (ie, has a head, middle part and optional tail)
+ rest_length How much left of the head block to write.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool write_rest_of_head(MARIA_HA *info, uint position,
+ ulong rest_length)
+{
+ MARIA_SHARE *share= info->s;
+ uint full_page_size= FULL_PAGE_SIZE(share->block_size);
+ MARIA_BITMAP_BLOCK *block;
+ DBUG_ENTER("write_rest_of_head");
+ DBUG_PRINT("enter", ("position: %u rest_length: %lu", position,
+ rest_length));
+
+ if (position == 0)
+ {
+ /* Write out full pages */
+ uint pages= rest_length / full_page_size;
+
+ rest_length%= full_page_size;
+ if (rest_length >= MAX_TAIL_SIZE(share->block_size))
+ {
+ /* Put tail on a full page */
+ pages++;
+ rest_length= 0;
+ }
+ if (find_mid(info, pages, 1))
+ DBUG_RETURN(1);
+ /*
+ Insert empty block after full pages, to allow write_block_record() to
+ split segment into used + free page
+ */
+ block= dynamic_element(&info->bitmap_blocks, 2, MARIA_BITMAP_BLOCK*);
+ block->page_count= 0;
+ block->used= 0;
+ }
+ if (rest_length)
+ {
+ if (find_tail(info, rest_length, ELEMENTS_RESERVED_FOR_MAIN_PART - 1))
+ DBUG_RETURN(1);
+ }
+ else
+ {
+ /* Empty tail block */
+ block= dynamic_element(&info->bitmap_blocks,
+ ELEMENTS_RESERVED_FOR_MAIN_PART - 1,
+ MARIA_BITMAP_BLOCK *);
+ block->page_count= 0;
+ block->used= 0;
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Find where to store one row
+
+ SYNOPSIS
+ _ma_bitmap_find_place()
+ info Maria handler
+ row Information about row to write
+ blocks Store data about allocated places here
+
+ RETURN
+ 0 ok
+ row->space_on_head_page contains minimum number of bytes we
+ expect to put on the head page.
+ 1 error
+ my_errno is set to error
+*/
+
+my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row,
+ MARIA_BITMAP_BLOCKS *blocks)
+{
+ MARIA_SHARE *share= info->s;
+ my_bool res= 1;
+ uint full_page_size, position, max_page_size;
+ uint head_length, row_length, rest_length, extents_length;
+ DBUG_ENTER("_ma_bitmap_find_place");
+
+ blocks->count= 0;
+ blocks->tail_page_skipped= blocks->page_skipped= 0;
+ row->extents_count= 0;
+
+ /*
+ Reserve place for the following blocks:
+ - Head block
+ - Full page block
+ - Marker block to allow write_block_record() to split full page blocks
+ into full and free part
+ - Tail block
+ */
+
+ info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART;
+ max_page_size= (share->block_size - PAGE_OVERHEAD_SIZE);
+
+ pthread_mutex_lock(&share->bitmap.bitmap_lock);
+
+ if (row->total_length <= max_page_size)
+ {
+ /* Row fits in one page */
+ position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1;
+ if (find_head(info, (uint) row->total_length, position))
+ goto abort;
+ row->space_on_head_page= row->total_length;
+ goto end;
+ }
+
+ /*
+ First allocate all blobs so that we can find out the needed size for
+ the main block.
+ */
+ if (row->blob_length && allocate_blobs(info, row))
+ goto abort;
+
+ extents_length= row->extents_count * ROW_EXTENT_SIZE;
+ /*
+ The + 3 is reserved for storing the number of segments in the row header.
+ */
+ if ((head_length= (row->head_length + extents_length + 3)) <=
+ max_page_size)
+ {
+ /* Main row part fits into one page */
+ position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1;
+ if (find_head(info, head_length, position))
+ goto abort;
+ row->space_on_head_page= head_length;
+ goto end;
+ }
+
+ /* Allocate enough space */
+ head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE;
+
+ /* The first segment size is stored in 'row_length' */
+ row_length= find_where_to_split_row(share, row, extents_length,
+ max_page_size);
+
+ /* NOTE: despite its name, 'full_page_size' here holds the max tail size */
+ full_page_size= MAX_TAIL_SIZE(share->block_size);
+ position= 0;
+ if (head_length - row_length <= full_page_size)
+ position= ELEMENTS_RESERVED_FOR_MAIN_PART -2; /* Only head and tail */
+ if (find_head(info, row_length, position))
+ goto abort;
+ row->space_on_head_page= row_length;
+
+ rest_length= head_length - row_length;
+ if (write_rest_of_head(info, position, rest_length))
+ goto abort;
+
+end:
+ blocks->block= dynamic_element(&info->bitmap_blocks, position,
+ MARIA_BITMAP_BLOCK*);
+ blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position;
+ /* First block's page_count is for all blocks */
+ blocks->count= info->bitmap_blocks.elements - position;
+ res= 0;
+
+abort:
+ pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Find where to put row on update (when head page is already defined)
+
+ SYNOPSIS
+ _ma_bitmap_find_new_place()
+ info Maria handler
+ row Information about row to write
+ page On which page original row was stored
+ free_size Free size on head page
+ blocks Store data about allocated places here
+
+ NOTES
+ This function is only called when the new row can't fit in the space of
+ the old row in the head page.
+
+ This is essentially same as _ma_bitmap_find_place() except that
+ we don't call find_head() to search in bitmaps where to put the page.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *row,
+ pgcache_page_no_t page, uint free_size,
+ MARIA_BITMAP_BLOCKS *blocks)
+{
+ MARIA_SHARE *share= info->s;
+ my_bool res= 1;
+ uint position;
+ uint head_length, row_length, rest_length, extents_length;
+ ulonglong bitmap_page;
+ DBUG_ENTER("_ma_bitmap_find_new_place");
+
+ blocks->count= 0;
+ blocks->tail_page_skipped= blocks->page_skipped= 0;
+ row->extents_count= 0;
+ info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART;
+
+ pthread_mutex_lock(&share->bitmap.bitmap_lock);
+
+ /*
+ First allocate all blobs (so that we can find out the needed size for
+ the main block.
+ */
+ if (row->blob_length && allocate_blobs(info, row))
+ goto abort;
+
+ /* Switch bitmap to current head page */
+ bitmap_page= page / share->bitmap.pages_covered;
+ bitmap_page*= share->bitmap.pages_covered;
+
+ if (share->bitmap.page != bitmap_page &&
+ _ma_change_bitmap_page(info, &share->bitmap, bitmap_page))
+ goto abort;
+
+ extents_length= row->extents_count * ROW_EXTENT_SIZE;
+ /* The + 3 is for storing the number of segments in the row header */
+ if ((head_length= (row->head_length + extents_length + 3)) <= free_size)
+ {
+ /* Main row part fits into one page */
+ position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1;
+ use_head(info, page, head_length, position);
+ row->space_on_head_page= head_length;
+ goto end;
+ }
+
+ /* Allocate enough space */
+ head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE;
+
+ /* The first segment size is stored in 'row_length' */
+ row_length= find_where_to_split_row(share, row, extents_length, free_size);
+
+ position= 0;
+ if (head_length - row_length < MAX_TAIL_SIZE(share->block_size))
+ position= ELEMENTS_RESERVED_FOR_MAIN_PART -2; /* Only head and tail */
+ use_head(info, page, row_length, position);
+ row->space_on_head_page= row_length;
+
+ rest_length= head_length - row_length;
+ if (write_rest_of_head(info, position, rest_length))
+ goto abort;
+
+end:
+ blocks->block= dynamic_element(&info->bitmap_blocks, position,
+ MARIA_BITMAP_BLOCK*);
+ blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position;
+ /* First block's page_count is for all blocks */
+ blocks->count= info->bitmap_blocks.elements - position;
+ res= 0;
+
+abort:
+ pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+ DBUG_RETURN(res);
+}
+
+
+/****************************************************************************
+ Clear and reset bits
+****************************************************************************/
+
+/*
+ Set fill pattern for a page
+
+ SYNOPSIS
+ set_page_bits()
+ info Maria handler
+ bitmap Bitmap handler
+ page Address to page
+ fill_pattern Pattern (not size) for page
+
+ NOTES
+ Page may not be part of active bitmap
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool set_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page, uint fill_pattern)
+{
+ pgcache_page_no_t bitmap_page;
+ uint offset_page, offset, tmp, org_tmp;
+ uchar *data;
+ DBUG_ENTER("set_page_bits");
+ DBUG_ASSERT(fill_pattern <= 7);
+
+ bitmap_page= page - page % bitmap->pages_covered;
+ if (bitmap_page != bitmap->page &&
+ _ma_change_bitmap_page(info, bitmap, bitmap_page))
+ DBUG_RETURN(1);
+
+ /* Find page number from start of bitmap */
+ offset_page= (uint) (page - bitmap->page - 1);
+ /*
+ Mark place used by reading/writing 2 bytes at a time to handle
+ bitmaps in overlapping bytes
+ */
+ offset_page*= 3;
+ offset= offset_page & 7;
+ data= bitmap->map + offset_page / 8;
+ org_tmp= tmp= uint2korr(data);
+ tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset);
+ if (tmp == org_tmp)
+ DBUG_RETURN(0); /* No changes */
+ int2store(data, tmp);
+
+ bitmap->changed= 1;
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+ /* Patterns 3 (full head) and 7 (full tail/blob) mean the page is full */
+ if (fill_pattern != 3 && fill_pattern != 7)
+ set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page);
+ /*
+ Note that if the condition above is false (page is full), and all pages of
+ this bitmap are now full, and that bitmap page was
+ first_bitmap_with_space, we don't modify first_bitmap_with_space, indeed
+ its value still tells us where to start our search for a bitmap with space
+ (which is for sure after this full one).
+ That does mean that first_bitmap_with_space is only a lower bound.
+ */
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Get bitmap pattern for a given page
+
+ SYNOPSIS
+ get_page_bits()
+ info Maria handler
+ bitmap Bitmap handler
+ page Page number
+
+ RETURN
+ 0-7 Bitmap pattern
+ ~0 Error (couldn't read page)
+*/
+
+uint _ma_bitmap_get_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page)
+{
+ pgcache_page_no_t bitmap_page;
+ uint offset_page, offset, tmp;
+ uchar *data;
+ DBUG_ENTER("_ma_bitmap_get_page_bits");
+
+ bitmap_page= page - page % bitmap->pages_covered;
+ if (bitmap_page != bitmap->page &&
+ _ma_change_bitmap_page(info, bitmap, bitmap_page))
+ DBUG_RETURN(~ (uint) 0);
+
+ /* Find page number from start of bitmap */
+ offset_page= (uint) (page - bitmap->page - 1);
+ /*
+ Read 2 bytes at a time to handle 3-bit patterns that can span
+ two adjacent bytes of the bitmap
+ */
+ offset_page*= 3;
+ offset= offset_page & 7;
+ data= bitmap->map + offset_page / 8;
+ tmp= uint2korr(data);
+ DBUG_RETURN((tmp >> offset) & 7);
+}
+
+
+/*
+ Mark all pages in a region as free
+
+ SYNOPSIS
+ _ma_bitmap_reset_full_page_bits()
+ info Maria handler
+ bitmap Bitmap handler
+ page Start page
+ page_count Number of pages
+
+ NOTES
+ We assume that all pages in region is covered by same bitmap
+ One must have a lock on info->s->bitmap.bitmap_lock
+
+ RETURN
+ 0 ok
+ 1 Error (when reading bitmap)
+*/
+
+my_bool _ma_bitmap_reset_full_page_bits(MARIA_HA *info,
+ MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page,
+ uint page_count)
+{
+ ulonglong bitmap_page;
+ uint offset, bit_start, bit_count, tmp;
+ uchar *data;
+ DBUG_ENTER("_ma_bitmap_reset_full_page_bits");
+ DBUG_PRINT("enter", ("page: %lu page_count: %u", (ulong) page, page_count));
+ safe_mutex_assert_owner(&info->s->bitmap.bitmap_lock);
+
+ bitmap_page= page - page % bitmap->pages_covered;
+ /* 'page' must be a data page, never the bitmap page itself */
+ DBUG_ASSERT(page != bitmap_page);
+
+ if (bitmap_page != bitmap->page &&
+ _ma_change_bitmap_page(info, bitmap, bitmap_page))
+ DBUG_RETURN(1);
+
+ /* Find page number from start of bitmap */
+ offset= (uint) (page - bitmap->page - 1);
+
+ /* Clear bits from 'page * 3' -> '(page + page_count) * 3' */
+ bit_start= offset * 3;
+ bit_count= page_count * 3;
+
+ data= bitmap->map + bit_start / 8;
+ offset= bit_start & 7;
+
+ tmp= (255 << offset); /* Bits to keep */
+ if (bit_count + offset < 8)
+ {
+ /* Only clear bits between 'offset' and 'offset+bit_count-1' */
+ tmp^= (255 << (offset + bit_count));
+ }
+ *data&= ~tmp;
+
+ if ((int) (bit_count-= (8 - offset)) > 0)
+ {
+ uint fill;
+ data++;
+ /*
+ -1 is here to avoid one 'if' statement and to let the following code
+ handle the last byte
+ */
+ if ((fill= (bit_count - 1) / 8))
+ {
+ bzero(data, fill);
+ data+= fill;
+ }
+ bit_count-= fill * 8; /* Bits left to clear */
+ tmp= (1 << bit_count) - 1;
+ *data&= ~tmp;
+ }
+ /* Freed pages means this bitmap now has space */
+ set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page);
+ bitmap->changed= 1;
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+ DBUG_RETURN(0);
+}
+
+/*
+ Set all pages in a region as used
+
+ SYNOPSIS
+ _ma_bitmap_set_full_page_bits()
+ info Maria handler
+ bitmap Bitmap handler
+ page Start page
+ page_count Number of pages
+
+ NOTES
+ We assume that all pages in region is covered by same bitmap
+ One must have a lock on info->s->bitmap.bitmap_lock
+
+ RETURN
+ 0 ok
+ 1 Error (when reading bitmap)
+*/
+
+my_bool _ma_bitmap_set_full_page_bits(MARIA_HA *info,
+ MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page, uint page_count)
+{
+ ulonglong bitmap_page;
+ uint offset, bit_start, bit_count, tmp;
+ uchar *data;
+ DBUG_ENTER("_ma_bitmap_set_full_page_bits");
+ DBUG_PRINT("enter", ("page: %lu page_count: %u", (ulong) page, page_count));
+ safe_mutex_assert_owner(&info->s->bitmap.bitmap_lock);
+
+ bitmap_page= page - page % bitmap->pages_covered;
+ /* The region must not include the bitmap page or cross into the next bitmap */
+ if (page == bitmap_page ||
+ page + page_count >= bitmap_page + bitmap->pages_covered)
+ {
+ DBUG_ASSERT(0); /* Wrong in data */
+ DBUG_RETURN(1);
+ }
+
+ if (bitmap_page != bitmap->page &&
+ _ma_change_bitmap_page(info, bitmap, bitmap_page))
+ DBUG_RETURN(1);
+
+ /* Find page number from start of bitmap */
+ offset= (uint) (page - bitmap->page - 1);
+
+ /* Set bits from 'page * 3' -> '(page + page_count) * 3' */
+ bit_start= offset * 3;
+ bit_count= page_count * 3;
+
+ data= bitmap->map + bit_start / 8;
+ offset= bit_start & 7;
+
+ tmp= (255 << offset); /* Bits to keep */
+ if (bit_count + offset < 8)
+ {
+ /* Only set bits between 'offset' and 'offset+bit_count-1' */
+ tmp^= (255 << (offset + bit_count));
+ }
+ *data|= tmp;
+
+ if ((int) (bit_count-= (8 - offset)) > 0)
+ {
+ uint fill;
+ data++;
+ /*
+ -1 is here to avoid one 'if' statement and to let the following code
+ handle the last byte
+ */
+ if ((fill= (bit_count - 1) / 8))
+ {
+ bfill(data, fill, 255);
+ data+= fill;
+ }
+ bit_count-= fill * 8; /* Bits left to set */
+ tmp= (1 << bit_count) - 1;
+ *data|= tmp;
+ }
+ bitmap->changed= 1;
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+ DBUG_RETURN(0);
+}
+
+
+/**
+ @brief
+ Make a transition of MARIA_FILE_BITMAP::non_flushable.
+ If the bitmap becomes flushable, which requires that REDO-UNDO has been
+ logged and all bitmap pages touched by the thread have a correct
+ allocation, it unpins all bitmap pages, and if _ma_bitmap_flush_all() is
+ waiting (in practice it is a checkpoint), it wakes it up.
+ If the bitmap becomes or stays unflushable, the function merely records it
+ unless a concurrent _ma_bitmap_flush_all() is happening, in which case the
+ function first waits for the flush to be done.
+
+ @note
+ this sets info->non_flushable_state to 1 if we have incremented
+ bitmap->non_flushable and not yet decremented it.
+
+ @param info Maria handler (bitmap is taken from its share)
+ @param non_flushable_inc Increment of MARIA_FILE_BITMAP::non_flushable
+ (-1 or +1).
+*/
+
+void _ma_bitmap_flushable(MARIA_HA *info, int non_flushable_inc)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_FILE_BITMAP *bitmap;
+ DBUG_ENTER("_ma_bitmap_flushable");
+
+ /*
+ Non-transactional tables are never automatically flushed and need no
+ protection
+ */
+ if (!share->now_transactional)
+ DBUG_VOID_RETURN;
+
+ bitmap= &share->bitmap;
+ pthread_mutex_lock(&bitmap->bitmap_lock);
+
+ if (non_flushable_inc == -1)
+ {
+ DBUG_ASSERT((int) bitmap->non_flushable > 0);
+ DBUG_ASSERT(info->non_flushable_state == 1);
+ if (--bitmap->non_flushable == 0)
+ {
+ /*
+ We unlock and unpin pages locked and pinned by other threads. It does
+ not seem to be an issue as all bitmap changes are serialized with
+ the bitmap's mutex.
+ */
+ _ma_bitmap_unpin_all(share);
+ if (unlikely(bitmap->flush_all_requested))
+ {
+ DBUG_PRINT("info", ("bitmap flushable waking up flusher"));
+ pthread_cond_broadcast(&bitmap->bitmap_cond);
+ }
+ }
+ DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable));
+ pthread_mutex_unlock(&bitmap->bitmap_lock);
+ info->non_flushable_state= 0;
+ DBUG_VOID_RETURN;
+ }
+ DBUG_ASSERT(non_flushable_inc == 1);
+ DBUG_ASSERT(info->non_flushable_state == 0);
+ while (unlikely(bitmap->flush_all_requested))
+ {
+ /*
+ Some other thread is waiting for the bitmap to become
+ flushable. Not the moment to make the bitmap unflushable or more
+ unflushable; let's rather back off and wait. If we didn't do this, with
+ multiple writers, there may always be one thread causing the bitmap to
+ be unflushable and _ma_bitmap_flush_all() would wait for long.
+ There should not be a deadlock because if our thread increased
+ non_flushable (and thus _ma_bitmap_flush_all() is waiting for at least
+ our thread), it is not going to increase it more so is not going to come
+ here.
+ */
+ DBUG_PRINT("info", ("waiting for bitmap flusher"));
+ pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock);
+ }
+ bitmap->non_flushable++;
+ DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable));
+ pthread_mutex_unlock(&bitmap->bitmap_lock);
+ info->non_flushable_state= 1;
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Correct bitmap pages to reflect the true allocation
+
+ SYNOPSIS
+ _ma_bitmap_release_unused()
+ info Maria handle
+ blocks Bitmap blocks
+
+ IMPLEMENTATION
+ If block->used & BLOCKUSED_TAIL is set:
+ If block->used & BLOCKUSED_USED is set, then the bits for the
+ corresponding page is set according to block->empty_space
+ If block->used & BLOCKUSED_USED is not set, then the bits for
+ the corresponding page is set to org_bitmap_value;
+
+ If block->used & BLOCKUSED_TAIL is not set:
+ if block->used is not set, the bits for the corresponding page are
+ cleared
+
+ For the first block (head block) the logic is same as for a tail block
+
+ Note that we may have 'filler blocks' that are used to split a block
+ in half; These can be recognized by that they have page_count == 0.
+
+ This code also reverses the effect of ma_bitmap_flushable(.., 1);
+
+ RETURN
+ 0 ok
+ 1 error (Couldn't write or read bitmap page)
+*/
+
+my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks)
+{
+ MARIA_BITMAP_BLOCK *block= blocks->block, *end= block + blocks->count;
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ uint bits, current_bitmap_value;
+ DBUG_ENTER("_ma_bitmap_release_unused");
+
+ /*
+ We can skip FULL_HEAD_PAGE (4) as the page was marked as 'full'
+ when we allocated space in the page
+ */
+ current_bitmap_value= FULL_HEAD_PAGE;
+
+ pthread_mutex_lock(&bitmap->bitmap_lock);
+
+ /* First handle head block */
+ if (block->used & BLOCKUSED_USED)
+ {
+ DBUG_PRINT("info", ("head page: %lu empty_space: %u",
+ (ulong) block->page, block->empty_space));
+ bits= _ma_free_size_to_head_pattern(bitmap, block->empty_space);
+ if (block->used & BLOCKUSED_USE_ORG_BITMAP)
+ current_bitmap_value= block->org_bitmap_value;
+ }
+ else
+ bits= block->org_bitmap_value;
+ /* Only rewrite the bitmap if the pattern actually changed */
+ if (bits != current_bitmap_value)
+ {
+ if (set_page_bits(info, bitmap, block->page, bits))
+ goto err;
+ }
+ else
+ {
+ DBUG_ASSERT(current_bitmap_value ==
+ _ma_bitmap_get_page_bits(info, bitmap, block->page));
+ }
+
+ /* Handle all full pages and tail pages (for head page and blob) */
+ for (block++; block < end; block++)
+ {
+ uint page_count;
+ if (!block->page_count)
+ continue; /* Skip 'filler blocks' */
+
+ page_count= block->page_count;
+ if (block->used & BLOCKUSED_TAIL)
+ {
+ current_bitmap_value= FULL_TAIL_PAGE;
+ /* The bitmap page is only one page */
+ page_count= 1;
+ if (block->used & BLOCKUSED_USED)
+ {
+ DBUG_PRINT("info", ("tail page: %lu empty_space: %u",
+ (ulong) block->page, block->empty_space));
+ bits= free_size_to_tail_pattern(bitmap, block->empty_space);
+ if (block->used & BLOCKUSED_USE_ORG_BITMAP)
+ current_bitmap_value= block->org_bitmap_value;
+ }
+ else
+ bits= block->org_bitmap_value;
+
+ /*
+ The page has all bits set; The following test is an optimization
+ to not set the bits to the same value as before.
+ */
+ if (bits != current_bitmap_value)
+ {
+ if (set_page_bits(info, bitmap, block->page, bits))
+ goto err;
+ }
+ else
+ {
+ DBUG_ASSERT(current_bitmap_value ==
+ _ma_bitmap_get_page_bits(info, bitmap, block->page));
+ }
+ }
+ else if (!(block->used & BLOCKUSED_USED) &&
+ _ma_bitmap_reset_full_page_bits(info, bitmap,
+ block->page, page_count))
+ goto err;
+ }
+
+ /* This duplicates ma_bitmap_flushable(-1) except it already has mutex */
+ if (info->non_flushable_state)
+ {
+ DBUG_ASSERT(((int) (bitmap->non_flushable)) > 0);
+ info->non_flushable_state= 0;
+ if (--bitmap->non_flushable == 0)
+ {
+ _ma_bitmap_unpin_all(info->s);
+ if (unlikely(bitmap->flush_all_requested))
+ {
+ DBUG_PRINT("info", ("bitmap flushable waking up flusher"));
+ pthread_cond_broadcast(&bitmap->bitmap_cond);
+ }
+ }
+ }
+ DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable));
+
+ pthread_mutex_unlock(&bitmap->bitmap_lock);
+ DBUG_RETURN(0);
+
+err:
+ pthread_mutex_unlock(&bitmap->bitmap_lock);
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Free full pages from bitmap and pagecache
+
+ SYNOPSIS
+ _ma_bitmap_free_full_pages()
+ info Maria handle
+ extents Extents (as stored on disk)
+ count Number of extents
+
+ IMPLEMENTATION
+ Mark all full pages (not tails) from extents as free, both in bitmap
+ and page cache.
+
+ RETURN
+ 0 ok
+ 1 error (Couldn't write or read bitmap page)
+*/
+
+my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents,
+ uint count)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ DBUG_ENTER("_ma_bitmap_free_full_pages");
+
+ pthread_mutex_lock(&bitmap->bitmap_lock);
+ for (; count--; extents+= ROW_EXTENT_SIZE)
+ {
+ pgcache_page_no_t page= uint5korr(extents);
+ uint page_count= (uint2korr(extents + ROW_EXTENT_PAGE_SIZE) &
+ ~START_EXTENT_BIT);
+ /* Tail pages are shared with other rows; only free full-page extents */
+ if (!(page_count & TAIL_BIT))
+ {
+ if (page == 0 && page_count == 0)
+ continue; /* Not used extent */
+ if (pagecache_delete_pages(info->s->pagecache, &info->dfile, page,
+ page_count, PAGECACHE_LOCK_WRITE, 1) ||
+ _ma_bitmap_reset_full_page_bits(info, bitmap, page, page_count))
+ {
+ pthread_mutex_unlock(&bitmap->bitmap_lock);
+ DBUG_RETURN(1);
+ }
+ }
+ }
+ pthread_mutex_unlock(&bitmap->bitmap_lock);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Mark in the bitmap how much free space there is on a page
+
+ SYNOPSIS
+ _ma_bitmap_set()
+ info Maria handler
+ page Address to page
+ head 1 if page is a head page, 0 if tail page
+ empty_space How much empty space there is on page
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_bitmap_set(MARIA_HA *info, pgcache_page_no_t page, my_bool head,
+ uint empty_space)
+{
+ MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+ uint bits;
+ my_bool res;
+ DBUG_ENTER("_ma_bitmap_set");
+ DBUG_PRINT("enter", ("page: %lu head: %d empty_space: %u",
+ (ulong) page, head, empty_space));
+
+ pthread_mutex_lock(&info->s->bitmap.bitmap_lock);
+ /* Head and tail pages use different free-space-to-pattern mappings */
+ bits= (head ?
+ _ma_free_size_to_head_pattern(bitmap, empty_space) :
+ free_size_to_tail_pattern(bitmap, empty_space));
+ res= set_page_bits(info, bitmap, page, bits);
+ pthread_mutex_unlock(&info->s->bitmap.bitmap_lock);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Check that bitmap pattern is correct for a page
+
+ NOTES
+ Used in maria_chk
+
+ SYNOPSIS
+ _ma_check_bitmap_data()
+ info Maria handler
+ page_type What kind of page this is
+ page Address to page
+ empty_space Empty space on page
+ bitmap_pattern Store here the pattern that was in the bitmap for the
+ page. This is always updated.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_check_bitmap_data(MARIA_HA *info,
+ enum en_page_type page_type, pgcache_page_no_t page,
+ uint empty_space, uint *bitmap_pattern)
+{
+ uint bits;
+ /* Compute the pattern the page SHOULD have, given its type and free space */
+ switch (page_type) {
+ case UNALLOCATED_PAGE:
+ case MAX_PAGE_TYPE:
+ bits= 0;
+ break;
+ case HEAD_PAGE:
+ bits= _ma_free_size_to_head_pattern(&info->s->bitmap, empty_space);
+ break;
+ case TAIL_PAGE:
+ bits= free_size_to_tail_pattern(&info->s->bitmap, empty_space);
+ break;
+ case BLOB_PAGE:
+ bits= FULL_TAIL_PAGE;
+ break;
+ default:
+ bits= 0; /* to satisfy compiler */
+ DBUG_ASSERT(0);
+ }
+ return ((*bitmap_pattern= _ma_bitmap_get_page_bits(info, &info->s->bitmap,
+ page)) != bits);
+}
+
+
+/*
+ Check if the page type matches the one that we have in the bitmap
+
+ SYNOPSIS
+ _ma_check_if_right_bitmap_type()
+ info Maria handler
+ page_type What kind of page this is
+ page Address to page
+ bitmap_pattern Store here the pattern that was in the bitmap for the
+ page. This is always updated.
+
+ NOTES
+ Used in maria_chk
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info,
+ enum en_page_type page_type,
+ pgcache_page_no_t page,
+ uint *bitmap_pattern)
+{
+ if ((*bitmap_pattern= _ma_bitmap_get_page_bits(info, &info->s->bitmap,
+ page)) > 7)
+ return 1; /* Couldn't read page */
+ /* Valid pattern ranges: head 1-4, tail 5-6, blob 7 */
+ switch (page_type) {
+ case HEAD_PAGE:
+ return *bitmap_pattern < 1 || *bitmap_pattern > 4;
+ case TAIL_PAGE:
+ return *bitmap_pattern < 5;
+ case BLOB_PAGE:
+ return *bitmap_pattern != 7;
+ default:
+ break;
+ }
+ DBUG_ASSERT(0);
+ return 1;
+}
+
+
+/**
+ @brief create the first bitmap page of a freshly created data file
+
+ @param share table's share
+
+ @return Operation status
+ @retval 0 OK
+ @retval !=0 Error
+*/
+
+int _ma_bitmap_create_first(MARIA_SHARE *share)
+{
+ uint block_size= share->bitmap.block_size;
+ File file= share->bitmap.file.file;
+ uchar marker[CRC_SIZE];
+
+ /*
+ Next write operation of the page will write correct CRC
+ if it is needed
+ */
+ int4store(marker, MARIA_NO_CRC_BITMAP_PAGE);
+
+ /* Extend file to one block and write the no-CRC marker at its end */
+ if (my_chsize(file, block_size - sizeof(marker),
+ 0, MYF(MY_WME)) ||
+ my_pwrite(file, marker, sizeof(marker),
+ block_size - sizeof(marker),
+ MYF(MY_NABP | MY_WME)))
+ return 1;
+ share->state.state.data_file_length= block_size;
+ _ma_bitmap_delete_all(share);
+ return 0;
+}
+
+
+/**
+ @brief Pagecache callback to flush the transaction log before a
+ bitmap page is flushed.
+
+ @param page Page's content
+ @param page_no Page's number (<offset>/<page length>)
+ @param data_ptr Callback data pointer (pointer to MARIA_SHARE)
+
+ @return result of translog_flush() (non-zero on error)
+*/
+
+static my_bool
+flush_log_for_bitmap(uchar *page __attribute__((unused)),
+ pgcache_page_no_t page_no __attribute__((unused)),
+ uchar *data_ptr __attribute__((unused)))
+{
+#ifndef DBUG_OFF
+ const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr;
+#endif
+ DBUG_ENTER("flush_log_for_bitmap");
+ DBUG_ASSERT(share->now_transactional);
+ /*
+ WAL imposes that UNDOs reach disk before bitmap is flushed. We don't know
+ the LSN of the last UNDO about this bitmap page, so we flush whole log.
+ */
+ DBUG_RETURN(translog_flush(translog_get_horizon()));
+}
+
+
+/**
+ @brief Set callbacks for bitmap pages
+
+ @note
+ We don't use pagecache_file_init here, as we want to keep the
+ code readable
+*/
+
+void _ma_bitmap_set_pagecache_callbacks(PAGECACHE_FILE *file,
+ MARIA_SHARE *share)
+{
+ file->callback_data= (uchar*) share;
+ file->flush_log_callback= maria_flush_log_for_page_none;
+ file->write_fail= maria_page_write_failure;
+
+ if (share->temporary)
+ {
+ /* Temporary tables: no CRC checking or filling needed */
+ file->read_callback= &maria_page_crc_check_none;
+ file->write_callback= &maria_page_filler_set_none;
+ }
+ else
+ {
+ file->read_callback= &maria_page_crc_check_bitmap;
+ if (share->options & HA_OPTION_PAGE_CHECKSUM)
+ file->write_callback= &maria_page_crc_set_normal;
+ else
+ file->write_callback= &maria_page_filler_set_bitmap;
+ /* Transactional tables must flush the log before a bitmap page (WAL) */
+ if (share->now_transactional)
+ file->flush_log_callback= flush_log_for_bitmap;
+ }
+}
+
+
+/**
+ Extends data file with zeroes and creates new bitmap pages into page cache.
+
+ Writes all bitmap pages in [from, to].
+
+ Non-bitmap pages of zeroes are correct as they are marked empty in
+ bitmaps. Bitmap pages will not be zeroes: they will get their CRC fixed when
+ flushed. And if there is a crash before flush (so they are zeroes at
+ restart), a REDO will re-create them in page cache.
+
+ @param share table's share
+ @param bitmap bitmap handler
+ @param from first bitmap page number to create
+ @param to last bitmap page number to create
+ @param zeroes buffer of one block_size of zeroes
+
+ @retval FALSE OK
+ @retval TRUE pagecache write failed
+*/
+
+static my_bool
+_ma_bitmap_create_missing_into_pagecache(MARIA_SHARE *share,
+ MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t from,
+ pgcache_page_no_t to,
+ uchar *zeroes)
+{
+ pgcache_page_no_t i;
+ /*
+ We do not use my_chsize() because there can be a race between when it
+ reads the physical size and when it writes (assume data_file_length is 10,
+ physical length is 8 and two data pages are in cache, and here we do a
+ my_chsize: my_chsize sees physical length is 8, then the two data pages go
+ to disk then my_chsize writes from page 8 and so overwrites the two data
+ pages, wrongly).
+ We instead rely on the filesystem filling gaps with zeroes.
+ */
+ for (i= from; i <= to; i+= bitmap->pages_covered)
+ {
+ /**
+ No need to keep them pinned, they are new so flushable.
+ @todo but we may want to keep them pinned, as an optimization: if they
+ are not pinned they may go to disk before the data pages go (so, the
+ physical pages would be in non-ascending "sparse" order on disk), or the
+ filesystem may fill gaps with zeroes physically which is a waste of
+ time.
+ */
+ if (pagecache_write(share->pagecache,
+ &bitmap->file, i, 0,
+ zeroes, PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE))
+ goto err;
+ }
+ /*
+ Data pages after data_file_length are full of zeroes but that is allowed
+ as they are marked empty in the bitmap.
+ */
+ return FALSE;
+err:
+ return TRUE;
+}
+
+
+/**
+ Creates missing bitmaps when we extend the data file.
+
+ At run-time, when we need a new bitmap page we come here; and only one bitmap
+ page at a time is created.
+
+ In some recovery cases we insert at a large offset in the data file, way
+ beyond state.data_file_length, so can need to create more than one bitmap
+ page in one go. Known case is:
+ Start a transaction in Maria;
+ delete last row of very large table (with delete_row)
+ do a bulk insert
+ crash
+ Then UNDO_BULK_INSERT will truncate table files, and
+ UNDO_ROW_DELETE will want to put the row back to its original position,
+ extending the data file a lot: bitmap page*s* in the hole must be created,
+ or the table would look corrupted.
+
+ We need to log REDOs for bitmap creation, consider: we apply a REDO for a
+ data page, which creates the first data page covered by a new bitmap
+ not yet created. If the data page is flushed but the bitmap page is not and
+ there is a crash, re-execution of the REDO will complain about the zeroed
+ bitmap page (see it as corruption). Thus a REDO is needed to re-create the
+ bitmap.
+
+ @param info Maria handler
+ @param bitmap Bitmap handler
+ @param page Last bitmap page to create
+
+ @note When this function is called this must be true:
+ ((page + 1) * bitmap->block_size > info->s->state.state.data_file_length)
+
+*/
+
+static my_bool _ma_bitmap_create_missing(MARIA_HA *info,
+ MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page)
+{
+ MARIA_SHARE *share= info->s;
+ uint block_size= bitmap->block_size;
+ pgcache_page_no_t from, to;
+ my_off_t data_file_length= share->state.state.data_file_length;
+ DBUG_ENTER("_ma_bitmap_create_missing");
+
+ /* First (in offset order) bitmap page to create */
+ if (data_file_length < block_size)
+ goto err; /* corrupted, should have first bitmap page */
+
+ from= (data_file_length / block_size - 1) / bitmap->pages_covered + 1;
+ from*= bitmap->pages_covered;
+ /*
+ page>=from because:
+ (page + 1) * bs > dfl, and page == k * pc so:
+ (k * pc + 1) * bs > dfl; k * pc + 1 > dfl / bs; k * pc > dfl / bs - 1
+ k > (dfl / bs - 1) / pc; k >= (dfl / bs - 1) / pc + 1
+ k * pc >= ((dfl / bs - 1) / pc + 1) * pc == from.
+ */
+ DBUG_ASSERT(page >= from);
+
+ if (share->now_transactional)
+ {
+ LSN lsn;
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2];
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ page_store(log_data + FILEID_STORE_SIZE, from);
+ page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ /*
+ We don't use info->trn so that this REDO is always executed even though
+ the UNDO does not reach disk due to crash. This is also consistent with
+ the fact that the new bitmap pages are not pinned.
+ */
+ if (translog_write_record(&lsn, LOGREC_REDO_BITMAP_NEW_PAGE,
+ &dummy_transaction_object, info,
+ (translog_size_t)sizeof(log_data),
+ TRANSLOG_INTERNAL_PARTS + 1, log_array,
+ log_data, NULL))
+ goto err;
+ /*
+ No need to flush the log: the bitmap pages we are going to create will
+ flush it when they go to disk.
+ */
+ }
+
+ /*
+ Last bitmap page. It has special creation: will go to the page cache
+ only later as we are going to modify it very soon.
+ */
+ bzero(bitmap->map, bitmap->block_size);
+ bitmap->used_size= 0;
+#ifndef DBUG_OFF
+ memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size);
+#endif
+
+ /* Last bitmap page to create before 'page' */
+ DBUG_ASSERT(page >= bitmap->pages_covered);
+ to= page - bitmap->pages_covered;
+ /*
+ In run-time situations, from>=to is always false, i.e. we always create
+ one bitmap at a time ('page').
+ */
+ if ((from <= to) &&
+ _ma_bitmap_create_missing_into_pagecache(share, bitmap, from, to,
+ bitmap->map))
+ goto err;
+
+ share->state.state.data_file_length= (page + 1) * bitmap->block_size;
+
+ DBUG_RETURN(FALSE);
+err:
+ DBUG_RETURN(TRUE);
+}
+
+
+/*
+ Apply a LOGREC_REDO_BITMAP_NEW_PAGE record: re-create the bitmap
+ pages in [from, to] stored in 'header', extending the data file.
+
+ RETURN
+ FALSE ok
+ TRUE error (corrupted record or pagecache write failure)
+*/
+
+my_bool _ma_apply_redo_bitmap_new_page(MARIA_HA *info,
+ LSN lsn __attribute__ ((unused)),
+ const uchar *header)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+ my_bool error;
+ pgcache_page_no_t from, to, min_from;
+ DBUG_ENTER("_ma_apply_redo_bitmap_new_page");
+
+ from= page_korr(header);
+ to= page_korr(header + PAGE_STORE_SIZE);
+ DBUG_PRINT("info", ("from: %lu to: %lu", (ulong)from, (ulong)to));
+ /* Both endpoints must be bitmap page numbers (multiples of pages_covered) */
+ if ((from > to) ||
+ (from % bitmap->pages_covered) != 0 ||
+ (to % bitmap->pages_covered) != 0)
+ {
+ error= TRUE; /* corrupted log record */
+ goto err;
+ }
+
+ min_from= (share->state.state.data_file_length / bitmap->block_size - 1) /
+ bitmap->pages_covered + 1;
+ min_from*= bitmap->pages_covered;
+ if (from < min_from)
+ {
+ DBUG_PRINT("info", ("overwrite bitmap pages from %lu", (ulong)min_from));
+ /*
+ We have to overwrite. It could be that there was a bitmap page in
+ memory, covering a data page which went to disk, then crash: the
+ bitmap page is now full of zeros and is ==min_from, we have to overwrite
+ it with correct checksum.
+ */
+ }
+ share->state.changed|= STATE_CHANGED;
+ bzero(info->buff, bitmap->block_size);
+ if (!(error=
+ _ma_bitmap_create_missing_into_pagecache(share, bitmap, from, to,
+ info->buff)))
+ share->state.state.data_file_length= (to + 1) * bitmap->block_size;
+
+err:
+ DBUG_RETURN(error);
+}
diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c
new file mode 100644
index 00000000000..fd02e2ac0ec
--- /dev/null
+++ b/storage/maria/ma_blockrec.c
@@ -0,0 +1,7404 @@
+/* Copyright (C) 2007-2008 Michael Widenius
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Storage of records in block
+
+ Some clarifications about the abbrev used:
+
+  NULL fields -> Fields that may contain a NULL value.
+ Not null fields -> Fields that may not contain a NULL value.
+ Critical fields -> Fields that can't be null and can't be dropped without
+ causing a table reorganization.
+
+
+ Maria will have a LSN at start of each page (excluding the bitmap pages)
+
+ The different page types that are in a data file are:
+
+ Bitmap pages Map of free pages in the next extent (8192 page size
+ gives us 256M of mapped pages / bitmap)
+ Head page Start of rows are stored on this page.
+ A rowid always points to a head page
+ Blob page This page is totally filled with data from one blob or by
+ a set of long VARCHAR/CHAR fields
+ Tail page This contains the last part from different rows, blobs
+ or varchar fields.
+
+ The data file starts with a bitmap page, followed by as many data
+ pages as the bitmap can cover. After this there is a new bitmap page
+ and more data pages etc.
+
+ For information about the bitmap page, see ma_bitmap.c
+
+ Structure of data and tail page:
+
+ The page has a row directory at end of page to allow us to do deletes
+ without having to reorganize the page. It also allows us to later store
+ some more bytes after each row to allow them to grow without having to move
+ around other rows.
+
+ Page header:
+
+ LSN 7 bytes Log position for last page change
+ PAGE_TYPE 1 uchar 1 for head / 2 for tail / 3 for blob
+ DIR_COUNT 1 uchar Number of row/tail entries on page
+ FREE_DIR_LINK 1 uchar Pointer to first free director entry or 255 if no
+ empty space 2 bytes Empty space on page
+
+ The most significant bit in PAGE_TYPE is set to 1 if the data on the page
+ can be compacted to get more space. (PAGE_CAN_BE_COMPACTED)
+
+ Row data
+
+ Row directory of NO entries, that consist of the following for each row
+ (in reverse order; i.e., first record is stored last):
+
+ Position 2 bytes Position of row on page
+ Length 2 bytes Length of entry
+
+ For Position and Length, the 1 most significant bit of the position and
+ the 1 most significant bit of the length could be used for some states of
+ the row (in other words, we should try to keep these reserved)
+
+ Position is 0 if the entry is not used. In this case length[0] points
+ to a previous free entry (255 if no previous entry) and length[1]
+ to the next free entry (or 255 if last free entry). This works because
+ the directory entry 255 can never be marked free (if the first directory
+  entry is freed, the directory is shrunk).
+
+ checksum 4 bytes Reserved for full page read testing and live backup.
+
+ ----------------
+
+ Structure of blob pages:
+
+ LSN 7 bytes Log position for last page change
+ PAGE_TYPE 1 uchar 3
+
+ data
+
+ -----------------
+
+ Row data structure:
+
+ Flag 1 uchar Marker of which header field exists
+ TRANSID 6 bytes TRANSID of changing transaction
+ (optional, added on insert and first
+ update/delete)
+ VER_PTR 7 bytes Pointer to older version in log
+ (undo record)
+ (optional, added after first
+ update/delete)
+ DELETE_TRANSID 6 bytes (optional). TRANSID of original row.
+ Added on delete.
+ Nulls_extended 1 uchar To allow us to add new DEFAULT NULL
+ fields (optional, added after first
+ change of row after alter table)
+ Number of ROW_EXTENT's 1-3 uchar Length encoded, optional
+ This is the number of extents the
+ row is split into
+ First row_extent 7 uchar Pointer to first row extent (optional)
+
+ Total length of length array 1-3 uchar Only used if we have
+ char/varchar/blob fields.
+ Row checksum 1 uchar Only if table created with checksums
+ Null_bits .. One bit for each NULL field (a field that may
+ have the value NULL)
+ Empty_bits .. One bit for each field that may be 'empty'.
+ (Both for null and not null fields).
+ This bit is 1 if the value for the field is
+ 0 or empty string.
+
+ field_offsets 2 byte/offset
+ For each 32'th field, there is one offset
+ that points to where the field information
+ starts in the block. This is to provide
+ fast access to later field in the row
+ when we only need to return a small
+ set of fields.
+ TODO: Implement this.
+
+ Things marked above as 'optional' will only be present if the
+ corresponding bit is set in 'Flag' field. Flag gives us a way to
+ get more space on a page when doing page compaction as we don't need
+ to store TRANSID that have committed before the smallest running
+ transaction we have in memory.
+
+ Data in the following order:
+ (Field order is precalculated when table is created)
+
+ Critical fixed length, not null, fields. (Note, these can't be dropped)
+ Fixed length, null fields
+
+ Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields.
+ Number of bytes used in length array per entry is depending on max length
+ for field.
+
+ ROW_EXTENT's
+ CHAR data (space stripped)
+ VARCHAR data
+ BLOB data
+
+ Fields marked in null_bits or empty_bits are not stored in data part or
+ length array.
+
+ If row doesn't fit into the given block, then the first EXTENT will be
+ stored last on the row. This is done so that we don't break any field
+ data in the middle.
+
+ We first try to store the full row into one block. If that's not possible
+ we move out each big blob into their own extents. If this is not enough we
+ move out a concatenation of all varchars to their own extent.
+
+ Each blob and the concatenated char/varchar fields are stored the following
+ way:
+ - Store the parts in as many full-contiguous pages as possible.
+ - The last part, that doesn't fill a full page, is stored in tail page.
+
+ When doing an insert of a new row, we don't have to have
+ VER_PTR in the row. This will make rows that are not changed stored
+ efficiently. On update and delete we would add TRANSID (if it was an old
+ committed row) and VER_PTR to
+ the row. On row page compaction we can easily detect rows where
+ TRANSID was committed before the longest running transaction
+ started and we can then delete TRANSID and VER_PTR from the row to
+ gain more space.
+
+ If a row is deleted in Maria, we change TRANSID to the deleting
+ transaction's id, change VER_PTR to point to the undo record for the delete,
+ and add DELETE_TRANSID (the id of the transaction which last
+ inserted/updated the row before its deletion). DELETE_TRANSID allows an old
+ transaction to avoid reading the log to know if it can see the last version
+ before delete (in other words it reduces the probability of having to follow
+ VER_PTR). TODO: depending on a compilation option, evaluate the performance
+ impact of not storing DELETE_TRANSID (which would make the row smaller).
+
+ Description of the different parts:
+
+ Flag is coded as:
+
+ Description bit
+ TRANS_ID_exists 0
+ VER_PTR_exists 1
+ Row is deleted 2 (Means that DELETE_TRANSID exists)
+ Nulls_extended_exists 3
+ Row is split 7 This means that 'Number_of_row_extents' exists
+
+ Nulls_extended is the number of new DEFAULT NULL fields in the row
+ compared to the number of DEFAULT NULL fields when the first version
+ of the table was created. If Nulls_extended doesn't exist in the row,
+ we know it's 0 as this must be one of the original rows from when the
+ table was created first time. This coding allows us to add 255*8 =
+ 2048 new fields without requiring a full alter table.
+
+ Empty_bits is used to allow us to store 0, 0.0, empty string, empty
+ varstring and empty blob efficiently. (This is very good for data
+ warehousing where NULL's are often regarded as evil). Having this
+ bitmap also allows us to drop information of a field during a future
+ delete if field was deleted with ALTER TABLE DROP COLUMN. To be able
+ to handle DROP COLUMN, we must store in the index header the fields
+ that has been dropped. When unpacking a row we will ignore dropped
+ fields. When storing a row, we will mark a dropped field either with a
+ null in the null bit map or in the empty_bits and not store any data
+ for it.
+ TODO: Add code for handling dropped fields.
+
+
+ A ROW EXTENT is range of pages. One ROW_EXTENT is coded as:
+
+ START_PAGE 5 bytes
+ PAGE_COUNT 2 bytes. Bit 16 is set if this is a tail page.
+ Bit 15 is to set if this is start of a new
+ blob extent.
+
+ With 8K pages, we can cover 256M in one extent. This coding gives us a
+ maximum file size of 2^40*8192 = 8192 tera
+
+ As an example of ROW_EXTENT handling, assume a row with one integer
+ field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2
+ big BLOB fields that we have updated.
+
+ The record format for storing this into an empty file would be:
+
+ Page 1:
+
+ 00 00 00 00 00 00 00 LSN
+ 01 Only one row in page
+ FF No free dir entry
+ xx xx Empty space on page
+
+ 10 Flag: row split, VER_PTR exists
+ 01 00 00 00 00 00 TRANSID 1
+ 00 00 00 00 00 01 00 VER_PTR to first block in LOG file 1
+ 5 Number of row extents
+ 02 00 00 00 00 03 00 VARCHAR's are stored in full pages 2,3,4
+ 0 No null fields
+ 0 No empty fields
+ 05 00 00 00 00 00 80 Tail page for VARCHAR, rowid 0
+ 06 00 00 00 00 80 00 First blob, stored at page 6-133
+ 05 00 00 00 00 01 80 Tail of first blob (896 bytes) at page 5
+ 86 00 00 00 00 80 00 Second blob, stored at page 134-262
+ 05 00 00 00 00 02 80 Tail of second blob (896 bytes) at page 5
+ 05 00 5 integer
+ FA Length of first varchar field (size 250)
+ 00 60 Length of second varchar field (size 8192*3)
+ 00 60 10 First medium BLOB, 1M
+ 01 00 10 00 Second BLOB, 1M
+ xx xx xx xx xx xx Varchars are stored here until end of page
+
+ ..... until end of page
+
+ 09 00 F4 1F Start position 9, length 8180
+ xx xx xx xx Checksum
+
+ A data page is allowed to have a wrong CRC and header as long as it is
+ marked empty in the bitmap and its directory's count is 0.
+*/
+
+#include "maria_def.h"
+#include "ma_blockrec.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+#include "ma_recovery_util.h"
+#include <lf.h>
+
+/*
+  Struct for having a cursor over a set of extents.
+  This is used to loop over all extents for a row when reading
+  the row data. It's also used to store the tail positions for
+  a read row to be used by a later update/delete command.
+*/
+
+typedef struct st_maria_extent_cursor
+{
+  /*
+    Pointer to packed uchar array of extents for the row.
+    Format is described above in the header
+    (START_PAGE 5 bytes, PAGE_COUNT 2 bytes per extent)
+  */
+  uchar *extent;
+  /* Where data starts on page; Only for debugging */
+  uchar *data_start;
+  /* Position to all tails in the row. Updated when reading a row */
+  MARIA_RECORD_POS *tail_positions;
+  /* Current page */
+  pgcache_page_no_t page;
+  /* How many pages in the page region */
+  uint page_count;
+  /* What kind of lock to use for tail pages */
+  enum pagecache_page_lock lock_for_tail_pages;
+  /* Total number of extents (i.e., entries in the 'extent' slot) */
+  uint extent_count;
+  /* <> 0 if current extent is a tail page; Set while using cursor */
+  uint tail;
+  /* Position for tail on tail page */
+  uint tail_row_nr;
+  /*
+    == 1 if we are working on the first extent (i.e., the one that is stored in
+    the row header, not an extent that is stored as part of the row data).
+  */
+  my_bool first_extent;
+} MARIA_EXTENT_CURSOR;
+
+
+/**
+ @brief Structure for passing down info to write_hook_for_clr_end().
+  This hook needs to know the variation of the live checksum caused by the
+ current operation to update state.checksum under log's mutex,
+ needs to know the transaction's previous undo_lsn to set
+ trn->undo_lsn under log mutex, and needs to know the type of UNDO being
+ undone now to modify state.records under log mutex.
+*/
+
+/** S:share,D:checksum_delta,E:expression,P:pointer_into_record,L:length */
+/*
+  If the table has checksums (calc_checksum set), evaluate E into D,
+  store it at P and grow the length L; otherwise D stays 0 and nothing
+  is stored.
+*/
+#define store_checksum_in_rec(S,D,E,P,L) do \
+  { \
+    D= 0; \
+    if ((S)->calc_checksum != NULL) \
+    { \
+      D= (E); \
+      ha_checksum_store(P, D); \
+      L+= HA_CHECKSUM_STORE_SIZE; \
+    } \
+  } while (0)
+
+
+static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails);
+static my_bool delete_head_or_tail(MARIA_HA *info,
+ pgcache_page_no_t page, uint record_number,
+ my_bool head, my_bool from_update);
+#ifndef DBUG_OFF
+static void _ma_print_directory(FILE *file, uchar *buff, uint block_size);
+#endif
+static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block,
+ uint block_size, ulong length,
+ uint *tot_ranges);
+static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
+ LEX_CUSTRING *log_parts,
+ uint *log_parts_count);
+static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
+ const uchar *newrec,
+ LEX_CUSTRING *log_parts,
+ uint *log_parts_count);
+
+/****************************************************************************
+ Initialization
+****************************************************************************/
+
+/*
+ Initialize data needed for block structures
+*/
+
+
+/* Size of the different header elements for a row */
+
+/* Entry order matches the Flag bits documented above (bits 0..3) */
+static uchar header_sizes[]=
+{
+  TRANSID_SIZE, /* bit 0: TRANSID */
+  VERPTR_SIZE, /* bit 1: VER_PTR */
+  TRANSID_SIZE, /* bit 2: Delete transid */
+  1 /* bit 3: Null extends */
+};
+
+/*
+ Calculate array of all used headers
+
+ Used to speed up:
+
+ size= 1;
+ if (flag & 1)
+ size+= TRANSID_SIZE;
+ if (flag & 2)
+ size+= VERPTR_SIZE;
+ if (flag & 4)
+ size+= TRANSID_SIZE
+ if (flag & 8)
+ size+= 1;
+
+ NOTES
+ This is called only once at startup of Maria
+*/
+
+static uchar total_header_size[1 << array_elements(header_sizes)];
+#define PRECALC_HEADER_BITMASK (array_elements(total_header_size) -1)
+
+/*
+  Precalculate total_header_size[]: for every possible Flag value, the
+  sum of FLAG_SIZE plus the sizes of all optional header elements whose
+  bit is set. Called once at Maria startup.
+*/
+void _ma_init_block_record_data(void)
+{
+  uint flag;
+  bzero(total_header_size, sizeof(total_header_size));
+  total_header_size[0]= FLAG_SIZE; /* Flag uchar */
+  for (flag= 1; flag < array_elements(total_header_size); flag++)
+  {
+    uint idx, mask, size= FLAG_SIZE;
+    /* Add the size of every header element whose flag bit is set */
+    for (idx= 0, mask= 1; mask <= flag; idx++, mask<<= 1)
+    {
+      if (flag & mask)
+        size+= header_sizes[idx];
+    }
+    total_header_size[flag]= size;
+  }
+}
+
+
+/*
+  One-time init of block-record data for a share: compute the maximum
+  data file length implied by the row reference length and init the bitmap.
+*/
+my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file)
+{
+  ulonglong max_length;
+
+  /*
+    rec_reflength-1 bytes address pages of block_size bytes, so the
+    largest file length is (2^(8*(rec_reflength-1)) - 1) * block_size.
+  */
+  max_length= (((ulonglong) 1 << ((share->base.rec_reflength-1)*8)) - 1) *
+    share->block_size;
+#if SIZEOF_OFF_T == 4
+  /* 32-bit file offsets cap the file size */
+  set_if_smaller(max_length, INT_MAX32);
+#endif
+  share->base.max_data_file_length= max_length;
+  return _ma_bitmap_init(share, data_file);
+}
+
+
+/**
+  Release block-record resources when the share is closed: end the bitmap,
+  flush (or discard, for temporary/deleted tables) its pagecache blocks,
+  sync and close the file, and de-assign the share's translog id.
+
+  @return 0 ok, 1 if any flush/sync/close step failed
+*/
+
+my_bool _ma_once_end_block_record(MARIA_SHARE *share)
+{
+  int res= _ma_bitmap_end(share);
+  if (share->bitmap.file.file >= 0)
+  {
+    if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
+                               ((share->temporary || share->deleting) ?
+                                FLUSH_IGNORE_CHANGED :
+                                FLUSH_RELEASE)))
+      res= 1;
+    /*
+      File must be synced as it is going out of the maria_open_list and so
+      becoming unknown to Checkpoint.
+    */
+    if (share->now_transactional &&
+        my_sync(share->bitmap.file.file, MYF(MY_WME)))
+      res= 1;
+    if (my_close(share->bitmap.file.file, MYF(MY_WME)))
+      res= 1;
+    /*
+      Trivial assignment to guard against multiple invocations
+      (May happen if file are closed but we want to keep the maria object
+      around a bit longer)
+    */
+    share->bitmap.file.file= -1;
+  }
+  if (share->id != 0)
+  {
+    /*
+      We de-assign the id even though index has not been flushed, this is ok
+      as close_lock serializes us with a Checkpoint looking at our share.
+    */
+    translog_deassign_id_from_share(share);
+  }
+  return res;
+}
+
+
+/*
+  Init info->cur_row structure
+
+  Allocates, in a single my_multi_malloc() block, all per-handler row
+  buffers (empty bits, field/blob lengths, null-field lengths, tail
+  positions, log parts) for both cur_row and new_row, plus an initial
+  extents buffer and the dynamic array of bitmap blocks.
+
+  RETURN  0 ok, 1 out of memory
+*/
+
+my_bool _ma_init_block_record(MARIA_HA *info)
+{
+  MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row;
+  MARIA_SHARE *share= info->s;
+  uint default_extents;
+  DBUG_ENTER("_ma_init_block_record");
+
+  /* NOTE: row->empty_bits is the head pointer of this multi-malloc block */
+  if (!my_multi_malloc(MY_WME,
+                       &row->empty_bits, share->base.pack_bytes,
+                       &row->field_lengths,
+                       share->base.max_field_lengths + 2,
+                       &row->blob_lengths, sizeof(ulong) * share->base.blobs,
+                       &row->null_field_lengths, (sizeof(uint) *
+                                                  (share->base.fields -
+                                                   share->base.blobs +
+                                                   EXTRA_LENGTH_FIELDS)),
+                       &row->tail_positions, (sizeof(MARIA_RECORD_POS) *
+                                              (share->base.blobs + 2)),
+                       &new_row->empty_bits, share->base.pack_bytes,
+                       &new_row->field_lengths,
+                       share->base.max_field_lengths + 2,
+                       &new_row->blob_lengths,
+                       sizeof(ulong) * share->base.blobs,
+                       &new_row->null_field_lengths, (sizeof(uint) *
+                                                      (share->base.fields -
+                                                       share->base.blobs +
+                                                       EXTRA_LENGTH_FIELDS)),
+                       &info->log_row_parts,
+                       sizeof(*info->log_row_parts) *
+                       (TRANSLOG_INTERNAL_PARTS + 3 +
+                        share->base.fields + 3),
+                       &info->update_field_data,
+                       (share->base.fields * 4 +
+                        share->base.max_field_lengths + 1 + 4),
+                       NullS, 0))
+    DBUG_RETURN(1);
+  /* Skip over bytes used to store length of field length for logging */
+  row->field_lengths+= 2;
+  new_row->field_lengths+= 2;
+
+  /* Reserve some initial space to avoid mallocs during execution */
+  default_extents= (ELEMENTS_RESERVED_FOR_MAIN_PART + 1 +
+                    (AVERAGE_BLOB_SIZE /
+                     FULL_PAGE_SIZE(share->block_size) /
+                     BLOB_SEGMENT_MIN_SIZE));
+
+  if (my_init_dynamic_array(&info->bitmap_blocks,
+                            sizeof(MARIA_BITMAP_BLOCK), default_extents,
+                            64))
+    goto err;
+  info->cur_row.extents_buffer_length= default_extents * ROW_EXTENT_SIZE;
+  if (!(info->cur_row.extents= my_malloc(info->cur_row.extents_buffer_length,
+                                         MYF(MY_WME))))
+    goto err;
+
+  info->row_base_length= share->base_length;
+  info->row_flag= share->base.default_row_flag;
+
+  /*
+    We need to reserve 'EXTRA_LENGTH_FIELDS' number of parts in
+    null_field_lengths to allow splitting of rows in 'find_where_to_split_row'
+  */
+  row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
+  new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
+
+  DBUG_RETURN(0);
+
+err:
+  _ma_end_block_record(info);
+  DBUG_RETURN(1);
+}
+
+
+/* Free all buffers allocated by _ma_init_block_record() */
+
+void _ma_end_block_record(MARIA_HA *info)
+{
+  DBUG_ENTER("_ma_end_block_record");
+  /*
+    cur_row.empty_bits is the head of the my_multi_malloc() block from
+    _ma_init_block_record(); one free releases all row buffers.
+  */
+  my_free(info->cur_row.empty_bits, MYF(MY_ALLOW_ZERO_PTR));
+  delete_dynamic(&info->bitmap_blocks);
+  my_free(info->cur_row.extents, MYF(MY_ALLOW_ZERO_PTR));
+  my_free(info->blob_buff, MYF(MY_ALLOW_ZERO_PTR));
+  /*
+    The data file is closed, when needed, in ma_once_end_block_record().
+    The following protects us from doing an extra, not allowed, close
+    in maria_close()
+  */
+  info->dfile.file= -1;
+  DBUG_VOID_RETURN;
+}
+
+
+/****************************************************************************
+ Helper functions
+****************************************************************************/
+
+/*
+  Return the next unused position on the page after a directory entry.
+
+ SYNOPSIS
+ start_of_next_entry()
+ dir Directory entry to be used. This can not be the
+ the last entry on the page!
+
+ RETURN
+ # Position in page where next entry starts.
+ Everything between the '*dir' and this are free to be used.
+*/
+
+static inline uint start_of_next_entry(uchar *dir)
+{
+  uchar *prev= dir - DIR_ENTRY_SIZE;
+  /*
+    Skip deleted entries (both position bytes zero). The directory never
+    starts with a deleted entry, so a used entry is always found and the
+    loop terminates.
+  */
+  while (prev[0] == 0 && prev[1] == 0)
+    prev-= DIR_ENTRY_SIZE;
+  return (uint) uint2korr(prev);
+}
+
+
+/*
+ Return the offset where the previous entry ends (before on page)
+
+ SYNOPSIS
+ end_of_previous_entry()
+ dir Address for current directory entry
+ end Address to last directory entry
+
+ RETURN
+ # Position where previous entry ends (smallest address on page)
+ Everything between # and current entry are free to be used.
+*/
+
+
+static inline uint end_of_previous_entry(uchar *dir, uchar *end)
+{
+  uchar *scan;
+  /* Walk towards 'end' until a used entry (non-zero position) is found */
+  for (scan= dir + DIR_ENTRY_SIZE ; scan < end ; scan+= DIR_ENTRY_SIZE)
+  {
+    uint position= uint2korr(scan);
+    if (position)
+      return position + uint2korr(scan + 2);  /* entry end = pos + length */
+  }
+  return PAGE_HEADER_SIZE;                    /* no previous used entry */
+}
+
+
+#ifndef DBUG_OFF
+
+/* Dump the page directory (offset:length per entry) to 'file'; debug aid */
+
+static void _ma_print_directory(FILE *file, uchar *buff, uint block_size)
+{
+  uint max_entry= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET], row= 0;
+  uint end_of_prev_row= PAGE_HEADER_SIZE;
+  uchar *dir, *end;
+
+  /* Directory is stored in reverse order at the end of the page */
+  dir= dir_entry_pos(buff, block_size, max_entry-1);
+  end= dir_entry_pos(buff, block_size, 0);
+
+  DBUG_LOCK_FILE; /* If using DBUG_FILE */
+  fprintf(file,"Directory dump (pos:length):\n");
+
+  for (row= 1; dir <= end ; end-= DIR_ENTRY_SIZE, row++)
+  {
+    uint offset= uint2korr(end);
+    uint length= uint2korr(end+2);
+    fprintf(file, " %4u:%4u", offset, offset ? length : 0);
+    if (!(row % (80/12)))
+      fputc('\n', file);
+    if (offset)
+    {
+      /* Used entries must appear in increasing offset order */
+      DBUG_ASSERT(offset >= end_of_prev_row);
+      end_of_prev_row= offset + length;
+    }
+  }
+  fputc('\n', file);
+  fflush(file);
+  DBUG_UNLOCK_FILE;
+}
+
+
+/*
+  Verify page directory invariants (debug builds only):
+  - used entries have increasing, non-overlapping offsets
+  - every used entry with a length is at least min_row_length bytes
+  - computed empty space matches real_empty_size, or the page header's
+    EMPTY_SPACE_OFFSET when real_empty_size == (uint) -1
+  - the free list links exactly the deleted (position == 0) entries,
+    with correct back-links
+*/
+
+static void check_directory(uchar *buff, uint block_size, uint min_row_length,
+                            uint real_empty_size)
+{
+  uchar *dir, *end;
+  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
+  uint start_of_dir, deleted;
+  uint end_of_prev_row= PAGE_HEADER_SIZE;
+  uint empty_size_on_page;
+  uint empty_size;
+  uchar free_entry, prev_free_entry;
+
+  dir= dir_entry_pos(buff, block_size, max_entry-1);
+  start_of_dir= (uint) (dir - buff);
+  end= dir_entry_pos(buff, block_size, 0);
+  deleted= empty_size= 0;
+
+  empty_size_on_page= (real_empty_size != (uint) -1 ? real_empty_size :
+                       uint2korr(buff + EMPTY_SPACE_OFFSET));
+
+  /* Ensure that all rows are in increasing order and no overlaps */
+  for (; dir <= end ; end-= DIR_ENTRY_SIZE)
+  {
+    uint offset= uint2korr(end);
+    uint length= uint2korr(end+2);
+    if (offset)
+    {
+      DBUG_ASSERT(offset >= end_of_prev_row);
+      DBUG_ASSERT(!length || length >= min_row_length);
+      /* Gap between previous row's end and this row is free space */
+      empty_size+= offset - end_of_prev_row;
+      end_of_prev_row= offset + length;
+    }
+    else
+      deleted++;
+  }
+  empty_size+= start_of_dir - end_of_prev_row;
+  DBUG_ASSERT(end_of_prev_row <= start_of_dir);
+  DBUG_ASSERT(empty_size == empty_size_on_page);
+
+  /* check free links */
+  free_entry= buff[DIR_FREE_OFFSET];
+  prev_free_entry= END_OF_DIR_FREE_LIST;
+  while (free_entry != END_OF_DIR_FREE_LIST)
+  {
+    uchar *dir= dir_entry_pos(buff, block_size, free_entry);
+    DBUG_ASSERT(dir[0] == 0 && dir[1] == 0);
+    DBUG_ASSERT(dir[2] == prev_free_entry);
+    prev_free_entry= free_entry;
+    free_entry= dir[3];
+    deleted--;
+  }
+  /* Every deleted entry must have been reachable through the free list */
+  DBUG_ASSERT(deleted == 0);
+}
+#else
+#define check_directory(A,B,C,D)
+#endif /* DBUG_OFF */
+
+
+/**
+  @brief Calculate if there is enough entries on the page
+
+  @param buff            Page buffer
+  @param block_size      Size of page
+  @param wanted_entries  Number of new directory entries we want to add
+
+  @return 1 if the directory can take wanted_entries more entries, either
+          because the count stays <= MAX_ROWS_PER_PAGE or because enough
+          deleted entries can be reused from the free list; 0 otherwise
+*/
+
+static my_bool enough_free_entries(uchar *buff, uint block_size,
+                                   uint wanted_entries)
+{
+  uint entries= (uint) buff[DIR_COUNT_OFFSET];
+  uint needed_free_entries, free_entry;
+
+  if (entries + wanted_entries <= MAX_ROWS_PER_PAGE)
+    return 1;
+
+  /* Check if enough free entries in free list */
+  needed_free_entries= entries + wanted_entries - MAX_ROWS_PER_PAGE;
+
+  free_entry= (uint) buff[DIR_FREE_OFFSET];
+  while (free_entry != END_OF_DIR_FREE_LIST)
+  {
+    uchar *dir;
+    /* Pre-decrement: succeed when the needed'th free entry is visited */
+    if (!--needed_free_entries)
+      return 1;
+    dir= dir_entry_pos(buff, block_size, free_entry);
+    free_entry= dir[3]; /* dir[3] links to the next free entry */
+  }
+  return 0; /* Not enough entries */
+}
+
+
+/**
+  @brief Check if there is room for more rows on page
+
+  @fn enough_free_entries_on_page
+  @param share      Table share (supplies block_size and number of blobs)
+  @param page_buff  Page buffer
+
+  @return 0 Directory is full
+  @return 1 There is room for more entries on the page
+*/
+
+my_bool enough_free_entries_on_page(MARIA_SHARE *share,
+                                    uchar *page_buff)
+{
+  enum en_page_type page_type;
+  /* Mask away the 'can be compacted' flag bit to get the raw page type */
+  page_type= (enum en_page_type) (page_buff[PAGE_TYPE_OFFSET] &
+                                  ~(uchar) PAGE_CAN_BE_COMPACTED);
+
+  if (page_type == HEAD_PAGE)
+  {
+    /* Head pages only ever need one new entry */
+    uint row_count= (uint) page_buff[DIR_COUNT_OFFSET];
+    return !(row_count == MAX_ROWS_PER_PAGE &&
+             page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST);
+  }
+  /* Tail pages may need up to one entry per blob plus one for the row */
+  return enough_free_entries(page_buff, share->block_size,
+                             1 + share->base.blobs);
+}
+
+
+/**
+  @brief Extend a record area to fit a given size block
+
+  @fn extend_area_on_page()
+  @param info            Handler if head page and 0 if tail page
+  @param buff            Page buffer
+  @param dir             Pointer to dir entry in buffer
+  @param rownr           Row number we working on
+  @param block_size      Block size of buffer
+  @param request_length  How much data we want to put at [dir]
+  @param empty_space     Total empty space in buffer
+                         This is updated with length after dir
+                         is allocated and current block freed
+  @param ret_offset      Pointer to store offset to found area
+  @param ret_length      Pointer to store length of found area
+
+  @implementation
+    The logic is as follows (same as in _ma_update_block_record())
+    - If new data fits in old block, use old block.
+    - Extend block with empty space before block. If enough, use it.
+    - Extend block with empty space after block. If enough, use it.
+    - Use _ma_compact_block_page() to get all empty space at dir.
+
+  @note
+    The given directory entry is set to rec length.
+    empty_space doesn't include the new directory entry
+
+  @return
+  @retval 0  ok; rec_offset is also stored at [dir]
+  @retval 1  error (wrong info in block)
+*/
+
+static my_bool extend_area_on_page(MARIA_HA *info,
+                                   uchar *buff, uchar *dir,
+                                   uint rownr, uint block_size,
+                                   uint request_length,
+                                   uint *empty_space, uint *ret_offset,
+                                   uint *ret_length)
+{
+  uint rec_offset, length, org_rec_length;
+  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
+  DBUG_ENTER("extend_area_on_page");
+
+  /*
+    We can't check for min length here as we may have called
+    extend_directory() to create a new (empty) entry just before
+  */
+  check_directory(buff, block_size, 0, *empty_space);
+
+  rec_offset= uint2korr(dir);
+  if (rec_offset)
+  {
+    /* Extending old row; Mark current space as 'free' */
+    length= org_rec_length= uint2korr(dir + 2);
+    DBUG_PRINT("info", ("rec_offset: %u length: %u request_length: %u "
+                        "empty_space: %u",
+                        rec_offset, org_rec_length, request_length,
+                        *empty_space));
+
+    *empty_space+= org_rec_length;
+  }
+  else
+  {
+    /* Reusing free directory entry; Free it from the directory list */
+    if (dir[2] == END_OF_DIR_FREE_LIST)
+      buff[DIR_FREE_OFFSET]= dir[3];
+    else
+    {
+      uchar *prev_dir= dir_entry_pos(buff, block_size, (uint) dir[2]);
+      DBUG_ASSERT(uint2korr(prev_dir) == 0 && prev_dir[3] == (uchar) rownr);
+      prev_dir[3]= dir[3];
+    }
+    if (dir[3] != END_OF_DIR_FREE_LIST)
+    {
+      uchar *next_dir= dir_entry_pos(buff, block_size, (uint) dir[3]);
+      DBUG_ASSERT(uint2korr(next_dir) == 0 && next_dir[2] == (uchar) rownr);
+      next_dir[2]= dir[2];
+    }
+    rec_offset= start_of_next_entry(dir);
+    length= 0;
+  }
+  if (length < request_length)
+  {
+    uint old_rec_offset;
+    /*
+      New data did not fit in old position.
+      Find first possible position where to put new data.
+    */
+    old_rec_offset= rec_offset;
+    rec_offset= end_of_previous_entry(dir, buff + block_size -
+                                      PAGE_SUFFIX_SIZE);
+    /* Grow block with the free space before it */
+    length+= (uint) (old_rec_offset - rec_offset);
+    DBUG_ASSERT(old_rec_offset);
+    /*
+      'length' is 0 if we are doing an insert into a not allocated block.
+      This can only happen during "REDO of INSERT" or "UNDO of DELETE."
+    */
+    if (length < request_length)
+    {
+      /*
+        Did not fit in current block + empty space. Extend with
+        empty space after block.
+      */
+      if (rownr == max_entry - 1)
+      {
+        /* Last entry; Everything is free between this and directory */
+        length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) -
+                 rec_offset);
+      }
+      else
+        length= start_of_next_entry(dir) - rec_offset;
+      DBUG_ASSERT((int) length >= 0);
+      if (length < request_length)
+      {
+        /* Not enough continuous space, compact page to get more */
+        int2store(dir, rec_offset);
+        /* Reset length, as this may be a deleted block */
+        int2store(dir+2, 0);
+        _ma_compact_block_page(buff, block_size, rownr, 1,
+                               info ? info->trn->min_read_from: 0,
+                               info ? info->s->base.min_block_length : 0);
+        rec_offset= uint2korr(dir);
+        length= uint2korr(dir+2);
+        if (length < request_length)
+        {
+          DBUG_PRINT("error", ("Not enough space: "
+                               "length: %u request_length: %u",
+                               length, request_length));
+          my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
+          DBUG_ASSERT(0); /* For debugging */
+          DBUG_RETURN(1); /* Error in block */
+        }
+        *empty_space= length; /* All space is here */
+      }
+    }
+  }
+  int2store(dir, rec_offset);
+  int2store(dir + 2, length);
+  *ret_offset= rec_offset;
+  *ret_length= length;
+
+  check_directory(buff, block_size, info ? info->s->base.min_block_length : 0,
+                  *empty_space - length);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Copy not changed fields from 'from' to 'to'
+
+  @param info            Maria handler
+  @param changed_fields  Bitmap with one bit set per changed column
+  @param to              Destination record buffer
+  @param from            Source record buffer
+
+  @notes
+    Assumption is that most fields are not changed!
+    (Which is why we don't test if all bits are set for some bytes in bitmap)
+*/
+
+void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields,
+                             uchar *to, uchar *from)
+{
+  MARIA_COLUMNDEF *column, *end_column;
+  uchar *bitmap= (uchar*) changed_fields->bitmap;
+  MARIA_SHARE *share= info->s;
+  uint bit= 1;
+
+  for (column= share->columndef, end_column= column+ share->base.fields;
+       column < end_column; column++)
+  {
+    if (!(*bitmap & bit))
+    {
+      uint field_length= column->length;
+      if (column->type == FIELD_VARCHAR)
+      {
+        /* Copy only the used prefix: 1 or 2 length bytes plus the data */
+        if (column->fill_length == 1)
+          field_length= (uint) from[column->offset] + 1;
+        else
+          field_length= uint2korr(from + column->offset) + 2;
+      }
+      memcpy(to + column->offset, from + column->offset, field_length);
+    }
+    /* Advance the bit; move to the next bitmap byte after 8 fields */
+    if ((bit= (bit << 1)) == 256)
+    {
+      bitmap++;
+      bit= 1;
+    }
+  }
+}
+
+#ifdef NOT_YET_NEEDED
+/*
+  Calculate empty space on a page.
+
+  Returns the whole block for unallocated pages, the page header's
+  EMPTY_SPACE value for head/tail pages, and 0 for blob pages (which
+  are always completely filled).
+*/
+
+static uint empty_space_on_page(uchar *buff, uint block_size)
+{
+  /*
+    Fix: the original wrote 'enum en_page_type;' which declares no
+    variable, leaving 'page_type' undeclared (a compile error if
+    NOT_YET_NEEDED is ever defined).
+  */
+  enum en_page_type page_type;
+  page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] &
+                                  ~(uchar) PAGE_CAN_BE_COMPACTED);
+  if (page_type == UNALLOCATED_PAGE)
+    return block_size;
+  if ((uint) page_type <= TAIL_PAGE)
+    return uint2korr(buff+EMPTY_SPACE_OFFSET);
+  return 0; /* Blob page */
+}
+#endif
+
+
+/*
+  @brief Ensure we have space for new directory entries
+
+  @fn make_space_for_directory()
+  @param info        Handler if head page and 0 if tail page
+  @param buff        Page buffer
+  @param block_size  Block size for pages
+  @param max_entry   Number of current entries in directory
+  @param count       Number of new entries to be added to directory
+  @param first_dir   First directory entry on page
+  @param empty_space Total empty space in buffer. It's updated
+                     to reflect the new empty space
+  @param first_pos   Store position to last data byte on page here
+
+  @note
+    This function is inline as the argument passing is the biggest
+    part of the function
+
+  @return
+  @retval 0  ok
+  @retval 1  error (No data on page, fatal error)
+*/
+
+static inline my_bool
+make_space_for_directory(MARIA_HA *info,
+                         uchar *buff, uint block_size, uint max_entry,
+                         uint count, uchar *first_dir, uint *empty_space,
+                         uint *first_pos)
+{
+  uint length_needed= DIR_ENTRY_SIZE * count;
+
+  /*
+    The following is not true only in the case where an UNDO is used to
+    reinsert a row on a previously not used page
+  */
+  if (likely(max_entry))
+  {
+    /* Check if there is place for the directory entry on the page */
+    *first_pos= uint2korr(first_dir) + uint2korr(first_dir + 2);
+
+    if ((uint) (first_dir - buff) < *first_pos + length_needed)
+    {
+      /* Create place for directory */
+      _ma_compact_block_page(buff, block_size, max_entry - 1, 0,
+                             info ? info->trn->min_read_from : 0,
+                             info ? info->s->base.min_block_length : 0);
+      *first_pos= (uint2korr(first_dir) + uint2korr(first_dir + 2));
+      *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+      if (*empty_space < length_needed)
+      {
+        /*
+          We should always have space, as we only come here for
+          UNDO of DELETE (in which case we know the row was on the
+          page before) or if the bitmap told us there was space on page
+        */
+        DBUG_ASSERT(0);
+        return(1);
+      }
+    }
+  }
+  else
+    *first_pos= PAGE_HEADER_SIZE;
+
+  /* Reduce directory entry size from free space size */
+  (*empty_space)-= length_needed;
+  buff[DIR_COUNT_OFFSET]= (uchar) (max_entry + count);
+  return(0);
+}
+
+
+/*
+ Find free position in directory
+
+ SYNOPSIS
+ find_free_position()
+ info Handler if head page and 0 otherwise
+ buff Page
+ block_size Size of page
+ res_rownr Store index to free position here
+ res_length Store length of found segment here
+ empty_space Store length of empty space on disk here. This is
+ all empty space, including the found block.
+
+ NOTES
+ If there is a free directory entry (entry with position == 0),
+ then use it and change it to be the size of the empty block
+ after the previous entry. This guarantees that all row entries
+ are stored on disk in inverse directory order, which makes life easier for
+ '_ma_compact_block_page()' and to know if there is free space after any
+ block.
+
+ If there is no free entry (entry with position == 0), then we create
+ a new one. If there is not space for the directory entry (because
+ the last block overlaps with the directory), we compact the page.
+
+ We will update the offset and the length of the found dir entry to
+ match the position and empty space found.
+
+ buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller
+
+ See start of file for description of how free directory entries are linked
+
+ RETURN
+ 0 Error (directory full or last block goes over directory)
+ # Pointer to directory entry on page
+*/
+
+static uchar *find_free_position(MARIA_HA *info,
+ uchar *buff, uint block_size, uint *res_rownr,
+ uint *res_length, uint *empty_space)
+{
+ uint max_entry, free_entry;
+ uint length, first_pos;
+ uchar *dir, *first_dir;
+ DBUG_ENTER("find_free_position");
+
+ max_entry= (uint) buff[DIR_COUNT_OFFSET];
+ free_entry= (uint) buff[DIR_FREE_OFFSET];
+ *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+
+ DBUG_PRINT("info", ("max_entry: %u free_entry: %u", max_entry, free_entry));
+
+ first_dir= dir_entry_pos(buff, block_size, max_entry - 1);
+
+ /* Search after first free position */
+ if (free_entry != END_OF_DIR_FREE_LIST)
+ {
+ if (free_entry >= max_entry)
+ DBUG_RETURN(0); /* Consistency error */
+ dir= dir_entry_pos(buff, block_size, free_entry);
+ /* Free entries have offset 0; dir[2]/dir[3] hold back/forward links */
+ DBUG_ASSERT(uint2korr(dir) == 0 && dir[2] == END_OF_DIR_FREE_LIST);
+ /* Relink free list */
+ if ((buff[DIR_FREE_OFFSET]= dir[3]) != END_OF_DIR_FREE_LIST)
+ {
+ uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
+ DBUG_ASSERT((uint) next_entry[2] == free_entry &&
+ uint2korr(next_entry) == 0);
+ next_entry[2]= END_OF_DIR_FREE_LIST; /* Backlink */
+ }
+
+ /* Claim the empty space between the two neighbouring entries */
+ first_pos= end_of_previous_entry(dir, buff + block_size -
+ PAGE_SUFFIX_SIZE);
+ length= start_of_next_entry(dir) - first_pos;
+ int2store(dir, first_pos); /* Update dir entry */
+ int2store(dir + 2, 0);
+ *res_rownr= free_entry;
+ *res_length= length;
+
+ check_directory(buff, block_size,
+ info ? info->s->base.min_block_length : 0, (uint) -1);
+ DBUG_RETURN(dir);
+ }
+ /* No free places in dir; create a new one */
+
+ /* Check if there is place for the directory entry */
+ if (max_entry == MAX_ROWS_PER_PAGE)
+ DBUG_RETURN(0);
+
+ if (make_space_for_directory(info, buff, block_size, max_entry, 1,
+ first_dir, empty_space, &first_pos))
+ DBUG_RETURN(0);
+
+ /* New entry goes just below the previously last directory entry */
+ dir= first_dir - DIR_ENTRY_SIZE;
+ length= (uint) (dir - buff - first_pos);
+ DBUG_ASSERT(length <= *empty_space);
+ int2store(dir, first_pos);
+ int2store(dir + 2, 0); /* Max length of region */
+ *res_rownr= max_entry;
+ *res_length= length;
+
+ check_directory(buff, block_size, info ? info->s->base.min_block_length : 0,
+ *empty_space);
+ DBUG_RETURN(dir);
+}
+
+
+/**
+ @brief Enlarge page directory to hold more entries
+
+ @fn extend_directory()
+ @param info Handler if head page and 0 otherwise
+ @param buff Page buffer
+ @param block_size Block size
+ @param max_entry Number of directory entries on page
+ @param new_entry Position for new entry
+ @param empty_space Total empty space in buffer. It's updated
+ to reflect the new empty space
+
+ @note
+ This is only called on UNDO when we want to expand the directory
+ to be able to re-insert row in a given position
+
+ The new directory entry will be set to cover the maximum possible space
+
+ @return
+ @retval 0 ok
+ @retval 1 error (No data on page, fatal error)
+*/
+
+static my_bool extend_directory(MARIA_HA *info, uchar *buff, uint block_size,
+ uint max_entry, uint new_entry,
+ uint *empty_space)
+{
+ uint length, first_pos;
+ uchar *dir, *first_dir;
+ DBUG_ENTER("extend_directory");
+
+ /*
+ Note that if max_entry is 0, then first_dir will point to
+ an illegal directory entry. This is ok, as in this case we will
+ not access anything through first_dir.
+ */
+ first_dir= dir_entry_pos(buff, block_size, max_entry) + DIR_ENTRY_SIZE;
+
+ if (make_space_for_directory(info, buff, block_size, max_entry,
+ new_entry - max_entry + 1,
+ first_dir, empty_space, &first_pos))
+ DBUG_RETURN(1);
+
+ /* Set the new directory entry to cover the max possible length */
+ dir= first_dir - DIR_ENTRY_SIZE * (new_entry - max_entry + 1);
+ length= (uint) (dir - buff - first_pos);
+ int2store(dir, first_pos);
+ int2store(dir+2, length);
+ *empty_space-= length;
+
+ if (new_entry-- > max_entry)
+ {
+ /* Link all row entries between new_entry and max_entry into free list */
+ uint free_entry= (uint) buff[DIR_FREE_OFFSET];
+ uint prev_entry= END_OF_DIR_FREE_LIST;
+ buff[DIR_FREE_OFFSET]= new_entry;
+ do
+ {
+ /* Mark entry free (offset 0) and chain it: dir[2]=back, dir[3]=next */
+ dir+= DIR_ENTRY_SIZE;
+ dir[0]= dir[1]= 0;
+ dir[2]= (uchar) prev_entry;
+ dir[3]= (uchar) new_entry-1;
+ prev_entry= new_entry;
+ } while (new_entry-- > max_entry);
+ if ((dir[3]= free_entry) != END_OF_DIR_FREE_LIST)
+ {
+ /* Relink next entry to point to newly freed entry */
+ uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
+ DBUG_ASSERT(uint2korr(next_entry) == 0 &&
+ next_entry[2] == END_OF_DIR_FREE_LIST);
+ next_entry[2]= max_entry;
+ }
+ }
+
+ check_directory(buff, block_size,
+ info ? min(info->s->base.min_block_length, length) : 0,
+ *empty_space);
+ DBUG_RETURN(0);
+}
+
+
+/****************************************************************************
+ Updating records
+****************************************************************************/
+
+/*
+ Calculate length of all the different field parts
+
+ SYNOPSIS
+ calc_record_size()
+ info Maria handler
+ record Row to store
+ row Store statistics about row here
+
+ NOTES
+ The statistics is used to find out how much space a row will need
+ and also where we can split a row when we need to split it into several
+ extents.
+*/
+
+static void calc_record_size(MARIA_HA *info, const uchar *record,
+ MARIA_ROW *row)
+{
+ MARIA_SHARE *share= info->s;
+ uchar *field_length_data;
+ MARIA_COLUMNDEF *column, *end_column;
+ uint *null_field_lengths= row->null_field_lengths;
+ ulong *blob_lengths= row->blob_lengths;
+ DBUG_ENTER("calc_record_size");
+
+ row->normal_length= row->char_length= row->varchar_length=
+ row->blob_length= row->extents_count= 0;
+
+ /* Create empty bitmap and calculate length of each varlength/char field */
+ bzero(row->empty_bits, share->base.pack_bytes);
+ field_length_data= row->field_lengths;
+ for (column= share->columndef + share->base.fixed_not_null_fields,
+ end_column= share->columndef + share->base.fields;
+ column < end_column; column++, null_field_lengths++)
+ {
+ if ((record[column->null_pos] & column->null_bit))
+ {
+ /* SQL NULL value; takes no data space */
+ if (column->type != FIELD_BLOB)
+ *null_field_lengths= 0;
+ else
+ *blob_lengths++= 0;
+ continue;
+ }
+ switch (column->type) {
+ case FIELD_CHECK:
+ case FIELD_NORMAL: /* Fixed length field */
+ case FIELD_ZERO:
+ DBUG_ASSERT(column->empty_bit == 0);
+ /* fall through */
+ case FIELD_SKIP_PRESPACE: /* Not packed */
+ row->normal_length+= column->length;
+ *null_field_lengths= column->length;
+ break;
+ case FIELD_SKIP_ZERO: /* Fixed length field */
+ if (memcmp(record+ column->offset, maria_zero_string,
+ column->length) == 0)
+ {
+ /* All-zero value is stored as just an empty bit */
+ row->empty_bits[column->empty_pos] |= column->empty_bit;
+ *null_field_lengths= 0;
+ }
+ else
+ {
+ row->normal_length+= column->length;
+ *null_field_lengths= column->length;
+ }
+ break;
+ case FIELD_SKIP_ENDSPACE: /* CHAR */
+ {
+ /* Strip trailing spaces; store only the remaining prefix length */
+ const uchar *pos, *end;
+ for (pos= record + column->offset, end= pos + column->length;
+ end > pos && end[-1] == ' '; end--)
+ ;
+ if (pos == end) /* If empty string */
+ {
+ row->empty_bits[column->empty_pos]|= column->empty_bit;
+ *null_field_lengths= 0;
+ }
+ else
+ {
+ uint length= (uint) (end - pos);
+ if (column->length <= 255)
+ *field_length_data++= (uchar) length;
+ else
+ {
+ int2store(field_length_data, length);
+ field_length_data+= 2;
+ }
+ row->char_length+= length;
+ *null_field_lengths= length;
+ }
+ break;
+ }
+ case FIELD_VARCHAR:
+ {
+ uint length, field_length_data_length;
+ const uchar *field_pos= record + column->offset;
+
+ /* 256 is correct as this includes the length uchar */
+ field_length_data[0]= field_pos[0];
+ if (column->length <= 256)
+ {
+ length= (uint) (uchar) *field_pos;
+ field_length_data_length= 1;
+ }
+ else
+ {
+ length= uint2korr(field_pos);
+ field_length_data[1]= field_pos[1];
+ field_length_data_length= 2;
+ }
+ *null_field_lengths= length;
+ if (!length)
+ {
+ row->empty_bits[column->empty_pos]|= column->empty_bit;
+ break;
+ }
+ row->varchar_length+= length;
+ /* Note: duplicate of the assignment above; harmless */
+ *null_field_lengths= length;
+ field_length_data+= field_length_data_length;
+ break;
+ }
+ case FIELD_BLOB:
+ {
+ const uchar *field_pos= record + column->offset;
+ uint size_length= column->length - portable_sizeof_char_ptr;
+ ulong blob_length= _ma_calc_blob_length(size_length, field_pos);
+
+ *blob_lengths++= blob_length;
+ if (!blob_length)
+ row->empty_bits[column->empty_pos]|= column->empty_bit;
+ else
+ {
+ row->blob_length+= blob_length;
+ memcpy(field_length_data, field_pos, size_length);
+ field_length_data+= size_length;
+ }
+ break;
+ }
+ default:
+ DBUG_ASSERT(0);
+ }
+ }
+ row->field_lengths_length= (uint) (field_length_data - row->field_lengths);
+ /*
+ - info->row_base_length is base information we must have on a page in first
+ extent:
+ - flag byte (1) + is_nulls_extended (0 | 1) + null_bytes + pack_bytes +
+ table_checksum (0 | 1)
+ - row->min_length is minimum amount of data we must store on
+ a page. bitmap code will ensure we get at least this much +
+ total number of extents and one extent information
+ - fixed_not_null_fields_length is length of fixed length fields that can't
+ be compacted
+ - head_length is the amount of data for the head page
+ (ie, all fields except blobs)
+ */
+ row->min_length= (info->row_base_length +
+ (share->base.max_field_lengths ?
+ size_to_store_key_length(row->field_lengths_length) :
+ 0));
+ row->head_length= (row->min_length +
+ share->base.fixed_not_null_fields_length +
+ row->field_lengths_length +
+ row->normal_length +
+ row->char_length + row->varchar_length);
+ row->total_length= (row->head_length + row->blob_length);
+ if (row->total_length < share->base.min_block_length)
+ row->total_length= share->base.min_block_length;
+ DBUG_PRINT("exit", ("head_length: %lu total_length: %lu",
+ (ulong) row->head_length, (ulong) row->total_length));
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ Compact page by removing all space between rows
+
+ Moves up all rows to start of page. Moves blocks that are directly after
+ each other with one memmove.
+
+ @note if rownr is the last row in the page, and extend_block is false,
+ caller has to make sure to update bitmap page afterwards to reflect freed
+ space.
+
+ @param buff Page to compact
+ @param block_size Size of page
+ @param rownr Put empty data after this row
+ @param extend_block If 1, extend the block at 'rownr' to cover the
+ whole block.
+ @param min_read_from If <> 0, remove all trid's that are less than this
+ @param min_row_length Minimum length a stored row must keep; rows that
+ become shorter after transid removal are
+ zero-extended up to this length
+*/
+
+void _ma_compact_block_page(uchar *buff, uint block_size, uint rownr,
+ my_bool extend_block, TrID min_read_from,
+ uint min_row_length)
+{
+ uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
+ uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block;
+ uint freed_size= 0;
+ uchar *dir, *end;
+ DBUG_ENTER("_ma_compact_block_page");
+ DBUG_PRINT("enter", ("rownr: %u min_read_from: %lu", rownr,
+ (ulong) min_read_from));
+ DBUG_ASSERT(max_entry > 0 &&
+ max_entry < (block_size - PAGE_HEADER_SIZE -
+ PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE);
+
+ /* Move all entries before and including rownr up to start of page */
+ dir= dir_entry_pos(buff, block_size, rownr);
+ end= dir_entry_pos(buff, block_size, 0);
+ page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE;
+ diff= 0;
+ for (; dir <= end ; end-= DIR_ENTRY_SIZE)
+ {
+ uint offset= uint2korr(end);
+
+ if (offset)
+ {
+ uint row_length= uint2korr(end + 2);
+ DBUG_ASSERT(offset >= page_pos);
+ DBUG_ASSERT(buff + offset + row_length <= dir);
+ DBUG_ASSERT(row_length >= min_row_length || row_length == 0);
+
+ /* Row length can be zero if row is to be deleted */
+ if (min_read_from && row_length && (buff[offset] & ROW_FLAG_TRANSID))
+ {
+ TrID transid= transid_korr(buff+offset+1);
+ if (transid < min_read_from)
+ {
+ /* Remove transid from row by moving the start point of the row up */
+ buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
+ offset+= TRANSID_SIZE;
+ freed_size+= TRANSID_SIZE;
+ row_length-= TRANSID_SIZE;
+ int2store(end+2, row_length);
+ }
+ }
+
+ if (offset != next_free_pos)
+ {
+ uint length= (next_free_pos - start_of_found_block);
+ /*
+ There was empty space before this and prev block
+ Check if we have to move previous block up to page start
+ */
+ if (page_pos != start_of_found_block)
+ {
+ /* move up previous block */
+ memmove(buff + page_pos, buff + start_of_found_block, length);
+ }
+ page_pos+= length;
+ /* next continuous block starts here */
+ start_of_found_block= offset;
+ diff= offset - page_pos;
+ }
+ int2store(end, offset - diff); /* correct current pos */
+ next_free_pos= offset + row_length;
+
+ if (unlikely(row_length < min_row_length) && row_length)
+ {
+ /*
+ This can only happen in the case we compacted transid and
+ the row become 'too short'
+
+ Move the current row down to it's right place and extend it
+ with 0.
+ */
+ uint row_diff= min_row_length - row_length;
+ uint length= (next_free_pos - start_of_found_block);
+
+ DBUG_ASSERT(page_pos != start_of_found_block);
+ bmove(buff + page_pos, buff + start_of_found_block, length);
+ bzero(buff+ page_pos + length, row_diff);
+ page_pos+= min_row_length;
+ int2store(end+2, min_row_length);
+ freed_size-= row_diff;
+ next_free_pos= start_of_found_block= page_pos;
+ diff= 0;
+ }
+ }
+ }
+ if (page_pos != start_of_found_block)
+ {
+ /* Move the last pending run of rows up to the packed area */
+ uint length= (next_free_pos - start_of_found_block);
+ memmove(buff + page_pos, buff + start_of_found_block, length);
+ }
+ start_of_found_block= uint2korr(dir);
+
+ if (rownr != max_entry - 1)
+ {
+ /* Move all entries after rownr to end of page */
+ uint rownr_length;
+
+ DBUG_ASSERT(extend_block); /* Should always be true */
+ next_free_pos= end_of_found_block= page_pos=
+ block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE;
+ diff= 0;
+ /* End points to entry before 'rownr' */
+ for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE)
+ {
+ uint offset= uint2korr(dir);
+ uint row_length;
+ uint row_end;
+ if (!offset)
+ continue;
+ row_length= uint2korr(dir + 2);
+ row_end= offset + row_length;
+ DBUG_ASSERT(offset >= start_of_found_block &&
+ row_end <= next_free_pos && row_length >= min_row_length);
+
+ if (min_read_from && (buff[offset] & ROW_FLAG_TRANSID))
+ {
+ TrID transid= transid_korr(buff + offset+1);
+ if (transid < min_read_from)
+ {
+ /* Remove transid from row */
+ buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
+ offset+= TRANSID_SIZE;
+ row_length-= TRANSID_SIZE;
+ int2store(dir+2, row_length);
+ }
+ if (unlikely(row_length < min_row_length))
+ {
+ /*
+ This can only happen in the case we compacted transid and
+ the row become 'too short'
+ */
+ uint row_diff= min_row_length - row_length;
+ if (next_free_pos < row_end + row_diff)
+ {
+ /*
+ Not enough space for extending next block with enough
+ end 0's. Move current data down to get place for them
+ */
+ uint move_down= row_diff - (next_free_pos - row_end);
+ bmove(buff + offset - move_down, buff + offset, row_length);
+ offset-= move_down;
+ }
+ /*
+ Extend the next block with 0, which will be part of current
+ row when the blocks are joined together later
+ */
+ bzero(buff + next_free_pos - row_diff, row_diff);
+ next_free_pos-= row_diff;
+ int2store(dir+2, min_row_length);
+ }
+ row_end= offset + row_length;
+ }
+
+ if (row_end != next_free_pos)
+ {
+ uint length= (end_of_found_block - next_free_pos);
+ if (page_pos != end_of_found_block)
+ {
+ /* move next block down */
+ memmove(buff + page_pos - length, buff + next_free_pos, length);
+ }
+ page_pos-= length;
+ /* next continuous block starts here */
+ end_of_found_block= row_end;
+ diff= page_pos - row_end;
+ }
+ int2store(dir, offset + diff); /* correct current pos */
+ next_free_pos= offset;
+ }
+ if (page_pos != end_of_found_block)
+ {
+ uint length= (end_of_found_block - next_free_pos);
+ memmove(buff + page_pos - length, buff + next_free_pos, length);
+ next_free_pos= page_pos- length;
+ }
+
+ /* Extend rownr block to cover hole */
+ rownr_length= next_free_pos - start_of_found_block;
+ int2store(dir+2, rownr_length);
+ DBUG_ASSERT(rownr_length >= min_row_length);
+ }
+ else
+ {
+ if (extend_block)
+ {
+ /* Extend last block to cover whole page */
+ uint length= ((uint) (dir - buff) - start_of_found_block);
+ int2store(dir+2, length);
+ DBUG_ASSERT(length >= min_row_length);
+ }
+ else
+ {
+ /* Add length gained from freed transaction id's to this page */
+ uint length= uint2korr(buff+ EMPTY_SPACE_OFFSET) + freed_size;
+ int2store(buff + EMPTY_SPACE_OFFSET, length);
+ }
+ /* Page is now fully compacted; nothing more to gain until next change */
+ buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED;
+ }
+ check_directory(buff, block_size, min_row_length,
+ extend_block ? 0 : (uint) -1);
+ DBUG_EXECUTE("directory", _ma_print_directory(DBUG_FILE, buff, block_size););
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Create an empty tail or head page
+
+ SYNOPSIS
+ make_empty_page()
+ buff Page buffer
+ block_size Block size
+ page_type HEAD_PAGE or TAIL_PAGE
+ create_dir_entry TRUE if we should create a directory entry
+
+ NOTES
+ EMPTY_SPACE is not updated
+*/
+
+static void make_empty_page(MARIA_HA *info, uchar *buff, uint page_type,
+ my_bool create_dir_entry)
+{
+ uint block_size= info->s->block_size;
+ DBUG_ENTER("make_empty_page");
+
+ bzero(buff, PAGE_HEADER_SIZE);
+
+#if !defined(DONT_ZERO_PAGE_BLOCKS) || defined(HAVE_valgrind)
+ /*
+ We zero the rest of the block to avoid getting old memory information
+ to disk and to allow the file to be compressed better if archived.
+ The code does not assume the block is zeroed.
+ */
+ if (page_type != BLOB_PAGE)
+ bzero(buff+ PAGE_HEADER_SIZE, block_size - PAGE_HEADER_SIZE);
+#endif
+ buff[PAGE_TYPE_OFFSET]= (uchar) page_type;
+ buff[DIR_COUNT_OFFSET]= (int) create_dir_entry;
+ buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST;
+ if (create_dir_entry)
+ {
+ /* Create directory entry to point to start of page with size 0 */
+ buff+= block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
+ int2store(buff, PAGE_HEADER_SIZE);
+ int2store(buff+2, 0);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Read or initialize new head or tail page
+
+ SYNOPSIS
+ get_head_or_tail_page()
+ info Maria handler
+ block Block to read
+ buff Suggest this buffer to key cache
+ length Minimum space needed
+ page_type HEAD_PAGE || TAIL_PAGE
+ res Store result position here
+
+ NOTES
+ We don't decrement buff[EMPTY_SPACE_OFFSET] with the allocated data
+ as we don't know how much data the caller will actually use.
+
+ res->empty_space is set to length of empty space
+
+ RETURN
+ 0 ok All slots in 'res' are updated
+ 1 error my_errno is set
+*/
+
+/* Result descriptor filled in by get_head_or_tail_page() and friends */
+struct st_row_pos_info
+{
+ uchar *buff; /* page buffer */
+ uchar *data; /* Place for data */
+ uchar *dir; /* Directory */
+ uint length; /* Length for data */
+ uint rownr; /* Offset in directory */
+ uint empty_space; /* Space left on page */
+};
+
+
+static my_bool get_head_or_tail_page(MARIA_HA *info,
+ MARIA_BITMAP_BLOCK *block,
+ uchar *buff, uint length, uint page_type,
+ enum pagecache_page_lock lock,
+ struct st_row_pos_info *res)
+{
+ uint block_size;
+ MARIA_PINNED_PAGE page_link;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("get_head_or_tail_page");
+ DBUG_PRINT("enter", ("page_type: %u length: %u", page_type, length));
+
+ block_size= share->block_size;
+ if (block->org_bitmap_value == 0) /* Empty block */
+ {
+ /* New page */
+ make_empty_page(info, buff, page_type, 1);
+ res->buff= buff;
+ res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE);
+ res->data= (buff + PAGE_HEADER_SIZE);
+ res->dir= res->data + res->length;
+ res->rownr= 0;
+ DBUG_ASSERT(length <= res->length);
+ }
+ else
+ {
+ uchar *dir;
+ /* Read old page */
+ page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+ res->buff= pagecache_read(share->pagecache, &info->dfile,
+ block->page, 0, 0, share->page_type,
+ lock, &page_link.link);
+ /* 'changed' doubles as the read-succeeded flag here */
+ page_link.changed= res->buff != 0;
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+ if (!page_link.changed)
+ goto crashed;
+
+ DBUG_ASSERT((uint) (res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
+ page_type);
+ if (!(dir= find_free_position(page_type == HEAD_PAGE ? info : 0,
+ res->buff, block_size, &res->rownr,
+ &res->length, &res->empty_space)))
+ goto crashed;
+
+ if (res->length < length)
+ {
+ /* Found slot too small; try compacting the page to merge free space */
+ if (res->empty_space + res->length >= length)
+ {
+ _ma_compact_block_page(res->buff, block_size, res->rownr, 1,
+ (page_type == HEAD_PAGE ?
+ info->trn->min_read_from : 0),
+ (page_type == HEAD_PAGE ?
+ share->base.min_block_length :
+ 0));
+ /* All empty space are now after current position */
+ dir= dir_entry_pos(res->buff, block_size, res->rownr);
+ res->length= res->empty_space= uint2korr(dir+2);
+ }
+ if (res->length < length)
+ {
+ DBUG_PRINT("error", ("length: %u res->length: %u empty_space: %u",
+ length, res->length, res->empty_space));
+ goto crashed; /* Wrong bitmap information */
+ }
+ }
+ res->dir= dir;
+ res->data= res->buff + uint2korr(dir);
+ }
+ DBUG_RETURN(0);
+
+crashed:
+ my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
+ DBUG_RETURN(1);
+}
+
+
+/*
+ @brief Create room for a head or tail row on a given page at given position
+
+ @fn get_rowpos_in_head_or_tail_page()
+ @param info Maria handler
+ @param block Block to read
+ @param buff Suggest this buffer to key cache
+ @param length Minimum space needed
+ @param page_type HEAD_PAGE || TAIL_PAGE
+ @param rownr Rownr to use
+ @param res Store result position here
+
+ @note
+ This is essentially the same as get_head_or_tail_page, with the difference
+ that the caller specifies at what position the row should be put.
+ This is used when restoring a row to its original position as
+ part of UNDO DELETE or UNDO UPDATE
+
+ @return
+ @retval 0 ok All slots in 'res' are updated
+ @retval 1 error my_errno is set
+*/
+
+static my_bool get_rowpos_in_head_or_tail_page(MARIA_HA *info,
+ MARIA_BITMAP_BLOCK *block,
+ uchar *buff, uint length,
+ uint page_type,
+ enum pagecache_page_lock lock,
+ uint rownr,
+ struct st_row_pos_info *res)
+{
+ MARIA_PINNED_PAGE page_link;
+ MARIA_SHARE *share= info->s;
+ uchar *dir;
+ uint block_size= share->block_size;
+ uint max_entry, max_length, rec_offset;
+ DBUG_ENTER("get_rowpos_in_head_or_tail_page");
+
+ if (block->org_bitmap_value == 0) /* Empty block */
+ {
+ /* New page */
+ make_empty_page(info, buff, page_type, 0);
+ res->empty_space= block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE;
+ }
+ else
+ {
+ page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+ buff= pagecache_read(share->pagecache, &info->dfile,
+ block->page, 0, 0, share->page_type,
+ lock, &page_link.link);
+ page_link.changed= buff != 0;
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+ if (!page_link.changed) /* Read error */
+ goto err;
+ DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
+ (uchar) page_type);
+ if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != (uchar) page_type)
+ goto err;
+ res->empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+ }
+
+ max_entry= (uint) buff[DIR_COUNT_OFFSET];
+ if (max_entry <= rownr)
+ {
+ /* Directory is too small for the requested rownr; grow it */
+ if (extend_directory(page_type == HEAD_PAGE ? info : 0, buff, block_size,
+ max_entry, rownr, &res->empty_space))
+ goto err;
+ }
+
+ /*
+ The following dir entry is unused in case of insert / update but
+ not in case of undo_update / undo_delete
+ */
+ dir= dir_entry_pos(buff, block_size, rownr);
+
+ if (extend_area_on_page(page_type == HEAD_PAGE ? info : 0, buff, dir,
+ rownr, block_size, length,
+ &res->empty_space, &rec_offset, &max_length))
+ goto err;
+
+ res->buff= buff;
+ res->rownr= rownr;
+ res->dir= dir;
+ res->data= buff + rec_offset;
+ res->length= length;
+ DBUG_RETURN(0);
+
+err:
+ my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Write tail for head data or blob
+
+ SYNOPSIS
+ write_tail()
+ info Maria handler
+ block Block to tail page
+ row_part Data to write to page
+ org_length Length of data
+
+ NOTES
+ block->page_count is updated to the directory offset for the tail
+ so that we can store the position in the row extent information
+
+ RETURN
+ 0 ok
+ block->page_count is set to point (dir entry + TAIL_BIT)
+
+ 1 error; In this case my_errno is set to the error
+*/
+
+static my_bool write_tail(MARIA_HA *info,
+ MARIA_BITMAP_BLOCK *block,
+ uchar *row_part, uint org_length)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_PINNED_PAGE page_link;
+ uint block_size= share->block_size, empty_space, length= org_length;
+ struct st_row_pos_info row_pos;
+ my_off_t position;
+ my_bool res, block_is_read;
+ DBUG_ENTER("write_tail");
+ DBUG_PRINT("enter", ("page: %lu length: %u",
+ (ulong) block->page, length));
+
+ info->keyread_buff_used= 1;
+ /*
+ Don't allocate smaller block than MIN_TAIL_SIZE (we want to give rows
+ some place to grow in the future)
+ */
+ if (length < MIN_TAIL_SIZE)
+ length= MIN_TAIL_SIZE;
+
+ if (block->page_count == TAIL_PAGE_COUNT_MARKER)
+ {
+ /*
+ Create new tail
+ page will be pinned & locked by get_head_or_tail_page
+ */
+ if (get_head_or_tail_page(info, block, info->keyread_buff, length,
+ TAIL_PAGE, PAGECACHE_LOCK_WRITE,
+ &row_pos))
+ DBUG_RETURN(1);
+ }
+ else
+ {
+ /* Write tail on predefined row position */
+ if (get_rowpos_in_head_or_tail_page(info, block, info->keyread_buff,
+ length, TAIL_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ block->page_count & ~TAIL_BIT,
+ &row_pos))
+ DBUG_RETURN(1);
+ }
+ DBUG_PRINT("info", ("tailid: %lu (%lu:%u)",
+ (ulong) ma_recordpos(block->page, row_pos.rownr),
+ (ulong) block->page, row_pos.rownr));
+
+ block_is_read= block->org_bitmap_value != 0;
+
+ /* Copy only the real data; extra MIN_TAIL_SIZE padding stays untouched */
+ memcpy(row_pos.data, row_part, org_length);
+
+ if (share->now_transactional)
+ {
+ /* Log changes in tail block */
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+ LSN lsn;
+
+ /*
+ Log REDO changes of tail page
+ Note that we have to log length, not org_length, to be sure that
+ REDO, which doesn't use write_tail, also creates a block of at least
+ MIN_TAIL_SIZE
+ */
+ page_store(log_data + FILEID_STORE_SIZE, block->page);
+ dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+ row_pos.rownr);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos.data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
+ if (translog_write_record(&lsn,
+ (block_is_read ? LOGREC_REDO_INSERT_ROW_TAIL :
+ LOGREC_REDO_NEW_ROW_TAIL),
+ info->trn, info,
+ (translog_size_t) (sizeof(log_data) + length),
+ TRANSLOG_INTERNAL_PARTS + 2, log_array,
+ log_data, NULL))
+ DBUG_RETURN(1);
+ }
+
+ int2store(row_pos.dir + 2, length);
+ empty_space= row_pos.empty_space - length;
+ int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space);
+ block->page_count= row_pos.rownr + TAIL_BIT;
+ /*
+ If there is less directory entries free than number of possible tails
+ we can write for a row, we mark the page full to ensure that we don't
+ during _ma_bitmap_find_place() allocate more entries on the tail page
+ than it can hold
+ */
+ block->empty_space= (enough_free_entries(row_pos.buff, share->block_size,
+ 1 + share->base.blobs) ?
+ empty_space : 0);
+ /* Keep BLOCKUSED_USE_ORG_BITMAP */
+ block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;
+
+ if (block_is_read)
+ {
+ /* Current page link is last element in pinned_pages */
+ MARIA_PINNED_PAGE *page_link;
+ page_link= dynamic_element(&info->pinned_pages,
+ info->pinned_pages.elements-1,
+ MARIA_PINNED_PAGE*);
+ pagecache_unlock_by_link(share->pagecache, page_link->link,
+ PAGECACHE_LOCK_WRITE_TO_READ,
+ PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
+ LSN_IMPOSSIBLE, 1, FALSE);
+ DBUG_ASSERT(page_link->changed);
+ page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
+ res= 0;
+ }
+ else
+ {
+ if (!(res= pagecache_write(share->pagecache,
+ &info->dfile, block->page, 0,
+ row_pos.buff,share->page_type,
+ PAGECACHE_LOCK_READ,
+ PAGECACHE_PIN,
+ PAGECACHE_WRITE_DELAY, &page_link.link,
+ LSN_IMPOSSIBLE)))
+ {
+ page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
+ page_link.changed= 1;
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+ }
+
+ /* Increase data file size, if extended */
+ position= (my_off_t) block->page * block_size;
+ if (share->state.state.data_file_length <= position)
+ {
+ /*
+ We are modifying a state member before writing the UNDO; this is a WAL
+ violation. But for data_file_length this is ok, as long as we change
+ data_file_length after writing any log record (FILE_ID/REDO/UNDO) (see
+ collect_tables()).
+ */
+ _ma_set_share_data_file_length(share, position + block_size);
+ }
+ }
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Write full pages
+
+ SYNOPSIS
+ write_full_pages()
+ info Maria handler
+ lsn LSN for the undo record
+ block Where to write data
+ data Data to write
+ length Length of data
+
+ NOTES
+ Logging of the changes to the full pages are done in the caller
+ write_block_record().
+
+ RETURN
+ 0 ok
+ 1 error on write
+*/
+
+static my_bool write_full_pages(MARIA_HA *info,
+ LSN lsn,
+ MARIA_BITMAP_BLOCK *block,
+ uchar *data, ulong length)
+{
+ pgcache_page_no_t page;
+ MARIA_SHARE *share= info->s;
+ uint block_size= share->block_size;
+ uint data_size= FULL_PAGE_SIZE(block_size);
+ uchar *buff= info->keyread_buff;
+ uint page_count, sub_blocks;
+ my_off_t position, max_position;
+ DBUG_ENTER("write_full_pages");
+ DBUG_PRINT("enter", ("length: %lu page: %lu page_count: %lu",
+ (ulong) length, (ulong) block->page,
+ (ulong) block->page_count));
+ DBUG_ASSERT((block->page_count & TAIL_BIT) == 0);
+
+ info->keyread_buff_used= 1;
+ page= block->page;
+ page_count= block->page_count;
+ sub_blocks= block->sub_blocks;
+
+ max_position= (my_off_t) (page + page_count) * block_size;
+
+ /* Increase data file size, if extended */
+
+ for (; length; data+= data_size)
+ {
+ uint copy_length;
+ if (!page_count--)
+ {
+ /* Current extent exhausted; advance to the next sub block */
+ if (!--sub_blocks)
+ {
+ DBUG_ASSERT(0); /* Wrong in bitmap or UNDO */
+ my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
+ DBUG_RETURN(1);
+ }
+
+ block++;
+ page= block->page;
+ page_count= block->page_count - 1;
+ DBUG_PRINT("info", ("page: %lu page_count: %lu",
+ (ulong) block->page, (ulong) block->page_count));
+
+ position= (page + page_count + 1) * block_size;
+ set_if_bigger(max_position, position);
+ }
+ lsn_store(buff, lsn);
+ buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE;
+ copy_length= min(data_size, length);
+ memcpy(buff + LSN_SIZE + PAGE_TYPE_SIZE, data, copy_length);
+ length-= copy_length;
+
+ /*
+ Zero out old information from the block. This removes possible
+ sensitive information from the block and also makes the file
+ easier to compress and easier to compare after recovery.
+ */
+ if (copy_length != data_size)
+ bzero(buff + block_size - PAGE_SUFFIX_SIZE - (data_size - copy_length),
+ (data_size - copy_length) + PAGE_SUFFIX_SIZE);
+
+ if (pagecache_write(share->pagecache,
+ &info->dfile, page, 0,
+ buff, share->page_type,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, info->trn->rec_lsn))
+ DBUG_RETURN(1);
+ page++;
+ DBUG_ASSERT(block->used & BLOCKUSED_USED);
+ }
+ if (share->state.state.data_file_length < max_position)
+ _ma_set_share_data_file_length(share, max_position);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Store ranges of full pages in compact format for logging
+
+ SYNOPSIS
+ store_page_range()
+ to Store data here
+ block Where pages are to be written
+ block_size block size
+ length Length of data to be written
+ Normally this is full pages, except for the last
+ tail block that may only partly fit the last page.
+ tot_ranges Add here the number of ranges used
+
+ NOTES
+ The format of one entry is:
+
+ Ranges SUB_RANGE_SIZE
+ Empty bytes at end of last byte BLOCK_FILLER_SIZE
+ For each range
+ Page number PAGE_STORE_SIZE
+ Number of pages PAGERANGE_STORE_SIZE
+
+ RETURN
+ # end position for 'to'
+*/
+
+static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block,
+                               uint block_size, ulong length,
+                               uint *tot_ranges)
+{
+  uint data_size= FULL_PAGE_SIZE(block_size);
+  ulong remaining_pages= (length + data_size - 1) / data_size;
+  uint range_count= 0;
+  uint filler_bytes;
+  uchar *entry_start= to;
+  DBUG_ENTER("store_page_range");
+
+  /* Reserve room for the range count; it is filled in after the loop */
+  to+= SUB_RANGE_SIZE;
+
+  /* Store number of unused bytes at the end of the last page */
+  filler_bytes= (uint) (remaining_pages * data_size - length);
+  int2store(to, filler_bytes);
+  to+= BLOCK_FILLER_SIZE;
+
+  /*
+    Emit one (page number, page count) pair per extent.
+    At least one entry is always written, as in the original do/while.
+  */
+  for (;;)
+  {
+    pgcache_page_no_t first_page= block->page;
+    uint pages_in_range= block->page_count;
+    block++;
+    if (pages_in_range > remaining_pages)
+      pages_in_range= remaining_pages;
+
+    page_store(to, first_page);
+    to+= PAGE_STORE_SIZE;
+    pagerange_store(to, pages_in_range);
+    to+= PAGERANGE_STORE_SIZE;
+    range_count++;
+    if (!(remaining_pages-= pages_in_range))
+      break;
+  }
+  /* Store number of ranges for this block */
+  int2store(entry_start, range_count);
+  (*tot_ranges)+= range_count;
+
+  DBUG_RETURN(to);
+}
+
+
+/*
+ Store packed extent data
+
+ SYNOPSIS
+ store_extent_info()
+ to Store first packed data here
+ row_extents_second_part Store rest here
+ first_block First block to store
+ count Number of blocks
+
+ NOTES
+ We don't have to store the position for the head block
+
+ We have to set the START_EXTENT_BIT for every extent where the
+  blob will be stored on a page of its own. We need this in the
+ UNDO phase to generate MARIA_BITMAP_BLOCK's for undo-delete and
+ undo-update.
+*/
+
+static void store_extent_info(uchar *to,
+                              uchar *row_extents_second_part,
+                              MARIA_BITMAP_BLOCK *first_block,
+                              uint count)
+{
+  MARIA_BITMAP_BLOCK *cur, *last;
+  uint packed_length;
+  my_bool wrote_first= 0;
+  DBUG_ENTER("store_extent_info");
+  DBUG_PRINT("enter", ("count: %u", count));
+
+  last= first_block + count;
+  for (cur= first_block; cur < last; cur++)
+  {
+    uint page_count;
+    /* Marker blocks are the only ones without BLOCKUSED_USED; skip them */
+    if (!(cur->used & BLOCKUSED_USED))
+      continue;
+    page_count= cur->page_count;
+    DBUG_ASSERT(page_count != 0);
+    page_store(to, cur->page);
+    if (cur->sub_blocks)
+    {
+      /*
+        Flag the first block of a blob so that the UNDO phase can
+        later recognize where each blob's extents start
+      */
+      page_count|= START_EXTENT_BIT;
+    }
+    pagerange_store(to + PAGE_STORE_SIZE, page_count);
+    DBUG_DUMP("extent", to, ROW_EXTENT_SIZE);
+    to+= ROW_EXTENT_SIZE;
+    if (!wrote_first)
+    {
+      /* First extent lives in the row header; the rest go elsewhere */
+      wrote_first= 1;
+      to= row_extents_second_part;
+    }
+  }
+  packed_length= (count - 1) * ROW_EXTENT_SIZE;
+  /*
+    In some unlikely cases we have allocated too many blocks. Clear the
+    unused trailing space.
+  */
+  bzero(to, (size_t) (row_extents_second_part + packed_length - to));
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief
+ Convert extent info read from file to MARIA_BITMAP_BLOCKS suitable
+ for write_block_record
+
+ @note
+ In case of blobs, this function marks all the blob pages in the bitmap
+ as full pages. The bitmap bits for other pages will be marked
+ when write_block_record() calls _ma_bitmap_release_unused().
+
+  This function will be removed in Maria 2.0 when, instead of deleting rows,
+  we mark them as deleted and only remove them after commit.
+
+ @return
+ @retval 0 ok
+ @retval 1 Error (out of memory or disk error changing bitmap) or
+ wrong information in extent information
+*/
+
+static my_bool extent_to_bitmap_blocks(MARIA_HA *info,
+                                       MARIA_BITMAP_BLOCKS *blocks,
+                                       pgcache_page_no_t head_page,
+                                       uint extent_count,
+                                       const uchar *extent_info)
+{
+  MARIA_BITMAP_BLOCK *block, *start_block;
+  MARIA_SHARE *share= info->s;
+  uint i, tail_page;
+  DBUG_ENTER("extent_to_bitmap_blocks");
+
+  /* extent_count + 2: one entry for the head page plus one spare entry */
+  if (allocate_dynamic(&info->bitmap_blocks, extent_count + 2))
+    DBUG_RETURN(1);
+  block= blocks->block= dynamic_element(&info->bitmap_blocks, 0,
+                                        MARIA_BITMAP_BLOCK*);
+  blocks->count= extent_count + 1;
+  blocks->tail_page_skipped= blocks->page_skipped= 0;
+  /* The first entry always describes the head page itself */
+  block->page= head_page;
+  block->page_count= 1;
+  block->used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
+  /* Impossible value, will force storage of real value */
+  block->org_bitmap_value= 255;
+
+  start_block= block++;
+  /* Note: i++ < extent_count makes i 1-based inside the loop body */
+  for (i=0 ;
+       i++ < extent_count ;
+       block++, extent_info+= ROW_EXTENT_SIZE)
+  {
+    uint page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE);
+    if (page_count & START_EXTENT_BIT)
+    {
+      /*
+        First extent of a blob: close the previous group by recording
+        how many blocks it spanned, then start a new group here.
+      */
+      page_count&= ~START_EXTENT_BIT;
+      start_block->sub_blocks= (uint) (block - start_block);
+      start_block= block;
+    }
+    block->page= page_korr(extent_info);
+    block->page_count= page_count;
+    block->sub_blocks= 0;
+    if (block->page_count == 0)
+    {
+      /* Extend allocated but not used by write_block_record() */
+      DBUG_ASSERT(block->page == 0);
+      /* This is the last block */
+      blocks->count= i;
+      break;
+    }
+    /* A tail extent always covers exactly one page */
+    if ((tail_page= page_count & TAIL_BIT))
+      page_count= 1;
+
+    /* Check if wrong data */
+    if (block->page == 0 || page_count == 0 ||
+        (block->page + page_count) * share->block_size >
+        share->state.state.data_file_length)
+    {
+      DBUG_PRINT("error", ("page: %lu page_count: %u tail: %u length: %ld data_length: %ld",
+                           (ulong) block->page,
+                           (block->page_count & ~TAIL_BIT),
+                           (uint) test(block->page_count & TAIL_BIT),
+                           (ulong) ((block->page + (page_count & ~TAIL_BIT)) *
+                                    share->block_size),
+                           (ulong) share->state.state.data_file_length));
+      DBUG_RETURN(1);
+    }
+    if (tail_page)
+    {
+      /*
+        Tail page: remember its current bitmap value; its bits are
+        adjusted later (see function header: via _ma_bitmap_release_unused()
+        from write_block_record()).
+      */
+      block->org_bitmap_value= _ma_bitmap_get_page_bits(info, &share->bitmap,
+                                                        block->page);
+      block->used= (BLOCKUSED_TAIL | BLOCKUSED_USED |
+                    BLOCKUSED_USE_ORG_BITMAP);
+    }
+    else
+    {
+      /* Full pages: mark them as full in the bitmap right away */
+      my_bool res;
+      pthread_mutex_lock(&share->bitmap.bitmap_lock);
+      res= _ma_bitmap_set_full_page_bits(info, &share->bitmap,
+                                         block->page, page_count);
+      pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+      if (res)
+        DBUG_RETURN(1);
+      block->used= BLOCKUSED_USED;
+    }
+  }
+  /* Close the last (possibly only) group of blocks */
+  start_block->sub_blocks= (uint) (block - start_block);
+  DBUG_RETURN(0);
+}
+
+
+/*
+ Free regions of pages with logging
+
+ NOTES
+ We are removing filler events and tail page events from
+ row->extents to get smaller log.
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
+{
+  uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE];
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+  LSN lsn;
+  size_t extents_length;
+  uchar *extents= row->extents;
+  DBUG_ENTER("free_full_pages");
+
+  if (info->s->now_transactional)
+  {
+    /* Compact events by removing filler and tail events */
+    uchar *new_block= 0;                /* Start of current run of kept extents */
+    uchar *end, *to, *compact_extent_info;
+    my_bool res;
+    uint extents_count;
+
+    if (!(compact_extent_info= my_alloca(row->extents_count *
+                                         ROW_EXTENT_SIZE)))
+      DBUG_RETURN(1);
+
+    to= compact_extent_info;
+    for (end= extents + row->extents_count * ROW_EXTENT_SIZE ;
+         extents < end ;
+         extents+= ROW_EXTENT_SIZE)
+    {
+      uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
+      page_count&= ~START_EXTENT_BIT;
+      /* Keep only full-page extents (no TAIL_BIT, non-zero count) */
+      if (! (page_count & TAIL_BIT) && page_count != 0)
+      {
+        /* Found correct extent */
+        if (!new_block)
+          new_block= extents; /* First extent in range */
+        continue;
+      }
+      /* Found extent to remove, copy everything found so far */
+      if (new_block)
+      {
+        size_t length= (size_t) (extents - new_block);
+        memcpy(to, new_block, length);
+        to+= length;
+        new_block= 0;
+      }
+    }
+    /* Copy any trailing run of kept extents */
+    if (new_block)
+    {
+      size_t length= (size_t) (extents - new_block);
+      memcpy(to, new_block, length);
+      to+= length;
+    }
+
+    if (!unlikely(extents_length= (uint) (to - compact_extent_info)))
+    {
+      /*
+        No ranges. This happens in the rare case when we have allocated
+        a place for a blob on a tail page but it did fit into the main page.
+      */
+      my_afree(compact_extent_info);
+      DBUG_RETURN(0);
+    }
+    /* Log the compacted extent list as a REDO_FREE_BLOCKS record */
+    extents_count= (uint) (extents_length / ROW_EXTENT_SIZE);
+    pagerange_store(log_data + FILEID_STORE_SIZE, extents_count);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= compact_extent_info;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length;
+    res= translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, info->trn,
+                               info,
+                               (translog_size_t) (sizeof(log_data) +
+                                                  extents_length),
+                               TRANSLOG_INTERNAL_PARTS + 2, log_array,
+                               log_data, NULL);
+    my_afree(compact_extent_info);
+    if (res)
+      DBUG_RETURN(1);
+  }
+
+  /* Free the pages in the bitmap, using the original (uncompacted) list */
+  DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents,
+                                         row->extents_count));
+}
+
+
+/*
+ Free one page range
+
+ NOTES
+ This is very similar to free_full_pages()
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool free_full_page_range(MARIA_HA *info, pgcache_page_no_t page,
+                                    uint count)
+{
+  my_bool res= 0;
+  uint delete_count;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("free_full_page_range");
+
+  delete_count= count;
+  if (share->state.state.data_file_length ==
+      (page + count) * share->block_size)
+  {
+    /*
+      Don't delete last page from pagecache as this will make the file
+      shorter than expected if the last operation extended the file
+    */
+    delete_count--;
+  }
+  /* Drop the pages from the page cache; errors are noted but we continue */
+  if (delete_count &&
+      pagecache_delete_pages(share->pagecache, &info->dfile,
+                             page, delete_count, PAGECACHE_LOCK_WRITE, 0))
+    res= 1;
+
+  if (share->now_transactional)
+  {
+    LSN lsn;
+    /** @todo unify log_data's shape with delete_head_or_tail() */
+    uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+                   ROW_EXTENT_SIZE];
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    DBUG_ASSERT(info->trn->rec_lsn);
+    /* Single extent: store range count (1), then the page and page count */
+    pagerange_store(log_data + FILEID_STORE_SIZE, 1);
+    page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
+               page);
+    int2store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+              PAGE_STORE_SIZE, count);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+    if (translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS,
+                              info->trn, info,
+                              (translog_size_t) sizeof(log_data),
+                              TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                              log_data, NULL))
+      res= 1;
+  }
+  /* Finally clear the full-page bits in the bitmap, under its mutex */
+  pthread_mutex_lock(&share->bitmap.bitmap_lock);
+  if (_ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, count))
+    res= 1;
+  pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+  DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Write a record to a (set of) pages
+
+ @fn write_block_record()
+ @param info Maria handler
+ @param old_record Original record in case of update; NULL in case of
+ insert
+ @param record Record we should write
+ @param row Statistics about record (calculated by
+ calc_record_size())
+ @param map_blocks On which pages the record should be stored
+ @param row_pos Position on head page where to put head part of
+ record
+ @param undo_lsn <> LSN_ERROR if we are executing an UNDO
+ @param old_record_checksum Checksum of old_record: ignored if table does
+ not have live checksum; otherwise if
+ old_record==NULL it must be 0.
+
+ @note
+ On return all pinned pages are released.
+
+ [page_buff + EMPTY_SPACE_OFFSET] is set to
+ row_pos->empty_space - head_length
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool write_block_record(MARIA_HA *info,
+ const uchar *old_record,
+ const uchar *record,
+ MARIA_ROW *row,
+ MARIA_BITMAP_BLOCKS *bitmap_blocks,
+ my_bool head_block_is_read,
+ struct st_row_pos_info *row_pos,
+ LSN undo_lsn,
+ ha_checksum old_record_checksum)
+{
+ uchar *data, *end_of_data, *tmp_data_used, *tmp_data;
+ uchar *row_extents_first_part, *row_extents_second_part;
+ uchar *field_length_data;
+ uchar *page_buff;
+ MARIA_BITMAP_BLOCK *block, *head_block;
+ MARIA_SHARE *share= info->s;
+ MARIA_COLUMNDEF *column, *end_column;
+ MARIA_PINNED_PAGE page_link;
+ uint block_size, flag, head_length;
+ ulong *blob_lengths;
+ my_bool row_extents_in_use, blob_full_pages_exists;
+ LSN lsn;
+ my_off_t position;
+ uint save_my_errno;
+ DBUG_ENTER("write_block_record");
+
+ LINT_INIT(row_extents_first_part);
+ LINT_INIT(row_extents_second_part);
+
+ head_block= bitmap_blocks->block;
+ block_size= share->block_size;
+
+ page_buff= row_pos->buff;
+ /* Position on head page where we should store the head part */
+ data= row_pos->data;
+ end_of_data= data + row_pos->length;
+
+ /* Write header */
+ flag= info->row_flag;
+ row_extents_in_use= 0;
+ if (unlikely(row->total_length > row_pos->length))
+ {
+ /* Need extent */
+ DBUG_ASSERT(bitmap_blocks->count > 1);
+ if (bitmap_blocks->count <= 1)
+ goto crashed; /* Wrong in bitmap */
+ flag|= ROW_FLAG_EXTENTS;
+ row_extents_in_use= 1;
+ }
+ /* For now we have only a minimum header */
+ *data++= (uchar) flag;
+ if (flag & ROW_FLAG_TRANSID)
+ {
+ transid_store(data, info->trn->trid);
+ data+= TRANSID_SIZE;
+ }
+
+ if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED))
+ *data++= (uchar) (share->base.null_bytes -
+ share->base.original_null_bytes);
+ if (row_extents_in_use)
+ {
+ /* Store first extent in header */
+ store_key_length_inc(data, bitmap_blocks->count - 1);
+ row_extents_first_part= data;
+ data+= ROW_EXTENT_SIZE;
+ }
+ if (share->base.max_field_lengths)
+ store_key_length_inc(data, row->field_lengths_length);
+ if (share->calc_checksum)
+ {
+ *(data++)= (uchar) (row->checksum); /* store least significant byte */
+ DBUG_ASSERT(!((old_record_checksum != 0) && (old_record == NULL)));
+ }
+ memcpy(data, record, share->base.null_bytes);
+ data+= share->base.null_bytes;
+ memcpy(data, row->empty_bits, share->base.pack_bytes);
+ data+= share->base.pack_bytes;
+
+ DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
+ (uint) (data - row_pos->data) == row->min_length);
+
+ /*
+ Allocate a buffer of rest of data (except blobs)
+
+ To avoid double copying of data, we copy as many columns that fits into
+ the page. The rest goes into info->packed_row.
+
+ Using an extra buffer, instead of doing continuous writes to different
+ pages, uses less code and we don't need to have to do a complex call
+ for every data segment we want to store.
+ */
+ if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ row->head_length))
+ DBUG_RETURN(1);
+
+ tmp_data_used= 0; /* Either 0 or last used uchar in 'data' */
+ tmp_data= data;
+
+ if (row_extents_in_use)
+ {
+ uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE;
+ if (!tmp_data_used && tmp_data + copy_length > end_of_data)
+ {
+ tmp_data_used= tmp_data;
+ tmp_data= info->rec_buff;
+ }
+ row_extents_second_part= tmp_data;
+ /*
+ We will copy the extents here when we have figured out the tail
+ positions.
+ */
+ tmp_data+= copy_length;
+ }
+
+ /* Copy fields that has fixed lengths (primary key etc) */
+ for (column= share->columndef,
+ end_column= column + share->base.fixed_not_null_fields;
+ column < end_column; column++)
+ {
+ if (!tmp_data_used && tmp_data + column->length > end_of_data)
+ {
+ tmp_data_used= tmp_data;
+ tmp_data= info->rec_buff;
+ }
+ memcpy(tmp_data, record + column->offset, column->length);
+ tmp_data+= column->length;
+ }
+
+ /* Copy length of data for variable length fields */
+ if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data)
+ {
+ tmp_data_used= tmp_data;
+ tmp_data= info->rec_buff;
+ }
+ field_length_data= row->field_lengths;
+ memcpy(tmp_data, field_length_data, row->field_lengths_length);
+ tmp_data+= row->field_lengths_length;
+
+ DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
+ (uint) (tmp_data - row_pos->data) == row->min_length +
+ share->base.fixed_not_null_fields_length +
+ row->field_lengths_length);
+
+ /* Copy variable length fields and fields with null/zero */
+ for (end_column= share->columndef + share->base.fields - share->base.blobs;
+ column < end_column ;
+ column++)
+ {
+ const uchar *field_pos;
+ ulong length;
+ if ((record[column->null_pos] & column->null_bit) ||
+ (row->empty_bits[column->empty_pos] & column->empty_bit))
+ continue;
+
+ field_pos= record + column->offset;
+ switch (column->type) {
+ case FIELD_NORMAL: /* Fixed length field */
+ case FIELD_SKIP_PRESPACE:
+ case FIELD_SKIP_ZERO: /* Fixed length field */
+ length= column->length;
+ break;
+ case FIELD_SKIP_ENDSPACE: /* CHAR */
+ /* Char that is space filled */
+ if (column->length <= 255)
+ length= (uint) (uchar) *field_length_data++;
+ else
+ {
+ length= uint2korr(field_length_data);
+ field_length_data+= 2;
+ }
+ break;
+ case FIELD_VARCHAR:
+ if (column->length <= 256)
+ {
+ length= (uint) (uchar) *field_length_data++;
+ field_pos++; /* Skip length uchar */
+ }
+ else
+ {
+ length= uint2korr(field_length_data);
+ field_length_data+= 2;
+ field_pos+= 2;
+ }
+ DBUG_ASSERT(length <= column->length);
+ break;
+ default: /* Wrong data */
+ DBUG_ASSERT(0);
+ length=0;
+ break;
+ }
+ if (!tmp_data_used && tmp_data + length > end_of_data)
+ {
+ /* Data didn't fit in page; Change to use tmp buffer */
+ tmp_data_used= tmp_data;
+ tmp_data= info->rec_buff;
+ }
+ memcpy((char*) tmp_data, field_pos, length);
+ tmp_data+= length;
+ }
+
+ block= head_block + head_block->sub_blocks; /* Point to first blob data */
+
+ end_column= column + share->base.blobs;
+ blob_lengths= row->blob_lengths;
+ if (!tmp_data_used)
+ {
+ /* Still room on page; Copy as many blobs we can into this page */
+ data= tmp_data;
+ for (; column < end_column &&
+ *blob_lengths <= (ulong)(end_of_data - data);
+ column++, blob_lengths++)
+ {
+ uchar *tmp_pos;
+ uint length;
+ if (!*blob_lengths) /* Null or "" */
+ continue;
+ length= column->length - portable_sizeof_char_ptr;
+ memcpy_fixed((uchar*) &tmp_pos, record + column->offset + length,
+ sizeof(char*));
+ memcpy(data, tmp_pos, *blob_lengths);
+ data+= *blob_lengths;
+ /*
+ The following is not true when we want to insert data into original
+ place. In this case we don't have any extra blocks allocated
+ */
+ if (likely(undo_lsn == LSN_ERROR))
+ {
+ /* Skip over tail page that was prepared for storing blob */
+ block++;
+ bitmap_blocks->tail_page_skipped= 1;
+ }
+ }
+ if (head_block->sub_blocks > 1)
+ {
+ /* We have allocated pages that where not used */
+ bitmap_blocks->page_skipped= 1;
+ }
+ }
+ else
+ data= tmp_data_used; /* Get last used on page */
+
+ /* Update page directory */
+ head_length= (uint) (data - row_pos->data);
+ DBUG_PRINT("info", ("Used head length on page: %u header_length: %u",
+ head_length,
+ (uint) (flag & ROW_FLAG_TRANSID ? TRANSID_SIZE : 0)));
+ DBUG_ASSERT(data <= end_of_data);
+ if (head_length < share->base.min_block_length)
+ {
+ /* Extend row to be of size min_block_length */
+ uint diff_length= share->base.min_block_length - head_length;
+ bzero(data, diff_length);
+ data+= diff_length;
+ head_length= share->base.min_block_length;
+ }
+ /*
+ If this is a redo entry (ie, undo_lsn != LSN_ERROR) then we should have
+ written exactly head_length bytes (same as original record).
+ */
+ DBUG_ASSERT(undo_lsn == LSN_ERROR || head_length == row_pos->length);
+ int2store(row_pos->dir + 2, head_length);
+ /* update empty space at start of block */
+ row_pos->empty_space-= head_length;
+ int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space);
+ /* Mark in bitmaps how the current page was actually used */
+ head_block->empty_space= row_pos->empty_space;
+ if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE &&
+ page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST)
+ head_block->empty_space= 0; /* Page is full */
+ head_block->used|= BLOCKUSED_USED;
+
+ check_directory(page_buff, share->block_size, share->base.min_block_length,
+ (uint) -1);
+
+ /*
+ Now we have to write tail pages, as we need to store the position
+ to them in the row extent header.
+
+ We first write out all blob tails, to be able to store them in
+ the current page or 'tmp_data'.
+
+ Then we write the tail of the non-blob fields (The position to the
+ tail page is stored either in row header, the extents in the head
+ page or in the first full page of the non-blob data. It's never in
+ the tail page of the non-blob data)
+ */
+
+ blob_full_pages_exists= 0;
+ if (row_extents_in_use)
+ {
+ if (column != end_column) /* If blob fields */
+ {
+ MARIA_COLUMNDEF *save_column= column;
+ MARIA_BITMAP_BLOCK *save_block= block;
+ MARIA_BITMAP_BLOCK *end_block;
+ ulong *save_blob_lengths= blob_lengths;
+
+ for (; column < end_column; column++, blob_lengths++)
+ {
+ uchar *blob_pos;
+ if (!*blob_lengths) /* Null or "" */
+ continue;
+ if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
+ {
+ uint length;
+ length= column->length - portable_sizeof_char_ptr;
+ memcpy_fixed((uchar *) &blob_pos, record + column->offset + length,
+ sizeof(char*));
+ length= *blob_lengths % FULL_PAGE_SIZE(block_size); /* tail size */
+ if (length != *blob_lengths)
+ blob_full_pages_exists= 1;
+ if (write_tail(info, block + block->sub_blocks-1,
+ blob_pos + *blob_lengths - length,
+ length))
+ goto disk_err;
+ }
+ else
+ blob_full_pages_exists= 1;
+
+ for (end_block= block + block->sub_blocks; block < end_block; block++)
+ {
+ /*
+ Set only a bit, to not cause bitmap code to believe a block is full
+ when there is still a lot of entries in it.
+ */
+ block->used|= BLOCKUSED_USED;
+ }
+ }
+ DBUG_ASSERT((undo_lsn == LSN_ERROR ||
+ block == bitmap_blocks->block + bitmap_blocks->count));
+ column= save_column;
+ block= save_block;
+ blob_lengths= save_blob_lengths;
+ }
+
+ if (tmp_data_used) /* non blob data overflows */
+ {
+ MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block;
+ MARIA_BITMAP_BLOCK *head_tail_block= 0;
+ ulong length;
+ ulong data_length= (ulong) (tmp_data - info->rec_buff);
+
+#ifdef SANITY_CHECKS
+ DBUG_ASSERT(head_block->sub_blocks != 1);
+ if (head_block->sub_blocks == 1)
+ goto crashed; /* no reserved full or tails */
+#endif
+ /*
+ Find out where to write tail for non-blob fields.
+
+ Problem here is that the bitmap code may have allocated more
+ space than we need. We have to handle the following cases:
+
+ - Bitmap code allocated a tail page we don't need.
+ - The last full page allocated needs to be changed to a tail page
+ (Because we where able to put more data on the head page than
+ the bitmap allocation assumed)
+
+ The reserved pages in bitmap_blocks for the main page has one of
+ the following allocations:
+ - Full pages, with following blocks:
+ # * full pages
+ empty page ; To be used if we change last full to tail page. This
+ has 'count' = 0.
+ tail page (optional, if last full page was part full)
+ - One tail page
+ */
+
+ cur_block= head_block + 1;
+ end_block= head_block + head_block->sub_blocks;
+ /*
+ Loop until we have find a block bigger than we need or
+ we find the empty page block.
+ */
+ while (data_length >= (length= (cur_block->page_count *
+ FULL_PAGE_SIZE(block_size))) &&
+ cur_block->page_count)
+ {
+#ifdef SANITY_CHECKS
+ DBUG_ASSERT(!((cur_block == end_block) ||
+ (cur_block->used & BLOCKUSED_USED)));
+ if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED))
+ goto crashed;
+#endif
+ data_length-= length;
+ (cur_block++)->used|= BLOCKUSED_USED;
+ }
+ last_head_block= cur_block;
+ if (data_length)
+ {
+ if (cur_block->page_count == 0)
+ {
+ /* Skip empty filler block */
+ cur_block++;
+ }
+#ifdef SANITY_CHECKS
+ DBUG_ASSERT(!(cur_block >= end_block));
+ if ((cur_block >= end_block))
+ goto crashed;
+#endif
+ if (cur_block->used & BLOCKUSED_TAIL)
+ {
+ DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size));
+ /* tail written to tail page */
+ cur_block->used|= BLOCKUSED_USED;
+ head_tail_block= cur_block;
+ }
+ else if (data_length > length - MAX_TAIL_SIZE(block_size))
+ {
+ /* tail written to full page */
+ cur_block->used|= BLOCKUSED_USED;
+ if ((cur_block != end_block - 1) &&
+ (end_block[-1].used & BLOCKUSED_TAIL))
+ bitmap_blocks->tail_page_skipped= 1;
+ }
+ else
+ {
+ /*
+ cur_block is a full block, followed by an empty and optional
+ tail block. Change cur_block to a tail block or split it
+ into full blocks and tail blocks.
+
+ TODO:
+ If there is enough space on the following tail block, use
+ this instead of creating a new tail block.
+ */
+ DBUG_ASSERT(cur_block[1].page_count == 0);
+ if (cur_block->page_count == 1)
+ {
+ /* convert full block to tail block */
+ cur_block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;
+ head_tail_block= cur_block;
+ }
+ else
+ {
+ DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(block_size));
+ DBUG_PRINT("info", ("Splitting blocks into full and tail"));
+ cur_block[1].page= (cur_block->page + cur_block->page_count - 1);
+ cur_block[1].page_count= 1; /* Avoid DBUG_ASSERT */
+ cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL;
+ cur_block->page_count--;
+ cur_block->used|= BLOCKUSED_USED;
+ last_head_block= head_tail_block= cur_block+1;
+ }
+ if (end_block[-1].used & BLOCKUSED_TAIL)
+ bitmap_blocks->tail_page_skipped= 1;
+ }
+ }
+ else
+ {
+ /* Must be an empty or tail page */
+ DBUG_ASSERT(cur_block->page_count == 0 ||
+ cur_block->used & BLOCKUSED_TAIL);
+ if (end_block[-1].used & BLOCKUSED_TAIL)
+ bitmap_blocks->tail_page_skipped= 1;
+ }
+
+ /*
+ Write all extents into page or tmp_data
+
+ Note that we still don't have a correct position for the tail
+ of the non-blob fields.
+ */
+ store_extent_info(row_extents_first_part,
+ row_extents_second_part,
+ head_block+1, bitmap_blocks->count - 1);
+ if (head_tail_block)
+ {
+ ulong block_length= (ulong) (tmp_data - info->rec_buff);
+ uchar *extent_data;
+
+ length= (uint) (block_length % FULL_PAGE_SIZE(block_size));
+ if (write_tail(info, head_tail_block,
+ info->rec_buff + block_length - length,
+ length))
+ goto disk_err;
+ tmp_data-= length; /* Remove the tail */
+ if (tmp_data == info->rec_buff)
+ {
+ /* We have no full blocks to write for the head part */
+ tmp_data_used= 0;
+ }
+
+ /* Store the tail position for the non-blob fields */
+ if (head_tail_block == head_block + 1)
+ {
+ /*
+ We had a head block + tail block, which means that the
+ tail block is the first extent
+ */
+ extent_data= row_extents_first_part;
+ }
+ else
+ {
+ /*
+ We have a head block + some full blocks + tail block
+ last_head_block is pointing after the last used extent
+ for the head block.
+ */
+ extent_data= row_extents_second_part +
+ ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE;
+ }
+ DBUG_ASSERT(uint2korr(extent_data+5) & TAIL_BIT);
+ page_store(extent_data, head_tail_block->page);
+ int2store(extent_data + PAGE_STORE_SIZE, head_tail_block->page_count);
+ }
+ }
+ else
+ store_extent_info(row_extents_first_part,
+ row_extents_second_part,
+ head_block+1, bitmap_blocks->count - 1);
+ }
+
+ if (share->now_transactional)
+ {
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+
+ /* Log REDO changes of head page */
+ page_store(log_data + FILEID_STORE_SIZE, head_block->page);
+ dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+ row_pos->rownr);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos->data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= head_length;
+ if (translog_write_record(&lsn,
+ head_block_is_read ?
+ LOGREC_REDO_INSERT_ROW_HEAD :
+ LOGREC_REDO_NEW_ROW_HEAD,
+ info->trn,
+ info,
+ (translog_size_t) (sizeof(log_data) +
+ head_length),
+ TRANSLOG_INTERNAL_PARTS + 2, log_array,
+ log_data, NULL))
+ goto disk_err;
+ }
+
+#ifdef RECOVERY_EXTRA_DEBUG
+ if (info->trn->undo_lsn != LSN_IMPOSSIBLE)
+ {
+ /* Stop right after the REDO; testing incomplete log record groups */
+ DBUG_EXECUTE_IF("maria_flush_whole_log",
+ {
+ DBUG_PRINT("maria_flush_whole_log", ("now"));
+ translog_flush(translog_get_horizon());
+ });
+ DBUG_EXECUTE_IF("maria_crash",
+ { DBUG_PRINT("maria_crash", ("now")); DBUG_ABORT(); });
+ }
+#endif
+
+ if (head_block_is_read)
+ {
+ MARIA_PINNED_PAGE *page_link;
+ /* Head page is always the first pinned page */
+ page_link= dynamic_element(&info->pinned_pages, 0,
+ MARIA_PINNED_PAGE*);
+ pagecache_unlock_by_link(share->pagecache, page_link->link,
+ PAGECACHE_LOCK_WRITE_TO_READ,
+ PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
+ LSN_IMPOSSIBLE, 1, FALSE);
+ page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
+ page_link->changed= 1;
+ }
+ else
+ {
+ if (pagecache_write(share->pagecache,
+ &info->dfile, head_block->page, 0,
+ page_buff, share->page_type,
+ head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ :
+ PAGECACHE_LOCK_READ,
+ head_block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
+ PAGECACHE_PIN,
+ PAGECACHE_WRITE_DELAY, &page_link.link,
+ LSN_IMPOSSIBLE))
+ goto disk_err;
+ page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
+ page_link.changed= 1;
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+ /* Increase data file size, if extended */
+ position= (my_off_t) head_block->page * block_size;
+ if (share->state.state.data_file_length <= position)
+ _ma_set_share_data_file_length(share, position + block_size);
+ }
+
+ if (share->now_transactional && (tmp_data_used || blob_full_pages_exists))
+ {
+ /*
+ Log REDO writes for all full pages (head part and all blobs)
+ We write all here to be able to generate the UNDO record early
+ so that we can write the LSN for the UNDO record to all full pages.
+ */
+ uchar tmp_log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+ (ROW_EXTENT_SIZE + BLOCK_FILLER_SIZE + SUB_RANGE_SIZE) *
+ ROW_EXTENTS_ON_STACK];
+ uchar *log_data, *log_pos;
+ LEX_CUSTRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 +
+ ROW_EXTENTS_ON_STACK];
+ LEX_CUSTRING *log_array_pos, *log_array;
+ int error;
+ translog_size_t log_entry_length= 0;
+ uint ext_length, extents= 0, sub_extents= 0;
+
+ /* If few extents, then allocate things on stack to avoid a malloc call */
+ if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK)
+ {
+ log_array= tmp_log_array;
+ log_data= tmp_log_data;
+ }
+ else
+ {
+ if (!my_multi_malloc(MY_WME, &log_array,
+ (uint) ((bitmap_blocks->count +
+ TRANSLOG_INTERNAL_PARTS + 2) *
+ sizeof(*log_array)),
+ &log_data, FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+ bitmap_blocks->count * (ROW_EXTENT_SIZE +
+ BLOCK_FILLER_SIZE +
+ SUB_RANGE_SIZE),
+ NullS))
+ goto disk_err;
+ }
+ log_pos= log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE * 2;
+ log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1;
+
+ if (tmp_data_used)
+ {
+ /* Full head page */
+ translog_size_t block_length= (translog_size_t) (tmp_data -
+ info->rec_buff);
+ log_pos= store_page_range(log_pos, head_block+1, block_size,
+ (ulong) block_length, &extents);
+ log_array_pos->str= info->rec_buff;
+ log_array_pos->length= block_length;
+ log_entry_length+= block_length;
+ log_array_pos++;
+ sub_extents++;
+ }
+ if (blob_full_pages_exists)
+ {
+ MARIA_COLUMNDEF *tmp_column= column;
+ ulong *tmp_blob_lengths= blob_lengths;
+ MARIA_BITMAP_BLOCK *tmp_block= block;
+
+ /* Full blob pages */
+ for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++)
+ {
+ ulong blob_length;
+ uint length;
+
+ if (!*tmp_blob_lengths) /* Null or "" */
+ continue;
+ blob_length= *tmp_blob_lengths;
+ length= tmp_column->length - portable_sizeof_char_ptr;
+ /*
+ If last part of blog was on tail page, change blob_length to
+ reflect this
+ */
+ if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL)
+ blob_length-= (blob_length % FULL_PAGE_SIZE(block_size));
+ if (blob_length)
+ {
+ memcpy_fixed((uchar*) &log_array_pos->str,
+ record + tmp_column->offset + length,
+ sizeof(uchar*));
+ log_array_pos->length= blob_length;
+ log_entry_length+= blob_length;
+ log_array_pos++;
+ sub_extents++;
+
+ log_pos= store_page_range(log_pos, tmp_block, block_size,
+ blob_length, &extents);
+ }
+ tmp_block+= tmp_block->sub_blocks;
+ }
+ }
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ ext_length= (uint) (log_pos - log_data);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= ext_length;
+ pagerange_store(log_data+ FILEID_STORE_SIZE, extents);
+ pagerange_store(log_data+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
+ sub_extents);
+
+ log_entry_length+= ext_length;
+ /* trn->rec_lsn is already set earlier in this function */
+ error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS,
+ info->trn, info, log_entry_length,
+ (uint) (log_array_pos - log_array),
+ log_array, log_data, NULL);
+ if (log_array != tmp_log_array)
+ my_free(log_array, MYF(0));
+ if (error)
+ goto disk_err;
+ }
+
+ /* Write UNDO or CLR record */
+ lsn= LSN_IMPOSSIBLE;
+ if (share->now_transactional)
+ {
+ LEX_CUSTRING *log_array= info->log_row_parts;
+
+ if (undo_lsn != LSN_ERROR)
+ {
+ /*
+ Store if this CLR is about UNDO_DELETE or UNDO_UPDATE;
+ in the first case, Recovery, when it sees the CLR_END in the
+ REDO phase, may decrement the records' count.
+ */
+ if (_ma_write_clr(info, undo_lsn,
+ old_record ? LOGREC_UNDO_ROW_UPDATE :
+ LOGREC_UNDO_ROW_DELETE,
+ share->calc_checksum != 0,
+ row->checksum - old_record_checksum,
+ &lsn, (void*) 0))
+ goto disk_err;
+ }
+ else
+ {
+ uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
+ PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 +
+ HA_CHECKSUM_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
+ ROW_EXTENT_SIZE];
+ uchar *log_pos;
+ ha_checksum checksum_delta;
+
+ /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_UPDATE share same header */
+ lsn_store(log_data, info->trn->undo_lsn);
+ page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
+ head_block->page);
+ dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+ PAGE_STORE_SIZE,
+ row_pos->rownr);
+ log_pos= (log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+ PAGE_STORE_SIZE + DIRPOS_STORE_SIZE);
+ store_checksum_in_rec(share, checksum_delta,
+ row->checksum - old_record_checksum,
+ log_pos, log_pos);
+ compile_time_assert(sizeof(ha_checksum) == HA_CHECKSUM_STORE_SIZE);
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+ log_data);
+
+ if (!old_record)
+ {
+ /* Store undo_lsn in case we are aborting the insert */
+ row->orig_undo_lsn= info->trn->undo_lsn;
+ /* Write UNDO log record for the INSERT */
+ if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT,
+ info->trn, info,
+ (translog_size_t)
+ log_array[TRANSLOG_INTERNAL_PARTS +
+ 0].length,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ log_array,
+ log_data + LSN_STORE_SIZE, &checksum_delta))
+ goto disk_err;
+ }
+ else
+ {
+ /* Write UNDO log record for the UPDATE */
+ size_t row_length, extents_length;
+ uint row_parts_count, cur_head_length;
+
+ /*
+ Write head length and extents of the original row so that we
+ during UNDO can put it back in the original position.
+ We don't store size for TRANSID, as we don't write this during
+ UNDO.
+ */
+ cur_head_length= (info->cur_row.head_length -
+ info->cur_row.header_length);
+ int2store(log_pos, cur_head_length);
+ pagerange_store(log_pos + 2, info->cur_row.extents_count);
+ log_pos+= 2 + PAGERANGE_STORE_SIZE;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length+= (2 +
+ PAGERANGE_STORE_SIZE);
+ info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
+ info->cur_row.extents;
+ info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
+ extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;
+
+ row_length= fill_update_undo_parts(info, old_record, record,
+ log_array +
+ TRANSLOG_INTERNAL_PARTS + 2,
+ &row_parts_count);
+ if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn,
+ info,
+ (translog_size_t)
+ (log_array[TRANSLOG_INTERNAL_PARTS +
+ 0].length + extents_length +
+ row_length),
+ TRANSLOG_INTERNAL_PARTS + 2 +
+ row_parts_count,
+ log_array,
+ log_data + LSN_STORE_SIZE,
+ &checksum_delta))
+ goto disk_err;
+ }
+ }
+ }
+ /* Release not used space in used pages */
+ if (_ma_bitmap_release_unused(info, bitmap_blocks))
+ goto disk_err;
+ _ma_unpin_all_pages(info, lsn);
+
+ if (tmp_data_used)
+ {
+ /*
+ Write data stored in info->rec_buff to pages
+ This is the char/varchar data that didn't fit into the head page.
+ */
+ DBUG_ASSERT(bitmap_blocks->count != 0);
+ if (write_full_pages(info, lsn, head_block + 1,
+ info->rec_buff, (ulong) (tmp_data - info->rec_buff)))
+ goto disk_err;
+ }
+
+ /* Write rest of blobs (data, but no tails as they are already written) */
+ for (; column < end_column; column++, blob_lengths++)
+ {
+ uchar *blob_pos;
+ uint length;
+ ulong blob_length;
+ if (!*blob_lengths) /* Null or "" */
+ continue;
+ length= column->length - portable_sizeof_char_ptr;
+ memcpy_fixed((uchar*) &blob_pos, record + column->offset + length,
+ sizeof(char*));
+ /* remove tail part */
+ blob_length= *blob_lengths;
+ if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
+ blob_length-= (blob_length % FULL_PAGE_SIZE(block_size));
+
+ if (blob_length && write_full_pages(info, lsn, block,
+ blob_pos, blob_length))
+ goto disk_err;
+ block+= block->sub_blocks;
+ }
+
+ _ma_finalize_row(info);
+ DBUG_RETURN(0);
+
+crashed:
+ /* Something was wrong with data on page */
+ my_errno= HA_ERR_WRONG_IN_RECORD;
+
+disk_err:
+ /**
+ @todo RECOVERY we are going to let dirty pages go to disk while we have
+ logged UNDO, this violates WAL. We must mark the table corrupted!
+
+ @todo RECOVERY we have written some REDOs without a closing UNDO,
+ it's possible that a next operation by this transaction succeeds and then
+ Recovery would glue the "orphan REDOs" to the succeeded operation and
+ execute the failed REDOs. We need some mark "abort this group" in the
+ log, or mark the table corrupted (then user will repair it and thus REDOs
+ will be skipped).
+
+ @todo RECOVERY to not let write errors go unnoticed, pagecache_write()
+ should take a MARIA_HA* in argument, and it it
+ fails when flushing a page to disk it should call
+ (*the_maria_ha->write_error_func)(the_maria_ha)
+ and this hook will mark the table corrupted.
+ Maybe hook should be stored in the pagecache's block structure, or in a
+ hash "file->maria_ha*".
+
+ @todo RECOVERY we should distinguish below between log write error and
+ table write error. The former should stop Maria immediately, the latter
+ should mark the table corrupted.
+ */
+ /*
+ Unpin all pinned pages to not cause problems for disk cache. This is
+ safe to call even if we already called _ma_unpin_all_pages() above.
+ */
+ save_my_errno= my_errno;
+ _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+ my_errno= save_my_errno;
+ DBUG_RETURN(1);
+}
+
+
+/*
+ @brief Write a record
+
+ @fn allocate_and_write_block_record()
+ @param info Maria handler
+ @param record Record to write
+ @param row Information about fields in 'record'
+ @param undo_lsn <> LSN_ERROR if we are executing an UNDO
+
+ @return
+ @retval 0 ok
+ @retval 1 Error
+*/
+
+static my_bool allocate_and_write_block_record(MARIA_HA *info,
+                                               const uchar *record,
+                                               MARIA_ROW *row,
+                                               LSN undo_lsn)
+{
+  struct st_row_pos_info row_pos;
+  MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks;
+  int save_my_errno;
+  DBUG_ENTER("allocate_and_write_block_record");
+
+  /*
+    Mark bitmap over-allocated (non flushable) until the row is fully
+    written; see matching _ma_bitmap_flushable(info, -1) in the error path.
+  */
+  _ma_bitmap_flushable(info, 1);
+  if (_ma_bitmap_find_place(info, row, blocks))
+    goto err;                                   /* Error reading bitmap */
+
+  /*
+    Sleep; a checkpoint will happen and should not send this over-allocated
+    bitmap to disk but rather wait.
+  */
+  DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(10););
+
+  /* page will be pinned & locked by get_head_or_tail_page */
+  if (get_head_or_tail_page(info, blocks->block, info->buff,
+                            row->space_on_head_page, HEAD_PAGE,
+                            PAGECACHE_LOCK_WRITE, &row_pos))
+    goto err;
+  row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr);
+  if (info->s->calc_checksum)
+  {
+    if (undo_lsn == LSN_ERROR)
+      row->checksum= (info->s->calc_checksum)(info, record);
+    else
+    {
+      /* _ma_apply_undo_row_delete() already set row's checksum. Verify it. */
+      DBUG_ASSERT(row->checksum == (info->s->calc_checksum)(info, record));
+    }
+  }
+  DBUG_PRINT("info", ("rowid: %lu (%lu:%u) length: %u", (ulong) row->lastpos,
+                      (ulong) ma_recordpos_to_page(row->lastpos),
+                      ma_recordpos_to_dir_entry(row->lastpos),
+                      row_pos.length));
+  /* write_block_record() does the data writing and REDO/UNDO logging */
+  if (write_block_record(info, (uchar*) 0, record, row,
+                         blocks, blocks->block->org_bitmap_value != 0,
+                         &row_pos, undo_lsn, 0))
+    goto err;
+  /* Now let checkpoint happen but don't commit */
+  DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(1000););
+  DBUG_RETURN(0);
+
+err:
+  /* Preserve the real error; cleanup calls below may change my_errno */
+  save_my_errno= my_errno;
+  if (info->non_flushable_state)
+    _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  my_errno= save_my_errno;
+  DBUG_RETURN(1);
+}
+
+
+/*
+ Write a record and return rowid for it
+
+ SYNOPSIS
+ _ma_write_init_block_record()
+ info Maria handler
+ record Record to write
+
+ NOTES
+ This is done BEFORE we write the keys to the row!
+
+ RETURN
+ HA_OFFSET_ERROR Something went wrong
+ # Rowid for row
+*/
+
+MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
+                                             const uchar *record)
+{
+  MARIA_ROW *row= &info->cur_row;
+  DBUG_ENTER("_ma_write_init_block_record");
+
+  /* Compute row/field sizes, then allocate space and write the row */
+  calc_record_size(info, record, row);
+  if (allocate_and_write_block_record(info, record, row, LSN_ERROR))
+    DBUG_RETURN(HA_OFFSET_ERROR);       /* my_errno holds the error */
+  DBUG_RETURN(row->lastpos);
+}
+
+
+/*
+ Dummy function for (*info->s->write_record)()
+
+ Nothing to do here, as we already wrote the record in
+ _ma_write_init_block_record()
+*/
+
+my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)),
+                               const uchar *record __attribute__ ((unused)))
+{
+  /* The row was already written by _ma_write_init_block_record() */
+  return 0;
+}
+
+
+/**
+ @brief Remove row written by _ma_write_block_record() and log undo
+
+ @param info Maria handler
+
+ @note
+ This is called in case we got a duplicate unique key while
+ writing keys.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool _ma_write_abort_block_record(MARIA_HA *info)
+{
+  my_bool res= 0;
+  MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
+  MARIA_BITMAP_BLOCK *block, *end;
+  LSN lsn= LSN_IMPOSSIBLE;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_write_abort_block_record");
+
+  _ma_bitmap_lock(share);  /* Lock bitmap from other insert threads */
+  /* Remove the head part of the row (first entry in the block list) */
+  if (delete_head_or_tail(info,
+                          ma_recordpos_to_page(info->cur_row.lastpos),
+                          ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1,
+                          0))
+    res= 1;
+  /*
+    The remaining used blocks are tails or full-page extents; free them
+    all. Continue on error so we release as much as possible.
+  */
+  for (block= blocks->block + 1, end= block + blocks->count - 1; block < end;
+       block++)
+  {
+    if (block->used & BLOCKUSED_USED)
+    {
+      if (block->used & BLOCKUSED_TAIL)
+      {
+        /*
+          block->page_count is set to the tail directory entry number in
+          write_block_record()
+        */
+        if (delete_head_or_tail(info, block->page,
+                                block->page_count & ~TAIL_BIT,
+                                0, 0))
+          res= 1;
+      }
+      else
+      {
+        if (free_full_page_range(info, block->page, block->page_count))
+          res= 1;
+      }
+    }
+  }
+  if (share->now_transactional)
+  {
+    /*
+      Log a CLR for the aborted insert; checksum delta is negative as the
+      row's contribution to the table checksum is removed.
+    */
+    if (_ma_write_clr(info, info->cur_row.orig_undo_lsn,
+                      LOGREC_UNDO_ROW_INSERT,
+                      share->calc_checksum != 0,
+                      (ha_checksum) 0 - info->cur_row.checksum,
+                      &lsn, (void*) 0))
+      res= 1;
+  }
+  _ma_bitmap_unlock(share);
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Update a record
+
+  NOTES
+    For the moment, we assume that info->cur_row.extents is always updated
+    when a row is read. In the future we may decide to read this on demand
+    for rows split into many extents.
+*/
+
+static my_bool _ma_update_block_record2(MARIA_HA *info,
+                                        MARIA_RECORD_POS record_pos,
+                                        const uchar *oldrec,
+                                        const uchar *record,
+                                        LSN undo_lsn)
+{
+  MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
+  uchar *buff;
+  MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
+  MARIA_PINNED_PAGE page_link;
+  uint rownr, org_empty_size, head_length;
+  uint block_size= info->s->block_size;
+  uint errpos= 0;                       /* Error location for DBUG tracing */
+  uchar *dir;
+  pgcache_page_no_t page;
+  struct st_row_pos_info row_pos;
+  my_bool res;
+  ha_checksum old_checksum;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_update_block_record2");
+  DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos));
+
+#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
+  DBUG_DUMP("oldrec", oldrec, share->base.reclength);
+  DBUG_DUMP("newrec", record, share->base.reclength);
+#endif
+
+  /*
+    Checksums of new and old rows were computed by callers already; new
+    row's was put into cur_row, old row's was put into new_row.
+  */
+  old_checksum= new_row->checksum;
+  new_row->checksum= cur_row->checksum;
+  calc_record_size(info, record, new_row);
+  page= ma_recordpos_to_page(record_pos);
+
+  /* Bitmap must stay non-flushable while the row is partially written */
+  _ma_bitmap_flushable(info, 1);
+  /* Read and pin the head page of the row with a write lock */
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, (pgcache_page_no_t) page, 0, 0,
+                       share->page_type,
+                       PAGECACHE_LOCK_WRITE, &page_link.link);
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= buff != 0;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  if (!buff)
+    goto err;
+
+  org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
+  rownr= ma_recordpos_to_dir_entry(record_pos);
+  dir= dir_entry_pos(buff, block_size, rownr);
+
+  /*
+    We can't use cur_row->head_length as the block may have been compacted
+    since we read it.
+  */
+  head_length= uint2korr(dir + 2);
+
+  if ((org_empty_size + head_length) >= new_row->total_length)
+  {
+    uint rec_offset, length;
+    MARIA_BITMAP_BLOCK block;
+
+    DBUG_PRINT("info", ("org_empty_size: %u org_length: %u new_length: %lu",
+                        org_empty_size, head_length,
+                        new_row->total_length));
+
+    /*
+      We can fit the new row in the same page as the original head part
+      of the row
+    */
+    block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
+                                                          org_empty_size);
+    /* Grow the row's area on the page to hold the new total length */
+    if (extend_area_on_page(info, buff, dir, rownr, block_size,
+                            new_row->total_length, &org_empty_size,
+                            &rec_offset, &length))
+    {
+      errpos= 1;
+      goto err;
+    }
+
+    row_pos.buff= buff;
+    row_pos.rownr= rownr;
+    row_pos.empty_space= org_empty_size;
+    row_pos.dir= dir;
+    row_pos.data= buff + rec_offset;
+    row_pos.length= length;
+    /* Describe the single (head) block for write_block_record() */
+    blocks->block= &block;
+    blocks->count= 1;
+    block.page= page;
+    block.sub_blocks= 1;
+    block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
+    block.empty_space= row_pos.empty_space;
+
+    /* Free the old row's tails and full-page extents before rewriting */
+    if (*cur_row->tail_positions &&
+        delete_tails(info, cur_row->tail_positions))
+    {
+      errpos= 2;
+      goto err;
+    }
+    if (cur_row->extents_count && free_full_pages(info, cur_row))
+    {
+      errpos= 3;
+      goto err;
+    }
+    res= write_block_record(info, oldrec, record, new_row, blocks,
+                            1, &row_pos, undo_lsn, old_checksum);
+    /* We can't update or delete this without re-reading it again */
+    info->update&= ~HA_STATE_AKTIV;
+    DBUG_RETURN(res);
+  }
+  /* New row doesn't fit on the old page. Delete old row */
+  if (*cur_row->tail_positions &&
+      delete_tails(info, cur_row->tail_positions))
+  {
+    errpos= 4;
+    goto err;
+  }
+  if (cur_row->extents_count && free_full_pages(info, cur_row))
+  {
+    errpos= 5;
+    goto err;
+  }
+
+  head_length= uint2korr(dir + 2);
+  /* Allocate new blocks, preferring to keep the head on the same page */
+  if (_ma_bitmap_find_new_place(info, new_row, page, head_length +
+                                org_empty_size, blocks))
+  {
+    errpos= 6;
+    goto err;
+  }
+
+  /*
+    Allocate all size in block for record
+    TODO:
+    Need to improve this to do compact if we can fit one more blob into
+    the head page
+  */
+  if ((head_length < new_row->space_on_head_page ||
+       (new_row->total_length <= head_length &&
+        org_empty_size + head_length >= new_row->total_length)))
+  {
+    _ma_compact_block_page(buff, block_size, rownr, 1,
+                           info->trn->min_read_from,
+                           share->base.min_block_length);
+    org_empty_size= 0;
+    head_length= uint2korr(dir + 2);    /* Compaction may have changed it */
+  }
+
+  row_pos.buff= buff;
+  row_pos.rownr= rownr;
+  row_pos.empty_space= org_empty_size + head_length;
+  row_pos.dir= dir;
+  row_pos.data= buff + uint2korr(dir);
+  row_pos.length= head_length;
+  if ((res= write_block_record(info, oldrec, record, new_row, blocks, 1,
+                               &row_pos, undo_lsn, old_checksum)))
+  {
+    errpos= 7;
+    goto err;
+  }
+  DBUG_RETURN(0);
+
+err:
+  DBUG_PRINT("error", ("errpos: %d", errpos));
+  if (info->non_flushable_state)
+    _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  DBUG_RETURN(1);
+}
+
+
+/*
+  @brief Store new row at its original position
+
+  @note
+  This is basically a copy of _ma_update_block_record2
+  When we have a purge thread for deleted rows, we can remove this function
+  and use _ma_update_block_record2 instead.
+
+  This is the main reason we don't make a lot of subfunctions that are
+  common between _ma_update_block_record2() and this function.
+
+  Note: If something goes wrong we mark the file crashed
+*/
+
+static my_bool _ma_update_at_original_place(MARIA_HA *info,
+                                            pgcache_page_no_t page,
+                                            uint rownr,
+                                            uint length_on_head_page,
+                                            uint extent_count,
+                                            const uchar *extent_info,
+                                            const uchar *oldrec,
+                                            const uchar *record,
+                                            LSN undo_lsn)
+{
+  MARIA_BITMAP_BLOCKS *blocks;
+  MARIA_BITMAP_BLOCK *block;
+  MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
+  MARIA_PINNED_PAGE page_link;
+  MARIA_SHARE *share= info->s;
+  ha_checksum old_checksum;
+  uint org_empty_size, empty_size;
+  uint block_size= info->s->block_size;
+  uchar *dir, *buff;
+  struct st_row_pos_info row_pos;
+  my_bool res;
+  uint rec_offset, length;
+  DBUG_ENTER("_ma_update_at_original_place");
+
+#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
+  DBUG_DUMP("oldrec", oldrec, share->base.reclength);
+  DBUG_DUMP("newrec", record, share->base.reclength);
+#endif
+
+  /*
+    Checksums of new and old rows were computed by callers already; new
+    row's was put into cur_row, old row's was put into new_row.
+  */
+  old_checksum= new_row->checksum;
+  new_row->checksum= cur_row->checksum;
+  calc_record_size(info, record, new_row);
+
+  _ma_bitmap_flushable(info, 1);
+  /* Read and pin the head page with a write lock */
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, (pgcache_page_no_t) page, 0, 0,
+                       share->page_type,
+                       PAGECACHE_LOCK_WRITE, &page_link.link);
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= buff != 0;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  if (!buff)
+    goto err;
+
+  org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
+  dir= dir_entry_pos(buff, block_size, rownr);
+
+  /* The original head part must fit back on the page; else data is bad */
+  if ((org_empty_size + cur_row->head_length) < length_on_head_page)
+  {
+    DBUG_PRINT("error",
+               ("org_empty_size: %u head_length: %u length_on_page: %u",
+                org_empty_size, (uint) cur_row->head_length,
+                length_on_head_page));
+    my_errno= HA_ERR_WRONG_IN_RECORD;
+    goto err;
+  }
+
+  /*
+    We can fit the new row in the same page as the original head part
+    of the row
+  */
+  empty_size= org_empty_size;
+  if (extend_area_on_page(info, buff, dir, rownr, block_size,
+                          length_on_head_page, &empty_size,
+                          &rec_offset, &length))
+    goto err;
+
+  row_pos.buff= buff;
+  row_pos.rownr= rownr;
+  row_pos.empty_space= empty_size;
+  row_pos.dir= dir;
+  row_pos.data= buff + rec_offset;
+
+  /* Delete old row */
+  if (*cur_row->tail_positions &&
+      delete_tails(info, cur_row->tail_positions))
+    goto err;
+  if (cur_row->extents_count && free_full_pages(info, cur_row))
+    goto err;
+
+  /* Change extent information to be usable by write_block_record() */
+  blocks= &cur_row->insert_blocks;
+  if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
+    goto err;
+  block= blocks->block;
+  block->empty_space= row_pos.empty_space;
+  block->org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
+                                                         org_empty_size);
+  DBUG_ASSERT(block->org_bitmap_value ==
+              _ma_bitmap_get_page_bits(info, &info->s->bitmap, page));
+  block->used|= BLOCKUSED_USE_ORG_BITMAP;
+
+  /*
+    We have to use <= below as the new_row may be smaller than the original
+    row as the new row doesn't have transaction id
+  */
+
+  DBUG_ASSERT(blocks->count > 1 ||
+              max(new_row->total_length, share->base.min_block_length) <=
+              length_on_head_page);
+
+  /* Store same amount of data on head page as on original page */
+  row_pos.length= (length_on_head_page -
+                   (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
+  set_if_bigger(row_pos.length, share->base.min_block_length);
+  if ((res= write_block_record(info, oldrec, record, new_row, blocks,
+                               1, &row_pos, undo_lsn, old_checksum)))
+    goto err;
+  DBUG_RETURN(0);
+
+err:
+  /* Any failure leaves the row in an unknown state; mark table crashed */
+  _ma_mark_file_crashed(share);
+  if (info->non_flushable_state)
+    _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  DBUG_RETURN(1);
+}
+
+
+/* Wrapper for _ma_update_block_record2() used by ma_update() */
+
+my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos,
+                                const uchar *orig_rec, const uchar *new_rec)
+{
+  /* Normal update path: we are not executing an UNDO, hence LSN_ERROR */
+  my_bool res= _ma_update_block_record2(info, record_pos, orig_rec, new_rec,
+                                        LSN_ERROR);
+  return res;
+}
+
+
+/*
+  Delete a directory entry
+
+  SYNOPSIS
+    delete_dir_entry()
+    buff                Page buffer
+    block_size          Block size
+    record_number       Record number to delete
+    empty_space_res     OUT: Empty space on page after delete
+
+  RETURN
+    -1    Error on page
+    0     ok
+    1     Page is now empty
+*/
+
+static int delete_dir_entry(uchar *buff, uint block_size, uint record_number,
+                            uint *empty_space_res)
+{
+  uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
+  uint length, empty_space;
+  uchar *dir;
+  DBUG_ENTER("delete_dir_entry");
+
+#ifdef SANITY_CHECKS
+  /* Entry must exist and fit within the page's directory area */
+  if (record_number >= number_of_records ||
+      record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 -
+                        PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE))
+  {
+    DBUG_PRINT("error", ("record_number: %u number_of_records: %u",
+                         record_number, number_of_records));
+
+    DBUG_RETURN(-1);
+  }
+#endif
+
+  check_directory(buff, block_size, 0, (uint) -1);
+  empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+  dir= dir_entry_pos(buff, block_size, record_number);
+  length= uint2korr(dir + 2);           /* Length of row data to free */
+
+  if (record_number == number_of_records - 1)
+  {
+    /* Delete this entry and all following free directory entries */
+    uchar *end= buff + block_size - PAGE_SUFFIX_SIZE;
+    number_of_records--;
+    dir+= DIR_ENTRY_SIZE;
+    empty_space+= DIR_ENTRY_SIZE;
+
+    /*
+      Unlink and free the next empty ones.
+      A free entry has offset 0 (dir[0] == dir[1] == 0); dir[2]/dir[3]
+      link it into the page's free-entry list (END_OF_DIR_FREE_LIST
+      terminates; buff[DIR_FREE_OFFSET] is the list head).
+    */
+    while (dir < end && dir[0] == 0 && dir[1] == 0)
+    {
+      number_of_records--;
+      if (dir[2] == END_OF_DIR_FREE_LIST)
+        buff[DIR_FREE_OFFSET]= dir[3];  /* Entry was first in free list */
+      else
+      {
+        uchar *prev_entry= dir_entry_pos(buff, block_size, (uint) dir[2]);
+        DBUG_ASSERT(uint2korr(prev_entry) == 0 && prev_entry[3] ==
+                    number_of_records);
+        prev_entry[3]= dir[3];
+      }
+      if (dir[3] != END_OF_DIR_FREE_LIST)
+      {
+        uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
+        DBUG_ASSERT(uint2korr(next_entry) == 0 && next_entry[2] ==
+                    number_of_records);
+        next_entry[2]= dir[2];
+      }
+      dir+= DIR_ENTRY_SIZE;
+      empty_space+= DIR_ENTRY_SIZE;
+    }
+
+    if (number_of_records == 0)
+    {
+      /* All entries on page deleted */
+      DBUG_PRINT("info", ("Page marked as unallocated"));
+      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+      {
+        /* Zero the whole directory so pages compare equal after recovery */
+        dir= dir_entry_pos(buff, block_size, record_number);
+        bzero(dir, (record_number+1) * DIR_ENTRY_SIZE);
+      }
+#endif
+      *empty_space_res= block_size;
+      DBUG_RETURN(1);
+    }
+    buff[DIR_COUNT_OFFSET]= (uchar) number_of_records;
+  }
+  else
+  {
+    /* Update directory: mark entry free and put it first in free list */
+    dir[0]= dir[1]= 0;                  /* Offset 0 marks entry as free */
+    dir[2]= END_OF_DIR_FREE_LIST;
+    if ((dir[3]= buff[DIR_FREE_OFFSET]) != END_OF_DIR_FREE_LIST)
+    {
+      /* Relink next entry to point to newly freed entry */
+      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
+      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
+                  next_entry[2] == END_OF_DIR_FREE_LIST);
+      next_entry[2]= record_number;
+    }
+    buff[DIR_FREE_OFFSET]= record_number;
+  }
+  empty_space+= length;
+
+  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
+  buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED;
+
+  *empty_space_res= empty_space;
+
+  check_directory(buff, block_size, 0, empty_space);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Delete a head or tail part of a row
+
+  SYNOPSIS
+    delete_head_or_tail()
+    info           Maria handler
+    page           Page (not file offset!) on which the row is
+    record_number  Directory entry number of the row part on the page
+    head           1 if this is a head page
+    from_update    1 if we are called from update. In this case we
+                   leave the page as write locked as we may put
+                   the new row into the old position.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool delete_head_or_tail(MARIA_HA *info,
+                                   pgcache_page_no_t page, uint record_number,
+                                   my_bool head, my_bool from_update)
+{
+  MARIA_SHARE *share= info->s;
+  uint empty_space;
+  uint block_size= share->block_size;
+  uchar *buff;
+  LSN lsn;
+  MARIA_PINNED_PAGE page_link;
+  int res;
+  enum pagecache_page_lock lock_at_write, lock_at_unpin;
+  DBUG_ENTER("delete_head_or_tail");
+  DBUG_PRINT("enter", ("id: %lu (%lu:%u)",
+                       (ulong) ma_recordpos(page, record_number),
+                       (ulong) page, record_number));
+
+  /* Read and pin the page with a write lock */
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, page, 0, 0,
+                       share->page_type,
+                       PAGECACHE_LOCK_WRITE, &page_link.link);
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= buff != 0;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  if (!buff)
+    DBUG_RETURN(1);
+  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
+              (head ? HEAD_PAGE : TAIL_PAGE));
+
+  if (from_update)
+  {
+    /* Caller may reuse the old position; keep the write lock */
+    lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+    lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK;
+  }
+  else
+  {
+    /* Downgrade to a read lock after the page has been modified */
+    lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ;
+    lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK;
+  }
+
+  res= delete_dir_entry(buff, block_size, record_number, &empty_space);
+  if (res < 0)
+    DBUG_RETURN(1);
+  if (res == 0) /* after our deletion, page is still not empty */
+  {
+    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    if (share->now_transactional)
+    {
+      /* Log REDO data: the purged directory entry on this page */
+      page_store(log_data + FILEID_STORE_SIZE, page);
+      dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+                   record_number);
+
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+      if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD :
+                                       LOGREC_REDO_PURGE_ROW_TAIL),
+                                info->trn, info,
+                                (translog_size_t) sizeof(log_data),
+                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                                log_data, NULL))
+        DBUG_RETURN(1);
+    }
+  }
+  else /* page is now empty */
+  {
+    if (share->now_transactional)
+    {
+      /* Log REDO that the whole head/tail page was freed */
+      uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE];
+      LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+      page_store(log_data + FILEID_STORE_SIZE, page);
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+      if (translog_write_record(&lsn, LOGREC_REDO_FREE_HEAD_OR_TAIL,
+                                info->trn, info,
+                                (translog_size_t) sizeof(log_data),
+                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                                log_data, NULL))
+        DBUG_RETURN(1);
+    }
+    DBUG_ASSERT(empty_space >= share->bitmap.sizes[0]);
+  }
+
+  /* Release/downgrade the lock but leave the page pinned */
+  pagecache_unlock_by_link(share->pagecache, page_link.link,
+                           lock_at_write,
+                           PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
+                           LSN_IMPOSSIBLE, 1, FALSE);
+  page_link.unlock= lock_at_unpin;
+  set_dynamic(&info->pinned_pages, (void*) &page_link,
+              info->pinned_pages.elements-1);
+
+  DBUG_PRINT("info", ("empty_space: %u", empty_space));
+
+  /*
+    If there is not enough space for all possible tails, mark the
+    page full
+  */
+  if (!head && !enough_free_entries(buff, share->block_size,
+                                    1 + share->base.blobs))
+    empty_space= 0;
+
+  DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space));
+}
+
+
+/*
+ delete all tails
+
+ SYNOPSIS
+ delete_tails()
+ info Handler
+ tails Pointer to vector of tail positions, ending with 0
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails)
+{
+  my_bool failed= 0;
+  MARIA_RECORD_POS pos;
+  DBUG_ENTER("delete_tails");
+  /* Try every tail even if one fails, so we free as much as possible */
+  while ((pos= *tails++))
+  {
+    if (delete_head_or_tail(info, ma_recordpos_to_page(pos),
+                            ma_recordpos_to_dir_entry(pos), 0, 1))
+      failed= 1;
+  }
+  DBUG_RETURN(failed);
+}
+
+
+/*
+ Delete a record
+
+ NOTES
+ For the moment, we assume that info->cur_row.extents is always updated
+ when a row is read. In the future we may decide to read this on demand
+ for rows with many splits.
+*/
+
+my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
+{
+  pgcache_page_no_t page;
+  uint record_number;
+  MARIA_SHARE *share= info->s;
+  LSN lsn= LSN_IMPOSSIBLE;
+  DBUG_ENTER("_ma_delete_block_record");
+
+  page= ma_recordpos_to_page(info->cur_row.lastpos);
+  record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos);
+  DBUG_PRINT("enter", ("rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
+                       (ulong) page, record_number));
+
+  _ma_bitmap_flushable(info, 1);
+  /* Delete head part, then all tails, then all full-page extents */
+  if (delete_head_or_tail(info, page, record_number, 1, 0) ||
+      delete_tails(info, info->cur_row.tail_positions))
+    goto err;
+
+  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
+    goto err;
+
+  if (share->now_transactional)
+  {
+    uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
+                   DIRPOS_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
+                   HA_CHECKSUM_STORE_SIZE];
+    uchar *log_pos;
+    size_t row_length;
+    uint row_parts_count, extents_length;
+    ha_checksum checksum_delta;
+
+    /* Write UNDO record */
+    lsn_store(log_data, info->trn->undo_lsn);
+    page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page);
+    log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+    dirpos_store(log_pos, record_number);
+    log_pos+= DIRPOS_STORE_SIZE;
+    /* Head length without the row header */
+    int2store(log_pos, info->cur_row.head_length -
+              info->cur_row.header_length);
+    log_pos+= 2;
+    pagerange_store(log_pos, info->cur_row.extents_count);
+    log_pos+= PAGERANGE_STORE_SIZE;
+
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= log_data;
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length=
+      sizeof(log_data) - HA_CHECKSUM_STORE_SIZE;
+    /* Checksum delta is negative as the row is going away */
+    store_checksum_in_rec(share, checksum_delta,
+                          (ha_checksum) 0 - info->cur_row.checksum, log_pos,
+                          info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
+                                              0].length);
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
+      info->cur_row.extents;
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
+      extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;
+
+    /* Append the row's data parts so the delete can be undone */
+    row_length= fill_insert_undo_parts(info, record,
+                                       (info->log_row_parts +
+                                        TRANSLOG_INTERNAL_PARTS + 2),
+                                       &row_parts_count);
+
+    if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn,
+                              info,
+                              (translog_size_t)
+                              (info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
+                                                   0].length + row_length +
+                               extents_length),
+                              TRANSLOG_INTERNAL_PARTS + 2 + row_parts_count,
+                              info->log_row_parts,
+                              log_data + LSN_STORE_SIZE,
+                              &checksum_delta))
+      goto err;
+  }
+
+  _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(0);
+
+err:
+  _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  DBUG_RETURN(1);
+}
+
+
+/****************************************************************************
+ Reading of records
+****************************************************************************/
+
+/*
+ Read position to record from record directory at end of page
+
+ SYNOPSIS
+ get_record_position()
+ buff page buffer
+ block_size block size for page
+ record_number Record number in index
+ end_of_data pointer to end of data for record
+
+ RETURN
+ 0 Error in data
+ # Pointer to start of record.
+ In this case *end_of_data is set.
+*/
+
+static uchar *get_record_position(uchar *buff, uint block_size,
+                                  uint record_number, uchar **end_of_data)
+{
+  uint num_records= (uint) buff[DIR_COUNT_OFFSET];
+  uchar *dir_entry, *row_start;
+  uint row_offset, row_length;
+
+#ifdef SANITY_CHECKS
+  /* Entry must exist and fit within the page's directory area */
+  if (record_number >= num_records ||
+      record_number > ((block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE) /
+                       DIR_ENTRY_SIZE))
+  {
+    DBUG_PRINT("error",
+               ("Wrong row number: record_number: %u number_of_records: %u",
+                record_number, num_records));
+    return 0;
+  }
+#endif
+
+  dir_entry= dir_entry_pos(buff, block_size, record_number);
+  row_offset= uint2korr(dir_entry);
+  row_length= uint2korr(dir_entry + 2);
+#ifdef SANITY_CHECKS
+  /* Row data must lie between page header and directory at end of page */
+  if (row_offset < PAGE_HEADER_SIZE ||
+      row_offset + row_length > (block_size -
+                                 num_records * DIR_ENTRY_SIZE -
+                                 PAGE_SUFFIX_SIZE))
+  {
+    DBUG_PRINT("error",
+               ("Wrong row position: record_number: %u offset: %u "
+                "length: %u number_of_records: %u",
+                record_number, row_offset, row_length, num_records));
+    return 0;
+  }
+#endif
+  row_start= buff + row_offset;
+  *end_of_data= row_start + row_length;
+  return row_start;
+}
+
+
+/*
+ Init extent
+
+ NOTES
+ extent is a cursor over which pages to read
+*/
+
+static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info,
+                        uint extents, MARIA_RECORD_POS *tail_positions)
+{
+  uint page_count;
+  extent->extent= extent_info;
+  extent->extent_count= extents;
+  extent->page= page_korr(extent_info);         /* First extent */
+  page_count= (uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE) &
+               ~START_EXTENT_BIT);
+  extent->tail= page_count & TAIL_BIT;
+  if (extent->tail)
+  {
+    /* A tail extent covers one page; remaining bits hold the row number */
+    extent->page_count= 1;
+    extent->tail_row_nr= page_count & ~TAIL_BIT;
+  }
+  else
+    extent->page_count= page_count;
+  extent->tail_positions= tail_positions;
+  extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED;
+}
+
+
+/*
+ Read next extent
+
+ SYNOPSIS
+ read_next_extent()
+ info Maria handler
+ extent Pointer to current extent (this is updated to point
+ to next)
+ end_of_data Pointer to end of data in read block (out)
+
+ NOTES
+ New block is read into info->buff
+
+ RETURN
+ 0 Error; my_errno is set
+ # Pointer to start of data in read block
+ In this case end_of_data is updated to point to end of data.
+*/
+
+static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
+                               uchar **end_of_data)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *buff, *data;
+  MARIA_PINNED_PAGE page_link;
+  enum pagecache_page_lock lock;
+  DBUG_ENTER("read_next_extent");
+
+  if (!extent->page_count)
+  {
+    /* Current extent exhausted; advance cursor to the next extent entry */
+    uint page_count;
+    if (!--extent->extent_count)
+      goto crashed;
+    extent->extent+= ROW_EXTENT_SIZE;
+    extent->page= page_korr(extent->extent);
+    page_count= (uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE) &
+                 ~START_EXTENT_BIT);
+    if (!page_count)
+      goto crashed;
+    extent->tail= page_count & TAIL_BIT;
+    if (extent->tail)
+      extent->tail_row_nr= page_count & ~TAIL_BIT;
+    else
+      extent->page_count= page_count;
+    DBUG_PRINT("info",("New extent. Page: %lu page_count: %u tail_flag: %d",
+                       (ulong) extent->page, extent->page_count,
+                       extent->tail != 0));
+  }
+  extent->first_extent= 0;
+
+  /* Tail pages may need a write lock (set by read_row_extent_info()) */
+  lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
+  if (extent->tail)
+    lock= extent->lock_for_tail_pages;
+
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, extent->page, 0,
+                       info->buff, share->page_type,
+                       lock, &page_link.link);
+  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
+  {
+    /* Read during UNDO; remember the pinned page so it is unlocked later */
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    page_link.changed= buff != 0;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+  }
+  if (!buff)
+  {
+    /* check if we tried to read over end of file (ie: bad data in record) */
+    if ((extent->page + 1) * share->block_size >
+        share->state.state.data_file_length)
+      goto crashed;
+    DBUG_RETURN(0);
+  }
+
+  if (!extent->tail)
+  {
+    /* Full data page */
+    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
+      goto crashed;
+    extent->page++;                             /* point to next page */
+    extent->page_count--;
+    *end_of_data= buff + share->block_size - PAGE_SUFFIX_SIZE;
+    info->cur_row.full_page_count++;            /* For maria_chk */
+    DBUG_RETURN(extent->data_start= buff + LSN_SIZE + PAGE_TYPE_SIZE);
+  }
+
+  /* Found tail */
+  if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
+    goto crashed;
+  *(extent->tail_positions++)= ma_recordpos(extent->page,
+                                            extent->tail_row_nr);
+  info->cur_row.tail_count++;                   /* For maria_chk */
+
+  if (!(data= get_record_position(buff, share->block_size,
+                                  extent->tail_row_nr,
+                                  end_of_data)))
+    goto crashed;
+  extent->data_start= data;
+  extent->page_count= 0;                        /* No more data in extent */
+  DBUG_RETURN(data);
+
+
+crashed:
+  my_errno= HA_ERR_WRONG_IN_RECORD;             /* File crashed */
+  DBUG_PRINT("error", ("wrong extent information"));
+  DBUG_RETURN(0);
+}
+
+
+/*
+ Read data that may be split over many blocks
+
+ SYNOPSIS
+ read_long_data()
+ info Maria handler
+ to Store result string here (this is allocated)
+ extent Pointer to current extent position
+ data Current position in buffer
+ end_of_data End of data in buffer
+
+ NOTES
+ When we have to read a new buffer, it's read into info->buff
+
+ This loop is implemented by goto's instead of a for() loop as
+    the code is notably smaller and faster this way (and it's not nice
+ to jump into a for loop() or into a 'then' clause)
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+static my_bool read_long_data2(MARIA_HA *info, uchar *to, ulong length,
+                               MARIA_EXTENT_CURSOR *extent,
+                               uchar **data, uchar **end_of_data)
+{
+  uint left_length= (uint) (*end_of_data - *data);
+  DBUG_ENTER("read_long_data2");
+  DBUG_PRINT("enter", ("length: %lu left_length: %u",
+                       length, left_length));
+  DBUG_ASSERT(*data <= *end_of_data);
+
+  /*
+    Fields are never split in middle. This means that if length > rest-of-data
+    we should start reading from the next extent. The reason we may have
+    data left on the page is that if the fixed part of the row was less than
+    min_block_length the head block was extended to min_block_length.
+
+    This may change in the future, which is why we have the loop written
+    the way it's written.
+  */
+  if (extent->first_extent && length > left_length)
+  {
+    *end_of_data= *data;
+    left_length= 0;
+  }
+
+  for(;;)
+  {
+    if (unlikely(left_length >= length))
+    {
+      /* Last piece: the remainder fits in the current block */
+      memcpy(to, *data, length);
+      (*data)+= length;
+      DBUG_PRINT("info", ("left_length: %u", left_length - (uint) length));
+      DBUG_RETURN(0);
+    }
+    /* Copy what is available, then continue in the next extent */
+    memcpy(to, *data, left_length);
+    to+= left_length;
+    length-= left_length;
+    if (!(*data= read_next_extent(info, extent, end_of_data)))
+      break;
+    left_length= (uint) (*end_of_data - *data);
+  }
+  DBUG_RETURN(1);
+}
+
+/*
+  Inline fast path for read_long_data2(): when the whole piece fits in the
+  current block we copy it directly; otherwise fall back to the multi-extent
+  reader. Behavior is identical to calling read_long_data2() directly.
+*/
+
+static inline my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length,
+                                     MARIA_EXTENT_CURSOR *extent,
+                                     uchar **data, uchar **end_of_data)
+{
+  /* Fast path: requested piece is fully contained in the current block */
+  if (likely((uint) (*end_of_data - *data) >= length))
+  {
+    memcpy(to, *data, length);
+    *data+= length;
+    return 0;
+  }
+  /* Slow path: data continues in following extents */
+  return read_long_data2(info, to, length, extent, data, end_of_data);
+}
+
+
+/*
+ Read a record from page (helper function for _ma_read_block_record())
+
+ SYNOPSIS
+ _ma_read_block_record2()
+ info Maria handler
+ record Store record here
+ data Start of head data for row
+ end_of_data End of data for row
+
+ NOTES
+ The head page is already read by caller
+ Following data is update in info->cur_row:
+
+ cur_row.head_length is set to size of entry in head block
+ cur_row.tail_positions is set to point to all tail blocks
+ cur_row.extents points to extents data
+ cur_row.extents_counts contains number of extents
+ cur_row.empty_bits is set to empty bits
+ cur_row.field_lengths contains packed length of all fields
+ cur_row.blob_length contains total length of all blobs
+ cur_row.checksum contains checksum of read record.
+
+ RETURN
+ 0 ok
+ # Error code
+*/
+
+int _ma_read_block_record2(MARIA_HA *info, uchar *record,
+                           uchar *data, uchar *end_of_data)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *field_length_data, *blob_buffer, *start_of_data;
+  uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths;
+  my_bool found_blob= 0;
+  MARIA_EXTENT_CURSOR extent;
+  MARIA_COLUMNDEF *column, *end_column;
+  MARIA_ROW *cur_row= &info->cur_row;
+  DBUG_ENTER("_ma_read_block_record2");
+
+  LINT_INIT(field_length_data);
+  LINT_INIT(blob_buffer);
+
+  start_of_data= data;
+  flag= (uint) (uchar) data[0];
+  cur_null_bytes= share->base.original_null_bytes;
+  null_bytes= share->base.null_bytes;
+  cur_row->head_length= (uint) (end_of_data - data);
+  cur_row->full_page_count= cur_row->tail_count= 0;
+  cur_row->blob_length= 0;
+  /* Number of bytes in header that we don't need to write during undo */
+  cur_row->header_length= total_header_size[(flag & PRECALC_HEADER_BITMASK)]-1;
+
+  if (flag & ROW_FLAG_TRANSID)
+  {
+    /* Row is versioned; verify that our transaction may see it */
+    cur_row->trid= transid_korr(data+1);
+    if (!info->trn)
+      DBUG_RETURN(my_errno= HA_ERR_WRONG_IN_RECORD);  /* File crashed */
+    if (!trnman_can_read_from(info->trn, cur_row->trid))
+      DBUG_RETURN(my_errno= HA_ERR_ROW_NOT_VISIBLE);
+  }
+
+  /* Skip trans header (for now, until we have MVCC support) */
+  data+= cur_row->header_length + 1 ;
+  if (flag & ROW_FLAG_NULLS_EXTENDED)
+    cur_null_bytes+= data[-1];
+
+  row_extents= 0;
+  if (flag & ROW_FLAG_EXTENTS)
+  {
+    uint row_extent_size;
+    /*
+      Record is split over many data pages.
+      Get number of extents and first extent
+    */
+    get_key_length(row_extents, data);
+    cur_row->extents_count= row_extents;
+    row_extent_size= row_extents * ROW_EXTENT_SIZE;
+    if (cur_row->extents_buffer_length < row_extent_size &&
+        _ma_alloc_buffer(&cur_row->extents,
+                         &cur_row->extents_buffer_length,
+                         row_extent_size))
+      DBUG_RETURN(my_errno);
+    memcpy(cur_row->extents, data, ROW_EXTENT_SIZE);
+    data+= ROW_EXTENT_SIZE;
+    init_extent(&extent, cur_row->extents, row_extents,
+                cur_row->tail_positions);
+  }
+  else
+  {
+    cur_row->extents_count= 0;
+    (*cur_row->tail_positions)= 0;
+    extent.page_count= 0;
+    extent.extent_count= 1;
+  }
+  extent.first_extent= 1;
+
+  field_lengths= 0;
+  if (share->base.max_field_lengths)
+  {
+    get_key_length(field_lengths, data);
+    cur_row->field_lengths_length= field_lengths;
+#ifdef SANITY_CHECKS
+    if (field_lengths > share->base.max_field_lengths)
+      goto err;
+#endif
+  }
+
+  if (share->calc_checksum)
+    cur_row->checksum= (uint) (uchar) *data++;
+  /* data now points on null bits */
+  memcpy(record, data, cur_null_bytes);
+  if (unlikely(cur_null_bytes != null_bytes))
+  {
+    /*
+      This only happens if we have added more NULL columns with
+      ALTER TABLE and are fetching an old, not yet modified old row
+    */
+    bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes));
+  }
+  data+= null_bytes;
+  /* We copy the empty bits to be able to use them for delete/update */
+  memcpy(cur_row->empty_bits, data, share->base.pack_bytes);
+  data+= share->base.pack_bytes;
+
+  /* TODO: Use field offsets, instead of just skipping them */
+  data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
+
+  /*
+    Read row extents (note that first extent was already read into
+    cur_row->extents above)
+  */
+  if (row_extents > 1)
+  {
+    if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE,
+                       (row_extents - 1) * ROW_EXTENT_SIZE,
+                       &extent, &data, &end_of_data))
+      DBUG_RETURN(my_errno);
+  }
+
+  /*
+    Data now points to start of fixed length field data that can't be null
+    or 'empty'. Note that these fields can't be split over blocks.
+  */
+  for (column= share->columndef,
+         end_column= column + share->base.fixed_not_null_fields;
+       column < end_column; column++)
+  {
+    uint column_length= column->length;
+    if (data + column_length > end_of_data &&
+        !(data= read_next_extent(info, &extent, &end_of_data)))
+      goto err;
+    memcpy(record + column->offset, data, column_length);
+    data+= column_length;
+  }
+
+  /* Read array of field lengths. This may be stored in several extents */
+  if (field_lengths)
+  {
+    field_length_data= cur_row->field_lengths;
+    if (read_long_data(info, field_length_data, field_lengths, &extent,
+                       &data, &end_of_data))
+      DBUG_RETURN(my_errno);
+  }
+
+  /* Read variable length data. Each of these may be split over many extents */
+  for (end_column= share->columndef + share->base.fields;
+       column < end_column; column++)
+  {
+    enum en_fieldtype type= column->type;
+    uchar *field_pos= record + column->offset;
+    /* First check if field is present in record */
+    if ((record[column->null_pos] & column->null_bit) ||
+        (cur_row->empty_bits[column->empty_pos] & column->empty_bit))
+    {
+      /* NULL or empty field: fill with spaces or zeroes */
+      bfill(record + column->offset, column->fill_length,
+            type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+      continue;
+    }
+    switch (type) {
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_SKIP_PRESPACE:
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      if (data + column->length > end_of_data &&
+          !(data= read_next_extent(info, &extent, &end_of_data)))
+        goto err;
+      memcpy(field_pos, data, column->length);
+      data+= column->length;
+      break;
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      /* Char that is space filled */
+      uint length;
+      if (column->length <= 255)
+        length= (uint) (uchar) *field_length_data++;
+      else
+      {
+        length= uint2korr(field_length_data);
+        field_length_data+= 2;
+      }
+#ifdef SANITY_CHECKS
+      if (length > column->length)
+        goto err;
+#endif
+      if (read_long_data(info, field_pos, length, &extent, &data,
+                         &end_of_data))
+        DBUG_RETURN(my_errno);
+      bfill(field_pos + length, column->length - length, ' ');
+      break;
+    }
+    case FIELD_VARCHAR:
+    {
+      ulong length;
+      if (column->length <= 256)
+      {
+        /* 1-byte length prefix; copy it into the record as well */
+        length= (uint) (uchar) (*field_pos++= *field_length_data++);
+      }
+      else
+      {
+        /* 2-byte length prefix */
+        length= uint2korr(field_length_data);
+        field_pos[0]= field_length_data[0];
+        field_pos[1]= field_length_data[1];
+        field_pos+= 2;
+        field_length_data+= 2;
+      }
+#ifdef SANITY_CHECKS
+      if (length > column->length)
+        goto err;
+#endif
+      if (read_long_data(info, field_pos, length, &extent, &data,
+                         &end_of_data))
+        DBUG_RETURN(my_errno);
+      break;
+    }
+    case FIELD_BLOB:
+    {
+      uint column_size_length= column->length - portable_sizeof_char_ptr;
+      ulong blob_length= _ma_calc_blob_length(column_size_length,
+                                              field_length_data);
+
+      if (!found_blob)
+      {
+        /* Calculate total length for all blobs */
+        ulong blob_lengths= 0;
+        uchar *length_data= field_length_data;
+        MARIA_COLUMNDEF *blob_field= column;
+
+        found_blob= 1;
+        for (; blob_field < end_column; blob_field++)
+        {
+          uint size_length;
+          if ((record[blob_field->null_pos] & blob_field->null_bit) ||
+              (cur_row->empty_bits[blob_field->empty_pos] &
+               blob_field->empty_bit))
+            continue;
+          size_length= blob_field->length - portable_sizeof_char_ptr;
+          blob_lengths+= _ma_calc_blob_length(size_length, length_data);
+          length_data+= size_length;
+        }
+        cur_row->blob_length= blob_lengths;
+        DBUG_PRINT("info", ("Total blob length: %lu", blob_lengths));
+        if (_ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
+                             blob_lengths))
+          DBUG_RETURN(my_errno);
+        blob_buffer= info->blob_buff;
+      }
+
+      /* Store blob length prefix and pointer into the record */
+      memcpy(field_pos, field_length_data, column_size_length);
+      memcpy_fixed(field_pos + column_size_length, (uchar *) &blob_buffer,
+                   sizeof(char*));
+      field_length_data+= column_size_length;
+
+      /*
+        After we have read one extent, then each blob is in its own extent
+      */
+      if (!extent.first_extent || (ulong) (end_of_data - data) < blob_length)
+        end_of_data= data;                      /* Force read of next extent */
+
+      if (read_long_data(info, blob_buffer, blob_length, &extent, &data,
+                         &end_of_data))
+        DBUG_RETURN(my_errno);
+      blob_buffer+= blob_length;
+      break;
+    }
+    default:
+#ifdef EXTRA_DEBUG
+      DBUG_ASSERT(0);                           /* purecov: deadcode */
+#endif
+      goto err;
+    }
+    continue;
+  }
+
+  if (row_extents)
+  {
+    DBUG_PRINT("info", ("Row read: page_count: %u extent_count: %u",
+                        extent.page_count, extent.extent_count));
+    *extent.tail_positions= 0;                  /* End marker */
+    if (extent.page_count)
+      goto err;
+    if (extent.extent_count > 1)
+    {
+      /* All remaining extent entries must be unused (zero) */
+      if (_ma_check_if_zero(extent.extent + ROW_EXTENT_SIZE,
+                            (extent.extent_count-1) * ROW_EXTENT_SIZE))
+      {
+        DBUG_PRINT("error", ("Data in extent is not zero"));
+        DBUG_DUMP("extent", extent.extent + ROW_EXTENT_SIZE,
+                  (extent.extent_count-1) * ROW_EXTENT_SIZE);
+        goto err;
+      }
+    }
+  }
+  else
+  {
+    DBUG_PRINT("info", ("Row read"));
+    /*
+      data should normally point to end_of_data. The only exception is if
+      the row is very short in which case we allocated 'min_block_length' data
+      for allowing the row to expand.
+    */
+    if (data != end_of_data && (uint) (end_of_data - start_of_data) >
+        share->base.min_block_length)
+      goto err;
+  }
+#ifdef EXTRA_DEBUG
+  if (share->calc_checksum)
+  {
+    /* Ensure that row checksum is correct */
+    DBUG_ASSERT(((share->calc_checksum)(info, record) & 255) ==
+                cur_row->checksum);
+  }
+#endif
+  info->update|= HA_STATE_AKTIV;                /* We have an active record */
+  DBUG_RETURN(0);
+
+err:
+  /* Something was wrong with data on record */
+  DBUG_PRINT("error", ("Found record with wrong data"));
+  DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
+}
+
+
+/** @brief Read positions to tail blocks and full blocks
+
+ @fn read_row_extent_info()
+ @param info Handler
+
+ @notes
+ This function is a simpler version of _ma_read_block_record2()
+ The data about the used pages is stored in info->cur_row.
+
+ @return Status
+ @retval 0 ok
+ @retval 1 Error. my_errno contains error number
+*/
+
+static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff,
+                                    uint record_number)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_EXTENT_CURSOR extent;
+  MARIA_RECORD_POS *tail_pos;
+  uchar *data, *end_of_data;
+  uint flag, row_extents, row_extents_size, field_lengths;
+  uchar *extents, *end;
+  DBUG_ENTER("read_row_extent_info");
+
+  if (!(data= get_record_position(buff, share->block_size,
+                                  record_number, &end_of_data)))
+    DBUG_RETURN(1);                             /* Wrong in record */
+
+  flag= (uint) (uchar) data[0];
+  /* Skip trans header */
+  data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];
+
+  row_extents= 0;
+  row_extents_size= 0;
+  if (flag & ROW_FLAG_EXTENTS)
+  {
+    /*
+      Record is split over many data pages.
+      Get number of extents and first extent
+    */
+    get_key_length(row_extents, data);
+    row_extents_size= row_extents * ROW_EXTENT_SIZE;
+    if (info->cur_row.extents_buffer_length < row_extents_size &&
+        _ma_alloc_buffer(&info->cur_row.extents,
+                         &info->cur_row.extents_buffer_length,
+                         row_extents_size))
+      DBUG_RETURN(1);
+    memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE);
+    data+= ROW_EXTENT_SIZE;
+    init_extent(&extent, info->cur_row.extents, row_extents,
+                info->cur_row.tail_positions);
+    extent.first_extent= 1;
+  }
+  info->cur_row.extents_count= row_extents;
+
+  /* Read field-lengths length only to advance 'data'; value unused here */
+  if (share->base.max_field_lengths)
+    get_key_length(field_lengths, data);
+
+  if (share->calc_checksum)
+    info->cur_row.checksum= (uint) (uchar) *data++;
+  if (row_extents > 1)
+  {
+    data+= share->base.null_bytes;
+    data+= share->base.pack_bytes;
+    data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
+
+    /*
+      Read row extents (note that first extent was already read into
+      info->cur_row.extents above)
+      Lock tails with write lock as we will delete them later.
+    */
+    extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+    if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE,
+                       row_extents_size - ROW_EXTENT_SIZE,
+                       &extent, &data, &end_of_data))
+      DBUG_RETURN(1);
+  }
+
+  /* Update tail_positions with pointer to tails */
+  tail_pos= info->cur_row.tail_positions;
+  for (extents= info->cur_row.extents, end= extents + row_extents_size;
+       extents < end;
+       extents+= ROW_EXTENT_SIZE)
+  {
+    pgcache_page_no_t page= uint5korr(extents);
+    uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
+    if (page_count & TAIL_BIT)
+      *(tail_pos++)= ma_recordpos(page, (page_count & ~ (TAIL_BIT |
+                                                         START_EXTENT_BIT)));
+  }
+  *tail_pos= 0;                                 /* End marker */
+  DBUG_RETURN(0);
+}
+
+
+/*
+ Read a record based on record position
+
+ @fn _ma_read_block_record()
+ @param info Maria handler
+ @param record Store record here
+ @param record_pos Record position
+
+ @return Status
+ @retval 0 ok
+ @retval # Error number
+*/
+
+int _ma_read_block_record(MARIA_HA *info, uchar *record,
+                          MARIA_RECORD_POS record_pos)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *data, *end_of_data, *buff;
+  uint offset;
+  uint block_size= share->block_size;
+  DBUG_ENTER("_ma_read_block_record");
+  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
+                       (ulong) record_pos,
+                       (ulong) ma_recordpos_to_page(record_pos),
+                       ma_recordpos_to_dir_entry(record_pos)));
+
+  offset= ma_recordpos_to_dir_entry(record_pos);
+
+  /* Fetch the head page of the row through the page cache */
+  if (!(buff= pagecache_read(share->pagecache,
+                             &info->dfile, ma_recordpos_to_page(record_pos), 0,
+                             info->buff, share->page_type,
+                             PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+    DBUG_RETURN(my_errno);
+  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == HEAD_PAGE);
+  if (!(data= get_record_position(buff, block_size, offset, &end_of_data)))
+  {
+    DBUG_PRINT("error", ("Wrong directory entry in data block"));
+    my_errno= HA_ERR_RECORD_DELETED;            /* File crashed */
+    DBUG_RETURN(HA_ERR_RECORD_DELETED);
+  }
+  DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data));
+}
+
+
+/* compare unique constraint between stored rows */
+
+my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+                             const uchar *record, MARIA_RECORD_POS pos)
+{
+  uchar *org_rec_buff, *old_record;
+  size_t org_rec_buff_size;
+  int error;
+  DBUG_ENTER("_ma_cmp_block_unique");
+
+  if (!(old_record= my_alloca(info->s->base.reclength)))
+    DBUG_RETURN(1);
+
+  /* Don't let the compare destroy blobs that may be in use */
+  org_rec_buff= info->rec_buff;
+  org_rec_buff_size= info->rec_buff_size;
+  if (info->s->base.blobs)
+  {
+    /* Force realloc of record buffer*/
+    info->rec_buff= 0;
+    info->rec_buff_size= 0;
+  }
+  error= _ma_read_block_record(info, old_record, pos);
+  if (!error)
+    error= _ma_unique_comp(def, record, old_record, def->null_are_equal);
+  if (info->s->base.blobs)
+  {
+    /* Restore the caller's original record buffer */
+    my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+    info->rec_buff= org_rec_buff;
+    info->rec_buff_size= org_rec_buff_size;
+  }
+  DBUG_PRINT("exit", ("result: %d", error));
+  my_afree(old_record);
+  DBUG_RETURN(error != 0);
+}
+
+
+/****************************************************************************
+ Table scan
+****************************************************************************/
+
+/*
+ Allocate buffers for table scan
+
+ SYNOPSIS
+ _ma_scan_init_block_record(MARIA_HA *info)
+
+ IMPLEMENTATION
+ We allocate one buffer for the current bitmap and one buffer for the
+ current page
+
+ RETURN
+ 0 ok
+ 1 error (couldn't allocate memory or disk error)
+*/
+
+my_bool _ma_scan_init_block_record(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_scan_init_block_record");
+  /*
+    bitmap_buff may already be allocated if this is the second call to
+    rnd_init() without a rnd_end() in between, see sql/handler.h
+  */
+  if (!(info->scan.bitmap_buff ||
+        ((info->scan.bitmap_buff=
+          (uchar *) my_malloc(share->block_size * 2, MYF(MY_WME))))))
+    DBUG_RETURN(1);
+  /* One allocation holds both the bitmap buffer and the page buffer */
+  info->scan.page_buff= info->scan.bitmap_buff + share->block_size;
+  info->scan.bitmap_end= info->scan.bitmap_buff + share->bitmap.total_size;
+
+  /* Set scan variables to get _ma_scan_block() to start with reading bitmap */
+  info->scan.number_of_rows= 0;
+  info->scan.bitmap_pos= info->scan.bitmap_end;
+  info->scan.bitmap_page= (pgcache_page_no_t) 0 - share->bitmap.pages_covered;
+  info->scan.max_page= share->state.state.data_file_length / share->block_size;
+  /*
+    We need to flush what's in memory (bitmap.map) to page cache otherwise, as
+    we are going to read bitmaps from page cache in table scan (see
+    _ma_scan_block_record()), we may miss recently inserted rows (bitmap page
+    in page cache would be too old).
+  */
+  DBUG_RETURN(_ma_bitmap_flush(info->s));
+}
+
+
+/* Free buffers allocated by _ma_scan_block_init() */
+
+void _ma_scan_end_block_record(MARIA_HA *info)
+{
+  DBUG_ENTER("_ma_scan_end_block_record");
+  /* bitmap_buff also covers page_buff; both were one allocation */
+  my_free(info->scan.bitmap_buff, MYF(MY_ALLOW_ZERO_PTR));
+  info->scan.bitmap_buff= 0;
+  if (info->scan_save)
+  {
+    /* Free saved position from _ma_scan_remember_block_record() */
+    my_free(info->scan_save, MYF(0));
+    info->scan_save= 0;
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Save current scan position
+
+ @note
+ For the moment we can only remember one position, but this is
+ good enough for MySQL usage
+
+ @Warning
+ When this function is called, we assume that the thread is not deleting
+ or updating the current row before ma_scan_restore_block_record()
+ is called!
+
+ @return
+ @retval 0 ok
+ @retval HA_ERR_WRONG_IN_RECORD Could not allocate memory to hold position
+*/
+
+int _ma_scan_remember_block_record(MARIA_HA *info,
+                                   MARIA_RECORD_POS *lastpos)
+{
+  uchar *bitmap_buff;
+  DBUG_ENTER("_ma_scan_remember_block_record");
+  if (!(info->scan_save))
+  {
+    /* Allocate save area plus its private bitmap/page buffers in one block */
+    if (!(info->scan_save= my_malloc(ALIGN_SIZE(sizeof(*info->scan_save)) +
+                                     info->s->block_size * 2,
+                                     MYF(MY_WME))))
+      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+    info->scan_save->bitmap_buff= ((uchar*) info->scan_save +
+                                   ALIGN_SIZE(sizeof(*info->scan_save)));
+  }
+  /* Point to the last read row */
+  *lastpos= info->cur_row.nextpos - 1;
+  info->scan.dir+= DIR_ENTRY_SIZE;
+
+  /* Remember used bitmap and used head page */
+  bitmap_buff= info->scan_save->bitmap_buff;
+  memcpy(info->scan_save, &info->scan, sizeof(*info->scan_save));
+  info->scan_save->bitmap_buff= bitmap_buff;
+  memcpy(bitmap_buff, info->scan.bitmap_buff, info->s->block_size * 2);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief restore scan block to its original values
+
+ @note
+ In theory we could swap bitmap buffers instead of copy them.
+ For the moment we don't do that because there are variables pointing
+ inside the buffers and it's a bit of hassle to either make them relative
+ or repoint them.
+*/
+
+void _ma_scan_restore_block_record(MARIA_HA *info,
+                                   MARIA_RECORD_POS lastpos)
+{
+  uchar *bitmap_buff;
+  DBUG_ENTER("_ma_scan_restore_block_record");
+
+  info->cur_row.nextpos= lastpos;
+  /* Keep our own bitmap buffer; copy the saved contents back into it */
+  bitmap_buff= info->scan.bitmap_buff;
+  memcpy(&info->scan, info->scan_save, sizeof(*info->scan_save));
+  info->scan.bitmap_buff= bitmap_buff;
+  memcpy(bitmap_buff, info->scan_save->bitmap_buff, info->s->block_size * 2);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+ Read next record while scanning table
+
+ SYNOPSIS
+ _ma_scan_block_record()
+ info Maria handler
+ record Store found here
+ record_pos Value stored in info->cur_row.next_pos after last call
+ skip_deleted
+
+ NOTES
+ - One must have called mi_scan() before this
+    - In this version, we don't actually need record_pos, we could as easily
+ use a variable in info->scan
+
+ IMPLEMENTATION
+ Current code uses a lot of goto's to separate the different kind of
+ states we may be in. This gives us a minimum of executed if's for
+ the normal cases. I tried several different ways to code this, but
+ the current one was in the end the most readable and fastest.
+
+ RETURN
+ 0 ok
+ # Error code
+*/
+
+int _ma_scan_block_record(MARIA_HA *info, uchar *record,
+                          MARIA_RECORD_POS record_pos,
+                          my_bool skip_deleted __attribute__ ((unused)))
+{
+  uint block_size;
+  my_off_t filepos;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_scan_block_record");
+
+restart_record_read:
+  /* Find next row in current page */
+  while (likely(record_pos < info->scan.number_of_rows))
+  {
+    uint length, offset;
+    uchar *data, *end_of_data;
+    int error;
+
+    /* Skip deleted directory entries (offset == 0) */
+    while (!(offset= uint2korr(info->scan.dir)))
+    {
+      info->scan.dir-= DIR_ENTRY_SIZE;
+      record_pos++;
+#ifdef SANITY_CHECKS
+      if (info->scan.dir < info->scan.dir_end)
+      {
+        DBUG_ASSERT(0);
+        goto err;
+      }
+#endif
+    }
+    /* found row */
+    info->cur_row.lastpos= info->scan.row_base_page + record_pos;
+    info->cur_row.nextpos= record_pos + 1;
+    data= info->scan.page_buff + offset;
+    length= uint2korr(info->scan.dir + 2);
+    end_of_data= data + length;
+    info->scan.dir-= DIR_ENTRY_SIZE;            /* Point to previous row */
+#ifdef SANITY_CHECKS
+    if (end_of_data > info->scan.dir_end ||
+        offset < PAGE_HEADER_SIZE || length < share->base.min_block_length)
+    {
+      DBUG_ASSERT(!(end_of_data > info->scan.dir_end));
+      DBUG_ASSERT(!(offset < PAGE_HEADER_SIZE));
+      DBUG_ASSERT(!(length < share->base.min_block_length));
+      goto err;
+    }
+#endif
+    DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
+    error= _ma_read_block_record2(info, record, data, end_of_data);
+    if (error != HA_ERR_ROW_NOT_VISIBLE)
+      DBUG_RETURN(error);
+    /* Row not visible to our transaction; continue with the next one */
+    record_pos++;
+  }
+
+  /* Find next head page in current bitmap */
+restart_bitmap_scan:
+  block_size= share->block_size;
+  if (likely(info->scan.bitmap_pos < info->scan.bitmap_end))
+  {
+    uchar *data= info->scan.bitmap_pos;
+    longlong bits= info->scan.bits;
+    uint bit_pos= info->scan.bit_pos;
+
+    do
+    {
+      while (likely(bits))
+      {
+        uint pattern= (uint) (bits & 7);
+        bits >>= 3;
+        bit_pos++;
+        /* Patterns 1-4 in the 3-bit group mark a head page */
+        if (pattern > 0 && pattern <= 4)
+        {
+          /* Found head page; Read it */
+          pgcache_page_no_t page;
+          info->scan.bitmap_pos= data;
+          info->scan.bits= bits;
+          info->scan.bit_pos= bit_pos;
+          page= (info->scan.bitmap_page + 1 +
+                 (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1);
+          info->scan.row_base_page= ma_recordpos(page, 0);
+          if (page >= info->scan.max_page)
+          {
+            DBUG_PRINT("info", ("Found end of file"));
+            DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
+          }
+          if (!(pagecache_read(share->pagecache,
+                               &info->dfile,
+                               page, 0, info->scan.page_buff,
+                               share->page_type,
+                               PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+            DBUG_RETURN(my_errno);
+          if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) !=
+               HEAD_PAGE))
+          {
+            /*
+              This may happen if someone has been deleting all rows
+              from a page since we read the bitmap, so it may be ok.
+              Print warning in debug log and continue.
+            */
+            DBUG_PRINT("warning",
+                       ("Found page of type %d when expecting head page",
+                        (info->scan.page_buff[PAGE_TYPE_OFFSET] &
+                         PAGE_TYPE_MASK)));
+            continue;
+          }
+          if ((info->scan.number_of_rows=
+               (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0)
+          {
+            DBUG_PRINT("error", ("Wrong page header"));
+            DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
+          }
+          DBUG_PRINT("info", ("Page %lu has %u rows",
+                              (ulong) page, info->scan.number_of_rows));
+          /* Directory grows downwards from the end of the page */
+          info->scan.dir= (info->scan.page_buff + block_size -
+                           PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
+          info->scan.dir_end= (info->scan.dir -
+                               (info->scan.number_of_rows - 1) *
+                               DIR_ENTRY_SIZE);
+          record_pos= 0;
+          goto restart_record_read;
+        }
+      }
+      for (data+= 6; data < info->scan.bitmap_end; data+= 6)
+      {
+        bits= uint6korr(data);
+        /* Skip not allocated pages and blob / full tail pages */
+        if (bits && bits != LL(07777777777777777))
+          break;
+      }
+      bit_pos= 0;
+    } while (data < info->scan.bitmap_end);
+  }
+
+  /* Read next bitmap */
+  info->scan.bitmap_page+= share->bitmap.pages_covered;
+  filepos= (my_off_t) info->scan.bitmap_page * block_size;
+  if (unlikely(filepos >= share->state.state.data_file_length))
+  {
+    DBUG_PRINT("info", ("Found end of file"));
+    DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
+  }
+  DBUG_PRINT("info", ("Reading bitmap at %lu",
+                      (ulong) info->scan.bitmap_page));
+  if (!(pagecache_read(share->pagecache, &info->s->bitmap.file,
+                       info->scan.bitmap_page,
+                       0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE,
+                       PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+    DBUG_RETURN(my_errno);
+  /* Skip scanning 'bits' in bitmap scan code */
+  info->scan.bitmap_pos= info->scan.bitmap_buff - 6;
+  info->scan.bits= 0;
+  goto restart_bitmap_scan;
+
+err:
+  DBUG_PRINT("error", ("Wrong data on page"));
+  DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
+}
+
+
+/*
+ Compare a row against a stored one
+
+ NOTES
+ Not implemented, as block record is not supposed to be used in a shared
+ global environment
+*/
+
+my_bool _ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)),
+                                 const uchar *record __attribute__ ((unused)))
+{
+  /*
+    Intentionally a no-op: block records are not used in a shared global
+    environment, so there is never a concurrently modified copy to compare
+    against. Always report "equal".
+  */
+  return 0;
+}
+
+
+/*
+ Store an integer with simple packing
+
+ SYNOPSIS
+    ma_store_length()
+ to Store the packed integer here
+ nr Integer to store
+
+ NOTES
+ This is mostly used to store field numbers and lengths of strings.
+    We have to cast the result for the LL() because of a bug in Forte CC
+ compiler.
+
+ Packing used is:
+ nr < 251 is stored as is (in 1 byte)
+ Numbers that require 1-4 bytes are stored as char(250+byte_length), data
+ Bigger numbers are stored as 255, data as ulonglong (not yet done).
+
+ RETURN
+ Position in 'to' after the packed length
+*/
+
+/*
+  Store 'nr' with simple variable-length packing and return the position
+  just after the stored bytes.
+
+  Encoding (same as documented above):
+    nr < 251            -> 1 byte, the value itself
+    251 <= nr <= 255    -> 0xFB marker + 1 byte
+    nr < 65536          -> 0xFC marker + 2 bytes
+    nr < 16777216       -> 0xFD marker + 3 bytes
+    otherwise           -> 0xFE marker + 4 bytes
+*/
+
+uchar *ma_store_length(uchar *to, ulong nr)
+{
+  if (nr < 251)
+  {
+    to[0]= (uchar) nr;
+    return to + 1;
+  }
+  if (nr <= 255)
+  {
+    to[0]= (uchar) 251;
+    to[1]= (uchar) nr;
+    return to + 2;
+  }
+  if (nr < 65536)
+  {
+    to[0]= (uchar) 252;
+    int2store(to + 1, nr);
+    return to + 3;
+  }
+  if (nr < 16777216)
+  {
+    to[0]= (uchar) 253;
+    int3store(to + 1, nr);
+    return to + 4;
+  }
+  to[0]= (uchar) 254;
+  int4store(to + 1, nr);
+  return to + 5;
+}
+
+
+/* Calculate how many bytes needed to store a number */
+
+/*
+  Return the number of bytes ma_store_length() would use for 'nr'.
+  The thresholds mirror ma_store_length() exactly.
+*/
+
+uint ma_calc_length_for_store_length(ulong nr)
+{
+  if (nr < 251)
+    return 1;                                   /* stored as-is */
+  if (nr <= 255)
+    return 2;                                   /* 251 marker + 1 byte */
+  if (nr < 65536)
+    return 3;                                   /* 252 marker + 2 bytes */
+  if (nr < 16777216)
+    return 4;                                   /* 253 marker + 3 bytes */
+  return 5;                                     /* 254 marker + 4 bytes */
+}
+
+
+/* Retrieve a stored number */
+
+static ulong ma_get_length(const uchar **packet)
+{
+  reg1 const uchar *pos= *packet;
+  if (*pos < 251)
+  {
+    /* Value stored directly in one byte */
+    (*packet)++;
+    return (ulong) *pos;
+  }
+  if (*pos == 251)
+  {
+    /* Marker 251: 1-byte value follows */
+    (*packet)+= 2;
+    return (ulong) pos[1];
+  }
+  if (*pos == 252)
+  {
+    /* Marker 252: 2-byte value follows */
+    (*packet)+= 3;
+    return (ulong) uint2korr(pos+1);
+  }
+  if (*pos == 253)
+  {
+    /* Marker 253: 3-byte value follows */
+    (*packet)+= 4;
+    return (ulong) uint3korr(pos+1);
+  }
+  /* Marker 254: 4-byte value follows (see ma_store_length()) */
+  DBUG_ASSERT(*pos == 254);
+  (*packet)+= 5;
+  return (ulong) uint4korr(pos+1);
+}
+
+
+/*
+ Fill array with pointers to field parts to be stored in log for insert
+
+ SYNOPSIS
+ fill_insert_undo_parts()
+ info Maria handler
+ record Inserted row
+ log_parts Store pointers to changed memory areas here
+ log_parts_count See RETURN
+
+ NOTES
+ We have information in info->cur_row about the read row.
+
+ RETURN
+ length of data in log_parts.
+ log_parts_count contains number of used log_parts
+*/
+
+static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
+                                     LEX_CUSTRING *log_parts,
+                                     uint *log_parts_count)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_COLUMNDEF *column, *end_column;
+  uchar *field_lengths= info->cur_row.field_lengths;
+  size_t row_length;
+  MARIA_ROW *cur_row= &info->cur_row;
+  LEX_CUSTRING *start_log_parts;
+  DBUG_ENTER("fill_insert_undo_parts");
+
+  start_log_parts= log_parts;
+
+  /* Store null bits */
+  log_parts->str= record;
+  log_parts->length= share->base.null_bytes;
+  row_length= log_parts->length;
+  log_parts++;
+
+  /* Stored bitmap over packed (zero length or all-zero fields) */
+  log_parts->str= info->cur_row.empty_bits;
+  log_parts->length= share->base.pack_bytes;
+  row_length+= log_parts->length;
+  log_parts++;
+
+  if (share->base.max_field_lengths)
+  {
+    /* Store length of all not empty char, varchar and blob fields */
+    /*
+      NOTE(review): the 2 bytes before field_lengths appear to be reserved
+      by the buffer allocator for this length prefix -- confirm against
+      the allocation of cur_row.field_lengths.
+    */
+    log_parts->str= field_lengths - 2;
+    log_parts->length= info->cur_row.field_lengths_length+2;
+    int2store(log_parts->str, info->cur_row.field_lengths_length);
+    row_length+= log_parts->length;
+    log_parts++;
+  }
+
+  if (share->base.blobs)
+  {
+    /*
+      Store total blob length to make buffer allocation easier during UNDO
+    */
+    log_parts->str= info->length_buff;
+    log_parts->length= (uint) (ma_store_length(info->length_buff,
+                                               info->cur_row.blob_length) -
+                               (uchar*) log_parts->str);
+    row_length+= log_parts->length;
+    log_parts++;
+  }
+
+  /* Handle constant length fields that are always present */
+  for (column= share->columndef,
+       end_column= column+ share->base.fixed_not_null_fields;
+       column < end_column;
+       column++)
+  {
+    log_parts->str= record + column->offset;
+    log_parts->length= column->length;
+    row_length+= log_parts->length;
+    log_parts++;
+  }
+
+  /* Handle NULL fields and CHAR/VARCHAR fields */
+  for (end_column= share->columndef + share->base.fields - share->base.blobs;
+       column < end_column;
+       column++)
+  {
+    const uchar *column_pos;
+    size_t column_length;
+    /* NULL or empty columns contribute no data part to the log */
+    if ((record[column->null_pos] & column->null_bit) ||
+        cur_row->empty_bits[column->empty_pos] & column->empty_bit)
+      continue;
+
+    column_pos=    record+ column->offset;
+    column_length= column->length;
+
+    switch (column->type) {
+    case FIELD_CHECK:
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_ZERO:
+    case FIELD_SKIP_PRESPACE:                   /* Not packed */
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      break;
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      /* Real length is in field_lengths: 1 byte if max length <= 255 */
+      if (column->length <= 255)
+        column_length= *field_lengths++;
+      else
+      {
+        column_length= uint2korr(field_lengths);
+        field_lengths+= 2;
+      }
+      break;
+    }
+    case FIELD_VARCHAR:
+    {
+      /* fill_length is the size of the length prefix (1 or 2 bytes) */
+      if (column->fill_length == 1)
+        column_length= *field_lengths;
+      else
+        column_length= uint2korr(field_lengths);
+      field_lengths+= column->fill_length;
+      column_pos+= column->fill_length;         /* Skip prefix, log data only */
+      break;
+    }
+    default:
+      DBUG_ASSERT(0);
+    }
+    log_parts->str= column_pos;
+    log_parts->length= column_length;
+    row_length+= log_parts->length;
+    log_parts++;
+  }
+
+  /* Add blobs */
+  for (end_column+= share->base.blobs; column < end_column; column++)
+  {
+    const uchar *field_pos= record + column->offset;
+    /* In-record blob format: length bytes followed by a pointer to the data */
+    uint size_length= column->length - portable_sizeof_char_ptr;
+    ulong blob_length= _ma_calc_blob_length(size_length, field_pos);
+
+    /*
+      We don't have to check for null, as blob_length is guaranteed to be 0
+      if the blob is null
+    */
+    if (blob_length)
+    {
+      uchar *blob_pos;
+      memcpy_fixed(&blob_pos, record + column->offset + size_length,
+                   sizeof(blob_pos));
+      log_parts->str= blob_pos;
+      log_parts->length= blob_length;
+      row_length+= log_parts->length;
+      log_parts++;
+    }
+  }
+  *log_parts_count= (uint) (log_parts - start_log_parts);
+  DBUG_RETURN(row_length);
+}
+
+
+/*
+ Fill array with pointers to field parts to be stored in log for update
+
+ SYNOPSIS
+ fill_update_undo_parts()
+ info Maria handler
+ oldrec Original row
+ newrec New row
+ log_parts Store pointers to changed memory areas here
+ log_parts_count See RETURN
+
+ IMPLEMENTATION
+ Format of undo record:
+
+ Fields are stored in same order as the field array.
+
+ Offset to changed field data (packed)
+
+ For each changed field
+ Fieldnumber (packed)
+ Length, if variable length field (packed)
+
+ For each changed field
+ Data
+
+ Packing is using ma_store_integer()
+
+ The reason we store field numbers & length separated from data (ie, not
+ after each other) is to get better cpu caching when we loop over
+ fields (as we probably don't have to access data for each field when we
+ want to read and old row through the undo log record).
+
+ As a special case, we use '255' for the field number of the null bitmap.
+
+ RETURN
+ length of data in log_parts.
+ log_parts_count contains number of used log_parts
+*/
+
+static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
+                                     const uchar *newrec,
+                                     LEX_CUSTRING *log_parts,
+                                     uint *log_parts_count)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_COLUMNDEF *column, *end_column;
+  MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row;
+  uchar *field_data, *start_field_data, *length_str;
+  uchar *old_field_lengths= old_row->field_lengths;
+  uchar *new_field_lengths= new_row->field_lengths;
+  size_t row_length= 0;
+  uint field_lengths;
+  LEX_CUSTRING *start_log_parts;
+  my_bool new_column_is_empty;
+  DBUG_ENTER("fill_update_undo_parts");
+
+  start_log_parts= log_parts;
+
+  /*
+    First log part is for number of fields, field numbers and lengths
+    The +4 is to reserve place for the number of changed fields.
+  */
+  start_field_data= field_data= info->update_field_data + 4;
+  log_parts++;                                  /* Filled in at end of function */
+
+  if (memcmp(oldrec, newrec, share->base.null_bytes))
+  {
+    /* Store changed null bits */
+    *field_data++= (uchar) 255;                 /* Special case */
+    log_parts->str= oldrec;
+    log_parts->length= share->base.null_bytes;
+    row_length= log_parts->length;
+    log_parts++;
+  }
+
+  /* Handle constant length fields */
+  for (column= share->columndef,
+       end_column= column+ share->base.fixed_not_null_fields;
+       column < end_column;
+       column++)
+  {
+    if (memcmp(oldrec + column->offset, newrec + column->offset,
+               column->length))
+    {
+      /* Log field number and the old (pre-update) value */
+      field_data= ma_store_length(field_data,
+                                  (uint) (column - share->columndef));
+      log_parts->str= oldrec + column->offset;
+      log_parts->length= column->length;
+      row_length+= column->length;
+      log_parts++;
+    }
+  }
+
+  /* Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's */
+  for (end_column= share->columndef + share->base.fields;
+       column < end_column;
+       column++)
+  {
+    const uchar *new_column_pos, *old_column_pos;
+    size_t new_column_length, old_column_length;
+
+    /* First check if old column is null or empty */
+    if (oldrec[column->null_pos] & column->null_bit)
+    {
+      /*
+        It's safe to skip this one as either the new column is also null
+        (no change) or the new_column is not null, in which case the null-bit
+        maps differed and we have already stored the null bitmap.
+      */
+      continue;
+    }
+    if (old_row->empty_bits[column->empty_pos] & column->empty_bit)
+    {
+      if (new_row->empty_bits[column->empty_pos] & column->empty_bit)
+        continue;                               /* Both are empty; skip */
+
+      /* Store null length column */
+      field_data= ma_store_length(field_data,
+                                  (uint) (column - share->columndef));
+      field_data= ma_store_length(field_data, 0);
+      continue;
+    }
+    /*
+      Remember if the 'new' value is empty (as in this case we must always
+      log the original value
+    */
+    new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) ||
+                          (new_row->empty_bits[column->empty_pos] &
+                           column->empty_bit));
+
+    old_column_pos=      oldrec + column->offset;
+    new_column_pos=      newrec + column->offset;
+    old_column_length= new_column_length= column->length;
+
+    switch (column->type) {
+    case FIELD_CHECK:
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_ZERO:
+    case FIELD_SKIP_PRESPACE:                   /* Not packed */
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      break;
+    case FIELD_VARCHAR:
+      new_column_length--;                      /* Skip length prefix */
+      old_column_pos+= column->fill_length;
+      new_column_pos+= column->fill_length;
+      /* Fall through */
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      /* Actual lengths come from the row's field_lengths arrays */
+      if (new_column_length <= 255)
+      {
+        old_column_length= *old_field_lengths++;
+        if (!new_column_is_empty)
+          new_column_length= *new_field_lengths++;
+      }
+      else
+      {
+        old_column_length= uint2korr(old_field_lengths);
+        old_field_lengths+= 2;
+        if (!new_column_is_empty)
+        {
+          new_column_length= uint2korr(new_field_lengths);
+          new_field_lengths+= 2;
+        }
+      }
+      break;
+    }
+    case FIELD_BLOB:
+    {
+      /* Resolve blob length and redirect pos to the out-of-record data */
+      uint size_length= column->length - portable_sizeof_char_ptr;
+      old_column_length= _ma_calc_blob_length(size_length, old_column_pos);
+      memcpy_fixed((uchar*) &old_column_pos,
+                   oldrec + column->offset + size_length,
+                   sizeof(old_column_pos));
+      if (!new_column_is_empty)
+      {
+        new_column_length= _ma_calc_blob_length(size_length, new_column_pos);
+        memcpy_fixed((uchar*) &new_column_pos,
+                     newrec + column->offset + size_length,
+                     sizeof(old_column_pos));
+      }
+      break;
+    }
+    default:
+      DBUG_ASSERT(0);
+    }
+
+    /* Log the old value if it differs from the new one (or new is empty) */
+    if (new_column_is_empty || new_column_length != old_column_length ||
+        memcmp(old_column_pos, new_column_pos, new_column_length))
+    {
+      field_data= ma_store_length(field_data,
+                                  (ulong) (column - share->columndef));
+      field_data= ma_store_length(field_data, (ulong) old_column_length);
+
+      log_parts->str=     old_column_pos;
+      log_parts->length=  old_column_length;
+      row_length+=        old_column_length;
+      log_parts++;
+    }
+  }
+
+  *log_parts_count= (uint) (log_parts - start_log_parts);
+
+  /* Store length of field length data before the field/field_lengths */
+  field_lengths= (uint) (field_data - start_field_data);
+  length_str= start_field_data - ma_calc_length_for_store_length(field_lengths);
+  start_log_parts->str= length_str;
+  ma_store_length(length_str, field_lengths);
+  start_log_parts->length= (size_t) (field_data - start_log_parts->str);
+  row_length+= start_log_parts->length;
+  DBUG_RETURN(row_length);
+}
+
+/***************************************************************************
+ In-write hooks called under log's lock when log record is written
+***************************************************************************/
+
+/**
+ @brief Sets transaction's rec_lsn if needed
+
+ A transaction sometimes writes a REDO even before the page is in the
+ pagecache (example: brand new head or tail pages; full pages). So, if
+ Checkpoint happens just after the REDO write, it needs to know that the
+ REDO phase must start before this REDO. Scanning the pagecache cannot
+ tell that as the page is not in the cache. So, transaction sets its rec_lsn
+ to the REDO's LSN or somewhere before, and Checkpoint reads the
+ transaction's rec_lsn.
+
+ @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_redo(enum translog_record_type type
+                            __attribute__ ((unused)),
+                            TRN *trn, MARIA_HA *tbl_info
+                            __attribute__ ((unused)),
+                            LSN *lsn, void *hook_arg
+                            __attribute__ ((unused)))
+{
+  /*
+    dummy_transaction_object (trid == 0) must stay clean: it is shared by
+    many threads working on non-transactional tables, and a rec_lsn set by
+    one user could leak into another user's pages. Non-transactional log
+    records (REPAIR, CREATE, RENAME, DROP) therefore must not call this
+    hook; we trust them but verify ;)
+  */
+  DBUG_ASSERT(trn->trid != 0);
+  /*
+    If the hook stays this simple, it would be faster to pass
+    !trn->rec_lsn ? trn->rec_lsn : some_dummy_lsn
+    to translog_write_record(), like Monty did in his original code, and
+    not have a hook. For now we keep it like this.
+  */
+  if (!trn->rec_lsn)
+    trn->rec_lsn= *lsn;                         /* First REDO of this trn */
+  return 0;
+}
+
+
+/**
+ @brief Sets transaction's undo_lsn, first_undo_lsn if needed
+
+ @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_undo(enum translog_record_type type
+                            __attribute__ ((unused)),
+                            TRN *trn, MARIA_HA *tbl_info
+                            __attribute__ ((unused)),
+                            LSN *lsn, void *hook_arg
+                            __attribute__ ((unused)))
+{
+  /* Must not be called for the shared dummy transaction object */
+  DBUG_ASSERT(trn->trid != 0);
+  /*
+    when we implement purging, we will specialize this hook: UNDO_PURGE
+    records will additionally set trn->undo_purge_lsn
+  */
+  trn->undo_lsn= *lsn;
+  /* Remember the very first UNDO, keeping any flag bits already stored */
+  if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0))
+    trn->first_undo_lsn=
+      trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
+  return 0;
+}
+
+
+/**
+ @brief Sets the table's records count and checksum and others to 0, then
+ calls the generic REDO hook.
+
+ @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_redo_delete_all(enum translog_record_type type
+                                       __attribute__ ((unused)),
+                                       TRN *trn, MARIA_HA *tbl_info
+                                       __attribute__ ((unused)),
+                                       LSN *lsn, void *hook_arg)
+{
+  /* Zero the table state under the log lock, then do the normal REDO work */
+  _ma_reset_status(tbl_info);
+  return write_hook_for_redo(type, trn, tbl_info, lsn, hook_arg);
+}
+
+
+/**
+ @brief Updates "records" and "checksum" and calls the generic UNDO hook
+
+ @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_undo_row_insert(enum translog_record_type type
+                                       __attribute__ ((unused)),
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg)
+{
+  MARIA_STATE_INFO *state= &tbl_info->s->state;
+  /* hook_arg carries the checksum delta for the inserted row */
+  state->state.checksum+= *(ha_checksum *) hook_arg;
+  state->state.records++;
+  return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
+}
+
+
+/**
+  @brief Updates "records" and "checksum" and calls the generic UNDO hook
+
+  @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_undo_row_delete(enum translog_record_type type
+                                       __attribute__ ((unused)),
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg)
+{
+  MARIA_STATE_INFO *state= &tbl_info->s->state;
+  /* hook_arg carries the checksum delta for the deleted row */
+  state->state.checksum+= *(ha_checksum *) hook_arg;
+  state->state.records--;
+  return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
+}
+
+
+/**
+  @brief Updates "checksum" and calls the generic UNDO hook
+
+  @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_undo_row_update(enum translog_record_type type
+                                       __attribute__ ((unused)),
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg)
+{
+  /* An update changes only the checksum; the record count is unchanged */
+  tbl_info->s->state.state.checksum+= *(ha_checksum *) hook_arg;
+  return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
+}
+
+
+my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type
+                                        __attribute__ ((unused)),
+                                        TRN *trn, MARIA_HA *tbl_info,
+                                        LSN *lsn, void *hook_arg)
+{
+  /*
+    maria_delete_all_rows() will follow, but without logging and syncing,
+    as an optimization: if we crash before commit the UNDO will empty the
+    table; if we crash after commit the files were flushed and forced.
+    The status reset must still happen under the log mutex so a concurrent
+    checkpoint sees a consistent state.
+  */
+  _ma_reset_status(tbl_info);
+  return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
+}
+
+
+/**
+ @brief Updates table's lsn_of_file_id.
+
+ @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_file_id(enum translog_record_type type
+                               __attribute__ ((unused)),
+                               TRN *trn
+                               __attribute__ ((unused)),
+                               MARIA_HA *tbl_info,
+                               LSN *lsn,
+                               void *hook_arg
+                               __attribute__ ((unused)))
+{
+  MARIA_SHARE *share= tbl_info->s;
+  /* FILE_ID records must be written with strictly increasing LSNs */
+  DBUG_ASSERT(cmp_translog_addr(share->lsn_of_file_id, *lsn) < 0);
+  share->lsn_of_file_id= *lsn;
+  return 0;
+}
+
+
+/**
+ Updates transaction's rec_lsn when committing.
+
+ A transaction writes its commit record before being committed in trnman, so
+ if Checkpoint happens just between the COMMIT record log write and the
+ commit in trnman, it will record that transaction is not committed. Assume
+ the transaction (trn1) did an INSERT; after the checkpoint, a second
+ transaction (trn2) does a DELETE of what trn1 has inserted. Then crash,
+ Checkpoint record says that trn1 was not committed, and REDO phase starts
+ from Checkpoint record's LSN. So it will not find the COMMIT record of
+ trn1, will want to roll back trn1, which will fail because the row/key
+ which it wants to delete does not exist anymore.
+ To avoid this, Checkpoint needs to know that the REDO phase must start
+ before this COMMIT record, so transaction sets its rec_lsn to the COMMIT's
+ record LSN, and as Checkpoint reads the transaction's rec_lsn, Checkpoint
+ will know.
+
+ @note so after commit trn->rec_lsn is a "commit LSN", which could be of
+ use later.
+
+ @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_commit(enum translog_record_type type
+                              __attribute__ ((unused)),
+                              TRN *trn,
+                              MARIA_HA *tbl_info
+                              __attribute__ ((unused)),
+                              LSN *lsn,
+                              void *hook_arg
+                              __attribute__ ((unused)))
+{
+  /* See function header: makes Checkpoint start REDO before this COMMIT */
+  trn->rec_lsn= *lsn;
+  return 0;
+}
+
+
+/***************************************************************************
+ Applying of REDO log records
+***************************************************************************/
+
+/*
+ Apply changes to head and tail pages
+
+ SYNOPSIS
+ _ma_apply_redo_insert_row_head_or_tail()
+ info Maria handler
+ lsn LSN to put on page
+ page_type HEAD_PAGE or TAIL_PAGE
+ new_page True if this is first entry on page
+ header Header (without FILEID)
+ data Data to be put on page
+ data_length Length of data
+
+ NOTE
+ Handles LOGREC_REDO_INSERT_ROW_HEAD, LOGREC_REDO_INSERT_ROW_TAIL
+ LOGREC_REDO_NEW_ROW_HEAD and LOGREC_REDO_NEW_ROW_TAIL
+
+ RETURN
+ 0 ok
+ # Error number
+*/
+
+uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
+                                            uint page_type,
+                                            my_bool new_page,
+                                            const uchar *header,
+                                            const uchar *data,
+                                            size_t data_length)
+{
+  MARIA_SHARE *share= info->s;
+  pgcache_page_no_t page;
+  uint      rownr, empty_space;
+  uint      block_size= share->block_size;
+  uint      rec_offset;
+  uchar      *buff, *dir;
+  uint      result;
+  MARIA_PINNED_PAGE page_link;
+  enum pagecache_page_lock unlock_method;
+  enum pagecache_page_pin unpin_method;
+  my_off_t end_of_page;
+  uint error;
+  DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail");
+
+  /* Header layout: page number followed by directory position */
+  page=  page_korr(header);
+  rownr= dirpos_korr(header + PAGE_STORE_SIZE);
+
+  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u  data_length: %u",
+                       (ulong) ma_recordpos(page, rownr),
+                       (ulong) page, rownr, (uint) data_length));
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+
+  end_of_page= (page + 1) * share->block_size;
+  if (end_of_page > share->state.state.data_file_length)
+  {
+    DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
+                        (ulong) share->state.state.data_file_length,
+                        (ulong) end_of_page));
+    /*
+      New page at end of file. Note that the test above is also positive if
+      data_file_length is not a multiple of block_size (system crashed while
+      writing the last page): in this case we just extend the last page and
+      fill it entirely with zeroes, then the REDO will put correct data on
+      it.
+    */
+    unlock_method= PAGECACHE_LOCK_WRITE;
+    unpin_method=  PAGECACHE_PIN;
+
+    /* A page past EOF can only be a fresh page, filled from offset 0 */
+    DBUG_ASSERT(rownr == 0 && new_page);
+    if (rownr != 0 || !new_page)
+      goto crashed_file;
+
+    buff= info->keyread_buff;
+    info->keyread_buff_used= 1;
+    make_empty_page(info, buff, page_type, 1);
+    empty_space= (block_size - PAGE_OVERHEAD_SIZE);
+    rec_offset= PAGE_HEADER_SIZE;
+    dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
+  }
+  else
+  {
+    unlock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+    unpin_method=  PAGECACHE_PIN_LEFT_PINNED;
+
+    /* Suppress error popups; read failures are handled explicitly below */
+    share->pagecache->readwrite_flags&= ~MY_WME;
+    buff= pagecache_read(share->pagecache, &info->dfile,
+                         page, 0, 0,
+                         PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                         &page_link.link);
+    share->pagecache->readwrite_flags= share->pagecache->org_readwrite_flags;
+    if (!buff)
+    {
+      /* Skip errors when reading outside of file and uninitialized pages */
+      if (!new_page || (my_errno != HA_ERR_FILE_TOO_SHORT &&
+                        my_errno != HA_ERR_WRONG_CRC))
+      {
+        DBUG_PRINT("error", ("Error %d when reading page", (int) my_errno));
+        goto err;
+      }
+      /* Create new page */
+      buff= pagecache_block_link_to_buffer(page_link.link);
+      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
+    }
+    else if (lsn_korr(buff) >= lsn)           /* Test if already applied */
+    {
+      /* Fix bitmap, just in case */
+      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+      if (!enough_free_entries_on_page(share, buff))
+        empty_space= 0;                         /* Page is full */
+
+      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+        goto err;
+      pagecache_unlock_by_link(share->pagecache, page_link.link,
+                               PAGECACHE_LOCK_WRITE_UNLOCK,
+                               PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                               LSN_IMPOSSIBLE, 0, FALSE);
+      DBUG_RETURN(0);
+    }
+
+    if (((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type))
+    {
+      /*
+        This is a page that has been freed before and now should be
+        changed to new type.
+      */
+      if (!new_page)
+      {
+        DBUG_PRINT("error",
+                   ("Found page of wrong type: %u, should have been %u",
+                    (uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK),
+                    page_type));
+        goto crashed_file;
+      }
+      make_empty_page(info, buff, page_type, 0);
+      empty_space= block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE;
+      /* Grow the directory so that an entry for 'rownr' exists */
+      (void) extend_directory(page_type == HEAD_PAGE ? info: 0, buff,
+                              block_size, 0, rownr, &empty_space);
+      rec_offset= PAGE_HEADER_SIZE;
+      dir= dir_entry_pos(buff, block_size, rownr);
+      empty_space+= uint2korr(dir+2);
+    }
+    else
+    {
+      uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
+      uint length;
+
+      DBUG_ASSERT(!new_page);
+      dir= dir_entry_pos(buff, block_size, rownr);
+      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+
+      if (max_entry <= rownr)
+      {
+        /* Add directory entry first in directory and data last on page */
+        if (extend_directory(page_type == HEAD_PAGE ? info : 0, buff,
+                             block_size, max_entry, rownr, &empty_space))
+          goto crashed_file;
+      }
+      /* Make room on the page for data_length bytes at this dir entry */
+      if (extend_area_on_page(page_type == HEAD_PAGE ? info : 0, buff,
+                              dir, rownr, block_size,
+                              (uint) data_length, &empty_space,
+                              &rec_offset, &length))
+        goto crashed_file;
+    }
+  }
+  /* Copy data */
+  int2store(dir+2, data_length);
+  memcpy(buff + rec_offset, data, data_length);
+  empty_space-= (uint) data_length;
+  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
+
+  /*
+    If page was not read before, write it but keep it pinned.
+    We don't update its LSN When we have processed all REDOs for this page
+    in the current REDO's group, we will stamp page with UNDO's LSN
+    (if we stamped it now, a next REDO, in
+    this group, for this page, would be skipped) and unpin then.
+  */
+  result= 0;
+  if (unlock_method == PAGECACHE_LOCK_WRITE &&
+      pagecache_write(share->pagecache,
+                      &info->dfile, page, 0,
+                      buff, PAGECACHE_PLAIN_PAGE,
+                      unlock_method, unpin_method,
+                      PAGECACHE_WRITE_DELAY, &page_link.link,
+                      LSN_IMPOSSIBLE))
+    result= my_errno;
+
+  /* Fix bitmap */
+  if (!enough_free_entries_on_page(share, buff))
+    empty_space= 0;                         /* Page is full */
+  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+    goto err;
+
+  /* Page stays pinned until the whole REDO group has been applied */
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= 1;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+  /*
+    Data page and bitmap page are in place, we can update data_file_length in
+    case we extended the file. We could not do it earlier: bitmap code tests
+    data_file_length to know if it has to create a new page or not.
+  */
+  set_if_bigger(share->state.state.data_file_length, end_of_page);
+  DBUG_RETURN(result);
+
+crashed_file:
+  my_errno= HA_ERR_WRONG_IN_RECORD;
+err:
+  error= my_errno;
+  /* Only unlock if we hold a lock taken by pagecache_read() above */
+  if (unlock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED)
+    pagecache_unlock_by_link(share->pagecache, page_link.link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 0, FALSE);
+  _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);                           /* catch recovery errors early */
+  DBUG_RETURN((my_errno= error));
+}
+
+
+/*
+ Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL
+
+ SYNOPSIS
+ _ma_apply_redo_purge_row_head_or_tail()
+ info Maria handler
+ lsn LSN to put on page
+ page_type HEAD_PAGE or TAIL_PAGE
+ header Header (without FILEID)
+
+ NOTES
+ This function is very similar to delete_head_or_tail()
+
+ RETURN
+ 0 ok
+ # Error number
+*/
+
+uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
+                                           uint page_type,
+                                           const uchar *header)
+{
+  MARIA_SHARE *share= info->s;
+  pgcache_page_no_t page;
+  uint      rownr, empty_space;
+  uint      block_size= share->block_size;
+  uchar     *buff;
+  int result;
+  uint error;
+  MARIA_PINNED_PAGE page_link;
+  DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail");
+
+  /* Header layout: page number followed by directory position */
+  page=  page_korr(header);
+  rownr= dirpos_korr(header+PAGE_STORE_SIZE);
+  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
+                       (ulong) ma_recordpos(page, rownr),
+                       (ulong) page, rownr));
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+
+  if (!(buff= pagecache_read(share->pagecache, &info->dfile,
+                             page, 0, 0,
+                             PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                             &page_link.link)))
+    goto err;
+
+  if (lsn_korr(buff) >= lsn)
+  {
+    /*
+      Already applied
+      Note that in case the page is not anymore a head or tail page
+      a future redo will fix the bitmap.
+    */
+    if ((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type)
+    {
+      empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET);
+      if (!enough_free_entries_on_page(share, buff))
+        empty_space= 0;                         /* Page is full */
+      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE,
+                         empty_space))
+        goto err;
+    }
+    pagecache_unlock_by_link(share->pagecache, page_link.link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 0, FALSE);
+    DBUG_RETURN(0);
+  }
+
+  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type);
+
+  /* Remove the row's directory entry; updates empty_space accordingly */
+  if (delete_dir_entry(buff, block_size, rownr, &empty_space) < 0)
+  {
+    my_errno= HA_ERR_WRONG_IN_RECORD;
+    goto err;
+  }
+
+  /* Keep the page pinned until the whole REDO group has been applied */
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= 1;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+  result= 0;
+  if (!enough_free_entries_on_page(share, buff))
+    empty_space= 0;                         /* Page is full */
+  /* This will work even if the page was marked as UNALLOCATED_PAGE */
+  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+    result= my_errno;
+
+  DBUG_RETURN(result);
+
+err:
+  error= my_errno;
+  pagecache_unlock_by_link(share->pagecache, page_link.link,
+                           PAGECACHE_LOCK_WRITE_UNLOCK,
+                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                           LSN_IMPOSSIBLE, 0, FALSE);
+  _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);                           /* catch recovery errors early */
+  DBUG_RETURN((my_errno= error));
+
+}
+
+
+/**
+ @brief Apply LOGREC_REDO_FREE_BLOCKS
+
+ @param info Maria handler
+ @param header Header (without FILEID)
+
+ @note It marks the pages free in the bitmap
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+uint _ma_apply_redo_free_blocks(MARIA_HA *info,
+                                LSN lsn __attribute__((unused)),
+                                const uchar *header)
+{
+  MARIA_SHARE *share= info->s;
+  uint ranges;
+  DBUG_ENTER("_ma_apply_redo_free_blocks");
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+
+  /* Header layout: range count, then (page, page_count) pairs */
+  ranges= pagerange_korr(header);
+  header+= PAGERANGE_STORE_SIZE;
+  DBUG_ASSERT(ranges > 0);
+
+  while (ranges--)
+  {
+    my_bool res;
+    uint page_range;
+    pgcache_page_no_t page, start_page;
+
+    start_page= page= page_korr(header);
+    header+= PAGE_STORE_SIZE;
+    /* Page range may have this bit set to indicate a tail page */
+    page_range= pagerange_korr(header) & ~(TAIL_BIT | START_EXTENT_BIT);
+    DBUG_ASSERT(page_range > 0);
+
+    header+= PAGERANGE_STORE_SIZE;
+
+    DBUG_PRINT("info", ("page: %lu  pages: %u", (long) page, page_range));
+
+    /** @todo leave bitmap lock to the bitmap code... */
+    pthread_mutex_lock(&share->bitmap.bitmap_lock);
+    res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, start_page,
+                                         page_range);
+    pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+    if (res)
+    {
+      /* Bitmap update failed: table cannot be trusted anymore */
+      _ma_mark_file_crashed(share);
+      DBUG_ASSERT(0);
+      DBUG_RETURN(res);
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Apply LOGREC_REDO_FREE_HEAD_OR_TAIL
+
+ @param info Maria handler
+ @param header Header (without FILEID)
+
+ @note It marks the page free in the bitmap, and sets the directory's count
+ to 0.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
+                                      const uchar *header)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *buff;
+  pgcache_page_no_t page;
+  MARIA_PINNED_PAGE page_link;
+  my_bool res;
+  DBUG_ENTER("_ma_apply_redo_free_head_or_tail");
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+
+  page= page_korr(header);
+
+  if (!(buff= pagecache_read(share->pagecache,
+                             &info->dfile,
+                             page, 0, 0,
+                             PAGECACHE_PLAIN_PAGE,
+                             PAGECACHE_LOCK_WRITE, &page_link.link)))
+  {
+    pagecache_unlock_by_link(share->pagecache, page_link.link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 0, FALSE);
+    goto err;
+  }
+  if (lsn_korr(buff) >= lsn)
+  {
+    /* Already applied */
+    pagecache_unlock_by_link(share->pagecache, page_link.link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 0, FALSE);
+  }
+  else
+  {
+    buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+    {
+      /* Also clear the directory so the page is byte-identical after crash */
+      uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
+      uchar *dir= dir_entry_pos(buff, share->block_size,
+                                number_of_records-1);
+      buff[DIR_FREE_OFFSET]=  END_OF_DIR_FREE_LIST;
+      bzero(dir, number_of_records * DIR_ENTRY_SIZE);
+    }
+#endif
+
+    /* Keep the page pinned until the whole REDO group has been applied */
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    page_link.changed= 1;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+  }
+  /** @todo leave bitmap lock to the bitmap code... */
+  pthread_mutex_lock(&share->bitmap.bitmap_lock);
+  res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, 1);
+  pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+  if (res)
+    goto err;
+  DBUG_RETURN(0);
+
+err:
+  _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);                           /* catch recovery errors early */
+  DBUG_RETURN(1);
+}
+
+
+/**
+   @brief Apply LOGREC_REDO_INSERT_ROW_BLOBS
+
+   @param      info            Maria handler
+   @param      lsn             LSN to put on pages
+   @param      header          Header (with FILEID)
+   @param      redo_lsn        REDO record's LSN
+   @param[out] number_of_blobs Number of blobs found in log record
+   @param[out] number_of_ranges Number of ranges found
+   @param[out] first_page      First page touched
+   @param[out] last_page       Last page touched
+
+   @note Write full pages (full head & blob pages)
+
+   @return Operation status
+     @retval 0      OK
+     @retval !=0    Error
+*/
+
+uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
+ LSN lsn, const uchar *header,
+ LSN redo_lsn,
+ uint * const number_of_blobs,
+ uint * const number_of_ranges,
+ pgcache_page_no_t * const first_page,
+ pgcache_page_no_t * const last_page)
+{
+ MARIA_SHARE *share= info->s;
+ const uchar *data;
+ uint data_size= FULL_PAGE_SIZE(share->block_size);
+ uint blob_count, ranges;
+ uint16 sid;
+ pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0;
+ DBUG_ENTER("_ma_apply_redo_insert_row_blobs");
+
+ share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
+ STATE_NOT_MOVABLE);
+
+ sid= fileid_korr(header);
+ header+= FILEID_STORE_SIZE;
+ *number_of_ranges= ranges= pagerange_korr(header);
+ header+= PAGERANGE_STORE_SIZE;
+ *number_of_blobs= blob_count= pagerange_korr(header);
+ header+= PAGERANGE_STORE_SIZE;
+ DBUG_ASSERT(ranges >= blob_count);
+
+ data= (header + ranges * ROW_EXTENT_SIZE +
+ blob_count * (SUB_RANGE_SIZE + BLOCK_FILLER_SIZE));
+
+ while (blob_count--)
+ {
+ uint sub_ranges, empty_space;
+
+ sub_ranges= uint2korr(header);
+ header+= SUB_RANGE_SIZE;
+ empty_space= uint2korr(header);
+ header+= BLOCK_FILLER_SIZE;
+ DBUG_ASSERT(sub_ranges <= ranges && empty_space < data_size);
+ ranges-= sub_ranges;
+
+ while (sub_ranges--)
+ {
+ uint i;
+ uint res;
+ uint page_range;
+ pgcache_page_no_t page, start_page;
+ uchar *buff;
+
+ start_page= page= page_korr(header);
+ header+= PAGE_STORE_SIZE;
+ page_range= pagerange_korr(header);
+ header+= PAGERANGE_STORE_SIZE;
+
+ for (i= page_range; i-- > 0 ; page++)
+ {
+ MARIA_PINNED_PAGE page_link;
+ enum pagecache_page_lock unlock_method;
+ enum pagecache_page_pin unpin_method;
+ uint length;
+
+ set_if_smaller(first_page2, page);
+ set_if_bigger(last_page2, page);
+ if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE))
+ continue;
+
+ if (((page + 1) * share->block_size) >
+ share->state.state.data_file_length)
+ {
+ /* New page or half written page at end of file */
+ DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
+ (ulong) share->state.state.data_file_length,
+ (ulong) ((page + 1 ) * share->block_size)));
+ share->state.state.data_file_length= (page + 1) * share->block_size;
+ buff= info->keyread_buff;
+ info->keyread_buff_used= 1;
+ make_empty_page(info, buff, BLOB_PAGE, 0);
+ unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED;
+ unpin_method= PAGECACHE_PIN_LEFT_UNPINNED;
+ }
+ else
+ {
+ share->pagecache->readwrite_flags&= ~MY_WME;
+ buff= pagecache_read(share->pagecache,
+ &info->dfile,
+ page, 0, 0,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE, &page_link.link);
+ share->pagecache->readwrite_flags= share->pagecache->
+ org_readwrite_flags;
+ if (!buff)
+ {
+ if (my_errno != HA_ERR_FILE_TOO_SHORT &&
+ my_errno != HA_ERR_WRONG_CRC)
+ {
+ /* If not read outside of file */
+ pagecache_unlock_by_link(share->pagecache, page_link.link,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+ LSN_IMPOSSIBLE, 0, FALSE);
+ goto err;
+ }
+ /*
+ Physical file was too short, create new page. It can be that
+ recovery started with a file with N pages, wrote page N+2 into
+ pagecache (increased data_file_length but not physical file
+ length), now reads page N+1: the read fails.
+ */
+ buff= pagecache_block_link_to_buffer(page_link.link);
+ make_empty_page(info, buff, BLOB_PAGE, 0);
+ }
+ else
+ {
+#ifndef DBUG_OFF
+ uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
+#endif
+ if (lsn_korr(buff) >= lsn)
+ {
+ /* Already applied */
+ DBUG_PRINT("info", ("already applied %llu >= %llu",
+ lsn_korr(buff), lsn));
+ pagecache_unlock_by_link(share->pagecache, page_link.link,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+ LSN_IMPOSSIBLE, 0, FALSE);
+ continue;
+ }
+ DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) ||
+ (found_page_type == (uchar) UNALLOCATED_PAGE));
+ }
+ unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK;
+ unpin_method= PAGECACHE_UNPIN;
+ }
+
+ /*
+ Blob pages are never updated twice in same redo-undo chain, so
+ it's safe to update lsn for them here
+ */
+ lsn_store(buff, lsn);
+ buff[PAGE_TYPE_OFFSET]= BLOB_PAGE;
+
+ length= data_size;
+ if (i == 0 && sub_ranges == 0)
+ {
+ /*
+ Last page may be only partly filled. We zero the rest, like
+ write_full_pages() does.
+ */
+ length-= empty_space;
+ bzero(buff + share->block_size - PAGE_SUFFIX_SIZE - empty_space,
+ empty_space);
+ }
+ memcpy(buff+ PAGE_TYPE_OFFSET + 1, data, length);
+ data+= length;
+ if (pagecache_write(share->pagecache,
+ &info->dfile, page, 0,
+ buff, PAGECACHE_PLAIN_PAGE,
+ unlock_method, unpin_method,
+ PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE))
+ goto err;
+ }
+ /** @todo leave bitmap lock to the bitmap code... */
+ pthread_mutex_lock(&share->bitmap.bitmap_lock);
+ res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, start_page,
+ page_range);
+ pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+ if (res)
+ goto err;
+ }
+ }
+ *first_page= first_page2;
+ *last_page= last_page2;
+ DBUG_RETURN(0);
+
+err:
+ _ma_mark_file_crashed(share);
+ DBUG_ASSERT(0);
+ DBUG_RETURN(1);
+}
+
+
+/****************************************************************************
+ Applying of UNDO entries
+****************************************************************************/
+
+/**
+ Execute undo of a row insert (delete the inserted row).
+
+ @param info Maria handler
+ @param undo_lsn LSN of the UNDO record being executed
+ @param header Rowid (page + dir entry) followed, if the table has a
+ live checksum, by the row's checksum
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error (table is marked crashed)
+*/
+
+my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header)
+{
+ pgcache_page_no_t page;
+ uint rownr;
+ uchar *buff;
+ my_bool res;
+ MARIA_PINNED_PAGE page_link;
+ MARIA_SHARE *share= info->s;
+ ha_checksum checksum;
+ LSN lsn;
+ DBUG_ENTER("_ma_apply_undo_row_insert");
+
+ page= page_korr(header);
+ header+= PAGE_STORE_SIZE;
+ rownr= dirpos_korr(header);
+ header+= DIRPOS_STORE_SIZE;
+ DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
+ (ulong) ma_recordpos(page, rownr),
+ (ulong) page, rownr));
+
+ buff= pagecache_read(share->pagecache,
+ &info->dfile, page, 0,
+ 0, share->page_type,
+ PAGECACHE_LOCK_WRITE,
+ &page_link.link);
+ page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+ page_link.changed= buff != 0;
+ push_dynamic(&info->pinned_pages, (void*) &page_link);
+ if (!buff)
+ goto err;
+
+ if (read_row_extent_info(info, buff, rownr))
+ goto err;
+
+ /* Keep bitmap non-flushable while head, tails and extents are deleted */
+ _ma_bitmap_flushable(info, 1);
+ if (delete_head_or_tail(info, page, rownr, 1, 1) ||
+ delete_tails(info, info->cur_row.tail_positions))
+ goto err;
+
+ if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
+ goto err;
+
+ /* Undoing the insert subtracts the row checksum from the table checksum */
+ checksum= 0;
+ if (share->calc_checksum)
+ checksum= (ha_checksum) 0 - ha_checksum_korr(header);
+ info->last_auto_increment= ~ (ulonglong) 0;
+ if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_ROW_INSERT,
+ share->calc_checksum != 0, checksum, &lsn, (void*) 0))
+ goto err;
+
+ res= 0;
+end:
+ if (info->non_flushable_state)
+ _ma_bitmap_flushable(info, -1);
+ _ma_unpin_all_pages_and_finalize_row(info, lsn);
+ DBUG_RETURN(res);
+
+err:
+ res= 1;
+ _ma_mark_file_crashed(share);
+ /*
+ Bug fix: 'lsn' was read uninitialized at 'end' when any step before
+ _ma_write_clr() failed. The file is already marked crashed and must be
+ repaired before use, so don't put a new LSN on the pinned pages.
+ */
+ lsn= LSN_IMPOSSIBLE;
+ goto end;
+}
+
+
+/** Execute undo of a row delete (insert the row back where it was) */
+
+my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header, size_t header_length
+ __attribute__((unused)))
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_ROW row;
+ MARIA_COLUMNDEF *column, *end_column;
+ MARIA_BITMAP_BLOCKS *blocks;
+ struct st_row_pos_info row_pos;
+ uchar *record;
+ const uchar *null_bits, *field_length_data, *extent_info;
+ pgcache_page_no_t page;
+ ulong *blob_lengths;
+ uint *null_field_lengths, extent_count, rownr, length_on_head_page;
+ DBUG_ENTER("_ma_apply_undo_row_delete");
+
+ /*
+ Use cur row as a base; We need to make a copy as we will change
+ some buffers to point directly to 'header'
+ */
+ memcpy(&row, &info->cur_row, sizeof(row));
+
+ /* Original rowid and on-page layout of the deleted row */
+ page= page_korr(header);
+ header+= PAGE_STORE_SIZE;
+ rownr= dirpos_korr(header);
+ header+= DIRPOS_STORE_SIZE;
+ length_on_head_page= uint2korr(header);
+ header+= 2;
+ extent_count= pagerange_korr(header);
+ header+= PAGERANGE_STORE_SIZE;
+ DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
+ (ulong) ma_recordpos(page, rownr),
+ (ulong) page, rownr));
+
+ if (share->calc_checksum)
+ {
+ /*
+ We extract the checksum delta here, saving a recomputation in
+ allocate_and_write_block_record(). It's only an optimization.
+ */
+ row.checksum= (ha_checksum) 0 - ha_checksum_korr(header);
+ header+= HA_CHECKSUM_STORE_SIZE;
+ }
+ extent_info= header;
+ header+= extent_count * ROW_EXTENT_SIZE;
+
+ null_field_lengths= row.null_field_lengths;
+ blob_lengths= row.blob_lengths;
+
+ /*
+ Fill in info->cur_row with information about the row, like in
+ calc_record_size(), to be used by write_block_record()
+ */
+
+ row.normal_length= row.char_length= row.varchar_length=
+ row.blob_length= row.extents_count= row.field_lengths_length= 0;
+
+ null_bits= header;
+ header+= share->base.null_bytes;
+ /* This will not be changed */
+ row.empty_bits= (uchar*) header;
+ header+= share->base.pack_bytes;
+ if (share->base.max_field_lengths)
+ {
+ row.field_lengths_length= uint2korr(header);
+ row.field_lengths= (uchar*) header + 2 ;
+ header+= 2 + row.field_lengths_length;
+ }
+ if (share->base.blobs)
+ row.blob_length= ma_get_length(&header);
+
+ /* We need to build up a record (without blobs) in rec_buff */
+ if (!(record= my_malloc(share->base.reclength, MYF(MY_WME))))
+ DBUG_RETURN(1);
+
+ memcpy(record, null_bits, share->base.null_bytes);
+
+ /* Copy field information from header to record */
+
+ /* Handle constant length fields that are always present */
+ for (column= share->columndef,
+ end_column= column+ share->base.fixed_not_null_fields;
+ column < end_column;
+ column++)
+ {
+ memcpy(record + column->offset, header, column->length);
+ header+= column->length;
+ }
+
+ /* Handle NULL fields and CHAR/VARCHAR fields */
+ field_length_data= row.field_lengths;
+ for (end_column= share->columndef + share->base.fields;
+ column < end_column;
+ column++, null_field_lengths++)
+ {
+ if ((record[column->null_pos] & column->null_bit) ||
+ row.empty_bits[column->empty_pos] & column->empty_bit)
+ {
+ /* NULL or empty field; fill it so checksum stays deterministic */
+ if (column->type != FIELD_BLOB)
+ *null_field_lengths= 0;
+ else
+ *blob_lengths++= 0;
+ if (share->calc_checksum)
+ bfill(record + column->offset, column->fill_length,
+ column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+ continue;
+ }
+ switch (column->type) {
+ case FIELD_CHECK:
+ case FIELD_NORMAL: /* Fixed length field */
+ case FIELD_ZERO:
+ case FIELD_SKIP_PRESPACE: /* Not packed */
+ case FIELD_SKIP_ZERO: /* Fixed length field */
+ row.normal_length+= column->length;
+ *null_field_lengths= column->length;
+ memcpy(record + column->offset, header, column->length);
+ header+= column->length;
+ break;
+ case FIELD_SKIP_ENDSPACE: /* CHAR */
+ {
+ /* Length is stored in 1 or 2 bytes depending on max column length */
+ uint length;
+ if (column->length <= 255)
+ length= (uint) *field_length_data++;
+ else
+ {
+ length= uint2korr(field_length_data);
+ field_length_data+= 2;
+ }
+ row.char_length+= length;
+ *null_field_lengths= length;
+ memcpy(record + column->offset, header, length);
+ if (share->calc_checksum)
+ bfill(record + column->offset + length, (column->length - length),
+ ' ');
+ header+= length;
+ break;
+ }
+ case FIELD_VARCHAR:
+ {
+ uint length;
+ uchar *field_pos= record + column->offset;
+
+ /* 256 is correct as this includes the length uchar */
+ if (column->fill_length == 1)
+ {
+ field_pos[0]= *field_length_data;
+ length= (uint) *field_length_data;
+ }
+ else
+ {
+ field_pos[0]= field_length_data[0];
+ field_pos[1]= field_length_data[1];
+ length= uint2korr(field_length_data);
+ }
+ field_length_data+= column->fill_length;
+ field_pos+= column->fill_length;
+ row.varchar_length+= length;
+ *null_field_lengths= length;
+ memcpy(field_pos, header, length);
+ header+= length;
+ break;
+ }
+ case FIELD_BLOB:
+ {
+ /* Copy length of blob and pointer to blob data to record */
+ uchar *field_pos= record + column->offset;
+ uint size_length= column->length - portable_sizeof_char_ptr;
+ ulong blob_length= _ma_calc_blob_length(size_length, field_length_data);
+
+ /* Blob data itself stays in 'header'; record gets a pointer to it */
+ memcpy(field_pos, field_length_data, size_length);
+ field_length_data+= size_length;
+ memcpy(field_pos + size_length, &header, sizeof(&header));
+ header+= blob_length;
+ *blob_lengths++= blob_length;
+ break;
+ }
+ default:
+ DBUG_ASSERT(0);
+ }
+ }
+ /* Compute total row size the same way calc_record_size() would */
+ row.head_length= (info->row_base_length +
+ share->base.fixed_not_null_fields_length +
+ row.field_lengths_length +
+ size_to_store_key_length(row.field_lengths_length) +
+ row.normal_length +
+ row.char_length + row.varchar_length);
+ row.total_length= (row.head_length + row.blob_length);
+ if (row.total_length < share->base.min_block_length)
+ row.total_length= share->base.min_block_length;
+
+ /*
+ Row is now generated. Now we need to insert record on the original
+ pages with original size on each page.
+ */
+
+ _ma_bitmap_flushable(info, 1);
+ /* Change extent information to be usable by write_block_record() */
+ blocks= &row.insert_blocks;
+ if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
+ goto err;
+ blocks->block->org_bitmap_value= _ma_bitmap_get_page_bits(info,
+ &share->bitmap,
+ page);
+ blocks->block->used|= BLOCKUSED_USE_ORG_BITMAP;
+
+ /* Read head page and allocate data for rowid */
+ if (get_rowpos_in_head_or_tail_page(info, blocks->block,
+ info->buff,
+ length_on_head_page,
+ HEAD_PAGE, PAGECACHE_LOCK_WRITE,
+ rownr, &row_pos))
+ goto err;
+
+ if (share->calc_checksum)
+ {
+ DBUG_ASSERT(row.checksum == (share->calc_checksum)(info, record));
+ }
+ /* Store same amount of data on head page as on original page */
+ row_pos.length= (length_on_head_page -
+ (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
+ set_if_bigger(row_pos.length, share->base.min_block_length);
+ if (write_block_record(info, (uchar*) 0, record, &row,
+ blocks, blocks->block->org_bitmap_value != 0,
+ &row_pos, undo_lsn, 0))
+ goto err;
+
+ my_free(record, MYF(0));
+ DBUG_RETURN(0);
+
+err:
+ _ma_mark_file_crashed(share);
+ if (info->non_flushable_state)
+ _ma_bitmap_flushable(info, -1);
+ _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+ my_free(record, MYF(0));
+ DBUG_RETURN(1);
+}
+
+
+/**
+ Execute undo of a row update
+
+ @fn _ma_apply_undo_row_update()
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header,
+ size_t header_length
+ __attribute__((unused)))
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_RECORD_POS record_pos;
+ const uchar *field_length_data, *field_length_data_end, *extent_info;
+ uchar *current_record, *orig_record;
+ pgcache_page_no_t page;
+ ha_checksum checksum_delta;
+ uint rownr, field_length_header, extent_count, length_on_head_page;
+ int error;
+ DBUG_ENTER("_ma_apply_undo_row_update");
+ LINT_INIT(checksum_delta);
+
+ /* Rowid of the updated row */
+ page= page_korr(header);
+ header+= PAGE_STORE_SIZE;
+ rownr= dirpos_korr(header);
+ header+= DIRPOS_STORE_SIZE;
+
+ record_pos= ma_recordpos(page, rownr);
+ DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
+ (ulong) record_pos, (ulong) page, rownr));
+
+ if (share->calc_checksum)
+ {
+ checksum_delta= ha_checksum_korr(header);
+ header+= HA_CHECKSUM_STORE_SIZE;
+ }
+ length_on_head_page= uint2korr(header);
+ set_if_bigger(length_on_head_page, share->base.min_block_length);
+ header+= 2;
+ extent_count= pagerange_korr(header);
+ header+= PAGERANGE_STORE_SIZE;
+ extent_info= header;
+ header+= extent_count * ROW_EXTENT_SIZE;
+
+ /*
+ Set header to point to old field values, generated by
+ fill_update_undo_parts()
+ */
+ field_length_header= ma_get_length(&header);
+ field_length_data= (uchar*) header;
+ header+= field_length_header;
+ field_length_data_end= header;
+
+ /* Allocate buffer for current row & original row */
+ if (!(current_record= my_malloc(share->base.reclength * 2, MYF(MY_WME))))
+ DBUG_RETURN(1);
+ orig_record= current_record+ share->base.reclength;
+
+ /* Read current record */
+ if (_ma_read_block_record(info, current_record, record_pos))
+ goto err;
+
+ if (*field_length_data == 255)
+ {
+ /* Bitmap changed */
+ field_length_data++;
+ memcpy(orig_record, header, share->base.null_bytes);
+ header+= share->base.null_bytes;
+ }
+ else
+ memcpy(orig_record, current_record, share->base.null_bytes);
+ bitmap_clear_all(&info->changed_fields);
+
+ /* Restore each changed field's old value into orig_record */
+ while (field_length_data < field_length_data_end)
+ {
+ uint field_nr= ma_get_length(&field_length_data), field_length;
+ MARIA_COLUMNDEF *column= share->columndef + field_nr;
+ uchar *orig_field_pos= orig_record + column->offset;
+
+ bitmap_set_bit(&info->changed_fields, field_nr);
+ if (field_nr >= share->base.fixed_not_null_fields)
+ {
+ if (!(field_length= ma_get_length(&field_length_data)))
+ {
+ /* Null field or empty field */
+ bfill(orig_field_pos, column->fill_length,
+ column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+ continue;
+ }
+ }
+ else
+ field_length= column->length;
+
+ switch (column->type) {
+ case FIELD_CHECK:
+ case FIELD_NORMAL: /* Fixed length field */
+ case FIELD_ZERO:
+ case FIELD_SKIP_PRESPACE: /* Not packed */
+ memcpy(orig_field_pos, header, column->length);
+ header+= column->length;
+ break;
+ case FIELD_SKIP_ZERO: /* Number */
+ case FIELD_SKIP_ENDSPACE: /* CHAR */
+ {
+ /* Stored value was stripped; refill the stripped tail */
+ uint diff;
+ memcpy(orig_field_pos, header, field_length);
+ if ((diff= (column->length - field_length)))
+ bfill(orig_field_pos + column->length - diff, diff,
+ column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+ header+= field_length;
+ }
+ break;
+ case FIELD_VARCHAR:
+ if (column->length <= 256)
+ {
+ *orig_field_pos++= (uchar) field_length;
+ }
+ else
+ {
+ int2store(orig_field_pos, field_length);
+ orig_field_pos+= 2;
+ }
+ memcpy(orig_field_pos, header, field_length);
+ header+= field_length;
+ break;
+ case FIELD_BLOB:
+ {
+ /* Old blob data stays in the log record; store pointer to it */
+ uint size_length= column->length - portable_sizeof_char_ptr;
+ _ma_store_blob_length(orig_field_pos, size_length, field_length);
+ memcpy_fixed(orig_field_pos + size_length, &header, sizeof(header));
+ header+= field_length;
+ break;
+ }
+ default:
+ DBUG_ASSERT(0);
+ }
+ }
+ copy_not_changed_fields(info, &info->changed_fields,
+ orig_record, current_record);
+
+ if (share->calc_checksum)
+ {
+ info->new_row.checksum= checksum_delta +
+ (info->cur_row.checksum= (*share->calc_checksum)(info, orig_record));
+ /* verify that record's content is sane */
+ DBUG_ASSERT(info->new_row.checksum ==
+ (*share->calc_checksum)(info, current_record));
+ }
+
+ info->last_auto_increment= ~ (ulonglong) 0;
+ /* Now records are up to date, execute the update to original values */
+ if (_ma_update_at_original_place(info, page, rownr, length_on_head_page,
+ extent_count, extent_info,
+ current_record, orig_record, undo_lsn))
+ goto err;
+
+ error= 0;
+end:
+ my_free(current_record, MYF(0));
+ DBUG_RETURN(error);
+
+err:
+ error= 1;
+ _ma_mark_file_crashed(share);
+ goto end;
+}
+
+
+/**
+ Execute undo of a bulk insert which used repair
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn)
+{
+ my_bool failed;
+ LSN lsn;
+ DBUG_ENTER("_ma_apply_undo_bulk_insert");
+ /*
+ Bulk insert is undone by deleting every row. Indexes are re-enabled
+ because bulk insert had disabled the non-unique ones; the state info
+ is then written back and a CLR is logged. Each step runs only if the
+ previous one succeeded.
+ */
+ failed= (maria_delete_all_rows(info) != 0);
+ if (!failed)
+ failed= (maria_enable_indexes(info) != 0);
+ if (!failed)
+ failed= (_ma_state_info_write(info->s,
+ MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+ MA_STATE_INFO_WRITE_FULL_INFO |
+ MA_STATE_INFO_WRITE_LOCK) != 0);
+ if (!failed)
+ failed= (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_BULK_INSERT,
+ FALSE, 0, &lsn, NULL) != 0);
+ DBUG_RETURN(failed);
+}
+
+
+/**
+ @brief Return the TRANSLOG_ADDRESS the log must be flushed up to
+ before this page may be written out
+
+ @param page Page's content
+ @param page_no Page's number (<offset>/<page length>)
+ @param data_ptr Callback data pointer (pointer to MARIA_SHARE)
+
+ @note
+ Usable for data (non-bitmap) and index pages
+
+ @retval LSN to flush up to (stored at the start of the page)
+*/
+
+TRANSLOG_ADDRESS
+maria_page_get_lsn(uchar *page,
+ pgcache_page_no_t page_no __attribute__((unused)),
+ uchar* data_ptr __attribute__((unused)))
+{
+ TRANSLOG_ADDRESS page_lsn;
+#ifndef DBUG_OFF
+ /* Callback argument must be the table's MARIA_SHARE; sanity-check it */
+ const MARIA_SHARE *tbl_share= (MARIA_SHARE*)data_ptr;
+ DBUG_ASSERT(tbl_share->page_type == PAGECACHE_LSN_PAGE);
+ DBUG_ASSERT(tbl_share->now_transactional);
+#endif
+ page_lsn= lsn_korr(page);
+ return page_lsn;
+}
+
+
+/**
+ @brief Enable reading of all rows, ignoring versioning
+
+ @note
+ Mainly useful in single-user applications, like maria_pack, that
+ want to read every row without consulting the transaction id from
+ the control file
+*/
+
+void maria_ignore_trids(MARIA_HA *info)
+{
+ if (!info->s->base.born_transactional)
+ return;
+ if (!info->trn)
+ _ma_set_trn_for_table(info, &dummy_transaction_object);
+ /* Make every row visible regardless of its transaction id */
+ info->trn->min_read_from= ~(TrID) 0;
+}
+
+
+#ifndef DBUG_OFF
+
+/* The following functions are useful to call from debugger */
+
+/* Dump the header and directory layout of a block page to stdout */
+void _ma_print_block_info(uchar *buff)
+{
+ LSN lsn= lsn_korr(buff);
+
+ printf("LSN: %lu,0x%lx type: %u dir_entries: %u dir_free: %u empty_space: %u\n",
+ LSN_IN_PARTS(lsn),
+ (uint)buff[PAGE_TYPE_OFFSET],
+ (uint)buff[DIR_COUNT_OFFSET],
+ (uint)buff[DIR_FREE_OFFSET],
+ (uint) uint2korr(buff + EMPTY_SPACE_OFFSET));
+ /* Directory grows backwards from the end of the page */
+ printf("Start of directory: %lu\n",
+ maria_block_size - PAGE_SUFFIX_SIZE -
+ (uint) buff[DIR_COUNT_OFFSET] * DIR_ENTRY_SIZE);
+ _ma_print_directory(stdout, buff, maria_block_size);
+}
+#endif
diff --git a/storage/maria/ma_blockrec.h b/storage/maria/ma_blockrec.h
new file mode 100644
index 00000000000..a5858880dd0
--- /dev/null
+++ b/storage/maria/ma_blockrec.h
@@ -0,0 +1,290 @@
+/* Copyright (C) 2007 Michael Widenius
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Storage of records in block
+*/
+
+#define LSN_SIZE 7
+#define DIR_COUNT_SIZE 1 /* Stores number of rows on page */
+#define DIR_FREE_SIZE 1 /* Pointer to first free dir entry */
+#define EMPTY_SPACE_SIZE 2 /* Stores empty space on page */
+#define PAGE_TYPE_SIZE 1
+#define PAGE_SUFFIX_SIZE 4 /* Bytes for checksum */
+#define PAGE_HEADER_SIZE (LSN_SIZE + DIR_COUNT_SIZE + DIR_FREE_SIZE +\
+ EMPTY_SPACE_SIZE + PAGE_TYPE_SIZE)
+#define PAGE_OVERHEAD_SIZE (PAGE_HEADER_SIZE + DIR_ENTRY_SIZE + \
+ PAGE_SUFFIX_SIZE)
+#define BLOCK_RECORD_POINTER_SIZE 6
+
+#define FULL_PAGE_SIZE(block_size) ((block_size) - LSN_SIZE - \
+ PAGE_TYPE_SIZE - PAGE_SUFFIX_SIZE)
+
+#define ROW_EXTENT_PAGE_SIZE 5
+#define ROW_EXTENT_COUNT_SIZE 2
+#define SUB_RANGE_SIZE 2
+#define BLOCK_FILLER_SIZE 2
+#define ROW_EXTENT_SIZE (ROW_EXTENT_PAGE_SIZE + ROW_EXTENT_COUNT_SIZE)
+#define TAIL_BIT 0x8000 /* Bit in page_count to signify tail */
+#define START_EXTENT_BIT 0x4000 /* Bit in page_count to signify start*/
+/* page_count set by bitmap code for tail pages */
+#define TAIL_PAGE_COUNT_MARKER 0xffff
+/* Number of extents reserved MARIA_BITMAP_BLOCKS to store head part */
+#define ELEMENTS_RESERVED_FOR_MAIN_PART 4
+/* This is just used to prealloc a dynamic array */
+#define AVERAGE_BLOB_SIZE 1024L*1024L
+/* Number of pages to store continuous blob parts */
+#define BLOB_SEGMENT_MIN_SIZE 128
+
+/* Fields before 'row->null_field_lengths' used by find_where_to_split_row */
+#define EXTRA_LENGTH_FIELDS 3
+
+/* Size for the different parts in the row header (and head page) */
+#define FLAG_SIZE 1
+#define VERPTR_SIZE 7
+#define DIR_ENTRY_SIZE 4
+#define FIELD_OFFSET_SIZE 2 /* size of pointers to field starts */
+
+/* Minimum header size needed for a new row */
+#define BASE_ROW_HEADER_SIZE FLAG_SIZE
+#define TRANS_ROW_EXTRA_HEADER_SIZE TRANSID_SIZE
+
+#define PAGE_TYPE_MASK 7
+enum en_page_type { UNALLOCATED_PAGE, HEAD_PAGE, TAIL_PAGE, BLOB_PAGE, MAX_PAGE_TYPE };
+#define PAGE_CAN_BE_COMPACTED 128 /* Bit in PAGE_TYPE */
+
+#define PAGE_TYPE_OFFSET LSN_SIZE
+#define DIR_COUNT_OFFSET (LSN_SIZE+PAGE_TYPE_SIZE)
+#define DIR_FREE_OFFSET (DIR_COUNT_OFFSET+DIR_COUNT_SIZE)
+#define EMPTY_SPACE_OFFSET (DIR_FREE_OFFSET+DIR_FREE_SIZE)
+
+/* Bits used for flag uchar (one byte, first in record) */
+#define ROW_FLAG_TRANSID 1
+#define ROW_FLAG_VER_PTR 2
+#define ROW_FLAG_DELETE_TRANSID 4
+#define ROW_FLAG_NULLS_EXTENDED 8
+#define ROW_FLAG_EXTENTS 128
+#define ROW_FLAG_ALL (1+2+4+8+128)
+
+/******** Variables that affects how data pages are utilized ********/
+
+/* Minimum size of tail segment */
+#define MIN_TAIL_SIZE 32
+
+/*
+ Fixed length part of Max possible header size; See row data structure
+ table in ma_blockrec.c.
+*/
+#define MAX_FIXED_HEADER_SIZE (FLAG_SIZE + 3 + ROW_EXTENT_SIZE + 3)
+#define TRANS_MAX_FIXED_HEADER_SIZE (MAX_FIXED_HEADER_SIZE + \
+ TRANSID_SIZE + VERPTR_SIZE + \
+ TRANSID_SIZE)
+
+/* We use 1 uchar in record header to store number of directory entries */
+#define MAX_ROWS_PER_PAGE 255
+#define END_OF_DIR_FREE_LIST ((uchar) 255)
+
+/* Bits for MARIA_BITMAP_BLOCKS->used */
+/* We stored data on disk in the block */
+#define BLOCKUSED_USED 1
+/* Bitmap on disk is block->org_bitmap_value ; Happens only on update */
+#define BLOCKUSED_USE_ORG_BITMAP 2
+/* We stored tail data on disk for the block */
+#define BLOCKUSED_TAIL 4
+
+/******* defines that affects allocation (density) of data *******/
+
+/*
+ If the tail part (from the main block or a blob) would use more than 75 % of
+ the size of page, store the tail on a full page instead of a shared
+ tail page.
+*/
+#define MAX_TAIL_SIZE(block_size) ((block_size) *3 / 4)
+
+/* Don't allocate memory for too many row extents on the stack */
+#define ROW_EXTENTS_ON_STACK 32
+
+/* Functions to convert MARIA_RECORD_POS to/from page:offset */
+
+/* Build a rowid: page number in the high bits, dir entry in the low byte */
+static inline MARIA_RECORD_POS ma_recordpos(pgcache_page_no_t page,
+ uint dir_entry)
+{
+ DBUG_ASSERT(dir_entry <= 255);
+ DBUG_ASSERT(page > 0); /* page 0 is bitmap, not data page */
+ return (MARIA_RECORD_POS) (((ulonglong) page << 8) | dir_entry);
+}
+
+/* Strip the directory-entry byte from a rowid to get the page number */
+static inline pgcache_page_no_t ma_recordpos_to_page(MARIA_RECORD_POS record_pos)
+{
+ return (pgcache_page_no_t) (record_pos >> 8);
+}
+
+/* Low byte of a rowid is the directory entry number within the page */
+static inline uint ma_recordpos_to_dir_entry(MARIA_RECORD_POS record_pos)
+{
+ return (uint) (record_pos & 255);
+}
+
+/*
+ Address of directory entry 'pos' on a page: the directory grows
+ backwards from the end of the page, before the page checksum suffix
+*/
+static inline uchar *dir_entry_pos(uchar *buff, uint block_size, uint pos)
+{
+ return (buff + block_size - DIR_ENTRY_SIZE * pos - PAGE_SUFFIX_SIZE -
+ DIR_ENTRY_SIZE);
+}
+
+/* ma_blockrec.c */
+void _ma_init_block_record_data(void);
+my_bool _ma_once_init_block_record(MARIA_SHARE *share, File dfile);
+my_bool _ma_once_end_block_record(MARIA_SHARE *share);
+my_bool _ma_init_block_record(MARIA_HA *info);
+void _ma_end_block_record(MARIA_HA *info);
+
+my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+ const uchar *oldrec, const uchar *newrec);
+my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record);
+int _ma_read_block_record(MARIA_HA *info, uchar *record,
+ MARIA_RECORD_POS record_pos);
+int _ma_read_block_record2(MARIA_HA *info, uchar *record,
+ uchar *data, uchar *end_of_data);
+int _ma_scan_block_record(MARIA_HA *info, uchar *record,
+ MARIA_RECORD_POS, my_bool);
+my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ const uchar *record, MARIA_RECORD_POS pos);
+my_bool _ma_scan_init_block_record(MARIA_HA *info);
+void _ma_scan_end_block_record(MARIA_HA *info);
+int _ma_scan_remember_block_record(MARIA_HA *info,
+ MARIA_RECORD_POS *lastpos);
+void _ma_scan_restore_block_record(MARIA_HA *info,
+ MARIA_RECORD_POS lastpos);
+
+MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
+ const uchar *record);
+my_bool _ma_write_block_record(MARIA_HA *info, const uchar *record);
+my_bool _ma_write_abort_block_record(MARIA_HA *info);
+my_bool _ma_compare_block_record(register MARIA_HA *info,
+ register const uchar *record);
+void _ma_compact_block_page(uchar *buff, uint block_size, uint rownr,
+ my_bool extend_block, TrID min_read_from,
+ uint min_row_length);
+my_bool enough_free_entries_on_page(MARIA_SHARE *share, uchar *page_buff);
+TRANSLOG_ADDRESS
+maria_page_get_lsn(uchar *page, pgcache_page_no_t page_no, uchar* data_ptr);
+
+/* ma_bitmap.c */
+my_bool _ma_bitmap_init(MARIA_SHARE *share, File file);
+my_bool _ma_bitmap_end(MARIA_SHARE *share);
+my_bool _ma_bitmap_flush(MARIA_SHARE *share);
+my_bool _ma_bitmap_flush_all(MARIA_SHARE *share);
+void _ma_bitmap_reset_cache(MARIA_SHARE *share);
+my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row,
+ MARIA_BITMAP_BLOCKS *result_blocks);
+my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks);
+my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents,
+ uint count);
+my_bool _ma_bitmap_set(MARIA_HA *info, pgcache_page_no_t pos, my_bool head,
+ uint empty_space);
+my_bool _ma_bitmap_reset_full_page_bits(MARIA_HA *info,
+ MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page,
+ uint page_count);
+my_bool _ma_bitmap_set_full_page_bits(MARIA_HA *info,
+ MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page, uint page_count);
+uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size);
+my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *new_row,
+ pgcache_page_no_t page, uint free_size,
+ MARIA_BITMAP_BLOCKS *result_blocks);
+my_bool _ma_check_bitmap_data(MARIA_HA *info,
+ enum en_page_type page_type,
+ pgcache_page_no_t page,
+ uint empty_space, uint *bitmap_pattern);
+my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info,
+ enum en_page_type page_type,
+ pgcache_page_no_t page,
+ uint *bitmap_pattern);
+uint _ma_bitmap_get_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap,
+ pgcache_page_no_t page);
+void _ma_bitmap_delete_all(MARIA_SHARE *share);
+int _ma_bitmap_create_first(MARIA_SHARE *share);
+void _ma_bitmap_flushable(MARIA_HA *info, int non_flushable_inc);
+void _ma_bitmap_lock(MARIA_SHARE *share);
+void _ma_bitmap_unlock(MARIA_SHARE *share);
+void _ma_bitmap_set_pagecache_callbacks(PAGECACHE_FILE *file,
+ MARIA_SHARE *share);
+#ifndef DBUG_OFF
+void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data,
+ pgcache_page_no_t page);
+#endif
+
+uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
+ uint page_type,
+ my_bool new_page,
+ const uchar *header,
+ const uchar *data,
+ size_t data_length);
+uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
+ uint page_type,
+ const uchar *header);
+uint _ma_apply_redo_free_blocks(MARIA_HA *info, LSN lsn,
+ const uchar *header);
+uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
+ const uchar *header);
+uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info, LSN lsn,
+ const uchar *header, LSN redo_lsn,
+ uint * const number_of_blobs,
+ uint * const number_of_ranges,
+ pgcache_page_no_t * const first_page,
+ pgcache_page_no_t * const last_page);
+my_bool _ma_apply_redo_bitmap_new_page(MARIA_HA *info, LSN lsn,
+ const uchar *header);
+my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header);
+my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header, size_t length);
+my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
+ const uchar *header, size_t length);
+my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn);
+
+my_bool write_hook_for_redo(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+ void *hook_arg);
+my_bool write_hook_for_undo(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+ void *hook_arg);
+my_bool write_hook_for_redo_delete_all(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info,
+ LSN *lsn, void *hook_arg);
+my_bool write_hook_for_undo_row_insert(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info,
+ LSN *lsn, void *hook_arg);
+my_bool write_hook_for_undo_row_delete(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info,
+ LSN *lsn, void *hook_arg);
+my_bool write_hook_for_undo_row_update(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info,
+ LSN *lsn, void *hook_arg);
+my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info,
+ LSN *lsn, void *hook_arg);
+my_bool write_hook_for_file_id(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+ void *hook_arg);
+my_bool write_hook_for_commit(enum translog_record_type type,
+ TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+ void *hook_arg);
+void _ma_block_get_status(void *param, my_bool concurrent_insert);
+my_bool _ma_block_start_trans(void* param);
+my_bool _ma_block_start_trans_no_versioning(void *param);
+void _ma_block_update_status(void *param);
+void _ma_block_restore_status(void *param);
+my_bool _ma_block_check_status(void *param);
diff --git a/storage/maria/ma_cache.c b/storage/maria/ma_cache.c
new file mode 100644
index 00000000000..82b5ddd8047
--- /dev/null
+++ b/storage/maria/ma_cache.c
@@ -0,0 +1,107 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Functions for read record caching with maria
+ Used for reading dynamic/compressed records from datafile.
+
+ Can fetch data directly from file (outside cache),
+ if reading a small chunk straight before the cached part (with possible
+ overlap).
+
+ Can be explicitly asked not to use cache (by not setting READING_NEXT in
+ flag) - useful for occasional out-of-cache reads, when the next read is
+ expected to hit the cache again.
+
+ Allows "partial read" errors in the record header (when READING_HEADER flag
+ is set) - unread part is bzero'ed
+
+ Note: out-of-cache reads are enabled for shared IO_CACHE's too,
+ as these reads will be cached by OS cache (and my_pread is always atomic)
+*/
+
+
+#include "maria_def.h"
+
+/*
+  Read 'length' bytes at file position 'pos' into 'buff', going through
+  the IO_CACHE read buffer when the requested range overlaps it.
+
+  flag bits:
+    READING_NEXT    continue reading forward through the cache
+                    (read-ahead); without it the out-of-cache part is
+                    fetched with a plain my_pread()
+    READING_HEADER  a partial read of a record header is acceptable;
+                    the unread tail of the header is bzero'ed
+
+  Returns 0 on success, 1 on error (my_errno set; a too-short read is
+  mapped to HA_ERR_WRONG_IN_RECORD).
+*/
+
+my_bool _ma_read_cache(IO_CACHE *info, uchar *buff, my_off_t pos,
+ size_t length, uint flag)
+{
+ size_t read_length,in_buff_length;
+ my_off_t offset;
+ uchar *in_buff_pos;
+ DBUG_ENTER("_ma_read_cache");
+
+ /* Part of the request lies before the cached area: read it directly */
+ if (pos < info->pos_in_file)
+ {
+ read_length=length;
+ if ((my_off_t) read_length > (my_off_t) (info->pos_in_file-pos))
+ read_length=(uint) (info->pos_in_file-pos);
+ info->seek_not_done=1;
+ if (my_pread(info->file,buff,read_length,pos,MYF(MY_NABP)))
+ DBUG_RETURN(1);
+ if (!(length-=read_length))
+ DBUG_RETURN(0); /* Whole request satisfied outside the cache */
+ pos+=read_length;
+ buff+=read_length;
+ }
+ /* Copy the part that is already present in the cache buffer */
+ if (pos >= info->pos_in_file &&
+ (offset= (my_off_t) (pos - info->pos_in_file)) <
+ (my_off_t) (info->read_end - info->request_pos))
+ {
+ in_buff_pos=info->request_pos+(uint) offset;
+ in_buff_length= min(length,(size_t) (info->read_end-in_buff_pos));
+ memcpy(buff,info->request_pos+(uint) offset,(size_t) in_buff_length);
+ if (!(length-=in_buff_length))
+ DBUG_RETURN(0);
+ pos+=in_buff_length;
+ buff+=in_buff_length;
+ }
+ else
+ in_buff_length=0;
+ if (flag & READING_NEXT)
+ {
+ /* Reposition the cache if the request is not at its current end */
+ if (pos != (info->pos_in_file +
+ (uint) (info->read_end - info->request_pos)))
+ {
+ info->pos_in_file=pos; /* Force start here */
+ info->read_pos=info->read_end=info->request_pos; /* Everything used */
+ info->seek_not_done=1;
+ }
+ else
+ info->read_pos=info->read_end; /* All block used */
+ if (!(*info->read_function)(info,buff,length))
+ DBUG_RETURN(0);
+ read_length=info->error;
+ }
+ else
+ {
+ /* Out-of-cache read; the OS page cache keeps this reasonably cheap */
+ info->seek_not_done=1;
+ if ((read_length=my_pread(info->file,buff,length,pos,MYF(0))) == length)
+ DBUG_RETURN(0);
+ }
+ /* Short read: only acceptable for record headers (READING_HEADER) */
+ if (!(flag & READING_HEADER) || (int) read_length == -1 ||
+ read_length+in_buff_length < 3)
+ {
+ DBUG_PRINT("error",
+ ("Error %d reading next-multi-part block (Got %d bytes)",
+ my_errno, (int) read_length));
+ if (!my_errno || my_errno == HA_ERR_FILE_TOO_SHORT)
+ my_errno= HA_ERR_WRONG_IN_RECORD;
+ DBUG_RETURN(1);
+ }
+ /* Zero-fill the part of the header that could not be read */
+ bzero(buff+read_length,MARIA_BLOCK_INFO_HEADER_LENGTH - in_buff_length -
+ read_length);
+ DBUG_RETURN(0);
+} /* _ma_read_cache */
diff --git a/storage/maria/ma_changed.c b/storage/maria/ma_changed.c
new file mode 100644
index 00000000000..4d0964581f6
--- /dev/null
+++ b/storage/maria/ma_changed.c
@@ -0,0 +1,33 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Check if somebody has changed table since last check. */
+
+#include "maria_def.h"
+
+ /* Return 0 if table isn't changed */
+
+/*
+  Report whether the table has been changed since the last call.
+  The handler's data_changed flag is reset as a side effect.
+
+  Returns 0 if unchanged, non-zero if changed, -1 on lock/read error.
+*/
+int maria_is_changed(MARIA_HA *info)
+{
+ int result;
+ DBUG_ENTER("maria_is_changed");
+ if (fast_ma_readinfo(info))
+ DBUG_RETURN(-1); /* Could not read/lock state info */
+ VOID(_ma_writeinfo(info,0)); /* Release lock taken above */
+ result=(int) info->data_changed;
+ info->data_changed=0; /* Reset for the next call */
+ DBUG_PRINT("exit",("result: %d",result));
+ DBUG_RETURN(result);
+}
diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c
new file mode 100644
index 00000000000..307befab5c7
--- /dev/null
+++ b/storage/maria/ma_check.c
@@ -0,0 +1,6805 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Describe, check and repair of MARIA tables */
+
+/*
+ About checksum calculation.
+
+ There are two types of checksums. Table checksum and row checksum.
+
+ Row checksum is an additional uchar at the end of dynamic length
+ records. It must be calculated if the table is configured for them.
+ Otherwise they must not be used. The variable
+ MYISAM_SHARE::calc_checksum determines if row checksums are used.
+ MI_INFO::checksum is used as temporary storage during row handling.
+ For parallel repair we must assure that only one thread can use this
+ variable. There is no problem on the write side as this is done by one
+ thread only. But when checking a record after read this could go
+ wrong. But since all threads read through a common read buffer, it is
+ sufficient if only one thread checks it.
+
+ Table checksum is an eight uchar value in the header of the index file.
+ It can be calculated even if row checksums are not used. The variable
+ MI_CHECK::glob_crc is calculated over all records.
+ MI_SORT_PARAM::calc_checksum determines if this should be done. This
+ variable is not part of MI_CHECK because it must be set per thread for
+ parallel repair. The global glob_crc must be changed by one thread
+ only. And it is sufficient to calculate the checksum once only.
+*/
+
+#include "ma_ftdefs.h"
+#include "ma_rt_index.h"
+#include "ma_blockrec.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+#include <stdarg.h>
+#include <my_getopt.h>
+#ifdef HAVE_SYS_VADVISE_H
+#include <sys/vadvise.h>
+#endif
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+/* Functions defined in this file */
+
+static int check_k_link(HA_CHECK *param, MARIA_HA *info, my_off_t next_link);
+static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ MARIA_PAGE *page, ha_rows *keys,
+ ha_checksum *key_checksum, uint level);
+static uint isam_key_length(MARIA_HA *info,MARIA_KEYDEF *keyinfo);
+static ha_checksum calc_checksum(ha_rows count);
+static int writekeys(MARIA_SORT_PARAM *sort_param);
+static int sort_one_index(HA_CHECK *param, MARIA_HA *info,
+ MARIA_KEYDEF *keyinfo,
+ my_off_t pagepos, File new_file);
+static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key);
+static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key);
+static int sort_get_next_record(MARIA_SORT_PARAM *sort_param);
+static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a,
+ const void *b);
+static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param,
+ const uchar *a);
+static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a);
+static my_off_t get_record_for_key(MARIA_KEYDEF *keyinfo, const uchar *key);
+static int sort_insert_key(MARIA_SORT_PARAM *sort_param,
+ reg1 SORT_KEY_BLOCKS *key_block,
+ const uchar *key, my_off_t prev_block);
+static int sort_delete_record(MARIA_SORT_PARAM *sort_param);
+/*static int _ma_flush_pending_blocks(HA_CHECK *param);*/
+static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
+ uint buffer_length);
+static ha_checksum maria_byte_checksum(const uchar *buf, uint length);
+static void set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share);
+static void restore_data_file_type(MARIA_SHARE *share);
+static void change_data_file_descriptor(MARIA_HA *info, File new_file);
+static void unuse_data_file_descriptor(MARIA_HA *info);
+static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info,
+ MARIA_HA *info, uchar *record);
+static void copy_data_file_state(MARIA_STATE_INFO *to,
+ MARIA_STATE_INFO *from);
+static void report_keypage_fault(HA_CHECK *param, MARIA_HA *info,
+ my_off_t position);
+static my_bool create_new_data_handle(MARIA_SORT_PARAM *param, File new_file);
+static my_bool _ma_flush_table_files_before_swap(HA_CHECK *param,
+ MARIA_HA *info);
+static TrID max_trid_in_system(void);
+static void _ma_check_print_not_visible_error(HA_CHECK *param, TrID used_trid);
+void retry_if_quick(MARIA_SORT_PARAM *param, int error);
+
+
+/* Initialize check param with default values */
+
+/*
+  Initialize a HA_CHECK structure with default values.
+  Must be called before any other maria_chk_* function uses 'param'.
+*/
+void maria_chk_init(HA_CHECK *param)
+{
+ bzero((uchar*) param,sizeof(*param));
+ param->opt_follow_links=1;
+ param->keys_in_use= ~(ulonglong) 0; /* Check all keys by default */
+ param->search_after_block=HA_OFFSET_ERROR;
+ param->auto_increment_value= 0;
+ param->use_buffers=USE_BUFFER_INIT;
+ param->read_buffer_length=READ_BUFFER_INIT;
+ param->write_buffer_length=READ_BUFFER_INIT;
+ param->sort_buffer_length=SORT_BUFFER_INIT;
+ param->sort_key_blocks=BUFFERS_WHEN_SORTING;
+ param->tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
+ param->myf_rw=MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL);
+ param->start_check_pos=0;
+ param->max_record_length= LONGLONG_MAX; /* No length limit by default */
+ param->pagecache_block_size= KEY_CACHE_BLOCK_SIZE;
+ param->stats_method= MI_STATS_METHOD_NULLS_NOT_EQUAL;
+}
+
+
+/* Initialize check param and maria handler for check of table */
+
+/*
+  Prepare 'param' and 'info' for a table check: reset per-check counters
+  and make all rows visible regardless of their transaction id.
+*/
+void maria_chk_init_for_check(HA_CHECK *param, MARIA_HA *info)
+{
+ param->not_visible_rows_found= 0;
+ param->max_found_trid= 0;
+
+ /*
+ Set up transaction handler so that we can see all rows. When a row is
+ read we will check the found trid against param->max_trid
+ */
+ if (param->max_trid == 0)
+ {
+ if (!ma_control_file_inited())
+ param->max_trid= 0; /* Give warning for first trid found */
+ else
+ param->max_trid= max_trid_in_system();
+ }
+ maria_ignore_trids(info);
+}
+
+
+ /* Check the status flags for the table */
+
+/*
+  Check the status flags of the table and print warnings for crashed or
+  not properly closed tables. Only warns; always returns 0.
+*/
+int maria_chk_status(HA_CHECK *param, MARIA_HA *info)
+{
+ MARIA_SHARE *share= info->s;
+
+ if (maria_is_crashed_on_repair(info))
+ _ma_check_print_warning(param,
+ "Table is marked as crashed and last repair failed");
+ else if (maria_in_repair(info))
+ _ma_check_print_warning(param,
+ "Last repair was aborted before finishing");
+ else if (maria_is_crashed(info))
+ _ma_check_print_warning(param,
+ "Table is marked as crashed");
+ /* open_count should be 0, or 1 if we ourselves changed the table */
+ if (share->state.open_count != (uint) (share->global_changed ? 1 : 0))
+ {
+ /* Don't count this as a real warning, as check can correct this ! */
+ uint save=param->warning_printed;
+ _ma_check_print_warning(param,
+ share->state.open_count==1 ?
+ "%d client is using or hasn't closed the table properly" :
+ "%d clients are using or haven't closed the table properly",
+ share->state.open_count);
+ /* If this will be fixed by the check, forget the warning */
+ if (param->testflag & T_UPDATE_STATE)
+ param->warning_printed=save;
+ }
+ return 0;
+}
+
+/*
+ Check delete links in row data
+*/
+
+/*
+  Walk the chain of deleted rows in the data file and verify it is
+  consistent with the table state (number of deleted rows and amount of
+  deleted space). Not applicable to BLOCK_RECORD tables.
+
+  Returns 0 if the chain is ok, 1 if corrupted (T_RETRY_WITHOUT_QUICK is
+  then set in param->testflag).
+*/
+int maria_chk_del(HA_CHECK *param, register MARIA_HA *info,
+ ulonglong test_flag)
+{
+ MARIA_SHARE *share= info->s;
+ reg2 ha_rows i;
+ uint delete_link_length;
+ my_off_t empty,next_link,old_link;
+ char buff[22],buff2[22];
+ DBUG_ENTER("maria_chk_del");
+
+ LINT_INIT(old_link);
+
+ param->record_checksum=0;
+
+ if (share->data_file_type == BLOCK_RECORD)
+ DBUG_RETURN(0); /* No delete links here */
+
+ /* Dynamic rows keep prev/next links and length in a 20 uchar header */
+ delete_link_length=((share->options & HA_OPTION_PACK_RECORD) ? 20 :
+ share->rec_reflength+1);
+
+ if (!(test_flag & T_SILENT))
+ puts("- check record delete-chain");
+
+ next_link=share->state.dellink;
+ if (share->state.state.del == 0)
+ {
+ if (test_flag & T_VERBOSE)
+ {
+ puts("No recordlinks");
+ }
+ }
+ else
+ {
+ if (test_flag & T_VERBOSE)
+ printf("Recordlinks: ");
+ empty=0;
+ /* Follow at most state.del links so a looped chain terminates */
+ for (i= share->state.state.del ; i > 0L && next_link != HA_OFFSET_ERROR ; i--)
+ {
+ if (_ma_killed_ptr(param))
+ DBUG_RETURN(1);
+ if (test_flag & T_VERBOSE)
+ printf(" %9s",llstr(next_link,buff));
+ if (next_link >= share->state.state.data_file_length)
+ goto wrong;
+ if (my_pread(info->dfile.file, (uchar*) buff, delete_link_length,
+ next_link,MYF(MY_NABP)))
+ {
+ if (test_flag & T_VERBOSE) puts("");
+ _ma_check_print_error(param,"Can't read delete-link at filepos: %s",
+ llstr(next_link,buff));
+ DBUG_RETURN(1);
+ }
+ /* First uchar of a deleted block must be the delete marker (0) */
+ if (*buff != '\0')
+ {
+ if (test_flag & T_VERBOSE) puts("");
+ _ma_check_print_error(param,"Record at pos: %s is not remove-marked",
+ llstr(next_link,buff));
+ goto wrong;
+ }
+ if (share->options & HA_OPTION_PACK_RECORD)
+ {
+ my_off_t prev_link=mi_sizekorr(buff+12);
+ /* Each block (except the first) must point back at its predecessor */
+ if (empty && prev_link != old_link)
+ {
+ if (test_flag & T_VERBOSE) puts("");
+ _ma_check_print_error(param,"Deleted block at %s doesn't point back at previous delete link",llstr(next_link,buff2));
+ goto wrong;
+ }
+ old_link=next_link;
+ next_link=mi_sizekorr(buff+4);
+ empty+=mi_uint3korr(buff+1);
+ }
+ else
+ {
+ param->record_checksum+=(ha_checksum) next_link;
+ next_link= _ma_rec_pos(share, (uchar *) buff + 1);
+ empty+=share->base.pack_reclength;
+ }
+ }
+ if (share->state.state.del && (test_flag & T_VERBOSE))
+ puts("\n");
+ if (empty != share->state.state.empty)
+ {
+ _ma_check_print_warning(param,
+ "Found %s deleted space in delete link chain. Should be %s",
+ llstr(empty,buff2),
+ llstr(share->state.state.empty,buff));
+ }
+ if (next_link != HA_OFFSET_ERROR)
+ {
+ _ma_check_print_error(param,
+ "Found more than the expected %s deleted rows in delete link chain",
+ llstr(share->state.state.del, buff));
+ goto wrong;
+ }
+ if (i != 0)
+ {
+ _ma_check_print_error(param,
+ "Found %s deleted rows in delete link chain. Should be %s",
+ llstr(share->state.state.del - i, buff2),
+ llstr(share->state.state.del, buff));
+ goto wrong;
+ }
+ }
+ DBUG_RETURN(0);
+
+wrong:
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ if (test_flag & T_VERBOSE)
+ puts("");
+ _ma_check_print_error(param,"record delete-link-chain corrupted");
+ DBUG_RETURN(1);
+} /* maria_chk_del */
+
+
+/* Check delete links in index file */
+
+/*
+  Walk the chain of deleted key pages in the index file and verify that
+  every page in it lies inside the file, is block-aligned, is readable
+  and is delete-marked.
+
+  Returns 0 if the whole chain was followed to its end, 1 on error or if
+  links remain after visiting more pages than the key file can hold
+  (which indicates a loop in the chain).
+*/
+static int check_k_link(HA_CHECK *param, register MARIA_HA *info,
+ my_off_t next_link)
+{
+ MARIA_SHARE *share= info->s;
+ uint block_size= share->block_size;
+ ha_rows records;
+ char llbuff[21], llbuff2[21];
+ uchar *buff;
+ DBUG_ENTER("check_k_link");
+
+ if (next_link == HA_OFFSET_ERROR)
+ DBUG_RETURN(0); /* Avoid printing empty line */
+
+ /* 'records' caps the walk so that a looped chain still terminates */
+ records= (ha_rows) (share->state.state.key_file_length / block_size);
+ while (next_link != HA_OFFSET_ERROR && records > 0)
+ {
+ if (_ma_killed_ptr(param))
+ DBUG_RETURN(1);
+ if (param->testflag & T_VERBOSE)
+ printf("%16s",llstr(next_link,llbuff));
+
+ /* Key blocks must lay within the key file length entirely. */
+ if (next_link + block_size > share->state.state.key_file_length)
+ {
+ /* purecov: begin tested */
+ _ma_check_print_error(param, "Invalid key block position: %s "
+ "key block size: %u file_length: %s",
+ llstr(next_link, llbuff), block_size,
+ llstr(share->state.state.key_file_length, llbuff2));
+ DBUG_RETURN(1);
+ /* purecov: end */
+ }
+
+ /* Key blocks must be aligned at block_size */
+ if (next_link & (block_size -1))
+ {
+ /* purecov: begin tested */
+ _ma_check_print_error(param, "Mis-aligned key block: %s "
+ "minimum key block length: %u",
+ llstr(next_link, llbuff),
+ block_size);
+ DBUG_RETURN(1);
+ /* purecov: end */
+ }
+
+ DBUG_ASSERT(share->pagecache->block_size == block_size);
+ if (!(buff= pagecache_read(share->pagecache,
+ &share->kfile,
+ (pgcache_page_no_t) (next_link / block_size),
+ DFLT_INIT_HITS,
+ info->buff, PAGECACHE_READ_UNKNOWN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+ {
+ /* purecov: begin tested */
+ _ma_check_print_error(param, "key cache read error for block: %s",
+ llstr(next_link,llbuff));
+ DBUG_RETURN(1);
+ /* purecov: end */
+ }
+ if (_ma_get_keynr(info->s, buff) != MARIA_DELETE_KEY_NR)
+ _ma_check_print_error(param, "Page at %s is not delete marked",
+ llstr(next_link, llbuff));
+
+ /* The link to the next deleted page is stored after the page header */
+ next_link= mi_sizekorr(buff + share->keypage_header);
+ records--;
+ param->key_file_blocks+=block_size;
+ }
+ if (param->testflag & T_VERBOSE)
+ {
+ if (next_link != HA_OFFSET_ERROR)
+ printf("%16s\n",llstr(next_link,llbuff));
+ else
+ puts("");
+ }
+ DBUG_RETURN (next_link != HA_OFFSET_ERROR);
+} /* check_k_link */
+
+
+ /* Check sizes of files */
+
+/*
+  Check that the physical sizes of the index and data files match the
+  lengths stored in the table state. A too-small file is an error; a
+  too-big file only gives a warning (can be produced by maria_pack or
+  preallocation). Also warns when either file is nearly full.
+
+  Returns 0 if the sizes are ok, non-zero on error.
+*/
+int maria_chk_size(HA_CHECK *param, register MARIA_HA *info)
+{
+ MARIA_SHARE *share= info->s;
+ int error;
+ register my_off_t skr,size;
+ char buff[22],buff2[22];
+ DBUG_ENTER("maria_chk_size");
+
+ if (!(param->testflag & T_SILENT))
+ puts("- check file-size");
+
+ /*
+ The following is needed if called externally (not from maria_chk).
+ To get a correct physical size we need to flush them.
+ */
+ if ((error= _ma_flush_table_files(info,
+ MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE)))
+ _ma_check_print_error(param, "Failed to flush data or index file");
+
+ size= my_seek(share->kfile.file, 0L, MY_SEEK_END, MYF(MY_THREADSAFE));
+ if ((skr=(my_off_t) share->state.state.key_file_length) != size)
+ {
+ /* Don't give error if file generated by mariapack */
+ if (skr > size && maria_is_any_key_active(share->state.key_map))
+ {
+ error=1;
+ _ma_check_print_error(param,
+ "Size of indexfile is: %-8s Should be: %s",
+ llstr(size,buff), llstr(skr,buff2));
+ }
+ else if (!(param->testflag & T_VERY_SILENT))
+ _ma_check_print_warning(param,
+ "Size of indexfile is: %-8s Should be: %s",
+ llstr(size,buff), llstr(skr,buff2));
+ }
+ /*
+ BUGFIX: the maximum key file size used to be formatted into the same
+ buffer ('buff') as the current size, so this warning printed the same
+ value twice. Use 'buff2' for the second value, matching the datafile
+ warning below.
+ */
+ if (!(param->testflag & T_VERY_SILENT) &&
+ ! (share->options & HA_OPTION_COMPRESS_RECORD) &&
+ ulonglong2double(share->state.state.key_file_length) >
+ ulonglong2double(share->base.margin_key_file_length)*0.9)
+ _ma_check_print_warning(param,"Keyfile is almost full, %10s of %10s used",
+ llstr(share->state.state.key_file_length,buff),
+ llstr(share->base.max_key_file_length-1,buff2));
+
+ size= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0));
+ skr=(my_off_t) share->state.state.data_file_length;
+ if (share->options & HA_OPTION_COMPRESS_RECORD)
+ skr+= MEMMAP_EXTRA_MARGIN;
+#ifdef USE_RELOC
+ if (share->data_file_type == STATIC_RECORD &&
+ skr < (my_off_t) share->base.reloc*share->base.min_pack_length)
+ skr=(my_off_t) share->base.reloc*share->base.min_pack_length;
+#endif
+ if (skr != size)
+ {
+ if (skr > size && skr != size + MEMMAP_EXTRA_MARGIN)
+ {
+ share->state.state.data_file_length=size; /* Skip other errors */
+ error=1;
+ _ma_check_print_error(param,"Size of datafile is: %-9s Should be: %s",
+ llstr(size,buff), llstr(skr,buff2));
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ }
+ else
+ {
+ _ma_check_print_warning(param,
+ "Size of datafile is: %-9s Should be: %s",
+ llstr(size,buff), llstr(skr,buff2));
+ }
+ }
+ if (!(param->testflag & T_VERY_SILENT) &&
+ !(share->options & HA_OPTION_COMPRESS_RECORD) &&
+ ulonglong2double(share->state.state.data_file_length) >
+ (ulonglong2double(share->base.max_data_file_length)*0.9))
+ _ma_check_print_warning(param, "Datafile is almost full, %10s of %10s used",
+ llstr(share->state.state.data_file_length,buff),
+ llstr(share->base.max_data_file_length-1,buff2));
+ DBUG_RETURN(error);
+} /* maria_chk_size */
+
+
+/* Check keys */
+
+/*
+  Check all indexes of the table: the key delete chain, every key tree,
+  and (for non-fulltext/non-spatial keys) that each key refers to exactly
+  the rows present in the data file, via a checksum over row positions.
+  Also verifies the stored auto_increment value and, with T_STATISTICS,
+  recalculates index statistics.
+
+  Returns 0 if ok, -1 on error.
+*/
+int maria_chk_key(HA_CHECK *param, register MARIA_HA *info)
+{
+ uint key,found_keys=0,full_text_keys=0,result=0;
+ ha_rows keys;
+ ha_checksum old_record_checksum,init_checksum;
+ my_off_t all_keydata,all_totaldata,key_totlength,length;
+ double *rec_per_key_part;
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo;
+ char buff[22],buff2[22];
+ MARIA_PAGE page;
+ DBUG_ENTER("maria_chk_key");
+
+ if (!(param->testflag & T_SILENT))
+ puts("- check key delete-chain");
+
+ param->key_file_blocks=share->base.keystart;
+ if (check_k_link(param, info, share->state.key_del))
+ {
+ if (param->testflag & T_VERBOSE) puts("");
+ _ma_check_print_error(param,"key delete-link-chain corrupted");
+ DBUG_RETURN(-1);
+ }
+
+ if (!(param->testflag & T_SILENT))
+ puts("- check index reference");
+
+ all_keydata=all_totaldata=key_totlength=0;
+ init_checksum=param->record_checksum;
+ old_record_checksum=0;
+ /* For static rows the expected sum of row positions can be precomputed */
+ if (share->data_file_type == STATIC_RECORD)
+ old_record_checksum= (calc_checksum(share->state.state.records +
+ share->state.state.del-1) *
+ share->base.pack_reclength);
+ rec_per_key_part= param->new_rec_per_key_part;
+ for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ;
+ rec_per_key_part+=keyinfo->keysegs, key++, keyinfo++)
+ {
+ param->key_crc[key]=0;
+ if (! maria_is_key_active(share->state.key_map, key))
+ {
+ /* Remember old statistics for key */
+ memcpy((char*) rec_per_key_part,
+ (char*) (share->state.rec_per_key_part +
+ (uint) (rec_per_key_part - param->new_rec_per_key_part)),
+ keyinfo->keysegs*sizeof(*rec_per_key_part));
+ continue;
+ }
+ found_keys++;
+
+ param->record_checksum=init_checksum;
+
+ bzero((char*) &param->unique_count,sizeof(param->unique_count));
+ bzero((char*) &param->notnull_count,sizeof(param->notnull_count));
+
+ if ((!(param->testflag & T_SILENT)))
+ printf ("- check data record references index: %d\n",key+1);
+ if (keyinfo->flag & (HA_FULLTEXT | HA_SPATIAL))
+ full_text_keys++;
+ if (share->state.key_root[key] == HA_OFFSET_ERROR)
+ {
+ /* An empty tree is only legal when the table has no rows */
+ if (share->state.state.records != 0 && !(keyinfo->flag & HA_FULLTEXT))
+ _ma_check_print_error(param, "Key tree %u is empty", key + 1);
+ goto do_stat;
+ }
+ if (_ma_fetch_keypage(&page, info, keyinfo, share->state.key_root[key],
+ PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS,
+ info->buff, 0))
+ {
+ report_keypage_fault(param, info, share->state.key_root[key]);
+ if (!(param->testflag & T_INFO))
+ DBUG_RETURN(-1);
+ result= -1;
+ continue;
+ }
+ param->key_file_blocks+=keyinfo->block_length;
+ keys=0;
+ param->keydata=param->totaldata=0;
+ param->key_blocks=0;
+ param->max_level=0;
+ if (chk_index(param, info,keyinfo, &page, &keys, param->key_crc+key,1))
+ DBUG_RETURN(-1);
+ if (!(keyinfo->flag & (HA_FULLTEXT | HA_SPATIAL | HA_RTREE_INDEX)))
+ {
+ if (keys != share->state.state.records)
+ {
+ _ma_check_print_error(param,"Found %s keys of %s",llstr(keys,buff),
+ llstr(share->state.state.records,buff2));
+ if (!(param->testflag & T_INFO))
+ DBUG_RETURN(-1);
+ result= -1;
+ continue;
+ }
+ /* Compare the row-position checksum of this key against key 1's */
+ if ((found_keys - full_text_keys == 1 &&
+ !(share->data_file_type == STATIC_RECORD)) ||
+ (param->testflag & T_DONT_CHECK_CHECKSUM))
+ old_record_checksum= param->record_checksum;
+ else if (old_record_checksum != param->record_checksum)
+ {
+ if (key)
+ _ma_check_print_error(param,
+ "Key %u doesn't point at same records as "
+ "key 1",
+ key+1);
+ else
+ _ma_check_print_error(param,"Key 1 doesn't point at all records");
+ if (!(param->testflag & T_INFO))
+ DBUG_RETURN(-1);
+ result= -1;
+ continue;
+ }
+ }
+ if ((uint) share->base.auto_key -1 == key)
+ {
+ /* Check that auto_increment key is bigger than max key value */
+ ulonglong auto_increment;
+ const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg;
+ info->lastinx=key;
+ _ma_read_key_record(info, info->rec_buff, 0);
+ auto_increment=
+ ma_retrieve_auto_increment(info->rec_buff + keyseg->start,
+ keyseg->type);
+ if (auto_increment > share->state.auto_increment)
+ {
+ _ma_check_print_warning(param, "Auto-increment value: %s is smaller "
+ "than max used value: %s",
+ llstr(share->state.auto_increment,buff2),
+ llstr(auto_increment, buff));
+ }
+ if (param->testflag & T_AUTO_INC)
+ {
+ set_if_bigger(share->state.auto_increment,
+ auto_increment);
+ set_if_bigger(share->state.auto_increment,
+ param->auto_increment_value);
+ }
+
+ /* Check that there isn't a row with auto_increment = 0 in the table */
+ maria_extra(info,HA_EXTRA_KEYREAD,0);
+ bzero(info->lastkey_buff, keyinfo->seg->length);
+ if (!maria_rkey(info, info->rec_buff, key,
+ info->lastkey_buff,
+ (key_part_map) 1, HA_READ_KEY_EXACT))
+ {
+ /* Don't count this as a real warning, as maria_chk can't correct it */
+ uint save=param->warning_printed;
+ _ma_check_print_warning(param, "Found row where the auto_increment "
+ "column has the value 0");
+ param->warning_printed=save;
+ }
+ maria_extra(info,HA_EXTRA_NO_KEYREAD,0);
+ }
+
+ length=(my_off_t) isam_key_length(info,keyinfo)*keys + param->key_blocks*2;
+ if (param->testflag & T_INFO && param->totaldata != 0L && keys != 0L)
+ printf("Key: %2d: Keyblocks used: %3d%% Packed: %4d%% Max levels: %2d\n",
+ key+1,
+ (int) (my_off_t2double(param->keydata)*100.0/my_off_t2double(param->totaldata)),
+ (int) ((my_off_t2double(length) - my_off_t2double(param->keydata))*100.0/
+ my_off_t2double(length)),
+ param->max_level);
+ all_keydata+=param->keydata; all_totaldata+=param->totaldata; key_totlength+=length;
+
+do_stat:
+ if (param->testflag & T_STATISTICS)
+ maria_update_key_parts(keyinfo, rec_per_key_part, param->unique_count,
+ param->stats_method == MI_STATS_METHOD_IGNORE_NULLS?
+ param->notnull_count: NULL,
+ (ulonglong)share->state.state.records);
+ }
+ if (param->testflag & T_INFO)
+ {
+ if (all_totaldata != 0L && found_keys > 0)
+ printf("Total: Keyblocks used: %3d%% Packed: %4d%%\n\n",
+ (int) (my_off_t2double(all_keydata)*100.0/
+ my_off_t2double(all_totaldata)),
+ (int) ((my_off_t2double(key_totlength) -
+ my_off_t2double(all_keydata))*100.0/
+ my_off_t2double(key_totlength)));
+ else if (all_totaldata != 0L && maria_is_any_key_active(share->state.key_map))
+ puts("");
+ }
+ if (param->key_file_blocks != share->state.state.key_file_length &&
+ share->state.key_map == ~(ulonglong) 0)
+ _ma_check_print_warning(param, "Some data are unreferenced in keyfile");
+ if (found_keys != full_text_keys)
+ param->record_checksum=old_record_checksum-init_checksum; /* Remove delete links */
+ else
+ param->record_checksum=0;
+ DBUG_RETURN(result);
+} /* maria_chk_key */
+
+
+
+/*
+  Validate the position and alignment of the key page at 'page', read it
+  and recursively check it (and its subtree) with chk_index().
+
+  Returns 0 if ok, 1 on error.
+*/
+static int chk_index_down(HA_CHECK *param, MARIA_HA *info,
+ MARIA_KEYDEF *keyinfo,
+ my_off_t page, uchar *buff, ha_rows *keys,
+ ha_checksum *key_checksum, uint level)
+{
+ char llbuff[22],llbuff2[22];
+ MARIA_SHARE *share= info->s;
+ MARIA_PAGE ma_page;
+ DBUG_ENTER("chk_index_down");
+
+ /* Key blocks must lay within the key file length entirely. */
+ if (page + keyinfo->block_length > share->state.state.key_file_length)
+ {
+ /* purecov: begin tested */
+ /* Give it a chance to fit in the real file size. */
+ my_off_t max_length= my_seek(info->s->kfile.file, 0L, MY_SEEK_END,
+ MYF(MY_THREADSAFE));
+ _ma_check_print_error(param, "Invalid key block position: %s "
+ "key block size: %u file_length: %s",
+ llstr(page, llbuff), keyinfo->block_length,
+ llstr(share->state.state.key_file_length, llbuff2));
+ if (page + keyinfo->block_length > max_length)
+ goto err;
+ /* Fix the remembered key file length. */
+ share->state.state.key_file_length= (max_length &
+ ~ (my_off_t) (keyinfo->block_length -
+ 1));
+ /* purecov: end */
+ }
+
+ /* Key blocks must be aligned at block length */
+ if (page & (info->s->block_size -1))
+ {
+ /* purecov: begin tested */
+ _ma_check_print_error(param, "Mis-aligned key block: %s "
+ "key block length: %u",
+ llstr(page, llbuff), info->s->block_size);
+ goto err;
+ /* purecov: end */
+ }
+
+ if (_ma_fetch_keypage(&ma_page, info, keyinfo, page,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ DFLT_INIT_HITS, buff, 0))
+ {
+ report_keypage_fault(param, info, page);
+ goto err;
+ }
+ param->key_file_blocks+=keyinfo->block_length;
+ if (chk_index(param, info, keyinfo, &ma_page, keys, key_checksum,level))
+ goto err;
+
+ DBUG_RETURN(0);
+
+ /* purecov: begin tested */
+err:
+ DBUG_RETURN(1);
+ /* purecov: end */
+}
+
+
+/*
+ "Ignore NULLs" statistics collection method: process first index tuple.
+
+ SYNOPSIS
+ maria_collect_stats_nonulls_first()
+ keyseg IN Array of key part descriptions
+ notnull INOUT Array, notnull[i] = (number of {keypart1...keypart_i}
+ tuples that don't contain NULLs)
+ key IN Key values tuple
+
+ DESCRIPTION
+ Process the first index tuple - find out which prefix tuples don't
+ contain NULLs, and update the array of notnull counters accordingly.
+*/
+
+static
+void maria_collect_stats_nonulls_first(HA_KEYSEG *keyseg, ulonglong *notnull,
+ const uchar *key)
+{
+ uint first_null, kp;
+ /* Index (within keyseg) of the first key part whose value is NULL */
+ first_null= ha_find_null(keyseg, key) - keyseg;
+ /*
+ All prefix tuples that don't include keypart_{first_null} are not-null
+ tuples (and all others aren't), increment counters for them.
+ */
+ for (kp= 0; kp < first_null; kp++)
+ notnull[kp]++;
+}
+
+
+/*
+ "Ignore NULLs" statistics collection method: process next index tuple.
+
+ SYNOPSIS
+ maria_collect_stats_nonulls_next()
+ keyseg IN Array of key part descriptions
+ notnull INOUT Array, notnull[i] = (number of {keypart1...keypart_i}
+ tuples that don't contain NULLs)
+ prev_key IN Previous key values tuple
+ last_key IN Next key values tuple
+
+ DESCRIPTION
+ Process the next index tuple:
+ 1. Find out which prefix tuples of last_key don't contain NULLs, and
+ update the array of notnull counters accordingly.
+ 2. Find the first keypart number where the prev_key and last_key tuples
+ are different(A), or last_key has NULL value(B), and return it, so the
+ caller can count number of unique tuples for each key prefix. We don't
+ need (B) to be counted, and that is compensated back in
+ maria_update_key_parts().
+
+ RETURN
+ 1 + number of first keypart where values differ or last_key tuple has NULL
+*/
+
+static
+int maria_collect_stats_nonulls_next(HA_KEYSEG *keyseg, ulonglong *notnull,
+ const uchar *prev_key,
+ const uchar *last_key)
+{
+ uint diffs[2];
+ uint first_null_seg, kp;
+ HA_KEYSEG *seg;
+
+ /*
+ Find the first keypart where values are different or either of them is
+ NULL. We get results in diffs array:
+ diffs[0]= 1 + number of first different keypart
+ diffs[1]=offset: (last_key + diffs[1]) points to first value in
+ last_key that is NULL or different from corresponding
+ value in prev_key.
+ */
+ ha_key_cmp(keyseg, prev_key, last_key, USE_WHOLE_KEY,
+ SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diffs);
+ /* seg points at the first key part where the two tuples differ */
+ seg= keyseg + diffs[0] - 1;
+
+ /* Find first NULL in last_key */
+ first_null_seg= ha_find_null(seg, last_key + diffs[1]) - keyseg;
+ for (kp= 0; kp < first_null_seg; kp++)
+ notnull[kp]++;
+
+ /*
+ Return 1+ number of first key part where values differ. Don't care if
+ these were NULLs and not .... We compensate for that in
+ maria_update_key_parts.
+ */
+ return diffs[0];
+}
+
+
+/*
+ Check that one index page, and recursively its whole subtree, is ok
+
+ SYNOPSIS
+ chk_index()
+ param Check parameter
+ info Maria handler
+ keyinfo Key definition for the index being checked
+ anc_page Page to check (already read into memory)
+ keys In/out: number of keys seen so far in this index
+ key_checksum In/out: checksum accumulated over all keys of this index
+ level Recursion depth; also tracked in param->max_level
+
+ RETURN
+ 0 ok
+ -1 not enough memory for key block buffer
+ 1 error found in index
+*/
+
+static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ MARIA_PAGE *anc_page, ha_rows *keys,
+ ha_checksum *key_checksum, uint level)
+{
+ int flag;
+ uint comp_flag, page_flag, nod_flag;
+ uchar *temp_buff, *keypos, *old_keypos, *endpos;
+ my_off_t next_page,record;
+ MARIA_SHARE *share= info->s;
+ char llbuff[22];
+ uint diff_pos[2];
+ uchar tmp_key_buff[MARIA_MAX_KEY_BUFF];
+ MARIA_KEY tmp_key;
+ DBUG_ENTER("chk_index");
+ DBUG_DUMP("buff", anc_page->buff, anc_page->size);
+
+ /* TODO: implement appropriate check for RTree keys */
+ if (keyinfo->flag & (HA_SPATIAL | HA_RTREE_INDEX))
+ DBUG_RETURN(0);
+
+ if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length)))
+ {
+ _ma_check_print_error(param,"Not enough memory for keyblock");
+ DBUG_RETURN(-1);
+ }
+
+ if (keyinfo->flag & HA_NOSAME)
+ {
+ /* Not real duplicates */
+ comp_flag=SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT;
+ }
+ else
+ comp_flag=SEARCH_SAME; /* Keys in position order */
+
+ page_flag= anc_page->flag;
+ nod_flag= anc_page->node;
+ old_keypos= anc_page->buff + share->keypage_header;
+ keypos= old_keypos + nod_flag;
+ endpos= anc_page->buff + anc_page->size;
+
+ param->keydata+= anc_page->size;
+ param->totaldata+= keyinfo->block_length; /* INFO */
+ param->key_blocks++;
+ if (level > param->max_level)
+ param->max_level=level;
+
+ /* The page header stores which index the page belongs to; verify it */
+ if (_ma_get_keynr(share, anc_page->buff) !=
+ (uint) (keyinfo - share->keyinfo))
+ _ma_check_print_error(param, "Page at %s is not marked for index %u",
+ llstr(anc_page->pos, llbuff),
+ (uint) (keyinfo - share->keyinfo));
+ if ((page_flag & KEYPAGE_FLAG_HAS_TRANSID) &&
+ !share->base.born_transactional)
+ {
+ _ma_check_print_error(param,
+ "Page at %s is marked with HAS_TRANSID even if "
+ "table is not transactional",
+ llstr(anc_page->pos, llbuff));
+ }
+
+ if (anc_page->size > share->max_index_block_size)
+ {
+ _ma_check_print_error(param,
+ "Page at %s has impossible (too big) pagelength",
+ llstr(anc_page->pos, llbuff));
+ goto err;
+ }
+
+ info->last_key.keyinfo= tmp_key.keyinfo= keyinfo;
+ tmp_key.data= tmp_key_buff;
+ /* Walk all keys on the page; for node pages, check each child subtree */
+ for ( ;; )
+ {
+ if (nod_flag)
+ {
+ if (_ma_killed_ptr(param))
+ goto err;
+ next_page= _ma_kpos(nod_flag,keypos);
+ if (chk_index_down(param,info,keyinfo,next_page,
+ temp_buff,keys,key_checksum,level+1))
+ {
+ DBUG_DUMP("page_data", old_keypos, (uint) (keypos - old_keypos));
+ goto err;
+ }
+ }
+ old_keypos=keypos;
+ if (keypos >= endpos ||
+ !(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &keypos))
+ break;
+ if (keypos > endpos)
+ {
+ _ma_check_print_error(param,
+ "Page length and length of keys don't match at "
+ "page: %s",
+ llstr(anc_page->pos,llbuff));
+ goto err;
+ }
+ if (share->data_file_type == BLOCK_RECORD &&
+ !(page_flag & KEYPAGE_FLAG_HAS_TRANSID) &&
+ key_has_transid(tmp_key.data + tmp_key.data_length +
+ share->rec_reflength-1))
+ {
+ _ma_check_print_error(param,
+ "Found key marked for transid on page that is not "
+ "marked for transid at: %s",
+ llstr(anc_page->pos,llbuff));
+ goto err;
+ }
+
+ /* Check ordering against the previous key (skipped for the first key) */
+ if ((*keys)++ &&
+ (flag=ha_key_cmp(keyinfo->seg, info->last_key.data, tmp_key.data,
+ tmp_key.data_length + tmp_key.ref_length,
+ (comp_flag | SEARCH_INSERT | (tmp_key.flag >> 1) |
+ info->last_key.flag), diff_pos)) >=0)
+ {
+ DBUG_DUMP_KEY("old", &info->last_key);
+ DBUG_DUMP_KEY("new", &tmp_key);
+ DBUG_DUMP("new_in_page", old_keypos, (uint) (keypos-old_keypos));
+
+ if ((comp_flag & SEARCH_FIND) && flag == 0)
+ _ma_check_print_error(param,"Found duplicated key at page %s",
+ llstr(anc_page->pos,llbuff));
+ else
+ _ma_check_print_error(param,"Key in wrong position at page %s",
+ llstr(anc_page->pos,llbuff));
+ goto err;
+ }
+
+ if (param->testflag & T_STATISTICS)
+ {
+ if (*keys != 1L) /* not first_key */
+ {
+ if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL)
+ ha_key_cmp(keyinfo->seg, info->last_key.data,
+ tmp_key.data, tmp_key.data_length,
+ SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL,
+ diff_pos);
+ else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+ {
+ diff_pos[0]= maria_collect_stats_nonulls_next(keyinfo->seg,
+ param->notnull_count,
+ info->last_key.data,
+ tmp_key.data);
+ }
+ param->unique_count[diff_pos[0]-1]++;
+ }
+ else
+ {
+ if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+ maria_collect_stats_nonulls_first(keyinfo->seg, param->notnull_count,
+ tmp_key.data);
+ }
+ }
+ _ma_copy_key(&info->last_key, &tmp_key);
+ (*key_checksum)+= maria_byte_checksum(tmp_key.data, tmp_key.data_length);
+ record= _ma_row_pos_from_key(&tmp_key);
+
+ if (keyinfo->flag & HA_FULLTEXT) /* special handling for ft2 */
+ {
+ uint off;
+ int subkeys;
+ get_key_full_length_rdonly(off, tmp_key.data);
+ subkeys= ft_sintXkorr(tmp_key.data + off);
+ if (subkeys < 0)
+ {
+ /* Negative subkeys means the key points to a 2nd-level tree */
+ ha_rows tmp_keys=0;
+ if (chk_index_down(param,info,&share->ft2_keyinfo,record,
+ temp_buff,&tmp_keys,key_checksum,1))
+ goto err;
+ if (tmp_keys + subkeys)
+ {
+ _ma_check_print_error(param,
+ "Number of words in the 2nd level tree "
+ "does not match the number in the header. "
+ "Parent word in on the page %s, offset %u",
+ llstr(anc_page->pos,llbuff),
+ (uint) (old_keypos - anc_page->buff));
+ goto err;
+ }
+ (*keys)+=tmp_keys-1;
+ continue;
+ }
+ /* fall through */
+ }
+ /* Verify that the row position the key points at is inside the datafile */
+ if ((share->data_file_type != BLOCK_RECORD &&
+ record >= share->state.state.data_file_length) ||
+ (share->data_file_type == BLOCK_RECORD &&
+ ma_recordpos_to_page(record) * share->base.min_block_length >=
+ share->state.state.data_file_length))
+ {
+#ifndef DBUG_OFF
+ char llbuff2[22], llbuff3[22];
+#endif
+ _ma_check_print_error(param,
+ "Found key at page %s that points to record "
+ "outside datafile",
+ llstr(anc_page->pos,llbuff));
+ DBUG_PRINT("test",("page: %s record: %s filelength: %s",
+ llstr(anc_page->pos,llbuff),llstr(record,llbuff2),
+ llstr(share->state.state.data_file_length,llbuff3)));
+ DBUG_DUMP_KEY("key", &tmp_key);
+ DBUG_DUMP("new_in_page", old_keypos, (uint) (keypos-old_keypos));
+ goto err;
+ }
+ param->record_checksum+= (ha_checksum) record;
+ }
+ if (keypos != endpos)
+ {
+ _ma_check_print_error(param,
+ "Keyblock size at page %s is not correct. "
+ "Block length: %u key length: %u",
+ llstr(anc_page->pos, llbuff), anc_page->size,
+ (uint) (keypos - anc_page->buff));
+ goto err;
+ }
+ my_afree(temp_buff);
+ DBUG_RETURN(0);
+ err:
+ my_afree(temp_buff);
+ DBUG_RETURN(1);
+} /* chk_index */
+
+
+ /* Calculate a checksum of 1+2+3+4...N = N*(N+1)/2 without overflow */
+
+static ha_checksum calc_checksum(ha_rows count)
+{
+ ulonglong sum,a,b;
+ DBUG_ENTER("calc_checksum");
+
+ sum=0;
+ a=count; b=count+1;
+ /* Exactly one of count, count+1 is even; halve that one first */
+ if (a & 1)
+ b>>=1;
+ else
+ a>>=1;
+ /* Compute a*b by shift-and-add */
+ while (b)
+ {
+ if (b & 1)
+ sum+=a;
+ a<<=1; b>>=1;
+ }
+ DBUG_PRINT("exit",("sum: %lx",(ulong) sum));
+ DBUG_RETURN((ha_checksum) sum);
+} /* calc_checksum */
+
+
+ /*
+ Calc length of key in normal isam:
+ row reference length plus the sum of all key segment lengths
+ */
+
+static uint isam_key_length(MARIA_HA *info, register MARIA_KEYDEF *keyinfo)
+{
+ uint length;
+ HA_KEYSEG *keyseg;
+ DBUG_ENTER("isam_key_length");
+
+ length= info->s->rec_reflength;
+ for (keyseg=keyinfo->seg ; keyseg->type ; keyseg++)
+ length+= keyseg->length;
+
+ DBUG_PRINT("exit",("length: %d",length));
+ DBUG_RETURN(length);
+} /* key_length */
+
+
+
+/*
+ Write a printable representation of a record position into buff.
+ For BLOCK_RECORD format the position is rendered as "page:row";
+ for other formats it is the plain file offset.
+*/
+
+static void record_pos_to_txt(MARIA_HA *info, my_off_t recpos,
+ char *buff)
+{
+ if (info->s->data_file_type != BLOCK_RECORD)
+ llstr(recpos, buff);
+ else
+ {
+ my_off_t page= ma_recordpos_to_page(recpos);
+ uint row= ma_recordpos_to_dir_entry(recpos);
+ char *end= longlong10_to_str(page, buff, 10);
+ *(end++)= ':';
+ longlong10_to_str(row, end, 10);
+ }
+}
+
+
+/*
+ Check that keys in records exist in index tree
+
+ SYNOPSIS
+ check_keys_in_record()
+ param Check parameter
+ info Maria handler
+ extend Type of check (extended or normal)
+ start_recpos Position to row
+ record Record buffer
+
+ NOTES
+ This function also calculates record checksum & number of rows
+
+ RETURN
+ 0 ok
+ -1 too many errors in extended check (err_count exceeded MAXERR)
+*/
+
+static int check_keys_in_record(HA_CHECK *param, MARIA_HA *info, int extend,
+ my_off_t start_recpos, uchar *record)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo;
+ char llbuff[22+4];
+ uint keynr;
+
+ param->tmp_record_checksum+= (ha_checksum) start_recpos;
+ param->records++;
+ if (param->testflag & T_WRITE_LOOP && param->records % WRITE_COUNT == 0)
+ {
+ printf("%s\r", llstr(param->records, llbuff));
+ VOID(fflush(stdout));
+ }
+
+ /* Check if keys match the record */
+ for (keynr=0, keyinfo= share->keyinfo; keynr < share->base.keys;
+ keynr++, keyinfo++)
+ {
+ if (maria_is_key_active(share->state.key_map, keynr))
+ {
+ MARIA_KEY key;
+ if (!(keyinfo->flag & HA_FULLTEXT))
+ {
+ (*keyinfo->make_key)(info, &key, keynr, info->lastkey_buff, record,
+ start_recpos, 0);
+ if (extend)
+ {
+ /* We don't need to lock the key tree here as we don't allow
+ concurrent threads when running maria_chk
+ */
+ int search_result=
+#ifdef HAVE_RTREE_KEYS
+ (keyinfo->flag & (HA_SPATIAL | HA_RTREE_INDEX)) ?
+ maria_rtree_find_first(info, &key, MBR_EQUAL | MBR_DATA) :
+#endif
+ _ma_search(info, &key, SEARCH_SAME, share->state.key_root[keynr]);
+ if (search_result)
+ {
+ record_pos_to_txt(info, start_recpos, llbuff);
+ _ma_check_print_error(param,
+ "Record at: %14s "
+ "Can't find key for index: %2d",
+ llbuff, keynr+1);
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ return -1;
+ }
+ }
+ else
+ /* Normal check: only accumulate key checksum for later comparison */
+ param->tmp_key_crc[keynr]+=
+ maria_byte_checksum(key.data, key.data_length);
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ Functions to loop through all rows and check if they are ok
+
+ NOTES
+ One function for each record format
+
+ RESULT
+ 0 ok
+ -1 Interrupted by user
+ 1 Error
+*/
+
+static int check_static_record(HA_CHECK *param, MARIA_HA *info, int extend,
+ uchar *record)
+{
+ MARIA_SHARE *share= info->s;
+ my_off_t start_recpos, pos;
+ char llbuff[22];
+
+ pos= 0;
+ while (pos < share->state.state.data_file_length)
+ {
+ if (_ma_killed_ptr(param))
+ return -1;
+ if (my_b_read(&param->read_cache, record,
+ share->base.pack_reclength))
+ {
+ _ma_check_print_error(param,
+ "got error: %d when reading datafile at position: "
+ "%s",
+ my_errno, llstr(pos, llbuff));
+ return 1;
+ }
+ start_recpos= pos;
+ pos+= share->base.pack_reclength;
+ param->splits++;
+ /* In static format, a first byte of 0 marks a deleted row */
+ if (*record == '\0')
+ {
+ param->del_blocks++;
+ param->del_length+= share->base.pack_reclength;
+ continue; /* Record removed */
+ }
+ param->glob_crc+= _ma_static_checksum(info,record);
+ param->used+= share->base.pack_reclength;
+ if (check_keys_in_record(param, info, extend, start_recpos, record))
+ return 1;
+ }
+ return 0;
+}
+
+
+/*
+ Check all rows in a dynamic-format (variable length) data file.
+
+ A row may be split over several linked blocks; the inner do-loop
+ follows the block chain and reassembles the row into info->rec_buff
+ before the keys and checksum are verified.
+*/
+
+static int check_dynamic_record(HA_CHECK *param, MARIA_HA *info, int extend,
+ uchar *record)
+{
+ MARIA_BLOCK_INFO block_info;
+ MARIA_SHARE *share= info->s;
+ my_off_t start_recpos, start_block, pos;
+ uchar *to;
+ ulong left_length;
+ uint b_type;
+ char llbuff[22],llbuff2[22],llbuff3[22];
+ DBUG_ENTER("check_dynamic_record");
+
+ LINT_INIT(left_length);
+ LINT_INIT(start_recpos);
+ LINT_INIT(to);
+
+ pos= 0;
+ while (pos < share->state.state.data_file_length)
+ {
+ my_bool got_error= 0;
+ int flag;
+ if (_ma_killed_ptr(param))
+ DBUG_RETURN(-1);
+
+ /* flag counts blocks read for this row; 0 means we are at a row start */
+ flag= block_info.second_read=0;
+ block_info.next_filepos=pos;
+ do
+ {
+ if (_ma_read_cache(&param->read_cache, block_info.header,
+ (start_block=block_info.next_filepos),
+ sizeof(block_info.header),
+ (flag ? 0 : READING_NEXT) | READING_HEADER))
+ {
+ _ma_check_print_error(param,
+ "got error: %d when reading datafile at "
+ "position: %s",
+ my_errno, llstr(start_block, llbuff));
+ DBUG_RETURN(1);
+ }
+
+ if (start_block & (MARIA_DYN_ALIGN_SIZE-1))
+ {
+ _ma_check_print_error(param,"Wrong aligned block at %s",
+ llstr(start_block,llbuff));
+ DBUG_RETURN(1);
+ }
+ b_type= _ma_get_block_info(&block_info,-1,start_block);
+ if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+ BLOCK_FATAL_ERROR))
+ {
+ if (b_type & BLOCK_SYNC_ERROR)
+ {
+ if (flag)
+ {
+ _ma_check_print_error(param,"Unexpected byte: %d at link: %s",
+ (int) block_info.header[0],
+ llstr(start_block,llbuff));
+ DBUG_RETURN(1);
+ }
+ pos=block_info.filepos+block_info.block_len;
+ goto next;
+ }
+ if (b_type & BLOCK_DELETED)
+ {
+ if (block_info.block_len < share->base.min_block_length)
+ {
+ _ma_check_print_error(param,
+ "Deleted block with impossible length %lu "
+ "at %s",
+ block_info.block_len,llstr(pos,llbuff));
+ DBUG_RETURN(1);
+ }
+ /* Both links of the deleted-block chain must stay in the file */
+ if ((block_info.next_filepos != HA_OFFSET_ERROR &&
+ block_info.next_filepos >= share->state.state.data_file_length) ||
+ (block_info.prev_filepos != HA_OFFSET_ERROR &&
+ block_info.prev_filepos >= share->state.state.data_file_length))
+ {
+ _ma_check_print_error(param,"Delete link points outside datafile "
+ "at %s",
+ llstr(pos,llbuff));
+ DBUG_RETURN(1);
+ }
+ param->del_blocks++;
+ param->del_length+= block_info.block_len;
+ param->splits++;
+ pos= block_info.filepos+block_info.block_len;
+ goto next;
+ }
+ _ma_check_print_error(param,"Wrong bytesec: %d-%d-%d at linkstart: %s",
+ block_info.header[0],block_info.header[1],
+ block_info.header[2],
+ llstr(start_block,llbuff));
+ DBUG_RETURN(1);
+ }
+ if (share->state.state.data_file_length < block_info.filepos+
+ block_info.block_len)
+ {
+ _ma_check_print_error(param,
+ "Recordlink that points outside datafile at %s",
+ llstr(pos,llbuff));
+ got_error=1;
+ break;
+ }
+ param->splits++;
+ if (!flag++) /* First block */
+ {
+ start_recpos=pos;
+ pos=block_info.filepos+block_info.block_len;
+ if (block_info.rec_len > (uint) share->base.max_pack_length)
+ {
+ _ma_check_print_error(param,"Found too long record (%lu) at %s",
+ (ulong) block_info.rec_len,
+ llstr(start_recpos,llbuff));
+ got_error=1;
+ break;
+ }
+ if (share->base.blobs)
+ {
+ if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ block_info.rec_len +
+ share->base.extra_rec_buff_size))
+
+ {
+ _ma_check_print_error(param,
+ "Not enough memory (%lu) for blob at %s",
+ (ulong) block_info.rec_len,
+ llstr(start_recpos,llbuff));
+ got_error=1;
+ break;
+ }
+ }
+ to= info->rec_buff;
+ left_length= block_info.rec_len;
+ }
+ if (left_length < block_info.data_len)
+ {
+ _ma_check_print_error(param,"Found too long record (%lu) at %s",
+ (ulong) block_info.data_len,
+ llstr(start_recpos,llbuff));
+ got_error=1;
+ break;
+ }
+ if (_ma_read_cache(&param->read_cache, to, block_info.filepos,
+ (uint) block_info.data_len,
+ flag == 1 ? READING_NEXT : 0))
+ {
+ _ma_check_print_error(param,
+ "got error: %d when reading datafile at "
+ "position: %s", my_errno,
+ llstr(block_info.filepos, llbuff));
+
+ DBUG_RETURN(1);
+ }
+ to+=block_info.data_len;
+ param->link_used+= block_info.filepos-start_block;
+ param->used+= block_info.filepos - start_block + block_info.data_len;
+ param->empty+= block_info.block_len-block_info.data_len;
+ left_length-= block_info.data_len;
+ if (left_length)
+ {
+ if (b_type & BLOCK_LAST)
+ {
+ _ma_check_print_error(param,
+ "Wrong record length %s of %s at %s",
+ llstr(block_info.rec_len-left_length,llbuff),
+ llstr(block_info.rec_len, llbuff2),
+ llstr(start_recpos,llbuff3));
+ got_error=1;
+ break;
+ }
+ if (share->state.state.data_file_length < block_info.next_filepos)
+ {
+ _ma_check_print_error(param,
+ "Found next-recordlink that points outside "
+ "datafile at %s",
+ llstr(block_info.filepos,llbuff));
+ got_error=1;
+ break;
+ }
+ }
+ } while (left_length);
+
+ /* The whole row is now assembled in info->rec_buff; unpack and verify */
+ if (! got_error)
+ {
+ if (_ma_rec_unpack(info,record,info->rec_buff,block_info.rec_len) ==
+ MY_FILE_ERROR)
+ {
+ _ma_check_print_error(param,"Found wrong record at %s",
+ llstr(start_recpos,llbuff));
+ got_error=1;
+ }
+ else
+ {
+ ha_checksum checksum= 0;
+ if (share->calc_checksum)
+ checksum= (*share->calc_checksum)(info, record);
+
+ if (param->testflag & (T_EXTEND | T_MEDIUM | T_VERBOSE))
+ {
+ if (_ma_rec_check(info,record, info->rec_buff,block_info.rec_len,
+ test(share->calc_checksum), checksum))
+ {
+ _ma_check_print_error(param,"Found wrong packed record at %s",
+ llstr(start_recpos,llbuff));
+ got_error= 1;
+ }
+ }
+ param->glob_crc+= checksum;
+ }
+
+ if (! got_error)
+ {
+ if (check_keys_in_record(param, info, extend, start_recpos, record))
+ DBUG_RETURN(1);
+ }
+ else
+ {
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ DBUG_RETURN(1);
+ }
+ }
+ else if (!flag)
+ pos= block_info.filepos+block_info.block_len;
+next:;
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Check all rows in a compressed (packed) data file.
+
+ Each packed row is read, unpacked into 'record' and then its keys and
+ checksum are verified with check_keys_in_record().
+*/
+
+static int check_compressed_record(HA_CHECK *param, MARIA_HA *info, int extend,
+ uchar *record)
+{
+ MARIA_BLOCK_INFO block_info;
+ MARIA_SHARE *share= info->s;
+ my_off_t start_recpos, pos;
+ char llbuff[22];
+ my_bool got_error= 0;
+ DBUG_ENTER("check_compressed_record");
+
+ pos= share->pack.header_length; /* Skip header */
+ while (pos < share->state.state.data_file_length)
+ {
+ if (_ma_killed_ptr(param))
+ DBUG_RETURN(-1);
+
+ if (_ma_read_cache(&param->read_cache, block_info.header, pos,
+ share->pack.ref_length, READING_NEXT))
+ {
+ _ma_check_print_error(param,
+ "got error: %d when reading datafile at position: "
+ "%s",
+ my_errno, llstr(pos, llbuff));
+ DBUG_RETURN(1);
+ }
+
+ start_recpos= pos;
+ param->splits++;
+ VOID(_ma_pack_get_block_info(info, &info->bit_buff, &block_info,
+ &info->rec_buff, &info->rec_buff_size, -1,
+ start_recpos));
+ pos=block_info.filepos+block_info.rec_len;
+ if (block_info.rec_len < (uint) share->min_pack_length ||
+ block_info.rec_len > (uint) share->max_pack_length)
+ {
+ _ma_check_print_error(param,
+ "Found block with wrong recordlength: %lu at %s",
+ block_info.rec_len, llstr(start_recpos,llbuff));
+ got_error=1;
+ goto end;
+ }
+ if (_ma_read_cache(&param->read_cache, info->rec_buff,
+ block_info.filepos, block_info.rec_len, READING_NEXT))
+ {
+ _ma_check_print_error(param,
+ "got error: %d when reading datafile at position: "
+ "%s",
+ my_errno, llstr(block_info.filepos, llbuff));
+ DBUG_RETURN(1);
+ }
+ if (_ma_pack_rec_unpack(info, &info->bit_buff, record,
+ info->rec_buff, block_info.rec_len))
+ {
+ _ma_check_print_error(param,"Found wrong record at %s",
+ llstr(start_recpos,llbuff));
+ got_error=1;
+ goto end;
+ }
+ param->glob_crc+= (*share->calc_checksum)(info,record);
+ param->link_used+= (block_info.filepos - start_recpos);
+ param->used+= (pos-start_recpos);
+
+end:
+ if (! got_error)
+ {
+ if (check_keys_in_record(param, info, extend, start_recpos, record))
+ DBUG_RETURN(1);
+ }
+ else
+ {
+ got_error= 0; /* Reset for next loop */
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ DBUG_RETURN(1);
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Check if layout on head or tail page is ok
+
+ SYNOPSIS
+ check_page_layout()
+ param Check parameter
+ info Maria handler
+ page_pos Position of the page in the data file
+ page Page content
+ row_count Number of directory entries on the page
+ head_empty Empty space recorded in the page header
+ real_rows_found Out: number of non-deleted rows found
+ free_slots_found Out: number of entries in the free directory list
+
+ NOTES
+ This is for rows-in-block format.
+*/
+
+static int check_page_layout(HA_CHECK *param, MARIA_HA *info,
+ my_off_t page_pos, uchar *page,
+ uint row_count, uint head_empty,
+ uint *real_rows_found, uint *free_slots_found)
+{
+ uint empty, last_row_end, row, first_dir_entry, free_entry, block_size;
+ uint free_entries, prev_free_entry;
+ uchar *dir_entry;
+ char llbuff[22];
+ my_bool error_in_free_list= 0;
+ DBUG_ENTER("check_page_layout");
+
+ block_size= info->s->block_size;
+ empty= 0;
+ last_row_end= PAGE_HEADER_SIZE;
+ *real_rows_found= 0;
+
+ /* Check free directory list */
+ free_entry= (uint) page[DIR_FREE_OFFSET];
+ free_entries= 0;
+ prev_free_entry= END_OF_DIR_FREE_LIST;
+ while (free_entry != END_OF_DIR_FREE_LIST)
+ {
+ uchar *dir;
+ if (free_entry > row_count)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Directory free entry points outside "
+ "directory",
+ llstr(page_pos, llbuff));
+ error_in_free_list= 1;
+ break;
+ }
+ dir= dir_entry_pos(page, block_size, free_entry);
+ /* A free directory entry must have position 0 (deleted) */
+ if (uint2korr(dir) != 0)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Directory free entry points to "
+ "not deleted entry",
+ llstr(page_pos, llbuff));
+ error_in_free_list= 1;
+ break;
+ }
+ if (dir[2] != prev_free_entry)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Directory free list back pointer "
+ "points to wrong entry",
+ llstr(page_pos, llbuff));
+ error_in_free_list= 1;
+ break;
+ }
+ prev_free_entry= free_entry;
+ free_entry= dir[3];
+ free_entries++;
+ }
+ *free_slots_found= free_entries;
+
+ /* Check directory */
+ dir_entry= page+ block_size - PAGE_SUFFIX_SIZE;
+ first_dir_entry= (block_size - row_count * DIR_ENTRY_SIZE -
+ PAGE_SUFFIX_SIZE);
+ for (row= 0 ; row < row_count ; row++)
+ {
+ uint pos, length;
+ dir_entry-= DIR_ENTRY_SIZE;
+ pos= uint2korr(dir_entry);
+ if (!pos)
+ {
+ free_entries--;
+ if (row == row_count -1)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: First entry in directory is 0",
+ llstr(page_pos, llbuff));
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ DBUG_RETURN(1);
+ }
+ continue; /* Deleted row */
+ }
+ (*real_rows_found)++;
+ length= uint2korr(dir_entry+2);
+ param->used+= length;
+ if (pos < last_row_end)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Row %3u overlapps with previous row",
+ llstr(page_pos, llbuff), row);
+ DBUG_RETURN(1);
+ }
+ empty+= (pos - last_row_end);
+ last_row_end= pos + length;
+ if (last_row_end > first_dir_entry)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Row %3u overlapps with directory",
+ llstr(page_pos, llbuff), row);
+ DBUG_RETURN(1);
+ }
+ }
+ empty+= (first_dir_entry - last_row_end);
+
+ if (empty != head_empty)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Wrong empty size. Stored: %5u "
+ "Actual: %5u",
+ llstr(page_pos, llbuff), head_empty, empty);
+ param->err_count++;
+ }
+ /* All deleted entries must have been reachable through the free list */
+ if (free_entries != 0 && !error_in_free_list)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Directory free link don't include "
+ "all free entries",
+ llstr(page_pos, llbuff));
+ param->err_count++;
+ }
+ DBUG_RETURN(param->err_count &&
+ (param->err_count >= MAXERR || !(param->testflag & T_VERBOSE)));
+}
+
+
+/*
+ Check all rows on head page
+
+ NOTES
+ This is for rows-in-block format.
+
+ Before this, we have already called check_page_layout(), so
+ we know the block is logically correct (even if the rows may not be that)
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
+
+static my_bool check_head_page(HA_CHECK *param, MARIA_HA *info, uchar *record,
+ int extend, my_off_t page_pos, uchar *page_buff,
+ uint row_count)
+{
+ MARIA_SHARE *share= info->s;
+ uchar *dir_entry;
+ uint row;
+ char llbuff[22], llbuff2[22];
+ ulonglong page= page_pos / share->block_size;
+ DBUG_ENTER("check_head_page");
+
+ dir_entry= page_buff+ share->block_size - PAGE_SUFFIX_SIZE;
+ for (row= 0 ; row < row_count ; row++)
+ {
+ uint pos, length, flag;
+ dir_entry-= DIR_ENTRY_SIZE;
+ pos= uint2korr(dir_entry);
+ if (!pos)
+ continue;
+ length= uint2korr(dir_entry+2);
+ if (length < share->base.min_block_length)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Row %3u is too short "
+ "(%d of min %d bytes)",
+ llstr(page, llbuff), row, length,
+ (uint) share->base.min_block_length);
+ DBUG_RETURN(1);
+ }
+ flag= (uint) (uchar) page_buff[pos];
+ if (flag & ~(ROW_FLAG_ALL))
+ _ma_check_print_error(param,
+ "Page %9s: Row %3u has wrong flag: %u",
+ llstr(page, llbuff), row, flag);
+
+ DBUG_PRINT("info", ("rowid: %s page: %lu row: %u",
+ llstr(ma_recordpos(page, row), llbuff),
+ (ulong) page, row));
+ info->cur_row.trid= 0;
+ if (_ma_read_block_record2(info, record, page_buff+pos,
+ page_buff+pos+length))
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Row %3d is crashed",
+ llstr(page, llbuff), row);
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ DBUG_RETURN(1);
+ continue;
+ }
+ set_if_bigger(param->max_found_trid, info->cur_row.trid);
+ if (info->cur_row.trid > param->max_trid)
+ _ma_check_print_not_visible_error(param, info->cur_row.trid);
+
+ if (share->calc_checksum)
+ {
+ /* Only the low byte of the checksum is stored in the row header */
+ ha_checksum checksum= (*share->calc_checksum)(info, record);
+ if (info->cur_row.checksum != (checksum & 255))
+ _ma_check_print_error(param, "Page %9s: Row %3d has wrong checksum",
+ llstr(page, llbuff), row);
+ param->glob_crc+= checksum;
+ }
+ if (info->cur_row.extents_count)
+ {
+ uchar *extents= info->cur_row.extents;
+ uint i;
+ /* Check that bitmap has the right marker for the found extents */
+ for (i= 0 ; i < info->cur_row.extents_count ; i++)
+ {
+ pgcache_page_no_t extent_page;
+ uint page_count, page_type;
+ extent_page= uint5korr(extents);
+ page_count= uint2korr(extents+5) & ~START_EXTENT_BIT;
+ extents+= ROW_EXTENT_SIZE;
+ page_type= BLOB_PAGE;
+ if (page_count & TAIL_BIT)
+ {
+ page_count= 1;
+ page_type= TAIL_PAGE;
+ }
+ /*
+ TODO OPTIMIZE:
+ Check the whole extent with one test and only do the loop if
+ something is wrong (for exact error reporting)
+ */
+ for ( ; page_count--; extent_page++)
+ {
+ uint bitmap_pattern;
+ if (_ma_check_if_right_bitmap_type(info, page_type, extent_page,
+ &bitmap_pattern))
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Row: %3d has an extent with "
+ "wrong information in bitmap: "
+ "Page: %9s Page_type: %d Bitmap: %d",
+ llstr(page, llbuff), row,
+ llstr(extent_page, llbuff2),
+ page_type, bitmap_pattern);
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ DBUG_RETURN(1);
+ }
+ }
+ }
+ }
+ param->full_page_count+= info->cur_row.full_page_count;
+ param->tail_count+= info->cur_row.tail_count;
+ if (check_keys_in_record(param, info, extend,
+ ma_recordpos(page, row), record))
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Check if rows-in-block data file is consistent
+
+ Walks every page of the data file: bitmap pages are read directly,
+ other pages are classified (head/tail/blob) and verified against the
+ bitmap; head pages are further checked row by row via check_head_page().
+*/
+
+static int check_block_record(HA_CHECK *param, MARIA_HA *info, int extend,
+ uchar *record)
+{
+ MARIA_SHARE *share= info->s;
+ my_off_t pos;
+ pgcache_page_no_t page;
+ uchar *page_buff, *bitmap_buff, *data;
+ char llbuff[22], llbuff2[22];
+ uint block_size= share->block_size;
+ ha_rows full_page_count, tail_count;
+ my_bool full_dir;
+ uint offset_page, offset, free_count;
+
+ LINT_INIT(full_dir);
+
+ if (_ma_scan_init_block_record(info))
+ {
+ _ma_check_print_error(param, "got error %d when initializing scan",
+ my_errno);
+ return 1;
+ }
+ bitmap_buff= info->scan.bitmap_buff;
+ page_buff= info->scan.page_buff;
+ full_page_count= tail_count= 0;
+ param->full_page_count= param->tail_count= 0;
+ param->used= param->link_used= 0;
+ param->splits= share->state.state.data_file_length / block_size;
+
+ for (pos= 0, page= 0;
+ pos < share->state.state.data_file_length;
+ pos+= block_size, page++)
+ {
+ uint row_count, real_row_count, empty_space, page_type, bitmap_pattern;
+ LINT_INIT(row_count);
+ LINT_INIT(empty_space);
+
+ if (_ma_killed_ptr(param))
+ {
+ _ma_scan_end_block_record(info);
+ return -1;
+ }
+ if ((page % share->bitmap.pages_covered) == 0)
+ {
+ /* Bitmap page */
+ if (pagecache_read(share->pagecache,
+ &info->s->bitmap.file,
+ page, 1,
+ bitmap_buff,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Got error: %d when reading datafile",
+ llstr(page, llbuff), my_errno);
+ goto err;
+ }
+ param->used+= block_size;
+ param->link_used+= block_size;
+ continue;
+ }
+ /* Skip pages marked as empty in bitmap */
+ /* Each data page has 3 bits in the bitmap page that covers it */
+ offset_page= (uint) ((page % share->bitmap.pages_covered) -1) * 3;
+ offset= offset_page & 7;
+ data= bitmap_buff + offset_page / 8;
+ bitmap_pattern= uint2korr(data);
+ if (!((bitmap_pattern >> offset) & 7))
+ {
+ param->empty+= block_size;
+ param->del_blocks++;
+ continue;
+ }
+
+ if (pagecache_read(share->pagecache,
+ &info->dfile,
+ page, 1,
+ page_buff,
+ share->page_type,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0)
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Got error: %d when reading datafile",
+ llstr(page, llbuff), my_errno);
+ goto err;
+ }
+ page_type= page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK;
+ if (page_type == UNALLOCATED_PAGE || page_type >= MAX_PAGE_TYPE)
+ {
+ _ma_check_print_error(param,
+ "Page: %9s Found wrong page type %d",
+ llstr(page, llbuff), page_type);
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ goto err;
+ continue;
+ }
+ switch ((enum en_page_type) page_type) {
+ case UNALLOCATED_PAGE:
+ case MAX_PAGE_TYPE:
+ default:
+ DBUG_ASSERT(0); /* Impossible */
+ break;
+ case HEAD_PAGE:
+ row_count= page_buff[DIR_COUNT_OFFSET];
+ empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET);
+ param->used+= block_size - empty_space;
+ param->link_used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE +
+ row_count * DIR_ENTRY_SIZE);
+ if (empty_space < share->bitmap.sizes[3])
+ param->lost+= empty_space;
+ if (check_page_layout(param, info, pos, page_buff, row_count,
+ empty_space, &real_row_count, &free_count))
+ goto err;
+ full_dir= (row_count == MAX_ROWS_PER_PAGE &&
+ page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST);
+ break;
+ case TAIL_PAGE:
+ row_count= page_buff[DIR_COUNT_OFFSET];
+ empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET);
+ param->used+= block_size - empty_space;
+ param->link_used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE +
+ row_count * DIR_ENTRY_SIZE);
+ if (empty_space < share->bitmap.sizes[6])
+ param->lost+= empty_space;
+ if (check_page_layout(param, info, pos, page_buff, row_count,
+ empty_space, &real_row_count, &free_count))
+ goto err;
+ full_dir= (row_count - free_count >= MAX_ROWS_PER_PAGE -
+ share->base.blobs);
+ break;
+ case BLOB_PAGE:
+ full_page_count++;
+ full_dir= 0;
+ empty_space= block_size; /* for error reporting */
+ param->link_used+= (LSN_SIZE + PAGE_TYPE_SIZE);
+ param->used+= block_size;
+ break;
+ }
+ if (_ma_check_bitmap_data(info, page_type, page,
+ full_dir ? 0 : empty_space,
+ &bitmap_pattern))
+ {
+ if (bitmap_pattern == ~(uint) 0)
+ _ma_check_print_error(param,
+ "Page %9s: Wrong bitmap for data on page",
+ llstr(page, llbuff));
+ else
+ _ma_check_print_error(param,
+ "Page %9s: Wrong data in bitmap. Page_type: "
+ "%d full: %d empty_space: %u Bitmap-bits: %d",
+ llstr(page, llbuff), page_type, full_dir,
+ empty_space, bitmap_pattern);
+ if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+ goto err;
+ }
+ if ((enum en_page_type) page_type == BLOB_PAGE)
+ continue;
+ param->empty+= empty_space;
+ if ((enum en_page_type) page_type == TAIL_PAGE)
+ {
+ tail_count+= real_row_count;
+ continue;
+ }
+ if (check_head_page(param, info, record, extend, pos, page_buff,
+ row_count))
+ goto err;
+ }
+
+ /* Verify that rest of bitmap is zero */
+
+ if (page % share->bitmap.pages_covered)
+ {
+ /* Not at end of bitmap */
+ uint bitmap_pattern;
+ offset_page= (uint) ((page % share->bitmap.pages_covered) -1) * 3;
+ offset= offset_page & 7;
+ data= bitmap_buff + offset_page / 8;
+ bitmap_pattern= uint2korr(data);
+ if (((bitmap_pattern >> offset)) ||
+ (data + 2 < bitmap_buff + share->bitmap.total_size &&
+ _ma_check_if_zero(data+2, bitmap_buff + share->bitmap.total_size -
+ data - 2)))
+ {
+ ulonglong bitmap_page;
+ bitmap_page= page / share->bitmap.pages_covered;
+ bitmap_page*= share->bitmap.pages_covered;
+
+ _ma_check_print_error(param,
+ "Bitmap at page %s has pages reserved outside of "
+ "data file length",
+ llstr(bitmap_page, llbuff));
+ DBUG_EXECUTE("bitmap", _ma_print_bitmap(&share->bitmap, bitmap_buff,
+ bitmap_page););
+ }
+ }
+
+ _ma_scan_end_block_record(info);
+
+ /* Cross-check page counts seen in rows against counts seen in the scan */
+ if (full_page_count != param->full_page_count)
+ _ma_check_print_error(param, "Full page count read through records was %s "
+ "but we found %s pages while scanning table",
+ llstr(param->full_page_count, llbuff),
+ llstr(full_page_count, llbuff2));
+ if (tail_count != param->tail_count)
+ _ma_check_print_error(param, "Tail count read through records was %s but "
+ "we found %s tails while scanning table",
+ llstr(param->tail_count, llbuff),
+ llstr(tail_count, llbuff2));
+
+ return param->error_printed != 0;
+
+err:
+ _ma_scan_end_block_record(info);
+ return 1;
+}
+
+
+/* Check that record-link is ok */
+
int maria_chk_data_link(HA_CHECK *param, MARIA_HA *info, my_bool extend)
{
  MARIA_SHARE *share= info->s;
  int error;
  uchar *record;
  char llbuff[22],llbuff2[22],llbuff3[22];
  DBUG_ENTER("maria_chk_data_link");

  if (!(param->testflag & T_SILENT))
  {
    if (extend)
      puts("- check records and index references");
    else
      puts("- check record links");
  }

  /* Buffer large enough for any row of this table */
  if (!(record= (uchar*) my_malloc(share->base.default_rec_buff_size, MYF(0))))
  {
    _ma_check_print_error(param,"Not enough memory for record");
    DBUG_RETURN(-1);
  }
  /* Reset the counters that the per-format check functions accumulate into */
  param->records= param->del_blocks= 0;
  param->used= param->link_used= param->splits= param->del_length= 0;
  param->lost= 0;
  param->tmp_record_checksum= param->glob_crc= 0;
  param->err_count= 0;

  error= 0;
  param->empty= share->pack.header_length;

  bzero((char*) param->tmp_key_crc,
        share->base.keys * sizeof(param->tmp_key_crc[0]));

  /* Scan all rows with the reader matching the table's row format */
  switch (share->data_file_type) {
  case BLOCK_RECORD:
    error= check_block_record(param, info, extend, record);
    break;
  case STATIC_RECORD:
    error= check_static_record(param, info, extend, record);
    break;
  case DYNAMIC_RECORD:
    error= check_dynamic_record(param, info, extend, record);
    break;
  case COMPRESSED_RECORD:
    error= check_compressed_record(param, info, extend, record);
    break;
  } /* switch */

  if (error)
    goto err;

  if (param->testflag & T_WRITE_LOOP)
  {
    VOID(fputs(" \r",stdout)); VOID(fflush(stdout));
  }
  /* Compare what was found while scanning with the persistent table state */
  if (param->records != share->state.state.records)
  {
    _ma_check_print_error(param,
                          "Record-count is not ok; found %-10s Should be: %s",
                          llstr(param->records,llbuff),
                          llstr(share->state.state.records,llbuff2));
    error=1;
  }
  else if (param->record_checksum &&
           param->record_checksum != param->tmp_record_checksum)
  {
    _ma_check_print_error(param,
                          "Key pointers and record positions doesn't match");
    error=1;
  }
  else if (param->glob_crc != share->state.state.checksum &&
           (share->options &
            (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)))
  {
    _ma_check_print_warning(param,
                            "Record checksum is not the same as checksum "
                            "stored in the index file");
    error=1;
  }
  else if (!extend)
  {
    /*
      Verify the per-key checksums. Fulltext/spatial/R-tree keys are
      excluded as no record-based checksum is computed for them.
    */
    uint key;
    for (key=0 ; key < share->base.keys; key++)
    {
      if (param->tmp_key_crc[key] != param->key_crc[key] &&
          !(share->keyinfo[key].flag &
            (HA_FULLTEXT | HA_SPATIAL | HA_RTREE_INDEX)))
      {
        _ma_check_print_error(param,"Checksum for key: %2d doesn't match "
                              "checksum for records",
                              key+1);
        error=1;
      }
    }
  }

  if (param->del_length != share->state.state.empty)
  {
    _ma_check_print_warning(param,
                            "Found %s deleted space. Should be %s",
                            llstr(param->del_length,llbuff2),
                            llstr(share->state.state.empty,llbuff));
  }
  /* Skip following checks for BLOCK RECORD as they don't make any sense */
  if (share->data_file_type != BLOCK_RECORD)
  {
    if (param->used + param->empty + param->del_length !=
        share->state.state.data_file_length)
    {
      _ma_check_print_warning(param,
                              "Found %s record data and %s unused data and %s "
                              "deleted data",
                              llstr(param->used, llbuff),
                              llstr(param->empty,llbuff2),
                              llstr(param->del_length,llbuff3));
      _ma_check_print_warning(param,
                              "Total %s Should be: %s",
                              llstr((param->used+param->empty +
                                     param->del_length), llbuff),
                              llstr(share->state.state.data_file_length,
                                    llbuff2));
    }
    if (param->del_blocks != share->state.state.del)
    {
      _ma_check_print_warning(param,
                              "Found %10s deleted blocks. Should be: %s",
                              llstr(param->del_blocks,llbuff),
                              llstr(share->state.state.del,llbuff2));
    }
    if (param->splits != share->state.split)
    {
      _ma_check_print_warning(param,
                              "Found %10s parts. Should be: %s",
                              llstr(param->splits, llbuff),
                              llstr(share->state.split,llbuff2));
    }
  }
  if (param->testflag & T_INFO)
  {
    /* Print table statistics collected during the scan */
    if (param->warning_printed || param->error_printed)
      puts("");
    if (param->used != 0 && ! param->error_printed)
    {
      if (param->records)
      {
        printf("Records:%18s M.recordlength:%9lu Packed:%14.0f%%\n",
               llstr(param->records,llbuff),
               (long)((param->used - param->link_used)/param->records),
               (share->base.blobs ? 0.0 :
                (ulonglong2double((ulonglong) share->base.reclength *
                                  param->records)-
                 my_off_t2double(param->used))/
                ulonglong2double((ulonglong) share->base.reclength *
                                 param->records)*100.0));
        printf("Recordspace used:%9.0f%% Empty space:%12d%% "
               "Blocks/Record: %6.2f\n",
               (ulonglong2double(param->used - param->link_used)/
                ulonglong2double(param->used-param->link_used+param->empty) *
                100.0),
               (!param->records ? 100 :
                (int) (ulonglong2double(param->del_length+param->empty)/
                       my_off_t2double(param->used)*100.0)),
               ulonglong2double(param->splits - param->del_blocks) /
               param->records);
      }
      else
        printf("Records:%18s\n", "0");
    }
    printf("Record blocks:%12s Delete blocks:%10s\n",
           llstr(param->splits - param->del_blocks, llbuff),
           llstr(param->del_blocks, llbuff2));
    printf("Record data: %12s Deleted data: %10s\n",
           llstr(param->used - param->link_used,llbuff),
           llstr(param->del_length, llbuff2));
    printf("Empty space: %12s Linkdata: %10s\n",
           llstr(param->empty, llbuff),llstr(param->link_used, llbuff2));
    if (param->lost)
      printf("Lost space: %12s", llstr(param->lost, llbuff));
    if (param->max_found_trid)
    {
      printf("Max trans. id: %11s\n",
             llstr(param->max_found_trid, llbuff));
    }
  }
  my_free(record,MYF(0));
  DBUG_RETURN (error);

err:
  my_free(record,MYF(0));
  /* Suggest a full (non-quick) repair after a failed check */
  param->testflag|=T_RETRY_WITHOUT_QUICK;
  DBUG_RETURN(1);
} /* maria_chk_data_link */
+
+
+/**
+ Prepares a table for a repair or index sort: flushes pages, records durably
+ in the table that it is undergoing the operation (if that op crashes, that
+ info will serve for Recovery and the user).
+
+ If we start overwriting the index file, and crash then, old REDOs will
+ be tried and fail. To prevent that, we bump skip_redo_lsn, and thus we have
+ to flush and sync pages so that old REDOs can be skipped.
+ If this is not a bulk insert, which Recovery can handle gracefully (by
+ truncating files, see UNDO_BULK_INSERT) we also mark the table
+ crashed-on-repair, so that user knows it has to re-repair. If bulk insert we
+ shouldn't mark it crashed-on-repair, because if we did this, the UNDO phase
+ would skip the table (UNDO_BULK_INSERT would not be applied),
+ and maria_chk would not improve that.
+ If this is an OPTIMIZE which merely sorts index, we need to do the same
+ too: old REDOs should not apply to the new index file.
+ Only the flush is needed when in maria_chk which is not crash-safe.
+
+ @param info table
+ @param param repair parameters
+ @param discard_index if index pages can be thrown away
+*/
+
static my_bool protect_against_repair_crash(MARIA_HA *info,
                                            const HA_CHECK *param,
                                            my_bool discard_index)
{
  MARIA_SHARE *share= info->s;

  /*
    There are other than recovery-related reasons to do the writes below:
    - the physical size of the data file is sometimes used during repair: we
    need to flush to have it exact
    - we flush the state because maria_open(HA_OPEN_COPY) will want to read
    it from disk.
  */
  if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                            FLUSH_FORCE_WRITE,
                            discard_index ? FLUSH_IGNORE_CHANGED :
                            FLUSH_FORCE_WRITE) ||
      (share->changed &&
       _ma_state_info_write(share,
                            MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
                            MA_STATE_INFO_WRITE_FULL_INFO |
                            MA_STATE_INFO_WRITE_LOCK)))
    return TRUE;
  /* In maria_chk this is not needed: */
  if (maria_multi_threaded && share->base.born_transactional)
  {
    if ((param->testflag & T_NO_CREATE_RENAME_LSN) == 0)
    {
      /* this can be true only for a transactional table */
      /* Durably mark the table as being repaired, for Recovery and user */
      maria_mark_in_repair(info);
      if (_ma_state_info_write(share,
                               MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
                               MA_STATE_INFO_WRITE_LOCK))
        return TRUE;
    }
    /*
      Move the table's LSNs forward so REDOs written before this repair
      are skipped by Recovery (see function's header comment).
    */
    if (translog_status == TRANSLOG_OK &&
        _ma_update_state_lsns(share, translog_get_horizon(),
                              share->state.create_trid, FALSE, FALSE))
      return TRUE;
    /* Make all of the above durable on disk */
    if (_ma_sync_table_files(info))
      return TRUE;
  }
  return FALSE;
}
+
+
+/**
+ @brief Initialize variables for repair
+*/
+
static int initialize_variables_for_repair(HA_CHECK *param,
                                           MARIA_SORT_INFO *sort_info,
                                           MARIA_SORT_PARAM *sort_param,
                                           MARIA_HA *info,
                                           my_bool rep_quick,
                                           MARIA_SHARE *org_share)
{
  MARIA_SHARE *share= info->s;

  /* To allow us to restore state and check how state changed */
  memcpy(org_share, share, sizeof(*share));

  /* Repair code relies on share->state.state so we have to update it here */
  if (share->lock.update_status)
    (*share->lock.update_status)(info);

  bzero((char*) sort_info, sizeof(*sort_info));
  bzero((char*) sort_param, sizeof(*sort_param));

  param->testflag|= T_REP; /* for easy checking */
  if (share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
    param->testflag|= T_CALC_CHECKSUM;
  param->glob_crc= 0;
  if (rep_quick)
    param->testflag|= T_QUICK;
  else
    param->testflag&= ~T_QUICK;
  /* Remember which keys were enabled before repair */
  param->org_key_map= share->state.key_map;

  sort_param->sort_info= sort_info;
  /* A quick repair keeps the data file; otherwise it is rebuilt */
  sort_param->fix_datafile= ! rep_quick;
  sort_param->calc_checksum= test(param->testflag & T_CALC_CHECKSUM);
  sort_info->info= sort_info->new_info= info;
  sort_info->param= param;
  set_data_file_type(sort_info, info->s);
  sort_info->org_data_file_type= share->data_file_type;

  bzero(&info->rec_cache, sizeof(info->rec_cache));
  info->rec_cache.file= info->dfile.file;
  info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);

  /* Flush and durably record that a repair is in progress */
  if (protect_against_repair_crash(info, param, !test(param->testflag &
                                                      T_CREATE_MISSING_KEYS)))
    return 1;

  /* calculate max_records */
  sort_info->filelength= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0));
  if ((param->testflag & T_CREATE_MISSING_KEYS) ||
      sort_info->org_data_file_type == COMPRESSED_RECORD)
    sort_info->max_records= share->state.state.records;
  else
  {
    /* Upper bound: file length divided by the minimal possible row length */
    ulong rec_length;
    rec_length= max(share->base.min_pack_length,
                    share->base.min_block_length);
    sort_info->max_records= (ha_rows) (sort_info->filelength / rec_length);
  }

  /* Set up transaction handler so that we can see all rows */
  if (param->max_trid == 0)
  {
    if (!ma_control_file_inited())
      param->max_trid= 0; /* Give warning for first trid found */
    else
      param->max_trid= max_trid_in_system();
  }
  maria_ignore_trids(info);
  /* Don't write transid's during repair */
  maria_versioning(info, 0);
  return 0;
}
+
+
+/*
+ During initialize_variables_for_repair and related functions we set some
+ variables to values that make sense during repair.
+ This function restores these values to their original values so that we can
+ use the handler in MariaDB without having to close and open the table.
+*/
+
+static void restore_table_state_after_repair(MARIA_HA *info,
+ MARIA_SHARE *org_share)
+{
+ maria_versioning(info, info->s->have_versioning);
+ info->s->lock_key_trees= org_share->lock_key_trees;
+}
+
+
+
+
+/**
+ @brief Drop all indexes
+
+ @param[in] param check parameters
+ @param[in] info MARIA_HA handle
+ @param[in] force if to force drop all indexes
+
+ @return status
+ @retval 0 OK
+ @retval != 0 Error
+
+ @note
+ Once allocated, index blocks remain part of the key file forever.
+ When indexes are disabled, no block is freed. When enabling indexes,
+ no block is freed either. The new indexes are created from new
+ blocks. (Bug #4692)
+
+ Before recreating formerly disabled indexes, the unused blocks
+ must be freed. There are two options to do this:
+ - Follow the tree of disabled indexes, add all blocks to the
+ deleted blocks chain. Would require a lot of random I/O.
+ - Drop all blocks by clearing all index root pointers and all
+ delete chain pointers and resetting key_file_length to the end
+ of the index file header. This requires to recreate all indexes,
+ even those that may still be intact.
+ The second method is probably faster in most cases.
+
+ When disabling indexes, MySQL disables either all indexes or all
+ non-unique indexes. When MySQL [re-]enables disabled indexes
+ (T_CREATE_MISSING_KEYS), then we either have "lost" blocks in the
+ index file, or there are no non-unique indexes. In the latter case,
+ maria_repair*() would not be called as there would be no disabled
+ indexes.
+
+ If there would be more unique indexes than disabled (non-unique)
+ indexes, we could do the first method. But this is not implemented
+ yet. By now we drop and recreate all indexes when repair is called.
+
+ However, there is an exception. Sometimes MySQL disables non-unique
+ indexes when the table is empty (e.g. when copying a table in
+ mysql_alter_table()). When enabling the non-unique indexes, they
+ are still empty. So there is no index block that can be lost. This
+ optimization is implemented in this function.
+
+ Note that in normal repair (T_CREATE_MISSING_KEYS not set) we
+ recreate all enabled indexes unconditionally. We do not change the
+ key_map. Otherwise we invert the key map temporarily (outside of
+ this function) and recreate the then "seemingly" enabled indexes.
+ When we cannot use the optimization, and drop all indexes, we
+ pretend that all indexes were disabled. By the inversion, we will
+ then recreate all indexes.
+*/
+
static int maria_drop_all_indexes(HA_CHECK *param, MARIA_HA *info,
                                  my_bool force)
{
  MARIA_SHARE *share= info->s;
  MARIA_STATE_INFO *state= &share->state;
  uint i;
  DBUG_ENTER("maria_drop_all_indexes");

  /*
    If any of the disabled indexes has a key block assigned, we must
    drop and recreate all indexes to avoid losing index blocks.

    If we want to recreate disabled indexes only _and_ all of these
    indexes are empty, we don't need to recreate the existing indexes.
  */
  if (!force && (param->testflag & T_CREATE_MISSING_KEYS))
  {
    DBUG_PRINT("repair", ("creating missing indexes"));
    for (i= 0; i < share->base.keys; i++)
    {
      DBUG_PRINT("repair", ("index #: %u key_root: 0x%lx active: %d",
                            i, (long) state->key_root[i],
                            maria_is_key_active(state->key_map, i)));
      if ((state->key_root[i] != HA_OFFSET_ERROR) &&
          !maria_is_key_active(state->key_map, i))
      {
        /*
          This index has at least one key block and it is disabled.
          We would lose its block(s) if would just recreate it.
          So we need to drop and recreate all indexes.
        */
        DBUG_PRINT("repair", ("nonempty and disabled: recreate all"));
        break;
      }
    }
    /* Loop ran to completion: all disabled indexes are empty, nothing to drop */
    if (i >= share->base.keys)
      goto end;

    /*
      We do now drop all indexes and declare them disabled. With the
      T_CREATE_MISSING_KEYS flag, maria_repair*() will recreate all
      disabled indexes and enable them.
    */
    maria_clear_all_keys_active(state->key_map);
    DBUG_PRINT("repair", ("declared all indexes disabled"));
  }

  /* Clear index root block pointers. */
  for (i= 0; i < share->base.keys; i++)
    state->key_root[i]= HA_OFFSET_ERROR;

  /* Drop the delete chain. */
  share->state.key_del= HA_OFFSET_ERROR;

  /* Reset index file length to end of index file header. */
  share->state.state.key_file_length= share->base.keystart;

end:
  /* Currently this function cannot fail; the int return is kept for callers */
  DBUG_RETURN(0);
}
+
+
+/*
+ Recover old table by reading each record and writing all keys
+
+ NOTES
+ Save new datafile-name in temp_filename.
+ We overwrite the index file as we go (writekeys() for example), so if we
+ crash during this the table is unusable and user (or Recovery in the
+ future) must repeat the REPAIR/OPTIMIZE operation. We could use a
+ temporary index file in the future (drawback: more disk space).
+
+ IMPLEMENTATION (for hard repair with block format)
+ - Create new, unrelated MARIA_HA of the table
+ - Create new datafile and associate it with new handler
+ - Reset all statistic information in new handler
+ - Copy all data to new handler with normal write operations
+ - Move state of new handler to old handler
+ - Close new handler
+ - Close data file in old handler
+ - Rename old data file to new data file.
+ - Reopen data file in old handler
+*/
+
int maria_repair(HA_CHECK *param, register MARIA_HA *info,
                 char *name, my_bool rep_quick)
{
  int error, got_error;
  ha_rows start_records,new_header_length;
  my_off_t del;
  File new_file;
  MARIA_SHARE *share= info->s;
  char llbuff[22],llbuff2[22];
  MARIA_SORT_INFO sort_info;
  MARIA_SORT_PARAM sort_param;
  my_bool block_record, scan_inited= 0, reenable_logging= 0;
  enum data_file_type org_data_file_type= share->data_file_type;
  myf sync_dir= ((share->now_transactional && !share->temporary) ?
                 MY_SYNC_DIR : 0);
  MARIA_SHARE backup_share;
  DBUG_ENTER("maria_repair");

  got_error= 1;
  new_file= -1;
  start_records= share->state.state.records;
  if (!(param->testflag & T_SILENT))
  {
    printf("- recovering (with keycache) Aria-table '%s'\n",name);
    printf("Data records: %s\n", llstr(start_records, llbuff));
  }

  if (initialize_variables_for_repair(param, &sort_info, &sort_param, info,
                                      rep_quick, &backup_share))
    goto err;

  /* Suspend transactional logging while rewriting; re-enabled at the end */
  if ((reenable_logging= share->now_transactional))
    _ma_tmp_disable_logging_for_table(info, 0);

  sort_param.current_filepos= sort_param.filepos= new_header_length=
    ((param->testflag & T_UNPACK) ? 0L : share->pack.header_length);

  if (!rep_quick)
  {
    /* Get real path for data file */
    if ((new_file= my_create(fn_format(param->temp_filename,
                                       share->data_file_name.str, "",
                                       DATA_TMP_EXT, 2+4),
                             0,param->tmpfile_createflag,
                             MYF(0))) < 0)
    {
      _ma_check_print_error(param,"Can't create new tempfile: '%s'",
                            param->temp_filename);
      goto err;
    }
    if (new_header_length &&
        maria_filecopy(param, new_file, info->dfile.file, 0L,
                       new_header_length, "datafile-header"))
      goto err;
    share->state.dellink= HA_OFFSET_ERROR;
    info->rec_cache.file= new_file; /* For sort_delete_record */
    if (share->data_file_type == BLOCK_RECORD ||
        (param->testflag & T_UNPACK))
    {
      /* Hard repair with block format: copy rows through a new handler */
      if (create_new_data_handle(&sort_param, new_file))
        goto err;
      sort_info.new_info->rec_cache.file= new_file;
    }
  }

  block_record= sort_info.new_info->s->data_file_type == BLOCK_RECORD;

  if (org_data_file_type != BLOCK_RECORD)
  {
    /* We need a read buffer to read rows in big blocks */
    if (init_io_cache(&param->read_cache, info->dfile.file,
                      (uint) param->read_buffer_length,
                      READ_CACHE, share->pack.header_length, 1, MYF(MY_WME)))
      goto err;
  }
  if (sort_info.new_info->s->data_file_type != BLOCK_RECORD)
  {
    /* When writing to not block records, we need a write buffer */
    if (!rep_quick)
    {
      if (init_io_cache(&sort_info.new_info->rec_cache, new_file,
                        (uint) param->write_buffer_length,
                        WRITE_CACHE, new_header_length, 1,
                        MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw))
        goto err;
      sort_info.new_info->opt_flag|=WRITE_CACHE_USED;
    }
  }
  else if (block_record)
  {
    /* Block format reads its rows through the scan interface */
    scan_inited= 1;
    if (maria_scan_init(sort_info.info))
      goto err;
  }

  if (!(sort_param.record=
        (uchar *) my_malloc((uint)
                            share->base.default_rec_buff_size, MYF(0))) ||
      _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size,
                       share->base.default_rec_buff_size))
  {
    _ma_check_print_error(param, "Not enough memory for extra record");
    goto err;
  }

  sort_param.read_cache=param->read_cache;
  sort_param.pos=sort_param.max_pos=share->pack.header_length;
  param->read_cache.end_of_file= sort_info.filelength;
  sort_param.master=1;
  sort_info.max_records= ~(ha_rows) 0;

  /* Reset state counters; they are rebuilt while copying rows below */
  del= share->state.state.del;
  share->state.state.records= share->state.state.del= share->state.split= 0;
  share->state.state.empty= 0;

  if (param->testflag & T_CREATE_MISSING_KEYS)
    maria_set_all_keys_active(share->state.key_map, share->base.keys);
  maria_drop_all_indexes(param, info, TRUE);

  maria_lock_memory(param); /* Everything is alloced */

  /* Re-create all keys, which are set in key_map. */
  while (!(error=sort_get_next_record(&sort_param)))
  {
    /* For block format the row must be written before its keys */
    if (block_record && _ma_sort_write_record(&sort_param))
      goto err;

    if (writekeys(&sort_param))
    {
      if (my_errno != HA_ERR_FOUND_DUPP_KEY)
        goto err;
      DBUG_DUMP("record", sort_param.record,
                share->base.default_rec_buff_size);
      _ma_check_print_warning(param,
                              "Duplicate key %2d for record at %10s against "
                              "new record at %10s",
                              info->errkey+1,
                              llstr(sort_param.current_filepos, llbuff),
                              llstr(info->dup_key_pos,llbuff2));
      if (param->testflag & T_VERBOSE)
      {
        MARIA_KEY tmp_key;
        MARIA_KEYDEF *keyinfo= share->keyinfo + info->errkey;
        (*keyinfo->make_key)(info, &tmp_key, (uint) info->errkey,
                             info->lastkey_buff,
                             sort_param.record, 0L, 0);
        _ma_print_key(stdout, &tmp_key);
      }
      sort_info.dupp++;
      if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK)
      {
        param->testflag|=T_RETRY_WITHOUT_QUICK;
        param->error_printed=1;
        goto err;
      }
      /* purecov: begin tested */
      if (block_record)
      {
        /* Undo the row written above; its keys were rejected as duplicates */
        sort_info.new_info->s->state.state.records--;
        if ((*sort_info.new_info->s->write_record_abort)(sort_info.new_info))
        {
          _ma_check_print_error(param,"Couldn't delete duplicate row");
          goto err;
        }
      }
      /* purecov: end */
      continue;
    }
    if (!block_record)
    {
      if (_ma_sort_write_record(&sort_param))
        goto err;
      /* Filepos is pointer to where next row will be stored */
      sort_param.current_filepos= sort_param.filepos;
    }
  }
  /* error > 0 means a real read error (not just end of file) */
  if (error > 0 || maria_write_data_suffix(&sort_info, !rep_quick) ||
      flush_io_cache(&sort_info.new_info->rec_cache) ||
      param->read_cache.error < 0)
    goto err;

  if (param->testflag & T_WRITE_LOOP)
  {
    VOID(fputs(" \r",stdout)); VOID(fflush(stdout));
  }
  /* Truncate index file to the part that was actually written */
  if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0)))
  {
    _ma_check_print_warning(param,
                            "Can't change size of indexfile, error: %d",
                            my_errno);
    goto err;
  }

  if (rep_quick && del+sort_info.dupp != share->state.state.del)
  {
    _ma_check_print_error(param,"Couldn't fix table with quick recovery: "
                          "Found wrong number of deleted records");
    _ma_check_print_error(param,"Run recovery again without -q");
    param->retry_repair=1;
    param->testflag|=T_RETRY_WITHOUT_QUICK;
    goto err;
  }

  if (param->testflag & T_SAFE_REPAIR)
  {
    /* Don't repair if we lost more than one row */
    if (sort_info.new_info->s->state.state.records+1 < start_records)
    {
      share->state.state.records= start_records;
      goto err;
    }
  }

  VOID(end_io_cache(&sort_info.new_info->rec_cache));
  info->opt_flag&= ~WRITE_CACHE_USED;

  /*
    As we have read the data file (sort_get_next_record()) we may have
    cached, non-changed blocks of it in the page cache. We must throw them
    away as we are going to close their descriptor ('new_file'). We also want
    to flush any index block, so that it is ready for the upcoming sync.
  */
  if (_ma_flush_table_files_before_swap(param, info))
    goto err;

  if (!rep_quick)
  {
    sort_info.new_info->s->state.state.data_file_length= sort_param.filepos;
    if (sort_info.new_info != sort_info.info)
    {
      /* Move state from the temporary handler to ours before closing it */
      MARIA_STATE_INFO save_state= sort_info.new_info->s->state;
      if (maria_close(sort_info.new_info))
      {
        _ma_check_print_error(param, "Got error %d on close", my_errno);
        goto err;
      }
      copy_data_file_state(&share->state, &save_state);
      new_file= -1;
      sort_info.new_info= info;
    }
    share->state.version=(ulong) time((time_t*) 0); /* Force reopen */

    /* Replace the actual file with the temporary file */
    if (new_file >= 0)
      my_close(new_file, MYF(MY_WME));
    new_file= -1;
    change_data_file_descriptor(info, -1);
    if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT,
                                DATA_TMP_EXT,
                                (param->testflag & T_BACKUP_DATA ?
                                 MYF(MY_REDEL_MAKE_BACKUP): MYF(0)) |
                                sync_dir) ||
        _ma_open_datafile(info, share, NullS, -1))
    {
      goto err;
    }
  }
  else
  {
    share->state.state.data_file_length= sort_param.max_pos;
  }
  if (param->testflag & T_CALC_CHECKSUM)
    share->state.state.checksum= param->glob_crc;

  if (!(param->testflag & T_SILENT))
  {
    if (start_records != share->state.state.records)
      printf("Data records: %s\n", llstr(share->state.state.records,llbuff));
  }
  if (sort_info.dupp)
    _ma_check_print_warning(param,
                            "%s records have been removed",
                            llstr(sort_info.dupp,llbuff));

  got_error= 0;
  /* If invoked by external program that uses thr_lock */
  if (&share->state.state != info->state)
    *info->state= *info->state_start= share->state.state;

err:
  if (scan_inited)
    maria_scan_end(sort_info.info);
  _ma_reset_state(info);

  VOID(end_io_cache(&param->read_cache));
  VOID(end_io_cache(&sort_info.new_info->rec_cache));
  info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
  sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
  /* this below could fail, shouldn't we detect error? */
  if (got_error)
  {
    if (! param->error_printed)
      _ma_check_print_error(param,"%d for record at pos %s",my_errno,
                            llstr(sort_param.start_recpos,llbuff));
    (void)_ma_flush_table_files_before_swap(param, info);
    if (sort_info.new_info && sort_info.new_info != sort_info.info)
    {
      unuse_data_file_descriptor(sort_info.new_info);
      maria_close(sort_info.new_info);
    }
    if (new_file >= 0)
    {
      /* Remove the half-built temporary data file */
      VOID(my_close(new_file,MYF(0)));
      VOID(my_delete(param->temp_filename, MYF(MY_WME)));
    }
    maria_mark_crashed_on_repair(info);
  }
  /* If caller had disabled logging it's not up to us to re-enable it */
  if (reenable_logging)
    _ma_reenable_logging_for_table(info, FALSE);
  restore_table_state_after_repair(info, &backup_share);

  my_free(sort_param.rec_buff, MYF(MY_ALLOW_ZERO_PTR));
  my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR));
  my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
  if (!got_error && (param->testflag & T_UNPACK))
    restore_data_file_type(share);
  share->state.changed|= (STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES |
                          STATE_NOT_ANALYZED | STATE_NOT_ZEROFILLED);
  if (!rep_quick)
    share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_MOVABLE);
  DBUG_RETURN(got_error);
}
+
+
+/* Update keyfile when doing repair */
+
/*
  Insert all active keys for the current record into the index file.

  On a duplicate-key error all keys inserted so far for this record are
  removed again, and the checksum contribution of the record is backed
  out of glob_crc. Returns 0 on success, -1 on error (my_errno set).
*/

static int writekeys(MARIA_SORT_PARAM *sort_param)
{
  uint i;
  MARIA_HA *info= sort_param->sort_info->info;
  MARIA_SHARE *share= info->s;
  uchar *record= sort_param->record;
  uchar *key_buff;
  my_off_t filepos= sort_param->current_filepos;
  MARIA_KEY key;
  DBUG_ENTER("writekeys");

  /* Scratch area after lastkey_buff used for building the key values */
  key_buff= info->lastkey_buff+share->base.max_key_length;

  for (i=0 ; i < share->base.keys ; i++)
  {
    if (maria_is_key_active(share->state.key_map, i))
    {
      if (share->keyinfo[i].flag & HA_FULLTEXT )
      {
        if (_ma_ft_add(info, i, key_buff, record, filepos))
          goto err;
      }
      else
      {
        if (!(*share->keyinfo[i].make_key)(info, &key, i, key_buff, record,
                                           filepos, 0))
          goto err;
        if ((*share->keyinfo[i].ck_insert)(info, &key))
          goto err;
      }
    }
  }
  DBUG_RETURN(0);

 err:
  if (my_errno == HA_ERR_FOUND_DUPP_KEY)
  {
    /* Roll back the keys already inserted for this record */
    info->errkey=(int) i; /* This key was found */
    while ( i-- > 0 )
    {
      if (maria_is_key_active(share->state.key_map, i))
      {
        if (share->keyinfo[i].flag & HA_FULLTEXT)
        {
          if (_ma_ft_del(info,i,key_buff,record,filepos))
            break;
        }
        else
        {
          (*share->keyinfo[i].make_key)(info, &key, i, key_buff, record,
                                        filepos, 0);
          if (_ma_ck_delete(info, &key))
            break;
        }
      }
    }
  }
  /* Remove checksum that was added to glob_crc in sort_get_next_record */
  if (sort_param->calc_checksum)
    sort_param->sort_info->param->glob_crc-= info->cur_row.checksum;
  DBUG_PRINT("error",("errno: %d",my_errno));
  DBUG_RETURN(-1);
} /* writekeys */
+
+
+ /* Change all key-pointers that point to a record */
+
/*
  Update every active key (except 'prot_key') so that entries pointing
  at row position 'oldpos' point at 'newpos' instead.

  For unique keys the row pointer is patched directly in the found key
  page; for non-unique keys the old key is deleted and re-inserted with
  the new position. Returns 0 on success, -1 on error.
*/
int maria_movepoint(register MARIA_HA *info, uchar *record,
                    MARIA_RECORD_POS oldpos, MARIA_RECORD_POS newpos,
                    uint prot_key)
{
  uint i;
  uchar *key_buff;
  MARIA_SHARE *share= info->s;
  MARIA_PAGE page;
  DBUG_ENTER("maria_movepoint");

  /* Scratch area after lastkey_buff used for building the key values */
  key_buff= info->lastkey_buff + share->base.max_key_length;
  for (i=0 ; i < share->base.keys; i++)
  {
    if (i != prot_key && maria_is_key_active(share->state.key_map, i))
    {
      MARIA_KEY key;
      (*share->keyinfo[i].make_key)(info, &key, i, key_buff, record, oldpos,
                                    0);
      if (key.keyinfo->flag & HA_NOSAME)
      { /* Change pointer direct */
        MARIA_KEYDEF *keyinfo;
        keyinfo=share->keyinfo+i;
        if (_ma_search(info, &key, (uint32) (SEARCH_SAME | SEARCH_SAVE_BUFF),
                       share->state.key_root[i]))
          DBUG_RETURN(-1);
        _ma_page_setup(&page, info, keyinfo, info->last_keypage,
                       info->keyread_buff);

        /* Overwrite the row pointer stored just before int_keypos */
        _ma_dpointer(share, info->int_keypos - page.node -
                     share->rec_reflength,newpos);

        if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_UNLOCKED,
                              DFLT_INIT_HITS))
          DBUG_RETURN(-1);
      }
      else
      { /* Change old key to new */
        if (_ma_ck_delete(info, &key))
          DBUG_RETURN(-1);
        (*share->keyinfo[i].make_key)(info, &key, i, key_buff, record, newpos,
                                      0);
        if (_ma_ck_write(info, &key))
          DBUG_RETURN(-1);
      }
    }
  }
  DBUG_RETURN(0);
} /* maria_movepoint */
+
+
+ /* Tell system that we want all memory for our cache */
+
+void maria_lock_memory(HA_CHECK *param __attribute__((unused)))
+{
+#ifdef SUN_OS /* Key-cacheing thrases on sun 4.1 */
+ if (param->opt_maria_lock_memory)
+ {
+ int success = mlockall(MCL_CURRENT); /* or plock(DATLOCK); */
+ if (geteuid() == 0 && success != 0)
+ _ma_check_print_warning(param,
+ "Failed to lock memory. errno %d",my_errno);
+ }
+#endif
+} /* maria_lock_memory */
+
+
+/**
+ Flush all changed blocks to disk.
+
+ We release blocks as it's unlikely that they would all be needed soon.
+ This function needs to be called before swapping data or index files or
+ syncing them.
+
+ @param param description of the repair operation
+ @param info table
+*/
+
+static my_bool _ma_flush_table_files_before_swap(HA_CHECK *param,
+ MARIA_HA *info)
+{
+ DBUG_ENTER("_ma_flush_table_files_before_swap");
+ if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_RELEASE, FLUSH_RELEASE))
+ {
+ _ma_check_print_error(param, "%d when trying to write buffers", my_errno);
+ DBUG_RETURN(TRUE);
+ }
+ DBUG_RETURN(FALSE);
+}
+
+
+ /* Sort index for more efficient reads */
+
int maria_sort_index(HA_CHECK *param, register MARIA_HA *info, char *name)
{
  reg2 uint key;
  reg1 MARIA_KEYDEF *keyinfo;
  File new_file;
  my_off_t index_pos[HA_MAX_POSSIBLE_KEY];
  uint r_locks,w_locks;
  int old_lock;
  MARIA_SHARE *share= info->s;
  MARIA_STATE_INFO old_state;
  myf sync_dir= ((share->now_transactional && !share->temporary) ?
                 MY_SYNC_DIR : 0);
  DBUG_ENTER("maria_sort_index");

  /* cannot sort index files with R-tree indexes */
  for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ;
       key++,keyinfo++)
    if (keyinfo->key_alg == HA_KEY_ALG_RTREE)
      DBUG_RETURN(0);

  if (!(param->testflag & T_SILENT))
    printf("- Sorting index for Aria-table '%s'\n",name);

  /* Record durably that the index file is being rewritten */
  if (protect_against_repair_crash(info, param, FALSE))
    DBUG_RETURN(1);

  /* Get real path for index file */
  fn_format(param->temp_filename,name,"", MARIA_NAME_IEXT,2+4+32);
  /*
    NOTE(review): '<= 0' also treats fd 0 as a failure, while maria_repair()
    uses '< 0' for the same my_create call -- confirm this is intentional.
  */
  if ((new_file=my_create(fn_format(param->temp_filename,param->temp_filename,
                                    "", INDEX_TMP_EXT,2+4),
                          0,param->tmpfile_createflag,MYF(0))) <= 0)
  {
    _ma_check_print_error(param,"Can't create new tempfile: '%s'",
                          param->temp_filename);
    DBUG_RETURN(-1);
  }
  if (maria_filecopy(param, new_file, share->kfile.file, 0L,
                     (ulong) share->base.keystart, "headerblock"))
    goto err;

  /* Copy each key tree depth-first into the new file */
  param->new_file_pos=share->base.keystart;
  for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ;
       key++,keyinfo++)
  {
    if (! maria_is_key_active(share->state.key_map, key))
      continue;

    if (share->state.key_root[key] != HA_OFFSET_ERROR)
    {
      index_pos[key]=param->new_file_pos; /* Write first block here */
      if (sort_one_index(param,info,keyinfo,share->state.key_root[key],
                         new_file))
        goto err;
    }
    else
      index_pos[key]= HA_OFFSET_ERROR; /* No blocks */
  }

  /* Flush key cache for this file if we are calling this outside maria_chk */
  flush_pagecache_blocks(share->pagecache, &share->kfile,
                         FLUSH_IGNORE_CHANGED);

  share->state.version=(ulong) time((time_t*) 0);
  old_state= share->state; /* save state if not stored */
  r_locks= share->r_locks;
  w_locks= share->w_locks;
  old_lock= info->lock_type;

  /* Put same locks as old file */
  share->r_locks= share->w_locks= share->tot_locks= 0;
  (void) _ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE);
  pthread_mutex_lock(&share->intern_lock);
  VOID(my_close(share->kfile.file, MYF(MY_WME)));
  share->kfile.file = -1;
  pthread_mutex_unlock(&share->intern_lock);
  VOID(my_close(new_file,MYF(MY_WME)));
  /* Atomically swap the temporary index file in and reopen it */
  if (maria_change_to_newfile(share->index_file_name.str, MARIA_NAME_IEXT,
                              INDEX_TMP_EXT, sync_dir) ||
      _ma_open_keyfile(share))
    goto err2;
  info->lock_type= F_UNLCK; /* Force maria_readinfo to lock */
  _ma_readinfo(info,F_WRLCK,0); /* Will lock the table */
  info->lock_type= old_lock;
  share->r_locks= r_locks;
  share->w_locks= w_locks;
  share->tot_locks= r_locks+w_locks;
  share->state= old_state; /* Restore old state */

  /* Point the restored state at the freshly written key blocks */
  share->state.state.key_file_length=param->new_file_pos;
  info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
  for (key=0 ; key < share->base.keys ; key++)
    share->state.key_root[key]=index_pos[key];
  /* The new file has no free-block chain */
  share->state.key_del= HA_OFFSET_ERROR;

  share->state.changed&= ~STATE_NOT_SORTED_PAGES;
  DBUG_EXECUTE_IF("maria_flush_whole_log",
                  {
                    DBUG_PRINT("maria_flush_whole_log", ("now"));
                    translog_flush(translog_get_horizon());
                  });
  DBUG_EXECUTE_IF("maria_crash_sort_index",
                  {
                    DBUG_PRINT("maria_crash_sort_index", ("now"));
                    DBUG_ABORT();
                  });
  DBUG_RETURN(0);

err:
  VOID(my_close(new_file,MYF(MY_WME)));
err2:
  VOID(my_delete(param->temp_filename,MYF(MY_WME)));
  DBUG_RETURN(-1);
} /* maria_sort_index */
+
+
+/**
+ @brief put CRC on the page
+
+ @param buff reference on the page buffer.
+ @param pos position of the page in the file.
+ @param length length of the page
+*/
+
+static void put_crc(uchar *buff, my_off_t pos, MARIA_SHARE *share)
+{
+ maria_page_crc_set_index(buff, (pgcache_page_no_t) (pos / share->block_size),
+ (uchar*) share);
+}
+
+
/*
  Sort index blocks recursive using one index

  Walks one key tree depth-first, copying each page to the new index file
  at sequentially allocated positions (param->new_file_pos) so that the
  rebuilt index is stored in key order.  Child page pointers (and, for
  fulltext keys, second-level ft2 subtree pointers) are patched in the
  in-memory copy before it is written out.

  @return 0 ok, != 0 error (error already reported via param)
*/

static int sort_one_index(HA_CHECK *param, MARIA_HA *info,
                          MARIA_KEYDEF *keyinfo,
                          my_off_t pagepos, File new_file)
{
  uint length,nod_flag;
  uchar *buff,*keypos,*endpos;
  my_off_t new_page_pos,next_page;
  MARIA_SHARE *share= info->s;
  MARIA_KEY key;
  MARIA_PAGE page;
  DBUG_ENTER("sort_one_index");

  /* cannot walk over R-tree indices */
  DBUG_ASSERT(keyinfo->key_alg != HA_KEY_ALG_RTREE);
  /* Reserve the destination position for this page before recursing */
  new_page_pos=param->new_file_pos;
  param->new_file_pos+=keyinfo->block_length;
  key.keyinfo= keyinfo;

  /* One allocation holds both the page image and a key unpack buffer */
  if (!(buff= (uchar*) my_alloca((uint) keyinfo->block_length +
                                 keyinfo->maxlength)))
  {
    _ma_check_print_error(param,"Not enough memory for key block");
    DBUG_RETURN(-1);
  }
  key.data= buff + keyinfo->block_length;

  if (_ma_fetch_keypage(&page, info, keyinfo, pagepos,
                        PAGECACHE_LOCK_LEFT_UNLOCKED,
                        DFLT_INIT_HITS, buff, 0))
  {
    report_keypage_fault(param, info, pagepos);
    goto err;
  }

  /* Only node pages (or fulltext pages, which may hide ft2 subtrees)
     contain pointers that need rewriting */
  if ((nod_flag= page.node) || keyinfo->flag & HA_FULLTEXT)
  {
    keypos= page.buff + share->keypage_header + nod_flag;
    endpos= page.buff + page.size;

    for ( ;; )
    {
      if (nod_flag)
      {
        next_page= _ma_kpos(nod_flag,keypos);
        /* Save new pos */
        _ma_kpointer(info,keypos-nod_flag,param->new_file_pos);
        if (sort_one_index(param,info,keyinfo,next_page,new_file))
        {
          DBUG_PRINT("error",
                     ("From page: %ld, keyoffset: %lu used_length: %d",
                      (ulong) pagepos, (ulong) (keypos - buff),
                      (int) page.size));
          DBUG_DUMP("buff", page.buff, page.size);
          goto err;
        }
      }
      if (keypos >= endpos ||
          !(*keyinfo->get_key)(&key, page.flag, nod_flag, &keypos))
        break;
      DBUG_ASSERT(keypos <= endpos);
      if (keyinfo->flag & HA_FULLTEXT)
      {
        uint off;
        int subkeys;
        get_key_full_length_rdonly(off, key.data);
        subkeys= ft_sintXkorr(key.data + off);
        /* Negative subkeys means the entry points to an ft2 subtree */
        if (subkeys < 0)
        {
          next_page= _ma_row_pos_from_key(&key);
          _ma_dpointer(share, keypos - nod_flag - share->rec_reflength,
                       param->new_file_pos); /* Save new pos */
          if (sort_one_index(param,info,&share->ft2_keyinfo,
                             next_page,new_file))
            goto err;
        }
      }
    }
  }

  /* Fill block with zero and write it to the new index file */
  length= page.size;
  bzero(buff+length,keyinfo->block_length-length);
  put_crc(buff, new_page_pos, share);
  if (my_pwrite(new_file, buff,(uint) keyinfo->block_length,
                new_page_pos,MYF(MY_NABP | MY_WAIT_IF_FULL)))
  {
    _ma_check_print_error(param,"Can't write indexblock, error: %d",my_errno);
    goto err;
  }
  my_afree(buff);
  DBUG_RETURN(0);
err:
  my_afree(buff);
  DBUG_RETURN(1);
} /* sort_one_index */
+
+
+/**
+ @brief Fill empty space in index file with zeroes
+
+ @return
+ @retval 0 Ok
+ @retval 1 Error
+*/
+
+static my_bool maria_zerofill_index(HA_CHECK *param, MARIA_HA *info,
+ const char *name)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_PINNED_PAGE page_link;
+ char llbuff[21];
+ uchar *buff;
+ pgcache_page_no_t page;
+ my_off_t pos;
+ my_off_t key_file_length= share->state.state.key_file_length;
+ uint block_size= share->block_size;
+ my_bool zero_lsn= (share->base.born_transactional &&
+ !(param->testflag & T_ZEROFILL_KEEP_LSN));
+ DBUG_ENTER("maria_zerofill_index");
+
+ if (!(param->testflag & T_SILENT))
+ printf("- Zerofilling index for Aria-table '%s'\n",name);
+
+ /* Go through the index file */
+ for (pos= share->base.keystart, page= (ulonglong) (pos / block_size);
+ pos < key_file_length;
+ pos+= block_size, page++)
+ {
+ uint length;
+ if (!(buff= pagecache_read(share->pagecache,
+ &share->kfile, page,
+ DFLT_INIT_HITS, 0,
+ PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+ &page_link.link)))
+ {
+ pagecache_unlock_by_link(share->pagecache, page_link.link,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+ LSN_IMPOSSIBLE, 0, FALSE);
+ _ma_check_print_error(param,
+ "Page %9s: Got error %d when reading index file",
+ llstr(pos, llbuff), my_errno);
+ DBUG_RETURN(1);
+ }
+ if (zero_lsn)
+ bzero(buff, LSN_SIZE);
+
+ if (share->base.born_transactional)
+ {
+ uint keynr= _ma_get_keynr(share, buff);
+ if (keynr != MARIA_DELETE_KEY_NR)
+ {
+ MARIA_PAGE page;
+ DBUG_ASSERT(keynr < share->base.keys);
+
+ _ma_page_setup(&page, info, share->keyinfo + keynr, pos, buff);
+ if (_ma_compact_keypage(&page, ~(TrID) 0))
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Got error %d when reading index "
+ "file",
+ llstr(pos, llbuff), my_errno);
+ DBUG_RETURN(1);
+ }
+ }
+ }
+
+ length= _ma_get_page_used(share, buff);
+ DBUG_ASSERT(length <= block_size);
+ if (length < block_size)
+ bzero(buff + length, block_size - length);
+ pagecache_unlock_by_link(share->pagecache, page_link.link,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+ LSN_IMPOSSIBLE, 1, FALSE);
+ }
+ if (flush_pagecache_blocks(share->pagecache, &share->kfile,
+ FLUSH_FORCE_WRITE))
+ DBUG_RETURN(1);
+ DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Fill empty space in data file with zeroes
+
+ @todo
+ Zerofill all pages marked in bitmap as empty and change them to
+ be of type UNALLOCATED_PAGE
+
+ @return
+ @retval 0 Ok
+ @retval 1 Error
+*/
+
+static my_bool maria_zerofill_data(HA_CHECK *param, MARIA_HA *info,
+ const char *name)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_PINNED_PAGE page_link;
+ char llbuff[21];
+ my_off_t pos;
+ pgcache_page_no_t page;
+ uint block_size= share->block_size;
+ MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+ my_bool zero_lsn= !(param->testflag & T_ZEROFILL_KEEP_LSN), error;
+ DBUG_ENTER("maria_zerofill_data");
+
+ /* This works only with BLOCK_RECORD files */
+ if (share->data_file_type != BLOCK_RECORD)
+ DBUG_RETURN(0);
+
+ if (!(param->testflag & T_SILENT))
+ printf("- Zerofilling data for Aria-table '%s'\n",name);
+
+ /* Go through the record file */
+ for (page= 1, pos= block_size;
+ pos < share->state.state.data_file_length;
+ pos+= block_size, page++)
+ {
+ uchar *buff;
+ enum en_page_type page_type;
+
+ /* Ignore bitmap pages */
+ if ((page % share->bitmap.pages_covered) == 0)
+ continue;
+ if (!(buff= pagecache_read(share->pagecache,
+ &info->dfile,
+ page, 1, 0,
+ PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+ &page_link.link)))
+ {
+ _ma_check_print_error(param,
+ "Page %9s: Got error: %d when reading datafile",
+ llstr(pos, llbuff), my_errno);
+ goto err;
+ }
+ page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
+ switch (page_type) {
+ case UNALLOCATED_PAGE:
+ if (zero_lsn)
+ bzero(buff, block_size);
+ else
+ bzero(buff + LSN_SIZE, block_size - LSN_SIZE);
+ break;
+ case BLOB_PAGE:
+ if (_ma_bitmap_get_page_bits(info, bitmap, page) == 0)
+ {
+ /* Unallocated page */
+ if (zero_lsn)
+ bzero(buff, block_size);
+ else
+ bzero(buff + LSN_SIZE, block_size - LSN_SIZE);
+ }
+ else
+ if (zero_lsn)
+ bzero(buff, LSN_SIZE);
+ break;
+ case HEAD_PAGE:
+ case TAIL_PAGE:
+ {
+ uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
+ uint offset, dir_start, empty_space;
+ uchar *dir;
+
+ if (zero_lsn)
+ bzero(buff, LSN_SIZE);
+ if (max_entry != 0)
+ {
+ my_bool is_head_page= (page_type == HEAD_PAGE);
+ dir= dir_entry_pos(buff, block_size, max_entry - 1);
+ _ma_compact_block_page(buff, block_size, max_entry -1, 0,
+ is_head_page ? ~(TrID) 0 : 0,
+ is_head_page ?
+ share->base.min_block_length : 0);
+
+ /* compactation may have increased free space */
+ empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+ if (!enough_free_entries_on_page(share, buff))
+ empty_space= 0; /* Page is full */
+ if (_ma_bitmap_set(info, page, is_head_page,
+ empty_space))
+ goto err;
+
+ /* Zerofill the not used part */
+ offset= uint2korr(dir) + uint2korr(dir+2);
+ dir_start= (uint) (dir - buff);
+ DBUG_ASSERT(dir_start >= offset);
+ if (dir_start > offset)
+ bzero(buff + offset, dir_start - offset);
+ }
+ break;
+ }
+ default:
+ _ma_check_print_error(param,
+ "Page %9s: Found unrecognizable block of type %d",
+ llstr(pos, llbuff), page_type);
+ goto err;
+ }
+ pagecache_unlock_by_link(share->pagecache, page_link.link,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+ LSN_IMPOSSIBLE, 1, FALSE);
+ }
+ error= _ma_bitmap_flush(share);
+ if (flush_pagecache_blocks(share->pagecache, &info->dfile,
+ FLUSH_FORCE_WRITE))
+ error= 1;
+ DBUG_RETURN(error);
+
+err:
+ pagecache_unlock_by_link(share->pagecache, page_link.link,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+ LSN_IMPOSSIBLE, 0, FALSE);
+ /* flush what was changed so far */
+ (void) _ma_bitmap_flush(share);
+ (void) flush_pagecache_blocks(share->pagecache, &info->dfile,
+ FLUSH_FORCE_WRITE);
+
+ DBUG_RETURN(1);
+}
+
+
+/**
+ @brief Fill empty space in index and data files with zeroes
+
+ @return
+ @retval 0 Ok
+ @retval 1 Error
+*/
+
+int maria_zerofill(HA_CHECK *param, MARIA_HA *info, const char *name)
+{
+ my_bool error, reenable_logging,
+ zero_lsn= !(param->testflag & T_ZEROFILL_KEEP_LSN);
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("maria_zerofill");
+ if ((reenable_logging= share->now_transactional))
+ _ma_tmp_disable_logging_for_table(info, 0);
+ if (!(error= (maria_zerofill_index(param, info, name) ||
+ maria_zerofill_data(param, info, name) ||
+ _ma_set_uuid(info, 0))))
+ {
+ /*
+ Mark that we have done zerofill of data and index. If we zeroed pages'
+ LSN, table is movable.
+ */
+ share->state.changed&= ~STATE_NOT_ZEROFILLED;
+ if (zero_lsn)
+ {
+ share->state.changed&= ~(STATE_NOT_MOVABLE | STATE_MOVED);
+ /* Table should get new LSNs */
+ share->state.create_rename_lsn= share->state.is_of_horizon=
+ share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
+ }
+ /* Ensure state is later flushed to disk, if within maria_chk */
+ info->update= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+ /* Reset create_trid to make file comparable */
+ share->state.create_trid= 0;
+ }
+ if (reenable_logging)
+ _ma_reenable_logging_for_table(info, FALSE);
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Let temporary file replace old file.
+ This assumes that the new file was created in the same
+ directory as given by realpath(filename).
+ This will ensure that any symlinks that are used will still work.
+ Copy stats from old file to new file, deletes orignal and
+ changes new file name to old file name
+*/
+
+int maria_change_to_newfile(const char * filename, const char * old_ext,
+ const char * new_ext, myf MyFlags)
+{
+ char old_filename[FN_REFLEN],new_filename[FN_REFLEN];
+#ifdef USE_RAID
+ if (raid_chunks)
+ return my_raid_redel(fn_format(old_filename,filename,"",old_ext,2+4),
+ fn_format(new_filename,filename,"",new_ext,2+4),
+ raid_chunks,
+ MYF(MY_WME | MY_LINK_WARNING | MyFlags));
+#endif
+ /* Get real path to filename */
+ (void) fn_format(old_filename,filename,"",old_ext,2+4+32);
+ return my_redel(old_filename,
+ fn_format(new_filename,old_filename,"",new_ext,2+4),
+ MYF(MY_WME | MY_LINK_WARNING | MyFlags));
+} /* maria_change_to_newfile */
+
+
+/* Copy a block between two files */
+
+int maria_filecopy(HA_CHECK *param, File to,File from,my_off_t start,
+ my_off_t length, const char *type)
+{
+ uchar tmp_buff[IO_SIZE], *buff;
+ ulong buff_length;
+ DBUG_ENTER("maria_filecopy");
+
+ buff_length=(ulong) min(param->write_buffer_length,length);
+ if (!(buff=my_malloc(buff_length,MYF(0))))
+ {
+ buff=tmp_buff; buff_length=IO_SIZE;
+ }
+
+ VOID(my_seek(from,start,MY_SEEK_SET,MYF(0)));
+ while (length > buff_length)
+ {
+ if (my_read(from, buff, buff_length, MYF(MY_NABP)) ||
+ my_write(to, buff, buff_length, param->myf_rw))
+ goto err;
+ length-= buff_length;
+ }
+ if (my_read(from, buff, (size_t) length,MYF(MY_NABP)) ||
+ my_write(to, buff, (size_t) length,param->myf_rw))
+ goto err;
+ if (buff != tmp_buff)
+ my_free(buff,MYF(0));
+ DBUG_RETURN(0);
+err:
+ if (buff != tmp_buff)
+ my_free(buff,MYF(0));
+ _ma_check_print_error(param,"Can't copy %s to tempfile, error %d",
+ type,my_errno);
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Repair table or given index using sorting
+
+ SYNOPSIS
+ maria_repair_by_sort()
+ param Repair parameters
+ info MARIA handler to repair
+ name Name of table (for warnings)
+ rep_quick set to <> 0 if we should not change data file
+
+ RESULT
+ 0 ok
+ <>0 Error
+*/
+
+int maria_repair_by_sort(HA_CHECK *param, register MARIA_HA *info,
+ const char * name, my_bool rep_quick)
+{
+ int got_error;
+ uint i;
+ ha_rows start_records;
+ my_off_t new_header_length, org_header_length, del;
+ File new_file;
+ MARIA_SORT_PARAM sort_param;
+ MARIA_SHARE *share= info->s;
+ HA_KEYSEG *keyseg;
+ double *rec_per_key_part;
+ char llbuff[22];
+ MARIA_SORT_INFO sort_info;
+ ulonglong key_map;
+ myf sync_dir= ((share->now_transactional && !share->temporary) ?
+ MY_SYNC_DIR : 0);
+ my_bool scan_inited= 0, reenable_logging= 0;
+ MARIA_SHARE backup_share;
+ DBUG_ENTER("maria_repair_by_sort");
+ LINT_INIT(key_map);
+
+ got_error= 1;
+ new_file= -1;
+ start_records= share->state.state.records;
+ if (!(param->testflag & T_SILENT))
+ {
+ printf("- recovering (with sort) Aria-table '%s'\n",name);
+ printf("Data records: %s\n", llstr(start_records,llbuff));
+ }
+
+ if (initialize_variables_for_repair(param, &sort_info, &sort_param, info,
+ rep_quick, &backup_share))
+ goto err;
+
+ if ((reenable_logging= share->now_transactional))
+ _ma_tmp_disable_logging_for_table(info, 0);
+
+ org_header_length= share->pack.header_length;
+ new_header_length= (param->testflag & T_UNPACK) ? 0 : org_header_length;
+ sort_param.filepos= new_header_length;
+
+ if (!rep_quick)
+ {
+ /* Get real path for data file */
+ if ((new_file=my_create(fn_format(param->temp_filename,
+ share->data_file_name.str, "",
+ DATA_TMP_EXT, 2+4),
+ 0,param->tmpfile_createflag,
+ MYF(0))) < 0)
+ {
+ _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+ param->temp_filename);
+ goto err;
+ }
+ if (new_header_length &&
+ maria_filecopy(param, new_file, info->dfile.file, 0L,
+ new_header_length, "datafile-header"))
+ goto err;
+
+ share->state.dellink= HA_OFFSET_ERROR;
+ info->rec_cache.file= new_file; /* For sort_delete_record */
+ if (share->data_file_type == BLOCK_RECORD ||
+ (param->testflag & T_UNPACK))
+ {
+ if (create_new_data_handle(&sort_param, new_file))
+ goto err;
+ sort_info.new_info->rec_cache.file= new_file;
+ }
+ }
+
+ if (!(sort_info.key_block=
+ alloc_key_blocks(param,
+ (uint) param->sort_key_blocks,
+ share->base.max_key_block_length)))
+ goto err;
+ sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks;
+
+ if (share->data_file_type != BLOCK_RECORD)
+ {
+ /* We need a read buffer to read rows in big blocks */
+ if (init_io_cache(&param->read_cache, info->dfile.file,
+ (uint) param->read_buffer_length,
+ READ_CACHE, org_header_length, 1, MYF(MY_WME)))
+ goto err;
+ }
+ if (sort_info.new_info->s->data_file_type != BLOCK_RECORD)
+ {
+ /* When writing to not block records, we need a write buffer */
+ if (!rep_quick)
+ {
+ if (init_io_cache(&sort_info.new_info->rec_cache, new_file,
+ (uint) param->write_buffer_length,
+ WRITE_CACHE, new_header_length, 1,
+ MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw))
+ goto err;
+ sort_info.new_info->opt_flag|= WRITE_CACHE_USED;
+ }
+ }
+
+ if (!(sort_param.record=
+ (uchar*) my_malloc((size_t) share->base.default_rec_buff_size,
+ MYF(0))) ||
+ _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size,
+ share->base.default_rec_buff_size))
+ {
+ _ma_check_print_error(param, "Not enough memory for extra record");
+ goto err;
+ }
+
+ /* Optionally drop indexes and optionally modify the key_map */
+ maria_drop_all_indexes(param, info, FALSE);
+ key_map= share->state.key_map;
+ if (param->testflag & T_CREATE_MISSING_KEYS)
+ {
+ /* Invert the copied key_map to recreate all disabled indexes. */
+ key_map= ~key_map;
+ }
+
+ param->read_cache.end_of_file= sort_info.filelength;
+ sort_param.wordlist=NULL;
+ init_alloc_root(&sort_param.wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
+
+ sort_param.key_cmp=sort_key_cmp;
+ sort_param.lock_in_memory=maria_lock_memory;
+ sort_param.tmpdir=param->tmpdir;
+ sort_param.master =1;
+
+ del=share->state.state.del;
+
+ rec_per_key_part= param->new_rec_per_key_part;
+ for (sort_param.key=0 ; sort_param.key < share->base.keys ;
+ rec_per_key_part+=sort_param.keyinfo->keysegs, sort_param.key++)
+ {
+ sort_param.keyinfo=share->keyinfo+sort_param.key;
+ /*
+ Skip this index if it is marked disabled in the copied
+ (and possibly inverted) key_map.
+ */
+ if (! maria_is_key_active(key_map, sort_param.key))
+ {
+ /* Remember old statistics for key */
+ memcpy((char*) rec_per_key_part,
+ (char*) (share->state.rec_per_key_part +
+ (uint) (rec_per_key_part - param->new_rec_per_key_part)),
+ sort_param.keyinfo->keysegs*sizeof(*rec_per_key_part));
+ DBUG_PRINT("repair", ("skipping seemingly disabled index #: %u",
+ sort_param.key));
+ continue;
+ }
+
+ if ((!(param->testflag & T_SILENT)))
+ printf ("- Fixing index %d\n",sort_param.key+1);
+
+ sort_param.read_cache=param->read_cache;
+ sort_param.seg=sort_param.keyinfo->seg;
+ sort_param.max_pos= sort_param.pos= org_header_length;
+ keyseg=sort_param.seg;
+ bzero((char*) sort_param.unique,sizeof(sort_param.unique));
+ sort_param.key_length=share->rec_reflength;
+ for (i=0 ; keyseg[i].type != HA_KEYTYPE_END; i++)
+ {
+ sort_param.key_length+=keyseg[i].length;
+ if (keyseg[i].flag & HA_SPACE_PACK)
+ sort_param.key_length+=get_pack_length(keyseg[i].length);
+ if (keyseg[i].flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART))
+ sort_param.key_length+=2 + test(keyseg[i].length >= 127);
+ if (keyseg[i].flag & HA_NULL_PART)
+ sort_param.key_length++;
+ }
+ share->state.state.records=share->state.state.del=share->state.split=0;
+ share->state.state.empty=0;
+
+ if (sort_param.keyinfo->flag & HA_FULLTEXT)
+ {
+ uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT*
+ sort_param.keyinfo->seg->charset->mbmaxlen;
+ sort_param.key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
+ /*
+ fulltext indexes may have much more entries than the
+ number of rows in the table. We estimate the number here.
+
+ Note, built-in parser is always nr. 0 - see ftparser_call_initializer()
+ */
+ if (sort_param.keyinfo->ftkey_nr == 0)
+ {
+ /*
+ for built-in parser the number of generated index entries
+ cannot be larger than the size of the data file divided
+ by the minimal word's length
+ */
+ sort_info.max_records=
+ (ha_rows) (sort_info.filelength/ft_min_word_len+1);
+ }
+ else
+ {
+ /*
+ for external plugin parser we cannot tell anything at all :(
+ so, we'll use all the sort memory and start from ~10 buffpeks.
+ (see _ma_create_index_by_sort)
+ */
+ sort_info.max_records=
+ 10*param->sort_buffer_length/sort_param.key_length;
+ }
+
+ sort_param.key_read= sort_maria_ft_key_read;
+ sort_param.key_write= sort_maria_ft_key_write;
+ }
+ else
+ {
+ sort_param.key_read= sort_key_read;
+ sort_param.key_write= sort_key_write;
+ }
+
+ if (sort_info.new_info->s->data_file_type == BLOCK_RECORD)
+ {
+ scan_inited= 1;
+ if (maria_scan_init(sort_info.info))
+ goto err;
+ }
+ if (_ma_create_index_by_sort(&sort_param,
+ (my_bool) (!(param->testflag & T_VERBOSE)),
+ (size_t) param->sort_buffer_length))
+ {
+ param->retry_repair=1;
+ _ma_check_print_error(param, "Create index by sort failed");
+ goto err;
+ }
+ DBUG_EXECUTE_IF("maria_flush_whole_log",
+ {
+ DBUG_PRINT("maria_flush_whole_log", ("now"));
+ translog_flush(translog_get_horizon());
+ });
+ DBUG_EXECUTE_IF("maria_crash_create_index_by_sort",
+ {
+ DBUG_PRINT("maria_crash_create_index_by_sort", ("now"));
+ DBUG_ABORT();
+ });
+ if (scan_inited)
+ {
+ scan_inited= 0;
+ maria_scan_end(sort_info.info);
+ }
+
+ /* No need to calculate checksum again. */
+ sort_param.calc_checksum= 0;
+ free_root(&sort_param.wordroot, MYF(0));
+
+ /* Set for next loop */
+ sort_info.max_records= (ha_rows) sort_info.new_info->s->state.state.records;
+ if (param->testflag & T_STATISTICS)
+ maria_update_key_parts(sort_param.keyinfo, rec_per_key_part,
+ sort_param.unique,
+ (param->stats_method ==
+ MI_STATS_METHOD_IGNORE_NULLS ?
+ sort_param.notnull : NULL),
+ (ulonglong) share->state.state.records);
+ maria_set_key_active(share->state.key_map, sort_param.key);
+ DBUG_PRINT("repair", ("set enabled index #: %u", sort_param.key));
+
+ if (_ma_flush_table_files_before_swap(param, info))
+ goto err;
+
+ if (sort_param.fix_datafile)
+ {
+ param->read_cache.end_of_file=sort_param.filepos;
+ if (maria_write_data_suffix(&sort_info,1) ||
+ end_io_cache(&sort_info.new_info->rec_cache))
+ {
+ _ma_check_print_error(param, "Got error when flushing row cache");
+ goto err;
+ }
+ sort_info.new_info->opt_flag&= ~WRITE_CACHE_USED;
+
+ if (param->testflag & T_SAFE_REPAIR)
+ {
+ /* Don't repair if we loosed more than one row */
+ if (share->state.state.records+1 < start_records)
+ {
+ _ma_check_print_error(param,
+ "Rows lost; Aborting because safe repair was "
+ "requested");
+ share->state.state.records=start_records;
+ goto err;
+ }
+ }
+
+ sort_info.new_info->s->state.state.data_file_length= sort_param.filepos;
+ if (sort_info.new_info != sort_info.info)
+ {
+ MARIA_STATE_INFO save_state= sort_info.new_info->s->state;
+ if (maria_close(sort_info.new_info))
+ {
+ _ma_check_print_error(param, "Got error %d on close", my_errno);
+ goto err;
+ }
+ copy_data_file_state(&share->state, &save_state);
+ new_file= -1;
+ sort_info.new_info= info;
+ info->rec_cache.file= info->dfile.file;
+ }
+
+ share->state.version=(ulong) time((time_t*) 0); /* Force reopen */
+
+ /* Replace the actual file with the temporary file */
+ if (new_file >= 0)
+ {
+ my_close(new_file, MYF(MY_WME));
+ new_file= -1;
+ }
+ change_data_file_descriptor(info, -1);
+ if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT,
+ DATA_TMP_EXT,
+ (param->testflag & T_BACKUP_DATA ?
+ MYF(MY_REDEL_MAKE_BACKUP): MYF(0)) |
+ sync_dir) ||
+ _ma_open_datafile(info, share, NullS, -1))
+ {
+ _ma_check_print_error(param, "Couldn't change to new data file");
+ goto err;
+ }
+ if (param->testflag & T_UNPACK)
+ restore_data_file_type(share);
+
+ org_header_length= share->pack.header_length;
+ sort_info.org_data_file_type= share->data_file_type;
+ sort_info.filelength= share->state.state.data_file_length;
+ sort_param.fix_datafile=0;
+ }
+ else
+ share->state.state.data_file_length=sort_param.max_pos;
+
+ param->read_cache.file= info->dfile.file; /* re-init read cache */
+ reinit_io_cache(&param->read_cache,READ_CACHE,share->pack.header_length,
+ 1,1);
+ }
+
+ if (param->testflag & T_WRITE_LOOP)
+ {
+ VOID(fputs(" \r",stdout)); VOID(fflush(stdout));
+ }
+
+ if (rep_quick && del+sort_info.dupp != share->state.state.del)
+ {
+ _ma_check_print_error(param,"Couldn't fix table with quick recovery: "
+ "Found wrong number of deleted records");
+ _ma_check_print_error(param,"Run recovery again without -q");
+ got_error=1;
+ param->retry_repair=1;
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ goto err;
+ }
+
+ if (rep_quick && (param->testflag & T_FORCE_UNIQUENESS))
+ {
+ my_off_t skr= (share->state.state.data_file_length +
+ (sort_info.org_data_file_type == COMPRESSED_RECORD) ?
+ MEMMAP_EXTRA_MARGIN : 0);
+#ifdef USE_RELOC
+ if (sort_info.org_data_file_type == STATIC_RECORD &&
+ skr < share->base.reloc*share->base.min_pack_length)
+ skr=share->base.reloc*share->base.min_pack_length;
+#endif
+ if (skr != sort_info.filelength)
+ if (my_chsize(info->dfile.file, skr, 0, MYF(0)))
+ _ma_check_print_warning(param,
+ "Can't change size of datafile, error: %d",
+ my_errno);
+ }
+
+ if (param->testflag & T_CALC_CHECKSUM)
+ share->state.state.checksum=param->glob_crc;
+
+ if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0,
+ MYF(0)))
+ _ma_check_print_warning(param,
+ "Can't change size of indexfile, error: %d",
+ my_errno);
+
+ if (!(param->testflag & T_SILENT))
+ {
+ if (start_records != share->state.state.records)
+ printf("Data records: %s\n", llstr(share->state.state.records,llbuff));
+ }
+ if (sort_info.dupp)
+ _ma_check_print_warning(param,
+ "%s records have been removed",
+ llstr(sort_info.dupp,llbuff));
+ got_error=0;
+ /* If invoked by external program that uses thr_lock */
+ if (&share->state.state != info->state)
+ *info->state= *info->state_start= share->state.state;
+
+err:
+ if (scan_inited)
+ maria_scan_end(sort_info.info);
+ _ma_reset_state(info);
+
+ VOID(end_io_cache(&sort_info.new_info->rec_cache));
+ VOID(end_io_cache(&param->read_cache));
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ if (got_error)
+ {
+ if (! param->error_printed)
+ _ma_check_print_error(param,"%d when fixing table",my_errno);
+ (void)_ma_flush_table_files_before_swap(param, info);
+ if (sort_info.new_info && sort_info.new_info != sort_info.info)
+ {
+ unuse_data_file_descriptor(sort_info.new_info);
+ maria_close(sort_info.new_info);
+ }
+ if (new_file >= 0)
+ {
+ VOID(my_close(new_file,MYF(0)));
+ VOID(my_delete(param->temp_filename, MYF(MY_WME)));
+ }
+ maria_mark_crashed_on_repair(info);
+ }
+ else
+ {
+ if (key_map == share->state.key_map)
+ share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS;
+ /*
+ Now that we have flushed and forced everything, we can bump
+ create_rename_lsn:
+ */
+ DBUG_EXECUTE_IF("maria_flush_whole_log",
+ {
+ DBUG_PRINT("maria_flush_whole_log", ("now"));
+ translog_flush(translog_get_horizon());
+ });
+ DBUG_EXECUTE_IF("maria_crash_repair",
+ {
+ DBUG_PRINT("maria_crash_repair", ("now"));
+ DBUG_ABORT();
+ });
+ }
+ share->state.changed|= STATE_NOT_SORTED_PAGES;
+ if (!rep_quick)
+ share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED |
+ STATE_NOT_MOVABLE);
+
+ /* If caller had disabled logging it's not up to us to re-enable it */
+ if (reenable_logging)
+ _ma_reenable_logging_for_table(info, FALSE);
+ restore_table_state_after_repair(info, &backup_share);
+
+ my_free(sort_param.rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_info.key_block, MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_info.ft_buf, MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
+ DBUG_RETURN(got_error);
+}
+
+
+/*
+ Threaded repair of table using sorting
+
+ SYNOPSIS
+ maria_repair_parallel()
+ param Repair parameters
+ info MARIA handler to repair
+ name Name of table (for warnings)
+ rep_quick set to <> 0 if we should not change data file
+
+ DESCRIPTION
+ Same as maria_repair_by_sort but do it multithreaded
+ Each key is handled by a separate thread.
+ TODO: make a number of threads a parameter
+
+ In parallel repair we use one thread per index. There are two modes:
+
+ Quick
+
+ Only the indexes are rebuilt. All threads share a read buffer.
+ Every thread that needs fresh data in the buffer enters the shared
+ cache lock. The last thread joining the lock reads the buffer from
+ the data file and wakes all other threads.
+
+ Non-quick
+
+ The data file is rebuilt and all indexes are rebuilt to point to
+ the new record positions. One thread is the master thread. It
+ reads from the old data file and writes to the new data file. It
+ also creates one of the indexes. The other threads read from a
+ buffer which is filled by the master. If they need fresh data,
+ they enter the shared cache lock. If the masters write buffer is
+ full, it flushes it to the new data file and enters the shared
+ cache lock too. When all threads joined in the lock, the master
+ copies its write buffer to the read buffer for the other threads
+ and wakes them.
+
+ RESULT
+ 0 ok
+ <>0 Error
+*/
+
+int maria_repair_parallel(HA_CHECK *param, register MARIA_HA *info,
+ const char * name, my_bool rep_quick)
+{
+#ifndef THREAD
+ return maria_repair_by_sort(param, info, name, rep_quick);
+#else
+ int got_error;
+ uint i,key, total_key_length, istep;
+ ha_rows start_records;
+ my_off_t new_header_length,del;
+ File new_file;
+ MARIA_SORT_PARAM *sort_param=0, tmp_sort_param;
+ MARIA_SHARE *share= info->s;
+ double *rec_per_key_part;
+ HA_KEYSEG *keyseg;
+ char llbuff[22];
+ IO_CACHE new_data_cache; /* For non-quick repair. */
+ IO_CACHE_SHARE io_share;
+ MARIA_SORT_INFO sort_info;
+ MARIA_SHARE backup_share;
+ ulonglong key_map;
+ pthread_attr_t thr_attr;
+ myf sync_dir= ((share->now_transactional && !share->temporary) ?
+ MY_SYNC_DIR : 0);
+ my_bool reenable_logging= 0;
+ DBUG_ENTER("maria_repair_parallel");
+ LINT_INIT(key_map);
+
+ got_error= 1;
+ new_file= -1;
+ start_records= share->state.state.records;
+ if (!(param->testflag & T_SILENT))
+ {
+ printf("- parallel recovering (with sort) Aria-table '%s'\n",name);
+ printf("Data records: %s\n", llstr(start_records, llbuff));
+ }
+
+ if (initialize_variables_for_repair(param, &sort_info, &tmp_sort_param, info,
+ rep_quick, &backup_share))
+ goto err;
+
+ if ((reenable_logging= share->now_transactional))
+ _ma_tmp_disable_logging_for_table(info, 0);
+
+ new_header_length= ((param->testflag & T_UNPACK) ? 0 :
+ share->pack.header_length);
+
+ /*
+ Quick repair (not touching data file, rebuilding indexes):
+ {
+ Read cache is (HA_CHECK *param)->read_cache using info->dfile.file.
+ }
+
+ Non-quick repair (rebuilding data file and indexes):
+ {
+ Master thread:
+
+ Read cache is (HA_CHECK *param)->read_cache using info->dfile.file.
+ Write cache is (MARIA_INFO *info)->rec_cache using new_file.
+
+ Slave threads:
+
+ Read cache is new_data_cache synced to master rec_cache.
+
+ The final assignment of the filedescriptor for rec_cache is done
+ after the cache creation.
+
+ Don't check file size on new_data_cache, as the resulting file size
+ is not known yet.
+
+ As rec_cache and new_data_cache are synced, write_buffer_length is
+ used for the read cache 'new_data_cache'. Both start at the same
+ position 'new_header_length'.
+ }
+ */
+ DBUG_PRINT("info", ("is quick repair: %d", (int) rep_quick));
+
+ /* Initialize pthread structures before goto err. */
+ pthread_mutex_init(&sort_info.mutex, MY_MUTEX_INIT_FAST);
+ pthread_cond_init(&sort_info.cond, 0);
+
+ if (!(sort_info.key_block=
+ alloc_key_blocks(param, (uint) param->sort_key_blocks,
+ share->base.max_key_block_length)) ||
+ init_io_cache(&param->read_cache, info->dfile.file,
+ (uint) param->read_buffer_length,
+ READ_CACHE, share->pack.header_length, 1, MYF(MY_WME)) ||
+ (!rep_quick &&
+ (init_io_cache(&info->rec_cache, info->dfile.file,
+ (uint) param->write_buffer_length,
+ WRITE_CACHE, new_header_length, 1,
+ MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw) ||
+ init_io_cache(&new_data_cache, -1,
+ (uint) param->write_buffer_length,
+ READ_CACHE, new_header_length, 1,
+ MYF(MY_WME | MY_DONT_CHECK_FILESIZE)))))
+ goto err;
+ sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks;
+ info->opt_flag|=WRITE_CACHE_USED;
+ info->rec_cache.file= info->dfile.file; /* for sort_delete_record */
+
+ if (!rep_quick)
+ {
+ /* Get real path for data file */
+ if ((new_file= my_create(fn_format(param->temp_filename,
+ share->data_file_name.str, "",
+ DATA_TMP_EXT,
+ 2+4),
+ 0,param->tmpfile_createflag,
+ MYF(0))) < 0)
+ {
+ _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+ param->temp_filename);
+ goto err;
+ }
+ if (new_header_length &&
+ maria_filecopy(param, new_file, info->dfile.file,0L,new_header_length,
+ "datafile-header"))
+ goto err;
+ if (param->testflag & T_UNPACK)
+ restore_data_file_type(share);
+ share->state.dellink= HA_OFFSET_ERROR;
+ info->rec_cache.file=new_file;
+ }
+
+ /* Optionally drop indexes and optionally modify the key_map. */
+ maria_drop_all_indexes(param, info, FALSE);
+ key_map= share->state.key_map;
+ if (param->testflag & T_CREATE_MISSING_KEYS)
+ {
+ /* Invert the copied key_map to recreate all disabled indexes. */
+ key_map= ~key_map;
+ }
+
+ param->read_cache.end_of_file= sort_info.filelength;
+
+ /*
+ +1 below is required hack for parallel repair mode.
+ The share->state.state.records value, that is compared later
+ to sort_info.max_records and cannot exceed it, is
+ increased in sort_key_write. In maria_repair_by_sort, sort_key_write
+ is called after sort_key_read, where the comparison is performed,
+ but in parallel mode master thread can call sort_key_write
+ before some other repair thread calls sort_key_read.
+ Furthermore I'm not even sure +1 would be enough.
+ May be sort_info.max_records shold be always set to max value in
+ parallel mode.
+ */
+ sort_info.max_records++;
+
+ del=share->state.state.del;
+
+ if (!(sort_param=(MARIA_SORT_PARAM *)
+ my_malloc((uint) share->base.keys *
+ (sizeof(MARIA_SORT_PARAM) + share->base.pack_reclength),
+ MYF(MY_ZEROFILL))))
+ {
+ _ma_check_print_error(param,"Not enough memory for key!");
+ goto err;
+ }
+ total_key_length=0;
+ rec_per_key_part= param->new_rec_per_key_part;
+ share->state.state.records=share->state.state.del=share->state.split=0;
+ share->state.state.empty=0;
+
+ for (i=key=0, istep=1 ; key < share->base.keys ;
+ rec_per_key_part+=sort_param[i].keyinfo->keysegs, i+=istep, key++)
+ {
+ sort_param[i].key=key;
+ sort_param[i].keyinfo=share->keyinfo+key;
+ sort_param[i].seg=sort_param[i].keyinfo->seg;
+ /*
+ Skip this index if it is marked disabled in the copied
+ (and possibly inverted) key_map.
+ */
+ if (! maria_is_key_active(key_map, key))
+ {
+ /* Remember old statistics for key */
+ memcpy((char*) rec_per_key_part,
+ (char*) (share->state.rec_per_key_part+
+ (uint) (rec_per_key_part - param->new_rec_per_key_part)),
+ sort_param[i].keyinfo->keysegs*sizeof(*rec_per_key_part));
+ istep=0;
+ continue;
+ }
+ istep=1;
+ if ((!(param->testflag & T_SILENT)))
+ printf ("- Fixing index %d\n",key+1);
+ if (sort_param[i].keyinfo->flag & HA_FULLTEXT)
+ {
+ sort_param[i].key_read=sort_maria_ft_key_read;
+ sort_param[i].key_write=sort_maria_ft_key_write;
+ }
+ else
+ {
+ sort_param[i].key_read=sort_key_read;
+ sort_param[i].key_write=sort_key_write;
+ }
+ sort_param[i].key_cmp=sort_key_cmp;
+ sort_param[i].lock_in_memory=maria_lock_memory;
+ sort_param[i].tmpdir=param->tmpdir;
+ sort_param[i].sort_info=&sort_info;
+ sort_param[i].master=0;
+ sort_param[i].fix_datafile=0;
+ sort_param[i].calc_checksum= 0;
+
+ sort_param[i].filepos=new_header_length;
+ sort_param[i].max_pos=sort_param[i].pos=share->pack.header_length;
+
+ sort_param[i].record= (((uchar *)(sort_param+share->base.keys))+
+ (share->base.pack_reclength * i));
+ if (_ma_alloc_buffer(&sort_param[i].rec_buff, &sort_param[i].rec_buff_size,
+ share->base.default_rec_buff_size))
+ {
+ _ma_check_print_error(param,"Not enough memory!");
+ goto err;
+ }
+ sort_param[i].key_length=share->rec_reflength;
+ for (keyseg=sort_param[i].seg; keyseg->type != HA_KEYTYPE_END;
+ keyseg++)
+ {
+ sort_param[i].key_length+=keyseg->length;
+ if (keyseg->flag & HA_SPACE_PACK)
+ sort_param[i].key_length+=get_pack_length(keyseg->length);
+ if (keyseg->flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART))
+ sort_param[i].key_length+=2 + test(keyseg->length >= 127);
+ if (keyseg->flag & HA_NULL_PART)
+ sort_param[i].key_length++;
+ }
+ total_key_length+=sort_param[i].key_length;
+
+ if (sort_param[i].keyinfo->flag & HA_FULLTEXT)
+ {
+ uint ft_max_word_len_for_sort=
+ (FT_MAX_WORD_LEN_FOR_SORT *
+ sort_param[i].keyinfo->seg->charset->mbmaxlen);
+ sort_param[i].key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
+ init_alloc_root(&sort_param[i].wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
+ }
+ }
+ sort_info.total_keys=i;
+ sort_param[0].master= 1;
+ sort_param[0].fix_datafile= ! rep_quick;
+ sort_param[0].calc_checksum= test(param->testflag & T_CALC_CHECKSUM);
+
+ if (!maria_ftparser_alloc_param(info))
+ goto err;
+
+ sort_info.got_error=0;
+ pthread_mutex_lock(&sort_info.mutex);
+
+ /*
+ Initialize the I/O cache share for use with the read caches and, in
+ case of non-quick repair, the write cache. When all threads join on
+ the cache lock, the writer copies the write cache contents to the
+ read caches.
+ */
+ if (i > 1)
+ {
+ if (rep_quick)
+ init_io_cache_share(&param->read_cache, &io_share, NULL, i);
+ else
+ init_io_cache_share(&new_data_cache, &io_share, &info->rec_cache, i);
+ }
+ else
+ io_share.total_threads= 0; /* share not used */
+
+ (void) pthread_attr_init(&thr_attr);
+ (void) pthread_attr_setdetachstate(&thr_attr,PTHREAD_CREATE_DETACHED);
+
+ for (i=0 ; i < sort_info.total_keys ; i++)
+ {
+ /*
+ Copy the properly initialized IO_CACHE structure so that every
+ thread has its own copy. In quick mode param->read_cache is shared
+ for use by all threads. In non-quick mode all threads but the
+ first copy the shared new_data_cache, which is synchronized to the
+ write cache of the first thread. The first thread copies
+ param->read_cache, which is not shared.
+ */
+ sort_param[i].read_cache= ((rep_quick || !i) ? param->read_cache :
+ new_data_cache);
+ DBUG_PRINT("io_cache_share", ("thread: %u read_cache: 0x%lx",
+ i, (long) &sort_param[i].read_cache));
+
+ /*
+ two approaches: the same amount of memory for each thread
+ or the memory for the same number of keys for each thread...
+ In the second one all the threads will fill their sort_buffers
+ (and call write_keys) at the same time, putting more stress on i/o.
+ */
+ sort_param[i].sortbuff_size=
+#ifndef USING_SECOND_APPROACH
+ param->sort_buffer_length/sort_info.total_keys;
+#else
+ param->sort_buffer_length*sort_param[i].key_length/total_key_length;
+#endif
+ if (pthread_create(&sort_param[i].thr, &thr_attr,
+ _ma_thr_find_all_keys,
+ (void *) (sort_param+i)))
+ {
+ _ma_check_print_error(param,"Cannot start a repair thread");
+ /* Cleanup: Detach from the share. Avoid others to be blocked. */
+ if (io_share.total_threads)
+ remove_io_thread(&sort_param[i].read_cache);
+ DBUG_PRINT("error", ("Cannot start a repair thread"));
+ sort_info.got_error=1;
+ }
+ else
+ sort_info.threads_running++;
+ }
+ (void) pthread_attr_destroy(&thr_attr);
+
+ /* waiting for all threads to finish */
+ while (sort_info.threads_running)
+ pthread_cond_wait(&sort_info.cond, &sort_info.mutex);
+ pthread_mutex_unlock(&sort_info.mutex);
+
+ if ((got_error= _ma_thr_write_keys(sort_param)))
+ {
+ param->retry_repair=1;
+ goto err;
+ }
+ got_error=1; /* Assume the following may go wrong */
+
+ if (_ma_flush_table_files_before_swap(param, info))
+ goto err;
+
+ if (sort_param[0].fix_datafile)
+ {
+ /*
+ Append some nulls to the end of a memory mapped file. Destroy the
+ write cache. The master thread did already detach from the share
+ by remove_io_thread() in sort.c:thr_find_all_keys().
+ */
+ if (maria_write_data_suffix(&sort_info,1) ||
+ end_io_cache(&info->rec_cache))
+ goto err;
+ if (param->testflag & T_SAFE_REPAIR)
+ {
+ /* Don't repair if we loosed more than one row */
+ if (share->state.state.records+1 < start_records)
+ {
+ share->state.state.records=start_records;
+ goto err;
+ }
+ }
+ share->state.state.data_file_length= sort_param->filepos;
+ /* Only whole records */
+ share->state.version= (ulong) time((time_t*) 0);
+ /*
+ Exchange the data file descriptor of the table, so that we use the
+ new file from now on.
+ */
+ my_close(info->dfile.file, MYF(0));
+ info->dfile.file= new_file;
+ share->pack.header_length=(ulong) new_header_length;
+ }
+ else
+ share->state.state.data_file_length=sort_param->max_pos;
+
+ if (rep_quick && del+sort_info.dupp != share->state.state.del)
+ {
+ _ma_check_print_error(param,"Couldn't fix table with quick recovery: "
+ "Found wrong number of deleted records");
+ _ma_check_print_error(param,"Run recovery again without -q");
+ param->retry_repair=1;
+ param->testflag|=T_RETRY_WITHOUT_QUICK;
+ goto err;
+ }
+
+ if (rep_quick && (param->testflag & T_FORCE_UNIQUENESS))
+ {
+ my_off_t skr= (share->state.state.data_file_length +
+ (sort_info.org_data_file_type == COMPRESSED_RECORD) ?
+ MEMMAP_EXTRA_MARGIN : 0);
+#ifdef USE_RELOC
+ if (sort_info.org_data_file_type == STATIC_RECORD &&
+ skr < share->base.reloc*share->base.min_pack_length)
+ skr=share->base.reloc*share->base.min_pack_length;
+#endif
+ if (skr != sort_info.filelength)
+ if (my_chsize(info->dfile.file, skr, 0, MYF(0)))
+ _ma_check_print_warning(param,
+ "Can't change size of datafile, error: %d",
+ my_errno);
+ }
+ if (param->testflag & T_CALC_CHECKSUM)
+ share->state.state.checksum=param->glob_crc;
+
+ if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0,
+ MYF(0)))
+ _ma_check_print_warning(param,
+ "Can't change size of indexfile, error: %d",
+ my_errno);
+
+ if (!(param->testflag & T_SILENT))
+ {
+ if (start_records != share->state.state.records)
+ printf("Data records: %s\n", llstr(share->state.state.records,llbuff));
+ }
+ if (sort_info.dupp)
+ _ma_check_print_warning(param,
+ "%s records have been removed",
+ llstr(sort_info.dupp,llbuff));
+ got_error=0;
+ /* If invoked by external program that uses thr_lock */
+ if (&share->state.state != info->state)
+ *info->state= *info->state_start= share->state.state;
+
+err:
+ _ma_reset_state(info);
+
+ /*
+ Destroy the write cache. The master thread did already detach from
+ the share by remove_io_thread() or it was not yet started (if the
+ error happend before creating the thread).
+ */
+ VOID(end_io_cache(&sort_info.new_info->rec_cache));
+ VOID(end_io_cache(&param->read_cache));
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ /*
+ Destroy the new data cache in case of non-quick repair. All slave
+ threads did either detach from the share by remove_io_thread()
+ already or they were not yet started (if the error happend before
+ creating the threads).
+ */
+ if (!rep_quick)
+ VOID(end_io_cache(&new_data_cache));
+ if (!got_error)
+ {
+ /* Replace the actual file with the temporary file */
+ if (new_file >= 0)
+ {
+ my_close(new_file,MYF(0));
+ info->dfile.file= new_file= -1;
+ if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT,
+ DATA_TMP_EXT,
+ MYF((param->testflag & T_BACKUP_DATA ?
+ MY_REDEL_MAKE_BACKUP : 0) |
+ sync_dir)) ||
+ _ma_open_datafile(info,share, NullS, -1))
+ got_error=1;
+ }
+ }
+ if (got_error)
+ {
+ if (! param->error_printed)
+ _ma_check_print_error(param,"%d when fixing table",my_errno);
+ (void)_ma_flush_table_files_before_swap(param, info);
+ if (new_file >= 0)
+ {
+ VOID(my_close(new_file,MYF(0)));
+ VOID(my_delete(param->temp_filename, MYF(MY_WME)));
+ if (info->dfile.file == new_file)
+ info->dfile.file= -1;
+ }
+ maria_mark_crashed_on_repair(info);
+ }
+ else if (key_map == share->state.key_map)
+ share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS;
+ share->state.changed|= STATE_NOT_SORTED_PAGES;
+ if (!rep_quick)
+ share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED |
+ STATE_NOT_MOVABLE);
+
+ pthread_cond_destroy (&sort_info.cond);
+ pthread_mutex_destroy(&sort_info.mutex);
+
+ /* If caller had disabled logging it's not up to us to re-enable it */
+ if (reenable_logging)
+ _ma_reenable_logging_for_table(info, FALSE);
+ restore_table_state_after_repair(info, &backup_share);
+
+ my_free(sort_info.ft_buf, MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_info.key_block,MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_param,MYF(MY_ALLOW_ZERO_PTR));
+ my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
+ if (!got_error && (param->testflag & T_UNPACK))
+ restore_data_file_type(share);
+ DBUG_RETURN(got_error);
+#endif /* THREAD */
+}
+
+ /* Read next record and return next key */
+
+static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key)
+{
+  /*
+    Fetch the next record with sort_get_next_record(), pass it to
+    _ma_sort_write_record() and build the key image for this index
+    into 'key'.  Return value is passed through from
+    sort_get_next_record(): 0 ok, -1 end of file, > 0 error.
+  */
+  int ret;
+  MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+  MARIA_HA *info= sort_info->info;
+  MARIA_KEY int_key;
+  DBUG_ENTER("sort_key_read");
+
+  ret= sort_get_next_record(sort_param);
+  if (ret)
+    DBUG_RETURN(ret);
+  if (sort_info->max_records == info->s->state.state.records)
+  {
+    _ma_check_print_error(sort_info->param,
+                          "Key %d - Found too many records; Can't continue",
+                          sort_param->key+1);
+    DBUG_RETURN(1);
+  }
+  if (_ma_sort_write_record(sort_param))
+    DBUG_RETURN(1);
+
+  (*info->s->keyinfo[sort_param->key].make_key)(info, &int_key,
+                                                sort_param->key, key,
+                                                sort_param->record,
+                                                sort_param->current_filepos,
+                                                0);
+  sort_param->real_key_length= int_key.data_length + int_key.ref_length;
+#ifdef HAVE_valgrind
+  /* Make the unused key tail deterministic for memory checkers. */
+  bzero(key+sort_param->real_key_length,
+        (sort_param->key_length-sort_param->real_key_length));
+#endif
+  DBUG_RETURN(0);
+} /* sort_key_read */
+
+
+static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key)
+{
+  /*
+    Fulltext variant of sort_key_read(): one source record produces many
+    keys (one per parsed word).  The pending word list is kept in
+    sort_param->wordlist/wordptr between calls; when it is exhausted the
+    next record is read and parsed.
+  */
+  int error;
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  MARIA_HA *info=sort_info->info;
+  FT_WORD *wptr=0;
+  MARIA_KEY int_key;
+  DBUG_ENTER("sort_maria_ft_key_read");
+
+  if (!sort_param->wordlist)
+  {
+    /* No words pending: read records until one parses to a non-empty list */
+    for (;;)
+    {
+      /* Reuse the parser memroot for each record's word list */
+      free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE));
+      if ((error=sort_get_next_record(sort_param)))
+        DBUG_RETURN(error);
+      if ((error= _ma_sort_write_record(sort_param)))
+        DBUG_RETURN(error);
+      if (!(wptr= _ma_ft_parserecord(info,sort_param->key,sort_param->record,
+                                     &sort_param->wordroot)))
+
+        DBUG_RETURN(1);
+      /* A zero 'pos' terminates the word array; loop if record had no words */
+      if (wptr->pos)
+        break;
+    }
+    sort_param->wordptr=sort_param->wordlist=wptr;
+  }
+  else
+  {
+    /* Continue emitting keys from the word list of the previous record */
+    error=0;
+    wptr=(FT_WORD*)(sort_param->wordptr);
+  }
+
+  _ma_ft_make_key(info, &int_key, sort_param->key, key, wptr++,
+                  sort_param->current_filepos);
+  sort_param->real_key_length= int_key.data_length + int_key.ref_length;
+
+#ifdef HAVE_valgrind
+  if (sort_param->key_length > sort_param->real_key_length)
+    bzero(key+sort_param->real_key_length,
+          (sort_param->key_length-sort_param->real_key_length));
+#endif
+  /* If that was the last word, free the list so next call reads a record */
+  if (!wptr->pos)
+  {
+    free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE));
+    sort_param->wordlist=0;
+  }
+  else
+    sort_param->wordptr=(void*)wptr;
+
+  DBUG_RETURN(error);
+} /* sort_maria_ft_key_read */
+
+
+/*
+ Read next record from file using parameters in sort_info.
+
+ SYNOPSIS
+ sort_get_next_record()
+ sort_param Information about and for the sort process
+
+ NOTES
+ Dynamic Records With Non-Quick Parallel Repair
+
+ For non-quick parallel repair we use a synchronized read/write
+ cache. This means that one thread is the master who fixes the data
+ file by reading each record from the old data file and writing it
+ to the new data file. By doing this the records in the new data
+ file are written contiguously. Whenever the write buffer is full,
+ it is copied to the read buffer. The slaves read from the read
+ buffer, which is not associated with a file. Thus read_cache.file
+ is -1. When using _mi_read_cache(), the slaves must always set
+ flag to READING_NEXT so that the function never tries to read from
+ file. This is safe because the records are contiguous. There is no
+ need to read outside the cache. This condition is evaluated in the
+ variable 'parallel_flag' for quick reference. read_cache.file must
+ be >= 0 in every other case.
+
+ RETURN
+ -1 end of file
+ 0 ok
+ sort_param->current_filepos points to record position.
+ sort_param->record contains record
+ sort_param->max_pos contains position to last byte read
+ > 0 error
+*/
+
+static int sort_get_next_record(MARIA_SORT_PARAM *sort_param)
+{
+  int searching;
+  int parallel_flag;
+  uint found_record,b_type,left_length;
+  my_off_t pos;
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
+  MARIA_HA *info=sort_info->info;
+  MARIA_SHARE *share= info->s;
+  char llbuff[22],llbuff2[22];
+  DBUG_ENTER("sort_get_next_record");
+
+  if (_ma_killed_ptr(param))
+    DBUG_RETURN(1);
+
+  /* Dispatch on the record format the data file had before repair started */
+  switch (sort_info->org_data_file_type) {
+  case BLOCK_RECORD:
+  {
+    for (;;)
+    {
+      int flag;
+      /*
+        Assume table is transactional and it had LSN pages in the
+        cache. Repair has flushed them, left data pages stay in
+        cache, and disabled transactionality (so share's current page
+        type is PLAIN); page cache would assert if it finds a cached LSN page
+        while _ma_scan_block_record() requested a PLAIN page. So we use
+        UNKNOWN.
+      */
+      enum pagecache_page_type save_page_type= share->page_type;
+      share->page_type= PAGECACHE_READ_UNKNOWN_PAGE;
+      if (info != sort_info->new_info)
+      {
+        /* Safe scanning */
+        flag= _ma_safe_scan_block_record(sort_info, info,
+                                         sort_param->record);
+      }
+      else
+      {
+        /*
+          Scan on clean table.
+          It requires a reliable data_file_length so we set it.
+        */
+        share->state.state.data_file_length= sort_info->filelength;
+        info->cur_row.trid= 0;
+        flag= _ma_scan_block_record(info, sort_param->record,
+                                    info->cur_row.nextpos, 1);
+        set_if_bigger(param->max_found_trid, info->cur_row.trid);
+        if (info->cur_row.trid > param->max_trid)
+        {
+          _ma_check_print_not_visible_error(param, info->cur_row.trid);
+          flag= HA_ERR_ROW_NOT_VISIBLE;
+        }
+      }
+      share->page_type= save_page_type;
+      if (!flag)
+      {
+        if (sort_param->calc_checksum)
+        {
+          ha_checksum checksum;
+          checksum= (*share->calc_check_checksum)(info, sort_param->record);
+          /* Only the low byte of the checksum is stored per row */
+          if (share->calc_checksum &&
+              info->cur_row.checksum != (checksum & 255))
+          {
+            if (param->testflag & T_VERBOSE)
+            {
+              record_pos_to_txt(info, info->cur_row.lastpos, llbuff);
+              _ma_check_print_info(param,
+                                   "Found record with wrong checksum at %s",
+                                   llbuff);
+            }
+            /* Skip the damaged row and keep scanning */
+            continue;
+          }
+          info->cur_row.checksum= checksum;
+          param->glob_crc+= checksum;
+        }
+        sort_param->start_recpos= sort_param->current_filepos=
+          info->cur_row.lastpos;
+        DBUG_RETURN(0);
+      }
+      if (flag == HA_ERR_END_OF_FILE)
+      {
+        sort_param->max_pos= share->state.state.data_file_length;
+        DBUG_RETURN(-1);
+      }
+      /* Retry only if wrong record, not if disk error */
+      if (flag != HA_ERR_WRONG_IN_RECORD)
+      {
+        retry_if_quick(sort_param, flag);
+        DBUG_RETURN(flag);
+      }
+    }
+    break;                                      /* Impossible */
+  }
+  case STATIC_RECORD:
+    /*
+      Fixed-length rows: read pack_reclength bytes at a time from the
+      read cache.  A record whose first byte is non-zero is live; a zero
+      first byte is a deleted row, which the master thread counts into
+      state.del / state.empty.  The loop only exits via DBUG_RETURN.
+    */
+    for (;;)
+    {
+      if (my_b_read(&sort_param->read_cache,sort_param->record,
+                    share->base.pack_reclength))
+      {
+        if (sort_param->read_cache.error)
+          param->out_flag |= O_DATA_LOST;
+        retry_if_quick(sort_param, my_errno);
+        DBUG_RETURN(-1);
+      }
+      sort_param->start_recpos=sort_param->pos;
+      if (!sort_param->fix_datafile)
+      {
+        sort_param->current_filepos= sort_param->pos;
+        if (sort_param->master)
+          share->state.split++;
+      }
+      sort_param->max_pos=(sort_param->pos+=share->base.pack_reclength);
+      if (*sort_param->record)
+      {
+        if (sort_param->calc_checksum)
+          param->glob_crc+= (info->cur_row.checksum=
+                             _ma_static_checksum(info,sort_param->record));
+        DBUG_RETURN(0);
+      }
+      if (!sort_param->fix_datafile && sort_param->master)
+      {
+        share->state.state.del++;
+        share->state.state.empty+=share->base.pack_reclength;
+      }
+    }
+  case DYNAMIC_RECORD:
+  {
+    /*
+      Variable-length rows stored as chains of linked blocks.  The inner
+      do/while follows one block chain, assembling the row into rec_buff.
+      On any damage we enter 'searching' mode and scan forward
+      MARIA_DYN_ALIGN_SIZE bytes at a time for the next plausible block
+      header.
+    */
+    uchar *to;
+    ha_checksum checksum= 0;
+    LINT_INIT(to);
+
+    pos=sort_param->pos;
+    searching=(sort_param->fix_datafile && (param->testflag & T_EXTEND));
+    /* Slaves read from a file-less shared cache: always READING_NEXT */
+    parallel_flag= (sort_param->read_cache.file < 0) ? READING_NEXT : 0;
+    for (;;)
+    {
+      found_record=block_info.second_read= 0;
+      left_length=1;
+      if (searching)
+      {
+        pos=MY_ALIGN(pos,MARIA_DYN_ALIGN_SIZE);
+        param->testflag|=T_RETRY_WITHOUT_QUICK;
+        sort_param->start_recpos=pos;
+      }
+      do
+      {
+        if (pos > sort_param->max_pos)
+          sort_param->max_pos=pos;
+        if (pos & (MARIA_DYN_ALIGN_SIZE-1))
+        {
+          if ((param->testflag & T_VERBOSE) || searching == 0)
+            _ma_check_print_info(param,"Wrong aligned block at %s",
+                                 llstr(pos,llbuff));
+          if (searching)
+            goto try_next;
+        }
+        if (found_record && pos == param->search_after_block)
+          _ma_check_print_info(param,"Block: %s used by record at %s",
+                               llstr(param->search_after_block,llbuff),
+                               llstr(sort_param->start_recpos,llbuff2));
+        if (_ma_read_cache(&sort_param->read_cache,
+                           block_info.header, pos,
+                           MARIA_BLOCK_INFO_HEADER_LENGTH,
+                           (! found_record ? READING_NEXT : 0) |
+                           parallel_flag | READING_HEADER))
+        {
+          if (found_record)
+          {
+            _ma_check_print_info(param,
+                                 "Can't read whole record at %s (errno: %d)",
+                                 llstr(sort_param->start_recpos,llbuff),errno);
+            goto try_next;
+          }
+          DBUG_RETURN(-1);
+        }
+        if (searching && ! sort_param->fix_datafile)
+        {
+          param->error_printed=1;
+          param->retry_repair=1;
+          param->testflag|=T_RETRY_WITHOUT_QUICK;
+          DBUG_RETURN(1);       /* Something wrong with data */
+        }
+        b_type= _ma_get_block_info(&block_info,-1,pos);
+        if ((b_type & (BLOCK_ERROR | BLOCK_FATAL_ERROR)) ||
+            ((b_type & BLOCK_FIRST) &&
+             (block_info.rec_len < (uint) share->base.min_pack_length ||
+              block_info.rec_len > (uint) share->base.max_pack_length)))
+        {
+          uint i;
+          if (param->testflag & T_VERBOSE || searching == 0)
+            _ma_check_print_info(param,
+                                 "Wrong bytesec: %3d-%3d-%3d at %10s; Skipped",
+                                 block_info.header[0],block_info.header[1],
+                                 block_info.header[2],llstr(pos,llbuff));
+          if (found_record)
+            goto try_next;
+          block_info.second_read=0;
+          searching=1;
+          /* Search after block in read header string */
+          for (i=MARIA_DYN_ALIGN_SIZE ;
+               i < MARIA_BLOCK_INFO_HEADER_LENGTH ;
+               i+= MARIA_DYN_ALIGN_SIZE)
+            if (block_info.header[i] >= 1 &&
+                block_info.header[i] <= MARIA_MAX_DYN_HEADER_BYTE)
+              break;
+          pos+=(ulong) i;
+          sort_param->start_recpos=pos;
+          continue;
+        }
+        if (b_type & BLOCK_DELETED)
+        {
+          /* Sanity-check the deleted block before trusting its links */
+          my_bool error=0;
+          if (block_info.block_len+ (uint) (block_info.filepos-pos) <
+              share->base.min_block_length)
+          {
+            if (!searching)
+              _ma_check_print_info(param,
+                                   "Deleted block with impossible length %lu "
+                                   "at %s",
+                                   block_info.block_len,llstr(pos,llbuff));
+            error=1;
+          }
+          else
+          {
+            if ((block_info.next_filepos != HA_OFFSET_ERROR &&
+                 block_info.next_filepos >=
+                 share->state.state.data_file_length) ||
+                (block_info.prev_filepos != HA_OFFSET_ERROR &&
+                 block_info.prev_filepos >=
+                 share->state.state.data_file_length))
+            {
+              if (!searching)
+                _ma_check_print_info(param,
+                                     "Delete link points outside datafile at "
+                                     "%s",
+                                     llstr(pos,llbuff));
+              error=1;
+            }
+          }
+          if (error)
+          {
+            if (found_record)
+              goto try_next;
+            searching=1;
+            pos+= MARIA_DYN_ALIGN_SIZE;
+            sort_param->start_recpos=pos;
+            block_info.second_read=0;
+            continue;
+          }
+        }
+        else
+        {
+          if (block_info.block_len+ (uint) (block_info.filepos-pos) <
+              share->base.min_block_length ||
+              block_info.block_len > (uint) share->base.max_pack_length+
+              MARIA_SPLIT_LENGTH)
+          {
+            if (!searching)
+              _ma_check_print_info(param,
+                                   "Found block with impossible length %lu "
+                                   "at %s; Skipped",
+                                   block_info.block_len+
+                                   (uint) (block_info.filepos-pos),
+                                   llstr(pos,llbuff));
+            if (found_record)
+              goto try_next;
+            searching=1;
+            pos+= MARIA_DYN_ALIGN_SIZE;
+            sort_param->start_recpos=pos;
+            block_info.second_read=0;
+            continue;
+          }
+        }
+        if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR))
+        {
+          /* Valid deleted/sync block: count it (master only) and move on */
+          if (!sort_param->fix_datafile && sort_param->master &&
+              (b_type & BLOCK_DELETED))
+          {
+            share->state.state.empty+=block_info.block_len;
+            share->state.state.del++;
+            share->state.split++;
+          }
+          if (found_record)
+            goto try_next;
+          if (searching)
+          {
+            pos+=MARIA_DYN_ALIGN_SIZE;
+            sort_param->start_recpos=pos;
+          }
+          else
+            pos=block_info.filepos+block_info.block_len;
+          block_info.second_read=0;
+          continue;
+        }
+
+        if (!sort_param->fix_datafile && sort_param->master)
+          share->state.split++;
+        if (! found_record++)
+        {
+          /* First block of a record: remember its start and total length */
+          sort_param->find_length=left_length=block_info.rec_len;
+          sort_param->start_recpos=pos;
+          if (!sort_param->fix_datafile)
+            sort_param->current_filepos= sort_param->start_recpos;
+          if (sort_param->fix_datafile && (param->testflag & T_EXTEND))
+            sort_param->pos=block_info.filepos+1;
+          else
+            sort_param->pos=block_info.filepos+block_info.block_len;
+          if (share->base.blobs)
+          {
+            if (_ma_alloc_buffer(&sort_param->rec_buff,
+                                 &sort_param->rec_buff_size,
+                                 block_info.rec_len +
+                                 share->base.extra_rec_buff_size))
+
+            {
+              if (param->max_record_length >= block_info.rec_len)
+              {
+                _ma_check_print_error(param,"Not enough memory for blob at %s "
+                                      "(need %lu)",
+                                      llstr(sort_param->start_recpos,llbuff),
+                                      (ulong) block_info.rec_len);
+                DBUG_RETURN(1);
+              }
+              else
+              {
+                _ma_check_print_info(param,"Not enough memory for blob at %s "
+                                     "(need %lu); Row skipped",
+                                     llstr(sort_param->start_recpos,llbuff),
+                                     (ulong) block_info.rec_len);
+                goto try_next;
+              }
+            }
+          }
+          to= sort_param->rec_buff;
+        }
+        if (left_length < block_info.data_len || ! block_info.data_len)
+        {
+          _ma_check_print_info(param,
+                               "Found block with too small length at %s; "
+                               "Skipped",
+                               llstr(sort_param->start_recpos,llbuff));
+          goto try_next;
+        }
+        if (block_info.filepos + block_info.data_len >
+            sort_param->read_cache.end_of_file)
+        {
+          _ma_check_print_info(param,
+                               "Found block that points outside data file "
+                               "at %s",
+                               llstr(sort_param->start_recpos,llbuff));
+          goto try_next;
+        }
+        /*
+          Copy information that is already read. Avoid accessing data
+          below the cache start. This could happen if the header
+          streched over the end of the previous buffer contents.
+        */
+        {
+          uint header_len= (uint) (block_info.filepos - pos);
+          uint prefetch_len= (MARIA_BLOCK_INFO_HEADER_LENGTH - header_len);
+
+          if (prefetch_len > block_info.data_len)
+            prefetch_len= block_info.data_len;
+          if (prefetch_len)
+          {
+            memcpy(to, block_info.header + header_len, prefetch_len);
+            block_info.filepos+= prefetch_len;
+            block_info.data_len-= prefetch_len;
+            left_length-= prefetch_len;
+            to+= prefetch_len;
+          }
+        }
+        if (block_info.data_len &&
+            _ma_read_cache(&sort_param->read_cache,to,block_info.filepos,
+                           block_info.data_len,
+                           (found_record == 1 ? READING_NEXT : 0) |
+                           parallel_flag))
+        {
+          _ma_check_print_info(param,
+                               "Read error for block at: %s (error: %d); "
+                               "Skipped",
+                               llstr(block_info.filepos,llbuff),my_errno);
+          goto try_next;
+        }
+        left_length-=block_info.data_len;
+        to+=block_info.data_len;
+        pos=block_info.next_filepos;
+        if (pos == HA_OFFSET_ERROR && left_length)
+        {
+          _ma_check_print_info(param,
+                               "Wrong block with wrong total length "
+                               "starting at %s",
+                               llstr(sort_param->start_recpos,llbuff));
+          goto try_next;
+        }
+        if (pos + MARIA_BLOCK_INFO_HEADER_LENGTH >
+            sort_param->read_cache.end_of_file)
+        {
+          _ma_check_print_info(param,
+                               "Found link that points at %s (outside data "
+                               "file) at %s",
+                               llstr(pos,llbuff2),
+                               llstr(sort_param->start_recpos,llbuff));
+          goto try_next;
+        }
+      } while (left_length);
+
+      /* Whole chain read: unpack and (optionally) verify the row */
+      if (_ma_rec_unpack(info,sort_param->record,sort_param->rec_buff,
+                         sort_param->find_length) != MY_FILE_ERROR)
+      {
+        if (sort_param->read_cache.error < 0)
+          DBUG_RETURN(1);
+        if (sort_param->calc_checksum)
+          checksum= (share->calc_check_checksum)(info, sort_param->record);
+        if ((param->testflag & (T_EXTEND | T_REP)) || searching)
+        {
+          if (_ma_rec_check(info, sort_param->record, sort_param->rec_buff,
+                            sort_param->find_length,
+                            (param->testflag & T_QUICK) &&
+                            sort_param->calc_checksum &&
+                            test(share->calc_checksum), checksum))
+          {
+            _ma_check_print_info(param,"Found wrong packed record at %s",
+                                 llstr(sort_param->start_recpos,llbuff));
+            goto try_next;
+          }
+        }
+        if (sort_param->calc_checksum)
+          param->glob_crc+= checksum;
+        DBUG_RETURN(0);
+      }
+      if (!searching)
+        _ma_check_print_info(param,"Key %d - Found wrong stored record at %s",
+                             sort_param->key+1,
+                             llstr(sort_param->start_recpos,llbuff));
+    try_next:
+      /* Damaged record: advance to the next alignment boundary and rescan */
+      pos=(sort_param->start_recpos+=MARIA_DYN_ALIGN_SIZE);
+      searching=1;
+    }
+  }
+  case COMPRESSED_RECORD:
+    /*
+      Pack-compressed rows: read the packed length header, then the
+      compressed body, and unpack it into sort_param->record.  On damage,
+      scan forward one byte at a time ('searching' mode).
+    */
+    for (searching=0 ;; searching=1, sort_param->pos++)
+    {
+      if (_ma_read_cache(&sort_param->read_cache, block_info.header,
+                         sort_param->pos,
+                         share->pack.ref_length,READING_NEXT))
+        DBUG_RETURN(-1);
+      if (searching && ! sort_param->fix_datafile)
+      {
+        param->error_printed=1;
+        param->retry_repair=1;
+        param->testflag|=T_RETRY_WITHOUT_QUICK;
+        DBUG_RETURN(1);         /* Something wrong with data */
+      }
+      sort_param->start_recpos=sort_param->pos;
+      if (_ma_pack_get_block_info(info, &sort_param->bit_buff, &block_info,
+                                  &sort_param->rec_buff,
+                                  &sort_param->rec_buff_size, -1,
+                                  sort_param->pos))
+        DBUG_RETURN(-1);
+      /* Zero-length record right before the memmap margin means EOF */
+      if (!block_info.rec_len &&
+          sort_param->pos + MEMMAP_EXTRA_MARGIN ==
+          sort_param->read_cache.end_of_file)
+        DBUG_RETURN(-1);
+      if (block_info.rec_len < (uint) share->min_pack_length ||
+          block_info.rec_len > (uint) share->max_pack_length)
+      {
+        if (! searching)
+          _ma_check_print_info(param,
+                               "Found block with wrong recordlength: %lu "
+                               "at %s\n",
+                               block_info.rec_len,
+                               llstr(sort_param->pos,llbuff));
+        continue;
+      }
+      if (_ma_read_cache(&sort_param->read_cache, sort_param->rec_buff,
+                         block_info.filepos, block_info.rec_len,
+                         READING_NEXT))
+      {
+        if (! searching)
+          _ma_check_print_info(param,"Couldn't read whole record from %s",
+                               llstr(sort_param->pos,llbuff));
+        continue;
+      }
+#ifdef HAVE_valgrind
+      bzero(sort_param->rec_buff + block_info.rec_len,
+            share->base.extra_rec_buff_size);
+#endif
+      if (_ma_pack_rec_unpack(info, &sort_param->bit_buff, sort_param->record,
+                              sort_param->rec_buff, block_info.rec_len))
+      {
+        if (! searching)
+          _ma_check_print_info(param,"Found wrong record at %s",
+                               llstr(sort_param->pos,llbuff));
+        continue;
+      }
+      if (!sort_param->fix_datafile)
+      {
+        sort_param->current_filepos= sort_param->pos;
+        if (sort_param->master)
+          share->state.split++;
+      }
+      sort_param->max_pos= (sort_param->pos=block_info.filepos+
+                            block_info.rec_len);
+      info->packed_length=block_info.rec_len;
+
+      if (sort_param->calc_checksum)
+      {
+        info->cur_row.checksum= (*share->calc_check_checksum)(info,
+                                                              sort_param->
+                                                              record);
+        param->glob_crc+= info->cur_row.checksum;
+      }
+      DBUG_RETURN(0);
+    }
+  }
+  DBUG_RETURN(1);               /* Impossible */
+}
+
+
+/**
+ @brief Write record to new file.
+
+ @fn _ma_sort_write_record()
+ @param sort_param Sort parameters.
+
+ @note
+ This is only called by a master thread if parallel repair is used.
+
+ @return
+ @retval 0 OK
+ sort_param->current_filepos points to inserted record for
+ block_records and to the place for the next record for
+ other row types.
+ sort_param->filepos points to end of file
+ @retval 1 Error
+*/
+
+int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param)
+{
+  int flag;
+  uint length;
+  ulong block_length,reclength;
+  uchar *from;
+  uchar block_buff[8];
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param= sort_info->param;
+  MARIA_HA *info= sort_info->new_info;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_sort_write_record");
+
+  /* Only the thread rebuilding the data file actually writes rows */
+  if (sort_param->fix_datafile)
+  {
+    sort_param->current_filepos= sort_param->filepos;
+    switch (sort_info->new_data_file_type) {
+    case BLOCK_RECORD:
+      /* Row format handler does the write; it returns the row position */
+      if ((sort_param->current_filepos=
+           (*share->write_record_init)(info, sort_param->record)) ==
+          HA_OFFSET_ERROR)
+        DBUG_RETURN(1);
+      /* Pointer to end of file */
+      sort_param->filepos= share->state.state.data_file_length;
+      break;
+    case STATIC_RECORD:
+      /* Append the fixed-length row to the write cache */
+      if (my_b_write(&info->rec_cache,sort_param->record,
+                     share->base.pack_reclength))
+      {
+        _ma_check_print_error(param,"%d when writing to datafile",my_errno);
+        DBUG_RETURN(1);
+      }
+      sort_param->filepos+=share->base.pack_reclength;
+      share->state.split++;
+      break;
+    case DYNAMIC_RECORD:
+      if (! info->blobs)
+        from=sort_param->rec_buff;
+      else
+      {
+        /* must be sure that local buffer is big enough */
+        reclength=share->base.pack_reclength+
+          _ma_calc_total_blob_length(info,sort_param->record)+
+          ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+
+          MARIA_DYN_DELETE_BLOCK_HEADER;
+        if (sort_info->buff_length < reclength)
+        {
+          if (!(sort_info->buff=my_realloc(sort_info->buff, (uint) reclength,
+                                           MYF(MY_FREE_ON_ERROR |
+                                               MY_ALLOW_ZERO_PTR))))
+            DBUG_RETURN(1);
+          sort_info->buff_length=reclength;
+        }
+        from= (uchar *) sort_info->buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER);
+      }
+      /* We can use info->checksum here as only one thread calls this */
+      info->cur_row.checksum= (*share->calc_check_checksum)(info,
+                                                            sort_param->
+                                                            record);
+      reclength= _ma_rec_pack(info,from,sort_param->record);
+      flag=0;
+
+      /* Write the packed row, splitting it into aligned blocks */
+      do
+      {
+        block_length=reclength+ 3 + test(reclength >= (65520-3));
+        if (block_length < share->base.min_block_length)
+          block_length=share->base.min_block_length;
+        info->update|=HA_STATE_WRITE_AT_END;
+        block_length=MY_ALIGN(block_length,MARIA_DYN_ALIGN_SIZE);
+        if (block_length > MARIA_MAX_BLOCK_LENGTH)
+          block_length=MARIA_MAX_BLOCK_LENGTH;
+        if (_ma_write_part_record(info,0L,block_length,
+                                  sort_param->filepos+block_length,
+                                  &from,&reclength,&flag))
+        {
+          _ma_check_print_error(param,"%d when writing to datafile",my_errno);
+          DBUG_RETURN(1);
+        }
+        sort_param->filepos+=block_length;
+        share->state.split++;
+      } while (reclength);
+      break;
+    case COMPRESSED_RECORD:
+      /* Write packed-length header (and blob length), then packed body */
+      reclength=info->packed_length;
+      length= _ma_save_pack_length((uint) share->pack.version, block_buff,
+                                   reclength);
+      if (share->base.blobs)
+        length+= _ma_save_pack_length((uint) share->pack.version,
+                                      block_buff + length, info->blob_length);
+      if (my_b_write(&info->rec_cache,block_buff,length) ||
+          my_b_write(&info->rec_cache, sort_param->rec_buff, reclength))
+      {
+        _ma_check_print_error(param,"%d when writing to datafile",my_errno);
+        DBUG_RETURN(1);
+      }
+      sort_param->filepos+=reclength+length;
+      share->state.split++;
+      break;
+    }
+  }
+  if (sort_param->master)
+  {
+    /* Master thread maintains the row count and the progress display */
+    share->state.state.records++;
+    if ((param->testflag & T_WRITE_LOOP) &&
+        (share->state.state.records % WRITE_COUNT) == 0)
+    {
+      char llbuff[22];
+      printf("%s\r", llstr(share->state.state.records,llbuff));
+      VOID(fflush(stdout));
+    }
+  }
+  DBUG_RETURN(0);
+} /* _ma_sort_write_record */
+
+
+/* Compare two keys from _ma_create_index_by_sort */
+
+static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a,
+                        const void *b)
+{
+  uint not_used[2];
+  /* The merge buffers hand us pointers-to-pointers; unwrap them first */
+  const uchar *key1= *((uchar* const *) a);
+  const uchar *key2= *((uchar* const *) b);
+  return ha_key_cmp(sort_param->seg, key1, key2, USE_WHOLE_KEY, SEARCH_SAME,
+                    not_used);
+} /* sort_key_cmp */
+
+
+/*
+  Consume one key (arriving in sorted order) during repair-by-sort.
+
+  Updates the per-keypart distinct-value counters used for index
+  statistics, detects duplicates for unique keys (deleting the offending
+  row), and finally inserts the key into the sort-key blocks.
+
+  RETURN  0 ok, != 0 on error (or on failure to delete a duplicate row)
+*/
+static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a)
+{
+  uint diff_pos[2];
+  char llbuff[22],llbuff2[22];
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param= sort_info->param;
+  int cmp;
+
+  if (sort_info->key_block->inited)
+  {
+    /* Compare against the previously written key to spot duplicates */
+    cmp= ha_key_cmp(sort_param->seg, sort_info->key_block->lastkey,
+                    a, USE_WHOLE_KEY,
+                    SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT,
+                    diff_pos);
+    /*
+      For statistics we may need a second comparison with different NULL
+      semantics; diff_pos[0] ends up as the first differing keypart (1-based).
+    */
+    if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL)
+      ha_key_cmp(sort_param->seg, sort_info->key_block->lastkey,
+                 a, USE_WHOLE_KEY,
+                 SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diff_pos);
+    else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+    {
+      diff_pos[0]= maria_collect_stats_nonulls_next(sort_param->seg,
+                                                    sort_param->notnull,
+                                                    sort_info->key_block->lastkey,
+                                                    a);
+    }
+    sort_param->unique[diff_pos[0]-1]++;
+  }
+  else
+  {
+    /* First key for this index: nothing to compare with yet */
+    cmp= -1;
+    if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+      maria_collect_stats_nonulls_first(sort_param->seg, sort_param->notnull,
+                                        a);
+  }
+  if ((sort_param->keyinfo->flag & HA_NOSAME) && cmp == 0)
+  {
+    /* Duplicate on a unique key: warn and delete the later row */
+    sort_info->dupp++;
+    sort_info->info->cur_row.lastpos= get_record_for_key(sort_param->keyinfo,
+                                                         a);
+    _ma_check_print_warning(param,
+                            "Duplicate key %2u for record at %10s against "
+                            "record at %10s",
+                            sort_param->key + 1,
+                            llstr(sort_info->info->cur_row.lastpos, llbuff),
+                            llstr(get_record_for_key(sort_param->keyinfo,
+                                                     sort_info->key_block->
+                                                     lastkey),
+                                  llbuff2));
+    param->testflag|=T_RETRY_WITHOUT_QUICK;
+    if (sort_info->param->testflag & T_VERBOSE)
+      _ma_print_keydata(stdout,sort_param->seg, a, USE_WHOLE_KEY);
+    return (sort_delete_record(sort_param));
+  }
+#ifndef DBUG_OFF
+  /* Debug builds verify that the merge really produced sorted output */
+  if (cmp > 0)
+  {
+    _ma_check_print_error(param,
+                          "Internal error: Keys are not in order from sort");
+    return(1);
+  }
+#endif
+  return (sort_insert_key(sort_param, sort_info->key_block,
+                          a, HA_OFFSET_ERROR));
+} /* sort_key_write */
+
+
+/*
+  Flush the fulltext word buffer accumulated by sort_maria_ft_key_write().
+
+  If the buffer still holds plain (word,weight,rowid) entries they are
+  written as ordinary keys into the first-level tree.  Otherwise a
+  second-level tree was being built: its pending blocks are flushed and
+  lastkey is rewritten to reference the new subtree root before being
+  inserted into the first-level tree.
+
+  RETURN  0 ok, != 0 on error
+*/
+int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param)
+{
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  SORT_KEY_BLOCKS *key_block=sort_info->key_block;
+  MARIA_SHARE *share=sort_info->info->s;
+  uint val_off, val_len;
+  int error;
+  SORT_FT_BUF *maria_ft_buf=sort_info->ft_buf;
+  uchar *from, *to;
+
+  val_len=share->ft2_keyinfo.keylength;
+  get_key_full_length_rdonly(val_off, maria_ft_buf->lastkey);
+  /* 'to' is where the per-row value part starts inside lastkey */
+  to= maria_ft_buf->lastkey+val_off;
+
+  if (maria_ft_buf->buf)
+  {
+    /* flushing first-level tree */
+    error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey,
+                           HA_OFFSET_ERROR);
+    /* Re-emit lastkey once per buffered value, patching the value part */
+    for (from=to+val_len;
+         !error && from < maria_ft_buf->buf;
+         from+= val_len)
+    {
+      memcpy(to, from, val_len);
+      error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey,
+                             HA_OFFSET_ERROR);
+    }
+    return error;
+  }
+  /* flushing second-level tree keyblocks */
+  error=_ma_flush_pending_blocks(sort_param);
+  /* updating lastkey with second-level tree info */
+  ft_intXstore(maria_ft_buf->lastkey+val_off, -maria_ft_buf->count);
+  _ma_dpointer(sort_info->info->s, maria_ft_buf->lastkey+val_off+HA_FT_WLEN,
+               share->state.key_root[sort_param->key]);
+  /* restoring first level tree data in sort_info/sort_param */
+  sort_info->key_block=sort_info->key_block_end- sort_info->param->sort_key_blocks;
+  sort_param->keyinfo=share->keyinfo+sort_param->key;
+  share->state.key_root[sort_param->key]=HA_OFFSET_ERROR;
+  /* writing lastkey in first-level tree */
+  return error ? error :
+    sort_insert_key(sort_param,sort_info->key_block,
+                    maria_ft_buf->lastkey,HA_OFFSET_ERROR);
+}
+
+
+/*
+  Write one fulltext key during repair-by-sort.
+
+  Buffers consecutive entries for the same word so that words with many
+  rows can be stored as a compact two-level tree (word -> subtree of row
+  references).  Falls back to plain sort_key_write() when a two-level
+  tree cannot be used for this table.
+
+  RETURN  0 ok, != 0 on error
+*/
+static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param,
+                                   const uchar *a)
+{
+  uint a_len, val_off, val_len, error;
+  MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+  SORT_FT_BUF *ft_buf= sort_info->ft_buf;
+  SORT_KEY_BLOCKS *key_block= sort_info->key_block;
+  MARIA_SHARE *share= sort_info->info->s;
+
+  val_len=HA_FT_WLEN+share->base.rec_reflength;
+  get_key_full_length_rdonly(a_len, a);
+
+  if (!ft_buf)
+  {
+    /*
+      use two-level tree only if key_reflength fits in rec_reflength place
+      and row format is NOT static - for _ma_dpointer not to garble offsets
+    */
+    if ((share->base.key_reflength <=
+         share->base.rec_reflength) &&
+        (share->options &
+         (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)))
+      ft_buf= (SORT_FT_BUF *)my_malloc(sort_param->keyinfo->block_length +
+                                       sizeof(SORT_FT_BUF), MYF(MY_WME));
+
+    if (!ft_buf)
+    {
+      /* No buffering possible: switch permanently to plain key writes */
+      sort_param->key_write=sort_key_write;
+      return sort_key_write(sort_param, a);
+    }
+    sort_info->ft_buf= ft_buf;
+    goto word_init_ft_buf; /* no need to duplicate the code */
+  }
+  get_key_full_length_rdonly(val_off, ft_buf->lastkey);
+
+  /* Same word as the buffered one? (skip the leading length byte) */
+  if (ha_compare_text(sort_param->seg->charset,
+                      a+1,a_len-1,
+                      ft_buf->lastkey+1,val_off-1, 0, 0)==0)
+  {
+    uchar *p;
+    if (!ft_buf->buf)                   /* store in second-level tree */
+    {
+      ft_buf->count++;
+      return sort_insert_key(sort_param,key_block,
+                             a + a_len, HA_OFFSET_ERROR);
+    }
+
+    /* storing the key in the buffer. */
+    memcpy (ft_buf->buf, (const char *)a+a_len, val_len);
+    ft_buf->buf+=val_len;
+    if (ft_buf->buf < ft_buf->end)
+      return 0;
+
+    /* converting to two-level tree */
+    p=ft_buf->lastkey+val_off;
+
+    /* Claim the next uninitialized key block for the subtree */
+    while (key_block->inited)
+      key_block++;
+    sort_info->key_block=key_block;
+    sort_param->keyinfo= &share->ft2_keyinfo;
+    ft_buf->count=(ft_buf->buf - p)/val_len;
+
+    /* flushing buffer to second-level tree */
+    for (error=0; !error && p < ft_buf->buf; p+= val_len)
+      error=sort_insert_key(sort_param,key_block,p,HA_OFFSET_ERROR);
+    ft_buf->buf=0;                      /* marks "second-level mode" */
+    return error;
+  }
+
+  /* New word: flush whatever was buffered for the previous one */
+  if ((error=_ma_sort_ft_buf_flush(sort_param)))
+    return error;
+
+word_init_ft_buf:
+  /* Start buffering for the word in 'a' */
+  a_len+=val_len;
+  memcpy(ft_buf->lastkey, a, a_len);
+  ft_buf->buf=ft_buf->lastkey+a_len;
+  /*
+    32 is just a safety margin here
+    (at least max(val_len, sizeof(nod_flag)) should be there).
+    May be better performance could be achieved if we'd put
+      (sort_info->keyinfo->block_length-32)/XXX
+    instead.
+    TODO: benchmark the best value for XXX.
+  */
+  ft_buf->end= ft_buf->lastkey+ (sort_param->keyinfo->block_length-32);
+  return 0;
+} /* sort_maria_ft_key_write */
+
+
+/* get pointer to record from a key */
+
+static my_off_t get_record_for_key(MARIA_KEYDEF *keyinfo,
+                                   const uchar *key_data)
+{
+  MARIA_KEY key;
+  /* Wrap the raw key bytes in a MARIA_KEY so we can ask for its row pos */
+  key.data_length= _ma_keylength(keyinfo, key_data);
+  key.keyinfo= keyinfo;
+  key.data= (uchar*) key_data;
+  return _ma_row_pos_from_key(&key);
+} /* get_record_for_key */
+
+
+/* Insert a key in sort-key-blocks */
+
+/*
+  Insert a key into the in-memory sort-key blocks.
+
+  key_block points at the level being filled; sort_info->key_block is the
+  leaf level (nod_flag == 0).  When a block overflows it is written to the
+  index file and its last key is pushed as a separator into the next
+  level, recursing upwards.
+
+  @param prev_block  file position of the child block this key points to,
+                     or HA_OFFSET_ERROR on the leaf level
+  RETURN  0 ok, 1 on error
+*/
+static int sort_insert_key(MARIA_SORT_PARAM *sort_param,
+                           register SORT_KEY_BLOCKS *key_block,
+                           const uchar *key,
+                           my_off_t prev_block)
+{
+  uint a_length,t_length,nod_flag;
+  my_off_t filepos,key_file_length;
+  uchar *anc_buff,*lastkey;
+  MARIA_KEY_PARAM s_temp;
+  MARIA_KEYDEF *keyinfo=sort_param->keyinfo;
+  MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
+  MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+  MARIA_KEY tmp_key;
+  MARIA_HA *info= sort_info->info;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("sort_insert_key");
+
+  anc_buff= key_block->buff;
+  lastkey=key_block->lastkey;
+  /* Only the leaf level has no child pointers */
+  nod_flag= (key_block == sort_info->key_block ? 0 :
+             share->base.key_reflength);
+
+  if (!key_block->inited)
+  {
+    /* First key on this level: initialize an empty key page */
+    key_block->inited=1;
+    if (key_block == sort_info->key_block_end)
+    {
+      _ma_check_print_error(param,
+                            "To many key-block-levels; "
+                            "Try increasing sort_key_blocks");
+      DBUG_RETURN(1);
+    }
+    a_length= share->keypage_header + nod_flag;
+    key_block->end_pos= anc_buff + share->keypage_header;
+    bzero(anc_buff, share->keypage_header);
+    _ma_store_keynr(share, anc_buff, (uint) (sort_param->keyinfo -
+                                             share->keyinfo));
+    lastkey=0;                          /* No previous key in block */
+  }
+  else
+    a_length= _ma_get_page_used(share, anc_buff);
+
+  /* Save pointer to previous block */
+  if (nod_flag)
+  {
+    _ma_store_keypage_flag(share, anc_buff, KEYPAGE_FLAG_ISNOD);
+    _ma_kpointer(info,key_block->end_pos,prev_block);
+  }
+
+  tmp_key.keyinfo= keyinfo;
+  tmp_key.data= (uchar*) key;
+  tmp_key.data_length= _ma_keylength(keyinfo, key) - share->base.rec_reflength;
+  tmp_key.ref_length= share->base.rec_reflength;
+
+  /* Pack the key (possibly prefix-compressed against lastkey) and store it */
+  t_length= (*keyinfo->pack_key)(&tmp_key, nod_flag,
+                                 (uchar*) 0, lastkey, lastkey, &s_temp);
+  (*keyinfo->store_key)(keyinfo, key_block->end_pos+nod_flag,&s_temp);
+  a_length+=t_length;
+  _ma_store_page_used(share, anc_buff, a_length);
+  key_block->end_pos+=t_length;
+  if (a_length <= share->max_index_block_size)
+  {
+    /* Key fits: remember it as the new last key of this block */
+    MARIA_KEY tmp_key2;
+    tmp_key2.data= key_block->lastkey;
+    _ma_copy_key(&tmp_key2, &tmp_key);
+    key_block->last_length=a_length-t_length;
+    DBUG_RETURN(0);
+  }
+
+  /* Fill block with end-zero and write filled block */
+  _ma_store_page_used(share, anc_buff, key_block->last_length);
+  bzero(anc_buff+key_block->last_length,
+        keyinfo->block_length- key_block->last_length);
+  key_file_length=share->state.state.key_file_length;
+  if ((filepos= _ma_new(info, DFLT_INIT_HITS, &page_link)) == HA_OFFSET_ERROR)
+    DBUG_RETURN(1);
+  _ma_fast_unlock_key_del(info);
+
+  /* If we read the page from the key cache, we have to write it back to it */
+  if (page_link->changed)
+  {
+    MARIA_PAGE page;
+    pop_dynamic(&info->pinned_pages);
+    _ma_page_setup(&page, info, keyinfo, filepos, anc_buff);
+    if (_ma_write_keypage(&page, PAGECACHE_LOCK_WRITE_UNLOCK, DFLT_INIT_HITS))
+      DBUG_RETURN(1);
+  }
+  else
+  {
+    put_crc(anc_buff, filepos, share);
+    if (my_pwrite(share->kfile.file, anc_buff,
+                  (uint) keyinfo->block_length, filepos, param->myf_rw))
+      DBUG_RETURN(1);
+  }
+  DBUG_DUMP("buff", anc_buff, _ma_get_page_used(share, anc_buff));
+
+  /* Write separator-key to block in next level */
+  if (sort_insert_key(sort_param,key_block+1,key_block->lastkey,filepos))
+    DBUG_RETURN(1);
+
+  /* clear old block and write new key in it */
+  key_block->inited=0;
+  DBUG_RETURN(sort_insert_key(sort_param, key_block,key,prev_block));
+} /* sort_insert_key */
+
+
+/* Delete record when we found a duplicated key */
+
+/*
+  Delete the row belonging to a duplicated key found during repair.
+
+  Temporarily redirects the data-file descriptor of the new handle to the
+  rec_cache file so the row can be re-read, removes the keys already
+  generated for it from the indexes built so far, and finally deletes the
+  row itself.  Refuses to run in quick mode or on compressed tables.
+
+  RETURN  0 ok, != 0 on error
+*/
+static int sort_delete_record(MARIA_SORT_PARAM *sort_param)
+{
+  uint i;
+  int old_file,error;
+  uchar *key;
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
+  MARIA_HA *row_info= sort_info->new_info, *key_info= sort_info->info;
+  DBUG_ENTER("sort_delete_record");
+
+  if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK)
+  {
+    _ma_check_print_error(param,
+                          "Quick-recover aborted; Run recovery without switch "
+                          "-q or with switch -qq");
+    DBUG_RETURN(1);
+  }
+  if (key_info->s->options & HA_OPTION_COMPRESS_RECORD)
+  {
+    _ma_check_print_error(param,
+                          "Recover aborted; Can't run standard recovery on "
+                          "compressed tables with errors in data-file. "
+                          "Use 'aria_chk --safe-recover' to fix it");
+    DBUG_RETURN(1);
+  }
+
+  old_file= row_info->dfile.file;
+  /* This only affects static and dynamic row formats */
+  row_info->dfile.file= row_info->rec_cache.file;
+  if (flush_io_cache(&row_info->rec_cache))
+    DBUG_RETURN(1);
+
+  /* Scratch area behind lastkey_buff for re-generated keys */
+  key= key_info->lastkey_buff + key_info->s->base.max_key_length;
+  if ((error=(*row_info->s->read_record)(row_info, sort_param->record,
+                                         key_info->cur_row.lastpos)) &&
+      error != HA_ERR_RECORD_DELETED)
+  {
+    _ma_check_print_error(param,"Can't read record to be removed");
+    row_info->dfile.file= old_file;
+    DBUG_RETURN(1);
+  }
+  row_info->cur_row.lastpos= key_info->cur_row.lastpos;
+
+  /* Remove the row's keys from every index that has been built so far */
+  for (i=0 ; i < sort_info->current_key ; i++)
+  {
+    MARIA_KEY tmp_key;
+    (*key_info->s->keyinfo[i].make_key)(key_info, &tmp_key, i, key,
+                                        sort_param->record,
+                                        key_info->cur_row.lastpos, 0);
+    if (_ma_ck_delete(key_info, &tmp_key))
+    {
+      _ma_check_print_error(param,
+                            "Can't delete key %d from record to be removed",
+                            i+1);
+      row_info->dfile.file= old_file;
+      DBUG_RETURN(1);
+    }
+  }
+  if (sort_param->calc_checksum)
+    param->glob_crc-=(*key_info->s->calc_check_checksum)(key_info,
+                                                         sort_param->record);
+  error= (*row_info->s->delete_record)(row_info, sort_param->record);
+  if (error)
+    _ma_check_print_error(param,"Got error %d when deleting record",
+                          my_errno);
+  row_info->dfile.file= old_file;       /* restore actual value */
+  row_info->s->state.state.records--;
+  DBUG_RETURN(error);
+} /* sort_delete_record */
+
+
+/* Fix all pending blocks and flush everything to disk */
+
+/*
+  Flush all partially filled sort-key blocks to the index file.
+
+  Walks the levels from leaf upwards; each flushed block's file position
+  becomes the child pointer of the block one level up.  The position of
+  the last (topmost) block becomes the root of the tree.
+
+  RETURN  0 ok, 1 on error
+*/
+int _ma_flush_pending_blocks(MARIA_SORT_PARAM *sort_param)
+{
+  uint nod_flag,length;
+  my_off_t filepos,key_file_length;
+  SORT_KEY_BLOCKS *key_block;
+  MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+  myf myf_rw=sort_info->param->myf_rw;
+  MARIA_HA *info=sort_info->info;
+  MARIA_KEYDEF *keyinfo=sort_param->keyinfo;
+  MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+  DBUG_ENTER("_ma_flush_pending_blocks");
+
+  filepos= HA_OFFSET_ERROR;             /* if empty file */
+  nod_flag=0;
+  for (key_block=sort_info->key_block ; key_block->inited ; key_block++)
+  {
+    key_block->inited=0;
+    length= _ma_get_page_used(info->s, key_block->buff);
+    if (nod_flag)
+      _ma_kpointer(info,key_block->end_pos,filepos);
+    key_file_length= info->s->state.state.key_file_length;
+    /* Zero-fill the unused tail of the page before writing it out */
+    bzero(key_block->buff+length, keyinfo->block_length-length);
+    if ((filepos= _ma_new(info, DFLT_INIT_HITS, &page_link)) ==
+        HA_OFFSET_ERROR)
+      goto err;
+
+    /* If we read the page from the key cache, we have to write it back */
+    if (page_link->changed)
+    {
+      MARIA_PAGE page;
+      pop_dynamic(&info->pinned_pages);
+
+      _ma_page_setup(&page, info, keyinfo, filepos, key_block->buff);
+      if (_ma_write_keypage(&page, PAGECACHE_LOCK_WRITE_UNLOCK,
+                            DFLT_INIT_HITS))
+        goto err;
+    }
+    else
+    {
+      put_crc(key_block->buff, filepos, info->s);
+      if (my_pwrite(info->s->kfile.file, key_block->buff,
+                    (uint) keyinfo->block_length,filepos, myf_rw))
+        goto err;
+    }
+    DBUG_DUMP("buff",key_block->buff,length);
+    nod_flag=1;
+  }
+  info->s->state.key_root[sort_param->key]=filepos; /* Last is root for tree */
+  _ma_fast_unlock_key_del(info);
+  DBUG_RETURN(0);
+
+err:
+  _ma_fast_unlock_key_del(info);
+  DBUG_RETURN(1);
+} /* _ma_flush_pending_blocks */
+
+ /* alloc space and pointers for key_blocks */
+
+static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
+                                         uint buffer_length)
+{
+  reg1 uint idx;
+  SORT_KEY_BLOCKS *result;
+  uchar *buffers;
+  DBUG_ENTER("alloc_key_blocks");
+
+  /* One allocation holds the descriptor array followed by all page buffers */
+  result= (SORT_KEY_BLOCKS*) my_malloc((sizeof(SORT_KEY_BLOCKS)+
+                                        buffer_length+IO_SIZE)*blocks,
+                                       MYF(0));
+  if (!result)
+  {
+    _ma_check_print_error(param,"Not enough memory for sort-key-blocks");
+    return(0);
+  }
+  buffers= (uchar*) (result+blocks);
+  for (idx=0 ; idx < blocks ; idx++)
+  {
+    result[idx].inited=0;
+    result[idx].buff= buffers+(buffer_length+IO_SIZE)*idx;
+  }
+  DBUG_RETURN(result);
+} /* alloc_key_blocks */
+
+
+ /* Check if file is almost full */
+
+int maria_test_if_almost_full(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+
+  if (share->options & HA_OPTION_COMPRESS_RECORD)
+    return 0;
+  /*
+    "Almost full" means the file has passed 90% of its configured maximum.
+    The index file is checked first; the data file is only seeked if the
+    index file is still below the limit.
+  */
+  if (my_seek(share->kfile.file, 0L, MY_SEEK_END,
+              MYF(MY_THREADSAFE)) / 10 * 9 >
+      (my_off_t) share->base.max_key_file_length)
+    return 1;
+  return (my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)) / 10 * 9 >
+          (my_off_t) share->base.max_data_file_length);
+}
+
+
+/* Recreate table with bigger more alloced record-data */
+
+/*
+  Recreate a table with a new index file (data file is kept).
+
+  Snapshots the key, keyseg, column and unique definitions of the old
+  share into local copies, closes the old handle, recreates the files
+  with maria_create(..., HA_DONT_TOUCH_DATA) and reopens the table,
+  carrying over the row/state counters.  On success *org_info points to
+  the reopened table.
+
+  RETURN  0 ok, 1 error
+*/
+int maria_recreate_table(HA_CHECK *param, MARIA_HA **org_info, char *filename)
+{
+  int error;
+  MARIA_HA info;
+  MARIA_SHARE share;
+  MARIA_KEYDEF *keyinfo,*key,*key_end;
+  HA_KEYSEG *keysegs,*keyseg;
+  MARIA_COLUMNDEF *columndef,*column,*end;
+  MARIA_UNIQUEDEF *uniquedef,*u_ptr,*u_end;
+  MARIA_STATUS_INFO status_info;
+  uint unpack,key_parts;
+  ha_rows max_records;
+  ulonglong file_length,tmp_length;
+  MARIA_CREATE_INFO create_info;
+  DBUG_ENTER("maria_recreate_table");
+
+  error=1;                                      /* Default error */
+  info= **org_info;
+  status_info= (*org_info)->state[0];
+  info.state= &status_info;
+  share= *(*org_info)->s;
+  unpack= ((share.data_file_type == COMPRESSED_RECORD) &&
+           (param->testflag & T_UNPACK));
+  /*
+    NOTE(review): this failure path returns 0 while all the following
+    allocation failures return 1 -- looks like a latent bug (success
+    reported on out-of-memory); confirm against callers before changing.
+  */
+  if (!(keyinfo=(MARIA_KEYDEF*) my_alloca(sizeof(MARIA_KEYDEF) *
+                                          share.base.keys)))
+    DBUG_RETURN(0);
+  memcpy((uchar*) keyinfo,(uchar*) share.keyinfo,
+         (size_t) (sizeof(MARIA_KEYDEF)*share.base.keys));
+
+  key_parts= share.base.all_key_parts;
+  if (!(keysegs=(HA_KEYSEG*) my_alloca(sizeof(HA_KEYSEG)*
+                                       (key_parts+share.base.keys))))
+  {
+    my_afree(keyinfo);
+    DBUG_RETURN(1);
+  }
+  if (!(columndef=(MARIA_COLUMNDEF*)
+        my_alloca(sizeof(MARIA_COLUMNDEF)*(share.base.fields+1))))
+  {
+    my_afree(keyinfo);
+    my_afree(keysegs);
+    DBUG_RETURN(1);
+  }
+  if (!(uniquedef=(MARIA_UNIQUEDEF*)
+        my_alloca(sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques+1))))
+  {
+    my_afree(columndef);
+    my_afree(keyinfo);
+    my_afree(keysegs);
+    DBUG_RETURN(1);
+  }
+
+  /* Copy the column definitions in their original order */
+  for (column= share.columndef, end= share.columndef+share.base.fields;
+       column != end ;
+       column++)
+    columndef[column->column_nr]= *column;
+
+  /* Change the new key to point at the saved key segments */
+  /*
+    NOTE(review): the copy below includes share.state.header.uniques extra
+    segments while keysegs was allocated for key_parts+share.base.keys
+    only -- verify the allocation size is really sufficient here.
+  */
+  memcpy((uchar*) keysegs,(uchar*) share.keyparts,
+         (size_t) (sizeof(HA_KEYSEG)*(key_parts+share.base.keys+
+                                      share.state.header.uniques)));
+  keyseg=keysegs;
+  for (key=keyinfo,key_end=keyinfo+share.base.keys; key != key_end ; key++)
+  {
+    key->seg=keyseg;
+    for (; keyseg->type ; keyseg++)
+    {
+      if (param->language)
+        keyseg->language=param->language;       /* change language */
+    }
+    keyseg++;                                   /* Skip end pointer */
+  }
+
+  /*
+    Copy the unique definitions and change them to point at the new key
+    segments
+  */
+  memcpy((uchar*) uniquedef,(uchar*) share.uniqueinfo,
+         (size_t) (sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques)));
+  for (u_ptr=uniquedef,u_end=uniquedef+share.state.header.uniques;
+       u_ptr != u_end ; u_ptr++)
+  {
+    u_ptr->seg=keyseg;
+    keyseg+=u_ptr->keysegs+1;
+  }
+
+  /* Estimate how many rows the new file should be sized for */
+  file_length=(ulonglong) my_seek(info.dfile.file, 0L, MY_SEEK_END, MYF(0));
+  if (share.options & HA_OPTION_COMPRESS_RECORD)
+    share.base.records=max_records=info.state->records;
+  else if (share.base.min_pack_length)
+    max_records=(ha_rows) (file_length / share.base.min_pack_length);
+  else
+    max_records=0;
+  share.options&= ~HA_OPTION_TEMP_COMPRESS_RECORD;
+
+  /* Reserve 10% head-room, but honour explicit and stored maxima */
+  tmp_length= file_length+file_length/10;
+  set_if_bigger(file_length,param->max_data_file_length);
+  set_if_bigger(file_length,tmp_length);
+  set_if_bigger(file_length,(ulonglong) share.base.max_data_file_length);
+
+  VOID(maria_close(*org_info));
+
+  bzero((char*) &create_info,sizeof(create_info));
+  create_info.max_rows=max(max_records,share.base.records);
+  create_info.reloc_rows=share.base.reloc;
+  create_info.old_options=(share.options |
+                           (unpack ? HA_OPTION_TEMP_COMPRESS_RECORD : 0));
+
+  create_info.data_file_length=file_length;
+  create_info.auto_increment=share.state.auto_increment;
+  create_info.language = (param->language ? param->language :
+                          share.state.header.language);
+  create_info.key_file_length=  status_info.key_file_length;
+  create_info.org_data_file_type= ((enum data_file_type)
+                                   share.state.header.org_data_file_type);
+
+  /*
+    Allow for creating an auto_increment key. This has an effect only if
+    an auto_increment key exists in the original table.
+  */
+  create_info.with_auto_increment= TRUE;
+  create_info.null_bytes= share.base.null_bytes;
+  create_info.transactional= share.base.born_transactional;
+
+  /*
+    We don't have to handle symlinks here because we are using
+    HA_DONT_TOUCH_DATA
+  */
+  if (maria_create(filename, share.data_file_type,
+                   share.base.keys - share.state.header.uniques,
+                   keyinfo, share.base.fields, columndef,
+                   share.state.header.uniques, uniquedef,
+                   &create_info,
+                   HA_DONT_TOUCH_DATA))
+  {
+    _ma_check_print_error(param,
+                          "Got error %d when trying to recreate indexfile",
+                          my_errno);
+    goto end;
+  }
+  *org_info= maria_open(filename,O_RDWR,
+                        (HA_OPEN_FOR_REPAIR |
+                         ((param->testflag & T_WAIT_FOREVER) ?
+                          HA_OPEN_WAIT_IF_LOCKED :
+                          (param->testflag & T_DESCRIPT) ?
+                          HA_OPEN_IGNORE_IF_LOCKED :
+                          HA_OPEN_ABORT_IF_LOCKED)));
+  if (!*org_info)
+  {
+    _ma_check_print_error(param,
+                          "Got error %d when trying to open re-created "
+                          "indexfile", my_errno);
+    goto end;
+  }
+  /* We are modifying */
+  (*org_info)->s->options&= ~HA_OPTION_READ_ONLY_DATA;
+  VOID(_ma_readinfo(*org_info,F_WRLCK,0));
+  /* Restore the live counters from the old table's state */
+  (*org_info)->s->state.state.records= info.state->records;
+  if (share.state.create_time)
+    (*org_info)->s->state.create_time=share.state.create_time;
+#ifdef EXTERNAL_LOCKING
+  (*org_info)->s->state.unique= (*org_info)->this_unique= share.state.unique;
+#endif
+  (*org_info)->s->state.state.checksum= info.state->checksum;
+  (*org_info)->s->state.state.del= info.state->del;
+  (*org_info)->s->state.dellink= share.state.dellink;
+  (*org_info)->s->state.state.empty= info.state->empty;
+  (*org_info)->s->state.state.data_file_length= info.state->data_file_length;
+  *(*org_info)->state= (*org_info)->s->state.state;
+  if (maria_update_state_info(param,*org_info,UPDATE_TIME | UPDATE_STAT |
+                              UPDATE_OPEN_COUNT))
+    goto end;
+  error=0;
+end:
+  my_afree(uniquedef);
+  my_afree(keyinfo);
+  my_afree(columndef);
+  my_afree(keysegs);
+  DBUG_RETURN(error);
+}
+
+
+        /* write suffix to data file if needed */
+
+int maria_write_data_suffix(MARIA_SORT_INFO *sort_info, my_bool fix_datafile)
+{
+  MARIA_HA *info=sort_info->new_info;
+
+  /* Only compressed data files carry an extra zero-filled tail */
+  if (fix_datafile && info->s->data_file_type == COMPRESSED_RECORD)
+  {
+    uchar suffix[MEMMAP_EXTRA_MARGIN];
+    bzero(suffix, sizeof(suffix));
+    if (my_b_write(&info->rec_cache, suffix, sizeof(suffix)))
+    {
+      _ma_check_print_error(sort_info->param,
+                            "%d when writing to datafile",my_errno);
+      return 1;
+    }
+    sort_info->param->read_cache.end_of_file+= sizeof(suffix);
+  }
+  return 0;
+}
+
+
+/* Update state and maria_chk time of indexfile */
+
+/*
+  Update state and check-time information in the index file.
+
+  'update' is a bitmap of UPDATE_* flags selecting what to refresh:
+  open count, statistics (rec_per_key_part), timestamps, auto-increment.
+  Always forces the live status to disk at the end.
+
+  RETURN  0 ok, 1 on write error
+*/
+int maria_update_state_info(HA_CHECK *param, MARIA_HA *info,uint update)
+{
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("maria_update_state_info");
+
+  if (update & UPDATE_OPEN_COUNT)
+  {
+    share->state.open_count=0;
+    share->global_changed=0;
+  }
+  if (update & UPDATE_STAT)
+  {
+    uint i, key_parts= mi_uint2korr(share->state.header.key_parts);
+    share->state.records_at_analyze= share->state.state.records;
+    share->state.changed&= ~STATE_NOT_ANALYZED;
+    if (share->state.state.records)
+    {
+      for (i=0; i<key_parts; i++)
+      {
+        /* A zero estimate means the keypart was not analyzed */
+        if (!(share->state.rec_per_key_part[i]=param->new_rec_per_key_part[i]))
+          share->state.changed|= STATE_NOT_ANALYZED;
+      }
+    }
+  }
+  if (update & (UPDATE_STAT | UPDATE_SORT | UPDATE_TIME | UPDATE_AUTO_INC))
+  {
+    if (update & UPDATE_TIME)
+    {
+      share->state.check_time= time((time_t*) 0);
+      if (!share->state.create_time)
+        share->state.create_time= share->state.check_time;
+    }
+    if (_ma_state_info_write(share,
+                             MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                             MA_STATE_INFO_WRITE_FULL_INFO))
+      goto err;
+    share->changed=0;
+  }
+  {                                     /* Force update of status */
+    int error;
+    /* Temporarily pretend there are no locks so the write is not skipped */
+    uint r_locks=share->r_locks,w_locks=share->w_locks;
+    share->r_locks= share->w_locks= share->tot_locks= 0;
+    error= _ma_writeinfo(info,WRITEINFO_NO_UNLOCK);
+    share->r_locks=r_locks;
+    share->w_locks=w_locks;
+    share->tot_locks=r_locks+w_locks;
+    if (!error)
+      DBUG_RETURN(0);
+  }
+err:
+  _ma_check_print_error(param,"%d when updating keyfile",my_errno);
+  DBUG_RETURN(1);
+}
+
+/*
+ Update auto increment value for a table
+ When setting the 'repair_only' flag we only want to change the
+ old auto_increment value if its wrong (smaller than some given key).
+ The reason is that we shouldn't change the auto_increment value
+ for a table without good reason when only doing a repair; If the
+ user have inserted and deleted rows, the auto_increment value
+ may be bigger than the biggest current row and this is ok.
+
+  If repair_only is not set, we will also update the auto_increment value
+  to param->auto_increment if that is bigger than the biggest key.
+*/
+
+/*
+  Update the stored auto_increment value from the last key in the
+  auto-increment index (see the function comment above for the
+  repair_only semantics).
+*/
+void _ma_update_auto_increment_key(HA_CHECK *param, MARIA_HA *info,
+                                   my_bool repair_only)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *record;
+  DBUG_ENTER("update_auto_increment_key");
+
+  if (!share->base.auto_key ||
+      ! maria_is_key_active(share->state.key_map, share->base.auto_key - 1))
+  {
+    if (!(param->testflag & T_VERY_SILENT))
+      _ma_check_print_info(param,
+                           "Table: %s doesn't have an auto increment key\n",
+                           param->isam_file_name);
+    DBUG_VOID_RETURN;
+  }
+  if (!(param->testflag & T_SILENT) &&
+      !(param->testflag & T_REP))
+    printf("Updating Aria file: %s\n", param->isam_file_name);
+  /*
+    We have to use an allocated buffer instead of info->rec_buff as
+    _ma_put_key_in_record() may use info->rec_buff
+  */
+  if (!(record= (uchar*) my_malloc((size_t) share->base.default_rec_buff_size,
+                                   MYF(0))))
+  {
+    _ma_check_print_error(param,"Not enough memory for extra record");
+    DBUG_VOID_RETURN;
+  }
+
+  /* Read only the key (last entry of the auto-increment index) */
+  maria_extra(info,HA_EXTRA_KEYREAD,0);
+  if (maria_rlast(info, record, share->base.auto_key-1))
+  {
+    if (my_errno != HA_ERR_END_OF_FILE)
+    {
+      maria_extra(info,HA_EXTRA_NO_KEYREAD,0);
+      my_free((char*) record, MYF(0));
+      _ma_check_print_error(param,"%d when reading last record",my_errno);
+      DBUG_VOID_RETURN;
+    }
+    /* Empty index: take the caller-supplied value unless repair-only */
+    if (!repair_only)
+      share->state.auto_increment=param->auto_increment_value;
+  }
+  else
+  {
+    const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg;
+    ulonglong auto_increment=
+      ma_retrieve_auto_increment(record + keyseg->start, keyseg->type);
+    /* Never lower the stored value; only raise it if the key is bigger */
+    set_if_bigger(share->state.auto_increment,auto_increment);
+    if (!repair_only)
+      set_if_bigger(share->state.auto_increment, param->auto_increment_value);
+  }
+  maria_extra(info,HA_EXTRA_NO_KEYREAD,0);
+  my_free((char*) record, MYF(0));
+  maria_update_state_info(param, info, UPDATE_AUTO_INC);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+ Update statistics for each part of an index
+
+ SYNOPSIS
+ maria_update_key_parts()
+ keyinfo IN Index information (only key->keysegs used)
+ rec_per_key_part OUT Store statistics here
+ unique IN Array of (#distinct tuples)
+ notnull_tuples IN Array of (#tuples), or NULL
+ records Number of records in the table
+
+ DESCRIPTION
+    This function is called to produce index statistics values from unique and
+ notnull_tuples arrays after these arrays were produced with sequential
+ index scan (the scan is done in two places: chk_index() and
+ sort_key_write()).
+
+ This function handles all 3 index statistics collection methods.
+
+ Unique is an array:
+ unique[0]= (#different values of {keypart1}) - 1
+ unique[1]= (#different values of {keypart1,keypart2} tuple)-unique[0]-1
+ ...
+
+ For MI_STATS_METHOD_IGNORE_NULLS method, notnull_tuples is an array too:
+ notnull_tuples[0]= (#of {keypart1} tuples such that keypart1 is not NULL)
+ notnull_tuples[1]= (#of {keypart1,keypart2} tuples such that all
+ keypart{i} are not NULL)
+ ...
+ For all other statistics collection methods notnull_tuples==NULL.
+
+ Output is an array:
+ rec_per_key_part[k] =
+ = E(#records in the table such that keypart_1=c_1 AND ... AND
+ keypart_k=c_k for arbitrary constants c_1 ... c_k)
+
+ = {assuming that values have uniform distribution and index contains all
+       tuples from the domain (or that {c_1, ..., c_k} tuple is chosen from
+ index tuples}
+
+ = #tuples-in-the-index / #distinct-tuples-in-the-index.
+
+ The #tuples-in-the-index and #distinct-tuples-in-the-index have different
+ meaning depending on which statistics collection method is used:
+
+ MI_STATS_METHOD_* how are nulls compared? which tuples are counted?
+ NULLS_EQUAL NULL == NULL all tuples in table
+ NULLS_NOT_EQUAL NULL != NULL all tuples in table
+ IGNORE_NULLS n/a tuples that don't have NULLs
+*/
+
+void maria_update_key_parts(MARIA_KEYDEF *keyinfo, double *rec_per_key_part,
+                            ulonglong *unique, ulonglong *notnull,
+                            ulonglong records)
+{
+  ulonglong distinct_so_far= 0;
+  uint part;
+
+  for (part= 0 ; part < keyinfo->keysegs ; part++)
+  {
+    ulonglong tuples, unique_tuples;
+    double estimate;
+
+    /* Running sum of distinct prefixes seen up to this keypart */
+    distinct_so_far+= unique[part];
+    unique_tuples= distinct_so_far + 1;
+    if (notnull)
+    {
+      tuples= notnull[part];
+      /*
+        #(unique_tuples not counting tuples with NULLs) =
+          #(unique_tuples counting tuples with NULLs as different) -
+          #(tuples with NULLs)
+      */
+      unique_tuples-= (records - notnull[part]);
+    }
+    else
+      tuples= records;
+
+    if (unique_tuples == 0)
+      estimate= 1;
+    else if (distinct_so_far == 0)
+      estimate= ulonglong2double(tuples);       /* 1 unique tuple */
+    else
+      estimate= ulonglong2double(tuples) / ulonglong2double(unique_tuples);
+
+    /*
+      for some weird keys (e.g. FULLTEXT) the estimate can be <1 here.
+      let's ensure it is not
+    */
+    set_if_bigger(estimate,1);
+
+    rec_per_key_part[part]= estimate;
+  }
+}
+
+
+static ha_checksum maria_byte_checksum(const uchar *buf, uint length)
+{
+  ha_checksum crc= 0;
+  uint i;
+
+  /* Shift-and-add checksum; the old top bit is folded back into the sum */
+  for (i= 0 ; i < length ; i++)
+    crc= ((crc << 1) + buf[i]) +
+      test(crc & (((ha_checksum) 1) << (8*sizeof(ha_checksum)-1)));
+  return crc;
+}
+
+static my_bool maria_too_big_key_for_sort(MARIA_KEYDEF *key, ha_rows rows)
+{
+  uint key_maxlength= key->maxlength;
+
+  /* Spatial keys can never be built by sorting */
+  if (key->flag & HA_SPATIAL)
+    return TRUE;
+  if (key->flag & HA_FULLTEXT)
+  {
+    /* Fulltext keys expand each word to its sort representation */
+    uint ft_max_word_len_for_sort= FT_MAX_WORD_LEN_FOR_SORT*
+      key->seg->charset->mbmaxlen;
+    key_maxlength+= ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
+  }
+  return ((key->flag & (HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY |
+                        HA_FULLTEXT)) &&
+          ((ulonglong) rows * key_maxlength >
+           (ulonglong) maria_max_temp_length));
+}
+
+/*
+ Deactivate all not unique index that can be recreated fast
+ These include packed keys on which sorting will use more temporary
+ space than the max allowed file length or for which the unpacked keys
+ will take much more space than packed keys.
+ Note that 'rows' may be zero for the case when we don't know how many
+ rows we will put into the file.
+ */
+
+void maria_disable_non_unique_index(MARIA_HA *info, ha_rows rows)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *key=share->keyinfo;
+  uint i;
+
+  /* Only valid on an empty table (or when the row count is unknown) */
+  DBUG_ASSERT(share->state.state.records == 0 &&
+              (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES));
+  for (i=0 ; i < share->base.keys ; i++,key++)
+  {
+    /*
+      Keep unique, spatial, rtree and auto-increment keys active; anything
+      else that can be rebuilt by sorting afterwards is switched off.
+    */
+    if (!(key->flag &
+          (HA_NOSAME | HA_SPATIAL | HA_AUTO_KEY | HA_RTREE_INDEX)) &&
+        ! maria_too_big_key_for_sort(key,rows) && share->base.auto_key != i+1)
+    {
+      maria_clear_key_active(share->state.key_map, i);
+      info->update|= HA_STATE_CHANGED;
+    }
+  }
+}
+
+
+/*
+ Return TRUE if we can use repair by sorting
+ One can set the force argument to force to use sorting
+ even if the temporary file would be quite big!
+*/
+
+my_bool maria_test_if_sort_rep(MARIA_HA *info, ha_rows rows,
+                               ulonglong key_map, my_bool force)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keydef= share->keyinfo;
+  MARIA_KEYDEF *keydef_end= keydef + share->base.keys;
+
+  /*
+    maria_repair_by_sort only works if we have at least one key. If we don't
+    have any keys, we should use the normal repair.
+  */
+  if (! maria_is_any_key_active(key_map))
+    return FALSE;                               /* Can't use sort */
+  if (force)
+    return TRUE;                /* Caller accepts a big temporary file */
+  for (; keydef != keydef_end ; keydef++)
+  {
+    if (maria_too_big_key_for_sort(keydef,rows))
+      return FALSE;
+  }
+  return TRUE;
+}
+
+
+/**
+ @brief Create a new handle for manipulation the new record file
+
+ @note
+ It's ok for Recovery to have two MARIA_SHARE on the same index file
+ because the one we create here is not transactional
+*/
+
+static my_bool create_new_data_handle(MARIA_SORT_PARAM *param, File new_file)
+{
+
+  MARIA_SORT_INFO *sort_info= param->sort_info;
+  MARIA_HA *info= sort_info->info;
+  MARIA_HA *new_info;
+  DBUG_ENTER("create_new_data_handle");
+
+  /* Non-transactional copy of the table, pointed at the new data file */
+  if (!(sort_info->new_info= maria_open(info->s->open_file_name.str, O_RDWR,
+                                        HA_OPEN_COPY | HA_OPEN_FOR_REPAIR)))
+    DBUG_RETURN(1);
+
+  new_info= sort_info->new_info;
+  _ma_bitmap_set_pagecache_callbacks(&new_info->s->bitmap.file,
+                                     new_info->s);
+  _ma_set_data_pagecache_callbacks(&new_info->dfile, new_info->s);
+  change_data_file_descriptor(new_info, new_file);
+  maria_lock_database(new_info, F_EXTRA_LCK);
+  if ((sort_info->param->testflag & T_UNPACK) &&
+      info->s->data_file_type == COMPRESSED_RECORD)
+  {
+    /* Re-initialize the share for the uncompressed row format */
+    (*new_info->s->once_end)(new_info->s);
+    (*new_info->s->end)(new_info);
+    restore_data_file_type(new_info->s);
+    _ma_setup_functions(new_info->s);
+    if ((*new_info->s->once_init)(new_info->s, new_file) ||
+        (*new_info->s->init)(new_info))
+      DBUG_RETURN(1);
+  }
+  _ma_reset_status(new_info);
+  if (_ma_initialize_data_file(new_info->s, new_file))
+    DBUG_RETURN(1);
+
+  /* Take into account any bitmap page created above: */
+  param->filepos= new_info->s->state.state.data_file_length;
+
+  /* Use new virtual functions for key generation */
+  info->s->keypos_to_recpos= new_info->s->keypos_to_recpos;
+  info->s->recpos_to_keypos= new_info->s->recpos_to_keypos;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Decide which row format the repaired data file will use and make sure
+  sort_delete_record() gets a delete function matching that format.
+*/
+static void
+set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share)
+{
+  if ((sort_info->new_data_file_type=share->data_file_type) ==
+      COMPRESSED_RECORD && sort_info->param->testflag & T_UNPACK)
+  {
+    MARIA_SHARE tmp;
+    /* Unpacking: revert to the row format the table had before packing */
+    sort_info->new_data_file_type= share->state.header.org_data_file_type;
+    /* Set delete_function for sort_delete_record() */
+    tmp= *share;
+    /*
+      NOTE(review): this assigns ~HA_OPTION_COMPRESS_RECORD (i.e. all bits
+      except the compress one) instead of masking with '&=' as the sibling
+      restore_data_file_type() does.  Presumably only the compress bit
+      matters to _ma_setup_functions() on this temporary copy -- confirm
+      before changing.
+    */
+    tmp.options= ~HA_OPTION_COMPRESS_RECORD;
+    _ma_setup_functions(&tmp);
+    share->delete_record=tmp.delete_record;
+  }
+}
+
+/*
+  Switch the share back from COMPRESSED_RECORD to its original row format
+  and refresh the key<->record position conversion functions.
+*/
+static void restore_data_file_type(MARIA_SHARE *share)
+{
+  MARIA_SHARE scratch;
+
+  /* Drop the compress option, both in memory and in the stored header */
+  share->options&= ~HA_OPTION_COMPRESS_RECORD;
+  mi_int2store(share->state.header.options, share->options);
+  share->state.header.data_file_type= share->state.header.org_data_file_type;
+  share->data_file_type= share->state.header.data_file_type;
+  share->pack.header_length= 0;
+
+  /* Use new virtual functions for key generation */
+  scratch= *share;
+  _ma_setup_functions(&scratch);
+  share->keypos_to_recpos= scratch.keypos_to_recpos;
+  share->recpos_to_keypos= scratch.recpos_to_keypos;
+}
+
+
+/*
+  Replace the handler's data-file descriptor with the new file.
+  The bitmap shares the same descriptor, so both are repointed and the
+  bitmap cache is invalidated.
+*/
+static void change_data_file_descriptor(MARIA_HA *info, File new_file)
+{
+  my_close(info->dfile.file, MYF(MY_WME));
+  info->dfile.file= new_file;
+  info->s->bitmap.file.file= new_file;
+  _ma_bitmap_reset_cache(info->s);
+}
+
+
+/**
+  @brief Mark the data file to not be used
+
+  @note
+  This is used in repair when we want to ensure the handler will not
+  write anything to the data file anymore
+*/
+
+static void unuse_data_file_descriptor(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+
+  /* -1 marks "no file"; the bitmap shares the same descriptor */
+  share->bitmap.file.file= -1;
+  info->dfile.file= -1;
+  _ma_bitmap_reset_cache(share);
+}
+
+
+/*
+  Propagate every data-file related state field from the data file
+  generated by repair back to the original handler.
+*/
+
+static void copy_data_file_state(MARIA_STATE_INFO *to,
+                                 MARIA_STATE_INFO *from)
+{
+  /* row and free-space accounting */
+  to->state.records=           from->state.records;
+  to->state.del=               from->state.del;
+  to->state.empty=             from->state.empty;
+  to->state.data_file_length=  from->state.data_file_length;
+  /* block layout bookkeeping */
+  to->split=                   from->split;
+  to->dellink=                 from->dellink;
+  to->first_bitmap_with_space= from->first_bitmap_with_space;
+}
+
+
+/*
+  Read 'safely' next record while scanning table.
+
+  SYNOPSIS
+    _ma_safe_scan_block_record()
+    sort_info           Repair context (page cursor, check parameters)
+    info                Maria handler
+    record              Store found here
+
+  NOTES
+    - One must have called maria_scan() before this
+
+    Differences compared to _ma_scan_block_records() are:
+    - We read all blocks, not only blocks marked by the bitmap to be safe
+    - In case of errors, next read will read next record.
+    - More sanity checks
+
+  RETURN
+    0                   ok
+    HA_ERR_END_OF_FILE  End of file
+    #                   error number
+*/
+
+static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info,
+                                      MARIA_HA *info, uchar *record)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_RECORD_POS record_pos= info->cur_row.nextpos;
+  pgcache_page_no_t page= sort_info->page;
+  DBUG_ENTER("_ma_safe_scan_block_record");
+
+  for (;;)
+  {
+    /* Find next row in current page */
+    if (likely(record_pos < info->scan.number_of_rows))
+    {
+      uint length, offset;
+      uchar *data, *end_of_data;
+      char llbuff[22];
+
+      /* Skip over deleted directory entries (offset 0 == empty slot) */
+      while (!(offset= uint2korr(info->scan.dir)))
+      {
+        info->scan.dir-= DIR_ENTRY_SIZE;
+        record_pos++;
+        if (info->scan.dir < info->scan.dir_end)
+        {
+          /* Ran past the directory without finding a row: page is broken */
+          _ma_check_print_info(sort_info->param,
+                               "Wrong directory on page %s",
+                               llstr(page, llbuff));
+          goto read_next_page;
+        }
+      }
+      /* found row */
+      info->cur_row.lastpos= info->scan.row_base_page + record_pos;
+      info->cur_row.nextpos= record_pos + 1;
+      data= info->scan.page_buff + offset;
+      length= uint2korr(info->scan.dir + 2);
+      end_of_data= data + length;
+      info->scan.dir-= DIR_ENTRY_SIZE;          /* Point to previous row */
+
+      /* Sanity-check the entry before handing the row to the reader */
+      if (end_of_data > info->scan.dir_end ||
+          offset < PAGE_HEADER_SIZE || length < share->base.min_block_length)
+      {
+        _ma_check_print_info(sort_info->param,
+                             "Wrong directory entry %3u at page %s",
+                             (uint) record_pos, llstr(page, llbuff));
+        record_pos++;
+        continue;
+      }
+      else
+      {
+        DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
+        DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data));
+      }
+    }
+
+read_next_page:
+    /* Read until we find next head page */
+    for (;;)
+    {
+      uint page_type;
+      char llbuff[22];
+
+      sort_info->page++;                        /* In case of errors */
+      page++;
+      if (!(page % share->bitmap.pages_covered))
+      {
+        /* Skip bitmap */
+        page++;
+        sort_info->page++;
+      }
+      if ((my_off_t) (page + 1) * share->block_size > sort_info->filelength)
+        DBUG_RETURN(HA_ERR_END_OF_FILE);
+      if (!(pagecache_read(share->pagecache,
+                           &info->dfile,
+                           page, 0, info->scan.page_buff,
+                           PAGECACHE_READ_UNKNOWN_PAGE,
+                           PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+      {
+        /* A CRC failure is reported but does not abort the scan */
+        if (my_errno == HA_ERR_WRONG_CRC)
+        {
+          _ma_check_print_info(sort_info->param,
+                               "Wrong CRC on datapage at %s",
+                               llstr(page, llbuff));
+          continue;
+        }
+        DBUG_RETURN(my_errno);
+      }
+      page_type= (info->scan.page_buff[PAGE_TYPE_OFFSET] &
+                  PAGE_TYPE_MASK);
+      if (page_type == HEAD_PAGE)
+      {
+        /* A head page with zero rows is also suspicious; keep scanning */
+        if ((info->scan.number_of_rows=
+             (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) != 0)
+          break;
+        _ma_check_print_info(sort_info->param,
+                             "Wrong head page at page %s",
+                             llstr(page, llbuff));
+      }
+      else if (page_type >= MAX_PAGE_TYPE)
+      {
+        _ma_check_print_info(sort_info->param,
+                             "Found wrong page type: %d at page %s",
+                             page_type, llstr(page, llbuff));
+      }
+    }
+
+    /* New head page */
+    info->scan.dir= (info->scan.page_buff + share->block_size -
+                     PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
+    info->scan.dir_end= (info->scan.dir -
+                         (info->scan.number_of_rows - 1) *
+                         DIR_ENTRY_SIZE);
+    info->scan.row_base_page= ma_recordpos(page, 0);
+    record_pos= 0;
+  }
+}
+
+
+/**
+  @brief Writes a LOGREC_REPAIR_TABLE record and updates create_rename_lsn
+  if needed (so that maria_read_log does not redo the repair).
+
+  @param  param  description of the REPAIR operation
+  @param  info   table
+
+  @note NOTE(review): the error paths below return without restoring
+  share->now_transactional from save_now_transactional; presumably
+  acceptable since the repair has failed anyway -- confirm.
+
+  @return Operation status
+    @retval 0      ok
+    @retval 1      error (disk problem)
+*/
+
+my_bool write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  /* in case this is maria_chk or recovery... */
+  if (translog_status == TRANSLOG_OK && !maria_in_recovery &&
+      share->base.born_transactional)
+  {
+    my_bool save_now_transactional= share->now_transactional;
+
+    /*
+      For now this record is only informative. It could serve when applying
+      logs to a backup, but that needs more thought. Assume table became
+      corrupted. It is repaired, then some writes happen to it.
+      Later we restore an old backup, and want to apply this REDO_REPAIR_TABLE
+      record. For it to give the same result as originally, the table should
+      be corrupted the same way, so applying previous REDOs should produce the
+      same corruption; that's really not guaranteed (different execution paths
+      in execution of REDOs vs runtime code so not same bugs hit, temporary
+      hardware issues not repeatable etc). Corruption may not be repeatable.
+      A reasonable solution is to execute the REDO_REPAIR_TABLE record and
+      check if the checksum of the resulting table matches what it was at the
+      end of the original repair (should be stored in log record); or execute
+      the REDO_REPAIR_TABLE if the checksum of the table-before-repair matches
+      was it was at the start of the original repair (should be stored in log
+      record).
+    */
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    uchar log_data[FILEID_STORE_SIZE + 8 + 8];
+    LSN lsn;
+
+    /*
+      testflag gives an idea of what REPAIR did (in particular T_QUICK
+      or not: did it touch the data file or not?).
+    */
+    int8store(log_data + FILEID_STORE_SIZE, param->testflag);
+    /* org_key_map is used when recreating index after a load data infile */
+    int8store(log_data + FILEID_STORE_SIZE + 8, param->org_key_map);
+
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+    /* temporarily force logging so the record reaches the translog */
+    share->now_transactional= 1;
+    if (unlikely(translog_write_record(&lsn, LOGREC_REDO_REPAIR_TABLE,
+                                       &dummy_transaction_object, info,
+                                       (translog_size_t) sizeof(log_data),
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, log_data, NULL) ||
+                 translog_flush(lsn)))
+      return TRUE;
+    /*
+      The table's existence was made durable earlier (MY_SYNC_DIR passed to
+      maria_change_to_newfile()). All pages have been flushed, state too, we
+      need to force it to disk. Old REDOs should not be applied to the table,
+      which is already enforced as skip_redos_lsn was increased in
+      protect_against_repair_crash(). But if this is an explicit repair,
+      even UNDO phase should ignore this table: create_rename_lsn should be
+      increased, and this also serves for the REDO_REPAIR to be ignored by
+      maria_read_log.
+      The fully correct order would be: sync data and index file, remove crash
+      mark and update LSNs then write state and sync index file. But at this
+      point state (without crash mark) is already written.
+    */
+    if ((!(param->testflag & T_NO_CREATE_RENAME_LSN) &&
+         _ma_update_state_lsns(share, lsn, share->state.create_trid, FALSE,
+                               FALSE)) ||
+        _ma_sync_table_files(info))
+      return TRUE;
+    share->now_transactional= save_now_transactional;
+  }
+  return FALSE;
+}
+
+
+/**
+  Writes an UNDO record which if executed in UNDO phase, will empty the
+  table. Such record is thus logged only in certain cases of bulk insert
+  (table needs to be empty etc).
+
+  @return TRUE on write/flush error, FALSE on success
+*/
+my_bool write_log_record_for_bulk_insert(MARIA_HA *info)
+{
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+  uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE];
+  LSN lsn;
+
+  /* The transaction's current undo_lsn precedes the file id */
+  lsn_store(log_data, info->trn->undo_lsn);
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+  if (translog_write_record(&lsn, LOGREC_UNDO_BULK_INSERT,
+                            info->trn, info,
+                            (translog_size_t) sizeof(log_data),
+                            TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                            log_data + LSN_STORE_SIZE, NULL))
+    return TRUE;
+  return translog_flush(lsn) != 0;              /* WAL */
+}
+
+
+/*
+  Give error message why reading of key page failed.
+
+  Reports HA_ERR_CRASHED as a bad-page-content error; everything else is
+  reported as a read failure together with my_errno.
+*/
+
+static void report_keypage_fault(HA_CHECK *param, MARIA_HA *info,
+                                 my_off_t position)
+{
+  /*
+    llstr() formats a longlong and may need up to 21 characters plus the
+    terminating NUL.  The buffer was previously [11], which could overflow
+    the stack for page numbers >= 10^10; use the conventional 22 bytes
+    (as every llbuff[22] elsewhere in this file).
+  */
+  char buff[22];
+  uint32 block_size= info->s->block_size;
+
+  if (my_errno == HA_ERR_CRASHED)
+    _ma_check_print_error(param,
+                          "Wrong base information on indexpage at page: %s",
+                          llstr(position / block_size, buff));
+  else
+    _ma_check_print_error(param,
+                          "Can't read indexpage from page: %s, "
+                          "error: %d",
+                          llstr(position / block_size, buff), my_errno);
+}
+
+
+/**
+  When we want to check a table, we verify that the transaction ids of rows
+  and keys are not bigger than the biggest id generated by Maria so far,
+  which is returned by the function below.
+
+  @note If control file is not open, 0 may be returned; to not confuse
+  this with a valid max trid of 0, the caller should notice that it failed
+  to open the control file (ma_control_file_inited() can serve for that).
+*/
+
+static TrID max_trid_in_system(void)
+{
+  /* 0 if transaction manager not initialized */
+  TrID running_max= trnman_get_max_trid();
+  /* if last shutdown is old, the control file value may be far bigger */
+  TrID durable_max= max_trid_in_control_file;
+  return (running_max > durable_max) ? running_max : durable_max;
+}
+
+
+/*
+  Report a row whose transaction id is newer than any id known to the
+  system.  Only the first such row produces a message; later ones are
+  just counted in param->not_visible_rows_found.
+*/
+static void _ma_check_print_not_visible_error(HA_CHECK *param, TrID used_trid)
+{
+  char trid_buff[22], max_buff[22];
+
+  if (param->not_visible_rows_found++)
+    return;                                     /* already reported once */
+  if (ma_control_file_inited())
+  {
+    _ma_check_print_error(param,
+                          "Found row with transaction id %s when max "
+                          "transaction id according to aria_control_file "
+                          "is %s",
+                          llstr(used_trid, trid_buff),
+                          llstr(param->max_trid, max_buff));
+  }
+  else
+  {
+    /* Without a control file we cannot be sure this is corruption */
+    _ma_check_print_warning(param,
+                            "Found row with transaction id %s but no "
+                            "aria_control_file was used or specified. "
+                            "The table may be corrupted",
+                            llstr(used_trid, trid_buff));
+  }
+}
+
+
+/**
+  Mark that we can retry normal repair if we used quick repair
+
+  We shouldn't do this in case of disk error as in this case we are likely
+  to lose much more than expected.
+*/
+
+void retry_if_quick(MARIA_SORT_PARAM *sort_param, int error)
+{
+  HA_CHECK *param= sort_param->sort_info->param;
+
+  /* only when the data file was untouched and the error is a handler one */
+  if (sort_param->fix_datafile || error < HA_ERR_FIRST)
+    return;
+  param->retry_repair= 1;
+  param->testflag|= T_RETRY_WITHOUT_QUICK;
+}
diff --git a/storage/maria/ma_check_standalone.h b/storage/maria/ma_check_standalone.h
new file mode 100644
index 00000000000..8cda285bb99
--- /dev/null
+++ b/storage/maria/ma_check_standalone.h
@@ -0,0 +1,104 @@
+/* Copyright (C) 2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ All standalone programs which need to use functions from ma_check.c
+ (like maria_repair()) must define their version of _ma_killed_ptr()
+ and _ma_check_print_info|warning|error(). Indeed, linking with ma_check.o
+ brings in the dependencies of ma_check.o which are definitions of the above
+ functions; if the program does not define them then the ones of
+ ha_maria.o are used i.e. ha_maria.o is linked into the program, and this
+ brings dependencies of ha_maria.o on mysqld.o into the program's linking
+ which thus fails, as the program is not linked with mysqld.o.
+ This file contains the versions of these functions used by maria_chk and
+ maria_read_log.
+*/
+
+/*
+  Check if check/repair operation was killed by a signal.
+  Standalone tools have no server thread to poll, so a check/repair can
+  never be interrupted here: always report "not killed".
+*/
+
+int _ma_killed_ptr(HA_CHECK *param __attribute__((unused)))
+{
+  return 0;
+}
+
+ /* print warnings and errors */
+ /* VARARGS */
+
+/*
+  Print an informational check/repair message to stdout.
+  param is unused here (no bookkeeping for plain info messages).
+*/
+void _ma_check_print_info(HA_CHECK *param __attribute__((unused)),
+                          const char *fmt,...)
+{
+  va_list args;
+  DBUG_ENTER("_ma_check_print_info");
+  DBUG_PRINT("enter", ("format: %s", fmt));
+
+  va_start(args,fmt);
+  VOID(vfprintf(stdout, fmt, args));
+  VOID(fputc('\n',stdout));
+  va_end(args);
+  DBUG_VOID_RETURN;
+}
+
+/* VARARGS */
+
+/*
+  Print a check/repair warning to stderr and mark the check context.
+  On the first warning/error for this table, optionally print the table
+  name (T_SILENT mode) and flag possible data loss in out_flag.
+*/
+void _ma_check_print_warning(HA_CHECK *param, const char *fmt,...)
+{
+  va_list args;
+  DBUG_ENTER("_ma_check_print_warning");
+  DBUG_PRINT("enter", ("format: %s", fmt));
+
+  /* keep stdout/stderr output ordered */
+  fflush(stdout);
+  if (!param->warning_printed && !param->error_printed)
+  {
+    if (param->testflag & T_SILENT)
+      fprintf(stderr,"%s: Aria file %s\n",my_progname_short,
+              param->isam_file_name);
+    param->out_flag|= O_DATA_LOST;
+  }
+  param->warning_printed=1;
+  va_start(args,fmt);
+  fprintf(stderr,"%s: warning: ",my_progname_short);
+  VOID(vfprintf(stderr, fmt, args));
+  VOID(fputc('\n',stderr));
+  fflush(stderr);
+  va_end(args);
+  DBUG_VOID_RETURN;
+}
+
+/* VARARGS */
+
+/*
+  Print a check/repair error to stderr and mark the check context.
+  On the first warning/error for this table, optionally print the table
+  name (T_SILENT mode) and flag possible data loss in out_flag.
+*/
+void _ma_check_print_error(HA_CHECK *param, const char *fmt,...)
+{
+  va_list args;
+  DBUG_ENTER("_ma_check_print_error");
+  DBUG_PRINT("enter", ("format: %s", fmt));
+
+  /* keep stdout/stderr output ordered */
+  fflush(stdout);
+  if (!param->warning_printed && !param->error_printed)
+  {
+    if (param->testflag & T_SILENT)
+      fprintf(stderr,"%s: Aria file %s\n",my_progname_short,param->isam_file_name);
+    param->out_flag|= O_DATA_LOST;
+  }
+  /* |= (not =) presumably preserves extra bits set by callers -- confirm */
+  param->error_printed|=1;
+  va_start(args,fmt);
+  fprintf(stderr,"%s: error: ",my_progname_short);
+  VOID(vfprintf(stderr, fmt, args));
+  VOID(fputc('\n',stderr));
+  fflush(stderr);
+  va_end(args);
+  DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c
new file mode 100644
index 00000000000..cf13cee9452
--- /dev/null
+++ b/storage/maria/ma_checkpoint.c
@@ -0,0 +1,1196 @@
+/* Copyright (C) 2006,2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3071 Maria checkpoint
+ First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+/* Here is the implementation of this module */
+
+/** @todo RECOVERY BUG this is unreviewed code */
+/*
+ Summary:
+ checkpoints are done either by a background thread (checkpoint every Nth
+ second) or by a client.
+ In ha_maria, it's not made available to clients, and will soon be done by a
+ background thread (periodically taking checkpoints and flushing dirty
+ pages).
+*/
+
+#include "maria_def.h"
+#include "ma_pagecache.h"
+#include "ma_blockrec.h"
+#include "ma_checkpoint.h"
+#include "ma_loghandler_lsn.h"
+#include "ma_servicethread.h"
+
+
+/** @brief type of checkpoint currently running */
+static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE;
+/** @brief protects checkpoint_in_progress */
+static pthread_mutex_t LOCK_checkpoint;
+/** @brief for killing the background checkpoint thread */
+static pthread_cond_t COND_checkpoint;
+/** @brief control structure for checkpoint background thread */
+static MA_SERVICE_THREAD_CONTROL checkpoint_control=
+ {THREAD_DEAD, FALSE, &LOCK_checkpoint, &COND_checkpoint};
+/* is ulong like pagecache->blocks_changed */
+static ulong pages_to_flush_before_next_checkpoint;
+static PAGECACHE_FILE *dfiles, /**< data files to flush in background */
+ *dfiles_end; /**< list of data files ends here */
+static PAGECACHE_FILE *kfiles, /**< index files to flush in background */
+ *kfiles_end; /**< list of index files ends here */
+/* those two statistics below could serve in SHOW GLOBAL STATUS */
+static uint checkpoints_total= 0, /**< all checkpoint requests made */
+ checkpoints_ok_total= 0; /**< all checkpoints which succeeded */
+
+struct st_filter_param
+{
+ LSN up_to_lsn; /**< only pages with rec_lsn < this LSN */
+ uint max_pages; /**< stop after flushing this number pages */
+}; /**< information to determine which dirty pages should be flushed */
+
+static enum pagecache_flush_filter_result
+filter_flush_file_medium(enum pagecache_page_type type,
+ pgcache_page_no_t page,
+ LSN rec_lsn, void *arg);
+static enum pagecache_flush_filter_result
+filter_flush_file_full(enum pagecache_page_type type,
+ pgcache_page_no_t page,
+ LSN rec_lsn, void *arg);
+static enum pagecache_flush_filter_result
+filter_flush_file_evenly(enum pagecache_page_type type,
+ pgcache_page_no_t pageno,
+ LSN rec_lsn, void *arg);
+static int really_execute_checkpoint(void);
+pthread_handler_t ma_checkpoint_background(void *arg);
+static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon);
+
+/**
+  @brief Does a checkpoint
+
+  @param  level    what level of checkpoint to do
+  @param  no_wait  if another checkpoint of same or stronger level
+                   is already running, consider our job done
+
+  @note In ha_maria, there can never be two threads trying a checkpoint at
+  the same time.
+
+  @return Operation status
+    @retval 0 ok
+    @retval !=0 error
+*/
+
+int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
+{
+  int result= 0;
+  DBUG_ENTER("ma_checkpoint_execute");
+
+  if (!checkpoint_control.inited)
+  {
+    /*
+      If ha_maria failed to start, maria_panic_hton is called, we come here.
+    */
+    DBUG_RETURN(0);
+  }
+  DBUG_ASSERT(level > CHECKPOINT_NONE);
+
+  /* look for already running checkpoints */
+  pthread_mutex_lock(&LOCK_checkpoint);
+  while (checkpoint_in_progress != CHECKPOINT_NONE)
+  {
+    if (no_wait && (checkpoint_in_progress >= level))
+    {
+      /*
+        If we are the checkpoint background thread, we don't wait (it's
+        smarter to flush pages instead of waiting here while the other thread
+        finishes its checkpoint).
+      */
+      pthread_mutex_unlock(&LOCK_checkpoint);
+      goto end;
+    }
+    /* wait for the running checkpoint to finish, then re-check */
+    pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
+  }
+
+  checkpoint_in_progress= level;
+  pthread_mutex_unlock(&LOCK_checkpoint);
+  /* from then on, we are sure to be and stay the only checkpointer */
+
+  result= really_execute_checkpoint();
+  /* checkpoint_in_progress was reset under the mutex inside; wake waiters */
+  pthread_cond_broadcast(&COND_checkpoint);
+end:
+  DBUG_RETURN(result);
+}
+
+
+/**
+   @brief Does a checkpoint, really; expects no other checkpoints
+   running.
+
+   Checkpoint level requested is read from checkpoint_in_progress.
+
+   @return Operation status
+    @retval 0   ok
+    @retval !=0 error
+*/
+
+static int really_execute_checkpoint(void)
+{
+  uint i, error= 0;
+  /** @brief checkpoint_start_log_horizon will be stored there */
+  char *ptr;
+  LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */
+  LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
+  TRANSLOG_ADDRESS checkpoint_start_log_horizon;
+  char checkpoint_start_log_horizon_char[LSN_STORE_SIZE];
+  DBUG_ENTER("really_execute_checkpoint");
+  DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress));
+  /* so the cleanup at 'end:' can safely my_free() all pieces */
+  bzero(&record_pieces, sizeof(record_pieces));
+
+  /*
+    STEP 1: record current end-of-log position using log's lock. It is
+    critical for the correctness of Checkpoint (related to memory visibility
+    rules, the log's lock is a mutex).
+    "Horizon" is a lower bound of the LSN of the next log record.
+  */
+  checkpoint_start_log_horizon= translog_get_horizon();
+  DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)",
+                     LSN_IN_PARTS(checkpoint_start_log_horizon)));
+  lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);
+
+  /*
+    STEP 2: fetch information about transactions.
+    We must fetch transactions before dirty pages. Indeed, a transaction
+    first sets its rec_lsn then sets the page's rec_lsn then sets its rec_lsn
+    to 0. If we fetched pages first, we may see no dirty page yet, then we
+    fetch transactions but the transaction has already reset its rec_lsn to 0
+    so we miss rec_lsn again.
+    For a similar reason (over-allocated bitmap pages) we have to fetch
+    transactions before flushing bitmap pages.
+
+    min_trn_rec_lsn will serve to lower the starting point of the REDO phase
+    (down from checkpoint_start_log_horizon).
+  */
+  if (unlikely(trnman_collect_transactions(&record_pieces[0],
+                                           &record_pieces[1],
+                                           &min_trn_rec_lsn,
+                                           &min_first_undo_lsn)))
+    goto err;
+
+
+  /* STEP 3: fetch information about table files */
+  if (unlikely(collect_tables(&record_pieces[2],
+                              checkpoint_start_log_horizon)))
+    goto err;
+
+
+  /* STEP 4: fetch information about dirty pages */
+  /*
+    It's better to do it _after_ having flushed some data pages (which
+    collect_tables() may have done), because those are now non-dirty and so we
+    have a more up-to-date dirty pages list to put into the checkpoint record,
+    and thus we will have less work at Recovery.
+  */
+  /* Using default pagecache for now */
+  if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
+                                                         &record_pieces[3],
+                                                         &min_page_rec_lsn)))
+    goto err;
+
+
+  /* LAST STEP: now write the checkpoint log record */
+  {
+    LSN lsn;
+    translog_size_t total_rec_length;
+    /*
+      the log handler is allowed to modify "str" and "length" (but not "*str")
+      of its argument, so we must not pass it record_pieces directly,
+      otherwise we would later not know what memory pieces to my_free().
+    */
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 5];
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
+      (uchar*) checkpoint_start_log_horizon_char;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length=
+      sizeof(checkpoint_start_log_horizon_char);
+    for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
+    {
+      log_array[TRANSLOG_INTERNAL_PARTS + 1 + i]=
+        *(LEX_CUSTRING *)&record_pieces[i];
+      total_rec_length+= (translog_size_t) record_pieces[i].length;
+    }
+    if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT,
+                                       &dummy_transaction_object, NULL,
+                                       total_rec_length,
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, NULL, NULL) ||
+                 translog_flush(lsn)))
+      goto err;
+    translog_lock();
+    /*
+      This cannot be done as a inwrite_rec_hook of LOGREC_CHECKPOINT, because
+      such hook would be called before translog_flush (and we must be sure
+      that log was flushed before we write to the control file).
+    */
+    if (unlikely(ma_control_file_write_and_force(lsn, last_logno,
+                                                 max_trid_in_control_file,
+                                                 recovery_failures)))
+    {
+      translog_unlock();
+      goto err;
+    }
+    translog_unlock();
+  }
+
+  /*
+    Note that we should not alter memory structures until we have successfully
+    written the checkpoint record and control file.
+  */
+  /* checkpoint succeeded */
+  /* first 4 bytes of the dirty-pages piece hold the number of pages */
+  ptr= record_pieces[3].str;
+  pages_to_flush_before_next_checkpoint= uint4korr(ptr);
+  DBUG_PRINT("checkpoint",("%u pages to flush before next checkpoint",
+                           (uint)pages_to_flush_before_next_checkpoint));
+
+  /* compute log's low-water mark */
+  {
+    TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn;
+    set_if_smaller(log_low_water_mark, min_trn_rec_lsn);
+    set_if_smaller(log_low_water_mark, min_first_undo_lsn);
+    set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon);
+    /**
+       Now purge unneeded logs.
+       As some systems have an unreliable fsync (drive lying), we could try to
+       be robust against that: remember a few previous checkpoints in the
+       control file, and not purge logs immediately... Think about it.
+    */
+    if (translog_purge(log_low_water_mark))
+      ma_message_no_user(0, "log purging failed");
+  }
+
+  goto end;
+
+err:
+  error= 1;
+  ma_message_no_user(0, "checkpoint failed");
+  /* we were possibly not able to determine what pages to flush */
+  pages_to_flush_before_next_checkpoint= 0;
+
+end:
+  /* free the malloc-ed pieces collected above (NULL-safe via bzero) */
+  for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
+    my_free(record_pieces[i].str, MYF(MY_ALLOW_ZERO_PTR));
+  pthread_mutex_lock(&LOCK_checkpoint);
+  checkpoint_in_progress= CHECKPOINT_NONE;
+  checkpoints_total++;
+  checkpoints_ok_total+= !error;
+  pthread_mutex_unlock(&LOCK_checkpoint);
+  DBUG_RETURN(error);
+}
+
+
+/**
+   @brief Initializes the checkpoint module
+
+   @param  interval           If one wants the module to create a
+                              thread which will periodically do
+                              checkpoints, and flush dirty pages, in the
+                              background, it should specify a non-zero
+                              interval in seconds. The thread will then be
+                              created and will take checkpoints separated by
+                              approximately 'interval' second.
+
+   @note A checkpoint is taken only if there has been some significant
+   activity since the previous checkpoint. Between checkpoint N and N+1 the
+   thread flushes all dirty pages which were already dirty at the time of
+   checkpoint N.
+
+   @return Operation status
+    @retval 0   ok
+    @retval !=0 error (service-thread init or pthread_create failure)
+*/
+
+int ma_checkpoint_init(ulong interval)
+{
+  pthread_t th;
+  int res= 0;
+  DBUG_ENTER("ma_checkpoint_init");
+  if (ma_service_thread_control_init(&checkpoint_control))
+    res= 1;
+  else if (interval > 0)
+  {
+    /* the interval is smuggled to the thread through the void* argument */
+    compile_time_assert(sizeof(void *) >= sizeof(ulong));
+    if (!(res= pthread_create(&th, NULL, ma_checkpoint_background,
+                              (void *)interval)))
+    {
+      /* thread lives, will have to be killed */
+      checkpoint_control.status= THREAD_RUNNING;
+    }
+  }
+  DBUG_RETURN(res);
+}
+
+
+#ifndef DBUG_OFF
+/**
+   Function used to test recovery: flush some table pieces and then caller
+   crashes.
+
+   @param what_to_flush   0: current bitmap and all data pages
+                          1: state
+                          2: all bitmap pages
+
+   @note Debug-only (compiled under !DBUG_OFF); iterates all open
+   transactional tables under THR_LOCK_maria.
+*/
+static void flush_all_tables(int what_to_flush)
+{
+  int res= 0;
+  LIST *pos; /**< to iterate over open tables */
+  pthread_mutex_lock(&THR_LOCK_maria);
+  for (pos= maria_open_list; pos; pos= pos->next)
+  {
+    MARIA_HA *info= (MARIA_HA*)pos->data;
+    if (info->s->now_transactional)
+    {
+      switch (what_to_flush)
+      {
+      case 0:
+        res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                                   FLUSH_KEEP, FLUSH_KEEP);
+        break;
+      case 1:
+        res= _ma_state_info_write(info->s,
+                                  MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
+                                  MA_STATE_INFO_WRITE_LOCK);
+        DBUG_PRINT("maria_flush_states",
+                   ("is_of_horizon: LSN (%lu,0x%lx)",
+                    LSN_IN_PARTS(info->s->state.is_of_horizon)));
+        break;
+      case 2:
+        res= _ma_bitmap_flush_all(info->s);
+        break;
+      }
+    }
+    /* test helper: any flush failure is fatal to the test */
+    DBUG_ASSERT(res == 0);
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria);
+}
+#endif
+
+
+/**
+   @brief Destroys the checkpoint module
+
+   Stops the background thread (if any) and frees the background-flush
+   file lists.  Also hosts several DBUG-triggered intentional-crash hooks
+   used by recovery tests.
+*/
+
+void ma_checkpoint_end(void)
+{
+  DBUG_ENTER("ma_checkpoint_end");
+  /*
+    Some intentional crash methods, usually triggered by
+    SET MARIA_CHECKPOINT_INTERVAL=X
+  */
+  DBUG_EXECUTE_IF("maria_flush_bitmap",
+                  {
+                    DBUG_PRINT("maria_flush_bitmap", ("now"));
+                    flush_all_tables(2);
+                  });
+  DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
+                  {
+                    DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
+                    flush_all_tables(0);
+                  });
+  DBUG_EXECUTE_IF("maria_flush_whole_log",
+                  {
+                    DBUG_PRINT("maria_flush_whole_log", ("now"));
+                    translog_flush(translog_get_horizon());
+                  });
+  /*
+    Note that for WAL reasons, maria_flush_states requires
+    maria_flush_whole_log.
+  */
+  DBUG_EXECUTE_IF("maria_flush_states",
+                  {
+                    DBUG_PRINT("maria_flush_states", ("now"));
+                    flush_all_tables(1);
+                  });
+  DBUG_EXECUTE_IF("maria_crash",
+                  { DBUG_PRINT("maria_crash", ("now")); DBUG_ABORT(); });
+
+  if (checkpoint_control.inited)
+  {
+    ma_service_thread_control_end(&checkpoint_control);
+    /* free the background-flush file lists built by the checkpoint thread */
+    my_free((uchar *)dfiles, MYF(MY_ALLOW_ZERO_PTR));
+    my_free((uchar *)kfiles, MYF(MY_ALLOW_ZERO_PTR));
+    dfiles= kfiles= NULL;
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   @brief dirty-page filtering criteria for MEDIUM checkpoint.
+
+   We flush data/index pages which have been dirty since the previous
+   checkpoint (this is the two-checkpoint rule: the REDO phase will not have
+   to start from earlier than the next-to-last checkpoint).
+   Bitmap pages are handled by _ma_bitmap_flush_all().
+
+   @param  type     Page's type
+   @param  pageno   Page's number
+   @param  rec_lsn  Page's rec_lsn
+   @param  arg      filter_param
+*/
+
+static enum pagecache_flush_filter_result
+filter_flush_file_medium(enum pagecache_page_type type,
+                         pgcache_page_no_t pageno __attribute__ ((unused)),
+                         LSN rec_lsn, void *arg)
+{
+  struct st_filter_param *filter= (struct st_filter_param *)arg;
+
+  if (type != PAGECACHE_LSN_PAGE)
+    return 0;                           /* non-transactional page: skip */
+  /* flush only pages dirtied no later than the previous checkpoint */
+  return cmp_translog_addr(rec_lsn, filter->up_to_lsn) <= 0;
+}
+
+
+/**
+   @brief dirty-page filtering criteria for FULL checkpoint.
+
+   We flush all dirty data/index pages.
+   Bitmap pages are handled by _ma_bitmap_flush_all().
+
+   @param  type     Page's type
+   @param  pageno   Page's number
+   @param  rec_lsn  Page's rec_lsn
+   @param  arg      filter_param
+*/
+
+static enum pagecache_flush_filter_result
+filter_flush_file_full(enum pagecache_page_type type,
+                       pgcache_page_no_t pageno __attribute__ ((unused)),
+                       LSN rec_lsn __attribute__ ((unused)),
+                       void *arg __attribute__ ((unused)))
+{
+  /* every dirty transactional page qualifies */
+  return type == PAGECACHE_LSN_PAGE;
+}
+
+
+/**
+ @brief dirty-page filtering criteria for background flushing thread.
+
+ We flush data/index pages which have been dirty since the previous
+ checkpoint (this is the two-checkpoint rule: the REDO phase will not have
+ to start from earlier than the next-to-last checkpoint), and no
+ bitmap pages. But we flush no more than a certain number of pages (to have
+ an even flushing, no write burst).
+ The reason to not flush bitmap pages is that they may not be in a flushable
+ state at this moment and we don't want to wait for them.
+
+ @param type Page's type
+ @param pageno Page's number
+ @param rec_lsn Page's rec_lsn
+ @param arg filter_param
+*/
+
+static enum pagecache_flush_filter_result
+filter_flush_file_evenly(enum pagecache_page_type type,
+ pgcache_page_no_t pageno __attribute__ ((unused)),
+ LSN rec_lsn, void *arg)
+{
+ struct st_filter_param *param= (struct st_filter_param *)arg;
+ /* quota for this flush bunch exhausted: stop scanning this file at once */
+ if (unlikely(param->max_pages == 0)) /* all flushed already */
+ return FLUSH_FILTER_SKIP_ALL;
+ /* same criterion as the MEDIUM filter, but debit the per-bunch quota */
+ if ((type == PAGECACHE_LSN_PAGE) &&
+ (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0))
+ {
+ param->max_pages--;
+ return FLUSH_FILTER_OK;
+ }
+ /* page not eligible (non-LSN or dirtied after the checkpoint): skip it
+ but keep scanning the rest of the file */
+ return FLUSH_FILTER_SKIP_TRY_NEXT;
+}
+
+
+/**
+ @brief Background thread which does checkpoints and flushes periodically.
+
+ Takes a checkpoint. After this, all pages dirty at the time of that
+ checkpoint are flushed evenly until it is time to take another checkpoint.
+ This ensures that the REDO phase starts at earliest (in LSN time) at the
+ next-to-last checkpoint record ("two-checkpoint rule").
+
+ @note MikaelR questioned why the same thread does two different jobs, the
+ risk could be that while a checkpoint happens no LRD flushing happens.
+*/
+
+pthread_handler_t ma_checkpoint_background(void *arg)
+{
+ /** @brief At least this of log/page bytes written between checkpoints */
+ const uint checkpoint_min_activity= 2*1024*1024;
+ /*
+ If the interval could be changed by the user while we are in this thread,
+ it could be annoying: for example it could cause "case 2" to be executed
+ right after "case 0", thus having 'dfile' unset. So the thread cares only
+ about the interval's value when it started.
+ */
+ /*
+ NOTE(review): the interval is smuggled in the pointer value itself, not
+ pointed-to storage; (ulong)arg truncates on LLP64 platforms (Windows,
+ where long is 32-bit) - presumably the caller only ever packs a small
+ value, confirm at the thread-creation site.
+ */
+ const ulong interval= (ulong)arg;
+ uint sleeps, sleep_time;
+ TRANSLOG_ADDRESS log_horizon_at_last_checkpoint=
+ translog_get_horizon();
+ ulonglong pagecache_flushes_at_last_checkpoint=
+ maria_pagecache->global_cache_write;
+ uint pages_bunch_size;
+ struct st_filter_param filter_param;
+ PAGECACHE_FILE *dfile; /**< data file currently being flushed */
+ PAGECACHE_FILE *kfile; /**< index file currently being flushed */
+ LINT_INIT(kfile);
+ LINT_INIT(dfile);
+ LINT_INIT(pages_bunch_size);
+
+ my_thread_init();
+ DBUG_PRINT("info",("Maria background checkpoint thread starts"));
+ DBUG_ASSERT(interval > 0);
+
+ /*
+ Recovery ended with all tables closed and a checkpoint: no need to take
+ one immediately.
+ */
+ sleeps= 1;
+ pages_to_flush_before_next_checkpoint= 0;
+
+ for(;;) /* iterations of checkpoints and dirty page flushing */
+ {
+#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
+ sleeps=0;
+#endif
+ /*
+ Schedule within each interval: second 0 takes a checkpoint (or skips
+ it if there was little activity), second 1 sets up the background
+ flush, and every second flushes one evenly-sized bunch of pages.
+ */
+ switch (sleeps % interval)
+ {
+ case 0:
+ /*
+ With background flushing evenly distributed over the time
+ between two checkpoints, we should have only little flushing to do
+ in the checkpoint.
+ */
+ /*
+ No checkpoint if little work of interest for recovery was done
+ since last checkpoint. Such work includes log writing (lengthens
+ recovery, checkpoint would shorten it), page flushing (checkpoint
+ would decrease the amount of read pages in recovery).
+ In case of one short statement per minute (very low load), we don't
+ want to checkpoint every minute, hence the positive
+ checkpoint_min_activity.
+ */
+ if (((translog_get_horizon() - log_horizon_at_last_checkpoint) +
+ (maria_pagecache->global_cache_write -
+ pagecache_flushes_at_last_checkpoint) *
+ maria_pagecache->block_size) < checkpoint_min_activity)
+ {
+ /* don't take checkpoint, so don't know what to flush */
+ pages_to_flush_before_next_checkpoint= 0;
+ sleep_time= interval;
+ break;
+ }
+ sleep_time= 1;
+ ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
+ /*
+ Snapshot this kind of "state" of the engine. Note that the value below
+ is possibly greater than last_checkpoint_lsn.
+ */
+ log_horizon_at_last_checkpoint= translog_get_horizon();
+ pagecache_flushes_at_last_checkpoint=
+ maria_pagecache->global_cache_write;
+ /*
+ If the checkpoint above succeeded it has set d|kfiles and
+ d|kfiles_end. If it has failed, it has set
+ pages_to_flush_before_next_checkpoint to 0 so we will skip flushing
+ and sleep until the next checkpoint.
+ */
+ break;
+ case 1:
+ /* set up parameters for background page flushing */
+ filter_param.up_to_lsn= last_checkpoint_lsn;
+ pages_bunch_size= pages_to_flush_before_next_checkpoint / interval;
+ dfile= dfiles;
+ kfile= kfiles;
+ /* fall through */
+ default:
+ if (pages_bunch_size > 0)
+ {
+ DBUG_PRINT("checkpoint",
+ ("Maria background checkpoint thread: %u pages",
+ pages_bunch_size));
+ /* flush a bunch of dirty pages */
+ filter_param.max_pages= pages_bunch_size;
+ while (dfile != dfiles_end)
+ {
+ /*
+ We use FLUSH_KEEP_LAZY: if a file is already in flush, it's
+ smarter to move to the next file than wait for this one to be
+ completely flushed, which may take long.
+ StaleFilePointersInFlush: notice how below we use "dfile" which
+ is an OS file descriptor plus some function and MARIA_SHARE
+ pointers; this data dates from a previous checkpoint; since then,
+ the table may have been closed (so MARIA_SHARE* became stale), and
+ the file descriptor reassigned to another table which does not
+ have the same CRC-read-set callbacks: it is thus important that
+ flush_pagecache_blocks_with_filter() does not use the pointers,
+ only the OS file descriptor.
+ */
+ int res=
+ flush_pagecache_blocks_with_filter(maria_pagecache,
+ dfile, FLUSH_KEEP_LAZY,
+ filter_flush_file_evenly,
+ &filter_param);
+ if (unlikely(res & PCFLUSH_ERROR))
+ ma_message_no_user(0, "background data page flush failed");
+ if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
+ break; /* and we will continue with the same file */
+ dfile++; /* otherwise all this file is flushed, move to next file */
+ /*
+ MikaelR noted that he observed that Linux's file cache may never
+ fsync to disk until this cache is full, at which point it decides
+ to empty the cache, making the machine very slow. A solution was
+ to fsync after writing 2 MB. So we might want to fsync() here if
+ we wrote enough pages.
+ */
+ }
+ /* same quota-driven loop for the index files of the same tables */
+ while (kfile != kfiles_end)
+ {
+ int res=
+ flush_pagecache_blocks_with_filter(maria_pagecache,
+ kfile, FLUSH_KEEP_LAZY,
+ filter_flush_file_evenly,
+ &filter_param);
+ if (unlikely(res & PCFLUSH_ERROR))
+ ma_message_no_user(0, "background index page flush failed");
+ if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
+ break; /* and we will continue with the same file */
+ kfile++; /* otherwise all this file is flushed, move to next file */
+ }
+ sleep_time= 1;
+ }
+ else
+ {
+ /* Can directly sleep until the next checkpoint moment */
+ sleep_time= interval - (sleeps % interval);
+ }
+ }
+ /*
+ Sleep 'sleep_time' seconds; a non-zero return presumably means the
+ service-thread control asked us to stop - we then leave the loop and
+ take the final shutdown checkpoint below.
+ */
+ if (my_service_thread_sleep(&checkpoint_control,
+ sleep_time * 1000000000ULL))
+ break;
+ sleeps+= sleep_time;
+ }
+ DBUG_PRINT("info",("Maria background checkpoint thread ends"));
+ {
+ CHECKPOINT_LEVEL level= CHECKPOINT_FULL;
+ /*
+ That's the final one, which guarantees that a clean shutdown always ends
+ with a checkpoint.
+ */
+ DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;);
+ ma_checkpoint_execute(level, FALSE);
+ }
+ my_service_thread_signal_end(&checkpoint_control);
+ my_thread_end();
+ return 0;
+}
+
+
+/**
+ @brief Allocates buffer and stores in it some info about open tables,
+ does some flushing on those.
+
+ Does the allocation because the caller cannot know the size itself.
+ Memory freeing is to be done by the caller (if the "str" member of the
+ LEX_STRING is not NULL).
+ The caller is taking a checkpoint.
+
+ @param[out] str pointer to where the allocated buffer,
+ and its size, will be put; buffer will be filled
+ with info about open tables
+ @param checkpoint_start_log_horizon Of the in-progress checkpoint
+ record.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
+{
+ MARIA_SHARE **distinct_shares= NULL;
+ char *ptr;
+ uint error= 1, sync_error= 0, nb, nb_stored, i;
+ my_bool unmark_tables= TRUE;
+ uint total_names_length;
+ LIST *pos; /**< to iterate over open tables */
+ struct st_state_copy {
+ uint index;
+ MARIA_STATE_INFO state;
+ };
+ struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */
+ *state_copies_end, /**< cache ends here */
+ *state_copy; /**< iterator in cache */
+ TRANSLOG_ADDRESS state_copies_horizon; /**< horizon of states' _copies_ */
+ struct st_filter_param filter_param;
+ PAGECACHE_FLUSH_FILTER filter;
+ DBUG_ENTER("collect_tables");
+
+ LINT_INIT(state_copies_horizon);
+ /* let's make a list of distinct shares */
+ pthread_mutex_lock(&THR_LOCK_maria);
+ for (nb= 0, pos= maria_open_list; pos; pos= pos->next)
+ {
+ MARIA_HA *info= (MARIA_HA*)pos->data;
+ MARIA_SHARE *share= info->s;
+ /* the first three variables below can never change */
+ if (share->base.born_transactional && !share->temporary &&
+ share->mode != O_RDONLY &&
+ !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP))
+ {
+ /*
+ Apart from us, only maria_close() reads/sets in_checkpoint but cannot
+ run now as we hold THR_LOCK_maria.
+ */
+ /*
+ This table is relevant for checkpoint and not already seen. Mark it,
+ so that it is not seen again in the loop.
+ */
+ nb++;
+ DBUG_ASSERT(share->in_checkpoint == 0);
+ /* This flag ensures that we count only _distinct_ shares. */
+ share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP;
+ }
+ }
+ /*
+ NOTE(review): on this malloc failure we goto err with unmark_tables=TRUE
+ and nb possibly > 0, but distinct_shares is still NULL, so the unmarking
+ loop at err: would dereference NULL (and the shares above stay marked
+ MARIA_CHECKPOINT_SEEN_IN_LOOP). Worth fixing in a follow-up patch.
+ Also note we still hold THR_LOCK_maria on this path - verify the err:
+ path's re-lock of THR_LOCK_maria does not self-deadlock here.
+ */
+ if (unlikely((distinct_shares=
+ (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *),
+ MYF(MY_WME))) == NULL))
+ goto err;
+ for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next)
+ {
+ MARIA_HA *info= (MARIA_HA*)pos->data;
+ MARIA_SHARE *share= info->s;
+ if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)
+ {
+ distinct_shares[i++]= share;
+ /*
+ With this we prevent the share from going away while we later flush
+ and force it without holding THR_LOCK_maria. For example if the share
+ could be my_free()d by maria_close() we would have a problem when we
+ access it to flush the table. We "pin" the share pointer.
+ And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is
+ not seen again in the loop.
+ */
+ share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME;
+ total_names_length+= share->open_file_name.length;
+ }
+ }
+
+ DBUG_ASSERT(i == nb);
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ DBUG_PRINT("info",("found %u table shares", nb));
+
+ /* upper bound of the record size: per-table fixed part + name bytes */
+ str->length=
+ 4 + /* number of tables */
+ (2 + /* short id */
+ LSN_STORE_SIZE + /* first_log_write_at_lsn */
+ 1 /* end-of-name 0 */
+ ) * nb + total_names_length;
+ if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL))
+ goto err;
+
+ ptr= str->str;
+ ptr+= 4; /* real number of stored tables is not yet known */
+
+ /* only possible checkpointer, so can do the read below without mutex */
+ filter_param.up_to_lsn= last_checkpoint_lsn;
+ switch(checkpoint_in_progress)
+ {
+ case CHECKPOINT_MEDIUM:
+ filter= &filter_flush_file_medium;
+ break;
+ case CHECKPOINT_FULL:
+ filter= &filter_flush_file_full;
+ break;
+ case CHECKPOINT_INDIRECT:
+ filter= NULL;
+ break;
+ default:
+ DBUG_ASSERT(0);
+ goto err;
+ }
+
+ /*
+ The principle of reading/writing the state below is explained in
+ ma_recovery.c, look for "Recovery of the state".
+ */
+#define STATE_COPIES 1024
+ state_copies= (struct st_state_copy *)
+ my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
+ dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles,
+ /* avoid size of 0 for my_realloc */
+ max(1, nb) * sizeof(PAGECACHE_FILE),
+ MYF(MY_WME | MY_ALLOW_ZERO_PTR));
+ kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles,
+ /* avoid size of 0 for my_realloc */
+ max(1, nb) * sizeof(PAGECACHE_FILE),
+ MYF(MY_WME | MY_ALLOW_ZERO_PTR));
+ if (unlikely((state_copies == NULL) ||
+ (dfiles == NULL) || (kfiles == NULL)))
+ goto err;
+ state_copy= state_copies_end= NULL;
+ dfiles_end= dfiles;
+ kfiles_end= kfiles;
+
+ for (nb_stored= 0, i= 0; i < nb; i++)
+ {
+ MARIA_SHARE *share= distinct_shares[i];
+ PAGECACHE_FILE kfile, dfile;
+ my_bool ignore_share;
+ if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
+ {
+ /*
+ No need for a mutex to read the above, only us can write *this* bit of
+ the in_checkpoint bitmap
+ */
+ continue;
+ }
+ /**
+ @todo We should not look at tables which didn't change since last
+ checkpoint.
+ */
+ DBUG_PRINT("info",("looking at table '%s'", share->open_file_name.str));
+ if (state_copy == state_copies_end) /* we have no more cached states */
+ {
+ /*
+ Collect and cache a bunch of states. We do this for many states at a
+ time, to not lock/unlock the log's lock too often.
+ */
+ uint j, bound= min(nb, i + STATE_COPIES);
+ state_copy= state_copies;
+ /* part of the state is protected by log's lock */
+ translog_lock();
+ state_copies_horizon= translog_get_horizon_no_lock();
+ for (j= i; j < bound; j++)
+ {
+ MARIA_SHARE *share2= distinct_shares[j];
+ if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
+ continue;
+ state_copy->index= j;
+ state_copy->state= share2->state; /* we copy the state */
+ state_copy++;
+ /*
+ data_file_length is not updated under log's lock by the bitmap
+ code, but writing a wrong data_file_length is ok: a next
+ maria_close() will correct it; if we crash before, Recovery will
+ set it to the true physical size.
+ */
+ }
+ translog_unlock();
+ /**
+ We are going to flush these states.
+ Before, all records describing how to undo such state must be
+ in the log (WAL). Usually this means UNDOs. In the special case of
+ data|key_file_length, recovery just needs to open the table to fix the
+ length, so any LOGREC_FILE_ID/REDO/UNDO allowing recovery to
+ understand it must open a table, is enough; so as long as
+ data|key_file_length is updated after writing any log record it's ok:
+ if we copied new value above, it means the record was before
+ state_copies_horizon and we flush such record below.
+ Apart from data|key_file_length which are easily recoverable from the
+ real file's size, all other state members must be updated only when
+ writing the UNDO; otherwise, if updated before, if their new value is
+ flushed by a checkpoint and there is a crash before UNDO is written,
+ their REDO group will be missing or at least incomplete and skipped
+ by recovery, so bad state value will stay. For example, setting
+ key_root before writing the UNDO: the table would have old index
+ pages (they were pinned at time of crash) and a new, thus wrong,
+ key_root.
+ @todo RECOVERY BUG check that all code honours that.
+ */
+ if (translog_flush(state_copies_horizon))
+ goto err;
+ /* now we have cached states and they are WAL-safe */
+ state_copies_end= state_copy;
+ state_copy= state_copies;
+ }
+
+ /* locate our state among these cached ones */
+ for ( ; state_copy->index != i; state_copy++)
+ DBUG_ASSERT(state_copy < state_copies_end);
+
+ /* OS file descriptors are ints which we stored in 4 bytes */
+ compile_time_assert(sizeof(int) <= 4);
+ /*
+ Protect against maria_close() (which does some memory freeing in
+ MARIA_FILE_BITMAP) with close_lock. intern_lock is not
+ sufficient as we, as well as maria_close(), are going to unlock
+ intern_lock in the middle of manipulating the table. Serializing us and
+ maria_close() should help avoid problems.
+ */
+ pthread_mutex_lock(&share->close_lock);
+ pthread_mutex_lock(&share->intern_lock);
+ /*
+ Tables in a normal state have their two file descriptors open.
+ In some rare cases like REPAIR, some descriptor may be closed or even
+ -1. If that happened, the _ma_state_info_write() may fail. This is
+ prevented by enclosing all places which close/change kfile.file with
+ intern_lock.
+ */
+ kfile= share->kfile;
+ dfile= share->bitmap.file;
+ /*
+ Ignore table which has no logged writes (all its future log records will
+ be found naturally by Recovery). Ignore obsolete shares (_before_
+ setting themselves to last_version=0 they already did all flush and
+ sync; if we flush their state now we may be flushing an obsolete state
+ onto a newer one (assuming the table has been reopened with a different
+ share but of course same physical index file).
+ */
+ /* bitwise | (not ||): both operands are cheap 0/1 flags, no short-circuit
+ needed - presumably intentional, keep as is */
+ ignore_share= (share->id == 0) | (share->last_version == 0);
+ DBUG_PRINT("info", ("ignore_share: %d", ignore_share));
+ if (!ignore_share)
+ {
+ uint open_file_name_len= share->open_file_name.length + 1;
+ /* remember the descriptors for background flush */
+ *(dfiles_end++)= dfile;
+ *(kfiles_end++)= kfile;
+ /* we will store this table in the record */
+ nb_stored++;
+ int2store(ptr, share->id);
+ ptr+= 2;
+ lsn_store(ptr, share->lsn_of_file_id);
+ ptr+= LSN_STORE_SIZE;
+ /*
+ first_bitmap_with_space is not updated under log's lock, and is
+ important. We would need the bitmap's lock to get it right. Recovery
+ of this is not clear, so we just play safe: write it out as
+ unknown: if crash, _ma_bitmap_init() at next open (for example in
+ Recovery) will convert it to 0 and thus the first insertion will
+ search for free space from the file's first bitmap (0) -
+ under-optimal but safe.
+ If no crash, maria_close() will write the exact value.
+ */
+ state_copy->state.first_bitmap_with_space= ~(ulonglong)0;
+ memcpy(ptr, share->open_file_name.str, open_file_name_len);
+ ptr+= open_file_name_len;
+ if (cmp_translog_addr(share->state.is_of_horizon,
+ checkpoint_start_log_horizon) >= 0)
+ {
+ /*
+ State was flushed recently, it does not hold down the log's
+ low-water mark and will not give avoidable work to Recovery. So we
+ needn't flush it. Also, it is possible that while we copied the
+ state above (under log's lock, without intern_lock) it was being
+ modified in memory or flushed to disk (without log's lock, under
+ intern_lock, like in maria_extra()), so our copy may be incorrect
+ and we should not flush it.
+ It may also be a share which got last_version==0 since we checked
+ last_version; in this case, it flushed its state and the LSN test
+ above will catch it.
+ */
+ }
+ else
+ {
+ /*
+ We could do the state flush only if share->changed, but it's
+ tricky.
+ Consider a maria_write() which has written REDO,UNDO, and before it
+ calls _ma_writeinfo() (setting share->changed=1), checkpoint
+ happens and sees share->changed=0, does not flush state. It is
+ possible that Recovery does not start from before the REDO and thus
+ the state is not recovered. A solution may be to set
+ share->changed=1 under log mutex when writing log records.
+ But as anyway we have another problem below, this optimization would
+ be of little use.
+ */
+ /** @todo flush state only if changed since last checkpoint */
+ DBUG_ASSERT(share->last_version != 0);
+ state_copy->state.is_of_horizon= share->state.is_of_horizon=
+ state_copies_horizon;
+ if (kfile.file >= 0)
+ sync_error|=
+ _ma_state_info_write_sub(kfile.file, &state_copy->state,
+ MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
+ /*
+ We don't set share->changed=0 because it may interfere with a
+ concurrent _ma_writeinfo() doing share->changed=1 (cancel its
+ effect). The sad consequence is that we will flush the same state at
+ each checkpoint if the table was once written and then not anymore.
+ */
+ }
+ }
+ /*
+ _ma_bitmap_flush_all() may wait, so don't keep intern_lock as
+ otherwise this would deadlock with allocate_and_write_block_record()
+ calling _ma_set_share_data_file_length()
+ */
+ pthread_mutex_unlock(&share->intern_lock);
+
+ if (!ignore_share)
+ {
+ /*
+ share->bitmap is valid because it's destroyed under close_lock which
+ we hold.
+ */
+ if (_ma_bitmap_flush_all(share))
+ {
+ sync_error= 1;
+ /** @todo all write failures should mark table corrupted */
+ ma_message_no_user(0, "checkpoint bitmap page flush failed");
+ }
+ DBUG_ASSERT(share->pagecache == maria_pagecache);
+ }
+ /*
+ Clean up any unused states.
+ TODO: Only do this call if there has been # (10?) ended transactions
+ since last call.
+ We had to release intern_lock to respect lock order with LOCK_trn_list.
+ */
+ _ma_remove_not_visible_states_with_lock(share, FALSE);
+
+ if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
+ {
+ /*
+ maria_close() left us to free the share. When it ran it set share->id
+ to 0. As it ran before we locked close_lock, we should have seen this
+ and so this assertion should be true:
+ */
+ DBUG_ASSERT(ignore_share);
+ pthread_mutex_destroy(&share->intern_lock);
+ pthread_mutex_unlock(&share->close_lock);
+ pthread_mutex_destroy(&share->close_lock);
+ my_free((uchar *)share, MYF(0));
+ }
+ else
+ {
+ /* share goes back to normal state */
+ share->in_checkpoint= 0;
+ pthread_mutex_unlock(&share->close_lock);
+ }
+
+ /*
+ We do the big disk writes out of intern_lock to not block other
+ users of this table (intern_lock is taken at the start and end of
+ every statement). This means that file descriptors may be invalid
+ (files may have been closed for example by HA_EXTRA_PREPARE_FOR_*
+ under Windows, or REPAIR). This should not be a problem as we use
+ MY_IGNORE_BADFD. Descriptors may even point to other files but then
+ the old blocks (of before the close) must have been flushed for sure,
+ so our flush will flush new blocks (of after the latest open) and that
+ should do no harm.
+ */
+ /*
+ If CHECKPOINT_MEDIUM, this big flush below may result in a
+ serious write burst. Realize that all pages dirtied between the
+ last checkpoint and the one we are doing now, will be flushed at
+ next checkpoint, except those evicted by LRU eviction (depending on
+ the size of the page cache compared to the size of the working data
+ set, eviction may be rare or frequent).
+ We avoid that burst by anticipating: those pages are flushed
+ in bunches spanned regularly over the time interval between now and
+ the next checkpoint, by a background thread. Thus the next checkpoint
+ will have only little flushing to do (CHECKPOINT_MEDIUM should thus be
+ only a little slower than CHECKPOINT_INDIRECT).
+ */
+
+ /*
+ PageCacheFlushConcurrencyBugs
+ Inside the page cache, calls to flush_pagecache_blocks_int() on the same
+ file are serialized. Examples of concurrency bugs which happened when we
+ didn't have this serialization:
+ - maria_chk_size() (via CHECK TABLE) happens concurrently with
+ Checkpoint: Checkpoint is flushing a page: it pins the page and is
+ pre-empted, maria_chk_size() wants to flush this page too so gets an
+ error because Checkpoint pinned this page. Such error makes
+ maria_chk_size() mark the table as corrupted.
+ - maria_close() happens concurrently with Checkpoint:
+ Checkpoint is flushing a page: it registers a request on the page, is
+ pre-empted ; maria_close() flushes this page too with FLUSH_RELEASE:
+ FLUSH_RELEASE will cause a free_block() which assumes the page is in the
+ LRU, but it is not (as Checkpoint registered a request). Crash.
+ - one thread is evicting a page of the file out of the LRU: it marks it
+ iPC_BLOCK_IN_SWITCH and is pre-empted. Then two other threads do flushes
+ of the same file concurrently (like above). Then one flusher sees the
+ page is in switch, removes it from changed_blocks[] and puts it in its
+ first_in_switch, so the other flusher will not see the page at all and
+ return too early. If it's maria_close() which returns too early, then
+ maria_close() may close the file descriptor, and the other flusher, and
+ the evicter will fail to write their page: corruption.
+ */
+
+ if (!ignore_share)
+ {
+ if (filter != NULL)
+ {
+ if ((flush_pagecache_blocks_with_filter(maria_pagecache,
+ &dfile, FLUSH_KEEP_LAZY,
+ filter, &filter_param) &
+ PCFLUSH_ERROR))
+ ma_message_no_user(0, "checkpoint data page flush failed");
+ if ((flush_pagecache_blocks_with_filter(maria_pagecache,
+ &kfile, FLUSH_KEEP_LAZY,
+ filter, &filter_param) &
+ PCFLUSH_ERROR))
+ ma_message_no_user(0, "checkpoint index page flush failed");
+ }
+ /*
+ fsyncs the fd, that's the loooong operation (e.g. max 150 fsync
+ per second, so if you have touched 1000 files it's 7 seconds).
+ */
+ sync_error|=
+ my_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) |
+ my_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD));
+ /*
+ in case of error, we continue because writing other tables to disk is
+ still useful.
+ */
+ }
+ }
+
+ if (sync_error)
+ goto err;
+ /* We maybe over-estimated (due to share->id==0 or last_version==0) */
+ DBUG_ASSERT(str->length >= (uint)(ptr - str->str));
+ str->length= (uint)(ptr - str->str);
+ /*
+ As we support max 65k tables open at a time (2-byte short id), we
+ assume uint is enough for the cumulated length of table names; and
+ LEX_STRING::length is uint.
+ */
+ int4store(str->str, nb_stored);
+ error= unmark_tables= 0;
+
+err:
+ if (unlikely(unmark_tables))
+ {
+ /* maria_close() uses THR_LOCK_maria from start to end */
+ pthread_mutex_lock(&THR_LOCK_maria);
+ for (i= 0; i < nb; i++)
+ {
+ MARIA_SHARE *share= distinct_shares[i];
+ if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
+ {
+ /* maria_close() left us to free the share */
+ /*
+ NOTE(review): unlike the success path above, this branch does not
+ destroy share->close_lock before freeing the share - confirm
+ whether that mutex leak is intentional here.
+ */
+ pthread_mutex_destroy(&share->intern_lock);
+ my_free((uchar *)share, MYF(0));
+ }
+ else
+ {
+ /* share goes back to normal state */
+ share->in_checkpoint= 0;
+ }
+ }
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ }
+ my_free((uchar *)distinct_shares, MYF(MY_ALLOW_ZERO_PTR));
+ my_free((uchar *)state_copies, MYF(MY_ALLOW_ZERO_PTR));
+ DBUG_RETURN(error);
+}
diff --git a/storage/maria/ma_checkpoint.h b/storage/maria/ma_checkpoint.h
new file mode 100644
index 00000000000..126f8111a23
--- /dev/null
+++ b/storage/maria/ma_checkpoint.h
@@ -0,0 +1,92 @@
+/* Copyright (C) 2006,2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3071 Maria checkpoint
+ First version written by Guilhem Bichot on 2006-04-27.
+ Does not compile yet.
+*/
+
+/* This is the interface of this module. */
+
+/* Levels are ordered: each level performs at least the work of the lower
+ ones (INDIRECT < MEDIUM < FULL). */
+typedef enum enum_ma_checkpoint_level {
+ CHECKPOINT_NONE= 0,
+ /* just write dirty_pages, transactions table and sync files */
+ CHECKPOINT_INDIRECT,
+ /* also flush all dirty pages which were already dirty at prev checkpoint */
+ CHECKPOINT_MEDIUM,
+ /* also flush all dirty pages */
+ CHECKPOINT_FULL
+} CHECKPOINT_LEVEL;
+
+C_MODE_START
+/* start the checkpoint module; interval: seconds between background
+ checkpoints (0 presumably disables the background thread - confirm in
+ ma_checkpoint_init()'s definition) */
+int ma_checkpoint_init(ulong interval);
+/* stop the background thread and free the module's resources */
+void ma_checkpoint_end(void);
+/* take one checkpoint of the given level; no_wait: don't block if another
+ checkpoint is already running */
+int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait);
+C_MODE_END
+
+/**
+ @brief reads some LSNs with special trickery
+
+ If a 64-bit variable transitions between both halves being zero to both
+ halves being non-zero, and back, this function can be used to do a read of
+ it (without mutex, without atomic load) which always produces a correct
+ (though maybe slightly old) value (even on 32-bit CPUs). The value is at
+ least as new as the latest mutex unlock done by the calling thread.
+ The assumption is that the system sets both 4-byte halves either at the
+ same time, or one after the other (in any order), but NOT some bytes of the
+ first half then some bytes of the second half then the rest of bytes of the
+ first half. With this assumption, the function can detect when it is
+ seeing an inconsistent value.
+
+ @param LSN pointer to the LSN variable to read
+
+ @return LSN part (most significant byte always 0)
+*/
+#if ( SIZEOF_CHARP >= 8 )
+/* 64-bit CPU, 64-bit reads are atomic */
+#define lsn_read_non_atomic LSN_WITH_FLAGS_TO_LSN
+#else
+static inline LSN lsn_read_non_atomic_32(const volatile LSN *x)
+{
+ /*
+ 32-bit CPU, 64-bit reads may give a mixed of old half and new half (old
+ low bits and new high bits, or the contrary).
+ */
+ for (;;) /* loop until no atomicity problems */
+ {
+ /*
+ Remove most significant byte in case this is a LSN_WITH_FLAGS object.
+ Those flags in TRN::first_undo_lsn break the condition on transitions so
+ they must be removed below.
+ */
+ LSN y= LSN_WITH_FLAGS_TO_LSN(*x);
+ /*
+ A torn read mixes a zero half with a non-zero half, which (per the
+ contract in the doxygen above) fails LSN_VALID() and is not
+ LSN_IMPOSSIBLE; so retry until we observe a consistent value.
+ */
+ if (likely((y == LSN_IMPOSSIBLE) || LSN_VALID(y)))
+ return y;
+ }
+}
+#define lsn_read_non_atomic(x) lsn_read_non_atomic_32(&x)
+#endif
+
+/**
+ prints a message from a task not connected to any user (checkpoint
+ and recovery for example).
+
+ @param level 0 if error, ME_JUST_WARNING if warning,
+ ME_JUST_INFO if info
+ @param sentence text to write
+*/
+#define ma_message_no_user(level, sentence) \
+ my_printf_error(HA_ERR_GENERIC, "Aria engine: %s", MYF(level), sentence)
diff --git a/storage/maria/ma_checksum.c b/storage/maria/ma_checksum.c
new file mode 100644
index 00000000000..61ec638053a
--- /dev/null
+++ b/storage/maria/ma_checksum.c
@@ -0,0 +1,89 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Calculate a checksum for a row */
+
+#include "maria_def.h"
+
+/**
+ Calculate a checksum for the record
+
+ _ma_checksum()
+ @param info Maria handler
+ @param record Record
+
+ @note
+ To ensure that the checksum is independent of the row format
+ we need to always calculate the checksum in the original field order.
+
+ @return checksum
+*/
+
+ha_checksum _ma_checksum(MARIA_HA *info, const uchar *record)
+{
+ ha_checksum crc=0;
+ uint i,end;
+ MARIA_COLUMNDEF *base_column= info->s->columndef;
+ uint16 *column_nr= info->s->column_nr;
+
+ /* include the null-bit block first, if the table has nullable columns */
+ if (info->s->base.null_bytes)
+ crc= my_checksum(crc, record, info->s->base.null_bytes);
+
+ /*
+ Walk columns in their original (creation) order via the column_nr
+ mapping, so the checksum does not depend on the on-disk column layout
+ (see function doxygen above).
+ */
+ for (i= 0, end= info->s->base.fields ; i < end ; i++)
+ {
+ MARIA_COLUMNDEF *column= base_column + column_nr[i];
+ const uchar *pos;
+ ulong length;
+
+ /* NOTE(review): presumably null_bit is 0 for NOT NULL columns so this
+ test never skips them - confirm against columndef initialization */
+ if (record[column->null_pos] & column->null_bit)
+ continue; /* Null field */
+
+ pos= record + column->offset;
+ switch (column->type) {
+ case FIELD_BLOB:
+ {
+ /* in-record blob layout: length bytes, then a raw char* to the data */
+ uint blob_size_length= column->length- portable_sizeof_char_ptr;
+ length= _ma_calc_blob_length(blob_size_length, pos);
+ if (length)
+ {
+ /* load the stored blob data pointer into pos (aliasing-safe copy) */
+ memcpy((char*) &pos, pos + blob_size_length, sizeof(char*));
+ crc= my_checksum(crc, pos, length);
+ }
+ continue; /* blob already checksummed; skip the shared call below */
+ }
+ case FIELD_VARCHAR:
+ {
+ /* 1- or 2-byte length prefix, depending on the column's max length */
+ uint pack_length= column->fill_length;
+ if (pack_length == 1)
+ length= (ulong) *pos;
+ else
+ length= uint2korr(pos);
+ pos+= pack_length; /* Skip length information */
+ break;
+ }
+ default:
+ /* fixed-length field: checksum the full declared length */
+ length= column->length;
+ break;
+ }
+ crc= my_checksum(crc, pos, length);
+ }
+ return crc;
+}
+
+
+/**
+ Checksum for the static (fixed-length) row format: every record occupies
+ exactly base.reclength bytes, so the whole record is checksummed as one
+ contiguous chunk.
+*/
+ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *pos)
+{
+ return my_checksum(0, pos, info->s->base.reclength);
+}
diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c
new file mode 100644
index 00000000000..df525d45d14
--- /dev/null
+++ b/storage/maria/ma_close.c
@@ -0,0 +1,208 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Close an ISAM database handle */
+/*
+ TODO:
+ We need to have a separate mutex on the closed file to allow other threads
+ to open other files during the time we flush the cache and close this file
+*/
+
+#include "maria_def.h"
+
+int maria_close(register MARIA_HA *info)
+{
+  int error=0,flag;
+  my_bool share_can_be_freed= FALSE;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("maria_close");
+  DBUG_PRINT("enter",("base: 0x%lx reopen: %u locks: %u",
+                      (long) info, (uint) share->reopen,
+                      (uint) share->tot_locks));
+
+  /* Check that we have unlocked key delete-links properly */
+  DBUG_ASSERT(info->key_del_used == 0);
+
+  /* THR_LOCK_maria protects the global open-table list and share->reopen */
+  pthread_mutex_lock(&THR_LOCK_maria);
+  if (info->lock_type == F_EXTRA_LCK)
+    info->lock_type=F_UNLCK;            /* HA_EXTRA_NO_USER_CHANGE */
+
+  if (share->reopen == 1 && share->kfile.file >= 0)
+    _ma_decrement_open_count(info);
+
+  if (info->lock_type != F_UNLCK)
+  {
+    if (maria_lock_database(info,F_UNLCK))
+      error=my_errno;
+  }
+  /*
+    Lock ordering: close_lock before intern_lock (intern_lock is
+    released and re-taken further down, close_lock is held throughout).
+  */
+  pthread_mutex_lock(&share->close_lock);
+  pthread_mutex_lock(&share->intern_lock);
+
+  if (share->options & HA_OPTION_READ_ONLY_DATA)
+  {
+    share->r_locks--;
+    share->tot_locks--;
+  }
+  if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+  {
+    if (end_io_cache(&info->rec_cache))
+      error=my_errno;
+    info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+  }
+  /* flag is true when this was the last handle referencing the share */
+  flag= !--share->reopen;
+  maria_open_list=list_delete(maria_open_list,&info->open_list);
+
+  my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+  (*share->end)(info);
+
+  if (flag)
+  {
+    /* Last close of file; Flush everything */
+
+    /* Check that we don't have any dangling pointers from the transaction */
+    DBUG_ASSERT(share->in_trans == 0);
+
+    if (share->kfile.file >= 0)
+    {
+      if ((*share->once_end)(share))
+        error= my_errno;
+      /* Temporary or deleted tables need not have pages written back */
+      if (flush_pagecache_blocks(share->pagecache, &share->kfile,
+                                 ((share->temporary || share->deleting) ?
+                                  FLUSH_IGNORE_CHANGED :
+                                  FLUSH_RELEASE)))
+        error= my_errno;
+#ifdef HAVE_MMAP
+      if (share->file_map)
+        _ma_unmap_file(info);
+#endif
+      /*
+        If we are crashed, we can safely flush the current state as it will
+        not change the crashed state.
+        We can NOT write the state in other cases as other threads
+        may be using the file at this point
+        IF using --external-locking, which does not apply to Maria.
+      */
+      if (((share->changed && share->base.born_transactional) ||
+           maria_is_crashed(info)))
+      {
+        /*
+          State must be written to file as it was not done at table's
+          unlocking.
+        */
+        if (_ma_state_info_write(share, MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET))
+          error= my_errno;
+      }
+      /*
+        File must be synced as it is going out of the maria_open_list and so
+        becoming unknown to future Checkpoints.
+      */
+      if (share->now_transactional && my_sync(share->kfile.file, MYF(MY_WME)))
+        error= my_errno;
+      if (my_close(share->kfile.file, MYF(0)))
+        error= my_errno;
+    }
+#ifdef THREAD
+    /* Destroy per-share locks; safe now since reopen dropped to zero */
+    thr_lock_delete(&share->lock);
+    (void) pthread_mutex_destroy(&share->key_del_lock);
+    {
+      int i,keys;
+      keys = share->state.header.keys;
+      VOID(rwlock_destroy(&share->mmap_lock));
+      for(i=0; i<keys; i++) {
+        VOID(rwlock_destroy(&share->keyinfo[i].root_lock));
+      }
+    }
+#endif
+    DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+    /*
+      We assign -1 because checkpoint does not need to flush (in case we
+      have a concurrent checkpoint; if not, we do not need it here either)
+    */
+    share->kfile.file= -1;
+
+    /*
+      Remember share->history for future opens
+
+      We have to unlock share->intern_lock then lock it after
+      LOCK_trn_list (trnman_lock()) to avoid dead locks.
+    */
+    pthread_mutex_unlock(&share->intern_lock);
+    _ma_remove_not_visible_states_with_lock(share, TRUE);
+    pthread_mutex_lock(&share->intern_lock);
+
+    if (share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME)
+    {
+      /* we cannot my_free() the share, Checkpoint would see a bad pointer */
+      share->in_checkpoint|= MARIA_CHECKPOINT_SHOULD_FREE_ME;
+    }
+    else
+      share_can_be_freed= TRUE;
+
+    if (share->state_history)
+    {
+      MARIA_STATE_HISTORY_CLOSED *history;
+      /*
+        Here we ignore the unlikely case that we don't have memory to
+        store the state. In the worst case what happens is that any transaction
+        that tries to access this table will get a wrong status information.
+      */
+      if ((history= (MARIA_STATE_HISTORY_CLOSED *)
+           my_malloc(sizeof(*history), MYF(MY_WME))))
+      {
+        history->create_rename_lsn= share->state.create_rename_lsn;
+        history->state_history= share->state_history;
+        if (my_hash_insert(&maria_stored_state, (uchar*) history))
+          my_free(history, MYF(0));
+      }
+      /* Marker for concurrent checkpoint */
+      share->state_history= 0;
+    }
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  pthread_mutex_unlock(&share->intern_lock);
+  pthread_mutex_unlock(&share->close_lock);
+  if (share_can_be_freed)
+  {
+    (void) pthread_mutex_destroy(&share->intern_lock);
+    (void) pthread_mutex_destroy(&share->close_lock);
+    (void) pthread_cond_destroy(&share->key_del_cond);
+    my_free((uchar *)share, MYF(0));
+    /*
+      If share cannot be freed, it's because checkpoint has previously
+      recorded to include this share in the checkpoint and so is soon going to
+      look at some of its content (share->in_checkpoint/id/last_version).
+    */
+  }
+  my_free(info->ftparser_param, MYF(MY_ALLOW_ZERO_PTR));
+  if (info->dfile.file >= 0)
+  {
+    /*
+      This is outside of mutex so would confuse a concurrent
+      Checkpoint. Fortunately in BLOCK_RECORD we close earlier under mutex.
+    */
+    if (my_close(info->dfile.file, MYF(0)))
+      error= my_errno;
+  }
+
+  delete_dynamic(&info->pinned_pages);
+  my_free(info, MYF(0));
+
+  if (error)
+  {
+    DBUG_PRINT("error", ("Got error on close: %d", my_errno));
+    DBUG_RETURN(my_errno= error);
+  }
+  DBUG_RETURN(0);
+} /* maria_close */
diff --git a/storage/maria/ma_commit.c b/storage/maria/ma_commit.c
new file mode 100644
index 00000000000..70bc668a220
--- /dev/null
+++ b/storage/maria/ma_commit.c
@@ -0,0 +1,129 @@
+/* Copyright (C) 2007-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+
+/**
+ writes a COMMIT record to log and commits transaction in memory
+
+ @param trn transaction
+
+ @return Operation status
+ @retval 0 ok
+ @retval 1 error (disk error or out of memory)
+*/
+
+int ma_commit(TRN *trn)
+{
+  int res;
+  LSN commit_lsn;
+  /*
+    NOTE(review): log_array is passed uninitialized with only the
+    TRANSLOG_INTERNAL_PARTS slots; presumably translog_write_record()
+    fills the internal parts itself — confirm against translog API.
+  */
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS];
+  DBUG_ENTER("ma_commit");
+
+  DBUG_ASSERT(trn->rec_lsn == LSN_IMPOSSIBLE);
+  if (trn->undo_lsn == 0) /* no work done, rollback (cheaper than commit) */
+    DBUG_RETURN(trnman_rollback_trn(trn));
+  /*
+    - if COMMIT record is written before trnman_commit_trn():
+    if Checkpoint comes in the middle it will see trn is not committed,
+    then if crash, Recovery might roll back trn (if min(rec_lsn) is after
+    COMMIT record) and this is not an issue as
+    * transaction's updates were not made visible to other transactions
+    * "commit ok" was not sent to client
+    Alternatively, Recovery might commit trn (if min(rec_lsn) is before COMMIT
+    record), which is ok too. All in all it means that "trn committed" is not
+    100% equal to "COMMIT record written".
+    - if COMMIT record is written after trnman_commit_trn():
+    if crash happens between the two, trn will be rolled back which is an
+    issue (transaction's updates were made visible to other transactions).
+    So we need to go the first way.
+
+    Note that we have to use | here to ensure that all calls are made.
+  */
+
+  /*
+    We do not store "thd->transaction.xid_state.xid" for now, it will be
+    needed only when we support XA.
+  */
+  res= (translog_write_record(&commit_lsn, LOGREC_COMMIT,
+                              trn, NULL, 0,
+                              sizeof(log_array)/sizeof(log_array[0]),
+                              log_array, NULL, NULL) |
+        translog_flush(commit_lsn));
+
+  /* Test hook: widen the window between log write and in-memory commit */
+  DBUG_EXECUTE_IF("maria_sleep_in_commit",
+                  {
+                    DBUG_PRINT("info", ("maria_sleep_in_commit"));
+                    sleep(3);
+                  });
+  res|= trnman_commit_trn(trn);
+
+
+  /*
+    Note: if trnman_commit_trn() fails above, we have already
+    written the COMMIT record, so Checkpoint and Recovery will see the
+    transaction as committed.
+  */
+  DBUG_RETURN(res);
+}
+
+
+/**
+  Writes a COMMIT record for a transaction associated with a file
+
+ @param info Maria handler
+
+ @return Operation status
+ @retval 0 ok
+ @retval # error (disk error or out of memory)
+*/
+
+int maria_commit(MARIA_HA *info)
+{
+  /* Non-transactional tables log nothing, so commit is a no-op */
+  if (!info->s->now_transactional)
+    return 0;
+  return ma_commit(info->trn);
+}
+
+
+/**
+ Starts a transaction on a file handle
+
+ @param info Maria handler
+
+ @return Operation status
+ @retval 0 ok
+ @retval # Error code.
+
+ @note this can be used only in single-threaded programs (tests),
+ because we create a transaction (trnman_new_trn) with WT_THD=0.
+ XXX it needs to be fixed when we'll start using maria_begin from SQL.
+*/
+
+int maria_begin(MARIA_HA *info)
+{
+  TRN *trn;
+  DBUG_ENTER("maria_begin");
+
+  /* Nothing to start for a non-transactional table */
+  if (!info->s->now_transactional)
+    DBUG_RETURN(0);
+
+  /* WT_THD=0: only usable from single-threaded programs, see header note */
+  if (unlikely(!(trn= trnman_new_trn(0))))
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+
+  DBUG_PRINT("info", ("TRN set to 0x%lx", (ulong) trn));
+  _ma_set_trn_for_table(info, trn);
+  DBUG_RETURN(0);
+}
+
diff --git a/storage/maria/ma_commit.h b/storage/maria/ma_commit.h
new file mode 100644
index 00000000000..2c57c73fd7a
--- /dev/null
+++ b/storage/maria/ma_commit.h
@@ -0,0 +1,18 @@
+/* Copyright (C) 2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+C_MODE_START
+int ma_commit(TRN *trn);
+C_MODE_END
diff --git a/storage/maria/ma_control_file.c b/storage/maria/ma_control_file.c
new file mode 100644
index 00000000000..6f9018885e9
--- /dev/null
+++ b/storage/maria/ma_control_file.c
@@ -0,0 +1,607 @@
+/* Copyright (C) 2007 MySQL AB & Guilhem Bichot & Michael Widenius
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3234 Maria control file
+ First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+#ifndef EXTRACT_DEFINITIONS
+#include "maria_def.h"
+#include "ma_checkpoint.h"
+#endif
+
+/*
+ A control file contains the following objects:
+
+Start of create time variables (at start of file):
+ - Magic string (including version number of Maria control file)
+ - Uuid
+ - Size of create time part
+ - Size of dynamic part
+ - Maria block size
+..... Here we can add new variables without changing format
+ - Checksum of create time part (last of block)
+
+Start of changeable part:
+ - Checksum of changeable part
+ - LSN of last checkpoint
+ - Number of last log file
+ - Max trid in control file (since Maria 1.5 May 2008)
+ - Number of consecutive recovery failures (since Maria 1.5 May 2008)
+..... Here we can add new variables without changing format
+
+The idea is that one can add new variables to the control file and still
+use it with old program versions. If one needs to do an incompatible change
+one should increment the control file version number.
+*/
+
+/* Total size should be < sector size for atomic write operation */
+#define CF_MAX_SIZE 512
+#define CF_MIN_SIZE (CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + \
+ CF_CHECKSUM_SIZE * 2 + CF_LSN_SIZE + CF_FILENO_SIZE)
+
+/* Create time variables */
+#define CF_MAGIC_STRING "\xfe\xfe\xc"
+#define CF_MAGIC_STRING_OFFSET 0
+#define CF_MAGIC_STRING_SIZE (sizeof(CF_MAGIC_STRING)-1)
+#define CF_VERSION_OFFSET (CF_MAGIC_STRING_OFFSET + CF_MAGIC_STRING_SIZE)
+#define CF_VERSION_SIZE 1
+#define CF_UUID_OFFSET (CF_VERSION_OFFSET + CF_VERSION_SIZE)
+#define CF_UUID_SIZE MY_UUID_SIZE
+#define CF_CREATE_TIME_SIZE_OFFSET (CF_UUID_OFFSET + CF_UUID_SIZE)
+#define CF_SIZE_SIZE 2
+#define CF_CHANGEABLE_SIZE_OFFSET (CF_CREATE_TIME_SIZE_OFFSET + CF_SIZE_SIZE)
+#define CF_BLOCKSIZE_OFFSET (CF_CHANGEABLE_SIZE_OFFSET + CF_SIZE_SIZE)
+#define CF_BLOCKSIZE_SIZE 2
+
+#define CF_CREATE_TIME_TOTAL_SIZE (CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + \
+ CF_CHECKSUM_SIZE)
+
+/*
+ Start of the part that changes during execution
+ This is stored at offset uint2korr(file[CF_CHANGEABLE_SIZE])
+*/
+#define CF_CHECKSUM_OFFSET 0
+#define CF_CHECKSUM_SIZE 4
+#define CF_LSN_OFFSET (CF_CHECKSUM_OFFSET + CF_CHECKSUM_SIZE)
+#define CF_LSN_SIZE LSN_STORE_SIZE
+#define CF_FILENO_OFFSET (CF_LSN_OFFSET + CF_LSN_SIZE)
+#define CF_FILENO_SIZE 4
+#define CF_MAX_TRID_OFFSET (CF_FILENO_OFFSET + CF_FILENO_SIZE)
+#define CF_MAX_TRID_SIZE TRANSID_SIZE
+#define CF_RECOV_FAIL_OFFSET (CF_MAX_TRID_OFFSET + CF_MAX_TRID_SIZE)
+#define CF_RECOV_FAIL_SIZE 1
+#define CF_CHANGEABLE_TOTAL_SIZE (CF_RECOV_FAIL_OFFSET + CF_RECOV_FAIL_SIZE)
+
+/*
+ The following values should not be changed, except when changing version
+ number of the maria control file. These are the minimum sizes of the
+ parts the code can handle.
+*/
+
+#define CF_MIN_CREATE_TIME_TOTAL_SIZE \
+(CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + CF_CHECKSUM_SIZE)
+#define CF_MIN_CHANGEABLE_TOTAL_SIZE \
+(CF_FILENO_OFFSET + CF_FILENO_SIZE)
+
+#ifndef EXTRACT_DEFINITIONS
+
+/* This module owns these two vars. */
+/**
+ This LSN serves for the two-checkpoint rule, and also to find the
+ checkpoint record when doing a recovery.
+*/
+LSN last_checkpoint_lsn= LSN_IMPOSSIBLE;
+uint32 last_logno= FILENO_IMPOSSIBLE;
+/**
+ The maximum transaction id given to a transaction. It is only updated at
+ clean shutdown (in case of crash, logs have better information).
+*/
+TrID max_trid_in_control_file= 0;
+
+/**
+ Number of consecutive log or recovery failures. Reset to 0 after recovery's
+ success.
+*/
+uint8 recovery_failures= 0;
+
+/**
+ @brief If log's lock should be asserted when writing to control file.
+
+ Can be re-used by any function which needs to be thread-safe except when
+ it is called at startup.
+*/
+my_bool maria_multi_threaded= FALSE;
+/** @brief if currently doing a recovery */
+my_bool maria_in_recovery= FALSE;
+
+/**
+  Control file is less than 512 bytes (a disk sector),
+ to be as atomic as possible
+*/
+static int control_file_fd= -1;
+
+static uint cf_create_time_size;
+static uint cf_changeable_size;
+
+/**
+ @brief Create Maria control file
+*/
+
+/**
+  @brief Create Maria control file
+
+  Creates the file, writes the create-time header (magic, version, uuid,
+  section sizes, block size, checksum) and initializes the changeable
+  part with "undefined" values via ma_control_file_write_and_force().
+
+  @param name       full path of the control file to create
+  @param open_flags flags passed through to my_create()
+
+  @return 0 on success, a CONTROL_FILE_* error code otherwise
+*/
+
+static CONTROL_FILE_ERROR create_control_file(const char *name,
+                                              int open_flags)
+{
+  uint32 sum;
+  uchar buffer[CF_CREATE_TIME_TOTAL_SIZE];
+  /* Fixed: trace tag now matches the function name (was
+     "maria_create_control_file") */
+  DBUG_ENTER("create_control_file");
+
+  if ((control_file_fd= my_create(name, 0,
+                                  open_flags,
+                                  MYF(MY_SYNC_DIR | MY_WME))) < 0)
+    DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);
+
+  /* Reset variables, as we are creating the file */
+  cf_create_time_size= CF_CREATE_TIME_TOTAL_SIZE;
+  cf_changeable_size= CF_CHANGEABLE_TOTAL_SIZE;
+
+  /*
+    Create unique uuid for the control file.
+    NOTE(review): stack/global addresses cast to ulong are used as seed
+    entropy here; on LLP64 platforms the cast truncates — presumably
+    acceptable for seeding, confirm.
+  */
+  my_uuid_init((ulong) &buffer, (ulong) &maria_uuid);
+  my_uuid(maria_uuid);
+
+  /* Prepare and write the file header */
+  memcpy(buffer, CF_MAGIC_STRING, CF_MAGIC_STRING_SIZE);
+  buffer[CF_VERSION_OFFSET]= CONTROL_FILE_VERSION;
+  memcpy(buffer + CF_UUID_OFFSET, maria_uuid, CF_UUID_SIZE);
+  int2store(buffer + CF_CREATE_TIME_SIZE_OFFSET, cf_create_time_size);
+  int2store(buffer + CF_CHANGEABLE_SIZE_OFFSET, cf_changeable_size);
+
+  /* Write create time variables */
+  int2store(buffer + CF_BLOCKSIZE_OFFSET, maria_block_size);
+
+  /* Store checksum for create time parts */
+  sum= (uint32) my_checksum(0, buffer, cf_create_time_size -
+                            CF_CHECKSUM_SIZE);
+  int4store(buffer + cf_create_time_size - CF_CHECKSUM_SIZE, sum);
+
+  if (my_pwrite(control_file_fd, buffer, cf_create_time_size,
+                0, MYF(MY_FNABP |  MY_WME)))
+    DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);
+
+  /*
+    To be safer we should make sure that there are no logs or data/index
+    files around (indeed it could be that the control file alone was deleted
+    or not restored, and we should not go on with life at this point).
+
+    Things should still be relatively safe as if someone tries to use
+    an old table with a new control file the different uuid:s between
+    the files will cause ma_open() to generate an HA_ERR_OLD_FILE
+    error. When used from mysqld this will cause the table to be open
+    in repair mode which will remove all dependencies between the
+    table and the old control file.
+
+    We could have a tool which can rebuild the control file, by reading the
+    directory of logs, finding the newest log, reading it to find last
+    checkpoint... Slow but can save your db. For this to be possible, we
+    must always write to the control file right after writing the checkpoint
+    log record, and do nothing in between (i.e. the checkpoint must be
+    usable as soon as it has been written to the log).
+  */
+
+  /* init the file with these "undefined" values */
+  DBUG_RETURN(ma_control_file_write_and_force(LSN_IMPOSSIBLE,
+                                              FILENO_IMPOSSIBLE, 0, 0));
+}
+
+
+/**
+ Locks control file exclusively. This is kept for the duration of the engine
+ process, to prevent another Maria instance to write to our logs or control
+ file.
+*/
+
+/*
+  Take an exclusive fcntl-style lock on the control file (non-Windows only).
+
+  @param name  file name, used only in the error message
+
+  @return 0 on success, 1 if the lock could not be obtained within
+          MARIA_MAX_CONTROL_FILE_LOCK_RETRY seconds
+*/
+static int lock_control_file(const char *name)
+{
+  uint retry= 0;
+  /*
+    On Windows, my_lock() uses locking() which is mandatory locking and so
+    prevents maria-recovery.test from copying the control file. And in case of
+    crash, it may take a while for Windows to unlock file, causing downtime.
+  */
+  /**
+    @todo BUG We should explore my_sopen(_SH_DENYWRD) to open or create the
+    file under Windows.
+  */
+#ifndef __WIN__
+  /*
+    We can't here use the automatic wait in my_lock() as the alarm thread
+    may not yet exist.
+  */
+  /* Retry once per second with MY_NO_WAIT, printing the error only once */
+  while (my_lock(control_file_fd, F_WRLCK, 0L, F_TO_EOF,
+                 MYF(MY_SEEK_NOT_DONE | MY_FORCE_LOCK | MY_NO_WAIT)))
+  {
+    if (retry == 0)
+      my_printf_error(HA_ERR_INITIALIZATION,
+                      "Can't lock aria control file '%s' for exclusive use, "
+                      "error: %d. Will retry for %d seconds", 0,
+                      name, my_errno, MARIA_MAX_CONTROL_FILE_LOCK_RETRY);
+    if (retry++ > MARIA_MAX_CONTROL_FILE_LOCK_RETRY)
+      return 1;
+    sleep(1);
+  }
+#endif
+  return 0;
+}
+
+
+/*
+ @brief Initialize control file subsystem
+
+ Looks for the control file. If none and creation is requested, creates file.
+ If present, reads it to find out last checkpoint's LSN and last log, updates
+ the last_checkpoint_lsn and last_logno global variables.
+ Called at engine's start.
+
+ @note
+ The format of the control file is defined in the comments and defines
+ at the start of this file.
+
+ @param create_if_missing create file if not found
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error (in which case the file is left closed)
+*/
+
+CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing,
+                                        my_bool print_error)
+{
+  uchar buffer[CF_MAX_SIZE];
+  char name[FN_REFLEN], errmsg_buff[256];
+  const char *errmsg, *lock_failed_errmsg= "Could not get an exclusive lock;"
+    " file is probably in use by another process";
+  uint new_cf_create_time_size, new_cf_changeable_size, new_block_size;
+  my_off_t file_size;
+  int open_flags= O_BINARY | /*O_DIRECT |*/ O_RDWR;
+  int error= CONTROL_FILE_UNKNOWN_ERROR;
+  DBUG_ENTER("ma_control_file_open");
+
+  /*
+    If you change sizes in the #defines, you at least have to change the
+    "*store" and "*korr" calls in this file, and can even create backward
+    compatibility problems. Beware!
+  */
+  DBUG_ASSERT(CF_LSN_SIZE == (3+4));
+  DBUG_ASSERT(CF_FILENO_SIZE == 4);
+
+  if (control_file_fd >= 0) /* already open */
+    DBUG_RETURN(0);
+
+  if (fn_format(name, CONTROL_FILE_BASE_NAME,
+                maria_data_root, "", MYF(MY_WME)) == NullS)
+    DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);
+
+  /* File missing: create it (if allowed), lock it and we are done */
+  if (my_access(name,F_OK))
+  {
+    CONTROL_FILE_ERROR create_error;
+    if (!create_if_missing)
+    {
+      error= CONTROL_FILE_MISSING;
+      errmsg= "Can't find file";
+      goto err;
+    }
+    if ((create_error= create_control_file(name, open_flags)))
+    {
+      error= create_error;
+      errmsg= "Can't create file";
+      goto err;
+    }
+    if (lock_control_file(name))
+    {
+      errmsg= lock_failed_errmsg;
+      goto err;
+    }
+    goto ok;
+  }
+
+  /* Otherwise, file exists */
+
+  if ((control_file_fd= my_open(name, open_flags, MYF(MY_WME))) < 0)
+  {
+    errmsg= "Can't open file";
+    goto err;
+  }
+
+  if (lock_control_file(name)) /* lock it before reading content */
+  {
+    errmsg= lock_failed_errmsg;
+    goto err;
+  }
+
+  file_size= my_seek(control_file_fd, 0, SEEK_END, MYF(MY_WME));
+  if (file_size == MY_FILEPOS_ERROR)
+  {
+    errmsg= "Can't read size";
+    goto err;
+  }
+  if (file_size < CF_MIN_SIZE)
+  {
+    /*
+      Given that normally we write only a sector and it's atomic, the only
+      possibility for a file to be of too short size is if we crashed at the
+      very first startup, between file creation and file write. Quite unlikely
+      (and can be made even more unlikely by doing this: create a temp file,
+      write it, and then rename it to be the control file).
+      What's more likely is if someone forgot to restore the control file,
+      just did a "touch control" to try to get Maria to start, or if the
+      disk/filesystem has a problem.
+      So let's be rigid.
+    */
+    error= CONTROL_FILE_TOO_SMALL;
+    errmsg= "Size of control file is smaller than expected";
+    goto err;
+  }
+
+  /* Check if control file is unexpectedly big */
+  if (file_size > CF_MAX_SIZE)
+  {
+    error= CONTROL_FILE_TOO_BIG;
+    errmsg= "File size bigger than expected";
+    goto err;
+  }
+
+  /* file_size <= CF_MAX_SIZE, so the whole file fits into buffer */
+  if (my_pread(control_file_fd, buffer, (size_t)file_size, 0, MYF(MY_FNABP)))
+  {
+    errmsg= "Can't read file";
+    goto err;
+  }
+
+  /* Validate magic string, then version (a newer version is rejected) */
+  if (memcmp(buffer + CF_MAGIC_STRING_OFFSET,
+             CF_MAGIC_STRING, CF_MAGIC_STRING_SIZE))
+  {
+    error= CONTROL_FILE_BAD_MAGIC_STRING;
+    errmsg= "Missing valid id at start of file. File is not a valid aria control file";
+    goto err;
+  }
+
+  if (buffer[CF_VERSION_OFFSET] > CONTROL_FILE_VERSION)
+  {
+    error= CONTROL_FILE_BAD_VERSION;
+    sprintf(errmsg_buff, "File is from a future aria system: %d. Current version is: %d",
+            (int) buffer[CF_VERSION_OFFSET], CONTROL_FILE_VERSION);
+    errmsg= errmsg_buff;
+    goto err;
+  }
+
+  /* Section sizes are stored in the file; they must add up to file_size */
+  new_cf_create_time_size= uint2korr(buffer + CF_CREATE_TIME_SIZE_OFFSET);
+  new_cf_changeable_size= uint2korr(buffer + CF_CHANGEABLE_SIZE_OFFSET);
+
+  if (new_cf_create_time_size < CF_MIN_CREATE_TIME_TOTAL_SIZE ||
+      new_cf_changeable_size < CF_MIN_CHANGEABLE_TOTAL_SIZE ||
+      new_cf_create_time_size + new_cf_changeable_size != file_size)
+  {
+    error= CONTROL_FILE_INCONSISTENT_INFORMATION;
+    errmsg= "Sizes stored in control file are inconsistent";
+    goto err;
+  }
+
+  new_block_size= uint2korr(buffer + CF_BLOCKSIZE_OFFSET);
+  if (new_block_size != maria_block_size && maria_block_size)
+  {
+    error= CONTROL_FILE_WRONG_BLOCKSIZE;
+    sprintf(errmsg_buff,
+            "Block size in control file (%u) is different than given aria_block_size: %u",
+            new_block_size, (uint) maria_block_size);
+    errmsg= errmsg_buff;
+    goto err;
+  }
+  maria_block_size= new_block_size;
+
+  /* Verify the two checksums: create-time part, then changeable part */
+  if (my_checksum(0, buffer, new_cf_create_time_size - CF_CHECKSUM_SIZE) !=
+      uint4korr(buffer + new_cf_create_time_size - CF_CHECKSUM_SIZE))
+  {
+    error= CONTROL_FILE_BAD_HEAD_CHECKSUM;
+    errmsg= "Fixed part checksum mismatch";
+    goto err;
+  }
+
+  if (my_checksum(0, buffer + new_cf_create_time_size + CF_CHECKSUM_SIZE,
+                  new_cf_changeable_size - CF_CHECKSUM_SIZE) !=
+      uint4korr(buffer + new_cf_create_time_size))
+  {
+    error= CONTROL_FILE_BAD_CHECKSUM;
+    errmsg= "Changeable part (end of control file) checksum mismatch";
+    goto err;
+  }
+
+  /* All checks passed: publish the file's content in the module's globals */
+  memcpy(maria_uuid, buffer + CF_UUID_OFFSET, CF_UUID_SIZE);
+  cf_create_time_size= new_cf_create_time_size;
+  cf_changeable_size= new_cf_changeable_size;
+  last_checkpoint_lsn= lsn_korr(buffer + new_cf_create_time_size +
+                                CF_LSN_OFFSET);
+  last_logno= uint4korr(buffer + new_cf_create_time_size + CF_FILENO_OFFSET);
+  /* Newer fields are read only if the file is big enough to contain them */
+  if (new_cf_changeable_size >= (CF_MAX_TRID_OFFSET + CF_MAX_TRID_SIZE))
+    max_trid_in_control_file=
+      transid_korr(buffer + new_cf_create_time_size + CF_MAX_TRID_OFFSET);
+  if (new_cf_changeable_size >= (CF_RECOV_FAIL_OFFSET + CF_RECOV_FAIL_SIZE))
+    recovery_failures=
+      (buffer + new_cf_create_time_size + CF_RECOV_FAIL_OFFSET)[0];
+
+ok:
+  DBUG_RETURN(0);
+
+err:
+  if (print_error)
+    my_printf_error(HA_ERR_INITIALIZATION,
+                    "Got error '%s' when trying to use aria control file "
+                    "'%s'", 0, errmsg, name);
+  ma_control_file_end(); /* will unlock file if needed */
+  DBUG_RETURN(error);
+}
+
+
+/*
+ Write information durably to the control file; stores this information into
+ the last_checkpoint_lsn, last_logno, max_trid_in_control_file,
+ recovery_failures global variables.
+ Called when we have created a new log (after syncing this log's creation),
+ when we have written a checkpoint (after syncing this log record), at
+ shutdown (for storing trid in case logs are soon removed by user), and
+ before and after recovery (to store recovery_failures).
+ Variables last_checkpoint_lsn and last_logno must be protected by caller
+ using log's lock, unless this function is called at startup.
+
+ SYNOPSIS
+ ma_control_file_write_and_force()
+ last_checkpoint_lsn_arg LSN of last checkpoint
+ last_logno_arg last log file number
+ max_trid_arg maximum transaction longid
+ recovery_failures_arg consecutive recovery failures
+
+ NOTE
+ We always want to do one single my_pwrite() here to be as atomic as
+ possible.
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+int ma_control_file_write_and_force(LSN last_checkpoint_lsn_arg,
+                                    uint32 last_logno_arg,
+                                    TrID max_trid_arg,
+                                    uint8 recovery_failures_arg)
+{
+  uchar buffer[CF_MAX_SIZE];
+  uint32 sum;
+  my_bool no_need_sync;
+  DBUG_ENTER("ma_control_file_write_and_force");
+
+  /*
+    We don't need to sync if this is just an increase of
+    recovery_failures: it's even good if that counter is not increased on disk
+    in case of power or hardware failure (less false positives when removing
+    logs).
+  */
+  no_need_sync= ((last_checkpoint_lsn == last_checkpoint_lsn_arg) &&
+                 (last_logno == last_logno_arg) &&
+                 (max_trid_in_control_file == max_trid_arg) &&
+                 (recovery_failures_arg > 0));
+
+  if (control_file_fd < 0)
+    DBUG_RETURN(1);
+
+#ifndef DBUG_OFF
+  if (maria_multi_threaded)
+    translog_lock_handler_assert_owner();
+#endif
+
+  /*
+    buffer holds only the changeable part here; CF_*_OFFSET values are
+    relative to the start of that part (see defines at top of file).
+  */
+  lsn_store(buffer + CF_LSN_OFFSET, last_checkpoint_lsn_arg);
+  int4store(buffer + CF_FILENO_OFFSET, last_logno_arg);
+  transid_store(buffer + CF_MAX_TRID_OFFSET, max_trid_arg);
+  (buffer + CF_RECOV_FAIL_OFFSET)[0]= recovery_failures_arg;
+
+  if (cf_changeable_size > CF_CHANGEABLE_TOTAL_SIZE)
+  {
+    /*
+      More room than needed for us. Must be a newer version. Clear part which
+      we cannot maintain, so that any future version notices we didn't
+      maintain its extra data.
+    */
+    uint zeroed= cf_changeable_size - CF_CHANGEABLE_TOTAL_SIZE;
+    char msg[150];
+    bzero(buffer + CF_CHANGEABLE_TOTAL_SIZE, zeroed);
+    my_snprintf(msg, sizeof(msg),
+                "Control file must be from a newer version; zero-ing out %u"
+                " unknown bytes in control file at offset %u", zeroed,
+                cf_changeable_size + cf_create_time_size);
+    ma_message_no_user(ME_JUST_WARNING, msg);
+  }
+  else
+  {
+    /* not enough room for what we need to store: enlarge */
+    cf_changeable_size= CF_CHANGEABLE_TOTAL_SIZE;
+  }
+  /* Note that the create-time portion is not touched */
+
+  /* Checksum is stored first */
+  compile_time_assert(CF_CHECKSUM_OFFSET == 0);
+  sum= my_checksum(0, buffer + CF_CHECKSUM_SIZE,
+                   cf_changeable_size - CF_CHECKSUM_SIZE);
+  int4store(buffer, sum);
+
+  /* Single pwrite of the changeable part, to be as atomic as possible */
+  if (my_pwrite(control_file_fd, buffer, cf_changeable_size,
+                cf_create_time_size, MYF(MY_FNABP | MY_WME)) ||
+      (!no_need_sync && my_sync(control_file_fd, MYF(MY_WME))))
+    DBUG_RETURN(1);
+
+  /* Only update the in-memory copies after the write has succeeded */
+  last_checkpoint_lsn= last_checkpoint_lsn_arg;
+  last_logno= last_logno_arg;
+  max_trid_in_control_file= max_trid_arg;
+  recovery_failures= recovery_failures_arg;
+
+  cf_changeable_size= CF_CHANGEABLE_TOTAL_SIZE; /* no more warning */
+  DBUG_RETURN(0);
+}
+
+
+/*
+ Free resources taken by control file subsystem
+
+ SYNOPSIS
+ ma_control_file_end()
+*/
+
+int ma_control_file_end(void)
+{
+  int close_error;
+  DBUG_ENTER("ma_control_file_end");
+
+  if (control_file_fd < 0) /* already closed */
+    DBUG_RETURN(0);
+
+  /* Release the exclusive lock taken by lock_control_file() */
+#ifndef __WIN__
+  (void) my_lock(control_file_fd, F_UNLCK, 0L, F_TO_EOF,
+                 MYF(MY_SEEK_NOT_DONE | MY_FORCE_LOCK));
+#endif
+
+  close_error= my_close(control_file_fd, MYF(MY_WME));
+  /*
+    As my_close() frees structures even if close() fails, we do the same,
+    i.e. we mark the file as closed in all cases.
+  */
+  control_file_fd= -1;
+  /*
+    As this module owns these variables, closing the module forbids access to
+    them (just a safety):
+  */
+  last_checkpoint_lsn= LSN_IMPOSSIBLE;
+  last_logno= FILENO_IMPOSSIBLE;
+  max_trid_in_control_file= recovery_failures= 0;
+
+  DBUG_RETURN(close_error);
+}
+
+
+/**
+ Tells if control file is initialized.
+*/
+
+my_bool ma_control_file_inited(void)
+{
+  /* The control file is open exactly when its descriptor is valid */
+  return control_file_fd < 0 ? FALSE : TRUE;
+}
+
+#endif /* EXTRACT_DEFINITIONS */
diff --git a/storage/maria/ma_control_file.h b/storage/maria/ma_control_file.h
new file mode 100644
index 00000000000..f828ae69c6d
--- /dev/null
+++ b/storage/maria/ma_control_file.h
@@ -0,0 +1,74 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3234 Maria control file
+ First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+#ifndef _ma_control_file_h
+#define _ma_control_file_h
+
+#define CONTROL_FILE_BASE_NAME "aria_log_control"
+/*
+ Major version for control file. Should only be changed when doing
+ big changes that made the new control file incompatible with all
+ older versions of Maria.
+*/
+#define CONTROL_FILE_VERSION 1
+
+/* Here is the interface of this module */
+
+/*
+  LSN of the last checkpoint
+ (if last_checkpoint_lsn == LSN_IMPOSSIBLE then there was never a checkpoint)
+*/
+extern LSN last_checkpoint_lsn;
+/*
+ Last log number (if last_logno == FILENO_IMPOSSIBLE then there is no log
+ file yet)
+*/
+extern uint32 last_logno;
+
+extern TrID max_trid_in_control_file;
+
+extern uint8 recovery_failures;
+
+extern my_bool maria_multi_threaded, maria_in_recovery;
+
+/*
+  Status codes of this module; CONTROL_FILE_ERROR is notably the return
+  type of ma_control_file_open() (declared below). Apart from
+  CONTROL_FILE_OK, every value describes a distinct way opening or
+  validating the control file can fail (exact conditions are checked in
+  ma_control_file.c).
+*/
+typedef enum enum_control_file_error {
+  CONTROL_FILE_OK= 0,                    /* the only success value */
+  CONTROL_FILE_TOO_SMALL,
+  CONTROL_FILE_TOO_BIG,
+  CONTROL_FILE_BAD_MAGIC_STRING,
+  CONTROL_FILE_BAD_VERSION,
+  CONTROL_FILE_BAD_CHECKSUM,
+  CONTROL_FILE_BAD_HEAD_CHECKSUM,
+  CONTROL_FILE_MISSING,
+  CONTROL_FILE_INCONSISTENT_INFORMATION,
+  CONTROL_FILE_WRONG_BLOCKSIZE,
+  CONTROL_FILE_UNKNOWN_ERROR /* any other error */
+} CONTROL_FILE_ERROR;
+
+C_MODE_START
+CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing,
+ my_bool print_error);
+int ma_control_file_write_and_force(LSN last_checkpoint_lsn_arg,
+ uint32 last_logno_arg, TrID max_trid_arg,
+ uint8 recovery_failures_arg);
+int ma_control_file_end(void);
+my_bool ma_control_file_inited(void);
+C_MODE_END
+#endif
diff --git a/storage/maria/ma_create.c b/storage/maria/ma_create.c
new file mode 100644
index 00000000000..9cf042ed21e
--- /dev/null
+++ b/storage/maria/ma_create.c
@@ -0,0 +1,1419 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Create a MARIA table */
+
+#include "ma_ftdefs.h"
+#include "ma_sp_defs.h"
+#include <my_bit.h>
+#include "ma_blockrec.h"
+#include "trnman_public.h"
+
+#if defined(MSDOS) || defined(__WIN__)
+#ifdef __WIN__
+#include <fcntl.h>
+#else
+#include <process.h> /* Prototype for getpid */
+#endif
+#endif
+#include <m_ctype.h>
+
+static int compare_columns(MARIA_COLUMNDEF **a, MARIA_COLUMNDEF **b);
+
+/*
+ Old options is used when recreating database, from maria_chk
+*/
+
+int maria_create(const char *name, enum data_file_type datafile_type,
+ uint keys,MARIA_KEYDEF *keydefs,
+ uint columns, MARIA_COLUMNDEF *columndef,
+ uint uniques, MARIA_UNIQUEDEF *uniquedefs,
+ MARIA_CREATE_INFO *ci,uint flags)
+{
+ register uint i,j;
+ File dfile,file;
+ int errpos,save_errno, create_mode= O_RDWR | O_TRUNC, res;
+ myf create_flag;
+ uint length,max_key_length,packed,pack_bytes,pointer,real_length_diff,
+ key_length,info_length,key_segs,options,min_key_length,
+ base_pos,long_varchar_count,varchar_length,
+ unique_key_parts,fulltext_keys,offset, not_block_record_extra_length;
+ uint max_field_lengths, extra_header_size, column_nr;
+ ulong reclength, real_reclength,min_pack_length;
+ char filename[FN_REFLEN], linkname[FN_REFLEN], *linkname_ptr;
+ ulong pack_reclength;
+ ulonglong tot_length,max_rows, tmp;
+ enum en_fieldtype type;
+ enum data_file_type org_datafile_type= datafile_type;
+ MARIA_SHARE share;
+ MARIA_KEYDEF *keydef,tmp_keydef;
+ MARIA_UNIQUEDEF *uniquedef;
+ HA_KEYSEG *keyseg,tmp_keyseg;
+ MARIA_COLUMNDEF *column, *end_column;
+ double *rec_per_key_part;
+ ulong *nulls_per_key_part;
+ uint16 *column_array;
+ my_off_t key_root[HA_MAX_POSSIBLE_KEY], kfile_size_before_extension;
+ MARIA_CREATE_INFO tmp_create_info;
+ my_bool tmp_table= FALSE; /* cache for presence of HA_OPTION_TMP_TABLE */
+ my_bool forced_packed;
+ myf sync_dir= 0;
+ uchar *log_data= NULL;
+ DBUG_ENTER("maria_create");
+ DBUG_PRINT("enter", ("keys: %u columns: %u uniques: %u flags: %u",
+ keys, columns, uniques, flags));
+
+ DBUG_ASSERT(maria_inited);
+ LINT_INIT(dfile);
+ LINT_INIT(file);
+
+ if (!ci)
+ {
+ bzero((char*) &tmp_create_info,sizeof(tmp_create_info));
+ ci=&tmp_create_info;
+ }
+
+ if (keys + uniques > MARIA_MAX_KEY)
+ {
+ DBUG_RETURN(my_errno=HA_WRONG_CREATE_OPTION);
+ }
+ errpos=0;
+ options=0;
+ bzero((uchar*) &share,sizeof(share));
+
+ if (flags & HA_DONT_TOUCH_DATA)
+ {
+ /* We come here from recreate table */
+ org_datafile_type= ci->org_data_file_type;
+ if (!(ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD))
+ options= (ci->old_options &
+ (HA_OPTION_COMPRESS_RECORD | HA_OPTION_PACK_RECORD |
+ HA_OPTION_READ_ONLY_DATA | HA_OPTION_CHECKSUM |
+ HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE |
+ HA_OPTION_LONG_BLOB_PTR | HA_OPTION_PAGE_CHECKSUM));
+ else
+ {
+ /* Uncompressing rows */
+ options= (ci->old_options &
+ (HA_OPTION_CHECKSUM | HA_OPTION_TMP_TABLE |
+ HA_OPTION_DELAY_KEY_WRITE | HA_OPTION_LONG_BLOB_PTR |
+ HA_OPTION_PAGE_CHECKSUM));
+ }
+ }
+ else
+ {
+ /* Transactional tables must be of type BLOCK_RECORD */
+ if (ci->transactional)
+ datafile_type= BLOCK_RECORD;
+ }
+
+ if (ci->reloc_rows > ci->max_rows)
+ ci->reloc_rows=ci->max_rows; /* Check if wrong parameter */
+
+ if (!(rec_per_key_part=
+ (double*) my_malloc((keys + uniques)*HA_MAX_KEY_SEG*sizeof(double) +
+ (keys + uniques)*HA_MAX_KEY_SEG*sizeof(ulong) +
+ sizeof(uint16) * columns,
+ MYF(MY_WME | MY_ZEROFILL))))
+ DBUG_RETURN(my_errno);
+ nulls_per_key_part= (ulong*) (rec_per_key_part +
+ (keys + uniques) * HA_MAX_KEY_SEG);
+ column_array= (uint16*) (nulls_per_key_part +
+ (keys + uniques) * HA_MAX_KEY_SEG);
+
+
+ /* Start by checking fields and field-types used */
+ varchar_length=long_varchar_count=packed= not_block_record_extra_length=
+ pack_reclength= max_field_lengths= 0;
+ reclength= min_pack_length= ci->null_bytes;
+ forced_packed= 0;
+ column_nr= 0;
+
+ for (column= columndef, end_column= column + columns ;
+ column != end_column ;
+ column++)
+ {
+ /* Fill in not used struct parts */
+ column->column_nr= column_nr++;
+ column->offset= reclength;
+ column->empty_pos= 0;
+ column->empty_bit= 0;
+ column->fill_length= column->length;
+ if (column->null_bit)
+ options|= HA_OPTION_NULL_FIELDS;
+
+ reclength+= column->length;
+ type= column->type;
+ if (datafile_type == BLOCK_RECORD)
+ {
+ if (type == FIELD_SKIP_PRESPACE)
+ type= column->type= FIELD_NORMAL; /* SKIP_PRESPACE not supported */
+ if (type == FIELD_NORMAL &&
+ column->length > FULL_PAGE_SIZE(maria_block_size))
+ {
+ /* FIELD_NORMAL can't be split over many blocks, convert to a CHAR */
+ type= column->type= FIELD_SKIP_ENDSPACE;
+ }
+ }
+
+ if (type != FIELD_NORMAL && type != FIELD_CHECK)
+ {
+ column->empty_pos= packed/8;
+ column->empty_bit= (1 << (packed & 7));
+ if (type == FIELD_BLOB)
+ {
+ forced_packed= 1;
+ packed++;
+ share.base.blobs++;
+ if (pack_reclength != INT_MAX32)
+ {
+ if (column->length == 4+portable_sizeof_char_ptr)
+ pack_reclength= INT_MAX32;
+ else
+ {
+ /* Add max possible blob length */
+ pack_reclength+= (1 << ((column->length-
+ portable_sizeof_char_ptr)*8));
+ }
+ }
+ max_field_lengths+= (column->length - portable_sizeof_char_ptr);
+ }
+ else if (type == FIELD_SKIP_PRESPACE ||
+ type == FIELD_SKIP_ENDSPACE)
+ {
+ forced_packed= 1;
+ max_field_lengths+= column->length > 255 ? 2 : 1;
+ not_block_record_extra_length++;
+ packed++;
+ }
+ else if (type == FIELD_VARCHAR)
+ {
+ varchar_length+= column->length-1; /* Used for min_pack_length */
+ pack_reclength++;
+ not_block_record_extra_length++;
+ max_field_lengths++;
+ packed++;
+ column->fill_length= 1;
+ options|= HA_OPTION_NULL_FIELDS; /* Use ma_checksum() */
+
+ /* We must test for 257 as length includes pack-length */
+ if (test(column->length >= 257))
+ {
+ long_varchar_count++;
+ max_field_lengths++;
+ column->fill_length= 2;
+ }
+ }
+ else if (type == FIELD_SKIP_ZERO)
+ packed++;
+ else
+ {
+ if (!column->null_bit)
+ min_pack_length+= column->length;
+ else
+ {
+ /* Only BLOCK_RECORD skips NULL fields for all field values */
+ not_block_record_extra_length+= column->length;
+ }
+ column->empty_pos= 0;
+ column->empty_bit= 0;
+ }
+ }
+ else /* FIELD_NORMAL */
+ {
+ if (!column->null_bit)
+ {
+ min_pack_length+= column->length;
+ share.base.fixed_not_null_fields++;
+ share.base.fixed_not_null_fields_length+= column->length;
+ }
+ else
+ not_block_record_extra_length+= column->length;
+ }
+ }
+
+ if (datafile_type == STATIC_RECORD && forced_packed)
+ {
+ /* Can't use fixed length records, revert to block records */
+ datafile_type= BLOCK_RECORD;
+ }
+
+ if (datafile_type == DYNAMIC_RECORD)
+ options|= HA_OPTION_PACK_RECORD; /* Must use packed records */
+
+ if (datafile_type == STATIC_RECORD)
+ {
+ /* We can't use checksum with static length rows */
+ flags&= ~HA_CREATE_CHECKSUM;
+ options&= ~HA_OPTION_CHECKSUM;
+ min_pack_length= reclength;
+ packed= 0;
+ }
+ else if (datafile_type != BLOCK_RECORD)
+ min_pack_length+= not_block_record_extra_length;
+ else
+ min_pack_length+= 5; /* Min row overhead */
+
+ if (flags & HA_CREATE_TMP_TABLE)
+ {
+ options|= HA_OPTION_TMP_TABLE;
+ tmp_table= TRUE;
+ create_mode|= O_NOFOLLOW;
+ /* "CREATE TEMPORARY" tables are not crash-safe (dropped at restart) */
+ ci->transactional= FALSE;
+ flags&= ~HA_CREATE_PAGE_CHECKSUM;
+ }
+ share.base.null_bytes= ci->null_bytes;
+ share.base.original_null_bytes= ci->null_bytes;
+ share.base.born_transactional= ci->transactional;
+ share.base.max_field_lengths= max_field_lengths;
+ share.base.field_offsets= 0; /* for future */
+
+ if (flags & HA_CREATE_CHECKSUM || (options & HA_OPTION_CHECKSUM))
+ {
+ options|= HA_OPTION_CHECKSUM;
+ min_pack_length++;
+ pack_reclength++;
+ }
+ if (pack_reclength < INT_MAX32)
+ pack_reclength+= max_field_lengths + long_varchar_count;
+ else
+ pack_reclength= INT_MAX32;
+
+ if (flags & HA_CREATE_DELAY_KEY_WRITE)
+ options|= HA_OPTION_DELAY_KEY_WRITE;
+ if (flags & HA_CREATE_RELIES_ON_SQL_LAYER)
+ options|= HA_OPTION_RELIES_ON_SQL_LAYER;
+ if (flags & HA_CREATE_PAGE_CHECKSUM)
+ options|= HA_OPTION_PAGE_CHECKSUM;
+
+ pack_bytes= (packed + 7) / 8;
+ if (pack_reclength != INT_MAX32)
+ pack_reclength+= reclength+pack_bytes +
+ test(test_all_bits(options, HA_OPTION_CHECKSUM | HA_OPTION_PACK_RECORD));
+ min_pack_length+= pack_bytes;
+ /* Calculate min possible row length for rows-in-block */
+ extra_header_size= MAX_FIXED_HEADER_SIZE;
+ if (ci->transactional)
+ {
+ extra_header_size= TRANS_MAX_FIXED_HEADER_SIZE;
+ DBUG_PRINT("info",("creating a transactional table"));
+ }
+ share.base.min_block_length= (extra_header_size + share.base.null_bytes +
+ pack_bytes);
+ if (!ci->data_file_length && ci->max_rows)
+ {
+ if (pack_reclength == INT_MAX32 ||
+ (~(ulonglong) 0)/ci->max_rows < (ulonglong) pack_reclength)
+ ci->data_file_length= ~(ulonglong) 0;
+ else
+ ci->data_file_length=(ulonglong) ci->max_rows*pack_reclength;
+ }
+ else if (!ci->max_rows)
+ {
+ if (datafile_type == BLOCK_RECORD)
+ {
+ uint rows_per_page= ((maria_block_size - PAGE_OVERHEAD_SIZE) /
+ (min_pack_length + extra_header_size +
+ DIR_ENTRY_SIZE));
+ ulonglong data_file_length= ci->data_file_length;
+ if (!data_file_length)
+ data_file_length= ((((ulonglong) 1 << ((BLOCK_RECORD_POINTER_SIZE-1) *
+ 8)) -1) * maria_block_size);
+ if (rows_per_page > 0)
+ {
+ set_if_smaller(rows_per_page, MAX_ROWS_PER_PAGE);
+ ci->max_rows= data_file_length / maria_block_size * rows_per_page;
+ }
+ else
+ ci->max_rows= data_file_length / (min_pack_length +
+ extra_header_size +
+ DIR_ENTRY_SIZE);
+ }
+ else
+ ci->max_rows=(ha_rows) (ci->data_file_length/(min_pack_length +
+ ((options &
+ HA_OPTION_PACK_RECORD) ?
+ 3 : 0)));
+ }
+ max_rows= (ulonglong) ci->max_rows;
+ if (datafile_type == BLOCK_RECORD)
+ {
+ /*
+      The + 1 is for record position within page
+ The / 2 is because we need one bit for knowing if there is transid's
+ after the row pointer
+ */
+ pointer= maria_get_pointer_length((ci->data_file_length /
+ (maria_block_size * 2)), 3) + 1;
+ set_if_smaller(pointer, BLOCK_RECORD_POINTER_SIZE);
+
+ if (!max_rows)
+ max_rows= (((((ulonglong) 1 << ((pointer-1)*8)) -1) * maria_block_size) /
+ min_pack_length / 2);
+ }
+ else
+ {
+ if (datafile_type != STATIC_RECORD)
+ pointer= maria_get_pointer_length(ci->data_file_length,
+ maria_data_pointer_size);
+ else
+ pointer= maria_get_pointer_length(ci->max_rows, maria_data_pointer_size);
+ if (!max_rows)
+ max_rows= ((((ulonglong) 1 << (pointer*8)) -1) / min_pack_length);
+ }
+
+ real_reclength=reclength;
+ if (datafile_type == STATIC_RECORD)
+ {
+ if (reclength <= pointer)
+ reclength=pointer+1; /* reserve place for delete link */
+ }
+ else
+ reclength+= long_varchar_count; /* We need space for varchar! */
+
+ max_key_length=0; tot_length=0 ; key_segs=0;
+ fulltext_keys=0;
+ share.state.rec_per_key_part= rec_per_key_part;
+ share.state.nulls_per_key_part= nulls_per_key_part;
+ share.state.key_root=key_root;
+ share.state.key_del= HA_OFFSET_ERROR;
+ if (uniques)
+ max_key_length= MARIA_UNIQUE_HASH_LENGTH + pointer;
+
+ for (i=0, keydef=keydefs ; i < keys ; i++ , keydef++)
+ {
+ share.state.key_root[i]= HA_OFFSET_ERROR;
+ length= real_length_diff= 0;
+ min_key_length= key_length= pointer;
+
+ if (keydef->key_alg == HA_KEY_ALG_RTREE)
+ keydef->flag|= HA_RTREE_INDEX; /* For easier tests */
+
+ if (keydef->flag & HA_SPATIAL)
+ {
+#ifdef HAVE_SPATIAL
+ /* BAR TODO to support 3D and more dimensions in the future */
+ uint sp_segs=SPDIMS*2;
+ keydef->flag=HA_SPATIAL;
+
+ if (flags & HA_DONT_TOUCH_DATA)
+ {
+ /*
+ Called by maria_chk - i.e. table structure was taken from
+ MYI file and SPATIAL key *does have* additional sp_segs keysegs.
+ keydef->seg here points right at the GEOMETRY segment,
+ so we only need to decrease keydef->keysegs.
+ (see maria_recreate_table() in _ma_check.c)
+ */
+ keydef->keysegs-=sp_segs-1;
+ }
+
+ for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ;
+ j++, keyseg++)
+ {
+ if (keyseg->type != HA_KEYTYPE_BINARY &&
+ keyseg->type != HA_KEYTYPE_VARBINARY1 &&
+ keyseg->type != HA_KEYTYPE_VARBINARY2)
+ {
+ my_errno=HA_WRONG_CREATE_OPTION;
+ goto err_no_lock;
+ }
+ }
+ keydef->keysegs+=sp_segs;
+ key_length+=SPLEN*sp_segs;
+ length++; /* At least one length uchar */
+ min_key_length++;
+#else
+ my_errno= HA_ERR_UNSUPPORTED;
+ goto err_no_lock;
+#endif /*HAVE_SPATIAL*/
+ }
+ else if (keydef->flag & HA_FULLTEXT)
+ {
+ keydef->flag=HA_FULLTEXT | HA_PACK_KEY | HA_VAR_LENGTH_KEY;
+ options|=HA_OPTION_PACK_KEYS; /* Using packed keys */
+
+ for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ;
+ j++, keyseg++)
+ {
+ if (keyseg->type != HA_KEYTYPE_TEXT &&
+ keyseg->type != HA_KEYTYPE_VARTEXT1 &&
+ keyseg->type != HA_KEYTYPE_VARTEXT2)
+ {
+ my_errno=HA_WRONG_CREATE_OPTION;
+ goto err_no_lock;
+ }
+ if (!(keyseg->flag & HA_BLOB_PART) &&
+ (keyseg->type == HA_KEYTYPE_VARTEXT1 ||
+ keyseg->type == HA_KEYTYPE_VARTEXT2))
+ {
+ /* Make a flag that this is a VARCHAR */
+ keyseg->flag|= HA_VAR_LENGTH_PART;
+ /* Store in bit_start number of bytes used to pack the length */
+ keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1)?
+ 1 : 2);
+ }
+ }
+
+ fulltext_keys++;
+ key_length+= HA_FT_MAXBYTELEN+HA_FT_WLEN;
+ length++; /* At least one length uchar */
+ min_key_length+= 1 + HA_FT_WLEN;
+ real_length_diff=HA_FT_MAXBYTELEN-FT_MAX_WORD_LEN_FOR_SORT;
+ }
+ else
+ {
+ /* Test if prefix compression */
+ if (keydef->flag & HA_PACK_KEY)
+ {
+ /* Can't use space_compression on number keys */
+ if ((keydef->seg[0].flag & HA_SPACE_PACK) &&
+ keydef->seg[0].type == (int) HA_KEYTYPE_NUM)
+ keydef->seg[0].flag&= ~HA_SPACE_PACK;
+
+ /* Only use HA_PACK_KEY when first segment is a variable length key */
+ if (!(keydef->seg[0].flag & (HA_SPACE_PACK | HA_BLOB_PART |
+ HA_VAR_LENGTH_PART)))
+ {
+ /* pack relative to previous key */
+ keydef->flag&= ~HA_PACK_KEY;
+ keydef->flag|= HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY;
+ }
+ else
+ {
+          keydef->seg[0].flag|=HA_PACK_KEY; /* for easier internal tests */
+ keydef->flag|=HA_VAR_LENGTH_KEY;
+ options|=HA_OPTION_PACK_KEYS; /* Using packed keys */
+ }
+ }
+ if (keydef->flag & HA_BINARY_PACK_KEY)
+ options|=HA_OPTION_PACK_KEYS; /* Using packed keys */
+
+ if (keydef->flag & HA_AUTO_KEY && ci->with_auto_increment)
+ share.base.auto_key=i+1;
+ for (j=0, keyseg=keydef->seg ; j < keydef->keysegs ; j++, keyseg++)
+ {
+ /* numbers are stored with high by first to make compression easier */
+ switch (keyseg->type) {
+ case HA_KEYTYPE_SHORT_INT:
+ case HA_KEYTYPE_LONG_INT:
+ case HA_KEYTYPE_FLOAT:
+ case HA_KEYTYPE_DOUBLE:
+ case HA_KEYTYPE_USHORT_INT:
+ case HA_KEYTYPE_ULONG_INT:
+ case HA_KEYTYPE_LONGLONG:
+ case HA_KEYTYPE_ULONGLONG:
+ case HA_KEYTYPE_INT24:
+ case HA_KEYTYPE_UINT24:
+ case HA_KEYTYPE_INT8:
+ keyseg->flag|= HA_SWAP_KEY;
+ break;
+ case HA_KEYTYPE_VARTEXT1:
+ case HA_KEYTYPE_VARTEXT2:
+ case HA_KEYTYPE_VARBINARY1:
+ case HA_KEYTYPE_VARBINARY2:
+ if (!(keyseg->flag & HA_BLOB_PART))
+ {
+ /* Make a flag that this is a VARCHAR */
+ keyseg->flag|= HA_VAR_LENGTH_PART;
+ /* Store in bit_start number of bytes used to pack the length */
+ keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1 ||
+ keyseg->type == HA_KEYTYPE_VARBINARY1) ?
+ 1 : 2);
+ }
+ break;
+ default:
+ break;
+ }
+ if (keyseg->flag & HA_SPACE_PACK)
+ {
+ DBUG_ASSERT(!(keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART)));
+ keydef->flag |= HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY;
+ options|=HA_OPTION_PACK_KEYS; /* Using packed keys */
+ length++; /* At least one length uchar */
+ if (!keyseg->null_bit)
+ min_key_length++;
+ key_length+= keyseg->length;
+ if (keyseg->length >= 255)
+ {
+ /* prefix may be 3 bytes */
+ length+= 2;
+ }
+ }
+ else if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))
+ {
+ DBUG_ASSERT(!test_all_bits(keyseg->flag,
+ (HA_VAR_LENGTH_PART | HA_BLOB_PART)));
+ keydef->flag|=HA_VAR_LENGTH_KEY;
+ length++; /* At least one length uchar */
+ if (!keyseg->null_bit)
+ min_key_length++;
+ options|=HA_OPTION_PACK_KEYS; /* Using packed keys */
+ key_length+= keyseg->length;
+ if (keyseg->length >= 255)
+ {
+ /* prefix may be 3 bytes */
+ length+= 2;
+ }
+ }
+ else
+ {
+ key_length+= keyseg->length;
+ if (!keyseg->null_bit)
+ min_key_length+= keyseg->length;
+ }
+ if (keyseg->null_bit)
+ {
+ key_length++;
+ /* min key part is 1 byte */
+ min_key_length++;
+ options|=HA_OPTION_PACK_KEYS;
+ keyseg->flag|=HA_NULL_PART;
+ keydef->flag|=HA_VAR_LENGTH_KEY | HA_NULL_PART_KEY;
+ }
+ }
+ } /* if HA_FULLTEXT */
+ key_segs+=keydef->keysegs;
+ if (keydef->keysegs > HA_MAX_KEY_SEG)
+ {
+ my_errno=HA_WRONG_CREATE_OPTION;
+ goto err_no_lock;
+ }
+ /*
+ key_segs may be 0 in the case when we only want to be able to
+      add one row into the table. This can happen with some DISTINCT queries
+ in MySQL
+ */
+ if ((keydef->flag & (HA_NOSAME | HA_NULL_PART_KEY)) == HA_NOSAME &&
+ key_segs)
+ share.state.rec_per_key_part[key_segs-1]=1L;
+ length+=key_length;
+ /*
+      A key can't be longer than half an index block (as we have
+ to be able to put at least 2 keys on an index block for the key
+ algorithms to work).
+ */
+ if (length > maria_max_key_length())
+ {
+ my_errno=HA_WRONG_CREATE_OPTION;
+ goto err_no_lock;
+ }
+ keydef->block_length= (uint16) maria_block_size;
+ keydef->keylength= (uint16) key_length;
+ keydef->minlength= (uint16) min_key_length;
+ keydef->maxlength= (uint16) length;
+
+ if (length > max_key_length)
+ max_key_length= length;
+ tot_length+= ((max_rows/(ulong) (((uint) maria_block_size -
+ MAX_KEYPAGE_HEADER_SIZE -
+ KEYPAGE_CHECKSUM_SIZE)/
+ (length*2))) *
+ maria_block_size);
+ }
+
+ unique_key_parts=0;
+ for (i=0, uniquedef=uniquedefs ; i < uniques ; i++ , uniquedef++)
+ {
+ uniquedef->key=keys+i;
+ unique_key_parts+=uniquedef->keysegs;
+ share.state.key_root[keys+i]= HA_OFFSET_ERROR;
+ tot_length+= (max_rows/(ulong) (((uint) maria_block_size -
+ MAX_KEYPAGE_HEADER_SIZE -
+ KEYPAGE_CHECKSUM_SIZE) /
+ ((MARIA_UNIQUE_HASH_LENGTH + pointer)*2)))*
+ (ulong) maria_block_size;
+ }
+ keys+=uniques; /* Each unique has 1 key */
+ key_segs+=uniques; /* Each unique has 1 key seg */
+
+ base_pos=(MARIA_STATE_INFO_SIZE + keys * MARIA_STATE_KEY_SIZE +
+ key_segs * MARIA_STATE_KEYSEG_SIZE);
+ info_length= base_pos+(uint) (MARIA_BASE_INFO_SIZE+
+ keys * MARIA_KEYDEF_SIZE+
+ uniques * MARIA_UNIQUEDEF_SIZE +
+ (key_segs + unique_key_parts)*HA_KEYSEG_SIZE+
+ columns*(MARIA_COLUMNDEF_SIZE + 2));
+
+ DBUG_PRINT("info", ("info_length: %u", info_length));
+ /* There are only 16 bits for the total header length. */
+ if (info_length > 65535)
+ {
+ my_printf_error(HA_WRONG_CREATE_OPTION,
+ "Aria table '%s' has too many columns and/or "
+ "indexes and/or unique constraints.",
+ MYF(0), name + dirname_length(name));
+ my_errno= HA_WRONG_CREATE_OPTION;
+ goto err_no_lock;
+ }
+
+ bmove(share.state.header.file_version, maria_file_magic, 4);
+ ci->old_options=options | (ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD ?
+ HA_OPTION_COMPRESS_RECORD |
+ HA_OPTION_TEMP_COMPRESS_RECORD: 0);
+ mi_int2store(share.state.header.options,ci->old_options);
+ mi_int2store(share.state.header.header_length,info_length);
+ mi_int2store(share.state.header.state_info_length,MARIA_STATE_INFO_SIZE);
+ mi_int2store(share.state.header.base_info_length,MARIA_BASE_INFO_SIZE);
+ mi_int2store(share.state.header.base_pos,base_pos);
+ share.state.header.data_file_type= share.data_file_type= datafile_type;
+ share.state.header.org_data_file_type= org_datafile_type;
+ share.state.header.language= (ci->language ?
+ ci->language : default_charset_info->number);
+
+ share.state.dellink = HA_OFFSET_ERROR;
+ share.state.first_bitmap_with_space= 0;
+#ifdef EXTERNAL_LOCKING
+ share.state.process= (ulong) getpid();
+#endif
+ share.state.version= (ulong) time((time_t*) 0);
+ share.state.sortkey= (ushort) ~0;
+ share.state.auto_increment=ci->auto_increment;
+ share.options=options;
+ share.base.rec_reflength=pointer;
+ share.base.block_size= maria_block_size;
+
+ /*
+ Get estimate for index file length (this may be wrong for FT keys)
+ This is used for pointers to other key pages.
+ */
+ tmp= (tot_length + maria_block_size * keys *
+ MARIA_INDEX_BLOCK_MARGIN) / maria_block_size;
+
+ /*
+ use maximum of key_file_length we calculated and key_file_length value we
+ got from MAI file header (see also mariapack.c:save_state)
+ */
+ share.base.key_reflength=
+ maria_get_pointer_length(max(ci->key_file_length,tmp),3);
+ share.base.keys= share.state.header.keys= keys;
+ share.state.header.uniques= uniques;
+ share.state.header.fulltext_keys= fulltext_keys;
+ mi_int2store(share.state.header.key_parts,key_segs);
+ mi_int2store(share.state.header.unique_key_parts,unique_key_parts);
+
+ maria_set_all_keys_active(share.state.key_map, keys);
+
+ share.base.keystart = share.state.state.key_file_length=
+ MY_ALIGN(info_length, maria_block_size);
+ share.base.max_key_block_length= maria_block_size;
+ share.base.max_key_length=ALIGN_SIZE(max_key_length+4);
+ share.base.records=ci->max_rows;
+ share.base.reloc= ci->reloc_rows;
+ share.base.reclength=real_reclength;
+ share.base.pack_reclength=reclength+ test(options & HA_OPTION_CHECKSUM);
+ share.base.max_pack_length=pack_reclength;
+ share.base.min_pack_length=min_pack_length;
+ share.base.pack_bytes= pack_bytes;
+ share.base.fields= columns;
+ share.base.pack_fields= packed;
+
+ if (share.data_file_type == BLOCK_RECORD)
+ {
+ /*
+ we are going to create a first bitmap page, set data_file_length
+ to reflect this, before the state goes to disk
+ */
+ share.state.state.data_file_length= maria_block_size;
+ /* Add length of packed fields + length */
+ share.base.pack_reclength+= share.base.max_field_lengths+3;
+
+ /* Adjust max_pack_length, to be used if we have short rows */
+ if (share.base.max_pack_length < maria_block_size)
+ {
+ share.base.max_pack_length+= FLAG_SIZE;
+ if (ci->transactional)
+ share.base.max_pack_length+= TRANSID_SIZE * 2;
+ }
+ }
+
+ /* max_data_file_length and max_key_file_length are recalculated on open */
+ if (tmp_table)
+ share.base.max_data_file_length= (my_off_t) ci->data_file_length;
+ else if (ci->transactional && translog_status == TRANSLOG_OK &&
+ !maria_in_recovery)
+ {
+ /*
+ we have checked translog_inited above, because maria_chk may call us
+ (via maria_recreate_table()) and it does not have a log.
+ */
+ sync_dir= MY_SYNC_DIR;
+ /*
+ If crash between _ma_state_info_write_sub() and
+ _ma_update_state__lsns_sub(), table should be ignored by Recovery (or
+ old REDOs would fail), so we cannot let LSNs be 0:
+ */
+ share.state.skip_redo_lsn= share.state.is_of_horizon=
+ share.state.create_rename_lsn= LSN_MAX;
+ }
+
+ if (datafile_type == DYNAMIC_RECORD)
+ {
+ share.base.min_block_length=
+ (share.base.pack_reclength+3 < MARIA_EXTEND_BLOCK_LENGTH &&
+ ! share.base.blobs) ?
+ max(share.base.pack_reclength,MARIA_MIN_BLOCK_LENGTH) :
+ MARIA_EXTEND_BLOCK_LENGTH;
+ }
+ else if (datafile_type == STATIC_RECORD)
+ share.base.min_block_length= share.base.pack_reclength;
+
+ if (! (flags & HA_DONT_TOUCH_DATA))
+ share.state.create_time= time((time_t*) 0);
+
+ pthread_mutex_lock(&THR_LOCK_maria);
+
+ /*
+ NOTE: For test_if_reopen() we need a real path name. Hence we need
+ MY_RETURN_REAL_PATH for every fn_format(filename, ...).
+ */
+ if (ci->index_file_name)
+ {
+ char *iext= strrchr(ci->index_file_name, '.');
+ int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT);
+ if (tmp_table)
+ {
+ char *path;
+      /* chop off the table name, temporary tables use a generated name */
+ if ((path= strrchr(ci->index_file_name, FN_LIBCHAR)))
+ *path= '\0';
+ fn_format(filename, name, ci->index_file_name, MARIA_NAME_IEXT,
+ MY_REPLACE_DIR | MY_UNPACK_FILENAME |
+ MY_RETURN_REAL_PATH | MY_APPEND_EXT);
+ }
+ else
+ {
+ fn_format(filename, ci->index_file_name, "", MARIA_NAME_IEXT,
+ MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH |
+ (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT));
+ }
+ fn_format(linkname, name, "", MARIA_NAME_IEXT,
+ MY_UNPACK_FILENAME|MY_APPEND_EXT);
+ linkname_ptr= linkname;
+ /*
+ Don't create the table if the link or file exists to ensure that one
+ doesn't accidently destroy another table.
+ Don't sync dir now if the data file has the same path.
+ */
+ create_flag=
+ (ci->data_file_name &&
+ !strcmp(ci->index_file_name, ci->data_file_name)) ? 0 : sync_dir;
+ }
+ else
+ {
+ char *iext= strrchr(name, '.');
+ int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT);
+ fn_format(filename, name, "", MARIA_NAME_IEXT,
+ MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH |
+ (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT));
+ linkname_ptr= NullS;
+ /*
+ Replace the current file.
+ Don't sync dir now if the data file has the same path.
+ */
+ create_flag= (flags & HA_CREATE_KEEP_FILES) ? 0 : MY_DELETE_OLD;
+ create_flag|= (!ci->data_file_name ? 0 : sync_dir);
+ }
+
+ /*
+ If a MRG_MARIA table is in use, the mapped MARIA tables are open,
+ but no entry is made in the table cache for them.
+ A TRUNCATE command checks for the table in the cache only and could
+ be fooled to believe, the table is not open.
+ Pull the emergency brake in this situation. (Bug #8306)
+
+
+ NOTE: The filename is compared against unique_file_name of every
+ open table. Hence we need a real path here.
+ */
+ if (_ma_test_if_reopen(filename))
+ {
+ my_printf_error(0, "Aria table '%s' is in use "
+ "(most likely by a MERGE table). Try FLUSH TABLES.",
+ MYF(0), name + dirname_length(name));
+ my_errno= HA_ERR_TABLE_EXIST;
+ goto err;
+ }
+
+ if ((file= my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+ MYF(MY_WME|create_flag))) < 0)
+ goto err;
+ errpos=1;
+
+ DBUG_PRINT("info", ("write state info and base info"));
+ if (_ma_state_info_write_sub(file, &share.state,
+ MA_STATE_INFO_WRITE_FULL_INFO) ||
+ _ma_base_info_write(file, &share.base))
+ goto err;
+ DBUG_PRINT("info", ("base_pos: %d base_info_size: %d",
+ base_pos, MARIA_BASE_INFO_SIZE));
+ DBUG_ASSERT(my_tell(file,MYF(0)) == base_pos+ MARIA_BASE_INFO_SIZE);
+
+ /* Write key and keyseg definitions */
+ DBUG_PRINT("info", ("write key and keyseg definitions"));
+ for (i=0 ; i < share.base.keys - uniques; i++)
+ {
+ uint sp_segs=(keydefs[i].flag & HA_SPATIAL) ? 2*SPDIMS : 0;
+
+ if (_ma_keydef_write(file, &keydefs[i]))
+ goto err;
+ for (j=0 ; j < keydefs[i].keysegs-sp_segs ; j++)
+ if (_ma_keyseg_write(file, &keydefs[i].seg[j]))
+ goto err;
+#ifdef HAVE_SPATIAL
+ for (j=0 ; j < sp_segs ; j++)
+ {
+ HA_KEYSEG sseg;
+ sseg.type=SPTYPE;
+ sseg.language= 7; /* Binary */
+ sseg.null_bit=0;
+ sseg.bit_start=0;
+ sseg.bit_end=0;
+ sseg.bit_length= 0;
+ sseg.bit_pos= 0;
+ sseg.length=SPLEN;
+ sseg.null_pos=0;
+ sseg.start=j*SPLEN;
+ sseg.flag= HA_SWAP_KEY;
+ if (_ma_keyseg_write(file, &sseg))
+ goto err;
+ }
+#endif
+ }
+ /* Create extra keys for unique definitions */
+ offset= real_reclength - uniques*MARIA_UNIQUE_HASH_LENGTH;
+ bzero((char*) &tmp_keydef,sizeof(tmp_keydef));
+ bzero((char*) &tmp_keyseg,sizeof(tmp_keyseg));
+ for (i=0; i < uniques ; i++)
+ {
+ tmp_keydef.keysegs=1;
+ tmp_keydef.flag= HA_UNIQUE_CHECK;
+ tmp_keydef.block_length= (uint16) maria_block_size;
+ tmp_keydef.keylength= MARIA_UNIQUE_HASH_LENGTH + pointer;
+ tmp_keydef.minlength=tmp_keydef.maxlength=tmp_keydef.keylength;
+ tmp_keyseg.type= MARIA_UNIQUE_HASH_TYPE;
+ tmp_keyseg.length= MARIA_UNIQUE_HASH_LENGTH;
+ tmp_keyseg.start= offset;
+ offset+= MARIA_UNIQUE_HASH_LENGTH;
+ if (_ma_keydef_write(file,&tmp_keydef) ||
+ _ma_keyseg_write(file,(&tmp_keyseg)))
+ goto err;
+ }
+
+ /* Save unique definition */
+ DBUG_PRINT("info", ("write unique definitions"));
+ for (i=0 ; i < share.state.header.uniques ; i++)
+ {
+ HA_KEYSEG *keyseg_end;
+ keyseg= uniquedefs[i].seg;
+ if (_ma_uniquedef_write(file, &uniquedefs[i]))
+ goto err;
+ for (keyseg= uniquedefs[i].seg, keyseg_end= keyseg+ uniquedefs[i].keysegs;
+ keyseg < keyseg_end;
+ keyseg++)
+ {
+ switch (keyseg->type) {
+ case HA_KEYTYPE_VARTEXT1:
+ case HA_KEYTYPE_VARTEXT2:
+ case HA_KEYTYPE_VARBINARY1:
+ case HA_KEYTYPE_VARBINARY2:
+ if (!(keyseg->flag & HA_BLOB_PART))
+ {
+ keyseg->flag|= HA_VAR_LENGTH_PART;
+ keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1 ||
+ keyseg->type == HA_KEYTYPE_VARBINARY1) ?
+ 1 : 2);
+ }
+ break;
+ default:
+ DBUG_ASSERT((keyseg->flag & HA_VAR_LENGTH_PART) == 0);
+ break;
+ }
+ if (_ma_keyseg_write(file, keyseg))
+ goto err;
+ }
+ }
+ DBUG_PRINT("info", ("write field definitions"));
+ if (datafile_type == BLOCK_RECORD)
+ {
+    /* Store columns in a more efficient order */
+ MARIA_COLUMNDEF **col_order, **pos;
+ if (!(col_order= (MARIA_COLUMNDEF**) my_malloc(share.base.fields *
+ sizeof(MARIA_COLUMNDEF*),
+ MYF(MY_WME))))
+ goto err;
+ for (column= columndef, pos= col_order ;
+ column != end_column ;
+ column++, pos++)
+ *pos= column;
+ qsort(col_order, share.base.fields, sizeof(*col_order),
+ (qsort_cmp) compare_columns);
+ for (i=0 ; i < share.base.fields ; i++)
+ {
+ column_array[col_order[i]->column_nr]= i;
+ if (_ma_columndef_write(file, col_order[i]))
+ {
+ my_free(col_order, MYF(0));
+ goto err;
+ }
+ }
+ my_free(col_order, MYF(0));
+ }
+ else
+ {
+ for (i=0 ; i < share.base.fields ; i++)
+ {
+ column_array[i]= (uint16) i;
+ if (_ma_columndef_write(file, &columndef[i]))
+ goto err;
+ }
+ }
+ if (_ma_column_nr_write(file, column_array, columns))
+ goto err;
+
+ if ((kfile_size_before_extension= my_tell(file,MYF(0))) == MY_FILEPOS_ERROR)
+ goto err;
+#ifndef DBUG_OFF
+ if (kfile_size_before_extension != info_length)
+ DBUG_PRINT("warning",("info_length: %u != used_length: %u",
+ info_length, (uint)kfile_size_before_extension));
+#endif
+
+ if (sync_dir)
+ {
+ /*
+ we log the first bytes and then the size to which we extend; this is
+ to avoid logging 1 KB of mostly zeroes if this is a small table.
+ */
+ char empty_string[]= "";
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+ translog_size_t total_rec_length= 0;
+ uint k;
+ LSN lsn;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= 1 + 2 + 2 +
+ (uint) kfile_size_before_extension;
+ /* we may need about 64 kB, so don't use the stack */
+ log_data= my_malloc(log_array[TRANSLOG_INTERNAL_PARTS + 1].length, MYF(0));
+ if ((log_data == NULL) ||
+ my_pread(file, 1 + 2 + 2 + log_data,
+ (size_t) kfile_size_before_extension, 0, MYF(MY_NABP)))
+ goto err;
+ /*
+ remember if the data file was created or not, to know if Recovery can
+ do it or not, in the future
+ */
+ log_data[0]= test(flags & HA_DONT_TOUCH_DATA);
+ int2store(log_data + 1, kfile_size_before_extension);
+ int2store(log_data + 1 + 2, share.base.keystart);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar *)name;
+ /* we store the end-zero, for Recovery to just pass it to my_create() */
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= strlen(name) + 1;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= log_data;
+ /* symlink description is also needed for re-creation by Recovery: */
+ {
+ const char *s= ci->data_file_name ? ci->data_file_name : empty_string;
+ log_array[TRANSLOG_INTERNAL_PARTS + 2].str= (uchar*)s;
+ log_array[TRANSLOG_INTERNAL_PARTS + 2].length= strlen(s) + 1;
+ s= ci->index_file_name ? ci->index_file_name : empty_string;
+ log_array[TRANSLOG_INTERNAL_PARTS + 3].str= (uchar*)s;
+ log_array[TRANSLOG_INTERNAL_PARTS + 3].length= strlen(s) + 1;
+ }
+ for (k= TRANSLOG_INTERNAL_PARTS;
+ k < (sizeof(log_array)/sizeof(log_array[0])); k++)
+ total_rec_length+= (translog_size_t) log_array[k].length;
+ /**
+ For this record to be of any use for Recovery, we need the upper
+ MySQL layer to be crash-safe, which it is not now (that would require
+ work using the ddl_log of sql/sql_table.cc); when it is, we should
+ reconsider the moment of writing this log record (before or after op,
+ under THR_LOCK_maria or not...), how to use it in Recovery.
+ For now this record can serve when we apply logs to a backup,
+ so we sync it. This happens before the data file is created. If the
+ data file was created before, and we crashed before writing the log
+ record, at restart the table may be used, so we would not have a
+ trustable history in the log (impossible to apply this log to a
+ backup). The way we do it, if we crash before writing the log record
+ then there is no data file and the table cannot be used.
+ @todo Note that in case of TRUNCATE TABLE we also come here; for
+ Recovery to be able to finish TRUNCATE TABLE, instead of leaving a
+ half-truncated table, we should log the record at start of
+ maria_create(); for that we shouldn't write to the index file but to a
+ buffer (DYNAMIC_STRING), put the buffer into the record, then put the
+ buffer into the index file (so, change _ma_keydef_write() etc). That
+ would also enable Recovery to finish a CREATE TABLE. The final result
+ would be that we would be able to finish what the SQL layer has asked
+ for: it would be atomic.
+ When in CREATE/TRUNCATE (or DROP or RENAME or REPAIR) we have not
+ called external_lock(), so have no TRN. It does not matter, as all
+ these operations are non-transactional and sync their files.
+ */
+ if (unlikely(translog_write_record(&lsn,
+ LOGREC_REDO_CREATE_TABLE,
+ &dummy_transaction_object, NULL,
+ total_rec_length,
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, NULL, NULL) ||
+ translog_flush(lsn)))
+ goto err;
+ share.kfile.file= file;
+ DBUG_EXECUTE_IF("maria_flush_whole_log",
+ {
+ DBUG_PRINT("maria_flush_whole_log", ("now"));
+ translog_flush(translog_get_horizon());
+ });
+ DBUG_EXECUTE_IF("maria_crash_create_table",
+ {
+ DBUG_PRINT("maria_crash_create_table", ("now"));
+ DBUG_ABORT();
+ });
+ /*
+ store LSN into file, needed for Recovery to not be confused if a
+ DROP+CREATE happened (applying REDOs to the wrong table).
+ */
+ if (_ma_update_state_lsns_sub(&share, lsn, trnman_get_min_safe_trid(),
+ FALSE, TRUE))
+ goto err;
+ my_free(log_data, MYF(0));
+ }
+
+ if (!(flags & HA_DONT_TOUCH_DATA))
+ {
+ if (ci->data_file_name)
+ {
+ char *dext= strrchr(ci->data_file_name, '.');
+ int have_dext= dext && !strcmp(dext, MARIA_NAME_DEXT);
+
+ if (tmp_table)
+ {
+ char *path;
+ /* chop off the table name, temporary tables use a generated name */
+ if ((path= strrchr(ci->data_file_name, FN_LIBCHAR)))
+ *path= '\0';
+ fn_format(filename, name, ci->data_file_name, MARIA_NAME_DEXT,
+ MY_REPLACE_DIR | MY_UNPACK_FILENAME | MY_APPEND_EXT);
+ }
+ else
+ {
+ fn_format(filename, ci->data_file_name, "", MARIA_NAME_DEXT,
+ MY_UNPACK_FILENAME |
+ (have_dext ? MY_REPLACE_EXT : MY_APPEND_EXT));
+ }
+ fn_format(linkname, name, "",MARIA_NAME_DEXT,
+ MY_UNPACK_FILENAME | MY_APPEND_EXT);
+ linkname_ptr= linkname;
+ create_flag=0;
+ }
+ else
+ {
+ fn_format(filename,name,"", MARIA_NAME_DEXT,
+ MY_UNPACK_FILENAME | MY_APPEND_EXT);
+ linkname_ptr= NullS;
+ create_flag= (flags & HA_CREATE_KEEP_FILES) ? 0 : MY_DELETE_OLD;
+ }
+ if ((dfile=
+ my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+ MYF(MY_WME | create_flag | sync_dir))) < 0)
+ goto err;
+ errpos=3;
+
+ if (_ma_initialize_data_file(&share, dfile))
+ goto err;
+ }
+
+ /* Enlarge files */
+ DBUG_PRINT("info", ("enlarge to keystart: %lu",
+ (ulong) share.base.keystart));
+ if (my_chsize(file,(ulong) share.base.keystart,0,MYF(0)))
+ goto err;
+
+ if (sync_dir && my_sync(file, MYF(0)))
+ goto err;
+
+ if (! (flags & HA_DONT_TOUCH_DATA))
+ {
+#ifdef USE_RELOC
+ if (my_chsize(dfile,share.base.min_pack_length*ci->reloc_rows,0,MYF(0)))
+ goto err;
+#endif
+ if (sync_dir && my_sync(dfile, MYF(0)))
+ goto err;
+ if (my_close(dfile,MYF(0)))
+ goto err;
+ }
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ res= 0;
+ my_free((char*) rec_per_key_part,MYF(0));
+ errpos=0;
+ if (my_close(file,MYF(0)))
+ res= my_errno;
+ DBUG_RETURN(res);
+
+err:
+ pthread_mutex_unlock(&THR_LOCK_maria);
+
+err_no_lock:
+ save_errno=my_errno;
+ switch (errpos) {
+ case 3:
+ VOID(my_close(dfile,MYF(0)));
+ /* fall through */
+ case 2:
+ if (! (flags & HA_DONT_TOUCH_DATA))
+ my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_DEXT,
+ MY_UNPACK_FILENAME | MY_APPEND_EXT),
+ sync_dir);
+ /* fall through */
+ case 1:
+ VOID(my_close(file,MYF(0)));
+ if (! (flags & HA_DONT_TOUCH_DATA))
+ my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_IEXT,
+ MY_UNPACK_FILENAME | MY_APPEND_EXT),
+ sync_dir);
+ }
+ my_free(log_data, MYF(MY_ALLOW_ZERO_PTR));
+ my_free((char*) rec_per_key_part, MYF(0));
+ DBUG_RETURN(my_errno=save_errno); /* return the fatal errno */
+}
+
+
+/**
+  @brief Compute the number of bytes needed to store a file position.
+
+  @param file_length  Maximum file length the table may reach;
+                      0 means "keep the supplied default".
+  @param def          Default pointer length; must be in [2,7]
+
+  @return Pointer length in bytes (2..7) sufficient to address
+          file_length (8 only if 8-byte pointers are enabled).
+*/
+
+uint maria_get_pointer_length(ulonglong file_length, uint def)
+{
+ DBUG_ASSERT(def >= 2 && def <= 7);
+ if (file_length) /* If not default */
+ {
+#ifdef NOT_YET_READY_FOR_8_BYTE_POINTERS
+ if (file_length >= (ULL(1) << 56))
+ def=8;
+ else
+#endif
+ /* Each threshold is the first length that no longer fits in def bytes */
+ if (file_length >= (ULL(1) << 48))
+ def=7;
+ else if (file_length >= (ULL(1) << 40))
+ def=6;
+ else if (file_length >= (ULL(1) << 32))
+ def=5;
+ else if (file_length >= (ULL(1) << 24))
+ def=4;
+ else if (file_length >= (ULL(1) << 16))
+ def=3;
+ else
+ def=2;
+ }
+ return def;
+}
+
+
+/*
+ Sort columns for records-in-block
+
+ IMPLEMENTATION
+ Sort columns in following order:
+
+ Fixed size, not null columns
+ Fixed length, null fields
+ Numbers (zero fill fields)
+ Variable length fields (CHAR, VARCHAR) according to length
+ Blobs
+
+ For same kind of fields, keep fields in original order
+*/
+
+/* Three-way sign helper for qsort comparators: -1, 0 or 1 */
+static inline int sign(long a)
+{
+ return a < 0 ? -1 : (a > 0 ? 1 : 0);
+}
+
+
+/*
+  qsort() callback ordering columns as described in the comment above:
+  fixed not-null first, then fixed nullable, zero-fill numbers, variable
+  length fields by length, blobs last. Ties are broken by the original
+  column offset, keeping equal-kind fields in their original order.
+  FIELD_CHECK is treated as FIELD_NORMAL for ordering purposes.
+*/
+static int compare_columns(MARIA_COLUMNDEF **a_ptr, MARIA_COLUMNDEF **b_ptr)
+{
+ MARIA_COLUMNDEF *a= *a_ptr, *b= *b_ptr;
+ enum en_fieldtype a_type, b_type;
+
+ a_type= (a->type == FIELD_CHECK) ? FIELD_NORMAL : a->type;
+ b_type= (b->type == FIELD_CHECK) ? FIELD_NORMAL : b->type;
+
+ /* Fixed-size NOT NULL columns sort before everything else */
+ if (a_type == FIELD_NORMAL && !a->null_bit)
+ {
+ if (b_type != FIELD_NORMAL || b->null_bit)
+ return -1;
+ return sign((long) a->offset - (long) b->offset);
+ }
+ if (b_type == FIELD_NORMAL && !b->null_bit)
+ return 1;
+ if (a_type == b_type)
+ return sign((long) a->offset - (long) b->offset);
+ if (a_type == FIELD_NORMAL)
+ return -1;
+ if (b_type == FIELD_NORMAL)
+ return 1;
+ if (a_type == FIELD_SKIP_ZERO)
+ return -1;
+ if (b_type == FIELD_SKIP_ZERO)
+ return 1;
+ /* Different variable-length kinds: shorter fields first */
+ if (a->type != FIELD_BLOB && b->type != FIELD_BLOB)
+ if (a->length != b->length)
+ return sign((long) a->length - (long) b->length);
+ if (a_type == FIELD_BLOB)
+ return 1;
+ if (b_type == FIELD_BLOB)
+ return -1;
+ return sign((long) a->offset - (long) b->offset);
+}
+
+
+/**
+ @brief Initialize data file
+
+ @note
+ In BLOCK_RECORD, a freshly created datafile is one page long; while in
+ other formats it is 0-byte long.
+ */
+
+int _ma_initialize_data_file(MARIA_SHARE *share, File dfile)
+{
+ if (share->data_file_type == BLOCK_RECORD)
+ {
+ /* BLOCK_RECORD needs its first bitmap page written up front */
+ share->bitmap.block_size= share->base.block_size;
+ share->bitmap.file.file = dfile;
+ return _ma_bitmap_create_first(share);
+ }
+ /* Other record formats start with an empty (0-byte) data file */
+ return 0;
+}
+
+
+/**
+ @brief Writes create_rename_lsn, skip_redo_lsn and is_of_horizon to disk,
+ can force.
+
+ This is for special cases where:
+ - we don't want to write the full state to disk (so, not call
+ _ma_state_info_write()) because some parts of the state may be
+ currently inconsistent, or because it would be overkill
+ - we must sync these LSNs immediately for correctness.
+ It acquires intern_lock to protect the LSNs and state write.
+
+ @param share table's share
+ @param lsn LSN to write to log files
+ @param create_trid Trid to be used as state.create_trid
+ @param do_sync if the write should be forced to disk
+ @param update_create_rename_lsn if this LSN should be updated or not
+
+ @return Operation status
+ @retval 0 ok
+ @retval 1 error (disk problem)
+*/
+
+/* Locking wrapper: takes intern_lock around _ma_update_state_lsns_sub() */
+int _ma_update_state_lsns(MARIA_SHARE *share, LSN lsn, TrID create_trid,
+ my_bool do_sync, my_bool update_create_rename_lsn)
+{
+ int res;
+ pthread_mutex_lock(&share->intern_lock);
+ res= _ma_update_state_lsns_sub(share, lsn, create_trid, do_sync,
+ update_create_rename_lsn);
+ pthread_mutex_unlock(&share->intern_lock);
+ return res;
+}
+
+
+/**
+ @brief Writes create_rename_lsn, skip_redo_lsn and is_of_horizon to disk,
+ can force.
+
+ Shortcut of _ma_update_state_lsns() when we know that intern_lock is not
+ needed (when creating a table or opening it for the first time).
+
+ @param share table's share
+ @param lsn LSN to write to state; if LSN_IMPOSSIBLE, write
+ a LOGREC_IMPORTED_TABLE and use its LSN as lsn.
+ @param create_trid Trid to be used as state.create_trid
+ @param do_sync if the write should be forced to disk
+ @param update_create_rename_lsn if this LSN should be updated or not
+
+ @return Operation status
+ @retval 0 ok
+ @retval 1 error (disk problem)
+*/
+
+#if (_MSC_VER == 1310)
+/*
+ Visual Studio 2003 compiler produces internal compiler error
+ in this function. Disable optimizations to workaround.
+*/
+#pragma optimize("",off)
+#endif
+int _ma_update_state_lsns_sub(MARIA_SHARE *share, LSN lsn, TrID create_trid,
+ my_bool do_sync,
+ my_bool update_create_rename_lsn)
+{
+ /*
+   buf holds three consecutive LSNs written at
+   MARIA_FILE_CREATE_RENAME_LSN_OFFSET; the first slot is
+   create_rename_lsn (overwritten below if it must be preserved).
+ */
+ uchar buf[LSN_STORE_SIZE * 3], *ptr;
+ uchar trid_buff[8];
+ File file= share->kfile.file;
+ DBUG_ASSERT(file >= 0);
+
+ if (lsn == LSN_IMPOSSIBLE)
+ {
+ /* No LSN given: log a LOGREC_IMPORTED_TABLE and use its LSN instead */
+ int res;
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ /* table name is logged only for information */
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
+ (uchar *)(share->open_file_name.str);
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length=
+ share->open_file_name.length + 1;
+ if ((res= translog_write_record(&lsn, LOGREC_IMPORTED_TABLE,
+ &dummy_transaction_object, NULL,
+ (translog_size_t)
+ log_array[TRANSLOG_INTERNAL_PARTS +
+ 0].length,
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, NULL, NULL)))
+ return res;
+ }
+
+ /* Fill all three on-disk LSN slots with the same value */
+ for (ptr= buf; ptr < (buf + sizeof(buf)); ptr+= LSN_STORE_SIZE)
+ lsn_store(ptr, lsn);
+ share->state.skip_redo_lsn= share->state.is_of_horizon= lsn;
+ share->state.create_trid= create_trid;
+ mi_int8store(trid_buff, create_trid);
+ if (update_create_rename_lsn)
+ {
+ share->state.create_rename_lsn= lsn;
+ if (share->id != 0)
+ {
+ /*
+ If OP is the operation which is calling us, if table is later written,
+ we could see in the log:
+ FILE_ID ... REDO_OP ... REDO_INSERT.
+ (that can happen in real life at least with OP=REPAIR).
+ As FILE_ID will be ignored by Recovery because it is <
+ create_rename_lsn, REDO_INSERT would be ignored too, wrongly.
+ To avoid that, we force a LOGREC_FILE_ID to be logged at next write:
+ */
+ translog_deassign_id_from_share(share);
+ }
+ }
+ else
+ /* Keep the existing create_rename_lsn in the first on-disk slot */
+ lsn_store(buf, share->state.create_rename_lsn);
+ return (my_pwrite(file, buf, sizeof(buf),
+ sizeof(share->state.header) +
+ MARIA_FILE_CREATE_RENAME_LSN_OFFSET, MYF(MY_NABP)) ||
+ my_pwrite(file, trid_buff, sizeof(trid_buff),
+ sizeof(share->state.header) +
+ MARIA_FILE_CREATE_TRID_OFFSET, MYF(MY_NABP)) ||
+ (do_sync && my_sync(file, MYF(0))));
+}
+#if (_MSC_VER == 1310)
+#pragma optimize("",on)
+#endif /*VS2003 compiler bug workaround*/
diff --git a/storage/maria/ma_dbug.c b/storage/maria/ma_dbug.c
new file mode 100644
index 00000000000..af90a108e2a
--- /dev/null
+++ b/storage/maria/ma_dbug.c
@@ -0,0 +1,201 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Support routines to be used with the dbug library */
+
+#include "maria_def.h"
+
+/* Print a MARIA_KEY (debug helper); delegates to _ma_print_keydata() */
+void _ma_print_key(FILE *stream, MARIA_KEY *key)
+{
+ _ma_print_keydata(stream, key->keyinfo->seg, key->data, key->data_length);
+}
+
+
+/* Print a key in a user understandable format */
+
+/**
+  @brief Print a packed key to stream in human-readable form (debug aid).
+
+  @param stream  Output stream
+  @param keyseg  Array of key segment definitions, ended by type == 0
+  @param key     Packed key bytes
+  @param length  Length of key in bytes
+
+  Segments are separated by '-'; NULL segments print as "NULL".
+*/
+
+void _ma_print_keydata(FILE *stream, register HA_KEYSEG *keyseg,
+ const uchar *key, uint length)
+{
+ int flag;
+ short int s_1;
+ long int l_1;
+ float f_1;
+ double d_1;
+ const uchar *end;
+ const uchar *key_end= key + length;
+
+ VOID(fputs("Key: \"",stream));
+ flag=0;
+ for (; keyseg->type && key < key_end ;keyseg++)
+ {
+ if (flag++)
+ VOID(putc('-',stream));
+ end= key+ keyseg->length;
+ if (keyseg->flag & HA_NULL_PART)
+ {
+ /* A NULL value is encoded by a 1-byte flag. Zero means NULL. */
+ if (! *(key++))
+ {
+ fprintf(stream,"NULL");
+ continue;
+ }
+ end++;
+ }
+
+ switch (keyseg->type) {
+ case HA_KEYTYPE_BINARY:
+ if (!(keyseg->flag & HA_SPACE_PACK) && keyseg->length == 1)
+ { /* packed binary digit */
+ VOID(fprintf(stream,"%d",(uint) *key++));
+ break;
+ }
+ /* fall through */
+ case HA_KEYTYPE_TEXT:
+ case HA_KEYTYPE_NUM:
+ if (keyseg->flag & HA_SPACE_PACK)
+ {
+ /* Space-packed: first byte is the actual length */
+ VOID(fprintf(stream,"%.*s",(int) *key,key+1));
+ key+= (int) *key+1;
+ }
+ else
+ {
+ VOID(fprintf(stream,"%.*s",(int) keyseg->length,key));
+ key=end;
+ }
+ break;
+ case HA_KEYTYPE_INT8:
+ VOID(fprintf(stream,"%d",(int) *((const signed char*) key)));
+ key=end;
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ s_1= mi_sint2korr(key);
+ VOID(fprintf(stream,"%d",(int) s_1));
+ key=end;
+ break;
+ case HA_KEYTYPE_USHORT_INT:
+ {
+ ushort u_1;
+ u_1= mi_uint2korr(key);
+ VOID(fprintf(stream,"%u",(uint) u_1));
+ key=end;
+ break;
+ }
+ case HA_KEYTYPE_LONG_INT:
+ l_1=mi_sint4korr(key);
+ VOID(fprintf(stream,"%ld",l_1));
+ key=end;
+ break;
+ case HA_KEYTYPE_ULONG_INT:
+ l_1=mi_uint4korr(key);
+ VOID(fprintf(stream,"%lu",(ulong) l_1));
+ key=end;
+ break;
+ case HA_KEYTYPE_INT24:
+ VOID(fprintf(stream,"%ld",(long) mi_sint3korr(key)));
+ key=end;
+ break;
+ case HA_KEYTYPE_UINT24:
+ VOID(fprintf(stream,"%lu",(ulong) mi_uint3korr(key)));
+ key=end;
+ break;
+ case HA_KEYTYPE_FLOAT:
+ mi_float4get(f_1,key);
+ VOID(fprintf(stream,"%g",(double) f_1));
+ key=end;
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ mi_float8get(d_1,key);
+ VOID(fprintf(stream,"%g",d_1));
+ key=end;
+ break;
+#ifdef HAVE_LONG_LONG
+ case HA_KEYTYPE_LONGLONG:
+ {
+ char buff[21];
+ longlong10_to_str(mi_sint8korr(key),buff,-10);
+ VOID(fprintf(stream,"%s",buff));
+ key=end;
+ break;
+ }
+ case HA_KEYTYPE_ULONGLONG:
+ {
+ char buff[21];
+ longlong10_to_str(mi_sint8korr(key),buff,10);
+ VOID(fprintf(stream,"%s",buff));
+ key=end;
+ break;
+ }
+#endif
+ case HA_KEYTYPE_BIT:
+ {
+ /* Bit fields are printed as hex bytes */
+ uint i;
+ fputs("0x",stream);
+ for (i=0 ; i < keyseg->length ; i++)
+ fprintf(stream, "%02x", (uint) *key++);
+ key= end;
+ break;
+ }
+ case HA_KEYTYPE_VARTEXT1: /* VARCHAR and TEXT */
+ case HA_KEYTYPE_VARTEXT2: /* VARCHAR and TEXT */
+ case HA_KEYTYPE_VARBINARY1: /* VARBINARY and BLOB */
+ case HA_KEYTYPE_VARBINARY2: /* VARBINARY and BLOB */
+ {
+ uint tmp_length;
+ get_key_length(tmp_length,key);
+ /*
+ The following command sometimes gives a warning from valgrind.
+ Not yet sure if the bug is in valgrind, glibc or mysqld
+ */
+ VOID(fprintf(stream,"%.*s",(int) tmp_length,key));
+ key+=tmp_length;
+ break;
+ }
+ default: break; /* This never happens */
+ }
+ }
+ VOID(fputs("\"\n",stream));
+ return;
+} /* _ma_print_keydata */
+
+
+#ifdef EXTRA_DEBUG
+
+/**
+  @brief Debug check: warn if the named table is still open.
+
+  Scans the global maria_open_list (under THR_LOCK_maria) for an open
+  handler whose index-file name matches 'name'.
+
+  @param name   Table name (without extension)
+  @param where  Caller description, printed in the warning
+
+  @retval 1  table is open (warning printed to stderr)
+  @retval 0  table is closed
+*/
+my_bool _ma_check_table_is_closed(const char *name, const char *where)
+{
+ char filename[FN_REFLEN];
+ LIST *pos;
+ DBUG_ENTER("_ma_check_table_is_closed");
+
+ /* 4+16+32 are numeric fn_format() flags kept as in the original code */
+ (void) fn_format(filename,name,"",MARIA_NAME_IEXT,4+16+32);
+ pthread_mutex_lock(&THR_LOCK_maria);
+ for (pos=maria_open_list ; pos ; pos=pos->next)
+ {
+ MARIA_HA *info=(MARIA_HA*) pos->data;
+ MARIA_SHARE *share= info->s;
+ if (!strcmp(share->unique_file_name.str, filename))
+ {
+ if (share->last_version)
+ {
+ fprintf(stderr,"Warning: Table: %s is open on %s\n", name,where);
+ DBUG_PRINT("warning",("Table: %s is open on %s", name,where));
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ DBUG_RETURN(1);
+ }
+ }
+ }
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ DBUG_RETURN(0);
+}
+#endif /* EXTRA_DEBUG */
diff --git a/storage/maria/ma_delete.c b/storage/maria/ma_delete.c
new file mode 100644
index 00000000000..5c04f358b14
--- /dev/null
+++ b/storage/maria/ma_delete.c
@@ -0,0 +1,1650 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+ Copyright (C) 2009-2010 Monty Program Ab
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "ma_fulltext.h"
+#include "ma_rt_index.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag,
+ MARIA_PAGE *page);
+static int del(MARIA_HA *info, MARIA_KEY *key,
+ MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page,
+ uchar *keypos, my_off_t next_block, uchar *ret_key_buff);
+static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page,
+ uchar *keypos);
+static uint remove_key(MARIA_KEYDEF *keyinfo, uint page_flag, uint nod_flag,
+ uchar *keypos, uchar *lastkey, uchar *page_end,
+ my_off_t *next_block, MARIA_KEY_PARAM *s_temp);
+
+/**
+  @brief Remove a row from a MARIA table.
+
+  Deletes all index entries for the record, then removes the record
+  itself and updates the table state. On failure the table is marked
+  crashed (except for HA_ERR_RECORD_CHANGED).
+
+  @param info    Maria handler
+  @param record  Row image being deleted (must match the current row)
+
+  @retval 0         ok
+  @retval my_errno  error
+*/
+
+int maria_delete(MARIA_HA *info,const uchar *record)
+{
+ uint i;
+ uchar *old_key;
+ int save_errno;
+ char lastpos[8];
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo;
+ DBUG_ENTER("maria_delete");
+
+ /* Test if record is in datafile */
+ DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage",
+ maria_print_error(share, HA_ERR_CRASHED);
+ DBUG_RETURN(my_errno= HA_ERR_CRASHED););
+ DBUG_EXECUTE_IF("my_error_test_undefined_error",
+ maria_print_error(share, INT_MAX);
+ DBUG_RETURN(my_errno= INT_MAX););
+ if (!(info->update & HA_STATE_AKTIV))
+ {
+ DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); /* No database read */
+ }
+ if (share->options & HA_OPTION_READ_ONLY_DATA)
+ {
+ DBUG_RETURN(my_errno=EACCES);
+ }
+ if (_ma_readinfo(info,F_WRLCK,1))
+ DBUG_RETURN(my_errno);
+ if ((*share->compare_record)(info,record))
+ goto err; /* Error on read-check */
+
+ if (_ma_mark_file_changed(info))
+ goto err;
+
+ /* Ensure we don't change the autoincrement value */
+ info->last_auto_increment= ~(ulonglong) 0;
+ /* Remove all keys from the index file */
+
+ old_key= info->lastkey_buff2;
+
+ for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++)
+ {
+ if (maria_is_key_active(share->state.key_map, i))
+ {
+ keyinfo->version++;
+ if (keyinfo->flag & HA_FULLTEXT)
+ {
+ if (_ma_ft_del(info, i, old_key, record, info->cur_row.lastpos))
+ goto err;
+ }
+ else
+ {
+ MARIA_KEY key;
+ if (keyinfo->ck_delete(info,
+ (*keyinfo->make_key)(info, &key, i, old_key,
+ record,
+ info->cur_row.lastpos,
+ info->cur_row.trid)))
+ goto err;
+ }
+ /* The above changed info->lastkey2. Inform maria_rnext_same(). */
+ info->update&= ~HA_STATE_RNEXT_SAME;
+ }
+ }
+
+ if (share->calc_checksum)
+ {
+ /*
+ We can't use the row based checksum as this doesn't have enough
+ precision.
+ */
+ info->cur_row.checksum= (*share->calc_checksum)(info, record);
+ }
+
+ if ((*share->delete_record)(info, record))
+ goto err; /* Remove record from database */
+
+ info->state->checksum-= info->cur_row.checksum;
+ info->state->records--;
+ info->update= HA_STATE_CHANGED+HA_STATE_DELETED+HA_STATE_ROW_CHANGED;
+ share->state.changed|= (STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_MOVABLE |
+ STATE_NOT_ZEROFILLED);
+ info->state->changed=1;
+
+ /* NOTE(review): lastpos is only stored, never read here — debug aid? */
+ mi_sizestore(lastpos, info->cur_row.lastpos);
+ VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+ allow_break(); /* Allow SIGHUP & SIGINT */
+ if (info->invalidator != 0)
+ {
+ DBUG_PRINT("info", ("invalidator... '%s' (delete)",
+ share->open_file_name.str));
+ (*info->invalidator)(share->open_file_name.str);
+ info->invalidator=0;
+ }
+ DBUG_RETURN(0);
+
+err:
+ save_errno= my_errno;
+ DBUG_ASSERT(save_errno);
+ if (!save_errno)
+ save_errno= HA_ERR_INTERNAL_ERROR; /* Should never happen */
+
+ mi_sizestore(lastpos, info->cur_row.lastpos);
+ if (save_errno != HA_ERR_RECORD_CHANGED)
+ {
+ maria_print_error(share, HA_ERR_CRASHED);
+ maria_mark_crashed(info); /* mark table crashed */
+ }
+ VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+ info->update|=HA_STATE_WRITTEN; /* Buffer changed */
+ allow_break(); /* Allow SIGHUP & SIGINT */
+ if (save_errno == HA_ERR_KEY_NOT_FOUND)
+ {
+ maria_print_error(share, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ }
+ DBUG_RETURN(my_errno= save_errno);
+} /* maria_delete */
+
+
+/*
+ Remove a key from the btree index
+
+ TODO:
+ Change ma_ck_real_delete() to use another buffer for changed keys instead
+ of key->data. This would allow us to remove the copying of the key here.
+*/
+
+my_bool _ma_ck_delete(MARIA_HA *info, MARIA_KEY *key)
+{
+ MARIA_SHARE *share= info->s;
+ int res;
+ LSN lsn= LSN_IMPOSSIBLE;
+ my_off_t new_root= share->state.key_root[key->keyinfo->key_nr];
+ uchar key_buff[MARIA_MAX_KEY_BUFF], *save_key_data;
+ MARIA_KEY org_key;
+ DBUG_ENTER("_ma_ck_delete");
+
+ LINT_INIT_STRUCT(org_key);
+
+ save_key_data= key->data;
+ if (share->now_transactional)
+ {
+ /* Save original value as the key may change */
+ memcpy(key_buff, key->data, key->data_length + key->ref_length);
+ org_key= *key;
+ key->data= key_buff;
+ }
+
+ if ((res= _ma_ck_real_delete(info, key, &new_root)))
+ {
+ /* We have to mark the table crashed before unpin_all_pages() */
+ maria_mark_crashed(info);
+ }
+
+ /* Restore caller's key buffer pointer in all cases */
+ key->data= save_key_data;
+ if (!res && share->now_transactional)
+ /* UNDO record also publishes the (possibly new) key root */
+ res= _ma_write_undo_key_delete(info, &org_key, new_root, &lsn);
+ else
+ {
+ share->state.key_root[key->keyinfo->key_nr]= new_root;
+ _ma_fast_unlock_key_del(info);
+ }
+ _ma_unpin_all_pages_and_finalize_row(info, lsn);
+ DBUG_RETURN(res != 0);
+} /* _ma_ck_delete */
+
+
+/**
+  @brief Delete a key from the b-tree rooted at *root.
+
+  @param info  Maria handler
+  @param key   Key to delete
+  @param root  In/out: key root position; updated if the root page
+               is freed or the tree shrinks/grows.
+
+  @retval 0  ok
+  @retval 1  error (my_errno set)
+*/
+my_bool _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEY *key,
+ my_off_t *root)
+{
+ int error;
+ my_bool result= 0;
+ my_off_t old_root;
+ uchar *root_buff;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ MARIA_PAGE page;
+ DBUG_ENTER("_ma_ck_real_delete");
+
+ if ((old_root=*root) == HA_OFFSET_ERROR)
+ {
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(1);
+ }
+ if (!(root_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+ MARIA_MAX_KEY_BUFF*2)))
+ {
+ DBUG_PRINT("error",("Couldn't allocate memory"));
+ my_errno=ENOMEM;
+ DBUG_RETURN(1);
+ }
+ DBUG_PRINT("info",("root_page: %lu",
+ (ulong) (old_root / keyinfo->block_length)));
+ if (_ma_fetch_keypage(&page, info, keyinfo, old_root,
+ PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, root_buff, 0))
+ {
+ result= 1;
+ goto err;
+ }
+ /* d_search: 0 = ok, 1 = root underflow, 2 = root too big, <0 = error */
+ if ((error= d_search(info, key, (keyinfo->flag & HA_FULLTEXT ?
+ SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT:
+ SEARCH_SAME),
+ &page)))
+ {
+ if (error < 0)
+ result= 1;
+ else if (error == 2)
+ {
+ DBUG_PRINT("test",("Enlarging of root when deleting"));
+ if (_ma_enlarge_root(info, key, root))
+ result= 1;
+ }
+ else /* error == 1 */
+ {
+ MARIA_SHARE *share= info->s;
+
+ page_mark_changed(info, &page);
+
+ /* Root became (nearly) empty: drop a tree level or the whole tree */
+ if (page.size <= page.node + share->keypage_header + 1)
+ {
+ if (page.node)
+ *root= _ma_kpos(page.node, root_buff +share->keypage_header +
+ page.node);
+ else
+ *root=HA_OFFSET_ERROR;
+ if (_ma_dispose(info, old_root, 0))
+ result= 1;
+ }
+ else if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS))
+ result= 1;
+ }
+ }
+err:
+ my_afree(root_buff);
+ DBUG_PRINT("exit",("Return: %d",result));
+ DBUG_RETURN(result);
+} /* _ma_ck_real_delete */
+
+
+/**
+ @brief Remove key below key root
+
+ @param key Key to delete. Will contain new key if block was enlarged
+
+ @return
+ @retval 0 ok (anc_page is not changed)
+ @retval 1 If data on page is too small; In this case anc_buff is not saved
+ @retval 2 If data on page is too big
+ @retval -1 On errors
+*/
+
+static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag,
+ MARIA_PAGE *anc_page)
+{
+ int flag,ret_value,save_flag;
+ uint nod_flag, page_flag;
+ my_bool last_key;
+ uchar *leaf_buff,*keypos;
+ uchar lastkey[MARIA_MAX_KEY_BUFF];
+ MARIA_KEY_PARAM s_temp;
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ MARIA_PAGE leaf_page;
+ DBUG_ENTER("d_search");
+ DBUG_DUMP("page", anc_page->buff, anc_page->size);
+
+ /* flag == 0 means the key was found on this page */
+ flag=(*keyinfo->bin_search)(key, anc_page, comp_flag, &keypos, lastkey,
+ &last_key);
+ if (flag == MARIA_FOUND_WRONG_KEY)
+ {
+ DBUG_PRINT("error",("Found wrong key"));
+ DBUG_RETURN(-1);
+ }
+ page_flag= anc_page->flag;
+ nod_flag= anc_page->node;
+
+ if (!flag && (keyinfo->flag & HA_FULLTEXT))
+ {
+ uint off;
+ int subkeys;
+
+ get_key_full_length_rdonly(off, lastkey);
+ subkeys=ft_sintXkorr(lastkey+off);
+ DBUG_ASSERT(info->ft1_to_ft2==0 || subkeys >=0);
+ comp_flag=SEARCH_SAME;
+ if (subkeys >= 0)
+ {
+ /* normal word, one-level tree structure */
+ if (info->ft1_to_ft2)
+ {
+ /* we're in ft1->ft2 conversion mode. Saving key data */
+ insert_dynamic(info->ft1_to_ft2, (lastkey+off));
+ }
+ else
+ {
+ /* we need exact match only if not in ft1->ft2 conversion mode */
+ flag=(*keyinfo->bin_search)(key, anc_page, comp_flag, &keypos,
+ lastkey, &last_key);
+ }
+ /* fall through to normal delete */
+ }
+ else
+ {
+ /* popular word. two-level tree. going down */
+ uint tmp_key_length;
+ my_off_t root;
+ uchar *kpos=keypos;
+ MARIA_KEY tmp_key;
+
+ tmp_key.data= lastkey;
+ tmp_key.keyinfo= keyinfo;
+
+ if (!(tmp_key_length=(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag,
+ &kpos)))
+ {
+ my_errno= HA_ERR_CRASHED;
+ DBUG_RETURN(-1);
+ }
+ root= _ma_row_pos_from_key(&tmp_key);
+ if (subkeys == -1)
+ {
+ /* the last entry in sub-tree */
+ if (_ma_dispose(info, root, 1))
+ DBUG_RETURN(-1);
+ /* fall through to normal delete */
+ }
+ else
+ {
+ MARIA_KEY word_key;
+ keyinfo=&share->ft2_keyinfo;
+ /* we'll modify key entry 'in vivo' */
+ kpos-=keyinfo->keylength+nod_flag;
+ get_key_full_length_rdonly(off, key->data);
+
+ word_key.data= key->data + off;
+ word_key.keyinfo= &share->ft2_keyinfo;
+ word_key.data_length= HA_FT_WLEN;
+ word_key.ref_length= 0;
+ word_key.flag= 0;
+ ret_value= _ma_ck_real_delete(info, &word_key, &root);
+ _ma_dpointer(share, kpos+HA_FT_WLEN, root);
+ subkeys++;
+ ft_intXstore(kpos, subkeys);
+ if (!ret_value)
+ {
+ page_mark_changed(info, anc_page);
+ ret_value= _ma_write_keypage(anc_page,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS);
+ }
+ DBUG_PRINT("exit",("Return: %d",ret_value));
+ DBUG_RETURN(ret_value);
+ }
+ }
+ }
+ leaf_buff=0;
+ if (nod_flag)
+ {
+ /* Read left child page */
+ leaf_page.pos= _ma_kpos(nod_flag,keypos);
+ if (!(leaf_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+ MARIA_MAX_KEY_BUFF*2)))
+ {
+ DBUG_PRINT("error", ("Couldn't allocate memory"));
+ my_errno=ENOMEM;
+ DBUG_RETURN(-1);
+ }
+ if (_ma_fetch_keypage(&leaf_page, info,keyinfo, leaf_page.pos,
+ PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, leaf_buff,
+ 0))
+ goto err;
+ }
+
+ if (flag != 0)
+ {
+ /* Key not on this page: recurse into the child */
+ if (!nod_flag)
+ {
+ DBUG_PRINT("error",("Didn't find key"));
+ my_errno=HA_ERR_CRASHED; /* This should never happen */
+ goto err;
+ }
+ save_flag=0;
+ ret_value= d_search(info, key, comp_flag, &leaf_page);
+ }
+ else
+ { /* Found key */
+ uint tmp;
+ uint anc_buff_length= anc_page->size;
+ uint anc_page_flag= anc_page->flag;
+ my_off_t next_block;
+
+ if (!(tmp= remove_key(keyinfo, anc_page_flag, nod_flag, keypos, lastkey,
+ anc_page->buff + anc_buff_length,
+ &next_block, &s_temp)))
+ goto err;
+
+ page_mark_changed(info, anc_page);
+ anc_buff_length-= tmp;
+ anc_page->size= anc_buff_length;
+ page_store_size(share, anc_page);
+
+ /*
+ Log initial changes on pages
+ If there is an underflow, there will be more changes logged to the
+ page
+ */
+ if (share->now_transactional &&
+ _ma_log_delete(anc_page, s_temp.key_pos,
+ s_temp.changed_length, s_temp.move_length,
+ 0, KEY_OP_DEBUG_LOG_DEL_CHANGE_1))
+ DBUG_RETURN(-1);
+
+ if (!nod_flag)
+ { /* On leaf page */
+ if (anc_buff_length <= (info->quick_mode ?
+ MARIA_MIN_KEYBLOCK_LENGTH :
+ (uint) keyinfo->underflow_block_length))
+ {
+ /* Page will be written by caller if we return 1 */
+ DBUG_RETURN(1);
+ }
+ if (_ma_write_keypage(anc_page,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
+ DBUG_RETURN(-1);
+ DBUG_RETURN(0);
+ }
+ save_flag=1; /* Mark that anc_buff is changed */
+ /* Deleted key had a page reference: pull up a replacement key */
+ ret_value= del(info, key, anc_page, &leaf_page,
+ keypos, next_block, lastkey);
+ }
+ if (ret_value >0)
+ {
+ save_flag= 2;
+ if (ret_value == 1)
+ ret_value= underflow(info, keyinfo, anc_page, &leaf_page, keypos);
+ else
+ {
+ /* This can only happen with variable length keys */
+ MARIA_KEY last_key;
+ DBUG_PRINT("test",("Enlarging of key when deleting"));
+
+ last_key.data= lastkey;
+ last_key.keyinfo= keyinfo;
+ if (!_ma_get_last_key(&last_key, anc_page, keypos))
+ goto err;
+ ret_value= _ma_insert(info, key, anc_page, keypos,
+ last_key.data,
+ (MARIA_PAGE*) 0, (uchar*) 0, (my_bool) 0);
+
+ if (_ma_write_keypage(&leaf_page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS))
+ ret_value= -1;
+ }
+ }
+ if (ret_value == 0 && anc_page->size > share->max_index_block_size)
+ {
+ /*
+ parent buffer got too big ; We have to split the page.
+ The | 2 is there to force write of anc page below
+ */
+ save_flag= 3;
+ ret_value= _ma_split_page(info, key, anc_page,
+ share->max_index_block_size,
+ (uchar*) 0, 0, 0, lastkey, 0) | 2;
+ DBUG_ASSERT(anc_page->org_size == anc_page->size);
+ }
+ if (save_flag && ret_value != 1)
+ {
+ page_mark_changed(info, anc_page);
+ if (_ma_write_keypage(anc_page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS))
+ ret_value= -1;
+ }
+ else
+ {
+ DBUG_DUMP("page", anc_page->buff, anc_page->size);
+ }
+ my_afree(leaf_buff);
+ DBUG_PRINT("exit",("Return: %d",ret_value));
+ DBUG_RETURN(ret_value);
+
+err:
+ my_afree(leaf_buff);
+ DBUG_PRINT("exit",("Error: %d",my_errno));
+ DBUG_RETURN (-1);
+} /* d_search */
+
+
/**
  @brief Remove a key that has a page-reference

  Called when the key to delete sits on a nod (non-leaf) page: the key is
  replaced by the last key of the sub-tree below it, recursing down the
  right-most child chain until a real leaf is reached.

  @param info          Maria handler
  @param key           Buffer for key to be inserted at upper level
  @param anc_page      Page (nod) where the deleted key was
  @param leaf_page     Page for the nod pointer just before the deleted key
  @param keypos        Position where the deleted key was on anc_page
  @param next_block    Page address for the nod after the deleted key
  @param ret_key_buff  Buffer for the key before keypos in anc_page

  @notes
    leaf_page must be written to disk if retval > 0
    anc_page is not updated on disk. Caller should do this

  @return
  @retval < 0   Error
  @retval 0     OK.    leaf_buff is written to disk

  @retval 1     key contains key to upper level (from balance page)
                leaf_buff has underflow
  @retval 2     key contains key to upper level (from split space)
*/

static int del(MARIA_HA *info, MARIA_KEY *key,
               MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page,
               uchar *keypos, my_off_t next_block, uchar *ret_key_buff)
{
  int ret_value,length;
  uint a_length, page_flag, nod_flag, leaf_length, new_leaf_length;
  uchar keybuff[MARIA_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key;
  uchar *anc_buff;
  MARIA_KEY_PARAM s_temp;
  MARIA_KEY tmp_key;
  MARIA_SHARE *share= info->s;
  MARIA_KEYDEF *keyinfo= key->keyinfo;
  MARIA_KEY ret_key;
  MARIA_PAGE next_page;
  DBUG_ENTER("del");
  DBUG_PRINT("enter",("leaf_page: %lu keypos: 0x%lx",
                      (ulong) (leaf_page->pos / share->block_size),
                      (ulong) keypos));
  DBUG_DUMP("leaf_buff", leaf_page->buff, leaf_page->size);

  page_flag= leaf_page->flag;
  leaf_length= leaf_page->size;
  nod_flag= leaf_page->node;

  endpos= leaf_page->buff + leaf_length;
  tmp_key.keyinfo= keyinfo;
  tmp_key.data= keybuff;

  /* Last key on leaf_page; this will replace the deleted key in anc_page */
  if (!(key_start= _ma_get_last_key(&tmp_key, leaf_page, endpos)))
    DBUG_RETURN(-1);

  if (nod_flag)
  {
    /*
      leaf_page is itself a nod page: recurse into its last (right-most)
      child until we reach a real leaf from which a key can be taken.
    */
    next_page.pos= _ma_kpos(nod_flag,endpos);
    if (!(next_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
                                        MARIA_MAX_KEY_BUFF*2)))
      DBUG_RETURN(-1);
    if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos,
                          PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, next_buff, 0))
      ret_value= -1;
    else
    {
      DBUG_DUMP("next_page", next_page.buff, next_page.size);
      if ((ret_value= del(info, key, anc_page, &next_page,
                          keypos, next_block, ret_key_buff)) >0)
      {
        /* Get new length after key was deleted */
        endpos= leaf_page->buff+ leaf_page->size;
        if (ret_value == 1)
        {
          /* underflow writes "next_page" to disk */
          ret_value= underflow(info, keyinfo, leaf_page, &next_page,
                               endpos);
          if (ret_value == 0 && leaf_page->size >
              share->max_index_block_size)
          {
            /* Balancing made leaf_page too big; split it */
            ret_value= (_ma_split_page(info, key, leaf_page,
                                       share->max_index_block_size,
                                       (uchar*) 0, 0, 0,
                                       ret_key_buff, 0) | 2);
          }
        }
        else
        {
          /* Key in parent was enlarged (ret_value == 2); re-insert it */
          if (_ma_write_keypage(&next_page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
                                DFLT_INIT_HITS))
            goto err;
          DBUG_PRINT("test",("Inserting of key when deleting"));
          if (!_ma_get_last_key(&tmp_key, leaf_page, endpos))
            goto err;
          ret_value= _ma_insert(info, key, leaf_page, endpos,
                                tmp_key.data, (MARIA_PAGE *) 0, (uchar*) 0,
                                0);
        }
      }
      page_mark_changed(info, leaf_page);
      /*
        If ret_value <> 0, then leaf_page underflowed and caller will have
        to handle underflow and write leaf_page to disk.
        We can't write it here, as if leaf_page is empty we get an assert
        in _ma_write_keypage.
      */
      if (ret_value == 0 && _ma_write_keypage(leaf_page,
                                              PAGECACHE_LOCK_LEFT_WRITELOCKED,
                                              DFLT_INIT_HITS))
        goto err;
    }
    my_afree(next_buff);
    DBUG_RETURN(ret_value);
  }

  /*
    Remove last key from leaf page
    Note that leaf_page page may only have had one key (can normally only
    happen in quick mode), in which case it will now temporarily have 0 keys
    on it. This will be corrected by the caller as we will return 0.
  */
  new_leaf_length= (uint) (key_start - leaf_page->buff);
  leaf_page->size= new_leaf_length;
  page_store_size(share, leaf_page);

  if (share->now_transactional &&
      _ma_log_suffix(leaf_page, leaf_length, new_leaf_length))
    goto err;

  page_mark_changed(info, leaf_page);           /* Safety */
  if (new_leaf_length <= (info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
                          (uint) keyinfo->underflow_block_length))
  {
    /* Underflow, leaf_page will be written by caller */
    ret_value= 1;
  }
  else
  {
    ret_value= 0;
    if (_ma_write_keypage(leaf_page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
                          DFLT_INIT_HITS))
      goto err;
  }

  /* Place last key in ancestor page on deleted key position */
  a_length= anc_page->size;
  anc_buff= anc_page->buff;
  endpos= anc_buff + a_length;

  ret_key.keyinfo= keyinfo;
  ret_key.data= ret_key_buff;

  prev_key= 0;
  /* keypos is first key on anc_page only if it equals header + nod pointer */
  if (keypos != anc_buff+share->keypage_header + share->base.key_reflength)
  {
    if (!_ma_get_last_key(&ret_key, anc_page, keypos))
      goto err;
    prev_key= ret_key.data;
  }
  /* Pack the replacing key against its neighbours on anc_page */
  length= (*keyinfo->pack_key)(&tmp_key, share->base.key_reflength,
                               keypos == endpos ? (uchar*) 0 : keypos,
                               prev_key, prev_key,
                               &s_temp);
  if (length > 0)
    bmove_upp(endpos+length,endpos,(uint) (endpos-keypos));
  else
    bmove(keypos,keypos-length, (int) (endpos-keypos)+length);
  (*keyinfo->store_key)(keyinfo,keypos,&s_temp);
  key_start= keypos;
  if (tmp_key.flag & (SEARCH_USER_KEY_HAS_TRANSID |
                      SEARCH_PAGE_KEY_HAS_TRANSID))
  {
    _ma_mark_page_with_transid(share, anc_page);
  }

  /* Save pointer to next leaf on parent page */
  if (!(*keyinfo->get_key)(&ret_key, page_flag, share->base.key_reflength,
                           &keypos))
    goto err;
  _ma_kpointer(info,keypos - share->base.key_reflength,next_block);
  anc_page->size= a_length + length;
  page_store_size(share, anc_page);

  if (share->now_transactional &&
      _ma_log_add(anc_page, a_length,
                  key_start, s_temp.changed_length, s_temp.move_length, 1,
                  KEY_OP_DEBUG_LOG_ADD_2))
    goto err;

  /* Tell caller whether leaf_page now has underflow (see @retval 1) */
  DBUG_RETURN(new_leaf_length <=
              (info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
               (uint) keyinfo->underflow_block_length));
err:
  DBUG_RETURN(-1);
} /* del */
+
+
/**
  @brief Balances adjacent pages if underflow occurs

  @fn underflow()
  @param info        Maria handler
  @param keyinfo     Key handle
  @param anc_page    Ancestor page data
  @param leaf_page   Leaf page (page that underflowed)
  @param keypos      Position after current key in anc_page

  @note
    This function writes redo entries for all changes
    leaf_page is saved to disk
    Caller must save anc_page

  @return
  @retval  0  ok
  @retval  1  ok, but anc_page did underflow
  @retval -1  error
 */

static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
                     MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page,
                     uchar *keypos)
{
  int t_length;
  uint anc_length,buff_length,leaf_length,p_length,s_length,nod_flag;
  uint next_buff_length, new_buff_length, key_reflength;
  uint unchanged_leaf_length, new_leaf_length, new_anc_length;
  uint anc_page_flag, page_flag;
  uchar anc_key_buff[MARIA_MAX_KEY_BUFF], leaf_key_buff[MARIA_MAX_KEY_BUFF];
  uchar *endpos, *next_keypos, *anc_pos, *half_pos, *prev_key;
  uchar *anc_buff, *leaf_buff;
  uchar *after_key, *anc_end_pos;
  MARIA_KEY_PARAM key_deleted, key_inserted;
  MARIA_SHARE *share= info->s;
  my_bool first_key;
  MARIA_KEY tmp_key, anc_key, leaf_key;
  MARIA_PAGE next_page;
  DBUG_ENTER("underflow");
  DBUG_PRINT("enter",("leaf_page: %lu keypos: 0x%lx",
                      (ulong) (leaf_page->pos / share->block_size),
                      (ulong) keypos));
  DBUG_DUMP("anc_buff", anc_page->buff, anc_page->size);
  DBUG_DUMP("leaf_buff", leaf_page->buff, leaf_page->size);

  anc_page_flag= anc_page->flag;
  anc_buff= anc_page->buff;
  leaf_buff= leaf_page->buff;
  info->keyread_buff_used=1;
  next_keypos=keypos;
  nod_flag= leaf_page->node;
  /* p_length: size of page header incl. first nod pointer */
  p_length= nod_flag+share->keypage_header;
  anc_length= anc_page->size;
  leaf_length= leaf_page->size;
  key_reflength= share->base.key_reflength;
  if (share->keyinfo+info->lastinx == keyinfo)
    info->page_changed=1;
  first_key= keypos == anc_buff + share->keypage_header + key_reflength;

  tmp_key.data= info->buff;
  anc_key.data= anc_key_buff;
  leaf_key.data= leaf_key_buff;
  tmp_key.keyinfo= leaf_key.keyinfo= anc_key.keyinfo= keyinfo;

  /*
    Choose a sibling to balance against: use the right sibling when forced
    (first_key) or, pseudo-randomly by record count, when one exists.
  */
  if ((keypos < anc_buff + anc_length && (info->state->records & 1)) ||
      first_key)
  {
    size_t tmp_length;
    uint next_page_flag;
    /* Use page right of anc-page */
    DBUG_PRINT("test",("use right page"));

    /*
      Calculate position after the current key. Note that keydata itself is
      not used
    */
    if (keyinfo->flag & HA_BINARY_PACK_KEY)
    {
      if (!(next_keypos= _ma_get_key(&tmp_key, anc_page, keypos)))
        goto err;
    }
    else
    {
      /* Avoid length error check if packed key */
      tmp_key.data[0]= tmp_key.data[1]= 0;
      /* Go to end of found key */
      if (!(*keyinfo->get_key)(&tmp_key, anc_page_flag, key_reflength,
                               &next_keypos))
        goto err;
    }
    next_page.pos= _ma_kpos(key_reflength, next_keypos);
    if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos,
                          PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, info->buff, 0))
      goto err;
    next_buff_length= next_page.size;
    next_page_flag= next_page.flag;
    DBUG_DUMP("next", next_page.buff, next_page.size);

    /* find keys to make a big key-page */
    bmove(next_keypos-key_reflength, next_page.buff + share->keypage_header,
          key_reflength);

    if (!_ma_get_last_key(&anc_key, anc_page, next_keypos) ||
        !_ma_get_last_key(&leaf_key, leaf_page, leaf_buff+leaf_length))
      goto err;

    /* merge pages and put parting key from anc_page between */
    prev_key= (leaf_length == p_length ? (uchar*) 0 : leaf_key.data);
    t_length= (*keyinfo->pack_key)(&anc_key, nod_flag, next_page.buff+p_length,
                                   prev_key, prev_key, &key_inserted);
    tmp_length= next_buff_length - p_length;
    endpos= next_page.buff + tmp_length + leaf_length + t_length;
    /* next_page.buff will always be larger than before !*/
    bmove_upp(endpos, next_page.buff + next_buff_length, tmp_length);
    memcpy(next_page.buff, leaf_buff,(size_t) leaf_length);
    (*keyinfo->store_key)(keyinfo, next_page.buff+leaf_length, &key_inserted);
    buff_length= (uint) (endpos - next_page.buff);

    /* Set page flag from combination of both key pages and parting key */
    page_flag= next_page_flag | leaf_page->flag;
    if (anc_key.flag & (SEARCH_USER_KEY_HAS_TRANSID |
                        SEARCH_PAGE_KEY_HAS_TRANSID))
      page_flag|= KEYPAGE_FLAG_HAS_TRANSID;

    next_page.size= buff_length;
    next_page.flag= page_flag;
    page_store_info(share, &next_page);

    /* remove key from anc_page */
    if (!(s_length=remove_key(keyinfo, anc_page_flag, key_reflength, keypos,
                              anc_key_buff, anc_buff+anc_length,
                              (my_off_t *) 0, &key_deleted)))
      goto err;

    new_anc_length= anc_length - s_length;
    anc_page->size= new_anc_length;
    page_store_size(share, anc_page);

    if (buff_length <= share->max_index_block_size)
    {
      /* All keys fitted into one page */
      page_mark_changed(info, &next_page);
      if (_ma_dispose(info, next_page.pos, 0))
        goto err;

      memcpy(leaf_buff, next_page.buff, (size_t) buff_length);
      leaf_page->size= next_page.size;
      leaf_page->flag= next_page.flag;

      if (share->now_transactional)
      {
        /*
          Log changes to parent page. Note that this page may have been
          temporarily bigger than block_size.
        */
        if (_ma_log_delete(anc_page, key_deleted.key_pos,
                           key_deleted.changed_length,
                           key_deleted.move_length,
                           anc_length - anc_page->org_size,
                           KEY_OP_DEBUG_LOG_DEL_CHANGE_2))
          goto err;
        /*
          Log changes to leaf page. Data for leaf page is in leaf_buff
          which contains original leaf_buff, parting key and next_buff
        */
        if (_ma_log_suffix(leaf_page, leaf_length, buff_length))
          goto err;
      }
    }
    else
    {
      /*
        Balancing didn't free a page, so we have to split 'buff' into two
        pages:
        - Find key in middle of buffer
        - Store everything before key in 'leaf_page'
        - Pack key into anc_page at position of deleted key
          Note that anc_page may overflow! (is handled by caller)
        - Store remaining keys in next_page (buff)
      */
      MARIA_KEY_PARAM anc_key_inserted;

      anc_end_pos= anc_buff + new_anc_length;

      DBUG_PRINT("test",("anc_buff: 0x%lx anc_end_pos: 0x%lx",
                         (long) anc_buff, (long) anc_end_pos));

      if (!first_key && !_ma_get_last_key(&anc_key, anc_page, keypos))
        goto err;
      if (!(half_pos= _ma_find_half_pos(&leaf_key, &next_page, &after_key)))
        goto err;
      new_leaf_length= (uint) (half_pos - next_page.buff);
      memcpy(leaf_buff, next_page.buff, (size_t) new_leaf_length);

      leaf_page->size= new_leaf_length;
      leaf_page->flag= page_flag;
      page_store_info(share, leaf_page);

      /* Correct new keypointer to leaf_page */
      half_pos=after_key;
      _ma_kpointer(info,
                   leaf_key.data + leaf_key.data_length + leaf_key.ref_length,
                   next_page.pos);

      /* Save key in anc_page */
      prev_key= (first_key ? (uchar*) 0 : anc_key.data);
      t_length= (*keyinfo->pack_key)(&leaf_key, key_reflength,
                                     (keypos == anc_end_pos ? (uchar*) 0 :
                                      keypos),
                                     prev_key, prev_key, &anc_key_inserted);
      if (t_length >= 0)
        bmove_upp(anc_end_pos+t_length, anc_end_pos,
                  (uint) (anc_end_pos - keypos));
      else
        bmove(keypos,keypos-t_length,(uint) (anc_end_pos-keypos)+t_length);
      (*keyinfo->store_key)(keyinfo,keypos, &anc_key_inserted);
      new_anc_length+= t_length;
      anc_page->size= new_anc_length;
      page_store_size(share, anc_page);

      if (leaf_key.flag & (SEARCH_USER_KEY_HAS_TRANSID |
                           SEARCH_PAGE_KEY_HAS_TRANSID))
        _ma_mark_page_with_transid(share, anc_page);

      /* Store key first in new page */
      if (nod_flag)
        bmove(next_page.buff + share->keypage_header, half_pos-nod_flag,
              (size_t) nod_flag);
      if (!(*keyinfo->get_key)(&leaf_key, page_flag, nod_flag, &half_pos))
        goto err;
      t_length=(int) (*keyinfo->pack_key)(&leaf_key, nod_flag, (uchar*) 0,
                                          (uchar*) 0, (uchar*) 0,
                                          &key_inserted);
      /* t_length will always be > 0 for a new page !*/
      tmp_length= (size_t) ((next_page.buff + buff_length) - half_pos);
      bmove(next_page.buff + p_length + t_length, half_pos, tmp_length);
      (*keyinfo->store_key)(keyinfo, next_page.buff + p_length, &key_inserted);
      new_buff_length= tmp_length + t_length + p_length;
      next_page.size= new_buff_length;
      page_store_size(share, &next_page);
      /* keypage flag is already up to date */

      if (share->now_transactional)
      {
        /*
          Log changes to parent page
          This has one key deleted from it and one key inserted to it at
          keypos

          ma_log_add ensures that we don't log changes that is outside of
          key block size, as the REDO code can't handle that
        */
        if (_ma_log_add(anc_page, anc_length, keypos,
                        anc_key_inserted.move_length +
                        max(anc_key_inserted.changed_length -
                            anc_key_inserted.move_length,
                            key_deleted.changed_length),
                        anc_key_inserted.move_length -
                        key_deleted.move_length, 1,
                        KEY_OP_DEBUG_LOG_ADD_3))
          goto err;

        /*
          Log changes to leaf page.
          This contains original data with new data added at end
        */
        DBUG_ASSERT(leaf_length <= new_leaf_length);
        if (_ma_log_suffix(leaf_page, leaf_length, new_leaf_length))
          goto err;
        /*
          Log changes to next page

          This contains original data with some prefix data deleted and
          some compressed data at start possible extended

          Data in buff was originally:
          org_leaf_buff     [leaf_length]
          separator_key     [buff_key_inserted.move_length]
          next_key_changes  [buff_key_inserted.changed_length -move_length]
          next_page_data    [next_buff_length - p_length -
                            (buff_key_inserted.changed_length -move_length)]

          After changes it's now:
          unpacked_key      [key_inserted.changed_length]
          next_suffix       [next_buff_length - key_inserted.changed_length]

        */
        DBUG_ASSERT(new_buff_length <= next_buff_length);
        if (_ma_log_prefix(&next_page, key_inserted.changed_length,
                           (int) (new_buff_length - next_buff_length),
                           KEY_OP_DEBUG_LOG_PREFIX_1))
          goto err;
      }
      page_mark_changed(info, &next_page);
      if (_ma_write_keypage(&next_page,
                            PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
        goto err;
    }

    page_mark_changed(info, leaf_page);
    if (_ma_write_keypage(leaf_page,
                          PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
      goto err;
    /* Report whether anc_page itself underflowed after the key removal */
    DBUG_RETURN(new_anc_length <=
                ((info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
                  (uint) keyinfo->underflow_block_length)));
  }

  DBUG_PRINT("test",("use left page"));

  keypos= _ma_get_last_key(&anc_key, anc_page, keypos);
  if (!keypos)
    goto err;
  next_page.pos= _ma_kpos(key_reflength,keypos);
  if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos,
                        PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, info->buff, 0))
    goto err;
  buff_length= next_page.size;
  endpos= next_page.buff + buff_length;
  DBUG_DUMP("prev", next_page.buff, next_page.size);

  /* find keys to make a big key-page */
  bmove(next_keypos - key_reflength, leaf_buff + share->keypage_header,
        key_reflength);
  next_keypos=keypos;
  if (!(*keyinfo->get_key)(&anc_key, anc_page_flag, key_reflength,
                           &next_keypos))
    goto err;
  if (!_ma_get_last_key(&leaf_key, &next_page, endpos))
    goto err;

  /* merge pages and put parting key from anc_page between */
  prev_key= (leaf_length == p_length ? (uchar*) 0 : leaf_key.data);
  t_length=(*keyinfo->pack_key)(&anc_key, nod_flag,
                                (leaf_length == p_length ?
                                 (uchar*) 0 : leaf_buff+p_length),
                                prev_key, prev_key,
                                &key_inserted);
  if (t_length >= 0)
    bmove(endpos+t_length, leaf_buff+p_length,
          (size_t) (leaf_length-p_length));
  else /* We gained space */
    bmove(endpos,leaf_buff+((int) p_length-t_length),
          (size_t) (leaf_length-p_length+t_length));
  (*keyinfo->store_key)(keyinfo,endpos, &key_inserted);

  /* Remember for logging how many bytes of leaf_buff that are not changed */
  DBUG_ASSERT((int) key_inserted.changed_length >= key_inserted.move_length);
  unchanged_leaf_length= (leaf_length - p_length -
                          (key_inserted.changed_length -
                           key_inserted.move_length));

  new_buff_length= buff_length + leaf_length - p_length + t_length;

#ifdef EXTRA_DEBUG
  /* Ensure that unchanged_leaf_length is correct */
  DBUG_ASSERT(bcmp(next_page.buff + new_buff_length - unchanged_leaf_length,
                   leaf_buff + leaf_length - unchanged_leaf_length,
                   unchanged_leaf_length) == 0);
#endif

  /* Combine page flags of both merged pages and the parting key */
  page_flag= next_page.flag | leaf_page->flag;
  if (anc_key.flag & (SEARCH_USER_KEY_HAS_TRANSID |
                      SEARCH_PAGE_KEY_HAS_TRANSID))
    page_flag|= KEYPAGE_FLAG_HAS_TRANSID;

  next_page.size= new_buff_length;
  next_page.flag= page_flag;
  page_store_info(share, &next_page);

  /* remove key from anc_page */
  if (!(s_length= remove_key(keyinfo, anc_page_flag, key_reflength, keypos,
                             anc_key_buff,
                             anc_buff+anc_length, (my_off_t *) 0,
                             &key_deleted)))
    goto err;

  new_anc_length= anc_length - s_length;
  anc_page->size= new_anc_length;
  page_store_size(share, anc_page);

  if (new_buff_length <= share->max_index_block_size)
  {
    /* All keys fitted into one page */
    page_mark_changed(info, leaf_page);
    if (_ma_dispose(info, leaf_page->pos, 0))
      goto err;

    if (share->now_transactional)
    {
      /*
        Log changes to parent page. Note that this page may have been
        temporarily bigger than block_size.
      */
      if (_ma_log_delete(anc_page, key_deleted.key_pos,
                         key_deleted.changed_length, key_deleted.move_length,
                         anc_length - anc_page->org_size,
                         KEY_OP_DEBUG_LOG_DEL_CHANGE_3))
        goto err;
      /*
        Log changes to next page. Data for leaf page is in buff
        that contains original leaf_buff, parting key and next_buff
      */
      if (_ma_log_suffix(&next_page, buff_length, new_buff_length))
        goto err;
    }
  }
  else
  {
    /*
      Balancing didn't free a page, so we have to split 'next_page' into two
      pages
      - Find key in middle of buffer (buff)
      - Pack key at half_buff into anc_page at position of deleted key
        Note that anc_page may overflow! (is handled by caller)
      - Move everything after middlekey to 'leaf_buff'
      - Shorten buff at 'endpos'
    */
    MARIA_KEY_PARAM anc_key_inserted;
    size_t tmp_length;

    if (keypos == anc_buff + share->keypage_header + key_reflength)
      anc_pos= 0;                               /* First key */
    else
    {
      if (!_ma_get_last_key(&anc_key, anc_page, keypos))
        goto err;
      anc_pos= anc_key.data;
    }
    if (!(endpos= _ma_find_half_pos(&leaf_key, &next_page, &half_pos)))
      goto err;

    /* Correct new keypointer to leaf_page */
    _ma_kpointer(info,leaf_key.data + leaf_key.data_length +
                 leaf_key.ref_length, leaf_page->pos);

    /* Save key in anc_page */
    DBUG_DUMP("anc_buff", anc_buff, new_anc_length);
    DBUG_DUMP_KEY("key_to_anc", &leaf_key);
    anc_end_pos= anc_buff + new_anc_length;
    t_length=(*keyinfo->pack_key)(&leaf_key, key_reflength,
                                  keypos == anc_end_pos ? (uchar*) 0
                                  : keypos,
                                  anc_pos, anc_pos,
                                  &anc_key_inserted);
    if (t_length >= 0)
      bmove_upp(anc_end_pos+t_length, anc_end_pos,
                (uint) (anc_end_pos-keypos));
    else
      bmove(keypos,keypos-t_length,(uint) (anc_end_pos-keypos)+t_length);
    (*keyinfo->store_key)(keyinfo,keypos, &anc_key_inserted);
    new_anc_length+= t_length;
    anc_page->size= new_anc_length;
    page_store_size(share, anc_page);

    if (leaf_key.flag & (SEARCH_USER_KEY_HAS_TRANSID |
                         SEARCH_PAGE_KEY_HAS_TRANSID))
      _ma_mark_page_with_transid(share, anc_page);

    /* Store first key on new page */
    if (nod_flag)
      bmove(leaf_buff + share->keypage_header, half_pos-nod_flag,
            (size_t) nod_flag);
    if (!(*keyinfo->get_key)(&leaf_key, page_flag, nod_flag, &half_pos))
      goto err;
    DBUG_DUMP_KEY("key_to_leaf", &leaf_key);
    t_length=(*keyinfo->pack_key)(&leaf_key, nod_flag, (uchar*) 0,
                                  (uchar*) 0, (uchar*) 0, &key_inserted);
    /* t_length will always be > 0 for a new page !*/
    tmp_length= (size_t) ((next_page.buff + new_buff_length) - half_pos);
    DBUG_PRINT("info",("t_length: %d length: %d",t_length, (int) tmp_length));
    bmove(leaf_buff+p_length+t_length, half_pos, tmp_length);
    (*keyinfo->store_key)(keyinfo,leaf_buff+p_length, &key_inserted);
    new_leaf_length= tmp_length + t_length + p_length;

    leaf_page->size= new_leaf_length;
    leaf_page->flag= page_flag;
    page_store_info(share, leaf_page);

    new_buff_length= (uint) (endpos - next_page.buff);
    next_page.size= new_buff_length;
    page_store_size(share, &next_page);

    if (share->now_transactional)
    {
      /*
        Log changes to parent page
        This has one key deleted from it and one key inserted to it at
        keypos

        ma_log_add() ensures that we don't log changes that is outside of
        key block size, as the REDO code can't handle that
      */
      if (_ma_log_add(anc_page, anc_length, keypos,
                      anc_key_inserted.move_length +
                      max(anc_key_inserted.changed_length -
                          anc_key_inserted.move_length,
                          key_deleted.changed_length),
                      anc_key_inserted.move_length -
                      key_deleted.move_length, 1,KEY_OP_DEBUG_LOG_ADD_4))
        goto err;

      /*
        Log changes to leaf page.
        This contains original data with new data added first
      */
      DBUG_ASSERT(leaf_length <= new_leaf_length);
      DBUG_ASSERT(new_leaf_length >= unchanged_leaf_length);
      if (_ma_log_prefix(leaf_page, new_leaf_length - unchanged_leaf_length,
                         (int) (new_leaf_length - leaf_length),
                         KEY_OP_DEBUG_LOG_PREFIX_2))
        goto err;
      /*
        Log changes to next page
        This contains original data with some suffix data deleted

      */
      DBUG_ASSERT(new_buff_length <= buff_length);
      if (_ma_log_suffix(&next_page, buff_length, new_buff_length))
        goto err;
    }

    page_mark_changed(info, leaf_page);
    if (_ma_write_keypage(leaf_page,
                          PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
      goto err;
  }
  page_mark_changed(info, &next_page);
  if (_ma_write_keypage(&next_page,
                        PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
    goto err;

  /* Report whether anc_page itself underflowed after the key removal */
  DBUG_RETURN(new_anc_length <=
              ((info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
                (uint) keyinfo->underflow_block_length)));

err:
  DBUG_RETURN(-1);
} /* underflow */
+
+
/**
  @brief Remove a key from page

  @fn remove_key()
    keyinfo         Key handle
    page_flag       Flags for the page (KEYPAGE_FLAG_*)
    nod_flag        Length of node ptr
    keypos          Where on page key starts
    lastkey         Buffer for storing keys to be removed
    page_end        Pointer to end of page
    next_block      If <> 0 and node-page, this is set to address of
                    next page
    s_temp          Information about what changes were done on the page:
                    s_temp.key_pos        Start of key
                    s_temp.move_length    Number of bytes removed at keypos
                    s_temp.changed_length Number of bytes changed at keypos

  @todo
    The current code doesn't handle the case that the next key may be
    packed better against the previous key if there is a case difference

  @return
  @retval 0  error
  @retval #  How many chars were removed
*/

static uint remove_key(MARIA_KEYDEF *keyinfo, uint page_flag, uint nod_flag,
                       uchar *keypos, uchar *lastkey,
                       uchar *page_end, my_off_t *next_block,
                       MARIA_KEY_PARAM *s_temp)
{
  int s_length;
  uchar *start;
  DBUG_ENTER("remove_key");
  DBUG_PRINT("enter", ("keypos: 0x%lx page_end: 0x%lx",
                       (long) keypos, (long) page_end));

  start= s_temp->key_pos= keypos;
  s_temp->changed_length= 0;
  if (!(keyinfo->flag &
        (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY |
         HA_BINARY_PACK_KEY)) &&
      !(page_flag & KEYPAGE_FLAG_HAS_TRANSID))
  {
    /* Static length key: remove fixed keylength + node pointer */
    s_length=(int) (keyinfo->keylength+nod_flag);
    if (next_block && nod_flag)
      *next_block= _ma_kpos(nod_flag,keypos+s_length);
  }
  else
  {
    /* Let keypos point at next key */
    MARIA_KEY key;

    /* Calculate length of key */
    key.keyinfo= keyinfo;
    key.data= lastkey;
    if (!(*keyinfo->get_key)(&key, page_flag, nod_flag, &keypos))
      DBUG_RETURN(0);				/* Error */

    if (next_block && nod_flag)
      *next_block= _ma_kpos(nod_flag,keypos);
    s_length=(int) (keypos-start);
    if (keypos != page_end)
    {
      /*
        The following key may be prefix-packed against the one we remove;
        if so it must be unpacked (partially) so it stays valid.
      */
      if (keyinfo->flag & HA_BINARY_PACK_KEY)
      {
        uchar *old_key= start;
        uint next_length,prev_length,prev_pack_length;

        /* keypos points here on start of next key */
        get_key_length(next_length,keypos);
        get_key_pack_length(prev_length,prev_pack_length,old_key);
        if (next_length > prev_length)
        {
          uint diff= (next_length-prev_length);
          /* We have to copy data from the current key to the next key */
          keypos-= diff + prev_pack_length;
          store_key_length(keypos, prev_length);
          bmove(keypos + prev_pack_length, lastkey + prev_length, diff);
          s_length=(int) (keypos-start);
          s_temp->changed_length= diff + prev_pack_length;
        }
      }
      else
      {
        /* Check if a variable length first key part */
        if ((keyinfo->seg->flag & HA_PACK_KEY) && *keypos & 128)
        {
          /* Next key is packed against the current one */
          uint next_length,prev_length,prev_pack_length,lastkey_length,
            rest_length;
          /* Key segments >= 127 bytes use a 2-byte length prefix */
          if (keyinfo->seg[0].length >= 127)
          {
            if (!(prev_length=mi_uint2korr(start) & 32767))
              goto end;
            next_length=mi_uint2korr(keypos) & 32767;
            keypos+=2;
            prev_pack_length=2;
          }
          else
          {
            if (!(prev_length= *start & 127))
              goto end;				/* Same key as previous*/
            next_length= *keypos & 127;
            keypos++;
            prev_pack_length=1;
          }
          if (!(*start & 128))
            prev_length=0;			/* prev key not packed */
          if (keyinfo->seg[0].flag & HA_NULL_PART)
            lastkey++;				/* Skip null marker */
          get_key_length(lastkey_length,lastkey);
          if (!next_length)			/* Same key after */
          {
            next_length=lastkey_length;
            rest_length=0;
          }
          else
            get_key_length(rest_length,keypos);

          if (next_length >= prev_length)
          {
            /* Next key is based on deleted key */
            uint pack_length;
            uint diff= (next_length-prev_length);

            /* keypos points to data of next key (after key length) */
            bmove(keypos - diff, lastkey + prev_length, diff);
            rest_length+= diff;
            pack_length= prev_length ? get_pack_length(rest_length): 0;
            keypos-= diff + pack_length + prev_pack_length;
            s_length=(int) (keypos-start);
            if (prev_length)			/* Pack against prev key */
            {
              *keypos++= start[0];
              if (prev_pack_length == 2)
                *keypos++= start[1];
              store_key_length(keypos,rest_length);
            }
            else
            {
              /* Next key is not packed anymore */
              if (keyinfo->seg[0].flag & HA_NULL_PART)
              {
                rest_length++;			/* Mark not null */
              }
              if (prev_pack_length == 2)
              {
                mi_int2store(keypos,rest_length);
              }
              else
                *keypos= rest_length;
            }
            s_temp->changed_length= diff + pack_length + prev_pack_length;
          }
        }
      }
    }
  }
  end:
  /* Close the gap left by the removed key */
  bmove(start, start+s_length, (uint) (page_end-start-s_length));
  s_temp->move_length= s_length;
  DBUG_RETURN((uint) s_length);
} /* remove_key */
+
+
+/****************************************************************************
+ Logging of redos
+****************************************************************************/
+
+/**
+ @brief
+ log entry where some parts are deleted and some things are changed
+ and some data could be added last.
+
+ @fn _ma_log_delete()
+ @param info Maria handler
+ @param page Pageaddress for changed page
+ @param buff Page buffer
+ @param key_pos Start of change area
+ @param changed_length How many bytes where changed at key_pos
+ @param move_length How many bytes where deleted at key_pos
+ @param append_length Length of data added last
+ This is taken from end of ma_page->buff
+
+ This is mainly used when a key is deleted. The append happens
+ when we delete a key from a page with data > block_size kept in
+ memory and we have to add back the data that was stored > block_size
+*/
+
/**
  @brief Write a LOGREC_REDO_INDEX record describing a key delete on a page

  Builds a sequence of key-page operations (offset, change, shift/del-suffix,
  add-suffix) that Recovery can replay to redo the delete.

  @param ma_page         Index page the key was deleted from
  @param key_pos         Position in the page buffer where data changed
  @param changed_length  Number of changed bytes at key_pos
  @param move_length     Number of bytes removed from the page (must be != 0)
  @param append_length   Bytes added back at page end; only used in assertions
  @param debug_marker    Extra marker, logged only under EXTRA_DEBUG_KEY_CHANGES

  @return 0 ok, 1 error while writing the log record
*/

my_bool _ma_log_delete(MARIA_PAGE *ma_page, const uchar *key_pos,
                       uint changed_length, uint move_length,
                       uint append_length __attribute__((unused)),
                       enum en_key_debug debug_marker __attribute__((unused)))
{
  LSN lsn;
  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 5+ 2 + 3 + 3 + 6 + 3 + 7];
  uchar *log_pos;
  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 7];
  uint translog_parts, current_size, extra_length;
  /* Byte offset of the change inside the page buffer */
  uint offset= (uint) (key_pos - ma_page->buff);
  MARIA_HA *info= ma_page->info;
  MARIA_SHARE *share= info->s;
  my_off_t page= ma_page->pos / share->block_size;
  DBUG_ENTER("_ma_log_delete");
  DBUG_PRINT("enter", ("page: %lu changed_length: %u move_length: %d",
                       (ulong) page, changed_length, move_length));
  DBUG_ASSERT(share->now_transactional && move_length);
  DBUG_ASSERT(offset + changed_length <= ma_page->size);
  DBUG_ASSERT(ma_page->org_size - move_length + append_length == ma_page->size);
  DBUG_ASSERT(move_length <= ma_page->org_size - share->keypage_header);

  /* Store the page number of the changed index page */
  page_store(log_data + FILEID_STORE_SIZE, page);
  log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE;
  /* Track the page size as the logged operations would leave it */
  current_size= ma_page->org_size;

#ifdef EXTRA_DEBUG_KEY_CHANGES
  /* Debug-only ops: caller's marker plus old/new page sizes */
  *log_pos++= KEY_OP_DEBUG;
  *log_pos++= debug_marker;

  *log_pos++= KEY_OP_DEBUG_2;
  int2store(log_pos, ma_page->org_size);
  int2store(log_pos+2, ma_page->size);
  log_pos+=4;
#endif

  /* Store keypage_flag */
  *log_pos++= KEY_OP_SET_PAGEFLAG;
  *log_pos++= ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET];

  log_pos[0]= KEY_OP_OFFSET;
  int2store(log_pos+1, offset);
  log_pos+= 3;
  translog_parts= TRANSLOG_INTERNAL_PARTS + 1;
  extra_length= 0;

  if (changed_length)
  {
    if (offset + changed_length >= share->max_index_block_size)
    {
      /* Change runs to (or past) the block end: clamp it, no shift needed */
      changed_length= share->max_index_block_size - offset;
      move_length= 0;                           /* Nothing to move */
      current_size= share->max_index_block_size;
    }

    log_pos[0]= KEY_OP_CHANGE;
    int2store(log_pos+1, changed_length);
    log_pos+= 3;
    /* Changed bytes are logged directly from the page buffer */
    log_array[translog_parts].str= ma_page->buff + offset;
    log_array[translog_parts].length= changed_length;
    translog_parts++;

    /* We only have to move things after offset+changed_length */
    offset+= changed_length;
  }

  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data);

  if (move_length)
  {
    uint log_length;
    if (offset + move_length < share->max_index_block_size)
    {
      /*
        Move down things that is on page.
        page_offset in apply_redo_inxed() will be at original offset
        + changed_length.
      */
      log_pos[0]= KEY_OP_SHIFT;
      int2store(log_pos+1, - (int) move_length);
      log_length= 3;
      current_size-= move_length;
    }
    else
    {
      /* Delete to end of page */
      uint tmp= current_size - offset;
      current_size= offset;
      log_pos[0]= KEY_OP_DEL_SUFFIX;
      int2store(log_pos+1, tmp);
      log_length= 3;
    }
    log_array[translog_parts].str= log_pos;
    log_array[translog_parts].length= log_length;
    translog_parts++;
    log_pos+= log_length;
    extra_length+= log_length;
  }

  if (current_size != ma_page->size &&
      current_size != share->max_index_block_size)
  {
    /* Append data that didn't fit on the page before */
    uint length= (min(ma_page->size, share->max_index_block_size) -
                  current_size);
    uchar *data= ma_page->buff + current_size;

    DBUG_ASSERT(length <= append_length);

    log_pos[0]= KEY_OP_ADD_SUFFIX;
    int2store(log_pos+1, length);
    log_array[translog_parts].str= log_pos;
    log_array[translog_parts].length= 3;
    log_array[translog_parts + 1].str= data;
    log_array[translog_parts + 1].length= length;
    log_pos+= 3;
    translog_parts+= 2;
    current_size+= length;
    extra_length+= 3 + length;
  }

  _ma_log_key_changes(ma_page,
                      log_array + translog_parts,
                      log_pos, &extra_length, &translog_parts);
  /* Remember new page length for future log entries for same page */
  ma_page->org_size= current_size;

  if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
                            info->trn, info,
                            (translog_size_t)
                            log_array[TRANSLOG_INTERNAL_PARTS].length +
                            changed_length + extra_length, translog_parts,
                            log_array, log_data, NULL))
    DBUG_RETURN(1);

  DBUG_RETURN(0);
}
+
+
+/****************************************************************************
+ Logging of undos
+****************************************************************************/
+
+my_bool _ma_write_undo_key_delete(MARIA_HA *info, const MARIA_KEY *key,
+ my_off_t new_root, LSN *res_lsn)
+{
+ MARIA_SHARE *share= info->s;
+ uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
+ KEY_NR_STORE_SIZE + PAGE_STORE_SIZE], *log_pos;
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+ struct st_msg_to_write_hook_for_undo_key msg;
+ enum translog_record_type log_type= LOGREC_UNDO_KEY_DELETE;
+ uint keynr= key->keyinfo->key_nr;
+
+ lsn_store(log_data, info->trn->undo_lsn);
+ key_nr_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, keynr);
+ log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE;
+
+ /**
+ @todo BUG if we had concurrent insert/deletes, reading state's key_root
+ like this would be unsafe.
+ */
+ if (new_root != share->state.key_root[keynr])
+ {
+ my_off_t page;
+ page= ((new_root == HA_OFFSET_ERROR) ? IMPOSSIBLE_PAGE_NO :
+ new_root / share->block_size);
+ page_store(log_pos, page);
+ log_pos+= PAGE_STORE_SIZE;
+ log_type= LOGREC_UNDO_KEY_DELETE_WITH_ROOT;
+ }
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data);
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key->data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= (key->data_length +
+ key->ref_length);
+
+ msg.root= &share->state.key_root[keynr];
+ msg.value= new_root;
+ /*
+ set autoincrement to 1 if this is an auto_increment key
+ This is only used if we are now in a rollback of a duplicate key
+ */
+ msg.auto_increment= share->base.auto_key == keynr + 1;
+
+ return translog_write_record(res_lsn, log_type,
+ info->trn, info,
+ (translog_size_t)
+ (log_array[TRANSLOG_INTERNAL_PARTS + 0].length +
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length),
+ TRANSLOG_INTERNAL_PARTS + 2, log_array,
+ log_data + LSN_STORE_SIZE, &msg) ? -1 : 0;
+}
diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c
new file mode 100644
index 00000000000..4661ea0ab59
--- /dev/null
+++ b/storage/maria/ma_delete_all.c
@@ -0,0 +1,192 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Remove all rows from a MARIA table */
+/* This clears the status information and truncates files */
+
+#include "maria_def.h"
+#include "trnman.h"
+
+/**
+ @brief deletes all rows from a table
+
+ @param info Maria handler
+
+ @note It is important that this function does not rely on the state
+ information, as it may be called by ma_apply_undo_bulk_insert() on an
+ inconsistent table left by a crash.
+
+ @return Operation status
+ @retval 0 ok
+ @retval 1 error
+*/
+
int maria_delete_all_rows(MARIA_HA *info)
{
  MARIA_SHARE *share= info->s;
  my_bool log_record;
  LSN lsn;
  DBUG_ENTER("maria_delete_all_rows");

  /* Refuse on read-only (e.g. compressed) tables */
  if (share->options & HA_OPTION_READ_ONLY_DATA)
  {
    DBUG_RETURN(my_errno=EACCES);
  }
  /**
     @todo LOCK take X-lock on table here.
     When we have versioning, if some other thread is looking at this table,
     we cannot shrink the file like this.
  */
  if (_ma_readinfo(info,F_WRLCK,1))
    DBUG_RETURN(my_errno);
  /* Only log the operation for transactional, non-temporary tables */
  log_record= share->now_transactional && !share->temporary;
  if (_ma_mark_file_changed(info))
    goto err;

  if (log_record)
  {
    /*
      This record will be used by Recovery to finish the deletion if it
      crashed. We force it to have a complete history in the log.
    */
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
    uchar log_data[FILEID_STORE_SIZE];
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DELETE_ALL,
                                       info->trn, info, 0,
                                       sizeof(log_array)/sizeof(log_array[0]),
                                       log_array, log_data, NULL) ||
                 translog_flush(lsn)))
      goto err;
    /*
      If we fail in this function after this point, log and table will be
      inconsistent.
    */
  }
  else
  {
    /* Other branch called function below when writing log record, in hook */
    _ma_reset_status(info);
  }
  /* Remove old history as the table is now empty for everyone */
  _ma_reset_state(info);

  /*
    If we are using delayed keys or if the user has done changes to the tables
    since it was locked then there may be key blocks in the page cache. Or
    there may be data blocks there. We need to throw them away or they may
    re-enter the emptied table or another table later.
  */

#ifdef HAVE_MMAP
  if (share->file_map)
    _ma_unmap_file(info);
#endif

  /* Drop cached pages, then truncate data file and index file (past header) */
  if (_ma_flush_table_files(info, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX,
                            FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED) ||
      my_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) ||
      my_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME)))
    goto err;

  if (_ma_initialize_data_file(share, info->dfile.file))
    goto err;

  if (log_record)
  {
    /*
      Because LOGREC_REDO_DELETE_ALL does not operate on pages, it has the
      following problem:
      delete_all; inserts (redo_insert); all pages get flushed; checkpoint:
      the dirty pages list will be empty. In recovery, delete_all is executed,
      but redo_insert are skipped (dirty pages list is empty).
      To avoid this, we need to set skip_redo_lsn now, and thus need to sync
      files.
      Also fixes the problem of:
      bulk insert; insert; delete_all; crash:
      "bulk insert" is skipped (no REDOs), so if "insert" would not be skipped
      (if we didn't update skip_redo_lsn below) then "insert" would be tried
      and fail, saying that it sees that the first page has to be created
      though the inserted row has rownr>0.
    */
    my_bool error= _ma_state_info_write(share,
                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
                                        MA_STATE_INFO_WRITE_LOCK) ||
      _ma_update_state_lsns(share, lsn, trnman_get_min_trid(), FALSE, FALSE) ||
      _ma_sync_table_files(info);
    info->trn->rec_lsn= LSN_IMPOSSIBLE;
    if (error)
      goto err;
  }

  VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
#ifdef HAVE_MMAP
  /* Map again */
  if (share->file_map)
    _ma_dynmap_file(info, (my_off_t) 0);
#endif
  allow_break();			/* Allow SIGHUP & SIGINT */
  DBUG_RETURN(0);

err:
  {
    /* Preserve the original error across the cleanup calls */
    int save_errno=my_errno;
    VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
    info->update|=HA_STATE_WRITTEN;	/* Buffer changed */
    allow_break();			/* Allow SIGHUP & SIGINT */
    DBUG_RETURN(my_errno=save_errno);
  }
} /* maria_delete_all_rows */
+
+
+/*
+ Reset status information
+
+ SYNOPSIS
+ _ma_reset_status()
+ maria Maria handler
+
+ DESCRIPTION
+ Resets data and index file information as if the file would be empty
+ Files are not touched.
+*/
+
+void _ma_reset_status(MARIA_HA *info)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_STATE_INFO *state= &share->state;
+ uint i;
+ DBUG_ENTER("_ma_reset_status");
+
+ state->split= 0;
+ state->state.records= state->state.del= 0;
+ state->changed= 0; /* File is optimized */
+ state->dellink= HA_OFFSET_ERROR;
+ state->sortkey= (ushort) ~0;
+ state->state.key_file_length= share->base.keystart;
+ state->state.data_file_length= 0;
+ state->state.empty= state->state.key_empty= 0;
+ state->state.checksum= 0;
+
+ *info->state= state->state;
+
+ /* Drop the delete key chain. */
+ state->key_del= HA_OFFSET_ERROR;
+ /* Clear all keys */
+ for (i=0 ; i < share->base.keys ; i++)
+ state->key_root[i]= HA_OFFSET_ERROR;
+ DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_delete_table.c b/storage/maria/ma_delete_table.c
new file mode 100644
index 00000000000..0237bb884c5
--- /dev/null
+++ b/storage/maria/ma_delete_table.c
@@ -0,0 +1,107 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "ma_fulltext.h"
+#include "trnman_public.h"
+
+/**
+ @brief drops (deletes) a table
+
+ @param name table's name
+
+ @return Operation status
+ @retval 0 ok
+ @retval 1 error
+*/
+
int maria_delete_table(const char *name)
{
  char from[FN_REFLEN];
#ifdef USE_RAID
  uint raid_type=0,raid_chunks=0;
#endif
  MARIA_HA *info;
  myf sync_dir;
  DBUG_ENTER("maria_delete_table");

#ifdef EXTRA_DEBUG
  _ma_check_table_is_closed(name,"delete");
#endif
  /** @todo LOCK take X-lock on table */
  /*
    We need to know if this table is transactional.
    When built with RAID support, we also need to determine if this table
    makes use of the raid feature. If yes, we need to remove all raid
    chunks. This is done with my_raid_delete(). Unfortunately it is
    necessary to open the table just to check this. We use
    'open_for_repair' to be able to open even a crashed table. If even
    this open fails, we assume no raid configuration for this table
    and try to remove the normal data file only. This may however
    leave the raid chunks behind.
  */
  if (!(info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR)))
  {
    /* Could not open: assume non-raid, don't sync the directory */
#ifdef USE_RAID
    raid_type= 0;
#endif
    sync_dir= 0;
  }
  else
  {
#ifdef USE_RAID
    raid_type= info->s->base.raid_type;
    raid_chunks= info->s->base.raid_chunks;
#endif
    /* Only sync the directory for transactional, non-temporary tables
       outside of recovery */
    sync_dir= (info->s->now_transactional && !info->s->temporary &&
               !maria_in_recovery) ?
      MY_SYNC_DIR : 0;
    maria_close(info);
  }

  if (sync_dir)
  {
    /*
      For this log record to be of any use for Recovery, we need the upper
      MySQL layer to be crash-safe in DDLs.
      For now this record can serve when we apply logs to a backup, so we sync
      it.
    */
    LSN lsn;
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar*)name;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= strlen(name) + 1;
    if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DROP_TABLE,
                                       &dummy_transaction_object, NULL,
                                       (translog_size_t)
                                       log_array[TRANSLOG_INTERNAL_PARTS +
                                                 0].length,
                                       sizeof(log_array)/sizeof(log_array[0]),
                                       log_array, NULL, NULL) ||
                 translog_flush(lsn)))
      DBUG_RETURN(1);
  }

  /* Remove the index file, then the data file */
  fn_format(from,name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
  if (my_delete_with_symlink(from, MYF(MY_WME | sync_dir)))
    DBUG_RETURN(my_errno);
  fn_format(from,name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
#ifdef USE_RAID
  if (raid_type)
    DBUG_RETURN(my_raid_delete(from, raid_chunks, MYF(MY_WME | sync_dir)) ?
                my_errno : 0);
#endif
  DBUG_RETURN(my_delete_with_symlink(from, MYF(MY_WME | sync_dir)) ?
              my_errno : 0);
}
diff --git a/storage/maria/ma_dynrec.c b/storage/maria/ma_dynrec.c
new file mode 100644
index 00000000000..57b76b713f4
--- /dev/null
+++ b/storage/maria/ma_dynrec.c
@@ -0,0 +1,2042 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Functions to handle space-packed-records and blobs
+
+ A row may be stored in one or more linked blocks.
+ The block size is between MARIA_MIN_BLOCK_LENGTH and MARIA_MAX_BLOCK_LENGTH.
+ Each block is aligned on MARIA_DYN_ALIGN_SIZE.
+ The reason for the max block size is to not have too many different types
+ of blocks. For the different block types, look at _ma_get_block_info()
+*/
+
+#include "maria_def.h"
+
+static my_bool write_dynamic_record(MARIA_HA *info,const uchar *record,
+ ulong reclength);
+static int _ma_find_writepos(MARIA_HA *info,ulong reclength,my_off_t *filepos,
+ ulong *length);
+static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos,
+ uchar *record, ulong reclength);
+static my_bool delete_dynamic_record(MARIA_HA *info,MARIA_RECORD_POS filepos,
+ uint second_read);
+static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos,
+ uint length);
+
+#ifdef THREAD
+/* Play it safe; We have a small stack when using threads */
+#undef my_alloca
+#undef my_afree
+#define my_alloca(A) my_malloc((A),MYF(0))
+#define my_afree(A) my_free((A),MYF(0))
+#endif
+
+ /* Interface function from MARIA_HA */
+
+#ifdef HAVE_MMAP
+
+/*
+ Create mmaped area for MARIA handler
+
+ SYNOPSIS
+ _ma_dynmap_file()
+ info MARIA handler
+
+ RETURN
+ 0 ok
+ 1 error.
+*/
+
my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size)
{
  DBUG_ENTER("_ma_dynmap_file");
  /* Refuse if size (+ margin) does not fit into size_t on this platform */
  if (size > (my_off_t) (~((size_t) 0)) - MEMMAP_EXTRA_MARGIN)
  {
    DBUG_PRINT("warning", ("File is too large for mmap"));
    DBUG_RETURN(1);
  }
  /*
    Ingo wonders if it is good to use MAP_NORESERVE. From the Linux man page:
    MAP_NORESERVE
      Do not reserve swap space for this mapping. When swap space is
      reserved, one has the guarantee that it is possible to modify the
      mapping. When swap space is not reserved one might get SIGSEGV
      upon a write if no physical memory is available.
  */
  /* Map read-only or read-write depending on how the table was opened */
  info->s->file_map= (uchar*)
    my_mmap(0, (size_t)(size + MEMMAP_EXTRA_MARGIN),
            info->s->mode==O_RDONLY ? PROT_READ :
            PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_NORESERVE,
            info->dfile.file, 0L);
  if (info->s->file_map == (uchar*) MAP_FAILED)
  {
    info->s->file_map= NULL;
    DBUG_RETURN(1);
  }
#if defined(HAVE_MADVISE)
  madvise((char*) info->s->file_map, size, MADV_RANDOM);
#endif
  info->s->mmaped_length= size;
  DBUG_RETURN(0);
}
+
+
+/*
+ Resize mmaped area for MARIA handler
+
+ SYNOPSIS
+ _ma_remap_file()
+ info MARIA handler
+
+ RETURN
+*/
+
+void _ma_remap_file(MARIA_HA *info, my_off_t size)
+{
+ if (info->s->file_map)
+ {
+ VOID(my_munmap((char*) info->s->file_map,
+ (size_t) info->s->mmaped_length + MEMMAP_EXTRA_MARGIN));
+ _ma_dynmap_file(info, size);
+ }
+}
+#endif
+
+
+/*
+ Read bytes from a Maria handler, using mmap or pread
+
+ SYNOPSIS
+ _ma_mmap_pread()
+ info MARIA handler
+ Buffer Input buffer
+ Count Count of bytes for read
+ offset Start position
+ MyFlags
+
+ RETURN
+ 0 ok
+*/
+
size_t _ma_mmap_pread(MARIA_HA *info, uchar *Buffer,
                      size_t Count, my_off_t offset, myf MyFlags)
{
  DBUG_PRINT("info", ("maria_read with mmap %d\n", info->dfile.file));
  /* Shared lock so reads can run concurrently but not during a remap */
  if (info->s->lock_key_trees)
    rw_rdlock(&info->s->mmap_lock);

  /*
    The following test may fail in the following cases:
    - We failed to remap a memory area (fragmented memory?)
    - This thread has done some writes, but not yet extended the
    memory mapped area.
  */

  if (info->s->mmaped_length >= offset + Count)
  {
    /* Whole range is inside the mapping: plain memcpy */
    memcpy(Buffer, info->s->file_map + offset, Count);
    if (info->s->lock_key_trees)
      rw_unlock(&info->s->mmap_lock);
    return 0;
  }
  else
  {
    /* Range extends past the mapping: fall back to a normal pread */
    if (info->s->lock_key_trees)
      rw_unlock(&info->s->mmap_lock);
    return my_pread(info->dfile.file, Buffer, Count, offset, MyFlags);
  }
}
+
+
/* Read wrapper used when the data file is not memory mapped: plain pread */

size_t _ma_nommap_pread(MARIA_HA *info, uchar *Buffer,
                        size_t Count, my_off_t offset, myf MyFlags)
{
  return my_pread(info->dfile.file, Buffer, Count, offset, MyFlags);
}
+
+
+/*
+ Write bytes to a Maria handler, using mmap or pwrite
+
+ SYNOPSIS
+ _ma_mmap_pwrite()
+ info MARIA handler
+ Buffer Output buffer
+ Count Count of bytes for write
+ offset Start position
+ MyFlags
+
+ RETURN
+ 0 ok
+ !=0 error. In this case return error from pwrite
+*/
+
size_t _ma_mmap_pwrite(MARIA_HA *info, const uchar *Buffer,
                       size_t Count, my_off_t offset, myf MyFlags)
{
  DBUG_PRINT("info", ("maria_write with mmap %d\n", info->dfile.file));
  /* Shared lock: writes into the mapping may run concurrently with reads,
     but not with a remap */
  if (info->s->lock_key_trees)
    rw_rdlock(&info->s->mmap_lock);

  /*
    The following test may fail in the following cases:
    - We failed to remap a memory area (fragmented memory?)
    - This thread has done some writes, but not yet extended the
    memory mapped area.
  */

  if (info->s->mmaped_length >= offset + Count)
  {
    /* Whole range inside the mapping: write through memcpy */
    memcpy(info->s->file_map + offset, Buffer, Count);
    if (info->s->lock_key_trees)
      rw_unlock(&info->s->mmap_lock);
    return 0;
  }
  else
  {
    /* Past the mapping: count it and fall back to a normal pwrite */
    info->s->nonmmaped_inserts++;
    if (info->s->lock_key_trees)
      rw_unlock(&info->s->mmap_lock);
    return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags);
  }

}
+
+
/* Write wrapper used when the data file is not memory mapped: plain pwrite */

size_t _ma_nommap_pwrite(MARIA_HA *info, const uchar *Buffer,
                         size_t Count, my_off_t offset, myf MyFlags)
{
  return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags);
}
+
+
+my_bool _ma_write_dynamic_record(MARIA_HA *info, const uchar *record)
+{
+ ulong reclength= _ma_rec_pack(info,info->rec_buff + MARIA_REC_BUFF_OFFSET,
+ record);
+ return (write_dynamic_record(info,info->rec_buff + MARIA_REC_BUFF_OFFSET,
+ reclength));
+}
+
+my_bool _ma_update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+ const uchar *oldrec __attribute__ ((unused)),
+ const uchar *record)
+{
+ uint length= _ma_rec_pack(info, info->rec_buff + MARIA_REC_BUFF_OFFSET,
+ record);
+ return (update_dynamic_record(info, pos,
+ info->rec_buff + MARIA_REC_BUFF_OFFSET,
+ length));
+}
+
+
my_bool _ma_write_blob_record(MARIA_HA *info, const uchar *record)
{
  uchar *rec_buff;
  int error;
  ulong reclength,reclength2,extra;

  /* Worst-case space for block headers / split overhead around the row */
  extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+
	  MARIA_DYN_DELETE_BLOCK_HEADER+1);
  /* Upper bound for the packed row: fixed part + all blob data + overhead */
  reclength= (info->s->base.pack_reclength +
	      _ma_calc_total_blob_length(info,record)+ extra);
  if (!(rec_buff=(uchar*) my_alloca(reclength)))
  {
    my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */
    return(1);
  }
  /* Pack after the header area; actual length may be below the estimate */
  reclength2= _ma_rec_pack(info,
			   rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
			   record);
  DBUG_PRINT("info",("reclength: %lu reclength2: %lu",
		     reclength, reclength2));
  DBUG_ASSERT(reclength2 <= reclength);
  error= write_dynamic_record(info,
                              rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
                              reclength2);
  my_afree(rec_buff);
  return(error != 0);
}
+
+
my_bool _ma_update_blob_record(MARIA_HA *info, MARIA_RECORD_POS pos,
                               const uchar *oldrec __attribute__ ((unused)),
                               const uchar *record)
{
  uchar *rec_buff;
  int error;
  ulong reclength,extra;

  /* Worst-case space for block headers / split overhead around the row */
  extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+
	  MARIA_DYN_DELETE_BLOCK_HEADER);
  /* First use of 'reclength': allocation estimate for the packed row */
  reclength= (info->s->base.pack_reclength+
	      _ma_calc_total_blob_length(info,record)+ extra);
#ifdef NOT_USED					/* We now support big rows */
  if (reclength > MARIA_DYN_MAX_ROW_LENGTH)
  {
    my_errno=HA_ERR_TO_BIG_ROW;
    return 1;
  }
#endif
  if (!(rec_buff=(uchar*) my_alloca(reclength)))
  {
    my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */
    return(1);
  }
  /* 'reclength' is reused: now it is the actual packed length */
  reclength= _ma_rec_pack(info,rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
			  record);
  error=update_dynamic_record(info,pos,
			      rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER),
			      reclength);
  my_afree(rec_buff);
  return(error != 0);
}
+
+
/* Delete the dynamic record at the position of the last read row */

my_bool _ma_delete_dynamic_record(MARIA_HA *info,
                                  const uchar *record __attribute__ ((unused)))
{
  return delete_dynamic_record(info, info->cur_row.lastpos, 0);
}
+
+
+/**
+ Write record to data-file.
+
+ @todo it's cheating: it casts "const uchar*" to uchar*.
+*/
+
static my_bool write_dynamic_record(MARIA_HA *info, const uchar *record,
                                    ulong reclength)
{
  int flag;
  ulong length;
  my_off_t filepos;
  DBUG_ENTER("write_dynamic_record");

  flag=0;

  /*
    Check if we have enough room for the new record.
    First we do simplified check to make usual case faster.
    Then we do more precise check for the space left.
    Though it still is not absolutely precise, as
    we always use MARIA_MAX_DYN_BLOCK_HEADER while it can be
    less in the most of the cases.
  */

  if (unlikely(info->s->base.max_data_file_length -
               info->state->data_file_length <
               reclength + MARIA_MAX_DYN_BLOCK_HEADER))
  {
    /* Precise check also counts space reclaimable from deleted blocks */
    if (info->s->base.max_data_file_length - info->state->data_file_length +
        info->state->empty - info->state->del * MARIA_MAX_DYN_BLOCK_HEADER <
        reclength + MARIA_MAX_DYN_BLOCK_HEADER)
    {
      my_errno=HA_ERR_RECORD_FILE_FULL;
      DBUG_RETURN(1);
    }
  }

  /* Write the record over one or more blocks; _ma_write_part_record()
     advances 'record' and decreases 'reclength' until all data is written */
  do
  {
    if (_ma_find_writepos(info,reclength,&filepos,&length))
      goto err;
    if (_ma_write_part_record(info,filepos,length,
                              (info->append_insert_at_end ?
                               HA_OFFSET_ERROR : info->s->state.dellink),
			      (uchar**) &record,&reclength,&flag))
      goto err;
  } while (reclength);

  DBUG_RETURN(0);
err:
  DBUG_RETURN(1);
}
+
+
+ /* Get a block for data ; The given data-area must be used !! */
+
static int _ma_find_writepos(MARIA_HA *info,
			     ulong reclength, /* record length */
			     my_off_t *filepos, /* Return file pos */
			     ulong *length)   /* length of block at filepos */
{
  MARIA_BLOCK_INFO block_info;
  ulong tmp;
  DBUG_ENTER("_ma_find_writepos");

  if (info->s->state.dellink != HA_OFFSET_ERROR &&
      !info->append_insert_at_end)
  {
    /* Deleted blocks exists;  Get last used block */
    *filepos=info->s->state.dellink;
    block_info.second_read=0;
    info->rec_cache.seek_not_done=1;
    /* Verify the head of the delete chain really is a deleted block */
    if (!(_ma_get_block_info(&block_info, info->dfile.file,
                             info->s->state.dellink) &
	  BLOCK_DELETED))
    {
      DBUG_PRINT("error",("Delete link crashed"));
      my_errno=HA_ERR_WRONG_IN_RECORD;
      DBUG_RETURN(-1);
    }
    /* Unlink the block from the chain and adjust delete statistics */
    info->s->state.dellink=block_info.next_filepos;
    info->state->del--;
    info->state->empty-= block_info.block_len;
    *length= block_info.block_len;
  }
  else
  {
    /* No deleted blocks;  Allocate a new block */
    *filepos=info->state->data_file_length;
    /* Block length: data + 3 byte header (+1 if a long block is needed),
       rounded up to min_block_length or MARIA_DYN_ALIGN_SIZE */
    if ((tmp=reclength+3 + test(reclength >= (65520-3))) <
	info->s->base.min_block_length)
      tmp= info->s->base.min_block_length;
    else
      tmp= ((tmp+MARIA_DYN_ALIGN_SIZE-1) &
	    (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1)));
    if (info->state->data_file_length >
	(info->s->base.max_data_file_length - tmp))
    {
      my_errno=HA_ERR_RECORD_FILE_FULL;
      DBUG_RETURN(-1);
    }
    /* Cap to max block length; caller loops for the remaining data */
    if (tmp > MARIA_MAX_BLOCK_LENGTH)
      tmp=MARIA_MAX_BLOCK_LENGTH;
    *length= tmp;
    info->state->data_file_length+= tmp;
    info->s->state.split++;
    info->update|=HA_STATE_WRITE_AT_END;
  }
  DBUG_RETURN(0);
} /* _ma_find_writepos */
+
+
+
+/*
+ Unlink a deleted block from the deleted list.
+ This block will be combined with the preceding or next block to form
+ a big block.
+*/
+
static my_bool unlink_deleted_block(MARIA_HA *info,
                                    MARIA_BLOCK_INFO *block_info)
{
  DBUG_ENTER("unlink_deleted_block");
  if (block_info->filepos == info->s->state.dellink)
  {
    /* First deleted block;  We can just use this ! */
    info->s->state.dellink=block_info->next_filepos;
  }
  else
  {
    MARIA_BLOCK_INFO tmp;
    tmp.second_read=0;
    /* Unlink block from the previous block */
    if (!(_ma_get_block_info(&tmp, info->dfile.file, block_info->prev_filepos)
	  & BLOCK_DELETED))
      DBUG_RETURN(1);				/* Something is wrong */
    /* prev->next := block->next (8-byte pointer at header offset 4) */
    mi_sizestore(tmp.header+4,block_info->next_filepos);
    if (info->s->file_write(info, tmp.header+4,8,
                            block_info->prev_filepos+4, MYF(MY_NABP)))
      DBUG_RETURN(1);
    /* Unlink block from next block */
    if (block_info->next_filepos != HA_OFFSET_ERROR)
    {
      if (!(_ma_get_block_info(&tmp, info->dfile.file,
                               block_info->next_filepos)
	    & BLOCK_DELETED))
	DBUG_RETURN(1);				/* Something is wrong */
      /* next->prev := block->prev (8-byte pointer at header offset 12) */
      mi_sizestore(tmp.header+12,block_info->prev_filepos);
      if (info->s->file_write(info, tmp.header+12,8,
                              block_info->next_filepos+12,
                              MYF(MY_NABP)))
	DBUG_RETURN(1);
    }
  }
  /* We now have one less deleted block */
  info->state->del--;
  info->state->empty-= block_info->block_len;
  info->s->state.split--;

  /*
    If this was a block that we where accessing through table scan
    (maria_rrnd() or maria_scan(), then ensure that we skip over this block
    when doing next maria_rrnd() or maria_scan().
  */
  if (info->cur_row.nextpos == block_info->filepos)
    info->cur_row.nextpos+= block_info->block_len;
  DBUG_RETURN(0);
}
+
+
+/*
+ Add a backward link to delete block
+
+ SYNOPSIS
+ update_backward_delete_link()
+ info MARIA handler
+ delete_block Position to delete block to update.
+ If this is 'HA_OFFSET_ERROR', nothing will be done
+ filepos Position to block that 'delete_block' should point to
+
+ RETURN
+ 0 ok
+ 1 error. In this case my_error is set.
+*/
+
+static my_bool update_backward_delete_link(MARIA_HA *info,
+ my_off_t delete_block,
+ MARIA_RECORD_POS filepos)
+{
+ MARIA_BLOCK_INFO block_info;
+ DBUG_ENTER("update_backward_delete_link");
+
+ if (delete_block != HA_OFFSET_ERROR)
+ {
+ block_info.second_read=0;
+ if (_ma_get_block_info(&block_info, info->dfile.file, delete_block)
+ & BLOCK_DELETED)
+ {
+ uchar buff[8];
+ mi_sizestore(buff,filepos);
+ if (info->s->file_write(info,buff, 8, delete_block+12, MYF(MY_NABP)))
+ DBUG_RETURN(1); /* Error on write */
+ }
+ else
+ {
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ DBUG_RETURN(1); /* Wrong delete link */
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+/* Delete datarecord from database */
+/* info->rec_cache.seek_not_done is updated in cmp_record */
+
static my_bool delete_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos,
                                     uint second_read)
{
  uint length,b_type;
  MARIA_BLOCK_INFO block_info,del_block;
  int error;
  my_bool remove_next_block;
  DBUG_ENTER("delete_dynamic_record");

  /* First add a link from the last block to the new one */
  error= update_backward_delete_link(info, info->s->state.dellink, filepos);

  block_info.second_read=second_read;
  do
  {
    /* Remove block at 'filepos' */
    if ((b_type= _ma_get_block_info(&block_info, info->dfile.file, filepos))
        & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
           BLOCK_FATAL_ERROR) ||
        (length=(uint) (block_info.filepos-filepos) +block_info.block_len) <
	MARIA_MIN_BLOCK_LENGTH)
    {
      my_errno=HA_ERR_WRONG_IN_RECORD;
      DBUG_RETURN(1);
    }
    /* Check if next block is a delete block */
    del_block.second_read=0;
    remove_next_block=0;
    /* If the following block is deleted and the merge stays under the max
       block length, merge the two into one bigger deleted block */
    if (_ma_get_block_info(&del_block, info->dfile.file, filepos + length) &
        BLOCK_DELETED && del_block.block_len+length <
        MARIA_DYN_MAX_BLOCK_LENGTH)
    {
      /* We can't remove this yet as this block may be the head block */
      remove_next_block=1;
      length+=del_block.block_len;
    }

    /* Rewrite this block's header as a deleted block: type 0, length,
       forward link to old dellink head, backward link (none if last) */
    block_info.header[0]=0;
    mi_int3store(block_info.header+1,length);
    mi_sizestore(block_info.header+4,info->s->state.dellink);
    if (b_type & BLOCK_LAST)
      bfill(block_info.header+12,8,255);
    else
      mi_sizestore(block_info.header+12,block_info.next_filepos);
    if (info->s->file_write(info, block_info.header, 20, filepos,
		  MYF(MY_NABP)))
      DBUG_RETURN(1);
    /* This block becomes the new head of the delete chain */
    info->s->state.dellink = filepos;
    info->state->del++;
    info->state->empty+=length;
    filepos=block_info.next_filepos;

    /* Now it's safe to unlink the deleted block directly after this one */
    if (remove_next_block && unlink_deleted_block(info,&del_block))
      error=1;
  } while (!(b_type & BLOCK_LAST));

  DBUG_RETURN(error);
}
+
+
+ /* Write a block to datafile */
+
+int _ma_write_part_record(MARIA_HA *info,
+ my_off_t filepos, /* points at empty block */
+ ulong length, /* length of block */
+ my_off_t next_filepos,/* Next empty block */
+ uchar **record, /* pointer to record ptr */
+ ulong *reclength, /* length of *record */
+ int *flag) /* *flag == 0 if header */
+{
+ ulong head_length,res_length,extra_length,long_block,del_length;
+ uchar *pos,*record_end;
+ my_off_t next_delete_block;
+ uchar temp[MARIA_SPLIT_LENGTH+MARIA_DYN_DELETE_BLOCK_HEADER];
+ DBUG_ENTER("_ma_write_part_record");
+
+ next_delete_block=HA_OFFSET_ERROR;
+
+ res_length=extra_length=0;
+ if (length > *reclength + MARIA_SPLIT_LENGTH)
+ { /* Split big block */
+ res_length=MY_ALIGN(length- *reclength - MARIA_EXTEND_BLOCK_LENGTH,
+ MARIA_DYN_ALIGN_SIZE);
+ length-= res_length; /* Use this for first part */
+ }
+ long_block= (length < 65520L && *reclength < 65520L) ? 0 : 1;
+ if (length == *reclength+ 3 + long_block)
+ {
+ /* Block is exactly of the right length */
+ temp[0]=(uchar) (1+ *flag)+(uchar) long_block; /* Flag is 0 or 6 */
+ if (long_block)
+ {
+ mi_int3store(temp+1,*reclength);
+ head_length=4;
+ }
+ else
+ {
+ mi_int2store(temp+1,*reclength);
+ head_length=3;
+ }
+ }
+ else if (length-long_block < *reclength+4)
+ { /* To short block */
+ if (next_filepos == HA_OFFSET_ERROR)
+ next_filepos= (info->s->state.dellink != HA_OFFSET_ERROR &&
+ !info->append_insert_at_end ?
+ info->s->state.dellink : info->state->data_file_length);
+ if (*flag == 0) /* First block */
+ {
+ if (*reclength > MARIA_MAX_BLOCK_LENGTH)
+ {
+ head_length= 16;
+ temp[0]=13;
+ mi_int4store(temp+1,*reclength);
+ mi_int3store(temp+5,length-head_length);
+ mi_sizestore(temp+8,next_filepos);
+ }
+ else
+ {
+ head_length=5+8+long_block*2;
+ temp[0]=5+(uchar) long_block;
+ if (long_block)
+ {
+ mi_int3store(temp+1,*reclength);
+ mi_int3store(temp+4,length-head_length);
+ mi_sizestore(temp+7,next_filepos);
+ }
+ else
+ {
+ mi_int2store(temp+1,*reclength);
+ mi_int2store(temp+3,length-head_length);
+ mi_sizestore(temp+5,next_filepos);
+ }
+ }
+ }
+ else
+ {
+ head_length=3+8+long_block;
+ temp[0]=11+(uchar) long_block;
+ if (long_block)
+ {
+ mi_int3store(temp+1,length-head_length);
+ mi_sizestore(temp+4,next_filepos);
+ }
+ else
+ {
+ mi_int2store(temp+1,length-head_length);
+ mi_sizestore(temp+3,next_filepos);
+ }
+ }
+ }
+ else
+ { /* Block with empty info last */
+ head_length=4+long_block;
+ extra_length= length- *reclength-head_length;
+ temp[0]= (uchar) (3+ *flag)+(uchar) long_block; /* 3,4 or 9,10 */
+ if (long_block)
+ {
+ mi_int3store(temp+1,*reclength);
+ temp[4]= (uchar) (extra_length);
+ }
+ else
+ {
+ mi_int2store(temp+1,*reclength);
+ temp[3]= (uchar) (extra_length);
+ }
+ length= *reclength+head_length; /* Write only what is needed */
+ }
+ DBUG_DUMP("header", temp, head_length);
+
+ /* Make a long block for one write */
+ record_end= *record+length-head_length;
+ del_length=(res_length ? MARIA_DYN_DELETE_BLOCK_HEADER : 0);
+ bmove((*record-head_length), temp, head_length);
+ memcpy(temp,record_end,(size_t) (extra_length+del_length));
+ bzero(record_end, extra_length);
+
+ if (res_length)
+ {
+ /* Check first if we can join this block with the next one */
+ MARIA_BLOCK_INFO del_block;
+ my_off_t next_block=filepos+length+extra_length+res_length;
+
+ del_block.second_read=0;
+ if (next_block < info->state->data_file_length &&
+ info->s->state.dellink != HA_OFFSET_ERROR)
+ {
+ if ((_ma_get_block_info(&del_block, info->dfile.file, next_block)
+ & BLOCK_DELETED) &&
+ res_length + del_block.block_len < MARIA_DYN_MAX_BLOCK_LENGTH)
+ {
+ if (unlink_deleted_block(info,&del_block))
+ goto err;
+ res_length+=del_block.block_len;
+ }
+ }
+
+ /* Create a delete link of the last part of the block */
+ pos=record_end+extra_length;
+ pos[0]= '\0';
+ mi_int3store(pos+1,res_length);
+ mi_sizestore(pos+4,info->s->state.dellink);
+ bfill(pos+12,8,255); /* End link */
+ next_delete_block=info->s->state.dellink;
+ info->s->state.dellink= filepos+length+extra_length;
+ info->state->del++;
+ info->state->empty+=res_length;
+ info->s->state.split++;
+ }
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ info->update & HA_STATE_WRITE_AT_END)
+ {
+ if (info->update & HA_STATE_EXTEND_BLOCK)
+ {
+ info->update&= ~HA_STATE_EXTEND_BLOCK;
+ if (my_block_write(&info->rec_cache, *record-head_length,
+ length+extra_length+del_length,filepos))
+ goto err;
+ }
+ else if (my_b_write(&info->rec_cache, *record-head_length,
+ length+extra_length+del_length))
+ goto err;
+ }
+ else
+ {
+ info->rec_cache.seek_not_done=1;
+ if (info->s->file_write(info, *record-head_length,
+ length+extra_length+
+ del_length,filepos,info->s->write_flag))
+ goto err;
+ }
+ memcpy(record_end,temp,(size_t) (extra_length+del_length));
+ *record=record_end;
+ *reclength-=(length-head_length);
+ *flag=6;
+
+ if (del_length)
+ {
+ /* link the next delete block to this */
+ if (update_backward_delete_link(info, next_delete_block,
+ info->s->state.dellink))
+ goto err;
+ }
+
+ DBUG_RETURN(0);
+err:
+ DBUG_PRINT("exit",("errno: %d",my_errno));
+ DBUG_RETURN(1);
+} /* _ma_write_part_record */
+
+
+/*
+  Update (rewrite) a dynamic record in the datafile.
+
+  Walks the chain of blocks that made up the old record starting at
+  'filepos', reusing each block for the new data.  When a block is too
+  small it may be extended at end-of-file or joined with a following
+  deleted block.  Any leftover blocks of the old record are deleted at
+  the end.
+
+  Returns 0 on success, 1 on error (my_errno set).
+*/
+
+static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos,
+ uchar *record, ulong reclength)
+{
+ int flag;
+ uint error;
+ ulong length;
+ MARIA_BLOCK_INFO block_info;
+ DBUG_ENTER("update_dynamic_record");
+
+ flag=block_info.second_read=0;
+ /*
+ Check if we have enough room for the record.
+ First we do simplified check to make usual case faster.
+ Then we do more precise check for the space left.
+ Though it still is not absolutely precise, as
+ we always use MARIA_MAX_DYN_BLOCK_HEADER while it can be
+ less in most of the cases.
+ */
+
+ /*
+ compare with just the reclength as we're going
+ to get some space from the old replaced record
+ */
+ if (unlikely(info->s->base.max_data_file_length -
+ info->state->data_file_length < reclength))
+ {
+ /* If new record isn't longer, we can go on safely */
+ if (info->cur_row.total_length < reclength)
+ {
+ if (info->s->base.max_data_file_length - info->state->data_file_length +
+ info->state->empty - info->state->del * MARIA_MAX_DYN_BLOCK_HEADER <
+ reclength - info->cur_row.total_length + MARIA_MAX_DYN_BLOCK_HEADER)
+ {
+ my_errno=HA_ERR_RECORD_FILE_FULL;
+ goto err;
+ }
+ }
+ }
+ /* Remember length for updated row if it's updated again */
+ info->cur_row.total_length= reclength;
+
+ while (reclength > 0)
+ {
+ if (filepos != info->s->state.dellink)
+ {
+ block_info.next_filepos= HA_OFFSET_ERROR;
+ if ((error= _ma_get_block_info(&block_info, info->dfile.file, filepos))
+ & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+ BLOCK_FATAL_ERROR))
+ {
+ DBUG_PRINT("error",("Got wrong block info"));
+ if (!(error & BLOCK_FATAL_ERROR))
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ goto err;
+ }
+ length=(ulong) (block_info.filepos-filepos) + block_info.block_len;
+ if (length < reclength)
+ {
+ uint tmp=MY_ALIGN(reclength - length + 3 +
+ test(reclength >= 65520L),MARIA_DYN_ALIGN_SIZE);
+ /* Don't create a block bigger than MARIA_MAX_BLOCK_LENGTH */
+ tmp= min(length+tmp, MARIA_MAX_BLOCK_LENGTH)-length;
+ /* Check if we can extend this block */
+ if (block_info.filepos + block_info.block_len ==
+ info->state->data_file_length &&
+ info->state->data_file_length <
+ info->s->base.max_data_file_length-tmp)
+ {
+ /* extend file */
+ DBUG_PRINT("info",("Extending file with %d bytes",tmp));
+ if (info->cur_row.nextpos == info->state->data_file_length)
+ info->cur_row.nextpos+= tmp;
+ info->state->data_file_length+= tmp;
+ info->update|= HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK;
+ length+=tmp;
+ }
+ else if (length < MARIA_MAX_BLOCK_LENGTH - MARIA_MIN_BLOCK_LENGTH)
+ {
+ /*
+ Check if next block is a deleted block
+ Above we have MARIA_MIN_BLOCK_LENGTH to avoid the problem where
+ the next block is so small it can't be split, which could
+ cause problems
+ */
+
+ MARIA_BLOCK_INFO del_block;
+ del_block.second_read=0;
+ if (_ma_get_block_info(&del_block, info->dfile.file,
+ block_info.filepos + block_info.block_len) &
+ BLOCK_DELETED)
+ {
+ /* Use; Unlink it and extend the current block */
+ DBUG_PRINT("info",("Extending current block"));
+ if (unlink_deleted_block(info,&del_block))
+ goto err;
+ if ((length+=del_block.block_len) > MARIA_MAX_BLOCK_LENGTH)
+ {
+ /*
+ New block was too big, link overflow part back to
+ delete list
+ */
+ my_off_t next_pos;
+ ulong rest_length= length-MARIA_MAX_BLOCK_LENGTH;
+ set_if_bigger(rest_length, MARIA_MIN_BLOCK_LENGTH);
+ next_pos= del_block.filepos+ del_block.block_len - rest_length;
+
+ if (update_backward_delete_link(info, info->s->state.dellink,
+ next_pos))
+ DBUG_RETURN(1);
+
+ /* create delete link for data that didn't fit into the page */
+ del_block.header[0]=0;
+ mi_int3store(del_block.header+1, rest_length);
+ mi_sizestore(del_block.header+4,info->s->state.dellink);
+ bfill(del_block.header+12,8,255);
+ if (info->s->file_write(info, del_block.header, 20,
+ next_pos, MYF(MY_NABP)))
+ DBUG_RETURN(1);
+ info->s->state.dellink= next_pos;
+ info->s->state.split++;
+ info->state->del++;
+ info->state->empty+= rest_length;
+ length-= rest_length;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ /* We are at the head of the delete list; allocate a fresh block */
+ if (_ma_find_writepos(info,reclength,&filepos,&length))
+ goto err;
+ }
+ if (_ma_write_part_record(info,filepos,length,block_info.next_filepos,
+ &record,&reclength,&flag))
+ goto err;
+ if ((filepos=block_info.next_filepos) == HA_OFFSET_ERROR)
+ {
+ /* Start writing data on deleted blocks */
+ filepos=info->s->state.dellink;
+ }
+ }
+
+ /* Old record had more blocks than the new one needs; free the rest */
+ if (block_info.next_filepos != HA_OFFSET_ERROR)
+ if (delete_dynamic_record(info,block_info.next_filepos,1))
+ goto err;
+
+ DBUG_RETURN(0);
+err:
+ DBUG_RETURN(1);
+}
+
+
+/*
+  Pack a record into dynamic-record format. Return new reclength.
+
+  info  Maria handler
+  to    destination buffer; must be large enough for the packed record
+        plus all blob data
+  from  record in table (unpacked) format
+
+  Packing skips empty blobs, zero-valued FIELD_SKIP_ZERO columns and
+  leading/trailing spaces of FIELD_SKIP_PRESPACE/FIELD_SKIP_ENDSPACE
+  columns; one bit per packable column in the pack-bytes header records
+  whether the column was packed.
+
+  Returns the packed record length.
+*/
+
+uint _ma_rec_pack(MARIA_HA *info, register uchar *to,
+ register const uchar *from)
+{
+ uint length,new_length,flag,bit,i;
+ const uchar *pos,*end;
+ uchar *startpos,*packpos;
+ enum en_fieldtype type;
+ reg3 MARIA_COLUMNDEF *column;
+ MARIA_BLOB *blob;
+ DBUG_ENTER("_ma_rec_pack");
+
+ flag= 0;
+ bit= 1;
+ startpos= packpos=to;
+ to+= info->s->base.pack_bytes;
+ blob= info->blobs;
+ column= info->s->columndef;
+ if (info->s->base.null_bytes)
+ {
+ memcpy(to, from, info->s->base.null_bytes);
+ from+= info->s->base.null_bytes;
+ to+= info->s->base.null_bytes;
+ }
+
+ for (i=info->s->base.fields ; i-- > 0; from+= length, column++)
+ {
+ length=(uint) column->length;
+ if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL)
+ {
+ if (type == FIELD_BLOB)
+ {
+ if (!blob->length)
+ flag|=bit;
+ else
+ {
+ char *temp_pos;
+ size_t tmp_length=length-portable_sizeof_char_ptr;
+ memcpy(to,from,tmp_length);
+ memcpy_fixed(&temp_pos,from+tmp_length,sizeof(char*));
+ memcpy(to+tmp_length,temp_pos,(size_t) blob->length);
+ to+=tmp_length+blob->length;
+ }
+ blob++;
+ }
+ else if (type == FIELD_SKIP_ZERO)
+ {
+ if (memcmp(from, maria_zero_string, length) == 0)
+ flag|=bit;
+ else
+ {
+ memcpy(to, from, (size_t) length);
+ to+=length;
+ }
+ }
+ else if (type == FIELD_SKIP_ENDSPACE ||
+ type == FIELD_SKIP_PRESPACE)
+ {
+ pos= from; end= from + length;
+ if (type == FIELD_SKIP_ENDSPACE)
+ { /* Pack trailing spaces */
+ while (end > from && *(end-1) == ' ')
+ end--;
+ }
+ else
+ { /* Pack pre-spaces */
+ while (pos < end && *pos == ' ')
+ pos++;
+ }
+ new_length=(uint) (end-pos);
+ /* Only pack if it actually saves space, including the length byte(s) */
+ if (new_length +1 + test(column->length > 255 && new_length > 127)
+ < length)
+ {
+ if (column->length > 255 && new_length > 127)
+ {
+ to[0]= (uchar) ((new_length & 127) + 128);
+ to[1]= (uchar) (new_length >> 7);
+ to+=2;
+ }
+ else
+ *to++= (uchar) new_length;
+ memcpy(to, pos, (size_t) new_length); to+=new_length;
+ flag|=bit;
+ }
+ else
+ {
+ memcpy(to,from,(size_t) length); to+=length;
+ }
+ }
+ else if (type == FIELD_VARCHAR)
+ {
+ uint pack_length= HA_VARCHAR_PACKLENGTH(column->length -1);
+ uint tmp_length;
+ if (pack_length == 1)
+ {
+ tmp_length= (uint) *from;
+ *to++= *from;
+ }
+ else
+ {
+ tmp_length= uint2korr(from);
+ store_key_length_inc(to,tmp_length);
+ }
+ memcpy(to, from+pack_length,tmp_length);
+ to+= tmp_length;
+ continue;
+ }
+ else
+ {
+ memcpy(to,from,(size_t) length); to+=length;
+ continue; /* Normal field */
+ }
+ /* Advance to next pack bit; start a new pack byte when exhausted */
+ if ((bit= bit << 1) >= 256)
+ {
+ *packpos++ = (uchar) flag;
+ bit=1; flag=0;
+ }
+ }
+ else
+ {
+ memcpy(to,from,(size_t) length); to+=length;
+ }
+ }
+ if (bit != 1)
+ *packpos= (uchar) flag;
+ if (info->s->calc_checksum)
+ *to++= (uchar) info->cur_row.checksum;
+ DBUG_PRINT("exit",("packed length: %d",(int) (to-startpos)));
+ DBUG_RETURN((uint) (to-startpos));
+} /* _ma_rec_pack */
+
+
+
+/*
+  Check if a record was correctly packed. Used only by maria_chk.
+
+  Walks the packed record in rec_buff in parallel with the unpacked
+  'record', mirroring the logic of _ma_rec_pack(), and verifies that
+  the pack bits, lengths and total packed_length are consistent.
+
+  info           Maria handler
+  record         unpacked record
+  rec_buff       packed record to verify
+  packed_length  expected length of the packed record
+  with_checksum  if non-zero, also verify the trailing checksum byte
+  checksum       expected checksum value
+
+  Returns 0 if record is ok, 1 otherwise.
+*/
+
+my_bool _ma_rec_check(MARIA_HA *info,const uchar *record, uchar *rec_buff,
+ ulong packed_length, my_bool with_checksum,
+ ha_checksum checksum)
+{
+ uint length,new_length,flag,bit,i;
+ const uchar *pos,*end;
+ uchar *packpos,*to;
+ enum en_fieldtype type;
+ reg3 MARIA_COLUMNDEF *column;
+ DBUG_ENTER("_ma_rec_check");
+
+ packpos=rec_buff; to= rec_buff+info->s->base.pack_bytes;
+ column= info->s->columndef;
+ flag= *packpos; bit=1;
+ record+= info->s->base.null_bytes;
+ to+= info->s->base.null_bytes;
+
+ for (i=info->s->base.fields ; i-- > 0; record+= length, column++)
+ {
+ length=(uint) column->length;
+ if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL)
+ {
+ if (type == FIELD_BLOB)
+ {
+ uint blob_length=
+ _ma_calc_blob_length(length-portable_sizeof_char_ptr,record);
+ if (!blob_length && !(flag & bit))
+ goto err;
+ if (blob_length)
+ to+=length - portable_sizeof_char_ptr+ blob_length;
+ }
+ else if (type == FIELD_SKIP_ZERO)
+ {
+ if (memcmp(record, maria_zero_string, length) == 0)
+ {
+ if (!(flag & bit))
+ goto err;
+ }
+ else
+ to+=length;
+ }
+ else if (type == FIELD_SKIP_ENDSPACE ||
+ type == FIELD_SKIP_PRESPACE)
+ {
+ pos= record; end= record + length;
+ if (type == FIELD_SKIP_ENDSPACE)
+ { /* Pack trailing spaces */
+ while (end > record && *(end-1) == ' ')
+ end--;
+ }
+ else
+ { /* Pack pre-spaces */
+ while (pos < end && *pos == ' ')
+ pos++;
+ }
+ new_length=(uint) (end-pos);
+ if (new_length +1 + test(column->length > 255 && new_length > 127)
+ < length)
+ {
+ if (!(flag & bit))
+ goto err;
+ if (column->length > 255 && new_length > 127)
+ {
+ /* purecov: begin inspected */
+ if (to[0] != (uchar) ((new_length & 127) + 128) ||
+ to[1] != (uchar) (new_length >> 7))
+ goto err;
+ to+=2;
+ /* purecov: end */
+ }
+ else if (*to++ != (uchar) new_length)
+ goto err;
+ to+=new_length;
+ }
+ else
+ to+=length;
+ }
+ else if (type == FIELD_VARCHAR)
+ {
+ uint pack_length= HA_VARCHAR_PACKLENGTH(column->length -1);
+ uint tmp_length;
+ if (pack_length == 1)
+ {
+ tmp_length= (uint) *record;
+ to+= 1+ tmp_length;
+ continue;
+ }
+ else
+ {
+ tmp_length= uint2korr(record);
+ to+= get_pack_length(tmp_length)+tmp_length;
+ }
+ continue;
+ }
+ else
+ {
+ to+=length;
+ continue; /* Normal field */
+ }
+ /* Advance to next pack bit; read next pack byte when exhausted */
+ if ((bit= bit << 1) >= 256)
+ {
+ flag= *++packpos;
+ bit=1;
+ }
+ }
+ else
+ to+= length;
+ }
+ /* Total length must match and no unused pack bits may be set */
+ if (packed_length != (uint) (to - rec_buff) +
+ test(info->s->calc_checksum) || (bit != 1 && (flag & ~(bit - 1))))
+ goto err;
+ if (with_checksum && ((uchar) checksum != (uchar) *to))
+ {
+ DBUG_PRINT("error",("wrong checksum for row"));
+ goto err;
+ }
+ DBUG_RETURN(0);
+
+err:
+ DBUG_RETURN(1);
+}
+
+
+/*
+  @brief Unpacks a record
+
+  @param info          Maria handler
+  @param to            destination buffer for the unpacked record
+                       (base.reclength bytes)
+  @param from          packed record (typically info->rec_buff)
+  @param found_length  length of the packed record
+
+  @return Recordlength
+  @retval >0 ok
+  @retval MY_FILE_ERROR (== -1) Error.
+  my_errno is set to HA_ERR_WRONG_IN_RECORD
+*/
+
+ulong _ma_rec_unpack(register MARIA_HA *info, register uchar *to, uchar *from,
+ ulong found_length)
+{
+ uint flag,bit,length,min_pack_length, column_length;
+ enum en_fieldtype type;
+ uchar *from_end,*to_end,*packpos;
+ reg3 MARIA_COLUMNDEF *column, *end_column;
+ DBUG_ENTER("_ma_rec_unpack");
+
+ to_end=to + info->s->base.reclength;
+ from_end=from+found_length;
+ flag= (uchar) *from; bit=1; packpos=from;
+ if (found_length < info->s->base.min_pack_length)
+ goto err;
+ from+= info->s->base.pack_bytes;
+ /* min_pack_length tracks the minimum bytes still expected in 'from' */
+ min_pack_length= info->s->base.min_pack_length - info->s->base.pack_bytes;
+
+ if ((length= info->s->base.null_bytes))
+ {
+ memcpy(to, from, length);
+ from+= length;
+ to+= length;
+ min_pack_length-= length;
+ }
+
+ for (column= info->s->columndef, end_column= column + info->s->base.fields;
+ column < end_column ; to+= column_length, column++)
+ {
+ column_length= column->length;
+ if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL &&
+ (type != FIELD_CHECK))
+ {
+ if (type == FIELD_VARCHAR)
+ {
+ uint pack_length= HA_VARCHAR_PACKLENGTH(column_length-1);
+ if (pack_length == 1)
+ {
+ length= (uint) *(uchar*) from;
+ if (length > column_length-1)
+ goto err;
+ *to= *from++;
+ }
+ else
+ {
+ get_key_length(length, from);
+ if (length > column_length-2)
+ goto err;
+ int2store(to,length);
+ }
+ if (from+length > from_end)
+ goto err;
+ memcpy(to+pack_length, from, length);
+ from+= length;
+ min_pack_length--;
+ continue;
+ }
+ if (flag & bit)
+ {
+ /* Column was packed away: restore its default representation */
+ if (type == FIELD_BLOB || type == FIELD_SKIP_ZERO)
+ bzero(to, column_length);
+ else if (type == FIELD_SKIP_ENDSPACE ||
+ type == FIELD_SKIP_PRESPACE)
+ {
+ if (column->length > 255 && *from & 128)
+ {
+ if (from + 1 >= from_end)
+ goto err;
+ length= (*from & 127)+ ((uint) (uchar) *(from+1) << 7); from+=2;
+ }
+ else
+ {
+ if (from == from_end)
+ goto err;
+ length= (uchar) *from++;
+ }
+ min_pack_length--;
+ if (length >= column_length ||
+ min_pack_length + length > (uint) (from_end - from))
+ goto err;
+ if (type == FIELD_SKIP_ENDSPACE)
+ {
+ memcpy(to, from, (size_t) length);
+ bfill(to+length, column_length-length, ' ');
+ }
+ else
+ {
+ bfill(to, column_length-length, ' ');
+ memcpy(to+column_length-length, from, (size_t) length);
+ }
+ from+=length;
+ }
+ }
+ else if (type == FIELD_BLOB)
+ {
+ uint size_length=column_length- portable_sizeof_char_ptr;
+ ulong blob_length= _ma_calc_blob_length(size_length,from);
+ ulong from_left= (ulong) (from_end - from);
+ if (from_left < size_length ||
+ from_left - size_length < blob_length ||
+ from_left - size_length - blob_length < min_pack_length)
+ goto err;
+ memcpy(to, from, (size_t) size_length);
+ from+=size_length;
+ /* Store pointer to blob data (which stays in 'from' buffer) */
+ memcpy_fixed(to+size_length,(uchar*) &from,sizeof(char*));
+ from+=blob_length;
+ }
+ else
+ {
+ if (type == FIELD_SKIP_ENDSPACE || type == FIELD_SKIP_PRESPACE)
+ min_pack_length--;
+ if (min_pack_length + column_length > (uint) (from_end - from))
+ goto err;
+ memcpy(to, from, (size_t) column_length); from+=column_length;
+ }
+ if ((bit= bit << 1) >= 256)
+ {
+ flag= (uchar) *++packpos; bit=1;
+ }
+ }
+ else
+ {
+ if (min_pack_length > (uint) (from_end - from))
+ goto err;
+ min_pack_length-=column_length;
+ memcpy(to, from, (size_t) column_length);
+ from+=column_length;
+ }
+ }
+ if (info->s->calc_checksum)
+ info->cur_row.checksum= (uint) (uchar) *from++;
+ /* Sanity: both buffers fully consumed and no spurious pack bits set */
+ if (to == to_end && from == from_end && (bit == 1 || !(flag & ~(bit-1))))
+ DBUG_RETURN(found_length);
+
+err:
+ my_errno= HA_ERR_WRONG_IN_RECORD;
+ DBUG_PRINT("error",("to_end: 0x%lx -> 0x%lx from_end: 0x%lx -> 0x%lx",
+ (long) to, (long) to_end, (long) from, (long) from_end));
+ DBUG_DUMP("from", info->rec_buff, info->s->base.min_pack_length);
+ DBUG_RETURN(MY_FILE_ERROR);
+} /* _ma_rec_unpack */
+
+
+/*
+  Calculate total length of all blobs in a record.
+
+  Side effect: updates blob->length for each blob descriptor in
+  info->blobs from the length bytes stored in 'record'.
+
+  Returns the sum of all blob lengths.
+*/
+
+ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record)
+{
+ ulong length;
+ MARIA_BLOB *blob,*end;
+
+ for (length=0, blob= info->blobs, end=blob+info->s->base.blobs ;
+ blob != end;
+ blob++)
+ {
+ blob->length= _ma_calc_blob_length(blob->pack_length,
+ record + blob->offset);
+ length+=blob->length;
+ }
+ return length;
+}
+
+
+/*
+  Read a blob length stored in 'length' (1-4) bytes at 'pos'.
+  Returns the decoded blob length; 0 for any other 'length' value
+  (which cannot happen for a valid table definition).
+*/
+
+ulong _ma_calc_blob_length(uint length, const uchar *pos)
+{
+ switch (length) {
+ case 1:
+ return (uint) (uchar) *pos;
+ case 2:
+ return (uint) uint2korr(pos);
+ case 3:
+ return uint3korr(pos);
+ case 4:
+ return uint4korr(pos);
+ default:
+ break;
+ }
+ return 0; /* Impossible */
+}
+
+
+/*
+  Store a blob length in 'pack_length' (1-4) bytes at 'pos'.
+  Inverse of _ma_calc_blob_length(); other pack_length values are
+  silently ignored.
+*/
+
+void _ma_store_blob_length(uchar *pos,uint pack_length,uint length)
+{
+ switch (pack_length) {
+ case 1:
+ *pos= (uchar) length;
+ break;
+ case 2:
+ int2store(pos,length);
+ break;
+ case 3:
+ int3store(pos,length);
+ break;
+ case 4:
+ int4store(pos,length);
+ /* fall through */
+ default:
+ break;
+ }
+ return;
+}
+
+
+/*
+  Read record from datafile.
+
+  SYNOPSIS
+    _ma_read_dynamic_record()
+      info                      MARIA_HA pointer to table.
+      filepos                   From where to read the record.
+      buf                       Destination for record.
+
+  NOTE
+    If a write buffer is active, it needs to be flushed if its contents
+    intersects with the record to read. We always check if the position
+    of the first uchar of the write buffer is lower than the position
+    past the last uchar to read. In theory this is also true if the write
+    buffer is completely below the read segment. That is, if there is no
+    intersection. But this case is unusual. We flush anyway. Only if the
+    first uchar in the write buffer is above the last uchar to read, we do
+    not flush.
+
+    A dynamic record may need several reads. So this check must be done
+    before every read. Reading a dynamic record starts with reading the
+    block header. If the record does not fit into the free space of the
+    header, the block may be longer than the header. In this case a
+    second read is necessary. These one or two reads repeat for every
+    part of the record.
+
+  RETURN
+    0           OK
+    #           Error number
+*/
+
+int _ma_read_dynamic_record(MARIA_HA *info, uchar *buf,
+ MARIA_RECORD_POS filepos)
+{
+ int block_of_record;
+ uint b_type;
+ MARIA_BLOCK_INFO block_info;
+ File file;
+ uchar *to;
+ uint left_length;
+ DBUG_ENTER("_ma_read_dynamic_record");
+
+ if (filepos == HA_OFFSET_ERROR)
+ goto err;
+
+ LINT_INIT(to);
+ LINT_INIT(left_length);
+ file= info->dfile.file;
+ block_of_record= 0; /* First block of record is numbered as zero. */
+ block_info.second_read= 0;
+ do
+ {
+ /* A corrupted table can have wrong pointers. (Bug# 19835) */
+ if (filepos == HA_OFFSET_ERROR)
+ goto panic;
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ (info->rec_cache.pos_in_file < filepos +
+ MARIA_BLOCK_INFO_HEADER_LENGTH) &&
+ flush_io_cache(&info->rec_cache))
+ goto err;
+ info->rec_cache.seek_not_done=1;
+ if ((b_type= _ma_get_block_info(&block_info, file, filepos)) &
+ (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+ BLOCK_FATAL_ERROR))
+ {
+ if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED))
+ my_errno=HA_ERR_RECORD_DELETED;
+ goto err;
+ }
+ if (block_of_record++ == 0) /* First block */
+ {
+ info->cur_row.total_length= block_info.rec_len;
+ if (block_info.rec_len > (uint) info->s->base.max_pack_length)
+ goto panic;
+ if (info->s->base.blobs)
+ {
+ if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ block_info.rec_len +
+ info->s->base.extra_rec_buff_size))
+ goto err;
+ }
+ to= info->rec_buff;
+ left_length=block_info.rec_len;
+ }
+ if (left_length < block_info.data_len || ! block_info.data_len)
+ goto panic; /* Wrong linked record */
+ /* copy information that is already read */
+ {
+ uint offset= (uint) (block_info.filepos - filepos);
+ uint prefetch_len= (sizeof(block_info.header) - offset);
+ filepos+= sizeof(block_info.header);
+
+ if (prefetch_len > block_info.data_len)
+ prefetch_len= block_info.data_len;
+ if (prefetch_len)
+ {
+ memcpy(to, block_info.header + offset, prefetch_len);
+ block_info.data_len-= prefetch_len;
+ left_length-= prefetch_len;
+ to+= prefetch_len;
+ }
+ }
+ /* read rest of record from file */
+ if (block_info.data_len)
+ {
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ info->rec_cache.pos_in_file < filepos + block_info.data_len &&
+ flush_io_cache(&info->rec_cache))
+ goto err;
+ /*
+ What a pity that this method is not called 'file_pread' and that
+ there is no equivalent without seeking. We are at the right
+ position already. :(
+ */
+ if (info->s->file_read(info, to, block_info.data_len,
+ filepos, MYF(MY_NABP)))
+ goto panic;
+ left_length-=block_info.data_len;
+ to+=block_info.data_len;
+ }
+ filepos= block_info.next_filepos;
+ } while (left_length);
+
+ info->update|= HA_STATE_AKTIV; /* We have an active record */
+ fast_ma_writeinfo(info);
+ DBUG_RETURN(_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) !=
+ MY_FILE_ERROR ? 0 : my_errno);
+
+err:
+ fast_ma_writeinfo(info);
+ DBUG_RETURN(my_errno);
+
+panic:
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ goto err;
+}
+
+/*
+  Compare a unique constraint between a record in memory and a stored row.
+
+  Reads the row at 'pos' into a temporary buffer and compares the unique
+  key columns against 'record'.  Temporarily swaps out info->rec_buff so
+  that blobs of the current row are not destroyed by the read.
+
+  Returns 0 if the unique values are equal, non-zero otherwise (including
+  read errors).
+*/
+
+my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ const uchar *record, MARIA_RECORD_POS pos)
+{
+ uchar *old_rec_buff,*old_record;
+ size_t old_rec_buff_size;
+ my_bool error;
+ DBUG_ENTER("_ma_cmp_dynamic_unique");
+
+ if (!(old_record=my_alloca(info->s->base.reclength)))
+ DBUG_RETURN(1);
+
+ /* Don't let the compare destroy blobs that may be in use */
+ old_rec_buff= info->rec_buff;
+ old_rec_buff_size= info->rec_buff_size;
+
+ if (info->s->base.blobs)
+ {
+ info->rec_buff= 0;
+ info->rec_buff_size= 0;
+ }
+ error= _ma_read_dynamic_record(info, old_record, pos) != 0;
+ if (!error)
+ error=_ma_unique_comp(def, record, old_record, def->null_are_equal) != 0;
+ if (info->s->base.blobs)
+ {
+ my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+ info->rec_buff= old_rec_buff;
+ info->rec_buff_size= old_rec_buff_size;
+ }
+ my_afree(old_record);
+ DBUG_RETURN(error);
+}
+
+
+/*
+  Compare a record on disk with a packed record in memory.
+
+  Packs 'record' and compares it block by block against the row stored
+  at info->cur_row.lastpos.  Used for concurrent-change detection; a
+  no-op unless READ_CHECK_USED is set.
+
+  Returns 0 if the records are equal, 1 otherwise (my_errno set to
+  HA_ERR_RECORD_CHANGED or HA_ERR_WRONG_IN_RECORD on mismatch).
+*/
+
+my_bool _ma_cmp_dynamic_record(register MARIA_HA *info,
+ register const uchar *record)
+{
+ uint flag, reclength, b_type,cmp_length;
+ my_off_t filepos;
+ uchar *buffer;
+ MARIA_BLOCK_INFO block_info;
+ my_bool error= 1;
+ DBUG_ENTER("_ma_cmp_dynamic_record");
+
+ /* We are going to do changes; don't let anybody disturb */
+ dont_break(); /* Don't allow SIGHUP or SIGINT */
+
+ if (info->opt_flag & WRITE_CACHE_USED)
+ {
+ info->update&= ~(HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK);
+ if (flush_io_cache(&info->rec_cache))
+ DBUG_RETURN(1);
+ }
+ info->rec_cache.seek_not_done=1;
+
+ /* If nobody have touched the database we don't have to test rec */
+
+ buffer=info->rec_buff;
+ if ((info->opt_flag & READ_CHECK_USED))
+ { /* If check isn't disabled */
+ if (info->s->base.blobs)
+ {
+ if (!(buffer=(uchar*) my_alloca(info->s->base.pack_reclength+
+ _ma_calc_total_blob_length(info,record))))
+ DBUG_RETURN(1);
+ }
+ reclength= _ma_rec_pack(info,buffer,record);
+ record= buffer;
+
+ filepos= info->cur_row.lastpos;
+ flag=block_info.second_read=0;
+ block_info.next_filepos=filepos;
+ while (reclength > 0)
+ {
+ if ((b_type= _ma_get_block_info(&block_info, info->dfile.file,
+ block_info.next_filepos))
+ & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+ BLOCK_FATAL_ERROR))
+ {
+ if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED))
+ my_errno=HA_ERR_RECORD_CHANGED;
+ goto err;
+ }
+ if (flag == 0) /* First block */
+ {
+ flag=1;
+ if (reclength != block_info.rec_len)
+ {
+ my_errno=HA_ERR_RECORD_CHANGED;
+ goto err;
+ }
+ } else if (reclength < block_info.data_len)
+ {
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ goto err;
+ }
+ reclength-= block_info.data_len;
+ cmp_length= block_info.data_len;
+ if (!reclength && info->s->calc_checksum)
+ cmp_length--; /* 'record' may not contain checksum */
+
+ if (_ma_cmp_buffer(info->dfile.file, record, block_info.filepos,
+ cmp_length))
+ {
+ my_errno=HA_ERR_RECORD_CHANGED;
+ goto err;
+ }
+ flag=1;
+ record+=block_info.data_len;
+ }
+ }
+ my_errno=0;
+ error= 0;
+err:
+ if (buffer != info->rec_buff)
+ my_afree(buffer);
+ DBUG_PRINT("exit", ("result: %d", error));
+ DBUG_RETURN(error);
+}
+
+
+/*
+  Compare 'length' bytes of file content at 'filepos' with a memory buffer.
+
+  Reads the file in IO_SIZE*2 pieces, the first piece sized so that
+  subsequent reads are IO_SIZE aligned.
+
+  Returns 0 if the contents are equal, 1 if they differ or a read failed.
+*/
+
+static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos,
+ uint length)
+{
+ uint next_length;
+ uchar temp_buff[IO_SIZE*2];
+ DBUG_ENTER("_ma_cmp_buffer");
+
+ next_length= IO_SIZE*2 - (uint) (filepos & (IO_SIZE-1));
+
+ while (length > IO_SIZE*2)
+ {
+ if (my_pread(file,temp_buff,next_length,filepos, MYF(MY_NABP)) ||
+ memcmp(buff, temp_buff, next_length))
+ goto err;
+ filepos+=next_length;
+ buff+=next_length;
+ length-= next_length;
+ next_length=IO_SIZE*2;
+ }
+ if (my_pread(file,temp_buff,length,filepos,MYF(MY_NABP)))
+ goto err;
+ DBUG_RETURN(memcmp(buff, temp_buff, length) != 0);
+err:
+ DBUG_RETURN(1);
+}
+
+
+/*
+  Read next record from datafile during table scan.
+
+  SYNOPSIS
+    _ma_read_rnd_dynamic_record()
+      info                      MARIA_HA pointer to table.
+      buf                       Destination for record.
+      filepos                   From where to read the record.
+      skip_deleted_blocks       If to repeat reading until a non-deleted
+                                record is found.
+
+  NOTE
+    This is identical to _ma_read_dynamic_record(), except the following
+    cases:
+
+    - If there is no active row at 'filepos', continue scanning for
+      an active row. (This is because the previous
+      _ma_read_rnd_dynamic_record() call stored the next block position
+      in filepos, but this position may not be a start block for a row
+    - We may have READ_CACHING enabled, in which case we use the cache
+      to read rows.
+
+    For other comments, check _ma_read_dynamic_record()
+
+  RETURN
+    0           OK
+    != 0        Error number
+*/
+
+int _ma_read_rnd_dynamic_record(MARIA_HA *info,
+ uchar *buf,
+ MARIA_RECORD_POS filepos,
+ my_bool skip_deleted_blocks)
+{
+ int block_of_record, info_read;
+ uint left_len,b_type;
+ uchar *to;
+ MARIA_BLOCK_INFO block_info;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("_ma_read_rnd_dynamic_record");
+
+ info_read=0;
+ LINT_INIT(to);
+
+ if (info->lock_type == F_UNLCK)
+ {
+#ifndef UNSAFE_LOCKING
+#else
+ info->tmp_lock_type=F_RDLCK;
+#endif
+ }
+ else
+ info_read=1; /* memory-keyinfoblock is ok */
+
+ block_of_record= 0; /* First block of record is numbered as zero. */
+ block_info.second_read= 0;
+ left_len=1;
+ do
+ {
+ if (filepos >= info->state->data_file_length)
+ {
+ if (!info_read)
+ { /* Check if changed */
+ info_read=1;
+ info->rec_cache.seek_not_done=1;
+ if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+ goto panic;
+ }
+ if (filepos >= info->state->data_file_length)
+ {
+ my_errno= HA_ERR_END_OF_FILE;
+ goto err;
+ }
+ }
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ if (_ma_read_cache(&info->rec_cache, block_info.header, filepos,
+ sizeof(block_info.header),
+ (!block_of_record && skip_deleted_blocks ?
+ READING_NEXT : 0) | READING_HEADER))
+ goto panic;
+ b_type= _ma_get_block_info(&block_info,-1,filepos);
+ }
+ else
+ {
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ info->rec_cache.pos_in_file < filepos + MARIA_BLOCK_INFO_HEADER_LENGTH &&
+ flush_io_cache(&info->rec_cache))
+ DBUG_RETURN(my_errno);
+ info->rec_cache.seek_not_done=1;
+ b_type= _ma_get_block_info(&block_info, info->dfile.file, filepos);
+ }
+
+ if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+ BLOCK_FATAL_ERROR))
+ {
+ if ((b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR))
+ && skip_deleted_blocks)
+ {
+ filepos=block_info.filepos+block_info.block_len;
+ block_info.second_read=0;
+ continue; /* Search after next_record */
+ }
+ if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR))
+ {
+ my_errno= HA_ERR_RECORD_DELETED;
+ info->cur_row.lastpos= block_info.filepos;
+ info->cur_row.nextpos= block_info.filepos+block_info.block_len;
+ }
+ goto err;
+ }
+ if (block_of_record == 0) /* First block */
+ {
+ info->cur_row.total_length= block_info.rec_len;
+ if (block_info.rec_len > (uint) share->base.max_pack_length)
+ goto panic;
+ info->cur_row.lastpos= filepos;
+ if (share->base.blobs)
+ {
+ if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ block_info.rec_len +
+ info->s->base.extra_rec_buff_size))
+ goto err;
+ }
+ to= info->rec_buff;
+ left_len=block_info.rec_len;
+ }
+ if (left_len < block_info.data_len)
+ goto panic; /* Wrong linked record */
+
+ /* copy information that is already read */
+ {
+ uint offset=(uint) (block_info.filepos - filepos);
+ uint tmp_length= (sizeof(block_info.header) - offset);
+ filepos=block_info.filepos;
+
+ if (tmp_length > block_info.data_len)
+ tmp_length= block_info.data_len;
+ if (tmp_length)
+ {
+ memcpy(to, block_info.header+offset, tmp_length);
+ block_info.data_len-=tmp_length;
+ left_len-=tmp_length;
+ to+=tmp_length;
+ filepos+=tmp_length;
+ }
+ }
+ /* read rest of record from file */
+ if (block_info.data_len)
+ {
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ if (_ma_read_cache(&info->rec_cache, to,filepos,
+ block_info.data_len,
+ (!block_of_record && skip_deleted_blocks) ?
+ READING_NEXT : 0))
+ goto panic;
+ }
+ else
+ {
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ info->rec_cache.pos_in_file <
+ block_info.filepos + block_info.data_len &&
+ flush_io_cache(&info->rec_cache))
+ goto err;
+ /* VOID(my_seek(info->dfile.file, filepos, MY_SEEK_SET, MYF(0))); */
+ if (my_read(info->dfile.file, to, block_info.data_len, MYF(MY_NABP)))
+ {
+ if (my_errno == HA_ERR_FILE_TOO_SHORT)
+ my_errno= HA_ERR_WRONG_IN_RECORD; /* Unexpected end of file */
+ goto err;
+ }
+ }
+ }
+ /*
+ Increment block-of-record counter. If it was the first block,
+ remember the position behind the block for the next call.
+ */
+ if (block_of_record++ == 0)
+ {
+ info->cur_row.nextpos= block_info.filepos+block_info.block_len;
+ skip_deleted_blocks=0;
+ }
+ left_len-=block_info.data_len;
+ to+=block_info.data_len;
+ filepos=block_info.next_filepos;
+ } while (left_len);
+
+ info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+ fast_ma_writeinfo(info);
+ if (_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) !=
+ MY_FILE_ERROR)
+ DBUG_RETURN(0);
+ DBUG_RETURN(my_errno); /* Wrong record */
+
+panic:
+ my_errno=HA_ERR_WRONG_IN_RECORD; /* Something is fatal wrong */
+err:
+ fast_ma_writeinfo(info);
+ DBUG_RETURN(my_errno);
+}
+
+
+ /*
+   Read and process a block header from a dynamic-record data file.
+
+   If 'file' is >= 0 the header is read at 'filepos' (the file pointer is
+   deliberately left positioned after the header; see note below).  If
+   'file' is < 0 the caller has already placed the header in info->header.
+
+   The first header byte is the block type (0-13); the remaining bytes are
+   decoded per type into info->rec_len (whole-record length), info->data_len
+   (data bytes in this block), info->block_len (on-disk block size),
+   info->filepos (start of data) and info->next_filepos/info->prev_filepos
+   (links for chained and deleted blocks).
+
+   Returns a bitmap of BLOCK_FIRST / BLOCK_LAST / BLOCK_DELETED /
+   BLOCK_SYNC_ERROR flags, or BLOCK_ERROR on garbage (my_errno is then set
+   to HA_ERR_WRONG_IN_RECORD).
+ */
+
+uint _ma_get_block_info(MARIA_BLOCK_INFO *info, File file, my_off_t filepos)
+{
+ uint return_val=0;
+ uchar *header=info->header;
+
+ if (file >= 0)
+ {
+ /*
+ We do not use my_pread() here because we want to have the file
+ pointer set to the end of the header after this function.
+ my_pread() may leave the file pointer untouched.
+ */
+ VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0)));
+ if (my_read(file, header, sizeof(info->header),MYF(0)) !=
+ sizeof(info->header))
+ goto err;
+ }
+ DBUG_DUMP("header",header,MARIA_BLOCK_INFO_HEADER_LENGTH);
+ if (info->second_read)
+ {
+ /* Continuation read: only continuation block types (7-12) are valid */
+ if (info->header[0] <= 6 || info->header[0] == 13)
+ return_val=BLOCK_SYNC_ERROR;
+ }
+ else
+ {
+ /* First read: only first-block types (0-6 and 13) are valid */
+ if (info->header[0] > 6 && info->header[0] != 13)
+ return_val=BLOCK_SYNC_ERROR;
+ }
+ info->next_filepos= HA_OFFSET_ERROR; /* Dummy if no next block */
+
+ switch (info->header[0]) {
+ case 0: /* Deleted block, linked into the deleted-block chain */
+ if ((info->block_len=(uint) mi_uint3korr(header+1)) <
+ MARIA_MIN_BLOCK_LENGTH ||
+ (info->block_len & (MARIA_DYN_ALIGN_SIZE -1)))
+ goto err;
+ info->filepos=filepos;
+ info->next_filepos=mi_sizekorr(header+4);
+ info->prev_filepos=mi_sizekorr(header+12);
+#if SIZEOF_OFF_T == 4
+ /* With 32-bit offsets the stored 8-byte links must fit in 32 bits */
+ if ((mi_uint4korr(header+4) != 0 &&
+ (mi_uint4korr(header+4) != (ulong) ~0 ||
+ info->next_filepos != (ulong) ~0)) ||
+ (mi_uint4korr(header+12) != 0 &&
+ (mi_uint4korr(header+12) != (ulong) ~0 ||
+ info->prev_filepos != (ulong) ~0)))
+ goto err;
+#endif
+ return return_val | BLOCK_DELETED; /* Deleted block */
+
+ case 1: /* Whole record in one exactly-fitting block; 2-byte length */
+ info->rec_len=info->data_len=info->block_len=mi_uint2korr(header+1);
+ info->filepos=filepos+3;
+ return return_val | BLOCK_FIRST | BLOCK_LAST;
+ case 2: /* As case 1 but with a 3-byte length */
+ info->rec_len=info->data_len=info->block_len=mi_uint3korr(header+1);
+ info->filepos=filepos+4;
+ return return_val | BLOCK_FIRST | BLOCK_LAST;
+
+ case 13: /* First block of a big split record; 4-byte rec_len */
+ info->rec_len=mi_uint4korr(header+1);
+ info->block_len=info->data_len=mi_uint3korr(header+5);
+ info->next_filepos=mi_sizekorr(header+8);
+ info->second_read=1;
+ info->filepos=filepos+16;
+ return return_val | BLOCK_FIRST;
+
+ case 3: /* Whole record; block has trailing unused bytes (header[3]) */
+ info->rec_len=info->data_len=mi_uint2korr(header+1);
+ info->block_len=info->rec_len+ (uint) header[3];
+ info->filepos=filepos+4;
+ return return_val | BLOCK_FIRST | BLOCK_LAST;
+ case 4: /* As case 3 but with a 3-byte length */
+ info->rec_len=info->data_len=mi_uint3korr(header+1);
+ info->block_len=info->rec_len+ (uint) header[4];
+ info->filepos=filepos+5;
+ return return_val | BLOCK_FIRST | BLOCK_LAST;
+
+ case 5: /* First block of a split record; 2-byte lengths */
+ info->rec_len=mi_uint2korr(header+1);
+ info->block_len=info->data_len=mi_uint2korr(header+3);
+ info->next_filepos=mi_sizekorr(header+5);
+ info->second_read=1;
+ info->filepos=filepos+13;
+ return return_val | BLOCK_FIRST;
+ case 6: /* As case 5 but with 3-byte lengths */
+ info->rec_len=mi_uint3korr(header+1);
+ info->block_len=info->data_len=mi_uint3korr(header+4);
+ info->next_filepos=mi_sizekorr(header+7);
+ info->second_read=1;
+ info->filepos=filepos+15;
+ return return_val | BLOCK_FIRST;
+
+ /* The following blocks are identical to 1-6 without rec_len */
+ case 7: /* Last continuation block; exact fit, 2-byte length */
+ info->data_len=info->block_len=mi_uint2korr(header+1);
+ info->filepos=filepos+3;
+ return return_val | BLOCK_LAST;
+ case 8: /* Last continuation block; exact fit, 3-byte length */
+ info->data_len=info->block_len=mi_uint3korr(header+1);
+ info->filepos=filepos+4;
+ return return_val | BLOCK_LAST;
+
+ case 9: /* Last continuation block with trailing unused bytes */
+ info->data_len=mi_uint2korr(header+1);
+ info->block_len=info->data_len+ (uint) header[3];
+ info->filepos=filepos+4;
+ return return_val | BLOCK_LAST;
+ case 10: /* As case 9 but with a 3-byte length */
+ info->data_len=mi_uint3korr(header+1);
+ info->block_len=info->data_len+ (uint) header[4];
+ info->filepos=filepos+5;
+ return return_val | BLOCK_LAST;
+
+ case 11: /* Middle continuation block; 2-byte length */
+ info->data_len=info->block_len=mi_uint2korr(header+1);
+ info->next_filepos=mi_sizekorr(header+3);
+ info->second_read=1;
+ info->filepos=filepos+11;
+ return return_val;
+ case 12: /* Middle continuation block; 3-byte length */
+ info->data_len=info->block_len=mi_uint3korr(header+1);
+ info->next_filepos=mi_sizekorr(header+4);
+ info->second_read=1;
+ info->filepos=filepos+12;
+ return return_val;
+ }
+
+err:
+ my_errno=HA_ERR_WRONG_IN_RECORD; /* Garbage */
+ return BLOCK_ERROR;
+}
diff --git a/storage/maria/ma_extra.c b/storage/maria/ma_extra.c
new file mode 100644
index 00000000000..7a30b613ea5
--- /dev/null
+++ b/storage/maria/ma_extra.c
@@ -0,0 +1,637 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+#include "ma_blockrec.h"
+
+static void maria_extra_keyflag(MARIA_HA *info,
+ enum ha_extra_function function);
+
+/**
+ @brief Set options and buffers to optimize table handling
+
+ @param name table's name
+ @param info open table
+ @param function operation
+ @param extra_arg Pointer to extra argument (normally pointer to
+ ulong); used when function is one of:
+ HA_EXTRA_WRITE_CACHE
+ HA_EXTRA_CACHE
+
+ @return Operation status
+ @retval 0 ok
+ @retval !=0 error
+*/
+
+int maria_extra(MARIA_HA *info, enum ha_extra_function function,
+ void *extra_arg)
+{
+ int error= 0;
+ ulong cache_size;
+ MARIA_SHARE *share= info->s;
+ my_bool block_records= share->data_file_type == BLOCK_RECORD;
+ DBUG_ENTER("maria_extra");
+ DBUG_PRINT("enter",("function: %d",(int) function));
+
+ switch (function) {
+ case HA_EXTRA_RESET_STATE: /* Reset state (don't free buffers) */
+ info->lastinx= 0; /* Use first index as def */
+ info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR;
+ info->page_changed= 1;
+ /* Next/prev gives first/last */
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ reinit_io_cache(&info->rec_cache,READ_CACHE,0,
+ (pbool) (info->lock_type != F_UNLCK),
+ (pbool) test(info->update & HA_STATE_ROW_CHANGED)
+ );
+ }
+ info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND |
+ HA_STATE_PREV_FOUND);
+ break;
+ case HA_EXTRA_CACHE:
+ if (block_records)
+ break; /* Not supported */
+
+ if (info->lock_type == F_UNLCK &&
+ (share->options & HA_OPTION_PACK_RECORD))
+ {
+ error= 1; /* Not possible if not locked */
+ my_errno= EACCES;
+ break;
+ }
+ if (info->s->file_map) /* Don't use cache if mmap */
+ break;
+#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
+ if ((share->options & HA_OPTION_COMPRESS_RECORD))
+ {
+ pthread_mutex_lock(&share->intern_lock);
+ if (_ma_memmap_file(info))
+ {
+ /* We don't need MADV_SEQUENTIAL if small file */
+ madvise((char*) share->file_map, share->state.state.data_file_length,
+ share->state.state.data_file_length <= RECORD_CACHE_SIZE*16 ?
+ MADV_RANDOM : MADV_SEQUENTIAL);
+ pthread_mutex_unlock(&share->intern_lock);
+ break;
+ }
+ pthread_mutex_unlock(&share->intern_lock);
+ }
+#endif
+ if (info->opt_flag & WRITE_CACHE_USED)
+ {
+ /* Switching from write cache to read cache */
+ info->opt_flag&= ~WRITE_CACHE_USED;
+ if ((error= end_io_cache(&info->rec_cache)))
+ break;
+ }
+ if (!(info->opt_flag &
+ (READ_CACHE_USED | WRITE_CACHE_USED | MEMMAP_USED)))
+ {
+ cache_size= (extra_arg ? *(ulong*) extra_arg :
+ my_default_record_cache_size);
+ if (!(init_io_cache(&info->rec_cache, info->dfile.file,
+ (uint) min(share->state.state.data_file_length+1,
+ cache_size),
+ READ_CACHE,0L,(pbool) (info->lock_type != F_UNLCK),
+ MYF(share->write_flag & MY_WAIT_IF_FULL))))
+ {
+ info->opt_flag|= READ_CACHE_USED;
+ info->update&= ~HA_STATE_ROW_CHANGED;
+ }
+ if (share->non_transactional_concurrent_insert)
+ info->rec_cache.end_of_file= info->state->data_file_length;
+ }
+ break;
+ case HA_EXTRA_REINIT_CACHE:
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ reinit_io_cache(&info->rec_cache, READ_CACHE, info->cur_row.nextpos,
+ (pbool) (info->lock_type != F_UNLCK),
+ (pbool) test(info->update & HA_STATE_ROW_CHANGED));
+ info->update&= ~HA_STATE_ROW_CHANGED;
+ if (share->non_transactional_concurrent_insert)
+ info->rec_cache.end_of_file= info->state->data_file_length;
+ }
+ break;
+ case HA_EXTRA_WRITE_CACHE:
+ if (info->lock_type == F_UNLCK)
+ {
+ error= 1; /* Not possible if not locked */
+ break;
+ }
+ if (block_records)
+ break; /* Not supported */
+
+ cache_size= (extra_arg ? *(ulong*) extra_arg :
+ my_default_record_cache_size);
+ /* No write cache if uniques must be checked on insert */
+ if (!(info->opt_flag &
+ (READ_CACHE_USED | WRITE_CACHE_USED | OPT_NO_ROWS)) &&
+ !share->state.header.uniques)
+ if (!(init_io_cache(&info->rec_cache, info->dfile.file, cache_size,
+ WRITE_CACHE,share->state.state.data_file_length,
+ (pbool) (info->lock_type != F_UNLCK),
+ MYF(share->write_flag & MY_WAIT_IF_FULL))))
+ {
+ info->opt_flag|= WRITE_CACHE_USED;
+ info->update&= ~(HA_STATE_ROW_CHANGED |
+ HA_STATE_WRITE_AT_END |
+ HA_STATE_EXTEND_BLOCK);
+ }
+ break;
+ case HA_EXTRA_PREPARE_FOR_UPDATE:
+ if (info->s->data_file_type != DYNAMIC_RECORD)
+ break;
+ /* Remove read/write cache if dynamic rows */
+ /* fall through */
+ case HA_EXTRA_NO_CACHE:
+ if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+ {
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ error= end_io_cache(&info->rec_cache);
+ /* Sergei will insert full text index caching here */
+ }
+#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
+ if (info->opt_flag & MEMMAP_USED)
+ madvise((char*) share->file_map, share->state.state.data_file_length,
+ MADV_RANDOM);
+#endif
+ break;
+ case HA_EXTRA_FLUSH_CACHE:
+ if (info->opt_flag & WRITE_CACHE_USED)
+ {
+ if ((error= flush_io_cache(&info->rec_cache)))
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info); /* Fatal error found */
+ }
+ }
+ break;
+ case HA_EXTRA_NO_READCHECK:
+ info->opt_flag&= ~READ_CHECK_USED; /* No readcheck */
+ break;
+ case HA_EXTRA_READCHECK:
+ info->opt_flag|= READ_CHECK_USED;
+ break;
+ case HA_EXTRA_KEYREAD: /* Read only keys to record */
+ case HA_EXTRA_REMEMBER_POS:
+ /* Both cases save the current key/position so it can be restored */
+ info->opt_flag|= REMEMBER_OLD_POS;
+ bmove(info->last_key.data + share->base.max_key_length*2,
+ info->last_key.data,
+ info->last_key.data_length + info->last_key.ref_length);
+ info->save_update= info->update;
+ info->save_lastinx= info->lastinx;
+ info->save_lastpos= info->cur_row.lastpos;
+ info->save_lastkey_data_length= info->last_key.data_length;
+ info->save_lastkey_ref_length= info->last_key.ref_length;
+ if (function == HA_EXTRA_REMEMBER_POS)
+ break;
+ /* fall through */
+ case HA_EXTRA_KEYREAD_CHANGE_POS:
+ info->opt_flag|= KEY_READ_USED;
+ info->read_record= _ma_read_key_record;
+ break;
+ case HA_EXTRA_NO_KEYREAD:
+ case HA_EXTRA_RESTORE_POS:
+ if (info->opt_flag & REMEMBER_OLD_POS)
+ {
+ /* Restore the key/position saved by KEYREAD/REMEMBER_POS above */
+ bmove(info->last_key.data,
+ info->last_key.data + share->base.max_key_length*2,
+ info->save_lastkey_data_length + info->save_lastkey_ref_length);
+ info->update= info->save_update | HA_STATE_WRITTEN;
+ info->lastinx= info->save_lastinx;
+ info->cur_row.lastpos= info->save_lastpos;
+ info->last_key.data_length= info->save_lastkey_data_length;
+ info->last_key.ref_length= info->save_lastkey_ref_length;
+ info->last_key.flag= 0;
+ }
+ info->read_record= share->read_record;
+ info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS);
+ break;
+ case HA_EXTRA_NO_USER_CHANGE: /* Database is somehow locked against changes */
+ info->lock_type= F_EXTRA_LCK; /* Simulate as locked */
+ break;
+ case HA_EXTRA_WAIT_LOCK:
+ info->lock_wait= 0;
+ break;
+ case HA_EXTRA_NO_WAIT_LOCK:
+ info->lock_wait= MY_SHORT_WAIT;
+ break;
+ case HA_EXTRA_NO_KEYS:
+ /* we're going to modify pieces of the state, stall Checkpoint */
+ pthread_mutex_lock(&share->intern_lock);
+ if (info->lock_type == F_UNLCK)
+ {
+ pthread_mutex_unlock(&share->intern_lock);
+ error= 1; /* Not possible if not locked */
+ break;
+ }
+ if (maria_is_any_key_active(share->state.key_map))
+ {
+ MARIA_KEYDEF *key= share->keyinfo;
+ uint i;
+ /* Disable all non-unique, non-auto-increment keys */
+ for (i =0 ; i < share->base.keys ; i++,key++)
+ {
+ if (!(key->flag & HA_NOSAME) && info->s->base.auto_key != i+1)
+ {
+ maria_clear_key_active(share->state.key_map, i);
+ info->update|= HA_STATE_CHANGED;
+ }
+ }
+
+ if (!share->changed)
+ {
+ share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED;
+ share->changed= 1; /* Update on close */
+ if (!share->global_changed)
+ {
+ share->global_changed= 1;
+ share->state.open_count++;
+ }
+ }
+ if (!share->now_transactional)
+ share->state.state= *info->state;
+ /*
+ That state write to disk must be done, even for transactional tables;
+ indeed the table's share is going to be lost (there was a
+ HA_EXTRA_FORCE_REOPEN before, which set share->last_version to
+ 0), and so the only way it leaves information (share->state.key_map)
+ for the posterity is by writing it to disk.
+ */
+ DBUG_ASSERT(!maria_in_recovery);
+ error= _ma_state_info_write(share,
+ MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+ MA_STATE_INFO_WRITE_FULL_INFO);
+ }
+ pthread_mutex_unlock(&share->intern_lock);
+ break;
+ case HA_EXTRA_FORCE_REOPEN:
+ /*
+ MySQL uses this case after it has closed all other instances
+ of this table.
+ We however do a flush here for additional safety.
+ */
+ /** @todo consider porting these flush-es to MyISAM */
+ DBUG_ASSERT(share->reopen == 1);
+ error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE);
+ if (!error && share->changed)
+ {
+ pthread_mutex_lock(&share->intern_lock);
+ if (!(error= _ma_state_info_write(share,
+ MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
+ MA_STATE_INFO_WRITE_FULL_INFO)))
+ share->changed= 0;
+ pthread_mutex_unlock(&share->intern_lock);
+ }
+ pthread_mutex_lock(&THR_LOCK_maria);
+ pthread_mutex_lock(&share->intern_lock); /* protect against Checkpoint */
+ /* this makes the share not be re-used next time the table is opened */
+ share->last_version= 0L; /* Impossible version */
+ pthread_mutex_unlock(&share->intern_lock);
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ break;
+ case HA_EXTRA_PREPARE_FOR_DROP:
+ /* Signals about intent to delete this table */
+ share->deleting= TRUE;
+ share->global_changed= FALSE; /* force writing changed flag */
+ /* To force repair if reopened */
+ _ma_mark_file_changed(info);
+ /* Fall through */
+ case HA_EXTRA_PREPARE_FOR_RENAME:
+ {
+ my_bool do_flush= test(function != HA_EXTRA_PREPARE_FOR_DROP);
+ enum flush_type type;
+ pthread_mutex_lock(&THR_LOCK_maria);
+ /*
+ This share, to have last_version=0, needs to save all its data/index
+ blocks to disk if this is not for a DROP TABLE. Otherwise they would be
+ invisible to future openers; and they could even go to disk late and
+ cancel the work of future openers.
+ */
+ if (info->lock_type != F_UNLCK && !info->was_locked)
+ {
+ info->was_locked= info->lock_type;
+ if (maria_lock_database(info, F_UNLCK))
+ error= my_errno;
+ info->lock_type= F_UNLCK;
+ }
+ /*
+ We don't need to call _ma_decrement_open_count() if we are
+ dropping the table, as the files will be removed anyway. If we
+ are aborted before the files are removed, it's better to not
+ call it as in that case the automatic repair on open will add
+ the missing index entries
+ */
+ pthread_mutex_lock(&share->intern_lock);
+ if (share->kfile.file >= 0 && function != HA_EXTRA_PREPARE_FOR_DROP)
+ _ma_decrement_open_count(info);
+ if (info->trn)
+ {
+ _ma_remove_table_from_trnman(share, info->trn);
+ /* Ensure we don't point to the deleted data in trn */
+ info->state= info->state_start= &share->state.state;
+ }
+
+ type= do_flush ? FLUSH_RELEASE : FLUSH_IGNORE_CHANGED;
+ if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ type, type))
+ {
+ error=my_errno;
+ share->changed= 1;
+ }
+ if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+ {
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ if (end_io_cache(&info->rec_cache))
+ error= 1;
+ }
+ if (share->kfile.file >= 0)
+ {
+ if (do_flush)
+ {
+ /* Save the state so that others can find it from disk. */
+ if ((share->changed &&
+ _ma_state_info_write(share,
+ MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+ MA_STATE_INFO_WRITE_FULL_INFO)) ||
+ my_sync(share->kfile.file, MYF(0)))
+ error= my_errno;
+ else
+ share->changed= 0;
+ }
+ else
+ {
+ /* be sure that state is not tried for write as file may be closed */
+ share->changed= 0;
+ }
+ }
+ if (share->data_file_type == BLOCK_RECORD &&
+ share->bitmap.file.file >= 0)
+ {
+ if (do_flush && my_sync(share->bitmap.file.file, MYF(0)))
+ error= my_errno;
+ }
+ /* For protection against Checkpoint, we set under intern_lock: */
+ share->last_version= 0L; /* Impossible version */
+ pthread_mutex_unlock(&share->intern_lock);
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ break;
+ }
+ case HA_EXTRA_PREPARE_FOR_FORCED_CLOSE:
+ if (info->trn)
+ {
+ pthread_mutex_lock(&share->intern_lock);
+ _ma_remove_table_from_trnman(share, info->trn);
+ /* Ensure we don't point to the deleted data in trn */
+ info->state= info->state_start= &share->state.state;
+ pthread_mutex_unlock(&share->intern_lock);
+ }
+ break;
+ case HA_EXTRA_FLUSH:
+ if (!share->temporary)
+ error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_KEEP, FLUSH_KEEP);
+#ifdef HAVE_PWRITE
+ _ma_decrement_open_count(info);
+#endif
+ if (share->not_flushed)
+ {
+ share->not_flushed= 0;
+ if (_ma_sync_table_files(info))
+ error= my_errno;
+ if (error)
+ {
+ share->changed= 1;
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info); /* Fatal error found */
+ }
+ }
+ break;
+ case HA_EXTRA_NORMAL: /* These aren't in use */
+ info->quick_mode= 0;
+ break;
+ case HA_EXTRA_QUICK:
+ info->quick_mode= 1;
+ break;
+ case HA_EXTRA_NO_ROWS:
+ if (!share->state.header.uniques)
+ info->opt_flag|= OPT_NO_ROWS;
+ break;
+ case HA_EXTRA_PRELOAD_BUFFER_SIZE:
+ info->preload_buff_size= *((ulong *) extra_arg);
+ break;
+ case HA_EXTRA_CHANGE_KEY_TO_UNIQUE:
+ case HA_EXTRA_CHANGE_KEY_TO_DUP:
+ maria_extra_keyflag(info, function);
+ break;
+ case HA_EXTRA_MMAP:
+#ifdef HAVE_MMAP
+ if (block_records)
+ break; /* Not supported */
+ pthread_mutex_lock(&share->intern_lock);
+ /*
+ Memory map the data file if it is not already mapped. It is safe
+ to memory map a file while other threads are using file I/O on it.
+ Assigning a new address to a function pointer is an atomic
+ operation. intern_lock prevents that two or more mappings are done
+ at the same time.
+ */
+ if (!share->file_map)
+ {
+ if (_ma_dynmap_file(info, share->state.state.data_file_length))
+ {
+ DBUG_PRINT("warning",("mmap failed: errno: %d",errno));
+ error= my_errno= errno;
+ }
+ else
+ {
+ share->file_read= _ma_mmap_pread;
+ share->file_write= _ma_mmap_pwrite;
+ }
+ }
+ pthread_mutex_unlock(&share->intern_lock);
+#endif
+ break;
+ case HA_EXTRA_MARK_AS_LOG_TABLE:
+ pthread_mutex_lock(&share->intern_lock);
+ share->is_log_table= TRUE;
+ pthread_mutex_unlock(&share->intern_lock);
+ break;
+ case HA_EXTRA_KEY_CACHE:
+ case HA_EXTRA_NO_KEY_CACHE:
+ default:
+ break;
+ }
+ DBUG_RETURN(error);
+} /* maria_extra */
+
+
+/*
+ Install an index-condition-pushdown callback on this handler.
+ 'func_arg' is stored alongside and handed back to 'func' on each call;
+ passing func == NULL clears the condition.
+*/
+
+void ma_set_index_cond_func(MARIA_HA *info, index_cond_func_t func,
+ void *func_arg)
+{
+ info->index_cond_func= func;
+ info->index_cond_func_arg= func_arg;
+}
+
+
+/*
+ Start/Stop Inserting Duplicates Into a Table, WL#1648.
+
+ Sets (HA_EXTRA_CHANGE_KEY_TO_UNIQUE) or clears
+ (HA_EXTRA_CHANGE_KEY_TO_DUP) the HA_NOSAME flag on every key of the
+ table. Any other 'function' value is a no-op.
+*/
+
+static void maria_extra_keyflag(MARIA_HA *info,
+ enum ha_extra_function function)
+{
+ uint idx;
+
+ for (idx= 0; idx< info->s->base.keys; idx++)
+ {
+ switch (function) {
+ case HA_EXTRA_CHANGE_KEY_TO_UNIQUE:
+ info->s->keyinfo[idx].flag|= HA_NOSAME;
+ break;
+ case HA_EXTRA_CHANGE_KEY_TO_DUP:
+ info->s->keyinfo[idx].flag&= ~(HA_NOSAME);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+
+/*
+ Reset a handler to its post-open defaults between statements:
+ ends any read/write record cache, shrinks oversized blob/record
+ buffers, and resets scan position and option flags.
+ Returns 0 on success, or the error from ending the IO cache.
+*/
+
+int maria_reset(MARIA_HA *info)
+{
+ int error= 0;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("maria_reset");
+ /*
+ Free buffers and reset the following flags:
+ EXTRA_CACHE, EXTRA_WRITE_CACHE, EXTRA_KEYREAD, EXTRA_QUICK
+
+ If the row buffer cache is large (for dynamic tables), reduce it
+ to save memory.
+ */
+ if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+ {
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ error= end_io_cache(&info->rec_cache);
+ }
+ /* Free memory used for keeping blobs */
+ if (share->base.blobs)
+ {
+ /*
+ NOTE(review): the _ma_alloc_buffer() results below are ignored;
+ presumably a failed shrink just keeps the old buffer — confirm.
+ */
+ if (info->rec_buff_size > share->base.default_rec_buff_size)
+ {
+ info->rec_buff_size= 1; /* Force realloc */
+ _ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+ share->base.default_rec_buff_size);
+ }
+ if (info->blob_buff_size > MARIA_SMALL_BLOB_BUFFER)
+ {
+ info->blob_buff_size= 1; /* Force realloc */
+ _ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
+ MARIA_SMALL_BLOB_BUFFER);
+ }
+ }
+#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
+ if (info->opt_flag & MEMMAP_USED)
+ madvise((char*) share->file_map, share->state.state.data_file_length,
+ MADV_RANDOM);
+#endif
+ info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS);
+ info->quick_mode= 0;
+ info->lastinx= 0; /* Use first index as def */
+ info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR;
+ info->page_changed= 1;
+ info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND |
+ HA_STATE_PREV_FOUND);
+ DBUG_RETURN(error);
+}
+
+
+/* Sync data file then index file to disk; returns 0 on success */
+
+int _ma_sync_table_files(const MARIA_HA *info)
+{
+ return (my_sync(info->dfile.file, MYF(MY_WME)) ||
+ my_sync(info->s->kfile.file, MYF(MY_WME)));
+}
+
+
+/**
+ @brief flushes the data and/or index file of a table
+
+ This is useful when one wants to read a table using OS syscalls (like
+ my_copy()) and first wants to be sure that MySQL-level caches go down to
+ the OS so that OS syscalls can see all data. It can flush rec_cache,
+ bitmap, pagecache of data file, pagecache of index file.
+
+ @param info table
+ @param flush_data_or_index one or two of these flags:
+ MARIA_FLUSH_DATA, MARIA_FLUSH_INDEX
+ @param flush_type_for_data
+ @param flush_type_for_index
+
+ @note does not sync files (@see _ma_sync_table_files()).
+ @note Progressively this function will be used in all places where we flush
+ the index but not the data file (probable bugs).
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index,
+ enum flush_type flush_type_for_data,
+ enum flush_type flush_type_for_index)
+{
+ int error= 0;
+ MARIA_SHARE *share= info->s;
+ /* flush data file first because it's more critical */
+ if (flush_data_or_index & MARIA_FLUSH_DATA)
+ {
+ if ((info->opt_flag & WRITE_CACHE_USED) &&
+ flush_type_for_data != FLUSH_IGNORE_CHANGED &&
+ flush_io_cache(&info->rec_cache))
+ error= 1;
+ if (share->data_file_type == BLOCK_RECORD)
+ {
+ if (flush_type_for_data != FLUSH_IGNORE_CHANGED)
+ {
+ if (_ma_bitmap_flush(share))
+ error= 1;
+ }
+ else
+ {
+ /* Caller wants changes thrown away; just clear the dirty flag */
+ pthread_mutex_lock(&share->bitmap.bitmap_lock);
+ share->bitmap.changed= 0;
+ pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+ }
+ if (flush_pagecache_blocks(share->pagecache, &info->dfile,
+ flush_type_for_data))
+ error= 1;
+ }
+ }
+ if ((flush_data_or_index & MARIA_FLUSH_INDEX) &&
+ flush_pagecache_blocks(share->pagecache, &share->kfile,
+ flush_type_for_index))
+ error= 1;
+ if (!error)
+ return 0;
+
+ /* Any flush failure may leave the table inconsistent on disk */
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info);
+ return 1;
+}
+
diff --git a/storage/maria/ma_ft_boolean_search.c b/storage/maria/ma_ft_boolean_search.c
new file mode 100644
index 00000000000..0783f679843
--- /dev/null
+++ b/storage/maria/ma_ft_boolean_search.c
@@ -0,0 +1,1042 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* TODO: add caching - pre-read several index entries at once */
+
+/*
+ Added optimization for full-text queries with plus-words. It was
+ implemented by sharing maximal document id (max_docid) variable
+ inside plus subtree. max_docid could be used by any word in plus
+ subtree, but it could be updated by plus-word only.
+
+ Fulltext "smarter index merge" optimization assumes that rows
+ it gets are ordered by doc_id. That is not the case when we
+ search for a word with truncation operator. It may return
+ rows in random order. Thus we may not use "smarter index merge"
+ optimization with "trunc-words".
+
+ The idea is: there is no need to search for docid smaller than
+ biggest docid inside current plus subtree or any upper plus subtree.
+
+ Examples:
+ +word1 word2
+ share same max_docid
+ max_docid updated by word1
+ +word1 +(word2 word3)
+ share same max_docid
+ max_docid updated by word1
+ +(word1 -word2) +(+word3 word4)
+ share same max_docid
+ max_docid updated by word3
+ +word1 word2 (+word3 word4 (+word5 word6))
+ three subexpressions (including the top-level one),
+ every one has its own max_docid, updated by its plus word.
+ but for the search word6 uses
+ max(word1.max_docid, word3.max_docid, word5.max_docid),
+ while word4 uses, accordingly,
+ max(word1.max_docid, word3.max_docid).
+*/
+
+#define FT_CORE
+#include "ma_ftdefs.h"
+
+/* search with boolean queries */
+
+static double _wghts[11]=
+{
+ 0.131687242798354,
+ 0.197530864197531,
+ 0.296296296296296,
+ 0.444444444444444,
+ 0.666666666666667,
+ 1.000000000000000,
+ 1.500000000000000,
+ 2.250000000000000,
+ 3.375000000000000,
+ 5.062500000000000,
+ 7.593750000000000};
+static double *wghts=_wghts+5; /* wghts[i] = 1.5**i */
+
+static double _nwghts[11]=
+{
+ -0.065843621399177,
+ -0.098765432098766,
+ -0.148148148148148,
+ -0.222222222222222,
+ -0.333333333333334,
+ -0.500000000000000,
+ -0.750000000000000,
+ -1.125000000000000,
+ -1.687500000000000,
+ -2.531250000000000,
+ -3.796875000000000};
+static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */
+
+#define FTB_FLAG_TRUNC 1
+/* At most one of the following flags can be set */
+#define FTB_FLAG_YES 2
+#define FTB_FLAG_NO 4
+#define FTB_FLAG_WONLY 8
+
+typedef struct st_ftb_expr FTB_EXPR;
+struct st_ftb_expr
+{
+ FTB_EXPR *up;
+ uint flags;
+/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
+ my_off_t docid[2];
+ my_off_t max_docid;
+ float weight;
+ float cur_weight;
+ LIST *phrase; /* phrase words */
+ LIST *document; /* for phrase search */
+ uint yesses; /* number of "yes" words matched */
+ uint nos; /* number of "no" words matched */
+ uint ythresh; /* number of "yes" words in expr */
+ uint yweaks; /* number of "yes" words for scan only */
+};
+
+typedef struct st_ftb_word
+{
+ FTB_EXPR *up;
+ uint flags;
+/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
+ my_off_t docid[2]; /* for index search and for scan */
+ my_off_t key_root;
+ FTB_EXPR *max_docid_expr;
+ MARIA_KEYDEF *keyinfo;
+ struct st_ftb_word *prev;
+ float weight;
+ uint ndepth;
+ uint len;
+ uchar off;
+ uchar word[1];
+} FTB_WORD;
+
+typedef struct st_ft_info
+{
+ struct _ft_vft *please;
+ MARIA_HA *info;
+ CHARSET_INFO *charset;
+ FTB_EXPR *root;
+ FTB_WORD **list;
+ FTB_WORD *last_word;
+ MEM_ROOT mem_root;
+ QUEUE queue;
+ TREE no_dupes;
+ my_off_t lastpos;
+ uint keynr;
+ uchar with_scan;
+ enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state;
+} FTB;
+
+/*
+ Priority-queue comparator for FTB_WORD: ORDER BY docid, ndepth DESC.
+ If 'v' is non-NULL and a's docid equals *v (the current document),
+ 'a' is treated as smaller so it sorts first.
+*/
+static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b)
+{
+ int i;
+
+ /* if a==curdoc, take it as a < b */
+ if (v && a->docid[0] == *v)
+ return -1;
+
+ /* ORDER BY docid, ndepth DESC */
+ i=CMP_NUM(a->docid[0], b->docid[0]);
+ if (!i)
+ i=CMP_NUM(b->ndepth,a->ndepth);
+ return i;
+}
+
+/*
+ Comparator for sorting the flat word list. Compares the word text
+ (skipping the length byte at word[0]) and then ndepth.
+*/
+static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
+{
+ /* ORDER BY word, ndepth */
+ int i= ha_compare_text(cs, (uchar*) (*a)->word + 1,(*a)->len - 1,
+ (uchar*) (*b)->word + 1,(*b)->len - 1, 0, 0);
+ if (!i)
+ i=CMP_NUM((*a)->ndepth, (*b)->ndepth);
+ return i;
+}
+
+
+typedef struct st_my_ftb_param
+{
+ FTB *ftb;
+ FTB_EXPR *ftbe;
+ uchar *up_quot;
+ uint depth;
+} MY_FTB_PARAM;
+
+
+/*
+ Parser callback: add one token of a boolean fulltext query to the FTB
+ tree being built in ftb_param->ftb. Words become FTB_WORD nodes linked
+ under the current FTB_EXPR; parentheses open/close sub-expressions;
+ words inside quotes are additionally collected into the enclosing
+ expression's phrase list. Always returns 0.
+*/
+static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
+ const uchar *word, mysql_ft_size_t word_len,
+ MYSQL_FTPARSER_BOOLEAN_INFO *info)
+{
+ MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
+ FTB_WORD *ftbw;
+ FTB_EXPR *ftbe, *tmp_expr;
+ FT_WORD *phrase_word;
+ LIST *tmp_element;
+ int r= info->weight_adjust;
+ /* Clamp weight_adjust to [-5,5] and map it into the weight tables */
+ float weight= (float)
+ (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)];
+
+ switch (info->type) {
+ case FT_TOKEN_WORD: /* A plain (or truncated) search word */
+ ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root,
+ sizeof(FTB_WORD) +
+ (info->trunc ? MARIA_MAX_KEY_BUFF :
+ word_len * ftb_param->ftb->charset->mbmaxlen +
+ HA_FT_WLEN +
+ ftb_param->ftb->info->s->rec_reflength));
+ ftbw->len= word_len + 1;
+ ftbw->flags= 0;
+ ftbw->off= 0;
+ if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES;
+ if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO;
+ if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC;
+ ftbw->weight= weight;
+ ftbw->up= ftb_param->ftbe;
+ ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR;
+ ftbw->ndepth= (info->yesno < 0) + ftb_param->depth;
+ ftbw->key_root= HA_OFFSET_ERROR;
+ memcpy(ftbw->word + 1, word, word_len);
+ ftbw->word[0]= word_len; /* length-prefixed word copy */
+ if (info->yesno > 0) ftbw->up->ythresh++;
+ ftb_param->ftb->queue.max_elements++;
+ ftbw->prev= ftb_param->ftb->last_word;
+ ftb_param->ftb->last_word= ftbw;
+ ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC);
+ /* Find the nearest enclosing non-YES expr to share max_docid with */
+ for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up)
+ if (! (tmp_expr->flags & FTB_FLAG_YES))
+ break;
+ ftbw->max_docid_expr= tmp_expr;
+ /* fall through */
+ case FT_TOKEN_STOPWORD: /* Stopwords only matter inside a quoted phrase */
+ if (! ftb_param->up_quot) break;
+ phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
+ tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
+ phrase_word->pos= (uchar*) word;
+ phrase_word->len= word_len;
+ tmp_element->data= (void *)phrase_word;
+ ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element);
+ /* Allocate document list at this point.
+ It allows to avoid huge amount of allocs/frees for each row.*/
+ tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
+ tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
+ ftb_param->ftbe->document=
+ list_add(ftb_param->ftbe->document, tmp_element);
+ break;
+ case FT_TOKEN_LEFT_PAREN: /* Open a new sub-expression */
+ ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR));
+ ftbe->flags= 0;
+ if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES;
+ if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO;
+ ftbe->weight= weight;
+ ftbe->up= ftb_param->ftbe;
+ ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
+ ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR;
+ ftbe->phrase= NULL;
+ ftbe->document= 0;
+ if (info->quot) ftb_param->ftb->with_scan|= 2;
+ if (info->yesno > 0) ftbe->up->ythresh++;
+ ftb_param->ftbe= ftbe;
+ ftb_param->depth++;
+ ftb_param->up_quot= (uchar*) info->quot;
+ break;
+ case FT_TOKEN_RIGHT_PAREN: /* Close the current sub-expression */
+ if (ftb_param->ftbe->document)
+ {
+ /* Circuit document list */
+ for (tmp_element= ftb_param->ftbe->document;
+ tmp_element->next; tmp_element= tmp_element->next) /* no-op */;
+ tmp_element->next= ftb_param->ftbe->document;
+ ftb_param->ftbe->document->prev= tmp_element;
+ }
+ info->quot= 0;
+ if (ftb_param->ftbe->up)
+ {
+ DBUG_ASSERT(ftb_param->depth);
+ ftb_param->ftbe= ftb_param->ftbe->up;
+ ftb_param->depth--;
+ ftb_param->up_quot= 0;
+ }
+ break;
+ case FT_TOKEN_EOF:
+ default:
+ break;
+ }
+ return(0);
+}
+
+
+/*
+ Built-in query parser: split 'query' into words with maria_ft_get_word()
+ and feed each one to param->mysql_add_word (ftb_query_add_word above).
+*/
+static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param,
+ const uchar *query, mysql_ft_size_t len)
+{
+ MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
+ MYSQL_FTPARSER_BOOLEAN_INFO info;
+ CHARSET_INFO *cs= ftb_param->ftb->charset;
+ const uchar **start= &query;
+ const uchar *end= query + len;
+ FT_WORD w;
+
+ info.prev= ' ';
+ info.quot= 0;
+ while (maria_ft_get_word(cs, start, end, &w, &info))
+ param->mysql_add_word(param, w.pos, w.len, &info);
+ return(0);
+}
+
+
+/*
+  Parse the boolean query string into the FTB expression tree rooted at
+  ftb->root, using the given parser plugin (which calls back into
+  ftb_parse_query_internal / ftb_query_add_word via param).
+
+  No-op (returns 0) unless ftb->state is still UNINITIALIZED.
+  Returns 0 on success, non-zero on error.
+*/
+static int _ftb_parse_query(FTB *ftb, uchar *query, size_t len,
+ struct st_mysql_ftparser *parser)
+{
+ MYSQL_FTPARSER_PARAM *param;
+ MY_FTB_PARAM ftb_param;
+ DBUG_ENTER("_ftb_parse_query");
+ DBUG_ASSERT(parser);
+
+ if (ftb->state != UNINITIALIZED)
+ DBUG_RETURN(0);
+ if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
+ DBUG_RETURN(1);
+
+ ftb_param.ftb= ftb;
+ ftb_param.depth= 0;
+ ftb_param.ftbe= ftb->root;
+ ftb_param.up_quot= 0;
+
+ param->mysql_parse= ftb_parse_query_internal;
+ param->mysql_add_word= ftb_query_add_word;
+ param->mysql_ftparam= (void *)&ftb_param;
+ param->cs= ftb->charset;
+ param->doc= query;
+ param->length= len;
+ param->flags= 0;
+ param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO;
+ DBUG_RETURN(parser->parse(param));
+}
+
+
+/*
+  TREE comparator for the no_dupes tree: orders my_off_t row positions,
+  so a row already returned once can be recognized and skipped.
+*/
+static int _ftb_no_dupes_cmp(void* not_used __attribute__((unused)),
+ const void *a,const void *b)
+{
+ return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b)));
+}
+
+
+/*
+  Advance the index cursor for one query word (ftbw) to its next
+  matching <word, docid> entry in the fulltext key tree.
+
+  init_search=TRUE starts at the first key >= the word; otherwise the
+  search continues past ftbw's previously saved key (lastkey_buf),
+  possibly skipping forward to the largest docid seen so far in the
+  word's expression chain.  Very frequent words may be stored in a
+  second-level "ft2" tree; this function descends into it and, when it
+  is exhausted, recurses once to resume in the first-level tree.
+
+  Returns 1 if the whole search is finished (a top-level must-have word
+  has no more matches), 0 otherwise.  On a match, ftbw->docid[0] is the
+  row position found.
+*/
+
+static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
+{
+ int r;
+ int subkeys=1;
+ my_bool can_go_down;
+ MARIA_HA *info=ftb->info;
+ uint off, extra=HA_FT_WLEN+info->s->base.rec_reflength;
+ uchar *lastkey_buf= ftbw->word+ftbw->off;
+ MARIA_KEY key;
+ LINT_INIT(off);
+
+ if (ftbw->flags & FTB_FLAG_TRUNC)
+ lastkey_buf+=ftbw->len;
+
+ if (init_search)
+ {
+ ftbw->key_root=info->s->state.key_root[ftb->keynr];
+ ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
+ key.keyinfo= ftbw->keyinfo;
+ key.data= ftbw->word;
+ key.data_length= ftbw->len;
+ key.ref_length= 0;
+ key.flag= 0;
+
+ r= _ma_search(info, &key, SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root);
+ }
+ else
+ {
+ uint sflag= SEARCH_BIGGER;
+ my_off_t max_docid=0;
+ FTB_EXPR *tmp;
+
+ /* docid optimization: skip entries that cannot improve any ancestor */
+ for (tmp= ftbw->max_docid_expr; tmp; tmp= tmp->up)
+ set_if_bigger(max_docid, tmp->max_docid);
+
+ if (ftbw->docid[0] < max_docid)
+ {
+ sflag|= SEARCH_SAME;
+ _ma_dpointer(info->s, (uchar*) (ftbw->word + ftbw->len + HA_FT_WLEN),
+ max_docid);
+ }
+
+ key.keyinfo= ftbw->keyinfo;
+ key.data= lastkey_buf;
+ key.data_length= USE_WHOLE_KEY;
+ key.ref_length= 0;
+ key.flag= 0;
+
+ r= _ma_search(info, &key, sflag, ftbw->key_root);
+ }
+
+ can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC)));
+ /* Skip rows inserted by concurrent insert */
+ while (!r)
+ {
+ if (can_go_down)
+ {
+ /* going down ? */
+ off= info->last_key.data_length + info->last_key.ref_length - extra;
+ subkeys=ft_sintXkorr(info->last_key.data + off);
+ }
+ if (subkeys<0 || info->cur_row.lastpos < info->state->data_file_length)
+ break;
+ r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, ftbw->key_root);
+ }
+
+ /* in the first-level tree, verify the found key still matches the word */
+ if (!r && !ftbw->off)
+ {
+ r= ha_compare_text(ftb->charset,
+ info->last_key.data+1,
+ info->last_key.data_length + info->last_key.ref_length-
+ extra-1,
+ (uchar*) ftbw->word+1,
+ ftbw->len-1,
+ (my_bool) (ftbw->flags & FTB_FLAG_TRUNC), 0);
+ }
+
+ if (r) /* not found */
+ {
+ if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC))
+ {
+ ftbw->docid[0]=HA_OFFSET_ERROR;
+ if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0)
+ {
+ /*
+ This word MUST BE present in every document returned,
+ so we can stop the search right now
+ */
+ ftb->state=INDEX_DONE;
+ return 1; /* search is done */
+ }
+ else
+ return 0;
+ }
+
+ /* going up to the first-level tree to continue search there */
+ _ma_dpointer(info->s, (lastkey_buf+HA_FT_WLEN), ftbw->key_root);
+ ftbw->key_root=info->s->state.key_root[ftb->keynr];
+ ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
+ ftbw->off=0;
+ return _ft2_search(ftb, ftbw, 0);
+ }
+
+ /* matching key found */
+ memcpy(lastkey_buf, info->last_key.data,
+ info->last_key.data_length + info->last_key.ref_length);
+ if (lastkey_buf == ftbw->word)
+ ftbw->len= info->last_key.data_length + info->last_key.ref_length - extra;
+
+ /* going down ? */
+ if (subkeys<0)
+ {
+ /*
+ yep, going down, to the second-level tree
+ TODO here: subkey-based optimization
+ */
+ ftbw->off=off;
+ ftbw->key_root= info->cur_row.lastpos;
+ ftbw->keyinfo=& info->s->ft2_keyinfo;
+ r= _ma_search_first(info, ftbw->keyinfo, ftbw->key_root);
+ DBUG_ASSERT(r==0); /* found something */
+ memcpy(lastkey_buf+off, info->last_key.data,
+ info->last_key.data_length + info->last_key.ref_length);
+ }
+ ftbw->docid[0]= info->cur_row.lastpos;
+ if (ftbw->flags & FTB_FLAG_YES && !(ftbw->flags & FTB_FLAG_TRUNC))
+ ftbw->max_docid_expr->max_docid= info->cur_row.lastpos;
+ return 0;
+}
+
+/*
+  Position every query word's index cursor at its first match and build
+  the priority queue ordering words by docid.  Words with the truncation
+  operator get special handling (cases 1-4 in the comment below): if a
+  scan will happen anyway they are excluded from the index search,
+  otherwise a no_dupes tree is prepared to filter duplicate rows.
+  Does nothing unless a parsed query and a real key exist.
+*/
+static void _ftb_init_index_search(FT_INFO *ftb)
+{
+ int i;
+ FTB_WORD *ftbw;
+
+ if (ftb->state == UNINITIALIZED || ftb->keynr == NO_SUCH_KEY)
+ return;
+ ftb->state=INDEX_SEARCH;
+
+ for (i= queue_last_element(&ftb->queue);
+ (int) i >= (int) queue_first_element(&ftb->queue);
+ i--)
+ {
+ ftbw=(FTB_WORD *)(queue_element(&ftb->queue, i));
+
+ if (ftbw->flags & FTB_FLAG_TRUNC)
+ {
+ /*
+ special treatment for truncation operator
+ 1. there are some (besides this) +words
+ | no need to search in the index, it can never ADD new rows
+ | to the result, and to remove half-matched rows we do scan anyway
+ 2. -trunc*
+ | same as 1.
+ 3. in 1 and 2, +/- need not be on the same expr. level,
+ but can be on any upper level, as in +word +(trunc1* trunc2*)
+ 4. otherwise
+ | We have to index-search for this prefix.
+ | It may cause duplicates, as in the index (sorted by <word,docid>)
+ | <aaaa,row1>
+ | <aabb,row2>
+ | <aacc,row1>
+ | Searching for "aa*" will find row1 twice...
+ */
+ FTB_EXPR *ftbe;
+ for (ftbe=(FTB_EXPR*)ftbw;
+ ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC);
+ ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up)
+ {
+ if (ftbe->flags & FTB_FLAG_NO || /* 2 */
+ ftbe->up->ythresh - ftbe->up->yweaks >
+ (uint) test(ftbe->flags & FTB_FLAG_YES)) /* 1 */
+ {
+ FTB_EXPR *top_ftbe=ftbe->up;
+ ftbw->docid[0]=HA_OFFSET_ERROR;
+ for (ftbe=(FTB_EXPR *)ftbw;
+ ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO);
+ ftbe=ftbe->up)
+ ftbe->up->yweaks++;
+ ftbe=0;
+ break;
+ }
+ }
+ if (!ftbe)
+ continue;
+ /* 4 */
+ if (!is_tree_inited(& ftb->no_dupes))
+ init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t),
+ _ftb_no_dupes_cmp,0,0,0);
+ else
+ reset_tree(& ftb->no_dupes);
+ }
+
+ ftbw->off=0; /* in case of reinit */
+ if (_ft2_search(ftb, ftbw, 1))
+ return;
+ }
+ queue_fix(& ftb->queue);
+}
+
+
+/*
+  Create and initialize a boolean fulltext search handle.
+
+  Allocates the FTB on the heap, parses the query into an expression
+  tree (all tree nodes live in ftb->mem_root), builds the word priority
+  queue and a sorted word list for relevance scans.
+
+  Returns the new handle, or NULL on allocation/parse failure
+  (everything allocated so far is freed).
+*/
+FT_INFO * maria_ft_init_boolean_search(MARIA_HA *info, uint keynr,
+ uchar *query, size_t query_len,
+ CHARSET_INFO *cs)
+{
+ FTB *ftb;
+ FTB_EXPR *ftbe;
+ FTB_WORD *ftbw;
+
+ if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME))))
+ return 0;
+ ftb->please= (struct _ft_vft *) & _ma_ft_vft_boolean;
+ ftb->state=UNINITIALIZED;
+ ftb->info=info;
+ ftb->keynr=keynr;
+ ftb->charset=cs;
+ DBUG_ASSERT(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset);
+ ftb->with_scan=0;
+ ftb->lastpos=HA_OFFSET_ERROR;
+ bzero(& ftb->no_dupes, sizeof(TREE));
+ ftb->last_word= 0;
+
+ init_alloc_root(&ftb->mem_root, 1024, 1024);
+ ftb->queue.max_elements= 0;
+ /* root expression node: implicit top-level "+(...)" */
+ if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR))))
+ goto err;
+ ftbe->weight=1;
+ ftbe->flags=FTB_FLAG_YES;
+ ftbe->nos=1;
+ ftbe->up=0;
+ ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
+ ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR;
+ ftbe->phrase= NULL;
+ ftbe->document= 0;
+ ftb->root=ftbe;
+ if (unlikely(_ftb_parse_query(ftb, query, query_len,
+ keynr == NO_SUCH_KEY ? &ft_default_parser :
+ info->s->keyinfo[keynr].parser)))
+ goto err;
+ /*
+ Hack: instead of init_queue, we'll use reinit queue to be able
+ to alloc queue with alloc_root()
+ */
+ if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root,
+ (ftb->queue.max_elements + 1) *
+ sizeof(void *))))
+ goto err;
+ reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0,
+ (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0, 0, 0);
+ for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev)
+ queue_insert(&ftb->queue, (uchar *)ftbw);
+ /* flat, charset-sorted copy of the words, used by find_relevance */
+ ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root,
+ sizeof(FTB_WORD *)*ftb->queue.elements);
+ memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
+ my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
+ (qsort2_cmp)FTB_WORD_cmp_list, (void*) ftb->charset);
+ if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
+ ftb->state=READY;
+ return ftb;
+err:
+ free_root(& ftb->mem_root, MYF(0));
+ my_free(ftb, MYF(0));
+ return 0;
+}
+
+
+/* State shared between the phrase-check parser callbacks. */
+typedef struct st_my_ftb_phrase_param
+{
+ LIST *phrase; /* the phrase words from the query (read-only) */
+ LIST *document; /* circular list of the last N words seen in the doc */
+ CHARSET_INFO *cs; /* charset used for word comparison */
+ uint phrase_length; /* number of words in the phrase */
+ uint document_length; /* document words collected so far */
+ uint match; /* incremented for every phrase occurrence found */
+} MY_FTB_PHRASE_PARAM;
+
+
+/*
+  Phrase-check callback: push one document word into the circular
+  document list and, once enough words are buffered, compare the window
+  word-by-word against the query phrase; bump ->match on full match.
+  Always returns 0.
+*/
+static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param,
+ const uchar *word, mysql_ft_size_t word_len,
+ MYSQL_FTPARSER_BOOLEAN_INFO
+ *boolean_info __attribute__((unused)))
+{
+ MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
+ FT_WORD *w= (FT_WORD *)phrase_param->document->data;
+ LIST *phrase, *document;
+ w->pos= (uchar*) word;
+ w->len= word_len;
+ phrase_param->document= phrase_param->document->prev;
+ if (phrase_param->phrase_length > phrase_param->document_length)
+ {
+ phrase_param->document_length++;
+ return 0;
+ }
+ /* TODO: rewrite phrase search to avoid
+ comparing the same word twice. */
+ for (phrase= phrase_param->phrase, document= phrase_param->document->next;
+ phrase; phrase= phrase->next, document= document->next)
+ {
+ FT_WORD *phrase_word= (FT_WORD *)phrase->data;
+ FT_WORD *document_word= (FT_WORD *)document->data;
+ if (my_strnncoll(phrase_param->cs, (uchar*) phrase_word->pos,
+ phrase_word->len,
+ (uchar*) document_word->pos, document_word->len))
+ return 0;
+ }
+ phrase_param->match++;
+ return 0;
+}
+
+
+/*
+  Phrase-check "parser": split the document buffer into words and feed
+  them to ftb_phrase_add_word, stopping early once a match is found.
+  Always returns 0; the result is reported via phrase_param->match.
+*/
+static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param,
+ const uchar *document,
+ mysql_ft_size_t len)
+{
+ FT_WORD word;
+ MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
+ const uchar *docend= document + len;
+ while (maria_ft_simple_get_word(phrase_param->cs, &document,
+ docend, &word, FALSE))
+ {
+ param->mysql_add_word(param, word.pos, word.len, 0);
+ if (phrase_param->match)
+ break;
+ }
+ return 0;
+}
+
+
+/*
+  Checks if the given buffer matches the phrase list.
+
+  SYNOPSIS
+    _ftb_check_phrase()
+    ftb       boolean-search handle
+    document  start of the buffer to scan
+    len       length of the buffer
+    ftbe      expression node holding the phrase list
+    parser    fulltext parser plugin to use
+
+  RETURN VALUE
+    1 is returned if the phrase was found, 0 otherwise.
+    -1 is returned if an error occurs.
+*/
+
+/*
+  Run the phrase-check parser over one document buffer.
+  Returns 1 if the phrase in ftbe was found, 0 if not (or if the parser
+  could not be initialized), -1 on parser error.
+*/
+static int _ftb_check_phrase(FTB *ftb, const uchar *document, size_t len,
+ FTB_EXPR *ftbe, struct st_mysql_ftparser *parser)
+{
+ MY_FTB_PHRASE_PARAM ftb_param;
+ MYSQL_FTPARSER_PARAM *param;
+ DBUG_ENTER("_ftb_check_phrase");
+ DBUG_ASSERT(parser);
+
+ if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 1)))
+ DBUG_RETURN(0);
+ ftb_param.phrase= ftbe->phrase;
+ ftb_param.document= ftbe->document;
+ ftb_param.cs= ftb->charset;
+ ftb_param.phrase_length= list_length(ftbe->phrase);
+ ftb_param.document_length= 1;
+ ftb_param.match= 0;
+
+ param->mysql_parse= ftb_check_phrase_internal;
+ param->mysql_add_word= ftb_phrase_add_word;
+ param->mysql_ftparam= (void *)&ftb_param;
+ param->cs= ftb->charset;
+ param->doc= document;
+ param->length= len;
+ param->flags= 0;
+ param->mode= MYSQL_FTPARSER_WITH_STOPWORDS;
+ if (unlikely(parser->parse(param)))
+ DBUG_RETURN(-1); /* plain "return" would unbalance the DBUG stack */
+ DBUG_RETURN(ftb_param.match ? 1 : 0);
+}
+
+
+/*
+  Propagate a word match up the expression tree.
+
+  Starting from the matched word ftbw, walk its ancestor FTB_EXPR chain,
+  accumulating weights and yes/no counters for the current document.
+  mode distinguishes index search (ftsi_orig == 0, docid[0]) from a
+  record scan (ftsi_orig != 0, docid[1]); in scan mode, phrases are
+  verified against the record via _ftb_check_phrase.
+
+  Returns 0 on success, 1 on error (phrase check failed).
+*/
+static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
+{
+ FT_SEG_ITERATOR ftsi;
+ FTB_EXPR *ftbe;
+ float weight=ftbw->weight;
+ int yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0);
+ my_off_t curdoc=ftbw->docid[mode];
+ struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
+ &ft_default_parser :
+ ftb->info->s->keyinfo[ftb->keynr].parser;
+
+ for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
+ {
+ ythresh = ftbe->ythresh - (mode ? 0 : ftbe->yweaks);
+ if (ftbe->docid[mode] != curdoc)
+ {
+ /* first event for this node in this document: reset counters */
+ ftbe->cur_weight=0;
+ ftbe->yesses=ftbe->nos=0;
+ ftbe->docid[mode]=curdoc;
+ }
+ if (ftbe->nos)
+ break;
+ if (yn_flag & FTB_FLAG_YES)
+ {
+ weight /= ftbe->ythresh;
+ ftbe->cur_weight += weight;
+ if ((int) ++ftbe->yesses == ythresh)
+ {
+ yn_flag=ftbe->flags;
+ weight=ftbe->cur_weight*ftbe->weight;
+ if (mode && ftbe->phrase)
+ {
+ /* all phrase words present - verify word order in the record */
+ int found= 0;
+
+ memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
+ while (_ma_ft_segiterator(&ftsi) && !found)
+ {
+ if (!ftsi.pos)
+ continue;
+ found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser);
+ if (unlikely(found < 0))
+ return 1;
+ }
+ if (!found)
+ break;
+ } /* ftbe->quot */
+ }
+ else
+ break;
+ }
+ else
+ if (yn_flag & FTB_FLAG_NO)
+ {
+ /*
+ NOTE: special sort function of queue assures that all
+ (yn_flag & FTB_FLAG_NO) != 0
+ events for every particular subexpression will
+ "auto-magically" happen BEFORE all the
+ (yn_flag & FTB_FLAG_YES) != 0 events. So no
+ already matched expression can become not-matched again.
+ */
+ ++ftbe->nos;
+ break;
+ }
+ else
+ {
+ if (ftbe->ythresh)
+ weight/=3;
+ ftbe->cur_weight += weight;
+ if ((int) ftbe->yesses < ythresh)
+ break;
+ if (!(yn_flag & FTB_FLAG_WONLY))
+ yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ;
+ weight*= ftbe->weight;
+ }
+ }
+ return 0;
+}
+
+
+/*
+  Fetch the next matching row of a boolean fulltext search.
+
+  Pops words from the docid-ordered queue; when all words pointing at
+  the same document have been climbed up the tree, checks whether the
+  root expression is satisfied and, if so, reads the record.
+
+  Returns 0 on success (record filled in), otherwise my_errno
+  (HA_ERR_END_OF_FILE when the search is exhausted).
+*/
+int maria_ft_boolean_read_next(FT_INFO *ftb, char *record)
+{
+ FTB_EXPR *ftbe;
+ FTB_WORD *ftbw;
+ MARIA_HA *info=ftb->info;
+ my_off_t curdoc;
+
+ if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE)
+ return -1;
+
+ /* black magic ON */
+ if ((int) _ma_check_index(info, ftb->keynr) < 0)
+ return my_errno;
+ if (_ma_readinfo(info, F_RDLCK, 1))
+ return my_errno;
+ /* black magic OFF */
+
+ if (!ftb->queue.elements)
+ return my_errno=HA_ERR_END_OF_FILE;
+
+ /* Attention!!! Address of a local variable is used here! See err: label */
+ ftb->queue.first_cmp_arg=(void *)&curdoc;
+
+ while (ftb->state == INDEX_SEARCH &&
+ (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) !=
+ HA_OFFSET_ERROR)
+ {
+ /* consume every queued word that points at the current document */
+ while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
+ {
+ if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0)))
+ {
+ my_errno= HA_ERR_OUT_OF_MEM;
+ goto err;
+ }
+
+ /* update queue */
+ _ft2_search(ftb, ftbw, 0);
+ queue_replace_top(&ftb->queue);
+ }
+
+ ftbe=ftb->root;
+ if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 &&
+ ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos)
+ {
+ /* curdoc matched ! */
+ if (is_tree_inited(&ftb->no_dupes) &&
+ tree_insert(&ftb->no_dupes, &curdoc, 0,
+ ftb->no_dupes.custom_arg)->count >1)
+ /* but it managed already to get past this line once */
+ continue;
+
+ info->cur_row.lastpos= curdoc;
+ /* Clear all states, except that the table was updated */
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+ if (!(*info->read_record)(info, (uchar *) record, curdoc))
+ {
+ info->update|= HA_STATE_AKTIV; /* Record is read */
+ if (ftb->with_scan &&
+ maria_ft_boolean_find_relevance(ftb, (uchar *) record, 0)==0)
+ continue; /* no match */
+ my_errno=0;
+ goto err;
+ }
+ goto err;
+ }
+ }
+ ftb->state=INDEX_DONE;
+ my_errno=HA_ERR_END_OF_FILE;
+err:
+ ftb->queue.first_cmp_arg=(void *)0;
+ return my_errno;
+}
+
+
+/* State shared between the find-relevance parser callbacks. */
+typedef struct st_my_ftb_find_param
+{
+ FT_INFO *ftb; /* the boolean-search handle */
+ FT_SEG_ITERATOR *ftsi; /* segment iterator over the current record */
+} MY_FTB_FIND_PARAM;
+
+
+/*
+  Relevance-scan callback: binary-search the sorted query-word array for
+  the document word, then walk left over all matching entries (needed
+  for duplicated query words and truncation-operator prefixes) and climb
+  the tree for each previously-unseen match.
+  Returns 0 on success, 1 if climbing the tree failed.
+*/
+static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
+ const uchar *word, mysql_ft_size_t len,
+ MYSQL_FTPARSER_BOOLEAN_INFO
+ *boolean_info __attribute__((unused)))
+{
+ MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
+ FT_INFO *ftb= ftb_param->ftb;
+ FTB_WORD *ftbw;
+ int a, b, c;
+ /*
+ Find right-most element in the array of query words matching this
+ word from a document.
+ */
+ for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2)
+ {
+ ftbw= ftb->list[c];
+ if (ha_compare_text(ftb->charset, (uchar*)word, len,
+ (uchar*)ftbw->word+1, ftbw->len-1,
+ (my_bool)(ftbw->flags&FTB_FLAG_TRUNC), 0) < 0)
+ b= c;
+ else
+ a= c;
+ }
+ /*
+ If there were no words with truncation operator, we iterate to the
+ beginning of an array until array element is equal to the word from
+ a document. This is done mainly because the same word may be
+ mentioned twice (or more) in the query.
+
+ In case query has words with truncation operator we must iterate
+ to the beginning of the array. There may be non-matching query words
+ between matching word with truncation operator and the right-most
+ matching element. E.g., if we're looking for 'aaa15' in an array of
+ 'aaa1* aaa14 aaa15 aaa16'.
+
+ Worse of that there still may be match even if the binary search
+ above didn't find matching element. E.g., if we're looking for
+ 'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will
+ stop at 'aaa16'.
+ */
+ for (; c >= 0; c--)
+ {
+ ftbw= ftb->list[c];
+ if (ha_compare_text(ftb->charset, (uchar*)word, len,
+ (uchar*)ftbw->word + 1,ftbw->len - 1,
+ (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0))
+ {
+ if (ftb->with_scan & FTB_FLAG_TRUNC)
+ continue;
+ else
+ break;
+ }
+ if (ftbw->docid[1] == ftb->info->cur_row.lastpos)
+ continue;
+ ftbw->docid[1]= ftb->info->cur_row.lastpos;
+ if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi)))
+ return 1;
+ }
+ return(0);
+}
+
+
+/*
+  Relevance-scan "parser": split one record segment into words and pass
+  each to ftb_find_relevance_add_word.  Always returns 0.
+*/
+static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param,
+ const uchar *doc, mysql_ft_size_t len)
+{
+ MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
+ FT_INFO *ftb= ftb_param->ftb;
+ const uchar *end= doc + len;
+ FT_WORD w;
+ while (maria_ft_simple_get_word(ftb->charset, &doc, end, &w, TRUE))
+ param->mysql_add_word(param, w.pos, w.len, 0);
+ return(0);
+}
+
+
+/*
+  Compute the boolean relevance of one record by scanning its fulltext
+  columns (used for rows found without the index, and to verify
+  truncation/phrase matches).
+
+  Returns the root expression's weight if the record matches, 0.0 if it
+  does not (or on parser failure), -2.0 if no current row is set.
+*/
+float maria_ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length)
+{
+ FTB_EXPR *ftbe;
+ FT_SEG_ITERATOR ftsi, ftsi2;
+ MARIA_RECORD_POS docid= ftb->info->cur_row.lastpos;
+ MY_FTB_FIND_PARAM ftb_param;
+ MYSQL_FTPARSER_PARAM *param;
+ struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
+ &ft_default_parser :
+ ftb->info->s->keyinfo[ftb->keynr].parser;
+
+ if (docid == HA_OFFSET_ERROR)
+ return -2.0;
+ if (!ftb->queue.elements)
+ return 0;
+ if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
+ return 0;
+
+ /* walking backwards: reset scan-mode docids for a fresh pass */
+ if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos)
+ {
+ FTB_EXPR *x;
+ uint i;
+
+ for (i=0; i < ftb->queue.elements; i++)
+ {
+ ftb->list[i]->docid[1]=HA_OFFSET_ERROR;
+ for (x=ftb->list[i]->up; x; x=x->up)
+ x->docid[1]=HA_OFFSET_ERROR;
+ }
+ }
+
+ ftb->lastpos=docid;
+
+ if (ftb->keynr==NO_SUCH_KEY)
+ _ma_ft_segiterator_dummy_init(record, length, &ftsi);
+ else
+ _ma_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
+ memcpy(&ftsi2, &ftsi, sizeof(ftsi));
+
+ ftb_param.ftb= ftb;
+ ftb_param.ftsi= &ftsi2;
+ param->mysql_parse= ftb_find_relevance_parse;
+ param->mysql_add_word= ftb_find_relevance_add_word;
+ param->mysql_ftparam= (void *)&ftb_param;
+ param->flags= 0;
+ param->cs= ftb->charset;
+ param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
+
+ while (_ma_ft_segiterator(&ftsi))
+ {
+ if (!ftsi.pos)
+ continue;
+ param->doc= ftsi.pos;
+ param->length= ftsi.len;
+ if (unlikely(parser->parse(param)))
+ return 0;
+ }
+ ftbe=ftb->root;
+ if (ftbe->docid[1]==docid && ftbe->cur_weight>0 &&
+ ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
+ { /* row matched ! */
+ return ftbe->cur_weight;
+ }
+ else
+ { /* match failed ! */
+ return 0.0;
+ }
+}
+
+
+/*
+  Free a boolean-search handle: the no_dupes tree (if used), all tree
+  nodes and buffers in the mem_root, and the FTB itself.
+*/
+void maria_ft_boolean_close_search(FT_INFO *ftb)
+{
+ if (is_tree_inited(& ftb->no_dupes))
+ {
+ delete_tree(& ftb->no_dupes);
+ }
+ free_root(& ftb->mem_root, MYF(0));
+ my_free(ftb, MYF(0));
+}
+
+
+/* Relevance of the last matched row: the root node's current weight. */
+float maria_ft_boolean_get_relevance(FT_INFO *ftb)
+{
+ return ftb->root->cur_weight;
+}
+
+
+/* Restart the index search from the beginning (e.g. after rnd_init). */
+void maria_ft_boolean_reinit_search(FT_INFO *ftb)
+{
+ _ftb_init_index_search(ftb);
+}
diff --git a/storage/maria/ma_ft_eval.c b/storage/maria/ma_ft_eval.c
new file mode 100644
index 00000000000..5fc67c6c664
--- /dev/null
+++ b/storage/maria/ma_ft_eval.c
@@ -0,0 +1,254 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code
+ added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */
+
+#include "ma_ftdefs.h"
+#include "maria_ft_eval.h"
+#include <stdarg.h>
+#include <my_getopt.h>
+
+static void print_error(int exit_code, const char *fmt,...);
+static void get_options(int argc, char *argv[]);
+static int create_record(char *pos, FILE *file);
+static void usage();
+
+/*
+  Command-line options for the eval test program.  Only single-letter
+  forms are defined; the long names and help strings are deliberately
+  empty.  's' takes a stopword file, 'q' is quiet, 'S' disables the
+  precompiled stopwords, '#' sets a DBUG trace string.
+*/
+static struct my_option my_long_options[] =
+{
+ {"", 's', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'q', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", '#', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+/*
+  Fulltext evaluation test program: builds a Maria table with one
+  VARCHAR docid column and one BLOB text column indexed with a
+  HA_FULLTEXT key, loads records from the d_file, then runs each line
+  of the q_file as a natural-language query and prints the matches
+  with their relevance.  Returns 0 on success, 1 on any Maria error.
+*/
+int main(int argc, char *argv[])
+{
+ MARIA_HA *file;
+ int i,j;
+
+ MY_INIT(argv[0]);
+ get_options(argc,argv);
+ bzero((char*)recinfo,sizeof(recinfo));
+
+ maria_init();
+ /* First define 2 columns */
+ recinfo[0].type=FIELD_SKIP_ENDSPACE;
+ recinfo[0].length=docid_length;
+ recinfo[1].type=FIELD_BLOB;
+ recinfo[1].length= 4+portable_sizeof_char_ptr;
+
+ /* Define a key over the first column */
+ keyinfo[0].seg=keyseg;
+ keyinfo[0].keysegs=1;
+ keyinfo[0].block_length= 0; /* Default block length */
+ keyinfo[0].seg[0].type= HA_KEYTYPE_TEXT;
+ keyinfo[0].seg[0].flag= HA_BLOB_PART;
+ keyinfo[0].seg[0].start=recinfo[0].length;
+ keyinfo[0].seg[0].length=key_length;
+ keyinfo[0].seg[0].null_bit=0;
+ keyinfo[0].seg[0].null_pos=0;
+ keyinfo[0].seg[0].bit_start=4;
+ keyinfo[0].seg[0].language=MY_CHARSET_CURRENT;
+ keyinfo[0].flag = HA_FULLTEXT;
+
+ if (!silent)
+ printf("- Creating isam-file\n");
+ if (maria_create(filename,1,keyinfo,2,recinfo,0,NULL,(MARIA_CREATE_INFO*) 0,0))
+ goto err;
+ if (!(file=maria_open(filename,2,0)))
+ goto err;
+ if (!silent)
+ printf("Initializing stopwords\n");
+ maria_ft_init_stopwords(stopwordlist);
+
+ if (!silent)
+ printf("- Writing key:s\n");
+
+ my_errno=0;
+ i=0;
+ while (create_record(record,df))
+ {
+ error=maria_write(file,record);
+ if (error)
+ printf("I= %2d maria_write: %d errno: %d\n",i,error,my_errno);
+ i++;
+ }
+ fclose(df);
+
+ if (maria_close(file)) goto err;
+ if (!silent)
+ printf("- Reopening file\n");
+ if (!(file=maria_open(filename,2,0))) goto err;
+ if (!silent)
+ printf("- Reading rows with key\n");
+ /* each query line is loaded into blob_record by create_record() */
+ for (i=1;create_record(record,qf);i++)
+ {
+ FT_DOCLIST *result;
+ double w;
+ int t, err;
+
+ result=maria_ft_nlq_init_search(file,0,blob_record,(uint) strlen(blob_record),1);
+ if (!result)
+ {
+ printf("Query %d failed with errno %3d\n",i,my_errno);
+ goto err;
+ }
+ if (!silent)
+ printf("Query %d. Found: %d.\n",i,result->ndocs);
+ for (j=0;(err=maria_ft_nlq_read_next(result, read_record))==0;j++)
+ {
+ t=uint2korr(read_record);
+ w=maria_ft_nlq_get_relevance(result);
+ printf("%d %.*s %f\n",i,t,read_record+2,w);
+ }
+ if (err != HA_ERR_END_OF_FILE)
+ {
+ printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno);
+ goto err;
+ }
+ maria_ft_nlq_close_search(result);
+ }
+
+ if (maria_close(file)) goto err;
+ maria_end();
+ my_end(MY_CHECK_ERROR);
+
+ return (0);
+
+ err:
+ printf("got error: %3d when using maria-database\n",my_errno);
+ return 1; /* skip warning */
+
+}
+
+
+/*
+  my_getopt callback handling one command-line option.
+  's' loads a stopword list from a file (growing the array as needed),
+  'q' sets quiet mode, 'S' drops the precompiled stopwords, '#' pushes
+  a DBUG trace string; 'V'/'?'/'h' print usage and exit.
+*/
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+ char *argument)
+{
+ switch (optid) {
+ case 's':
+ /* ignore a second -s once a custom list is already loaded */
+ if (stopwordlist && stopwordlist != maria_ft_precompiled_stopwords)
+ break;
+ {
+ FILE *f; char s[HA_FT_MAXLEN]; int i=0,n=SWL_INIT;
+
+ if (!(stopwordlist=(const char**) malloc(n*sizeof(char *))))
+ print_error(1,"malloc(%d)",n*sizeof(char *));
+ if (!(f=fopen(argument,"r")))
+ print_error(1,"fopen(%s)",argument);
+ while (!feof(f))
+ {
+ if (!(fgets(s,HA_FT_MAXLEN,f)))
+ print_error(1,"fgets(s,%d,%s)",HA_FT_MAXLEN,argument);
+ if (!(stopwordlist[i++]=strdup(s)))
+ print_error(1,"strdup(%s)",s);
+ if (i >= n)
+ {
+ n+=SWL_PLUS;
+ if (!(stopwordlist=(const char**) realloc((char*) stopwordlist,
+ n*sizeof(char *))))
+ print_error(1,"realloc(%d)",n*sizeof(char *));
+ }
+ }
+ fclose(f);
+ stopwordlist[i]=NULL;
+ break;
+ }
+ case 'q': silent=1; break;
+ case 'S': if (stopwordlist==maria_ft_precompiled_stopwords) stopwordlist=NULL; break;
+ case '#':
+ DBUG_PUSH (argument);
+ break;
+ case 'V':
+ case '?':
+ case 'h':
+ usage();
+ exit(1);
+ }
+ return 0;
+}
+
+
+/*
+  Parse command-line options, then open the two positional file
+  arguments: the data file (d_file) and the query file (q_file).
+  Exits via print_error() on any failure.
+  NOTE(review): argv is indexed with optind after handle_options() has
+  already consumed the options - verify this matches the my_getopt
+  contract for positional arguments on all platforms.
+*/
+static void get_options(int argc, char *argv[])
+{
+ int ho_error;
+
+ if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option)))
+ exit(ho_error);
+
+ if (!(d_file=argv[optind])) print_error(1,"No d_file");
+ if (!(df=fopen(d_file,"r")))
+ print_error(1,"fopen(%s)",d_file);
+ if (!(q_file=argv[optind+1])) print_error(1,"No q_file");
+ if (!(qf=fopen(q_file,"r")))
+ print_error(1,"fopen(%s)",q_file);
+ return;
+} /* get options */
+
+
+/*
+  Read one logical record from the given file into the row buffer at
+  pos: a length-prefixed VARCHAR docid line, then a BLOB line whose
+  bytes stay in the global blob_record (only a pointer is stored in
+  the row).  Returns 1 on success, 0 at EOF; exits on read errors.
+*/
+static int create_record(char *pos, FILE *file)
+{
+ uint tmp; char *ptr;
+
+ bzero((char *)pos,MAX_REC_LENGTH);
+
+ /* column 1 - VARCHAR */
+ if (!(fgets(pos+2,MAX_REC_LENGTH-32,file)))
+ {
+ if (feof(file))
+ return 0;
+ else
+ print_error(1,"fgets(docid) - 1");
+ }
+ tmp=(uint) strlen(pos+2)-1; /* drop the trailing newline */
+ int2store(pos,tmp);
+ pos+=recinfo[0].length;
+
+ /* column 2 - BLOB */
+
+ if (!(fgets(blob_record,MAX_BLOB_LENGTH,file)))
+ print_error(1,"fgets(docid) - 2");
+ tmp=(uint) strlen(blob_record);
+ int4store(pos,tmp);
+ ptr=blob_record;
+ memcpy_fixed(pos+4,&ptr,sizeof(char*));
+ return 1;
+}
+
+/* VARARGS */
+
+/*
+  Print a printf-style error message to stderr, prefixed with the
+  program name, then exit with the given code.  Never returns.
+*/
+static void print_error(int exit_code, const char *fmt,...)
+{
+ va_list args;
+
+ va_start(args,fmt);
+ fprintf(stderr,"%s: error: ",my_progname);
+ VOID(vfprintf(stderr, fmt, args));
+ VOID(fputc('\n',stderr));
+ fflush(stderr);
+ va_end(args);
+ exit(exit_code);
+}
+
+
+/* Print usage synopsis, option help and variable defaults. */
+static void usage()
+{
+ printf("%s [options]\n", my_progname);
+ my_print_help(my_long_options);
+ my_print_variables(my_long_options);
+}
diff --git a/storage/maria/ma_ft_eval.h b/storage/maria/ma_ft_eval.h
new file mode 100644
index 00000000000..481943dfb0b
--- /dev/null
+++ b/storage/maria/ma_ft_eval.h
@@ -0,0 +1,41 @@
+/* Copyright (C) 2006 MySQL AB & Sergei A. Golubchik
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/*
+  Global state for the ft_eval test program.  This header defines (not
+  just declares) the variables and is meant to be included by exactly
+  one translation unit (ma_ft_eval.c).
+*/
+const char **stopwordlist=maria_ft_precompiled_stopwords;
+
+#define MAX_REC_LENGTH 128
+#define MAX_BLOB_LENGTH 60000
+/* row buffer, read buffer (row + blob), and the blob data itself */
+char record[MAX_REC_LENGTH], read_record[MAX_REC_LENGTH+MAX_BLOB_LENGTH];
+char blob_record[MAX_BLOB_LENGTH+20*20];
+
+char *filename= (char*) "EVAL"; /* name of the test table */
+
+int silent=0, error=0;
+
+uint key_length=MAX_BLOB_LENGTH,docid_length=32;
+char *d_file, *q_file; /* data-file and query-file names */
+FILE *df,*qf; /* ...and their open streams */
+
+MARIA_COLUMNDEF recinfo[3];
+MARIA_KEYDEF keyinfo[2];
+HA_KEYSEG keyseg[10];
+
+/* stopword-list growth parameters for the -s option */
+#define SWL_INIT 500
+#define SWL_PLUS 50
+
+#define MAX_LINE_LENGTH 128
+char line[MAX_LINE_LENGTH];
diff --git a/storage/maria/ma_ft_nlq_search.c b/storage/maria/ma_ft_nlq_search.c
new file mode 100644
index 00000000000..3bb7defcaaf
--- /dev/null
+++ b/storage/maria/ma_ft_nlq_search.c
@@ -0,0 +1,380 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+#define FT_CORE
+#include "ma_ftdefs.h"
+
+/* search with natural language queries */
+
+typedef struct ft_doc_rec
+{
+ my_off_t dpos;
+ double weight;
+} FT_DOC;
+
+struct st_ft_info
+{
+ struct _ft_vft *please;
+ MARIA_HA *info;
+ int ndocs;
+ int curdoc;
+ FT_DOC doc[1];
+};
+
+typedef struct st_all_in_one
+{
+ MARIA_HA *info;
+ uint keynr;
+ CHARSET_INFO *charset;
+ uchar *keybuff;
+ TREE dtree;
+} ALL_IN_ONE;
+
+typedef struct st_ft_superdoc
+{
+ FT_DOC doc;
+ FT_WORD *word_ptr;
+ double tmp_weight;
+} FT_SUPERDOC;
+
+
+static int FT_SUPERDOC_cmp(void* cmp_arg __attribute__((unused)),
+ FT_SUPERDOC *p1, FT_SUPERDOC *p2)
+{
+ if (p1->doc.dpos < p2->doc.dpos)
+ return -1;
+ if (p1->doc.dpos == p2->doc.dpos)
+ return 0;
+ return 1;
+}
+
+static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
+{
+ FT_WEIGTH subkeys;
+ int r;
+ uint doc_cnt;
+ FT_SUPERDOC sdoc, *sptr;
+ TREE_ELEMENT *selem;
+ double gweight=1;
+ MARIA_HA *info= aio->info;
+ uchar *keybuff= aio->keybuff;
+ MARIA_KEYDEF *keyinfo= info->s->keyinfo+aio->keynr;
+ my_off_t key_root=info->s->state.key_root[aio->keynr];
+ uint extra=HA_FT_WLEN+info->s->base.rec_reflength;
+ MARIA_KEY key;
+#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
+ float tmp_weight;
+#else
+#error
+#endif
+ DBUG_ENTER("walk_and_match");
+
+ word->weight=LWS_FOR_QUERY;
+
+ _ma_ft_make_key(info, &key, aio->keynr, keybuff, word, 0);
+ key.data_length-= HA_FT_WLEN;
+ doc_cnt=0;
+
+  /* Skip rows inserted by concurrent inserts (beyond data_file_length) */
+ for (r= _ma_search(info, &key, SEARCH_FIND, key_root) ;
+ !r &&
+ (subkeys.i= ft_sintXkorr(info->last_key.data +
+ info->last_key.data_length +
+ info->last_key.ref_length - extra)) > 0 &&
+ info->cur_row.lastpos >= info->state->data_file_length ;
+ r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root))
+ ;
+
+ info->update|= HA_STATE_AKTIV; /* for _ma_test_if_changed() */
+
+ /* The following should be safe, even if we compare doubles */
+ while (!r && gweight)
+ {
+
+ if (key.data_length &&
+ ha_compare_text(aio->charset,
+ info->last_key.data+1,
+ info->last_key.data_length +
+ info->last_key.ref_length - extra - 1,
+ key.data+1, key.data_length-1, 0, 0))
+ break;
+
+ if (subkeys.i < 0)
+ {
+ if (doc_cnt)
+ DBUG_RETURN(1); /* index is corrupted */
+ /*
+ TODO here: unsafe optimization, should this word
+ be skipped (based on subkeys) ?
+ */
+ keybuff+= key.data_length;
+ keyinfo= &info->s->ft2_keyinfo;
+ key_root= info->cur_row.lastpos;
+ key.data_length= 0;
+ r= _ma_search_first(info, keyinfo, key_root);
+ goto do_skip;
+ }
+#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
+ /* The weight we read was actually a float */
+ tmp_weight= subkeys.f;
+#else
+#error
+#endif
+ /* The following should be safe, even if we compare doubles */
+ if (tmp_weight==0)
+ DBUG_RETURN(doc_cnt); /* stopword, doc_cnt should be 0 */
+
+ sdoc.doc.dpos= info->cur_row.lastpos;
+
+ /* saving document matched into dtree */
+ if (!(selem=tree_insert(&aio->dtree, &sdoc, 0, aio->dtree.custom_arg)))
+ DBUG_RETURN(1);
+
+ sptr=(FT_SUPERDOC *)ELEMENT_KEY((&aio->dtree), selem);
+
+ if (selem->count==1) /* document's first match */
+ sptr->doc.weight=0;
+ else
+ sptr->doc.weight+=sptr->tmp_weight*sptr->word_ptr->weight;
+
+ sptr->word_ptr=word;
+ sptr->tmp_weight=tmp_weight;
+
+ doc_cnt++;
+
+ gweight=word->weight*GWS_IN_USE;
+ if (gweight < 0 || doc_cnt > 2000000)
+ gweight=0;
+
+ if (_ma_test_if_changed(info) == 0)
+ r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root);
+ else
+ r= _ma_search(info, &info->last_key, SEARCH_BIGGER, key_root);
+do_skip:
+ while ((subkeys.i= ft_sintXkorr(info->last_key.data +
+ info->last_key.data_length +
+ info->last_key.ref_length - extra)) > 0 &&
+ !r && info->cur_row.lastpos >= info->state->data_file_length)
+ r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root);
+
+ }
+ word->weight=gweight;
+
+ DBUG_RETURN(0);
+}
+
+
+static int walk_and_copy(FT_SUPERDOC *from,
+ uint32 count __attribute__((unused)), FT_DOC **to)
+{
+ DBUG_ENTER("walk_and_copy");
+ from->doc.weight+=from->tmp_weight*from->word_ptr->weight;
+ (*to)->dpos=from->doc.dpos;
+ (*to)->weight=from->doc.weight;
+ (*to)++;
+ DBUG_RETURN(0);
+}
+
+static int walk_and_push(FT_SUPERDOC *from,
+                         uint32 count __attribute__((unused)), QUEUE *best)
+{
+  DBUG_ENTER("walk_and_push"); /* was "walk_and_copy": copy-paste, mislabeled trace */
+  from->doc.weight+=from->tmp_weight*from->word_ptr->weight;
+  set_if_smaller(best->elements, ft_query_expansion_limit-1); /* cap queue size */
+  queue_insert(best, (uchar *)& from->doc);
+  DBUG_RETURN(0);
+}
+
+
+static int FT_DOC_cmp(void *unused __attribute__((unused)),
+ FT_DOC *a, FT_DOC *b)
+{
+ return sgn(b->weight - a->weight);
+}
+
+
+FT_INFO *maria_ft_init_nlq_search(MARIA_HA *info, uint keynr, uchar *query,
+ size_t query_len, uint flags, uchar *record)
+{
+ TREE wtree;
+ ALL_IN_ONE aio;
+ FT_DOC *dptr;
+ FT_INFO *dlist=NULL;
+ MARIA_RECORD_POS saved_lastpos= info->cur_row.lastpos;
+ struct st_mysql_ftparser *parser;
+ MYSQL_FTPARSER_PARAM *ftparser_param;
+ DBUG_ENTER("maria_ft_init_nlq_search");
+
+ /* black magic ON */
+ if ((int) (keynr = _ma_check_index(info,keynr)) < 0)
+ DBUG_RETURN(NULL);
+ if (_ma_readinfo(info,F_RDLCK,1))
+ DBUG_RETURN(NULL);
+ /* black magic OFF */
+
+ aio.info=info;
+ aio.keynr=keynr;
+ aio.charset=info->s->keyinfo[keynr].seg->charset;
+ aio.keybuff= info->lastkey_buff2;
+ parser= info->s->keyinfo[keynr].parser;
+ if (! (ftparser_param= maria_ftparser_call_initializer(info, keynr, 0)))
+ goto err;
+
+ bzero(&wtree,sizeof(wtree));
+
+ init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp,0,
+ NULL, NULL);
+
+ maria_ft_parse_init(&wtree, aio.charset);
+ ftparser_param->flags= 0;
+ if (maria_ft_parse(&wtree, query, query_len, parser, ftparser_param,
+ &wtree.mem_root))
+ goto err;
+
+ if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
+ left_root_right))
+ goto err;
+
+ if (flags & FT_EXPAND && ft_query_expansion_limit)
+ {
+ QUEUE best;
+ init_queue(&best,ft_query_expansion_limit,0,0, (queue_compare) &FT_DOC_cmp,
+ 0, 0, 0);
+ tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push,
+ &best, left_root_right);
+ while (best.elements)
+ {
+ my_off_t docid= ((FT_DOC *)queue_remove_top(&best))->dpos;
+ if (!(*info->read_record)(info, record, docid))
+ {
+ info->update|= HA_STATE_AKTIV;
+ ftparser_param->flags= MYSQL_FTFLAGS_NEED_COPY;
+ if (unlikely(_ma_ft_parse(&wtree, info, keynr, record, ftparser_param,
+ &wtree.mem_root)))
+ {
+ delete_queue(&best);
+ goto err;
+ }
+ }
+ }
+ delete_queue(&best);
+ reset_tree(&aio.dtree);
+ if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
+ left_root_right))
+ goto err;
+
+ }
+
+ /*
+ If ndocs == 0, this will not allocate RAM for FT_INFO.doc[],
+ so if ndocs == 0, FT_INFO.doc[] must not be accessed.
+ */
+ dlist=(FT_INFO *)my_malloc(sizeof(FT_INFO)+
+ sizeof(FT_DOC)*
+ (int)(aio.dtree.elements_in_tree-1),
+ MYF(0));
+ if (!dlist)
+ goto err;
+
+ dlist->please= (struct _ft_vft *) & _ma_ft_vft_nlq;
+ dlist->ndocs=aio.dtree.elements_in_tree;
+ dlist->curdoc=-1;
+ dlist->info=aio.info;
+ dptr=dlist->doc;
+
+ tree_walk(&aio.dtree, (tree_walk_action) &walk_and_copy,
+ &dptr, left_root_right);
+
+ if (flags & FT_SORTED)
+ my_qsort2(dlist->doc, dlist->ndocs, sizeof(FT_DOC),
+ (qsort2_cmp)&FT_DOC_cmp, 0);
+
+err:
+ delete_tree(&aio.dtree);
+ delete_tree(&wtree);
+ info->cur_row.lastpos= saved_lastpos;
+ DBUG_RETURN(dlist);
+}
+
+
+int maria_ft_nlq_read_next(FT_INFO *handler, char *record)
+{
+ MARIA_HA *info= (MARIA_HA *) handler->info;
+
+ if (++handler->curdoc >= handler->ndocs)
+ {
+ --handler->curdoc;
+ return HA_ERR_END_OF_FILE;
+ }
+
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+ info->cur_row.lastpos= handler->doc[handler->curdoc].dpos;
+ if (!(*info->read_record)(info, (uchar *) record, info->cur_row.lastpos))
+ {
+ info->update|= HA_STATE_AKTIV; /* Record is read */
+ return 0;
+ }
+ return my_errno;
+}
+
+
+float maria_ft_nlq_find_relevance(FT_INFO *handler,
+ uchar *record __attribute__((unused)),
+ uint length __attribute__((unused)))
+{
+ int a,b,c;
+ FT_DOC *docs=handler->doc;
+ MARIA_RECORD_POS docid= handler->info->cur_row.lastpos;
+
+ if (docid == HA_POS_ERROR)
+ return -5.0;
+
+ /* Assuming docs[] is sorted by dpos... */
+
+ for (a=0, b=handler->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2)
+ {
+ if (docs[c].dpos > docid)
+ b=c;
+ else
+ a=c;
+ }
+ /* bounds check to avoid accessing unallocated handler->doc */
+ if (a < handler->ndocs && docs[a].dpos == docid)
+ return (float) docs[a].weight;
+ else
+ return 0.0;
+}
+
+
+void maria_ft_nlq_close_search(FT_INFO *handler)
+{
+ my_free(handler, MYF(0));
+}
+
+
+float maria_ft_nlq_get_relevance(FT_INFO *handler)
+{
+ return (float) handler->doc[handler->curdoc].weight;
+}
+
+
+void maria_ft_nlq_reinit_search(FT_INFO *handler)
+{
+ handler->curdoc=-1;
+}
+
diff --git a/storage/maria/ma_ft_parser.c b/storage/maria/ma_ft_parser.c
new file mode 100644
index 00000000000..b35c2227ca2
--- /dev/null
+++ b/storage/maria/ma_ft_parser.c
@@ -0,0 +1,417 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+#include "ma_ftdefs.h"
+
+typedef struct st_maria_ft_docstat {
+ FT_WORD *list;
+ uint uniq;
+ double sum;
+} FT_DOCSTAT;
+
+
+typedef struct st_my_maria_ft_parser_param
+{
+ TREE *wtree;
+ MEM_ROOT *mem_root;
+} MY_FT_PARSER_PARAM;
+
+
+static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
+{
+ return ha_compare_text(cs, (uchar*) w1->pos, w1->len,
+ (uchar*) w2->pos, w2->len, 0, 0);
+}
+
+static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
+{
+ word->weight=LWS_IN_USE;
+ docstat->sum+=word->weight;
+ memcpy_fixed((docstat->list)++,word,sizeof(FT_WORD));
+ return 0;
+}
+
+/* transforms tree of words into the array, applying normalization */
+
+FT_WORD * maria_ft_linearize(TREE *wtree, MEM_ROOT *mem_root)
+{
+ FT_WORD *wlist,*p;
+ FT_DOCSTAT docstat;
+ DBUG_ENTER("maria_ft_linearize");
+
+ if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)*
+ (1+wtree->elements_in_tree))))
+ {
+ docstat.list=wlist;
+ docstat.uniq=wtree->elements_in_tree;
+ docstat.sum=0;
+ tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
+ }
+ delete_tree(wtree);
+ if (!wlist)
+ DBUG_RETURN(NULL);
+
+ docstat.list->pos=NULL;
+
+ for (p=wlist;p->pos;p++)
+ {
+ p->weight=PRENORM_IN_USE;
+ }
+
+ for (p=wlist;p->pos;p++)
+ {
+ p->weight/=NORM_IN_USE;
+ }
+
+ DBUG_RETURN(wlist);
+}
+
+my_bool maria_ft_boolean_check_syntax_string(const uchar *str)
+{
+ uint i, j;
+
+ if (!str ||
+ (strlen((const char *) str) + 1 != sizeof(ft_boolean_syntax)) ||
+ (str[0] != ' ' && str[1] != ' '))
+ return 1;
+ for (i=0; i<sizeof(ft_boolean_syntax); i++)
+ {
+ /* limiting to 7-bit ascii only */
+ if ((unsigned char)(str[i]) > 127 ||
+ my_isalnum(default_charset_info, str[i]))
+ return 1;
+ for (j=0; j<i; j++)
+ if (str[i] == str[j] && (i != 11 || j != 10))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ RETURN VALUE
+ 0 - eof
+ 1 - word found
+ 2 - left bracket
+ 3 - right bracket
+ 4 - stopword found
+*/
+uchar maria_ft_get_word(CHARSET_INFO *cs, const uchar **start,
+ const uchar *end,
+ FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
+{
+ const uchar *doc= *start;
+ int ctype;
+ uint mwc, length;
+ int mbl;
+
+ param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
+ param->weight_adjust= param->wasign= 0;
+ param->type= FT_TOKEN_EOF;
+
+ while (doc<end)
+ {
+ for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
+ {
+ mbl= cs->cset->ctype(cs, &ctype, doc, end);
+ if (true_word_char(ctype, *doc))
+ break;
+ if (*doc == FTB_RQUOT && param->quot)
+ {
+ param->quot= (char *) doc;
+ *start=doc+1;
+ param->type= FT_TOKEN_RIGHT_PAREN;
+ goto ret;
+ }
+ if (!param->quot)
+ {
+ if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
+ {
+ /* param->prev=' '; */
+ *start=doc+1;
+ if (*doc == FTB_LQUOT)
+ param->quot= (char *) *start;
+ param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
+ goto ret;
+ }
+ if (param->prev == ' ')
+ {
+ if (*doc == FTB_YES ) { param->yesno=+1; continue; } else
+ if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else
+ if (*doc == FTB_NO ) { param->yesno=-1; continue; } else
+ if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
+ if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
+ if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
+ }
+ }
+ param->prev=*doc;
+ param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
+ param->weight_adjust= param->wasign= 0;
+ }
+
+ mwc=length=0;
+ for (word->pos= doc; doc < end; length++,
+ doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
+ {
+ mbl= cs->cset->ctype(cs, &ctype, doc, end);
+ if (true_word_char(ctype, *doc))
+ mwc=0;
+ else if (!misc_word_char(*doc) || mwc)
+ break;
+ else
+ mwc++;
+ }
+ param->prev='A'; /* be sure *prev is true_word_char */
+ word->len= (uint)(doc-word->pos) - mwc;
+ if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
+ doc++;
+
+ if (((length >= ft_min_word_len && !is_stopword((char *) word->pos,
+ word->len))
+ || param->trunc) && length < ft_max_word_len)
+ {
+ *start=doc;
+ param->type= FT_TOKEN_WORD;
+ goto ret;
+ }
+ else if (length) /* make sure length > 0 (if start contains spaces only) */
+ {
+ *start= doc;
+ param->type= FT_TOKEN_STOPWORD;
+ goto ret;
+ }
+ }
+ if (param->quot)
+ {
+ param->quot= (char *)(*start= doc);
+    param->type= 3; /* 3 == FT_TOKEN_RIGHT_PAREN: unterminated quote closed at eof */
+ goto ret;
+ }
+ret:
+ return param->type;
+}
+
+uchar maria_ft_simple_get_word(CHARSET_INFO *cs, const uchar **start,
+ const uchar *end, FT_WORD *word,
+ my_bool skip_stopwords)
+{
+ const uchar *doc= *start;
+ uint mwc, length;
+ int ctype, mbl;
+ DBUG_ENTER("maria_ft_simple_get_word");
+
+ do
+ {
+ for (;; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
+ {
+ if (doc >= end)
+ DBUG_RETURN(0);
+ mbl= cs->cset->ctype(cs, &ctype, doc, end);
+ if (true_word_char(ctype, *doc))
+ break;
+ }
+
+ mwc= length= 0;
+ for (word->pos= doc; doc < end; length++,
+ doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
+ {
+ mbl= cs->cset->ctype(cs, &ctype, doc, end);
+ if (true_word_char(ctype, *doc))
+ mwc= 0;
+ else if (!misc_word_char(*doc) || mwc)
+ break;
+ else
+ mwc++;
+ }
+
+ word->len= (uint)(doc-word->pos) - mwc;
+
+ if (skip_stopwords == FALSE ||
+ (length >= ft_min_word_len && length < ft_max_word_len &&
+ !is_stopword((char *) word->pos, word->len)))
+ {
+ *start= doc;
+ DBUG_RETURN(1);
+ }
+ } while (doc < end);
+ DBUG_RETURN(0);
+}
+
+void maria_ft_parse_init(TREE *wtree, CHARSET_INFO *cs)
+{
+ DBUG_ENTER("maria_ft_parse_init");
+ if (!is_tree_inited(wtree))
+ init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp,0, NULL,
+ (void*) cs);
+ DBUG_VOID_RETURN;
+}
+
+
+static int maria_ft_add_word(MYSQL_FTPARSER_PARAM *param,
+ const uchar *word, mysql_ft_size_t word_len,
+ MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info
+ __attribute__((unused)))
+{
+ TREE *wtree;
+ FT_WORD w;
+ MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
+ DBUG_ENTER("maria_ft_add_word");
+ wtree= ft_param->wtree;
+ if (param->flags & MYSQL_FTFLAGS_NEED_COPY)
+ {
+ uchar *ptr;
+ DBUG_ASSERT(wtree->with_delete == 0);
+ ptr= (uchar *)alloc_root(ft_param->mem_root, word_len);
+ memcpy(ptr, word, word_len);
+ w.pos= ptr;
+ }
+ else
+ w.pos= word;
+ w.len= word_len;
+ if (!tree_insert(wtree, &w, 0, wtree->custom_arg))
+ {
+ delete_tree(wtree);
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+
+static int maria_ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
+ const uchar *doc_arg,
+ mysql_ft_size_t doc_len)
+{
+ const uchar *doc= doc_arg;
+ const uchar *end= doc + doc_len;
+ MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
+ TREE *wtree= ft_param->wtree;
+ FT_WORD w;
+ DBUG_ENTER("maria_ft_parse_internal");
+
+ while (maria_ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE))
+ if (param->mysql_add_word(param, w.pos, w.len, 0))
+ DBUG_RETURN(1);
+ DBUG_RETURN(0);
+}
+
+
+int maria_ft_parse(TREE *wtree, uchar *doc, size_t doclen,
+ struct st_mysql_ftparser *parser,
+ MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
+{
+ MY_FT_PARSER_PARAM my_param;
+ DBUG_ENTER("maria_ft_parse");
+ DBUG_ASSERT(parser);
+ my_param.wtree= wtree;
+ my_param.mem_root= mem_root;
+
+ param->mysql_parse= maria_ft_parse_internal;
+ param->mysql_add_word= maria_ft_add_word;
+ param->mysql_ftparam= &my_param;
+ param->cs= wtree->custom_arg;
+ param->doc= doc;
+ param->length= doclen;
+ param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
+ DBUG_RETURN(parser->parse(param));
+}
+
+
+#define MAX_PARAM_NR 2
+
+MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info)
+{
+ if (!info->ftparser_param)
+ {
+ /*
+    info->ftparser_param cannot be zero after the initialization,
+ because it always includes built-in fulltext parser. And built-in
+ parser can be called even if the table has no fulltext indexes and
+ no varchar/text fields.
+
+ ftb_find_relevance... parser (ftb_find_relevance_parse,
+ ftb_find_relevance_add_word) calls ftb_check_phrase... parser
+ (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
+ */
+ info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
+ my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
+ info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL));
+ init_alloc_root(&info->ft_memroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
+ }
+ return info->ftparser_param;
+}
+
+
+MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
+ uint keynr, uint paramnr)
+{
+ uint32 ftparser_nr;
+ struct st_mysql_ftparser *parser;
+
+ if (!maria_ftparser_alloc_param(info))
+ return 0;
+
+ if (keynr == NO_SUCH_KEY)
+ {
+ ftparser_nr= 0;
+ parser= &ft_default_parser;
+ }
+ else
+ {
+ ftparser_nr= info->s->keyinfo[keynr].ftkey_nr;
+ parser= info->s->keyinfo[keynr].parser;
+ }
+ DBUG_ASSERT(paramnr < MAX_PARAM_NR);
+ ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr;
+ if (! info->ftparser_param[ftparser_nr].mysql_add_word)
+ {
+ /* Note, that mysql_add_word is used here as a flag:
+ mysql_add_word == 0 - parser is not initialized
+ mysql_add_word != 0 - parser is initialized, or no
+ initialization needed. */
+ info->ftparser_param[ftparser_nr].mysql_add_word=
+ (int (*)(struct st_mysql_ftparser_param *, const uchar *,
+ mysql_ft_size_t, MYSQL_FTPARSER_BOOLEAN_INFO *)) 1;
+ if (parser->init && parser->init(&info->ftparser_param[ftparser_nr]))
+ return 0;
+ }
+ return &info->ftparser_param[ftparser_nr];
+}
+
+
+void maria_ftparser_call_deinitializer(MARIA_HA *info)
+{
+ uint i, j, keys= info->s->state.header.keys;
+ free_root(&info->ft_memroot, MYF(0));
+ if (! info->ftparser_param)
+ return;
+ for (i= 0; i < keys; i++)
+ {
+ MARIA_KEYDEF *keyinfo= &info->s->keyinfo[i];
+ for (j=0; j < MAX_PARAM_NR; j++)
+ {
+ MYSQL_FTPARSER_PARAM *ftparser_param=
+ &info->ftparser_param[keyinfo->ftkey_nr*MAX_PARAM_NR + j];
+ if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word)
+ {
+ if (keyinfo->parser->deinit)
+ keyinfo->parser->deinit(ftparser_param);
+ ftparser_param->mysql_add_word= 0;
+ }
+ else
+ break;
+ }
+ }
+}
diff --git a/storage/maria/ma_ft_stem.c b/storage/maria/ma_ft_stem.c
new file mode 100644
index 00000000000..06fc0b2df6c
--- /dev/null
+++ b/storage/maria/ma_ft_stem.c
@@ -0,0 +1,18 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* multilingual stem */
diff --git a/storage/maria/ma_ft_test1.c b/storage/maria/ma_ft_test1.c
new file mode 100644
index 00000000000..4c98e766234
--- /dev/null
+++ b/storage/maria/ma_ft_test1.c
@@ -0,0 +1,317 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code
+ added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */
+
+#include "ma_ftdefs.h"
+#include "maria_ft_test1.h"
+#include <my_getopt.h>
+
+static int key_field=FIELD_VARCHAR,extra_field=FIELD_SKIP_ENDSPACE;
+static uint key_length=200,extra_length=50;
+static int key_type=HA_KEYTYPE_TEXT;
+static int verbose=0,silent=0,skip_update=0,
+ no_keys=0,no_stopwords=0,no_search=0,no_fulltext=0;
+static int create_flag=0,error=0;
+
+#define MAX_REC_LENGTH 300
+static char record[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH];
+
+static int run_test(const char *filename);
+static void get_options(int argc, char *argv[]);
+static void create_record(char *, int);
+static void usage();
+
+static struct my_option my_long_options[] =
+{
+  {"", 'v', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  /* duplicate 'v' entry removed: 'v' is already registered above */
+  {"", 's', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'N', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'K', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'F', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'U', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", '#', "", 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+int main(int argc, char *argv[])
+{
+ MY_INIT(argv[0]);
+
+ get_options(argc,argv);
+ maria_init();
+
+ exit(run_test("FT1"));
+}
+
+static MARIA_COLUMNDEF recinfo[3];
+static MARIA_KEYDEF keyinfo[2];
+static HA_KEYSEG keyseg[10];
+
+static int run_test(const char *filename)
+{
+ MARIA_HA *file;
+ int i,j;
+ my_off_t pos;
+
+ bzero((char*) recinfo,sizeof(recinfo));
+
+ /* First define 2 columns */
+ recinfo[0].type=extra_field;
+ recinfo[0].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr :
+ extra_length);
+ if (extra_field == FIELD_VARCHAR)
+ recinfo[0].length+= HA_VARCHAR_PACKLENGTH(extra_length);
+ recinfo[1].type=key_field;
+ recinfo[1].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr :
+ key_length);
+ if (key_field == FIELD_VARCHAR)
+ recinfo[1].length+= HA_VARCHAR_PACKLENGTH(key_length);
+
+ /* Define a key over the first column */
+ keyinfo[0].seg=keyseg;
+ keyinfo[0].keysegs=1;
+ keyinfo[0].block_length= 0; /* Default block length */
+ keyinfo[0].seg[0].type= key_type;
+ keyinfo[0].seg[0].flag= (key_field == FIELD_BLOB) ? HA_BLOB_PART:
+ (key_field == FIELD_VARCHAR) ? HA_VAR_LENGTH_PART:0;
+ keyinfo[0].seg[0].start=recinfo[0].length;
+ keyinfo[0].seg[0].length=key_length;
+ keyinfo[0].seg[0].null_bit= 0;
+ keyinfo[0].seg[0].null_pos=0;
+ keyinfo[0].seg[0].language= default_charset_info->number;
+ keyinfo[0].flag = (no_fulltext?HA_PACK_KEY:HA_FULLTEXT);
+
+ if (!silent)
+ printf("- Creating isam-file\n");
+ if (maria_create(filename,(no_keys?0:1),keyinfo,2,recinfo,0,NULL,
+ (MARIA_CREATE_INFO*) 0, create_flag))
+ goto err;
+ if (!(file=maria_open(filename,2,0)))
+ goto err;
+
+ if (!silent)
+ printf("- %s stopwords\n",no_stopwords?"Skipping":"Initializing");
+ maria_ft_init_stopwords(no_stopwords?NULL:maria_ft_precompiled_stopwords);
+
+ if (!silent)
+ printf("- Writing key:s\n");
+
+ my_errno=0;
+ for (i=NUPD ; i<NDATAS; i++ )
+ {
+ create_record(record,i);
+ error=maria_write(file,record);
+ if (verbose || error)
+ printf("I= %2d maria_write: %d errno: %d, record: %s\n",
+ i,error,my_errno,data[i].f0);
+ }
+
+ if (!skip_update)
+ {
+ if (!silent)
+ printf("- Updating rows\n");
+
+ /* Read through all rows and update them */
+ pos=(ha_rows) 0;
+ i=0;
+ while ((error=maria_rrnd(file,read_record,pos)) == 0)
+ {
+ create_record(record,NUPD-i-1);
+ if (maria_update(file,read_record,record))
+ {
+ printf("Can't update row: %.*s, error: %d\n",
+ keyinfo[0].seg[0].length,record,my_errno);
+ }
+ if(++i == NUPD) break;
+ pos=HA_OFFSET_ERROR;
+ }
+ if (i != NUPD)
+ printf("Found %d of %d rows\n", i,NUPD);
+ }
+
+ if (maria_close(file)) goto err;
+ if(no_search) return 0;
+ if (!silent)
+ printf("- Reopening file\n");
+ if (!(file=maria_open(filename,2,0))) goto err;
+ if (!silent)
+ printf("- Reading rows with key\n");
+ for (i=0 ; i < NQUERIES ; i++)
+ {
+ FT_DOCLIST *result;
+ result=maria_ft_nlq_init_search(file,0,(char*) query[i],strlen(query[i]),1);
+ if(!result)
+ {
+ printf("Query %d: `%s' failed with errno %3d\n",i,query[i],my_errno);
+ continue;
+ }
+ printf("Query %d: `%s'. Found: %d. Top five documents:\n",
+ i,query[i],result->ndocs);
+ for (j=0;j<5;j++)
+ {
+ double w; int err;
+ err= maria_ft_nlq_read_next(result, read_record);
+ if (err==HA_ERR_END_OF_FILE)
+ {
+ printf("No more matches!\n");
+ break;
+ }
+ else if (err)
+ {
+ printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno);
+ break;
+ }
+ w=maria_ft_nlq_get_relevance(result);
+ if (key_field == FIELD_VARCHAR)
+ {
+ uint l;
+ char *p;
+ p=recinfo[0].length+read_record;
+ l=uint2korr(p);
+ printf("%10.7f: %.*s\n",w,(int) l,p+2);
+ }
+ else
+ printf("%10.7f: %.*s\n",w,recinfo[1].length,
+ recinfo[0].length+read_record);
+ }
+ maria_ft_nlq_close_search(result);
+ }
+
+ if (maria_close(file)) goto err;
+ maria_end();
+ my_end(MY_CHECK_ERROR);
+
+ return (0);
+err:
+ printf("got error: %3d when using maria-database\n",my_errno);
+ return 1; /* skip warning */
+}
+
+static char blob_key[MAX_REC_LENGTH];
+/* static char blob_record[MAX_REC_LENGTH+20*20]; */
+
+void create_record(char *pos, int n)
+{
+  bzero((char*) pos,MAX_REC_LENGTH);
+  if (recinfo[0].type == FIELD_BLOB)
+  {
+    uint tmp;
+    char *ptr;
+    strnmov(blob_key,data[n].f0,keyinfo[0].seg[0].length);
+    tmp=strlen(blob_key);
+    int4store(pos,tmp);
+    ptr=blob_key;
+    memcpy_fixed(pos+4,&ptr,sizeof(char*));
+    pos+=recinfo[0].length;
+  }
+  else if (recinfo[0].type == FIELD_VARCHAR)
+  {
+    uint tmp;
+    /* -1 is here because pack_length is stored in seg->length */
+    uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1);
+    strnmov(pos+pack_length,data[n].f0,keyinfo[0].seg[0].length);
+    tmp=strlen(pos+pack_length);
+    if (pack_length == 1)
+      *pos= (char) tmp;
+    else
+      int2store(pos,tmp);
+    pos+=recinfo[0].length;
+  }
+  else
+  {
+    strnmov(pos,data[n].f0,keyinfo[0].seg[0].length);
+    pos+=recinfo[0].length;
+  }
+  if (recinfo[1].type == FIELD_BLOB)
+  {
+    uint tmp;
+    char *ptr;
+    strnmov(blob_key,data[n].f2,keyinfo[0].seg[0].length);
+    tmp=strlen(blob_key);
+    int4store(pos,tmp);
+    ptr=blob_key;
+    memcpy_fixed(pos+4,&ptr,sizeof(char*));
+    pos+=recinfo[1].length;
+  }
+  else if (recinfo[1].type == FIELD_VARCHAR)
+  {
+    uint tmp;
+    /* -1 is here because pack_length is stored in seg->length */
+    uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1);
+    strnmov(pos+pack_length,data[n].f2,keyinfo[0].seg[0].length);
+    tmp=strlen(pos+pack_length); /* was strlen(pos+1): stored length 0 when pack_length == 2 */
+    if (pack_length == 1)
+      *pos= (char) tmp;
+    else
+      int2store(pos,tmp);
+    pos+=recinfo[1].length;
+  }
+  else
+  {
+    strnmov(pos,data[n].f2,keyinfo[0].seg[0].length);
+    pos+=recinfo[1].length;
+  }
+}
+
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+               char *argument)
+{
+  switch(optid) {
+  case 'v': verbose=1; break;
+  case 's': silent=1; break;
+  case 'F': no_fulltext=1; no_search=1; /* fall through: -F implies -U -- NOTE(review): looks intentional, confirm */
+  case 'U': skip_update=1; break;
+  case 'K': no_keys=no_search=1; break;
+  case 'N': no_search=1; break;
+  case 'S': no_stopwords=1; break;
+  case '#':
+    DBUG_PUSH (argument);
+    break;
+  case 'V':
+  case '?':
+  case 'h':
+    usage();
+    exit(1);
+  }
+  return 0;
+}
+
+/* Read options */
+
+static void get_options(int argc,char *argv[])
+{
+ int ho_error;
+
+ if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option)))
+ exit(ho_error);
+ return;
+} /* get options */
+
+
+static void usage()
+{
+ printf("%s [options]\n", my_progname);
+ my_print_help(my_long_options);
+ my_print_variables(my_long_options);
+}
diff --git a/storage/maria/ma_ft_test1.h b/storage/maria/ma_ft_test1.h
new file mode 100644
index 00000000000..5883c42f5c5
--- /dev/null
+++ b/storage/maria/ma_ft_test1.h
@@ -0,0 +1,420 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+#define NUPD 20
+#define NDATAS 389
+struct { const char *f0, *f2; } data[NDATAS] = {
+ {"1", "General Information about MySQL"},
+ {"1.1", "What is MySQL?"},
+ {"1.2", "About this manual"},
+ {"1.3", "History of MySQL"},
+ {"1.4", "The main features of MySQL"},
+ {"1.5", "General SQL information and tutorials"},
+ {"1.6", "Useful MySQL-related links"},
+ {"1.7", "What are stored procedures and triggers and so on?"},
+ {"2", "MySQL mailing lists and how to ask questions/give error (bug) reports"},
+ {"2.1", "Subscribing to/un-subscribing from the MySQL mailing list"},
+ {"2.2", "Asking questions or reporting bugs"},
+ {"2.3", "I think I have found a bug. What information do you need to help me?"},
+ {"2.3.1", "MySQL keeps crashing"},
+ {"2.4", "Guidelines for answering questions on the mailing list"},
+ {"3", "Licensing or When do I have/want to pay for MySQL?"},
+ {"3.1", "How much does MySQL cost?"},
+ {"3.2", "How do I get commercial support?"},
+ {"3.2.1", "Types of commercial support"},
+ {"3.2.1.1", "Basic email support"},
+ {"3.2.1.2", "Extended email support"},
+/*------------------------------- NUPD=20 -------------------------------*/
+ {"3.2.1.3", "Asking: Login support"},
+ {"3.2.1.4", "Extended login support"},
+ {"3.3", "How do I pay for licenses/support?"},
+ {"3.4", "Who do I contact when I want more information about licensing/support?"},
+ {"3.5", "What Copyright does MySQL use?"},
+ {"3.6", "When may I distribute MySQL commercially without a fee?"},
+ {"3.7", "I want to sell a product that can be configured to use MySQL"},
+ {"3.8", "I am running a commercial web server using MySQL"},
+ {"3.9", "Do I need a license to sell commercial Perl/tcl/PHP/Web+ etc applications?"},
+ {"3.10", "Possible future changes in the licensing"},
+ {"4", "Compiling and installing MySQL"},
+ {"4.1", "How do I get MySQL?"},
+ {"4.2", "Which MySQL version should I use?"},
+ {"4.3", "How/when will you release updates?"},
+ {"4.4", "What operating systems does MySQL support?"},
+ {"4.5", "Compiling MySQL from source code"},
+ {"4.5.1", "Quick installation overview"},
+ {"4.5.2", "Usual configure switches"},
+ {"4.5.3", "Applying a patch"},
+ {"4.6", "Problems compiling?"},
+ {"4.7", "General compilation notes"},
+ {"4.8", "MIT-pthreads notes (FreeBSD)"},
+ {"4.9", "Perl installation comments"},
+ {"4.10", "Special things to consider for some machine/OS combinations"},
+ {"4.10.1", "Solaris notes"},
+ {"4.10.2", "SunOS 4 notes"},
+ {"4.10.3", "Linux notes for all versions"},
+ {"4.10.3.1", "Linux-x86 notes"},
+ {"4.10.3.2", "RedHat 5.0"},
+ {"4.10.3.3", "RedHat 5.1"},
+ {"4.10.3.4", "Linux-Sparc notes"},
+ {"4.10.3.5", "Linux-Alpha notes"},
+ {"4.10.3.6", "MkLinux notes"},
+ {"4.10.4", "Alpha-DEC-Unix notes"},
+ {"4.10.5", "Alpha-DEC-OSF1 notes"},
+ {"4.10.6", "SGI-IRIX notes"},
+ {"4.10.7", "FreeBSD notes"},
+ {"4.10.7.1", "FreeBSD-3.0 notes"},
+ {"4.10.8", "BSD/OS 2.# notes"},
+ {"4.10.8.1", "BSD/OS 3.# notes"},
+ {"4.10.9", "SCO notes"},
+ {"4.10.10", "SCO Unixware 7.0 notes"},
+ {"4.10.11", "IBM-AIX notes"},
+ {"4.10.12", "HP-UX notes"},
+ {"4.11", "TcX binaries"},
+ {"4.12", "Win32 notes"},
+ {"4.13", "Installation instructions for MySQL binary releases"},
+ {"4.13.1", "How to get MySQL Perl support working"},
+ {"4.13.2", "Linux notes"},
+ {"4.13.3", "HP-UX notes"},
+ {"4.13.4", "Linking client libraries"},
+ {"4.14", "Problems running mysql_install_db"},
+ {"4.15", "Problems starting MySQL"},
+ {"4.16", "Automatic start/stop of MySQL"},
+ {"4.17", "Option files"},
+ {"5", "How standards-compatible is MySQL?"},
+ {"5.1", "What extensions has MySQL to ANSI SQL92?"},
+ {"5.2", "What functionality is missing in MySQL?"},
+ {"5.2.1", "Sub-selects"},
+ {"5.2.2", "SELECT INTO TABLE"},
+ {"5.2.3", "Transactions"},
+ {"5.2.4", "Triggers"},
+ {"5.2.5", "Foreign Keys"},
+ {"5.2.5.1", "Some reasons NOT to use FOREIGN KEYS"},
+ {"5.2.6", "Views"},
+ {"5.2.7", "-- as start of a comment"},
+ {"5.3", "What standards does MySQL follow?"},
+ {"5.4", "What functions exist only for compatibility?"},
+ {"5.5", "Limitations of BLOB and TEXT types"},
+ {"5.6", "How to cope without COMMIT-ROLLBACK"},
+ {"6", "The MySQL access privilege system"},
+ {"6.1", "What the privilege system does"},
+ {"6.2", "Connecting to the MySQL server"},
+ {"6.2.1", "Keeping your password secure"},
+ {"6.3", "Privileges provided by MySQL"},
+ {"6.4", "How the privilege system works"},
+ {"6.5", "The privilege tables"},
+ {"6.6", "Setting up the initial MySQL privileges"},
+ {"6.7", "Adding new user privileges to MySQL"},
+ {"6.8", "An example permission setup"},
+ {"6.9", "Causes of Access denied errors"},
+ {"6.10", "How to make MySQL secure against crackers"},
+ {"7", "MySQL language reference"},
+ {"7.1", "Literals: how to write strings and numbers"},
+ {"7.1.1", "Strings"},
+ {"7.1.2", "Numbers"},
+ {"7.1.3", "NULL values"},
+ {"7.1.4", "Database, table, index, column and alias names"},
+ {"7.1.4.1", "Case sensitivity in names"},
+ {"7.2", "Column types"},
+ {"7.2.1", "Column type storage requirements"},
+ {"7.2.5", "Numeric types"},
+ {"7.2.6", "Date and time types"},
+ {"7.2.6.1", "The DATE type"},
+ {"7.2.6.2", "The TIME type"},
+ {"7.2.6.3", "The DATETIME type"},
+ {"7.2.6.4", "The TIMESTAMP type"},
+ {"7.2.6.5", "The YEAR type"},
+ {"7.2.6.6", "Miscellaneous date and time properties"},
+ {"7.2.7", "String types"},
+ {"7.2.7.1", "The CHAR and VARCHAR types"},
+ {"7.2.7.2", "The BLOB and TEXT types"},
+ {"7.2.7.3", "The ENUM type"},
+ {"7.2.7.4", "The SET type"},
+ {"7.2.8", "Choosing the right type for a column"},
+ {"7.2.9", "Column indexes"},
+ {"7.2.10", "Multiple-column indexes"},
+ {"7.2.11", "Using column types from other database engines"},
+ {"7.3", "Functions for use in SELECT and WHERE clauses"},
+ {"7.3.1", "Grouping functions"},
+ {"7.3.2", "Normal arithmetic operations"},
+ {"7.3.3", "Bit functions"},
+ {"7.3.4", "Logical operations"},
+ {"7.3.5", "Comparison operators"},
+ {"7.3.6", "String comparison functions"},
+ {"7.3.7", "Control flow functions"},
+ {"7.3.8", "Mathematical functions"},
+ {"7.3.9", "String functions"},
+ {"7.3.10", "Date and time functions"},
+ {"7.3.11", "Miscellaneous functions"},
+ {"7.3.12", "Functions for use with GROUP BY clauses"},
+ {"7.4", "CREATE DATABASE syntax"},
+ {"7.5", "DROP DATABASE syntax"},
+ {"7.6", "CREATE TABLE syntax"},
+ {"7.7", "ALTER TABLE syntax"},
+ {"7.8", "OPTIMIZE TABLE syntax"},
+ {"7.9", "DROP TABLE syntax"},
+ {"7.10", "DELETE syntax"},
+ {"7.11", "SELECT syntax"},
+ {"7.12", "JOIN syntax"},
+ {"7.13", "INSERT syntax"},
+ {"7.14", "REPLACE syntax"},
+ {"7.15", "LOAD DATA INFILE syntax"},
+ {"7.16", "UPDATE syntax"},
+ {"7.17", "USE syntax"},
+ {"7.18", "SHOW syntax (Get information about tables, columns...)"},
+ {"7.19", "EXPLAIN syntax (Get information about a SELECT)"},
+ {"7.20", "DESCRIBE syntax (Get information about columns)"},
+ {"7.21", "LOCK TABLES/UNLOCK TABLES syntax"},
+ {"7.22", "SET OPTION syntax"},
+ {"7.23", "GRANT syntax (Compatibility function)"},
+ {"7.24", "CREATE INDEX syntax (Compatibility function)"},
+ {"7.25", "DROP INDEX syntax (Compatibility function)"},
+ {"7.26", "Comment syntax"},
+ {"7.27", "CREATE FUNCTION/DROP FUNCTION syntax"},
+ {"7.28", "Is MySQL picky about reserved words?"},
+ {"8", "Example SQL queries"},
+ {"8.1", "Queries from twin project"},
+ {"8.1.1", "Find all non-distributed twins"},
+ {"8.1.2", "Show a table on twin pair status"},
+ {"9", "How safe/stable is MySQL?"},
+ {"9.1", "How stable is MySQL?"},
+ {"9.2", "Why are there is so many releases of MySQL?"},
+ {"9.3", "Checking a table for errors"},
+ {"9.4", "How to repair tables"},
+ {"9.5", "Is there anything special to do when upgrading/downgrading MySQL?"},
+ {"9.5.1", "Upgrading from a 3.21 version to 3.22"},
+ {"9.5.2", "Upgrading from a 3.20 version to 3.21"},
+ {"9.5.3", "Upgrading to another architecture"},
+ {"9.6", "Year 2000 compliance"},
+ {"10", "MySQL Server functions"},
+ {"10.1", "What languages are supported by MySQL?"},
+ {"10.1.1", "Character set used for data &#38; sorting"},
+ {"10.2", "The update log"},
+ {"10.3", "How big can MySQL tables be?"},
+ {"11", "Getting maximum performance from MySQL"},
+ {"11.1", "How does one change the size of MySQL buffers?"},
+ {"11.2", "How compiling and linking affects the speed of MySQL"},
+ {"11.3", "How does MySQL use memory?"},
+ {"11.4", "How does MySQL use indexes?"},
+ {"11.5", "What optimizations are done on WHERE clauses?"},
+ {"11.6", "How does MySQL open &#38; close tables?"},
+ {"11.6.0.1", "What are the drawbacks of creating possibly thousands of tables in a database?"},
+ {"11.7", "How does MySQL lock tables?"},
+ {"11.8", "How should I arrange my table to be as fast/small as possible?"},
+ {"11.9", "What affects the speed of INSERT statements?"},
+ {"11.10", "What affects the speed DELETE statements?"},
+ {"11.11", "How do I get MySQL to run at full speed?"},
+ {"11.12", "What are the different row formats? Or, when should VARCHAR/CHAR be used?"},
+ {"11.13", "Why so many open tables?"},
+ {"12", "MySQL benchmark suite"},
+ {"13", "MySQL Utilites"},
+ {"13.1", "Overview of the different MySQL programs"},
+ {"13.2", "The MySQL table check, optimize and repair program"},
+ {"13.2.1", "isamchk memory use"},
+ {"13.2.2", "Getting low-level table information"},
+ {"13.3", "The MySQL compressed read-only table generator"},
+ {"14", "Adding new functions to MySQL"},
+ {"15", "MySQL ODBC Support"},
+ {"15.1", "Operating systems supported by MyODBC"},
+ {"15.2", "How to report problems with MyODBC"},
+ {"15.3", "Programs known to work with MyODBC"},
+ {"15.4", "How to fill in the various fields in the ODBC administrator program"},
+ {"15.5", "How to get the value of an AUTO_INCREMENT column in ODBC"},
+ {"16", "Problems and common errors"},
+ {"16.1", "Some common errors when using MySQL"},
+ {"16.1.1", "MySQL server has gone away error"},
+ {"16.1.2", "Can't connect to local MySQL server error"},
+ {"16.1.3", "Out of memory error"},
+ {"16.1.4", "Packet too large error"},
+ {"16.1.5", "The table is full error"},
+ {"16.1.6", "Commands out of sync error in client"},
+ {"16.1.7", "Removing user error"},
+ {"16.2", "How MySQL handles a full disk"},
+ {"16.3", "How to run SQL commands from a text file"},
+ {"16.4", "Where MySQL stores temporary files"},
+ {"16.5", "Access denied error"},
+ {"16.6", "How to run MySQL as a normal user"},
+ {"16.7", "Problems with file permissions"},
+ {"16.8", "File not found"},
+ {"16.9", "Problems using DATE columns"},
+ {"16.10", "Case sensitivity in searches"},
+ {"16.11", "Problems with NULL values"},
+ {"17", "Solving some common problems with MySQL"},
+ {"17.1", "Database replication"},
+ {"17.2", "Database backups"},
+ {"18", "MySQL client tools and API's"},
+ {"18.1", "MySQL C API"},
+ {"18.2", "C API datatypes"},
+ {"18.3", "C API function overview"},
+ {"18.4", "C API function descriptions"},
+ {"18.4.1", "mysql_affected_rows()"},
+ {"18.4.2", "mysql_close()"},
+ {"18.4.3", "mysql_connect()"},
+ {"18.4.4", "mysql_create_db()"},
+ {"18.4.5", "mysql_data_seek()"},
+ {"18.4.6", "mysql_debug()"},
+ {"18.4.7", "mysql_drop_db()"},
+ {"18.4.8", "mysql_dump_debug_info()"},
+ {"18.4.9", "mysql_eof()"},
+ {"18.4.10", "mysql_errno()"},
+ {"18.4.11", "mysql_error()"},
+ {"18.4.12", "mysql_escape_string()"},
+ {"18.4.13", "mysql_fetch_field()"},
+ {"18.4.14", "mysql_fetch_fields()"},
+ {"18.4.15", "mysql_fetch_field_direct()"},
+ {"18.4.16", "mysql_fetch_lengths()"},
+ {"18.4.17", "mysql_fetch_row()"},
+ {"18.4.18", "mysql_field_seek()"},
+ {"18.4.19", "mysql_field_tell()"},
+ {"18.4.20", "mysql_free_result()"},
+ {"18.4.21", "mysql_get_client_info()"},
+ {"18.4.22", "mysql_get_host_info()"},
+ {"18.4.23", "mysql_get_proto_info()"},
+ {"18.4.24", "mysql_get_server_info()"},
+ {"18.4.25", "mysql_info()"},
+ {"18.4.26", "mysql_init()"},
+ {"18.4.27", "mysql_insert_id()"},
+ {"18.4.28", "mysql_kill()"},
+ {"18.4.29", "mysql_list_dbs()"},
+ {"18.4.30", "mysql_list_fields()"},
+ {"18.4.31", "mysql_list_processes()"},
+ {"18.4.32", "mysql_list_tables()"},
+ {"18.4.33", "mysql_num_fields()"},
+ {"18.4.34", "mysql_num_rows()"},
+ {"18.4.35", "mysql_query()"},
+ {"18.4.36", "mysql_real_connect()"},
+ {"18.4.37", "mysql_real_query()"},
+ {"18.4.38", "mysql_reload()"},
+ {"18.4.39", "mysql_row_tell()"},
+ {"18.4.40", "mysql_select_db()"},
+ {"18.4.41", "mysql_shutdown()"},
+ {"18.4.42", "mysql_stat()"},
+ {"18.4.43", "mysql_store_result()"},
+ {"18.4.44", "mysql_thread_id()"},
+ {"18.4.45", "mysql_use_result()"},
+ {"18.4.46", "Why is it that after mysql_query() returns success, mysql_store_result() sometimes returns NULL?"},
+ {"18.4.47", "What results can I get from a query?"},
+ {"18.4.48", "How can I get the unique ID for the last inserted row?"},
+ {"18.4.49", "Problems linking with the C API"},
+ {"18.4.50", "How to make a thread-safe client"},
+ {"18.5", "MySQL Perl API's"},
+ {"18.5.1", "DBI with DBD::mysql"},
+ {"18.5.1.1", "The DBI interface"},
+ {"18.5.1.2", "More DBI/DBD information"},
+ {"18.6", "MySQL Java connectivity (JDBC)"},
+ {"18.7", "MySQL PHP API's"},
+ {"18.8", "MySQL C++ API's"},
+ {"18.9", "MySQL Python API's"},
+ {"18.10", "MySQL TCL API's"},
+ {"19", "How MySQL compares to other databases"},
+ {"19.1", "How MySQL compares to mSQL"},
+ {"19.1.1", "How to convert mSQL tools for MySQL"},
+ {"19.1.2", "How mSQL and MySQL client/server communications protocols differ"},
+ {"19.1.3", "How mSQL 2.0 SQL syntax differs from MySQL"},
+ {"19.2", "How MySQL compares to PostgreSQL"},
+ {"A", "Some users of MySQL"},
+ {"B", "Contributed programs"},
+ {"C", "Contributors to MySQL"},
+ {"D", "MySQL change history"},
+ {"19.3", "Changes in release 3.22.x (Alpha version)"},
+ {"19.3.1", "Changes in release 3.22.7"},
+ {"19.3.2", "Changes in release 3.22.6"},
+ {"19.3.3", "Changes in release 3.22.5"},
+ {"19.3.4", "Changes in release 3.22.4"},
+ {"19.3.5", "Changes in release 3.22.3"},
+ {"19.3.6", "Changes in release 3.22.2"},
+ {"19.3.7", "Changes in release 3.22.1"},
+ {"19.3.8", "Changes in release 3.22.0"},
+ {"19.4", "Changes in release 3.21.x"},
+ {"19.4.1", "Changes in release 3.21.33"},
+ {"19.4.2", "Changes in release 3.21.32"},
+ {"19.4.3", "Changes in release 3.21.31"},
+ {"19.4.4", "Changes in release 3.21.30"},
+ {"19.4.5", "Changes in release 3.21.29"},
+ {"19.4.6", "Changes in release 3.21.28"},
+ {"19.4.7", "Changes in release 3.21.27"},
+ {"19.4.8", "Changes in release 3.21.26"},
+ {"19.4.9", "Changes in release 3.21.25"},
+ {"19.4.10", "Changes in release 3.21.24"},
+ {"19.4.11", "Changes in release 3.21.23"},
+ {"19.4.12", "Changes in release 3.21.22"},
+ {"19.4.13", "Changes in release 3.21.21a"},
+ {"19.4.14", "Changes in release 3.21.21"},
+ {"19.4.15", "Changes in release 3.21.20"},
+ {"19.4.16", "Changes in release 3.21.19"},
+ {"19.4.17", "Changes in release 3.21.18"},
+ {"19.4.18", "Changes in release 3.21.17"},
+ {"19.4.19", "Changes in release 3.21.16"},
+ {"19.4.20", "Changes in release 3.21.15"},
+ {"19.4.21", "Changes in release 3.21.14b"},
+ {"19.4.22", "Changes in release 3.21.14a"},
+ {"19.4.23", "Changes in release 3.21.13"},
+ {"19.4.24", "Changes in release 3.21.12"},
+ {"19.4.25", "Changes in release 3.21.11"},
+ {"19.4.26", "Changes in release 3.21.10"},
+ {"19.4.27", "Changes in release 3.21.9"},
+ {"19.4.28", "Changes in release 3.21.8"},
+ {"19.4.29", "Changes in release 3.21.7"},
+ {"19.4.30", "Changes in release 3.21.6"},
+ {"19.4.31", "Changes in release 3.21.5"},
+ {"19.4.32", "Changes in release 3.21.4"},
+ {"19.4.33", "Changes in release 3.21.3"},
+ {"19.4.34", "Changes in release 3.21.2"},
+ {"19.4.35", "Changes in release 3.21.0"},
+ {"19.5", "Changes in release 3.20.x"},
+ {"19.5.1", "Changes in release 3.20.18"},
+ {"19.5.2", "Changes in release 3.20.17"},
+ {"19.5.3", "Changes in release 3.20.16"},
+ {"19.5.4", "Changes in release 3.20.15"},
+ {"19.5.5", "Changes in release 3.20.14"},
+ {"19.5.6", "Changes in release 3.20.13"},
+ {"19.5.7", "Changes in release 3.20.11"},
+ {"19.5.8", "Changes in release 3.20.10"},
+ {"19.5.9", "Changes in release 3.20.9"},
+ {"19.5.10", "Changes in release 3.20.8"},
+ {"19.5.11", "Changes in release 3.20.7"},
+ {"19.5.12", "Changes in release 3.20.6"},
+ {"19.5.13", "Changes in release 3.20.3"},
+ {"19.5.14", "Changes in release 3.20.0"},
+ {"19.6", "Changes in release 3.19.x"},
+ {"19.6.1", "Changes in release 3.19.5"},
+ {"19.6.2", "Changes in release 3.19.4"},
+ {"19.6.3", "Changes in release 3.19.3"},
+ {"E", "Known errors and design deficiencies in MySQL"},
+ {"F", "List of things we want to add to MySQL in the future (The TODO)"},
+ {"19.7", "Things that must done in the real near future"},
+ {"19.8", "Things that have to be done sometime"},
+ {"19.9", "Some things we don't have any plans to do"},
+ {"G", "Comments on porting to other systems"},
+ {"19.10", "Debugging MySQL"},
+ {"19.11", "Comments about RTS threads"},
+ {"19.12", "What is the difference between different thread packages?"},
+ {"H", "Description of MySQL regular expression syntax"},
+ {"I", "What is Unireg?"},
+ {"J", "The MySQL server license"},
+ {"K", "The MySQL license for Microsoft operating systems"},
+ {"*", "SQL command, type and function index"},
+ {"*", "Concept Index"}
+};
+
+#define NQUERIES 5
+const char *query[NQUERIES]={
+ "mysql information and manual",
+ "upgrading from previous version",
+ "column indexes",
+ "against about after more right the with/without", /* stopwords test */
+ "mysql license and copyright"
+};
diff --git a/storage/maria/ma_ft_update.c b/storage/maria/ma_ft_update.c
new file mode 100644
index 00000000000..f38990efab9
--- /dev/null
+++ b/storage/maria/ma_ft_update.c
@@ -0,0 +1,379 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* functions to work with full-text indices */
+
+#include "ma_ftdefs.h"
+#include <math.h>
+
+void _ma_ft_segiterator_init(MARIA_HA *info, uint keynr, const uchar *record,
+ FT_SEG_ITERATOR *ftsi)
+{
+ DBUG_ENTER("_ma_ft_segiterator_init");
+
+ ftsi->num=info->s->keyinfo[keynr].keysegs;
+ ftsi->seg=info->s->keyinfo[keynr].seg;
+ ftsi->rec=record;
+ DBUG_VOID_RETURN;
+}
+
+void _ma_ft_segiterator_dummy_init(const uchar *record, uint len,
+ FT_SEG_ITERATOR *ftsi)
+{
+ DBUG_ENTER("_ma_ft_segiterator_dummy_init");
+
+ ftsi->num=1;
+ ftsi->seg=0;
+ ftsi->pos=record;
+ ftsi->len=len;
+ DBUG_VOID_RETURN;
+}
+
+/*
+ This function breaks convention "return 0 in success"
+ but it's easier to use like this
+
+ while(_ma_ft_segiterator())
+
+ so "1" means "OK", "0" means "EOF"
+*/
+
+uint _ma_ft_segiterator(register FT_SEG_ITERATOR *ftsi)
+{
+ DBUG_ENTER("_ma_ft_segiterator");
+
+ if (!ftsi->num)
+ DBUG_RETURN(0);
+
+ ftsi->num--;
+ if (!ftsi->seg)
+ DBUG_RETURN(1);
+
+ ftsi->seg--;
+
+ if (ftsi->seg->null_bit &&
+ (ftsi->rec[ftsi->seg->null_pos] & ftsi->seg->null_bit))
+ {
+ ftsi->pos=0;
+ DBUG_RETURN(1);
+ }
+ ftsi->pos= ftsi->rec+ftsi->seg->start;
+ if (ftsi->seg->flag & HA_VAR_LENGTH_PART)
+ {
+ uint pack_length= (ftsi->seg->bit_start);
+ ftsi->len= (pack_length == 1 ? (uint) * ftsi->pos :
+ uint2korr(ftsi->pos));
+ ftsi->pos+= pack_length; /* Skip VARCHAR length */
+ DBUG_RETURN(1);
+ }
+ if (ftsi->seg->flag & HA_BLOB_PART)
+ {
+ ftsi->len= _ma_calc_blob_length(ftsi->seg->bit_start,ftsi->pos);
+ memcpy_fixed((char*) &ftsi->pos, ftsi->pos+ftsi->seg->bit_start,
+ sizeof(char*));
+ DBUG_RETURN(1);
+ }
+ ftsi->len=ftsi->seg->length;
+ DBUG_RETURN(1);
+}
+
+
+/* parses a document i.e. calls maria_ft_parse for every keyseg */
+
+uint _ma_ft_parse(TREE *parsed, MARIA_HA *info, uint keynr, const uchar *record,
+ MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
+{
+ FT_SEG_ITERATOR ftsi;
+ struct st_mysql_ftparser *parser;
+ DBUG_ENTER("_ma_ft_parse");
+
+ _ma_ft_segiterator_init(info, keynr, record, &ftsi);
+
+ maria_ft_parse_init(parsed, info->s->keyinfo[keynr].seg->charset);
+ parser= info->s->keyinfo[keynr].parser;
+ while (_ma_ft_segiterator(&ftsi))
+ {
+ /** @todo this casts ftsi.pos (const) to non-const */
+ if (ftsi.pos)
+ if (maria_ft_parse(parsed, (uchar *)ftsi.pos, ftsi.len, parser, param,
+ mem_root))
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+FT_WORD * _ma_ft_parserecord(MARIA_HA *info, uint keynr, const uchar *record,
+ MEM_ROOT *mem_root)
+{
+ TREE ptree;
+ MYSQL_FTPARSER_PARAM *param;
+ DBUG_ENTER("_ma_ft_parserecord");
+ if (! (param= maria_ftparser_call_initializer(info, keynr, 0)))
+ DBUG_RETURN(NULL);
+ bzero((char*) &ptree, sizeof(ptree));
+ param->flags= 0;
+ if (_ma_ft_parse(&ptree, info, keynr, record, param, mem_root))
+ DBUG_RETURN(NULL);
+
+ DBUG_RETURN(maria_ft_linearize(&ptree, mem_root));
+}
+
+static int _ma_ft_store(MARIA_HA *info, uint keynr, uchar *keybuf,
+ FT_WORD *wlist, my_off_t filepos)
+{
+ DBUG_ENTER("_ma_ft_store");
+
+ for (; wlist->pos; wlist++)
+ {
+ MARIA_KEY key;
+ _ma_ft_make_key(info, &key, keynr, keybuf, wlist, filepos);
+ if (_ma_ck_write(info, &key))
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+static int _ma_ft_erase(MARIA_HA *info, uint keynr, uchar *keybuf,
+ FT_WORD *wlist, my_off_t filepos)
+{
+ uint err=0;
+ DBUG_ENTER("_ma_ft_erase");
+
+ for (; wlist->pos; wlist++)
+ {
+ MARIA_KEY key;
+ _ma_ft_make_key(info, &key, keynr, keybuf, wlist, filepos);
+ if (_ma_ck_delete(info, &key))
+ err=1;
+ }
+ DBUG_RETURN(err);
+}
+
+/*
+ Compares an appropriate parts of two WORD_KEY keys directly out of records
+ returns 1 if they are different
+*/
+
+#define THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT 1
+#define GEE_THEY_ARE_ABSOLUTELY_IDENTICAL 0
+
+int _ma_ft_cmp(MARIA_HA *info, uint keynr, const uchar *rec1, const uchar *rec2)
+{
+ FT_SEG_ITERATOR ftsi1, ftsi2;
+ CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
+ DBUG_ENTER("_ma_ft_cmp");
+
+ _ma_ft_segiterator_init(info, keynr, rec1, &ftsi1);
+ _ma_ft_segiterator_init(info, keynr, rec2, &ftsi2);
+
+ while (_ma_ft_segiterator(&ftsi1) && _ma_ft_segiterator(&ftsi2))
+ {
+ if ((ftsi1.pos != ftsi2.pos) &&
+ (!ftsi1.pos || !ftsi2.pos ||
+ ha_compare_text(cs, ftsi1.pos,ftsi1.len,
+ ftsi2.pos,ftsi2.len,0,0)))
+ DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT);
+ }
+ DBUG_RETURN(GEE_THEY_ARE_ABSOLUTELY_IDENTICAL);
+}
+
+
+/* update a document entry */
+
+int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf,
+ const uchar *oldrec, const uchar *newrec, my_off_t pos)
+{
+ int error= -1;
+ FT_WORD *oldlist,*newlist, *old_word, *new_word;
+ CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
+ int cmp, cmp2;
+ DBUG_ENTER("_ma_ft_update");
+
+ if (!(old_word=oldlist=_ma_ft_parserecord(info, keynr, oldrec,
+ &info->ft_memroot)) ||
+ !(new_word=newlist=_ma_ft_parserecord(info, keynr, newrec,
+ &info->ft_memroot)))
+ goto err;
+
+ error=0;
+ while(old_word->pos && new_word->pos)
+ {
+ cmp= ha_compare_text(cs, (uchar*) old_word->pos,old_word->len,
+ (uchar*) new_word->pos,new_word->len,0,0);
+ cmp2= cmp ? 0 : (fabs(old_word->weight - new_word->weight) > 1.e-5);
+
+ if (cmp < 0 || cmp2)
+ {
+ MARIA_KEY key;
+ _ma_ft_make_key(info, &key, keynr, keybuf, old_word, pos);
+ if (_ma_ck_delete(info, &key))
+ {
+ error= -1;
+ goto err;
+ }
+ }
+ if (cmp > 0 || cmp2)
+ {
+ MARIA_KEY key;
+ _ma_ft_make_key(info, &key, keynr, keybuf, new_word,pos);
+ if ((error= _ma_ck_write(info, &key)))
+ goto err;
+ }
+ if (cmp<=0) old_word++;
+ if (cmp>=0) new_word++;
+ }
+ if (old_word->pos)
+ error= _ma_ft_erase(info,keynr,keybuf,old_word,pos);
+ else if (new_word->pos)
+ error= _ma_ft_store(info,keynr,keybuf,new_word,pos);
+
+err:
+ free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE));
+ DBUG_RETURN(error);
+}
+
+
+/* adds a document to the collection */
+
+int _ma_ft_add(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record,
+ my_off_t pos)
+{
+ int error= -1;
+ FT_WORD *wlist;
+ DBUG_ENTER("_ma_ft_add");
+ DBUG_PRINT("enter",("keynr: %d",keynr));
+
+ if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot)))
+ error= _ma_ft_store(info,keynr,keybuf,wlist,pos);
+ free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE));
+ DBUG_PRINT("exit",("Return: %d",error));
+ DBUG_RETURN(error);
+}
+
+
+/* removes a document from the collection */
+
+int _ma_ft_del(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record,
+ my_off_t pos)
+{
+ int error= -1;
+ FT_WORD *wlist;
+ DBUG_ENTER("_ma_ft_del");
+ DBUG_PRINT("enter",("keynr: %d",keynr));
+
+ if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot)))
+ error= _ma_ft_erase(info,keynr,keybuf,wlist,pos);
+ free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE));
+ DBUG_PRINT("exit",("Return: %d",error));
+ DBUG_RETURN(error);
+}
+
+
+MARIA_KEY *_ma_ft_make_key(MARIA_HA *info, MARIA_KEY *key, uint keynr,
+ uchar *keybuf,
+ FT_WORD *wptr, my_off_t filepos)
+{
+ uchar buf[HA_FT_MAXBYTELEN+16];
+ DBUG_ENTER("_ma_ft_make_key");
+
+#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
+ {
+ float weight=(float) ((filepos==HA_OFFSET_ERROR) ? 0 : wptr->weight);
+ mi_float4store(buf,weight);
+ }
+#else
+#error
+#endif
+
+ int2store(buf+HA_FT_WLEN,wptr->len);
+ memcpy(buf+HA_FT_WLEN+2,wptr->pos,wptr->len);
+ /* Can't be spatial so it's ok to call _ma_make_key directly here */
+ DBUG_RETURN(_ma_make_key(info, key, keynr, keybuf, buf, filepos, 0));
+}
+
+
+/*
+ convert key value to ft2
+*/
+
+my_bool _ma_ft_convert_to_ft2(MARIA_HA *info, MARIA_KEY *key)
+{
+ MARIA_SHARE *share= info->s;
+ my_off_t root;
+ DYNAMIC_ARRAY *da=info->ft1_to_ft2;
+ MARIA_KEYDEF *keyinfo=&share->ft2_keyinfo;
+ uchar *key_ptr= (uchar*) dynamic_array_ptr(da, 0), *end;
+ uint length, key_length;
+ MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+ MARIA_KEY tmp_key;
+ MARIA_PAGE page;
+ DBUG_ENTER("_ma_ft_convert_to_ft2");
+
+ /* we'll generate one pageful at once, and insert the rest one-by-one */
+ /* calculating the length of this page ...*/
+ length=(keyinfo->block_length-2) / keyinfo->keylength;
+ set_if_smaller(length, da->elements);
+ length=length * keyinfo->keylength;
+
+ get_key_full_length_rdonly(key_length, key->data);
+ while (_ma_ck_delete(info, key) == 0)
+ {
+ /*
+ nothing to do here.
+ _ma_ck_delete() will populate info->ft1_to_ft2 with deleted keys
+ */
+ }
+
+ /* creating pageful of keys */
+ bzero(info->buff, share->keypage_header);
+ _ma_store_keynr(share, info->buff, keyinfo->key_nr);
+ _ma_store_page_used(share, info->buff, length + share->keypage_header);
+ memcpy(info->buff + share->keypage_header, key_ptr, length);
+ info->keyread_buff_used= info->page_changed=1; /* info->buff is used */
+ /**
+ @todo RECOVERY BUG this is not logged yet. Ok as this code is never
+ called, but soon it will be.
+ */
+ if ((root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == HA_OFFSET_ERROR)
+ DBUG_RETURN(1);
+
+ _ma_page_setup(&page, info, keyinfo, root, info->buff);
+ if (_ma_write_keypage(&page, page_link->write_lock, DFLT_INIT_HITS))
+ DBUG_RETURN(1);
+
+ /* inserting the rest of key values */
+ end= (uchar*) dynamic_array_ptr(da, da->elements);
+ tmp_key.keyinfo= keyinfo;
+ tmp_key.data_length= keyinfo->keylength;
+ tmp_key.ref_length= 0;
+ tmp_key.flag= 0;
+ for (key_ptr+=length; key_ptr < end; key_ptr+=keyinfo->keylength)
+ {
+ tmp_key.data= key_ptr;
+    if (_ma_ck_real_write_btree(info, &tmp_key, &root, SEARCH_SAME))
+ DBUG_RETURN(1);
+ }
+
+ /* now, writing the word key entry */
+ ft_intXstore(key->data + key_length, - (int) da->elements);
+ _ma_dpointer(share, key->data + key_length + HA_FT_WLEN, root);
+
+ DBUG_RETURN(_ma_ck_real_write_btree(info, key,
+ &share->state.key_root[key->keyinfo->
+ key_nr],
+ SEARCH_SAME));
+}
diff --git a/storage/maria/ma_ftdefs.h b/storage/maria/ma_ftdefs.h
new file mode 100644
index 00000000000..4ce4e9e22ba
--- /dev/null
+++ b/storage/maria/ma_ftdefs.h
@@ -0,0 +1,156 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* some definitions for full-text indices */
+
+#include "ma_fulltext.h"
+#include <m_ctype.h>
+#include <my_tree.h>
+#include <queues.h>
+#include <mysql/plugin.h>
+
+#define true_word_char(ctype, character) \
+ ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \
+ (character) == '_')
+#define misc_word_char(X) 0
+
+#define FT_MAX_WORD_LEN_FOR_SORT 31
+
+#define FTPARSER_MEMROOT_ALLOC_SIZE 65536
+
+#define COMPILE_STOPWORDS_IN
+
+/* Interested readers may consult SMART
+ (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z)
+ for an excellent implementation of vector space model we use.
+ It also demonstrates the usage of different weighting techniques.
+ This code, though, is completely original and is not based on the
+ SMART code but was in some cases inspired by it.
+
+ NORM_PIVOT was taken from the article
+ A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization",
+ ACM SIGIR'96, 21-29, 1996
+ */
+
+#define LWS_FOR_QUERY LWS_TF
+#define LWS_IN_USE LWS_LOG
+#define PRENORM_IN_USE PRENORM_AVG
+#define NORM_IN_USE NORM_PIVOT
+#define GWS_IN_USE GWS_PROB
+/*==============================================================*/
+#define LWS_TF (count)
+#define LWS_BINARY (count>0)
+#define LWS_SQUARE (count*count)
+#define LWS_LOG (count?(log( (double) count)+1):0)
+/*--------------------------------------------------------------*/
+#define PRENORM_NONE (p->weight)
+#define PRENORM_MAX (p->weight/docstat.max)
+#define PRENORM_AUG (0.4+0.6*p->weight/docstat.max)
+#define PRENORM_AVG (p->weight/docstat.sum*docstat.uniq)
+#define PRENORM_AVGLOG ((1+log(p->weight))/(1+log(docstat.sum/docstat.uniq)))
+/*--------------------------------------------------------------*/
+#define NORM_NONE (1)
+#define NORM_SUM (docstat.nsum)
+#define NORM_COS (sqrt(docstat.nsum2))
+
+#define PIVOT_VAL (0.0115)
+#define NORM_PIVOT (1+PIVOT_VAL*docstat.uniq)
+/*---------------------------------------------------------------*/
+#define GWS_NORM (1/sqrt(sum2))
+#define GWS_GFIDF (sum/doc_cnt)
+/* Mysterious, but w/o (double) GWS_IDF performs better :-o */
+#define GWS_IDF log(aio->info->state->records/doc_cnt)
+#define GWS_IDF1 log((double)aio->info->state->records/doc_cnt)
+#define GWS_PROB ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 )
+#define GWS_FREQ (1.0/doc_cnt)
+#define GWS_SQUARED pow(log((double)aio->info->state->records/doc_cnt),2)
+#define GWS_CUBIC pow(log((double)aio->info->state->records/doc_cnt),3)
+#define GWS_ENTROPY (1-(suml/sum-log(sum))/log(aio->info->state->records))
+/*=================================================================*/
+
+/* Boolean search operators */
+#define FTB_YES (ft_boolean_syntax[0])
+#define FTB_EGAL (ft_boolean_syntax[1])
+#define FTB_NO (ft_boolean_syntax[2])
+#define FTB_INC (ft_boolean_syntax[3])
+#define FTB_DEC (ft_boolean_syntax[4])
+#define FTB_LBR (ft_boolean_syntax[5])
+#define FTB_RBR (ft_boolean_syntax[6])
+#define FTB_NEG (ft_boolean_syntax[7])
+#define FTB_TRUNC (ft_boolean_syntax[8])
+#define FTB_LQUOT (ft_boolean_syntax[10])
+#define FTB_RQUOT (ft_boolean_syntax[11])
+
+typedef struct st_maria_ft_word {
+ const uchar * pos;
+ uint len;
+ double weight;
+} FT_WORD;
+
+int is_stopword(char *word, uint len);
+
+MARIA_KEY *_ma_ft_make_key(MARIA_HA *, MARIA_KEY *, uint , uchar *, FT_WORD *,
+ my_off_t);
+
+uchar maria_ft_get_word(CHARSET_INFO *, const uchar **, const uchar *,
+ FT_WORD *, MYSQL_FTPARSER_BOOLEAN_INFO *);
+uchar maria_ft_simple_get_word(CHARSET_INFO *, const uchar **, const uchar *,
+ FT_WORD *, my_bool);
+
+typedef struct _st_maria_ft_seg_iterator {
+ uint num, len;
+ HA_KEYSEG *seg;
+ const uchar *rec, *pos;
+} FT_SEG_ITERATOR;
+
+void _ma_ft_segiterator_init(MARIA_HA *, uint, const uchar *, FT_SEG_ITERATOR *);
+void _ma_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *);
+uint _ma_ft_segiterator(FT_SEG_ITERATOR *);
+
+void maria_ft_parse_init(TREE *, CHARSET_INFO *);
+int maria_ft_parse(TREE *, uchar *, size_t, struct st_mysql_ftparser *parser,
+ MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
+FT_WORD * maria_ft_linearize(TREE *, MEM_ROOT *);
+FT_WORD * _ma_ft_parserecord(MARIA_HA *, uint, const uchar *, MEM_ROOT *);
+uint _ma_ft_parse(TREE *, MARIA_HA *, uint, const uchar *,
+ MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
+
+FT_INFO *maria_ft_init_nlq_search(MARIA_HA *, uint, uchar *, size_t, uint,
+ uchar *);
+FT_INFO *maria_ft_init_boolean_search(MARIA_HA *, uint, uchar *, size_t,
+ CHARSET_INFO *);
+
+extern const struct _ft_vft _ma_ft_vft_nlq;
+int maria_ft_nlq_read_next(FT_INFO *, char *);
+float maria_ft_nlq_find_relevance(FT_INFO *, uchar *, uint);
+void maria_ft_nlq_close_search(FT_INFO *);
+float maria_ft_nlq_get_relevance(FT_INFO *);
+my_off_t maria_ft_nlq_get_docid(FT_INFO *);
+void maria_ft_nlq_reinit_search(FT_INFO *);
+
+extern const struct _ft_vft _ma_ft_vft_boolean;
+int maria_ft_boolean_read_next(FT_INFO *, char *);
+float maria_ft_boolean_find_relevance(FT_INFO *, uchar *, uint);
+void maria_ft_boolean_close_search(FT_INFO *);
+float maria_ft_boolean_get_relevance(FT_INFO *);
+my_off_t maria_ft_boolean_get_docid(FT_INFO *);
+void maria_ft_boolean_reinit_search(FT_INFO *);
+MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info);
+extern MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
+ uint keynr,
+ uint paramnr);
+extern void maria_ftparser_call_deinitializer(MARIA_HA *info);
diff --git a/storage/maria/ma_fulltext.h b/storage/maria/ma_fulltext.h
new file mode 100644
index 00000000000..6e087990bd2
--- /dev/null
+++ b/storage/maria/ma_fulltext.h
@@ -0,0 +1,27 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* some definitions for full-text indices */
+
+#include "maria_def.h"
+#include "ft_global.h"
+
+int _ma_ft_cmp(MARIA_HA *, uint, const uchar *, const uchar *);
+int _ma_ft_add(MARIA_HA *, uint, uchar *, const uchar *, my_off_t);
+int _ma_ft_del(MARIA_HA *, uint, uchar *, const uchar *, my_off_t);
+
+my_bool _ma_ft_convert_to_ft2(MARIA_HA *, MARIA_KEY *);
diff --git a/storage/maria/ma_info.c b/storage/maria/ma_info.c
new file mode 100644
index 00000000000..1bbfa3cbf7e
--- /dev/null
+++ b/storage/maria/ma_info.c
@@ -0,0 +1,142 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Return useful base information for an open table */
+
+#include "maria_def.h"
+#ifdef __WIN__
+#include <sys/stat.h>
+#endif
+
+ /* Get position to last record */
+
+MARIA_RECORD_POS maria_position(MARIA_HA *info)
+{
+ return info->cur_row.lastpos;
+}
+
+
+/* Get information about the table */
+/* If flag == HA_STATUS_POS, only return current position info (no sync from database) */
+
+int maria_status(MARIA_HA *info, register MARIA_INFO *x, uint flag)
+{
+ MY_STAT state;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("maria_status");
+
+ x->recpos= info->cur_row.lastpos;
+ if (flag == HA_STATUS_POS)
+ DBUG_RETURN(0); /* Compatible with ISAM */
+ if (!(flag & HA_STATUS_NO_LOCK))
+ {
+ pthread_mutex_lock(&share->intern_lock);
+ VOID(_ma_readinfo(info,F_RDLCK,0));
+ fast_ma_writeinfo(info);
+ pthread_mutex_unlock(&share->intern_lock);
+ }
+ if (flag & HA_STATUS_VARIABLE)
+ {
+ x->records = info->state->records;
+ x->deleted = share->state.state.del;
+ x->delete_length = share->state.state.empty;
+ x->data_file_length = share->state.state.data_file_length;
+ x->index_file_length= share->state.state.key_file_length;
+
+ x->keys = share->state.header.keys;
+ x->check_time = share->state.check_time;
+ x->mean_reclength = x->records ?
+ (ulong) ((x->data_file_length - x->delete_length) /x->records) :
+ (ulong) share->min_pack_length;
+ }
+ if (flag & HA_STATUS_ERRKEY)
+ {
+ x->errkey= info->errkey;
+ x->dup_key_pos= info->dup_key_pos;
+ }
+ if (flag & HA_STATUS_CONST)
+ {
+ x->reclength = share->base.reclength;
+ x->max_data_file_length=share->base.max_data_file_length;
+ x->max_index_file_length=info->s->base.max_key_file_length;
+ x->filenr = info->dfile.file;
+ x->options = share->options;
+ x->create_time=share->state.create_time;
+ x->reflength= maria_get_pointer_length(share->base.max_data_file_length,
+ maria_data_pointer_size);
+ x->record_offset= (info->s->data_file_type == STATIC_RECORD ?
+ share->base.pack_reclength: 0);
+ x->sortkey= -1; /* No clustering */
+ x->rec_per_key = share->state.rec_per_key_part;
+ x->key_map = share->state.key_map;
+ x->data_file_name = share->data_file_name.str;
+ x->index_file_name = share->index_file_name.str;
+ x->data_file_type = share->data_file_type;
+ }
+ if ((flag & HA_STATUS_TIME) && !my_fstat(info->dfile.file, &state, MYF(0)))
+ x->update_time=state.st_mtime;
+ else
+ x->update_time=0;
+ if (flag & HA_STATUS_AUTO)
+ {
+ x->auto_increment= share->state.auto_increment+1;
+ if (!x->auto_increment) /* This shouldn't happen */
+ x->auto_increment= ~(ulonglong) 0;
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Write a message to the error log.
+
+ SYNOPSIS
+ _ma_report_error()
+ file_name Name of table file (e.g. index_file_name).
+ errcode Error number.
+
+ DESCRIPTION
+ This function supplies my_error() with a table name. Most error
+ messages need one. Since string arguments in error messages are limited
+ to 64 characters by convention, we ensure that in case of truncation,
+ that the end of the index file path is in the message. This contains
+ the most valuable information (the table name and the database name).
+
+ RETURN
+ void
+*/
+
+void _ma_report_error(int errcode, const LEX_STRING *name)
+{
+ size_t length;
+ const char *file_name= name->str;
+ DBUG_ENTER("_ma_report_error");
+ DBUG_PRINT("enter",("errcode %d, table '%s'", errcode, file_name));
+
+ if ((length= name->length) > 64)
+ {
+ /* we first remove the directory */
+ size_t dir_length= dirname_length(file_name);
+ file_name+= dir_length;
+ if ((length-= dir_length) > 64)
+ {
+ /* still too long, chop start of table name */
+ file_name+= length - 64;
+ }
+ }
+
+ my_error(errcode, MYF(ME_NOREFRESH), file_name);
+ DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_init.c b/storage/maria/ma_init.c
new file mode 100644
index 00000000000..902f06d93e5
--- /dev/null
+++ b/storage/maria/ma_init.c
@@ -0,0 +1,184 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Initialize a maria-database */
+
+#include "maria_def.h"
+#include <ft_global.h>
+#include "ma_blockrec.h"
+#include "trnman_public.h"
+#include "ma_checkpoint.h"
+#include <hash.h>
+
+void history_state_free(MARIA_STATE_HISTORY_CLOSED *closed_history)
+{
+ MARIA_STATE_HISTORY *history, *next;
+
+ /*
+ Free all active history
+ In case of maria_open() this list should be empty as the history is moved
+ to handler->share.
+ */
+ for (history= closed_history->state_history; history ; history= next)
+ {
+ next= history->next;
+ my_free(history, MYF(0));
+ }
+ my_free(closed_history, MYF(0));
+}
+
+
+static int dummy_maria_create_trn_hook(MARIA_HA *info __attribute__((unused)))
+{
+ return 0;
+}
+
+/*
+ Initialize maria
+
+ SYNOPSIS
+ maria_init()
+
+ TODO
+ Open log files and do recovery if need
+
+ RETURN
+ 0 ok
+ # error number
+*/
+
+int maria_init(void)
+{
+ DBUG_ASSERT(maria_block_size &&
+ maria_block_size % MARIA_MIN_KEY_BLOCK_LENGTH == 0);
+ if (!maria_inited)
+ {
+ maria_inited= TRUE;
+ pthread_mutex_init(&THR_LOCK_maria,MY_MUTEX_INIT_SLOW);
+ _ma_init_block_record_data();
+ trnman_end_trans_hook= _ma_trnman_end_trans_hook;
+ maria_create_trn_hook= dummy_maria_create_trn_hook;
+ my_handler_error_register();
+ }
+ hash_init(&maria_stored_state, &my_charset_bin, 32,
+ 0, sizeof(LSN), 0, (hash_free_key) history_state_free, 0);
+ DBUG_PRINT("info",("dummy_transaction_object: %p",
+ &dummy_transaction_object));
+ return 0;
+}
+
+
+void maria_end(void)
+{
+ if (maria_inited)
+ {
+ TrID trid;
+ maria_inited= maria_multi_threaded= FALSE;
+ ft_free_stopwords();
+ ma_checkpoint_end();
+ if (translog_status == TRANSLOG_OK)
+ {
+ translog_soft_sync_end();
+ translog_sync();
+ }
+ if ((trid= trnman_get_max_trid()) > max_trid_in_control_file)
+ {
+ /*
+ Store max transaction id into control file, in case logs are removed
+ by user, or maria_chk wants to check tables (it cannot access max trid
+ from the log, as it cannot process REDOs).
+ */
+ (void)ma_control_file_write_and_force(last_checkpoint_lsn, last_logno,
+ trid, recovery_failures);
+ }
+ trnman_destroy();
+ if (translog_status == TRANSLOG_OK)
+ translog_destroy();
+ end_pagecache(maria_log_pagecache, TRUE);
+ end_pagecache(maria_pagecache, TRUE);
+ ma_control_file_end();
+ pthread_mutex_destroy(&THR_LOCK_maria);
+ hash_free(&maria_stored_state);
+ }
+}
+
+/**
+ Upgrade from older Aria versions:
+
+ - In MariaDB 5.1, the name of the control file and log files had the
+ 'maria' prefix, now they have the 'aria' prefix.
+
+ @return: 0 ok
+ 1 error
+
+*/
+
+my_bool maria_upgrade()
+{
+ char name[FN_REFLEN], new_name[FN_REFLEN];
+ DBUG_ENTER("maria_upgrade");
+
+ fn_format(name, "maria_log_control", maria_data_root, "", MYF(MY_WME));
+
+ if (!my_access(name,F_OK))
+ {
+ /*
+ Old style control file found; Rename the control file and the log files.
+ We start by renaming all log files, so that if we get a crash
+ we will continue from where we left.
+ */
+ uint i;
+ MY_DIR *dir= my_dir(maria_data_root, MYF(MY_WME));
+ if (!dir)
+ DBUG_RETURN(1);
+
+ my_message(HA_ERR_INITIALIZATION,
+ "Found old style Maria log files; "
+ "Converting them to Aria names",
+ MYF(ME_JUST_INFO));
+
+ for (i= 0; i < dir->number_off_files; i++)
+ {
+ const char *file= dir->dir_entry[i].name;
+ if (strncmp(file, "maria_log.", 10) == 0 &&
+ file[10] >= '0' && file[10] <= '9' &&
+ file[11] >= '0' && file[11] <= '9' &&
+ file[12] >= '0' && file[12] <= '9' &&
+ file[13] >= '0' && file[13] <= '9' &&
+ file[14] >= '0' && file[14] <= '9' &&
+ file[15] >= '0' && file[15] <= '9' &&
+ file[16] >= '0' && file[16] <= '9' &&
+ file[17] >= '0' && file[17] <= '9' &&
+ file[18] == '\0')
+ {
+ /* Remove the 'm' in 'maria' */
+ char old_logname[FN_REFLEN], new_logname[FN_REFLEN];
+ fn_format(old_logname, file, maria_data_root, "", MYF(0));
+ fn_format(new_logname, file+1, maria_data_root, "", MYF(0));
+ if (my_rename(old_logname, new_logname, MYF(MY_WME)))
+ {
+ my_dirend(dir);
+ DBUG_RETURN(1);
+ }
+ }
+ }
+ my_dirend(dir);
+
+ fn_format(new_name, CONTROL_FILE_BASE_NAME, maria_data_root, "", MYF(0));
+ if (my_rename(name, new_name, MYF(MY_WME)))
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
diff --git a/storage/maria/ma_key.c b/storage/maria/ma_key.c
new file mode 100644
index 00000000000..ac23bf5fef6
--- /dev/null
+++ b/storage/maria/ma_key.c
@@ -0,0 +1,775 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Functions to handle keys */
+
+#include "maria_def.h"
+#include "m_ctype.h"
+#include "ma_sp_defs.h"
+#include "ma_blockrec.h" /* For ROW_FLAG_TRANSID */
+#include "trnman.h"
+#ifdef HAVE_IEEEFP_H
+#include <ieeefp.h>
+#endif
+
+#define CHECK_KEYS /* Enable safety checks */
+
+static int _ma_put_key_in_record(MARIA_HA *info, uint keynr,
+ my_bool unpack_blobs, uchar *record);
+
+#define FIX_LENGTH(cs, pos, length, char_length) \
+ do { \
+ if (length > char_length) \
+ char_length= (uint) my_charpos(cs, pos, pos+length, char_length); \
+ set_if_smaller(char_length,length); \
+ } while(0)
+
+
+/**
+ Store trid in a packed format as part of a key
+
+ @fn transid_store_packed
+ @param info Maria handler
+ @param to End of key to which we should store a packed transid
+ @param trid Trid to be stored
+
+ @notes
+
+ Keys that have a transid has the lowest bit set for the last byte of the key
+ This function sets this bit for the key.
+
+ Trid is max 6 bytes long
+
+ First Trid it's converted to a smaller number by using
+ trid= trid - create_trid.
+ Then trid is then shifted up one bit so that we can use the
+ lowest bit as a marker if it's followed by another trid.
+
+ Trid is then stored as follows:
+
+ if trid < 256-12
+ one byte
+ else
+ one byte prefix length_of_trid_in_bytes + 249 followed by data
+ in high-byte-first order
+
+ Prefix bytes 244 to 249 are reserved for negative transid, that can be used
+ when we pack transid relative to each other on a key block.
+
+ We have to store transid in high-byte-first order so that we can compare
+ them unpacked byte per byte and as soon we find a difference we know
+ which is smaller.
+
+ For example, assuming we have the following data:
+
+ key_data: 1 (4 byte integer)
+ pointer_to_row: 2 << 8 + 3 = 515 (page 2, row 3)
+ table_create_transid 1000 Defined at create table time and
+ stored in table definition
+ transid 1010 Transaction that created row
+ delete_transid 2011 Transaction that deleted row
+
+ In addition we assume the table is created with a data pointer length
+ of 4 bytes (this is automatically calculated based on the medium
+ length of rows and the given max number of rows)
+
+ The binary data for the key would then look like this in hex:
+
+ 00 00 00 01 Key data (1 stored high byte first)
+ 00 00 00 47 (515 << 1) + 1 ; The last 1 is marker that key cont.
+ 15 ((1010-1000) << 1) + 1 ; The last 1 is marker that key cont.
+ FB 07 E6 Length byte (FB = 249 + 2 means 2 bytes) and
+ ((2011 - 1000) << 1) = 07 E6
+*/
+
+uint transid_store_packed(MARIA_HA *info, uchar *to, ulonglong trid)
+{
+ uchar *start;
+ uint length;
+ uchar buff[8];
+ DBUG_ASSERT(trid < (LL(1) << (MARIA_MAX_PACK_TRANSID_SIZE*8)));
+ DBUG_ASSERT(trid >= info->s->state.create_trid);
+
+ trid= (trid - info->s->state.create_trid) << 1;
+
+ /* Mark that key contains transid */
+ to[-1]|= 1;
+
+ if (trid < MARIA_MIN_TRANSID_PACK_OFFSET)
+ {
+ to[0]= (uchar) trid;
+ return 1;
+ }
+ start= to;
+
+ /* store things in low-byte-first-order in buff */
+ to= buff;
+ do
+ {
+ *to++= (uchar) trid;
+ trid= trid>>8;
+ } while (trid);
+
+ length= (uint) (to - buff);
+ /* Store length prefix */
+ start[0]= (uchar) (length + MARIA_TRANSID_PACK_OFFSET);
+ start++;
+ /* Copy things in high-byte-first order to output buffer */
+ do
+ {
+ *start++= *--to;
+ } while (to != buff);
+ return length+1;
+}
+
+
+/**
+ Read packed transid
+
+ @fn transid_get_packed
+ @param info Maria handler
+ @param from Pointer to the packed transid to read
+
+ See transid_store_packed() for how transid is packed
+
+*/
+
+ulonglong transid_get_packed(MARIA_SHARE *share, const uchar *from)
+{
+ ulonglong value;
+ uint length;
+
+ if (from[0] < MARIA_MIN_TRANSID_PACK_OFFSET)
+ value= (ulonglong) from[0];
+ else
+ {
+ value= 0;
+ for (length= (uint) (from[0] - MARIA_TRANSID_PACK_OFFSET),
+ value= (ulonglong) from[1], from+=2;
+ --length ;
+ from++)
+ value= (value << 8) + ((ulonglong) *from);
+ }
+ return (value >> 1) + share->state.create_trid;
+}
+
+
+/*
+ Make a normal (not spatial or fulltext) intern key from a record
+
+ SYNOPSIS
+ _ma_make_key()
+ info Maria handler
+ int_key Store created key here
+ keynr key number
+ key Buffer used to store key data
+ record Record
+ filepos Position to record in the data file
+
+ NOTES
+ This is used to generate keys from the record on insert, update and delete
+
+ RETURN
+ key
+*/
+
+MARIA_KEY *_ma_make_key(MARIA_HA *info, MARIA_KEY *int_key, uint keynr,
+ uchar *key, const uchar *record,
+ MARIA_RECORD_POS filepos, ulonglong trid)
+{
+ const uchar *pos;
+ reg1 HA_KEYSEG *keyseg;
+ my_bool is_ft;
+ DBUG_ENTER("_ma_make_key");
+
+ int_key->data= key;
+ int_key->flag= 0; /* Always return full key */
+ int_key->keyinfo= info->s->keyinfo + keynr;
+
+ is_ft= int_key->keyinfo->flag & HA_FULLTEXT;
+ for (keyseg= int_key->keyinfo->seg ; keyseg->type ;keyseg++)
+ {
+ enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type;
+ uint length=keyseg->length;
+ uint char_length;
+ CHARSET_INFO *cs=keyseg->charset;
+
+ if (keyseg->null_bit)
+ {
+ if (record[keyseg->null_pos] & keyseg->null_bit)
+ {
+ *key++= 0; /* NULL in key */
+ continue;
+ }
+ *key++=1; /* Not NULL */
+ }
+
+ char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen :
+ length);
+
+ pos= record+keyseg->start;
+ if (type == HA_KEYTYPE_BIT)
+ {
+ if (keyseg->bit_length)
+ {
+ uchar bits= get_rec_bits(record + keyseg->bit_pos,
+ keyseg->bit_start, keyseg->bit_length);
+ *key++= (char) bits;
+ length--;
+ }
+ memcpy(key, pos, length);
+ key+= length;
+ continue;
+ }
+ if (keyseg->flag & HA_SPACE_PACK)
+ {
+ if (type != HA_KEYTYPE_NUM)
+ {
+ length= (uint) cs->cset->lengthsp(cs, (const char*)pos, length);
+ }
+ else
+ {
+ const uchar *end= pos + length;
+ while (pos < end && pos[0] == ' ')
+ pos++;
+ length= (uint) (end-pos);
+ }
+ FIX_LENGTH(cs, pos, length, char_length);
+ store_key_length_inc(key,char_length);
+ memcpy(key, pos, (size_t) char_length);
+ key+=char_length;
+ continue;
+ }
+ if (keyseg->flag & HA_VAR_LENGTH_PART)
+ {
+ uint pack_length= (keyseg->bit_start == 1 ? 1 : 2);
+ uint tmp_length= (pack_length == 1 ? (uint) *pos :
+ uint2korr(pos));
+ pos+= pack_length; /* Skip VARCHAR length */
+ set_if_smaller(length,tmp_length);
+ FIX_LENGTH(cs, pos, length, char_length);
+ store_key_length_inc(key,char_length);
+ memcpy(key,pos,(size_t) char_length);
+ key+= char_length;
+ continue;
+ }
+ else if (keyseg->flag & HA_BLOB_PART)
+ {
+ uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos);
+ uchar *blob_pos;
+ memcpy_fixed(&blob_pos, pos+keyseg->bit_start,sizeof(char*));
+ set_if_smaller(length,tmp_length);
+ FIX_LENGTH(cs, blob_pos, length, char_length);
+ store_key_length_inc(key,char_length);
+ memcpy(key, blob_pos, (size_t) char_length);
+ key+= char_length;
+ continue;
+ }
+ else if (keyseg->flag & HA_SWAP_KEY)
+ { /* Numerical column */
+#ifdef HAVE_ISNAN
+ if (type == HA_KEYTYPE_FLOAT)
+ {
+ float nr;
+ float4get(nr,pos);
+ if (isnan(nr))
+ {
+ /* Replace NAN with zero */
+ bzero(key,length);
+ key+=length;
+ continue;
+ }
+ }
+ else if (type == HA_KEYTYPE_DOUBLE)
+ {
+ double nr;
+ float8get(nr,pos);
+ if (isnan(nr))
+ {
+ bzero(key,length);
+ key+=length;
+ continue;
+ }
+ }
+#endif
+ pos+=length;
+ while (length--)
+ {
+ *key++ = *--pos;
+ }
+ continue;
+ }
+ FIX_LENGTH(cs, pos, length, char_length);
+ memcpy(key, pos, char_length);
+ if (length > char_length)
+ cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+ key+= length;
+ }
+ _ma_dpointer(info->s, key, filepos);
+ int_key->data_length= (key - int_key->data);
+ int_key->ref_length= info->s->rec_reflength;
+ int_key->flag= 0;
+ if (_ma_have_versioning(info) && trid)
+ {
+ int_key->ref_length+= transid_store_packed(info,
+ key + int_key->ref_length,
+ (TrID) trid);
+ int_key->flag|= SEARCH_USER_KEY_HAS_TRANSID;
+ }
+
+ DBUG_PRINT("exit",("keynr: %d",keynr));
+ DBUG_DUMP_KEY("key", int_key);
+ DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, int_key););
+ DBUG_RETURN(int_key);
+} /* _ma_make_key */
+
+
+/*
+ Pack a key to intern format from given format (c_rkey)
+
+ SYNOPSIS
+ _ma_pack_key()
+ info MARIA handler
+ int_key Store key here
+ keynr key number
+ key Buffer for key data
+ old Original not packed key
+ keypart_map bitmap of used keyparts
+ last_used_keyseg out parameter. May be NULL
+
+ RETURN
+ int_key
+
+ last_use_keyseg Store pointer to the keyseg after the last used one
+*/
+
+MARIA_KEY *_ma_pack_key(register MARIA_HA *info, MARIA_KEY *int_key,
+ uint keynr, uchar *key,
+ const uchar *old, key_part_map keypart_map,
+ HA_KEYSEG **last_used_keyseg)
+{
+ HA_KEYSEG *keyseg;
+ my_bool is_ft;
+ DBUG_ENTER("_ma_pack_key");
+
+ int_key->data= key;
+ int_key->keyinfo= info->s->keyinfo + keynr;
+
+ /* "one part" rtree key is 2*SPDIMS part key in Maria */
+ if (int_key->keyinfo->key_alg == HA_KEY_ALG_RTREE)
+ keypart_map= (((key_part_map)1) << (2*SPDIMS)) - 1;
+
+ /* only key prefixes are supported */
+ DBUG_ASSERT(((keypart_map+1) & keypart_map) == 0);
+
+ is_ft= int_key->keyinfo->flag & HA_FULLTEXT;
+ for (keyseg=int_key->keyinfo->seg ; keyseg->type && keypart_map;
+ old+= keyseg->length, keyseg++)
+ {
+ enum ha_base_keytype type= (enum ha_base_keytype) keyseg->type;
+ uint length= keyseg->length;
+ uint char_length;
+ const uchar *pos;
+ CHARSET_INFO *cs=keyseg->charset;
+
+ keypart_map>>= 1;
+ if (keyseg->null_bit)
+ {
+ if (!(*key++= (char) 1-*old++)) /* Copy null marker */
+ {
+ if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))
+ old+= 2;
+ continue; /* Found NULL */
+ }
+ }
+ char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen :
+ length);
+ pos= old;
+ if (keyseg->flag & HA_SPACE_PACK)
+ {
+ const uchar *end= pos + length;
+ if (type == HA_KEYTYPE_NUM)
+ {
+ while (pos < end && pos[0] == ' ')
+ pos++;
+ }
+ else if (type != HA_KEYTYPE_BINARY)
+ {
+ while (end > pos && end[-1] == ' ')
+ end--;
+ }
+ length=(uint) (end-pos);
+ FIX_LENGTH(cs, pos, length, char_length);
+ store_key_length_inc(key,char_length);
+ memcpy(key,pos,(size_t) char_length);
+ key+= char_length;
+ continue;
+ }
+ else if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))
+ {
+ /* Length of key-part used with maria_rkey() always 2 */
+ uint tmp_length=uint2korr(pos);
+ pos+=2;
+ set_if_smaller(length,tmp_length); /* Safety */
+ FIX_LENGTH(cs, pos, length, char_length);
+ store_key_length_inc(key,char_length);
+ old+=2; /* Skip length */
+ memcpy(key, pos,(size_t) char_length);
+ key+= char_length;
+ continue;
+ }
+ else if (keyseg->flag & HA_SWAP_KEY)
+ { /* Numerical column */
+ pos+=length;
+ while (length--)
+ *key++ = *--pos;
+ continue;
+ }
+ FIX_LENGTH(cs, pos, length, char_length);
+ memcpy(key, pos, char_length);
+ if (length > char_length)
+ cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+ key+= length;
+ }
+ if (last_used_keyseg)
+ *last_used_keyseg= keyseg;
+
+ /* set flag to SEARCH_PART_KEY if we are not using all key parts */
+ int_key->flag= keyseg->type ? SEARCH_PART_KEY : 0;
+ int_key->ref_length= 0;
+ int_key->data_length= (key - int_key->data);
+
+ DBUG_PRINT("exit", ("length: %u", int_key->data_length));
+ DBUG_RETURN(int_key);
+} /* _ma_pack_key */
+
+
+/**
+ Copy a key
+*/
+
+void _ma_copy_key(MARIA_KEY *to, const MARIA_KEY *from)
+{
+ memcpy(to->data, from->data, from->data_length + from->ref_length);
+ to->keyinfo= from->keyinfo;
+ to->data_length= from->data_length;
+ to->ref_length= from->ref_length;
+ to->flag= from->flag;
+}
+
+
+/*
+ Store found key in record
+
+ SYNOPSIS
+ _ma_put_key_in_record()
+ info MARIA handler
+ keynr Key number that was used
+ unpack_blobs TRUE <=> Unpack blob columns
+ FALSE <=> Skip them. This is used by index condition
+ pushdown check function
+ record Store key here
+
+ Last read key is in info->lastkey
+
+ NOTES
+ Used when only-keyread is wanted
+
+ RETURN
+ 0 ok
+ 1 error
+*/
+
/*
  Unpack the key stored in info->last_key into 'record' (record format).

  SYNOPSIS
    _ma_put_key_in_record()
    info          Maria handler; info->last_key.data is the key that was read
    keynr         Index number the key belongs to
    unpack_blobs  If FALSE, blob parts are skipped (their key bytes are
                  consumed but nothing is written to the record)
    record        Record buffer to fill

  RETURN
    0  ok
    1  crashed row (only detected when compiled with CHECK_KEYS)

  NOTE(review): without CHECK_KEYS the 'err:' label is unreferenced, which
  may draw an unused-label warning from some compilers.
*/

static int _ma_put_key_in_record(register MARIA_HA *info, uint keynr,
                                 my_bool unpack_blobs, uchar *record)
{
  reg2 uchar *key;
  uchar *pos,*key_end;
  reg1 HA_KEYSEG *keyseg;
  uchar *blob_ptr;
  DBUG_ENTER("_ma_put_key_in_record");

  blob_ptr= info->lastkey_buff2;                /* Place to put blob parts */
  key= info->last_key.data;                     /* Key that was read */
  key_end= key + info->last_key.data_length;
  for (keyseg=info->s->keyinfo[keynr].seg ; keyseg->type ;keyseg++)
  {
    if (keyseg->null_bit)
    {
      /* Key starts with a null-indicator byte; 0 means SQL NULL */
      if (!*key++)
      {
        record[keyseg->null_pos]|= keyseg->null_bit;
        continue;
      }
      record[keyseg->null_pos]&= ~keyseg->null_bit;
    }
    if (keyseg->type == HA_KEYTYPE_BIT)
    {
      uint length= keyseg->length;

      if (keyseg->bit_length)
      {
        /* High bits are carried in a separate leading key byte */
        uchar bits= *key++;
        set_rec_bits(bits, record + keyseg->bit_pos, keyseg->bit_start,
                     keyseg->bit_length);
        length--;
      }
      else
      {
        clr_rec_bits(record + keyseg->bit_pos, keyseg->bit_start,
                     keyseg->bit_length);
      }
      memcpy(record + keyseg->start, key, length);
      key+= length;
      continue;
    }
    if (keyseg->flag & HA_SPACE_PACK)
    {
      uint length;
      get_key_length(length,key);
#ifdef CHECK_KEYS
      if (length > keyseg->length || key+length > key_end)
        goto err;
#endif
      pos= record+keyseg->start;
      if (keyseg->type != (int) HA_KEYTYPE_NUM)
      {
        /* Character data: copy, then pad with spaces on the right */
        memcpy(pos,key,(size_t) length);
        keyseg->charset->cset->fill(keyseg->charset,
                                    (char*) pos + length,
                                    keyseg->length - length,
                                    ' ');
      }
      else
      {
        /* Numbers are right-aligned: pad with spaces on the left */
        bfill(pos,keyseg->length-length,' ');
        memcpy(pos+keyseg->length-length,key,(size_t) length);
      }
      key+=length;
      continue;
    }

    if (keyseg->flag & HA_VAR_LENGTH_PART)
    {
      uint length;
      get_key_length(length,key);
#ifdef CHECK_KEYS
      if (length > keyseg->length || key+length > key_end)
        goto err;
#endif
      /* Store key length */
      if (keyseg->bit_start == 1)
        *(uchar*) (record+keyseg->start)= (uchar) length;
      else
        int2store(record+keyseg->start, length);
      /* And key data */
      memcpy(record+keyseg->start + keyseg->bit_start, key, length);
      key+= length;
    }
    else if (keyseg->flag & HA_BLOB_PART)
    {
      uint length;
      get_key_length(length,key);
#ifdef CHECK_KEYS
      if (length > keyseg->length || key+length > key_end)
        goto err;
#endif
      if (unpack_blobs)
      {
        /* Point the record's blob pointer into lastkey_buff2 */
        memcpy(record+keyseg->start+keyseg->bit_start,
               (char*) &blob_ptr,sizeof(char*));
        memcpy(blob_ptr,key,length);
        blob_ptr+=length;

        /* The above changed info->lastkey2. Inform maria_rnext_same(). */
        info->update&= ~HA_STATE_RNEXT_SAME;

        _ma_store_blob_length(record+keyseg->start,
                              (uint) keyseg->bit_start,length);
      }
      key+=length;
    }
    else if (keyseg->flag & HA_SWAP_KEY)
    {
      /* Key bytes are stored reversed relative to record format */
      uchar *to= record+keyseg->start+keyseg->length;
      uchar *end= key+keyseg->length;
#ifdef CHECK_KEYS
      if (end > key_end)
        goto err;
#endif
      do
      {
        *--to= *key++;
      } while (key != end);
      continue;
    }
    else
    {
      /* Plain fixed-length segment: straight copy */
#ifdef CHECK_KEYS
      if (key+keyseg->length > key_end)
        goto err;
#endif
      memcpy(record+keyseg->start, key, (size_t) keyseg->length);
      key+= keyseg->length;
    }
  }
  DBUG_RETURN(0);

err:
  DBUG_PRINT("info",("error"));
  DBUG_RETURN(1);                               /* Crashed row */
} /* _ma_put_key_in_record */
+
+
+ /* Here when key reads are used */
+
+int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos)
+{
+ fast_ma_writeinfo(info);
+ if (filepos != HA_OFFSET_ERROR)
+ {
+ if (info->lastinx >= 0)
+ { /* Read only key */
+ if (_ma_put_key_in_record(info, (uint)info->lastinx, TRUE, buf))
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ return -1;
+ }
+ info->update|= HA_STATE_AKTIV; /* We should find a record */
+ return 0;
+ }
+ my_errno=HA_ERR_WRONG_INDEX;
+ }
+ return(-1); /* Wrong data to read */
+}
+
+
+
/*
  Save current key tuple to record and call index condition check function

  SYNOPSIS
    ma_check_index_cond()
    info    Maria handler
    keynr   Index we're running a scan on
    record  Record buffer to use (it is assumed that index check function
            will look for column values there)

  RETURN
    ICP_ERROR         Error
    ICP_NO_MATCH      Index condition is not satisfied, continue scanning
    ICP_MATCH         Index condition is satisfied
    ICP_OUT_OF_RANGE  Index condition is not satisfied, end the scan.

  NOTE(review): the code below returns -1 on error and 1 when no index
  condition is installed; presumably these map onto the ICP_* values listed
  above -- confirm against the ICP result enum.
*/

int ma_check_index_cond(register MARIA_HA *info, uint keynr, uchar *record)
{
  if (info->index_cond_func)
  {
    /* Unpack key columns only (no blob data) for the pushed condition */
    if (_ma_put_key_in_record(info, keynr, FALSE, record))
    {
      maria_print_error(info->s, HA_ERR_CRASHED);
      my_errno=HA_ERR_CRASHED;
      return -1;
    }
    return info->index_cond_func(info->index_cond_func_arg);
  }
  /* No pushed index condition: everything matches */
  return 1;
}
+
+
/*
  Retrieve auto_increment info

  SYNOPSIS
    retrieve_auto_increment()
    key       Auto-increment key
    key_type  Key's type

  NOTE
    'key' should be in "record" format, that is, how it is packed in a
    record (this matters with HA_SWAP_KEY).

  IMPLEMENTATION
    For signed columns we don't retrieve the auto increment value if it's
    less than zero.

  RETURN
    The (non-negative) auto-increment value read from the key; 0 for
    negative values or unsupported key types.
*/

ulonglong ma_retrieve_auto_increment(const uchar *key, uint8 key_type)
{
  ulonglong value= 0;                   /* Store unsigned values here */
  longlong s_value= 0;                  /* Store signed values here */

  switch (key_type) {
  case HA_KEYTYPE_INT8:
    s_value= (longlong) *(const char*)key;
    break;
  case HA_KEYTYPE_BINARY:
    value=(ulonglong) *key;
    break;
  case HA_KEYTYPE_SHORT_INT:
    s_value= (longlong) sint2korr(key);
    break;
  case HA_KEYTYPE_USHORT_INT:
    value=(ulonglong) uint2korr(key);
    break;
  case HA_KEYTYPE_LONG_INT:
    s_value= (longlong) sint4korr(key);
    break;
  case HA_KEYTYPE_ULONG_INT:
    value=(ulonglong) uint4korr(key);
    break;
  case HA_KEYTYPE_INT24:
    s_value= (longlong) sint3korr(key);
    break;
  case HA_KEYTYPE_UINT24:
    value=(ulonglong) uint3korr(key);
    break;
  case HA_KEYTYPE_FLOAT:                        /* This shouldn't be used */
  {
    float f_1;
    float4get(f_1,key);
    /* Ignore negative values */
    value = (f_1 < (float) 0.0) ? 0 : (ulonglong) f_1;
    break;
  }
  case HA_KEYTYPE_DOUBLE:                       /* This shouldn't be used */
  {
    double f_1;
    float8get(f_1,key);
    /* Ignore negative values */
    value = (f_1 < 0.0) ? 0 : (ulonglong) f_1;
    break;
  }
  case HA_KEYTYPE_LONGLONG:
    s_value= sint8korr(key);
    break;
  case HA_KEYTYPE_ULONGLONG:
    value= uint8korr(key);
    break;
  default:
    DBUG_ASSERT(0);
    value=0;                                    /* Error */
    break;
  }

  /*
    The following code works because if s_value < 0 then value is 0
    and if s_value == 0 then value will contain either s_value or the
    correct value.
  */
  return (s_value > 0) ? (ulonglong) s_value : value;
}
diff --git a/storage/maria/ma_key_recover.c b/storage/maria/ma_key_recover.c
new file mode 100644
index 00000000000..6de5253a2dd
--- /dev/null
+++ b/storage/maria/ma_key_recover.c
@@ -0,0 +1,1432 @@
+/* Copyright (C) 2007 Michael Widenius
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Redo of index */
+
+#include "maria_def.h"
+#include "ma_blockrec.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+#include "ma_rt_index.h"
+
/****************************************************************************
  Some helper functions used both by key page logging and block page logging
****************************************************************************/
+
/**
  @brief Unpin all pinned pages

  @fn _ma_unpin_all_pages()
  @param info      Maria handler
  @param undo_lsn  LSN for undo pages. LSN_IMPOSSIBLE if we shouldn't write
                   undo (like on duplicate key errors)

  info->pinned_pages is the list of pages to unpin. Each member of the list
  must have its 'changed' saying if the page was changed or not.

  @note
    We unpin pages in the reverse order as they were pinned; This is not
    necessary now, but may simplify things in the future.

  @note
    This function returns nothing (the original header documented 0/1
    return values, but the function is void).
*/

void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn)
{
  MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*)
                                 dynamic_array_ptr(&info->pinned_pages, 0));
  MARIA_PINNED_PAGE *pinned_page= page_link + info->pinned_pages.elements;
  DBUG_ENTER("_ma_unpin_all_pages");
  DBUG_PRINT("info", ("undo_lsn: %lu", (ulong) undo_lsn));

  if (!info->s->now_transactional)
    DBUG_ASSERT(undo_lsn == LSN_IMPOSSIBLE || maria_in_recovery);

  /* Walk the array backwards: unpin in reverse order of pinning */
  while (pinned_page-- != page_link)
  {
    /*
      Note this assert fails if we got a disk error or the record file
      is corrupted, which means we should have this enabled only in debug
      builds.
    */
#ifdef EXTRA_DEBUG
    DBUG_ASSERT((!pinned_page->changed ||
                 undo_lsn != LSN_IMPOSSIBLE || !info->s->now_transactional) ||
                (info->s->state.changed & STATE_CRASHED));
#endif
    pagecache_unlock_by_link(info->s->pagecache, pinned_page->link,
                             pinned_page->unlock, PAGECACHE_UNPIN,
                             info->trn->rec_lsn, undo_lsn,
                             pinned_page->changed, FALSE);
  }

  info->pinned_pages.elements= 0;
  DBUG_VOID_RETURN;
}
+
+
/**
  @brief Write a LOGREC_CLR_END record (compensation record for an undo)

  @param info            Maria handler
  @param undo_lsn        LSN of the undo record being compensated
  @param undo_type       Type of the record that was undone
  @param store_checksum  TRUE if 'checksum' should be stored in the record
  @param checksum        Checksum delta caused by undoing the operation
  @param res_lsn         Out: LSN of the written CLR record
  @param extra_msg       Extra info, handled in write_hook_for_clr_end()

  @return Result of translog_write_record()
  @retval 0  ok
  @retval 1  error
*/

my_bool _ma_write_clr(MARIA_HA *info, LSN undo_lsn,
                      enum translog_record_type undo_type,
                      my_bool store_checksum, ha_checksum checksum,
                      LSN *res_lsn, void *extra_msg)
{
  uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + CLR_TYPE_STORE_SIZE +
                 HA_CHECKSUM_STORE_SIZE+ KEY_NR_STORE_SIZE + PAGE_STORE_SIZE];
  uchar *log_pos;
  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
  struct st_msg_to_write_hook_for_clr_end msg;
  my_bool res;
  DBUG_ENTER("_ma_write_clr");

  /* undo_lsn must be first for compression to work */
  lsn_store(log_data, undo_lsn);
  clr_type_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, undo_type);
  log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + CLR_TYPE_STORE_SIZE;

  /* Extra_msg is handled in write_hook_for_clr_end() */
  msg.undone_record_type= undo_type;
  msg.previous_undo_lsn= undo_lsn;
  msg.extra_msg= extra_msg;
  msg.checksum_delta= 0;

  if (store_checksum)
  {
    msg.checksum_delta= checksum;
    ha_checksum_store(log_pos, checksum);
    log_pos+= HA_CHECKSUM_STORE_SIZE;
  }
  else if (undo_type == LOGREC_UNDO_KEY_INSERT_WITH_ROOT ||
           undo_type == LOGREC_UNDO_KEY_DELETE_WITH_ROOT)
  {
    /* Key root changed. Store new key root */
    struct st_msg_to_write_hook_for_undo_key *undo_msg= extra_msg;
    pgcache_page_no_t page;
    key_nr_store(log_pos, undo_msg->keynr);
    page= (undo_msg->value == HA_OFFSET_ERROR ? IMPOSSIBLE_PAGE_NO :
           undo_msg->value / info->s->block_size);
    page_store(log_pos + KEY_NR_STORE_SIZE, page);
    log_pos+= KEY_NR_STORE_SIZE + PAGE_STORE_SIZE;
  }
  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data);


  /*
    We need intern_lock mutex for calling _ma_state_info_write in the trigger.
    We do it here to have the same sequence of mutexes locking everywhere
    (first intern_lock then transactional log buffer lock)
  */
  if (undo_type == LOGREC_UNDO_BULK_INSERT)
    pthread_mutex_lock(&info->s->intern_lock);

  res= translog_write_record(res_lsn, LOGREC_CLR_END,
                             info->trn, info,
                             (translog_size_t)
                             log_array[TRANSLOG_INTERNAL_PARTS + 0].length,
                             TRANSLOG_INTERNAL_PARTS + 1, log_array,
                             log_data + LSN_STORE_SIZE, &msg);
  if (undo_type == LOGREC_UNDO_BULK_INSERT)
    pthread_mutex_unlock(&info->s->intern_lock);
  DBUG_RETURN(res);
}
+
+
/**
  @brief Sets transaction's undo_lsn, first_undo_lsn if needed

  Also rolls the in-memory state (record count, checksum, key roots,
  auto-repair of disabled indexes) back according to the type of the
  record that was undone.

  @param type      Log record type (unused)
  @param trn       Transaction whose undo chain is being walked back
  @param tbl_info  Maria handler of the affected table
  @param lsn       LSN of the CLR record (unused)
  @param hook_arg  struct st_msg_to_write_hook_for_clr_end

  @return Operation status; 0 except for LOGREC_UNDO_BULK_INSERT, where
          re-enabling indexes / writing state info may fail
*/

my_bool write_hook_for_clr_end(enum translog_record_type type
                               __attribute__ ((unused)),
                               TRN *trn, MARIA_HA *tbl_info,
                               LSN *lsn __attribute__ ((unused)),
                               void *hook_arg)
{
  MARIA_SHARE *share= tbl_info->s;
  struct st_msg_to_write_hook_for_clr_end *msg=
    (struct st_msg_to_write_hook_for_clr_end *)hook_arg;
  my_bool error= FALSE;
  DBUG_ENTER("write_hook_for_clr_end");
  DBUG_ASSERT(trn->trid != 0);
  trn->undo_lsn= msg->previous_undo_lsn;

  switch (msg->undone_record_type) {
  case LOGREC_UNDO_ROW_DELETE:
    /* Undoing a delete restores the row */
    share->state.state.records++;
    share->state.state.checksum+= msg->checksum_delta;
    break;
  case LOGREC_UNDO_ROW_INSERT:
    /* Undoing an insert removes the row */
    share->state.state.records--;
    share->state.state.checksum+= msg->checksum_delta;
    break;
  case LOGREC_UNDO_ROW_UPDATE:
    share->state.state.checksum+= msg->checksum_delta;
    break;
  case LOGREC_UNDO_KEY_INSERT_WITH_ROOT:
  case LOGREC_UNDO_KEY_DELETE_WITH_ROOT:
  {
    /* Update key root */
    struct st_msg_to_write_hook_for_undo_key *extra_msg=
      (struct st_msg_to_write_hook_for_undo_key *) msg->extra_msg;
    *extra_msg->root= extra_msg->value;
    break;
  }
  case LOGREC_UNDO_KEY_INSERT:
  case LOGREC_UNDO_KEY_DELETE:
    break;
  case LOGREC_UNDO_BULK_INSERT:
    safe_mutex_assert_owner(&share->intern_lock);
    error= (maria_enable_indexes(tbl_info) ||
            /* we enabled indices, need '2' below */
            _ma_state_info_write(share,
                                 MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
                                 MA_STATE_INFO_WRITE_FULL_INFO));
    /* no need for _ma_reset_status(): REDO_DELETE_ALL is just before us */
    break;
  default:
    DBUG_ASSERT(0);
  }
  if (trn->undo_lsn == LSN_IMPOSSIBLE)  /* has fully rolled back */
    trn->first_undo_lsn= LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
  DBUG_RETURN(error);
}
+
+
/**
  @brief write hook for undo key

  Installs the new key root, releases the shared key_del page and then
  delegates to the generic undo hook.

  @return Result of write_hook_for_undo()
*/

my_bool write_hook_for_undo_key(enum translog_record_type type,
                                TRN *trn, MARIA_HA *tbl_info,
                                LSN *lsn, void *hook_arg)
{
  struct st_msg_to_write_hook_for_undo_key *msg=
    (struct st_msg_to_write_hook_for_undo_key *) hook_arg;

  *msg->root= msg->value;
  _ma_fast_unlock_key_del(tbl_info);
  return write_hook_for_undo(type, trn, tbl_info, lsn, 0);
}
+
+
/**
  Updates "auto_increment" and calls the generic UNDO_KEY hook

  @return Operation status, always 0 (success)
*/

my_bool write_hook_for_undo_key_insert(enum translog_record_type type,
                                       TRN *trn, MARIA_HA *tbl_info,
                                       LSN *lsn, void *hook_arg)
{
  struct st_msg_to_write_hook_for_undo_key *msg=
    (struct st_msg_to_write_hook_for_undo_key *) hook_arg;
  MARIA_SHARE *share= tbl_info->s;
  if (msg->auto_increment > 0)
  {
    /*
      Only reason to set it here is to have a mutex protect from checkpoint
      reading at the same time (would see a corrupted value).

      The purpose of the following code is to set auto_increment if the row
      has an auto_increment value higher than the current one. We also
      want to be able to restore the old value, in case of rollback,
      if no one else has tried to set the value.

      The logic used is that we only restore the auto_increment value if
      tbl_info->last_auto_increment == share->last_auto_increment
      when it's time to do the rollback.
    */
    DBUG_PRINT("info",("auto_inc: %lu new auto_inc: %lu",
                       (ulong)share->state.auto_increment,
                       (ulong)msg->auto_increment));
    if (share->state.auto_increment < msg->auto_increment)
    {
      /* Remember the original value, in case of rollback */
      tbl_info->last_auto_increment= share->last_auto_increment=
        share->state.auto_increment;
      share->state.auto_increment= msg->auto_increment;
    }
    else
    {
      /*
        If the current value would have affected the original auto_increment
        value, set it to an impossible value so that it's not restored on
        rollback
      */
      if (msg->auto_increment > share->last_auto_increment)
        share->last_auto_increment= ~(ulonglong) 0;
    }
  }
  return write_hook_for_undo_key(type, trn, tbl_info, lsn, hook_arg);
}
+
+
/**
  @brief Updates "share->auto_increment" in case of abort and calls
  generic UNDO_KEY hook

  @return Operation status, always 0 (success)
*/

my_bool write_hook_for_undo_key_delete(enum translog_record_type type,
                                       TRN *trn, MARIA_HA *tbl_info,
                                       LSN *lsn, void *hook_arg)
{
  struct st_msg_to_write_hook_for_undo_key *msg=
    (struct st_msg_to_write_hook_for_undo_key *) hook_arg;
  MARIA_SHARE *share= tbl_info->s;
  if (msg->auto_increment > 0)                  /* If auto increment key */
  {
    /* Restore auto increment if no one has changed it in between */
    if (share->last_auto_increment == tbl_info->last_auto_increment &&
        tbl_info->last_auto_increment != ~(ulonglong) 0)
      share->state.auto_increment= tbl_info->last_auto_increment;
  }
  return write_hook_for_undo_key(type, trn, tbl_info, lsn, hook_arg);
}
+
+
+/*****************************************************************************
+ Functions for logging of key page changes
+*****************************************************************************/
+
/**
  @brief
  Write log entry for page that has got data added or deleted at start of page

  @param ma_page         Changed key page
  @param changed_length  Number of bytes at the page start that were changed
  @param move_length     Bytes added (>0) or removed (<0) at the page start
  @param debug_marker    Logging-site marker (only with EXTRA_DEBUG_KEY_CHANGES)

  @return Result of translog_write_record(); 0 ok, non-zero error
*/

my_bool _ma_log_prefix(MARIA_PAGE *ma_page, uint changed_length,
                       int move_length,
                       enum en_key_debug debug_marker __attribute__((unused)))
{
  uint translog_parts;
  LSN lsn;
  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 7 + 7 + 2 + 2];
  uchar *log_pos;
  uchar *buff= ma_page->buff;
  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
  MARIA_HA *info= ma_page->info;
  pgcache_page_no_t page= ma_page->pos / info->s->block_size;
  DBUG_ENTER("_ma_log_prefix");
  DBUG_PRINT("enter", ("page: %lu changed_length: %u move_length: %d",
                       (ulong) page, changed_length, move_length));

  DBUG_ASSERT(ma_page->size == ma_page->org_size + move_length);

  log_pos= log_data + FILEID_STORE_SIZE;
  page_store(log_pos, page);
  log_pos+= PAGE_STORE_SIZE;

#ifdef EXTRA_DEBUG_KEY_CHANGES
  (*log_pos++)= KEY_OP_DEBUG;
  (*log_pos++)= debug_marker;
#endif

  /* Store keypage_flag */
  *log_pos++= KEY_OP_SET_PAGEFLAG;
  *log_pos++= buff[KEYPAGE_TRANSFLAG_OFFSET];

  if (move_length < 0)
  {
    /* Delete prefix */
    log_pos[0]= KEY_OP_DEL_PREFIX;
    int2store(log_pos+1, -move_length);
    log_pos+= 3;
    if (changed_length)
    {
      /*
        We don't need a KEY_OP_OFFSET as KEY_OP_DEL_PREFIX has an implicit
        offset
      */
      log_pos[0]= KEY_OP_CHANGE;
      int2store(log_pos+1, changed_length);
      log_pos+= 3;
    }
  }
  else
  {
    /* Add prefix */
    DBUG_ASSERT(changed_length >0 && (int) changed_length >= move_length);
    log_pos[0]= KEY_OP_ADD_PREFIX;
    int2store(log_pos+1, move_length);
    int2store(log_pos+3, changed_length);
    log_pos+= 5;
  }

  translog_parts= 1;
  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
                                                         log_data);
  if (changed_length)
  {
    /* Second part carries the changed page bytes themselves */
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (buff +
                                                 info->s->keypage_header);
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= changed_length;
    translog_parts= 2;
  }

  _ma_log_key_changes(ma_page, log_array + TRANSLOG_INTERNAL_PARTS +
                      translog_parts, log_pos, &changed_length,
                      &translog_parts);
  /* Remember new page length for future log entries for same page */
  ma_page->org_size= ma_page->size;

  DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
                                    info->trn, info,
                                    (translog_size_t)
                                    log_array[TRANSLOG_INTERNAL_PARTS +
                                              0].length + changed_length,
                                    TRANSLOG_INTERNAL_PARTS + translog_parts,
                                    log_array, log_data, NULL));
}
+
+
/**
  @brief
  Write log entry for page that has got data added or deleted at end of page

  @param ma_page     Changed key page
  @param org_length  Page length before the change (must equal org_size)
  @param new_length  Page length after the change (must equal ma_page->size)

  @return Result of translog_write_record(); 0 ok, non-zero error
*/

my_bool _ma_log_suffix(MARIA_PAGE *ma_page, uint org_length, uint new_length)
{
  LSN lsn;
  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 10 + 7 + 2], *log_pos;
  uchar *buff= ma_page->buff;
  int diff;
  uint translog_parts, extra_length;
  MARIA_HA *info= ma_page->info;
  pgcache_page_no_t page= ma_page->pos / info->s->block_size;
  DBUG_ENTER("_ma_log_suffix");
  DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u",
                       (ulong) page, org_length, new_length));
  DBUG_ASSERT(ma_page->size == new_length);
  DBUG_ASSERT(ma_page->org_size == org_length);

  log_pos= log_data + FILEID_STORE_SIZE;
  page_store(log_pos, page);
  log_pos+= PAGE_STORE_SIZE;

  /* Store keypage_flag */
  *log_pos++= KEY_OP_SET_PAGEFLAG;
  *log_pos++= buff[KEYPAGE_TRANSFLAG_OFFSET];

  if ((diff= (int) (new_length - org_length)) < 0)
  {
    /* Page shrank: log only the number of bytes removed */
    log_pos[0]= KEY_OP_DEL_SUFFIX;
    int2store(log_pos+1, -diff);
    log_pos+= 3;
    translog_parts= 1;
    extra_length= 0;
  }
  else
  {
    /* Page grew: log the appended bytes as a second part */
    log_pos[0]= KEY_OP_ADD_SUFFIX;
    int2store(log_pos+1, diff);
    log_pos+= 3;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= buff + org_length;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= (uint) diff;
    translog_parts= 2;
    extra_length= (uint) diff;
  }

  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
                                                         log_data);

  _ma_log_key_changes(ma_page,
                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
                      log_pos, &extra_length, &translog_parts);
  /* Remember new page length for future log entries for same page */
  ma_page->org_size= ma_page->size;

  DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
                                    info->trn, info,
                                    (translog_size_t)
                                    log_array[TRANSLOG_INTERNAL_PARTS +
                                              0].length + extra_length,
                                    TRANSLOG_INTERNAL_PARTS + translog_parts,
                                    log_array, log_data, NULL));
}
+
+
/**
  @brief Log that a key was added to the page

  @param ma_page          Changed page
  @param org_page_length  Length of data in page before key was added
                          Final length in ma_page->size
  @param key_pos          Position in page buffer where data was changed
  @param changed_length   Number of changed bytes at key_pos
  @param move_length      Bytes shifted up (>0) or down (<0) at key_pos
  @param handle_overflow  TRUE if data outside the page may have to be cut
  @param debug_marker     Logging-site marker (EXTRA_DEBUG_KEY_CHANGES only)

  @note
  If handle_overflow is set, then we have to protect against
  logging changes that is outside of the page.
  This may happen during underflow() handling where the buffer
  in memory temporary contains more data than block_size

  ma_page may be a page that was previously logged and cut down
  because it's too big. (org_page_length > ma_page->org_size)

  @return 0 ok, non-zero on log write failure
  NOTE(review): the function is declared my_bool but returns -1 on error;
  callers presumably only test for non-zero -- confirm.
*/

my_bool _ma_log_add(MARIA_PAGE *ma_page,
                    uint org_page_length __attribute__ ((unused)),
                    uchar *key_pos, uint changed_length, int move_length,
                    my_bool handle_overflow __attribute__ ((unused)),
                    enum en_key_debug debug_marker __attribute__((unused)))
{
  LSN lsn;
  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 3 + 3 + 3 + 3 + 7 +
                 3 + 2];
  uchar *log_pos;
  uchar *buff= ma_page->buff;
  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6];
  MARIA_HA *info= ma_page->info;
  uint offset= (uint) (key_pos - buff);
  uint max_page_size= info->s->max_index_block_size;
  uint translog_parts, current_size;
  pgcache_page_no_t page_pos= ma_page->pos / info->s->block_size;
  DBUG_ENTER("_ma_log_add");
  DBUG_PRINT("enter", ("page: %lu org_page_length: %u changed_length: %u "
                       "move_length: %d",
                       (ulong) page_pos, org_page_length, changed_length,
                       move_length));
  DBUG_ASSERT(info->s->now_transactional);
  DBUG_ASSERT(move_length <= (int) changed_length);
  DBUG_ASSERT(ma_page->org_size == min(org_page_length, max_page_size));
  DBUG_ASSERT(ma_page->size == org_page_length + move_length);
  DBUG_ASSERT(offset <= ma_page->org_size);

  /*
    Write REDO entry that contains the logical operations we need
    to do the page
  */
  log_pos= log_data + FILEID_STORE_SIZE;
  page_store(log_pos, page_pos);
  current_size= ma_page->org_size;
  log_pos+= PAGE_STORE_SIZE;

#ifdef EXTRA_DEBUG_KEY_CHANGES
  *log_pos++= KEY_OP_DEBUG;
  *log_pos++= debug_marker;
#endif

  /* Store keypage_flag */
  *log_pos++= KEY_OP_SET_PAGEFLAG;
  *log_pos++= buff[KEYPAGE_TRANSFLAG_OFFSET];

  /*
    Don't overwrite page boundary
    It's ok to cut this as we will append the data at end of page
    in the next log entry
  */
  if (offset + changed_length > max_page_size)
  {
    DBUG_ASSERT(handle_overflow);
    changed_length= max_page_size - offset;   /* Update to end of page */
    move_length= 0;                           /* Nothing to move */
    /* Extend the page to max length on recovery */
    *log_pos++= KEY_OP_MAX_PAGELENGTH;
    current_size= max_page_size;
  }

  /* Check if adding the key made the page overflow */
  if (current_size + move_length > max_page_size)
  {
    /*
      Adding the key caused an overflow. Cut away the part of the
      page that doesn't fit.
    */
    uint diff;
    DBUG_ASSERT(handle_overflow);
    diff= current_size + move_length - max_page_size;
    log_pos[0]= KEY_OP_DEL_SUFFIX;
    int2store(log_pos+1, diff);
    log_pos+= 3;
    current_size= max_page_size - move_length;
  }

  if (offset == current_size)
  {
    /* Change is exactly at the page end: a plain suffix append */
    log_pos[0]= KEY_OP_ADD_SUFFIX;
    current_size+= changed_length;
  }
  else
  {
    log_pos[0]= KEY_OP_OFFSET;
    int2store(log_pos+1, offset);
    log_pos+= 3;
    if (move_length)
    {
      if (move_length < 0)
      {
        DBUG_ASSERT(offset - move_length <= org_page_length);
        if (offset - move_length > current_size)
        {
          /*
            Truncate to end of page. We will add data to it from
            the page buffer below
          */
          move_length= (int) offset - (int) current_size;
        }
      }
      log_pos[0]= KEY_OP_SHIFT;
      int2store(log_pos+1, move_length);
      log_pos+= 3;
      current_size+= move_length;
    }
    /*
      Handle case where page was shortened but 'changed_length' goes over
      'current_size'. This can only happen when there was a page overflow
      and we will below add back the overflow part
    */
    if (offset + changed_length > current_size)
    {
      DBUG_ASSERT(offset + changed_length <= ma_page->size);
      changed_length= current_size - offset;
    }
    log_pos[0]= KEY_OP_CHANGE;
  }
  /* Store the length for the KEY_OP_ADD_SUFFIX/KEY_OP_CHANGE above */
  int2store(log_pos+1, changed_length);
  log_pos+= 3;

  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
                                                         log_data);
  log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos;
  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= changed_length;
  /* Note: unlike other functions here, this counts INTERNAL_PARTS too */
  translog_parts= TRANSLOG_INTERNAL_PARTS + 2;

  /*
    If page was originally > block_size before operation and now all data
    fits, append the end data that was not part of the previous logged
    page to it.
  */
  DBUG_ASSERT(current_size <= max_page_size && current_size <= ma_page->size);
  if (current_size != ma_page->size && current_size != max_page_size)
  {
    uint length= min(ma_page->size, max_page_size) - current_size;
    uchar *data= ma_page->buff + current_size;

    log_pos[0]= KEY_OP_ADD_SUFFIX;
    int2store(log_pos+1, length);
    log_array[translog_parts].str= log_pos;
    log_array[translog_parts].length= 3;
    log_array[translog_parts+1].str= data;
    log_array[translog_parts+1].length= length;
    log_pos+= 3;
    translog_parts+= 2;
    current_size+= length;
    changed_length+= length + 3;
  }

  _ma_log_key_changes(ma_page, log_array + translog_parts,
                      log_pos, &changed_length, &translog_parts);
  /*
    Remember new page length for future log entries for same page
    Note that this can be different from ma_page->size in case of page
    overflow!
  */
  ma_page->org_size= current_size;
  DBUG_ASSERT(ma_page->org_size == min(ma_page->size, max_page_size));

  if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
                            info->trn, info,
                            (translog_size_t)
                            log_array[TRANSLOG_INTERNAL_PARTS + 0].length +
                            changed_length, translog_parts,
                            log_array, log_data, NULL))
    DBUG_RETURN(-1);
  DBUG_RETURN(0);
}
+
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+
/*
  Log checksum and optionally key page to log

  Appends a KEY_OP_CHECK entry (page length + CRC of the page content,
  excluding the stored LSN) to the log parts being assembled, and with
  EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES also the whole page image.
  Updates *changed_length and *translog_parts accordingly.
*/

void _ma_log_key_changes(MARIA_PAGE *ma_page, LEX_CUSTRING *log_array,
                         uchar *log_pos, uint *changed_length,
                         uint *translog_parts)
{
  MARIA_SHARE *share= ma_page->info->s;
  int page_length= min(ma_page->size, share->max_index_block_size);
  uint org_length;
  ha_checksum crc;

  DBUG_ASSERT(ma_page->flag == (uint) ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET]);

  /* We have to change length as the page may have been shortened */
  org_length= _ma_get_page_used(share, ma_page->buff);
  _ma_store_page_used(share, ma_page->buff, page_length);
  crc= my_checksum(0, ma_page->buff + LSN_STORE_SIZE,
                   page_length - LSN_STORE_SIZE);
  /* Restore the real in-memory length after checksumming */
  _ma_store_page_used(share, ma_page->buff, org_length);

  log_pos[0]= KEY_OP_CHECK;
  int2store(log_pos+1, page_length);
  int4store(log_pos+3, crc);

  log_array[0].str= log_pos;
  log_array[0].length= 7;
  (*changed_length)+= 7;
  (*translog_parts)++;
#ifdef EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES
  log_array[1].str= ma_page->buff;
  log_array[1].length= page_length;
  (*changed_length)+= page_length;
  (*translog_parts)++;
#endif /* EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES */
}
+
+#endif /* EXTRA_DEBUG_KEY_CHANGES */
+
+/****************************************************************************
+ Redo of key pages
+****************************************************************************/
+
/**
  @brief Apply LOGREC_REDO_INDEX_NEW_PAGE

  @param info    Maria handler
  @param lsn     LSN of the record being applied
  @param header  Header (without FILEID)
  @param length  Length of 'header'

  @return Operation status
    @retval 0 OK
    @retval 1 Error

  NOTE(review): the first error path reaches 'err:' after pagecache_read()
  returned NULL; it then unlocks page_link.link -- confirm pagecache_read()
  sets the link even on failure.
*/

uint _ma_apply_redo_index_new_page(MARIA_HA *info, LSN lsn,
                                   const uchar *header, uint length)
{
  pgcache_page_no_t root_page= page_korr(header);
  pgcache_page_no_t free_page= page_korr(header + PAGE_STORE_SIZE);
  uint key_nr= key_nr_korr(header + PAGE_STORE_SIZE * 2);
  my_bool page_type_flag= header[PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE];
  enum pagecache_page_lock unlock_method;
  enum pagecache_page_pin unpin_method;
  MARIA_PINNED_PAGE page_link;
  my_off_t file_size;
  uchar *buff;
  uint result;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_apply_redo_index_new_page");
  DBUG_PRINT("enter", ("root_page: %lu  free_page: %lu",
                       (ulong) root_page, (ulong) free_page));

  /* Set header to point at key data */

  share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS |
                          STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  header+= PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1;
  length-= PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1;

  file_size= (my_off_t) (root_page + 1) * share->block_size;
  /* Only update state if the log record is newer than the state on disk */
  if (cmp_translog_addr(lsn, share->state.is_of_horizon) >= 0)
  {
    /* free_page is 0 if we shouldn't set key_del */
    if (free_page)
    {
      if (free_page != IMPOSSIBLE_PAGE_NO)
        share->state.key_del= (my_off_t) free_page * share->block_size;
      else
        share->state.key_del= HA_OFFSET_ERROR;
    }
    if (page_type_flag)     /* root page */
      share->state.key_root[key_nr]= file_size - share->block_size;
  }

  if (file_size > share->state.state.key_file_length)
  {
    /* Page is beyond current file end: no need to read old content */
    share->state.state.key_file_length= file_size;
    buff= info->keyread_buff;
    info->keyread_buff_used= 1;
    unlock_method= PAGECACHE_LOCK_WRITE;
    unpin_method=  PAGECACHE_PIN;
  }
  else
  {
    if (!(buff= pagecache_read(share->pagecache, &share->kfile,
                               root_page, 0, 0,
                               PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
                               &page_link.link)))
    {
      if (my_errno != HA_ERR_FILE_TOO_SHORT &&
          my_errno != HA_ERR_WRONG_CRC)
      {
        result= 1;
        goto err;
      }
      /* Short/corrupt page is fine: we are going to overwrite it anyway */
      buff= pagecache_block_link_to_buffer(page_link.link);
    }
    else if (lsn_korr(buff) >= lsn)
    {
      /* Already applied */
      DBUG_PRINT("info", ("Page is up to date, skipping redo"));
      result= 0;
      goto err;
    }
    unlock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    unpin_method=  PAGECACHE_PIN_LEFT_PINNED;
  }

  /* Write modified page */
  bzero(buff, LSN_STORE_SIZE);
  memcpy(buff + LSN_STORE_SIZE, header, length);
  bzero(buff + LSN_STORE_SIZE + length,
        share->max_index_block_size - LSN_STORE_SIZE - length);
  bfill(buff + share->block_size - KEYPAGE_CHECKSUM_SIZE,
        KEYPAGE_CHECKSUM_SIZE, (uchar) 255);

  result= 0;
  if (unlock_method == PAGECACHE_LOCK_WRITE &&
      pagecache_write(share->pagecache,
                      &share->kfile, root_page, 0,
                      buff, PAGECACHE_PLAIN_PAGE,
                      unlock_method, unpin_method,
                      PAGECACHE_WRITE_DELAY, &page_link.link,
                      LSN_IMPOSSIBLE))
    result= 1;

  /* Mark page to be unlocked and written at _ma_unpin_all_pages() */
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= 1;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  DBUG_RETURN(result);

err:
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           PAGECACHE_LOCK_WRITE_UNLOCK,
                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 0, FALSE);
  DBUG_RETURN(result);
}
+
+
/**
  @brief Apply LOGREC_REDO_INDEX_FREE_PAGE

  @param info    Maria handler
  @param lsn     LSN of the record being applied
  @param header  Header (without FILEID)

  @return Operation status
    @retval 0 OK
    @retval 1 Error

  NOTE(review): on a failed pagecache_read() the function actually returns
  (uint) my_errno, not 1 as documented above; callers presumably only test
  for non-zero -- confirm. Also confirm page_link.link is valid in the err
  path when pagecache_read() failed.
*/

uint _ma_apply_redo_index_free_page(MARIA_HA *info,
                                    LSN lsn,
                                    const uchar *header)
{
  pgcache_page_no_t page= page_korr(header);
  pgcache_page_no_t free_page= page_korr(header + PAGE_STORE_SIZE);
  my_off_t old_link;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  uchar *buff;
  int result;
  DBUG_ENTER("_ma_apply_redo_index_free_page");
  DBUG_PRINT("enter", ("page: %lu  free_page: %lu",
                       (ulong) page, (ulong) free_page));

  share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS |
                          STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  /* Only update state if the log record is newer than the state on disk */
  if (cmp_translog_addr(lsn, share->state.is_of_horizon) >= 0)
    share->state.key_del= (my_off_t) page * share->block_size;

  old_link=  ((free_page != IMPOSSIBLE_PAGE_NO) ?
              (my_off_t) free_page * share->block_size :
              HA_OFFSET_ERROR);
  if (!(buff= pagecache_read(share->pagecache, &share->kfile,
                             page, 0, 0,
                             PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
                             &page_link.link)))
  {
    result= (uint) my_errno;
    goto err;
  }
  if (lsn_korr(buff) >= lsn)
  {
    /* Already applied */
    result= 0;
    goto err;
  }
  /* Free page: clear header, mark as deleted and link into free list */
  bzero(buff + LSN_STORE_SIZE, share->keypage_header - LSN_STORE_SIZE);
  _ma_store_keynr(share, buff, (uchar) MARIA_DELETE_KEY_NR);
  _ma_store_page_used(share, buff, share->keypage_header + 8);
  mi_sizestore(buff + share->keypage_header, old_link);

#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
  {
    bzero(buff + share->keypage_header + 8,
          share->block_size - share->keypage_header - 8 -
          KEYPAGE_CHECKSUM_SIZE);
  }
#endif

  /* Mark page to be unlocked and written at _ma_unpin_all_pages() */
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= 1;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  DBUG_RETURN(0);

err:
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           PAGECACHE_LOCK_WRITE_UNLOCK,
                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 0, FALSE);
  DBUG_RETURN(result);
}
+
+
+/**
+ @brief Apply LOGREC_REDO_INDEX
+
+ @fn ma_apply_redo_index()
+ @param info Maria handler
+ @param header Header (without FILEID)
+
+ @notes
+ Data for this part is a set of logical instructions of how to
+ construct the key page.
+
+ Information of the layout of the components for REDO_INDEX:
+
+ Name Parameters (in byte) Information
+ KEY_OP_OFFSET 2 Set position for next operations
+ KEY_OP_SHIFT 2 (signed int) How much to shift down or up
+ KEY_OP_CHANGE 2 length, data Data to replace at 'pos'
+ KEY_OP_ADD_PREFIX 2 move-length How much data should be moved up
+ 2 change-length Data to be replaced at page start
+ KEY_OP_DEL_PREFIX 2 length Bytes to be deleted at page start
+ KEY_OP_ADD_SUFFIX 2 length, data Add data to end of page
+ KEY_OP_DEL_SUFFIX 2 length Reduce page length with this
+ Sets position to start of page
+ KEY_OP_CHECK 6 page_length[2],CRC Used only when debugging
+ This may be followed by page_length
+ of data (until end of log record)
+ KEY_OP_COMPACT_PAGE 6 transid
+ KEY_OP_SET_PAGEFLAG 1 flag for page
+ KEY_OP_MAX_PAGELENGTH 0 Set page to max length
+ KEY_OP_DEBUG 1 Info where logging was done
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+long my_counter= 0;                             /* Only used for debugging */
+
+uint _ma_apply_redo_index(MARIA_HA *info,
+                          LSN lsn, const uchar *header, uint head_length)
+{
+  MARIA_SHARE *share= info->s;
+  pgcache_page_no_t page_pos= page_korr(header);
+  MARIA_PINNED_PAGE page_link;
+  uchar *buff;
+  const uchar *header_end= header + head_length;
+  uint page_offset= 0, org_page_length;
+  uint nod_flag, page_length, keypage_header, keynr;
+  uint max_page_size= share->max_index_block_size;
+  int result;
+  MARIA_PAGE page;
+  DBUG_ENTER("_ma_apply_redo_index");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) page_pos));
+
+  /* Set header to point at key data */
+  header+= PAGE_STORE_SIZE;
+
+  if (!(buff= pagecache_read(share->pagecache, &share->kfile,
+                             page_pos, 0, 0,
+                             PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                             &page_link.link)))
+  {
+    result= 1;
+    goto err;
+  }
+  if (lsn_korr(buff) >= lsn)
+  {
+    /* Already applied */
+    DBUG_PRINT("info", ("Page is up to date, skipping redo"));
+    result= 0;
+    goto err;
+  }
+
+  keynr= _ma_get_keynr(share, buff);
+  _ma_page_setup(&page, info, share->keyinfo + keynr, page_pos, buff);
+  nod_flag= page.node;
+  org_page_length= page_length= page.size;
+
+  keypage_header= share->keypage_header;
+  DBUG_PRINT("redo", ("page_length: %u", page_length));
+
+  /* Apply modifications to page */
+  do
+  {
+    switch ((enum en_key_op) (*header++)) {
+    case KEY_OP_OFFSET:                         /* 1 */
+      page_offset= uint2korr(header);
+      header+= 2;
+      DBUG_PRINT("redo", ("key_op_offset: %u", page_offset));
+      DBUG_ASSERT(page_offset >= keypage_header && page_offset <= page_length);
+      break;
+    case KEY_OP_SHIFT:                          /* 2 */
+    {
+      int length= sint2korr(header);
+      header+= 2;
+      DBUG_PRINT("redo", ("key_op_shift: %d", length));
+      DBUG_ASSERT(page_offset != 0 && page_offset <= page_length &&
+                  page_length + length <= max_page_size);
+
+      if (length < 0)
+      {
+        /* Shift down: data after the hole moves towards page start */
+        DBUG_ASSERT(page_offset - length <= page_length);
+        bmove(buff + page_offset, buff + page_offset - length,
+              page_length - page_offset + length);
+      }
+      else if (page_length != page_offset)
+        bmove_upp(buff + page_length + length, buff + page_length,
+                  page_length - page_offset);
+      page_length+= length;
+      break;
+    }
+    case KEY_OP_CHANGE:                         /* 3 */
+    {
+      uint length= uint2korr(header);
+      DBUG_PRINT("redo", ("key_op_change: %u", length));
+      DBUG_ASSERT(page_offset != 0 && page_offset + length <= page_length);
+
+      memcpy(buff + page_offset, header + 2 , length);
+      page_offset+= length;             /* Put offset after changed length */
+      header+= 2 + length;
+      break;
+    }
+    case KEY_OP_ADD_PREFIX:                     /* 4 */
+    {
+      uint insert_length= uint2korr(header);
+      uint changed_length= uint2korr(header+2);
+      DBUG_PRINT("redo", ("key_op_add_prefix: %u %u",
+                          insert_length, changed_length));
+
+      DBUG_ASSERT(insert_length <= changed_length &&
+                  page_length + changed_length <= max_page_size);
+
+      bmove_upp(buff + page_length + insert_length, buff + page_length,
+                page_length - keypage_header);
+      memcpy(buff + keypage_header, header + 4 , changed_length);
+      header+= 4 + changed_length;
+      page_length+= insert_length;
+      break;
+    }
+    case KEY_OP_DEL_PREFIX:                     /* 5 */
+    {
+      uint length= uint2korr(header);
+      header+= 2;
+      DBUG_PRINT("redo", ("key_op_del_prefix: %u", length));
+      DBUG_ASSERT(length <= page_length - keypage_header);
+
+      bmove(buff + keypage_header, buff + keypage_header +
+            length, page_length - keypage_header - length);
+      page_length-= length;
+
+      page_offset= keypage_header;      /* Prepare for change */
+      break;
+    }
+    case KEY_OP_ADD_SUFFIX:                     /* 6 */
+    {
+      uint insert_length= uint2korr(header);
+      DBUG_PRINT("redo", ("key_op_add_suffix: %u", insert_length));
+      DBUG_ASSERT(page_length + insert_length <= max_page_size);
+      memcpy(buff + page_length, header+2, insert_length);
+
+      page_length+= insert_length;
+      header+= 2 + insert_length;
+      break;
+    }
+    case KEY_OP_DEL_SUFFIX:                     /* 7 */
+    {
+      uint del_length= uint2korr(header);
+      header+= 2;
+      DBUG_PRINT("redo", ("key_op_del_suffix: %u", del_length));
+      DBUG_ASSERT(page_length - del_length >= keypage_header);
+      page_length-= del_length;
+      break;
+    }
+    case KEY_OP_CHECK:                          /* 8 */
+    {
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+      uint check_page_length;
+      ha_checksum crc;
+      check_page_length= uint2korr(header);
+      crc= uint4korr(header+2);
+      _ma_store_page_used(share, buff, page_length);
+      if (check_page_length != page_length ||
+          crc != (uint32) my_checksum(0, buff + LSN_STORE_SIZE,
+                                      page_length - LSN_STORE_SIZE))
+      {
+        DBUG_DUMP("KEY_OP_CHECK bad page", buff, page_length);
+        if (header + 6 + check_page_length <= header_end)
+        {
+          DBUG_DUMP("KEY_OP_CHECK org page", header + 6, check_page_length);
+        }
+        DBUG_ASSERT("crc failure in REDO_INDEX" == 0);
+      }
+#endif
+      DBUG_PRINT("redo", ("key_op_check"));
+      /*
+        This is the last entry in the block and it can contain page_length
+        data or not
+      */
+      DBUG_ASSERT(header + 6 == header_end ||
+                  header + 6 + page_length == header_end);
+      header= header_end;
+      break;
+    }
+    case KEY_OP_DEBUG:
+      DBUG_PRINT("redo", ("Debug: %u", (uint) header[0]));
+      header++;
+      break;
+    case KEY_OP_DEBUG_2:
+      DBUG_PRINT("redo", ("org_page_length: %u new_page_length: %u",
+                          uint2korr(header), uint2korr(header+2)));
+      header+= 4;
+      break;
+    case KEY_OP_MAX_PAGELENGTH:
+      DBUG_PRINT("redo", ("key_op_max_page_length"));
+      page_length= max_page_size;
+      break;
+    case KEY_OP_MULTI_COPY:                     /* 9 */
+    {
+      /*
+        List of fixed-len memcpy() operations with their source located inside
+        the page. The log record's piece looks like:
+        first the length 'full_length' to be used by memcpy()
+        then the number of bytes used by the list of (to,from) pairs
+        then the (to,from) pairs, so we do:
+        for (t,f) in [list of (to,from) pairs]:
+            memcpy(t, f, full_length).
+      */
+      uint full_length, log_memcpy_length;
+      const uchar *log_memcpy_end;
+
+      DBUG_PRINT("redo", ("key_op_multi_copy"));
+      full_length= uint2korr(header);
+      header+= 2;
+      log_memcpy_length= uint2korr(header);
+      header+= 2;
+      log_memcpy_end= header + log_memcpy_length;
+      DBUG_ASSERT(full_length <= max_page_size);
+      while (header < log_memcpy_end)
+      {
+        uint to, from;
+        to= uint2korr(header);
+        header+= 2;
+        from= uint2korr(header);
+        header+= 2;
+        /* "from" is a place in the existing page */
+        DBUG_ASSERT(max(from, to) < max_page_size);
+        memcpy(buff + to, buff + from, full_length);
+      }
+      break;
+    }
+    case KEY_OP_SET_PAGEFLAG:
+      DBUG_PRINT("redo", ("key_op_set_pageflag"));
+      buff[KEYPAGE_TRANSFLAG_OFFSET]= *header++;
+      break;
+    case KEY_OP_COMPACT_PAGE:
+    {
+      TrID transid= transid_korr(header);
+
+      DBUG_PRINT("redo", ("key_op_compact_page"));
+      header+= TRANSID_SIZE;
+      if (_ma_compact_keypage(&page, transid))
+      {
+        result= 1;
+        goto err;
+      }
+      page_length= page.size;
+      /*
+        BUG FIX: 'break' was missing here, so a successful compact fell
+        through into the default: branch and aborted the redo with an error.
+      */
+      break;
+    }
+    case KEY_OP_NONE:
+    default:
+      DBUG_ASSERT(0);
+      result= 1;
+      goto err;
+    }
+  } while (header < header_end);
+  DBUG_ASSERT(header == header_end);
+
+  /* Write modified page */
+  page.size= page_length;
+  _ma_store_page_used(share, buff, page_length);
+
+  /*
+    Clean old stuff up. Gives us better compression if we archive things
+    and makes things easier to debug
+  */
+  if (page_length < org_page_length)
+    bzero(buff + page_length, org_page_length-page_length);
+
+  /* Mark page to be unlocked and written at _ma_unpin_all_pages() */
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= 1;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  DBUG_RETURN(0);
+
+err:
+  pagecache_unlock_by_link(share->pagecache, page_link.link,
+                           PAGECACHE_LOCK_WRITE_UNLOCK,
+                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                           LSN_IMPOSSIBLE, 0, FALSE);
+  if (result)
+    _ma_mark_file_crashed(share);
+  DBUG_RETURN(result);
+}
+
+
+/****************************************************************************
+ Undo of key block changes
+****************************************************************************/
+
+/**
+ @brief Undo of insert of key (ie, delete the inserted key)
+*/
+
+my_bool _ma_apply_undo_key_insert(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, uint length)
+{
+  LSN lsn;
+  my_bool res;
+  uint keynr;
+  uchar key_buff[MARIA_MAX_KEY_BUFF];
+  MARIA_SHARE *share= info->s;
+  MARIA_KEY key;
+  my_off_t new_root;
+  struct st_msg_to_write_hook_for_undo_key msg;
+  DBUG_ENTER("_ma_apply_undo_key_insert");
+
+  /* Tell check/repair that the table has been changed by recovery */
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS |
+                          STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+  /* Header layout: key number, then the packed key itself */
+  keynr= key_nr_korr(header);
+  length-= KEY_NR_STORE_SIZE;
+
+  /* We have to copy key as _ma_ck_real_delete() may change it */
+  memcpy(key_buff, header + KEY_NR_STORE_SIZE, length);
+  DBUG_DUMP("key_buff", key_buff, length);
+
+  new_root= share->state.key_root[keynr];
+  /*
+    Change the key to an internal structure.
+    It's safe to have SEARCH_USER_KEY_HAS_TRANSID even if there isn't
+    a transaction id, as ha_key_cmp() will stop comparison when key length
+    is reached.
+    For index with transid flag, the ref_length of the key is not correct.
+    This should however be safe as long as this key is only used for
+    comparison against other keys (not for packing or for read-next etc as
+    in this case we use data_length + ref_length, which is correct).
+  */
+  key.keyinfo= share->keyinfo + keynr;
+  key.data= key_buff;
+  key.data_length= length - share->rec_reflength;
+  key.ref_length= share->rec_reflength;
+  key.flag= SEARCH_USER_KEY_HAS_TRANSID;
+
+  /* Undo of an insert is a delete; R-tree indexes have their own delete */
+  res= ((share->keyinfo[keynr].key_alg == HA_KEY_ALG_RTREE) ?
+        maria_rtree_real_delete(info, &key, &new_root) :
+        _ma_ck_real_delete(info, &key, &new_root));
+  if (res)
+    _ma_mark_file_crashed(share);
+  msg.root= &share->state.key_root[keynr];
+  msg.value= new_root;
+  msg.keynr= keynr;
+
+  /*
+    Write the CLR record.  If the delete changed the tree root we must use
+    the _WITH_ROOT variant so the new root is made durable too.
+  */
+  if (_ma_write_clr(info, undo_lsn, *msg.root == msg.value ?
+                    LOGREC_UNDO_KEY_INSERT : LOGREC_UNDO_KEY_INSERT_WITH_ROOT,
+                    0, 0, &lsn, (void*) &msg))
+    res= 1;
+
+  /* Release key_del lock (if taken) before unpinning the dirtied pages */
+  _ma_fast_unlock_key_del(info);
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Undo of delete of key (ie, insert the deleted key)
+
+ @param with_root If the UNDO is UNDO_KEY_DELETE_WITH_ROOT
+*/
+
+my_bool _ma_apply_undo_key_delete(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, uint length,
+                                  my_bool with_root)
+{
+  LSN lsn;
+  my_bool res;
+  uint keynr, skip_bytes;
+  uchar key_buff[MARIA_MAX_KEY_BUFF];
+  MARIA_SHARE *share= info->s;
+  my_off_t new_root;
+  struct st_msg_to_write_hook_for_undo_key msg;
+  MARIA_KEY key;
+  DBUG_ENTER("_ma_apply_undo_key_delete");
+
+  /* Tell check/repair that the table has been changed by recovery */
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS |
+                          STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+  keynr= key_nr_korr(header);
+  /* A _WITH_ROOT record carries an extra stored root page before the key */
+  skip_bytes= KEY_NR_STORE_SIZE + (with_root ? PAGE_STORE_SIZE : 0);
+  header+= skip_bytes;
+  length-= skip_bytes;
+
+  /* We have to copy key as _ma_ck_real_write_btree() may change it */
+  memcpy(key_buff, header, length);
+  DBUG_DUMP("key", key_buff, length);
+
+  /*
+    Build the internal key structure; see _ma_apply_undo_key_insert() for
+    why SEARCH_USER_KEY_HAS_TRANSID and this ref_length split are safe here.
+  */
+  key.keyinfo= share->keyinfo + keynr;
+  key.data= key_buff;
+  key.data_length= length - share->rec_reflength;
+  key.ref_length= share->rec_reflength;
+  key.flag= SEARCH_USER_KEY_HAS_TRANSID;
+
+  /* Undo of a delete is an insert; R-tree indexes have their own insert */
+  new_root= share->state.key_root[keynr];
+  res= (share->keyinfo[keynr].key_alg == HA_KEY_ALG_RTREE) ?
+    maria_rtree_insert_level(info, &key, -1, &new_root) :
+    _ma_ck_real_write_btree(info, &key, &new_root,
+                            share->keyinfo[keynr].write_comp_flag |
+                            key.flag);
+  if (res)
+    _ma_mark_file_crashed(share);
+
+  msg.root= &share->state.key_root[keynr];
+  msg.value= new_root;
+  msg.keynr= keynr;
+  /*
+    Write the CLR record.  If the insert changed the tree root we must use
+    the _WITH_ROOT variant so the new root is made durable too.
+  */
+  if (_ma_write_clr(info, undo_lsn,
+                    *msg.root == msg.value ?
+                    LOGREC_UNDO_KEY_DELETE : LOGREC_UNDO_KEY_DELETE_WITH_ROOT,
+                    0, 0, &lsn,
+                    (void*) &msg))
+    res= 1;
+
+  /* Release key_del lock (if taken) before unpinning the dirtied pages */
+  _ma_fast_unlock_key_del(info);
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(res);
+}
+
+
+/****************************************************************************
+ Handle some local variables
+****************************************************************************/
+
+/**
+ @brief lock key_del for other threads usage
+
+ @fn _ma_lock_key_del()
+ @param info Maria handler
+ @param insert_at_end Set to 1 if we are doing an insert
+
+ @note
+ To allow higher concurrency in the common case where we do inserts
+ and we don't have any linked blocks we do the following:
+ - Mark in info->key_del_used that we are not using key_del
+ - Return at once (without marking key_del as used)
+
+ This is safe as we in this case don't write key_del_current into
+ the redo log and during recover we are not updating key_del.
+
+ @retval 1 Use page at end of file
+ @retval 0 Use page at share->key_del_current
+*/
+
+my_bool _ma_lock_key_del(MARIA_HA *info, my_bool insert_at_end)
+{
+  MARIA_SHARE *share= info->s;
+
+  /*
+    info->key_del_used is 0 initially.
+    If the caller needs a block (_ma_new()), we look at the free list:
+    - looks empty? then caller will create a new block at end of file and
+    remember (through info->key_del_used==2) that it will not change
+    state.key_del and does not need to wake up waiters as nobody will wait for
+    it.
+    - non-empty? then we wait for other users of the state.key_del list to
+    have finished, then we lock this list (through share->key_del_used==1)
+    because we need to prevent some other thread to also read state.key_del
+    and use the same page as ours. We remember through info->key_del_used==1
+    that we will have to set state.key_del at unlock time and wake up
+    waiters.
+    If the caller wants to free a block (_ma_dispose()), both "empty" and
+    "non-empty" are handled as "non-empty" is handled above.
+    When we are ready to unlock, we copy share->key_del_current into
+    state.key_del. Unlocking happens when writing the UNDO log record, that
+    can make a long lock time.
+    Why we wrote "*looks* empty": because we are looking at state.key_del
+    which may be slightly old (share->key_del_current may be more recent and
+    exact): when we want a new page, we tolerate to treat "there was no free
+    page 1 millisecond ago" as "there is no free page". It's ok to non-pop
+    (_ma_new(), page will be found later anyway) but it's not ok to non-push
+    (_ma_dispose(), page would be lost).
+    When we leave this function, info->key_del_used is always 1 or 2.
+  */
+  if (info->key_del_used != 1)
+  {
+    pthread_mutex_lock(&share->key_del_lock);
+    /* Fast path: free list looks empty and we only want a new page */
+    if (share->state.key_del == HA_OFFSET_ERROR && insert_at_end)
+    {
+      pthread_mutex_unlock(&share->key_del_lock);
+      info->key_del_used= 2;            /* insert-with-append */
+      return 1;
+    }
+#ifdef THREAD
+    /* Wait until no other thread owns the key_del list */
+    while (share->key_del_used)
+      pthread_cond_wait(&share->key_del_cond, &share->key_del_lock);
+#endif
+    info->key_del_used= 1;
+    share->key_del_used= 1;
+    /* Take a private snapshot of the list head; published at unlock time */
+    share->key_del_current= share->state.key_del;
+    pthread_mutex_unlock(&share->key_del_lock);
+  }
+  return share->key_del_current == HA_OFFSET_ERROR;
+}
+
+
+/**
+  @brief Publish changes to key_del and release it for other threads
+
+  @notes
+    In case of many threads using the maria table, we always have a lock
+    on the translog when coming here.
+*/
+
+void _ma_unlock_key_del(MARIA_HA *info)
+{
+  MARIA_SHARE *share;
+  DBUG_ASSERT(info->key_del_used);
+  if (info->key_del_used != 1)
+  {
+    /* insert-with-append (value 2): we never took the shared list */
+    info->key_del_used= 0;
+    return;
+  }
+  share= info->s;
+  pthread_mutex_lock(&share->key_del_lock);
+  share->key_del_used= 0;
+  share->state.key_del= share->key_del_current;
+  pthread_mutex_unlock(&share->key_del_lock);
+  pthread_cond_signal(&share->key_del_cond);
+  info->key_del_used= 0;
+}
diff --git a/storage/maria/ma_key_recover.h b/storage/maria/ma_key_recover.h
new file mode 100644
index 00000000000..d6b69010d5d
--- /dev/null
+++ b/storage/maria/ma_key_recover.h
@@ -0,0 +1,122 @@
+/* Copyright (C) 2007 Michael Widenius
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ When we have finished the write/update/delete of a row, we have cleanups to
+ do. For now it is signalling to Checkpoint that all dirtied pages have
+ their rec_lsn set and page LSN set (_ma_unpin_all_pages() has been called),
+ and that bitmap pages are correct (_ma_bitmap_release_unused() has been
+ called).
+*/
+
+/*
+  Message passed (via the extra_msg/hook_arg mechanism) to the log write
+  hook when a CLR_END record is written, see _ma_write_clr().
+*/
+
+struct st_msg_to_write_hook_for_clr_end
+{
+  LSN previous_undo_lsn;                /* LSN to roll the undo chain back to */
+  enum translog_record_type undone_record_type; /* Which UNDO was undone */
+  ha_checksum checksum_delta;           /* Table checksum change, if any */
+  void *extra_msg;                      /* Per-record-type extra data */
+};
+
+/*
+  Message for the UNDO_KEY_INSERT/UNDO_KEY_DELETE write hooks:
+  the appliers fill in the key root to update and its (possibly new) value
+  (see _ma_apply_undo_key_insert()/_ma_apply_undo_key_delete()).
+*/
+struct st_msg_to_write_hook_for_undo_key
+{
+  my_off_t *root;                       /* Points at state.key_root[keynr] */
+  my_off_t value;                       /* New root page offset */
+  uint keynr;                           /* Index number */
+  ulonglong auto_increment;             /* Presumably new auto_increment
+                                           value; set by other callers --
+                                           not used in this file's appliers */
+};
+
+
+/* Function declarations for some redo functions */
+
+my_bool _ma_write_clr(MARIA_HA *info, LSN undo_lsn,
+                      enum translog_record_type undo_type,
+                      my_bool store_checksum, ha_checksum checksum,
+                      LSN *res_lsn, void *extra_msg);
+int _ma_write_undo_key_insert(MARIA_HA *info, const MARIA_KEY *key,
+                              my_off_t *root, my_off_t new_root,
+                              LSN *res_lsn);
+my_bool _ma_write_undo_key_delete(MARIA_HA *info, const MARIA_KEY *key,
+                                  my_off_t new_root, LSN *res_lsn);
+my_bool write_hook_for_clr_end(enum translog_record_type type,
+                               TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+                               void *hook_arg);
+extern my_bool write_hook_for_undo_key(enum translog_record_type type,
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg);
+extern my_bool write_hook_for_undo_key_insert(enum translog_record_type type,
+                                              TRN *trn, MARIA_HA *tbl_info,
+                                              LSN *lsn, void *hook_arg);
+extern my_bool write_hook_for_undo_key_delete(enum translog_record_type type,
+                                              TRN *trn, MARIA_HA *tbl_info,
+                                              LSN *lsn, void *hook_arg);
+
+/* Logging of index page modifications (REDO_INDEX record construction) */
+
+my_bool _ma_log_prefix(MARIA_PAGE *page, uint changed_length, int move_length,
+                       enum en_key_debug debug_marker);
+my_bool _ma_log_suffix(MARIA_PAGE *page, uint org_length,
+                       uint new_length);
+my_bool _ma_log_add(MARIA_PAGE *page, uint buff_length, uchar *key_pos,
+                    uint changed_length, int move_length,
+                    my_bool handle_overflow,
+                    enum en_key_debug debug_marker);
+my_bool _ma_log_delete(MARIA_PAGE *page, const uchar *key_pos,
+                       uint changed_length, uint move_length,
+                       uint append_length, enum en_key_debug debug_marker);
+my_bool _ma_log_change(MARIA_PAGE *page, const uchar *key_pos, uint length,
+                       enum en_key_debug debug_marker);
+my_bool _ma_log_new(MARIA_PAGE *page, my_bool root_page);
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+void _ma_log_key_changes(MARIA_PAGE *ma_page, LEX_CUSTRING *log_array,
+                         uchar *log_pos, uint *changed_length,
+                         uint *translog_parts);
+#else
+#define _ma_log_key_changes(A,B,C,D,E)
+#endif
+
+/* REDO/UNDO appliers used during recovery */
+
+uint _ma_apply_redo_index_new_page(MARIA_HA *info, LSN lsn,
+                                   const uchar *header, uint length);
+uint _ma_apply_redo_index_free_page(MARIA_HA *info, LSN lsn,
+                                    const uchar *header);
+uint _ma_apply_redo_index(MARIA_HA *info,
+                          LSN lsn, const uchar *header, uint length);
+
+my_bool _ma_apply_undo_key_insert(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, uint length);
+my_bool _ma_apply_undo_key_delete(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, uint length,
+                                  my_bool with_root);
+
+/*
+  Signal that the row operation is complete: reset the transaction's
+  rec_lsn so Checkpoint no longer waits on this row's dirtied pages.
+*/
+static inline void _ma_finalize_row(MARIA_HA *info)
+{
+  info->trn->rec_lsn= LSN_IMPOSSIBLE;
+}
+
+/*
+  Unpinning is often the last operation before finalizing:
+  unpin all pages dirtied by the row operation (stamping them with undo_lsn)
+  and then finalize the row (see _ma_finalize_row()).
+*/
+
+static inline void _ma_unpin_all_pages_and_finalize_row(MARIA_HA *info,
+                                                        LSN undo_lsn)
+{
+  _ma_unpin_all_pages(info, undo_lsn);
+  _ma_finalize_row(info);
+}
+
+extern my_bool _ma_lock_key_del(MARIA_HA *info, my_bool insert_at_end);
+extern void _ma_unlock_key_del(MARIA_HA *info);
+
+/* Release key_del only if this handler actually marked it as used */
+static inline void _ma_fast_unlock_key_del(MARIA_HA *info)
+{
+  if (info->key_del_used)
+    _ma_unlock_key_del(info);
+}
diff --git a/storage/maria/ma_keycache.c b/storage/maria/ma_keycache.c
new file mode 100644
index 00000000000..39fc7d421ae
--- /dev/null
+++ b/storage/maria/ma_keycache.c
@@ -0,0 +1,164 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Key cache assignments
+*/
+
+#include "maria_def.h"
+
+/*
+  Assign pages of the index file for a table to a key cache
+
+  SYNOPSIS
+    maria_assign_to_pagecache()
+    info            open table
+    key_map         map of indexes to assign to the key cache
+    pagecache_ptr   pointer to the key cache handle
+    assign_lock     Mutex to lock during assignment
+
+  PREREQUISITES
+    One must have a READ lock or a WRITE lock on the table when calling
+    the function to ensure that there is no other writer to it.
+
+    The caller must also ensure that one doesn't call this function from
+    two different threads with the same table.
+
+  NOTES
+    At present pages for all indexes must be assigned to the same key cache.
+    In future only pages for indexes specified in the key_map parameter
+    of the table will be assigned to the specified key cache.
+
+  RETURN VALUE
+    0  If a success
+    #  Error code
+*/
+
+int maria_assign_to_pagecache(MARIA_HA *info,
+                              ulonglong key_map __attribute__((unused)),
+                              PAGECACHE *pagecache)
+{
+  int error= 0;
+  MARIA_SHARE* share= info->s;
+  DBUG_ENTER("maria_assign_to_pagecache");
+  DBUG_PRINT("enter",
+             ("old_pagecache_handle: 0x%lx new_pagecache_handle: 0x%lx",
+              (long) share->pagecache, (long) pagecache));
+
+  /*
+    Skip operation if we didn't change key cache. This can happen if we
+    call this for all open instances of the same table
+  */
+  if (share->pagecache == pagecache)
+    DBUG_RETURN(0);
+
+  /*
+    First flush all blocks for the table in the old key cache.
+    This is to ensure that the disk is consistent with the data pages
+    in memory (which may not be the case if the table uses delayed_key_write)
+
+    Note that some other read thread may still fill in the key cache with
+    new blocks during this call and after, but this doesn't matter as
+    all threads will start using the new key cache for their next call to
+    maria library and we know that there will not be any changed blocks
+    in the old key cache.
+  */
+
+  if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE))
+  {
+    error= my_errno;
+    maria_print_error(info->s, HA_ERR_CRASHED);
+    maria_mark_crashed(info);           /* Mark that table must be checked */
+  }
+
+  /*
+    Flush the new key cache for this file. This is needed to ensure
+    that there are no old blocks (with outdated data) left in the new key
+    cache from an earlier assign_to_keycache operation
+
+    (This can never fail as there is never any not written data in the
+    new key cache)
+  */
+  (void) flush_pagecache_blocks(pagecache, &share->kfile, FLUSH_RELEASE);
+
+  /*
+    Ensure that setting the key cache and changing the multi_pagecache
+    is done atomically
+  */
+  pthread_mutex_lock(&share->intern_lock);
+  /*
+    Tell all threads to use the new key cache.
+    This should be seen at the latest at the next call to a maria function.
+  */
+  share->pagecache= pagecache;
+
+  /* store the key cache in the global hash structure for future opens */
+  if (multi_pagecache_set((uchar*) share->unique_file_name.str,
+                          share->unique_file_name.length,
+                          share->pagecache))
+    error= my_errno;
+  pthread_mutex_unlock(&share->intern_lock);
+  DBUG_RETURN(error);
+}
+
+
+/*
+ Change all MARIA entries that uses one key cache to another key cache
+
+ SYNOPSIS
+ maria_change_pagecache()
+ old_pagecache Old key cache
+ new_pagecache New key cache
+
+ NOTES
+ This is used when we delete one key cache.
+
+ To handle the case where some other threads tries to open an MARIA
+ table associated with the to-be-deleted key cache while this operation
+ is running, we have to call 'multi_pagecache_change()' from this
+ function while we have a lock on the MARIA table list structure.
+
+ This is safe as long as it's only MARIA that is using this specific
+ key cache.
+*/
+
+
+void maria_change_pagecache(PAGECACHE *old_pagecache,
+                            PAGECACHE *new_pagecache)
+{
+  LIST *entry;
+  DBUG_ENTER("maria_change_pagecache");
+
+  /*
+    Hold the global open-table list lock so no table can be closed (or
+    opened) while we walk the list and re-point shares at the new cache.
+  */
+  pthread_mutex_lock(&THR_LOCK_maria);
+  for (entry= maria_open_list; entry; entry= entry->next)
+  {
+    MARIA_HA *handler= (MARIA_HA*) entry->data;
+    if (handler->s->pagecache == old_pagecache)
+      maria_assign_to_pagecache(handler, (ulonglong) ~0, new_pagecache);
+  }
+
+  /*
+    Must be called while still holding THR_LOCK_maria so that a concurrent
+    open cannot be associated with the old (to-be-deleted) key cache.
+  */
+  multi_pagecache_change(old_pagecache, new_pagecache);
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_locking.c b/storage/maria/ma_locking.c
new file mode 100644
index 00000000000..6bb308e5959
--- /dev/null
+++ b/storage/maria/ma_locking.c
@@ -0,0 +1,554 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Locking of Maria-tables.
+ Must be the first request before doing any further calls to any Maria
+ function. Is used to allow many processes to use the same
+ non-transactional Maria table.
+*/
+
+#include "ma_ftdefs.h"
+
+ /* lock table by F_UNLCK, F_RDLCK or F_WRLCK */
+
+int maria_lock_database(MARIA_HA *info, int lock_type)
+{
+ int error;
+ uint count;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("maria_lock_database");
+ DBUG_PRINT("enter",("lock_type: %d old lock %d r_locks: %u w_locks: %u "
+ "global_changed: %d open_count: %u name: '%s'",
+ lock_type, info->lock_type, share->r_locks,
+ share->w_locks,
+ share->global_changed, share->state.open_count,
+ share->index_file_name.str));
+ /* Read-only tables and no-op lock transitions need no bookkeeping */
+ if (share->options & HA_OPTION_READ_ONLY_DATA ||
+ info->lock_type == lock_type)
+ DBUG_RETURN(0);
+ if (lock_type == F_EXTRA_LCK) /* Used by TMP tables */
+ {
+ ++share->w_locks;
+ ++share->tot_locks;
+ info->lock_type= lock_type;
+ DBUG_RETURN(0);
+ }
+
+ error=0;
+ pthread_mutex_lock(&share->intern_lock);
+ if (share->kfile.file >= 0) /* May only be false on windows */
+ {
+ switch (lock_type) {
+ case F_UNLCK:
+ maria_ftparser_call_deinitializer(info);
+ if (info->lock_type == F_RDLCK)
+ {
+ count= --share->r_locks;
+ if (share->lock_restore_status)
+ (*share->lock_restore_status)(info);
+ }
+ else
+ {
+ count= --share->w_locks;
+ if (share->lock.update_status)
+ _ma_update_status_with_lock(info);
+ }
+ --share->tot_locks;
+ /* Last writer gone: flush data (and possibly index) pages */
+ if (info->lock_type == F_WRLCK && !share->w_locks)
+ {
+ /* pages of transactional tables get flushed at Checkpoint */
+ if (!share->base.born_transactional && !share->temporary &&
+ _ma_flush_table_files(info,
+ share->delay_key_write ? MARIA_FLUSH_DATA :
+ MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_KEEP, FLUSH_KEEP))
+ error= my_errno;
+ }
+ if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+ {
+ if (end_io_cache(&info->rec_cache))
+ {
+ error=my_errno;
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info);
+ }
+ }
+ /* count == 0: this was the very last lock on the table */
+ if (!count)
+ {
+ DBUG_PRINT("info",("changed: %u w_locks: %u",
+ (uint) share->changed, share->w_locks));
+ if (share->changed && !share->w_locks)
+ {
+#ifdef HAVE_MMAP
+ if ((share->mmaped_length !=
+ share->state.state.data_file_length) &&
+ (share->nonmmaped_inserts > MAX_NONMAPPED_INSERTS))
+ {
+ if (share->lock_key_trees)
+ rw_wrlock(&share->mmap_lock);
+ _ma_remap_file(info, share->state.state.data_file_length);
+ share->nonmmaped_inserts= 0;
+ if (share->lock_key_trees)
+ rw_unlock(&share->mmap_lock);
+ }
+#endif
+#ifdef EXTERNAL_LOCKING
+ share->state.process= share->last_process=share->this_process;
+ share->state.unique= info->last_unique= info->this_unique;
+ share->state.update_count= info->last_loop= ++info->this_loop;
+#endif
+ /* transactional tables rather flush their state at Checkpoint */
+ if (!share->base.born_transactional)
+ {
+ if (_ma_state_info_write_sub(share->kfile.file, &share->state,
+ MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET))
+ error= my_errno;
+ else
+ {
+ /* A value of 0 below means "state flushed" */
+ share->changed= 0;
+ }
+ }
+ if (maria_flush)
+ {
+ if (_ma_sync_table_files(info))
+ error= my_errno;
+ }
+ else
+ share->not_flushed=1;
+ if (error)
+ {
+ maria_print_error(info->s, HA_ERR_CRASHED);
+ maria_mark_crashed(info);
+ }
+ }
+ }
+ info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+ info->lock_type= F_UNLCK;
+ break;
+ case F_RDLCK:
+ if (info->lock_type == F_WRLCK)
+ {
+ /*
+ Change RW to READONLY
+
+ mysqld does not turn write locks to read locks,
+ so we're never here in mysqld.
+ */
+ share->w_locks--;
+ share->r_locks++;
+ info->lock_type=lock_type;
+ break;
+ }
+#ifdef MARIA_EXTERNAL_LOCKING
+ if (!share->r_locks && !share->w_locks)
+ {
+ /* note that a transactional table should not do this */
+ if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+ {
+ error=my_errno;
+ break;
+ }
+ }
+#endif
+ VOID(_ma_test_if_changed(info));
+ share->r_locks++;
+ share->tot_locks++;
+ info->lock_type=lock_type;
+ break;
+ case F_WRLCK:
+ if (info->lock_type == F_RDLCK)
+ { /* Change READONLY to RW */
+ if (share->r_locks == 1)
+ {
+ share->r_locks--;
+ share->w_locks++;
+ info->lock_type=lock_type;
+ break;
+ }
+ }
+#ifdef MARIA_EXTERNAL_LOCKING
+ if (!(share->options & HA_OPTION_READ_ONLY_DATA))
+ {
+ if (!share->w_locks)
+ {
+ if (!share->r_locks)
+ {
+ /*
+ Note that transactional tables should not do this.
+ If we enabled this code, we should make sure to skip it if
+ born_transactional is true. We should not test
+ now_transactional to decide if we can call
+ _ma_state_info_read_dsk(), because it can temporarily be 0
+ (TRUNCATE on a partitioned table) and thus it would make a state
+ modification below without mutex, confusing a concurrent
+ checkpoint running.
+ Even if this code was enabled only for non-transactional tables:
+ in scenario LOCK TABLE t1 WRITE; INSERT INTO t1; DELETE FROM t1;
+ state on disk read by DELETE is obsolete as it was not flushed
+ at the end of INSERT. MyISAM same. It however causes no issue as
+ maria_delete_all_rows() calls _ma_reset_status() thus is not
+ influenced by the obsolete read values.
+ */
+ if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+ {
+ error=my_errno;
+ break;
+ }
+ }
+ }
+ }
+#endif /* defined(MARIA_EXTERNAL_LOCKING) */
+ VOID(_ma_test_if_changed(info));
+
+ info->lock_type=lock_type;
+ info->invalidator=share->invalidator;
+ share->w_locks++;
+ share->tot_locks++;
+ break;
+ default:
+ DBUG_ASSERT(0);
+ break; /* Impossible */
+ }
+ }
+#ifdef __WIN__
+ else
+ {
+ /*
+ Check for bad file descriptors if this table is part
+ of a merge union. Failing to capture this may cause
+ a crash on windows if the table is renamed and
+ later on referenced by the merge table.
+ */
+ if( info->owned_by_merge && (info->s)->kfile.file < 0 )
+ {
+ error = HA_ERR_NO_SUCH_TABLE;
+ }
+ }
+#endif
+ pthread_mutex_unlock(&share->intern_lock);
+ DBUG_RETURN(error);
+} /* maria_lock_database */
+
+
+/****************************************************************************
+ ** functions to read / write the state
+****************************************************************************/
+
+/*
+ Refresh in-memory state from disk before a read, when table is unlocked.
+
+ RETURN
+ 0 ok
+ 1 could not read state information from disk (my_errno set)
+ -1 asked for write access while holding only a read lock (EACCES)
+
+ NOTE
+ Compiles to a no-op returning 0 unless MARIA_EXTERNAL_LOCKING is defined.
+*/
+
+int _ma_readinfo(register MARIA_HA *info __attribute__ ((unused)),
+ int lock_type __attribute__ ((unused)),
+ int check_keybuffer __attribute__ ((unused)))
+{
+#ifdef MARIA_EXTERNAL_LOCKING
+ DBUG_ENTER("_ma_readinfo");
+
+ if (info->lock_type == F_UNLCK)
+ {
+ MARIA_SHARE *share= info->s;
+ if (!share->tot_locks)
+ {
+ /* should not be done for transactional tables */
+ if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+ {
+ if (!my_errno)
+ my_errno= HA_ERR_FILE_TOO_SHORT;
+ DBUG_RETURN(1);
+ }
+ }
+ if (check_keybuffer)
+ VOID(_ma_test_if_changed(info));
+ info->invalidator=share->invalidator;
+ }
+ else if (lock_type == F_WRLCK && info->lock_type == F_RDLCK)
+ {
+ my_errno=EACCES; /* Not allowed to change */
+ DBUG_RETURN(-1); /* when have read_lock() */
+ }
+ DBUG_RETURN(0);
+#else
+ return 0;
+#endif /* defined(MARIA_EXTERNAL_LOCKING) */
+} /* _ma_readinfo */
+
+
+/*
+ Every isam-function that updates the isam-database MUST end with this
+ request
+
+ NOTES
+ my_errno is not changed if this succeeds!
+*/
+
+int _ma_writeinfo(register MARIA_HA *info, uint operation)
+{
+ int error,olderror;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("_ma_writeinfo");
+ DBUG_PRINT("info",("operation: %u tot_locks: %u", operation,
+ share->tot_locks));
+
+ error=0;
+ /* With no locks held, write the state to disk immediately */
+ if (share->tot_locks == 0 && !share->base.born_transactional)
+ {
+ /* transactional tables flush their state at Checkpoint */
+ if (operation)
+ { /* Two threads can't be here */
+ olderror= my_errno; /* Remember last error */
+
+#ifdef EXTERNAL_LOCKING
+ /*
+ The following only makes sense if we want to allow two different
+ processes to access the same table at the same time
+ */
+ share->state.process= share->last_process= share->this_process;
+ share->state.unique= info->last_unique= info->this_unique;
+ share->state.update_count= info->last_loop= ++info->this_loop;
+#endif
+
+ if ((error=
+ _ma_state_info_write_sub(share->kfile.file,
+ &share->state,
+ MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)))
+ olderror=my_errno;
+#ifdef __WIN__
+ if (maria_flush)
+ {
+ _commit(share->kfile.file);
+ _commit(info->dfile.file);
+ }
+#endif
+ /* Restore errno (or the write error); return value carries status */
+ my_errno=olderror;
+ }
+ }
+ else if (operation)
+ share->changed= 1; /* Mark keyfile changed */
+ DBUG_RETURN(error);
+} /* _ma_writeinfo */
+
+
+/*
+ Test if an external process has changed the database
+ (Should be called after readinfo)
+*/
+
+int _ma_test_if_changed(register MARIA_HA *info)
+{
+#ifdef EXTERNAL_LOCKING
+ MARIA_SHARE *share= info->s;
+ if (share->state.process != share->last_process ||
+ share->state.unique != info->last_unique ||
+ share->state.update_count != info->last_loop)
+ { /* Keyfile has changed */
+ DBUG_PRINT("info",("index file changed"));
+ /* Another process changed the file: our cached pages are stale */
+ if (share->state.process != share->this_process)
+ VOID(flush_pagecache_blocks(share->pagecache, &share->kfile,
+ FLUSH_RELEASE));
+ share->last_process=share->state.process;
+ info->last_unique= share->state.unique;
+ info->last_loop= share->state.update_count;
+ info->update|= HA_STATE_WRITTEN; /* Must use file on next */
+ info->data_changed= 1; /* For maria_is_changed */
+ return 1;
+ }
+#endif
+ /* Also report change when our own last operation modified the table */
+ return (!(info->update & HA_STATE_AKTIV) ||
+ (info->update & (HA_STATE_WRITTEN | HA_STATE_DELETED |
+ HA_STATE_KEY_CHANGED)));
+} /* _ma_test_if_changed */
+
+
+/*
+ Put a mark in the .MAI file that someone is updating the table
+
+ DOCUMENTATION
+ state.open_count in the .MAI file is used in the following way:
+ - For the first change of the .MAI file in this process open_count is
+ incremented by _ma_mark_file_changed(). (We have a write lock on the file
+ when this happens)
+ - In maria_close() it's decremented by _ma_decrement_open_count() if it
+ was incremented in the same process.
+
+ This means that if we are the only process using the file, the open_count
+ tells us if the MARIA file wasn't properly closed. (This is true if
+ my_disable_locking is set).
+
+ open_count is not maintained on disk for temporary tables.
+*/
+
+/* True when the changed mark is already set both in state and in memory */
+#define _MA_ALREADY_MARKED_FILE_CHANGED \
+ ((share->state.changed & STATE_CHANGED) && share->global_changed)
+
+int _ma_mark_file_changed(MARIA_HA *info)
+{
+ uchar buff[3];
+ register MARIA_SHARE *share= info->s;
+ int error= 1;
+ DBUG_ENTER("_ma_mark_file_changed");
+
+ /* Cheap unlocked pre-check; rechecked under intern_lock below */
+ if (_MA_ALREADY_MARKED_FILE_CHANGED)
+ DBUG_RETURN(0);
+ pthread_mutex_lock(&share->intern_lock); /* recheck under mutex */
+ if (! _MA_ALREADY_MARKED_FILE_CHANGED)
+ {
+ share->state.changed|=(STATE_CHANGED | STATE_NOT_ANALYZED |
+ STATE_NOT_OPTIMIZED_KEYS);
+ if (!share->global_changed)
+ {
+ share->global_changed=1;
+ share->state.open_count++;
+ }
+ /*
+ Temp tables don't need an open_count as they are removed on crash.
+ In theory transactional tables are fixed by log-based recovery, so don't
+ need an open_count either, but if recovery has failed and logs have been
+ removed (by maria-force-start-after-recovery-failures), we still need to
+ detect dubious tables.
+ If we didn't maintain open_count on disk for a table, after a crash
+ we wouldn't know if it was closed at crash time (thus does not need a
+ check) or not. So we would have to check all tables: overkill.
+ */
+ if (!share->temporary)
+ {
+ mi_int2store(buff,share->state.open_count);
+ buff[2]=1; /* Mark that it's changed */
+ if (my_pwrite(share->kfile.file, buff, sizeof(buff),
+ sizeof(share->state.header) +
+ MARIA_FILE_OPEN_COUNT_OFFSET,
+ MYF(MY_NABP)))
+ goto err;
+ }
+ /* Set uuid of file if not yet set (zerofilled file) */
+ if (share->base.born_transactional &&
+ !(share->state.changed & STATE_NOT_MOVABLE))
+ {
+ /* Lock table to current installation */
+ if (_ma_set_uuid(info, 0) ||
+ (share->state.create_rename_lsn == LSN_NEEDS_NEW_STATE_LSNS &&
+ _ma_update_state_lsns_sub(share, LSN_IMPOSSIBLE,
+ trnman_get_min_trid(),
+ TRUE, TRUE)))
+ goto err;
+ share->state.changed|= STATE_NOT_MOVABLE;
+ }
+ }
+ error= 0;
+err:
+ pthread_mutex_unlock(&share->intern_lock);
+ DBUG_RETURN(error);
+#undef _MA_ALREADY_MARKED_FILE_CHANGED
+}
+
+/*
+ Check that a region is all zero
+
+ SYNOPSIS
+ _ma_check_if_zero()
+ pos Start of memory to check
+ length length of memory region
+
+ NOTES
+ Used mainly to detect rows with wrong extent information
+*/
+
+/* Return 1 if any byte in the region is non-zero, 0 if all bytes are zero */
+my_bool _ma_check_if_zero(uchar *pos, size_t length)
+{
+ uchar *end;
+ for (end= pos+ length; pos != end ; pos++)
+ if (pos[0] != 0)
+ return 1;
+ return 0;
+}
+
+/*
+ This is only called by close or by extra(HA_FLUSH) if the OS has the pwrite()
+ call. In this context the following code should be safe!
+ */
+
+/* Undo the open_count increment done by _ma_mark_file_changed().
+ Returns 1 on lock or write error, else 0. */
+int _ma_decrement_open_count(MARIA_HA *info)
+{
+ uchar buff[2];
+ register MARIA_SHARE *share= info->s;
+ int lock_error=0,write_error=0;
+ if (share->global_changed)
+ {
+ uint old_lock=info->lock_type;
+ share->global_changed=0;
+ lock_error= my_disable_locking ? 0 : maria_lock_database(info, F_WRLCK);
+ /* It's not fatal even if we couldn't get the lock ! */
+ if (share->state.open_count > 0)
+ {
+ share->state.open_count--;
+ share->changed= 1; /* We have to update state */
+ /* open_count is only maintained on disk for non-temporary tables */
+ if (!share->temporary)
+ {
+ mi_int2store(buff,share->state.open_count);
+ write_error= (int) my_pwrite(share->kfile.file, buff, sizeof(buff),
+ sizeof(share->state.header) +
+ MARIA_FILE_OPEN_COUNT_OFFSET,
+ MYF(MY_NABP));
+ }
+ }
+ if (!lock_error && !my_disable_locking)
+ lock_error=maria_lock_database(info,old_lock);
+ }
+ return test(lock_error || write_error);
+}
+
+
+/** @brief mark file as crashed */
+
+void _ma_mark_file_crashed(MARIA_SHARE *share)
+{
+ uchar buff[2];
+ DBUG_ENTER("_ma_mark_file_crashed");
+
+ share->state.changed|= STATE_CRASHED;
+ /* Persist the (2-byte) changed flags at the file's changed offset */
+ mi_int2store(buff, share->state.changed);
+ /*
+ We can ignore the errors, as if the mark failed, there isn't anything
+ else we can do; The user should already have got an error that the
+ table was crashed.
+ */
+ (void) my_pwrite(share->kfile.file, buff, sizeof(buff),
+ sizeof(share->state.header) +
+ MARIA_FILE_CHANGED_OFFSET,
+ MYF(MY_NABP));
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Set uuid of for a Maria file
+
+ @fn _ma_set_uuid()
+ @param info Maria handler
+ @param reset_uuid Instead of setting file to maria_uuid, set it to
+ 0 to mark it as movable
+*/
+
+my_bool _ma_set_uuid(MARIA_HA *info, my_bool reset_uuid)
+{
+ uchar buff[MY_UUID_SIZE], *uuid;
+
+ uuid= maria_uuid;
+ if (reset_uuid)
+ {
+ /* An all-zero uuid marks the file as movable between installations */
+ bzero(buff, sizeof(buff));
+ uuid= buff;
+ }
+ /* Non-zero return means the pwrite failed (MY_NABP: all-or-nothing) */
+ return (my_bool) my_pwrite(info->s->kfile.file, uuid, MY_UUID_SIZE,
+ mi_uint2korr(info->s->state.header.base_pos),
+ MYF(MY_NABP));
+}
diff --git a/storage/maria/ma_loghandler.c b/storage/maria/ma_loghandler.c
new file mode 100644
index 00000000000..dc99554a08d
--- /dev/null
+++ b/storage/maria/ma_loghandler.c
@@ -0,0 +1,9316 @@
+/* Copyright (C) 2007 MySQL AB & Sanja Belkin
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_blockrec.h" /* for some constants and in-write hooks */
+#include "ma_key_recover.h" /* For some in-write hooks */
+#include "ma_checkpoint.h"
+#include "ma_servicethread.h"
+
+/*
+ On Windows, neither my_open() nor my_sync() work for directories.
+ Also there is no need to flush filesystem changes, i.e. to sync()
+ directories.
+*/
+#ifdef __WIN__
+#define sync_dir(A,B) 0
+#else
+#define sync_dir(A,B) my_sync(A,B)
+#endif
+
+/**
+ @file
+ @brief Module which writes and reads to a transaction log
+*/
+
+/* 0xFF can never be valid first byte of a chunk */
+#define TRANSLOG_FILLER 0xFF
+
+/* number of opened log files in the pagecache (should be at least 2) */
+#define OPENED_FILES_NUM 3
+#define CACHED_FILES_NUM 5
+#define CACHED_FILES_NUM_DIRECT_SEARCH_LIMIT 7
+#if CACHED_FILES_NUM > CACHED_FILES_NUM_DIRECT_SEARCH_LIMIT
+#include <hash.h>
+#include <m_ctype.h>
+#endif
+
+/** @brief protects the soft-sync background thread state */
+static pthread_mutex_t LOCK_soft_sync;
+/** @brief for waking/killing the background soft-sync thread */
+static pthread_cond_t COND_soft_sync;
+/** @brief control structure for the soft-sync background thread */
+static MA_SERVICE_THREAD_CONTROL soft_sync_control=
+ {THREAD_DEAD, FALSE, &LOCK_soft_sync, &COND_soft_sync};
+
+
+/* transaction log file descriptor */
+typedef struct st_translog_file
+{
+ uint32 number;
+ PAGECACHE_FILE handler;
+ my_bool was_recovered;
+ my_bool is_sync;
+} TRANSLOG_FILE;
+
+/* records buffer size (should be TRANSLOG_PAGE_SIZE * n) */
+#define TRANSLOG_WRITE_BUFFER (1024*1024)
+/*
+ pagecache_read/write/inject() use bmove512() on their buffers so those must
+ be long-aligned, which we guarantee by using the type below:
+*/
+typedef union
+{
+ ulonglong dummy;
+ uchar buffer[TRANSLOG_PAGE_SIZE];
+} TRANSLOG_PAGE_SIZE_BUFF;
+
+/* min chunk length */
+#define TRANSLOG_MIN_CHUNK 3
+/*
+ Number of buffers used by loghandler
+
+ Should be at least 4, because one thread can block up to 2 buffers in
+ normal circumstances (less than half of one and the full other, or just
+ the switched one and the other), but if we meet the end of the file in
+ the middle and have to switch buffers it will be 3, + 1 buffer for
+ flushing/writing.
+ We have a bigger number here for higher concurrency and to make division
+ faster.
+
+ The number should be power of 2 to be fast.
+*/
+#define TRANSLOG_BUFFERS_NO 8
+/* number of bytes (+ header) which can be unused on first page in sequence */
+#define TRANSLOG_MINCHUNK_CONTENT 1
+/* version of log file */
+#define TRANSLOG_VERSION_ID 10000 /* 1.00.00 */
+
+#define TRANSLOG_PAGE_FLAGS 6 /* transaction log page flags offset */
+
+/* Maximum length of compressed LSNs (the worst case of whole LSN storing) */
+#define COMPRESSED_LSN_MAX_STORE_SIZE (2 + LSN_STORE_SIZE)
+#define MAX_NUMBER_OF_LSNS_PER_RECORD 2
+
+
+/* max lsn calculation for buffer */
+#define BUFFER_MAX_LSN(B) \
+ ((B)->last_lsn == LSN_IMPOSSIBLE ? (B)->prev_last_lsn : (B)->last_lsn)
+
+/* log write buffer descriptor */
+struct st_translog_buffer
+{
+ /*
+ Cache for current log. Comes first to be aligned for bmove512() in
+ pagecache_inject()
+ */
+ uchar buffer[TRANSLOG_WRITE_BUFFER];
+ /*
+ Maximum LSN of records which ends in this buffer (or IMPOSSIBLE_LSN
+ if no LSNs ends here)
+ */
+ LSN last_lsn;
+ /* last_lsn of previous buffer or IMPOSSIBLE_LSN if it is very first one */
+ LSN prev_last_lsn;
+ /* This buffer offset in the file */
+ TRANSLOG_ADDRESS offset;
+ /*
+ Next buffer offset in the file (it is not always offset + size,
+ in case of flush by LSN it can be offset + size - TRANSLOG_PAGE_SIZE)
+ */
+ TRANSLOG_ADDRESS next_buffer_offset;
+ /* Previous buffer offset to detect it flush finish */
+ TRANSLOG_ADDRESS prev_buffer_offset;
+ /*
+ If the buffer was forced to close it save value of its horizon
+ otherwise LSN_IMPOSSIBLE
+ */
+ TRANSLOG_ADDRESS pre_force_close_horizon;
+ /*
+ How much is written (or will be written when copy_to_buffer_in_progress
+ become 0) to this buffer
+ */
+ translog_size_t size;
+ /*
+ When moving from one log buffer to another, we write the tail of the
+ previous buffer to file and then move to start using the new log
+ buffer. In the case of a partly filled last page, this page is not moved
+ to the start of the new buffer but instead we set the 'skipped_data'
+ variable to tell us how much data at the beginning of the buffer is not
+ relevant.
+ */
+ uint skipped_data;
+ /* File handler for this buffer */
+ TRANSLOG_FILE *file;
+ /* Threads which are waiting for buffer filling/freeing */
+ pthread_cond_t waiting_filling_buffer;
+ /* Number of records which are in copy progress */
+ uint copy_to_buffer_in_progress;
+ /* list of waiting buffer ready threads */
+ struct st_my_thread_var *waiting_flush;
+ /*
+ If true then previous buffer overlap with this one (due to flush of
+ loghandler, the last page of that buffer is the same as the first page
+ of this buffer) and have to be written first (because contain old
+ content of page which present in both buffers)
+ */
+ my_bool overlay;
+ uint buffer_no;
+ /*
+ Lock for the buffer.
+
+ The current buffer also locks the whole handler (if one wants to lock
+ the handler one should lock the current buffer).
+
+ Buffers are locked only in one direction (with wrap-around, beginning
+ from the first buffer). If we keep a lock on buffer N we can lock only
+ buffer N+1 (never N-1).
+
+ One thread does not lock more than 2 buffers at a time, so to create a
+ deadlock there would have to be N threads (where N equals the number of
+ buffers) each taking one buffer and trying to lock the next. But this is
+ impossible because there are only 2 cases when a thread takes 2 buffers:
+ 1) one thread finishes the current buffer (where the horizon is) and
+ starts the next one (to which the horizon moves). 2) flush starts from
+ the buffer after the current (oldest) one and goes up to the current
+ one, crabbing through the buffer sequence. And there is only one flush
+ at a time (they are serialised).
+
+ Because of the above and because the number of buffers equals
+ TRANSLOG_BUFFERS_NO, we can't get a deadlock (it is impossible to get
+ all buffers locked simultaneously).
+ */
+ pthread_mutex_t mutex;
+ /*
+ Some thread is going to close the buffer and it should be
+ done only by that thread
+ */
+ my_bool is_closing_buffer;
+ /*
+ Version of the buffer; increases every time the buffer is flushed.
+ Together with file and offset it allows detecting buffer changes
+ */
+ uint8 ver;
+
+ /*
+ When previous buffer sent to disk it set its address here to allow
+ to detect when it is done
+ (we have to keep it in this buffer to lock buffers only in one direction).
+ */
+ TRANSLOG_ADDRESS prev_sent_to_disk;
+ pthread_cond_t prev_sent_to_disk_cond;
+};
+
+
+struct st_buffer_cursor
+{
+ /* pointer into the buffer */
+ uchar *ptr;
+ /* current buffer */
+ struct st_translog_buffer *buffer;
+ /* How many bytes we wrote on the current page */
+ uint16 current_page_fill;
+ /*
+ How many times we write the page on the disk during flushing process
+ (for sector protection).
+ */
+ uint16 write_counter;
+ /* previous write offset */
+ uint16 previous_offset;
+ /* Number of current buffer */
+ uint8 buffer_no;
+ /*
+ True if it is just filling buffer after advancing the pointer to
+ the horizon.
+ */
+ my_bool chaser;
+ /*
+ Is current page of the cursor already finished (sector protection
+ should be applied if it is needed)
+ */
+ my_bool protected;
+};
+
+
+typedef uint8 dirty_buffer_mask_t;
+
+struct st_translog_descriptor
+{
+ /* *** Parameters of the log handler *** */
+
+ /* Page cache for the log reads */
+ PAGECACHE *pagecache;
+ uint flags;
+ /* File open flags */
+ uint open_flags;
+ /* max size of one log size (for new logs creation) */
+ uint32 log_file_max_size;
+ uint32 server_version;
+ /* server ID (used for replication) */
+ uint32 server_id;
+ /* Loghandler's buffer capacity in case of chunk 2 filling */
+ uint32 buffer_capacity_chunk_2;
+ /*
+ Half of the buffer capacity in case of chunk 2 filling,
+ used to decide whether we write a record in one group or many.
+ It is stored in a variable just to avoid a division every
+ time we need it.
+ */
+ uint32 half_buffer_capacity_chunk_2;
+ /* Page overhead calculated by flags (whether CRC is enabled, etc) */
+ uint16 page_overhead;
+ /*
+ Page capacity ("useful load") calculated by flags
+ (TRANSLOG_PAGE_SIZE - page_overhead-1)
+ */
+ uint16 page_capacity_chunk_2;
+ /* Path to the directory where we store log store files */
+ char directory[FN_REFLEN];
+
+ /* *** Current state of the log handler *** */
+ /* list of opened files */
+ DYNAMIC_ARRAY open_files;
+ /* min/max number of file in the array */
+ uint32 max_file, min_file;
+ /* the opened files list guard */
+ rw_lock_t open_files_lock;
+
+ /*
+ File descriptor of the directory where we store log files for syncing
+ it.
+ */
+ File directory_fd;
+ /* buffers for log writing */
+ struct st_translog_buffer buffers[TRANSLOG_BUFFERS_NO];
+ /* Mask where 1 in position N mean that buffer N is not flushed */
+ dirty_buffer_mask_t dirty_buffer_mask;
+ /* The above variable protection */
+ pthread_mutex_t dirty_buffer_mask_lock;
+ /*
+ horizon - visible end of the log (here is absolute end of the log:
+ position where next chunk can start
+ */
+ TRANSLOG_ADDRESS horizon;
+ /* horizon buffer cursor */
+ struct st_buffer_cursor bc;
+ /* maximum LSN of the current (not finished) file */
+ LSN max_lsn;
+
+ /*
+ Last flushed LSN (protected by log_flush_lock).
+ Pointers in the log ordered like this:
+ last_lsn_checked <= flushed <= sent_to_disk <= in_buffers_only <=
+ max_lsn <= horizon
+ */
+ LSN flushed;
+ /* Last LSN sent to the disk (but maybe not written yet) */
+ LSN sent_to_disk;
+ /* Horizon from which log started after initialization */
+ TRANSLOG_ADDRESS log_start;
+ TRANSLOG_ADDRESS previous_flush_horizon;
+ /* All what is after this address is not sent to disk yet */
+ TRANSLOG_ADDRESS in_buffers_only;
+ /* protection of sent_to_disk and in_buffers_only */
+ pthread_mutex_t sent_to_disk_lock;
+ /*
+ Protects flushed (see above) and used for flush serialization (will
+ be removed in v1.5)
+ */
+ pthread_mutex_t log_flush_lock;
+ pthread_cond_t log_flush_cond;
+ pthread_cond_t new_goal_cond;
+
+ /* Protects changing of headers of finished files (max_lsn) */
+ pthread_mutex_t file_header_lock;
+
+ /*
+ Sorted array (with protection) of files where we started writing process
+ and so we can't give last LSN yet
+ */
+ pthread_mutex_t unfinished_files_lock;
+ DYNAMIC_ARRAY unfinished_files;
+
+ /*
+ minimum number of still-needed files calculated during the last
+ translog_purge call
+ */
+ uint32 min_need_file;
+ /* Purger data: minimum file in the log (or 0 if unknown) */
+ uint32 min_file_number;
+ /* Protect purger from many calls and it's data */
+ pthread_mutex_t purger_lock;
+ /* last low water mark checked */
+ LSN last_lsn_checked;
+ /**
+ Must be set to 0 under loghandler lock every time a new LSN
+ is generated.
+ */
+ my_bool is_everything_flushed;
+ /* True when flush pass is in progress */
+ my_bool flush_in_progress;
+ /* The flush number (used to distinguish two flushes goes one by one) */
+ volatile int flush_no;
+ /* Next flush pass variables */
+ TRANSLOG_ADDRESS next_pass_max_lsn;
+ pthread_t max_lsn_requester;
+};
+
+static struct st_translog_descriptor log_descriptor;
+
+ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
+ulong log_file_size= TRANSLOG_FILE_SIZE;
+/* sync() of log files directory mode */
+ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
+ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE;
+ulong maria_group_commit_interval= 0;
+
+/* Marker for end of log */
+static uchar end_of_log= 0;
+#define END_OF_LOG &end_of_log
+/**
+ Switch for "soft" sync (no real sync() but periodical sync by service
+ thread)
+*/
+static volatile my_bool soft_sync= FALSE;
+/**
+ Switch for "hard" group commit mode
+*/
+static volatile my_bool hard_group_commit= FALSE;
+/**
+ File numbers interval which have to be sync()
+*/
+static uint32 soft_sync_min= 0;
+static uint32 soft_sync_max= 0;
+static uint32 soft_need_sync= 1;
+/**
+ stores interval in microseconds
+*/
+static uint32 group_commit_wait= 0;
+
+enum enum_translog_status translog_status= TRANSLOG_UNINITED;
+ulonglong translog_syncs= 0; /* Number of sync()s */
+
+/* time of last flush */
+static ulonglong flush_start= 0;
+
+/* chunk types */
+#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */
+#define TRANSLOG_CHUNK_FIXED (1 << 6) /* 1 (pseudo)fixed record (also LSN) */
+#define TRANSLOG_CHUNK_NOHDR (2 << 6) /* 2 no head chunk (till page end) */
+#define TRANSLOG_CHUNK_LNGTH (3 << 6) /* 3 chunk with chunk length */
+#define TRANSLOG_CHUNK_TYPE (3 << 6) /* Mask to get chunk type */
+#define TRANSLOG_REC_TYPE 0x3F /* Mask to get record type */
+#define TRANSLOG_CHUNK_0_CONT 0x3F /* the type to mark chunk 0 continue */
+
+/* compressed (relative) LSN constants */
+#define TRANSLOG_CLSN_LEN_BITS 0xC0 /* Mask to get compressed LSN length */
+
+
+#include <my_atomic.h>
+/* an array that maps id of a MARIA_SHARE to this MARIA_SHARE */
+static MARIA_SHARE **id_to_share= NULL;
+/* lock for id_to_share */
+static my_atomic_rwlock_t LOCK_id_to_share;
+
+static my_bool translog_dummy_callback(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar* data_ptr);
+static my_bool translog_page_validator(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar* data_ptr);
+
+static my_bool translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner);
+static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected);
+LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon);
+
+
+/*
+ Initialize log_record_type_descriptors
+*/
+
+LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES];
+
+
+#ifndef DBUG_OFF
+
+#define translog_buffer_lock_assert_owner(B) \
+ safe_mutex_assert_owner(&(B)->mutex)
+#define translog_lock_assert_owner() \
+ safe_mutex_assert_owner(&log_descriptor.bc.buffer->mutex)
+/* Non-macro wrapper so code outside this file can assert lock ownership */
+void translog_lock_handler_assert_owner()
+{
+ translog_lock_assert_owner();
+}
+
+/**
+ @brief check the description table validity
+
+ @param num how many records should be filled
+*/
+
+static void check_translog_description_table(int num)
+{
+ int i;
+ DBUG_ENTER("check_translog_description_table");
+ DBUG_PRINT("enter", ("last record: %d", num));
+ DBUG_ASSERT(num > 0);
+ /* last is reserved for extending the table */
+ DBUG_ASSERT(num < LOGREC_NUMBER_OF_TYPES - 1);
+ DBUG_ASSERT(log_record_type_descriptor[0].rclass == LOGRECTYPE_NOT_ALLOWED);
+
+ /* Verify per-class invariants of every filled descriptor slot */
+ for (i= 0; i <= num; i++)
+ {
+ DBUG_PRINT("info",
+ ("record type: %d class: %d fixed: %u header: %u LSNs: %u "
+ "name: %s",
+ i, log_record_type_descriptor[i].rclass,
+ (uint)log_record_type_descriptor[i].fixed_length,
+ (uint)log_record_type_descriptor[i].read_header_len,
+ (uint)log_record_type_descriptor[i].compressed_LSN,
+ log_record_type_descriptor[i].name));
+ switch (log_record_type_descriptor[i].rclass) {
+ case LOGRECTYPE_NOT_ALLOWED:
+ DBUG_ASSERT(i == 0);
+ break;
+ case LOGRECTYPE_VARIABLE_LENGTH:
+ DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == 0);
+ DBUG_ASSERT((log_record_type_descriptor[i].compressed_LSN == 0) ||
+ ((log_record_type_descriptor[i].compressed_LSN == 1) &&
+ (log_record_type_descriptor[i].read_header_len >=
+ LSN_STORE_SIZE)) ||
+ ((log_record_type_descriptor[i].compressed_LSN == 2) &&
+ (log_record_type_descriptor[i].read_header_len >=
+ LSN_STORE_SIZE * 2)));
+ break;
+ case LOGRECTYPE_PSEUDOFIXEDLENGTH:
+ DBUG_ASSERT(log_record_type_descriptor[i].fixed_length ==
+ log_record_type_descriptor[i].read_header_len);
+ DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN > 0);
+ DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN <= 2);
+ break;
+ case LOGRECTYPE_FIXEDLENGTH:
+ DBUG_ASSERT(log_record_type_descriptor[i].fixed_length ==
+ log_record_type_descriptor[i].read_header_len);
+ DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN == 0);
+ break;
+ default:
+ DBUG_ASSERT(0);
+ }
+ }
+ /* All slots past the last filled record must be unused */
+ for (i= num + 1; i < LOGREC_NUMBER_OF_TYPES; i++)
+ {
+ DBUG_ASSERT(log_record_type_descriptor[i].rclass ==
+ LOGRECTYPE_NOT_ALLOWED);
+ }
+ DBUG_VOID_RETURN;
+}
+#else
+#define translog_buffer_lock_assert_owner(B) {}
+#define translog_lock_assert_owner() {}
+#endif
+
+/*
+  Static initializers for log_record_type_descriptor[], installed by
+  translog_table_init() below.  Judging from the uses visible in this
+  file the initializer fields are: record class, fixed_length,
+  read_header_len, a hook, the write hook, another hook, number of
+  compressed LSNs, name, position-in-group flag, and two trailing
+  pointers -- TODO(review): confirm against the LOG_DESC declaration,
+  which is outside this chunk.
+*/
+static LOG_DESC INIT_LOGREC_RESERVED_FOR_CHUNKS23=
+{LOGRECTYPE_NOT_ALLOWED, 0, 0, NULL, NULL, NULL, 0,
+ "reserved", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL };
+
+static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_HEAD=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_insert_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_TAIL=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_insert_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_NEW_ROW_HEAD=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_new_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_NEW_ROW_TAIL=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_new_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_BLOBS=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, FILEID_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_insert_row_blobs", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_HEAD=
+{LOGRECTYPE_FIXEDLENGTH,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_purge_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_TAIL=
+{LOGRECTYPE_FIXEDLENGTH,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_purge_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_FREE_BLOCKS=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_free_blocks", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_FREE_HEAD_OR_TAIL=
+{LOGRECTYPE_FIXEDLENGTH,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_free_head_or_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+/* not yet used; for when we have versioning */
+static LOG_DESC INIT_LOGREC_REDO_DELETE_ROW=
+{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0,
+ "redo_delete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+/** @todo RECOVERY BUG unused, remove? */
+static LOG_DESC INIT_LOGREC_REDO_UPDATE_ROW_HEAD=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0,
+ "redo_update_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INDEX=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0,
+ "redo_index", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INDEX_NEW_PAGE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_index_new_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INDEX_FREE_PAGE=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_index_free_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_UNDELETE_ROW=
+{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0,
+ "redo_undelete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_CLR_END=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE + FILEID_STORE_SIZE +
+ CLR_TYPE_STORE_SIZE, NULL, write_hook_for_clr_end, NULL, 1,
+ "clr_end", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_PURGE_END=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, NULL, NULL, 1,
+ "purge_end", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_ROW_INSERT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_undo_row_insert, NULL, 1,
+ "undo_row_insert", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_ROW_DELETE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_undo_row_delete, NULL, 1,
+ "undo_row_delete", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_ROW_UPDATE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_undo_row_update, NULL, 1,
+ "undo_row_update", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE,
+ NULL, write_hook_for_undo_key_insert, NULL, 1,
+ "undo_key_insert", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+/* This will never be in the log, only in the clr */
+static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT_WITH_ROOT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE,
+ NULL, write_hook_for_undo_key, NULL, 1,
+ "undo_key_insert_with_root", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE,
+ NULL, write_hook_for_undo_key_delete, NULL, 1,
+ "undo_key_delete", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE,
+ NULL, write_hook_for_undo_key_delete, NULL, 1,
+ "undo_key_delete_with_root", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_PREPARE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "prepare", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_PREPARE_WITH_UNDO_PURGE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE, NULL, NULL, NULL, 1,
+ "prepare_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_COMMIT=
+{LOGRECTYPE_FIXEDLENGTH, 0, 0, NULL,
+ write_hook_for_commit, NULL, 0, "commit", LOGREC_IS_GROUP_ITSELF, NULL,
+ NULL};
+
+static LOG_DESC INIT_LOGREC_COMMIT_WITH_UNDO_PURGE=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, write_hook_for_commit, NULL, 1,
+ "commit_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_CHECKPOINT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "checkpoint", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_CREATE_TABLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 1 + 2, NULL, NULL, NULL, 0,
+"redo_create_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_RENAME_TABLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "redo_rename_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_DROP_TABLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "redo_drop_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_DELETE_ALL=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE,
+ NULL, write_hook_for_redo_delete_all, NULL, 0,
+ "redo_delete_all", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_REPAIR_TABLE=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + 8 + 8, FILEID_STORE_SIZE + 8 + 8,
+ NULL, NULL, NULL, 0,
+ "redo_repair_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_FILE_ID=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 2, NULL, write_hook_for_file_id, NULL, 0,
+ "file_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_LONG_TRANSACTION_ID=
+{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0,
+ "long_transaction_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_INCOMPLETE_LOG=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE,
+ NULL, NULL, NULL, 0,
+ "incomplete_log", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_INCOMPLETE_GROUP=
+{LOGRECTYPE_FIXEDLENGTH, 0, 0,
+ NULL, NULL, NULL, 0,
+ "incomplete_group", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_BULK_INSERT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE,
+ NULL, write_hook_for_undo_bulk_insert, NULL, 1,
+ "undo_bulk_insert", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_BITMAP_NEW_PAGE=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
+ NULL, NULL, NULL, 0,
+ "redo_create_bitmap", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_IMPORTED_TABLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "imported_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_DEBUG_INFO=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "info", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+/* my_pwrite()/my_pread() flags used for every physical log write */
+const myf log_write_flags= MY_WME | MY_NABP | MY_WAIT_IF_FULL;
+
+/**
+  @brief Fill log_record_type_descriptor[] from the static initializers
+  above, mark every remaining slot as not allowed, and (in debug builds)
+  cross-check the resulting table with check_translog_description_table().
+*/
+void translog_table_init()
+{
+  int i;
+  log_record_type_descriptor[LOGREC_RESERVED_FOR_CHUNKS23]=
+    INIT_LOGREC_RESERVED_FOR_CHUNKS23;
+  log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_HEAD]=
+    INIT_LOGREC_REDO_INSERT_ROW_HEAD;
+  log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_TAIL]=
+    INIT_LOGREC_REDO_INSERT_ROW_TAIL;
+  log_record_type_descriptor[LOGREC_REDO_NEW_ROW_HEAD]=
+    INIT_LOGREC_REDO_NEW_ROW_HEAD;
+  log_record_type_descriptor[LOGREC_REDO_NEW_ROW_TAIL]=
+    INIT_LOGREC_REDO_NEW_ROW_TAIL;
+  log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_BLOBS]=
+    INIT_LOGREC_REDO_INSERT_ROW_BLOBS;
+  log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_HEAD]=
+    INIT_LOGREC_REDO_PURGE_ROW_HEAD;
+  log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_TAIL]=
+    INIT_LOGREC_REDO_PURGE_ROW_TAIL;
+  log_record_type_descriptor[LOGREC_REDO_FREE_BLOCKS]=
+    INIT_LOGREC_REDO_FREE_BLOCKS;
+  log_record_type_descriptor[LOGREC_REDO_FREE_HEAD_OR_TAIL]=
+    INIT_LOGREC_REDO_FREE_HEAD_OR_TAIL;
+  log_record_type_descriptor[LOGREC_REDO_DELETE_ROW]=
+    INIT_LOGREC_REDO_DELETE_ROW;
+  log_record_type_descriptor[LOGREC_REDO_UPDATE_ROW_HEAD]=
+    INIT_LOGREC_REDO_UPDATE_ROW_HEAD;
+  log_record_type_descriptor[LOGREC_REDO_INDEX]=
+    INIT_LOGREC_REDO_INDEX;
+  log_record_type_descriptor[LOGREC_REDO_INDEX_NEW_PAGE]=
+    INIT_LOGREC_REDO_INDEX_NEW_PAGE;
+  log_record_type_descriptor[LOGREC_REDO_INDEX_FREE_PAGE]=
+    INIT_LOGREC_REDO_INDEX_FREE_PAGE;
+  log_record_type_descriptor[LOGREC_REDO_UNDELETE_ROW]=
+    INIT_LOGREC_REDO_UNDELETE_ROW;
+  log_record_type_descriptor[LOGREC_CLR_END]=
+    INIT_LOGREC_CLR_END;
+  log_record_type_descriptor[LOGREC_PURGE_END]=
+    INIT_LOGREC_PURGE_END;
+  log_record_type_descriptor[LOGREC_UNDO_ROW_INSERT]=
+    INIT_LOGREC_UNDO_ROW_INSERT;
+  log_record_type_descriptor[LOGREC_UNDO_ROW_DELETE]=
+    INIT_LOGREC_UNDO_ROW_DELETE;
+  log_record_type_descriptor[LOGREC_UNDO_ROW_UPDATE]=
+    INIT_LOGREC_UNDO_ROW_UPDATE;
+  log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT]=
+    INIT_LOGREC_UNDO_KEY_INSERT;
+  log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT_WITH_ROOT]=
+    INIT_LOGREC_UNDO_KEY_INSERT_WITH_ROOT;
+  log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE]=
+    INIT_LOGREC_UNDO_KEY_DELETE;
+  log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE_WITH_ROOT]=
+    INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT;
+  log_record_type_descriptor[LOGREC_PREPARE]=
+    INIT_LOGREC_PREPARE;
+  log_record_type_descriptor[LOGREC_PREPARE_WITH_UNDO_PURGE]=
+    INIT_LOGREC_PREPARE_WITH_UNDO_PURGE;
+  log_record_type_descriptor[LOGREC_COMMIT]=
+    INIT_LOGREC_COMMIT;
+  log_record_type_descriptor[LOGREC_COMMIT_WITH_UNDO_PURGE]=
+    INIT_LOGREC_COMMIT_WITH_UNDO_PURGE;
+  log_record_type_descriptor[LOGREC_CHECKPOINT]=
+    INIT_LOGREC_CHECKPOINT;
+  log_record_type_descriptor[LOGREC_REDO_CREATE_TABLE]=
+    INIT_LOGREC_REDO_CREATE_TABLE;
+  log_record_type_descriptor[LOGREC_REDO_RENAME_TABLE]=
+    INIT_LOGREC_REDO_RENAME_TABLE;
+  log_record_type_descriptor[LOGREC_REDO_DROP_TABLE]=
+    INIT_LOGREC_REDO_DROP_TABLE;
+  log_record_type_descriptor[LOGREC_REDO_DELETE_ALL]=
+    INIT_LOGREC_REDO_DELETE_ALL;
+  log_record_type_descriptor[LOGREC_REDO_REPAIR_TABLE]=
+    INIT_LOGREC_REDO_REPAIR_TABLE;
+  log_record_type_descriptor[LOGREC_FILE_ID]=
+    INIT_LOGREC_FILE_ID;
+  log_record_type_descriptor[LOGREC_LONG_TRANSACTION_ID]=
+    INIT_LOGREC_LONG_TRANSACTION_ID;
+  log_record_type_descriptor[LOGREC_INCOMPLETE_LOG]=
+    INIT_LOGREC_INCOMPLETE_LOG;
+  log_record_type_descriptor[LOGREC_INCOMPLETE_GROUP]=
+    INIT_LOGREC_INCOMPLETE_GROUP;
+  log_record_type_descriptor[LOGREC_UNDO_BULK_INSERT]=
+    INIT_LOGREC_UNDO_BULK_INSERT;
+  log_record_type_descriptor[LOGREC_REDO_BITMAP_NEW_PAGE]=
+    INIT_LOGREC_REDO_BITMAP_NEW_PAGE;
+  log_record_type_descriptor[LOGREC_IMPORTED_TABLE]=
+    INIT_LOGREC_IMPORTED_TABLE;
+  log_record_type_descriptor[LOGREC_DEBUG_INFO]=
+    INIT_LOGREC_DEBUG_INFO;
+
+  /* any record type not explicitly described above is forbidden */
+  for (i= LOGREC_FIRST_FREE; i < LOGREC_NUMBER_OF_TYPES; i++)
+    log_record_type_descriptor[i].rclass= LOGRECTYPE_NOT_ALLOWED;
+#ifndef DBUG_OFF
+  check_translog_description_table(LOGREC_FIRST_FREE -1);
+#endif
+}
+
+
+/* all possible flags page overheads */
+static uint page_overhead[TRANSLOG_FLAGS_NUM];
+
+/* state handed to the page validator callback */
+typedef struct st_translog_validator_data
+{
+  TRANSLOG_ADDRESS *addr;    /* address of the page being validated */
+  my_bool was_recovered;     /* NOTE(review): presumably set when the page
+                                had to be recovered -- confirm at callers */
+} TRANSLOG_VALIDATOR_DATA;
+
+
+/*
+ Check cursor/buffer consistence
+
+ SYNOPSIS
+ translog_check_cursor
+ cursor cursor which will be checked
+*/
+
+static void translog_check_cursor(struct st_buffer_cursor *cursor
+                                  __attribute__((unused)))
+{
+  /* a non-chasing cursor must sit exactly at the end of its buffer */
+  DBUG_ASSERT(cursor->chaser ||
+              ((ulong) (cursor->ptr - cursor->buffer->buffer) ==
+               cursor->buffer->size));
+  DBUG_ASSERT(cursor->buffer->buffer_no == cursor->buffer_no);
+  /* byte position inside the buffer must agree with the in-page fill */
+  DBUG_ASSERT((cursor->ptr -cursor->buffer->buffer) %TRANSLOG_PAGE_SIZE ==
+              cursor->current_page_fill % TRANSLOG_PAGE_SIZE);
+  DBUG_ASSERT(cursor->current_page_fill <= TRANSLOG_PAGE_SIZE);
+}
+
+
+/**
+ @brief switch the loghandler in read only mode in case of write error
+*/
+
+void translog_stop_writing()
+{
+  DBUG_ENTER("translog_stop_writing");
+  DBUG_PRINT("error", ("errno: %d my_errno: %d", errno, my_errno));
+  /* during shutdown go straight to uninited, otherwise become read-only */
+  translog_status= (translog_status == TRANSLOG_SHUTDOWN ?
+                    TRANSLOG_UNINITED :
+                    TRANSLOG_READONLY);
+  log_descriptor.is_everything_flushed= 1;
+  log_descriptor.open_flags= O_BINARY | O_RDONLY;
+  /* abort in debug builds so the write error gets investigated */
+  DBUG_ASSERT(0);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+ @brief Get file name of the log by log number
+
+ @param file_no Number of the log we want to open
+ @param path Pointer to buffer where file name will be
+ stored (must be FN_REFLEN bytes at least)
+
+ @return pointer to path
+*/
+
+char *translog_filename_by_fileno(uint32 file_no, char *path)
+{
+  char buff[11], *end;
+  uint length;
+  DBUG_ENTER("translog_filename_by_fileno");
+  DBUG_ASSERT(file_no <= 0xfffffff);
+
+  /* log_descriptor.directory is already formated */
+  end= strxmov(path, log_descriptor.directory, "aria_log.0000000", NullS);
+  /*
+    Right-justify the decimal file number over the template's zeros:
+    'end' points at the terminating NUL, so the digits end one character
+    past the 7 zeros, producing an 8-character zero-padded number field.
+  */
+  length= (uint) (int10_to_str(file_no, buff, 10) - buff);
+  strmov(end - length +1, buff);
+
+  DBUG_PRINT("info", ("Path: '%s' path: 0x%lx", path, (ulong) path));
+  DBUG_RETURN(path);
+}
+
+
+/**
+ @brief Create log file with given number without cache
+
+ @param file_no Number of the log we want to open
+
+ retval -1 error
+ retval # file descriptor number
+*/
+
+static File create_logfile_by_number_no_cache(uint32 file_no)
+{
+  File file;
+  char path[FN_REFLEN];
+  DBUG_ENTER("create_logfile_by_number_no_cache");
+
+  /* refuse to create new files once the handler is read-only or down */
+  if (translog_status != TRANSLOG_OK)
+    DBUG_RETURN(-1);
+
+  /* TODO: add O_DIRECT to open flags (when buffer is aligned) */
+  if ((file= my_create(translog_filename_by_fileno(file_no, path),
+                       0, O_BINARY | O_RDWR, MYF(MY_WME))) < 0)
+  {
+    DBUG_PRINT("error", ("Error %d during creating file '%s'", errno, path));
+    translog_stop_writing();
+    DBUG_RETURN(-1);
+  }
+  /* optionally make the new directory entry itself durable */
+  if (sync_log_dir >= TRANSLOG_SYNC_DIR_NEWFILE &&
+      sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD)))
+  {
+    DBUG_PRINT("error", ("Error %d during syncing directory '%s'",
+                         errno, log_descriptor.directory));
+    translog_stop_writing();
+    DBUG_RETURN(-1);
+  }
+  DBUG_PRINT("info", ("File: '%s' handler: %d", path, file));
+  DBUG_RETURN(file);
+}
+
+/**
+ @brief Open (not create) log file with given number without cache
+
+ @param file_no Number of the log we want to open
+
+ retval -1 error
+ retval # file descriptor number
+*/
+
+static File open_logfile_by_number_no_cache(uint32 file_no)
+{
+  File file;
+  char path[FN_REFLEN];
+  DBUG_ENTER("open_logfile_by_number_no_cache");
+
+  /* TODO: add O_DIRECT to open flags (when buffer is aligned) */
+  /* TODO: use my_create() */
+  /* open_flags is O_RDWR normally, O_RDONLY after a write failure */
+  if ((file= my_open(translog_filename_by_fileno(file_no, path),
+                     log_descriptor.open_flags,
+                     MYF(MY_WME))) < 0)
+  {
+    DBUG_PRINT("error", ("Error %d during opening file '%s'", errno, path));
+    DBUG_RETURN(-1);
+  }
+  DBUG_PRINT("info", ("File: '%s' handler: %d", path, file));
+  DBUG_RETURN(file);
+}
+
+
+/**
+ @brief get file descriptor by given number using cache
+
+ @param file_no Number of the log we want to open
+
+ retval # file descriptor
+ retval NULL file is not opened
+*/
+
+static TRANSLOG_FILE *get_logfile_by_number(uint32 file_no)
+{
+  TRANSLOG_FILE *file;
+  DBUG_ENTER("get_logfile_by_number");
+  rw_rdlock(&log_descriptor.open_files_lock);
+  /* open_files[0] is the newest file (max_file); older files follow */
+  if (log_descriptor.max_file - file_no >=
+      log_descriptor.open_files.elements)
+  {
+    DBUG_PRINT("info", ("File #%u is not opened", file_no));
+    rw_unlock(&log_descriptor.open_files_lock);
+    DBUG_RETURN(NULL);
+  }
+  DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+              log_descriptor.open_files.elements);
+  DBUG_ASSERT(log_descriptor.max_file >= file_no);
+  DBUG_ASSERT(log_descriptor.min_file <= file_no);
+
+  file= *dynamic_element(&log_descriptor.open_files,
+                         log_descriptor.max_file - file_no, TRANSLOG_FILE **);
+  rw_unlock(&log_descriptor.open_files_lock);
+  DBUG_PRINT("info", ("File 0x%lx File no: %lu, File handler: %d",
+                      (ulong)file, (ulong)file_no,
+                      (file ? file->handler.file : -1)));
+  DBUG_ASSERT(!file || file->number == file_no);
+  DBUG_RETURN(file);
+}
+
+
+/**
+ @brief get current file descriptor
+
+ retval # file descriptor
+*/
+
+static TRANSLOG_FILE *get_current_logfile()
+{
+  TRANSLOG_FILE *file;
+  DBUG_ENTER("get_current_logfile");
+  rw_rdlock(&log_descriptor.open_files_lock);
+  DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu",
+                      (ulong) log_descriptor.max_file,
+                      (ulong) log_descriptor.min_file,
+                      (ulong) log_descriptor.open_files.elements));
+  DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+              log_descriptor.open_files.elements);
+  /* element 0 is always the newest (current) log file */
+  file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **);
+  rw_unlock(&log_descriptor.open_files_lock);
+  DBUG_RETURN(file);
+}
+
+/* magic tag at the start of every transaction log file */
+uchar NEAR maria_trans_file_magic[]=
+{ (uchar) 254, (uchar) 254, (uchar) 11, '\001', 'M', 'A', 'R', 'I', 'A',
+  'L', 'O', 'G' };
+/* magic + timestamp(8) + maria/mysql version and server id(4 each) +
+   page size(2) + file number(3) + max LSN */
+#define LOG_HEADER_DATA_SIZE (sizeof(maria_trans_file_magic) + \
+                              8 + 4 + 4 + 4 + 2 + 3 + \
+                              LSN_STORE_SIZE)
+
+
+/*
+ Write log file page header in the just opened new log file
+
+ SYNOPSIS
+ translog_write_file_header();
+
+ NOTES
+ First page is just a marker page; We don't store any real log data in it.
+
+ RETURN
+ 0 OK
+ 1 ERROR
+*/
+
+static my_bool translog_write_file_header()
+{
+  TRANSLOG_FILE *file;
+  ulonglong timestamp;
+  uchar page_buff[TRANSLOG_PAGE_SIZE], *page= page_buff;
+  my_bool rc;
+  DBUG_ENTER("translog_write_file_header");
+
+  /* file tag */
+  memcpy(page, maria_trans_file_magic, sizeof(maria_trans_file_magic));
+  page+= sizeof(maria_trans_file_magic);
+  /* timestamp */
+  timestamp= my_getsystime();
+  int8store(page, timestamp);
+  page+= 8;
+  /* maria version */
+  int4store(page, TRANSLOG_VERSION_ID);
+  page+= 4;
+  /* mysql version (MYSQL_VERSION_ID) */
+  int4store(page, log_descriptor.server_version);
+  page+= 4;
+  /* server ID */
+  int4store(page, log_descriptor.server_id);
+  page+= 4;
+  /* loghandler page_size; stored as size - 1, the reader adds 1 back */
+  int2store(page, TRANSLOG_PAGE_SIZE - 1);
+  page+= 2;
+  /* file number */
+  int3store(page, LSN_FILE_NO(log_descriptor.horizon));
+  page+= 3;
+  /* max LSN slot, updated later by translog_max_lsn_to_header() */
+  lsn_store(page, LSN_IMPOSSIBLE);
+  page+= LSN_STORE_SIZE;
+  memset(page, TRANSLOG_FILLER, sizeof(page_buff) - (page- page_buff));
+
+  file= get_current_logfile();
+  rc= my_pwrite(file->handler.file, page_buff, sizeof(page_buff), 0,
+                log_write_flags) != 0;
+  /*
+    Dropping the flag in such way can make false alarm: signalling than the
+    file in not sync when it is sync, but the situation is quite rare and
+    protections with mutexes give much more overhead to the whole engine
+  */
+  file->is_sync= 0;
+  DBUG_RETURN(rc);
+}
+
+/*
+ @brief write the new LSN on the given file header
+
+ @param file The file descriptor
+ @param lsn That LSN which should be written
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool translog_max_lsn_to_header(File file, LSN lsn)
+{
+  uchar lsn_buff[LSN_STORE_SIZE];
+  my_bool rc;
+  DBUG_ENTER("translog_max_lsn_to_header");
+  DBUG_PRINT("enter", ("File descriptor: %ld "
+                       "lsn: (%lu,0x%lx)",
+                       (long) file,
+                       LSN_IN_PARTS(lsn)));
+
+  lsn_store(lsn_buff, lsn);
+
+  /* the max-LSN slot is the last field of the file header */
+  rc= (my_pwrite(file, lsn_buff,
+                 LSN_STORE_SIZE,
+                 (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
+                 log_write_flags) != 0 ||
+       my_sync(file, MYF(MY_WME)) != 0);
+  /*
+    We should not increase counter in case of error above, but it is so
+    unlikely that we can ignore this case
+  */
+  translog_syncs++;
+  DBUG_RETURN(rc);
+}
+
+
+/*
+ Information from transaction log file header
+*/
+
+/* Filled from a file header page by translog_interpret_file_header() */
+typedef struct st_loghandler_file_info
+{
+  /*
+    LSN_IMPOSSIBLE for current file (not finished file).
+    Maximum LSN of the record which parts stored in the
+    file.
+  */
+  LSN max_lsn;
+  ulonglong timestamp;   /* Time stamp */
+  ulong maria_version;   /* Version of maria loghandler */
+  ulong mysql_version;   /* Version of mysql server */
+  ulong server_id;       /* Server ID */
+  ulong page_size;       /* Loghandler page size */
+  ulong file_number;     /* Number of the file (from the file header) */
+} LOGHANDLER_FILE_INFO;
+
+/*
+ @brief Extract hander file information from loghandler file page
+
+ @param desc header information descriptor to be filled with information
+ @param page_buff buffer with the page content
+*/
+
+static void translog_interpret_file_header(LOGHANDLER_FILE_INFO *desc,
+                                           uchar *page_buff)
+{
+  uchar *ptr;
+
+  /*
+    Layout must mirror translog_write_file_header(): magic, timestamp(8),
+    maria version(4), mysql version(4), server id(4), page size(2),
+    file number(3), max LSN.
+  */
+  ptr= page_buff + sizeof(maria_trans_file_magic);
+  desc->timestamp= uint8korr(ptr);
+  ptr+= 8;
+  desc->maria_version= uint4korr(ptr);
+  ptr+= 4;
+  desc->mysql_version= uint4korr(ptr);
+  ptr+= 4;
+  /*
+    Bug fix: the server id is stored at the current offset; the old code
+    read uint4korr(ptr + 4), i.e. the page size / file number bytes.
+  */
+  desc->server_id= uint4korr(ptr);
+  ptr+= 4;
+  /* stored as size - 1 by the writer */
+  desc->page_size= uint2korr(ptr) + 1;
+  ptr+= 2;
+  desc->file_number= uint3korr(ptr);
+  ptr+=3;
+  desc->max_lsn= lsn_korr(ptr);
+}
+
+
+/*
+ @brief Read hander file information from loghandler file
+
+ @param desc header information descriptor to be filled with information
+ @param file file descriptor to read
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool translog_read_file_header(LOGHANDLER_FILE_INFO *desc, File file)
+{
+  uchar page_buff[LOG_HEADER_DATA_SIZE];
+  DBUG_ENTER("translog_read_file_header");
+
+  /* only the fixed-size header prefix of the first page is needed */
+  if (my_pread(file, page_buff,
+               sizeof(page_buff), 0, MYF(MY_FNABP | MY_WME)))
+  {
+    DBUG_PRINT("info", ("log read fail error: %d", my_errno));
+    DBUG_RETURN(1);
+  }
+  translog_interpret_file_header(desc, page_buff);
+  DBUG_PRINT("info", ("timestamp: %llu aria ver: %lu mysql ver: %lu "
+                      "server id %lu page size %lu file number %lu "
+                      "max lsn: (%lu,0x%lx)",
+                      (ulonglong) desc->timestamp,
+                      (ulong) desc->maria_version,
+                      (ulong) desc->mysql_version,
+                      (ulong) desc->server_id,
+                      desc->page_size, (ulong) desc->file_number,
+                      LSN_IN_PARTS(desc->max_lsn)));
+  DBUG_RETURN(0);
+}
+
+
+/*
+ @brief set the lsn to the files from_file - to_file if it is greater
+ then written in the file
+
+ @param from_file first file number (min)
+ @param to_file last file number (max)
+ @param lsn the lsn for writing
+ @param is_locked true if current thread locked the log handler
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool translog_set_lsn_for_files(uint32 from_file, uint32 to_file,
+                                          LSN lsn, my_bool is_locked)
+{
+  uint32 file;
+  DBUG_ENTER("translog_set_lsn_for_files");
+  DBUG_PRINT("enter", ("From: %lu to: %lu lsn: (%lu,0x%lx) locked: %d",
+                       (ulong) from_file, (ulong) to_file,
+                       LSN_IN_PARTS(lsn),
+                       is_locked));
+  DBUG_ASSERT(from_file <= to_file);
+  DBUG_ASSERT(from_file > 0); /* we have not file 0 */
+
+  /* Checks the current file (not finished yet file) */
+  if (!is_locked)
+    translog_lock();
+  if (to_file == (uint32) LSN_FILE_NO(log_descriptor.horizon))
+  {
+    if (likely(cmp_translog_addr(lsn, log_descriptor.max_lsn) > 0))
+      log_descriptor.max_lsn= lsn;
+    to_file--;
+  }
+  if (!is_locked)
+    translog_unlock();
+
+  /* Checks finished files if they are */
+  pthread_mutex_lock(&log_descriptor.file_header_lock);
+  for (file= from_file; file <= to_file; file++)
+  {
+    LOGHANDLER_FILE_INFO info;
+    File fd;
+    LINT_INIT(info.max_lsn);
+
+    fd= open_logfile_by_number_no_cache(file);
+    /*
+      '|' (not '||') before my_close() is intentional: the file must be
+      closed even when reading or updating the header failed.
+    */
+    if ((fd < 0) ||
+        ((translog_read_file_header(&info, fd) ||
+          (cmp_translog_addr(lsn, info.max_lsn) > 0 &&
+           translog_max_lsn_to_header(fd, lsn))) |
+         my_close(fd, MYF(MY_WME))))
+    {
+      translog_stop_writing();
+      /* Bug fix: do not leave file_header_lock held on the error path */
+      pthread_mutex_unlock(&log_descriptor.file_header_lock);
+      DBUG_RETURN(1);
+    }
+  }
+  pthread_mutex_unlock(&log_descriptor.file_header_lock);
+
+  DBUG_RETURN(0);
+}
+
+
+/* descriptor of file in unfinished_files */
+/* descriptor of file in unfinished_files; the dynamic array of these is
+   kept sorted by file number */
+struct st_file_counter
+{
+  uint32 file;      /* file number */
+  uint32 counter;   /* counter for started writes */
+};
+
+
+/*
+ @brief mark file "in progress" (for multi-group records)
+
+ @param file log file number
+*/
+
+static void translog_mark_file_unfinished(uint32 file)
+{
+  int place, i;
+  struct st_file_counter fc, *fc_ptr;
+
+  DBUG_ENTER("translog_mark_file_unfinished");
+  DBUG_PRINT("enter", ("file: %lu", (ulong) file));
+
+  fc.file= file; fc.counter= 1;
+  pthread_mutex_lock(&log_descriptor.unfinished_files_lock);
+
+  if (log_descriptor.unfinished_files.elements == 0)
+  {
+    insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc);
+    DBUG_PRINT("info", ("The first element inserted"));
+    goto end;
+  }
+
+  /* scan from the end: the array is kept sorted by file number */
+  for (place= log_descriptor.unfinished_files.elements - 1;
+       place >= 0;
+       place--)
+  {
+    fc_ptr= dynamic_element(&log_descriptor.unfinished_files,
+                            place, struct st_file_counter *);
+    if (fc_ptr->file <= file)
+      break;
+  }
+
+  if (place >= 0 && fc_ptr->file == file)
+  {
+    fc_ptr->counter++;
+    DBUG_PRINT("info", ("counter increased"));
+    goto end;
+  }
+
+  /*
+    NOTE(review): 'place' is at most elements - 1 here, so this branch is
+    effectively dead; appending is handled by the generic shift path below.
+  */
+  if (place == (int)log_descriptor.unfinished_files.elements)
+  {
+    insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc);
+    DBUG_PRINT("info", ("The last element inserted"));
+    goto end;
+  }
+  /*
+    Shift elements place+1 .. end one position to the right and store the
+    new element at place+1: duplicate the last element, then copy each
+    element from its left neighbour, highest index first.
+    Bug fix: the copy source must be element i - 1 (shift right); the old
+    code used i + 1, which shifted left and read one element past the end
+    of the array on the first iteration.
+  */
+  insert_dynamic(&log_descriptor.unfinished_files,
+                 (uchar*)
+                 dynamic_element(&log_descriptor.unfinished_files,
+                                 log_descriptor.unfinished_files.elements- 1,
+                                 struct st_file_counter *));
+  for(i= log_descriptor.unfinished_files.elements - 1; i > place; i--)
+  {
+    /* we do not use set_dynamic() to avoid unneeded checks */
+    memcpy(dynamic_element(&log_descriptor.unfinished_files,
+                           i, struct st_file_counter *),
+           dynamic_element(&log_descriptor.unfinished_files,
+                           i - 1, struct st_file_counter *),
+           sizeof(struct st_file_counter));
+  }
+  memcpy(dynamic_element(&log_descriptor.unfinished_files,
+                         place + 1, struct st_file_counter *),
+         &fc, sizeof(struct st_file_counter));
+end:
+  pthread_mutex_unlock(&log_descriptor.unfinished_files_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+ @brief remove file mark "in progress" (for multi-group records)
+
+ @param file log file number
+*/
+
+static void translog_mark_file_finished(uint32 file)
+{
+  int i;
+  struct st_file_counter *fc_ptr;
+  DBUG_ENTER("translog_mark_file_finished");
+  DBUG_PRINT("enter", ("file: %lu", (ulong) file));
+
+  LINT_INIT(fc_ptr);
+
+  pthread_mutex_lock(&log_descriptor.unfinished_files_lock);
+
+  /* the file must have been marked unfinished before */
+  DBUG_ASSERT(log_descriptor.unfinished_files.elements > 0);
+  for (i= 0;
+       i < (int) log_descriptor.unfinished_files.elements;
+       i++)
+  {
+    fc_ptr= dynamic_element(&log_descriptor.unfinished_files,
+                            i, struct st_file_counter *);
+    if (fc_ptr->file == file)
+    {
+      break;
+    }
+  }
+  DBUG_ASSERT(i < (int) log_descriptor.unfinished_files.elements);
+
+  /* drop the entry once its last in-progress write has finished */
+  if (! --fc_ptr->counter)
+    delete_dynamic_element(&log_descriptor.unfinished_files, i);
+  pthread_mutex_unlock(&log_descriptor.unfinished_files_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+ @brief get max LSN of the record which parts stored in this file
+
+ @param file file number
+
+ @return requested LSN or LSN_IMPOSSIBLE/LSN_ERROR
+ @retval LSN_IMPOSSIBLE File is still not finished
+ @retval LSN_ERROR Error opening file
+ @retval # LSN of the record which parts stored in this file
+*/
+
+LSN translog_get_file_max_lsn_stored(uint32 file)
+{
+  uint32 limit= FILENO_IMPOSSIBLE;
+  DBUG_ENTER("translog_get_file_max_lsn_stored");
+  DBUG_PRINT("enter", ("file: %lu", (ulong)file));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  pthread_mutex_lock(&log_descriptor.unfinished_files_lock);
+
+  /* find file with minimum file number "in progress" */
+  if (log_descriptor.unfinished_files.elements > 0)
+  {
+    struct st_file_counter *fc_ptr;
+    fc_ptr= dynamic_element(&log_descriptor.unfinished_files,
+                            0, struct st_file_counter *);
+    limit= fc_ptr->file; /* minimal file number "in progress" */
+  }
+  pthread_mutex_unlock(&log_descriptor.unfinished_files_lock);
+
+  /*
+    if there is no "in progress file" then unfinished file is in progress
+    for sure
+  */
+  if (limit == FILENO_IMPOSSIBLE)
+  {
+    TRANSLOG_ADDRESS horizon= translog_get_horizon();
+    limit= LSN_FILE_NO(horizon);
+  }
+
+  /* files at or beyond the limit have no final max LSN yet */
+  if (file >= limit)
+  {
+    DBUG_PRINT("info", ("The file in in progress"));
+    DBUG_RETURN(LSN_IMPOSSIBLE);
+  }
+
+  /* finished file: read the max LSN back from its header on disk */
+  {
+    LOGHANDLER_FILE_INFO info;
+    File fd;
+    LINT_INIT_STRUCT(info);
+    fd= open_logfile_by_number_no_cache(file);
+    if ((fd < 0) ||
+        (translog_read_file_header(&info, fd) | my_close(fd, MYF(MY_WME))))
+    {
+      DBUG_PRINT("error", ("Can't read file header"));
+      DBUG_RETURN(LSN_ERROR);
+    }
+    DBUG_PRINT("info", ("Max lsn: (%lu,0x%lx)",
+                        LSN_IN_PARTS(info.max_lsn)));
+    DBUG_RETURN(info.max_lsn);
+  }
+}
+
+/*
+ Initialize transaction log file buffer
+
+ SYNOPSIS
+ translog_buffer_init()
+ buffer The buffer to initialize
+ num Number of this buffer
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num)
+{
+  DBUG_ENTER("translog_buffer_init");
+  /* no record has been written through this buffer yet */
+  buffer->pre_force_close_horizon=
+    buffer->prev_last_lsn= buffer->last_lsn=
+    LSN_IMPOSSIBLE;
+  DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
+                      (ulong) buffer));
+
+  buffer->buffer_no= (uint8) num;
+  /* This Buffer File */
+  buffer->file= NULL;
+  buffer->overlay= 0;
+  /* cache for current log */
+  memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER);
+  /* Buffer size */
+  buffer->size= 0;
+  buffer->skipped_data= 0;
+  /* cond of thread which is waiting for buffer filling */
+  if (pthread_cond_init(&buffer->waiting_filling_buffer, 0))
+    DBUG_RETURN(1);
+  /* Number of records which are in copy progress */
+  buffer->copy_to_buffer_in_progress= 0;
+  /* list of waiting buffer ready threads */
+  buffer->waiting_flush= 0;
+  /*
+    Buffers locked by fallowing mutex. As far as buffers create logical
+    circle (after last buffer goes first) it trigger false alarm of deadlock
+    detect system, so we remove check of deadlock for this buffers. In deed
+    all mutex locks concentrated around current buffer except flushing
+    thread (but it is only one thread). One thread can't take more then
+    2 buffer locks at once. So deadlock is impossible here.
+
+    To prevent false alarm of dead lock detection we switch dead lock
+    detection for one buffer in the middle of the buffers chain. Excluding
+    only one of eight buffers from deadlock detection hardly can hide other
+    possible problems which include this mutexes.
+  */
+  if (my_pthread_mutex_init(&buffer->mutex, MY_MUTEX_INIT_FAST,
+                            "translog_buffer->mutex",
+                            (num == TRANSLOG_BUFFERS_NO - 2 ?
+                             MYF_NO_DEADLOCK_DETECTION : 0)) ||
+      pthread_cond_init(&buffer->prev_sent_to_disk_cond, 0))
+    DBUG_RETURN(1);
+  buffer->is_closing_buffer= 0;
+  buffer->prev_sent_to_disk= LSN_IMPOSSIBLE;
+  buffer->prev_buffer_offset= LSN_IMPOSSIBLE;
+  buffer->ver= 0;
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Close a transaction log file by its descriptor.
+
+  Flushes the file's pages out of the page cache, syncs the file if it is
+  not already known to be synced, closes it and frees the descriptor.
+
+  @param file pagegecache file descriptor reference
+
+  @return Operation status
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_close_log_file(TRANSLOG_FILE *file)
+{
+  int res= 0;
+  flush_pagecache_blocks(log_descriptor.pagecache, &file->handler,
+                         FLUSH_RELEASE);
+  /*
+    Sync file when we close it
+    TODO: sync only we have changed the log
+  */
+  if (!file->is_sync)
+  {
+    res= my_sync(file->handler.file, MYF(MY_WME));
+    translog_syncs++;
+  }
+  if (my_close(file->handler.file, MYF(MY_WME)))
+    res= 1;
+  my_free(file, MYF(0));
+  return test(res);
+}
+
+
+/**
+  @brief No-op write-failure callback.
+
+  The transaction log writes its pages to disk itself (the pagecache is
+  not used for writing here), so there is nothing to do on write failure.
+*/
+
+void translog_dummy_write_failure(uchar *data __attribute__((unused)))
+{
+}
+
+
+/**
+  @brief Initializes a TRANSLOG_FILE structure.
+
+  Registers the file with the pagecache using log-specific callbacks
+  (page validation on read; dummy write callbacks, since the log writes
+  its own pages).
+
+  @param file    reference on the file to initialize
+  @param number  file number
+  @param is_sync is file synced on disk
+*/
+
+static void translog_file_init(TRANSLOG_FILE *file, uint32 number,
+                               my_bool is_sync)
+{
+  file->number= number;
+  file->was_recovered= 0;
+  file->is_sync= is_sync;
+  pagecache_file_init(file->handler, &translog_page_validator,
+                      &translog_dummy_callback,
+                      &translog_dummy_write_failure,
+                      maria_flush_log_for_page_none, file);
+}
+
+
+/**
+  @brief Create and fill header of new file.
+
+  Finalizes the header of the previous (now finished) file, grows the
+  open-files array, creates and registers the new file, writes its header
+  and records the new file number in the control file.
+
+  @note the caller must call it right after it has increased
+  log_descriptor.horizon to the new file
+  (log_descriptor.horizon+= LSN_ONE_FILE)
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_create_new_file()
+{
+  TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(sizeof(TRANSLOG_FILE),
+                                                 MYF(0));
+
+  TRANSLOG_FILE *old= get_current_logfile();
+  uint32 file_no= LSN_FILE_NO(log_descriptor.horizon);
+  DBUG_ENTER("translog_create_new_file");
+
+  if (file == NULL)
+    goto error;
+
+  /*
+    Writes max_lsn to the file header before finishing it (there is no need
+    to lock file header buffer because it is still unfinished file, so only
+    one thread can finish the file and nobody interested of LSN of current
+    (unfinished) file, because no one can purge it).
+  */
+  if (translog_max_lsn_to_header(old->handler.file, log_descriptor.max_lsn))
+    goto error;
+
+  rw_wrlock(&log_descriptor.open_files_lock);
+  DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+              log_descriptor.open_files.elements);
+  DBUG_ASSERT(file_no == log_descriptor.max_file + 1);
+  if (allocate_dynamic(&log_descriptor.open_files,
+                       log_descriptor.max_file - log_descriptor.min_file + 2))
+    goto error_lock;
+  if ((file->handler.file=
+       create_logfile_by_number_no_cache(file_no)) == -1)
+    goto error_lock;
+  translog_file_init(file, file_no, 0);
+
+  /* this call just expand the array */
+  insert_dynamic(&log_descriptor.open_files, (uchar*)&file);
+  log_descriptor.max_file++;
+  {
+    char *start= (char*) dynamic_element(&log_descriptor.open_files, 0,
+                                         TRANSLOG_FILE**);
+    memmove(start + sizeof(TRANSLOG_FILE*), start,
+            sizeof(TRANSLOG_FILE*) *
+            (log_descriptor.max_file - log_descriptor.min_file + 1 - 1));
+  }
+  /* can't fail we because we expanded array */
+  set_dynamic(&log_descriptor.open_files, (uchar*)&file, 0);
+  DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+              log_descriptor.open_files.elements);
+  rw_unlock(&log_descriptor.open_files_lock);
+
+  DBUG_PRINT("info", ("file_no: %lu", (ulong)file_no));
+
+  if (translog_write_file_header())
+    DBUG_RETURN(1);
+
+  if (ma_control_file_write_and_force(last_checkpoint_lsn, file_no,
+                                      max_trid_in_control_file,
+                                      recovery_failures))
+  {
+    translog_stop_writing();
+    DBUG_RETURN(1);
+  }
+
+  DBUG_RETURN(0);
+
+error_lock:
+  rw_unlock(&log_descriptor.open_files_lock);
+error:
+  /*
+    On these paths 'file' is not yet owned by the open_files array, so it
+    must be freed here to avoid a memory leak.  MY_ALLOW_ZERO_PTR makes
+    the call safe when the allocation itself failed (file == NULL).
+  */
+  my_free(file, MYF(MY_ALLOW_ZERO_PTR));
+  translog_stop_writing();
+  DBUG_RETURN(1);
+}
+
+
+/**
+  @brief Locks the loghandler buffer.
+
+  Blocks until the buffer's mutex is acquired; returns nothing.
+
+  @param buffer This buffer which should be locked
+
+  @note See comment before buffer 'mutex' variable (deadlock-detection
+  exclusion for one buffer of the ring).
+*/
+
+static void translog_buffer_lock(struct st_translog_buffer *buffer)
+{
+ DBUG_ENTER("translog_buffer_lock");
+ DBUG_PRINT("enter",
+ ("Lock buffer #%u: (0x%lx)", (uint) buffer->buffer_no,
+ (ulong) buffer));
+ pthread_mutex_lock(&buffer->mutex);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+  Unlock the loghandler buffer
+
+  SYNOPSIS
+    translog_buffer_unlock()
+    buffer This buffer which should be unlocked
+
+  NOTE
+    The caller must hold the buffer's mutex; no return value.
+*/
+
+static void translog_buffer_unlock(struct st_translog_buffer *buffer)
+{
+ DBUG_ENTER("translog_buffer_unlock");
+ DBUG_PRINT("enter", ("Unlock buffer... #%u (0x%lx)",
+ (uint) buffer->buffer_no, (ulong) buffer));
+
+ pthread_mutex_unlock(&buffer->mutex);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+  Write a header on the page
+
+  SYNOPSIS
+    translog_new_page_header()
+    horizon Where to write the page
+    cursor Where to write the page
+
+  NOTE
+    - space for page header should be checked before
+
+    Header layout: 3-byte page number, 3-byte file number, 1 flag byte,
+    then (depending on flags) a 4-byte CRC slot and/or the per-sector
+    protection table.  Advances horizon, the cursor and the page fill
+    counter by the header size.
+*/
+
+/* Counter producing per-page "random" bytes for sector protection */
+static uchar translog_sector_random;
+
+static void translog_new_page_header(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor)
+{
+ uchar *ptr;
+
+ DBUG_ENTER("translog_new_page_header");
+ DBUG_ASSERT(cursor->ptr);
+
+ cursor->protected= 0;
+
+ ptr= cursor->ptr;
+ /* Page number */
+ int3store(ptr, LSN_OFFSET(*horizon) / TRANSLOG_PAGE_SIZE);
+ ptr+= 3;
+ /* File number */
+ int3store(ptr, LSN_FILE_NO(*horizon));
+ ptr+= 3;
+ DBUG_ASSERT(TRANSLOG_PAGE_FLAGS == (ptr - cursor->ptr));
+ cursor->ptr[TRANSLOG_PAGE_FLAGS]= (uchar) log_descriptor.flags;
+ ptr++;
+ if (log_descriptor.flags & TRANSLOG_PAGE_CRC)
+ {
+#ifndef DBUG_OFF
+ DBUG_PRINT("info", ("write 0x11223344 CRC to (%lu,0x%lx)",
+ LSN_IN_PARTS(*horizon)));
+ /* This will be overwritten by real CRC; This is just for debugging */
+ int4store(ptr, 0x11223344);
+#endif
+ /* CRC will be put when page is finished */
+ ptr+= CRC_SIZE;
+ }
+ if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION)
+ {
+ /*
+ translog_sector_random works like a "random" values producer because
+ it is enough to have such "random" for this purpose and it will
+ not interfere with higher level pseudo random value generator
+ */
+ ptr[0]= translog_sector_random++;
+ /* reserve room for the sector protection table */
+ ptr+= TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+ }
+ {
+ /* account the header bytes in horizon, page fill and buffer size */
+ uint len= (ptr - cursor->ptr);
+ (*horizon)+= len; /* increasing the offset part of the address */
+ cursor->current_page_fill= len;
+ if (!cursor->chaser)
+ cursor->buffer->size+= len;
+ }
+ cursor->ptr= ptr;
+ DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx chaser: %d Size: %lu (%lu) "
+ "Horizon: (%lu,0x%lx)",
+ (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+ cursor->chaser, (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer),
+ LSN_IN_PARTS(*horizon)));
+ translog_check_cursor(cursor);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+  Put sector protection on the page image
+
+  SYNOPSIS
+    translog_put_sector_protection()
+    page reference on the page content
+    cursor cursor of the buffer
+
+  NOTES
+    We put a sector protection on all following sectors on the page,
+    except the first sector that is protected by page header.
+
+    The original first byte of each protected sector is saved into the
+    protection table inside the page header, and replaced by a per-write
+    counter value, so that torn sector writes can be detected later.
+*/
+
+static void translog_put_sector_protection(uchar *page,
+ struct st_buffer_cursor *cursor)
+{
+ /* protection table sits at the end of the page header */
+ uchar *table= page + log_descriptor.page_overhead -
+ TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+ uint i, offset;
+ uint16 last_protected_sector= ((cursor->previous_offset - 1) /
+ DISK_DRIVE_SECTOR_SIZE);
+ uint16 start_sector= cursor->previous_offset / DISK_DRIVE_SECTOR_SIZE;
+ /* byte value stamped into the first byte of each sector this round */
+ uint8 value= table[0] + cursor->write_counter;
+ DBUG_ENTER("translog_put_sector_protection");
+
+ if (start_sector == 0)
+ {
+ /* First sector is protected by file & page numbers in the page header. */
+ start_sector= 1;
+ }
+
+ DBUG_PRINT("enter", ("Write counter:%u value:%u offset:%u, "
+ "last protected:%u start sector:%u",
+ (uint) cursor->write_counter,
+ (uint) value,
+ (uint) cursor->previous_offset,
+ (uint) last_protected_sector, (uint) start_sector));
+ if (last_protected_sector == start_sector)
+ {
+ i= last_protected_sector;
+ offset= last_protected_sector * DISK_DRIVE_SECTOR_SIZE;
+ /* restore data, because we modified sector which was protected */
+ if (offset < cursor->previous_offset)
+ page[offset]= table[i];
+ }
+ for (i= start_sector, offset= start_sector * DISK_DRIVE_SECTOR_SIZE;
+ i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+ i++, (offset+= DISK_DRIVE_SECTOR_SIZE))
+ {
+ DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x",
+ i, offset, (uint) page[offset]));
+ /* save the sector's original first byte, then stamp the counter value */
+ table[i]= page[offset];
+ page[offset]= value;
+ DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x",
+ i, offset, (uint) page[offset]));
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Calculate CRC32 of a given area.
+
+  @param area   Pointer of the area beginning
+  @param length The Area length
+
+  @return CRC32 of the area
+*/
+
+static uint32 translog_crc(uchar *area, uint length)
+{
+  uint32 checksum;
+  DBUG_ENTER("translog_crc");
+  checksum= crc32(0L, (unsigned char*) area, length);
+  DBUG_RETURN(checksum);
+}
+
+
+/*
+  Finish current page with zeros
+
+  SYNOPSIS
+    translog_finish_page()
+    horizon \ horizon & buffer pointers
+    cursor /
+
+  NOTE
+    Pads the unused remainder of the page with TRANSLOG_FILLER, then
+    applies sector protection and/or the page CRC according to the
+    loghandler flags.  Idempotent: an already protected page is left
+    untouched.
+*/
+
+static void translog_finish_page(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor)
+{
+ uint16 left= TRANSLOG_PAGE_SIZE - cursor->current_page_fill;
+ uchar *page= cursor->ptr - cursor->current_page_fill;
+ DBUG_ENTER("translog_finish_page");
+ DBUG_PRINT("enter", ("Buffer: #%u 0x%lx "
+ "Buffer addr: (%lu,0x%lx) "
+ "Page addr: (%lu,0x%lx) "
+ "size:%lu (%lu) Pg:%u left:%u",
+ (uint) cursor->buffer_no, (ulong) cursor->buffer,
+ LSN_IN_PARTS(cursor->buffer->offset),
+ (ulong) LSN_FILE_NO(*horizon),
+ (ulong) (LSN_OFFSET(*horizon) -
+ cursor->current_page_fill),
+ (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr -cursor->buffer->buffer),
+ (uint) cursor->current_page_fill, (uint) left));
+ DBUG_ASSERT(LSN_FILE_NO(*horizon) == LSN_FILE_NO(cursor->buffer->offset));
+ translog_check_cursor(cursor);
+ if (cursor->protected)
+ {
+ DBUG_PRINT("info", ("Already protected and finished"));
+ DBUG_VOID_RETURN;
+ }
+ cursor->protected= 1;
+
+ DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
+ if (left != 0)
+ {
+ /* pad the rest of the page with the filler byte */
+ DBUG_PRINT("info", ("left: %u", (uint) left));
+ memset(cursor->ptr, TRANSLOG_FILLER, left);
+ cursor->ptr+= left;
+ (*horizon)+= left; /* offset increasing */
+ if (!cursor->chaser)
+ cursor->buffer->size+= left;
+ /* We are finishing the page so reset the counter */
+ cursor->current_page_fill= 0;
+ DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
+ "chaser: %d Size: %lu (%lu)",
+ (uint) cursor->buffer->buffer_no,
+ (ulong) cursor->buffer, cursor->chaser,
+ (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer)));
+ translog_check_cursor(cursor);
+ }
+ /*
+ When we are finishing the page other thread might not finish the page
+ header yet (in case if we started from the middle of the page) so we
+ have to read log_descriptor.flags but not the flags from the page.
+ */
+ if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION)
+ {
+ translog_put_sector_protection(page, cursor);
+ DBUG_PRINT("info", ("drop write_counter"));
+ cursor->write_counter= 0;
+ cursor->previous_offset= 0;
+ }
+ if (log_descriptor.flags & TRANSLOG_PAGE_CRC)
+ {
+ /* CRC covers everything after the page header */
+ uint32 crc= translog_crc(page + log_descriptor.page_overhead,
+ TRANSLOG_PAGE_SIZE -
+ log_descriptor.page_overhead);
+ DBUG_PRINT("info", ("CRC: %lx", (ulong) crc));
+ /* We have page number, file number and flag before crc */
+ int4store(page + 3 + 3 + 1, crc);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Wait until all threads have finished closing this buffer.
+
+  @param buffer This buffer should be check (must be locked by the caller)
+*/
+
+static void translog_wait_for_closing(struct st_translog_buffer *buffer)
+{
+  DBUG_ENTER("translog_wait_for_closing");
+  DBUG_PRINT("enter", ("Buffer #%u 0x%lx copies in progress: %u "
+                       "is closing %u File: %d size: %lu",
+                       (uint) buffer->buffer_no, (ulong) buffer,
+                       (uint) buffer->copy_to_buffer_in_progress,
+                       (uint) buffer->is_closing_buffer,
+                       (buffer->file ? buffer->file->handler.file : -1),
+                       (ulong) buffer->size));
+  translog_buffer_lock_assert_owner(buffer);
+
+  for (;;)
+  {
+    if (!buffer->is_closing_buffer)
+      break;
+    DBUG_PRINT("info", ("wait for writers... buffer: #%u 0x%lx",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+    DBUG_ASSERT(buffer->file != NULL);
+    /* releases the buffer mutex while sleeping, reacquires before return */
+    pthread_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex);
+    DBUG_PRINT("info", ("wait for writers done buffer: #%u 0x%lx",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+  }
+
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Wait until all threads have finished filling this buffer.
+
+  @param buffer This buffer should be check (must be locked by the caller)
+*/
+
+static void translog_wait_for_writers(struct st_translog_buffer *buffer)
+{
+  DBUG_ENTER("translog_wait_for_writers");
+  DBUG_PRINT("enter", ("Buffer #%u 0x%lx copies in progress: %u "
+                       "is closing %u File: %d size: %lu",
+                       (uint) buffer->buffer_no, (ulong) buffer,
+                       (uint) buffer->copy_to_buffer_in_progress,
+                       (uint) buffer->is_closing_buffer,
+                       (buffer->file ? buffer->file->handler.file : -1),
+                       (ulong) buffer->size));
+  translog_buffer_lock_assert_owner(buffer);
+
+  for (;;)
+  {
+    if (!buffer->copy_to_buffer_in_progress)
+      break;
+    DBUG_PRINT("info", ("wait for writers... buffer: #%u 0x%lx",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+    DBUG_ASSERT(buffer->file != NULL);
+    /* releases the buffer mutex while sleeping, reacquires before return */
+    pthread_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex);
+    DBUG_PRINT("info", ("wait for writers done buffer: #%u 0x%lx",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+  }
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Wait for buffer to become free
+
+  SYNOPSIS
+    translog_wait_for_buffer_free()
+    buffer The buffer we are waiting for
+
+  NOTE
+    - this buffer should be locked
+    - the (offset, file, ver) snapshot taken on entry lets us detect that
+      another thread already freed and reused the buffer while we slept
+      in translog_wait_for_writers(), in which case we return at once.
+*/
+
+static void translog_wait_for_buffer_free(struct st_translog_buffer *buffer)
+{
+ TRANSLOG_ADDRESS offset= buffer->offset;
+ TRANSLOG_FILE *file= buffer->file;
+ uint8 ver= buffer->ver;
+ DBUG_ENTER("translog_wait_for_buffer_free");
+ DBUG_PRINT("enter", ("Buffer #%u 0x%lx copies in progress: %u "
+ "is closing %u File: %d size: %lu",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (uint) buffer->copy_to_buffer_in_progress,
+ (uint) buffer->is_closing_buffer,
+ (buffer->file ? buffer->file->handler.file : -1),
+ (ulong) buffer->size));
+
+ translog_wait_for_writers(buffer);
+
+ if (offset != buffer->offset || file != buffer->file || ver != buffer->ver)
+ DBUG_VOID_RETURN; /* the buffer is already freed */
+
+ /* buffer->file becomes NULL when the buffer is freed */
+ while (buffer->file != NULL)
+ {
+ DBUG_PRINT("info", ("wait for writers... buffer: #%u 0x%lx",
+ (uint) buffer->buffer_no, (ulong) buffer));
+ pthread_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex);
+ DBUG_PRINT("info", ("wait for writers done. buffer: #%u 0x%lx",
+ (uint) buffer->buffer_no, (ulong) buffer));
+ }
+ DBUG_ASSERT(buffer->copy_to_buffer_in_progress == 0);
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Initialize the cursor for a buffer.
+
+  @param cursor    It's cursor
+  @param buffer    The buffer
+  @param buffer_no Number of buffer
+*/
+
+static void translog_cursor_init(struct st_buffer_cursor *cursor,
+                                 struct st_translog_buffer *buffer,
+                                 uint8 buffer_no)
+{
+  DBUG_ENTER("translog_cursor_init");
+  cursor->buffer= buffer;
+  cursor->buffer_no= buffer_no;
+  cursor->ptr= buffer->buffer;
+  cursor->current_page_fill= 0;
+  cursor->write_counter= 0;
+  cursor->previous_offset= 0;
+  cursor->protected= 0;
+  /* only the main cursor (log_descriptor.bc) is not a chaser */
+  cursor->chaser= (cursor != &log_descriptor.bc);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  @brief Initialize buffer for the current file, and a cursor for this buffer.
+
+  @param buffer The buffer
+  @param cursor It's cursor
+  @param buffer_no Number of buffer
+
+  @note Binds the buffer to the current log file at the current horizon
+  and marks it dirty in the dirty-buffer mask.
+*/
+
+static void translog_start_buffer(struct st_translog_buffer *buffer,
+ struct st_buffer_cursor *cursor,
+ uint buffer_no)
+{
+ DBUG_ENTER("translog_start_buffer");
+ DBUG_PRINT("enter",
+ ("Assign buffer: #%u (0x%lx) offset: 0x%lx(%lu)",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (ulong) LSN_OFFSET(log_descriptor.horizon),
+ (ulong) LSN_OFFSET(log_descriptor.horizon)));
+ DBUG_ASSERT(buffer_no == buffer->buffer_no);
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
+ (ulong) buffer));
+ /* the buffer starts at the current horizon in the current log file */
+ buffer->offset= log_descriptor.horizon;
+ buffer->next_buffer_offset= LSN_IMPOSSIBLE;
+ buffer->file= get_current_logfile();
+ buffer->overlay= 0;
+ buffer->size= 0;
+ buffer->skipped_data= 0;
+ translog_cursor_init(cursor, buffer, buffer_no);
+ DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: 0x%lx "
+ "chaser: %d Size: %lu (%lu)",
+ (long) (buffer->file ? buffer->file->number : 0),
+ (buffer->file ? buffer->file->handler.file : -1),
+ (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+ cursor->chaser, (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer)));
+ translog_check_cursor(cursor);
+ /* mark this buffer as containing data not yet flushed to disk */
+ pthread_mutex_lock(&log_descriptor.dirty_buffer_mask_lock);
+ log_descriptor.dirty_buffer_mask|= (1 << buffer->buffer_no);
+ pthread_mutex_unlock(&log_descriptor.dirty_buffer_mask_lock);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+  @brief Switch to the next buffer in a chain.
+
+  @param horizon \ Pointers on current position in file and buffer
+  @param cursor /
+  @param new_file Also start new file
+
+  @note
+  - loghandler should be locked
+  - after return new and old buffer still are locked
+  - a "chaser" cursor only re-targets its cursor to the next buffer; the
+    main cursor also starts (initializes and binds) the new buffer and,
+    when new_file is set, creates the next log file.
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_buffer_next(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor,
+ my_bool new_file)
+{
+ uint old_buffer_no= cursor->buffer_no;
+ uint new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+ struct st_translog_buffer *new_buffer= log_descriptor.buffers + new_buffer_no;
+ my_bool chasing= cursor->chaser;
+ DBUG_ENTER("translog_buffer_next");
+
+ DBUG_PRINT("info", ("horizon: (%lu,0x%lx) chasing: %d",
+ LSN_IN_PARTS(log_descriptor.horizon), chasing));
+
+ DBUG_ASSERT(cmp_translog_addr(log_descriptor.horizon, *horizon) >= 0);
+
+ /* pad/protect the page we are leaving */
+ translog_finish_page(horizon, cursor);
+
+ if (!chasing)
+ {
+ translog_buffer_lock(new_buffer);
+#ifndef DBUG_OFF
+ {
+ TRANSLOG_ADDRESS offset= new_buffer->offset;
+ TRANSLOG_FILE *file= new_buffer->file;
+ uint8 ver= new_buffer->ver;
+ translog_lock_assert_owner();
+#endif
+ translog_wait_for_buffer_free(new_buffer);
+#ifndef DBUG_OFF
+ /* We keep the handler locked so nobody can start this new buffer */
+ DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL &&
+ (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver);
+ }
+#endif
+ }
+ else
+ DBUG_ASSERT(new_buffer->file != NULL);
+
+ if (new_file)
+ {
+ /* move the horizon to the next file and its header page */
+ (*horizon)+= LSN_ONE_FILE;
+ (*horizon)= LSN_REPLACE_OFFSET(*horizon, TRANSLOG_PAGE_SIZE);
+ if (!chasing && translog_create_new_file())
+ {
+ DBUG_RETURN(1);
+ }
+ }
+
+ /* prepare next page */
+ if (chasing)
+ translog_cursor_init(cursor, new_buffer, new_buffer_no);
+ else
+ {
+ translog_lock_assert_owner();
+ translog_start_buffer(new_buffer, cursor, new_buffer_no);
+ /* link the new buffer back to its predecessor */
+ new_buffer->prev_buffer_offset=
+ log_descriptor.buffers[old_buffer_no].offset;
+ new_buffer->prev_last_lsn=
+ BUFFER_MAX_LSN(log_descriptor.buffers + old_buffer_no);
+ }
+ log_descriptor.buffers[old_buffer_no].next_buffer_offset= new_buffer->offset;
+ DBUG_PRINT("info", ("prev_last_lsn set to (%lu,0x%lx) buffer: 0x%lx",
+ LSN_IN_PARTS(new_buffer->prev_last_lsn),
+ (ulong) new_buffer));
+ translog_new_page_header(horizon, cursor);
+ DBUG_RETURN(0);
+}
+
+
+/*
+  Sets max LSN sent to file, and address from which data is only in the buffer
+
+  SYNOPSIS
+    translog_set_sent_to_disk()
+    buffer buffer which we have sent to disk
+
+  TODO: use atomic operations if possible (64bit architectures?)
+*/
+
+static void translog_set_sent_to_disk(struct st_translog_buffer *buffer)
+{
+  LSN lsn= buffer->last_lsn;
+  TRANSLOG_ADDRESS in_buffers= buffer->next_buffer_offset;
+
+  DBUG_ENTER("translog_set_sent_to_disk");
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  /*
+    Fixed: the argument order previously did not match the format labels
+    (log_start was printed under "in_buffers_only" and vice versa).
+  */
+  DBUG_PRINT("enter", ("lsn: (%lu,0x%lx) in_buffers: (%lu,0x%lx) "
+                       "in_buffers_only: (%lu,0x%lx) start: (%lu,0x%lx) "
+                       "sent_to_disk: (%lu,0x%lx)",
+                       LSN_IN_PARTS(lsn),
+                       LSN_IN_PARTS(in_buffers),
+                       LSN_IN_PARTS(log_descriptor.in_buffers_only),
+                       LSN_IN_PARTS(log_descriptor.log_start),
+                       LSN_IN_PARTS(log_descriptor.sent_to_disk)));
+  /*
+    We write sequentially (first part of following assert) but we rewrite
+    the same page in case we started mysql and shut it down immediately
+    (second part of the following assert)
+  */
+  DBUG_ASSERT(cmp_translog_addr(lsn, log_descriptor.sent_to_disk) >= 0 ||
+              cmp_translog_addr(lsn, log_descriptor.log_start) < 0);
+  log_descriptor.sent_to_disk= lsn;
+  /* LSN_IMPOSSIBLE == 0 => it will work for very first time */
+  if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0)
+  {
+    log_descriptor.in_buffers_only= in_buffers;
+    DBUG_PRINT("info", ("set new in_buffers_only"));
+  }
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Sets address from which data is only in the buffer
+
+  SYNOPSIS
+    translog_set_only_in_buffers()
+    in_buffers to assign to in_buffers_only
+*/
+
+static void translog_set_only_in_buffers(TRANSLOG_ADDRESS in_buffers)
+{
+  DBUG_ENTER("translog_set_only_in_buffers");
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  DBUG_PRINT("enter", ("in_buffers: (%lu,0x%lx) "
+                       "in_buffers_only: (%lu,0x%lx)",
+                       LSN_IN_PARTS(in_buffers),
+                       LSN_IN_PARTS(log_descriptor.in_buffers_only)));
+  /* LSN_IMPOSSIBLE == 0 => it will work for very first time */
+  if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0)
+  {
+    if (translog_status != TRANSLOG_OK)
+    {
+      /*
+        Log writing has been stopped; leave in_buffers_only unchanged but
+        release the mutex before returning.  The previous code returned
+        with sent_to_disk_lock held, which would block every later user
+        of sent_to_disk/in_buffers_only forever.
+      */
+      pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+      DBUG_VOID_RETURN;
+    }
+    log_descriptor.in_buffers_only= in_buffers;
+    DBUG_PRINT("info", ("set new in_buffers_only"));
+  }
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Gets address from which data is only in the buffer.
+
+  @return address from which data is only in the buffer
+*/
+
+static TRANSLOG_ADDRESS translog_only_in_buffers()
+{
+  TRANSLOG_ADDRESS result;
+  DBUG_ENTER("translog_only_in_buffers");
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  result= log_descriptor.in_buffers_only;
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+  DBUG_RETURN(result);
+}
+
+
+/**
+  @brief Get max LSN sent to file.
+
+  @return max LSN send to file
+*/
+
+static LSN translog_get_sent_to_disk()
+{
+  LSN result;
+  DBUG_ENTER("translog_get_sent_to_disk");
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  result= log_descriptor.sent_to_disk;
+  DBUG_PRINT("info", ("sent to disk up to (%lu,0x%lx)", LSN_IN_PARTS(result)));
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+  DBUG_RETURN(result);
+}
+
+
+/*
+  Get first chunk address on the given page
+
+  SYNOPSIS
+    translog_get_first_chunk_offset()
+    page The page where to find first chunk
+
+  RETURN
+    first chunk offset
+*/
+
+static uint16 translog_get_first_chunk_offset(uchar *page)
+{
+  DBUG_ENTER("translog_get_first_chunk_offset");
+  DBUG_ASSERT(page[TRANSLOG_PAGE_FLAGS] < TRANSLOG_FLAGS_NUM);
+  /*
+    The first chunk starts right after the page header, whose size depends
+    on the page flags (CRC / sector protection).  The function returns an
+    offset, so uint16 is the correct return type; the previous my_bool
+    (char) declaration mis-typed the value.
+  */
+  DBUG_RETURN(page_overhead[page[TRANSLOG_PAGE_FLAGS]]);
+}
+
+
+/*
+  Write coded length of record
+
+  SYNOPSIS
+    translog_write_variable_record_1group_code_len
+    dst Destination buffer pointer
+    length Length which should be coded
+    header_len Calculated total header length
+
+  NOTE
+    Lengths up to 250 are stored in one byte; larger values use a marker
+    byte (251/252/253) followed by a 2/3/4 byte little-endian length.
+*/
+
+static void
+translog_write_variable_record_1group_code_len(uchar *dst,
+                                               translog_size_t length,
+                                               uint16 header_len)
+{
+  if (header_len == 6) /* (5 + 1) */
+  {
+    DBUG_ASSERT(length <= 250);
+    *dst= (uint8) length;
+  }
+  else if (header_len == 8) /* (5 + 3) */
+  {
+    DBUG_ASSERT(length <= 0xFFFF);
+    *dst= 251;
+    int2store(dst + 1, length);
+  }
+  else if (header_len == 9) /* (5 + 4) */
+  {
+    DBUG_ASSERT(length <= (ulong) 0xFFFFFF);
+    *dst= 252;
+    int3store(dst + 1, length);
+  }
+  else if (header_len == 10) /* (5 + 5) */
+  {
+    *dst= 253;
+    int4store(dst + 1, length);
+  }
+  else
+  {
+    DBUG_ASSERT(0);
+  }
+}
+
+
+/*
+  Decode record data length and advance given pointer to the next field
+
+  SYNOPSIS
+    translog_variable_record_1group_decode_len()
+    src The pointer to the pointer to the length beginning
+
+  RETURN
+    decoded length
+*/
+
+static translog_size_t translog_variable_record_1group_decode_len(uchar **src)
+{
+  uint8 marker= (uint8) (**src);
+  /* 0..250: the byte itself is the length */
+  if (marker <= 250)
+  {
+    (*src)++;
+    return (marker);
+  }
+  if (marker == 251)
+  {
+    (*src)+= 3;
+    return (uint2korr((*src) - 2));
+  }
+  if (marker == 252)
+  {
+    (*src)+= 4;
+    return (uint3korr((*src) - 3));
+  }
+  if (marker == 253)
+  {
+    (*src)+= 5;
+    return (uint4korr((*src) - 4));
+  }
+  /* 254 and 255 are reserved for future use */
+  DBUG_ASSERT(0);
+  return (0);
+}
+
+
+/*
+  Get total length of this chunk (not only body)
+
+  SYNOPSIS
+    translog_get_total_chunk_length()
+    page The page where chunk placed
+    offset Offset of the chunk on this place
+
+  RETURN
+    total length of the chunk
+*/
+
+static uint16 translog_get_total_chunk_length(uchar *page, uint16 offset)
+{
+ DBUG_ENTER("translog_get_total_chunk_length");
+ switch (page[offset] & TRANSLOG_CHUNK_TYPE) {
+ case TRANSLOG_CHUNK_LSN:
+ {
+ /* 0 chunk referred as LSN (head or tail) */
+ translog_size_t rec_len;
+ uchar *start= page + offset;
+ uchar *ptr= start + 1 + 2; /* chunk type and short trid */
+ uint16 chunk_len, header_len, page_rest;
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN"));
+ rec_len= translog_variable_record_1group_decode_len(&ptr);
+ chunk_len= uint2korr(ptr);
+ header_len= (uint16) (ptr -start) + 2;
+ DBUG_PRINT("info", ("rec len: %lu chunk len: %u header len: %u",
+ (ulong) rec_len, (uint) chunk_len, (uint) header_len));
+ if (chunk_len)
+ {
+ /* explicit chunk length is stored (multi-group record head) */
+ DBUG_PRINT("info", ("chunk len: %u + %u = %u",
+ (uint) header_len, (uint) chunk_len,
+ (uint) (chunk_len + header_len)));
+ DBUG_RETURN(chunk_len + header_len);
+ }
+ /* chunk_len == 0: record fits on the page or fills it to the end */
+ page_rest= TRANSLOG_PAGE_SIZE - offset;
+ DBUG_PRINT("info", ("page_rest %u", (uint) page_rest));
+ if (rec_len + header_len < page_rest)
+ DBUG_RETURN(rec_len + header_len);
+ DBUG_RETURN(page_rest);
+ }
+ case TRANSLOG_CHUNK_FIXED:
+ {
+ uchar *ptr;
+ uint type= page[offset] & TRANSLOG_REC_TYPE;
+ uint length;
+ int i;
+ /* 1 (pseudo)fixed record (also LSN) */
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED"));
+ DBUG_ASSERT(log_record_type_descriptor[type].rclass ==
+ LOGRECTYPE_FIXEDLENGTH ||
+ log_record_type_descriptor[type].rclass ==
+ LOGRECTYPE_PSEUDOFIXEDLENGTH);
+ if (log_record_type_descriptor[type].rclass == LOGRECTYPE_FIXEDLENGTH)
+ {
+ DBUG_PRINT("info",
+ ("Fixed length: %u",
+ (uint) (log_record_type_descriptor[type].fixed_length + 3)));
+ DBUG_RETURN(log_record_type_descriptor[type].fixed_length + 3);
+ }
+
+ /* pseudo-fixed: account for compressed LSNs stored in the body */
+ ptr= page + offset + 3; /* first compressed LSN */
+ length= log_record_type_descriptor[type].fixed_length + 3;
+ for (i= 0; i < log_record_type_descriptor[type].compressed_LSN; i++)
+ {
+ /* first 2 bits is length - 2 */
+ uint len= (((uint8) (*ptr)) >> 6) + 2;
+ if (ptr[0] == 0 && ((uint8) ptr[1]) == 1)
+ len+= LSN_STORE_SIZE; /* case of full LSN storing */
+ ptr+= len;
+ /* subtract saved bytes */
+ length-= (LSN_STORE_SIZE - len);
+ }
+ DBUG_PRINT("info", ("Pseudo-fixed length: %u", length));
+ DBUG_RETURN(length);
+ }
+ case TRANSLOG_CHUNK_NOHDR:
+ /* 2 no header chunk (till page end) */
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR length: %u",
+ (uint) (TRANSLOG_PAGE_SIZE - offset)));
+ DBUG_RETURN(TRANSLOG_PAGE_SIZE - offset);
+ case TRANSLOG_CHUNK_LNGTH: /* 3 chunk with chunk length */
+ DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH"));
+ DBUG_ASSERT(TRANSLOG_PAGE_SIZE - offset >= 3);
+ DBUG_PRINT("info", ("length: %u", uint2korr(page + offset + 1) + 3));
+ DBUG_RETURN(uint2korr(page + offset + 1) + 3);
+ default:
+ DBUG_ASSERT(0);
+ DBUG_RETURN(0);
+ }
+}
+
+/*
+  @brief Waits previous buffer flush finish
+
+  @param buffer buffer for check (must be locked by the caller)
+
+  @retval 0 previous buffer flushed and this thread have to flush this one
+            (buffer is still locked)
+  @retval 1 previous buffer flushed and this buffer flushed by other thread
+            too (NOTE: the buffer has been UNLOCKED before returning)
+*/
+
+my_bool translog_prev_buffer_flush_wait(struct st_translog_buffer *buffer)
+{
+ /* snapshot to detect buffer reuse while we sleep on the condition */
+ TRANSLOG_ADDRESS offset= buffer->offset;
+ TRANSLOG_FILE *file= buffer->file;
+ uint8 ver= buffer->ver;
+ DBUG_ENTER("translog_prev_buffer_flush_wait");
+ DBUG_PRINT("enter", ("buffer: 0x%lx #%u offset: (%lu,0x%lx) "
+ "prev sent: (%lu,0x%lx) prev offset: (%lu,0x%lx)",
+ (ulong) buffer, (uint) buffer->buffer_no,
+ LSN_IN_PARTS(buffer->offset),
+ LSN_IN_PARTS(buffer->prev_sent_to_disk),
+ LSN_IN_PARTS(buffer->prev_buffer_offset)));
+ translog_buffer_lock_assert_owner(buffer);
+ /*
+ if prev_sent_to_disk == LSN_IMPOSSIBLE then
+ prev_buffer_offset should be LSN_IMPOSSIBLE
+ because it means that this buffer was never used
+ */
+ DBUG_ASSERT((buffer->prev_sent_to_disk == LSN_IMPOSSIBLE &&
+ buffer->prev_buffer_offset == LSN_IMPOSSIBLE) ||
+ buffer->prev_sent_to_disk != LSN_IMPOSSIBLE);
+ if (buffer->prev_buffer_offset != buffer->prev_sent_to_disk)
+ {
+ do {
+ pthread_cond_wait(&buffer->prev_sent_to_disk_cond, &buffer->mutex);
+ if (buffer->file != file || buffer->offset != offset ||
+ buffer->ver != ver)
+ {
+ /* buffer was reused: it is already flushed; give up the lock */
+ translog_buffer_unlock(buffer);
+ DBUG_RETURN(1); /* some the thread flushed the buffer already */
+ }
+ } while(buffer->prev_buffer_offset != buffer->prev_sent_to_disk);
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+  Flush given buffer to disk (through the page cache and then a direct
+  pwrite to the log file)
+
+  SYNOPSIS
+    translog_buffer_flush()
+      buffer               This buffer should be flushed
+
+  NOTE
+    The caller must hold the buffer's mutex (asserted below).  The buffer
+    identity (file, offset, ver) captured at entry is re-checked after
+    every wait, because another thread may have flushed and reused this
+    buffer while we were sleeping.
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_buffer_flush(struct st_translog_buffer *buffer)
+{
+  uint32 i, pg;
+  /* snapshot of the buffer identity, used to detect concurrent flushes */
+  TRANSLOG_ADDRESS offset= buffer->offset;
+  TRANSLOG_FILE *file= buffer->file;
+  uint8 ver= buffer->ver;
+  uint skipped_data;
+  DBUG_ENTER("translog_buffer_flush");
+  DBUG_PRINT("enter",
+             ("Buffer: #%u 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
+              (uint) buffer->buffer_no, (ulong) buffer,
+              buffer->file->handler.file,
+              LSN_IN_PARTS(buffer->offset),
+              (ulong) buffer->size));
+  translog_buffer_lock_assert_owner(buffer);
+
+  /* file == NULL means the buffer was already flushed and freed */
+  if (buffer->file == NULL)
+    DBUG_RETURN(0);
+
+  /* wait until all threads writing into this buffer are done */
+  translog_wait_for_writers(buffer);
+
+  if (buffer->file != file || buffer->offset != offset || buffer->ver != ver)
+    DBUG_RETURN(0); /* some the thread flushed the buffer already */
+
+  if (buffer->is_closing_buffer)
+  {
+    /* some other flush in progress */
+    translog_wait_for_closing(buffer);
+  }
+
+  /* re-check identity: the wait above may have released the mutex */
+  if (buffer->file != file || buffer->offset != offset || buffer->ver != ver)
+    DBUG_RETURN(0); /* some the thread flushed the buffer already */
+
+  if (buffer->overlay && translog_prev_buffer_flush_wait(buffer))
+    DBUG_RETURN(0); /* some the thread flushed the buffer already */
+
+  /*
+    Send page by page in the pagecache what we are going to write on the
+    disk
+  */
+  file= buffer->file;
+  /* bytes at the start of the first page that are already on disk */
+  skipped_data= buffer->skipped_data;
+  DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
+  for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
+       i < buffer->size;
+       i+= TRANSLOG_PAGE_SIZE, pg++)
+  {
+    TRANSLOG_ADDRESS addr= (buffer->offset + i);
+    TRANSLOG_VALIDATOR_DATA data;
+    DBUG_PRINT("info", ("send log form %lu till %lu address: (%lu,0x%lx) "
+                        "page #: %lu  buffer size: %lu buffer: 0x%lx",
+                        (ulong) i, (ulong) (i + TRANSLOG_PAGE_SIZE),
+                        LSN_IN_PARTS(addr), (ulong) pg, (ulong) buffer->size,
+                        (ulong) buffer));
+    data.addr= &addr;
+    DBUG_ASSERT(log_descriptor.pagecache->block_size == TRANSLOG_PAGE_SIZE);
+    DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
+    if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
+      DBUG_RETURN(1);
+    if (pagecache_write_part(log_descriptor.pagecache,
+                             &file->handler, pg, 3,
+                             buffer->buffer + i,
+                             PAGECACHE_PLAIN_PAGE,
+                             PAGECACHE_LOCK_LEFT_UNLOCKED,
+                             PAGECACHE_PIN_LEFT_UNPINNED,
+                             PAGECACHE_WRITE_DONE, 0,
+                             LSN_IMPOSSIBLE,
+                             skipped_data,
+                             TRANSLOG_PAGE_SIZE - skipped_data))
+    {
+      DBUG_PRINT("error",
+                 ("Can't write page (%lu,0x%lx) to pagecache, error: %d",
+                  (ulong) buffer->file->number,
+                  (ulong) (LSN_OFFSET(buffer->offset)+ i),
+                  my_errno));
+      translog_stop_writing();
+      DBUG_RETURN(1);
+    }
+    /* only the first page of the buffer can have skipped (on-disk) data */
+    skipped_data= 0;
+  }
+  /* mark unsynced before the write so a sync cannot be falsely assumed */
+  file->is_sync= 0;
+  /* write everything that is not yet on disk in one pwrite */
+  if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data,
+                buffer->size - buffer->skipped_data,
+                LSN_OFFSET(buffer->offset) + buffer->skipped_data,
+                log_write_flags))
+  {
+    DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu "
+                         "to the disk (%d)",
+                         (ulong) file->handler.file,
+                         (ulong) LSN_OFFSET(buffer->offset),
+                         (ulong) buffer->size, errno));
+    translog_stop_writing();
+    DBUG_RETURN(1);
+  }
+  /*
+    Dropping the flag in such way can make false alarm: signalling than the
+    file in not sync when it is sync, but the situation is quite rare and
+    protections with mutexes give much more overhead to the whole engine
+  */
+  file->is_sync= 0;
+
+  if (LSN_OFFSET(buffer->last_lsn) != 0)    /* if buffer->last_lsn is set */
+  {
+    if (translog_prev_buffer_flush_wait(buffer))
+      DBUG_RETURN(0); /* some the thread flushed the buffer already */
+    translog_set_sent_to_disk(buffer);
+  }
+  else
+    translog_set_only_in_buffers(buffer->next_buffer_offset);
+
+  /* say to next buffer that we are finished */
+  {
+    struct st_translog_buffer *next_buffer=
+      log_descriptor.buffers + ((buffer->buffer_no + 1) % TRANSLOG_BUFFERS_NO);
+    if (likely(translog_status == TRANSLOG_OK)){
+      translog_buffer_lock(next_buffer);
+      next_buffer->prev_sent_to_disk= buffer->offset;
+      translog_buffer_unlock(next_buffer);
+      pthread_cond_broadcast(&next_buffer->prev_sent_to_disk_cond);
+    }
+    else
+    {
+      /*
+        It is shutdown =>
+          1) there is only one thread
+          2) mutexes of other buffers can be destroyed => we can't use them
+      */
+      next_buffer->prev_sent_to_disk= buffer->offset;
+    }
+  }
+  /* Free buffer */
+  buffer->file= NULL;
+  buffer->overlay= 0;
+  /* bump the version so waiters see that this buffer was reused */
+  buffer->ver++;
+  pthread_mutex_lock(&log_descriptor.dirty_buffer_mask_lock);
+  log_descriptor.dirty_buffer_mask&= ~(1 << buffer->buffer_no);
+  pthread_mutex_unlock(&log_descriptor.dirty_buffer_mask_lock);
+  pthread_cond_broadcast(&buffer->waiting_filling_buffer);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Recover page with sector protection (wipe out failed chunks)
+
+  SYNOPSIS
+    translog_recover_page_up_to_sector()
+    page         reference on the page
+    offset       offset of failed sector
+
+  NOTE
+    Chunks entirely before 'offset' are in the trusted area and are only
+    validated; from 'offset' on, chunks are kept only while they end
+    within the failed sector.  Everything after the last valid chunk is
+    overwritten with TRANSLOG_FILLER so later scans stop there.
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_recover_page_up_to_sector(uchar *page, uint16 offset)
+{
+  uint16 chunk_offset= translog_get_first_chunk_offset(page), valid_chunk_end;
+  DBUG_ENTER("translog_recover_page_up_to_sector");
+  DBUG_PRINT("enter", ("offset: %u  first chunk: %u",
+                       (uint) offset, (uint) chunk_offset));
+
+  /* walk chunks in the trusted area (fully before the failed sector) */
+  while (page[chunk_offset] != TRANSLOG_FILLER && chunk_offset < offset)
+  {
+    uint16 chunk_length;
+    if ((chunk_length=
+         translog_get_total_chunk_length(page, chunk_offset)) == 0)
+    {
+      DBUG_PRINT("error", ("cant get chunk length (offset %u)",
+                           (uint) chunk_offset));
+      DBUG_RETURN(1);
+    }
+    DBUG_PRINT("info", ("chunk: offset: %u  length %u",
+                        (uint) chunk_offset, (uint) chunk_length));
+    /* a chunk in the trusted area must not run off the page */
+    if (((ulong) chunk_offset) + ((ulong) chunk_length) > TRANSLOG_PAGE_SIZE)
+    {
+      DBUG_PRINT("error", ("damaged chunk (offset %u) in trusted area",
+                           (uint) chunk_offset));
+      DBUG_RETURN(1);
+    }
+    chunk_offset+= chunk_length;
+  }
+
+  valid_chunk_end= chunk_offset;
+  /* end of trusted area - sector parsing */
+  while (page[chunk_offset] != TRANSLOG_FILLER)
+  {
+    uint16 chunk_length;
+    if ((chunk_length=
+         translog_get_total_chunk_length(page, chunk_offset)) == 0)
+      break;
+
+    DBUG_PRINT("info", ("chunk: offset: %u  length %u",
+                        (uint) chunk_offset, (uint) chunk_length));
+    /* stop at the first chunk reaching beyond the failed sector */
+    if (((ulong) chunk_offset) + ((ulong) chunk_length) >
+        (uint) (offset + DISK_DRIVE_SECTOR_SIZE))
+      break;
+
+    chunk_offset+= chunk_length;
+    valid_chunk_end= chunk_offset;
+  }
+  DBUG_PRINT("info", ("valid chunk end offset: %u", (uint) valid_chunk_end));
+
+  /* wipe out everything after the last chunk known to be complete */
+  memset(page + valid_chunk_end, TRANSLOG_FILLER,
+         TRANSLOG_PAGE_SIZE - valid_chunk_end);
+
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Dummy write callback.
+
+  Supplied where the page cache interface requires a callback but there
+  is no work to do; unconditionally reports success.
+*/
+
+static my_bool translog_dummy_callback(uchar *page __attribute__((unused)),
+                                       pgcache_page_no_t page_no
+                                       __attribute__((unused)),
+                                       uchar *data_ptr
+                                       __attribute__((unused)))
+{
+  /* Nothing to do: always succeed. */
+  return 0;
+}
+
+
+/**
+  @brief Checks and removes sector protection.
+
+  The sector protection table is stored in the page header; every
+  DISK_DRIVE_SECTOR_SIZE-th byte of the page was replaced by a sequence
+  counter when the page was written.  This function verifies the
+  counters are consistent and restores the original bytes from the
+  table.  On inconsistency the page is recovered up to the bad sector.
+
+  @param page     reference on the page content.
+  @param file     transaction log descriptor.
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool
+translog_check_sector_protection(uchar *page, TRANSLOG_FILE *file)
+{
+  uint i, offset;
+  /* sector protection table sits at the end of the page header */
+  uchar *table= page + page_overhead[page[TRANSLOG_PAGE_FLAGS]] -
+    TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+  uint8 current= table[0];
+  DBUG_ENTER("translog_check_sector_protection");
+
+  for (i= 1, offset= DISK_DRIVE_SECTOR_SIZE;
+       i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+       i++, offset+= DISK_DRIVE_SECTOR_SIZE)
+  {
+    /*
+      TODO: add chunk counting for "suspecting" sectors (difference is
+      more than 1-2), if difference more then present chunks then it is
+      the problem.
+    */
+    uint8 test= page[offset];
+    DBUG_PRINT("info", ("sector: #%u  offset: %u  current: %lx "
+                        "read: 0x%x  stored: 0x%x%x",
+                        i, offset, (ulong) current,
+                        (uint) uint2korr(page + offset), (uint) table[i],
+                        (uint) table[i + 1]));
+    /*
+      3 is minimal possible record length. So we can have "distance"
+      between 2 sectors value more then DISK_DRIVE_SECTOR_SIZE / 3
+      only if it is old value, i.e. the sector was not written.
+    */
+    if (((test < current) &&
+         ((uint)(0xFFL - current + test) > DISK_DRIVE_SECTOR_SIZE / 3)) ||
+        ((test >= current) &&
+         ((uint)(test - current) > DISK_DRIVE_SECTOR_SIZE / 3)))
+    {
+      /* counter jumped too far => this sector holds stale data */
+      if (translog_recover_page_up_to_sector(page, offset))
+        DBUG_RETURN(1);
+      file->was_recovered= 1;
+      DBUG_RETURN(0);
+    }
+
+    /* Restore value on the page */
+    page[offset]= table[i];
+    current= test;
+    DBUG_PRINT("info", ("sector: #%u  offset: %u  current: %lx "
+                        "read: 0x%x  stored: 0x%x",
+                        i, offset, (ulong) current,
+                        (uint) page[offset], (uint) table[i]));
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Log page validator (read callback)
+
+  Verifies the page address stored in the page header (3 bytes page
+  number + 3 bytes file number), checks the flags byte for garbage,
+  optionally verifies the page CRC and removes sector protection.
+
+  @param page            The page data to check
+  @param page_no         The page number (<offset>/<page length>)
+  @param data_ptr        Read callback data pointer (pointer to TRANSLOG_FILE)
+
+  @todo: add turning loghandler to read-only mode after merging with
+  that patch.
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_page_validator(uchar *page,
+                                       pgcache_page_no_t page_no,
+                                       uchar* data_ptr)
+{
+  uint this_page_page_overhead;
+  uint flags;
+  uchar *page_pos;
+  TRANSLOG_FILE *data= (TRANSLOG_FILE *) data_ptr;
+#ifndef DBUG_OFF
+  pgcache_page_no_t offset= page_no * TRANSLOG_PAGE_SIZE;
+#endif
+  DBUG_ENTER("translog_page_validator");
+
+  data->was_recovered= 0;
+
+  /* header layout: bytes 0-2 page number, bytes 3-5 file number */
+  if ((pgcache_page_no_t) uint3korr(page) != page_no ||
+      (uint32) uint3korr(page + 3) != data->number)
+  {
+    DBUG_PRINT("error", ("Page (%lu,0x%lx): "
+                         "page address written in the page is incorrect: "
+                         "File %lu instead of %lu or page %lu instead of %lu",
+                         (ulong) data->number, (ulong) offset,
+                         (ulong) uint3korr(page + 3), (ulong) data->number,
+                         (ulong) uint3korr(page),
+                         (ulong) page_no));
+    DBUG_RETURN(1);
+  }
+  flags= (uint)(page[TRANSLOG_PAGE_FLAGS]);
+  this_page_page_overhead= page_overhead[flags];
+  /* only the three known flag bits may be set */
+  if (flags & ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION |
+                TRANSLOG_RECORD_CRC))
+  {
+    DBUG_PRINT("error", ("Page (%lu,0x%lx): "
+                         "Garbage in the page flags field detected : %x",
+                         (ulong) data->number, (ulong) offset,
+                         (uint) flags));
+    DBUG_RETURN(1);
+  }
+  /* skip fixed header: 3 bytes page no + 3 bytes file no + 1 byte flags */
+  page_pos= page + (3 + 3 + 1);
+  if (flags & TRANSLOG_PAGE_CRC)
+  {
+    /* CRC covers the page payload, i.e. everything after the header */
+    uint32 crc= translog_crc(page + this_page_page_overhead,
+                             TRANSLOG_PAGE_SIZE -
+                             this_page_page_overhead);
+    if (crc != uint4korr(page_pos))
+    {
+      DBUG_PRINT("error", ("Page (%lu,0x%lx): "
+                           "CRC mismatch: calculated: %lx on the page %lx",
+                           (ulong) data->number, (ulong) offset,
+                           (ulong) crc, (ulong) uint4korr(page_pos)));
+      DBUG_RETURN(1);
+    }
+    page_pos+= CRC_SIZE;                      /* Skip crc */
+  }
+  if (flags & TRANSLOG_SECTOR_PROTECTION &&
+      translog_check_sector_protection(page, data))
+  {
+    DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Locks the loghandler (i.e. the mutex of the current buffer).
+*/
+
+void translog_lock()
+{
+  uint8 buf_no;
+  DBUG_ENTER("translog_lock");
+
+  /*
+    Locking the loghandler means locking the current buffer, but the
+    current buffer may change while we wait for its mutex, so we must
+    re-check after acquiring it and retry if it moved on.
+  */
+  while (1)
+  {
+    /*
+      log_descriptor.bc.buffer_no is only one byte so its reading is
+      an atomic operation
+    */
+    buf_no= log_descriptor.bc.buffer_no;
+    translog_buffer_lock(log_descriptor.buffers + buf_no);
+    if (log_descriptor.bc.buffer_no == buf_no)
+      break;
+    /* the current buffer changed under us; release and try again */
+    translog_buffer_unlock(log_descriptor.buffers + buf_no);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Unlock the loghandler
+
+  SYNOPSIS
+    translog_unlock()
+
+  NOTE
+    Releases the mutex of the current buffer, which translog_lock()
+    acquired.  (No return value: the unlock cannot fail.)
+*/
+
+void translog_unlock()
+{
+  translog_buffer_unlock(log_descriptor.bc.buffer);
+}
+
+
+/**
+  @brief Get log page by file number and offset of the beginning of the page
+
+  If the requested address is still only in the in-memory log buffers,
+  the page is copied out of the right buffer (re-reading any part that
+  was already written to disk from the page cache); otherwise the page
+  is read through the page cache.
+
+  @param data           validator data, which contains the page address
+  @param buffer         buffer for page placing
+                        (might not be used in some cache implementations)
+  @param direct_link    if it is not NULL then caller can accept direct
+                        link to the page cache
+
+  @retval NULL Error
+  @retval #    pointer to the page cache which should be used to read this page
+*/
+
+static uchar *translog_get_page(TRANSLOG_VALIDATOR_DATA *data, uchar *buffer,
+                                PAGECACHE_BLOCK_LINK **direct_link)
+{
+  TRANSLOG_ADDRESS addr= *(data->addr), in_buffers;
+  uint32 file_no= LSN_FILE_NO(addr);
+  TRANSLOG_FILE *file;
+  DBUG_ENTER("translog_get_page");
+  DBUG_PRINT("enter", ("File: %lu  Offset: %lu(0x%lx)",
+                       (ulong) file_no,
+                       (ulong) LSN_OFFSET(addr),
+                       (ulong) LSN_OFFSET(addr)));
+
+  /* it is really page address */
+  DBUG_ASSERT(LSN_OFFSET(addr) % TRANSLOG_PAGE_SIZE == 0);
+  if (direct_link)
+    *direct_link= NULL;
+
+restart:
+
+  /* first address that exists only in the in-memory buffers */
+  in_buffers= translog_only_in_buffers();
+  DBUG_PRINT("info", ("in_buffers: (%lu,0x%lx)",
+                      LSN_IN_PARTS(in_buffers)));
+  if (in_buffers != LSN_IMPOSSIBLE &&
+      cmp_translog_addr(addr, in_buffers) >= 0)
+  {
+    translog_lock();
+    DBUG_ASSERT(cmp_translog_addr(addr, log_descriptor.horizon) < 0);
+    /* recheck with locked loghandler */
+    in_buffers= translog_only_in_buffers();
+    if (cmp_translog_addr(addr, in_buffers) >= 0)
+    {
+      uint16 buffer_no= log_descriptor.bc.buffer_no;
+#ifndef DBUG_OFF
+      uint16 buffer_start= buffer_no;
+#endif
+      struct st_translog_buffer *buffer_unlock= log_descriptor.bc.buffer;
+      struct st_translog_buffer *curr_buffer= log_descriptor.bc.buffer;
+      /* scan the ring of buffers for the one covering 'addr' */
+      for (;;)
+      {
+        /*
+          if the page is in the buffer and it is the last version of the
+          page (in case of division the page by buffer flush)
+        */
+        if (curr_buffer->file != NULL &&
+            cmp_translog_addr(addr, curr_buffer->offset) >= 0 &&
+            cmp_translog_addr(addr,
+                              (curr_buffer->next_buffer_offset ?
+                               curr_buffer->next_buffer_offset:
+                               curr_buffer->offset + curr_buffer->size)) < 0)
+        {
+          TRANSLOG_ADDRESS offset= curr_buffer->offset;
+          TRANSLOG_FILE *fl= curr_buffer->file;
+          uchar *from, *table= NULL;
+          int is_last_unfinished_page;
+          uint last_protected_sector= 0;
+          uint skipped_data= curr_buffer->skipped_data;
+          TRANSLOG_FILE file_copy;
+          uint8 ver= curr_buffer->ver;
+          translog_wait_for_writers(curr_buffer);
+          /* buffer may have been reused while we waited: start over */
+          if (offset != curr_buffer->offset || fl != curr_buffer->file ||
+              ver != curr_buffer->ver)
+          {
+            DBUG_ASSERT(buffer_unlock == curr_buffer);
+            translog_buffer_unlock(buffer_unlock);
+            goto restart;
+          }
+          DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
+          from= curr_buffer->buffer + (addr - curr_buffer->offset);
+          if (skipped_data && addr == curr_buffer->offset)
+          {
+            /*
+              We read page part of which is not present in buffer,
+              so we should read absent part from file (page cache actually)
+            */
+            file= get_logfile_by_number(file_no);
+            DBUG_ASSERT(file != NULL);
+            /*
+              it's ok to not lock the page because:
+                - The log handler has it's own page cache.
+                - There is only one thread that can access the log
+                  cache at a time
+            */
+            if (!(buffer= pagecache_read(log_descriptor.pagecache,
+                                         &file->handler,
+                                         LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+                                         3, buffer,
+                                         PAGECACHE_PLAIN_PAGE,
+                                         PAGECACHE_LOCK_LEFT_UNLOCKED,
+                                         NULL)))
+              DBUG_RETURN(NULL);
+          }
+          else
+            skipped_data= 0;            /* Read after skipped in buffer data */
+          /*
+            Now we have correct data in buffer up to 'skipped_data'. The
+            following memcpy() will move the data from the internal buffer
+            that was not yet on disk.
+          */
+          memcpy(buffer + skipped_data, from + skipped_data,
+                 TRANSLOG_PAGE_SIZE - skipped_data);
+          /*
+            We can use copy then in translog_page_validator() because it
+            do not put it permanently somewhere.
+            We have to use copy because after releasing log lock we can't
+            guaranty that the file still be present (in real life it will be
+            present but theoretically possible that it will be released
+            already from last files cache);
+          */
+          file_copy= *(curr_buffer->file);
+          file_copy.handler.callback_data= (uchar*) &file_copy;
+          is_last_unfinished_page= ((log_descriptor.bc.buffer ==
+                                     curr_buffer) &&
+                                    (log_descriptor.bc.ptr >= from) &&
+                                    (log_descriptor.bc.ptr <
+                                     from + TRANSLOG_PAGE_SIZE));
+          if (is_last_unfinished_page &&
+              (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION))
+          {
+            /* remember how far sector protection was already installed */
+            last_protected_sector= ((log_descriptor.bc.previous_offset - 1) /
+                                    DISK_DRIVE_SECTOR_SIZE);
+            table= buffer + log_descriptor.page_overhead -
+              TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+          }
+
+          DBUG_ASSERT(buffer_unlock == curr_buffer);
+          translog_buffer_unlock(buffer_unlock);
+          if (is_last_unfinished_page)
+          {
+            uint i;
+            /*
+              This is last unfinished page => we should not check CRC and
+              remove only that protection which already installed (no need
+              to check it)
+
+              We do not check the flag of sector protection, because if
+              (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) is
+              not set then last_protected_sector will be 0 so following loop
+              will be never executed
+            */
+            DBUG_PRINT("info", ("This is last unfinished page, "
+                                "last protected sector %u",
+                                last_protected_sector));
+            for (i= 1; i <= last_protected_sector; i++)
+            {
+              uint offset= i * DISK_DRIVE_SECTOR_SIZE;
+              DBUG_PRINT("info", ("Sector %u: 0x%02x <- 0x%02x",
+                                  i, buffer[offset],
+                                  table[i]));
+              buffer[offset]= table[i];
+            }
+          }
+          else
+          {
+            /*
+              This IF should be true because we use in-memory data which
+              supposed to be correct.
+            */
+            if (translog_page_validator(buffer,
+                                        LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+                                        (uchar*) &file_copy))
+            {
+              DBUG_ASSERT(0);
+              buffer= NULL;
+            }
+          }
+          DBUG_RETURN(buffer);
+        }
+        /* advance to the next buffer in the ring, hand over the lock */
+        buffer_no= (buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+        curr_buffer= log_descriptor.buffers + buffer_no;
+        translog_buffer_lock(curr_buffer);
+        translog_buffer_unlock(buffer_unlock);
+        buffer_unlock= curr_buffer;
+        /* we can't make a full circle */
+        DBUG_ASSERT(buffer_start != buffer_no);
+      }
+    }
+    translog_unlock();
+  }
+  /* page already on disk: go through the page cache */
+  file= get_logfile_by_number(file_no);
+  DBUG_ASSERT(file != NULL);
+  buffer= pagecache_read(log_descriptor.pagecache, &file->handler,
+                         LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+                         3, (direct_link ? NULL : buffer),
+                         PAGECACHE_PLAIN_PAGE,
+                         (direct_link ?
+                          PAGECACHE_LOCK_READ :
+                          PAGECACHE_LOCK_LEFT_UNLOCKED),
+                         direct_link);
+  DBUG_PRINT("info", ("Direct link is assigned to : 0x%lx * 0x%lx",
+                      (ulong) direct_link,
+                      (ulong)(direct_link ? *direct_link : NULL)));
+  data->was_recovered= file->was_recovered;
+  DBUG_RETURN(buffer);
+}
+
+
+/**
+  @brief free direct log page link
+
+  Releases the page-cache read lock and pin held through a direct link
+  obtained from translog_get_page(); a NULL link is a no-op.
+
+  @param direct_link the direct log page link to be freed
+*/
+
+static void translog_free_link(PAGECACHE_BLOCK_LINK *direct_link)
+{
+  DBUG_ENTER("translog_free_link");
+  DBUG_PRINT("info", ("Direct link: 0x%lx",
+                      (ulong) direct_link));
+  if (direct_link != NULL)
+  {
+    pagecache_unlock_by_link(log_descriptor.pagecache, direct_link,
+                             PAGECACHE_LOCK_READ_UNLOCK, PAGECACHE_UNPIN,
+                             LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, 0, FALSE);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
<imports>
</imports>
+/**
+  @brief Finds last full page of the given log file.
+
+  Determines the file size either through the already-open TRANSLOG_FILE
+  (normal case) or by opening the file directly (very early
+  initialization), then computes the address of the last complete page.
+
+  @param addr          address structure to fill with data, which contain
+                       file number of the log file
+  @param last_page_ok  Result of the check whether last page OK.
+                       (for now only we check only that file length
+                       divisible on page length).
+  @param no_errors     suppress messages about non-critical errors
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_get_last_page_addr(TRANSLOG_ADDRESS *addr,
+                                           my_bool *last_page_ok,
+                                           my_bool no_errors)
+{
+  char path[FN_REFLEN];
+  uint32 rec_offset;
+  my_off_t file_size;
+  uint32 file_no= LSN_FILE_NO(*addr);
+  TRANSLOG_FILE *file;
+#ifndef DBUG_OFF
+  char buff[21];
+#endif
+  DBUG_ENTER("translog_get_last_page_addr");
+
+  if (likely((file= get_logfile_by_number(file_no)) != NULL))
+  {
+    /*
+      This function used only during initialization of loghandler or in
+      scanner (which mean we need read that part of the log), so the
+      requested log file have to be opened and can't be freed after
+      returning pointer on it (file_size).
+    */
+    file_size= my_seek(file->handler.file, 0, SEEK_END, MYF(0));
+  }
+  else
+  {
+    /*
+      This branch is used only during very early initialization
+      when files are not opened.
+    */
+    File fd;
+    if ((fd= my_open(translog_filename_by_fileno(file_no, path),
+                     O_RDONLY, (no_errors ? MYF(0) : MYF(MY_WME)))) < 0)
+    {
+      my_errno= errno;
+      DBUG_PRINT("error", ("Error %d during opening file #%d",
+                           errno, file_no));
+      DBUG_RETURN(1);
+    }
+    file_size= my_seek(fd, 0, SEEK_END, MYF(0));
+    my_close(fd, MYF(0));
+  }
+  DBUG_PRINT("info", ("File size: %s", llstr(file_size, buff)));
+  if (file_size == MY_FILEPOS_ERROR)
+    DBUG_RETURN(1);
+  DBUG_ASSERT(file_size < ULL(0xffffffff));
+  /*
+    A file of exactly one page contains only the file header page,
+    so there is no "last full page" in that case (else branch).
+  */
+  if (((uint32)file_size) > TRANSLOG_PAGE_SIZE)
+  {
+    rec_offset= (((((uint32)file_size) / TRANSLOG_PAGE_SIZE) - 1) *
+                 TRANSLOG_PAGE_SIZE);
+    *last_page_ok= (((uint32)file_size) == rec_offset + TRANSLOG_PAGE_SIZE);
+  }
+  else
+  {
+    *last_page_ok= 0;
+    rec_offset= 0;
+  }
+  *addr= MAKE_LSN(file_no, rec_offset);
+  DBUG_PRINT("info", ("Last page: 0x%lx  ok: %d", (ulong) rec_offset,
+                      *last_page_ok));
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Get number bytes for record length storing
+
+  Encoding thresholds: lengths below 250 fit in a single byte; below
+  0xFFFF take 3 bytes; below 0xFFFFFF take 4 bytes; everything else 5.
+
+  @param length   Record length which will be encoded
+
+  @return 1,3,4,5 - number of bytes to store given length
+*/
+
+static uint translog_variable_record_length_bytes(translog_size_t length)
+{
+  uint bytes;
+
+  if (length >= (ulong) 0xFFFFFF)
+    bytes= 5;
+  else if (length >= 0xFFFF)
+    bytes= 4;
+  else if (length >= 250)
+    bytes= 3;
+  else
+    bytes= 1;
+  return bytes;
+}
+
+
+/**
+  @brief Gets the length of the header of this chunk.
+
+  @param chunk    The pointer to the chunk beginning
+
+  @retval # length of the chunk header in bytes
+  @retval 0 Error
+*/
+
+static uint16 translog_get_chunk_header_length(uchar *chunk)
+{
+  DBUG_ENTER("translog_get_chunk_header_length");
+  /* the two top bits of the first byte encode the chunk type */
+  switch (*chunk & TRANSLOG_CHUNK_TYPE) {
+  case TRANSLOG_CHUNK_LSN:
+  {
+    /* 0 chunk referred as LSN (head or tail) */
+    translog_size_t rec_len;
+    uchar *start= chunk;
+    uchar *ptr= start + 1 + 2;
+    uint16 chunk_len, header_len;
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN"));
+    /* variable-length encoded record length follows type + short TrID */
+    rec_len= translog_variable_record_1group_decode_len(&ptr);
+    chunk_len= uint2korr(ptr);
+    /* header = everything up to 'ptr' plus the 2-byte chunk length */
+    header_len= (uint16) (ptr - start) +2;
+    DBUG_PRINT("info", ("rec len: %lu  chunk len: %u  header len: %u",
+                        (ulong) rec_len, (uint) chunk_len, (uint) header_len));
+    if (chunk_len)
+    {
+      /* TODO: fine header end */
+      /*
+        The last chunk of multi-group record can be base for it header
+        calculation (we skip to the first group to read the header) so if we
+        stuck here something is wrong.
+      */
+      DBUG_ASSERT(0);
+      DBUG_RETURN(0);                       /* Keep compiler happy */
+    }
+    DBUG_RETURN(header_len);
+  }
+  case TRANSLOG_CHUNK_FIXED:
+  {
+    /* 1 (pseudo)fixed record (also LSN) */
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED = 3"));
+    DBUG_RETURN(3);
+  }
+  case TRANSLOG_CHUNK_NOHDR:
+    /* 2 no header chunk (till page end) */
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR = 1"));
+    DBUG_RETURN(1);
+    break;
+  case TRANSLOG_CHUNK_LNGTH:
+    /* 3 chunk with chunk length */
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH = 3"));
+    DBUG_RETURN(3);
+    break;
+  default:
+    DBUG_ASSERT(0);
+    DBUG_RETURN(0);                         /* Keep compiler happy */
+  }
+}
+
+
+/**
+  @brief Truncate the log to the given address. Used during the startup if the
+         end of log if corrupted.
+
+  Deletes all whole log files beyond 'addr', truncates the last file to
+  the page boundary above 'addr', fills the tail of the last page with
+  TRANSLOG_FILLER, syncs, and then re-seeds the in-memory buffer state
+  (horizon, current buffer, write pointer) from the truncated log.
+
+  @param addr             new horizon
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_truncate_log(TRANSLOG_ADDRESS addr)
+{
+  uchar *page;
+  TRANSLOG_ADDRESS current_page;
+  uint32 next_page_offset, page_rest;
+  uint32 i;
+  File fd;
+  int rc;
+  TRANSLOG_VALIDATOR_DATA data;
+  char path[FN_REFLEN];
+  uchar page_buff[TRANSLOG_PAGE_SIZE];
+  DBUG_ENTER("translog_truncate_log");
+  /* TODO: write warning to the client */
+  DBUG_PRINT("warning", ("removing all records from (%lu,0x%lx) "
+                         "till (%lu,0x%lx)",
+                         LSN_IN_PARTS(addr),
+                         LSN_IN_PARTS(log_descriptor.horizon)));
+  DBUG_ASSERT(cmp_translog_addr(addr, log_descriptor.horizon) < 0);
+  /* remove files between the address and horizon */
+  for (i= LSN_FILE_NO(addr) + 1; i <= LSN_FILE_NO(log_descriptor.horizon); i++)
+    if (my_delete(translog_filename_by_fileno(i, path),  MYF(MY_WME)))
+    {
+      translog_unlock();
+      DBUG_RETURN(1);
+    }
+
+  /* truncate the last file up to the last page */
+  next_page_offset= LSN_OFFSET(addr);
+  /* round LSN_OFFSET(addr) up to the next page boundary */
+  next_page_offset= (next_page_offset -
+                     ((next_page_offset - 1) % TRANSLOG_PAGE_SIZE + 1) +
+                     TRANSLOG_PAGE_SIZE);
+  page_rest= next_page_offset - LSN_OFFSET(addr);
+  memset(page_buff, TRANSLOG_FILLER, page_rest);
+  rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
+       ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
+         (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
+                                 log_write_flags)) ||
+         my_sync(fd, MYF(MY_WME)))));
+  translog_syncs++;
+  /* NOTE(review): fd > 0 — fd 0 is in principle a valid descriptor; confirm */
+  rc|= (fd > 0 && my_close(fd, MYF(MY_WME)));
+  if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS)
+  {
+    rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+    translog_syncs++;
+  }
+  if (rc)
+    DBUG_RETURN(1);
+
+  /* fix the horizon */
+  log_descriptor.horizon= addr;
+  /* fix the buffer data */
+  current_page= MAKE_LSN(LSN_FILE_NO(addr), (next_page_offset -
+                                             TRANSLOG_PAGE_SIZE));
+  data.addr= &current_page;
+  /* re-read the last (now truncated) page into the first write buffer */
+  if ((page= translog_get_page(&data, log_descriptor.buffers->buffer, NULL)) ==
+      NULL)
+    DBUG_RETURN(1);
+  if (page != log_descriptor.buffers->buffer)
+    memcpy(log_descriptor.buffers->buffer, page, TRANSLOG_PAGE_SIZE);
+  log_descriptor.bc.buffer->offset= current_page;
+  log_descriptor.bc.buffer->size= LSN_OFFSET(addr) - LSN_OFFSET(current_page);
+  log_descriptor.bc.ptr=
+    log_descriptor.buffers->buffer + log_descriptor.bc.buffer->size;
+  log_descriptor.bc.current_page_fill= log_descriptor.bc.buffer->size;
+  DBUG_RETURN(0);
+}
+
+
+/**
+   Applies function 'callback' to all files (in a directory) which
+   name looks like a log's name (aria_log.[0-9]{8}).
+   If 'callback' returns TRUE this interrupts the walk and returns
+   TRUE. Otherwise FALSE is returned after processing all log files.
+   It cannot just use log_descriptor.directory because that may not yet have
+   been initialized.
+
+   @param  directory        directory to scan
+   @param  callback         function to apply; is passed directory and base
+                            name of found file
+
+   @retval TRUE   callback interrupted the walk (or matched a file)
+   @retval FALSE  no interruption; also returned when the directory
+                  cannot be opened (best-effort scan)
+*/
+
+my_bool translog_walk_filenames(const char *directory,
+                                my_bool (*callback)(const char *,
+                                                    const char *))
+{
+  MY_DIR *dirp;
+  uint i;
+  my_bool rc= FALSE;
+
+  /* Finds and removes transaction log files */
+  if (!(dirp = my_dir(directory, MYF(MY_DONT_SORT))))
+    return FALSE;
+
+  for (i= 0; i < dirp->number_off_files; i++)
+  {
+    char *file= dirp->dir_entry[i].name;
+    /*
+      "aria_log." is 9 characters, so the 8 digits occupy indices 9..16
+      and the terminating NUL must be at index 17.  (Comparing 10 bytes
+      against the 9-char literal would include its NUL and never match.)
+    */
+    if (strncmp(file, "aria_log.", 9) == 0 &&
+        file[9] >= '0' && file[9] <= '9' &&
+        file[10] >= '0' && file[10] <= '9' &&
+        file[11] >= '0' && file[11] <= '9' &&
+        file[12] >= '0' && file[12] <= '9' &&
+        file[13] >= '0' && file[13] <= '9' &&
+        file[14] >= '0' && file[14] <= '9' &&
+        file[15] >= '0' && file[15] <= '9' &&
+        file[16] >= '0' && file[16] <= '9' &&
+        file[17] == '\0' && (*callback)(directory, file))
+    {
+      rc= TRUE;
+      break;
+    }
+  }
+  my_dirend(dirp);
+  return rc;
+}
+
+
+/**
+   @brief Fills table of dependence length of page header from page flags
+
+   For every possible flags combination the overhead starts at the 7-byte
+   fixed header (3 bytes page number + 3 bytes file number + 1 byte
+   flags); TRANSLOG_PAGE_CRC adds CRC_SIZE and TRANSLOG_SECTOR_PROTECTION
+   adds one table byte per disk sector of the page.
+*/
+
+static void translog_fill_overhead_table(void)
+{
+  uint i;
+  for (i= 0; i < TRANSLOG_FLAGS_NUM; i++)
+  {
+    page_overhead[i]= 7;
+    if (i & TRANSLOG_PAGE_CRC)
+      page_overhead[i]+= CRC_SIZE;
+    if (i & TRANSLOG_SECTOR_PROTECTION)
+      page_overhead[i]+= TRANSLOG_PAGE_SIZE /
+                          DISK_DRIVE_SECTOR_SIZE;
+  }
+}
+
+
+/**
+  Callback to find first log in directory.
+
+  Any matching file proves that at least one log exists, so the walk can
+  stop immediately.
+*/
+
+static my_bool translog_callback_search_first(const char *directory
+                                              __attribute__((unused)),
+                                              const char *filename
+                                              __attribute__((unused)))
+{
+  /* Interrupt the walk on the first match. */
+  return TRUE;
+}
+
+
+/**
+  @brief Checks that chunk is LSN one
+
+  A chunk carries an LSN when it is a (pseudo)fixed chunk, or a plain
+  LSN chunk whose record type is not the multi-group continuation.
+
+  @param type            type of the chunk
+
+  @retval 1 the chunk is LNS
+  @retval 0 the chunk is not LSN
+*/
+
+static my_bool translog_is_LSN_chunk(uchar type)
+{
+  uchar chunk_type= type & TRANSLOG_CHUNK_TYPE;
+  DBUG_ENTER("translog_is_LSN_chunk");
+  DBUG_PRINT("info", ("byte: %x  chunk type: %u  record type: %u",
+                      type, type >> 6, type & TRANSLOG_REC_TYPE));
+  if (chunk_type == TRANSLOG_CHUNK_FIXED)
+    DBUG_RETURN(1);
+  if (chunk_type == TRANSLOG_CHUNK_LSN &&
+      (type & TRANSLOG_REC_TYPE) != TRANSLOG_CHUNK_0_CONT)
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Initialize transaction log
+
+ @param directory Directory where log files are put
+ @param log_file_max_size max size of one log size (for new logs creation)
+ @param server_version version of MySQL server (MYSQL_VERSION_ID)
+ @param server_id server ID (replication & Co)
+ @param pagecache Page cache for the log reads
+ @param flags flags (TRANSLOG_PAGE_CRC, TRANSLOG_SECTOR_PROTECTION
+ TRANSLOG_RECORD_CRC)
+ @param read_only Put transaction log in read-only mode
+ @param init_table_func function to initialize record descriptors table
+ @param no_errors suppress messages about non-critical errors
+
+ @todo
+ Free used resources in case of error.
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool translog_init_with_table(const char *directory,
+ uint32 log_file_max_size,
+ uint32 server_version,
+ uint32 server_id, PAGECACHE *pagecache,
+ uint flags, my_bool readonly,
+ void (*init_table_func)(),
+ my_bool no_errors)
+{
+ int i;
+ int old_log_was_recovered= 0, logs_found= 0;
+ uint old_flags= flags;
+ uint32 start_file_num= 1;
+ TRANSLOG_ADDRESS sure_page, last_page, last_valid_page, checkpoint_lsn;
+ my_bool version_changed= 0;
+ DBUG_ENTER("translog_init_with_table");
+
+ translog_syncs= 0;
+ flush_start= 0;
+ id_to_share= NULL;
+
+ log_descriptor.directory_fd= -1;
+ log_descriptor.is_everything_flushed= 1;
+ log_descriptor.flush_in_progress= 0;
+ log_descriptor.flush_no= 0;
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+
+ (*init_table_func)();
+ compile_time_assert(sizeof(log_descriptor.dirty_buffer_mask) * 8 >=
+ TRANSLOG_BUFFERS_NO);
+ log_descriptor.dirty_buffer_mask= 0;
+ if (readonly)
+ log_descriptor.open_flags= O_BINARY | O_RDONLY;
+ else
+ log_descriptor.open_flags= O_BINARY | O_RDWR;
+ if (pthread_mutex_init(&log_descriptor.sent_to_disk_lock,
+ MY_MUTEX_INIT_FAST) ||
+ pthread_mutex_init(&log_descriptor.file_header_lock,
+ MY_MUTEX_INIT_FAST) ||
+ pthread_mutex_init(&log_descriptor.unfinished_files_lock,
+ MY_MUTEX_INIT_FAST) ||
+ pthread_mutex_init(&log_descriptor.purger_lock,
+ MY_MUTEX_INIT_FAST) ||
+ pthread_mutex_init(&log_descriptor.log_flush_lock,
+ MY_MUTEX_INIT_FAST) ||
+ pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock,
+ MY_MUTEX_INIT_FAST) ||
+ pthread_cond_init(&log_descriptor.log_flush_cond, 0) ||
+ pthread_cond_init(&log_descriptor.new_goal_cond, 0) ||
+ my_rwlock_init(&log_descriptor.open_files_lock,
+ NULL) ||
+ my_init_dynamic_array(&log_descriptor.open_files,
+ sizeof(TRANSLOG_FILE*), 10, 10) ||
+ my_init_dynamic_array(&log_descriptor.unfinished_files,
+ sizeof(struct st_file_counter),
+ 10, 10))
+ goto err;
+ log_descriptor.min_need_file= 0;
+ log_descriptor.min_file_number= 0;
+ log_descriptor.last_lsn_checked= LSN_IMPOSSIBLE;
+
+ /* Directory to store files */
+ unpack_dirname(log_descriptor.directory, directory);
+#ifndef __WIN__
+ if ((log_descriptor.directory_fd= my_open(log_descriptor.directory,
+ O_RDONLY, MYF(MY_WME))) < 0)
+ {
+ my_errno= errno;
+ DBUG_PRINT("error", ("Error %d during opening directory '%s'",
+ errno, log_descriptor.directory));
+ goto err;
+ }
+#endif
+ log_descriptor.in_buffers_only= LSN_IMPOSSIBLE;
+ DBUG_ASSERT(log_file_max_size % TRANSLOG_PAGE_SIZE == 0 &&
+ log_file_max_size >= TRANSLOG_MIN_FILE_SIZE);
+ /* max size of one log size (for new logs creation) */
+ log_file_size= log_descriptor.log_file_max_size=
+ log_file_max_size;
+ /* server version */
+ log_descriptor.server_version= server_version;
+ /* server ID */
+ log_descriptor.server_id= server_id;
+ /* Page cache for the log reads */
+ log_descriptor.pagecache= pagecache;
+ /* Flags */
+ DBUG_ASSERT((flags &
+ ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION |
+ TRANSLOG_RECORD_CRC)) == 0);
+ log_descriptor.flags= flags;
+ translog_fill_overhead_table();
+ log_descriptor.page_overhead= page_overhead[flags];
+ log_descriptor.page_capacity_chunk_2=
+ TRANSLOG_PAGE_SIZE - log_descriptor.page_overhead - 1;
+ compile_time_assert(TRANSLOG_WRITE_BUFFER % TRANSLOG_PAGE_SIZE == 0);
+ log_descriptor.buffer_capacity_chunk_2=
+ (TRANSLOG_WRITE_BUFFER / TRANSLOG_PAGE_SIZE) *
+ log_descriptor.page_capacity_chunk_2;
+ log_descriptor.half_buffer_capacity_chunk_2=
+ log_descriptor.buffer_capacity_chunk_2 / 2;
+ DBUG_PRINT("info",
+ ("Overhead: %u pc2: %u bc2: %u, bc2/2: %u",
+ log_descriptor.page_overhead,
+ log_descriptor.page_capacity_chunk_2,
+ log_descriptor.buffer_capacity_chunk_2,
+ log_descriptor.half_buffer_capacity_chunk_2));
+
+ /* Just to init it somehow (hack for bootstrap)*/
+ {
+ TRANSLOG_FILE *file= 0;
+ log_descriptor.min_file = log_descriptor.max_file= 1;
+ insert_dynamic(&log_descriptor.open_files, (uchar *)&file);
+ translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+ pop_dynamic(&log_descriptor.open_files);
+ }
+
+ /* Buffers for log writing */
+ for (i= 0; i < TRANSLOG_BUFFERS_NO; i++)
+ {
+ if (translog_buffer_init(log_descriptor.buffers + i, i))
+ goto err;
+ DBUG_PRINT("info", ("translog_buffer buffer #%u: 0x%lx",
+ i, (ulong) log_descriptor.buffers + i));
+ }
+
+ /*
+ last_logno and last_checkpoint_lsn were set in
+ ma_control_file_create_or_open()
+ */
+ logs_found= (last_logno != FILENO_IMPOSSIBLE);
+
+ translog_status= (readonly ? TRANSLOG_READONLY : TRANSLOG_OK);
+ checkpoint_lsn= last_checkpoint_lsn;
+
+ if (logs_found)
+ {
+ my_bool pageok;
+ DBUG_PRINT("info", ("log found..."));
+ /*
+ TODO: scan directory for aria_log.XXXXXXXX files and find
+ highest XXXXXXXX & set logs_found
+ TODO: check that last checkpoint within present log addresses space
+
+ find the log end
+ */
+ if (LSN_FILE_NO(last_checkpoint_lsn) == FILENO_IMPOSSIBLE)
+ {
+ DBUG_ASSERT(LSN_OFFSET(last_checkpoint_lsn) == 0);
+ /* only last log needs to be checked */
+ sure_page= MAKE_LSN(last_logno, TRANSLOG_PAGE_SIZE);
+ }
+ else
+ {
+ sure_page= last_checkpoint_lsn;
+ DBUG_ASSERT(LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE != 0);
+ sure_page-= LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE;
+ }
+ /* Set horizon to the beginning of the last file first */
+ log_descriptor.horizon= last_page= MAKE_LSN(last_logno, 0);
+ if (translog_get_last_page_addr(&last_page, &pageok, no_errors))
+ {
+ if (!translog_walk_filenames(log_descriptor.directory,
+ &translog_callback_search_first))
+ {
+ /*
+ Files was deleted, just start from the next log number, so that
+ existing tables are in the past.
+ */
+ start_file_num= last_logno + 1;
+ checkpoint_lsn= LSN_IMPOSSIBLE; /* no log so no checkpoint */
+ logs_found= 0;
+ }
+ else
+ goto err;
+ }
+ else if (LSN_OFFSET(last_page) == 0)
+ {
+ if (LSN_FILE_NO(last_page) == 1)
+ {
+ logs_found= 0; /* file #1 has no pages */
+ DBUG_PRINT("info", ("log found. But is is empty => no log assumed"));
+ }
+ else
+ {
+ last_page-= LSN_ONE_FILE;
+ if (translog_get_last_page_addr(&last_page, &pageok, 0))
+ goto err;
+ }
+ }
+ if (logs_found)
+ {
+ uint32 i;
+ log_descriptor.min_file= translog_first_file(log_descriptor.horizon, 1);
+ log_descriptor.max_file= last_logno;
+ /* Open all files */
+ if (allocate_dynamic(&log_descriptor.open_files,
+ log_descriptor.max_file -
+ log_descriptor.min_file + 1))
+ goto err;
+ for (i = log_descriptor.max_file; i >= log_descriptor.min_file; i--)
+ {
+ /*
+ We can't allocate all file together because they will be freed
+ one by one
+ */
+ TRANSLOG_FILE *file= (TRANSLOG_FILE *)my_malloc(sizeof(TRANSLOG_FILE),
+ MYF(0));
+
+ compile_time_assert(MY_FILEPOS_ERROR > ULL(0xffffffff));
+ if (file == NULL ||
+ (file->handler.file=
+ open_logfile_by_number_no_cache(i)) < 0 ||
+ my_seek(file->handler.file, 0, SEEK_END, MYF(0)) >=
+ ULL(0xffffffff))
+ {
+ int j;
+ for (j= i - log_descriptor.min_file - 1; j > 0; j--)
+ {
+ TRANSLOG_FILE *el=
+ *dynamic_element(&log_descriptor.open_files, j,
+ TRANSLOG_FILE **);
+ my_close(el->handler.file, MYF(MY_WME));
+ my_free(el, MYF(0));
+ }
+ if (file)
+ {
+ free(file);
+ goto err;
+ }
+ else
+ goto err;
+ }
+ translog_file_init(file, i, 1);
+ /* we allocated space so it can't fail */
+ insert_dynamic(&log_descriptor.open_files, (uchar *)&file);
+ }
+ DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+ log_descriptor.open_files.elements);
+ }
+ }
+ else if (readonly)
+ {
+ /* There is no logs and there is read-only mode => nothing to read */
+ DBUG_PRINT("error", ("No logs and read-only mode"));
+ goto err;
+ }
+
+ if (logs_found)
+ {
+ TRANSLOG_ADDRESS current_page= sure_page;
+ my_bool pageok;
+
+ DBUG_PRINT("info", ("The log is really present"));
+ DBUG_ASSERT(sure_page <= last_page);
+
+ /* TODO: check page size */
+
+ last_valid_page= LSN_IMPOSSIBLE;
+ /*
+ Scans and validate pages. We need it to show "outside" only for sure
+ valid part of the log. If the log was damaged then fixed we have to
+ cut off damaged part before some other process start write something
+ in the log.
+ */
+ do
+ {
+ TRANSLOG_ADDRESS current_file_last_page;
+ current_file_last_page= current_page;
+ if (translog_get_last_page_addr(&current_file_last_page, &pageok, 0))
+ goto err;
+ if (!pageok)
+ {
+ DBUG_PRINT("error", ("File %lu have no complete last page",
+ (ulong) LSN_FILE_NO(current_file_last_page)));
+ old_log_was_recovered= 1;
+ /* This file is not written till the end so it should be last */
+ last_page= current_file_last_page;
+ /* TODO: issue warning */
+ }
+ do
+ {
+ TRANSLOG_VALIDATOR_DATA data;
+ TRANSLOG_PAGE_SIZE_BUFF psize_buff;
+ uchar *page;
+ data.addr= &current_page;
+ if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL)
+ goto err;
+ if (data.was_recovered)
+ {
+ DBUG_PRINT("error", ("file no: %lu (%d) "
+ "rec_offset: 0x%lx (%lu) (%d)",
+ (ulong) LSN_FILE_NO(current_page),
+ (uint3korr(page + 3) !=
+ LSN_FILE_NO(current_page)),
+ (ulong) LSN_OFFSET(current_page),
+ (ulong) (LSN_OFFSET(current_page) /
+ TRANSLOG_PAGE_SIZE),
+ (uint3korr(page) !=
+ LSN_OFFSET(current_page) /
+ TRANSLOG_PAGE_SIZE)));
+ old_log_was_recovered= 1;
+ break;
+ }
+ old_flags= page[TRANSLOG_PAGE_FLAGS];
+ last_valid_page= current_page;
+ current_page+= TRANSLOG_PAGE_SIZE; /* increase offset */
+ } while (current_page <= current_file_last_page);
+ current_page+= LSN_ONE_FILE;
+ current_page= LSN_REPLACE_OFFSET(current_page, TRANSLOG_PAGE_SIZE);
+ } while (LSN_FILE_NO(current_page) <= LSN_FILE_NO(last_page) &&
+ !old_log_was_recovered);
+ if (last_valid_page == LSN_IMPOSSIBLE)
+ {
+ /* Panic!!! Even page which should be valid is invalid */
+ /* TODO: issue error */
+ goto err;
+ }
+ DBUG_PRINT("info", ("Last valid page is in file: %lu "
+ "offset: %lu (0x%lx) "
+ "Logs found: %d was recovered: %d "
+ "flags match: %d",
+ (ulong) LSN_FILE_NO(last_valid_page),
+ (ulong) LSN_OFFSET(last_valid_page),
+ (ulong) LSN_OFFSET(last_valid_page),
+ logs_found, old_log_was_recovered,
+ (old_flags == flags)));
+
+ /* TODO: check server ID */
+ if (logs_found && !old_log_was_recovered && old_flags == flags)
+ {
+ TRANSLOG_VALIDATOR_DATA data;
+ TRANSLOG_PAGE_SIZE_BUFF psize_buff;
+ uchar *page;
+ uint16 chunk_offset;
+ data.addr= &last_valid_page;
+ /* continue old log */
+ DBUG_ASSERT(LSN_FILE_NO(last_valid_page)==
+ LSN_FILE_NO(log_descriptor.horizon));
+ if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL ||
+ (chunk_offset= translog_get_first_chunk_offset(page)) == 0)
+ goto err;
+
+ /* Puts filled part of old page in the buffer */
+ log_descriptor.horizon= last_valid_page;
+ translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+ /*
+ Free space if filled with TRANSLOG_FILLER and first uchar of
+ real chunk can't be TRANSLOG_FILLER
+ */
+ while (chunk_offset < TRANSLOG_PAGE_SIZE &&
+ page[chunk_offset] != TRANSLOG_FILLER)
+ {
+ uint16 chunk_length;
+ if ((chunk_length=
+ translog_get_total_chunk_length(page, chunk_offset)) == 0)
+ goto err;
+ DBUG_PRINT("info", ("chunk: offset: %u length: %u",
+ (uint) chunk_offset, (uint) chunk_length));
+ chunk_offset+= chunk_length;
+
+ /* chunk can't cross the page border */
+ DBUG_ASSERT(chunk_offset <= TRANSLOG_PAGE_SIZE);
+ }
+ memcpy(log_descriptor.buffers->buffer, page, chunk_offset);
+ log_descriptor.bc.buffer->size+= chunk_offset;
+ log_descriptor.bc.ptr+= chunk_offset;
+ log_descriptor.bc.current_page_fill= chunk_offset;
+ log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon,
+ (chunk_offset +
+ LSN_OFFSET(last_valid_page)));
+ DBUG_PRINT("info", ("Move Page #%u: 0x%lx chaser: %d Size: %lu (%lu)",
+ (uint) log_descriptor.bc.buffer_no,
+ (ulong) log_descriptor.bc.buffer,
+ log_descriptor.bc.chaser,
+ (ulong) log_descriptor.bc.buffer->size,
+ (ulong) (log_descriptor.bc.ptr - log_descriptor.bc.
+ buffer->buffer)));
+ translog_check_cursor(&log_descriptor.bc);
+ }
+ if (!old_log_was_recovered && old_flags == flags)
+ {
+ LOGHANDLER_FILE_INFO info;
+ LINT_INIT_STRUCT(info);
+
+ /*
+ Accessing &log_descriptor.open_files without mutex is safe
+ because it is initialization
+ */
+ if (translog_read_file_header(&info,
+ (*dynamic_element(&log_descriptor.
+ open_files,
+ 0, TRANSLOG_FILE **))->
+ handler.file))
+ goto err;
+ version_changed= (info.maria_version != TRANSLOG_VERSION_ID);
+ }
+ }
+ DBUG_PRINT("info", ("Logs found: %d was recovered: %d",
+ logs_found, old_log_was_recovered));
+ if (!logs_found)
+ {
+ TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(sizeof(TRANSLOG_FILE),
+ MYF(0));
+ DBUG_PRINT("info", ("The log is not found => we will create new log"));
+ if (file == NULL)
+ goto err;
+ /* Start new log system from scratch */
+ log_descriptor.horizon= MAKE_LSN(start_file_num,
+ TRANSLOG_PAGE_SIZE); /* header page */
+ if ((file->handler.file=
+ create_logfile_by_number_no_cache(start_file_num)) == -1)
+ goto err;
+ translog_file_init(file, start_file_num, 0);
+ if (insert_dynamic(&log_descriptor.open_files, (uchar*)&file))
+ goto err;
+ log_descriptor.min_file= log_descriptor.max_file= start_file_num;
+ if (translog_write_file_header())
+ goto err;
+ DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+ log_descriptor.open_files.elements);
+
+ if (ma_control_file_write_and_force(checkpoint_lsn, start_file_num,
+ max_trid_in_control_file,
+ recovery_failures))
+ goto err;
+ /* assign buffer 0 */
+ translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+ translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
+ }
+ else if ((old_log_was_recovered || old_flags != flags || version_changed) &&
+ !readonly)
+ {
+ /* leave the damaged file untouched */
+ log_descriptor.horizon+= LSN_ONE_FILE;
+ /* header page */
+ log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon,
+ TRANSLOG_PAGE_SIZE);
+ if (translog_create_new_file())
+ goto err;
+ /*
+ Buffer system left untouched after recovery => we should init it
+ (starting from buffer 0)
+ */
+ translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+ translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
+ }
+
+ /* all LSNs that are on disk are flushed */
+ log_descriptor.log_start= log_descriptor.sent_to_disk=
+ log_descriptor.flushed= log_descriptor.horizon;
+ log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
+ log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
+ /*
+ Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially)
+ address of the next LSN and we want indicate that all LSNs that are
+ already on the disk are flushed so we need decrease horizon on 1 (we are
+ sure that there is no LSN on the disk which is greater then 'flushed'
+ and there will not be LSN created that is equal or less then the value
+ of the 'flushed').
+ */
+ log_descriptor.flushed--; /* offset decreased */
+ log_descriptor.sent_to_disk--; /* offset decreased */
+ /*
+ Log records will refer to a MARIA_SHARE by a unique 2-byte id; set up
+ structures for generating 2-byte ids:
+ */
+ my_atomic_rwlock_init(&LOCK_id_to_share);
+ id_to_share= (MARIA_SHARE **) my_malloc(SHARE_ID_MAX * sizeof(MARIA_SHARE*),
+ MYF(MY_WME | MY_ZEROFILL));
+ if (unlikely(!id_to_share))
+ goto err;
+ id_to_share--; /* min id is 1 */
+
+ /* Check the last LSN record integrity */
+ if (logs_found)
+ {
+ TRANSLOG_SCANNER_DATA scanner;
+ TRANSLOG_ADDRESS page_addr;
+ LSN last_lsn= LSN_IMPOSSIBLE;
+ /*
+ take very last page address and try to find LSN record on it
+ if it fail take address of previous page and so on
+ */
+ page_addr= (log_descriptor.horizon -
+ ((log_descriptor.horizon - 1) % TRANSLOG_PAGE_SIZE + 1));
+ if (translog_scanner_init(page_addr, 1, &scanner, 1))
+ goto err;
+ scanner.page_offset= page_overhead[scanner.page[TRANSLOG_PAGE_FLAGS]];
+ for (;;)
+ {
+ uint chunk_1byte;
+ chunk_1byte= scanner.page[scanner.page_offset];
+ while (!translog_is_LSN_chunk(chunk_1byte) &&
+ scanner.page != END_OF_LOG &&
+ scanner.page[scanner.page_offset] != TRANSLOG_FILLER &&
+ scanner.page_addr == page_addr)
+ {
+ if (translog_get_next_chunk(&scanner))
+ {
+ translog_destroy_scanner(&scanner);
+ goto err;
+ }
+ if (scanner.page != END_OF_LOG)
+ chunk_1byte= scanner.page[scanner.page_offset];
+ }
+ if (translog_is_LSN_chunk(chunk_1byte))
+ {
+ last_lsn= scanner.page_addr + scanner.page_offset;
+ if (translog_get_next_chunk(&scanner))
+ {
+ translog_destroy_scanner(&scanner);
+ goto err;
+ }
+ if (scanner.page == END_OF_LOG)
+ break; /* it was the last record */
+ chunk_1byte= scanner.page[scanner.page_offset];
+ continue; /* try to find other record on this page */
+ }
+
+ if (last_lsn != LSN_IMPOSSIBLE)
+ break; /* there is no more records on the page */
+
+ /* We have to make step back */
+ if (unlikely(LSN_OFFSET(page_addr) == TRANSLOG_PAGE_SIZE))
+ {
+ uint32 file_no= LSN_FILE_NO(page_addr);
+ my_bool last_page_ok;
+ /* it is beginning of the current file */
+ if (unlikely(file_no == 1))
+ {
+ /*
+ It is beginning of the log => there is no LSNs in the log =>
+ There is no harm in leaving it "as-is".
+ */
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.
+ previous_flush_horizon)));
+ DBUG_RETURN(0);
+ }
+ file_no--;
+ page_addr= MAKE_LSN(file_no, TRANSLOG_PAGE_SIZE);
+ translog_get_last_page_addr(&page_addr, &last_page_ok, 0);
+ /* page should be OK as it is not the last file */
+ DBUG_ASSERT(last_page_ok);
+ }
+ else
+ {
+ page_addr-= TRANSLOG_PAGE_SIZE;
+ }
+ translog_destroy_scanner(&scanner);
+ if (translog_scanner_init(page_addr, 1, &scanner, 1))
+ goto err;
+ scanner.page_offset= page_overhead[scanner.page[TRANSLOG_PAGE_FLAGS]];
+ }
+ translog_destroy_scanner(&scanner);
+
+ /* Now scanner points to the last LSN chunk, lets check it */
+ {
+ TRANSLOG_HEADER_BUFFER rec;
+ translog_size_t rec_len;
+ int len;
+ uchar buffer[1];
+ DBUG_PRINT("info", ("going to check the last found record (%lu,0x%lx)",
+ LSN_IN_PARTS(last_lsn)));
+
+ len=
+ translog_read_record_header(last_lsn, &rec);
+ if (unlikely (len == RECHEADER_READ_ERROR ||
+ len == RECHEADER_READ_EOF))
+ {
+ DBUG_PRINT("error", ("unexpected end of log or record during "
+ "reading record header: (%lu,0x%lx) len: %d",
+ LSN_IN_PARTS(last_lsn), len));
+ if (readonly)
+ log_descriptor.log_start= log_descriptor.horizon= last_lsn;
+ else if (translog_truncate_log(last_lsn))
+ {
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ }
+ else
+ {
+ DBUG_ASSERT(last_lsn == rec.lsn);
+ if (likely(rec.record_length != 0))
+ {
+ /*
+ Reading the last byte of record will trigger scanning all
+ record chunks for now
+ */
+ rec_len= translog_read_record(rec.lsn, rec.record_length - 1, 1,
+ buffer, NULL);
+ if (rec_len != 1)
+ {
+ DBUG_PRINT("error", ("unexpected end of log or record during "
+ "reading record body: (%lu,0x%lx) len: %d",
+ LSN_IN_PARTS(rec.lsn),
+ len));
+ if (readonly)
+ log_descriptor.log_start= log_descriptor.horizon= last_lsn;
+
+ else if (translog_truncate_log(last_lsn))
+ {
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ }
+ }
+ }
+ translog_free_record_header(&rec);
+ }
+ }
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.previous_flush_horizon)));
+ DBUG_RETURN(0);
+err:
+ ma_message_no_user(0, "log initialization failed");
+ DBUG_RETURN(1);
+}
+
+
+/*
+ @brief Free one transaction log file buffer.
+
+ @param buffer The buffer to destroy
+
+ @note If the buffer is still attached to a file, its content is flushed
+ to disk first (errors ignored: this runs at shutdown). The buffer's
+ mutex and condition variable are destroyed afterwards.
+*/
+
+static void translog_buffer_destroy(struct st_translog_buffer *buffer)
+{
+ DBUG_ENTER("translog_buffer_destroy");
+ DBUG_PRINT("enter",
+ ("Buffer #%u: 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ (buffer->file ? buffer->file->handler.file : -1),
+ LSN_IN_PARTS(buffer->offset),
+ (ulong) buffer->size));
+ if (buffer->file != NULL)
+ {
+ /*
+ We ignore errors here, because we can't do something about it
+ (it is shutting down)
+
+ We also have to take the locks even if there can't be any other
+ threads running, because translog_buffer_flush()
+ requires that we have the buffer locked.
+ */
+ translog_buffer_lock(buffer);
+ translog_buffer_flush(buffer);
+ translog_buffer_unlock(buffer);
+ }
+ DBUG_PRINT("info", ("Destroy mutex: 0x%lx", (ulong) &buffer->mutex));
+ pthread_mutex_destroy(&buffer->mutex);
+ pthread_cond_destroy(&buffer->waiting_filling_buffer);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Free log handler resources
+
+ SYNOPSIS
+ translog_destroy()
+
+ NOTES
+ Finishes the current page (if a file is attached), flushes and destroys
+ all write buffers, closes all open log files and destroys every mutex,
+ condition variable and dynamic array owned by the log descriptor.
+ Must only be called when translog_status is TRANSLOG_OK or
+ TRANSLOG_READONLY (asserted below).
+*/
+
+void translog_destroy()
+{
+ TRANSLOG_FILE **file;
+ uint i;
+ uint8 current_buffer;
+ DBUG_ENTER("translog_destroy");
+
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+ translog_lock();
+ current_buffer= log_descriptor.bc.buffer_no;
+ /*
+ Mark the handler as shutting down before touching the buffers.
+ Read-only mode goes straight to TRANSLOG_UNINITED (nothing to flush).
+ */
+ translog_status= (translog_status == TRANSLOG_READONLY ?
+ TRANSLOG_UNINITED :
+ TRANSLOG_SHUTDOWN);
+ if (log_descriptor.bc.buffer->file != NULL)
+ translog_finish_page(&log_descriptor.horizon, &log_descriptor.bc);
+ translog_unlock();
+
+ /*
+ Destroy buffers in ring order starting just after the current one,
+ so the current (most recently written) buffer is flushed and
+ destroyed last.
+ */
+ for (i= 0; i < TRANSLOG_BUFFERS_NO; i++)
+ {
+ struct st_translog_buffer *buffer= (log_descriptor.buffers +
+ ((i + current_buffer + 1) %
+ TRANSLOG_BUFFERS_NO));
+ translog_buffer_destroy(buffer);
+ }
+ translog_status= TRANSLOG_UNINITED;
+
+ /* close files */
+ while ((file= (TRANSLOG_FILE **)pop_dynamic(&log_descriptor.open_files)))
+ translog_close_log_file(*file);
+ pthread_mutex_destroy(&log_descriptor.sent_to_disk_lock);
+ pthread_mutex_destroy(&log_descriptor.file_header_lock);
+ pthread_mutex_destroy(&log_descriptor.unfinished_files_lock);
+ pthread_mutex_destroy(&log_descriptor.purger_lock);
+ pthread_mutex_destroy(&log_descriptor.log_flush_lock);
+ pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock);
+ pthread_cond_destroy(&log_descriptor.log_flush_cond);
+ pthread_cond_destroy(&log_descriptor.new_goal_cond);
+ rwlock_destroy(&log_descriptor.open_files_lock);
+ delete_dynamic(&log_descriptor.open_files);
+ delete_dynamic(&log_descriptor.unfinished_files);
+
+ if (log_descriptor.directory_fd >= 0)
+ my_close(log_descriptor.directory_fd, MYF(MY_WME));
+ my_atomic_rwlock_destroy(&LOCK_id_to_share);
+ if (id_to_share != NULL)
+ /* id_to_share was decremented by 1 at init (min id is 1), so
+ free the originally allocated pointer */
+ my_free((id_to_share + 1), MYF(MY_WME));
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ @brief Starts new page.
+
+ @param horizon \ Position in file and buffer where we are
+ @param cursor /
+ @param prev_buffer Buffer which should be flushed will be assigned here.
+ This is always set (to NULL if nothing to flush).
+
+ @note We do not want to flush the buffer immediately because we want to
+ let caller of this function first advance 'horizon' pointer and unlock the
+ loghandler and only then flush the log which can take some time.
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool translog_page_next(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor,
+ struct st_translog_buffer **prev_buffer)
+{
+ struct st_translog_buffer *buffer= cursor->buffer;
+ DBUG_ENTER("translog_page_next");
+
+ *prev_buffer= NULL;
+ /*
+ Switch to the next buffer when either the next page would not fit in
+ the current write buffer, or the current log file has reached its
+ maximum size (then a new file is started as well).
+ */
+ if ((cursor->ptr + TRANSLOG_PAGE_SIZE >
+ cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER) ||
+ (LSN_OFFSET(*horizon) >
+ log_descriptor.log_file_max_size - TRANSLOG_PAGE_SIZE))
+ {
+ DBUG_PRINT("info", ("Switch to next buffer Buffer Size: %lu (%lu) => %d "
+ "File size: %lu max: %lu => %d",
+ (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer),
+ (cursor->ptr + TRANSLOG_PAGE_SIZE >
+ cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER),
+ (ulong) LSN_OFFSET(*horizon),
+ (ulong) log_descriptor.log_file_max_size,
+ (LSN_OFFSET(*horizon) >
+ (log_descriptor.log_file_max_size -
+ TRANSLOG_PAGE_SIZE))));
+ if (translog_buffer_next(horizon, cursor,
+ LSN_OFFSET(*horizon) >
+ (log_descriptor.log_file_max_size -
+ TRANSLOG_PAGE_SIZE)))
+ DBUG_RETURN(1);
+ /* hand the filled buffer back to the caller for deferred flushing */
+ *prev_buffer= buffer;
+ DBUG_PRINT("info", ("Buffer #%u (0x%lu): have to be flushed",
+ (uint) buffer->buffer_no, (ulong) buffer));
+ }
+ else
+ {
+ /*
+ The next page fits into the same buffer: just finish the current
+ page and write the new page header.
+ */
+ DBUG_PRINT("info", ("Use the same buffer #%u (0x%lu): "
+ "Buffer Size: %lu (%lu)",
+ (uint) buffer->buffer_no,
+ (ulong) buffer,
+ (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer)));
+ translog_finish_page(horizon, cursor);
+ translog_new_page_header(horizon, cursor);
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Write data of given length to the current page
+
+ SYNOPSIS
+ translog_write_data_on_page()
+ horizon \ Pointers on file and buffer
+ cursor /
+ length IN length of the chunk
+ buffer buffer with data
+
+ NOTE
+ The caller must guarantee that 'length' fits on the current page and
+ inside the write buffer (both asserted below). Always succeeds.
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_write_data_on_page(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor,
+ translog_size_t length,
+ uchar *buffer)
+{
+ DBUG_ENTER("translog_write_data_on_page");
+ DBUG_PRINT("enter", ("Chunk length: %lu Page size %u",
+ (ulong) length, (uint) cursor->current_page_fill));
+ DBUG_ASSERT(length > 0);
+ DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE);
+ DBUG_ASSERT(length + cursor->ptr <= cursor->buffer->buffer +
+ TRANSLOG_WRITE_BUFFER);
+
+ memcpy(cursor->ptr, buffer, length);
+ cursor->ptr+= length;
+ (*horizon)+= length; /* adds offset */
+ cursor->current_page_fill+= length;
+ /* only the owning (non-chaser) cursor maintains buffer->size */
+ if (!cursor->chaser)
+ cursor->buffer->size+= length;
+ DBUG_PRINT("info", ("Write data buffer #%u: 0x%lx "
+ "chaser: %d Size: %lu (%lu)",
+ (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+ cursor->chaser, (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer)));
+ translog_check_cursor(cursor);
+
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Write data from parts of given length to the current page
+
+ SYNOPSIS
+ translog_write_parts_on_page()
+ horizon \ Pointers on file and buffer
+ cursor /
+ length IN length of the chunk
+ parts IN/OUT chunk source
+
+ NOTE
+ Consumes 'length' bytes from the parts array, advancing
+ parts->current past fully written parts and shrinking a partially
+ written part in place. 'length' must fit on the current page and in
+ the write buffer (asserted below). Always succeeds.
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_write_parts_on_page(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor,
+ translog_size_t length,
+ struct st_translog_parts *parts)
+{
+ translog_size_t left= length;
+ uint cur= (uint) parts->current;
+ DBUG_ENTER("translog_write_parts_on_page");
+ DBUG_PRINT("enter", ("Chunk length: %lu parts: %u of %u. Page size: %u "
+ "Buffer size: %lu (%lu)",
+ (ulong) length,
+ (uint) (cur + 1), (uint) parts->elements,
+ (uint) cursor->current_page_fill,
+ (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer)));
+ DBUG_ASSERT(length > 0);
+ DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE);
+ DBUG_ASSERT(length + cursor->ptr <= cursor->buffer->buffer +
+ TRANSLOG_WRITE_BUFFER);
+
+ do
+ {
+ translog_size_t len;
+ LEX_CUSTRING *part;
+ const uchar *buff;
+
+ DBUG_ASSERT(cur < parts->elements);
+ part= parts->parts + cur;
+ buff= part->str;
+ DBUG_PRINT("info", ("Part: %u Length: %lu left: %lu buff: 0x%lx",
+ (uint) (cur + 1), (ulong) part->length, (ulong) left,
+ (ulong) buff));
+
+ if (part->length > left)
+ {
+ /* we should write less than the current part; shrink it in place */
+ len= left;
+ part->length-= len;
+ part->str+= len;
+ DBUG_PRINT("info", ("Set new part: %u Length: %lu",
+ (uint) (cur + 1), (ulong) part->length));
+ }
+ else
+ {
+ /* the whole part is consumed; advance to the next one */
+ len= (translog_size_t) part->length;
+ cur++;
+ DBUG_PRINT("info", ("moved to next part (len: %lu)", (ulong) len));
+ }
+ DBUG_PRINT("info", ("copy: 0x%lx <- 0x%lx %u",
+ (ulong) cursor->ptr, (ulong)buff, (uint)len));
+ if (likely(len))
+ {
+ memcpy(cursor->ptr, buff, len);
+ left-= len;
+ cursor->ptr+= len;
+ }
+ } while (left);
+
+ DBUG_PRINT("info", ("Horizon: (%lu,0x%lx) Length %lu(0x%lx)",
+ LSN_IN_PARTS(*horizon),
+ (ulong) length, (ulong) length));
+ parts->current= cur;
+ (*horizon)+= length; /* offset increasing */
+ cursor->current_page_fill+= length;
+ /* only the owning (non-chaser) cursor maintains buffer->size */
+ if (!cursor->chaser)
+ cursor->buffer->size+= length;
+ /*
+ We do not update parts->total_record_length here because it is
+ needed only before writing the record, to know the total length
+ */
+ DBUG_PRINT("info", ("Write parts buffer #%u: 0x%lx "
+ "chaser: %d Size: %lu (%lu) "
+ "Horizon: (%lu,0x%lx) buff offset: 0x%lx",
+ (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+ cursor->chaser, (ulong) cursor->buffer->size,
+ (ulong) (cursor->ptr - cursor->buffer->buffer),
+ LSN_IN_PARTS(*horizon),
+ (ulong) (LSN_OFFSET(cursor->buffer->offset) +
+ cursor->buffer->size)));
+ translog_check_cursor(cursor);
+
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Put 1 group chunk type 0 header into parts array
+
+ SYNOPSIS
+ translog_write_variable_record_1group_header()
+ parts Descriptor of record source parts
+ type The log record type
+ short_trid Short transaction ID or 0 if it has no sense
+ header_length Calculated header length of chunk type 0
+ chunk0_header Buffer for the chunk header writing
+
+ NOTE
+ Fills 'chunk0_header' and prepends it as a new first part by
+ decrementing parts->current (slot 0 is reserved for the header);
+ parts->total_record_length is increased accordingly.
+*/
+
+static void
+translog_write_variable_record_1group_header(struct st_translog_parts *parts,
+ enum translog_record_type type,
+ SHORT_TRANSACTION_ID short_trid,
+ uint16 header_length,
+ uchar *chunk0_header)
+{
+ LEX_CUSTRING *part;
+ DBUG_ASSERT(parts->current != 0); /* first part is left for header */
+ part= parts->parts + (--parts->current);
+ parts->total_record_length+= (translog_size_t) (part->length= header_length);
+ part->str= chunk0_header;
+ /* puts chunk type */
+ *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN);
+ /* puts short transaction id */
+ int2store(chunk0_header + 1, short_trid);
+ /* puts record length */
+ translog_write_variable_record_1group_code_len(chunk0_header + 3,
+ parts->record_length,
+ header_length);
+ /* puts 0 as chunk length which indicate 1 group record */
+ int2store(chunk0_header + header_length - 2, 0);
+}
+
+
+/*
+ Increase number of writers for this buffer
+
+ SYNOPSIS
+ translog_buffer_increase_writers()
+ buffer target buffer
+
+ NOTE
+ The caller must hold the buffer's lock (asserted below). Must be
+ paired with translog_buffer_decrease_writers().
+*/
+
+static inline void
+translog_buffer_increase_writers(struct st_translog_buffer *buffer)
+{
+ DBUG_ENTER("translog_buffer_increase_writers");
+ translog_buffer_lock_assert_owner(buffer);
+ buffer->copy_to_buffer_in_progress++;
+ DBUG_PRINT("info", ("copy_to_buffer_in_progress. Buffer #%u 0x%lx progress: %d",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ buffer->copy_to_buffer_in_progress));
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Decrease number of writers for this buffer
+
+ SYNOPSIS
+ translog_buffer_decrease_writers()
+ buffer target buffer
+
+ NOTE
+ The caller must hold the buffer's lock (asserted below). When the
+ last writer finishes, all threads waiting on
+ 'waiting_filling_buffer' are woken up.
+*/
+
+static void translog_buffer_decrease_writers(struct st_translog_buffer *buffer)
+{
+ DBUG_ENTER("translog_buffer_decrease_writers");
+ translog_buffer_lock_assert_owner(buffer);
+ buffer->copy_to_buffer_in_progress--;
+ DBUG_PRINT("info",
+ ("copy_to_buffer_in_progress. Buffer #%u 0x%lx progress: %d",
+ (uint) buffer->buffer_no, (ulong) buffer,
+ buffer->copy_to_buffer_in_progress));
+ if (buffer->copy_to_buffer_in_progress == 0)
+ pthread_cond_broadcast(&buffer->waiting_filling_buffer);
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Skip to the next page for chaser (thread which advanced horizon
+ pointer and now is filling the buffer)
+
+ @param horizon \ Pointers on file position and buffer
+ @param cursor /
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool translog_chaser_page_next(TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor)
+{
+ struct st_translog_buffer *buffer_to_flush;
+ my_bool rc;
+ DBUG_ENTER("translog_chaser_page_next");
+ DBUG_ASSERT(cursor->chaser);
+ rc= translog_page_next(horizon, cursor, &buffer_to_flush);
+ if (buffer_to_flush != NULL)
+ {
+ /* the chaser is done with the previous buffer: release it and flush */
+ translog_buffer_lock(buffer_to_flush);
+ translog_buffer_decrease_writers(buffer_to_flush);
+ if (!rc)
+ rc= translog_buffer_flush(buffer_to_flush);
+ translog_buffer_unlock(buffer_to_flush);
+ }
+ DBUG_RETURN(rc);
+}
+
+/*
+ Put chunk 2 from new page beginning
+
+ SYNOPSIS
+ translog_write_variable_record_chunk2_page()
+ parts Descriptor of record source parts
+ horizon \ Pointers on file position and buffer
+ cursor /
+
+ NOTE
+ Chunk 2 has no length field: it always fills a whole page
+ (page_capacity_chunk_2 bytes after the 1-byte chunk type).
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool
+translog_write_variable_record_chunk2_page(struct st_translog_parts *parts,
+ TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor)
+{
+ uchar chunk2_header[1];
+ DBUG_ENTER("translog_write_variable_record_chunk2_page");
+ chunk2_header[0]= TRANSLOG_CHUNK_NOHDR;
+
+ if (translog_chaser_page_next(horizon, cursor))
+ DBUG_RETURN(1);
+
+ /* Puts chunk type */
+ /* (return values ignored: both writers below return 0 unconditionally) */
+ translog_write_data_on_page(horizon, cursor, 1, chunk2_header);
+ /* Puts chunk body */
+ translog_write_parts_on_page(horizon, cursor,
+ log_descriptor.page_capacity_chunk_2, parts);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Put chunk 3 of requested length in the buffer from new page beginning
+
+ SYNOPSIS
+ translog_write_variable_record_chunk3_page()
+ parts Descriptor of record source parts
+ length Length of this chunk
+ horizon \ Pointers on file position and buffer
+ cursor /
+
+ NOTE
+ With length == 0 only the new page header is written. Otherwise a
+ 3-byte chunk header (1-byte type + 2-byte length) is prepended to the
+ parts array and written together with 'length' bytes of data.
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool
+translog_write_variable_record_chunk3_page(struct st_translog_parts *parts,
+ uint16 length,
+ TRANSLOG_ADDRESS *horizon,
+ struct st_buffer_cursor *cursor)
+{
+ LEX_CUSTRING *part;
+ uchar chunk3_header[1 + 2];
+ DBUG_ENTER("translog_write_variable_record_chunk3_page");
+
+ if (translog_chaser_page_next(horizon, cursor))
+ DBUG_RETURN(1);
+
+ if (length == 0)
+ {
+ /* It was call to write page header only (no data for chunk 3) */
+ DBUG_PRINT("info", ("It is a call to make page header only"));
+ DBUG_RETURN(0);
+ }
+
+ DBUG_ASSERT(parts->current != 0); /* first part is left for header */
+ part= parts->parts + (--parts->current);
+ parts->total_record_length+= (translog_size_t) (part->length= 1 + 2);
+ part->str= chunk3_header;
+ /* Puts chunk type */
+ *chunk3_header= (uchar) (TRANSLOG_CHUNK_LNGTH);
+ /* Puts chunk length */
+ int2store(chunk3_header + 1, length);
+
+ translog_write_parts_on_page(horizon, cursor, length + 1 + 2, parts);
+ DBUG_RETURN(0);
+}
+
+/*
+ Move log pointer (horizon) on given number pages starting from next page,
+ and given offset on the last page
+
+ SYNOPSIS
+ translog_advance_pointer()
+ pages Number of full pages starting from the next one
+ last_page_data Plus this data on the last page
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_advance_pointer(int pages, uint16 last_page_data)
+{
+ translog_size_t last_page_offset= (log_descriptor.page_overhead +
+ last_page_data);
+ translog_size_t offset= (TRANSLOG_PAGE_SIZE -
+ log_descriptor.bc.current_page_fill +
+ pages * TRANSLOG_PAGE_SIZE + last_page_offset);
+ translog_size_t buffer_end_offset, file_end_offset, min_offset;
+ DBUG_ENTER("translog_advance_pointer");
+ DBUG_PRINT("enter", ("Pointer: (%lu, 0x%lx) + %u + %u pages + %u + %u",
+ LSN_IN_PARTS(log_descriptor.horizon),
+ (uint) (TRANSLOG_PAGE_SIZE -
+ log_descriptor.bc.current_page_fill),
+ pages, (uint) log_descriptor.page_overhead,
+ (uint) last_page_data));
+ translog_lock_assert_owner();
+
+ if (pages == -1)
+ {
+ /*
+ It is special case when we advance the pointer on the same page.
+ It can happened when we write last part of multi-group record.
+ */
+ DBUG_ASSERT(last_page_data + log_descriptor.bc.current_page_fill <=
+ TRANSLOG_PAGE_SIZE);
+ offset= last_page_data;
+ last_page_offset= log_descriptor.bc.current_page_fill + last_page_data;
+ goto end;
+ }
+ DBUG_PRINT("info", ("last_page_offset %lu", (ulong) last_page_offset));
+ DBUG_ASSERT(last_page_offset <= TRANSLOG_PAGE_SIZE);
+
+ /*
+ The loop will be executed 1-3 times. Usually we advance the
+ pointer to fill only the current buffer (if we have more then 1/2 of
+ buffer free or 2 buffers (rest of current and all next). In case of
+ really huge record end where we write last group with "table of
+ content" of all groups and ignore buffer borders we can occupy
+ 3 buffers.
+ */
+ for (;;)
+ {
+ uint8 new_buffer_no;
+ struct st_translog_buffer *new_buffer;
+ struct st_translog_buffer *old_buffer;
+ buffer_end_offset= TRANSLOG_WRITE_BUFFER - log_descriptor.bc.buffer->size;
+ if (likely(log_descriptor.log_file_max_size >=
+ LSN_OFFSET(log_descriptor.horizon)))
+ file_end_offset= (log_descriptor.log_file_max_size -
+ LSN_OFFSET(log_descriptor.horizon));
+ else
+ {
+ /*
+ We already have written more then current file limit allow,
+ So we will finish this page and start new file
+ */
+ file_end_offset= (TRANSLOG_PAGE_SIZE -
+ log_descriptor.bc.current_page_fill);
+ }
+ DBUG_PRINT("info", ("offset: %lu buffer_end_offs: %lu, "
+ "file_end_offs: %lu",
+ (ulong) offset, (ulong) buffer_end_offset,
+ (ulong) file_end_offset));
+ DBUG_PRINT("info", ("Buff #%u %u (0x%lx) offset 0x%lx + size 0x%lx = "
+ "0x%lx (0x%lx)",
+ (uint) log_descriptor.bc.buffer->buffer_no,
+ (uint) log_descriptor.bc.buffer_no,
+ (ulong) log_descriptor.bc.buffer,
+ (ulong) LSN_OFFSET(log_descriptor.bc.buffer->offset),
+ (ulong) log_descriptor.bc.buffer->size,
+ (ulong) (LSN_OFFSET(log_descriptor.bc.buffer->offset) +
+ log_descriptor.bc.buffer->size),
+ (ulong) LSN_OFFSET(log_descriptor.horizon)));
+ DBUG_ASSERT(LSN_OFFSET(log_descriptor.bc.buffer->offset) +
+ log_descriptor.bc.buffer->size ==
+ LSN_OFFSET(log_descriptor.horizon));
+
+ if (offset <= buffer_end_offset && offset <= file_end_offset)
+ break;
+ old_buffer= log_descriptor.bc.buffer;
+ new_buffer_no= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+ new_buffer= log_descriptor.buffers + new_buffer_no;
+
+ translog_buffer_lock(new_buffer);
+#ifndef DBUG_OFF
+ {
+ TRANSLOG_ADDRESS offset= new_buffer->offset;
+ TRANSLOG_FILE *file= new_buffer->file;
+ uint8 ver= new_buffer->ver;
+ translog_lock_assert_owner();
+#endif
+ translog_wait_for_buffer_free(new_buffer);
+#ifndef DBUG_OFF
+ /* We keep the handler locked so nobody can start this new buffer */
+ DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL &&
+ (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver);
+ }
+#endif
+
+ min_offset= min(buffer_end_offset, file_end_offset);
+ /* TODO: check is it ptr or size enough */
+ log_descriptor.bc.buffer->size+= min_offset;
+ log_descriptor.bc.ptr+= min_offset;
+ DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx chaser: %d Size: %lu (%lu)",
+ (uint) log_descriptor.bc.buffer->buffer_no,
+ (ulong) log_descriptor.bc.buffer,
+ log_descriptor.bc.chaser,
+ (ulong) log_descriptor.bc.buffer->size,
+ (ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
+ buffer->buffer)));
+ DBUG_ASSERT((ulong) (log_descriptor.bc.ptr -
+ log_descriptor.bc.buffer->buffer) ==
+ log_descriptor.bc.buffer->size);
+ DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+ log_descriptor.bc.buffer_no);
+ translog_buffer_increase_writers(log_descriptor.bc.buffer);
+
+ if (file_end_offset <= buffer_end_offset)
+ {
+ log_descriptor.horizon+= LSN_ONE_FILE;
+ log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon,
+ TRANSLOG_PAGE_SIZE);
+ DBUG_PRINT("info", ("New file: %lu",
+ (ulong) LSN_FILE_NO(log_descriptor.horizon)));
+ if (translog_create_new_file())
+ {
+ DBUG_RETURN(1);
+ }
+ }
+ else
+ {
+ DBUG_PRINT("info", ("The same file"));
+ log_descriptor.horizon+= min_offset; /* offset increasing */
+ }
+ translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no);
+ old_buffer->next_buffer_offset= new_buffer->offset;
+ new_buffer->prev_buffer_offset= old_buffer->offset;
+ translog_buffer_unlock(old_buffer);
+ offset-= min_offset;
+ }
+ DBUG_PRINT("info", ("drop write_counter"));
+ log_descriptor.bc.write_counter= 0;
+ log_descriptor.bc.previous_offset= 0;
+end:
+ log_descriptor.bc.ptr+= offset;
+ log_descriptor.bc.buffer->size+= offset;
+ translog_buffer_increase_writers(log_descriptor.bc.buffer);
+ log_descriptor.horizon+= offset; /* offset increasing */
+ log_descriptor.bc.current_page_fill= last_page_offset;
+ DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx chaser: %d Size: %lu (%lu) "
+ "offset: %u last page: %u",
+ (uint) log_descriptor.bc.buffer->buffer_no,
+ (ulong) log_descriptor.bc.buffer,
+ log_descriptor.bc.chaser,
+ (ulong) log_descriptor.bc.buffer->size,
+ (ulong) (log_descriptor.bc.ptr -
+ log_descriptor.bc.buffer->
+ buffer), (uint) offset,
+ (uint) last_page_offset));
+ DBUG_PRINT("info",
+ ("pointer moved to: (%lu, 0x%lx)",
+ LSN_IN_PARTS(log_descriptor.horizon)));
+ translog_check_cursor(&log_descriptor.bc);
+ log_descriptor.bc.protected= 0;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Get page rest
+
+ SYNOPSIS
+ translog_get_current_page_rest()
+
+ NOTE loghandler should be locked
+
+ RETURN
+ number of bytes left on the current page
+*/
+
+static uint translog_get_current_page_rest()
+{
+ return (TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill);
+}
+
+
+/*
+ Get buffer rest in full pages
+
+ SYNOPSIS
+ translog_get_current_buffer_rest()
+
+ NOTE loghandler should be locked
+
+ RETURN
+ number of full pages left on the current buffer
+*/
+
+static uint translog_get_current_buffer_rest()
+{
+ return ((log_descriptor.bc.buffer->buffer + TRANSLOG_WRITE_BUFFER -
+ log_descriptor.bc.ptr) /
+ TRANSLOG_PAGE_SIZE);
+}
+
+/*
+ Calculate possible group size without first (current) page
+
+ SYNOPSIS
+ translog_get_current_group_size()
+
+ NOTE loghandler should be locked
+
+ RETURN
+ group size without first (current) page
+*/
+
+static translog_size_t translog_get_current_group_size()
+{
+ /* buffer rest in full pages */
+ translog_size_t buffer_rest= translog_get_current_buffer_rest();
+ DBUG_ENTER("translog_get_current_group_size");
+ DBUG_PRINT("info", ("buffer_rest in pages: %u", buffer_rest));
+
+ buffer_rest*= log_descriptor.page_capacity_chunk_2;
+ /* in case of only half of buffer free we can write this and next buffer */
+ if (buffer_rest < log_descriptor.half_buffer_capacity_chunk_2)
+ {
+ DBUG_PRINT("info", ("buffer_rest: %lu -> add %lu",
+ (ulong) buffer_rest,
+ (ulong) log_descriptor.buffer_capacity_chunk_2));
+ buffer_rest+= log_descriptor.buffer_capacity_chunk_2;
+ }
+
+ DBUG_PRINT("info", ("buffer_rest: %lu", (ulong) buffer_rest));
+
+ DBUG_RETURN(buffer_rest);
+}
+
+
/**
  @brief Store a freshly generated LSN and mark the log as having
         unflushed content.

  @param lsn    destination where the new LSN is stored
  @param value  the LSN value (the caller's current horizon)

  @note Must be called under translog_lock() (asserted below), so that
        storing the LSN and clearing is_everything_flushed happen as one
        atomic step with respect to flushers.
*/

static inline void set_lsn(LSN *lsn, LSN value)
{
  DBUG_ENTER("set_lsn");
  translog_lock_assert_owner();
  *lsn= value;
  /* we generate LSN so something is not flushed in log */
  log_descriptor.is_everything_flushed= 0;
  DBUG_PRINT("info", ("new LSN appeared: (%lu,0x%lx)", LSN_IN_PARTS(value)));
  DBUG_VOID_RETURN;
}
+
+
/**
  @brief Write variable record in 1 group.

  @param lsn              LSN of the record will be written here
  @param type             the log record type
  @param tbl_info         MARIA_HA of the table (passed to the in-write hook)
  @param short_trid       Short transaction ID or 0 if it has no sense
  @param parts            Descriptor of record source parts
  @param buffer_to_flush  Buffer which have to be flushed if it is not 0
  @param header_length    Calculated header length of chunk type 0
  @param trn              Transaction structure pointer for hooks by
                          record log type, for short_id
  @param hook_arg         Argument which will be passed to pre-write and
                          in-write hooks of this record.

  @note
    We must have a translog_lock() when entering this function
    We must have buffer_to_flush locked (if not null)

  @return Operation status
    @retval 0      OK
    @retval 1      Error
*/

static my_bool
translog_write_variable_record_1group(LSN *lsn,
                                      enum translog_record_type type,
                                      MARIA_HA *tbl_info,
                                      SHORT_TRANSACTION_ID short_trid,
                                      struct st_translog_parts *parts,
                                      struct st_translog_buffer
                                      *buffer_to_flush, uint16 header_length,
                                      TRN *trn, void *hook_arg)
{
  TRANSLOG_ADDRESS horizon;
  struct st_buffer_cursor cursor;
  int rc= 0;
  uint i;
  /* first_page: bytes we can put on the current page (incl. header) */
  translog_size_t record_rest, full_pages, first_page;
  uint additional_chunk3_page= 0;
  uchar chunk0_header[1 + 2 + 5 + 2];
  DBUG_ENTER("translog_write_variable_record_1group");
  translog_lock_assert_owner();
  if (buffer_to_flush)
    translog_buffer_lock_assert_owner(buffer_to_flush);

  /* Generate the record's LSN at the current horizon (under the lock) */
  set_lsn(lsn, horizon= log_descriptor.horizon);
  if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn),
                                 *lsn, TRUE) ||
      (log_record_type_descriptor[type].inwrite_hook &&
       (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info,
                                                        lsn, hook_arg)))
  {
    translog_unlock();
    DBUG_RETURN(1);
  }
  /* Private "chaser" cursor: lets us write after releasing the lock */
  cursor= log_descriptor.bc;
  cursor.chaser= 1;

  /* Advance pointer to be able unlock the loghandler */
  first_page= translog_get_current_page_rest();
  record_rest= parts->record_length - (first_page - header_length);
  full_pages= record_rest / log_descriptor.page_capacity_chunk_2;
  record_rest= (record_rest % log_descriptor.page_capacity_chunk_2);

  if (record_rest + 1 == log_descriptor.page_capacity_chunk_2)
  {
    DBUG_PRINT("info", ("2 chunks type 3 is needed"));
    /* We will write 2 chunks type 3 at the end of this group */
    additional_chunk3_page= 1;
    record_rest= 1;
  }

  DBUG_PRINT("info", ("first_page: %u (%u) full_pages: %u (%lu) "
                      "additional: %u (%u) rest %u = %u",
                      first_page, first_page - header_length,
                      full_pages,
                      (ulong) full_pages *
                      log_descriptor.page_capacity_chunk_2,
                      additional_chunk3_page,
                      additional_chunk3_page *
                      (log_descriptor.page_capacity_chunk_2 - 1),
                      record_rest, parts->record_length));
  /* record_rest + 3 is chunk type 3 overhead + record_rest */
  rc|= translog_advance_pointer((int)(full_pages + additional_chunk3_page),
                                (record_rest ? record_rest + 3 : 0));
  log_descriptor.bc.buffer->last_lsn= *lsn;
  DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx) buffer: 0x%lx",
                      LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn),
                      (ulong) log_descriptor.bc.buffer));

  translog_unlock();

  /*
    Check if we switched buffer and need process it (current buffer is
    unlocked already => we will not delay other threads
  */
  if (buffer_to_flush != NULL)
  {
    if (!rc)
      rc= translog_buffer_flush(buffer_to_flush);
    translog_buffer_unlock(buffer_to_flush);
  }
  if (rc)
    DBUG_RETURN(1);

  translog_write_variable_record_1group_header(parts, type, short_trid,
                                               header_length, chunk0_header);

  /* fill the pages */
  translog_write_parts_on_page(&horizon, &cursor, first_page, parts);

  DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)",
                      LSN_IN_PARTS(log_descriptor.horizon),
                      LSN_IN_PARTS(horizon)));

  /* middle of the group: full pages of chunk type 2 */
  for (i= 0; i < full_pages; i++)
  {
    if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor))
      DBUG_RETURN(1);

    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)",
                        LSN_IN_PARTS(log_descriptor.horizon),
                        LSN_IN_PARTS(horizon)));
  }

  if (additional_chunk3_page)
  {
    /* extra chunk type 3 page reserved above to avoid an exact-fit page */
    if (translog_write_variable_record_chunk3_page(parts,
                                                   log_descriptor.
                                                   page_capacity_chunk_2 - 2,
                                                   &horizon, &cursor))
      DBUG_RETURN(1);
    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)",
                        LSN_IN_PARTS(log_descriptor.horizon),
                        LSN_IN_PARTS(horizon)));
    DBUG_ASSERT(cursor.current_page_fill == TRANSLOG_PAGE_SIZE);
  }

  /* tail of the group: chunk type 3 with the remaining bytes */
  if (translog_write_variable_record_chunk3_page(parts,
                                                 record_rest,
                                                 &horizon, &cursor))
    DBUG_RETURN(1);
  DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)",
                      (ulong) LSN_FILE_NO(log_descriptor.horizon),
                      (ulong) LSN_OFFSET(log_descriptor.horizon),
                      (ulong) LSN_FILE_NO(horizon),
                      (ulong) LSN_OFFSET(horizon)));

  /* drop the writer count we took via translog_advance_pointer() */
  translog_buffer_lock(cursor.buffer);
  translog_buffer_decrease_writers(cursor.buffer);
  translog_buffer_unlock(cursor.buffer);
  DBUG_RETURN(rc);
}
+
+
/**
  @brief Write variable record in 1 chunk (record fits on the current page).

  @param lsn              LSN of the record will be written here
  @param type             the log record type
  @param tbl_info         MARIA_HA of the table (passed to the in-write hook)
  @param short_trid       Short transaction ID or 0 if it has no sense
  @param parts            Descriptor of record source parts
  @param buffer_to_flush  Buffer which have to be flushed if it is not 0
  @param header_length    Calculated header length of chunk type 0
  @param trn              Transaction structure pointer for hooks by
                          record log type, for short_id
  @param hook_arg         Argument which will be passed to pre-write and
                          in-write hooks of this record.

  @note
    We must have a translog_lock() when entering this function
    We must have buffer_to_flush locked (if not null)

  @return Operation status
    @retval 0      OK
    @retval 1      Error
*/

static my_bool
translog_write_variable_record_1chunk(LSN *lsn,
                                      enum translog_record_type type,
                                      MARIA_HA *tbl_info,
                                      SHORT_TRANSACTION_ID short_trid,
                                      struct st_translog_parts *parts,
                                      struct st_translog_buffer
                                      *buffer_to_flush, uint16 header_length,
                                      TRN *trn, void *hook_arg)
{
  int rc;
  uchar chunk0_header[1 + 2 + 5 + 2];
  DBUG_ENTER("translog_write_variable_record_1chunk");
  translog_lock_assert_owner();
  if (buffer_to_flush)
    translog_buffer_lock_assert_owner(buffer_to_flush);

  translog_write_variable_record_1group_header(parts, type, short_trid,
                                               header_length, chunk0_header);
  /* Generate the record's LSN at the current horizon (under the lock) */
  set_lsn(lsn, log_descriptor.horizon);
  if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn),
                                 *lsn, TRUE) ||
      (log_record_type_descriptor[type].inwrite_hook &&
       (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info,
                                                        lsn, hook_arg)))
  {
    translog_unlock();
    DBUG_RETURN(1);
  }

  /* whole record goes on the current page through the shared cursor */
  rc= translog_write_parts_on_page(&log_descriptor.horizon,
                                   &log_descriptor.bc,
                                   parts->total_record_length, parts);
  log_descriptor.bc.buffer->last_lsn= *lsn;
  DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx) buffer: 0x%lx",
                      LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn),
                      (ulong) log_descriptor.bc.buffer));
  translog_unlock();

  /*
    check if we switched buffer and need process it (current buffer is
    unlocked already => we will not delay other threads
  */
  if (buffer_to_flush != NULL)
  {
    if (!rc)
      rc= translog_buffer_flush(buffer_to_flush);
    translog_buffer_unlock(buffer_to_flush);
  }

  DBUG_RETURN(rc);
}
+
+
+/*
+ @brief Calculates and write LSN difference (compressed LSN).
+
+ @param base_lsn LSN from which we calculate difference
+ @param lsn LSN for codding
+ @param dst Result will be written to dst[-pack_length] .. dst[-1]
+
+ @note To store an LSN in a compact way we will use the following compression:
+ If a log record has LSN1, and it contains the LSN2 as a back reference,
+ Instead of LSN2 we write LSN1-LSN2, encoded as:
+ two bits the number N (see below)
+ 14 bits
+ N bytes
+ That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2
+ is stored in the first two bits.
+
+ @note function made to write the result in backward direction with no
+ special sense or tricks both directions are equal in complicity
+
+ @retval # pointer on coded LSN
+*/
+
+static uchar *translog_put_LSN_diff(LSN base_lsn, LSN lsn, uchar *dst)
+{
+ uint64 diff;
+ DBUG_ENTER("translog_put_LSN_diff");
+ DBUG_PRINT("enter", ("Base: (%lu,0x%lx) val: (%lu,0x%lx) dst: 0x%lx",
+ LSN_IN_PARTS(base_lsn), LSN_IN_PARTS(lsn),
+ (ulong) dst));
+ DBUG_ASSERT(base_lsn > lsn);
+ diff= base_lsn - lsn;
+ DBUG_PRINT("info", ("Diff: 0x%llx", (ulonglong) diff));
+ if (diff <= 0x3FFF)
+ {
+ dst-= 2;
+ /*
+ Note we store this high uchar first to ensure that first uchar has
+ 0 in the 3 upper bits.
+ */
+ dst[0]= (uchar)(diff >> 8);
+ dst[1]= (uchar)(diff & 0xFF);
+ }
+ else if (diff <= 0x3FFFFFL)
+ {
+ dst-= 3;
+ dst[0]= (uchar)(0x40 | (diff >> 16));
+ int2store(dst + 1, diff & 0xFFFF);
+ }
+ else if (diff <= 0x3FFFFFFFL)
+ {
+ dst-= 4;
+ dst[0]= (uchar)(0x80 | (diff >> 24));
+ int3store(dst + 1, diff & 0xFFFFFFL);
+ }
+ else if (diff <= LL(0x3FFFFFFFFF))
+
+ {
+ dst-= 5;
+ dst[0]= (uchar)(0xC0 | (diff >> 32));
+ int4store(dst + 1, diff & 0xFFFFFFFFL);
+ }
+ else
+ {
+ /*
+ It is full LSN after special 1 diff (which is impossible
+ in real life)
+ */
+ dst-= 2 + LSN_STORE_SIZE;
+ dst[0]= 0;
+ dst[1]= 1;
+ lsn_store(dst + 2, lsn);
+ }
+ DBUG_PRINT("info", ("new dst: 0x%lx", (ulong) dst));
+ DBUG_RETURN(dst);
+}
+
+
+/*
+ Get LSN from LSN-difference (compressed LSN)
+
+ SYNOPSIS
+ translog_get_LSN_from_diff()
+ base_lsn LSN from which we calculate difference
+ src pointer to coded lsn
+ dst pointer to buffer where to write 7byte LSN
+
+ NOTE:
+ To store an LSN in a compact way we will use the following compression:
+
+ If a log record has LSN1, and it contains the lSN2 as a back reference,
+ Instead of LSN2 we write LSN1-LSN2, encoded as:
+
+ two bits the number N (see below)
+ 14 bits
+ N bytes
+
+ That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2
+ is stored in the first two bits.
+
+ RETURN
+ pointer to buffer after decoded LSN
+*/
+
+static uchar *translog_get_LSN_from_diff(LSN base_lsn, uchar *src, uchar *dst)
+{
+ LSN lsn;
+ uint32 diff;
+ uint32 first_byte;
+ uint32 file_no, rec_offset;
+ uint8 code;
+ DBUG_ENTER("translog_get_LSN_from_diff");
+ DBUG_PRINT("enter", ("Base: (%lu,0x%lx) src: 0x%lx dst 0x%lx",
+ LSN_IN_PARTS(base_lsn), (ulong) src, (ulong) dst));
+ first_byte= *((uint8*) src);
+ code= first_byte >> 6; /* Length is in 2 most significant bits */
+ first_byte&= 0x3F;
+ src++; /* Skip length + encode */
+ file_no= LSN_FILE_NO(base_lsn); /* Assume relative */
+ DBUG_PRINT("info", ("code: %u first byte: %lu",
+ (uint) code, (ulong) first_byte));
+ switch (code) {
+ case 0:
+ if (first_byte == 0 && *((uint8*)src) == 1)
+ {
+ /*
+ It is full LSN after special 1 diff (which is impossible
+ in real life)
+ */
+ memcpy(dst, src + 1, LSN_STORE_SIZE);
+ DBUG_PRINT("info", ("Special case of full LSN, new src: 0x%lx",
+ (ulong) (src + 1 + LSN_STORE_SIZE)));
+ DBUG_RETURN(src + 1 + LSN_STORE_SIZE);
+ }
+ rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 8) + *((uint8*)src));
+ break;
+ case 1:
+ diff= uint2korr(src);
+ rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 16) + diff);
+ break;
+ case 2:
+ diff= uint3korr(src);
+ rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 24) + diff);
+ break;
+ case 3:
+ {
+ ulonglong base_offset= LSN_OFFSET(base_lsn);
+ diff= uint4korr(src);
+ if (diff > LSN_OFFSET(base_lsn))
+ {
+ /* take 1 from file offset */
+ first_byte++;
+ base_offset+= LL(0x100000000);
+ }
+ file_no= LSN_FILE_NO(base_lsn) - first_byte;
+ DBUG_ASSERT(base_offset - diff <= UINT_MAX);
+ rec_offset= (uint32)(base_offset - diff);
+ break;
+ }
+ default:
+ DBUG_ASSERT(0);
+ DBUG_RETURN(NULL);
+ }
+ lsn= MAKE_LSN(file_no, rec_offset);
+ src+= code + 1;
+ lsn_store(dst, lsn);
+ DBUG_PRINT("info", ("new src: 0x%lx", (ulong) src));
+ DBUG_RETURN(src);
+}
+
+
/**
  @brief Encodes relative LSNs listed in the parameters.

  @param parts           Parts list with encoded LSN(s); the part at
                         parts->current (and possibly following parts) holds
                         the raw stored LSNs and is rewritten in place to
                         point at the compressed form
  @param base_lsn        LSN which is base for encoding
  @param lsns            number of LSN(s) to encode
  @param compressed_LSNs buffer which can be used for storing compressed LSN(s)

  @note Shrinks parts->record_length and parts->total_record_length by the
        bytes saved by compression.
*/

static void translog_relative_LSN_encode(struct st_translog_parts *parts,
                                         LSN base_lsn,
                                         uint lsns, uchar *compressed_LSNs)
{
  LEX_CUSTRING *part;
  uint lsns_len= lsns * LSN_STORE_SIZE;
  uchar buffer_src[MAX_NUMBER_OF_LSNS_PER_RECORD * LSN_STORE_SIZE];
  uchar *buffer= buffer_src;
  const uchar *cbuffer;

  DBUG_ENTER("translog_relative_LSN_encode");

  DBUG_ASSERT(parts->current != 0);
  part= parts->parts + parts->current;

  /* collect all LSN(s) in one chunk if it (they) is (are) divided */
  if (part->length < lsns_len)
  {
    uint copied= part->length;
    LEX_CUSTRING *next_part;
    DBUG_PRINT("info", ("Using buffer: 0x%lx", (ulong) compressed_LSNs));
    memcpy(buffer, part->str, part->length);
    next_part= parts->parts + parts->current + 1;
    do
    {
      DBUG_ASSERT(next_part < parts->parts + parts->elements);
      if ((next_part->length + copied) < lsns_len)
      {
        /* this part is consumed entirely; empty it and move on */
        memcpy(buffer + copied, next_part->str,
               next_part->length);
        copied+= next_part->length;
        next_part->length= 0; next_part->str= 0;
        /* delete_dynamic_element(&parts->parts, parts->current + 1); */
        next_part++;
        parts->current++;
        part= parts->parts + parts->current;
      }
      else
      {
        /* only a prefix of this part belongs to the LSN list */
        uint len= lsns_len - copied;
        memcpy(buffer + copied, next_part->str, len);
        copied= lsns_len;
        next_part->str+= len;
        next_part->length-= len;
      }
    } while (copied < lsns_len);
    cbuffer= buffer;
  }
  else
  {
    /*
      All LSNs are in one part; step parts->current back so the previous
      slot can be reused for the compressed result below
    */
    cbuffer= part->str;
    part->str+= lsns_len;
    part->length-= lsns_len;
    parts->current--;
    part= parts->parts + parts->current;
  }

  {
    /* Compress */
    LSN ref;
    int economy;
    const uchar *src_ptr;
    uchar *dst_ptr= compressed_LSNs + (MAX_NUMBER_OF_LSNS_PER_RECORD *
                                       COMPRESSED_LSN_MAX_STORE_SIZE);
    /*
      We write the result in backward direction with no special sense or
      tricks both directions are equal in complicity
    */
    for (src_ptr= cbuffer + lsns_len - LSN_STORE_SIZE;
         src_ptr >= (const uchar*)cbuffer;
         src_ptr-= LSN_STORE_SIZE)
    {
      ref= lsn_korr(src_ptr);
      dst_ptr= translog_put_LSN_diff(base_lsn, ref, dst_ptr);
    }
    /* dst_ptr now points at the start of the packed LSNs */
    part->length= (uint)((compressed_LSNs +
                          (MAX_NUMBER_OF_LSNS_PER_RECORD *
                           COMPRESSED_LSN_MAX_STORE_SIZE)) -
                         dst_ptr);
    parts->record_length-= (economy= lsns_len - part->length);
    DBUG_PRINT("info", ("new length of LSNs: %lu economy: %d",
                        (ulong)part->length, economy));
    parts->total_record_length-= economy;
    part->str= dst_ptr;
  }
  DBUG_VOID_RETURN;
}
+
+
/**
  @brief Write multi-group variable-size record.

  @param lsn              LSN of the record will be written here
  @param type             the log record type
  @param tbl_info         MARIA_HA of the table (passed to the in-write hook)
  @param short_trid       Short transaction ID or 0 if it has no sense
  @param parts            Descriptor of record source parts
  @param buffer_to_flush  Buffer which have to be flushed if it is not 0
  @param header_length    Header length calculated for 1 group
  @param buffer_rest      Beginning from which we plan to write in full pages
  @param trn              Transaction structure pointer for hooks by
                          record log type, for short_id
  @param hook_arg         Argument which will be passed to pre-write and
                          in-write hooks of this record.

  @note
    We must have a translog_lock() when entering this function

    We must have buffer_to_flush locked (if not null)
    buffer_to_flush should *NOT* be locked when calling this function.
    (This is note is here as this is different from most other
    translog_write...() functions which require the buffer to be locked)

  @return Operation status
    @retval 0      OK
    @retval 1      Error
*/

static my_bool
translog_write_variable_record_mgroup(LSN *lsn,
                                      enum translog_record_type type,
                                      MARIA_HA *tbl_info,
                                      SHORT_TRANSACTION_ID short_trid,
                                      struct st_translog_parts *parts,
                                      struct st_translog_buffer
                                      *buffer_to_flush,
                                      uint16 header_length,
                                      translog_size_t buffer_rest,
                                      TRN *trn, void *hook_arg)
{
  TRANSLOG_ADDRESS horizon;
  struct st_buffer_cursor cursor;
  int rc= 0;
  uint i, chunk2_page, full_pages;
  uint curr_group= 0;
  translog_size_t record_rest, first_page, chunk3_pages, chunk0_pages= 1;
  translog_size_t done= 0;              /* bytes of the record written so far */
  struct st_translog_group_descriptor group;
  DYNAMIC_ARRAY groups;                 /* one entry per written group */
  uint16 chunk3_size;
  uint16 page_capacity= log_descriptor.page_capacity_chunk_2 + 1;
  uint16 last_page_capacity;
  my_bool new_page_before_chunk0= 1, first_chunk0= 1;
  uchar chunk0_header[1 + 2 + 5 + 2 + 2], group_desc[7 + 1];
  uchar chunk2_header[1];
  uint header_fixed_part= header_length + 2;
  /* each group reference in a chunk 0 takes 7 (LSN) + 1 (num) bytes */
  uint groups_per_page= (page_capacity - header_fixed_part) / (7 + 1);
  uint file_of_the_first_group;
  int pages_to_skip;
  struct st_translog_buffer *buffer_of_last_lsn;
  DBUG_ENTER("translog_write_variable_record_mgroup");
  translog_lock_assert_owner();

  chunk2_header[0]= TRANSLOG_CHUNK_NOHDR;

  if (my_init_dynamic_array(&groups,
                            sizeof(struct st_translog_group_descriptor),
                            10, 10))
  {
    translog_unlock();
    DBUG_PRINT("error", ("init array failed"));
    DBUG_RETURN(1);
  }

  first_page= translog_get_current_page_rest();
  record_rest= parts->record_length - (first_page - 1);
  DBUG_PRINT("info", ("Record Rest: %lu", (ulong) record_rest));

  if (record_rest < buffer_rest)
  {
    /*
      The record (group 1 type) is larger than the free space on the page
      - we need to split it in two. But when we split it in two, the first
      part is big enough to hold all the data of the record (because the
      header of the first part of the split is smaller than the header of
      the record as a whole when it takes only one chunk)
    */
    DBUG_PRINT("info", ("too many free space because changing header"));
    buffer_rest-= log_descriptor.page_capacity_chunk_2;
    DBUG_ASSERT(record_rest >= buffer_rest);
  }

  /* keep the first group's file from being finished while we write */
  file_of_the_first_group= LSN_FILE_NO(log_descriptor.horizon);
  translog_mark_file_unfinished(file_of_the_first_group);
  do
  {
    /* start a new group at the current horizon with a chaser cursor */
    group.addr= horizon= log_descriptor.horizon;
    cursor= log_descriptor.bc;
    cursor.chaser= 1;
    if ((full_pages= buffer_rest / log_descriptor.page_capacity_chunk_2) > 255)
    {
      /* sizeof(uint8) == 256 is max number of chunk in multi-chunks group */
      full_pages= 255;
      buffer_rest= full_pages * log_descriptor.page_capacity_chunk_2;
    }
    /*
      group chunks =
      full pages + first page (which actually can be full, too).
      But here we assign number of chunks - 1
    */
    group.num= full_pages;
    if (insert_dynamic(&groups, (uchar*) &group))
    {
      DBUG_PRINT("error", ("insert into array failed"));
      goto err_unlock;
    }

    DBUG_PRINT("info", ("chunk: #%u first_page: %u (%u) "
                        "full_pages: %lu (%lu) "
                        "Left %lu",
                        groups.elements,
                        first_page, first_page - 1,
                        (ulong) full_pages,
                        (ulong) (full_pages *
                                 log_descriptor.page_capacity_chunk_2),
                        (ulong)(parts->record_length - (first_page - 1 +
                                                        buffer_rest) -
                                done)));
    /* reserve the group's pages so we can write after unlocking */
    rc|= translog_advance_pointer((int)full_pages, 0);

    translog_unlock();

    if (buffer_to_flush != NULL)
    {
      translog_buffer_decrease_writers(buffer_to_flush);
      if (!rc)
        rc= translog_buffer_flush(buffer_to_flush);
      translog_buffer_unlock(buffer_to_flush);
      buffer_to_flush= NULL;
    }
    if (rc)
    {
      DBUG_PRINT("error", ("flush of unlock buffer failed"));
      goto err;
    }

    /* first page of the group: chunk type 2 header + data */
    translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header);
    translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts);
    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) "
                        "Left %lu",
                        LSN_IN_PARTS(log_descriptor.horizon),
                        LSN_IN_PARTS(horizon),
                        (ulong) (parts->record_length - (first_page - 1) -
                                 done)));

    for (i= 0; i < full_pages; i++)
    {
      if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor))
        goto err;

      DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) "
                          "local: (%lu,0x%lx) "
                          "Left: %lu",
                          LSN_IN_PARTS(log_descriptor.horizon),
                          LSN_IN_PARTS(horizon),
                          (ulong) (parts->record_length - (first_page - 1) -
                                   i * log_descriptor.page_capacity_chunk_2 -
                                   done)));
    }

    done+= (first_page - 1 + buffer_rest);

    if (translog_chaser_page_next(&horizon, &cursor))
    {
      DBUG_PRINT("error", ("flush of unlock buffer failed"));
      goto err;
    }
    /* drop this group's writer reference on its last buffer */
    translog_buffer_lock(cursor.buffer);
    translog_buffer_decrease_writers(cursor.buffer);
    translog_buffer_unlock(cursor.buffer);

    translog_lock();

    /* Check that we have place for chunk type 2 */
    first_page= translog_get_current_page_rest();
    if (first_page <= 1)
    {
      if (translog_page_next(&log_descriptor.horizon, &log_descriptor.bc,
                             &buffer_to_flush))
        goto err_unlock;
      first_page= translog_get_current_page_rest();
    }
    buffer_rest= translog_get_current_group_size();
  } while ((translog_size_t)(first_page + buffer_rest) <
           (translog_size_t)(parts->record_length - done));

  /* last group: will carry the chunk 0 ("table of contents") pages */
  group.addr= horizon= log_descriptor.horizon;
  cursor= log_descriptor.bc;
  cursor.chaser= 1;
  group.num= 0;                       /* 0 because it does not matter */
  if (insert_dynamic(&groups, (uchar*) &group))
  {
    DBUG_PRINT("error", ("insert into array failed"));
    goto err_unlock;
  }
  record_rest= parts->record_length - done;
  DBUG_PRINT("info", ("Record rest: %lu", (ulong) record_rest));
  if (first_page > record_rest + 1)
  {
    /*
      We have not so much data to fill all first page
      (no speaking about full pages)
      so it will be:
         <chunk0 <data>>
      or
         <chunk0>...<chunk0><chunk0 <data>>
      or
         <chunk3 <data>><chunk0>...<chunk0><chunk0 <possible data of 1 byte>>
    */
    chunk2_page= full_pages= 0;
    last_page_capacity= first_page;
    pages_to_skip= -1;
  }
  else
  {
    /*
      We will have:
         <chunk2 <data>>...<chunk2 <data>><chunk0 <data>>
      or
         <chunk2 <data>>...<chunk2 <data>><chunk0>...<chunk0><chunk0 <data>>
      or
         <chunk3 <data>><chunk0>...<chunk0><chunk0 <possible data of 1 byte>>
    */
    chunk2_page= 1;
    record_rest-= (first_page - 1);
    pages_to_skip= full_pages=
      record_rest / log_descriptor.page_capacity_chunk_2;
    record_rest= (record_rest % log_descriptor.page_capacity_chunk_2);
    last_page_capacity= page_capacity;
  }
  chunk3_size= 0;
  chunk3_pages= 0;
  if (last_page_capacity > record_rest + 1 && record_rest != 0)
  {
    if (last_page_capacity >
        record_rest + header_fixed_part + groups.elements * (7 + 1))
    {
      /* 1 record of type 0 */
      chunk3_pages= 0;
    }
    else
    {
      pages_to_skip++;
      chunk3_pages= 1;
      if (record_rest + 2 == last_page_capacity)
      {
        /* avoid an exact-fit chunk 3: leave 1 byte for the chunk 0 page */
        chunk3_size= record_rest - 1;
        record_rest= 1;
      }
      else
      {
        chunk3_size= record_rest;
        record_rest= 0;
      }
    }
  }
  /*
    A first non-full page will hold type 0 chunk only if it fit in it with
    all its headers
  */
  while (page_capacity <
         record_rest + header_fixed_part +
         (groups.elements - groups_per_page * (chunk0_pages - 1)) * (7 + 1))
    chunk0_pages++;
  DBUG_PRINT("info", ("chunk0_pages: %u groups %u groups per full page: %u "
                      "Group on last page: %u",
                      chunk0_pages, groups.elements,
                      groups_per_page,
                      (groups.elements -
                       ((page_capacity - header_fixed_part) / (7 + 1)) *
                       (chunk0_pages - 1))));
  DBUG_PRINT("info", ("first_page: %u chunk2: %u full_pages: %u (%lu) "
                      "chunk3: %u (%u) rest: %u",
                      first_page,
                      chunk2_page, full_pages,
                      (ulong) full_pages *
                      log_descriptor.page_capacity_chunk_2,
                      chunk3_pages, (uint) chunk3_size, (uint) record_rest));
  rc= translog_advance_pointer(pages_to_skip + (int)(chunk0_pages - 1),
                               record_rest + header_fixed_part +
                               (groups.elements -
                                ((page_capacity -
                                  header_fixed_part) / (7 + 1)) *
                                (chunk0_pages - 1)) * (7 + 1));
  buffer_of_last_lsn= log_descriptor.bc.buffer;
  translog_unlock();

  if (buffer_to_flush != NULL)
  {
    translog_buffer_decrease_writers(buffer_to_flush);
    if (!rc)
      rc= translog_buffer_flush(buffer_to_flush);
    translog_buffer_unlock(buffer_to_flush);
    buffer_to_flush= NULL;
  }
  if (rc)
  {
    DBUG_PRINT("error", ("flush of unlock buffer failed"));
    goto err;
  }

  /* NOTE(review): redundant - rc was already checked just above */
  if (rc)
    goto err;

  if (chunk2_page)
  {
    DBUG_PRINT("info", ("chunk 2 to finish first page"));
    translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header);
    translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts);
    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) "
                        "Left: %lu",
                        LSN_IN_PARTS(log_descriptor.horizon),
                        LSN_IN_PARTS(horizon),
                        (ulong) (parts->record_length - (first_page - 1) -
                                 done)));
  }
  else if (chunk3_pages)
  {
    uchar chunk3_header[3];
    DBUG_PRINT("info", ("chunk 3"));
    DBUG_ASSERT(full_pages == 0);
    chunk3_pages= 0;
    chunk3_header[0]= TRANSLOG_CHUNK_LNGTH;
    int2store(chunk3_header + 1, chunk3_size);
    translog_write_data_on_page(&horizon, &cursor, 3, chunk3_header);
    translog_write_parts_on_page(&horizon, &cursor, chunk3_size, parts);
    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) "
                        "Left: %lu",
                        LSN_IN_PARTS(log_descriptor.horizon),
                        LSN_IN_PARTS(horizon),
                        (ulong) (parts->record_length - chunk3_size - done)));
  }
  else
  {
    DBUG_PRINT("info", ("no new_page_before_chunk0"));
    new_page_before_chunk0= 0;
  }

  for (i= 0; i < full_pages; i++)
  {
    DBUG_ASSERT(chunk2_page != 0);
    if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor))
      goto err;

    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) "
                        "Left: %lu",
                        LSN_IN_PARTS(log_descriptor.horizon),
                        LSN_IN_PARTS(horizon),
                        (ulong) (parts->record_length - (first_page - 1) -
                                 i * log_descriptor.page_capacity_chunk_2 -
                                 done)));
  }

  if (chunk3_pages &&
      translog_write_variable_record_chunk3_page(parts,
                                                 chunk3_size,
                                                 &horizon, &cursor))
    goto err;
  DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)",
                      LSN_IN_PARTS(log_descriptor.horizon),
                      LSN_IN_PARTS(horizon)));

  /* build the chunk 0 header: type, short trid, coded record length */
  *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN);
  int2store(chunk0_header + 1, short_trid);
  translog_write_variable_record_1group_code_len(chunk0_header + 3,
                                                 parts->record_length,
                                                 header_length);
  do
  {
    int limit;
    if (new_page_before_chunk0 &&
        translog_chaser_page_next(&horizon, &cursor))
    {
      DBUG_PRINT("error", ("flush of unlock buffer failed"));
      goto err;
    }
    new_page_before_chunk0= 1;

    if (first_chunk0)
    {
      first_chunk0= 0;

      /*
        We can drop "log_descriptor.is_everything_flushed" earlier when have
        lock on loghandler and assign initial value of "horizon" variable or
        before unlocking loghandler (because we will increase writers
        counter on the buffer and every thread which wanted flush the buffer
        will wait till we finish with it). But IMHO better here take short
        lock and do not bother other threads with waiting.
      */
      translog_lock();
      set_lsn(lsn, horizon);
      buffer_of_last_lsn->last_lsn= *lsn;
      DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx) buffer: 0x%lx",
                          LSN_IN_PARTS(buffer_of_last_lsn->last_lsn),
                          (ulong) buffer_of_last_lsn));
      if (log_record_type_descriptor[type].inwrite_hook &&
          (*log_record_type_descriptor[type].inwrite_hook) (type, trn,
                                                            tbl_info,
                                                            lsn, hook_arg))
        goto err_unlock;
      translog_unlock();
    }

    /*
      A first non-full page will hold type 0 chunk only if it fit in it with
      all its headers => the fist page is full or number of groups less then
      possible number of full page.
    */
    limit= (groups_per_page < groups.elements - curr_group ?
            groups_per_page : groups.elements - curr_group);
    DBUG_PRINT("info", ("Groups: %u curr: %u limit: %u",
                        (uint) groups.elements, (uint) curr_group,
                        (uint) limit));

    if (chunk0_pages == 1)
    {
      DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) + %u = %u",
                          (uint) limit, (uint) record_rest,
                          (uint) (2 + limit * (7 + 1) + record_rest)));
      int2store(chunk0_header + header_length - 2,
                2 + limit * (7 + 1) + record_rest);
    }
    else
    {
      DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) = %u",
                          (uint) limit, (uint) (2 + limit * (7 + 1))));
      int2store(chunk0_header + header_length - 2, 2 + limit * (7 + 1));
    }
    int2store(chunk0_header + header_length, groups.elements - curr_group);
    translog_write_data_on_page(&horizon, &cursor, header_fixed_part,
                                chunk0_header);
    /* write the group references that fit on this chunk 0 page */
    for (i= curr_group; i < limit + curr_group; i++)
    {
      struct st_translog_group_descriptor *grp_ptr;
      grp_ptr= dynamic_element(&groups, i,
                               struct st_translog_group_descriptor *);
      lsn_store(group_desc, grp_ptr->addr);
      group_desc[7]= grp_ptr->num;
      translog_write_data_on_page(&horizon, &cursor, (7 + 1), group_desc);
    }

    if (chunk0_pages == 1 && record_rest != 0)
      translog_write_parts_on_page(&horizon, &cursor, record_rest, parts);

    chunk0_pages--;
    curr_group+= limit;
    /* put special type to indicate that it is not LSN chunk */
    *chunk0_header= (uchar) (TRANSLOG_CHUNK_LSN | TRANSLOG_CHUNK_0_CONT);
  } while (chunk0_pages != 0);
  translog_buffer_lock(cursor.buffer);
  translog_buffer_decrease_writers(cursor.buffer);
  translog_buffer_unlock(cursor.buffer);
  rc= 0;

  if (translog_set_lsn_for_files(file_of_the_first_group, LSN_FILE_NO(*lsn),
                                 *lsn, FALSE))
    goto err;

  translog_mark_file_finished(file_of_the_first_group);

  delete_dynamic(&groups);
  DBUG_RETURN(rc);

err_unlock:

  translog_unlock();

err:
  if (buffer_to_flush != NULL)
  {
    /* This is to prevent locking buffer forever in case of error */
    translog_buffer_decrease_writers(buffer_to_flush);
    if (!rc)
      rc= translog_buffer_flush(buffer_to_flush);
    translog_buffer_unlock(buffer_to_flush);
    buffer_to_flush= NULL;
  }


  translog_mark_file_finished(file_of_the_first_group);

  delete_dynamic(&groups);
  DBUG_RETURN(1);
}
+
+
+/**
+ @brief Write the variable length log record.
+
+ @param lsn LSN of the record will be written here
+ @param type the log record type
+ @param short_trid Short transaction ID or 0 if it has no sense
+ @param parts Descriptor of record source parts
+ @param trn Transaction structure pointer for hooks by
+ record log type, for short_id
+ @param hook_arg Argument which will be passed to pre-write and
+ in-write hooks of this record.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool translog_write_variable_record(LSN *lsn,
+                                              enum translog_record_type type,
+                                              MARIA_HA *tbl_info,
+                                              SHORT_TRANSACTION_ID short_trid,
+                                              struct st_translog_parts *parts,
+                                              TRN *trn, void *hook_arg)
+{
+  struct st_translog_buffer *buffer_to_flush= NULL;
+  /*
+    Chunk 0 header: 1 byte type, 2 bytes short_trid, 2 bytes chunk length,
+    plus the variable-size encoding of the record length.
+  */
+  uint header_length1= 1 + 2 + 2 +
+    translog_variable_record_length_bytes(parts->record_length);
+  ulong buffer_rest;
+  uint page_rest;
+  /* Max number of such LSNs per record is 2 */
+  uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD *
+                        COMPRESSED_LSN_MAX_STORE_SIZE];
+  my_bool res;
+  DBUG_ENTER("translog_write_variable_record");
+
+  /* Lock taken here is released by the translog_write_variable_record_*
+     callee that is finally chosen below */
+  translog_lock();
+  DBUG_PRINT("info", ("horizon: (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.horizon)));
+  page_rest= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill;
+  DBUG_PRINT("info", ("header length: %u page_rest: %u",
+                      header_length1, page_rest));
+
+  /*
+    header and part which we should read have to fit in one chunk
+    TODO: allow to divide readable header
+  */
+  if (page_rest <
+      (header_length1 + log_record_type_descriptor[type].read_header_len))
+  {
+    DBUG_PRINT("info",
+               ("Next page, size: %u header: %u + %u",
+                log_descriptor.bc.current_page_fill,
+                header_length1,
+                log_record_type_descriptor[type].read_header_len));
+    translog_page_next(&log_descriptor.horizon, &log_descriptor.bc,
+                       &buffer_to_flush);
+    /* Chunk 2 header is 1 byte, so full page capacity will be one uchar more */
+    page_rest= log_descriptor.page_capacity_chunk_2 + 1;
+    DBUG_PRINT("info", ("page_rest: %u", page_rest));
+  }
+
+  /*
+    To minimize compressed size we will compress always relative to
+    very first chunk address (log_descriptor.horizon for now)
+  */
+  if (log_record_type_descriptor[type].compressed_LSN > 0)
+  {
+    translog_relative_LSN_encode(parts, log_descriptor.horizon,
+                                 log_record_type_descriptor[type].
+                                 compressed_LSN, compressed_LSNs);
+    /* recalculate header length after compression */
+    header_length1= 1 + 2 + 2 +
+      translog_variable_record_length_bytes(parts->record_length);
+    DBUG_PRINT("info", ("after compressing LSN(s) header length: %u "
+                        "record length: %lu",
+                        header_length1, (ulong)parts->record_length));
+  }
+
+  /* TODO: check space on current page for header + few bytes */
+  if (page_rest >= parts->record_length + header_length1)
+  {
+    /* Record plus header fit on the current page: single-chunk layout */
+    /* following function makes translog_unlock(); */
+    res= translog_write_variable_record_1chunk(lsn, type, tbl_info,
+                                               short_trid,
+                                               parts, buffer_to_flush,
+                                               header_length1, trn, hook_arg);
+    DBUG_RETURN(res);
+  }
+
+  buffer_rest= translog_get_current_group_size();
+
+  if (buffer_rest >= parts->record_length + header_length1 - page_rest)
+  {
+    /* Record fits in the current group of pages: one-group layout */
+    /* following function makes translog_unlock(); */
+    res= translog_write_variable_record_1group(lsn, type, tbl_info,
+                                               short_trid,
+                                               parts, buffer_to_flush,
+                                               header_length1, trn, hook_arg);
+    DBUG_RETURN(res);
+  }
+  /* Largest records: multi-group layout */
+  /* following function makes translog_unlock(); */
+  res= translog_write_variable_record_mgroup(lsn, type, tbl_info,
+                                             short_trid,
+                                             parts, buffer_to_flush,
+                                             header_length1,
+                                             buffer_rest, trn, hook_arg);
+  DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Write the fixed and pseudo-fixed log record.
+
+ @param lsn LSN of the record will be written here
+ @param type the log record type
+ @param short_trid Short transaction ID or 0 if it has no sense
+ @param parts Descriptor of record source parts
+ @param trn Transaction structure pointer for hooks by
+ record log type, for short_id
+ @param hook_arg Argument which will be passed to pre-write and
+ in-write hooks of this record.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool translog_write_fixed_record(LSN *lsn,
+                                           enum translog_record_type type,
+                                           MARIA_HA *tbl_info,
+                                           SHORT_TRANSACTION_ID short_trid,
+                                           struct st_translog_parts *parts,
+                                           TRN *trn, void *hook_arg)
+{
+  struct st_translog_buffer *buffer_to_flush= NULL;
+  /* chunk type 1 header: 1 byte type + 2 bytes short_trid */
+  uchar chunk1_header[1 + 2];
+  /* Max number of such LSNs per record is 2 */
+  uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD *
+                        COMPRESSED_LSN_MAX_STORE_SIZE];
+  LEX_CUSTRING *part;
+  int rc= 1;                                    /* pessimistic default */
+  DBUG_ENTER("translog_write_fixed_record");
+  DBUG_ASSERT((log_record_type_descriptor[type].rclass ==
+               LOGRECTYPE_FIXEDLENGTH &&
+               parts->record_length ==
+               log_record_type_descriptor[type].fixed_length) ||
+              (log_record_type_descriptor[type].rclass ==
+               LOGRECTYPE_PSEUDOFIXEDLENGTH &&
+               parts->record_length ==
+               log_record_type_descriptor[type].fixed_length));
+
+  translog_lock();
+  DBUG_PRINT("info", ("horizon: (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.horizon)));
+
+  DBUG_ASSERT(log_descriptor.bc.current_page_fill <= TRANSLOG_PAGE_SIZE);
+  DBUG_PRINT("info",
+             ("Page size: %u record: %u next cond: %d",
+              log_descriptor.bc.current_page_fill,
+              (parts->record_length +
+               log_record_type_descriptor[type].compressed_LSN * 2 + 3),
+              ((((uint) log_descriptor.bc.current_page_fill) +
+                (parts->record_length +
+                 log_record_type_descriptor[type].compressed_LSN * 2 + 3)) >
+               TRANSLOG_PAGE_SIZE)));
+  /*
+    check that there is enough place on current page.
+    NOTE: compressing may increase page LSN size on two bytes for every LSN
+  */
+  if ((((uint) log_descriptor.bc.current_page_fill) +
+       (parts->record_length +
+        log_record_type_descriptor[type].compressed_LSN * 2 + 3)) >
+      TRANSLOG_PAGE_SIZE)
+  {
+    DBUG_PRINT("info", ("Next page"));
+    if (translog_page_next(&log_descriptor.horizon, &log_descriptor.bc,
+                           &buffer_to_flush))
+      goto err;                                 /* rc == 1 */
+    if (buffer_to_flush)
+      translog_buffer_lock_assert_owner(buffer_to_flush);
+  }
+
+  /* Record's LSN is the current horizon (start of the chunk being written) */
+  set_lsn(lsn, log_descriptor.horizon);
+  if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn),
+                                 *lsn, TRUE) ||
+      (log_record_type_descriptor[type].inwrite_hook &&
+       (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info,
+                                                        lsn, hook_arg)))
+    goto err;
+
+  /* compress LSNs */
+  if (log_record_type_descriptor[type].rclass ==
+      LOGRECTYPE_PSEUDOFIXEDLENGTH)
+  {
+    DBUG_ASSERT(log_record_type_descriptor[type].compressed_LSN > 0);
+    translog_relative_LSN_encode(parts, *lsn,
+                                 log_record_type_descriptor[type].
+                                 compressed_LSN, compressed_LSNs);
+  }
+
+  /*
+    Write the whole record at once (we know that there is enough place on
+    the destination page)
+  */
+  DBUG_ASSERT(parts->current != 0);       /* first part is left for header */
+  part= parts->parts + (--parts->current);
+  parts->total_record_length+= (translog_size_t) (part->length= 1 + 2);
+  part->str= chunk1_header;
+  *chunk1_header= (uchar) (type | TRANSLOG_CHUNK_FIXED);
+  int2store(chunk1_header + 1, short_trid);
+
+  rc= translog_write_parts_on_page(&log_descriptor.horizon,
+                                   &log_descriptor.bc,
+                                   parts->total_record_length, parts);
+
+  log_descriptor.bc.buffer->last_lsn= *lsn;
+  DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx) buffer: 0x%lx",
+                      LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn),
+                      (ulong) log_descriptor.bc.buffer));
+
+err:
+  translog_unlock();
+
+  /*
+    check if we switched buffer and need process it (current buffer is
+    unlocked already => we will not delay other threads
+  */
+  if (buffer_to_flush != NULL)
+  {
+    if (!rc)
+      rc= translog_buffer_flush(buffer_to_flush);
+    translog_buffer_unlock(buffer_to_flush);
+  }
+
+  DBUG_RETURN(rc);
+}
+
+
+/**
+ @brief Writes the log record
+
+ If share has no 2-byte-id yet, gives an id to the share and logs
+ LOGREC_FILE_ID. If transaction has not logged LOGREC_LONG_TRANSACTION_ID
+ yet, logs it.
+
+ @param lsn LSN of the record will be written here
+ @param type the log record type
+ @param trn Transaction structure pointer for hooks by
+ record log type, for short_id
+ @param tbl_info MARIA_HA of table or NULL
+ @param rec_len record length or 0 (count it)
+ @param part_no number of parts or 0 (count it)
+ @param parts_data zero ended (in case of number of parts is 0)
+ array of LEX_STRINGs (parts), first
+ TRANSLOG_INTERNAL_PARTS positions in the log
+ should be unused (need for loghandler)
+ @param store_share_id if tbl_info!=NULL then share's id will
+ automatically be stored in the two first bytes
+ pointed (so pointer is assumed to be !=NULL)
+ @param hook_arg argument which will be passed to pre-write and
+ in-write hooks of this record.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool translog_write_record(LSN *lsn,
+                              enum translog_record_type type,
+                              TRN *trn, MARIA_HA *tbl_info,
+                              translog_size_t rec_len,
+                              uint part_no,
+                              LEX_CUSTRING *parts_data,
+                              uchar *store_share_id,
+                              void *hook_arg)
+{
+  struct st_translog_parts parts;
+  LEX_CUSTRING *part;
+  int rc;
+  uint short_trid= trn->short_id;
+  DBUG_ENTER("translog_write_record");
+  DBUG_PRINT("enter", ("type: %u (%s) ShortTrID: %u rec_len: %lu",
+                       (uint) type, log_record_type_descriptor[type].name,
+                       (uint) short_trid, (ulong) rec_len));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  if (unlikely(translog_status != TRANSLOG_OK))
+  {
+    DBUG_PRINT("error", ("Transaction log is write protected"));
+    DBUG_RETURN(1);
+  }
+
+  if (tbl_info)
+  {
+    MARIA_SHARE *share= tbl_info->s;
+    DBUG_ASSERT(share->now_transactional);
+    if (unlikely(share->id == 0))
+    {
+      /*
+        First log write for this MARIA_SHARE; give it a short id.
+        When the lock manager is enabled and needs a short id, it should be
+        assigned in the lock manager (because row locks will be taken before
+        log records are written; for example SELECT FOR UPDATE takes locks but
+        writes no log record.
+      */
+      if (unlikely(translog_assign_id_to_share(tbl_info, trn)))
+        DBUG_RETURN(1);
+    }
+    fileid_store(store_share_id, share->id);
+  }
+  if (unlikely(!(trn->first_undo_lsn & TRANSACTION_LOGGED_LONG_ID)))
+  {
+    /* First record of this transaction: log its long (6-byte) trid once */
+    LSN dummy_lsn;
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    uchar log_data[6];
+    DBUG_ASSERT(trn->undo_lsn == LSN_IMPOSSIBLE);
+    int6store(log_data, trn->trid);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; /* no recursion */
+    if (unlikely(translog_write_record(&dummy_lsn, LOGREC_LONG_TRANSACTION_ID,
+                                       trn, NULL, sizeof(log_data),
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, NULL, NULL)))
+      DBUG_RETURN(1);
+  }
+
+  parts.parts= parts_data;
+
+  /* count parts if they are not counted by upper level */
+  if (part_no == 0)
+  {
+    for (part_no= TRANSLOG_INTERNAL_PARTS;
+         parts_data[part_no].length != 0;
+         part_no++);
+  }
+  parts.elements= part_no;
+  parts.current= TRANSLOG_INTERNAL_PARTS;
+
+  /* clear TRANSLOG_INTERNAL_PARTS */
+  compile_time_assert(TRANSLOG_INTERNAL_PARTS != 0);
+  parts_data[0].str= 0;
+  parts_data[0].length= 0;
+
+  /* count length of the record */
+  if (rec_len == 0)
+  {
+    for (part= parts_data + TRANSLOG_INTERNAL_PARTS;
+         part < parts_data + part_no;
+         part++)
+    {
+      rec_len+= (translog_size_t) part->length;
+    }
+  }
+  parts.record_length= rec_len;
+
+#ifndef DBUG_OFF
+  {
+    uint i;
+    uint len= 0;
+#ifdef HAVE_valgrind
+    ha_checksum checksum= 0;
+#endif
+    for (i= TRANSLOG_INTERNAL_PARTS; i < part_no; i++)
+    {
+#ifdef HAVE_valgrind
+      /* Find unitialized bytes early */
+      checksum+= my_checksum(checksum, parts_data[i].str,
+                             parts_data[i].length);
+#endif
+      len+= parts_data[i].length;
+    }
+    DBUG_ASSERT(len == rec_len);
+  }
+#endif
+  /*
+    Start total_record_length from record_length; the chunk header
+    overhead will be added later by the writer functions
+  */
+  parts.total_record_length= parts.record_length;
+  DBUG_PRINT("info", ("record length: %lu", (ulong) parts.record_length));
+
+  /* process this parts */
+  if (!(rc= (log_record_type_descriptor[type].prewrite_hook &&
+             (*log_record_type_descriptor[type].prewrite_hook) (type, trn,
+                                                                tbl_info,
+                                                                hook_arg))))
+  {
+    switch (log_record_type_descriptor[type].rclass) {
+    case LOGRECTYPE_VARIABLE_LENGTH:
+      rc= translog_write_variable_record(lsn, type, tbl_info,
+                                         short_trid, &parts, trn, hook_arg);
+      break;
+    case LOGRECTYPE_PSEUDOFIXEDLENGTH:
+    case LOGRECTYPE_FIXEDLENGTH:
+      rc= translog_write_fixed_record(lsn, type, tbl_info,
+                                      short_trid, &parts, trn, hook_arg);
+      break;
+    case LOGRECTYPE_NOT_ALLOWED:
+    default:
+      DBUG_ASSERT(0);
+      rc= 1;
+    }
+  }
+
+  /* On failure *lsn may not have been set; avoid reading it in the trace */
+  if (!rc)
+    DBUG_PRINT("info", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(*lsn)));
+  DBUG_RETURN(rc);
+}
+
+
+/*
+ Decode compressed (relative) LSN(s)
+
+ SYNOPSIS
+ translog_relative_lsn_decode()
+ base_lsn LSN for encoding
+ src Decode LSN(s) from here
+ dst Put decoded LSNs here
+ lsns number of LSN(s)
+
+ RETURN
+ position in sources after decoded LSN(s)
+*/
+
+static uchar *translog_relative_LSN_decode(LSN base_lsn,
+                                           uchar *src, uchar *dst, uint lsns)
+{
+  /* Expand each compressed LSN (relative to base_lsn) into a full
+     LSN_STORE_SIZE-byte LSN; return the read position after the last one */
+  uchar *pos= src;
+  uchar *out= dst;
+  uchar *out_end= dst + lsns * LSN_STORE_SIZE;
+
+  while (out != out_end)
+  {
+    pos= translog_get_LSN_from_diff(base_lsn, pos, out);
+    out+= LSN_STORE_SIZE;
+  }
+  return pos;
+}
+
+/**
+ @brief Get header of fixed/pseudo length record and call hook for
+ it processing
+
+ @param page Pointer to the buffer with page where LSN chunk is
+ placed
+ @param page_offset Offset of the first chunk in the page
+ @param buff Buffer to be filled with header data
+
+ @return Length of header or operation status
+ @retval # number of bytes in TRANSLOG_HEADER_BUFFER::header where
+ stored decoded part of the header
+*/
+
+static int translog_fixed_length_header(uchar *page,
+                                        translog_size_t page_offset,
+                                        TRANSLOG_HEADER_BUFFER *buff)
+{
+  struct st_log_record_type_descriptor *desc=
+    log_record_type_descriptor + buff->type;
+  /* skip chunk header: 1 byte type + 2 bytes short_trid */
+  uchar *src= page + page_offset + 3;
+  uchar *dst= buff->header;
+  uchar *start= src;
+  int lsns= desc->compressed_LSN;
+  uint length= desc->fixed_length;
+  DBUG_ENTER("translog_fixed_length_header");
+
+  buff->record_length= length;
+
+  if (desc->rclass == LOGRECTYPE_PSEUDOFIXEDLENGTH)
+  {
+    DBUG_ASSERT(lsns > 0);
+    /* expand compressed LSNs relative to this record's own LSN */
+    src= translog_relative_LSN_decode(buff->lsn, src, dst, lsns);
+    lsns*= LSN_STORE_SIZE;
+    dst+= lsns;
+    length-= lsns;
+    /* bytes saved on disk by compression = decoded size - stored size */
+    buff->compressed_LSN_economy= (lsns - (int) (src - start));
+  }
+  else
+    buff->compressed_LSN_economy= 0;
+
+  memcpy(dst, src, length);
+  buff->non_header_data_start_offset= (uint16) (page_offset +
+                                                ((src + length) -
+                                                 (page + page_offset)));
+  buff->non_header_data_len= 0;
+  DBUG_RETURN(buff->record_length);
+}
+
+
+/*
+ Free resources used by TRANSLOG_HEADER_BUFFER
+
+ SYNOPSIS
+ translog_free_record_header();
+*/
+
+void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff)
+{
+  DBUG_ENTER("translog_free_record_header");
+  if (buff->groups_no == 0)
+    DBUG_VOID_RETURN;               /* nothing was allocated for this header */
+  my_free(buff->groups, MYF(0));
+  buff->groups_no= 0;         /* mark as freed so a second call is a no-op */
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Returns the current horizon at the end of the current log
+
+ @return Horizon
+ @retval LSN_ERROR error
+ @retval # Horizon
+*/
+
+TRANSLOG_ADDRESS translog_get_horizon()
+{
+  TRANSLOG_ADDRESS horizon;
+
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  /* Snapshot the shared horizon under the loghandler lock */
+  translog_lock();
+  horizon= log_descriptor.horizon;
+  translog_unlock();
+  return horizon;
+}
+
+
+/**
+ @brief Returns the current horizon at the end of the current log, caller is
+ assumed to already hold the lock
+
+ @return Horizon
+ @retval LSN_ERROR error
+ @retval # Horizon
+*/
+
+TRANSLOG_ADDRESS translog_get_horizon_no_lock()
+{
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  /* Caller must already hold the loghandler lock */
+  translog_lock_assert_owner();
+  return log_descriptor.horizon;
+}
+
+
+/*
+ Set last page in the scanner data structure
+
+ SYNOPSIS
+ translog_scanner_set_last_page()
+ scanner Information about current chunk during scanning
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_scanner_set_last_page(TRANSLOG_SCANNER_DATA *scanner)
+{
+  my_bool page_ok;
+
+  if (LSN_FILE_NO(scanner->page_addr) != LSN_FILE_NO(scanner->horizon))
+  {
+    /* Not the last file: ask the page layer for this file's last page */
+    scanner->last_file_page= scanner->page_addr;
+    return translog_get_last_page_addr(&scanner->last_file_page, &page_ok, 0);
+  }
+  /* Last file => last page address follows directly from the horizon */
+  {
+    uint tail= LSN_OFFSET(scanner->horizon) % TRANSLOG_PAGE_SIZE;
+    scanner->last_file_page= scanner->horizon -
+      (tail ? tail : TRANSLOG_PAGE_SIZE);
+  }
+  return 0;
+}
+
+
+/**
+ @brief Get page from page cache according to requested method
+
+ @param scanner The scanner data
+
+ @return operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool
+translog_scanner_get_page(TRANSLOG_SCANNER_DATA *scanner)
+{
+  TRANSLOG_VALIDATOR_DATA data;
+  PAGECACHE_BLOCK_LINK **link_ptr;
+  DBUG_ENTER("translog_scanner_get_page");
+  data.addr= &scanner->page_addr;
+  data.was_recovered= 0;
+  /* Pin the page via a direct link only when the scanner asked for it */
+  link_ptr= scanner->use_direct_link ? &scanner->direct_link : NULL;
+  scanner->page= translog_get_page(&data, scanner->buffer, link_ptr);
+  DBUG_RETURN(scanner->page == NULL);
+}
+
+
+/**
+ @brief Initialize reader scanner.
+
+  @param lsn            LSN from which scanning has to start
+  @param fixed_horizon  true if it is OK not to read records written
+                        after the start of the scan
+  @param scanner        scanner which has to be initialized
+  @param use_direct     prefer using direct links from the page handler
+                        where possible.
+
+  @note If a direct link was used, translog_destroy_scanner() should be
+  called after use
+
+ @return status of the operation
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool translog_scanner_init(LSN lsn,
+                              my_bool fixed_horizon,
+                              TRANSLOG_SCANNER_DATA *scanner,
+                              my_bool use_direct)
+{
+  TRANSLOG_VALIDATOR_DATA data;
+  DBUG_ENTER("translog_scanner_init");
+  DBUG_PRINT("enter", ("Scanner: 0x%lx LSN: (%lu,0x%lx)",
+                       (ulong) scanner, LSN_IN_PARTS(lsn)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  data.addr= &scanner->page_addr;
+  data.was_recovered= 0;
+
+  scanner->page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE;
+
+  scanner->fixed_horizon= fixed_horizon;
+  scanner->use_direct_link= use_direct;
+  scanner->direct_link= NULL;
+
+  scanner->horizon= translog_get_horizon();
+  DBUG_PRINT("info", ("horizon: (%lu,0x%lx)", LSN_IN_PARTS(scanner->horizon)));
+
+  /* lsn < horizon */
+  DBUG_ASSERT(lsn <= scanner->horizon);
+
+  /* round the start address down to the beginning of its page */
+  scanner->page_addr= lsn;
+  scanner->page_addr-= scanner->page_offset; /*decrease offset */
+
+  if (translog_scanner_set_last_page(scanner))
+    DBUG_RETURN(1);
+
+  if (translog_scanner_get_page(scanner))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Destroy scanner object;
+
+ @param scanner The scanner object to destroy
+*/
+
+void translog_destroy_scanner(TRANSLOG_SCANNER_DATA *scanner)
+{
+  DBUG_ENTER("translog_destroy_scanner");
+  DBUG_PRINT("enter", ("Scanner: 0x%lx", (ulong)scanner));
+  /* unpin the page held by the scanner (no-op when no direct link is set) */
+  translog_free_link(scanner->direct_link);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+ Checks End of the Log
+
+ SYNOPSIS
+ translog_scanner_eol()
+ scanner Information about current chunk during scanning
+
+ RETURN
+ 1 End of the Log
+ 0 OK
+*/
+
+static my_bool translog_scanner_eol(TRANSLOG_SCANNER_DATA *scanner)
+{
+  DBUG_ENTER("translog_scanner_eol");
+  DBUG_PRINT("enter",
+             ("Horizon: (%lu, 0x%lx) Current: (%lu, 0x%lx+0x%x=0x%lx)",
+              LSN_IN_PARTS(scanner->horizon),
+              LSN_IN_PARTS(scanner->page_addr),
+              (uint) scanner->page_offset,
+              (ulong) (LSN_OFFSET(scanner->page_addr) + scanner->page_offset)));
+  if (scanner->horizon > (scanner->page_addr +
+                          scanner->page_offset))
+  {
+    DBUG_PRINT("info", ("Horizon is not reached"));
+    DBUG_RETURN(0);
+  }
+  if (scanner->fixed_horizon)
+  {
+    DBUG_PRINT("info", ("Horizon is fixed and reached"));
+    DBUG_RETURN(1);
+  }
+  /*
+    Horizon is not fixed: re-read it, as other threads may have appended
+    records since scanning started
+  */
+  scanner->horizon= translog_get_horizon();
+  DBUG_PRINT("info",
+             ("Horizon is re-read, EOL: %d",
+              scanner->horizon <= (scanner->page_addr +
+                                   scanner->page_offset)));
+  DBUG_RETURN(scanner->horizon <= (scanner->page_addr +
+                                   scanner->page_offset));
+}
+
+
+/**
+ @brief Checks End of the Page
+
+ @param scanner Information about current chunk during scanning
+
+ @retval 1 End of the Page
+ @retval 0 OK
+*/
+
+static my_bool translog_scanner_eop(TRANSLOG_SCANNER_DATA *scanner)
+{
+  my_bool eop;
+  DBUG_ENTER("translog_scanner_eop");
+  /* Page ends at its physical boundary or at the first filler byte */
+  eop= (scanner->page_offset >= TRANSLOG_PAGE_SIZE ||
+        scanner->page[scanner->page_offset] == TRANSLOG_FILLER);
+  DBUG_RETURN(eop);
+}
+
+
+/**
+  @brief Checks End of the File (i.e. we are scanning the last page, which
+  does not mean the end of this page)
+
+ @param scanner Information about current chunk during scanning
+
+ @retval 1 End of the File
+ @retval 0 OK
+*/
+
+static my_bool translog_scanner_eof(TRANSLOG_SCANNER_DATA *scanner)
+{
+  DBUG_ENTER("translog_scanner_eof");
+  DBUG_ASSERT(LSN_FILE_NO(scanner->page_addr) ==
+              LSN_FILE_NO(scanner->last_file_page));
+  DBUG_PRINT("enter", ("curr Page: 0x%lx last page: 0x%lx "
+                       "normal EOF: %d",
+                       (ulong) LSN_OFFSET(scanner->page_addr),
+                       (ulong) LSN_OFFSET(scanner->last_file_page),
+                       LSN_OFFSET(scanner->page_addr) ==
+                       LSN_OFFSET(scanner->last_file_page)));
+  /*
+    TODO: detect damaged file EOF,
+    TODO: issue warning if damaged file EOF detected
+  */
+  /* end of file == the scanner stands on this file's last page */
+  DBUG_RETURN(scanner->page_addr ==
+              scanner->last_file_page);
+}
+
+/*
+ Move scanner to the next chunk
+
+ SYNOPSIS
+ translog_get_next_chunk()
+ scanner Information about current chunk during scanning
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool
+translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner)
+{
+  uint16 len;
+  DBUG_ENTER("translog_get_next_chunk");
+
+  if (translog_scanner_eop(scanner))
+    len= TRANSLOG_PAGE_SIZE - scanner->page_offset;  /* skip page filler */
+  else if ((len= translog_get_total_chunk_length(scanner->page,
+                                                 scanner->page_offset)) == 0)
+    DBUG_RETURN(1);
+  scanner->page_offset+= len;
+
+  if (translog_scanner_eol(scanner))
+  {
+    scanner->page= END_OF_LOG;
+    scanner->page_offset= 0;
+    DBUG_RETURN(0);
+  }
+  if (translog_scanner_eop(scanner))
+  {
+    /* before reading next page we should unpin current one if it was pinned */
+    translog_free_link(scanner->direct_link);
+    if (translog_scanner_eof(scanner))
+    {
+      DBUG_PRINT("info", ("horizon: (%lu,0x%lx) pageaddr: (%lu,0x%lx)",
+                          LSN_IN_PARTS(scanner->horizon),
+                          LSN_IN_PARTS(scanner->page_addr)));
+      /* if it is log end it have to be caught before */
+      DBUG_ASSERT(LSN_FILE_NO(scanner->horizon) >
+                  LSN_FILE_NO(scanner->page_addr));
+      /* move to the first data page of the next log file */
+      scanner->page_addr+= LSN_ONE_FILE;
+      scanner->page_addr= LSN_REPLACE_OFFSET(scanner->page_addr,
+                                             TRANSLOG_PAGE_SIZE);
+      if (translog_scanner_set_last_page(scanner))
+        DBUG_RETURN(1);
+    }
+    else
+    {
+      scanner->page_addr+= TRANSLOG_PAGE_SIZE; /* offset increased */
+    }
+
+    if (translog_scanner_get_page(scanner))
+      DBUG_RETURN(1);
+
+    scanner->page_offset= translog_get_first_chunk_offset(scanner->page);
+    if (translog_scanner_eol(scanner))
+    {
+      scanner->page= END_OF_LOG;
+      scanner->page_offset= 0;
+      DBUG_RETURN(0);
+    }
+    DBUG_ASSERT(scanner->page[scanner->page_offset] != TRANSLOG_FILLER);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Get header of variable length record and call hook for it processing
+
+ @param page Pointer to the buffer with page where LSN chunk is
+ placed
+ @param page_offset Offset of the first chunk in the page
+ @param buff Buffer to be filled with header data
+ @param scanner If present should be moved to the header page if
+ it differ from LSN page
+
+ @return Length of header or operation status
+ @retval RECHEADER_READ_ERROR error
+ @retval RECHEADER_READ_EOF End of the log reached during the read
+ @retval # number of bytes in
+ TRANSLOG_HEADER_BUFFER::header where
+ stored decoded part of the header
+*/
+
+static int
+translog_variable_length_header(uchar *page, translog_size_t page_offset,
+                                TRANSLOG_HEADER_BUFFER *buff,
+                                TRANSLOG_SCANNER_DATA *scanner)
+{
+  struct st_log_record_type_descriptor *desc= (log_record_type_descriptor +
+                                               buff->type);
+  /* skip chunk header: 1 byte type + 2 bytes short_trid */
+  uchar *src= page + page_offset + 1 + 2;
+  uchar *dst= buff->header;
+  LSN base_lsn;
+  uint lsns= desc->compressed_LSN;
+  uint16 chunk_len;
+  uint16 length= desc->read_header_len;
+  uint16 buffer_length= length;
+  uint16 body_len;
+  int rc;
+  TRANSLOG_SCANNER_DATA internal_scanner;
+  DBUG_ENTER("translog_variable_length_header");
+
+  buff->record_length= translog_variable_record_1group_decode_len(&src);
+  chunk_len= uint2korr(src);
+  DBUG_PRINT("info", ("rec len: %lu chunk len: %u length: %u bufflen: %u",
+                      (ulong) buff->record_length, (uint) chunk_len,
+                      (uint) length, (uint) buffer_length));
+  if (chunk_len == 0)
+  {
+    /* chunk_len == 0 marks a single-group record */
+    uint16 page_rest;
+    DBUG_PRINT("info", ("1 group"));
+    src+= 2;
+    page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page));
+
+    base_lsn= buff->lsn;
+    body_len= min(page_rest, buff->record_length);
+  }
+  else
+  {
+    uint grp_no, curr;
+    uint header_to_skip;
+    uint16 page_rest;
+
+    DBUG_PRINT("info", ("multi-group"));
+    grp_no= buff->groups_no= uint2korr(src + 2);
+    if (!(buff->groups=
+          (TRANSLOG_GROUP*) my_malloc(sizeof(TRANSLOG_GROUP) * grp_no,
+                                      MYF(0))))
+      DBUG_RETURN(RECHEADER_READ_ERROR);
+    DBUG_PRINT("info", ("Groups: %u", (uint) grp_no));
+    src+= (2 + 2);
+    page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page));
+    curr= 0;
+    header_to_skip= src - (page + page_offset);
+    buff->chunk0_pages= 0;
+
+    /*
+      Collect all group descriptors (7 bytes address + 1 byte chunk count
+      each); they may be spread over several chunk0 pages
+    */
+    for (;;)
+    {
+      uint i, read_length= grp_no;
+
+      buff->chunk0_pages++;
+      if (page_rest < grp_no * (7 + 1))
+        read_length= page_rest / (7 + 1);
+      DBUG_PRINT("info", ("Read chunk0 page#%u read: %u left: %u "
+                          "start from: %u",
+                          buff->chunk0_pages, read_length, grp_no, curr));
+      for (i= 0; i < read_length; i++, curr++)
+      {
+        DBUG_ASSERT(curr < buff->groups_no);
+        buff->groups[curr].addr= lsn_korr(src + i * (7 + 1));
+        buff->groups[curr].num= src[i * (7 + 1) + 7];
+        DBUG_PRINT("info", ("group #%u (%lu,0x%lx) chunks: %u",
+                            curr,
+                            LSN_IN_PARTS(buff->groups[curr].addr),
+                            (uint) buff->groups[curr].num));
+      }
+      grp_no-= read_length;
+      if (grp_no == 0)
+      {
+        if (scanner)
+        {
+          buff->chunk0_data_addr= scanner->page_addr;
+          /* offset increased */
+          buff->chunk0_data_addr+= (page_offset + header_to_skip +
+                                    read_length * (7 + 1));
+        }
+        else
+        {
+          buff->chunk0_data_addr= buff->lsn;
+          /* offset increased */
+          buff->chunk0_data_addr+= (header_to_skip + read_length * (7 + 1));
+        }
+        buff->chunk0_data_len= chunk_len - 2 - read_length * (7 + 1);
+        DBUG_PRINT("info", ("Data address: (%lu,0x%lx) len: %u",
+                            LSN_IN_PARTS(buff->chunk0_data_addr),
+                            buff->chunk0_data_len));
+        break;
+      }
+      /* more groups on following pages: need a scanner to walk them */
+      if (scanner == NULL)
+      {
+        DBUG_PRINT("info", ("use internal scanner for header reading"));
+        scanner= &internal_scanner;
+        if (translog_scanner_init(buff->lsn, 1, scanner, 0))
+        {
+          rc= RECHEADER_READ_ERROR;
+          goto exit_and_free;
+        }
+      }
+      if (translog_get_next_chunk(scanner))
+      {
+        if (scanner == &internal_scanner)
+          translog_destroy_scanner(scanner);
+        rc= RECHEADER_READ_ERROR;
+        goto exit_and_free;
+      }
+      if (scanner->page == END_OF_LOG)
+      {
+        if (scanner == &internal_scanner)
+          translog_destroy_scanner(scanner);
+        rc= RECHEADER_READ_EOF;
+        goto exit_and_free;
+      }
+      page= scanner->page;
+      page_offset= scanner->page_offset;
+      src= page + page_offset + header_to_skip;
+      chunk_len= uint2korr(src - 2 - 2);
+      DBUG_PRINT("info", ("Chunk len: %u", (uint) chunk_len));
+      page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page));
+    }
+
+    if (scanner == NULL)
+    {
+      DBUG_PRINT("info", ("use internal scanner"));
+      scanner= &internal_scanner;
+    }
+    else
+    {
+      translog_destroy_scanner(scanner);
+    }
+    /* re-position on the first group's chunk to read the record header */
+    base_lsn= buff->groups[0].addr;
+    translog_scanner_init(base_lsn, 1, scanner, scanner == &internal_scanner);
+    /* first group chunk is always chunk type 2 */
+    page= scanner->page;
+    page_offset= scanner->page_offset;
+    src= page + page_offset + 1;
+    page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page));
+    body_len= page_rest;
+    if (scanner == &internal_scanner)
+      translog_destroy_scanner(scanner);
+  }
+  if (lsns)
+  {
+    /* expand compressed LSNs; account for the saved bytes in record_length */
+    uchar *start= src;
+    src= translog_relative_LSN_decode(base_lsn, src, dst, lsns);
+    lsns*= LSN_STORE_SIZE;
+    dst+= lsns;
+    length-= lsns;
+    buff->record_length+= (buff->compressed_LSN_economy=
+                           (int) (lsns - (src - start)));
+    DBUG_PRINT("info", ("lsns: %u length: %u economy: %d new length: %lu",
+                        lsns / LSN_STORE_SIZE, (uint) length,
+                        (int) buff->compressed_LSN_economy,
+                        (ulong) buff->record_length));
+    body_len-= (uint16) (src - start);
+  }
+  else
+    buff->compressed_LSN_economy= 0;
+
+  DBUG_ASSERT(body_len >= length);
+  body_len-= length;
+  memcpy(dst, src, length);
+  buff->non_header_data_start_offset= (uint16) (src + length - page);
+  buff->non_header_data_len= body_len;
+  DBUG_PRINT("info", ("non_header_data_start_offset: %u len: %u buffer: %u",
+                      buff->non_header_data_start_offset,
+                      buff->non_header_data_len, buffer_length));
+  DBUG_RETURN(buffer_length);
+
+exit_and_free:
+  my_free(buff->groups, MYF(0));
+  buff->groups_no= 0; /* prevent try to use of buff->groups */
+  DBUG_RETURN(rc);
+}
+
+
+/**
+ @brief Read record header from the given buffer
+
+ @param page page content buffer
+ @param page_offset offset of the chunk in the page
+ @param buff destination buffer
+ @param scanner If this is set the scanner will be moved to the
+ record header page (differ from LSN page in case of
+ multi-group records)
+
+ @return Length of header or operation status
+ @retval RECHEADER_READ_ERROR error
+ @retval # number of bytes in
+ TRANSLOG_HEADER_BUFFER::header where
+ stored decoded part of the header
+*/
+
+int translog_read_record_header_from_buffer(uchar *page,
+                                            uint16 page_offset,
+                                            TRANSLOG_HEADER_BUFFER *buff,
+                                            TRANSLOG_SCANNER_DATA *scanner)
+{
+  translog_size_t res;
+  DBUG_ENTER("translog_read_record_header_from_buffer");
+  DBUG_PRINT("info", ("page byte: 0x%x offset: %u",
+                      (uint) page[page_offset], (uint) page_offset));
+  DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  /* record type and short trid are encoded in the first 3 chunk bytes */
+  buff->type= (page[page_offset] & TRANSLOG_REC_TYPE);
+  buff->short_trid= uint2korr(page + page_offset + 1);
+  DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)",
+                      (uint) buff->type, (uint)buff->short_trid,
+                      LSN_IN_PARTS(buff->lsn)));
+  /* Read required bytes from the header and call hook */
+  switch (log_record_type_descriptor[buff->type].rclass) {
+  case LOGRECTYPE_VARIABLE_LENGTH:
+    res= translog_variable_length_header(page, page_offset, buff,
+                                         scanner);
+    break;
+  case LOGRECTYPE_PSEUDOFIXEDLENGTH:
+  case LOGRECTYPE_FIXEDLENGTH:
+    res= translog_fixed_length_header(page, page_offset, buff);
+    break;
+  default:
+    DBUG_ASSERT(0); /* we read some junk (got no LSN) */
+    res= RECHEADER_READ_ERROR;
+  }
+  DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Read record header and some fixed part of a record (the part depend
+ on record type).
+
+ @param lsn log record serial number (address of the record)
+ @param buff log record header buffer
+
+ @note Some type of record can be read completely by this call
+ @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative
+ LSN can be translated to absolute one), some fields can be added (like
+ actual header length in the record if the header has variable length)
+
+ @return Length of header or operation status
+ @retval RECHEADER_READ_ERROR error
+ @retval # number of bytes in
+ TRANSLOG_HEADER_BUFFER::header where
+ stored decoded part of the header
+*/
+
+int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff)
+{
+ TRANSLOG_PAGE_SIZE_BUFF psize_buff;
+ uchar *page;
+ translog_size_t res, page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE;
+ PAGECACHE_BLOCK_LINK *direct_link;
+ TRANSLOG_ADDRESS addr;
+ TRANSLOG_VALIDATOR_DATA data;
+ DBUG_ENTER("translog_read_record_header");
+ DBUG_PRINT("enter", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ /* A valid record LSN never points at the very start of a page */
+ DBUG_ASSERT(LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE != 0);
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+
+ buff->lsn= lsn;
+ /* Reset so that callers never free a stale groups array on error */
+ buff->groups_no= 0;
+ data.addr= &addr;
+ data.was_recovered= 0;
+ addr= lsn;
+ addr-= page_offset; /* offset decreasing */
+ /* Fetch the page that holds the record start, then decode in place */
+ res= (!(page= translog_get_page(&data, psize_buff.buffer, &direct_link))) ?
+ RECHEADER_READ_ERROR :
+ translog_read_record_header_from_buffer(page, page_offset, buff, 0);
+ translog_free_link(direct_link);
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Read record header and some fixed part of a record (the part depend
+ on record type).
+
+ @param scanner scanner positioned on the record to read
+ @param buff log record header buffer
+ @param move_scanner request to move scanner to the header position
+
+ @note Some type of record can be read completely by this call
+ @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative
+ LSN can be translated to absolute one), some fields can be added (like
+ actual header length in the record if the header has variable length)
+
+ @return Length of header or operation status
+ @retval RECHEADER_READ_ERROR error
+ @retval # number of bytes in
+ TRANSLOG_HEADER_BUFFER::header where stored
+ decoded part of the header
+*/
+
+int translog_read_record_header_scan(TRANSLOG_SCANNER_DATA *scanner,
+ TRANSLOG_HEADER_BUFFER *buff,
+ my_bool move_scanner)
+{
+ translog_size_t res;
+ DBUG_ENTER("translog_read_record_header_scan");
+ DBUG_PRINT("enter", ("Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) "
+ "Lst: (%lu,0x%lx) Offset: %u(%x) fixed %d",
+ LSN_IN_PARTS(scanner->page_addr),
+ LSN_IN_PARTS(scanner->horizon),
+ LSN_IN_PARTS(scanner->last_file_page),
+ (uint) scanner->page_offset,
+ (uint) scanner->page_offset, scanner->fixed_horizon));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+ /* Reset so an error path never frees a stale groups array */
+ buff->groups_no= 0;
+ /* The record's LSN is the scanner's page address plus in-page offset */
+ buff->lsn= scanner->page_addr;
+ buff->lsn+= scanner->page_offset; /* offset increasing */
+ res= translog_read_record_header_from_buffer(scanner->page,
+ scanner->page_offset,
+ buff,
+ (move_scanner ?
+ scanner : 0));
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Read record header and some fixed part of the next record (the part
+ depend on record type).
+
+ @param scanner scanner state to continue reading from; it is
+ dereferenced unconditionally here, so it must not
+ be NULL and must have been initialized/positioned
+ by a previous scanner call
+
+ @param buff log record header buffer
+
+ @return Length of header or operation status
+ @retval RECHEADER_READ_ERROR error
+ @retval RECHEADER_READ_EOF EOF
+ @retval # number of bytes in
+ TRANSLOG_HEADER_BUFFER::header where
+ stored decoded part of the header
+*/
+
+int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner,
+ TRANSLOG_HEADER_BUFFER *buff)
+{
+ translog_size_t res;
+
+ DBUG_ENTER("translog_read_next_record_header");
+ buff->groups_no= 0; /* to be sure that we will free it right */
+ DBUG_PRINT("enter", ("scanner: 0x%lx", (ulong) scanner));
+ DBUG_PRINT("info", ("Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) "
+ "Lst: (%lu,0x%lx) Offset: %u(%x) fixed: %d",
+ LSN_IN_PARTS(scanner->page_addr),
+ LSN_IN_PARTS(scanner->horizon),
+ LSN_IN_PARTS(scanner->last_file_page),
+ (uint) scanner->page_offset,
+ (uint) scanner->page_offset, scanner->fixed_horizon));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+
+ /*
+ Skip chunks which do not start a record (carry no LSN) until we hit
+ an LSN chunk, the filler byte, or the end of the log.
+ */
+ do
+ {
+ if (translog_get_next_chunk(scanner))
+ DBUG_RETURN(RECHEADER_READ_ERROR);
+ if (scanner->page == END_OF_LOG)
+ {
+ DBUG_PRINT("info", ("End of file from the scanner"));
+ /* Last record was read */
+ buff->lsn= LSN_IMPOSSIBLE;
+ DBUG_RETURN(RECHEADER_READ_EOF);
+ }
+ DBUG_PRINT("info", ("Page: (%lu,0x%lx) offset: %lu byte: %x",
+ LSN_IN_PARTS(scanner->page_addr),
+ (ulong) scanner->page_offset,
+ (uint) scanner->page[scanner->page_offset]));
+ } while (!translog_is_LSN_chunk(scanner->page[scanner->page_offset]) &&
+ scanner->page[scanner->page_offset] != TRANSLOG_FILLER);
+
+ if (scanner->page[scanner->page_offset] == TRANSLOG_FILLER)
+ {
+ DBUG_PRINT("info", ("End of file"));
+ /* Last record was read */
+ buff->lsn= LSN_IMPOSSIBLE;
+ /* Return 'end of log' marker */
+ res= RECHEADER_READ_EOF;
+ }
+ else
+ res= translog_read_record_header_scan(scanner, buff, 0);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Moves record data reader to the next chunk and fill the data reader
+ information about that chunk.
+
+ SYNOPSIS
+ translog_record_read_next_chunk()
+ data data cursor
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_record_read_next_chunk(TRANSLOG_READER_DATA *data)
+{
+ translog_size_t new_current_offset= data->current_offset + data->chunk_size;
+ uint16 chunk_header_len, chunk_len;
+ uint8 type;
+ DBUG_ENTER("translog_record_read_next_chunk");
+
+ if (data->eor)
+ {
+ /* 'end of record' was already reached on a previous call */
+ DBUG_PRINT("info", ("end of the record flag set"));
+ DBUG_RETURN(1);
+ }
+
+ /*
+ Multi-group record: when all chunks of the current group (except the
+ last group) are consumed, jump to the first chunk of the next group.
+ */
+ if (data->header.groups_no &&
+ data->header.groups_no - 1 != data->current_group &&
+ data->header.groups[data->current_group].num == data->current_chunk)
+ {
+ /* Goto next group */
+ data->current_group++;
+ data->current_chunk= 0;
+ DBUG_PRINT("info", ("skip to group: #%u", data->current_group));
+ translog_destroy_scanner(&data->scanner);
+ translog_scanner_init(data->header.groups[data->current_group].addr,
+ 1, &data->scanner, 1);
+ }
+ else
+ {
+ data->current_chunk++;
+ if (translog_get_next_chunk(&data->scanner))
+ DBUG_RETURN(1);
+ if (data->scanner.page == END_OF_LOG)
+ {
+ /*
+ Actually it should not happened, but we want to quit nicely in case
+ of a truncated log
+ */
+ DBUG_RETURN(1);
+ }
+ }
+ type= data->scanner.page[data->scanner.page_offset] & TRANSLOG_CHUNK_TYPE;
+
+ /*
+ An LSN chunk reached while reading a multi-group record is the
+ record's own chunk 0: switch the scanner to chunk 0's tail data and
+ mark the record as finished (eor).
+ */
+ if (type == TRANSLOG_CHUNK_LSN && data->header.groups_no)
+ {
+ DBUG_PRINT("info",
+ ("Last chunk: data len: %u offset: %u group: %u of %u",
+ data->header.chunk0_data_len, data->scanner.page_offset,
+ data->current_group, data->header.groups_no - 1));
+ DBUG_ASSERT(data->header.groups_no - 1 == data->current_group);
+ DBUG_ASSERT(data->header.lsn ==
+ data->scanner.page_addr + data->scanner.page_offset);
+ translog_destroy_scanner(&data->scanner);
+ translog_scanner_init(data->header.chunk0_data_addr, 1, &data->scanner, 1);
+ data->chunk_size= data->header.chunk0_data_len;
+ data->body_offset= data->scanner.page_offset;
+ data->current_offset= new_current_offset;
+ data->eor= 1;
+ DBUG_RETURN(0);
+ }
+
+ /* An LSN/fixed chunk here belongs to the NEXT record: we are done */
+ if (type == TRANSLOG_CHUNK_LSN || type == TRANSLOG_CHUNK_FIXED)
+ {
+ data->eor= 1;
+ DBUG_RETURN(1); /* End of record */
+ }
+
+ /* Ordinary middle chunk: expose its payload with the header stripped */
+ chunk_header_len=
+ translog_get_chunk_header_length(data->scanner.page +
+ data->scanner.page_offset);
+ chunk_len= translog_get_total_chunk_length(data->scanner.page,
+ data->scanner.page_offset);
+ data->chunk_size= chunk_len - chunk_header_len;
+ data->body_offset= data->scanner.page_offset + chunk_header_len;
+ data->current_offset= new_current_offset;
+ DBUG_PRINT("info", ("grp: %u chunk: %u body_offset: %u chunk_size: %u "
+ "current_offset: %lu",
+ (uint) data->current_group,
+ (uint) data->current_chunk,
+ (uint) data->body_offset,
+ (uint) data->chunk_size, (ulong) data->current_offset));
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Initialize record reader data from LSN
+
+ SYNOPSIS
+ translog_init_reader_data()
+ lsn reference to LSN we should start from
+ data reader data to initialize
+
+ RETURN
+ 0 OK
+ 1 Error
+*/
+
+static my_bool translog_init_reader_data(LSN lsn,
+ TRANSLOG_READER_DATA *data)
+{
+ int read_header;
+ DBUG_ENTER("translog_init_reader_data");
+ /* Position a scanner on the record and decode its header */
+ if (translog_scanner_init(lsn, 1, &data->scanner, 1) ||
+ ((read_header=
+ translog_read_record_header_scan(&data->scanner, &data->header, 1))
+ == RECHEADER_READ_ERROR))
+ DBUG_RETURN(1);
+ /* read_header is the decoded header length; record body follows it */
+ data->read_header= read_header;
+ data->body_offset= data->header.non_header_data_start_offset;
+ data->chunk_size= data->header.non_header_data_len;
+ data->current_offset= data->read_header;
+ data->current_group= 0;
+ data->current_chunk= 0;
+ data->eor= 0;
+ DBUG_PRINT("info", ("read_header: %u "
+ "body_offset: %u chunk_size: %u current_offset: %lu",
+ (uint) data->read_header,
+ (uint) data->body_offset,
+ (uint) data->chunk_size, (ulong) data->current_offset));
+ DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Destroy reader data object
+
+ Releases both the embedded scanner and the decoded record header.
+*/
+
+static void translog_destroy_reader_data(TRANSLOG_READER_DATA *data)
+{
+ translog_destroy_scanner(&data->scanner);
+ translog_free_record_header(&data->header);
+}
+
+
+/*
+ Read a part of the record.
+
+ SYNOPSIS
+ translog_read_record()
+ lsn log record serial number (address of the record)
+ offset From the beginning of the record beginning (read
+ by translog_read_record_header).
+ length Length of record part which have to be read.
+ buffer Buffer where to read the record part (have to be at
+ least 'length' bytes length)
+ data reader state to (re)use across calls; if NULL an
+ internal one is used (lsn must then be given)
+
+ RETURN
+ length of data actually read
+*/
+
+translog_size_t translog_read_record(LSN lsn,
+ translog_size_t offset,
+ translog_size_t length,
+ uchar *buffer,
+ TRANSLOG_READER_DATA *data)
+{
+ translog_size_t requested_length= length;
+ translog_size_t end= offset + length;
+ TRANSLOG_READER_DATA internal_data;
+ DBUG_ENTER("translog_read_record");
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+
+ if (data == NULL)
+ {
+ DBUG_ASSERT(lsn != LSN_IMPOSSIBLE);
+ data= &internal_data;
+ }
+ /*
+ (Re)initialize the reader when a new record is addressed (lsn given)
+ or when we would have to seek backwards beyond the decoded header.
+ */
+ if (lsn ||
+ (offset < data->current_offset &&
+ !(offset < data->read_header && offset + length < data->read_header)))
+ {
+ if (translog_init_reader_data(lsn, data))
+ DBUG_RETURN(0);
+ }
+ DBUG_PRINT("info", ("Offset: %lu length: %lu "
+ "Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) "
+ "Lst: (%lu,0x%lx) Offset: %u(%x) fixed: %d",
+ (ulong) offset, (ulong) length,
+ LSN_IN_PARTS(data->scanner.page_addr),
+ LSN_IN_PARTS(data->scanner.horizon),
+ LSN_IN_PARTS(data->scanner.last_file_page),
+ (uint) data->scanner.page_offset,
+ (uint) data->scanner.page_offset,
+ data->scanner.fixed_horizon));
+ /* First serve bytes that lie inside the decoded record header */
+ if (offset < data->read_header)
+ {
+ uint16 len= min(data->read_header, end) - offset;
+ DBUG_PRINT("info",
+ ("enter header offset: %lu length: %lu",
+ (ulong) offset, (ulong) length));
+ memcpy(buffer, data->header.header + offset, len);
+ length-= len;
+ if (length == 0)
+ {
+ translog_destroy_reader_data(data);
+ DBUG_RETURN(requested_length);
+ }
+ offset+= len;
+ buffer+= len;
+ DBUG_PRINT("info",
+ ("len: %u offset: %lu curr: %lu length: %lu",
+ len, (ulong) offset, (ulong) data->current_offset,
+ (ulong) length));
+ }
+ /* TODO: find first page which we should read by offset */
+
+ /* read the record chunk by chunk */
+ for(;;)
+ {
+ uint page_end= data->current_offset + data->chunk_size;
+ DBUG_PRINT("info",
+ ("enter body offset: %lu curr: %lu "
+ "length: %lu page_end: %lu",
+ (ulong) offset, (ulong) data->current_offset, (ulong) length,
+ (ulong) page_end));
+ if (offset < page_end)
+ {
+ /* Requested range overlaps the current chunk: copy what we can */
+ uint len= page_end - offset;
+ set_if_smaller(len, length); /* in case we read beyond record's end */
+ DBUG_ASSERT(offset >= data->current_offset);
+ memcpy(buffer,
+ data->scanner.page + data->body_offset +
+ (offset - data->current_offset), len);
+ length-= len;
+ if (length == 0)
+ {
+ translog_destroy_reader_data(data);
+ DBUG_RETURN(requested_length);
+ }
+ offset+= len;
+ buffer+= len;
+ DBUG_PRINT("info",
+ ("len: %u offset: %lu curr: %lu length: %lu",
+ len, (ulong) offset, (ulong) data->current_offset,
+ (ulong) length));
+ }
+ if (translog_record_read_next_chunk(data))
+ {
+ /* Record (or log) ended: return how much was actually copied */
+ translog_destroy_reader_data(data);
+ DBUG_RETURN(requested_length - length);
+ }
+ }
+}
+
+
+/**
+ @brief Force skipping to the next buffer
+
+ Closes the current log buffer (padding the unfinished page with filler
+ bytes if it is partially filled), switches the cursor to the next
+ buffer and, where enabled, finalizes sector protection and CRC of the
+ old page.
+
+ @todo Do not copy old page content if all page protections are switched off
+ (because we do not need calculate something or change old parts of the page)
+*/
+
+static void translog_force_current_buffer_to_finish()
+{
+ TRANSLOG_ADDRESS new_buff_beginning;
+ uint16 old_buffer_no= log_descriptor.bc.buffer_no;
+ uint16 new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+ struct st_translog_buffer *new_buffer= (log_descriptor.buffers +
+ new_buffer_no);
+ struct st_translog_buffer *old_buffer= log_descriptor.bc.buffer;
+ /* start of the current (partially filled) page inside the buffer */
+ uchar *data= log_descriptor.bc.ptr - log_descriptor.bc.current_page_fill;
+ /* bytes still unused on that page */
+ uint16 left= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill;
+ uint16 current_page_fill, write_counter, previous_offset;
+ DBUG_ENTER("translog_force_current_buffer_to_finish");
+ DBUG_PRINT("enter", ("Buffer #%u 0x%lx "
+ "Buffer addr: (%lu,0x%lx) "
+ "Page addr: (%lu,0x%lx) "
+ "size: %lu (%lu) Pg: %u left: %u in progress %u",
+ (uint) old_buffer_no,
+ (ulong) old_buffer,
+ LSN_IN_PARTS(old_buffer->offset),
+ (ulong) LSN_FILE_NO(log_descriptor.horizon),
+ (ulong) (LSN_OFFSET(log_descriptor.horizon) -
+ log_descriptor.bc.current_page_fill),
+ (ulong) old_buffer->size,
+ (ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
+ buffer->buffer),
+ (uint) log_descriptor.bc.current_page_fill,
+ (uint) left,
+ (uint) old_buffer->
+ copy_to_buffer_in_progress));
+ translog_lock_assert_owner();
+ LINT_INIT(current_page_fill);
+ new_buff_beginning= old_buffer->offset;
+ new_buff_beginning+= old_buffer->size; /* increase offset */
+
+ DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
+ DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
+ LSN_FILE_NO(old_buffer->offset));
+ translog_check_cursor(&log_descriptor.bc);
+ DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
+ if (left)
+ {
+ /*
+ TODO: if 'left' is so small that can't hold any other record
+ then do not move the page
+ */
+ DBUG_PRINT("info", ("left: %u", (uint) left));
+
+ old_buffer->pre_force_close_horizon=
+ old_buffer->offset + old_buffer->size;
+ /* decrease offset */
+ new_buff_beginning-= log_descriptor.bc.current_page_fill;
+ current_page_fill= log_descriptor.bc.current_page_fill;
+
+ /* Pad the rest of the page with filler so it can be written now */
+ memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
+ old_buffer->size+= left;
+ DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
+ "Size: %lu",
+ (uint) old_buffer->buffer_no,
+ (ulong) old_buffer,
+ (ulong) old_buffer->size));
+ DBUG_ASSERT(old_buffer->buffer_no ==
+ log_descriptor.bc.buffer_no);
+ }
+ else
+ {
+ log_descriptor.bc.current_page_fill= 0;
+ }
+
+ translog_buffer_lock(new_buffer);
+#ifndef DBUG_OFF
+ {
+ TRANSLOG_ADDRESS offset= new_buffer->offset;
+ TRANSLOG_FILE *file= new_buffer->file;
+ uint8 ver= new_buffer->ver;
+ translog_lock_assert_owner();
+#endif
+ translog_wait_for_buffer_free(new_buffer);
+#ifndef DBUG_OFF
+ /* We keep the handler locked so nobody can start this new buffer */
+ DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL &&
+ (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver);
+ }
+#endif
+
+ write_counter= log_descriptor.bc.write_counter;
+ previous_offset= log_descriptor.bc.previous_offset;
+ translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no);
+ /* Fix buffer offset (which was incorrectly set to horizon) */
+ log_descriptor.bc.buffer->offset= new_buff_beginning;
+ log_descriptor.bc.write_counter= write_counter;
+ log_descriptor.bc.previous_offset= previous_offset;
+ new_buffer->prev_last_lsn= BUFFER_MAX_LSN(old_buffer);
+ DBUG_PRINT("info", ("prev_last_lsn set to (%lu,0x%lx) buffer: 0x%lx",
+ LSN_IN_PARTS(new_buffer->prev_last_lsn),
+ (ulong) new_buffer));
+
+ /*
+ Advances this log pointer, increases writers and let other threads to
+ write to the log while we process old page content
+ */
+ if (left)
+ {
+ /* The new buffer begins with the unfinished page: skip over it */
+ log_descriptor.bc.ptr+= current_page_fill;
+ log_descriptor.bc.buffer->size= log_descriptor.bc.current_page_fill=
+ current_page_fill;
+ new_buffer->overlay= 1;
+ }
+ else
+ translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
+ translog_buffer_increase_writers(new_buffer);
+ translog_buffer_unlock(new_buffer);
+
+ /*
+ We have to wait until all writers finish before start changing the
+ pages by applying protection and copying the page content in the
+ new buffer.
+ */
+#ifndef DBUG_OFF
+ {
+ TRANSLOG_ADDRESS offset= old_buffer->offset;
+ TRANSLOG_FILE *file= old_buffer->file;
+ uint8 ver= old_buffer->ver;
+#endif
+ /*
+ Now only one thread can flush log (buffer can flush many threads but
+ log flush log flush where this function is used can do only one thread)
+ so no other thread can set is_closing_buffer.
+ */
+ DBUG_ASSERT(!old_buffer->is_closing_buffer);
+ old_buffer->is_closing_buffer= 1; /* Other flushes will wait */
+ DBUG_PRINT("enter", ("Buffer #%u 0x%lx is_closing_buffer set",
+ (uint) old_buffer->buffer_no, (ulong) old_buffer));
+ translog_wait_for_writers(old_buffer);
+#ifndef DBUG_OFF
+ /* We blocked flushing this buffer so the buffer should not changed */
+ DBUG_ASSERT(offset == old_buffer->offset && file == old_buffer->file &&
+ ver == old_buffer->ver);
+ }
+#endif
+
+ if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION)
+ {
+ translog_put_sector_protection(data, &log_descriptor.bc);
+ if (left)
+ {
+ log_descriptor.bc.write_counter++;
+ log_descriptor.bc.previous_offset= current_page_fill;
+ }
+ else
+ {
+ DBUG_PRINT("info", ("drop write_counter"));
+ log_descriptor.bc.write_counter= 0;
+ log_descriptor.bc.previous_offset= 0;
+ }
+ }
+
+ if (log_descriptor.flags & TRANSLOG_PAGE_CRC)
+ {
+ uint32 crc= translog_crc(data + log_descriptor.page_overhead,
+ TRANSLOG_PAGE_SIZE -
+ log_descriptor.page_overhead);
+ DBUG_PRINT("info", ("CRC: 0x%lx", (ulong) crc));
+ /* CRC slot sits at byte 7 (3 + 3 + 1) of the page header */
+ int4store(data + 3 + 3 + 1, crc);
+ }
+ old_buffer->is_closing_buffer= 0;
+ DBUG_PRINT("enter", ("Buffer #%u 0x%lx is_closing_buffer cleared",
+ (uint) old_buffer->buffer_no, (ulong) old_buffer));
+ pthread_cond_broadcast(&old_buffer->waiting_filling_buffer);
+
+ if (left)
+ {
+ if (log_descriptor.flags &
+ (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+ memcpy(new_buffer->buffer, data, current_page_fill);
+ else
+ {
+ /*
+ This page header does not change if we add more data to the page so
+ we can not copy it and will not overwrite later
+ */
+ new_buffer->skipped_data= current_page_fill;
+#ifndef DBUG_OFF
+ memset(new_buffer->buffer, 0xa5, current_page_fill);
+#endif
+ DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+ }
+ }
+ old_buffer->next_buffer_offset= new_buffer->offset;
+ translog_buffer_lock(new_buffer);
+ new_buffer->prev_buffer_offset= old_buffer->offset;
+ translog_buffer_decrease_writers(new_buffer);
+ translog_buffer_unlock(new_buffer);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Waits while given lsn will be flushed
+
+ Blocks on the log flush condition until everything up to and
+ including 'lsn' has been flushed.
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @note Caller must hold log_descriptor.log_flush_lock; the condition
+ wait releases and re-acquires it internally.
+*/
+
+void translog_flush_wait_for_end(LSN lsn)
+{
+ DBUG_ENTER("translog_flush_wait_for_end");
+ DBUG_PRINT("enter", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ safe_mutex_assert_owner(&log_descriptor.log_flush_lock);
+ for (;;)
+ {
+ /* Done as soon as the flushed watermark reaches our target */
+ if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+ break;
+ pthread_cond_wait(&log_descriptor.log_flush_cond,
+ &log_descriptor.log_flush_lock);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Sets goal for the next flush pass and waits for this pass end.
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @note Caller must hold log_descriptor.log_flush_lock.
+*/
+
+void translog_flush_set_new_goal_and_wait(TRANSLOG_ADDRESS lsn)
+{
+ /* flush_no is bumped when the current flush pass finishes */
+ int flush_no= log_descriptor.flush_no;
+ DBUG_ENTER("translog_flush_set_new_goal_and_wait");
+ DBUG_PRINT("enter", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ safe_mutex_assert_owner(&log_descriptor.log_flush_lock);
+ if (cmp_translog_addr(lsn, log_descriptor.next_pass_max_lsn) > 0)
+ {
+ /* Raise the next-pass goal and wake a flusher waiting for one */
+ log_descriptor.next_pass_max_lsn= lsn;
+ log_descriptor.max_lsn_requester= pthread_self();
+ pthread_cond_broadcast(&log_descriptor.new_goal_cond);
+ }
+ while (flush_no == log_descriptor.flush_no)
+ {
+ pthread_cond_wait(&log_descriptor.log_flush_cond,
+ &log_descriptor.log_flush_lock);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief sync() range of files (inclusive) and directory (by request)
+
+ @param min min internal file number to flush
+ @param max max internal file number to flush
+ @param sync_dir need sync directory
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool translog_sync_files(uint32 min, uint32 max,
+ my_bool sync_dir)
+{
+ uint fn;
+ my_bool rc= 0;
+ ulonglong flush_interval;
+ DBUG_ENTER("translog_sync_files");
+ DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d",
+ (ulong) min, (ulong) max, (int) sync_dir));
+ DBUG_ASSERT(min <= max);
+
+ /* Remember when this sync pass started, for group-commit rate limiting */
+ flush_interval= group_commit_wait;
+ if (flush_interval)
+ flush_start= my_micro_time();
+ for (fn= min; fn <= max; fn++)
+ {
+ TRANSLOG_FILE *file= get_logfile_by_number(fn);
+ DBUG_ASSERT(file != NULL);
+ if (!file->is_sync)
+ {
+ if (my_sync(file->handler.file, MYF(MY_WME)))
+ {
+ rc= 1;
+ /* A failed sync makes the log unreliable: stop all further writes */
+ translog_stop_writing();
+ DBUG_RETURN(rc);
+ }
+ translog_syncs++;
+ file->is_sync= 1;
+ }
+ }
+
+ if (sync_dir)
+ {
+ /*
+ NOTE(review): 'sync_dir' is both the my_bool parameter tested above
+ and the (fd, flags) call below -- the call presumably expands a
+ function-like macro of the same name; confirm its definition.
+ */
+ if (!(rc= sync_dir(log_descriptor.directory_fd,
+ MYF(MY_WME | MY_IGNORE_BADFD))))
+ translog_syncs++;
+ }
+
+ DBUG_RETURN(rc);
+}
+
+
+/*
+ @brief Flushes buffers with LSNs in them less or equal address <lsn>
+
+ @param lsn address up to which all LSNs should be flushed,
+ can be reset to real last LSN address
+ @param sent_to_disk returns 'sent to disk' position
+ @param flush_horizon returns horizon of the flush
+
+ @note About terminology see comment to translog_flush().
+ @note Entered with the translog lock held (asserted by the helpers
+ called below); the lock is released before buffers are written.
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+ TRANSLOG_ADDRESS *sent_to_disk,
+ TRANSLOG_ADDRESS *flush_horizon)
+{
+ dirty_buffer_mask_t dirty_buffer_mask;
+ uint i;
+ uint8 last_buffer_no, start_buffer_no;
+ DBUG_ENTER("translog_flush_buffers");
+
+ /*
+ We will recheck information when will lock buffers one by
+ one so we can use unprotected read here (this is just for
+ speed up buffers processing)
+ */
+ dirty_buffer_mask= log_descriptor.dirty_buffer_mask;
+ DBUG_PRINT("info", ("Dirty buffer mask: %lx current buffer: %u",
+ (ulong) dirty_buffer_mask,
+ (uint) log_descriptor.bc.buffer_no));
+ /* Find the oldest dirty buffer after the current one (ring order) */
+ for (i= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+ i != log_descriptor.bc.buffer_no && !(dirty_buffer_mask & (1 << i));
+ i= (i + 1) % TRANSLOG_BUFFERS_NO) {}
+ start_buffer_no= i;
+
+ DBUG_PRINT("info",
+ ("start from: %u current: %u prev last lsn: (%lu,0x%lx)",
+ (uint) start_buffer_no, (uint) log_descriptor.bc.buffer_no,
+ LSN_IN_PARTS(log_descriptor.bc.buffer->prev_last_lsn)));
+
+
+ /*
+ if LSN up to which we have to flush bigger then maximum LSN of previous
+ buffer and at least one LSN was saved in the current buffer (last_lsn !=
+ LSN_IMPOSSIBLE) then we have to close the current buffer.
+ */
+ if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+ log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
+ {
+ struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
+ *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+ DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
+ last_buffer_no= log_descriptor.bc.buffer_no;
+ log_descriptor.is_everything_flushed= 1;
+ translog_force_current_buffer_to_finish();
+ translog_buffer_unlock(buffer);
+ }
+ else
+ {
+ last_buffer_no= ((log_descriptor.bc.buffer_no + TRANSLOG_BUFFERS_NO -1) %
+ TRANSLOG_BUFFERS_NO);
+ translog_unlock();
+ }
+
+ /* flush buffers */
+ *sent_to_disk= translog_get_sent_to_disk();
+ if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
+ {
+
+ DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u",
+ (uint) start_buffer_no, (uint) last_buffer_no));
+ last_buffer_no= (last_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+ i= start_buffer_no;
+ /* Write out every still-dirty buffer in [start, last] ring order */
+ do
+ {
+ struct st_translog_buffer *buffer= log_descriptor.buffers + i;
+ translog_buffer_lock(buffer);
+ DBUG_PRINT("info", ("Check buffer: 0x%lx #: %u "
+ "prev last LSN: (%lu,0x%lx) "
+ "last LSN: (%lu,0x%lx) status: %s",
+ (ulong)(buffer),
+ (uint) i,
+ LSN_IN_PARTS(buffer->prev_last_lsn),
+ LSN_IN_PARTS(buffer->last_lsn),
+ (buffer->file ?
+ "dirty" : "closed")));
+ if (buffer->prev_last_lsn <= *lsn &&
+ buffer->file != NULL)
+ {
+ DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+ *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
+ buffer->pre_force_close_horizon :
+ buffer->offset + buffer->size);
+ /* pre_force_close_horizon is reset during new buffer start */
+ DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(*flush_horizon)));
+ DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
+
+ translog_buffer_flush(buffer);
+ }
+ translog_buffer_unlock(buffer);
+ i= (i + 1) % TRANSLOG_BUFFERS_NO;
+ } while (i != last_buffer_no);
+ *sent_to_disk= translog_get_sent_to_disk();
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Flush the log up to given LSN (included)
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @note
+
+ - Non group commit logic: Commits made in passes. Thread which started
+ flush first is performing actual flush, other threads sets new goal (LSN)
+ of the next pass (if it is maximum) and waits for the pass end or just
+ wait for the pass end.
+
+ - If hard group commit enabled and rate set to zero:
+ The first thread sends all changed buffers to disk. This is repeated
+ as long as there are new LSNs added. The process can not loop
+ forever because we have limited number of threads and they will wait
+ for the data to be synced.
+ Pseudo code:
+
+ do
+ send changed buffers to disk
+ while new_goal
+ sync
+
+ - If hard group commit switched ON and less than rate microseconds has
+ passed from last sync, then after buffers have been sent to disk
+ wait until rate microseconds has passed since last sync, do sync and return.
+ This ensures that if we call sync infrequently we don't do any waits.
+
+ - If soft group commit enabled everything works as with 'non group commit'
+ but the thread doesn't do any real sync(). If rate is not zero the
+ sync() will be performed by a service thread with the given rate
+ when needed (new LSN appears).
+
+ @note Terminology:
+ 'sent to disk' means written to disk but not sync()ed,
+ 'flushed' mean sent to disk and synced().
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+ struct timespec abstime;
+ ulonglong flush_interval;
+ ulonglong time_spent;
+ LSN sent_to_disk= LSN_IMPOSSIBLE;
+ TRANSLOG_ADDRESS flush_horizon;
+ my_bool rc= 0;
+ my_bool hgroup_commit_at_start;
+ DBUG_ENTER("translog_flush");
+ DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+ LINT_INIT(sent_to_disk);
+ LINT_INIT(flush_interval);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.flushed)));
+ /* Fast path: target already durable, nothing to do */
+ if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+ {
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ if (log_descriptor.flush_in_progress)
+ {
+ /* Another thread is flushing: register our goal for its next pass */
+ translog_lock();
+ /* fix lsn if it was horizon */
+ if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+ lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+ translog_unlock();
+ translog_flush_set_new_goal_and_wait(lsn);
+ if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+ {
+ /*
+ translog_flush_wait_for_end() release log_flush_lock while is
+ waiting then acquire it again
+ */
+ translog_flush_wait_for_end(lsn);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ /* We were the top requester: take over and run the next pass */
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ }
+ /* From here on we are the single flushing thread */
+ log_descriptor.flush_in_progress= 1;
+ flush_horizon= log_descriptor.previous_flush_horizon;
+ DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(flush_horizon)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ hgroup_commit_at_start= hard_group_commit;
+ if (hgroup_commit_at_start)
+ flush_interval= group_commit_wait;
+
+ translog_lock();
+ if (log_descriptor.is_everything_flushed)
+ {
+ DBUG_PRINT("info", ("everything is flushed"));
+ translog_unlock();
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+
+ for (;;)
+ {
+ /* Following function flushes buffers and makes translog_unlock() */
+ translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+ if (!hgroup_commit_at_start)
+ break; /* flush pass is ended */
+
+retest:
+ /*
+ We do not check time here because pthread_mutex_lock rarely takes
+ a lot of time so we can sacrifice a bit precision to performance
+ (taking into account that my_micro_time() might be expensive call).
+ */
+ if (flush_interval == 0)
+ break; /* flush pass is ended */
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE)
+ {
+ if (flush_interval == 0 ||
+ (time_spent= (my_micro_time() - flush_start)) >= flush_interval)
+ {
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ break;
+ }
+ DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu",
+ flush_interval - time_spent,
+ flush_interval, time_spent));
+ /* wait time or next goal */
+ set_timespec_nsec(abstime, flush_interval - time_spent);
+ pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+ &log_descriptor.log_flush_lock,
+ &abstime);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("retest conditions"));
+ goto retest;
+ }
+
+ /* take next goal */
+ lsn= log_descriptor.next_pass_max_lsn;
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ /* prevent other thread from continue */
+ log_descriptor.max_lsn_requester= pthread_self();
+ DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+ LSN_IN_PARTS(lsn)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ /* next flush pass */
+ DBUG_PRINT("info", ("next flush pass"));
+ translog_lock();
+ }
+
+ /*
+ sync() files from previous flush till current one
+ */
+ if (!soft_sync || hgroup_commit_at_start)
+ {
+ if ((rc=
+ translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+ LSN_FILE_NO(lsn),
+ sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+ (LSN_FILE_NO(log_descriptor.
+ previous_flush_horizon) !=
+ LSN_FILE_NO(flush_horizon) ||
+ (LSN_OFFSET(log_descriptor.
+ previous_flush_horizon) /
+ TRANSLOG_PAGE_SIZE) !=
+ (LSN_OFFSET(flush_horizon) /
+ TRANSLOG_PAGE_SIZE)))))
+ {
+ sent_to_disk= LSN_IMPOSSIBLE;
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+ /* keep values for soft sync() and forced sync() actual */
+ {
+ uint32 fileno= LSN_FILE_NO(lsn);
+ soft_sync_min= fileno;
+ soft_sync_max= fileno;
+ }
+ }
+ else
+ {
+ /* soft group commit: leave the sync() to the service thread */
+ soft_sync_max= LSN_FILE_NO(lsn);
+ soft_need_sync= 1;
+ }
+
+ DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ log_descriptor.previous_flush_horizon= flush_horizon;
+out:
+ if (sent_to_disk != LSN_IMPOSSIBLE)
+ log_descriptor.flushed= sent_to_disk;
+ /* End of pass: wake waiters (flush_no is the pass counter) */
+ log_descriptor.flush_in_progress= 0;
+ log_descriptor.flush_no++;
+ DBUG_PRINT("info", ("flush_in_progress is dropped"));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ pthread_cond_broadcast(&log_descriptor.log_flush_cond);
+ DBUG_RETURN(rc);
+}
+
+
+/**
+ @brief Gives a 2-byte-id to MARIA_SHARE and logs this fact
+
+ If a MARIA_SHARE does not yet have a 2-byte-id (unique over all currently
+ open MARIA_SHAREs), give it one and record this assignment in the log
+ (LOGREC_FILE_ID log record).
+
+ @param tbl_info table
+ @param trn calling transaction
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @note Can be called even if share already has an id (then will do nothing)
+*/
+
+int translog_assign_id_to_share(MARIA_HA *tbl_info, TRN *trn)
+{
+ MARIA_SHARE *share= tbl_info->s;
+ /*
+ If you give an id to a non-BLOCK_RECORD table, you also need to release
+ this id somewhere. Then you can change the assertion.
+ */
+ DBUG_ASSERT(share->data_file_type == BLOCK_RECORD);
+ /* re-check under mutex to avoid having 2 ids for the same share */
+ pthread_mutex_lock(&share->intern_lock);
+ if (unlikely(share->id == 0))
+ {
+ LSN lsn;
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+ uchar log_data[FILEID_STORE_SIZE];
+ /*
+ Inspired by set_short_trid() of trnman.c: start the search from a
+ slot derived from the kfile descriptor to spread contention over
+ the [1..SHARE_ID_MAX] range instead of always probing from 1.
+ */
+ uint i= share->kfile.file % SHARE_ID_MAX + 1;
+ do
+ {
+ my_atomic_rwlock_wrlock(&LOCK_id_to_share);
+ for ( ; i <= SHARE_ID_MAX ; i++) /* the range is [1..SHARE_ID_MAX] */
+ {
+ void *tmp= NULL;
+ /* CAS claims a free slot; losing the race just advances the scan */
+ if (id_to_share[i] == NULL &&
+ my_atomic_casptr((void **)&id_to_share[i], &tmp, share))
+ {
+ share->id= (uint16)i;
+ break;
+ }
+ }
+ my_atomic_rwlock_wrunlock(&LOCK_id_to_share);
+ i= 1; /* scan the whole array */
+ } while (share->id == 0);
+ DBUG_PRINT("info", ("id_to_share: 0x%lx -> %u", (ulong)share, share->id));
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ /*
+ open_file_name is an unresolved name (symlinks are not resolved, datadir
+ is not realpath-ed, etc) which is good: the log can be moved to another
+ directory and continue working.
+ */
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str=
+ (uchar *)share->open_file_name.str;
+ /* NOTE(review): "+ 1" presumably logs the trailing '\0' too — confirm
+ against the LOGREC_FILE_ID reader */
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length=
+ share->open_file_name.length + 1;
+ /*
+ We can't unlock share->intern_lock before the log entry is written to
+ ensure no one uses the id before it's logged.
+ */
+ if (unlikely(translog_write_record(&lsn, LOGREC_FILE_ID, trn, tbl_info,
+ (translog_size_t)
+ (sizeof(log_data) +
+ log_array[TRANSLOG_INTERNAL_PARTS +
+ 1].length),
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, log_data, NULL)))
+ {
+ pthread_mutex_unlock(&share->intern_lock);
+ return 1;
+ }
+ }
+ pthread_mutex_unlock(&share->intern_lock);
+ return 0;
+}
+
+
+/**
+ @brief Recycles a MARIA_SHARE's short id.
+
+ Clears the share's slot in id_to_share and resets share->id to 0 so the
+ id can be handed out to another share.
+
+ @param share table
+
+ @note Must be called only if share has an id (i.e. id != 0)
+*/
+
+void translog_deassign_id_from_share(MARIA_SHARE *share)
+{
+ DBUG_PRINT("info", ("id_to_share: 0x%lx id %u -> 0",
+ (ulong)share, share->id));
+ /*
+ We don't need any mutex as we are called only when closing the last
+ instance of the table or at the end of REPAIR: no writes can be
+ happening. But a Checkpoint may be reading share->id, so we require this
+ mutex:
+ */
+ safe_mutex_assert_owner(&share->intern_lock);
+ /* rdlock is enough here: the store itself is atomic, the rwlock wrapper
+ only orders it against concurrent readers of id_to_share */
+ my_atomic_rwlock_rdlock(&LOCK_id_to_share);
+ my_atomic_storeptr((void **)&id_to_share[share->id], 0);
+ my_atomic_rwlock_rdunlock(&LOCK_id_to_share);
+ share->id= 0;
+ /* useless but safety: */
+ share->lsn_of_file_id= LSN_IMPOSSIBLE;
+}
+
+
+/**
+ @brief Re-installs a known short id for a share during recovery.
+
+ Recovery replays LOGREC_FILE_ID records, so the id is already decided;
+ no locking is needed because recovery is single-threaded (asserted).
+
+ @param share table share to assign the id to
+ @param id short id recorded in the log
+*/
+void translog_assign_id_to_share_from_recovery(MARIA_SHARE *share,
+ uint16 id)
+{
+ DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded);
+ DBUG_ASSERT(share->data_file_type == BLOCK_RECORD);
+ DBUG_ASSERT(share->id == 0);
+ DBUG_ASSERT(id_to_share[id] == NULL);
+ id_to_share[share->id= id]= share;
+}
+
+
+/**
+ @brief check if such log file exists
+
+ @param file_no number of the file to test
+
+ @retval 0 no such file
+ @retval 1 there is file with such number
+*/
+
+my_bool translog_is_file(uint file_no)
+{
+ MY_STAT stat_buff;
+ char path[FN_REFLEN];
+ /* existence test only: a stat() that succeeds means the file is there */
+ return (test(my_stat(translog_filename_by_fileno(file_no, path),
+ &stat_buff, MYF(0))));
+}
+
+
+/**
+ @brief returns minimum log file number
+
+ @param horizon the end of the log
+ @param is_protected true if it is under purge_log protection
+ (i.e. caller already holds purger_lock)
+
+ @retval minimum file number
+ @retval 0 no files found
+*/
+
+static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected)
+{
+ uint min_file= 0, max_file;
+ DBUG_ENTER("translog_first_file");
+ if (!is_protected)
+ pthread_mutex_lock(&log_descriptor.purger_lock);
+ /* fast path: cached minimum is still valid if the file still exists */
+ if (log_descriptor.min_file_number &&
+ translog_is_file(log_descriptor.min_file_number))
+ {
+ DBUG_PRINT("info", ("cached %lu",
+ (ulong) log_descriptor.min_file_number));
+ if (!is_protected)
+ pthread_mutex_unlock(&log_descriptor.purger_lock);
+ DBUG_RETURN(log_descriptor.min_file_number);
+ }
+
+ max_file= LSN_FILE_NO(horizon);
+
+ /*
+ Binary search for the first existing file: files are deleted from the
+ low end only, so existence is monotonic over file numbers. Invariant:
+ min_file does not exist (or is 0), max_file exists.
+ */
+ while (min_file != max_file && min_file != (max_file - 1))
+ {
+ uint test= (min_file + max_file) / 2;
+ DBUG_PRINT("info", ("min_file: %u test: %u max_file: %u",
+ min_file, test, max_file));
+ if (test == max_file)
+ test--;
+ if (translog_is_file(test))
+ max_file= test;
+ else
+ min_file= test;
+ }
+ log_descriptor.min_file_number= max_file;
+ if (!is_protected)
+ pthread_mutex_unlock(&log_descriptor.purger_lock);
+ DBUG_PRINT("info", ("first file :%lu", (ulong) max_file));
+ DBUG_ASSERT(max_file >= 1);
+ DBUG_RETURN(max_file);
+}
+
+
+/**
+ @brief returns the most close LSN higher the given chunk address
+
+ Scans the log forward from addr, skipping non-LSN chunks, until a chunk
+ that carries an LSN is found or the log/page filler is reached.
+
+ @param addr the chunk address to start from
+ @param horizon the horizon if it is known or LSN_IMPOSSIBLE
+
+ @retval LSN_ERROR Error
+ @retval LSN_IMPOSSIBLE no LSNs after the address
+ @retval # LSN of the most close LSN higher the given chunk address
+*/
+
+LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon)
+{
+ TRANSLOG_SCANNER_DATA scanner;
+ LSN result;
+ DBUG_ENTER("translog_next_LSN");
+
+ if (horizon == LSN_IMPOSSIBLE)
+ horizon= translog_get_horizon();
+
+ /* nothing written after addr */
+ if (addr == horizon)
+ DBUG_RETURN(LSN_IMPOSSIBLE);
+
+ translog_scanner_init(addr, 0, &scanner, 1);
+ /*
+ addr can point not to a chunk beginning but page end so next
+ page beginning.
+ */
+ if (addr % TRANSLOG_PAGE_SIZE == 0)
+ {
+ /*
+ We are emulating the page end which cased such horizon value to
+ trigger translog_scanner_eop().
+
+ We can't just increase addr on page header overhead because it
+ can be file end so we allow translog_get_next_chunk() to skip
+ to the next page in correct way
+ */
+ scanner.page_addr-= TRANSLOG_PAGE_SIZE;
+ scanner.page_offset= TRANSLOG_PAGE_SIZE;
+#ifndef DBUG_OFF
+ scanner.page= NULL; /* prevent using incorrect page content */
+#endif
+ }
+ /* addr can point not to a chunk beginning but to a page end */
+ if (translog_scanner_eop(&scanner))
+ {
+ if (translog_get_next_chunk(&scanner))
+ {
+ result= LSN_ERROR;
+ goto out;
+ }
+ if (scanner.page == END_OF_LOG)
+ {
+ result= LSN_IMPOSSIBLE;
+ goto out;
+ }
+ }
+
+ /* skip chunks that carry no LSN until an LSN chunk or filler is met */
+ while (!translog_is_LSN_chunk(scanner.page[scanner.page_offset]) &&
+ scanner.page[scanner.page_offset] != TRANSLOG_FILLER)
+ {
+ if (translog_get_next_chunk(&scanner))
+ {
+ result= LSN_ERROR;
+ goto out;
+ }
+ if (scanner.page == END_OF_LOG)
+ {
+ result= LSN_IMPOSSIBLE;
+ goto out;
+ }
+ }
+
+ if (scanner.page[scanner.page_offset] == TRANSLOG_FILLER)
+ result= LSN_IMPOSSIBLE; /* reached page filler */
+ else
+ result= scanner.page_addr + scanner.page_offset;
+out:
+ translog_destroy_scanner(&scanner);
+ DBUG_RETURN(result);
+}
+
+
+/**
+ @brief returns the LSN of the first record starting in this log
+
+ Finds the first existing log file, reads its first data page and then
+ walks forward to the first chunk that carries an LSN.
+
+ @retval LSN_ERROR Error
+ @retval LSN_IMPOSSIBLE no log or the log is empty
+ @retval # LSN of the first record
+*/
+
+LSN translog_first_lsn_in_log()
+{
+ TRANSLOG_ADDRESS addr, horizon= translog_get_horizon();
+ TRANSLOG_VALIDATOR_DATA data;
+ uint file;
+ uint16 chunk_offset;
+ uchar *page;
+ DBUG_ENTER("translog_first_lsn_in_log");
+ DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)", LSN_IN_PARTS(horizon)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+
+ if (!(file= translog_first_file(horizon, 0)))
+ {
+ /* log has no records yet */
+ DBUG_RETURN(LSN_IMPOSSIBLE);
+ }
+
+ addr= MAKE_LSN(file, TRANSLOG_PAGE_SIZE); /* the first page of the file */
+ data.addr= &addr;
+ {
+ TRANSLOG_PAGE_SIZE_BUFF psize_buff;
+ if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL ||
+ (chunk_offset= translog_get_first_chunk_offset(page)) == 0)
+ DBUG_RETURN(LSN_ERROR);
+ }
+ /* move past the page header to the first chunk on the page */
+ addr+= chunk_offset;
+
+ DBUG_RETURN(translog_next_LSN(addr, horizon));
+}
+
+
+/**
+ @brief Returns theoretical first LSN if first log is present
+
+ The "theoretical" first LSN is the address right after the header
+ overhead of the first data page of file 1, whether or not a record was
+ actually written there.
+
+ @retval LSN_ERROR Error
+ @retval LSN_IMPOSSIBLE no log
+ @retval # LSN of the first record
+*/
+
+LSN translog_first_theoretical_lsn()
+{
+ TRANSLOG_ADDRESS addr= translog_get_horizon();
+ TRANSLOG_PAGE_SIZE_BUFF psize_buff;
+ uchar *page;
+ TRANSLOG_VALIDATOR_DATA data;
+ DBUG_ENTER("translog_first_theoretical_lsn");
+ DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)", LSN_IN_PARTS(addr)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+
+ if (!translog_is_file(1))
+ DBUG_RETURN(LSN_IMPOSSIBLE);
+ if (addr == MAKE_LSN(1, TRANSLOG_PAGE_SIZE))
+ {
+ /* log has no records yet: use the compiled-in page overhead */
+ DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE +
+ log_descriptor.page_overhead));
+ }
+
+ addr= MAKE_LSN(1, TRANSLOG_PAGE_SIZE); /* the first page of the file */
+ data.addr= &addr;
+ if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL)
+ DBUG_RETURN(LSN_ERROR);
+
+ /* overhead depends on the flags actually stored on that page */
+ DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE +
+ page_overhead[page[TRANSLOG_PAGE_FLAGS]]));
+}
+
+
+/**
+ @brief Checks the given low-water mark and purges log files if needed
+
+ Deletes (or at least evicts from the descriptor cache) log files that lie
+ entirely below the given address, taking soft-sync state into account so
+ that un-synced files are never purged.
+
+ @param low the last (minimum) address which is need
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool translog_purge(TRANSLOG_ADDRESS low)
+{
+ uint32 last_need_file= LSN_FILE_NO(low);
+ uint32 min_unsync;
+ int soft;
+ TRANSLOG_ADDRESS horizon= translog_get_horizon();
+ int rc= 0;
+ DBUG_ENTER("translog_purge");
+ DBUG_PRINT("enter", ("low: (%lu,0x%lx)", LSN_IN_PARTS(low)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+
+ /* never purge past the first file that soft sync has not fsync'ed yet */
+ soft= soft_sync;
+ min_unsync= soft_sync_min;
+ DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
+ if (soft && min_unsync < last_need_file)
+ {
+ last_need_file= min_unsync;
+ DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
+ }
+
+ pthread_mutex_lock(&log_descriptor.purger_lock);
+ DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
+ (ulong) log_descriptor.last_lsn_checked));
+ if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
+ {
+ uint32 i;
+ uint32 min_file= translog_first_file(horizon, 1);
+ DBUG_ASSERT(min_file != 0); /* log is already started */
+ DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file));
+ for(i= min_file; i < last_need_file && rc == 0; i++)
+ {
+ LSN lsn= translog_get_file_max_lsn_stored(i);
+ if (lsn == LSN_IMPOSSIBLE)
+ break; /* files are still in writing */
+ if (lsn == LSN_ERROR)
+ {
+ rc= 1;
+ break;
+ }
+ /* file contains records at or above the low-water mark: keep it */
+ if (cmp_translog_addr(lsn, low) >= 0)
+ break;
+
+ DBUG_PRINT("info", ("purge file %lu", (ulong) i));
+
+ /* remove file descriptor from the cache */
+ /*
+ log_descriptor.min_file can be changed only here during execution
+ and the function is serialized, so we can access it without problems
+ */
+ if (i >= log_descriptor.min_file)
+ {
+ TRANSLOG_FILE *file;
+ rw_wrlock(&log_descriptor.open_files_lock);
+ DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+ log_descriptor.open_files.elements);
+ DBUG_ASSERT(log_descriptor.min_file == i);
+ file= *((TRANSLOG_FILE **)pop_dynamic(&log_descriptor.open_files));
+ DBUG_PRINT("info", ("Files : %d", log_descriptor.open_files.elements));
+ DBUG_ASSERT(i == file->number);
+ log_descriptor.min_file++;
+ DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+ log_descriptor.open_files.elements);
+ rw_unlock(&log_descriptor.open_files_lock);
+ translog_close_log_file(file);
+ }
+ /* with ONDEMAND purging the actual delete happens at flush time */
+ if (log_purge_type == TRANSLOG_PURGE_IMMIDIATE)
+ {
+ char path[FN_REFLEN], *file_name;
+ file_name= translog_filename_by_fileno(i, path);
+ rc= test(my_delete(file_name, MYF(MY_WME)));
+ }
+ }
+ if (unlikely(rc == 1))
+ log_descriptor.min_need_file= 0; /* impossible value */
+ else
+ log_descriptor.min_need_file= i;
+ }
+
+ pthread_mutex_unlock(&log_descriptor.purger_lock);
+ DBUG_RETURN(rc);
+}
+
+
+/**
+ @brief Purges files by stored min need file in case of
+ "on-demand" purge type
+
+ @note This function does real work only if it is "on-demand" purge type
+ and translog_purge() was called at least once and last time without
+ errors
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool translog_purge_at_flush()
+{
+ uint32 i, min_file;
+ int rc= 0;
+ DBUG_ENTER("translog_purge_at_flush");
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+
+ if (unlikely(translog_status == TRANSLOG_READONLY))
+ {
+ DBUG_PRINT("info", ("The log is read only => exit"));
+ DBUG_RETURN(0);
+ }
+
+ if (log_purge_type != TRANSLOG_PURGE_ONDEMAND)
+ {
+ DBUG_PRINT("info", ("It is not \"at_flush\" => exit"));
+ DBUG_RETURN(0);
+ }
+
+ pthread_mutex_lock(&log_descriptor.purger_lock);
+
+ /* min_need_file == 0 means translog_purge() never succeeded yet */
+ if (unlikely(log_descriptor.min_need_file == 0))
+ {
+ DBUG_PRINT("info", ("No info about min need file => exit"));
+ pthread_mutex_unlock(&log_descriptor.purger_lock);
+ DBUG_RETURN(0);
+ }
+
+ min_file= translog_first_file(translog_get_horizon(), 1);
+ DBUG_ASSERT(min_file != 0); /* log is already started */
+ /* delete every file strictly below the recorded minimum needed file */
+ for(i= min_file; i < log_descriptor.min_need_file && rc == 0; i++)
+ {
+ char path[FN_REFLEN], *file_name;
+ DBUG_PRINT("info", ("purge file %lu\n", (ulong) i));
+ file_name= translog_filename_by_fileno(i, path);
+ rc= test(my_delete(file_name, MYF(MY_WME)));
+ }
+
+ pthread_mutex_unlock(&log_descriptor.purger_lock);
+ DBUG_RETURN(rc);
+}
+
+
+/**
+ @brief Gets min file number
+
+ Public wrapper around translog_first_file() that takes the purger lock
+ itself (is_protected == 0).
+
+ @param horizon the end of the log
+
+ @retval minimum file number
+ @retval 0 no files found
+*/
+
+uint32 translog_get_first_file(TRANSLOG_ADDRESS horizon)
+{
+ return translog_first_file(horizon, 0);
+}
+
+
+/**
+ @brief Gets min file number which is needed
+
+ Reads log_descriptor.min_need_file under the purger lock.
+
+ @retval minimum file number
+ @retval 0 unknown
+*/
+
+uint32 translog_get_first_needed_file()
+{
+ uint32 file_no;
+ pthread_mutex_lock(&log_descriptor.purger_lock);
+ file_no= log_descriptor.min_need_file;
+ pthread_mutex_unlock(&log_descriptor.purger_lock);
+ return file_no;
+}
+
+
+/**
+ @brief Gets transaction log file size
+
+ Reads log_descriptor.log_file_max_size under the translog lock.
+
+ @return transaction log file size
+*/
+
+uint32 translog_get_file_size()
+{
+ uint32 res;
+ translog_lock();
+ res= log_descriptor.log_file_max_size;
+ translog_unlock();
+ return (res);
+}
+
+
+/**
+ @brief Sets transaction log file size
+
+ If the current log file already exceeds the new maximum, the current
+ buffer is rotated so a new file is started, and the old buffer is
+ flushed outside the translog lock.
+
+ @param size new maximum log file size; must be a multiple of
+ TRANSLOG_PAGE_SIZE and at least TRANSLOG_MIN_FILE_SIZE
+*/
+
+void translog_set_file_size(uint32 size)
+{
+ struct st_translog_buffer *old_buffer= NULL;
+ DBUG_ENTER("translog_set_file_size");
+ translog_lock();
+ DBUG_PRINT("enter", ("Size: %lu", (ulong) size));
+ DBUG_ASSERT(size % TRANSLOG_PAGE_SIZE == 0 &&
+ size >= TRANSLOG_MIN_FILE_SIZE);
+ log_descriptor.log_file_max_size= size;
+ /* if current file longer then finish it*/
+ if (LSN_OFFSET(log_descriptor.horizon) >= log_descriptor.log_file_max_size)
+ {
+ old_buffer= log_descriptor.bc.buffer;
+ translog_buffer_next(&log_descriptor.horizon, &log_descriptor.bc, 1);
+ translog_buffer_unlock(old_buffer);
+ }
+ translog_unlock();
+ /* flush the finished buffer after releasing the global translog lock */
+ if (old_buffer)
+ {
+ translog_buffer_lock(old_buffer);
+ translog_buffer_flush(old_buffer);
+ translog_buffer_unlock(old_buffer);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Write debug information to the log if EXTRA_DEBUG is enabled
+
+ Without EXTRA_DEBUG this compiles to a no-op returning 0.
+
+ @param trn transaction to attribute the record to (may be NULL)
+ @param type debug info type tag, stored as the first payload byte
+ @param info payload bytes
+ @param length payload length
+
+ @return result of translog_write_record() (0 on success)
+*/
+
+my_bool translog_log_debug_info(TRN *trn __attribute__((unused)),
+ enum translog_debug_info_type type
+ __attribute__((unused)),
+ uchar *info __attribute__((unused)),
+ size_t length __attribute__((unused)))
+{
+#ifdef EXTRA_DEBUG
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+ uchar debug_type;
+ LSN lsn;
+
+ if (!trn)
+ {
+ /*
+ We can't log the current transaction because we don't have
+ an active transaction. Use a temporary transaction object instead
+ */
+ trn= &dummy_transaction_object;
+ }
+ debug_type= (uchar) type;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= &debug_type;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= 1;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= info;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
+ /* total size is the 1 type byte plus the payload */
+ return translog_write_record(&lsn, LOGREC_DEBUG_INFO,
+ trn, NULL,
+ (translog_size_t) (1+ length),
+ sizeof(log_array)/sizeof(log_array[0]),
+ log_array, NULL, NULL);
+#else
+ return 0;
+#endif
+}
+
+
+
+/**
+ Sets soft sync mode
+
+ @param mode TRUE if we need switch soft sync on else off
+
+ @note simple flag assignment; callers are responsible for any needed
+ synchronization around mode switches (see translog_sync())
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+ soft_sync= mode;
+}
+
+
+/**
+ Sets hard group commit
+
+ @param mode TRUE if we need switch hard group commit on else off
+
+ @note simple flag assignment, read by the flush code
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+ hard_group_commit= mode;
+}
+
+
+/**
+ @brief forced log sync (used when we are switching modes)
+
+ Syncs every file from the first un-soft-synced one (or the current file
+ if none is pending) up to the current log file.
+*/
+
+void translog_sync()
+{
+ uint32 max= get_current_logfile()->number;
+ uint32 min;
+ DBUG_ENTER("ma_translog_sync");
+
+ /* no pending soft-sync start recorded: sync only the current file */
+ min= soft_sync_min;
+ if (!min)
+ min= max;
+
+ translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief set rate for group commit
+
+ @param interval interval to set.
+
+ @note We use this function with additional variable because have to
+ restart service thread with new value which we can't make inside changing
+ variable routine (update_maria_group_commit_interval)
+*/
+
+void translog_set_group_commit_interval(uint32 interval)
+{
+ DBUG_ENTER("translog_set_group_commit_interval");
+ group_commit_wait= interval;
+ DBUG_PRINT("info", ("wait: %llu",
+ (ulonglong)group_commit_wait));
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief syncing service thread
+
+ Periodically syncs the [soft_sync_min, soft_sync_max] file range when a
+ sync was requested, then sleeps for the remainder of the group-commit
+ interval. Exits when my_service_thread_sleep() reports shutdown.
+
+ NOTE(review): soft_sync_min/max/soft_need_sync are read and written here
+ without a lock; presumably these are atomic-enough word accesses by
+ design — confirm against the declarations of these variables.
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+ my_thread_init();
+ {
+ DBUG_ENTER("ma_soft_sync_background");
+ for(;;)
+ {
+ ulonglong prev_loop= my_micro_time();
+ ulonglong time, sleep;
+ uint32 min, max, sync_request;
+ min= soft_sync_min;
+ max= soft_sync_max;
+ sync_request= soft_need_sync;
+ soft_sync_min= max;
+ soft_need_sync= 0;
+
+ sleep= group_commit_wait;
+ if (sync_request)
+ translog_sync_files(min, max, FALSE);
+ /* sleep only for what is left of the interval after the sync */
+ time= my_micro_time() - prev_loop;
+ if (time > sleep)
+ sleep= 0;
+ else
+ sleep-= time;
+ if (my_service_thread_sleep(&soft_sync_control, sleep))
+ break;
+ }
+ my_service_thread_signal_end(&soft_sync_control);
+ my_thread_end();
+ DBUG_RETURN(0);
+ }
+}
+
+
+/**
+ @brief Starts syncing thread
+
+ Initializes the soft-sync bookkeeping variables and spawns
+ ma_soft_sync_background().
+
+ @retval 0 OK
+ @retval # error from thread-control init or pthread_create
+*/
+
+int translog_soft_sync_start(void)
+{
+ pthread_t th;
+ int res= 0;
+ uint32 min, max;
+ DBUG_ENTER("translog_soft_sync_start");
+
+ /* check and init variables */
+ min= soft_sync_min;
+ max= soft_sync_max;
+ if (!max)
+ soft_sync_max= max= get_current_logfile()->number;
+ if (!min)
+ soft_sync_min= max;
+ /* force an initial sync pass in the background thread */
+ soft_need_sync= 1;
+
+ if (!(res= ma_service_thread_control_init(&soft_sync_control)))
+ if (!(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+ soft_sync_control.status= THREAD_RUNNING;
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Stops syncing thread
+
+ Safe to call even if translog_soft_sync_start() was never run (checks
+ the inited flag).
+*/
+
+void translog_soft_sync_end(void)
+{
+ DBUG_ENTER("translog_soft_sync_end");
+ if (soft_sync_control.inited)
+ {
+ ma_service_thread_control_end(&soft_sync_control);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+#ifdef MARIA_DUMP_LOG
+#include <my_getopt.h>
+extern void translog_example_table_init();
+static const char *load_default_groups[]= { "aria_dump_log",0 };
+static void get_options(int *argc,char * * *argv);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+const char *default_dbug_option= "d:t:i:O,\\aria_dump_log.trace";
+#else
+const char *default_dbug_option= "d:t:i:o,/tmp/aria_dump_log.trace";
+#endif
+#endif
+/* command-line state for the aria_dump_log tool */
+static ulonglong opt_offset; /* byte offset to start reading from */
+static ulong opt_pages; /* number of pages to dump */
+static const char *opt_file= NULL; /* log file path (mandatory) */
+static File handler= -1; /* descriptor of the opened log file */
+static my_bool opt_unit= 0; /* use unittest record table */
+static struct my_option my_long_options[] =
+{
+/* NOTE(review): guard is misspelled ("IMPLTMENTED") so this option stays
+ disabled; opt_body is not declared, so do not "fix" the spelling without
+ implementing the option */
+#ifdef IMPLTMENTED
+ {"body", 'b',
+ "Print chunk body dump",
+ (uchar **) &opt_body, (uchar **) &opt_body, 0,
+ GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+#ifndef DBUG_OFF
+ {"debug", '#', "Output debug log. Often the argument is 'd:t:o,filename'.",
+ 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"file", 'f', "Path to file which will be read",
+ (uchar**) &opt_file, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"help", '?', "Display this help and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { "offset", 'o', "Start reading log from this offset",
+ (uchar**) &opt_offset, (uchar**) &opt_offset,
+ 0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 },
+ { "pages", 'n', "Number of pages to read",
+ (uchar**) &opt_pages, (uchar**) &opt_pages, 0,
+ GET_ULONG, REQUIRED_ARG, (long) ~(ulong) 0,
+ (long) 1, (long) ~(ulong) 0, (long) 0,
+ (long) 1, 0},
+ {"unit-test", 'U',
+ "Use unit test record table (for logs created by unittests)",
+ (uchar **) &opt_unit, (uchar **) &opt_unit, 0,
+ GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"version", 'V', "Print version and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+/* Print the tool name/version banner to stdout. */
+static void print_version(void)
+{
+ VOID(printf("%s Ver 1.0 for %s on %s\n",
+ my_progname_short, SYSTEM_TYPE, MACHINE_TYPE));
+ NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+/* Print version banner, license blurb, option help and defaults. */
+static void usage(void)
+{
+ print_version();
+ puts("Copyright (C) 2008 MySQL AB");
+ puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
+ puts("and you are welcome to modify and redistribute it under the GPL license\n");
+
+ puts("Dump content of aria log pages.");
+ VOID(printf("\nUsage: %s -f file OPTIONS\n", my_progname_short));
+ my_print_help(my_long_options);
+ print_defaults("my", load_default_groups);
+ my_print_variables(my_long_options);
+}
+
+
+/*
+ my_getopt callback: handle options with side effects (--help, --version,
+ --debug). Value-bearing options are stored directly by handle_options()
+ via the pointers in my_long_options, so they need no case here.
+*/
+static my_bool
+get_one_option(int optid __attribute__((unused)),
+ const struct my_option *opt __attribute__((unused)),
+ char *argument __attribute__((unused)))
+{
+ switch (optid) {
+ case '?':
+ usage();
+ exit(0);
+ case 'V':
+ print_version();
+ exit(0);
+#ifndef DBUG_OFF
+ case '#':
+ DBUG_SET_INITIAL(argument ? argument : default_dbug_option);
+ break;
+#endif
+ }
+ return 0;
+}
+
+
+/*
+ Parse command-line options; exits the process on parse error or when
+ the mandatory --file option is missing.
+*/
+static void get_options(int *argc,char ***argv)
+{
+ int ho_error;
+
+ if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
+ exit(ho_error);
+
+ if (opt_file == NULL)
+ {
+ usage();
+ exit(1);
+ }
+}
+
+
+/**
+ @brief Dump information about file header page.
+
+ Decodes the log-file header stored in buff and prints its fields;
+ warns if the recorded page size differs from the compiled-in one.
+*/
+
+static void dump_header_page(uchar *buff)
+{
+ LOGHANDLER_FILE_INFO desc;
+ char strbuff[21];
+ LINT_INIT_STRUCT(desc);
+ translog_interpret_file_header(&desc, buff);
+ printf(" This can be header page:\n"
+ " Timestamp: %s\n"
+ " Aria log version: %lu\n"
+ " Server version: %lu\n"
+ " Server id %lu\n"
+ " Page size %lu\n",
+ llstr(desc.timestamp, strbuff),
+ desc.maria_version,
+ desc.mysql_version,
+ desc.server_id,
+ desc.page_size);
+ if (desc.page_size != TRANSLOG_PAGE_SIZE)
+ printf(" WARNING: page size is not equal compiled in one %lu!!!\n",
+ (ulong) TRANSLOG_PAGE_SIZE);
+ printf(" File number %lu\n"
+ " Max lsn: (%lu,0x%lx)\n",
+ desc.file_number,
+ LSN_IN_PARTS(desc.max_lsn));
+}
+
+/* printable names for record classes, indexed by the rclass enum value */
+static const char *record_class_string[]=
+{
+ "LOGRECTYPE_NOT_ALLOWED",
+ "LOGRECTYPE_VARIABLE_LENGTH",
+ "LOGRECTYPE_PSEUDOFIXEDLENGTH",
+ "LOGRECTYPE_FIXEDLENGTH"
+};
+
+
+/**
+ @brief dump information about transaction log chunk
+
+ Prints a human-readable interpretation of the chunk at ptr and returns
+ the position of the next chunk.
+
+ @param buffer reference to the whole page
+ @param ptr pointer to the chunk
+
+ @retval # reference to the next chunk
+ @retval NULL can't interpret data
+*/
+
+static uchar *dump_chunk(uchar *buffer, uchar *ptr)
+{
+ uint length;
+ if (*ptr == TRANSLOG_FILLER)
+ {
+ /* rest of the page must be filler bytes only */
+ printf(" Filler till the page end\n");
+ for (; ptr < buffer + TRANSLOG_PAGE_SIZE; ptr++)
+ {
+ if (*ptr != TRANSLOG_FILLER)
+ {
+ printf(" WARNING: non filler character met before page end "
+ "(page + 0x%04x: 0x%02x) (stop interpretation)!!!\n",
+ (uint) (ptr - buffer), (uint) ptr[0]);
+ return NULL;
+ }
+ }
+ return ptr;
+ }
+ if (*ptr == 0 || *ptr == 0xFF)
+ {
+ /* 0x00 and 0xFF are not valid chunk type bytes */
+ printf(" WARNING: chunk can't start from 0x0 or 0xFF "
+ "(stop interpretation)!!!\n");
+ return NULL;
+ }
+ switch (ptr[0] & TRANSLOG_CHUNK_TYPE) {
+ case TRANSLOG_CHUNK_LSN:
+ printf(" LSN chunk type 0 (variable length)\n");
+ if (likely((ptr[0] & TRANSLOG_REC_TYPE) != TRANSLOG_CHUNK_0_CONT))
+ {
+ printf(" Record type %u: %s record class %s compressed LSNs: %u\n",
+ ptr[0] & TRANSLOG_REC_TYPE,
+ (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name ?
+ log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name :
+ "NULL"),
+ record_class_string[log_record_type_descriptor[ptr[0] &
+ TRANSLOG_REC_TYPE].
+ rclass],
+ log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].
+ compressed_LSN);
+ if (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass !=
+ LOGRECTYPE_VARIABLE_LENGTH)
+ {
+ printf(" WARNING: this record class here can't be used "
+ "(stop interpretation)!!!\n");
+ break;
+ }
+ }
+ else
+ printf(" Continuation of previous chunk 0 header \n");
+ printf(" Short transaction id: %u\n", (uint) uint2korr(ptr + 1));
+ {
+ uchar *hdr_ptr= ptr + 1 + 2; /* chunk type and short trid */
+ uint16 chunk_len;
+ printf (" Record length: %lu\n",
+ (ulong) translog_variable_record_1group_decode_len(&hdr_ptr));
+ chunk_len= uint2korr(hdr_ptr);
+ if (chunk_len == 0)
+ printf (" It is 1 group record (chunk length == 0)\n");
+ else
+ {
+ uint16 groups, i;
+
+ printf (" Chunk length %u\n", (uint) chunk_len);
+ groups= uint2korr(hdr_ptr + 2);
+ hdr_ptr+= 4;
+ printf (" Number of groups left to the end %u:\n", (uint) groups);
+ /* stop at the page end even if the group list claims more entries */
+ for(i= 0;
+ i < groups && hdr_ptr < buffer + TRANSLOG_PAGE_SIZE;
+ i++, hdr_ptr+= LSN_STORE_SIZE + 1)
+ {
+ TRANSLOG_ADDRESS gpr_addr= lsn_korr(hdr_ptr);
+ uint pages= hdr_ptr[LSN_STORE_SIZE];
+ printf (" Group +#%u: (%lu,0x%lx) pages: %u\n",
+ (uint) i, LSN_IN_PARTS(gpr_addr), pages);
+ }
+ }
+ }
+ break;
+ case TRANSLOG_CHUNK_FIXED:
+ printf(" LSN chunk type 1 (fixed size)\n");
+ printf(" Record type %u: %s record class %s compressed LSNs: %u\n",
+ ptr[0] & TRANSLOG_REC_TYPE,
+ (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name ?
+ log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name :
+ "NULL"),
+ record_class_string[log_record_type_descriptor[ptr[0] &
+ TRANSLOG_REC_TYPE].
+ rclass],
+ log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].
+ compressed_LSN);
+ if (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass !=
+ LOGRECTYPE_PSEUDOFIXEDLENGTH &&
+ log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass !=
+ LOGRECTYPE_FIXEDLENGTH)
+ {
+ printf(" WARNING: this record class here can't be used "
+ "(stop interpretation)!!!\n");
+ }
+ printf(" Short transaction id: %u\n", (uint) uint2korr(ptr + 1));
+ break;
+ case TRANSLOG_CHUNK_NOHDR:
+ printf(" No header chunk type 2(till the end of the page)\n");
+ if (ptr[0] & TRANSLOG_REC_TYPE)
+ {
+ printf(" WARNING: chunk header content record type: 0x%02x "
+ "(stop interpretation)!!!\n",
+ (uint) ptr[0]);
+ return NULL;
+ }
+ break;
+ case TRANSLOG_CHUNK_LNGTH:
+ printf(" Chunk with length type 3\n");
+ if (ptr[0] & TRANSLOG_REC_TYPE)
+ {
+ printf(" WARNING: chunk header content record type: 0x%02x "
+ "(stop interpretation)!!!\n",
+ (uint) ptr[0]);
+ return NULL;
+ }
+ break;
+ }
+ {
+ intptr offset= ptr - buffer;
+ DBUG_ASSERT(offset >= 0 && offset <= UINT_MAX16);
+ length= translog_get_total_chunk_length(buffer, (uint16)offset);
+ }
+ printf(" Length %u\n", length);
+ ptr+= length;
+ return ptr;
+}
+
+
+/**
+ @brief Dump information about page with data.
+
+ Prints page/file numbers, page flags, header length, CRC and sector
+ protection info (when present), then iterates over the chunks via
+ dump_chunk().
+*/
+
+static void dump_datapage(uchar *buffer)
+{
+ uchar *ptr;
+ ulong offset;
+ uint32 page, file;
+ uint header_len;
+ printf(" Page: %ld File number: %ld\n",
+ (ulong) (page= uint3korr(buffer)),
+ (ulong) (file= uint3korr(buffer + 3)),
+ (ulong) (file= uint3korr(buffer + 3)));
+ if (page == 0)
+ printf(" WARNING: page == 0!!!\n");
+ if (file == 0)
+ printf(" WARNING: file == 0!!!\n");
+ /* NOTE(review): page * TRANSLOG_PAGE_SIZE in ulong may wrap for very
+ large page numbers on 32-bit ulong — display-only, but confirm */
+ offset= page * TRANSLOG_PAGE_SIZE;
+ printf(" Flags (0x%x):\n", (uint) buffer[TRANSLOG_PAGE_FLAGS]);
+ if (buffer[TRANSLOG_PAGE_FLAGS])
+ {
+ if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC)
+ printf(" Page CRC\n");
+ if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION)
+ printf(" Sector protection\n");
+ if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_RECORD_CRC)
+ printf(" Record CRC (WARNING: not yet implemented!!!)\n");
+ if (buffer[TRANSLOG_PAGE_FLAGS] & ~(TRANSLOG_PAGE_CRC |
+ TRANSLOG_SECTOR_PROTECTION |
+ TRANSLOG_RECORD_CRC))
+ {
+ printf(" WARNING: unknown flags (stop interpretation)!!!\n");
+ return;
+ }
+ }
+ else
+ printf(" No flags\n");
+ printf(" Page header length: %u\n",
+ (header_len= page_overhead[buffer[TRANSLOG_PAGE_FLAGS]]));
+ if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_RECORD_CRC)
+ {
+ uint32 crc= uint4korr(buffer + TRANSLOG_PAGE_FLAGS + 1);
+ uint32 ccrc;
+ printf (" Page CRC 0x%04lx\n", (ulong) crc);
+ /* recompute CRC over the page body and compare with the stored one */
+ ccrc= translog_crc(buffer + header_len, TRANSLOG_PAGE_SIZE - header_len);
+ if (crc != ccrc)
+ printf(" WARNING: calculated CRC: 0x%04lx!!!\n", (ulong) ccrc);
+ }
+ if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION)
+ {
+ TRANSLOG_FILE tfile;
+ {
+ uchar *table= buffer + header_len -
+ TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+ uint i;
+ printf(" Sector protection current value: 0x%02x\n", (uint) table[0]);
+ for (i= 1; i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; i++)
+ {
+ printf(" Sector protection in sector: 0x%02x saved value 0x%02x\n",
+ (uint)buffer[i * DISK_DRIVE_SECTOR_SIZE],
+ (uint)table[i]);
+ }
+ }
+ /* build a minimal TRANSLOG_FILE so the checker can run standalone */
+ tfile.number= file;
+ tfile.handler.file= handler;
+ pagecache_file_init(tfile.handler, NULL, NULL, NULL, NULL, NULL);
+ tfile.was_recovered= 0;
+ tfile.is_sync= 1;
+ if (translog_check_sector_protection(buffer, &tfile))
+ printf(" WARNING: sector protection found problems!!!\n");
+ }
+ ptr= buffer + header_len;
+ while (ptr && ptr < buffer + TRANSLOG_PAGE_SIZE)
+ {
+ printf(" Chunk (%lu,0x%lx):\n",
+ (ulong)file, (ulong) offset + (ptr - buffer));
+ ptr= dump_chunk(buffer, ptr);
+ }
+}
+
+
+/**
+ @brief Dump information about page.
+
+ If the page starts with the log-file magic it is printed as a header
+ page; NOTE(review): it is then ALSO dumped as a data page — presumably
+ intentional (header pages share the layout), confirm.
+*/
+
+static void dump_page(uchar *buffer)
+{
+ printf("Page by offset %llu (0x%llx)\n", opt_offset, opt_offset);
+ if (strncmp((char*)maria_trans_file_magic, (char*)buffer,
+ sizeof(maria_trans_file_magic)) == 0)
+ {
+ dump_header_page(buffer);
+ }
+ dump_datapage(buffer);
+}
+
+
+/**
+ @brief maria_dump_log main function.
+
+ Parses options, opens the log file and dumps opt_pages pages starting
+ at opt_offset, one TRANSLOG_PAGE_SIZE page at a time. A short read
+ (HA_ERR_FILE_TOO_SHORT) ends the dump normally.
+
+ Fix vs. original: the err path used to call my_close() even when
+ my_open() had failed, closing an invalid descriptor (-1); both close
+ sites are now guarded.
+*/
+
+int main(int argc, char **argv)
+{
+ char **default_argv;
+ uchar buffer[TRANSLOG_PAGE_SIZE];
+ MY_INIT(argv[0]);
+
+ load_defaults("my", load_default_groups, &argc, &argv);
+ default_argv= argv;
+ get_options(&argc, &argv);
+
+ /* pick the record table matching how the log was produced */
+ if (opt_unit)
+ translog_example_table_init();
+ else
+ translog_table_init();
+ translog_fill_overhead_table();
+
+ maria_data_root= (char *)".";
+
+ if ((handler= my_open(opt_file, O_RDONLY, MYF(MY_WME))) < 0)
+ {
+ fprintf(stderr, "Can't open file: '%s' errno: %d\n",
+ opt_file, my_errno);
+ goto err;
+ }
+ if (my_seek(handler, opt_offset, SEEK_SET, MYF(MY_WME)) !=
+ opt_offset)
+ {
+ fprintf(stderr, "Can't set position %lld file: '%s' errno: %d\n",
+ opt_offset, opt_file, my_errno);
+ goto err;
+ }
+ for (;
+ opt_pages;
+ opt_offset+= TRANSLOG_PAGE_SIZE, opt_pages--)
+ {
+ if (my_pread(handler, buffer, TRANSLOG_PAGE_SIZE, opt_offset,
+ MYF(MY_NABP)))
+ {
+ /* reading past EOF is the normal way to finish the dump */
+ if (my_errno == HA_ERR_FILE_TOO_SHORT)
+ goto end;
+ fprintf(stderr, "Can't read page at position %lld file: '%s' "
+ "errno: %d\n", opt_offset, opt_file, my_errno);
+ goto err;
+ }
+ dump_page(buffer);
+ }
+
+end:
+ my_close(handler, MYF(0));
+ free_defaults(default_argv);
+ exit(0);
+ return 0; /* No compiler warning */
+
+err:
+ /* handler may still be -1 if my_open() failed */
+ if (handler >= 0)
+ my_close(handler, MYF(0));
+ fprintf(stderr, "%s: FAILED\n", my_progname_short);
+ free_defaults(default_argv);
+ exit(1);
+}
+#endif
diff --git a/storage/maria/ma_loghandler.h b/storage/maria/ma_loghandler.h
new file mode 100644
index 00000000000..698a8ead7b6
--- /dev/null
+++ b/storage/maria/ma_loghandler.h
@@ -0,0 +1,506 @@
+/* Copyright (C) 2007 MySQL AB & Sanja Belkin
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _ma_loghandler_h
+#define _ma_loghandler_h
+
+#define MB (1024UL*1024)
+
+/* transaction log default cache size (TODO: make it global variable) */
+#define TRANSLOG_PAGECACHE_SIZE (2*MB)
+/* transaction log default file size */
+#define TRANSLOG_FILE_SIZE (1024U*MB)
+/* minimum possible transaction log size */
+#define TRANSLOG_MIN_FILE_SIZE (8*MB)
+/* transaction log default flags (TODO: make it global variable) */
+#define TRANSLOG_DEFAULT_FLAGS 0
+
+/*
+ Transaction log flags.
+
+ We allow all kinds of protection to be switched on together for people
+ who are really unsure about their hardware/OS.
+*/
+#define TRANSLOG_PAGE_CRC 1
+#define TRANSLOG_SECTOR_PROTECTION (1<<1)
+#define TRANSLOG_RECORD_CRC (1<<2)
+#define TRANSLOG_FLAGS_NUM ((TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | \
+ TRANSLOG_RECORD_CRC) + 1)
+
+#define RECHEADER_READ_ERROR -1
+#define RECHEADER_READ_EOF -2
+
+/*
+ Page size in transaction log
+ It must be a power of 2 and a multiple of DISK_DRIVE_SECTOR_SIZE
+ (DISK_DRIVE_SECTOR_SIZE * 2^N)
+*/
+#define TRANSLOG_PAGE_SIZE (8U*1024)
+
+#include "ma_loghandler_lsn.h"
+#include "trnman_public.h"
+
+/* short transaction ID type */
+typedef uint16 SHORT_TRANSACTION_ID;
+
+struct st_maria_handler;
+
+/* Changing one of the "SIZE" below will break backward-compatibility! */
+/* Size of a page number as stored in a row extent (5 bytes) */
+#define ROW_EXTENT_PAGE_SIZE 5
+#define ROW_EXTENT_COUNT_SIZE 2
+/* Size of file id in logs */
+#define FILEID_STORE_SIZE 2
+/* Size of page reference in log */
+#define PAGE_STORE_SIZE ROW_EXTENT_PAGE_SIZE
+/* Size of page ranges in log */
+#define PAGERANGE_STORE_SIZE ROW_EXTENT_COUNT_SIZE
+#define DIRPOS_STORE_SIZE 1
+#define CLR_TYPE_STORE_SIZE 1
+/* If table has live checksum we store its changes in UNDOs */
+#define HA_CHECKSUM_STORE_SIZE 4
+#define KEY_NR_STORE_SIZE 1
+#define PAGE_LENGTH_STORE_SIZE 2
+
+/* Store methods to match the above sizes */
+#define fileid_store(T,A) int2store(T,A)
+#define page_store(T,A) int5store(T,((ulonglong)(A)))
+#define dirpos_store(T,A) ((*(uchar*) (T)) = A)
+#define pagerange_store(T,A) int2store(T,A)
+#define clr_type_store(T,A) ((*(uchar*) (T)) = A)
+#define key_nr_store(T, A) ((*(uchar*) (T)) = A)
+#define ha_checksum_store(T,A) int4store(T,A)
+#define fileid_korr(P) uint2korr(P)
+#define page_korr(P) uint5korr(P)
+#define dirpos_korr(P) (*(const uchar *) (P))
+#define pagerange_korr(P) uint2korr(P)
+#define clr_type_korr(P) (*(const uchar *) (P))
+#define key_nr_korr(P) (*(const uchar *) (P))
+#define ha_checksum_korr(P) uint4korr(P)
+
+/*
+ Length of disk drive sector size (we assume that writing it
+ to disk is an atomic operation)
+*/
+#define DISK_DRIVE_SECTOR_SIZE 512U
+
+/* number of positions reserved at the start of a log record's parts array */
+#define TRANSLOG_INTERNAL_PARTS 2
+
+/* types of records in the transaction log */
+/* TODO: Set numbers for these when we have all entries figured out */
+
+enum translog_record_type
+{
+ LOGREC_RESERVED_FOR_CHUNKS23= 0,
+ LOGREC_REDO_INSERT_ROW_HEAD,
+ LOGREC_REDO_INSERT_ROW_TAIL,
+ LOGREC_REDO_NEW_ROW_HEAD,
+ LOGREC_REDO_NEW_ROW_TAIL,
+ LOGREC_REDO_INSERT_ROW_BLOBS,
+ LOGREC_REDO_PURGE_ROW_HEAD,
+ LOGREC_REDO_PURGE_ROW_TAIL,
+ LOGREC_REDO_FREE_BLOCKS,
+ LOGREC_REDO_FREE_HEAD_OR_TAIL,
+ LOGREC_REDO_DELETE_ROW, /* unused */
+ LOGREC_REDO_UPDATE_ROW_HEAD, /* unused */
+ LOGREC_REDO_INDEX,
+ LOGREC_REDO_INDEX_NEW_PAGE,
+ LOGREC_REDO_INDEX_FREE_PAGE,
+ LOGREC_REDO_UNDELETE_ROW,
+ LOGREC_CLR_END,
+ LOGREC_PURGE_END,
+ LOGREC_UNDO_ROW_INSERT,
+ LOGREC_UNDO_ROW_DELETE,
+ LOGREC_UNDO_ROW_UPDATE,
+ LOGREC_UNDO_KEY_INSERT,
+ LOGREC_UNDO_KEY_INSERT_WITH_ROOT,
+ LOGREC_UNDO_KEY_DELETE,
+ LOGREC_UNDO_KEY_DELETE_WITH_ROOT,
+ LOGREC_PREPARE,
+ LOGREC_PREPARE_WITH_UNDO_PURGE,
+ LOGREC_COMMIT,
+ LOGREC_COMMIT_WITH_UNDO_PURGE,
+ LOGREC_CHECKPOINT,
+ LOGREC_REDO_CREATE_TABLE,
+ LOGREC_REDO_RENAME_TABLE,
+ LOGREC_REDO_DROP_TABLE,
+ LOGREC_REDO_DELETE_ALL,
+ LOGREC_REDO_REPAIR_TABLE,
+ LOGREC_FILE_ID,
+ LOGREC_LONG_TRANSACTION_ID,
+ LOGREC_INCOMPLETE_LOG,
+ LOGREC_INCOMPLETE_GROUP,
+ LOGREC_UNDO_BULK_INSERT,
+ LOGREC_REDO_BITMAP_NEW_PAGE,
+ LOGREC_IMPORTED_TABLE,
+ LOGREC_DEBUG_INFO,
+ LOGREC_FIRST_FREE,
+ LOGREC_RESERVED_FUTURE_EXTENSION= 63
+};
+#define LOGREC_NUMBER_OF_TYPES 64 /* Maximum, can't be extended */
+
+/* Type of operations in LOGREC_REDO_INDEX */
+
+enum en_key_op
+{
+ KEY_OP_NONE, /* Not used */
+ KEY_OP_OFFSET, /* Set current position */
+ KEY_OP_SHIFT, /* Shift up/or down at current position */
+ KEY_OP_CHANGE, /* Change data at current position */
+ KEY_OP_ADD_PREFIX, /* Insert data at start of page */
+ KEY_OP_DEL_PREFIX, /* Delete data at start of page */
+ KEY_OP_ADD_SUFFIX, /* Insert data at end of page */
+ KEY_OP_DEL_SUFFIX, /* Delete data at end of page */
+ KEY_OP_CHECK, /* For debugging; CRC of used part of page */
+ KEY_OP_MULTI_COPY, /* List of memcpy()s with fixed-len sources in page */
+ KEY_OP_SET_PAGEFLAG, /* Set pageflag from next byte */
+ KEY_OP_COMPACT_PAGE, /* Compact key page */
+ KEY_OP_MAX_PAGELENGTH, /* Set page to max page length */
+ KEY_OP_DEBUG, /* Entry for storing what triggered redo_index */
+ KEY_OP_DEBUG_2 /* Entry for pagelengths */
+};
+
+enum en_key_debug
+{
+ KEY_OP_DEBUG_RTREE_COMBINE, /* 0 */
+ KEY_OP_DEBUG_RTREE_SPLIT, /* 1 */
+ KEY_OP_DEBUG_RTREE_SET_KEY, /* 2 */
+ KEY_OP_DEBUG_FATHER_CHANGED_1, /* 3 */
+ KEY_OP_DEBUG_FATHER_CHANGED_2, /* 4 */
+ KEY_OP_DEBUG_LOG_SPLIT, /* 5 */
+ KEY_OP_DEBUG_LOG_ADD_1, /* 6 */
+ KEY_OP_DEBUG_LOG_ADD_2, /* 7 */
+ KEY_OP_DEBUG_LOG_ADD_3, /* 8 */
+ KEY_OP_DEBUG_LOG_ADD_4, /* 9 */
+ KEY_OP_DEBUG_LOG_PREFIX_1, /* 10 */
+ KEY_OP_DEBUG_LOG_PREFIX_2, /* 11 */
+ KEY_OP_DEBUG_LOG_PREFIX_3, /* 12 */
+ KEY_OP_DEBUG_LOG_PREFIX_4, /* 13 */
+ KEY_OP_DEBUG_LOG_PREFIX_5, /* 14 */
+ KEY_OP_DEBUG_LOG_DEL_CHANGE_1, /* 15 */
+ KEY_OP_DEBUG_LOG_DEL_CHANGE_2, /* 16 */
+ KEY_OP_DEBUG_LOG_DEL_CHANGE_3, /* 17 */
+ KEY_OP_DEBUG_LOG_DEL_CHANGE_RT, /* 18 */
+ KEY_OP_DEBUG_LOG_DEL_PREFIX, /* 19 */
+ KEY_OP_DEBUG_LOG_MIDDLE /* 20 */
+};
+
+
+enum translog_debug_info_type
+{
+ LOGREC_DEBUG_INFO_QUERY
+};
+
+/* Size of log file; One log file is restricted to 4G */
+typedef uint32 translog_size_t;
+
+#define TRANSLOG_RECORD_HEADER_MAX_SIZE 1024U
+
+typedef struct st_translog_group_descriptor
+{
+ TRANSLOG_ADDRESS addr;
+ uint8 num;
+} TRANSLOG_GROUP;
+
+
+typedef struct st_translog_header_buffer
+{
+ /* LSN of the read record */
+ LSN lsn;
+ /* array of groups descriptors, can be used only if groups_no > 0 */
+ TRANSLOG_GROUP *groups;
+ /* short transaction ID or 0 if it has no sense for the record */
+ SHORT_TRANSACTION_ID short_trid;
+ /*
+ The record length in the buffer (including the read header, but
+ excluding the hidden part of the record: type, short TrID, length)
+ */
+ translog_size_t record_length;
+ /*
+ Buffer into which the decoded header of the record is written
+ (its layout depends on the record type)
+ */
+ uchar header[TRANSLOG_RECORD_HEADER_MAX_SIZE];
+ /* number of groups listed in the groups array above */
+ uint groups_no;
+ /* in multi-group number of chunk0 pages (valid only if groups_no > 0) */
+ uint chunk0_pages;
+ /* type of the read record */
+ enum translog_record_type type;
+ /* chunk 0 data address (valid only if groups_no > 0) */
+ TRANSLOG_ADDRESS chunk0_data_addr;
+ /*
+ Real compressed LSN(s) size economy (<number of LSN(s)>*7 - <real_size>)
+ */
+ int16 compressed_LSN_economy;
+ /* offset where non-header data starts in the first chunk */
+ uint16 non_header_data_start_offset;
+ /* length of the body data in this first chunk that was not read */
+ uint16 non_header_data_len;
+ /* chunk 0 data size (valid only if groups_no > 0) */
+ uint16 chunk0_data_len;
+} TRANSLOG_HEADER_BUFFER;
+
+
+typedef struct st_translog_scanner_data
+{
+ uchar buffer[TRANSLOG_PAGE_SIZE]; /* buffer for page content */
+ TRANSLOG_ADDRESS page_addr; /* current page address */
+ /* end of the log which we saw last time */
+ TRANSLOG_ADDRESS horizon;
+ TRANSLOG_ADDRESS last_file_page; /* Last page in this file */
+ uchar *page; /* page content pointer */
+ /* direct link on the current page or NULL if not supported/requested */
+ PAGECACHE_BLOCK_LINK *direct_link;
+ /* offset of the chunk in the page */
+ translog_size_t page_offset;
+ /* set horizon only once at init */
+ my_bool fixed_horizon;
+ /* try to get direct link on the page if it is possible */
+ my_bool use_direct_link;
+} TRANSLOG_SCANNER_DATA;
+
+
+typedef struct st_translog_reader_data
+{
+ TRANSLOG_HEADER_BUFFER header; /* Header */
+ TRANSLOG_SCANNER_DATA scanner; /* chunks scanner */
+ translog_size_t body_offset; /* current chunk body offset */
+ /* data offset from the record beginning */
+ translog_size_t current_offset;
+ /* number of bytes read in header */
+ uint16 read_header;
+ uint16 chunk_size; /* current chunk size */
+ uint current_group; /* current group */
+ uint current_chunk; /* current chunk in the group */
+ my_bool eor; /* end of the record */
+} TRANSLOG_READER_DATA;
+
+C_MODE_START
+
+/* Records types for unittests */
+#define LOGREC_FIXED_RECORD_0LSN_EXAMPLE 1
+#define LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE 2
+#define LOGREC_FIXED_RECORD_1LSN_EXAMPLE 3
+#define LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE 4
+#define LOGREC_FIXED_RECORD_2LSN_EXAMPLE 5
+#define LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE 6
+
+extern void translog_example_table_init();
+extern void translog_table_init();
+#define translog_init(D,M,V,I,C,F,R) \
+ translog_init_with_table(D,M,V,I,C,F,R,&translog_table_init,0)
+extern my_bool translog_init_with_table(const char *directory,
+ uint32 log_file_max_size,
+ uint32 server_version,
+ uint32 server_id,
+ PAGECACHE *pagecache,
+ uint flags,
+ my_bool readonly,
+ void (*init_table_func)(),
+ my_bool no_error);
+
+extern my_bool
+translog_write_record(LSN *lsn, enum translog_record_type type, TRN *trn,
+ MARIA_HA *tbl_info,
+ translog_size_t rec_len, uint part_no,
+ LEX_CUSTRING *parts_data, uchar *store_share_id,
+ void *hook_arg);
+
+extern void translog_destroy();
+
+extern int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff);
+
+extern void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff);
+
+extern translog_size_t translog_read_record(LSN lsn,
+ translog_size_t offset,
+ translog_size_t length,
+ uchar *buffer,
+ struct st_translog_reader_data
+ *data);
+
+extern my_bool translog_flush(TRANSLOG_ADDRESS lsn);
+
+extern my_bool translog_scanner_init(LSN lsn,
+ my_bool fixed_horizon,
+ struct st_translog_scanner_data *scanner,
+ my_bool use_direct_link);
+extern void translog_destroy_scanner(TRANSLOG_SCANNER_DATA *scanner);
+
+extern int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner,
+ TRANSLOG_HEADER_BUFFER *buff);
+extern LSN translog_get_file_max_lsn_stored(uint32 file);
+extern my_bool translog_purge(TRANSLOG_ADDRESS low);
+extern my_bool translog_is_file(uint file_no);
+extern void translog_lock();
+extern void translog_unlock();
+extern void translog_lock_handler_assert_owner();
+extern TRANSLOG_ADDRESS translog_get_horizon();
+extern TRANSLOG_ADDRESS translog_get_horizon_no_lock();
+extern int translog_assign_id_to_share(struct st_maria_handler *tbl_info,
+ TRN *trn);
+extern void translog_deassign_id_from_share(struct st_maria_share *share);
+extern void
+translog_assign_id_to_share_from_recovery(struct st_maria_share *share,
+ uint16 id);
+extern my_bool translog_walk_filenames(const char *directory,
+ my_bool (*callback)(const char *,
+ const char *));
+extern my_bool translog_log_debug_info(TRN *trn,
+ enum translog_debug_info_type type,
+ uchar *info, size_t length);
+
+enum enum_translog_status
+{
+ TRANSLOG_UNINITED, /* no initialization done or error during initialization */
+ TRANSLOG_OK, /* transaction log is functioning */
+ TRANSLOG_READONLY, /* read only mode due to write errors */
+ TRANSLOG_SHUTDOWN /* going to shutdown the loghandler */
+};
+extern enum enum_translog_status translog_status;
+extern ulonglong translog_syncs; /* Number of sync()s */
+
+void translog_soft_sync(my_bool mode);
+void translog_hard_group_commit(my_bool mode);
+int translog_soft_sync_start(void);
+void translog_soft_sync_end(void);
+void translog_sync();
+void translog_set_group_commit_interval(uint32 interval);
+
+/*
+ all the rest added because of recovery; should we make
+ ma_loghandler_for_recovery.h ?
+*/
+
+#define SHARE_ID_MAX 65535 /* array's size */
+
+extern LSN translog_first_lsn_in_log();
+extern LSN translog_first_theoretical_lsn();
+extern LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon);
+extern my_bool translog_purge_at_flush();
+extern uint32 translog_get_first_file(TRANSLOG_ADDRESS horizon);
+extern uint32 translog_get_first_needed_file();
+extern char *translog_filename_by_fileno(uint32 file_no, char *path);
+extern void translog_set_file_size(uint32 size);
+
+/* record parts descriptor */
+struct st_translog_parts
+{
+ /* full record length */
+ translog_size_t record_length;
+ /* full record length with chunk headers */
+ translog_size_t total_record_length;
+ /* current part index */
+ uint current;
+ /* total number of elements in parts */
+ uint elements;
+ /* array of parts */
+ LEX_CUSTRING *parts;
+};
+
+typedef my_bool(*prewrite_rec_hook) (enum translog_record_type type,
+ TRN *trn,
+ struct st_maria_handler *tbl_info,
+ void *hook_arg);
+
+typedef my_bool(*inwrite_rec_hook) (enum translog_record_type type,
+ TRN *trn,
+ struct st_maria_handler *tbl_info,
+ LSN *lsn, void *hook_arg);
+
+typedef uint16(*read_rec_hook) (enum translog_record_type type,
+ uint16 read_length, uchar *read_buff,
+ uchar *decoded_buff);
+
+
+/* record classes */
+enum record_class
+{
+ LOGRECTYPE_NOT_ALLOWED,
+ LOGRECTYPE_VARIABLE_LENGTH,
+ LOGRECTYPE_PSEUDOFIXEDLENGTH,
+ LOGRECTYPE_FIXEDLENGTH
+};
+
+enum enum_record_in_group {
+ LOGREC_NOT_LAST_IN_GROUP= 0, LOGREC_LAST_IN_GROUP, LOGREC_IS_GROUP_ITSELF
+};
+
+/*
+ Descriptor of log record type
+*/
+typedef struct st_log_record_type_descriptor
+{
+ /* internal class of the record */
+ enum record_class rclass;
+ /*
+ length for fixed-size record, pseudo-fixed record
+ length with uncompressed LSNs
+ */
+ uint16 fixed_length;
+ /* how much record body (belonged to headers too) read with headers */
+ uint16 read_header_len;
+ /* HOOK for writing the record called before lock */
+ prewrite_rec_hook prewrite_hook;
+ /* HOOK for writing the record called when LSN is known, inside lock */
+ inwrite_rec_hook inwrite_hook;
+ /* HOOK for reading headers */
+ read_rec_hook read_hook;
+ /*
+ For pseudo fixed records number of compressed LSNs followed by
+ system header
+ */
+ int16 compressed_LSN;
+ /* the rest is for maria_read_log & Recovery */
+ /** @brief for debug error messages or "maria_read_log" command-line tool */
+ const char *name;
+ enum enum_record_in_group record_in_group;
+ /* a function to execute when we see the record during the REDO phase */
+ int (*record_execute_in_redo_phase)(const TRANSLOG_HEADER_BUFFER *);
+ /* a function to execute when we see the record during the UNDO phase */
+ int (*record_execute_in_undo_phase)(const TRANSLOG_HEADER_BUFFER *, TRN *);
+} LOG_DESC;
+
+extern LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES];
+
+typedef enum
+{
+ TRANSLOG_GCOMMIT_NONE,
+ TRANSLOG_GCOMMIT_HARD,
+ TRANSLOG_GCOMMIT_SOFT
+} enum_maria_group_commit;
+extern ulong maria_group_commit;
+extern ulong maria_group_commit_interval;
+typedef enum
+{
+ TRANSLOG_PURGE_IMMIDIATE,
+ TRANSLOG_PURGE_EXTERNAL,
+ TRANSLOG_PURGE_ONDEMAND
+} enum_maria_translog_purge_type;
+extern ulong log_purge_type;
+extern ulong log_file_size;
+
+typedef enum
+{
+ TRANSLOG_SYNC_DIR_NEVER,
+ TRANSLOG_SYNC_DIR_NEWFILE,
+ TRANSLOG_SYNC_DIR_ALWAYS
+} enum_maria_sync_log_dir;
+extern ulong sync_log_dir;
+
+C_MODE_END
+#endif
diff --git a/storage/maria/ma_loghandler_lsn.h b/storage/maria/ma_loghandler_lsn.h
new file mode 100644
index 00000000000..7fa53bc0a50
--- /dev/null
+++ b/storage/maria/ma_loghandler_lsn.h
@@ -0,0 +1,111 @@
+/* Copyright (C) 2007 MySQL AB & Sanja Belkin
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _ma_loghandler_lsn_h
+#define _ma_loghandler_lsn_h
+
+/*
+ Transaction log record address:
+ file_no << 32 | offset
+ file_no is only 3 bytes so we can use a signed integer to make
+ comparison simpler.
+*/
+typedef int64 TRANSLOG_ADDRESS;
+
+/*
+ Compare addresses
+ A1 > A2 -> result > 0
+ A1 == A2 -> 0
+ A1 < A2 -> result < 0
+*/
+#define cmp_translog_addr(A1,A2) ((A1) - (A2))
+
+/*
+ TRANSLOG_ADDRESS is just address of some byte in the log (usually some
+ chunk)
+ LSN used where address of some record in the log needed (not just any
+ address)
+*/
+typedef TRANSLOG_ADDRESS LSN;
+
+/* Gets file number part of a LSN/log address */
+#define LSN_FILE_NO(L) (uint32) ((L) >> 32)
+
+/* Gets raw file number part of a LSN/log address */
+#define LSN_FILE_NO_PART(L) ((L) & ((int64)0xFFFFFF00000000LL))
+
+/* Parts of LSN for printing */
+#define LSN_IN_PARTS(L) (ulong)LSN_FILE_NO(L),(ulong)LSN_OFFSET(L)
+
+/* Gets record offset of a LSN/log address */
+#define LSN_OFFSET(L) (ulong) ((L) & 0xFFFFFFFFL)
+
+/* Makes lsn/log address from file number and record offset */
+#define MAKE_LSN(F,S) ((LSN) ((((uint64)(F)) << 32) | (S)))
+
+/* checks LSN */
+#define LSN_VALID(L) \
+ ((LSN_FILE_NO_PART(L) != FILENO_IMPOSSIBLE) && \
+ (LSN_OFFSET(L) != LOG_OFFSET_IMPOSSIBLE))
+
+/* size of stored LSN on a disk, don't change it! */
+#define LSN_STORE_SIZE 7
+
+/* Puts LSN into buffer (dst) */
+#define lsn_store(dst, lsn) \
+ do { \
+ int3store((dst), LSN_FILE_NO(lsn)); \
+ int4store((char*)(dst) + 3, LSN_OFFSET(lsn)); \
+ } while (0)
+
+/* Unpacks LSN from the buffer (P) */
+#define lsn_korr(P) MAKE_LSN(uint3korr(P), uint4korr((const char*)(P) + 3))
+
+/* what we need to add to an LSN to advance it by one file */
+#define LSN_ONE_FILE ((int64)0x100000000LL)
+
+#define LSN_REPLACE_OFFSET(L, S) (LSN_FILE_NO_PART(L) | (S))
+
+/*
+ an 8-byte type whose most significant uchar is used for "flags"; 7
+ other bytes are a LSN.
+*/
+typedef LSN LSN_WITH_FLAGS;
+#define LSN_WITH_FLAGS_TO_LSN(x) (x & ULL(0x00FFFFFFFFFFFFFF))
+#define LSN_WITH_FLAGS_TO_FLAGS(x) (x & ULL(0xFF00000000000000))
+
+#define FILENO_IMPOSSIBLE 0 /**< log file's numbering starts at 1 */
+#define LOG_OFFSET_IMPOSSIBLE 0 /**< log always has a header */
+#define LSN_IMPOSSIBLE ((LSN)0)
+/* following LSN also is impossible */
+#define LSN_ERROR ((LSN)1)
+
+/** @brief some impossible LSN serve as markers */
+
+/**
+ When table is modified by maria_chk, or auto-zerofilled, old REDOs don't
+ apply, table is freshly born again somehow: its state's LSNs need to be
+ updated to the new instance which receives this table.
+*/
+#define LSN_NEEDS_NEW_STATE_LSNS ((LSN)2)
+
+/**
+ @brief the maximum valid LSN.
+ Unlike ULONGLONG_MAX, it can be safely used in comparison with valid LSNs
+ (ULONGLONG_MAX is too big for correctness of cmp_translog_addr()).
+*/
+#define LSN_MAX (LSN)ULL(0x00FFFFFFFFFFFFFF)
+
+#endif
diff --git a/storage/maria/ma_open.c b/storage/maria/ma_open.c
new file mode 100644
index 00000000000..63e1801a39a
--- /dev/null
+++ b/storage/maria/ma_open.c
@@ -0,0 +1,1945 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Open an ISAM database */
+
+#include "ma_fulltext.h"
+#include "ma_sp_defs.h"
+#include "ma_rt_index.h"
+#include "ma_blockrec.h"
+#include <m_ctype.h>
+
+#if defined(MSDOS) || defined(__WIN__)
+#ifdef __WIN__
+#include <fcntl.h>
+#else
+#include <process.h> /* Prototype for getpid */
+#endif
+#endif
+
+static void setup_key_functions(MARIA_KEYDEF *keyinfo);
+static my_bool maria_scan_init_dummy(MARIA_HA *info);
+static void maria_scan_end_dummy(MARIA_HA *info);
+static my_bool maria_once_init_dummy(MARIA_SHARE *, File);
+static my_bool maria_once_end_dummy(MARIA_SHARE *);
+static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base);
+static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state);
+
+#define get_next_element(to,pos,size) { memcpy((char*) to,pos,(size_t) size); \
+ pos+=size;}
+
+
+#define disk_pos_assert(pos, end_pos) \
+if (pos > end_pos) \
+{ \
+ my_errno=HA_ERR_CRASHED; \
+ goto err; \
+}
+
+
+/******************************************************************************
+** Return the shared struct if the table is already open.
+** In MySQL the server will handle version issues.
+******************************************************************************/
+
+MARIA_HA *_ma_test_if_reopen(const char *filename)
+{
+ LIST *pos;
+
+ for (pos=maria_open_list ; pos ; pos=pos->next)
+ {
+ MARIA_HA *info=(MARIA_HA*) pos->data;
+ MARIA_SHARE *share= info->s;
+ if (!strcmp(share->unique_file_name.str,filename) && share->last_version)
+ return info;
+ }
+ return 0;
+}
+
+
+/*
+ Open a new instance of an already opened Maria table
+
+ SYNOPSIS
+ maria_clone_internal()
+ share Share of already open table
+ mode Mode of table (O_RDONLY | O_RDWR)
+ data_file File descriptor of the data file to use; < 0 if this
+ function should open it.
+
+ RETURN
+ # Maria handler
+ 0 Error
+*/
+
+
+static MARIA_HA *maria_clone_internal(MARIA_SHARE *share, const char *name,
+ int mode, File data_file)
+{
+ int save_errno;
+ uint errpos;
+ MARIA_HA info,*m_info;
+ my_bitmap_map *changed_fields_bitmap;
+ DBUG_ENTER("maria_clone_internal");
+
+ errpos= 0;
+ bzero((uchar*) &info,sizeof(info));
+
+ if (mode == O_RDWR && share->mode == O_RDONLY)
+ {
+ my_errno=EACCES; /* Can't open in write mode */
+ goto err;
+ }
+ if (data_file >= 0)
+ info.dfile.file= data_file;
+ else if (_ma_open_datafile(&info, share, name, -1))
+ goto err;
+ errpos= 5;
+
+ /* alloc and set up private structure parts */
+ if (!my_multi_malloc(MY_WME,
+ &m_info,sizeof(MARIA_HA),
+ &info.blobs,sizeof(MARIA_BLOB)*share->base.blobs,
+ &info.buff,(share->base.max_key_block_length*2+
+ share->base.max_key_length),
+ &info.lastkey_buff,share->base.max_key_length*2+1,
+ &info.first_mbr_key, share->base.max_key_length,
+ &info.maria_rtree_recursion_state,
+ share->have_rtree ? 1024 : 0,
+ &changed_fields_bitmap,
+ bitmap_buffer_size(share->base.fields),
+ NullS))
+ goto err;
+ errpos= 6;
+
+ memcpy(info.blobs,share->blobs,sizeof(MARIA_BLOB)*share->base.blobs);
+ info.lastkey_buff2= info.lastkey_buff + share->base.max_key_length;
+ info.last_key.data= info.lastkey_buff;
+
+ info.s=share;
+ info.cur_row.lastpos= HA_OFFSET_ERROR;
+ info.update= (short) (HA_STATE_NEXT_FOUND+HA_STATE_PREV_FOUND);
+ info.opt_flag=READ_CHECK_USED;
+ info.this_unique= (ulong) info.dfile.file; /* Uniq number in process */
+#ifdef EXTERNAL_LOCKING
+ if (share->data_file_type == COMPRESSED_RECORD)
+ info.this_unique= share->state.unique;
+ info.this_loop=0; /* Update counter */
+ info.last_unique= share->state.unique;
+ info.last_loop= share->state.update_count;
+#endif
+ info.errkey= -1;
+ info.page_changed=1;
+ info.keyread_buff= info.buff + share->base.max_key_block_length;
+
+ info.lock_type= F_UNLCK;
+ if (share->options & HA_OPTION_TMP_TABLE)
+ info.lock_type= F_WRLCK;
+
+ _ma_set_data_pagecache_callbacks(&info.dfile, share);
+ bitmap_init(&info.changed_fields, changed_fields_bitmap,
+ share->base.fields, 0);
+ if ((*share->init)(&info))
+ goto err;
+
+ /* The following should be big enough for all pinning purposes */
+ if (my_init_dynamic_array(&info.pinned_pages,
+ sizeof(MARIA_PINNED_PAGE),
+ max(share->base.blobs*2 + 4,
+ MARIA_MAX_TREE_LEVELS*3), 16))
+ goto err;
+
+
+ pthread_mutex_lock(&share->intern_lock);
+ info.read_record= share->read_record;
+ share->reopen++;
+ share->write_flag=MYF(MY_NABP | MY_WAIT_IF_FULL);
+ if (share->options & HA_OPTION_READ_ONLY_DATA)
+ {
+ info.lock_type=F_RDLCK;
+ share->r_locks++;
+ share->tot_locks++;
+ }
+ if ((share->options & HA_OPTION_DELAY_KEY_WRITE) &&
+ maria_delay_key_write)
+ share->delay_key_write=1;
+
+ if (!share->base.born_transactional) /* For transactional ones ... */
+ {
+ /* ... force crash if no trn given */
+ _ma_set_trn_for_table(&info, &dummy_transaction_object);
+ info.state= &share->state.state; /* Change global values by default */
+ }
+ else
+ {
+ info.state= &share->state.common;
+ *info.state= share->state.state; /* Initial values */
+ }
+ info.state_start= info.state; /* Initial values */
+
+ pthread_mutex_unlock(&share->intern_lock);
+
+ /* Allocate buffer for one record */
+ /* prerequisites: info->rec_buffer == 0 && info->rec_buff_size == 0 */
+ if (_ma_alloc_buffer(&info.rec_buff, &info.rec_buff_size,
+ share->base.default_rec_buff_size))
+ goto err;
+
+ bzero(info.rec_buff, share->base.default_rec_buff_size);
+
+ *m_info=info;
+#ifdef THREAD
+ thr_lock_data_init(&share->lock,&m_info->lock,(void*) m_info);
+#endif
+ m_info->open_list.data=(void*) m_info;
+ maria_open_list=list_add(maria_open_list,&m_info->open_list);
+
+ DBUG_RETURN(m_info);
+
+err:
+ DBUG_PRINT("error", ("error: %d", my_errno));
+ save_errno=my_errno ? my_errno : HA_ERR_END_OF_FILE;
+ if ((save_errno == HA_ERR_CRASHED) ||
+ (save_errno == HA_ERR_CRASHED_ON_USAGE) ||
+ (save_errno == HA_ERR_CRASHED_ON_REPAIR))
+ _ma_report_error(save_errno, &share->open_file_name);
+ switch (errpos) {
+ case 6:
+ (*share->end)(&info);
+ delete_dynamic(&info.pinned_pages);
+ my_free(m_info, MYF(0));
+ /* fall through */
+ case 5:
+ if (data_file < 0)
+ VOID(my_close(info.dfile.file, MYF(0)));
+ break;
+ }
+ my_errno=save_errno;
+ DBUG_RETURN (NULL);
+} /* maria_clone_internal */
+
+
+/* Make a clone of a maria table */
+
+MARIA_HA *maria_clone(MARIA_SHARE *share, int mode)
+{
+ MARIA_HA *new_info;
+ pthread_mutex_lock(&THR_LOCK_maria);
+ new_info= maria_clone_internal(share, NullS, mode,
+ share->data_file_type == BLOCK_RECORD ?
+ share->bitmap.file.file : -1);
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ return new_info;
+}
+
+
+/******************************************************************************
+ open a MARIA table
+
+ See my_base.h for the handle_locking argument
+ if handle_locking and HA_OPEN_ABORT_IF_CRASHED then abort if the table
+ is marked crashed or if we are not using locking and the table doesn't
+ have an open count of 0.
+******************************************************************************/
+
+MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
+{
+ int kfile,open_mode,save_errno;
+ uint i,j,len,errpos,head_length,base_pos,keys, realpath_err,
+ key_parts,unique_key_parts,fulltext_keys,uniques;
+ size_t info_length;
+ char name_buff[FN_REFLEN], org_name[FN_REFLEN], index_name[FN_REFLEN],
+ data_name[FN_REFLEN];
+ uchar *disk_cache, *disk_pos, *end_pos;
+ MARIA_HA info,*m_info,*old_info;
+ MARIA_SHARE share_buff,*share;
+ double *rec_per_key_part;
+ ulong *nulls_per_key_part;
+ my_off_t key_root[HA_MAX_POSSIBLE_KEY];
+ ulonglong max_key_file_length, max_data_file_length;
+ my_bool versioning= 1;
+ File data_file= -1;
+ DBUG_ENTER("maria_open");
+
+ LINT_INIT(m_info);
+ kfile= -1;
+ errpos= 0;
+ head_length=sizeof(share_buff.state.header);
+ bzero((uchar*) &info,sizeof(info));
+
+ realpath_err= my_realpath(name_buff, fn_format(org_name, name, "",
+ MARIA_NAME_IEXT,
+ MY_UNPACK_FILENAME),MYF(0));
+ if (my_is_symlink(org_name) &&
+ (realpath_err || (*maria_test_invalid_symlink)(name_buff)))
+ {
+ my_errno= HA_WRONG_CREATE_OPTION;
+ DBUG_RETURN(0);
+ }
+
+ pthread_mutex_lock(&THR_LOCK_maria);
+ old_info= 0;
+ if ((open_flags & HA_OPEN_COPY) ||
+ !(old_info=_ma_test_if_reopen(name_buff)))
+ {
+ share= &share_buff;
+ bzero((uchar*) &share_buff,sizeof(share_buff));
+ share_buff.state.key_root=key_root;
+ share_buff.pagecache= multi_pagecache_search((uchar*) name_buff,
+ (uint) strlen(name_buff),
+ maria_pagecache);
+
+ DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_open",
+ if (strstr(name, "/t1"))
+ {
+ my_errno= HA_ERR_CRASHED;
+ goto err;
+ });
+ if ((kfile=my_open(name_buff,(open_mode=O_RDWR) | O_SHARE,MYF(0))) < 0)
+ {
+ if ((errno != EROFS && errno != EACCES) ||
+ mode != O_RDONLY ||
+ (kfile=my_open(name_buff,(open_mode=O_RDONLY) | O_SHARE,MYF(0))) < 0)
+ goto err;
+ }
+ share->mode=open_mode;
+ errpos= 1;
+ if (my_pread(kfile,share->state.header.file_version, head_length, 0,
+ MYF(MY_NABP)))
+ {
+ my_errno= HA_ERR_NOT_A_TABLE;
+ goto err;
+ }
+ if (memcmp(share->state.header.file_version, maria_file_magic, 4))
+ {
+ DBUG_PRINT("error",("Wrong header in %s",name_buff));
+ DBUG_DUMP("error_dump", share->state.header.file_version,
+ head_length);
+ my_errno=HA_ERR_NOT_A_TABLE;
+ goto err;
+ }
+ share->options= mi_uint2korr(share->state.header.options);
+ if (share->options &
+ ~(HA_OPTION_PACK_RECORD | HA_OPTION_PACK_KEYS |
+ HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA |
+ HA_OPTION_TEMP_COMPRESS_RECORD | HA_OPTION_CHECKSUM |
+ HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE |
+ HA_OPTION_RELIES_ON_SQL_LAYER | HA_OPTION_NULL_FIELDS |
+ HA_OPTION_PAGE_CHECKSUM))
+ {
+ DBUG_PRINT("error",("wrong options: 0x%lx", share->options));
+ my_errno=HA_ERR_NEW_FILE;
+ goto err;
+ }
+ if ((share->options & HA_OPTION_RELIES_ON_SQL_LAYER) &&
+ ! (open_flags & HA_OPEN_FROM_SQL_LAYER))
+ {
+ DBUG_PRINT("error", ("table cannot be opened from non-sql layer"));
+ my_errno= HA_ERR_UNSUPPORTED;
+ goto err;
+ }
+ /* Don't call realpath() if the name can't be a link */
+ if (!strcmp(name_buff, org_name) ||
+ my_readlink(index_name, org_name, MYF(0)) == -1)
+ (void) strmov(index_name, org_name);
+ *strrchr(org_name, FN_EXTCHAR)= '\0';
+ (void) fn_format(data_name,org_name,"",MARIA_NAME_DEXT,
+ MY_APPEND_EXT|MY_UNPACK_FILENAME|MY_RESOLVE_SYMLINKS);
+
+ info_length=mi_uint2korr(share->state.header.header_length);
+ base_pos= mi_uint2korr(share->state.header.base_pos);
+
+ /*
+ Allocate space for header information and for data that is too
+ big to keep on stack
+ */
+ if (!my_multi_malloc(MY_WME,
+ &disk_cache, info_length+128,
+ &rec_per_key_part,
+ (sizeof(*rec_per_key_part) * HA_MAX_POSSIBLE_KEY *
+ HA_MAX_KEY_SEG),
+ &nulls_per_key_part,
+ (sizeof(*nulls_per_key_part) * HA_MAX_POSSIBLE_KEY *
+ HA_MAX_KEY_SEG),
+ NullS))
+ {
+ my_errno=ENOMEM;
+ goto err;
+ }
+ share_buff.state.rec_per_key_part= rec_per_key_part;
+ share_buff.state.nulls_per_key_part= nulls_per_key_part;
+
+ end_pos=disk_cache+info_length;
+ errpos= 3;
+ if (my_pread(kfile, disk_cache, info_length, 0L, MYF(MY_NABP)))
+ {
+ my_errno=HA_ERR_CRASHED;
+ goto err;
+ }
+ len=mi_uint2korr(share->state.header.state_info_length);
+ keys= (uint) share->state.header.keys;
+ uniques= (uint) share->state.header.uniques;
+ fulltext_keys= (uint) share->state.header.fulltext_keys;
+ key_parts= mi_uint2korr(share->state.header.key_parts);
+ unique_key_parts= mi_uint2korr(share->state.header.unique_key_parts);
+ if (len != MARIA_STATE_INFO_SIZE)
+ {
+ DBUG_PRINT("warning",
+ ("saved_state_info_length: %d state_info_length: %d",
+ len,MARIA_STATE_INFO_SIZE));
+ }
+ share->state_diff_length=len-MARIA_STATE_INFO_SIZE;
+
+ _ma_state_info_read(disk_cache, &share->state);
+ len= mi_uint2korr(share->state.header.base_info_length);
+ if (len != MARIA_BASE_INFO_SIZE)
+ {
+ DBUG_PRINT("warning",("saved_base_info_length: %d base_info_length: %d",
+ len,MARIA_BASE_INFO_SIZE));
+ }
+ disk_pos= _ma_base_info_read(disk_cache + base_pos, &share->base);
+ share->state.state_length=base_pos;
+
+ if (!(open_flags & HA_OPEN_FOR_REPAIR) &&
+ ((share->state.changed & STATE_CRASHED) ||
+ ((open_flags & HA_OPEN_ABORT_IF_CRASHED) &&
+ (my_disable_locking && share->state.open_count))))
+ {
+ DBUG_PRINT("error",("Table is marked as crashed. open_flags: %u "
+ "changed: %u open_count: %u !locking: %d",
+ open_flags, share->state.changed,
+ share->state.open_count, my_disable_locking));
+ my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ?
+ HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE);
+ goto err;
+ }
+
+ /*
+ We can ignore testing uuid if STATE_NOT_MOVABLE is set, as in this
+ case the uuid will be set in _ma_mark_file_changed()
+ */
+ if ((share->state.changed & STATE_NOT_MOVABLE) &&
+ share->base.born_transactional &&
+ ((!(open_flags & HA_OPEN_IGNORE_MOVED_STATE) &&
+ memcmp(share->base.uuid, maria_uuid, MY_UUID_SIZE)) ||
+ (share->state.create_trid > trnman_get_max_trid() &&
+ !maria_in_recovery)))
+ {
+ DBUG_PRINT("warning", ("table is moved from another system. uuid_diff: %d create_trid: %lu max_trid: %lu",
+ memcmp(share->base.uuid, maria_uuid,
+ MY_UUID_SIZE) != 0,
+ (ulong) share->state.create_trid,
+ (ulong) trnman_get_max_trid()));
+ if (open_flags & HA_OPEN_FOR_REPAIR)
+ share->state.changed|= STATE_MOVED;
+ else
+ {
+ my_errno= HA_ERR_OLD_FILE;
+ goto err;
+ }
+ }
+
+ /* sanity check */
+ if (share->base.keystart > 65535 || share->base.rec_reflength > 8)
+ {
+ my_errno=HA_ERR_CRASHED;
+ goto err;
+ }
+
+ key_parts+=fulltext_keys*FT_SEGS;
+ if (share->base.max_key_length > maria_max_key_length() ||
+ keys > MARIA_MAX_KEY || key_parts > MARIA_MAX_KEY * HA_MAX_KEY_SEG)
+ {
+ DBUG_PRINT("error",("Wrong key info: Max_key_length: %d keys: %d key_parts: %d", share->base.max_key_length, keys, key_parts));
+ my_errno=HA_ERR_UNSUPPORTED;
+ goto err;
+ }
+
+ /* Ensure we have space in the key buffer for transaction id's */
+ if (share->base.born_transactional)
+ share->base.max_key_length= ALIGN_SIZE(share->base.max_key_length +
+ MARIA_MAX_PACK_TRANSID_SIZE);
+
+ /*
+ If page cache is not initialized, then assume we will create the
+ page_cache after the table is opened!
+ This is only used by maria_check to allow it to check/repair tables
+ with different block sizes.
+ */
+ if (share->base.block_size != maria_block_size &&
+ share_buff.pagecache->inited != 0)
+ {
+ DBUG_PRINT("error", ("Wrong block size %u; Expected %u",
+ (uint) share->base.block_size,
+ (uint) maria_block_size));
+ my_errno=HA_ERR_UNSUPPORTED;
+ goto err;
+ }
+
+ /* Correct max_file_length based on length of sizeof(off_t) */
+ max_data_file_length=
+ (share->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) ?
+ (((ulonglong) 1 << (share->base.rec_reflength*8))-1) :
+ (_ma_safe_mul(share->base.pack_reclength,
+ (ulonglong) 1 << (share->base.rec_reflength*8))-1);
+
+ max_key_file_length=
+ _ma_safe_mul(maria_block_size,
+ ((ulonglong) 1 << (share->base.key_reflength*8))-1);
+#if SIZEOF_OFF_T == 4
+ set_if_smaller(max_data_file_length, INT_MAX32);
+ set_if_smaller(max_key_file_length, INT_MAX32);
+#endif
+ share->base.max_data_file_length=(my_off_t) max_data_file_length;
+ share->base.max_key_file_length=(my_off_t) max_key_file_length;
+
+ if (share->options & HA_OPTION_COMPRESS_RECORD)
+ share->base.max_key_length+=2; /* For safety */
+ /* Add space for node pointer */
+ share->base.max_key_length+= share->base.key_reflength;
+
+ share->unique_file_name.length= strlen(name_buff);
+ share->index_file_name.length= strlen(index_name);
+ share->data_file_name.length= strlen(data_name);
+ share->open_file_name.length= strlen(name);
+ if (!my_multi_malloc(MY_WME,
+ &share,sizeof(*share),
+ &share->state.rec_per_key_part,
+ sizeof(double) * key_parts,
+ &share->state.nulls_per_key_part,
+ sizeof(long)* key_parts,
+ &share->keyinfo,keys*sizeof(MARIA_KEYDEF),
+ &share->uniqueinfo,uniques*sizeof(MARIA_UNIQUEDEF),
+ &share->keyparts,
+ (key_parts+unique_key_parts+keys+uniques) *
+ sizeof(HA_KEYSEG),
+ &share->columndef,
+ (share->base.fields+1)*sizeof(MARIA_COLUMNDEF),
+ &share->column_nr, share->base.fields*sizeof(uint16),
+ &share->blobs,sizeof(MARIA_BLOB)*share->base.blobs,
+ &share->unique_file_name.str,
+ share->unique_file_name.length+1,
+ &share->index_file_name.str,
+ share->index_file_name.length+1,
+ &share->data_file_name.str,
+ share->data_file_name.length+1,
+ &share->open_file_name.str,
+ share->open_file_name.length+1,
+ &share->state.key_root,keys*sizeof(my_off_t),
+ &share->mmap_lock,sizeof(rw_lock_t),
+ NullS))
+ goto err;
+ errpos= 4;
+
+ *share=share_buff;
+ memcpy((char*) share->state.rec_per_key_part,
+ (char*) rec_per_key_part, sizeof(double)*key_parts);
+ memcpy((char*) share->state.nulls_per_key_part,
+ (char*) nulls_per_key_part, sizeof(long)*key_parts);
+ memcpy((char*) share->state.key_root,
+ (char*) key_root, sizeof(my_off_t)*keys);
+ strmov(share->unique_file_name.str, name_buff);
+ strmov(share->index_file_name.str, index_name);
+ strmov(share->data_file_name.str, data_name);
+ strmov(share->open_file_name.str, name);
+
+ share->block_size= share->base.block_size; /* Convenience */
+ share->max_index_block_size= share->block_size - KEYPAGE_CHECKSUM_SIZE;
+ {
+ HA_KEYSEG *pos=share->keyparts;
+ uint32 ftkey_nr= 1;
+ for (i=0 ; i < keys ; i++)
+ {
+ share->keyinfo[i].share= share;
+ disk_pos=_ma_keydef_read(disk_pos, &share->keyinfo[i]);
+ share->keyinfo[i].key_nr= i;
+ disk_pos_assert(disk_pos + share->keyinfo[i].keysegs * HA_KEYSEG_SIZE,
+ end_pos);
+ if (share->keyinfo[i].key_alg == HA_KEY_ALG_RTREE)
+ share->have_rtree= 1;
+ share->keyinfo[i].seg=pos;
+ for (j=0 ; j < share->keyinfo[i].keysegs; j++,pos++)
+ {
+ disk_pos=_ma_keyseg_read(disk_pos, pos);
+ if (pos->type == HA_KEYTYPE_TEXT ||
+ pos->type == HA_KEYTYPE_VARTEXT1 ||
+ pos->type == HA_KEYTYPE_VARTEXT2)
+ {
+ if (!pos->language)
+ pos->charset=default_charset_info;
+ else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME))))
+ {
+ my_errno=HA_ERR_UNKNOWN_CHARSET;
+ goto err;
+ }
+ }
+ else if (pos->type == HA_KEYTYPE_BINARY)
+ pos->charset= &my_charset_bin;
+ }
+ if (share->keyinfo[i].flag & HA_SPATIAL)
+ {
+#ifdef HAVE_SPATIAL
+ uint sp_segs=SPDIMS*2;
+ share->keyinfo[i].seg=pos-sp_segs;
+ share->keyinfo[i].keysegs--;
+ versioning= 0;
+#else
+ my_errno=HA_ERR_UNSUPPORTED;
+ goto err;
+#endif
+ }
+ else if (share->keyinfo[i].flag & HA_FULLTEXT)
+ {
+ versioning= 0;
+ DBUG_ASSERT(fulltext_keys);
+ {
+ uint k;
+ share->keyinfo[i].seg=pos;
+ for (k=0; k < FT_SEGS; k++)
+ {
+ *pos= ft_keysegs[k];
+ pos[0].language= pos[-1].language;
+ if (!(pos[0].charset= pos[-1].charset))
+ {
+ my_errno=HA_ERR_CRASHED;
+ goto err;
+ }
+ pos++;
+ }
+ }
+ if (!share->ft2_keyinfo.seg)
+ {
+ memcpy(&share->ft2_keyinfo, &share->keyinfo[i],
+ sizeof(MARIA_KEYDEF));
+ share->ft2_keyinfo.keysegs=1;
+ share->ft2_keyinfo.flag=0;
+ share->ft2_keyinfo.keylength=
+ share->ft2_keyinfo.minlength=
+ share->ft2_keyinfo.maxlength=HA_FT_WLEN+share->base.rec_reflength;
+ share->ft2_keyinfo.seg=pos-1;
+ share->ft2_keyinfo.end=pos;
+ setup_key_functions(& share->ft2_keyinfo);
+ }
+ share->keyinfo[i].ftkey_nr= ftkey_nr++;
+ }
+ setup_key_functions(share->keyinfo+i);
+ share->keyinfo[i].end=pos;
+ pos->type=HA_KEYTYPE_END; /* End */
+ pos->length=share->base.rec_reflength;
+ pos->null_bit=0;
+ pos->flag=0; /* For purify */
+ pos++;
+ }
+ for (i=0 ; i < uniques ; i++)
+ {
+ disk_pos=_ma_uniquedef_read(disk_pos, &share->uniqueinfo[i]);
+ disk_pos_assert(disk_pos + share->uniqueinfo[i].keysegs *
+ HA_KEYSEG_SIZE, end_pos);
+ share->uniqueinfo[i].seg=pos;
+ for (j=0 ; j < share->uniqueinfo[i].keysegs; j++,pos++)
+ {
+ disk_pos=_ma_keyseg_read(disk_pos, pos);
+ if (pos->type == HA_KEYTYPE_TEXT ||
+ pos->type == HA_KEYTYPE_VARTEXT1 ||
+ pos->type == HA_KEYTYPE_VARTEXT2)
+ {
+ if (!pos->language)
+ pos->charset=default_charset_info;
+ else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME))))
+ {
+ my_errno=HA_ERR_UNKNOWN_CHARSET;
+ goto err;
+ }
+ }
+ }
+ share->uniqueinfo[i].end=pos;
+ pos->type=HA_KEYTYPE_END; /* End */
+ pos->null_bit=0;
+ pos->flag=0;
+ pos++;
+ }
+ share->ftkeys= ftkey_nr;
+ }
+ share->data_file_type= share->state.header.data_file_type;
+ share->base_length= (BASE_ROW_HEADER_SIZE +
+ share->base.is_nulls_extended +
+ share->base.null_bytes +
+ share->base.pack_bytes +
+ test(share->options & HA_OPTION_CHECKSUM));
+ share->keypage_header= ((share->base.born_transactional ?
+ LSN_STORE_SIZE + TRANSID_SIZE :
+ 0) + KEYPAGE_KEYID_SIZE + KEYPAGE_FLAG_SIZE +
+ KEYPAGE_USED_SIZE);
+ share->kfile.file= kfile;
+
+ if (open_flags & HA_OPEN_COPY)
+ {
+ /*
+ this instance will be a temporary one used just to create a data
+ file for REPAIR. Don't do logging. This base information will not go
+ to disk.
+ */
+ share->base.born_transactional= FALSE;
+ }
+ if (share->base.born_transactional)
+ {
+ share->page_type= PAGECACHE_LSN_PAGE;
+ if (share->state.create_rename_lsn == LSN_NEEDS_NEW_STATE_LSNS)
+ {
+ /*
+ Was repaired with maria_chk, maybe later maria_pack-ed. Some sort of
+ import into the server. It starts its existence (from the point of
+ view of the server, including server's recovery) now.
+ */
+ if (((open_flags & HA_OPEN_FROM_SQL_LAYER) &&
+ (share->state.changed & STATE_NOT_MOVABLE)) || maria_in_recovery)
+ _ma_update_state_lsns_sub(share, LSN_IMPOSSIBLE,
+ trnman_get_min_safe_trid(), TRUE, TRUE);
+ }
+ else if ((!LSN_VALID(share->state.create_rename_lsn) ||
+ !LSN_VALID(share->state.is_of_horizon) ||
+ (cmp_translog_addr(share->state.create_rename_lsn,
+ share->state.is_of_horizon) > 0) ||
+ !LSN_VALID(share->state.skip_redo_lsn) ||
+ (cmp_translog_addr(share->state.create_rename_lsn,
+ share->state.skip_redo_lsn) > 0)) &&
+ !(open_flags & HA_OPEN_FOR_REPAIR))
+ {
+ /*
+ If in Recovery, it will not work. If LSN is invalid and not
+ LSN_NEEDS_NEW_STATE_LSNS, header must be corrupted.
+ In both cases, must repair.
+ */
+ my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ?
+ HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE);
+ goto err;
+ }
+ }
+ else
+ share->page_type= PAGECACHE_PLAIN_PAGE;
+ share->now_transactional= share->base.born_transactional;
+
  /* Use pack_reclength as we don't want to modify base.pack_reclength */
+ if (share->state.header.org_data_file_type == DYNAMIC_RECORD)
+ {
+ /* add bits used to pack data to pack_reclength for faster allocation */
+ share->base.pack_reclength+= share->base.pack_bytes;
+ share->base.extra_rec_buff_size=
+ (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER) + MARIA_SPLIT_LENGTH +
+ MARIA_REC_BUFF_OFFSET);
+ }
+ if (share->data_file_type == COMPRESSED_RECORD)
+ {
+ /* Need some extra bytes for decode_bytes */
+ share->base.extra_rec_buff_size+= 7;
+ }
+ share->base.default_rec_buff_size= max(share->base.pack_reclength +
+ share->base.extra_rec_buff_size,
+ share->base.max_key_length);
+
+ disk_pos_assert(disk_pos + share->base.fields *MARIA_COLUMNDEF_SIZE,
+ end_pos);
+ for (i= j= 0 ; i < share->base.fields ; i++)
+ {
+ disk_pos=_ma_columndef_read(disk_pos,&share->columndef[i]);
+ share->columndef[i].pack_type=0;
+ share->columndef[i].huff_tree=0;
+ if (share->columndef[i].type == FIELD_BLOB)
+ {
+ share->blobs[j].pack_length=
+ share->columndef[i].length-portable_sizeof_char_ptr;
+ share->blobs[j].offset= share->columndef[i].offset;
+ j++;
+ }
+ }
+ share->columndef[i].type= FIELD_LAST; /* End marker */
+ disk_pos= _ma_column_nr_read(disk_pos, share->column_nr,
+ share->base.fields);
+
+ if ((share->data_file_type == BLOCK_RECORD ||
+ share->data_file_type == COMPRESSED_RECORD))
+ {
+ if (_ma_open_datafile(&info, share, name, -1))
+ goto err;
+ data_file= info.dfile.file;
+ }
+ errpos= 5;
+
+ if (open_flags & HA_OPEN_DELAY_KEY_WRITE)
+ share->options|= HA_OPTION_DELAY_KEY_WRITE;
+ if (mode == O_RDONLY)
+ share->options|= HA_OPTION_READ_ONLY_DATA;
+ share->is_log_table= FALSE;
+
+ if (open_flags & HA_OPEN_TMP_TABLE)
+ {
+ share->options|= HA_OPTION_TMP_TABLE;
+ share->temporary= share->delay_key_write= 1;
+ share->write_flag=MYF(MY_NABP);
+ share->w_locks++; /* We don't have to update status */
+ share->tot_locks++;
+ }
+
+ _ma_set_index_pagecache_callbacks(&share->kfile, share);
+ share->this_process=(ulong) getpid();
+#ifdef EXTERNAL_LOCKING
+ share->last_process= share->state.process;
+#endif
+ share->base.key_parts=key_parts;
+ share->base.all_key_parts=key_parts+unique_key_parts;
+ if (!(share->last_version=share->state.version))
+ share->last_version=1; /* Safety */
+ share->rec_reflength=share->base.rec_reflength; /* May be changed */
+ share->base.margin_key_file_length=(share->base.max_key_file_length -
+ (keys ? MARIA_INDEX_BLOCK_MARGIN *
+ share->block_size * keys : 0));
+ share->block_size= share->base.block_size;
+ my_free(disk_cache, MYF(0));
+ _ma_setup_functions(share);
+ if ((*share->once_init)(share, info.dfile.file))
+ goto err;
+ if (share->now_transactional)
+ {
+ /* Setup initial state that is visible for all */
+ MARIA_STATE_HISTORY_CLOSED *history;
+ if ((history= (MARIA_STATE_HISTORY_CLOSED *)
+ hash_search(&maria_stored_state,
+ (uchar*) &share->state.create_rename_lsn, 0)))
+ {
+ /*
+ Move history from hash to share. This is safe to do as we
+ don't have a lock on share->intern_lock.
+ */
+ share->state_history=
+ _ma_remove_not_visible_states(history->state_history, 0, 0);
+ history->state_history= 0;
+ (void) hash_delete(&maria_stored_state, (uchar*) history);
+ }
+ else
+ {
+ /* Table is not part of any active transaction; Create new history */
+ if (!(share->state_history= (MARIA_STATE_HISTORY *)
+ my_malloc(sizeof(*share->state_history), MYF(MY_WME))))
+ goto err;
+ share->state_history->trid= 0; /* Visible by all */
+ share->state_history->state= share->state.state;
+ share->state_history->next= 0;
+ }
+ }
+#ifdef THREAD
+ thr_lock_init(&share->lock);
+ pthread_mutex_init(&share->intern_lock, MY_MUTEX_INIT_FAST);
+ pthread_mutex_init(&share->key_del_lock, MY_MUTEX_INIT_FAST);
+ pthread_cond_init(&share->key_del_cond, 0);
+ pthread_mutex_init(&share->close_lock, MY_MUTEX_INIT_FAST);
+ for (i=0; i<keys; i++)
+ VOID(my_rwlock_init(&share->keyinfo[i].root_lock, NULL));
+ VOID(my_rwlock_init(&share->mmap_lock, NULL));
+
+ share->row_is_visible= _ma_row_visible_always;
+ share->lock.get_status= _ma_reset_update_flag;
+ if (!thr_lock_inited)
+ {
+ /* Probably a single threaded program; Don't use concurrent inserts */
+ maria_concurrent_insert=0;
+ }
+ else if (maria_concurrent_insert)
+ {
+ share->non_transactional_concurrent_insert=
+ ((share->options & (HA_OPTION_READ_ONLY_DATA | HA_OPTION_TMP_TABLE |
+ HA_OPTION_COMPRESS_RECORD |
+ HA_OPTION_TEMP_COMPRESS_RECORD)) ||
+ (open_flags & HA_OPEN_TMP_TABLE) ||
+ share->data_file_type == BLOCK_RECORD ||
+ share->have_rtree) ? 0 : 1;
+ if (share->non_transactional_concurrent_insert ||
+ (!share->temporary && share->now_transactional && versioning))
+ {
+ share->lock_key_trees= 1;
+ if (share->data_file_type == BLOCK_RECORD)
+ {
+ DBUG_ASSERT(share->now_transactional);
+ share->have_versioning= 1;
+ share->row_is_visible= _ma_row_visible_transactional_table;
+ share->lock.get_status= _ma_block_get_status;
+ share->lock.check_status= _ma_block_check_status;
+ share->lock.start_trans= _ma_block_start_trans;
          /*
            We can for the moment only allow multiple concurrent inserts
            if there is no auto-increment key. To lift this restriction
            we have to:
            - Extend statement-based replication to support auto-increment
              intervals.
            - Fix that we allocate auto-increment in intervals and that
              it's properly reset if the interval was not used
          */
+ share->lock.allow_multiple_concurrent_insert=
+ share->base.auto_key == 0;
+ share->lock_restore_status= 0;
+ }
+ else
+ {
+ share->row_is_visible= _ma_row_visible_non_transactional_table;
+ share->lock.get_status= _ma_get_status;
+ share->lock.copy_status= _ma_copy_status;
+ share->lock.update_status= _ma_update_status;
+ share->lock.restore_status= _ma_restore_status;
+ share->lock.check_status= _ma_check_status;
+ share->lock_restore_status= _ma_restore_status;
+ }
+ }
+ else if (share->now_transactional)
+ {
+ DBUG_ASSERT(share->data_file_type == BLOCK_RECORD);
+ share->lock.start_trans= _ma_block_start_trans_no_versioning;
+ }
+ }
+#endif
+ /*
+ Memory mapping can only be requested after initializing intern_lock.
+ */
+ if (open_flags & HA_OPEN_MMAP)
+ {
+ info.s= share;
+ maria_extra(&info, HA_EXTRA_MMAP, 0);
+ }
+ }
+ else
+ {
+ share= old_info->s;
+ if (share->data_file_type == BLOCK_RECORD)
+ data_file= share->bitmap.file.file; /* Only opened once */
+ }
+
+ if (!(m_info= maria_clone_internal(share, name, mode, data_file)))
+ goto err;
+
+ if (maria_is_crashed(m_info))
+ DBUG_PRINT("warning", ("table is crashed: changed: %u",
+ share->state.changed));
+
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ DBUG_RETURN(m_info);
+
+err:
+ DBUG_PRINT("error", ("error: %d errpos: %d", my_errno, errpos));
+ save_errno=my_errno ? my_errno : HA_ERR_END_OF_FILE;
+ if ((save_errno == HA_ERR_CRASHED) ||
+ (save_errno == HA_ERR_CRASHED_ON_USAGE) ||
+ (save_errno == HA_ERR_CRASHED_ON_REPAIR))
+ {
+ LEX_STRING tmp_name;
+ tmp_name.str= (char*) name;
+ tmp_name.length= strlen(name);
+ _ma_report_error(save_errno, &tmp_name);
+ }
+ if (save_errno == HA_ERR_OLD_FILE) /* uuid is different ? */
+ save_errno= HA_ERR_CRASHED_ON_USAGE; /* the code to trigger auto-repair */
+ switch (errpos) {
+ case 5:
+ if (data_file >= 0)
+ VOID(my_close(data_file, MYF(0)));
+ if (old_info)
+ break; /* Don't remove open table */
+ (*share->once_end)(share);
+ /* fall through */
+ case 4:
+ my_free(share,MYF(0));
+ /* fall through */
+ case 3:
+ my_free(disk_cache, MYF(0));
+ /* fall through */
+ case 1:
+ VOID(my_close(kfile,MYF(0)));
+ /* fall through */
+ case 0:
+ default:
+ break;
+ }
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ my_errno= save_errno;
+ DBUG_RETURN (NULL);
+} /* maria_open */
+
+
+/*
+ Reallocate a buffer, if the current buffer is not large enough
+*/
+
+my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size,
+ size_t new_size)
+{
+ if (*old_size < new_size)
+ {
+ uchar *addr;
+ if (!(addr= (uchar*) my_realloc(*old_addr, new_size,
+ MYF(MY_ALLOW_ZERO_PTR))))
+ return 1;
+ *old_addr= addr;
+ *old_size= new_size;
+ }
+ return 0;
+}
+
+
+ulonglong _ma_safe_mul(ulonglong a, ulonglong b)
+{
+ ulonglong max_val= ~ (ulonglong) 0; /* my_off_t is unsigned */
+
+ if (!a || max_val / a < b)
+ return max_val;
+ return a*b;
+}
+
+ /* Set up functions in structs */
+
+void _ma_setup_functions(register MARIA_SHARE *share)
+{
+ share->once_init= maria_once_init_dummy;
+ share->once_end= maria_once_end_dummy;
+ share->init= maria_scan_init_dummy;
+ share->end= maria_scan_end_dummy;
+ share->scan_init= maria_scan_init_dummy;/* Compat. dummy function */
+ share->scan_end= maria_scan_end_dummy;/* Compat. dummy function */
+ share->scan_remember_pos= _ma_def_scan_remember_pos;
+ share->scan_restore_pos= _ma_def_scan_restore_pos;
+
+ share->write_record_init= _ma_write_init_default;
+ share->write_record_abort= _ma_write_abort_default;
+ share->keypos_to_recpos= _ma_transparent_recpos;
+ share->recpos_to_keypos= _ma_transparent_recpos;
+
+ switch (share->data_file_type) {
+ case COMPRESSED_RECORD:
+ share->read_record= _ma_read_pack_record;
+ share->scan= _ma_read_rnd_pack_record;
+ share->once_init= _ma_once_init_pack_row;
+ share->once_end= _ma_once_end_pack_row;
+ /*
+ Calculate checksum according to data in the original, not compressed,
+ row.
+ */
+ if (share->state.header.org_data_file_type == STATIC_RECORD &&
+ ! (share->options & HA_OPTION_NULL_FIELDS))
+ share->calc_checksum= _ma_static_checksum;
+ else
+ share->calc_checksum= _ma_checksum;
+ share->calc_write_checksum= share->calc_checksum;
+ break;
+ case DYNAMIC_RECORD:
+ share->read_record= _ma_read_dynamic_record;
+ share->scan= _ma_read_rnd_dynamic_record;
+ share->delete_record= _ma_delete_dynamic_record;
+ share->compare_record= _ma_cmp_dynamic_record;
+ share->compare_unique= _ma_cmp_dynamic_unique;
+ share->calc_checksum= share->calc_write_checksum= _ma_checksum;
+ if (share->base.blobs)
+ {
+ share->update_record= _ma_update_blob_record;
+ share->write_record= _ma_write_blob_record;
+ }
+ else
+ {
+ share->write_record= _ma_write_dynamic_record;
+ share->update_record= _ma_update_dynamic_record;
+ }
+ break;
+ case STATIC_RECORD:
+ share->read_record= _ma_read_static_record;
+ share->scan= _ma_read_rnd_static_record;
+ share->delete_record= _ma_delete_static_record;
+ share->compare_record= _ma_cmp_static_record;
+ share->update_record= _ma_update_static_record;
+ share->write_record= _ma_write_static_record;
+ share->compare_unique= _ma_cmp_static_unique;
+ share->keypos_to_recpos= _ma_static_keypos_to_recpos;
+ share->recpos_to_keypos= _ma_static_recpos_to_keypos;
+ if (share->state.header.org_data_file_type == STATIC_RECORD &&
+ ! (share->options & HA_OPTION_NULL_FIELDS))
+ share->calc_checksum= _ma_static_checksum;
+ else
+ share->calc_checksum= _ma_checksum;
+ break;
+ case BLOCK_RECORD:
+ share->once_init= _ma_once_init_block_record;
+ share->once_end= _ma_once_end_block_record;
+ share->init= _ma_init_block_record;
+ share->end= _ma_end_block_record;
+ share->write_record_init= _ma_write_init_block_record;
+ share->write_record_abort= _ma_write_abort_block_record;
+ share->scan_init= _ma_scan_init_block_record;
+ share->scan_end= _ma_scan_end_block_record;
+ share->scan= _ma_scan_block_record;
+ share->scan_remember_pos= _ma_scan_remember_block_record;
+ share->scan_restore_pos= _ma_scan_restore_block_record;
+ share->read_record= _ma_read_block_record;
+ share->delete_record= _ma_delete_block_record;
+ share->compare_record= _ma_compare_block_record;
+ share->update_record= _ma_update_block_record;
+ share->write_record= _ma_write_block_record;
+ share->compare_unique= _ma_cmp_block_unique;
+ share->calc_checksum= _ma_checksum;
+ share->keypos_to_recpos= _ma_transaction_keypos_to_recpos;
+ share->recpos_to_keypos= _ma_transaction_recpos_to_keypos;
+
+ /*
+ write_block_record() will calculate the checksum; Tell maria_write()
+ that it doesn't have to do this.
+ */
+ share->calc_write_checksum= 0;
+ break;
+ }
+ share->file_read= _ma_nommap_pread;
+ share->file_write= _ma_nommap_pwrite;
+ share->calc_check_checksum= share->calc_checksum;
+
+ if (!(share->options & HA_OPTION_CHECKSUM) &&
+ share->data_file_type != COMPRESSED_RECORD)
+ share->calc_checksum= share->calc_write_checksum= 0;
+ return;
+}
+
+
/*
  Set up the key handling function pointers for one key definition.

  Chooses the insert/delete, key-building, search and pack/store
  functions depending on the key algorithm (B-tree or R-tree) and on
  the key compression flags, then computes write_comp_flag: the compare
  flags used when positioning for a key insert.
*/

static void setup_key_functions(register MARIA_KEYDEF *keyinfo)
{
  if (keyinfo->key_alg == HA_KEY_ALG_RTREE)
  {
#ifdef HAVE_RTREE_KEYS
    keyinfo->ck_insert = maria_rtree_insert;
    keyinfo->ck_delete = maria_rtree_delete;
#else
    DBUG_ASSERT(0); /* maria_open should check it never happens */
#endif
  }
  else
  {
    keyinfo->ck_insert = _ma_ck_write;
    keyinfo->ck_delete = _ma_ck_delete;
  }
  /* Spatial keys build their key image from the minimum bounding rectangle */
  if (keyinfo->flag & HA_SPATIAL)
    keyinfo->make_key= _ma_sp_make_key;
  else
    keyinfo->make_key= _ma_make_key;

  if (keyinfo->flag & HA_BINARY_PACK_KEY)
  { /* Simple prefix compression */
    keyinfo->bin_search= _ma_seq_search;
    keyinfo->get_key= _ma_get_binary_pack_key;
    keyinfo->skip_key= _ma_skip_binary_pack_key;
    keyinfo->pack_key= _ma_calc_bin_pack_key_length;
    keyinfo->store_key= _ma_store_bin_pack_key;
  }
  else if (keyinfo->flag & HA_VAR_LENGTH_KEY)
  {
    keyinfo->get_key= _ma_get_pack_key;
    keyinfo->skip_key= _ma_skip_pack_key;
    if (keyinfo->seg[0].flag & HA_PACK_KEY)
    { /* Prefix compression */
      /*
        _ma_prefix_search() compares end-space against ASCII blank (' ').
        It cannot be used for character sets, that do not encode the
        blank character like ASCII does. UCS2 is an example. All
        character sets with a fixed width > 1 or a minimum width > 1
        cannot represent blank like ASCII does. In these cases we have
        to use _ma_seq_search() for the search.
      */
      if (!keyinfo->seg->charset || use_strnxfrm(keyinfo->seg->charset) ||
          (keyinfo->seg->flag & HA_NULL_PART) ||
          keyinfo->seg->charset->mbminlen > 1)
        keyinfo->bin_search= _ma_seq_search;
      else
        keyinfo->bin_search= _ma_prefix_search;
      keyinfo->pack_key= _ma_calc_var_pack_key_length;
      keyinfo->store_key= _ma_store_var_pack_key;
    }
    else
    { /* Variable length key without prefix compression */
      keyinfo->bin_search= _ma_seq_search;
      keyinfo->pack_key= _ma_calc_var_key_length; /* Variable length key */
      keyinfo->store_key= _ma_store_static_key;
    }
  }
  else
  { /* Fixed-size keys: binary search within the page is possible */
    keyinfo->bin_search= _ma_bin_search;
    keyinfo->get_key= _ma_get_static_key;
    keyinfo->skip_key= _ma_skip_static_key;
    keyinfo->pack_key= _ma_calc_static_key_length;
    keyinfo->store_key= _ma_store_static_key;
  }

  /* set keyinfo->write_comp_flag */
  if (keyinfo->flag & HA_SORT_ALLOWS_SAME)
    keyinfo->write_comp_flag=SEARCH_BIGGER; /* Put after same key */
  else if (keyinfo->flag & ( HA_NOSAME | HA_FULLTEXT))
  {
    keyinfo->write_comp_flag= SEARCH_FIND | SEARCH_UPDATE; /* No duplicates */
    if (keyinfo->flag & HA_NULL_ARE_EQUAL)
      keyinfo->write_comp_flag|= SEARCH_NULL_ARE_EQUAL;
  }
  else
    keyinfo->write_comp_flag= SEARCH_SAME; /* Keys in rec-pos order */
  keyinfo->write_comp_flag|= SEARCH_INSERT;
  return;
}
+
+
+/**
+ @brief Function to save and store the header in the index file (.MYI)
+
+ Operates under MARIA_SHARE::intern_lock if requested.
+ Sets MARIA_SHARE::MARIA_STATE_INFO::is_of_horizon if transactional table.
+ Then calls _ma_state_info_write_sub().
+
+ @param share table
+ @param pWrite bitmap: if 1 (MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)
+ is set my_pwrite() is used otherwise my_write();
+ if 2 (MA_STATE_INFO_WRITE_FULL_INFO) is set, info
+ about keys is written (should only be needed
+ after ALTER TABLE ENABLE/DISABLE KEYS, and
+ REPAIR/OPTIMIZE); if 4 (MA_STATE_INFO_WRITE_LOCK)
+ is set, MARIA_SHARE::intern_lock is taken.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite)
{
  uint res;
  /* Read-only tables never flush their state to disk */
  if (share->options & HA_OPTION_READ_ONLY_DATA)
    return 0;

  if (pWrite & MA_STATE_INFO_WRITE_LOCK)
    pthread_mutex_lock(&share->intern_lock);
  else if (maria_multi_threaded)
  {
    /* Caller claims to already hold intern_lock; verify in debug builds */
    safe_mutex_assert_owner(&share->intern_lock);
  }
  if (share->base.born_transactional && translog_status == TRANSLOG_OK &&
      !maria_in_recovery)
  {
    /*
      In a recovery, we want to set is_of_horizon to the LSN of the last
      record executed by Recovery, not the current EOF of the log (which
      is too new). Recovery does it by itself.
    */
    share->state.is_of_horizon= translog_get_horizon();
    DBUG_PRINT("info", ("is_of_horizon set to LSN (%lu,0x%lx)",
                        LSN_IN_PARTS(share->state.is_of_horizon)));
  }
  res= _ma_state_info_write_sub(share->kfile.file, &share->state, pWrite);
  if (pWrite & MA_STATE_INFO_WRITE_LOCK)
    pthread_mutex_unlock(&share->intern_lock);
  /*
    NOTE(review): 'changed' is cleared outside intern_lock and even when
    the write above failed -- presumably callers learn of the failure via
    'res' and retrigger a write later; confirm before relying on it.
  */
  share->changed= 0;
  return res;
}
+
+
+/**
+ @brief Function to save and store the header in the index file (.MYI).
+
+ Shortcut to use instead of _ma_state_info_write() when appropriate.
+
+ @param file descriptor of the index file to write
+ @param state state information to write to the file
+ @param pWrite bitmap: if 1 (MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)
+ is set my_pwrite() is used otherwise my_write();
+ if 2 (MA_STATE_INFO_WRITE_FULL_INFO) is set, info
+ about keys is written (should only be needed
+ after ALTER TABLE ENABLE/DISABLE KEYS, and
+ REPAIR/OPTIMIZE).
+
+ @notes
+ For transactional multiuser tables, this function is called
+ with intern_lock & translog_lock or when the last thread who
+ is using the table is closing it.
+ Because of the translog_lock we don't need to have a lock on
+ key_del_lock.
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite)
{
  uchar buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE];
  uchar *ptr=buff;
  uint i, keys= (uint) state->header.keys;
  size_t res;
  DBUG_ENTER("_ma_state_info_write_sub");

  /*
    Serialize into 'buff' at fixed offsets; the layout must stay in sync
    with _ma_state_info_read(), which reads the fields back in the same
    order.
  */
  memcpy_fixed(ptr,&state->header,sizeof(state->header));
  ptr+=sizeof(state->header);

  /* open_count must be first because of _ma_mark_file_changed ! */
  mi_int2store(ptr,state->open_count); ptr+= 2;
  /* changed must be second, because of _ma_mark_file_crashed */
  mi_int2store(ptr,state->changed); ptr+= 2;

  /*
    If you change the offset of these LSNs, note that some functions do a
    direct write of them without going through this function.
  */
  lsn_store(ptr, state->create_rename_lsn); ptr+= LSN_STORE_SIZE;
  lsn_store(ptr, state->is_of_horizon); ptr+= LSN_STORE_SIZE;
  lsn_store(ptr, state->skip_redo_lsn); ptr+= LSN_STORE_SIZE;
  mi_rowstore(ptr,state->state.records); ptr+= 8;
  mi_rowstore(ptr,state->state.del); ptr+= 8;
  mi_rowstore(ptr,state->split); ptr+= 8;
  mi_sizestore(ptr,state->dellink); ptr+= 8;
  mi_sizestore(ptr,state->first_bitmap_with_space); ptr+= 8;
  mi_sizestore(ptr,state->state.key_file_length); ptr+= 8;
  mi_sizestore(ptr,state->state.data_file_length); ptr+= 8;
  mi_sizestore(ptr,state->state.empty); ptr+= 8;
  mi_sizestore(ptr,state->state.key_empty); ptr+= 8;
  mi_int8store(ptr,state->auto_increment); ptr+= 8;
  mi_int8store(ptr,(ulonglong) state->state.checksum); ptr+= 8;
  mi_int8store(ptr,state->create_trid); ptr+= 8;
  mi_int4store(ptr,state->status); ptr+= 4;
  mi_int4store(ptr,state->update_count); ptr+= 4;
  *ptr++= state->sortkey;
  *ptr++= 0; /* Reserved */
  /*
    Skip over extra state bytes of a newer file version.
    NOTE(review): these bytes of 'buff' are not initialized here --
    presumably only relevant when opening files written by newer
    versions; confirm.
  */
  ptr+= state->state_diff_length;

  /* One root position per key */
  for (i=0; i < keys; i++)
  {
    mi_sizestore(ptr,state->key_root[i]); ptr+= 8;
  }
  mi_sizestore(ptr,state->key_del); ptr+= 8;
  if (pWrite & MA_STATE_INFO_WRITE_FULL_INFO) /* From maria_chk */
  {
    uint key_parts= mi_uint2korr(state->header.key_parts);
    mi_int4store(ptr,state->sec_index_changed); ptr+= 4;
    mi_int4store(ptr,state->sec_index_used); ptr+= 4;
    mi_int4store(ptr,state->version); ptr+= 4;
    mi_int8store(ptr,state->key_map); ptr+= 8;
    mi_int8store(ptr,(ulonglong) state->create_time); ptr+= 8;
    mi_int8store(ptr,(ulonglong) state->recover_time); ptr+= 8;
    mi_int8store(ptr,(ulonglong) state->check_time); ptr+= 8;
    mi_sizestore(ptr, state->records_at_analyze); ptr+= 8;
    /* reserve place for some information per key */
    bzero(ptr, keys*4); ptr+= keys*4;
    for (i=0 ; i < key_parts ; i++)
    {
      float8store(ptr, state->rec_per_key_part[i]); ptr+= 8;
      mi_int4store(ptr, state->nulls_per_key_part[i]); ptr+= 4;
    }
  }

  /*
    With MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET use pwrite() at offset 0
    so the file position is left untouched; otherwise write at the
    current position.
  */
  res= (pWrite & MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET) ?
    my_pwrite(file, buff, (size_t) (ptr-buff), 0L,
              MYF(MY_NABP | MY_THREADSAFE)) :
    my_write(file, buff, (size_t) (ptr-buff),
             MYF(MY_NABP));
  DBUG_RETURN(res != 0);
}
+
+
/*
  Fill a MARIA_STATE_INFO from its serialized image in memory.

  The field order and sizes must mirror _ma_state_info_write_sub()
  exactly. Note that the "full info" tail (sec_index_changed ..
  nulls_per_key_part) is read unconditionally here, while the writer
  only stores it with MA_STATE_INFO_WRITE_FULL_INFO -- presumably that
  region is always present in the on-disk header; confirm.

  NOTE(review): relies on state->state_diff_length and the
  key_root/rec_per_key_part/nulls_per_key_part buffers having been set
  up by the caller (see maria_open()).

  @param ptr    start of the state image
  @param state  state structure to fill
  @return pointer to the first byte after the consumed state
*/

static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state)
{
  uint i,keys,key_parts;
  memcpy_fixed(&state->header,ptr, sizeof(state->header));
  ptr+= sizeof(state->header);
  /* Counts come from the header just copied */
  keys= (uint) state->header.keys;
  key_parts= mi_uint2korr(state->header.key_parts);

  state->open_count = mi_uint2korr(ptr); ptr+= 2;
  state->changed= mi_uint2korr(ptr); ptr+= 2;
  state->create_rename_lsn= lsn_korr(ptr); ptr+= LSN_STORE_SIZE;
  state->is_of_horizon= lsn_korr(ptr); ptr+= LSN_STORE_SIZE;
  state->skip_redo_lsn= lsn_korr(ptr); ptr+= LSN_STORE_SIZE;
  state->state.records= mi_rowkorr(ptr); ptr+= 8;
  state->state.del = mi_rowkorr(ptr); ptr+= 8;
  state->split = mi_rowkorr(ptr); ptr+= 8;
  state->dellink= mi_sizekorr(ptr); ptr+= 8;
  state->first_bitmap_with_space= mi_sizekorr(ptr); ptr+= 8;
  state->state.key_file_length = mi_sizekorr(ptr); ptr+= 8;
  state->state.data_file_length= mi_sizekorr(ptr); ptr+= 8;
  state->state.empty = mi_sizekorr(ptr); ptr+= 8;
  state->state.key_empty= mi_sizekorr(ptr); ptr+= 8;
  state->auto_increment=mi_uint8korr(ptr); ptr+= 8;
  state->state.checksum=(ha_checksum) mi_uint8korr(ptr);ptr+= 8;
  state->create_trid= mi_uint8korr(ptr); ptr+= 8;
  state->status = mi_uint4korr(ptr); ptr+= 4;
  state->update_count=mi_uint4korr(ptr); ptr+= 4;
  state->sortkey= (uint) *ptr++;
  ptr++; /* reserved */

  /* Skip state bytes belonging to a newer file version */
  ptr+= state->state_diff_length;

  /* One root position per key */
  for (i=0; i < keys; i++)
  {
    state->key_root[i]= mi_sizekorr(ptr); ptr+= 8;
  }
  state->key_del= mi_sizekorr(ptr); ptr+= 8;
  state->sec_index_changed = mi_uint4korr(ptr); ptr+= 4;
  state->sec_index_used = mi_uint4korr(ptr); ptr+= 4;
  state->version = mi_uint4korr(ptr); ptr+= 4;
  state->key_map = mi_uint8korr(ptr); ptr+= 8;
  state->create_time = (time_t) mi_sizekorr(ptr); ptr+= 8;
  state->recover_time =(time_t) mi_sizekorr(ptr); ptr+= 8;
  state->check_time = (time_t) mi_sizekorr(ptr); ptr+= 8;
  state->records_at_analyze= mi_sizekorr(ptr); ptr+= 8;
  ptr+= keys * 4; /* Skip reserved bytes */
  for (i=0 ; i < key_parts ; i++)
  {
    float8get(state->rec_per_key_part[i], ptr); ptr+= 8;
    state->nulls_per_key_part[i]= mi_uint4korr(ptr); ptr+= 4;
  }
  return ptr;
}
+
+
+/**
+ @brief Fills the state by reading its copy on disk.
+
+ Should not be called for transactional tables, as their state on disk is
+ rarely current and so is often misleading for a reader.
+ Does nothing in single user mode.
+
+ @param file file to read from
+ @param state state which will be filled
+*/
+
+/* RETURN  0 ok (or no-op), 1 could not read the state from disk */
+uint _ma_state_info_read_dsk(File file __attribute__((unused)),
+                             MARIA_STATE_INFO *state __attribute__((unused)))
+{
+#ifdef EXTERNAL_LOCKING
+  uchar	buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE];
+
+  /* trick to detect transactional tables */
+  DBUG_ASSERT(state->create_rename_lsn == LSN_IMPOSSIBLE);
+  if (!maria_single_user)
+  {
+    /* Re-read the state another process may have updated on disk */
+    if (my_pread(file, buff, state->state_length, 0L, MYF(MY_NABP)))
+      return 1;
+    _ma_state_info_read(buff, state);
+  }
+#endif
+  /* Without EXTERNAL_LOCKING this function is compiled to a no-op */
+  return 0;
+}
+
+
+/****************************************************************************
+** store and read of MARIA_BASE_INFO
+****************************************************************************/
+
+/*
+  Write MARIA_BASE_INFO (create-time table constants) to 'file'.
+
+  The fields are packed into a fixed-size buffer in a defined on-disk
+  order; _ma_base_info_read() must decode in exactly the same order.
+
+  RETURN  0 ok  1 write failed
+*/
+
+uint _ma_base_info_write(File file, MARIA_BASE_INFO *base)
+{
+  uchar buff[MARIA_BASE_INFO_SIZE], *ptr=buff;
+
+  bmove(ptr, maria_uuid, MY_UUID_SIZE);
+  ptr+= MY_UUID_SIZE;
+  mi_sizestore(ptr,base->keystart);			ptr+= 8;
+  mi_sizestore(ptr,base->max_data_file_length);		ptr+= 8;
+  mi_sizestore(ptr,base->max_key_file_length);		ptr+= 8;
+  mi_rowstore(ptr,base->records);			ptr+= 8;
+  mi_rowstore(ptr,base->reloc);				ptr+= 8;
+  mi_int4store(ptr,base->mean_row_length);		ptr+= 4;
+  mi_int4store(ptr,base->reclength);			ptr+= 4;
+  mi_int4store(ptr,base->pack_reclength);		ptr+= 4;
+  mi_int4store(ptr,base->min_pack_length);		ptr+= 4;
+  mi_int4store(ptr,base->max_pack_length);		ptr+= 4;
+  mi_int4store(ptr,base->min_block_length);		ptr+= 4;
+  mi_int2store(ptr,base->fields);			ptr+= 2;
+  mi_int2store(ptr,base->fixed_not_null_fields);	ptr+= 2;
+  mi_int2store(ptr,base->fixed_not_null_fields_length);	ptr+= 2;
+  mi_int2store(ptr,base->max_field_lengths);		ptr+= 2;
+  mi_int2store(ptr,base->pack_fields);			ptr+= 2;
+  /* Fixed: the statement terminator was missing after this store */
+  mi_int2store(ptr,base->extra_options);		ptr+= 2;
+  mi_int2store(ptr,base->null_bytes);			ptr+= 2;
+  mi_int2store(ptr,base->original_null_bytes);		ptr+= 2;
+  mi_int2store(ptr,base->field_offsets);		ptr+= 2;
+  mi_int2store(ptr,0);					ptr+= 2; /* reserved */
+  mi_int2store(ptr,base->block_size);			ptr+= 2;
+  *ptr++= base->rec_reflength;
+  *ptr++= base->key_reflength;
+  *ptr++= base->keys;
+  *ptr++= base->auto_key;
+  *ptr++= base->born_transactional;
+  *ptr++= 0;                                    /* Reserved */
+  mi_int2store(ptr,base->pack_bytes);			ptr+= 2;
+  mi_int2store(ptr,base->blobs);			ptr+= 2;
+  mi_int2store(ptr,base->max_key_block_length);		ptr+= 2;
+  mi_int2store(ptr,base->max_key_length);		ptr+= 2;
+  mi_int2store(ptr,base->extra_alloc_bytes);		ptr+= 2;
+  *ptr++= base->extra_alloc_procent;
+  bzero(ptr,16);					ptr+= 16; /* extra */
+  DBUG_ASSERT((ptr - buff) == MARIA_BASE_INFO_SIZE);
+  return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
+}
+
+
+/*
+  Decode MARIA_BASE_INFO from its packed on-disk representation.
+  Field order and widths must mirror _ma_base_info_write() exactly.
+
+  RETURN  pointer to the first byte after the decoded structure
+*/
+static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base)
+{
+  bmove(base->uuid, ptr, MY_UUID_SIZE);    ptr+= MY_UUID_SIZE;
+  base->keystart= mi_sizekorr(ptr);			ptr+= 8;
+  base->max_data_file_length= mi_sizekorr(ptr);		ptr+= 8;
+  base->max_key_file_length= mi_sizekorr(ptr);		ptr+= 8;
+  base->records=  (ha_rows) mi_sizekorr(ptr);		ptr+= 8;
+  base->reloc= (ha_rows) mi_sizekorr(ptr);		ptr+= 8;
+  base->mean_row_length= mi_uint4korr(ptr);		ptr+= 4;
+  base->reclength= mi_uint4korr(ptr);			ptr+= 4;
+  base->pack_reclength= mi_uint4korr(ptr);		ptr+= 4;
+  base->min_pack_length= mi_uint4korr(ptr);		ptr+= 4;
+  base->max_pack_length= mi_uint4korr(ptr);		ptr+= 4;
+  base->min_block_length= mi_uint4korr(ptr);		ptr+= 4;
+  base->fields= mi_uint2korr(ptr);			ptr+= 2;
+  base->fixed_not_null_fields= mi_uint2korr(ptr);       ptr+= 2;
+  base->fixed_not_null_fields_length= mi_uint2korr(ptr);ptr+= 2;
+  base->max_field_lengths= mi_uint2korr(ptr);	        ptr+= 2;
+  base->pack_fields= mi_uint2korr(ptr);			ptr+= 2;
+  base->extra_options= mi_uint2korr(ptr);		ptr+= 2;
+  base->null_bytes= mi_uint2korr(ptr);			ptr+= 2;
+  base->original_null_bytes= mi_uint2korr(ptr);		ptr+= 2;
+  base->field_offsets= mi_uint2korr(ptr);		ptr+= 2;
+  ptr+= 2;                                      /* skip 2 reserved bytes */
+  base->block_size= mi_uint2korr(ptr);			ptr+= 2;
+
+  base->rec_reflength= *ptr++;
+  base->key_reflength= *ptr++;
+  base->keys=	       *ptr++;
+  base->auto_key=      *ptr++;
+  base->born_transactional= *ptr++;
+  ptr++;                                        /* skip 1 reserved byte */
+  base->pack_bytes= mi_uint2korr(ptr);			ptr+= 2;
+  base->blobs= mi_uint2korr(ptr);			ptr+= 2;
+  base->max_key_block_length= mi_uint2korr(ptr);	ptr+= 2;
+  base->max_key_length= mi_uint2korr(ptr);		ptr+= 2;
+  base->extra_alloc_bytes= mi_uint2korr(ptr);		ptr+= 2;
+  base->extra_alloc_procent= *ptr++;
+  ptr+= 16;                                     /* skip 16 'extra' bytes */
+  return ptr;
+}
+
+/*--------------------------------------------------------------------------
+ maria_keydef
+---------------------------------------------------------------------------*/
+
+/*
+  Write one MARIA_KEYDEF in packed form; decoded by _ma_keydef_read().
+  RETURN  0 ok  1 write failed
+*/
+my_bool _ma_keydef_write(File file, MARIA_KEYDEF *keydef)
+{
+  uchar buff[MARIA_KEYDEF_SIZE];
+  uchar *ptr=buff;
+
+  *ptr++= (uchar) keydef->keysegs;
+  *ptr++= keydef->key_alg;			/* Rtree or Btree */
+  mi_int2store(ptr,keydef->flag);		ptr+= 2;
+  mi_int2store(ptr,keydef->block_length);	ptr+= 2;
+  mi_int2store(ptr,keydef->keylength);		ptr+= 2;
+  mi_int2store(ptr,keydef->minlength);		ptr+= 2;
+  mi_int2store(ptr,keydef->maxlength);		ptr+= 2;
+  return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
+}
+
+/*
+  Decode one MARIA_KEYDEF written by _ma_keydef_write() and initialize
+  the run-time-only members that are not stored on disk.
+  RETURN  pointer to the first byte after the decoded keydef
+*/
+uchar *_ma_keydef_read(uchar *ptr, MARIA_KEYDEF *keydef)
+{
+   keydef->keysegs	= (uint) *ptr++;
+   keydef->key_alg	= *ptr++;		/* Rtree or Btree */
+
+   keydef->flag		= mi_uint2korr(ptr);	ptr+= 2;
+   keydef->block_length = mi_uint2korr(ptr);	ptr+= 2;
+   keydef->keylength	= mi_uint2korr(ptr);	ptr+= 2;
+   keydef->minlength	= mi_uint2korr(ptr);	ptr+= 2;
+   keydef->maxlength	= mi_uint2korr(ptr);	ptr+= 2;
+   /* The members below are derived/defaulted, never read from disk */
+   keydef->underflow_block_length=keydef->block_length/3;
+   keydef->version	= 0;			/* Not saved */
+   keydef->parser       = &ft_default_parser;
+   keydef->ftkey_nr     = 0;
+   return ptr;
+}
+
+/***************************************************************************
+** maria_keyseg
+***************************************************************************/
+
+/*
+  Write one HA_KEYSEG in packed form; decoded by _ma_keyseg_read().
+
+  null_pos and bit_pos share a single 4-byte slot on disk: which one is
+  stored is selected by null_bit (non-zero means null_pos is stored).
+
+  RETURN  0 ok  1 write failed
+*/
+my_bool _ma_keyseg_write(File file, const HA_KEYSEG *keyseg)
+{
+  uchar buff[HA_KEYSEG_SIZE];
+  uchar *ptr=buff;
+  ulong pos;
+
+  *ptr++= keyseg->type;
+  *ptr++= keyseg->language;
+  *ptr++= keyseg->null_bit;
+  *ptr++= keyseg->bit_start;
+  *ptr++= keyseg->bit_end;
+  *ptr++= keyseg->bit_length;
+  mi_int2store(ptr,keyseg->flag);	ptr+= 2;
+  mi_int2store(ptr,keyseg->length);	ptr+= 2;
+  mi_int4store(ptr,keyseg->start);	ptr+= 4;
+  pos= keyseg->null_bit ? keyseg->null_pos : keyseg->bit_pos;
+  mi_int4store(ptr, pos);
+  ptr+=4;
+
+  return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
+}
+
+
+/*
+  Decode one HA_KEYSEG written by _ma_keyseg_write().
+
+  The shared 4-byte slot is interpreted from null_bit: if set it holds
+  null_pos, otherwise bit_pos (see the write side).
+  RETURN  pointer to the first byte after the decoded keyseg
+*/
+uchar *_ma_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg)
+{
+   keyseg->type		= *ptr++;
+   keyseg->language	= *ptr++;
+   keyseg->null_bit	= *ptr++;
+   keyseg->bit_start	= *ptr++;
+   keyseg->bit_end	= *ptr++;
+   keyseg->bit_length   = *ptr++;
+   keyseg->flag		= mi_uint2korr(ptr);   ptr+= 2;
+   keyseg->length	= mi_uint2korr(ptr);   ptr+= 2;
+   keyseg->start	= mi_uint4korr(ptr);   ptr+= 4;
+   keyseg->null_pos	= mi_uint4korr(ptr);   ptr+= 4;
+   keyseg->charset=0;				/* Will be filled in later */
+   if (keyseg->null_bit)
+     /*
+       NOTE(review): bit values appear to share the null byte unless
+       null_bit is the last bit (7), in which case they start in the
+       following byte - inferred from this expression; confirm.
+     */
+     keyseg->bit_pos= (uint16)(keyseg->null_pos + (keyseg->null_bit == 7));
+   else
+   {
+     keyseg->bit_pos= (uint16)keyseg->null_pos;
+     keyseg->null_pos= 0;
+   }
+   return ptr;
+}
+
+/*--------------------------------------------------------------------------
+ maria_uniquedef
+---------------------------------------------------------------------------*/
+
+/*
+  Write one MARIA_UNIQUEDEF in packed form (4 bytes incl. 1 spare);
+  decoded by _ma_uniquedef_read().
+  RETURN  0 ok  1 write failed
+*/
+my_bool _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *def)
+{
+  uchar buff[MARIA_UNIQUEDEF_SIZE];
+  uchar *ptr=buff;
+
+  mi_int2store(ptr,def->keysegs);		ptr+=2;
+  *ptr++=  (uchar) def->key;
+  *ptr++ = (uchar) def->null_are_equal;
+
+  return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
+}
+
+/* Decode one MARIA_UNIQUEDEF; returns pointer past the 4 stored bytes */
+uchar *_ma_uniquedef_read(uchar *ptr, MARIA_UNIQUEDEF *def)
+{
+   def->keysegs = mi_uint2korr(ptr);
+   def->key	= ptr[2];
+   def->null_are_equal=ptr[3];
+   return ptr+4;				/* 1 extra uchar */
+}
+
+/***************************************************************************
+** MARIA_COLUMNDEF
+***************************************************************************/
+
+/*
+  Write one MARIA_COLUMNDEF in packed form (incl. 4 reserved bytes);
+  decoded by _ma_columndef_read().
+  RETURN  0 ok  1 write failed
+*/
+my_bool _ma_columndef_write(File file, MARIA_COLUMNDEF *columndef)
+{
+  uchar buff[MARIA_COLUMNDEF_SIZE];
+  uchar *ptr=buff;
+
+  mi_int2store(ptr,(ulong) columndef->column_nr); ptr+= 2;
+  mi_int2store(ptr,(ulong) columndef->offset);	ptr+= 2;
+  mi_int2store(ptr,columndef->type);		ptr+= 2;
+  mi_int2store(ptr,columndef->length);		ptr+= 2;
+  mi_int2store(ptr,columndef->fill_length);	ptr+= 2;
+  mi_int2store(ptr,columndef->null_pos);	ptr+= 2;
+  mi_int2store(ptr,columndef->empty_pos);	ptr+= 2;
+
+  (*ptr++)= columndef->null_bit;
+  (*ptr++)= columndef->empty_bit;
+  ptr[0]= ptr[1]= ptr[2]= ptr[3]= 0;	ptr+= 4;	/* For future */
+  return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
+}
+
+/*
+  Decode one MARIA_COLUMNDEF written by _ma_columndef_write().
+  RETURN  pointer past the decoded bytes (incl. the 4 reserved ones)
+*/
+uchar *_ma_columndef_read(uchar *ptr, MARIA_COLUMNDEF *columndef)
+{
+  columndef->column_nr= mi_uint2korr(ptr);	ptr+= 2;
+  columndef->offset= mi_uint2korr(ptr);	ptr+= 2;
+  columndef->type=   mi_sint2korr(ptr);	ptr+= 2;
+  columndef->length= mi_uint2korr(ptr);	ptr+= 2;
+  columndef->fill_length= mi_uint2korr(ptr);	ptr+= 2;
+  columndef->null_pos= mi_uint2korr(ptr);	ptr+= 2;
+  columndef->empty_pos= mi_uint2korr(ptr);	ptr+= 2;
+  columndef->null_bit=  (uint8) *ptr++;
+  columndef->empty_bit= (uint8) *ptr++;
+  ptr+= 4;                              /* skip 4 reserved bytes */
+  return ptr;
+}
+
+/*
+  Write the column-number -> offset translation table as little-endian
+  uint16 values. Uses a stack (alloca) buffer of columns*2 bytes.
+  RETURN  0 ok  1 allocation or write failed
+*/
+my_bool _ma_column_nr_write(File file, uint16 *offsets, uint columns)
+{
+  uchar *buff, *ptr, *end;
+  size_t size= columns*2;
+  my_bool res;
+
+  if (!(buff= (uchar*) my_alloca(size)))
+    return 1;
+  for (ptr= buff, end= ptr + size; ptr < end ; ptr+= 2, offsets++)
+    int2store(ptr, *offsets);
+  res= my_write(file, buff, size, MYF(MY_NABP)) != 0;
+  my_afree(buff);
+  return res;
+}
+
+
+/*
+  Decode the column-number -> offset table written by
+  _ma_column_nr_write(). RETURN pointer past the columns*2 read bytes.
+*/
+uchar *_ma_column_nr_read(uchar *ptr, uint16 *offsets, uint columns)
+{
+  uchar *end;
+  size_t size= columns*2;
+  for (end= ptr + size; ptr < end ; ptr+=2, offsets++)
+    *offsets= uint2korr(ptr);
+  return ptr;
+}
+
+/**
+ @brief Set callbacks for data pages
+
+ @note
+ We don't use pagecache_file_init here, as we want to keep the
+ code readable
+*/
+
+void _ma_set_data_pagecache_callbacks(PAGECACHE_FILE *file,
+                                      MARIA_SHARE *share)
+{
+  file->callback_data= (uchar*) share;
+  file->flush_log_callback= &maria_flush_log_for_page_none; /* Do nothing */
+
+  if (share->temporary)
+  {
+    /* Temporary tables skip CRC verification and page filling */
+    file->read_callback=  &maria_page_crc_check_none;
+    file->write_callback= &maria_page_filler_set_none;
+  }
+  else
+  {
+    file->read_callback=  &maria_page_crc_check_data;
+    if (share->options & HA_OPTION_PAGE_CHECKSUM)
+      file->write_callback= &maria_page_crc_set_normal;
+    else
+      file->write_callback= &maria_page_filler_set_normal;
+    /*
+      Transactional tables get a real log-flush callback - presumably to
+      satisfy the write-ahead-log rule before a page goes to disk;
+      NOTE(review): confirm against maria_flush_log_for_page.
+    */
+    if (share->now_transactional)
+      file->flush_log_callback= maria_flush_log_for_page;
+  }
+}
+
+
+/**
+ @brief Set callbacks for index pages
+
+ @note
+ We don't use pagecache_file_init here, as we want to keep the
+ code readable
+*/
+
+void _ma_set_index_pagecache_callbacks(PAGECACHE_FILE *file,
+                                       MARIA_SHARE *share)
+{
+  file->callback_data= (uchar*) share;
+  file->flush_log_callback= &maria_flush_log_for_page_none; /* Do nothing */
+  /* Unlike the data-file variant, index pages also get a failure hook */
+  file->write_fail= maria_page_write_failure;
+
+  if (share->temporary)
+  {
+    /* Temporary tables skip CRC verification and page filling */
+    file->read_callback=  &maria_page_crc_check_none;
+    file->write_callback= &maria_page_filler_set_none;
+  }
+  else
+  {
+    file->read_callback=  &maria_page_crc_check_index;
+    if (share->options & HA_OPTION_PAGE_CHECKSUM)
+      file->write_callback= &maria_page_crc_set_index;
+    else
+      file->write_callback= &maria_page_filler_set_normal;
+
+    /* Flush the log before writing a page of a transactional table */
+    if (share->now_transactional)
+      file->flush_log_callback= maria_flush_log_for_page;
+  }
+}
+
+
+/**************************************************************************
+ Open data file
+ We can't use dup() here as the data file descriptors need to have different
+ active seek-positions.
+
+  The argument file_to_dup exists for the future, in case some OS provides a
+  dup()-like call that can give us two independent file descriptors.
+*************************************************************************/
+
+int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share, const char *org_name,
+                      File file_to_dup __attribute__((unused)))
+{
+  char *data_name= share->data_file_name.str;
+  char real_data_name[FN_REFLEN];
+
+  if (org_name)
+  {
+    fn_format(real_data_name, org_name, "", MARIA_NAME_DEXT, 4);
+    if (my_is_symlink(real_data_name))
+    {
+      /* Refuse symlinks that resolve to a disallowed location */
+      if (my_realpath(real_data_name, real_data_name, MYF(0)) ||
+          (*maria_test_invalid_symlink)(real_data_name))
+      {
+        my_errno= HA_WRONG_CREATE_OPTION;
+        return 1;
+      }
+      data_name= real_data_name;
+    }
+  }
+
+  /*
+    Fixed: open the (possibly symlink-resolved) data_name. Opening
+    share->data_file_name.str here made the symlink validation above
+    dead code.
+  */
+  info->dfile.file= share->bitmap.file.file=
+    my_open(data_name, share->mode | O_SHARE,
+            MYF(MY_WME));
+  return info->dfile.file >= 0 ? 0 : 1;
+}
+
+
+/*
+  (Re)open the index (key) file of a share.
+  RETURN  0 ok  non-zero open failed (kfile.file < 0)
+*/
+int _ma_open_keyfile(MARIA_SHARE *share)
+{
+  /*
+    Modifications to share->kfile should be under intern_lock to protect
+    against a concurrent checkpoint.
+  */
+  pthread_mutex_lock(&share->intern_lock);
+  share->kfile.file= my_open(share->unique_file_name.str,
+                             share->mode | O_SHARE,
+                             MYF(MY_WME));
+  pthread_mutex_unlock(&share->intern_lock);
+  return (share->kfile.file < 0);
+}
+
+
+/*
+ Disable all indexes.
+
+ SYNOPSIS
+ maria_disable_indexes()
+ info A pointer to the MARIA storage engine MARIA_HA struct.
+
+ DESCRIPTION
+ Disable all indexes.
+
+ RETURN
+ 0 ok
+*/
+
+int maria_disable_indexes(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+
+  /* Simply mark every key inactive in the share's key map */
+  maria_clear_all_keys_active(share->state.key_map);
+  return 0;
+}
+
+
+/*
+ Enable all indexes
+
+ SYNOPSIS
+ maria_enable_indexes()
+ info A pointer to the MARIA storage engine MARIA_HA struct.
+
+ DESCRIPTION
+ Enable all indexes. The indexes might have been disabled
+ by maria_disable_index() before.
+ The function works only if both data and indexes are empty,
+ otherwise a repair is required.
+ To be sure, call handler::delete_all_rows() before.
+
+ RETURN
+ 0 ok
+ HA_ERR_CRASHED data or index is non-empty.
+*/
+
+int maria_enable_indexes(MARIA_HA *info)
+{
+  int error= 0;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("maria_enable_indexes");
+
+  /*
+    Only allowed when both files are empty. An empty BLOCK_RECORD data
+    file still contains one page (of block_size bytes), hence the
+    format-dependent expected length.
+  */
+  if ((share->state.state.data_file_length !=
+       (share->data_file_type == BLOCK_RECORD ? share->block_size : 0)) ||
+      (share->state.state.key_file_length != share->base.keystart))
+  {
+    DBUG_PRINT("error", ("data_file_length: %lu  key_file_length: %lu",
+                         (ulong) share->state.state.data_file_length,
+                         (ulong) share->state.state.key_file_length));
+    maria_print_error(info->s, HA_ERR_CRASHED);
+    error= HA_ERR_CRASHED;
+  }
+  else
+    maria_set_all_keys_active(share->state.key_map, share->base.keys);
+  DBUG_RETURN(error);
+}
+
+
+/*
+ Test if indexes are disabled.
+
+ SYNOPSIS
+ maria_indexes_are_disabled()
+ info A pointer to the MARIA storage engine MARIA_HA struct.
+
+ DESCRIPTION
+ Test if indexes are disabled.
+
+ RETURN
+ 0 indexes are not disabled
+ 1 all indexes are disabled
+ 2 non-unique indexes are disabled
+*/
+
+int maria_indexes_are_disabled(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+
+  /*
+    No keys or all are enabled. keys is the number of keys. Left shifted
+    gives us only one bit set. When decreased by one, gives us all bits
+    up to this one set and it gets unset.
+  */
+  if (!share->base.keys ||
+      (maria_is_all_keys_active(share->state.key_map, share->base.keys)))
+    return 0;
+
+  /*
+    All are disabled. Fixed: the test must be negated - we return 1 only
+    when NO key is active; the unnegated form wrongly reported the mixed
+    state as "all disabled".
+  */
+  if (! maria_is_any_key_active(share->state.key_map))
+    return 1;
+
+  /*
+    We have keys. Some enabled, some disabled.
+    Don't check for any non-unique disabled but return directly 2
+  */
+  return 2;
+}
+
+
+/*
+  Do-nothing placeholders. NOTE(review): presumably installed in the
+  per-record-format function tables for formats that need no
+  scan/once-init work - confirm against the dispatch tables.
+*/
+static my_bool maria_scan_init_dummy(MARIA_HA *info __attribute__((unused)))
+{
+  return 0;
+}
+
+static void maria_scan_end_dummy(MARIA_HA *info __attribute__((unused)))
+{
+}
+
+static my_bool maria_once_init_dummy(MARIA_SHARE *share
+                                     __attribute__((unused)),
+                                     File dfile __attribute__((unused)))
+{
+  return 0;
+}
+
+static my_bool maria_once_end_dummy(MARIA_SHARE *share __attribute__((unused)))
+{
+  return 0;
+}
diff --git a/storage/maria/ma_packrec.c b/storage/maria/ma_packrec.c
new file mode 100644
index 00000000000..4df00d9bb88
--- /dev/null
+++ b/storage/maria/ma_packrec.c
@@ -0,0 +1,1723 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+ /* Functions to handle compressed records */
+
+#include "maria_def.h"
+
+#define IS_CHAR ((uint) 32768) /* Bit if char (not offset) in tree */
+
+/* Some definitions to keep in sync with maria_pack.c */
+#define HEAD_LENGTH 32 /* Length of fixed header */
+
+#if INT_MAX > 32767
+#define BITS_SAVED 32
+#define MAX_QUICK_TABLE_BITS 9 /* Because we may shift in 24 bits */
+#else
+#define BITS_SAVED 16
+#define MAX_QUICK_TABLE_BITS 6
+#endif
+
+/* Pop one bit from the bit buffer, refilling from the stream when empty */
+#define get_bit(BU) ((BU)->bits ? \
+                     (BU)->current_byte & ((maria_bit_type) 1 << --(BU)->bits) :\
+                     (fill_buffer(BU), (BU)->bits= BITS_SAVED-1,\
+                      (BU)->current_byte & ((maria_bit_type) 1 << (BITS_SAVED-1))))
+/* Discard the remainder of the current byte (round bits down to a multiple of 8) */
+#define skip_to_next_byte(BU)	((BU)->bits&=~7)
+/* Pop 'count' bits; falls back to fill_and_get_bits() when the cache runs dry */
+#define get_bits(BU,count) (((BU)->bits >= count) ? (((BU)->current_byte >> ((BU)->bits-=count)) & mask[count]) : fill_and_get_bits(BU,count))
+
+/* One step of Huffman-tree walking inside decode_bytes(); relies on the
+   caller's local variables low_byte, pos and bits */
+#define decode_bytes_test_bit(bit) \
+  if (low_byte & (1 << (7-bit))) \
+    pos++; \
+  if (*pos & IS_CHAR) \
+  { bits-=(bit+1); break; } \
+  pos+= *pos
+
+/*
+ Size in uint16 of a Huffman tree for uchar compression of 256 uchar values
+*/
+#define OFFSET_TABLE_SIZE 512
+
+static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file,
+ pbool fix_keys);
+static uint read_huff_table(MARIA_BIT_BUFF *bit_buff,
+ MARIA_DECODE_TREE *decode_tree,
+ uint16 **decode_table,uchar **intervall_buff,
+ uint16 *tmp_buff);
+static void make_quick_table(uint16 *to_table,uint16 *decode_table,
+ uint *next_free,uint value,uint bits,
+ uint max_bits);
+static void fill_quick_table(uint16 *table,uint bits, uint max_bits,
+ uint value);
+static uint copy_decode_table(uint16 *to_pos,uint offset,
+ uint16 *decode_table);
+static uint find_longest_bitstream(uint16 *table, uint16 *end);
+static void (*get_unpack_function(MARIA_COLUMNDEF *rec))(MARIA_COLUMNDEF *field,
+ MARIA_BIT_BUFF *buff,
+ uchar *to,
+ uchar *end);
+static void uf_zerofill_skip_zero(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_skip_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_space_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end);
+static void uf_endspace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_space_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec,
+ MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end);
+static void uf_prespace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_space_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_zerofill_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_constant(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_intervall(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end);
+static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end);
+static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *end);
+static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+ uchar *to,uchar *end);
+static uint decode_pos(MARIA_BIT_BUFF *bit_buff,
+ MARIA_DECODE_TREE *decode_tree);
+static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff,uchar *buffer,
+ uint length);
+static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff,uint count);
+static void fill_buffer(MARIA_BIT_BUFF *bit_buff);
+static uint max_bit(uint value);
+static uint read_pack_length(uint version, const uchar *buf, ulong *length);
+#ifdef HAVE_MMAP
+static uchar *_ma_mempack_get_block_info(MARIA_HA *maria,
+ MARIA_BIT_BUFF *bit_buff,
+ MARIA_BLOCK_INFO *info,
+ uchar **rec_buff_p,
+ size_t *rec_buff_size_p,
+ uchar *header);
+#endif
+
+/* mask[i] has the lowest i bits set; indexed by the count in get_bits() */
+static maria_bit_type mask[]=
+{
+   0x00000000,
+   0x00000001, 0x00000003, 0x00000007, 0x0000000f,
+   0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
+   0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
+   0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
+#if BITS_SAVED > 16
+   0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff,
+   0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
+   0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff,
+   0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff,
+#endif
+};
+
+
+/*
+  One-time share init for the compressed (packed) record format.
+  Marks the data read-only and loads the Huffman decode tables;
+  fix_keys is passed when the table was not re-compressed in place.
+  RETURN  0 ok  1 _ma_read_pack_info failed
+*/
+my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile)
+{
+  share->options|= HA_OPTION_READ_ONLY_DATA;
+  return (_ma_read_pack_info(share, dfile,
+                             (pbool)
+                             test(!(share->options &
+                                    (HA_OPTION_PACK_RECORD |
+                                     HA_OPTION_TEMP_COMPRESS_RECORD)))));
+}
+
+
+/*
+  Release the decode trees/tables allocated by _ma_read_pack_info().
+  Both arrays are allocated together, so decode_trees being set implies
+  decode_tables is valid too.
+*/
+my_bool _ma_once_end_pack_row(MARIA_SHARE *share)
+{
+  if (share->decode_trees)
+  {
+    my_free(share->decode_trees,MYF(0));
+    my_free(share->decode_tables,MYF(0));
+  }
+  return 0;
+}
+
+
+/* Read all packed info, allocate memory and fix field structs */
+
+/*
+  Read the compressed-file header, build the Huffman decode trees/tables
+  and hook the per-column unpack functions into share->columndef.
+
+  Allocates two memory segments on the share: decode_trees (+ distinct
+  value buffer) and decode_tables (shrunk with my_realloc once built).
+
+  NOTE(review): on the error paths the freed share->decode_trees /
+  share->decode_tables pointers are not reset to NULL; a later
+  _ma_once_end_pack_row() on the same share would double-free - confirm
+  callers never do that after a failed open.
+
+  RETURN  0 ok  1 error (my_errno set)
+*/
+static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file,
+                                  pbool fix_keys)
+{
+  int diff_length;
+  uint i,trees,huff_tree_bits,rec_reflength,length;
+  uint16 *decode_table,*tmp_buff;
+  ulong elements,intervall_length;
+  uchar *disk_cache;
+  uchar *intervall_buff;
+  uchar header[HEAD_LENGTH];
+  MARIA_BIT_BUFF bit_buff;
+  DBUG_ENTER("_ma_read_pack_info");
+
+  /* Clamp the quick-table size to the supported range */
+  if (maria_quick_table_bits < 4)
+    maria_quick_table_bits=4;
+  else if (maria_quick_table_bits > MAX_QUICK_TABLE_BITS)
+    maria_quick_table_bits=MAX_QUICK_TABLE_BITS;
+
+  my_errno=0;
+  if (my_read(file, header, sizeof(header), MYF(MY_NABP)))
+  {
+    if (!my_errno)
+      my_errno=HA_ERR_END_OF_FILE;
+    goto err0;
+  }
+  /* Only the first three bytes of magic number are independent of version. */
+  if (memcmp(header, maria_pack_file_magic, 3))
+  {
+    my_errno=HA_ERR_WRONG_IN_RECORD;
+    goto err0;
+  }
+  share->pack.version= header[3]; /* fourth uchar of magic number */
+  share->pack.header_length=	uint4korr(header+4);
+  share->min_pack_length=(uint) uint4korr(header+8);
+  share->max_pack_length=(uint) uint4korr(header+12);
+  set_if_bigger(share->base.default_rec_buff_size,
+                share->max_pack_length + 7);
+  elements=uint4korr(header+16);
+  intervall_length=uint4korr(header+20);
+  trees=uint2korr(header+24);
+  share->pack.ref_length=header[26];
+  rec_reflength=header[27];
+  diff_length=(int) rec_reflength - (int) share->base.rec_reflength;
+  if (fix_keys)
+    share->rec_reflength=rec_reflength;
+  DBUG_PRINT("info", ("fixed header length:   %u", HEAD_LENGTH));
+  DBUG_PRINT("info", ("total header length:   %lu", share->pack.header_length));
+  DBUG_PRINT("info", ("pack file version:     %u", share->pack.version));
+  DBUG_PRINT("info", ("min pack length:       %lu", share->min_pack_length));
+  DBUG_PRINT("info", ("max pack length:       %lu", share->max_pack_length));
+  DBUG_PRINT("info", ("elements of all trees: %lu", elements));
+  DBUG_PRINT("info", ("distinct values bytes: %lu", intervall_length));
+  DBUG_PRINT("info", ("number of code trees:  %u", trees));
+  DBUG_PRINT("info", ("bytes for record lgt:  %u", share->pack.ref_length));
+  DBUG_PRINT("info", ("record pointer length: %u", rec_reflength));
+
+
+  /*
+    Memory segment #1:
+    - Decode tree heads
+    - Distinct column values
+  */
+  if (!(share->decode_trees=(MARIA_DECODE_TREE*)
+	my_malloc((uint) (trees*sizeof(MARIA_DECODE_TREE)+
+			  intervall_length*sizeof(uchar)),
+		  MYF(MY_WME))))
+    goto err0;
+  intervall_buff=(uchar*) (share->decode_trees+trees);
+
+  /*
+    Memory segment #2:
+    - Decode tables
+    - Quick decode tables
+    - Temporary decode table
+    - Compressed data file header cache
+    This segment will be reallocated after construction of the tables.
+  */
+  length=(uint) (elements*2+trees*(1 << maria_quick_table_bits));
+  if (!(share->decode_tables=(uint16*)
+        my_malloc((length+OFFSET_TABLE_SIZE)*sizeof(uint16)+
+                  (uint) (share->pack.header_length - sizeof(header)) +
+                  share->base.extra_rec_buff_size,
+                  MYF(MY_WME | MY_ZEROFILL))))
+    goto err1;
+  tmp_buff=share->decode_tables+length;
+  disk_cache=(uchar*) (tmp_buff+OFFSET_TABLE_SIZE);
+
+  if (my_read(file,disk_cache,
+	      (uint) (share->pack.header_length-sizeof(header)),
+	      MYF(MY_NABP)))
+    goto err2;
+#ifdef HAVE_valgrind
+  /* Zero bytes accessed by fill_buffer */
+  bzero(disk_cache + (share->pack.header_length-sizeof(header)),
+        share->base.extra_rec_buff_size);
+#endif
+
+  huff_tree_bits=max_bit(trees ? trees-1 : 0);
+  init_bit_buffer(&bit_buff, disk_cache,
+                  (uint) (share->pack.header_length-sizeof(header)));
+  /* Read new info for each field */
+  for (i=0 ; i < share->base.fields ; i++)
+  {
+    share->columndef[i].base_type=(enum en_fieldtype) get_bits(&bit_buff,5);
+    share->columndef[i].pack_type=(uint) get_bits(&bit_buff,6);
+    share->columndef[i].space_length_bits=get_bits(&bit_buff,5);
+    share->columndef[i].huff_tree=share->decode_trees+(uint) get_bits(&bit_buff,
+                                                                huff_tree_bits);
+    share->columndef[i].unpack= get_unpack_function(share->columndef + i);
+    DBUG_PRINT("info", ("col: %2u  type: %2u  pack: %u  slbits: %2u",
+                        i, share->columndef[i].base_type,
+                        share->columndef[i].pack_type,
+                        share->columndef[i].space_length_bits));
+  }
+  skip_to_next_byte(&bit_buff);
+  /*
+    Construct the decoding tables from the file header. Keep track of
+    the used memory.
+  */
+  decode_table=share->decode_tables;
+  for (i=0 ; i < trees ; i++)
+    if (read_huff_table(&bit_buff,share->decode_trees+i,&decode_table,
+                        &intervall_buff,tmp_buff))
+      goto err3;
+  /* Reallocate the decoding tables to the used size. */
+  decode_table=(uint16*)
+    my_realloc((uchar*) share->decode_tables,
+	       (uint) ((uchar*) decode_table - (uchar*) share->decode_tables),
+	       MYF(MY_HOLD_ON_ERROR));
+  /* Fix the table addresses in the tree heads. */
+  {
+    my_ptrdiff_t diff= PTR_BYTE_DIFF(decode_table,share->decode_tables);
+    share->decode_tables=decode_table;
+    for (i=0 ; i < trees ; i++)
+      share->decode_trees[i].table=ADD_TO_PTR(share->decode_trees[i].table,
+                                              diff, uint16*);
+  }
+
+  /* Fix record-ref-length for keys */
+  if (fix_keys)
+  {
+    for (i=0 ; i < share->base.keys ; i++)
+    {
+      MARIA_KEYDEF *keyinfo= &share->keyinfo[i];
+      keyinfo->keylength+= (uint16) diff_length;
+      keyinfo->minlength+= (uint16) diff_length;
+      keyinfo->maxlength+= (uint16) diff_length;
+      keyinfo->seg[keyinfo->flag & HA_FULLTEXT ?
+                   FT_SEGS : keyinfo->keysegs].length= (uint16) rec_reflength;
+    }
+    if (share->ft2_keyinfo.seg)
+    {
+      MARIA_KEYDEF *ft2_keyinfo= &share->ft2_keyinfo;
+      ft2_keyinfo->keylength+= (uint16) diff_length;
+      ft2_keyinfo->minlength+= (uint16) diff_length;
+      ft2_keyinfo->maxlength+= (uint16) diff_length;
+    }
+  }
+
+  /* The whole header must have been consumed without bit-buffer errors */
+  if (bit_buff.error || bit_buff.pos < bit_buff.end)
+    goto err3;
+
+  DBUG_RETURN(0);
+
+err3:
+  my_errno=HA_ERR_WRONG_IN_RECORD;
+err2:
+  my_free(share->decode_tables, MYF(0));
+err1:
+  my_free(share->decode_trees, MYF(0));
+err0:
+  DBUG_RETURN(1);
+}
+
+
+/*
+ Read a huff-code-table from datafile.
+
+ SYNOPSIS
+ read_huff_table()
+ bit_buff Bit buffer pointing at start of the
+ decoding table in the file header cache.
+ decode_tree Pointer to the decode tree head.
+ decode_table IN/OUT Address of a pointer to the next free space.
+ intervall_buff IN/OUT Address of a pointer to the next unused values.
+ tmp_buff Buffer for temporary extraction of a full
+ decoding table as read from bit_buff.
+
+ RETURN
+ 0 OK.
+ 1 Error.
+*/
+static uint read_huff_table(MARIA_BIT_BUFF *bit_buff,
+                            MARIA_DECODE_TREE *decode_tree,
+                            uint16 **decode_table, uchar **intervall_buff,
+                            uint16 *tmp_buff)
+{
+  uint min_chr,elements,char_bits,offset_bits,size,intervall_length,table_bits,
+  next_free_offset;
+  uint16 *ptr,*end;
+  DBUG_ENTER("read_huff_table");
+
+  if (!get_bits(bit_buff,1))
+  {
+    /* Byte value compression. */
+    min_chr=get_bits(bit_buff,8);
+    elements=get_bits(bit_buff,9);
+    char_bits=get_bits(bit_buff,5);
+    offset_bits=get_bits(bit_buff,5);
+    intervall_length=0;
+    /* Fixed: this assignment was accidentally duplicated */
+    ptr=tmp_buff;
+    DBUG_PRINT("info", ("byte value compression"));
+    DBUG_PRINT("info", ("minimum uchar value:   %u", min_chr));
+    DBUG_PRINT("info", ("number of tree nodes:  %u", elements));
+    DBUG_PRINT("info", ("bits for values:       %u", char_bits));
+    DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits));
+    if (elements > 256)
+    {
+      DBUG_PRINT("error", ("ERROR: illegal number of tree elements: %u",
+                           elements));
+      DBUG_RETURN(1);
+    }
+  }
+  else
+  {
+    /* Distinct column value compression. */
+    min_chr=0;
+    elements=get_bits(bit_buff,15);
+    intervall_length=get_bits(bit_buff,16);
+    char_bits=get_bits(bit_buff,5);
+    offset_bits=get_bits(bit_buff,5);
+    decode_tree->quick_table_bits=0;
+    ptr= *decode_table;
+    DBUG_PRINT("info", ("distinct column value compression"));
+    DBUG_PRINT("info", ("number of tree nodes:  %u", elements));
+    DBUG_PRINT("info", ("value buffer length:   %u", intervall_length));
+    DBUG_PRINT("info", ("bits for value index:  %u", char_bits));
+    DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits));
+  }
+  size=elements*2-2;
+  DBUG_PRINT("info", ("tree size in uint16:   %u", size));
+  DBUG_PRINT("info", ("tree size in bytes:    %u",
+                      size * (uint) sizeof(uint16)));
+
+  for (end=ptr+size ; ptr < end ; ptr++)
+  {
+    if (get_bit(bit_buff))
+    {
+      *ptr= (uint16) get_bits(bit_buff,offset_bits);
+      if ((ptr + *ptr >= end) || !*ptr)
+      {
+        DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree"));
+        DBUG_RETURN(1);
+      }
+    }
+    else
+      *ptr= (uint16) (IS_CHAR + (get_bits(bit_buff,char_bits) + min_chr));
+  }
+  skip_to_next_byte(bit_buff);
+
+  decode_tree->table= *decode_table;
+  decode_tree->intervalls= *intervall_buff;
+  if (! intervall_length)
+  {
+    /* Byte value compression. ptr started from tmp_buff. */
+    /* Find longest Huffman code from begin to end of tree in bits. */
+    table_bits= find_longest_bitstream(tmp_buff, ptr);
+    if (table_bits >= OFFSET_TABLE_SIZE)
+      DBUG_RETURN(1);
+    if (table_bits > maria_quick_table_bits)
+      table_bits=maria_quick_table_bits;
+    DBUG_PRINT("info", ("table bits:            %u", table_bits));
+
+    next_free_offset= (1 << table_bits);
+    make_quick_table(*decode_table,tmp_buff,&next_free_offset,0,table_bits,
+                     table_bits);
+    (*decode_table)+= next_free_offset;
+    decode_tree->quick_table_bits=table_bits;
+  }
+  else
+  {
+    /* Distinct column value compression. ptr started from *decode_table */
+    (*decode_table)=end;
+    /*
+      get_bits() moves some bytes to a cache buffer in advance. May need
+      to step back.
+    */
+    bit_buff->pos-= bit_buff->bits/8;
+    /* Copy the distinct column values from the buffer. */
+    memcpy(*intervall_buff,bit_buff->pos,(size_t) intervall_length);
+    (*intervall_buff)+=intervall_length;
+    bit_buff->pos+=intervall_length;
+    bit_buff->bits=0;
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+ Make a quick_table for faster decoding.
+
+ SYNOPSIS
+ make_quick_table()
+ to_table Target quick_table and remaining decode table.
+ decode_table Source Huffman (sub-)tree within tmp_buff.
+ next_free_offset IN/OUT Next free offset from to_table.
+ Starts behind quick_table on the top-level.
+ value Huffman bits found so far.
+ bits Remaining bits to be collected.
+ max_bits Total number of bits to collect (table_bits).
+
+ DESCRIPTION
+
+ The quick table is an array of 16-bit values. There exists one value
+ for each possible code representable by max_bits (table_bits) bits.
+ In most cases table_bits is 9. So there are 512 16-bit values.
+
+ If the high-order bit (16) is set (IS_CHAR) then the array slot for
+ this value is a valid Huffman code for a resulting uchar value.
+
+ The low-order 8 bits (1..8) are the resulting uchar value.
+
+ Bits 9..14 are the length of the Huffman code for this uchar value.
+ This means so many bits from the input stream were needed to
+ represent this uchar value. The remaining bits belong to later
+ Huffman codes. This also means that for every Huffman code shorter
+    than table_bits there are multiple entries in the array, which
+ differ just in the unused bits.
+
+ If the high-order bit (16) is clear (0) then the remaining bits are
+ the position of the remaining Huffman decode tree segment behind the
+ quick table.
+
+ RETURN
+ void
+*/
+
+static void make_quick_table(uint16 *to_table, uint16 *decode_table,
+                             uint *next_free_offset, uint value, uint bits,
+                             uint max_bits)
+{
+  DBUG_ENTER("make_quick_table");
+
+  /*
+    When we have descended max_bits levels, the quick table cannot
+    resolve the code any further; copy the rest of the Huffman
+    (sub-)tree verbatim behind the quick table.
+  */
+  if (!bits--)
+  {
+    /*
+      Remaining left Huffman tree segment starts behind quick table.
+      Remaining right Huffman tree segment starts behind left segment.
+    */
+    to_table[value]= (uint16) *next_free_offset;
+    /*
+      Re-construct the remaining Huffman tree segment at
+      next_free_offset in to_table.
+    */
+    *next_free_offset=copy_decode_table(to_table, *next_free_offset,
+                                        decode_table);
+    DBUG_VOID_RETURN;
+  }
+
+  /* Descend on the left side. Left side bits are clear (0). */
+  if (!(*decode_table & IS_CHAR))
+  {
+    /* Not a leaf. Follow the pointer. */
+    make_quick_table(to_table,decode_table+ *decode_table,
+                     next_free_offset,value,bits,max_bits);
+  }
+  else
+  {
+    /*
+      A leaf. A Huffman code is complete. Fill the quick_table
+      array for all possible bit strings starting with this Huffman
+      code.
+    */
+    fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table);
+  }
+
+  /* Descend on the right side. Right side bits are set (1). */
+  decode_table++;
+  value|= (1 << bits);
+  if (!(*decode_table & IS_CHAR))
+  {
+    /* Not a leaf. Follow the pointer. */
+    make_quick_table(to_table,decode_table+ *decode_table,
+                     next_free_offset,value,bits,max_bits);
+  }
+  else
+  {
+    /*
+      A leaf. A Huffman code is complete. Fill the quick_table
+      array for all possible bit strings starting with this Huffman
+      code.
+    */
+    fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table);
+  }
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+ Fill quick_table for all possible values starting with this Huffman code.
+
+ SYNOPSIS
+ fill_quick_table()
+ table Target quick_table position.
+ bits Unused bits from max_bits.
+ max_bits Total number of bits to collect (table_bits).
+ value The uchar encoded by the found Huffman code.
+
+ DESCRIPTION
+
+ Fill the segment (all slots) of the quick_table array with the
+ resulting value for the found Huffman code. There are as many slots
+ as there are combinations representable by the unused bits.
+
+ In most cases we use 9 table bits. Assume a 3-bit Huffman code. Then
+ there are 6 unused bits. Hence we fill 2**6 = 64 slots with the
+ value.
+
+ RETURN
+ void
+*/
+
+static void fill_quick_table(uint16 *table, uint bits, uint max_bits,
+                             uint value)
+{
+  uint16 *slot, *table_end;
+  uint16 entry;
+  DBUG_ENTER("fill_quick_table");
+
+  /*
+    Bits 1..8 of the entry hold the decoded uchar value.
+    Bits 9..14 hold the length of the Huffman code for this value.
+    Bit 16 flags a valid code (IS_CHAR).
+  */
+  entry= (uint16) (value | ((max_bits - bits) << 8) | IS_CHAR);
+
+  /* One slot per combination of the 'bits' unused low-order bits. */
+  table_end= table + ((my_ptrdiff_t) 1 << bits);
+  for (slot= table; slot < table_end; slot++)
+    *slot= entry;
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+ Reconstruct a decode subtree at the target position.
+
+ SYNOPSIS
+ copy_decode_table()
+ to_pos Target quick_table and remaining decode table.
+ offset Next free offset from to_pos.
+ decode_table Source Huffman subtree within tmp_buff.
+
+ NOTE
+ Pointers in the decode tree are relative to the pointers position.
+
+ RETURN
+ next free offset from to_pos.
+*/
+
+static uint copy_decode_table(uint16 *to_pos, uint offset,
+                              uint16 *decode_table)
+{
+  /* Remember where this node pair starts; the right slot is filled last. */
+  uint prev_offset= offset;
+  DBUG_ENTER("copy_decode_table");
+
+  /* Descend on the left side. */
+  if (!(*decode_table & IS_CHAR))
+  {
+    /* Set a relative pointer to the next target node (right behind us). */
+    to_pos[offset]=2;
+    /* Copy the left hand subtree there. */
+    offset=copy_decode_table(to_pos,offset+2,decode_table+ *decode_table);
+  }
+  else
+  {
+    /* Copy the uchar value. */
+    to_pos[offset]= *decode_table;
+    /* Step behind this node. */
+    offset+=2;
+  }
+
+  /* Descend on the right side. */
+  decode_table++;
+  if (!(*decode_table & IS_CHAR))
+  {
+    /* Set a relative pointer to the next free target node. */
+    to_pos[prev_offset+1]=(uint16) (offset-prev_offset-1);
+    /* Copy the right hand subtree to the entry of that node. */
+    offset=copy_decode_table(to_pos,offset,decode_table+ *decode_table);
+  }
+  else
+  {
+    /* Copy the uchar value. */
+    to_pos[prev_offset+1]= *decode_table;
+  }
+  DBUG_RETURN(offset);
+}
+
+
+/*
+ Find the length of the longest Huffman code in this table in bits.
+
+ SYNOPSIS
+ find_longest_bitstream()
+ table Code (sub-)table start.
+ end End of code table.
+
+ IMPLEMENTATION
+
+ Recursively follow the branch(es) of the code pair on every level of
+ the tree until two uchar values (and no branch) are found. Add one to
+ each level when returning back from each recursion stage.
+
+ 'end' is used for error checking only. A clean tree terminates
+ before reaching 'end'. Hence the exact value of 'end' is not too
+ important. However having it higher than necessary could lead to
+ misbehaviour should 'next' jump into the dirty area.
+
+ RETURN
+ length Length of longest Huffman code in bits.
+ >= OFFSET_TABLE_SIZE Error, broken tree. It does not end before 'end'.
+*/
+
+static uint find_longest_bitstream(uint16 *table, uint16 *end)
+{
+  uint length=1;
+  uint length2;
+
+  /*
+    Left branch. A tree node occupies two uint16 slots, so any in-tree
+    pointer must stay strictly below 'end' (the same bound that
+    read_huff_table enforces with 'ptr + *ptr >= end'); 'next == end'
+    would read past the table. A zero offset (next == table) would
+    recurse forever.
+  */
+  if (!(*table & IS_CHAR))
+  {
+    uint16 *next= table + *table;
+    if (next >= end || next == table)
+    {
+      DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree"));
+      return OFFSET_TABLE_SIZE;
+    }
+    length=find_longest_bitstream(next, end)+1;
+  }
+  /* Right branch is stored in the following slot. */
+  table++;
+  if (!(*table & IS_CHAR))
+  {
+    uint16 *next= table + *table;
+    if (next >= end || next == table)
+    {
+      DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree"));
+      return OFFSET_TABLE_SIZE;
+    }
+    length2= find_longest_bitstream(next, end) + 1;
+    length=max(length,length2);
+  }
+  return length;
+}
+
+
+/*
+ Read record from datafile.
+
+ SYNOPSIS
+ _ma_read_pack_record()
+ info A pointer to MARIA_HA.
+ filepos File offset of the record.
+ buf RETURN The buffer to receive the record.
+
+ RETURN
+ 0 On success
+ # Error number
+*/
+
+int _ma_read_pack_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos)
+{
+  MARIA_BLOCK_INFO block_info;
+  File file;
+  /* Trace tag now matches the function name. */
+  DBUG_ENTER("_ma_read_pack_record");
+
+  if (filepos == HA_OFFSET_ERROR)
+    DBUG_RETURN(my_errno);          /* _search() didn't find record */
+
+  file= info->dfile.file;
+  /* Parse the record header; (re-)allocates rec_buff as needed. */
+  if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info,
+                              &info->rec_buff, &info->rec_buff_size, file,
+                              filepos))
+    goto err;
+  /* The header read already copied block_info.offset bytes of the body. */
+  if (my_read(file, info->rec_buff + block_info.offset ,
+              block_info.rec_len - block_info.offset, MYF(MY_NABP)))
+    goto panic;
+  info->update|= HA_STATE_AKTIV;
+  DBUG_RETURN(_ma_pack_rec_unpack(info,&info->bit_buff, buf,
+                                  info->rec_buff, block_info.rec_len));
+panic:
+  my_errno=HA_ERR_WRONG_IN_RECORD;
+err:
+  DBUG_RETURN(my_errno);
+}
+
+
+
+int _ma_pack_rec_unpack(register MARIA_HA *info, MARIA_BIT_BUFF *bit_buff,
+                        register uchar *to, uchar *from, ulong reclength)
+{
+  uchar *end_field;
+  reg3 MARIA_COLUMNDEF *end;
+  MARIA_COLUMNDEF *current_field;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_pack_rec_unpack");
+
+  /*
+    The null bitmap is stored unpacked in front of the compressed data.
+    Use the fetched 'share' consistently instead of re-reading info->s.
+  */
+  if (share->base.null_bytes)
+  {
+    memcpy(to, from, share->base.null_bytes);
+    to+= share->base.null_bytes;
+    from+= share->base.null_bytes;
+    reclength-= share->base.null_bytes;
+  }
+  init_bit_buffer(bit_buff, from, reclength);
+  /* Unpack every column with its type-specific unpack function. */
+  for (current_field=share->columndef, end=current_field+share->base.fields ;
+       current_field < end ;
+       current_field++,to=end_field)
+  {
+    end_field=to+current_field->length;
+    (*current_field->unpack)(current_field, bit_buff, to, end_field);
+  }
+  /* The bit stream must end exactly at the end of the packed record. */
+  if (!bit_buff->error &&
+      bit_buff->pos - bit_buff->bits / 8 == bit_buff->end)
+    DBUG_RETURN(0);
+  info->update&= ~HA_STATE_AKTIV;
+  DBUG_RETURN(my_errno=HA_ERR_WRONG_IN_RECORD);
+} /* _ma_pack_rec_unpack */
+
+
+ /* Return function to unpack field */
+
+/*
+  Return the unpack function matching the column base type and its
+  pack flags (set by the packer for space/zero stripped columns).
+*/
+
+static void (*get_unpack_function(MARIA_COLUMNDEF *rec))
+  (MARIA_COLUMNDEF *, MARIA_BIT_BUFF *, uchar *, uchar *)
+{
+  switch (rec->base_type) {
+  case FIELD_SKIP_ZERO:
+    if (rec->pack_type & PACK_TYPE_ZERO_FILL)
+      return &uf_zerofill_skip_zero;
+    return &uf_skip_zero;
+  case FIELD_NORMAL:
+    if (rec->pack_type & PACK_TYPE_SPACE_FIELDS)
+      return &uf_space_normal;
+    if (rec->pack_type & PACK_TYPE_ZERO_FILL)
+      return &uf_zerofill_normal;
+    return &decode_bytes;
+  case FIELD_SKIP_ENDSPACE:
+    if (rec->pack_type & PACK_TYPE_SPACE_FIELDS)
+    {
+      if (rec->pack_type & PACK_TYPE_SELECTED)
+        return &uf_space_endspace_selected;
+      return &uf_space_endspace;
+    }
+    if (rec->pack_type & PACK_TYPE_SELECTED)
+      return &uf_endspace_selected;
+    return &uf_endspace;
+  case FIELD_SKIP_PRESPACE:
+    if (rec->pack_type & PACK_TYPE_SPACE_FIELDS)
+    {
+      if (rec->pack_type & PACK_TYPE_SELECTED)
+        return &uf_space_prespace_selected;
+      return &uf_space_prespace;
+    }
+    if (rec->pack_type & PACK_TYPE_SELECTED)
+      return &uf_prespace_selected;
+    return &uf_prespace;
+  case FIELD_CONSTANT:
+    return &uf_constant;
+  case FIELD_INTERVALL:
+    return &uf_intervall;
+  case FIELD_ZERO:
+  case FIELD_CHECK:
+    return &uf_zero;
+  case FIELD_BLOB:
+    return &uf_blob;
+  case FIELD_VARCHAR:
+    if (rec->length <= 256)                     /* 255 + 1 uchar length */
+      return &uf_varchar1;
+    return &uf_varchar2;
+  case FIELD_LAST:
+  default:
+    return 0;                                   /* This should never happen */
+  }
+}
+
+ /* The different functions to unpack a field */
+
+static void uf_zerofill_skip_zero(MARIA_COLUMNDEF *rec,
+                                  MARIA_BIT_BUFF *bit_buff,
+                                  uchar *to, uchar *end)
+{
+  /* A set bit flags an all-zero field. */
+  if (!get_bit(bit_buff))
+  {
+    /* Trailing zero-fill bytes were stripped by the packer. */
+    uchar *pack_end= end - rec->space_length_bits;
+    decode_bytes(rec, bit_buff, to, pack_end);
+    bzero((char*) pack_end, rec->space_length_bits);
+  }
+  else
+    bzero((char*) to, (uint) (end - to));
+}
+
+static void uf_skip_zero(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                         uchar *to, uchar *end)
+{
+  /* A set bit flags an all-zero field; otherwise decode the bytes. */
+  if (!get_bit(bit_buff))
+    decode_bytes(rec, bit_buff, to, end);
+  else
+    bzero((char*) to, (uint) (end - to));
+}
+
+static void uf_space_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                            uchar *to, uchar *end)
+{
+  /* A set bit flags an all-space field; otherwise decode the bytes. */
+  if (!get_bit(bit_buff))
+    decode_bytes(rec, bit_buff, to, end);
+  else
+    bfill(to, (end - to), ' ');
+}
+
+static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec,
+                                       MARIA_BIT_BUFF *bit_buff,
+                                       uchar *to, uchar *end)
+{
+  uint n_spaces;
+
+  /* First bit: the whole field is spaces. */
+  if (get_bit(bit_buff))
+  {
+    bfill(to, (end - to), ' ');
+    return;
+  }
+  /* Second bit: was a trailing-space run packed away? */
+  if (!get_bit(bit_buff))
+  {
+    decode_bytes(rec, bit_buff, to, end);
+    return;
+  }
+  n_spaces= get_bits(bit_buff, rec->space_length_bits);
+  if (to + n_spaces > end)
+  {
+    bit_buff->error= 1;                         /* Corrupted data */
+    return;
+  }
+  if (to + n_spaces != end)
+    decode_bytes(rec, bit_buff, to, end - n_spaces);
+  bfill(end - n_spaces, n_spaces, ' ');
+}
+
+static void uf_endspace_selected(MARIA_COLUMNDEF *rec,
+                                 MARIA_BIT_BUFF *bit_buff,
+                                 uchar *to, uchar *end)
+{
+  uint n_spaces;
+
+  /* A clear bit means no trailing-space run was packed away. */
+  if (!get_bit(bit_buff))
+  {
+    decode_bytes(rec, bit_buff, to, end);
+    return;
+  }
+  n_spaces= get_bits(bit_buff, rec->space_length_bits);
+  if (to + n_spaces > end)
+  {
+    bit_buff->error= 1;                         /* Corrupted data */
+    return;
+  }
+  if (to + n_spaces != end)
+    decode_bytes(rec, bit_buff, to, end - n_spaces);
+  bfill(end - n_spaces, n_spaces, ' ');
+}
+
+static void uf_space_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                              uchar *to, uchar *end)
+{
+  uint n_spaces;
+
+  /* A set bit flags an all-space field. */
+  if (get_bit(bit_buff))
+  {
+    bfill(to, (end - to), ' ');
+    return;
+  }
+  /* Otherwise a trailing-space count always follows. */
+  n_spaces= get_bits(bit_buff, rec->space_length_bits);
+  if (to + n_spaces > end)
+  {
+    bit_buff->error= 1;                         /* Corrupted data */
+    return;
+  }
+  if (to + n_spaces != end)
+    decode_bytes(rec, bit_buff, to, end - n_spaces);
+  bfill(end - n_spaces, n_spaces, ' ');
+}
+
+static void uf_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                        uchar *to, uchar *end)
+{
+  /* A trailing-space count always precedes the packed bytes. */
+  uint n_spaces= get_bits(bit_buff, rec->space_length_bits);
+
+  if (to + n_spaces > end)
+  {
+    bit_buff->error= 1;                         /* Corrupted data */
+    return;
+  }
+  if (to + n_spaces != end)
+    decode_bytes(rec, bit_buff, to, end - n_spaces);
+  bfill(end - n_spaces, n_spaces, ' ');
+}
+
+static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec,
+                                       MARIA_BIT_BUFF *bit_buff,
+                                       uchar *to, uchar *end)
+{
+  uint n_spaces;
+
+  /* First bit: the whole field is spaces. */
+  if (get_bit(bit_buff))
+  {
+    bfill(to, (end - to), ' ');
+    return;
+  }
+  /* Second bit: was a leading-space run packed away? */
+  if (!get_bit(bit_buff))
+  {
+    decode_bytes(rec, bit_buff, to, end);
+    return;
+  }
+  n_spaces= get_bits(bit_buff, rec->space_length_bits);
+  if (to + n_spaces > end)
+  {
+    bit_buff->error= 1;                         /* Corrupted data */
+    return;
+  }
+  bfill(to, n_spaces, ' ');
+  if (to + n_spaces != end)
+    decode_bytes(rec, bit_buff, to + n_spaces, end);
+}
+
+
+static void uf_prespace_selected(MARIA_COLUMNDEF *rec,
+                                 MARIA_BIT_BUFF *bit_buff,
+                                 uchar *to, uchar *end)
+{
+  uint n_spaces;
+
+  /* A clear bit means no leading-space run was packed away. */
+  if (!get_bit(bit_buff))
+  {
+    decode_bytes(rec, bit_buff, to, end);
+    return;
+  }
+  n_spaces= get_bits(bit_buff, rec->space_length_bits);
+  if (to + n_spaces > end)
+  {
+    bit_buff->error= 1;                         /* Corrupted data */
+    return;
+  }
+  bfill(to, n_spaces, ' ');
+  if (to + n_spaces != end)
+    decode_bytes(rec, bit_buff, to + n_spaces, end);
+}
+
+
+static void uf_space_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                              uchar *to, uchar *end)
+{
+  uint n_spaces;
+
+  /* A set bit flags an all-space field. */
+  if (get_bit(bit_buff))
+  {
+    bfill(to, (end - to), ' ');
+    return;
+  }
+  /* Otherwise a leading-space count always follows. */
+  n_spaces= get_bits(bit_buff, rec->space_length_bits);
+  if (to + n_spaces > end)
+  {
+    bit_buff->error= 1;                         /* Corrupted data */
+    return;
+  }
+  bfill(to, n_spaces, ' ');
+  if (to + n_spaces != end)
+    decode_bytes(rec, bit_buff, to + n_spaces, end);
+}
+
+static void uf_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                        uchar *to, uchar *end)
+{
+  /* A leading-space count always precedes the packed bytes. */
+  uint n_spaces= get_bits(bit_buff, rec->space_length_bits);
+
+  if (to + n_spaces > end)
+  {
+    bit_buff->error= 1;                         /* Corrupted data */
+    return;
+  }
+  bfill(to, n_spaces, ' ');
+  if (to + n_spaces != end)
+    decode_bytes(rec, bit_buff, to + n_spaces, end);
+}
+
+static void uf_zerofill_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                               uchar *to, uchar *end)
+{
+  /* Trailing zero-fill bytes were stripped by the packer. */
+  uchar *pack_end= end - rec->space_length_bits;
+
+  decode_bytes(rec, bit_buff, to, pack_end);
+  bzero((char*) pack_end, rec->space_length_bits);
+}
+
+/* Constant column: nothing is in the stream; copy the stored value. */
+static void uf_constant(MARIA_COLUMNDEF *rec,
+                        MARIA_BIT_BUFF *bit_buff __attribute__((unused)),
+                        uchar *to, uchar *end)
+{
+  memcpy(to,rec->huff_tree->intervalls,(size_t) (end-to));
+}
+
+static void uf_intervall(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                         uchar *to,
+                         uchar *end)
+{
+  /* Decode an index and copy the matching distinct column value. */
+  uint field_length= (uint) (end - to);
+  uint value_index= decode_pos(bit_buff, rec->huff_tree);
+
+  memcpy(to, rec->huff_tree->intervalls + field_length * value_index,
+         (size_t) field_length);
+}
+
+
+/*ARGSUSED*/
+/* FIELD_ZERO / FIELD_CHECK: nothing is stored; the field is all zero. */
+static void uf_zero(MARIA_COLUMNDEF *rec __attribute__((unused)),
+                    MARIA_BIT_BUFF *bit_buff __attribute__((unused)),
+                    uchar *to, uchar *end)
+{
+  bzero(to, (uint) (end-to));
+}
+
+static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                    uchar *to, uchar *end)
+{
+  /* A set bit flags an empty blob. */
+  if (get_bit(bit_buff))
+    bzero(to, (uint) (end-to));
+  else
+  {
+    ulong length=get_bits(bit_buff,rec->space_length_bits);
+    uint pack_length=(uint) (end-to)-portable_sizeof_char_ptr;
+    /* The blob body is decoded into the shared blob buffer. */
+    if (bit_buff->blob_pos+length > bit_buff->blob_end)
+    {
+      bit_buff->error=1;                /* Corrupted data */
+      bzero(to, (end-to));
+      return;
+    }
+    decode_bytes(rec, bit_buff, bit_buff->blob_pos,
+                 bit_buff->blob_pos + length);
+    /* Store the length followed by a pointer into the blob buffer. */
+    _ma_store_blob_length(to, pack_length, length);
+    memcpy_fixed((uchar*) to+pack_length,(uchar*) &bit_buff->blob_pos,
+                 sizeof(uchar*));
+    bit_buff->blob_pos+=length;
+  }
+}
+
+
+static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                        uchar *to, uchar *end __attribute__((unused)))
+{
+  ulong char_count;
+
+  /* A set bit flags an empty string. */
+  if (get_bit(bit_buff))
+  {
+    to[0]= 0;                           /* Zero length */
+    return;
+  }
+  char_count= get_bits(bit_buff, rec->space_length_bits);
+  *to= (char) char_count;               /* 1-byte length prefix */
+  decode_bytes(rec, bit_buff, to + 1, to + 1 + char_count);
+}
+
+
+static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                        uchar *to, uchar *end __attribute__((unused)))
+{
+  ulong char_count;
+
+  /* A set bit flags an empty string. */
+  if (get_bit(bit_buff))
+  {
+    to[0]= to[1]= 0;                    /* Zero length */
+    return;
+  }
+  char_count= get_bits(bit_buff, rec->space_length_bits);
+  int2store(to, char_count);            /* 2-byte length prefix */
+  decode_bytes(rec, bit_buff, to + 2, to + 2 + char_count);
+}
+
+ /* Functions to decode of buffer of bits */
+
+#if BITS_SAVED == 64
+
+static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+                         uchar *to, uchar *end)
+{
+  reg1 uint bits,low_byte;
+  reg3 uint16 *pos;
+  reg4 uint table_bits,table_and;
+  MARIA_DECODE_TREE *decode_tree;
+
+  /*
+    Use the same member as the non-64-bit variant and the other unpack
+    functions (uf_constant, uf_intervall): the column's Huffman tree is
+    rec->huff_tree.
+  */
+  decode_tree=rec->huff_tree;
+  bits=bit_buff->bits;                  /* Save in reg for quicker access */
+  table_bits=decode_tree->quick_table_bits;
+  table_and= (1 << table_bits)-1;
+
+  do
+  {
+    /* Refill the 64-bit cache 32 bits at a time, high byte first. */
+    if (bits <= 32)
+    {
+      if (bit_buff->pos > bit_buff->end+4)
+      {
+        bit_buff->error=1;
+        return;                         /* Can't be right */
+      }
+      bit_buff->current_byte= (bit_buff->current_byte << 32) +
+        ((((uint) bit_buff->pos[3])) +
+         (((uint) bit_buff->pos[2]) << 8) +
+         (((uint) bit_buff->pos[1]) << 16) +
+         (((uint) bit_buff->pos[0]) << 24));
+      bit_buff->pos+=4;
+      bits+=32;
+    }
+    /*
+      First use info in quick_table.
+
+      The quick table is an array of 16-bit values. There exists one
+      value for each possible code representable by table_bits bits.
+      In most cases table_bits is 9. So there are 512 16-bit values.
+
+      If the high-order bit (16) is set (IS_CHAR) then the array slot
+      for this value is a valid Huffman code for a resulting uchar value.
+
+      The low-order 8 bits (1..8) are the resulting uchar value.
+
+      Bits 9..14 are the length of the Huffman code for this uchar value.
+      This means so many bits from the input stream were needed to
+      represent this uchar value. The remaining bits belong to later
+      Huffman codes. This also means that for every Huffman code shorter
+      than table_bits there are multiple entries in the array, which
+      differ just in the unused bits.
+
+      If the high-order bit (16) is clear (0) then the remaining bits are
+      the position of the remaining Huffman decode tree segment behind the
+      quick table.
+    */
+    low_byte=(uint) (bit_buff->current_byte >> (bits - table_bits)) & table_and;
+    low_byte=decode_tree->table[low_byte];
+    if (low_byte & IS_CHAR)
+    {
+      /*
+        All Huffman codes of less or equal table_bits length are in the
+        quick table. This is one of them.
+      */
+      *to++ = (char) (low_byte & 255);  /* Found char in quick table */
+      bits-= ((low_byte >> 8) & 31);    /* Remove bits used */
+    }
+    else
+    {                                   /* Map through rest of decode-table */
+      /* This means that the Huffman code must be longer than table_bits. */
+      pos=decode_tree->table+low_byte;
+      bits-=table_bits;
+      /* NOTE: decode_bytes_test_bit() is a macro which contains a break !!! */
+      for (;;)
+      {
+        low_byte=(uint) (bit_buff->current_byte >> (bits-8));
+        decode_bytes_test_bit(0);
+        decode_bytes_test_bit(1);
+        decode_bytes_test_bit(2);
+        decode_bytes_test_bit(3);
+        decode_bytes_test_bit(4);
+        decode_bytes_test_bit(5);
+        decode_bytes_test_bit(6);
+        decode_bytes_test_bit(7);
+        bits-=8;
+      }
+      *to++ = (char) *pos;
+    }
+  } while (to != end);
+
+  bit_buff->bits=bits;
+  return;
+}
+
+#else
+
+/*
+  Decode a stream of Huffman-coded bytes into to..end.
+  Variant for a 32-bit or 16-bit bit cache (BITS_SAVED != 64).
+*/
+static void decode_bytes(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                         uchar *to, uchar *end)
+{
+  reg1 uint bits,low_byte;
+  reg3 uint16 *pos;
+  reg4 uint table_bits,table_and;
+  MARIA_DECODE_TREE *decode_tree;
+
+  decode_tree=rec->huff_tree;
+  bits=bit_buff->bits;                  /* Save in reg for quicker access */
+  table_bits=decode_tree->quick_table_bits;
+  table_and= (1 << table_bits)-1;
+
+  do
+  {
+    /* Refill the cache so at least table_bits bits are available. */
+    if (bits < table_bits)
+    {
+      if (bit_buff->pos > bit_buff->end+1)
+      {
+        bit_buff->error=1;
+        return;                         /* Can't be right */
+      }
+#if BITS_SAVED == 32
+      bit_buff->current_byte= (bit_buff->current_byte << 24) +
+        (((uint) ((uchar) bit_buff->pos[2]))) +
+        (((uint) ((uchar) bit_buff->pos[1])) << 8) +
+        (((uint) ((uchar) bit_buff->pos[0])) << 16);
+      bit_buff->pos+=3;
+      bits+=24;
+#else
+      if (bits)                         /* We must have at least 9 bits */
+      {
+        bit_buff->current_byte= (bit_buff->current_byte << 8) +
+          (uint) ((uchar) bit_buff->pos[0]);
+        bit_buff->pos++;
+        bits+=8;
+      }
+      else
+      {
+        bit_buff->current_byte= ((uint) ((uchar) bit_buff->pos[0]) << 8) +
+          ((uint) ((uchar) bit_buff->pos[1]));
+        bit_buff->pos+=2;
+        bits+=16;
+      }
+#endif
+    }
+    /* First use info in quick_table */
+    low_byte=(bit_buff->current_byte >> (bits - table_bits)) & table_and;
+    low_byte=decode_tree->table[low_byte];
+    if (low_byte & IS_CHAR)
+    {
+      *to++ = (low_byte & 255);         /* Found char in quick table */
+      bits-= ((low_byte >> 8) & 31);    /* Remove bits used */
+    }
+    else
+    {                                   /* Map through rest of decode-table */
+      pos=decode_tree->table+low_byte;
+      bits-=table_bits;
+      /* NOTE: decode_bytes_test_bit() is a macro which contains a break. */
+      for (;;)
+      {
+        if (bits < 8)
+        {                               /* We don't need to check end */
+#if BITS_SAVED == 32
+          bit_buff->current_byte= (bit_buff->current_byte << 24) +
+            (((uint) ((uchar) bit_buff->pos[2]))) +
+            (((uint) ((uchar) bit_buff->pos[1])) << 8) +
+            (((uint) ((uchar) bit_buff->pos[0])) << 16);
+          bit_buff->pos+=3;
+          bits+=24;
+#else
+          bit_buff->current_byte= (bit_buff->current_byte << 8) +
+            (uint) ((uchar) bit_buff->pos[0]);
+          bit_buff->pos+=1;
+          bits+=8;
+#endif
+        }
+        low_byte=(uint) (bit_buff->current_byte >> (bits-8));
+        decode_bytes_test_bit(0);
+        decode_bytes_test_bit(1);
+        decode_bytes_test_bit(2);
+        decode_bytes_test_bit(3);
+        decode_bytes_test_bit(4);
+        decode_bytes_test_bit(5);
+        decode_bytes_test_bit(6);
+        decode_bytes_test_bit(7);
+        bits-=8;
+      }
+      *to++ = (char) *pos;
+    }
+  } while (to != end);
+
+  bit_buff->bits=bits;
+  return;
+}
+#endif /* BIT_SAVED == 64 */
+
+
+/*
+  Walk the decode tree bit by bit and return the decoded index
+  (used by uf_intervall to select one of the distinct column values).
+*/
+static uint decode_pos(MARIA_BIT_BUFF *bit_buff,
+                       MARIA_DECODE_TREE *decode_tree)
+{
+  uint16 *pos=decode_tree->table;
+  for (;;)
+  {
+    /* A 0 bit selects the left slot, a 1 bit the right slot of a node. */
+    if (get_bit(bit_buff))
+      pos++;
+    if (*pos & IS_CHAR)
+      return (uint) (*pos & ~IS_CHAR);
+    pos+= *pos;                         /* Relative offset to next node */
+  }
+}
+
+
+/*
+  Read a packed record at the given position during a sequential scan.
+  Reads through the record cache when READ_CACHE_USED is set (then the
+  data file descriptor is not used and -1 is passed down).
+  Returns 0 on success or an error number.
+*/
+int _ma_read_rnd_pack_record(MARIA_HA *info,
+                             uchar *buf,
+                             register MARIA_RECORD_POS filepos,
+                             my_bool skip_deleted_blocks)
+{
+  File file;
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_read_rnd_pack_record");
+
+  if (filepos >= info->state->data_file_length)
+  {
+    my_errno= HA_ERR_END_OF_FILE;
+    goto err;
+  }
+
+  file= info->dfile.file;
+  if (info->opt_flag & READ_CACHE_USED)
+  {
+    /* Fetch the record header from the cache instead of the file. */
+    if (_ma_read_cache(&info->rec_cache, block_info.header,
+                       filepos, share->pack.ref_length,
+                       skip_deleted_blocks ? READING_NEXT : 0))
+      goto err;
+    file= -1;
+  }
+  if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info,
+                              &info->rec_buff, &info->rec_buff_size,
+                              file, filepos))
+    goto err;                           /* Error code is already set */
+#ifndef DBUG_OFF
+  if (block_info.rec_len > share->max_pack_length)
+  {
+    my_errno=HA_ERR_WRONG_IN_RECORD;
+    goto err;
+  }
+#endif
+
+  if (info->opt_flag & READ_CACHE_USED)
+  {
+    if (_ma_read_cache(&info->rec_cache, info->rec_buff,
+                       block_info.filepos, block_info.rec_len,
+                       skip_deleted_blocks ? READING_NEXT : 0))
+      goto err;
+  }
+  else
+  {
+    /* The header read already copied block_info.offset bytes of the body. */
+    if (my_read(info->dfile.file, info->rec_buff + block_info.offset,
+                block_info.rec_len-block_info.offset,
+                MYF(MY_NABP)))
+      goto err;
+  }
+  info->packed_length= block_info.rec_len;
+  info->cur_row.lastpos= filepos;
+  info->cur_row.nextpos= block_info.filepos+block_info.rec_len;
+  info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+
+  DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf,
+                                   info->rec_buff, block_info.rec_len));
+ err:
+  DBUG_RETURN(my_errno);
+}
+
+
+ /* Read and process header from a huff-record-file */
+
+uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff,
+                             MARIA_BLOCK_INFO *info,
+                             uchar **rec_buff_p, size_t *rec_buff_size_p,
+                             File file, my_off_t filepos)
+{
+  uchar *header= info->header;
+  uint head_length,ref_length;
+  LINT_INIT(ref_length);
+
+  if (file >= 0)
+  {
+    ref_length=maria->s->pack.ref_length;
+    /*
+      We can't use my_pread() here because _ma_read_rnd_pack_record assumes
+      position is ok
+    */
+    VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0)));
+    if (my_read(file, header,ref_length,MYF(MY_NABP)))
+      return BLOCK_FATAL_ERROR;
+    DBUG_DUMP("header", header, ref_length);
+  }
+  head_length= read_pack_length((uint) maria->s->pack.version, header,
+                                &info->rec_len);
+  if (maria->s->base.blobs)
+  {
+    head_length+= read_pack_length((uint) maria->s->pack.version,
+                                   header + head_length, &info->blob_len);
+    /*
+      Ensure that the record buffer is big enough for the compressed
+      record plus all expanded blobs. [We do not have an extra buffer
+      for the resulting blobs. Sigh.]
+    */
+    if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p,
+                         info->rec_len + info->blob_len +
+                         maria->s->base.extra_rec_buff_size))
+      return BLOCK_FATAL_ERROR;         /* not enough memory */
+    bit_buff->blob_pos= *rec_buff_p + info->rec_len;
+    bit_buff->blob_end= bit_buff->blob_pos + info->blob_len;
+    maria->blob_length=info->blob_len;
+  }
+  info->filepos=filepos+head_length;
+  /*
+    Same test as above: 0 is a valid file descriptor. When we read the
+    header from the file ourselves, it already holds the start of the
+    record body; copy that part into the record buffer.
+  */
+  if (file >= 0)
+  {
+    info->offset=min(info->rec_len, ref_length - head_length);
+    memcpy(*rec_buff_p, header + head_length, info->offset);
+  }
+  return 0;
+}
+
+
+  /* Routines for the bit buffer */
+  /* Note: buffer must be 6 uchar bigger than the longest row */
+
+static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff, uchar *buffer,
+                            uint length)
+{
+  /* Start reading at 'buffer'; no bits cached yet, no error seen. */
+  bit_buff->pos= buffer;
+  bit_buff->end= buffer + length;
+  bit_buff->bits= 0;
+  bit_buff->error= 0;
+  bit_buff->current_byte= 0;            /* Avoid purify errors */
+}
+
+/*
+  Return 'count' bits from the stream when the cached bits are not
+  enough (count > bit_buff->bits): take the remaining cached bits,
+  refill the cache and append the missing bits from it.
+*/
+static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff, uint count)
+{
+  uint tmp;
+  count-=bit_buff->bits;
+  /* High part: the bits still cached in current_byte. */
+  tmp=(bit_buff->current_byte & mask[bit_buff->bits]) << count;
+  fill_buffer(bit_buff);
+  bit_buff->bits=BITS_SAVED - count;
+  /* Low part: the first 'count' bits of the refilled cache. */
+  return tmp+(bit_buff->current_byte >> (BITS_SAVED - count));
+}
+
+ /* Fill in empty bit_buff->current_byte from buffer */
+ /* Sets bit_buff->error if buffer is exhausted */
+
+static void fill_buffer(MARIA_BIT_BUFF *bit_buff)
+{
+  /* Reading past the end of the stream is an error. */
+  if (bit_buff->pos >= bit_buff->end)
+  {
+    bit_buff->error= 1;
+    bit_buff->current_byte=0;
+    return;
+  }
+  /* Load BITS_SAVED bits from the buffer, high byte first. */
+#if BITS_SAVED == 64
+  bit_buff->current_byte=  ((((uint) ((uchar) bit_buff->pos[7]))) +
+                            (((uint) ((uchar) bit_buff->pos[6])) << 8) +
+                            (((uint) ((uchar) bit_buff->pos[5])) << 16) +
+                            (((uint) ((uchar) bit_buff->pos[4])) << 24) +
+                            ((ulonglong)
+                             ((((uint) ((uchar) bit_buff->pos[3]))) +
+                              (((uint) ((uchar) bit_buff->pos[2])) << 8) +
+                              (((uint) ((uchar) bit_buff->pos[1])) << 16) +
+                              (((uint) ((uchar) bit_buff->pos[0])) << 24)) << 32));
+  bit_buff->pos+=8;
+#else
+#if BITS_SAVED == 32
+  bit_buff->current_byte=  (((uint) ((uchar) bit_buff->pos[3])) +
+                            (((uint) ((uchar) bit_buff->pos[2])) << 8) +
+                            (((uint) ((uchar) bit_buff->pos[1])) << 16) +
+                            (((uint) ((uchar) bit_buff->pos[0])) << 24));
+  bit_buff->pos+=4;
+#else
+  bit_buff->current_byte=  (uint) (((uint) ((uchar) bit_buff->pos[1]))+
+                                   (((uint) ((uchar) bit_buff->pos[0])) << 8));
+  bit_buff->pos+=2;
+#endif
+#endif
+}
+
+  /* Get number of bits needed to represent value */
+
+static uint max_bit(register uint value)
+{
+  uint bits_needed;
+
+  /* Values 0 and 1 still need one bit. */
+  for (bits_needed= 1 ; (value>>= 1) ; bits_needed++) ;
+  return bits_needed;
+}
+
+
+/*****************************************************************************
+ Some redefined functions to handle files when we are using memmap
+*****************************************************************************/
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#ifdef HAVE_MMAP
+
+static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf,
+ MARIA_RECORD_POS filepos);
+static int _ma_read_rnd_mempack_record(MARIA_HA*, uchar *, MARIA_RECORD_POS,
+ my_bool);
+
+my_bool _ma_memmap_file(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  /* Trace tag now matches the function name. */
+  DBUG_ENTER("_ma_memmap_file");
+
+  if (!share->file_map)
+  {
+    /*
+      The file must extend MEMMAP_EXTRA_MARGIN bytes beyond the data so
+      the bit buffer may read past the last record.
+    */
+    if (my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)) <
+        share->state.state.data_file_length+MEMMAP_EXTRA_MARGIN)
+    {
+      DBUG_PRINT("warning",("File isn't extended for memmap"));
+      DBUG_RETURN(0);
+    }
+    if (_ma_dynmap_file(info, share->state.state.data_file_length))
+      DBUG_RETURN(0);
+  }
+  /* Switch record access to the memory-mapped readers. */
+  info->opt_flag|= MEMMAP_USED;
+  info->read_record= share->read_record= _ma_read_mempack_record;
+  share->scan= _ma_read_rnd_mempack_record;
+  DBUG_RETURN(1);
+}
+
+
+/* Unmap the data file; counterpart of _ma_memmap_file(). */
+void _ma_unmap_file(MARIA_HA *info)
+{
+  VOID(my_munmap((char*) info->s->file_map,
+                 (size_t) info->s->mmaped_length + MEMMAP_EXTRA_MARGIN));
+}
+
+
+/*
+  Parse the header of a memory-mapped packed record.
+  Returns a pointer to the start of the record body, or 0 if the blob
+  buffer could not be allocated.
+*/
+static uchar *
+_ma_mempack_get_block_info(MARIA_HA *maria,
+                           MARIA_BIT_BUFF *bit_buff,
+                           MARIA_BLOCK_INFO *info,
+                           uchar **rec_buff_p,
+                           size_t *rec_buff_size_p,
+                           uchar *header)
+{
+  header+= read_pack_length((uint) maria->s->pack.version, header,
+                            &info->rec_len);
+  if (maria->s->base.blobs)
+  {
+    header+= read_pack_length((uint) maria->s->pack.version, header,
+                              &info->blob_len);
+    /* _ma_alloc_buffer sets my_errno on error */
+    if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p,
+                         info->blob_len + maria->s->base.extra_rec_buff_size))
+      return 0;                         /* not enough memory */
+    bit_buff->blob_pos= *rec_buff_p;
+    bit_buff->blob_end= *rec_buff_p + info->blob_len;
+  }
+  return header;
+}
+
+
+static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf,
+                                   MARIA_RECORD_POS filepos)
+{
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SHARE *share= info->s;
+  uchar *pos;
+  /* Trace tag now matches the function name. */
+  DBUG_ENTER("_ma_read_mempack_record");
+
+  if (filepos == HA_OFFSET_ERROR)
+    DBUG_RETURN(my_errno);              /* _search() didn't find record */
+
+  /* Parse the header directly from the memory map. */
+  if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff,
+                                                 &block_info, &info->rec_buff,
+                                                 &info->rec_buff_size,
+                                                 (uchar*) share->file_map+
+                                                 filepos)))
+    DBUG_RETURN(my_errno);
+  DBUG_RETURN(_ma_pack_rec_unpack(info, &info->bit_buff, buf,
+                                  pos, block_info.rec_len));
+}
+
+
+/*ARGSUSED*/
+static int _ma_read_rnd_mempack_record(MARIA_HA *info,
+                                       uchar *buf,
+                                       register MARIA_RECORD_POS filepos,
+                                       my_bool skip_deleted_blocks
+                                       __attribute__((unused)))
+{
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SHARE *share= info->s;
+  uchar *pos,*start;
+  DBUG_ENTER("_ma_read_rnd_mempack_record");
+
+  if (filepos >= share->state.state.data_file_length)
+  {
+    my_errno=HA_ERR_END_OF_FILE;
+    goto err;
+  }
+  /* Parse the header directly from the memory map. */
+  if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff,
+                                                 &block_info,
+                                                 &info->rec_buff,
+                                                 &info->rec_buff_size,
+                                                 (uchar*)
+                                                 (start= share->file_map +
+                                                  filepos))))
+    goto err;
+#ifndef DBUG_OFF
+  /* Use the fetched 'share' consistently instead of re-reading info->s. */
+  if (block_info.rec_len > share->max_pack_length)
+  {
+    my_errno=HA_ERR_WRONG_IN_RECORD;
+    goto err;
+  }
+#endif
+  info->packed_length=block_info.rec_len;
+  info->cur_row.lastpos= filepos;
+  info->cur_row.nextpos= filepos+(uint) (pos-start)+block_info.rec_len;
+  info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+
+  DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf,
+                                   pos, block_info.rec_len));
+ err:
+  DBUG_RETURN(my_errno);
+}
+
+#endif /* HAVE_MMAP */
+
+ /* Save length of row */
+
+uint _ma_save_pack_length(uint version, uchar *block_buff, ulong length)
+{
+  /*
+    Store 'length' in the shortest header format and return the number
+    of bytes used:
+      < 254     1 byte:  length
+      <= 65535  3 bytes: 254 + 2-byte length
+      else      4 bytes (version 1: 255 + 3-byte length) or
+                5 bytes (newer:     255 + 4-byte length)
+  */
+  if (length < 254)
+  {
+    *(uchar*) block_buff= (uchar) length;
+    return 1;
+  }
+  if (length <= 65535)
+  {
+    *(uchar*) block_buff= 254;
+    int2store(block_buff + 1, (uint) length);
+    return 3;
+  }
+  *(uchar*) block_buff= 255;
+  if (version != 1)
+  {
+    int4store(block_buff + 1, (ulong) length);
+    return 5;
+  }
+  /* Old format can only represent 3-byte lengths. */
+  DBUG_ASSERT(length <= 0xFFFFFF);
+  int3store(block_buff + 1, (ulong) length);
+  return 4;
+}
+
+
+static uint read_pack_length(uint version, const uchar *buf, ulong *length)
+{
+  /* Inverse of _ma_save_pack_length(); returns the header size in bytes. */
+  uint first_byte= buf[0];
+
+  if (first_byte < 254)
+  {
+    *length= first_byte;
+    return 1;
+  }
+  if (first_byte == 254)
+  {
+    *length= uint2korr(buf + 1);
+    return 3;
+  }
+  if (version == 1)                     /* old format */
+  {
+    *length= uint3korr(buf + 1);
+    return 4;
+  }
+  *length= uint4korr(buf + 1);
+  return 5;
+}
+
+
+uint _ma_calc_pack_length(uint version, ulong length)
+{
+  /* Number of header bytes _ma_save_pack_length() will use for 'length'. */
+  if (length < 254)
+    return 1;
+  if (length < 65536)
+    return 3;
+  return (version == 1) ? 4 : 5;
+}
diff --git a/storage/maria/ma_page.c b/storage/maria/ma_page.c
new file mode 100644
index 00000000000..a4423133270
--- /dev/null
+++ b/storage/maria/ma_page.c
@@ -0,0 +1,619 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Read and write key blocks
+
+ The basic structure of a key block is as follows:
+
+ LSN 7 (LSN_STORE_SIZE); Log number for last change;
+ Only for transactional pages
+ PACK_TRANSID 6 (TRANSID_SIZE); Relative transid to pack page transid's
+ Only for transactional pages
+ KEYNR 1 (KEYPAGE_KEYID_SIZE) Which index this page belongs to
+ FLAG 1 (KEYPAGE_FLAG_SIZE) Flags for page
+ PAGE_SIZE 2 (KEYPAGE_USED_SIZE) How much of the page is used.
+ high-byte-first
+
+ The flag is a combination of the following values:
+
+ KEYPAGE_FLAG_ISNOD Page is a node
+ KEYPAGE_FLAG_HAS_TRANSID There may be a transid on the page.
+
+ After this we store key data, either packed or not packed, directly
+ after each other. If the page is a node flag, there is a pointer to
+ the next key page at page start and after each key.
+
+ At end of page the last KEYPAGE_CHECKSUM_SIZE bytes are reserved for a
+ page checksum.
+*/
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+/**
+ Fill MARIA_PAGE structure for usage with _ma_write_keypage
+*/
+
+void _ma_page_setup(MARIA_PAGE *page, MARIA_HA *info,
+                    const MARIA_KEYDEF *keyinfo, my_off_t pos,
+                    uchar *buff)
+{
+  /*
+    Initialize 'page' from the raw key-page image in 'buff' so it can be
+    passed to _ma_write_keypage().  org_size mirrors size so later logging
+    can detect size changes.
+  */
+  MARIA_SHARE *share= info->s;
+  uint used_length= _ma_get_page_used(share, buff);
+  uint page_flag= _ma_get_keypage_flag(share, buff);
+
+  page->info= info;
+  page->keyinfo= keyinfo;
+  page->buff= buff;
+  page->pos= pos;
+  page->size= used_length;
+  page->org_size= used_length;
+  page->flag= page_flag;
+  page->node= (page_flag & KEYPAGE_FLAG_ISNOD) ? share->base.key_reflength : 0;
+}
+
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+void page_cleanup(MARIA_SHARE *share, MARIA_PAGE *page)
+{
+  /* Zero the unused tail of the page so recovery produces identical pages */
+  uint used= page->size;
+  DBUG_ASSERT(used <= share->max_index_block_size);
+  bzero(page->buff + used, share->block_size - used);
+}
+#endif
+
+
+/**
+ Fetch a key-page in memory
+
+ @fn _ma_fetch_keypage()
+ @param page Fill this struct with information about read page
+ @param info Maria handler
+ @param keyinfo Key definition for used key
+ @param pos Position for page (in bytes)
+ @param lock Lock type for page
+ @param level Importance of page; Priority for page cache
+ @param buff Buffer to use for page
+  @param return_buffer Set to 1 if we want to force usage of buff
+
+ @return
+ @retval 0 ok
+ @retval 1 error
+*/
+
+my_bool _ma_fetch_keypage(MARIA_PAGE *page, MARIA_HA *info,
+                          const MARIA_KEYDEF *keyinfo,
+                          my_off_t pos, enum pagecache_page_lock lock,
+                          int level, uchar *buff,
+                          my_bool return_buffer __attribute__ ((unused)))
+{
+  uchar *tmp;
+  MARIA_PINNED_PAGE page_link;
+  MARIA_SHARE *share= info->s;
+  uint block_size= share->block_size;
+  DBUG_ENTER("_ma_fetch_keypage");
+  DBUG_PRINT("enter",("page: %lu", (ulong) (pos / block_size)));
+
+  tmp= pagecache_read(share->pagecache, &share->kfile,
+                      (pgcache_page_no_t) (pos / block_size), level, buff,
+                      share->page_type, lock, &page_link.link);
+
+  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
+  {
+    /*
+      Remember the page so it can be unlocked when pinned pages are released.
+      BUG FIX: the assert previously read
+        lock == PAGECACHE_LOCK_WRITE || PAGECACHE_LOCK_READ
+      which tested the non-zero constant PAGECACHE_LOCK_READ and was
+      therefore always true; compare it against 'lock' instead.
+    */
+    DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE || lock == PAGECACHE_LOCK_READ);
+    page_link.unlock= (lock == PAGECACHE_LOCK_WRITE ?
+                       PAGECACHE_LOCK_WRITE_UNLOCK :
+                       PAGECACHE_LOCK_READ_UNLOCK);
+    page_link.changed= 0;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+    page->link_offset= info->pinned_pages.elements-1;
+  }
+
+  if (tmp == info->buff)
+    info->keyread_buff_used=1;
+  else if (!tmp)
+  {
+    /* Read failed; mark table as crashed */
+    DBUG_PRINT("error",("Got errno: %d from pagecache_read",my_errno));
+    info->last_keypage=HA_OFFSET_ERROR;
+    maria_print_error(share, HA_ERR_CRASHED);
+    my_errno=HA_ERR_CRASHED;
+    DBUG_RETURN(1);
+  }
+  info->last_keypage= pos;
+
+  /*
+    Setup page structure to make pages easy to use
+    This is same as page_fill_info, but here inlined as this is used
+    so often.
+  */
+  page->info= info;
+  page->keyinfo= keyinfo;
+  page->buff= tmp;
+  page->pos= pos;
+  page->size= _ma_get_page_used(share, tmp);
+  page->org_size= page->size;                    /* For debugging */
+  page->flag= _ma_get_keypage_flag(share, tmp);
+  page->node=  ((page->flag & KEYPAGE_FLAG_ISNOD) ?
+                share->base.key_reflength : 0);
+
+#ifdef EXTRA_DEBUG
+  {
+    /* Sanity check: used length and key number stored on the page */
+    uint page_size= page->size;
+    if (page_size < 4 || page_size > share->max_index_block_size ||
+        _ma_get_keynr(share, tmp) != keyinfo->key_nr)
+    {
+      DBUG_PRINT("error",("page %lu had wrong page length: %u  keynr: %u",
+                          (ulong) (pos / block_size), page_size,
+                          _ma_get_keynr(share, tmp)));
+      DBUG_DUMP("page", tmp, page_size);
+      info->last_keypage = HA_OFFSET_ERROR;
+      maria_print_error(share, HA_ERR_CRASHED);
+      my_errno= HA_ERR_CRASHED;
+      DBUG_RETURN(1);
+    }
+  }
+#endif
+  DBUG_RETURN(0);
+} /* _ma_fetch_keypage */
+
+
+/* Write a key-page on disk */
+
+my_bool _ma_write_keypage(MARIA_PAGE *page, enum pagecache_page_lock lock,
+                          int level)
+{
+  /*
+    Write the key page described by 'page' through the page cache.
+    'lock' describes the lock state transition for the cache block;
+    for PAGECACHE_LOCK_WRITE the block is registered in pinned_pages so
+    it is unlocked later when pinned pages are released.
+    Returns the pagecache_write() result (0 = ok).
+  */
+  MARIA_SHARE *share= page->info->s;
+  uint block_size= share->block_size;
+  uchar *buff= page->buff;
+  my_bool res;
+  MARIA_PINNED_PAGE page_link;
+  DBUG_ENTER("_ma_write_keypage");
+
+  /*
+    The following ensures that for transactional tables we have logged
+    all changes that changes the page size (as the logging code sets
+    page->org_size)
+  */
+  DBUG_ASSERT(!share->now_transactional || page->size == page->org_size);
+
+#ifdef EXTRA_DEBUG                              /* Safety check */
+  {
+    uint page_length, nod_flag;
+    page_length= _ma_get_page_used(share, buff);
+    nod_flag=    _ma_test_if_nod(share, buff);
+
+    /* The MARIA_PAGE struct must agree with what is stored in the buffer */
+    DBUG_ASSERT(page->size == page_length);
+    DBUG_ASSERT(page->flag == _ma_get_keypage_flag(share, buff));
+
+    /* The page must lie inside the index area and be block aligned */
+    if (page->pos < share->base.keystart ||
+        page->pos+block_size > share->state.state.key_file_length ||
+        (page->pos & (maria_block_size-1)))
+    {
+      DBUG_PRINT("error",("Trying to write inside key status region: "
+                          "key_start: %lu  length: %lu  page_pos: %lu",
+                          (long) share->base.keystart,
+                          (long) share->state.state.key_file_length,
+                          (long) page->pos));
+      my_errno=EINVAL;
+      DBUG_ASSERT(0);
+      DBUG_RETURN(1);
+    }
+    DBUG_PRINT("page",("write page at: %lu",(ulong) (page->pos / block_size)));
+    DBUG_DUMP("buff", buff, page_length);
+    DBUG_ASSERT(page_length >= share->keypage_header + nod_flag +
+                page->keyinfo->minlength || maria_in_recovery);
+  }
+#endif
+
+  /* Verify that keynr is correct */
+  DBUG_ASSERT(_ma_get_keynr(share, buff) == page->keyinfo->key_nr);
+
+#if defined(EXTRA_DEBUG) && defined(HAVE_valgrind) && defined(NOT_ANYMORE)
+  {
+    /* This is here to catch uninitialized bytes */
+    uint length= page->size;
+    ulong crc= my_checksum(0, buff, length);
+    int4store(buff + block_size - KEYPAGE_CHECKSUM_SIZE, crc);
+  }
+#endif
+
+  /*
+    Zeroes the unused tail when IDENTICAL_PAGES_AFTER_RECOVERY is defined
+    (see page_cleanup() above); presumably a no-op otherwise — the stub is
+    not visible in this file chunk.
+  */
+  page_cleanup(share, page);
+  res= pagecache_write(share->pagecache,
+                       &share->kfile,
+                       (pgcache_page_no_t) (page->pos / block_size),
+                       level, buff, share->page_type,
+                       lock,
+                       lock == PAGECACHE_LOCK_LEFT_WRITELOCKED ?
+                       PAGECACHE_PIN_LEFT_PINNED :
+                       (lock == PAGECACHE_LOCK_WRITE_UNLOCK ?
+                        PAGECACHE_UNPIN : PAGECACHE_PIN),
+                       PAGECACHE_WRITE_DELAY, &page_link.link,
+                       LSN_IMPOSSIBLE);
+
+  if (lock == PAGECACHE_LOCK_WRITE)
+  {
+    /* It was not locked before, we have to unlock it when we unpin pages */
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    page_link.changed= 1;
+    push_dynamic(&page->info->pinned_pages, (void*) &page_link);
+  }
+  DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Put page in free list
+
+ @fn _ma_dispose()
+ @param info Maria handle
+ @param pos Address to page
+ @param page_not_read 1 if page has not yet been read
+
+ @note
+ The page at 'pos' must have been read with a write lock.
+ This function does logging (unlike _ma_new()).
+
+ @return
+ @retval 0 ok
+ @retval 1 error
+
+*/
+
+int _ma_dispose(register MARIA_HA *info, my_off_t pos, my_bool page_not_read)
+{
+  my_off_t old_link;
+  uchar buff[MAX_KEYPAGE_HEADER_SIZE+ 8 + 2];
+  ulonglong page_no;
+  MARIA_SHARE *share= info->s;
+  MARIA_PINNED_PAGE page_link;
+  uint block_size= share->block_size;
+  int result= 0;
+  enum pagecache_page_lock lock_method;
+  enum pagecache_page_pin pin_method;
+  DBUG_ENTER("_ma_dispose");
+  DBUG_PRINT("enter",("page: %lu", (ulong) (pos / block_size)));
+  DBUG_ASSERT(pos % block_size == 0);
+
+  /* Serialize access to the key_del free-page chain */
+  (void) _ma_lock_key_del(info, 0);
+
+  /* Push 'pos' onto the head of the single-linked free-page list */
+  old_link= share->key_del_current;
+  share->key_del_current= pos;
+  page_no= pos / block_size;
+  /* Build a minimal header marking this page as deleted */
+  bzero(buff, share->keypage_header);
+  _ma_store_keynr(share, buff, (uchar) MARIA_DELETE_KEY_NR);
+  _ma_store_page_used(share, buff, share->keypage_header + 8);
+  /* Link to the previous head of the free list, stored on the page itself */
+  mi_sizestore(buff + share->keypage_header, old_link);
+  share->state.changed|= STATE_NOT_SORTED_PAGES;
+
+  if (share->now_transactional)
+  {
+    /* Log the free operation so recovery can redo it */
+    LSN lsn;
+    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2];
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    my_off_t page;
+
+    /* Store address of deleted page */
+    page_store(log_data + FILEID_STORE_SIZE, page_no);
+
+    /* Store link to next unused page (the link that is written to page) */
+    page= (old_link == HA_OFFSET_ERROR ? IMPOSSIBLE_PAGE_NO :
+           old_link / block_size);
+    page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page);
+
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+    if (translog_write_record(&lsn, LOGREC_REDO_INDEX_FREE_PAGE,
+                              info->trn, info,
+                              (translog_size_t) sizeof(log_data),
+                              TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                              log_data, NULL))
+      result= 1;
+  }
+
+  /*
+    If the page was never read we must lock and pin it here; otherwise the
+    caller already holds a write lock and pin on the cache block.
+  */
+  if (page_not_read)
+  {
+    lock_method= PAGECACHE_LOCK_WRITE;
+    pin_method= PAGECACHE_PIN;
+  }
+  else
+  {
+    lock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+    pin_method= PAGECACHE_PIN_LEFT_PINNED;
+  }
+
+  /* Only the header + free-list link bytes need to be rewritten */
+  if (pagecache_write_part(share->pagecache,
+                           &share->kfile, (pgcache_page_no_t) page_no,
+                           PAGECACHE_PRIORITY_LOW, buff,
+                           share->page_type,
+                           lock_method, pin_method,
+                           PAGECACHE_WRITE_DELAY, &page_link.link,
+                           LSN_IMPOSSIBLE,
+                           0, share->keypage_header + 8))
+    result= 1;
+
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+  {
+    /* Zero the rest of the page so recovery produces identical pages */
+    uchar *page_buff= pagecache_block_link_to_buffer(page_link.link);
+    bzero(page_buff + share->keypage_header + 8,
+          block_size - share->keypage_header - 8 - KEYPAGE_CHECKSUM_SIZE);
+  }
+#endif
+
+  if (page_not_read)
+  {
+    /* It was not locked before, we have to unlock it when we unpin pages */
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    page_link.changed= 1;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+  }
+
+  DBUG_RETURN(result);
+} /* _ma_dispose */
+
+
+/**
+ @brief Get address for free page to use
+
+ @fn _ma_new()
+ @param info Maria handle
+ @param level Type of key block (caching priority for pagecache)
+ @param page_link Pointer to page in page cache if read. One can
+ check if this is used by checking if
+ page_link->changed != 0
+
+ @note Logging of this is left to the caller (so that the "new"ing and the
+ first changes done to this new page can be logged as one single entry - one
+ single _ma_log_new()) call).
+
+ @return
+ HA_OFFSET_ERROR File is full or page read error
+ # Page address to use
+*/
+
+my_off_t _ma_new(register MARIA_HA *info, int level,
+                 MARIA_PINNED_PAGE **page_link)
+
+{
+  my_off_t pos;
+  MARIA_SHARE *share= info->s;
+  uint block_size= share->block_size;
+  DBUG_ENTER("_ma_new");
+
+  if (_ma_lock_key_del(info, 1))
+  {
+    /* No reusable deleted page; extend the index file by one block */
+    pthread_mutex_lock(&share->intern_lock);
+    pos= share->state.state.key_file_length;
+    if (pos >= share->base.max_key_file_length - block_size)
+    {
+      my_errno=HA_ERR_INDEX_FILE_FULL;
+      pthread_mutex_unlock(&share->intern_lock);
+      DBUG_RETURN(HA_OFFSET_ERROR);
+    }
+    share->state.state.key_file_length+= block_size;
+    /* Following is for not transactional tables */
+    info->state->key_file_length= share->state.state.key_file_length;
+    pthread_mutex_unlock(&share->intern_lock);
+    /* Page was not read through the cache; nothing to unpin later */
+    (*page_link)->changed= 0;
+    (*page_link)->write_lock= PAGECACHE_LOCK_WRITE;
+  }
+  else
+  {
+    /* Reuse the head of the free-page (key_del) list */
+    uchar *buff;
+    pos= share->key_del_current;                /* Protected */
+    DBUG_ASSERT(share->pagecache->block_size == block_size);
+    if (!(buff= pagecache_read(share->pagecache,
+                               &share->kfile,
+                               (pgcache_page_no_t) (pos / block_size), level,
+                               0, share->page_type,
+                               PAGECACHE_LOCK_WRITE, &(*page_link)->link)))
+      pos= HA_OFFSET_ERROR;
+    else
+    {
+      /*
+        Next deleted page's number is in the header of the present page
+        (single linked list):
+      */
+#ifndef DBUG_OFF
+      my_off_t key_del_current;
+#endif
+      share->key_del_current= mi_sizekorr(buff+share->keypage_header);
+#ifndef DBUG_OFF
+      key_del_current= share->key_del_current;
+      /* Link must point inside the file or be the HA_OFFSET_ERROR marker */
+      DBUG_ASSERT((key_del_current != 0) &&
+                  ((key_del_current == HA_OFFSET_ERROR) ||
+                   (key_del_current <=
+                    (share->state.state.key_file_length - block_size))));
+#endif
+    }
+
+    (*page_link)->unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    (*page_link)->write_lock= PAGECACHE_LOCK_WRITE;
+    /*
+      We have to mark it changed as _ma_flush_pending_blocks() uses
+      'changed' to know if we used the page cache or not
+    */
+    (*page_link)->changed= 1;
+    push_dynamic(&info->pinned_pages, (void*) *page_link);
+    /* push_dynamic copies the element; return the in-array copy instead */
+    *page_link= dynamic_element(&info->pinned_pages,
+                                info->pinned_pages.elements-1,
+                                MARIA_PINNED_PAGE *);
+  }
+  share->state.changed|= STATE_NOT_SORTED_PAGES;
+  DBUG_PRINT("exit",("Pos: %ld",(long) pos));
+  DBUG_RETURN(pos);
+} /* _ma_new */
+
+
+/**
+  Log compaction of an index page
+*/
+
+static my_bool _ma_log_compact_keypage(MARIA_PAGE *ma_page,
+                                       TrID min_read_from)
+{
+  /*
+    Write a LOGREC_REDO_INDEX record describing that ma_page was compacted
+    with threshold 'min_read_from' (see _ma_compact_keypage() below).
+    Returns 1 on log-write failure, 0 on success.
+  */
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 1 + 7 + TRANSID_SIZE];
+  uchar *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  uint translog_parts, extra_length;
+  my_off_t page= ma_page->pos;
+  DBUG_ENTER("_ma_log_compact_keypage");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) (page / share->block_size)));
+
+  /* Store address of the compacted page (in page numbers, not bytes) */
+  page/= share->block_size;
+  page_store(log_data + FILEID_STORE_SIZE, page);
+
+  log_pos= log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+
+  /* Operation code followed by the transid threshold used for compaction */
+  log_pos[0]= KEY_OP_COMPACT_PAGE;
+  transid_store(log_pos + 1, min_read_from);
+  log_pos+= 1 + TRANSID_SIZE;
+
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                         log_data);
+  translog_parts= 1;
+  extra_length= 0;
+
+  /* May append extra log parts describing the changed key data */
+  _ma_log_key_changes(ma_page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &extra_length, &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                            info->trn, info,
+                            log_array[TRANSLOG_INTERNAL_PARTS +
+                                      0].length + extra_length,
+                            TRANSLOG_INTERNAL_PARTS + translog_parts,
+                            log_array, log_data, NULL))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/**
+ Remove all transaction id's less than given one from a key page
+
+ @fn _ma_compact_keypage()
+ @param keyinfo Key handler
+ @param page_pos Page position on disk
+ @param page Buffer for page
+ @param min_read_from Remove all trids from page less than this
+
+ @retval 0 Ok
+  @retval 1 Error; my_errno contains the error
+*/
+
+my_bool _ma_compact_keypage(MARIA_PAGE *ma_page, TrID min_read_from)
+{
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEY key;
+  uchar *page, *endpos, *start_of_empty_space;
+  uint page_flag, nod_flag, saved_space;
+  my_bool page_has_transid;
+  DBUG_ENTER("_ma_compact_keypage");
+
+  page_flag= ma_page->flag;
+  if (!(page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+    DBUG_RETURN(0);                             /* No transaction id on page */
+
+  nod_flag= ma_page->node;
+  page= ma_page->buff;
+  endpos= page + ma_page->size;
+  key.data= info->lastkey_buff;
+  key.keyinfo= (MARIA_KEYDEF*) ma_page->keyinfo;
+
+  page_has_transid= 0;
+  page+= share->keypage_header + nod_flag;
+  key.data[0]= 0;                               /* safety */
+  start_of_empty_space= 0;
+  saved_space= 0;
+  /*
+    Walk all keys on the page.  Transids below 'min_read_from' are no
+    longer visible to any reader and are squeezed out by shifting the
+    preceding data towards the removed bytes.
+  */
+  do
+  {
+    if (!(page= (*ma_page->keyinfo->skip_key)(&key, 0, 0, page)))
+    {
+      DBUG_PRINT("error",("Couldn't find last key:  page_pos: 0x%lx",
+                          (long) page));
+      maria_print_error(share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(1);
+    }
+    if (key_has_transid(page-1))
+    {
+      uint transid_length;
+      transid_length= transid_packed_length(page);
+
+      if (min_read_from == ~(TrID) 0 ||
+          min_read_from < transid_get_packed(share, page))
+      {
+        page[-1]&= 254;                         /* Remove transid marker */
+        /*
+          NOTE(review): this recomputation is redundant — clearing the
+          marker byte at page[-1] does not change the packed transid
+          bytes at 'page', so transid_length keeps its value.
+        */
+        transid_length= transid_packed_length(page);
+        if (start_of_empty_space)
+        {
+          /* Move block before the transid up in page */
+          uint copy_length= (uint) (page - start_of_empty_space) - saved_space;
+          memmove(start_of_empty_space, start_of_empty_space + saved_space,
+                  copy_length);
+          start_of_empty_space+= copy_length;
+        }
+        else
+          start_of_empty_space= page;
+        saved_space+= transid_length;
+      }
+      else
+        page_has_transid= 1;                    /* At least one id left */
+      page+= transid_length;
+    }
+    page+= nod_flag;
+  } while (page < endpos);
+
+  DBUG_ASSERT(page == endpos);
+
+  if (start_of_empty_space)
+  {
+    /*
+      Move last block down
+      This is always true if any transid was removed
+    */
+    uint copy_length= (uint) (endpos - start_of_empty_space) - saved_space;
+
+    if (copy_length)
+      memmove(start_of_empty_space, start_of_empty_space + saved_space,
+              copy_length);
+    /* Page shrank by 'saved_space' bytes; store the new used length */
+    ma_page->size= (uint) (start_of_empty_space + copy_length - ma_page->buff);
+    page_store_size(share, ma_page);
+  }
+
+  if (!page_has_transid)
+  {
+    ma_page->flag&= ~KEYPAGE_FLAG_HAS_TRANSID;
+    _ma_store_keypage_flag(share, ma_page->buff, ma_page->flag);
+    /* Clear packed transid (in case of zerofill) */
+    bzero(ma_page->buff + LSN_STORE_SIZE, TRANSID_SIZE);
+  }
+
+  if (share->now_transactional)
+  {
+    /* Log the compaction so crash recovery can reproduce the page */
+    if (_ma_log_compact_keypage(ma_page, min_read_from))
+      DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c
new file mode 100644
index 00000000000..441310a60ea
--- /dev/null
+++ b/storage/maria/ma_pagecache.c
@@ -0,0 +1,5104 @@
+/* Copyright (C) 2000-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ These functions handle page caching for Maria tables.
+
+ One cache can handle many files.
+ It must contain buffers of the same blocksize.
+ init_pagecache() should be used to init cache handler.
+
+ The free list (free_block_list) is a stack like structure.
+ When a block is freed by free_block(), it is pushed onto the stack.
+ When a new block is required it is first tried to pop one from the stack.
+ If the stack is empty, it is tried to get a never-used block from the pool.
+ If this is empty too, then a block is taken from the LRU ring, flushing it
+ to disk, if necessary. This is handled in find_block().
+ With the new free list, the blocks can have three temperatures:
+ hot, warm and cold (which is free). This is remembered in the block header
+ by the enum PCBLOCK_TEMPERATURE temperature variable. Remembering the
+ temperature is necessary to correctly count the number of warm blocks,
+ which is required to decide when blocks are allowed to become hot. Whenever
+ a block is inserted to another (sub-)chain, we take the old and new
+ temperature into account to decide if we got one more or less warm block.
+ blocks_unused is the sum of never used blocks in the pool and of currently
+ free blocks. blocks_used is the number of blocks fetched from the pool and
+ as such gives the maximum number of in-use blocks at any time.
+
+ TODO: Write operation locks whole cache till the end of the operation.
+ Should be fixed.
+*/
+
+#include "maria_def.h"
+#include <m_string.h>
+#include "ma_pagecache.h"
+#include "ma_blockrec.h"
+#include <my_bit.h>
+#include <errno.h>
+
+/*
+ Some compilation flags have been added specifically for this module
+ to control the following:
+ - not to let a thread to yield the control when reading directly
+ from page cache, which might improve performance in many cases;
+ to enable this add:
+ #define SERIALIZED_READ_FROM_CACHE
+ - to set an upper bound for number of threads simultaneously
+ using the page cache; this setting helps to determine an optimal
+ size for hash table and improve performance when the number of
+ blocks in the page cache much less than the number of threads
+ accessing it;
+ to set this number equal to <N> add
+ #define MAX_THREADS <N>
+ - to substitute calls of pthread_cond_wait for calls of
+ pthread_cond_timedwait (wait with timeout set up);
+ this setting should be used only when you want to trap a deadlock
+ situation, which theoretically should not happen;
+ to set timeout equal to <T> seconds add
+ #define PAGECACHE_TIMEOUT <T>
+ - to enable the module traps and to send debug information from
+ page cache module to a special debug log add:
+ #define PAGECACHE_DEBUG
+ the name of this debug log file <LOG NAME> can be set through:
+ #define PAGECACHE_DEBUG_LOG <LOG NAME>
+ if the name is not defined, it's set by default;
+ if the PAGECACHE_DEBUG flag is not set up and we are in a debug
+ mode, i.e. when ! defined(DBUG_OFF), the debug information from the
+ module is sent to the regular debug log.
+
+ Example of the settings:
+ #define SERIALIZED_READ_FROM_CACHE
+ #define MAX_THREADS 100
+ #define PAGECACHE_TIMEOUT 1
+ #define PAGECACHE_DEBUG
+ #define PAGECACHE_DEBUG_LOG "my_pagecache_debug.log"
+*/
+
+/*
+ In key cache we have external raw locking here we use
+ SERIALIZED_READ_FROM_CACHE to avoid problem of reading
+ not consistent data from the page.
+ (keycache functions (key_cache_read(), key_cache_insert() and
+ key_cache_write()) rely on external MyISAM lock, we don't)
+*/
+#define SERIALIZED_READ_FROM_CACHE yes
+
+#define PCBLOCK_INFO(B) \
+ DBUG_PRINT("info", \
+ ("block: 0x%lx fd: %lu page: %lu s: %0x hshL: " \
+ " 0x%lx req: %u/%u wrlocks: %u rdlocks %u " \
+ "rdlocks_q: %u pins: %u status: %u type: %s", \
+ (ulong)(B), \
+ (ulong)((B)->hash_link ? \
+ (B)->hash_link->file.file : \
+ 0), \
+ (ulong)((B)->hash_link ? \
+ (B)->hash_link->pageno : \
+ 0), \
+ (B)->status, \
+ (ulong)(B)->hash_link, \
+ (uint) (B)->requests, \
+ (uint)((B)->hash_link ? \
+ (B)->hash_link->requests : \
+ 0), \
+ block->wlocks, block->rlocks, block->rlocks_queue, \
+ (uint)(B)->pins, (uint)(B)->status, \
+ page_cache_page_type_str[(B)->type]))
+
+/* TODO: put it to my_static.c */
+my_bool my_disable_flush_pagecache_blocks= 0;
+
+#define STRUCT_PTR(TYPE, MEMBER, a) \
+ (TYPE *) ((char *) (a) - offsetof(TYPE, MEMBER))
+
+/* types of condition variables */
+#define COND_FOR_REQUESTED 0 /* queue of thread waiting for read operation */
+#define COND_FOR_SAVED 1 /* queue of thread waiting for flush */
+#define COND_FOR_WRLOCK 2 /* queue of write lock */
+#define COND_SIZE 3 /* number of COND_* queues */
+
+typedef pthread_cond_t KEYCACHE_CONDVAR;
+
+/* descriptor of the page in the page cache block buffer */
+struct st_pagecache_page
+{
+  PAGECACHE_FILE file;           /* file to which the page belongs */
+  pgcache_page_no_t pageno;      /* number of the page in the file */
+};
+
+/* element in the chain of a hash table bucket; maps (file, pageno) -> block */
+struct st_pagecache_hash_link
+{
+  struct st_pagecache_hash_link
+    *next, **prev;                   /* to connect links in the same bucket */
+  struct st_pagecache_block_link
+    *block;                          /* reference to the block for the page */
+  PAGECACHE_FILE file;               /* the page belongs to this file */
+  pgcache_page_no_t pageno;          /* this page */
+  uint requests;                     /* number of requests for the page */
+};
+
+/* simple states of a block */
+#define PCBLOCK_ERROR 1 /* an error occurred when performing disk i/o */
+#define PCBLOCK_READ 2 /* the page is in the block buffer */
+#define PCBLOCK_IN_SWITCH 4 /* block is preparing to read new page */
+#define PCBLOCK_REASSIGNED 8 /* block does not accept requests for old page */
+#define PCBLOCK_IN_FLUSH 16 /* block is in flush operation */
+#define PCBLOCK_CHANGED 32 /* block buffer contains a dirty page */
+#define PCBLOCK_DIRECT_W 64 /* possible direct write to the block */
+
+/* page status, returned by find_block */
+#define PAGE_READ 0
+#define PAGE_TO_BE_READ 1
+#define PAGE_WAIT_TO_BE_READ 2
+
+/* block temperature determines in which (sub-)chain the block currently is */
+enum PCBLOCK_TEMPERATURE { PCBLOCK_COLD /*free*/ , PCBLOCK_WARM , PCBLOCK_HOT };
+
+/* debug info */
+#ifndef DBUG_OFF
+static const char *page_cache_page_type_str[]=
+{
+ /* used only for control page type changing during debugging */
+ "EMPTY",
+ "PLAIN",
+ "LSN",
+ "READ_UNKNOWN"
+};
+
+static const char *page_cache_page_write_mode_str[]=
+{
+ "DELAY",
+ "DONE"
+};
+
+static const char *page_cache_page_lock_str[]=
+{
+ "free -> free",
+ "read -> read",
+ "write -> write",
+ "free -> read",
+ "free -> write",
+ "read -> free",
+ "write -> free",
+ "write -> read"
+};
+
+static const char *page_cache_page_pin_str[]=
+{
+ "pinned -> pinned",
+ "unpinned -> unpinned",
+ "unpinned -> pinned",
+ "pinned -> unpinned"
+};
+
+
+typedef struct st_pagecache_pin_info
+{
+ struct st_pagecache_pin_info *next, **prev;
+ struct st_my_thread_var *thread;
+} PAGECACHE_PIN_INFO;
+
+/*
+ st_pagecache_lock_info structure should be kept in next, prev, thread part
+ compatible with st_pagecache_pin_info to be compatible in functions.
+*/
+
+typedef struct st_pagecache_lock_info
+{
+ struct st_pagecache_lock_info *next, **prev;
+ struct st_my_thread_var *thread;
+ my_bool write_lock;
+} PAGECACHE_LOCK_INFO;
+
+
+/* service functions maintain debugging info about pin & lock */
+
+
+/*
+ Links information about thread pinned/locked the block to the list
+
+ SYNOPSIS
+ info_link()
+ list the list to link in
+ node the node which should be linked
+*/
+
+static void info_link(PAGECACHE_PIN_INFO **list, PAGECACHE_PIN_INFO *node)
+{
+  /* Push 'node' onto the head of the doubly linked list '*list' */
+  node->next= *list;
+  if (node->next)
+    node->next->prev= &node->next;
+  node->prev= list;
+  *list= node;
+}
+
+
+/*
+ Unlinks information about thread pinned/locked the block from the list
+
+ SYNOPSIS
+ info_unlink()
+ node the node which should be unlinked
+*/
+
+static void info_unlink(PAGECACHE_PIN_INFO *node)
+{
+  /* Remove 'node' from its doubly linked list */
+  PAGECACHE_PIN_INFO *next= node->next;
+  *node->prev= next;
+  if (next)
+    next->prev= node->prev;
+}
+
+
+/*
+ Finds information about given thread in the list of threads which
+ pinned/locked this block.
+
+ SYNOPSIS
+ info_find()
+ list the list where to find the thread
+ thread thread ID (reference to the st_my_thread_var
+ of the thread)
+ any return any thread of the list
+
+ RETURN
+ 0 - the thread was not found
+ pointer to the information node of the thread in the list, or, if 'any',
+ to any thread of the list.
+*/
+
+static PAGECACHE_PIN_INFO *info_find(PAGECACHE_PIN_INFO *list,
+                                     struct st_my_thread_var *thread,
+                                     my_bool any)
+{
+  /*
+    Find the entry for 'thread' in 'list'; if 'any' is set, return the
+    list head (any member, or NULL if the list is empty).
+  */
+  PAGECACHE_PIN_INFO *node;
+
+  if (any)
+    return list;
+  for (node= list; node != 0; node= node->next)
+  {
+    if (node->thread == thread)
+      return node;
+  }
+  return 0;
+}
+
+#endif /* !DBUG_OFF */
+
+/*
+  page cache block: one cached page buffer together with its position in the
+  LRU ring, its dirty/clean file list, its hash link, and the lock/pin
+  bookkeeping used by readers and writers.
+*/
+struct st_pagecache_block_link
+{
+  struct st_pagecache_block_link
+    *next_used, **prev_used;   /* to connect links in the LRU chain (ring) */
+  struct st_pagecache_block_link
+    *next_changed, **prev_changed; /* for lists of file dirty/clean blocks */
+  struct st_pagecache_hash_link
+    *hash_link;                /* backward ptr to referring hash_link */
+#ifndef DBUG_OFF
+  PAGECACHE_PIN_INFO *pin_list;  /* debug: who pins this block */
+  PAGECACHE_LOCK_INFO *lock_list; /* debug: who locks this block */
+#endif
+  KEYCACHE_CONDVAR *condvar;   /* condition variable for 'no readers' event */
+  uchar *buffer;               /* buffer for the block page */
+  pthread_t write_locker;      /* thread currently holding the write lock */
+
+  ulonglong last_hit_time;     /* timestamp of the last hit */
+  WQUEUE
+  wqueue[COND_SIZE];           /* queues on waiting requests for new/old pages */
+  uint32 requests;             /* number of requests for the block */
+  uint32 pins;                 /* pin counter */
+  uint32 wlocks;               /* write locks counter */
+  uint32 rlocks;               /* read locks counter */
+  uint32 rlocks_queue;         /* rd. locks waiting wr. lock of this thread */
+  uint16 status;               /* state of the block (PCBLOCK_* bits) */
+  int16 error;                 /* error code for block in case of error */
+  enum PCBLOCK_TEMPERATURE temperature; /* block temperature: cold, warm, hot*/
+  enum pagecache_page_type type; /* type of the block */
+  uint hits_left;              /* number of hits left until promotion */
+  /** @brief LSN when first became dirty; LSN_MAX means "not yet set" */
+  LSN rec_lsn;
+};
+
+/**
+  @brief information describing a run of flush_pagecache_blocks_int();
+  lets concurrent flushers of the same file wait for each other
+*/
+struct st_file_in_flush
+{
+  File file;                     /* file currently being flushed */
+  /**
+     @brief threads waiting for the thread currently flushing this file to be
+     done
+  */
+  WQUEUE flush_queue;
+  /**
+     @brief if the thread currently flushing the file has a non-empty
+     first_in_switch list.
+  */
+  my_bool first_in_switch;
+};
+
+#ifndef DBUG_OFF
+/* debug checks */
+
+#ifdef NOT_USED
+static my_bool info_check_pin(PAGECACHE_BLOCK_LINK *block,
+                              enum pagecache_page_pin mode
+                              __attribute__((unused)))
+{
+  /*
+    Debug helper: check that the requested pin transition 'mode' is
+    consistent with whether the current thread already pins 'block'.
+    Returns 1 (and prints) on an inconsistent request, 0 if OK.
+
+    NOTE(review): info_find() is declared with three parameters
+    (list, thread, any) but is called here with only two; this code is
+    inside #ifdef NOT_USED so the mismatch is never compiled — fix the
+    call if this check is ever re-enabled.
+  */
+  struct st_my_thread_var *thread= my_thread_var;
+  PAGECACHE_PIN_INFO *info= info_find(block->pin_list, thread);
+  DBUG_ENTER("info_check_pin");
+  DBUG_PRINT("enter", ("thread: 0x%lx  pin: %s",
+                       (ulong) thread, page_cache_page_pin_str[mode]));
+  if (info)
+  {
+    /* Thread already has this block pinned */
+    if (mode == PAGECACHE_PIN_LEFT_UNPINNED)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx  block: 0x%lx  ; LEFT_UNPINNED!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+    else if (mode == PAGECACHE_PIN)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx  block: 0x%lx  ; PIN!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+  }
+  else
+  {
+    /* Thread does not pin this block yet */
+    if (mode == PAGECACHE_PIN_LEFT_PINNED)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx  block: 0x%lx  ; LEFT_PINNED!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+    else if (mode == PAGECACHE_UNPIN)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx  block: 0x%lx  ; UNPIN!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+ Debug function which checks current lock/pin state and requested changes
+
+ SYNOPSIS
+ info_check_lock()
+ lock requested lock changes
+ pin requested pin changes
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool info_check_lock(PAGECACHE_BLOCK_LINK *block,
+                               enum pagecache_page_lock lock,
+                               enum pagecache_page_pin pin)
+{
+  /*
+    Debug helper: validate that the requested lock transition 'lock'
+    together with pin transition 'pin' is consistent with the lock this
+    thread currently holds on 'block' (looked up in block->lock_list).
+    Returns 0 if consistent, 1 on error (with a diagnostic print).
+
+    NOTE(review): as in info_check_pin() above, info_find() is called
+    with two arguments while its definition takes three; dead code under
+    #ifdef NOT_USED.
+  */
+  struct st_my_thread_var *thread= my_thread_var;
+  PAGECACHE_LOCK_INFO *info=
+    (PAGECACHE_LOCK_INFO *) info_find((PAGECACHE_PIN_INFO *) block->lock_list,
+                                      thread);
+  DBUG_ENTER("info_check_lock");
+  switch(lock) {
+  case PAGECACHE_LOCK_LEFT_UNLOCKED:
+    /* staying unlocked: must not be pinned nor present in lock list */
+    if (pin != PAGECACHE_PIN_LEFT_UNPINNED ||
+        info)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_LEFT_READLOCKED:
+    /* keeping a read lock: must already hold a non-write lock */
+    if ((pin != PAGECACHE_PIN_LEFT_UNPINNED &&
+         pin != PAGECACHE_PIN_LEFT_PINNED) ||
+        info == 0 || info->write_lock)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_LEFT_WRITELOCKED:
+    /* keeping a write lock: must already hold one and stay pinned */
+    if (pin != PAGECACHE_PIN_LEFT_PINNED ||
+        info == 0 || !info->write_lock)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_READ:
+    /* acquiring a read lock: must not hold any lock yet */
+    if ((pin != PAGECACHE_PIN_LEFT_UNPINNED &&
+         pin != PAGECACHE_PIN) ||
+        info != 0)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_WRITE:
+    /* acquiring a write lock: must pin and must not hold any lock yet */
+    if (pin != PAGECACHE_PIN ||
+        info != 0)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_READ_UNLOCK:
+    /* releasing a read lock: must hold one */
+    if ((pin != PAGECACHE_PIN_LEFT_UNPINNED &&
+         pin != PAGECACHE_UNPIN) ||
+        info == 0 || info->write_lock)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_WRITE_UNLOCK:
+    /* releasing a write lock: must hold one and unpin */
+    if (pin != PAGECACHE_UNPIN ||
+        info == 0 || !info->write_lock)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_WRITE_TO_READ:
+    /* downgrading write -> read: must hold a write lock */
+    if ((pin != PAGECACHE_PIN_LEFT_PINNED &&
+         pin != PAGECACHE_UNPIN) ||
+        info == 0 || !info->write_lock)
+      goto error;
+    break;
+  }
+  DBUG_RETURN(0);
+error:
+  DBUG_PRINT("info",
+             ("info_check_lock: thread: 0x%lx block 0x%lx: info: %d wrt: %d,"
+              "to lock: %s, to pin: %s",
+              (ulong)thread, (ulong)block, test(info),
+              (info ? info->write_lock : 0),
+              page_cache_page_lock_str[lock],
+              page_cache_page_pin_str[pin]));
+  DBUG_RETURN(1);
+}
+#endif /* NOT_USED */
+#endif /* !DBUG_OFF */
+
+#define FLUSH_CACHE 2000 /* sort this many blocks at once */
+
+/* Forward declarations for helpers defined later in this file */
+static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block);
+#ifndef DBUG_OFF
+static void test_key_cache(PAGECACHE *pagecache,
+ const char *where, my_bool lock);
+#endif
+
+/* Map (file, page number) to a bucket; hash_entries is a power of 2 */
+#define PAGECACHE_HASH(p, f, pos) (((ulong) (pos) + \
+ (ulong) (f).file) & (p->hash_entries-1))
+#define FILE_HASH(f) ((uint) (f).file & (PAGECACHE_CHANGED_BLOCKS_HASH - 1))
+
+#define DEFAULT_PAGECACHE_DEBUG_LOG "pagecache_debug.log"
+
+#if defined(PAGECACHE_DEBUG) && ! defined(PAGECACHE_DEBUG_LOG)
+#define PAGECACHE_DEBUG_LOG DEFAULT_PAGECACHE_DEBUG_LOG
+#endif
+
+#if defined(PAGECACHE_DEBUG_LOG)
+/* Lazily opened, line-buffered trace file for pagecache debugging */
+static FILE *pagecache_debug_log= NULL;
+static void pagecache_debug_print _VARARGS((const char *fmt, ...));
+#define PAGECACHE_DEBUG_OPEN \
+ if (!pagecache_debug_log) \
+ { \
+ pagecache_debug_log= fopen(PAGECACHE_DEBUG_LOG, "w"); \
+ (void) setvbuf(pagecache_debug_log, NULL, _IOLBF, BUFSIZ); \
+ }
+
+#define PAGECACHE_DEBUG_CLOSE \
+ if (pagecache_debug_log) \
+ { \
+ fclose(pagecache_debug_log); \
+ pagecache_debug_log= 0; \
+ }
+#else
+#define PAGECACHE_DEBUG_OPEN
+#define PAGECACHE_DEBUG_CLOSE
+#endif /* defined(PAGECACHE_DEBUG_LOG) */
+
+/* Route trace output to the pagecache debug log when it is enabled */
+#if defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG)
+#define KEYCACHE_DBUG_PRINT(l, m) \
+ { if (pagecache_debug_log) \
+ fprintf(pagecache_debug_log, "%s: ", l); \
+ pagecache_debug_print m; }
+
+#define KEYCACHE_DBUG_ASSERT(a) \
+ { if (! (a) && pagecache_debug_log) \
+ fclose(pagecache_debug_log); \
+ assert(a); }
+#else
+#define KEYCACHE_DBUG_PRINT(l, m) DBUG_PRINT(l, m)
+#define KEYCACHE_DBUG_ASSERT(a) DBUG_ASSERT(a)
+#endif /* defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG) */
+
+#if defined(PAGECACHE_DEBUG) || !defined(DBUG_OFF)
+#ifdef THREAD
+/* Thread id of the last tracing thread (debug builds only) */
+static long pagecache_thread_id;
+#define KEYCACHE_THREAD_TRACE(l) \
+ KEYCACHE_DBUG_PRINT(l,("|thread %ld",pagecache_thread_id))
+
+#define KEYCACHE_THREAD_TRACE_BEGIN(l) \
+ { struct st_my_thread_var *thread_var= my_thread_var; \
+ pagecache_thread_id= thread_var->id; \
+ KEYCACHE_DBUG_PRINT(l,("[thread %ld",pagecache_thread_id)) }
+
+#define KEYCACHE_THREAD_TRACE_END(l) \
+ KEYCACHE_DBUG_PRINT(l,("]thread %ld",pagecache_thread_id))
+#else /* THREAD */
+#define KEYCACHE_THREAD_TRACE(l) KEYCACHE_DBUG_PRINT(l,(""))
+#define KEYCACHE_THREAD_TRACE_BEGIN(l) KEYCACHE_DBUG_PRINT(l,(""))
+#define KEYCACHE_THREAD_TRACE_END(l) KEYCACHE_DBUG_PRINT(l,(""))
+#endif /* THREAD */
+#else
+#define KEYCACHE_THREAD_TRACE_BEGIN(l)
+#define KEYCACHE_THREAD_TRACE_END(l)
+#define KEYCACHE_THREAD_TRACE(l)
+#endif /* defined(PAGECACHE_DEBUG) || !defined(DBUG_OFF) */
+
+/* Ordinal of a block / hash link within its array (used in trace output) */
+#define PCBLOCK_NUMBER(p, b) \
+ ((uint) (((char*)(b)-(char *) p->block_root)/sizeof(PAGECACHE_BLOCK_LINK)))
+#define PAGECACHE_HASH_LINK_NUMBER(p, h) \
+ ((uint) (((char*)(h)-(char *) p->hash_link_root)/ \
+ sizeof(PAGECACHE_HASH_LINK)))
+
+#if (defined(PAGECACHE_TIMEOUT) && !defined(__WIN__)) || defined(PAGECACHE_DEBUG)
+static int pagecache_pthread_cond_wait(pthread_cond_t *cond,
+ pthread_mutex_t *mutex);
+#else
+#define pagecache_pthread_cond_wait pthread_cond_wait
+#endif
+
+/* In debug builds wrap mutex/cond calls so every lock operation is traced */
+#if defined(PAGECACHE_DEBUG)
+static int ___pagecache_pthread_mutex_lock(pthread_mutex_t *mutex);
+static void ___pagecache_pthread_mutex_unlock(pthread_mutex_t *mutex);
+static int ___pagecache_pthread_cond_signal(pthread_cond_t *cond);
+#define pagecache_pthread_mutex_lock(M) \
+{ DBUG_PRINT("lock", ("mutex lock 0x%lx %u", (ulong)(M), __LINE__)); \
+ ___pagecache_pthread_mutex_lock(M);}
+#define pagecache_pthread_mutex_unlock(M) \
+{ DBUG_PRINT("lock", ("mutex unlock 0x%lx %u", (ulong)(M), __LINE__)); \
+ ___pagecache_pthread_mutex_unlock(M);}
+#define pagecache_pthread_cond_signal(M) \
+{ DBUG_PRINT("lock", ("signal 0x%lx %u", (ulong)(M), __LINE__)); \
+ ___pagecache_pthread_cond_signal(M);}
+#else
+#define pagecache_pthread_mutex_lock pthread_mutex_lock
+#define pagecache_pthread_mutex_unlock pthread_mutex_unlock
+#define pagecache_pthread_cond_signal pthread_cond_signal
+#endif /* defined(PAGECACHE_DEBUG) */
+
+/* NOTE(review): presumably used to sync the transaction log before page
+ writes (write-ahead logging) -- confirm against callers of translog_flush */
+extern my_bool translog_flush(TRANSLOG_ADDRESS lsn);
+
+/*
+ Write page to the disk
+
+ SYNOPSIS
+ pagecache_fwrite()
+ pagecache - page cache pointer
+ filedesc - pagecache file descriptor structure
+ buffer - buffer which we will write
+ type - page type (plain or with LSN)
+ flags - MYF() flags
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool pagecache_fwrite(PAGECACHE *pagecache,
+ PAGECACHE_FILE *filedesc,
+ uchar *buffer,
+ pgcache_page_no_t pageno,
+ enum pagecache_page_type type
+ __attribute__((unused)),
+ myf flags)
+{
+ DBUG_ENTER("pagecache_fwrite");
+ DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE);
+
+ /* Todo: Integrate this with write_callback so we have only one callback */
+ /* NOTE(review): presumably flushes the log far enough to satisfy WAL
+ before the page hits disk -- confirm against flush_log_callback setters */
+ if ((*filedesc->flush_log_callback)(buffer, pageno, filedesc->callback_data))
+ DBUG_RETURN(1);
+ DBUG_PRINT("info", ("write_callback: 0x%lx data: 0x%lx",
+ (ulong) filedesc->write_callback,
+ (ulong) filedesc->callback_data))
+ if ((*filedesc->write_callback)(buffer, pageno, filedesc->callback_data))
+ {
+ DBUG_PRINT("error", ("write callback problem"))
+ DBUG_RETURN(1);
+ }
+ /* Page offset is pageno * block_size, computed as a shift */
+ if (my_pwrite(filedesc->file, buffer, pagecache->block_size,
+ ((my_off_t) pageno << pagecache->shift), flags))
+ {
+ (*filedesc->write_fail)(filedesc->callback_data);
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Read page from the disk
+
+ SYNOPSIS
+ pagecache_fread()
+ pagecache - page cache pointer
+ filedesc - pagecache file descriptor structure
+ buffer - buffer in which we will read
+ pageno - page number
+ flags - MYF() flags
+*/
+/* Returns my_pread()'s result; page offset is pageno << shift */
+#define pagecache_fread(pagecache, filedesc, buffer, pageno, flags) \
+ my_pread((filedesc)->file, buffer, pagecache->block_size, \
+ ((my_off_t) pageno << pagecache->shift), flags)
+
+
+/**
+ @brief set rec_lsn of pagecache block (if it is needed)
+
+ @param block block where to set rec_lsn
+ @param first_REDO_LSN_for_page the LSN to set
+*/
+
+static inline void pagecache_set_block_rec_lsn(PAGECACHE_BLOCK_LINK *block,
+ LSN first_REDO_LSN_for_page)
+{
+ /* LSN_MAX means "rec_lsn not set yet"; set it only once */
+ if (block->rec_lsn == LSN_MAX)
+ block->rec_lsn= first_REDO_LSN_for_page;
+ else
+ /* An already-set rec_lsn must not be younger than the new REDO LSN */
+ DBUG_ASSERT(cmp_translog_addr(block->rec_lsn,
+ first_REDO_LSN_for_page) <= 0);
+}
+
+
+/*
+ next_power(value) is 2 at the power of (1+floor(log2(value)));
+ e.g. next_power(2)=4, next_power(3)=4.
+*/
+static inline uint next_power(uint value)
+{
+ /* my_round_up_to_next_power() rounds up to a power of 2; double it */
+ return (uint) my_round_up_to_next_power((uint32) value) << 1;
+}
+
+
+/*
+ Initialize a page cache
+
+ SYNOPSIS
+ init_pagecache()
+ pagecache pointer to a page cache data structure
+ key_cache_block_size size of blocks to keep cached data
+ use_mem total memory to use for the key cache
+ division_limit division limit (may be zero)
+ age_threshold age threshold (may be zero)
+ block_size size of block (should be power of 2)
+ my_read_flags Flags used for all pread/pwrite calls
+ Usually MY_WME in case of recovery
+
+ RETURN VALUE
+ number of blocks in the key cache, if successful,
+ 0 - otherwise.
+
+ NOTES.
+ if pagecache->inited != 0 we assume that the key cache
+ is already initialized. This is for now used by myisamchk, but shouldn't
+ be something that a program should rely on!
+
+ It's assumed that no two threads call this function simultaneously
+ referring to the same key cache handle.
+
+*/
+
+ulong init_pagecache(PAGECACHE *pagecache, size_t use_mem,
+ uint division_limit, uint age_threshold,
+ uint block_size, myf my_readwrite_flags)
+{
+ ulong blocks, hash_links, length;
+ int error;
+ DBUG_ENTER("init_pagecache");
+ DBUG_ASSERT(block_size >= 512);
+
+ PAGECACHE_DEBUG_OPEN;
+ /* Refuse to re-initialize a cache that is already populated */
+ if (pagecache->inited && pagecache->disk_blocks > 0)
+ {
+ DBUG_PRINT("warning",("key cache already in use"));
+ DBUG_RETURN(0);
+ }
+
+ pagecache->global_cache_w_requests= pagecache->global_cache_r_requests= 0;
+ pagecache->global_cache_read= pagecache->global_cache_write= 0;
+ pagecache->disk_blocks= -1;
+ if (! pagecache->inited)
+ {
+ /* First-time setup: create the cache mutex and files-in-flush hash */
+ if (pthread_mutex_init(&pagecache->cache_lock, MY_MUTEX_INIT_FAST) ||
+ hash_init(&pagecache->files_in_flush, &my_charset_bin, 32,
+ offsetof(struct st_file_in_flush, file),
+ sizeof(((struct st_file_in_flush *)NULL)->file),
+ NULL, NULL, 0))
+ goto err;
+ pagecache->inited= 1;
+ pagecache->in_init= 0;
+ pagecache->resize_queue.last_thread= NULL;
+ }
+
+ pagecache->mem_size= use_mem;
+ pagecache->block_size= block_size;
+ /* block_size must be a power of 2 so offsets can be computed by shift */
+ pagecache->shift= my_bit_log2(block_size);
+ pagecache->readwrite_flags= my_readwrite_flags | MY_NABP | MY_WAIT_IF_FULL;
+ pagecache->org_readwrite_flags= pagecache->readwrite_flags;
+ DBUG_PRINT("info", ("block_size: %u", block_size));
+ DBUG_ASSERT(((uint)(1 << pagecache->shift)) == block_size);
+
+ /*
+ Estimate the block count: each block needs a block link, two hash
+ links, ~5/4 of a hash-table pointer and the page buffer itself
+ */
+ blocks= (ulong) (use_mem / (sizeof(PAGECACHE_BLOCK_LINK) +
+ 2 * sizeof(PAGECACHE_HASH_LINK) +
+ sizeof(PAGECACHE_HASH_LINK*) *
+ 5/4 + block_size));
+ /*
+ We need to support page cache with just one block to be able to do
+ scanning of rows-in-block files
+ */
+ for ( ; ; )
+ {
+ if (blocks < 8)
+ {
+ my_errno= ENOMEM;
+ goto err;
+ }
+ /* Set hash_entries to the next bigger power of 2 */
+ if ((pagecache->hash_entries= next_power(blocks)) <
+ (blocks) * 5/4)
+ pagecache->hash_entries<<= 1;
+ hash_links= 2 * blocks;
+#if defined(MAX_THREADS)
+ if (hash_links < MAX_THREADS + blocks - 1)
+ hash_links= MAX_THREADS + blocks - 1;
+#endif
+ /* Shrink block count until all structures fit within use_mem */
+ while ((length= (ALIGN_SIZE(blocks * sizeof(PAGECACHE_BLOCK_LINK)) +
+ ALIGN_SIZE(hash_links * sizeof(PAGECACHE_HASH_LINK)) +
+ ALIGN_SIZE(sizeof(PAGECACHE_HASH_LINK*) *
+ pagecache->hash_entries))) +
+ (blocks << pagecache->shift) > use_mem)
+ blocks--;
+ /* Allocate memory for cache page buffers */
+ if ((pagecache->block_mem=
+ my_large_malloc((ulong) blocks * pagecache->block_size,
+ MYF(MY_WME))))
+ {
+ /*
+ Allocate memory for blocks, hash_links and hash entries;
+ For each block 2 hash links are allocated
+ */
+ if ((pagecache->block_root=
+ (PAGECACHE_BLOCK_LINK*) my_malloc((size_t) length, MYF(0))))
+ break;
+ my_large_free(pagecache->block_mem, MYF(0));
+ pagecache->block_mem= 0;
+ }
+ /* Allocation failed: retry with 25% fewer blocks */
+ blocks= blocks / 4*3;
+ }
+ pagecache->blocks_unused= blocks;
+ pagecache->disk_blocks= (long) blocks;
+ pagecache->hash_links= hash_links;
+ /* The hash table and hash links live inside the block_root allocation */
+ pagecache->hash_root=
+ (PAGECACHE_HASH_LINK**) ((char*) pagecache->block_root +
+ ALIGN_SIZE(blocks*sizeof(PAGECACHE_BLOCK_LINK)));
+ pagecache->hash_link_root=
+ (PAGECACHE_HASH_LINK*) ((char*) pagecache->hash_root +
+ ALIGN_SIZE((sizeof(PAGECACHE_HASH_LINK*) *
+ pagecache->hash_entries)));
+ bzero((uchar*) pagecache->block_root,
+ pagecache->disk_blocks * sizeof(PAGECACHE_BLOCK_LINK));
+ bzero((uchar*) pagecache->hash_root,
+ pagecache->hash_entries * sizeof(PAGECACHE_HASH_LINK*));
+ bzero((uchar*) pagecache->hash_link_root,
+ pagecache->hash_links * sizeof(PAGECACHE_HASH_LINK));
+ pagecache->hash_links_used= 0;
+ pagecache->free_hash_list= NULL;
+ pagecache->blocks_used= pagecache->blocks_changed= 0;
+
+ pagecache->global_blocks_changed= 0;
+ pagecache->blocks_available=0; /* For debugging */
+
+ /* The LRU chain is empty after initialization */
+ pagecache->used_last= NULL;
+ pagecache->used_ins= NULL;
+ pagecache->free_block_list= NULL;
+ pagecache->time= 0;
+ pagecache->warm_blocks= 0;
+ /* division_limit/age_threshold are percentages of the block count */
+ pagecache->min_warm_blocks= (division_limit ?
+ blocks * division_limit / 100 + 1 :
+ blocks);
+ pagecache->age_threshold= (age_threshold ?
+ blocks * age_threshold / 100 :
+ blocks);
+
+ pagecache->cnt_for_resize_op= 0;
+ pagecache->resize_in_flush= 0;
+ pagecache->can_be_used= 1;
+
+ pagecache->waiting_for_hash_link.last_thread= NULL;
+ pagecache->waiting_for_block.last_thread= NULL;
+ DBUG_PRINT("exit",
+ ("disk_blocks: %ld block_root: 0x%lx hash_entries: %ld\
+ hash_root: 0x%lx hash_links: %ld hash_link_root: 0x%lx",
+ pagecache->disk_blocks, (long) pagecache->block_root,
+ pagecache->hash_entries, (long) pagecache->hash_root,
+ pagecache->hash_links, (long) pagecache->hash_link_root));
+ bzero((uchar*) pagecache->changed_blocks,
+ sizeof(pagecache->changed_blocks[0]) *
+ PAGECACHE_CHANGED_BLOCKS_HASH);
+ bzero((uchar*) pagecache->file_blocks,
+ sizeof(pagecache->file_blocks[0]) *
+ PAGECACHE_CHANGED_BLOCKS_HASH);
+
+ pagecache->blocks= pagecache->disk_blocks > 0 ? pagecache->disk_blocks : 0;
+ DBUG_RETURN((ulong) pagecache->disk_blocks);
+
+err:
+ /* Preserve the original errno across the cleanup calls below */
+ error= my_errno;
+ pagecache->disk_blocks= 0;
+ pagecache->blocks= 0;
+ if (pagecache->block_mem)
+ {
+ my_large_free(pagecache->block_mem, MYF(0));
+ pagecache->block_mem= NULL;
+ }
+ if (pagecache->block_root)
+ {
+ my_free(pagecache->block_root, MYF(0));
+ pagecache->block_root= NULL;
+ }
+ my_errno= error;
+ pagecache->can_be_used= 0;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Flush all blocks in the key cache to disk
+*/
+
+#ifdef NOT_USED
+static int flush_all_key_blocks(PAGECACHE *pagecache)
+{
+#if defined(PAGECACHE_DEBUG)
+ uint cnt=0;
+#endif
+ /* Repeat until no dirty blocks remain in the cache */
+ while (pagecache->blocks_changed > 0)
+ {
+ PAGECACHE_BLOCK_LINK *block;
+ /* Walk the circular LRU chain starting after used_last */
+ for (block= pagecache->used_last->next_used ; ; block=block->next_used)
+ {
+ if (block->hash_link)
+ {
+#if defined(PAGECACHE_DEBUG)
+ cnt++;
+ KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+ /* Flush every block belonging to this block's file, then rescan */
+ if (flush_pagecache_blocks_int(pagecache, &block->hash_link->file,
+ FLUSH_RELEASE, NULL, NULL))
+ return 1;
+ break;
+ }
+ if (block == pagecache->used_last)
+ break;
+ }
+ }
+ return 0;
+}
+#endif /* NOT_USED */
+
+/*
+ Resize a key cache
+
+ SYNOPSIS
+ resize_pagecache()
+ pagecache pointer to a page cache data structure
+ use_mem total memory to use for the new key cache
+ division_limit new division limit (if not zero)
+ age_threshold new age threshold (if not zero)
+
+ RETURN VALUE
+ number of blocks in the key cache, if successful,
+ 0 - otherwise.
+
+ NOTES.
+ The function first compares the memory size parameter
+ with the key cache value.
+
+ If they differ, the function frees the memory allocated for the
+ old key cache blocks by calling the end_pagecache function and
+ then rebuilds the key cache with new blocks by calling
+ init_pagecache.
+
+ The function starts the operation only when all other threads
+ performing operations with the key cache allow it to proceed
+ (when cnt_for_resize_op=0).
+
+ Before being usable, this function needs:
+ - to receive fixes for BUG#17332 "changing key_buffer_size on a running
+ server can crash under load" similar to those done to the key cache
+ - to have us (Sanja) look at the additional constraints placed on
+ resizing, due to the page locking specific to this page cache.
+ So we disable it for now.
+*/
+#if NOT_USED /* keep disabled until code is fixed see above !! */
+ulong resize_pagecache(PAGECACHE *pagecache,
+ size_t use_mem, uint division_limit,
+ uint age_threshold)
+{
+ ulong blocks;
+#ifdef THREAD
+ struct st_my_thread_var *thread;
+ WQUEUE *wqueue;
+
+#endif
+ DBUG_ENTER("resize_pagecache");
+
+ /* Nothing to resize if the cache was never initialized */
+ if (!pagecache->inited)
+ DBUG_RETURN(pagecache->disk_blocks);
+
+ /* Same memory size: only the tuning parameters may change */
+ if(use_mem == pagecache->mem_size)
+ {
+ change_pagecache_param(pagecache, division_limit, age_threshold);
+ DBUG_RETURN(pagecache->disk_blocks);
+ }
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+
+#ifdef THREAD
+ /* Queue up and wait until we are the first pending resize request */
+ wqueue= &pagecache->resize_queue;
+ thread= my_thread_var;
+ wqueue_link_into_queue(wqueue, thread);
+
+ while (wqueue->last_thread->next != thread)
+ {
+ pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock);
+ }
+#endif
+
+ /* Write all dirty pages to disk before the cache is torn down */
+ pagecache->resize_in_flush= 1;
+ if (flush_all_key_blocks(pagecache))
+ {
+ /* TODO: if this happens, we should write a warning in the log file ! */
+ pagecache->resize_in_flush= 0;
+ blocks= 0;
+ pagecache->can_be_used= 0;
+ goto finish;
+ }
+ pagecache->resize_in_flush= 0;
+ pagecache->can_be_used= 0;
+#ifdef THREAD
+ /* Wait for all in-progress cache operations to drain */
+ while (pagecache->cnt_for_resize_op)
+ {
+ KEYCACHE_DBUG_PRINT("resize_pagecache: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock);
+ }
+#else
+ KEYCACHE_DBUG_ASSERT(pagecache->cnt_for_resize_op == 0);
+#endif
+
+ end_pagecache(pagecache, 0); /* Don't free mutex */
+ /* The following will work even if use_mem is 0 */
+ /*
+ Fixed: arguments were previously passed in the wrong order
+ (block_size was passed where use_mem belongs); init_pagecache()
+ expects (pagecache, use_mem, division_limit, age_threshold,
+ block_size, flags).
+ */
+ blocks= init_pagecache(pagecache, use_mem, division_limit, age_threshold,
+ pagecache->block_size, pagecache->readwrite_flags);
+
+finish:
+#ifdef THREAD
+ wqueue_unlink_from_queue(wqueue, thread);
+ /* Signal for the next resize request to proceed if any */
+ if (wqueue->last_thread)
+ {
+ KEYCACHE_DBUG_PRINT("resize_pagecache: signal",
+ ("thread %ld", wqueue->last_thread->next->id));
+ pagecache_pthread_cond_signal(&wqueue->last_thread->next->suspend);
+ }
+#endif
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_RETURN(blocks);
+}
+#endif /* NOT_USED */
+
+
+/*
+ Increment counter blocking resize key cache operation
+*/
+static inline void inc_counter_for_resize_op(PAGECACHE *pagecache)
+{
+ /* Caller must hold pagecache->cache_lock */
+ pagecache->cnt_for_resize_op++;
+}
+
+
+/*
+ Decrement counter blocking resize key cache operation;
+ Signal the operation to proceed when counter becomes equal zero
+*/
+static inline void dec_counter_for_resize_op(PAGECACHE *pagecache)
+{
+#ifdef THREAD
+ struct st_my_thread_var *last_thread;
+ /* When the counter hits zero, wake the first queued resize request */
+ if (!--pagecache->cnt_for_resize_op &&
+ (last_thread= pagecache->resize_queue.last_thread))
+ {
+ KEYCACHE_DBUG_PRINT("dec_counter_for_resize_op: signal",
+ ("thread %ld", last_thread->next->id));
+ pagecache_pthread_cond_signal(&last_thread->next->suspend);
+ }
+#else
+ pagecache->cnt_for_resize_op--;
+#endif
+}
+
+/*
+ Change the page cache parameters
+
+ SYNOPSIS
+ change_pagecache_param()
+ pagecache pointer to a page cache data structure
+ division_limit new division limit (if not zero)
+ age_threshold new age threshold (if not zero)
+
+ RETURN VALUE
+ none
+
+ NOTES.
+ Presently the function resets the key cache parameters
+ concerning midpoint insertion strategy - division_limit and
+ age_threshold.
+*/
+
+void change_pagecache_param(PAGECACHE *pagecache, uint division_limit,
+ uint age_threshold)
+{
+ DBUG_ENTER("change_pagecache_param");
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ /* Both parameters are percentages of the total block count;
+ a zero argument leaves the corresponding setting unchanged */
+ if (division_limit)
+ pagecache->min_warm_blocks= (pagecache->disk_blocks *
+ division_limit / 100 + 1);
+ if (age_threshold)
+ pagecache->age_threshold= (pagecache->disk_blocks *
+ age_threshold / 100);
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Removes page cache from memory. Does NOT flush pages to disk.
+
+ SYNOPSIS
+ end_pagecache()
+ pagecache page cache handle
+ cleanup Complete free (Free also mutex for key cache)
+
+ RETURN VALUE
+ none
+*/
+
+void end_pagecache(PAGECACHE *pagecache, my_bool cleanup)
+{
+ DBUG_ENTER("end_pagecache");
+ DBUG_PRINT("enter", ("key_cache: 0x%lx", (long) pagecache));
+
+ if (!pagecache->inited)
+ DBUG_VOID_RETURN;
+
+ if (pagecache->disk_blocks > 0)
+ {
+ /* Free page buffers and control structures (no flushing here!) */
+ if (pagecache->block_mem)
+ {
+ my_large_free(pagecache->block_mem, MYF(0));
+ pagecache->block_mem= NULL;
+ my_free(pagecache->block_root, MYF(0));
+ pagecache->block_root= NULL;
+ }
+ pagecache->disk_blocks= -1;
+ /* Reset blocks_changed to be safe if flush_all_key_blocks is called */
+ pagecache->blocks_changed= 0;
+ }
+
+ DBUG_PRINT("status", ("used: %lu changed: %lu w_requests: %lu "
+ "writes: %lu r_requests: %lu reads: %lu",
+ pagecache->blocks_used,
+ pagecache->global_blocks_changed,
+ (ulong) pagecache->global_cache_w_requests,
+ (ulong) pagecache->global_cache_write,
+ (ulong) pagecache->global_cache_r_requests,
+ (ulong) pagecache->global_cache_read));
+
+ /* Full cleanup also destroys the mutex so the cache can't be reused */
+ if (cleanup)
+ {
+ hash_free(&pagecache->files_in_flush);
+ pthread_mutex_destroy(&pagecache->cache_lock);
+ pagecache->inited= pagecache->can_be_used= 0;
+ PAGECACHE_DEBUG_CLOSE;
+ }
+ DBUG_VOID_RETURN;
+} /* end_pagecache */
+
+
+/*
+ Unlink a block from the chain of dirty/clean blocks
+*/
+
+static inline void unlink_changed(PAGECACHE_BLOCK_LINK *block)
+{
+ /* prev_changed points at the predecessor's next pointer (or list head) */
+ if (block->next_changed)
+ block->next_changed->prev_changed= block->prev_changed;
+ *block->prev_changed= block->next_changed;
+}
+
+
+/*
+ Link a block into the chain of dirty/clean blocks
+*/
+
+static inline void link_changed(PAGECACHE_BLOCK_LINK *block,
+ PAGECACHE_BLOCK_LINK **phead)
+{
+ /* Insert block at the head of the list anchored at *phead */
+ block->prev_changed= phead;
+ if ((block->next_changed= *phead))
+ (*phead)->prev_changed= &block->next_changed;
+ *phead= block;
+}
+
+
+/*
+ Unlink a block from the chain of dirty/clean blocks, if it's asked for,
+ and link it to the chain of clean blocks for the specified file
+*/
+
+static void link_to_file_list(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ PAGECACHE_FILE *file, my_bool unlink_flag)
+{
+ if (unlink_flag)
+ unlink_changed(block);
+ link_changed(block, &pagecache->file_blocks[FILE_HASH(*file)]);
+ /* A block moved to the clean list is no longer dirty: drop the
+ CHANGED flag, reset its recovery LSN and update dirty counters */
+ if (block->status & PCBLOCK_CHANGED)
+ {
+ block->status&= ~PCBLOCK_CHANGED;
+ block->rec_lsn= LSN_MAX;
+ pagecache->blocks_changed--;
+ pagecache->global_blocks_changed--;
+ }
+}
+
+
+/*
+ Unlink a block from the chain of clean blocks for the specified
+ file and link it to the chain of dirty blocks for this file
+*/
+
+static inline void link_to_changed_list(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block)
+{
+ /* Move block from the clean list to the dirty list of its file */
+ unlink_changed(block);
+ link_changed(block,
+ &pagecache->changed_blocks[FILE_HASH(block->hash_link->file)]);
+ block->status|=PCBLOCK_CHANGED;
+ pagecache->blocks_changed++;
+ pagecache->global_blocks_changed++;
+}
+
+
+/*
+ Link a block to the LRU chain at the beginning or at the end of
+ one of two parts.
+
+ SYNOPSIS
+ link_block()
+ pagecache pointer to a page cache data structure
+ block pointer to the block to link to the LRU chain
+ hot <-> to link the block into the hot subchain
+ at_end <-> to link the block at the end of the subchain
+
+ RETURN VALUE
+ none
+
+ NOTES.
+ The LRU chain is represented by a circular list of block structures.
+ The list is double-linked of the type (**prev,*next) type.
+ The LRU chain is divided into two parts - hot and warm.
+ There are two pointers to access the last blocks of these two
+ parts. The beginning of the warm part follows right after the
+ end of the hot part.
+ Only blocks of the warm part can be used for replacement.
+ The first block from the beginning of this subchain is always
+ taken for eviction (pagecache->last_used->next)
+
+ LRU chain: +------+ H O T +------+
+ +----| end |----...<----| beg |----+
+ | +------+last +------+ |
+ v<-link in latest hot (new end) |
+ | link in latest warm (new end)->^
+ | +------+ W A R M +------+ |
+ +----| beg |---->...----| end |----+
+ +------+ +------+ins
+ first for eviction
+*/
+
+static void link_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block,
+ my_bool hot, my_bool at_end)
+{
+ PAGECACHE_BLOCK_LINK *ins;
+ PAGECACHE_BLOCK_LINK **ptr_ins;
+
+ PCBLOCK_INFO(block);
+ KEYCACHE_DBUG_ASSERT(! (block->hash_link && block->hash_link->requests));
+#ifdef THREAD
+ if (!hot && pagecache->waiting_for_block.last_thread)
+ {
+ /* Signal that in the LRU warm sub-chain an available block has appeared */
+ struct st_my_thread_var *last_thread=
+ pagecache->waiting_for_block.last_thread;
+ struct st_my_thread_var *first_thread= last_thread->next;
+ struct st_my_thread_var *next_thread= first_thread;
+ PAGECACHE_HASH_LINK *hash_link=
+ (PAGECACHE_HASH_LINK *) first_thread->opt_info;
+ struct st_my_thread_var *thread;
+ do
+ {
+ thread= next_thread;
+ next_thread= thread->next;
+ /*
+ We notify about the event all threads that ask
+ for the same page as the first thread in the queue
+ */
+ if ((PAGECACHE_HASH_LINK *) thread->opt_info == hash_link)
+ {
+ KEYCACHE_DBUG_PRINT("link_block: signal", ("thread: %ld", thread->id));
+ pagecache_pthread_cond_signal(&thread->suspend);
+ wqueue_unlink_from_queue(&pagecache->waiting_for_block, thread);
+ /* One request per woken waiter keeps the block off the LRU */
+ block->requests++;
+ }
+ }
+ while (thread != last_thread);
+ /* Hand the block directly to the waiters instead of linking it */
+ hash_link->block= block;
+ KEYCACHE_THREAD_TRACE("link_block: after signaling");
+#if defined(PAGECACHE_DEBUG)
+ KEYCACHE_DBUG_PRINT("link_block",
+ ("linked,unlinked block: %u status: %x #requests: %u #available: %u",
+ PCBLOCK_NUMBER(pagecache, block), block->status,
+ block->requests, pagecache->blocks_available));
+#endif
+ return;
+ }
+#else /* THREAD */
+ KEYCACHE_DBUG_ASSERT(! (!hot && pagecache->waiting_for_block.last_thread));
+ /* Condition not transformed using DeMorgan, to keep the text identical */
+#endif /* THREAD */
+ /* Insert into the hot or warm part of the circular LRU chain */
+ ptr_ins= hot ? &pagecache->used_ins : &pagecache->used_last;
+ ins= *ptr_ins;
+ if (ins)
+ {
+ ins->next_used->prev_used= &block->next_used;
+ block->next_used= ins->next_used;
+ block->prev_used= &ins->next_used;
+ ins->next_used= block;
+ if (at_end)
+ *ptr_ins= block;
+ }
+ else
+ {
+ /* The LRU chain is empty */
+ pagecache->used_last= pagecache->used_ins= block->next_used= block;
+ block->prev_used= &block->next_used;
+ }
+ KEYCACHE_THREAD_TRACE("link_block");
+#if defined(PAGECACHE_DEBUG)
+ pagecache->blocks_available++;
+ KEYCACHE_DBUG_PRINT("link_block",
+ ("linked block: %u:%1u status: %x #requests: %u #available: %u",
+ PCBLOCK_NUMBER(pagecache, block), at_end, block->status,
+ block->requests, pagecache->blocks_available));
+ KEYCACHE_DBUG_ASSERT((ulong) pagecache->blocks_available <=
+ pagecache->blocks_used);
+#endif
+}
+
+
+/*
+ Unlink a block from the LRU chain
+
+ SYNOPSIS
+ unlink_block()
+ pagecache pointer to a page cache data structure
+ block pointer to the block to unlink from the LRU chain
+
+ RETURN VALUE
+ none
+
+ NOTES.
+ See NOTES for link_block
+*/
+
+static void unlink_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block)
+{
+ DBUG_ENTER("unlink_block");
+ DBUG_PRINT("unlink_block", ("unlink 0x%lx", (ulong)block));
+ DBUG_ASSERT(block->next_used != NULL);
+ if (block->next_used == block)
+ {
+ /* The list contains only one member */
+ pagecache->used_last= pagecache->used_ins= NULL;
+ }
+ else
+ {
+ block->next_used->prev_used= block->prev_used;
+ *block->prev_used= block->next_used;
+ /* If this block was a sub-chain anchor, move the anchor back one */
+ if (pagecache->used_last == block)
+ pagecache->used_last= STRUCT_PTR(PAGECACHE_BLOCK_LINK,
+ next_used, block->prev_used);
+ if (pagecache->used_ins == block)
+ pagecache->used_ins= STRUCT_PTR(PAGECACHE_BLOCK_LINK,
+ next_used, block->prev_used);
+ }
+ /* next_used == NULL marks the block as not being in the LRU chain */
+ block->next_used= NULL;
+
+ KEYCACHE_THREAD_TRACE("unlink_block");
+#if defined(PAGECACHE_DEBUG)
+ KEYCACHE_DBUG_ASSERT(pagecache->blocks_available != 0);
+ pagecache->blocks_available--;
+ KEYCACHE_DBUG_PRINT("unlink_block",
+ ("unlinked block: 0x%lx (%u) status: %x #requests: %u #available: %u",
+ (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+ block->status,
+ block->requests, pagecache->blocks_available));
+ PCBLOCK_INFO(block);
+#endif
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Register requests for a block
+
+ SYNOPSIS
+ reg_requests()
+ pagecache this page cache reference
+ block the block we request reference
+ count how many requests we register (it is 1 everywhere)
+
+ NOTE
+ Registration of request means we are going to use this block so we exclude
+ it from the LRU if it is first request
+*/
+static void reg_requests(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block,
+ int count)
+{
+ DBUG_ENTER("reg_requests");
+ DBUG_PRINT("enter", ("block: 0x%lx (%u) status: %x reqs: %u",
+ (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+ block->status, block->requests));
+ PCBLOCK_INFO(block);
+ if (! block->requests)
+ /* First request for the block unlinks it */
+ unlink_block(pagecache, block);
+ block->requests+= count;
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Unregister request for a block
+ linking it to the LRU chain if it's the last request
+
+ SYNOPSIS
+ unreg_request()
+ pagecache pointer to a page cache data structure
+ block pointer to the block to link to the LRU chain
+ at_end <-> to link the block at the end of the LRU chain
+
+ RETURN VALUE
+ none
+
+ NOTES.
+ Every linking to the LRU chain decrements by one a special block
+ counter (if it's positive). If the at_end parameter is TRUE the block is
+ added either at the end of warm sub-chain or at the end of hot sub-chain.
+ It is added to the hot subchain if its counter is zero and number of
+ blocks in warm sub-chain is not less than some low limit (determined by
+ the division_limit parameter). Otherwise the block is added to the warm
+ sub-chain. If the at_end parameter is FALSE the block is always added
+ at beginning of the warm sub-chain.
+ Thus a warm block can be promoted to the hot sub-chain when its counter
+ becomes zero for the first time.
+ At the same time the block at the very beginning of the hot subchain
+ might be moved to the beginning of the warm subchain if it stays untouched
+ for a too long time (this time is determined by parameter age_threshold).
+*/
+
+static void unreg_request(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block, int at_end)
+{
+ DBUG_ENTER("unreg_request");
+ DBUG_PRINT("enter", ("block 0x%lx (%u) status: %x reqs: %u",
+ (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+ block->status, block->requests));
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->requests > 0);
+ /* Only the last unregistered request links the block back to the LRU */
+ if (! --block->requests)
+ {
+ my_bool hot;
+ if (block->hits_left)
+ block->hits_left--;
+ /* Promote to hot only after hits_left is exhausted and the warm
+ sub-chain is above its minimum size */
+ hot= !block->hits_left && at_end &&
+ pagecache->warm_blocks > pagecache->min_warm_blocks;
+ if (hot)
+ {
+ if (block->temperature == PCBLOCK_WARM)
+ pagecache->warm_blocks--;
+ block->temperature= PCBLOCK_HOT;
+ KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu",
+ pagecache->warm_blocks));
+ }
+ link_block(pagecache, block, hot, (my_bool)at_end);
+ block->last_hit_time= pagecache->time;
+ pagecache->time++;
+
+ /* Note: 'block' is reused below for the oldest hot block */
+ block= pagecache->used_ins;
+ /* Check if we should link a hot block to the warm block */
+ if (block && pagecache->time - block->last_hit_time >
+ pagecache->age_threshold)
+ {
+ /* Demote an aged hot block to the head of the warm sub-chain */
+ unlink_block(pagecache, block);
+ link_block(pagecache, block, 0, 0);
+ if (block->temperature != PCBLOCK_WARM)
+ {
+ pagecache->warm_blocks++;
+ block->temperature= PCBLOCK_WARM;
+ }
+ KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu",
+ pagecache->warm_blocks));
+ }
+ }
+ DBUG_VOID_RETURN;
+}
+
+/*
+ Remove a reader of the page in block
+*/
+
+static inline void remove_reader(PAGECACHE_BLOCK_LINK *block)
+{
+ DBUG_ENTER("remove_reader");
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->hash_link->requests > 0);
+#ifdef THREAD
+ /* Wake a thread waiting in wait_for_readers() when the last reader leaves */
+ if (! --block->hash_link->requests && block->condvar)
+ pagecache_pthread_cond_signal(block->condvar);
+#else
+ --block->hash_link->requests;
+#endif
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Wait until the last reader of the page in block
+ signals on its termination
+*/
+
+static inline void wait_for_readers(PAGECACHE *pagecache
+ __attribute__((unused)),
+ PAGECACHE_BLOCK_LINK *block)
+{
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+ /* Sleep on our own condvar; remove_reader() signals it when
+ hash_link->requests reaches zero */
+ while (block->hash_link->requests)
+ {
+ KEYCACHE_DBUG_PRINT("wait_for_readers: wait",
+ ("suspend thread: %ld block: %u",
+ thread->id, PCBLOCK_NUMBER(pagecache, block)));
+ block->condvar= &thread->suspend;
+ pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock);
+ block->condvar= NULL;
+ }
+#else
+ KEYCACHE_DBUG_ASSERT(block->hash_link->requests == 0);
+#endif
+}
+
+
+/*
+ Add a hash link to a bucket in the hash_table
+*/
+
+static inline void link_hash(PAGECACHE_HASH_LINK **start,
+ PAGECACHE_HASH_LINK *hash_link)
+{
+ /* Push hash_link onto the front of the bucket's chain */
+ if (*start)
+ (*start)->prev= &hash_link->next;
+ hash_link->next= *start;
+ hash_link->prev= start;
+ *start= hash_link;
+}
+
+
+/*
+ Remove a hash link from the hash table
+*/
+
+static void unlink_hash(PAGECACHE *pagecache, PAGECACHE_HASH_LINK *hash_link)
+{
+ KEYCACHE_DBUG_PRINT("unlink_hash", ("fd: %u pos_ %lu #requests=%u",
+ (uint) hash_link->file.file, (ulong) hash_link->pageno,
+ hash_link->requests));
+ KEYCACHE_DBUG_ASSERT(hash_link->requests == 0);
+ /* Remove from the hash bucket chain */
+ if ((*hash_link->prev= hash_link->next))
+ hash_link->next->prev= hash_link->prev;
+ hash_link->block= NULL;
+#ifdef THREAD
+ if (pagecache->waiting_for_hash_link.last_thread)
+ {
+ /* Signal that a free hash link has appeared */
+ struct st_my_thread_var *last_thread=
+ pagecache->waiting_for_hash_link.last_thread;
+ struct st_my_thread_var *first_thread= last_thread->next;
+ struct st_my_thread_var *next_thread= first_thread;
+ PAGECACHE_PAGE *first_page= (PAGECACHE_PAGE *) (first_thread->opt_info);
+ struct st_my_thread_var *thread;
+
+ /* Re-assign the freed link to the first waiter's (file, page) */
+ hash_link->file= first_page->file;
+ DBUG_ASSERT(first_page->pageno < ((ULL(1)) << 40));
+ hash_link->pageno= first_page->pageno;
+ do
+ {
+ PAGECACHE_PAGE *page;
+ thread= next_thread;
+ page= (PAGECACHE_PAGE *) thread->opt_info;
+ next_thread= thread->next;
+ /*
+ We notify about the event all threads that ask
+ for the same page as the first thread in the queue
+ */
+ if (page->file.file == hash_link->file.file &&
+ page->pageno == hash_link->pageno)
+ {
+ KEYCACHE_DBUG_PRINT("unlink_hash: signal", ("thread %ld", thread->id));
+ pagecache_pthread_cond_signal(&thread->suspend);
+ wqueue_unlink_from_queue(&pagecache->waiting_for_hash_link, thread);
+ }
+ }
+ while (thread != last_thread);
+ /* Link it into the bucket for the new (file, page) and keep it in use */
+ link_hash(&pagecache->hash_root[PAGECACHE_HASH(pagecache,
+ hash_link->file,
+ hash_link->pageno)],
+ hash_link);
+ return;
+ }
+#else /* THREAD */
+ KEYCACHE_DBUG_ASSERT(! (pagecache->waiting_for_hash_link.last_thread));
+#endif /* THREAD */
+ /* No waiters: return the link to the free list */
+ hash_link->next= pagecache->free_hash_list;
+ pagecache->free_hash_list= hash_link;
+}
+
+
+/*
+  Get the hash link for the page if it is in the cache (do not put the
+  page in the cache if it is absent there)
+
+  SYNOPSIS
+    get_present_hash_link()
+    pagecache Pagecache reference
+    file file ID
+    pageno page number in the file
+    start where to put pointer to found hash bucket (for
+    direct referring it)
+
+  RETURN
+    found hashlink pointer (with its request counter incremented),
+    or NULL if the page has no hash link in the cache
+*/
+
+static PAGECACHE_HASH_LINK *get_present_hash_link(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ PAGECACHE_HASH_LINK ***start)
+{
+ reg1 PAGECACHE_HASH_LINK *hash_link;
+#if defined(PAGECACHE_DEBUG)
+ int cnt;
+#endif
+ DBUG_ENTER("get_present_hash_link");
+
+ KEYCACHE_DBUG_PRINT("get_present_hash_link", ("fd: %u pos: %lu",
+ (uint) file->file, (ulong) pageno));
+
+ /*
+ Find the bucket in the hash table for the pair (file, pageno);
+ start contains the head of the bucket list,
+ hash_link points to the first member of the list
+ */
+ hash_link= *(*start= &pagecache->hash_root[PAGECACHE_HASH(pagecache,
+ *file, pageno)]);
+#if defined(PAGECACHE_DEBUG)
+ cnt= 0;
+#endif
+ /* Look for an element for the pair (file, pageno) in the bucket chain */
+ while (hash_link &&
+ (hash_link->pageno != pageno ||
+ hash_link->file.file != file->file))
+ {
+ hash_link= hash_link->next;
+#if defined(PAGECACHE_DEBUG)
+ cnt++;
+ /* Chain longer than links in use => corrupted chain; dump it for debug */
+ if (! (cnt <= pagecache->hash_links_used))
+ {
+ int i;
+ for (i=0, hash_link= **start ;
+ i < cnt ; i++, hash_link= hash_link->next)
+ {
+ KEYCACHE_DBUG_PRINT("get_present_hash_link", ("fd: %u pos: %lu",
+ (uint) hash_link->file.file, (ulong) hash_link->pageno));
+ }
+ }
+ KEYCACHE_DBUG_ASSERT(cnt <= pagecache->hash_links_used);
+#endif
+ }
+ if (hash_link)
+ {
+ /* Register the request for the page */
+ hash_link->requests++;
+ }
+ /*
+ As soon as the caller will release the page cache's lock, "hash_link"
+ will be potentially obsolete (unusable) information.
+ */
+ DBUG_RETURN(hash_link);
+}
+
+
+/*
+  Get the hash link for a page, creating one if it does not exist yet.
+
+  May suspend the calling thread (threaded builds only) when all hash
+  links are in use; unlink_hash() wakes it once a link is freed, after
+  which the lookup restarts.  Returns with the link's request counter
+  incremented.  Caller holds pagecache->cache_lock.
+*/
+
+static PAGECACHE_HASH_LINK *get_hash_link(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno)
+{
+ reg1 PAGECACHE_HASH_LINK *hash_link;
+ PAGECACHE_HASH_LINK **start;
+
+ KEYCACHE_DBUG_PRINT("get_hash_link", ("fd: %u pos: %lu",
+ (uint) file->file, (ulong) pageno));
+
+restart:
+ /* try to find the page in the cache */
+ hash_link= get_present_hash_link(pagecache, file, pageno,
+ &start);
+ if (!hash_link)
+ {
+ /* There is no hash link in the hash table for the pair (file, pageno) */
+ if (pagecache->free_hash_list)
+ {
+ /* Reuse a previously freed hash link */
+ hash_link= pagecache->free_hash_list;
+ pagecache->free_hash_list= hash_link->next;
+ }
+ else if (pagecache->hash_links_used < pagecache->hash_links)
+ {
+ /* Take a never used link from the preallocated array */
+ hash_link= &pagecache->hash_link_root[pagecache->hash_links_used++];
+ }
+ else
+ {
+#ifdef THREAD
+ /* Wait for a free hash link */
+ struct st_my_thread_var *thread= my_thread_var;
+ PAGECACHE_PAGE page;
+ KEYCACHE_DBUG_PRINT("get_hash_link", ("waiting"));
+ page.file= *file;
+ page.pageno= pageno;
+ /* unlink_hash() reads opt_info to learn which page we want */
+ thread->opt_info= (void *) &page;
+ wqueue_link_into_queue(&pagecache->waiting_for_hash_link, thread);
+ KEYCACHE_DBUG_PRINT("get_hash_link: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ thread->opt_info= NULL;
+#else
+ KEYCACHE_DBUG_ASSERT(0);
+#endif
+ DBUG_PRINT("info", ("restarting..."));
+ goto restart;
+ }
+ hash_link->file= *file;
+ DBUG_ASSERT(pageno < ((ULL(1)) << 40));
+ hash_link->pageno= pageno;
+ link_hash(start, hash_link);
+ /* Register the request for the page */
+ hash_link->requests++;
+ }
+
+ return hash_link;
+}
+
+
+/*
+ Get a block for the file page requested by a pagecache read/write operation;
+ If the page is not in the cache return a free block, if there is none
+ return the lru block after saving its buffer if the page is dirty.
+
+ SYNOPSIS
+
+ find_block()
+ pagecache pointer to a page cache data structure
+ file handler for the file to read page from
+ pageno number of the page in the file
+ init_hits_left how initialize the block counter for the page
+ wrmode <-> get for writing
+ reg_req Register request to the page
+ page_st out {PAGE_READ,PAGE_TO_BE_READ,PAGE_WAIT_TO_BE_READ}
+
+ RETURN VALUE
+ Pointer to the found block if successful, 0 - otherwise
+
+ NOTES.
+ For the page from file positioned at pageno the function checks whether
+ the page is in the key cache specified by the first parameter.
+ If this is the case it immediately returns the block.
+ If not, the function first chooses a block for this page. If there is
+ no not used blocks in the key cache yet, the function takes the block
+ at the very beginning of the warm sub-chain. It saves the page in that
+ block if it's dirty before returning the pointer to it.
+ The function returns in the page_st parameter the following values:
+ PAGE_READ - if page already in the block,
+ PAGE_TO_BE_READ - if it is to be read yet by the current thread
+ WAIT_TO_BE_READ - if it is to be read by another thread
+ If an error occurs THE PCBLOCK_ERROR bit is set in the block status.
+ It might happen that there are no blocks in LRU chain (in warm part) -
+ all blocks are unlinked for some read/write operations. Then the function
+ waits until first of this operations links any block back.
+*/
+
+static PAGECACHE_BLOCK_LINK *find_block(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ int init_hits_left,
+ my_bool wrmode,
+ my_bool reg_req,
+ int *page_st)
+{
+ PAGECACHE_HASH_LINK *hash_link;
+ PAGECACHE_BLOCK_LINK *block;
+ int error= 0;
+ int page_status;
+
+ DBUG_ENTER("find_block");
+ KEYCACHE_THREAD_TRACE("find_block:begin");
+ DBUG_PRINT("enter", ("fd: %d pos: %lu wrmode: %d",
+ file->file, (ulong) pageno, wrmode));
+ KEYCACHE_DBUG_PRINT("find_block", ("fd: %d pos: %lu wrmode: %d",
+ file->file, (ulong) pageno,
+ wrmode));
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+ DBUG_EXECUTE("check_pagecache",
+ test_key_cache(pagecache, "start of find_block", 0););
+#endif
+
+restart:
+ /* Find the hash link for the requested page (file, pageno) */
+ hash_link= get_hash_link(pagecache, file, pageno);
+
+ page_status= -1;
+ if ((block= hash_link->block) &&
+ block->hash_link == hash_link && (block->status & PCBLOCK_READ))
+ page_status= PAGE_READ;
+
+ if (wrmode && pagecache->resize_in_flush)
+ {
+ /* This is a write request during the flush phase of a resize operation */
+
+ if (page_status != PAGE_READ)
+ {
+ /* We don't need the page in the cache: we are going to write on disk */
+ DBUG_ASSERT(hash_link->requests > 0);
+ hash_link->requests--;
+ unlink_hash(pagecache, hash_link);
+ /* Fixed: was plain "return 0", which unbalances the DBUG call stack */
+ DBUG_RETURN(0);
+ }
+ if (!(block->status & PCBLOCK_IN_FLUSH))
+ {
+ DBUG_ASSERT(hash_link->requests > 0);
+ hash_link->requests--;
+ /*
+ Remove block to invalidate the page in the block buffer
+ as we are going to write directly on disk.
+ Although we have an exclusive lock for the updated key part
+ the control can be yielded by the current thread as we might
+ have unfinished readers of other key parts in the block
+ buffer. Still we are guaranteed not to have any readers
+ of the key part we are writing into until the block is
+ removed from the cache as we set the PCBLOCK_REASSIGNED
+ flag (see the code below that handles reading requests).
+ */
+ free_block(pagecache, block);
+ /* Fixed: was plain "return 0", which unbalances the DBUG call stack */
+ DBUG_RETURN(0);
+ }
+ /* Wait until the page is flushed on disk */
+ DBUG_ASSERT(hash_link->requests > 0);
+ hash_link->requests--;
+ {
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+ wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread);
+ do
+ {
+ KEYCACHE_DBUG_PRINT("find_block: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while(thread->next);
+#else
+ KEYCACHE_DBUG_ASSERT(0);
+ /*
+ Given the use of "resize_in_flush", it seems impossible
+ that this whole branch is ever entered in single-threaded case
+ because "(wrmode && pagecache->resize_in_flush)" cannot be true.
+ TODO: Check this, and then put the whole branch into the
+ "#ifdef THREAD" guard.
+ */
+#endif
+ }
+ /* Invalidate page in the block if it has not been done yet */
+ if (block->status)
+ free_block(pagecache, block);
+ /* Fixed: was plain "return 0", which unbalances the DBUG call stack */
+ DBUG_RETURN(0);
+ }
+
+ if (page_status == PAGE_READ &&
+ (block->status & (PCBLOCK_IN_SWITCH | PCBLOCK_REASSIGNED)))
+ {
+ /* This is a request for a page to be removed from cache */
+
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("request for old page in block: %u "
+ "wrmode: %d block->status: %d",
+ PCBLOCK_NUMBER(pagecache, block), wrmode,
+ block->status));
+ /*
+ Only reading requests can proceed until the old dirty page is flushed,
+ all others are to be suspended, then resubmitted
+ */
+ if (!wrmode && !(block->status & PCBLOCK_REASSIGNED))
+ {
+ if (reg_req)
+ reg_requests(pagecache, block, 1);
+ }
+ else
+ {
+ DBUG_ASSERT(hash_link->requests > 0);
+ hash_link->requests--;
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("request waiting for old page to be saved"));
+ {
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+ /* Put the request into the queue of those waiting for the old page */
+ wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread);
+ /* Wait until the request can be resubmitted */
+ do
+ {
+ KEYCACHE_DBUG_PRINT("find_block: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while(thread->next);
+#else
+ KEYCACHE_DBUG_ASSERT(0);
+ /* No parallel requests in single-threaded case */
+#endif
+ }
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("request for old page resubmitted"));
+ DBUG_PRINT("info", ("restarting..."));
+ /* Resubmit the request */
+ goto restart;
+ }
+ }
+ else
+ {
+ /* This is a request for a new page or for a page not to be removed */
+ if (! block)
+ {
+ /* No block is assigned for the page yet */
+ if (pagecache->blocks_unused)
+ {
+ if (pagecache->free_block_list)
+ {
+ /* There is a block in the free list. */
+ block= pagecache->free_block_list;
+ pagecache->free_block_list= block->next_used;
+ block->next_used= NULL;
+ }
+ else
+ {
+ /* There are some never used blocks, take first of them */
+ block= &pagecache->block_root[pagecache->blocks_used];
+ block->buffer= ADD_TO_PTR(pagecache->block_mem,
+ ((ulong) pagecache->blocks_used*
+ pagecache->block_size),
+ uchar*);
+ pagecache->blocks_used++;
+ }
+ pagecache->blocks_unused--;
+ DBUG_ASSERT(block->wlocks == 0);
+ DBUG_ASSERT(block->rlocks == 0);
+ DBUG_ASSERT(block->rlocks_queue == 0);
+ DBUG_ASSERT(block->pins == 0);
+ block->status= 0;
+#ifndef DBUG_OFF
+ block->type= PAGECACHE_EMPTY_PAGE;
+#endif
+ block->requests= 1;
+ block->temperature= PCBLOCK_COLD;
+ block->hits_left= init_hits_left;
+ block->last_hit_time= 0;
+ block->rec_lsn= LSN_MAX;
+ link_to_file_list(pagecache, block, file, 0);
+ block->hash_link= hash_link;
+ hash_link->block= block;
+ page_status= PAGE_TO_BE_READ;
+ DBUG_PRINT("info", ("page to be read set for page 0x%lx",
+ (ulong)block));
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("got free or never used block %u",
+ PCBLOCK_NUMBER(pagecache, block)));
+ }
+ else
+ {
+ /* There are no never used blocks, use a block from the LRU chain */
+
+ /*
+ Wait until a new block is added to the LRU chain;
+ several threads might wait here for the same page,
+ all of them must get the same block
+ */
+
+#ifdef THREAD
+ if (! pagecache->used_last)
+ {
+ struct st_my_thread_var *thread= my_thread_var;
+ thread->opt_info= (void *) hash_link;
+ wqueue_link_into_queue(&pagecache->waiting_for_block, thread);
+ do
+ {
+ KEYCACHE_DBUG_PRINT("find_block: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while (thread->next);
+ thread->opt_info= NULL;
+ }
+#else
+ KEYCACHE_DBUG_ASSERT(pagecache->used_last);
+#endif
+ /* Another waiter may already have assigned a block to our hash link */
+ block= hash_link->block;
+ if (! block)
+ {
+ /*
+ Take the first block from the LRU chain
+ unlinking it from the chain
+ */
+ block= pagecache->used_last->next_used;
+ block->hits_left= init_hits_left;
+ block->last_hit_time= 0;
+ if (reg_req)
+ reg_requests(pagecache, block, 1);
+ hash_link->block= block;
+ }
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->wlocks == 0);
+ DBUG_ASSERT(block->rlocks == 0);
+ DBUG_ASSERT(block->rlocks_queue == 0);
+ DBUG_ASSERT(block->pins == 0);
+
+ if (block->hash_link != hash_link &&
+ ! (block->status & PCBLOCK_IN_SWITCH) )
+ {
+ /* this is a primary request for a new page */
+ DBUG_ASSERT(block->wlocks == 0);
+ DBUG_ASSERT(block->rlocks == 0);
+ DBUG_ASSERT(block->rlocks_queue == 0);
+ DBUG_ASSERT(block->pins == 0);
+ block->status|= PCBLOCK_IN_SWITCH;
+
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("got block %u for new page",
+ PCBLOCK_NUMBER(pagecache, block)));
+
+ if (block->status & PCBLOCK_CHANGED)
+ {
+ /* The block contains a dirty page - push it out of the cache */
+
+ KEYCACHE_DBUG_PRINT("find_block", ("block is dirty"));
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ /*
+ The call is thread safe because only the current
+ thread might change the block->hash_link value
+ */
+ DBUG_ASSERT(block->pins == 0);
+ error= pagecache_fwrite(pagecache,
+ &block->hash_link->file,
+ block->buffer,
+ block->hash_link->pageno,
+ block->type,
+ pagecache->readwrite_flags);
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ pagecache->global_cache_write++;
+ }
+
+ block->status|= PCBLOCK_REASSIGNED;
+ if (block->hash_link)
+ {
+ /*
+ Wait until all pending read requests
+ for this page are executed
+ (we could have avoided this waiting, if we had read
+ a page in the cache in a sweep, without yielding control)
+ */
+ wait_for_readers(pagecache, block);
+
+ /* Remove the hash link for this page from the hash table */
+ unlink_hash(pagecache, block->hash_link);
+ /* All pending requests for this page must be resubmitted */
+#ifdef THREAD
+ if (block->wqueue[COND_FOR_SAVED].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]);
+#endif
+ }
+ link_to_file_list(pagecache, block, file,
+ (my_bool)(block->hash_link ? 1 : 0));
+ PCBLOCK_INFO(block);
+ block->status= error ? PCBLOCK_ERROR : 0;
+ block->error= (int16) my_errno;
+#ifndef DBUG_OFF
+ block->type= PAGECACHE_EMPTY_PAGE;
+ if (error)
+ my_debug_put_break_here();
+#endif
+ block->hash_link= hash_link;
+ page_status= PAGE_TO_BE_READ;
+ DBUG_PRINT("info", ("page to be read set for page 0x%lx",
+ (ulong)block));
+
+ KEYCACHE_DBUG_ASSERT(block->hash_link->block == block);
+ KEYCACHE_DBUG_ASSERT(hash_link->block->hash_link == hash_link);
+ }
+ else
+ {
+ /* This is for secondary requests for a new page only */
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("block->hash_link: %p hash_link: %p "
+ "block->status: %u", block->hash_link,
+ hash_link, block->status ));
+ page_status= (((block->hash_link == hash_link) &&
+ (block->status & PCBLOCK_READ)) ?
+ PAGE_READ : PAGE_WAIT_TO_BE_READ);
+ }
+ }
+ }
+ else
+ {
+ /* The page already has a block assigned; just register our request */
+ if (reg_req)
+ reg_requests(pagecache, block, 1);
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("block->hash_link: %p hash_link: %p "
+ "block->status: %u", block->hash_link,
+ hash_link, block->status ));
+ page_status= (((block->hash_link == hash_link) &&
+ (block->status & PCBLOCK_READ)) ?
+ PAGE_READ : PAGE_WAIT_TO_BE_READ);
+ }
+ }
+
+ KEYCACHE_DBUG_ASSERT(page_status != -1);
+ *page_st= page_status;
+ DBUG_PRINT("info",
+ ("block: 0x%lx fd: %u pos: %lu block->status: %u page_status: %u",
+ (ulong) block, (uint) file->file,
+ (ulong) pageno, block->status, (uint) page_status));
+ KEYCACHE_DBUG_PRINT("find_block",
+ ("block: 0x%lx fd: %d pos: %lu block->status: %u page_status: %d",
+ (ulong) block,
+ file->file, (ulong) pageno, block->status,
+ page_status));
+
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+ DBUG_EXECUTE("check_pagecache",
+ test_key_cache(pagecache, "end of find_block",0););
+#endif
+ KEYCACHE_THREAD_TRACE("find_block:end");
+ DBUG_RETURN(block);
+}
+
+
+/*
+  Pin the block: increment its pin counter.
+  In debug builds, also record which thread took the pin so that
+  remove_pin() can verify balanced pin/unpin pairs per thread.
+  Caller holds pagecache->cache_lock.
+*/
+static void add_pin(PAGECACHE_BLOCK_LINK *block)
+{
+ DBUG_ENTER("add_pin");
+ DBUG_PRINT("enter", ("block: 0x%lx pins: %u",
+ (ulong) block,
+ block->pins));
+ PCBLOCK_INFO(block);
+ block->pins++;
+#ifndef DBUG_OFF
+ {
+ /* Record the pinning thread for later balance checking */
+ PAGECACHE_PIN_INFO *info=
+ (PAGECACHE_PIN_INFO *)my_malloc(sizeof(PAGECACHE_PIN_INFO), MYF(0));
+ info->thread= my_thread_var;
+ info_link(&block->pin_list, info);
+ }
+#endif
+ DBUG_VOID_RETURN;
+}
+
+/*
+  Unpin the block: decrement its pin counter.
+  'any' allows removing a pin taken by any thread (used only via
+  pagecache_unlock_by_link(), see make_lock_and_pin()); otherwise the
+  pin must belong to the calling thread (checked in debug builds).
+  Caller holds pagecache->cache_lock.
+*/
+static void remove_pin(PAGECACHE_BLOCK_LINK *block, my_bool any
+#ifdef DBUG_OFF
+ __attribute__((unused))
+#endif
+ )
+{
+ DBUG_ENTER("remove_pin");
+ DBUG_PRINT("enter", ("block: 0x%lx pins: %u any: %d",
+ (ulong) block,
+ block->pins, (int)any));
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->pins > 0);
+ block->pins--;
+#ifndef DBUG_OFF
+ {
+ /* Find and drop the matching pin record (by this thread unless 'any') */
+ PAGECACHE_PIN_INFO *info= info_find(block->pin_list, my_thread_var, any);
+ DBUG_ASSERT(info != 0);
+ info_unlink(info);
+ my_free(info, MYF(0));
+ }
+#endif
+ DBUG_VOID_RETURN;
+}
+#ifndef DBUG_OFF
+/*
+  Debug-only bookkeeping of which thread holds which kind of lock on a
+  block.  In non-debug builds these collapse to no-op macros below.
+*/
+
+/* Record that the calling thread took a lock (wl != 0 => write lock) */
+static void info_add_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl)
+{
+ PAGECACHE_LOCK_INFO *info=
+ (PAGECACHE_LOCK_INFO *)my_malloc(sizeof(PAGECACHE_LOCK_INFO), MYF(0));
+ info->thread= my_thread_var;
+ info->write_lock= wl;
+ info_link((PAGECACHE_PIN_INFO **)&block->lock_list,
+ (PAGECACHE_PIN_INFO *)info);
+}
+/* Drop the calling thread's lock record */
+static void info_remove_lock(PAGECACHE_BLOCK_LINK *block)
+{
+ PAGECACHE_LOCK_INFO *info=
+ (PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list,
+ my_thread_var, FALSE);
+ DBUG_ASSERT(info != 0);
+ info_unlink((PAGECACHE_PIN_INFO *)info);
+ my_free(info, MYF(0));
+}
+/* Flip the calling thread's lock record between read and write */
+static void info_change_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl)
+{
+ PAGECACHE_LOCK_INFO *info=
+ (PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list,
+ my_thread_var, FALSE);
+ DBUG_ASSERT(info != 0);
+ DBUG_ASSERT(info->write_lock != wl);
+ info->write_lock= wl;
+}
+#else
+#define info_add_lock(B,W)
+#define info_remove_lock(B)
+#define info_change_lock(B,W)
+#endif
+
+
+/**
+ @brief waiting for lock for read and write lock
+
+ @param pagecache pointer to a page cache data structure
+ @param block the block to work with
+ @param file file of the block when it was locked
+ @param pageno page number of the block when it was locked
+ @param lock_type MY_PTHREAD_LOCK_READ or MY_PTHREAD_LOCK_WRITE
+
+ @retval 0 OK
+ @retval 1 Can't lock this block, need retry
+*/
+
+static my_bool pagecache_wait_lock(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ PAGECACHE_FILE file,
+ pgcache_page_no_t pageno,
+ uint lock_type)
+{
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+#endif
+ /*
+ Fixed: DBUG_ENTER must be executed in both threaded and non-threaded
+ builds, because the DBUG_RETURNs below are outside the #ifdef THREAD
+ guard (the original had DBUG_ENTER inside it, breaking non-THREAD
+ debug builds with a RETURN that has no matching ENTER).
+ */
+ DBUG_ENTER("pagecache_wait_lock");
+ /* Lock failed we will wait */
+#ifdef THREAD
+ DBUG_PRINT("info", ("fail to lock, waiting... 0x%lx", (ulong)block));
+ thread->lock_type= lock_type;
+ wqueue_add_to_queue(&block->wqueue[COND_FOR_WRLOCK], thread);
+ dec_counter_for_resize_op(pagecache);
+ do
+ {
+ KEYCACHE_DBUG_PRINT("get_wrlock: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while(thread->next);
+#else
+ DBUG_ASSERT(0);
+#endif
+ PCBLOCK_INFO(block);
+ /*
+ While we slept the block may have been reassigned to another page:
+ if its identity or status changed, tell the caller to retry.
+ */
+ if ((block->status & (PCBLOCK_REASSIGNED | PCBLOCK_IN_SWITCH)) ||
+ file.file != block->hash_link->file.file ||
+ pageno != block->hash_link->pageno)
+ {
+ DBUG_PRINT("info", ("the block 0x%lx changed => need retry "
+ "status: %x files %d != %d or pages %lu != %lu",
+ (ulong)block, block->status,
+ file.file, block->hash_link->file.file,
+ (ulong) pageno, (ulong) block->hash_link->pageno));
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+/**
+  @brief Put on the block write lock
+
+  @param pagecache pointer to a page cache data structure
+  @param block the block to work with
+
+  @note We have loose scheme for locking by the same thread:
+  * Downgrade to read lock if no other locks are taken
+  * Our scheme of locking allow for the same thread
+  - the same kind of lock
+  - taking read lock if write lock present
+  - downgrading to read lock if still other place the same
+  thread keep write lock
+  * But unlock operation number should be the same to lock operation.
+  * If we try to get read lock having active write locks we put read
+  locks to queue, and as soon as write lock(s) gone the read locks
+  from queue came in force.
+  * If read lock is unlocked earlier then it came to force it
+  just removed from the queue
+
+  @retval 0 OK
+  @retval 1 Can't lock this block, need retry
+*/
+
+static my_bool get_wrlock(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block)
+{
+ /* Remember identity so we can detect reassignment after waiting */
+ PAGECACHE_FILE file= block->hash_link->file;
+ pgcache_page_no_t pageno= block->hash_link->pageno;
+ pthread_t locker= pthread_self();
+ DBUG_ENTER("get_wrlock");
+ DBUG_PRINT("info", ("the block 0x%lx "
+ "files %d(%d) pages %lu(%lu)",
+ (ulong) block,
+ file.file, block->hash_link->file.file,
+ (ulong) pageno, (ulong) block->hash_link->pageno));
+ PCBLOCK_INFO(block);
+ /*
+ We assume that the same thread will try write lock on block on which it
+ has already read lock.
+ */
+ while ((block->wlocks && !pthread_equal(block->write_locker, locker)) ||
+ block->rlocks)
+ {
+ /* Lock failed we will wait */
+ if (pagecache_wait_lock(pagecache, block, file, pageno,
+ MY_PTHREAD_LOCK_WRITE))
+ DBUG_RETURN(1);
+ }
+ /* we are doing it by global cache mutex protection, so it is OK */
+ block->wlocks++;
+ block->write_locker= locker;
+ DBUG_PRINT("info", ("WR lock set, block 0x%lx", (ulong)block));
+ DBUG_RETURN(0);
+}
+
+
+/*
+  @brief Put on the block read lock
+
+  @param pagecache pointer to a page cache data structure
+  @param block the block to work with
+
+  @note see note for get_wrlock().  If the calling thread already holds
+  the write lock, the read lock is queued (rlocks_queue) and takes
+  effect when the write lock(s) are released.
+
+  @retval 0 OK
+  @retval 1 Can't lock this block, need retry
+*/
+
+static my_bool get_rdlock(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block)
+{
+ /* Remember identity so we can detect reassignment after waiting */
+ PAGECACHE_FILE file= block->hash_link->file;
+ pgcache_page_no_t pageno= block->hash_link->pageno;
+ pthread_t locker= pthread_self();
+ DBUG_ENTER("get_rdlock");
+ DBUG_PRINT("info", ("the block 0x%lx "
+ "files %d(%d) pages %lu(%lu)",
+ (ulong) block,
+ file.file, block->hash_link->file.file,
+ (ulong) pageno, (ulong) block->hash_link->pageno));
+ PCBLOCK_INFO(block);
+ while (block->wlocks && !pthread_equal(block->write_locker, locker))
+ {
+ /* Lock failed we will wait */
+ if (pagecache_wait_lock(pagecache, block, file, pageno,
+ MY_PTHREAD_LOCK_READ))
+ DBUG_RETURN(1);
+ }
+ /* we are doing it by global cache mutex protection, so it is OK */
+ if (block->wlocks)
+ {
+ /* We hold the write lock ourselves: queue the read lock for later */
+ DBUG_ASSERT(pthread_equal(block->write_locker, locker));
+ block->rlocks_queue++;
+ DBUG_PRINT("info", ("RD lock put into queue, block 0x%lx", (ulong)block));
+ }
+ else
+ {
+ block->rlocks++;
+ DBUG_PRINT("info", ("RD lock set, block 0x%lx", (ulong)block));
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+  @brief Remove write lock from the block
+
+  @param block the block to work with
+  @param read_lock downgrade to read lock
+
+  @note see note for get_wrlock().  When the last write lock is dropped,
+  any read locks queued by get_rdlock() come into force, and waiters on
+  COND_FOR_WRLOCK are released.
+*/
+
+static void release_wrlock(PAGECACHE_BLOCK_LINK *block, my_bool read_lock)
+{
+ DBUG_ENTER("release_wrlock");
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->wlocks > 0);
+ DBUG_ASSERT(block->rlocks == 0);
+ DBUG_ASSERT(block->pins > 0);
+ /* Downgrade: the new read lock joins the queued ones */
+ if (read_lock)
+ block->rlocks_queue++;
+ if (block->wlocks == 1)
+ {
+ /* Last write lock going away: queued read locks become active */
+ block->rlocks= block->rlocks_queue;
+ block->rlocks_queue= 0;
+ }
+ block->wlocks--;
+ if (block->wlocks > 0)
+ DBUG_VOID_RETURN; /* Multiple write locked */
+ DBUG_PRINT("info", ("WR lock reset, block 0x%lx", (ulong)block));
+#ifdef THREAD
+ /* release all threads waiting for read lock or one waiting for write */
+ if (block->wqueue[COND_FOR_WRLOCK].last_thread)
+ wqueue_release_one_locktype_from_queue(&block->wqueue[COND_FOR_WRLOCK]);
+#endif
+ PCBLOCK_INFO(block);
+ DBUG_VOID_RETURN;
+}
+
+/*
+  @brief Remove read lock from the block
+
+  @param block the block to work with
+
+  @note see note for get_wrlock().  A read lock taken while this thread
+  held the write lock lives in rlocks_queue and is simply dequeued here.
+  When the last active read lock is dropped, waiters on COND_FOR_WRLOCK
+  are released.
+*/
+
+static void release_rdlock(PAGECACHE_BLOCK_LINK *block)
+{
+ /* Fixed DBUG tag: it wrongly said "release_wrlock" */
+ DBUG_ENTER("release_rdlock");
+ PCBLOCK_INFO(block);
+ if (block->wlocks)
+ {
+ /* Our own write lock is active: this read lock was only queued */
+ DBUG_ASSERT(pthread_equal(block->write_locker, pthread_self()));
+ DBUG_ASSERT(block->rlocks == 0);
+ DBUG_ASSERT(block->rlocks_queue > 0);
+ block->rlocks_queue--;
+ DBUG_PRINT("info", ("RD lock queue decreased, block 0x%lx", (ulong)block));
+ DBUG_VOID_RETURN;
+ }
+ DBUG_ASSERT(block->rlocks > 0);
+ DBUG_ASSERT(block->rlocks_queue == 0);
+ block->rlocks--;
+ DBUG_PRINT("info", ("RD lock decreased, block 0x%lx", (ulong)block));
+ if (block->rlocks > 0)
+ DBUG_VOID_RETURN; /* Multiple read locked */
+ DBUG_PRINT("info", ("RD lock reset, block 0x%lx", (ulong)block));
+#ifdef THREAD
+ /* release all threads waiting for read lock or one waiting for write */
+ if (block->wqueue[COND_FOR_WRLOCK].last_thread)
+ wqueue_release_one_locktype_from_queue(&block->wqueue[COND_FOR_WRLOCK]);
+#endif
+ PCBLOCK_INFO(block);
+ DBUG_VOID_RETURN;
+}
+
+/**
+  @brief Try to lock/unlock and pin/unpin the block
+
+  @param pagecache pointer to a page cache data structure
+  @param block the block to work with
+  @param lock lock change mode
+  @param pin pin change mode
+  @param any allow unpinning block pinned by any thread; possible
+  only if not locked, see pagecache_unlock_by_link()
+
+  @retval 0 OK
+  @retval 1 Try to lock the block failed
+*/
+
+static my_bool make_lock_and_pin(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ my_bool any)
+{
+ DBUG_ENTER("make_lock_and_pin");
+
+ DBUG_PRINT("enter", ("block: 0x%lx", (ulong)block));
+#ifndef DBUG_OFF
+ if (block)
+ {
+ DBUG_PRINT("enter", ("block: 0x%lx (%u) wrlocks: %u rdlocks: %u "
+ "rdlocks_q: %u pins: %u lock: %s pin: %s any %d",
+ (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+ block->wlocks, block->rlocks, block->rlocks_queue,
+ block->pins,
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin], (int)any));
+ PCBLOCK_INFO(block);
+ }
+#endif
+
+ /* 'any' is only valid for a plain unpin without any lock transition */
+ DBUG_ASSERT(!any ||
+ ((lock == PAGECACHE_LOCK_LEFT_UNLOCKED) &&
+ (pin == PAGECACHE_UNPIN)));
+
+ /* Perform the (lock, pin) state transition; note the fall-throughs */
+ switch (lock) {
+ case PAGECACHE_LOCK_WRITE: /* free -> write */
+ /* Writelock and pin the buffer */
+ if (get_wrlock(pagecache, block))
+ {
+ /* Couldn't lock because block changed status => need retry */
+ goto retry;
+ }
+
+ /* The cache is locked so nothing afraid of */
+ add_pin(block);
+ info_add_lock(block, 1);
+ break;
+ case PAGECACHE_LOCK_WRITE_TO_READ: /* write -> read */
+ case PAGECACHE_LOCK_WRITE_UNLOCK: /* write -> free */
+ /* Removes write lock and puts read lock */
+ release_wrlock(block, lock == PAGECACHE_LOCK_WRITE_TO_READ);
+ /* fall through */
+ case PAGECACHE_LOCK_READ_UNLOCK: /* read -> free */
+ if (lock == PAGECACHE_LOCK_READ_UNLOCK)
+ release_rdlock(block);
+ /* fall through */
+ case PAGECACHE_LOCK_LEFT_READLOCKED: /* read -> read */
+ if (pin == PAGECACHE_UNPIN)
+ {
+ remove_pin(block, FALSE);
+ }
+ /* Keep the debug lock bookkeeping in sync with the transition */
+ if (lock == PAGECACHE_LOCK_WRITE_TO_READ)
+ {
+ info_change_lock(block, 0);
+ }
+ else if (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+ lock == PAGECACHE_LOCK_READ_UNLOCK)
+ {
+ info_remove_lock(block);
+ }
+ break;
+ case PAGECACHE_LOCK_READ: /* free -> read */
+ if (get_rdlock(pagecache, block))
+ {
+ /* Couldn't lock because block changed status => need retry */
+ goto retry;
+ }
+
+ if (pin == PAGECACHE_PIN)
+ {
+ /* The cache is locked so nothing afraid off */
+ add_pin(block);
+ }
+ info_add_lock(block, 0);
+ break;
+ case PAGECACHE_LOCK_LEFT_UNLOCKED: /* free -> free */
+ if (pin == PAGECACHE_UNPIN)
+ {
+ remove_pin(block, any);
+ }
+ /* fall through */
+ case PAGECACHE_LOCK_LEFT_WRITELOCKED: /* write -> write */
+ break; /* do nothing */
+ default:
+ DBUG_ASSERT(0); /* Never should happened */
+ }
+
+#ifndef DBUG_OFF
+ if (block)
+ PCBLOCK_INFO(block);
+#endif
+ DBUG_RETURN(0);
+retry:
+ /* Locking failed: drop our page request registration and report retry */
+ DBUG_PRINT("INFO", ("Retry block 0x%lx", (ulong)block));
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block->hash_link->requests > 0);
+ block->hash_link->requests--;
+ PCBLOCK_INFO(block);
+ DBUG_RETURN(1);
+
+}
+
+
+/*
+  Read into a key cache block buffer from disk.
+
+  SYNOPSIS
+
+  read_block()
+  pagecache pointer to a page cache data structure
+  block block to which buffer the data is to be read
+  primary <-> the current thread will read the data
+
+  RETURN VALUE
+  None
+
+  NOTES.
+  The function either reads a page data from file to the block buffer,
+  or waits until another thread reads it. What page to read is determined
+  by a block parameter - reference to a hash link for this page.
+  If an error occurs THE PCBLOCK_ERROR bit is set in the block status.
+
+  On entry cache_lock is locked
+*/
+
+static void read_block(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ my_bool primary)
+{
+
+ DBUG_ENTER("read_block");
+ DBUG_PRINT("enter", ("read block: 0x%lx primary: %d",
+ (ulong)block, primary));
+ if (primary)
+ {
+ size_t error;
+ /*
+ This code is executed only by threads
+ that submitted primary requests
+ */
+
+ pagecache->global_cache_read++;
+ /* Page is not in buffer yet, is to be read from disk */
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ /*
+ Here other threads may step in and register as secondary readers.
+ They will register in block->wqueue[COND_FOR_REQUESTED].
+ */
+ error= pagecache_fread(pagecache, &block->hash_link->file,
+ block->buffer,
+ block->hash_link->pageno,
+ pagecache->readwrite_flags);
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ if (error)
+ {
+ block->status|= PCBLOCK_ERROR;
+ block->error= (int16) my_errno;
+ my_debug_put_break_here();
+ }
+ else
+ {
+ block->status|= PCBLOCK_READ;
+ /* Let the file's read callback validate/post-process the page */
+ if ((*block->hash_link->file.read_callback)(block->buffer,
+ block->hash_link->pageno,
+ block->hash_link->
+ file.callback_data))
+ {
+ DBUG_PRINT("error", ("read callback problem"));
+ block->status|= PCBLOCK_ERROR;
+ block->error= (int16) my_errno;
+ my_debug_put_break_here();
+ }
+ }
+ DBUG_PRINT("read_block",
+ ("primary request: new page in cache"));
+ /* Signal that all pending requests for this page now can be processed */
+#ifdef THREAD
+ if (block->wqueue[COND_FOR_REQUESTED].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]);
+#endif
+ }
+ else
+ {
+ /*
+ This code is executed only by threads
+ that submitted secondary requests
+ */
+
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+ /* Put the request into a queue and wait until it can be processed */
+ wqueue_add_to_queue(&block->wqueue[COND_FOR_REQUESTED], thread);
+ do
+ {
+ DBUG_PRINT("read_block: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while (thread->next);
+#else
+ KEYCACHE_DBUG_ASSERT(0);
+ /* No parallel requests in single-threaded case */
+#endif
+ DBUG_PRINT("read_block",
+ ("secondary request: new page in cache"));
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Set LSN on the page to the given one if the given LSN is bigger
+
+ @param pagecache pointer to a page cache data structure
+ @param lsn LSN to set
+ @param block block to check and set
+*/
+
+static void check_and_set_lsn(PAGECACHE *pagecache,
+ LSN lsn, PAGECACHE_BLOCK_LINK *block)
+{
+ LSN old;
+ DBUG_ENTER("check_and_set_lsn");
+ /*
+ In recovery, we can _ma_unpin_all_pages() to put a LSN on page, though
+ page would be PAGECACHE_PLAIN_PAGE (transactionality temporarily disabled
+ to not log REDOs).
+ */
+ DBUG_ASSERT((block->type == PAGECACHE_LSN_PAGE) || maria_in_recovery);
+ old= lsn_korr(block->buffer);
+ DBUG_PRINT("info", ("old lsn: (%lu, 0x%lx) new lsn: (%lu, 0x%lx)",
+ LSN_IN_PARTS(old), LSN_IN_PARTS(lsn)));
+ /* The LSN stamped on the page is only ever moved forwards */
+ if (cmp_translog_addr(lsn, old) > 0)
+ {
+
+ DBUG_ASSERT(block->type != PAGECACHE_READ_UNKNOWN_PAGE);
+ lsn_store(block->buffer, lsn);
+ /* we stored LSN in page so we dirtied it */
+ if (!(block->status & PCBLOCK_CHANGED))
+ link_to_changed_list(pagecache, block);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Unlock/unpin page and put LSN stamp if needed
+
+ @param pagecache pointer to a page cache data structure
+ @param file handler for the file for the block of data to be read
+ @param pageno number of the block of data in the file
+ @param lock lock change
+ @param pin pin page
+ @param first_REDO_LSN_for_page do not set it if it is zero
+ @param lsn if it is not LSN_IMPOSSIBLE (0) and it
+ is bigger than the LSN on the page it will be written
+ on the page
+ @param was_changed should be true if the page was write locked with
+ direct link giving and the page was changed
+
+ @note
+ Pinning uses the request registration mechanism; it works the
+ following way:
+ | beginning | ending |
+ | of func. | of func. |
+ ----------------------------+-------------+---------------+
+ PAGECACHE_PIN_LEFT_PINNED | - | - |
+ PAGECACHE_PIN_LEFT_UNPINNED | reg request | unreg request |
+ PAGECACHE_PIN | reg request | - |
+ PAGECACHE_UNPIN | - | unreg request |
+
+
+*/
+
+void pagecache_unlock(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ LSN first_REDO_LSN_for_page,
+ LSN lsn, my_bool was_changed)
+{
+ PAGECACHE_BLOCK_LINK *block;
+ int page_st;
+ DBUG_ENTER("pagecache_unlock");
+ DBUG_PRINT("enter", ("fd: %u page: %lu %s %s",
+ (uint) file->file, (ulong) pageno,
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin]));
+ /* we do not allow any lock/pin increasing here */
+ DBUG_ASSERT(pin != PAGECACHE_PIN);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_READ);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_WRITE);
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ /*
+ The cache must be usable here: the caller still holds a lock on this
+ page, which is what we came to release.
+ */
+ DBUG_ASSERT(pagecache->can_be_used);
+
+ inc_counter_for_resize_op(pagecache);
+ /* See NOTE for pagecache_unlock about registering requests */
+ block= find_block(pagecache, file, pageno, 0, 0,
+ pin == PAGECACHE_PIN_LEFT_UNPINNED, &page_st);
+ PCBLOCK_INFO(block);
+ DBUG_ASSERT(block != 0 && page_st == PAGE_READ);
+ if (first_REDO_LSN_for_page)
+ {
+ /* rec_lsn may only be set while holding a write lock and a pin */
+ DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK);
+ DBUG_ASSERT(pin == PAGECACHE_UNPIN);
+ pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page);
+ }
+ if (lsn != LSN_IMPOSSIBLE)
+ check_and_set_lsn(pagecache, lsn, block);
+
+ /* if we lock for write we must link the block to changed blocks */
+ DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0 ||
+ (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+ lock == PAGECACHE_LOCK_WRITE_TO_READ ||
+ lock == PAGECACHE_LOCK_LEFT_WRITELOCKED));
+ /*
+ if was_changed then status should be PCBLOCK_DIRECT_W or marked
+ as dirty
+ */
+ DBUG_ASSERT(!was_changed || (block->status & PCBLOCK_DIRECT_W) ||
+ (block->status & PCBLOCK_CHANGED));
+ if ((block->status & PCBLOCK_DIRECT_W) &&
+ (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+ lock == PAGECACHE_LOCK_WRITE_TO_READ))
+ {
+ /* the write lock is dropped: move the block to the dirty list now */
+ if (!(block->status & PCBLOCK_CHANGED) && was_changed)
+ link_to_changed_list(pagecache, block);
+ block->status&= ~PCBLOCK_DIRECT_W;
+ DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx",
+ (ulong) block));
+ }
+
+ if (make_lock_and_pin(pagecache, block, lock, pin, FALSE))
+ {
+ DBUG_ASSERT(0); /* should not happen */
+ }
+
+ remove_reader(block);
+ /*
+ Link the block into the LRU chain if it's the last submitted request
+ for the block and block will not be pinned.
+ See NOTE for pagecache_unlock about registering requests.
+ */
+ if (pin != PAGECACHE_PIN_LEFT_PINNED)
+ unreg_request(pagecache, block, 1);
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Unpin page
+
+ SYNOPSIS
+ pagecache_unpin()
+ pagecache pointer to a page cache data structure
+ file handler for the file for the block of data to be read
+ pageno number of the block of data in the file
+ lsn if it is not LSN_IMPOSSIBLE (0) and it
+ is bigger than the LSN on the page it will be written
+ on the page
+*/
+
+void pagecache_unpin(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ LSN lsn)
+{
+ PAGECACHE_BLOCK_LINK *block;
+ int page_st;
+ DBUG_ENTER("pagecache_unpin");
+ DBUG_PRINT("enter", ("fd: %u page: %lu",
+ (uint) file->file, (ulong) pageno));
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ /*
+ The cache must be usable here: the caller still holds a pin on this
+ page, which is what we came to release.
+ */
+ DBUG_ASSERT(pagecache->can_be_used);
+
+ inc_counter_for_resize_op(pagecache);
+ /* See NOTE for pagecache_unlock about registering requests */
+ block= find_block(pagecache, file, pageno, 0, 0, 0, &page_st);
+ DBUG_ASSERT(block != 0);
+ DBUG_ASSERT(page_st == PAGE_READ);
+ /* we can't unpin such page without unlock */
+ DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0);
+
+ if (lsn != LSN_IMPOSSIBLE)
+ check_and_set_lsn(pagecache, lsn, block);
+
+ /*
+ we can just unpin only with keeping read lock because:
+ a) we can't pin without any lock
+ b) we can't unpin keeping write lock
+ */
+ if (make_lock_and_pin(pagecache, block,
+ PAGECACHE_LOCK_LEFT_READLOCKED,
+ PAGECACHE_UNPIN, FALSE))
+ DBUG_ASSERT(0); /* should not happen */
+
+ remove_reader(block);
+ /*
+ Link the block into the LRU chain if it's the last submitted request
+ for the block and block will not be pinned.
+ See NOTE for pagecache_unlock about registering requests
+ */
+ unreg_request(pagecache, block, 1);
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief Unlock/unpin page and put LSN stamp if needed
+ (uses direct block/page pointer)
+
+ @param pagecache pointer to a page cache data structure
+ @param link direct link to page (returned by read or write)
+ @param lock lock change
+ @param pin pin page
+ @param first_REDO_LSN_for_page do not set it if it is LSN_IMPOSSIBLE (0)
+ @param lsn if it is not LSN_IMPOSSIBLE and it is bigger than
+ the LSN on the page it will be written on the page
+ @param was_changed should be true if the page was write locked with
+ direct link giving and the page was changed
+ @param any allow unpinning block pinned by any thread; possible
+ only if not locked
+
+ @note 'any' is a hack so that _ma_bitmap_unpin_all() is allowed to unpin
+ non-locked bitmap pages pinned by other threads. Because it always uses
+ PAGECACHE_LOCK_LEFT_UNLOCKED and PAGECACHE_UNPIN
+ (see write_changed_bitmap()), the hack is limited to these conditions.
+*/
+
+void pagecache_unlock_by_link(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ LSN first_REDO_LSN_for_page,
+ LSN lsn, my_bool was_changed,
+ my_bool any)
+{
+ DBUG_ENTER("pagecache_unlock_by_link");
+ DBUG_PRINT("enter", ("block: 0x%lx fd: %u page: %lu changed: %d %s %s",
+ (ulong) block,
+ (uint) block->hash_link->file.file,
+ (ulong) block->hash_link->pageno, was_changed,
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin]));
+ /*
+ We do not allow any lock/pin increasing here and page can't be
+ unpinned because we use direct link.
+ */
+ DBUG_ASSERT(pin != PAGECACHE_PIN);
+ DBUG_ASSERT(pin != PAGECACHE_PIN_LEFT_UNPINNED);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_READ);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_WRITE);
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ if (pin == PAGECACHE_PIN_LEFT_UNPINNED &&
+ lock == PAGECACHE_LOCK_READ_UNLOCK)
+ {
+ /* Pure read-unlock of an unpinned page: nothing else to maintain */
+ if (make_lock_and_pin(pagecache, block, lock, pin, FALSE))
+ DBUG_ASSERT(0); /* should not happen */
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_VOID_RETURN;
+ }
+
+ /*
+ The cache must be usable here: the caller still holds a lock on this
+ page, which is what we came to release.
+ */
+ DBUG_ASSERT(pagecache->can_be_used);
+
+ inc_counter_for_resize_op(pagecache);
+ if (was_changed)
+ {
+ if (first_REDO_LSN_for_page != LSN_IMPOSSIBLE)
+ {
+ /*
+ LOCK_READ_UNLOCK is ok here as the page may have first locked
+ with WRITE lock that was temporarily converted to READ lock before
+ it's unpinned
+ */
+ DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+ lock == PAGECACHE_LOCK_READ_UNLOCK);
+ DBUG_ASSERT(pin == PAGECACHE_UNPIN);
+ pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page);
+ }
+ if (lsn != LSN_IMPOSSIBLE)
+ check_and_set_lsn(pagecache, lsn, block);
+ /*
+ Reset error flag. Mark also that page is active; This may not have
+ been the case if there was an error reading the page
+ */
+ block->status= (block->status & ~PCBLOCK_ERROR) | PCBLOCK_READ;
+ }
+
+ /* if we lock for write we must link the block to changed blocks */
+ DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0 ||
+ (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+ lock == PAGECACHE_LOCK_WRITE_TO_READ ||
+ lock == PAGECACHE_LOCK_LEFT_WRITELOCKED));
+ /*
+ If was_changed then status should be PCBLOCK_DIRECT_W or marked
+ as dirty
+ */
+ DBUG_ASSERT(!was_changed || (block->status & PCBLOCK_DIRECT_W) ||
+ (block->status & PCBLOCK_CHANGED));
+ if ((block->status & PCBLOCK_DIRECT_W) &&
+ (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+ lock == PAGECACHE_LOCK_WRITE_TO_READ))
+ {
+ /* the write lock is dropped: move the block to the dirty list now */
+ if (!(block->status & PCBLOCK_CHANGED) && was_changed)
+ link_to_changed_list(pagecache, block);
+ block->status&= ~PCBLOCK_DIRECT_W;
+ DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx",
+ (ulong) block));
+ }
+
+ if (make_lock_and_pin(pagecache, block, lock, pin, any))
+ DBUG_ASSERT(0); /* should not happen */
+
+ /*
+ Link the block into the LRU chain if it's the last submitted request
+ for the block and block will not be pinned.
+ See NOTE for pagecache_unlock about registering requests.
+ */
+ if (pin != PAGECACHE_PIN_LEFT_PINNED)
+ unreg_request(pagecache, block, 1);
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Unpin page
+ (uses direct block/page pointer)
+
+ SYNOPSIS
+ pagecache_unpin_by_link()
+ pagecache pointer to a page cache data structure
+ link direct link to page (returned by read or write)
+ lsn if it is not LSN_IMPOSSIBLE (0) and it
+ is bigger than the LSN on the page it will be written
+ on the page
+*/
+
+void pagecache_unpin_by_link(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ LSN lsn)
+{
+ DBUG_ENTER("pagecache_unpin_by_link");
+ DBUG_PRINT("enter", ("block: 0x%lx fd: %u page: %lu",
+ (ulong) block,
+ (uint) block->hash_link->file.file,
+ (ulong) block->hash_link->pageno));
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ /*
+ The cache must be usable here: the caller still holds a pin on this
+ page, which is what we came to release.
+ */
+ DBUG_ASSERT(pagecache->can_be_used);
+ /* we can't unpin such page without unlock */
+ DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0);
+
+ inc_counter_for_resize_op(pagecache);
+
+ if (lsn != LSN_IMPOSSIBLE)
+ check_and_set_lsn(pagecache, lsn, block);
+
+ /*
+ We can just unpin only with keeping read lock because:
+ a) we can't pin without any lock
+ b) we can't unpin keeping write lock
+ */
+ if (make_lock_and_pin(pagecache, block,
+ PAGECACHE_LOCK_LEFT_READLOCKED,
+ PAGECACHE_UNPIN, FALSE))
+ DBUG_ASSERT(0); /* should not happen */
+
+ /*
+ Link the block into the LRU chain if it's the last submitted request
+ for the block and block will not be pinned.
+ See NOTE for pagecache_unlock about registering requests.
+ */
+ unreg_request(pagecache, block, 1);
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ DBUG_VOID_RETURN;
+}
+
+/* description of how to change lock before and after read/write */
+struct rw_lock_change
+{
+ my_bool need_lock_change; /* need changing of lock at the end */
+ enum pagecache_page_lock new_lock; /* lock to take at the beginning */
+ enum pagecache_page_lock unlock_lock; /* lock to leave at the end */
+};
+
+/* description of how to change pin before and after read/write */
+struct rw_pin_change
+{
+ enum pagecache_page_pin new_pin; /* pin status at the beginning */
+ enum pagecache_page_pin unlock_pin; /* pin status at the end */
+};
+
+/**
+ Depending on the lock which the user wants in pagecache_read(), we
+ need to acquire a first type of lock at start of pagecache_read(), and
+ downgrade it to a second type of lock at end. For example, if user
+ asked for no lock (PAGECACHE_LOCK_LEFT_UNLOCKED) this translates into
+ taking first a read lock PAGECACHE_LOCK_READ (to rightfully block on
+ existing write locks) then read then unlock the lock i.e. change lock
+ to PAGECACHE_LOCK_READ_UNLOCK (the "1" below tells that a change is
+ needed).
+*/
+
+/* Indexed by enum pagecache_page_lock (the lock mode the caller asked for) */
+static struct rw_lock_change lock_to_read[8]=
+{
+ { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/
+ 1,
+ PAGECACHE_LOCK_READ, PAGECACHE_LOCK_READ_UNLOCK
+ },
+ { /*PAGECACHE_LOCK_LEFT_READLOCKED*/
+ 0,
+ PAGECACHE_LOCK_LEFT_READLOCKED, PAGECACHE_LOCK_LEFT_READLOCKED
+ },
+ { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/
+ 0,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_LEFT_WRITELOCKED
+ },
+ { /*PAGECACHE_LOCK_READ*/
+ 1,
+ PAGECACHE_LOCK_READ, PAGECACHE_LOCK_LEFT_READLOCKED
+ },
+ { /*PAGECACHE_LOCK_WRITE*/
+ 1,
+ PAGECACHE_LOCK_WRITE, PAGECACHE_LOCK_LEFT_WRITELOCKED
+ },
+ { /*PAGECACHE_LOCK_READ_UNLOCK*/
+ 1,
+ PAGECACHE_LOCK_LEFT_READLOCKED, PAGECACHE_LOCK_READ_UNLOCK
+ },
+ { /*PAGECACHE_LOCK_WRITE_UNLOCK*/
+ 1,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_WRITE_UNLOCK
+ },
+ { /*PAGECACHE_LOCK_WRITE_TO_READ*/
+ 1,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_WRITE_TO_READ
+ }
+};
+
+/**
+ Two sets of pin modes (each as for the locks above but for pinning). The
+ difference between the sets is whether we are going to provide the caller
+ with a reference on the block or not
+*/
+
+/*
+ Indexed by [caller wants a direct block reference, i.e. buff == 0]
+ [requested enum pagecache_page_lock]
+*/
+static struct rw_pin_change lock_to_pin[2][8]=
+{
+ {
+ { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_PIN_LEFT_UNPINNED
+ },
+ { /*PAGECACHE_LOCK_LEFT_READLOCKED*/
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ },
+ { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_PIN_LEFT_PINNED
+ },
+ { /*PAGECACHE_LOCK_READ*/
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_PIN_LEFT_UNPINNED
+ },
+ { /*PAGECACHE_LOCK_WRITE*/
+ PAGECACHE_PIN,
+ PAGECACHE_PIN_LEFT_PINNED
+ },
+ { /*PAGECACHE_LOCK_READ_UNLOCK*/
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_PIN_LEFT_UNPINNED
+ },
+ { /*PAGECACHE_LOCK_WRITE_UNLOCK*/
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_UNPIN
+ },
+ { /*PAGECACHE_LOCK_WRITE_TO_READ*/
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_UNPIN
+ }
+ },
+ {
+ { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_PIN_LEFT_UNPINNED
+ },
+ { /*PAGECACHE_LOCK_LEFT_READLOCKED*/
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ },
+ { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_PIN_LEFT_PINNED
+ },
+ { /*PAGECACHE_LOCK_READ*/
+ PAGECACHE_PIN,
+ PAGECACHE_PIN_LEFT_PINNED
+ },
+ { /*PAGECACHE_LOCK_WRITE*/
+ PAGECACHE_PIN,
+ PAGECACHE_PIN_LEFT_PINNED
+ },
+ { /*PAGECACHE_LOCK_READ_UNLOCK*/
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_PIN_LEFT_UNPINNED
+ },
+ { /*PAGECACHE_LOCK_WRITE_UNLOCK*/
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_UNPIN
+ },
+ { /*PAGECACHE_LOCK_WRITE_TO_READ*/
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_PIN_LEFT_PINNED,
+ }
+ }
+};
+
+
+/*
+ @brief Read a block of data from a cached file into a buffer;
+
+ @param pagecache pointer to a page cache data structure
+ @param file handler for the file for the block of data to be read
+ @param pageno number of the block of data in the file
+ @param level determines the weight of the data
+ @param buff buffer to where the data must be placed
+ @param type type of the page
+ @param lock lock change
+ @param link link to the page if we pin it
+
+ @return address from where the data is placed if successful, 0 - otherwise.
+
+ @note Pin will be chosen according to lock parameter (see lock_to_pin)
+
+ @note 'buff', if not NULL, must be long-aligned.
+
+ @note If buff==0 then we provide reference on the page so should keep the
+ page pinned.
+*/
+
+uchar *pagecache_read(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint level,
+ uchar *buff,
+ enum pagecache_page_type type,
+ enum pagecache_page_lock lock,
+ PAGECACHE_BLOCK_LINK **page_link)
+{
+ my_bool error= 0;
+ enum pagecache_page_pin
+ new_pin= lock_to_pin[buff==0][lock].new_pin,
+ unlock_pin= lock_to_pin[buff==0][lock].unlock_pin;
+ PAGECACHE_BLOCK_LINK *fake_link;
+ my_bool reg_request;
+#ifndef DBUG_OFF
+ char llbuf[22];
+ DBUG_ENTER("pagecache_read");
+ DBUG_PRINT("enter", ("fd: %u page: %s buffer: 0x%lx level: %u "
+ "t:%s (%d)%s->%s %s->%s",
+ (uint) file->file, ullstr(pageno, llbuf),
+ (ulong) buff, level,
+ page_cache_page_type_str[type],
+ lock_to_read[lock].need_lock_change,
+ page_cache_page_lock_str[lock_to_read[lock].new_lock],
+ page_cache_page_lock_str[lock_to_read[lock].unlock_lock],
+ page_cache_page_pin_str[new_pin],
+ page_cache_page_pin_str[unlock_pin]));
+ DBUG_ASSERT(buff != 0 || (buff == 0 && (unlock_pin == PAGECACHE_PIN ||
+ unlock_pin == PAGECACHE_PIN_LEFT_PINNED)));
+ DBUG_ASSERT(pageno < ((ULL(1)) << 40));
+#endif
+
+ if (!page_link)
+ page_link= &fake_link;
+ *page_link= 0; /* Catch errors */
+
+restart:
+
+ if (pagecache->can_be_used)
+ {
+ /* Key cache is used */
+ PAGECACHE_BLOCK_LINK *block;
+ uint status;
+ int page_st;
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ if (!pagecache->can_be_used)
+ {
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ goto no_key_cache;
+ }
+
+ inc_counter_for_resize_op(pagecache);
+ pagecache->global_cache_r_requests++;
+ /* See NOTE for pagecache_unlock about registering requests. */
+ reg_request= ((new_pin == PAGECACHE_PIN_LEFT_UNPINNED) ||
+ (new_pin == PAGECACHE_PIN));
+ block= find_block(pagecache, file, pageno, level,
+ lock == PAGECACHE_LOCK_WRITE,
+ reg_request, &page_st);
+ DBUG_PRINT("info", ("Block type: %s current type %s",
+ page_cache_page_type_str[block->type],
+ page_cache_page_type_str[type]));
+ if (((block->status & PCBLOCK_ERROR) == 0) && (page_st != PAGE_READ))
+ {
+ /* The requested page is to be read into the block buffer */
+ read_block(pagecache, block,
+ (my_bool)(page_st == PAGE_TO_BE_READ));
+ DBUG_PRINT("info", ("read is done"));
+ }
+ /*
+ Assert after block is read. Imagine two concurrent SELECTs on same
+ table (thread1 and 2), which want to pagecache_read() the same
+ pageno/fileno. Thread1 calls find_block(), decides to evict a dirty
+ page from LRU; while it's writing this dirty page to disk, it is
+ pre-empted and thread2 runs its find_block(), gets the block (in
+ PAGE_TO_BE_READ state). This block is still containing the in-eviction
+ dirty page so has an its type, which cannot be tested.
+ So thread2 has to wait for read_block() to finish (when it wakes up in
+ read_block(), it's woken up by read_block() of thread1, which implies
+ that block's type was set to EMPTY by thread1 as part of find_block()).
+ */
+ DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE ||
+ block->type == type ||
+ type == PAGECACHE_LSN_PAGE ||
+ type == PAGECACHE_READ_UNKNOWN_PAGE ||
+ block->type == PAGECACHE_READ_UNKNOWN_PAGE);
+ if (type != PAGECACHE_READ_UNKNOWN_PAGE ||
+ block->type == PAGECACHE_EMPTY_PAGE)
+ block->type= type;
+
+ if (make_lock_and_pin(pagecache, block, lock_to_read[lock].new_lock,
+ new_pin, FALSE))
+ {
+ /*
+ We failed to write lock the block, cache is unlocked,
+ we will try to get the block again.
+ */
+ if (reg_request)
+ unreg_request(pagecache, block, 1);
+ /*
+ Balance the inc_counter_for_resize_op() done above; otherwise
+ each retry would leave the resize counter incremented forever.
+ */
+ dec_counter_for_resize_op(pagecache);
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_PRINT("info", ("restarting..."));
+ goto restart;
+ }
+
+ status= block->status;
+ if (!buff)
+ {
+ buff= block->buffer;
+ /* possibly we will write here (resolved on unlock) */
+ if ((lock == PAGECACHE_LOCK_WRITE ||
+ lock == PAGECACHE_LOCK_LEFT_WRITELOCKED) &&
+ !(block->status & PCBLOCK_CHANGED))
+ {
+ block->status|= PCBLOCK_DIRECT_W;
+ DBUG_PRINT("info", ("Set PCBLOCK_DIRECT_W for block: 0x%lx",
+ (ulong) block));
+ }
+ }
+ else
+ {
+ if (!(status & PCBLOCK_ERROR))
+ {
+#if !defined(SERIALIZED_READ_FROM_CACHE)
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+#endif
+
+ DBUG_ASSERT((pagecache->block_size & 511) == 0);
+ /* Copy data from the cache buffer */
+ bmove512(buff, block->buffer, pagecache->block_size);
+
+#if !defined(SERIALIZED_READ_FROM_CACHE)
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+#endif
+ }
+ else
+ my_errno= block->error;
+ }
+
+ remove_reader(block);
+ if (lock_to_read[lock].need_lock_change)
+ {
+ if (make_lock_and_pin(pagecache, block,
+ lock_to_read[lock].unlock_lock,
+ unlock_pin, FALSE))
+ DBUG_ASSERT(0);
+ }
+ /*
+ Link the block into the LRU chain if it's the last submitted request
+ for the block and block will not be pinned.
+ See NOTE for pagecache_unlock about registering requests.
+ */
+ if (unlock_pin == PAGECACHE_PIN_LEFT_UNPINNED ||
+ unlock_pin == PAGECACHE_UNPIN)
+ unreg_request(pagecache, block, 1);
+ else
+ *page_link= block;
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ if (status & PCBLOCK_ERROR)
+ {
+ DBUG_ASSERT(my_errno != 0);
+ DBUG_PRINT("error", ("Got error %d when doing page read", my_errno));
+ DBUG_RETURN((uchar *) 0);
+ }
+
+ DBUG_RETURN(buff);
+ }
+
+no_key_cache: /* Key cache is not used */
+
+ /* We can't use mutex here as the key cache may not be initialized */
+ pagecache->global_cache_r_requests++;
+ pagecache->global_cache_read++;
+ if (pagecache_fread(pagecache, file, buff, pageno,
+ pagecache->readwrite_flags))
+ error= 1;
+ DBUG_RETURN(error ? (uchar*) 0 : buff);
+}
+
+
+/*
+ @brief Delete page from the buffer (common part for link and file/page)
+
+ @param pagecache pointer to a page cache data structure
+ @param block direct link to page (returned by read or write)
+ @param page_link hash link of the block
+ @param flush flush page if it is dirty
+
+ @retval 0 deleted or was not present at all
+ @retval 1 error
+
+*/
+
+static my_bool pagecache_delete_internal(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ PAGECACHE_HASH_LINK *page_link,
+ my_bool flush)
+{
+ my_bool error= 0;
+ if (block->status & PCBLOCK_CHANGED)
+ {
+ if (flush)
+ {
+ /* The block contains a dirty page - push it out of the cache */
+
+ KEYCACHE_DBUG_PRINT("find_block", ("block is dirty"));
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ /*
+ The call is thread safe because only the current
+ thread might change the block->hash_link value
+ */
+ DBUG_ASSERT(block->pins == 1);
+ error= pagecache_fwrite(pagecache,
+ &block->hash_link->file,
+ block->buffer,
+ block->hash_link->pageno,
+ block->type,
+ pagecache->readwrite_flags);
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ pagecache->global_cache_write++;
+
+ if (error)
+ {
+ /* Flush failed: keep the block, record the error and give up */
+ block->status|= PCBLOCK_ERROR;
+ block->error= (int16) my_errno;
+ my_debug_put_break_here();
+ goto err;
+ }
+ }
+ pagecache->blocks_changed--;
+ pagecache->global_blocks_changed--;
+ /*
+ free_block() will change the status and rec_lsn of the block so no
+ need to change them here.
+ */
+ }
+ /* Cache is locked, so we can release the page before freeing it */
+ if (make_lock_and_pin(pagecache, block,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN, FALSE))
+ DBUG_ASSERT(0);
+ DBUG_ASSERT(block->hash_link->requests > 0);
+ page_link->requests--;
+ /* See NOTE for pagecache_unlock about registering requests. */
+ free_block(pagecache, block);
+
+err:
+ /* Balances the inc_counter_for_resize_op() done by every caller */
+ dec_counter_for_resize_op(pagecache);
+ return error;
+}
+
+
+/*
+ @brief Delete page from the buffer by link
+
+ @param pagecache pointer to a page cache data structure
+ @param link direct link to page (returned by read or write)
+ @param lock lock change
+ @param flush flush page if it is dirty
+
+ @retval 0 deleted or was not present at all
+ @retval 1 error
+
+ @note lock can be only PAGECACHE_LOCK_LEFT_WRITELOCKED (page was
+ write locked before) or PAGECACHE_LOCK_WRITE (delete will write
+ lock page before delete)
+*/
+
+my_bool pagecache_delete_by_link(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ enum pagecache_page_lock lock,
+ my_bool flush)
+{
+ my_bool error= 0;
+ enum pagecache_page_pin pin= PAGECACHE_PIN_LEFT_PINNED;
+ DBUG_ENTER("pagecache_delete_by_link");
+ DBUG_PRINT("enter", ("fd: %d block 0x%lx %s %s",
+ block->hash_link->file.file,
+ (ulong) block,
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin]));
+ DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE ||
+ lock == PAGECACHE_LOCK_LEFT_WRITELOCKED);
+ DBUG_ASSERT(block->pins != 0); /* should be pinned */
+
+ if (pagecache->can_be_used)
+ {
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ if (!pagecache->can_be_used)
+ goto end;
+
+ /*
+ pagecache_delete_internal() unconditionally calls
+ dec_counter_for_resize_op(), so the counter must be incremented
+ here or it would underflow.
+ */
+ inc_counter_for_resize_op(pagecache);
+
+ /*
+ This block should be pinned (i.e. has not zero request counter) =>
+ Such block can't be chosen for eviction.
+ */
+ DBUG_ASSERT((block->status &
+ (PCBLOCK_IN_SWITCH | PCBLOCK_REASSIGNED)) == 0);
+ /*
+ make_lock_and_pin() can't fail here, because we are keeping pin on the
+ block and it can't be evicted (which is cause of lock fail and retry)
+ */
+ if (make_lock_and_pin(pagecache, block, lock, pin, FALSE))
+ DBUG_ASSERT(0);
+
+ /*
+ get_present_hash_link() side effect emulation before call
+ pagecache_delete_internal()
+ */
+ block->hash_link->requests++;
+
+ error= pagecache_delete_internal(pagecache, block, block->hash_link,
+ flush);
+end:
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ }
+
+ DBUG_RETURN(error);
+}
+
+
+/**
+ @brief Returns "hits" for promotion
+
+ @return "hits" for promotion
+*/
+
+uint pagecache_pagelevel(PAGECACHE_BLOCK_LINK *block)
+{
+ /* Report the block's accumulated promotion "hits" */
+ uint hits= block->hits_left;
+ return hits;
+}
+
+/*
+ @brief Adds "hits" to the page
+
+ @param link direct link to page (returned by read or write)
+ @param level number of "hits" which we add to the page
+*/
+
+void pagecache_add_level_by_link(PAGECACHE_BLOCK_LINK *block,
+ uint level)
+{
+ DBUG_ASSERT(block->pins != 0); /* the caller must hold a pin */
+ /*
+ Statistics-only bookkeeping: a race with other threads bumping the
+ same counter is harmless, so the pagecache mutex is not taken.
+ */
+ block->hits_left= block->hits_left + level;
+}
+
+/*
+ @brief Delete page from the buffer
+
+ @param pagecache pointer to a page cache data structure
+ @param file handler for the file for the block of data to be read
+ @param pageno number of the block of data in the file
+ @param lock lock change
+ @param flush flush page if it is dirty
+
+ @retval 0 deleted or was not present at all
+ @retval 1 error
+
+ @note lock can be only PAGECACHE_LOCK_LEFT_WRITELOCKED (page was
+ write locked before) or PAGECACHE_LOCK_WRITE (delete will write
+ lock page before delete)
+*/
+/* Indexed by enum pagecache_page_lock; pin mode used by pagecache_delete() */
+static enum pagecache_page_pin lock_to_pin_one_phase[8]=
+{
+ PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_UNLOCKED*/,
+ PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_READLOCKED*/,
+ PAGECACHE_PIN_LEFT_PINNED /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/,
+ PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ*/,
+ PAGECACHE_PIN /*PAGECACHE_LOCK_WRITE*/,
+ PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ_UNLOCK*/,
+ PAGECACHE_UNPIN /*PAGECACHE_LOCK_WRITE_UNLOCK*/,
+ PAGECACHE_UNPIN /*PAGECACHE_LOCK_WRITE_TO_READ*/
+};
+
+my_bool pagecache_delete(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ enum pagecache_page_lock lock,
+ my_bool flush)
+{
+ my_bool error= 0;
+ enum pagecache_page_pin pin= lock_to_pin_one_phase[lock];
+ DBUG_ENTER("pagecache_delete");
+ DBUG_PRINT("enter", ("fd: %u page: %lu %s %s",
+ (uint) file->file, (ulong) pageno,
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin]));
+ DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE ||
+ lock == PAGECACHE_LOCK_LEFT_WRITELOCKED);
+ DBUG_ASSERT(pin == PAGECACHE_PIN ||
+ pin == PAGECACHE_PIN_LEFT_PINNED);
+restart:
+
+ DBUG_ASSERT(pageno < ((ULL(1)) << 40));
+ if (pagecache->can_be_used)
+ {
+ /* Key cache is used */
+ reg1 PAGECACHE_BLOCK_LINK *block;
+ PAGECACHE_HASH_LINK **unused_start, *page_link;
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ if (!pagecache->can_be_used)
+ goto end;
+
+ inc_counter_for_resize_op(pagecache);
+ page_link= get_present_hash_link(pagecache, file, pageno, &unused_start);
+ if (!page_link)
+ {
+ DBUG_PRINT("info", ("There is no such page in the cache"));
+ /* balance the inc_counter_for_resize_op() above */
+ dec_counter_for_resize_op(pagecache);
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_RETURN(0);
+ }
+ block= page_link->block;
+ if (block->status & (PCBLOCK_REASSIGNED | PCBLOCK_IN_SWITCH))
+ {
+ DBUG_PRINT("info", ("Block 0x%0lx already is %s",
+ (ulong) block,
+ ((block->status & PCBLOCK_REASSIGNED) ?
+ "reassigned" : "in switch")));
+ PCBLOCK_INFO(block);
+ page_link->requests--;
+ /* balance the inc_counter_for_resize_op() above */
+ dec_counter_for_resize_op(pagecache);
+ goto end;
+ }
+ /* See NOTE for pagecache_unlock about registering requests. */
+ if (pin == PAGECACHE_PIN)
+ reg_requests(pagecache, block, 1);
+ DBUG_ASSERT(block != 0);
+ if (make_lock_and_pin(pagecache, block, lock, pin, FALSE))
+ {
+ /*
+ We failed to writelock the block, cache is unlocked, and last write
+ lock is released, we will try to get the block again.
+ */
+ if (pin == PAGECACHE_PIN)
+ unreg_request(pagecache, block, 1);
+ /*
+ Balance the inc_counter_for_resize_op() above; otherwise each
+ retry would leave the resize counter incremented forever.
+ */
+ dec_counter_for_resize_op(pagecache);
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_PRINT("info", ("restarting..."));
+ goto restart;
+ }
+
+ /* we can't delete with opened direct link for write */
+ DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0);
+
+ /* pagecache_delete_internal() does the matching dec_counter */
+ error= pagecache_delete_internal(pagecache, block, page_link, flush);
+end:
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ }
+
+ DBUG_RETURN(error);
+}
+
+
+my_bool pagecache_delete_pages(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint page_count,
+ enum pagecache_page_lock lock,
+ my_bool flush)
+{
+ pgcache_page_no_t end;
+ DBUG_ENTER("pagecache_delete_pages");
+ DBUG_ASSERT(page_count > 0);
+
+ /* Delete the pages one by one; stop at the first failure. */
+ for (end= pageno + page_count; pageno != end; pageno++)
+ {
+ if (pagecache_delete(pagecache, file, pageno, lock, flush))
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Writes a buffer into a cached file.
+
+ @param pagecache pointer to a page cache data structure
+ @param file handler for the file to write data to
+ @param pageno number of the block of data in the file
+ @param level determines the weight of the data
+ @param buff buffer with the data
+ @param type type of the page
+ @param lock lock change
+ @param pin pin page
+ @param write_mode how to write page
+ @param link link to the page if we pin it
+ @param first_REDO_LSN_for_page the lsn to set rec_lsn
+ @param offset offset in the page
+ @param size size of data
+ @param validator read page validator
+ @param validator_data the validator data
+
+ @retval 0 Success.
+ @retval 1 Error.
+*/
+
+/*
+  Lock transitions used by pagecache_write_part(), indexed by the caller's
+  'lock' argument (enum pagecache_page_lock; the index value is named in
+  the trailing comment of each entry):
+    need_lock_change - 1 if a second make_lock_and_pin() call is needed
+                       after the write (i.e. new_lock differs from the
+                       final state the caller asked for);
+    new_lock         - lock to take before writing the page;
+    unlock_lock      - lock change to apply once the write is done.
+  Entries marked "unsupported" are combinations pagecache_write_part()
+  rejects with DBUG_ASSERTs on entry.
+*/
+static struct rw_lock_change write_lock_change_table[]=
+{
+ {1,
+ PAGECACHE_LOCK_WRITE,
+ PAGECACHE_LOCK_WRITE_UNLOCK} /*PAGECACHE_LOCK_LEFT_UNLOCKED*/,
+ {0, /*unsupported (we can't write having the block read locked) */
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_LEFT_READLOCKED*/,
+ {0, PAGECACHE_LOCK_LEFT_WRITELOCKED, 0} /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/,
+ {1,
+ PAGECACHE_LOCK_WRITE,
+ PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_READ*/,
+ {0, PAGECACHE_LOCK_WRITE, 0} /*PAGECACHE_LOCK_WRITE*/,
+ {0, /*unsupported (we can't write having the block read locked) */
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_READ_UNLOCK*/,
+ {1,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ PAGECACHE_LOCK_WRITE_UNLOCK } /*PAGECACHE_LOCK_WRITE_UNLOCK*/,
+ {1,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_WRITE_TO_READ*/
+};
+
+
+/*
+  Pin transitions used by pagecache_write_part(), indexed by the caller's
+  'pin' argument (enum pagecache_page_pin; the index value is named in the
+  trailing comment of each entry):
+    new_pin    - pin operation to use while the page is being written;
+    unlock_pin - pin operation to apply once the write is done.
+*/
+static struct rw_pin_change write_pin_change_table[]=
+{
+ {PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN_LEFT_PINNED*/,
+ {PAGECACHE_PIN,
+ PAGECACHE_UNPIN} /*PAGECACHE_PIN_LEFT_UNPINNED*/,
+ {PAGECACHE_PIN,
+ PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN*/,
+ {PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_UNPIN} /*PAGECACHE_UNPIN*/
+};
+
+
+/**
+  @note 'buff', if not NULL, must be long-aligned.
+
+  Writes 'size' bytes at 'offset' into the cached copy of page 'pageno'.
+  See the table-driven lock/pin transitions in write_lock_change_table and
+  write_pin_change_table above; on lock conflict the whole operation is
+  retried from 'restart'.
+*/
+
+my_bool pagecache_write_part(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint level,
+ uchar *buff,
+ enum pagecache_page_type type,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ enum pagecache_write_mode write_mode,
+ PAGECACHE_BLOCK_LINK **page_link,
+ LSN first_REDO_LSN_for_page,
+ uint offset, uint size)
+{
+ PAGECACHE_BLOCK_LINK *block= NULL;
+ PAGECACHE_BLOCK_LINK *fake_link;
+ my_bool error= 0;
+ int need_lock_change= write_lock_change_table[lock].need_lock_change;
+ my_bool reg_request;
+#ifndef DBUG_OFF
+ char llbuf[22];
+ DBUG_ENTER("pagecache_write_part");
+ DBUG_PRINT("enter", ("fd: %u page: %s level: %u type: %s lock: %s "
+ "pin: %s mode: %s offset: %u size %u",
+ (uint) file->file, ullstr(pageno, llbuf), level,
+ page_cache_page_type_str[type],
+ page_cache_page_lock_str[lock],
+ page_cache_page_pin_str[pin],
+ page_cache_page_write_mode_str[write_mode],
+ offset, size));
+ DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_LEFT_READLOCKED);
+ DBUG_ASSERT(lock != PAGECACHE_LOCK_READ_UNLOCK);
+ DBUG_ASSERT(offset + size <= pagecache->block_size);
+ DBUG_ASSERT(pageno < ((ULL(1)) << 40));
+#endif
+
+ /* Callers not interested in the pinned block get a local dummy slot. */
+ if (!page_link)
+ page_link= &fake_link;
+ *page_link= 0;
+
+restart:
+
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+ DBUG_EXECUTE("check_pagecache",
+ test_key_cache(pagecache, "start of key_cache_write", 1););
+#endif
+
+ if (pagecache->can_be_used)
+ {
+ /* Key cache is used */
+ int page_st;
+ my_bool need_page_ready_signal= FALSE;
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ /* Re-check under the mutex: the cache may have been disabled meanwhile. */
+ if (!pagecache->can_be_used)
+ {
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ goto no_key_cache;
+ }
+
+ inc_counter_for_resize_op(pagecache);
+ pagecache->global_cache_w_requests++;
+ /* See NOTE for pagecache_unlock about registering requests. */
+ reg_request= ((pin == PAGECACHE_PIN_LEFT_UNPINNED) ||
+ (pin == PAGECACHE_PIN));
+ block= find_block(pagecache, file, pageno, level,
+ TRUE,
+ reg_request, &page_st);
+ if (!block)
+ {
+ DBUG_ASSERT(write_mode != PAGECACHE_WRITE_DONE);
+ /* It happens only for requests submitted during resize operation */
+ dec_counter_for_resize_op(pagecache);
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ /* Write to the disk key cache is in resize at the moment*/
+ goto no_key_cache;
+ }
+ DBUG_PRINT("info", ("page status: %d", page_st));
+ /*
+ A partial write (offset != 0 or size < block_size) into a block whose
+ content is not yet in memory requires reading the page first.
+ */
+ if (!(block->status & PCBLOCK_ERROR) &&
+ ((page_st == PAGE_TO_BE_READ &&
+ (offset || size < pagecache->block_size)) ||
+ (page_st == PAGE_WAIT_TO_BE_READ)))
+ {
+ /* The requested page is to be read into the block buffer */
+ read_block(pagecache, block,
+ (my_bool)(page_st == PAGE_TO_BE_READ));
+ DBUG_PRINT("info", ("read is done"));
+ }
+ else if (page_st == PAGE_TO_BE_READ)
+ {
+ /* Full-page write: no read needed, but waiters must be signalled. */
+ need_page_ready_signal= TRUE;
+ }
+
+ DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE ||
+ block->type == PAGECACHE_READ_UNKNOWN_PAGE ||
+ block->type == type ||
+ /* this is for when going to non-trans to trans */
+ (block->type == PAGECACHE_PLAIN_PAGE &&
+ type == PAGECACHE_LSN_PAGE));
+ block->type= type;
+ /* We write to the page, so there is no sense in keeping this flag */
+ block->status&= ~PCBLOCK_DIRECT_W;
+ DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx",
+ (ulong) block));
+
+ if (make_lock_and_pin(pagecache, block,
+ write_lock_change_table[lock].new_lock,
+ (need_lock_change ?
+ write_pin_change_table[pin].new_pin :
+ pin), FALSE))
+ {
+ /*
+ We failed to writelock the block, cache is unlocked, and last write
+ lock is released, we will try to get the block again.
+ */
+ if (reg_request)
+ unreg_request(pagecache, block, 1);
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_PRINT("info", ("restarting..."));
+ goto restart;
+ }
+
+ if (write_mode == PAGECACHE_WRITE_DONE)
+ {
+ /*
+ PAGECACHE_WRITE_DONE: install the data in the cache without linking
+ the block into the changed (dirty) list.
+ */
+ if (block->status & PCBLOCK_ERROR)
+ {
+ my_debug_put_break_here();
+ DBUG_PRINT("warning", ("Writing on page with error"));
+ }
+ else
+ {
+ /* Copy data from buff */
+ if (!(size & 511))
+ bmove512(block->buffer + offset, buff, size);
+ else
+ memcpy(block->buffer + offset, buff, size);
+ block->status= PCBLOCK_READ;
+ /*
+ The read_callback can change the page content (removing page
+ protection) so it has to be called
+ */
+ DBUG_PRINT("info", ("read_callback: 0x%lx data: 0x%lx",
+ (ulong) block->hash_link->file.read_callback,
+ (ulong) block->hash_link->file.callback_data));
+ if ((*block->hash_link->file.read_callback)(block->buffer,
+ block->hash_link->pageno,
+ block->hash_link->
+ file.callback_data))
+ {
+ DBUG_PRINT("error", ("read callback problem"));
+ block->status|= PCBLOCK_ERROR;
+ block->error= (int16) my_errno;
+ my_debug_put_break_here();
+ }
+ KEYCACHE_DBUG_PRINT("key_cache_insert",
+ ("Page injection"));
+#ifdef THREAD
+ /* Signal that all pending requests for this now can be processed. */
+ if (block->wqueue[COND_FOR_REQUESTED].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]);
+#endif
+ }
+ }
+ else
+ {
+ /* Delayed write: copy the data in and mark the block dirty. */
+ if (! (block->status & PCBLOCK_CHANGED))
+ link_to_changed_list(pagecache, block);
+
+ if (!(size & 511))
+ bmove512(block->buffer + offset, buff, size);
+ else
+ memcpy(block->buffer + offset, buff, size);
+ block->status|= PCBLOCK_READ;
+ /* Page is correct again if we made a full write in it */
+ if (size == pagecache->block_size)
+ block->status&= ~PCBLOCK_ERROR;
+ }
+
+#ifdef THREAD
+ if (need_page_ready_signal &&
+ block->wqueue[COND_FOR_REQUESTED].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]);
+#endif
+
+ if (first_REDO_LSN_for_page)
+ {
+ /*
+ Record the LSN of the page's first REDO as its rec_lsn.  Only
+ allowed when this call also unlocks/unpins the page (asserted).
+ */
+ DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+ lock == PAGECACHE_LOCK_LEFT_UNLOCKED);
+ DBUG_ASSERT(pin == PAGECACHE_UNPIN ||
+ pin == PAGECACHE_PIN_LEFT_UNPINNED);
+ pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page);
+ }
+
+ if (need_lock_change)
+ {
+ /*
+ We don't set rec_lsn of the block; this is ok as for the
+ Maria-block-record's pages, we always keep pages pinned here.
+ */
+ if (make_lock_and_pin(pagecache, block,
+ write_lock_change_table[lock].unlock_lock,
+ write_pin_change_table[pin].unlock_pin, FALSE))
+ DBUG_ASSERT(0);
+ }
+
+ /* Unregister the request */
+ DBUG_ASSERT(block->hash_link->requests > 0);
+ block->hash_link->requests--;
+ /* See NOTE for pagecache_unlock about registering requests. */
+ if (pin == PAGECACHE_PIN_LEFT_UNPINNED || pin == PAGECACHE_UNPIN)
+ unreg_request(pagecache, block, 1);
+ else
+ *page_link= block;
+
+ if (block->status & PCBLOCK_ERROR)
+ {
+ error= 1;
+ my_debug_put_break_here();
+ }
+
+ dec_counter_for_resize_op(pagecache);
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+ goto end;
+ }
+
+no_key_cache:
+ /*
+ We cannot bypass the normal page cache operations here because the
+ read/flush callbacks need the whole page.  This branch is not expected
+ to be reached at present (hence the assert below); the code is kept
+ correct anyway to avoid confusion.
+ */
+ DBUG_ASSERT(0);
+ /* Key cache is not used */
+ if (write_mode == PAGECACHE_WRITE_DELAY)
+ {
+ /* We can't use mutex here as the key cache may not be initialized */
+ pagecache->global_cache_w_requests++;
+ pagecache->global_cache_write++;
+ if (offset != 0 || size != pagecache->block_size)
+ {
+ uchar *page_buffer= (uchar *) alloca(pagecache->block_size);
+
+ pagecache->global_cache_read++;
+ if ((error= (pagecache_fread(pagecache, file,
+ page_buffer,
+ pageno,
+ pagecache->readwrite_flags) != 0)))
+ goto end;
+ if ((file->read_callback)(page_buffer, pageno, file->callback_data))
+ {
+ DBUG_PRINT("error", ("read callback problem"));
+ error= 1;
+ goto end;
+ }
+ memcpy((char *)page_buffer + offset, buff, size);
+ buff= page_buffer;
+ }
+ if (pagecache_fwrite(pagecache, file, buff, pageno, type,
+ pagecache->readwrite_flags))
+ error= 1;
+ }
+
+end:
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+ DBUG_EXECUTE("exec",
+ test_key_cache(pagecache, "end of key_cache_write", 1););
+#endif
+ if (block)
+ PCBLOCK_INFO(block);
+ else
+ DBUG_PRINT("info", ("No block"));
+ DBUG_RETURN(error);
+}
+
+
+/*
+  Free a block: remove the reference to it from the hash table,
+  unlink it from the chain of dirty/clean blocks of its file,
+  and add it to the free list.
+
+  The callers visible in this file invoke it with pagecache->cache_lock
+  held.  The block must not be locked or pinned (asserted below).
+*/
+
+static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block)
+{
+ KEYCACHE_THREAD_TRACE("free block");
+ KEYCACHE_DBUG_PRINT("free_block",
+ ("block: %u hash_link 0x%lx",
+ PCBLOCK_NUMBER(pagecache, block),
+ (long) block->hash_link));
+ if (block->hash_link)
+ {
+ /*
+ While waiting for readers to finish, new readers might request the
+ block. But since we set block->status|= PCBLOCK_REASSIGNED, they
+ will wait on block->wqueue[COND_FOR_SAVED]. They must be signalled
+ later.
+ */
+ block->status|= PCBLOCK_REASSIGNED;
+ wait_for_readers(pagecache, block);
+ unlink_hash(pagecache, block->hash_link);
+ }
+
+ unlink_changed(block);
+ DBUG_ASSERT(block->wlocks == 0);
+ DBUG_ASSERT(block->rlocks == 0);
+ DBUG_ASSERT(block->rlocks_queue == 0);
+ DBUG_ASSERT(block->pins == 0);
+ block->status= 0;
+#ifndef DBUG_OFF
+ block->type= PAGECACHE_EMPTY_PAGE;
+#endif
+ /* A free block carries no REDO information. */
+ block->rec_lsn= LSN_MAX;
+ KEYCACHE_THREAD_TRACE("free block");
+ KEYCACHE_DBUG_PRINT("free_block",
+ ("block is freed"));
+ unreg_request(pagecache, block, 0);
+ block->hash_link= NULL;
+
+ /* Remove the free block from the LRU ring. */
+ unlink_block(pagecache, block);
+ if (block->temperature == PCBLOCK_WARM)
+ pagecache->warm_blocks--;
+ block->temperature= PCBLOCK_COLD;
+ /* Insert the free block in the free list. */
+ block->next_used= pagecache->free_block_list;
+ pagecache->free_block_list= block;
+ /* Keep track of the number of currently unused blocks. */
+ pagecache->blocks_unused++;
+
+#ifdef THREAD
+ /* All pending requests for this page must be resubmitted. */
+ if (block->wqueue[COND_FOR_SAVED].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]);
+#endif
+}
+
+
+/*
+  qsort() comparator: orders blocks by the page number they cache, so a
+  batch of dirty blocks can be flushed in file order.
+*/
+static int cmp_sec_link(PAGECACHE_BLOCK_LINK **a, PAGECACHE_BLOCK_LINK **b)
+{
+  pgcache_page_no_t page_a= (*a)->hash_link->pageno;
+  pgcache_page_no_t page_b= (*b)->hash_link->pageno;
+  if (page_a < page_b)
+    return -1;
+  if (page_a > page_b)
+    return 1;
+  return 0;
+}
+
+
+/**
+ @brief Flush a portion of changed blocks to disk, free used blocks
+ if requested
+
+ @param pagecache This page cache reference.
+ @param file File which should be flushed
+ @param cache Beginning of array of the blocks.
+ @param end Reference to the block after last in the array.
+ @param type Type of the flush.
+ @param first_errno Where to store first errno of the flush.
+
+
+ @return Operation status
+ @retval PCFLUSH_OK OK
+ @retval PCFLUSH_ERROR There were errors during the flush process.
+ @retval PCFLUSH_PINNED Pinned blocks were met and skipped.
+ @retval PCFLUSH_PINNED_AND_ERROR PCFLUSH_ERROR and PCFLUSH_PINNED.
+*/
+
+static int flush_cached_blocks(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ PAGECACHE_BLOCK_LINK **cache,
+ PAGECACHE_BLOCK_LINK **end,
+ enum flush_type type,
+ int *first_errno)
+{
+ int rc= PCFLUSH_OK;
+ my_bool error;
+ uint count= (uint) (end-cache);
+ DBUG_ENTER("flush_cached_blocks");
+ *first_errno= 0;
+
+ /* Don't lock the cache during the flush */
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ /*
+ As all blocks referred in 'cache' are marked by PCBLOCK_IN_FLUSH
+ we are guaranteed that no thread will change them.
+ Sort by page number so the writes happen in file order.
+ */
+ qsort((uchar*) cache, count, sizeof(*cache), (qsort_cmp) cmp_sec_link);
+
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ for (; cache != end; cache++)
+ {
+ PAGECACHE_BLOCK_LINK *block= *cache;
+
+ /*
+ In the case of non_transactional tables we want to flush also
+ blocks pinned with reads. This is because we may have other
+ threads reading the block during flush, as non transactional
+ tables can have many readers while the one writer is doing the
+ flush.
+ We don't want to flush pinned blocks during checkpoint.
+ We detect the checkpoint case by checking if type is LAZY.
+ */
+ if ((type == FLUSH_KEEP_LAZY && block->pins) || block->wlocks)
+ {
+ KEYCACHE_DBUG_PRINT("flush_cached_blocks",
+ ("block: %u (0x%lx) pinned",
+ PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+ DBUG_PRINT("info", ("block: %u (0x%lx) pinned",
+ PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+ PCBLOCK_INFO(block);
+ /* undo the mark put by flush_pagecache_blocks_int(): */
+ block->status&= ~PCBLOCK_IN_FLUSH;
+ rc|= PCFLUSH_PINNED;
+ DBUG_PRINT("warning", ("Page pinned"));
+ unreg_request(pagecache, block, 1);
+ if (!*first_errno)
+ *first_errno= HA_ERR_INTERNAL_ERROR;
+ continue;
+ }
+ /* Read-lock and pin the block for the duration of the disk write. */
+ if (make_lock_and_pin(pagecache, block,
+ PAGECACHE_LOCK_READ, PAGECACHE_PIN, FALSE))
+ DBUG_ASSERT(0);
+
+ KEYCACHE_DBUG_PRINT("flush_cached_blocks",
+ ("block: %u (0x%lx) to be flushed",
+ PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+ DBUG_PRINT("info", ("block: %u (0x%lx) to be flushed",
+ PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+ PCBLOCK_INFO(block);
+ /* The cache mutex is released while the page is written to disk. */
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_PRINT("info", ("block: %u (0x%lx) pins: %u",
+ PCBLOCK_NUMBER(pagecache, block), (ulong)block,
+ block->pins));
+ /**
+ @todo IO If page is contiguous with next page to flush, group flushes
+ in one single my_pwrite().
+ */
+ /**
+ It is important to use block->hash_link->file below and not 'file', as
+ the first one is right and the second may have different out-of-date
+ content (see StaleFilePointersInFlush in ma_checkpoint.c).
+ @todo change argument of functions to be File.
+ */
+ error= pagecache_fwrite(pagecache, &block->hash_link->file,
+ block->buffer,
+ block->hash_link->pageno,
+ block->type,
+ pagecache->readwrite_flags);
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+
+ if (make_lock_and_pin(pagecache, block,
+ PAGECACHE_LOCK_READ_UNLOCK,
+ PAGECACHE_UNPIN, FALSE))
+ DBUG_ASSERT(0);
+
+ pagecache->global_cache_write++;
+ if (error)
+ {
+ block->status|= PCBLOCK_ERROR;
+ block->error= (int16) my_errno;
+ my_debug_put_break_here();
+ if (!*first_errno)
+ *first_errno= my_errno ? my_errno : -1;
+ rc|= PCFLUSH_ERROR;
+ }
+#ifdef THREAD
+ /*
+ Let to proceed for possible waiting requests to write to the block page.
+ It might happen only during an operation to resize the key cache.
+ */
+ if (block->wqueue[COND_FOR_SAVED].last_thread)
+ wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]);
+#endif
+ /* type will never be FLUSH_IGNORE_CHANGED here */
+ if (! (type == FLUSH_KEEP || type == FLUSH_KEEP_LAZY ||
+ type == FLUSH_FORCE_WRITE))
+ {
+ /* Flush-and-free variants: the block leaves the cache entirely. */
+ pagecache->blocks_changed--;
+ pagecache->global_blocks_changed--;
+ free_block(pagecache, block);
+ }
+ else
+ {
+ /* Keep variants: the block stays cached, now clean. */
+ block->status&= ~PCBLOCK_IN_FLUSH;
+ link_to_file_list(pagecache, block, file, 1);
+ unreg_request(pagecache, block, 1);
+ }
+ }
+ DBUG_RETURN(rc);
+}
+
+
+/**
+ @brief flush all blocks for a file to disk but don't do any mutex locks
+
+ @param pagecache pointer to a pagecache data structure
+ @param file handler for the file to flush to
+ @param type type of the flush
+ @param filter optional function which tells what blocks to flush;
+ can be non-NULL only if FLUSH_KEEP, FLUSH_KEEP_LAZY
+ or FLUSH_FORCE_WRITE.
+ @param filter_arg an argument to pass to 'filter'. Information about
+ the block will be passed too.
+
+ @note
+ Flushes all blocks having the same OS file descriptor as 'file->file', so
+ can flush blocks having '*block->hash_link->file' != '*file'.
+
+ @note
+ This function doesn't do any mutex locks because it needs to be called
+ both from flush_pagecache_blocks and flush_all_key_blocks (the latter one
+ does the mutex lock in the resize_pagecache() function).
+
+ @note
+ This function can cause problems if two threads call it
+ concurrently on the same file (look for "PageCacheFlushConcurrencyBugs"
+ in ma_checkpoint.c); to avoid them, it has internal logic to serialize in
+ this situation.
+
+ @return Operation status
+ @retval PCFLUSH_OK OK
+ @retval PCFLUSH_ERROR There were errors during the flush process.
+ @retval PCFLUSH_PINNED Pinned blocks were met and skipped.
+ @retval PCFLUSH_PINNED_AND_ERROR PCFLUSH_ERROR and PCFLUSH_PINNED.
+*/
+
+static int flush_pagecache_blocks_int(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ enum flush_type type,
+ PAGECACHE_FLUSH_FILTER filter,
+ void *filter_arg)
+{
+ PAGECACHE_BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache;
+ int last_errno= 0;
+ int rc= PCFLUSH_OK;
+ DBUG_ENTER("flush_pagecache_blocks_int");
+ DBUG_PRINT("enter",
+ ("fd: %d blocks_used: %lu blocks_changed: %lu type: %d",
+ file->file, pagecache->blocks_used, pagecache->blocks_changed,
+ type));
+
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+ DBUG_EXECUTE("check_pagecache",
+ test_key_cache(pagecache,
+ "start of flush_pagecache_blocks", 0););
+#endif
+
+ cache= cache_buff;
+ if (pagecache->disk_blocks > 0 &&
+ (!my_disable_flush_pagecache_blocks ||
+ (type != FLUSH_KEEP && type != FLUSH_KEEP_LAZY)))
+ {
+ /*
+ Key cache exists. If my_disable_flush_pagecache_blocks is true it
+ disables the operation but only FLUSH_KEEP[_LAZY]: other flushes still
+ need to be allowed: FLUSH_RELEASE has to free blocks, and
+ FLUSH_FORCE_WRITE is to overrule my_disable_flush_pagecache_blocks.
+ */
+ int error= 0;
+ uint count= 0;
+ PAGECACHE_BLOCK_LINK **pos, **end;
+ PAGECACHE_BLOCK_LINK *first_in_switch= NULL;
+ PAGECACHE_BLOCK_LINK *block, *next;
+#if defined(PAGECACHE_DEBUG)
+ uint cnt= 0;
+#endif
+
+#ifdef THREAD
+ struct st_file_in_flush us_flusher, *other_flusher;
+ us_flusher.file= file->file;
+ us_flusher.flush_queue.last_thread= NULL;
+ us_flusher.first_in_switch= FALSE;
+ while ((other_flusher= (struct st_file_in_flush *)
+ hash_search(&pagecache->files_in_flush, (uchar *)&file->file,
+ sizeof(file->file))))
+ {
+ /*
+ File is in flush already: wait, unless FLUSH_KEEP_LAZY. "Flusher"
+ means "who can mark PCBLOCK_IN_FLUSH", i.e. caller of
+ flush_pagecache_blocks_int().
+ */
+ struct st_my_thread_var *thread;
+ if (type == FLUSH_KEEP_LAZY)
+ {
+ DBUG_PRINT("info",("FLUSH_KEEP_LAZY skips"));
+ DBUG_RETURN(0);
+ }
+ thread= my_thread_var;
+ wqueue_add_to_queue(&other_flusher->flush_queue, thread);
+ do
+ {
+ KEYCACHE_DBUG_PRINT("flush_pagecache_blocks_int: wait1",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while (thread->next);
+ }
+ /* we are the only flusher of this file now */
+ while (my_hash_insert(&pagecache->files_in_flush, (uchar *)&us_flusher))
+ {
+ /*
+ Out of memory, wait for flushers to empty the hash and retry; should
+ rarely happen. Other threads are flushing the file; when done, they
+ are going to remove themselves from the hash, and thus memory will
+ appear again. However, this memory may be stolen by yet another thread
+ (for a purpose unrelated to page cache), before we retry
+ hash_insert(). So the loop may run for long. Only if the thread was
+ killed do we abort the loop, returning 1 (error) which can cause the
+ table to be marked as corrupted (cf maria_chk_size(), maria_close())
+ and thus require a table check.
+ */
+ DBUG_ASSERT(0);
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ if (my_thread_var->abort)
+ DBUG_RETURN(1); /* End if aborted by user */
+ sleep(10);
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ }
+#endif
+
+ if (type != FLUSH_IGNORE_CHANGED)
+ {
+ /*
+ Count how many key blocks we have to cache to be able
+ to flush all dirty pages with minimum seek moves.
+ */
+ for (block= pagecache->changed_blocks[FILE_HASH(*file)] ;
+ block;
+ block= block->next_changed)
+ {
+ if (block->hash_link->file.file == file->file)
+ {
+ count++;
+ KEYCACHE_DBUG_ASSERT(count<= pagecache->blocks_used);
+ }
+ }
+ /* Allocate a new buffer only if it's bigger than the one we have */
+ if (count > FLUSH_CACHE &&
+ !(cache=
+ (PAGECACHE_BLOCK_LINK**)
+ my_malloc(sizeof(PAGECACHE_BLOCK_LINK*)*count, MYF(0))))
+ {
+ /* On malloc failure fall back to the stack buffer. */
+ cache= cache_buff;
+ count= FLUSH_CACHE;
+ }
+ }
+
+ /* Retrieve the blocks and write them to a buffer to be flushed */
+restart:
+ end= (pos= cache)+count;
+ for (block= pagecache->changed_blocks[FILE_HASH(*file)] ;
+ block;
+ block= next)
+ {
+#if defined(PAGECACHE_DEBUG)
+ cnt++;
+ KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+ next= block->next_changed;
+ if (block->hash_link->file.file != file->file)
+ continue;
+ if (filter != NULL)
+ {
+ int filter_res= (*filter)(block->type, block->hash_link->pageno,
+ block->rec_lsn, filter_arg);
+ DBUG_PRINT("info",("filter returned %d", filter_res));
+ if (filter_res == FLUSH_FILTER_SKIP_TRY_NEXT)
+ continue;
+ if (filter_res == FLUSH_FILTER_SKIP_ALL)
+ break;
+ DBUG_ASSERT(filter_res == FLUSH_FILTER_OK);
+ }
+ {
+ /*
+ Mark the block with BLOCK_IN_FLUSH in order not to let
+ other threads to use it for new pages and interfere with
+ our sequence of flushing dirty file pages
+ */
+ block->status|= PCBLOCK_IN_FLUSH;
+
+ if (! (block->status & PCBLOCK_IN_SWITCH))
+ {
+ /*
+ We care only for the blocks for which flushing was not
+ initiated by other threads as a result of page swapping
+ */
+ reg_requests(pagecache, block, 1);
+ if (type != FLUSH_IGNORE_CHANGED)
+ {
+ /* It's not a temporary file */
+ if (pos == end)
+ {
+ /*
+ This happens only if there is not enough
+ memory for the big block
+ */
+ if ((rc|= flush_cached_blocks(pagecache, file, cache,
+ end, type, &error)) &
+ (PCFLUSH_ERROR | PCFLUSH_PINNED))
+ last_errno=error;
+ DBUG_PRINT("info", ("restarting..."));
+ /*
+ Restart the scan as some other thread might have changed
+ the changed blocks chain: the blocks that were in switch
+ state before the flush started have to be excluded
+ */
+ goto restart;
+ }
+ *pos++= block;
+ }
+ else
+ {
+ /* It's a temporary file */
+ pagecache->blocks_changed--;
+ pagecache->global_blocks_changed--;
+ free_block(pagecache, block);
+ }
+ }
+ else if (type != FLUSH_KEEP_LAZY)
+ {
+ /*
+ Link the block into a list of blocks 'in switch', and then we will
+ wait for this list to be empty, which means they have been flushed
+ */
+ unlink_changed(block);
+ link_changed(block, &first_in_switch);
+ /*
+ NOTE(review): 'us_flusher' is declared under #ifdef THREAD but
+ referenced here unconditionally - verify non-THREAD builds.
+ */
+ us_flusher.first_in_switch= TRUE;
+ }
+ }
+ }
+ if (pos != cache)
+ {
+ if ((rc|= flush_cached_blocks(pagecache, file, cache, pos, type,
+ &error)) &
+ (PCFLUSH_ERROR | PCFLUSH_PINNED))
+ last_errno= error;
+ }
+ /* Wait until list of blocks in switch is empty */
+ while (first_in_switch)
+ {
+#if defined(PAGECACHE_DEBUG)
+ cnt= 0;
+#endif
+ block= first_in_switch;
+ {
+#ifdef THREAD
+ struct st_my_thread_var *thread= my_thread_var;
+ wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread);
+ do
+ {
+ KEYCACHE_DBUG_PRINT("flush_pagecache_blocks_int: wait2",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while (thread->next);
+#else
+ KEYCACHE_DBUG_ASSERT(0);
+ /* No parallel requests in single-threaded case */
+#endif
+ }
+#if defined(PAGECACHE_DEBUG)
+ cnt++;
+ KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+ }
+ /* See NOTE(review) above about 'us_flusher' outside #ifdef THREAD. */
+ us_flusher.first_in_switch= FALSE;
+ /* The following happens very seldom */
+ if (! (type == FLUSH_KEEP || type == FLUSH_KEEP_LAZY ||
+ type == FLUSH_FORCE_WRITE))
+ {
+ /*
+ this code would free all blocks while filter maybe handled only a
+ few, that is not possible.
+ */
+ DBUG_ASSERT(filter == NULL);
+#if defined(PAGECACHE_DEBUG)
+ cnt=0;
+#endif
+ for (block= pagecache->file_blocks[FILE_HASH(*file)] ;
+ block;
+ block= next)
+ {
+#if defined(PAGECACHE_DEBUG)
+ cnt++;
+ KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+ next= block->next_changed;
+ if (block->hash_link->file.file == file->file &&
+ (! (block->status & PCBLOCK_CHANGED)
+ || type == FLUSH_IGNORE_CHANGED))
+ {
+ reg_requests(pagecache, block, 1);
+ free_block(pagecache, block);
+ }
+ }
+ }
+#ifdef THREAD
+ /* wake up others waiting to flush this file */
+ hash_delete(&pagecache->files_in_flush, (uchar *)&us_flusher);
+ if (us_flusher.flush_queue.last_thread)
+ wqueue_release_queue(&us_flusher.flush_queue);
+#endif
+ }
+
+#ifndef DBUG_OFF
+ DBUG_EXECUTE("check_pagecache",
+ test_key_cache(pagecache, "end of flush_pagecache_blocks", 0););
+#endif
+ if (cache != cache_buff)
+ my_free(cache, MYF(0));
+ if (rc != 0)
+ {
+ if (last_errno)
+ my_errno= last_errno; /* Return first error */
+ DBUG_PRINT("error", ("Got error: %d", my_errno));
+ }
+ DBUG_RETURN(rc);
+}
+
+
+/**
+  @brief Flush all blocks of a file to disk.
+
+  Thin locking wrapper around flush_pagecache_blocks_int(): takes the
+  cache mutex, protects against a concurrent resize, and delegates.
+
+  @param pagecache   pointer to a pagecache data structure
+  @param file        handler for the file to flush
+  @param type        type of the flush
+  @param filter      optional function deciding which blocks to flush;
+                     may be non-NULL only for FLUSH_KEEP, FLUSH_KEEP_LAZY
+                     and FLUSH_FORCE_WRITE
+  @param filter_arg  argument passed through to 'filter' (together with
+                     information about each block)
+
+  @return Operation status
+  @retval PCFLUSH_OK               OK
+  @retval PCFLUSH_ERROR            there were errors during the flush
+  @retval PCFLUSH_PINNED           pinned blocks were met and skipped
+  @retval PCFLUSH_PINNED_AND_ERROR both of the above
+*/
+
+int flush_pagecache_blocks_with_filter(PAGECACHE *pagecache,
+                                       PAGECACHE_FILE *file,
+                                       enum flush_type type,
+                                       PAGECACHE_FLUSH_FILTER filter,
+                                       void *filter_arg)
+{
+  int result;
+  DBUG_ENTER("flush_pagecache_blocks_with_filter");
+  DBUG_PRINT("enter", ("pagecache: 0x%lx", (long) pagecache));
+
+  /* Nothing to do when the cache has no buffers. */
+  if (pagecache->disk_blocks <= 0)
+    DBUG_RETURN(0);
+
+  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+  inc_counter_for_resize_op(pagecache);
+  result= flush_pagecache_blocks_int(pagecache, file, type, filter,
+                                     filter_arg);
+  dec_counter_for_resize_op(pagecache);
+  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+  DBUG_RETURN(result);
+}
+
+
+/*
+  Reset the statistics counters of a page cache.
+
+  SYNOPSIS
+    reset_pagecache_counters()
+    name       the name of the cache (used only in debug output)
+    pagecache  pointer to the pagecache whose counters are reset
+
+  DESCRIPTION
+    Used to reset the counters of all currently used caches, both the
+    default one and the named ones.
+
+  RETURN
+    0 always (the operation cannot fail)
+*/
+
+int reset_pagecache_counters(const char *name __attribute__((unused)),
+                             PAGECACHE *pagecache)
+{
+  DBUG_ENTER("reset_pagecache_counters");
+  if (pagecache->inited)
+  {
+    DBUG_PRINT("info", ("Resetting counters for key cache %s.", name));
+    pagecache->global_blocks_changed= 0;   /* Key_blocks_not_flushed */
+    pagecache->global_cache_r_requests= 0; /* Key_read_requests */
+    pagecache->global_cache_read= 0;       /* Key_reads */
+    pagecache->global_cache_w_requests= 0; /* Key_write_requests */
+    pagecache->global_cache_write= 0;      /* Key_writes */
+  }
+  else
+    DBUG_PRINT("info", ("Key cache %s not initialized.", name));
+  DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Allocates a buffer and stores in it some info about all dirty pages
+
+ Does the allocation because the caller cannot know the size itself.
+ Memory freeing is to be done by the caller (if the "str" member of the
+ LEX_STRING is not NULL).
+ Ignores all pages of another type than PAGECACHE_LSN_PAGE, because they
+ are not interesting for a checkpoint record.
+ The caller has the intention of doing checkpoints.
+
+ Record format written to 'str': 8 bytes page count, then per page
+ 2 bytes table id + 1 byte data/index-file flag + 5 bytes pageno +
+ LSN_STORE_SIZE bytes rec_lsn (see the stores below).
+
+ @param pagecache pointer to the page cache
+ @param[out] str pointer to where the allocated buffer, and
+ its size, will be put
+ @param[out] min_rec_lsn pointer to where the minimum rec_lsn of all
+ relevant dirty pages will be put
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache,
+ LEX_STRING *str,
+ LSN *min_rec_lsn)
+{
+ my_bool error= 0;
+ ulong stored_list_size= 0;
+ uint file_hash;
+ char *ptr;
+ LSN minimum_rec_lsn= LSN_MAX;
+ /* NOTE(review): trace tag differs in case from the function name. */
+ DBUG_ENTER("pagecache_collect_changed_blocks_with_LSN");
+
+ DBUG_ASSERT(NULL == str->str);
+ /*
+ We lock the entire cache but will be quick, just reading/writing a few MBs
+ of memory at most.
+ */
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+#ifdef THREAD
+ for (;;)
+ {
+ struct st_file_in_flush *other_flusher;
+ for (file_hash= 0;
+ (other_flusher= (struct st_file_in_flush *)
+ hash_element(&pagecache->files_in_flush, file_hash)) != NULL &&
+ !other_flusher->first_in_switch;
+ file_hash++)
+ {}
+ if (other_flusher == NULL)
+ break;
+ /*
+ other_flusher.first_in_switch is true: some thread is flushing a file
+ and has removed dirty blocks from changed_blocks[] while they were still
+ dirty (they were being evicted (=>flushed) by yet another thread, which
+ may not have flushed the block yet so it may still be dirty).
+ If Checkpoint proceeds now, it will not see the page. If there is a
+ crash right after writing the checkpoint record, before the page is
+ flushed, at recovery the page will be wrongly ignored because it won't
+ be in the dirty pages list in the checkpoint record. So wait.
+ */
+ {
+ struct st_my_thread_var *thread= my_thread_var;
+ wqueue_add_to_queue(&other_flusher->flush_queue, thread);
+ do
+ {
+ KEYCACHE_DBUG_PRINT("pagecache_collect_changed_blocks_with_lsn: wait",
+ ("suspend thread %ld", thread->id));
+ pagecache_pthread_cond_wait(&thread->suspend,
+ &pagecache->cache_lock);
+ }
+ while (thread->next);
+ }
+ }
+#endif
+
+ /* Count how many dirty pages are interesting */
+ for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
+ {
+ PAGECACHE_BLOCK_LINK *block;
+ for (block= pagecache->changed_blocks[file_hash] ;
+ block;
+ block= block->next_changed)
+ {
+ /*
+ Q: is there something subtle with block->hash_link: can it be NULL?
+ does it have to be == hash_link->block... ?
+ */
+ DBUG_ASSERT(block->hash_link != NULL);
+ DBUG_ASSERT(block->status & PCBLOCK_CHANGED);
+ /*
+ Note that we don't store bitmap pages, or pages from non-transactional
+ (like temporary) tables. Don't checkpoint during Recovery which uses
+ PAGECACHE_PLAIN_PAGE.
+ */
+ if (block->type != PAGECACHE_LSN_PAGE)
+ continue; /* no need to store it */
+ stored_list_size++;
+ }
+ }
+
+ compile_time_assert(sizeof(pagecache->blocks) <= 8);
+ str->length= 8 + /* number of dirty pages */
+ (2 + /* table id */
+ 1 + /* data or index file */
+ 5 + /* pageno */
+ LSN_STORE_SIZE /* rec_lsn */
+ ) * stored_list_size;
+ if (NULL == (str->str= my_malloc(str->length, MYF(MY_WME))))
+ goto err;
+ ptr= str->str;
+ int8store(ptr, (ulonglong)stored_list_size);
+ ptr+= 8;
+ DBUG_PRINT("info", ("found %lu dirty pages", stored_list_size));
+ if (stored_list_size == 0)
+ goto end;
+ /* Second pass: serialize each interesting dirty page. */
+ for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
+ {
+ PAGECACHE_BLOCK_LINK *block;
+ for (block= pagecache->changed_blocks[file_hash] ;
+ block;
+ block= block->next_changed)
+ {
+ uint16 table_id;
+ MARIA_SHARE *share;
+ if (block->type != PAGECACHE_LSN_PAGE)
+ continue; /* no need to store it in the checkpoint record */
+ share= (MARIA_SHARE *)(block->hash_link->file.callback_data);
+ table_id= share->id;
+ int2store(ptr, table_id);
+ ptr+= 2;
+ /* 1 if the block belongs to the index file, 0 if to the data file. */
+ ptr[0]= (share->kfile.file == block->hash_link->file.file);
+ ptr++;
+ DBUG_ASSERT(block->hash_link->pageno < ((ULL(1)) << 40));
+ page_store(ptr, block->hash_link->pageno);
+ ptr+= PAGE_STORE_SIZE;
+ lsn_store(ptr, block->rec_lsn);
+ ptr+= LSN_STORE_SIZE;
+ if (block->rec_lsn != LSN_MAX)
+ {
+ DBUG_ASSERT(LSN_VALID(block->rec_lsn));
+ if (cmp_translog_addr(block->rec_lsn, minimum_rec_lsn) < 0)
+ minimum_rec_lsn= block->rec_lsn;
+ } /* otherwise, some trn->rec_lsn should hold the correct info */
+ }
+ }
+end:
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ *min_rec_lsn= minimum_rec_lsn;
+ DBUG_RETURN(error);
+
+err:
+ error= 1;
+ goto end;
+}
+
+
+#ifndef DBUG_OFF
+
+/**
+ Verifies that a file has no dirty pages.
+*/
+
+void pagecache_file_no_dirty_page(PAGECACHE *pagecache, PAGECACHE_FILE *file)
+{
+  File fd= file->file;
+  PAGECACHE_BLOCK_LINK *block;
+  /* Walk the dirty-block chain of the hash bucket this file maps to. */
+  for (block= pagecache->changed_blocks[FILE_HASH(*file)];
+       block != NULL;
+       block= block->next_changed)
+    if (block->hash_link->file.file == fd)
+    {
+      /* A dirty page of this file is still cached: debug-build failure. */
+      DBUG_PRINT("info", ("pagecache_file_not_in error"));
+      PCBLOCK_INFO(block);
+      DBUG_ASSERT(0);
+    }
+}
+
+
+/*
+  Test if disk-cache is ok
+*/
+static void test_key_cache(PAGECACHE *pagecache __attribute__((unused)),
+                           const char *where __attribute__((unused)),
+                           my_bool lock __attribute__((unused)))
+{
+  /* Consistency check not implemented; kept as a debug-build hook. */
+  /* TODO */
+}
+#endif
+
+/* Return the page data buffer of a cache block link (opaque accessor). */
+uchar *pagecache_block_link_to_buffer(PAGECACHE_BLOCK_LINK *block)
+{
+  return block->buffer;
+}
+
+#if defined(PAGECACHE_TIMEOUT)
+
+#define KEYCACHE_DUMP_FILE "pagecache_dump.txt"
+#define MAX_QUEUE_LEN 100
+
+
+/*
+  Dump the page cache state (wait queues, blocks, LRU chain) to
+  KEYCACHE_DUMP_FILE.  Only compiled under PAGECACHE_TIMEOUT; intended
+  for post-mortem analysis when a wait times out.
+*/
+static void pagecache_dump(PAGECACHE *pagecache)
+{
+  FILE *pagecache_dump_file= fopen(KEYCACHE_DUMP_FILE, "w");
+  struct st_my_thread_var *last;
+  struct st_my_thread_var *thread;
+  PAGECACHE_BLOCK_LINK *block;
+  PAGECACHE_HASH_LINK *hash_link;
+  PAGECACHE_PAGE *page;
+  uint i;
+
+  if (!pagecache_dump_file)
+    return;                                 /* nothing we can do */
+
+  /* BUG FIX: 'thread' was read uninitialized here; use the current one. */
+  thread= my_thread_var;
+  fprintf(pagecache_dump_file, "thread:%u\n", thread->id);
+
+  i=0;
+  thread=last=waiting_for_hash_link.last_thread;
+  fprintf(pagecache_dump_file, "queue of threads waiting for hash link\n");
+  if (thread)
+    do
+    {
+      thread= thread->next;
+      page= (PAGECACHE_PAGE *) thread->opt_info;
+      fprintf(pagecache_dump_file,
+              "thread:%u, (file,pageno)=(%u,%lu)\n",
+              thread->id,(uint) page->file.file,(ulong) page->pageno);
+      if (++i == MAX_QUEUE_LEN)
+        break;
+    }
+    while (thread != last);
+
+  i=0;
+  thread=last=waiting_for_block.last_thread;
+  fprintf(pagecache_dump_file, "queue of threads waiting for block\n");
+  if (thread)
+    do
+    {
+      thread=thread->next;
+      hash_link= (PAGECACHE_HASH_LINK *) thread->opt_info;
+      fprintf(pagecache_dump_file,
+              "thread:%u hash_link:%u (file,pageno)=(%u,%lu)\n",
+              thread->id,
+              (uint) PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link),
+              (uint) hash_link->file.file,(ulong) hash_link->pageno);
+      if (++i == MAX_QUEUE_LEN)
+        break;
+    }
+    while (thread != last);
+
+  for (i=0 ; i < pagecache->blocks_used ; i++)
+  {
+    int j;
+    block= &pagecache->block_root[i];
+    hash_link= block->hash_link;
+    fprintf(pagecache_dump_file,
+            "block:%u hash_link:%d status:%x #requests=%u waiting_for_readers:%d\n",
+            i, (int) (hash_link ?
+                      PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link) :
+                      -1),
+            block->status, block->requests, block->condvar ? 1 : 0);
+    for (j=0 ; j < COND_SIZE; j++)
+    {
+      PAGECACHE_WQUEUE *wqueue=&block->wqueue[j];
+      /* BUG FIX: the queue-length limit used '++i', clobbering the outer
+         block counter; use a dedicated per-queue counter instead. */
+      uint qlen= 0;
+      thread= last= wqueue->last_thread;
+      fprintf(pagecache_dump_file, "queue #%d\n", j);
+      if (thread)
+      {
+        do
+        {
+          thread=thread->next;
+          fprintf(pagecache_dump_file,
+                  "thread:%u\n", thread->id);
+          if (++qlen == MAX_QUEUE_LEN)
+            break;
+        }
+        while (thread != last);
+      }
+    }
+  }
+  fprintf(pagecache_dump_file, "LRU chain:");
+  /* BUG FIX: was 'block= pagecache= used_last;' — an assignment typo that
+     could not compile; the loop below clearly wants pagecache->used_last. */
+  block= pagecache->used_last;
+  if (block)
+  {
+    do
+    {
+      block= block->next_used;
+      fprintf(pagecache_dump_file,
+              "block:%u, ", PCBLOCK_NUMBER(pagecache, block));
+    }
+    while (block != pagecache->used_last);
+  }
+  fprintf(pagecache_dump_file, "\n");
+
+  fclose(pagecache_dump_file);
+}
+
+#endif /* defined(PAGECACHE_TIMEOUT) */
+
+#if defined(PAGECACHE_TIMEOUT) && !defined(__WIN__)
+
+
+/*
+  Condition wait with a PAGECACHE_TIMEOUT-second timeout; on timeout the
+  cache state is dumped and (in debug builds) the process aborts.
+*/
+static int pagecache_pthread_cond_wait(pthread_cond_t *cond,
+                                       pthread_mutex_t *mutex)
+{
+  int rc;
+  struct timeval now;            /* time when we started waiting        */
+  struct timespec timeout;       /* timeout value for the wait function */
+  struct timezone tz;
+#if defined(PAGECACHE_DEBUG)
+  /*
+    BUG FIX: 'cnt' was an automatic variable reset to 0 on every call, so
+    'cnt % 100 == 0' could never be true and the rate-limited trace below
+    never fired.  Make it static so it counts across calls (debug only;
+    the imprecision of an unsynchronized counter is acceptable here).
+  */
+  static int cnt= 0;
+#endif
+
+  /* Get current time */
+  gettimeofday(&now, &tz);
+  /* Prepare timeout value */
+  timeout.tv_sec= now.tv_sec + PAGECACHE_TIMEOUT;
+  /*
+    timeval uses microseconds.
+    timespec uses nanoseconds.
+    1 microsecond = 1000 nanoseconds
+  */
+  timeout.tv_nsec= now.tv_usec * 1000;
+  KEYCACHE_THREAD_TRACE_END("started waiting");
+#if defined(PAGECACHE_DEBUG)
+  cnt++;
+  if (cnt % 100 == 0)
+    fprintf(pagecache_debug_log, "waiting...\n");
+  fflush(pagecache_debug_log);
+#endif
+  rc= pthread_cond_timedwait(cond, mutex, &timeout);
+  KEYCACHE_THREAD_TRACE_BEGIN("finished waiting");
+  if (rc == ETIMEDOUT || rc == ETIME)
+  {
+#if defined(PAGECACHE_DEBUG)
+    fprintf(pagecache_debug_log,"aborted by pagecache timeout\n");
+    fclose(pagecache_debug_log);
+    abort();
+#endif
+    /*
+      NOTE(review): pagecache_dump() takes a PAGECACHE* argument but none
+      is in scope here, so this call cannot compile when PAGECACHE_TIMEOUT
+      is enabled — the cache pointer needs to be plumbed through.  Left
+      as-is to keep this fix self-contained.
+    */
+    pagecache_dump();
+  }
+
+#if defined(PAGECACHE_DEBUG)
+  KEYCACHE_DBUG_ASSERT(rc != ETIMEDOUT);
+#else
+  assert(rc != ETIMEDOUT);
+#endif
+  return rc;
+}
+#else
+#if defined(PAGECACHE_DEBUG)
+static int pagecache_pthread_cond_wait(pthread_cond_t *cond,
+                                       pthread_mutex_t *mutex)
+{
+  int rc;
+  /* Trace suspension around a plain (non-timed) condition wait. */
+  KEYCACHE_THREAD_TRACE_END("started waiting");
+  rc= pthread_cond_wait(cond, mutex);
+  KEYCACHE_THREAD_TRACE_BEGIN("finished waiting");
+  return rc;
+}
+#endif
+#endif /* defined(PAGECACHE_TIMEOUT) && !defined(__WIN__) */
+
+#if defined(PAGECACHE_DEBUG)
+static int ___pagecache_pthread_mutex_lock(pthread_mutex_t *mutex)
+{
+  int rc;
+  rc= pthread_mutex_lock(mutex);
+  /* Trace after acquiring, so the trace itself runs under the lock. */
+  KEYCACHE_THREAD_TRACE_BEGIN("");
+  return rc;
+}
+
+
+static void ___pagecache_pthread_mutex_unlock(pthread_mutex_t *mutex)
+{
+  /* Trace before releasing, mirroring ___pagecache_pthread_mutex_lock(). */
+  KEYCACHE_THREAD_TRACE_END("");
+  pthread_mutex_unlock(mutex);
+}
+
+
+/* Traced wrapper around pthread_cond_signal() (PAGECACHE_DEBUG builds). */
+static int ___pagecache_pthread_cond_signal(pthread_cond_t *cond)
+{
+  int rc;
+  KEYCACHE_THREAD_TRACE("signal");
+  rc= pthread_cond_signal(cond);
+  return rc;
+}
+
+
+#if defined(PAGECACHE_DEBUG_LOG)
+
+
+/* printf-style writer for the page cache debug log (one line per call). */
+static void pagecache_debug_print(const char * fmt, ...)
+{
+  va_list args;
+  va_start(args,fmt);
+  /* Silently drop the message if the debug log could not be opened. */
+  if (pagecache_debug_log)
+  {
+    VOID(vfprintf(pagecache_debug_log, fmt, args));
+    VOID(fputc('\n',pagecache_debug_log));
+  }
+  va_end(args);
+}
+#endif /* defined(PAGECACHE_DEBUG_LOG) */
+
+#if defined(PAGECACHE_DEBUG_LOG)
+
+
+/*
+  Close the page cache debug log if it is open.
+
+  BUG FIX: reset the handle to NULL after fclose() so a second call —
+  or a later pagecache_debug_print() — cannot touch the stale FILE*;
+  using a stream after fclose() is undefined behavior.
+*/
+void pagecache_debug_log_close(void)
+{
+  if (pagecache_debug_log)
+  {
+    fclose(pagecache_debug_log);
+    pagecache_debug_log= NULL;
+  }
+}
+#endif /* defined(PAGECACHE_DEBUG_LOG) */
+
+#endif /* defined(PAGECACHE_DEBUG) */
diff --git a/storage/maria/ma_pagecache.h b/storage/maria/ma_pagecache.h
new file mode 100644
index 00000000000..821728ef374
--- /dev/null
+++ b/storage/maria/ma_pagecache.h
@@ -0,0 +1,325 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Page cache variable structures */
+
+#ifndef _ma_pagecache_h
+#define _ma_pagecache_h
+C_MODE_START
+
+#include "ma_loghandler_lsn.h"
+#include <m_string.h>
+#include <hash.h>
+
+/* Type of the page */
+enum pagecache_page_type
+{
+  /*
+    Used only to check page type changes during debugging.  This value
+    should only be used when DBUG is enabled.
+  */
+ PAGECACHE_EMPTY_PAGE,
+ /* the page does not contain LSN */
+ PAGECACHE_PLAIN_PAGE,
+  /* the page contains an LSN (maria tablespace page) */
+ PAGECACHE_LSN_PAGE,
+ /* Page type used when scanning file and we don't care about the type */
+ PAGECACHE_READ_UNKNOWN_PAGE
+};
+
+/*
+ This enum describe lock status changing. every type of page cache will
+ interpret WRITE/READ lock as it need.
+*/
+enum pagecache_page_lock
+{
+ PAGECACHE_LOCK_LEFT_UNLOCKED, /* free -> free */
+ PAGECACHE_LOCK_LEFT_READLOCKED, /* read -> read */
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, /* write -> write */
+ PAGECACHE_LOCK_READ, /* free -> read */
+ PAGECACHE_LOCK_WRITE, /* free -> write */
+ PAGECACHE_LOCK_READ_UNLOCK, /* read -> free */
+ PAGECACHE_LOCK_WRITE_UNLOCK, /* write -> free */
+ PAGECACHE_LOCK_WRITE_TO_READ /* write -> read */
+};
+/*
+ This enum describe pin status changing
+*/
+enum pagecache_page_pin
+{
+ PAGECACHE_PIN_LEFT_PINNED, /* pinned -> pinned */
+ PAGECACHE_PIN_LEFT_UNPINNED, /* unpinned -> unpinned */
+ PAGECACHE_PIN, /* unpinned -> pinned */
+ PAGECACHE_UNPIN /* pinned -> unpinned */
+};
+/* How to write the page */
+enum pagecache_write_mode
+{
+ /* do not write immediately, i.e. it will be dirty page */
+ PAGECACHE_WRITE_DELAY,
+ /* page already is in the file. (key cache insert analogue) */
+ PAGECACHE_WRITE_DONE
+};
+
+/* page number for maria */
+typedef ulonglong pgcache_page_no_t;
+
+/* file descriptor for Maria */
+typedef struct st_pagecache_file
+{
+ File file;
+ /** Cannot be NULL */
+ my_bool (*read_callback)(uchar *page, pgcache_page_no_t offset,
+ uchar *data);
+ /** Cannot be NULL */
+ my_bool (*write_callback)(uchar *page, pgcache_page_no_t offset,
+ uchar *data);
+ void (*write_fail)(uchar *data);
+ /** Cannot be NULL */
+ my_bool (*flush_log_callback)(uchar *page, pgcache_page_no_t offset,
+ uchar *data);
+ uchar *callback_data;
+} PAGECACHE_FILE;
+
+/* declare structures that is used by st_pagecache */
+
+struct st_pagecache_block_link;
+typedef struct st_pagecache_block_link PAGECACHE_BLOCK_LINK;
+struct st_pagecache_page;
+typedef struct st_pagecache_page PAGECACHE_PAGE;
+struct st_pagecache_hash_link;
+typedef struct st_pagecache_hash_link PAGECACHE_HASH_LINK;
+
+#include <wqueue.h>
+
+#define PAGECACHE_CHANGED_BLOCKS_HASH 128 /* must be power of 2 */
+#define PAGECACHE_PRIORITY_LOW 0
+#define PAGECACHE_PRIORITY_DEFAULT 3
+#define PAGECACHE_PRIORITY_HIGH 6
+
+/*
+ The page cache structure
+ It also contains read-only statistics parameters.
+*/
+
+typedef struct st_pagecache
+{
+ size_t mem_size; /* specified size of the cache memory */
+ ulong min_warm_blocks; /* min number of warm blocks; */
+ ulong age_threshold; /* age threshold for hot blocks */
+ ulonglong time; /* total number of block link operations */
+ ulong hash_entries; /* max number of entries in the hash table */
+ long hash_links; /* max number of hash links */
+ long hash_links_used; /* number of hash links taken from free links pool */
+ long disk_blocks; /* max number of blocks in the cache */
+ ulong blocks_used; /* maximum number of concurrently used blocks */
+ ulong blocks_unused; /* number of currently unused blocks */
+ ulong blocks_changed; /* number of currently dirty blocks */
+ ulong warm_blocks; /* number of blocks in warm sub-chain */
+ ulong cnt_for_resize_op; /* counter to block resize operation */
+ ulong blocks_available; /* number of blocks available in the LRU chain */
+ long blocks; /* max number of blocks in the cache */
+ uint32 block_size; /* size of the page buffer of a cache block */
+ PAGECACHE_HASH_LINK **hash_root;/* arr. of entries into hash table buckets */
+ PAGECACHE_HASH_LINK *hash_link_root;/* memory for hash table links */
+ PAGECACHE_HASH_LINK *free_hash_list;/* list of free hash links */
+ PAGECACHE_BLOCK_LINK *free_block_list;/* list of free blocks */
+ PAGECACHE_BLOCK_LINK *block_root;/* memory for block links */
+ uchar HUGE_PTR *block_mem; /* memory for block buffers */
+ PAGECACHE_BLOCK_LINK *used_last;/* ptr to the last block of the LRU chain */
+ PAGECACHE_BLOCK_LINK *used_ins;/* ptr to the insertion block in LRU chain */
+ pthread_mutex_t cache_lock; /* to lock access to the cache structure */
+ WQUEUE resize_queue; /* threads waiting during resize operation */
+ WQUEUE waiting_for_hash_link;/* waiting for a free hash link */
+ WQUEUE waiting_for_block; /* requests waiting for a free block */
+ /* hash for dirty file bl.*/
+ PAGECACHE_BLOCK_LINK *changed_blocks[PAGECACHE_CHANGED_BLOCKS_HASH];
+ /* hash for other file bl.*/
+ PAGECACHE_BLOCK_LINK *file_blocks[PAGECACHE_CHANGED_BLOCKS_HASH];
+
+  /*
+    The following variables are used to hold parameters for
+    initializing the page cache.
+  */
+
+ ulonglong param_buff_size; /* size the memory allocated for the cache */
+ ulong param_block_size; /* size of the blocks in the key cache */
+ ulong param_division_limit; /* min. percentage of warm blocks */
+ ulong param_age_threshold; /* determines when hot block is downgraded */
+
+ /* Statistics variables. These are reset in reset_pagecache_counters(). */
+ ulong global_blocks_changed; /* number of currently dirty blocks */
+ ulonglong global_cache_w_requests;/* number of write requests (write hits) */
+ ulonglong global_cache_write; /* number of writes from cache to files */
+ ulonglong global_cache_r_requests;/* number of read requests (read hits) */
+ ulonglong global_cache_read; /* number of reads from files to cache */
+
+ uint shift; /* block size = 2 ^ shift */
+ myf readwrite_flags; /* Flags to pread/pwrite() */
+ myf org_readwrite_flags; /* Flags to pread/pwrite() at init */
+ my_bool inited;
+ my_bool resize_in_flush; /* true during flush of resize operation */
+ my_bool can_be_used; /* usage of cache for read/write is allowed */
+ my_bool in_init; /* Set to 1 in MySQL during init/resize */
+ HASH files_in_flush; /**< files in flush_pagecache_blocks_int() */
+} PAGECACHE;
+
+/** @brief Return values for PAGECACHE_FLUSH_FILTER */
+enum pagecache_flush_filter_result
+{
+ FLUSH_FILTER_SKIP_TRY_NEXT= 0,/**< skip page and move on to next one */
+ FLUSH_FILTER_OK, /**< flush page and move on to next one */
+ FLUSH_FILTER_SKIP_ALL /**< skip page and all next ones */
+};
+/** @brief a filter function type for flush_pagecache_blocks_with_filter() */
+typedef enum pagecache_flush_filter_result
+(*PAGECACHE_FLUSH_FILTER)(enum pagecache_page_type type,
+ pgcache_page_no_t page,
+ LSN rec_lsn, void *arg);
+
+/* The default key cache */
+extern PAGECACHE dflt_pagecache_var, *dflt_pagecache;
+
+extern ulong init_pagecache(PAGECACHE *pagecache, size_t use_mem,
+ uint division_limit, uint age_threshold,
+ uint block_size, myf my_read_flags);
+extern ulong resize_pagecache(PAGECACHE *pagecache,
+ size_t use_mem, uint division_limit,
+ uint age_threshold);
+extern void change_pagecache_param(PAGECACHE *pagecache, uint division_limit,
+ uint age_threshold);
+
+extern uchar *pagecache_read(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint level,
+ uchar *buff,
+ enum pagecache_page_type type,
+ enum pagecache_page_lock lock,
+ PAGECACHE_BLOCK_LINK **link);
+
+#define pagecache_write(P,F,N,L,B,T,O,I,M,K,R) \
+ pagecache_write_part(P,F,N,L,B,T,O,I,M,K,R,0,(P)->block_size)
+
+#define pagecache_inject(P,F,N,L,B,T,O,I,K,R) \
+ pagecache_write_part(P,F,N,L,B,T,O,I,PAGECACHE_WRITE_DONE, \
+ K,R,0,(P)->block_size)
+
+extern my_bool pagecache_write_part(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint level,
+ uchar *buff,
+ enum pagecache_page_type type,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ enum pagecache_write_mode write_mode,
+ PAGECACHE_BLOCK_LINK **link,
+ LSN first_REDO_LSN_for_page,
+ uint offset,
+ uint size);
+extern void pagecache_unlock(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ LSN first_REDO_LSN_for_page,
+ LSN lsn, my_bool was_changed);
+extern void pagecache_unlock_by_link(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *block,
+ enum pagecache_page_lock lock,
+ enum pagecache_page_pin pin,
+ LSN first_REDO_LSN_for_page,
+ LSN lsn, my_bool was_changed,
+ my_bool any);
+extern void pagecache_unpin(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ LSN lsn);
+extern void pagecache_unpin_by_link(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *link,
+ LSN lsn);
+
+
+/* Results of flush operation (bit field in fact) */
+
+/* The flush is done. */
+#define PCFLUSH_OK 0
+/* There was errors during the flush process. */
+#define PCFLUSH_ERROR 1
+/* Pinned blocks was met and skipped. */
+#define PCFLUSH_PINNED 2
+/* PCFLUSH_ERROR and PCFLUSH_PINNED. */
+#define PCFLUSH_PINNED_AND_ERROR (PCFLUSH_ERROR|PCFLUSH_PINNED)
+
+#define pagecache_file_init(F,RC,WC,WF,GLC,D) \
+ do{ \
+ (F).read_callback= (RC); (F).write_callback= (WC); \
+ (F).write_fail= (WF); \
+ (F).flush_log_callback= (GLC); (F).callback_data= (uchar*)(D); \
+ } while(0)
+
+#define flush_pagecache_blocks(A,B,C) \
+ flush_pagecache_blocks_with_filter(A,B,C,NULL,NULL)
+extern int flush_pagecache_blocks_with_filter(PAGECACHE *keycache,
+ PAGECACHE_FILE *file,
+ enum flush_type type,
+ PAGECACHE_FLUSH_FILTER filter,
+ void *filter_arg);
+extern my_bool pagecache_delete(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ enum pagecache_page_lock lock,
+ my_bool flush);
+extern my_bool pagecache_delete_by_link(PAGECACHE *pagecache,
+ PAGECACHE_BLOCK_LINK *link,
+ enum pagecache_page_lock lock,
+ my_bool flush);
+extern my_bool pagecache_delete_pages(PAGECACHE *pagecache,
+ PAGECACHE_FILE *file,
+ pgcache_page_no_t pageno,
+ uint page_count,
+ enum pagecache_page_lock lock,
+ my_bool flush);
+extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup);
+extern my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache,
+ LEX_STRING *str,
+ LSN *min_lsn);
+extern int reset_pagecache_counters(const char *name, PAGECACHE *pagecache);
+extern uchar *pagecache_block_link_to_buffer(PAGECACHE_BLOCK_LINK *block);
+
+extern uint pagecache_pagelevel(PAGECACHE_BLOCK_LINK *block);
+extern void pagecache_add_level_by_link(PAGECACHE_BLOCK_LINK *block,
+ uint level);
+
+/* Functions to handle multiple key caches */
+extern my_bool multi_pagecache_init(void);
+extern void multi_pagecache_free(void);
+extern PAGECACHE *multi_pagecache_search(uchar *key, uint length,
+ PAGECACHE *def);
+extern my_bool multi_pagecache_set(const uchar *key, uint length,
+ PAGECACHE *pagecache);
+extern void multi_pagecache_change(PAGECACHE *old_data,
+ PAGECACHE *new_data);
+extern int reset_pagecache_counters(const char *name,
+ PAGECACHE *pagecache);
+#ifndef DBUG_OFF
+void pagecache_file_no_dirty_page(PAGECACHE *pagecache, PAGECACHE_FILE *file);
+#else
+#define pagecache_file_no_dirty_page(A,B) {}
+#endif
+
+C_MODE_END
+#endif /* _ma_pagecache_h */
diff --git a/storage/maria/ma_pagecaches.c b/storage/maria/ma_pagecaches.c
new file mode 100644
index 00000000000..8a1423ee0d7
--- /dev/null
+++ b/storage/maria/ma_pagecaches.c
@@ -0,0 +1,104 @@
+/* Copyright (C) 2003-2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Handling of multiple key caches
+
+ The idea is to have a thread safe hash on the table name,
+ with a default key cache value that is returned if the table name is not in
+ the cache.
+*/
+
+#include "maria_def.h"
+#include "ma_pagecache.h"
+#include <hash.h>
+#include <m_string.h>
+#include "../../mysys/my_safehash.h"
+
+/*****************************************************************************
+ Functions to handle the pagecache objects
+*****************************************************************************/
+
+/* Variable to store all key cache objects */
+static SAFE_HASH pagecache_hash;
+
+
+my_bool multi_pagecache_init(void)
+{
+  /* maria_pagecache is the default returned for keys with no mapping. */
+  return safe_hash_init(&pagecache_hash, 16, (uchar*) maria_pagecache);
+}
+
+
+void multi_pagecache_free(void)
+{
+  /* Release the key->pagecache hash created by multi_pagecache_init(). */
+  safe_hash_free(&pagecache_hash);
+}
+
+/*
+ Get a key cache to be used for a specific table.
+
+ SYNOPSIS
+ multi_pagecache_search()
+ key key to find (usually table path)
+ uint length Length of key.
+ def Default value if no key cache
+
+ NOTES
+ This function is coded in such a way that we will return the
+ default key cache even if one never called multi_pagecache_init.
+ This will ensure that it works with old MyISAM clients.
+
+ RETURN
+ key cache to use
+*/
+
+PAGECACHE *multi_pagecache_search(uchar *key, uint length,
+                                  PAGECACHE *def)
+{
+  /*
+    When no table-specific cache was ever registered the hash is empty
+    and the default cache is returned directly; otherwise look the key
+    up, still falling back to 'def' when the key is absent.
+  */
+  return pagecache_hash.hash.records ?
+         (PAGECACHE*) safe_hash_search(&pagecache_hash, key, length,
+                                       (void*) def) :
+         def;
+}
+
+
+/*
+  Associate a key cache with a key
+
+
+  SYNOPSIS
+    multi_pagecache_set()
+    key         key (path to table etc..)
+    length      Length of key
+    pagecache   cache to associate with the table
+
+  NOTES
+    This can be used both to insert a new entry and change an existing
+    entry
+*/
+
+
+my_bool multi_pagecache_set(const uchar *key, uint length,
+                            PAGECACHE *pagecache)
+{
+  /* Inserts a new mapping or overwrites an existing one for 'key'. */
+  return safe_hash_set(&pagecache_hash, key, length, (uchar*) pagecache);
+}
+
+
+void multi_pagecache_change(PAGECACHE *old_data,
+                            PAGECACHE *new_data)
+{
+  /* Redirect hash entries pointing at old_data to new_data. */
+  safe_hash_change(&pagecache_hash, (uchar*) old_data, (uchar*) new_data);
+}
diff --git a/storage/maria/ma_pagecrc.c b/storage/maria/ma_pagecrc.c
new file mode 100644
index 00000000000..640bb8880f4
--- /dev/null
+++ b/storage/maria/ma_pagecrc.c
@@ -0,0 +1,378 @@
+/* Copyright (C) 2007-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+
+/**
+ @brief calculate crc of the page avoiding special values
+
+ @param start The value to start CRC (we use page number here)
+ @param data data pointer
+ @param length length of the data
+
+ @return crc of the page without special values
+*/
+
+static uint32 maria_page_crc(uint32 start, uchar *data, uint length)
+{
+  uint32 crc= crc32(start, data, length);
+
+  /* we need this assert to get following comparison working */
+  compile_time_assert(MARIA_NO_CRC_BITMAP_PAGE ==
+                      MARIA_NO_CRC_NORMAL_PAGE - 1 &&
+                      MARIA_NO_CRC_NORMAL_PAGE == 0xffffffff);
+  /*
+    The two largest 32-bit values are reserved as "no CRC" markers; any
+    computed CRC that would collide with them is clamped just below.
+  */
+  return (crc >= MARIA_NO_CRC_BITMAP_PAGE) ?
+         MARIA_NO_CRC_BITMAP_PAGE - 1 : crc;
+}
+
+/**
+ @brief Maria pages read callback (checks the page CRC)
+
+ @param page The page data to check
+ @param page_no The page number (<offset>/<page length>)
+  @param share       pointer to MARIA_SHARE
+ @param no_crc_val Value which means CRC absence
+ (MARIA_NO_CRC_NORMAL_PAGE or MARIA_NO_CRC_BITMAP_PAGE)
+ @param data_length length of data to calculate CRC
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+static my_bool maria_page_crc_check(uchar *page,
+                                    pgcache_page_no_t page_no,
+                                    MARIA_SHARE *share,
+                                    uint32 no_crc_val,
+                                    int data_length)
+{
+  /* The stored CRC lives in the last CRC_SIZE bytes of the page. */
+  uint32 crc= uint4korr(page + share->block_size - CRC_SIZE), new_crc;
+  my_bool res;
+  DBUG_ENTER("maria_page_crc_check");
+
+  DBUG_ASSERT((uint)data_length <= share->block_size - CRC_SIZE);
+
+  /* we need this assert to get following comparison working */
+  compile_time_assert(MARIA_NO_CRC_BITMAP_PAGE ==
+                      MARIA_NO_CRC_NORMAL_PAGE - 1 &&
+                      MARIA_NO_CRC_NORMAL_PAGE == 0xffffffff);
+  /*
+    If crc is no_crc_val then
+    the page has no crc, so there is nothing to check.
+  */
+  if (crc >= MARIA_NO_CRC_BITMAP_PAGE)
+  {
+    DBUG_PRINT("info", ("No crc: %lu crc: %lu page: %lu ",
+                        (ulong) no_crc_val, (ulong) crc, (ulong) page_no));
+    if (crc != no_crc_val)
+    {
+      /* A "no CRC" marker of the wrong kind: treat as corruption. */
+      my_errno= HA_ERR_WRONG_CRC;
+      DBUG_PRINT("error", ("Wrong no CRC value"));
+      DBUG_RETURN(1);
+    }
+    DBUG_RETURN(0);
+  }
+  /* Recompute over the checked area, seeding the CRC with the page no. */
+  new_crc= maria_page_crc((uint32) page_no, page, data_length);
+  DBUG_ASSERT(new_crc != no_crc_val);
+  res= test(new_crc != crc);
+  if (res)
+  {
+    /*
+      Bitmap pages may be totally zero filled in some cases.
+      This happens when we get a crash after the pagecache has written
+      out a page that is on a newly created bitmap page and we get
+      a crash before the bitmap page is written out.
+
+      We handle this case with the following logic:
+      When reading, approve of bitmap pages where all bytes are zero
+      (This is after all a bitmap pages where no data is reserved and
+      the CRC will be corrected at next write)
+    */
+    if (no_crc_val == MARIA_NO_CRC_BITMAP_PAGE &&
+        crc == 0 && _ma_check_if_zero(page, data_length))
+    {
+      DBUG_PRINT("warning", ("Found bitmap page that was not initialized"));
+      DBUG_RETURN(0);
+    }
+
+    DBUG_PRINT("error", ("Page: %lu crc: %lu calculated crc: %lu",
+                         (ulong) page_no, (ulong) crc, (ulong) new_crc));
+    my_errno= HA_ERR_WRONG_CRC;
+  }
+  DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Maria pages write callback (sets the page CRC for data and index
+ files)
+
+ @param page The page data to set
+ @param page_no The page number (<offset>/<page length>)
+ @param data_ptr Write callback data pointer (pointer to MARIA_SHARE)
+
+ @retval 0 OK
+*/
+
+my_bool maria_page_crc_set_normal(uchar *page,
+                                  pgcache_page_no_t page_no,
+                                  uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  /* Everything up to the CRC slot is covered by the checksum. */
+  int crc_offset= share->block_size - CRC_SIZE;
+  uint32 checksum= maria_page_crc((uint32) page_no, page, crc_offset);
+  DBUG_ENTER("maria_page_crc_set_normal");
+  DBUG_PRINT("info", ("Page %lu crc: %lu", (ulong) page_no,
+                      (ulong) checksum));
+
+  /* crc is on the stack so it is aligned, pagecache buffer is aligned, too */
+  int4store_aligned(page + crc_offset, checksum);
+  DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Maria pages write callback (sets the page CRC for keys)
+
+ @param page The page data to set
+ @param page_no The page number (<offset>/<page length>)
+ @param data_ptr Write callback data pointer (pointer to MARIA_SHARE)
+
+ @retval 0 OK
+*/
+
+my_bool maria_page_crc_set_index(uchar *page,
+                                 pgcache_page_no_t page_no,
+                                 uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  /* Index pages checksum only the used length stored in the page header. */
+  int data_length= _ma_get_page_used(share, page);
+  uint32 crc= maria_page_crc((uint32) page_no, page, data_length);
+  DBUG_ENTER("maria_page_crc_set_index");
+  DBUG_PRINT("info", ("Page %lu crc: %lu",
+                      (ulong) page_no, (ulong) crc));
+  DBUG_ASSERT((uint)data_length <= share->block_size - CRC_SIZE);
+  /* crc is on the stack so it is aligned, pagecache buffer is aligned, too */
+  int4store_aligned(page + share->block_size - CRC_SIZE, crc);
+  DBUG_RETURN(0);
+}
+
+
+/* interface functions */
+
+
+/**
+ @brief Maria pages read callback (checks the page CRC) for index/data pages
+
+ @param page The page data to check
+ @param page_no The page number (<offset>/<page length>)
+ @param data_ptr Read callback data pointer (pointer to MARIA_SHARE)
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool maria_page_crc_check_data(uchar *page,
+                                  pgcache_page_no_t page_no,
+                                  uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  /* Data pages are checked over the full page minus the CRC tail. */
+  return (maria_page_crc_check(page, (uint32) page_no, share,
+                               MARIA_NO_CRC_NORMAL_PAGE,
+                               share->block_size - CRC_SIZE));
+}
+
+
+/**
+ @brief Maria pages read callback (checks the page CRC) for bitmap pages
+
+ @param page The page data to check
+ @param page_no The page number (<offset>/<page length>)
+ @param data_ptr Read callback data pointer (pointer to MARIA_SHARE)
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool maria_page_crc_check_bitmap(uchar *page,
+                                    pgcache_page_no_t page_no,
+                                    uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  /* Bitmap pages use their own "no CRC" marker value. */
+  return (maria_page_crc_check(page, (uint32) page_no, share,
+                               MARIA_NO_CRC_BITMAP_PAGE,
+                               share->block_size - CRC_SIZE));
+}
+
+
+/**
+ @brief Maria pages read callback (checks the page CRC) for index pages
+
+ @param page The page data to check
+ @param page_no The page number (<offset>/<page length>)
+ @param data_ptr Read callback data pointer (pointer to MARIA_SHARE)
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool maria_page_crc_check_index(uchar *page,
+                                   pgcache_page_no_t page_no,
+                                   uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  uint length= _ma_get_page_used(share, page);
+  if (length > share->block_size - CRC_SIZE)
+  {
+    DBUG_PRINT("error", ("Wrong page length: %u", length));
+    /*
+      BUG FIX: the error code was previously returned directly through a
+      my_bool; if the code value ever truncated to 0 the corruption would
+      be reported as success.  Set my_errno and return an explicit 1, in
+      line with the other check callbacks.
+    */
+    my_errno= HA_ERR_WRONG_CRC;
+    return 1;
+  }
+  /* Only the used part of the index page is covered by the stored CRC. */
+  return maria_page_crc_check(page, (uint32) page_no, share,
+                              MARIA_NO_CRC_NORMAL_PAGE,
+                              length);
+}
+
+
+/**
+  @brief Maria pages dummy read callback for temporary tables
+
+ @retval 0 OK
+ @retval 1 Error
+*/
+
+my_bool maria_page_crc_check_none(uchar *page __attribute__((unused)),
+                                  pgcache_page_no_t page_no
+                                  __attribute__((unused)),
+                                  uchar *data_ptr __attribute__((unused)))
+{
+  /* Temporary tables carry no CRC, so there is nothing to verify. */
+  return 0;
+}
+
+
+/**
+ @brief Maria pages write callback (sets the page filler for index/data)
+
+ @param page The page data to set
+ @param page_no The page number (<offset>/<page length>)
+ @param data_ptr Write callback data pointer (pointer to MARIA_SHARE)
+
+ @retval 0 OK
+*/
+
+my_bool maria_page_filler_set_normal(uchar *page,
+                                     pgcache_page_no_t page_no
+                                     __attribute__((unused)),
+                                     uchar *data_ptr)
+{
+  DBUG_ENTER("maria_page_filler_set_normal");
+  DBUG_ASSERT(page_no != 0); /* Catches some simple bugs */
+  /* Stamp the reserved "no CRC" marker into the page's CRC slot. */
+  int4store_aligned(page + ((MARIA_SHARE *)data_ptr)->block_size - CRC_SIZE,
+                    MARIA_NO_CRC_NORMAL_PAGE);
+  DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Maria pages write callback (sets the page filler for bitmap)
+
+ @param page The page data to set
+ @param page_no The page number (<offset>/<page length>)
+ @param data_ptr Write callback data pointer (pointer to MARIA_SHARE)
+
+ @retval 0 OK
+*/
+
+my_bool maria_page_filler_set_bitmap(uchar *page,
+                                     pgcache_page_no_t page_no
+                                     __attribute__((unused)),
+                                     uchar *data_ptr)
+{
+  DBUG_ENTER("maria_page_filler_set_bitmap");
+  /* Stamp the bitmap-specific "no CRC" marker into the CRC slot. */
+  int4store_aligned(page + ((MARIA_SHARE *)data_ptr)->block_size - CRC_SIZE,
+                    MARIA_NO_CRC_BITMAP_PAGE);
+  DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Maria pages dummy write callback for temporary tables
+
+ @retval 0 OK
+*/
+
+my_bool maria_page_filler_set_none(uchar *page __attribute__((unused)),
+                                   pgcache_page_no_t page_no
+                                   __attribute__((unused)),
+                                   uchar *data_ptr __attribute__((unused)))
+{
+#ifdef HAVE_valgrind
+  /* Zero the CRC slot so valgrind does not see uninitialized bytes. */
+  int4store_aligned(page + ((MARIA_SHARE *)data_ptr)->block_size - CRC_SIZE,
+                    0);
+#endif
+  return 0;
+}
+
+
+/**
+ @brief Write failure callback (mark table as corrupted)
+
+ @param data_ptr Write callback data pointer (pointer to MARIA_SHARE)
+*/
+
+void maria_page_write_failure(uchar* data_ptr)
+{
+  /* A page could not be written: mark the whole table as crashed. */
+  maria_mark_crashed_share((MARIA_SHARE *)data_ptr);
+}
+
+
+/**
+ @brief Maria flush log log if needed
+
+ @param page The page data to set
+ @param page_no The page number (<offset>/<page length>)
+ @param data_ptr Write callback data pointer (pointer to MARIA_SHARE)
+
+ @retval 0 OK
+ @retval 1 error
+*/
+
+my_bool maria_flush_log_for_page(uchar *page,
+                                 pgcache_page_no_t page_no
+                                 __attribute__((unused)),
+                                 uchar *data_ptr __attribute__((unused)))
+{
+  LSN lsn;
+#ifndef DBUG_OFF
+  /* data_ptr is marked unused but is read here in debug builds. */
+  const MARIA_SHARE *share= (MARIA_SHARE*) data_ptr;
+#endif
+  DBUG_ENTER("maria_flush_log_for_page");
+  /* share is 0 here only in unittest */
+  DBUG_ASSERT(!share || (share->page_type == PAGECACHE_LSN_PAGE &&
+                         share->now_transactional));
+  /* The page starts with the LSN of its last modification. */
+  lsn= lsn_korr(page);
+  /* WAL rule: the log must reach disk up to 'lsn' before the page may. */
+  if (translog_flush(lsn))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+my_bool maria_flush_log_for_page_none(uchar *page __attribute__((unused)),
+                                      pgcache_page_no_t page_no
+                                      __attribute__((unused)),
+                                      uchar *data_ptr __attribute__((unused)))
+{
+  /* Non-transactional tables need no log flush before a page write. */
+  return 0;
+}
diff --git a/storage/maria/ma_panic.c b/storage/maria/ma_panic.c
new file mode 100644
index 00000000000..a86563f31fb
--- /dev/null
+++ b/storage/maria/ma_panic.c
@@ -0,0 +1,140 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "ma_fulltext.h"
+
+/*
+ Stop usage of Maria
+
+ SYNOPSIS
+ maria_panic()
+ flag HA_PANIC_CLOSE: All maria files (tables and log) are closed.
+ maria_end() is called.
+ HA_PANIC_WRITE: All maria files are unlocked and
+ all changed data in single user maria is
+ written to file
+ HA_PANIC_READ All maria files that were locked when
+ maria_panic(HA_PANIC_WRITE) was done are
+ locked again. A maria_readinfo() is done for
+ all single user files to get changes
+ in database
+
+ RETURN
+ 0 ok
+ # error number in case of error
+*/
+
+int maria_panic(enum ha_panic_function flag)
+{
+ int error=0;
+ LIST *list_element,*next_open;
+ MARIA_HA *info;
+ DBUG_ENTER("maria_panic");
+
+ if (!maria_inited)
+ DBUG_RETURN(0);
+ pthread_mutex_lock(&THR_LOCK_maria);
+ /* Walk every open Maria handler and apply the requested panic action */
+ for (list_element=maria_open_list ; list_element ; list_element=next_open)
+ {
+ next_open=list_element->next; /* Save if close */
+ info=(MARIA_HA*) list_element->data;
+ switch (flag) {
+ case HA_PANIC_CLOSE:
+ /*
+ If bad luck (if some tables would be used now, which normally does not
+ happen in MySQL), as we release the mutex, the list may change and so
+ we may crash.
+ */
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ if (maria_close(info))
+ error=my_errno;
+ pthread_mutex_lock(&THR_LOCK_maria);
+ break;
+ case HA_PANIC_WRITE: /* Do this to free databases */
+#ifdef CANT_OPEN_FILES_TWICE
+ if (info->s->options & HA_OPTION_READ_ONLY_DATA)
+ break;
+#endif
+ /* Push all dirty index pages for this table out of the page cache */
+ if (flush_pagecache_blocks(info->s->pagecache, &info->s->kfile,
+ FLUSH_RELEASE))
+ error=my_errno;
+ if (info->opt_flag & WRITE_CACHE_USED)
+ if (flush_io_cache(&info->rec_cache))
+ error=my_errno;
+ if (info->opt_flag & READ_CACHE_USED)
+ {
+ if (flush_io_cache(&info->rec_cache))
+ error=my_errno;
+ reinit_io_cache(&info->rec_cache,READ_CACHE,0,
+ (pbool) (info->lock_type != F_UNLCK),1);
+ }
+ /* Remember the held lock in was_locked so HA_PANIC_READ can restore it */
+ if (info->lock_type != F_UNLCK && ! info->was_locked)
+ {
+ info->was_locked=info->lock_type;
+ if (maria_lock_database(info,F_UNLCK))
+ error=my_errno;
+ }
+#ifdef CANT_OPEN_FILES_TWICE
+ if (info->s->kfile.file >= 0 && my_close(info->s->kfile.file, MYF(0)))
+ error = my_errno;
+ if (info->dfile.file >= 0 && my_close(info->dfile.file, MYF(0)))
+ error = my_errno;
+ info->s->kfile.file= info->dfile.file= -1;/* Files aren't open anymore */
+ break;
+#endif
+ /*
+ NOTE(review): when CANT_OPEN_FILES_TWICE is not defined there is no
+ 'break' above, so HA_PANIC_WRITE falls through into HA_PANIC_READ,
+ which immediately re-takes the lock released just above. This matches
+ the historical MyISAM mi_panic() code; confirm it is intentional.
+ */
+ case HA_PANIC_READ: /* Restore to before WRITE */
+#ifdef CANT_OPEN_FILES_TWICE
+ { /* Open closed files */
+ char name_buff[FN_REFLEN];
+ MARIA_SHARE *share= info->s;
+ if (share->kfile.file < 0)
+ {
+
+ if ((share->kfile.file= my_open(fn_format(name_buff,
+ info->filename, "",
+ N_NAME_IEXT,4),
+ info->mode,
+ MYF(MY_WME))) < 0)
+ error = my_errno;
+ }
+ if (info->dfile.file < 0)
+ {
+ if ((info->dfile.file= my_open(fn_format(name_buff, info->filename,
+ "", N_NAME_DEXT, 4),
+ info->mode,
+ MYF(MY_WME))) < 0)
+ error = my_errno;
+ info->rec_cache.file= info->dfile.file;
+ }
+ if (share->bitmap.file.file < 0)
+ share->bitmap.file.file= info->dfile.file;
+ }
+#endif
+ /* Re-acquire the lock that HA_PANIC_WRITE released, if any */
+ if (info->was_locked)
+ {
+ if (maria_lock_database(info, info->was_locked))
+ error=my_errno;
+ info->was_locked=0;
+ }
+ break;
+ }
+ }
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ if (flag == HA_PANIC_CLOSE)
+ maria_end();
+ if (!error)
+ DBUG_RETURN(0);
+ DBUG_RETURN(my_errno=error);
+} /* maria_panic */
diff --git a/storage/maria/ma_preload.c b/storage/maria/ma_preload.c
new file mode 100644
index 00000000000..6dfb4e437b6
--- /dev/null
+++ b/storage/maria/ma_preload.c
@@ -0,0 +1,116 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Preload indexes into key cache
+*/
+
+#include "maria_def.h"
+
+
+/*
+ Preload pages of the index file for a table into the key cache
+
+ SYNOPSIS
+ maria_preload()
+ info open table
+ map map of indexes to preload into key cache
+ ignore_leaves only non-leaves pages are to be preloaded
+
+ RETURN VALUE
+ 0 if a success. error code - otherwise.
+
+ NOTES.
+ At present pages for all indexes are preloaded.
+ In future only pages for indexes specified in the key_map parameter
+ of the table will be preloaded.
+ We don't yet use preload_buff_size (we read page after page).
+*/
+
+int maria_preload(MARIA_HA *info, ulonglong key_map, my_bool ignore_leaves)
+{
+ ulong block_length= 0;
+ uchar *buff;
+ MARIA_SHARE* share= info->s;
+ uint keynr;
+ my_off_t key_file_length= share->state.state.key_file_length;
+ pgcache_page_no_t page_no, page_no_max;
+ PAGECACHE_BLOCK_LINK *page_link;
+ DBUG_ENTER("maria_preload");
+
+ /* Nothing to do if there are no keys or the index file holds no pages */
+ if (!share->state.header.keys || !maria_is_any_key_active(key_map) ||
+ (key_file_length == share->base.keystart))
+ DBUG_RETURN(0);
+
+ block_length= share->pagecache->block_size;
+
+ if (!(buff= (uchar *) my_malloc(block_length, MYF(MY_WME))))
+ DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM);
+
+ if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE))
+ goto err;
+
+ /*
+ Currently when we come here all other open instances of the table have
+ been closed, and we flushed all pages of our own instance, so there
+ cannot be any page of this table in the cache. Thus my_pread() would be
+ safe. But in the future, we will allow more concurrency during
+ preloading, so we use pagecache_read() instead of my_pread() because we
+ observed that on some Linux, concurrent pread() and pwrite() (which
+ could be from a page eviction by another thread) to the same page can
+ make pread() see an half-written page.
+ In this future, we should find a way to read state.key_file_length
+ reliably, handle concurrent shrinks (delete_all_rows()) etc.
+ */
+ for ((page_no= share->base.keystart / block_length),
+ (page_no_max= key_file_length / block_length);
+ page_no < page_no_max; page_no++)
+ {
+ /**
+ @todo instead of reading pages one by one we could have a call
+ pagecache_read_several_pages() which does a single my_pread() for many
+ consecutive pages (like the my_pread() in mi_preload()).
+ */
+ if (pagecache_read(share->pagecache, &share->kfile, page_no,
+ DFLT_INIT_HITS, buff, share->page_type,
+ PAGECACHE_LOCK_WRITE, &page_link) == NULL)
+ goto err;
+ /* Which index does this page belong to? */
+ keynr= _ma_get_keynr(share, buff);
+ if (((ignore_leaves && !_ma_test_if_nod(share, buff)) ||
+ keynr == MARIA_DELETE_KEY_NR ||
+ !(key_map & ((ulonglong) 1 << keynr))) &&
+ (pagecache_pagelevel(page_link) == DFLT_INIT_HITS))
+ {
+ /*
+ This page is not interesting, and (last condition above) we are the
+ ones who put it in the cache, so nobody else is interested in it.
+ */
+ if (pagecache_delete_by_link(share->pagecache, page_link,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, FALSE))
+ goto err;
+ }
+ else /* otherwise it stays in cache: */
+ pagecache_unlock_by_link(share->pagecache, page_link,
+ PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN,
+ LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, FALSE, FALSE);
+ }
+
+ my_free(buff, MYF(0));
+ DBUG_RETURN(0);
+
+err:
+ my_free(buff, MYF(MY_ALLOW_ZERO_PTR));
+ /*
+ NOTE(review): this returns 'errno', not 'my_errno'; if the failing call
+ set only my_errno, errno may be 0 or stale here. Verify intent.
+ */
+ DBUG_RETURN(my_errno= errno);
+}
diff --git a/storage/maria/ma_range.c b/storage/maria/ma_range.c
new file mode 100644
index 00000000000..5dc4e3a9959
--- /dev/null
+++ b/storage/maria/ma_range.c
@@ -0,0 +1,312 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Gives an approximate count of how many records there are between two keys.
+ Used when optimizing queries.
+ */
+
+#include "maria_def.h"
+#include "ma_rt_index.h"
+
+static ha_rows _ma_record_pos(MARIA_HA *,const uchar *, key_part_map,
+ enum ha_rkey_function);
+static double _ma_search_pos(MARIA_HA *, MARIA_KEY *, uint32, my_off_t);
+static uint _ma_keynr(MARIA_PAGE *page, uchar *keypos, uint *ret_max_key);
+
+
+/**
+ @brief Estimate how many records there is in a given range
+
+ @param info MARIA handler
+ @param inx Index to use
+ @param min_key Min key. Is = 0 if no min range
+ @param max_key Max key. Is = 0 if no max range
+
+ @note
+ We should ONLY return 0 if there is no rows in range
+
+ @return Estimated number of rows or error
+ @retval HA_POS_ERROR error (or we can't estimate number of rows)
+ @retval number Estimated number of rows
+*/
+
+ha_rows maria_records_in_range(MARIA_HA *info, int inx, key_range *min_key,
+ key_range *max_key)
+{
+ ha_rows start_pos,end_pos,res;
+ MARIA_SHARE *share= info->s;
+ MARIA_KEY key;
+ MARIA_KEYDEF *keyinfo;
+ DBUG_ENTER("maria_records_in_range");
+
+ if ((inx = _ma_check_index(info,inx)) < 0)
+ DBUG_RETURN(HA_POS_ERROR);
+
+ if (fast_ma_readinfo(info))
+ DBUG_RETURN(HA_POS_ERROR);
+ info->update&= (HA_STATE_CHANGED+HA_STATE_ROW_CHANGED);
+ keyinfo= share->keyinfo + inx;
+ /* Take a read lock on the key tree while we walk it */
+ if (share->lock_key_trees)
+ rw_rdlock(&keyinfo->root_lock);
+
+ /* Dispatch on index algorithm: R-tree estimation vs B-tree positions */
+ switch (keyinfo->key_alg) {
+#ifdef HAVE_RTREE_KEYS
+ case HA_KEY_ALG_RTREE:
+ {
+ uchar *key_buff;
+
+ /*
+ The problem is that the optimizer doesn't support
+ RTree keys properly at the moment.
+ Hope this will be fixed some day.
+ But now NULL in the min_key means that we
+ didn't make the task for the RTree key
+ and expect BTree functionality from it.
+ As it's not able to handle such request
+ we return the error.
+ */
+ if (!min_key)
+ {
+ res= HA_POS_ERROR;
+ break;
+ }
+ key_buff= info->last_key.data + share->base.max_key_length;
+ _ma_pack_key(info, &key, inx, key_buff,
+ min_key->key, min_key->keypart_map,
+ (HA_KEYSEG**) 0);
+ res= maria_rtree_estimate(info, &key, maria_read_vec[min_key->flag]);
+ res= res ? res : 1; /* Don't return 0 */
+ break;
+ }
+#endif
+ case HA_KEY_ALG_BTREE:
+ default:
+ /*
+ Estimate = (relative position of max_key) - (relative position of
+ min_key); missing bounds default to start/end of the index.
+ */
+ start_pos= (min_key ?
+ _ma_record_pos(info, min_key->key, min_key->keypart_map,
+ min_key->flag) :
+ (ha_rows) 0);
+ end_pos= (max_key ?
+ _ma_record_pos(info, max_key->key, max_key->keypart_map,
+ max_key->flag) :
+ info->state->records + (ha_rows) 1);
+ res= (end_pos < start_pos ? (ha_rows) 0 :
+ (end_pos == start_pos ? (ha_rows) 1 : end_pos-start_pos));
+ if (start_pos == HA_POS_ERROR || end_pos == HA_POS_ERROR)
+ res=HA_POS_ERROR;
+ }
+
+ if (share->lock_key_trees)
+ rw_unlock(&keyinfo->root_lock);
+ fast_ma_writeinfo(info);
+
+ /**
+ @todo LOCK
+ If res==0 (no rows), if we need to guarantee repeatability of the search,
+ we will need to set a next-key lock in this statement.
+ Also SELECT COUNT(*)...
+ */
+
+ DBUG_PRINT("info",("records: %ld",(ulong) (res)));
+ DBUG_RETURN(res);
+}
+
+
+ /* Find relative position (in records) for key in index-tree */
+
+static ha_rows _ma_record_pos(MARIA_HA *info, const uchar *key_data,
+ key_part_map keypart_map,
+ enum ha_rkey_function search_flag)
+{
+ uint inx= (uint) info->lastinx;
+ uint32 nextflag;
+ uchar *key_buff;
+ double pos;
+ MARIA_KEY key;
+ DBUG_ENTER("_ma_record_pos");
+ DBUG_PRINT("enter",("search_flag: %d",search_flag));
+ DBUG_ASSERT(keypart_map);
+
+ key_buff= info->lastkey_buff+info->s->base.max_key_length;
+ _ma_pack_key(info, &key, inx, key_buff, key_data, keypart_map,
+ (HA_KEYSEG**) 0);
+ DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, &key););
+ nextflag=maria_read_vec[search_flag];
+
+ /*
+ my_handler.c:ha_compare_text() has a flag 'skip_end_space'.
+ This is set in my_handler.c:ha_key_cmp() in dependence on the
+ compare flags 'nextflag' and the column type.
+
+ TEXT columns are of type HA_KEYTYPE_VARTEXT. In this case the
+ condition is skip_end_space= ((nextflag & (SEARCH_FIND |
+ SEARCH_UPDATE)) == SEARCH_FIND).
+
+ SEARCH_FIND is used for an exact key search. The combination
+ SEARCH_FIND | SEARCH_UPDATE is used in write/update/delete
+ operations with a comment like "Not real duplicates", whatever this
+ means. From the condition above we can see that 'skip_end_space' is
+ always false for these operations. The result is that trailing space
+ counts in key comparison and hence, empty strings ('', string length
+ zero, but not NULL) compare less than strings starting with control
+ characters and these in turn compare less than strings starting with
+ blanks.
+
+ When estimating the number of records in a key range, we request an
+ exact search for the minimum key. This translates into a plain
+ SEARCH_FIND flag. Using this alone would lead to a 'skip_end_space'
+ compare. Empty strings would be expected above control characters.
+ Their keys would not be found because they are located below control
+ characters.
+
+ This is the reason that we add the SEARCH_UPDATE flag here. It makes
+ the key estimation compare in the same way like key write operations
+ do. Only so we will find the keys where they have been inserted.
+
+ Adding the flag unconditionally does not hurt as it is used in the
+ above mentioned condition only. So it can safely be used together
+ with other flags.
+ */
+ pos= _ma_search_pos(info, &key,
+ nextflag | SEARCH_SAVE_BUFF | SEARCH_UPDATE,
+ info->s->state.key_root[inx]);
+ if (pos >= 0.0)
+ {
+ DBUG_PRINT("exit",("pos: %ld",(ulong) (pos*info->state->records)))+ ;
+ /* Scale the relative position [0,1] to a record count, rounding */
+ DBUG_RETURN((ulong) (pos*info->state->records+0.5));
+ }
+ DBUG_RETURN(HA_POS_ERROR);
+}
+
+
+/**
+ Find offset for key on index page
+
+ @notes
+ Modified version of _ma_search()
+
+ @return
+ @retval 0.0 <= x <= 1.0
+*/
+
+static double _ma_search_pos(MARIA_HA *info, MARIA_KEY *key,
+ uint32 nextflag, my_off_t pos)
+{
+ int flag;
+ uint keynr, max_keynr;
+ my_bool after_key;
+ uchar *keypos;
+ double offset;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ MARIA_PAGE page;
+ DBUG_ENTER("_ma_search_pos");
+ LINT_INIT(max_keynr);
+
+ /* Empty (sub)tree: report the middle as our best guess */
+ if (pos == HA_OFFSET_ERROR)
+ DBUG_RETURN(0.5);
+
+ if (_ma_fetch_keypage(&page, info, keyinfo, pos,
+ PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS,
+ info->buff, 1))
+ goto err;
+ flag= (*keyinfo->bin_search)(key, &page, nextflag, &keypos,
+ info->lastkey_buff, &after_key);
+ keynr= _ma_keynr(&page, keypos, &max_keynr);
+
+ if (flag)
+ {
+ if (flag == MARIA_FOUND_WRONG_KEY)
+ DBUG_RETURN(-1); /* error */
+ /*
+ Didn't find a match. keypos points at next (bigger) key
+ Try to find a smaller, better matching key.
+ Matches keynr + [0-1]
+ */
+ if (flag > 0 && ! page.node)
+ offset= 1.0;
+ else if ((offset= _ma_search_pos(info, key, nextflag,
+ _ma_kpos(page.node,keypos))) < 0)
+ DBUG_RETURN(offset);
+ }
+ else
+ {
+ /*
+ Found match. Keypos points at the start of the found key
+ Matches keynr+1
+ */
+ offset=1.0; /* Matches keynr+1 */
+ if ((nextflag & SEARCH_FIND) && page.node &&
+ ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME ||
+ (nextflag & (SEARCH_PREFIX | SEARCH_NO_FIND | SEARCH_LAST |
+ SEARCH_PART_KEY))))
+ {
+ /*
+ There may be identical keys in the tree. Try to match one of those.
+ Matches keynr + [0-1]
+ */
+ if ((offset= _ma_search_pos(info, key, SEARCH_FIND,
+ _ma_kpos(page.node,keypos))) < 0)
+ DBUG_RETURN(offset); /* Read error */
+ }
+ }
+ DBUG_PRINT("info",("keynr: %d offset: %g max_keynr: %d nod: %d flag: %d",
+ keynr,offset,max_keynr,page.node,flag));
+ /* Combine this level's key number with the sub-tree offset */
+ DBUG_RETURN((keynr+offset)/(max_keynr+1));
+err:
+ DBUG_PRINT("exit",("Error: %d",my_errno));
+ DBUG_RETURN (-1.0);
+}
+
+
+/* Get keynummer of current key and max number of keys in nod */
+
+static uint _ma_keynr(MARIA_PAGE *page, uchar *keypos, uint *ret_max_key)
+{
+ uint page_flag, nod_flag, keynr, max_key;
+ uchar t_buff[MARIA_MAX_KEY_BUFF], *pos, *end;
+ const MARIA_KEYDEF *keyinfo= page->keyinfo;
+ MARIA_KEY key;
+
+ page_flag= page->flag;
+ nod_flag= page->node;
+ pos= page->buff + page->info->s->keypage_header + nod_flag;
+ end= page->buff + page->size;
+
+ /*
+   Fast path: with fixed-length keys and no transaction ids on the page,
+   both the key count and the position of 'keypos' can be computed
+   arithmetically from the fixed entry size.
+ */
+ if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) &&
+ ! (page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+ {
+ *ret_max_key= (uint) (end - pos)/(keyinfo->keylength+nod_flag);
+ return (uint) (keypos - pos)/(keyinfo->keylength+nod_flag);
+ }
+
+ /* Slow path: walk the page key by key, counting until we pass keypos */
+ max_key=keynr=0;
+ t_buff[0]=0; /* Safety */
+ key.data= t_buff;
+ key.keyinfo= (MARIA_KEYDEF*) keyinfo;
+
+ while (pos < end)
+ {
+ if (!(pos= (*keyinfo->skip_key)(&key, page_flag, nod_flag, pos)))
+ {
+ DBUG_ASSERT(0);
+ return 0; /* Error */
+ }
+ max_key++;
+ if (pos == keypos)
+ keynr= max_key;
+ }
+ *ret_max_key=max_key;
+ return(keynr);
+}
diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c
new file mode 100644
index 00000000000..7a7286e26f9
--- /dev/null
+++ b/storage/maria/ma_recovery.c
@@ -0,0 +1,3755 @@
+/* Copyright (C) 2006, 2007 MySQL AB
+ Copyright (C) 2010 Monty Program Ab
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3072 Maria recovery
+ First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+/* Here is the implementation of this module */
+
+#include "maria_def.h"
+#include "ma_recovery.h"
+#include "ma_blockrec.h"
+#include "ma_checkpoint.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+#include "ma_recovery_util.h"
+
+struct st_trn_for_recovery /* used only in the REDO phase */
+{
+ LSN group_start_lsn, undo_lsn, first_undo_lsn;
+ TrID long_trid;
+};
+struct st_table_for_recovery /* used in the REDO and UNDO phase */
+{
+ MARIA_HA *info;
+};
+/* Variables used by all functions of this module. Ok as single-threaded */
+static struct st_trn_for_recovery *all_active_trans;
+static struct st_table_for_recovery *all_tables;
+static struct st_dirty_page *dirty_pages_pool;
+static LSN current_group_end_lsn;
+#ifndef DBUG_OFF
+/** Current group of REDOs is about this table and only this one */
+static MARIA_HA *current_group_table;
+#endif
+static TrID max_long_trid= 0; /**< max long trid seen by REDO phase */
+static my_bool skip_DDLs; /**< if REDO phase should skip DDL records */
+/** @brief to avoid writing a checkpoint if recovery did nothing. */
+static my_bool checkpoint_useful;
+static my_bool in_redo_phase;
+static my_bool trns_created;
+static ulong skipped_undo_phase;
+static ulonglong now; /**< for tracking execution time of phases */
+static int (*save_error_handler_hook)(uint, const char *,myf);
+static uint recovery_warnings; /**< count of warnings */
+static uint recovery_found_crashed_tables;
+
+#define prototype_redo_exec_hook(R) \
+ static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec)
+
+#define prototype_redo_exec_hook_dummy(R) \
+ static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec \
+ __attribute__ ((unused)))
+
+#define prototype_undo_exec_hook(R) \
+ static int exec_UNDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec, TRN *trn)
+
+prototype_redo_exec_hook(LONG_TRANSACTION_ID);
+prototype_redo_exec_hook_dummy(CHECKPOINT);
+prototype_redo_exec_hook(REDO_CREATE_TABLE);
+prototype_redo_exec_hook(REDO_RENAME_TABLE);
+prototype_redo_exec_hook(REDO_REPAIR_TABLE);
+prototype_redo_exec_hook(REDO_DROP_TABLE);
+prototype_redo_exec_hook(FILE_ID);
+prototype_redo_exec_hook(INCOMPLETE_LOG);
+prototype_redo_exec_hook_dummy(INCOMPLETE_GROUP);
+prototype_redo_exec_hook(UNDO_BULK_INSERT);
+prototype_redo_exec_hook(IMPORTED_TABLE);
+prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD);
+prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL);
+prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD);
+prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD);
+prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL);
+prototype_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL);
+prototype_redo_exec_hook(REDO_FREE_BLOCKS);
+prototype_redo_exec_hook(REDO_DELETE_ALL);
+prototype_redo_exec_hook(REDO_INDEX);
+prototype_redo_exec_hook(REDO_INDEX_NEW_PAGE);
+prototype_redo_exec_hook(REDO_INDEX_FREE_PAGE);
+prototype_redo_exec_hook(REDO_BITMAP_NEW_PAGE);
+prototype_redo_exec_hook(UNDO_ROW_INSERT);
+prototype_redo_exec_hook(UNDO_ROW_DELETE);
+prototype_redo_exec_hook(UNDO_ROW_UPDATE);
+prototype_redo_exec_hook(UNDO_KEY_INSERT);
+prototype_redo_exec_hook(UNDO_KEY_DELETE);
+prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
+prototype_redo_exec_hook(COMMIT);
+prototype_redo_exec_hook(CLR_END);
+prototype_redo_exec_hook(DEBUG_INFO);
+prototype_undo_exec_hook(UNDO_ROW_INSERT);
+prototype_undo_exec_hook(UNDO_ROW_DELETE);
+prototype_undo_exec_hook(UNDO_ROW_UPDATE);
+prototype_undo_exec_hook(UNDO_KEY_INSERT);
+prototype_undo_exec_hook(UNDO_KEY_DELETE);
+prototype_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
+prototype_undo_exec_hook(UNDO_BULK_INSERT);
+
+static int run_redo_phase(LSN lsn, LSN end_lsn,
+ enum maria_apply_log_way apply);
+static uint end_of_redo_phase(my_bool prepare_for_undo_phase);
+static int run_undo_phase(uint uncommitted);
+static void display_record_position(const LOG_DESC *log_desc,
+ const TRANSLOG_HEADER_BUFFER *rec,
+ uint number);
+static int display_and_apply_record(const LOG_DESC *log_desc,
+ const TRANSLOG_HEADER_BUFFER *rec);
+static MARIA_HA *get_MARIA_HA_from_REDO_record(const
+ TRANSLOG_HEADER_BUFFER *rec);
+static MARIA_HA *get_MARIA_HA_from_UNDO_record(const
+ TRANSLOG_HEADER_BUFFER *rec);
+static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon);
+static LSN parse_checkpoint_record(LSN lsn);
+static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn,
+ LSN first_undo_lsn);
+static int new_table(uint16 sid, const char *name, LSN lsn_of_file_id);
+static int new_page(uint32 fileid, pgcache_page_no_t pageid, LSN rec_lsn,
+ struct st_dirty_page *dirty_page);
+static int close_all_tables(void);
+static my_bool close_one_table(const char *name, TRANSLOG_ADDRESS addr);
+static void print_redo_phase_progress(TRANSLOG_ADDRESS addr);
+static void delete_all_transactions();
+
+/** @brief global [out] buffer for translog_read_record(); never shrinks */
+static struct
+{
+ /*
+ uchar* is more adapted (less casts) than char*, thus we don't use
+ LEX_STRING.
+ */
+ uchar *str;
+ size_t length;
+} log_record_buffer;
+/* Grow the shared log_record_buffer to hold rec; it never shrinks */
+static void enlarge_buffer(const TRANSLOG_HEADER_BUFFER *rec)
+{
+ if (log_record_buffer.length < rec->record_length)
+ {
+ log_record_buffer.length= rec->record_length;
+ /*
+   NOTE(review): if my_realloc() fails, 'str' becomes NULL while
+   'length' already holds the new size, so a later call with the same
+   record_length will not retry the allocation. Callers must check
+   log_record_buffer.str after calling this.
+ */
+ log_record_buffer.str= my_realloc(log_record_buffer.str,
+ rec->record_length,
+ MYF(MY_WME | MY_ALLOW_ZERO_PTR));
+ }
+}
+/** @brief Tells what kind of progress message was printed to the error log */
+static enum recovery_message_type
+{
+ REC_MSG_NONE= 0, REC_MSG_REDO, REC_MSG_UNDO, REC_MSG_FLUSH
+} recovery_message_printed;
+
+
+/* Hook to ensure we get nicer output if we get an error */
+
+int maria_recover_error_handler_hook(uint error, const char *str,
+ myf flags)
+{
+ /*
+   If a progress percentage was being printed on the current stderr line,
+   terminate that line first so the error message starts on a fresh line,
+   then delegate to the saved original error handler.
+ */
+ if (procent_printed)
+ {
+ procent_printed= 0;
+ fputc('\n', stderr);
+ fflush(stderr);
+ }
+ return (*save_error_handler_hook)(error, str, flags);
+}
+
+/* Define this if you want gdb to break in some interesting situations */
+#define ALERT_USER()
+
+/* Announce in the error log that recovery is starting */
+static void print_preamble()
+{
+ ma_message_no_user(ME_JUST_INFO, "starting recovery");
+}
+
+
+/**
+ @brief Recovers from the last checkpoint.
+
+ Runs the REDO phase using special structures, then sets up the playground
+ of runtime: recreates transactions inside trnman, open tables with their
+ two-byte-id mapping; takes a checkpoint and runs the UNDO phase. Closes all
+ tables.
+
+ @return Operation status
+ @retval 0 OK
+ @retval !=0 Error
+*/
+
+int maria_recovery_from_log(void)
+{
+ int res= 1;
+ FILE *trace_file;
+ uint warnings_count;
+#ifdef EXTRA_DEBUG
+ char name_buff[FN_REFLEN];
+#endif
+ DBUG_ENTER("maria_recovery_from_log");
+
+ DBUG_ASSERT(!maria_in_recovery);
+ maria_in_recovery= TRUE;
+
+#ifdef EXTRA_DEBUG
+ /* Debug builds append a trace of the recovery to a file in the data dir */
+ fn_format(name_buff, "aria_recovery.trace", maria_data_root, "", MYF(0));
+ trace_file= my_fopen(name_buff, O_WRONLY|O_APPEND|O_CREAT, MYF(MY_WME));
+#else
+ trace_file= NULL; /* no trace file for being fast */
+#endif
+ /* tracef/tprint presumably tolerate a NULL stream — confirm */
+ tprint(trace_file, "TRACE of the last Aria recovery from mysqld\n");
+ DBUG_ASSERT(maria_pagecache->inited);
+ /* Apply the whole log from the last checkpoint, with UNDO phase and
+    checkpointing enabled */
+ res= maria_apply_log(LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, MARIA_LOG_APPLY,
+ trace_file, TRUE, TRUE, TRUE, &warnings_count);
+ if (!res)
+ {
+ if (warnings_count == 0 && recovery_found_crashed_tables == 0)
+ tprint(trace_file, "SUCCESS\n");
+ else
+ tprint(trace_file, "DOUBTFUL (%u warnings, check previous output)\n",
+ warnings_count);
+ }
+ if (trace_file)
+ my_fclose(trace_file, MYF(0));
+ maria_in_recovery= FALSE;
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Displays and/or applies the log
+
+ @param from_lsn LSN from which log reading/applying should start;
+ LSN_IMPOSSIBLE means "use last checkpoint"
+ @param end_lsn Apply until this. LSN_IMPOSSIBLE means until end.
+ @param apply how log records should be applied or not
+ @param trace_file trace file where progress/debug messages will go
+ @param skip_DDLs_arg Should DDL records (CREATE/RENAME/DROP/REPAIR)
+ be skipped by the REDO phase or not
+ @param take_checkpoints Should we take checkpoints or not.
+ @param[out] warnings_count Count of warnings will be put there
+
+ @todo This trace_file thing is primitive; soon we will make it similar to
+ ma_check_print_warning() etc, and a successful recovery does not need to
+ create a trace file. But for debugging now it is useful.
+
+ @return Operation status
+ @retval 0 OK
+ @retval !=0 Error
+*/
+
+int maria_apply_log(LSN from_lsn, LSN end_lsn,
+ enum maria_apply_log_way apply,
+ FILE *trace_file,
+ my_bool should_run_undo_phase, my_bool skip_DDLs_arg,
+ my_bool take_checkpoints, uint *warnings_count)
+{
+ int error= 0;
+ uint uncommitted_trans;
+ ulonglong old_now;
+ my_bool abort_message_printed= 0;
+ DBUG_ENTER("maria_apply_log");
+
+ DBUG_ASSERT(apply == MARIA_LOG_APPLY || !should_run_undo_phase);
+ DBUG_ASSERT(!maria_multi_threaded);
+ recovery_warnings= recovery_found_crashed_tables= 0;
+ maria_recovery_changed_data= 0;
+ /* checkpoints can happen only if TRNs have been built */
+ DBUG_ASSERT(should_run_undo_phase || !take_checkpoints);
+ DBUG_ASSERT(end_lsn == LSN_IMPOSSIBLE || should_run_undo_phase == 0);
+ all_active_trans= (struct st_trn_for_recovery *)
+ my_malloc((SHORT_TRID_MAX + 1) * sizeof(struct st_trn_for_recovery),
+ MYF(MY_ZEROFILL));
+ all_tables= (struct st_table_for_recovery *)
+ my_malloc((SHARE_ID_MAX + 1) * sizeof(struct st_table_for_recovery),
+ MYF(MY_ZEROFILL));
+
+ save_error_handler_hook= error_handler_hook;
+ error_handler_hook= maria_recover_error_handler_hook;
+
+ if (!all_active_trans || !all_tables)
+ goto err;
+
+ if (take_checkpoints && ma_checkpoint_init(0))
+ goto err;
+
+ recovery_message_printed= REC_MSG_NONE;
+ checkpoint_useful= trns_created= FALSE;
+ tracef= trace_file;
+#ifdef INSTANT_FLUSH_OF_MESSAGES
+ /* enable this for instant flush of messages to trace file */
+ setbuf(tracef, NULL);
+#endif
+ skip_DDLs= skip_DDLs_arg;
+ skipped_undo_phase= 0;
+
+ if (from_lsn == LSN_IMPOSSIBLE)
+ {
+ if (last_checkpoint_lsn == LSN_IMPOSSIBLE)
+ {
+ from_lsn= translog_first_lsn_in_log();
+ if (unlikely(from_lsn == LSN_ERROR))
+ goto err;
+ }
+ else
+ {
+ from_lsn= parse_checkpoint_record(last_checkpoint_lsn);
+ if (from_lsn == LSN_ERROR)
+ goto err;
+ }
+ }
+
+ now= my_getsystime();
+ in_redo_phase= TRUE;
+ trnman_init(max_trid_in_control_file);
+ if (run_redo_phase(from_lsn, end_lsn, apply))
+ {
+ ma_message_no_user(0, "Redo phase failed");
+ trnman_destroy();
+ goto err;
+ }
+ trnman_destroy();
+
+ if (end_lsn != LSN_IMPOSSIBLE)
+ {
+ abort_message_printed= 1;
+ if (!trace_file)
+ fputc('\n', stderr);
+ my_message(HA_ERR_INITIALIZATION,
+ "Maria recovery aborted as end_lsn/end of file was reached",
+ MYF(0));
+ goto err2;
+ }
+
+ if ((uncommitted_trans=
+ end_of_redo_phase(should_run_undo_phase)) == (uint)-1)
+ {
+ ma_message_no_user(0, "End of redo phase failed");
+ goto err;
+ }
+ in_redo_phase= FALSE;
+
+ old_now= now;
+ now= my_getsystime();
+ if (recovery_message_printed == REC_MSG_REDO)
+ {
+ double phase_took= (now - old_now)/10000000.0;
+ /*
+ Detailed progress info goes to stderr, because ma_message_no_user()
+ cannot put several messages on one line.
+ */
+ procent_printed= 1;
+ fprintf(stderr, " (%.1f seconds); ", phase_took);
+ fflush(stderr);
+ }
+
+ /**
+ REDO phase does not fill blocks' rec_lsn, so a checkpoint now would be
+ wrong: if a future recovery used it, the REDO phase would always
+ start from the checkpoint and never from before, wrongly skipping REDOs
+ (tested). Another problem is that the REDO phase uses
+ PAGECACHE_PLAIN_PAGE, while Checkpoint only collects PAGECACHE_LSN_PAGE.
+
+ @todo fix this. pagecache_write() now can have a rec_lsn argument. And we
+ could make a function which goes through pages at end of REDO phase and
+ changes their type.
+ */
+#ifdef FIX_AND_ENABLE_LATER
+ if (take_checkpoints && checkpoint_useful)
+ {
+ /*
+ We take a checkpoint as it can save future recovery work if we crash
+ during the UNDO phase. But we don't flush pages, as UNDOs will change
+ them again probably.
+ If we wanted to take checkpoints in the middle of the REDO phase, at a
+ moment when we haven't reached the end of log so don't have exact data
+ about transactions, we could write a special checkpoint: containing only
+ the list of dirty pages, otherwise to be treated as if it was at the
+ same LSN as the last checkpoint.
+ */
+ if (ma_checkpoint_execute(CHECKPOINT_INDIRECT, FALSE))
+ goto err;
+ }
+#endif
+
+ if (should_run_undo_phase)
+ {
+ if (run_undo_phase(uncommitted_trans))
+ {
+ ma_message_no_user(0, "Undo phase failed");
+ goto err;
+ }
+ }
+ else if (uncommitted_trans > 0)
+ {
+ eprint(tracef, "***WARNING: %u uncommitted transactions; some tables may"
+ " be left inconsistent!***", uncommitted_trans);
+ recovery_warnings++;
+ }
+
+ if (skipped_undo_phase)
+ {
+ /*
+ We could want to print a list of tables for which UNDOs were skipped,
+ but not one line per skipped UNDO.
+ */
+ eprint(tracef, "***WARNING: %lu UNDO records skipped in UNDO phase; some"
+ " tables may be left inconsistent!***", skipped_undo_phase);
+ recovery_warnings++;
+ }
+
+ old_now= now;
+ now= my_getsystime();
+ if (recovery_message_printed == REC_MSG_UNDO)
+ {
+ double phase_took= (now - old_now)/10000000.0;
+ procent_printed= 1;
+ fprintf(stderr, " (%.1f seconds); ", phase_took);
+ fflush(stderr);
+ }
+
+ /*
+ we don't use maria_panic() because it would maria_end(), and Recovery does
+ not want that (we want to keep some modules initialized for runtime).
+ */
+ if (close_all_tables())
+ {
+ ma_message_no_user(0, "closing of tables failed");
+ goto err;
+ }
+
+ old_now= now;
+ now= my_getsystime();
+ if (recovery_message_printed == REC_MSG_FLUSH)
+ {
+ double phase_took= (now - old_now)/10000000.0;
+ procent_printed= 1;
+ fprintf(stderr, " (%.1f seconds); ", phase_took);
+ fflush(stderr);
+ }
+
+ if (take_checkpoints && checkpoint_useful)
+ {
+ /* No dirty pages, all tables are closed, no active transactions, save: */
+ if (ma_checkpoint_execute(CHECKPOINT_FULL, FALSE))
+ goto err;
+ }
+
+ goto end;
+err:
+ tprint(tracef, "\nRecovery of tables with transaction logs FAILED\n");
+err2:
+ if (trns_created)
+ delete_all_transactions();
+ error= 1;
+ if (close_all_tables())
+ {
+ ma_message_no_user(0, "closing of tables failed");
+ }
+end:
+ error_handler_hook= save_error_handler_hook;
+ hash_free(&all_dirty_pages);
+ bzero(&all_dirty_pages, sizeof(all_dirty_pages));
+ my_free(dirty_pages_pool, MYF(MY_ALLOW_ZERO_PTR));
+ dirty_pages_pool= NULL;
+ my_free(all_tables, MYF(MY_ALLOW_ZERO_PTR));
+ all_tables= NULL;
+ my_free(all_active_trans, MYF(MY_ALLOW_ZERO_PTR));
+ all_active_trans= NULL;
+ my_free(log_record_buffer.str, MYF(MY_ALLOW_ZERO_PTR));
+ log_record_buffer.str= NULL;
+ log_record_buffer.length= 0;
+ ma_checkpoint_end();
+ *warnings_count= recovery_warnings + recovery_found_crashed_tables;
+ if (recovery_message_printed != REC_MSG_NONE)
+ {
+ if (procent_printed)
+ {
+ procent_printed= 0;
+ fprintf(stderr, "\n");
+ fflush(stderr);
+ }
+ if (!error)
+ {
+ ma_message_no_user(ME_JUST_INFO, "recovery done");
+ maria_recovery_changed_data= 1;
+ }
+ }
+ else if (!error && max_trid_in_control_file != max_long_trid)
+ {
+ /*
+ maria_end() will set max trid in log file so that one can run
+ maria_chk on the tables
+ */
+ maria_recovery_changed_data= 1;
+ }
+
+ if (error && !abort_message_printed)
+ {
+ if (!trace_file)
+ fputc('\n', stderr);
+ my_message(HA_ERR_INITIALIZATION,
+ "Aria recovery failed. Please run aria_chk -r on all Aria "
+ "tables and delete all aria_log.######## files", MYF(0));
+ }
+ procent_printed= 0;
+ /*
+ We don't cleanly close tables if we hit some error (may corrupt them by
+ flushing some wrong blocks made from wrong REDOs). It also leaves their
+ open_count>0, which ensures that --aria-recover, if used, will try to
+ repair them.
+ */
+ DBUG_RETURN(error);
+}
+
+
+/* very basic info about the record's header */
+/*
+  Print a one-line summary of a log record's header to the trace file.
+
+  @param log_desc  descriptor of the record's type (gives its name and hooks)
+  @param rec       the record header read from the log
+  @param number    sequence number; 0 means we are re-listing a member of an
+                   already-seen group, which is printed indented
+*/
+static void display_record_position(const LOG_DESC *log_desc,
+                                    const TRANSLOG_HEADER_BUFFER *rec,
+                                    uint number)
+{
+  /*
+    if number==0, we're going over records which we had already seen and which
+    form a group, so we indent below the group's end record
+  */
+  const char *indent= number ? "" : " ";
+  tprint(tracef,
+         "%sRec#%u LSN (%lu,0x%lx) short_trid %u %s(num_type:%u) len %lu\n",
+         indent, number, LSN_IN_PARTS(rec->lsn),
+         rec->short_trid, log_desc->name, rec->type,
+         (ulong)rec->record_length);
+  /* LOGREC_DEBUG_INFO's REDO hook only prints extra information */
+  if (rec->type == LOGREC_DEBUG_INFO)
+    (*log_desc->record_execute_in_redo_phase)(rec);
+}
+
+
+/*
+  Print a record (done by caller) and execute its REDO-phase hook.
+
+  @return 0 on success, non-zero if the hook is missing or failed.
+*/
+static int display_and_apply_record(const LOG_DESC *log_desc,
+                                    const TRANSLOG_HEADER_BUFFER *rec)
+{
+  int error;
+  if (log_desc->record_execute_in_redo_phase == NULL)
+  {
+    /* die on all not-yet-handled records :) */
+    DBUG_ASSERT("one more hook to write" == 0);
+    return 1;
+  }
+  if (rec->type == LOGREC_DEBUG_INFO)
+  {
+    /* Query already printed by display_record_position() */
+    return 0;
+  }
+  error= (*log_desc->record_execute_in_redo_phase)(rec);
+  if (error)
+    eprint(tracef, "Got error %d when executing record %s",
+           my_errno, log_desc->name);
+  return error;
+}
+
+
+/*
+  Handle a LONG_TRANSACTION_ID record: map a short (in-log) transaction id to
+  its long (persistent) transaction id, replacing any stale mapping.
+
+  @return 0 on success, 1 if the short id still belongs to an older live
+          transaction (log inconsistency; user is alerted).
+*/
+prototype_redo_exec_hook(LONG_TRANSACTION_ID)
+{
+  uint16 sid= rec->short_trid;
+  TrID long_trid= all_active_trans[sid].long_trid;
+  /*
+    Any incomplete group should be of an old crash which already had a
+    recovery and thus has logged INCOMPLETE_GROUP which we must have seen.
+  */
+  DBUG_ASSERT(all_active_trans[sid].group_start_lsn == LSN_IMPOSSIBLE);
+  if (long_trid != 0)
+  {
+    /* short id already in use: verify the previous owner ended properly */
+    LSN ulsn= all_active_trans[sid].undo_lsn;
+    /*
+      If the first record of that transaction is after 'rec', it's probably
+      because that transaction was found in the checkpoint record, and then
+      it's ok, we can forget about that transaction (we'll meet it later
+      again in the REDO phase) and replace it with the one in 'rec'.
+    */
+    if ((ulsn != LSN_IMPOSSIBLE) &&
+        (cmp_translog_addr(ulsn, rec->lsn) < 0))
+    {
+      char llbuf[22];
+      llstr(long_trid, llbuf);
+      eprint(tracef, "Found an old transaction long_trid %s short_trid %u"
+             " with same short id as this new transaction, and has neither"
+             " committed nor rollback (undo_lsn: (%lu,0x%lx))",
+             llbuf, sid, LSN_IN_PARTS(ulsn));
+      goto err;
+    }
+  }
+  /* the long id is stored in the first 6 bytes of the record's header */
+  long_trid= uint6korr(rec->header);
+  new_transaction(sid, long_trid, LSN_IMPOSSIBLE, LSN_IMPOSSIBLE);
+  goto end;
+err:
+  ALERT_USER();
+  return 1;
+end:
+  return 0;
+}
+
+
+/*
+  Register transaction 'long_id' as active under short id 'sid' during the
+  REDO phase, recording its undo-chain pointers and bumping max_long_trid.
+*/
+static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn,
+                            LSN first_undo_lsn)
+{
+  char llbuf[22];
+  all_active_trans[sid].long_trid= long_id;
+  all_active_trans[sid].undo_lsn= undo_lsn;
+  all_active_trans[sid].first_undo_lsn= first_undo_lsn;
+  /* remember the highest long trid ever seen, for after recovery */
+  set_if_bigger(max_long_trid, long_id);
+  llstr(long_id, llbuf);
+  tprint(tracef, "Transaction long_trid %s short_trid %u starts,"
+         " undo_lsn (%lu,0x%lx) first_undo_lsn (%lu,0x%lx)\n",
+         llbuf, sid, LSN_IN_PARTS(undo_lsn), LSN_IN_PARTS(first_undo_lsn));
+}
+
+
+/* CHECKPOINT records require no work during the REDO phase. */
+prototype_redo_exec_hook_dummy(CHECKPOINT)
+{
+  /* the only checkpoint we care about was found via control file, ignore */
+  return 0;
+}
+
+
+/* INCOMPLETE_GROUP records require no work: the group was already aborted. */
+prototype_redo_exec_hook_dummy(INCOMPLETE_GROUP)
+{
+  /* abortion was already made */
+  return 0;
+}
+
+
+/*
+  Handle an INCOMPLETE_LOG record: the server logged that data insertions of
+  an ALTER TABLE / CREATE SELECT were not written to the log, so applying
+  later log records to this table may be wrong. We warn the user and mark the
+  table crashed so it is only used for repair.
+
+  @return 0 always (a warning, never a fatal error).
+*/
+prototype_redo_exec_hook(INCOMPLETE_LOG)
+{
+  MARIA_HA *info;
+  if (skip_DDLs)
+  {
+    tprint(tracef, "we skip DDLs\n");
+    return 0;
+  }
+  if ((info= get_MARIA_HA_from_REDO_record(rec)) == NULL)
+  {
+    /* no such table, don't need to warn */
+    return 0;
+  }
+
+  /* already marked crashed: warning would be redundant */
+  if (maria_is_crashed(info))
+    return 0;
+
+  if (info->s->state.is_of_horizon > rec->lsn)
+  {
+    /*
+      This table was repaired at a time after this log entry.
+      We can assume that all rows were inserted successfully and we don't
+      have to warn about that the inserted data was not logged
+    */
+    return 0;
+  }
+
+  /*
+    Example of what can go wrong when replaying DDLs:
+    CREATE TABLE t (logged); INSERT INTO t VALUES(1) (logged);
+    ALTER TABLE t ... which does
+    CREATE a temporary table #sql... (logged)
+    INSERT data from t into #sql... (not logged)
+    RENAME #sql TO t (logged)
+    Removing tables by hand and replaying the log will leave in the
+    end an empty table "t": missing records. If after the RENAME an INSERT
+    into t was done, that row had number 1 in its page, executing the
+    REDO_INSERT_ROW_HEAD on the recreated empty t will fail (assertion
+    failure in _ma_apply_redo_insert_row_head_or_tail(): new data page is
+    created whereas rownr is not 0).
+    So when the server disables logging for ALTER TABLE or CREATE SELECT, it
+    logs LOGREC_INCOMPLETE_LOG to warn aria_read_log and then the user.
+
+    Another issue is that replaying of DDLs is not correct enough to work if
+    there was a crash during a DDL (see comment in execution of
+    REDO_RENAME_TABLE ).
+  */
+
+  eprint(tracef, "***WARNING: Aria engine currently logs no records "
+         "about insertion of data by ALTER TABLE and CREATE SELECT, "
+         "as they are not necessary for recovery; "
+         "present applying of log records to table '%s' may well not work."
+         "***", info->s->index_file_name.str);
+
+  /* Prevent using the table for anything else than undo repair */
+  _ma_mark_file_crashed(info->s);
+  recovery_warnings++;
+  return 0;
+}
+
+
+/*
+  Create the database directory of table 'name' if it does not yet exist.
+
+  @return 0 on success or when there is nothing to do, 1 if mkdir failed.
+*/
+static my_bool create_database_if_not_exists(const char *name)
+{
+  char dirname[FN_REFLEN];
+  size_t length;
+  MY_STAT stat_info;
+  DBUG_ENTER("create_database_if_not_exists");
+
+  dirname_part(dirname, name, &length);
+  /* Skip files without directories */
+  if (!length)
+    DBUG_RETURN(0);
+  /*
+    Safety; Don't create files with hard path;
+    Should never happen with MariaDB
+    If hard path, then error will be detected when trying to create index file
+  */
+  if (test_if_hard_path(dirname))
+    DBUG_RETURN(0);
+  /* Directory already exists: nothing to create */
+  if (my_stat(dirname, &stat_info, MYF(0)))
+    DBUG_RETURN(0);
+
+  tprint(tracef, "Creating not existing database '%s'\n", dirname);
+  if (my_mkdir(dirname, 0777, MYF(MY_WME)))
+  {
+    eprint(tracef, "***WARNING: Can't create not existing database '%s'",
+           dirname);
+    DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
+
+
+
+
+
+/*
+  Replay a REDO_CREATE_TABLE record: recreate the table's index file (and,
+  unless HA_DONT_TOUCH_DATA, an empty data file) from the header image stored
+  in the log record, unless an existing table is more recent than the record.
+
+  Record layout (after reading the full record into log_record_buffer):
+    name \0  dont_touch_data_flag  kfile_size(2)  keystart(2)
+    kfile_header[kfile_size]  data_file_name \0  index_file_name \0
+
+  @return 0 on success or when creation is (safely) skipped, 1 on error.
+
+  Fix: the fn_format() flags expression had a precedence bug:
+  '(MY_UNPACK_FILENAME | (flags & HA_DONT_TOUCH_DATA) ? MY_RETURN_REAL_PATH
+  : 0)' — '|' binds tighter than '?:', so the condition was always true and
+  MY_UNPACK_FILENAME was never passed. The ternary is now parenthesized.
+*/
+prototype_redo_exec_hook(REDO_CREATE_TABLE)
+{
+  File dfile= -1, kfile= -1;
+  char *linkname_ptr, filename[FN_REFLEN], *name, *ptr, *ptr2,
+    *data_file_name, *index_file_name;
+  uchar *kfile_header;
+  myf create_flag;
+  uint flags;
+  int error= 1, create_mode= O_RDWR | O_TRUNC, i;
+  MARIA_HA *info= NULL;
+  uint kfile_size_before_extension, keystart;
+  DBUG_ENTER("exec_REDO_LOGREC_REDO_CREATE_TABLE");
+
+  if (skip_DDLs)
+  {
+    tprint(tracef, "we skip DDLs\n");
+    DBUG_RETURN(0);
+  }
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    goto end;
+  }
+  name= (char *)log_record_buffer.str;
+  /*
+    TRUNCATE TABLE and REPAIR USE_FRM call maria_create(), so below we can
+    find a REDO_CREATE_TABLE for a table which we have open, that's why we
+    need to look for any open instances and close them first.
+  */
+  if (close_one_table(name, rec->lsn))
+  {
+    eprint(tracef, "Table '%s' got error %d on close", name, my_errno);
+    ALERT_USER();
+    goto end;
+  }
+  /* we try hard to get create_rename_lsn, to avoid mistakes if possible */
+  info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  if (info)
+  {
+    MARIA_SHARE *share= info->s;
+    /* check that we're not already using it */
+    if (share->reopen != 1)
+    {
+      eprint(tracef, "Table '%s is already open (reopen=%u)",
+             name, share->reopen);
+      ALERT_USER();
+      goto end;
+    }
+    DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+    if (!share->base.born_transactional)
+    {
+      /*
+        could be that transactional table was later dropped, and a non-trans
+        one was renamed to its name, thus create_rename_lsn is 0 and should
+        not be trusted.
+      */
+      tprint(tracef, "Table '%s' is not transactional, ignoring creation\n",
+             name);
+      ALERT_USER();
+      error= 0;
+      goto end;
+    }
+    if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+    {
+      /* existing table is newer than this record: do not overwrite it */
+      tprint(tracef, "Table '%s' has create_rename_lsn (%lu,0x%lx) more "
+             "recent than record, ignoring creation",
+             name, LSN_IN_PARTS(share->state.create_rename_lsn));
+      error= 0;
+      goto end;
+    }
+    if (maria_is_crashed(info))
+    {
+      eprint(tracef, "Table '%s' is crashed, can't recreate it", name);
+      ALERT_USER();
+      goto end;
+    }
+    maria_close(info);
+    info= NULL;
+  }
+  else
+  {
+    /* one or two files absent, or header corrupted... */
+    tprint(tracef, "Table '%s' can't be opened (Error: %d)\n",
+           name, my_errno);
+  }
+  /* if does not exist, or is older, overwrite it */
+  ptr= name + strlen(name) + 1;
+  if ((flags= ptr[0] ? HA_DONT_TOUCH_DATA : 0))
+    tprint(tracef, ", we will only touch index file");
+  ptr++;
+  kfile_size_before_extension= uint2korr(ptr);
+  ptr+= 2;
+  keystart= uint2korr(ptr);
+  ptr+= 2;
+  kfile_header= (uchar *)ptr;
+  ptr+= kfile_size_before_extension;
+  /* set the three header lsns (create_rename and friends) to this record */
+  ptr2= (char *) kfile_header + sizeof(info->s->state.header) +
+    MARIA_FILE_CREATE_RENAME_LSN_OFFSET;
+  for (i= 0; i<3; i++)
+  {
+    lsn_store(ptr2, rec->lsn);
+    ptr2+= LSN_STORE_SIZE;
+  }
+  data_file_name= ptr;
+  ptr+= strlen(data_file_name) + 1;
+  index_file_name= ptr;
+  ptr+= strlen(index_file_name) + 1;
+  /** @todo handle symlinks */
+  if (data_file_name[0] || index_file_name[0])
+  {
+    eprint(tracef, "Table '%s' DATA|INDEX DIRECTORY clauses are not handled",
+           name);
+    goto end;
+  }
+  if (create_database_if_not_exists(name))
+    goto end;
+  /*
+    Bug fix: parenthesize the ternary so MY_UNPACK_FILENAME is always
+    included and MY_RETURN_REAL_PATH only when we keep the data file.
+  */
+  fn_format(filename, name, "", MARIA_NAME_IEXT,
+            MY_UNPACK_FILENAME |
+            ((flags & HA_DONT_TOUCH_DATA) ? MY_RETURN_REAL_PATH : 0) |
+            MY_APPEND_EXT);
+  linkname_ptr= NULL;
+  create_flag= MY_DELETE_OLD;
+  tprint(tracef, "Table '%s' creating as '%s'\n", name, filename);
+  if ((kfile= my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+                                     MYF(MY_WME|create_flag))) < 0)
+  {
+    eprint(tracef, "Failed to create index file");
+    goto end;
+  }
+  if (my_pwrite(kfile, kfile_header,
+                kfile_size_before_extension, 0, MYF(MY_NABP|MY_WME)) ||
+      my_chsize(kfile, keystart, 0, MYF(MY_WME)))
+  {
+    eprint(tracef, "Failed to write to index file");
+    goto end;
+  }
+  if (!(flags & HA_DONT_TOUCH_DATA))
+  {
+    fn_format(filename,name,"", MARIA_NAME_DEXT,
+              MY_UNPACK_FILENAME | MY_APPEND_EXT);
+    linkname_ptr= NULL;
+    create_flag=MY_DELETE_OLD;
+    if (((dfile=
+          my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+                                 MYF(MY_WME | create_flag))) < 0) ||
+        my_close(dfile, MYF(MY_WME)))
+    {
+      eprint(tracef, "Failed to create data file");
+      goto end;
+    }
+    /*
+      we now have an empty data file. To be able to
+      _ma_initialize_data_file() we need some pieces of the share to be
+      correctly filled. So we just open the table (fortunately, an empty
+      data file does not preclude this).
+    */
+    if (((info= maria_open(name, O_RDONLY, 0)) == NULL) ||
+        _ma_initialize_data_file(info->s, info->dfile.file))
+    {
+      eprint(tracef, "Failed to open new table or write to data file");
+      goto end;
+    }
+  }
+  error= 0;
+end:
+  if (kfile >= 0)
+    error|= my_close(kfile, MYF(MY_WME));
+  if (info != NULL)
+    error|= maria_close(info);
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Replay a REDO_RENAME_TABLE record: rename old_name to new_name if and only
+  if both sides' create_rename_lsn checks prove the rename is still relevant
+  and safe (neither table is newer than the record, nor crashed, nor open).
+
+  @return 0 on success or safe skip, 1 on error (rename not performed).
+*/
+prototype_redo_exec_hook(REDO_RENAME_TABLE)
+{
+  char *old_name, *new_name;
+  int error= 1;
+  MARIA_HA *info= NULL;
+  DBUG_ENTER("exec_REDO_LOGREC_REDO_RENAME_TABLE");
+
+  if (skip_DDLs)
+  {
+    tprint(tracef, "we skip DDLs\n");
+    DBUG_RETURN(0);
+  }
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    goto end;
+  }
+  /* record body is two consecutive NUL-terminated names */
+  old_name= (char *)log_record_buffer.str;
+  new_name= old_name + strlen(old_name) + 1;
+  tprint(tracef, "Table '%s' to rename to '%s'; old-name table ", old_name,
+         new_name);
+  /*
+    Here is why we skip CREATE/DROP/RENAME when doing a recovery from
+    ha_maria (whereas we do when called from aria_read_log). Consider:
+    CREATE TABLE t;
+    RENAME TABLE t to u;
+    DROP TABLE u;
+    RENAME TABLE v to u; # crash between index rename and data rename.
+    And do a Recovery (not removing tables beforehand).
+    Recovery replays CREATE, then RENAME: the maria_open("t") works,
+    maria_open("u") does not (no data file) so table "u" is considered
+    inexistent and so maria_rename() is done which overwrites u's index file,
+    which is lost. Ok, the data file (v.MAD) is still available, but only a
+    REPAIR USE_FRM can rebuild the index, which is unsafe and downtime.
+    So it is preferable to not execute RENAME, and leave the "mess" of files,
+    rather than possibly destroy a file. DBA will manually rename files.
+    A safe recovery method would probably require checking the existence of
+    the index file and of the data file separately (not via maria_open()), and
+    maybe also to store a create_rename_lsn in the data file too
+    For now, all we risk is to leave the mess (half-renamed files) left by the
+    crash. We however sync files and directories at each file rename. The SQL
+    layer is anyway not crash-safe for DDLs (except the repartitioning-related
+    ones).
+    We replay DDLs in aria_read_log to be able to recreate tables from
+    scratch. It means that "aria_read_log -a" should not be used on a
+    database which just crashed during a DDL. And also ALTER TABLE does not
+    log insertions of records into the temporary table, so replaying may
+    fail (grep for INCOMPLETE_LOG in files).
+  */
+  /* First validate the old-name table */
+  info= maria_open(old_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  if (info)
+  {
+    MARIA_SHARE *share= info->s;
+    if (!share->base.born_transactional)
+    {
+      tprint(tracef, ", is not transactional, ignoring renaming\n");
+      ALERT_USER();
+      error= 0;
+      goto end;
+    }
+    if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+    {
+      tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+             " record, ignoring renaming",
+             LSN_IN_PARTS(share->state.create_rename_lsn));
+      error= 0;
+      goto end;
+    }
+    if (maria_is_crashed(info))
+    {
+      tprint(tracef, ", is crashed, can't rename it");
+      ALERT_USER();
+      goto end;
+    }
+    if (close_one_table(info->s->open_file_name.str, rec->lsn) ||
+        maria_close(info))
+      goto end;
+    info= NULL;
+    tprint(tracef, ", is ok for renaming; new-name table ");
+  }
+  else /* one or two files absent, or header corrupted... */
+  {
+    tprint(tracef, ", can't be opened, probably does not exist");
+    error= 0;
+    goto end;
+  }
+  /*
+    We must also check the create_rename_lsn of the 'new_name' table if it
+    exists: otherwise we may, with our rename which overwrites, destroy
+    another table. For example:
+    CREATE TABLE t;
+    RENAME t to u;
+    DROP TABLE u;
+    RENAME v to u; # v is an old table, its creation/insertions not in log
+    And start executing the log (without removing tables beforehand): creates
+    t, renames it to u (if not testing create_rename_lsn) thus overwriting
+    old-named v, drops u, and we are stuck, we have lost data.
+  */
+  info= maria_open(new_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  if (info)
+  {
+    MARIA_SHARE *share= info->s;
+    /* We should not have open instances on this table. */
+    if (share->reopen != 1)
+    {
+      tprint(tracef, ", is already open (reopen=%u)\n", share->reopen);
+      ALERT_USER();
+      goto end;
+    }
+    if (!share->base.born_transactional)
+    {
+      tprint(tracef, ", is not transactional, ignoring renaming\n");
+      ALERT_USER();
+      goto drop;
+    }
+    if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+    {
+      tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+             " record, ignoring renaming",
+             LSN_IN_PARTS(share->state.create_rename_lsn));
+      /*
+        We have to drop the old_name table. Consider:
+        CREATE TABLE t;
+        CREATE TABLE v;
+        RENAME TABLE t to u;
+        DROP TABLE u;
+        RENAME TABLE v to u;
+        and apply the log without removing tables beforehand. t will be
+        created, v too; in REDO_RENAME u will be more recent, but we still
+        have to drop t otherwise it stays.
+      */
+      goto drop;
+    }
+    if (maria_is_crashed(info))
+    {
+      tprint(tracef, ", is crashed, can't rename it");
+      ALERT_USER();
+      goto end;
+    }
+    if (maria_close(info))
+      goto end;
+    info= NULL;
+    /* abnormal situation */
+    tprint(tracef, ", exists but is older than record, can't rename it");
+    goto end;
+  }
+  else /* one or two files absent, or header corrupted... */
+    tprint(tracef, ", can't be opened, probably does not exist");
+  tprint(tracef, ", renaming '%s'", old_name);
+  if (maria_rename(old_name, new_name))
+  {
+    eprint(tracef, "Failed to rename table");
+    goto end;
+  }
+  /* stamp the renamed table's header with this record's LSN */
+  info= maria_open(new_name, O_RDONLY, 0);
+  if (info == NULL)
+  {
+    eprint(tracef, "Failed to open renamed table");
+    goto end;
+  }
+  if (_ma_update_state_lsns(info->s, rec->lsn, info->s->state.create_trid,
+                            TRUE, TRUE))
+    goto end;
+  if (maria_close(info))
+    goto end;
+  info= NULL;
+  error= 0;
+  goto end;
+drop:
+  tprint(tracef, ", only dropping '%s'", old_name);
+  if (maria_delete_table(old_name))
+  {
+    eprint(tracef, "Failed to drop table");
+    goto end;
+  }
+  error= 0;
+  goto end;
+end:
+  tprint(tracef, "\n");
+  if (info != NULL)
+    error|= maria_close(info);
+  DBUG_RETURN(error);
+}
+
+
+/*
+ The record may come from REPAIR, ALTER TABLE ENABLE KEYS, OPTIMIZE.
+*/
+/*
+  Replay a REDO_REPAIR_TABLE record by redoing the same kind of repair
+  (parallel / by-sort / classic, chosen from the logged testflag), then stamp
+  the table's state LSNs with this record's LSN.
+
+  @return 0 on success or skip, 1 on repair/state-update failure.
+*/
+prototype_redo_exec_hook(REDO_REPAIR_TABLE)
+{
+  int error= 1;
+  MARIA_HA *info;
+  HA_CHECK param;
+  char *name;
+  my_bool quick_repair;
+  DBUG_ENTER("exec_REDO_LOGREC_REDO_REPAIR_TABLE");
+
+  if (skip_DDLs)
+  {
+    /*
+      REPAIR is not exactly a DDL, but it manipulates files without logging
+      insertions into them.
+    */
+    tprint(tracef, "we skip DDLs\n");
+    DBUG_RETURN(0);
+  }
+  if ((info= get_MARIA_HA_from_REDO_record(rec)) == NULL)
+    DBUG_RETURN(0);
+  if (maria_is_crashed(info))
+  {
+    tprint(tracef, "we skip repairing crashed table\n");
+    DBUG_RETURN(0);
+  }
+  /*
+    Otherwise, the mapping is newer than the table, and our record is newer
+    than the mapping, so we can repair.
+  */
+  tprint(tracef, " repairing...\n");
+
+  maria_chk_init(&param);
+  param.isam_file_name= name= info->s->open_file_name.str;
+  /* the repair flags used at runtime were stored in the record's header */
+  param.testflag= uint8korr(rec->header + FILEID_STORE_SIZE);
+  param.tmpdir= maria_tmpdir;
+  param.max_trid= max_long_trid;
+  DBUG_ASSERT(maria_tmpdir);
+
+  /* restore the key map (enabled indexes) logged with the repair */
+  info->s->state.key_map= uint8korr(rec->header + FILEID_STORE_SIZE + 8);
+  quick_repair= test(param.testflag & T_QUICK);
+
+  if (param.testflag & T_REP_PARALLEL)
+  {
+    if (maria_repair_parallel(&param, info, name, quick_repair))
+      goto end;
+  }
+  else if (param.testflag & T_REP_BY_SORT)
+  {
+    if (maria_repair_by_sort(&param, info, name, quick_repair))
+      goto end;
+  }
+  else if (maria_repair(&param, info, name, quick_repair))
+    goto end;
+
+  if (_ma_update_state_lsns(info->s, rec->lsn, trnman_get_min_safe_trid(),
+                            TRUE, !(param.testflag & T_NO_CREATE_RENAME_LSN)))
+    goto end;
+  error= 0;
+
+end:
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Replay a REDO_DROP_TABLE record: delete the table unless it is newer than
+  this record (create_rename_lsn check), not transactional, or crashed.
+
+  @return 0 on success or safe skip, 1 on error.
+*/
+prototype_redo_exec_hook(REDO_DROP_TABLE)
+{
+  char *name;
+  int error= 1;
+  MARIA_HA *info;
+  if (skip_DDLs)
+  {
+    tprint(tracef, "we skip DDLs\n");
+    return 0;
+  }
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+  /* record body is the NUL-terminated table name */
+  name= (char *)log_record_buffer.str;
+  tprint(tracef, "Table '%s'", name);
+  info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  if (info)
+  {
+    MARIA_SHARE *share= info->s;
+    if (!share->base.born_transactional)
+    {
+      tprint(tracef, ", is not transactional, ignoring removal\n");
+      ALERT_USER();
+      error= 0;
+      goto end;
+    }
+    if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+    {
+      /* table was re-created after this record: must not drop it */
+      tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+             " record, ignoring removal",
+             LSN_IN_PARTS(share->state.create_rename_lsn));
+      error= 0;
+      goto end;
+    }
+    if (maria_is_crashed(info))
+    {
+      tprint(tracef, ", is crashed, can't drop it");
+      ALERT_USER();
+      goto end;
+    }
+    if (close_one_table(info->s->open_file_name.str, rec->lsn) ||
+        maria_close(info))
+      goto end;
+    info= NULL;
+    /* if it is older, or its header is corrupted, drop it */
+    tprint(tracef, ", dropping '%s'", name);
+    if (maria_delete_table(name))
+    {
+      eprint(tracef, "Failed to drop table");
+      goto end;
+    }
+  }
+  else /* one or two files absent, or header corrupted... */
+    tprint(tracef,", can't be opened, probably does not exist");
+  error= 0;
+end:
+  tprint(tracef, "\n");
+  if (info != NULL)
+    error|= maria_close(info);
+  return error;
+}
+
+
+/*
+  Replay a FILE_ID record: (re-)establish the mapping from a short file id to
+  an open table instance in all_tables[], closing any table previously mapped
+  to that id.
+
+  @return 0 on success or skip, 1 on error.
+*/
+prototype_redo_exec_hook(FILE_ID)
+{
+  uint16 sid;
+  int error= 1;
+  const char *name;
+  MARIA_HA *info;
+  DBUG_ENTER("exec_REDO_LOGREC_FILE_ID");
+
+  if (cmp_translog_addr(rec->lsn, checkpoint_start) < 0)
+  {
+    /*
+      If that mapping was still true at checkpoint time, it was found in
+      checkpoint record, no need to recreate it. If that mapping had ended at
+      checkpoint time (table was closed or repaired), a flush and force
+      happened and so mapping is not needed.
+    */
+    tprint(tracef, "ignoring because before checkpoint\n");
+    DBUG_RETURN(0);
+  }
+
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    goto end;
+  }
+  sid= fileid_korr(log_record_buffer.str);
+  info= all_tables[sid].info;
+  /* if the id is being reused, close the previously-mapped table first */
+  if (info != NULL)
+  {
+    tprint(tracef, " Closing table '%s'\n", info->s->open_file_name.str);
+    prepare_table_for_close(info, rec->lsn);
+    if (maria_close(info))
+    {
+      eprint(tracef, "Failed to close table");
+      goto end;
+    }
+    all_tables[sid].info= NULL;
+  }
+  /* table name follows the file id in the record body */
+  name= (char *)log_record_buffer.str + FILEID_STORE_SIZE;
+  if (new_table(sid, name, rec->lsn))
+    goto end;
+  error= 0;
+end:
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Open table 'name' and install it as the table mapped to short id 'sid' in
+  all_tables[], after validating that the table is transactional, not
+  crashed, and older than the FILE_ID record (lsn_of_file_id).
+
+  Also fixes up state.data_file_length / state.key_file_length from the real
+  file sizes, as some REDO records rely on them.
+*/
+static int new_table(uint16 sid, const char *name, LSN lsn_of_file_id)
+{
+  /*
+    -1 (skip table): close table and return 0;
+    1 (error): close table and return 1;
+    0 (success): leave table open and return 0.
+  */
+  int error= 1;
+  MARIA_HA *info;
+  MARIA_SHARE *share;
+  my_off_t dfile_len, kfile_len;
+  DBUG_ENTER("new_table");
+
+  /* from now on checkpoints would carry useful mapping information */
+  checkpoint_useful= TRUE;
+  if ((name == NULL) || (name[0] == 0))
+  {
+    /*
+      we didn't use DBUG_ASSERT() because such record corruption could
+      silently pass in the "info == NULL" test below.
+    */
+    tprint(tracef, ", record is corrupted");
+    info= NULL;
+    recovery_warnings++;
+    goto end;
+  }
+  tprint(tracef, "Table '%s', id %u", name, sid);
+  info= maria_open(name, O_RDWR, HA_OPEN_FOR_REPAIR);
+  if (info == NULL)
+  {
+    tprint(tracef, ", is absent (must have been dropped later?)"
+           " or its header is so corrupted that we cannot open it;"
+           " we skip it");
+    /* ENOENT means cleanly dropped, anything else looks like corruption */
+    if (my_errno != ENOENT)
+      recovery_found_crashed_tables++;
+    error= 0;
+    goto end;
+  }
+  share= info->s;
+  /* check that we're not already using it */
+  if (share->reopen != 1)
+  {
+    tprint(tracef, ", is already open (reopen=%u)\n", share->reopen);
+    /*
+      It could be that we have in the log
+      FILE_ID(t1,10) ... (t1 was flushed) ... FILE_ID(t1,12);
+    */
+    if (close_one_table(share->open_file_name.str, lsn_of_file_id))
+      goto end;
+    /*
+      We should not try to get length of data/index files as the files
+      are not on disk yet.
+    */
+    _ma_tmp_disable_logging_for_table(info, FALSE);
+    goto set_lsn_of_file_id;
+  }
+  if (!share->base.born_transactional)
+  {
+    /*
+      This can happen if one converts a transactional table to a
+      not transactional table
+    */
+    tprint(tracef, ", is not transactional. Ignoring open request");
+    error= -1;
+    recovery_warnings++;
+    goto end;
+  }
+  if (cmp_translog_addr(lsn_of_file_id, share->state.create_rename_lsn) <= 0)
+  {
+    tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+           " LOGREC_FILE_ID's LSN (%lu,0x%lx), ignoring open request",
+           LSN_IN_PARTS(share->state.create_rename_lsn),
+           LSN_IN_PARTS(lsn_of_file_id));
+    recovery_warnings++;
+    error= -1;
+    goto end;
+    /*
+      Note that we tested that before testing corruption; a recent corrupted
+      table is not a blocker for the present log record.
+    */
+  }
+  if (maria_is_crashed(info))
+  {
+    eprint(tracef, "Table '%s' is crashed, skipping it. Please repair it with"
+           " aria_chk -r", share->open_file_name.str);
+    recovery_found_crashed_tables++;
+    error= -1; /* not fatal, try with other tables */
+    goto end;
+    /*
+      Note that if a first recovery fails to apply a REDO, it marks the table
+      corrupted and stops the entire recovery. A second recovery will find the
+      table is marked corrupted and skip it (and thus possibly handle other
+      tables).
+    */
+  }
+  /* don't log any records for this work */
+  _ma_tmp_disable_logging_for_table(info, FALSE);
+  /* execution of some REDO records relies on data_file_length */
+  dfile_len= my_seek(info->dfile.file, 0, SEEK_END, MYF(MY_WME));
+  kfile_len= my_seek(info->s->kfile.file, 0, SEEK_END, MYF(MY_WME));
+  if ((dfile_len == MY_FILEPOS_ERROR) ||
+      (kfile_len == MY_FILEPOS_ERROR))
+  {
+    tprint(tracef, ", length unknown\n");
+    recovery_warnings++;
+    goto end;
+  }
+  if (share->state.state.data_file_length != dfile_len)
+  {
+    tprint(tracef, ", has wrong state.data_file_length (fixing it)");
+    share->state.state.data_file_length= dfile_len;
+  }
+  if (share->state.state.key_file_length != kfile_len)
+  {
+    tprint(tracef, ", has wrong state.key_file_length (fixing it)");
+    share->state.state.key_file_length= kfile_len;
+  }
+  if ((dfile_len % share->block_size) || (kfile_len % share->block_size))
+  {
+    tprint(tracef, ", has too short last page\n");
+    /* Recovery will fix this, no error */
+    ALERT_USER();
+  }
+
+set_lsn_of_file_id:
+  /*
+    This LSN serves in this situation; assume log is:
+    FILE_ID(6->"t2") REDO_INSERT(6) FILE_ID(6->"t1") CHECKPOINT(6->"t1")
+    then crash, checkpoint record is parsed and opens "t1" with id 6; assume
+    REDO phase starts from the REDO_INSERT above: it will wrongly try to
+    update a page of "t1". With this LSN below, REDO_INSERT can realize the
+    mapping is newer than itself, and not execute.
+    Same example is possible with UNDO_INSERT (update of the state).
+  */
+  info->s->lsn_of_file_id= lsn_of_file_id;
+  all_tables[sid].info= info;
+  /*
+    We don't set info->s->id, it would be useless (no logging in REDO phase);
+    if you change that, know that some records in REDO phase call
+    _ma_update_state_lsns() which resets info->s->id.
+  */
+  tprint(tracef, ", opened");
+  error= 0;
+end:
+  tprint(tracef, "\n");
+  if (error)
+  {
+    if (info != NULL)
+      maria_close(info);
+    /* -1 means "skip table", which is not an error for the caller */
+    if (error == -1)
+      error= 0;
+  }
+  DBUG_RETURN(error);
+}
+
+/*
+ NOTE
+ This is called for REDO_INSERT_ROW_HEAD and READ_NEW_ROW_HEAD
+*/
+
+/*
+  Replay a REDO_INSERT_ROW_HEAD / REDO_NEW_ROW_HEAD record: re-insert the
+  logged row data into a head page if the on-disk page is older than the
+  record.
+
+  @return 0 on success or skip, 1 on error.
+*/
+prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD)
+{
+  int error= 1;
+  uchar *buff= NULL;
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+  if (info == NULL || maria_is_crashed(info))
+
+  {
+    /*
+      Table was skipped at open time (because later dropped/renamed, not
+      transactional, or create_rename_lsn newer than LOGREC_FILE_ID), or
+      record was skipped due to skip_redo_lsn; it is not an error.
+    */
+    return 0;
+  }
+  /*
+    Note that REDO is per page, we still consider it if its transaction
+    committed long ago and is unknown.
+  */
+  /*
+    If REDO's LSN is > page's LSN (read from disk), we are going to modify the
+    page and change its LSN. The normal runtime code stores the UNDO's LSN
+    into the page. Here storing the REDO's LSN (rec->lsn) would work
+    (we are not writing to the log here, so don't have to "flush up to UNDO's
+    LSN"). But in a test scenario where we do updates at runtime, then remove
+    tables, apply the log and check that this results in the same table as at
+    runtime, putting the same LSN as runtime had done will decrease
+    differences. So we use the UNDO's LSN which is current_group_end_lsn.
+  */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL)
+  {
+    eprint(tracef, "Failed to read allocate buffer for record");
+    goto end;
+  }
+  if (translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    goto end;
+  }
+  buff= log_record_buffer.str;
+  /* record body: fileid, page number, dir position, then the row data */
+  if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn,
+                                             HEAD_PAGE,
+                                             (rec->type ==
+                                              LOGREC_REDO_NEW_ROW_HEAD),
+                                             buff + FILEID_STORE_SIZE,
+                                             buff +
+                                             FILEID_STORE_SIZE +
+                                             PAGE_STORE_SIZE +
+                                             DIRPOS_STORE_SIZE,
+                                             rec->record_length -
+                                             (FILEID_STORE_SIZE +
+                                              PAGE_STORE_SIZE +
+                                              DIRPOS_STORE_SIZE)))
+    goto end;
+  error= 0;
+end:
+  return error;
+}
+
+/*
+ NOTE
+ This is called for REDO_INSERT_ROW_TAIL and READ_NEW_ROW_TAIL
+*/
+
+/*
+  Replay a REDO_INSERT_ROW_TAIL / REDO_NEW_ROW_TAIL record: re-insert the
+  logged row data into a tail page if the on-disk page is older than the
+  record.
+
+  @return 0 on success or skip, 1 on error.
+*/
+prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL)
+{
+  uchar *buff;
+  /* record body: fileid, page number, dir position, then the row data */
+  const uint header_size= FILEID_STORE_SIZE + PAGE_STORE_SIZE +
+    DIRPOS_STORE_SIZE;
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+  if (info == NULL || maria_is_crashed(info))
+    return 0;                       /* table skipped earlier; not an error */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+  buff= log_record_buffer.str;
+  if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn,
+                                             TAIL_PAGE,
+                                             (rec->type ==
+                                              LOGREC_REDO_NEW_ROW_TAIL),
+                                             buff + FILEID_STORE_SIZE,
+                                             buff + header_size,
+                                             rec->record_length -
+                                             header_size))
+    return 1;
+  return 0;
+}
+
+
+/*
+  Replay a REDO_INSERT_ROW_BLOBS record: re-write the logged blob page ranges
+  if the on-disk pages are older than the record.
+
+  @return 0 on success or skip, 1 on error.
+*/
+prototype_redo_exec_hook(REDO_INSERT_ROW_BLOBS)
+{
+  int error= 1;
+  uchar *buff;
+  uint number_of_blobs, number_of_ranges;
+  pgcache_page_no_t first_page, last_page;
+  char llbuf1[22], llbuf2[22];
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    goto end;
+  }
+  buff= log_record_buffer.str;
+  if (_ma_apply_redo_insert_row_blobs(info, current_group_end_lsn,
+                                      buff, rec->lsn, &number_of_blobs,
+                                      &number_of_ranges,
+                                      &first_page, &last_page))
+    goto end;
+  /* trace what was applied, for debugging of recovery */
+  llstr(first_page, llbuf1);
+  llstr(last_page, llbuf2);
+  tprint(tracef, " %u blobs %u ranges, first page %s last %s",
+         number_of_blobs, number_of_ranges, llbuf1, llbuf2);
+
+  error= 0;
+
+end:
+  tprint(tracef, " \n");
+  return error;
+}
+
+
+prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD)
+{
+  /*
+    Replay the purge of a row head. The page and directory position are in
+    the record header, right after the file id; no body read is needed.
+    Skipped for tables unknown to recovery or already marked crashed.
+  */
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+  return _ma_apply_redo_purge_row_head_or_tail(info, current_group_end_lsn,
+                                               HEAD_PAGE,
+                                               rec->header +
+                                               FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL)
+{
+  /*
+    Replay the purge of a row tail; same header layout as the head variant,
+    only the page type differs. Skipped for unknown/crashed tables.
+  */
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+  return _ma_apply_redo_purge_row_head_or_tail(info, current_group_end_lsn,
+                                               TAIL_PAGE,
+                                               rec->header +
+                                               FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_FREE_BLOCKS)
+{
+  int error= 1;
+  uchar *buff;
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+  /* Table unknown to recovery or already marked crashed: nothing to do */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+  /* Read the whole record body into the shared log_record_buffer */
+  enlarge_buffer(rec);
+
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    goto end;
+  }
+
+  buff= log_record_buffer.str;
+  /* The description of blocks to free starts right after the file id */
+  if (_ma_apply_redo_free_blocks(info, current_group_end_lsn,
+                                 buff + FILEID_STORE_SIZE))
+    goto end;
+  error= 0;
+end:
+  return error;
+}
+
+
+prototype_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL)
+{
+  /*
+    Replay the freeing of a head or tail page; the information needed is
+    entirely in the record header after the file id, so no body read.
+    Skipped for tables unknown to recovery or already marked crashed.
+  */
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+  return _ma_apply_redo_free_head_or_tail(info, current_group_end_lsn,
+                                          rec->header +
+                                          FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_DELETE_ALL)
+{
+  /*
+    Replay a "delete all rows" operation by re-running it on the table.
+    Note: unlike most REDO hooks, a crashed table is not skipped here.
+  */
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  if (info == NULL)
+    return 0;
+  tprint(tracef, " deleting all %lu rows\n",
+         (ulong)info->s->state.state.records);
+  return maria_delete_all_rows(info) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_INDEX)
+{
+  int error= 1;
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+  /* Table unknown to recovery or already marked crashed: nothing to do */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+  /* Read the whole record body into the shared log_record_buffer */
+  enlarge_buffer(rec);
+
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    goto end;
+  }
+
+  /* The index change description follows the file id in the record */
+  if (_ma_apply_redo_index(info, current_group_end_lsn,
+                           log_record_buffer.str + FILEID_STORE_SIZE,
+                           rec->record_length - FILEID_STORE_SIZE))
+    goto end;
+  error= 0;
+end:
+  return error;
+}
+
+prototype_redo_exec_hook(REDO_INDEX_NEW_PAGE)
+{
+  int error= 1;
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+  /* Table unknown to recovery or already marked crashed: nothing to do */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+  /* Read the whole record body into the shared log_record_buffer */
+  enlarge_buffer(rec);
+
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    goto end;
+  }
+
+  /* The new-page description follows the file id in the record */
+  if (_ma_apply_redo_index_new_page(info, current_group_end_lsn,
+                                    log_record_buffer.str + FILEID_STORE_SIZE,
+                                    rec->record_length - FILEID_STORE_SIZE))
+    goto end;
+  error= 0;
+end:
+  return error;
+}
+
+
+prototype_redo_exec_hook(REDO_INDEX_FREE_PAGE)
+{
+  /*
+    Replay the freeing of an index page; the page number is in the record
+    header after the file id, so no body read is needed. Skipped for
+    tables unknown to recovery or already marked crashed.
+  */
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+  return _ma_apply_redo_index_free_page(info, current_group_end_lsn,
+                                        rec->header +
+                                        FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_BITMAP_NEW_PAGE)
+{
+  int error= 1;
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+  /* Table unknown to recovery or already marked crashed: nothing to do */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+  /* Read the whole record body into the shared log_record_buffer */
+  enlarge_buffer(rec);
+
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    goto end;
+  }
+
+  if (cmp_translog_addr(rec->lsn, checkpoint_start) >= 0)
+  {
+    /*
+      Record is potentially after the bitmap flush made by Checkpoint, so has
+      to be replayed. It may overwrite a more recent state but that will be
+      corrected by all upcoming REDOs for data pages.
+      If the condition is false, we must not apply the record: it is unneeded
+      and harmful (it may not be corrected later, as REDOs can be skipped due
+      to the dirty-pages list).
+    */
+    if (_ma_apply_redo_bitmap_new_page(info, current_group_end_lsn,
+                                       log_record_buffer.str +
+                                       FILEID_STORE_SIZE))
+      goto end;
+  }
+  error= 0;
+end:
+  return error;
+}
+
+
+/*
+  Remember the latest UNDO record of an active transaction, and the first
+  one if none was recorded yet. Transactions with long_trid == 0 are not
+  known to recovery (they committed or fully rolled back long ago) and are
+  left untouched.
+*/
+static inline void set_undo_lsn_for_active_trans(uint16 short_trid, LSN lsn)
+{
+  if (all_active_trans[short_trid].long_trid != 0)
+  {
+    all_active_trans[short_trid].undo_lsn= lsn;
+    if (all_active_trans[short_trid].first_undo_lsn == LSN_IMPOSSIBLE)
+      all_active_trans[short_trid].first_undo_lsn= lsn;
+  }
+}
+
+
+prototype_redo_exec_hook(UNDO_ROW_INSERT)
+{
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  MARIA_SHARE *share;
+
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  if (info == NULL)
+  {
+    /*
+      Note that we set undo_lsn anyway. So that if the transaction is later
+      rolled back, this UNDO is tried for execution and we get a warning (as
+      it would then be abnormal that info==NULL).
+    */
+    return 0;
+  }
+  share= info->s;
+  /* Only update the state if it predates this record */
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    tprint(tracef, " state has LSN (%lu,0x%lx) older than record, updating"
+           " rows' count\n", LSN_IN_PARTS(share->state.is_of_horizon));
+    share->state.state.records++;
+    if (share->calc_checksum)
+    {
+      uchar buff[HA_CHECKSUM_STORE_SIZE];
+      /*
+        The stored row checksum sits after the LSN/fileid/page/dirpos
+        prefix of the record; read only those bytes.
+      */
+      if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                               PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+                               HA_CHECKSUM_STORE_SIZE, buff, NULL) !=
+          HA_CHECKSUM_STORE_SIZE)
+      {
+        eprint(tracef, "Failed to read record");
+        return 1;
+      }
+      share->state.state.checksum+= ha_checksum_korr(buff);
+    }
+    info->s->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                              STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+  }
+  tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records);
+  /* Unpin all pages, stamp them with UNDO's LSN */
+  _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_ROW_DELETE)
+{
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  MARIA_SHARE *share;
+
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  if (info == NULL)
+    return 0;
+  share= info->s;
+  /* Only update the state if it predates this record */
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    tprint(tracef, " state older than record\n");
+    share->state.state.records--;
+    if (share->calc_checksum)
+    {
+      uchar buff[HA_CHECKSUM_STORE_SIZE];
+      /*
+        The checksum is stored deeper into this record than for
+        UNDO_ROW_INSERT: 2 extra bytes plus a page-range field precede it
+        (NOTE(review): matches the layout used when the record is written;
+        verify against the writer if this offset is changed).
+      */
+      if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                               PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 +
+                               PAGERANGE_STORE_SIZE,
+                               HA_CHECKSUM_STORE_SIZE, buff, NULL) !=
+          HA_CHECKSUM_STORE_SIZE)
+      {
+        eprint(tracef, "Failed to read record");
+        return 1;
+      }
+      share->state.state.checksum+= ha_checksum_korr(buff);
+    }
+    share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                            STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED |
+                            STATE_NOT_MOVABLE);
+  }
+  tprint(tracef, " rows' count %lu\n", (ulong)share->state.state.records);
+  _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_ROW_UPDATE)
+{
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  MARIA_SHARE *share;
+
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  if (info == NULL)
+    return 0;
+  share= info->s;
+  /*
+    An update does not change the row count; only the live checksum delta
+    (stored after the LSN/fileid/page/dirpos prefix) needs applying, and
+    only if the state predates this record.
+  */
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    if (share->calc_checksum)
+    {
+      uchar buff[HA_CHECKSUM_STORE_SIZE];
+      if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                               PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+                               HA_CHECKSUM_STORE_SIZE, buff, NULL) !=
+          HA_CHECKSUM_STORE_SIZE)
+      {
+        eprint(tracef, "Failed to read record");
+        return 1;
+      }
+      share->state.state.checksum+= ha_checksum_korr(buff);
+    }
+    share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                            STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+  }
+  _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_KEY_INSERT)
+{
+  MARIA_HA *info;
+  MARIA_SHARE *share;
+
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  if (!(info= get_MARIA_HA_from_UNDO_record(rec)))
+    return 0;
+  share= info->s;
+  /* Only update the state if it predates this record */
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    const uchar *ptr= rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE;
+    uint keynr= key_nr_korr(ptr);
+    if (share->base.auto_key == (keynr + 1)) /* it's auto-increment */
+    {
+      const HA_KEYSEG *keyseg= info->s->keyinfo[keynr].seg;
+      ulonglong value;
+      char llbuf[22];
+      uchar *to;
+      tprint(tracef, " state older than record\n");
+      /* we read the record to find the auto_increment value */
+      enlarge_buffer(rec);
+      if (log_record_buffer.str == NULL ||
+          translog_read_record(rec->lsn, 0, rec->record_length,
+                               log_record_buffer.str, NULL) !=
+          rec->record_length)
+      {
+        eprint(tracef, "Failed to read record");
+        return 1;
+      }
+      /* The key value starts after the LSN/fileid/keynr prefix */
+      to= log_record_buffer.str + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+        KEY_NR_STORE_SIZE;
+      if (keyseg->flag & HA_SWAP_KEY)
+      {
+        /* We put key from log record to "data record" packing format... */
+        uchar reversed[MARIA_MAX_KEY_BUFF];
+        uchar *key_ptr= to;
+        uchar *key_end= key_ptr + keyseg->length;
+        to= reversed + keyseg->length;
+        do
+        {
+          *--to= *key_ptr++;
+        } while (key_ptr != key_end);
+        /* ... so that we can read it with: */
+      }
+      /* Push the table's auto_increment counter past this key's value */
+      value= ma_retrieve_auto_increment(to, keyseg->type);
+      set_if_bigger(share->state.auto_increment, value);
+      llstr(share->state.auto_increment, llbuf);
+      tprint(tracef, " auto-inc %s\n", llbuf);
+    }
+  }
+  _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_KEY_DELETE)
+{
+  /*
+    During the REDO phase an UNDO_KEY_DELETE only advances the transaction's
+    undo chain; any pages pinned by the group are stamped with this record's
+    LSN and released.
+  */
+  MARIA_HA *info;
+
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  info= get_MARIA_HA_from_UNDO_record(rec);
+  if (info != NULL)
+    _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT)
+{
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  MARIA_SHARE *share;
+
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  if (info == NULL)
+    return 0;
+  share= info->s;
+  /* Only update the state if it predates this record */
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    uint key_nr;
+    my_off_t page;
+    key_nr= key_nr_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE);
+    page= page_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                    KEY_NR_STORE_SIZE);
+    /*
+      Restore this key's root pointer; the sentinel IMPOSSIBLE_PAGE_NO
+      means "no root" and maps to HA_OFFSET_ERROR.
+    */
+    share->state.key_root[key_nr]= (page == IMPOSSIBLE_PAGE_NO ?
+                                    HA_OFFSET_ERROR :
+                                    page * share->block_size);
+  }
+  _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_BULK_INSERT)
+{
+  /*
+    Nothing to replay: if the repair finished, it wrote and synced the
+    state; if it did not finish, the table will be emptied on rollback,
+    which fixes the state. Only the transaction's undo chain is tracked.
+  */
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(IMPORTED_TABLE)
+{
+  /*
+    Trace-only hook: the record body holds the name of a table that was
+    imported (auto-zerofilled) into this Aria instance.
+  */
+  char *name;
+
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL)
+    goto read_failure;
+  if (translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+    goto read_failure;
+  name= (char *)log_record_buffer.str;
+  tprint(tracef, "Table '%s' was imported (auto-zerofilled) in this Aria instance\n", name);
+  return 0;
+
+read_failure:
+  eprint(tracef, "Failed to read record");
+  return 1;
+}
+
+
+prototype_redo_exec_hook(COMMIT)
+{
+  /*
+    A COMMIT ends the transaction of this short_trid: wipe its slot so a
+    later reuse of the same short_trid starts a fresh transaction. Unknown
+    transactions (long_trid == 0) are just forgotten.
+  */
+  uint16 sid= rec->short_trid;
+  TrID long_trid= all_active_trans[sid].long_trid;
+  char llbuf[22];
+
+  if (long_trid != 0)
+  {
+    llstr(long_trid, llbuf);
+    tprint(tracef, "Transaction long_trid %s short_trid %u committed\n",
+           llbuf, sid);
+  }
+  else
+    tprint(tracef, "We don't know about transaction with short_trid %u;"
+           "it probably committed long ago, forget it\n", sid);
+  bzero(&all_active_trans[sid], sizeof(all_active_trans[sid]));
+#ifdef MARIA_VERSIONING
+  /*
+    if real recovery:
+    transaction was committed, move it to some separate list for later
+    purging (but don't purge now! purging may have been started before, we
+    may find REDO_PURGE records soon).
+  */
+#endif
+  return 0;
+}
+
+/*
+  Replay a CLR_END: the compensation record written when an UNDO has been
+  executed during rollback. Steps the transaction's undo chain back to
+  previous_undo_lsn and, if the table state predates this record, re-applies
+  the state change implied by the undone record type (row count, checksum,
+  key roots).
+*/
+prototype_redo_exec_hook(CLR_END)
+{
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  MARIA_SHARE *share;
+  LSN previous_undo_lsn;
+  enum translog_record_type undone_record_type;
+  const LOG_DESC *log_desc;
+  my_bool row_entry= 0;
+  uchar *logpos;
+  DBUG_ENTER("exec_REDO_LOGREC_CLR_END");
+
+  previous_undo_lsn= lsn_korr(rec->header);
+  undone_record_type=
+    clr_type_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE);
+  log_desc= &log_record_type_descriptor[undone_record_type];
+
+  set_undo_lsn_for_active_trans(rec->short_trid, previous_undo_lsn);
+  if (info == NULL)
+    DBUG_RETURN(0);
+  share= info->s;
+  tprint(tracef, " CLR_END was about %s, undo_lsn now LSN (%lu,0x%lx)\n",
+         log_desc->name, LSN_IN_PARTS(previous_undo_lsn));
+
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    /*
+      Was a plain "return 1", which leaves the DBUG call stack unbalanced:
+      a function that does DBUG_ENTER must exit through DBUG_RETURN.
+    */
+    DBUG_RETURN(1);
+  }
+  /* Payload specific to the undone record type starts after the prefix */
+  logpos= (log_record_buffer.str + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+           CLR_TYPE_STORE_SIZE);
+
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    tprint(tracef, " state older than record\n");
+    switch (undone_record_type) {
+    case LOGREC_UNDO_ROW_DELETE:
+      /* rollback of a delete re-inserts the row */
+      row_entry= 1;
+      share->state.state.records++;
+      break;
+    case LOGREC_UNDO_ROW_INSERT:
+      /* rollback of an insert removes the row */
+      share->state.state.records--;
+      share->state.changed|= STATE_NOT_OPTIMIZED_ROWS;
+      row_entry= 1;
+      break;
+    case LOGREC_UNDO_ROW_UPDATE:
+      row_entry= 1;
+      break;
+    case LOGREC_UNDO_KEY_INSERT:
+    case LOGREC_UNDO_KEY_DELETE:
+      break;
+    case LOGREC_UNDO_KEY_INSERT_WITH_ROOT:
+    case LOGREC_UNDO_KEY_DELETE_WITH_ROOT:
+    {
+      /* Restore the key's root pointer stored in the CLR_END payload */
+      uint key_nr;
+      my_off_t page;
+      key_nr= key_nr_korr(logpos);
+      page= page_korr(logpos + KEY_NR_STORE_SIZE);
+      share->state.key_root[key_nr]= (page == IMPOSSIBLE_PAGE_NO ?
+                                      HA_OFFSET_ERROR :
+                                      page * share->block_size);
+      break;
+    }
+    case LOGREC_UNDO_BULK_INSERT:
+      break;
+    default:
+      DBUG_ASSERT(0);
+    }
+    /* Row-level CLRs carry a checksum delta at the payload start */
+    if (row_entry && share->calc_checksum)
+      share->state.state.checksum+= ha_checksum_korr(logpos);
+    share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                            STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+  }
+  if (row_entry)
+    tprint(tracef, " rows' count %lu\n", (ulong)share->state.state.records);
+  _ma_unpin_all_pages(info, rec->lsn);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Hook to print debug information (like MySQL query)
+*/
+
+prototype_redo_exec_hook(DEBUG_INFO)
+{
+  uchar *data;
+  enum translog_debug_info_type debug_info;
+
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record debug record");
+    return 1;
+  }
+  /* First byte is the sub-type, the rest is the payload */
+  debug_info= (enum translog_debug_info_type) log_record_buffer.str[0];
+  data= log_record_buffer.str + 1;
+  switch (debug_info) {
+  case LOGREC_DEBUG_INFO_QUERY:
+    /*
+      NOTE(review): %.*s expects an int width; assumes record_length - 1
+      fits in (signed) int — confirm record_length's type if this ever
+      handles very large records.
+    */
+    tprint(tracef, "Query: %.*s\n", rec->record_length - 1,
+           (char*) data);
+    break;
+  default:
+    DBUG_ASSERT(0);
+  }
+  return 0;
+}
+
+
+/**
+ In some cases we have to skip execution of an UNDO record during the UNDO
+ phase.
+*/
+
+/*
+  Skip execution of one UNDO record during the UNDO phase: step the
+  transaction's undo chain back to the previous record; when the chain is
+  exhausted, keep only the flag bits of first_undo_lsn (the transaction is
+  then fully rolled back). Skips are counted for later reporting.
+*/
+static void skip_undo_record(LSN previous_undo_lsn, TRN *trn)
+{
+  skipped_undo_phase++;
+  trn->undo_lsn= previous_undo_lsn;
+  if (previous_undo_lsn == LSN_IMPOSSIBLE)
+    trn->first_undo_lsn= LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
+}
+
+
+prototype_undo_exec_hook(UNDO_ROW_INSERT)
+{
+  my_bool error;
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  LSN previous_undo_lsn= lsn_korr(rec->header);
+  MARIA_SHARE *share;
+  const uchar *record_ptr;
+
+  if (info == NULL || maria_is_crashed(info))
+  {
+    /*
+      Unlike for REDOs, if the table was skipped it is abnormal; we have a
+      transaction to rollback which used this table, as it is not rolled back
+      it was supposed to hold this table and so the table should still be
+      there. Skip it (user may have repaired the table with maria_chk because
+      it was so badly corrupted that a previous recovery failed) but warn.
+    */
+    skip_undo_record(previous_undo_lsn, trn);
+    return 0;
+  }
+  share= info->s;
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                          STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+  record_ptr= rec->header;
+  if (share->calc_checksum)
+  {
+    /*
+      We need to read more of the record to put the checksum into the record
+      buffer used by _ma_apply_undo_row_insert().
+      If the table has no live checksum, rec->header will be enough.
+    */
+    enlarge_buffer(rec);
+    if (log_record_buffer.str == NULL ||
+        translog_read_record(rec->lsn, 0, rec->record_length,
+                             log_record_buffer.str, NULL) !=
+        rec->record_length)
+    {
+      eprint(tracef, "Failed to read record");
+      return 1;
+    }
+    record_ptr= log_record_buffer.str;
+  }
+
+  /* Attach the transaction for the duration of the undo execution */
+  info->trn= trn;
+  error= _ma_apply_undo_row_insert(info, previous_undo_lsn,
+                                   record_ptr + LSN_STORE_SIZE +
+                                   FILEID_STORE_SIZE);
+  info->trn= 0;
+  /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
+  tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records);
+  tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return error;
+}
+
+
+prototype_undo_exec_hook(UNDO_ROW_DELETE)
+{
+  my_bool error;
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  LSN previous_undo_lsn= lsn_korr(rec->header);
+  MARIA_SHARE *share;
+
+  /* Missing/crashed table: abnormal during rollback, skip but count it */
+  if (info == NULL || maria_is_crashed(info))
+  {
+    skip_undo_record(previous_undo_lsn, trn);
+    return 0;
+  }
+
+  share= info->s;
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                          STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+  /* The undo needs the full record body (the deleted row's image) */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  info->trn= trn;
+  error= _ma_apply_undo_row_delete(info, previous_undo_lsn,
+                                   log_record_buffer.str + LSN_STORE_SIZE +
+                                   FILEID_STORE_SIZE,
+                                   rec->record_length -
+                                   (LSN_STORE_SIZE + FILEID_STORE_SIZE));
+  info->trn= 0;
+  tprint(tracef, " rows' count %lu\n undo_lsn now LSN (%lu,0x%lx)\n",
+         (ulong)share->state.state.records, LSN_IN_PARTS(trn->undo_lsn));
+  return error;
+}
+
+
+prototype_undo_exec_hook(UNDO_ROW_UPDATE)
+{
+  my_bool error;
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  LSN previous_undo_lsn= lsn_korr(rec->header);
+  MARIA_SHARE *share;
+
+  /* Missing/crashed table: abnormal during rollback, skip but count it */
+  if (info == NULL || maria_is_crashed(info))
+  {
+    skip_undo_record(previous_undo_lsn, trn);
+    return 0;
+  }
+
+  share= info->s;
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                          STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+  /* The undo needs the full record body (the row's before-image) */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  info->trn= trn;
+  error= _ma_apply_undo_row_update(info, previous_undo_lsn,
+                                   log_record_buffer.str + LSN_STORE_SIZE +
+                                   FILEID_STORE_SIZE,
+                                   rec->record_length -
+                                   (LSN_STORE_SIZE + FILEID_STORE_SIZE));
+  info->trn= 0;
+  tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return error;
+}
+
+
+prototype_undo_exec_hook(UNDO_KEY_INSERT)
+{
+  my_bool error;
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  LSN previous_undo_lsn= lsn_korr(rec->header);
+  MARIA_SHARE *share;
+
+  /* Missing/crashed table: abnormal during rollback, skip but count it */
+  if (info == NULL || maria_is_crashed(info))
+  {
+    skip_undo_record(previous_undo_lsn, trn);
+    return 0;
+  }
+
+  share= info->s;
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                          STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+
+  /* The undo needs the full record body (the inserted key) */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  info->trn= trn;
+  error= _ma_apply_undo_key_insert(info, previous_undo_lsn,
+                                   log_record_buffer.str + LSN_STORE_SIZE +
+                                   FILEID_STORE_SIZE,
+                                   rec->record_length - LSN_STORE_SIZE -
+                                   FILEID_STORE_SIZE);
+  info->trn= 0;
+  /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
+  tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return error;
+}
+
+
+prototype_undo_exec_hook(UNDO_KEY_DELETE)
+{
+  my_bool error;
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  LSN previous_undo_lsn= lsn_korr(rec->header);
+  MARIA_SHARE *share;
+
+  /* Missing/crashed table: abnormal during rollback, skip but count it */
+  if (info == NULL || maria_is_crashed(info))
+  {
+    skip_undo_record(previous_undo_lsn, trn);
+    return 0;
+  }
+
+  share= info->s;
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                          STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+
+  /* The undo needs the full record body (the deleted key) */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  info->trn= trn;
+  /* Last argument FALSE: this variant carries no new key-root pointer */
+  error= _ma_apply_undo_key_delete(info, previous_undo_lsn,
+                                   log_record_buffer.str + LSN_STORE_SIZE +
+                                   FILEID_STORE_SIZE,
+                                   rec->record_length - LSN_STORE_SIZE -
+                                   FILEID_STORE_SIZE, FALSE);
+  info->trn= 0;
+  /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
+  tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return error;
+}
+
+
+prototype_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT)
+{
+  my_bool error;
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  LSN previous_undo_lsn= lsn_korr(rec->header);
+  MARIA_SHARE *share;
+
+  /* Missing/crashed table: abnormal during rollback, skip but count it */
+  if (info == NULL || maria_is_crashed(info))
+  {
+    skip_undo_record(previous_undo_lsn, trn);
+    return 0;
+  }
+
+  share= info->s;
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                          STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+
+  /* The undo needs the full record body (the deleted key) */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  info->trn= trn;
+  /* Last argument TRUE: this variant also restores the key-root pointer */
+  error= _ma_apply_undo_key_delete(info, previous_undo_lsn,
+                                   log_record_buffer.str + LSN_STORE_SIZE +
+                                   FILEID_STORE_SIZE,
+                                   rec->record_length - LSN_STORE_SIZE -
+                                   FILEID_STORE_SIZE, TRUE);
+  info->trn= 0;
+  /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
+  tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return error;
+}
+
+
+prototype_undo_exec_hook(UNDO_BULK_INSERT)
+{
+  my_bool error;
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  LSN previous_undo_lsn= lsn_korr(rec->header);
+  MARIA_SHARE *share;
+
+  /* Here we don't check for crashed as we can undo the bulk insert */
+  if (info == NULL)
+  {
+    skip_undo_record(previous_undo_lsn, trn);
+    return 0;
+  }
+
+  share= info->s;
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                          STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+
+  /* No body read needed: the undo works from the table itself */
+  info->trn= trn;
+  error= _ma_apply_undo_bulk_insert(info, previous_undo_lsn);
+  info->trn= 0;
+  /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
+  tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return error;
+}
+
+
+/*
+  Run the REDO phase: scan the log from 'lsn' (up to 'lsn_end' if given),
+  group records per transaction, and — when 'apply' requests it — execute
+  complete groups via the hooks installed below. Returns 0 on success,
+  1 on failure.
+*/
+static int run_redo_phase(LSN lsn, LSN lsn_end, enum maria_apply_log_way apply)
+{
+  TRANSLOG_HEADER_BUFFER rec;
+  struct st_translog_scanner_data scanner;
+  int len;
+  uint i;
+  DBUG_ENTER("run_redo_phase");
+
+  /* install hooks for execution */
+#define install_redo_exec_hook(R) \
+  log_record_type_descriptor[LOGREC_ ## R].record_execute_in_redo_phase= \
+    exec_REDO_LOGREC_ ## R;
+#define install_redo_exec_hook_shared(R,S) \
+  log_record_type_descriptor[LOGREC_ ## R].record_execute_in_redo_phase= \
+    exec_REDO_LOGREC_ ## S;
+#define install_undo_exec_hook(R) \
+  log_record_type_descriptor[LOGREC_ ## R].record_execute_in_undo_phase= \
+    exec_UNDO_LOGREC_ ## R;
+  install_redo_exec_hook(LONG_TRANSACTION_ID);
+  install_redo_exec_hook(CHECKPOINT);
+  install_redo_exec_hook(REDO_CREATE_TABLE);
+  install_redo_exec_hook(REDO_RENAME_TABLE);
+  install_redo_exec_hook(REDO_REPAIR_TABLE);
+  install_redo_exec_hook(REDO_DROP_TABLE);
+  install_redo_exec_hook(FILE_ID);
+  install_redo_exec_hook(INCOMPLETE_LOG);
+  install_redo_exec_hook(INCOMPLETE_GROUP);
+  install_redo_exec_hook(REDO_INSERT_ROW_HEAD);
+  install_redo_exec_hook(REDO_INSERT_ROW_TAIL);
+  install_redo_exec_hook(REDO_INSERT_ROW_BLOBS);
+  install_redo_exec_hook(REDO_PURGE_ROW_HEAD);
+  install_redo_exec_hook(REDO_PURGE_ROW_TAIL);
+  install_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL);
+  install_redo_exec_hook(REDO_FREE_BLOCKS);
+  install_redo_exec_hook(REDO_DELETE_ALL);
+  install_redo_exec_hook(REDO_INDEX);
+  install_redo_exec_hook(REDO_INDEX_NEW_PAGE);
+  install_redo_exec_hook(REDO_INDEX_FREE_PAGE);
+  install_redo_exec_hook(REDO_BITMAP_NEW_PAGE);
+  install_redo_exec_hook(UNDO_ROW_INSERT);
+  install_redo_exec_hook(UNDO_ROW_DELETE);
+  install_redo_exec_hook(UNDO_ROW_UPDATE);
+  install_redo_exec_hook(UNDO_KEY_INSERT);
+  install_redo_exec_hook(UNDO_KEY_DELETE);
+  install_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
+  install_redo_exec_hook(COMMIT);
+  install_redo_exec_hook(CLR_END);
+  install_undo_exec_hook(UNDO_ROW_INSERT);
+  install_undo_exec_hook(UNDO_ROW_DELETE);
+  install_undo_exec_hook(UNDO_ROW_UPDATE);
+  install_undo_exec_hook(UNDO_KEY_INSERT);
+  install_undo_exec_hook(UNDO_KEY_DELETE);
+  install_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
+  /* REDO_NEW_ROW_HEAD shares entry with REDO_INSERT_ROW_HEAD */
+  install_redo_exec_hook_shared(REDO_NEW_ROW_HEAD, REDO_INSERT_ROW_HEAD);
+  /* REDO_NEW_ROW_TAIL shares entry with REDO_INSERT_ROW_TAIL */
+  install_redo_exec_hook_shared(REDO_NEW_ROW_TAIL, REDO_INSERT_ROW_TAIL);
+  install_redo_exec_hook(UNDO_BULK_INSERT);
+  install_undo_exec_hook(UNDO_BULK_INSERT);
+  install_redo_exec_hook(IMPORTED_TABLE);
+  install_redo_exec_hook(DEBUG_INFO);
+
+  current_group_end_lsn= LSN_IMPOSSIBLE;
+#ifndef DBUG_OFF
+  current_group_table= NULL;
+#endif
+
+  /* Nothing to scan if the start LSN is unset or already at the horizon */
+  if (unlikely(lsn == LSN_IMPOSSIBLE || lsn == translog_get_horizon()))
+  {
+    tprint(tracef, "checkpoint address refers to the log end log or "
+           "log is empty, nothing to do.\n");
+    DBUG_RETURN(0);
+  }
+
+  len= translog_read_record_header(lsn, &rec);
+
+  if (len == RECHEADER_READ_ERROR)
+  {
+    eprint(tracef, "Failed to read header of the first record.");
+    DBUG_RETURN(1);
+  }
+  if (translog_scanner_init(lsn, 1, &scanner, 1))
+  {
+    tprint(tracef, "Scanner init failed\n");
+    DBUG_RETURN(1);
+  }
+  /* Main scan: one iteration per log record, 'i' is a display counter */
+  for (i= 1;;i++)
+  {
+    uint16 sid= rec.short_trid;
+    const LOG_DESC *log_desc= &log_record_type_descriptor[rec.type];
+    display_record_position(log_desc, &rec, i);
+    /*
+      A complete group is a set of log records with an "end mark" record
+      (e.g. a set of REDOs for an operation, terminated by an UNDO for this
+      operation); if there is no "end mark" record the group is incomplete and
+      won't be executed.
+    */
+    if ((log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF) ||
+        (log_desc->record_in_group == LOGREC_LAST_IN_GROUP))
+    {
+      if (all_active_trans[sid].group_start_lsn != LSN_IMPOSSIBLE)
+      {
+        if (log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF)
+        {
+          /*
+            Can happen if the transaction got a table write error, then
+            unlocked tables thus wrote a COMMIT record. Or can be an
+            INCOMPLETE_GROUP record written by a previous recovery.
+          */
+          tprint(tracef, "\nDiscarding incomplete group before this record\n");
+          all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
+        }
+        else
+        {
+          struct st_translog_scanner_data scanner2;
+          TRANSLOG_HEADER_BUFFER rec2;
+          /*
+            There is a complete group for this transaction, containing more
+            than this event.
+          */
+          tprint(tracef, " ends a group:\n");
+          len=
+            translog_read_record_header(all_active_trans[sid].group_start_lsn,
+                                        &rec2);
+          if (len < 0) /* EOF or error */
+          {
+            tprint(tracef, "Cannot find record where it should be\n");
+            goto err;
+          }
+          if (lsn_end != LSN_IMPOSSIBLE && rec2.lsn >= lsn_end)
+          {
+            tprint(tracef,
+                   "lsn_end reached at (%lu,0x%lx). "
+                   "Skipping rest of redo entries",
+                   LSN_IN_PARTS(rec2.lsn));
+            translog_destroy_scanner(&scanner);
+            translog_free_record_header(&rec);
+            DBUG_RETURN(0);
+          }
+
+          /* Re-scan the group from its first record with a second scanner */
+          if (translog_scanner_init(rec2.lsn, 1, &scanner2, 1))
+          {
+            tprint(tracef, "Scanner2 init failed\n");
+            goto err;
+          }
+          current_group_end_lsn= rec.lsn;
+          do
+          {
+            if (rec2.short_trid == sid) /* it's in our group */
+            {
+              const LOG_DESC *log_desc2= &log_record_type_descriptor[rec2.type];
+              display_record_position(log_desc2, &rec2, 0);
+              if (apply == MARIA_LOG_CHECK)
+              {
+                /* Check-only mode: verify the record body is readable */
+                translog_size_t read_len;
+                enlarge_buffer(&rec2);
+                read_len=
+                  translog_read_record(rec2.lsn, 0, rec2.record_length,
+                                       log_record_buffer.str, NULL);
+                if (read_len != rec2.record_length)
+                {
+                  tprint(tracef, "Cannot read record's body: read %u of"
+                         " %u bytes\n", read_len, rec2.record_length);
+                  translog_destroy_scanner(&scanner2);
+                  translog_free_record_header(&rec2);
+                  goto err;
+                }
+              }
+              if (apply == MARIA_LOG_APPLY &&
+                  display_and_apply_record(log_desc2, &rec2))
+              {
+                translog_destroy_scanner(&scanner2);
+                translog_free_record_header(&rec2);
+                goto err;
+              }
+            }
+            translog_free_record_header(&rec2);
+            len= translog_read_next_record_header(&scanner2, &rec2);
+            if (len < 0) /* EOF or error */
+            {
+              tprint(tracef, "Cannot find record where it should be\n");
+              translog_destroy_scanner(&scanner2);
+              translog_free_record_header(&rec2);
+              goto err;
+            }
+          }
+          while (rec2.lsn < rec.lsn);
+          /* group finished */
+          all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
+          current_group_end_lsn= LSN_IMPOSSIBLE; /* for debugging */
+          display_record_position(log_desc, &rec, 0);
+          translog_destroy_scanner(&scanner2);
+          translog_free_record_header(&rec2);
+        }
+      }
+      /* Finally execute the end-mark record itself */
+      if (apply == MARIA_LOG_APPLY &&
+          display_and_apply_record(log_desc, &rec))
+        goto err;
+#ifndef DBUG_OFF
+      current_group_table= NULL;
+#endif
+    }
+    else /* record does not end group */
+    {
+      /* just record the fact, can't know if can execute yet */
+      if (all_active_trans[sid].group_start_lsn == LSN_IMPOSSIBLE)
+      {
+        /* group not yet started */
+        all_active_trans[sid].group_start_lsn= rec.lsn;
+      }
+    }
+    translog_free_record_header(&rec);
+    len= translog_read_next_record_header(&scanner, &rec);
+    if (len < 0)
+    {
+      switch (len)
+      {
+      case RECHEADER_READ_EOF:
+        tprint(tracef, "EOF on the log\n");
+        break;
+      case RECHEADER_READ_ERROR:
+        tprint(tracef, "Error reading log\n");
+        goto err;
+      }
+      break;
+    }
+  }
+  translog_destroy_scanner(&scanner);
+  translog_free_record_header(&rec);
+  /* Complete the progress display started earlier in recovery */
+  if (recovery_message_printed == REC_MSG_REDO)
+  {
+    fprintf(stderr, " 100%%");
+    fflush(stderr);
+    procent_printed= 1;
+  }
+  DBUG_RETURN(0);
+
+err:
+  translog_destroy_scanner(&scanner);
+  translog_free_record_header(&rec);
+  DBUG_RETURN(1);
+}
+
+
+/**
+ @brief Informs about any aborted groups or uncommitted transactions,
+ prepares for the UNDO phase if needed.
+
+ @note Observe that it may init trnman.
+*/
+static uint end_of_redo_phase(my_bool prepare_for_undo_phase)
+{
+ uint sid, uncommitted= 0;
+ char llbuf[22];
+ LSN addr;
+
+ hash_free(&all_dirty_pages);
+ /*
+    hash_free() can probably be called multiple times, but be safe in case
+    that changes
+ */
+ bzero(&all_dirty_pages, sizeof(all_dirty_pages));
+ my_free(dirty_pages_pool, MYF(MY_ALLOW_ZERO_PTR));
+ dirty_pages_pool= NULL;
+
+ llstr(max_long_trid, llbuf);
+ tprint(tracef, "Maximum transaction long id seen: %s\n", llbuf);
+ llstr(max_trid_in_control_file, llbuf);
+ tprint(tracef, "Maximum transaction long id seen in control file: %s\n",
+ llbuf);
+ /*
+ If logs were deleted, or lost, trid in control file is needed to set
+ trnman's generator:
+ */
+ set_if_bigger(max_long_trid, max_trid_in_control_file);
+ if (prepare_for_undo_phase && trnman_init(max_long_trid))
+ return -1;
+
+ trns_created= TRUE;
+
+ for (sid= 0; sid <= SHORT_TRID_MAX; sid++)
+ {
+ TrID long_trid= all_active_trans[sid].long_trid;
+ LSN gslsn= all_active_trans[sid].group_start_lsn;
+ TRN *trn;
+ if (gslsn != LSN_IMPOSSIBLE)
+ {
+ tprint(tracef, "Group at LSN (%lu,0x%lx) short_trid %u incomplete\n",
+ LSN_IN_PARTS(gslsn), sid);
+ all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
+ }
+ if (all_active_trans[sid].undo_lsn != LSN_IMPOSSIBLE)
+ {
+ llstr(long_trid, llbuf);
+ tprint(tracef, "Transaction long_trid %s short_trid %u uncommitted\n",
+ llbuf, sid);
+ /*
+ dummy_transaction_object serves only for DDLs, where there is never a
+ rollback or incomplete group. And unknown transactions (which have
+ long_trid==0) should have undo_lsn==LSN_IMPOSSIBLE.
+ */
+ if (long_trid ==0)
+ {
+ eprint(tracef, "Transaction with long_trid 0 should not roll back");
+ ALERT_USER();
+ return -1;
+ }
+ if (prepare_for_undo_phase)
+ {
+ if ((trn= trnman_recreate_trn_from_recovery(sid, long_trid)) == NULL)
+ return -1;
+ trn->undo_lsn= all_active_trans[sid].undo_lsn;
+ trn->first_undo_lsn= all_active_trans[sid].first_undo_lsn |
+ TRANSACTION_LOGGED_LONG_ID; /* because trn is known in log */
+ if (gslsn != LSN_IMPOSSIBLE)
+ {
+ /*
+ UNDO phase will log some records. So, a future recovery may see:
+ REDO(from incomplete group) - REDO(from rollback) - CLR_END
+ and thus execute the first REDO (finding it in "a complete
+ group"). To prevent that:
+ */
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS];
+ LSN lsn;
+ if (translog_write_record(&lsn, LOGREC_INCOMPLETE_GROUP,
+ trn, NULL, 0,
+ TRANSLOG_INTERNAL_PARTS, log_array,
+ NULL, NULL))
+ return -1;
+ }
+ }
+ uncommitted++;
+ }
+#ifdef MARIA_VERSIONING
+ /*
+ If real recovery: if transaction was committed, move it to some separate
+ list for soon purging.
+ */
+#endif
+ }
+
+ my_free(all_active_trans, MYF(MY_ALLOW_ZERO_PTR));
+ all_active_trans= NULL;
+
+ /*
+ The UNDO phase uses some normal run-time code of ROLLBACK: generates log
+ records, etc; prepare tables for that
+ */
+ addr= translog_get_horizon();
+ for (sid= 0; sid <= SHARE_ID_MAX; sid++)
+ {
+ MARIA_HA *info= all_tables[sid].info;
+ if (info != NULL)
+ {
+ prepare_table_for_close(info, addr);
+ /*
+ But we don't close it; we leave it available for the UNDO phase;
+ it's likely that the UNDO phase will need it.
+ */
+ if (prepare_for_undo_phase)
+ translog_assign_id_to_share_from_recovery(info->s, sid);
+ }
+ }
+ return uncommitted;
+}
+
+
+static int run_undo_phase(uint uncommitted)
+{
+ LSN last_undo;
+ DBUG_ENTER("run_undo_phase");
+
+ if (uncommitted > 0)
+ {
+ checkpoint_useful= TRUE;
+ if (tracef != stdout)
+ {
+ if (recovery_message_printed == REC_MSG_NONE)
+ print_preamble();
+ fprintf(stderr, "transactions to roll back:");
+ recovery_message_printed= REC_MSG_UNDO;
+ }
+ tprint(tracef, "%u transactions will be rolled back\n", uncommitted);
+ procent_printed= 1;
+ for( ; ; )
+ {
+ char llbuf[22];
+ TRN *trn;
+ if (recovery_message_printed == REC_MSG_UNDO)
+ {
+ fprintf(stderr, " %u", uncommitted);
+ fflush(stderr);
+ }
+ if ((uncommitted--) == 0)
+ break;
+ trn= trnman_get_any_trn();
+ DBUG_ASSERT(trn != NULL);
+ llstr(trn->trid, llbuf);
+ tprint(tracef, "Rolling back transaction of long id %s\n", llbuf);
+ last_undo= trn->undo_lsn + 1;
+
+ /* Execute all undo entries */
+ while (trn->undo_lsn)
+ {
+ TRANSLOG_HEADER_BUFFER rec;
+ LOG_DESC *log_desc;
+ DBUG_ASSERT(trn->undo_lsn < last_undo);
+ last_undo= trn->undo_lsn;
+
+ if (translog_read_record_header(trn->undo_lsn, &rec) ==
+ RECHEADER_READ_ERROR)
+ DBUG_RETURN(1);
+ log_desc= &log_record_type_descriptor[rec.type];
+ display_record_position(log_desc, &rec, 0);
+ if (log_desc->record_execute_in_undo_phase(&rec, trn))
+ {
+ eprint(tracef, "Got error %d when executing undo %s", my_errno,
+ log_desc->name);
+ translog_free_record_header(&rec);
+ DBUG_RETURN(1);
+ }
+ translog_free_record_header(&rec);
+ }
+
+ if (trnman_rollback_trn(trn))
+ DBUG_RETURN(1);
+ /* We could want to span a few threads (4?) instead of 1 */
+ /* In the future, we want to have this phase *online* */
+ }
+ }
+ procent_printed= 0;
+ DBUG_RETURN(0);
+}
+
+
+/**
+ In case of error in recovery, deletes all transactions from the transaction
+ manager so that this module does not assert.
+
+ @note no checkpoint should be taken as those transactions matter for the
+ next recovery (they still haven't been properly dealt with).
+*/
+
+static void delete_all_transactions()
+{
+ for( ; ; )
+ {
+ TRN *trn= trnman_get_any_trn();
+ if (trn == NULL)
+ break;
+ trn->undo_lsn= trn->first_undo_lsn= LSN_IMPOSSIBLE;
+ trnman_rollback_trn(trn); /* ignore error */
+ }
+}
+
+
+/**
+ @brief re-enables transactionality, updates is_of_horizon
+
+ @param info table
+ @param horizon address to set is_of_horizon
+*/
+
+static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon)
+{
+ MARIA_SHARE *share= info->s;
+ /*
+ In a fully-forward REDO phase (no checkpoint record),
+ state is now at least as new as the LSN of the current record. It may be
+ newer, in case we are seeing a LOGREC_FILE_ID which tells us to close a
+ table, but that table was later modified further in the log.
+ But if we parsed a checkpoint record, it may be this way in the log:
+ FILE_ID(6->t2)... FILE_ID(6->t1)... CHECKPOINT(6->t1)
+ Checkpoint parsing opened t1 with id 6; first FILE_ID above is going to
+ make t1 close; the first condition below is however false (when checkpoint
+ was taken it increased is_of_horizon) and so it works. For safety we
+ add the second condition.
+ */
+ if (cmp_translog_addr(share->state.is_of_horizon, horizon) < 0 &&
+ cmp_translog_addr(share->lsn_of_file_id, horizon) < 0)
+ {
+ share->state.is_of_horizon= horizon;
+ _ma_state_info_write_sub(share->kfile.file, &share->state,
+ MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
+ }
+
+ /*
+ Ensure that info->state is up to date as
+    _ma_reenable_logging_for_table() depends on this
+ */
+ *info->state= info->s->state.state;
+
+ /*
+ This leaves PAGECACHE_PLAIN_PAGE pages into the cache, while the table is
+ going to switch back to transactional. So the table will be a mix of
+ pages, which is ok as long as we don't take any checkpoints until all
+ tables get closed at the end of the UNDO phase.
+ */
+ _ma_reenable_logging_for_table(info, FALSE);
+ info->trn= NULL; /* safety */
+}
+
+
+static MARIA_HA *get_MARIA_HA_from_REDO_record(const
+ TRANSLOG_HEADER_BUFFER *rec)
+{
+ uint16 sid;
+ pgcache_page_no_t page;
+ MARIA_HA *info;
+ MARIA_SHARE *share;
+ char llbuf[22];
+ my_bool index_page_redo_entry= FALSE, page_redo_entry= FALSE;
+ LINT_INIT(page);
+
+ print_redo_phase_progress(rec->lsn);
+ sid= fileid_korr(rec->header);
+ switch (rec->type) {
+ /* not all REDO records have a page: */
+ case LOGREC_REDO_INDEX_NEW_PAGE:
+ case LOGREC_REDO_INDEX:
+ case LOGREC_REDO_INDEX_FREE_PAGE:
+ index_page_redo_entry= 1;
+    /* Fall through */
+ case LOGREC_REDO_INSERT_ROW_HEAD:
+ case LOGREC_REDO_INSERT_ROW_TAIL:
+ case LOGREC_REDO_PURGE_ROW_HEAD:
+ case LOGREC_REDO_PURGE_ROW_TAIL:
+ case LOGREC_REDO_NEW_ROW_HEAD:
+ case LOGREC_REDO_NEW_ROW_TAIL:
+ case LOGREC_REDO_FREE_HEAD_OR_TAIL:
+ page_redo_entry= TRUE;
+ page= page_korr(rec->header + FILEID_STORE_SIZE);
+ llstr(page, llbuf);
+ break;
+ /*
+ For REDO_FREE_BLOCKS, no need to look at dirty pages list: it does not
+ read data pages, only reads/modifies bitmap page(s) which is cheap.
+ */
+ default:
+ break;
+ }
+ tprint(tracef, " For table of short id %u", sid);
+ info= all_tables[sid].info;
+#ifndef DBUG_OFF
+ DBUG_ASSERT(current_group_table == NULL || current_group_table == info);
+ current_group_table= info;
+#endif
+ if (info == NULL)
+ {
+ tprint(tracef, ", table skipped, so skipping record\n");
+ return NULL;
+ }
+ share= info->s;
+ tprint(tracef, ", '%s'", share->open_file_name.str);
+ DBUG_ASSERT(in_redo_phase);
+ if (cmp_translog_addr(rec->lsn, share->lsn_of_file_id) <= 0)
+ {
+ /*
+ This can happen only if processing a record before the checkpoint
+ record.
+ id->name mapping is newer than REDO record: for sure the table subject
+ of the REDO has been flushed and forced (id re-assignment implies this);
+ REDO can be ignored (and must be, as we don't know what this subject
+ table was).
+ */
+ DBUG_ASSERT(cmp_translog_addr(rec->lsn, checkpoint_start) < 0);
+ tprint(tracef, ", table's LOGREC_FILE_ID has LSN (%lu,0x%lx) more recent"
+ " than record, skipping record",
+ LSN_IN_PARTS(share->lsn_of_file_id));
+ return NULL;
+ }
+ if (cmp_translog_addr(rec->lsn, share->state.skip_redo_lsn) <= 0)
+ {
+ /* probably a bulk insert repair */
+ tprint(tracef, ", has skip_redo_lsn (%lu,0x%lx) more recent than"
+ " record, skipping record\n",
+ LSN_IN_PARTS(share->state.skip_redo_lsn));
+ return NULL;
+ }
+ /* detect if an open instance of a dropped table (internal bug) */
+ DBUG_ASSERT(share->last_version != 0);
+ if (page_redo_entry)
+ {
+ /*
+ Consult dirty pages list.
+ REDO_INSERT_ROW_BLOBS will consult list by itself, as it covers several
+ pages.
+ */
+ tprint(tracef, " page %s", llbuf);
+ if (_ma_redo_not_needed_for_page(sid, rec->lsn, page,
+ index_page_redo_entry))
+ return NULL;
+ }
+ /*
+ So we are going to read the page, and if its LSN is older than the
+ record's we will modify the page
+ */
+ tprint(tracef, ", applying record\n");
+ _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */
+ return info;
+}
+
+
+static MARIA_HA *get_MARIA_HA_from_UNDO_record(const
+ TRANSLOG_HEADER_BUFFER *rec)
+{
+ uint16 sid;
+ MARIA_HA *info;
+ MARIA_SHARE *share;
+
+ sid= fileid_korr(rec->header + LSN_STORE_SIZE);
+ tprint(tracef, " For table of short id %u", sid);
+ info= all_tables[sid].info;
+#ifndef DBUG_OFF
+ DBUG_ASSERT(!in_redo_phase ||
+ current_group_table == NULL || current_group_table == info);
+ current_group_table= info;
+#endif
+ if (info == NULL)
+ {
+ tprint(tracef, ", table skipped, so skipping record\n");
+ return NULL;
+ }
+ share= info->s;
+ tprint(tracef, ", '%s'", share->open_file_name.str);
+ if (cmp_translog_addr(rec->lsn, share->lsn_of_file_id) <= 0)
+ {
+ tprint(tracef, ", table's LOGREC_FILE_ID has LSN (%lu,0x%lx) more recent"
+ " than record, skipping record",
+ LSN_IN_PARTS(share->lsn_of_file_id));
+ return NULL;
+ }
+ if (in_redo_phase &&
+ cmp_translog_addr(rec->lsn, share->state.skip_redo_lsn) <= 0)
+ {
+ /* probably a bulk insert repair */
+ tprint(tracef, ", has skip_redo_lsn (%lu,0x%lx) more recent than"
+ " record, skipping record\n",
+ LSN_IN_PARTS(share->state.skip_redo_lsn));
+ return NULL;
+ }
+ DBUG_ASSERT(share->last_version != 0);
+ _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */
+ tprint(tracef, ", applying record\n");
+ return info;
+}
+
+
+/**
+ @brief Parses checkpoint record.
+
+ Builds from it the dirty_pages list (a hash), opens tables and maps them to
+ their 2-byte IDs, recreates transactions (not real TRNs though).
+
+ @return LSN from where in the log the REDO phase should start
+ @retval LSN_ERROR error
+ @retval other ok
+*/
+
+static LSN parse_checkpoint_record(LSN lsn)
+{
+ ulong i;
+ ulonglong nb_dirty_pages;
+ TRANSLOG_HEADER_BUFFER rec;
+ TRANSLOG_ADDRESS start_address;
+ int len;
+ uint nb_active_transactions, nb_committed_transactions, nb_tables;
+ uchar *ptr;
+ LSN minimum_rec_lsn_of_active_transactions, minimum_rec_lsn_of_dirty_pages;
+ struct st_dirty_page *next_dirty_page_in_pool;
+
+ tprint(tracef, "Loading data from checkpoint record at LSN (%lu,0x%lx)\n",
+ LSN_IN_PARTS(lsn));
+ if ((len= translog_read_record_header(lsn, &rec)) == RECHEADER_READ_ERROR)
+ {
+ tprint(tracef, "Cannot find checkpoint record where it should be\n");
+ return LSN_ERROR;
+ }
+
+ enlarge_buffer(&rec);
+ if (log_record_buffer.str == NULL ||
+ translog_read_record(rec.lsn, 0, rec.record_length,
+ log_record_buffer.str, NULL) !=
+ rec.record_length)
+ {
+ eprint(tracef, "Failed to read record");
+ return LSN_ERROR;
+ }
+
+ ptr= log_record_buffer.str;
+ start_address= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+ tprint(tracef, "Checkpoint record has start_horizon at (%lu,0x%lx)\n",
+ LSN_IN_PARTS(start_address));
+
+ /* transactions */
+ nb_active_transactions= uint2korr(ptr);
+ ptr+= 2;
+ tprint(tracef, "%u active transactions\n", nb_active_transactions);
+ minimum_rec_lsn_of_active_transactions= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+ max_long_trid= transid_korr(ptr);
+ ptr+= TRANSID_SIZE;
+
+ /*
+ how much brain juice and discussions there was to come to writing this
+ line. It may make start_address slightly decrease (only by the time it
+ takes to write one or a few rows, roughly).
+ */
+ tprint(tracef, "Checkpoint record has min_rec_lsn of active transactions"
+ " at (%lu,0x%lx)\n",
+ LSN_IN_PARTS(minimum_rec_lsn_of_active_transactions));
+ set_if_smaller(start_address, minimum_rec_lsn_of_active_transactions);
+
+ for (i= 0; i < nb_active_transactions; i++)
+ {
+ uint16 sid= uint2korr(ptr);
+ TrID long_id;
+ LSN undo_lsn, first_undo_lsn;
+ ptr+= 2;
+ long_id= uint6korr(ptr);
+ ptr+= 6;
+ DBUG_ASSERT(sid > 0 && long_id > 0);
+ undo_lsn= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+ first_undo_lsn= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+ new_transaction(sid, long_id, undo_lsn, first_undo_lsn);
+ }
+ nb_committed_transactions= uint4korr(ptr);
+ ptr+= 4;
+ tprint(tracef, "%lu committed transactions\n",
+ (ulong)nb_committed_transactions);
+ /* no purging => committed transactions are not important */
+ ptr+= (6 + LSN_STORE_SIZE) * nb_committed_transactions;
+
+ /* tables */
+ nb_tables= uint4korr(ptr);
+ ptr+= 4;
+ tprint(tracef, "%u open tables\n", nb_tables);
+ for (i= 0; i< nb_tables; i++)
+ {
+ char name[FN_REFLEN];
+ LSN first_log_write_lsn;
+ uint name_len;
+ uint16 sid= uint2korr(ptr);
+ ptr+= 2;
+ DBUG_ASSERT(sid > 0);
+ first_log_write_lsn= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+ name_len= strlen((char *)ptr) + 1;
+ strmake(name, (char *)ptr, sizeof(name)-1);
+ ptr+= name_len;
+ if (new_table(sid, name, first_log_write_lsn))
+ return LSN_ERROR;
+ }
+
+ /* dirty pages */
+ nb_dirty_pages= uint8korr(ptr);
+
+  /* Ensure casts later will not lose significant bits. */
+ DBUG_ASSERT((nb_dirty_pages <= SIZE_T_MAX/sizeof(struct st_dirty_page)) &&
+ (nb_dirty_pages <= ULONG_MAX));
+
+ ptr+= 8;
+ tprint(tracef, "%lu dirty pages\n", (ulong) nb_dirty_pages);
+ if (hash_init(&all_dirty_pages, &my_charset_bin, (ulong)nb_dirty_pages,
+ offsetof(struct st_dirty_page, file_and_page_id),
+ sizeof(((struct st_dirty_page *)NULL)->file_and_page_id),
+ NULL, NULL, 0))
+ return LSN_ERROR;
+ dirty_pages_pool=
+ (struct st_dirty_page *)my_malloc((size_t)nb_dirty_pages *
+ sizeof(struct st_dirty_page),
+ MYF(MY_WME));
+ if (unlikely(dirty_pages_pool == NULL))
+ return LSN_ERROR;
+ next_dirty_page_in_pool= dirty_pages_pool;
+ minimum_rec_lsn_of_dirty_pages= LSN_MAX;
+ if (maria_recovery_verbose)
+ tprint(tracef, "Table_id Is_index Page_id Rec_lsn\n");
+ for (i= 0; i < nb_dirty_pages ; i++)
+ {
+ pgcache_page_no_t page_id;
+ LSN rec_lsn;
+ uint32 is_index;
+ uint16 table_id= uint2korr(ptr);
+ ptr+= 2;
+ is_index= ptr[0];
+ ptr++;
+ page_id= page_korr(ptr);
+ ptr+= PAGE_STORE_SIZE;
+ rec_lsn= lsn_korr(ptr);
+ ptr+= LSN_STORE_SIZE;
+ if (new_page((is_index << 16) | table_id,
+ page_id, rec_lsn, next_dirty_page_in_pool++))
+ return LSN_ERROR;
+ if (maria_recovery_verbose)
+ tprint(tracef, "%8u %8u %12lu %lu,0x%lx\n", (uint) table_id,
+ (uint) is_index, (ulong) page_id, LSN_IN_PARTS(rec_lsn));
+ set_if_smaller(minimum_rec_lsn_of_dirty_pages, rec_lsn);
+ }
+ /* after that, there will be no insert/delete into the hash */
+ /*
+ sanity check on record (did we screw up with all those "ptr+=", did the
+ checkpoint write code and checkpoint read code go out of sync?).
+ */
+ if (ptr != (log_record_buffer.str + log_record_buffer.length))
+ {
+ eprint(tracef, "checkpoint record corrupted\n");
+ return LSN_ERROR;
+ }
+
+ /*
+ start_address is now from where the dirty pages list can be ignored.
+ Find LSN higher or equal to this TRANSLOG_ADDRESS, suitable for
+ translog_read_record() functions.
+ */
+ start_address= checkpoint_start=
+ translog_next_LSN(start_address, LSN_IMPOSSIBLE);
+ tprint(tracef, "Checkpoint record start_horizon now adjusted to"
+ " LSN (%lu,0x%lx)\n", LSN_IN_PARTS(start_address));
+ if (checkpoint_start == LSN_IMPOSSIBLE)
+ {
+ /*
+ There must be a problem, as our checkpoint record exists and is >= the
+ address which is stored in its first bytes, which is >= start_address.
+ */
+ return LSN_ERROR;
+ }
+ /* now, where the REDO phase should start reading log: */
+ tprint(tracef, "Checkpoint has min_rec_lsn of dirty pages at"
+ " LSN (%lu,0x%lx)\n", LSN_IN_PARTS(minimum_rec_lsn_of_dirty_pages));
+ set_if_smaller(start_address, minimum_rec_lsn_of_dirty_pages);
+ DBUG_PRINT("info",
+ ("checkpoint_start: (%lu,0x%lx) start_address: (%lu,0x%lx)",
+ LSN_IN_PARTS(checkpoint_start), LSN_IN_PARTS(start_address)));
+ return start_address;
+}
+
+
+static int new_page(uint32 fileid, pgcache_page_no_t pageid, LSN rec_lsn,
+ struct st_dirty_page *dirty_page)
+{
+ /* serves as hash key */
+ dirty_page->file_and_page_id= (((uint64)fileid) << 40) | pageid;
+ dirty_page->rec_lsn= rec_lsn;
+ return my_hash_insert(&all_dirty_pages, (uchar *)dirty_page);
+}
+
+
+static int close_all_tables(void)
+{
+ int error= 0;
+ uint count= 0;
+ LIST *list_element, *next_open;
+ MARIA_HA *info;
+ TRANSLOG_ADDRESS addr;
+ DBUG_ENTER("close_all_tables");
+
+ pthread_mutex_lock(&THR_LOCK_maria);
+ if (maria_open_list == NULL)
+ goto end;
+ tprint(tracef, "Closing all tables\n");
+ if (tracef != stdout)
+ {
+ if (recovery_message_printed == REC_MSG_NONE)
+ print_preamble();
+ for (count= 0, list_element= maria_open_list ;
+ list_element ; count++, (list_element= list_element->next))
+ ;
+ fprintf(stderr, "tables to flush:");
+ recovery_message_printed= REC_MSG_FLUSH;
+ }
+ /*
+ Since the end of end_of_redo_phase(), we may have written new records
+ (if UNDO phase ran) and thus the state is newer than at
+ end_of_redo_phase(), we need to bump is_of_horizon again.
+ */
+ addr= translog_get_horizon();
+ for (list_element= maria_open_list ; ; list_element= next_open)
+ {
+ if (recovery_message_printed == REC_MSG_FLUSH)
+ {
+ fprintf(stderr, " %u", count--);
+ fflush(stderr);
+ }
+ if (list_element == NULL)
+ break;
+ next_open= list_element->next;
+ info= (MARIA_HA*)list_element->data;
+ pthread_mutex_unlock(&THR_LOCK_maria); /* ok, UNDO phase not online yet */
+ /*
+ Tables which we see here are exactly those which were open at time of
+ crash. They might have open_count>0 as Checkpoint maybe flushed their
+ state while they were used. As Recovery corrected them, don't alarm the
+ user, don't ask for a table check:
+ */
+ if (info->s->state.open_count != 0)
+ {
+ /* let ma_close() mark the table properly closed */
+ info->s->state.open_count= 1;
+ info->s->global_changed= 1;
+ }
+ prepare_table_for_close(info, addr);
+ error|= maria_close(info);
+ pthread_mutex_lock(&THR_LOCK_maria);
+ }
+end:
+ pthread_mutex_unlock(&THR_LOCK_maria);
+ DBUG_RETURN(error);
+}
+
+
+/**
+ @brief Close all table instances with a certain name which are present in
+ all_tables.
+
+ @param name Name of table
+ @param addr Log address passed to prepare_table_for_close()
+*/
+
+static my_bool close_one_table(const char *name, TRANSLOG_ADDRESS addr)
+{
+ my_bool res= 0;
+ /* There are no other threads using the tables, so we don't need any locks */
+ struct st_table_for_recovery *internal_table, *end;
+ for (internal_table= all_tables, end= internal_table + SHARE_ID_MAX + 1;
+ internal_table < end ;
+ internal_table++)
+ {
+ MARIA_HA *info= internal_table->info;
+ if ((info != NULL) && !strcmp(info->s->open_file_name.str, name))
+ {
+ prepare_table_for_close(info, addr);
+ if (maria_close(info))
+ res= 1;
+ internal_table->info= NULL;
+ }
+ }
+ return res;
+}
+
+
+/**
+ Temporarily disables logging for this table.
+
+ If that makes the log incomplete, writes a LOGREC_INCOMPLETE_LOG to the log
+ to warn log readers.
+
+ @param info table
+ @param log_incomplete if that disabling makes the log incomplete
+
+ @note for example in the REDO phase we disable logging but that does not
+ make the log incomplete.
+*/
+
+void _ma_tmp_disable_logging_for_table(MARIA_HA *info,
+ my_bool log_incomplete)
+{
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("_ma_tmp_disable_logging_for_table");
+ if (log_incomplete)
+ {
+ uchar log_data[FILEID_STORE_SIZE];
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+ LSN lsn;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ translog_write_record(&lsn, LOGREC_INCOMPLETE_LOG,
+ &dummy_transaction_object, info,
+ (translog_size_t) sizeof(log_data),
+ TRANSLOG_INTERNAL_PARTS + 1, log_array,
+ log_data, NULL);
+ }
+
+ /* if we disabled before writing the record, record wouldn't reach log */
+ share->now_transactional= FALSE;
+
+ /*
+ Reset state pointers. This is needed as in ALTER table we may do
+    commit followed by _ma_reenable_logging_for_table() and then
+ info->state may point to a state that was deleted by
+ _ma_trnman_end_trans_hook()
+ */
+ share->state.common= *info->state;
+ info->state= &share->state.common;
+ info->switched_transactional= TRUE;
+
+ /*
+ Some code in ma_blockrec.c assumes a trn even if !now_transactional but in
+ this case it only reads trn->rec_lsn, which has to be LSN_IMPOSSIBLE and
+ should be now. info->trn may be NULL in maria_chk.
+ */
+ if (info->trn == NULL)
+ info->trn= &dummy_transaction_object;
+ DBUG_ASSERT(info->trn->rec_lsn == LSN_IMPOSSIBLE);
+ share->page_type= PAGECACHE_PLAIN_PAGE;
+ /* Functions below will pick up now_transactional and change callbacks */
+ _ma_set_data_pagecache_callbacks(&info->dfile, share);
+ _ma_set_index_pagecache_callbacks(&share->kfile, share);
+ _ma_bitmap_set_pagecache_callbacks(&share->bitmap.file, share);
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ Re-enables logging for a table which had it temporarily disabled.
+
+ Only the thread which disabled logging is allowed to reenable it. Indeed,
+ re-enabling logging affects all open instances, one must have exclusive
+ access to the table to do that. In practice, the one which disables has
+ such access.
+
+ @param info table
+ @param flush_pages if function needs to flush pages first
+*/
+
+my_bool _ma_reenable_logging_for_table(MARIA_HA *info, my_bool flush_pages)
+{
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("_ma_reenable_logging_for_table");
+
+ if (share->now_transactional == share->base.born_transactional ||
+ !info->switched_transactional)
+ DBUG_RETURN(0);
+ info->switched_transactional= FALSE;
+
+ if ((share->now_transactional= share->base.born_transactional))
+ {
+ share->page_type= PAGECACHE_LSN_PAGE;
+
+ /*
+ Copy state information that where updated while the table was used
+ in not transactional mode
+ */
+ _ma_copy_nontrans_state_information(info);
+ _ma_reset_history(info->s);
+
+ if (flush_pages)
+ {
+ /*
+ We are going to change callbacks; if a page is flushed at this moment
+ this can cause race conditions, that's one reason to flush pages
+ now. Other reasons: a checkpoint could be running and miss pages; the
+ pages have type PAGECACHE_PLAIN_PAGE which should not remain. As
+        there are no REDOs for these pages, so they, the bitmaps and the state have to
+ be flushed and synced.
+ */
+ if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_RELEASE, FLUSH_RELEASE) ||
+ _ma_state_info_write(share,
+ MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+ MA_STATE_INFO_WRITE_LOCK) ||
+ _ma_sync_table_files(info))
+ DBUG_RETURN(1);
+ }
+ else if (!maria_in_recovery)
+ {
+ /*
+ Except in Recovery, we mustn't leave dirty pages (see comments above).
+ Note that this does not verify that the state was flushed, but hey.
+ */
+ pagecache_file_no_dirty_page(share->pagecache, &info->dfile);
+ pagecache_file_no_dirty_page(share->pagecache, &share->kfile);
+ }
+ _ma_set_data_pagecache_callbacks(&info->dfile, share);
+ _ma_set_index_pagecache_callbacks(&share->kfile, share);
+ _ma_bitmap_set_pagecache_callbacks(&share->bitmap.file, share);
+ /*
+ info->trn was not changed in the disable/enable combo, so that it's
+ still usable in this kind of combination:
+ external_lock;
+ start_bulk_insert; # table is empty, disables logging
+ end_bulk_insert; # enables logging
+ start_bulk_insert; # table is not empty, logging stays
+ # so rows insertion needs the real trn.
+ as happens during row-based replication on the slave.
+ */
+ }
+ DBUG_RETURN(0);
+}
+
+
+static void print_redo_phase_progress(TRANSLOG_ADDRESS addr)
+{
+ static uint end_logno= FILENO_IMPOSSIBLE, percentage_printed= 0;
+ static ulong end_offset;
+ static ulonglong initial_remainder= ~(ulonglong) 0;
+
+ uint cur_logno;
+ ulong cur_offset;
+ ulonglong local_remainder;
+ uint percentage_done;
+
+ if (tracef == stdout)
+ return;
+ if (recovery_message_printed == REC_MSG_NONE)
+ {
+ print_preamble();
+ fprintf(stderr, "recovered pages: 0%%");
+ fflush(stderr);
+ procent_printed= 1;
+ recovery_message_printed= REC_MSG_REDO;
+ }
+ if (end_logno == FILENO_IMPOSSIBLE)
+ {
+ LSN end_addr= translog_get_horizon();
+ end_logno= LSN_FILE_NO(end_addr);
+ end_offset= LSN_OFFSET(end_addr);
+ }
+ cur_logno= LSN_FILE_NO(addr);
+ cur_offset= LSN_OFFSET(addr);
+ local_remainder= (cur_logno == end_logno) ? (end_offset - cur_offset) :
+ (((longlong)log_file_size) - cur_offset +
+ max(end_logno - cur_logno - 1, 0) * ((longlong)log_file_size) +
+ end_offset);
+ if (initial_remainder == (ulonglong)(-1))
+ initial_remainder= local_remainder;
+ percentage_done= (uint) ((initial_remainder - local_remainder) * ULL(100) /
+ initial_remainder);
+ if ((percentage_done - percentage_printed) >= 10)
+ {
+ percentage_printed= percentage_done;
+ fprintf(stderr, " %u%%", percentage_done);
+ fflush(stderr);
+ procent_printed= 1;
+ }
+}
+
+
+#ifdef MARIA_EXTERNAL_LOCKING
+#error Marias Checkpoint and Recovery are really not ready for it
+#endif
+
+/*
+Recovery of the state : how it works
+=====================================
+
+Here we ignore Checkpoints for a start.
+
+The state (MARIA_HA::MARIA_SHARE::MARIA_STATE_INFO) is updated in
+memory frequently (at least at every row write/update/delete) but goes
+to disk at few moments: maria_close() when closing the last open
+instance, and a few rare places like CHECK/REPAIR/ALTER
+(non-transactional tables also do it at maria_lock_database() but we
+needn't cover them here).
+
+In case of crash, state on disk is likely to be older than what it was
+in memory, the REDO phase needs to recreate the state as it was in
+memory at the time of crash. When we say Recovery here we will always
+mean "REDO phase".
+
+For example MARIA_STATUS_INFO::records (count of records). It is updated at
+the end of every row write/update/delete/delete_all. When Recovery sees the
+sign of such row operation (UNDO or REDO), it may need to update the records'
+count if that count does not reflect that operation (is older). How to know
+the age of the state compared to the log record: every time the state
+goes to disk at runtime, its member "is_of_horizon" is updated to the
+current end-of-log horizon. So Recovery just needs to compare is_of_horizon
+and the record's LSN to know if it should modify "records".
+
+Other operations like ALTER TABLE DISABLE KEYS update the state but
+don't write log records, thus the REDO phase cannot repeat their
+effect on the state in case of crash. But we make them sync the state
+as soon as they have finished. This reduces the window for a problem.
+
+It looks like only one thread at a time updates the state in memory or
+on disk. We assume that the upper level (normally MySQL) has protection
+against issuing HA_EXTRA_(FORCE_REOPEN|PREPARE_FOR_RENAME) so that these
+are not issued while there are any running transactions on the given table.
+If this is not done, we may write a corrupted state to disk.
+
+With checkpoints
+================
+
+Checkpoint module needs to read the state in memory and write it to
+disk. This may happen while some other thread is modifying the state
+in memory or on disk. Checkpoint thus may be reading changing data, it
+needs a mutex to not have it corrupted, and concurrent modifiers of
+the state need that mutex too for the same reason.
+"records" is modified for every row write/update/delete, we don't want
+to add a mutex lock/unlock there. So we re-use the mutex lock/unlock
+which is already present in these moments, namely the log's mutex which is
+taken when UNDO_ROW_INSERT|UPDATE|DELETE is written: we update "records" in
+under-log-mutex hooks when writing these records (thus "records" is
+not updated at the end of maria_write/update/delete() anymore).
+Thus Checkpoint takes the log's lock and can read "records" from
+memory and write it to disk and release log's lock.
+We however want to avoid having the disk write under the log's
+lock. So it has to be under another mutex, natural choice is
+intern_lock (as Checkpoint needs it anyway to read MARIA_SHARE::kfile,
+and as maria_close() takes it too). All state writes to disk are
+changed to be protected with intern_lock.
+So Checkpoint takes intern_lock, log's lock, reads "records" from
+memory, releases log's lock, updates is_of_horizon and writes "records" to
+disk, release intern_lock.
+In practice, not only "records" needs to be written but the full
+state. So, Checkpoint reads the full state from memory. Some other
+thread may at this moment be modifying in memory some pieces of the
+state which are not protected by the lock's log (see ma_extra.c
+HA_EXTRA_NO_KEYS), and Checkpoint would be reading a corrupted state
+from memory; to guard against that we extend the intern_lock-zone to
+changes done to the state in memory by HA_EXTRA_NO_KEYS et al, and
+also any change made in memory to create_rename_lsn/state_is_of_horizon.
+Last, we don't want in Checkpoint to do
+ log lock; read state from memory; release log lock;
+for each table, it may hold the log's lock too much in total.
+So, we instead do
+ log lock; read N states from memory; release log lock;
+Thus, the sequence above happens outside of any intern_lock.
+But this re-introduces the problem that some other thread may be changing the
+state in memory and on disk under intern_lock, without log's lock, like
+HA_EXTRA_NO_KEYS, while we read the N states. However, when Checkpoint later
+comes to handling the table under intern_lock, which is serialized with
+HA_EXTRA_NO_KEYS, it can see that is_of_horizon is higher than when the state
+was read from memory under log's lock, and thus can decide to not flush the
+obsolete state it has, knowing that the other thread flushed a more recent
+state already. If on the other hand is_of_horizon is not higher, the read
+state is current and can be flushed. So we have a per-table sequence:
+ lock intern_lock; test if is_of_horizon is higher than when we read the state
+ under log's lock; if no then flush the read state to disk.
+*/
+
+/* some comments and pseudo-code which we keep for later */
+#if 0
+ /*
+ MikaelR suggests: support checkpoints during REDO phase too: do checkpoint
+ after a certain amount of log records have been executed. This helps
+ against repeated crashes. Those checkpoints could not be user-requested
+ (as engine is not communicating during the REDO phase), so they would be
+ automatic: this changes the original assumption that we don't write to the
+ log while in the REDO phase, but why not. How often should we checkpoint?
+ */
+
+ /*
+ We want to have two steps:
+ engine->recover_with_max_memory();
+ next_engine->recover_with_max_memory();
+ engine->init_with_normal_memory();
+ next_engine->init_with_normal_memory();
+ So: in recover_with_max_memory() allocate a giant page cache, do REDO
+ phase, then all page cache is flushed and emptied and freed (only retain
+ small structures like TM): take full checkpoint, which is useful if
+ next engine crashes in its recovery the next second.
+ Destroy all shares (maria_close()), then at init_with_normal_memory() we
+ do this:
+ */
+
+ /**** UNDO PHASE *****/
+
+ /*
+ Launch one or more threads to do the background rollback. Don't wait for
+ them to complete their rollback (background rollback; for debugging, we
+ can have an option which waits). Set a counter (total_of_rollback_threads)
+  to the number of threads to launch.
+
+ Note that InnoDB's rollback-in-background works as long as InnoDB is the
+ last engine to recover, otherwise MySQL will refuse new connections until
+ the last engine has recovered so it's not "background" from the user's
+ point of view. InnoDB is near top of sys_table_types so all others
+ (e.g. BDB) recover after it... So it's really "online rollback" only if
+ InnoDB is the only engine.
+ */
+
+ /* wake up delete/update handler */
+ /* tell the TM that it can now accept new transactions */
+
+ /*
+ mark that checkpoint requests are now allowed.
+ */
+#endif
diff --git a/storage/maria/ma_recovery.h b/storage/maria/ma_recovery.h
new file mode 100644
index 00000000000..0bfcdd17d39
--- /dev/null
+++ b/storage/maria/ma_recovery.h
@@ -0,0 +1,33 @@
+/* Copyright (C) 2006,2007 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3072 Maria recovery
+ First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+/* This is the interface of this module. */
+
+/* Performs recovery of the engine at start */
+
+C_MODE_START
+/* How maria_apply_log() should process the log records */
+enum maria_apply_log_way
+{ MARIA_LOG_APPLY, MARIA_LOG_DISPLAY_HEADER, MARIA_LOG_CHECK };
+int maria_recovery_from_log(void);
+/*
+  Applies the log from 'lsn' to 'lsn_end' in the requested way; see
+  ma_recovery.c for the exact semantics of the flags (undo phase,
+  DDL skipping, checkpointing) and of 'warnings_count'.
+*/
+int maria_apply_log(LSN lsn, LSN lsn_end, enum maria_apply_log_way apply,
+                    FILE *trace_file,
+                    my_bool execute_undo_phase, my_bool skip_DDLs,
+                    my_bool take_checkpoints, uint *warnings_count);
+C_MODE_END
diff --git a/storage/maria/ma_recovery_util.c b/storage/maria/ma_recovery_util.c
new file mode 100644
index 00000000000..19e61daf4ef
--- /dev/null
+++ b/storage/maria/ma_recovery_util.c
@@ -0,0 +1,146 @@
+/* Copyright (C) 2006,2007,2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Q: Why isn't ma_recovery_util.c simply moved to ma_recovery.c ?
+
+  A: ma_recovery.c invokes objects from ma_check.c (like maria_chk_init()),
+  which causes the following problem:
+ if a source file a.c of a program invokes a function defined in
+ ma_recovery.c, then a.o depends on ma_recovery.o which depends on
+ ma_check.o: linker thus brings in ma_check.o. That brings in the
+ dependencies of ma_check.o which are definitions of _ma_check_print_info()
+ etc; if a.o does not define them then the ones of ha_maria.o are used
+ i.e. ha_maria.o is linked into the program, and this brings in dependencies
+ of ha_maria.o on mysqld.o into the program's linking which thus fails, as
+ the program is not linked with mysqld.o.
+ Thus, while several functions defined in ma_recovery.c could be useful to
+ other files, they cannot be used by them.
+ So we are going to gradually move a great share of ma_recovery.c's exported
+ functions into the present file, to isolate the problematic components and
+ avoid the problem.
+*/
+
+#include "maria_def.h"
+
+/* Dirty pages list read from the last checkpoint record (REDO phase) */
+HASH all_dirty_pages;
+struct st_dirty_page /* used only in the REDO phase */
+{
+  uint64 file_and_page_id; /* combined key: index flag, short id, page number */
+  LSN rec_lsn;             /* REDOs with an LSN below this one can be skipped */
+};
+/*
+  LSN after which dirty pages list does not apply. Can be slightly before
+  when ma_checkpoint_execute() started.
+*/
+LSN checkpoint_start= LSN_IMPOSSIBLE;
+
+/** @todo looks like duplicate of recovery_message_printed */
+my_bool procent_printed; /* TRUE while a "0% 10% ..." progress line is open */
+FILE *tracef;                                  /**< trace file for debugging */
+
+
+/**
+  @brief Prints a formatted message to the trace file, if one is open.
+
+  Under debug builds the message is also emitted through DBUG_PRINT.
+  If a "0% 10% ..." progress line is pending, a newline is written first
+  so the message starts on its own line.
+*/
+void tprint(FILE *trace_file __attribute__ ((unused)),
+            const char *format __attribute__ ((unused)), ...)
+{
+  va_list ap;
+#ifndef DBUG_OFF
+  {
+    char msg[1024];
+    va_start(ap, format);
+    vsnprintf(msg, sizeof(msg)-1, format, ap);
+    DBUG_PRINT("info", ("%s", msg));
+    va_end(ap);
+  }
+#endif
+  va_start(ap, format);
+  if (trace_file)
+  {
+    if (procent_printed)
+    {
+      /* Terminate the pending progress line first */
+      procent_printed= 0;
+      fputc('\n', trace_file);
+    }
+    vfprintf(trace_file, format, ap);
+  }
+  va_end(ap);
+}
+
+
+/**
+  @brief Prints an error message to the trace file (or stderr) and, when a
+  real trace file is used, also forwards it via my_printv_error().
+
+  If 'trace_file' is NULL, stderr is used. A pending "0% 10% ..." progress
+  line is terminated first so the error starts on its own line.
+*/
+void eprint(FILE *trace_file __attribute__ ((unused)),
+            const char *format __attribute__ ((unused)), ...)
+{
+  va_list args;
+  va_start(args, format);
+  DBUG_PRINT("error", ("%s", format));
+  if (!trace_file)
+    trace_file= stderr;
+
+  if (procent_printed)
+  {
+    /* In silent mode, print on another line than the 0% 10% 20% line */
+    procent_printed= 0;
+    fputc('\n', trace_file);
+  }
+  vfprintf(trace_file , format, args);
+  fputc('\n', trace_file);
+  /*
+    C requires va_end() before va_start() may be invoked again on the same
+    va_list (C11 7.16.1.4); the original restarted the list without ending
+    the first traversal, which is undefined behavior on some ABIs.
+  */
+  va_end(args);
+  if (trace_file != stderr)
+  {
+    va_start(args, format);
+    my_printv_error(HA_ERR_INITIALIZATION, format, MYF(0), args);
+    va_end(args);
+  }
+  fflush(trace_file);
+}
+
+
+/**
+  Tells if the dirty pages list found in checkpoint record allows to ignore a
+  REDO for a certain page.
+
+  @param shortid  short id of the table
+  @param lsn      REDO record's LSN
+  @param page     page number
+  @param index    TRUE if index page, FALSE if data page
+*/
+
+my_bool _ma_redo_not_needed_for_page(uint16 shortid, LSN lsn,
+                                     pgcache_page_no_t page,
+                                     my_bool index)
+{
+  uint64 key;
+  struct st_dirty_page *entry;
+
+  /* Records at or after checkpoint_start are outside the list's scope */
+  if (cmp_translog_addr(lsn, checkpoint_start) >= 0)
+    return FALSE;
+  /*
+    64-bit key is formed like this:
+    Most significant byte: 0 if data page, 1 if index page
+    Next 2 bytes: table's short id
+    Next 5 bytes: page number
+  */
+  key= (((uint64)((index << 16) | shortid)) << 40) | page;
+  entry= (struct st_dirty_page *)
+    hash_search(&all_dirty_pages, (uchar *)&key, sizeof(key));
+  DBUG_PRINT("info", ("in dirty pages list: %d", entry != NULL));
+  if ((entry == NULL) || cmp_translog_addr(lsn, entry->rec_lsn) < 0)
+  {
+    /* Not in the list, or the REDO predates the page's recovery LSN */
+    tprint(tracef, ", ignoring because of dirty_pages list\n");
+    return TRUE;
+  }
+  return FALSE;
+}
diff --git a/storage/maria/ma_recovery_util.h b/storage/maria/ma_recovery_util.h
new file mode 100644
index 00000000000..a35fea84fe9
--- /dev/null
+++ b/storage/maria/ma_recovery_util.h
@@ -0,0 +1,37 @@
+/* Copyright (C) 2006,2007,2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+struct st_dirty_page /* used only in the REDO phase */
+{
+  uint64 file_and_page_id; /* combined key: index flag, short id, page number */
+  LSN rec_lsn;             /* REDOs with an LSN below this one can be skipped */
+};
+/* Dirty pages list read from the last checkpoint record (REDO phase) */
+extern HASH all_dirty_pages;
+/*
+  LSN after which dirty pages list does not apply. Can be slightly before
+  when ma_checkpoint_execute() started.
+*/
+extern LSN checkpoint_start;
+/* TRUE while a "0% 10% ..." progress line is unterminated */
+extern my_bool procent_printed;
+/* Trace file for debugging; NULL when tracing is disabled */
+extern FILE *tracef;
+
+
+my_bool _ma_redo_not_needed_for_page(uint16 shortid, LSN lsn,
+                                     pgcache_page_no_t page,
+                                     my_bool index);
+/* Prints to the trace file if it is not NULL */
+void tprint(FILE *trace_file, const char *format, ...)
+  ATTRIBUTE_FORMAT(printf, 2, 3);
+/* Prints an error to the trace file (or stderr) and via my_printv_error() */
+void eprint(FILE *trace_file, const char *format, ...)
+  ATTRIBUTE_FORMAT(printf, 2, 3);
diff --git a/storage/maria/ma_rename.c b/storage/maria/ma_rename.c
new file mode 100644
index 00000000000..380f3da3c46
--- /dev/null
+++ b/storage/maria/ma_rename.c
@@ -0,0 +1,135 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Rename a table
+*/
+
+#include "ma_fulltext.h"
+#include "trnman_public.h"
+
+/**
+  @brief renames a table
+
+  Renames both the index file and the data file. For transactional tables a
+  LOGREC_REDO_RENAME_TABLE record is written and flushed first, and the
+  table's state LSNs are updated so Recovery does not apply REDOs meant for
+  the old table to the renamed one.
+
+  @param old_name current name of table
+  @param new_name table should be renamed to this name
+
+  @return Operation status
+  @retval 0 OK
+  @retval !=0 Error
+*/
+
+int maria_rename(const char *old_name, const char *new_name)
+{
+  char from[FN_REFLEN],to[FN_REFLEN];
+  int data_file_rename_error;
+#ifdef USE_RAID
+  uint raid_type=0,raid_chunks=0;
+#endif
+  MARIA_HA *info;
+  MARIA_SHARE *share;
+  myf sync_dir;
+  DBUG_ENTER("maria_rename");
+
+#ifdef EXTRA_DEBUG
+  _ma_check_table_is_closed(old_name,"rename old_table");
+  _ma_check_table_is_closed(new_name,"rename new table2");
+#endif
+  /** @todo LOCK take X-lock on table */
+  if (!(info= maria_open(old_name, O_RDWR, HA_OPEN_FOR_REPAIR)))
+    DBUG_RETURN(my_errno);
+  share= info->s;
+#ifdef USE_RAID
+  raid_type = share->base.raid_type;
+  raid_chunks = share->base.raid_chunks;
+#endif
+
+  /*
+    the renaming of an internal table to the final table (like in ALTER TABLE)
+    is the moment when this table receives its correct create_rename_lsn and
+    this is important; make sure transactionality has been re-enabled.
+  */
+  DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+  /* Sync directory entries only for transactional, non-temporary tables
+     outside of recovery */
+  sync_dir= (share->now_transactional && !share->temporary &&
+             !maria_in_recovery) ? MY_SYNC_DIR : 0;
+  if (sync_dir)
+  {
+    LSN lsn;
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+    uint old_name_len= strlen(old_name)+1, new_name_len= strlen(new_name)+1;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar*)old_name;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= old_name_len;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (uchar*)new_name;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= new_name_len;
+    /*
+      For this record to be of any use for Recovery, we need the upper
+      MySQL layer to be crash-safe, which it is not now (that would require
+      work using the ddl_log of sql/sql_table.cc); when it is, we should
+      reconsider the moment of writing this log record (before or after op,
+      under THR_LOCK_maria or not...), how to use it in Recovery.
+      For now it can serve to apply logs to a backup so we sync it.
+    */
+    if (unlikely(translog_write_record(&lsn, LOGREC_REDO_RENAME_TABLE,
+                                       &dummy_transaction_object, NULL,
+                                       old_name_len + new_name_len,
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, NULL, NULL) ||
+                 translog_flush(lsn)))
+    {
+      maria_close(info);
+      DBUG_RETURN(1);
+    }
+    /*
+      store LSN into file, needed for Recovery to not be confused if a
+      RENAME happened (applying REDOs to the wrong table).
+    */
+    if (_ma_update_state_lsns(share, lsn, share->state.create_trid, TRUE,
+                              TRUE))
+    {
+      maria_close(info);
+      DBUG_RETURN(1);
+    }
+  }
+
+  maria_close(info);
+
+  /* Rename the index file first; it is rolled back below if the data file
+     rename fails */
+  fn_format(from,old_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+  fn_format(to,new_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+  if (my_rename_with_symlink(from, to, MYF(MY_WME | sync_dir)))
+    DBUG_RETURN(my_errno);
+  fn_format(from,old_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+  fn_format(to,new_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+#ifdef USE_RAID
+  if (raid_type)
+    data_file_rename_error= my_raid_rename(from, to, raid_chunks,
+                                           MYF(MY_WME | sync_dir));
+  else
+#endif
+    data_file_rename_error=
+      my_rename_with_symlink(from, to, MYF(MY_WME | sync_dir));
+  if (data_file_rename_error)
+  {
+    /*
+      now we have a renamed index file and a non-renamed data file, try to
+      undo the rename of the index file.
+    */
+    data_file_rename_error= my_errno;
+    fn_format(from, old_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT));
+    fn_format(to, new_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT));
+    my_rename_with_symlink(to, from, MYF(MY_WME | sync_dir));
+  }
+  DBUG_RETURN(data_file_rename_error);
+
+}
diff --git a/storage/maria/ma_rfirst.c b/storage/maria/ma_rfirst.c
new file mode 100644
index 00000000000..226aaa551f0
--- /dev/null
+++ b/storage/maria/ma_rfirst.c
@@ -0,0 +1,26 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+  /* Read first row, in key order, through a specific key */
+
+int maria_rfirst(MARIA_HA *info, uchar *buf, int inx)
+{
+  DBUG_ENTER("maria_rfirst");
+  /*
+    Position "before the beginning" and pretend the previous row was
+    found, so that maria_rnext() starts from the first key of the index.
+  */
+  info->update|= HA_STATE_PREV_FOUND;
+  info->cur_row.lastpos= HA_OFFSET_ERROR;
+  DBUG_RETURN(maria_rnext(info, buf, inx));
+} /* maria_rfirst */
diff --git a/storage/maria/ma_rkey.c b/storage/maria/ma_rkey.c
new file mode 100644
index 00000000000..24b275d0ba6
--- /dev/null
+++ b/storage/maria/ma_rkey.c
@@ -0,0 +1,215 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Read record based on a key */
+
+#include "maria_def.h"
+#include "ma_rt_index.h"
+
+/**
+  Read a record using key
+
+  @param info         open table handler
+  @param buf          buffer for the row; if NULL only the key lookup is done
+  @param inx          index number to search in
+  @param key_data     key value (packed by _ma_pack_key(), or pre-packed for
+                      MERGE tables, see USE_PACKED_KEYS below)
+  @param keypart_map  bitmap of key parts present in key_data (or, for
+                      pre-packed MERGE keys, the key length)
+  @param search_flag  comparison mode (HA_READ_KEY_EXACT, ...)
+
+  @note
+  Ordinary search_flag is 0 ; Give error if no record with key
+*/
+
+int maria_rkey(MARIA_HA *info, uchar *buf, int inx, const uchar *key_data,
+               key_part_map keypart_map, enum ha_rkey_function search_flag)
+{
+  uchar *key_buff;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo;
+  HA_KEYSEG *last_used_keyseg;
+  uint32 nextflag;
+  MARIA_KEY key;
+  /* Index condition pushdown result: 1= match, 0= skip row, 2= out of range */
+  int icp_res= 1;
+  DBUG_ENTER("maria_rkey");
+  DBUG_PRINT("enter", ("base: 0x%lx buf: 0x%lx inx: %d search_flag: %d",
+                       (long) info, (long) buf, inx, search_flag));
+
+  if ((inx = _ma_check_index(info,inx)) < 0)
+    DBUG_RETURN(my_errno);
+
+  /* Keep only the bits that must survive a new index read */
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  info->last_key_func= search_flag;
+  keyinfo= share->keyinfo + inx;
+
+  /* Use the second half of lastkey_buff as space for the packed search key */
+  key_buff= info->lastkey_buff+info->s->base.max_key_length;
+
+  if (info->once_flags & USE_PACKED_KEYS)
+  {
+    info->once_flags&= ~USE_PACKED_KEYS; /* Reset flag */
+    /*
+      key is already packed!; This happens when we are using a MERGE TABLE
+      In this key 'key_part_map' is the length of the key !
+    */
+    bmove(key_buff, key_data, keypart_map);
+    key.data= key_buff;
+    key.keyinfo= keyinfo;
+    key.data_length= keypart_map;
+    key.ref_length= 0;
+    key.flag= 0;
+
+    last_used_keyseg= keyinfo->seg + info->last_used_keyseg;
+  }
+  else
+  {
+    DBUG_ASSERT(keypart_map);
+    /* Save the packed key for later use in the second buffer of lastkey. */
+    _ma_pack_key(info, &key, inx, key_buff, key_data,
+                 keypart_map, &last_used_keyseg);
+    /* Save packed_key_length for use by the MERGE engine. */
+    info->pack_key_length= key.data_length;
+    info->last_used_keyseg= (uint16) (last_used_keyseg -
+                                      keyinfo->seg);
+    DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, &key););
+  }
+
+  if (fast_ma_readinfo(info))
+    goto err;
+  if (share->lock_key_trees)
+    rw_rdlock(&keyinfo->root_lock);
+
+  nextflag= maria_read_vec[search_flag] | key.flag;
+  /* For non-exact searches or possibly-duplicate keys keep the search
+     buffer (SEARCH_SAVE_BUFF) */
+  if (search_flag != HA_READ_KEY_EXACT ||
+      ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME))
+    nextflag|= SEARCH_SAVE_BUFF;
+
+  switch (keyinfo->key_alg) {
+#ifdef HAVE_RTREE_KEYS
+  case HA_KEY_ALG_RTREE:
+    if (maria_rtree_find_first(info, &key, nextflag) < 0)
+    {
+      maria_print_error(info->s, HA_ERR_CRASHED);
+      my_errno= HA_ERR_CRASHED;
+      info->cur_row.lastpos= HA_OFFSET_ERROR;
+    }
+    break;
+#endif
+  case HA_KEY_ALG_BTREE:
+  default:
+    if (!_ma_search(info, &key, nextflag, info->s->state.key_root[inx]))
+    {
+      MARIA_KEY lastkey;
+      lastkey.keyinfo= keyinfo;
+      lastkey.data= info->lastkey_buff;
+      /*
+        Found a key, but it might not be usable. We cannot use rows that
+        are inserted by other threads after we got our table lock
+        ("concurrent inserts"). The record may not even be present yet.
+        Keys are inserted into the index(es) before the record is
+        inserted into the data file.
+
+        If index condition is present, it must be either satisfied or
+        not satisfied with an out-of-range condition.
+      */
+      if ((*share->row_is_visible)(info) &&
+          ((icp_res= ma_check_index_cond(info, inx, buf)) != 0))
+        break;
+
+      /* The key references a concurrently inserted record. */
+      if (search_flag == HA_READ_KEY_EXACT &&
+          last_used_keyseg == keyinfo->seg + keyinfo->keysegs)
+      {
+        /* Simply ignore the key if it matches exactly. (Bug #29838) */
+        my_errno= HA_ERR_KEY_NOT_FOUND;
+        info->cur_row.lastpos= HA_OFFSET_ERROR;
+        break;
+      }
+
+      do
+      {
+        uint not_used[2];
+        /*
+          Skip rows that are inserted by other threads since we got
+          a lock. Note that this can only happen if we are not
+          searching after a full length exact key, because the keys
+          are sorted according to position.
+        */
+        lastkey.data_length= info->last_key.data_length;
+        lastkey.ref_length= info->last_key.ref_length;
+        lastkey.flag= info->last_key.flag;
+        if (_ma_search_next(info, &lastkey, maria_readnext_vec[search_flag],
+                            info->s->state.key_root[inx]))
+          break; /* purecov: inspected */
+        /*
+          Check that the found key does still match the search.
+          _ma_search_next() delivers the next key regardless of its
+          value.
+        */
+        if (!(nextflag & (SEARCH_BIGGER | SEARCH_SMALLER)) &&
+            ha_key_cmp(keyinfo->seg, info->last_key.data, key.data,
+                       key.data_length, SEARCH_FIND, not_used))
+        {
+          /* purecov: begin inspected */
+          my_errno= HA_ERR_KEY_NOT_FOUND;
+          info->cur_row.lastpos= HA_OFFSET_ERROR;
+          break;
+          /* purecov: end */
+        }
+
+      } while (!(*share->row_is_visible)(info) ||
+               ((icp_res= ma_check_index_cond(info, inx, buf)) == 0));
+    }
+  }
+  if (share->lock_key_trees)
+    rw_unlock(&keyinfo->root_lock);
+
+  if (info->cur_row.lastpos == HA_OFFSET_ERROR || (icp_res != 1))
+  {
+    if (icp_res == 2)
+    {
+      /* Index condition reported out-of-range: treat as key not found */
+      info->cur_row.lastpos= HA_OFFSET_ERROR;
+      my_errno= HA_ERR_KEY_NOT_FOUND;
+    }
+    fast_ma_writeinfo(info);
+    goto err;
+  }
+
+  /* Calculate length of the found key; Used by maria_rnext_same */
+  if ((keyinfo->flag & HA_VAR_LENGTH_KEY))
+    info->last_rkey_length= _ma_keylength_part(keyinfo, info->lastkey_buff,
+                                               last_used_keyseg);
+  else
+    info->last_rkey_length= key.data_length;
+
+  /* Check if we don't want to have record back, only error message */
+  if (!buf)
+  {
+    fast_ma_writeinfo(info);
+    DBUG_RETURN(0);
+  }
+  if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+  {
+    info->update|= HA_STATE_AKTIV; /* Record is read */
+    DBUG_RETURN(0);
+  }
+
+  info->cur_row.lastpos= HA_OFFSET_ERROR; /* Didn't find row */
+
+err:
+  /* Store last used key as a base for read next */
+  memcpy(info->last_key.data, key_buff, key.data_length);
+  info->last_key.data_length= key.data_length;
+  info->last_key.ref_length= info->s->base.rec_reflength;
+  info->last_key.flag= 0;
+  /* Create key with rowid 0 */
+  bzero((char*) info->last_key.data + info->last_key.data_length,
+        info->s->base.rec_reflength);
+
+  if (search_flag == HA_READ_AFTER_KEY)
+    info->update|=HA_STATE_NEXT_FOUND; /* Previous gives last row */
+  DBUG_RETURN(my_errno);
+} /* _ma_rkey */
diff --git a/storage/maria/ma_rlast.c b/storage/maria/ma_rlast.c
new file mode 100644
index 00000000000..a9a470d37d9
--- /dev/null
+++ b/storage/maria/ma_rlast.c
@@ -0,0 +1,26 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+  /* Read the last row, in key order, of the given index */
+
+int maria_rlast(MARIA_HA *info, uchar *buf, int inx)
+{
+  DBUG_ENTER("maria_rlast");
+  /*
+    Position "past the end" and pretend the next row was found, so that
+    maria_rprev() starts from the last key of the index.
+  */
+  info->update|= HA_STATE_NEXT_FOUND;
+  info->cur_row.lastpos= HA_OFFSET_ERROR;
+  DBUG_RETURN(maria_rprev(info, buf, inx));
+} /* maria_rlast */
diff --git a/storage/maria/ma_rnext.c b/storage/maria/ma_rnext.c
new file mode 100644
index 00000000000..bdba5ff3a17
--- /dev/null
+++ b/storage/maria/ma_rnext.c
@@ -0,0 +1,130 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+#include "ma_rt_index.h"
+
+  /*
+    Read the next row, in key order, after the previous read.
+    One may have done a write, update or delete of the previous row.
+    NOTE! Even if one changes the previous row, the next read is done
+    based on the position of the last used key!
+  */
+
+int maria_rnext(MARIA_HA *info, uchar *buf, int inx)
+{
+  int error,changed;
+  uint flag;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo;
+  /* Index condition pushdown result: 1= match, 0= skip row, 2= out of range */
+  int icp_res= 1;
+  DBUG_ENTER("maria_rnext");
+
+  if ((inx = _ma_check_index(info,inx)) < 0)
+    DBUG_RETURN(my_errno);
+  flag=SEARCH_BIGGER;                           /* Read next */
+  if (info->cur_row.lastpos == HA_OFFSET_ERROR &&
+      info->update & HA_STATE_PREV_FOUND)
+    flag=0;                                     /* Read first */
+
+  if (fast_ma_readinfo(info))
+    DBUG_RETURN(my_errno);
+  keyinfo= share->keyinfo + inx;
+  if (share->lock_key_trees)
+    rw_rdlock(&keyinfo->root_lock);
+  /* Has another thread modified the index since our last search? */
+  changed= _ma_test_if_changed(info);
+  if (!flag)
+  {
+    /* No usable previous position: read the first key of the index */
+    switch (keyinfo->key_alg){
+#ifdef HAVE_RTREE_KEYS
+    case HA_KEY_ALG_RTREE:
+      error=maria_rtree_get_first(info, inx,
+                                  info->last_key.data_length +
+                                  info->last_key.ref_length);
+
+      break;
+#endif
+    case HA_KEY_ALG_BTREE:
+    default:
+      error= _ma_search_first(info, keyinfo, share->state.key_root[inx]);
+      break;
+    }
+  }
+  else
+  {
+    switch (keyinfo->key_alg) {
+#ifdef HAVE_RTREE_KEYS
+    case HA_KEY_ALG_RTREE:
+      /*
+        Note that rtree doesn't support that the table
+        may be changed since last call, so we do need
+        to skip rows inserted by other threads like in btree
+      */
+      error= maria_rtree_get_next(info, inx, info->last_key.data_length +
+                                  info->last_key.ref_length);
+      break;
+#endif
+    case HA_KEY_ALG_BTREE:
+    default:
+      if (!changed)
+        error= _ma_search_next(info, &info->last_key,
+                               flag | info->last_key.flag,
+                               share->state.key_root[inx]);
+      else
+        /* Index changed under us: re-position with a full search */
+        error= _ma_search(info, &info->last_key, flag | info->last_key.flag,
+                          share->state.key_root[inx]);
+    }
+  }
+
+  if (!error)
+  {
+    while (!(*share->row_is_visible)(info) ||
+           ((icp_res= ma_check_index_cond(info, inx, buf)) == 0))
+    {
+      /* Skip rows inserted by other threads since we got a lock */
+      if ((error= _ma_search_next(info, &info->last_key,
+                                  SEARCH_BIGGER,
+                                  share->state.key_root[inx])))
+        break;
+    }
+  }
+  if (share->lock_key_trees)
+    rw_unlock(&keyinfo->root_lock);
+
+  /* Don't clear if database-changed */
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  info->update|= HA_STATE_NEXT_FOUND;
+
+  if (icp_res == 2)
+    my_errno=HA_ERR_END_OF_FILE; /* got beyond the end of scanned range */
+
+  if (error || icp_res != 1)
+  {
+    if (my_errno == HA_ERR_KEY_NOT_FOUND)
+      my_errno=HA_ERR_END_OF_FILE;
+  }
+  else if (!buf)
+  {
+    /* Caller only wanted positioning, not the row itself */
+    DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0);
+  }
+  else if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+  {
+    info->update|= HA_STATE_AKTIV;              /* Record is read */
+    DBUG_RETURN(0);
+  }
+  DBUG_PRINT("error",("Got error: %d, errno: %d",error, my_errno));
+  DBUG_RETURN(my_errno);
+} /* maria_rnext */
diff --git a/storage/maria/ma_rnext_same.c b/storage/maria/ma_rnext_same.c
new file mode 100644
index 00000000000..f67a76a366f
--- /dev/null
+++ b/storage/maria/ma_rnext_same.c
@@ -0,0 +1,113 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "ma_rt_index.h"
+
+/*
+  Read next row with the same key as previous read, but abort if
+  the key changes.
+  One may have done a write, update or delete of the previous row.
+
+  NOTE! Even if one changes the previous row, the next read is done
+  based on the position of the last used key!
+*/
+
+int maria_rnext_same(MARIA_HA *info, uchar *buf)
+{
+  int error;
+  uint inx,not_used[2];
+  MARIA_KEYDEF *keyinfo;
+  /* Index condition pushdown result: 1= match, 0= skip row, 2= out of range */
+  int icp_res= 1;
+  DBUG_ENTER("maria_rnext_same");
+
+  /* Requires a previous successful keyed read (lastinx/lastpos set up) */
+  if ((int) (inx= info->lastinx) < 0 ||
+      info->cur_row.lastpos == HA_OFFSET_ERROR)
+    DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX);
+  if (fast_ma_readinfo(info))
+    DBUG_RETURN(my_errno);
+
+  keyinfo= info->s->keyinfo+inx;
+  if (info->s->lock_key_trees)
+    rw_rdlock(&keyinfo->root_lock);
+
+  switch (keyinfo->key_alg) {
+#ifdef HAVE_RTREE_KEYS
+  case HA_KEY_ALG_RTREE:
+    if ((error=maria_rtree_find_next(info,inx,
+                                     maria_read_vec[info->last_key_func])))
+    {
+      error=1;
+      my_errno=HA_ERR_END_OF_FILE;
+      info->cur_row.lastpos= HA_OFFSET_ERROR;
+      break;
+    }
+    break;
+#endif
+  case HA_KEY_ALG_BTREE:
+  default:
+    if (!(info->update & HA_STATE_RNEXT_SAME))
+    {
+      /* First rnext_same; Store old key */
+      memcpy(info->lastkey_buff2, info->last_key.data,
+             info->last_rkey_length);
+    }
+    for (;;)
+    {
+      if ((error= _ma_search_next(info, &info->last_key,
+                                  SEARCH_BIGGER,
+                                  info->s->state.key_root[inx])))
+        break;
+      /* Stop as soon as the key prefix differs from the saved search key */
+      if (ha_key_cmp(keyinfo->seg, info->last_key.data,
+                     info->lastkey_buff2,
+                     info->last_rkey_length, SEARCH_FIND,
+                     not_used))
+      {
+        error=1;
+        my_errno=HA_ERR_END_OF_FILE;
+        info->cur_row.lastpos= HA_OFFSET_ERROR;
+        break;
+      }
+      /* Skip rows that are inserted by other threads since we got a lock */
+      if ((info->s->row_is_visible)(info) &&
+          ((icp_res= ma_check_index_cond(info, inx, buf)) != 0))
+        break;
+    }
+  }
+  if (info->s->lock_key_trees)
+    rw_unlock(&keyinfo->root_lock);
+  /* Don't clear if database-changed */
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  info->update|= HA_STATE_NEXT_FOUND | HA_STATE_RNEXT_SAME;
+
+  if (icp_res == 2)
+    my_errno=HA_ERR_END_OF_FILE; /* got beyond the end of scanned range */
+
+  if (error || icp_res != 1)
+  {
+    if (my_errno == HA_ERR_KEY_NOT_FOUND)
+      my_errno=HA_ERR_END_OF_FILE;
+  }
+  else if (!buf)
+  {
+    /* Caller only wanted positioning, not the row itself */
+    DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0);
+  }
+  else if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+  {
+    info->update|= HA_STATE_AKTIV;              /* Record is read */
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(my_errno);
+} /* maria_rnext_same */
diff --git a/storage/maria/ma_rprev.c b/storage/maria/ma_rprev.c
new file mode 100644
index 00000000000..b9f46d7c405
--- /dev/null
+++ b/storage/maria/ma_rprev.c
@@ -0,0 +1,86 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+  /*
+    Read the previous row, in key order, before the previous read.
+    One may have done a write, update or delete of the previous row.
+    NOTE! Even if one changes the previous row, the next read is done
+    based on the position of the last used key!
+  */
+
+int maria_rprev(MARIA_HA *info, uchar *buf, int inx)
+{
+  int error,changed;
+  register uint flag;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo;
+  DBUG_ENTER("maria_rprev");
+
+  if ((inx = _ma_check_index(info,inx)) < 0)
+    DBUG_RETURN(my_errno);
+  flag=SEARCH_SMALLER;                          /* Read previous */
+  if (info->cur_row.lastpos == HA_OFFSET_ERROR &&
+      info->update & HA_STATE_NEXT_FOUND)
+    flag=0;                                     /* Read last */
+
+  if (fast_ma_readinfo(info))
+    DBUG_RETURN(my_errno);
+  keyinfo= share->keyinfo + inx;
+  /*
+    NOTE(review): unlike maria_rnext(), _ma_test_if_changed() is called
+    here before root_lock is taken -- verify whether this ordering is
+    intentional.
+  */
+  changed= _ma_test_if_changed(info);
+  if (share->lock_key_trees)
+    rw_rdlock(&keyinfo->root_lock);
+  if (!flag)
+    /* No usable previous position: start from the last key of the index */
+    error= _ma_search_last(info, keyinfo, share->state.key_root[inx]);
+  else if (!changed)
+    error= _ma_search_next(info, &info->last_key,
+                           flag | info->last_key.flag,
+                           share->state.key_root[inx]);
+  else
+    /* Index changed under us: re-position with a full search */
+    error= _ma_search(info, &info->last_key, flag | info->last_key.flag,
+                      share->state.key_root[inx]);
+
+  if (!error)
+  {
+    while (!(*share->row_is_visible)(info))
+    {
+      /* Skip rows that are inserted by other threads since we got a lock */
+      if ((error= _ma_search_next(info, &info->last_key,
+                                  SEARCH_SMALLER,
+                                  share->state.key_root[inx])))
+        break;
+    }
+  }
+  if (share->lock_key_trees)
+    rw_unlock(&keyinfo->root_lock);
+  /* Don't clear if database-changed */
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  info->update|= HA_STATE_PREV_FOUND;
+  if (error)
+  {
+    if (my_errno == HA_ERR_KEY_NOT_FOUND)
+      my_errno=HA_ERR_END_OF_FILE;
+  }
+  else if (!buf)
+  {
+    /* Caller only wanted positioning, not the row itself */
+    DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0);
+  }
+  else if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+  {
+    info->update|= HA_STATE_AKTIV;              /* Record is read */
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(my_errno);
+} /* maria_rprev */
diff --git a/storage/maria/ma_rrnd.c b/storage/maria/ma_rrnd.c
new file mode 100644
index 00000000000..24c4bfdd467
--- /dev/null
+++ b/storage/maria/ma_rrnd.c
@@ -0,0 +1,44 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Read a record with random-access. The position of the record must
+ be obtained via MARIA_HA. The next record can be read with pos= MARIA_POS_ERROR */
+
+
+#include "maria_def.h"
+
+/*
+ Read a row based on position.
+
+ RETURN
+ 0 Ok.
+ HA_ERR_RECORD_DELETED Record is deleted.
+ HA_ERR_END_OF_FILE EOF.
+*/
+
+int maria_rrnd(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos)
+{
+ DBUG_ENTER("maria_rrnd");
+
+ DBUG_ASSERT(filepos != HA_OFFSET_ERROR);
+
+ /* Init all but update-flag */
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ /* A pending write cache must reach the data file before we read from it */
+ if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache))
+ DBUG_RETURN(my_errno);
+
+ info->cur_row.lastpos= filepos; /* Remember for update */
+ DBUG_RETURN((*info->s->read_record)(info, buf, filepos));
+}
diff --git a/storage/maria/ma_rsame.c b/storage/maria/ma_rsame.c
new file mode 100644
index 00000000000..4bdbfd526ba
--- /dev/null
+++ b/storage/maria/ma_rsame.c
@@ -0,0 +1,78 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+/**
+ Find current row with read on position or read on key
+
+ @notes
+ If inx >= 0 find record using key
+
+ @warning
+ This function is not row version safe.
+ This is not critical as this function is not used by MySQL
+
+ @return
+ @retval 0 Ok
+ @retval HA_ERR_KEY_NOT_FOUND Row is deleted
+ @retval HA_ERR_END_OF_FILE End of file
+*/
+
+
+int maria_rsame(MARIA_HA *info, uchar *record, int inx)
+{
+ DBUG_ENTER("maria_rsame");
+
+ /* inx == -1 means "no index"; otherwise the key must be enabled */
+ if (inx != -1 && ! maria_is_key_active(info->s->state.key_map, inx))
+ {
+ DBUG_PRINT("error", ("wrong index usage"));
+ DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX);
+ }
+ if (info->cur_row.lastpos == HA_OFFSET_ERROR ||
+ info->update & HA_STATE_DELETED)
+ {
+ DBUG_PRINT("error", ("no current record"));
+ DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); /* No current record */
+ }
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+ /* Read row from data file */
+ if (fast_ma_readinfo(info))
+ DBUG_RETURN(my_errno);
+
+ if (inx >= 0)
+ {
+ /* Rebuild the key from the row and position the index on it */
+ MARIA_KEYDEF *keyinfo= info->s->keyinfo + inx;
+ info->lastinx= inx;
+ (*keyinfo->make_key)(info, &info->last_key, (uint) inx,
+ info->lastkey_buff, record,
+ info->cur_row.lastpos,
+ info->cur_row.trid);
+ if (info->s->lock_key_trees)
+ rw_rdlock(&keyinfo->root_lock);
+ /* Search result intentionally ignored; the row read below reports errors */
+ VOID(_ma_search(info, &info->last_key, SEARCH_SAME,
+ info->s->state.key_root[inx]));
+ if (info->s->lock_key_trees)
+ rw_unlock(&keyinfo->root_lock);
+ }
+
+ if (!(*info->read_record)(info, record, info->cur_row.lastpos))
+ DBUG_RETURN(0);
+ if (my_errno == HA_ERR_RECORD_DELETED)
+ my_errno=HA_ERR_KEY_NOT_FOUND;
+ DBUG_PRINT("error", ("my_errno: %d", my_errno));
+ DBUG_RETURN(my_errno);
+} /* maria_rsame */
diff --git a/storage/maria/ma_rsamepos.c b/storage/maria/ma_rsamepos.c
new file mode 100644
index 00000000000..d2099e7b116
--- /dev/null
+++ b/storage/maria/ma_rsamepos.c
@@ -0,0 +1,63 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* read record through position and fix key-position */
+/* As maria_rsame but supply a position */
+
+#include "maria_def.h"
+
+
+/*
+ Read row based on position
+
+ @param inx If inx >= 0, position the given index on the found row
+
+ @return
+ @retval 0 Ok
+ @retval HA_ERR_KEY_NOT_FOUND Row is deleted
+ @retval HA_ERR_END_OF_FILE End of file
+*/
+
+int maria_rsame_with_pos(MARIA_HA *info, uchar *record, int inx,
+ MARIA_RECORD_POS filepos)
+{
+ DBUG_ENTER("maria_rsame_with_pos");
+ DBUG_PRINT("enter",("index: %d filepos: %ld", inx, (long) filepos));
+
+ /* inx == -1 means "no index"; otherwise the key must be enabled */
+ if (inx < -1 ||
+ (inx >= 0 && ! maria_is_key_active(info->s->state.key_map, inx)))
+ {
+ DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX);
+ }
+
+ info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+ if ((*info->s->read_record)(info, record, filepos))
+ {
+ if (my_errno == HA_ERR_RECORD_DELETED)
+ my_errno=HA_ERR_KEY_NOT_FOUND;
+ DBUG_RETURN(my_errno);
+ }
+ info->cur_row.lastpos= filepos;
+ info->lastinx= inx;
+ if (inx >= 0)
+ {
+ /* Rebuild last_key from the row so index scans can continue from here */
+ (*info->s->keyinfo[inx].make_key)(info, &info->last_key, (uint) inx,
+ info->lastkey_buff,
+ record, info->cur_row.lastpos,
+ info->cur_row.trid);
+ info->update|=HA_STATE_KEY_CHANGED; /* Don't use indexposition */
+ }
+ DBUG_RETURN(0);
+} /* maria_rsame_with_pos */
diff --git a/storage/maria/ma_rt_index.c b/storage/maria/ma_rt_index.c
new file mode 100644
index 00000000000..62474dbbad8
--- /dev/null
+++ b/storage/maria/ma_rt_index.c
@@ -0,0 +1,1343 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+#include "ma_rt_key.h"
+#include "ma_rt_mbr.h"
+
+/* Growth step (in entries) for stPageList::pages reallocation */
+#define REINSERT_BUFFER_INC 10
+/* Child-picking heuristic used on insert: least area increase */
+#define PICK_BY_AREA
+/*#define PICK_BY_PERIMETER*/
+
+/* A key page pending re-insertion, together with its level in the tree */
+typedef struct st_page_level
+{
+ uint level; /* Tree level the page was taken from */
+ my_off_t offs; /* File offset of the page */
+} stPageLevel;
+
+/* Growable list of pages whose keys must be re-inserted after a delete */
+typedef struct st_page_list
+{
+ uint n_pages; /* Number of used entries */
+ uint m_pages; /* Number of allocated entries */
+ stPageLevel *pages;
+} stPageList;
+
+
+/*
+ Find next key in r-tree according to search_flag recursively
+
+ NOTES
+ Used in maria_rtree_find_first() and maria_rtree_find_next()
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+static int maria_rtree_find_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uint32 search_flag,
+ uint nod_cmp_flag, my_off_t page_pos,
+ int level)
+{
+ MARIA_SHARE *share= info->s;
+ uint nod_flag;
+ int res;
+ uchar *page_buf, *k, *last;
+ int key_data_length;
+ /* Per-level slot remembering the key offset where the last scan stopped */
+ uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level;
+ MARIA_PAGE page;
+
+ if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+ {
+ my_errno= HA_ERR_OUT_OF_MEM;
+ return -1;
+ }
+ if (_ma_fetch_keypage(&page, info, keyinfo, page_pos,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ DFLT_INIT_HITS, page_buf, 0))
+ goto err;
+ nod_flag= page.node;
+
+ key_data_length= keyinfo->keylength - share->base.rec_reflength;
+
+ if (info->maria_rtree_recursion_depth >= level)
+ {
+ /* Resuming an earlier search: continue from the saved key offset */
+ k= page_buf + *saved_key;
+ }
+ else
+ {
+ k= rt_PAGE_FIRST_KEY(share, page_buf, nod_flag);
+ }
+ last= rt_PAGE_END(&page);
+
+ for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag))
+ {
+ if (nod_flag)
+ {
+ /* this is an internal node in the tree */
+ if (!(res= maria_rtree_key_cmp(keyinfo->seg,
+ info->first_mbr_key, k,
+ info->last_rkey_length, nod_cmp_flag)))
+ {
+ switch ((res= maria_rtree_find_req(info, keyinfo, search_flag,
+ nod_cmp_flag,
+ _ma_kpos(nod_flag, k),
+ level + 1)))
+ {
+ case 0: /* found - exit from recursion */
+ *saved_key= k - page_buf;
+ goto ok;
+ case 1: /* not found - continue searching */
+ info->maria_rtree_recursion_depth= level;
+ break;
+ default: /* error */
+ case -1:
+ goto err;
+ }
+ }
+ }
+ else
+ {
+ /* this is a leaf */
+ if (!maria_rtree_key_cmp(keyinfo->seg, info->first_mbr_key,
+ k, info->last_rkey_length, search_flag))
+ {
+ uchar *after_key= rt_PAGE_NEXT_KEY(share, k, key_data_length, 0);
+ MARIA_KEY tmp_key;
+
+ /*
+ We don't need to set all MARIA_KEY elements here as
+ _ma_row_pos_from_key() only uses a few of them.
+ */
+ tmp_key.keyinfo= keyinfo;
+ tmp_key.data= k;
+ tmp_key.data_length= key_data_length;
+
+ info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key);
+ info->last_key.keyinfo= keyinfo;
+ info->last_key.data_length= key_data_length;
+ info->last_key.ref_length= share->base.rec_reflength;
+ info->last_key.flag= 0;
+ memcpy(info->last_key.data, k,
+ info->last_key.data_length + info->last_key.ref_length);
+ info->maria_rtree_recursion_depth= level;
+ *saved_key= last - page_buf;
+
+ if (after_key < last)
+ {
+ /* Cache the rest of the page so find_next can avoid a re-read */
+ uchar *keyread_buff= info->keyread_buff;
+ info->int_keypos= keyread_buff;
+ info->int_maxpos= keyread_buff + (last - after_key);
+ memcpy(keyread_buff, after_key, last - after_key);
+ info->keyread_buff_used= 0;
+ }
+ else
+ {
+ info->keyread_buff_used= 1;
+ }
+
+ res= 0;
+ goto ok;
+ }
+ }
+ }
+ /* No matching key on this page */
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ my_errno= HA_ERR_KEY_NOT_FOUND;
+ res= 1;
+
+ok:
+ my_afree(page_buf);
+ return res;
+
+err:
+ my_afree(page_buf);
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ return -1;
+}
+
+
+/*
+ Find first key in r-tree according to search_flag condition
+
+ SYNOPSIS
+ maria_rtree_find_first()
+ info Handler to MARIA file
+ key Key to search for
+ search_flag Bitmap of flags how to do the search
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+int maria_rtree_find_first(MARIA_HA *info, MARIA_KEY *key, uint32 search_flag)
+{
+ my_off_t root;
+ uint nod_cmp_flag;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+
+ if ((root= info->s->state.key_root[keyinfo->key_nr]) == HA_OFFSET_ERROR)
+ {
+ /* Empty index */
+ my_errno= HA_ERR_END_OF_FILE;
+ return -1;
+ }
+
+ /*
+ Save searched key, include data pointer.
+ The data pointer is required if the search_flag contains MBR_DATA.
+ (minimum bounding rectangle)
+ */
+ memcpy(info->first_mbr_key, key->data, key->data_length + key->ref_length);
+ info->last_rkey_length= key->data_length;
+
+ /* Reset the saved recursion state used by maria_rtree_find_req() */
+ info->maria_rtree_recursion_depth= -1;
+ info->keyread_buff_used= 1;
+
+ /* Containment searches can prune internal nodes with MBR_WITHIN */
+ nod_cmp_flag= ((search_flag & (MBR_EQUAL | MBR_WITHIN)) ?
+ MBR_WITHIN : MBR_INTERSECT);
+ return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root,
+ 0);
+}
+
+
+/*
+ Find next key in r-tree according to search_flag condition
+
+ SYNOPSIS
+ maria_rtree_find_next()
+ info Handler to MARIA file
+ uint keynr Key number to use
+ search_flag Bitmap of flags how to do the search
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint32 search_flag)
+{
+ my_off_t root;
+ uint32 nod_cmp_flag;
+ MARIA_KEYDEF *keyinfo= info->s->keyinfo + keynr;
+ DBUG_ASSERT(info->last_key.keyinfo == keyinfo);
+
+ /* If the current row was deleted, the cached position is stale: restart */
+ if (info->update & HA_STATE_DELETED)
+ return maria_rtree_find_first(info, &info->last_key, search_flag);
+
+ if (!info->keyread_buff_used)
+ {
+ /* Continue inside the leaf remainder cached by maria_rtree_find_req() */
+ uchar *key= info->int_keypos;
+
+ while (key < info->int_maxpos)
+ {
+ if (!maria_rtree_key_cmp(keyinfo->seg,
+ info->first_mbr_key, key,
+ info->last_rkey_length, search_flag))
+ {
+ uchar *after_key= key + keyinfo->keylength;
+ MARIA_KEY tmp_key;
+
+ /*
+ We don't need to set all MARIA_KEY elements here as
+ _ma_row_pos_from_key only uses a few of them.
+ */
+ tmp_key.keyinfo= keyinfo;
+ tmp_key.data= key;
+ tmp_key.data_length= keyinfo->keylength - info->s->base.rec_reflength;
+
+ info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key);
+ memcpy(info->last_key.data, key, info->last_key.data_length);
+
+ if (after_key < info->int_maxpos)
+ info->int_keypos= after_key;
+ else
+ info->keyread_buff_used= 1;
+ return 0;
+ }
+ key+= keyinfo->keylength;
+ }
+ }
+ /* Cache exhausted or unusable: fall back to a full recursive search */
+ if ((root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+ {
+ my_errno= HA_ERR_END_OF_FILE;
+ return -1;
+ }
+
+ nod_cmp_flag= (((search_flag & (MBR_EQUAL | MBR_WITHIN)) ?
+ MBR_WITHIN : MBR_INTERSECT));
+ return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root,
+ 0);
+}
+
+
+/*
+ Get next key in r-tree recursively
+
+ NOTES
+ Used in maria_rtree_get_first() and maria_rtree_get_next()
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+static int maria_rtree_get_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ uint key_length, my_off_t page_pos, int level)
+{
+ MARIA_SHARE *share= info->s;
+ uchar *page_buf, *last, *k;
+ uint nod_flag, key_data_length;
+ int res;
+ /* Per-level slot remembering the key offset where the last scan stopped */
+ uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level;
+ MARIA_PAGE page;
+
+ if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+ {
+ /* Set my_errno for consistency with maria_rtree_find_req() */
+ my_errno= HA_ERR_OUT_OF_MEM;
+ return -1;
+ }
+ if (_ma_fetch_keypage(&page, info, keyinfo, page_pos,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ DFLT_INIT_HITS, page_buf, 0))
+ goto err;
+ nod_flag= page.node;
+
+ key_data_length= keyinfo->keylength - share->base.rec_reflength;
+
+ if (info->maria_rtree_recursion_depth >= level)
+ {
+ /* Resuming an earlier scan: continue from the saved key offset */
+ k= page.buff + *saved_key;
+ if (!nod_flag)
+ {
+ /* Only leaf pages contain data references. */
+ /* Need to check next key with data reference. */
+ k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag);
+ }
+ }
+ else
+ {
+ k= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag);
+ }
+ last= rt_PAGE_END(&page);
+
+ for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag))
+ {
+ if (nod_flag)
+ {
+ /* this is an internal node in the tree */
+ switch ((res= maria_rtree_get_req(info, keyinfo, key_length,
+ _ma_kpos(nod_flag, k), level + 1)))
+ {
+ case 0: /* found - exit from recursion */
+ *saved_key= k - page.buff;
+ goto ok;
+ case 1: /* not found - continue searching */
+ info->maria_rtree_recursion_depth= level;
+ break;
+ default:
+ case -1: /* error */
+ goto err;
+ }
+ }
+ else
+ {
+ /* this is a leaf */
+ uchar *after_key= rt_PAGE_NEXT_KEY(share, k, key_data_length, 0);
+ MARIA_KEY tmp_key;
+
+ /*
+ We don't need to set all MARIA_KEY elements here as
+ _ma_row_pos_from_key() only uses a few of them.
+ */
+ tmp_key.keyinfo= keyinfo;
+ tmp_key.data= k;
+ tmp_key.data_length= key_data_length;
+
+ info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key);
+ info->last_key.data_length= key_data_length;
+ info->last_key.ref_length= share->base.rec_reflength;
+
+ memcpy(info->last_key.data, k,
+ info->last_key.data_length + info->last_key.ref_length);
+
+ info->maria_rtree_recursion_depth= level;
+ *saved_key= k - page.buff;
+
+ if (after_key < last)
+ {
+ /* Cache the whole page so maria_rtree_get_next() can avoid a re-read */
+ uchar *keyread_buff= info->keyread_buff;
+ info->last_rtree_keypos= saved_key;
+ memcpy(keyread_buff, page.buff, page.size);
+ info->int_maxpos= keyread_buff + page.size;
+ info->keyread_buff_used= 0;
+ }
+ else
+ {
+ info->keyread_buff_used= 1;
+ }
+
+ res= 0;
+ goto ok;
+ }
+ }
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ my_errno= HA_ERR_KEY_NOT_FOUND;
+ res= 1;
+
+ok:
+ my_afree(page_buf);
+ return res;
+
+err:
+ my_afree(page_buf);
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ return -1;
+}
+
+
+/*
+ Get first key in r-tree
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+int maria_rtree_get_first(MARIA_HA *info, uint keynr, uint key_length)
+{
+ MARIA_SHARE *share= info->s;
+ my_off_t root= share->state.key_root[keynr];
+
+ if (root == HA_OFFSET_ERROR)
+ {
+ /* Empty tree: nothing to scan */
+ my_errno= HA_ERR_END_OF_FILE;
+ return -1;
+ }
+
+ /* Reset the saved recursion state so the scan starts from the top */
+ info->maria_rtree_recursion_depth= -1;
+ info->keyread_buff_used= 1;
+
+ return maria_rtree_get_req(info, share->keyinfo + keynr, key_length,
+ root, 0);
+}
+
+
+/*
+ Get next key in r-tree
+
+ RETURN
+ -1 Error
+ 0 Found
+ 1 Not found
+*/
+
+int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length)
+{
+ my_off_t root;
+ MARIA_KEYDEF *keyinfo= info->s->keyinfo + keynr;
+ uchar *keyread_buff= info->keyread_buff;
+
+ if (!info->keyread_buff_used)
+ {
+ /* Continue inside the page cached by maria_rtree_get_req() */
+ uint key_data_length= keyinfo->keylength - info->s->base.rec_reflength;
+ /* rt_PAGE_NEXT_KEY(*info->last_rtree_keypos) */
+ uchar *key= keyread_buff + *info->last_rtree_keypos + keyinfo->keylength;
+ /* rt_PAGE_NEXT_KEY(key) */
+ uchar *after_key= key + keyinfo->keylength;
+ MARIA_KEY tmp_key;
+
+ tmp_key.keyinfo= keyinfo;
+ tmp_key.data= key;
+ tmp_key.data_length= key_data_length;
+ tmp_key.ref_length= info->s->base.rec_reflength;
+ tmp_key.flag= 0;
+
+ info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key);
+ _ma_copy_key(&info->last_key, &tmp_key);
+
+ *info->last_rtree_keypos= (uint) (key - keyread_buff);
+ if (after_key >= info->int_maxpos)
+ {
+ info->keyread_buff_used= 1;
+ }
+
+ return 0;
+ }
+ else
+ {
+ if ((root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+ {
+ my_errno= HA_ERR_END_OF_FILE;
+ return -1;
+ }
+
+ /*
+ keyinfo already points at entry keynr (see initialization above);
+ the old "&keyinfo[keynr]" indexed past it and passed the wrong
+ MARIA_KEYDEF for any keynr > 0.
+ */
+ return maria_rtree_get_req(info, keyinfo, key_length, root, 0);
+ }
+}
+
+
+/*
+ Choose non-leaf better key for insertion
+
+ Returns a pointer inside the page_buf buffer.
+*/
+#ifdef PICK_BY_PERIMETER
+/*
+ Choose the child key whose MBR perimeter grows the least.
+
+ NOTE(review): this variant is compiled out (PICK_BY_AREA is defined).
+ The previous text referenced undeclared identifiers (nod_flag, keyinfo)
+ and used outdated rt_PAGE_* signatures, so it could not compile if
+ enabled. Rewritten to mirror the PICK_BY_AREA variant below; the exact
+ maria_rtree_perimeter_increase() signature should be confirmed against
+ ma_rt_mbr.h before enabling.
+*/
+static const uchar *maria_rtree_pick_key(const MARIA_KEY *key,
+ const MARIA_PAGE *page)
+{
+ const MARIA_HA *info= page->info;
+ MARIA_SHARE *share= info->s;
+ double increase;
+ double best_incr= DBL_MAX;
+ double perimeter;
+ double best_perimeter;
+ const uchar *best_key= NULL;
+ const uchar *k= rt_PAGE_FIRST_KEY(share, page->buff, page->node);
+ const uchar *last= rt_PAGE_END(page);
+
+ LINT_INIT(best_perimeter);
+
+ for (; k < last;
+ k= rt_PAGE_NEXT_KEY(share, k, key->data_length, page->node))
+ {
+ /* The following is safe as -1.0 is an exact number */
+ if ((increase= maria_rtree_perimeter_increase(key->keyinfo->seg, k,
+ key->data,
+ key->data_length +
+ key->ref_length,
+ &perimeter)) == -1.0)
+ return NULL;
+ /* The following should be safe, even if we compare doubles */
+ if (!best_key || increase < best_incr ||
+ ((increase == best_incr) && (perimeter < best_perimeter)))
+ {
+ best_key= k;
+ best_perimeter= perimeter;
+ best_incr= increase;
+ }
+ }
+ return best_key;
+}
+
+#endif /*PICK_BY_PERIMETER*/
+
+#ifdef PICK_BY_AREA
+/*
+ Choose the child key whose MBR area grows the least when extended to
+ cover the new key; ties are broken by smaller resulting area.
+ Returns a pointer inside page->buff, or NULL on error.
+*/
+static const uchar *maria_rtree_pick_key(const MARIA_KEY *key,
+ const MARIA_PAGE *page)
+{
+ const MARIA_HA *info= page->info;
+ MARIA_SHARE *share= info->s;
+ double increase;
+ double best_incr= DBL_MAX;
+ double area;
+ double best_area;
+ const uchar *best_key= NULL;
+ const uchar *k= rt_PAGE_FIRST_KEY(share, page->buff, page->node);
+ const uchar *last= rt_PAGE_END(page);
+
+ LINT_INIT(best_area);
+
+ for (; k < last;
+ k= rt_PAGE_NEXT_KEY(share, k, key->data_length, page->node))
+ {
+ /* The following is safe as -1.0 is an exact number */
+ if ((increase= maria_rtree_area_increase(key->keyinfo->seg, k, key->data,
+ key->data_length +
+ key->ref_length,
+ &area)) == -1.0)
+ return NULL;
+ /* The following should be safe, even if we compare doubles */
+ if (!best_key || increase < best_incr ||
+ ((increase == best_incr) && (area < best_area)))
+ {
+ best_key= k;
+ best_area= area;
+ best_incr= increase;
+ }
+ }
+ return best_key;
+}
+
+#endif /*PICK_BY_AREA*/
+
+/*
+ Go down and insert key into tree
+
+ RETURN
+ -1 Error
+ 0 Child was not split
+ 1 Child was split
+*/
+
+static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEY *key,
+ my_off_t page_pos, my_off_t *new_page,
+ int ins_level, int level)
+{
+ uint nod_flag;
+ uint key_length= key->data_length;
+ int res;
+ uchar *page_buf, *k;
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ MARIA_PAGE page;
+ DBUG_ENTER("maria_rtree_insert_req");
+
+ /* Extra MARIA_MAX_KEY_BUFF bytes serve as scratch space for a new key */
+ if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length +
+ MARIA_MAX_KEY_BUFF)))
+ {
+ my_errno= HA_ERR_OUT_OF_MEM;
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+ if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE,
+ DFLT_INIT_HITS, page_buf, 0))
+ goto err;
+ nod_flag= page.node;
+ DBUG_PRINT("rtree", ("page: %lu level: %d ins_level: %d nod_flag: %u",
+ (ulong) page.pos, level, ins_level, nod_flag));
+
+ if ((ins_level == -1 && nod_flag) || /* key: go down to leaf */
+ (ins_level > -1 && ins_level > level)) /* branch: go down to ins_level */
+ {
+ /* Descend into the child chosen by the pick heuristic */
+ if (!(k= (uchar *)maria_rtree_pick_key(key, &page)))
+ goto err;
+ /* k is now a pointer inside the page_buf buffer */
+ switch ((res= maria_rtree_insert_req(info, key,
+ _ma_kpos(nod_flag, k), new_page,
+ ins_level, level + 1)))
+ {
+ case 0: /* child was not split, most common case */
+ {
+ /* Enlarge the child's MBR to cover the inserted key */
+ maria_rtree_combine_rect(keyinfo->seg, k, key->data, k, key_length);
+ if (share->now_transactional &&
+ _ma_log_change(&page, k, key_length,
+ KEY_OP_DEBUG_RTREE_COMBINE))
+ goto err;
+ page_mark_changed(info, &page);
+ if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS))
+ goto err;
+ goto ok;
+ }
+ case 1: /* child was split */
+ {
+ /* Set new_key to point to a free buffer area */
+ uchar *new_key_buff= page_buf + keyinfo->block_length + nod_flag;
+ MARIA_KEY new_key;
+ MARIA_KEY k_key;
+
+ DBUG_ASSERT(nod_flag);
+ k_key.keyinfo= new_key.keyinfo= keyinfo;
+ new_key.data= new_key_buff;
+ k_key.data= k;
+ k_key.data_length= new_key.data_length= key->data_length;
+ k_key.ref_length= new_key.ref_length= key->ref_length;
+ k_key.flag= new_key.flag= 0; /* Safety */
+
+ /* set proper MBR for key */
+ if (maria_rtree_set_key_mbr(info, &k_key, _ma_kpos(nod_flag, k)))
+ goto err;
+ if (share->now_transactional &&
+ _ma_log_change(&page, k, key_length,
+ KEY_OP_DEBUG_RTREE_SPLIT))
+ goto err;
+ /* add new key for new page */
+ _ma_kpointer(info, new_key_buff - nod_flag, *new_page);
+ if (maria_rtree_set_key_mbr(info, &new_key, *new_page))
+ goto err;
+ res= maria_rtree_add_key(&new_key, &page, new_page);
+ page_mark_changed(info, &page);
+ if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS))
+ goto err;
+ goto ok;
+ }
+ default:
+ case -1: /* error */
+ {
+ goto err;
+ }
+ }
+ }
+ else
+ {
+ /* Target level reached: insert the key into this page */
+ res= maria_rtree_add_key(key, &page, new_page);
+ page_mark_changed(info, &page);
+ if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS))
+ goto err;
+ }
+
+ok:
+ my_afree(page_buf);
+ DBUG_RETURN(res);
+
+err:
+ res= -1; /* purecov: inspected */
+ goto ok; /* purecov: inspected */
+}
+
+
+/**
+ Insert key into the tree
+
+ @param info table
+ @param key KEY to insert
+ @param ins_level at which level key insertion should start
+ @param root put new key_root there
+
+ @return Operation result
+ @retval -1 Error
+ @retval 0 Root was not split
+ @retval 1 Root was split
+*/
+
+int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key, int ins_level,
+ my_off_t *root)
+{
+ my_off_t old_root;
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ int res;
+ my_off_t new_page;
+ enum pagecache_page_lock write_lock;
+ DBUG_ENTER("maria_rtree_insert_level");
+
+ if ((old_root= share->state.key_root[keyinfo->key_nr]) == HA_OFFSET_ERROR)
+ {
+ /* Tree is empty: allocate and initialize the first (leaf) root page */
+ MARIA_PINNED_PAGE tmp_page_link, *page_link;
+ MARIA_PAGE page;
+
+ page_link= &tmp_page_link;
+ if ((old_root= _ma_new(info, DFLT_INIT_HITS, &page_link)) ==
+ HA_OFFSET_ERROR)
+ DBUG_RETURN(-1);
+ write_lock= page_link->write_lock;
+ info->keyread_buff_used= 1;
+ bzero(info->buff, share->block_size);
+ _ma_store_keynr(share, info->buff, keyinfo->key_nr);
+ _ma_store_page_used(share, info->buff, share->keypage_header);
+ _ma_page_setup(&page, info, keyinfo, old_root, info->buff);
+
+ /* NOTE(review): failure here returns 1, the "root was split" code,
+ not -1 — confirm callers treat both as an error */
+ if (share->now_transactional && _ma_log_new(&page, 1))
+ DBUG_RETURN(1);
+
+ res= maria_rtree_add_key(key, &page, NULL);
+ if (_ma_write_keypage(&page, write_lock, DFLT_INIT_HITS))
+ DBUG_RETURN(1);
+ *root= old_root;
+ DBUG_RETURN(res);
+ }
+
+ switch ((res= maria_rtree_insert_req(info, key, old_root, &new_page,
+ ins_level, 0)))
+ {
+ case 0: /* root was not split */
+ {
+ break;
+ }
+ case 1: /* root was split, grow a new root; very rare */
+ {
+ uchar *new_root_buf, *new_key_buff;
+ my_off_t new_root;
+ uint nod_flag= share->base.key_reflength;
+ MARIA_PINNED_PAGE tmp_page_link, *page_link;
+ MARIA_KEY new_key;
+ MARIA_PAGE page;
+ page_link= &tmp_page_link;
+
+ DBUG_PRINT("rtree", ("root was split, grow a new root"));
+ if (!(new_root_buf= (uchar*) my_alloca((uint) keyinfo->block_length +
+ MARIA_MAX_KEY_BUFF)))
+ {
+ my_errno= HA_ERR_OUT_OF_MEM;
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+
+ bzero(new_root_buf, share->block_size);
+ _ma_store_keypage_flag(share, new_root_buf, KEYPAGE_FLAG_ISNOD);
+ _ma_store_keynr(share, new_root_buf, keyinfo->key_nr);
+ _ma_store_page_used(share, new_root_buf, share->keypage_header);
+ if ((new_root= _ma_new(info, DFLT_INIT_HITS, &page_link)) ==
+ HA_OFFSET_ERROR)
+ goto err;
+ write_lock= page_link->write_lock;
+
+ _ma_page_setup(&page, info, keyinfo, new_root, new_root_buf);
+
+ if (share->now_transactional && _ma_log_new(&page, 1))
+ goto err;
+
+ /* Point to some free space */
+ new_key_buff= new_root_buf + keyinfo->block_length + nod_flag;
+ new_key.keyinfo= keyinfo;
+ new_key.data= new_key_buff;
+ new_key.data_length= key->data_length;
+ new_key.ref_length= key->ref_length;
+ new_key.flag= 0;
+
+ /* Add one key for the old root and one for the page it split into */
+ _ma_kpointer(info, new_key_buff - nod_flag, old_root);
+ if (maria_rtree_set_key_mbr(info, &new_key, old_root))
+ goto err;
+ if (maria_rtree_add_key(&new_key, &page, NULL)
+ == -1)
+ goto err;
+ _ma_kpointer(info, new_key_buff - nod_flag, new_page);
+ if (maria_rtree_set_key_mbr(info, &new_key, new_page))
+ goto err;
+ if (maria_rtree_add_key(&new_key, &page, NULL)
+ == -1)
+ goto err;
+ if (_ma_write_keypage(&page, write_lock, DFLT_INIT_HITS))
+ goto err;
+ *root= new_root;
+ DBUG_PRINT("rtree", ("new root page: %lu level: %d nod_flag: %u",
+ (ulong) new_root, 0, page.node));
+
+ my_afree(new_root_buf);
+ break;
+err:
+ my_afree(new_root_buf);
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+ default:
+ case -1: /* error */
+ {
+ DBUG_ASSERT(0);
+ break;
+ }
+ }
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Insert key into the tree - interface function
+
+ RETURN
+ 1 Error
+ 0 OK
+*/
+
+my_bool maria_rtree_insert(MARIA_HA *info, MARIA_KEY *key)
+{
+ int res;
+ MARIA_SHARE *share= info->s;
+ my_off_t *root, new_root;
+ LSN lsn= LSN_IMPOSSIBLE;
+ DBUG_ENTER("maria_rtree_insert");
+
+ if (!key)
+ DBUG_RETURN(1); /* _ma_sp_make_key failed */
+
+ root= &share->state.key_root[key->keyinfo->key_nr];
+ new_root= *root;
+
+ /* res is 1 only when insert_level reported an error (-1) */
+ if ((res= (maria_rtree_insert_level(info, key, -1, &new_root) == -1)))
+ goto err;
+ if (share->now_transactional)
+ /* presumably also publishes new_root via *root — confirm in ma_key_recover.c */
+ res= _ma_write_undo_key_insert(info, key, root, new_root, &lsn);
+ else
+ {
+ *root= new_root;
+ _ma_fast_unlock_key_del(info);
+ }
+ _ma_unpin_all_pages_and_finalize_row(info, lsn);
+err:
+ /* NOTE(review): the error path skips _ma_unpin_all_pages_and_finalize_row;
+ confirm pinned pages are released elsewhere on failure */
+ DBUG_RETURN(res != 0);
+}
+
+
+/*
+ Fill reinsert page buffer
+
+ RETURN
+ 1 Error
+ 0 OK
+*/
+
+/*
+ Append a page (with its tree level) to the reinsert list, growing the
+ backing array by REINSERT_BUFFER_INC entries when it is full.
+
+ RETURN
+ 1 Error (out of memory; list left unchanged)
+ 0 OK
+*/
+
+static my_bool maria_rtree_fill_reinsert_list(stPageList *ReinsertList,
+ my_off_t page, int level)
+{
+ DBUG_ENTER("maria_rtree_fill_reinsert_list");
+ DBUG_PRINT("rtree", ("page: %lu level: %d", (ulong) page, level));
+ if (ReinsertList->n_pages == ReinsertList->m_pages)
+ {
+ /*
+ Use a temporary for the realloc result so the old array (and the
+ old m_pages count) stay valid if the allocation fails; the previous
+ code overwrote ReinsertList->pages with NULL, leaking the block.
+ */
+ stPageLevel *new_pages;
+ uint new_m_pages= ReinsertList->m_pages + REINSERT_BUFFER_INC;
+ if (!(new_pages= (stPageLevel*)my_realloc((uchar*)ReinsertList->pages,
+ new_m_pages * sizeof(stPageLevel), MYF(MY_ALLOW_ZERO_PTR))))
+ DBUG_RETURN(1); /* purecov: inspected */
+ ReinsertList->pages= new_pages;
+ ReinsertList->m_pages= new_m_pages;
+ }
+ /* save page to ReinsertList */
+ ReinsertList->pages[ReinsertList->n_pages].offs= page;
+ ReinsertList->pages[ReinsertList->n_pages].level= level;
+ ReinsertList->n_pages++;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Go down and delete key from the tree
+
+ RETURN
+ -1 Error
+ 0 Deleted
+ 1 Not found
+ 2 Empty leaf
+*/
+
+static int maria_rtree_delete_req(MARIA_HA *info, const MARIA_KEY *key,
+ my_off_t page_pos, uint *page_size,
+ stPageList *ReinsertList, int level)
+{
+ ulong i;
+ uint nod_flag;
+ int res;
+ uchar *page_buf, *last, *k;
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ MARIA_PAGE page;
+ DBUG_ENTER("maria_rtree_delete_req");
+
+ if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+ {
+ my_errno= HA_ERR_OUT_OF_MEM;
+ DBUG_RETURN(-1); /* purecov: inspected */
+ }
+ if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE,
+ DFLT_INIT_HITS, page_buf, 0))
+ goto err;
+ nod_flag= page.node;
+ DBUG_PRINT("rtree", ("page: %lu level: %d nod_flag: %u",
+ (ulong) page_pos, level, nod_flag));
+
+ k= rt_PAGE_FIRST_KEY(share, page_buf, nod_flag);
+ last= rt_PAGE_END(&page);
+
+ for (i= 0;
+ k < last;
+ k= rt_PAGE_NEXT_KEY(share, k, key->data_length, nod_flag), i++)
+ {
+ if (nod_flag)
+ {
+ /* not leaf */
+ if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key->data_length,
+ MBR_WITHIN))
+ {
+ switch ((res= maria_rtree_delete_req(info, key,
+ _ma_kpos(nod_flag, k),
+ page_size, ReinsertList,
+ level + 1)))
+ {
+ case 0: /* deleted */
+ {
+ /* test page filling */
+ if (*page_size + key->data_length >=
+ rt_PAGE_MIN_SIZE(keyinfo->block_length))
+ {
+ /* OK */
+ /* Calculate a new key value (MBR) for the shrinked block. */
+ MARIA_KEY tmp_key;
+ tmp_key.keyinfo= keyinfo;
+ tmp_key.data= k;
+ tmp_key.data_length= key->data_length;
+ tmp_key.ref_length= key->ref_length;
+ tmp_key.flag= 0; /* Safety */
+
+ if (maria_rtree_set_key_mbr(info, &tmp_key,
+ _ma_kpos(nod_flag, k)))
+ goto err;
+ if (share->now_transactional &&
+ _ma_log_change(&page, k, key->data_length,
+ KEY_OP_DEBUG_RTREE_SET_KEY))
+ goto err;
+ /* Terminating ';' added: every other call site ends the
+ statement explicitly */
+ page_mark_changed(info, &page);
+ if (_ma_write_keypage(&page,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS))
+ goto err;
+ }
+ else
+ {
+ /*
+ Too small: delete key & add it descendant to reinsert list.
+ Store position and level of the block so that it can be
+ accessed later for inserting the remaining keys.
+ */
+ DBUG_PRINT("rtree", ("too small. move block to reinsert list"));
+ if (maria_rtree_fill_reinsert_list(ReinsertList,
+ _ma_kpos(nod_flag, k),
+ level + 1))
+ goto err;
+ /*
+ Delete the key that references the block. This makes the
+ block disappear from the index. Hence we need to insert
+ its remaining keys later. Note: if the block is a branch
+ block, we do not only remove this block, but the whole
+ subtree. So we need to re-insert its keys on the same
+ level later to reintegrate the subtrees.
+ */
+ if (maria_rtree_delete_key(&page, k, key->data_length))
+ goto err;
+ page_mark_changed(info, &page);
+ if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS))
+ goto err;
+ *page_size= page.size;
+ }
+
+ goto ok;
+ }
+ case 1: /* not found - continue searching */
+ {
+ break;
+ }
+ case 2: /* vacuous case: last key in the leaf */
+ {
+ if (maria_rtree_delete_key(&page, k, key->data_length))
+ goto err;
+ page_mark_changed(info, &page);
+ if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS))
+ goto err;
+ *page_size= page.size;
+ res= 0;
+ goto ok;
+ }
+ default: /* error */
+ case -1:
+ {
+ goto err;
+ }
+ }
+ }
+ }
+ else
+ {
+ /* leaf */
+ if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key->data_length,
+ MBR_EQUAL | MBR_DATA))
+ {
+ page_mark_changed(info, &page);
+ if (maria_rtree_delete_key(&page, k, key->data_length))
+ goto err;
+ *page_size= page.size;
+ if (*page_size == info->s->keypage_header)
+ {
+ /* last key in the leaf */
+ res= 2;
+ if (_ma_dispose(info, page.pos, 0))
+ goto err;
+ }
+ else
+ {
+ res= 0;
+ if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS))
+ goto err;
+ }
+ goto ok;
+ }
+ }
+ }
+ res= 1;
+
+ok:
+ my_afree(page_buf);
+ DBUG_RETURN(res);
+
+err:
+ my_afree(page_buf);
+ DBUG_RETURN(-1); /* purecov: inspected */
+}
+
+
+/*
+ Delete key - interface function
+
+ RETURN
+ 1 Error
+ 0 Deleted
+*/
+
+my_bool maria_rtree_delete(MARIA_HA *info, MARIA_KEY *key)
+{
+  MARIA_SHARE *share= info->s;
+  my_off_t new_root= share->state.key_root[key->keyinfo->key_nr];
+  LSN lsn= LSN_IMPOSSIBLE;
+  int res;
+  DBUG_ENTER("maria_rtree_delete");
+
+  /* Do the actual tree deletion; on success publish the (possibly new) root */
+  res= maria_rtree_real_delete(info, key, &new_root);
+  if (!res)
+  {
+    if (share->now_transactional)
+      res= _ma_write_undo_key_delete(info, key, new_root, &lsn);
+    else
+      share->state.key_root[key->keyinfo->key_nr]= new_root;
+  }
+
+  /* Cleanup runs on success and failure alike */
+  _ma_fast_unlock_key_del(info);
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(res != 0);
+}
+
+
+/*
+  Remove a key from the R-tree (lower half of maria_rtree_delete()).
+
+  maria_rtree_delete_req() removes the key and collects underflowed
+  blocks in ReinsertList.  Their remaining keys are then re-inserted on
+  the same level and the blocks disposed.  Finally a redundant root
+  (non-leaf page with a single child) is collapsed into its child.
+
+  @param info  table handler
+  @param key   key to delete
+  @param root  in: current key root; out: possibly changed key root
+
+  @return 0 ok, 1 error (my_errno is set)
+*/
+
+my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key,
+                                my_off_t *root)
+{
+  uint page_size;
+  stPageList ReinsertList;
+  my_off_t old_root;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  uint key_data_length= key->data_length;
+  DBUG_ENTER("maria_rtree_real_delete");
+
+  if ((old_root= share->state.key_root[keyinfo->key_nr]) ==
+      HA_OFFSET_ERROR)
+  {
+    my_errno= HA_ERR_END_OF_FILE;
+    DBUG_RETURN(1);                             /* purecov: inspected */
+  }
+  DBUG_PRINT("rtree", ("starting deletion at root page: %lu",
+                       (ulong) old_root));
+
+  ReinsertList.pages= NULL;
+  ReinsertList.n_pages= 0;
+  ReinsertList.m_pages= 0;
+
+  switch (maria_rtree_delete_req(info, key, old_root, &page_size,
+                                 &ReinsertList, 0)) {
+  case 2: /* empty */
+  {
+    *root= HA_OFFSET_ERROR;
+    break;
+  }
+  case 0: /* deleted */
+  {
+    uint nod_flag;
+    ulong i;
+    uchar *page_buf;
+    MARIA_PAGE page;
+    MARIA_KEY tmp_key;
+    tmp_key.keyinfo= key->keyinfo;
+    tmp_key.data_length= key->data_length;
+    tmp_key.ref_length= key->ref_length;
+    tmp_key.flag= 0;                            /* Safety */
+
+    if (ReinsertList.n_pages)
+    {
+      if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+      {
+        my_errno= HA_ERR_OUT_OF_MEM;
+        goto err;
+      }
+
+      for (i= 0; i < ReinsertList.n_pages; ++i)
+      {
+        uchar *k, *last;
+        if (_ma_fetch_keypage(&page, info, keyinfo, ReinsertList.pages[i].offs,
+                              PAGECACHE_LOCK_WRITE,
+                              DFLT_INIT_HITS, page_buf, 0))
+        {
+          my_afree(page_buf);          /* was leaked on this path before */
+          goto err;
+        }
+        nod_flag= page.node;
+        DBUG_PRINT("rtree", ("reinserting keys from "
+                             "page: %lu  level: %d  nod_flag: %u",
+                             (ulong) ReinsertList.pages[i].offs,
+                             ReinsertList.pages[i].level, nod_flag));
+
+        k= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag);
+        last= rt_PAGE_END(&page);
+        for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length,
+                                             nod_flag))
+        {
+          int res;
+          tmp_key.data= k;
+          if ((res= maria_rtree_insert_level(info, &tmp_key,
+                                             ReinsertList.pages[i].level,
+                                             root)) == -1)
+          {
+            my_afree(page_buf);
+            goto err;
+          }
+          if (res)
+          {
+            uint j;
+            DBUG_PRINT("rtree", ("root has been split, adjust levels"));
+            for (j= i; j < ReinsertList.n_pages; j++)
+            {
+              ReinsertList.pages[j].level++;
+              /* Fixed: print the entry being adjusted (was pages[i]) */
+              DBUG_PRINT("rtree", ("keys from page: %lu  now level: %d",
+                                   (ulong) ReinsertList.pages[j].offs,
+                                   ReinsertList.pages[j].level));
+            }
+          }
+        }
+        page_mark_changed(info, &page);
+        if (_ma_dispose(info, page.pos, 0))
+        {
+          my_afree(page_buf);
+          goto err;
+        }
+      }
+      my_afree(page_buf);
+      my_free(ReinsertList.pages, MYF(0));
+      ReinsertList.pages= NULL;        /* so 'err:' cannot double-free */
+    }
+
+    /* check for redundant root (not leaf, 1 child) and eliminate */
+    if ((old_root= *root) == HA_OFFSET_ERROR)
+      goto err;
+    if (_ma_fetch_keypage(&page, info, keyinfo, old_root,
+                          PAGECACHE_LOCK_WRITE,
+                          DFLT_INIT_HITS, info->buff, 0))
+      goto err;
+    nod_flag= page.node;
+    if (nod_flag && (page.size == share->keypage_header + key_data_length +
+                     nod_flag))
+    {
+      *root= _ma_kpos(nod_flag,
+                      rt_PAGE_FIRST_KEY(share, info->buff, nod_flag));
+      page_mark_changed(info, &page);
+      if (_ma_dispose(info, page.pos, 0))
+        goto err;
+    }
+    info->update= HA_STATE_DELETED;
+    break;
+  }
+  case 1: /* not found */
+  {
+    my_errno= HA_ERR_KEY_NOT_FOUND;
+    goto err;
+  }
+  case -1: /* error */
+  default:
+    goto err;                                   /* purecov: inspected */
+  }
+  DBUG_RETURN(0);
+
+err:
+  /* Fixed: the reinsert list leaked on all error paths */
+  if (ReinsertList.pages)
+    my_free(ReinsertList.pages, MYF(0));
+  DBUG_RETURN(1);
+}
+
+
+/*
+ Estimate number of suitable keys in the tree
+
+ RETURN
+ estimated value
+*/
+
+ha_rows maria_rtree_estimate(MARIA_HA *info, MARIA_KEY *key, uint32 flag)
+{
+  my_off_t root;
+  uint i= 0;
+  uint nod_flag, key_data_length;
+  uchar *page_buf, *k, *last;
+  double area= 0;
+  ha_rows res= 0;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+
+  /* No useful estimate possible for DISJOINT; assume every row matches */
+  if (flag & MBR_DISJOINT)
+    return info->state->records;
+
+  if ((root= share->state.key_root[key->keyinfo->key_nr]) == HA_OFFSET_ERROR)
+    return HA_POS_ERROR;
+  if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+    return HA_POS_ERROR;
+  /* Only the root page is examined; children are extrapolated by area */
+  if (_ma_fetch_keypage(&page, info, keyinfo, root,
+                        PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, page_buf,
+                        0))
+    goto err;
+  nod_flag= page.node;
+
+  key_data_length= key->data_length;
+
+  k= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag);
+  last= rt_PAGE_END(&page);
+
+  /* i counts keys on the root page; 'area' accumulates match fractions */
+  for (; k < last;
+       k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag), i++)
+  {
+    if (nod_flag)
+    {
+      double k_area= maria_rtree_rect_volume(keyinfo->seg, k, key_data_length);
+
+      /* The following should be safe, even if we compare doubles */
+      if (k_area == 0)
+      {
+        /* Degenerate (zero-volume) child MBR: count it as one full match */
+        if (flag & (MBR_CONTAIN | MBR_INTERSECT))
+        {
+          area+= 1;
+        }
+        else if (flag & (MBR_WITHIN | MBR_EQUAL))
+        {
+          if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key_data_length,
+                                   MBR_WITHIN))
+            area+= 1;
+        }
+        else
+          goto err;
+      }
+      else
+      {
+        /* Estimate by the fraction of the child MBR the search MBR covers */
+        if (flag & (MBR_CONTAIN | MBR_INTERSECT))
+        {
+          area+= maria_rtree_overlapping_area(keyinfo->seg, key->data, k,
+                                              key_data_length) / k_area;
+        }
+        else if (flag & (MBR_WITHIN | MBR_EQUAL))
+        {
+          if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key_data_length,
+                                   MBR_WITHIN))
+            area+= (maria_rtree_rect_volume(keyinfo->seg, key->data,
+                                            key_data_length) / k_area);
+        }
+        else
+          goto err;
+      }
+    }
+    else
+    {
+      /* Root is a leaf: count exact matches directly */
+      if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key_data_length,
+                               flag))
+        ++res;
+    }
+  }
+  if (nod_flag)
+  {
+    /* Scale average per-child match fraction to the whole table */
+    if (i)
+      res= (ha_rows) (area / i * info->state->records);
+    else
+      res= HA_POS_ERROR;
+  }
+
+  my_afree(page_buf);
+  return res;
+
+err:
+  my_afree(page_buf);
+  return HA_POS_ERROR;
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_index.h b/storage/maria/ma_rt_index.h
new file mode 100644
index 00000000000..dacaa4389b7
--- /dev/null
+++ b/storage/maria/ma_rt_index.h
@@ -0,0 +1,46 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _rt_index_h
+#define _rt_index_h
+
+#ifdef HAVE_RTREE_KEYS
+
+/*
+  Key-page layout helpers.  All macro arguments are parenthesized in the
+  expansion (CERT PRE01-C) so that passing compound expressions cannot
+  change evaluation through operator precedence.
+*/
+#define rt_PAGE_FIRST_KEY(share, page, nod_flag) ((page) + (share)->keypage_header + (nod_flag))
+#define rt_PAGE_NEXT_KEY(share, key, key_length, nod_flag) ((key) + (key_length) +\
+                                                            ((nod_flag) ? (nod_flag) : (share)->base.rec_reflength))
+#define rt_PAGE_END(page) ((page)->buff + (page)->size)
+
+#define rt_PAGE_MIN_SIZE(block_length) ((uint)((block_length) - KEYPAGE_CHECKSUM_SIZE) / 3)
+
+my_bool maria_rtree_insert(MARIA_HA *info, MARIA_KEY *key);
+my_bool maria_rtree_delete(MARIA_HA *info, MARIA_KEY *key);
+int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key,
+                             int ins_level, my_off_t *root);
+my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key,
+                                my_off_t *root);
+int maria_rtree_find_first(MARIA_HA *info, MARIA_KEY *key, uint search_flag);
+int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint32 search_flag);
+
+int maria_rtree_get_first(MARIA_HA *info, uint keynr, uint key_length);
+int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length);
+
+ha_rows maria_rtree_estimate(MARIA_HA *info, MARIA_KEY *key, uint32 flag);
+
+int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page,
+                           my_off_t *new_page_offs);
+#endif /*HAVE_RTREE_KEYS*/
+#endif /* _rt_index_h */
diff --git a/storage/maria/ma_rt_key.c b/storage/maria/ma_rt_key.c
new file mode 100644
index 00000000000..fa173605cd3
--- /dev/null
+++ b/storage/maria/ma_rt_key.c
@@ -0,0 +1,120 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+#ifdef HAVE_RTREE_KEYS
+#include "ma_rt_index.h"
+#include "ma_rt_key.h"
+#include "ma_rt_mbr.h"
+
+/*
+ Add key to the page
+
+ RESULT VALUES
+ -1 Error
+ 0 Not split
+ 1 Split
+*/
+
+int maria_rtree_add_key(const MARIA_KEY *key, MARIA_PAGE *page,
+                        my_off_t *new_page)
+{
+  MARIA_HA *info= page->info;
+  MARIA_SHARE *share= info->s;
+  uint page_size= page->size;
+  uint nod_flag= page->node;
+  /* New keys are always appended at the current end of the page */
+  uchar *key_pos= rt_PAGE_END(page);
+  uint tot_key_length= key->data_length + key->ref_length + nod_flag;
+  DBUG_ENTER("maria_rtree_add_key");
+
+  if (page_size + tot_key_length <=
+      (uint)(key->keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE))
+  {
+    /* split won't be necessary */
+    if (nod_flag)
+    {
+      DBUG_ASSERT(_ma_kpos(nod_flag, key->data) <
+                  info->state->key_file_length);
+      /* We don't store reference to row on nod pages for rtree index */
+      tot_key_length-= key->ref_length;
+    }
+    /* save key */
+    /*
+      NOTE(review): on node pages the child-page pointer appears to be
+      stored in the nod_flag bytes directly before the key data (cf.
+      _ma_kpos above), hence the copy starts at data - nod_flag -- confirm.
+    */
+    memcpy(key_pos, key->data - nod_flag, tot_key_length);
+    page->size+= tot_key_length;
+    page_store_size(share, page);
+    /* Log the append for REDO recovery when the table is transactional */
+    if (share->now_transactional &&
+        _ma_log_add(page, key_pos - page->buff,
+                    key_pos, tot_key_length, tot_key_length, 0,
+                    KEY_OP_DEBUG_LOG_ADD_1))
+      DBUG_RETURN(-1);
+    DBUG_RETURN(0);
+  }
+  /* Key does not fit: split the page (result 1), or -1 on split failure */
+  DBUG_RETURN(maria_rtree_split_page(key, page, new_page) ? -1 : 1);
+}
+
+
+/*
+ Delete key from the page
+
+ Notes
+ key_length is only the data part of the key
+*/
+
+int maria_rtree_delete_key(MARIA_PAGE *page, uchar *key, uint key_length)
+{
+  MARIA_HA *info= page->info;
+  MARIA_SHARE *share= info->s;
+  uint key_length_with_nod_flag;
+  uchar *key_start;
+
+  /* On node pages the entry starts nod_flag bytes before the key data */
+  key_start= key - page->node;
+  /* On leaf pages the row reference follows the key data; remove it too */
+  if (!page->node)
+    key_length+= share->base.rec_reflength;
+
+  /* Close the gap by shifting the rest of the page down over the entry */
+  memmove(key_start, key + key_length, page->size - key_length -
+          (key - page->buff));
+  key_length_with_nod_flag= key_length + page->node;
+  page->size-= key_length_with_nod_flag;
+  page_store_size(share, page);
+  /* Log the removal for REDO recovery when the table is transactional */
+  if (share->now_transactional &&
+      _ma_log_delete(page, key_start, 0, key_length_with_nod_flag,
+                     0, KEY_OP_DEBUG_LOG_DEL_CHANGE_RT))
+    return -1;
+  return 0;
+}
+
+
+/*
+ Calculate and store key MBR into *key.
+*/
+
+int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEY *key,
+                            my_off_t child_page)
+{
+  MARIA_PAGE page;
+  int res= -1;
+  DBUG_ENTER("maria_rtree_set_key_mbr");
+
+  /* Read the child page, then recompute the MBR covering all its keys */
+  if (!_ma_fetch_keypage(&page, info, key->keyinfo, child_page,
+                         PAGECACHE_LOCK_LEFT_UNLOCKED,
+                         DFLT_INIT_HITS, info->buff, 0))
+    res= maria_rtree_page_mbr(key->keyinfo->seg,
+                              &page, key->data, key->data_length);
+  DBUG_RETURN(res);
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_key.h b/storage/maria/ma_rt_key.h
new file mode 100644
index 00000000000..948809f3d38
--- /dev/null
+++ b/storage/maria/ma_rt_key.h
@@ -0,0 +1,31 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Ramil Kalimullin, who has a shared copyright to this code */
+
+#ifndef _rt_key_h
+#define _rt_key_h
+
+#ifdef HAVE_RTREE_KEYS
+
+/* Append a key to a page; returns -1 error, 0 added, 1 page was split */
+int maria_rtree_add_key(const MARIA_KEY *key, MARIA_PAGE *page,
+                        my_off_t *new_page);
+/* Remove the entry at 'key' from the page; key_length is data part only */
+int maria_rtree_delete_key(MARIA_PAGE *page, uchar *key, uint key_length);
+/* Recompute key's MBR from the keys stored on child_page */
+int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEY *key,
+                            my_off_t child_page);
+
+#endif /*HAVE_RTREE_KEYS*/
+#endif /* _rt_key_h */
diff --git a/storage/maria/ma_rt_mbr.c b/storage/maria/ma_rt_mbr.c
new file mode 100644
index 00000000000..b3e2b0ceab8
--- /dev/null
+++ b/storage/maria/ma_rt_mbr.c
@@ -0,0 +1,818 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+#include "ma_rt_mbr.h"
+
+/*
+  Per-dimension predicates: each takes the (min,max) interval of one
+  dimension of rectangles a and b and is non-zero when the relation
+  does NOT hold for that dimension.
+*/
+#define INTERSECT_CMP(amin, amax, bmin, bmax) ((amin > bmax) || (bmin > amax))
+#define CONTAIN_CMP(amin, amax, bmin, bmax) ((bmin > amin) || (bmax < amax))
+#define WITHIN_CMP(amin, amax, bmin, bmax) ((amin > bmin) || (amax < bmax))
+#define DISJOINT_CMP(amin, amax, bmin, bmax) ((amin <= bmax) && (bmin <= amax))
+#define EQUAL_CMP(amin, amax, bmin, bmax) ((amin != bmin) || (amax != bmax))
+
+/* Byte comparison result for the trailing data (row reference) part */
+#define FCMP(A, B) ((int)(A) - (int)(B))
+#define p_inc(A, B, X) {A += X; B += X;}
+
+/* Returns 1 from the enclosing function as soon as one dimension fails */
+#define RT_CMP(nextflag) \
+  if (nextflag & MBR_INTERSECT) \
+  { \
+    if (INTERSECT_CMP(amin, amax, bmin, bmax)) \
+      return 1; \
+  } \
+  else if (nextflag & MBR_CONTAIN) \
+  { \
+    if (CONTAIN_CMP(amin, amax, bmin, bmax)) \
+      return 1; \
+  } \
+  else if (nextflag & MBR_WITHIN) \
+  { \
+    if (WITHIN_CMP(amin, amax, bmin, bmax)) \
+      return 1; \
+  } \
+  else if (nextflag & MBR_EQUAL) \
+  { \
+    if (EQUAL_CMP(amin, amax, bmin, bmax)) \
+      return 1; \
+  } \
+  else if (nextflag & MBR_DISJOINT) \
+  { \
+    if (DISJOINT_CMP(amin, amax, bmin, bmax)) \
+      return 1; \
+  }\
+  else /* if unknown comparison operator */ \
+  { \
+    DBUG_ASSERT(0); \
+  }
+
+#define RT_CMP_KORR(type, korr_func, len, nextflag) \
+{ \
+  type amin, amax, bmin, bmax; \
+  amin= korr_func(a); \
+  bmin= korr_func(b); \
+  amax= korr_func(a+len); \
+  bmax= korr_func(b+len); \
+  RT_CMP(nextflag); \
+}
+
+#define RT_CMP_GET(type, get_func, len, nextflag) \
+{ \
+  type amin, amax, bmin, bmax; \
+  get_func(amin, a); \
+  get_func(bmin, b); \
+  get_func(amax, a+len); \
+  get_func(bmax, b+len); \
+  RT_CMP(nextflag); \
+}
+
+/*
+  Compares two keys a and b depending on nextflag
+  nextflag can contain these flags:
+     MBR_INTERSECT(a,b)  a overlaps b
+     MBR_CONTAIN(a,b)    a contains b
+     MBR_DISJOINT(a,b)   a disjoint b
+     MBR_WITHIN(a,b)     a within   b
+     MBR_EQUAL(a,b)      All coordinates of MBRs are equal
+     MBR_DATA(a,b)       Data reference is the same
+  Returns 0 on success.
+*/
+
+int maria_rtree_key_cmp(HA_KEYSEG *keyseg, const uchar *b, const uchar *a,
+                        uint key_length, uint32 nextflag)
+{
+  /* keyseg += 2: each dimension is a (min,max) pair of key segments */
+  for (; (int) key_length > 0; keyseg += 2 )
+  {
+    uint32 keyseg_length;
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_CMP_KORR(int8, mi_sint1korr, 1, nextflag);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_CMP_KORR(uint8, mi_uint1korr, 1, nextflag);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_CMP_KORR(int16, mi_sint2korr, 2, nextflag);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_CMP_KORR(uint16, mi_uint2korr, 2, nextflag);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_CMP_KORR(int32, mi_sint3korr, 3, nextflag);
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_CMP_KORR(uint32, mi_uint3korr, 3, nextflag);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_CMP_KORR(int32, mi_sint4korr, 4, nextflag);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_CMP_KORR(uint32, mi_uint4korr, 4, nextflag);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_CMP_KORR(longlong, mi_sint8korr, 8, nextflag)
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      RT_CMP_KORR(ulonglong, mi_uint8korr, 8, nextflag)
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      /* The following should be safe, even if we compare doubles */
+      RT_CMP_GET(float, mi_float4get, 4, nextflag);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_CMP_GET(double, mi_float8get, 8, nextflag);
+      break;
+    case HA_KEYTYPE_END:
+      goto end;
+    default:
+      return 1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+    b+= keyseg_length;
+  }
+
+end:
+  /*
+    MBR parts matched; optionally also compare the trailing data bytes.
+    NOTE(review): the length is read from the keyseg reached after the
+    loop (normally the HA_KEYTYPE_END segment) -- confirm for the case
+    where the loop ends by key_length exhaustion instead.
+  */
+  if (nextflag & MBR_DATA)
+  {
+    const uchar *end= a + keyseg->length;
+    do
+    {
+      if (*a++ != *b++)
+        return FCMP(a[-1], b[-1]);
+    } while (a != end);
+  }
+  return 0;
+}
+
+/* Multiply the running volume 'res' by this dimension's extent */
+#define RT_VOL_KORR(type, korr_func, len, cast) \
+{ \
+  type amin, amax; \
+  amin= korr_func(a); \
+  amax= korr_func(a+len); \
+  res *= (cast(amax) - cast(amin)); \
+}
+
+#define RT_VOL_GET(type, get_func, len, cast) \
+{ \
+  type amin, amax; \
+  get_func(amin, a); \
+  get_func(amax, a+len); \
+  res *= (cast(amax) - cast(amin)); \
+}
+
+/*
+  Calculates rectangle volume
+*/
+/* Returns the product of per-dimension extents, or -1 on unknown keytype */
+double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar *a, uint key_length)
+{
+  double res= 1;
+  /* keyseg += 2: each dimension is a (min,max) pair of key segments */
+  for (; (int)key_length > 0; keyseg += 2)
+  {
+    uint32 keyseg_length;
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_VOL_KORR(int8, mi_sint1korr, 1, (double));
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_VOL_KORR(uint8, mi_uint1korr, 1, (double));
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_VOL_KORR(int16, mi_sint2korr, 2, (double));
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_VOL_KORR(uint16, mi_uint2korr, 2, (double));
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_VOL_KORR(int32, mi_sint3korr, 3, (double));
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_VOL_KORR(uint32, mi_uint3korr, 3, (double));
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_VOL_KORR(int32, mi_sint4korr, 4, (double));
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_VOL_KORR(uint32, mi_uint4korr, 4, (double));
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_VOL_KORR(longlong, mi_sint8korr, 8, (double));
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      RT_VOL_KORR(longlong, mi_sint8korr, 8, ulonglong2double);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_VOL_GET(float, mi_float4get, 4, (double));
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_VOL_GET(double, mi_float8get, 8, (double));
+      break;
+    case HA_KEYTYPE_END:
+      key_length= 0;
+      break;
+    default:
+      return -1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+  }
+  return res;
+}
+
+/* Append this dimension's (min,max) to the output array of doubles */
+#define RT_D_MBR_KORR(type, korr_func, len, cast) \
+{ \
+  type amin, amax; \
+  amin= korr_func(a); \
+  amax= korr_func(a+len); \
+  *res++= cast(amin); \
+  *res++= cast(amax); \
+}
+
+#define RT_D_MBR_GET(type, get_func, len, cast) \
+{ \
+  type amin, amax; \
+  get_func(amin, a); \
+  get_func(amax, a+len); \
+  *res++= cast(amin); \
+  *res++= cast(amax); \
+}
+
+
+/*
+  Creates an MBR as an array of doubles.
+  Fills *res.
+*/
+/* Returns 0 on success, 1 on unknown keytype */
+int maria_rtree_d_mbr(const HA_KEYSEG *keyseg, const uchar *a,
+                      uint key_length, double *res)
+{
+  /* keyseg += 2: each dimension is a (min,max) pair of key segments */
+  for (; (int)key_length > 0; keyseg += 2)
+  {
+    uint32 keyseg_length;
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_D_MBR_KORR(int8, mi_sint1korr, 1, (double));
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_D_MBR_KORR(uint8, mi_uint1korr, 1, (double));
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_D_MBR_KORR(int16, mi_sint2korr, 2, (double));
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_D_MBR_KORR(uint16, mi_uint2korr, 2, (double));
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_D_MBR_KORR(int32, mi_sint3korr, 3, (double));
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_D_MBR_KORR(uint32, mi_uint3korr, 3, (double));
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_D_MBR_KORR(int32, mi_sint4korr, 4, (double));
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_D_MBR_KORR(uint32, mi_uint4korr, 4, (double));
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_D_MBR_KORR(longlong, mi_sint8korr, 8, (double));
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      RT_D_MBR_KORR(longlong, mi_sint8korr, 8, ulonglong2double);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_D_MBR_GET(float, mi_float4get, 4, (double));
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_D_MBR_GET(double, mi_float8get, 8, (double));
+      break;
+    case HA_KEYTYPE_END:
+      key_length= 0;
+      break;
+    default:
+      return 1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+  }
+  return 0;
+}
+
+/* Store into c the union (min of mins, max of maxes) of a and b */
+#define RT_COMB_KORR(type, korr_func, store_func, len) \
+{ \
+  type amin, amax, bmin, bmax; \
+  amin= korr_func(a); \
+  bmin= korr_func(b); \
+  amax= korr_func(a+len); \
+  bmax= korr_func(b+len); \
+  amin= min(amin, bmin); \
+  amax= max(amax, bmax); \
+  store_func(c, amin); \
+  store_func(c+len, amax); \
+}
+
+#define RT_COMB_GET(type, get_func, store_func, len) \
+{ \
+  type amin, amax, bmin, bmax; \
+  get_func(amin, a); \
+  get_func(bmin, b); \
+  get_func(amax, a+len); \
+  get_func(bmax, b+len); \
+  amin= min(amin, bmin); \
+  amax= max(amax, bmax); \
+  store_func(c, amin); \
+  store_func(c+len, amax); \
+}
+
+/*
+  Creates common minimal bounding rectungle
+  for two input rectagnles a and b
+  Result is written to c
+*/
+/* Returns 0 on success, 1 on unknown keytype */
+int maria_rtree_combine_rect(const HA_KEYSEG *keyseg, const uchar* a,
+                             const uchar* b, uchar* c,
+                             uint key_length)
+{
+  /* keyseg += 2: each dimension is a (min,max) pair of key segments */
+  for ( ; (int) key_length > 0 ; keyseg += 2)
+  {
+    uint32 keyseg_length;
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_COMB_KORR(int8, mi_sint1korr, mi_int1store, 1);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_COMB_KORR(uint8, mi_uint1korr, mi_int1store, 1);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_COMB_KORR(int16, mi_sint2korr, mi_int2store, 2);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_COMB_KORR(uint16, mi_uint2korr, mi_int2store, 2);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_COMB_KORR(int32, mi_sint3korr, mi_int3store, 3);
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_COMB_KORR(uint32, mi_uint3korr, mi_int3store, 3);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_COMB_KORR(int32, mi_sint4korr, mi_int4store, 4);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_COMB_KORR(uint32, mi_uint4korr, mi_int4store, 4);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_COMB_KORR(longlong, mi_sint8korr, mi_int8store, 8);
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      RT_COMB_KORR(ulonglong, mi_uint8korr, mi_int8store, 8);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_COMB_GET(float, mi_float4get, mi_float4store, 4);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_COMB_GET(double, mi_float8get, mi_float8store, 8);
+      break;
+    case HA_KEYTYPE_END:
+      return 0;
+    default:
+      return 1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+    b+= keyseg_length;
+    c+= keyseg_length;
+  }
+  return 0;
+}
+
+
+/*
+  Multiply 'res' by the overlap extent of this dimension; the early
+  "return 0" fires when a and b do not overlap in this dimension, i.e.
+  the rectangles are disjoint and the total overlap area is zero.
+*/
+#define RT_OVL_AREA_KORR(type, korr_func, len) \
+{ \
+  type amin, amax, bmin, bmax; \
+  amin= korr_func(a); \
+  bmin= korr_func(b); \
+  amax= korr_func(a+len); \
+  bmax= korr_func(b+len); \
+  amin= max(amin, bmin); \
+  amax= min(amax, bmax); \
+  if (amin >= amax) \
+    return 0; \
+  res *= amax - amin; \
+}
+
+#define RT_OVL_AREA_GET(type, get_func, len) \
+{ \
+  type amin, amax, bmin, bmax; \
+  get_func(amin, a); \
+  get_func(bmin, b); \
+  get_func(amax, a+len); \
+  get_func(bmax, b+len); \
+  amin= max(amin, bmin); \
+  amax= min(amax, bmax); \
+  if (amin >= amax) \
+    return 0; \
+  res *= amax - amin; \
+}
+
+/*
+Calculates overlapping area of two MBRs a & b
+*/
+/* Returns the overlap volume, 0 if disjoint, -1 on unknown keytype */
+double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar* a, uchar* b,
+                                    uint key_length)
+{
+  double res= 1;
+  /* keyseg += 2: each dimension is a (min,max) pair of key segments */
+  for (; (int) key_length > 0 ; keyseg += 2)
+  {
+    uint32 keyseg_length;
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_OVL_AREA_KORR(int8, mi_sint1korr, 1);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_OVL_AREA_KORR(uint8, mi_uint1korr, 1);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_OVL_AREA_KORR(int16, mi_sint2korr, 2);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_OVL_AREA_KORR(uint16, mi_uint2korr, 2);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_OVL_AREA_KORR(int32, mi_sint3korr, 3);
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_OVL_AREA_KORR(uint32, mi_uint3korr, 3);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_OVL_AREA_KORR(int32, mi_sint4korr, 4);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_OVL_AREA_KORR(uint32, mi_uint4korr, 4);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8);
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_OVL_AREA_GET(float, mi_float4get, 4);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_OVL_AREA_GET(double, mi_float8get, 8);
+      break;
+    case HA_KEYTYPE_END:
+      return res;
+    default:
+      return -1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+    b+= keyseg_length;
+  }
+  return res;
+}
+
+/* Accumulate volume of a (a_area) and of union(a,b) (loc_ab_area) */
+#define RT_AREA_INC_KORR(type, korr_func, len) \
+{ \
+   type amin, amax, bmin, bmax; \
+   amin= korr_func(a); \
+   bmin= korr_func(b); \
+   amax= korr_func(a+len); \
+   bmax= korr_func(b+len); \
+   a_area *= (((double)amax) - ((double)amin)); \
+   loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \
+}
+
+#define RT_AREA_INC_GET(type, get_func, len)\
+{\
+   type amin, amax, bmin, bmax; \
+   get_func(amin, a); \
+   get_func(bmin, b); \
+   get_func(amax, a+len); \
+   get_func(bmax, b+len); \
+   a_area *= (((double)amax) - ((double)amin)); \
+   loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \
+}
+
+/*
+  Calculates MBR_AREA(a+b) - MBR_AREA(a)
+  Fills *ab_area.
+  Note: when 'a' and 'b' objects are far from each other,
+  the area increase can be really big, so this function
+  can return 'inf' as a result.
+*/
+
+double maria_rtree_area_increase(const HA_KEYSEG *keyseg, const uchar *a,
+                                 const uchar *b,
+                                 uint key_length, double *ab_area)
+{
+  double a_area= 1.0;
+  double loc_ab_area= 1.0;
+
+  *ab_area= 1.0;
+  /* keyseg += 2: each dimension is a (min,max) pair of key segments */
+  for (; (int)key_length > 0; keyseg += 2)
+  {
+    uint32 keyseg_length;
+
+    if (keyseg->null_bit)                       /* Handle NULL part */
+      return -1;
+
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_AREA_INC_KORR(int8, mi_sint1korr, 1);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_AREA_INC_KORR(uint8, mi_uint1korr, 1);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_AREA_INC_KORR(int16, mi_sint2korr, 2);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_AREA_INC_KORR(uint16, mi_uint2korr, 2);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_AREA_INC_KORR(int32, mi_sint3korr, 3);
+      break;
+    case HA_KEYTYPE_UINT24:
+      /* NOTE(review): int32 with an unsigned korr -- confirm intended */
+      RT_AREA_INC_KORR(int32, mi_uint3korr, 3);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_AREA_INC_KORR(int32, mi_sint4korr, 4);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_AREA_INC_KORR(uint32, mi_uint4korr, 4);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_AREA_INC_KORR(longlong, mi_sint8korr, 8);
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      RT_AREA_INC_KORR(longlong, mi_sint8korr, 8);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_AREA_INC_GET(float, mi_float4get, 4);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_AREA_INC_GET(double, mi_float8get, 8);
+      break;
+    case HA_KEYTYPE_END:
+      goto safe_end;
+    default:
+      return -1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+    b+= keyseg_length;
+  }
+safe_end:
+  *ab_area= loc_ab_area;
+  return loc_ab_area - a_area;
+}
+
+/* Accumulate perimeter of a (a_perim) and of union(a,b) (*ab_perim) */
+#define RT_PERIM_INC_KORR(type, korr_func, len) \
+{ \
+   type amin, amax, bmin, bmax; \
+   amin= korr_func(a); \
+   bmin= korr_func(b); \
+   amax= korr_func(a+len); \
+   bmax= korr_func(b+len); \
+   a_perim+= (((double)amax) - ((double)amin)); \
+   *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \
+}
+
+#define RT_PERIM_INC_GET(type, get_func, len)\
+{\
+   type amin, amax, bmin, bmax; \
+   get_func(amin, a); \
+   get_func(bmin, b); \
+   get_func(amax, a+len); \
+   get_func(bmax, b+len); \
+   a_perim+= (((double)amax) - ((double)amin)); \
+   *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \
+}
+
+/*
+Calculates MBR_PERIMETER(a+b) - MBR_PERIMETER(a)
+*/
+/* Fills *ab_perim; returns the increase, or -1 on NULL part/unknown type */
+double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b,
+                                      uint key_length, double *ab_perim)
+{
+  double a_perim= 0.0;
+
+  *ab_perim= 0.0;
+  /* keyseg += 2: each dimension is a (min,max) pair of key segments */
+  for (; (int)key_length > 0; keyseg += 2)
+  {
+    uint32 keyseg_length;
+
+    if (keyseg->null_bit)                       /* Handle NULL part */
+      return -1;
+
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_PERIM_INC_KORR(int8, mi_sint1korr, 1);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_PERIM_INC_KORR(uint8, mi_uint1korr, 1);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_PERIM_INC_KORR(int16, mi_sint2korr, 2);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_PERIM_INC_KORR(uint16, mi_uint2korr, 2);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_PERIM_INC_KORR(int32, mi_sint3korr, 3);
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_PERIM_INC_KORR(int32, mi_uint3korr, 3);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_PERIM_INC_KORR(int32, mi_sint4korr, 4);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_PERIM_INC_KORR(uint32, mi_uint4korr, 4);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_PERIM_INC_KORR(longlong, mi_sint8korr, 8);
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      RT_PERIM_INC_KORR(longlong, mi_sint8korr, 8);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_PERIM_INC_GET(float, mi_float4get, 4);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_PERIM_INC_GET(double, mi_float8get, 8);
+      break;
+    case HA_KEYTYPE_END:
+      return *ab_perim - a_perim;
+    default:
+      return -1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+    b+= keyseg_length;
+  }
+  return *ab_perim - a_perim;
+}
+
+
+/*
+  Scan every key on the page and fold this dimension's (min,max) over
+  all of them, storing the result into 'to' and advancing 'inc' so the
+  next dimension reads the following coordinate pair.
+*/
+#define RT_PAGE_MBR_KORR(share, type, korr_func, store_func, len, to) \
+{ \
+  type amin, amax, bmin, bmax; \
+  amin= korr_func(k + inc); \
+  amax= korr_func(k + inc + len); \
+  k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag); \
+  for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag)) \
+{ \
+    bmin= korr_func(k + inc); \
+    bmax= korr_func(k + inc + len); \
+    if (amin > bmin) \
+      amin= bmin; \
+    if (amax < bmax) \
+      amax= bmax; \
+} \
+  store_func(to, amin); \
+  to+= len; \
+  store_func(to, amax); \
+  to += len; \
+  inc += 2 * len; \
+}
+
+#define RT_PAGE_MBR_GET(share, type, get_func, store_func, len, to) \
+{ \
+  type amin, amax, bmin, bmax; \
+  get_func(amin, k + inc); \
+  get_func(amax, k + inc + len); \
+  k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag); \
+  for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag)) \
+{ \
+    get_func(bmin, k + inc); \
+    get_func(bmax, k + inc + len); \
+    if (amin > bmin) \
+      amin= bmin; \
+    if (amax < bmax) \
+      amax= bmax; \
+} \
+  store_func(to, amin); \
+  to+= len; \
+  store_func(to, amax); \
+  to+= len; \
+  inc += 2 * len; \
+}
+
+/*
+  Calculates key page total MBR= MBR(key1) + MBR(key2) + ...
+  Stores into *to.
+*/
+/* Returns 0 on success, 1 on NULL part or unknown keytype */
+int maria_rtree_page_mbr(const HA_KEYSEG *keyseg,
+                         MARIA_PAGE *page,
+                         uchar *to, uint key_length)
+{
+  MARIA_HA *info= page->info;
+  MARIA_SHARE *share= info->s;
+  uint inc= 0;
+  uint k_len= key_length;
+  uint nod_flag= page->node;
+  const uchar *k;
+  const uchar *last= rt_PAGE_END(page);
+
+  /* keyseg += 2: each dimension is a (min,max) pair of key segments */
+  for (; (int)key_length > 0; keyseg += 2)
+  {
+    key_length -= keyseg->length * 2;
+
+    /* Handle NULL part */
+    if (keyseg->null_bit)
+    {
+      return 1;
+    }
+
+    /* Restart the key scan from the first key for each dimension */
+    k= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag);
+
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_PAGE_MBR_KORR(share, int8, mi_sint1korr, mi_int1store, 1, to);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_PAGE_MBR_KORR(share, uint8, mi_uint1korr, mi_int1store, 1, to);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_PAGE_MBR_KORR(share, int16, mi_sint2korr, mi_int2store, 2, to);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_PAGE_MBR_KORR(share, uint16, mi_uint2korr, mi_int2store, 2, to);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_PAGE_MBR_KORR(share, int32, mi_sint3korr, mi_int3store, 3, to);
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_PAGE_MBR_KORR(share, uint32, mi_uint3korr, mi_int3store, 3, to);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_PAGE_MBR_KORR(share, int32, mi_sint4korr, mi_int4store, 4, to);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_PAGE_MBR_KORR(share, uint32, mi_uint4korr, mi_int4store, 4, to);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_PAGE_MBR_KORR(share, longlong, mi_sint8korr, mi_int8store, 8, to);
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      RT_PAGE_MBR_KORR(share, ulonglong, mi_uint8korr, mi_int8store, 8, to);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_PAGE_MBR_GET(share, float, mi_float4get, mi_float4store, 4, to);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_PAGE_MBR_GET(share, double, mi_float8get, mi_float8store, 8, to);
+      break;
+    case HA_KEYTYPE_END:
+      return 0;
+    default:
+      return 1;
+    }
+  }
+  return 0;
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_mbr.h b/storage/maria/ma_rt_mbr.h
new file mode 100644
index 00000000000..8fcd3d37b99
--- /dev/null
+++ b/storage/maria/ma_rt_mbr.h
@@ -0,0 +1,40 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
#ifndef _rt_mbr_h
#define _rt_mbr_h

#ifdef HAVE_RTREE_KEYS

/*
  Prototypes for R-tree MBR (minimum bounding rectangle) computations,
  implemented in ma_rt_mbr.c and used by the R-tree index code.
*/

int maria_rtree_key_cmp(HA_KEYSEG *keyseg, const uchar *a, const uchar *b,
                        uint key_length, uint32 nextflag);
int maria_rtree_combine_rect(const HA_KEYSEG *keyseg,
                             const uchar *, const uchar *, uchar*,
                             uint key_length);
double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar*, uint key_length);
/* Unpack a packed MBR key into an array of doubles (used by page split) */
int maria_rtree_d_mbr(const HA_KEYSEG *keyseg, const uchar *a,
                      uint key_length, double *res);
double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar *a, uchar *b,
                                    uint key_length);
double maria_rtree_area_increase(const HA_KEYSEG *keyseg, const uchar *a,
                                 const uchar *b,
                                 uint key_length, double *ab_area);
double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b,
                                      uint key_length, double *ab_perim);
/* Compute the MBR covering all keys of an index page */
int maria_rtree_page_mbr(const HA_KEYSEG *keyseg, MARIA_PAGE *page,
                         uchar *key, uint key_length);
#endif /*HAVE_RTREE_KEYS*/
#endif /* _rt_mbr_h */
diff --git a/storage/maria/ma_rt_split.c b/storage/maria/ma_rt_split.c
new file mode 100644
index 00000000000..856edc60490
--- /dev/null
+++ b/storage/maria/ma_rt_split.c
@@ -0,0 +1,554 @@
+/* Copyright (C) 2006 MySQL AB & Alexey Botchkov & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+#include "ma_rt_key.h"
+#include "ma_rt_mbr.h"
+
/* Per-key working data used while splitting an R-tree page into two groups */
typedef struct
{
  double square;      /* volume (hyper-area) of this key's MBR */
  int n_node;         /* assigned group (1 or 2); 0 while still unassigned */
  const uchar *key;   /* the key image (inside the page buffer or *key) */
  double *coords;     /* unpacked MBR: n_dim (min,max) double pairs */
} SplitStruct;
+
/*
  Carve the next slot of n_dim*2 doubles (one unpacked MBR) out of a
  preallocated buffer; advances *d_buffer past the slot and returns its start.
*/
inline static double *reserve_coords(double **d_buffer, int n_dim)
{
  double *slot= *d_buffer;
  *d_buffer+= 2 * n_dim;
  return slot;
}
+
/*
  Grow MBR 'a' in place so that it also encloses MBR 'b'.
  Both arrays hold n_dim (min,max) double pairs.
*/
static void mbr_join(double *a, const double *b, int n_dim)
{
  int dim;

  for (dim= 0; dim < n_dim; dim++, a+= 2, b+= 2)
  {
    if (b[0] < a[0])
      a[0]= b[0];
    if (b[1] > a[1])
      a[1]= b[1];
  }
}
+
/*
  Volume (hyper-area) of the MBR that would result from joining MBRs
  a and b; both hold n_dim (min,max) double pairs.
*/
static double mbr_join_square(const double *a, const double *b, int n_dim)
{
  double volume= 1.0;
  int dim;

  for (dim= 0; dim < n_dim; dim++, a+= 2, b+= 2)
  {
    double low=  (a[0] > b[0]) ? b[0] : a[0];
    double high= (a[1] < b[1]) ? b[1] : a[1];
    volume*= high - low;
  }
  return volume;
}
+
/* Volume (hyper-area) of a single MBR given as n_dim (min,max) pairs */
static double count_square(const double *a, int n_dim)
{
  double volume= 1.0;
  int dim;

  for (dim= 0; dim < n_dim; dim++, a+= 2)
    volume*= a[1] - a[0];
  return volume;
}
+
/* Copy one unpacked MBR (n_dim (min,max) double pairs) from src to dst */
inline static void copy_coords(double *dst, const double *src, int n_dim)
{
  int i;

  for (i= 0; i < n_dim * 2; i++)
    dst[i]= src[i];
}
+
/**
  Select two nodes to collect group upon.

  Scans every pair of entries and picks the pair maximizing
  area(join(a,b)) - area(a) - area(b), i.e. the pair whose combined MBR
  wastes the most space; those two become the seeds of the two groups.

  Note that such function uses 'double' arithmetic so may behave differently
  on different platforms/builds. There are others in this file.
*/
static void pick_seeds(SplitStruct *node, int n_entries,
                       SplitStruct **seed_a, SplitStruct **seed_b, int n_dim)
{
  SplitStruct *cur1;
  SplitStruct *lim1= node + (n_entries - 1);
  SplitStruct *cur2;
  SplitStruct *lim2= node + n_entries;

  double max_d= -DBL_MAX;
  double d;

  for (cur1= node; cur1 < lim1; cur1++)
  {
    for (cur2=cur1 + 1; cur2 < lim2; cur2++)
    {
      /* dead space if cur1 and cur2 ended up in the same group */
      d= mbr_join_square(cur1->coords, cur2->coords, n_dim) - cur1->square -
         cur2->square;
      if (d > max_d)
      {
        max_d= d;
        *seed_a= cur1;
        *seed_b= cur2;
      }
    }
  }
}
+
/*
  Select next node and group where to add.

  Among entries not yet assigned to a group (n_node == 0), picks the one
  with the largest |growth(g1) - growth(g2)| and assigns it to the group
  whose MBR grows less: *n_group is set to 1 or 2, *choice to the entry.
*/
static void pick_next(SplitStruct *node, int n_entries, double *g1, double *g2,
                      SplitStruct **choice, int *n_group, int n_dim)
{
  SplitStruct *cur= node;
  SplitStruct *end= node + n_entries;

  double max_diff= -DBL_MAX;

  for (; cur < end; cur++)
  {
    double diff;
    double abs_diff;

    if (cur->n_node)
    {
      continue;                                 /* already grouped */
    }

    /* positive diff: g1 would grow more than g2, so prefer group 2 */
    diff= mbr_join_square(g1, cur->coords, n_dim) -
          mbr_join_square(g2, cur->coords, n_dim);

    abs_diff= fabs(diff);
    if (abs_diff > max_diff)
    {
      max_diff= abs_diff;
      *n_group= 1 + (diff > 0);
      *choice= cur;
    }
  }
}
+
+/*
+Mark not-in-group entries as n_group
+*/
+static void mark_all_entries(SplitStruct *node, int n_entries, int n_group)
+{
+ SplitStruct *cur= node;
+ SplitStruct *end= node + n_entries;
+
+ for (; cur < end; cur++)
+ {
+ if (cur->n_node)
+ {
+ continue;
+ }
+ cur->n_node= n_group;
+ }
+}
+
/*
  Partition n_entries keys (the page's keys plus the new one) into two
  groups for a page split.

  Two seed entries are chosen with pick_seeds(), then the remaining
  entries are assigned one at a time with pick_next(), joining each into
  the growing MBR of its group.  If adding one more key to a group would
  leave the other group below min_size bytes, all remaining entries are
  dumped into the other group.  On return every entry's n_node is 1 or 2.

  RETURN
    0  ok
    1  error (total size too small to form two groups of min_size)
*/
static int split_maria_rtree_node(SplitStruct *node, int n_entries,
                                  int all_size, /* Total key's size */
                                  int key_size,
                                  int min_size, /* Minimal group size */
                                  int size1, int size2 /* initial group sizes */,
                                  double **d_buffer, int n_dim)
{
  SplitStruct *cur;
  SplitStruct *a;
  SplitStruct *b;
  double *g1= reserve_coords(d_buffer, n_dim);  /* group 1 running MBR */
  double *g2= reserve_coords(d_buffer, n_dim);  /* group 2 running MBR */
  SplitStruct *next;
  int next_node;
  int i;
  SplitStruct *end= node + n_entries;
  LINT_INIT(a);
  LINT_INIT(b);
  LINT_INIT(next);
  LINT_INIT(next_node);

  if (all_size < min_size * 2)
  {
    return 1;
  }

  cur= node;
  for (; cur < end; cur++)
  {
    cur->square= count_square(cur->coords, n_dim);
    cur->n_node= 0;                             /* unassigned */
  }

  pick_seeds(node, n_entries, &a, &b, n_dim);
  a->n_node= 1;
  b->n_node= 2;

  /* Start each group's MBR from its seed */
  copy_coords(g1, a->coords, n_dim);
  size1+= key_size;
  copy_coords(g2, b->coords, n_dim);
  size2+= key_size;

  /* n_entries - 2 entries remain after the two seeds */
  for (i=n_entries - 2; i>0; --i)
  {
    if (all_size - (size2 + key_size) < min_size) /* Can't write into group 2 */
    {
      mark_all_entries(node, n_entries, 1);
      break;
    }

    if (all_size - (size1 + key_size) < min_size) /* Can't write into group 1 */
    {
      mark_all_entries(node, n_entries, 2);
      break;
    }

    pick_next(node, n_entries, g1, g2, &next, &next_node, n_dim);
    if (next_node == 1)
    {
      size1+= key_size;
      mbr_join(g1, next->coords, n_dim);
    }
    else
    {
      size2+= key_size;
      mbr_join(g2, next->coords, n_dim);
    }
    next->n_node= next_node;
  }

  return 0;
}
+
+
+/**
+ Logs key reorganization done in a split page (new page is logged elsewhere).
+
+ The effect of a split on the split page is three changes:
+ - some piece of the page move to different places inside this page (we are
+ not interested here in the pieces which move to the new page)
+ - the key is inserted into the page or not (could be in the new page)
+ - page is shrunk
+ All this is uniquely determined by a few parameters:
+ - the key (starting at 'key-nod_flag', for 'full_length' bytes
+ (maria_rtree_split_page() seems to depend on its parameters key&key_length
+ but in fact it reads more (to the left: nod_flag, and to the right:
+ full_length)
+ - the binary content of the page
+ - some variables in the share
+ - double arithmetic, which is unpredictable from machine to machine and
+ from build to build (see pick_seeds() above: it has a comparison between
+ double-s 'if (d > max_d)' so the comparison can go differently from machine
+ to machine or build to build, it has happened in real life).
+ If one day we use precision-math instead of double-math, in GIS, then the
+ last parameter would become constant across machines and builds and we
+ could do some cheap logging: just log the few parameters above.
+ Until then, we log the list of memcpy() operations (fortunately, we often do
+ not have to log the source bytes, as they can be found in the page before
+ applying the REDO; the only source bytes to log are the key), the key if it
+ was inserted into this page, and the shrinking.
+
+ @param info table
+ @param page page's offset in the file
+ @param buff content of the page (post-split)
+ @param key_with_nod_flag pointer to key-nod_flag
+ @param full_length length of (key + (nod_flag (if node) or rowid (if
+ leaf)))
+ @param log_internal_copy encoded list of memcpy() operations done on
+ split page, having their source in the page
+ @param log_internal_copy_length length of above list, in bytes
+ @param log_key_copy operation describing the key's copy, or NULL if the
+ inserted key was not put into the page (was put in
+ new page, so does not have to be logged here)
+ @param length_diff by how much the page has shrunk during split
+*/
+
static my_bool _ma_log_rt_split(MARIA_PAGE *page,
                                const uchar *key_with_nod_flag,
                                uint full_length,
                                const uchar *log_internal_copy,
                                uint log_internal_copy_length,
                                const uchar *log_key_copy,
                                uint length_diff)
{
  MARIA_HA *info= page->info;
  MARIA_SHARE *share= info->s;
  LSN lsn;
  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 1 + 2 + 1 + 2 + 2 + 7],
        *log_pos;
  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6];
  uint translog_parts, extra_length= 0;
  my_off_t page_pos;
  DBUG_ENTER("_ma_log_rt_split");
  DBUG_PRINT("enter", ("page: %lu", (ulong) page));

  DBUG_ASSERT(share->now_transactional);
  /* REDO records address pages by page number, not byte offset */
  page_pos= page->pos / share->block_size;
  page_store(log_data + FILEID_STORE_SIZE, page_pos);
  log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE;
  /* Operation 1: shrink the page by length_diff bytes */
  log_pos[0]= KEY_OP_DEL_SUFFIX;
  log_pos++;
  DBUG_ASSERT((int)length_diff > 0);
  int2store(log_pos, length_diff);
  log_pos+= 2;
  /* Operation 2: replay the list of intra-page memcpy()s of the split */
  log_pos[0]= KEY_OP_MULTI_COPY;
  log_pos++;
  int2store(log_pos, full_length);
  log_pos+= 2;
  int2store(log_pos, log_internal_copy_length);
  log_pos+= 2;
  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data) - 7;
  log_array[TRANSLOG_INTERNAL_PARTS + 1].str= log_internal_copy;
  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= log_internal_copy_length;
  translog_parts= 2;
  if (log_key_copy != NULL) /* need to store key into record */
  {
    log_array[TRANSLOG_INTERNAL_PARTS + 2].str= log_key_copy;
    log_array[TRANSLOG_INTERNAL_PARTS + 2].length= 1 + 2 + 1 + 2;
    log_array[TRANSLOG_INTERNAL_PARTS + 3].str= key_with_nod_flag;
    log_array[TRANSLOG_INTERNAL_PARTS + 3].length= full_length;
    extra_length= 1 + 2 + 1 + 2 + full_length;
    translog_parts+= 2;
  }

  _ma_log_key_changes(page,
                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
                      log_pos, &extra_length, &translog_parts);
  /* Remember new page length for future log entries for same page */
  page->org_size= page->size;

  if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
                            info->trn, info,
                            (translog_size_t) ((log_pos - log_data) +
                                               log_internal_copy_length +
                                               extra_length),
                            TRANSLOG_INTERNAL_PARTS + translog_parts,
                            log_array, log_data, NULL))
    DBUG_RETURN(1);
  DBUG_RETURN(0);
}
+
+/**
+ 0 ok; the created page is put into page cache; the shortened one is not (up
+ to the caller to do it)
+ 1 or -1: error.
+ If new_page_offs==NULL, won't create new page (for redo phase).
+*/
+
+int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page,
+ my_off_t *new_page_offs)
+{
+ MARIA_HA *info= page->info;
+ MARIA_SHARE *share= info->s;
+ const my_bool transactional= share->now_transactional;
+ int n1, n2; /* Number of items in groups */
+ SplitStruct *task;
+ SplitStruct *cur;
+ SplitStruct *stop;
+ double *coord_buf;
+ double *next_coord;
+ double *old_coord;
+ int n_dim;
+ uchar *source_cur, *cur1, *cur2;
+ uchar *new_page_buff, *log_internal_copy, *log_internal_copy_ptr,
+ *log_key_copy= NULL;
+ int err_code= 0;
+ uint new_page_length;
+ uint nod_flag= page->node;
+ uint org_length= page->size;
+ uint full_length= key->data_length + (nod_flag ? nod_flag :
+ key->ref_length);
+ uint key_data_length= key->data_length;
+ int max_keys= ((org_length - share->keypage_header) / (full_length));
+ MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ DBUG_ENTER("maria_rtree_split_page");
+ DBUG_PRINT("rtree", ("splitting block"));
+
+ n_dim= keyinfo->keysegs / 2;
+
+ if (!(coord_buf= (double*) my_alloca(n_dim * 2 * sizeof(double) *
+ (max_keys + 1 + 4) +
+ sizeof(SplitStruct) * (max_keys + 1))))
+ DBUG_RETURN(-1); /* purecov: inspected */
+
+ task= (SplitStruct *)(coord_buf + n_dim * 2 * (max_keys + 1 + 4));
+
+ next_coord= coord_buf;
+
+ stop= task + max_keys;
+ source_cur= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag);
+
+ for (cur= task;
+ cur < stop;
+ cur++, source_cur= rt_PAGE_NEXT_KEY(share, source_cur, key_data_length,
+ nod_flag))
+ {
+ cur->coords= reserve_coords(&next_coord, n_dim);
+ cur->key= source_cur;
+ maria_rtree_d_mbr(keyinfo->seg, source_cur, key_data_length, cur->coords);
+ }
+
+ cur->coords= reserve_coords(&next_coord, n_dim);
+ maria_rtree_d_mbr(keyinfo->seg, key->data, key_data_length, cur->coords);
+ cur->key= key->data;
+
+ old_coord= next_coord;
+
+ if (split_maria_rtree_node(task, max_keys + 1,
+ page->size + full_length + 2,
+ full_length,
+ rt_PAGE_MIN_SIZE(keyinfo->block_length),
+ 2, 2, &next_coord, n_dim))
+ {
+ err_code= 1;
+ goto split_err;
+ }
+
+ /* Allocate buffer for new page and piece of log record */
+ if (!(new_page_buff= (uchar*) my_alloca((uint)keyinfo->block_length +
+ (transactional ?
+ (max_keys * (2 + 2) +
+ 1 + 2 + 1 + 2) : 0))))
+ {
+ err_code= -1;
+ goto split_err;
+ }
+ log_internal_copy= log_internal_copy_ptr= new_page_buff +
+ keyinfo->block_length;
+ bzero(new_page_buff, share->block_size);
+
+ stop= task + (max_keys + 1);
+ cur1= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag);
+ cur2= rt_PAGE_FIRST_KEY(share, new_page_buff, nod_flag);
+
+ n1= n2= 0;
+ for (cur= task; cur < stop; cur++)
+ {
+ uchar *to;
+ const uchar *cur_key= cur->key;
+ my_bool log_this_change;
+ DBUG_ASSERT(log_key_copy == NULL);
+ if (cur->n_node == 1)
+ {
+ to= cur1;
+ cur1= rt_PAGE_NEXT_KEY(share, cur1, key_data_length, nod_flag);
+ n1++;
+ log_this_change= transactional;
+ }
+ else
+ {
+ to= cur2;
+ cur2= rt_PAGE_NEXT_KEY(share, cur2, key_data_length, nod_flag);
+ n2++;
+ log_this_change= FALSE;
+ }
+ if (to != cur_key)
+ {
+ uchar *to_with_nod_flag= to - nod_flag;
+ const uchar *cur_key_with_nod_flag= cur_key - nod_flag;
+ memcpy(to_with_nod_flag, cur_key_with_nod_flag, full_length);
+ if (log_this_change)
+ {
+ uint to_with_nod_flag_offs= to_with_nod_flag - page->buff;
+ if (likely(cur_key != key->data))
+ {
+ /* this memcpy() is internal to the page (source in the page) */
+ uint cur_key_with_nod_flag_offs= cur_key_with_nod_flag - page->buff;
+ int2store(log_internal_copy_ptr, to_with_nod_flag_offs);
+ log_internal_copy_ptr+= 2;
+ int2store(log_internal_copy_ptr, cur_key_with_nod_flag_offs);
+ log_internal_copy_ptr+= 2;
+ }
+ else
+ {
+ /* last iteration, and this involves *key: source is external */
+ log_key_copy= log_internal_copy_ptr;
+ log_key_copy[0]= KEY_OP_OFFSET;
+ int2store(log_key_copy + 1, to_with_nod_flag_offs);
+ log_key_copy[3]= KEY_OP_CHANGE;
+ int2store(log_key_copy + 4, full_length);
+ /* _ma_log_rt_split() will store *key, right after */
+ }
+ }
+ }
+ }
+ { /* verify that above loop didn't touch header bytes */
+ uint i;
+ for (i= 0; i < share->keypage_header; i++)
+ DBUG_ASSERT(new_page_buff[i]==0);
+ }
+
+ if (nod_flag)
+ _ma_store_keypage_flag(share, new_page_buff, KEYPAGE_FLAG_ISNOD);
+ _ma_store_keynr(share, new_page_buff, keyinfo->key_nr);
+ new_page_length= share->keypage_header + n2 * full_length;
+ _ma_store_page_used(share, new_page_buff, new_page_length);
+ page->size= share->keypage_header + n1 * full_length;
+ page_store_size(share, page);
+
+ if ((*new_page_offs= _ma_new(info, DFLT_INIT_HITS, &page_link)) ==
+ HA_OFFSET_ERROR)
+ err_code= -1;
+ else
+ {
+ MARIA_PAGE new_page;
+ _ma_page_setup(&new_page, info, keyinfo, *new_page_offs, new_page_buff);
+
+ if (transactional &&
+ ( /* log change to split page */
+ _ma_log_rt_split(page, key->data - nod_flag,
+ full_length, log_internal_copy,
+ log_internal_copy_ptr - log_internal_copy,
+ log_key_copy, org_length - page->size) ||
+ /* and to new page */
+ _ma_log_new(&new_page, 0)))
+ err_code= -1;
+
+ if (_ma_write_keypage(&new_page, page_link->write_lock,
+ DFLT_INIT_HITS))
+ err_code= -1;
+ }
+ DBUG_PRINT("rtree", ("split new block: %lu", (ulong) *new_page_offs));
+
+ my_afree(new_page);
+
+split_err:
+ my_afree(coord_buf);
+ DBUG_RETURN(err_code);
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_test.c b/storage/maria/ma_rt_test.c
new file mode 100644
index 00000000000..af54e6b27be
--- /dev/null
+++ b/storage/maria/ma_rt_test.c
@@ -0,0 +1,692 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Testing of the basic functions of a MARIA rtree table */
+/* Written by Alex Barkov who has a shared copyright to this code */
+
+
+#include "maria_def.h"
+#include "ma_control_file.h"
+#include "ma_loghandler.h"
+#include "ma_checkpoint.h"
+#include "trnman.h"
+#include <my_getopt.h>
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+
+#define MAX_REC_LENGTH 1024
+#define ndims 2
+#define KEYALG HA_KEY_ALG_RTREE
+
+static int read_with_pos(MARIA_HA * file);
+static void create_record(uchar *record,uint rownr);
+static void create_record1(uchar *record,uint rownr);
+static void print_record(uchar * record,my_off_t offs,const char * tail);
+static int run_test(const char *filename);
+static void get_options(int argc, char *argv[]);
+static void usage();
+
/*
  Test MBRs: each row holds xmin,xmax,ymin,ymax for one record.
  40 rows in two clusters (x in [0,20] and x in [100,120]),
  terminated by -1.
*/
static double rt_data[]=
{
  /*1*/ 0,10,0,10,
  /*2*/ 5,15,0,10,
  /*3*/ 0,10,5,15,
  /*4*/ 10,20,10,20,
  /*5*/ 0,10,0,10,
  /*6*/ 5,15,0,10,
  /*7*/ 0,10,5,15,
  /*8*/ 10,20,10,20,
  /*9*/ 0,10,0,10,
  /*10*/ 5,15,0,10,
  /*11*/ 0,10,5,15,
  /*12*/ 10,20,10,20,
  /*13*/ 0,10,0,10,
  /*14*/ 5,15,0,10,
  /*15*/ 0,10,5,15,
  /*16*/ 10,20,10,20,
  /*17*/ 5,15,0,10,
  /*18*/ 0,10,5,15,
  /*19*/ 10,20,10,20,
  /*20*/ 0,10,0,10,

  /*1*/ 100,110,0,10,
  /*2*/ 105,115,0,10,
  /*3*/ 100,110,5,15,
  /*4*/ 110,120,10,20,
  /*5*/ 100,110,0,10,
  /*6*/ 105,115,0,10,
  /*7*/ 100,110,5,15,
  /*8*/ 110,120,10,20,
  /*9*/ 100,110,0,10,
  /*10*/ 105,115,0,10,
  /*11*/ 100,110,5,15,
  /*12*/ 110,120,10,20,
  /*13*/ 100,110,0,10,
  /*14*/ 105,115,0,10,
  /*15*/ 100,110,5,15,
  /*16*/ 110,120,10,20,
  /*17*/ 105,115,0,10,
  /*18*/ 100,110,5,15,
  /*19*/ 110,120,10,20,
  /*20*/ 100,110,0,10,
  -1
};
+
/* Test state set from the command line (see my_long_options below) */
static int testflag, checkpoint, create_flag;
static my_bool silent, transactional, die_in_middle_of_transaction,
               opt_versioning;
/* Row format under test; -M selects BLOCK_RECORD, -S STATIC_RECORD */
static enum data_file_type record_type= DYNAMIC_RECORD;
+
/*
  Entry point: initialize the Maria subsystems (page caches, control file,
  transaction log and - when --transactional - trnman and the checkpoint
  module), then run the R-tree test.
*/
int main(int argc, char *argv[])
{
  MY_INIT(argv[0]);
  get_options(argc, argv);
  maria_data_root= (char *)".";
  /* Maria requires that we always have a page cache */
  if (maria_init() ||
      (init_pagecache(maria_pagecache, maria_block_size * 16, 0, 0,
                      maria_block_size, MY_WME) == 0) ||
      ma_control_file_open(TRUE, TRUE) ||
      (init_pagecache(maria_log_pagecache,
                      TRANSLOG_PAGECACHE_SIZE, 0, 0,
                      TRANSLOG_PAGE_SIZE, MY_WME) == 0) ||
      translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
                    0, 0, maria_log_pagecache,
                    TRANSLOG_DEFAULT_FLAGS, 0) ||
      (transactional && (trnman_init(0) || ma_checkpoint_init(0))))
  {
    fprintf(stderr, "Error in initialization\n");
    exit(1);
  }

  exit(run_test("rt_test"));
}
+
+
+static int run_test(const char *filename)
+{
+ MARIA_HA *file;
+ MARIA_UNIQUEDEF uniquedef;
+ MARIA_CREATE_INFO create_info;
+ MARIA_COLUMNDEF recinfo[20];
+ MARIA_KEYDEF keyinfo[20];
+ HA_KEYSEG keyseg[20];
+ key_range range;
+
+ int opt_unique=0;
+ int key_type=HA_KEYTYPE_DOUBLE;
+ int key_length=8;
+ int null_fields=0;
+ int nrecords=sizeof(rt_data)/(sizeof(double)*4);/* 40 */
+ int rec_length=0;
+ int uniques=0;
+ int i, max_i;
+ int error;
+ int row_count=0;
+ uchar record[MAX_REC_LENGTH];
+ uchar read_record[MAX_REC_LENGTH];
+ int upd= 10;
+ ha_rows hrows;
+
+ bzero(&uniquedef, sizeof(uniquedef));
+ bzero(&create_info, sizeof(create_info));
+ bzero(recinfo, sizeof(recinfo));
+ bzero(keyinfo, sizeof(keyinfo));
+ bzero(keyseg, sizeof(keyseg));
+
+ /* Define a column for NULLs and DEL markers*/
+
+ recinfo[0].type=FIELD_NORMAL;
+ recinfo[0].length=1; /* For NULL bits */
+ rec_length=1;
+
+ /* Define 2*ndims columns for coordinates*/
+
+ for (i=1; i<=2*ndims ;i++)
+ {
+ recinfo[i].type=FIELD_NORMAL;
+ recinfo[i].length=key_length;
+ rec_length+=key_length;
+ }
+
+ /* Define a key with 2*ndims segments */
+
+ keyinfo[0].seg=keyseg;
+ keyinfo[0].keysegs=2*ndims;
+ keyinfo[0].flag=0;
+ keyinfo[0].key_alg=KEYALG;
+
+ for (i=0; i<2*ndims; i++)
+ {
+ keyinfo[0].seg[i].type= key_type;
+ keyinfo[0].seg[i].flag=0; /* Things like HA_REVERSE_SORT */
+ keyinfo[0].seg[i].start= (key_length*i)+1;
+ keyinfo[0].seg[i].length=key_length;
+ keyinfo[0].seg[i].null_bit= null_fields ? 2 : 0;
+ keyinfo[0].seg[i].null_pos=0;
+ keyinfo[0].seg[i].language=default_charset_info->number;
+ }
+
+ if (!silent)
+ printf("- Creating isam-file\n");
+
+ create_info.max_rows=10000000;
+ create_info.transactional= transactional;
+
+ if (maria_create(filename,
+ record_type,
+ 1, /* keys */
+ keyinfo,
+ 1+2*ndims+opt_unique, /* columns */
+ recinfo,uniques,&uniquedef,&create_info,create_flag))
+ goto err;
+
+ if (!silent)
+ printf("- Open isam-file\n");
+
+ if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+ goto err;
+ maria_begin(file);
+ if (opt_versioning)
+ maria_versioning(file, 1);
+ if (testflag == 1)
+ goto end;
+ if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+ goto err;
+ if (!silent)
+ printf("- Writing key:s\n");
+
+ for (i=0; i<nrecords; i++ )
+ {
+ create_record(record,i);
+ error=maria_write(file,record);
+ print_record(record,maria_position(file),"\n");
+ if (!error)
+ {
+ row_count++;
+ }
+ else
+ {
+ fprintf(stderr, "maria_write: %d\n", error);
+ goto err;
+ }
+ }
+
+ if (maria_scan_init(file))
+ {
+ fprintf(stderr, "maria_scan_init failed\n");
+ goto err;
+ }
+ if ((error=read_with_pos(file)))
+ goto err;
+ maria_scan_end(file);
+
+ if (!silent)
+ printf("- Reading rows with key\n");
+
+ for (i=0 ; i < nrecords ; i++)
+ {
+ my_errno=0;
+ create_record(record,i);
+
+ bzero((char*) read_record,MAX_REC_LENGTH);
+ error=maria_rkey(file,read_record,0,record+1,HA_WHOLE_KEY,HA_READ_MBR_EQUAL);
+
+ if (error && error!=HA_ERR_KEY_NOT_FOUND)
+ {
+ fprintf(stderr," maria_rkey: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ if (error == HA_ERR_KEY_NOT_FOUND)
+ {
+ print_record(record,maria_position(file)," NOT FOUND\n");
+ continue;
+ }
+ print_record(read_record,maria_position(file),"\n");
+ }
+
+ if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+ goto err;
+
+ if (testflag == 2)
+ goto end;
+
+ if (!silent)
+ printf("- Deleting rows\n");
+ if (maria_scan_init(file))
+ {
+ fprintf(stderr, "maria_scan_init failed\n");
+ goto err;
+ }
+
+ for (i=0; i < nrecords/4; i++)
+ {
+ my_errno=0;
+ bzero((char*) read_record,MAX_REC_LENGTH);
+ error=maria_scan(file,read_record);
+ if (error)
+ {
+ fprintf(stderr, "pos: %2d maria_rrnd: %3d errno: %3d\n", i, error,
+ my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file),"\n");
+
+ error=maria_delete(file,read_record);
+ if (error)
+ {
+ fprintf(stderr, "pos: %2d maria_delete: %3d errno: %3d\n", i, error,
+ my_errno);
+ goto err;
+ }
+ }
+ maria_scan_end(file);
+
+ if (testflag == 3)
+ goto end;
+ if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+ goto err;
+
+ if (!silent)
+ printf("- Updating rows with position\n");
+ if (maria_scan_init(file))
+ {
+ fprintf(stderr, "maria_scan_init failed\n");
+ goto err;
+ }
+
+ /* We are looking for nrecords-necords/2 non-deleted records */
+ for (i=0, max_i= nrecords - nrecords/2; i < max_i ; i++)
+ {
+ my_errno=0;
+ bzero((char*) read_record,MAX_REC_LENGTH);
+ error=maria_scan(file,read_record);
+ if (error)
+ {
+ if (error==HA_ERR_RECORD_DELETED)
+ {
+ if (!silent)
+ printf("found deleted record\n");
+ /*
+ In BLOCK_RECORD format, maria_scan() never returns deleted records,
+ while in DYNAMIC format it can. Don't count such record:
+ */
+ max_i++;
+ continue;
+ }
+ fprintf(stderr, "pos: %2d maria_rrnd: %3d errno: %3d\n",i , error,
+ my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file),"");
+ create_record1(record,i+nrecords*upd);
+ if (!silent)
+ printf("\t-> ");
+ print_record(record,maria_position(file),"\n");
+ error=maria_update(file,read_record,record);
+ if (error)
+ {
+ fprintf(stderr, "pos: %2d maria_update: %3d errno: %3d\n",i, error,
+ my_errno);
+ goto err;
+ }
+ }
+
+ if (testflag == 4)
+ goto end;
+ if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+ goto err;
+
+ if (maria_scan_init(file))
+ {
+ fprintf(stderr, "maria_scan_init failed\n");
+ goto err;
+ }
+ if ((error=read_with_pos(file)))
+ goto err;
+ maria_scan_end(file);
+
+ if (!silent)
+ printf("- Test maria_rkey then a sequence of maria_rnext_same\n");
+
+ create_record(record, nrecords*4/5);
+ print_record(record,0," search for\n");
+
+ if ((error=maria_rkey(file,read_record,0,record+1,HA_WHOLE_KEY,
+ HA_READ_MBR_INTERSECT)))
+ {
+ fprintf(stderr, "maria_rkey: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file)," maria_rkey\n");
+ row_count=1;
+
+ for (;;)
+ {
+ if ((error=maria_rnext_same(file,read_record)))
+ {
+ if (error==HA_ERR_END_OF_FILE)
+ break;
+ fprintf(stderr, "maria_next: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file)," maria_rnext_same\n");
+ row_count++;
+ }
+ if (!silent)
+ printf(" %d rows\n",row_count);
+
+ if (!silent)
+ printf("- Test maria_rfirst then a sequence of maria_rnext\n");
+
+ error=maria_rfirst(file,read_record,0);
+ if (error)
+ {
+ fprintf(stderr, "maria_rfirst: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ row_count=1;
+ print_record(read_record,maria_position(file)," maria_frirst\n");
+
+ for (i=0;i<nrecords;i++)
+ {
+ if ((error=maria_rnext(file,read_record,0)))
+ {
+ if (error==HA_ERR_END_OF_FILE)
+ break;
+ fprintf(stderr, "maria_next: %3d errno: %3d\n",error,my_errno);
+ goto err;
+ }
+ print_record(read_record,maria_position(file)," maria_rnext\n");
+ row_count++;
+ }
+ if (!silent)
+ printf(" %d rows\n",row_count);
+
+ if (!silent)
+ printf("- Test maria_records_in_range()\n");
+
+ create_record1(record, nrecords*4/5);
+ print_record(record,0,"\n");
+
+ range.key= record+1;
+ range.length= 1000; /* Big enough */
+ range.flag= HA_READ_MBR_INTERSECT;
+ hrows= maria_records_in_range(file,0, &range, (key_range*) 0);
+ if (!silent)
+ printf(" %ld rows\n", (long) hrows);
+
+end:
+ maria_scan_end(file);
+ if (die_in_middle_of_transaction)
+ {
+ /* see similar code in ma_test2.c for comments */
+ switch (die_in_middle_of_transaction) {
+ case 1:
+ _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_RELEASE, FLUSH_RELEASE);
+ break;
+ case 2:
+ if (translog_flush(file->trn->undo_lsn))
+ goto err;
+ break;
+ case 3:
+ break;
+ case 4:
+ _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE,
+ FLUSH_RELEASE);
+ if (translog_flush(file->trn->undo_lsn))
+ goto err;
+ break;
+ }
+ if (!silent)
+ printf("Dying on request without maria_commit()/maria_close()\n");
+ exit(0);
+ }
+ if (maria_commit(file))
+ goto err;
+ if (maria_close(file)) goto err;
+ maria_end();
+ my_end(MY_CHECK_ERROR);
+
+ return 0;
+
+err:
+ fprintf(stderr, "got error: %3d when using maria-database\n",my_errno);
+ return 1; /* skip warning */
+}
+
+
+
/*
  Scan the whole table with maria_scan(), printing every row.

  Deleted records (possible in DYNAMIC format) are skipped silently.

  RETURN
    0  ok (end of file reached)
    #  error code from maria_scan()
*/
static int read_with_pos (MARIA_HA * file)
{
  int error;
  int i;
  uchar read_record[MAX_REC_LENGTH];

  if (!silent)
    printf("- Reading rows with position\n");
  for (i=0;;i++)
  {
    my_errno=0;
    bzero((char*) read_record,MAX_REC_LENGTH);
    error=maria_scan(file,read_record);
    if (error)
    {
      if (error==HA_ERR_END_OF_FILE)
        break;
      if (error==HA_ERR_RECORD_DELETED)
        continue;
      fprintf(stderr, "pos: %2d maria_rrnd: %3d errno: %3d\n", i, error,
              my_errno);
      return error;
    }
    print_record(read_record,maria_position(file),"\n");
  }
  return 0;
}
+
+
#ifdef NOT_USED
/* Hex-dump the DEL marker and first 32 bytes of a record (debug helper) */
static void bprint_record(char * record,
                          my_off_t offs __attribute__((unused)),
                          const char * tail)
{
  int i;
  char * pos;
  if (silent)
    return;
  i=(unsigned char)record[0];
  printf("%02X ",i);

  for( pos=record+1, i=0; i<32; i++,pos++){
    int b=(unsigned char)*pos;
    printf("%02X",b);
  }
  printf("%s",tail);
}
#endif
+
+
+static void print_record(uchar *record,
+ my_off_t offs __attribute__((unused)),
+ const char * tail)
+{
+ int i;
+ uchar *pos;
+ double c;
+
+ if (silent)
+ return;
+ printf(" rec=(%d)",(unsigned char)record[0]);
+ for ( pos=record+1, i=0; i<2*ndims; i++)
+ {
+ memcpy(&c,pos,sizeof(c));
+ float8get(c,pos);
+ printf(" %.14g ",c);
+ pos+=sizeof(c);
+ }
+ printf("pos=%ld",(long int)offs);
+ printf("%s",tail);
+}
+
+
+
+static void create_record1(uchar *record, uint rownr)
+{
+ int i;
+ uchar *pos;
+ double c=rownr+10;
+
+ bzero((char*) record,MAX_REC_LENGTH);
+ record[0]=0x01; /* DEL marker */
+
+ for ( pos=record+1, i=0; i<2*ndims; i++)
+ {
+ memcpy(pos,&c,sizeof(c));
+ float8store(pos,c);
+ pos+=sizeof(c);
+ }
+}
+
#ifdef NOT_USED

/*
  Build a row with MBR (0, rownr+10) in every dimension (currently unused).
  NOTE(review): the memcpy() calls here are redundant - float8store()
  immediately overwrites the same bytes.
*/
static void create_record0(char *record,uint rownr)
{
  int i;
  char * pos;
  double c=rownr+10;
  double c0=0;

  bzero((char*) record,MAX_REC_LENGTH);
  record[0]=0x01; /* DEL marker */

  for ( pos=record+1, i=0; i<ndims; i++)
  {
    memcpy(pos,&c0,sizeof(c0));
    float8store(pos,c0);
    pos+=sizeof(c0);
    memcpy(pos,&c,sizeof(c));
    float8store(pos,c);
    pos+=sizeof(c);
  }
}

#endif
+
/*
  Build row number 'rownr' from the rt_data table: DEL marker byte
  followed by 2*ndims packed doubles (one rt_data row of 4 values).
*/
static void create_record(uchar *record, uint rownr)
{
  int i;
  uchar *pos;
  double *data= rt_data+rownr*4;
  record[0]=0x01; /* DEL marker */
  for ( pos=record+1, i=0; i<ndims*2; i++)
  {
    float8store(pos,data[i]);
    pos+=8;
  }
}
+
+
/* Command-line options; simple flags are stored via the variable pointers */
static struct my_option my_long_options[] =
{
  {"checkpoint", 'H', "Checkpoint at specified stage", (uchar**) &checkpoint,
   (uchar**) &checkpoint, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
  {"checksum", 'c', "Undocumented",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
#ifndef DBUG_OFF
  {"debug", '#', "Undocumented",
   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
#endif
  {"help", '?', "Display help and exit",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"row-fixed-size", 'S', "Fixed size records",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"rows-in-block", 'M', "Store rows in block format",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"silent", 's', "Undocumented",
   (uchar**) &silent, (uchar**) &silent, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0,
   0, 0},
  {"testflag", 't', "Stop test at specified stage", (uchar**) &testflag,
   (uchar**) &testflag, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
  {"test-undo", 'A',
   "Abort hard. Used for testing recovery with undo",
   (uchar**) &die_in_middle_of_transaction,
   (uchar**) &die_in_middle_of_transaction,
   0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
  {"transactional", 'T',
   "Test in transactional mode. (Only works with block format)",
   (uchar**) &transactional, (uchar**) &transactional, 0, GET_BOOL, NO_ARG,
   0, 0, 0, 0, 0, 0},
  {"versioning", 'C', "Use row versioning (only works with block format)",
   (uchar**) &opt_versioning, (uchar**) &opt_versioning, 0, GET_BOOL,
   NO_ARG, 0, 0, 0, 0, 0, 0},
  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
};
+
+
/*
  Handle the options that need code beyond the automatic variable
  assignment done by handle_options(): record format, checksums,
  debug trace and help.
*/
static my_bool
get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
               char *argument __attribute__((unused)))
{
  switch(optid) {
  case 'c':
    create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM;
    break;
  case 'M':
    record_type= BLOCK_RECORD;
    break;
  case 'S':
    record_type= STATIC_RECORD;
    break;
  case '#':
    DBUG_PUSH(argument);
    break;
  case '?':
    usage();
    exit(1);
  }
  return 0;
}
+
+
/* Read options */

static void get_options(int argc, char *argv[])
{
  int ho_error;

  /* handle_options() also fills the flag variables via my_long_options */
  if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option)))
    exit(ho_error);

  return;
} /* get options */
+
+
/* Print usage text and the current values of all options */
static void usage()
{
  printf("Usage: %s [options]\n\n", my_progname);
  my_print_help(my_long_options);
  my_print_variables(my_long_options);
}
+
#else
/* R-tree support not compiled in: the test is a no-op that "passes" */
int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused)))
{
  exit(0);
}
#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_scan.c b/storage/maria/ma_scan.c
new file mode 100644
index 00000000000..cbac463a2c8
--- /dev/null
+++ b/storage/maria/ma_scan.c
@@ -0,0 +1,74 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Read through all rows sequentially */
+
+#include "maria_def.h"
+
+/*
+  Initialize a sequential (full table) scan
+
+  RETURN
+    0  ok
+    #  my_errno (write cache flush or format specific scan_init failed)
+*/
+
+int maria_scan_init(register MARIA_HA *info)
+{
+  DBUG_ENTER("maria_scan_init");
+
+  info->cur_row.nextpos= info->s->pack.header_length;   /* Read first record */
+  info->lastinx= -1;                            /* Can't forward or backward */
+  if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache))
+    DBUG_RETURN(my_errno);
+
+  if ((*info->s->scan_init)(info))
+    DBUG_RETURN(my_errno);
+  DBUG_RETURN(0);
+}
+
+/*
+  Read next row of the active scan, based on position.
+
+  SYNOPSIS
+    maria_scan()
+    info	Maria handler
+    record	Read data here
+
+  NOTES
+    maria_scan_init() must have been called first; the position of the next
+    row to read is taken from info->cur_row.nextpos.
+
+  RETURN
+    0                      ok
+    HA_ERR_END_OF_FILE     End of file
+    HA_ERR_RECORD_DELETED  Record was deleted (can only happen for static rec)
+    #                      Error code
+*/
+
+int maria_scan(MARIA_HA *info, uchar *record)
+{
+  DBUG_ENTER("maria_scan");
+  /* Init all but update-flag */
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  DBUG_RETURN((*info->s->scan)(info, record, info->cur_row.nextpos, 1));
+}
+
+
+/* End scan; delegates cleanup to the row-format specific scan_end method */
+
+void maria_scan_end(MARIA_HA *info)
+{
+  (*info->s->scan_end)(info);
+}
+
+
+/* Default implementation: remember the position of the last scanned row */
+
+int _ma_def_scan_remember_pos(MARIA_HA *info, MARIA_RECORD_POS *lastpos)
+{
+  *lastpos= info->cur_row.lastpos;
+  return 0;
+}
+
+
+/* Default implementation: continue the scan from a remembered position */
+
+void _ma_def_scan_restore_pos(MARIA_HA *info, MARIA_RECORD_POS lastpos)
+{
+  info->cur_row.nextpos= lastpos;
+}
diff --git a/storage/maria/ma_search.c b/storage/maria/ma_search.c
new file mode 100644
index 00000000000..9f1e8e2554b
--- /dev/null
+++ b/storage/maria/ma_search.c
@@ -0,0 +1,2397 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* key handling functions */
+
+#include "ma_fulltext.h"
+#include "m_ctype.h"
+
+static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key,
+ uint32 nextflag, register my_off_t pos,
+ MARIA_PINNED_PAGE **res_page_link,
+ uchar **res_page_buff);
+static my_bool _ma_get_prev_key(MARIA_KEY *key, MARIA_PAGE *ma_page,
+ uchar *keypos);
+
+
+/*
+  Check that the new index is usable and make it the active one
+
+  RETURN
+    inx  ok
+    -1   Index out of range / disabled, or write cache flush failed
+*/
+
+int _ma_check_index(MARIA_HA *info, int inx)
+{
+  if (inx < 0 || ! maria_is_key_active(info->s->state.key_map, inx))
+  {
+    my_errno=HA_ERR_WRONG_INDEX;
+    return -1;
+  }
+  if (info->lastinx != inx)             /* Index changed */
+  {
+    info->lastinx = inx;
+    info->page_changed=1;
+    info->update= ((info->update & (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED)) |
+                   HA_STATE_NEXT_FOUND | HA_STATE_PREV_FOUND);
+  }
+  if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache))
+    return(-1);
+  return(inx);
+} /* _ma_check_index */
+
+
+/**
+  @brief Search after row by a key
+
+  @note
+  Position to row is stored in info->lastpos
+
+  @return
+  @retval  0   ok (key found)
+  @retval -1   Not found
+  @retval  1   If one should continue search on higher level
+*/
+
+int _ma_search(register MARIA_HA *info, MARIA_KEY *key, uint32 nextflag,
+               my_off_t pos)
+{
+  int error;
+  MARIA_PINNED_PAGE *page_link;
+  uchar *page_buff;
+
+  info->page_changed= 1;                        /* If page not saved */
+  if (!(error= _ma_search_no_save(info, key, nextflag, pos, &page_link,
+                                  &page_buff)))
+  {
+    if (nextflag & SEARCH_SAVE_BUFF)
+    {
+      bmove512(info->keyread_buff, page_buff, info->s->block_size);
+
+      /* Save position for a possible read next / previous */
+      info->int_keypos= info->keyread_buff + info->keypos_offset;
+      info->int_maxpos= info->keyread_buff + info->maxpos_offset;
+      info->int_keytree_version= key->keyinfo->version;
+      info->last_search_keypage= info->last_keypage;
+      info->page_changed= 0;
+      info->keyread_buff_used= 0;
+    }
+  }
+  _ma_unpin_all_pages(info, LSN_IMPOSSIBLE);
+  return (error);
+}
+
+/**
+  @brief Search after row by a key
+
+  ret_page_link	 Will contain pointer to page where we found key
+
+  @note
+  Position to row is stored in info->lastpos
+
+  @return
+  @retval  0   ok (key found)
+  @retval -1   Not found
+  @retval  1   If one should continue search on higher level
+*/
+
+static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key,
+                              uint32 nextflag, register my_off_t pos,
+                              MARIA_PINNED_PAGE **res_page_link,
+                              uchar **res_page_buff)
+{
+  my_bool last_key_not_used;
+  int error,flag;
+  uint page_flag, nod_flag, used_length;
+  uchar *keypos,*maxpos;
+  uchar lastkey[MARIA_MAX_KEY_BUFF];
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+  MARIA_PINNED_PAGE *page_link;
+  DBUG_ENTER("_ma_search");
+  DBUG_PRINT("enter",("page: %lu nextflag: %u lastpos: %lu",
+                      (ulong) (pos / info->s->block_size),
+                      nextflag, (ulong) info->cur_row.lastpos));
+  DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key););
+
+  if (pos == HA_OFFSET_ERROR)
+  {
+    my_errno=HA_ERR_KEY_NOT_FOUND;              /* Didn't find key */
+    info->cur_row.lastpos= HA_OFFSET_ERROR;
+    if (!(nextflag & (SEARCH_SMALLER | SEARCH_BIGGER | SEARCH_LAST)))
+      DBUG_RETURN(-1);                          /* Not found ; return error */
+    DBUG_RETURN(1);                             /* Search at upper levels */
+  }
+
+  if (_ma_fetch_keypage(&page, info, keyinfo, pos,
+                        PAGECACHE_LOCK_READ, DFLT_INIT_HITS, 0, 0))
+    goto err;
+  page_link= dynamic_element(&info->pinned_pages,
+                             info->pinned_pages.elements-1,
+                             MARIA_PINNED_PAGE*);
+  DBUG_DUMP("page", page.buff, page.size);
+
+  flag= (*keyinfo->bin_search)(key, &page, nextflag, &keypos, lastkey,
+                               &last_key_not_used);
+  if (flag == MARIA_FOUND_WRONG_KEY)
+    DBUG_RETURN(-1);
+  page_flag=   page.flag;
+  used_length= page.size;
+  nod_flag=    page.node;
+  maxpos=      page.buff + used_length -1;
+
+  if (flag)
+  {
+    /* Key not on this page; descend into the matching child page */
+    if ((error= _ma_search_no_save(info, key, nextflag,
+                                   _ma_kpos(nod_flag,keypos),
+                                   res_page_link, res_page_buff)) <= 0)
+      DBUG_RETURN(error);
+
+    if (flag >0)
+    {
+      if (nextflag & (SEARCH_SMALLER | SEARCH_LAST) &&
+          keypos == page.buff + info->s->keypage_header + nod_flag)
+        DBUG_RETURN(1);                                 /* Bigger than key */
+    }
+    else if (nextflag & SEARCH_BIGGER && keypos >= maxpos)
+      DBUG_RETURN(1);                                   /* Smaller than key */
+  }
+  else
+  {
+    /* Found matching key */
+    if ((nextflag & SEARCH_FIND) && nod_flag &&
+        ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME ||
+         (key->flag & SEARCH_PART_KEY) || info->s->base.born_transactional))
+    {
+      if ((error= _ma_search_no_save(info, key, (nextflag | SEARCH_FIND) &
+                                     ~(SEARCH_BIGGER | SEARCH_SMALLER |
+                                       SEARCH_LAST),
+                                     _ma_kpos(nod_flag,keypos),
+                                     res_page_link, res_page_buff)) >= 0 ||
+          my_errno != HA_ERR_KEY_NOT_FOUND)
+        DBUG_RETURN(error);
+    }
+  }
+
+  info->last_key.keyinfo= keyinfo;
+  if ((nextflag & (SEARCH_SMALLER | SEARCH_LAST)) && flag != 0)
+  {
+    uint not_used[2];
+    if (_ma_get_prev_key(&info->last_key, &page, keypos))
+      goto err;
+    /*
+      We have to use key->flag >> 1 here to transform
+      SEARCH_PAGE_KEY_HAS_TRANSID to SEARCH_USER_KEY_HAS_TRANSID
+    */
+    if (!(nextflag & SEARCH_SMALLER) &&
+        ha_key_cmp(keyinfo->seg, info->last_key.data, key->data,
+                   key->data_length + key->ref_length,
+                   SEARCH_FIND | (key->flag >> 1) | info->last_key.flag,
+                   not_used))
+    {
+      my_errno=HA_ERR_KEY_NOT_FOUND;            /* Didn't find key */
+      goto err;
+    }
+  }
+  else
+  {
+    /* Set info->last_key to temporarily point to last key value */
+    info->last_key.data= lastkey;
+    /* Get key value (if not packed key) and position after key */
+    if (!(*keyinfo->get_key)(&info->last_key, page_flag, nod_flag, &keypos))
+      goto err;
+    memcpy(info->lastkey_buff, lastkey,
+           info->last_key.data_length + info->last_key.ref_length);
+    info->last_key.data= info->lastkey_buff;
+  }
+  info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key);
+  info->cur_row.trid= _ma_trid_from_key(&info->last_key);
+
+  /* Store offset to key */
+  info->keypos_offset= (uint) (keypos - page.buff);
+  info->maxpos_offset= (uint) (maxpos - page.buff);
+  info->int_nod_flag= nod_flag;
+  info->last_keypage= pos;
+  *res_page_link= page_link;
+  *res_page_buff= page.buff;
+
+  DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos));
+  DBUG_RETURN(0);
+
+err:
+  DBUG_PRINT("exit",("Error: %d",my_errno));
+  info->cur_row.lastpos= HA_OFFSET_ERROR;
+  info->page_changed=1;
+  DBUG_RETURN (-1);
+}
+
+
+/*
+  Search after key in page-block
+
+  @fn _ma_bin_search
+  @param key         Search after this key
+  @param page        Start of data page
+  @param comp_flag   How key should be compared
+  @param ret_pos
+  @param buff        Buffer for holding a key (not used here)
+  @param last_key
+
+  @note
+   If keys are packed, then smaller or identical key is stored in buff
+
+  @return
+  @retval <0, 0 , >0 depending on whether the found key is smaller, equal or
+          bigger than 'key'
+  @retval ret_pos   Points to where the identical or bigger key starts
+  @retval last_key  Set to 1 if key is the last key in the page.
+*/
+
+int _ma_bin_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page,
+                   uint32 comp_flag, uchar **ret_pos,
+                   uchar *buff __attribute__((unused)), my_bool *last_key)
+{
+  int flag;
+  uint page_flag;
+  uint start, mid, end, save_end, totlength, nod_flag;
+  uint not_used[2];
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_SHARE *share=  keyinfo->share;
+  uchar *page;
+  DBUG_ENTER("_ma_bin_search");
+
+  LINT_INIT(flag);
+
+  page_flag= ma_page->flag;
+  if (page_flag & KEYPAGE_FLAG_HAS_TRANSID)
+  {
+    /* Keys have varying length, can't use binary search */
+    DBUG_RETURN(_ma_seq_search(key, ma_page, comp_flag, ret_pos, buff,
+                               last_key));
+  }
+
+  nod_flag=    ma_page->node;
+  totlength= keyinfo->keylength + nod_flag;
+  DBUG_ASSERT(ma_page->size >= share->keypage_header + nod_flag + totlength);
+
+  start=0;
+  mid=1;
+  save_end= end= ((ma_page->size - nod_flag - share->keypage_header) /
+                  totlength-1);
+  DBUG_PRINT("test",("page_length: %u  end: %u", ma_page->size, end));
+  page= ma_page->buff + share->keypage_header + nod_flag;
+
+  while (start != end)
+  {
+    mid= (start+end)/2;
+    if ((flag=ha_key_cmp(keyinfo->seg, page + (uint) mid * totlength,
+                         key->data, key->data_length + key->ref_length,
+                         comp_flag, not_used))
+        >= 0)
+      end=mid;
+    else
+      start=mid+1;
+  }
+  if (mid != start)
+    flag=ha_key_cmp(keyinfo->seg, page + (uint) start * totlength,
+                    key->data, key->data_length + key->ref_length, comp_flag,
+                    not_used);
+  if (flag < 0)
+    start++;                    /* point at next, bigger key */
+  *ret_pos= (page + (uint) start * totlength);
+  *last_key= end == save_end;
+  DBUG_PRINT("exit",("flag: %d  keypos: %d",flag,start));
+  DBUG_RETURN(flag);
+} /* _ma_bin_search */
+
+
+/**
+   Locate a packed key in a key page.
+
+   @fn    _ma_seq_search()
+   @param key			Search key.
+   @param page		Key page (beginning).
+   @param comp_flag	Search flags like SEARCH_SAME etc.
+   @param ret_pos
+   @param buff		Buffer for holding temp keys
+   @param last_key
+
+   @description
+   Used instead of _ma_bin_search() when key is packed.
+   Puts smaller or identical key in buff.
+   Key is searched sequentially.
+
+   @todo
+   Don't copy key to buffer if we are not using key with prefix packing
+
+   @return
+   @retval > 0         Key in 'buff' is smaller than search key.
+   @retval 0           Key in 'buff' is identical to search key.
+   @retval < 0         Not found.
+
+   @retval ret_pos   Points to where the identical or bigger key starts
+   @retval last_key  Set to 1 if key is the last key in the page
+   @retval buff      Copy of previous or identical unpacked key
+*/
+
+int _ma_seq_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page,
+                   uint32 comp_flag, uchar **ret_pos,
+                   uchar *buff, my_bool *last_key)
+{
+  int flag;
+  uint page_flag, nod_flag, length, not_used[2];
+  uchar t_buff[MARIA_MAX_KEY_BUFF], *end;
+  uchar *page;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_SHARE *share= keyinfo->share;
+  MARIA_KEY tmp_key;
+  DBUG_ENTER("_ma_seq_search");
+
+  /* LINT_INIT: flag/length are set in the loop; pages hold at least one key */
+  LINT_INIT(flag);
+  LINT_INIT(length);
+
+  page_flag= ma_page->flag;
+  nod_flag=  ma_page->node;
+  page=      ma_page->buff;
+  end= page + ma_page->size;
+  page+= share->keypage_header + nod_flag;
+  *ret_pos= page;
+  t_buff[0]=0;                                  /* Avoid bugs */
+
+  tmp_key.data= t_buff;
+  tmp_key.keyinfo= keyinfo;
+  while (page < end)
+  {
+    length=(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &page);
+    if (length == 0 || page > end)
+    {
+      maria_print_error(share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_PRINT("error",
+                 ("Found wrong key:  length: %u  page: 0x%lx  end: 0x%lx",
+                  length, (long) page, (long) end));
+      DBUG_RETURN(MARIA_FOUND_WRONG_KEY);
+    }
+    if ((flag= ha_key_cmp(keyinfo->seg, t_buff, key->data,
+                          key->data_length + key->ref_length,
+                          comp_flag | tmp_key.flag,
+                          not_used)) >= 0)
+      break;
+    DBUG_PRINT("loop_extra",("page: 0x%lx  key: '%s'  flag: %d",
+                             (long) page, t_buff, flag));
+    memcpy(buff,t_buff,length);
+    *ret_pos=page;
+  }
+  if (flag == 0)
+    memcpy(buff,t_buff,length);                 /* Result is first key */
+  *last_key= page == end;
+  DBUG_PRINT("exit",("flag: %d  ret_pos: 0x%lx", flag, (long) *ret_pos));
+  DBUG_RETURN(flag);
+} /* _ma_seq_search */
+
+
+/**
+   Search for key on key page with string prefix compression
+
+   @notes
+   This is an optimized function compared to calling _ma_get_pack_key()
+   for each key in the buffer
+
+   Same interface as for _ma_seq_search()
+*/
+
+int _ma_prefix_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page,
+                      uint32 nextflag, uchar **ret_pos, uchar *buff,
+                      my_bool *last_key)
+{
+  /*
+    my_flag is raw comparison result to be changed according to
+    SEARCH_NO_FIND,SEARCH_LAST and HA_REVERSE_SORT flags.
+    flag is the value returned by ha_key_cmp and as treated as final
+  */
+  int flag=0, my_flag=-1;
+  uint nod_flag, length, len, matched, cmplen, kseg_len;
+  uint page_flag, prefix_len,suffix_len;
+  int key_len_skip, seg_len_pack, key_len_left;
+  uchar *end, *vseg, *saved_vseg, *saved_from;
+  uchar *page;
+  uchar tt_buff[MARIA_MAX_KEY_BUFF+2], *t_buff=tt_buff+2;
+  uchar *saved_to;
+  const uchar *kseg;
+  uint saved_length=0, saved_prefix_len=0;
+  uint length_pack;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_SHARE *share= keyinfo->share;
+  const uchar *sort_order= keyinfo->seg->charset->sort_order;
+  DBUG_ENTER("_ma_prefix_search");
+
+  LINT_INIT(length);
+  LINT_INIT(prefix_len);
+  LINT_INIT(seg_len_pack);
+  LINT_INIT(saved_from);
+  LINT_INIT(saved_to);
+  LINT_INIT(saved_vseg);
+
+  t_buff[0]=0;                                  /* Avoid bugs */
+  page_flag=   ma_page->flag;
+  nod_flag=    ma_page->node;
+  page_flag&= KEYPAGE_FLAG_HAS_TRANSID;         /* For faster test in loop */
+  page= ma_page->buff;
+  end= page + ma_page->size;
+  page+= share->keypage_header + nod_flag;
+  *ret_pos= page;
+  kseg= key->data;
+
+  get_key_pack_length(kseg_len, length_pack, kseg);
+  key_len_skip=length_pack+kseg_len;
+  key_len_left=(int) (key->data_length + key->ref_length) - (int) key_len_skip;
+  /* If key_len is 0, then length_pack is 1, then key_len_left is -1. */
+  cmplen= ((key_len_left>=0) ? kseg_len :
+           (key->data_length + key->ref_length - length_pack));
+  DBUG_PRINT("info",("key: '%.*s'",kseg_len,kseg));
+
+  /*
+    Keys are compressed the following way:
+
+    If the max length of first key segment <= 127 bytes the prefix is
+    1 uchar else it's 2 byte
+
+    (prefix) length  The high bit is set if this is a prefix for the prev key.
+    [suffix length]  Packed length of suffix if the previous was a prefix.
+    (suffix) data    Key data bytes (past the common prefix or whole segment).
+    [next-key-seg]   Next key segments (([packed length], data), ...)
+    pointer          Reference to the data file (last_keyseg->length).
+  */
+
+  matched=0;  /* how many char's from prefix were already matched */
+  len=0;      /* length of previous key unpacked */
+
+  while (page < end)
+  {
+    uint packed= *page & 128;
+    uint key_flag;
+
+    vseg= page;
+    if (keyinfo->seg->length >= 127)
+    {
+      suffix_len=mi_uint2korr(vseg) & 32767;
+      vseg+=2;
+    }
+    else
+      suffix_len= *vseg++ & 127;
+
+    if (packed)
+    {
+      if (suffix_len == 0)
+      {
+        /* == 0x80 or 0x8000, same key, prefix length == old key length. */
+        prefix_len=len;
+      }
+      else
+      {
+        /* > 0x80 or 0x8000, this is prefix lgt, packed suffix lgt follows. */
+        prefix_len=suffix_len;
+        get_key_length(suffix_len,vseg);
+      }
+    }
+    else
+    {
+      /* Not packed. No prefix used from last key. */
+      prefix_len=0;
+    }
+
+    len=prefix_len+suffix_len;
+    seg_len_pack=get_pack_length(len);
+    t_buff=tt_buff+3-seg_len_pack;
+    store_key_length(t_buff,len);
+
+    if (prefix_len > saved_prefix_len)
+      memcpy(t_buff+seg_len_pack+saved_prefix_len,saved_vseg,
+             prefix_len-saved_prefix_len);
+    saved_vseg=vseg;
+    saved_prefix_len=prefix_len;
+
+    DBUG_PRINT("loop",("page: '%.*s%.*s'",prefix_len,t_buff+seg_len_pack,
+                       suffix_len,vseg));
+    {
+      /* Calculate length of one key */
+      uchar *from= vseg+suffix_len;
+      HA_KEYSEG *keyseg;
+
+      for (keyseg=keyinfo->seg+1 ; keyseg->type ; keyseg++ )
+      {
+        if (keyseg->flag & HA_NULL_PART)
+        {
+          if (!(*from++))
+            continue;
+        }
+        if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK))
+        {
+          uint key_part_length;
+          get_key_length(key_part_length,from);
+          from+= key_part_length;
+        }
+        else
+          from+= keyseg->length;
+      }
+      from+= keyseg->length;
+      key_flag=0;
+
+      if (page_flag && key_has_transid(from-1))
+      {
+        from+= transid_packed_length(from);
+        key_flag= SEARCH_PAGE_KEY_HAS_TRANSID;
+      }
+      page= from + nod_flag;
+      length= (uint) (from-vseg);
+    }
+
+    if (page > end)
+    {
+      maria_print_error(share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_PRINT("error",
+                 ("Found wrong key:  length: %u  page: 0x%lx  end: %lx",
+                  length, (long) page, (long) end));
+      DBUG_RETURN(MARIA_FOUND_WRONG_KEY);
+    }
+
+    if (matched >= prefix_len)
+    {
+      /* We have to compare. But we can still skip part of the key */
+      uint  left;
+      const uchar *k= kseg+prefix_len;
+
+      /*
+        If prefix_len > cmplen then we are in the end-space comparison
+        phase. Do not try to access the key any more ==> left= 0.
+      */
+      left= ((len <= cmplen) ? suffix_len :
+             ((prefix_len < cmplen) ? cmplen - prefix_len : 0));
+
+      matched=prefix_len+left;
+
+      if (sort_order)
+      {
+        for (my_flag=0;left;left--)
+          if ((my_flag= (int) sort_order[*vseg++] - (int) sort_order[*k++]))
+            break;
+      }
+      else
+      {
+        for (my_flag=0;left;left--)
+          if ((my_flag= (int) *vseg++ - (int) *k++))
+            break;
+      }
+
+      if (my_flag>0)      /* mismatch */
+        break;
+      if (my_flag==0) /* match */
+      {
+        /*
+        **  len cmplen seg_left_len more_segs
+        **     <                               matched=len; continue search
+        **     >      =                        prefix ? found : (matched=len;
+        *                                      continue search)
+        **     >      <                 -      ok, found
+        **     =      <                 -      ok, found
+        **     =      =                 -      ok, found
+        **     =      =                 +      next seg
+        */
+        if (len < cmplen)
+        {
+          if ((keyinfo->seg->type != HA_KEYTYPE_TEXT &&
+               keyinfo->seg->type != HA_KEYTYPE_VARTEXT1 &&
+               keyinfo->seg->type != HA_KEYTYPE_VARTEXT2))
+            my_flag= -1;
+          else
+          {
+            /* We have to compare k and vseg as if they were space extended */
+            const uchar *k_end= k+ (cmplen - len);
+            for ( ; k < k_end && *k == ' '; k++) ;
+            if (k == k_end)
+              goto cmp_rest;            /* should never happen */
+            if ((uchar) *k < (uchar) ' ')
+            {
+              my_flag= 1;               /* Compared string is smaller */
+              break;
+            }
+            my_flag= -1;                /* Continue searching */
+          }
+        }
+        else if (len > cmplen)
+        {
+          uchar *vseg_end;
+          if ((nextflag & SEARCH_PREFIX) && key_len_left == 0)
+            goto fix_flag;
+
+          /* We have to compare k and vseg as if they were space extended */
+          for (vseg_end= vseg + (len-cmplen) ;
+               vseg < vseg_end && *vseg == (uchar) ' ';
+               vseg++, matched++) ;
+          DBUG_ASSERT(vseg < vseg_end);
+
+          if ((uchar) *vseg > (uchar) ' ')
+          {
+            my_flag= 1;                 /* Compared string is smaller */
+            break;
+          }
+          my_flag= -1;                  /* Continue searching */
+        }
+        else
+        {
+      cmp_rest:
+          if (key_len_left>0)
+          {
+            uint not_used[2];
+            if ((flag = ha_key_cmp(keyinfo->seg+1,vseg,
+                                   k, key_len_left, nextflag | key_flag,
+                                   not_used)) >= 0)
+              break;
+          }
+          else
+          {
+            /*
+              at this line flag==-1 if the following lines were already
+              visited and 0 otherwise,  i.e. flag <=0 here always !!!
+            */
+      fix_flag:
+            DBUG_ASSERT(flag <= 0);
+            if (nextflag & (SEARCH_NO_FIND | SEARCH_LAST))
+              flag=(nextflag & (SEARCH_BIGGER | SEARCH_LAST)) ? -1 : 1;
+            if (flag>=0)
+              break;
+          }
+        }
+      }
+      matched-=left;
+    }
+    /* else (matched < prefix_len) ---> do nothing. */
+
+    memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len);
+    saved_to= buff+saved_length;
+    saved_from= saved_vseg;
+    saved_length=length;
+    *ret_pos=page;
+  }
+  if (my_flag)
+    flag=(keyinfo->seg->flag & HA_REVERSE_SORT) ? -my_flag : my_flag;
+  if (flag == 0)
+  {
+    memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len);
+    saved_to= buff+saved_length;
+    saved_from= saved_vseg;
+    saved_length=length;
+  }
+  if (saved_length)
+    memcpy(saved_to, saved_from, saved_length);
+
+  *last_key= page == end;
+
+  DBUG_PRINT("exit",("flag: %d  ret_pos: 0x%lx", flag, (long) *ret_pos));
+  DBUG_RETURN(flag);
+} /* _ma_prefix_search */
+
+
+/* Get pos to a key_block (decode the node pointer stored after a key) */
+
+my_off_t _ma_kpos(uint nod_flag, const uchar *after_key)
+{
+  after_key-=nod_flag;
+  switch (nod_flag) {
+#if SIZEOF_OFF_T > 4
+  case 7:
+    return mi_uint7korr(after_key)*maria_block_size;
+  case 6:
+    return mi_uint6korr(after_key)*maria_block_size;
+  case 5:
+    return mi_uint5korr(after_key)*maria_block_size;
+#else
+  case 7:
+    after_key++;
+    /* fall through */
+  case 6:
+    after_key++;
+    /* fall through */
+  case 5:
+    after_key++;
+    /* fall through */
+#endif
+  case 4:
+    return ((my_off_t) mi_uint4korr(after_key))*maria_block_size;
+  case 3:
+    return ((my_off_t) mi_uint3korr(after_key))*maria_block_size;
+  case 2:
+    return (my_off_t) (mi_uint2korr(after_key)*maria_block_size);
+  case 1:
+    return (uint) (*after_key)*maria_block_size;
+  case 0:                                       /* At leaf page */
+  default:                                      /* Impossible */
+    return(HA_OFFSET_ERROR);
+  }
+} /* _ma_kpos */
+
+
+/* Save pos to a key_block (encode a node pointer into 'buff') */
+
+void _ma_kpointer(register MARIA_HA *info, register uchar *buff, my_off_t pos)
+{
+  pos/=maria_block_size;
+  switch (info->s->base.key_reflength) {
+#if SIZEOF_OFF_T > 4
+  case 7: mi_int7store(buff,pos); break;
+  case 6: mi_int6store(buff,pos); break;
+  case 5: mi_int5store(buff,pos); break;
+#else
+  case 7: *buff++=0;
+    /* fall through */
+  case 6: *buff++=0;
+    /* fall through */
+  case 5: *buff++=0;
+    /* fall through */
+#endif
+  case 4: mi_int4store(buff,pos); break;
+  case 3: mi_int3store(buff,pos); break;
+  case 2: mi_int2store(buff,(uint) pos); break;
+  case 1: buff[0]= (uchar) pos; break;
+  default: abort();                             /* impossible */
+  }
+} /* _ma_kpointer */
+
+
+/*
+  Calc pos to a data-record from a key
+  (The record pointer is stored directly after the key data)
+*/
+
+MARIA_RECORD_POS _ma_row_pos_from_key(const MARIA_KEY *key)
+{
+  my_off_t pos;
+  const uchar *after_key= key->data + key->data_length;
+  MARIA_SHARE *share= key->keyinfo->share;
+  switch (share->rec_reflength) {
+#if SIZEOF_OFF_T > 4
+  case 8:  pos= (my_off_t) mi_uint8korr(after_key);  break;
+  case 7:  pos= (my_off_t) mi_uint7korr(after_key);  break;
+  case 6:  pos= (my_off_t) mi_uint6korr(after_key);  break;
+  case 5:  pos= (my_off_t) mi_uint5korr(after_key);  break;
+#else
+  case 8:  pos= (my_off_t) mi_uint4korr(after_key+4);   break;
+  case 7:  pos= (my_off_t) mi_uint4korr(after_key+3);   break;
+  case 6:  pos= (my_off_t) mi_uint4korr(after_key+2);   break;
+  case 5:  pos= (my_off_t) mi_uint4korr(after_key+1);   break;
+#endif
+  case 4:  pos= (my_off_t) mi_uint4korr(after_key);  break;
+  case 3:  pos= (my_off_t) mi_uint3korr(after_key);  break;
+  case 2:  pos= (my_off_t) mi_uint2korr(after_key);  break;
+  default:
+    pos=0L;                                     /* Shut compiler up */
+  }
+  return (*share->keypos_to_recpos)(share, pos);
+}
+
+
+/**
+   Get trid from a key
+
+   @param key	Maria key read from a page
+
+   @retval 0    If key doesn't have a trid
+   @retval #    trid stored in the key
+*/
+
+TrID _ma_trid_from_key(const MARIA_KEY *key)
+{
+  if (!(key->flag & (SEARCH_PAGE_KEY_HAS_TRANSID |
+                     SEARCH_USER_KEY_HAS_TRANSID)))
+    return 0;
+  return transid_get_packed(key->keyinfo->share,
+                            key->data + key->data_length +
+                            key->keyinfo->share->rec_reflength);
+}
+
+
+/*
+  Calc position from a record pointer ( in delete link chain )
+
+  RETURN
+    HA_OFFSET_ERROR  ptr is the end-of-list marker (all bits set)
+    #                record position
+*/
+
+MARIA_RECORD_POS _ma_rec_pos(MARIA_SHARE *share, uchar *ptr)
+{
+  my_off_t pos;
+  switch (share->rec_reflength) {
+#if SIZEOF_OFF_T > 4
+  case 8:
+    pos= (my_off_t) mi_uint8korr(ptr);
+    if (pos == HA_OFFSET_ERROR)
+      return HA_OFFSET_ERROR;                   /* end of list */
+    break;
+  case 7:
+    pos= (my_off_t) mi_uint7korr(ptr);
+    if (pos == (((my_off_t) 1) << 56) -1)
+      return HA_OFFSET_ERROR;                   /* end of list */
+    break;
+  case 6:
+    pos= (my_off_t) mi_uint6korr(ptr);
+    if (pos == (((my_off_t) 1) << 48) -1)
+      return HA_OFFSET_ERROR;                   /* end of list */
+    break;
+  case 5:
+    pos= (my_off_t) mi_uint5korr(ptr);
+    if (pos == (((my_off_t) 1) << 40) -1)
+      return HA_OFFSET_ERROR;                   /* end of list */
+    break;
+#else
+  case 8:
+  case 7:
+  case 6:
+  case 5:
+    ptr+= (share->rec_reflength-4);
+    /* fall through */
+#endif
+  case 4:
+    pos= (my_off_t) mi_uint4korr(ptr);
+    if (pos == (my_off_t) (uint32) ~0L)
+      return  HA_OFFSET_ERROR;
+    break;
+  case 3:
+    pos= (my_off_t) mi_uint3korr(ptr);
+    if (pos == (my_off_t) (1 << 24) -1)
+      return HA_OFFSET_ERROR;
+    break;
+  case 2:
+    pos= (my_off_t) mi_uint2korr(ptr);
+    if (pos == (my_off_t) (1 << 16) -1)
+      return HA_OFFSET_ERROR;
+    break;
+  default: abort();                             /* Impossible */
+  }
+  return (*share->keypos_to_recpos)(share, pos);
+}
+
+
+/* save position to record (encode a data-record pointer into 'buff') */
+
+void _ma_dpointer(MARIA_SHARE *share, uchar *buff, my_off_t pos)
+{
+  if (pos != HA_OFFSET_ERROR)
+    pos= (*share->recpos_to_keypos)(share, pos);
+
+  switch (share->rec_reflength) {
+#if SIZEOF_OFF_T > 4
+  case 8: mi_int8store(buff,pos); break;
+  case 7: mi_int7store(buff,pos); break;
+  case 6: mi_int6store(buff,pos); break;
+  case 5: mi_int5store(buff,pos); break;
+#else
+  case 8: *buff++=0;
+    /* fall through */
+  case 7: *buff++=0;
+    /* fall through */
+  case 6: *buff++=0;
+    /* fall through */
+  case 5: *buff++=0;
+    /* fall through */
+#endif
+  case 4: mi_int4store(buff,pos); break;
+  case 3: mi_int3store(buff,pos); break;
+  case 2: mi_int2store(buff,(uint) pos); break;
+  default: abort();                             /* Impossible */
+  }
+} /* _ma_dpointer */
+
+
+/* Convert key position to record position for static (fixed-length) rows */
+my_off_t _ma_static_keypos_to_recpos(MARIA_SHARE *share, my_off_t pos)
+{
+  return pos * share->base.pack_reclength;
+}
+
+
+/* Convert record position to key position for static (fixed-length) rows */
+my_off_t _ma_static_recpos_to_keypos(MARIA_SHARE *share, my_off_t pos)
+{
+  return pos / share->base.pack_reclength;
+}
+
+/* Identity mapping: key position and record position are the same */
+my_off_t _ma_transparent_recpos(MARIA_SHARE *share __attribute__((unused)),
+                                my_off_t pos)
+{
+  return pos;
+}
+
+/* Decode record position for transactional tables (drop the transid bit) */
+my_off_t _ma_transaction_keypos_to_recpos(MARIA_SHARE *share
+                                          __attribute__((unused)),
+                                          my_off_t pos)
+{
+  /* We need one bit to store if there is transid's after position */
+  return pos >> 1;
+}
+
+/* Inverse of _ma_transaction_keypos_to_recpos() */
+my_off_t _ma_transaction_recpos_to_keypos(MARIA_SHARE *share
+                                          __attribute__((unused)),
+                                          my_off_t pos)
+{
+  return pos << 1;
+}
+
+/*
+  @brief Get key from key-block
+
+  @param key        Should contain previous key. Will contain new key
+  @param page_flag  Flag on page block
+  @param nod_flag   Length of node pointer if we are on a node page, else 0
+  @param page       Points at previous key; It's advanced to point at next key
+
+  @notes
+    Same as _ma_get_key but used with fixed length keys
+
+  @return
+  @retval key_length + length of data pointer (without nod length)
+ */
+
+uint _ma_get_static_key(MARIA_KEY *key, uint page_flag, uint nod_flag,
+                        register uchar **page)
+{
+  register MARIA_KEYDEF *keyinfo= key->keyinfo;
+  size_t key_length= keyinfo->keylength;
+
+  key->ref_length=  keyinfo->share->rec_reflength;
+  key->data_length= key_length - key->ref_length;
+  key->flag= 0;
+  if (page_flag & KEYPAGE_FLAG_HAS_TRANSID)
+  {
+    uchar *end= *page + keyinfo->keylength;
+    if (key_has_transid(end-1))
+    {
+      uint trans_length= transid_packed_length(end);
+      key->ref_length+= trans_length;
+      key_length+= trans_length;
+      key->flag= SEARCH_PAGE_KEY_HAS_TRANSID;
+    }
+  }
+  key_length+= nod_flag;
+  memcpy(key->data, *page, key_length);
+  *page+= key_length;
+  return key_length - nod_flag;
+} /* _ma_get_static_key */
+
+
+/**
+   Skip over static length key from key-block
+
+  @fn _ma_skip_static_key()
+  @param key        Keyinfo and buffer that can be used
+  @param page_flag  Flag on page block
+  @param nod_flag   If nod: Length of node pointer, else zero.
+  @param page       Points at key
+
+  @retval pointer to next key
+*/
+
+uchar *_ma_skip_static_key(MARIA_KEY *key, uint page_flag,
+                           uint nod_flag, uchar *page)
+{
+  page+= key->keyinfo->keylength;
+  if ((page_flag & KEYPAGE_FLAG_HAS_TRANSID) && key_has_transid(page-1))
+    page+= transid_packed_length(page);
+  return page+ nod_flag;
+}
+
+
+/*
+  get key which is packed against previous key or key with a NULL column.
+
+  SYNOPSIS
+    _ma_get_pack_key()
+    @param int_key   Should contain previous key. Will contain new key
+    @param page_flag page_flag from page
+    @param nod_flag  If nod: Length of node pointer, else zero.
+    @param page_pos  Points at previous key; It's advanced to point at next key
+
+  @return
+  @retval key_length + length of data pointer
+*/
+
+uint _ma_get_pack_key(MARIA_KEY *int_key, uint page_flag,
+                      uint nod_flag, uchar **page_pos)
+{
+  reg1 HA_KEYSEG *keyseg;
+  uchar *page= *page_pos;
+  uint length;
+  uchar *key= int_key->data;
+  MARIA_KEYDEF *keyinfo= int_key->keyinfo;
+
+  for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++)
+  {
+    if (keyseg->flag & HA_PACK_KEY)
+    {
+      /* key with length, packed to previous key */
+      uchar *start= key;
+      uint packed= *page & 128,tot_length,rest_length;
+      if (keyseg->length >= 127)
+      {
+        length=mi_uint2korr(page) & 32767;
+        page+=2;
+      }
+      else
+        length= *page++ & 127;
+
+      if (packed)
+      {
+	if (length > (uint) keyseg->length)
+	{
+          maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+	  my_errno=HA_ERR_CRASHED;
+	  return 0;				/* Error */
+	}
+	if (length == 0)			/* Same key */
+	{
+	  if (keyseg->flag & HA_NULL_PART)
+	    *key++=1;				/* Can't be NULL */
+	  get_key_length(length,key);
+	  key+= length;				/* Same diff_key as prev */
+	  if (length > keyseg->length)
+	  {
+	    DBUG_PRINT("error",
+                       ("Found too long null packed key: %u of %u at 0x%lx",
+                        length, keyseg->length, (long) *page_pos));
+	    DBUG_DUMP("key", *page_pos, 16);
+            maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+	    my_errno=HA_ERR_CRASHED;
+	    return 0;
+	  }
+	  continue;
+	}
+	if (keyseg->flag & HA_NULL_PART)
+	{
+	  key++;				/* Skip null marker*/
+	  start++;
+	}
+
+	get_key_length(rest_length,page);
+	tot_length=rest_length+length;
+
+	/* If the stored length has changed, we must move the key */
+	if (tot_length >= 255 && *start != 255)
+	{
+	  /* length prefix changed from a length of one to a length of 3 */
+	  bmove_upp(key+length+3, key+length+1, length);
+	  *key=255;
+	  mi_int2store(key+1,tot_length);
+	  key+=3+length;
+	}
+	else if (tot_length < 255 && *start == 255)
+	{
+	  bmove(key+1,key+3,length);
+	  *key=tot_length;
+	  key+=1+length;
+	}
+	else
+	{
+	  store_key_length_inc(key,tot_length);
+	  key+=length;
+	}
+	memcpy(key,page,rest_length);
+	page+=rest_length;
+	key+=rest_length;
+	continue;
+      }
+      else
+      {
+        /* Key that is not packed against previous key */
+        if (keyseg->flag & HA_NULL_PART)
+        {
+          if (!length--)			/* Null part */
+          {
+            *key++=0;
+            continue;
+          }
+          *key++=1;				/* Not null */
+        }
+      }
+      if (length > (uint) keyseg->length)
+      {
+        DBUG_PRINT("error",("Found too long packed key: %u of %u at 0x%lx",
+                            length, keyseg->length, (long) *page_pos));
+        DBUG_DUMP("key", *page_pos, 16);
+        maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+        my_errno=HA_ERR_CRASHED;
+        return 0;				/* Error */
+      }
+      store_key_length_inc(key,length);
+    }
+    else
+    {
+      if (keyseg->flag & HA_NULL_PART)
+      {
+        if (!(*key++ = *page++))
+          continue;
+      }
+      if (keyseg->flag &
+          (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK))
+      {
+        uchar *tmp=page;
+        get_key_length(length,tmp);
+        length+=(uint) (tmp-page);
+      }
+      else
+        length=keyseg->length;
+    }
+    memcpy(key, page,(size_t) length);
+    key+=length;
+    page+=length;
+  }
+
+  int_key->data_length= (key - int_key->data);
+  int_key->flag= 0;
+  length= keyseg->length;
+  if (page_flag & KEYPAGE_FLAG_HAS_TRANSID)
+  {
+    uchar *end= page + length;
+    if (key_has_transid(end-1))
+    {
+      length+= transid_packed_length(end);
+      int_key->flag= SEARCH_PAGE_KEY_HAS_TRANSID;
+    }
+  }
+  int_key->ref_length= length;
+  length+= nod_flag;
+  bmove(key, page, length);
+  *page_pos= page+length;
+
+  return (int_key->data_length + int_key->ref_length);
+} /* _ma_get_pack_key */
+
+
+/**
+ skip key which is packed against previous key or key with a NULL column.
+
+ @fn _ma_skip_pack_key()
+ @param key Keyinfo and buffer that can be used
+ @param nod_flag If nod: Length of node pointer, else zero.
+ @param key Points at key
+
+ @note
+ This is in principle a simpler version of _ma_get_pack_key()
+
+ @retval pointer to next key
+*/
+
+uchar *_ma_skip_pack_key(MARIA_KEY *key, uint page_flag,
+ uint nod_flag, uchar *page)
+{
+ reg1 HA_KEYSEG *keyseg;
+ for (keyseg= key->keyinfo->seg ; keyseg->type ; keyseg++)
+ {
+ if (keyseg->flag & HA_PACK_KEY)
+ {
+ /* key with length, packed to previous key */
+ uint packed= *page & 128, length;
+ if (keyseg->length >= 127)
+ {
+ length= mi_uint2korr(page) & 32767;
+ page+= 2;
+ }
+ else
+ length= *page++ & 127;
+
+ if (packed)
+ {
+ if (length == 0) /* Same key */
+ continue;
+ get_key_length(length,page);
+ page+= length;
+ continue;
+ }
+ if ((keyseg->flag & HA_NULL_PART) && length)
+ {
+ /*
+ Keys that can have null use length+1 as the length for date as the
+ number 0 is reserved for keys that have a NULL value
+ */
+ length--;
+ }
+ page+= length;
+ }
+ else
+ {
+ if (keyseg->flag & HA_NULL_PART)
+ if (!*page++)
+ continue;
+ if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART))
+ {
+ uint length;
+ get_key_length(length,page);
+ page+=length;
+ }
+ else
+ page+= keyseg->length;
+ }
+ }
+ page+= keyseg->length;
+ if ((page_flag & KEYPAGE_FLAG_HAS_TRANSID) && key_has_transid(page-1))
+ page+= transid_packed_length(page);
+ return page + nod_flag;
+}
+
+
+/*
+  Read key that is packed relatively to previous
+
+  @param int_key   Destination; int_key->keyinfo must be set by the caller.
+                   On success int_key->data holds the fully unpacked key.
+  @param page_flag Page flags (KEYPAGE_FLAG_HAS_TRANSID etc.)
+  @param nod_flag  If nod: Length of node pointer, else zero.
+  @param page_pos  In/out: position in key page; advanced past this key.
+
+  @retval 0      Error; my_errno set to HA_ERR_CRASHED
+  @retval other  int_key->data_length + int_key->ref_length
+*/
+
+uint _ma_get_binary_pack_key(MARIA_KEY *int_key, uint page_flag, uint nod_flag,
+                             register uchar **page_pos)
+{
+  reg1 HA_KEYSEG *keyseg;
+  uchar *page, *page_end, *from, *from_end, *key;
+  uint length,tmp;
+  MARIA_KEYDEF *keyinfo= int_key->keyinfo;
+  DBUG_ENTER("_ma_get_binary_pack_key");
+
+  page= *page_pos;
+  /* Safe upper bound for scanning; may lie past the real end of the key */
+  page_end=page + MARIA_MAX_KEY_BUFF + 1;
+  key= int_key->data;
+
+  /*
+    Keys are compressed the following way:
+
+    prefix length      Packed length of prefix common with prev key.
+                       (1 or 3 bytes)
+    for each key segment:
+      [is null]        Null indicator if can be null (1 byte, zero means null)
+      [length]         Packed length if varlength (1 or 3 bytes)
+      key segment      'length' bytes of key segment value
+    pointer            Reference to the data file (last_keyseg->length).
+
+    get_key_length() is a macro. It gets the prefix length from 'page'
+    and puts it into 'length'. It increments 'page' by 1 or 3, depending
+    on the packed length of the prefix length.
+  */
+  get_key_length(length,page);
+  if (length)
+  {
+    if (length > keyinfo->maxlength)
+    {
+      DBUG_PRINT("error",
+                 ("Found too long binary packed key: %u of %u at 0x%lx",
+                  length, keyinfo->maxlength, (long) *page_pos));
+      DBUG_DUMP("key", *page_pos, 16);
+      maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(0);                                 /* Wrong key */
+    }
+    /* Key is packed against prev key, take prefix from prev key. */
+    from= key;
+    from_end= key + length;
+  }
+  else
+  {
+    /* Key is not packed against prev key, take all from page buffer. */
+    from= page;
+    from_end= page_end;
+  }
+
+  /*
+    The trouble is that key can be split in two parts:
+    The first part (prefix) is in from .. from_end - 1.
+    The second part starts at page.
+    The split can be at every byte position. So we need to check for
+    the end of the first part before using every byte.
+  */
+  for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++)
+  {
+    if (keyseg->flag & HA_NULL_PART)
+    {
+      /* If prefix is used up, switch to rest. */
+      if (from == from_end)
+      {
+        from=page;
+        from_end=page_end;
+      }
+      if (!(*key++ = *from++))
+        continue;                                     /* Null part */
+    }
+    if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK))
+    {
+      /* If prefix is used up, switch to rest. */
+      if (from == from_end) { from=page; from_end=page_end; }
+      /* Get length of dynamic length key part */
+      if ((length= (uint) (uchar) (*key++ = *from++)) == 255)
+      {
+        /* 3-byte length encoding: 255 followed by 2-byte big-endian length */
+        /* If prefix is used up, switch to rest. */
+        if (from == from_end) { from=page; from_end=page_end; }
+        length= ((uint) (uchar) ((*key++ = *from++))) << 8;
+        /* If prefix is used up, switch to rest. */
+        if (from == from_end) { from=page; from_end=page_end; }
+        length+= (uint) (uchar) ((*key++ = *from++));
+      }
+    }
+    else
+      length=keyseg->length;
+
+    if ((tmp=(uint) (from_end-from)) <= length)
+    {
+      /* Segment value itself straddles the prefix/suffix boundary */
+      key+=tmp;                                       /* Use old key */
+      length-=tmp;
+      from=page; from_end=page_end;
+    }
+    DBUG_ASSERT((int) length >= 0);
+    DBUG_PRINT("info",("key: 0x%lx from: 0x%lx length: %u",
+                       (long) key, (long) from, length));
+    /* memmove: 'from' may point into int_key->data (overlapping copy) */
+    memmove(key, from, (size_t) length);
+    key+=length;
+    from+=length;
+  }
+  /*
+    Last segment (type == 0) contains length of data pointer.
+    If we have mixed key blocks with data pointer and key block pointer,
+    we have to copy both.
+  */
+  int_key->data_length= (key - int_key->data);
+  int_key->ref_length= length= keyseg->length;
+  int_key->flag= 0;
+  if ((tmp=(uint) (from_end-from)) <= length)
+  {
+    /* Skip over the last common part of the data */
+    key+= tmp;
+    length-= tmp;
+    from= page;
+  }
+  else
+  {
+    /*
+      Remaining length is greater than max possible length.
+      This can happen only if we switched to the new key bytes already.
+      'page_end' is calculated with MARIA_MAX_KEY_BUFF. So it can be far
+      behind the real end of the key.
+    */
+    if (from_end != page_end)
+    {
+      DBUG_PRINT("error",("Error when unpacking key"));
+      maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(0);                                 /* Error */
+    }
+  }
+  if (page_flag & KEYPAGE_FLAG_HAS_TRANSID)
+  {
+    uchar *end= from + length;
+    if (key_has_transid(end-1))
+    {
+      uint trans_length= transid_packed_length(end);
+      length+= trans_length;
+      int_key->ref_length+= trans_length;
+      int_key->flag= SEARCH_PAGE_KEY_HAS_TRANSID;
+    }
+  }
+
+  /* Copy rest of data ptr and, if appropriate, trans_id and node_ptr */
+  memcpy(key, from, length + nod_flag);
+  *page_pos= from + length + nod_flag;
+
+  DBUG_RETURN(int_key->data_length + int_key->ref_length);
+}
+
+/**
+ skip key which is ptefix packed against previous key
+
+ @fn _ma_skip_binary_key()
+ @param key Keyinfo and buffer that can be used
+ @param nod_flag If nod: Length of node pointer, else zero.
+ @param key Points at key
+
+ @note
+ We have to copy the key as otherwise we don't know how much left
+ data there is of the key.
+
+ @todo
+ Implement more efficient version of this. We can ignore to copy any rest
+ key parts that are not null or not packed. We also don't have to copy
+ rowid or transid.
+
+ @retval pointer to next key
+*/
+
+uchar *_ma_skip_binary_pack_key(MARIA_KEY *key, uint page_flag,
+ uint nod_flag, uchar *page)
+{
+ if (!_ma_get_binary_pack_key(key, page_flag, nod_flag, &page))
+ return 0;
+ return page;
+}
+
+
+/**
+ @brief Get key at position without knowledge of previous key
+
+ @return pointer to next key
+*/
+
+uchar *_ma_get_key(MARIA_KEY *key, MARIA_PAGE *ma_page, uchar *keypos)
+{
+ uint page_flag, nod_flag;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ uchar *page;
+ DBUG_ENTER("_ma_get_key");
+
+ page= ma_page->buff;
+ page_flag= ma_page->flag;
+ nod_flag= ma_page->node;
+
+ if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) &&
+ ! (page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+ {
+ bmove(key->data, keypos, keyinfo->keylength+nod_flag);
+ key->ref_length= keyinfo->share->rec_reflength;
+ key->data_length= keyinfo->keylength - key->ref_length;
+ key->flag= 0;
+ DBUG_RETURN(keypos+keyinfo->keylength+nod_flag);
+ }
+ else
+ {
+ page+= keyinfo->share->keypage_header + nod_flag;
+ key->data[0]= 0; /* safety */
+ while (page <= keypos)
+ {
+ if (!(*keyinfo->get_key)(key, page_flag, nod_flag, &page))
+ {
+ maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(0);
+ }
+ }
+ }
+ DBUG_PRINT("exit",("page: 0x%lx length: %u", (long) page,
+ key->data_length + key->ref_length));
+ DBUG_RETURN(page);
+} /* _ma_get_key */
+
+
+/*
+ @brief Get key at position without knowledge of previous key
+
+ @return
+ @retval 0 ok
+ @retval 1 error
+*/
+
+static my_bool _ma_get_prev_key(MARIA_KEY *key, MARIA_PAGE *ma_page,
+ uchar *keypos)
+{
+ uint page_flag, nod_flag;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ DBUG_ENTER("_ma_get_prev_key");
+
+ page_flag= ma_page->flag;
+ nod_flag= ma_page->node;
+
+ if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) &&
+ ! (page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+ {
+ bmove(key->data, keypos - keyinfo->keylength - nod_flag,
+ keyinfo->keylength);
+ key->ref_length= keyinfo->share->rec_reflength;
+ key->data_length= keyinfo->keylength - key->ref_length;
+ key->flag= 0;
+ DBUG_RETURN(0);
+ }
+ else
+ {
+ uchar *page;
+
+ page= ma_page->buff + keyinfo->share->keypage_header + nod_flag;
+ key->data[0]= 0; /* safety */
+ DBUG_ASSERT(page != keypos);
+ while (page < keypos)
+ {
+ if (! (*keyinfo->get_key)(key, page_flag, nod_flag, &page))
+ {
+ maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(1);
+ }
+ }
+ }
+ DBUG_RETURN(0);
+} /* _ma_get_prev_key */
+
+
+/*
+ @brief Get last key from key-page before 'endpos'
+
+ @note
+ endpos may be either end of buffer or start of a key
+
+ @return
+ @retval pointer to where key starts
+*/
+
+uchar *_ma_get_last_key(MARIA_KEY *key, MARIA_PAGE *ma_page, uchar *endpos)
+{
+ uint page_flag,nod_flag;
+ uchar *lastpos, *page;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ DBUG_ENTER("_ma_get_last_key");
+ DBUG_PRINT("enter",("page: 0x%lx endpos: 0x%lx", (long) ma_page->buff,
+ (long) endpos));
+
+ page_flag= ma_page->flag;
+ nod_flag= ma_page->node;
+ page= ma_page->buff + keyinfo->share->keypage_header + nod_flag;
+
+ if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) &&
+ ! (page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+ {
+ lastpos= endpos-keyinfo->keylength-nod_flag;
+ key->ref_length= keyinfo->share->rec_reflength;
+ key->data_length= keyinfo->keylength - key->ref_length;
+ key->flag= 0;
+ if (lastpos >= page)
+ bmove(key->data, lastpos, keyinfo->keylength + nod_flag);
+ }
+ else
+ {
+ lastpos= page;
+ key->data[0]=0; /* safety */
+ while (page < endpos)
+ {
+ lastpos= page;
+ if (!(*keyinfo->get_key)(key, page_flag, nod_flag, &page))
+ {
+ DBUG_PRINT("error",("Couldn't find last key: page: 0x%lx",
+ (long) page));
+ maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(0);
+ }
+ }
+ }
+ DBUG_PRINT("exit",("lastpos: 0x%lx length: %u", (ulong) lastpos,
+ key->data_length + key->ref_length));
+ DBUG_RETURN(lastpos);
+} /* _ma_get_last_key */
+
+
+/**
+ Calculate length of unpacked key
+
+ @param info Maria handler
+ @param keyinfo key handler
+ @param key data for key
+
+ @notes
+ This function is very seldom used. It's mainly used for debugging
+ or when calculating a key length from a stored key in batch insert.
+
+ This function does *NOT* calculate length of transid size!
+ This function can't be used against a prefix packed key on a page
+
+ @return
+ @retval total length for key
+*/
+
+uint _ma_keylength(MARIA_KEYDEF *keyinfo, const uchar *key)
+{
+ reg1 HA_KEYSEG *keyseg;
+ const uchar *start;
+
+ if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)))
+ return (keyinfo->keylength);
+
+ start= key;
+ for (keyseg=keyinfo->seg ; keyseg->type ; keyseg++)
+ {
+ if (keyseg->flag & HA_NULL_PART)
+ if (!*key++)
+ continue;
+ if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART))
+ {
+ uint length;
+ get_key_length(length,key);
+ key+=length;
+ }
+ else
+ key+= keyseg->length;
+ }
+ return((uint) (key-start)+keyseg->length);
+} /* _ma_keylength */
+
+
+/*
+ Calculate length of part key.
+
+ Used in maria_rkey() to find the key found for the key-part that was used.
+ This is needed in case of multi-byte character sets where we may search
+ after '0xDF' but find 'ss'
+*/
+
+uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, register const uchar *key,
+ HA_KEYSEG *end)
+{
+ reg1 HA_KEYSEG *keyseg;
+ const uchar *start= key;
+
+ for (keyseg=keyinfo->seg ; keyseg != end ; keyseg++)
+ {
+ if (keyseg->flag & HA_NULL_PART)
+ if (!*key++)
+ continue;
+ if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART))
+ {
+ uint length;
+ get_key_length(length,key);
+ key+=length;
+ }
+ else
+ key+= keyseg->length;
+ }
+ return (uint) (key-start);
+}
+
+
+/*
+ Find next/previous record with same key
+
+ WARNING
+ This can't be used when database is touched after last read
+*/
+
+int _ma_search_next(register MARIA_HA *info, MARIA_KEY *key,
+ uint32 nextflag, my_off_t pos)
+{
+ int error;
+ uchar lastkey[MARIA_MAX_KEY_BUFF];
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ MARIA_KEY tmp_key;
+ MARIA_PAGE page;
+ DBUG_ENTER("_ma_search_next");
+ DBUG_PRINT("enter",("nextflag: %u lastpos: %lu int_keypos: 0x%lx page_changed %d keyread_buff_used: %d",
+ nextflag, (ulong) info->cur_row.lastpos,
+ (ulong) info->int_keypos,
+ info->page_changed, info->keyread_buff_used));
+ DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key););
+
+ /*
+ Force full read if we are at last key or if we are not on a leaf
+ and the key tree has changed since we used it last time
+ Note that even if the key tree has changed since last read, we can use
+ the last read data from the leaf if we haven't used the buffer for
+ something else.
+ */
+
+ if (((nextflag & SEARCH_BIGGER) && info->int_keypos >= info->int_maxpos) ||
+ info->page_changed ||
+ (info->int_keytree_version != keyinfo->version &&
+ (info->int_nod_flag || info->keyread_buff_used)))
+ DBUG_RETURN(_ma_search(info, key, nextflag | SEARCH_SAVE_BUFF,
+ pos));
+
+ if (info->keyread_buff_used)
+ {
+ if (_ma_fetch_keypage(&page, info, keyinfo, info->last_search_keypage,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ DFLT_INIT_HITS, info->keyread_buff, 0))
+ DBUG_RETURN(-1);
+ info->keyread_buff_used=0;
+ }
+ else
+ {
+ /* Last used buffer is in info->keyread_buff */
+ /* Todo: Add info->keyread_page to keep track of this */
+ _ma_page_setup(&page, info, keyinfo, 0, info->keyread_buff);
+ }
+
+ tmp_key.data= lastkey;
+ info->last_key.keyinfo= tmp_key.keyinfo= keyinfo;
+
+ if (nextflag & SEARCH_BIGGER) /* Next key */
+ {
+ if (page.node)
+ {
+ my_off_t tmp_pos= _ma_kpos(page.node, info->int_keypos);
+
+ if ((error= _ma_search(info, key, nextflag | SEARCH_SAVE_BUFF,
+ tmp_pos)) <=0)
+ DBUG_RETURN(error);
+ }
+ if (keyinfo->flag & (HA_PACK_KEY | HA_BINARY_PACK_KEY) &&
+ info->last_key.data != key->data)
+ memcpy(info->last_key.data, key->data,
+ key->data_length + key->ref_length);
+ if (!(*keyinfo->get_key)(&info->last_key, page.flag, page.node,
+ &info->int_keypos))
+ DBUG_RETURN(-1);
+ }
+ else /* Previous key */
+ {
+ /* Find start of previous key */
+ info->int_keypos= _ma_get_last_key(&tmp_key, &page, info->int_keypos);
+ if (!info->int_keypos)
+ DBUG_RETURN(-1);
+ if (info->int_keypos == info->keyread_buff + info->s->keypage_header)
+ {
+ /* Previous key was first key, read key before this one */
+ DBUG_RETURN(_ma_search(info, key, nextflag | SEARCH_SAVE_BUFF,
+ pos));
+ }
+ if (page.node &&
+ (error= _ma_search(info, key, nextflag | SEARCH_SAVE_BUFF,
+ _ma_kpos(page.node,info->int_keypos))) <= 0)
+ DBUG_RETURN(error);
+
+ /* QQ: We should be able to optimize away the following call */
+ if (! _ma_get_last_key(&info->last_key, &page, info->int_keypos))
+ DBUG_RETURN(-1);
+ }
+ info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key);
+ info->cur_row.trid= _ma_trid_from_key(&info->last_key);
+ DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos));
+ DBUG_RETURN(0);
+} /* _ma_search_next */
+
+
+/**
+ Search after position for the first row in an index
+
+ @return
+ Found row is stored in info->cur_row.lastpos
+*/
+
+int _ma_search_first(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ my_off_t pos)
+{
+ uchar *first_pos;
+ MARIA_PAGE page;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("_ma_search_first");
+
+ if (pos == HA_OFFSET_ERROR)
+ {
+ my_errno=HA_ERR_KEY_NOT_FOUND;
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ DBUG_RETURN(-1);
+ }
+
+ do
+ {
+ if (_ma_fetch_keypage(&page, info, keyinfo, pos,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ DFLT_INIT_HITS, info->keyread_buff, 0))
+ {
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ DBUG_RETURN(-1);
+ }
+ first_pos= page.buff + share->keypage_header + page.node;
+ } while ((pos= _ma_kpos(page.node, first_pos)) != HA_OFFSET_ERROR);
+
+ info->last_key.keyinfo= keyinfo;
+
+ if (!(*keyinfo->get_key)(&info->last_key, page.flag, page.node, &first_pos))
+ DBUG_RETURN(-1); /* Crashed */
+
+ info->int_keypos= first_pos;
+ info->int_maxpos= (page.buff + page.size -1);
+ info->int_nod_flag= page.node;
+ info->int_keytree_version= keyinfo->version;
+ info->last_search_keypage= info->last_keypage;
+ info->page_changed=info->keyread_buff_used=0;
+ info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key);
+ info->cur_row.trid= _ma_trid_from_key(&info->last_key);
+
+ DBUG_PRINT("exit",("found key at %lu", (ulong) info->cur_row.lastpos));
+ DBUG_RETURN(0);
+} /* _ma_search_first */
+
+
+/**
+ Search after position for the last row in an index
+
+ @return
+ Found row is stored in info->cur_row.lastpos
+*/
+
+int _ma_search_last(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ my_off_t pos)
+{
+ uchar *end_of_page;
+ MARIA_PAGE page;
+ DBUG_ENTER("_ma_search_last");
+
+ if (pos == HA_OFFSET_ERROR)
+ {
+ my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ DBUG_RETURN(-1);
+ }
+
+ do
+ {
+ if (_ma_fetch_keypage(&page, info, keyinfo, pos,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ DFLT_INIT_HITS, info->keyread_buff, 0))
+ {
+ info->cur_row.lastpos= HA_OFFSET_ERROR;
+ DBUG_RETURN(-1);
+ }
+ end_of_page= page.buff + page.size;
+ } while ((pos= _ma_kpos(page.node, end_of_page)) != HA_OFFSET_ERROR);
+
+ info->last_key.keyinfo= keyinfo;
+
+ if (!_ma_get_last_key(&info->last_key, &page, end_of_page))
+ DBUG_RETURN(-1);
+ info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key);
+ info->cur_row.trid= _ma_trid_from_key(&info->last_key);
+ info->int_keypos= info->int_maxpos= end_of_page;
+ info->int_nod_flag= page.node;
+ info->int_keytree_version= keyinfo->version;
+ info->last_search_keypage= info->last_keypage;
+ info->page_changed=info->keyread_buff_used=0;
+
+ DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos));
+ DBUG_RETURN(0);
+} /* _ma_search_last */
+
+
+
+/****************************************************************************
+**
+** Functions to store and pack a key in a page
+**
+** maria_calc_xx_key_length takes the following arguments:
+** nod_flag If nod: Length of nod-pointer
+** next_key Position to pos after the new key in buffer
+** org_key Key that was before the next key in buffer
+** prev_key Last key before current key
+** key Key that will be stored
+** s_temp Information how next key will be packed
+****************************************************************************/
+
+/* Static length key: stored as-is, no compression */
+
+int
+_ma_calc_static_key_length(const MARIA_KEY *key, uint nod_flag,
+                           uchar *next_pos __attribute__((unused)),
+                           uchar *org_key __attribute__((unused)),
+                           uchar *prev_key __attribute__((unused)),
+                           MARIA_KEY_PARAM *s_temp)
+{
+  /* Total space is key data + row reference + optional node pointer */
+  uint total_length= key->data_length + key->ref_length + nod_flag;
+
+  s_temp->key= key->data;
+  s_temp->move_length= total_length;
+  return (int) total_length;
+}
+
+/* Variable length key: stored without compression, like a static key */
+
+int
+_ma_calc_var_key_length(const MARIA_KEY *key, uint nod_flag,
+                        uchar *next_pos __attribute__((unused)),
+                        uchar *org_key __attribute__((unused)),
+                        uchar *prev_key __attribute__((unused)),
+                        MARIA_KEY_PARAM *s_temp)
+{
+  /* Total space is key data + row reference + optional node pointer */
+  uint total_length= key->data_length + key->ref_length + nod_flag;
+
+  s_temp->key= key->data;
+  s_temp->move_length= total_length;
+  return (int) total_length;
+}
+
+/**
+ @brief Calc length needed to store prefixed compressed keys
+
+ @info
+ Variable length first segment which is prefix compressed
+ (maria_chk reports 'packed + stripped')
+
+ Keys are compressed the following way:
+
+ If the max length of first key segment <= 127 bytes the prefix is
+ 1 uchar else it's 2 byte
+
+ prefix byte(s) The high bit is set if this is a prefix for the prev key
+ length Packed length if the previous was a prefix byte
+ [data_length] data bytes ('length' bytes)
+ next-key-seg Next key segments
+
+ If the first segment can have NULL:
+ If key was packed
+ data_length is length of rest of key
+ If key was not packed
+ The data_length is 0 for NULLS and 1+data_length for not null columns
+*/
+
+int
+_ma_calc_var_pack_key_length(const MARIA_KEY *int_key, uint nod_flag,
+ uchar *next_key, uchar *org_key, uchar *prev_key,
+ MARIA_KEY_PARAM *s_temp)
+{
+ reg1 HA_KEYSEG *keyseg;
+ int length;
+ uint key_length,ref_length,org_key_length=0,
+ length_pack,new_key_length,diff_flag,pack_marker;
+ const uchar *key, *start, *end, *key_end;
+ const uchar *sort_order;
+ my_bool same_length;
+ MARIA_KEYDEF *keyinfo= int_key->keyinfo;
+
+ key= int_key->data;
+ length_pack=s_temp->ref_length=s_temp->n_ref_length=s_temp->n_length=0;
+ same_length=0; keyseg=keyinfo->seg;
+ key_length= int_key->data_length + int_key->ref_length + nod_flag;
+
+ sort_order=0;
+ if ((keyinfo->flag & HA_FULLTEXT) &&
+ ((keyseg->type == HA_KEYTYPE_TEXT) ||
+ (keyseg->type == HA_KEYTYPE_VARTEXT1) ||
+ (keyseg->type == HA_KEYTYPE_VARTEXT2)) &&
+ !use_strnxfrm(keyseg->charset))
+ sort_order= keyseg->charset->sort_order;
+
+ /* diff flag contains how many bytes is needed to pack key */
+ if (keyseg->length >= 127)
+ {
+ diff_flag=2;
+ pack_marker=32768;
+ }
+ else
+ {
+ diff_flag= 1;
+ pack_marker=128;
+ }
+ s_temp->pack_marker=pack_marker;
+
+ /* Handle the case that the first part have NULL values */
+ if (keyseg->flag & HA_NULL_PART)
+ {
+ if (!*key++)
+ {
+ s_temp->key= key;
+ s_temp->key_length= 0;
+ s_temp->totlength= key_length-1+diff_flag;
+ s_temp->next_key_pos= 0; /* No next key */
+ return (s_temp->move_length= s_temp->totlength);
+ }
+ s_temp->store_not_null=1;
+ key_length--; /* We don't store NULL */
+ if (prev_key && !*prev_key++)
+ org_key=prev_key=0; /* Can't pack against prev */
+ else if (org_key)
+ org_key++; /* Skip NULL */
+ }
+ else
+ s_temp->store_not_null=0;
+ s_temp->prev_key= org_key;
+
+ /* The key part will start with a packed length */
+
+ get_key_pack_length(new_key_length,length_pack,key);
+ end= key_end= key+ new_key_length;
+ start= key;
+
+ /* Calc how many characters are identical between this and the prev. key */
+ if (prev_key)
+ {
+ get_key_length(org_key_length,prev_key);
+ s_temp->prev_key=prev_key; /* Pointer at data */
+ /* Don't use key-pack if length == 0 */
+ if (new_key_length && new_key_length == org_key_length)
+ same_length=1;
+ else if (new_key_length > org_key_length)
+ end= key + org_key_length;
+
+ if (sort_order) /* SerG */
+ {
+ while (key < end &&
+ sort_order[*key] == sort_order[*prev_key])
+ {
+ key++; prev_key++;
+ }
+ }
+ else
+ {
+ while (key < end && *key == *prev_key)
+ {
+ key++; prev_key++;
+ }
+ }
+ }
+
+ s_temp->key=key;
+ s_temp->key_length= (uint) (key_end-key);
+
+ if (same_length && key == key_end)
+ {
+ /* identical variable length key */
+ s_temp->ref_length= pack_marker;
+ length=(int) key_length-(int) (key_end-start)-length_pack;
+ length+= diff_flag;
+ if (next_key)
+ { /* Can't combine with next */
+ s_temp->n_length= *next_key; /* Needed by _ma_store_key */
+ next_key=0;
+ }
+ }
+ else
+ {
+ if (start != key)
+ { /* Starts as prev key */
+ ref_length= (uint) (key-start);
+ s_temp->ref_length= ref_length + pack_marker;
+ length= (int) (key_length - ref_length);
+
+ length-= length_pack;
+ length+= diff_flag;
+ length+= ((new_key_length-ref_length) >= 255) ? 3 : 1;/* Rest_of_key */
+ }
+ else
+ {
+ s_temp->key_length+=s_temp->store_not_null; /* If null */
+ length= key_length - length_pack+ diff_flag;
+ }
+ }
+ s_temp->totlength=(uint) length;
+ s_temp->prev_length=0;
+ DBUG_PRINT("test",("tot_length: %u length: %d uniq_key_length: %u",
+ key_length, length, s_temp->key_length));
+
+ /* If something after that hasn't length=0, test if we can combine */
+ if ((s_temp->next_key_pos=next_key))
+ {
+ uint packed,n_length;
+
+ packed = *next_key & 128;
+ if (diff_flag == 2)
+ {
+ n_length= mi_uint2korr(next_key) & 32767; /* Length of next key */
+ next_key+=2;
+ }
+ else
+ n_length= *next_key++ & 127;
+ if (!packed)
+ n_length-= s_temp->store_not_null;
+
+ if (n_length || packed) /* Don't pack 0 length keys */
+ {
+ uint next_length_pack, new_ref_length=s_temp->ref_length;
+
+ if (packed)
+ {
+ /* If first key and next key is packed (only on delete) */
+ if (!prev_key && org_key)
+ {
+ get_key_length(org_key_length,org_key);
+ key=start;
+ if (sort_order) /* SerG */
+ {
+ while (key < end &&
+ sort_order[*key] == sort_order[*org_key])
+ {
+ key++; org_key++;
+ }
+ }
+ else
+ {
+ while (key < end && *key == *org_key)
+ {
+ key++; org_key++;
+ }
+ }
+ if ((new_ref_length= (uint) (key - start)))
+ new_ref_length+=pack_marker;
+ }
+
+ if (!n_length)
+ {
+ /*
+ We put a different key between two identical variable length keys
+ Extend next key to have same prefix as this key
+ */
+ if (new_ref_length) /* prefix of previus key */
+ { /* make next key longer */
+ s_temp->part_of_prev_key= new_ref_length;
+ s_temp->prev_length= org_key_length -
+ (new_ref_length-pack_marker);
+ s_temp->n_ref_length= s_temp->part_of_prev_key;
+ s_temp->n_length= s_temp->prev_length;
+ n_length= get_pack_length(s_temp->prev_length);
+ s_temp->prev_key+= (new_ref_length - pack_marker);
+ length+= s_temp->prev_length + n_length;
+ }
+ else
+ { /* Can't use prev key */
+ s_temp->part_of_prev_key=0;
+ s_temp->prev_length= org_key_length;
+ s_temp->n_ref_length=s_temp->n_length= org_key_length;
+ length+= org_key_length;
+ }
+ return (s_temp->move_length= (int) length);
+ }
+
+ ref_length=n_length;
+ /* Get information about not packed key suffix */
+ get_key_pack_length(n_length,next_length_pack,next_key);
+
+ /* Test if new keys has fewer characters that match the previous key */
+ if (!new_ref_length)
+ { /* Can't use prev key */
+ s_temp->part_of_prev_key= 0;
+ s_temp->prev_length= ref_length;
+ s_temp->n_ref_length= s_temp->n_length= n_length+ref_length;
+ return s_temp->move_length= ((int) length+ref_length-
+ next_length_pack);
+ }
+ if (ref_length+pack_marker > new_ref_length)
+ {
+ uint new_pack_length=new_ref_length-pack_marker;
+ /* We must copy characters from the original key to the next key */
+ s_temp->part_of_prev_key= new_ref_length;
+ s_temp->prev_length= ref_length - new_pack_length;
+ s_temp->n_ref_length=s_temp->n_length=n_length + s_temp->prev_length;
+ s_temp->prev_key+= new_pack_length;
+ length-= (next_length_pack - get_pack_length(s_temp->n_length));
+ return s_temp->move_length= ((int) length + s_temp->prev_length);
+ }
+ }
+ else
+ {
+ /* Next key wasn't a prefix of previous key */
+ ref_length=0;
+ next_length_pack=0;
+ }
+ DBUG_PRINT("test",("length: %d next_key: 0x%lx", length,
+ (long) next_key));
+
+ {
+ uint tmp_length;
+ key=(start+=ref_length);
+ if (key+n_length < key_end) /* Normalize length based */
+ key_end= key+n_length;
+ if (sort_order) /* SerG */
+ {
+ while (key < key_end &&
+ sort_order[*key] == sort_order[*next_key])
+ {
+ key++; next_key++;
+ }
+ }
+ else
+ {
+ while (key < key_end && *key == *next_key)
+ {
+ key++; next_key++;
+ }
+ }
+ if (!(tmp_length=(uint) (key-start)))
+ { /* Key can't be re-packed */
+ s_temp->next_key_pos=0;
+ return (s_temp->move_length= length);
+ }
+ ref_length+=tmp_length;
+ n_length-=tmp_length;
+ length-=tmp_length+next_length_pack; /* We gained these chars */
+ }
+ if (n_length == 0 && ref_length == new_key_length)
+ {
+ s_temp->n_ref_length=pack_marker; /* Same as prev key */
+ }
+ else
+ {
+ s_temp->n_ref_length=ref_length | pack_marker;
+ length+= get_pack_length(n_length);
+ s_temp->n_length=n_length;
+ }
+ }
+ }
+ return (s_temp->move_length= length);
+}
+
+
+/*
+  Calculate length of a key which is prefix compressed (binary packed)
+
+  @param int_key   Key to store
+  @param nod_flag  If nod: Length of node pointer, else zero.
+  @param next_key  Position of the key after the new key in the buffer
+  @param org_key   Key that was before the next key in the buffer
+  @param prev_key  Last key before current key
+  @param s_temp    Out: information how the key will be packed
+
+  @return length needed (also stored in s_temp->move_length)
+*/
+
+int _ma_calc_bin_pack_key_length(const MARIA_KEY *int_key,
+                                 uint nod_flag,
+                                 uchar *next_key,
+                                 uchar *org_key, uchar *prev_key,
+                                 MARIA_KEY_PARAM *s_temp)
+{
+  uint length,key_length,ref_length;
+  const uchar *key= int_key->data;
+
+  s_temp->totlength= key_length= (int_key->data_length + int_key->ref_length+
+                                  nod_flag);
+#ifdef HAVE_valgrind
+  s_temp->n_length= s_temp->n_ref_length=0;     /* For valgrind */
+#endif
+  s_temp->key=key;
+  s_temp->prev_key=org_key;
+  if (prev_key)                                 /* If not first key in block */
+  {
+    /* pack key against previous key */
+    /*
+      As keys may be identical when running a sort in maria_chk, we
+      have to guard against the case where keys may be identical.
+      Test 'key < end' BEFORE dereferencing: with identical keys the old
+      order (*key == *prev_key && key < end) read one byte past the end.
+    */
+    const uchar *end;
+    end=key+key_length;
+    for ( ; key < end && *key == *prev_key ; key++,prev_key++) ;
+    s_temp->ref_length= ref_length=(uint) (key-s_temp->key);
+    length=key_length - ref_length + get_pack_length(ref_length);
+  }
+  else
+  {
+    /* No previous key */
+    s_temp->ref_length=ref_length=0;
+    length=key_length+1;
+  }
+  if ((s_temp->next_key_pos=next_key))          /* If another key after */
+  {
+    /* pack key against next key */
+    uint next_length,next_length_pack;
+    get_key_pack_length(next_length,next_length_pack,next_key);
+
+    /* If first key and next key is packed (only on delete) */
+    if (!prev_key && org_key && next_length)
+    {
+      /* Bounds check before dereference to avoid an out-of-bounds read */
+      const uchar *end;
+      for (key= s_temp->key, end=key+next_length ;
+           key < end && *key == *org_key ;
+           key++,org_key++) ;
+      ref_length= (uint) (key - s_temp->key);
+    }
+
+    if (next_length > ref_length)
+    {
+      /*
+        We put a key with different case between two keys with the same prefix
+        Extend next key to have same prefix as this key
+      */
+      s_temp->n_ref_length= ref_length;
+      s_temp->prev_length= next_length-ref_length;
+      s_temp->prev_key+= ref_length;
+      return s_temp->move_length= ((int) (length+ s_temp->prev_length -
+                                          next_length_pack +
+                                          get_pack_length(ref_length)));
+    }
+    /* Check how many characters are identical to next key */
+    key= s_temp->key+next_length;
+    s_temp->prev_length= 0;
+    /* Keys differ somewhere past next_length (row refs differ), so this
+       scan always terminates */
+    while (*key++ == *next_key++) ;
+    if ((ref_length= (uint) (key - s_temp->key)-1) == next_length)
+    {
+      s_temp->next_key_pos=0;
+      return (s_temp->move_length= length);     /* Can't pack next key */
+    }
+    s_temp->n_ref_length=ref_length;
+    return s_temp->move_length= (int) (length-(ref_length - next_length) -
+                                       next_length_pack +
+                                       get_pack_length(ref_length));
+  }
+  return (s_temp->move_length= (int) length);
+}
+
+
+/*
+** store a key packed with _ma_calc_xxx_key_length in page-buffert
+*/
+
+/* Store a key without any compression (static length keys) */
+
+void _ma_store_static_key(MARIA_KEYDEF *keyinfo __attribute__((unused)),
+                          register uchar *key_pos,
+                          register MARIA_KEY_PARAM *s_temp)
+{
+  /* Everything that was counted in move_length is written verbatim */
+  s_temp->changed_length= s_temp->move_length;
+  memcpy(key_pos, s_temp->key, (size_t) s_temp->changed_length);
+}
+
+
+/* store variable length key with prefix compression */
+
+/*
+  Store 'length' at 'pos' and advance 'pos' past it: one byte if 'test'
+  is true (1-byte pack marker), else two bytes, high byte first.
+*/
+#define store_pack_length(test,pos,length) { \
+  if (test) { *((pos)++) = (uchar) (length); } else \
+  { *((pos)++) = (uchar) ((length) >> 8); *((pos)++) = (uchar) (length); } }
+
+
+/*
+  Store a variable length key with prefix compression, as laid out by
+  _ma_calc_var_pack_key_length() in s_temp. May also rewrite the length
+  prefix of the following key so it stays packed correctly.
+*/
+void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo  __attribute__((unused)),
+                            register uchar *key_pos,
+                            register MARIA_KEY_PARAM *s_temp)
+{
+  uint length;
+  uchar *org_key_pos= key_pos;
+
+  if (s_temp->ref_length)
+  {
+    /* Packed against previous key */
+    store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->ref_length);
+    /* If not same key after */
+    if (s_temp->ref_length != s_temp->pack_marker)
+      store_key_length_inc(key_pos,s_temp->key_length);
+  }
+  else
+  {
+    /* Not packed against previous key */
+    store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->key_length);
+  }
+  /* Copy the (remaining) key bytes counted in totlength */
+  bmove(key_pos, s_temp->key,
+        (length= s_temp->totlength - (uint) (key_pos-org_key_pos)));
+
+  key_pos+= length;
+
+  if (!s_temp->next_key_pos)                    /* No following key */
+    goto end;
+
+  if (s_temp->prev_length)
+  {
+    /* Extend next key because new key didn't have same prefix as prev key */
+    if (s_temp->part_of_prev_key)
+    {
+      store_pack_length(s_temp->pack_marker == 128,key_pos,
+                        s_temp->part_of_prev_key);
+      store_key_length_inc(key_pos,s_temp->n_length);
+    }
+    else
+    {
+      s_temp->n_length+= s_temp->store_not_null;
+      store_pack_length(s_temp->pack_marker == 128,key_pos,
+                        s_temp->n_length);
+    }
+    memcpy(key_pos, s_temp->prev_key, s_temp->prev_length);
+    key_pos+= s_temp->prev_length;
+  }
+  else if (s_temp->n_ref_length)
+  {
+    /* Next key is re-packed against the new key */
+    store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_ref_length);
+    if (s_temp->n_ref_length != s_temp->pack_marker)
+    {
+      /* Not identical key */
+      store_key_length_inc(key_pos,s_temp->n_length);
+    }
+  }
+  else
+  {
+    /* Next key stays unpacked; only its length prefix is rewritten */
+    s_temp->n_length+= s_temp->store_not_null;
+    store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_length);
+  }
+
+end:
+  s_temp->changed_length= (uint) (key_pos - org_key_pos);
+}
+
+
+/*
+  Store a binary (prefix) packed key, as laid out by
+  _ma_calc_bin_pack_key_length() in s_temp.
+*/
+
+void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)),
+                            register uchar *key_pos,
+                            register MARIA_KEY_PARAM *s_temp)
+{
+  uchar *start= key_pos;
+  size_t suffix_length= s_temp->totlength - s_temp->ref_length;
+
+  /* Store length of the prefix shared with the previous key, then the
+     suffix that differs */
+  store_key_length_inc(key_pos, s_temp->ref_length);
+  memcpy(key_pos, s_temp->key + s_temp->ref_length, suffix_length);
+  key_pos+= suffix_length;
+
+  if (s_temp->next_key_pos)
+  {
+    /* Rewrite the following key's prefix length to pack it against us */
+    store_key_length_inc(key_pos, s_temp->n_ref_length);
+    if (s_temp->prev_length)                    /* If we must extend key */
+    {
+      memcpy(key_pos, s_temp->prev_key, s_temp->prev_length);
+      key_pos+= s_temp->prev_length;
+    }
+  }
+  s_temp->changed_length= (uint) (key_pos - start);
+}
diff --git a/storage/maria/ma_servicethread.c b/storage/maria/ma_servicethread.c
new file mode 100644
index 00000000000..a8099c998e9
--- /dev/null
+++ b/storage/maria/ma_servicethread.c
@@ -0,0 +1,134 @@
+#include "maria_def.h"
+#include "ma_servicethread.h"
+
+/**
+  Initializes the service thread control block
+
+  @param control control block; LOCK_control and COND_control must
+                 already point at storage owned by the caller
+
+  @return Operation status
+    @retval 0 OK
+    @retval 1 error
+*/
+
+int ma_service_thread_control_init(MA_SERVICE_THREAD_CONTROL *control)
+{
+  int res= 0;
+  DBUG_ENTER("ma_service_thread_control_init");
+  DBUG_PRINT("init", ("control 0x%lx", (ulong) control));
+  control->status= THREAD_DEAD; /* not yet born == dead */
+  res= (pthread_mutex_init(control->LOCK_control, MY_MUTEX_INIT_SLOW) ||
+        pthread_cond_init(control->COND_control, 0));
+  /*
+    Mark the module inited only when the mutex and condition really
+    exist; otherwise ma_service_thread_control_end() would attempt to
+    lock/destroy uninitialized synchronization objects.
+  */
+  control->inited= (res == 0);
+  DBUG_PRINT("info", ("init: %s", (res ? "Error" : "OK")));
+  DBUG_RETURN(res);
+}
+
+
+/**
+  Kill the service thread and wait for it to die
+
+  @param control control block
+
+  @note The service thread must react on THREAD_DYING (signalled via
+  COND_control) by setting status to THREAD_DEAD, broadcasting the
+  condition and exiting. The standard way to implement this is with
+  my_service_thread_sleep() and my_service_thread_signal_end().
+  After this call the mutex and condition are destroyed, so the service
+  thread must not touch the control block any more.
+*/
+
+void ma_service_thread_control_end(MA_SERVICE_THREAD_CONTROL *control)
+{
+  DBUG_ENTER("ma_service_thread_control_end");
+  DBUG_PRINT("init", ("control 0x%lx", (ulong) control));
+  DBUG_ASSERT(control->inited);
+  pthread_mutex_lock(control->LOCK_control);
+  if (control->status != THREAD_DEAD) /* thread was started OK */
+  {
+    DBUG_PRINT("info",("killing Maria background thread"));
+    control->status= THREAD_DYING; /* kill it */
+    do /* and wait for it to be dead */
+    {
+      /* wake it up if it was in a sleep */
+      pthread_cond_broadcast(control->COND_control);
+      DBUG_PRINT("info",("waiting for Maria background thread to die"));
+      pthread_cond_wait(control->COND_control, control->LOCK_control);
+    }
+    while (control->status != THREAD_DEAD);
+  }
+  pthread_mutex_unlock(control->LOCK_control);
+  pthread_mutex_destroy(control->LOCK_control);
+  pthread_cond_destroy(control->COND_control);
+  control->inited= FALSE;
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Sleep for given number of nanoseconds with reaction on thread kill
+
+  @param control control block
+  @param sleep_time time of sleeping in nanoseconds (0 means no wait,
+                    only a kill-flag check)
+
+  @return Operation status
+    @retval FALSE Time out
+    @retval TRUE Thread should be killed
+*/
+
+my_bool my_service_thread_sleep(MA_SERVICE_THREAD_CONTROL *control,
+                                ulonglong sleep_time)
+{
+  struct timespec abstime;
+  my_bool res= FALSE;
+  DBUG_ENTER("my_service_thread_sleep");
+  DBUG_PRINT("init", ("control 0x%lx", (ulong) control));
+  pthread_mutex_lock(control->LOCK_control);
+  if (control->status == THREAD_DYING)
+  {
+    pthread_mutex_unlock(control->LOCK_control);
+    DBUG_RETURN(TRUE);
+  }
+#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
+  /* LOCK_control is a pointer; no '&' when passing it to pthread calls */
+  pthread_mutex_unlock(control->LOCK_control);
+  my_sleep(100000); /* a tenth of a second */
+  pthread_mutex_lock(control->LOCK_control);
+#else
+  /* To have a killable sleep, we use timedwait like our SQL GET_LOCK() */
+  DBUG_PRINT("info", ("sleeping %llu nano seconds", sleep_time));
+  if (sleep_time)
+  {
+    set_timespec_nsec(abstime, sleep_time);
+    pthread_cond_timedwait(control->COND_control,
+                           control->LOCK_control, &abstime);
+  }
+#endif
+  /* Re-check after waking: ma_service_thread_control_end may have run */
+  if (control->status == THREAD_DYING)
+    res= TRUE;
+  pthread_mutex_unlock(control->LOCK_control);
+  DBUG_RETURN(res);
+}
+
+
+/**
+  Inform the control thread that the service thread is exiting
+
+  @param control control block
+*/
+
+void my_service_thread_signal_end(MA_SERVICE_THREAD_CONTROL *control)
+{
+  DBUG_ENTER("my_service_thread_signal_end");
+  DBUG_PRINT("init", ("control 0x%lx", (ulong) control));
+  pthread_mutex_lock(control->LOCK_control);
+  control->status = THREAD_DEAD; /* indicate that we are dead */
+  /*
+    wake up ma_service_thread_control_end which may be waiting for
+    our death
+  */
+  pthread_cond_broadcast(control->COND_control);
+  /*
+    The broadcast must happen while we still hold the mutex:
+    ma_service_thread_control_end() destroys the mutex and condition
+    as soon as it observes THREAD_DEAD.
+  */
+  pthread_mutex_unlock(control->LOCK_control);
+  DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_servicethread.h b/storage/maria/ma_servicethread.h
new file mode 100644
index 00000000000..153ff9ebd14
--- /dev/null
+++ b/storage/maria/ma_servicethread.h
@@ -0,0 +1,22 @@
+/* NOTE(review): header has no include guard — confirm it is only
+   included once per translation unit */
+#include <my_pthread.h>
+
+/* Life-cycle states of a background service thread */
+enum ma_service_thread_state {THREAD_RUNNING, THREAD_DYING, THREAD_DEAD};
+
+/*
+  Control block shared between a background service thread and the
+  thread that starts/stops it. The mutex/condition are pointers: the
+  storage they reference is owned by the caller of
+  ma_service_thread_control_init().
+*/
+typedef struct st_ma_service_thread_control
+{
+  /** 'kill' flag for the background thread */
+  enum ma_service_thread_state status;
+  /** if thread module was inited or not */
+  my_bool inited;
+  /** for killing the background thread */
+  pthread_mutex_t *LOCK_control;
+  /** for killing the background thread */
+  pthread_cond_t *COND_control;
+} MA_SERVICE_THREAD_CONTROL;
+
+
+int ma_service_thread_control_init(MA_SERVICE_THREAD_CONTROL *control);
+void ma_service_thread_control_end(MA_SERVICE_THREAD_CONTROL *control);
+my_bool my_service_thread_sleep(MA_SERVICE_THREAD_CONTROL *control,
+                                ulonglong sleep_time);
+void my_service_thread_signal_end(MA_SERVICE_THREAD_CONTROL *control);
diff --git a/storage/maria/ma_sort.c b/storage/maria/ma_sort.c
new file mode 100644
index 00000000000..f7f79f90cf0
--- /dev/null
+++ b/storage/maria/ma_sort.c
@@ -0,0 +1,1077 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Creates a index for a database by reading keys, sorting them and outputing
+ them in sorted order through MARIA_SORT_INFO functions.
+*/
+
+#include "ma_fulltext.h"
+#if defined(MSDOS) || defined(__WIN__)
+#include <fcntl.h>
+#else
+#include <stddef.h>
+#endif
+#include <queues.h>
+
+/* static variables */
+
+#undef MIN_SORT_MEMORY
+#undef MYF_RW
+#undef DISK_BUFFER_SIZE
+
+#define MERGEBUFF 15
+#define MERGEBUFF2 31
+#define MIN_SORT_MEMORY (4096-MALLOC_OVERHEAD)
+#define MYF_RW MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL)
+#define DISK_BUFFER_SIZE (IO_SIZE*16)
+
+
+/*
+ Pointers of functions for store and read keys from temp file
+*/
+
+extern void print_error _VARARGS((const char *fmt,...));
+
+/* Functions defined in this file */
+
+static ha_rows find_all_keys(MARIA_SORT_PARAM *info,uint keys,
+ uchar **sort_keys,
+ DYNAMIC_ARRAY *buffpek,int *maxbuffer,
+ IO_CACHE *tempfile,
+ IO_CACHE *tempfile_for_exceptions);
+static int write_keys(MARIA_SORT_PARAM *info, uchar **sort_keys,
+ uint count, BUFFPEK *buffpek,IO_CACHE *tempfile);
+static int write_key(MARIA_SORT_PARAM *info, uchar *key,
+ IO_CACHE *tempfile);
+static int write_index(MARIA_SORT_PARAM *info, uchar **sort_keys,
+ uint count);
+static int merge_many_buff(MARIA_SORT_PARAM *info,uint keys,
+ uchar **sort_keys,
+ BUFFPEK *buffpek,int *maxbuffer,
+ IO_CACHE *t_file);
+static uint read_to_buffer(IO_CACHE *fromfile,BUFFPEK *buffpek,
+ uint sort_length);
+static int merge_buffers(MARIA_SORT_PARAM *info,uint keys,
+ IO_CACHE *from_file, IO_CACHE *to_file,
+ uchar **sort_keys, BUFFPEK *lastbuff,
+ BUFFPEK *Fb, BUFFPEK *Tb);
+static int merge_index(MARIA_SORT_PARAM *,uint, uchar **,BUFFPEK *, int,
+ IO_CACHE *);
+static int flush_maria_ft_buf(MARIA_SORT_PARAM *info);
+
+static int write_keys_varlen(MARIA_SORT_PARAM *info, uchar **sort_keys,
+ uint count, BUFFPEK *buffpek,
+ IO_CACHE *tempfile);
+static uint read_to_buffer_varlen(IO_CACHE *fromfile,BUFFPEK *buffpek,
+ uint sort_length);
+static int write_merge_key(MARIA_SORT_PARAM *info, IO_CACHE *to_file,
+ uchar *key, uint sort_length, uint count);
+static int write_merge_key_varlen(MARIA_SORT_PARAM *info,
+ IO_CACHE *to_file, uchar *key,
+ uint sort_length, uint count);
+static inline int
+my_var_write(MARIA_SORT_PARAM *info, IO_CACHE *to_file, uchar *bufs);
+
+/*
+  Creates an index of sorted keys
+
+  Reads all keys via info->key_read, sorts them in memory (spilling
+  sorted runs to a temp file when they do not fit), merges the runs and
+  writes the result to the index.  Keys longer than info->key_length
+  ("exceptions") are written to a second temp file and inserted one by
+  one with _ma_ck_write() at the end.
+
+  SYNOPSIS
+    _ma_create_index_by_sort()
+    info          Sort parameters
+    no_messages   Set to 1 if no output
+    sortbuff_size Size of sortbuffer to allocate
+
+  RESULT
+    0    ok
+    <> 0 Error
+*/
+
+int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages,
+                             size_t sortbuff_size)
+{
+  int error,maxbuffer,skr;
+  size_t memavl,old_memavl;
+  uint keys,sort_length;
+  DYNAMIC_ARRAY buffpek;
+  ha_rows records;
+  uchar **sort_keys;
+  IO_CACHE tempfile, tempfile_for_exceptions;
+  DBUG_ENTER("_ma_create_index_by_sort");
+  DBUG_PRINT("enter",("sort_buff_size: %lu sort_length: %d max_records: %lu",
+                      (ulong) sortbuff_size, info->key_length,
+                      (ulong) info->sort_info->max_records));
+
+  /* Variable-length keys need a <length><bytes> on-disk format */
+  if (info->keyinfo->flag & HA_VAR_LENGTH_KEY)
+  {
+    info->write_keys= write_keys_varlen;
+    info->read_to_buffer=read_to_buffer_varlen;
+    info->write_key=write_merge_key_varlen;
+  }
+  else
+  {
+    info->write_keys= write_keys;
+    info->read_to_buffer=read_to_buffer;
+    info->write_key=write_merge_key;
+  }
+
+  my_b_clear(&tempfile);
+  my_b_clear(&tempfile_for_exceptions);
+  bzero((char*) &buffpek,sizeof(buffpek));
+  sort_keys= (uchar **) NULL; error= 1;
+  maxbuffer=1;
+
+  memavl=max(sortbuff_size,MIN_SORT_MEMORY);
+  records= info->sort_info->max_records;
+  sort_length= info->key_length;
+  LINT_INIT(keys);
+
+  /*
+    Find how many keys fit in the sort buffer; if the allocation fails,
+    retry with 3/4 of the previous size until MIN_SORT_MEMORY is reached.
+  */
+  while (memavl >= MIN_SORT_MEMORY)
+  {
+    if ((records < UINT_MAX32) &&
+        ((my_off_t) (records + 1) *
+         (sort_length + sizeof(char*)) <= (my_off_t) memavl))
+      keys= (uint)records+1;
+    else
+      do
+      {
+        skr=maxbuffer;
+        if (memavl < sizeof(BUFFPEK)*(uint) maxbuffer ||
+            (keys=(memavl-sizeof(BUFFPEK)*(uint) maxbuffer)/
+             (sort_length+sizeof(char*))) <= 1 ||
+            keys < (uint) maxbuffer)
+        {
+          _ma_check_print_error(info->sort_info->param,
+                                "aria_sort_buffer_size is too small");
+          goto err;
+        }
+      }
+      while ((maxbuffer= (int) (records/(keys-1)+1)) != skr);
+
+    if ((sort_keys=(uchar**) my_malloc(keys*(sort_length+sizeof(char*))+
+                                       HA_FT_MAXBYTELEN, MYF(0))))
+    {
+      if (my_init_dynamic_array(&buffpek, sizeof(BUFFPEK), maxbuffer,
+                                maxbuffer/2))
+      {
+        my_free(sort_keys,MYF(0));
+        sort_keys= 0;
+      }
+      else
+        break;
+    }
+    old_memavl=memavl;
+    if ((memavl=memavl/4*3) < MIN_SORT_MEMORY && old_memavl > MIN_SORT_MEMORY)
+      memavl=MIN_SORT_MEMORY;
+  }
+  if (memavl < MIN_SORT_MEMORY)
+  {
+    _ma_check_print_error(info->sort_info->param, "Aria sort buffer"
+                          " too small"); /* purecov: tested */
+    goto err; /* purecov: tested */
+  }
+  (*info->lock_in_memory)(info->sort_info->param);/* Everything is allocated */
+
+  if (!no_messages)
+    printf(" - Searching for keys, allocating buffer for %d keys\n",keys);
+
+  if ((records=find_all_keys(info,keys,sort_keys,&buffpek,&maxbuffer,
+                             &tempfile,&tempfile_for_exceptions))
+      == HA_POS_ERROR)
+    goto err; /* purecov: tested */
+  if (maxbuffer == 0)
+  {
+    /* Everything fitted in memory: dump the single sorted run directly */
+    if (!no_messages)
+      printf(" - Dumping %lu keys\n", (ulong) records);
+    if (write_index(info,sort_keys, (uint) records))
+      goto err; /* purecov: inspected */
+  }
+  else
+  {
+    /* Reuse the key-pointer space as plain key storage while merging */
+    keys=(keys*(sort_length+sizeof(char*)))/sort_length;
+    if (maxbuffer >= MERGEBUFF2)
+    {
+      if (!no_messages)
+        printf(" - Merging %lu keys\n", (ulong) records); /* purecov: tested */
+      if (merge_many_buff(info,keys,sort_keys,
+                          dynamic_element(&buffpek,0,BUFFPEK *),&maxbuffer,&tempfile))
+        goto err; /* purecov: inspected */
+    }
+    if (flush_io_cache(&tempfile) ||
+        reinit_io_cache(&tempfile,READ_CACHE,0L,0,0))
+      goto err; /* purecov: inspected */
+    if (!no_messages)
+      printf(" - Last merge and dumping keys\n"); /* purecov: tested */
+    if (merge_index(info,keys,sort_keys,dynamic_element(&buffpek,0,BUFFPEK *),
+                    maxbuffer,&tempfile))
+      goto err; /* purecov: inspected */
+  }
+
+  if (flush_maria_ft_buf(info) || _ma_flush_pending_blocks(info))
+    goto err;
+
+  /* Insert over-long keys that were set aside during the scan */
+  if (my_b_inited(&tempfile_for_exceptions))
+  {
+    MARIA_HA *idx=info->sort_info->info;
+    uint16 key_length;
+    MARIA_KEY key;
+    key.keyinfo= idx->s->keyinfo + info->key;
+
+    if (!no_messages)
+      printf(" - Adding exceptions\n"); /* purecov: tested */
+    if (flush_io_cache(&tempfile_for_exceptions) ||
+        reinit_io_cache(&tempfile_for_exceptions,READ_CACHE,0L,0,0))
+      goto err;
+
+    while (!my_b_read(&tempfile_for_exceptions,(uchar*)&key_length,
+                      sizeof(key_length))
+           && !my_b_read(&tempfile_for_exceptions,(uchar*)sort_keys,
+                         (uint) key_length))
+    {
+      key.data= (uchar*) sort_keys;
+      key.ref_length= idx->s->rec_reflength;
+      key.data_length= key_length - key.ref_length;
+      key.flag= 0;
+      if (_ma_ck_write(idx, &key))
+        goto err;
+    }
+  }
+
+  error =0;
+
+err:
+  /* Common cleanup for success and failure paths */
+  my_free(sort_keys, MYF(MY_ALLOW_ZERO_PTR));
+  delete_dynamic(&buffpek);
+  close_cached_file(&tempfile);
+  close_cached_file(&tempfile_for_exceptions);
+
+  DBUG_RETURN(error ? -1 : 0);
+} /* _ma_create_index_by_sort */
+
+
+/*
+  Read all keys via info->key_read and collect them in sort_keys,
+  writing each full, sorted batch to 'tempfile'. Keys longer than
+  info->key_length go to 'tempfile_for_exceptions' instead.
+  Returns the number of normal keys found, or HA_POS_ERROR on error.
+*/
+
+static ha_rows find_all_keys(MARIA_SORT_PARAM *info, uint keys,
+                             uchar **sort_keys, DYNAMIC_ARRAY *buffpek,
+                             int *maxbuffer, IO_CACHE *tempfile,
+                             IO_CACHE *tempfile_for_exceptions)
+{
+  int error;
+  uint idx;
+  DBUG_ENTER("find_all_keys");
+
+  idx=error=0;
+  /* Key storage starts right after the 'keys' pointers */
+  sort_keys[0]= (uchar*) (sort_keys+keys);
+
+  while (!(error=(*info->key_read)(info,sort_keys[idx])))
+  {
+    if (info->real_key_length > info->key_length)
+    {
+      if (write_key(info,sort_keys[idx],tempfile_for_exceptions))
+        DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */
+      continue;
+    }
+
+    if (++idx == keys)
+    {
+      /*
+        Buffer full: flush idx-1 keys as one sorted run.
+        NOTE(review): alloc_dynamic() may return NULL on OOM and the
+        result is passed unchecked to write_keys — confirm upstream.
+      */
+      if (info->write_keys(info,sort_keys,idx-1,
+                           (BUFFPEK *)alloc_dynamic(buffpek),
+                           tempfile))
+        DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */
+
+      /* Keep the last read key as the start of the next batch */
+      sort_keys[0]=(uchar*) (sort_keys+keys);
+      memcpy(sort_keys[0],sort_keys[idx-1],(size_t) info->key_length);
+      idx=1;
+    }
+    sort_keys[idx]=sort_keys[idx-1]+info->key_length;
+  }
+  if (error > 0)
+    DBUG_RETURN(HA_POS_ERROR); /* Aborted by get_key */ /* purecov: inspected */
+  if (buffpek->elements)
+  {
+    /* Runs were spilled: flush the final partial batch too */
+    if (info->write_keys(info,sort_keys,idx,(BUFFPEK *)alloc_dynamic(buffpek),
+                         tempfile))
+      DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */
+    *maxbuffer=buffpek->elements-1;
+  }
+  else
+    *maxbuffer=0;
+
+  DBUG_RETURN((*maxbuffer)*(keys-1)+idx);
+} /* find_all_keys */
+
+
+#ifdef THREAD
+/*
+  Thread entry point: per-key parallel version of find_all_keys().
+  Each thread reads, sorts and spills the keys for one index; results
+  are left in sort_param for _ma_thr_write_keys() to merge. Signals
+  sort_info->cond when the last running thread finishes.
+*/
+
+pthread_handler_t _ma_thr_find_all_keys(void *arg)
+{
+  MARIA_SORT_PARAM *sort_param= (MARIA_SORT_PARAM*) arg;
+  int error;
+  size_t memavl,old_memavl;
+  uint sort_length;
+  ulong idx, maxbuffer, keys;
+  uchar **sort_keys=0;
+
+  LINT_INIT(keys);
+
+  error=1;
+
+  if (my_thread_init())
+    goto err;
+
+  { /* Add extra block since DBUG_ENTER declare variables */
+    DBUG_ENTER("_ma_thr_find_all_keys");
+    DBUG_PRINT("enter", ("master: %d", sort_param->master));
+    if (sort_param->sort_info->got_error)
+      goto err;
+
+    if (sort_param->keyinfo->flag & HA_VAR_LENGTH_KEY)
+    {
+      sort_param->write_keys= write_keys_varlen;
+      sort_param->read_to_buffer= read_to_buffer_varlen;
+      sort_param->write_key= write_merge_key_varlen;
+    }
+    else
+    {
+      sort_param->write_keys= write_keys;
+      sort_param->read_to_buffer= read_to_buffer;
+      sort_param->write_key= write_merge_key;
+    }
+
+    my_b_clear(&sort_param->tempfile);
+    my_b_clear(&sort_param->tempfile_for_exceptions);
+    bzero((char*) &sort_param->buffpek,sizeof(sort_param->buffpek));
+    bzero((char*) &sort_param->unique, sizeof(sort_param->unique));
+
+    memavl= max(sort_param->sortbuff_size, MIN_SORT_MEMORY);
+    idx= (uint)sort_param->sort_info->max_records;
+    sort_length= sort_param->key_length;
+    maxbuffer= 1;
+
+    /* Size the buffer as in _ma_create_index_by_sort(), retrying smaller */
+    while (memavl >= MIN_SORT_MEMORY)
+    {
+      if ((my_off_t) (idx+1)*(sort_length+sizeof(char*)) <= (my_off_t) memavl)
+        keys= idx+1;
+      else
+      {
+        ulong skr;
+        do
+        {
+          skr= maxbuffer;
+          if (memavl < sizeof(BUFFPEK)*maxbuffer ||
+              (keys=(memavl-sizeof(BUFFPEK)*maxbuffer)/
+               (sort_length+sizeof(char*))) <= 1 ||
+              keys < maxbuffer)
+          {
+            _ma_check_print_error(sort_param->sort_info->param,
+                                  "aria_sort_buffer_size is too small");
+            goto err;
+          }
+        }
+        while ((maxbuffer= (int) (idx/(keys-1)+1)) != skr);
+      }
+      if ((sort_keys= (uchar **)
+           my_malloc(keys*(sort_length+sizeof(char*))+
+                     ((sort_param->keyinfo->flag & HA_FULLTEXT) ?
+                      HA_FT_MAXBYTELEN : 0), MYF(0))))
+      {
+        if (my_init_dynamic_array(&sort_param->buffpek, sizeof(BUFFPEK),
+                                  maxbuffer, maxbuffer/2))
+        {
+          my_free(sort_keys, MYF(0));
+          sort_keys= (uchar **) NULL; /* for err: label */
+        }
+        else
+          break;
+      }
+      old_memavl= memavl;
+      if ((memavl= memavl/4*3) < MIN_SORT_MEMORY &&
+          old_memavl > MIN_SORT_MEMORY)
+        memavl= MIN_SORT_MEMORY;
+    }
+    if (memavl < MIN_SORT_MEMORY)
+    {
+      _ma_check_print_error(sort_param->sort_info->param,
+                            "Aria sort buffer too small");
+      goto err; /* purecov: tested */
+    }
+
+    if (sort_param->sort_info->param->testflag & T_VERBOSE)
+      printf("Key %d - Allocating buffer for %lu keys\n",
+             sort_param->key+1, (ulong) keys);
+    sort_param->sort_keys= sort_keys;
+
+    idx= error= 0;
+    sort_keys[0]= (uchar*) (sort_keys+keys);
+
+    DBUG_PRINT("info", ("reading keys"));
+    /* Stop early if another thread already hit an error */
+    while (!(error= sort_param->sort_info->got_error) &&
+           !(error= (*sort_param->key_read)(sort_param, sort_keys[idx])))
+    {
+      if (sort_param->real_key_length > sort_param->key_length)
+      {
+        if (write_key(sort_param,sort_keys[idx],
+                      &sort_param->tempfile_for_exceptions))
+          goto err;
+        continue;
+      }
+
+      if (++idx == keys)
+      {
+        if (sort_param->write_keys(sort_param, sort_keys, idx - 1,
+                                   (BUFFPEK *)alloc_dynamic(&sort_param->
+                                                            buffpek),
+                                   &sort_param->tempfile))
+          goto err;
+        sort_keys[0]= (uchar*) (sort_keys+keys);
+        memcpy(sort_keys[0], sort_keys[idx - 1],
+               (size_t) sort_param->key_length);
+        idx= 1;
+      }
+      sort_keys[idx]=sort_keys[idx - 1] + sort_param->key_length;
+    }
+    if (error > 0)
+      goto err;
+    if (sort_param->buffpek.elements)
+    {
+      if (sort_param->write_keys(sort_param,sort_keys, idx,
+                                 (BUFFPEK *) alloc_dynamic(&sort_param->
+                                                           buffpek),
+                                 &sort_param->tempfile))
+        goto err;
+      sort_param->keys= (sort_param->buffpek.elements - 1) * (keys - 1) + idx;
+    }
+    else
+      sort_param->keys= idx;
+
+    sort_param->sort_keys_length= keys;
+    goto ok;
+
+err:
+    DBUG_PRINT("error", ("got some error"));
+    sort_param->sort_info->got_error= 1; /* no need to protect with a mutex */
+    my_free(sort_keys,MYF(MY_ALLOW_ZERO_PTR));
+    sort_param->sort_keys=0;
+    delete_dynamic(& sort_param->buffpek);
+    close_cached_file(&sort_param->tempfile);
+    close_cached_file(&sort_param->tempfile_for_exceptions);
+
+ok:
+    free_root(&sort_param->wordroot, MYF(0));
+    /*
+      Detach from the share if the writer is involved. Avoid others to
+      be blocked. This includes a flush of the write buffer. This will
+      also indicate EOF to the readers.
+    */
+    if (sort_param->sort_info->info->rec_cache.share)
+      remove_io_thread(&sort_param->sort_info->info->rec_cache);
+
+    /* Readers detach from the share if any. Avoid others to be blocked. */
+    if (sort_param->read_cache.share)
+      remove_io_thread(&sort_param->read_cache);
+
+    /* Last thread out wakes the coordinator */
+    pthread_mutex_lock(&sort_param->sort_info->mutex);
+    if (!--sort_param->sort_info->threads_running)
+      pthread_cond_signal(&sort_param->sort_info->cond);
+    pthread_mutex_unlock(&sort_param->sort_info->mutex);
+    DBUG_PRINT("exit", ("======== ending thread ========"));
+  }
+  my_thread_end();
+  return NULL;
+}
+
+
+/*
+  Coordinator side of the parallel sort: for every per-key
+  MARIA_SORT_PARAM filled by _ma_thr_find_all_keys(), merge its spilled
+  runs (or dump its in-memory keys), update key statistics, and insert
+  over-long "exception" keys. Returns 0 on success, non-zero on error.
+*/
+int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param)
+{
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
+  ulong length, keys;
+  double *rec_per_key_part= param->new_rec_per_key_part;
+  int got_error=sort_info->got_error;
+  uint i;
+  MARIA_HA *info=sort_info->info;
+  MARIA_SHARE *share= info->s;
+  MARIA_SORT_PARAM *sinfo;
+  uchar *mergebuf=0;
+  DBUG_ENTER("_ma_thr_write_keys");
+  LINT_INIT(length);
+
+  /* First pass: dump in-memory runs, update stats, free sort buffers */
+  for (i= 0, sinfo= sort_param ;
+       i < sort_info->total_keys ;
+       i++, rec_per_key_part+=sinfo->keyinfo->keysegs, sinfo++)
+  {
+    if (!sinfo->sort_keys)
+    {
+      got_error=1;
+      my_free(sinfo->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+      continue;
+    }
+    if (!got_error)
+    {
+      maria_set_key_active(share->state.key_map, sinfo->key);
+
+      if (!sinfo->buffpek.elements)
+      {
+        /* No spilled runs: everything is in memory, write directly */
+        if (param->testflag & T_VERBOSE)
+        {
+          printf("Key %d - Dumping %u keys\n",sinfo->key+1, sinfo->keys);
+          fflush(stdout);
+        }
+        if (write_index(sinfo, sinfo->sort_keys, sinfo->keys) ||
+            flush_maria_ft_buf(sinfo) || _ma_flush_pending_blocks(sinfo))
+          got_error=1;
+      }
+      if (!got_error && param->testflag & T_STATISTICS)
+        maria_update_key_parts(sinfo->keyinfo, rec_per_key_part, sinfo->unique,
+                               param->stats_method ==
+                               MI_STATS_METHOD_IGNORE_NULLS ?
+                               sinfo->notnull : NULL,
+                               (ulonglong) share->state.state.records);
+    }
+    my_free(sinfo->sort_keys,MYF(0));
+    my_free(sinfo->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+    sinfo->sort_keys=0;
+  }
+
+  /* Second pass: merge spilled runs and add exception keys.
+     The per-iteration cleanup runs even when 'got_error' skips the body. */
+  for (i= 0, sinfo= sort_param ;
+       i < sort_info->total_keys ;
+       i++,
+         delete_dynamic(&sinfo->buffpek),
+         close_cached_file(&sinfo->tempfile),
+         close_cached_file(&sinfo->tempfile_for_exceptions),
+         sinfo++)
+  {
+    if (got_error)
+      continue;
+    if (sinfo->keyinfo->flag & HA_VAR_LENGTH_KEY)
+    {
+      sinfo->write_keys=write_keys_varlen;
+      sinfo->read_to_buffer=read_to_buffer_varlen;
+      sinfo->write_key=write_merge_key_varlen;
+    }
+    else
+    {
+      sinfo->write_keys=write_keys;
+      sinfo->read_to_buffer=read_to_buffer;
+      sinfo->write_key=write_merge_key;
+    }
+    if (sinfo->buffpek.elements)
+    {
+      uint maxbuffer=sinfo->buffpek.elements-1;
+      /* Lazily allocate one shared merge buffer, shrinking on OOM */
+      if (!mergebuf)
+      {
+        length=param->sort_buffer_length;
+        while (length >= MIN_SORT_MEMORY)
+        {
+          if ((mergebuf= my_malloc(length, MYF(0))))
+            break;
+          length=length*3/4;
+        }
+        if (!mergebuf)
+        {
+          got_error=1;
+          continue;
+        }
+      }
+      keys=length/sinfo->key_length;
+      if (maxbuffer >= MERGEBUFF2)
+      {
+        if (param->testflag & T_VERBOSE)
+          printf("Key %d - Merging %u keys\n",sinfo->key+1, sinfo->keys);
+        if (merge_many_buff(sinfo, keys, (uchar **) mergebuf,
+                            dynamic_element(&sinfo->buffpek, 0, BUFFPEK *),
+                            (int*) &maxbuffer, &sinfo->tempfile))
+        {
+          got_error=1;
+          continue;
+        }
+      }
+      if (flush_io_cache(&sinfo->tempfile) ||
+          reinit_io_cache(&sinfo->tempfile,READ_CACHE,0L,0,0))
+      {
+        got_error=1;
+        continue;
+      }
+      if (param->testflag & T_VERBOSE)
+        printf("Key %d - Last merge and dumping keys\n", sinfo->key+1);
+      if (merge_index(sinfo, keys, (uchar**) mergebuf,
+                      dynamic_element(&sinfo->buffpek,0,BUFFPEK *),
+                      maxbuffer,&sinfo->tempfile) ||
+          flush_maria_ft_buf(sinfo) ||
+          _ma_flush_pending_blocks(sinfo))
+      {
+        got_error=1;
+        continue;
+      }
+    }
+    if (my_b_inited(&sinfo->tempfile_for_exceptions))
+    {
+      uint16 key_length;
+
+      if (param->testflag & T_VERBOSE)
+        printf("Key %d - Dumping 'long' keys\n", sinfo->key+1);
+
+      if (flush_io_cache(&sinfo->tempfile_for_exceptions) ||
+          reinit_io_cache(&sinfo->tempfile_for_exceptions,READ_CACHE,0L,0,0))
+      {
+        got_error=1;
+        continue;
+      }
+
+      while (!got_error &&
+             !my_b_read(&sinfo->tempfile_for_exceptions,(uchar*)&key_length,
+                        sizeof(key_length)))
+      {
+        uchar maria_ft_buf[HA_FT_MAXBYTELEN + HA_FT_WLEN + 10];
+        if (key_length > sizeof(maria_ft_buf) ||
+            my_b_read(&sinfo->tempfile_for_exceptions, (uchar*)maria_ft_buf,
+                      (uint) key_length))
+          got_error= 1;
+        else
+        {
+          MARIA_KEY tmp_key;
+          tmp_key.keyinfo= info->s->keyinfo + sinfo->key;
+          tmp_key.data= maria_ft_buf;
+          tmp_key.ref_length= info->s->rec_reflength;
+          tmp_key.data_length= key_length - info->s->rec_reflength;
+          tmp_key.flag= 0;
+          if (_ma_ck_write(info, &tmp_key))
+            got_error=1;
+        }
+      }
+    }
+  }
+  my_free(mergebuf,MYF(MY_ALLOW_ZERO_PTR));
+  DBUG_RETURN(got_error);
+}
+#endif /* THREAD */
+
+
+/*
+  Sort the in-memory keys and append them as one fixed-length run to
+  'tempfile', recording the run's position and count in *buffpek.
+  Opens the cache file on first use. Returns 0 on success, 1 on error.
+*/
+
+static int write_keys(MARIA_SORT_PARAM *info, register uchar **sort_keys,
+                      uint count, BUFFPEK *buffpek, IO_CACHE *tempfile)
+{
+  uchar **end;
+  uint sort_length=info->key_length;
+  DBUG_ENTER("write_keys");
+
+  my_qsort2((uchar*) sort_keys,count,sizeof(uchar*),(qsort2_cmp) info->key_cmp,
+            info);
+  if (!my_b_inited(tempfile) &&
+      open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST",
+                       DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+    DBUG_RETURN(1); /* purecov: inspected */
+
+  buffpek->file_pos=my_b_tell(tempfile);
+  buffpek->count=count;
+
+  for (end=sort_keys+count ; sort_keys != end ; sort_keys++)
+  {
+    if (my_b_write(tempfile, *sort_keys, (uint) sort_length))
+      DBUG_RETURN(1); /* purecov: inspected */
+  }
+  DBUG_RETURN(0);
+} /* write_keys */
+
+
+/*
+  Write one variable-length key as <uint16 length><key bytes>.
+  Returns 0 on success, otherwise the my_b_write() error code.
+*/
+static inline int
+my_var_write(MARIA_SORT_PARAM *info, IO_CACHE *to_file, uchar *bufs)
+{
+  int rc;
+  uint16 key_len= _ma_keylength(info->keyinfo, bufs);
+
+  /* Writing the raw length prefix is safe: this is a local temp file */
+  rc= my_b_write(to_file, (uchar*) &key_len, sizeof(key_len));
+  if (!rc)
+    rc= my_b_write(to_file, bufs, (uint) key_len);
+  return rc;
+}
+
+
+/*
+  Variable-length variant of write_keys(): sort the in-memory keys and
+  append them to 'tempfile' in <uint16 length><bytes> format, recording
+  the run's position and count in *buffpek. Returns 0 on success.
+*/
+static int write_keys_varlen(MARIA_SORT_PARAM *info,
+                             register uchar **sort_keys,
+                             uint count, BUFFPEK *buffpek,
+                             IO_CACHE *tempfile)
+{
+  uchar **end;
+  int err;
+  DBUG_ENTER("write_keys_varlen");
+
+  my_qsort2((uchar*) sort_keys,count,sizeof(uchar*),(qsort2_cmp) info->key_cmp,
+            info);
+  if (!my_b_inited(tempfile) &&
+      open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST",
+                       DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+    DBUG_RETURN(1); /* purecov: inspected */
+
+  buffpek->file_pos=my_b_tell(tempfile);
+  buffpek->count=count;
+  for (end=sort_keys+count ; sort_keys != end ; sort_keys++)
+  {
+    if ((err= my_var_write(info,tempfile, *sort_keys)))
+      DBUG_RETURN(err);
+  }
+  DBUG_RETURN(0);
+} /* write_keys_varlen */
+
+
+/*
+  Append one "exception" key (longer than the normal sort key length)
+  to 'tempfile' as <uint16 length><key bytes>. The cache file is opened
+  on first use. Returns 0 on success, 1 on error.
+*/
+static int write_key(MARIA_SORT_PARAM *info, uchar *key,
+                     IO_CACHE *tempfile)
+{
+  uint16 len= info->real_key_length;
+  DBUG_ENTER("write_key");
+
+  if (!my_b_inited(tempfile))
+  {
+    if (open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST",
+                         DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+      DBUG_RETURN(1);
+  }
+  if (my_b_write(tempfile, (uchar*) &len, sizeof(len)) ||
+      my_b_write(tempfile, key, (uint) len))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+} /* write_key */
+
+
+/* Sort the in-memory keys and write them straight to the index */
+
+static int write_index(MARIA_SORT_PARAM *info,
+                       register uchar **sort_keys,
+                       register uint count)
+{
+  uchar **key_ptr, **key_end;
+  DBUG_ENTER("write_index");
+
+  my_qsort2((uchar*) sort_keys,(size_t) count,sizeof(uchar*),
+            (qsort2_cmp) info->key_cmp,info);
+  for (key_ptr= sort_keys, key_end= sort_keys + count ;
+       key_ptr != key_end ;
+       key_ptr++)
+  {
+    if ((*info->key_write)(info, *key_ptr))
+      DBUG_RETURN(-1); /* purecov: inspected */
+  }
+  DBUG_RETURN(0);
+} /* write_index */
+
+
+ /* Merge buffers to make < MERGEBUFF2 buffers */
+
+/*
+  Repeatedly merge groups of MERGEBUFF runs between t_file and a second
+  temp file until fewer than MERGEBUFF2 runs remain. On exit *maxbuffer
+  holds the new highest run index and t_file holds the surviving runs.
+  Returns 0 on success, 1 on error/interruption.
+*/
+static int merge_many_buff(MARIA_SORT_PARAM *info, uint keys,
+                           uchar **sort_keys, BUFFPEK *buffpek,
+                           int *maxbuffer, IO_CACHE *t_file)
+{
+  register int i;
+  IO_CACHE t_file2, *from_file, *to_file, *temp;
+  BUFFPEK *lastbuff;
+  DBUG_ENTER("merge_many_buff");
+
+  if (*maxbuffer < MERGEBUFF2)
+    DBUG_RETURN(0); /* purecov: inspected */
+  if (flush_io_cache(t_file) ||
+      open_cached_file(&t_file2,my_tmpdir(info->tmpdir),"ST",
+                       DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+    DBUG_RETURN(1); /* purecov: inspected */
+
+  from_file= t_file ; to_file= &t_file2;
+  while (*maxbuffer >= MERGEBUFF2)
+  {
+    reinit_io_cache(from_file,READ_CACHE,0L,0,0);
+    reinit_io_cache(to_file,WRITE_CACHE,0L,0,0);
+    lastbuff=buffpek;
+    /* Merge full groups of MERGEBUFF runs ... */
+    for (i=0 ; i <= *maxbuffer-MERGEBUFF*3/2 ; i+=MERGEBUFF)
+    {
+      if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++,
+                        buffpek+i,buffpek+i+MERGEBUFF-1))
+        goto cleanup;
+    }
+    /* ... then the remaining tail in one merge */
+    if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++,
+                      buffpek+i,buffpek+ *maxbuffer))
+      break; /* purecov: inspected */
+    if (flush_io_cache(to_file))
+      break; /* purecov: inspected */
+    /* Swap roles: this pass's output becomes the next pass's input */
+    temp=from_file; from_file=to_file; to_file=temp;
+    *maxbuffer= (int) (lastbuff-buffpek)-1;
+  }
+cleanup:
+  close_cached_file(to_file); /* This holds old result */
+  if (to_file == t_file)
+    *t_file=t_file2; /* Copy result file */
+
+  DBUG_RETURN(*maxbuffer >= MERGEBUFF2); /* Return 1 if interrupted */
+} /* merge_many_buff */
+
+
+/*
+  Read data to buffer
+
+  SYNOPSIS
+    read_to_buffer()
+    fromfile   File to read from
+    buffpek    Where to read from
+    sort_length max length to read
+  RESULT
+    > 0 Amount of bytes read
+    -1  Error
+*/
+
+static uint read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek,
+                           uint sort_length)
+{
+  register uint count;
+  uint length;
+
+  /* Read at most max_keys fixed-length keys from this run */
+  if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count)))
+  {
+    if (my_pread(fromfile->file, buffpek->base,
+                 (length= sort_length*count),buffpek->file_pos,MYF_RW))
+      return((uint) -1); /* purecov: inspected */
+    buffpek->key=buffpek->base;
+    buffpek->file_pos+= length; /* New filepos */
+    buffpek->count-= count;
+    buffpek->mem_count= count;
+  }
+  return (count*sort_length);
+} /* read_to_buffer */
+
+/*
+  Variable-length variant of read_to_buffer(): each key is stored as
+  <uint16 length><bytes>, but keys are placed sort_length apart in the
+  buffer so the merge code can treat them as fixed slots.
+  Returns bytes consumed in the buffer, or (uint) -1 on error.
+*/
+static uint read_to_buffer_varlen(IO_CACHE *fromfile, BUFFPEK *buffpek,
+                                  uint sort_length)
+{
+  register uint count;
+  uint idx;
+  uchar *buffp;
+
+  if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count)))
+  {
+    buffp= buffpek->base;
+
+    for (idx=1;idx<=count;idx++)
+    {
+      uint16 length_of_key;
+      if (my_pread(fromfile->file,(uchar*)&length_of_key,sizeof(length_of_key),
+                   buffpek->file_pos,MYF_RW))
+        return((uint) -1);
+      buffpek->file_pos+=sizeof(length_of_key);
+      /*
+        NOTE(review): length_of_key is trusted to be <= sort_length;
+        the file is written by write_keys_varlen() so this should hold,
+        but there is no defensive check here — confirm.
+      */
+      if (my_pread(fromfile->file, buffp, length_of_key,
+                   buffpek->file_pos,MYF_RW))
+        return((uint) -1);
+      buffpek->file_pos+=length_of_key;
+      buffp = buffp + sort_length;
+    }
+    buffpek->key=buffpek->base;
+    buffpek->count-= count;
+    buffpek->mem_count= count;
+  }
+  return (count*sort_length);
+} /* read_to_buffer_varlen */
+
+
+/*
+  Write 'count' variable-length keys, spaced sort_length apart in
+  memory, to to_file in <uint16 length><bytes> format.
+  Returns 0 on success, otherwise the my_var_write() error code.
+*/
+static int write_merge_key_varlen(MARIA_SORT_PARAM *info,
+                                  IO_CACHE *to_file, uchar* key,
+                                  uint sort_length, uint count)
+{
+  uint i;
+  uchar *pos= key;
+
+  for (i= 0; i < count; i++, pos+= sort_length)
+  {
+    int rc= my_var_write(info, to_file, pos);
+    if (rc)
+      return rc;
+  }
+  return 0;
+}
+
+
+/* Write 'count' fixed-length keys as one contiguous block to to_file */
+static int write_merge_key(MARIA_SORT_PARAM *info __attribute__((unused)),
+                           IO_CACHE *to_file, uchar *key,
+                           uint sort_length, uint count)
+{
+  return my_b_write(to_file, key, (size_t) sort_length*count);
+}
+
+/*
+  Merge buffers to one buffer
+  If to_file == 0 then use info->key_write
+
+  Performs a k-way merge of the runs Fb..Tb using a priority queue.
+  Each run gets an equal share of the key buffer; when one run is
+  exhausted its buffer space is donated to an adjacent run. The merged
+  result is described by *lastbuff (count and, if to_file, file_pos).
+  Returns 0 on success, non-zero on error.
+*/
+
+static int NEAR_F
+merge_buffers(MARIA_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
+              IO_CACHE *to_file, uchar **sort_keys, BUFFPEK *lastbuff,
+              BUFFPEK *Fb, BUFFPEK *Tb)
+{
+  int error;
+  uint sort_length,maxcount;
+  ha_rows count;
+  my_off_t to_start_filepos;
+  uchar *strpos;
+  BUFFPEK *buffpek,**refpek;
+  QUEUE queue;
+  DBUG_ENTER("merge_buffers");
+
+  count=error=0;
+  maxcount=keys/((uint) (Tb-Fb) +1);
+  DBUG_ASSERT(maxcount > 0);
+  LINT_INIT(to_start_filepos);
+  if (to_file)
+    to_start_filepos=my_b_tell(to_file);
+  strpos= (uchar*) sort_keys;
+  sort_length=info->key_length;
+
+  /* Queue orders runs by their current smallest key */
+  if (init_queue(&queue,(uint) (Tb-Fb)+1,offsetof(BUFFPEK,key),0,
+                 (int (*)(void*, uchar *,uchar*)) info->key_cmp,
+                 (void*) info, 0, 0))
+    DBUG_RETURN(1); /* purecov: inspected */
+
+  /* Prime each run with its first chunk of keys */
+  for (buffpek= Fb ; buffpek <= Tb ; buffpek++)
+  {
+    count+= buffpek->count;
+    buffpek->base= strpos;
+    buffpek->max_keys=maxcount;
+    strpos+= (uint) (error=(int) info->read_to_buffer(from_file,buffpek,
+                                                      sort_length));
+    if (error == -1)
+      goto err; /* purecov: inspected */
+    queue_insert(&queue,(uchar*) buffpek);
+  }
+
+  while (queue.elements > 1)
+  {
+    for (;;)
+    {
+      /* Emit the globally smallest key */
+      buffpek=(BUFFPEK*) queue_top(&queue);
+      if (to_file)
+      {
+        if (info->write_key(info,to_file, buffpek->key,
+                            (uint) sort_length,1))
+        {
+          error=1; goto err; /* purecov: inspected */
+        }
+      }
+      else
+      {
+        if ((*info->key_write)(info,(void*) buffpek->key))
+        {
+          error=1; goto err; /* purecov: inspected */
+        }
+      }
+      buffpek->key+=sort_length;
+      if (! --buffpek->mem_count)
+      {
+        /* It's enough to check for killedptr before a slow operation */
+        if (_ma_killed_ptr(info->sort_info->param))
+        {
+          error=1;
+          goto err;
+        }
+        if (!(error=(int) info->read_to_buffer(from_file,buffpek,sort_length)))
+        {
+          /* Run exhausted: remove it and donate its buffer space */
+          uchar *base= buffpek->base;
+          uint max_keys=buffpek->max_keys;
+
+          VOID(queue_remove_top(&queue));
+
+          /* Put room used by buffer to use in other buffer */
+          for (refpek= (BUFFPEK**) &queue_top(&queue);
+               refpek <= (BUFFPEK**) &queue_end(&queue);
+               refpek++)
+          {
+            buffpek= *refpek;
+            if (buffpek->base+buffpek->max_keys*sort_length == base)
+            {
+              buffpek->max_keys+=max_keys;
+              break;
+            }
+            else if (base+max_keys*sort_length == buffpek->base)
+            {
+              buffpek->base=base;
+              buffpek->max_keys+=max_keys;
+              break;
+            }
+          }
+          break; /* One buffer have been removed */
+        }
+      }
+      else if (error == -1)
+        goto err; /* purecov: inspected */
+      queue_replace_top(&queue); /* Top element has been replaced */
+    }
+  }
+  /* One run left: stream the rest of it through the whole buffer */
+  buffpek=(BUFFPEK*) queue_top(&queue);
+  buffpek->base= (uchar*) sort_keys;
+  buffpek->max_keys=keys;
+  do
+  {
+    if (to_file)
+    {
+      if (info->write_key(info, to_file, buffpek->key,
+                          sort_length,buffpek->mem_count))
+      {
+        error=1; goto err; /* purecov: inspected */
+      }
+    }
+    else
+    {
+      register uchar *end;
+      strpos= buffpek->key;
+      for (end= strpos+buffpek->mem_count*sort_length;
+           strpos != end ;
+           strpos+=sort_length)
+      {
+        if ((*info->key_write)(info, strpos))
+        {
+          error=1; goto err; /* purecov: inspected */
+        }
+      }
+    }
+  }
+  while ((error=(int) info->read_to_buffer(from_file,buffpek,sort_length)) !=
+         -1 && error != 0);
+
+  lastbuff->count=count;
+  if (to_file)
+    lastbuff->file_pos=to_start_filepos;
+err:
+  delete_queue(&queue);
+  DBUG_RETURN(error);
+} /* merge_buffers */
+
+
+ /* Final merge pass: write keys straight to the index (no output file) */
+
+static int NEAR_F
+merge_index(MARIA_SORT_PARAM *info, uint keys, uchar **sort_keys,
+            BUFFPEK *buffpek, int maxbuffer, IO_CACHE *tempfile)
+{
+  int res= 0;
+  DBUG_ENTER("merge_index");
+  /* to_file == 0 makes merge_buffers() call info->key_write per key */
+  if (merge_buffers(info, keys, tempfile, (IO_CACHE*) 0, sort_keys,
+                    buffpek, buffpek, buffpek + maxbuffer))
+    res= 1; /* purecov: inspected */
+  DBUG_RETURN(res);
+} /* merge_index */
+
+
+/* Flush and release the fulltext sort buffer, if one was allocated */
+static int flush_maria_ft_buf(MARIA_SORT_PARAM *info)
+{
+  int res= 0;
+  if (info->sort_info->ft_buf != NULL)
+  {
+    res= _ma_sort_ft_buf_flush(info);
+    my_free(info->sort_info->ft_buf, MYF(0));
+    info->sort_info->ft_buf= 0;
+  }
+  return res;
+}
diff --git a/storage/maria/ma_sp_defs.h b/storage/maria/ma_sp_defs.h
new file mode 100644
index 00000000000..398bf99c52e
--- /dev/null
+++ b/storage/maria/ma_sp_defs.h
@@ -0,0 +1,48 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _SP_DEFS_H
+#define _SP_DEFS_H
+
+/* Number of dimensions indexed by a spatial key (2D: x and y) */
+#define SPDIMS 2
+/* Key segment type used for each MBR ordinate */
+#define SPTYPE HA_KEYTYPE_DOUBLE
+/* Storage length in bytes of one ordinate (a double) */
+#define SPLEN 8
+
+#ifdef HAVE_SPATIAL
+
+/* Geometry type tags as used in the "well-known binary" (WKB) format */
+enum wkbType
+{
+  wkbPoint = 1,
+  wkbLineString = 2,
+  wkbPolygon = 3,
+  wkbMultiPoint = 4,
+  wkbMultiLineString = 5,
+  wkbMultiPolygon = 6,
+  wkbGeometryCollection = 7
+};
+
+enum wkbByteOrder
+{
+  wkbXDR = 0,   /* Big Endian */
+  wkbNDR = 1    /* Little Endian */
+};
+
+/*
+  Build a spatial (R-tree) key from the geometry column of 'record'.
+  Returns ret_key on success, 0 if the geometry column is a NULL blob.
+*/
+MARIA_KEY *_ma_sp_make_key(MARIA_HA *info, MARIA_KEY *ret_key, uint keynr,
+                           uchar *key, const uchar *record, my_off_t filepos,
+                           ulonglong trid);
+
+#endif /*HAVE_SPATIAL*/
+#endif /* _SP_DEFS_H */
diff --git a/storage/maria/ma_sp_key.c b/storage/maria/ma_sp_key.c
new file mode 100644
index 00000000000..22944a5db0a
--- /dev/null
+++ b/storage/maria/ma_sp_key.c
@@ -0,0 +1,305 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "ma_blockrec.h" /* For ROW_FLAG_TRANSID */
+#include "trnman.h"
+
+#ifdef HAVE_SPATIAL
+
+#include "ma_sp_defs.h"
+
+static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order, double *mbr);
+static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order, double *mbr);
+static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order, double *mbr);
+static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ uchar byte_order, double *mbr);
+static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+ double *mbr, int top);
+static int sp_mbr_from_wkb(uchar (*wkb), uint size, uint n_dims, double *mbr);
+
+
+/**
+  Create a spatial key from the geometry column of a record
+
+  @param info     Maria handler
+  @param ret_key  Key descriptor to fill in
+  @param keynr    Number of the spatial index
+  @param key      Buffer where the packed key value is stored
+  @param record   Row to build the key from
+  @param filepos  Row position to append to the key
+  @param trid     Transaction id to pack into the key (0 = none)
+
+  @return ret_key on success, 0 if the geometry column is NULL
+          (my_errno is then set to HA_ERR_NULL_IN_SPATIAL)
+*/
+
+MARIA_KEY *_ma_sp_make_key(MARIA_HA *info, MARIA_KEY *ret_key, uint keynr,
+                           uchar *key, const uchar *record, my_off_t filepos,
+                           ulonglong trid)
+{
+  HA_KEYSEG *keyseg;
+  MARIA_KEYDEF *keyinfo = &info->s->keyinfo[keynr];
+  uint len = 0;
+  const uchar *pos;
+  uint dlen;
+  uchar *dptr;
+  double mbr[SPDIMS * 2];
+  uint i;
+  DBUG_ENTER("_ma_sp_make_key");
+
+  /* seg[-1] describes the source geometry (blob) column for this key */
+  keyseg = &keyinfo->seg[-1];
+  pos = record + keyseg->start;
+  ret_key->data= key;
+
+  /* Fetch blob length and the pointer to the blob data */
+  dlen = _ma_calc_blob_length(keyseg->bit_start, pos);
+  memcpy_fixed(&dptr, pos + keyseg->bit_start, sizeof(char*));
+  if (!dptr)
+  {
+    my_errno= HA_ERR_NULL_IN_SPATIAL;
+    DBUG_RETURN(0);
+  }
+
+  /* Compute the MBR, skipping the 4-byte SRID prefix of the value */
+  sp_mbr_from_wkb(dptr + 4, dlen - 4, SPDIMS, mbr); /* SRID */
+
+  /* Store one 8-byte double per key segment: min/max of each dimension */
+  for (i = 0, keyseg = keyinfo->seg; keyseg->type; keyseg++, i++)
+  {
+    uint length = keyseg->length, start= keyseg->start;
+    double val;
+
+    DBUG_ASSERT(length == 8);
+    DBUG_ASSERT(!(start % 8));
+    DBUG_ASSERT(start < sizeof(mbr));
+    DBUG_ASSERT(keyseg->type == HA_KEYTYPE_DOUBLE);
+
+    val= mbr[start / sizeof (double)];
+#ifdef HAVE_ISNAN
+    /* NaN ordinates are stored as all-zero bytes */
+    if (isnan(val))
+    {
+      bzero(key, length);
+      key+= length;
+      len+= length;
+      continue;
+    }
+#endif
+
+    if (keyseg->flag & HA_SWAP_KEY)
+    {
+      mi_float8store(key, val);
+    }
+    else
+    {
+      float8store((uchar *)key, val);
+    }
+    key += length;
+    len+= length;
+  }
+  /* Append the row position after the MBR ordinates */
+  _ma_dpointer(info->s, key, filepos);
+  ret_key->keyinfo= keyinfo;
+  ret_key->data_length= len;
+  ret_key->ref_length= info->s->rec_reflength;
+  ret_key->flag= 0;
+  /* For versioned tables also pack the transaction id into the key */
+  if (_ma_have_versioning(info) && trid)
+  {
+    ret_key->ref_length+= transid_store_packed(info,
+                                               key + ret_key->ref_length,
+                                               trid);
+  }
+  DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, ret_key););
+  DBUG_RETURN(ret_key);
+}
+
+
+/*
+  Calculate the minimal bounding rectangle (mbr) of a spatial object
+  stored in "well-known binary" (wkb) format.
+*/
+
+static int sp_mbr_from_wkb(uchar *wkb, uint size, uint n_dims, double *mbr)
+{
+  uint d;
+
+  /* Start with an inverted rectangle so the first point shrink-wraps it */
+  for (d= 0; d < n_dims; d++)
+  {
+    mbr[2 * d]= DBL_MAX;        /* minimum ordinate of dimension d */
+    mbr[2 * d + 1]= -DBL_MAX;   /* maximum ordinate of dimension d */
+  }
+
+  return sp_get_geometry_mbr(&wkb, wkb + size, n_dims, mbr, 1);
+}
+
+/*
+  Read one point (n_dims doubles) from *wkb and widen the mbr to
+  include it.  Advances *wkb past the point.
+  Returns 0 on success, -1 if the buffer is truncated.
+*/
+
+static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                               uchar byte_order __attribute__((unused)),
+                               double *mbr)
+{
+  double *limit= mbr + n_dims * 2;
+
+  while (mbr < limit)
+  {
+    double coord;
+    if (end - *wkb < 8)
+      return -1;
+    float8get(coord, (const uchar*) *wkb);
+    (*wkb)+= 8;
+    if (coord < mbr[0])         /* mbr[0] holds the dimension minimum */
+      mbr[0]= coord;
+    if (coord > mbr[1])         /* mbr[1] holds the dimension maximum */
+      mbr[1]= coord;
+    mbr+= 2;
+  }
+  return 0;
+}
+
+
+/* A wkb POINT contributes exactly one coordinate tuple to the mbr */
+static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                            uchar byte_order, double *mbr)
+{
+  return sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr);
+}
+
+
+/*
+  Add all points of a wkb LINESTRING to the mbr.
+  Returns 0 on success, -1 on truncated input.
+*/
+static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                                 uchar byte_order, double *mbr)
+{
+  uint n_points;
+
+  /* The linestring body starts with a 4-byte point count */
+  if (end - *wkb < 4)
+    return -1;                                  /* Truncated wkb data */
+  n_points = uint4korr(*wkb);
+  (*wkb) += 4;
+  for (; n_points > 0; --n_points)
+  {
+    /* Add next point to mbr; fails on truncated input */
+    if (sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr))
+      return -1;
+  }
+  return 0;
+}
+
+
+/*
+  Add all points of all linear rings of a wkb POLYGON to the mbr.
+  Returns 0 on success, -1 on truncated input.
+*/
+static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                              uchar byte_order, double *mbr)
+{
+  uint n_linear_rings;
+  uint n_points;
+
+  /* Polygon body: 4-byte ring count, then per ring a 4-byte point count */
+  if (end - *wkb < 4)
+    return -1;                                  /* Truncated wkb data */
+  n_linear_rings = uint4korr((*wkb));
+  (*wkb) += 4;
+
+  for (; n_linear_rings > 0; --n_linear_rings)
+  {
+    if (end - *wkb < 4)
+      return -1;                                /* Truncated wkb data */
+    n_points = uint4korr((*wkb));
+    (*wkb) += 4;
+    for (; n_points > 0; --n_points)
+    {
+      /* Add next point to mbr */
+      if (sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr))
+        return -1;
+    }
+  }
+  return 0;
+}
+
+/*
+  Dispatch on the wkb geometry type and accumulate its mbr.
+  'top' must be non-zero only for the outermost geometry: nested
+  geometry collections are rejected.
+  Returns 0 on success, -1 on truncated or unknown wkb data.
+*/
+static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                               double *mbr, int top)
+{
+  int res;
+  uchar byte_order;
+  uint wkb_type;
+
+  /* Every geometry starts with a 1-byte order flag + 4-byte type tag */
+  if (end - *wkb < 5)
+    return -1;                                  /* Truncated wkb data */
+  byte_order = *(*wkb);
+  ++(*wkb);
+
+  wkb_type = uint4korr((*wkb));
+  (*wkb) += 4;
+
+  switch ((enum wkbType) wkb_type)
+  {
+  case wkbPoint:
+    res = sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr);
+    break;
+  case wkbLineString:
+    res = sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr);
+    break;
+  case wkbPolygon:
+    res = sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr);
+    break;
+  case wkbMultiPoint:
+  {
+    uint n_items;
+    if (end - *wkb < 4)
+      return -1;                                /* Truncated wkb data */
+    n_items = uint4korr((*wkb));
+    (*wkb) += 4;
+    for (; n_items > 0; --n_items)
+    {
+      /* Each item repeats its own byte-order flag and type tag */
+      if (end - *wkb < 5)
+        return -1;                              /* Truncated wkb data */
+      byte_order = *(*wkb);
+      ++(*wkb);
+      (*wkb) += 4;
+      if (sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr))
+        return -1;
+    }
+    res = 0;
+    break;
+  }
+  case wkbMultiLineString:
+  {
+    uint n_items;
+    if (end - *wkb < 4)
+      return -1;                                /* Truncated wkb data */
+    n_items = uint4korr((*wkb));
+    (*wkb) += 4;
+    for (; n_items > 0; --n_items)
+    {
+      if (end - *wkb < 5)
+        return -1;                              /* Truncated wkb data */
+      byte_order = *(*wkb);
+      ++(*wkb);
+      (*wkb) += 4;
+      if (sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr))
+        return -1;
+    }
+    res = 0;
+    break;
+  }
+  case wkbMultiPolygon:
+  {
+    uint n_items;
+    if (end - *wkb < 4)
+      return -1;                                /* Truncated wkb data */
+    n_items = uint4korr((*wkb));
+    (*wkb) += 4;
+    for (; n_items > 0; --n_items)
+    {
+      if (end - *wkb < 5)
+        return -1;                              /* Truncated wkb data */
+      byte_order = *(*wkb);
+      ++(*wkb);
+      (*wkb) += 4;
+      if (sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr))
+        return -1;
+    }
+    res = 0;
+    break;
+  }
+  case wkbGeometryCollection:
+  {
+    uint n_items;
+
+    if (!top)
+      return -1;                                /* No nested collections */
+
+    if (end - *wkb < 4)
+      return -1;                                /* Truncated wkb data */
+    n_items = uint4korr((*wkb));
+    (*wkb) += 4;
+    for (; n_items > 0; --n_items)
+    {
+      if (sp_get_geometry_mbr(wkb, end, n_dims, mbr, 0))
+        return -1;
+    }
+    res = 0;
+    break;
+  }
+  default:
+    res = -1;
+  }
+  return res;
+}
+
+#endif /*HAVE_SPATIAL*/
diff --git a/storage/maria/ma_sp_test.c b/storage/maria/ma_sp_test.c
new file mode 100644
index 00000000000..b8c00753acb
--- /dev/null
+++ b/storage/maria/ma_sp_test.c
@@ -0,0 +1,568 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Testing of the basic functions of a MARIA spatial table */
+/* Written by Alex Barkov, who has a shared copyright to this code */
+
+#include "maria.h"
+
+#ifdef HAVE_SPATIAL
+#include "ma_sp_defs.h"
+
+#define MAX_REC_LENGTH 1024
+#define KEYALG HA_KEY_ALG_RTREE
+
+static void create_linestring(uchar *record,uint rownr);
+static void print_record(uchar * record,my_off_t offs,const char * tail);
+
+static void create_key(uchar *key,uint rownr);
+static void print_key(const uchar *key,const char * tail);
+
+static int run_test(const char *filename);
+static int read_with_pos(MARIA_HA * file, int silent);
+
+static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points,
+ uchar *wkb);
+static void maria_rtree_PrintWKB(uchar *wkb, uint n_dims);
+
+static char blob_key[MAX_REC_LENGTH];
+
+
+/* Stand-alone smoke test for Maria spatial (R-tree) keys */
+int main(int argc __attribute__((unused)),char *argv[])
+{
+  MY_INIT(argv[0]);
+  maria_init();
+  exit(run_test("sp_test"));
+}
+
+
+/*
+  Create a spatial table, fill it with linestrings and exercise
+  write / scan / delete / update, key reads (maria_rkey, rnext_same,
+  rfirst, rnext) and maria_records_in_range().
+
+  Returns 0 on success, 1 on any failure (the error is printed).
+*/
+int run_test(const char *filename)
+{
+  MARIA_HA *file;
+  MARIA_UNIQUEDEF uniquedef;
+  MARIA_CREATE_INFO create_info;
+  MARIA_COLUMNDEF recinfo[20];
+  MARIA_KEYDEF keyinfo[20];
+  HA_KEYSEG keyseg[20];
+  key_range min_range, max_range;
+  int silent=0;
+  int create_flag=0;
+  int null_fields=0;
+  int nrecords=30;
+  int uniques=0;
+  int i;
+  int error;
+  int row_count=0;
+  uchar record[MAX_REC_LENGTH];
+  uchar key[MAX_REC_LENGTH];
+  uchar read_record[MAX_REC_LENGTH];
+  int upd=10;
+  ha_rows hrows;
+
+  /* Define a column for NULLs and DEL markers*/
+
+  recinfo[0].type=FIELD_NORMAL;
+  recinfo[0].length=1; /* For NULL bits */
+
+
+  /* Define spatial column */
+
+  recinfo[1].type=FIELD_BLOB;
+  recinfo[1].length=4 + portable_sizeof_char_ptr;
+
+
+
+  /* Define a key with 1 spatial segment */
+
+  keyinfo[0].seg=keyseg;
+  keyinfo[0].keysegs=1;
+  keyinfo[0].flag=HA_SPATIAL;
+  keyinfo[0].key_alg=KEYALG;
+
+  keyinfo[0].seg[0].type= HA_KEYTYPE_BINARY;
+  keyinfo[0].seg[0].flag=0;
+  keyinfo[0].seg[0].start= 1;
+  keyinfo[0].seg[0].length=1; /* Spatial ignores it anyway */
+  keyinfo[0].seg[0].null_bit= null_fields ? 2 : 0;
+  keyinfo[0].seg[0].null_pos=0;
+  keyinfo[0].seg[0].language=default_charset_info->number;
+  keyinfo[0].seg[0].bit_start=4; /* Long BLOB */
+
+
+  if (!silent)
+    printf("- Creating isam-file\n");
+
+  bzero((char*) &create_info,sizeof(create_info));
+  create_info.max_rows=10000000;
+
+  if (maria_create(filename,
+                   DYNAMIC_RECORD,
+                   1, /* keys */
+                   keyinfo,
+                   2, /* columns */
+                   recinfo,uniques,&uniquedef,&create_info,create_flag))
+    goto err;
+
+  if (!silent)
+    printf("- Open isam-file\n");
+
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+    goto err;
+
+  if (!silent)
+    printf("- Writing key:s\n");
+
+  for (i=0; i<nrecords; i++ )
+  {
+    create_linestring(record,i);
+    error=maria_write(file,record);
+    print_record(record,maria_position(file),"\n");
+    if (!error)
+    {
+      row_count++;
+    }
+    else
+    {
+      printf("maria_write: %d\n", error);
+      goto err;
+    }
+  }
+
+  if ((error=read_with_pos(file,silent)))
+    goto err;
+
+  if (!silent)
+    printf("- Deleting rows with position\n");
+  for (i=0; i < nrecords/4; i++)
+  {
+    my_errno=0;
+    bzero((char*) read_record,MAX_REC_LENGTH);
+    error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR);
+    if (error)
+    {
+      printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file),"\n");
+    error=maria_delete(file,read_record);
+    if (error)
+    {
+      printf("pos: %2d maria_delete: %3d errno: %3d\n",i,error,my_errno);
+      goto err;
+    }
+  }
+
+  if (!silent)
+    printf("- Updating rows with position\n");
+  for (i=0; i < nrecords/2 ; i++)
+  {
+    my_errno=0;
+    bzero((char*) read_record,MAX_REC_LENGTH);
+    error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR);
+    if (error)
+    {
+      if (error==HA_ERR_RECORD_DELETED)
+        continue;
+      printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file),"");
+    create_linestring(record,i+nrecords*upd);
+    printf("\t-> ");
+    print_record(record,maria_position(file),"\n");
+    error=maria_update(file,read_record,record);
+    if (error)
+    {
+      printf("pos: %2d maria_update: %3d errno: %3d\n",i,error,my_errno);
+      goto err;
+    }
+  }
+
+  if ((error=read_with_pos(file,silent)))
+    goto err;
+
+  if (!silent)
+    printf("- Test maria_rkey then a sequence of maria_rnext_same\n");
+
+  create_key(key, nrecords*4/5);
+  print_key(key," search for INTERSECT\n");
+
+  if ((error=maria_rkey(file,read_record,0,key,0,HA_READ_MBR_INTERSECT)))
+  {
+    printf("maria_rkey: %3d errno: %3d\n",error,my_errno);
+    goto err;
+  }
+  print_record(read_record,maria_position(file)," maria_rkey\n");
+  row_count=1;
+
+  for (;;)
+  {
+    if ((error=maria_rnext_same(file,read_record)))
+    {
+      if (error==HA_ERR_END_OF_FILE)
+        break;
+      printf("maria_next: %3d errno: %3d\n",error,my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file)," maria_rnext_same\n");
+    row_count++;
+  }
+  printf(" %d rows\n",row_count);
+
+  if (!silent)
+    printf("- Test maria_rfirst then a sequence of maria_rnext\n");
+
+  error=maria_rfirst(file,read_record,0);
+  if (error)
+  {
+    printf("maria_rfirst: %3d errno: %3d\n",error,my_errno);
+    goto err;
+  }
+  row_count=1;
+  /* Fixed message typo: was "maria_frirst" */
+  print_record(read_record,maria_position(file)," maria_rfirst\n");
+
+  for(i=0;i<nrecords;i++) {
+    if ((error=maria_rnext(file,read_record,0)))
+    {
+      if (error==HA_ERR_END_OF_FILE)
+        break;
+      printf("maria_next: %3d errno: %3d\n",error,my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file)," maria_rnext\n");
+    row_count++;
+  }
+  printf(" %d rows\n",row_count);
+
+  if (!silent)
+    printf("- Test maria_records_in_range()\n");
+
+  create_key(key, nrecords*upd);
+  print_key(key," INTERSECT\n");
+  min_range.key= key;
+  min_range.length= 1000; /* Big enough */
+  min_range.flag= HA_READ_MBR_INTERSECT;
+  /* NOTE(review): max_range is filled with leftover record bytes; MBR
+     INTERSECT searches appear to use only min_range -- confirm */
+  max_range.key= record+1;
+  max_range.length= 1000; /* Big enough */
+  max_range.flag= HA_READ_KEY_EXACT;
+  hrows= maria_records_in_range(file,0, &min_range, &max_range);
+  printf(" %ld rows\n", (long) hrows);
+
+  if (maria_close(file)) goto err;
+  maria_end();
+  my_end(MY_CHECK_ERROR);
+
+  return 0;
+
+err:
+  printf("got error: %3d when using maria-database\n",my_errno);
+  maria_end();
+  return 1; /* skip warning */
+}
+
+
+/* Scan the whole table with maria_rrnd() and print every live row */
+static int read_with_pos(MARIA_HA *file, int silent)
+{
+  int error;
+  int row;
+  int found= 0;
+  uchar buff[MAX_REC_LENGTH];
+
+  if (!silent)
+    printf("- Reading rows with position\n");
+  for (row= 0;; row++)
+  {
+    my_errno= 0;
+    bzero((char*) buff, MAX_REC_LENGTH);
+    error= maria_rrnd(file, buff, row == 0 ? 0L : HA_OFFSET_ERROR);
+    if (!error)
+    {
+      found++;
+      print_record(buff, maria_position(file), "\n");
+      continue;
+    }
+    if (error==HA_ERR_END_OF_FILE)
+      break;
+    if (error==HA_ERR_RECORD_DELETED)
+      continue;                 /* Skip holes left by maria_delete() */
+    printf("pos: %2d maria_rrnd: %3d errno: %3d\n", row, error, my_errno);
+    return error;
+  }
+  printf(" %d rows\n", found);
+  return 0;
+}
+
+
+#ifdef NOT_USED
+/* Hex-dump the DEL marker and first 32 bytes of a record (debug aid) */
+static void bprint_record(uchar * record,
+                          my_off_t offs __attribute__((unused)),
+                          const char * tail)
+{
+  int i;
+  uchar * pos;                  /* was char*: mismatched with 'record' */
+  i=(unsigned char)record[0];
+  printf("%02X ",i);
+
+  for( pos=record+1, i=0; i<32; i++,pos++)
+  {
+    int b=(unsigned char)*pos;
+    printf("%02X",b);
+  }
+  printf("%s",tail);
+}
+#endif
+
+
+/*
+  Print one record: [0] DEL marker byte, [1..4] blob length,
+  then the geometry pointed to by the stored blob pointer.
+*/
+static void print_record(uchar * record, my_off_t offs,const char * tail)
+{
+  uchar *pos;
+  char *ptr;
+  uint len;
+
+  printf(" rec=(%d)",(unsigned char)record[0]);
+  pos=record+1;
+  len=sint4korr(pos);
+  pos+=4;
+  printf(" len=%d ",len);
+  /* The blob column stores a raw pointer to the wkb data */
+  memcpy_fixed(&ptr,pos,sizeof(char*));
+  if (ptr)
+    maria_rtree_PrintWKB((uchar*) ptr,SPDIMS);
+  else
+    printf("<NULL> ");
+  printf(" offs=%ld ",(long int)offs);
+  printf("%s",tail);
+}
+
+
+#ifdef NOT_USED
+/* Fill 'record' with a wkb POINT whose every ordinate equals rownr */
+static void create_point(uchar *record,uint rownr)
+{
+  uint tmp;
+  char *ptr;
+  uchar *pos=record;            /* was char*: mismatched with 'record' */
+  double x[200];
+  int i;
+
+  for(i=0;i<SPDIMS;i++)
+    x[i]=rownr;
+
+  bzero((char*) record,MAX_REC_LENGTH);
+  *pos=0x01; /* DEL marker */
+  pos++;
+
+  memset(blob_key,0,sizeof(blob_key));
+  /* Cast added for consistency with create_linestring() */
+  tmp=maria_rtree_CreatePointWKB(x,SPDIMS, (uchar*) blob_key);
+
+  int4store(pos,tmp);
+  pos+=4;
+
+  ptr=blob_key;
+  memcpy_fixed(pos,&ptr,sizeof(char*));
+}
+#endif
+
+
+/*
+  Fill 'record' with a 2-point wkb LINESTRING from (0,0,..) to
+  (rownr,rownr,..).  Layout: DEL marker, blob length, blob pointer
+  (the pointer targets the static blob_key buffer).
+*/
+static void create_linestring(uchar *record,uint rownr)
+{
+  uint tmp;
+  char *ptr;
+  uchar *pos= record;
+  double x[200];
+  int i,j;
+  int npoints=2;
+
+  /* Point j gets ordinate rownr*j in every dimension */
+  for(j=0;j<npoints;j++)
+    for(i=0;i<SPDIMS;i++)
+      x[i+j*SPDIMS]=rownr*j;
+
+  bzero((char*) record,MAX_REC_LENGTH);
+  *pos=0x01; /* DEL marker */
+  pos++;
+
+  memset(blob_key,0,sizeof(blob_key));
+  tmp=maria_rtree_CreateLineStringWKB(x,SPDIMS,npoints, (uchar*) blob_key);
+
+  int4store(pos,tmp);
+  pos+=4;
+
+  ptr=blob_key;
+  memcpy_fixed(pos,&ptr,sizeof(char*));
+}
+
+
+/* Build a search key: the value 'rownr' in all 2*SPDIMS ordinates */
+static void create_key(uchar *key,uint rownr)
+{
+  double val= rownr;
+  uchar *out= key;
+  uint n;
+
+  bzero(key,MAX_REC_LENGTH);
+  for (n= 0; n < 2*SPDIMS; n++)
+  {
+    float8store(out, val);
+    out+= sizeof(val);
+  }
+}
+
+/* Print the 2*SPDIMS double ordinates of a search key */
+static void print_key(const uchar *key,const char * tail)
+{
+  uint n;
+
+  printf(" key=");
+  for (n= 0; n < 2*SPDIMS; n++)
+  {
+    double val;
+    float8get(val, key);
+    key+= sizeof(val);
+    printf("%.14g ",val);
+  }
+  printf("%s",tail);
+}
+
+
+#ifdef NOT_USED
+
+/*
+  Serialize a point into wkb: 1 byte-order flag, 4-byte type tag,
+  then n_dims doubles.  Returns the number of bytes written.
+
+  NOTE(review): the flag claims wkbXDR (big endian) but int4store /
+  float8store do not write big-endian on common platforms; the readers
+  in this file ignore the flag, so it is harmless here -- confirm
+  before reusing this encoder elsewhere.
+*/
+static int maria_rtree_CreatePointWKB(double *ords, uint n_dims, uchar *wkb)
+{
+  uint i;
+
+  *wkb = wkbXDR;
+  ++wkb;
+  int4store(wkb, wkbPoint);
+  wkb += 4;
+
+  for (i=0; i < n_dims; ++i)
+  {
+    float8store(wkb, ords[i]);
+    wkb += 8;
+  }
+  return 5 + n_dims * 8;
+}
+#endif
+
+
+/*
+  Serialize a linestring into wkb: 1 byte-order flag, 4-byte type tag,
+  4-byte point count, then n_points * n_dims doubles.
+  Returns the number of bytes written (9 + n_points * n_dims * 8).
+
+  NOTE(review): the flag claims wkbXDR (big endian) but int4store /
+  float8store do not write big-endian on common platforms; the readers
+  in this file ignore the flag -- confirm before reusing elsewhere.
+*/
+static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points,
+                                           uchar *wkb)
+{
+  uint i;
+  uint n_ords = n_dims * n_points;
+
+  *wkb = wkbXDR;
+  ++wkb;
+  int4store(wkb, wkbLineString);
+  wkb += 4;
+  int4store(wkb, n_points);
+  wkb += 4;
+  for (i=0; i < n_ords; ++i)
+  {
+    float8store(wkb, ords[i]);
+    wkb += 8;
+  }
+  return 9 + n_points * n_dims * 8;
+}
+
+
+/*
+  Print a wkb geometry in WKT-like form.  Only POINT and LINESTRING
+  are expanded; other types print a placeholder.
+  The byte-order flag is skipped and ignored.
+*/
+static void maria_rtree_PrintWKB(uchar *wkb, uint n_dims)
+{
+  uint wkb_type;
+
+  ++wkb;                        /* Skip the byte-order flag */
+  wkb_type = uint4korr(wkb);
+  wkb += 4;
+
+  switch ((enum wkbType)wkb_type)
+  {
+  case wkbPoint:
+  {
+    uint i;
+    double ord;
+
+    printf("POINT(");
+    for (i=0; i < n_dims; ++i)
+    {
+      float8get(ord, wkb);
+      wkb += 8;
+      printf("%.14g", ord);
+      if (i < n_dims - 1)
+        printf(" ");
+      else
+        printf(")");
+    }
+    break;
+  }
+  case wkbLineString:
+  {
+    uint p, i;
+    uint n_points;
+    double ord;
+
+    printf("LineString(");
+    n_points = uint4korr(wkb);
+    wkb += 4;
+    for (p=0; p < n_points; ++p)
+    {
+      for (i=0; i < n_dims; ++i)
+      {
+        float8get(ord, wkb);
+        wkb += 8;
+        printf("%.14g", ord);
+        if (i < n_dims - 1)
+          printf(" ");
+      }
+      if (p < n_points - 1)
+        printf(", ");
+      else
+        printf(")");
+    }
+    break;
+  }
+  case wkbPolygon:
+  {
+    printf("POLYGON(...)");
+    break;
+  }
+  case wkbMultiPoint:
+  {
+    printf("MULTIPOINT(...)");
+    break;
+  }
+  case wkbMultiLineString:
+  {
+    printf("MULTILINESTRING(...)");
+    break;
+  }
+  case wkbMultiPolygon:
+  {
+    printf("MULTIPOLYGON(...)");
+    break;
+  }
+  case wkbGeometryCollection:
+  {
+    printf("GEOMETRYCOLLECTION(...)");
+    break;
+  }
+  default:
+  {
+    printf("UNKNOWN GEOMETRY TYPE");
+    break;
+  }
+  }
+}
+
+#else
+/* Spatial support not compiled in: nothing to test */
+int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused)))
+{
+  exit(0);
+}
+#endif /*HAVE_SPATIAL*/
diff --git a/storage/maria/ma_state.c b/storage/maria/ma_state.c
new file mode 100644
index 00000000000..ca94d58264b
--- /dev/null
+++ b/storage/maria/ma_state.c
@@ -0,0 +1,795 @@
+/* Copyright (C) 2008 Sun AB and Michael Widenius
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Functions to maintain live statistics for Maria transactional tables
+ and versioning for not transactional tables
+
+ See WL#3138; Maria - fast "SELECT COUNT(*) FROM t;" and "CHECKSUM TABLE t"
+ for details about live number of rows and live checksums
+
+ TODO
+ - Allocate MA_USED_TABLES and MA_HISTORY_STATE from a global pool (to
+ avoid calls to malloc()
+ - In trnamn_end_trans_hook(), don't call _ma_remove_not_visible_states()
+ every time. One could for example call it if there has been more than
+ 10 ended transactions since last time it was called.
+*/
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_blockrec.h"
+
+/**
+  @brief Setup initial start-of-transaction state for a table
+
+  @fn _ma_setup_live_state
+  @param info Maria handler
+
+  @notes
+    This function ensures that trn->used_tables contains a list of
+    start and live states for tables that are part of the transaction
+    and that info->state points to the current live state for the table.
+
+  @TODO
+    Change trn->table_list to a hash and share->state_history to a binary tree
+
+  @return
+  @retval 0  ok
+  @retval 1  error (out of memory)
+*/
+
+my_bool _ma_setup_live_state(MARIA_HA *info)
+{
+  TRN *trn;
+  MARIA_SHARE *share= info->s;
+  MARIA_USED_TABLES *tables;
+  MARIA_STATE_HISTORY *history;
+  DBUG_ENTER("_ma_setup_live_state");
+
+  if (maria_create_trn_hook(info))
+    DBUG_RETURN(1);
+
+  trn= info->trn;
+  /* Reuse the entry if this transaction already uses the table */
+  for (tables= (MARIA_USED_TABLES*) info->trn->used_tables;
+       tables;
+       tables= tables->next)
+  {
+    if (tables->share == share)
+    {
+      /* Table is already used by transaction */
+      goto end;
+    }
+  }
+
+  /* Table was not used before, create new table state entry */
+  if (!(tables= (MARIA_USED_TABLES*) my_malloc(sizeof(*tables),
+                                               MYF(MY_WME | MY_ZEROFILL))))
+    DBUG_RETURN(1);
+  /* Link the new entry first in the transaction's used-table list */
+  tables->next= trn->used_tables;
+  trn->used_tables= tables;
+  tables->share= share;
+
+  pthread_mutex_lock(&share->intern_lock);
+  share->in_trans++;
+  DBUG_PRINT("info", ("share: 0x%lx in_trans: %d",
+                      (ulong) share, share->in_trans));
+
+  history= share->state_history;
+
+  /*
+    We must keep share locked to ensure that we don't access a history
+    link that is deleted by concurrently running checkpoint.
+
+    It's enough to compare trids here (instead of calling
+    tranman_can_read_from) as history->trid is a commit_trid
+  */
+  /* Skip states committed at or after our transaction started */
+  while (trn->trid <= history->trid)
+    history= history->next;
+  pthread_mutex_unlock(&share->intern_lock);
+  /* The current item can't be deleted as it's the first one visible for us */
+  tables->state_start=  tables->state_current= history->state;
+  tables->state_current.changed= tables->state_current.no_transid= 0;
+
+  DBUG_PRINT("info", ("records: %ld", (ulong) tables->state_start.records));
+
+end:
+  info->state_start= &tables->state_start;
+  info->state= &tables->state_current;
+
+  /*
+    Mark in transaction state if we are not using transid (versioning)
+    on rows. If not, then we will in _ma_trnman_end_trans_hook()
+    ensure that the state is visible for all at end of transaction
+  */
+  tables->state_current.no_transid|= !(info->row_flag & ROW_FLAG_TRANSID);
+
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Remove states that are not visible by anyone
+
+  @fn   _ma_remove_not_visible_states()
+  @param org_history    List to history
+  @param all            1 if we should delete the first state if it's
+                        visible for all.  For the moment this is only used
+                        on close() of table.
+  @param trnman_is_locked  Set to 1 if we have already a lock on trnman.
+
+  @notes
+    The assumption is that items in the history list is ordered by
+    commit_trid.
+
+    A state is not visible anymore if there is no new transaction
+    that has been started between the commit_trid's of two states
+
+    As long as some states exists, we keep the newest = (last commit)
+    state as first state in the history.  This is to allow us to just move
+    the history from the global list to the share when we open the table.
+
+    Note that if 'all' is set trnman_is_locked must be 0, becasue
+    trnman_get_min_trid() will take a lock on trnman.
+
+  @return
+  @retval Pointer to new history list
+*/
+
+MARIA_STATE_HISTORY
+*_ma_remove_not_visible_states(MARIA_STATE_HISTORY *org_history,
+                               my_bool all,
+                               my_bool trnman_is_locked)
+{
+  TrID last_trid;
+  MARIA_STATE_HISTORY *history, **parent, *next;
+  DBUG_ENTER("_ma_remove_not_visible_states");
+
+  if (!org_history)
+    DBUG_RETURN(0);                             /* Not versioned table */
+
+  /* The head (newest commit) is always kept; start pruning after it */
+  last_trid= org_history->trid;
+  parent= &org_history->next;
+  for (history= org_history->next; history; history= next)
+  {
+    next= history->next;
+    if (!trnman_exists_active_transactions(history->trid, last_trid,
+                                           trnman_is_locked))
+    {
+      /* No transaction can see this state: unlink and free it */
+      DBUG_PRINT("info", ("removing history->trid: %lu  next: %lu",
+                          (ulong) history->trid, (ulong) last_trid));
+      my_free(history, MYF(0));
+      continue;
+    }
+    /* Keep this state: re-link it after the last kept one */
+    *parent= history;
+    parent= &history->next;
+    last_trid= history->trid;
+  }
+  *parent= 0;
+
+  if (all && parent == &org_history->next)
+  {
+    /* There is only one state left. Delete this if it's visible for all */
+    if (last_trid < trnman_get_min_trid())
+    {
+      my_free(org_history, MYF(0));
+      org_history= 0;
+    }
+  }
+  DBUG_RETURN(org_history);
+}
+
+
+/**
+  @brief Remove not used state history
+
+  @param share          Maria table information
+  @param all            1 if we should delete the first state if it's
+                        visible for all.  For the moment this is only used
+                        on close() of table.
+
+  @notes
+    share and trnman are not locked.
+
+    We must first lock trnman and then share->intern_lock. This is becasue
+    _ma_trnman_end_trans_hook() has a lock on trnman and then
+    takes share->intern_lock.
+*/
+
+void _ma_remove_not_visible_states_with_lock(MARIA_SHARE *share,
+                                             my_bool all)
+{
+  my_bool is_lock_trman;
+  /* trnman may already be shut down (e.g. during maria_end) */
+  if ((is_lock_trman= trman_is_inited()))
+    trnman_lock();
+
+  pthread_mutex_lock(&share->intern_lock);
+  share->state_history=  _ma_remove_not_visible_states(share->state_history,
+                                                       all, 1);
+  pthread_mutex_unlock(&share->intern_lock);
+  if (is_lock_trman)
+    trnman_unlock();
+}
+
+
+/*
+  Free state history information from share->history and reset information
+  to current state.
+
+  @notes
+    Used after repair as then all rows are visible for everyone
+*/
+
+void _ma_reset_state(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_STATE_HISTORY *history= share->state_history;
+
+  if (history)
+  {
+    MARIA_STATE_HISTORY *next;
+
+    /* Set the current history to current state */
+    share->state_history->state= share->state.state;
+    /* Set current table handler to point to new history state */
+    info->state= info->state_start= &share->state_history->state;
+    /* Free all older history entries; only the head remains */
+    for (history= history->next ; history ; history= next)
+    {
+      next= history->next;
+      my_free(history, MYF(0));
+    }
+    share->state_history->next= 0;
+    share->state_history->trid= 0;              /* Visible for all */
+  }
+}
+
+
+/****************************************************************************
+ The following functions are called by thr_lock() in threaded applications
+ for not transactional tables
+****************************************************************************/
+
+/*
+  Create a copy of the current status for the table
+
+  SYNOPSIS
+    _ma_get_status()
+    param                Pointer to Maria handler
+    concurrent_insert    Set to 1 if we are going to do concurrent inserts
+                         (THR_WRITE_CONCURRENT_INSERT was used)
+*/
+
+void _ma_get_status(void* param, my_bool concurrent_insert)
+{
+  MARIA_HA *info=(MARIA_HA*) param;
+  DBUG_ENTER("_ma_get_status");
+  DBUG_PRINT("info",("key_file: %ld  data_file: %ld  concurrent_insert: %d",
+                     (long) info->s->state.state.key_file_length,
+                     (long) info->s->state.state.data_file_length,
+                     concurrent_insert));
+#ifndef DBUG_OFF
+  if (info->state->key_file_length > info->s->state.state.key_file_length ||
+      info->state->data_file_length > info->s->state.state.data_file_length)
+    DBUG_PRINT("warning",("old info:  key_file: %ld  data_file: %ld",
+                          (long) info->state->key_file_length,
+                          (long) info->state->data_file_length));
+#endif
+  /* Work on a private snapshot of the shared state until lock release */
+  info->state_save= info->s->state.state;
+  info->state= &info->state_save;
+  info->state->changed= 0;
+  info->append_insert_at_end= concurrent_insert;
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  thr_lock callback: copy our private state snapshot back into the
+  shared table state when the lock is released.
+*/
+void _ma_update_status(void* param)
+{
+  MARIA_HA *info=(MARIA_HA*) param;
+  /*
+    Because someone may have closed the table we point at, we only
+    update the state if its our own state.  This isn't a problem as
+    we are always pointing at our own lock or at a read lock.
+    (This is enforced by thr_multi_lock.c)
+  */
+  if (info->state == &info->state_save)
+  {
+    MARIA_SHARE *share= info->s;
+#ifndef DBUG_OFF
+    DBUG_PRINT("info",("updating status:  key_file: %ld  data_file: %ld",
+                       (long) info->state->key_file_length,
+                       (long) info->state->data_file_length));
+    if (info->state->key_file_length < share->state.state.key_file_length ||
+        info->state->data_file_length < share->state.state.data_file_length)
+      DBUG_PRINT("warning",("old info:  key_file: %ld  data_file: %ld",
+                            (long) share->state.state.key_file_length,
+                            (long) share->state.state.data_file_length));
+#endif
+    /*
+      we are going to modify the state without lock's log, this would break
+      recovery if done with a transactional table.
+    */
+    DBUG_ASSERT(!info->s->base.born_transactional);
+    share->state.state= *info->state;
+    info->state= &share->state.state;
+#ifdef HAVE_QUERY_CACHE
+    /* Tell the query cache that the table content has changed */
+    DBUG_PRINT("info", ("invalidator... '%s' (status update)",
+                        info->s->data_file_name.str));
+    DBUG_ASSERT(info->s->chst_invalidator != NULL);
+    (*info->s->chst_invalidator)((const char *)info->s->data_file_name.str);
+#endif
+
+  }
+  info->append_insert_at_end= 0;
+}
+
+
+/*
+  Same as ma_update_status() but take a lock in the table lock, to protect
+  against someone calling ma_get_status() from thr_lock() at the same time.
+*/
+
+void _ma_update_status_with_lock(MARIA_HA *info)
+{
+  my_bool locked= 0;
+  /* Only lock if we really have a private state to publish */
+  if (info->state == &info->state_save)
+  {
+    locked= 1;
+    pthread_mutex_lock(&info->s->lock.mutex);
+  }
+  (*info->s->lock.update_status)(info);
+  if (locked)
+    pthread_mutex_unlock(&info->s->lock.mutex);
+}
+
+
+/* thr_lock callback: drop the private snapshot, use the shared state */
+void _ma_restore_status(void *param)
+{
+  MARIA_HA *handler= (MARIA_HA*) param;
+  handler->state= &handler->s->state.state;
+  handler->append_insert_at_end= 0;
+}
+
+
+/* thr_lock callback: make 'to' share the state snapshot of 'from' */
+void _ma_copy_status(void* to, void *from)
+{
+  MARIA_HA *dst= (MARIA_HA*) to;
+  MARIA_HA *src= (MARIA_HA*) from;
+  dst->state= &src->state_save;
+}
+
+
+/* thr_lock callback: clear the 'changed' flag of the active state */
+void _ma_reset_update_flag(void *param,
+                           my_bool concurrent_insert __attribute__((unused)))
+{
+  MARIA_HA *info=(MARIA_HA*) param;
+  info->state->changed= 0;
+}
+
+
+/**
+ @brief Check if should allow concurrent inserts
+
+ @implementation
+ Allow concurrent inserts if we don't have a hole in the table or
+ if there is no active write lock and there is active read locks and
+ maria_concurrent_insert == 2. In this last case the new
+ row('s) are inserted at end of file instead of filling up the hole.
+
+ The last case is to allow one to insert into a heavily read-used table
+ even if there are holes.
+
+ @notes
+ If there are rtree indexes in the table, concurrent inserts are
+ disabled in maria_open()
+
+ @return
+ @retval 0 ok to use concurrent inserts
+ @retval 1 not ok
+*/
+
+my_bool _ma_check_status(void *param)
+{
+ MARIA_HA *info=(MARIA_HA*) param;
+ /*
+ The test for w_locks == 1 is here because this thread has already done an
+ external lock (in other words: w_locks == 1 means no other thread has
+ a write lock)
+ */
+ DBUG_PRINT("info",("dellink: %ld r_locks: %u w_locks: %u",
+ (long) info->s->state.dellink, (uint) info->s->r_locks,
+ (uint) info->s->w_locks));
+ return (my_bool) !(info->s->state.dellink == HA_OFFSET_ERROR ||
+ (maria_concurrent_insert == 2 && info->s->r_locks &&
+ info->s->w_locks == 1));
+}
+
+
+/**
+ @brief write hook at end of trans to store status for all used table
+
+ @Notes
+ This function must be called under trnman_lock in trnman_end_trn()
+ because of the following reasons:
+ - After trnman_end_trn() is called, the current transaction will be
+ regarded as committed and all used tables state_history will be
+ visible to other transactions. To do this, we loop over all used
+ tables and create/update a history entries that contains the correct
+ state_history for them.
+*/
+
+my_bool _ma_trnman_end_trans_hook(TRN *trn, my_bool commit,
+ my_bool active_transactions)
+{
+ my_bool error= 0;
+ MARIA_USED_TABLES *tables, *next;
+ DBUG_ENTER("_ma_trnman_end_trans_hook");
+
+ for (tables= (MARIA_USED_TABLES*) trn->used_tables;
+ tables;
+ tables= next)
+ {
+ MARIA_SHARE *share= tables->share;
+ next= tables->next;
+ if (commit)
+ {
+ MARIA_STATE_HISTORY *history;
+
+ pthread_mutex_lock(&share->intern_lock);
+
+ /* We only have to update history state if something changed */
+ if (tables->state_current.changed)
+ {
+ if (tables->state_current.no_transid)
+ {
+ /*
+ The change was done without using transid on rows (like in
+ bulk insert). In this case this thread is the only one
+ that is using the table and all rows will be visible
+ for all transactions.
+ */
+ _ma_reset_history(share);
+ }
+ else
+ {
+ if (active_transactions && share->now_transactional &&
+ trnman_exists_active_transactions(share->state_history->trid,
+ trn->commit_trid, 1))
+ {
+ /*
+ There exist transactions that are still using the current
+ share->state_history. Create a new history item for this
+ commit and add it first in the state_history list. This
+ ensures that all history items are stored in the list in
+ decreasing trid order.
+ */
+ if (!(history= my_malloc(sizeof(*history), MYF(MY_WME))))
+ {
+ /* purecov: begin inspected */
+ error= 1;
+ pthread_mutex_unlock(&share->intern_lock);
+ my_free(tables, MYF(0));
+ continue;
+ /* purecov: end */
+ }
+ history->state= share->state_history->state;
+ history->next= share->state_history;
+ share->state_history= history;
+ }
+ else
+ {
+ /* Previous history can't be seen by anyone, reuse old memory */
+ history= share->state_history;
+ DBUG_PRINT("info", ("removing history->trid: %lu new: %lu",
+ (ulong) history->trid,
+ (ulong) trn->commit_trid));
+ }
+
+ history->state.records+= (tables->state_current.records -
+ tables->state_start.records);
+ history->state.checksum+= (tables->state_current.checksum -
+ tables->state_start.checksum);
+ history->trid= trn->commit_trid;
+
+ share->state.last_change_trn= trn->commit_trid;
+
+ if (history->next)
+ {
+ /* Remove not visible states */
+ share->state_history= _ma_remove_not_visible_states(history, 0, 1);
+ }
+ DBUG_PRINT("info", ("share: 0x%lx in_trans: %d",
+ (ulong) share, share->in_trans));
+ }
+ }
+ share->in_trans--;
+ pthread_mutex_unlock(&share->intern_lock);
+ }
+ else
+ {
+#ifndef DBUG_OFF
+ /*
+ We need to keep share->in_trans correct in the debug library
+ because of the assert in maria_close()
+ */
+ pthread_mutex_lock(&share->intern_lock);
+ share->in_trans--;
+ pthread_mutex_unlock(&share->intern_lock);
+#endif
+ }
+ my_free(tables, MYF(0));
+ }
+ trn->used_tables= 0;
+ DBUG_RETURN(error);
+}
+
+
+/**
+ Remove table from trnman_list
+
+ @notes
+ This is used when we unlock a table from a group of locked tables
+ just before doing a rename or drop table.
+
+ share->internal_lock must be locked when function is called
+*/
+
+void _ma_remove_table_from_trnman(MARIA_SHARE *share, TRN *trn)
+{
+ MARIA_USED_TABLES *tables, **prev;
+ DBUG_ENTER("_ma_remove_table_from_trnman");
+ DBUG_PRINT("enter", ("share: 0x%lx in_trans: %d",
+ (ulong) share, share->in_trans));
+
+ safe_mutex_assert_owner(&share->intern_lock);
+
+ for (prev= (MARIA_USED_TABLES**) (char*) &trn->used_tables, tables= *prev;
+ tables;
+ tables= *prev)
+ {
+ if (tables->share == share)
+ {
+ *prev= tables->next;
+ share->in_trans--;
+ DBUG_PRINT("info", ("in_trans: %d", share->in_trans));
+ my_free(tables, MYF(0));
+ break;
+ }
+ prev= &tables->next;
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+
+/****************************************************************************
+ The following functions are called by thr_lock() in threaded applications
+ for transactional tables.
+****************************************************************************/
+
+/*
+ Create a copy of the current status for the table
+
+ SYNOPSIS
+ _ma_block_get_status()
+ param Pointer to Maria handler
+ concurrent_insert Set to 1 if we are going to do concurrent inserts
+ (THR_WRITE_CONCURRENT_INSERT was used)
+*/
+
+void _ma_block_get_status(void* param, my_bool concurrent_insert)
+{
+ MARIA_HA *info=(MARIA_HA*) param;
+ DBUG_ENTER("_ma_block_get_status");
+ DBUG_PRINT("enter", ("concurrent_insert %d", concurrent_insert));
+
+ info->row_base_length= info->s->base_length;
+ info->row_flag= info->s->base.default_row_flag;
+ if (concurrent_insert)
+ {
+ DBUG_ASSERT(info->lock.type == TL_WRITE_CONCURRENT_INSERT);
+ info->row_flag|= ROW_FLAG_TRANSID;
+ info->row_base_length+= TRANSID_SIZE;
+ }
+ else
+ {
+ DBUG_ASSERT(info->lock.type != TL_WRITE_CONCURRENT_INSERT);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+my_bool _ma_block_start_trans(void* param)
+{
+ MARIA_HA *info=(MARIA_HA*) param;
+ if (info->s->lock_key_trees)
+ {
+ /*
+ Assume for now that this doesn't fail (It can only fail in
+ out of memory conditions)
+ TODO: Fix this by having one extra state pre-allocated
+ */
+ return _ma_setup_live_state(info);
+ }
+
+ /*
+ Info->trn is set if this table is already handled and we are
+ called from maria_versioning()
+ */
+ if (info->s->base.born_transactional && !info->trn)
+ {
+ /*
+ Assume for now that this doesn't fail (It can only fail in
+ out of memory conditions)
+ */
+ return maria_create_trn_hook(info) != 0;
+ }
+ return 0;
+}
+
+
+void _ma_block_update_status(void *param __attribute__((unused)))
+{
+}
+
+void _ma_block_restore_status(void *param __attribute__((unused)))
+{
+}
+
+
+/**
+ Check if should allow concurrent inserts
+
+ @return
+ @retval 0 ok to use concurrent inserts
+ @retval 1 not ok
+*/
+
+my_bool _ma_block_check_status(void *param __attribute__((unused)))
+{
+ return (my_bool) 0;
+}
+
+
+/* Get status when transactional but not versioned */
+
+my_bool _ma_block_start_trans_no_versioning(void* param)
+{
+ MARIA_HA *info=(MARIA_HA*) param;
+ DBUG_ENTER("_ma_block_get_status_no_version");
+ DBUG_ASSERT(info->s->base.born_transactional);
+
+ info->state->changed= 0; /* from _ma_reset_update_flag() */
+ if (!info->trn)
+ {
+ /*
+ Assume for now that this doesn't fail (It can only fail in
+ out of memory conditions)
+ */
+ DBUG_RETURN(maria_create_trn_hook(info));
+ }
+ DBUG_RETURN(0);
+}
+
+
+/**
+ Enable/disable versioning
+*/
+
+void maria_versioning(MARIA_HA *info, my_bool versioning)
+{
+ /* For now, this is a hack */
+ if (info->s->have_versioning)
+ {
+ enum thr_lock_type save_lock_type;
+ /* Assume is a non threaded application (for now) */
+ info->s->lock_key_trees= 0;
+ /* Set up info->lock.type temporary for _ma_block_get_status() */
+ save_lock_type= info->lock.type;
+ info->lock.type= versioning ? TL_WRITE_CONCURRENT_INSERT : TL_WRITE;
+ _ma_block_get_status((void*) info, versioning);
+ info->lock.type= save_lock_type;
+ info->state= info->state_start= &info->s->state.common;
+ }
+}
+
+
+/**
+ Update data_file_length to new length
+
+ NOTES
+ Only used by block records
+*/
+
+void _ma_set_share_data_file_length(MARIA_SHARE *share, ulonglong new_length)
+{
+ pthread_mutex_lock(&share->intern_lock);
+ if (share->state.state.data_file_length < new_length)
+ share->state.state.data_file_length= new_length;
+ pthread_mutex_unlock(&share->intern_lock);
+}
+
+
+/**
+ Copy state information that where updated while the table was used
+ in not transactional mode
+*/
+
+void _ma_copy_nontrans_state_information(MARIA_HA *info)
+{
+ info->s->state.state.records= info->state->records;
+ info->s->state.state.checksum= info->state->checksum;
+}
+
+
+void _ma_reset_history(MARIA_SHARE *share)
+{
+ MARIA_STATE_HISTORY *history, *next;
+ DBUG_ENTER("_ma_reset_history");
+
+ share->state_history->trid= 0; /* Visible to all */
+ share->state_history->state= share->state.state;
+ history= share->state_history->next;
+ share->state_history->next= 0;
+
+ for (; history; history= next)
+ {
+ next= history->next;
+ my_free(history, MYF(0));
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/****************************************************************************
+ Virtual functions to check if row is visible
+****************************************************************************/
+
+/**
+ Row is always visible
+ This is for tables without concurrent insert
+*/
+
+my_bool _ma_row_visible_always(MARIA_HA *info __attribute__((unused)))
+{
+ return 1;
+}
+
+
+/**
+ Row visibility for non transactional tables with concurrent insert
+
+ @implementation
+ When we got our table lock, we saved the current
+ data_file_length. Concurrent inserts always go to the end of the
+ file. So we can test if the found key references a new record.
+*/
+
+my_bool _ma_row_visible_non_transactional_table(MARIA_HA *info)
+{
+ return info->cur_row.lastpos < info->state->data_file_length;
+}
+
+
+/**
+ Row visibility for transactional tables with versioning
+
+
+ @TODO
+ Add test if found key was marked deleted and it was deleted by
+ us. In that case we should return 0
+*/
+
+my_bool _ma_row_visible_transactional_table(MARIA_HA *info)
+{
+ return trnman_can_read_from(info->trn, info->cur_row.trid);
+}
diff --git a/storage/maria/ma_state.h b/storage/maria/ma_state.h
new file mode 100644
index 00000000000..03ce5c2ea8c
--- /dev/null
+++ b/storage/maria/ma_state.h
@@ -0,0 +1,86 @@
+/* Copyright (C) 2008 Sun AB & Michael Widenius
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Struct to store tables in use by one transaction */
+
+typedef struct st_maria_status_info
+{
+ ha_rows records; /* Rows in table */
+ ha_rows del; /* Removed rows */
+ my_off_t empty; /* lost space in datafile */
+ my_off_t key_empty; /* lost space in indexfile */
+ my_off_t key_file_length;
+ my_off_t data_file_length;
+ ha_checksum checksum;
+ uint32 changed:1, /* Set if table was changed */
+ no_transid:1; /* Set if no transid was set on rows */
+} MARIA_STATUS_INFO;
+
+
+typedef struct st_used_tables {
+ struct st_used_tables *next;
+ struct st_maria_share *share;
+ MARIA_STATUS_INFO state_current;
+ MARIA_STATUS_INFO state_start;
+} MARIA_USED_TABLES;
+
+
+/* Struct to store commit state at different times */
+
+typedef struct st_state_history {
+ struct st_state_history *next;
+ TrID trid;
+ MARIA_STATUS_INFO state;
+} MARIA_STATE_HISTORY;
+
+
+/* struct to remember history for closed tables */
+
+typedef struct st_state_history_closed {
+ LSN create_rename_lsn;
+ MARIA_STATE_HISTORY *state_history;
+} MARIA_STATE_HISTORY_CLOSED;
+
+
+my_bool _ma_setup_live_state(MARIA_HA *info);
+MARIA_STATE_HISTORY *_ma_remove_not_visible_states(MARIA_STATE_HISTORY
+ *org_history,
+ my_bool all,
+ my_bool trman_is_locked);
+void _ma_reset_state(MARIA_HA *info);
+void _ma_get_status(void* param, my_bool concurrent_insert);
+void _ma_update_status(void* param);
+void _ma_update_status_with_lock(MARIA_HA *info);
+void _ma_restore_status(void *param);
+void _ma_copy_status(void* to, void *from);
+void _ma_reset_update_flag(void *param, my_bool concurrent_insert);
+my_bool _ma_check_status(void *param);
+void _ma_block_get_status(void* param, my_bool concurrent_insert);
+void _ma_block_update_status(void *param);
+void _ma_block_restore_status(void *param);
+my_bool _ma_block_check_status(void *param);
+void maria_versioning(MARIA_HA *info, my_bool versioning);
+void _ma_set_share_data_file_length(struct st_maria_share *share,
+ ulonglong new_length);
+void _ma_copy_nontrans_state_information(MARIA_HA *info);
+my_bool _ma_trnman_end_trans_hook(TRN *trn, my_bool commit,
+ my_bool active_transactions);
+my_bool _ma_row_visible_always(MARIA_HA *info);
+my_bool _ma_row_visible_non_transactional_table(MARIA_HA *info);
+my_bool _ma_row_visible_transactional_table(MARIA_HA *info);
+void _ma_remove_not_visible_states_with_lock(struct st_maria_share *share,
+ my_bool all);
+void _ma_remove_table_from_trnman(struct st_maria_share *share, TRN *trn);
+void _ma_reset_history(struct st_maria_share *share);
diff --git a/storage/maria/ma_static.c b/storage/maria/ma_static.c
new file mode 100644
index 00000000000..917385f9568
--- /dev/null
+++ b/storage/maria/ma_static.c
@@ -0,0 +1,109 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
+/*
+ Static variables for MARIA library. All defined here for easy making of
+ a shared library
+*/
+
+#ifndef _global_h
+#include "maria_def.h"
+#include "trnman.h"
+#endif
+
+LIST *maria_open_list=0;
+uchar maria_file_magic[]=
+{ (uchar) 254, (uchar) 254, (uchar) 9, '\003', };
+uchar maria_pack_file_magic[]=
+{ (uchar) 254, (uchar) 254, (uchar) 10, '\001', };
+/* Unique number for this maria instance */
+uchar maria_uuid[MY_UUID_SIZE];
+uint maria_quick_table_bits=9;
+ulong maria_block_size= MARIA_KEY_BLOCK_LENGTH;
+my_bool maria_flush= 0, maria_single_user= 0;
+my_bool maria_delay_key_write= 0, maria_page_checksums= 1;
+my_bool maria_inited= FALSE;
+my_bool maria_in_ha_maria= FALSE; /* If used from ha_maria or not */
+my_bool maria_recovery_changed_data= 0, maria_recovery_verbose= 0;
+pthread_mutex_t THR_LOCK_maria;
+#if defined(THREAD) && !defined(DONT_USE_RW_LOCKS)
+ulong maria_concurrent_insert= 2;
+#else
+ulong maria_concurrent_insert= 0;
+#endif
+my_off_t maria_max_temp_length= MAX_FILE_SIZE;
+ulong maria_bulk_insert_tree_size=8192*1024;
+ulong maria_data_pointer_size= 4;
+
+PAGECACHE maria_pagecache_var;
+PAGECACHE *maria_pagecache= &maria_pagecache_var;
+
+PAGECACHE maria_log_pagecache_var;
+PAGECACHE *maria_log_pagecache= &maria_log_pagecache_var;
+MY_TMPDIR *maria_tmpdir; /* Tempdir for redo */
+char *maria_data_root;
+HASH maria_stored_state;
+int (*maria_create_trn_hook)(MARIA_HA *);
+
+/**
+ @brief when transactionality does not matter we can use this transaction
+
+ Used in external programs like ma_test*, and also internally inside
+ libmaria when there is no transaction around and the operation isn't
+ transactional (CREATE/DROP/RENAME/OPTIMIZE/REPAIR).
+*/
+TRN dummy_transaction_object;
+
+/* a WT_RESOURCE_TYPE for transactions waiting on a unique key conflict */
+WT_RESOURCE_TYPE ma_rc_dup_unique={ wt_resource_id_memcmp, 0};
+
+/* Enough for comparing if number is zero */
+uchar maria_zero_string[]= {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+/*
+ read_vec[] is used for converting between P_READ_KEY.. and SEARCH_
+ Position is , == , >= , <= , > , <
+*/
+
+uint32 maria_read_vec[]=
+{
+ SEARCH_FIND, /* HA_READ_KEY_EXACT */
+ SEARCH_FIND | SEARCH_BIGGER, /* HA_READ_KEY_OR_NEXT */
+ SEARCH_FIND | SEARCH_SMALLER, /* HA_READ_KEY_OR_PREV */
+ SEARCH_NO_FIND | SEARCH_BIGGER, /* HA_READ_AFTER_KEY */
+ SEARCH_NO_FIND | SEARCH_SMALLER, /* HA_READ_BEFORE_KEY */
+ SEARCH_FIND | SEARCH_PART_KEY, /* HA_READ_PREFIX */
+ SEARCH_LAST, /* HA_READ_PREFIX_LAST */
+ SEARCH_LAST | SEARCH_SMALLER, /* HA_READ_PREFIX_LAST_OR_PREV */
+ MBR_CONTAIN, /* HA_READ_MBR_CONTAIN */
+ MBR_INTERSECT, /* HA_READ_MBR_INTERSECT */
+ MBR_WITHIN, /* HA_READ_MBR_WITHIN */
+ MBR_DISJOINT, /* HA_READ_MBR_DISJOINT */
+ MBR_EQUAL /* HA_READ_MBR_EQUAL */
+};
+
+uint32 maria_readnext_vec[]=
+{
+ SEARCH_BIGGER, SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_BIGGER, SEARCH_SMALLER,
+ SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_SMALLER
+};
+
+static int always_valid(const char *filename __attribute__((unused)))
+{
+ return 0;
+}
+
+int (*maria_test_invalid_symlink)(const char *filename)= always_valid;
diff --git a/storage/maria/ma_statrec.c b/storage/maria/ma_statrec.c
new file mode 100644
index 00000000000..0aa3a3acbc1
--- /dev/null
+++ b/storage/maria/ma_statrec.c
@@ -0,0 +1,302 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+ /* Functions to handle fixed-length-records */
+
+#include "maria_def.h"
+
+
+my_bool _ma_write_static_record(MARIA_HA *info, const uchar *record)
+{
+ uchar temp[8]; /* max pointer length */
+ if (info->s->state.dellink != HA_OFFSET_ERROR &&
+ !info->append_insert_at_end)
+ {
+ my_off_t filepos=info->s->state.dellink;
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ if (info->s->file_read(info, &temp[0],info->s->base.rec_reflength,
+ info->s->state.dellink+1,
+ MYF(MY_NABP)))
+ goto err;
+ info->s->state.dellink= _ma_rec_pos(info->s, temp);
+ info->state->del--;
+ info->state->empty-=info->s->base.pack_reclength;
+ if (info->s->file_write(info, record, info->s->base.reclength,
+ filepos, MYF(MY_NABP)))
+ goto err;
+ }
+ else
+ {
+ if (info->state->data_file_length > info->s->base.max_data_file_length-
+ info->s->base.pack_reclength)
+ {
+ my_errno=HA_ERR_RECORD_FILE_FULL;
+ return(2);
+ }
+ if (info->opt_flag & WRITE_CACHE_USED)
+ { /* Cache in use */
+ if (my_b_write(&info->rec_cache, record,
+ info->s->base.reclength))
+ goto err;
+ if (info->s->base.pack_reclength != info->s->base.reclength)
+ {
+ uint length=info->s->base.pack_reclength - info->s->base.reclength;
+ bzero(temp,length);
+ if (my_b_write(&info->rec_cache, temp,length))
+ goto err;
+ }
+ }
+ else
+ {
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ if (info->s->file_write(info, record, info->s->base.reclength,
+ info->state->data_file_length,
+ info->s->write_flag))
+ goto err;
+ if (info->s->base.pack_reclength != info->s->base.reclength)
+ {
+ uint length=info->s->base.pack_reclength - info->s->base.reclength;
+ bzero(temp,length);
+ if (info->s->file_write(info, temp,length,
+ info->state->data_file_length+
+ info->s->base.reclength,
+ info->s->write_flag))
+ goto err;
+ }
+ }
+ info->state->data_file_length+=info->s->base.pack_reclength;
+ info->s->state.split++;
+ }
+ return 0;
+ err:
+ return 1;
+}
+
+my_bool _ma_update_static_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+ const uchar *oldrec __attribute__ ((unused)),
+ const uchar *record)
+{
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ return (info->s->file_write(info,
+ record, info->s->base.reclength,
+ pos,
+ MYF(MY_NABP)) != 0);
+}
+
+
+my_bool _ma_delete_static_record(MARIA_HA *info,
+ const uchar *record __attribute__ ((unused)))
+{
+ uchar temp[9]; /* 1+sizeof(uint32) */
+ info->state->del++;
+ info->state->empty+=info->s->base.pack_reclength;
+ temp[0]= '\0'; /* Mark that record is deleted */
+ _ma_dpointer(info->s, temp+1, info->s->state.dellink);
+ info->s->state.dellink= info->cur_row.lastpos;
+ info->rec_cache.seek_not_done=1;
+ return (info->s->file_write(info, temp, 1+info->s->rec_reflength,
+ info->cur_row.lastpos, MYF(MY_NABP)) != 0);
+}
+
+
+my_bool _ma_cmp_static_record(register MARIA_HA *info,
+ register const uchar *old)
+{
+ DBUG_ENTER("_ma_cmp_static_record");
+
+ /* We are going to do changes; dont let anybody disturb */
+ dont_break(); /* Dont allow SIGHUP or SIGINT */
+
+ if (info->opt_flag & WRITE_CACHE_USED)
+ {
+ if (flush_io_cache(&info->rec_cache))
+ {
+ DBUG_RETURN(1);
+ }
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ }
+
+ if ((info->opt_flag & READ_CHECK_USED))
+ { /* If check isn't disabled */
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ if (info->s->file_read(info, info->rec_buff, info->s->base.reclength,
+ info->cur_row.lastpos, MYF(MY_NABP)))
+ DBUG_RETURN(1);
+ if (memcmp(info->rec_buff, old, (uint) info->s->base.reclength))
+ {
+ DBUG_DUMP("read",old,info->s->base.reclength);
+ DBUG_DUMP("disk",info->rec_buff,info->s->base.reclength);
+ my_errno=HA_ERR_RECORD_CHANGED; /* Record have changed */
+ DBUG_RETURN(1);
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ const uchar *record, MARIA_RECORD_POS pos)
+{
+ DBUG_ENTER("_ma_cmp_static_unique");
+
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+ if (info->s->file_read(info, info->rec_buff, info->s->base.reclength,
+ pos, MYF(MY_NABP)))
+ DBUG_RETURN(1);
+ DBUG_RETURN(_ma_unique_comp(def, record, info->rec_buff,
+ def->null_are_equal));
+}
+
+
+/*
+ Read a fixed-length-record
+
+ RETURN
+ 0 Ok
+ 1 record delete
+ -1 on read-error or locking-error
+*/
+
+int _ma_read_static_record(register MARIA_HA *info, register uchar *record,
+ MARIA_RECORD_POS pos)
+{
+ int error;
+ DBUG_ENTER("_ma_read_static_record");
+
+ if (pos != HA_OFFSET_ERROR)
+ {
+ if (info->opt_flag & WRITE_CACHE_USED &&
+ info->rec_cache.pos_in_file <= pos &&
+ flush_io_cache(&info->rec_cache))
+ DBUG_RETURN(my_errno);
+ info->rec_cache.seek_not_done=1; /* We have done a seek */
+
+ error= (int) info->s->file_read(info, record,info->s->base.reclength,
+ pos, MYF(MY_NABP));
+ if (! error)
+ {
+ fast_ma_writeinfo(info);
+ if (!*record)
+ {
+ /* Record is deleted */
+ DBUG_PRINT("warning", ("Record is deleted"));
+ DBUG_RETURN((my_errno=HA_ERR_RECORD_DELETED));
+ }
+ info->update|= HA_STATE_AKTIV; /* Record is read */
+ DBUG_RETURN(0);
+ }
+ }
+ fast_ma_writeinfo(info); /* No such record */
+ DBUG_RETURN(my_errno);
+}
+
+
+/**
+ @brief Read record from given position or next record
+
+ @note
+ When scanning, this function will return HA_ERR_RECORD_DELETED
+ for deleted rows even if skip_deleted_blocks is set.
+ The reason for this is to allow the caller to calculate the record
+ position without having to do call maria_position() for each record.
+*/
+
+int _ma_read_rnd_static_record(MARIA_HA *info, uchar *buf,
+ MARIA_RECORD_POS filepos,
+ my_bool skip_deleted_blocks)
+{
+ int locked,error,cache_read;
+ uint cache_length;
+ MARIA_SHARE *share= info->s;
+ DBUG_ENTER("_ma_read_rnd_static_record");
+
+ cache_read=0;
+ cache_length=0;
+ if (info->opt_flag & READ_CACHE_USED)
+ { /* Cache in use */
+ if (filepos == my_b_tell(&info->rec_cache) &&
+ (skip_deleted_blocks || !filepos))
+ {
+ cache_read=1; /* Read record using cache */
+ cache_length= (uint) (info->rec_cache.read_end -
+ info->rec_cache.read_pos);
+ }
+ else
+ info->rec_cache.seek_not_done=1; /* Filepos is changed */
+ }
+ locked=0;
+ if (info->lock_type == F_UNLCK)
+ {
+ if (filepos >= info->state->data_file_length)
+ { /* Test if new records */
+ if (_ma_readinfo(info,F_RDLCK,0))
+ DBUG_RETURN(my_errno);
+ locked=1;
+ }
+ else
+ { /* We don't need new info */
+#ifndef UNSAFE_LOCKING
+ if ((! cache_read || share->base.reclength > cache_length) &&
+ share->tot_locks == 0)
+ { /* record not in cache */
+ locked=1;
+ }
+#else
+ info->tmp_lock_type=F_RDLCK;
+#endif
+ }
+ }
+ if (filepos >= info->state->data_file_length)
+ {
+ DBUG_PRINT("test",("filepos: %ld (%ld) records: %ld del: %ld",
+ (long) filepos/share->base.reclength, (long) filepos,
+ (long) info->state->records, (long) info->state->del));
+ fast_ma_writeinfo(info);
+ DBUG_RETURN(my_errno=HA_ERR_END_OF_FILE);
+ }
+ info->cur_row.lastpos= filepos;
+ info->cur_row.nextpos= filepos+share->base.pack_reclength;
+
+ if (! cache_read) /* No caching */
+ {
+ error= _ma_read_static_record(info, buf, filepos);
+ DBUG_RETURN(error);
+ }
+
+ /* Read record with cacheing */
+ error=my_b_read(&info->rec_cache, buf, share->base.reclength);
+ if (info->s->base.pack_reclength != info->s->base.reclength && !error)
+ {
+ uchar tmp[8]; /* Skip fill bytes */
+ error=my_b_read(&info->rec_cache, tmp,
+ info->s->base.pack_reclength - info->s->base.reclength);
+ }
+ if (locked)
+ VOID(_ma_writeinfo(info,0)); /* Unlock keyfile */
+ if (!error)
+ {
+ if (!buf[0])
+ { /* Record is removed */
+ DBUG_RETURN(my_errno=HA_ERR_RECORD_DELETED);
+ }
+ /* Found and may be updated */
+ info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+ DBUG_RETURN(0);
+ }
+ /* my_errno should be set if rec_cache.error == -1 */
+ if (info->rec_cache.error != -1 || my_errno == 0)
+ my_errno=HA_ERR_WRONG_IN_RECORD;
+ DBUG_RETURN(my_errno); /* Something wrong (EOF?) */
+}
diff --git a/storage/maria/ma_test1.c b/storage/maria/ma_test1.c
new file mode 100644
index 00000000000..affa3a71634
--- /dev/null
+++ b/storage/maria/ma_test1.c
@@ -0,0 +1,899 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Testing of the basic functions of a MARIA table */
+
+#include "maria_def.h"
+#include <my_getopt.h>
+#include <m_string.h>
+#include "ma_control_file.h"
+#include "ma_loghandler.h"
+#include "ma_checkpoint.h"
+#include "trnman.h"
+
+extern PAGECACHE *maria_log_pagecache;
+extern char *maria_data_root;
+
+#define MAX_REC_LENGTH 1024
+
+static void usage();
+
+static int rec_pointer_size=0, flags[50], testflag, checkpoint;
+static int key_field=FIELD_SKIP_PRESPACE,extra_field=FIELD_SKIP_ENDSPACE;
+static int key_type=HA_KEYTYPE_NUM;
+static int create_flag=0;
+static ulong blob_length;
+static enum data_file_type record_type= DYNAMIC_RECORD;
+
+static uint insert_count, update_count, remove_count;
+static uint pack_keys=0, pack_seg=0, key_length;
+static uint unique_key=HA_NOSAME;
+static uint die_in_middle_of_transaction;
+static my_bool pagecacheing, null_fields, silent, skip_update, opt_unique;
+static my_bool verbose, skip_delete, transactional;
+static my_bool opt_versioning= 0;
+static MARIA_COLUMNDEF recinfo[4];
+static MARIA_KEYDEF keyinfo[10];
+static HA_KEYSEG keyseg[10];
+static HA_KEYSEG uniqueseg[10];
+
+static int run_test(const char *filename);
+static void get_options(int argc, char *argv[]);
+static void create_key(uchar *key,uint rownr);
+static void create_record(uchar *record,uint rownr);
+static void update_record(uchar *record);
+
+
+/*
+ These are here only for testing of recovery with undo. We are not
+ including maria_def.h here as this test is also to be an example of
+ how to use maria outside of the maria directory
+*/
+
+extern int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index,
+ enum flush_type flush_type_for_data,
+ enum flush_type flush_type_for_index);
+#define MARIA_FLUSH_DATA 1
+
+
+/*
+  Program entry point.
+
+  Initialises the maria environment in the order the engine requires:
+  maria_init(), the table page cache, the control file, the log page
+  cache, the transaction log and - only with --transactional - the
+  transaction manager and checkpoint module.  Any failure aborts the
+  program.  Finally runs the whole test against table "test1" in the
+  current directory and exits with its result.
+*/
+int main(int argc,char *argv[])
+{
+#if defined(SAFE_MUTEX) && defined(THREAD)
+  safe_mutex_deadlock_detector= 1;
+#endif
+  MY_INIT(argv[0]);
+  get_options(argc,argv);
+  maria_data_root= (char *)".";
+  /* Maria requires that we always have a page cache */
+  if (maria_init() ||
+      (init_pagecache(maria_pagecache, maria_block_size * 16, 0, 0,
+                      maria_block_size, MY_WME) == 0) ||
+      ma_control_file_open(TRUE, TRUE) ||
+      (init_pagecache(maria_log_pagecache,
+                      TRANSLOG_PAGECACHE_SIZE, 0, 0,
+                      TRANSLOG_PAGE_SIZE, MY_WME) == 0) ||
+      translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
+                    0, 0, maria_log_pagecache,
+                    TRANSLOG_DEFAULT_FLAGS, 0) ||
+      (transactional && (trnman_init(0) || ma_checkpoint_init(0))))
+  {
+    fprintf(stderr, "Error in initialization\n");
+    exit(1);
+  }
+  if (opt_versioning)
+    init_thr_lock();
+
+  exit(run_test("test1"));
+}
+
+
+/*
+  The whole test: create a table with one key (and optionally one
+  unique constraint), then exercise insert, unique-violation, update,
+  reopen, delete, key read and scan code paths.
+
+  The --testflag and --checkpoint options stop the test (or run a
+  checkpoint) after a given stage; --test-undo aborts without commit
+  so that log-based recovery can be tested afterwards.
+
+  Returns 0 on success, 1 on error (my_errno is printed at 'err').
+*/
+static int run_test(const char *filename)
+{
+  MARIA_HA *file;
+  int i,j= 0,error,deleted,rec_length,uniques=0;
+  uint offset_to_key;
+  ha_rows found,row_count;
+  uchar record[MAX_REC_LENGTH],key[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH];
+  MARIA_UNIQUEDEF uniquedef;
+  MARIA_CREATE_INFO create_info;
+
+  /* Rollback testing needs NULL-able fields in the row format */
+  if (die_in_middle_of_transaction)
+    null_fields= 1;
+
+  bzero((char*) recinfo,sizeof(recinfo));
+  bzero((char*) &create_info,sizeof(create_info));
+
+  /* First define 2 columns */
+  create_info.null_bytes= 1;
+  recinfo[0].type= key_field;
+  recinfo[0].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr :
+                      key_length);
+  if (key_field == FIELD_VARCHAR)
+    recinfo[0].length+= HA_VARCHAR_PACKLENGTH(key_length);
+  recinfo[1].type=extra_field;
+  recinfo[1].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : 24);
+  if (extra_field == FIELD_VARCHAR)
+    recinfo[1].length+= HA_VARCHAR_PACKLENGTH(recinfo[1].length);
+  recinfo[1].null_bit= null_fields ? 2 : 0;
+
+  /* With --unique a third (hash check) column is added */
+  if (opt_unique)
+  {
+    recinfo[2].type=FIELD_CHECK;
+    recinfo[2].length=MARIA_UNIQUE_HASH_LENGTH;
+  }
+  /* recinfo[2].length is 0 from the bzero above when !opt_unique */
+  rec_length= recinfo[0].length + recinfo[1].length + recinfo[2].length +
+    create_info.null_bytes;
+
+  if (key_type == HA_KEYTYPE_VARTEXT1 &&
+      key_length > 255)
+    key_type= HA_KEYTYPE_VARTEXT2;
+
+  /* Define a key over the first column */
+  keyinfo[0].seg=keyseg;
+  keyinfo[0].keysegs=1;
+  keyinfo[0].block_length= 0;                   /* Default block length */
+  keyinfo[0].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[0].seg[0].type= key_type;
+  keyinfo[0].seg[0].flag= pack_seg;
+  keyinfo[0].seg[0].start=1;
+  keyinfo[0].seg[0].length=key_length;
+  keyinfo[0].seg[0].null_bit= null_fields ? 2 : 0;
+  keyinfo[0].seg[0].null_pos=0;
+  keyinfo[0].seg[0].language= default_charset_info->number;
+  if (pack_seg & HA_BLOB_PART)
+  {
+    keyinfo[0].seg[0].bit_start=4;              /* Length of blob length */
+  }
+  keyinfo[0].flag = (uint8) (pack_keys | unique_key);
+
+  bzero((uchar*) flags,sizeof(flags));
+  if (opt_unique)
+  {
+    uint start;
+    uniques=1;
+    bzero((char*) &uniquedef,sizeof(uniquedef));
+    bzero((char*) uniqueseg,sizeof(uniqueseg));
+    uniquedef.seg=uniqueseg;
+    uniquedef.keysegs=2;
+
+    /* Make a unique over all columns (except first NULL fields) */
+    for (i=0, start=1 ; i < 2 ; i++)
+    {
+      uniqueseg[i].start=start;
+      start+=recinfo[i].length;
+      uniqueseg[i].length=recinfo[i].length;
+      uniqueseg[i].language= default_charset_info->number;
+    }
+    uniqueseg[0].type= key_type;
+    uniqueseg[0].null_bit= null_fields ? 2 : 0;
+    uniqueseg[1].type= HA_KEYTYPE_TEXT;
+    if (extra_field == FIELD_BLOB)
+    {
+      uniqueseg[1].length=0;                    /* The whole blob */
+      uniqueseg[1].bit_start=4;                 /* long blob */
+      uniqueseg[1].flag|= HA_BLOB_PART;
+    }
+    else if (extra_field == FIELD_VARCHAR)
+    {
+      uniqueseg[1].flag|= HA_VAR_LENGTH_PART;
+      uniqueseg[1].type= (HA_VARCHAR_PACKLENGTH(recinfo[1].length-1) == 1 ?
+                          HA_KEYTYPE_VARTEXT1 : HA_KEYTYPE_VARTEXT2);
+    }
+  }
+  else
+    uniques=0;
+
+  /* Skip the NULL byte (and length prefix) when printing keys */
+  offset_to_key= test(null_fields);
+  if (key_field == FIELD_BLOB || key_field == FIELD_VARCHAR)
+    offset_to_key+= 2;
+
+  if (!silent)
+    printf("- Creating maria file\n");
+  create_info.max_rows=(ulong) (rec_pointer_size ?
+                                (1L << (rec_pointer_size*8))/40 :
+                                0);
+  create_info.transactional= transactional;
+  if (maria_create(filename, record_type, 1, keyinfo,2+opt_unique,recinfo,
+                   uniques, &uniquedef, &create_info,
+                   create_flag))
+    goto err;
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+    goto err;
+  if (!silent)
+    printf("- Writing key:s\n");
+
+  if (maria_begin(file))
+    goto err;
+  if (opt_versioning)
+    maria_versioning(file, 1);
+  my_errno=0;
+  row_count=deleted=0;
+  /* Insert rows; flags[j] tracks how many copies of key j exist */
+  for (i=49 ; i>=1 ; i-=2 )
+  {
+    if (insert_count-- == 0)
+    {
+      if (testflag)
+        break;
+      VOID(maria_close(file));
+      exit(0);
+    }
+    j=i%25 +1;
+    create_record(record,j);
+    error=maria_write(file,record);
+    if (!error)
+      row_count++;
+    flags[j]=1;
+    if (verbose || error)
+      printf("J= %2d  maria_write: %d  errno: %d\n", j,error,my_errno);
+  }
+
+  if (maria_commit(file) || maria_begin(file))
+    goto err;
+
+  if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (testflag == 1)
+    goto end;
+
+  /* Insert 2 rows with null values */
+  if (null_fields)
+  {
+    create_record(record,0);
+    error=maria_write(file,record);
+    if (!error)
+      row_count++;
+    if (verbose || error)
+      printf("J= NULL  maria_write: %d  errno: %d\n", error,my_errno);
+    error=maria_write(file,record);
+    if (!error)
+      row_count++;
+    if (verbose || error)
+      printf("J= NULL  maria_write: %d  errno: %d\n", error,my_errno);
+    flags[0]=2;
+  }
+
+  if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (testflag == 2)
+  {
+    printf("Terminating after inserts\n");
+    goto end;
+  }
+
+  if (maria_commit(file) || maria_begin(file))
+    goto err;
+
+  if (!skip_update)
+  {
+    if (opt_unique)
+    {
+      if (!silent)
+        printf("- Checking unique constraint\n");
+      create_record(record,j);                  /* Check last created row */
+      /* Re-inserting an existing row must fail with a duplicate error */
+      if (!maria_write(file,record) || my_errno != HA_ERR_FOUND_DUPP_UNIQUE)
+      {
+        printf("unique check failed\n");
+      }
+    }
+    if (!silent)
+      printf("- Updating rows\n");
+
+    /* Update first last row to force extend of file */
+    if (maria_rsame(file,read_record,-1))
+    {
+      printf("Can't find last row with maria_rsame\n");
+    }
+    else
+    {
+      memcpy(record,read_record,rec_length);
+      update_record(record);
+      if (maria_update(file,read_record,record))
+      {
+        printf("Can't update last row: %.*s\n",
+               keyinfo[0].seg[0].length,read_record+1);
+      }
+    }
+
+    /* Read through all rows and update them */
+    assert(maria_scan_init(file) == 0);
+
+    found=0;
+    while ((error= maria_scan(file,read_record)) == 0)
+    {
+      if (--update_count == 0) { VOID(maria_close(file)) ; exit(0) ; }
+      memcpy(record,read_record,rec_length);
+      update_record(record);
+      if (maria_update(file,read_record,record))
+      {
+        printf("Can't update row: %.*s, error: %d\n",
+               keyinfo[0].seg[0].length,record+1,my_errno);
+      }
+      found++;
+    }
+    if (found != row_count)
+      printf("Found %ld of %ld rows\n", (ulong) found, (ulong) row_count);
+    maria_scan_end(file);
+  }
+
+  if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (testflag == 3)
+  {
+    printf("Terminating after updates\n");
+    goto end;
+  }
+  if (!silent)
+    printf("- Reopening file\n");
+  if (maria_commit(file))
+    goto err;
+  if (maria_close(file))
+    goto err;
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+    goto err;
+  if (maria_begin(file))
+    goto err;
+  if (opt_versioning)
+    maria_versioning(file, 1);
+  if (!skip_delete)
+  {
+    if (!silent)
+      printf("- Removing keys\n");
+
+    /* Delete the rows with even key values 0,2,...,20 */
+    for (i=0 ; i <= 10 ; i++)
+    {
+      /*
+        If you want to debug the problem in ma_test_recovery with BLOBs
+        (see @todo there), you can break out of the loop after just one
+        delete, it is enough, like this:
+        if (i==1) break;
+      */
+      /* testing */
+      if (remove_count-- == 0)
+      {
+        fprintf(stderr,
+                "delete-rows number of rows deleted; Going down hard!\n");
+        goto end;
+      }
+      j=i*2;
+      if (!flags[j])
+        continue;
+      create_key(key,j);
+      my_errno=0;
+      if ((error = maria_rkey(file, read_record, 0, key,
+                              HA_WHOLE_KEY, HA_READ_KEY_EXACT)))
+      {
+        if (verbose || (flags[j] >= 1 ||
+                        (error && my_errno != HA_ERR_KEY_NOT_FOUND)))
+          printf("key: '%.*s'  maria_rkey:  %3d  errno: %3d\n",
+                 (int) key_length,key+offset_to_key,error,my_errno);
+      }
+      else
+      {
+        error=maria_delete(file,read_record);
+        if (verbose || error)
+          printf("key: '%.*s'  maria_delete: %3d  errno: %3d\n",
+                 (int) key_length, key+offset_to_key, error, my_errno);
+        if (! error)
+        {
+          deleted++;
+          flags[j]--;
+        }
+      }
+    }
+  }
+
+  if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (testflag == 4)
+  {
+    printf("Terminating after deletes\n");
+    goto end;
+  }
+
+  /* Read every possible key and check the result against flags[] */
+  if (!silent)
+    printf("- Reading rows with key\n");
+  record[1]= 0;                               /* For nicer printf */
+  for (i=0 ; i <= 25 ; i++)
+  {
+    create_key(key,i);
+    my_errno=0;
+    error=maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT);
+    if (verbose ||
+        (error == 0 && flags[i] == 0 && unique_key) ||
+        (error && (flags[i] != 0 || my_errno != HA_ERR_KEY_NOT_FOUND)))
+    {
+      printf("key: '%.*s'  maria_rkey: %3d  errno: %3d  record: %s\n",
+             (int) key_length,key+offset_to_key,error,my_errno,record+1);
+    }
+  }
+
+  /* Full table scan; the number of found rows must match the bookkeeping */
+  if (!silent)
+    printf("- Reading rows with position\n");
+  if (maria_scan_init(file))
+  {
+    fprintf(stderr, "maria_scan_init failed\n");
+    goto err;
+  }
+
+  for (i=1,found=0 ; i <= 30 ; i++)
+  {
+    my_errno=0;
+    if ((error= maria_scan(file, read_record)) == HA_ERR_END_OF_FILE)
+    {
+      if (found != row_count-deleted)
+        printf("Found only %ld of %ld rows\n", (ulong) found,
+               (ulong) (row_count - deleted));
+      break;
+    }
+    if (!error)
+      found++;
+    if (verbose || (error != 0 && error != HA_ERR_RECORD_DELETED &&
+                    error != HA_ERR_END_OF_FILE))
+    {
+      printf("pos: %2d  maria_rrnd: %3d  errno: %3d  record: %s\n",
+             i-1,error,my_errno,read_record+1);
+    }
+  }
+  maria_scan_end(file);
+
+end:
+  if (die_in_middle_of_transaction)
+  {
+    /* As commit record is not done, UNDO entries needs to be rolled back */
+    switch (die_in_middle_of_transaction) {
+    case 1:
+      /*
+        Flush changed pages go to disk. That will also flush log. Recovery
+        will skip REDOs and apply UNDOs.
+      */
+      _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                            FLUSH_RELEASE, FLUSH_RELEASE);
+      break;
+    case 2:
+      /*
+        Just flush log. Pages are likely to not be on disk. Recovery will
+        then execute REDOs and UNDOs.
+      */
+      if (translog_flush(file->trn->undo_lsn))
+        goto err;
+      break;
+    case 3:
+      /*
+        Flush nothing. Pages and log are likely to not be on disk. Recovery
+        will then do nothing.
+      */
+      break;
+    case 4:
+      /*
+        Flush changed data pages go to disk. Changed index pages are not
+        flushed. Recovery will skip some REDOs and apply UNDOs.
+      */
+      _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE,
+                            FLUSH_RELEASE);
+      /*
+        We have to flush log separately as the redo for the last key page
+        may not be flushed
+      */
+      if (translog_flush(file->trn->undo_lsn))
+        goto err;
+      break;
+    }
+    printf("Dying on request without maria_commit()/maria_close()\n");
+    exit(0);
+  }
+
+  if (maria_commit(file))
+    goto err;
+  if (maria_close(file))
+    goto err;
+  maria_end();
+  my_end(MY_CHECK_ERROR);
+
+  return (0);
+err:
+  printf("got error: %3d when using maria-database\n",my_errno);
+  return 1;                                   /* skip warning */
+}
+
+
+/*
+  Fill in the key value for row 'rownr' according to the key type that
+  the test was configured with.  Without --key-multiple, rownr is
+  masked to 0..7 so that several rows share the same key value.
+*/
+static void create_key_part(uchar *key,uint rownr)
+{
+  HA_KEYSEG *seg= &keyinfo[0].seg[0];
+  uint len= seg->length;
+
+  if (!unique_key)
+    rownr&= 7;                                /* Some identical keys */
+
+  if (seg->type == HA_KEYTYPE_NUM)
+  {
+    /* Numeric key: right adjusted decimal number */
+    sprintf((char*) key, "%*d", len, rownr);
+    return;
+  }
+  if (seg->type == HA_KEYTYPE_VARTEXT1 || seg->type == HA_KEYTYPE_VARTEXT2)
+  {
+    /* Variable length text key that packs (compresses) well */
+    bfill(key, len, rownr < 10 ? 'A' : 'B');
+    sprintf((char*) key + len - 2, "%-2d", rownr);
+    if ((rownr & 7) == 0)
+    {
+      /* Change the key to force a unpack of the next key */
+      bfill(key + 3, len - 5, rownr < 10 ? 'a' : 'b');
+    }
+    return;
+  }
+  if (seg->flag & HA_SPACE_PACK)
+  {
+    /* Space packed text key: left adjusted decimal number */
+    sprintf((char*) key, "%-*d", len, rownr);
+    return;
+  }
+  /* Plain text key that may be easily packed */
+  bfill(key, len, rownr < 10 ? 'A' : 'B');
+  sprintf((char*) key + len - 2, "%-2d", rownr);
+  if ((rownr & 7) == 0)
+  {
+    /* Change the key to force a unpack of the next key */
+    key[1]= (rownr < 10 ? 'a' : 'b');
+  }
+}
+
+
+/*
+  Build the complete search key for row 'rownr': optional NULL
+  indicator byte, optional 2 byte length prefix (for blob/varchar
+  keys) and then the key value from create_key_part().
+  Row 0 produces the NULL key when NULL fields are enabled.
+*/
+static void create_key(uchar *key,uint rownr)
+{
+  HA_KEYSEG *seg= &keyinfo[0].seg[0];
+  uint klen;
+
+  if (seg->null_bit)
+  {
+    if (rownr == 0)
+    {
+      key[0]= 1;                              /* null key */
+      key[1]= 0;                              /* For easy print of key */
+      return;
+    }
+    *key++= 0;                                /* Not-NULL indicator */
+  }
+  if (!(seg->flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART)))
+  {
+    create_key_part(key, rownr);
+    return;
+  }
+  /* Variable length key: value first, then store its 2 byte length */
+  create_key_part(key + 2, rownr);
+  klen= (uint) strlen((char*) key + 2);
+  int2store(key, klen);
+}
+
+
+static uchar blob_key[MAX_REC_LENGTH];
+static uchar blob_record[MAX_REC_LENGTH+20*20];
+
+
+/*
+  Build the row image for row number 'rownr' into 'record'.
+
+  Byte 0 holds the delete marker (original comment) plus, for
+  rownr == 0 with NULL fields enabled, the key column's null bit.
+  Column 0 (the key column) gets its value from create_key_part();
+  column 1 gets a "... row: N" text.  BLOB columns store a pointer
+  into the static blob_key/blob_record buffers, so creating a new
+  record overwrites the blob contents of the previous one.
+
+  Fix: removed a stray empty statement ('uchar *ptr;;') in the
+  second BLOB branch; no behavior change.
+*/
+static void create_record(uchar *record,uint rownr)
+{
+  uchar *pos;
+  bzero((char*) record,MAX_REC_LENGTH);
+  record[0]=1;                                /* delete marker */
+  if (rownr == 0 && keyinfo[0].seg[0].null_bit)
+    record[0]|=keyinfo[0].seg[0].null_bit;    /* Null key */
+
+  pos=record+1;
+  if (recinfo[0].type == FIELD_BLOB)
+  {
+    /* 4 byte length followed by a pointer to the static blob buffer */
+    uint tmp;
+    uchar *ptr;
+    create_key_part(blob_key,rownr);
+    tmp=strlen((char*) blob_key);
+    int4store(pos,tmp);
+    ptr=blob_key;
+    memcpy_fixed(pos+4,&ptr,sizeof(char*));
+    pos+=recinfo[0].length;
+  }
+  else if (recinfo[0].type == FIELD_VARCHAR)
+  {
+    /* 1 or 2 byte length prefix followed by the value in the record */
+    uint tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1);
+    create_key_part(pos+pack_length,rownr);
+    tmp= strlen((char*) pos+pack_length);
+    if (pack_length == 1)
+      *(uchar*) pos= (uchar) tmp;
+    else
+      int2store(pos,tmp);
+    pos+= recinfo[0].length;
+  }
+  else
+  {
+    create_key_part(pos,rownr);
+    pos+=recinfo[0].length;
+  }
+  if (recinfo[1].type == FIELD_BLOB)
+  {
+    uint tmp;
+    uchar *ptr;
+    sprintf((char*) blob_record,"... row: %d", rownr);
+    strappend((char*) blob_record,max(MAX_REC_LENGTH-rownr,10),' ');
+    tmp=strlen((char*) blob_record);
+    int4store(pos,tmp);
+    ptr=blob_record;
+    memcpy_fixed(pos+4,&ptr,sizeof(char*));
+  }
+  else if (recinfo[1].type == FIELD_VARCHAR)
+  {
+    uint tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1);
+    sprintf((char*) pos+pack_length, "... row: %d", rownr);
+    tmp= strlen((char*) pos+pack_length);
+    if (pack_length == 1)
+      *pos= (uchar) tmp;
+    else
+      int2store(pos,tmp);
+  }
+  else
+  {
+    sprintf((char*) pos,"... row: %d", rownr);
+    strappend((char*) pos,recinfo[1].length,' ');
+  }
+}
+
+/* change row to test re-packing of rows and reallocation of keys */
+
+/* change row to test re-packing of rows and reallocation of keys */
+
+/*
+  Modify 'record' in place: lower-case the key column (unless it is
+  numeric) and grow/pad the second column, so that an update changes
+  both key and row length.  BLOB columns are redirected to the static
+  blob_key/blob_record buffers, which receive the modified value.
+*/
+static void update_record(uchar *record)
+{
+  uchar *pos=record+1;
+  if (recinfo[0].type == FIELD_BLOB)
+  {
+    uchar *column,*ptr;
+    int length;
+    length=uint4korr(pos);                    /* Long blob */
+    memcpy_fixed(&column,pos+4,sizeof(char*));
+    memcpy(blob_key,column,length);           /* Move old key */
+    ptr=blob_key;
+    memcpy_fixed(pos+4,&ptr,sizeof(char*));   /* Store pointer to new key */
+    if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM)
+      default_charset_info->cset->casedn(default_charset_info,
+                                         (char*) blob_key, length,
+                                         (char*) blob_key, length);
+    pos+=recinfo[0].length;
+  }
+  else if (recinfo[0].type == FIELD_VARCHAR)
+  {
+    /* Lower-case the varchar value; the length prefix is unchanged */
+    uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1);
+    uint length= pack_length == 1 ? (uint) *(uchar*) pos : uint2korr(pos);
+    default_charset_info->cset->casedn(default_charset_info,
+                                       (char*) pos + pack_length, length,
+                                       (char*) pos + pack_length, length);
+    pos+=recinfo[0].length;
+  }
+  else
+  {
+    if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM)
+      default_charset_info->cset->casedn(default_charset_info,
+                                         (char*) pos, keyinfo[0].seg[0].length,
+                                         (char*) pos, keyinfo[0].seg[0].length);
+    pos+=recinfo[0].length;
+  }
+
+  if (recinfo[1].type == FIELD_BLOB)
+  {
+    uchar *column;
+    int length;
+    length=uint4korr(pos);
+    memcpy_fixed(&column,pos+4,sizeof(char*));
+    memcpy(blob_record,column,length);
+    bfill(blob_record+length,20,'.');         /* Make it larger */
+    length+=20;
+    int4store(pos,length);
+    column=blob_record;
+    memcpy_fixed(pos+4,&column,sizeof(char*));
+  }
+  else if (recinfo[1].type == FIELD_VARCHAR)
+  {
+    /* Second field is longer than 10 characters */
+    uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1);
+    uint length= pack_length == 1 ? (uint) *(uchar*) pos : uint2korr(pos);
+    pos= record+ recinfo[1].offset;
+    /* Pad with '.' up to the column's maximum length */
+    bfill(pos+pack_length+length,recinfo[1].length-length-pack_length,'.');
+    length=recinfo[1].length-pack_length;
+    if (pack_length == 1)
+      *(uchar*) pos= (uchar) length;
+    else
+      int2store(pos,length);
+  }
+  else
+  {
+    bfill(pos+recinfo[1].length-10,10,'.');
+  }
+}
+
+
+/*
+  Command line options; parsed by handle_options() in get_options(),
+  with the non-trivial cases handled in get_one_option().
+*/
+static struct my_option my_long_options[] =
+{
+  {"checkpoint", 'H', "Checkpoint at specified stage", (uchar**) &checkpoint,
+   (uchar**) &checkpoint, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"checksum", 'c', "Undocumented",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+  {"debug", '#', "Undocumented",
+   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"delete-rows", 'd', "Abort after this many rows has been deleted",
+   (uchar**) &remove_count, (uchar**) &remove_count, 0, GET_UINT, REQUIRED_ARG,
+   1000, 0, 0, 0, 0, 0},
+  {"help", '?', "Display help and exit",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"insert-rows", 'i', "Undocumented", (uchar**) &insert_count,
+   (uchar**) &insert_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0},
+  {"key-alpha", 'a', "Use a key of type HA_KEYTYPE_TEXT",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-binary-pack", 'B', "Undocumented",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-blob", 'b', "Undocumented",
+   (uchar**) &blob_length, (uchar**) &blob_length,
+   0, GET_ULONG, OPT_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-cache", 'K', "Undocumented", (uchar**) &pagecacheing,
+   (uchar**) &pagecacheing, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-length", 'k', "Undocumented", (uchar**) &key_length,
+   (uchar**) &key_length, 0, GET_UINT, REQUIRED_ARG, 6, 0, 0, 0, 0, 0},
+  {"key-multiple", 'm', "Don't use unique keys",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-prefix_pack", 'P', "Undocumented",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-space_pack", 'p', "Undocumented",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-varchar", 'w', "Test VARCHAR keys",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"null-fields", 'N', "Define fields with NULL",
+   (uchar**) &null_fields, (uchar**) &null_fields, 0, GET_BOOL, NO_ARG,
+   0, 0, 0, 0, 0, 0},
+  {"row-fixed-size", 'S', "Fixed size records",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"rows-in-block", 'M', "Store rows in block format",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"row-pointer-size", 'R', "Undocumented", (uchar**) &rec_pointer_size,
+   (uchar**) &rec_pointer_size, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"silent", 's', "Undocumented",
+   (uchar**) &silent, (uchar**) &silent, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0,
+   0, 0},
+  {"skip-delete", 'D', "Don't test deletes", (uchar**) &skip_delete,
+   (uchar**) &skip_delete, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"skip-update", 'U', "Don't test updates", (uchar**) &skip_update,
+   (uchar**) &skip_update, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"testflag", 't', "Stop test at specified stage", (uchar**) &testflag,
+   (uchar**) &testflag, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"test-undo", 'A',
+   "Abort hard. Used for testing recovery with undo",
+   (uchar**) &die_in_middle_of_transaction,
+   (uchar**) &die_in_middle_of_transaction,
+   0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"transactional", 'T',
+   "Test in transactional mode. (Only works with block format)",
+   (uchar**) &transactional, (uchar**) &transactional, 0, GET_BOOL, NO_ARG,
+   0, 0, 0, 0, 0, 0},
+  {"unique", 'E', "Check unique handling", (uchar**) &opt_unique,
+   (uchar**) &opt_unique, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"update-rows", 'u', "Max number of rows to update", (uchar**) &update_count,
+   (uchar**) &update_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0},
+  {"verbose", 'v', "Be more verbose", (uchar**) &verbose,
+   (uchar**) &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"version", 'V', "Print version number and exit",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"versioning", 'C', "Use row versioning (only works with block format)",
+   (uchar**) &opt_versioning, (uchar**) &opt_versioning, 0, GET_BOOL,
+   NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+/*
+  Callback for handle_options(): handle the options that need more
+  than storing a value, i.e. those that set several of the file-scope
+  configuration variables (key type, record format, packing flags).
+  Returns 0 (continue parsing); exits on --help, --version or a bad
+  key length.
+*/
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+               char *argument __attribute__((unused)))
+{
+  switch(optid) {
+  case 'a':
+    key_type= HA_KEYTYPE_TEXT;
+    break;
+  case 'c':
+    create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM;
+    break;
+  case 'R':                                   /* Length of record pointer */
+    if (rec_pointer_size > 3)
+      rec_pointer_size=0;
+    break;
+  case 'P':
+    pack_keys= HA_PACK_KEY;                   /* Use prefix compression */
+    break;
+  case 'B':
+    pack_keys= HA_BINARY_PACK_KEY;            /* Use binary compression */
+    break;
+  case 'M':
+    record_type= BLOCK_RECORD;
+    break;
+  case 'S':
+    if (key_field == FIELD_VARCHAR)
+    {
+      create_flag=0;                          /* Static sized varchar */
+      record_type= STATIC_RECORD;
+    }
+    else if (key_field != FIELD_BLOB)
+    {
+      key_field=FIELD_NORMAL;                 /* static-size record */
+      extra_field=FIELD_NORMAL;
+      record_type= STATIC_RECORD;
+    }
+    break;
+  case 'p':
+    pack_keys=HA_PACK_KEY;                    /* Use prefix + space packing */
+    pack_seg=HA_SPACE_PACK;
+    key_type=HA_KEYTYPE_TEXT;
+    break;
+  case 'm':
+    unique_key=0;
+    break;
+  case 'b':
+    key_field=FIELD_BLOB;                     /* blob key */
+    extra_field= FIELD_BLOB;
+    pack_seg|= HA_BLOB_PART;
+    key_type= HA_KEYTYPE_VARTEXT1;
+    /* Blobs do not fit in static records */
+    if (record_type == STATIC_RECORD)
+      record_type= DYNAMIC_RECORD;
+    break;
+  case 'k':
+    if (key_length < 4 || key_length > HA_MAX_KEY_LENGTH)
+    {
+      fprintf(stderr,"Wrong key length\n");
+      exit(1);
+    }
+    break;
+  case 'w':
+    key_field=FIELD_VARCHAR;                  /* varchar keys */
+    extra_field= FIELD_VARCHAR;
+    key_type= HA_KEYTYPE_VARTEXT1;
+    pack_seg|= HA_VAR_LENGTH_PART;
+    if (record_type == STATIC_RECORD)
+      record_type= DYNAMIC_RECORD;
+    break;
+  case 'K':                                   /* Use key cacheing */
+    pagecacheing=1;
+    break;
+  case 'V':
+    printf("test1 Ver 1.2 \n");
+    exit(0);
+  case '#':
+    DBUG_PUSH(argument);
+    break;
+  case '?':
+    usage();
+    exit(1);
+  }
+  return 0;
+}
+
+
+/* Read options */
+
+/* Read options */
+
+static void get_options(int argc, char *argv[])
+{
+  int ho_error= handle_options(&argc, &argv, my_long_options, get_one_option);
+  if (ho_error)
+    exit(ho_error);
+  /* Transactions require the block record format */
+  if (transactional)
+    record_type= BLOCK_RECORD;
+} /* get options */
+
+
+/*
+  Print the usage line, the option help and the current option values.
+  Fix: parameter list changed from () to (void); in C an empty list
+  declares a function without a prototype, which is obsolescent.
+*/
+static void usage(void)
+{
+  printf("Usage: %s [options]\n\n", my_progname);
+  my_print_help(my_long_options);
+  my_print_variables(my_long_options);
+}
diff --git a/storage/maria/ma_test2.c b/storage/maria/ma_test2.c
new file mode 100644
index 00000000000..9e2f32f767b
--- /dev/null
+++ b/storage/maria/ma_test2.c
@@ -0,0 +1,1246 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Test of the ISAM database: big test */
+
+#ifndef USE_MY_FUNC /* We want to be able to dbug this !! */
+#define USE_MY_FUNC
+#endif
+#include "maria_def.h"
+#include "trnman.h"
+#include <m_ctype.h>
+#include <my_bit.h>
+#include "ma_checkpoint.h"
+
+#define STANDARD_LENGTH 37
+#define MARIA_KEYS 6
+#define MAX_PARTS 4
+#if !defined(MSDOS) && !defined(labs)
+#define labs(a) abs(a)
+#endif
+
+static void get_options(int argc, char *argv[]);
+static uint rnd(uint max_value);
+static void fix_length(uchar *record,uint length);
+static void put_blob_in_record(uchar *blob_pos,char **blob_buffer,
+ ulong *length);
+static void copy_key(MARIA_HA *info, uint inx, uchar *record, uchar *key);
+
+static int verbose= 0, testflag= 0, first_key= 0, async_io= 0, pagecacheing= 0;
+static int write_cacheing= 0, do_locking= 0, rec_pointer_size= 0;
+static int silent= 0, opt_quick_mode= 0, transactional= 0, skip_update= 0;
+static int die_in_middle_of_transaction= 0, pack_fields= 1;
+static int pack_seg= HA_SPACE_PACK, pack_type= HA_PACK_KEY, remove_count= -1;
+static int create_flag= 0, srand_arg= 0, checkpoint= 0;
+static my_bool opt_versioning= 0;
+static uint use_blob= 0, update_count= 0;
+static ulong pagecache_size=8192*32;
+static enum data_file_type record_type= DYNAMIC_RECORD;
+
+static uint keys=MARIA_KEYS,recant=1000;
+static uint16 key1[1001],key3[5000];
+static uchar record[300],record2[300],key[100],key2[100];
+static uchar read_record[300],read_record2[300],read_record3[300];
+static HA_KEYSEG glob_keyseg[MARIA_KEYS][MAX_PARTS];
+
+ /* Test program */
+
+int main(int argc, char *argv[])
+{
+ uint i;
+ int j,n1,n2,n3,error,k;
+ uint write_count,update,dupp_keys,opt_delete,start,length,blob_pos,
+ reclength,ant,found_parts;
+ my_off_t lastpos;
+ ha_rows range_records,records;
+ MARIA_HA *file;
+ MARIA_KEYDEF keyinfo[10];
+ MARIA_COLUMNDEF recinfo[10];
+ MARIA_INFO info;
+ const char *filename;
+ char *blob_buffer;
+ MARIA_CREATE_INFO create_info;
+
+#if defined(SAFE_MUTEX) && defined(THREAD)
+ safe_mutex_deadlock_detector= 1;
+#endif
+ MY_INIT(argv[0]);
+
+ filename= "test2";
+ get_options(argc,argv);
+ if (! async_io)
+ my_disable_async_io=1;
+
+ /* If we sync or not have no affect on this test */
+ my_disable_sync= 1;
+
+ maria_data_root= (char *)".";
+ /* Maria requires that we always have a page cache */
+ if (maria_init() ||
+ (init_pagecache(maria_pagecache, pagecache_size, 0, 0,
+ maria_block_size, MY_WME) == 0) ||
+ ma_control_file_open(TRUE, TRUE) ||
+ (init_pagecache(maria_log_pagecache,
+ TRANSLOG_PAGECACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE, MY_WME) == 0) ||
+ translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
+ 0, 0, maria_log_pagecache,
+ TRANSLOG_DEFAULT_FLAGS, 0) ||
+ (transactional && (trnman_init(0) || ma_checkpoint_init(0))))
+ {
+ fprintf(stderr, "Error in initialization");
+ exit(1);
+ }
+ if (opt_versioning)
+ init_thr_lock();
+
+ reclength=STANDARD_LENGTH+60+(use_blob ? 8 : 0);
+ blob_pos=STANDARD_LENGTH+60;
+ keyinfo[0].seg= &glob_keyseg[0][0];
+ keyinfo[0].seg[0].start=0;
+ keyinfo[0].seg[0].length=6;
+ keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT;
+ keyinfo[0].seg[0].language= default_charset_info->number;
+ keyinfo[0].seg[0].flag=(uint8) pack_seg;
+ keyinfo[0].seg[0].null_bit=0;
+ keyinfo[0].seg[0].null_pos=0;
+ keyinfo[0].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[0].keysegs=1;
+ keyinfo[0].flag = pack_type;
+ keyinfo[0].block_length= 0; /* Default block length */
+ keyinfo[1].seg= &glob_keyseg[1][0];
+ keyinfo[1].seg[0].start=7;
+ keyinfo[1].seg[0].length=6;
+ keyinfo[1].seg[0].type=HA_KEYTYPE_BINARY;
+ keyinfo[1].seg[0].flag=0;
+ keyinfo[1].seg[0].null_bit=0;
+ keyinfo[1].seg[0].null_pos=0;
+ keyinfo[1].seg[1].start=0; /* two part key */
+ keyinfo[1].seg[1].length=6;
+ keyinfo[1].seg[1].type=HA_KEYTYPE_NUM;
+ keyinfo[1].seg[1].flag=HA_REVERSE_SORT;
+ keyinfo[1].seg[1].null_bit=0;
+ keyinfo[1].seg[1].null_pos=0;
+ keyinfo[1].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[1].keysegs=2;
+ keyinfo[1].flag =0;
+ keyinfo[1].block_length= MARIA_MIN_KEY_BLOCK_LENGTH; /* Diff blocklength */
+ keyinfo[2].seg= &glob_keyseg[2][0];
+ keyinfo[2].seg[0].start=12;
+ keyinfo[2].seg[0].length=8;
+ keyinfo[2].seg[0].type=HA_KEYTYPE_BINARY;
+ keyinfo[2].seg[0].flag=HA_REVERSE_SORT;
+ keyinfo[2].seg[0].null_bit=0;
+ keyinfo[2].seg[0].null_pos=0;
+ keyinfo[2].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[2].keysegs=1;
+ keyinfo[2].flag =HA_NOSAME;
+ keyinfo[2].block_length= 0; /* Default block length */
+ keyinfo[3].seg= &glob_keyseg[3][0];
+ keyinfo[3].seg[0].start=0;
+ keyinfo[3].seg[0].length=reclength-(use_blob ? 8 : 0);
+ keyinfo[3].seg[0].type=HA_KEYTYPE_TEXT;
+ keyinfo[3].seg[0].language=default_charset_info->number;
+ keyinfo[3].seg[0].flag=(uint8) pack_seg;
+ keyinfo[3].seg[0].null_bit=0;
+ keyinfo[3].seg[0].null_pos=0;
+ keyinfo[3].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[3].keysegs=1;
+ keyinfo[3].flag = pack_type;
+ keyinfo[3].block_length= 0; /* Default block length */
+ keyinfo[4].seg= &glob_keyseg[4][0];
+ keyinfo[4].seg[0].start=0;
+ keyinfo[4].seg[0].length=5;
+ keyinfo[4].seg[0].type=HA_KEYTYPE_TEXT;
+ keyinfo[4].seg[0].language=default_charset_info->number;
+ keyinfo[4].seg[0].flag=0;
+ keyinfo[4].seg[0].null_bit=0;
+ keyinfo[4].seg[0].null_pos=0;
+ keyinfo[4].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[4].keysegs=1;
+ keyinfo[4].flag = pack_type;
+ keyinfo[4].block_length= 0; /* Default block length */
+ keyinfo[5].seg= &glob_keyseg[5][0];
+ keyinfo[5].seg[0].start=0;
+ keyinfo[5].seg[0].length=4;
+ keyinfo[5].seg[0].type=HA_KEYTYPE_TEXT;
+ keyinfo[5].seg[0].language=default_charset_info->number;
+ keyinfo[5].seg[0].flag=pack_seg;
+ keyinfo[5].seg[0].null_bit=0;
+ keyinfo[5].seg[0].null_pos=0;
+ keyinfo[5].key_alg=HA_KEY_ALG_BTREE;
+ keyinfo[5].keysegs=1;
+ keyinfo[5].flag = pack_type;
+ keyinfo[5].block_length= 0; /* Default block length */
+
+ recinfo[0].type=pack_fields ? FIELD_SKIP_PRESPACE : 0;
+ recinfo[0].length=7;
+ recinfo[0].null_bit=0;
+ recinfo[0].null_pos=0;
+ recinfo[1].type=pack_fields ? FIELD_SKIP_PRESPACE : 0;
+ recinfo[1].length=5;
+ recinfo[1].null_bit=0;
+ recinfo[1].null_pos=0;
+ recinfo[2].type=pack_fields ? FIELD_SKIP_PRESPACE : 0;
+ recinfo[2].length=9;
+ recinfo[2].null_bit=0;
+ recinfo[2].null_pos=0;
+ recinfo[3].type=FIELD_NORMAL;
+ recinfo[3].length=STANDARD_LENGTH-7-5-9-4;
+ recinfo[3].null_bit=0;
+ recinfo[3].null_pos=0;
+ recinfo[4].type=pack_fields ? FIELD_SKIP_ZERO : 0;
+ recinfo[4].length=4;
+ recinfo[4].null_bit=0;
+ recinfo[4].null_pos=0;
+ recinfo[5].type=pack_fields ? FIELD_SKIP_ENDSPACE : 0;
+ recinfo[5].length=60;
+ recinfo[5].null_bit=0;
+ recinfo[5].null_pos=0;
+ if (use_blob)
+ {
+ recinfo[6].type=FIELD_BLOB;
+ recinfo[6].length=4+portable_sizeof_char_ptr;
+ recinfo[6].null_bit=0;
+ recinfo[6].null_pos=0;
+ }
+
+ write_count=update=dupp_keys=opt_delete=0;
+ blob_buffer=0;
+
+ for (i=1000 ; i>0 ; i--) key1[i]=0;
+ for (i=4999 ; i>0 ; i--) key3[i]=0;
+
+ if (!silent)
+ printf("- Creating maria-file\n");
+ file= 0;
+ bzero((char*) &create_info,sizeof(create_info));
+ create_info.max_rows=(ha_rows) (rec_pointer_size ?
+ (1L << (rec_pointer_size*8))/
+ reclength : 0);
+ create_info.reloc_rows=(ha_rows) 100;
+ create_info.transactional= transactional;
+ if (maria_create(filename, record_type, keys,&keyinfo[first_key],
+ use_blob ? 7 : 6, &recinfo[0],
+ 0,(MARIA_UNIQUEDEF*) 0,
+ &create_info,create_flag))
+ goto err;
+ if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+ goto err;
+ maria_begin(file);
+ if (opt_versioning)
+ maria_versioning(file, 1);
+ if (testflag == 1)
+ goto end;
+ if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+ goto err;
+ if (!silent)
+ printf("- Writing key:s\n");
+ if (do_locking)
+ maria_lock_database(file,F_WRLCK);
+ if (write_cacheing)
+ maria_extra(file,HA_EXTRA_WRITE_CACHE,0);
+ if (opt_quick_mode)
+ maria_extra(file,HA_EXTRA_QUICK,0);
+
+ for (i=0 ; i < recant ; i++)
+ {
+ ulong blob_length;
+ n1=rnd(1000); n2=rnd(100); n3=rnd(5000);
+ sprintf((char*) record,"%6d:%4d:%8d:Pos: %4d ",n1,n2,n3,write_count);
+ int4store(record+STANDARD_LENGTH-4,(long) i);
+ fix_length(record,(uint) STANDARD_LENGTH+rnd(60));
+ put_blob_in_record(record+blob_pos,&blob_buffer, &blob_length);
+ DBUG_PRINT("test",("record: %d blob_length: %lu", i, blob_length));
+
+ if (maria_write(file,record))
+ {
+ if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0)
+ {
+ printf("Error: %d in write at record: %d\n",my_errno,i);
+ goto err;
+ }
+ if (verbose) printf(" Double key: %d at record# %d\n", n3, i);
+ }
+ else
+ {
+ if (key3[n3] == 1 && first_key <3 && first_key+keys >= 3)
+ {
+ printf("Error: Didn't get error when writing second key: '%8d'\n",n3);
+ goto err;
+ }
+ write_count++; key1[n1]++; key3[n3]=1;
+ }
+
+ /* Check if we can find key without flushing database */
+ if (i % 10 == 0)
+ {
+ for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+ if (!j)
+ for (j=999 ; j>0 && key1[j] == 0 ; j--) ;
+ sprintf((char*) key,"%6d",j);
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ {
+ printf("Test in loop: Can't find key: \"%s\"\n",key);
+ goto err;
+ }
+ }
+ }
+ if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+ goto err;
+
+ if (write_cacheing)
+ {
+ if (maria_extra(file,HA_EXTRA_NO_CACHE,0))
+ {
+ puts("got error from maria_extra(HA_EXTRA_NO_CACHE)");
+ goto err;
+ }
+ }
+
+ if (testflag == 2)
+ goto end;
+
+#ifdef REMOVE_WHEN_WE_HAVE_RESIZE
+ if (pagecacheing)
+ resize_pagecache(maria_pagecache, maria_block_size,
+ pagecache_size * 2, 0, 0);
+#endif
+ if (!silent)
+ printf("- Delete\n");
+ if (srand_arg)
+ srand(srand_arg);
+ if (!update_count)
+ update_count= recant/10;
+
+ for (i=0 ; i < update_count ; i++)
+ {
+ for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+ if (j != 0)
+ {
+ sprintf((char*) key,"%6d",j);
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ {
+ printf("can't find key1: \"%s\"\n",key);
+ goto err;
+ }
+ if (bcmp(read_record+keyinfo[0].seg[0].start,
+ key, keyinfo[0].seg[0].length))
+ {
+ printf("Found wrong record when searching for key: \"%s\"\n",key);
+ goto err;
+ }
+ if (opt_delete == (uint) remove_count) /* While testing */
+ goto end;
+ if (maria_delete(file,read_record))
+ {
+ printf("error: %d; can't delete record: \"%s\"\n", my_errno,read_record);
+ goto err;
+ }
+ opt_delete++;
+ key1[atoi((char*) read_record+keyinfo[0].seg[0].start)]--;
+ key3[atoi((char*) read_record+keyinfo[2].seg[0].start)]=0;
+ }
+ else
+ {
+ puts("Warning: Skipping delete test because no dupplicate keys");
+ break;
+ }
+ }
+ if (testflag == 3)
+ goto end;
+ if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+ goto err;
+
+ if (!silent)
+ printf("- Update\n");
+ if (srand_arg)
+ srand(srand_arg);
+ if (!update_count)
+ update_count= recant/10;
+
+ for (i=0 ; i < update_count ; i++)
+ {
+ n1=rnd(1000); n2=rnd(100); n3=rnd(5000);
+ sprintf((char*) record2,"%6d:%4d:%8d:XXX: %4d ",n1,n2,n3,update);
+ int4store(record2+STANDARD_LENGTH-4,(long) i);
+ fix_length(record2,(uint) STANDARD_LENGTH+rnd(60));
+
+ for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+ if (j != 0)
+ {
+ sprintf((char*) key,"%6d",j);
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ {
+ printf("can't find key1: \"%s\"\n", (char*) key);
+ goto err;
+ }
+ if (bcmp(read_record+keyinfo[0].seg[0].start,
+ key, keyinfo[0].seg[0].length))
+ {
+ printf("Found wrong record when searching for key: \"%s\"; Found \"%.*s\"\n",
+ key, keyinfo[0].seg[0].length,
+ read_record+keyinfo[0].seg[0].start);
+ goto err;
+ }
+ if (use_blob)
+ {
+ ulong blob_length;
+ if (i & 1)
+ put_blob_in_record(record2+blob_pos,&blob_buffer, &blob_length);
+ else
+ bmove(record2+blob_pos, read_record+blob_pos, 4 + sizeof(char*));
+ }
+ if (skip_update)
+ continue;
+ if (maria_update(file,read_record,record2))
+ {
+ if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0)
+ {
+ printf("error: %d; can't update:\nFrom: \"%s\"\nTo: \"%s\"\n",
+ my_errno,read_record,record2);
+ goto err;
+ }
+ if (verbose)
+ printf("Double key when tried to update:\nFrom: \"%s\"\nTo: \"%s\"\n",record,record2);
+ }
+ else
+ {
+ key1[atoi((char*) read_record+keyinfo[0].seg[0].start)]--;
+ key3[atoi((char*) read_record+keyinfo[2].seg[0].start)]=0;
+ key1[n1]++; key3[n3]=1;
+ update++;
+ }
+ }
+ }
+ if (testflag == 4)
+ goto end;
+ if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+ goto err;
+
+ for (i=999, dupp_keys=j=0 ; i>0 ; i--)
+ {
+ if (key1[i] > dupp_keys)
+ {
+ dupp_keys=key1[i]; j=i;
+ }
+ }
+ sprintf((char*) key,"%6d",j);
+ start=keyinfo[0].seg[0].start;
+ length=keyinfo[0].seg[0].length;
+ if (dupp_keys)
+ {
+ if (!silent)
+ printf("- Same key: first - next -> last - prev -> first\n");
+ DBUG_PRINT("progpos",("first - next -> last - prev -> first"));
+ if (verbose) printf(" Using key: \"%s\" Keys: %d\n",key,dupp_keys);
+
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ goto err;
+ if (maria_rsame(file,read_record2,-1))
+ goto err;
+ if (memcmp(read_record,read_record2,reclength) != 0)
+ {
+ printf("maria_rsame didn't find same record\n");
+ goto err;
+ }
+ info.recpos=maria_position(file);
+ if (maria_rfirst(file,read_record2,0) ||
+ maria_rsame_with_pos(file,read_record2,0,info.recpos) ||
+ memcmp(read_record,read_record2,reclength) != 0)
+ {
+ printf("maria_rsame_with_pos didn't find same record\n");
+ goto err;
+ }
+ {
+ int skr;
+ info.recpos= maria_position(file);
+ skr= maria_rnext(file,read_record2,0);
+ if ((skr && my_errno != HA_ERR_END_OF_FILE) ||
+ maria_rprev(file,read_record2,0) ||
+ memcmp(read_record,read_record2,reclength) != 0 ||
+ info.recpos != maria_position(file))
+ {
+ printf("maria_rsame_with_pos lost position\n");
+ goto err;
+ }
+ }
+ ant=1;
+ while (maria_rnext(file,read_record2,0) == 0 &&
+ memcmp(read_record2+start,key,length) == 0) ant++;
+ if (ant != dupp_keys)
+ {
+ printf("next: Found: %d keys of %d\n",ant,dupp_keys);
+ goto err;
+ }
+ ant=0;
+ while (maria_rprev(file,read_record3,0) == 0 &&
+ bcmp(read_record3+start,key,length) == 0) ant++;
+ if (ant != dupp_keys)
+ {
+ printf("prev: Found: %d records of %d\n",ant,dupp_keys);
+ goto err;
+ }
+
+ /* Check of maria_rnext_same */
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ goto err;
+ ant=1;
+ while (!maria_rnext_same(file,read_record3) && ant < dupp_keys+10)
+ ant++;
+ if (ant != dupp_keys || my_errno != HA_ERR_END_OF_FILE)
+ {
+ printf("maria_rnext_same: Found: %d records of %d\n",ant,dupp_keys);
+ goto err;
+ }
+ }
+
+ if (!silent)
+ printf("- All keys: first - next -> last - prev -> first\n");
+ DBUG_PRINT("progpos",("All keys: first - next -> last - prev -> first"));
+ ant=1;
+ if (maria_rfirst(file,read_record,0))
+ {
+ printf("Can't find first record\n");
+ goto err;
+ }
+ while ((error=maria_rnext(file,read_record3,0)) == 0 && ant < write_count+10)
+ ant++;
+ if (ant != write_count - opt_delete || error != HA_ERR_END_OF_FILE)
+ {
+ printf("next: I found: %d records of %d (error: %d)\n",
+ ant, write_count - opt_delete, error);
+ goto err;
+ }
+ if (maria_rlast(file,read_record2,0) ||
+ bcmp(read_record2,read_record3,reclength))
+ {
+ printf("Can't find last record\n");
+ DBUG_DUMP("record2", read_record2, reclength);
+ DBUG_DUMP("record3", read_record3, reclength);
+ goto err;
+ }
+ ant=1;
+ while (maria_rprev(file,read_record3,0) == 0 && ant < write_count+10)
+ ant++;
+ if (ant != write_count - opt_delete)
+ {
+ printf("prev: I found: %d records of %d\n",ant,write_count);
+ goto err;
+ }
+ if (bcmp(read_record,read_record3,reclength))
+ {
+ printf("Can't find first record\n");
+ goto err;
+ }
+
+ if (!silent)
+ printf("- Test if: Read first - next - prev - prev - next == first\n");
+ DBUG_PRINT("progpos",("- Read first - next - prev - prev - next == first"));
+ if (maria_rfirst(file,read_record,0) ||
+ maria_rnext(file,read_record3,0) ||
+ maria_rprev(file,read_record3,0) ||
+ maria_rprev(file,read_record3,0) == 0 ||
+ maria_rnext(file,read_record3,0))
+ goto err;
+ if (bcmp(read_record,read_record3,reclength) != 0)
+ printf("Can't find first record\n");
+
+ if (!silent)
+ printf("- Test if: Read last - prev - next - next - prev == last\n");
+ DBUG_PRINT("progpos",("Read last - prev - next - next - prev == last"));
+ if (maria_rlast(file,read_record2,0) ||
+ maria_rprev(file,read_record3,0) ||
+ maria_rnext(file,read_record3,0) ||
+ maria_rnext(file,read_record3,0) == 0 ||
+ maria_rprev(file,read_record3,0))
+ goto err;
+ if (bcmp(read_record2,read_record3,reclength))
+ printf("Can't find last record\n");
+#ifdef NOT_ANYMORE
+ if (!silent)
+ puts("- Test read key-part");
+ strmov(key2,key);
+ for(i=strlen(key2) ; i-- > 1 ;)
+ {
+ key2[i]=0;
+
+ /* The following row is just to catch some bugs in the key code */
+ bzero((char*) file->lastkey,file->s->base.max_key_length*2);
+ if (maria_rkey(file,read_record,0,key2,(uint) i,HA_READ_PREFIX))
+ goto err;
+ if (bcmp(read_record+start,key,(uint) i))
+ {
+ puts("Didn't find right record");
+ goto err;
+ }
+ }
+#endif
+ if (dupp_keys > 2)
+ {
+ if (!silent)
+ printf("- Read key (first) - next - delete - next -> last\n");
+ DBUG_PRINT("progpos",("first - next - delete - next -> last"));
+ if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ goto err;
+ if (maria_rnext(file,read_record3,0)) goto err;
+ if (maria_delete(file,read_record3)) goto err;
+ opt_delete++;
+ ant=1;
+ while (maria_rnext(file,read_record3,0) == 0 &&
+ bcmp(read_record3+start,key,length) == 0) ant++;
+ if (ant != dupp_keys-1)
+ {
+ printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-1);
+ goto err;
+ }
+ }
+ if (dupp_keys>4)
+ {
+ if (!silent)
+ printf("- Read last of key - prev - delete - prev -> first\n");
+ DBUG_PRINT("progpos",("last - prev - delete - prev -> first"));
+ if (maria_rprev(file,read_record3,0)) goto err;
+ if (maria_rprev(file,read_record3,0)) goto err;
+ if (maria_delete(file,read_record3)) goto err;
+ opt_delete++;
+ ant=1;
+ while (maria_rprev(file,read_record3,0) == 0 &&
+ bcmp(read_record3+start,key,length) == 0) ant++;
+ if (ant != dupp_keys-2)
+ {
+ printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-2);
+ goto err;
+ }
+ }
+ if (dupp_keys > 6)
+ {
+ if (!silent)
+ printf("- Read first - delete - next -> last\n");
+ DBUG_PRINT("progpos",("first - delete - next -> last"));
+ if (maria_rkey(file,read_record3,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+ goto err;
+ if (maria_delete(file,read_record3)) goto err;
+ opt_delete++;
+ ant=1;
+ if (maria_rnext(file,read_record,0))
+ goto err; /* Skall finnas poster */
+ while (maria_rnext(file,read_record3,0) == 0 &&
+ bcmp(read_record3+start,key,length) == 0) ant++;
+ if (ant != dupp_keys-3)
+ {
+ printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-3);
+ goto err;
+ }
+
+ if (!silent)
+ printf("- Read last - delete - prev -> first\n");
+ DBUG_PRINT("progpos",("last - delete - prev -> first"));
+ if (maria_rprev(file,read_record3,0)) goto err;
+ if (maria_delete(file,read_record3)) goto err;
+ opt_delete++;
+ ant=0;
+ while (maria_rprev(file,read_record3,0) == 0 &&
+ bcmp(read_record3+start,key,length) == 0) ant++;
+ if (ant != dupp_keys-4)
+ {
+ printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-4);
+ goto err;
+ }
+ }
+
+ if (!silent)
+ puts("- Test if: Read rrnd - same");
+ DBUG_PRINT("progpos",("Read rrnd - same"));
+ assert(maria_scan_init(file) == 0);
+ for (i=0 ; i < write_count ; i++)
+ {
+ int tmp;
+ if ((tmp= maria_scan(file,read_record)) &&
+ tmp != HA_ERR_END_OF_FILE &&
+ tmp != HA_ERR_RECORD_DELETED)
+ {
+ printf("Got error %d when scanning table\n", tmp);
+ break;
+ }
+ if (!tmp)
+ {
+ /* Remember position to last found row */
+ info.recpos= maria_position(file);
+ bmove(read_record2,read_record,reclength);
+ }
+ }
+ maria_scan_end(file);
+ if (i != write_count && i != write_count - opt_delete)
+ {
+ printf("Found wrong number of rows while scanning table\n");
+ goto err;
+ }
+
+ if (maria_rsame_with_pos(file,read_record,0,info.recpos))
+ goto err;
+ if (bcmp(read_record,read_record2,reclength) != 0)
+ {
+ printf("maria_rsame_with_pos didn't find same record\n");
+ goto err;
+ }
+
+ for (i=min(2,keys) ; i-- > 0 ;)
+ {
+ if (maria_rsame(file,read_record2,(int) i)) goto err;
+ if (bcmp(read_record,read_record2,reclength) != 0)
+ {
+ printf("maria_rsame didn't find same record\n");
+ goto err;
+ }
+ }
+ if (!silent)
+ puts("- Test maria_records_in_range");
+ maria_status(file,&info,HA_STATUS_VARIABLE);
+ for (i=0 ; i < info.keys ; i++)
+ {
+ key_range min_key, max_key;
+ if (maria_rfirst(file,read_record,(int) i) ||
+ maria_rlast(file,read_record2,(int) i))
+ goto err;
+ copy_key(file,(uint) i, read_record, key);
+ copy_key(file,(uint) i, read_record2, key2);
+ min_key.key= key;
+ min_key.keypart_map= HA_WHOLE_KEY;
+ min_key.flag= HA_READ_KEY_EXACT;
+ max_key.key= key2;
+ max_key.keypart_map= HA_WHOLE_KEY;
+ max_key.flag= HA_READ_AFTER_KEY;
+
+ range_records= maria_records_in_range(file,(int) i, &min_key, &max_key);
+ if (range_records < info.records*8/10 ||
+ range_records > info.records*12/10)
+ {
+ printf("maria_records_range returned %ld; Should be about %ld\n",
+ (long) range_records,(long) info.records);
+ goto err;
+ }
+ if (verbose)
+ {
+ printf("maria_records_range returned %ld; Exact is %ld (diff: %4.2g %%)\n",
+ (long) range_records, (long) info.records,
+ labs((long) range_records - (long) info.records)*100.0/
+ info.records);
+ }
+ }
+ for (i=0 ; i < 5 ; i++)
+ {
+ for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+ for (k=rnd(1000)+1 ; k>0 && key1[k] == 0 ; k--) ;
+ if (j != 0 && k != 0)
+ {
+ key_range min_key, max_key;
+ if (j > k)
+ swap_variables(int, j, k);
+ sprintf((char*) key,"%6d",j);
+ sprintf((char*) key2,"%6d",k);
+
+ min_key.key= key;
+ min_key.keypart_map= HA_WHOLE_KEY;
+ min_key.flag= HA_READ_AFTER_KEY;
+ max_key.key= key2;
+ max_key.keypart_map= HA_WHOLE_KEY;
+ max_key.flag= HA_READ_BEFORE_KEY;
+ range_records= maria_records_in_range(file, 0, &min_key, &max_key);
+ records=0;
+ for (j++ ; j < k ; j++)
+ records+=key1[j];
+ if ((long) range_records < (long) records*7/10-2 ||
+ (long) range_records > (long) records*14/10+2)
+ {
+ printf("maria_records_range for key: %d returned %lu; Should be about %lu\n",
+ i, (ulong) range_records, (ulong) records);
+ goto err;
+ }
+ if (verbose && records)
+ {
+ printf("maria_records_range returned %lu; Exact is %lu (diff: %4.2g %%)\n",
+ (ulong) range_records, (ulong) records,
+ labs((long) range_records-(long) records)*100.0/records);
+
+ }
+ }
+ }
+
+ if (!silent)
+ printf("- maria_info\n");
+ maria_status(file,&info,HA_STATUS_VARIABLE | HA_STATUS_CONST);
+ if (info.records != write_count-opt_delete || info.deleted > opt_delete + update
+ || info.keys != keys)
+ {
+ puts("Wrong info from maria_info");
+ printf("Got: records: %lu delete: %lu i_keys: %d\n",
+ (ulong) info.records, (ulong) info.deleted, info.keys);
+ goto err;
+ }
+ if (verbose)
+ {
+ char buff[80];
+ get_date(buff,3,info.create_time);
+ printf("info: Created %s\n",buff);
+ get_date(buff,3,info.check_time);
+ printf("info: checked %s\n",buff);
+ get_date(buff,3,info.update_time);
+ printf("info: Modified %s\n",buff);
+ }
+
+ maria_panic(HA_PANIC_WRITE);
+ maria_panic(HA_PANIC_READ);
+ if (maria_is_changed(file))
+ puts("Warning: maria_is_changed reported that datafile was changed");
+
+ if (!silent)
+ printf("- maria_extra(CACHE) + maria_rrnd.... + maria_extra(NO_CACHE)\n");
+ if (maria_reset(file) || maria_extra(file,HA_EXTRA_CACHE,0))
+ {
+ if (do_locking || (!use_blob && !pack_fields))
+ {
+ puts("got error from maria_extra(HA_EXTRA_CACHE)");
+ goto err;
+ }
+ }
+ ant=0;
+ assert(maria_scan_init(file) == 0);
+ while ((error= maria_scan(file,record)) != HA_ERR_END_OF_FILE &&
+ ant < write_count + 10)
+ ant+= error ? 0 : 1;
+ maria_scan_end(file);
+ if (ant != write_count-opt_delete)
+ {
+ printf("scan with cache: I can only find: %d records of %d\n",
+ ant,write_count-opt_delete);
+ maria_scan_end(file);
+ goto err;
+ }
+ if (maria_extra(file,HA_EXTRA_NO_CACHE,0))
+ {
+ puts("got error from maria_extra(HA_EXTRA_NO_CACHE)");
+ maria_scan_end(file);
+ goto err;
+ }
+ maria_scan_end(file);
+
+ ant=0;
+ maria_scan_init(file);
+ while ((error=maria_scan(file,record)) != HA_ERR_END_OF_FILE &&
+ ant < write_count + 10)
+ ant+= error ? 0 : 1;
+ if (ant != write_count-opt_delete)
+ {
+ printf("scan with cache: I can only find: %d records of %d\n",
+ ant,write_count-opt_delete);
+ maria_scan_end(file);
+ goto err;
+ }
+ maria_scan_end(file);
+
+ if (testflag == 5)
+ goto end;
+ if (checkpoint == 5 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+ goto err;
+
+ if (!silent)
+ printf("- Removing keys\n");
+ DBUG_PRINT("progpos",("Removing keys"));
+ lastpos = HA_OFFSET_ERROR;
+ /* DBUG_POP(); */
+ maria_reset(file);
+ found_parts=0;
+ maria_scan_init(file);
+ while ((error= maria_scan(file,read_record)) != HA_ERR_END_OF_FILE)
+ {
+ info.recpos=maria_position(file);
+ if (lastpos >= info.recpos && lastpos != HA_OFFSET_ERROR)
+ {
+ printf("maria_rrnd didn't advance filepointer; old: %ld, new: %ld\n",
+ (long) lastpos, (long) info.recpos);
+ goto err;
+ }
+ lastpos=info.recpos;
+ if (error == 0)
+ {
+ if (opt_delete == (uint) remove_count) /* While testing */
+ goto end;
+ if (rnd(2) == 1 && maria_rsame(file,read_record,-1))
+ {
+ printf("can't find record %lx\n",(long) info.recpos);
+ goto err;
+ }
+ if (use_blob)
+ {
+ ulong blob_length,pos;
+ uchar *ptr;
+ memcpy_fixed(&ptr, read_record+blob_pos+4, sizeof(ptr));
+ blob_length= uint4korr(read_record+blob_pos);
+ for (pos=0 ; pos < blob_length ; pos++)
+ {
+ if (ptr[pos] != (uchar) (blob_length+pos))
+ {
+ printf("Found blob with wrong info at %ld\n",(long) lastpos);
+ maria_scan_end(file);
+ my_errno= 0;
+ goto err;
+ }
+ }
+ }
+ if (maria_delete(file,read_record))
+ {
+ printf("can't delete record: %6.6s, delete_count: %d\n",
+ read_record, opt_delete);
+ maria_scan_end(file);
+ goto err;
+ }
+ opt_delete++;
+ }
+ else
+ found_parts++;
+ }
+ if (my_errno != HA_ERR_END_OF_FILE && my_errno != HA_ERR_RECORD_DELETED)
+ printf("error: %d from maria_rrnd\n",my_errno);
+ if (write_count != opt_delete)
+ {
+ printf("Deleted only %d of %d records (%d parts)\n",opt_delete,write_count,
+ found_parts);
+ maria_scan_end(file);
+ goto err;
+ }
+ if (testflag == 6)
+ goto end;
+ if (checkpoint == 6 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+ goto err;
+
+end:
+ maria_scan_end(file);
+ if (die_in_middle_of_transaction)
+ {
+ /* As commit record is not done, UNDO entries needs to be rolled back */
+ switch (die_in_middle_of_transaction) {
+ case 1:
+ /*
+ Flush changed data and index pages go to disk
+ That will also flush log. Recovery will skip REDOs and apply UNDOs.
+ */
+ _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+ FLUSH_RELEASE, FLUSH_RELEASE);
+ break;
+ case 2:
+ /*
+ Just flush log. Pages are likely to not be on disk. Recovery will
+ then execute REDOs and UNDOs.
+ */
+ if (translog_flush(file->trn->undo_lsn))
+ goto err;
+ break;
+ case 3:
+ /*
+ Flush nothing. Pages and log are likely to not be on disk. Recovery
+ will then do nothing.
+ */
+ break;
+ case 4:
+ /*
+ Flush changed data pages go to disk. Changed index pages are not
+ flushed. Recovery will skip some REDOs and apply UNDOs.
+ */
+ _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE,
+ FLUSH_RELEASE);
+ /*
+ We have to flush log separately as the redo for the last key page
+ may not be flushed
+ */
+ if (translog_flush(file->trn->undo_lsn))
+ goto err;
+ break;
+ }
+ printf("Dying on request without maria_commit()/maria_close()\n");
+ exit(0);
+ }
+ if (maria_commit(file))
+ goto err;
+ if (maria_close(file))
+ {
+ file= 0;
+ goto err;
+ }
+ file= 0;
+ maria_panic(HA_PANIC_CLOSE); /* Should close log */
+ if (!silent)
+ {
+ printf("\nFollowing test have been made:\n");
+ printf("Write records: %d\nUpdate records: %d\nSame-key-read: %d\nDelete records: %d\n", write_count,update,dupp_keys,opt_delete);
+ if (rec_pointer_size)
+ printf("Record pointer size: %d\n",rec_pointer_size);
+ printf("maria_block_size: %lu\n", maria_block_size);
+ if (write_cacheing)
+ puts("Key cache resized");
+ if (write_cacheing)
+ puts("Write cacheing used");
+ if (write_cacheing)
+ puts("quick mode");
+ if (async_io && do_locking)
+ puts("Asyncron io with locking used");
+ else if (do_locking)
+ puts("Locking used");
+ if (use_blob)
+ puts("blobs used");
+ printf("key cache status: \n\
+blocks used:%10lu\n\
+not flushed:%10lu\n\
+w_requests: %10lu\n\
+writes: %10lu\n\
+r_requests: %10lu\n\
+reads: %10lu\n",
+ maria_pagecache->blocks_used,
+ maria_pagecache->global_blocks_changed,
+ (ulong) maria_pagecache->global_cache_w_requests,
+ (ulong) maria_pagecache->global_cache_write,
+ (ulong) maria_pagecache->global_cache_r_requests,
+ (ulong) maria_pagecache->global_cache_read);
+ }
+ maria_end();
+ my_free(blob_buffer, MYF(MY_ALLOW_ZERO_PTR));
+ my_end(silent ? MY_CHECK_ERROR : MY_CHECK_ERROR | MY_GIVE_INFO);
+ return(0);
+err:
+ printf("got error: %d when using MARIA-database\n",my_errno);
+ if (file)
+ {
+ if (maria_commit(file))
+ goto err;
+ VOID(maria_close(file));
+ }
+ maria_end();
+ return(1);
+} /* main */
+
+
+/* Read options */
+
+static void get_options(int argc, char **argv) /* Parse single-letter options, hand-rolled (no my_getopt) */
+{
+ char *pos,*progname;
+
+ progname= argv[0];
+
+ while (--argc >0 && *(pos = *(++argv)) == '-' ) {
+ switch(*++pos) {
+ case 'B': /* use binary key packing */
+ pack_type= HA_BINARY_PACK_KEY;
+ break;
+ case 'b': /* test with blobs; optional max blob size (default 1000) */
+ use_blob= 1000;
+ if (*++pos)
+ use_blob= atol(pos);
+ break;
+ case 'K': /* Use key cacheing */
+ pagecacheing=1;
+ if (*++pos)
+ pagecache_size=atol(pos); /* optional page cache size in bytes */
+ break;
+ case 'W': /* Use write cacheing */
+ write_cacheing=1;
+ if (*++pos)
+ my_default_record_cache_size=atoi(pos);
+ break;
+ case 'd': /* stop the test after this many deletes */
+ remove_count= atoi(++pos);
+ break;
+ case 'i': /* seed the random generator for reproducible runs */
+ if (*++pos)
+ srand(srand_arg= atoi(pos));
+ break;
+ case 'L': /* lock the table with maria_lock_database() */
+ do_locking=1;
+ break;
+ case 'a': /* use asyncron io */
+ async_io=1;
+ if (*++pos)
+ my_default_record_cache_size=atoi(pos);
+ break;
+ case 'v': /* verbose */
+ verbose=1;
+ break;
+ case 'm': /* records */
+ if ((recant=atoi(++pos)) < 10 && testflag > 2)
+ {
+ fprintf(stderr,"record count must be >= 10 (if testflag > 2)\n");
+ exit(1);
+ }
+ break;
+ case 'e': /* maria_block_length */
+ case 'E':
+ if ((maria_block_size= atoi(++pos)) < MARIA_MIN_KEY_BLOCK_LENGTH ||
+ maria_block_size > MARIA_MAX_KEY_BLOCK_LENGTH)
+ {
+ fprintf(stderr,"Wrong maria_block_length\n");
+ exit(1);
+ }
+ maria_block_size= my_round_up_to_next_power(maria_block_size); /* block size must be a power of two */
+ break;
+ case 'f': /* index of first key to create (0..MARIA_KEYS-1) */
+ if ((first_key=atoi(++pos)) < 0 || first_key >= MARIA_KEYS)
+ first_key=0;
+ break;
+ case 'H': /* run a checkpoint when the test reaches stage # */
+ checkpoint= atoi(++pos);
+ break;
+ case 'k': /* number of keys to create, clamped to MARIA_KEYS-first_key */
+ if ((keys=(uint) atoi(++pos)) < 1 ||
+ keys > (uint) (MARIA_KEYS-first_key))
+ keys=MARIA_KEYS-first_key;
+ break;
+ case 'M': /* use the page-based BLOCK_RECORD row format */
+ record_type= BLOCK_RECORD;
+ break;
+ case 'P':
+ pack_type=0; /* Don't use DIFF_LENGTH */
+ pack_seg=0;
+ break;
+ case 'R': /* Length of record pointer */
+ rec_pointer_size=atoi(++pos);
+ if (rec_pointer_size > 7)
+ rec_pointer_size=0; /* out of range: fall back to default */
+ break;
+ case 'S':
+ pack_fields=0; /* Static-length-records */
+ record_type= STATIC_RECORD;
+ break;
+ case 's': /* silent: suppress progress output */
+ silent=1;
+ break;
+ case 't':
+ testflag=atoi(++pos); /* testmod */
+ break;
+ case 'T': /* create a transactional (logged) table */
+ transactional= 1;
+ break;
+ case 'A': /* abort mid-transaction at the end; # selects flush mode for recovery testing */
+ die_in_middle_of_transaction= atoi(++pos);
+ break;
+ case 'u': /* number of updates/deletes; 0 means skip the update phase */
+ update_count=atoi(++pos);
+ if (!update_count)
+ skip_update= 1;
+ break;
+ case 'q': /* HA_EXTRA_QUICK mode */
+ opt_quick_mode=1;
+ break;
+ case 'c': /* create table with row and page checksums */
+ create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM;
+ break;
+ case 'D': /* delay key writes to disk */
+ create_flag|=HA_CREATE_DELAY_KEY_WRITE;
+ break;
+ case 'g': /* skip the update phase */
+ skip_update= TRUE;
+ break;
+ case 'C': /* enable row versioning (maria_versioning) */
+ opt_versioning= 1;
+ break;
+ case '?': /* fall through: all three print version/usage and exit */
+ case 'I':
+ case 'V':
+ printf("%s Ver 1.2 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE);
+ puts("By Monty, for testing Maria\n");
+ printf("Usage: %s [-?AbBcCDIKLPRqSsTVWltv] [-k#] [-f#] [-m#] [-e#] [-E#] [-t#]\n",
+ progname);
+ exit(0);
+ case '#': /* enable DBUG tracing with the given control string */
+ DBUG_PUSH (++pos);
+ break;
+ default:
+ printf("Illegal option: '%c'\n",*pos);
+ break;
+ }
+ }
+ return;
+} /* get options */
+
+ /* Get a random value 0 <= x <= n */
+
+static uint rnd(uint max_value) /* Get a random value 0 <= x <= max_value */
+{
+ return (uint) ((rand() & 32767)/32767.0*max_value); /* scale rand()'s low 15 bits into 0..max_value (inclusive) */
+} /* rnd */
+
+
+ /* Create a variable length record */
+
+static void fix_length(uchar *rec, uint length) /* Make rec look like a 'length'-byte row inside the fixed STANDARD_LENGTH+60 buffer */
+{
+ bmove(rec+STANDARD_LENGTH,
+ "0123456789012345678901234567890123456789012345678901234567890",
+ length-STANDARD_LENGTH); /* fill bytes STANDARD_LENGTH..length with a digit pattern */
+ strfill((char*) rec+length,STANDARD_LENGTH+60-length,' '); /* space-pad the remainder of the 60-byte tail */
+} /* fix_length */
+
+
+/* Put maybe a blob in record */
+
+static int first_entry;
+
+static void put_blob_in_record(uchar *blob_pos, char **blob_buffer,
+ ulong *blob_length) /* Maybe store a blob at blob_pos: 4-byte length followed by a raw pointer into *blob_buffer */
+{
+ ulong i,length;
+ *blob_length= 0; /* default: no blob written */
+ if (use_blob)
+ {
+ if (! *blob_buffer &&
+ !(*blob_buffer=my_malloc((uint) use_blob,MYF(MY_WME)))) /* lazily allocate the shared blob buffer once */
+ {
+ use_blob= 0; /* allocation failed: disable blob testing for the rest of the run */
+ return;
+ }
+ if (rnd(10) == 0) /* put a blob in roughly 1 of 10 records */
+ {
+ if (first_entry++ == 0)
+ {
+ /* Ensure we have at least one blob of max length in file */
+ length= use_blob;
+ }
+ else
+ length=rnd(use_blob);
+ for (i=0 ; i < length ; i++)
+ (*blob_buffer)[i]=(char) (length+i); /* deterministic pattern, verified later when scanning */
+ int4store(blob_pos,length); /* 4-byte little-endian blob length */
+ memcpy_fixed(blob_pos+4,(char*) blob_buffer,sizeof(char*)); /* store pointer value to the blob data */
+ *blob_length= length;
+ }
+ else
+ {
+ int4store(blob_pos,0); /* empty blob */
+ }
+ }
+ return;
+}
+
+
+static void copy_key(MARIA_HA *info,uint inx,uchar *rec,uchar *key_buff) /* Build a search key for index 'inx' by concatenating its key segments from row 'rec' */
+{
+ HA_KEYSEG *keyseg;
+
+ for (keyseg=info->s->keyinfo[inx].seg ; keyseg->type ; keyseg++) /* segment array is terminated by type==0 */
+ {
+ memcpy(key_buff,rec+keyseg->start,(size_t) keyseg->length);
+ key_buff+=keyseg->length; /* append segments back to back; caller's buffer must be large enough */
+ }
+ return;
+}
diff --git a/storage/maria/ma_test3.c b/storage/maria/ma_test3.c
new file mode 100644
index 00000000000..040d6fa78c2
--- /dev/null
+++ b/storage/maria/ma_test3.c
@@ -0,0 +1,501 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Test of locking */
+
+#if !(defined (__NETWARE_) || defined (_WIN32)) /*no fork() in Windows*/
+
+#include "maria.h"
+#include <sys/types.h>
+#ifdef HAVE_SYS_WAIT_H
+# include <sys/wait.h>
+#endif
+#ifndef WEXITSTATUS
+# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8)
+#endif
+#ifndef WIFEXITED
+# define WIFEXITED(stat_val) (((stat_val) & 255) == 0)
+#endif
+
+
+#if defined(HAVE_LRAND48)
+#define rnd(X) (lrand48() % X)
+#define rnd_init(X) srand48(X)
+#else
+#define rnd(X) (random() % X)
+#define rnd_init(X) srandom(X)
+#endif
+
+
+const char *filename= "test3";
+uint tests=10,forks=10,pagecacheing=0;
+
+static void get_options(int argc, char *argv[]);
+void start_test(int id);
+int test_read(MARIA_HA *,int),test_write(MARIA_HA *,int,int),
+ test_update(MARIA_HA *,int,int),test_rrnd(MARIA_HA *,int);
+
+struct record {
+ uchar id[8];
+ uchar nr[4];
+ uchar text[10];
+} record;
+
+
+/*
+  Create the shared test table, fork 'forks' child processes that each
+  run start_test(), then wait for all children to finish.
+*/
+int main(int argc,char **argv)
+{
+  int status,wait_ret;
+  uint i=0;
+  MARIA_KEYDEF keyinfo[10];
+  MARIA_COLUMNDEF recinfo[10];
+  HA_KEYSEG keyseg[10][2];
+  MY_INIT(argv[0]);
+  get_options(argc,argv);
+
+  fprintf(stderr, "WARNING! this program is to test 'external locking'"
+          " (when several processes share a table through file locking)"
+          " which is not supported by Maria at all; expect errors."
+          " We may soon remove this program.\n");
+  maria_init();
+  bzero((char*) keyinfo,sizeof(keyinfo));
+  bzero((char*) recinfo,sizeof(recinfo));
+  bzero((char*) keyseg,sizeof(keyseg));
+  /* Key 0: space-packed text key on record.id */
+  keyinfo[0].seg= &keyseg[0][0];
+  keyinfo[0].seg[0].start=0;
+  keyinfo[0].seg[0].length=8;
+  keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT;
+  keyinfo[0].seg[0].flag=HA_SPACE_PACK;
+  keyinfo[0].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[0].keysegs=1;
+  keyinfo[0].flag = (uint8) HA_PACK_KEY;
+  keyinfo[0].block_length= 0;			/* Default block length */
+  /* Key 1: unique integer key on record.nr */
+  keyinfo[1].seg= &keyseg[1][0];
+  keyinfo[1].seg[0].start=8;
+  keyinfo[1].seg[0].length=4;		/* Long is always 4 in maria */
+  keyinfo[1].seg[0].type=HA_KEYTYPE_LONG_INT;
+  keyinfo[1].seg[0].flag=0;
+  keyinfo[1].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[1].keysegs=1;
+  keyinfo[1].flag =HA_NOSAME;
+  keyinfo[1].block_length= 0;                   /* Default block length */
+
+  /* Three fixed-length columns matching struct record */
+  recinfo[0].type=0;
+  recinfo[0].length=sizeof(record.id);
+  recinfo[1].type=0;
+  recinfo[1].length=sizeof(record.nr);
+  recinfo[2].type=0;
+  recinfo[2].length=sizeof(record.text);
+
+  puts("- Creating maria-file");
+  my_delete(filename,MYF(0));		/* Remove old locks under gdb */
+  if (maria_create(filename,BLOCK_RECORD, 2, &keyinfo[0],2,&recinfo[0],0,
+	(MARIA_UNIQUEDEF*) 0, (MARIA_CREATE_INFO*) 0,0))
+    exit(1);
+
+  rnd_init(0);
+  printf("- Starting %d processes\n",forks); fflush(stdout);
+  for (i=0 ; i < forks; i++)
+  {
+    if (!fork())
+    {
+      /* Child: run one test sequence, then exit */
+      start_test(i+1);
+      sleep(1);
+      return 0;
+    }
+    /* Parent: advance the RNG so children get different seeds of work */
+    VOID(rnd(1));
+  }
+
+  /* Reap all children before shutting down the library */
+  for (i=0 ; i < forks ; i++)
+    while ((wait_ret=wait(&status)) && wait_ret == -1);
+  maria_end();
+  return 0;
+}
+
+
+/*
+  Parse command line options.
+  -f#  number of child processes   -t#  tests per process
+  -K   enable page cache           -A   all flags
+  -?/-I/-V  print version/usage and exit
+  -#   DBUG push string
+*/
+static void get_options(int argc, char **argv)
+{
+  char *pos,*progname;
+
+  progname= argv[0];
+
+  while (--argc >0 && *(pos = *(++argv)) == '-' ) {
+    switch(*++pos) {
+    case 'f':
+      forks=atoi(++pos);
+      break;
+    case 't':
+      tests=atoi(++pos);
+      break;
+    case 'K':				/* Use key cacheing */
+      pagecacheing=1;
+      break;
+    case 'A':				/* All flags */
+      pagecacheing=1;
+      break;
+    case '?':
+    case 'I':
+    case 'V':
+      printf("%s  Ver 1.0 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE);
+      puts("By Monty, for your professional use\n");
+      puts("Test av locking with threads\n");
+      printf("Usage: %s [-?lKA] [-f#] [-t#]\n",progname);
+      exit(0);
+    case '#':
+      DBUG_PUSH (++pos);
+      break;
+    default:
+      printf("Illegal option: '%c'\n",*pos);
+      break;
+    }
+  }
+  return;
+}
+
+
+/*
+  Body of one child process: open the table twice, then run 'tests'
+  random operations (read/rrnd/write/update), optionally under a random
+  explicit table lock.  Exits the process with status 1 on any error.
+*/
+void start_test(int id)
+{
+  uint i;
+  int error,lock_type;
+  MARIA_INFO isam_info;
+  MARIA_HA *file,*file1,*file2=0,*lock;
+
+  if (!(file1=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED)) ||
+      !(file2=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED)))
+  {
+    fprintf(stderr,"Can't open isam-file: %s\n",filename);
+    exit(1);
+  }
+  /* Randomly enable the page cache in half of the cache-enabled runs */
+  if (pagecacheing && rnd(2) == 0)
+    init_pagecache(maria_pagecache, 65536L, 0, 0, MARIA_KEY_BLOCK_LENGTH,
+                   MY_WME);
+  printf("Process %d, pid: %ld\n",id,(long) getpid()); fflush(stdout);
+
+  for (error=i=0 ; i < tests && !error; i++)
+  {
+    /* Use one of the two handles at random for this iteration */
+    file= (rnd(2) == 1) ? file1 : file2;
+    lock=0 ; lock_type=0;
+    /* Occasionally hold an outer read/write lock across the operation */
+    if (rnd(10) == 0)
+    {
+      if (maria_lock_database(lock=(rnd(2) ? file1 : file2),
+			      lock_type=(rnd(2) == 0 ? F_RDLCK : F_WRLCK)))
+      {
+	fprintf(stderr,"%2d: start: Can't lock table %d\n",id,my_errno);
+	error=1;
+	break;
+      }
+    }
+    switch (rnd(4)) {
+    case 0: error=test_read(file,id); break;
+    case 1: error=test_rrnd(file,id); break;
+    case 2: error=test_write(file,id,lock_type); break;
+    case 3: error=test_update(file,id,lock_type); break;
+    }
+    if (lock)
+      maria_lock_database(lock,F_UNLCK);
+  }
+  if (!error)
+  {
+    maria_status(file1,&isam_info,HA_STATUS_VARIABLE);
+    printf("%2d: End of test. Records:  %ld  Deleted:  %ld\n",
+	   id,(long) isam_info.records, (long) isam_info.deleted);
+    fflush(stdout);
+  }
+
+  maria_close(file1);
+  maria_close(file2);
+  if (error)
+  {
+    printf("%2d: Aborted\n",id); fflush(stdout);
+    exit(1);
+  }
+}
+
+
+/*
+  Do 100 random exact-key lookups; on a miss fall back to rnext, then
+  rprev, counting each outcome.  Optionally takes an explicit read lock.
+  Returns 0 on success, 1 on error.
+*/
+int test_read(MARIA_HA *file,int id)
+{
+  uint i,lock,found,next,prev;
+  ulong find;
+
+  lock=0;
+  if (rnd(2) == 0)
+  {
+    lock=1;
+    if (maria_lock_database(file,F_RDLCK))
+    {
+      fprintf(stderr,"%2d: Can't lock table %d\n",id,my_errno);
+      return 1;
+    }
+  }
+
+  found=next=prev=0;
+  for (i=0 ; i < 100 ; i++)
+  {
+    find=rnd(100000);
+    if (!maria_rkey(file,record.id,1,(uchar*) &find, HA_WHOLE_KEY,
+                    HA_READ_KEY_EXACT))
+      found++;
+    else
+    {
+      if (my_errno != HA_ERR_KEY_NOT_FOUND)
+      {
+	fprintf(stderr,"%2d: Got error %d from read in read\n",id,my_errno);
+	return 1;
+      }
+      else if (!maria_rnext(file,record.id,1))
+	next++;
+      else
+      {
+	if (my_errno != HA_ERR_END_OF_FILE)
+	{
+	  fprintf(stderr,"%2d: Got error %d from rnext in read\n",id,my_errno);
+	  return 1;
+	}
+	else if (!maria_rprev(file,record.id,1))
+	  prev++;
+	else
+	{
+	  if (my_errno != HA_ERR_END_OF_FILE)
+	  {
+	    /* Fixed: this failure comes from rprev, not rnext */
+	    fprintf(stderr,"%2d: Got error %d from rprev in read\n",
+		    id,my_errno);
+	    return 1;
+	  }
+	}
+      }
+    }
+  }
+  if (lock)
+  {
+    if (maria_lock_database(file,F_UNLCK))
+    {
+      fprintf(stderr,"%2d: Can't unlock table\n",id);
+      return 1;
+    }
+  }
+  printf("%2d: read:   found: %5d  next: %5d   prev: %5d\n",
+	 id,found,next,prev);
+  fflush(stdout);
+  return 0;
+}
+
+
+/*
+  Scan the whole table with maria_rrnd() and count the records.
+  Optionally takes an explicit read lock and record cache.
+  Returns 0 on success, 1 on error.
+*/
+int test_rrnd(MARIA_HA *file,int id)
+{
+  uint count,lock;
+
+  lock=0;
+  if (rnd(2) == 0)
+  {
+    lock=1;
+    if (maria_lock_database(file,F_RDLCK))
+    {
+      fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno);
+      /*
+        Fixed: don't close the handle here; the caller (start_test)
+        closes both handles on error, so closing it here as well would
+        be a double close.
+      */
+      return 1;
+    }
+    if (rnd(2) == 0)
+      maria_extra(file,HA_EXTRA_CACHE,0);
+  }
+
+  count=0;
+  if (maria_rrnd(file,record.id,0L))
+  {
+    if (my_errno == HA_ERR_END_OF_FILE)
+      goto end;
+    fprintf(stderr,"%2d: Can't read first record (%d)\n",id,my_errno);
+    return 1;
+  }
+  for (count=1 ; !maria_rrnd(file,record.id,HA_OFFSET_ERROR) ;count++) ;
+  if (my_errno != HA_ERR_END_OF_FILE)
+  {
+    fprintf(stderr,"%2d: Got error %d from rrnd\n",id,my_errno);
+    return 1;
+  }
+
+end:
+  if (lock)
+  {
+    maria_extra(file,HA_EXTRA_NO_CACHE,0);
+    if (maria_lock_database(file,F_UNLCK))
+    {
+      fprintf(stderr,"%2d: Can't unlock table\n",id);
+      /* Fixed: exit with failure status, not exit(0), on unlock error */
+      exit(1);
+    }
+  }
+  printf("%2d: rrnd:   %5d\n",id,count); fflush(stdout);
+  return 0;
+}
+
+
+/*
+  Insert 10-109 random records keyed by this process's pid.
+  Takes a write lock when needed (always when upgrading from a read
+  lock); duplicate-key errors are expected and counted as non-inserts.
+  Returns 0 on success, 1 on error.
+*/
+int test_write(MARIA_HA *file,int id,int lock_type)
+{
+  uint i,tries,count,lock;
+
+  lock=0;
+  if (rnd(2) == 0 || lock_type == F_RDLCK)
+  {
+    lock=1;
+    if (maria_lock_database(file,F_WRLCK))
+    {
+      /* Upgrading a read lock to a write lock may legitimately deadlock */
+      if (lock_type == F_RDLCK && my_errno == EDEADLK)
+      {
+	printf("%2d: write:  deadlock\n",id); fflush(stdout);
+	return 0;
+      }
+      fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno);
+      /*
+        Fixed: don't close the handle here; the caller (start_test)
+        closes both handles on error, so closing it here as well would
+        be a double close.
+      */
+      return 1;
+    }
+    if (rnd(2) == 0)
+      maria_extra(file,HA_EXTRA_WRITE_CACHE,0);
+  }
+
+  sprintf((char*) record.id,"%7ld", (long) getpid());
+  strnmov((char*) record.text,"Testing...", sizeof(record.text));
+
+  tries=(uint) rnd(100)+10;
+  for (i=count=0 ; i < tries ; i++)
+  {
+    uint32 tmp=rnd(80000)+20000;
+    int4store(record.nr,tmp);
+    if (!maria_write(file,record.id))
+      count++;
+    else
+    {
+      if (my_errno != HA_ERR_FOUND_DUPP_KEY)
+      {
+	fprintf(stderr,"%2d: Got error %d (errno %d) from write\n",id,my_errno,
+		errno);
+	return 1;
+      }
+    }
+  }
+  if (lock)
+  {
+    maria_extra(file,HA_EXTRA_NO_CACHE,0);
+    if (maria_lock_database(file,F_UNLCK))
+    {
+      fprintf(stderr,"%2d: Can't unlock table\n",id);
+      /* Fixed: exit with failure status, not exit(0), on unlock error */
+      exit(1);
+    }
+  }
+  printf("%2d: write:  %5d\n",id,count); fflush(stdout);
+  return 0;
+}
+
+
+/*
+  Do 100 random lookups (with rnext/rprev fallback like test_read) and
+  update each found row with a new nr value and text "Updated".
+  Takes a write lock when needed.  Returns 0 on success, 1 on error.
+*/
+int test_update(MARIA_HA *file,int id,int lock_type)
+{
+  uint i,lock,found,next,prev,update;
+  uint32 tmp;
+  char find[4];
+  struct record new_record;
+
+  lock=0;
+  if (rnd(2) == 0 || lock_type == F_RDLCK)
+  {
+    lock=1;
+    if (maria_lock_database(file,F_WRLCK))
+    {
+      /* Upgrading a read lock to a write lock may legitimately deadlock */
+      if (lock_type == F_RDLCK && my_errno == EDEADLK)
+      {
+	printf("%2d: write:  deadlock\n",id); fflush(stdout);
+	return 0;
+      }
+      fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno);
+      return 1;
+    }
+  }
+  bzero((char*) &new_record,sizeof(new_record));
+  strmov((char*) new_record.text,"Updated");
+
+  found=next=prev=update=0;
+  for (i=0 ; i < 100 ; i++)
+  {
+    tmp=rnd(100000);
+    int4store(find,tmp);
+    if (!maria_rkey(file,record.id,1,(uchar*) find, HA_WHOLE_KEY,
+                    HA_READ_KEY_EXACT))
+      found++;
+    else
+    {
+      if (my_errno != HA_ERR_KEY_NOT_FOUND)
+      {
+	fprintf(stderr,"%2d: Got error %d from read in update\n",id,my_errno);
+	return 1;
+      }
+      else if (!maria_rnext(file,record.id,1))
+	next++;
+      else
+      {
+	if (my_errno != HA_ERR_END_OF_FILE)
+	{
+	  fprintf(stderr,"%2d: Got error %d from rnext in update\n",
+		  id,my_errno);
+	  return 1;
+	}
+	else if (!maria_rprev(file,record.id,1))
+	  prev++;
+	else
+	{
+	  if (my_errno != HA_ERR_END_OF_FILE)
+	  {
+	    /* Fixed: this failure comes from rprev, not rnext */
+	    fprintf(stderr,"%2d: Got error %d from rprev in update\n",
+		    id,my_errno);
+	    return 1;
+	  }
+	  /* Nothing positioned: skip the update for this iteration */
+	  continue;
+	}
+      }
+    }
+    memcpy_fixed(new_record.id,record.id,sizeof(record.id));
+    tmp=rnd(20000)+40000;
+    int4store(new_record.nr,tmp);
+    if (!maria_update(file,record.id,new_record.id))
+      update++;
+    else
+    {
+      /* Concurrent changes and duplicate keys are expected in this test */
+      if (my_errno != HA_ERR_RECORD_CHANGED &&
+	  my_errno != HA_ERR_RECORD_DELETED &&
+	  my_errno != HA_ERR_FOUND_DUPP_KEY)
+      {
+	fprintf(stderr,"%2d: Got error %d from update\n",id,my_errno);
+	return 1;
+      }
+    }
+  }
+  if (lock)
+  {
+    if (maria_lock_database(file,F_UNLCK))
+    {
+      /*
+        Fixed: 'id' was embedded inside the format string instead of
+        being passed as an argument.
+      */
+      fprintf(stderr,"%2d: Can't unlock table, error: %d\n",id,my_errno);
+      return 1;
+    }
+  }
+  printf("%2d: update: %5d\n",id,update); fflush(stdout);
+  return 0;
+}
+
+#else /* __NETWARE__ || __WIN__ */
+
+#include <stdio.h>
+
+int main()
+{
+ fprintf(stderr,"this test has not been ported to Netware or Windows\n");
+ return 0;
+}
+
+#endif /* __NETWARE__|| __WIN__ */
diff --git a/storage/maria/ma_test_all.res b/storage/maria/ma_test_all.res
new file mode 100644
index 00000000000..586aaf68020
--- /dev/null
+++ b/storage/maria/ma_test_all.res
@@ -0,0 +1,14 @@
+Running tests with dynamic row format
+Running tests with static row format
+Running tests with block row format
+Running tests with block row format and transactions
+ma_test2 -s -L -K -R1 -m2000 ; Should give error 135
+Error: 135 in write at record: 1099
+got error: 135 when using MARIA-database
+./maria_chk -sm test2 will warn that 'Datafile is almost full'
+maria_chk: MARIA file test2
+maria_chk: warning: Datafile is almost full, 65516 of 65534 used
+MARIA-table 'test2' is usable but should be fixed
+MARIA RECOVERY TESTS
+ALL RECOVERY TESTS OK
+!!!!!!!! BUT REMEMBER to FIX this BLOB issue !!!!!!!
diff --git a/storage/maria/ma_test_all.sh b/storage/maria/ma_test_all.sh
new file mode 100755
index 00000000000..041fbf3abe6
--- /dev/null
+++ b/storage/maria/ma_test_all.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+# This file is now deprecated and has been replaced by
+# unittest/ma_test_all-t
+#
+# If any arguments are given they are forwarded verbatim to
+# ma_test_all-t; otherwise the script is run through unit.pl.
+#
+
+if test -n "$1"; then
+
+  # unit.pl can't pass options to ma_test_all-t, so if anything
+  # was passed as an argument, assume the purpose was to pass
+  # them to ma_test_all-t and call it directly
+
+  # Fixed: quote "$@" so arguments containing spaces are preserved
+  unittest/ma_test_all-t "$@"
+else
+  perl ../../unittest/unit.pl run unittest/ma_test_all-t
+fi
diff --git a/storage/maria/ma_test_big.sh b/storage/maria/ma_test_big.sh
new file mode 100644
index 00000000000..6419d05e3a4
--- /dev/null
+++ b/storage/maria/ma_test_big.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+# This tests is good to find bugs in the redo/undo handling and in
+# finding bugs in blob handling
+#
+
+set -e
+a=15
+while test $a -le 5000
+do
+ echo $a
+ rm -f maria_log*
+ ma_test2 -s -L -K -W -P -M -T -c -b32768 -t4 -A1 -m$a > /dev/null
+ maria_read_log -a -s >& /dev/null
+ maria_chk -es test2
+ maria_read_log -a -s >& /dev/null
+ maria_chk -es test2
+ rm test2.MA?
+ maria_read_log -a -s >& /dev/null
+ maria_chk -es test2
+ a=$((a+1))
+done
diff --git a/storage/maria/ma_test_force_start.pl b/storage/maria/ma_test_force_start.pl
new file mode 100755
index 00000000000..8148b2f212b
--- /dev/null
+++ b/storage/maria/ma_test_force_start.pl
@@ -0,0 +1,238 @@
+#!/usr/bin/env perl
+
+
+use strict;
+use warnings;
+
+my $usage= <<EOF;
+This program tests that the options
+--aria-force-start-after-recovery-failures --aria-recover work as
+expected.
+It has to be run from directory mysql-test, and works with non-debug
+and debug binaries.
+Pass it option -d or -i (to test corruption of data or index file).
+EOF
+
+# -d currently exhibits BUG#36578
+# "Maria: maria-recover may fail to autorepair a table"
+
+die($usage) if (@ARGV == 0);
+
+my $corrupt_index;
+
+if ($ARGV[0] eq '-d')
+ {
+ $corrupt_index= 0;
+ }
+elsif ($ARGV[0] eq '-i')
+ {
+ $corrupt_index= 1;
+ }
+else
+ {
+ die($usage);
+ }
+
+my $force_after= 3;
+my $corrupt_file= $corrupt_index ? "MAI" : "MAD";
+my $corrupt_message=
+ "\\[ERROR\\] mysqld(.exe)*: Table '..test.t1' is marked as crashed and should be repaired";
+
+my $sql_name= "./var/tmp/create_table.sql";
+my $error_log_name= "./var/log/master.err";
+my @cmd_output;
+my $whatever; # garbage data
+$ENV{MTR_VERSION} = 1; # MTR2 does not have --start-and-exit
+my $base_server_cmd= "perl mysql-test-run.pl --mysqld=--aria-force-start-after-recovery-failures=$force_after --suite=maria maria.maria-recover ";
+if ($^O =~ /^mswin/i)
+ {
+ print <<EOF;
+WARNING: with Activestate Perl, mysql-test-run.pl --start-and-exit has a bug:
+it does not exit; cygwin perl recommended
+EOF
+ }
+my $iswindows= ( $^O =~ /win/i && $^O !~ /darwin/i );
+$base_server_cmd.= ($iswindows ? "--mysqld=--console" : "--mem");
+my $server_cmd;
+my $server_pid_name="./var/run/master.pid";
+my $server_pid;
+my $i; # count of server restarts
+sub kill_server;
+
+my $suffix= ($iswindows ? ".exe" : "");
+my $client_exe_path= "../client/release";
+# we use -f, sometimes -x is unexpectedly false in Cygwin
+if ( ! -f "$client_exe_path/mysql$suffix" )
+ {
+ $client_exe_path= "../client/relwithdebinfo";
+ if ( ! -f "$client_exe_path/mysql$suffix" )
+ {
+ $client_exe_path= "../client/debug";
+ if ( ! -f "$client_exe_path/mysql$suffix" )
+ {
+ $client_exe_path= "../client";
+ if ( ! -f "$client_exe_path/mysql$suffix" )
+ {
+ die("Cannot find 'mysql' executable\n");
+ }
+ }
+ }
+ }
+
+print "starting mysqld\n";
+$server_cmd= $base_server_cmd . " --start-and-exit 2>&1";
+@cmd_output=`$server_cmd`;
+die if $?;
+my $master_port= (grep (/Using MASTER_MYPORT .*= (\d+)$/, @cmd_output))[0];
+$master_port =~ s/.*= //;
+chomp $master_port;
+die unless $master_port > 0;
+
+my $client_cmd= "$client_exe_path/mysql -u root -h 127.0.0.1 -P $master_port test < $sql_name";
+
+open(FILE, ">", $sql_name) or die;
+
+# To exhibit BUG#36578 with -d, we don't create an index if -d. This is
+# because the presence of an index will cause repair-by-sort to be used,
+# where sort_get_next_record() is only called inside
+#_ma_create_index_by_sort(), so the latter function fails and in this
+# case retry_repair is set, so bug does not happen. Whereas without
+# an index, repair-with-key-cache is called, which calls
+# sort_get_next_record() whose failure itself does not cause a retry.
+
+print FILE "create table t1 (a varchar(1000)".
+ ($corrupt_index ? ", index(a)" : "") .") engine=aria;\n";
+print FILE <<EOF;
+insert into t1 values("ThursdayMorningsMarket");
+# If Recovery executes REDO_INDEX_NEW_PAGE it will overwrite our
+# intentional corruption; we make Recovery skip this record by bumping
+# create_rename_lsn using OPTIMIZE TABLE. This also makes sure to put
+# the pages on disk, so that we can corrupt them.
+optimize table t1;
+# mark table open, so that --aria-recover repairs it
+insert into t1 select concat(a,'b') from t1 limit 1;
+EOF
+close FILE;
+
+print "creating table\n";
+`$client_cmd`;
+die if $?;
+
+print "killing mysqld hard\n";
+kill_server(9);
+
+print "ruining " .
+ ($corrupt_index ? "first page of keys" : "bitmap page") .
+ " in table to test aria-recover\n";
+open(FILE, "+<", "./var/master-data/test/t1.$corrupt_file") or die;
+$whatever= ("\xAB" x 100);
+sysseek (FILE, $corrupt_index ? 8192 : (8192-100-100), 0) or die;
+syswrite (FILE, $whatever) or die;
+close FILE;
+
+print "ruining log to make recovery fail; mysqld should fail the $force_after first restarts\n";
+open(FILE, "+<", "./var/tmp/aria_log.00000001") or die;
+$whatever= ("\xAB" x 8192);
+sysseek (FILE, 99, 0) or die;
+syswrite (FILE, $whatever) or die;
+close FILE;
+
+$server_cmd= $base_server_cmd . " --start-dirty 2>&1";
+for($i= 1; $i <= $force_after; $i= $i + 1)
+ {
+ print "mysqld restart number $i... ";
+ unlink($error_log_name) or die;
+ `$server_cmd`;
+ # mysqld should return 1 when can't read log
+ die unless (($? >> 8) == 1);
+ open(FILE, "<", $error_log_name) or die;
+ @cmd_output= <FILE>;
+ close FILE;
+ die unless grep(/\[ERROR\] mysqld(.exe)*: Aria engine: log initialization failed/, @cmd_output);
+ die unless grep(/\[ERROR\] Plugin 'Aria' init function returned error./, @cmd_output);
+ print "failed - ok\n";
+ }
+
+print "mysqld restart number $i... ";
+unlink($error_log_name) or die;
+@cmd_output=`$server_cmd`;
+die if $?;
+open(FILE, "<", $error_log_name) or die;
+@cmd_output= <FILE>;
+close FILE;
+die unless grep(/\[Warning\] mysqld(.exe)*: Aria engine: removed all logs after [\d]+ consecutive failures of recovery from logs/, @cmd_output);
+die unless grep(/\[ERROR\] mysqld(.exe)*: File '.*tmp.aria_log.00000001' not found \(Errcode: 2\)/, @cmd_output);
+print "success - ok\n";
+
+open(FILE, ">", $sql_name) or die;
+print FILE <<EOF;
+set global aria_recover=normal;
+insert into t1 values('aaa');
+EOF
+close FILE;
+
+# verify corruption has not yet been noticed
+open(FILE, "<", $error_log_name) or die;
+@cmd_output= <FILE>;
+close FILE;
+die if grep(/$corrupt_message/, @cmd_output);
+
+print "inserting in table\n";
+`$client_cmd`;
+die if $?;
+print "table is usable - ok\n";
+
+open(FILE, "<", $error_log_name) or die;
+@cmd_output= <FILE>;
+close FILE;
+die unless grep(/$corrupt_message/, @cmd_output);
+die unless grep(/\[Warning\] Recovering table: '..test.t1'/, @cmd_output);
+print "was corrupted and automatically repaired - ok\n";
+
+# remove our traces
+kill_server(15);
+
+print "TEST ALL OK\n";
+
+# kills mysqld with signal given in parameter
+# kills mysqld with signal given in parameter
+# Reads the pid from $server_pid_name, sends the signal (on Windows,
+# taskkill /F is used instead since perl's kill() cannot reach the
+# service's main process), then polls until the process is gone.
+sub kill_server
+  {
+    my ($sig)= @_;
+    my $wait_count= 0;
+    my $kill_cmd;
+    my @kill_output;
+    open(FILE, "<", $server_pid_name) or die;
+    @cmd_output= <FILE>;
+    close FILE;
+    $server_pid= $cmd_output[0];
+    chomp $server_pid;
+    die unless $server_pid > 0;
+    if ($iswindows)
+      {
+        # On Windows, server_pid_name is not the "main" process id
+        # so perl's kill() does not see this process id.
+        # But taskkill works, though only with /F ("-9"-style kill).
+        $kill_cmd= "taskkill /F /PID $server_pid 2>&1";
+        @kill_output= `$kill_cmd`;
+        die unless grep(/has been terminated/, @kill_output);
+      }
+    else
+      {
+        kill($sig, $server_pid) or die;
+      }
+    while (1) # wait until mysqld process gone
+      {
+        if ($iswindows)
+          {
+            # Re-running taskkill reports "not found" once the pid is gone
+            @kill_output= `$kill_cmd`;
+            last if grep(/not found/, @kill_output);
+          }
+        else
+          {
+            # Signal 0 only probes for existence; fails when process is gone
+            kill (0, $server_pid) or last;
+          }
+        print "waiting for mysqld to die\n" if ($wait_count > 30);
+        $wait_count= $wait_count + 1;
+        select(undef, undef, undef, 0.1);  # sleep 100 ms between polls
+      }
+  }
diff --git a/storage/maria/ma_test_recovery b/storage/maria/ma_test_recovery
new file mode 100755
index 00000000000..0b20264c434
--- /dev/null
+++ b/storage/maria/ma_test_recovery
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+# Remove comment from next line if this script fails and you need more
+# information of what's going on
+
+# This file is deprecated and has been replaced with ma_test_recovery.pl
+
+# Fixed: quote "$@" so arguments containing spaces are preserved
+unittest/ma_test_recovery.pl "$@"
diff --git a/storage/maria/ma_unique.c b/storage/maria/ma_unique.c
new file mode 100644
index 00000000000..a90578c2162
--- /dev/null
+++ b/storage/maria/ma_unique.c
@@ -0,0 +1,244 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Functions to check if a row is unique */
+
+#include "maria_def.h"
+#include <m_ctype.h>
+
+/**
+ Check if there exist a row with the same hash
+
+ @notes
+ This function is not versioning safe. For the moment this is not a problem
+ as it's only used for internal temporary tables in MySQL for which there
+ isn't any versioning information.
+*/
+
+/*
+  Check whether a row with the same unique hash (and equal unique
+  columns) already exists.
+
+  Returns 0 when no duplicate exists, 1 when an identical row was found
+  (my_errno is then set to HA_ERR_FOUND_DUPP_UNIQUE and dup_key_pos
+  points at the duplicate).  cur_row.lastpos is restored in all cases.
+*/
+my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, uchar *record,
+			 ha_checksum unique_hash, my_off_t disk_pos)
+{
+  my_off_t lastpos=info->cur_row.lastpos;
+  MARIA_KEYDEF *keyinfo= &info->s->keyinfo[def->key];
+  uchar *key_buff= info->lastkey_buff2;
+  MARIA_KEY key;
+  DBUG_ENTER("_ma_check_unique");
+  DBUG_PRINT("enter",("unique_hash: %lu", (ulong) unique_hash));
+
+  /* Store the hash into the record and build the key image from it */
+  maria_unique_store(record+keyinfo->seg->start, unique_hash);
+  /* Can't be spatial so it's ok to call _ma_make_key directly here */
+  _ma_make_key(info, &key, def->key, key_buff, record, 0, 0);
+
+  /* The above changed info->lastkey_buff2. Inform maria_rnext_same(). */
+  info->update&= ~HA_STATE_RNEXT_SAME;
+
+  DBUG_ASSERT(key.data_length == MARIA_UNIQUE_HASH_LENGTH);
+  if (_ma_search(info, &key, SEARCH_FIND, info->s->state.key_root[def->key]))
+  {
+    info->page_changed=1;			/* Can't optimize read next */
+    info->cur_row.lastpos= lastpos;
+    DBUG_RETURN(0);				/* No matching rows */
+  }
+
+  /* Walk all rows with the same hash; compare full unique columns */
+  for (;;)
+  {
+    /* Skip the row being checked itself (disk_pos) */
+    if (info->cur_row.lastpos != disk_pos &&
+	!(*info->s->compare_unique)(info,def,record,info->cur_row.lastpos))
+    {
+      my_errno=HA_ERR_FOUND_DUPP_UNIQUE;
+      info->errkey= (int) def->key;
+      info->dup_key_pos= info->cur_row.lastpos;
+      info->page_changed= 1;			/* Can't optimize read next */
+      info->cur_row.lastpos= lastpos;
+      DBUG_PRINT("info",("Found duplicate"));
+      DBUG_RETURN(1);				/* Found identical  */
+    }
+    DBUG_ASSERT(info->last_key.data_length == MARIA_UNIQUE_HASH_LENGTH);
+    /* Stop when we leave the run of keys with this hash value */
+    if (_ma_search_next(info, &info->last_key, SEARCH_BIGGER,
+			info->s->state.key_root[def->key]) ||
+	bcmp(info->last_key.data, key_buff, MARIA_UNIQUE_HASH_LENGTH))
+    {
+      info->page_changed= 1;			/* Can't optimize read next */
+      info->cur_row.lastpos= lastpos;
+      DBUG_RETURN(0);				/* end of tree */
+    }
+  }
+}
+
+
+/*
+ Calculate a hash for a row
+
+ TODO
+ Add support for bit fields
+*/
+
+/*
+  Calculate a hash over all segments of a unique definition for 'record'.
+  NULL segments, VARCHAR and BLOB parts are handled; text types are
+  hashed through the collation's hash_sort so equal-collating strings
+  hash equally.
+*/
+ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *record)
+{
+  const uchar *pos, *end;
+  ha_checksum crc= 0;
+  ulong seed1=0, seed2= 4;
+  HA_KEYSEG *keyseg;
+
+  for (keyseg=def->seg ; keyseg < def->end ; keyseg++)
+  {
+    enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type;
+    uint length=keyseg->length;
+
+    if (keyseg->null_bit)
+    {
+      if (record[keyseg->null_pos] & keyseg->null_bit)
+      {
+	/*
+	  Change crc in a way different from an empty string or 0.
+	  (This is an optimisation;  The code will work even if this isn't
+	  done)
+	*/
+	crc=((crc << 8) + 511+
+	     (crc >> (8*sizeof(ha_checksum)-8)));
+	continue;
+      }
+    }
+    pos= record+keyseg->start;
+    if (keyseg->flag & HA_VAR_LENGTH_PART)
+    {
+      /* Actual length is stored in 1 or 2 bytes before the data */
+      uint pack_length=  keyseg->bit_start;
+      uint tmp_length= (pack_length == 1 ? (uint) *pos :
+                        uint2korr(pos));
+      pos+= pack_length;			/* Skip VARCHAR length */
+      set_if_smaller(length,tmp_length);
+    }
+    else if (keyseg->flag & HA_BLOB_PART)
+    {
+      uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos);
+      /* Fetch the blob data pointer stored after the length bytes */
+      memcpy_fixed((uchar*) &pos,pos+keyseg->bit_start,sizeof(char*));
+      if (!length || length > tmp_length)
+	length=tmp_length;			/* The whole blob */
+    }
+    end= pos+length;
+    if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 ||
+        type == HA_KEYTYPE_VARTEXT2)
+    {
+      /* Collation-aware hashing so case/space variants hash the same */
+      keyseg->charset->coll->hash_sort(keyseg->charset,
+                                       (const uchar*) pos, length, &seed1,
+                                       &seed2);
+      crc^= seed1;
+    }
+    else
+      /* Binary data: simple rotating byte checksum */
+      while (pos != end)
+	crc=((crc << 8) +
+	     (((uchar)  *pos++))) +
+	  (crc >> (8*sizeof(ha_checksum)-8));
+  }
+  return crc;
+}
+
+
+/*
+ compare unique key for two rows
+
+ TODO
+ Add support for bit fields
+
+ RETURN
+ 0 if both rows have equal unique value
+ 1 Rows are different
+*/
+
+/*
+  Compare the unique-constraint columns of two rows 'a' and 'b'.
+
+  null_are_equal controls whether two NULL values compare as equal.
+  Returns 0 when the rows have equal unique values, 1 when different.
+*/
+my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b,
+                        my_bool null_are_equal)
+{
+  const uchar *pos_a, *pos_b, *end;
+  HA_KEYSEG *keyseg;
+
+  for (keyseg=def->seg ; keyseg < def->end ; keyseg++)
+  {
+    enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type;
+    uint a_length, b_length;
+    a_length= b_length= keyseg->length;
+
+    /* If part is NULL it's regarded as different */
+    if (keyseg->null_bit)
+    {
+      uint tmp;
+      /* NULL in exactly one of the rows: always different */
+      if ((tmp=(a[keyseg->null_pos] & keyseg->null_bit)) !=
+	  (uint) (b[keyseg->null_pos] & keyseg->null_bit))
+	return 1;
+      if (tmp)
+      {
+	if (!null_are_equal)
+	  return 1;
+	continue;
+      }
+    }
+    pos_a= a+keyseg->start;
+    pos_b= b+keyseg->start;
+    if (keyseg->flag & HA_VAR_LENGTH_PART)
+    {
+      /* Read the 1- or 2-byte length prefix of each VARCHAR value */
+      uint pack_length= keyseg->bit_start;
+      if (pack_length == 1)
+      {
+        a_length= (uint) *pos_a++;
+        b_length= (uint) *pos_b++;
+      }
+      else
+      {
+        a_length= uint2korr(pos_a);
+        b_length= uint2korr(pos_b);
+        pos_a+= 2;				/* Skip VARCHAR length */
+        pos_b+= 2;
+      }
+      set_if_smaller(a_length, keyseg->length); /* Safety */
+      set_if_smaller(b_length, keyseg->length); /* safety */
+    }
+    else if (keyseg->flag & HA_BLOB_PART)
+    {
+      /* Only compare 'length' characters if length != 0 */
+      a_length= _ma_calc_blob_length(keyseg->bit_start,pos_a);
+      b_length= _ma_calc_blob_length(keyseg->bit_start,pos_b);
+      /* Check that a and b are of equal length */
+      if (keyseg->length)
+      {
+        /*
+          This is used in some cases when we are not interested in comparing
+          the whole length of the blob.
+        */
+        set_if_smaller(a_length, keyseg->length);
+        set_if_smaller(b_length, keyseg->length);
+      }
+      /* Fetch the blob data pointers stored after the length bytes */
+      memcpy_fixed((uchar*) &pos_a,pos_a+keyseg->bit_start,sizeof(char*));
+      memcpy_fixed((uchar*) &pos_b,pos_b+keyseg->bit_start,sizeof(char*));
+    }
+    if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 ||
+        type == HA_KEYTYPE_VARTEXT2)
+    {
+      /* Collation-aware compare; lengths may legitimately differ */
+      if (ha_compare_text(keyseg->charset, pos_a, a_length,
+                          pos_b, b_length, 0, 1))
+        return 1;
+    }
+    else
+    {
+      /* Binary compare: lengths must match exactly, then bytes */
+      if (a_length != b_length)
+        return 1;
+      end= pos_a+a_length;
+      while (pos_a != end)
+      {
+        if (*pos_a++ != *pos_b++)
+          return 1;
+      }
+    }
+  }
+  return 0;
+}
diff --git a/storage/maria/ma_update.c b/storage/maria/ma_update.c
new file mode 100644
index 00000000000..7b9e006ec43
--- /dev/null
+++ b/storage/maria/ma_update.c
@@ -0,0 +1,253 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "ma_fulltext.h"
+#include "ma_rt_index.h"
+#include "trnman.h"
+
+/**
+  Update an old row in a MARIA table.
+
+  @param info    open table handler; info->cur_row.lastpos must point at
+                 the row to update and HA_STATE_AKTIV must be set
+  @param oldrec  image of the row as it currently exists in the table
+  @param newrec  new row image to store
+
+  @return 0 on success, otherwise my_errno (also returned)
+*/
+
+int maria_update(register MARIA_HA *info, const uchar *oldrec, uchar *newrec)
+{
+  int flag,key_changed,save_errno;
+  reg3 my_off_t pos;
+  uint i;
+  uchar old_key_buff[MARIA_MAX_KEY_BUFF],*new_key_buff;
+  my_bool auto_key_changed= 0;
+  ulonglong changed;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo;
+  DBUG_ENTER("maria_update");
+  LINT_INIT(new_key_buff);
+  LINT_INIT(changed);
+
+  DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage",
+                  maria_print_error(info->s, HA_ERR_CRASHED);
+                  DBUG_RETURN(my_errno= HA_ERR_CRASHED););
+  /* A row must have been read (and be active) before it can be updated */
+  if (!(info->update & HA_STATE_AKTIV))
+  {
+    DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND);
+  }
+  if (share->options & HA_OPTION_READ_ONLY_DATA)
+  {
+    DBUG_RETURN(my_errno=EACCES);
+  }
+  if (share->state.state.key_file_length >= share->base.margin_key_file_length)
+  {
+    DBUG_RETURN(my_errno=HA_ERR_INDEX_FILE_FULL);
+  }
+  pos= info->cur_row.lastpos;
+  if (_ma_readinfo(info,F_WRLCK,1))
+    DBUG_RETURN(my_errno);
+
+  if ((*share->compare_record)(info,oldrec))
+  {
+    save_errno= my_errno;
+    DBUG_PRINT("warning", ("Got error from compare record"));
+    goto err_end;                               /* Record has changed */
+  }
+
+  /* Calculate and check all unique constraints */
+  key_changed=0;
+  for (i=0 ; i < share->state.header.uniques ; i++)
+  {
+    MARIA_UNIQUEDEF *def=share->uniqueinfo+i;
+    /* Only re-check a unique if the relevant columns actually changed */
+    if (_ma_unique_comp(def, newrec, oldrec,1) &&
+        _ma_check_unique(info, def, newrec, _ma_unique_hash(def, newrec),
+                         pos))
+    {
+      save_errno=my_errno;
+      goto err_end;
+    }
+  }
+  if (_ma_mark_file_changed(info))
+  {
+    save_errno=my_errno;
+    goto err_end;
+  }
+
+  /* Ensure we don't try to restore auto_increment if it doesn't change */
+  info->last_auto_increment= ~(ulonglong) 0;
+
+  /* Check which keys changed from the original row */
+
+  new_key_buff= info->lastkey_buff2;
+  changed=0;
+  for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++)
+  {
+    if (maria_is_key_active(share->state.key_map, i))
+    {
+      if (keyinfo->flag & HA_FULLTEXT )
+      {
+        if (_ma_ft_cmp(info,i,oldrec, newrec))
+        {
+          if ((int) i == info->lastinx)
+          {
+            /*
+              We are changing the index we are reading on. Mark that
+              the index data has changed and we need to do a full search
+              when doing read-next
+            */
+            key_changed|=HA_STATE_WRITTEN;
+          }
+          changed|=((ulonglong) 1 << i);
+          if (_ma_ft_update(info,i,old_key_buff,oldrec,newrec,pos))
+            goto err;
+        }
+      }
+      else
+      {
+        MARIA_KEY new_key, old_key;
+
+        (*keyinfo->make_key)(info,&new_key, i, new_key_buff, newrec,
+                             pos, info->trn->trid);
+        (*keyinfo->make_key)(info,&old_key, i, old_key_buff,
+                             oldrec, pos, info->cur_row.trid);
+
+        /* The above changed info->lastkey2. Inform maria_rnext_same(). */
+        info->update&= ~HA_STATE_RNEXT_SAME;
+
+        if (new_key.data_length != old_key.data_length ||
+            memcmp(old_key.data, new_key.data, new_key.data_length))
+        {
+          if ((int) i == info->lastinx)
+            key_changed|=HA_STATE_WRITTEN;      /* Mark that keyfile changed */
+          changed|=((ulonglong) 1 << i);
+          keyinfo->version++;
+          if (keyinfo->ck_delete(info,&old_key))
+            goto err;
+          if (keyinfo->ck_insert(info,&new_key))
+            goto err;
+          if (share->base.auto_key == i+1)
+            auto_key_changed=1;
+        }
+      }
+    }
+  }
+
+  if (share->calc_checksum)
+  {
+    /*
+      We can't use the row based checksum as this doesn't have enough
+      precision (one byte, while the table's is more bytes).
+      At least _ma_check_unique() modifies the 'newrec' record, so checksum
+      has to be computed _after_ it. Nobody apparently modifies 'oldrec'.
+      We need to pass the old row's checksum down to (*update_record)(), we do
+      this via info->new_row.checksum (not intuitive but existing code
+      mandated that cur_row is the new row).
+      If (*update_record)() fails, table will be marked corrupted so no need
+      to revert the live checksum change.
+    */
+    info->cur_row.checksum= (*share->calc_checksum)(info, newrec);
+    info->new_row.checksum= (*share->calc_checksum)(info, oldrec);
+    info->state->checksum+= info->cur_row.checksum - info->new_row.checksum;
+  }
+
+  if ((*share->update_record)(info, pos, oldrec, newrec))
+    goto err;
+
+  /*
+    Fixed: this condition used bitwise '&' instead of logical '&&'.
+    Both operands are booleans so the result was the same, but the
+    logical operator states the intent (and short-circuits) correctly.
+  */
+  if (auto_key_changed && !share->now_transactional)
+  {
+    const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg;
+    const uchar *key= newrec + keyseg->start;
+    set_if_bigger(share->state.auto_increment,
+                  ma_retrieve_auto_increment(key, keyseg->type));
+  }
+
+  /*
+    We can't yet have HA_STATE_AKTIV here, as block_record doesn't support it
+  */
+  info->update= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED | key_changed);
+  share->state.changed|= STATE_NOT_MOVABLE | STATE_NOT_ZEROFILLED;
+  info->state->changed= 1;
+
+  /*
+    Every Maria function that updates Maria table must end with
+    call to _ma_writeinfo(). If operation (second param of
+    _ma_writeinfo()) is not 0 it sets share->changed to 1, that is
+    flags that data has changed. If operation is 0, this function
+    equals to no-op in this case.
+
+    ma_update() must always pass !0 value as operation, since even if
+    there is no index change there could be data change.
+  */
+  VOID(_ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE));
+  allow_break();                                /* Allow SIGHUP & SIGINT */
+  if (info->invalidator != 0)
+  {
+    DBUG_PRINT("info", ("invalidator... '%s' (update)",
+                        share->open_file_name.str));
+    (*info->invalidator)(share->open_file_name.str);
+    info->invalidator=0;
+  }
+  DBUG_RETURN(0);
+
+err:
+  DBUG_PRINT("error",("key: %d errno: %d",i,my_errno));
+  save_errno= my_errno;
+  DBUG_ASSERT(save_errno);
+  if (!save_errno)
+    save_errno= HA_ERR_INTERNAL_ERROR;          /* Should never happen */
+
+  if (my_errno == HA_ERR_FOUND_DUPP_KEY || my_errno == HA_ERR_OUT_OF_MEM ||
+      my_errno == HA_ERR_RECORD_FILE_FULL)
+  {
+    /* Recoverable error: undo the key changes done so far, newest first */
+    info->errkey= (int) i;
+    flag=0;
+    do
+    {
+      if (((ulonglong) 1 << i) & changed)
+      {
+        if (share->keyinfo[i].flag & HA_FULLTEXT)
+        {
+          /* flag++ skips the failing key itself on the first iteration */
+          if ((flag++ && _ma_ft_del(info,i,new_key_buff,newrec,pos)) ||
+              _ma_ft_add(info,i,old_key_buff,oldrec,pos))
+            break;
+        }
+        else
+        {
+          MARIA_KEY new_key, old_key;
+          (*share->keyinfo[i].make_key)(info, &new_key, i, new_key_buff,
+                                        newrec, pos,
+                                        info->trn->trid);
+          (*share->keyinfo[i].make_key)(info, &old_key, i, old_key_buff,
+                                        oldrec, pos, info->cur_row.trid);
+          if ((flag++ && _ma_ck_delete(info, &new_key)) ||
+              _ma_ck_write(info, &old_key))
+            break;
+        }
+      }
+    } while (i-- != 0);
+  }
+  else
+  {
+    maria_print_error(share, HA_ERR_CRASHED);
+    maria_mark_crashed(info);
+  }
+  info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_ROW_CHANGED |
+                 key_changed);
+
+ err_end:
+  VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+  allow_break();                                /* Allow SIGHUP & SIGINT */
+  if (save_errno == HA_ERR_KEY_NOT_FOUND)
+  {
+    maria_print_error(share, HA_ERR_CRASHED);
+    save_errno=HA_ERR_CRASHED;
+  }
+  DBUG_RETURN(my_errno=save_errno);
+} /* maria_update */
diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c
new file mode 100644
index 00000000000..02eeec754ee
--- /dev/null
+++ b/storage/maria/ma_write.c
@@ -0,0 +1,2461 @@
+/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+ Copyright (C) 2008-2009 Sun Microsystems, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Write a row to a MARIA table */
+
+#include "ma_fulltext.h"
+#include "ma_rt_index.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+#include "ma_blockrec.h"
+
+#define MAX_POINTER_LENGTH 8
+
+ /* Functions declared in this file */
+
+static int w_search(MARIA_HA *info, uint32 comp_flag,
+ MARIA_KEY *key, my_off_t page,
+ MARIA_PAGE *father_page, uchar *father_keypos,
+ my_bool insert_last);
+static int _ma_balance_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ MARIA_KEY *key, MARIA_PAGE *curr_page,
+ MARIA_PAGE *father_page,
+ uchar *father_key_pos, MARIA_KEY_PARAM *s_temp);
+static uchar *_ma_find_last_pos(MARIA_KEY *int_key,
+ MARIA_PAGE *page, uchar **after_key);
+static my_bool _ma_ck_write_tree(register MARIA_HA *info, MARIA_KEY *key);
+static my_bool _ma_ck_write_btree(register MARIA_HA *info, MARIA_KEY *key);
+static my_bool _ma_ck_write_btree_with_log(MARIA_HA *, MARIA_KEY *, my_off_t *,
+ uint32);
+static my_bool _ma_log_split(MARIA_PAGE *page, uint org_length,
+ uint new_length,
+ const uchar *key_pos,
+ uint key_length, int move_length,
+ enum en_key_op prefix_or_suffix,
+ const uchar *data, uint data_length,
+ uint changed_length);
+static my_bool _ma_log_del_prefix(MARIA_PAGE *page,
+ uint org_length, uint new_length,
+ const uchar *key_pos, uint key_length,
+ int move_length);
+static my_bool _ma_log_key_middle(MARIA_PAGE *page,
+ uint new_length,
+ uint data_added_first,
+ uint data_changed_first,
+ uint data_deleted_last,
+ const uchar *key_pos,
+ uint key_length, int move_length);
+
+/*
+  @brief Default allocator of a file position for a new row
+
+  @note
+  This is only called for non transactional tables and not for block format
+  which is why we use info->state here.
+*/
+
+MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info,
+                                        const uchar *record
+                                        __attribute__((unused)))
+{
+  /* Reuse the first deleted-row slot unless the caller forces appending */
+  if (info->s->state.dellink != HA_OFFSET_ERROR &&
+      !info->append_insert_at_end)
+    return info->s->state.dellink;
+  /* No reusable hole (or append forced): place the row at end of file */
+  return info->state->data_file_length;
+}
+
+/**
+  Default write-abort hook: nothing to roll back, always reports success.
+*/
+my_bool _ma_write_abort_default(MARIA_HA *info __attribute__((unused)))
+{
+  return 0;
+}
+
+
+/*
+  Write a new record to a MARIA table
+
+  SYNOPSIS
+    maria_write()
+    info    open table handler
+    record  row image in the table's record format
+
+  RETURN
+    0   ok
+    #   error; my_errno holds the error code (also returned)
+*/
+
+int maria_write(MARIA_HA *info, uchar *record)
+{
+  MARIA_SHARE *share= info->s;
+  uint i;
+  int save_errno;
+  MARIA_RECORD_POS filepos;
+  uchar *buff;
+  my_bool lock_tree= share->lock_key_trees;
+  my_bool fatal_error;
+  MARIA_KEYDEF *keyinfo;
+  DBUG_ENTER("maria_write");
+  DBUG_PRINT("enter",("index_file: %d data_file: %d",
+                      share->kfile.file, info->dfile.file));
+
+  DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage",
+                  maria_print_error(info->s, HA_ERR_CRASHED);
+                  DBUG_RETURN(my_errno= HA_ERR_CRASHED););
+  if (share->options & HA_OPTION_READ_ONLY_DATA)
+  {
+    DBUG_RETURN(my_errno=EACCES);
+  }
+  if (_ma_readinfo(info,F_WRLCK,1))
+    DBUG_RETURN(my_errno);
+  dont_break();                         /* Dont allow SIGHUP or SIGINT */
+
+  /* A one-row limit marks an internal "system" table: refuse a second row */
+  if (share->base.reloc == (ha_rows) 1 &&
+      share->base.records == (ha_rows) 1 &&
+      share->state.state.records == (ha_rows) 1)
+  {                                     /* System file */
+    my_errno=HA_ERR_RECORD_FILE_FULL;
+    goto err2;
+  }
+  if (share->state.state.key_file_length >= share->base.margin_key_file_length)
+  {
+    my_errno=HA_ERR_INDEX_FILE_FULL;
+    goto err2;
+  }
+  if (_ma_mark_file_changed(info))
+    goto err2;
+
+  /* Calculate and check all unique constraints */
+  for (i=0 ; i < share->state.header.uniques ; i++)
+  {
+    if (_ma_check_unique(info,share->uniqueinfo+i,record,
+                         _ma_unique_hash(share->uniqueinfo+i,record),
+                         HA_OFFSET_ERROR))
+      goto err2;
+  }
+
+  /* Ensure we don't try to restore auto_increment if it doesn't change */
+  info->last_auto_increment= ~(ulonglong) 0;
+
+  if ((info->opt_flag & OPT_NO_ROWS))
+    filepos= HA_OFFSET_ERROR;
+  else
+  {
+    /*
+      This may either calculate a record or, or write the record and return
+      the record id
+    */
+    if ((filepos= (*share->write_record_init)(info, record)) ==
+        HA_OFFSET_ERROR)
+      goto err2;
+  }
+
+  /* Write all keys to indextree */
+  buff= info->lastkey_buff2;
+  for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++)
+  {
+    MARIA_KEY int_key;
+    if (maria_is_key_active(share->state.key_map, i))
+    {
+      /* Keys buffered by bulk insert do not need the root lock here */
+      my_bool local_lock_tree= (lock_tree &&
+                                !(info->bulk_insert &&
+                                  is_tree_inited(&info->bulk_insert[i])));
+      if (local_lock_tree)
+      {
+        rw_wrlock(&keyinfo->root_lock);
+        keyinfo->version++;
+      }
+      if (keyinfo->flag & HA_FULLTEXT )
+      {
+        if (_ma_ft_add(info,i, buff,record,filepos))
+        {
+          if (local_lock_tree)
+            rw_unlock(&keyinfo->root_lock);
+          DBUG_PRINT("error",("Got error: %d on write",my_errno));
+          goto err;
+        }
+      }
+      else
+      {
+        /*
+          Retry loop: on a duplicate-key conflict with another (still
+          running) transaction we wait for that transaction to end and
+          then try the insert again.
+        */
+        while (keyinfo->ck_insert(info,
+                                  (*keyinfo->make_key)(info, &int_key, i,
+                                                       buff, record, filepos,
+                                                       info->trn->trid)))
+        {
+          TRN *blocker;
+          DBUG_PRINT("error",("Got error: %d on write",my_errno));
+          /*
+            explicit check to filter out temp tables, they aren't
+            transactional and don't have a proper TRN so the code
+            below doesn't work for them.
+            Also, filter out non-thread maria use, and table modified in
+            the same transaction.
+            At last, filter out non-dup-unique errors.
+          */
+          if (!local_lock_tree)
+            goto err;
+          if (info->dup_key_trid == info->trn->trid ||
+              my_errno != HA_ERR_FOUND_DUPP_KEY)
+          {
+            rw_unlock(&keyinfo->root_lock);
+            goto err;
+          }
+          /* Different TrIDs: table must be transactional */
+          DBUG_ASSERT(share->base.born_transactional);
+          /*
+            If transactions are disabled, and dup_key_trid is different from
+            our TrID, it must be ALTER TABLE with dup_key_trid==0 (no
+            transaction). ALTER TABLE does have MARIA_HA::TRN not dummy but
+            puts TrID=0 in rows/keys.
+          */
+          DBUG_ASSERT(share->now_transactional ||
+                      (info->dup_key_trid == 0));
+          /*
+            NOTE(review): every path below unlocks blocker->state_lock,
+            which implies trnman_trid_to_trn() returns with that mutex
+            held when a TRN is found - confirm against trnman.c.
+          */
+          blocker= trnman_trid_to_trn(info->trn, info->dup_key_trid);
+          /*
+            if blocker TRN was not found, it means that the conflicting
+            transaction was committed long time ago. It could not be
+            aborted, as it would have to wait on the key tree lock
+            to remove the conflicting key it has inserted.
+          */
+          if (!blocker || blocker->commit_trid != ~(TrID)0)
+          { /* committed */
+            if (blocker)
+              pthread_mutex_unlock(& blocker->state_lock);
+            rw_unlock(&keyinfo->root_lock);
+            goto err;
+          }
+          /* Release the tree lock before blocking, to not stall readers */
+          rw_unlock(&keyinfo->root_lock);
+          {
+            /* running. now we wait */
+            WT_RESOURCE_ID rc;
+            int res;
+            const char *old_proc_info;
+
+            rc.type= &ma_rc_dup_unique;
+            /* TODO savepoint id when we'll have them */
+            rc.value= (intptr)blocker;
+            res= wt_thd_will_wait_for(info->trn->wt, blocker->wt, & rc);
+            if (res != WT_OK)
+            {
+              pthread_mutex_unlock(& blocker->state_lock);
+              my_errno= HA_ERR_LOCK_DEADLOCK;
+              goto err;
+            }
+            old_proc_info= proc_info_hook(0,
+                                          "waiting for a resource",
+                                          __func__, __FILE__, __LINE__);
+            res= wt_thd_cond_timedwait(info->trn->wt, & blocker->state_lock);
+            proc_info_hook(0, old_proc_info, __func__, __FILE__, __LINE__);
+
+            pthread_mutex_unlock(& blocker->state_lock);
+            if (res != WT_OK)
+            {
+              my_errno= res == WT_TIMEOUT ? HA_ERR_LOCK_WAIT_TIMEOUT
+                                          : HA_ERR_LOCK_DEADLOCK;
+              goto err;
+            }
+          }
+          rw_wrlock(&keyinfo->root_lock);
+#ifndef MARIA_CANNOT_ROLLBACK
+          keyinfo->version++;
+#endif
+        }
+      }
+
+      /* The above changed info->lastkey2. Inform maria_rnext_same(). */
+      info->update&= ~HA_STATE_RNEXT_SAME;
+
+      if (local_lock_tree)
+        rw_unlock(&keyinfo->root_lock);
+    }
+  }
+  if (share->calc_write_checksum)
+    info->cur_row.checksum= (*share->calc_write_checksum)(info,record);
+  if (filepos != HA_OFFSET_ERROR)
+  {
+    if ((*share->write_record)(info,record))
+      goto err;
+    info->state->checksum+= info->cur_row.checksum;
+  }
+  if (!share->now_transactional)
+  {
+    if (share->base.auto_key != 0)
+    {
+      /* Keep the live auto_increment counter in sync with the stored key */
+      const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg;
+      const uchar *key= record + keyseg->start;
+      set_if_bigger(share->state.auto_increment,
+                    ma_retrieve_auto_increment(key, keyseg->type));
+    }
+  }
+  info->state->records++;
+  info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_WRITTEN |
+                 HA_STATE_ROW_CHANGED);
+  share->state.changed|= STATE_NOT_MOVABLE | STATE_NOT_ZEROFILLED;
+  info->state->changed= 1;
+
+  info->cur_row.lastpos= filepos;
+  VOID(_ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE));
+  if (info->invalidator != 0)
+  {
+    DBUG_PRINT("info", ("invalidator... '%s' (update)",
+                        share->open_file_name.str));
+    (*info->invalidator)(share->open_file_name.str);
+    info->invalidator=0;
+  }
+
+  /*
+    Update status of the table. We need to do so after each row write
+    for the log tables, as we want the new row to become visible to
+    other threads as soon as possible. We don't lock mutex here
+    (as it is required by pthread memory visibility rules) as (1) it's
+    not critical to use outdated share->is_log_table value (2) locking
+    mutex here for every write is too expensive.
+  */
+  if (share->is_log_table)
+    _ma_update_status((void*) info);
+
+  allow_break();                        /* Allow SIGHUP & SIGINT */
+  DBUG_RETURN(0);
+
+err:
+  save_errno= my_errno;
+  fatal_error= 0;
+  if (my_errno == HA_ERR_FOUND_DUPP_KEY ||
+      my_errno == HA_ERR_RECORD_FILE_FULL ||
+      my_errno == HA_ERR_LOCK_DEADLOCK ||
+      my_errno == HA_ERR_LOCK_WAIT_TIMEOUT ||
+      my_errno == HA_ERR_NULL_IN_SPATIAL ||
+      my_errno == HA_ERR_OUT_OF_MEM)
+  {
+    if (info->bulk_insert)
+    {
+      uint j;
+      for (j=0 ; j < share->base.keys ; j++)
+        maria_flush_bulk_insert(info, j);
+    }
+    info->errkey= (int) i;
+    /*
+      We delete keys in the reverse order of insertion. This is the order that
+      a rollback would do and is important for CLR_ENDs generated by
+      _ma_ft|ck_delete() and write_record_abort() to work (with any other
+      order they would cause wrong jumps in the chain).
+    */
+    while ( i-- > 0)
+    {
+      if (maria_is_key_active(share->state.key_map, i))
+      {
+        my_bool local_lock_tree= (lock_tree &&
+                                  !(info->bulk_insert &&
+                                    is_tree_inited(&info->bulk_insert[i])));
+        keyinfo= share->keyinfo + i;
+        if (local_lock_tree)
+          rw_wrlock(&keyinfo->root_lock);
+        /**
+          @todo RECOVERY BUG
+          The key deletes below should generate CLR_ENDs
+        */
+        if (keyinfo->flag & HA_FULLTEXT)
+        {
+          if (_ma_ft_del(info,i,buff,record,filepos))
+          {
+            if (local_lock_tree)
+              rw_unlock(&keyinfo->root_lock);
+            break;
+          }
+        }
+        else
+        {
+          MARIA_KEY key;
+          if (_ma_ck_delete(info,
+                            (*keyinfo->make_key)(info, &key, i, buff, record,
+                                                 filepos, info->trn->trid)))
+          {
+            if (local_lock_tree)
+              rw_unlock(&keyinfo->root_lock);
+            break;
+          }
+        }
+        if (local_lock_tree)
+          rw_unlock(&keyinfo->root_lock);
+      }
+    }
+  }
+  else
+    fatal_error= 1;
+
+  if ((*share->write_record_abort)(info))
+    fatal_error= 1;
+  if (fatal_error)
+  {
+    maria_print_error(info->s, HA_ERR_CRASHED);
+    maria_mark_crashed(info);
+  }
+
+  info->update= (HA_STATE_CHANGED | HA_STATE_WRITTEN | HA_STATE_ROW_CHANGED);
+  my_errno=save_errno;
+err2:
+  save_errno=my_errno;
+  DBUG_ASSERT(save_errno);
+  if (!save_errno)
+    save_errno= HA_ERR_INTERNAL_ERROR;          /* Should never happen */
+  DBUG_PRINT("error", ("got error: %d", save_errno));
+  VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+  allow_break();                        /* Allow SIGHUP & SIGINT */
+  DBUG_RETURN(my_errno=save_errno);
+} /* maria_write */
+
+
+/*
+  Write one key, routing it either to the bulk-insert key cache or
+  straight into the btree.
+
+  TODO
+  Remove this function and have bulk insert change keyinfo->ck_insert
+  to point to the right function
+*/
+
+my_bool _ma_ck_write(MARIA_HA *info, MARIA_KEY *key)
+{
+  my_bool via_bulk_cache;
+  DBUG_ENTER("_ma_ck_write");
+
+  /* Bulk insert buffers keys per-index in a tree and flushes them later */
+  via_bulk_cache= (info->bulk_insert != 0 &&
+                   is_tree_inited(&info->bulk_insert[key->keyinfo->key_nr]));
+  DBUG_RETURN(via_bulk_cache ?
+              _ma_ck_write_tree(info, key) :
+              _ma_ck_write_btree(info, key));
+} /* _ma_ck_write */
+
+
+/**********************************************************************
+  Insert key into btree (normal case)
+**********************************************************************/
+
+static my_bool _ma_ck_write_btree(MARIA_HA *info, MARIA_KEY *key)
+{
+  my_bool error;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  my_off_t *root= &info->s->state.key_root[keyinfo->key_nr];
+  DBUG_ENTER("_ma_ck_write_btree");
+
+  error= _ma_ck_write_btree_with_log(info, key, root,
+                                     keyinfo->write_comp_flag | key->flag);
+  /*
+    info->ft1_to_ft2 is set during the insert when a popular fulltext
+    word is converted to a two-level (ft2) tree; finish the conversion
+    here and free the collected key array in all cases.
+  */
+  if (info->ft1_to_ft2)
+  {
+    if (!error)
+      error= _ma_ft_convert_to_ft2(info, key);
+    delete_dynamic(info->ft1_to_ft2);
+    my_free(info->ft1_to_ft2, MYF(0));
+    info->ft1_to_ft2=0;
+  }
+  DBUG_RETURN(error);
+} /* _ma_ck_write_btree */
+
+
+/**
+  @brief Write a key to the b-tree, logging an UNDO entry if transactional
+
+  @param info       open table handler
+  @param key        key to insert (may be modified by the insert itself)
+  @param root       in/out pointer to the index root position
+  @param comp_flag  comparison flags for the b-tree search
+
+  @retval 1 error
+  @retval 0 ok
+*/
+
+static my_bool _ma_ck_write_btree_with_log(MARIA_HA *info, MARIA_KEY *key,
+                                           my_off_t *root, uint32 comp_flag)
+{
+  MARIA_SHARE *share= info->s;
+  LSN lsn= LSN_IMPOSSIBLE;
+  int error;
+  my_off_t new_root= *root;
+  uchar key_buff[MARIA_MAX_KEY_BUFF];
+  MARIA_KEY org_key;
+  DBUG_ENTER("_ma_ck_write_btree_with_log");
+
+  LINT_INIT_STRUCT(org_key);
+  if (share->now_transactional)
+  {
+    /* Save original value as the key may change */
+    org_key= *key;
+    memcpy(key_buff, key->data, key->data_length + key->ref_length);
+  }
+
+  error= _ma_ck_real_write_btree(info, key, &new_root, comp_flag);
+  if (!error && share->now_transactional)
+  {
+    /* Log the original value */
+    *key= org_key;
+    key->data= key_buff;
+    /* _ma_write_undo_key_insert() also stores new_root into *root */
+    error= _ma_write_undo_key_insert(info, key, root, new_root, &lsn);
+  }
+  else
+  {
+    /* Taken on error or when not transactional: publish root directly */
+    *root= new_root;
+    _ma_fast_unlock_key_del(info);
+  }
+  /* Unpin pages in every case; lsn is LSN_IMPOSSIBLE unless UNDO was logged */
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+
+  DBUG_RETURN(error != 0);
+} /* _ma_ck_write_btree_with_log */
+
+
+/**
+  @brief Write a key to the b-tree
+
+  Descends from *root; if the tree is empty or the insert propagated a
+  key up past the root (w_search() > 0), a new root page is created.
+
+  @retval 1 error
+  @retval 0 ok
+*/
+
+my_bool _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEY *key, my_off_t *root,
+                                uint32 comp_flag)
+{
+  int error;
+  DBUG_ENTER("_ma_ck_real_write_btree");
+
+  /* key_length parameter is used only if comp_flag is SEARCH_FIND */
+  if (*root == HA_OFFSET_ERROR ||
+      (error= w_search(info, comp_flag, key, *root, (MARIA_PAGE *) 0,
+                       (uchar*) 0, 1)) > 0)
+    error= _ma_enlarge_root(info, key, root);
+  DBUG_RETURN(error != 0);
+} /* _ma_ck_real_write_btree */
+
+
+/**
+  @brief Make a new root with key as only pointer
+
+  Builds the page in info->buff, allocates a new key page for it and
+  stores its position in *root.
+
+  @retval 1 error
+  @retval 0 ok
+*/
+
+my_bool _ma_enlarge_root(MARIA_HA *info, MARIA_KEY *key, my_off_t *root)
+{
+  uint t_length, nod_flag;
+  MARIA_KEY_PARAM s_temp;
+  MARIA_SHARE *share= info->s;
+  MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+  my_bool res= 0;
+  DBUG_ENTER("_ma_enlarge_root");
+
+  page.info= info;
+  page.keyinfo= keyinfo;
+  page.buff= info->buff;
+  page.flag= 0;
+
+  /* If there was an old root, the new root is a node pointing to it */
+  nod_flag= (*root != HA_OFFSET_ERROR) ? share->base.key_reflength : 0;
+  /* Store pointer to prev page if nod */
+  _ma_kpointer(info, page.buff + share->keypage_header, *root);
+  t_length= (*keyinfo->pack_key)(key, nod_flag, (uchar*) 0,
+                                 (uchar*) 0, (uchar*) 0, &s_temp);
+  page.size= share->keypage_header + t_length + nod_flag;
+
+  bzero(page.buff, share->keypage_header);
+  _ma_store_keynr(share, page.buff, keyinfo->key_nr);
+  if (nod_flag)
+    page.flag|= KEYPAGE_FLAG_ISNOD;
+  if (key->flag & (SEARCH_USER_KEY_HAS_TRANSID | SEARCH_PAGE_KEY_HAS_TRANSID))
+    page.flag|= KEYPAGE_FLAG_HAS_TRANSID;
+  (*keyinfo->store_key)(keyinfo, page.buff + share->keypage_header +
+                        nod_flag, &s_temp);
+
+  /* Mark that info->buff was used */
+  info->keyread_buff_used= info->page_changed= 1;
+  if ((page.pos= _ma_new(info, PAGECACHE_PRIORITY_HIGH, &page_link)) ==
+      HA_OFFSET_ERROR)
+    DBUG_RETURN(1);
+  *root= page.pos;
+
+  page_store_info(share, &page);
+
+  /*
+    Clear uninitialized part of page to avoid valgrind/purify warnings
+    and to get a clean page that is easier to compress and compare with
+    pages generated with redo
+  */
+  bzero(page.buff + page.size, share->block_size - page.size);
+
+  if (share->now_transactional && _ma_log_new(&page, 1))
+    res= 1;
+
+  if (_ma_write_keypage(&page, page_link->write_lock,
+                        PAGECACHE_PRIORITY_HIGH))
+    res= 1;
+
+  DBUG_RETURN(res);
+} /* _ma_enlarge_root */
+
+
+/*
+  Search after a position for a key and store it there
+
+  Recursively descends the b-tree; on the way back up, _ma_insert() is
+  called for the page where the key (or a key propagated from a split
+  below) belongs.
+
+  TODO:
+  Change this to use pagecache directly instead of creating a copy
+  of the page. To do this, we must however change write-key-on-page
+  algorithm to not overwrite the buffer but instead store any overflow
+  key in a separate buffer.
+
+  @return
+  @retval -1   error
+  @retval 0    ok
+  @retval > 0  Key should be stored in higher tree
+*/
+
+static int w_search(register MARIA_HA *info, uint32 comp_flag, MARIA_KEY *key,
+                    my_off_t page_pos,
+                    MARIA_PAGE *father_page, uchar *father_keypos,
+                    my_bool insert_last)
+{
+  int error,flag;
+  uchar *temp_buff,*keypos;
+  uchar keybuff[MARIA_MAX_KEY_BUFF];
+  my_bool was_last_key;
+  my_off_t next_page, dup_key_pos;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+  DBUG_ENTER("w_search");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) (page_pos/keyinfo->block_length)));
+
+  if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+                                      MARIA_MAX_KEY_BUFF*2)))
+    DBUG_RETURN(-1);
+  if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE,
+                        DFLT_INIT_HITS, temp_buff, 0))
+    goto err;
+
+  flag= (*keyinfo->bin_search)(key, &page, comp_flag, &keypos,
+                               keybuff, &was_last_key);
+  /* flag == 0 means an identical key already exists on this page */
+  if (flag == 0)
+  {
+    MARIA_KEY tmp_key;
+    /* get position to record with duplicated key */
+
+    tmp_key.keyinfo= keyinfo;
+    tmp_key.data= keybuff;
+
+    if ((*keyinfo->get_key)(&tmp_key, page.flag, page.node, &keypos))
+      dup_key_pos= _ma_row_pos_from_key(&tmp_key);
+    else
+      dup_key_pos= HA_OFFSET_ERROR;
+
+    if (keyinfo->flag & HA_FULLTEXT)
+    {
+      uint off;
+      int subkeys;
+
+      get_key_full_length_rdonly(off, keybuff);
+      subkeys=ft_sintXkorr(keybuff+off);
+      comp_flag=SEARCH_SAME;
+      if (subkeys >= 0)
+      {
+        /* normal word, one-level tree structure */
+        flag=(*keyinfo->bin_search)(key, &page, comp_flag,
+                                    &keypos, keybuff, &was_last_key);
+      }
+      else
+      {
+        /* popular word. two-level tree. going down */
+        my_off_t root=dup_key_pos;
+        keyinfo= &share->ft2_keyinfo;
+        get_key_full_length_rdonly(off, key);
+        /*
+          NOTE(review): 'key += off' advances a MARIA_KEY pointer, not
+          the key bytes; looks like a leftover from the uchar* API -
+          confirm against the upstream ft2 merge before enabling ft2.
+        */
+        key+=off;
+        /* we'll modify key entry 'in vivo' */
+        keypos-= keyinfo->keylength + page.node;
+        error= _ma_ck_real_write_btree(info, key, &root, comp_flag);
+        _ma_dpointer(share, keypos+HA_FT_WLEN, root);
+        subkeys--;                      /* should there be underflow protection ? */
+        DBUG_ASSERT(subkeys < 0);
+        ft_intXstore(keypos, subkeys);
+        if (!error)
+        {
+          page_mark_changed(info, &page);
+          if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                                DFLT_INIT_HITS))
+            goto err;
+        }
+        my_afree(temp_buff);
+        DBUG_RETURN(error);
+      }
+    }
+    else /* not HA_FULLTEXT, normal HA_NOSAME key */
+    {
+      /*
+        TODO
+        When the index will support true versioning - with multiple
+        identical values in the UNIQUE index, invisible to each other -
+        the following should be changed to "continue inserting keys, at the
+        end (of the row or statement) wait". We need to wait on *all*
+        unique conflicts at once, not one-at-a-time, because we need to
+        know all blockers in advance, otherwise we'll have incomplete wait-for
+        graph.
+      */
+      /*
+        transaction that has inserted the conflicting key may be in progress.
+        the caller will wait for it to be committed or aborted.
+      */
+      info->dup_key_trid= _ma_trid_from_key(&tmp_key);
+      info->dup_key_pos= dup_key_pos;
+      my_errno= HA_ERR_FOUND_DUPP_KEY;
+      DBUG_PRINT("warning",
+                 ("Duplicate key. dup_key_trid: %lu pos %lu visible: %d",
+                  (ulong) info->dup_key_trid,
+                  (ulong) info->dup_key_pos,
+                  info->trn ? trnman_can_read_from(info->trn,
+                                                   info->dup_key_trid) : 2));
+      goto err;
+    }
+  }
+  if (flag == MARIA_FOUND_WRONG_KEY)
+    goto err;
+  if (!was_last_key)
+    insert_last=0;
+  next_page= _ma_kpos(page.node, keypos);
+  /* Recurse into the child; > 0 means a key was pushed up to this level */
+  if (next_page == HA_OFFSET_ERROR ||
+      (error= w_search(info, comp_flag, key, next_page,
+                       &page, keypos, insert_last)) > 0)
+  {
+    error= _ma_insert(info, key, &page, keypos, keybuff,
+                      father_page, father_keypos, insert_last);
+    page_mark_changed(info, &page);
+    if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                          DFLT_INIT_HITS))
+      goto err;
+  }
+  my_afree(temp_buff);
+  DBUG_RETURN(error);
+err:
+  my_afree(temp_buff);
+  DBUG_PRINT("exit",("Error: %d",my_errno));
+  DBUG_RETURN(-1);
+} /* w_search */
+
+
+/*
+ Insert new key.
+
+ SYNOPSIS
+ _ma_insert()
+ info Open table information.
+ keyinfo Key definition information.
+ key New key
+ anc_page Key page (beginning)
+ key_pos Position in key page where to insert.
+ key_buff Copy of previous key if keys where packed.
+ father_page position of parent key page in file.
+ father_key_pos position in parent key page for balancing.
+ insert_last If to append at end of page.
+
+ DESCRIPTION
+ Insert new key at right of key_pos.
+ Note that caller must save anc_buff
+
+ This function writes log records for all changed pages
+ (Including anc_buff and father page)
+
+ RETURN
+ < 0 Error.
+ 0 OK
+ 1 If key contains key to upper level (from balance page)
+ 2 If key contains key to upper level (from split space)
+*/
+
+int _ma_insert(register MARIA_HA *info, MARIA_KEY *key,
+ MARIA_PAGE *anc_page, uchar *key_pos, uchar *key_buff,
+ MARIA_PAGE *father_page, uchar *father_key_pos,
+ my_bool insert_last)
+{
+ uint a_length, nod_flag, org_anc_length;
+ int t_length;
+ uchar *endpos, *prev_key, *anc_buff;
+ MARIA_KEY_PARAM s_temp;
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ DBUG_ENTER("_ma_insert");
+ DBUG_PRINT("enter",("key_pos: 0x%lx", (ulong) key_pos));
+ DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key););
+
+ /*
+ Note that anc_page->size can be bigger then block_size in case of
+ delete key that caused increase of page length
+ */
+ org_anc_length= a_length= anc_page->size;
+ nod_flag= anc_page->node;
+
+ anc_buff= anc_page->buff;
+ endpos= anc_buff+ a_length;
+ prev_key= (key_pos == anc_buff + share->keypage_header + nod_flag ?
+ (uchar*) 0 : key_buff);
+ t_length= (*keyinfo->pack_key)(key, nod_flag,
+ (key_pos == endpos ? (uchar*) 0 : key_pos),
+ prev_key, prev_key, &s_temp);
+#ifndef DBUG_OFF
+ if (prev_key && (keyinfo->flag & (HA_BINARY_PACK_KEY | HA_PACK_KEY)))
+ {
+ DBUG_DUMP("prev_key", prev_key, _ma_keylength(keyinfo,prev_key));
+ }
+ if (keyinfo->flag & HA_PACK_KEY)
+ {
+ DBUG_PRINT("test",("t_length: %d ref_len: %d",
+ t_length,s_temp.ref_length));
+ DBUG_PRINT("test",("n_ref_len: %d n_length: %d key_pos: 0x%lx",
+ s_temp.n_ref_length, s_temp.n_length, (long) s_temp.key));
+ }
+#endif
+ if (t_length > 0)
+ {
+ if (t_length >= keyinfo->maxlength*2+MAX_POINTER_LENGTH)
+ {
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(-1);
+ }
+ bmove_upp(endpos+t_length, endpos, (uint) (endpos-key_pos));
+ }
+ else
+ {
+ if (-t_length >= keyinfo->maxlength*2+MAX_POINTER_LENGTH)
+ {
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(-1);
+ }
+ bmove(key_pos,key_pos-t_length,(uint) (endpos-key_pos)+t_length);
+ }
+ (*keyinfo->store_key)(keyinfo,key_pos,&s_temp);
+ a_length+=t_length;
+
+ if (key->flag & (SEARCH_USER_KEY_HAS_TRANSID | SEARCH_PAGE_KEY_HAS_TRANSID))
+ {
+ _ma_mark_page_with_transid(share, anc_page);
+ }
+ anc_page->size= a_length;
+ page_store_size(share, anc_page);
+
+ /*
+ Check if the new key fits totally into the the page
+ (anc_buff is big enough to contain a full page + one key)
+ */
+ if (a_length <= share->max_index_block_size)
+ {
+ if (share->max_index_block_size - a_length < 32 &&
+ (keyinfo->flag & HA_FULLTEXT) && key_pos == endpos &&
+ share->base.key_reflength <= share->base.rec_reflength &&
+ share->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD))
+ {
+ /*
+ Normal word. One-level tree. Page is almost full.
+ Let's consider converting.
+ We'll compare 'key' and the first key at anc_buff
+ */
+ const uchar *a= key->data;
+ const uchar *b= anc_buff + share->keypage_header + nod_flag;
+ uint alen, blen, ft2len= share->ft2_keyinfo.keylength;
+ /* the very first key on the page is always unpacked */
+ DBUG_ASSERT((*b & 128) == 0);
+#if HA_FT_MAXLEN >= 127
+ blen= mi_uint2korr(b); b+=2;
+ When you enable this code, as part of the MyISAM->Maria merge of
+ChangeSet@1.2562, 2008-04-09 07:41:40+02:00, serg@janus.mylan +9 -0
+ restore ft2 functionality, fix bugs.
+ Then this will enable two-level fulltext index, which is not totally
+ recoverable yet.
+ So remove this text and inform Guilhem so that he fixes the issue.
+#else
+ blen= *b++;
+#endif
+ get_key_length(alen,a);
+ DBUG_ASSERT(info->ft1_to_ft2==0);
+ if (alen == blen &&
+ ha_compare_text(keyinfo->seg->charset, a, alen,
+ b, blen, 0, 0) == 0)
+ {
+ /* Yup. converting */
+ info->ft1_to_ft2=(DYNAMIC_ARRAY *)
+ my_malloc(sizeof(DYNAMIC_ARRAY), MYF(MY_WME));
+ my_init_dynamic_array(info->ft1_to_ft2, ft2len, 300, 50);
+
+ /*
+ Now, adding all keys from the page to dynarray
+ if the page is a leaf (if not keys will be deleted later)
+ */
+ if (!nod_flag)
+ {
+ /*
+ Let's leave the first key on the page, though, because
+ we cannot easily dispatch an empty page here
+ */
+ b+=blen+ft2len+2;
+ for (a=anc_buff+a_length ; b < a ; b+=ft2len+2)
+ insert_dynamic(info->ft1_to_ft2, b);
+
+ /* fixing the page's length - it contains only one key now */
+ anc_page->size= share->keypage_header + blen + ft2len + 2;
+ page_store_size(share, anc_page);
+ }
+ /* the rest will be done when we're back from recursion */
+ }
+ }
+ else
+ {
+ if (share->now_transactional &&
+ _ma_log_add(anc_page, org_anc_length,
+ key_pos, s_temp.changed_length, t_length, 1,
+ KEY_OP_DEBUG_LOG_ADD_1))
+ DBUG_RETURN(-1);
+ }
+ DBUG_RETURN(0); /* There is room on page */
+ }
+ /* Page is full */
+ if (nod_flag)
+ insert_last=0;
+ /*
+ TODO:
+ Remove 'born_transactional' here.
+ The only reason for having it here is that the current
+ _ma_balance_page_ can't handle variable length keys.
+ */
+ if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) &&
+ father_page && !insert_last && !info->quick_mode &&
+ !info->s->base.born_transactional)
+ {
+ s_temp.key_pos= key_pos;
+ page_mark_changed(info, father_page);
+ DBUG_RETURN(_ma_balance_page(info, keyinfo, key, anc_page,
+ father_page, father_key_pos,
+ &s_temp));
+ }
+ DBUG_RETURN(_ma_split_page(info, key, anc_page,
+ min(org_anc_length,
+ info->s->max_index_block_size),
+ key_pos, s_temp.changed_length, t_length,
+ key_buff, insert_last));
+} /* _ma_insert */
+
+
+/**
+ @brief split a full page in two and assign emerging item to key
+
+ @fn _ma_split_page()
+ info Maria handler
+ keyinfo Key handler
+ key Buffer for middle key
+ split_page Page that should be split
+ org_split_length Original length of split_page before key was inserted
+ inserted_key_pos Address in buffer where key was inserted
+ changed_length Number of bytes changed at 'inserted_key_pos'
+ move_length Number of bytes buffer was moved when key was inserted
+ key_buff Key buffer to use for temporary storage of key
+ insert_last_key If we are insert key on rightmost key page
+
+ @note
+ split_buff is not stored on disk (caller has to do this)
+
+ @return
+ @retval 2 ok (Middle key up from _ma_insert())
+ @retval -1 error
+*/
+
+int _ma_split_page(MARIA_HA *info, MARIA_KEY *key, MARIA_PAGE *split_page,
+ uint org_split_length,
+ uchar *inserted_key_pos, uint changed_length,
+ int move_length,
+ uchar *key_buff, my_bool insert_last_key)
+{
+ uint length,a_length,key_ref_length,t_length,nod_flag,key_length;
+ uint page_length, split_length, page_flag;
+ uchar *key_pos,*pos, *after_key;
+ MARIA_KEY_PARAM s_temp;
+ MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ MARIA_KEY tmp_key;
+ MARIA_PAGE new_page;
+ int res;
+ DBUG_ENTER("_ma_split_page");
+
+ LINT_INIT(after_key);
+ DBUG_DUMP("buff", split_page->buff, split_page->size);
+
+ info->page_changed=1; /* Info->buff is used */
+ info->keyread_buff_used=1;
+ page_flag= split_page->flag;
+ nod_flag= split_page->node;
+ key_ref_length= share->keypage_header + nod_flag;
+
+ new_page.info= info;
+ new_page.buff= info->buff;
+ new_page.keyinfo= keyinfo;
+
+ tmp_key.data= key_buff;
+ tmp_key.keyinfo= keyinfo;
+ if (insert_last_key)
+ key_pos= _ma_find_last_pos(&tmp_key, split_page, &after_key);
+ else
+ key_pos= _ma_find_half_pos(&tmp_key, split_page, &after_key);
+ if (!key_pos)
+ DBUG_RETURN(-1);
+
+ key_length= tmp_key.data_length + tmp_key.ref_length;
+ split_length= (uint) (key_pos - split_page->buff);
+ a_length= split_page->size;
+ split_page->size= split_length;
+ page_store_size(share, split_page);
+
+ key_pos=after_key;
+ if (nod_flag)
+ {
+ DBUG_PRINT("test",("Splitting nod"));
+ pos=key_pos-nod_flag;
+ memcpy(new_page.buff + share->keypage_header, pos, (size_t) nod_flag);
+ }
+
+ /* Move middle item to key and pointer to new page */
+ if ((new_page.pos= _ma_new(info, PAGECACHE_PRIORITY_HIGH, &page_link)) ==
+ HA_OFFSET_ERROR)
+ DBUG_RETURN(-1);
+
+ _ma_copy_key(key, &tmp_key);
+ _ma_kpointer(info, key->data + key_length, new_page.pos);
+
+ /* Store new page */
+ if (!(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &key_pos))
+ DBUG_RETURN(-1);
+
+ t_length=(*keyinfo->pack_key)(&tmp_key, nod_flag, (uchar *) 0,
+ (uchar*) 0, (uchar*) 0, &s_temp);
+ length=(uint) ((split_page->buff + a_length) - key_pos);
+ memcpy(new_page.buff + key_ref_length + t_length, key_pos,
+ (size_t) length);
+ (*keyinfo->store_key)(keyinfo,new_page.buff+key_ref_length,&s_temp);
+ page_length= length + t_length + key_ref_length;
+
+ bzero(new_page.buff, share->keypage_header);
+ /* Copy KEYFLAG_FLAG_ISNODE and KEYPAGE_FLAG_HAS_TRANSID from parent page */
+ new_page.flag= page_flag;
+ new_page.size= page_length;
+ page_store_info(share, &new_page);
+
+ /* Copy key number */
+ new_page.buff[share->keypage_header - KEYPAGE_USED_SIZE -
+ KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE]=
+ split_page->buff[share->keypage_header - KEYPAGE_USED_SIZE -
+ KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE];
+
+ res= 2; /* Middle key up */
+ if (share->now_transactional && _ma_log_new(&new_page, 0))
+ res= -1;
+
+ /*
+ Clear unitialized part of page to avoid valgrind/purify warnings
+ and to get a clean page that is easier to compress and compare with
+ pages generated with redo
+ */
+ bzero(new_page.buff + page_length, share->block_size - page_length);
+
+ if (_ma_write_keypage(&new_page, page_link->write_lock,
+ DFLT_INIT_HITS))
+ res= -1;
+
+ /* Save changes to split pages */
+ if (share->now_transactional &&
+ _ma_log_split(split_page, org_split_length, split_length,
+ inserted_key_pos, changed_length, move_length,
+ KEY_OP_NONE, (uchar*) 0, 0, 0))
+ res= -1;
+
+ DBUG_DUMP_KEY("middle_key", key);
+ DBUG_RETURN(res);
+} /* _ma_split_page */
+
+
+/*
+ Calculate how to much to move to split a page in two
+
+ Returns pointer to start of key.
+ key will contain the key.
+ return_key_length will contain the length of key
+ after_key will contain the position to where the next key starts
+*/
+
+uchar *_ma_find_half_pos(MARIA_KEY *key, MARIA_PAGE *ma_page,
+ uchar **after_key)
+{
+ uint keys, length, key_ref_length, page_flag, nod_flag;
+ uchar *page, *end, *lastpos;
+ MARIA_HA *info= ma_page->info;
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ DBUG_ENTER("_ma_find_half_pos");
+
+ nod_flag= ma_page->node;
+ key_ref_length= share->keypage_header + nod_flag;
+ page_flag= ma_page->flag;
+ length= ma_page->size - key_ref_length;
+ page= ma_page->buff+ key_ref_length; /* Point to first key */
+
+ if (!(keyinfo->flag &
+ (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY |
+ HA_BINARY_PACK_KEY)) && !(page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+ {
+ key_ref_length= keyinfo->keylength+nod_flag;
+ key->data_length= keyinfo->keylength - info->s->rec_reflength;
+ key->ref_length= info->s->rec_reflength;
+ key->flag= 0;
+ keys=length/(key_ref_length*2);
+ end=page+keys*key_ref_length;
+ *after_key=end+key_ref_length;
+ memcpy(key->data, end, key_ref_length);
+ DBUG_RETURN(end);
+ }
+
+ end=page+length/2-key_ref_length; /* This is aprox. half */
+ key->data[0]= 0; /* Safety */
+ do
+ {
+ lastpos=page;
+ if (!(length= (*keyinfo->get_key)(key, page_flag, nod_flag, &page)))
+ DBUG_RETURN(0);
+ } while (page < end);
+ *after_key= page;
+ DBUG_PRINT("exit",("returns: 0x%lx page: 0x%lx half: 0x%lx",
+ (long) lastpos, (long) page, (long) end));
+ DBUG_RETURN(lastpos);
+} /* _ma_find_half_pos */
+
+
+/**
+ Find second to last key on leaf page
+
+ @notes
+ Used to split buffer at last key. In this case the next to last
+ key will be moved to parent page and last key will be on it's own page.
+
+ @TODO
+ Add one argument for 'last key value' to get_key so that one can
+ do the loop without having to copy the found key the whole time
+
+ @return
+ @retval Pointer to the start of the key before the last key
+ @retval int_key will contain the last key
+*/
+
+static uchar *_ma_find_last_pos(MARIA_KEY *int_key, MARIA_PAGE *ma_page,
+ uchar **after_key)
+{
+ uint keys, length, key_ref_length, page_flag;
+ uchar *page, *end, *lastpos, *prevpos;
+ uchar key_buff[MARIA_MAX_KEY_BUFF];
+ MARIA_HA *info= ma_page->info;
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo= int_key->keyinfo;
+ MARIA_KEY tmp_key;
+ DBUG_ENTER("_ma_find_last_pos");
+
+ key_ref_length= share->keypage_header;
+ page_flag= ma_page->flag;
+ length= ma_page->size - key_ref_length;
+ page= ma_page->buff + key_ref_length;
+
+ if (!(keyinfo->flag &
+ (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY |
+ HA_BINARY_PACK_KEY)) && !(page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+ {
+ keys= length / keyinfo->keylength - 2;
+ length= keyinfo->keylength;
+ int_key->data_length= length - info->s->rec_reflength;
+ int_key->ref_length= info->s->rec_reflength;
+ int_key->flag= 0;
+ end=page+keys*length;
+ *after_key=end+length;
+ memcpy(int_key->data, end, length);
+ DBUG_RETURN(end);
+ }
+
+ end=page+length-key_ref_length;
+ lastpos=page;
+ tmp_key.data= key_buff;
+ tmp_key.keyinfo= int_key->keyinfo;
+ key_buff[0]= 0; /* Safety */
+
+ /* We know that there are at least 2 keys on the page */
+
+ if (!(length=(*keyinfo->get_key)(&tmp_key, page_flag, 0, &page)))
+ {
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(0);
+ }
+
+ do
+ {
+ prevpos=lastpos; lastpos=page;
+ int_key->data_length= tmp_key.data_length;
+ int_key->ref_length= tmp_key.ref_length;
+ int_key->flag= tmp_key.flag;
+ memcpy(int_key->data, key_buff, length); /* previous key */
+ if (!(length=(*keyinfo->get_key)(&tmp_key, page_flag, 0, &page)))
+ {
+ my_errno=HA_ERR_CRASHED;
+ DBUG_RETURN(0);
+ }
+ } while (page < end);
+
+ *after_key=lastpos;
+ DBUG_PRINT("exit",("returns: 0x%lx page: 0x%lx end: 0x%lx",
+ (long) prevpos,(long) page,(long) end));
+ DBUG_RETURN(prevpos);
+} /* _ma_find_last_pos */
+
+
+/**
+ @brief Balance page with static size keys with page on right/left
+
+ @param key Middle key will be stored here
+
+ @notes
+ Father_buff will always be changed
+ Caller must handle saving of curr_buff
+
+ @return
+ @retval 0 Balance was done (father buff is saved)
+ @retval 1 Middle key up (father buff is not saved)
+ @retval -1 Error
+*/
+
+static int _ma_balance_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ MARIA_KEY *key, MARIA_PAGE *curr_page,
+ MARIA_PAGE *father_page,
+ uchar *father_key_pos, MARIA_KEY_PARAM *s_temp)
+{
+ MARIA_PINNED_PAGE tmp_page_link, *new_page_link= &tmp_page_link;
+ MARIA_SHARE *share= info->s;
+ my_bool right;
+ uint k_length,father_length,father_keylength,nod_flag,curr_keylength;
+ uint right_length,left_length,new_right_length,new_left_length,extra_length;
+ uint keys, tmp_length, extra_buff_length;
+ uchar *pos, *extra_buff, *parting_key;
+ uchar tmp_part_key[MARIA_MAX_KEY_BUFF];
+ MARIA_PAGE next_page, extra_page, *left_page, *right_page;
+ DBUG_ENTER("_ma_balance_page");
+
+ k_length= keyinfo->keylength;
+ father_length= father_page->size;
+ father_keylength= k_length + share->base.key_reflength;
+ nod_flag= curr_page->node;
+ curr_keylength= k_length+nod_flag;
+ info->page_changed=1;
+
+ if ((father_key_pos != father_page->buff+father_length &&
+ (info->state->records & 1)) ||
+ father_key_pos == father_page->buff+ share->keypage_header +
+ share->base.key_reflength)
+ {
+ right=1;
+ next_page.pos= _ma_kpos(share->base.key_reflength,
+ father_key_pos+father_keylength);
+ left_page= curr_page;
+ right_page= &next_page;
+ DBUG_PRINT("info", ("use right page: %lu",
+ (ulong) (next_page.pos / keyinfo->block_length)));
+ }
+ else
+ {
+ right=0;
+ father_key_pos-=father_keylength;
+ next_page.pos= _ma_kpos(share->base.key_reflength,father_key_pos);
+ left_page= &next_page;
+ right_page= curr_page;
+ DBUG_PRINT("info", ("use left page: %lu",
+ (ulong) (next_page.pos / keyinfo->block_length)));
+ } /* father_key_pos ptr to parting key */
+
+ if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos,
+ PAGECACHE_LOCK_WRITE,
+ DFLT_INIT_HITS, info->buff, 0))
+ goto err;
+ page_mark_changed(info, &next_page);
+ DBUG_DUMP("next", next_page.buff, next_page.size);
+
+ /* Test if there is room to share keys */
+ left_length= left_page->size;
+ right_length= right_page->size;
+ keys= ((left_length+right_length-share->keypage_header*2-nod_flag*2)/
+ curr_keylength);
+
+ if ((right ? right_length : left_length) + curr_keylength <=
+ share->max_index_block_size)
+ {
+ /* Enough space to hold all keys in the two buffers ; Balance bufferts */
+ new_left_length= share->keypage_header+nod_flag+(keys/2)*curr_keylength;
+ new_right_length=share->keypage_header+nod_flag+(((keys+1)/2)*
+ curr_keylength);
+ left_page->size= new_left_length;
+ page_store_size(share, left_page);
+ right_page->size= new_right_length;
+ page_store_size(share, right_page);
+
+ DBUG_PRINT("info", ("left_length: %u -> %u right_length: %u -> %u",
+ left_length, new_left_length,
+ right_length, new_right_length));
+ if (left_length < new_left_length)
+ {
+ uint length;
+ DBUG_PRINT("info", ("move keys to end of buff"));
+
+ /* Move keys right_page -> left_page */
+ pos= left_page->buff+left_length;
+ memcpy(pos,father_key_pos, (size_t) k_length);
+ memcpy(pos+k_length, right_page->buff + share->keypage_header,
+ (size_t) (length=new_left_length - left_length - k_length));
+ pos= right_page->buff + share->keypage_header + length;
+ memcpy(father_key_pos, pos, (size_t) k_length);
+ bmove(right_page->buff + share->keypage_header,
+ pos + k_length, new_right_length);
+
+ if (share->now_transactional)
+ {
+ if (right)
+ {
+ /*
+ Log changes to page on left
+ The original page is on the left and stored in left_page->buff
+ We have on the page the newly inserted key and data
+ from buff added last on the page
+ */
+ if (_ma_log_split(curr_page,
+ left_length - s_temp->move_length,
+ new_left_length,
+ s_temp->key_pos, s_temp->changed_length,
+ s_temp->move_length,
+ KEY_OP_ADD_SUFFIX,
+ curr_page->buff + left_length,
+ new_left_length - left_length,
+ new_left_length - left_length+ k_length))
+ goto err;
+ /*
+ Log changes to page on right
+ This contains the original data with some keys deleted from
+ start of page
+ */
+ if (_ma_log_prefix(&next_page, 0,
+ ((int) new_right_length - (int) right_length),
+ KEY_OP_DEBUG_LOG_PREFIX_3))
+ goto err;
+ }
+ else
+ {
+ /*
+ Log changes to page on right (the original page) which is in buff
+ Data is removed from start of page
+ The inserted key may be in buff or moved to curr_buff
+ */
+ if (_ma_log_del_prefix(curr_page,
+ right_length - s_temp->changed_length,
+ new_right_length,
+ s_temp->key_pos, s_temp->changed_length,
+ s_temp->move_length))
+ goto err;
+ /*
+ Log changes to page on left, which has new data added last
+ */
+ if (_ma_log_suffix(&next_page, left_length, new_left_length))
+ goto err;
+ }
+ }
+ }
+ else
+ {
+ uint length;
+ DBUG_PRINT("info", ("move keys to start of right_page"));
+
+ bmove_upp(right_page->buff + new_right_length,
+ right_page->buff + right_length,
+ right_length - share->keypage_header);
+ length= new_right_length -right_length - k_length;
+ memcpy(right_page->buff + share->keypage_header + length, father_key_pos,
+ (size_t) k_length);
+ pos= left_page->buff + new_left_length;
+ memcpy(father_key_pos, pos, (size_t) k_length);
+ memcpy(right_page->buff + share->keypage_header, pos+k_length,
+ (size_t) length);
+
+ if (share->now_transactional)
+ {
+ if (right)
+ {
+ /*
+ Log changes to page on left
+ The original page is on the left and stored in curr_buff
+ The page is shortened from end and the key may be on the page
+ */
+ if (_ma_log_split(curr_page,
+ left_length - s_temp->move_length,
+ new_left_length,
+ s_temp->key_pos, s_temp->changed_length,
+ s_temp->move_length,
+ KEY_OP_NONE, (uchar*) 0, 0, 0))
+ goto err;
+ /*
+ Log changes to page on right
+ This contains the original data, with some data from cur_buff
+ added first
+ */
+ if (_ma_log_prefix(&next_page,
+ (uint) (new_right_length - right_length),
+ (int) (new_right_length - right_length),
+ KEY_OP_DEBUG_LOG_PREFIX_4))
+ goto err;
+ }
+ else
+ {
+ /*
+ Log changes to page on right (the original page) which is in buff
+ We have on the page the newly inserted key and data
+ from buff added first on the page
+ */
+ uint diff_length= new_right_length - right_length;
+ if (_ma_log_split(curr_page,
+ left_length - s_temp->move_length,
+ new_right_length,
+ s_temp->key_pos + diff_length,
+ s_temp->changed_length,
+ s_temp->move_length,
+ KEY_OP_ADD_PREFIX,
+ curr_page->buff + share->keypage_header,
+ diff_length, diff_length + k_length))
+ goto err;
+ /*
+ Log changes to page on left, which is shortened from end
+ */
+ if (_ma_log_suffix(&next_page, left_length, new_left_length))
+ goto err;
+ }
+ }
+ }
+
+ /* Log changes to father (one level up) page */
+
+ if (share->now_transactional &&
+ _ma_log_change(father_page, father_key_pos, k_length,
+ KEY_OP_DEBUG_FATHER_CHANGED_1))
+ goto err;
+
+ /*
+ next_page_link->changed is marked as true above and fathers
+ page_link->changed is marked as true in caller
+ */
+ if (_ma_write_keypage(&next_page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ DFLT_INIT_HITS) ||
+ _ma_write_keypage(father_page,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
+ goto err;
+ DBUG_RETURN(0);
+ }
+
+ /* left_page and right_page are full, lets split and make new nod */
+
+ extra_buff= info->buff+share->base.max_key_block_length;
+ new_left_length= new_right_length= (share->keypage_header + nod_flag +
+ (keys+1) / 3 * curr_keylength);
+ extra_page.info= info;
+ extra_page.keyinfo= keyinfo;
+ extra_page.buff= extra_buff;
+
+ /*
+ 5 is the minum number of keys we can have here. This comes from
+ the fact that each full page can store at least 2 keys and in this case
+ we have a 'split' key, ie 2+2+1 = 5
+ */
+ if (keys == 5) /* Too few keys to balance */
+ new_left_length-=curr_keylength;
+ extra_length= (nod_flag + left_length + right_length -
+ new_left_length - new_right_length - curr_keylength);
+ extra_buff_length= extra_length + share->keypage_header;
+ DBUG_PRINT("info",("left_length: %d right_length: %d new_left_length: %d new_right_length: %d extra_length: %d",
+ left_length, right_length,
+ new_left_length, new_right_length,
+ extra_length));
+
+ left_page->size= new_left_length;
+ page_store_size(share, left_page);
+ right_page->size= new_right_length;
+ page_store_size(share, right_page);
+
+ bzero(extra_buff, share->keypage_header);
+ extra_page.flag= nod_flag ? KEYPAGE_FLAG_ISNOD : 0;
+ extra_page.size= extra_buff_length;
+ page_store_info(share, &extra_page);
+
+ /* Copy key number */
+ extra_buff[share->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_KEYID_SIZE -
+ KEYPAGE_FLAG_SIZE]= keyinfo->key_nr;
+
+ /* move first largest keys to new page */
+ pos= right_page->buff + right_length-extra_length;
+ memcpy(extra_buff + share->keypage_header, pos, extra_length);
+ /* Zero old data from buffer */
+ bzero(extra_buff + extra_buff_length,
+ share->block_size - extra_buff_length);
+
+ /* Save new parting key between buff and extra_buff */
+ memcpy(tmp_part_key, pos-k_length,k_length);
+ /* Make place for new keys */
+ bmove_upp(right_page->buff + new_right_length, pos - k_length,
+ right_length - extra_length - k_length - share->keypage_header);
+ /* Copy keys from left page */
+ pos= left_page->buff + new_left_length;
+ memcpy(right_page->buff + share->keypage_header, pos + k_length,
+ (size_t) (tmp_length= left_length - new_left_length - k_length));
+ /* Copy old parting key */
+ parting_key= right_page->buff + share->keypage_header + tmp_length;
+ memcpy(parting_key, father_key_pos, (size_t) k_length);
+
+ /* Move new parting keys up to caller */
+ memcpy((right ? key->data : father_key_pos),pos,(size_t) k_length);
+ memcpy((right ? father_key_pos : key->data),tmp_part_key, k_length);
+
+ if ((extra_page.pos= _ma_new(info, DFLT_INIT_HITS, &new_page_link))
+ == HA_OFFSET_ERROR)
+ goto err;
+ _ma_kpointer(info,key->data+k_length, extra_page.pos);
+ /* This is safe as long we are using not keys with transid */
+ key->data_length= k_length - info->s->rec_reflength;
+ key->ref_length= info->s->rec_reflength;
+
+ if (right)
+ {
+ /*
+ Page order according to key values:
+ orignal_page (curr_page = left_page), next_page (buff), extra_buff
+
+ Move page positions so that we store data in extra_page where
+ next_page was and next_page will be stored at the new position
+ */
+ swap_variables(my_off_t, extra_page.pos, next_page.pos);
+ }
+
+ if (share->now_transactional)
+ {
+ if (right)
+ {
+ /*
+ left_page is shortened,
+ right_page is getting new keys at start and shortened from end.
+ extra_page is new page
+
+ Note that extra_page (largest key parts) will be stored at the
+ place of the original 'right' page (next_page) and right page
+ will be stored at the new page position
+
+ This makes the log entries smaller as right_page contains all
+ data to generate the data extra_buff
+ */
+
+ /*
+ Log changes to page on left (page shortened page at end)
+ */
+ if (_ma_log_split(curr_page,
+ left_length - s_temp->move_length, new_left_length,
+ s_temp->key_pos, s_temp->changed_length,
+ s_temp->move_length,
+ KEY_OP_NONE, (uchar*) 0, 0, 0))
+ goto err;
+ /*
+ Log changes to right page (stored at next page)
+ This contains the last 'extra_buff' from 'buff'
+ */
+ if (_ma_log_prefix(&extra_page,
+ 0, (int) (extra_buff_length - right_length),
+ KEY_OP_DEBUG_LOG_PREFIX_5))
+ goto err;
+
+ /*
+ Log changes to middle page, which is stored at the new page
+ position
+ */
+ if (_ma_log_new(&next_page, 0))
+ goto err;
+ }
+ else
+ {
+ /*
+ Log changes to page on right (the original page) which is in buff
+ This contains the original data, with some data from curr_buff
+ added first and shortened at end
+ */
+ int data_added_first= left_length - new_left_length;
+ if (_ma_log_key_middle(right_page,
+ new_right_length,
+ data_added_first,
+ data_added_first,
+ extra_length,
+ s_temp->key_pos,
+ s_temp->changed_length,
+ s_temp->move_length))
+ goto err;
+
+ /* Log changes to page on left, which is shortened from end */
+ if (_ma_log_suffix(left_page, left_length, new_left_length))
+ goto err;
+
+ /* Log change to rightmost (new) page */
+ if (_ma_log_new(&extra_page, 0))
+ goto err;
+ }
+
+ /* Log changes to father (one level up) page */
+ if (share->now_transactional &&
+ _ma_log_change(father_page, father_key_pos, k_length,
+ KEY_OP_DEBUG_FATHER_CHANGED_2))
+ goto err;
+ }
+
+ if (_ma_write_keypage(&next_page,
+ (right ? new_page_link->write_lock :
+ PAGECACHE_LOCK_LEFT_WRITELOCKED),
+ DFLT_INIT_HITS) ||
+ _ma_write_keypage(&extra_page,
+ (!right ? new_page_link->write_lock :
+ PAGECACHE_LOCK_LEFT_WRITELOCKED),
+ DFLT_INIT_HITS))
+ goto err;
+
+ DBUG_RETURN(1); /* Middle key up */
+
+err:
+ DBUG_RETURN(-1);
+} /* _ma_balance_page */
+
+
+/**********************************************************************
+ * Bulk insert code *
+ **********************************************************************/
+
+/* Callback context passed (as custom_arg) to the bulk-insert tree */
+typedef struct {
+ MARIA_HA *info; /* Handler that owns the bulk insert */
+ uint keynr; /* Index number this tree belongs to */
+} bulk_insert_param;
+
+
+/*
+ Buffer one key in the per-index bulk-insert tree instead of writing it
+ to the B-tree immediately; keys are flushed later via keys_free()
+*/
+static my_bool _ma_ck_write_tree(register MARIA_HA *info, MARIA_KEY *key)
+{
+ my_bool error;
+ uint keynr= key->keyinfo->key_nr;
+ DBUG_ENTER("_ma_ck_write_tree");
+
+ /* Store ref_length as this is always constant */
+ info->bulk_insert_ref_length= key->ref_length;
+ /* tree_insert() returns 0 on failure, so error is true when it failed */
+ error= tree_insert(&info->bulk_insert[keynr], key->data,
+ key->data_length + key->ref_length,
+ info->bulk_insert[keynr].custom_arg) == 0;
+ DBUG_RETURN(error);
+} /* _ma_ck_write_tree */
+
+
+/* typeof(_ma_keys_compare)=qsort_cmp2 */
+
+/* Order two keys of the bulk-insert tree using the index's segment info */
+static int keys_compare(bulk_insert_param *param, uchar *key1, uchar *key2)
+{
+ uint not_used[2];
+ return ha_key_cmp(param->info->s->keyinfo[param->keynr].seg,
+ key1, key2, USE_WHOLE_KEY, SEARCH_SAME,
+ not_used);
+}
+
+
+/*
+ Tree "free" callback used to flush buffered bulk-insert keys:
+ free_init locks the key tree, free_free writes one key to the B-tree,
+ free_end unlocks again
+*/
+static int keys_free(uchar *key, TREE_FREE mode, bulk_insert_param *param)
+{
+ /*
+ Probably I can use info->lastkey here, but I'm not sure,
+ and to be safe I'd better use local lastkey.
+ */
+ MARIA_SHARE *share= param->info->s;
+ uchar lastkey[MARIA_MAX_KEY_BUFF];
+ uint keylen;
+ MARIA_KEYDEF *keyinfo= share->keyinfo + param->keynr;
+ MARIA_KEY tmp_key;
+
+ switch (mode) {
+ case free_init:
+ if (share->lock_key_trees)
+ {
+ rw_wrlock(&keyinfo->root_lock);
+ keyinfo->version++;
+ }
+ return 0;
+ case free_free:
+ /* Note: keylen doesn't contain transid lengths */
+ keylen= _ma_keylength(keyinfo, key);
+ tmp_key.data= lastkey;
+ tmp_key.keyinfo= keyinfo;
+ tmp_key.data_length= keylen - share->rec_reflength;
+ tmp_key.ref_length= param->info->bulk_insert_ref_length;
+ tmp_key.flag= (param->info->bulk_insert_ref_length ==
+ share->rec_reflength ? 0 : SEARCH_USER_KEY_HAS_TRANSID);
+ /*
+ We have to copy key as ma_ck_write_btree may need the buffer for
+ copying middle key up if tree is growing
+ */
+ memcpy(lastkey, key, tmp_key.data_length + tmp_key.ref_length);
+ return _ma_ck_write_btree(param->info, &tmp_key);
+ case free_end:
+ if (share->lock_key_trees)
+ rw_unlock(&keyinfo->root_lock);
+ return 0;
+ }
+ /* Not reached for any valid TREE_FREE mode */
+ return 1;
+}
+
+
+/*
+ Set up in-memory trees that buffer keys during bulk insert.
+ Returns 0 on success (also when bulk insert is not worth enabling),
+ HA_ERR_OUT_OF_MEM on allocation failure.
+*/
+int maria_init_bulk_insert(MARIA_HA *info, ulong cache_size, ha_rows rows)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *key=share->keyinfo;
+ bulk_insert_param *params;
+ uint i, num_keys, total_keylength;
+ ulonglong key_map;
+ /* NOTE(review): DBUG tag "_ma_init_bulk_insert" differs from function name */
+ DBUG_ENTER("_ma_init_bulk_insert");
+ DBUG_PRINT("enter",("cache_size: %lu", cache_size));
+
+ DBUG_ASSERT(!info->bulk_insert &&
+ (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT));
+
+ /*
+ Collect keys eligible for buffering: non-unique, not the auto-increment
+ key, and currently active
+ */
+ maria_clear_all_keys_active(key_map);
+ for (i=total_keylength=num_keys=0 ; i < share->base.keys ; i++)
+ {
+ if (! (key[i].flag & HA_NOSAME) && (share->base.auto_key != i + 1) &&
+ maria_is_key_active(share->state.key_map, i))
+ {
+ num_keys++;
+ maria_set_key_active(key_map, i);
+ total_keylength+=key[i].maxlength+TREE_ELEMENT_EXTRA_SIZE;
+ }
+ }
+
+ if (num_keys==0 ||
+ num_keys * MARIA_MIN_SIZE_BULK_INSERT_TREE > cache_size)
+ DBUG_RETURN(0);
+
+ /*
+ cache_size becomes an element count: all rows if they fit, otherwise
+ a fraction of the byte budget (see the 16'th note below) — presumably
+ to keep init_tree's limits small; confirm against init_tree semantics
+ */
+ if (rows && rows*total_keylength < cache_size)
+ cache_size= (ulong)rows;
+ else
+ cache_size/=total_keylength*16;
+
+ info->bulk_insert=(TREE *)
+ my_malloc((sizeof(TREE)*share->base.keys+
+ sizeof(bulk_insert_param)*num_keys),MYF(0));
+
+ if (!info->bulk_insert)
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+
+ /* params array lives directly after the TREE array in the same block */
+ params=(bulk_insert_param *)(info->bulk_insert+share->base.keys);
+ for (i=0 ; i < share->base.keys ; i++)
+ {
+ if (maria_is_key_active(key_map, i))
+ {
+ params->info=info;
+ params->keynr=i;
+ /* Only allocate a 16'th of the buffer at a time */
+ init_tree(&info->bulk_insert[i],
+ cache_size * key[i].maxlength,
+ cache_size * key[i].maxlength, 0,
+ (qsort_cmp2)keys_compare, 0,
+ (tree_element_free) keys_free, (void *)params++);
+ }
+ else
+ info->bulk_insert[i].root=0;
+ }
+
+ DBUG_RETURN(0);
+}
+
+/* Flush buffered keys of one index to the B-tree (reset_tree triggers
+ keys_free for each element) */
+void maria_flush_bulk_insert(MARIA_HA *info, uint inx)
+{
+ if (info->bulk_insert)
+ {
+ if (is_tree_inited(&info->bulk_insert[inx]))
+ reset_tree(&info->bulk_insert[inx]);
+ }
+}
+
+/* Tear down bulk-insert buffering; flushes remaining keys unless the
+ table is being deleted, then frees all trees and the shared buffer */
+void maria_end_bulk_insert(MARIA_HA *info)
+{
+ DBUG_ENTER("maria_end_bulk_insert");
+ if (info->bulk_insert)
+ {
+ uint i;
+ for (i=0 ; i < info->s->base.keys ; i++)
+ {
+ if (is_tree_inited(&info->bulk_insert[i]))
+ {
+ /* When deleting the table, don't bother writing buffered keys */
+ if (info->s->deleting)
+ reset_free_element(&info->bulk_insert[i]);
+ delete_tree(&info->bulk_insert[i]);
+ }
+ }
+ my_free(info->bulk_insert, MYF(0));
+ info->bulk_insert= 0;
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/****************************************************************************
+ Dedicated functions that generate log entries
+****************************************************************************/
+
+
+/*
+ Write a LOGREC_UNDO_KEY_INSERT record for 'key'.
+ root/new_root and the (possibly) extracted auto-increment value are
+ passed to the log-write hook via 'msg'.
+ Returns 0 on success, -1 on log-write failure.
+*/
+int _ma_write_undo_key_insert(MARIA_HA *info, const MARIA_KEY *key,
+ my_off_t *root, my_off_t new_root, LSN *res_lsn)
+{
+ MARIA_SHARE *share= info->s;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
+ KEY_NR_STORE_SIZE];
+ const uchar *key_value;
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+ struct st_msg_to_write_hook_for_undo_key msg;
+ uint key_length;
+
+ /* Save if we need to write a clr record */
+ lsn_store(log_data, info->trn->undo_lsn);
+ key_nr_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
+ keyinfo->key_nr);
+ key_length= key->data_length + key->ref_length;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key->data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= key_length;
+
+ msg.root= root;
+ msg.value= new_root;
+ msg.auto_increment= 0;
+ key_value= key->data;
+ /* If this is the auto-increment key, extract its value for the hook */
+ if (share->base.auto_key == ((uint) keyinfo->key_nr + 1))
+ {
+ const HA_KEYSEG *keyseg= keyinfo->seg;
+ uchar reversed[MARIA_MAX_KEY_BUFF];
+ if (keyseg->flag & HA_SWAP_KEY)
+ {
+ /* We put key from log record to "data record" packing format... */
+ const uchar *key_ptr= key->data, *key_end= key->data + keyseg->length;
+ uchar *to= reversed + keyseg->length;
+ do
+ {
+ *--to= *key_ptr++;
+ } while (key_ptr != key_end);
+ key_value= to;
+ }
+ /* ... so that we can read it with: */
+ msg.auto_increment=
+ ma_retrieve_auto_increment(key_value, keyseg->type);
+ /* and write_hook_for_undo_key_insert() will pick this. */
+ }
+
+ return translog_write_record(res_lsn, LOGREC_UNDO_KEY_INSERT,
+ info->trn, info,
+ (translog_size_t)
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length +
+ key_length,
+ TRANSLOG_INTERNAL_PARTS + 2, log_array,
+ log_data + LSN_STORE_SIZE, &msg) ? -1 : 0;
+}
+
+
+/**
+ @brief Log creation of new page
+
+ @note
+ We don't have to store the page_length into the log entry as we can
+ calculate this from the length of the log entry
+
+ @retval 1 error
+ @retval 0 ok
+*/
+
+my_bool _ma_log_new(MARIA_PAGE *ma_page, my_bool root_page)
+{
+ LSN lsn;
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE
+ +1];
+ uint page_length;
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+ MARIA_HA *info= ma_page->info;
+ MARIA_SHARE *share= info->s;
+ my_off_t page= ma_page->pos / share->block_size;
+ DBUG_ENTER("_ma_log_new");
+ DBUG_PRINT("enter", ("page: %lu", (ulong) page));
+
+ DBUG_ASSERT(share->now_transactional);
+
+ /* Store address of new root page */
+ page_store(log_data + FILEID_STORE_SIZE, page);
+
+ /* Store link to next unused page */
+ if (info->key_del_used == 2)
+ page= 0; /* key_del not changed */
+ else
+ page= ((share->key_del_current == HA_OFFSET_ERROR) ? IMPOSSIBLE_PAGE_NO :
+ share->key_del_current / share->block_size);
+
+ page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page);
+ key_nr_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE*2,
+ ma_page->keyinfo->key_nr);
+ log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE*2 + KEY_NR_STORE_SIZE]=
+ (uchar) root_page;
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+ page_length= ma_page->size - LSN_STORE_SIZE;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= ma_page->buff + LSN_STORE_SIZE;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= page_length;
+
+ /* Remember new page length for future log entires for same page */
+ ma_page->org_size= ma_page->size;
+
+ if (translog_write_record(&lsn, LOGREC_REDO_INDEX_NEW_PAGE,
+ info->trn, info,
+ (translog_size_t)
+ (sizeof(log_data) + page_length),
+ TRANSLOG_INTERNAL_PARTS + 2, log_array,
+ log_data, NULL))
+ DBUG_RETURN(1);
+ DBUG_RETURN(0);
+}
+
+
+/**
+ @brief
+ Log when some part of the key page changes
+*/
+
+my_bool _ma_log_change(MARIA_PAGE *ma_page, const uchar *key_pos, uint length,
+ enum en_key_debug debug_marker __attribute__((unused)))
+{
+ LSN lsn;
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 6 + 7], *log_pos;
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+ uint offset= (uint) (key_pos - ma_page->buff), translog_parts;
+ MARIA_HA *info= ma_page->info;
+ my_off_t page= ma_page->pos / info->s->block_size;
+ DBUG_ENTER("_ma_log_change");
+ DBUG_PRINT("enter", ("page: %lu length: %u", (ulong) page, length));
+
+ DBUG_ASSERT(info->s->now_transactional);
+ DBUG_ASSERT(offset + length <= ma_page->size);
+ DBUG_ASSERT(ma_page->org_size == ma_page->size);
+
+ /* Store address of new root page */
+ page= ma_page->pos / info->s->block_size;
+ page_store(log_data + FILEID_STORE_SIZE, page);
+ log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+ (*log_pos++)= KEY_OP_DEBUG;
+ (*log_pos++)= debug_marker;
+#endif
+
+ log_pos[0]= KEY_OP_OFFSET;
+ int2store(log_pos+1, offset);
+ log_pos[3]= KEY_OP_CHANGE;
+ int2store(log_pos+4, length);
+ log_pos+= 6;
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (log_pos - log_data);
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
+ translog_parts= 2;
+
+ _ma_log_key_changes(ma_page,
+ log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_pos, &length, &translog_parts);
+
+ if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
+ info->trn, info,
+ (translog_size_t) (log_pos - log_data) + length,
+ TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_array, log_data, NULL))
+ DBUG_RETURN(1);
+ DBUG_RETURN(0);
+}
+
+
+/**
+ @brief Write log entry for page splitting
+
+ @fn _ma_log_split()
+ @param
+ ma_page Page that is changed
+ org_length Original length of page. Can be bigger than block_size
+ for block that overflowed
+ new_length New length of page
+ key_pos Where key is inserted on page (may be 0 if no key)
+ key_length Number of bytes changed at key_pos
+ move_length Number of bytes moved at key_pos to make room for key
+ prefix_or_suffix KEY_OP_NONE Ignored
+ KEY_OP_ADD_PREFIX Add data to start of page
+ KEY_OP_ADD_SUFFIX Add data to end of page
+ data What data was added
+ data_length Number of bytes added first or last
+ changed_length Number of bytes changed first or last.
+
+ @note
+ Write log entry for page that has got a key added to the page under
+ one and only one of the following scenarios:
+ - Page is shortened from end
+ - Data is added to end of page
+ - Data added at front of page
+*/
+
+static my_bool _ma_log_split(MARIA_PAGE *ma_page,
+ uint org_length, uint new_length,
+ const uchar *key_pos, uint key_length,
+ int move_length, enum en_key_op prefix_or_suffix,
+ const uchar *data, uint data_length,
+ uint changed_length)
+{
+ LSN lsn;
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 3+3+3+3+3+2 +7];
+ uchar *log_pos;
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6];
+ uint offset= (uint) (key_pos - ma_page->buff);
+ uint translog_parts, extra_length;
+ MARIA_HA *info= ma_page->info;
+ my_off_t page= ma_page->pos / info->s->block_size;
+ DBUG_ENTER("_ma_log_split");
+ DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u",
+ (ulong) page, org_length, new_length));
+
+ DBUG_ASSERT(changed_length >= data_length);
+ DBUG_ASSERT(org_length <= info->s->max_index_block_size);
+ DBUG_ASSERT(new_length == ma_page->size);
+ DBUG_ASSERT(org_length == ma_page->org_size);
+
+ log_pos= log_data + FILEID_STORE_SIZE;
+ page_store(log_pos, page);
+ log_pos+= PAGE_STORE_SIZE;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+ (*log_pos++)= KEY_OP_DEBUG;
+ (*log_pos++)= KEY_OP_DEBUG_LOG_SPLIT;
+#endif
+
+ /* Store keypage_flag */
+ *log_pos++= KEY_OP_SET_PAGEFLAG;
+ *log_pos++= ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET];
+
+ if (new_length <= offset || !key_pos)
+ {
+ /*
+ Page was split before inserted key. Write redo entry where
+ we just cut current page at page_length
+ */
+ uint length_offset= org_length - new_length;
+ log_pos[0]= KEY_OP_DEL_SUFFIX;
+ int2store(log_pos+1, length_offset);
+ log_pos+= 3;
+ translog_parts= 1;
+ extra_length= 0;
+ DBUG_ASSERT(data_length == 0);
+ }
+ else
+ {
+ /* Key was added to page which was split after the inserted key */
+ uint max_key_length;
+
+ /*
+ Handle case when split happened directly after the newly inserted key.
+ */
+ max_key_length= new_length - offset;
+ extra_length= min(key_length, max_key_length);
+ if (offset + move_length > new_length)
+ {
+ /* This is true when move_length includes changes for next packed key */
+ move_length= new_length - offset;
+ }
+
+ if ((int) new_length < (int) (org_length + move_length + data_length))
+ {
+ /* Shorten page */
+ uint diff= org_length + move_length + data_length - new_length;
+ log_pos[0]= KEY_OP_DEL_SUFFIX;
+ int2store(log_pos + 1, diff);
+ log_pos+= 3;
+ DBUG_ASSERT(data_length == 0); /* Page is shortened */
+ DBUG_ASSERT(offset <= org_length - diff);
+ }
+ else
+ {
+ DBUG_ASSERT(new_length == org_length + move_length + data_length);
+ DBUG_ASSERT(offset <= org_length);
+ }
+
+ log_pos[0]= KEY_OP_OFFSET;
+ int2store(log_pos+1, offset);
+ log_pos+= 3;
+
+ if (move_length)
+ {
+ log_pos[0]= KEY_OP_SHIFT;
+ int2store(log_pos+1, move_length);
+ log_pos+= 3;
+ }
+
+ log_pos[0]= KEY_OP_CHANGE;
+ int2store(log_pos+1, extra_length);
+ log_pos+= 3;
+
+ /* Point to original inserted key data */
+ if (prefix_or_suffix == KEY_OP_ADD_PREFIX)
+ key_pos+= data_length;
+
+ translog_parts= 2;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extra_length;
+ }
+
+ if (data_length)
+ {
+ /* Add prefix or suffix */
+ log_pos[0]= prefix_or_suffix;
+ int2store(log_pos+1, data_length);
+ log_pos+= 3;
+ if (prefix_or_suffix == KEY_OP_ADD_PREFIX)
+ {
+ int2store(log_pos+1, changed_length);
+ log_pos+= 2;
+ data_length= changed_length;
+ }
+ log_array[TRANSLOG_INTERNAL_PARTS + translog_parts].str= data;
+ log_array[TRANSLOG_INTERNAL_PARTS + translog_parts].length= data_length;
+ translog_parts++;
+ extra_length+= data_length;
+ }
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+ log_data);
+
+ _ma_log_key_changes(ma_page,
+ log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_pos, &extra_length, &translog_parts);
+ /* Remember new page length for future log entries for same page */
+ ma_page->org_size= ma_page->size;
+
+ DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
+ info->trn, info,
+ (translog_size_t)
+ log_array[TRANSLOG_INTERNAL_PARTS +
+ 0].length + extra_length,
+ TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_array, log_data, NULL));
+}
+
+
+/**
+ @brief
+ Write log entry for page that has got a key added to the page
+ and page is shortened from start of page
+
+ @fn _ma_log_del_prefix()
+ @param info Maria handler
+ @param page Page number
+ @param buff Page buffer
+ @param org_length Length of buffer when read
+ @param new_length Final length
+ @param key_pos Where on page buffer key was added. This is position
+ before prefix was removed
+ @param key_length How many bytes was changed at 'key_pos'
+ @param move_length How many bytes was moved up when key was added
+
+ @return
+ @retval 0 ok
+ @retval 1 error
+*/
+
+static my_bool _ma_log_del_prefix(MARIA_PAGE *ma_page,
+ uint org_length, uint new_length,
+ const uchar *key_pos, uint key_length,
+ int move_length)
+{
+ LSN lsn;
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 12 + 7];
+ uchar *log_pos;
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+ uint offset= (uint) (key_pos - ma_page->buff);
+ uint diff_length= org_length + move_length - new_length;
+ uint translog_parts, extra_length;
+ MARIA_HA *info= ma_page->info;
+ my_off_t page= ma_page->pos / info->s->block_size;
+ DBUG_ENTER("_ma_log_del_prefix");
+ DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u",
+ (ulong) page, org_length, new_length));
+
+ DBUG_ASSERT((int) diff_length > 0);
+ DBUG_ASSERT(ma_page->org_size == org_length);
+ DBUG_ASSERT(ma_page->size == new_length);
+
+ log_pos= log_data + FILEID_STORE_SIZE;
+ page_store(log_pos, page);
+ log_pos+= PAGE_STORE_SIZE;
+
+ translog_parts= 1;
+ extra_length= 0;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+ *log_pos++= KEY_OP_DEBUG;
+ *log_pos++= KEY_OP_DEBUG_LOG_DEL_PREFIX;
+#endif
+
+ /* Store keypage_flag */
+ *log_pos++= KEY_OP_SET_PAGEFLAG;
+ *log_pos++= ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET];
+
+ if (offset < diff_length + info->s->keypage_header)
+ {
+ /*
+ Key is not anymore on page. Move data down, but take into account that
+ the original page had grown with 'move_length' bytes
+ */
+ DBUG_ASSERT(offset + key_length <= diff_length + info->s->keypage_header);
+
+ log_pos[0]= KEY_OP_DEL_PREFIX;
+ int2store(log_pos+1, diff_length - move_length);
+ log_pos+= 3;
+ }
+ else
+ {
+ /*
+ Correct position to key, as data before key has been deleted and key
+ has thus been moved down
+ */
+ offset-= diff_length;
+ key_pos-= diff_length;
+
+ /* Move data down */
+ log_pos[0]= KEY_OP_DEL_PREFIX;
+ int2store(log_pos+1, diff_length);
+ log_pos+= 3;
+
+ log_pos[0]= KEY_OP_OFFSET;
+ int2store(log_pos+1, offset);
+ log_pos+= 3;
+
+ if (move_length)
+ {
+ log_pos[0]= KEY_OP_SHIFT;
+ int2store(log_pos+1, move_length);
+ log_pos+= 3;
+ }
+ log_pos[0]= KEY_OP_CHANGE;
+ int2store(log_pos+1, key_length);
+ log_pos+= 3;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= key_length;
+ translog_parts= 2;
+ extra_length= key_length;
+ }
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+ log_data);
+ _ma_log_key_changes(ma_page,
+ log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_pos, &extra_length, &translog_parts);
+ /* Remember new page length for future log entries for same page */
+ ma_page->org_size= ma_page->size;
+
+ DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
+ info->trn, info,
+ (translog_size_t)
+ log_array[TRANSLOG_INTERNAL_PARTS +
+ 0].length + extra_length,
+ TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_array, log_data, NULL));
+}
+
+
+/**
+ @brief
+ Write log entry for page that has got data added first and
+ data deleted last. Old changed key may be part of page
+*/
+
+static my_bool _ma_log_key_middle(MARIA_PAGE *ma_page,
+ uint new_length,
+ uint data_added_first,
+ uint data_changed_first,
+ uint data_deleted_last,
+ const uchar *key_pos,
+ uint key_length, int move_length)
+{
+ LSN lsn;
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 3+5+3+3+3 + 7];
+ uchar *log_pos;
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6];
+ uint key_offset;
+ uint translog_parts, extra_length;
+ MARIA_HA *info= ma_page->info;
+ my_off_t page= ma_page->pos / info->s->block_size;
+ DBUG_ENTER("_ma_log_key_middle");
+ DBUG_PRINT("enter", ("page: %lu", (ulong) page));
+
+ DBUG_ASSERT(ma_page->size == new_length);
+
+ /* new place of key after changes */
+ key_pos+= data_added_first;
+ key_offset= (uint) (key_pos - ma_page->buff);
+ if (key_offset < new_length)
+ {
+ /* key is on page; Calculate how much of the key is there */
+ uint max_key_length= new_length - key_offset;
+ if (max_key_length < key_length)
+ {
+ /* Key is last on page */
+ key_length= max_key_length;
+ move_length= 0;
+ }
+ /*
+ Take into account that new data was added as part of original key
+ that also needs to be removed from page
+ */
+ data_deleted_last+= move_length;
+ }
+
+ /* First log changes to page */
+ log_pos= log_data + FILEID_STORE_SIZE;
+ page_store(log_pos, page);
+ log_pos+= PAGE_STORE_SIZE;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+ *log_pos++= KEY_OP_DEBUG;
+ *log_pos++= KEY_OP_DEBUG_LOG_MIDDLE;
+#endif
+
+ /* Store keypage_flag */
+ *log_pos++= KEY_OP_SET_PAGEFLAG;
+ *log_pos++= ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET];
+
+ log_pos[0]= KEY_OP_DEL_SUFFIX;
+ int2store(log_pos+1, data_deleted_last);
+ log_pos+= 3;
+
+ log_pos[0]= KEY_OP_ADD_PREFIX;
+ int2store(log_pos+1, data_added_first);
+ int2store(log_pos+3, data_changed_first);
+ log_pos+= 5;
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+ log_data);
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (ma_page->buff +
+ info->s->keypage_header);
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_changed_first;
+ translog_parts= 2;
+ extra_length= data_changed_first;
+
+ /* If changed key is on page, log those changes too */
+
+ if (key_offset < new_length)
+ {
+ uchar *start_log_pos= log_pos;
+
+ log_pos[0]= KEY_OP_OFFSET;
+ int2store(log_pos+1, key_offset);
+ log_pos+= 3;
+ if (move_length)
+ {
+ log_pos[0]= KEY_OP_SHIFT;
+ int2store(log_pos+1, move_length);
+ log_pos+= 3;
+ }
+ log_pos[0]= KEY_OP_CHANGE;
+ int2store(log_pos+1, key_length);
+ log_pos+= 3;
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 2].str= start_log_pos;
+ log_array[TRANSLOG_INTERNAL_PARTS + 2].length= (uint) (log_pos -
+ start_log_pos);
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 3].str= key_pos;
+ log_array[TRANSLOG_INTERNAL_PARTS + 3].length= key_length;
+ translog_parts+=2;
+ extra_length+= (uint) (log_array[TRANSLOG_INTERNAL_PARTS + 2].length +
+ key_length);
+ }
+
+ _ma_log_key_changes(ma_page,
+ log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_pos, &extra_length, &translog_parts);
+ /* Remember new page length for future log entries for same page */
+ ma_page->org_size= ma_page->size;
+
+ DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
+ info->trn, info,
+ (translog_size_t)
+ (log_array[TRANSLOG_INTERNAL_PARTS +
+ 0].length + extra_length),
+ TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_array, log_data, NULL));
+}
+
+
+#ifdef NOT_NEEDED
+
+/**
+ @brief
+ Write log entry for page that has got data added first and
+ data deleted last
+*/
+
+static my_bool _ma_log_middle(MARIA_PAGE *ma_page,
+ uint data_added_first, uint data_changed_first,
+ uint data_deleted_last)
+{
+ LSN lsn;
+ LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 3 + 5 + 7], *log_pos;
+ MARIA_HA *info= ma_page->info;
+ my_off_t page= ma_page->page / info->s->block_size;
+ uint translog_parts, extra_length;
+ DBUG_ENTER("_ma_log_middle");
+ DBUG_PRINT("enter", ("page: %lu", (ulong) page));
+
+ DBUG_ASSERT(ma_page->org_size + data_added_first - data_deleted_last ==
+ ma_page->size);
+
+ log_pos= log_data + FILEID_STORE_SIZE;
+ page_store(log_pos, page);
+ log_pos+= PAGE_STORE_SIZE;
+
+ log_pos[0]= KEY_OP_DEL_PREFIX;
+ int2store(log_pos+1, data_deleted_last);
+ log_pos+= 3;
+
+ log_pos[0]= KEY_OP_ADD_PREFIX;
+ int2store(log_pos+1, data_added_first);
+ int2store(log_pos+3, data_changed_first);
+ log_pos+= 5;
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+ log_data);
+
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= ((char*) buff +
+ info->s->keypage_header);
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_changed_first;
+ translog_parts= 2;
+ extra_length= data_changed_first;
+
+ _ma_log_key_changes(ma_page,
+ log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_pos, &extra_length, &translog_parts);
+ /* Remember new page length for future log entries for same page */
+ ma_page->org_size= ma_page->size;
+
+ DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
+ info->trn, info,
+ (translog_size_t)
+ log_array[TRANSLOG_INTERNAL_PARTS +
+ 0].length + extra_length,
+ TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_array, log_data, NULL));
+}
+#endif
diff --git a/storage/maria/maria_chk.c b/storage/maria/maria_chk.c
new file mode 100644
index 00000000000..4e19d5878ea
--- /dev/null
+++ b/storage/maria/maria_chk.c
@@ -0,0 +1,2008 @@
+/* Copyright (C) 2003-2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Describe, check and repair of MARIA tables */
+
+#include "ma_fulltext.h"
+#include <myisamchk.h>
+#include <my_bit.h>
+#include <m_ctype.h>
+#include <stdarg.h>
+#include <my_getopt.h>
+#ifdef HAVE_SYS_VADVICE_H
+#include <sys/vadvise.h>
+#endif
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+SET_STACK_SIZE(9000) /* Minimum stack size for program */
+
+#ifndef USE_RAID
+#define my_raid_create(A,B,C,D,E,F,G) my_create(A,B,C,G)
+#define my_raid_delete(A,B,C) my_delete(A,B)
+#endif
+
+static uint decode_bits;
+static char **default_argv;
+static const char *load_default_groups[]= { "aria_chk", 0 };
+static const char *set_collation_name, *opt_tmpdir, *opt_log_dir;
+static CHARSET_INFO *set_collation;
+static int stopwords_inited= 0;
+static MY_TMPDIR maria_chk_tmpdir;
+static my_bool opt_transaction_logging, opt_debug, opt_require_control_file;
+static my_bool opt_warning_for_wrong_transid;
+
+static const char *type_names[]=
+{
+ "impossible","char","binary", "short", "long", "float",
+ "double","number","unsigned short",
+ "unsigned long","longlong","ulonglong","int24",
+ "uint24","int8","varchar", "varbin", "varchar2", "varbin2", "bit",
+ "?","?"
+};
+
+static const char *prefix_packed_txt="packed ",
+ *bin_packed_txt="prefix ",
+ *diff_txt="stripped ",
+ *null_txt="NULL",
+ *blob_txt="BLOB ";
+
+static const char *field_pack[]=
+{
+ "","no endspace", "no prespace",
+ "no zeros", "blob", "constant", "table-lockup",
+ "always zero","varchar","unique-hash","?","?"
+};
+
+static const char *record_formats[]=
+{
+ "Fixed length", "Packed", "Compressed", "Block", "?"
+};
+
+static const char *bitmap_description[]=
+{
+ "Empty page", "Part filled head page","Part filled head page",
+ "Part filled head page", "Full head page",
+ "Part filled tail page","Part filled tail page",
+ "Full tail or blob page"
+};
+
+static const char *maria_stats_method_str="nulls_unequal";
+static char default_open_errmsg[]= "%d when opening Aria table '%s'";
+static char default_close_errmsg[]= "%d when closing Aria table '%s'";
+
+static void get_options(int *argc,char * * *argv);
+static void print_version(void);
+static void usage(void);
+static int maria_chk(HA_CHECK *param, char *filename);
+static void descript(HA_CHECK *param, register MARIA_HA *info, char *name);
+static int maria_sort_records(HA_CHECK *param, register MARIA_HA *info,
+ char *name, uint sort_key,
+ my_bool write_info, my_bool update_index);
+static int sort_record_index(MARIA_SORT_PARAM *sort_param, MARIA_PAGE *page,
+ uint sortkey, File new_file,
+ my_bool update_index);
+static my_bool write_log_record(HA_CHECK *param);
+
+HA_CHECK check_param;
+
+ /* Main program */
+
+int main(int argc, char **argv)
+{
+ int error;
+ MY_INIT(argv[0]);
+
+ opt_log_dir= maria_data_root= (char *)".";
+ maria_chk_init(&check_param);
+ check_param.opt_lock_memory= 1; /* Lock memory if possible */
+ check_param.using_global_keycache = 0;
+ get_options(&argc,(char***) &argv);
+ maria_quick_table_bits=decode_bits;
+ error=0;
+ maria_init();
+
+ maria_block_size= 0; /* Use block size from control file */
+ if (ma_control_file_open(FALSE, opt_require_control_file ||
+ !(check_param.testflag & T_SILENT)) &&
+ (opt_require_control_file ||
+ (opt_transaction_logging && (check_param.testflag & T_REP_ANY))))
+ {
+ error= 1;
+ goto end;
+ }
+
+ /*
+ If we are doing a repair, user may want to store this repair into the log
+ so that the log has a complete history and can be used to replay.
+ */
+ if (opt_transaction_logging && (check_param.testflag & T_REP_ANY))
+ {
+ if (init_pagecache(maria_log_pagecache,
+ TRANSLOG_PAGECACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE, MY_WME) == 0 ||
+ translog_init(opt_log_dir, TRANSLOG_FILE_SIZE,
+ 0, 0, maria_log_pagecache,
+ TRANSLOG_DEFAULT_FLAGS, 0))
+ {
+ _ma_check_print_error(&check_param,
+ "Can't initialize transaction logging. Run "
+ "recovery with switch --skip-transaction-log");
+ error= 1;
+ goto end;
+ }
+ }
+
+ while (--argc >= 0)
+ {
+ int new_error=maria_chk(&check_param, *(argv++));
+ if ((check_param.testflag & T_REP_ANY) != T_REP)
+ check_param.testflag&= ~T_REP;
+ VOID(fflush(stdout));
+ VOID(fflush(stderr));
+ if ((check_param.error_printed | check_param.warning_printed) &&
+ (check_param.testflag & T_FORCE_CREATE) &&
+ (!(check_param.testflag & (T_REP | T_REP_BY_SORT | T_SORT_RECORDS |
+ T_SORT_INDEX))))
+ {
+ ulonglong old_testflag=check_param.testflag;
+ if (!(check_param.testflag & T_REP))
+ check_param.testflag|= T_REP_BY_SORT;
+ check_param.testflag&= ~T_EXTEND; /* Not needed */
+ error|=maria_chk(&check_param, argv[-1]);
+ check_param.testflag= old_testflag;
+ VOID(fflush(stdout));
+ VOID(fflush(stderr));
+ }
+ else
+ error|=new_error;
+ if (argc && (!(check_param.testflag & T_SILENT) ||
+ check_param.testflag & T_INFO))
+ {
+ puts("\n---------\n");
+ VOID(fflush(stdout));
+ }
+ }
+end:
+ if (check_param.total_files > 1)
+ { /* Only if descript */
+ char buff[22],buff2[22];
+ if (!(check_param.testflag & T_SILENT) || check_param.testflag & T_INFO)
+ puts("\n---------");
+ printf("\nTotal of all %d Aria-files:\nData records: %9s Deleted blocks: %9s\n",check_param.total_files,llstr(check_param.total_records,buff),
+ llstr(check_param.total_deleted,buff2));
+ }
+ free_defaults(default_argv);
+ free_tmpdir(&maria_chk_tmpdir);
+ maria_end();
+ my_end(check_param.testflag & T_INFO ?
+ MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
+ exit(error);
+#ifndef _lint
+ return 0; /* No compiler warning */
+#endif
+} /* main */
+
+enum options_mc {
+ OPT_CHARSETS_DIR=256, OPT_SET_COLLATION,OPT_START_CHECK_POS,
+ OPT_CORRECT_CHECKSUM, OPT_PAGE_BUFFER_SIZE,
+ OPT_KEY_CACHE_BLOCK_SIZE, OPT_MARIA_BLOCK_SIZE,
+ OPT_READ_BUFFER_SIZE, OPT_WRITE_BUFFER_SIZE, OPT_SORT_BUFFER_SIZE,
+ OPT_SORT_KEY_BLOCKS, OPT_DECODE_BITS, OPT_FT_MIN_WORD_LEN,
+ OPT_FT_MAX_WORD_LEN, OPT_FT_STOPWORD_FILE,
+ OPT_MAX_RECORD_LENGTH, OPT_AUTO_CLOSE, OPT_STATS_METHOD, OPT_TRANSACTION_LOG,
+ OPT_SKIP_SAFEMALLOC, OPT_ZEROFILL_KEEP_LSN, OPT_REQUIRE_CONTROL_FILE,
+ OPT_LOG_DIR, OPT_DATADIR, OPT_WARNING_FOR_WRONG_TRANSID
+};
+
+static struct my_option my_long_options[] =
+{
+ {"analyze", 'a',
+ "Analyze distribution of keys. Will make some joins in MySQL faster. You can check the calculated distribution.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifdef __NETWARE__
+ {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"block-search", 'b',
+ "No help available.",
+ 0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"backup", 'B',
+ "Make a backup of the .MAD file as 'filename-time.BAK'.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"character-sets-dir", OPT_CHARSETS_DIR,
+ "Directory where character sets are.",
+ (char**) &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"check", 'c',
+ "Check table for errors.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"check-only-changed", 'C',
+ "Check only tables that have changed since last check. It also applies to other requested actions (e.g. --analyze will be ignored if the table is already analyzed).",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"correct-checksum", OPT_CORRECT_CHECKSUM,
+ "Correct checksum information for table.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+ {"debug", '#',
+ "Output debug log. Often this is 'd:t:o,filename'.",
+ 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"description", 'd',
+ "Prints some information about table.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"data-file-length", 'D',
+ "Max length of data file (when recreating data-file when it's full).",
+ &check_param.max_data_file_length,
+ &check_param.max_data_file_length,
+ 0, GET_LL, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"extend-check", 'e',
+ "If used when checking a table, ensure that the table is 100 percent consistent, which will take a long time. If used when repairing a table, try to recover every possible row from the data file. Normally this will also find a lot of garbage rows; Don't use this option with repair if you are not totally desperate.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"fast", 'F',
+ "Check only tables that haven't been closed properly. It also applies to other requested actions (e.g. --analyze will be ignored if the table is already analyzed).",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"force", 'f',
+ "Restart with -r if there are any errors in the table. States will be updated as with --update-state.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"HELP", 'H',
+ "Display this help and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"help", '?',
+ "Display this help and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"information", 'i',
+ "Print statistics information about table that is checked.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"keys-used", 'k',
+ "Tell Aria to update only some specific keys. # is a bit mask of which keys to use. This can be used to get faster inserts.",
+ &check_param.keys_in_use,
+ &check_param.keys_in_use,
+ 0, GET_ULL, REQUIRED_ARG, -1, 0, 0, 0, 0, 0},
+ {"datadir", OPT_DATADIR,
+ "Path for control file (and logs if --logdir not used).",
+ &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG,
+ 0, 0, 0, 0, 0, 0},
+ {"logdir", OPT_LOG_DIR,
+ "Path for log files.",
+ (char**) &opt_log_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"max-record-length", OPT_MAX_RECORD_LENGTH,
+ "Skip rows bigger than this if aria_chk can't allocate memory to hold it",
+ &check_param.max_record_length,
+ &check_param.max_record_length,
+ 0, GET_ULL, REQUIRED_ARG, LONGLONG_MAX, 0, LONGLONG_MAX, 0, 0, 0},
+ {"medium-check", 'm',
+ "Faster than extend-check, but only finds 99.99% of all errors. Should be good enough for most cases.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"quick", 'q', "Faster repair by not modifying the data file.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"read-only", 'T',
+ "Don't mark table as checked.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"recover", 'r',
+ "Can fix almost anything except unique keys that aren't unique.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"parallel-recover", 'p',
+ "Same as '-r' but creates all the keys in parallel.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"safe-recover", 'o',
+ "Uses old recovery method; Slower than '-r' but can handle a couple of cases where '-r' reports that it can't fix the data file.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"sort-recover", 'n',
+ "Force recovering with sorting even if the temporary file was very big.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { "require-control-file", OPT_REQUIRE_CONTROL_FILE,
+ "Abort if cannot find control file",
+ (uchar**)&opt_require_control_file, 0, 0, GET_BOOL, NO_ARG,
+ 0, 0, 0, 0, 0, 0},
+#ifdef DEBUG
+ {"start-check-pos", OPT_START_CHECK_POS,
+ "No help available.",
+ 0, 0, 0, GET_ULL, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"set-auto-increment", 'A',
+ "Force auto_increment to start at this or higher value. If no value is given, then sets the next auto_increment value to the highest used value for the auto key + 1.",
+ &check_param.auto_increment_value,
+ &check_param.auto_increment_value,
+ 0, GET_ULL, OPT_ARG, 0, 0, 0, 0, 0, 0},
+ {"set-collation", OPT_SET_COLLATION,
+ "Change the collation used by the index",
+ (char**) &set_collation_name, 0, 0, GET_STR, REQUIRED_ARG,
+ 0, 0, 0, 0, 0, 0},
+ {"silent", 's',
+ "Only print errors. One can use two -s to make aria_chk very silent.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+#ifdef SAFEMALLOC
+ {"skip-safemalloc", OPT_SKIP_SAFEMALLOC,
+ "Don't use the memory allocation checking.", 0, 0, 0, GET_NO_ARG, NO_ARG,
+ 0, 0, 0, 0, 0, 0},
+#endif
+#endif
+ {"sort-index", 'S',
+ "Sort index blocks. This speeds up 'read-next' in applications.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"sort-records", 'R',
+ "Sort records according to an index. This makes your data much more localized and may speed up things. (It may be VERY slow to do a sort the first time!)",
+ &check_param.opt_sort_key,
+ &check_param.opt_sort_key,
+ 0, GET_UINT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"tmpdir", 't', "Path for temporary files.", (char**) &opt_tmpdir,
+ 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"transaction-log", OPT_TRANSACTION_LOG,
+ "Log repair command to transaction log",
+ &opt_transaction_logging, &opt_transaction_logging,
+ 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"update-state", 'U',
+ "Mark tables as crashed if any errors were found and clean if check didn't "
+ "find any errors. This allows one to get rid of warnings like 'table not "
+ "properly closed'",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"unpack", 'u',
+ "Unpack file packed with aria_pack.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"verbose", 'v',
+ "Print more information. This can be used with --description and --check. Use many -v for more verbosity!",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"version", 'V', "Print version and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"wait", 'w', "Wait if table is locked.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"warning-for-wrong-transaction-id", OPT_WARNING_FOR_WRONG_TRANSID,
+ "Give a warning if we find a transaction id in the table that is bigger"
+ "than what exists in the control file. Use --skip-... to disable warning",
+ &opt_warning_for_wrong_transid, &opt_warning_for_wrong_transid,
+ 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0},
+ { "page_buffer_size", OPT_PAGE_BUFFER_SIZE,
+ "Size of page buffer. Used by --safe-repair",
+ &check_param.use_buffers, &check_param.use_buffers, 0,
+ GET_ULONG, REQUIRED_ARG, (long) USE_BUFFER_INIT, 1024L*1024L,
+ (long) ~0L, (long) MALLOC_OVERHEAD, (long) IO_SIZE, 0},
+ { "read_buffer_size", OPT_READ_BUFFER_SIZE,
+ "Read buffer size for sequential reads during scanning",
+ &check_param.read_buffer_length,
+ &check_param.read_buffer_length, 0, GET_ULONG, REQUIRED_ARG,
+ (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD,
+ (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0},
+ { "write_buffer_size", OPT_WRITE_BUFFER_SIZE,
+ "Write buffer size for sequential writes during repair of fixed size or dynamic size rows",
+ &check_param.write_buffer_length,
+ &check_param.write_buffer_length, 0, GET_ULONG, REQUIRED_ARG,
+ (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD,
+ (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0},
+ { "sort_buffer_size", OPT_SORT_BUFFER_SIZE,
+ "Size of sort buffer. Used by --recover",
+ &check_param.sort_buffer_length,
+ &check_param.sort_buffer_length, 0, GET_ULONG, REQUIRED_ARG,
+ (long) SORT_BUFFER_INIT, (long) (MIN_SORT_BUFFER + MALLOC_OVERHEAD),
+ (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0},
+ { "sort_key_blocks", OPT_SORT_KEY_BLOCKS,
+ "Internal buffer for sorting keys; Don't touch :)",
+ &check_param.sort_key_blocks,
+ &check_param.sort_key_blocks, 0, GET_ULONG, REQUIRED_ARG,
+ BUFFERS_WHEN_SORTING, 4L, 100L, 0L, 1L, 0},
+ { "decode_bits", OPT_DECODE_BITS, "", &decode_bits,
+ &decode_bits, 0, GET_UINT, REQUIRED_ARG, 9L, 4L, 17L, 0L, 1L, 0},
+ { "ft_min_word_len", OPT_FT_MIN_WORD_LEN, "", &ft_min_word_len,
+ &ft_min_word_len, 0, GET_ULONG, REQUIRED_ARG, 4, 1, HA_FT_MAXCHARLEN,
+ 0, 1, 0},
+ { "ft_max_word_len", OPT_FT_MAX_WORD_LEN, "", &ft_max_word_len,
+ &ft_max_word_len, 0, GET_ULONG, REQUIRED_ARG, HA_FT_MAXCHARLEN, 10,
+ HA_FT_MAXCHARLEN, 0, 1, 0},
+ { "aria_ft_stopword_file", OPT_FT_STOPWORD_FILE,
+ "Use stopwords from this file instead of built-in list.",
+ (char**) &ft_stopword_file, (char**) &ft_stopword_file, 0, GET_STR,
+ REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ { "stats_method", OPT_STATS_METHOD,
+ "Specifies how index statistics collection code should treat NULLs. "
+ "Possible values of name are \"nulls_unequal\" (default behavior for 4.1/5.0), "
+ "\"nulls_equal\" (emulate 4.0 behavior), and \"nulls_ignored\".",
+ (char**) &maria_stats_method_str, (char**) &maria_stats_method_str, 0,
+ GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ { "zerofill", 'z',
+ "Fill empty space in data and index files with zeroes,",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { "zerofill-keep-lsn", OPT_ZEROFILL_KEEP_LSN,
+ "Like --zerofill but does not zero out LSN of data/index pages;"
+ " used only for testing and debugging",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+#include <help_start.h>
+
+/* Print program name, version and the system/machine it was built for. */
+static void print_version(void)
+{
+ printf("%s Ver 1.0 for %s at %s\n", my_progname, SYSTEM_TYPE,
+ MACHINE_TYPE);
+ NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+/*
+  Print the full help text: version banner, global options, check options,
+  repair options, other actions and tunable variables, followed by the
+  values read from the defaults files.
+  Fixes typos in the user-visible help strings ("throughly", "currupted",
+  stray '/' after "(repair)", missing "line" in the intro sentence).
+*/
+static void usage(void)
+{
+ print_version();
+ puts("By Monty, for your professional use");
+ puts("This software comes with NO WARRANTY: see the PUBLIC for details.\n");
+ puts("Description, check and repair of Aria tables.");
+ puts("Used without options all tables on the command line will be checked for errors");
+ printf("Usage: %s [OPTIONS] tables[.MAI]\n", my_progname_short);
+ printf("\nGlobal options:\n");
+#ifndef DBUG_OFF
+ printf("\
+ -#, --debug=... Output debug log. Often this is 'd:t:o,filename'.\n");
+#endif
+ printf("\
+ -H, --HELP Display this help and exit.\n\
+ -?, --help Display this help and exit.\n\
+ --datadir=path Path for control file (and logs if --logdir not used)\n\
+ --logdir=path Path for log files\n\
+ --require-control-file Abort if we can't find/read the maria_log_control\n\
+ file\n\
+ -s, --silent Only print errors. One can use two -s to make\n\
+ maria_chk very silent.\n\
+ -t, --tmpdir=path Path for temporary files. Multiple paths can be\n\
+ specified, separated by ");
+#if defined( __WIN__) || defined(__NETWARE__)
+ printf("semicolon (;)");
+#else
+ printf("colon (:)");
+#endif
+ printf(", they will be used\n\
+ in a round-robin fashion.\n\
+ -v, --verbose Print more information. This can be used with\n\
+ --description and --check. Use many -v for more verbosity.\n\
+ -V, --version Print version and exit.\n\
+ -w, --wait Wait if table is locked.\n\n");
+#ifdef DEBUG
+ puts(" --start-check-pos=# Start reading file at given offset.\n");
+#endif
+
+ puts("Check options (check is the default action for aria_chk):\n\
+ -c, --check Check table for errors.\n\
+ -e, --extend-check Check the table VERY thoroughly. Only use this in\n\
+ extreme cases as aria_chk should normally be able to\n\
+ find out if the table is ok even without this switch.\n\
+ -F, --fast Check only tables that haven't been closed properly.\n\
+ -C, --check-only-changed\n\
+ Check only tables that have changed since last check.\n\
+ -f, --force Restart with '-r' if there are any errors in the table.\n\
+ States will be updated as with '--update-state'.\n\
+ -i, --information Print statistics information about table that is checked.\n\
+ -m, --medium-check Faster than extend-check, but only finds 99.99% of\n\
+ all errors. Should be good enough for most cases.\n\
+ -U, --update-state Mark tables as crashed if you find any errors.\n\
+ -T, --read-only Don't mark table as checked.\n");
+
+ puts("\
+Recover (repair) options (When using '--recover' or '--safe-recover'):\n\
+ -B, --backup Make a backup of the .MAD file as 'filename-time.BAK'.\n\
+ --correct-checksum Correct checksum information for table.\n\
+ -D, --data-file-length=# Max length of data file (when recreating data\n\
+ file when it's full).\n\
+ -e, --extend-check Try to recover every possible row from the data file\n\
+ Normally this will also find a lot of garbage rows;\n\
+ Don't use this option if you are not totally desperate.\n\
+ -f, --force Overwrite old temporary files.\n\
+ -k, --keys-used=# Tell Aria to update only some specific keys. # is a\n\
+ bit mask of which keys to use. This can be used to\n\
+ get faster inserts.\n\
+ --max-record-length=#\n\
+ Skip rows bigger than this if aria_chk can't allocate\n\
+ memory to hold it.\n\
+ -r, --recover Can fix almost anything except unique keys that aren't\n\
+ unique.\n\
+ -n, --sort-recover Forces recovering with sorting even if the temporary\n\
+ file would be very big.\n\
+ -p, --parallel-recover\n\
+ Uses the same technique as '-r' and '-n', but creates\n\
+ all the keys in parallel, in different threads.");
+ puts("\
+ -o, --safe-recover Uses old recovery method; Slower than '-r' but can\n \
+ handle a couple of cases where '-r' reports that it\n\
+ can't fix the data file.\n\
+ --transaction-log Log repair command to transaction log. This is needed\n\
+ if one wants to use the aria_read_log to repeat the \n\
+ repair\n\
+ --character-sets-dir=...\n\
+ Directory where character sets are.\n\
+ --set-collation=name\n\
+ Change the collation used by the index.\n\
+ -q, --quick Faster repair by not modifying the data file.\n\
+ One can give a second '-q' to force aria_chk to\n\
+ modify the original datafile in case of duplicate keys.\n\
+ NOTE: Tables where the data file is corrupted can't be\n\
+ fixed with this option.\n\
+ -u, --unpack Unpack file packed with ariapack.\n\
+");
+
+ puts("Other actions:\n\
+ -a, --analyze Analyze distribution of keys. Will make some joins in\n\
+ MariaDB faster. You can check the calculated distribution\n\
+ by using '--description --verbose table_name'.\n\
+ --stats_method=name Specifies how index statistics collection code should\n\
+ treat NULLs. Possible values of name are \"nulls_unequal\"\n\
+ (default for 4.1/5.0), \"nulls_equal\" (emulate 4.0), and \n\
+ \"nulls_ignored\".\n\
+ -d, --description Prints some information about table.\n\
+ -A, --set-auto-increment[=value]\n\
+ Force auto_increment to start at this or higher value\n\
+ If no value is given, then sets the next auto_increment\n\
+ value to the highest used value for the auto key + 1.\n\
+ -S, --sort-index Sort index blocks. This speeds up 'read-next' in\n\
+ applications.\n\
+ -R, --sort-records=#\n\
+ Sort records according to an index. This makes your\n\
+ data much more localized and may speed up things\n\
+ (It may be VERY slow to do a sort the first time!).\n\
+ -b, --block-search=#\n\
+ Find a record, a block at given offset belongs to.\n\
+ -z, --zerofill Fill empty space in data and index files with zeroes\n\
+ --zerofill-keep-lsn Like --zerofill but does not zero out LSN of\n\
+ data/index pages.");
+
+ puts("Variables:\n\
+--page_buffer_size=# Size of page buffer. Used by --safe-repair\n\
+--read_buffer_size=# Read buffer size for sequential reads during scanning\n\
+--sort_buffer_size=# Size of sort buffer. Used by --recover\n\
+--sort_key_blocks=# Internal buffer for sorting keys; Don't touch :)\n\
+--write_buffer_size=# Write buffer size for sequential writes during repair");
+
+ print_defaults("my", load_default_groups);
+ my_print_variables(my_long_options);
+}
+
+#include <help_end.h>
+
+/*
+  Legal values for --stats_method, and the TYPELIB used to parse them
+  with find_type() in get_one_option().
+*/
+const char *maria_stats_method_names[] = {"nulls_unequal", "nulls_equal",
+ "nulls_ignored", NullS};
+TYPELIB maria_stats_method_typelib= {
+ array_elements(maria_stats_method_names) - 1, "",
+ maria_stats_method_names, NULL};
+
+ /* Read options */
+
+/*
+  Handle one parsed command-line option (callback for handle_options()).
+
+  Most options set or clear bits in check_param.testflag; a
+  "--skip-<option>" form is signalled by argument == disabled_my_option
+  and clears the corresponding bit(s).  Options with fatal outcomes
+  (-V, -H, -?, bad --stats_method, bad -R value) call exit() directly.
+
+  Always returns 0 (never aborts option parsing via the return value).
+*/
+static my_bool
+get_one_option(int optid,
+ const struct my_option *opt __attribute__((unused)),
+ char *argument)
+{
+ switch (optid) {
+#ifdef __NETWARE__
+ case OPT_AUTO_CLOSE:
+ setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
+ break;
+#endif
+ case 'a':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_STATISTICS;
+ else
+ check_param.testflag|= T_STATISTICS;
+ break;
+ case 'A':
+ if (argument)
+ check_param.auto_increment_value= strtoull(argument, NULL, 0);
+ else
+ check_param.auto_increment_value= 0; /* Set to max used value */
+ check_param.testflag|= T_AUTO_INC;
+ break;
+ case 'b':
+ check_param.search_after_block= strtoul(argument, NULL, 10);
+ break;
+ case 'B':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_BACKUP_DATA;
+ else
+ check_param.testflag|= T_BACKUP_DATA;
+ break;
+ case 'c':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_CHECK;
+ else
+ check_param.testflag|= T_CHECK;
+ break;
+ case 'C':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~(T_CHECK | T_CHECK_ONLY_CHANGED);
+ else
+ check_param.testflag|= T_CHECK | T_CHECK_ONLY_CHANGED;
+ break;
+ case 'D':
+ check_param.max_data_file_length=strtoll(argument, NULL, 10);
+ break;
+ case 's': /* silent; a second -s upgrades to very silent */
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~(T_SILENT | T_VERY_SILENT);
+ else
+ {
+ if (check_param.testflag & T_SILENT)
+ check_param.testflag|= T_VERY_SILENT;
+ check_param.testflag|= T_SILENT;
+ check_param.testflag&= ~T_WRITE_LOOP;
+ }
+ break;
+ case 'w':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_WAIT_FOREVER;
+ else
+ check_param.testflag|= T_WAIT_FOREVER;
+ break;
+ case 'd': /* print description of the table */
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_DESCRIPT;
+ else
+ check_param.testflag|= T_DESCRIPT;
+ break;
+ case 'e': /* extend check */
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_EXTEND;
+ else
+ check_param.testflag|= T_EXTEND;
+ break;
+ case 'i':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_INFO;
+ else
+ check_param.testflag|= T_INFO;
+ break;
+ case 'f':
+ if (argument == disabled_my_option)
+ {
+ check_param.tmpfile_createflag= O_RDWR | O_TRUNC | O_EXCL;
+ check_param.testflag&= ~(T_FORCE_CREATE | T_UPDATE_STATE);
+ }
+ else
+ {
+ check_param.tmpfile_createflag= O_RDWR | O_TRUNC;
+ check_param.testflag|= T_FORCE_CREATE | T_UPDATE_STATE;
+ }
+ break;
+ case 'F':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_FAST;
+ else
+ check_param.testflag|= T_FAST;
+ break;
+ case 'k':
+ check_param.keys_in_use= (ulonglong) strtoll(argument, NULL, 10);
+ break;
+ case 'm':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_MEDIUM;
+ else
+ check_param.testflag|= T_MEDIUM; /* Medium check */
+ break;
+ case 'r': /* Repair table */
+ check_param.testflag&= ~T_REP_ANY;
+ if (argument != disabled_my_option)
+ check_param.testflag|= T_REP_BY_SORT;
+ break;
+ case 'p':
+ check_param.testflag&= ~T_REP_ANY;
+ if (argument != disabled_my_option)
+ check_param.testflag|= T_REP_PARALLEL;
+ break;
+ case 'o':
+ check_param.testflag&= ~T_REP_ANY;
+ check_param.force_sort= 0;
+ if (argument != disabled_my_option)
+ {
+ check_param.testflag|= T_REP;
+ my_disable_async_io= 1; /* More safety */
+ }
+ break;
+ case 'n':
+ check_param.testflag&= ~T_REP_ANY;
+ if (argument == disabled_my_option)
+ check_param.force_sort= 0;
+ else
+ {
+ check_param.testflag|= T_REP_BY_SORT;
+ check_param.force_sort= 1;
+ }
+ break;
+ case 'q':
+ /* A second -q adds T_FORCE_UNIQUENESS on top of T_QUICK */
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~(T_QUICK | T_FORCE_UNIQUENESS);
+ else
+ check_param.testflag|=
+ (check_param.testflag & T_QUICK) ? T_FORCE_UNIQUENESS : T_QUICK;
+ break;
+ case 'u':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_UNPACK;
+ else
+ {
+ /* Unpacking implies a repair; default to repair-by-sort */
+ check_param.testflag|= T_UNPACK;
+ if (!(check_param.testflag & T_REP_ANY))
+ check_param.testflag|= T_REP_BY_SORT;
+ }
+ break;
+ case 'v': /* Verbose; repeated -v increases verbosity */
+ if (argument == disabled_my_option)
+ {
+ check_param.testflag&= ~T_VERBOSE;
+ check_param.verbose=0;
+ }
+ else
+ {
+ check_param.testflag|= T_VERBOSE;
+ check_param.verbose++;
+ }
+ break;
+ case 'R': /* Sort records; argument is 1-based key number */
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_SORT_RECORDS;
+ else
+ {
+ check_param.testflag|= T_SORT_RECORDS;
+ check_param.opt_sort_key= (uint) atoi(argument) - 1;
+ if (check_param.opt_sort_key >= MARIA_MAX_KEY)
+ {
+ fprintf(stderr,
+ "The value of the sort key is bigger than max key: %d.\n",
+ MARIA_MAX_KEY);
+ exit(1);
+ }
+ }
+ break;
+ case 'S': /* Sort index */
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_SORT_INDEX;
+ else
+ check_param.testflag|= T_SORT_INDEX;
+ break;
+ case 'T':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_READONLY;
+ else
+ check_param.testflag|= T_READONLY;
+ break;
+ case 'U':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_UPDATE_STATE;
+ else
+ check_param.testflag|= T_UPDATE_STATE;
+ break;
+ case '#':
+ DBUG_SET_INITIAL(argument ? argument : "d:t:o,/tmp/aria_chk.trace");
+ opt_debug= 1;
+ break;
+ case OPT_SKIP_SAFEMALLOC:
+#ifdef SAFEMALLOC
+ sf_malloc_quick=1;
+#endif
+ break;
+ case 'V':
+ print_version();
+ exit(0);
+ case OPT_CORRECT_CHECKSUM:
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_CALC_CHECKSUM;
+ else
+ check_param.testflag|= T_CALC_CHECKSUM;
+ break;
+ case OPT_STATS_METHOD:
+ {
+ /* Map the option string to an enum_handler_stats_method value */
+ int method;
+ enum_handler_stats_method method_conv;
+ LINT_INIT(method_conv);
+ maria_stats_method_str= argument;
+ if ((method=find_type(argument, &maria_stats_method_typelib, 2)) <= 0)
+ {
+ fprintf(stderr, "Invalid value of stats_method: %s.\n", argument);
+ exit(1);
+ }
+ switch (method-1) {
+ case 0:
+ method_conv= MI_STATS_METHOD_NULLS_EQUAL;
+ break;
+ case 1:
+ method_conv= MI_STATS_METHOD_NULLS_NOT_EQUAL;
+ break;
+ case 2:
+ method_conv= MI_STATS_METHOD_IGNORE_NULLS;
+ break;
+ default: assert(0); /* Impossible */
+ }
+ check_param.stats_method= method_conv;
+ break;
+ }
+#ifdef DEBUG /* Only useful if debugging */
+ case OPT_START_CHECK_POS:
+ check_param.start_check_pos= strtoull(argument, NULL, 0);
+ break;
+#endif
+ case 'z':
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~T_ZEROFILL;
+ else
+ check_param.testflag|= T_ZEROFILL;
+ break;
+ case OPT_ZEROFILL_KEEP_LSN:
+ if (argument == disabled_my_option)
+ check_param.testflag&= ~(T_ZEROFILL_KEEP_LSN | T_ZEROFILL);
+ else
+ check_param.testflag|= (T_ZEROFILL_KEEP_LSN | T_ZEROFILL);
+ break;
+ case 'H':
+ my_print_help(my_long_options);
+ exit(0);
+ case '?':
+ usage();
+ exit(0);
+ }
+ return 0;
+}
+
+
+/*
+  Read defaults files and parse the command line.
+
+  On return *argc/*argv hold the remaining (table name) arguments.
+  Exits on option errors, when no tables are given, or on conflicting
+  option combinations (--unpack with --quick/--sort-records, --read-only
+  with any repair/sort action).  Also resolves --tmpdir and
+  --set-collation and derives implied flags.
+*/
+static void get_options(register int *argc,register char ***argv)
+{
+ int ho_error;
+
+ load_defaults("my", load_default_groups, argc, argv);
+ default_argv= *argv;
+ if (isatty(fileno(stdout)))
+ check_param.testflag|=T_WRITE_LOOP;
+
+ if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
+ exit(ho_error);
+
+ /* If using repair, then update checksum if one uses --update-state */
+ if ((check_param.testflag & T_UPDATE_STATE) &&
+ (check_param.testflag & T_REP_ANY))
+ check_param.testflag|= T_CALC_CHECKSUM;
+
+ if (*argc == 0)
+ {
+ usage();
+ exit(-1);
+ }
+
+ if ((check_param.testflag & T_UNPACK) &&
+ (check_param.testflag & (T_QUICK | T_SORT_RECORDS)))
+ {
+ VOID(fprintf(stderr,
+ "%s: --unpack can't be used with --quick or --sort-records\n",
+ my_progname_short));
+ exit(1);
+ }
+ if ((check_param.testflag & T_READONLY) &&
+ (check_param.testflag &
+ (T_REP_ANY | T_STATISTICS | T_AUTO_INC |
+ T_SORT_RECORDS | T_SORT_INDEX | T_FORCE_CREATE)))
+ {
+ VOID(fprintf(stderr,
+ "%s: Can't use --readonly when repairing or sorting\n",
+ my_progname_short));
+ exit(1);
+ }
+
+ if (!opt_debug)
+ {
+ DEBUGGER_OFF; /* Speed up things a bit */
+ }
+ if (init_tmpdir(&maria_chk_tmpdir, opt_tmpdir))
+ exit(1);
+
+ check_param.tmpdir=&maria_chk_tmpdir;
+
+ if (set_collation_name)
+ if (!(set_collation= get_charset_by_name(set_collation_name,
+ MYF(MY_WME))))
+ exit(1);
+
+ return;
+} /* get options */
+
+
+ /* Check table */
+
+/*
+  Check and/or repair one Aria table, driven by param->testflag.
+
+  param     check/repair settings and result counters
+  filename  name of the table to process (used to open the index file)
+
+  RETURN
+    0   ok
+    !0  error (open failed, or check/repair/sort/zerofill failed)
+*/
+static int maria_chk(HA_CHECK *param, char *filename)
+{
+ int error,lock_type,recreate;
+ my_bool rep_quick= test(param->testflag & (T_QUICK | T_FORCE_UNIQUENESS));
+ MARIA_HA *info;
+ File datafile;
+ char llbuff[22],llbuff2[22];
+ my_bool state_updated=0;
+ MARIA_SHARE *share;
+ DBUG_ENTER("maria_chk");
+
+ param->out_flag=error=param->warning_printed=param->error_printed=
+ recreate=0;
+ datafile=0;
+ param->isam_file_name=filename; /* For error messages */
+ if (!(info=maria_open(filename,
+ (param->testflag & (T_DESCRIPT | T_READONLY)) ?
+ O_RDONLY : O_RDWR,
+ HA_OPEN_FOR_REPAIR |
+ ((param->testflag & T_WAIT_FOREVER) ?
+ HA_OPEN_WAIT_IF_LOCKED :
+ (param->testflag & T_DESCRIPT) ?
+ HA_OPEN_IGNORE_IF_LOCKED : HA_OPEN_ABORT_IF_LOCKED))))
+ {
+ /* Avoid twice printing of isam file name */
+ param->error_printed=1;
+ /* Map the open error to a user-readable message */
+ switch (my_errno) {
+ case HA_ERR_CRASHED:
+ _ma_check_print_error(param,"'%s' doesn't have a correct index definition. You need to recreate it before you can do a repair",filename);
+ break;
+ case HA_ERR_NOT_A_TABLE:
+ _ma_check_print_error(param,"'%s' is not a Aria table",filename);
+ break;
+ case HA_ERR_CRASHED_ON_USAGE:
+ _ma_check_print_error(param,"'%s' is marked as crashed",filename);
+ break;
+ case HA_ERR_CRASHED_ON_REPAIR:
+ _ma_check_print_error(param,"'%s' is marked as crashed after last repair",filename);
+ break;
+ case HA_ERR_OLD_FILE:
+ _ma_check_print_error(param,"'%s' is a old type of Aria table", filename);
+ break;
+ case HA_ERR_NEW_FILE:
+ _ma_check_print_error(param,"'%s' uses new features not supported by this version of the Aria library", filename);
+ break;
+ case HA_ERR_END_OF_FILE:
+ _ma_check_print_error(param,"Couldn't read complete header from '%s'", filename);
+ break;
+ case EAGAIN:
+ _ma_check_print_error(param,"'%s' is locked. Use -w to wait until unlocked",filename);
+ break;
+ case ENOENT:
+ _ma_check_print_error(param,"File '%s' doesn't exist",filename);
+ break;
+ case EACCES:
+ _ma_check_print_error(param,"You don't have permission to use '%s'",
+ filename);
+ break;
+ default:
+ _ma_check_print_error(param,"%d when opening Aria table '%s'",
+ my_errno,filename);
+ break;
+ }
+ DBUG_RETURN(1);
+ }
+ share= info->s;
+ /* Forget registered read locks */
+ share->tot_locks-= share->r_locks;
+ share->r_locks=0;
+ maria_block_size= share->base.block_size;
+
+ if (share->data_file_type == BLOCK_RECORD ||
+ ((param->testflag & T_UNPACK) &&
+ share->state.header.org_data_file_type == BLOCK_RECORD))
+ {
+ if (param->testflag & T_SORT_RECORDS)
+ {
+ _ma_check_print_error(param,
+ "Record format used by '%s' is is not yet supported with sort-records",
+ filename);
+ param->error_printed= 0;
+ error= 1;
+ goto end2;
+ }
+ /* We can't do parallel repair with BLOCK_RECORD yet */
+ if (param->testflag & T_REP_PARALLEL)
+ {
+ param->testflag&= ~T_REP_PARALLEL;
+ param->testflag|= T_REP_BY_SORT;
+ }
+ }
+
+ /*
+ Skip the checking of the file if:
+ We are using --fast and the table is closed properly
+ We are using --check-only-changed-tables and the table hasn't changed
+ */
+ if (param->testflag & (T_FAST | T_CHECK_ONLY_CHANGED))
+ {
+ my_bool need_to_check= (maria_is_crashed(info) ||
+ share->state.open_count != 0);
+
+ if ((param->testflag & (T_REP_ANY | T_SORT_RECORDS)) &&
+ ((share->state.changed & (STATE_CHANGED | STATE_CRASHED |
+ STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR) ||
+ !(param->testflag & T_CHECK_ONLY_CHANGED))))
+ need_to_check=1;
+
+ if (info->s->base.keys && info->state->records)
+ {
+ if ((param->testflag & T_STATISTICS) &&
+ (share->state.changed & STATE_NOT_ANALYZED))
+ need_to_check=1;
+ if ((param->testflag & T_SORT_INDEX) &&
+ (share->state.changed & STATE_NOT_SORTED_PAGES))
+ need_to_check=1;
+ if ((param->testflag & T_REP_BY_SORT) &&
+ (share->state.changed & STATE_NOT_OPTIMIZED_KEYS))
+ need_to_check=1;
+ }
+ if ((param->testflag & T_CHECK_ONLY_CHANGED) &&
+ (share->state.changed & (STATE_CHANGED | STATE_CRASHED |
+ STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR)))
+ need_to_check=1;
+ if (!need_to_check)
+ {
+ if (!(param->testflag & T_SILENT) || param->testflag & T_INFO)
+ printf("Aria file: %s is already checked\n",filename);
+ if (maria_close(info))
+ {
+ _ma_check_print_error(param,"%d when closing Aria table '%s'",
+ my_errno,filename);
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+ }
+ }
+ /*
+ Recreate the table if it uses an old file format, has disabled keys,
+ is almost full, or a new collation was requested with --set-collation.
+ */
+ if ((param->testflag & (T_REP_ANY | T_STATISTICS |
+ T_SORT_RECORDS | T_SORT_INDEX)) &&
+ (((param->testflag & T_UNPACK) &&
+ share->data_file_type == COMPRESSED_RECORD) ||
+ mi_uint2korr(share->state.header.state_info_length) !=
+ MARIA_STATE_INFO_SIZE ||
+ mi_uint2korr(share->state.header.base_info_length) !=
+ MARIA_BASE_INFO_SIZE ||
+ maria_is_any_intersect_keys_active(param->keys_in_use, share->base.keys,
+ ~share->state.key_map) ||
+ maria_test_if_almost_full(info) ||
+ info->s->state.header.file_version[3] != maria_file_magic[3] ||
+ (set_collation &&
+ set_collation->number != share->state.header.language)))
+ {
+ if (set_collation)
+ param->language= set_collation->number;
+ if (maria_recreate_table(param, &info,filename))
+ {
+ VOID(fprintf(stderr,
+ "Aria table '%s' is not fixed because of errors\n",
+ filename));
+ return(-1);
+ }
+ recreate=1;
+ if (!(param->testflag & T_REP_ANY))
+ {
+ param->testflag|=T_REP_BY_SORT; /* if only STATISTICS */
+ if (!(param->testflag & T_SILENT))
+ printf("- '%s' has old table-format. Recreating index\n",filename);
+ rep_quick= 1;
+ }
+ share= info->s;
+ /* Forget registered read locks (table was reopened) */
+ share->tot_locks-= share->r_locks;
+ share->r_locks=0;
+ }
+
+ /* --description: just print table info and return */
+ if (param->testflag & T_DESCRIPT)
+ {
+ param->total_files++;
+ param->total_records+=info->state->records;
+ param->total_deleted+=info->state->del;
+ descript(param, info, filename);
+ maria_close(info); /* Should always succeed */
+ return(0);
+ }
+
+ /* Initialize fulltext stopwords once for the whole run */
+ if (!stopwords_inited++)
+ ft_init_stopwords();
+
+ if (!(param->testflag & T_READONLY))
+ lock_type = F_WRLCK; /* table is changed */
+ else
+ lock_type= F_RDLCK;
+ if (info->lock_type == F_RDLCK)
+ info->lock_type=F_UNLCK; /* Read only table */
+ if (_ma_readinfo(info,lock_type,0))
+ {
+ _ma_check_print_error(param,"Can't lock indexfile of '%s', error: %d",
+ filename,my_errno);
+ param->error_printed=0;
+ error= 1;
+ goto end2;
+ }
+ /*
+ _ma_readinfo() has locked the table.
+ We mark the table as locked (without doing file locks) to be able to
+ use functions that only works on locked tables (like row caching).
+ */
+ maria_lock_database(info, F_EXTRA_LCK);
+ datafile= info->dfile.file;
+ if (init_pagecache(maria_pagecache, (size_t) param->use_buffers, 0, 0,
+ maria_block_size, MY_WME) == 0)
+ {
+ _ma_check_print_error(param, "Can't initialize page cache with %lu memory",
+ (ulong) param->use_buffers);
+ error= 1;
+ goto end2;
+ }
+
+ if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX |
+ T_ZEROFILL))
+ {
+ /*
+ Mark table as not transactional to avoid logging. Should not be needed,
+ maria_repair and maria_zerofill do it already.
+ */
+ _ma_tmp_disable_logging_for_table(info, FALSE);
+
+ if (param->testflag & T_REP_ANY)
+ {
+ ulonglong tmp=share->state.key_map;
+ maria_copy_keys_active(share->state.key_map, share->base.keys,
+ param->keys_in_use);
+ if (tmp != share->state.key_map)
+ info->update|=HA_STATE_CHANGED;
+
+ /* --quick repair requires an intact delete-link chain */
+ if (rep_quick &&
+ maria_chk_del(param, info, param->testflag & ~T_VERBOSE))
+ {
+ if (param->testflag & T_FORCE_CREATE)
+ {
+ rep_quick=0;
+ _ma_check_print_info(param,"Creating new data file\n");
+ }
+ else
+ {
+ error=1;
+ _ma_check_print_error(param,
+ "Quick-recover aborted; Run recovery without switch 'q'");
+ }
+ }
+ }
+ if (!error)
+ {
+ /*
+ Unless this was only --zerofill-keep-lsn, old REDOs are not
+ applicable, tell the server's Recovery to ignore them; we don't
+ know what the log's end LSN is now, so we just let the server know
+ that it will have to find and store it.
+ This is the only case where create_rename_lsn can be a horizon and not
+ a LSN.
+ If this was only --zerofill-keep-lsn, the table can be used in
+ Recovery and especially in this scenario: do a dirty-copy-based backup
+ (snapshot-like), --zerofill-keep-lsn on the copies to achieve better
+ compression, compress the copies with an external tool, and after a
+ restore, Recovery still works (because pages and state still have
+ their correct LSNs).
+ */
+ if (share->base.born_transactional &&
+ ((param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX |
+ T_ZEROFILL | T_ZEROFILL_KEEP_LSN)) !=
+ (T_ZEROFILL | T_ZEROFILL_KEEP_LSN)))
+ share->state.create_rename_lsn= share->state.is_of_horizon=
+ share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
+ }
+ if (!error && (param->testflag & T_REP_ANY))
+ {
+ /* Choose between sort-based, parallel and old-style repair */
+ if ((param->testflag & (T_REP_BY_SORT | T_REP_PARALLEL)) &&
+ (maria_is_any_key_active(share->state.key_map) ||
+ (rep_quick && !param->keys_in_use && !recreate)) &&
+ maria_test_if_sort_rep(info, info->state->records,
+ info->s->state.key_map,
+ param->force_sort))
+ {
+ if (param->testflag & T_REP_BY_SORT)
+ error=maria_repair_by_sort(param,info,filename,rep_quick);
+ else
+ error=maria_repair_parallel(param,info,filename,rep_quick);
+ state_updated=1;
+ }
+ else
+ error=maria_repair(param, info,filename,rep_quick);
+ }
+ if (!error && (param->testflag & T_SORT_RECORDS))
+ {
+ /*
+ The data file is nowadays reopened in the repair code so we should
+ soon remove the following reopen-code
+ */
+#ifndef TO_BE_REMOVED
+ if (param->out_flag & O_NEW_DATA)
+ { /* Change temp file to org file */
+ VOID(my_close(info->dfile.file, MYF(MY_WME))); /* Close new file */
+ error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT,
+ MYF(0));
+ if (_ma_open_datafile(info,info->s, NullS, -1))
+ error=1;
+ param->out_flag&= ~O_NEW_DATA; /* We are using new datafile */
+ param->read_cache.file= info->dfile.file;
+ }
+#endif
+ if (! error)
+ {
+ uint key;
+ /*
+ We can't update the index in maria_sort_records if we have a
+ prefix compressed or fulltext index
+ */
+ my_bool update_index=1;
+ for (key=0 ; key < share->base.keys; key++)
+ if (share->keyinfo[key].flag & (HA_BINARY_PACK_KEY|HA_FULLTEXT))
+ update_index=0;
+
+ error=maria_sort_records(param,info,filename,param->opt_sort_key,
+ /* what is the following parameter for ? */
+ (my_bool) !(param->testflag & T_REP),
+ update_index);
+ datafile= info->dfile.file; /* This is now locked */
+ if (!error && !update_index)
+ {
+ if (param->verbose)
+ puts("Table had a compressed index; We must now recreate the index");
+ error=maria_repair_by_sort(param,info,filename,1);
+ }
+ }
+ }
+ if (!error && (param->testflag & T_SORT_INDEX))
+ error= maria_sort_index(param,info,filename);
+ if (!error && (param->testflag & T_ZEROFILL))
+ error= maria_zerofill(param, info, filename);
+ if (!error)
+ {
+ DBUG_PRINT("info", ("Reseting crashed state"));
+ share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
+ STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR);
+ }
+ else
+ maria_mark_crashed(info);
+ }
+ else if ((param->testflag & T_CHECK) || !(param->testflag & T_AUTO_INC))
+ {
+ /* Plain check (no repair/sort/zerofill was requested) */
+ if (!(param->testflag & T_VERY_SILENT) || param->testflag & T_INFO)
+ printf("Checking Aria file: %s\n",filename);
+ if (!(param->testflag & T_SILENT))
+ printf("Data records: %7s Deleted blocks: %7s\n",
+ llstr(info->state->records,llbuff),
+ llstr(info->state->del,llbuff2));
+ maria_chk_init_for_check(param, info);
+ if (opt_warning_for_wrong_transid == 0)
+ param->max_trid= ~ (ulonglong) 0;
+ error= maria_chk_status(param,info);
+ maria_intersect_keys_active(share->state.key_map, param->keys_in_use);
+ error|= maria_chk_size(param,info);
+ if (!error || !(param->testflag & (T_FAST | T_FORCE_CREATE)))
+ error|=maria_chk_del(param, info,param->testflag);
+ if ((!error || (!(param->testflag & (T_FAST | T_FORCE_CREATE)) &&
+ !param->start_check_pos)))
+ {
+ error|=maria_chk_key(param, info);
+ if (!error && (param->testflag & (T_STATISTICS | T_AUTO_INC)))
+ error=maria_update_state_info(param, info,
+ ((param->testflag & T_STATISTICS) ?
+ UPDATE_STAT : 0) |
+ ((param->testflag & T_AUTO_INC) ?
+ UPDATE_AUTO_INC : 0));
+ }
+ if ((!rep_quick && !error) ||
+ !(param->testflag & (T_FAST | T_FORCE_CREATE)))
+ {
+ VOID(init_io_cache(&param->read_cache,datafile,
+ (uint) param->read_buffer_length,
+ READ_CACHE,
+ (param->start_check_pos ?
+ param->start_check_pos :
+ share->pack.header_length),
+ 1,
+ MYF(MY_WME)));
+ maria_lock_memory(param);
+ if ((info->s->data_file_type != STATIC_RECORD) ||
+ (param->testflag & (T_EXTEND | T_MEDIUM)))
+ error|=maria_chk_data_link(param, info,
+ test(param->testflag & T_EXTEND));
+ VOID(end_io_cache(&param->read_cache));
+ }
+ if (!error)
+ {
+ if (((share->state.changed &
+ (STATE_CHANGED | STATE_CRASHED | STATE_CRASHED_ON_REPAIR |
+ STATE_IN_REPAIR)) ||
+ share->state.open_count != 0)
+ && (param->testflag & T_UPDATE_STATE))
+ info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+ DBUG_PRINT("info", ("Reseting crashed state"));
+ share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
+ STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR);
+ }
+ else if (!maria_is_crashed(info) &&
+ (param->testflag & T_UPDATE_STATE))
+ { /* Mark crashed */
+ maria_mark_crashed(info);
+ info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+ }
+ }
+
+ /* Update auto_increment if requested or after repair of a table with one */
+ if ((param->testflag & T_AUTO_INC) ||
+ ((param->testflag & T_REP_ANY) && info->s->base.auto_key))
+ _ma_update_auto_increment_key(param, info,
+ (my_bool) !test(param->testflag & T_AUTO_INC));
+
+ if (info->update & HA_STATE_CHANGED && ! (param->testflag & T_READONLY))
+ error|=maria_update_state_info(param, info,
+ UPDATE_OPEN_COUNT |
+ (((param->testflag & T_REP_ANY) ?
+ UPDATE_TIME : 0) |
+ (state_updated ? UPDATE_STAT : 0) |
+ ((param->testflag & T_SORT_RECORDS) ?
+ UPDATE_SORT : 0)));
+ info->update&= ~HA_STATE_CHANGED;
+ _ma_reenable_logging_for_table(info, FALSE);
+ maria_lock_database(info, F_UNLCK);
+
+end2:
+ end_pagecache(maria_pagecache, 1);
+ if (maria_close(info))
+ {
+ _ma_check_print_error(param, default_close_errmsg, my_errno, filename);
+ DBUG_RETURN(1);
+ }
+ /* On success, replace data/index files with the repaired temp files */
+ if (error == 0)
+ {
+ if (param->out_flag & O_NEW_DATA)
+ error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT,
+ ((param->testflag & T_BACKUP_DATA) ?
+ MYF(MY_REDEL_MAKE_BACKUP) : MYF(0)));
+ if (param->out_flag & O_NEW_INDEX)
+ error|=maria_change_to_newfile(filename,MARIA_NAME_IEXT,INDEX_TMP_EXT,
+ MYF(0));
+ }
+ if (opt_transaction_logging &&
+ share->base.born_transactional && !error &&
+ (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX |
+ T_ZEROFILL)))
+ error= write_log_record(param);
+
+ if (param->not_visible_rows_found && (param->testflag & T_VERBOSE))
+ {
+ char buff[22];
+ printf("Max transaction id found: %s\n",
+ llstr(param->max_found_trid, buff));
+ }
+
+ VOID(fflush(stdout)); VOID(fflush(stderr));
+
+ if (param->error_printed)
+ {
+ if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX))
+ {
+ VOID(fprintf(stderr,
+ "Aria table '%s' is not fixed because of errors\n",
+ filename));
+ if (param->testflag & T_REP_ANY)
+ VOID(fprintf(stderr,
+ "Try fixing it by using the --safe-recover (-o), the --force (-f) option or by not using the --quick (-q) flag\n"));
+ }
+ else if (!(param->error_printed & 2) &&
+ !(param->testflag & T_FORCE_CREATE))
+ VOID(fprintf(stderr,
+ "Aria table '%s' is corrupted\nFix it using switch \"-r\" or \"-o\"\n",
+ filename));
+ }
+ else if (param->warning_printed &&
+ ! (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX |
+ T_FORCE_CREATE)))
+ VOID(fprintf(stderr, "Aria table '%s' is usable but should be fixed\n",
+ filename));
+ VOID(fflush(stderr));
+ DBUG_RETURN(error);
+} /* maria_chk */
+
+
+/*
+  Write a human-readable description of the table to stdout.
+
+  SYNOPSIS
+    descript()
+    param        Check parameters; testflag/verbose control the detail level
+    info         Open table handler
+    name         Table name, used only for output
+
+  NOTES
+    With T_VERY_SILENT only "name records checksum" is printed.
+    With T_SILENT output stops after the record/deleted counts.
+    T_VERBOSE and param->verbose > 1 progressively add key, unique
+    and field descriptions.
+*/
+
+static void descript(HA_CHECK *param, register MARIA_HA *info, char *name)
+{
+  uint key,keyseg_nr,field;
+  reg3 MARIA_KEYDEF *keyinfo;
+  reg2 HA_KEYSEG *keyseg;
+  reg4 const char *text;
+  char buff[200],length[10],*pos,*end;
+  enum en_fieldtype type;
+  MARIA_SHARE *share= info->s;
+  char llbuff[22],llbuff2[22];
+  DBUG_ENTER("descript");
+
+  if (param->testflag & T_VERY_SILENT)
+  {
+    /* One line: name, row count and (if maintained) the live checksum */
+    longlong checksum= info->state->checksum;
+    if (!(share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)))
+      checksum= 0;
+    printf("%s %s %s\n", name, llstr(info->state->records,llbuff),
+           llstr(checksum, llbuff2));
+    DBUG_VOID_RETURN;
+  }
+
+  printf("Aria file: %s\n",name);
+  printf("Record format: %s\n", record_formats[share->data_file_type]);
+  printf("Crashsafe: %s\n",
+         share->base.born_transactional ? "yes" : "no");
+  printf("Character set: %s (%d)\n",
+         get_charset_name(share->state.header.language),
+         share->state.header.language);
+
+  if (param->testflag & T_VERBOSE)
+  {
+    printf("File-version: %d\n",
+           (int) share->state.header.file_version[3]);
+    if (share->state.create_time)
+    {
+      get_date(buff,1,share->state.create_time);
+      printf("Creation time: %s\n",buff);
+    }
+    if (share->state.check_time)
+    {
+      get_date(buff,1,share->state.check_time);
+      printf("Recover time: %s\n",buff);
+    }
+    if (share->base.born_transactional)
+    {
+      printf("LSNs: create_rename (%lu,0x%lx),"
+             " state_horizon (%lu,0x%lx), skip_redo (%lu,0x%lx)\n",
+             LSN_IN_PARTS(share->state.create_rename_lsn),
+             LSN_IN_PARTS(share->state.is_of_horizon),
+             LSN_IN_PARTS(share->state.skip_redo_lsn));
+    }
+    compile_time_assert((MY_UUID_STRING_LENGTH + 1) <= sizeof(buff));
+    buff[MY_UUID_STRING_LENGTH]= 0;
+    my_uuid2str(share->base.uuid, buff);
+    printf("UUID: %s\n", buff);
+    /* Build a comma-separated status string from the state flags */
+    pos=buff;
+    if (share->state.changed & STATE_CRASHED)
+      strmov(buff,"crashed");
+    else
+    {
+      if (share->state.open_count)
+        pos=strmov(pos,"open,");
+      if (share->state.changed & STATE_CHANGED)
+        pos=strmov(pos,"changed,");
+      else
+        pos=strmov(pos,"checked,");
+      if (!(share->state.changed & STATE_NOT_ANALYZED))
+        pos=strmov(pos,"analyzed,");
+      if (!(share->state.changed & STATE_NOT_OPTIMIZED_KEYS))
+        pos=strmov(pos,"optimized keys,");
+      if (!(share->state.changed & STATE_NOT_SORTED_PAGES))
+        pos=strmov(pos,"sorted index pages,");
+      if (!(share->state.changed & STATE_NOT_ZEROFILLED))
+        pos=strmov(pos,"zerofilled,");
+      if (!(share->state.changed & STATE_NOT_MOVABLE))
+        pos=strmov(pos,"movable,");
+      pos[-1]=0; /* Remove extra ',' */
+    }
+    printf("Status: %s\n",buff);
+    if (share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
+      printf("Checksum: %26s\n",llstr(info->state->checksum,llbuff));
+    if (share->options & HA_OPTION_DELAY_KEY_WRITE)
+      printf("Keys are only flushed at close\n");
+
+    if (share->options & HA_OPTION_PAGE_CHECKSUM)
+      printf("Page checksums are used\n");
+    if (share->base.auto_key)
+    {
+      printf("Auto increment key: %16d Last value: %18s\n",
+             share->base.auto_key,
+             llstr(share->state.auto_increment,llbuff));
+    }
+  }
+  printf("Data records: %16s Deleted blocks: %18s\n",
+         llstr(info->state->records,llbuff),llstr(info->state->del,llbuff2));
+  if (param->testflag & T_SILENT)
+    DBUG_VOID_RETURN; /* This is enough */
+
+  if (param->testflag & T_VERBOSE)
+  {
+#ifdef USE_RELOC
+    printf("Init-relocation: %16s\n",llstr(share->base.reloc,llbuff));
+#endif
+    printf("Datafile parts: %16s Deleted data: %18s\n",
+           llstr(share->state.split,llbuff),
+           llstr(info->state->empty,llbuff2));
+    printf("Datafile pointer (bytes): %11d Keyfile pointer (bytes): %13d\n",
+           share->rec_reflength,share->base.key_reflength);
+    printf("Datafile length: %16s Keyfile length: %18s\n",
+           llstr(info->state->data_file_length,llbuff),
+           llstr(info->state->key_file_length,llbuff2));
+
+    if (info->s->base.reloc == 1L && info->s->base.records == 1L)
+      puts("This is a one-record table");
+    else
+    {
+      if (share->base.max_data_file_length != HA_OFFSET_ERROR ||
+          share->base.max_key_file_length != HA_OFFSET_ERROR)
+        printf("Max datafile length: %16s Max keyfile length: %18s\n",
+               llstr(share->base.max_data_file_length-1,llbuff),
+               llstr(share->base.max_key_file_length-1,llbuff2));
+    }
+  }
+  printf("Block_size: %16d\n",(int) share->block_size);
+  printf("Recordlength: %16d\n",(int) share->base.pack_reclength);
+  if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
+  {
+    longlong2str(share->state.key_map,buff,2,1);
+    printf("Using only keys '%s' of %d possible keys\n",
+           buff, share->base.keys);
+  }
+  puts("\nTable description:");
+  printf("Key Start Len Index Type");
+  if (param->testflag & T_VERBOSE)
+    printf(" Rec/key Root Blocksize");
+  VOID(putchar('\n'));
+
+  /* One line per key, followed by one line per additional key segment */
+  for (key=keyseg_nr=0, keyinfo= &share->keyinfo[0] ;
+       key < share->base.keys;
+       key++,keyinfo++)
+  {
+    keyseg=keyinfo->seg;
+    if (keyinfo->flag & HA_NOSAME) text="unique ";
+    else if (keyinfo->flag & HA_FULLTEXT) text="fulltext ";
+    else text="multip.";
+
+    pos=buff;
+    if (keyseg->flag & HA_REVERSE_SORT)
+      *pos++ = '-';
+    pos=strmov(pos,type_names[keyseg->type]);
+    *pos++ = ' ';
+    *pos=0;
+    if (keyinfo->flag & HA_PACK_KEY)
+      pos=strmov(pos,prefix_packed_txt);
+    if (keyinfo->flag & HA_BINARY_PACK_KEY)
+      pos=strmov(pos,bin_packed_txt);
+    if (keyseg->flag & HA_SPACE_PACK)
+      pos=strmov(pos,diff_txt);
+    if (keyseg->flag & HA_BLOB_PART)
+      pos=strmov(pos,blob_txt);
+    if (keyseg->flag & HA_NULL_PART)
+      pos=strmov(pos,null_txt);
+    *pos=0;
+
+    printf("%-4d%-6ld%-3d %-8s%-23s",
+           key+1,(long) keyseg->start+1,keyseg->length,text,buff);
+    if (share->state.key_root[key] != HA_OFFSET_ERROR)
+      llstr(share->state.key_root[key],buff);
+    else
+      buff[0]=0;
+    if (param->testflag & T_VERBOSE)
+      printf("%9.0f %12s %10d",
+             share->state.rec_per_key_part[keyseg_nr++],
+             buff,keyinfo->block_length);
+    VOID(putchar('\n'));
+    while ((++keyseg)->type != HA_KEYTYPE_END)
+    {
+      pos=buff;
+      if (keyseg->flag & HA_REVERSE_SORT)
+        *pos++ = '-';
+      pos=strmov(pos,type_names[keyseg->type]);
+      *pos++= ' ';
+      if (keyseg->flag & HA_SPACE_PACK)
+        pos=strmov(pos,diff_txt);
+      if (keyseg->flag & HA_BLOB_PART)
+        pos=strmov(pos,blob_txt);
+      if (keyseg->flag & HA_NULL_PART)
+        pos=strmov(pos,null_txt);
+      *pos=0;
+      printf(" %-6ld%-3d %-21s",
+             (long) keyseg->start+1,keyseg->length,buff);
+      if (param->testflag & T_VERBOSE)
+        printf("%11.0f", share->state.rec_per_key_part[keyseg_nr++]);
+      VOID(putchar('\n'));
+    }
+    keyseg++;
+  }
+  if (share->state.header.uniques)
+  {
+    MARIA_UNIQUEDEF *uniqueinfo;
+    puts("\nUnique Key Start Len Nullpos Nullbit Type");
+    for (key=0,uniqueinfo= &share->uniqueinfo[0] ;
+         key < share->state.header.uniques; key++, uniqueinfo++)
+    {
+      my_bool new_row=0;
+      char null_bit[8],null_pos[8];
+      printf("%-8d%-5d",key+1,uniqueinfo->key+1);
+      for (keyseg=uniqueinfo->seg ; keyseg->type != HA_KEYTYPE_END ; keyseg++)
+      {
+        if (new_row)
+          fputs(" ",stdout);
+        null_bit[0]=null_pos[0]=0;
+        if (keyseg->null_bit)
+        {
+          sprintf(null_bit,"%d",keyseg->null_bit);
+          sprintf(null_pos,"%ld",(long) keyseg->null_pos+1);
+        }
+        printf("%-7ld%-5d%-9s%-10s%-30s\n",
+               (long) keyseg->start+1,keyseg->length,
+               null_pos,null_bit,
+               type_names[keyseg->type]);
+        new_row=1;
+      }
+    }
+  }
+  if (param->verbose > 1)
+  {
+    char null_bit[8],null_pos[8];
+    printf("\nField Start Length Nullpos Nullbit Type");
+    if (share->options & HA_OPTION_COMPRESS_RECORD)
+      printf(" Huff tree Bits");
+    VOID(putchar('\n'));
+
+    for (field=0 ; field < share->base.fields ; field++)
+    {
+      if (share->options & HA_OPTION_COMPRESS_RECORD)
+        type=share->columndef[field].base_type;
+      else
+        type=(enum en_fieldtype) share->columndef[field].type;
+      end=strmov(buff,field_pack[type]);
+      if (share->options & HA_OPTION_COMPRESS_RECORD)
+      {
+        if (share->columndef[field].pack_type & PACK_TYPE_SELECTED)
+          end=strmov(end,", not_always");
+        if (share->columndef[field].pack_type & PACK_TYPE_SPACE_FIELDS)
+          end=strmov(end,", no empty");
+        if (share->columndef[field].pack_type & PACK_TYPE_ZERO_FILL)
+        {
+          sprintf(end,", zerofill(%d)",share->columndef[field].space_length_bits);
+          end=strend(end);
+        }
+      }
+      /* Strip a leading ", " left by the pack-type description */
+      if (buff[0] == ',')
+        strmov(buff,buff+2);
+      int10_to_str((long) share->columndef[field].length,length,10);
+      null_bit[0]=null_pos[0]=0;
+      if (share->columndef[field].null_bit)
+      {
+        sprintf(null_bit,"%d",share->columndef[field].null_bit);
+        sprintf(null_pos,"%d",share->columndef[field].null_pos+1);
+      }
+      printf("%-6d%-6u%-7s%-8s%-8s%-35s",field+1,
+             (uint) share->columndef[field].offset+1,
+             length, null_pos, null_bit, buff);
+      if (share->options & HA_OPTION_COMPRESS_RECORD)
+      {
+        if (share->columndef[field].huff_tree)
+          printf("%3d %2d",
+                 (uint) (share->columndef[field].huff_tree-share->decode_trees)+1,
+                 share->columndef[field].huff_tree->quick_table_bits);
+      }
+      VOID(putchar('\n'));
+    }
+    if (share->data_file_type == BLOCK_RECORD)
+    {
+      uint i;
+      puts("\nBitmap Data size Description");
+      for (i=0 ; i <= 7 ; i++)
+        printf("%u %5u %s\n", i, share->bitmap.sizes[i],
+               bitmap_description[i]);
+    }
+  }
+  DBUG_VOID_RETURN;
+} /* descript */
+
+
+/*
+  Sort the data file of a table according to one key.
+
+  SYNOPSIS
+    maria_sort_records()
+    param         Check parameters (buffers, flags, temp file name)
+    info          Open table handler
+    name          Table name for messages
+    sort_key      Index (0-based) to sort the rows by
+    write_info    If set, print record/deleted counts before sorting
+    update_index  If set, rewrite key pointers to the new row positions
+
+  RETURN
+    0   ok (also for the "nothing to do" cases, which reset
+        param->error_printed and return early)
+    !=0 error; temporary data file is removed
+
+  NOTES
+    Rows are written to a new temporary data file in index order by
+    recursively walking the key tree (sort_record_index()).  On success
+    the new file replaces info->dfile and O_NEW_DATA is set in
+    param->out_flag so the caller renames it into place.
+*/
+
+static int maria_sort_records(HA_CHECK *param,
+                              register MARIA_HA *info, char *name,
+                              uint sort_key,
+                              my_bool write_info,
+                              my_bool update_index)
+{
+  int got_error;
+  uint key;
+  MARIA_KEYDEF *keyinfo;
+  File new_file;
+  uchar *temp_buff;
+  ha_rows old_record_count;
+  MARIA_SHARE *share= info->s;
+  char llbuff[22],llbuff2[22];
+  MARIA_SORT_INFO sort_info;
+  MARIA_SORT_PARAM sort_param;
+  MARIA_PAGE page;
+  DBUG_ENTER("sort_records");
+
+  bzero((char*)&sort_info,sizeof(sort_info));
+  bzero((char*)&sort_param,sizeof(sort_param));
+  sort_param.sort_info=&sort_info;
+  sort_info.param=param;
+  keyinfo= &share->keyinfo[sort_key];
+  got_error=1;
+  temp_buff=0;
+  new_file= -1;
+
+  /* Reject keys we cannot (or need not) sort by; these are not errors */
+  if (! maria_is_key_active(share->state.key_map, sort_key))
+  {
+    _ma_check_print_warning(param,
+                            "Can't sort table '%s' on key %d; No such key",
+                            name,sort_key+1);
+    param->error_printed=0;
+    DBUG_RETURN(0); /* Nothing to do */
+  }
+  if (keyinfo->flag & HA_FULLTEXT)
+  {
+    _ma_check_print_warning(param,"Can't sort table '%s' on FULLTEXT key %d",
+                            name,sort_key+1);
+    param->error_printed=0;
+    DBUG_RETURN(0); /* Nothing to do */
+  }
+  if (keyinfo->flag & HA_BINARY_PACK_KEY)
+  {
+    _ma_check_print_warning(param,
+                            "Can't sort table '%s' on a key with prefix "
+                            "packing %d",
+                            name,sort_key+1);
+    param->error_printed=0;
+    DBUG_RETURN(0);
+  }
+
+
+  if (share->data_file_type == COMPRESSED_RECORD)
+  {
+    _ma_check_print_warning(param,"Can't sort read-only table '%s'", name);
+    param->error_printed=0;
+    DBUG_RETURN(0); /* Nothing to do */
+  }
+  if (!(param->testflag & T_SILENT))
+  {
+    printf("- Sorting records for Aria table '%s'\n",name);
+    if (write_info)
+      printf("Data records: %9s Deleted: %9s\n",
+             llstr(info->state->records,llbuff),
+             llstr(info->state->del,llbuff2));
+  }
+  /* Empty key tree: no rows to reorder */
+  if (share->state.key_root[sort_key] == HA_OFFSET_ERROR)
+    DBUG_RETURN(0); /* Nothing to do */
+
+  /* Rows are appended to the new data file through this write cache */
+  if (init_io_cache(&info->rec_cache,-1,(uint) param->write_buffer_length,
+                    WRITE_CACHE,share->pack.header_length,1,
+                    MYF(MY_WME | MY_WAIT_IF_FULL)))
+    goto err;
+  info->opt_flag|=WRITE_CACHE_USED;
+
+  if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length)))
+  {
+    _ma_check_print_error(param,"Not enough memory for key block");
+    goto err;
+  }
+
+  if (!(sort_param.record=
+        (uchar*) my_malloc((uint) share->base.default_rec_buff_size, MYF(0))))
+  {
+    _ma_check_print_error(param,"Not enough memory for record");
+    goto err;
+  }
+
+  /* Create the temporary data file the sorted rows go into */
+  fn_format(param->temp_filename,name,"", MARIA_NAME_DEXT,2+4+32);
+  new_file= my_create(fn_format(param->temp_filename,
+                                param->temp_filename,"",
+                                DATA_TMP_EXT,
+                                MY_REPLACE_EXT | MY_UNPACK_FILENAME),
+                      0, param->tmpfile_createflag,
+                      MYF(0));
+  if (new_file < 0)
+  {
+    _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+                          param->temp_filename);
+    goto err;
+  }
+  /* Preserve the (pack) header of the old data file, if any */
+  if (share->pack.header_length)
+    if (maria_filecopy(param, new_file, info->dfile.file, 0L,
+                       share->pack.header_length,
+                       "datafile-header"))
+      goto err;
+  info->rec_cache.file=new_file; /* Use this file for cacheing*/
+
+  maria_lock_memory(param);
+  for (key=0 ; key < share->base.keys ; key++)
+    share->keyinfo[key].flag|= HA_SORT_ALLOWS_SAME;
+
+  /* Read the root page of the sort key; the tree is walked from here */
+  if (my_pread(share->kfile.file, temp_buff,
+               (uint) keyinfo->block_length,
+               share->state.key_root[sort_key],
+               MYF(MY_NABP+MY_WME)))
+  {
+    _ma_check_print_error(param, "Can't read indexpage from filepos: %s",
+                          llstr(share->state.key_root[sort_key], llbuff));
+    goto err;
+  }
+
+  /* Setup param for _ma_sort_write_record */
+  sort_info.info=info;
+  sort_info.new_data_file_type=share->data_file_type;
+  sort_param.fix_datafile=1;
+  sort_param.master=1;
+  sort_param.filepos=share->pack.header_length;
+  old_record_count=info->state->records;
+  info->state->records=0;
+  if (sort_info.new_data_file_type != COMPRESSED_RECORD)
+    info->state->checksum=0;
+
+  _ma_page_setup(&page, info, keyinfo, share->state.key_root[sort_key],
+                 temp_buff);
+  if (sort_record_index(&sort_param, &page, sort_key,new_file,update_index) ||
+      maria_write_data_suffix(&sort_info,1) ||
+      flush_io_cache(&info->rec_cache))
+    goto err;
+
+  /* Sanity check: the rewritten file must contain every original row */
+  if (info->state->records != old_record_count)
+  {
+    _ma_check_print_error(param,"found %s of %s records",
+                          llstr(info->state->records,llbuff),
+                          llstr(old_record_count,llbuff2));
+    goto err;
+  }
+
+  /* Switch the handler over to the new, sorted data file */
+  VOID(my_close(info->dfile.file, MYF(MY_WME)));
+  param->out_flag|=O_NEW_DATA; /* Data in new file */
+  info->dfile.file= new_file; /* Use new datafile */
+  _ma_set_data_pagecache_callbacks(&info->dfile, info->s);
+
+  info->state->del=0;
+  info->state->empty=0;
+  share->state.dellink= HA_OFFSET_ERROR;
+  info->state->data_file_length=sort_param.filepos;
+  share->state.split=info->state->records; /* Only hole records */
+  share->state.version=(ulong) time((time_t*) 0);
+
+  info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+  if (param->testflag & T_WRITE_LOOP)
+  {
+    VOID(fputs(" \r",stdout)); VOID(fflush(stdout));
+  }
+  got_error=0;
+
+err:
+  /* On error remove the half-written temporary data file */
+  if (got_error && new_file >= 0)
+  {
+    VOID(end_io_cache(&info->rec_cache));
+    (void) my_close(new_file,MYF(MY_WME));
+    (void) my_delete(param->temp_filename, MYF(MY_WME));
+  }
+  if (temp_buff)
+  {
+    my_afree(temp_buff);
+  }
+  my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR));
+  info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+  VOID(end_io_cache(&info->rec_cache));
+  my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
+  sort_info.buff=0;
+  share->state.sortkey=sort_key;
+  DBUG_RETURN(got_error);
+} /* sort_records */
+
+
+/*
+  Sort records recursively using one index.
+
+  SYNOPSIS
+    sort_record_index()
+    param         Sort parameters (sort_info, target record buffer)
+    ma_page       Current index page (root on the first call)
+    sort_key      Key number being sorted by
+    new_file      Data file the sorted rows are appended to
+    update_index  If set, patch the key's row pointers to the new positions
+
+  RETURN
+    0   ok
+    !=0 error (also -1 on out-of-memory for the child page buffer)
+
+  NOTES
+    Performs an in-order walk of the key tree: for a node page, each
+    child subtree is handled before the key that follows it, so rows
+    come out in key order.  The page is written back with its unused
+    tail zeroed.
+*/
+
+static int sort_record_index(MARIA_SORT_PARAM *sort_param,
+                             MARIA_PAGE *ma_page, uint sort_key,
+                             File new_file,my_bool update_index)
+{
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  uint page_flag, nod_flag,used_length;
+  uchar *temp_buff,*keypos,*endpos;
+  my_off_t next_page,rec_pos;
+  uchar lastkey[MARIA_MAX_KEY_BUFF];
+  char llbuff[22];
+  MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
+  MARIA_KEY tmp_key;
+  MARIA_PAGE new_page;
+  const MARIA_KEYDEF *keyinfo= ma_page->keyinfo;
+  DBUG_ENTER("sort_record_index");
+
+  page_flag= ma_page->flag;
+  nod_flag= ma_page->node;
+  temp_buff=0;
+  tmp_key.keyinfo= (MARIA_KEYDEF*) keyinfo;
+  tmp_key.data= lastkey;
+
+  /* A node (non-leaf) page needs a buffer to read child pages into */
+  if (nod_flag)
+  {
+    if (!(temp_buff= (uchar*) my_alloca(tmp_key.keyinfo->block_length)))
+    {
+      _ma_check_print_error(param,"Not Enough memory");
+      DBUG_RETURN(-1);
+    }
+  }
+  used_length= ma_page->size;
+  keypos= ma_page->buff + share->keypage_header + nod_flag;
+  endpos= ma_page->buff + used_length;
+  for ( ;; )
+  {
+    _sanity(__FILE__,__LINE__);
+    /* First recurse into the child subtree left of the current key */
+    if (nod_flag)
+    {
+      next_page= _ma_kpos(nod_flag, keypos);
+      if (my_pread(share->kfile.file, temp_buff,
+                   (uint) tmp_key.keyinfo->block_length, next_page,
+                   MYF(MY_NABP+MY_WME)))
+      {
+        _ma_check_print_error(param,"Can't read keys from filepos: %s",
+                              llstr(next_page,llbuff));
+        goto err;
+      }
+      _ma_page_setup(&new_page, info, ma_page->keyinfo, next_page, temp_buff);
+
+      if (sort_record_index(sort_param, &new_page, sort_key,
+                            new_file, update_index))
+        goto err;
+    }
+    _sanity(__FILE__,__LINE__);
+    /* Then process the key itself; stop when the page is exhausted */
+    if (keypos >= endpos ||
+        !(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &keypos))
+      break;
+    rec_pos= _ma_row_pos_from_key(&tmp_key);
+
+    if ((*share->read_record)(info,sort_param->record,rec_pos))
+    {
+      _ma_check_print_error(param,"%d when reading datafile",my_errno);
+      goto err;
+    }
+    /* If the row moves, update this key's row pointer and all other keys */
+    if (rec_pos != sort_param->filepos && update_index)
+    {
+      _ma_dpointer(share, keypos - nod_flag - tmp_key.ref_length,
+                   sort_param->filepos);
+      if (maria_movepoint(info,sort_param->record,rec_pos,sort_param->filepos,
+                          sort_key))
+      {
+        _ma_check_print_error(param,"%d when updating key-pointers",my_errno);
+        goto err;
+      }
+    }
+    if (_ma_sort_write_record(sort_param))
+      goto err;
+  }
+  /* Clear end of block to get better compression if the table is backuped */
+  bzero(ma_page->buff + used_length, keyinfo->block_length - used_length);
+  if (my_pwrite(share->kfile.file, ma_page->buff, (uint)keyinfo->block_length,
+                ma_page->pos, param->myf_rw))
+  {
+    _ma_check_print_error(param,"%d when updating keyblock",my_errno);
+    goto err;
+  }
+  if (temp_buff)
+    my_afree(temp_buff);
+  DBUG_RETURN(0);
+err:
+  if (temp_buff)
+    my_afree(temp_buff);
+  DBUG_RETURN(1);
+} /* sort_record_index */
+
+
+/*
+  Write a log record stating that the table was repaired.
+
+  Called after all repair/sort operations (including the O_NEW_DATA /
+  O_NEW_INDEX renames) have completed successfully, so recovery knows
+  the table state on disk is authoritative.
+
+  RETURN
+    FALSE  ok
+    TRUE   error (open, log write or close failed; error already printed)
+*/
+
+static my_bool write_log_record(HA_CHECK *param)
+{
+  /*
+    Now that all operations including O_NEW_DATA|INDEX are successfully
+    done, we can write a log record.
+  */
+  MARIA_HA *info= maria_open(param->isam_file_name, O_RDWR, 0);
+  if (info == NULL)
+    _ma_check_print_error(param, default_open_errmsg, my_errno,
+                          param->isam_file_name);
+  else
+  {
+    my_bool write_failed= write_log_record_for_repair(param, info);
+    if (write_failed)
+      _ma_check_print_error(param, "%d when writing log record for"
+                            " Aria table '%s'", my_errno,
+                            param->isam_file_name);
+    /*
+      Close the table on all paths; the original code leaked the open
+      handler when writing the log record failed.
+    */
+    if (maria_close(info))
+    {
+      if (!write_failed)
+        _ma_check_print_error(param, default_close_errmsg, my_errno,
+                              param->isam_file_name);
+    }
+    else if (!write_failed)
+      return FALSE;
+  }
+  return TRUE;
+}
+
+#include "ma_check_standalone.h"
diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h
new file mode 100644
index 00000000000..ba97684b1aa
--- /dev/null
+++ b/storage/maria/maria_def.h
@@ -0,0 +1,1267 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* This file is included by all internal maria files */
+
+#include "maria.h" /* Structs & some defines */
+#include <myisampack.h> /* packing of keys */
+#include <my_tree.h>
+#include <my_bitmap.h>
+#ifdef THREAD
+#include <my_pthread.h>
+#include <thr_lock.h>
+#else
+#include <my_no_pthread.h>
+#endif
+#include <hash.h>
+#include "ma_loghandler.h"
+#include "ma_control_file.h"
+#include "ma_state.h"
+#include <waiting_threads.h>
+
+/* For testing recovery */
+#ifdef TO_BE_REMOVED
+#define IDENTICAL_PAGES_AFTER_RECOVERY 1
+#endif
+/* Do extra sanity checking */
+#define SANITY_CHECKS 1
+#ifdef EXTRA_DEBUG
+#define EXTRA_DEBUG_KEY_CHANGES
+#define EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES
+#endif
+
+#define MAX_NONMAPPED_INSERTS 1000
+#define MARIA_MAX_TREE_LEVELS 32
+
+/* maria_open() flag, specific for maria_pack */
+#define HA_OPEN_IGNORE_MOVED_STATE (1U << 30)
+
+struct st_transaction;
+
+/* undef map from my_nosys; We need test-if-disk full */
+#undef my_write
+
+#define CRC_SIZE 4
+
+/*
+  Mutable per-table state, stored in the index file header and kept
+  up to date while the table is open.
+*/
+typedef struct st_maria_state_info
+{
+  struct
+  { /* Fileheader (24 bytes) */
+    uchar file_version[4];
+    uchar options[2];
+    uchar header_length[2];
+    uchar state_info_length[2];
+    uchar base_info_length[2];
+    uchar base_pos[2];
+    uchar key_parts[2]; /* Key parts */
+    uchar unique_key_parts[2]; /* Key parts + unique parts */
+    uchar keys; /* number of keys in file */
+    uchar uniques; /* number of UNIQUE definitions */
+    uchar language; /* Language for indexes */
+    uchar fulltext_keys;
+    uchar data_file_type;
+    /* Used by mariapack to store the original data_file_type */
+    uchar org_data_file_type;
+  } header;
+
+  MARIA_STATUS_INFO state;
+  /* maria_ha->state points here for crash-safe but not versioned tables */
+  MARIA_STATUS_INFO common;
+  ha_rows split; /* number of split blocks */
+  my_off_t dellink; /* Link to next removed block */
+  pgcache_page_no_t first_bitmap_with_space;
+  ulonglong auto_increment;
+  TrID create_trid; /* Minimum trid for file */
+  TrID last_change_trn; /* Trid of last change to table */
+  ulong update_count; /* Updated for each write lock */
+  ulong status;
+  double *rec_per_key_part;
+  ulong *nulls_per_key_part;
+  ha_checksum checksum; /* Table checksum */
+  my_off_t *key_root; /* Start of key trees */
+  my_off_t key_del; /* delete links for index pages */
+  my_off_t records_at_analyze; /* Rows when calculating rec_per_key */
+
+  ulong sec_index_changed; /* Updated when new sec_index */
+  ulong sec_index_used; /* which extra index are in use */
+  ulonglong key_map; /* Which keys are in use */
+  ulong version; /* timestamp of create */
+  time_t create_time; /* Time when created database */
+  time_t recover_time; /* Time for last recover */
+  time_t check_time; /* Time for last check */
+  uint sortkey; /* sorted by this key (not used) */
+  uint open_count;
+  uint changed; /* Changed since maria_chk */
+  /**
+    Birthday of the table: no record in the log before this LSN should ever
+    be applied to the table. Updated when created, renamed, explicitly
+    repaired (REPAIR|OPTIMIZE TABLE, ALTER TABLE ENABLE KEYS, maria_chk).
+  */
+  LSN create_rename_lsn;
+  /** @brief Log horizon when state was last updated on disk */
+  TRANSLOG_ADDRESS is_of_horizon;
+  /**
+    REDO phase should ignore any record before this LSN. UNDO phase
+    shouldn't, this is the difference with create_rename_lsn.
+    skip_redo_lsn >= create_rename_lsn.
+    The distinction is for these cases:
+    - after a repair at end of bulk insert (enabling indices), REDO phase
+    should skip the table but UNDO phase should not, so only skip_redo_lsn is
+    increased, not create_rename_lsn
+    - if one table is corrupted and so recovery fails, user may repair the
+    table with maria_chk and let recovery restart: that recovery should then
+    skip the repaired table even in the UNDO phase, so create_rename_lsn is
+    increased.
+  */
+  LSN skip_redo_lsn;
+
+  /* the following isn't saved on disk */
+  uint state_diff_length; /* Should be 0 */
+  uint state_length; /* Length of state header in file */
+  ulong *key_info;
+} MARIA_STATE_INFO;
+
+
+#define MARIA_STATE_INFO_SIZE \
+ (24 + 2 + LSN_STORE_SIZE*3 + 4 + 11*8 + 4*4 + 8 + 3*4 + 5*8)
+#define MARIA_FILE_OPEN_COUNT_OFFSET 0
+#define MARIA_FILE_CHANGED_OFFSET 2
+#define MARIA_FILE_CREATE_RENAME_LSN_OFFSET 4
+#define MARIA_FILE_CREATE_TRID_OFFSET (4 + LSN_STORE_SIZE*3 + 11*8)
+
+#define MARIA_STATE_KEY_SIZE (8 + 4)
+#define MARIA_STATE_KEYBLOCK_SIZE 8
+#define MARIA_STATE_KEYSEG_SIZE 12
+#define MARIA_STATE_EXTRA_SIZE (MARIA_MAX_KEY*MARIA_STATE_KEY_SIZE + MARIA_MAX_KEY*HA_MAX_KEY_SEG*MARIA_STATE_KEYSEG_SIZE)
+#define MARIA_KEYDEF_SIZE (2+ 5*2)
+#define MARIA_UNIQUEDEF_SIZE (2+1+1)
+#define HA_KEYSEG_SIZE (6+ 2*2 + 4*2)
+#define MARIA_MAX_KEY_BUFF (HA_MAX_KEY_BUFF + MARIA_MAX_PACK_TRANSID_SIZE)
+#define MARIA_COLUMNDEF_SIZE (2*7+1+1+4)
+#define MARIA_BASE_INFO_SIZE (MY_UUID_SIZE + 5*8 + 6*4 + 11*2 + 6 + 5*2 + 1 + 16)
+#define MARIA_INDEX_BLOCK_MARGIN 16 /* Safety margin for .MYI tables */
+/* Internal management bytes needed to store 2 transid/key on an index page */
+#define MARIA_MAX_PACK_TRANSID_SIZE (TRANSID_SIZE+1)
+#define MARIA_TRANSID_PACK_OFFSET (256- TRANSID_SIZE - 1)
+#define MARIA_MIN_TRANSID_PACK_OFFSET (MARIA_TRANSID_PACK_OFFSET-TRANSID_SIZE)
+#define MARIA_INDEX_OVERHEAD_SIZE (MARIA_MAX_PACK_TRANSID_SIZE * 2)
+#define MARIA_DELETE_KEY_NR 255 /* keynr for deleted blocks */
+
+/*
+  Basic information of the Maria table. This is stored on disk
+  and not changed (unless we do DLL changes).
+*/
+
+typedef struct st_ma_base_info
+{
+  my_off_t keystart; /* Start of keys */
+  my_off_t max_data_file_length;
+  my_off_t max_key_file_length;
+  my_off_t margin_key_file_length;
+  ha_rows records, reloc; /* Create information */
+  ulong mean_row_length; /* Create information */
+  ulong reclength; /* length of unpacked record */
+  ulong pack_reclength; /* Length of full packed rec */
+  ulong min_pack_length;
+  ulong max_pack_length; /* Max possibly length of packed rec */
+  ulong min_block_length;
+  uint fields; /* fields in table */
+  uint fixed_not_null_fields;
+  uint fixed_not_null_fields_length;
+  uint max_field_lengths;
+  uint pack_fields; /* packed fields in table */
+  uint varlength_fields; /* char/varchar/blobs */
+  /* Number of bytes in the index used to refer to a row (2-8) */
+  uint rec_reflength;
+  /* Number of bytes in the index used to refer to another index page (2-8) */
+  uint key_reflength; /* = 2-8 */
+  uint keys; /* same as in state.header */
+  uint auto_key; /* Which key-1 is a auto key */
+  uint blobs; /* Number of blobs */
+  /* Length of packed bits (when table was created first time) */
+  uint pack_bytes;
+  /* Length of null bits (when table was created first time) */
+  uint original_null_bytes;
+  uint null_bytes; /* Null bytes in record */
+  uint field_offsets; /* Number of field offsets */
+  uint max_key_block_length; /* Max block length */
+  uint max_key_length; /* Max key length */
+  /* Extra allocation when using dynamic record format */
+  uint extra_alloc_bytes;
+  uint extra_alloc_procent;
+  uint is_nulls_extended; /* 1 if new null bytes */
+  uint default_row_flag; /* 0 or ROW_FLAG_NULLS_EXTENDED */
+  uint block_size;
+  /* Size of initial record buffer */
+  uint default_rec_buff_size;
+  /* Extra number of bytes the row format require in the record buffer */
+  uint extra_rec_buff_size;
+  /* Tuning flags that can be ignored by older Maria versions */
+  uint extra_options;
+
+  /* The following are from the header */
+  uint key_parts, all_key_parts;
+  uchar uuid[MY_UUID_SIZE]; /* Unique id of this table instance */
+  /**
+    @brief If false, we disable logging, versioning, transaction etc. Observe
+    difference with MARIA_SHARE::now_transactional
+  */
+  my_bool born_transactional;
+} MARIA_BASE_INFO;
+
+
+/* Structs used intern in database */
+
+typedef struct st_maria_blob /* Info of one blob column in a record */
+{
+  ulong offset; /* Offset to blob in record */
+  uint pack_length; /* Type of packed length */
+  ulong length; /* Calc:ed for each record */
+} MARIA_BLOB;
+
+
+typedef struct st_maria_pack /* Info about the packed (compressed) format */
+{
+  ulong header_length; /* Length of the pack file header */
+  uint ref_length;
+  uchar version;
+} MARIA_PACK;
+
+/* In-memory state of the data file's page-usage bitmap (BLOCK_RECORD) */
+typedef struct st_maria_file_bitmap
+{
+  uchar *map; /* Copy of current bitmap page */
+  pgcache_page_no_t page; /* Page number for current bitmap */
+  uint used_size; /* Size of bitmap head that is not 0 */
+  my_bool changed; /* 1 if page needs to be written */
+  my_bool changed_not_flushed; /* 1 if some bitmap is not flushed */
+  uint flush_all_requested; /**< If _ma_bitmap_flush_all waiting */
+  uint non_flushable; /**< 0 if bitmap and log are in sync */
+  PAGECACHE_FILE file; /* datafile where bitmap is stored */
+
+#ifdef THREAD
+  pthread_mutex_t bitmap_lock;
+  pthread_cond_t bitmap_cond; /**< When bitmap becomes flushable */
+#endif
+  /* Constants, allocated when initiating bitmaps */
+  uint sizes[8]; /* Size per bit combination */
+  uint total_size; /* Total usable size of bitmap page */
+  uint block_size; /* Block size of file */
+  ulong pages_covered; /* Pages covered by bitmap + 1 */
+  DYNAMIC_ARRAY pinned_pages; /**< not-yet-flushable bitmap pages */
+} MARIA_FILE_BITMAP;
+
+#define MARIA_CHECKPOINT_LOOKS_AT_ME 1
+#define MARIA_CHECKPOINT_SHOULD_FREE_ME 2
+#define MARIA_CHECKPOINT_SEEN_IN_LOOP 4
+
+typedef struct st_maria_share
+{ /* Shared between opens */
+ MARIA_STATE_INFO state;
+ MARIA_BASE_INFO base;
+ MARIA_STATE_HISTORY *state_history;
+ MARIA_KEYDEF ft2_keyinfo; /* Second-level ft-key definition */
+ MARIA_KEYDEF *keyinfo; /* Key definitions */
+ MARIA_UNIQUEDEF *uniqueinfo; /* unique definitions */
+ HA_KEYSEG *keyparts; /* key part info */
+ MARIA_COLUMNDEF *columndef; /* Pointer to column information */
+ MARIA_PACK pack; /* Data about packed records */
+ MARIA_BLOB *blobs; /* Pointer to blobs */
+ uint16 *column_nr; /* Original column order */
+ LEX_STRING unique_file_name; /* realpath() of index file */
+ LEX_STRING data_file_name; /* Resolved path names from symlinks */
+ LEX_STRING index_file_name;
+ LEX_STRING open_file_name; /* parameter to open filename */
+ uchar *file_map; /* mem-map of file if possible */
+ PAGECACHE *pagecache; /* ref to the current key cache */
+ MARIA_DECODE_TREE *decode_trees;
+ /*
+ Previous auto-increment value. Used to verify if we can restore the
+ auto-increment counter if we have to abort an insert (duplicate key).
+ */
+ ulonglong last_auto_increment;
+ uint16 *decode_tables;
+ uint16 id; /**< 2-byte id by which log records refer to the table */
+ /* Called the first time the table instance is opened */
+ my_bool (*once_init)(struct st_maria_share *, File);
+ /* Called when the last instance of the table is closed */
+ my_bool (*once_end)(struct st_maria_share *);
+ /* Is called for every open of the table */
+ my_bool (*init)(MARIA_HA *);
+ /* Is called for every close of the table */
+ void (*end)(MARIA_HA *);
+ /* Called when we want to read a record from a specific position */
+ int (*read_record)(MARIA_HA *, uchar *, MARIA_RECORD_POS);
+ /* Initialize a scan */
+ my_bool (*scan_init)(MARIA_HA *);
+ /* Read next record while scanning */
+ int (*scan)(MARIA_HA *, uchar *, MARIA_RECORD_POS, my_bool);
+ /* End scan */
+ void (*scan_end)(MARIA_HA *);
+ int (*scan_remember_pos)(MARIA_HA *, MARIA_RECORD_POS*);
+ void (*scan_restore_pos)(MARIA_HA *, MARIA_RECORD_POS);
+ /* Pre-write of row (some handlers may do the actual write here) */
+ MARIA_RECORD_POS (*write_record_init)(MARIA_HA *, const uchar *);
+ /* Write record (or accept write_record_init) */
+ my_bool (*write_record)(MARIA_HA *, const uchar *);
+ /* Called when write failed */
+ my_bool (*write_record_abort)(MARIA_HA *);
+ my_bool (*update_record)(MARIA_HA *, MARIA_RECORD_POS,
+ const uchar *, const uchar *);
+ my_bool (*delete_record)(MARIA_HA *, const uchar *record);
+ my_bool (*compare_record)(MARIA_HA *, const uchar *);
+ /* calculate checksum for a row */
+ ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *);
+ /*
+ Calculate checksum for a row during write. May be 0 if we calculate
+ the checksum in write_record_init()
+ */
+ ha_checksum(*calc_write_checksum)(MARIA_HA *, const uchar *);
+ /* calculate checksum for a row during check table */
+ ha_checksum(*calc_check_checksum)(MARIA_HA *, const uchar *);
+ /* Compare a row in memory with a row on disk */
+ my_bool (*compare_unique)(MARIA_HA *, MARIA_UNIQUEDEF *,
+ const uchar *record, MARIA_RECORD_POS pos);
+ my_off_t (*keypos_to_recpos)(struct st_maria_share *share, my_off_t pos);
+ my_off_t (*recpos_to_keypos)(struct st_maria_share *share, my_off_t pos);
+ my_bool (*row_is_visible)(MARIA_HA *);
+
+ /* Mapings to read/write the data file */
+ size_t (*file_read)(MARIA_HA *, uchar *, size_t, my_off_t, myf);
+ size_t (*file_write)(MARIA_HA *, const uchar *, size_t, my_off_t, myf);
+ /* query cache invalidator for merged tables */
+ invalidator_by_filename invalidator;
+ /* query cache invalidator for changing state */
+ invalidator_by_filename chst_invalidator;
+ my_off_t key_del_current; /* delete links for index pages */
+ ulong this_process; /* processid */
+ ulong last_process; /* For table-change-check */
+ ulong last_version; /* Version on start */
+ ulong options; /* Options used */
+ ulong min_pack_length; /* These are used by packed data */
+ ulong max_pack_length;
+ ulong state_diff_length;
+ uint rec_reflength; /* rec_reflength in use now */
+ uint keypage_header;
+ uint32 ftkeys; /* Number of distinct full-text keys
+ + 1 */
+ PAGECACHE_FILE kfile; /* Shared keyfile */
+ File data_file; /* Shared data file */
+ int mode; /* mode of file on open */
+ uint reopen; /* How many times opened */
+ uint in_trans; /* Number of references by trn */
+ uint w_locks, r_locks, tot_locks; /* Number of read/write locks */
+ uint block_size; /* block_size of keyfile & data file*/
+ uint max_index_block_size; /* block_size - end_of_page_info */
+ /* Fixed length part of a packed row in BLOCK_RECORD format */
+ uint base_length;
+ myf write_flag;
+ enum data_file_type data_file_type;
+ enum pagecache_page_type page_type; /* value depending transactional */
+ /**
+ if Checkpoint looking at table; protected by close_lock or THR_LOCK_maria
+ */
+ uint8 in_checkpoint;
+ my_bool temporary;
+ /* Below flag is needed to make log tables work with concurrent insert */
+ my_bool is_log_table;
+
+ my_bool changed, /* If changed since lock */
+ global_changed, /* If changed since open */
+ not_flushed;
+ my_bool lock_key_trees; /* If we have to lock trees on read */
+ my_bool non_transactional_concurrent_insert;
+ my_bool delay_key_write;
+ my_bool have_rtree;
+ /**
+ @brief if the table is transactional right now. It may have been created
+ transactional (base.born_transactional==TRUE) but with transactionality
+ (logging) temporarily disabled (now_transactional==FALSE). The opposite
+ (FALSE, TRUE) is impossible.
+ */
+ my_bool now_transactional;
+ my_bool have_versioning;
+ my_bool key_del_used; /* != 0 if key_del is locked */
+ my_bool deleting; /* we are going to delete this table */
+#ifdef THREAD
+ THR_LOCK lock;
+ void (*lock_restore_status)(void *);
+ /**
+ Protects kfile, dfile, most members of the state, state disk writes,
+ versioning information (like in_trans, state_history).
+ @todo find the exhaustive list.
+ */
+ pthread_mutex_t intern_lock;
+ pthread_mutex_t key_del_lock;
+ pthread_cond_t key_del_cond;
+ /**
+ _Always_ held while closing table; prevents checkpoint from looking at
+ structures freed during closure (like bitmap). If you need close_lock and
+ intern_lock, lock them in this order.
+ */
+ pthread_mutex_t close_lock;
+#endif
+ my_off_t mmaped_length;
+ uint nonmmaped_inserts; /* counter of writing in
+ non-mmaped area */
+ MARIA_FILE_BITMAP bitmap;
+ rw_lock_t mmap_lock;
+ LSN lsn_of_file_id; /**< LSN of its last LOGREC_FILE_ID */
+} MARIA_SHARE;
+
+
+/* Raw byte buffer type used for bitmap/extent data */
+typedef uchar MARIA_BITMAP_BUFFER;
+
+/*
+  One run of pages reserved through the bitmap for a row write.
+  'used' is set by write_record() (see comment below), so these entries
+  describe a planned allocation that may be only partially consumed.
+*/
+typedef struct st_maria_bitmap_block
+{
+ pgcache_page_no_t page; /* Page number */
+ /* Number of continuous pages. TAIL_BIT is set if this is a tail page */
+ uint page_count;
+ uint empty_space; /* Set for head and tail pages */
+ /*
+ Number of BLOCKS for block-region (holds all non-blob-fields or one blob)
+ */
+ uint sub_blocks;
+ /* set to <> 0 in write_record() if this block was actually used */
+ uint8 used;
+ /* bitmap value before allocation; NOTE(review): presumably restored on abort — confirm */
+ uint8 org_bitmap_value;
+} MARIA_BITMAP_BLOCK;
+
+
+/*
+  Array of MARIA_BITMAP_BLOCK entries ('block' points to 'count' elements)
+  describing the pages allocated for one row operation.
+*/
+typedef struct st_maria_bitmap_blocks
+{
+ MARIA_BITMAP_BLOCK *block;
+ uint count;
+ my_bool tail_page_skipped; /* If some tail pages were not used */
+ my_bool page_skipped; /* If some full pages were not used */
+} MARIA_BITMAP_BLOCKS;
+
+
+/* Data about the currently read row */
+typedef struct st_maria_row
+{
+ MARIA_BITMAP_BLOCKS insert_blocks; /* pages reserved via bitmap for insert */
+ MARIA_BITMAP_BUFFER *extents; /* extent list; see extents_buffer_length/extents_count */
+ MARIA_RECORD_POS lastpos, nextpos;
+ MARIA_RECORD_POS *tail_positions;
+ ha_checksum checksum; /* checksum of current row */
+ LSN orig_undo_lsn; /* Lsn at start of row insert */
+ TrID trid; /* Transaction id for current row */
+ uchar *empty_bits, *field_lengths;
+ uint *null_field_lengths; /* All null field lengths */
+ ulong *blob_lengths; /* Length for each blob */
+ ulong min_length, normal_length, char_length, varchar_length;
+ ulong blob_length, total_length;
+ size_t extents_buffer_length; /* Size of 'extents' buffer */
+ uint head_length, header_length;
+ uint field_lengths_length; /* Length of data in field_lengths */
+ uint extents_count; /* number of extents in 'extents' */
+ uint full_page_count, tail_count; /* For maria_chk */
+ uint space_on_head_page;
+} MARIA_ROW;
+
+/*
+  Data to scan rows in blocked (BLOCK_RECORD) format: cursor state over
+  the bitmap pages and the data page currently being walked.
+*/
+typedef struct st_maria_block_scan
+{
+ uchar *bitmap_buff, *bitmap_pos, *bitmap_end, *page_buff;
+ uchar *dir, *dir_end; /* row directory of current page */
+ pgcache_page_no_t bitmap_page, max_page;
+ ulonglong bits; /* current chunk of bitmap bits */
+ uint number_of_rows, bit_pos;
+ MARIA_RECORD_POS row_base_page;
+} MARIA_BLOCK_SCAN;
+
+/* Index Condition Pushdown callback; evaluated against 'param' per key */
+typedef ICP_RESULT (*index_cond_func_t)(void *param);
+
+/*
+  Per-open handle for one Maria table (the MARIA_HA "info" object).
+  Several handlers may point at the same MARIA_SHARE via 's'.
+*/
+struct st_maria_handler
+{
+ MARIA_SHARE *s; /* Shared between open:s */
+ struct st_ma_transaction *trn; /* Pointer to active transaction */
+ void *external_ptr; /* Pointer to THD in mysql */
+ MARIA_STATUS_INFO *state, state_save;
+ MARIA_STATUS_INFO *state_start; /* State at start of transaction */
+ MARIA_ROW cur_row; /* The active row that we just read */
+ MARIA_ROW new_row; /* Storage for a row during update */
+ MARIA_KEY last_key; /* Last found key */
+ MARIA_BLOCK_SCAN scan, *scan_save;
+ MARIA_BLOB *blobs; /* Pointer to blobs */
+ MARIA_BIT_BUFF bit_buff;
+ DYNAMIC_ARRAY bitmap_blocks;
+ DYNAMIC_ARRAY pinned_pages;
+ /* accumulate indexfile changes between writes */
+ TREE *bulk_insert;
+ LEX_CUSTRING *log_row_parts; /* For logging */
+ DYNAMIC_ARRAY *ft1_to_ft2; /* used only in ft1->ft2 conversion */
+ MEM_ROOT ft_memroot; /* used by the parser */
+ MYSQL_FTPARSER_PARAM *ftparser_param; /* share info between init/deinit */
+ uchar *buff; /* page buffer */
+ uchar *keyread_buff; /* Buffer for last key read */
+ uchar *lastkey_buff; /* Last used search key */
+ uchar *lastkey_buff2;
+ uchar *first_mbr_key; /* Searched spatial key */
+ uchar *rec_buff; /* Temp buffer for recordpack */
+ uchar *blob_buff; /* Temp buffer for blobs */
+ uchar *int_keypos; /* Save position for next/previous */
+ uchar *int_maxpos; /* -""- */
+ uint keypos_offset; /* Tmp storage for offset int_keypos */
+ uint maxpos_offset; /* Tmp storage for offset int_maxpos */
+ uchar *update_field_data; /* Used by update in rows-in-block */
+ uint int_nod_flag; /* -""- */
+ uint32 int_keytree_version; /* -""- */
+ int (*read_record)(MARIA_HA *, uchar*, MARIA_RECORD_POS);
+ invalidator_by_filename invalidator; /* query cache invalidator */
+ ulonglong last_auto_increment; /* auto value at start of statement */
+ ulong this_unique; /* uniq filenumber or thread */
+ ulong last_unique; /* last unique number */
+ ulong this_loop; /* counter for this open */
+ ulong last_loop; /* last used counter */
+ MARIA_RECORD_POS save_lastpos;
+ MARIA_RECORD_POS dup_key_pos;
+ TrID dup_key_trid;
+ my_off_t pos; /* Intern variable */
+ my_off_t last_keypage; /* Last key page read */
+ my_off_t last_search_keypage; /* Last keypage when searching */
+
+ /*
+ QQ: the following two xxx_length fields should be removed,
+ as they are not compatible with parallel repair
+ */
+ ulong packed_length, blob_length; /* Length of found, packed record */
+ size_t rec_buff_size, blob_buff_size;
+ PAGECACHE_FILE dfile; /* The datafile */
+ IO_CACHE rec_cache; /* When cacheing records */
+ LIST open_list;
+ MY_BITMAP changed_fields;
+ ulong row_base_length; /* Length of row header */
+ uint row_flag; /* Flag to store in row header */
+ uint opt_flag; /* Optim. for space/speed */
+ uint update; /* If file changed since open */
+ int lastinx; /* Last used index */
+ uint last_rkey_length; /* Last length in maria_rkey() */
+ uint *last_rtree_keypos; /* Last key positions for rtrees */
+ uint bulk_insert_ref_length; /* Length of row ref during bi */
+ uint non_flushable_state;
+ enum ha_rkey_function last_key_func; /* CONTAIN, OVERLAP, etc */
+ uint save_lastkey_data_length;
+ uint save_lastkey_ref_length;
+ uint pack_key_length; /* For MARIA_MRG */
+ myf lock_wait; /* is 0 or MY_SHORT_WAIT */
+ int errkey; /* Got last error on this key */
+ int lock_type; /* How database was locked */
+ int tmp_lock_type; /* When locked by readinfo */
+ uint data_changed; /* Somebody has changed data */
+ uint save_update; /* When using KEY_READ */
+ int save_lastinx;
+ uint preload_buff_size; /* When preloading indexes */
+ uint16 last_used_keyseg; /* For MARIAMRG */
+ uint8 key_del_used; /* != 0 if key_del is used */
+ my_bool was_locked; /* Was locked in panic */
+ my_bool append_insert_at_end; /* Set if concurrent insert */
+ my_bool quick_mode;
+ /* Marker if key_del_changed */
+ /* NOTE(review): comment above appears orphaned — verify which member it documents */
+ /* If info->keyread_buff can't be used for rnext */
+ my_bool page_changed;
+ /* If info->keyread_buff has to be re-read for rnext */
+ my_bool keyread_buff_used;
+ my_bool once_flags; /* For MARIA_MRG */
+ /* For bulk insert enable/disable transactions control */
+ my_bool switched_transactional;
+#ifdef __WIN__
+ my_bool owned_by_merge; /* This Maria table is part of a merge union */
+#endif
+#ifdef THREAD
+ THR_LOCK_DATA lock;
+#endif
+ uchar *maria_rtree_recursion_state; /* For RTREE */
+ uchar length_buff[5]; /* temp buff to store blob lengths */
+ int maria_rtree_recursion_depth;
+
+ index_cond_func_t index_cond_func; /* Index condition function */
+ void *index_cond_func_arg; /* parameter for the func */
+};
+
+/* Some defines used by maria-functions */
+
+#define USE_WHOLE_KEY 65535 /* Use whole key in _search() */
+#define F_EXTRA_LCK -1
+
+/* bits in opt_flag */
+#define MEMMAP_USED 32
+#define REMEMBER_OLD_POS 64
+
+#define WRITEINFO_UPDATE_KEYFILE 1
+#define WRITEINFO_NO_UNLOCK 2
+
+/* once_flags */
+#define USE_PACKED_KEYS 1
+#define RRND_PRESERVE_LASTINX 2
+
+/* bits in state.changed */
+
+#define STATE_CHANGED 1
+#define STATE_CRASHED 2
+#define STATE_CRASHED_ON_REPAIR 4
+#define STATE_NOT_ANALYZED 8
+#define STATE_NOT_OPTIMIZED_KEYS 16
+#define STATE_NOT_SORTED_PAGES 32
+#define STATE_NOT_OPTIMIZED_ROWS 64
+#define STATE_NOT_ZEROFILLED 128
+#define STATE_NOT_MOVABLE 256
+#define STATE_MOVED 512 /* set if base->uuid != maria_uuid */
+#define STATE_IN_REPAIR 1024 /* We are running repair on table */
+
+/* options to maria_read_cache */
+
+#define READING_NEXT 1
+#define READING_HEADER 2
+
+/* Number of bytes on key pages to indicate used size */
+#define KEYPAGE_USED_SIZE 2
+#define KEYPAGE_KEYID_SIZE 1
+#define KEYPAGE_FLAG_SIZE 1
+#define KEYPAGE_CHECKSUM_SIZE 4
+#define MAX_KEYPAGE_HEADER_SIZE (LSN_STORE_SIZE + KEYPAGE_USED_SIZE + \
+ KEYPAGE_KEYID_SIZE + KEYPAGE_FLAG_SIZE + \
+ TRANSID_SIZE)
+#define KEYPAGE_FLAG_ISNOD 1
+#define KEYPAGE_FLAG_HAS_TRANSID 2
+/* Position to KEYPAGE_FLAG for transactional tables */
+/* NOTE(review): expansion is unparenthesized; unsafe if used inside a larger expression */
+#define KEYPAGE_TRANSFLAG_OFFSET LSN_STORE_SIZE + TRANSID_SIZE + KEYPAGE_KEYID_SIZE
+
+/* Accessors for the key-page header; offsets are relative to keypage_header */
+#define _ma_get_page_used(share,x) \
+ ((uint) mi_uint2korr((x) + (share)->keypage_header - KEYPAGE_USED_SIZE))
+#define _ma_store_page_used(share,x,y) \
+ mi_int2store((x) + (share)->keypage_header - KEYPAGE_USED_SIZE, (y))
+#define _ma_get_keypage_flag(share,x) x[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE]
+#define _ma_test_if_nod(share,x) \
+ ((_ma_get_keypage_flag(share,x) & KEYPAGE_FLAG_ISNOD) ? (share)->base.key_reflength : 0)
+
+#define _ma_store_keynr(share, x, nr) x[(share)->keypage_header - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE - KEYPAGE_USED_SIZE]= (nr)
+#define _ma_get_keynr(share, x) ((uchar) x[(share)->keypage_header - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE - KEYPAGE_USED_SIZE])
+#define _ma_store_transid(buff, transid) \
+ transid_store((buff) + LSN_STORE_SIZE, (transid))
+#define _ma_korr_transid(buff) \
+ transid_korr((buff) + LSN_STORE_SIZE)
+#define _ma_store_keypage_flag(share,x,flag) x[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE]= (flag)
+/* NOTE(review): two statements not wrapped in do{}while(0); unsafe after unbraced if */
+#define _ma_mark_page_with_transid(share, page) \
+ (page)->flag|= KEYPAGE_FLAG_HAS_TRANSID; \
+ (page)->buff[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE]= (page)->flag;
+
+/*
+ TODO: write int4store_aligned as *((uint32 *) (T))= (uint32) (A) for
+ architectures where it is possible
+*/
+#define int4store_aligned(A,B) int4store((A),(B))
+
+/* Mark a table (via handler or share) as crashed / needing repair */
+#define maria_mark_crashed(x) do{(x)->s->state.changed|= STATE_CRASHED; \
+ DBUG_PRINT("error", ("Marked table crashed")); \
+ }while(0)
+#define maria_mark_crashed_share(x) \
+ do{(x)->state.changed|= STATE_CRASHED; \
+ DBUG_PRINT("error", ("Marked table crashed")); \
+ }while(0)
+#define maria_mark_crashed_on_repair(x) do{(x)->s->state.changed|= \
+ STATE_CRASHED|STATE_CRASHED_ON_REPAIR; \
+ (x)->update|= HA_STATE_CHANGED; \
+ DBUG_PRINT("error", ("Marked table crashed on repair")); \
+ }while(0)
+#define maria_mark_in_repair(x) do{(x)->s->state.changed|= \
+ STATE_CRASHED | STATE_IN_REPAIR; \
+ (x)->update|= HA_STATE_CHANGED; \
+ DBUG_PRINT("error", ("Marked table crashed for repair")); \
+ }while(0)
+#define maria_is_crashed(x) ((x)->s->state.changed & STATE_CRASHED)
+#define maria_is_crashed_on_repair(x) ((x)->s->state.changed & STATE_CRASHED_ON_REPAIR)
+#define maria_in_repair(x) ((x)->s->state.changed & STATE_IN_REPAIR)
+
+#ifdef EXTRA_DEBUG
+/**
+ Brings additional information in certain debug builds and in standalone
+ (non-ha_maria) programs. To help debugging. Not in ha_maria, to not spam the
+ user (some messages can be produced many times per statement, or even
+ wrongly during some repair operations).
+*/
+#define maria_print_error(SHARE, ERRNO) \
+ do{ if (!maria_in_ha_maria) \
+ _ma_report_error((ERRNO), &(SHARE)->index_file_name); } \
+ while(0)
+#else
+/* no-op; NOTE(review): plain 'while (0)' — 'do{}while(0)' would be the conventional form */
+#define maria_print_error(SHARE, ERRNO) while (0)
+#endif
+#define DBUG_DUMP_KEY(name, key) DBUG_DUMP(name, (key)->data, (key)->data_length + (key)->ref_length)
+
+
+/* Functions to store length of space packed keys, VARCHAR or BLOB keys */
+
+/* 1-byte length if < 255, else a 0xFF marker plus 2-byte big-endian length */
+#define store_key_length(key,length) \
+{ if ((length) < 255) \
+ { *(key)=(length); } \
+ else \
+ { *(key)=255; mi_int2store((key)+1,(length)); } \
+}
+
+/* Reads packed length (incl. the length bytes themselves) and advances 'key' */
+#define get_key_full_length(length,key) \
+ { if (*(const uchar*) (key) != 255) \
+ length= ((uint) *(const uchar*) ((key)++))+1; \
+ else \
+ { length=mi_uint2korr((key)+1)+3; (key)+=3; } \
+}
+
+/* Same as above but does not advance 'key' */
+#define get_key_full_length_rdonly(length,key) \
+{ if (*(const uchar*) (key) != 255) \
+ length= ((uint) *(const uchar*) ((key)))+1; \
+ else \
+ { length=mi_uint2korr((key)+1)+3; } \
+}
+
+#define maria_max_key_length() ((maria_block_size - MAX_KEYPAGE_HEADER_SIZE)/2 - MARIA_INDEX_OVERHEAD_SIZE)
+#define get_pack_length(length) ((length) >= 255 ? 3 : 1)
+#define _ma_have_versioning(info) ((info)->row_flag & ROW_FLAG_TRANSID)
+
+/**
+ Sets table's trn and prints debug information
+ @param tbl MARIA_HA of table
+ @param newtrn what to put into tbl->trn
+ @note cast of newtrn is because %p of NULL gives warning (NULL is int)
+*/
+#define _ma_set_trn_for_table(tbl, newtrn) do { \
+ DBUG_PRINT("info",("table: %p trn: %p -> %p", \
+ (tbl), (tbl)->trn, (void *)(newtrn))); \
+ (tbl)->trn= (newtrn); \
+ } while (0)
+
+
+#define MARIA_MIN_BLOCK_LENGTH 20 /* Because of delete-link */
+/* Don't use too small record-blocks */
+#define MARIA_EXTEND_BLOCK_LENGTH 20
+#define MARIA_SPLIT_LENGTH ((MARIA_EXTEND_BLOCK_LENGTH+4)*2)
+ /* Max prefix of record-block */
+#define MARIA_MAX_DYN_BLOCK_HEADER 20
+#define MARIA_BLOCK_INFO_HEADER_LENGTH 20
+#define MARIA_DYN_DELETE_BLOCK_HEADER 20 /* length of delete-block-header */
+#define MARIA_DYN_MAX_BLOCK_LENGTH ((1L << 24)-4L)
+#define MARIA_DYN_MAX_ROW_LENGTH (MARIA_DYN_MAX_BLOCK_LENGTH - MARIA_SPLIT_LENGTH)
+#define MARIA_DYN_ALIGN_SIZE 4 /* Align blocks on this */
+#define MARIA_MAX_DYN_HEADER_BYTE 13 /* max header uchar for dynamic rows */
+#define MARIA_MAX_BLOCK_LENGTH ((((ulong) 1 << 24)-1) & (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1)))
+#define MARIA_REC_BUFF_OFFSET ALIGN_SIZE(MARIA_DYN_DELETE_BLOCK_HEADER+sizeof(uint32))
+
+#define MEMMAP_EXTRA_MARGIN 7 /* Write this as a suffix for file */
+
+#define PACK_TYPE_SELECTED 1 /* Bits in field->pack_type */
+#define PACK_TYPE_SPACE_FIELDS 2
+#define PACK_TYPE_ZERO_FILL 4
+#define MARIA_FOUND_WRONG_KEY 32738 /* Impossible value from ha_key_cmp */
+
+#define MARIA_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size) (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size))
+#define MARIA_MAX_KEYPTR_SIZE 5 /* For calculating block lengths */
+
+/* Marker for impossible delete link */
+#define IMPOSSIBLE_PAGE_NO LL(0xFFFFFFFFFF)
+
+/* The UNIQUE check is done with a hashed long key */
+
+#define MARIA_UNIQUE_HASH_TYPE HA_KEYTYPE_ULONG_INT
+#define maria_unique_store(A,B) mi_int4store((A),(B))
+
+#ifdef THREAD
+extern pthread_mutex_t THR_LOCK_maria;
+#endif
+/* In non-threaded builds the rw-lock operations are compiled away */
+#if !defined(THREAD) || defined(DONT_USE_RW_LOCKS)
+#define rw_wrlock(A) {}
+#define rw_rdlock(A) {}
+#define rw_unlock(A) {}
+#endif
+
+/* Some tuning parameters */
+#define MARIA_MIN_KEYBLOCK_LENGTH 50 /* When to split delete blocks */
+#define MARIA_MIN_SIZE_BULK_INSERT_TREE 16384 /* this is per key */
+#define MARIA_MIN_ROWS_TO_USE_BULK_INSERT 100
+#define MARIA_MIN_ROWS_TO_DISABLE_INDEXES 100
+#define MARIA_MIN_ROWS_TO_USE_WRITE_CACHE 10
+/* Keep a small buffer for tables only using small blobs */
+#define MARIA_SMALL_BLOB_BUFFER 1024
+#define MARIA_MAX_CONTROL_FILE_LOCK_RETRY 30 /* Retry this many times */
+
+
+/* Some extern variables */
+extern LIST *maria_open_list;
+extern uchar maria_file_magic[], maria_pack_file_magic[];
+extern uchar maria_uuid[MY_UUID_SIZE];
+extern uint32 maria_read_vec[], maria_readnext_vec[];
+extern uint maria_quick_table_bits;
+extern char *maria_data_root;
+extern uchar maria_zero_string[];
+extern my_bool maria_inited, maria_in_ha_maria, maria_recovery_changed_data;
+extern my_bool maria_recovery_verbose;
+extern HASH maria_stored_state;
+extern int (*maria_create_trn_hook)(MARIA_HA *);
+
+/* This is used by _ma_calc_xxx_key_length and _ma_store_key */
+typedef struct st_maria_s_param
+{
+ const uchar *key;
+ uchar *prev_key, *next_key_pos;
+ uchar *key_pos; /* For balance page */
+ uint ref_length, key_length, n_ref_length;
+ uint n_length, totlength, part_of_prev_key, prev_length, pack_marker;
+ uint changed_length;
+ int move_length; /* For balance_page */
+ my_bool store_not_null;
+} MARIA_KEY_PARAM;
+
+
+/* Used to store reference to pinned page */
+typedef struct st_pinned_page
+{
+ PAGECACHE_BLOCK_LINK *link;
+ enum pagecache_page_lock unlock, write_lock;
+ my_bool changed; /* set via page_mark_changed() */
+} MARIA_PINNED_PAGE;
+
+
+/* Keeps all information about a page and related to a page */
+typedef struct st_maria_page
+{
+ MARIA_HA *info;
+ const MARIA_KEYDEF *keyinfo;
+ uchar *buff; /* Data for page */
+ my_off_t pos; /* Disk address to page */
+ uint size; /* Size of data on page */
+ uint org_size; /* Size of page at read or after log */
+ uint node; /* 0 or share->base.key_reflength */
+ uint flag; /* Page flag */
+ uint link_offset; /* index into info->pinned_pages */
+} MARIA_PAGE;
+
+
+/* Prototypes for intern functions */
+extern int _ma_read_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS);
+extern int _ma_read_rnd_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS,
+ my_bool);
+extern my_bool _ma_write_dynamic_record(MARIA_HA *, const uchar *);
+extern my_bool _ma_update_dynamic_record(MARIA_HA *, MARIA_RECORD_POS,
+ const uchar *, const uchar *);
+extern my_bool _ma_delete_dynamic_record(MARIA_HA *info, const uchar *record);
+extern my_bool _ma_cmp_dynamic_record(MARIA_HA *info, const uchar *record);
+extern my_bool _ma_write_blob_record(MARIA_HA *, const uchar *);
+extern my_bool _ma_update_blob_record(MARIA_HA *, MARIA_RECORD_POS,
+ const uchar *, const uchar *);
+extern int _ma_read_static_record(MARIA_HA *info, uchar *, MARIA_RECORD_POS);
+extern int _ma_read_rnd_static_record(MARIA_HA *, uchar *, MARIA_RECORD_POS,
+ my_bool);
+extern my_bool _ma_write_static_record(MARIA_HA *, const uchar *);
+extern my_bool _ma_update_static_record(MARIA_HA *, MARIA_RECORD_POS,
+ const uchar *, const uchar *);
+extern my_bool _ma_delete_static_record(MARIA_HA *info, const uchar *record);
+extern my_bool _ma_cmp_static_record(MARIA_HA *info, const uchar *record);
+extern my_bool _ma_ck_write(MARIA_HA *info, MARIA_KEY *key);
+extern my_bool _ma_enlarge_root(MARIA_HA *info, MARIA_KEY *key,
+ MARIA_RECORD_POS *root);
+int _ma_insert(register MARIA_HA *info, MARIA_KEY *key,
+ MARIA_PAGE *anc_page, uchar *key_pos, uchar *key_buff,
+ MARIA_PAGE *father_page, uchar *father_key_pos,
+ my_bool insert_last);
+extern my_bool _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEY *key,
+ MARIA_RECORD_POS *root, uint32 comp_flag);
+extern int _ma_split_page(MARIA_HA *info, MARIA_KEY *key,
+ MARIA_PAGE *split_page,
+ uint org_split_length,
+ uchar *inserted_key_pos, uint changed_length,
+ int move_length,
+ uchar *key_buff, my_bool insert_last_key);
+extern uchar *_ma_find_half_pos(MARIA_KEY *key, MARIA_PAGE *page,
+ uchar ** after_key);
+extern int _ma_calc_static_key_length(const MARIA_KEY *key, uint nod_flag,
+ uchar *key_pos, uchar *org_key,
+ uchar *key_buff,
+ MARIA_KEY_PARAM *s_temp);
+extern int _ma_calc_var_key_length(const MARIA_KEY *key, uint nod_flag,
+ uchar *key_pos, uchar *org_key,
+ uchar *key_buff,
+ MARIA_KEY_PARAM *s_temp);
+extern int _ma_calc_var_pack_key_length(const MARIA_KEY *key,
+ uint nod_flag, uchar *next_key,
+ uchar *org_key, uchar *prev_key,
+ MARIA_KEY_PARAM *s_temp);
+extern int _ma_calc_bin_pack_key_length(const MARIA_KEY *key,
+ uint nod_flag, uchar *next_key,
+ uchar *org_key, uchar *prev_key,
+ MARIA_KEY_PARAM *s_temp);
+extern void _ma_store_static_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+ MARIA_KEY_PARAM *s_temp);
+extern void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+ MARIA_KEY_PARAM *s_temp);
+#ifdef NOT_USED
+extern void _ma_store_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+ MARIA_KEY_PARAM *s_temp);
+#endif
+extern void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+ MARIA_KEY_PARAM *s_temp);
+
+extern my_bool _ma_ck_delete(MARIA_HA *info, MARIA_KEY *key);
+extern my_bool _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEY *key,
+ my_off_t *root);
+extern int _ma_readinfo(MARIA_HA *info, int lock_flag, int check_keybuffer);
+extern int _ma_writeinfo(MARIA_HA *info, uint options);
+extern int _ma_test_if_changed(MARIA_HA *info);
+extern int _ma_mark_file_changed(MARIA_HA *info);
+extern void _ma_mark_file_crashed(MARIA_SHARE *share);
+extern my_bool _ma_set_uuid(MARIA_HA *info, my_bool reset_uuid);
+extern my_bool _ma_check_if_zero(uchar *pos, size_t size);
+extern int _ma_decrement_open_count(MARIA_HA *info);
+extern int _ma_check_index(MARIA_HA *info, int inx);
+extern int _ma_search(MARIA_HA *info, MARIA_KEY *key, uint32 nextflag,
+ my_off_t pos);
+extern int _ma_bin_search(const MARIA_KEY *key, const MARIA_PAGE *page,
+ uint32 comp_flag, uchar **ret_pos, uchar *buff,
+ my_bool *was_last_key);
+extern int _ma_seq_search(const MARIA_KEY *key, const MARIA_PAGE *page,
+ uint comp_flag, uchar ** ret_pos, uchar *buff,
+ my_bool *was_last_key);
+extern int _ma_prefix_search(const MARIA_KEY *key, const MARIA_PAGE *page,
+ uint32 comp_flag, uchar ** ret_pos, uchar *buff,
+ my_bool *was_last_key);
+extern my_off_t _ma_kpos(uint nod_flag, const uchar *after_key);
+extern void _ma_kpointer(MARIA_HA *info, uchar *buff, my_off_t pos);
+MARIA_RECORD_POS _ma_row_pos_from_key(const MARIA_KEY *key);
+TrID _ma_trid_from_key(const MARIA_KEY *key);
+extern MARIA_RECORD_POS _ma_rec_pos(MARIA_SHARE *share, uchar *ptr);
+extern void _ma_dpointer(MARIA_SHARE *share, uchar *buff,
+ MARIA_RECORD_POS pos);
+extern uint _ma_get_static_key(MARIA_KEY *key, uint page_flag, uint nod_flag,
+ uchar **page);
+extern uchar *_ma_skip_static_key(MARIA_KEY *key, uint page_flag,
+ uint nod_flag, uchar *page);
+extern uint _ma_get_pack_key(MARIA_KEY *key, uint page_flag, uint nod_flag,
+ uchar **page);
+extern uchar *_ma_skip_pack_key(MARIA_KEY *key, uint page_flag,
+ uint nod_flag, uchar *page);
+extern uint _ma_get_binary_pack_key(MARIA_KEY *key, uint page_flag,
+ uint nod_flag, uchar **page_pos);
+uchar *_ma_skip_binary_pack_key(MARIA_KEY *key, uint page_flag,
+ uint nod_flag, uchar *page);
+extern uchar *_ma_get_last_key(MARIA_KEY *key, MARIA_PAGE *page,
+ uchar *endpos);
+extern uchar *_ma_get_key(MARIA_KEY *key, MARIA_PAGE *page, uchar *keypos);
+extern uint _ma_keylength(MARIA_KEYDEF *keyinfo, const uchar *key);
+extern uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, const uchar *key,
+ HA_KEYSEG *end);
+extern int _ma_search_next(MARIA_HA *info, MARIA_KEY *key,
+ uint32 nextflag, my_off_t pos);
+extern int _ma_search_first(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ my_off_t pos);
+extern int _ma_search_last(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+ my_off_t pos);
+extern my_off_t _ma_static_keypos_to_recpos(MARIA_SHARE *share, my_off_t pos);
+extern my_off_t _ma_static_recpos_to_keypos(MARIA_SHARE *share, my_off_t pos);
+extern my_off_t _ma_transparent_recpos(MARIA_SHARE *share, my_off_t pos);
+extern my_off_t _ma_transaction_keypos_to_recpos(MARIA_SHARE *, my_off_t pos);
+extern my_off_t _ma_transaction_recpos_to_keypos(MARIA_SHARE *, my_off_t pos);
+
+extern void _ma_page_setup(MARIA_PAGE *page, MARIA_HA *info,
+ const MARIA_KEYDEF *keyinfo, my_off_t pos,
+ uchar *buff);
+extern my_bool _ma_fetch_keypage(MARIA_PAGE *page, MARIA_HA *info,
+ const MARIA_KEYDEF *keyinfo,
+ my_off_t pos, enum pagecache_page_lock lock,
+ int level, uchar *buff,
+ my_bool return_buffer);
+extern my_bool _ma_write_keypage(MARIA_PAGE *page,
+ enum pagecache_page_lock lock, int level);
+extern int _ma_dispose(MARIA_HA *info, my_off_t pos, my_bool page_not_read);
+extern my_off_t _ma_new(register MARIA_HA *info, int level,
+ MARIA_PINNED_PAGE **page_link);
+extern my_bool _ma_compact_keypage(MARIA_PAGE *page, TrID min_read_from);
+extern uint transid_store_packed(MARIA_HA *info, uchar *to, ulonglong trid);
+extern ulonglong transid_get_packed(MARIA_SHARE *share, const uchar *from);
+#define transid_packed_length(data) \
+ ((data)[0] < MARIA_MIN_TRANSID_PACK_OFFSET ? 1 : \
+ (uint) ((uchar) (data)[0]) - (MARIA_TRANSID_PACK_OFFSET - 1))
+#define key_has_transid(key) (*(key) & 1)
+
+/* Flag a pinned page as changed so it is written back at unpin time */
+#define page_mark_changed(info, page) \
+ dynamic_element(&(info)->pinned_pages, (page)->link_offset, \
+ MARIA_PINNED_PAGE*)->changed= 1;
+#define page_store_size(share, page) \
+ _ma_store_page_used((share), (page)->buff, (page)->size);
+#define page_store_info(share, page) \
+ _ma_store_keypage_flag((share), (page)->buff, (page)->flag); \
+ _ma_store_page_used((share), (page)->buff, (page)->size);
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+/* NOTE(review): prototype lacks a trailing ';' — verify this branch compiles */
+void page_cleanup(MARIA_SHARE *share, MARIA_PAGE *page)
+#else
+#define page_cleanup(A,B) while (0)
+#endif
+
+extern MARIA_KEY *_ma_make_key(MARIA_HA *info, MARIA_KEY *int_key, uint keynr,
+ uchar *key, const uchar *record,
+ MARIA_RECORD_POS filepos, ulonglong trid);
+extern MARIA_KEY *_ma_pack_key(MARIA_HA *info, MARIA_KEY *int_key,
+ uint keynr, uchar *key,
+ const uchar *old, key_part_map keypart_map,
+ HA_KEYSEG ** last_used_keyseg);
+extern void _ma_copy_key(MARIA_KEY *to, const MARIA_KEY *from);
+extern int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS);
+extern my_bool _ma_read_cache(IO_CACHE *info, uchar *buff,
+ MARIA_RECORD_POS pos, size_t length,
+ uint re_read_if_possibly);
+extern ulonglong ma_retrieve_auto_increment(const uchar *key, uint8 key_type);
+extern my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size,
+ size_t new_size);
+extern ulong _ma_rec_unpack(MARIA_HA *info, uchar *to, uchar *from,
+ ulong reclength);
+extern my_bool _ma_rec_check(MARIA_HA *info, const uchar *record,
+ uchar *packpos, ulong packed_length,
+ my_bool with_checkum, ha_checksum checksum);
+extern int _ma_write_part_record(MARIA_HA *info, my_off_t filepos,
+ ulong length, my_off_t next_filepos,
+ uchar ** record, ulong *reclength,
+ int *flag);
+extern void _ma_print_key(FILE *stream, MARIA_KEY *key);
+extern void _ma_print_keydata(FILE *stream, HA_KEYSEG *keyseg,
+ const uchar *key, uint length);
+extern my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile);
+extern my_bool _ma_once_end_pack_row(MARIA_SHARE *share);
+extern int _ma_read_pack_record(MARIA_HA *info, uchar *buf,
+ MARIA_RECORD_POS filepos);
+extern int _ma_read_rnd_pack_record(MARIA_HA *, uchar *, MARIA_RECORD_POS,
+ my_bool);
+extern int _ma_pack_rec_unpack(MARIA_HA *info, MARIA_BIT_BUFF *bit_buff,
+ uchar *to, uchar *from, ulong reclength);
+extern ulonglong _ma_safe_mul(ulonglong a, ulonglong b);
+extern int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf,
+ const uchar *oldrec, const uchar *newrec,
+ my_off_t pos);
+/*
+ Parameter to _ma_get_block_info
+ The dynamic row header is read into this struct. For an explanation of
+ the fields, look at the function _ma_get_block_info().
+*/
+
+typedef struct st_maria_block_info
+{
+ uchar header[MARIA_BLOCK_INFO_HEADER_LENGTH];
+ ulong rec_len;
+ ulong data_len;
+ ulong block_len;
+ ulong blob_len;
+ MARIA_RECORD_POS filepos;
+ MARIA_RECORD_POS next_filepos;
+ MARIA_RECORD_POS prev_filepos;
+ uint second_read;
+ uint offset;
+} MARIA_BLOCK_INFO;
+
+
+/* bits in return from _ma_get_block_info */
+
+#define BLOCK_FIRST 1
+#define BLOCK_LAST 2
+#define BLOCK_DELETED 4
+#define BLOCK_ERROR 8 /* Wrong data */
+#define BLOCK_SYNC_ERROR 16 /* Right data at wrong place */
+#define BLOCK_FATAL_ERROR 32 /* hardware-error */
+
+#define NEED_MEM ((uint) 10*4*(IO_SIZE+32)+32) /* Need for recursion */
+#define MAXERR 20
+#define BUFFERS_WHEN_SORTING 16 /* Alloc for sort-key-tree */
+#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
+#define INDEX_TMP_EXT ".TMM"
+#define DATA_TMP_EXT ".TMD"
+
+/* bits telling which parts of the state to update (see _ma_writeinfo users) */
+#define UPDATE_TIME 1
+#define UPDATE_STAT 2
+#define UPDATE_SORT 4
+#define UPDATE_AUTO_INC 8
+#define UPDATE_OPEN_COUNT 16
+
+#define USE_BUFFER_INIT (((1024L*1024L*128-MALLOC_OVERHEAD)/8192)*8192)
+#define READ_BUFFER_INIT (1024L*256L-MALLOC_OVERHEAD)
+#define SORT_BUFFER_INIT (1024L*1024L*256-MALLOC_OVERHEAD)
+#define MIN_SORT_BUFFER (4096-MALLOC_OVERHEAD)
+
+/* NOTE(review): unbraced 'if' in macro — dangling-else hazard at call sites */
+#define fast_ma_writeinfo(INFO) if (!(INFO)->s->tot_locks) (void) _ma_writeinfo((INFO),0)
+#define fast_ma_readinfo(INFO) ((INFO)->lock_type == F_UNLCK) && _ma_readinfo((INFO),F_RDLCK,1)
+
+extern uint _ma_get_block_info(MARIA_BLOCK_INFO *, File, my_off_t);
+extern uint _ma_rec_pack(MARIA_HA *info, uchar *to, const uchar *from);
+extern uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff,
+ MARIA_BLOCK_INFO *info, uchar **rec_buff_p,
+ size_t *rec_buff_size,
+ File file, my_off_t filepos);
+extern void _ma_store_blob_length(uchar *pos, uint pack_length, uint length);
+extern void _ma_report_error(int errcode, const LEX_STRING *file_name);
+extern my_bool _ma_memmap_file(MARIA_HA *info);
+extern void _ma_unmap_file(MARIA_HA *info);
+extern uint _ma_save_pack_length(uint version, uchar * block_buff,
+ ulong length);
+extern uint _ma_calc_pack_length(uint version, ulong length);
+extern ulong _ma_calc_blob_length(uint length, const uchar *pos);
+extern size_t _ma_mmap_pread(MARIA_HA *info, uchar *Buffer,
+ size_t Count, my_off_t offset, myf MyFlags);
+extern size_t _ma_mmap_pwrite(MARIA_HA *info, const uchar *Buffer,
+ size_t Count, my_off_t offset, myf MyFlags);
+extern size_t _ma_nommap_pread(MARIA_HA *info, uchar *Buffer,
+ size_t Count, my_off_t offset, myf MyFlags);
+extern size_t _ma_nommap_pwrite(MARIA_HA *info, const uchar *Buffer,
+ size_t Count, my_off_t offset, myf MyFlags);
+
+/* my_pwrite instead of my_write used */
+#define MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET 1
+/* info should be written */
+#define MA_STATE_INFO_WRITE_FULL_INFO 2
+/* intern_lock taking is needed */
+#define MA_STATE_INFO_WRITE_LOCK 4
+uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite);
+uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite);
+uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state);
+uint _ma_base_info_write(File file, MARIA_BASE_INFO *base);
+my_bool _ma_keyseg_write(File file, const HA_KEYSEG *keyseg);
+uchar *_ma_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg);
+my_bool _ma_keydef_write(File file, MARIA_KEYDEF *keydef);
+uchar *_ma_keydef_read(uchar *ptr, MARIA_KEYDEF *keydef);
+my_bool _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *keydef);
+uchar *_ma_uniquedef_read(uchar *ptr, MARIA_UNIQUEDEF *keydef);
+my_bool _ma_columndef_write(File file, MARIA_COLUMNDEF *columndef);
+uchar *_ma_columndef_read(uchar *ptr, MARIA_COLUMNDEF *columndef);
+my_bool _ma_column_nr_write(File file, uint16 *offsets, uint columns);
+uchar *_ma_column_nr_read(uchar *ptr, uint16 *offsets, uint columns);
+ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record);
+ha_checksum _ma_checksum(MARIA_HA *info, const uchar *buf);
+ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *buf);
+my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ uchar *record, ha_checksum unique_hash,
+ MARIA_RECORD_POS pos);
+ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *buf);
+my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ const uchar *record, MARIA_RECORD_POS pos);
+my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+ const uchar *record, MARIA_RECORD_POS pos);
+my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b,
+ my_bool null_are_equal);
+void _ma_get_status(void *param, my_bool concurrent_insert);
+void _ma_update_status(void *param);
+void _ma_restore_status(void *param);
+void _ma_copy_status(void *to, void *from);
+my_bool _ma_check_status(void *param);
+/* NOTE(review): duplicate declaration of _ma_restore_status (also above) — harmless but redundant */
+void _ma_restore_status(void *param);
+void _ma_reset_status(MARIA_HA *maria);
+int _ma_def_scan_remember_pos(MARIA_HA *info, MARIA_RECORD_POS *lastpos);
+void _ma_def_scan_restore_pos(MARIA_HA *info, MARIA_RECORD_POS lastpos);
+
+#include "ma_commit.h"
+
+extern MARIA_HA *_ma_test_if_reopen(const char *filename);
+my_bool _ma_check_table_is_closed(const char *name, const char *where);
+int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share, const char *org_name,
+ File file_to_dup);
+int _ma_open_keyfile(MARIA_SHARE *share);
+void _ma_setup_functions(register MARIA_SHARE *share);
+my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size);
+void _ma_remap_file(MARIA_HA *info, my_off_t size);
+
+MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info, const uchar *record);
+my_bool _ma_write_abort_default(MARIA_HA *info);
+C_MODE_START
+/* Flags for _ma_flush_table_files(): which of the two files to flush. */
+#define MARIA_FLUSH_DATA 1
+#define MARIA_FLUSH_INDEX 2
+int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index,
+                          enum flush_type flush_type_for_data,
+                          enum flush_type flush_type_for_index);
+/*
+  Functions needed by _ma_check (are overridden in MySQL/ha_maria.cc).
+  See ma_check_standalone.h .
+*/
+int _ma_killed_ptr(HA_CHECK *param);
+/* printf-style reporting hooks; format strings checked by the compiler. */
+void _ma_check_print_error _VARARGS((HA_CHECK *param, const char *fmt, ...))
+  ATTRIBUTE_FORMAT(printf, 2, 3);
+void _ma_check_print_warning _VARARGS((HA_CHECK *param, const char *fmt, ...))
+  ATTRIBUTE_FORMAT(printf, 2, 3);
+void _ma_check_print_info _VARARGS((HA_CHECK *param, const char *fmt, ...))
+  ATTRIBUTE_FORMAT(printf, 2, 3);
+my_bool write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info);
+C_MODE_END
+
+int _ma_flush_pending_blocks(MARIA_SORT_PARAM *param);
+int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param);
+int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param);
+#ifdef THREAD
+pthread_handler_t _ma_thr_find_all_keys(void *arg);
+#endif
+
+int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param);
+int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages,
+ size_t);
+int _ma_sync_table_files(const MARIA_HA *info);
+int _ma_initialize_data_file(MARIA_SHARE *share, File dfile);
+int _ma_update_state_lsns(MARIA_SHARE *share,
+ LSN lsn, TrID create_trid, my_bool do_sync,
+ my_bool update_create_rename_lsn);
+int _ma_update_state_lsns_sub(MARIA_SHARE *share, LSN lsn,
+ TrID create_trid, my_bool do_sync,
+ my_bool update_create_rename_lsn);
+void _ma_set_data_pagecache_callbacks(PAGECACHE_FILE *file,
+ MARIA_SHARE *share);
+void _ma_set_index_pagecache_callbacks(PAGECACHE_FILE *file,
+ MARIA_SHARE *share);
+void _ma_tmp_disable_logging_for_table(MARIA_HA *info,
+ my_bool log_incomplete);
+my_bool _ma_reenable_logging_for_table(MARIA_HA *info, my_bool flush_pages);
+my_bool write_log_record_for_bulk_insert(MARIA_HA *info);
+void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn);
+
+#define MARIA_NO_CRC_NORMAL_PAGE 0xffffffff
+#define MARIA_NO_CRC_BITMAP_PAGE 0xfffffffe
+extern my_bool maria_page_crc_set_index(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar *data_ptr);
+extern my_bool maria_page_crc_set_normal(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar *data_ptr);
+extern my_bool maria_page_crc_check_bitmap(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar *data_ptr);
+extern my_bool maria_page_crc_check_data(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar *data_ptr);
+extern my_bool maria_page_crc_check_index(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar *data_ptr);
+extern my_bool maria_page_crc_check_none(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar *data_ptr);
+extern my_bool maria_page_filler_set_bitmap(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar *data_ptr);
+extern my_bool maria_page_filler_set_normal(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar *data_ptr);
+extern my_bool maria_page_filler_set_none(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar *data_ptr);
+extern void maria_page_write_failure(uchar* data_ptr);
+extern my_bool maria_flush_log_for_page(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar *data_ptr);
+extern my_bool maria_flush_log_for_page_none(uchar *page,
+ pgcache_page_no_t page_no,
+ uchar *data_ptr);
+extern PAGECACHE *maria_log_pagecache;
+extern void ma_set_index_cond_func(MARIA_HA *info, index_cond_func_t func,
+ void *func_arg);
+int ma_check_index_cond(register MARIA_HA *info, uint keynr, uchar *record);
diff --git a/storage/maria/maria_ftdump.c b/storage/maria/maria_ftdump.c
new file mode 100644
index 00000000000..870d07fa96e
--- /dev/null
+++ b/storage/maria/maria_ftdump.c
@@ -0,0 +1,282 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code
+ added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */
+
+#include "ma_ftdefs.h"
+#include <my_getopt.h>
+
+static void usage();
+static void complain(int val);
+static my_bool get_one_option(int, const struct my_option *, char *);
+
+static int count=0, stats=0, dump=0, lstats=0;
+static my_bool verbose;
+static char *query=NULL;
+static uint lengths[256];
+
+#define MAX_LEN (HA_FT_MAXBYTELEN+10)
+#define HOW_OFTEN_TO_WRITE 10000
+
+/*
+  Command-line options for aria_ft_dump.  The two "help" entries are
+  deliberate synonyms: -h and -? share the same long option name.
+*/
+static struct my_option my_long_options[] =
+{
+  {"help", 'h', "Display help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"help", '?', "Synonym for -h.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"count", 'c', "Calculate per-word stats (counts and global weights).",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"dump", 'd', "Dump index (incl. data offsets and word weights).",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"length", 'l', "Report length distribution.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"stats", 's', "Report global stats.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"verbose", 'v', "Be verbose.",
+   &verbose, &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+/*
+  Entry point of aria_ft_dump: scan a FULLTEXT index of an Aria table
+  and print a dump (-d), per-word stats (-c), global stats (-s) and/or
+  a word-length histogram (-l).  After option stripping, argv[0] is the
+  table name and argv[1] the index number.
+  NOTE(review): always returns 0, even after errors — the exit status
+  never reflects failure; confirm whether callers depend on that.
+*/
+int main(int argc,char *argv[])
+{
+  int error=0;
+  uint keylen, keylen2=0, inx, doc_cnt=0;
+  float weight= 1.0;
+  double gws, min_gws=0, avg_gws=0;
+  MARIA_HA *info;
+  char buf[MAX_LEN], buf2[MAX_LEN], buf_maxlen[MAX_LEN], buf_min_gws[MAX_LEN];
+  ulong total=0, maxlen=0, uniq=0, max_doc_cnt=0;
+  struct { MARIA_HA *info; } aio0, *aio=&aio0; /* for GWS_IN_USE */
+
+  MY_INIT(argv[0]);
+  if ((error= handle_options(&argc, &argv, my_long_options, get_one_option)))
+    exit(error);
+  maria_init();
+  /* -c/-d output is machine oriented; suppress progress chatter. */
+  if (count || dump)
+    verbose=0;
+  /* Default action when nothing specific was requested: global stats. */
+  if (!count && !dump && !lstats && !query)
+    stats=1;
+
+  if (verbose)
+    setbuf(stdout,NULL);        /* Unbuffered, so progress shows immediately */
+
+  if (argc < 2)
+    usage();
+
+  {
+    char *end;
+    inx= (uint) strtoll(argv[1], &end, 10);
+    if (*end)
+      usage();                  /* Index argument must be fully numeric */
+  }
+
+  init_pagecache(maria_pagecache, USE_BUFFER_INIT, 0, 0,
+                 MARIA_KEY_BLOCK_LENGTH, MY_WME);
+
+  if (!(info=maria_open(argv[0], O_RDONLY,
+                        HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER)))
+  {
+    error=my_errno;
+    goto err;
+  }
+
+  *buf2=0;
+  aio->info=info;
+
+  if ((inx >= info->s->base.keys) ||
+      !(info->s->keyinfo[inx].flag & HA_FULLTEXT))
+  {
+    printf("Key %d in table %s is not a FULLTEXT key\n", inx,
+           info->s->open_file_name.str);
+    goto err;
+  }
+
+  maria_lock_database(info, F_EXTRA_LCK);
+
+  info->cur_row.lastpos= HA_OFFSET_ERROR;
+  info->update|= HA_STATE_PREV_FOUND;
+
+  /* Walk the whole index in key order; each entry is one (word, row) pair. */
+  while (!(error=maria_rnext(info,NULL,inx)))
+  {
+    FT_WEIGTH subkeys;          /* sic: spelled "WEIGTH" in the FT headers */
+    keylen=*(info->lastkey_buff);       /* First key byte = word length */
+
+    /* subkeys.i >= 0: leaf with a float weight; < 0: -(number of docs). */
+    subkeys.i= ft_sintXkorr(info->lastkey_buff + keylen + 1);
+    if (subkeys.i >= 0)
+      weight= subkeys.f;
+
+#ifdef HAVE_SNPRINTF
+    snprintf(buf,MAX_LEN,"%.*s",(int) keylen,info->lastkey_buff+1);
+#else
+    sprintf(buf,"%.*s",(int) keylen,info->lastkey_buff+1);
+#endif
+    my_casedn_str(default_charset_info,buf);
+    total++;
+    lengths[keylen]++;
+
+    if (count || stats)
+    {
+      /* A new word started: flush the stats accumulated for the previous. */
+      if (strcmp(buf, buf2))
+      {
+        if (*buf2)
+        {
+          uniq++;
+          avg_gws+=gws=GWS_IN_USE;
+          if (count)
+            printf("%9u %20.7f %s\n",doc_cnt,gws,buf2);
+          if (maxlen<keylen2)
+          {
+            maxlen=keylen2;
+            strmov(buf_maxlen, buf2);
+          }
+          if (max_doc_cnt < doc_cnt)
+          {
+            max_doc_cnt=doc_cnt;
+            strmov(buf_min_gws, buf2);
+            min_gws=gws;
+          }
+        }
+        strmov(buf2, buf);
+        keylen2=keylen;
+        doc_cnt=0;
+      }
+      doc_cnt+= (subkeys.i >= 0 ? 1 : -subkeys.i);
+    }
+    if (dump)
+    {
+      if (subkeys.i >= 0)
+        printf("%9lx %20.7f %s\n", (long) info->cur_row.lastpos,weight,buf);
+      else
+        printf("%9lx => %17d %s\n",(long) info->cur_row.lastpos,-subkeys.i,
+               buf);
+    }
+    if (verbose && (total%HOW_OFTEN_TO_WRITE)==0)
+      printf("%10ld\r",total);
+  }
+  maria_lock_database(info, F_UNLCK);
+
+  /* Flush stats for the final word (the loop only flushes on word change). */
+  if (count || stats)
+  {
+    if (*buf2)
+    {
+      uniq++;
+      avg_gws+=gws=GWS_IN_USE;
+      if (count)
+        printf("%9u %20.7f %s\n",doc_cnt,gws,buf2);
+      if (maxlen<keylen2)
+      {
+        maxlen=keylen2;
+        strmov(buf_maxlen, buf2);
+      }
+      if (max_doc_cnt < doc_cnt)
+      {
+        max_doc_cnt=doc_cnt;
+        strmov(buf_min_gws, buf2);
+        min_gws=gws;
+      }
+    }
+  }
+
+  if (stats)
+  {
+    /* Median word length: smallest inx whose cumulative count reaches half. */
+    count=0;
+    for (inx=0;inx<256;inx++)
+    {
+      count+=lengths[inx];
+      if ((ulong) count >= total/2)
+        break;
+    }
+    /* NOTE(review): if the index was empty (uniq==0), buf_maxlen and
+       buf_min_gws are uninitialized here and avg_gws/uniq divides by
+       zero — worth guarding. */
+    printf("Total rows: %lu\nTotal words: %lu\n"
+           "Unique words: %lu\nLongest word: %lu chars (%s)\n"
+           "Median length: %u\n"
+           "Average global weight: %f\n"
+           "Most common word: %lu times, weight: %f (%s)\n",
+           (long) info->state->records, total, uniq, maxlen, buf_maxlen,
+           inx, avg_gws/uniq, max_doc_cnt, min_gws, buf_min_gws);
+  }
+  if (lstats)
+  {
+    /* Per-length histogram: count, percentage and cumulative percentage. */
+    count=0;
+    for (inx=0; inx<256; inx++)
+    {
+      count+=lengths[inx];
+      if (count && lengths[inx])
+        printf("%3u: %10lu %5.2f%% %20lu %4.1f%%\n", inx,
+               (ulong) lengths[inx],100.0*lengths[inx]/total,(ulong) count,
+               100.0*count/total);
+    }
+  }
+
+err:
+  if (error && error != HA_ERR_END_OF_FILE)
+    printf("got error %d\n",my_errno);
+  if (info)
+    maria_close(info);
+  maria_end();
+  return 0;
+}
+
+
+/*
+  my_getopt callback: set the action flag for each option and reject
+  mutually exclusive combinations via complain() (which exits).
+*/
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+               char *argument __attribute__((unused)))
+{
+  switch(optid) {
+  case 'd':
+    dump=1;
+    complain(count || query);
+    break;
+  case 's':
+    stats=1;
+    complain(query!=0);
+    break;
+  case 'c':
+    count= 1;
+    complain(dump || query);
+    break;
+  case 'l':
+    lstats=1;
+    complain(query!=0);
+    break;
+  case '?':
+  case 'h':
+    usage();                    /* usage() never returns (calls exit) */
+  }
+  return 0;
+}
+
+#include <help_start.h>
+
+/* Print usage and option help, then terminate with exit status 1. */
+static void usage()
+{
+  printf("Use: aria_ft_dump <table_name> <index_num>\n");
+  my_print_help(my_long_options);
+  my_print_variables(my_long_options);
+  NETWARE_SET_SCREEN_MODE(1);
+  exit(1);
+}
+
+#include <help_end.h>
+
+/* Abort with a message when mutually exclusive options were combined. */
+static void complain(int val) /* Kinda assert :-) */
+{
+  if (val)
+  {
+    printf("You cannot use these options together!\n");
+    exit(1);
+  }
+}
diff --git a/storage/maria/maria_pack.c b/storage/maria/maria_pack.c
new file mode 100644
index 00000000000..1d2d3995bd8
--- /dev/null
+++ b/storage/maria/maria_pack.c
@@ -0,0 +1,3234 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Pack MARIA file */
+
+#ifndef USE_MY_FUNC
+#define USE_MY_FUNC /* We need at least my_malloc */
+#endif
+
+#include "maria_def.h"
+#include <queues.h>
+#include <my_tree.h>
+#include "mysys_err.h"
+#ifdef MSDOS
+#include <io.h>
+#endif
+#ifndef __GNU_LIBRARY__
+#define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
+#endif
+#include <my_getopt.h>
+#include <assert.h>
+
+#if SIZEOF_LONG_LONG > 4
+#define BITS_SAVED 64
+#else
+#define BITS_SAVED 32
+#endif
+
+#define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */
+#define HEAD_LENGTH 32
+#define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
+
+#define DATA_TMP_EXT ".TMD"
+#define OLD_EXT ".OLD"
+#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
+
+/* Buffered output state for the compressed data file, including the
+   bit-packing state used by write_bits()/flush_bits(). */
+struct st_file_buffer {
+  File file;
+  uchar *buffer,*pos,*end;
+  my_off_t pos_in_file;
+  int bits;
+  ulonglong bitbucket;
+};
+
+struct st_huff_tree;
+struct st_huff_element;
+
+/* Per-column statistics collected while scanning the source table;
+   drives the choice of pack type and the Huffman tree for the column. */
+typedef struct st_huff_counts {
+  uint field_length,max_zero_fill;
+  uint pack_type;
+  uint max_end_space,max_pre_space,length_bits,min_space;
+  ulong max_length;
+  enum en_fieldtype field_type;
+  struct st_huff_tree *tree;            /* Tree for field */
+  my_off_t counts[256];                 /* Frequency of each byte value */
+  my_off_t end_space[8];
+  my_off_t pre_space[8];
+  my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
+  TREE int_tree;        /* Tree for detecting distinct column values. */
+  uchar *tree_buff;     /* Column values, 'field_length' each. */
+  uchar *tree_pos;      /* Points to end of column values in 'tree_buff'. */
+} HUFF_COUNTS;
+
+typedef struct st_huff_element HUFF_ELEMENT;
+
+/*
+  One node of a Huffman tree; either an internal node ('nod', two
+  children) or a leaf ('leaf').
+  WARNING: It is crucial for the optimizations in calc_packed_length()
+  that 'count' is the first element of 'HUFF_ELEMENT'.
+*/
+struct st_huff_element {
+  my_off_t count;
+  union un_element {
+    struct st_nod {
+      HUFF_ELEMENT *left,*right;
+    } nod;
+    struct st_leaf {
+      HUFF_ELEMENT *null;
+      uint element_nr;          /* Number of element */
+    } leaf;
+  } a;
+};
+
+
+/* A complete Huffman tree; several columns may share one tree
+   (see join_same_trees()). */
+typedef struct st_huff_tree {
+  HUFF_ELEMENT *root,*element_buffer;
+  HUFF_COUNTS *counts;
+  uint tree_number;
+  uint elements;
+  my_off_t bytes_packed;
+  uint tree_pack_length;
+  uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
+  ulonglong *code;
+  uchar *code_len;
+}  HUFF_TREE;
+
+
+/* State for reading one or more source tables as a single record stream
+   (used both for single-table packing and for --join). */
+typedef struct st_isam_mrg {
+  MARIA_HA **file,**current,**end;
+  uint free_file;       /* Set when the 'file' array was malloc'ed here */
+  uint count;
+  uint min_pack_length; /* Theese is used by packed data */
+  uint max_pack_length;
+  uint ref_length;
+  uint max_blob_length;
+  my_off_t records;
+  /* true if at least one source file has at least one disabled index */
+  my_bool src_file_has_indexes_disabled;
+} PACK_MRG_INFO;
+
+
+extern int main(int argc,char * *argv);
+static void get_options(int *argc,char ***argv);
+static MARIA_HA *open_maria_file(char *name,int mode);
+static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count);
+static int compress(PACK_MRG_INFO *file,char *join_name);
+static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
+static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
+ uint trees,
+ HUFF_COUNTS *huff_counts,
+ uint fields);
+static int compare_tree(void* cmp_arg __attribute__((unused)),
+ const uchar *s,const uchar *t);
+static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
+static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
+ my_off_t records);
+static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
+ uint max_space_length,my_off_t *space_counts,
+ my_off_t tot_space_count,
+ enum en_fieldtype field_type);
+static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
+static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
+static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
+static int save_counts_in_queue(uchar *key,element_count count,
+ HUFF_TREE *tree);
+static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
+static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
+static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
+static void make_traverse_code_tree(HUFF_TREE *huff_tree,
+ HUFF_ELEMENT *element,uint size,
+ ulonglong code);
+static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
+ my_off_t tot_elements,my_off_t filelength);
+static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
+static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
+static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
+ HUFF_ELEMENT *element,
+ uint *offset);
+static uint max_bit(uint value);
+static int compress_maria_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
+static char *make_new_name(char *new_name,char *old_name);
+static char *make_old_name(char *new_name,char *old_name);
+static void init_file_buffer(File file,pbool read_buffer);
+static int flush_buffer(ulong neaded_length);
+static void end_file_buffer(void);
+static void write_bits(ulonglong value, uint bits);
+static void flush_bits(void);
+static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
+ my_off_t new_length, ha_checksum crc);
+static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,
+ my_off_t new_length, ha_checksum crc);
+static int mrg_close(PACK_MRG_INFO *mrg);
+static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
+static void mrg_reset(PACK_MRG_INFO *mrg);
+#if !defined(DBUG_OFF)
+static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
+static int fakecmp(my_off_t **count1, my_off_t **count2);
+#endif
+
+
+/* Global option flags and shared state of aria_pack. */
+static int error_on_write=0,test_only=0,verbose=0,silent=0,
+           write_loop=0,force_pack=0, isamchk_neaded=0;
+static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
+static my_bool backup, opt_wait;
+/*
+  tree_buff_length is somewhat arbitrary. The bigger it is the better
+  the chance to win in terms of compression factor. On the other hand,
+  this table becomes part of the compressed file header. And its length
+  is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
+*/
+static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
+static char tmp_dir[FN_REFLEN]={0},*join_table;
+static my_off_t intervall_length;
+static ha_checksum glob_crc;
+static struct st_file_buffer file_buffer;
+static QUEUE queue;             /* Priority queue for building Huffman trees */
+static HUFF_COUNTS *global_count;
+static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+static const char *load_default_groups[]= { "ariapack",0 };
+
+ /* The main program */
+
+/*
+  Entry point of aria_pack: compress each table given on the command
+  line, or (with --join) merge several identical tables into one new
+  compressed table.  Exits with status 2 on any error, 0 otherwise.
+*/
+int main(int argc, char **argv)
+{
+  int error,ok;
+  PACK_MRG_INFO merge;
+  char **default_argv;
+  MY_INIT(argv[0]);
+
+  load_defaults("my",load_default_groups,&argc,&argv);
+  default_argv= argv;           /* Remembered for free_defaults() below */
+  get_options(&argc,&argv);
+  maria_init();
+
+  error=ok=isamchk_neaded=0;
+  if (join_table)
+  {                             /* Join files into one */
+    if (open_maria_files(&merge,argv,(uint) argc) ||
+        compress(&merge,join_table))
+      error=1;
+  }
+  else while (argc--)
+  {
+    MARIA_HA *isam_file;
+    if (!(isam_file=open_maria_file(*argv++,O_RDWR)))
+      error=1;
+    else
+    {
+      /* Single-table mode: wrap the one handle in a PACK_MRG_INFO. */
+      merge.file= &isam_file;
+      merge.current=0;
+      merge.free_file=0;
+      merge.count=1;
+      if (compress(&merge,0))
+        error=1;
+      else
+        ok=1;
+    }
+  }
+  if (ok && isamchk_neaded && !silent)
+    puts("Remember to run aria_chk -rq on compressed tables");
+  VOID(fflush(stdout));
+  VOID(fflush(stderr));
+  free_defaults(default_argv);
+  maria_end();
+  my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
+  exit(error ? 2 : 0);
+#ifndef _lint
+  return 0;                     /* No compiler warning */
+#endif
+}
+
+/* Long-only option ids (values above the printable-character range). */
+enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
+
+/* Command-line options for aria_pack. */
+static struct my_option my_long_options[] =
+{
+#ifdef __NETWARE__
+  {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"backup", 'b', "Make a backup of the table as table_name.OLD.",
+   &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"character-sets-dir", OPT_CHARSETS_DIR_MP,
+   "Directory where character sets are.", (char**) &charsets_dir,
+   (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
+   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+  {"force", 'f',
+   "Force packing of table even if it gets bigger or if tempfile exists.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"join", 'j',
+   "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
+   &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
+   0, 0, 0},
+  {"help", '?', "Display this help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"silent", 's', "Be more silent.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"tmpdir", 'T', "Use temporary directory to store temporary table.",
+   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"test", 't', "Don't pack table, only test packing it.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"version", 'V', "Output version information and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
+   &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+#include <help_start.h>
+
+/* Print tool name, version and platform (also called from usage()). */
+static void print_version(void)
+{
+  VOID(printf("%s Ver 1.0 for %s on %s\n",
+              my_progname, SYSTEM_TYPE, MACHINE_TYPE));
+  NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+/* Print version, license blurb, usage and option help.  Unlike the
+   ft_dump variant this does not exit; callers exit themselves. */
+static void usage(void)
+{
+  print_version();
+  puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.");
+  puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
+  puts("and you are welcome to modify and redistribute it under the GPL license\n");
+
+  puts("Pack a Aria-table to take much less space.");
+  puts("Keys are not updated, you must run aria_chk -rq on the index (.MAI) file");
+  puts("afterwards to update the keys.");
+  puts("You should give the .MAI file as the filename argument.");
+  puts("To unpack a packed table, run aria_chk -u on the table");
+
+  VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname));
+  my_print_help(my_long_options);
+  print_defaults("my", load_default_groups);
+  my_print_variables(my_long_options);
+}
+
+#include <help_end.h>
+
+/*
+  my_getopt callback for aria_pack.  Most cases just set a global flag;
+  -T additionally normalizes the directory name, and -V/-?/-I print
+  information and exit.
+*/
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+               char *argument)
+{
+  uint length;
+
+  switch(optid) {
+#ifdef __NETWARE__
+  case OPT_AUTO_CLOSE:
+    setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
+    break;
+#endif
+  case 'f':
+    force_pack= 1;
+    tmpfile_createflag= O_RDWR | O_TRUNC;       /* Allow existing tempfile */
+    break;
+  case 's':
+    write_loop= verbose= 0;
+    silent= 1;
+    break;
+  case 't':
+    test_only= 1;
+    /* Avoid to reset 'verbose' if it was already set > 1. */
+    if (! verbose)
+      verbose= 1;
+    break;
+  case 'T':
+    /* Make sure tmp_dir ends with a directory separator. */
+    length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
+    if (length != dirname_length(tmp_dir))
+    {
+      tmp_dir[length]=FN_LIBCHAR;
+      tmp_dir[length+1]=0;
+    }
+    break;
+  case 'v':
+    verbose++;                  /* Allow for selecting the level of verbosity. */
+    silent= 0;
+    break;
+  case '#':
+    DBUG_PUSH(argument ? argument : "d:t:o,/tmp/aria_pack.trace");
+    break;
+  case 'V':
+    print_version();
+    exit(0);
+  case 'I':     /* NOTE(review): no 'I' option exists in my_long_options */
+  case '?':
+    usage();
+    exit(0);
+  }
+  return 0;
+}
+
+ /* reads options */
+ /* Initiates DEBUG - but no debugging here ! */
+
+/*
+  Parse command-line options; exits on option errors or when no table
+  argument remains.  With --join, backup and tmpdir are pointless for
+  the merged result and are reset.
+*/
+static void get_options(int *argc,char ***argv)
+{
+  int ho_error;
+
+  my_progname= argv[0][0];
+  if (isatty(fileno(stdout)))
+    write_loop=1;               /* Show progress only on a terminal */
+
+  if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
+    exit(ho_error);
+
+  if (!*argc)
+  {
+    usage();
+    exit(1);
+  }
+  if (join_table)
+  {
+    backup=0;                   /* Not needed */
+    tmp_dir[0]=0;
+  }
+  return;
+}
+
+
+/*
+  Open one Aria table for packing.  Refuses tables that are already
+  compressed (unless --force or --join) and tables too small to be
+  worth compressing.  On success the table is write-locked and its
+  handle returned; on failure NULL is returned after a message on
+  stderr.
+*/
+static MARIA_HA *open_maria_file(char *name,int mode)
+{
+  MARIA_HA *isam_file;
+  MARIA_SHARE *share;
+  DBUG_ENTER("open_maria_file");
+
+  if (!(isam_file=maria_open(name, mode, HA_OPEN_IGNORE_MOVED_STATE |
+                             (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
+                              HA_OPEN_ABORT_IF_LOCKED))))
+  {
+    VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno));
+    DBUG_RETURN(0);
+  }
+  share=isam_file->s;
+  if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
+  {
+    if (!force_pack)
+    {
+      VOID(fprintf(stderr, "%s is already compressed\n", name));
+      VOID(maria_close(isam_file));
+      DBUG_RETURN(0);
+    }
+    if (verbose)
+      puts("Recompressing already compressed table");
+    share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
+  }
+  /* Tables with <= 1 row or under 1KB of data gain nothing from packing. */
+  if (! force_pack && share->state.state.records != 0 &&
+      (share->state.state.records <= 1 ||
+       share->state.state.data_file_length < 1024))
+  {
+    VOID(fprintf(stderr, "%s is too small to compress\n", name));
+    VOID(maria_close(isam_file));
+    DBUG_RETURN(0);
+  }
+  VOID(maria_lock_database(isam_file,F_WRLCK));
+  maria_ignore_trids(isam_file);
+  DBUG_RETURN(isam_file);
+}
+
+
+/*
+  Open all source tables for --join and verify that they have identical
+  row layouts (same reclength, field count and per-column type/length).
+  Returns 0 on success.  On failure every table opened so far is closed,
+  the file array is freed and 1 is returned.
+*/
+static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count)
+{
+  uint i,j;
+  mrg->count=0;
+  mrg->current=0;
+  mrg->file=(MARIA_HA**) my_malloc(sizeof(MARIA_HA*)*count,MYF(MY_FAE));
+  mrg->free_file=1;
+  mrg->src_file_has_indexes_disabled= 0;
+  for (i=0; i < count ; i++)
+  {
+    if (!(mrg->file[i]=open_maria_file(names[i],O_RDONLY)))
+      goto error;
+
+    /* Remember whether any source table has inactive (disabled) keys. */
+    mrg->src_file_has_indexes_disabled|=
+      ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map,
+                                 mrg->file[i]->s->base.keys);
+  }
+  /* Check that files are identical */
+  for (j=0 ; j < count-1 ; j++)
+  {
+    MARIA_COLUMNDEF *m1,*m2,*end;
+    if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
+        mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
+      goto diff_file;
+    m1=mrg->file[j]->s->columndef;
+    end=m1+mrg->file[j]->s->base.fields;
+    m2=mrg->file[j+1]->s->columndef;
+    for ( ; m1 != end ; m1++,m2++)
+    {
+      if (m1->type != m2->type || m1->length != m2->length)
+        goto diff_file;
+    }
+  }
+  mrg->count=count;
+  return 0;
+
+ diff_file:
+  VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
+               my_progname, names[j], names[j+1]));
+ error:
+  while (i--)                   /* Close every table opened so far */
+    maria_close(mrg->file[i]);
+  my_free(mrg->file, MYF(0));
+  return 1;
+}
+
+
+static int compress(PACK_MRG_INFO *mrg,char *result_table)
+{
+ int error;
+ File new_file,join_maria_file;
+ MARIA_HA *isam_file;
+ MARIA_SHARE *share;
+ char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
+ uint i,header_length,fields,trees,used_trees;
+ my_off_t old_length,new_length,tot_elements;
+ HUFF_COUNTS *huff_counts;
+ HUFF_TREE *huff_trees;
+ DBUG_ENTER("compress");
+
+ isam_file=mrg->file[0]; /* Take this as an example */
+ share=isam_file->s;
+ new_file=join_maria_file= -1;
+ trees=fields=0;
+ huff_trees=0;
+ huff_counts=0;
+ maria_block_size= isam_file->s->block_size;
+
+ /* Create temporary or join file */
+ if (backup)
+ VOID(fn_format(org_name,isam_file->s->open_file_name.str,
+ "",MARIA_NAME_DEXT, 2));
+ else
+ VOID(fn_format(org_name,isam_file->s->open_file_name.str,
+ "",MARIA_NAME_DEXT, 2+4+16));
+
+ if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0,
+ maria_block_size, MY_WME) == 0)
+ {
+ fprintf(stderr, "Can't initialize page cache\n");
+ goto err;
+ }
+
+ if (!test_only && result_table)
+ {
+ /* Make a new indexfile based on first file in list */
+ uint length;
+ uchar *buff;
+ strmov(org_name,result_table); /* Fix error messages */
+ VOID(fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2));
+ if ((join_maria_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
+ < 0)
+ goto err;
+ length=(uint) share->base.keystart;
+ if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
+ goto err;
+ if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
+ my_write(join_maria_file,buff,length,
+ MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
+ {
+ my_free(buff,MYF(0));
+ goto err;
+ }
+ my_free(buff,MYF(0));
+ VOID(fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2));
+ }
+ else if (!tmp_dir[0])
+ VOID(make_new_name(new_name,org_name));
+ else
+ VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
+ if (!test_only &&
+ (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
+ goto err;
+
+ /* Start calculating statistics */
+
+ mrg->records=0;
+ for (i=0 ; i < mrg->count ; i++)
+ mrg->records+=mrg->file[i]->s->state.state.records;
+
+ DBUG_PRINT("info", ("Compressing %s: (%lu records)",
+ result_table ? new_name : org_name,
+ (ulong) mrg->records));
+ if (write_loop || verbose)
+ {
+ VOID(printf("Compressing %s: (%lu records)\n",
+ result_table ? new_name : org_name, (ulong) mrg->records));
+ }
+ trees=fields=share->base.fields;
+ huff_counts=init_huff_count(isam_file,mrg->records);
+ QUICK_SAFEMALLOC;
+
+ /*
+ Read the whole data file(s) for statistics.
+ */
+ DBUG_PRINT("info", ("- Calculating statistics"));
+ if (write_loop || verbose)
+ VOID(printf("- Calculating statistics\n"));
+ if (get_statistic(mrg,huff_counts))
+ goto err;
+ NORMAL_SAFEMALLOC;
+ old_length=0;
+ for (i=0; i < mrg->count ; i++)
+ old_length+= (mrg->file[i]->s->state.state.data_file_length -
+ mrg->file[i]->s->state.state.empty);
+
+ /*
+ Create a global priority queue in preparation for making
+ temporary Huffman trees.
+ */
+ if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0))
+ goto err;
+
+ /*
+ Check each column if we should use pre-space-compress, end-space-
+ compress, empty-field-compress or zero-field-compress.
+ */
+ check_counts(huff_counts,fields,mrg->records);
+
+ /*
+ Build a Huffman tree for each column.
+ */
+ huff_trees=make_huff_trees(huff_counts,trees);
+
+ /*
+ If the packed lengths of combined columns is less then the sum of
+ the non-combined columns, then create common Huffman trees for them.
+ We do this only for uchar compressed columns, not for distinct values
+ compressed columns.
+ */
+ if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
+ goto err;
+
+ /*
+ Assign codes to all uchar or column values.
+ */
+ if (make_huff_decode_table(huff_trees,fields))
+ goto err;
+
+ /* Prepare a file buffer. */
+ init_file_buffer(new_file,0);
+
+ /*
+ Reserve space in the target file for the fixed compressed file header.
+ */
+ file_buffer.pos_in_file=HEAD_LENGTH;
+ if (! test_only)
+ VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
+
+ /*
+ Write field infos: field type, pack type, length bits, tree number.
+ */
+ write_field_info(huff_counts,fields,used_trees);
+
+ /*
+ Write decode trees.
+ */
+ if (!(tot_elements=write_huff_tree(huff_trees,trees)))
+ goto err;
+
+ /*
+ Calculate the total length of the compression info header.
+ This includes the fixed compressed file header, the column compression
+ type descriptions, and the decode trees.
+ */
+ header_length=(uint) file_buffer.pos_in_file+
+ (uint) (file_buffer.pos-file_buffer.buffer);
+
+ /*
+ Compress the source file into the target file.
+ */
+ DBUG_PRINT("info", ("- Compressing file"));
+ if (write_loop || verbose)
+ VOID(printf("- Compressing file\n"));
+ error=compress_maria_file(mrg,huff_counts);
+ new_length=file_buffer.pos_in_file;
+ if (!error && !test_only)
+ {
+ uchar buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */
+ bzero(buff,sizeof(buff));
+ error=my_write(file_buffer.file,buff,sizeof(buff),
+ MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
+ }
+
+ /*
+ Write the fixed compressed file header.
+ */
+ if (!error)
+ error=write_header(mrg,header_length,used_trees,tot_elements,
+ new_length);
+
+ /* Flush the file buffer. */
+ end_file_buffer();
+
+ /* Display statistics. */
+ DBUG_PRINT("info", ("Min record length: %6d Max length: %6d "
+ "Mean total length: %6ld",
+ mrg->min_pack_length, mrg->max_pack_length,
+ (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
+ if (verbose && mrg->records)
+ VOID(printf("Min record length: %6d Max length: %6d "
+ "Mean total length: %6ld\n", mrg->min_pack_length,
+ mrg->max_pack_length, (ulong) (new_length/mrg->records)));
+
+ /* Close source and target file. */
+ if (!test_only)
+ {
+ error|=my_close(new_file,MYF(MY_WME));
+ if (!result_table)
+ {
+ error|=my_close(isam_file->dfile.file, MYF(MY_WME));
+ isam_file->dfile.file= -1; /* Tell maria_close file is closed */
+ isam_file->s->bitmap.file.file= -1;
+ }
+ }
+
+ /* Cleanup. */
+ free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
+ if (! test_only && ! error)
+ {
+ if (result_table)
+ {
+ error=save_state_mrg(join_maria_file,mrg,new_length,glob_crc);
+ }
+ else
+ {
+ if (backup)
+ {
+ if (my_rename(org_name,make_old_name(temp_name,
+ isam_file->s->open_file_name.str),
+ MYF(MY_WME)))
+ error=1;
+ else
+ {
+ if (tmp_dir[0])
+ error=my_copy(new_name,org_name,MYF(MY_WME));
+ else
+ error=my_rename(new_name,org_name,MYF(MY_WME));
+ if (!error)
+ {
+ VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
+ if (tmp_dir[0])
+ VOID(my_delete(new_name,MYF(MY_WME)));
+ }
+ }
+ }
+ else
+ {
+ if (tmp_dir[0])
+ {
+ error=my_copy(new_name,org_name,
+ MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
+ if (!error)
+ VOID(my_delete(new_name,MYF(MY_WME)));
+ }
+ else
+ error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
+ }
+ if (! error)
+ error=save_state(isam_file,mrg,new_length,glob_crc);
+ }
+ }
+ error|=mrg_close(mrg);
+ if (join_maria_file >= 0)
+ error|=my_close(join_maria_file,MYF(MY_WME));
+ if (error)
+ {
+ VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
+ VOID(my_delete(new_name,MYF(MY_WME)));
+ DBUG_RETURN(-1);
+ }
+ if (write_loop || verbose)
+ {
+ if (old_length)
+ VOID(printf("%.4g%% \n",
+ (((longlong) (old_length - new_length)) * 100.0 /
+ (longlong) old_length)));
+ else
+ puts("Empty file saved in compressed format");
+ }
+ DBUG_RETURN(0);
+
+ err:
+ end_pagecache(maria_pagecache, 1);
+ free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
+ if (new_file >= 0)
+ VOID(my_close(new_file,MYF(0)));
+ if (join_maria_file >= 0)
+ VOID(my_close(join_maria_file,MYF(0)));
+ mrg_close(mrg);
+ VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
+ DBUG_RETURN(-1);
+}
+
+ /* Init a huff_count-struct for each field and init it */
+
+static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
+{
+ reg2 uint i;
+ reg1 HUFF_COUNTS *count;
+ if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
+ sizeof(HUFF_COUNTS),
+ MYF(MY_ZEROFILL | MY_WME))))
+ {
+ for (i=0 ; i < info->s->base.fields ; i++)
+ {
+ enum en_fieldtype type;
+ count[i].field_length=info->s->columndef[i].length;
+ type= count[i].field_type= (enum en_fieldtype) info->s->columndef[i].type;
+ if (type == FIELD_INTERVALL ||
+ type == FIELD_CONSTANT ||
+ type == FIELD_ZERO)
+ type = FIELD_NORMAL;
+ if (count[i].field_length <= 8 &&
+ (type == FIELD_NORMAL ||
+ type == FIELD_SKIP_ZERO))
+ count[i].max_zero_fill= count[i].field_length;
+ /*
+ For every column initialize a tree, which is used to detect distinct
+ column values. 'int_tree' works together with 'tree_buff' and
+ 'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
+ This is accomplished by '-1' as the element size.
+ */
+ init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
+ NULL);
+ if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
+ count[i].tree_pos=count[i].tree_buff =
+ my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
+ MYF(MY_WME));
+ }
+ }
+ return count;
+}
+
+
+ /* Free memory used by counts and trees */
+
+static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
+ HUFF_COUNTS *huff_counts,
+ uint fields)
+{
+ register uint i;
+
+ if (huff_trees)
+ {
+ for (i=0 ; i < trees ; i++)
+ {
+ if (huff_trees[i].element_buffer)
+ my_free(huff_trees[i].element_buffer,MYF(0));
+ if (huff_trees[i].code)
+ my_free(huff_trees[i].code,MYF(0));
+ }
+ my_free(huff_trees,MYF(0));
+ }
+ if (huff_counts)
+ {
+ for (i=0 ; i < fields ; i++)
+ {
+ if (huff_counts[i].tree_buff)
+ {
+ my_free(huff_counts[i].tree_buff,MYF(0));
+ delete_tree(&huff_counts[i].int_tree);
+ }
+ }
+ my_free(huff_counts, MYF(0));
+ }
+ delete_queue(&queue); /* This is safe to free */
+ return;
+}
+
+ /* Read through old file and gather some statistics */
+
/*
  Read through the source table(s) and gather per-column statistics.

  SYNOPSIS
    get_statistic()
    mrg          Source table(s) to scan.
    huff_counts  Array of counter structs, one per column; filled here.

  DESCRIPTION
    Scans every record once.  Per column it counts the incidence of
    every uchar value, leading/trailing spaces, empty and all-zero
    fields, and collects distinct column values in 'int_tree'/'tree_buff'
    as long as they fit.  Also accumulates 'glob_crc' (checksum over all
    records) and stores the record count and the maximum total blob
    length per record into 'mrg'.

  RETURN
    0    ok (scan ended with HA_ERR_END_OF_FILE)
    1    a read error other than end-of-file occurred
*/

static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
{
  int error;
  uint length, null_bytes;
  ulong reclength,max_blob_length;
  uchar *record,*pos,*next_pos,*end_pos,*start_pos;
  ha_rows record_count;
  HUFF_COUNTS *count,*end_count;
  TREE_ELEMENT *element;
  ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *);
  DBUG_ENTER("get_statistic");

  reclength= mrg->file[0]->s->base.reclength;
  null_bytes= mrg->file[0]->s->base.null_bytes;
  /* Buffer for one unpacked record; released with my_afree() below. */
  record=(uchar*) my_alloca(reclength);
  end_count=huff_counts+mrg->file[0]->s->base.fields;
  record_count=0; glob_crc=0;
  max_blob_length=0;

  /* Check how to calculate checksum */
  if (mrg->file[0]->s->data_file_type == STATIC_RECORD)
    calc_checksum= _ma_static_checksum;
  else
    calc_checksum= _ma_checksum;

  mrg_reset(mrg);
  while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
  {
    ulong tot_blob_length=0;
    if (! error)
    {
      /* glob_crc is a checksum over all bytes of all records. */
      glob_crc+= (*calc_checksum)(mrg->file[0],record);

      /* Count the incidence of values separately for every column. */
      for (pos=record + null_bytes, count=huff_counts ;
           count < end_count ;
           count++,
           pos=next_pos)
      {
        next_pos=end_pos=(start_pos=pos)+count->field_length;

        /*
          Put the whole column value in a tree if there is room for it.
          'int_tree' is used to quickly check for duplicate values.
          'tree_buff' collects as many distinct column values as
          possible. If the field length is > 1, it is tree_buff_length,
          else 2 bytes. Each value is 'field_length' bytes big. If there
          are more distinct column values than fit into the buffer, we
          give up with this tree. BLOBs and VARCHARs do not have a
          tree_buff as it can only be used with fixed length columns.
          For the special case of field length == 1, we handle only the
          case that there is only one distinct value in the table(s).
          Otherwise, we can have a maximum of 256 distinct values. This
          is then handled by the normal Huffman tree build.

          Another limit for collecting distinct column values is the
          number of values itself. Since we would need to build a
          Huffman tree for the values, we are limited by the 'IS_OFFSET'
          constant. This constant expresses a bit which is used to
          determine if a tree element holds a final value or an offset
          to a child element. Hence, all values and offsets need to be
          smaller than 'IS_OFFSET'. A tree element is implemented with
          two integer values, one for the left branch and one for the
          right branch. For the extreme case that the first element
          points to the last element, the number of integers in the tree
          must be less or equal to IS_OFFSET. So the number of elements
          must be less or equal to IS_OFFSET / 2.

          WARNING: At first, we insert a pointer into the record buffer
          as the key for the tree. If we got a new distinct value, which
          is really inserted into the tree, instead of being counted
          only, we will copy the column value from the record buffer to
          'tree_buff' and adjust the key pointer of the tree accordingly.
        */
        if (count->tree_buff)
        {
          /* compare_tree() reads the field length via this global. */
          global_count=count;
          if (!(element=tree_insert(&count->int_tree,pos, 0,
                                    count->int_tree.custom_arg)) ||
              (element->count == 1 &&
               (count->tree_buff + tree_buff_length <
                count->tree_pos + count->field_length)) ||
              (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
              (count->field_length == 1 &&
               count->int_tree.elements_in_tree > 1))
          {
            /* Too many distinct values: give up on interval packing. */
            delete_tree(&count->int_tree);
            my_free(count->tree_buff,MYF(0));
            count->tree_buff=0;
          }
          else
          {
            /*
              If tree_insert() succeeds, it either creates a new element
              or increments the counter of an existing element.
            */
            if (element->count == 1)
            {
              /* Copy the new column value into 'tree_buff'. */
              memcpy(count->tree_pos,pos,(size_t) count->field_length);
              /* Adjust the key pointer in the tree. */
              tree_set_pointer(element,count->tree_pos);
              /* Point behind the last column value so far. */
              count->tree_pos+=count->field_length;
            }
          }
        }

        /* Save character counters and space-counts and zero-field-counts */
        if (count->field_type == FIELD_NORMAL ||
            count->field_type == FIELD_SKIP_ENDSPACE)
        {
          /* Ignore trailing space. */
          for ( ; end_pos > pos ; end_pos--)
            if (end_pos[-1] != ' ')
              break;
          /* Empty fields are just counted. Go to the next record. */
          if (end_pos == pos)
          {
            count->empty_fields++;
            count->max_zero_fill=0;
            continue;
          }
          /*
            Count the total of all trailing spaces and the number of
            short trailing spaces. Remember the longest trailing space.
          */
          length= (uint) (next_pos-end_pos);
          count->tot_end_space+=length;
          if (length < 8)
            count->end_space[length]++;
          if (count->max_end_space < length)
            count->max_end_space = length;
        }

        if (count->field_type == FIELD_NORMAL ||
            count->field_type == FIELD_SKIP_PRESPACE)
        {
          /* Ignore leading space. */
          for (pos=start_pos; pos < end_pos ; pos++)
            if (pos[0] != ' ')
              break;
          /* Empty fields are just counted. Go to the next record. */
          if (end_pos == pos)
          {
            count->empty_fields++;
            count->max_zero_fill=0;
            continue;
          }
          /*
            Count the total of all leading spaces and the number of
            short leading spaces. Remember the longest leading space.
          */
          length= (uint) (pos-start_pos);
          count->tot_pre_space+=length;
          if (length < 8)
            count->pre_space[length]++;
          if (count->max_pre_space < length)
            count->max_pre_space = length;
        }

        /* Calculate pos, end_pos, and max_length for variable length fields. */
        if (count->field_type == FIELD_BLOB)
        {
          uint field_length=count->field_length -portable_sizeof_char_ptr;
          ulong blob_length= _ma_calc_blob_length(field_length, start_pos);
          memcpy_fixed((char*) &pos, start_pos+field_length,sizeof(char*));
          end_pos=pos+blob_length;
          tot_blob_length+=blob_length;
          set_if_bigger(count->max_length,blob_length);
        }
        else if (count->field_type == FIELD_VARCHAR)
        {
          uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
          length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
                   uint2korr(start_pos));
          pos= start_pos+pack_length;
          end_pos= pos+length;
          set_if_bigger(count->max_length,length);
        }

        /* Evaluate 'max_zero_fill' for short fields. */
        if (count->field_length <= 8 &&
            (count->field_type == FIELD_NORMAL ||
             count->field_type == FIELD_SKIP_ZERO))
        {
          uint i;
          /* Zero fields are just counted. Go to the next record. */
          if (!memcmp(start_pos, zero_string, count->field_length))
          {
            count->zero_fields++;
            continue;
          }
          /*
            max_zero_fill starts with field_length. It is decreased every
            time a shorter "zero trailer" is found. It is set to zero when
            an empty field is found (see above). This suggests that the
            variable should be called 'min_zero_fill'.
          */
          for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
               i++) ;
          if (i < count->max_zero_fill)
            count->max_zero_fill=i;
        }

        /* Ignore zero fields and check fields. */
        if (count->field_type == FIELD_ZERO ||
            count->field_type == FIELD_CHECK)
          continue;

        /*
          Count the incidence of every uchar value in the
          significant field value.
        */
        for ( ; pos < end_pos ; pos++)
          count->counts[(uchar) *pos]++;

        /* Step to next field. */
      }

      if (tot_blob_length > max_blob_length)
        max_blob_length=tot_blob_length;
      record_count++;
      if (write_loop && record_count % WRITE_COUNT == 0)
      {
        VOID(printf("%lu\r", (ulong) record_count));
        VOID(fflush(stdout));
      }
    }
    /* Deleted records are skipped silently; any other error aborts. */
    else if (error != HA_ERR_RECORD_DELETED)
    {
      VOID(fprintf(stderr, "Got error %d while reading rows\n", error));
      break;
    }

    /* Step to next record. */
  }
  if (write_loop)
  {
    VOID(printf(" \r"));
    VOID(fflush(stdout));
  }

  /*
    If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
    codes.
  */
  DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););

  DBUG_PRINT("info", ("Found the following number of incidents "
                      "of the uchar codes:"));
  if (verbose >= 2)
    VOID(printf("Found the following number of incidents "
                "of the uchar codes:\n"));
  for (count= huff_counts ; count < end_count; count++)
  {
    uint idx;
    my_off_t total_count;
    char llbuf[32];

    DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
    if (verbose >= 2)
      VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1)));
    if (count->tree_buff)
    {
      DBUG_PRINT("info", ("number of distinct values: %u",
                          (uint) ((count->tree_pos - count->tree_buff) /
                                  count->field_length)));
      if (verbose >= 2)
        VOID(printf("number of distinct values: %u\n",
                    (uint) ((count->tree_pos - count->tree_buff) /
                            count->field_length)));
    }
    total_count= 0;
    for (idx= 0; idx < 256; idx++)
    {
      if (count->counts[idx])
      {
        total_count+= count->counts[idx];
        DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
                            llstr((longlong) count->counts[idx], llbuf)));
        if (verbose >= 2)
          VOID(printf("counts[0x%02x]: %12s\n", idx,
                      llstr((longlong) count->counts[idx], llbuf)));
      }
    }
    DBUG_PRINT("info", ("total: %12s", llstr((longlong) total_count,
                                             llbuf)));
    if ((verbose >= 2) && total_count)
    {
      VOID(printf("total: %12s\n",
                  llstr((longlong) total_count, llbuf)));
    }
  }

  mrg->records=record_count;
  mrg->max_blob_length=max_blob_length;
  my_afree(record);
  DBUG_RETURN(error != HA_ERR_END_OF_FILE);
}
+
+static int compare_huff_elements(void *not_used __attribute__((unused)),
+ uchar *a, uchar *b)
+{
+ return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
+ (*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1);
+}
+
+ /* Check each tree if we should use pre-space-compress, end-space-
+ compress, empty-field-compress or zero-field-compress */
+
/*
  Decide the packing method for every column.

  SYNOPSIS
    check_counts()
    huff_counts  One counter struct per column (filled by get_statistic()).
    trees        Number of columns (initially one tree per column).
    records      Number of records read.

  DESCRIPTION
    Based on the collected statistics, chooses for each column one of:
    pre-space-compress, end-space-compress, empty-field-compress,
    zero-field-compress, interval (distinct-value) or constant packing,
    and computes the expected packed size in 'bytes_packed'.  Mutates
    the counts in place while probing alternatives and restores them
    when an alternative is rejected.
*/

static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
                         my_off_t records)
{
  uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
  my_off_t old_length,new_length,length;
  DBUG_ENTER("check_counts");

  bzero((uchar*) field_count,sizeof(field_count));
  space_fields=fill_zero_fields=0;

  for (; trees-- ; huff_counts++)
  {
    /* BLOB/VARCHAR only need length bits; CHECK fields are not packed. */
    if (huff_counts->field_type == FIELD_BLOB)
    {
      huff_counts->length_bits=max_bit(huff_counts->max_length);
      goto found_pack;
    }
    else if (huff_counts->field_type == FIELD_VARCHAR)
    {
      huff_counts->length_bits=max_bit(huff_counts->max_length);
      goto found_pack;
    }
    else if (huff_counts->field_type == FIELD_CHECK)
    {
      huff_counts->bytes_packed=0;
      huff_counts->counts[0]=0;
      goto found_pack;
    }

    huff_counts->field_type=FIELD_NORMAL;
    huff_counts->pack_type=0;

    /* Check for zero-filled records (in this column), or zero records. */
    if (huff_counts->zero_fields || ! records)
    {
      my_off_t old_space_count;
      /*
        If there are only zero filled records (in this column),
        or no records at all, we are done.
      */
      if (huff_counts->zero_fields == records)
      {
        huff_counts->field_type= FIELD_ZERO;
        huff_counts->bytes_packed=0;
        huff_counts->counts[0]=0;
        goto found_pack;
      }
      /* Remember the number of significant spaces. */
      old_space_count=huff_counts->counts[' '];
      /* Add all leading and trailing spaces. */
      huff_counts->counts[' ']+= (huff_counts->tot_end_space +
                                  huff_counts->tot_pre_space +
                                  huff_counts->empty_fields *
                                  huff_counts->field_length);
      /* Check, what the compressed length of this would be. */
      old_length=calc_packed_length(huff_counts,0)+records/8;
      /* Get the number of zero bytes. */
      length=huff_counts->zero_fields*huff_counts->field_length;
      /* Add it to the counts. */
      huff_counts->counts[0]+=length;
      /* Check, what the compressed length of this would be. */
      new_length=calc_packed_length(huff_counts,0);
      /* If the compression without the zeroes would be shorter, we are done. */
      if (old_length < new_length && huff_counts->field_length > 1)
      {
        huff_counts->field_type=FIELD_SKIP_ZERO;
        huff_counts->counts[0]-=length;
        huff_counts->bytes_packed=old_length- records/8;
        goto found_pack;
      }
      /* Remove the insignificant spaces, but keep the zeroes. */
      huff_counts->counts[' ']=old_space_count;
    }
    /* Check, what the compressed length of this column would be. */
    huff_counts->bytes_packed=calc_packed_length(huff_counts,0);

    /*
      If there are enough empty records (in this column),
      treating them specially may pay off.
    */
    if (huff_counts->empty_fields)
    {
      if (huff_counts->field_length > 2 &&
          huff_counts->empty_fields + (records - huff_counts->empty_fields)*
          (1+max_bit(max(huff_counts->max_pre_space,
                         huff_counts->max_end_space))) <
          records * max_bit(huff_counts->field_length))
      {
        huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
      }
      else
      {
        /* Count empty fields as all-space fields instead. */
        length=huff_counts->empty_fields*huff_counts->field_length;
        if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
        {
          huff_counts->tot_end_space+=length;
          huff_counts->max_end_space=huff_counts->field_length;
          if (huff_counts->field_length < 8)
            huff_counts->end_space[huff_counts->field_length]+=
              huff_counts->empty_fields;
        }
        if (huff_counts->tot_pre_space)
        {
          huff_counts->tot_pre_space+=length;
          huff_counts->max_pre_space=huff_counts->field_length;
          if (huff_counts->field_length < 8)
            huff_counts->pre_space[huff_counts->field_length]+=
              huff_counts->empty_fields;
        }
      }
    }

    /*
      If there are enough trailing spaces (in this column),
      treating them specially may pay off.
    */
    if (huff_counts->tot_end_space)
    {
      huff_counts->counts[' ']+=huff_counts->tot_pre_space;
      if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
                              huff_counts->end_space,
                              huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
        goto found_pack;
      huff_counts->counts[' ']-=huff_counts->tot_pre_space;
    }

    /*
      If there are enough leading spaces (in this column),
      treating them specially may pay off.
    */
    if (huff_counts->tot_pre_space)
    {
      if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
                              huff_counts->pre_space,
                              huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
        goto found_pack;
    }

  found_pack:                   /* Found field-packing */

    /* Test if we can use zero-fill */

    if (huff_counts->max_zero_fill &&
        (huff_counts->field_type == FIELD_NORMAL ||
         huff_counts->field_type == FIELD_SKIP_ZERO))
    {
      huff_counts->counts[0]-=huff_counts->max_zero_fill*
        (huff_counts->field_type == FIELD_SKIP_ZERO ?
         records - huff_counts->zero_fields : records);
      huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
      huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
    }

    /* Test if intervall-field is better */

    if (huff_counts->tree_buff)
    {
      HUFF_TREE tree;

      DBUG_EXECUTE_IF("forceintervall",
                      huff_counts->bytes_packed= ~ (my_off_t) 0;);
      tree.element_buffer=0;
      if (!make_huff_tree(&tree,huff_counts) &&
          tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
      {
        /* A single distinct value packs as a constant field. */
        if (tree.elements == 1)
          huff_counts->field_type=FIELD_CONSTANT;
        else
          huff_counts->field_type=FIELD_INTERVALL;
        huff_counts->pack_type=0;
      }
      else
      {
        /* Interval packing does not pay off; drop the value tree. */
        my_free(huff_counts->tree_buff,MYF(0));
        delete_tree(&huff_counts->int_tree);
        huff_counts->tree_buff=0;
      }
      if (tree.element_buffer)
        my_free(tree.element_buffer,MYF(0));
    }
    if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
      space_fields++;
    if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
      fill_zero_fields++;
    field_count[huff_counts->field_type]++;
  }
  DBUG_PRINT("info", ("normal: %3d  empty-space: %3d  "
                      "empty-zero: %3d  empty-fill: %3d",
                      field_count[FIELD_NORMAL],space_fields,
                      field_count[FIELD_SKIP_ZERO],fill_zero_fields));
  DBUG_PRINT("info", ("pre-space: %3d  end-space: %3d  "
                      "intervall-fields: %3d  zero: %3d",
                      field_count[FIELD_SKIP_PRESPACE],
                      field_count[FIELD_SKIP_ENDSPACE],
                      field_count[FIELD_INTERVALL],
                      field_count[FIELD_ZERO]));
  if (verbose)
    VOID(printf("\nnormal: %3d  empty-space: %3d  "
                "empty-zero: %3d  empty-fill: %3d\n"
                "pre-space: %3d  end-space: %3d  "
                "intervall-fields: %3d  zero: %3d\n",
                field_count[FIELD_NORMAL],space_fields,
                field_count[FIELD_SKIP_ZERO],fill_zero_fields,
                field_count[FIELD_SKIP_PRESPACE],
                field_count[FIELD_SKIP_ENDSPACE],
                field_count[FIELD_INTERVALL],
                field_count[FIELD_ZERO]));
  DBUG_VOID_RETURN;
}
+
+
+/* Test if we can use space-compression and empty-field-compression */
+
/*
  Test if space-compression (and empty-field-compression) pays off.

  SYNOPSIS
    test_space_compress()
    huff_counts       Statistics of the column.
    records           Number of records.
    max_space_length  Longest leading/trailing space run seen.
    space_counts      Histogram of short space runs (lengths 0-7);
                      either huff_counts->end_space or ->pre_space.
    tot_space_count   Total number of space bytes that could be skipped.
    field_type        FIELD_SKIP_ENDSPACE or FIELD_SKIP_PRESPACE.

  DESCRIPTION
    Compares the packed size without space compression, with an
    unconditional space count, and with a length-flagged variant for
    each minimum space length 0-7.  Keeps the cheapest variant in
    huff_counts (bytes_packed, field_type, min_space, length_bits).

  RETURN
    0  no space-compression chosen
    1  space-compression chosen and huff_counts updated
*/

static int
test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
                    uint max_space_length, my_off_t *space_counts,
                    my_off_t tot_space_count, enum en_fieldtype field_type)
{
  int min_pos;
  uint length_bits,i;
  my_off_t space_count,min_space_count,min_pack,new_length,skip;

  length_bits=max_bit(max_space_length);

  /* Default no end_space-packing */
  space_count=huff_counts->counts[(uint) ' '];
  min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
  min_pack=calc_packed_length(huff_counts,0);
  min_pos= -2;
  huff_counts->counts[(uint) ' ']=space_count;

  /* Test with always space-count */
  new_length=huff_counts->bytes_packed+length_bits*records/8;
  if (new_length+1 < min_pack)
  {
    min_pos= -1;
    min_pack=new_length;
    min_space_count=space_count;
  }
  /* Test with length-flag */
  for (skip=0L, i=0 ; i < 8 ; i++)
  {
    if (space_counts[i])
    {
      if (i)
        huff_counts->counts[(uint) ' ']+=space_counts[i];
      /*
        NOTE(review): this reads pre_space[i] even when called for
        end-space compression (space_counts == end_space).  Presumably
        it should be space_counts[i]; the same code exists in
        myisampack, so confirm the intent before changing it.
      */
      skip+=huff_counts->pre_space[i];
      new_length=calc_packed_length(huff_counts,0)+
        (records+(records-skip)*(1+length_bits))/8;
      if (new_length < min_pack)
      {
        min_pos=(int) i;
        min_pack=new_length;
        min_space_count=huff_counts->counts[(uint) ' '];
      }
    }
  }

  huff_counts->counts[(uint) ' ']=min_space_count;
  huff_counts->bytes_packed=min_pack;
  switch (min_pos) {
  case -2:
    return(0);                          /* No space-compress */
  case -1:                              /* Always space-count */
    huff_counts->field_type=field_type;
    huff_counts->min_space=0;
    huff_counts->length_bits=max_bit(max_space_length);
    break;
  default:
    huff_counts->field_type=field_type;
    huff_counts->min_space=(uint) min_pos;
    huff_counts->pack_type|=PACK_TYPE_SELECTED;
    huff_counts->length_bits=max_bit(max_space_length);
    break;
  }
  return(1);                            /* Using space-compress */
}
+
+
+ /* Make a huff_tree of each huff_count */
+
+static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
+{
+ uint tree;
+ HUFF_TREE *huff_tree;
+ DBUG_ENTER("make_huff_trees");
+
+ if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
+ MYF(MY_WME | MY_ZEROFILL))))
+ DBUG_RETURN(0);
+
+ for (tree=0 ; tree < trees ; tree++)
+ {
+ if (make_huff_tree(huff_tree+tree,huff_counts+tree))
+ {
+ while (tree--)
+ my_free(huff_tree[tree].element_buffer,MYF(0));
+ my_free(huff_tree,MYF(0));
+ DBUG_RETURN(0);
+ }
+ }
+ DBUG_RETURN(huff_tree);
+}
+
+/*
+ Build a Huffman tree.
+
+ SYNOPSIS
+ make_huff_tree()
+ huff_tree The Huffman tree.
+ huff_counts The counts.
+
+ DESCRIPTION
+ Build a Huffman tree according to huff_counts->counts or
+ huff_counts->tree_buff. tree_buff, if non-NULL contains up to
+ tree_buff_length of distinct column values. In that case, whole
+ values can be Huffman encoded instead of single bytes.
+
+ RETURN
+ 0 OK
+ != 0 Error
+*/
+
static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
{
  uint i,found,bits_packed,first,last;
  my_off_t bytes_packed;
  HUFF_ELEMENT *a,*b,*new_huff_el;

  first=last=0;
  if (huff_counts->tree_buff)
  {
    /* Calculate the number of distinct values in tree_buff. */
    found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
      huff_counts->field_length;
    first=0; last=found-1;
  }
  else
  {
    /* Count the number of uchar codes found in the column. */
    for (i=found=0 ; i < 256 ; i++)
    {
      if (huff_counts->counts[i])
      {
        if (! found++)
          first=i;
        last=i;
      }
    }
    /* The tree-building loop below needs at least two elements. */
    if (found < 2)
      found=2;
  }

  /* When using 'tree_buff' we can have more than 256 values. */
  if (queue.max_elements < found)
  {
    delete_queue(&queue);
    /* Returns -1 if the queue cannot be (re)initialized. */
    if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0))
      return -1;
  }

  /* Allocate or reallocate an element buffer for the Huffman tree. */
  if (!huff_tree->element_buffer)
  {
    if (!(huff_tree->element_buffer=
          (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
      return 1;
  }
  else
  {
    HUFF_ELEMENT *temp;
    if (!(temp=
          (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
                                     found*2*sizeof(HUFF_ELEMENT),
                                     MYF(MY_WME))))
      return 1;
    huff_tree->element_buffer=temp;
  }

  huff_counts->tree=huff_tree;
  huff_tree->counts=huff_counts;
  huff_tree->min_chr=first;
  huff_tree->max_chr=last;
  huff_tree->char_bits=max_bit(last-first);
  huff_tree->offset_bits=max_bit(found-1)+1;

  if (huff_counts->tree_buff)
  {
    huff_tree->elements=0;
    huff_tree->tree_pack_length=(1+15+16+5+5+
                                 (huff_tree->char_bits+1)*found+
                                 (huff_tree->offset_bits+1)*
                                 (found-2)+7)/8 +
                                   (uint) (huff_tree->counts->tree_pos-
                                           huff_tree->counts->tree_buff);
    /*
      Put a HUFF_ELEMENT into the queue for every distinct column value.

      tree_walk() calls save_counts_in_queue() for every element in
      'int_tree'. This takes elements from the target trees element
      buffer and places references to them into the buffer of the
      priority queue. We insert in column value order, but the order is
      in fact irrelevant here. We will establish the correct order
      later.
    */
    tree_walk(&huff_counts->int_tree,
              (int (*)(void*, element_count,void*)) save_counts_in_queue,
              (uchar*) huff_tree, left_root_right);
  }
  else
  {
    huff_tree->elements=found;
    huff_tree->tree_pack_length=(9+9+5+5+
                                 (huff_tree->char_bits+1)*found+
                                 (huff_tree->offset_bits+1)*
                                 (found-2)+7)/8;
    /*
      Put a HUFF_ELEMENT into the queue for every uchar code found in the column.

      The elements are taken from the target trees element buffer.
      Instead of using queue_insert(), we just place references to the
      elements into the buffer of the priority queue. We insert in byte
      value order, but the order is in fact irrelevant here. We will
      establish the correct order later.
    */
    for (i=first, found=0 ; i <= last ; i++)
    {
      if (huff_counts->counts[i])
      {
        new_huff_el=huff_tree->element_buffer+(found++);
        new_huff_el->count=huff_counts->counts[i];
        new_huff_el->a.leaf.null=0;
        new_huff_el->a.leaf.element_nr=i;
        /* queue.root[] is 1-based; slot 0 is unused. */
        queue.root[found]=(uchar*) new_huff_el;
      }
    }
    /*
      If there is only a single uchar value in this field in all records,
      add a second element with zero incidence. This is required to enter
      the loop, which builds the Huffman tree.
    */
    while (found < 2)
    {
      new_huff_el=huff_tree->element_buffer+(found++);
      new_huff_el->count=0;
      new_huff_el->a.leaf.null=0;
      if (last)
        new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
      else
        new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
      queue.root[found]=(uchar*) new_huff_el;
    }
  }

  /* Make a queue from the queue buffer. */
  queue.elements=found;

  /*
    Make a priority queue from the queue. Construct its index so that we
    have a partially ordered tree.
  */
  queue_fix(&queue);

  /* The Huffman algorithm. */
  bytes_packed=0; bits_packed=0;
  for (i=1 ; i < found ; i++)
  {
    /*
      Pop the top element from the queue (the one with the least incidence).
      Popping from a priority queue includes a re-ordering of the queue,
      to get the next least incidence element to the top.
    */
    a=(HUFF_ELEMENT*) queue_remove_top(&queue);
    /* Copy the next least incidence element */
    b=(HUFF_ELEMENT*) queue_top(&queue);
    /* Get a new element from the element buffer. */
    new_huff_el=huff_tree->element_buffer+found+i;
    /* The new element gets the sum of the two least incidence elements. */
    new_huff_el->count=a->count+b->count;
    /*
      The Huffman algorithm assigns another bit to the code for a byte
      every time that bytes incidence is combined (directly or indirectly)
      to a new element as one of the two least incidence elements.
      This means that one more bit per incidence of that uchar is required
      in the resulting file. So we add the new combined incidence as the
      number of bits by which the result grows.
    */
    bits_packed+=(uint) (new_huff_el->count & 7);
    bytes_packed+=new_huff_el->count/8;
    /* The new element points to its children, lesser in left. */
    new_huff_el->a.nod.left=a;
    new_huff_el->a.nod.right=b;
    /*
      Replace the copied top element by the new element and re-order the
      queue.
    */
    queue_top(&queue)= (uchar*) new_huff_el;
    queue_replace_top(&queue);
  }
  /* After found-1 merges, the queue top is the root of the tree. */
  huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
  huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
  return 0;
}
+
+static int compare_tree(void* cmp_arg __attribute__((unused)),
+ register const uchar *s, register const uchar *t)
+{
+ uint length;
+ for (length=global_count->field_length; length-- ;)
+ if (*s++ != *t++)
+ return (int) s[-1] - (int) t[-1];
+ return 0;
+}
+
+/*
+ Organize distinct column values and their incidences into a priority queue.
+
+ SYNOPSIS
+ save_counts_in_queue()
+ key The column value.
+ count The incidence of this value.
+ tree The Huffman tree to be built later.
+
+ DESCRIPTION
+ We use the element buffer of the targeted tree. The distinct column
+ values are organized in a priority queue first. The Huffman
+ algorithm will later organize the elements into a Huffman tree. For
+ the time being, we just place references to the elements into the
+ queue buffer. The buffer will later be organized into a priority
+ queue.
+
+ RETURN
+ 0
+ */
+
+static int save_counts_in_queue(uchar *key, element_count count,
+ HUFF_TREE *tree)
+{
+ HUFF_ELEMENT *new_huff_el;
+
+ new_huff_el=tree->element_buffer+(tree->elements++);
+ new_huff_el->count=count;
+ new_huff_el->a.leaf.null=0;
+ new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
+ tree->counts->field_length;
+ queue.root[tree->elements]=(uchar*) new_huff_el;
+ return 0;
+}
+
+
+/*
+  Calculate length of file if given counts should be used.
+
+  SYNOPSIS
+    calc_packed_length()
+    huff_counts         The counts for a column of the table(s).
+    add_tree_lenght     If the decode tree length should be added.
+
+  DESCRIPTION
+    We need to follow the Huffman algorithm until we know, how many bits
+    are required for each uchar code. But we do not need the resulting
+    Huffman tree. Hence, we can leave out some steps which are essential
+    in make_huff_tree().
+
+  RETURN
+    Number of bytes required to compress this table column.
+*/
+
+static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
+                                   uint add_tree_lenght)
+{
+  uint i,found,bits_packed,first,last;
+  my_off_t bytes_packed;
+  HUFF_ELEMENT element_buffer[256];
+  DBUG_ENTER("calc_packed_length");
+
+  /*
+    WARNING: We use a small hack for efficiency: Instead of placing
+    references to HUFF_ELEMENTs into the queue, we just insert
+    references to the counts of the uchar codes which appeared in this
+    table column. During the Huffman algorithm they are successively
+    replaced by references to HUFF_ELEMENTs. This works, because
+    HUFF_ELEMENTs have the incidence count at their beginning.
+    Regardless, whether the queue array contains references to counts of
+    type my_off_t or references to HUFF_ELEMENTs which have the count of
+    type my_off_t at their beginning, it always points to a count of the
+    same type.
+
+    Instead of using queue_insert(), we just copy the references into
+    the buffer of the priority queue. We insert in uchar value order, but
+    the order is in fact irrelevant here. We will establish the correct
+    order later.
+  */
+  first=last=0;
+  for (i=found=0 ; i < 256 ; i++)
+  {
+    if (huff_counts->counts[i])
+    {
+      if (! found++)
+        first=i;
+      last=i;
+      /* We start with root[1], which is the queues top element. */
+      queue.root[found]=(uchar*) &huff_counts->counts[i];
+    }
+  }
+  if (!found)
+    DBUG_RETURN(0);                     /* Empty tree */
+  /*
+    If there is only a single uchar value in this field in all records,
+    add a second element with zero incidence. This is required to enter
+    the loop, which follows the Huffman algorithm.
+  */
+  if (found < 2)
+    queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
+
+  /* Make a queue from the queue buffer. */
+  queue.elements=found;
+
+  bytes_packed=0; bits_packed=0;
+  /* Add the length of the coding table, which would become part of the file. */
+  if (add_tree_lenght)
+    bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
+                  (max_bit(found-1)+1+1)*(found-2) +7)/8;
+
+  /*
+    Make a priority queue from the queue. Construct its index so that we
+    have a partially ordered tree.
+  */
+  queue_fix(&queue);
+
+  /* The Huffman algorithm. */
+  for (i=0 ; i < found-1 ; i++)
+  {
+    my_off_t *a;
+    my_off_t *b;
+    HUFF_ELEMENT *new_huff_el;
+
+    /*
+      Pop the top element from the queue (the one with the least
+      incidence). Popping from a priority queue includes a re-ordering
+      of the queue, to get the next least incidence element to the top.
+    */
+    a= (my_off_t*) queue_remove_top(&queue);
+    /* Copy the next least incidence element. */
+    b= (my_off_t*) queue_top(&queue);
+    /* Create a new element in a local (automatic) buffer. */
+    new_huff_el= element_buffer + i;
+    /* The new element gets the sum of the two least incidence elements. */
+    new_huff_el->count= *a + *b;
+    /*
+      The Huffman algorithm assigns another bit to the code for a byte
+      every time that bytes incidence is combined (directly or indirectly)
+      to a new element as one of the two least incidence elements.
+      This means that one more bit per incidence of that uchar is required
+      in the resulting file. So we add the new combined incidence as the
+      number of bits by which the result grows.
+    */
+    bits_packed+=(uint) (new_huff_el->count & 7);
+    bytes_packed+=new_huff_el->count/8;
+    /*
+      Replace the copied top element by the new element and re-order the
+      queue. This successively replaces the references to counts by
+      references to HUFF_ELEMENTs.
+    */
+    queue_top(&queue)= (uchar*) new_huff_el;
+    queue_replace_top(&queue);
+  }
+  /* Round the accumulated spare bits up to whole bytes. */
+  DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
+}
+
+
+  /*
+    Join Huffman trees that encode similar data distributions.
+
+    For each pair of non-interval columns, check whether a single tree
+    built from the combined counts would cost no more than the two
+    separate trees plus ALLOWED_JOIN_DIFF. If so, merge them: column 'j'
+    shares column 'i's tree, which is rebuilt from the summed counts.
+
+    RETURN  Number of trees left, or (uint) -1 on error.
+  */
+
+static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
+{
+  uint k,tree_number;
+  HUFF_COUNTS count,*i,*j,*last_count;
+
+  last_count=huff_counts+trees;
+  for (tree_number=0, i=huff_counts ; i < last_count ; i++)
+  {
+    if (!i->tree->tree_number)
+    {
+      i->tree->tree_number= ++tree_number;
+      if (i->tree_buff)
+        continue;                               /* Don't join intervall */
+      for (j=i+1 ; j < last_count ; j++)
+      {
+        if (! j->tree->tree_number && ! j->tree_buff)
+        {
+          /* Sum the incidence counts of both columns. */
+          for (k=0 ; k < 256 ; k++)
+            count.counts[k]=i->counts[k]+j->counts[k];
+          if (calc_packed_length(&count,1) <=
+              i->tree->bytes_packed + j->tree->bytes_packed+
+              i->tree->tree_pack_length+j->tree->tree_pack_length+
+              ALLOWED_JOIN_DIFF)
+          {
+            /*
+              Merge: store combined counts in column 'i' and let 'j'
+              share 'i's tree. (A second, redundant bmove() of the same
+              counts into i->counts has been removed; memcpy_fixed()
+              already performed the identical copy.)
+            */
+            memcpy_fixed((uchar*) i->counts,(uchar*) count.counts,
+                         sizeof(count.counts[0])*256);
+            my_free((uchar*) j->tree->element_buffer,MYF(0));
+            j->tree->element_buffer=0;
+            j->tree=i->tree;
+            if (make_huff_tree(i->tree,i))
+              return (uint) -1;
+          }
+        }
+      }
+    }
+  }
+  DBUG_PRINT("info", ("Original trees: %d After join: %d",
+                      trees, tree_number));
+  if (verbose)
+    VOID(printf("Original trees: %d After join: %d\n", trees, tree_number));
+  return tree_number;                   /* Return trees left */
+}
+
+
+/*
+  Fill in huff_tree encode tables.
+
+  SYNOPSIS
+    make_huff_decode_table()
+    huff_tree   An array of HUFF_TREE which are to be encoded.
+    trees       The number of HUFF_TREE in the array.
+
+  DESCRIPTION
+    For every active (non-joined) tree, allocate one zero-filled buffer
+    holding both the code words and the code lengths, then walk the tree
+    to fill them in.
+
+  RETURN
+    0    success
+    != 0 error
+*/
+
+static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
+{
+  HUFF_TREE *tree;
+  HUFF_TREE *end= huff_tree + trees;
+
+  for (tree= huff_tree ; tree < end ; tree++)
+  {
+    uint nr_codes;
+
+    if (tree->tree_number <= 0)
+      continue;                         /* Tree was joined with another one */
+    /* Interval trees code column values, others code all 256 byte values. */
+    nr_codes= tree->counts->tree_buff ? tree->elements : 256;
+    /* One allocation: codes first, code lengths appended behind them. */
+    tree->code= (ulonglong*) my_malloc(nr_codes * (sizeof(ulonglong) +
+                                                   sizeof(uchar)),
+                                       MYF(MY_WME | MY_ZEROFILL));
+    if (!tree->code)
+      return 1;
+    tree->code_len= (uchar*) (tree->code + nr_codes);
+    make_traverse_code_tree(tree, tree->root, 8 * sizeof(ulonglong), LL(0));
+  }
+  return 0;
+}
+
+
+/*
+  Recursively walk a Huffman tree and fill in code and code length for
+  every leaf value. 'size' is the number of still unused low bits in
+  'code': it starts at 64 (8 * sizeof(ulonglong)) and shrinks by one per
+  tree level, so code bits accumulate at the top of the word and are
+  shifted down when a leaf is reached.
+*/
+
+static void make_traverse_code_tree(HUFF_TREE *huff_tree,
+                                    HUFF_ELEMENT *element,
+                                    uint size, ulonglong code)
+{
+  uint chr;
+  if (!element->a.leaf.null)
+  {
+    /* A leaf: store the collected code and its length for this value. */
+    chr=element->a.leaf.element_nr;
+    huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
+    huff_tree->code[chr]= (code >> size);
+    /* Remember the longest code seen; used for packed length estimates. */
+    if (huff_tree->height < 8 * sizeof(ulonglong) - size)
+      huff_tree->height= 8 * sizeof(ulonglong) - size;
+  }
+  else
+  {
+    /* An inner node: left child extends with a 0 bit, right with a 1 bit. */
+    size--;
+    make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
+    make_traverse_code_tree(huff_tree, element->a.nod.right, size,
+                            code + (((ulonglong) 1) << size));
+  }
+  return;
+}
+
+
+/*
+  Convert a value into binary digits.
+
+  SYNOPSIS
+    bindigits()
+    value       The value.
+    bits        The number of low order bits to convert.
+
+  NOTE
+    The result string is in static storage. It is reused on every call.
+    So you cannot use it twice in one expression.
+
+  RETURN
+    A pointer to a static NUL-terminated string.
+ */
+
+static char *bindigits(ulonglong value, uint bits)
+{
+  static char buff[72];
+  char *out= buff;
+
+  DBUG_ASSERT(bits < sizeof(buff));
+  /* Emit most significant of the requested bits first. */
+  for ( ; bits > 0 ; bits--)
+    *out++= (char) ('0' + ((value >> (bits - 1)) & 1));
+  *out= '\0';
+  return buff;
+}
+
+
+/*
+  Convert a value into hexadecimal digits.
+
+  SYNOPSIS
+    hexdigits()
+    value       The value.
+
+  NOTE
+    The result string is in static storage. It is reused on every call.
+    So you cannot use it twice in one expression. The full width of the
+    value is printed, including leading zero digits.
+
+  RETURN
+    A pointer to a static NUL-terminated string.
+ */
+
+static char *hexdigits(ulonglong value)
+{
+  static char buff[20];
+  char *out= buff;
+  uint nibbles= 2 * sizeof(value);      /* Two hex digits per byte. */
+
+  DBUG_ASSERT(nibbles < sizeof(buff));
+  while (nibbles--)
+  {
+    uint nibble= (uint) (value >> (4 * nibbles)) & 0xf;
+    /* Lowercase hex: 0-9 then a-f. */
+    *out++= (char) (nibble < 10 ? '0' + nibble : 'a' + (nibble - 10));
+  }
+  *out= '\0';
+  return buff;
+}
+
+
+  /*
+    Write header to new packed data file.
+    Layout (offsets in bytes): 0 magic, 4 header length, 8 min packed
+    record length, 12 max packed record length, 16 total tree elements,
+    20 interval buffer length, 24 number of trees, 26 pack-ref length,
+    27 record pointer length.
+  */
+
+static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
+                        my_off_t tot_elements,my_off_t filelength)
+{
+  uchar *buff= (uchar*) file_buffer.pos;
+
+  bzero(buff,HEAD_LENGTH);
+  memcpy_fixed(buff,maria_pack_file_magic,4);
+  int4store(buff+4,head_length);
+  int4store(buff+8, mrg->min_pack_length);
+  int4store(buff+12,mrg->max_pack_length);
+  int4store(buff+16,tot_elements);
+  int4store(buff+20,intervall_length);
+  int2store(buff+24,trees);
+  buff[26]=(char) mrg->ref_length;
+  /* Save record pointer length */
+  buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2);
+  if (test_only)
+    return 0;
+  /* Seek back to file start and overwrite the placeholder header. */
+  VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)));
+  return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
+                  MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
+}
+
+  /*
+    Write fieldinfo to new packed file. For every column, a fixed-width
+    bit record is emitted: 5 bits field type, 6 bits pack type flags,
+    5 bits zero-fill or length bits, and the (0-based) tree number in
+    the minimum number of bits needed to address all trees.
+  */
+
+static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
+{
+  reg1 uint i;
+  uint huff_tree_bits;
+  huff_tree_bits=max_bit(trees ? trees-1 : 0);
+
+  DBUG_PRINT("info", (" "));
+  DBUG_PRINT("info", ("column types:"));
+  DBUG_PRINT("info", ("FIELD_NORMAL 0"));
+  DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE 1"));
+  DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE 2"));
+  DBUG_PRINT("info", ("FIELD_SKIP_ZERO 3"));
+  DBUG_PRINT("info", ("FIELD_BLOB 4"));
+  DBUG_PRINT("info", ("FIELD_CONSTANT 5"));
+  DBUG_PRINT("info", ("FIELD_INTERVALL 6"));
+  DBUG_PRINT("info", ("FIELD_ZERO 7"));
+  DBUG_PRINT("info", ("FIELD_VARCHAR 8"));
+  DBUG_PRINT("info", ("FIELD_CHECK 9"));
+  DBUG_PRINT("info", (" "));
+  DBUG_PRINT("info", ("pack type as a set of flags:"));
+  DBUG_PRINT("info", ("PACK_TYPE_SELECTED 1"));
+  DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS 2"));
+  DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL 4"));
+  DBUG_PRINT("info", (" "));
+  if (verbose >= 2)
+  {
+    VOID(printf("\n"));
+    VOID(printf("column types:\n"));
+    VOID(printf("FIELD_NORMAL 0\n"));
+    VOID(printf("FIELD_SKIP_ENDSPACE 1\n"));
+    VOID(printf("FIELD_SKIP_PRESPACE 2\n"));
+    VOID(printf("FIELD_SKIP_ZERO 3\n"));
+    VOID(printf("FIELD_BLOB 4\n"));
+    VOID(printf("FIELD_CONSTANT 5\n"));
+    VOID(printf("FIELD_INTERVALL 6\n"));
+    VOID(printf("FIELD_ZERO 7\n"));
+    VOID(printf("FIELD_VARCHAR 8\n"));
+    VOID(printf("FIELD_CHECK 9\n"));
+    VOID(printf("\n"));
+    VOID(printf("pack type as a set of flags:\n"));
+    VOID(printf("PACK_TYPE_SELECTED 1\n"));
+    VOID(printf("PACK_TYPE_SPACE_FIELDS 2\n"));
+    VOID(printf("PACK_TYPE_ZERO_FILL 4\n"));
+    VOID(printf("\n"));
+  }
+  for (i=0 ; i++ < fields ; counts++)
+  {
+    write_bits((ulonglong) (int) counts->field_type, 5);
+    write_bits(counts->pack_type,6);
+    /* Zero-filled columns store the fill count instead of length bits. */
+    if (counts->pack_type & PACK_TYPE_ZERO_FILL)
+      write_bits(counts->max_zero_fill,5);
+    else
+      write_bits(counts->length_bits,5);
+    write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
+    DBUG_PRINT("info", ("column: %3u type: %2u pack: %2u zero: %4u "
+                        "lbits: %2u tree: %2u length: %4u",
+                        i , counts->field_type, counts->pack_type,
+                        counts->max_zero_fill, counts->length_bits,
+                        counts->tree->tree_number, counts->field_length));
+    if (verbose >= 2)
+      VOID(printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u "
+                  "tree: %2u length: %4u\n", i , counts->field_type,
+                  counts->pack_type, counts->max_zero_fill, counts->length_bits,
+                  counts->tree->tree_number, counts->field_length));
+  }
+  /* Pad the field info out to a byte boundary. */
+  flush_bits();
+  return;
+}
+
+  /*
+    Write all huff_trees to new datafile. Return total count of
+    elements in all trees.
+    Returns 0 on error.
+    Each tree is written as a header, the packed decode tree, and (for
+    interval trees) the distinct column values. The encode tables are
+    verified against the packed decode tree before being accepted.
+  */
+
+static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
+{
+  uint i,int_length;
+  uint tree_no;
+  uint codes;
+  uint errors= 0;
+  uint *packed_tree,*offset,length;
+  my_off_t elements;
+
+  /* Find the highest number of elements in the trees. */
+  for (i=length=0 ; i < trees ; i++)
+    if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
+      length=huff_tree[i].elements;
+  /*
+    Allocate a buffer for packing a decode tree. Two numbers per element
+    (left child and right child).
+  */
+  if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
+  {
+    my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
+    return 0;
+  }
+
+  DBUG_PRINT("info", (" "));
+  if (verbose >= 2)
+    VOID(printf("\n"));
+  tree_no= 0;
+  intervall_length=0;
+  for (elements=0; trees-- ; huff_tree++)
+  {
+    /* Skip columns that have been joined with other columns. */
+    if (huff_tree->tree_number == 0)
+      continue;                         /* Deleted tree */
+    tree_no++;
+    DBUG_PRINT("info", (" "));
+    if (verbose >= 3)
+      VOID(printf("\n"));
+    /* Count the total number of elements (byte codes or column values). */
+    elements+=huff_tree->elements;
+    huff_tree->max_offset=2;
+    /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
+    if (huff_tree->elements <= 1)
+      offset=packed_tree;
+    else
+      offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
+
+    /* This should be the same as 'length' above. */
+    huff_tree->offset_bits=max_bit(huff_tree->max_offset);
+
+    /*
+      Since we check this during collecting the distinct column values,
+      this should never happen.
+    */
+    if (huff_tree->max_offset >= IS_OFFSET)
+    {                                   /* This should be impossible */
+      VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n",
+                   huff_tree->max_offset));
+      my_afree(packed_tree);
+      return 0;
+    }
+
+    DBUG_PRINT("info", ("pos: %lu elements: %u tree-elements: %lu "
+                        "char_bits: %u\n",
+                        (ulong) (file_buffer.pos - file_buffer.buffer),
+                        huff_tree->elements, (ulong) (offset - packed_tree),
+                        huff_tree->char_bits));
+    if (!huff_tree->counts->tree_buff)
+    {
+      /* We do a uchar compression on this column. Mark with bit 0. */
+      write_bits(0,1);
+      write_bits(huff_tree->min_chr,8);
+      write_bits(huff_tree->elements,9);
+      write_bits(huff_tree->char_bits,5);
+      write_bits(huff_tree->offset_bits,5);
+      int_length=0;
+    }
+    else
+    {
+      int_length=(uint) (huff_tree->counts->tree_pos -
+                         huff_tree->counts->tree_buff);
+      /* We have distinct column values for this column. Mark with bit 1. */
+      write_bits(1,1);
+      write_bits(huff_tree->elements,15);
+      write_bits(int_length,16);
+      write_bits(huff_tree->char_bits,5);
+      write_bits(huff_tree->offset_bits,5);
+      intervall_length+=int_length;
+    }
+    DBUG_PRINT("info", ("tree: %2u elements: %4u char_bits: %2u "
+                        "offset_bits: %2u %s: %5u codelen: %2u",
+                        tree_no, huff_tree->elements, huff_tree->char_bits,
+                        huff_tree->offset_bits, huff_tree->counts->tree_buff ?
+                        "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
+                        int_length : huff_tree->min_chr, huff_tree->height));
+    if (verbose >= 2)
+      VOID(printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u "
+                  "%s: %5u codelen: %2u\n", tree_no, huff_tree->elements,
+                  huff_tree->char_bits, huff_tree->offset_bits,
+                  huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
+                  huff_tree->counts->tree_buff ? int_length :
+                  huff_tree->min_chr, huff_tree->height));
+
+    /* Check that the code tree length matches the element count. */
+    length=(uint) (offset-packed_tree);
+    if (length != huff_tree->elements*2-2)
+    {
+      VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
+                   length, huff_tree->elements * 2 - 2));
+      errors++;
+      break;
+    }
+
+    /* Write the packed decode tree: offsets and leaf values. */
+    for (i=0 ; i < length ; i++)
+    {
+      if (packed_tree[i] & IS_OFFSET)
+        write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
+                   huff_tree->offset_bits+1);
+      else
+        write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
+      DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
+                          i, (packed_tree[i] & IS_OFFSET) ?
+                          " -> " : "", (packed_tree[i] & IS_OFFSET) ?
+                          packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
+      if (verbose >= 3)
+        VOID(printf("tree[0x%04x]: %s0x%04x\n",
+                    i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
+                    (packed_tree[i] & IS_OFFSET) ?
+                    packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
+    }
+    flush_bits();
+
+    /*
+      Display coding tables and check their correctness.
+    */
+    codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
+    for (i= 0; i < codes; i++)
+    {
+      ulonglong code;
+      uint bits;
+      uint len;
+      uint idx;
+
+      if (! (len= huff_tree->code_len[i]))
+        continue;
+      DBUG_PRINT("info", ("code[0x%04x]: 0x%s bits: %2u bin: %s", i,
+                          hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
+                          bindigits(huff_tree->code[i],
+                                    huff_tree->code_len[i])));
+      if (verbose >= 3)
+        VOID(printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i,
+                    hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
+                    bindigits(huff_tree->code[i], huff_tree->code_len[i])));
+
+      /* Check that the encode table decodes correctly. */
+      code= 0;
+      bits= 0;
+      idx= 0;
+      DBUG_EXECUTE_IF("forcechkerr1", len--;);
+      DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
+      DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
+      /* Follow the code bit by bit through the packed decode tree. */
+      for (;;)
+      {
+        if (! len)
+        {
+          VOID(fflush(stdout));
+          VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n",
+                       hexdigits(huff_tree->code[i]), huff_tree->code_len[i]));
+          errors++;
+          break;
+        }
+        code<<= 1;
+        code|= (huff_tree->code[i] >> (--len)) & 1;
+        bits++;
+        if (bits > 8 * sizeof(code))
+        {
+          VOID(fflush(stdout));
+          VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n",
+                       bits, (uint) (8 * sizeof(code))));
+          errors++;
+          break;
+        }
+        idx+= (uint) code & 1;
+        if (idx >= length)
+        {
+          VOID(fflush(stdout));
+          VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n",
+                       idx, length));
+          errors++;
+          break;
+        }
+        if (packed_tree[idx] & IS_OFFSET)
+          idx+= packed_tree[idx] & ~IS_OFFSET;
+        else
+          break;                /* Hit a leaf. This contains the result value. */
+      }
+      if (errors)
+        break;
+
+      DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
+      if (packed_tree[idx] != i)
+      {
+        VOID(fflush(stdout));
+        VOID(fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n",
+                     packed_tree[idx], i));
+        errors++;
+        break;
+      }
+    } /*end for (codes)*/
+    if (errors)
+      break;
+
+    /* Write column values in case of distinct column value compression. */
+    if (huff_tree->counts->tree_buff)
+    {
+      for (i=0 ; i < int_length ; i++)
+      {
+        write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
+        DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
+                            i, (uchar) huff_tree->counts->tree_buff[i]));
+        if (verbose >= 3)
+          VOID(printf("column_values[0x%04x]: 0x%02x\n",
+                      i, (uchar) huff_tree->counts->tree_buff[i]));
+      }
+    }
+    flush_bits();
+  }
+  DBUG_PRINT("info", (" "));
+  if (verbose >= 2)
+    VOID(printf("\n"));
+  my_afree(packed_tree);
+  if (errors)
+  {
+    VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"));
+    return 0;
+  }
+  return elements;
+}
+
+
+/*
+  Pack a Huffman tree into an array of child pairs for the data file.
+  Leaves store their value directly; inner children store IS_OFFSET plus
+  the relative distance to their pair. Returns the next free position in
+  the output array; also maintains huff_tree->max_offset.
+*/
+
+static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
+                                   uint *offset)
+{
+  uint *prev_offset;
+
+  prev_offset= offset;
+  /*
+    'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
+    then there is no left child and, hence no right child either. This
+    is a property of a binary tree. An element is either a node with two
+    childs, or a leaf without childs.
+
+    The current element is always a node with two childs. Go left first.
+  */
+  if (!element->a.nod.left->a.leaf.null)
+  {
+    /* Store the uchar code or the index of the column value. */
+    prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
+    offset+=2;
+  }
+  else
+  {
+    /*
+      Recursively traverse the tree to the left. Mark it as an offset to
+      another tree node (in contrast to a uchar code or column value index).
+      The left subtree is laid out directly behind this pair (offset 2).
+    */
+    prev_offset[0]= IS_OFFSET+2;
+    offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
+  }
+
+  /* Now, check the right child. */
+  if (!element->a.nod.right->a.leaf.null)
+  {
+    /* Store the uchar code or the index of the column value. */
+    prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
+    return offset;
+  }
+  else
+  {
+    /*
+      Recursively traverse the tree to the right. Mark it as an offset to
+      another tree node (in contrast to a uchar code or column value index).
+      The right subtree starts after the left one; record the distance.
+    */
+    uint temp=(uint) (offset-prev_offset-1);
+    prev_offset[1]= IS_OFFSET+ temp;
+    if (huff_tree->max_offset < temp)
+      huff_tree->max_offset = temp;
+    return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
+  }
+}
+
+  /* Get number of bits needed to represent 'value'. Returns 1 for 0. */
+
+static uint max_bit(register uint value)
+{
+  reg2 uint bits= 1;
+
+  for (value>>= 1 ; value ; value>>= 1)
+    bits++;
+  return bits;
+}
+
+
+/*
+  Read all records through the merge info and write them bit-packed to
+  the new data file. Fills in mrg->ref_length, min_pack_length and
+  max_pack_length as a side effect.
+
+  RETURN  0 ok, != 0 error.
+
+  Fixes versus the previous revision:
+  - the early exit on my_alloca() failure used 'return' after DBUG_ENTER,
+    unbalancing the DBUG call stack; now uses DBUG_RETURN.
+  - the FIELD_BLOB debug trace printed bindigits() of *start_pos instead
+    of the blob byte *blob being encoded.
+*/
+
+static int compress_maria_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
+{
+  int error;
+  uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length;
+  uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes;
+  my_off_t record_count;
+  char llbuf[32];
+  ulong length,pack_length;
+  uchar *record,*pos,*end_pos,*record_pos,*start_pos;
+  HUFF_COUNTS *count,*end_count;
+  HUFF_TREE *tree;
+  MARIA_HA *isam_file=mrg->file[0];
+  uint pack_version= (uint) isam_file->s->pack.version;
+  DBUG_ENTER("compress_maria_file");
+
+  /* Allocate a buffer for the records (excluding blobs). */
+  if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
+    DBUG_RETURN(-1);
+
+  end_count=huff_counts+isam_file->s->base.fields;
+  min_record_length= (uint) ~0;
+  max_record_length=0;
+  null_bytes= isam_file->s->base.null_bytes;
+
+  /*
+    Calculate the maximum number of bits required to pack the records.
+    Remember to understand 'max_zero_fill' as 'min_zero_fill'.
+    The tree height determines the maximum number of bits per value.
+    Some fields skip leading or trailing spaces or zeroes. The skipped
+    number of bytes is encoded by 'length_bits' bits.
+    Empty blobs and varchar are encoded with a single 1 bit. Other blobs
+    and varchar get a leading 0 bit.
+  */
+  max_calc_length= null_bytes;
+  for (i= 0 ; i < isam_file->s->base.fields ; i++)
+  {
+    if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
+      huff_counts[i].max_zero_fill=0;
+    if (huff_counts[i].field_type == FIELD_CONSTANT ||
+        huff_counts[i].field_type == FIELD_ZERO ||
+        huff_counts[i].field_type == FIELD_CHECK)
+      continue;
+    if (huff_counts[i].field_type == FIELD_INTERVALL)
+      max_calc_length+=huff_counts[i].tree->height;
+    else if (huff_counts[i].field_type == FIELD_BLOB ||
+             huff_counts[i].field_type == FIELD_VARCHAR)
+      max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length +
+                       huff_counts[i].length_bits +1;
+    else
+      max_calc_length+=
+        (huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
+        huff_counts[i].tree->height+huff_counts[i].length_bits;
+  }
+  max_calc_length= (max_calc_length + 7) / 8;
+  pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length);
+  record_count=0;
+  /* 'max_blob_length' is the max length of all blobs of a record. */
+  pack_blob_length= isam_file->s->base.blobs ?
+                    _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0;
+  max_pack_length=pack_ref_length+pack_blob_length;
+
+  DBUG_PRINT("fields", ("==="));
+  mrg_reset(mrg);
+  while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
+  {
+    ulong tot_blob_length=0;
+    if (! error)
+    {
+      /* Reserve worst-case space in the buffer before encoding. */
+      if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length +
+                       null_bytes))
+        break;
+      record_pos= file_buffer.pos;
+      file_buffer.pos+= max_pack_length;
+      if (null_bytes)
+      {
+        /* Copy null bits 'as is' */
+        memcpy(file_buffer.pos, record, null_bytes);
+        file_buffer.pos+= null_bytes;
+      }
+      for (start_pos=record+null_bytes, count= huff_counts;
+           count < end_count ;
+           count++)
+      {
+        end_pos=start_pos+(field_length=count->field_length);
+        tree=count->tree;
+
+        DBUG_PRINT("fields", ("column: %3lu type: %2u pack: %2u zero: %4u "
+                              "lbits: %2u tree: %2u length: %4u",
+                              (ulong) (count - huff_counts + 1),
+                              count->field_type,
+                              count->pack_type, count->max_zero_fill,
+                              count->length_bits, count->tree->tree_number,
+                              count->field_length));
+
+        /* Check if the column contains spaces only. */
+        if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
+        {
+          for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
+          if (pos == end_pos)
+          {
+            DBUG_PRINT("fields",
+                       ("PACK_TYPE_SPACE_FIELDS spaces only, bits: 1"));
+            DBUG_PRINT("fields", ("---"));
+            write_bits(1,1);
+            start_pos=end_pos;
+            continue;
+          }
+          DBUG_PRINT("fields",
+                     ("PACK_TYPE_SPACE_FIELDS not only spaces, bits: 1"));
+          write_bits(0,1);
+        }
+        end_pos-=count->max_zero_fill;
+        field_length-=count->max_zero_fill;
+
+        switch (count->field_type) {
+        case FIELD_SKIP_ZERO:
+          if (!memcmp(start_pos, zero_string, field_length))
+          {
+            DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits: 1"));
+            write_bits(1,1);
+            start_pos=end_pos;
+            break;
+          }
+          DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits: 1"));
+          write_bits(0,1);
+          /* Fall through */
+        case FIELD_NORMAL:
+          DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
+                                (ulong) (end_pos - start_pos)));
+          for ( ; start_pos < end_pos ; start_pos++)
+          {
+            DBUG_PRINT("fields",
+                       ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
+                        (uchar) *start_pos,
+                        hexdigits(tree->code[(uchar) *start_pos]),
+                        (uint) tree->code_len[(uchar) *start_pos],
+                        bindigits(tree->code[(uchar) *start_pos],
+                                  (uint) tree->code_len[(uchar) *start_pos])));
+            write_bits(tree->code[(uchar) *start_pos],
+                       (uint) tree->code_len[(uchar) *start_pos]);
+          }
+          break;
+        case FIELD_SKIP_ENDSPACE:
+          for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
+          length= (ulong) (end_pos - pos);
+          if (count->pack_type & PACK_TYPE_SELECTED)
+          {
+            if (length > count->min_space)
+            {
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_ENDSPACE more than min_space, bits: 1"));
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
+                          length, field_length, count->length_bits));
+              write_bits(1,1);
+              write_bits(length,count->length_bits);
+            }
+            else
+            {
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_ENDSPACE not more than min_space, "
+                          "bits: 1"));
+              write_bits(0,1);
+              pos=end_pos;
+            }
+          }
+          else
+          {
+            DBUG_PRINT("fields",
+                       ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
+                        length, field_length, count->length_bits));
+            write_bits(length,count->length_bits);
+          }
+          /* Encode all significant bytes. */
+          DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
+                                (ulong) (pos - start_pos)));
+          for ( ; start_pos < pos ; start_pos++)
+          {
+            DBUG_PRINT("fields",
+                       ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
+                        (uchar) *start_pos,
+                        hexdigits(tree->code[(uchar) *start_pos]),
+                        (uint) tree->code_len[(uchar) *start_pos],
+                        bindigits(tree->code[(uchar) *start_pos],
+                                  (uint) tree->code_len[(uchar) *start_pos])));
+            write_bits(tree->code[(uchar) *start_pos],
+                       (uint) tree->code_len[(uchar) *start_pos]);
+          }
+          start_pos=end_pos;
+          break;
+        case FIELD_SKIP_PRESPACE:
+          for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
+          length= (ulong) (pos - start_pos);
+          if (count->pack_type & PACK_TYPE_SELECTED)
+          {
+            if (length > count->min_space)
+            {
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_PRESPACE more than min_space, bits: 1"));
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
+                          length, field_length, count->length_bits));
+              write_bits(1,1);
+              write_bits(length,count->length_bits);
+            }
+            else
+            {
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_PRESPACE not more than min_space, "
+                          "bits: 1"));
+              pos=start_pos;
+              write_bits(0,1);
+            }
+          }
+          else
+          {
+            DBUG_PRINT("fields",
+                       ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
+                        length, field_length, count->length_bits));
+            write_bits(length,count->length_bits);
+          }
+          /* Encode all significant bytes. */
+          DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
+                                (ulong) (end_pos - start_pos)));
+          for (start_pos=pos ; start_pos < end_pos ; start_pos++)
+          {
+            DBUG_PRINT("fields",
+                       ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
+                        (uchar) *start_pos,
+                        hexdigits(tree->code[(uchar) *start_pos]),
+                        (uint) tree->code_len[(uchar) *start_pos],
+                        bindigits(tree->code[(uchar) *start_pos],
+                                  (uint) tree->code_len[(uchar) *start_pos])));
+            write_bits(tree->code[(uchar) *start_pos],
+                       (uint) tree->code_len[(uchar) *start_pos]);
+          }
+          break;
+        case FIELD_CONSTANT:
+        case FIELD_ZERO:
+        case FIELD_CHECK:
+          DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
+          start_pos=end_pos;
+          break;
+        case FIELD_INTERVALL:
+          global_count=count;
+          pos=(uchar*) tree_search(&count->int_tree, start_pos,
+                                   count->int_tree.custom_arg);
+          intervall=(uint) (pos - count->tree_buff)/field_length;
+          DBUG_PRINT("fields", ("FIELD_INTERVALL"));
+          DBUG_PRINT("fields", ("index: %4u code: 0x%s bits: %2u",
+                                intervall, hexdigits(tree->code[intervall]),
+                                (uint) tree->code_len[intervall]));
+          write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
+          start_pos=end_pos;
+          break;
+        case FIELD_BLOB:
+        {
+          ulong blob_length= _ma_calc_blob_length(field_length-
+                                                  portable_sizeof_char_ptr,
+                                                  start_pos);
+          /* Empty blobs are encoded with a single 1 bit. */
+          if (!blob_length)
+          {
+            DBUG_PRINT("fields", ("FIELD_BLOB empty, bits: 1"));
+            write_bits(1,1);
+          }
+          else
+          {
+            uchar *blob,*blob_end;
+            DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits: 1"));
+            write_bits(0,1);
+            /* Write the blob length. */
+            DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
+                                  blob_length, count->length_bits));
+            write_bits(blob_length,count->length_bits);
+            memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
+                         sizeof(char*));
+            blob_end=blob+blob_length;
+            /* Encode the blob bytes. */
+            for ( ; blob < blob_end ; blob++)
+            {
+              /* Fixed: trace the code of *blob, not of *start_pos. */
+              DBUG_PRINT("fields",
+                         ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
+                          (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
+                          (uint) tree->code_len[(uchar) *blob],
+                          bindigits(tree->code[(uchar) *blob],
+                                    (uint) tree->code_len[(uchar) *blob])));
+              write_bits(tree->code[(uchar) *blob],
+                         (uint) tree->code_len[(uchar) *blob]);
+            }
+            tot_blob_length+=blob_length;
+          }
+          start_pos= end_pos;
+          break;
+        }
+        case FIELD_VARCHAR:
+        {
+          uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
+          ulong col_length= (var_pack_length == 1 ?
+                             (uint) *(uchar*) start_pos :
+                             uint2korr(start_pos));
+          /* Empty varchar are encoded with a single 1 bit. */
+          if (!col_length)
+          {
+            DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits: 1"));
+            write_bits(1,1);                    /* Empty varchar */
+          }
+          else
+          {
+            uchar *end= start_pos + var_pack_length + col_length;
+            DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits: 1"));
+            write_bits(0,1);
+            /* Write the varchar length. */
+            DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
+                                  col_length, count->length_bits));
+            write_bits(col_length,count->length_bits);
+            /* Encode the varchar bytes. */
+            for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
+            {
+              DBUG_PRINT("fields",
+                         ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
+                          (uchar) *start_pos,
+                          hexdigits(tree->code[(uchar) *start_pos]),
+                          (uint) tree->code_len[(uchar) *start_pos],
+                          bindigits(tree->code[(uchar) *start_pos],
+                                    (uint)tree->code_len[(uchar) *start_pos])));
+              write_bits(tree->code[(uchar) *start_pos],
+                         (uint) tree->code_len[(uchar) *start_pos]);
+            }
+          }
+          start_pos= end_pos;
+          break;
+        }
+        case FIELD_LAST:
+        case FIELD_enum_val_count:
+          abort();                              /* Impossible */
+        }
+        start_pos+=count->max_zero_fill;
+        DBUG_PRINT("fields", ("---"));
+      }
+      flush_bits();
+      length=(ulong) (file_buffer.pos - record_pos) - max_pack_length;
+      pack_length= _ma_save_pack_length(pack_version, record_pos, length);
+      if (pack_blob_length)
+        pack_length+= _ma_save_pack_length(pack_version,
+                                           record_pos + pack_length,
+                                           tot_blob_length);
+      DBUG_PRINT("fields", ("record: %lu length: %lu blob-length: %lu "
+                            "length-bytes: %lu", (ulong) record_count, length,
+                            tot_blob_length, pack_length));
+      DBUG_PRINT("fields", ("==="));
+
+      /* Correct file buffer if the header was smaller */
+      if (pack_length != max_pack_length)
+      {
+        bmove(record_pos+pack_length,record_pos+max_pack_length,length);
+        file_buffer.pos-= (max_pack_length-pack_length);
+      }
+      if (length < (ulong) min_record_length)
+        min_record_length=(uint) length;
+      if (length > (ulong) max_record_length)
+        max_record_length=(uint) length;
+      record_count++;
+      if (write_loop && record_count % WRITE_COUNT == 0)
+      {
+        VOID(printf("%lu\r", (ulong) record_count));
+        VOID(fflush(stdout));
+      }
+    }
+    else if (error != HA_ERR_RECORD_DELETED)
+      break;
+  }
+  if (error == HA_ERR_END_OF_FILE)
+    error=0;
+  else
+  {
+    VOID(fprintf(stderr, "%s: Got error %d reading records\n",
+                 my_progname, error));
+  }
+  if (verbose >= 2)
+    VOID(printf("wrote %s records.\n", llstr((longlong) record_count, llbuf)));
+
+  my_afree(record);
+  mrg->ref_length=max_pack_length;
+  mrg->min_pack_length=max_record_length ? min_record_length : 0;
+  mrg->max_pack_length=max_record_length;
+  DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
+}
+
+
+/* Build the temporary data file name: old name with DATA_TMP_EXT. */
+static char *make_new_name(char *new_name, char *old_name)
+{
+  char *result;
+
+  /* Flags 2+4: presumably MY_REPLACE_EXT | MY_UNPACK_FILENAME — confirm. */
+  result= fn_format(new_name, old_name, "", DATA_TMP_EXT, 2 + 4);
+  return result;
+}
+
+/* Build the backup file name: old name with OLD_EXT. */
+static char *make_old_name(char *new_name, char *old_name)
+{
+  char *result;
+
+  /* Flags 2+4: presumably MY_REPLACE_EXT | MY_UNPACK_FILENAME — confirm. */
+  result= fn_format(new_name, old_name, "", OLD_EXT, 2 + 4);
+  return result;
+}
+
+  /* Routines for the bit writing buffer */
+
+static void init_file_buffer(File file, pbool read_buffer)
+{
+  uint buff_size= ALIGN_SIZE(RECORD_CACHE_SIZE);
+
+  file_buffer.file=file;
+  file_buffer.buffer= (uchar*) my_malloc(buff_size, MYF(MY_WME));
+  /* 'end' leaves 8 bytes of slack; see the comment in flush_buffer(). */
+  file_buffer.end= file_buffer.buffer + buff_size - 8;
+  file_buffer.pos_in_file=0;
+  error_on_write=0;
+  /* Reading starts on an exhausted buffer, writing on an empty one. */
+  file_buffer.pos= read_buffer ? file_buffer.end : file_buffer.buffer;
+  file_buffer.bits= read_buffer ? 0 : BITS_SAVED;
+  file_buffer.bitbucket= 0;
+}
+
+
+/*
+  Flush the write buffer to disk if it cannot take `neaded_length' more
+  bytes, growing the buffer afterwards when it is too small.
+
+  Passing ~(ulong) 0 forces an unconditional flush and skips the grow
+  step.  When `test_only' is set nothing is written, but the buffer is
+  still logically emptied.
+
+  RETURN  0 ok (or nothing to do), 1 on write/realloc failure
+          (error_on_write is also set and stays set on write failure).
+*/
+static int flush_buffer(ulong neaded_length)
+{
+  ulong length;
+
+  /*
+    file_buffer.end is 8 bytes lower than the real end of the buffer.
+    This is done so that the end-of-buffer condition does not need to be
+    checked for every uchar (see write_bits()). Consequently,
+    file_buffer.pos can become greater than file_buffer.end. The
+    algorithms in the other functions ensure that there will never be
+    more than 8 bytes written to the buffer without an end-of-buffer
+    check. So the buffer cannot be overrun. But we need to check for the
+    near-to-buffer-end condition to avoid a negative result, which is
+    casted to unsigned and thus becomes giant.
+  */
+  if ((file_buffer.pos < file_buffer.end) &&
+      ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
+    return 0;
+  length=(ulong) (file_buffer.pos-file_buffer.buffer);
+  file_buffer.pos=file_buffer.buffer;
+  file_buffer.pos_in_file+=length;
+  if (test_only)
+    return 0;
+  if (error_on_write|| my_write(file_buffer.file,
+                                (const uchar*) file_buffer.buffer,
+                                length,
+                                MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
+  {
+    error_on_write=1;
+    return 1;
+  }
+
+  /* Grow the (now empty) buffer if the caller needs more room than it has. */
+  if (neaded_length != ~(ulong) 0 &&
+      (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
+  {
+    uchar *tmp;
+    neaded_length+=256; /* some margin */
+    tmp= (uchar*) my_realloc(file_buffer.buffer, neaded_length,MYF(MY_WME));
+    if (!tmp)
+      return 1;
+    file_buffer.pos= (tmp + (ulong) (file_buffer.pos - file_buffer.buffer));
+    file_buffer.buffer= tmp;
+    file_buffer.end= (tmp+neaded_length-8);
+  }
+  return 0;
+}
+
+
+/* Release the buffer allocated by init_file_buffer(). */
+static void end_file_buffer(void)
+{
+  my_free(file_buffer.buffer, MYF(0));
+}
+
+ /* Output the `bits` lowest bits of `value` */
+
+/*
+  Append the `bits' lowest bits of `value' to the output bit stream.
+
+  Bits are collected in file_buffer.bitbucket, most significant end
+  first.  When the bucket cannot hold all new bits, BITS_SAVED bits are
+  emitted as bytes and the leftover low bits of `value' start the next
+  bucket.
+*/
+static void write_bits(register ulonglong value, register uint bits)
+{
+  /* All bits above the requested count must already be zero. */
+  DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
+              (bits == 8 * sizeof(value)));
+
+  if ((file_buffer.bits-= (int) bits) >= 0)
+  {
+    /* Everything fits into the current bucket. */
+    file_buffer.bitbucket|= value << file_buffer.bits;
+  }
+  else
+  {
+    reg3 ulonglong bit_buffer;
+    /* Number of bits that did not fit into the bucket. */
+    bits= (uint) -file_buffer.bits;
+    bit_buffer= (file_buffer.bitbucket |
+                 ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
+#if BITS_SAVED == 64
+    *file_buffer.pos++= (uchar) (bit_buffer >> 56);
+    *file_buffer.pos++= (uchar) (bit_buffer >> 48);
+    *file_buffer.pos++= (uchar) (bit_buffer >> 40);
+    *file_buffer.pos++= (uchar) (bit_buffer >> 32);
+#endif
+    *file_buffer.pos++= (uchar) (bit_buffer >> 24);
+    *file_buffer.pos++= (uchar) (bit_buffer >> 16);
+    *file_buffer.pos++= (uchar) (bit_buffer >> 8);
+    *file_buffer.pos++= (uchar) (bit_buffer);
+
+    /* Keep only the bits that still have to be stored. */
+    if (bits != 8 * sizeof(value))
+      value&= (((ulonglong) 1) << bits) - 1;
+    if (file_buffer.pos >= file_buffer.end)
+      VOID(flush_buffer(~ (ulong) 0));
+    file_buffer.bits=(int) (BITS_SAVED - bits);
+    file_buffer.bitbucket= value << (BITS_SAVED - bits);
+  }
+  return;
+}
+
+ /* Flush bits in bit_buffer to buffer */
+
+/*
+  Write out the bits currently held in the bit bucket, rounded up to
+  whole bytes (the last partial byte is zero-padded), and reset the
+  bucket to empty (BITS_SAVED free bits).
+*/
+static void flush_bits(void)
+{
+  int bits;
+  ulonglong bit_buffer;
+
+  /* Round the free-bit count down to a whole number of bytes. */
+  bits= file_buffer.bits & ~7;
+  bit_buffer= file_buffer.bitbucket >> bits;
+  bits= BITS_SAVED - bits;
+  while (bits > 0)
+  {
+    bits-= 8;
+    *file_buffer.pos++= (uchar) (bit_buffer >> bits);
+  }
+  if (file_buffer.pos >= file_buffer.end)
+    VOID(flush_buffer(~ (ulong) 0));
+  file_buffer.bits= BITS_SAVED;
+  file_buffer.bitbucket= 0;
+}
+
+
+/****************************************************************************
+** functions to handle the joined files
+****************************************************************************/
+
+/*
+  Update the index-file state of a single freshly packed table.
+
+  Marks the table as compressed/read-only, records the new data file
+  length and checksum, disables all keys (to be rebuilt with
+  "aria_chk -rq") and truncates the index file down to the key start.
+
+  RETURN  result of _ma_state_info_write_sub() (0 ok, != 0 error).
+*/
+static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
+                      my_off_t new_length,
+                      ha_checksum crc)
+{
+  MARIA_SHARE *share=isam_file->s;
+  uint options=mi_uint2korr(share->state.header.options);
+  uint key;
+  DBUG_ENTER("save_state");
+
+  options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
+  mi_int2store(share->state.header.options,options);
+  /* Save the original file type in case we have to undo the packing later */
+  share->state.header.org_data_file_type= share->state.header.data_file_type;
+  share->state.header.data_file_type= COMPRESSED_RECORD;
+
+  share->state.state.data_file_length=new_length;
+  share->state.state.del=0;
+  share->state.state.empty=0;
+  share->state.dellink= HA_OFFSET_ERROR;
+  share->state.split=(ha_rows) mrg->records;
+  share->state.version=(ulong) time((time_t*) 0);
+  if (share->base.born_transactional)
+    share->state.create_rename_lsn= share->state.is_of_horizon=
+      share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
+  if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
+  {
+    /*
+      Some indexes are disabled, cannot use current key_file_length value
+      as an estimate of upper bound of index file size. Use packed data file
+      size instead.
+    */
+    share->state.state.key_file_length= new_length;
+  }
+  /*
+    If there are no disabled indexes, keep key_file_length value from
+    original file so "aria_chk -rq" can use this value (this is necessary
+    because index size cannot be easily calculated for fulltext keys)
+  */
+  maria_clear_all_keys_active(share->state.key_map);
+  for (key=0 ; key < share->base.keys ; key++)
+    share->state.key_root[key]= HA_OFFSET_ERROR;
+  share->state.key_del= HA_OFFSET_ERROR;
+  share->state.state.checksum= crc;		/* Save crc in file */
+  share->changed=1;			/* Force write of header */
+  share->state.open_count=0;
+  share->global_changed=0;
+  /* Truncate the index file: the keys were invalidated above. */
+  VOID(my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0)));
+  if (share->base.keys)
+    isamchk_neaded=1;
+  DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file,
+                                       &share->state,
+                                       MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                       MA_STATE_INFO_WRITE_FULL_INFO));
+}
+
+
+/*
+  Write a combined state for a set of merged (joined) packed files.
+
+  Like save_state(), but builds the state in a local copy taken from the
+  first source table and writes it to the destination index file `file'.
+
+  RETURN  result of _ma_state_info_write_sub() (0 ok, != 0 error).
+*/
+static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
+                          ha_checksum crc)
+{
+  MARIA_STATE_INFO state;
+  MARIA_HA *isam_file=mrg->file[0];
+  uint options;
+  DBUG_ENTER("save_state_mrg");
+
+  state= isam_file->s->state;
+  options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
+            HA_OPTION_READ_ONLY_DATA);
+  mi_int2store(state.header.options,options);
+  /* Save the original file type in case we have to undo the packing later */
+  state.header.org_data_file_type= state.header.data_file_type;
+  state.header.data_file_type= COMPRESSED_RECORD;
+
+  state.state.data_file_length=new_length;
+  state.state.del=0;
+  state.state.empty=0;
+  state.state.records=state.split=(ha_rows) mrg->records;
+  state.create_rename_lsn= state.is_of_horizon= state.skip_redo_lsn=
+    LSN_NEEDS_NEW_STATE_LSNS;
+
+  /* See comment above in save_state about key_file_length handling. */
+  if (mrg->src_file_has_indexes_disabled)
+  {
+    isam_file->s->state.state.key_file_length=
+      max(isam_file->s->state.state.key_file_length, new_length);
+  }
+  state.dellink= HA_OFFSET_ERROR;
+  state.version=(ulong) time((time_t*) 0);
+  maria_clear_all_keys_active(state.key_map);
+  state.state.checksum=crc;
+  if (isam_file->s->base.keys)
+    isamchk_neaded=1;
+  state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
+  DBUG_RETURN (_ma_state_info_write_sub(file, &state,
+                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                        MA_STATE_INFO_WRITE_FULL_INFO));
+}
+
+
+/* Reset scanning state before a new sequence of mrg_rrnd() calls */
+
+static void mrg_reset(PACK_MRG_INFO *mrg)
+{
+  /* Nothing to do unless a table scan is in progress. */
+  if (!mrg->current)
+    return;
+  maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
+  mrg->current= 0;
+}
+
+/*
+  Read the next record sequentially from the set of joined files.
+
+  Scans the current table; on HA_ERR_END_OF_FILE it switches to the next
+  table in info->file[] until all tables are exhausted.  Deleted-record
+  skipping is handled by maria_scan() itself.
+
+  Fix: removed the local `filepos', which was assigned
+  isam_info->s->pack.header_length but never read — a dead store left
+  over from the positional mi_rrnd() interface.
+
+  RETURN  0 ok (record in buf), HA_ERR_END_OF_FILE when all tables are
+          done, or another handler error code.
+*/
+static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
+{
+  int error;
+  MARIA_HA *isam_info;
+
+  if (!info->current)
+  {
+    /* First call: start scanning the first table. */
+    isam_info= *(info->current=info->file);
+    info->end=info->current+info->count;
+    maria_reset(isam_info);
+    maria_extra(isam_info, HA_EXTRA_CACHE, 0);
+    if ((error= maria_scan_init(isam_info)))
+      return(error);
+  }
+  else
+    isam_info= *info->current;
+
+  for (;;)
+  {
+    if (!(error= maria_scan(isam_info, buf)) ||
+        error != HA_ERR_END_OF_FILE)
+      return (error);
+    /* Current table exhausted; advance to the next one, if any. */
+    maria_scan_end(isam_info);
+    maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
+    if (info->current+1 == info->end)
+      return(HA_ERR_END_OF_FILE);
+    info->current++;
+    isam_info= *info->current;
+    maria_reset(isam_info);
+    maria_extra(isam_info,HA_EXTRA_CACHE, 0);
+    if ((error= maria_scan_init(isam_info)))
+      return(error);
+  }
+}
+
+
+/* Close every table of the merge set; OR together the close results. */
+static int mrg_close(PACK_MRG_INFO *mrg)
+{
+  uint idx;
+  int result= 0;
+  DBUG_ENTER("mrg_close");
+
+  for (idx= 0; idx < mrg->count; idx++)
+    result|= maria_close(mrg->file[idx]);
+  if (mrg->free_file)
+    my_free(mrg->file, MYF(0));
+  DBUG_RETURN(result);
+}
+
+
+#if !defined(DBUG_OFF)
+/*
+ Fake the counts to get big Huffman codes.
+
+ SYNOPSIS
+ fakebigcodes()
+ huff_counts A pointer to the counts array.
+ end_count A pointer past the counts array.
+
+ DESCRIPTION
+
+ Huffman coding works by removing the two least frequent values from
+ the list of values and add a new value with the sum of their
+ incidences in a loop until only one value is left. Every time a
+ value is reused for a new value, it gets one more bit for its
+ encoding. Hence, the least frequent values get the longest codes.
+
+ To get a maximum code length for a value, two of the values must
+ have an incidence of 1. As their sum is 2, the next infrequent value
+ must have at least an incidence of 2, then 4, 8, 16 and so on. This
+ means that one needs 2**n bytes (values) for a code length of n
+ bits. However, using more distinct values forces the use of longer
+ codes, or reaching the code length with less total bytes (values).
+
+ To get 64(32)-bit codes, I sort the counts by decreasing incidence.
+ I assign counts of 1 to the two most frequent values, a count of 2
+ for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
+ the remaining values get 1. That way every possible uchar has an
+ assigned code, though not all codes are used if not all uchar values
+ are present in the column.
+
+ This strategy would work with distinct column values too, but
+ requires that at least 64(32) values are present. To make things
+ easier here, I cancel all distinct column values and force byte
+ compression for all columns.
+
+ RETURN
+ void
+*/
+
+static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
+{
+  HUFF_COUNTS *count;
+  my_off_t *cur_count_p;
+  my_off_t *end_count_p;
+  my_off_t **cur_sort_p;
+  my_off_t **end_sort_p;
+  my_off_t *sort_counts[256];
+  my_off_t total;
+  DBUG_ENTER("fakebigcodes");
+
+  for (count= huff_counts; count < end_count; count++)
+  {
+    /*
+      Remove distinct column values.
+      Bug fix: this used to test/free huff_counts->tree_buff, i.e. only
+      the first column, so the distinct-value trees of all remaining
+      columns were never cancelled — contradicting the description above
+      ("I cancel all distinct column values").  Use the loop cursor.
+    */
+    if (count->tree_buff)
+    {
+      my_free(count->tree_buff, MYF(0));
+      delete_tree(&count->int_tree);
+      count->tree_buff= NULL;
+      DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
+    }
+
+    /*
+      Sort counts by decreasing incidence.
+    */
+    cur_count_p= count->counts;
+    end_count_p= cur_count_p + 256;
+    cur_sort_p= sort_counts;
+    while (cur_count_p < end_count_p)
+      *(cur_sort_p++)= cur_count_p++;
+    (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
+
+    /*
+      Assign faked counts: 1, 1, 2, 4, 8, ... so that the least frequent
+      values end up with maximum-length Huffman codes.
+    */
+    cur_sort_p= sort_counts;
+#if SIZEOF_LONG_LONG > 4
+    end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
+#else
+    end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
+#endif
+    /* Most frequent value gets a faked count of 1. */
+    **(cur_sort_p++)= 1;
+    total= 1;
+    while (cur_sort_p < end_sort_p)
+    {
+      **(cur_sort_p++)= total;
+      total<<= 1;
+    }
+    /* Set the last value. */
+    **(cur_sort_p++)= --total;
+    /*
+      Set the remaining counts.
+    */
+    end_sort_p= sort_counts + 256;
+    while (cur_sort_p < end_sort_p)
+      **(cur_sort_p++)= 1;
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+ Compare two counts for reverse sorting.
+
+ SYNOPSIS
+ fakecmp()
+ count1 One count.
+ count2 Another count.
+
+ RETURN
+ 1 count1 < count2
+ 0 count1 == count2
+ -1 count1 > count2
+*/
+
+/* qsort comparator: descending order of pointed-to counts. */
+static int fakecmp(my_off_t **count1, my_off_t **count2)
+{
+  if (**count1 < **count2)
+    return 1;
+  if (**count1 > **count2)
+    return -1;
+  return 0;
+}
+#endif
diff --git a/storage/maria/maria_read_log.c b/storage/maria/maria_read_log.c
new file mode 100644
index 00000000000..de45eb0bcb6
--- /dev/null
+++ b/storage/maria/maria_read_log.c
@@ -0,0 +1,308 @@
+/* Copyright (C) 2007 MySQL AB
+ Copyright (C) 2010 Monty Program Ab
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "maria_def.h"
+#include "ma_recovery.h"
+#include <my_getopt.h>
+
+#define LOG_FLAGS 0
+
+static const char *load_default_groups[]= { "aria_read_log",0 };
+static void get_options(int *argc,char * * *argv);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+const char *default_dbug_option= "d:t:O,\\aria_read_log.trace";
+#else
+const char *default_dbug_option= "d:t:o,/tmp/aria_read_log.trace";
+#endif
+#endif /* DBUG_OFF */
+static my_bool opt_display_only, opt_apply, opt_apply_undo, opt_silent;
+static my_bool opt_check;
+static const char *opt_tmpdir;
+static ulong opt_page_buffer_size;
+static ulonglong opt_start_from_lsn, opt_end_lsn, opt_start_from_checkpoint;
+static MY_TMPDIR maria_chk_tmpdir;
+
+
+/*
+  aria_read_log main program: display and/or apply records from an Aria
+  transaction log found under maria_data_root (default: current
+  directory).  Exits with 0 on success, 1 on failure.
+*/
+int main(int argc, char **argv)
+{
+  LSN lsn;
+  char **default_argv;
+  uint warnings_count;
+  MY_INIT(argv[0]);
+
+  load_defaults("my", load_default_groups, &argc, &argv);
+  default_argv= argv;
+  maria_data_root= (char *)".";
+  get_options(&argc, &argv);
+
+  maria_in_recovery= TRUE;
+
+  if (maria_init())
+  {
+    fprintf(stderr, "Can't init Aria engine (%d)\n", errno);
+    goto err;
+  }
+  maria_block_size= 0;                          /* Use block size from file */
+  /* we don't want to create a control file, it MUST exist */
+  if (ma_control_file_open(FALSE, TRUE))
+  {
+    fprintf(stderr, "Can't open control file (%d)\n", errno);
+    goto err;
+  }
+  if (last_logno == FILENO_IMPOSSIBLE)
+  {
+    fprintf(stderr, "Can't find any log\n");
+    goto err;
+  }
+  if (init_pagecache(maria_pagecache, opt_page_buffer_size, 0, 0,
+                     maria_block_size, MY_WME) == 0)
+  {
+    fprintf(stderr, "Got error in init_pagecache() (errno: %d)\n", errno);
+    goto err;
+  }
+  /*
+    If log handler does not find the "last_logno" log it will return error,
+    which is good.
+    But if it finds a log and this log was crashed, it will create a new log,
+    which is useless. TODO: start log handler in read-only mode.
+  */
+  if (init_pagecache(maria_log_pagecache,
+                     TRANSLOG_PAGECACHE_SIZE, 0, 0,
+                     TRANSLOG_PAGE_SIZE, MY_WME) == 0 ||
+      translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
+                    0, 0, maria_log_pagecache, TRANSLOG_DEFAULT_FLAGS,
+                    opt_display_only))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    goto err;
+  }
+
+  if (opt_display_only)
+    printf("You are using --display-only, NOTHING will be written to disk\n");
+
+  lsn= translog_first_lsn_in_log();
+  if (lsn == LSN_ERROR)
+  {
+    fprintf(stderr, "Opening transaction log failed\n");
+    goto end;
+  }
+  if (lsn == LSN_IMPOSSIBLE)
+  {
+    fprintf(stdout, "The transaction log is empty\n");
+  }
+  /* NOTE(review): on an empty log execution falls through with
+     lsn == LSN_IMPOSSIBLE; confirm maria_apply_log() treats that as
+     "nothing to do". */
+  if (opt_start_from_checkpoint && !opt_start_from_lsn &&
+      last_checkpoint_lsn != LSN_IMPOSSIBLE)
+  {
+    lsn= LSN_IMPOSSIBLE;             /* LSN set in maria_apply_log() */
+    fprintf(stdout, "Starting from checkpoint (%lu,0x%lx)\n",
+            LSN_IN_PARTS(last_checkpoint_lsn));
+  }
+  else
+    fprintf(stdout, "The transaction log starts from lsn (%lu,0x%lx)\n",
+            LSN_IN_PARTS(lsn));
+
+  if (opt_start_from_lsn)
+  {
+    if (opt_start_from_lsn < (ulonglong) lsn)
+    {
+      fprintf(stderr, "start_from_lsn is too small. Aborting\n");
+      maria_end();
+      goto err;
+    }
+    lsn= (LSN) opt_start_from_lsn;
+    fprintf(stdout, "Starting reading log from lsn (%lu,0x%lx)\n",
+            LSN_IN_PARTS(lsn));
+  }
+
+  if (opt_end_lsn != LSN_IMPOSSIBLE)
+  {
+    /* We can't apply undo if we use end_lsn */
+    opt_apply_undo= 0;
+  }
+
+  fprintf(stdout, "TRACE of the last aria_read_log\n");
+  if (maria_apply_log(lsn, opt_end_lsn, opt_apply ?  MARIA_LOG_APPLY :
+                      (opt_check ? MARIA_LOG_CHECK :
+                       MARIA_LOG_DISPLAY_HEADER), opt_silent ? NULL : stdout,
+                      opt_apply_undo, FALSE, FALSE, &warnings_count))
+    goto err;
+  if (warnings_count == 0)
+    fprintf(stdout, "%s: SUCCESS\n", my_progname_short);
+  else
+    fprintf(stdout, "%s: DOUBTFUL (%u warnings, check previous output)\n",
+            my_progname_short, warnings_count);
+
+end:
+  maria_end();
+  free_tmpdir(&maria_chk_tmpdir);
+  free_defaults(default_argv);
+  my_end(0);
+  exit(0);
+  return 0;				/* No compiler warning */
+
+err:
+  /* don't touch anything more, in case we hit a bug */
+  fprintf(stderr, "%s: FAILED\n", my_progname_short);
+  free_tmpdir(&maria_chk_tmpdir);
+  free_defaults(default_argv);
+  exit(1);
+}
+
+
+#include "ma_check_standalone.h"
+
+enum options_mc {
+ OPT_CHARSETS_DIR=256
+};
+
+/*
+  Command-line option table, processed by handle_options() via
+  get_options().  Boolean and value options are stored directly into the
+  static opt_* variables referenced here.
+*/
+static struct my_option my_long_options[] =
+{
+  {"apply", 'a',
+   "Apply log to tables: modifies tables! you should make a backup first! "
+   " Displays a lot of information if not run with --silent",
+   (uchar **) &opt_apply, (uchar **) &opt_apply, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"character-sets-dir", OPT_CHARSETS_DIR,
+   "Directory where character sets are.",
+   (char**) &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"check", 'c',
+   "if --display-only, check if record is fully readable (for debugging)",
+   (uchar **) &opt_check, (uchar **) &opt_check, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+  {"debug", '#', "Output debug log. Often the argument is 'd:t:o,filename'.",
+   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"help", '?', "Display this help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"display-only", 'd', "display brief info read from records' header",
+   &opt_display_only, &opt_display_only, 0, GET_BOOL,
+   NO_ARG,0, 0, 0, 0, 0, 0},
+  {"aria-log-dir-path", 'l',
+   "Path to the directory where to store transactional log",
+   (uchar **) &maria_data_root, (uchar **) &maria_data_root, 0,
+   GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  { "page-buffer-size", 'P', "",
+    &opt_page_buffer_size, &opt_page_buffer_size, 0,
+    GET_ULONG, REQUIRED_ARG, (long) USE_BUFFER_INIT,
+    (long) USE_BUFFER_INIT, (long) ~(ulong) 0, (long) MALLOC_OVERHEAD,
+    (long) IO_SIZE, 0},
+  { "start-from-lsn", 'o', "Start reading log from this lsn",
+    &opt_start_from_lsn, &opt_start_from_lsn,
+    0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 },
+  {"start-from-checkpoint", 'C', "Start applying from last checkpoint",
+   &opt_start_from_checkpoint, &opt_start_from_checkpoint, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { "end-lsn", 'e', "Stop applying at this lsn. If end-lsn is used, UNDO:s "
+    "will not be applied", &opt_end_lsn, &opt_end_lsn,
+    0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 },
+  {"silent", 's', "Print less information during apply/undo phase",
+   &opt_silent, &opt_silent, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"verbose", 'v', "Print more information during apply/undo phase",
+   &maria_recovery_verbose, &maria_recovery_verbose, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"tmpdir", 't', "Path for temporary files. Multiple paths can be specified, "
+   "separated by "
+#if defined( __WIN__) || defined(__NETWARE__)
+   "semicolon (;)"
+#else
+   "colon (:)"
+#endif
+   , (char**) &opt_tmpdir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"undo", 'u', "Apply UNDO records to tables. (disable with --disable-undo)",
+   (uchar **) &opt_apply_undo, (uchar **) &opt_apply_undo, 0,
+   GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0},
+  {"version", 'V', "Print version and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+#include <help_start.h>
+
+/* Print program name, version and platform, as in other Maria tools. */
+static void print_version(void)
+{
+  VOID(printf("%s Ver 1.3 for %s on %s\n",
+              my_progname_short, SYSTEM_TYPE, MACHINE_TYPE));
+  NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+/* Print version, copyright, usage notes and the option help text. */
+static void usage(void)
+{
+  print_version();
+  puts("Copyright (C) 2007 MySQL AB");
+  puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
+  puts("and you are welcome to modify and redistribute it under the GPL license\n");
+
+  puts("Display and apply log records from a Aria transaction log");
+  puts("found in the current directory (for now)");
+#ifndef IDENTICAL_PAGES_AFTER_RECOVERY
+  puts("\nNote: Aria is compiled without -DIDENTICAL_PAGES_AFTER_RECOVERY\n"
+       "which means that the table files are not byte-to-byte identical to\n"
+       "files created during normal execution. This should be ok, except for\n"
+       "test scripts that tries to compare files before and after recovery.");
+#endif
+  VOID(printf("\nUsage: %s OPTIONS\n", my_progname_short));
+  puts("You need to use one of -d or -a");
+  my_print_help(my_long_options);
+  print_defaults("my", load_default_groups);
+  my_print_variables(my_long_options);
+}
+
+#include <help_end.h>
+
+/*
+  Per-option callback for handle_options().  Only --help, --version and
+  --debug need special handling here; all other options are stored
+  directly through the my_long_options table.
+*/
+static my_bool
+get_one_option(int optid __attribute__((unused)),
+               const struct my_option *opt __attribute__((unused)),
+               char *argument __attribute__((unused)))
+{
+  switch (optid) {
+  case '?':
+    usage();
+    exit(0);
+  case 'V':
+    print_version();
+    exit(0);
+#ifndef DBUG_OFF
+  case '#':
+    DBUG_SET_INITIAL(argument ? argument : default_dbug_option);
+    break;
+#endif
+  }
+  return 0;
+}
+
+/*
+  Parse command-line options.  Exits on parse error, on an invalid
+  option combination (exactly one of --display-only / --apply must be
+  given, no extra arguments) or on tmpdir initialization failure.
+*/
+static void get_options(int *argc,char ***argv)
+{
+  int parse_error= handle_options(argc, argv, my_long_options,
+                                  get_one_option);
+  if (parse_error)
+    exit(parse_error);
+
+  /* --undo makes no sense without --apply. */
+  if (!opt_apply)
+    opt_apply_undo= FALSE;
+
+  if ((opt_display_only + opt_apply) != 1 || *argc > 0)
+  {
+    usage();
+    exit(1);
+  }
+  if (init_tmpdir(&maria_chk_tmpdir, opt_tmpdir))
+    exit(1);
+  maria_tmpdir= &maria_chk_tmpdir;
+}
diff --git a/storage/maria/maria_rename.sh b/storage/maria/maria_rename.sh
new file mode 100755
index 00000000000..fb20e47e635
--- /dev/null
+++ b/storage/maria/maria_rename.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+# One-shot helper used while forking Maria from MyISAM: rewrites MyISAM
+# identifiers to their Maria equivalents in tests, headers and sources
+# using the `replace' utility shipped with the server.
+
+replace myisam maria MYISAM MARIA MyISAM MARIA -- mysql-test/t/*maria*test mysql-test/r/*maria*result
+
+FILES=`echo sql/ha_maria.{cc,h} include/maria*h storage/maria/*.{c,h}`
+
+# Bulk rename of types, prefixes and file names in the engine sources.
+replace myisam maria MYISAM MARIA MyISAM MARIA myisam.h maria.h myisamdef.h maria_def.h mi_ maria_ ft_ maria_ft_ "Copyright (C) 2000" "Copyright (C) 2006" MI_ISAMINFO MARIA_INFO MI_CREATE_INFO MARIA_CREATE_INFO maria_isam_ maria_ MI_INFO MARIA_HA MI_ MARIA_ MARIACHK MARIA_CHK rt_index.h ma_rt_index.h rtree_ maria_rtree rt_key.h ma_rt_key.h rt_mbr.h ma_rt_mbr.h -- $FILES
+
+# Internal helper functions get the _ma_ prefix.
+replace check_table_is_closed _ma_check_table_is_closed test_if_reopen _ma_test_if_reopen my_n_base_info_read maria_n_base_info_read update_auto_increment _ma_update_auto_increment save_pack_length _ma_save_packlength calc_pack_length _ma_calc_pack_length -- $FILES
+
+replace mi_ ma_ ft_ ma_ft_ rt_ ma_rt_ myisam maria myisamchk maria_chk myisampack maria_pack myisamlog maria_log -- storage/maria/Makefile.am
+
+#
+# Restore wrong replaces
+#
+
+replace maria_sint1korr mi_sint1korr maria_uint1korr mi_uint1korr maria_sint2korr mi_sint2korr maria_sint3korr mi_sint3korr maria_sint4korr mi_sint4korr maria_sint8korr mi_sint8korr maria_uint2korr mi_uint2korr maria_uint3korr mi_uint3korr maria_uint4korr mi_uint4korr maria_uint5korr mi_uint5korr maria_uint6korr mi_uint6korr maria_uint7korr mi_uint7korr maria_uint8korr mi_uint8korr maria_int1store mi_int1store maria_int2store mi_int2store maria_int3store mi_int3store maria_int4store mi_int4store maria_int5store mi_int5store maria_int6store mi_int6store maria_int7store mi_int7store maria_int8store mi_int8store maria_float4store mi_float4store maria_float4get mi_float4get maria_float8store mi_float8store maria_float8get mi_float8get maria_rowstore mi_rowstore maria_rowkorr mi_rowkorr maria_sizestore mi_sizestore maria_sizekorr mi_sizekorr _maria_maria_ _maria MARIA_MAX_POSSIBLE_KEY HA_MAX_POSSIBLE_KEY MARIA_MAX_KEY_BUFF HA_MAX_KEY_BUFF MARIA_MAX_KEY_SEG HA_MAX_KEY_SEG maria_ft_sintXkorr ft_sintXkorr maria_ft_intXstore ft_intXstore maria_ft_boolean_syntax ft_boolean_syntax maria_ft_min_word_len ft_min_word_len maria_ft_max_word_len ft_max_word_len -- $FILES
diff --git a/storage/maria/plug.in b/storage/maria/plug.in
new file mode 100644
index 00000000000..008d82250c8
--- /dev/null
+++ b/storage/maria/plug.in
@@ -0,0 +1,19 @@
+dnl Build-system registration of the Aria storage engine plugin
+dnl (autoconf-era plug.in, consumed by config/ac-macros/plugins.m4).
+MYSQL_STORAGE_ENGINE(aria,, [Aria Storage Engine],
+        [Crash-safe tables with MyISAM heritage], [default,max,max-no-ndb])
+MYSQL_PLUGIN_DIRECTORY(aria,  [storage/maria])
+MYSQL_PLUGIN_STATIC(aria,     [libaria.a])
+MYSQL_PLUGIN_DEPENDS_ON_MYSQL_INTERNALS(aria, [ha_maria.cc])
+
+MYSQL_PLUGIN_ACTIONS(aria, [
+# AC_CONFIG_FILES(storage/maria/unittest/Makefile)
+AC_ARG_WITH(aria-tmp-tables,
+            AC_HELP_STRING([--with-aria-tmp-tables],[Use Aria for internal temporary tables]),
+            [with_aria_tmp_tables=$withval],
+            [with_aria_tmp_tables=yes]
+            )
+
+if test "$with_aria_tmp_tables" = "yes"
+then
+  AC_DEFINE([USE_MARIA_FOR_TMP_TABLES], [1], [Aria is used for internal temporary tables])
+fi
+])
diff --git a/storage/maria/tablockman.c b/storage/maria/tablockman.c
new file mode 100644
index 00000000000..1bb8889aaa7
--- /dev/null
+++ b/storage/maria/tablockman.c
@@ -0,0 +1,674 @@
+/* QQ: TODO - allocate everything from dynarrays !!! (benchmark) */
+/* QQ: automatically place S instead of LS if possible */
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include <my_base.h>
+#include <hash.h>
+#include "tablockman.h"
+
+/*
+ Lock Manager for Table Locks
+
+ The code below handles locks on resources - but it is optimized for a
+ case when a number of resources is not very large, and there are many of
+ locks per resource - that is a resource is likely to be a table or a
+ database, but hardly a row in a table.
+
+ Locks belong to "lock owners". A Lock Owner is uniquely identified by a
+ 16-bit number - loid (lock owner identifier). A function loid_to_tlo must
+ be provided by the application that takes such a number as an argument
+ and returns a TABLE_LOCK_OWNER structure.
+
+ Lock levels are completely defined by three tables. Lock compatibility
+ matrix specifies which locks can be held at the same time on a resource.
+ Lock combining matrix specifies what lock level has the same behaviour as
+ a pair of two locks of given levels. getlock_result matrix simplifies
+ intention locking and lock escalation for an application, basically it
+ defines which locks are intention locks and which locks are "loose"
+ locks. It is only used to provide better diagnostics for the
+ application, lock manager itself does not differentiate between normal,
+ intention, and loose locks.
+
+ The assumptions are: few distinct resources, many locks are held at the
+ same time on one resource. Thus: a lock structure _per resource_ can be
+ rather large; a lock structure _per lock_ does not need to be very small
+ either; we need to optimize for _speed_. Operations we need are: place a
+ lock, check if a particular transaction already has a lock on this
+ resource, check if a conflicting lock exists, if yes - find who owns it.
+
+ Solution: every resource has a structure with
+ 1. Hash of latest (see the lock upgrade section below) granted locks with
+ loid as a key. Thus, checking if a given transaction has a lock on
+ this resource is O(1) operation.
+ 2. Doubly-linked lists of all granted locks - one list for every lock
+ type. Thus, checking if a conflicting lock exists is a check whether
+ an appropriate list head pointer is not null, also O(1).
+ 3. Every lock has a loid of the owner, thus checking who owns a
+ conflicting lock is also O(1).
+  4. Deque of waiting locks. It's a deque (double-ended queue), not a fifo,
+     because for lock upgrades requests are added to the queue head, not
+     tail. This is the single place where it gets O(N) in the number
+     of locks - when a transaction wakes up from waiting on a condition,
+     it may need to scan the queue backward to the beginning to find
+     a conflicting lock. It is guaranteed though that "all transactions
+     before it" received the same - or an earlier - signal. In other words a
+     transaction needs to scan only the transactions before it that received
+     the signal but didn't have a chance to resume execution yet, so in
+     practice the OS scheduler won't let the scan be O(N).
+
+ Waiting: if there is a conflicting lock or if wait queue is not empty, a
+ requested lock cannot be granted at once. It is added to the end of the
+ wait queue. If a queue was empty and there is a conflicting lock - the
+ "blocker" transaction is the owner of this lock. If a queue is not empty,
+ an owner of the previous lock in the queue is the "blocker". But if the
+ previous lock is compatible with the request, then the "blocker" is the
+ transaction that the owner of the lock at the end of the queue is waiting
+ for (in other words, our lock is added to the end of the wait queue, and
+ our blocker is the same as of the lock right before us).
+
+ Lock upgrades: when a thread that has a lock on a given resource,
+ requests a new lock on the same resource and the old lock is not enough
+ to satisfy new lock requirements (which is defined by
+ lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock
+ (defined by lock_combining_matrix as above) is placed. Depending on
+ other granted locks it is immediately granted or it has to wait. Here the
+ lock is added to the start of the waiting queue, not to the end. Old
+ lock, is removed from the hash, but not from the doubly-linked lists.
+ (indeed, a transaction checks "do I have a lock on this resource ?" by
+ looking in a hash, and it should find a latest lock, so old locks must be
+ removed; but a transaction checks "are there conflicting locks ?" by
+ checking doubly-linked lists, it doesn't matter if it will find an old
+ lock - if it would be removed, a new lock would be also a conflict).
+ So, a hash contains only "latest" locks - there can be only one latest
+ lock per resource per transaction. But doubly-linked lists contain all
+  locks, even "obsolete" ones, because it doesn't hurt. Note that old
+ locks can not be freed early, in particular they stay in the
+ 'active_locks' list of a lock owner, because they may be "re-enabled"
+ on a savepoint rollback.
+
+ To better support table-row relations where one needs to lock the table
+ with an intention lock before locking the row, extended diagnostics is
+ provided. When an intention lock (presumably on a table) is granted,
+ lockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row,
+ perhaps the thread already has a normal lock on this table),
+ GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual),
+ GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check
+ whether it's possible to lock the row, but no need to lock it - perhaps
+ the thread has a loose lock on this table). This is defined by
+ getlock_result[] table.
+
+ Instant duration locks are not supported. Though they're trivial to add,
+ they are normally only used on rows, not on tables. So, presumably,
+ they are not needed here.
+
+ Mutexes: there're table mutexes (LOCKED_TABLE::mutex), lock owner mutexes
+ (TABLE_LOCK_OWNER::mutex), and a pool mutex (TABLOCKMAN::pool_mutex).
+ table mutex protects operations on the table lock structures, and lock
+ owner pointers waiting_for and waiting_for_loid.
+ lock owner mutex is only used to wait on lock owner condition
+ (TABLE_LOCK_OWNER::cond), there's no need to protect owner's lock
+ structures, and only lock owner itself may access them.
+ The pool mutex protects a pool of unused locks. Note the locking order:
+ first the table mutex, then the owner mutex or a pool mutex.
+  Table mutex lock cannot be attempted when owner or pool mutex are locked.
+  In fact, no mutex lock can be attempted if owner or pool mutex are locked.
+*/
+
+/*
+ Lock compatibility matrix.
+
+ It's asymmetric. Read it as "Somebody has the lock <value in the row
+ label>, can I set the lock <value in the column label> ?"
+
+ ') Though you can take LS lock while somebody has S lock, it makes no
+ sense - it's simpler to take S lock too.
+
+ 1 - compatible
+ 0 - incompatible
+ -1 - "impossible", so that we can assert the impossibility.
+*/
+static const int lock_compatibility_matrix[10][10]=
+{ /* N S X IS IX SIX LS LX SLX LSIX */
+ { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, /* N */
+ { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* S */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* X */
+ { -1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, /* IS */
+ { -1, 0, 0, 1, 1, 0, 1, 1, 0, 1 }, /* IX */
+ { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 }, /* SIX */
+ { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* LS */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* LX */
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* SLX */
+ { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 } /* LSIX */
+};
+
+/*
+ Lock combining matrix.
+
+ It's symmetric. Read it as "what lock level L is identical to the
+ set of two locks A and B"
+
+ One should never get N from it, we assert the impossibility
+*/
+static const enum lockman_lock_type lock_combining_matrix[10][10]=
+{/* N S X IS IX SIX LS LX SLX LSIX */
+ { N, N, N, N, N, N, N, N, N, N}, /* N */
+ { N, S, X, S, SIX, SIX, S, SLX, SLX, SIX}, /* S */
+ { N, X, X, X, X, X, X, X, X, X}, /* X */
+ { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX}, /* IS */
+ { N, SIX, X, IX, IX, SIX, LSIX, LX, SLX, LSIX}, /* IX */
+ { N, SIX, X, SIX, SIX, SIX, SIX, SLX, SLX, SIX}, /* SIX */
+ { N, S, X, LS, LSIX, SIX, LS, LX, SLX, LSIX}, /* LS */
+ { N, SLX, X, LX, LX, SLX, LX, LX, SLX, LX}, /* LX */
+ { N, SLX, X, SLX, SLX, SLX, SLX, SLX, SLX, SLX}, /* SLX */
+ { N, SIX, X, LSIX, LSIX, SIX, LSIX, LX, SLX, LSIX} /* LSIX */
+};
+
+/*
+ the return codes for lockman_getlock
+
+ It's asymmetric. Read it as "I have the lock <value in the row label>,
+ what value should be returned for <value in the column label> ?"
+
+ 0 means impossible combination (assert!)
+
+ Defines below help to preserve the table structure.
+ I/L/A values are self explanatory
+ x means the combination is possible (assert should not crash)
+ but it cannot happen in row locks, only in table locks (S,X),
+ or lock escalations (LS,LX)
+*/
+#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
+#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+#define A GOT_THE_LOCK
+#define x GOT_THE_LOCK
+static const enum lockman_getlock_result getlock_result[10][10]=
+{/* N S X IS IX SIX LS LX SLX LSIX */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* N */
+ { 0, x, 0, A, 0, 0, x, 0, 0, 0}, /* S */
+ { 0, x, x, A, A, 0, x, x, 0, 0}, /* X */
+ { 0, 0, 0, I, 0, 0, 0, 0, 0, 0}, /* IS */
+ { 0, 0, 0, I, I, 0, 0, 0, 0, 0}, /* IX */
+ { 0, x, 0, A, I, 0, x, 0, 0, 0}, /* SIX */
+ { 0, 0, 0, L, 0, 0, x, 0, 0, 0}, /* LS */
+ { 0, 0, 0, L, L, 0, x, x, 0, 0}, /* LX */
+ { 0, x, 0, A, L, 0, x, x, 0, 0}, /* SLX */
+ { 0, 0, 0, L, I, 0, x, 0, 0, 0} /* LSIX */
+};
+#undef I
+#undef L
+#undef A
+#undef x
+
+/*
+ this structure is optimized for a case when there're many locks
+ on the same resource - e.g. a table
+*/
+
+struct st_table_lock {
+  /* QQ: do we need upgraded_from ? */
+  /*
+    next_in_lo     - single-linked list of all locks of one owner
+                     (TABLE_LOCK_OWNER::active_locks)
+    upgraded_from  - the old lock this one was upgraded from, if any
+    next, prev     - doubly-linked list: either the per-type active_locks
+                     list or the table's wait deque
+  */
+  struct st_table_lock *next_in_lo, *upgraded_from, *next, *prev;
+  struct st_locked_table *table;       /* the locked resource */
+  uint16 loid;                         /* lock owner identifier */
+  uchar lock_type;                     /* enum lockman_lock_type value */
+};
+
+#define hash_insert my_hash_insert /* for consistency :) */
+
+/*
+  Find the latest lock that owner 'loid' holds (or waits for) on 'table',
+  using the latest_locks hash. Returns NULL if the owner has no lock here.
+  The caller must hold table->mutex.
+*/
+static inline
+TABLE_LOCK *find_by_loid(LOCKED_TABLE *table, uint16 loid)
+{
+  return (TABLE_LOCK *)hash_search(& table->latest_locks,
+                                   (uchar *)& loid, sizeof(loid));
+}
+
+/*
+  Unlink 'lock' from the wait deque of 'table'.
+
+  The deque ends are: wait_queue_out (prev == 0, the oldest waiter) and
+  wait_queue_in (next == 0, the newest waiter). The caller must hold
+  table->mutex.
+*/
+static inline
+void remove_from_wait_queue(TABLE_LOCK *lock, LOCKED_TABLE *table)
+{
+  DBUG_ASSERT(table == lock->table);
+  if (lock->prev)
+  {
+    DBUG_ASSERT(table->wait_queue_out != lock);
+    lock->prev->next= lock->next;
+  }
+  else
+  {
+    /* no prev: we are the 'out' end of the deque */
+    DBUG_ASSERT(table->wait_queue_out == lock);
+    table->wait_queue_out= lock->next;
+  }
+  if (lock->next)
+  {
+    DBUG_ASSERT(table->wait_queue_in != lock);
+    lock->next->prev= lock->prev;
+  }
+  else
+  {
+    /* no next: we are the 'in' end of the deque */
+    DBUG_ASSERT(table->wait_queue_in == lock);
+    table->wait_queue_in= lock->prev;
+  }
+}
+
+/*
+ DESCRIPTION
+ tries to lock a resource 'table' with a lock level 'lock'.
+
+ RETURN
+ see enum lockman_getlock_result
+*/
+enum lockman_getlock_result
+tablockman_getlock(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo,
+                   LOCKED_TABLE *table, enum lockman_lock_type lock)
+{
+  TABLE_LOCK *old, *new, *blocker, *blocker2;
+  TABLE_LOCK_OWNER *wait_for;
+  struct timespec timeout;
+  enum lockman_lock_type new_lock;
+  enum lockman_getlock_result res;
+  int i;
+
+  /* an owner can have at most one pending lock request at a time */
+  DBUG_ASSERT(lo->waiting_lock == 0);
+  DBUG_ASSERT(lo->waiting_for == 0);
+  DBUG_ASSERT(lo->waiting_for_loid == 0);
+
+  pthread_mutex_lock(& table->mutex);
+  /* do we already have a lock on this resource ? */
+  old= find_by_loid(table, lo->loid);
+
+  /* calculate the level of the upgraded lock, if yes */
+  new_lock= old ? lock_combining_matrix[old->lock_type][lock] : lock;
+
+  /* and check if old lock is enough to satisfy the new request */
+  if (old && new_lock == old->lock_type)
+  {
+    /* yes */
+    res= getlock_result[old->lock_type][lock];
+    goto ret;
+  }
+
+  /* no, placing a new lock. first - take a free lock structure from the pool */
+  pthread_mutex_lock(& lm->pool_mutex);
+  new= lm->pool;
+  if (new)
+  {
+    lm->pool= new->next;
+    pthread_mutex_unlock(& lm->pool_mutex);
+  }
+  else
+  {
+    /* pool is empty - allocate; note the pool mutex is released first */
+    pthread_mutex_unlock(& lm->pool_mutex);
+    new= (TABLE_LOCK *)my_malloc(sizeof(*new), MYF(MY_WME));
+    if (unlikely(!new))
+    {
+      res= NO_MEMORY_FOR_LOCK;
+      goto ret;
+    }
+  }
+
+  new->loid= lo->loid;
+  new->lock_type= new_lock;
+  new->table= table;
+
+  /* and try to place it */
+  for (new->prev= table->wait_queue_in;;)
+  {
+    wait_for= 0;
+    if (!old)
+    {
+      /* not upgrading - a lock must be added to the _end_ of the wait queue */
+      for (blocker= new->prev; blocker && !wait_for; blocker= blocker->prev)
+      {
+        TABLE_LOCK_OWNER *tmp= lm->loid_to_tlo(blocker->loid);
+
+        /* find a blocking lock */
+        DBUG_ASSERT(table->wait_queue_out);
+        DBUG_ASSERT(table->wait_queue_in);
+        if (!lock_compatibility_matrix[blocker->lock_type][lock])
+        {
+          /* found! */
+          wait_for= tmp;
+          break;
+        }
+
+        /*
+          hmm, the lock before doesn't block us, let's look one step further.
+          the condition below means:
+
+          if we never waited on a condition yet
+          OR
+          the lock before ours (blocker) waits on a lock (blocker2) that is
+          present in the hash AND conflicts with 'blocker'
+
+          the condition after OR may fail if 'blocker2' was removed from
+          the hash, its signal woke us up, but 'blocker' itself didn't see
+          the signal yet.
+        */
+        if (!lo->waiting_lock ||
+            ((blocker2= find_by_loid(table, tmp->waiting_for_loid)) &&
+             !lock_compatibility_matrix[blocker2->lock_type]
+                                       [blocker->lock_type]))
+        {
+          /* but it's waiting for a real lock. we'll wait for the same lock */
+          wait_for= tmp->waiting_for;
+          /*
+            We don't really need tmp->waiting_for, as tmp->waiting_for_loid
+            is enough. waiting_for is just a local cache to avoid calling
+            loid_to_tlo().
+            But it's essential that tmp->waiting_for pointer can ONLY
+            be dereferenced if find_by_loid() above returns a non-null
+            pointer, because a TABLE_LOCK_OWNER object that it points to
+            may've been freed when we come here after a signal.
+            In particular tmp->waiting_for_loid cannot be replaced
+            with tmp->waiting_for->loid.
+          */
+          DBUG_ASSERT(wait_for == lm->loid_to_tlo(tmp->waiting_for_loid));
+          break;
+        }
+
+        /*
+          otherwise - a lock it's waiting for doesn't exist.
+          We've no choice but to scan the wait queue backwards, looking
+          for a conflicting lock or a lock waiting for a real lock.
+          QQ is there a way to avoid this scanning ?
+        */
+      }
+    }
+
+    if (wait_for == 0)
+    {
+      /* checking for compatibility with existing locks */
+      for (blocker= 0, i= 0; i < LOCK_TYPES; i++)
+      {
+        /* active_locks[i] holds locks of type i+1 (N has no list) */
+        if (table->active_locks[i] && !lock_compatibility_matrix[i+1][lock])
+        {
+          blocker= table->active_locks[i];
+          /* if the first lock in the list is our own - skip it */
+          if (blocker->loid == lo->loid)
+            blocker= blocker->next;
+          if (blocker) /* found a conflicting lock, need to wait */
+            break;
+        }
+      }
+      if (!blocker) /* free to go */
+        break;
+      wait_for= lm->loid_to_tlo(blocker->loid);
+    }
+
+    /* ok, we're here - the wait is inevitable */
+    lo->waiting_for= wait_for;
+    lo->waiting_for_loid= wait_for->loid;
+    if (!lo->waiting_lock) /* first iteration of the for() loop */
+    {
+      /* lock upgrade or new lock request ? */
+      if (old)
+      {
+        /* upgrade - add the lock to the _start_ of the wait queue */
+        new->prev= 0;
+        if ((new->next= table->wait_queue_out))
+          new->next->prev= new;
+        table->wait_queue_out= new;
+        if (!table->wait_queue_in)
+          table->wait_queue_in= table->wait_queue_out;
+      }
+      else
+      {
+        /* new lock - add the lock to the _end_ of the wait queue */
+        new->next= 0;
+        if ((new->prev= table->wait_queue_in))
+          new->prev->next= new;
+        table->wait_queue_in= new;
+        if (!table->wait_queue_out)
+          table->wait_queue_out= table->wait_queue_in;
+      }
+      lo->waiting_lock= new;
+
+      /*
+        absolute deadline, computed once so that repeated wakeups do not
+        extend the total wait.
+        NOTE(review): lock_timeout (uint, ms) * 1000000 may overflow 32-bit
+        unsigned arithmetic for timeouts above ~4294 ms - confirm the
+        argument type of set_timespec_nsec().
+      */
+      set_timespec_nsec(timeout,lm->lock_timeout * 1000000);
+
+    }
+
+    /*
+      prepare to wait.
+      we must lock blocker's mutex to wait on blocker's cond.
+      and we must release table's mutex.
+      note that blocker's mutex is locked _before_ table's mutex is released
+    */
+    pthread_mutex_lock(wait_for->mutex);
+    pthread_mutex_unlock(& table->mutex);
+
+    /* now really wait */
+    i= pthread_cond_timedwait(wait_for->cond, wait_for->mutex, & timeout);
+
+    pthread_mutex_unlock(wait_for->mutex);
+
+    if (i == ETIMEDOUT || i == ETIME)
+    {
+      /* we rely on the caller to rollback and release all locks */
+      res= LOCK_TIMEOUT;
+      goto ret2;
+    }
+
+    pthread_mutex_lock(& table->mutex);
+
+    /* ... and repeat from the beginning */
+  }
+  /* yeah! we can place the lock now */
+
+  /* remove the lock from the wait queue, if it was there */
+  if (lo->waiting_lock)
+  {
+    remove_from_wait_queue(new, table);
+    lo->waiting_lock= 0;
+    lo->waiting_for= 0;
+    lo->waiting_for_loid= 0;
+  }
+
+  /* add it to the list of all locks of this lock owner */
+  new->next_in_lo= lo->active_locks;
+  lo->active_locks= new;
+
+  /* and to the list of active locks of this lock type */
+  new->prev= 0;
+  if ((new->next= table->active_locks[new_lock-1]))
+    new->next->prev= new;
+  table->active_locks[new_lock-1]= new;
+
+  /* update the latest_locks hash */
+  if (old)
+    hash_delete(& table->latest_locks, (uchar *)old);
+  hash_insert(& table->latest_locks, (uchar *)new);
+
+  new->upgraded_from= old;
+
+  res= getlock_result[lock][lock];
+
+ret:
+  pthread_mutex_unlock(& table->mutex);
+ret2:
+  /* table->mutex is NOT held on this path (timeout releases it before) */
+  DBUG_ASSERT(res);
+  return res;
+}
+
+/*
+ DESCRIPTION
+ release all locks belonging to a transaction.
+ signal waiters to continue
+*/
+void tablockman_release_locks(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo)
+{
+  TABLE_LOCK *lock, *local_pool= 0, *local_pool_end;
+
+  /*
+    instead of adding released locks to a pool one by one, we'll link
+    them in a list and add to a pool in one short action (under a mutex)
+  */
+  /*
+    local_pool_end is the first lock that will be pushed to local_pool,
+    i.e. the tail of the finished list - its 'next' is patched at the end
+    to splice the whole list into lm->pool at once.
+  */
+  local_pool_end= lo->waiting_lock ? lo->waiting_lock : lo->active_locks;
+  if (!local_pool_end)
+    return;
+
+  /* release a waiting lock, if any */
+  if ((lock= lo->waiting_lock))
+  {
+    DBUG_ASSERT(lock->loid == lo->loid);
+    pthread_mutex_lock(& lock->table->mutex);
+    remove_from_wait_queue(lock, lock->table);
+
+    /*
+      a special case: if this lock was not the last in the wait queue
+      and it's compatible with the next lock, then the next lock
+      is waiting for our blocker though really it waits for us, indirectly.
+      Signal our blocker to release this next lock (after we removed our
+      lock from the wait queue, of course).
+    */
+    /*
+      An example to clarify the above:
+        trn1> S-lock the table. Granted.
+        trn2> IX-lock the table. Added to the wait queue. trn2 waits on trn1
+        trn3> IS-lock the table. The queue is not empty, so IS-lock is added
+              to the queue. It's compatible with the waiting IX-lock, so trn3
+              waits for trn2->waiting_for, that is trn1.
+      if trn1 releases the lock it signals trn1->cond and both waiting
+      transactions are awaken. But if trn2 times out, trn3 must be notified
+      too (as IS and S locks are compatible). So trn2 must signal trn1->cond.
+    */
+    if (lock->next &&
+        lock_compatibility_matrix[lock->next->lock_type][lock->lock_type])
+    {
+      pthread_mutex_lock(lo->waiting_for->mutex);
+      pthread_cond_broadcast(lo->waiting_for->cond);
+      pthread_mutex_unlock(lo->waiting_for->mutex);
+    }
+    lo->waiting_for= 0;
+    lo->waiting_for_loid= 0;
+    pthread_mutex_unlock(& lock->table->mutex);
+
+    lock->next= local_pool;
+    local_pool= lock;
+  }
+
+  /* now release granted locks */
+  lock= lo->active_locks;
+  while (lock)
+  {
+    TABLE_LOCK *cur= lock;
+    pthread_mutex_t *mutex= & lock->table->mutex;
+    DBUG_ASSERT(cur->loid == lo->loid);
+
+    DBUG_ASSERT(lock != lock->next_in_lo);
+    lock= lock->next_in_lo;
+
+    /* TODO ? group locks by table to reduce the number of mutex locks */
+    pthread_mutex_lock(mutex);
+    hash_delete(& cur->table->latest_locks, (uchar *)cur);
+
+    /* unlink from the per-type active_locks doubly-linked list */
+    if (cur->prev)
+      cur->prev->next= cur->next;
+    if (cur->next)
+      cur->next->prev= cur->prev;
+    if (cur->table->active_locks[cur->lock_type-1] == cur)
+      cur->table->active_locks[cur->lock_type-1]= cur->next;
+
+    cur->next= local_pool;
+    local_pool= cur;
+
+    pthread_mutex_unlock(mutex);
+  }
+
+  lo->waiting_lock= lo->active_locks= 0;
+
+  /*
+    okay, all locks released. now signal that we're leaving,
+    in case somebody's waiting for it
+  */
+  pthread_mutex_lock(lo->mutex);
+  pthread_cond_broadcast(lo->cond);
+  pthread_mutex_unlock(lo->mutex);
+
+  /* and push all freed locks to the lockman's pool */
+  pthread_mutex_lock(& lm->pool_mutex);
+  local_pool_end->next= lm->pool;
+  lm->pool= local_pool;
+  pthread_mutex_unlock(& lm->pool_mutex);
+}
+
+/*
+  Initialize a table lock manager: an empty free-lock pool, the
+  application-supplied loid -> TABLE_LOCK_OWNER mapping callback,
+  and the lock wait timeout (milliseconds).
+*/
+void tablockman_init(TABLOCKMAN *lm, loid_to_tlo_func *func, uint timeout)
+{
+  pthread_mutex_init(& lm->pool_mutex, MY_MUTEX_INIT_FAST);
+  lm->lock_timeout= timeout;
+  lm->loid_to_tlo= func;
+  lm->pool= 0;
+  my_getsystime(); /* ensure that my_getsystime() is initialized */
+}
+
+/* Free every lock structure left in the pool and destroy the pool mutex */
+void tablockman_destroy(TABLOCKMAN *lm)
+{
+  TABLE_LOCK *cur, *next;
+
+  for (cur= lm->pool; cur; cur= next)
+  {
+    next= cur->next;
+    my_free((void *)cur, MYF(0));
+  }
+  lm->pool= 0;
+  pthread_mutex_destroy(& lm->pool_mutex);
+}
+
+/*
+  initialize a LOCKED_TABLE structure
+
+  SYNOPSIS
+    lt                   a LOCKED_TABLE to initialize
+    initial_hash_size    initial size for 'latest_locks' hash
+*/
+void tablockman_init_locked_table(LOCKED_TABLE *lt, int initial_hash_size)
+{
+  /* zero everything first: empty wait deque and active_locks lists */
+  bzero(lt, sizeof(*lt));
+  pthread_mutex_init(& lt->mutex, MY_MUTEX_INIT_FAST);
+  /* hash keyed by the owner's loid field of TABLE_LOCK */
+  hash_init(& lt->latest_locks, & my_charset_bin, initial_hash_size,
+            offsetof(TABLE_LOCK, loid),
+            sizeof(((TABLE_LOCK*)0)->loid), 0, 0, 0);
+}
+
+/*
+  Free the resources of a LOCKED_TABLE. The table must have no locks
+  left on it - neither granted nor waiting.
+*/
+void tablockman_destroy_locked_table(LOCKED_TABLE *lt)
+{
+  int idx;
+
+  DBUG_ASSERT(lt->latest_locks.records == 0);
+  DBUG_ASSERT(lt->wait_queue_out == 0);
+  DBUG_ASSERT(lt->wait_queue_in == 0);
+  for (idx= 0; idx < LOCK_TYPES; idx++)
+    DBUG_ASSERT(lt->active_locks[idx] == 0);
+
+  hash_free(& lt->latest_locks);
+  pthread_mutex_destroy(& lt->mutex);
+}
+
+#ifdef EXTRA_DEBUG
+/* human-readable names for enum lockman_lock_type, indexed by lock type */
+static const char *lock2str[LOCK_TYPES+1]= {"N", "S", "X", "IS", "IX", "SIX",
+  "LS", "LX", "SLX", "LSIX"};
+
+/*
+  Debug helper: print one lock owner's state to stdout as
+  "lo<loid>> (waiting-lock) active-lock active-lock ...", each lock shown
+  as <type>.<table address>. A trailing "!" marks a self-looped (corrupt)
+  active_locks list.
+*/
+void tablockman_print_tlo(TABLE_LOCK_OWNER *lo)
+{
+  TABLE_LOCK *lock;
+
+  printf("lo%d>", lo->loid);
+  if ((lock= lo->waiting_lock))
+    printf(" (%s.0x%lx)", lock2str[lock->lock_type], (ulong)lock->table);
+  for (lock= lo->active_locks;
+       lock && lock != lock->next_in_lo;
+       lock= lock->next_in_lo)
+    printf(" %s.0x%lx", lock2str[lock->lock_type], (ulong)lock->table);
+  if (lock && lock == lock->next_in_lo)
+    printf("!");
+  printf("\n");
+}
+#endif
+
diff --git a/storage/maria/tablockman.h b/storage/maria/tablockman.h
new file mode 100644
index 00000000000..e33d1aa44e8
--- /dev/null
+++ b/storage/maria/tablockman.h
@@ -0,0 +1,87 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _tablockman_h
+#define _tablockman_h
+
+/*
+ Lock levels:
+ ^^^^^^^^^^^
+
+ N - "no lock", not a lock, used sometimes internally to simplify the code
+ S - Shared
+ X - eXclusive
+ IS - Intention Shared
+ IX - Intention eXclusive
+ SIX - Shared + Intention eXclusive
+ LS - Loose Shared
+ LX - Loose eXclusive
+ SLX - Shared + Loose eXclusive
+ LSIX - Loose Shared + Intention eXclusive
+*/
+/* these two enums are shared with lockman.h - define them only once */
+#ifndef _lockman_h
+/* QQ: TODO remove N-locks */
+enum lockman_lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST };
+enum lockman_getlock_result {
+  NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT,
+  GOT_THE_LOCK,
+  GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE,
+  GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+};
+#endif
+
+/* number of real lock types (the pseudo-type N is excluded) */
+#define LOCK_TYPES (LOCK_TYPE_LAST-1)
+
+/* opaque to users; defined in tablockman.c as struct st_table_lock */
+typedef struct st_table_lock TABLE_LOCK;
+
+typedef struct st_table_lock_owner {
+  TABLE_LOCK *active_locks; /* list of active locks */
+  TABLE_LOCK *waiting_lock; /* waiting lock (one lock only) */
+  struct st_table_lock_owner *waiting_for; /* transaction we're waiting for */
+  pthread_cond_t  *cond;    /* transactions waiting for us, wait on 'cond' */
+  pthread_mutex_t *mutex;   /* mutex is required to use 'cond' */
+  uint16 loid, waiting_for_loid;  /* Lock Owner IDentifier */
+} TABLE_LOCK_OWNER;
+
+typedef struct st_locked_table {
+  pthread_mutex_t mutex;             /* mutex for everything below */
+  HASH latest_locks;                 /* latest locks in a hash */
+  TABLE_LOCK *active_locks[LOCK_TYPES]; /* dl-list of locks per type */
+  TABLE_LOCK *wait_queue_in, *wait_queue_out; /* wait deque (double-end queue)*/
+} LOCKED_TABLE;
+
+/* application-provided callback mapping a loid to its TABLE_LOCK_OWNER */
+typedef TABLE_LOCK_OWNER *loid_to_tlo_func(uint16);
+
+typedef struct {
+  pthread_mutex_t pool_mutex;
+  TABLE_LOCK *pool;                  /* lifo pool of free locks */
+  uint lock_timeout;                 /* lock timeout in milliseconds */
+  loid_to_tlo_func *loid_to_tlo;     /* for mapping loid to TABLE_LOCK_OWNER */
+} TABLOCKMAN;
+
+void tablockman_init(TABLOCKMAN *, loid_to_tlo_func *, uint);
+void tablockman_destroy(TABLOCKMAN *);
+enum lockman_getlock_result tablockman_getlock(TABLOCKMAN *, TABLE_LOCK_OWNER *,
+ LOCKED_TABLE *, enum lockman_lock_type);
+void tablockman_release_locks(TABLOCKMAN *, TABLE_LOCK_OWNER *);
+void tablockman_init_locked_table(LOCKED_TABLE *, int);
+void tablockman_destroy_locked_table(LOCKED_TABLE *);
+
+#ifdef EXTRA_DEBUG
+void tablockman_print_tlo(TABLE_LOCK_OWNER *);
+#endif
+
+#endif
+
diff --git a/storage/maria/test_pack b/storage/maria/test_pack
new file mode 100755
index 00000000000..689645b1661
--- /dev/null
+++ b/storage/maria/test_pack
@@ -0,0 +1,10 @@
+silent="-s"
+suffix=""
+
+ma_test1$suffix -s ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -us test1 ; maria_chk$suffix -es test1
+ma_test1$suffix -s -S ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ;maria_chk$suffix -us test1 ; maria_chk$suffix -es test1
+ma_test1$suffix -s -b ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1
+ma_test1$suffix -s -w ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -ros test1 ; maria_chk$suffix -es test1
+
+ma_test2$suffix -s -t4 ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2
+ma_test2$suffix -s -t4 -b ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2
diff --git a/storage/maria/trnman.c b/storage/maria/trnman.c
new file mode 100644
index 00000000000..05330baed76
--- /dev/null
+++ b/storage/maria/trnman.c
@@ -0,0 +1,979 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "trnman.h"
+#include "ma_checkpoint.h"
+#include "ma_control_file.h"
+
+/*
+ status variables:
+ how many trns in the active list currently,
+ in the committed list currently, allocated since startup.
+*/
+uint trnman_active_transactions, trnman_committed_transactions,
+ trnman_allocated_transactions;
+
+/* list of active transactions in the trid order */
+static TRN active_list_min, active_list_max;
+/* list of committed transactions in the trid order */
+static TRN committed_list_min, committed_list_max;
+
+/* a counter, used to generate transaction ids */
+static TrID global_trid_generator;
+
+/*
+  The minimum existing transaction id for trnman_get_min_trid()
+  The default value is used when the transaction manager is not initialized;
+  in that case the caller is probably maria_chk
+*/
+static TrID trid_min_read_from= MAX_TRID;
+
+/* the mutex for everything above */
+static pthread_mutex_t LOCK_trn_list;
+
+/* LIFO pool of unused TRN structured for reuse */
+static TRN *pool;
+
+/* a hash for committed transactions that maps trid to a TRN structure */
+static LF_HASH trid_to_trn;
+
+/* an array that maps short_id of an active transaction to a TRN structure */
+static TRN **short_trid_to_active_trn;
+
+/* locks for short_trid_to_active_trn and pool */
+static my_atomic_rwlock_t LOCK_short_trid_to_trn, LOCK_pool;
+static my_bool default_trnman_end_trans_hook(TRN *, my_bool, my_bool);
+static void trnman_free_trn(TRN *);
+
+my_bool (*trnman_end_trans_hook)(TRN *, my_bool, my_bool)=
+ default_trnman_end_trans_hook;
+
+/*
+ Simple interface functions
+ QQ: if they stay so simple, should we make them inline?
+*/
+
+/* Bump the transaction's locked-table count; returns the previous count */
+uint trnman_increment_locked_tables(TRN *trn)
+{
+  uint old_count= trn->locked_tables;
+  trn->locked_tables= old_count + 1;
+  return old_count;
+}
+
+/* Return the number of tables currently locked by this transaction */
+uint trnman_has_locked_tables(TRN *trn)
+{
+  return trn->locked_tables;
+}
+
+/* Drop the transaction's locked-table count; returns the new count */
+uint trnman_decrement_locked_tables(TRN *trn)
+{
+  trn->locked_tables--;
+  return trn->locked_tables;
+}
+
+/* Overwrite the transaction's locked-table count with a saved value */
+void trnman_reset_locked_tables(TRN *trn, uint locked_tables)
+{
+  trn->locked_tables= locked_tables;
+}
+
+#ifdef EXTRA_DEBUG
+/* Return the transaction's debug flags (TRN_STATE_* bits) */
+uint16 trnman_get_flags(TRN *trn)
+{
+  return trn->flags;
+}
+
+/* Overwrite the transaction's debug flags (TRN_STATE_* bits) */
+void trnman_set_flags(TRN *trn, uint16 flags)
+{
+  trn->flags= flags;
+}
+#endif
+
+/**
+  Wake up threads waiting for this transaction
+
+  Releases the transaction's own resource in the deadlock detector (if it
+  had registered one) and clears the detector context pointer.
+*/
+static void wt_thd_release_self(TRN *trn)
+{
+  WT_RESOURCE_ID res_id;
+
+  if (!trn->wt)
+    return;
+  res_id.type= &ma_rc_dup_unique;
+  res_id.value= (intptr)trn;
+  wt_thd_release(trn->wt, &res_id);
+  trn->wt= 0;
+}
+
+/* Default (no-op) implementation of the trnman_end_trans_hook callback */
+static my_bool
+default_trnman_end_trans_hook(TRN *trn __attribute__ ((unused)),
+                              my_bool commit __attribute__ ((unused)),
+                              my_bool active_transactions
+                              __attribute__ ((unused)))
+{
+  return 0;
+}
+
+
+/*
+  Hash callback for trid_to_trn: the key of a stored TRN* element is
+  the trid field of the TRN it points to.
+*/
+static uchar *trn_get_hash_key(const uchar *trn, size_t *len,
+                               my_bool unused __attribute__ ((unused)))
+{
+  TRN *transaction= *(TRN **) trn;
+
+  *len= sizeof(TrID);
+  return (uchar *) &transaction->trid;
+}
+
+
+/**
+  @brief Initializes transaction manager.
+
+  @param initial_trid  Generated TrIDs will start from initial_trid+1.
+
+  @return Operation status
+    @retval 0      OK
+    @retval !=0    Error
+*/
+
+int trnman_init(TrID initial_trid)
+{
+  DBUG_ENTER("trnman_init");
+  DBUG_PRINT("enter", ("initial_trid: %lu", (ulong) initial_trid));
+
+  short_trid_to_active_trn= (TRN **)my_malloc(SHORT_TRID_MAX*sizeof(TRN*),
+                                              MYF(MY_WME|MY_ZEROFILL));
+  if (unlikely(!short_trid_to_active_trn))
+    DBUG_RETURN(1);
+  /* offset by one so the array is indexed [1..SHORT_TRID_MAX]; the offset
+     is undone in trnman_destroy() before freeing */
+  short_trid_to_active_trn--; /* min short_id is 1 */
+
+  /*
+    Initialize lists.
+    active_list_max.min_read_from must be larger than any trid,
+    so that when the active list is empty we can free
+    the whole committed list.
+    And committed_list_max itself can not be freed so
+    committed_list_max.commit_trid must not be smaller than
+    active_list_max.min_read_from
+  */
+
+  active_list_max.trid= active_list_min.trid= 0;
+  active_list_max.min_read_from= MAX_TRID;
+  active_list_max.next= active_list_min.prev= 0;
+  active_list_max.prev= &active_list_min;
+  active_list_min.next= &active_list_max;
+
+  committed_list_max.commit_trid= MAX_TRID;
+  committed_list_max.next= committed_list_min.prev= 0;
+  committed_list_max.prev= &committed_list_min;
+  committed_list_min.next= &committed_list_max;
+
+  trnman_active_transactions= 0;
+  trnman_committed_transactions= 0;
+  trnman_allocated_transactions= 0;
+  /* This is needed for recovery and repair */
+  dummy_transaction_object.min_read_from= ~(TrID) 0;
+
+  pool= 0;
+  global_trid_generator= initial_trid;
+  trid_min_read_from= initial_trid;
+  lf_hash_init(&trid_to_trn, sizeof(TRN*), LF_HASH_UNIQUE,
+               0, 0, trn_get_hash_key, 0);
+  DBUG_PRINT("info", ("pthread_mutex_init LOCK_trn_list"));
+  pthread_mutex_init(&LOCK_trn_list, MY_MUTEX_INIT_FAST);
+  my_atomic_rwlock_init(&LOCK_short_trid_to_trn);
+  my_atomic_rwlock_init(&LOCK_pool);
+
+  DBUG_RETURN(0);
+}
+
+/*
+  Free all resources of the transaction manager.
+
+  NOTE
+  this could only be called in the "idle" state - no transaction can be
+  running. See asserts below.
+*/
+void trnman_destroy()
+{
+  DBUG_ENTER("trnman_destroy");
+
+  if (short_trid_to_active_trn == NULL) /* trnman already destroyed */
+    DBUG_VOID_RETURN;
+  DBUG_ASSERT(trid_to_trn.count == 0);
+  DBUG_ASSERT(trnman_active_transactions == 0);
+  DBUG_ASSERT(trnman_committed_transactions == 0);
+  DBUG_ASSERT(active_list_max.prev == &active_list_min);
+  DBUG_ASSERT(active_list_min.next == &active_list_max);
+  DBUG_ASSERT(committed_list_max.prev == &committed_list_min);
+  DBUG_ASSERT(committed_list_min.next == &committed_list_max);
+  /* return all pooled TRN structures to the heap */
+  while (pool)
+  {
+    TRN *trn= pool;
+    pool= pool->next;
+    DBUG_ASSERT(trn->wt == NULL);
+    pthread_mutex_destroy(&trn->state_lock);
+    my_free((void *)trn, MYF(0));
+  }
+  lf_hash_destroy(&trid_to_trn);
+  DBUG_PRINT("info", ("pthread_mutex_destroy LOCK_trn_list"));
+  pthread_mutex_destroy(&LOCK_trn_list);
+  my_atomic_rwlock_destroy(&LOCK_short_trid_to_trn);
+  my_atomic_rwlock_destroy(&LOCK_pool);
+  /* undo the "-1" offset applied in trnman_init() before freeing */
+  my_free((void *)(short_trid_to_active_trn+1), MYF(0));
+  short_trid_to_active_trn= NULL;
+
+  DBUG_VOID_RETURN;
+}
+
+/*
+  Generate the next transaction id.
+
+  NOTE
+  TrID is limited to 6 bytes. Initial value of the generator
+  is set by the recovery code - being read from the last checkpoint
+  (or 1 on a first run).  Caller must hold LOCK_trn_list.
+*/
+static TrID new_trid()
+{
+  DBUG_ENTER("new_trid");
+  /* must still fit in 6 bytes (48 bits) */
+  DBUG_ASSERT(global_trid_generator < 0xffffffffffffLL);
+  DBUG_PRINT("info", ("safe_mutex_assert_owner LOCK_trn_list"));
+  safe_mutex_assert_owner(&LOCK_trn_list);
+  global_trid_generator++;
+  DBUG_RETURN(global_trid_generator);
+}
+
+/*
+  Find a free slot in short_trid_to_active_trn[] and atomically claim it.
+
+  Probing starts from a pseudo-random position (derived from the trid
+  generator and the TRN address) to spread concurrent claimers, then wraps
+  around to slot 1 and rescans until a slot is won via CAS.  Loops until
+  a slot is found (never fails while fewer than SHORT_TRID_MAX trns exist).
+*/
+static uint get_short_trid(TRN *trn)
+{
+  int i= (int) ((global_trid_generator + (intptr)trn) * 312089 %
+                SHORT_TRID_MAX) + 1;
+  uint res=0;
+
+  for ( ; !res ; i= 1)
+  {
+    my_atomic_rwlock_wrlock(&LOCK_short_trid_to_trn);
+    for ( ; i <= SHORT_TRID_MAX; i++) /* the range is [1..SHORT_TRID_MAX] */
+    {
+      void *tmp= NULL;
+      /* cheap pre-check before the CAS; only the CAS winner takes the slot */
+      if (short_trid_to_active_trn[i] == NULL &&
+          my_atomic_casptr((void **)&short_trid_to_active_trn[i], &tmp, trn))
+      {
+        res= i;
+        break;
+      }
+    }
+    my_atomic_rwlock_wrunlock(&LOCK_short_trid_to_trn);
+  }
+  return res;
+}
+
+/**
+  Allocates and initializes a new TRN object
+
+  @param wt  waiting-threads (deadlock detector) context for this trn
+
+  @note the 'wt' parameter can only be 0 in a single-threaded code (or,
+  generally, where threads cannot block each other), otherwise the
+  first call to the deadlock detector will sigsegv.
+
+  @return pointer to the new transaction, or 0 on error (OOM)
+*/
+
+TRN *trnman_new_trn(WT_THD *wt)
+{
+  int res;
+  TRN *trn;
+  union { TRN *trn; void *v; } tmp;
+  DBUG_ENTER("trnman_new_trn");
+
+  /*
+    we have a mutex, to do simple things under it - allocate a TRN,
+    increment trnman_active_transactions, set trn->min_read_from.
+
+    Note that all the above is fast. generating short_id may be slow,
+    as it involves scanning a large array - so it's done outside of the
+    mutex.
+  */
+
+  DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list"));
+  pthread_mutex_lock(&LOCK_trn_list);
+
+  /* Allocating a new TRN structure */
+  tmp.trn= pool;
+  /*
+    Popping an unused TRN from the pool
+    (ABA isn't possible, we're behind a mutex)
+  */
+  my_atomic_rwlock_wrlock(&LOCK_pool);
+  while (tmp.trn && !my_atomic_casptr((void **)(char*) &pool, &tmp.v,
+                                      (void *)tmp.trn->next))
+    /* no-op */;
+  my_atomic_rwlock_wrunlock(&LOCK_pool);
+
+  /* Nothing in the pool ? Allocate a new one */
+  if (!(trn= tmp.trn))
+  {
+    /*
+      trn should be completely initialized at create time to allow
+      one to keep a known state on it.
+      (Like redo_lns, which is assumed to be 0 at start of row handling
+      and reset to zero before end of row handling)
+    */
+    trn= (TRN *)my_malloc(sizeof(TRN), MYF(MY_WME | MY_ZEROFILL));
+    if (unlikely(!trn))
+    {
+      DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list"));
+      pthread_mutex_unlock(&LOCK_trn_list);
+      /* fixed: was a plain 'return 0' which corrupts the DBUG call stack */
+      DBUG_RETURN(0);
+    }
+    trnman_allocated_transactions++;
+    pthread_mutex_init(&trn->state_lock, MY_MUTEX_INIT_FAST);
+  }
+  trn->wt= wt;
+  trn->pins= lf_hash_get_pins(&trid_to_trn);
+  if (!trn->pins)
+  {
+    trnman_free_trn(trn);
+    pthread_mutex_unlock(&LOCK_trn_list);
+    /* fixed: was a plain 'return 0' which corrupts the DBUG call stack */
+    DBUG_RETURN(0);
+  }
+
+  trnman_active_transactions++;
+
+  trn->min_read_from= active_list_min.next->trid;
+
+  trn->trid= new_trid();
+
+  /* link as the newest (largest trid) member of the active list */
+  trn->next= &active_list_max;
+  trn->prev= active_list_max.prev;
+  active_list_max.prev= trn->prev->next= trn;
+  trid_min_read_from= active_list_min.next->min_read_from;
+  DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list"));
+  pthread_mutex_unlock(&LOCK_trn_list);
+
+  if (unlikely(!trn->min_read_from))
+  {
+    /*
+      We are the only transaction. Set min_read_from so that we can read
+      our own rows
+    */
+    trn->min_read_from= trn->trid + 1;
+  }
+
+  /* no other transaction can read changes done by this one */
+  trn->commit_trid= MAX_TRID;
+  trn->rec_lsn= trn->undo_lsn= trn->first_undo_lsn= 0;
+  trn->used_tables= 0;
+
+  trn->locked_tables= 0;
+  trn->flags= 0;
+
+  /*
+    only after the following function TRN is considered initialized,
+    so it must be done the last
+  */
+  pthread_mutex_lock(&trn->state_lock);
+  trn->short_id= get_short_trid(trn);
+  pthread_mutex_unlock(&trn->state_lock);
+
+  res= lf_hash_insert(&trid_to_trn, trn->pins, &trn);
+  DBUG_ASSERT(res <= 0);
+  if (res)
+  {
+    trnman_end_trn(trn, 0);
+    /* fixed: was a plain 'return 0' which corrupts the DBUG call stack */
+    DBUG_RETURN(0);
+  }
+
+  /* fixed: trid was printed with "0x%lu" (decimal value behind a hex prefix) */
+  DBUG_PRINT("exit", ("trn: 0x%lx trid: 0x%lx",
+                      (ulong) trn, (ulong) trn->trid));
+
+  DBUG_RETURN(trn);
+}
+
+/*
+  remove a trn from the active list.
+  if necessary - move to committed list and set commit_trid
+
+  NOTE
+    Locks are released at the end. In particular, after placing the
+    transaction in commit list, and after setting commit_trid. It's
+    important, as commit_trid affects visibility. Locks don't affect
+    anything they simply delay execution of other threads - they could be
+    released arbitrarily late. In other words, when locks are released it
+    serves as a start banner for other threads, they start to run. So
+    everything they may need must be ready at that point.
+
+  RETURN
+    0  ok
+    1  error
+*/
+my_bool trnman_end_trn(TRN *trn, my_bool commit)
+{
+  /* stays 1 on the normal path; set to -1 when the end-trans hook fails
+     (the function returns res < 0, i.e. 0 = ok, 1 = error) */
+  int res= 1;
+  uint16 cached_short_id= trn->short_id; /* we have to cache it, see below */
+  TRN *free_me= 0;
+  LF_PINS *pins= trn->pins;
+  DBUG_ENTER("trnman_end_trn");
+  DBUG_PRINT("enter", ("trn=0x%lx commit=%d", (ulong) trn, commit));
+
+  /* if a rollback, all UNDO records should have been executed */
+  DBUG_ASSERT(commit || trn->undo_lsn == 0);
+  DBUG_ASSERT(trn != &dummy_transaction_object);
+  DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list"));
+
+  pthread_mutex_lock(&LOCK_trn_list);
+
+  /* remove from active list */
+  trn->next->prev= trn->prev;
+  trn->prev->next= trn->next;
+
+  /*
+    if trn was the oldest active transaction, now that it goes away there
+    may be committed transactions in the list which no active transaction
+    needs to bother about - clean up the committed list
+  */
+  if (trn->prev == &active_list_min)
+  {
+    uint free_me_count;
+    TRN *t;
+    /* walk past every committed trn no active transaction can still see */
+    for (t= committed_list_min.next, free_me_count= 0;
+         t->commit_trid < active_list_min.next->min_read_from;
+         t= t->next, free_me_count++) /* no-op */;
+
+    DBUG_ASSERT((t != committed_list_min.next && free_me_count > 0) ||
+                (t == committed_list_min.next && free_me_count == 0));
+    /* found transactions committed before the oldest active one */
+    if (t != committed_list_min.next)
+    {
+      free_me= committed_list_min.next;
+      committed_list_min.next= t;
+      t->prev->next= 0;
+      t->prev= &committed_list_min;
+      trnman_committed_transactions-= free_me_count;
+    }
+  }
+
+  /* under state_lock: publish commit_trid and detach from deadlock detector */
+  pthread_mutex_lock(&trn->state_lock);
+  if (commit)
+    trn->commit_trid= global_trid_generator;
+  wt_thd_release_self(trn);
+  pthread_mutex_unlock(&trn->state_lock);
+
+  /*
+    if transaction is committed and it was not the only active transaction -
+    add it to the committed list
+  */
+  if (commit && active_list_min.next != &active_list_max)
+  {
+    trn->next= &committed_list_max;
+    trn->prev= committed_list_max.prev;
+    trnman_committed_transactions++;
+    committed_list_max.prev= trn->prev->next= trn;
+  }
+  else
+  {
+    /* rolled back, or no active trn is left that could need it: free now */
+    trn->next= free_me;
+    free_me= trn;
+  }
+  trid_min_read_from= active_list_min.next->min_read_from;
+
+  if ((*trnman_end_trans_hook)(trn, commit,
+                               active_list_min.next != &active_list_max))
+    res= -1;
+  trnman_active_transactions--;
+
+  DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list"));
+  pthread_mutex_unlock(&LOCK_trn_list);
+
+  /*
+    the rest is done outside of a critical section
+
+    note that we don't own trn anymore, it may be in a shared list now.
+    Thus, we cannot dereference it, and must use cached_short_id below.
+  */
+  my_atomic_rwlock_rdlock(&LOCK_short_trid_to_trn);
+  my_atomic_storeptr((void **)&short_trid_to_active_trn[cached_short_id], 0);
+  my_atomic_rwlock_rdunlock(&LOCK_short_trid_to_trn);
+
+  /*
+    we, under the mutex, removed going-in-free_me transactions from the
+    active and committed lists, thus nobody else may see them when it scans
+    those lists, and thus nobody may want to free them. Now we don't
+    need a mutex to access free_me list
+  */
+  /* QQ: send them to the purge thread */
+  while (free_me)
+  {
+    TRN *t= free_me;
+    free_me= free_me->next;
+
+    /* ignore OOM. it's harmless, and we can do nothing here anyway */
+    (void)lf_hash_delete(&trid_to_trn, pins, &t->trid, sizeof(TrID));
+
+    trnman_free_trn(t);
+  }
+
+  lf_hash_put_pins(pins);
+
+  DBUG_RETURN(res < 0);
+}
+
+/*
+  free a trn (add to the pool, that is)
+  note - we can never really free() a TRN if there's at least one other
+  running transaction - see, e.g., how lock waits are implemented in
+  lockman.c
+  The same is true for other lock-free data structures too. We may need some
+  kind of FLUSH command to reset them all - ensuring that no transactions are
+  running. It may even be called automatically on checkpoints if no
+  transactions are running.
+*/
+static void trnman_free_trn(TRN *trn)
+{
+  /*
+    union is to solve strict aliasing issue.
+    without it gcc 3.4.3 doesn't notice that updating *(void **)&tmp
+    modifies the value of tmp.
+  */
+  union { TRN *trn; void *v; } tmp;
+
+  /* short_id == 0 marks the TRN as unused ("ghost"), see trnman.h */
+  pthread_mutex_lock(&trn->state_lock);
+  trn->short_id= 0;
+  pthread_mutex_unlock(&trn->state_lock);
+
+  tmp.trn= pool;
+
+  /* lock-free push of trn onto the LIFO pool */
+  my_atomic_rwlock_wrlock(&LOCK_pool);
+  do
+  {
+    /*
+      without this volatile cast gcc-3.4.4 moves the assignment
+      down after the loop at -O2
+    */
+    *(TRN * volatile *)&(trn->next)= tmp.trn;
+  } while (!my_atomic_casptr((void **)(char*)&pool, &tmp.v, trn));
+  my_atomic_rwlock_wrunlock(&LOCK_pool);
+}
+
+/*
+  Check whether transaction 'trn' may read rows written by transaction 'trid'
+
+  NOTE
+  here we access the hash in a lock-free manner.
+  It's safe, a 'found' TRN can never be freed/reused before we access it.
+  In fact, it cannot be freed before 'trn' ends, because a 'found' TRN
+  can only be removed from the hash when:
+      found->commit_trid < ALL (trn->min_read_from)
+  that is, at least
+      found->commit_trid < trn->min_read_from
+  but
+      found->trid >= trn->min_read_from
+  and
+      found->commit_trid > found->trid
+
+  RETURN
+    1   can
+    0   cannot
+    -1  error (OOM)
+*/
+int trnman_can_read_from(TRN *trn, TrID trid)
+{
+  TRN **found;
+  my_bool can;
+  LF_REQUIRE_PINS(3);
+
+  if (trid < trn->min_read_from)
+    return 1; /* Row is visible by all transactions in the system */
+
+  if (trid >= trn->trid)
+  {
+    /*
+      We have now two cases
+      trid > trn->trid, in which case the row is from a new transaction
+      and not visible, in which case we should return 0.
+      trid == trn->trid in which case the row is from the current transaction
+      and we should return 1
+    */
+    return trid == trn->trid;
+  }
+
+  /* otherwise look the writer up in the trid_to_trn hash */
+  found= lf_hash_search(&trid_to_trn, trn->pins, &trid, sizeof(trid));
+  if (found == NULL)
+    return 0; /* not in the hash of transactions = cannot read */
+  if (found == MY_ERRPTR)
+    return -1;
+
+  /* visible only if the writer committed before 'trn' started */
+  can= (*found)->commit_trid < trn->trid;
+  lf_hash_search_unpin(trn->pins);
+  return can;
+}
+
+/**
+  Finds a TRN by its TrID
+
+  @param trn   current trn. Needed for pinning pointers (see lf_pin)
+  @param trid  trid to search for
+
+  @return found trn or 0
+
+  @note that trn is returned with its state locked!  The caller is
+  responsible for releasing (*found)->state_lock.
+*/
+TRN *trnman_trid_to_trn(TRN *trn, TrID trid)
+{
+  TRN **found;
+  LF_REQUIRE_PINS(3);
+
+  if (trid < trn->min_read_from)
+    return 0; /* it's committed eons ago */
+
+  found= lf_hash_search(&trid_to_trn, trn->pins, &trid, sizeof(trid));
+  if (found == NULL || found == MY_ERRPTR)
+    return 0; /* no luck */
+
+  /* we've found something */
+  pthread_mutex_lock(&(*found)->state_lock);
+
+  /* short_id == 0 means the TRN is unused/recycled (see trnman.h) */
+  if ((*found)->short_id == 0)
+  {
+    pthread_mutex_unlock(&(*found)->state_lock);
+    lf_hash_search_unpin(trn->pins);
+    return 0; /* but it was a ghost */
+  }
+  /* NOTE(review): unpinned before return; presumably holding state_lock
+     keeps *found alive for the caller - confirm against lf_pin semantics */
+  lf_hash_search_unpin(trn->pins);
+
+  /* Gotcha! */
+  return *found;
+}
+
+/* TODO: the stubs below are waiting for savepoints to be implemented */
+
+/* Begin a new statement inside the transaction (savepoint stub, no-op) */
+void trnman_new_statement(TRN *trn __attribute__ ((unused)))
+{
+}
+
+/* Roll back the current statement (savepoint stub, no-op) */
+void trnman_rollback_statement(TRN *trn __attribute__ ((unused)))
+{
+}
+
+
+/**
+  @brief Allocates buffers and stores in them some info about transactions
+
+  Does the allocation because the caller cannot know the size itself.
+  Memory freeing is to be done by the caller (if the "str" member of the
+  LEX_STRING is not NULL).
+  The caller has the intention of doing checkpoints.
+
+  @param[out]  str_act    pointer to where the allocated buffer,
+                          and its size, will be put; buffer will be filled
+                          with info about active transactions
+  @param[out]  str_com    pointer to where the allocated buffer,
+                          and its size, will be put; buffer will be filled
+                          with info about committed transactions
+  @param[out]  min_rec_lsn         pointer to where the minimum
+                                   rec_lsn of all active transactions
+                                   will be put
+  @param[out]  min_first_undo_lsn  pointer to where the minimum
+                                   first_undo_lsn of all transactions will
+                                   be put
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error
+*/
+
+my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com,
+                                    LSN *min_rec_lsn, LSN *min_first_undo_lsn)
+{
+  my_bool error;
+  TRN *trn;
+  char *ptr;
+  uint stored_transactions= 0;
+  LSN minimum_rec_lsn= LSN_MAX, minimum_first_undo_lsn= LSN_MAX;
+  DBUG_ENTER("trnman_collect_transactions");
+
+  DBUG_ASSERT((NULL == str_act->str) && (NULL == str_com->str));
+
+  /* validate the use of read_non_atomic() in general: */
+  compile_time_assert((sizeof(LSN) == 8) && (sizeof(LSN_WITH_FLAGS) == 8));
+  pthread_mutex_lock(&LOCK_trn_list);
+  /* over-estimate both buffer sizes; str_act->length is trimmed after the
+     scan below since some active trns may be skipped */
+  str_act->length= 2 +             /* number of active transactions */
+    LSN_STORE_SIZE +               /* minimum of their rec_lsn */
+    TRANSID_SIZE +                 /* current TrID generator value */
+    (2 +                           /* short id */
+     6 +                           /* long id */
+     LSN_STORE_SIZE +              /* undo_lsn */
+#ifdef MARIA_VERSIONING /* not enabled yet */
+     LSN_STORE_SIZE +              /* undo_purge_lsn */
+#endif
+     LSN_STORE_SIZE                /* first_undo_lsn */
+     ) * trnman_active_transactions;
+  str_com->length= 4 +             /* number of committed transactions */
+    (6 +                           /* long id */
+#ifdef MARIA_VERSIONING /* not enabled yet */
+     LSN_STORE_SIZE +              /* undo_purge_lsn */
+#endif
+     LSN_STORE_SIZE                /* first_undo_lsn */
+     ) * trnman_committed_transactions;
+  if ((NULL == (str_act->str= my_malloc(str_act->length, MYF(MY_WME)))) ||
+      (NULL == (str_com->str= my_malloc(str_com->length, MYF(MY_WME)))))
+    goto err;
+  /* First, the active transactions */
+  ptr= str_act->str + 2 + LSN_STORE_SIZE;
+  transid_store(ptr, global_trid_generator);
+  ptr+= TRANSID_SIZE;
+  for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next)
+  {
+    uint sid;
+    LSN rec_lsn, undo_lsn, first_undo_lsn;
+    pthread_mutex_lock(&trn->state_lock);
+    sid= trn->short_id;
+    pthread_mutex_unlock(&trn->state_lock);
+    if (sid == 0)
+    {
+      /*
+        Not even inited, has done nothing. Or it is the
+        dummy_transaction_object, which does only non-transactional
+        immediate-sync operations (CREATE/DROP/RENAME/REPAIR TABLE), and so
+        can be forgotten for Checkpoint.
+      */
+      continue;
+    }
+    /* needed for low-water mark calculation */
+    if (((rec_lsn= lsn_read_non_atomic(trn->rec_lsn)) > 0) &&
+        (cmp_translog_addr(rec_lsn, minimum_rec_lsn) < 0))
+      minimum_rec_lsn= rec_lsn;
+    /*
+      If trn has not logged LOGREC_LONG_TRANSACTION_ID, this trn will be
+      discovered when seeing that log record which is for sure located after
+      checkpoint_start_log_horizon.
+    */
+    if ((LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn) &
+         TRANSACTION_LOGGED_LONG_ID) == 0)
+      continue;
+    /*
+      On the other hand, if undo_lsn is LSN_IMPOSSIBLE, trn may later log
+      records; so we must include trn in the checkpoint now, because we cannot
+      count on LOGREC_LONG_TRANSACTION_ID (as we are already past it).
+    */
+    undo_lsn= trn->undo_lsn;
+    stored_transactions++;
+    int2store(ptr, sid);
+    ptr+= 2;
+    int6store(ptr, trn->trid);
+    ptr+= 6;
+    lsn_store(ptr, undo_lsn); /* needed for rollback */
+    ptr+= LSN_STORE_SIZE;
+    /* needed for low-water mark calculation */
+    if (((first_undo_lsn= lsn_read_non_atomic(trn->first_undo_lsn)) > 0) &&
+        (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0))
+      minimum_first_undo_lsn= first_undo_lsn;
+    lsn_store(ptr, first_undo_lsn);
+    ptr+= LSN_STORE_SIZE;
+#ifdef MARIA_VERSIONING /* not enabled yet */
+    /* to know where purging should start (last delete of this trn) */
+    lsn_store(ptr, trn->undo_purge_lsn);
+    ptr+= LSN_STORE_SIZE;
+#endif
+    /**
+      @todo RECOVERY: add a comment explaining why we can dirtily read some
+      vars, inspired by the text of "assumption 8" in WL#3072
+    */
+  }
+  str_act->length= ptr - str_act->str; /* as we maybe over-estimated */
+  ptr= str_act->str;
+  DBUG_PRINT("info",("collected %u active transactions",
+                     (uint)stored_transactions));
+  int2store(ptr, stored_transactions);
+  ptr+= 2;
+  /* this LSN influences how REDOs for any page can be ignored by Recovery */
+  lsn_store(ptr, minimum_rec_lsn);
+  /* one day there will also be a list of prepared transactions */
+  /* do the same for committed ones */
+  ptr= str_com->str;
+  int4store(ptr, trnman_committed_transactions);
+  ptr+= 4;
+  DBUG_PRINT("info",("collected %u committed transactions",
+                     (uint)trnman_committed_transactions));
+  for (trn= committed_list_min.next; trn != &committed_list_max;
+       trn= trn->next)
+  {
+    LSN first_undo_lsn;
+    int6store(ptr, trn->trid);
+    ptr+= 6;
+#ifdef MARIA_VERSIONING /* not enabled yet */
+    lsn_store(ptr, trn->undo_purge_lsn);
+    ptr+= LSN_STORE_SIZE;
+#endif
+    first_undo_lsn= LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn);
+    if (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0)
+      minimum_first_undo_lsn= first_undo_lsn;
+    lsn_store(ptr, first_undo_lsn);
+    ptr+= LSN_STORE_SIZE;
+  }
+  /*
+    TODO: if we see there exists no transaction (active and committed) we can
+    tell the lock-free structures to do some freeing (my_free()).
+  */
+  error= 0;
+  *min_rec_lsn= minimum_rec_lsn;
+  *min_first_undo_lsn= minimum_first_undo_lsn;
+  goto end;
+err:
+  error= 1;
+end:
+  pthread_mutex_unlock(&LOCK_trn_list);
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Re-create a transaction object during log recovery with given ids.
+
+  Forces trnman_new_trn() to hand out the wanted long id, then moves the
+  TRN to the wanted short-id slot.  Recovery is single-threaded (asserted
+  below), so the direct array writes need no locking.
+
+  @param shortid  2-byte id the transaction had at log-write time
+  @param longid   6-byte trid the transaction had at log-write time
+
+  @return the recreated TRN, or NULL on allocation failure
+*/
+TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid)
+{
+  TrID old_trid_generator= global_trid_generator;
+  TRN *trn;
+  DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded);
+  global_trid_generator= longid-1; /* force a correct trid in the new trn */
+  if (unlikely((trn= trnman_new_trn(NULL)) == NULL))
+    return NULL;
+  /* deallocate excessive allocations of trnman_new_trn() */
+  global_trid_generator= old_trid_generator;
+  set_if_bigger(global_trid_generator, longid);
+  /* vacate the auto-assigned slot and claim the one from the log */
+  short_trid_to_active_trn[trn->short_id]= 0;
+  DBUG_ASSERT(short_trid_to_active_trn[shortid] == NULL);
+  short_trid_to_active_trn[shortid]= trn;
+  trn->short_id= shortid;
+  return trn;
+}
+
+
+/* Return some currently-active transaction, or NULL when none is active */
+TRN *trnman_get_any_trn()
+{
+  TRN *first= active_list_min.next;
+
+  if (first == &active_list_max)
+    return NULL;
+  return first;
+}
+
+
+/**
+  Returns the minimum existing transaction id. May return a too small
+  number in race conditions, but this is ok as the value is used to
+  remove not visible transid from index/rows.
+*/
+
+TrID trnman_get_min_trid()
+{
+  /* unlocked read of the cached value; staleness is tolerated (see above) */
+  return trid_min_read_from;
+}
+
+
+/**
+  Returns the minimum possible transaction id
+
+  @notes
+    If there are no transactions running, returns the number of the next
+    transaction to start.
+    If one has an active transaction, the returned number will be less or
+    equal to this. If one is not running in a transaction one will get the
+    number for the next started transaction. This is used in create table
+    to get a safe minimum trid to use.
+*/
+
+TrID trnman_get_min_safe_trid()
+{
+  TrID result;
+
+  pthread_mutex_lock(&LOCK_trn_list);
+  result= active_list_min.next->min_read_from;
+  if (global_trid_generator < result)
+    result= global_trid_generator;
+  pthread_mutex_unlock(&LOCK_trn_list);
+  return result;
+}
+
+
+/**
+  Returns maximum transaction id given to a transaction so far,
+  or 0 when the transaction manager is not initialized.
+*/
+
+TrID trnman_get_max_trid()
+{
+  TrID max_trid;
+
+  if (short_trid_to_active_trn == NULL)
+    return 0;
+  pthread_mutex_lock(&LOCK_trn_list);
+  max_trid= global_trid_generator;
+  pthread_mutex_unlock(&LOCK_trn_list);
+  return max_trid;
+}
+
+/**
+  @brief Check if there exists an active transaction between two commit_id's
+
+  @todo
+    Improve speed of this.
+    - Store transactions in tree or skip list
+    - Have function to copying all active transaction id's to b-tree
+      and use b-tree for checking states. This could be a big win
+      for checkpoint that will call this function for a lot of objects.
+
+  @return
+    0   No transaction exists
+    1   There is at least one active transaction in the given range
+*/
+
+my_bool trnman_exists_active_transactions(TrID min_id, TrID max_id,
+                                          my_bool trnman_is_locked)
+{
+  TRN *scan;
+  my_bool found= 0;
+
+  if (!trnman_is_locked)
+    pthread_mutex_lock(&LOCK_trn_list);
+  safe_mutex_assert_owner(&LOCK_trn_list);
+  /*
+    We use <= for max_id as max_id is a commit_trid while scan->trid is a
+    transaction id. When calculating commit_trid we use the current value
+    of global_trid_generator; global_trid_generator is incremented for
+    each new transaction.
+
+    For example, assuming we have
+      min_id = 5
+      max_id = 10
+
+    A trid of value 5 can't see the history event between 5 & 10
+    as it was started before min_id 5 was committed.
+    A trid of value 10 can't see the next history event (max_id = 10)
+    as it started before this was committed. In this case it must use
+    this event.
+  */
+  for (scan= active_list_min.next;
+       scan != &active_list_max && !found;
+       scan= scan->next)
+  {
+    if (scan->trid > min_id && scan->trid <= max_id)
+      found= 1;
+  }
+  if (!trnman_is_locked)
+    pthread_mutex_unlock(&LOCK_trn_list);
+  return found;
+}
+
+
+/**
+  lock transaction list (must be paired with trnman_unlock())
+*/
+
+void trnman_lock()
+{
+  pthread_mutex_lock(&LOCK_trn_list);
+}
+
+
+/**
+  unlock transaction list (counterpart of trnman_lock())
+*/
+
+void trnman_unlock()
+{
+  pthread_mutex_unlock(&LOCK_trn_list);
+}
+
+
+/**
+  Tell whether the transaction manager has been initialized
+*/
+
+my_bool trman_is_inited()
+{
+  return short_trid_to_active_trn != NULL;
+}
diff --git a/storage/maria/trnman.h b/storage/maria/trnman.h
new file mode 100644
index 00000000000..afe01d4ad10
--- /dev/null
+++ b/storage/maria/trnman.h
@@ -0,0 +1,67 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _trnman_h
+#define _trnman_h
+
+C_MODE_START
+
+#include <lf.h>
+#include "trnman_public.h"
+#include "ma_loghandler_lsn.h"
+
+/**
+ trid - 6 uchar transaction identifier. Assigned when a transaction
+ is created. Transaction can always be identified by its trid,
+ even after transaction has ended.
+
+ short_id - 2-byte transaction identifier, identifies a running
+ transaction, is reassigned when transaction ends.
+
+ when short_id is 0, TRN is not initialized, for all practical purposes
+ it could be considered unused.
+
+ when commit_trid is MAX_TRID the transaction is running, otherwise it's
+ committed.
+
+ state_lock mutex protects the state of a TRN, that is whether a TRN
+ is committed/running/unused. Meaning that modifications of short_id and
+ commit_trid happen under this mutex.
+*/
+
+struct st_ma_transaction
+{
+  LF_PINS *pins;                /**< pins for the trid_to_trn lock-free hash */
+  WT_THD *wt;                   /**< waiting-threads (deadlock detector) context */
+  pthread_mutex_t state_lock;   /**< protects short_id and commit_trid (see above) */
+  void *used_tables;            /**< Tables used by transaction */
+  TRN *next, *prev;             /**< links in active/committed lists (or free pool) */
+  TrID trid, min_read_from, commit_trid; /**< commit_trid == MAX_TRID: running */
+  LSN rec_lsn, undo_lsn;
+  LSN_WITH_FLAGS first_undo_lsn;
+  uint locked_tables;           /**< number of tables locked by this trn */
+  uint16 short_id;              /**< 2-byte running-trn id; 0 == unused */
+  uint16 flags;                 /**< Various flags */
+};
+
+#define TRANSACTION_LOGGED_LONG_ID ULL(0x8000000000000000)
+#define MAX_TRID (~(TrID)0)
+
+extern WT_RESOURCE_TYPE ma_rc_dup_unique;
+
+C_MODE_END
+
+#endif
+
diff --git a/storage/maria/trnman_public.h b/storage/maria/trnman_public.h
new file mode 100644
index 00000000000..9523eb5de8f
--- /dev/null
+++ b/storage/maria/trnman_public.h
@@ -0,0 +1,85 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
+/*
+ External definitions for trnman.h
+ We need to split this into two files as gcc 4.1.2 gives error if it tries
+ to include my_atomic.h in C++ code.
+*/
+
+#ifndef _trnman_public_h
+#define _trnman_public_h
+
+#include "ma_loghandler_lsn.h"
+#include <waiting_threads.h>
+
+C_MODE_START
+typedef uint64 TrID; /* our TrID is 6 bytes */
+typedef struct st_ma_transaction TRN;
+
+#define SHORT_TRID_MAX 65535
+
+extern uint trnman_active_transactions, trnman_allocated_transactions;
+extern TRN dummy_transaction_object;
+extern my_bool (*trnman_end_trans_hook)(TRN *trn, my_bool commit,
+ my_bool active_transactions);
+
+int trnman_init(TrID);
+void trnman_destroy(void);
+TRN *trnman_new_trn(WT_THD *wt);
+my_bool trnman_end_trn(TRN *trn, my_bool commit);
+#define trnman_commit_trn(T) trnman_end_trn(T, TRUE)
+#define trnman_abort_trn(T) trnman_end_trn(T, FALSE)
+#define trnman_rollback_trn(T) trnman_end_trn(T, FALSE)
+int trnman_can_read_from(TRN *trn, TrID trid);
+TRN *trnman_trid_to_trn(TRN *trn, TrID trid);
+void trnman_new_statement(TRN *trn);
+void trnman_rollback_statement(TRN *trn);
+my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com,
+ LSN *min_rec_lsn,
+ LSN *min_first_undo_lsn);
+
+uint trnman_increment_locked_tables(TRN *trn);
+uint trnman_decrement_locked_tables(TRN *trn);
+uint trnman_has_locked_tables(TRN *trn);
+void trnman_reset_locked_tables(TRN *trn, uint locked_tables);
+TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid);
+TRN *trnman_get_any_trn(void);
+TrID trnman_get_min_trid(void);
+TrID trnman_get_max_trid(void);
+TrID trnman_get_min_safe_trid();
+my_bool trnman_exists_active_transactions(TrID min_id, TrID max_id,
+ my_bool trnman_is_locked);
+#define TRANSID_SIZE 6
+#define transid_store(dst, id) int6store(dst,id)
+#define transid_korr(P) uint6korr(P)
+void trnman_lock();
+void trnman_unlock();
+my_bool trman_is_inited();
+#ifdef EXTRA_DEBUG
+uint16 trnman_get_flags(TRN *);
+void trnman_set_flags(TRN *, uint16 flags);
+#else
+#define trnman_get_flags(A) 0
+#define trnman_set_flags(A, B) do { } while (0)
+#endif
+
+/* Flag bits */
+#define TRN_STATE_INFO_LOGGED 1 /* Query is logged */
+#define TRN_STATE_TABLES_CAN_CHANGE 2 /* Things can change during trans. */
+
+C_MODE_END
+#endif
diff --git a/storage/maria/unittest/CMakeLists.txt b/storage/maria/unittest/CMakeLists.txt
new file mode 100644
index 00000000000..fe6327c6ea3
--- /dev/null
+++ b/storage/maria/unittest/CMakeLists.txt
@@ -0,0 +1,95 @@
+# Copyright (C) 2007 MySQL AB
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/zlib
+ ${CMAKE_SOURCE_DIR}/unittest/mytap)
+LINK_LIBRARIES(aria myisam mytap mysys dbug strings wsock32 zlib)
+
+ADD_EXECUTABLE(ma_control_file-t ma_control_file-t.c)
+ADD_EXECUTABLE(trnman-t trnman-t.c)
+ADD_EXECUTABLE(ma_test_loghandler-t
+ ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_multigroup-t
+ ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c)
+ADD_EXECUTABLE(ma_test_loghandler_multithread-t
+ ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_pagecache-t
+ ma_test_loghandler_pagecache-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_long-t
+ ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+SET_TARGET_PROPERTIES(ma_test_loghandler_long-t PROPERTIES COMPILE_FLAGS "-DLONG_LOG_TEST")
+
+ADD_EXECUTABLE(ma_test_loghandler_noflush-t
+ ma_test_loghandler_noflush-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_first_lsn-t
+ ma_test_loghandler_first_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_max_lsn-t
+ ma_test_loghandler_max_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_purge-t
+ ma_test_loghandler_purge-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_readonly-t
+ ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c)
+SET_TARGET_PROPERTIES(ma_test_loghandler_readonly-t PROPERTIES COMPILE_FLAGS "-DREADONLY_TEST")
+ADD_EXECUTABLE(ma_test_loghandler_nologs-t
+ ma_test_loghandler_nologs-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+
+SET(ma_pagecache_single_src ma_pagecache_single.c test_file.c test_file.h)
+SET(ma_pagecache_consist_src ma_pagecache_consist.c test_file.c test_file.h)
+SET(ma_pagecache_common_cppflags "-DEXTRA_DEBUG -DPAGECACHE_DEBUG -DMAIN")
+
+ADD_EXECUTABLE(ma_pagecache_single_1k-t ${ma_pagecache_single_src})
+SET_TARGET_PROPERTIES(ma_pagecache_single_1k-t
+ PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024")
+
+ADD_EXECUTABLE(ma_pagecache_single_8k-t ${ma_pagecache_single_src})
+SET_TARGET_PROPERTIES(ma_pagecache_single_8k-t
+ PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=8192")
+
+ADD_EXECUTABLE(ma_pagecache_single_64k-t ${ma_pagecache_single_src})
+SET_TARGET_PROPERTIES(ma_pagecache_single_64k-t
+ PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536")
+
+ADD_EXECUTABLE(ma_pagecache_consist_1k-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_1k-t
+ PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024")
+
+ADD_EXECUTABLE(ma_pagecache_consist_64k-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_64k-t
+ PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536")
+
+ADD_EXECUTABLE(ma_pagecache_consist_1kHC-t
+ ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_1kHC-t
+ PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_HIGH_CONCURENCY")
+ADD_EXECUTABLE(ma_pagecache_consist_64kHC-t
+ ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_64kHC-t
+ PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_HIGH_CONCURENCY")
+ADD_EXECUTABLE(ma_pagecache_consist_1kRD-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_1kRD-t
+ PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_READERS")
+ADD_EXECUTABLE(ma_pagecache_consist_64kRD-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_64kRD-t
+ PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_READERS")
+ADD_EXECUTABLE(ma_pagecache_consist_1kWR-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_1kWR-t
+ PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_WRITERS")
+ADD_EXECUTABLE(ma_pagecache_consist_64kWR-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_64kWR-t
+ PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_WRITERS")
+ADD_EXECUTABLE(ma_pagecache_rwconsist_1k-t ma_pagecache_rwconsist.c)
+SET_TARGET_PROPERTIES(ma_pagecache_rwconsist_1k-t PROPERTIES COMPILE_FLAGS "-DTEST_PAGE_SIZE=1024")
+ADD_EXECUTABLE(ma_pagecache_rwconsist2_1k-t ma_pagecache_rwconsist2.c)
+SET_TARGET_PROPERTIES(ma_pagecache_rwconsist2_1k-t PROPERTIES COMPILE_FLAGS "-DTEST_PAGE_SIZE=1024")
diff --git a/storage/maria/unittest/Makefile.am b/storage/maria/unittest/Makefile.am
new file mode 100644
index 00000000000..b5bc8587066
--- /dev/null
+++ b/storage/maria/unittest/Makefile.am
@@ -0,0 +1,115 @@
+# Copyright (C) 2006-2008 MySQL AB
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+AM_CPPFLAGS = @ZLIB_INCLUDES@ -I$(top_builddir)/include \
+ -I$(top_srcdir)/include -I$(top_srcdir)/unittest/mytap
+INCLUDES = @ZLIB_INCLUDES@ -I$(top_builddir)/include \
+ -I$(top_srcdir)/include -I$(top_srcdir)/unittest/mytap
+EXTRA_DIST= ma_test_all-t CMakeLists.txt \
+ ma_test_recovery.pl ma_test_recovery.expected
+# Only reason to link with libmyisam.a here is that it's where some fulltext
+# pieces are (but soon we'll remove fulltext dependencies from Aria).
+LDADD= $(top_builddir)/unittest/mytap/libmytap.a \
+ $(top_builddir)/storage/maria/libaria.a \
+ $(top_builddir)/storage/myisam/libmyisam.a \
+ $(top_builddir)/mysys/libmysys.a \
+ $(top_builddir)/dbug/libdbug.a \
+ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+noinst_PROGRAMS = ma_control_file-t trnman-t \
+ ma_pagecache_single_1k-t ma_pagecache_single_8k-t \
+ ma_pagecache_single_64k-t \
+ ma_pagecache_consist_1k-t \
+ ma_pagecache_consist_64k-t \
+ ma_pagecache_consist_1kHC-t \
+ ma_pagecache_consist_64kHC-t \
+ ma_pagecache_consist_1kRD-t \
+ ma_pagecache_consist_64kRD-t \
+ ma_pagecache_consist_1kWR-t \
+ ma_pagecache_consist_64kWR-t \
+ ma_pagecache_rwconsist_1k-t \
+ ma_pagecache_rwconsist2_1k-t \
+ ma_test_loghandler-t \
+ ma_test_loghandler_multigroup-t \
+ ma_test_loghandler_multithread-t \
+ ma_test_loghandler_multiflush-t \
+ ma_test_loghandler_pagecache-t \
+ ma_test_loghandler_long-t \
+ ma_test_loghandler_noflush-t \
+ ma_test_loghandler_first_lsn-t \
+ ma_test_loghandler_max_lsn-t \
+ ma_test_loghandler_purge-t \
+ ma_test_loghandler_readonly-t\
+ ma_test_loghandler_nologs-t
+
+ma_test_loghandler_t_SOURCES = ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_multigroup_t_SOURCES = ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c sequence_storage.h
+ma_test_loghandler_multithread_t_SOURCES = ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_multiflush_t_SOURCES = ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_multiflush_t_CPPFLAGS = -DMULTIFLUSH_TEST
+ma_test_loghandler_pagecache_t_SOURCES = ma_test_loghandler_pagecache-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_long_t_SOURCES = ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_long_t_CPPFLAGS = -DLONG_LOG_TEST
+ma_test_loghandler_noflush_t_SOURCES = ma_test_loghandler_noflush-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_first_lsn_t_SOURCES = ma_test_loghandler_first_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_max_lsn_t_SOURCES = ma_test_loghandler_max_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_purge_t_SOURCES = ma_test_loghandler_purge-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_readonly_t_SOURCES = ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c sequence_storage.h
+ma_test_loghandler_readonly_t_CPPFLAGS = -DREADONLY_TEST
+ma_test_loghandler_nologs_t_SOURCES = ma_test_loghandler_nologs-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+
+ma_pagecache_single_src = ma_pagecache_single.c test_file.c test_file.h
+ma_pagecache_consist_src = ma_pagecache_consist.c test_file.c test_file.h
+ma_pagecache_common_cppflags = -DEXTRA_DEBUG -DPAGECACHE_DEBUG -DMAIN
+
+ma_pagecache_single_1k_t_SOURCES = $(ma_pagecache_single_src)
+ma_pagecache_single_8k_t_SOURCES = $(ma_pagecache_single_src)
+ma_pagecache_single_64k_t_SOURCES = $(ma_pagecache_single_src)
+ma_pagecache_single_1k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024
+ma_pagecache_single_8k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=8192
+ma_pagecache_single_64k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DBIG
+
+ma_pagecache_consist_1k_t_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_1k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024
+ma_pagecache_consist_64k_t_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_64k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536
+
+ma_pagecache_consist_1kHC_t_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_1kHC_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024 -DTEST_HIGH_CONCURENCY
+ma_pagecache_consist_64kHC_t_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_64kHC_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DTEST_HIGH_CONCURENCY
+
+ma_pagecache_consist_1kRD_t_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_1kRD_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024 -DTEST_READERS
+ma_pagecache_consist_64kRD_t_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_64kRD_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DTEST_READERS
+
+ma_pagecache_consist_1kWR_t_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_1kWR_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024 -DTEST_WRITERS
+ma_pagecache_consist_64kWR_t_SOURCES = $(ma_pagecache_consist_src)
+ma_pagecache_consist_64kWR_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DTEST_WRITERS
+
+ma_pagecache_rwconsist_1k_t_SOURCES = ma_pagecache_rwconsist.c
+ma_pagecache_rwconsist_1k_t_CPPFLAGS = -DTEST_PAGE_SIZE=1024
+ma_pagecache_rwconsist2_1k_t_SOURCES = ma_pagecache_rwconsist2.c
+ma_pagecache_rwconsist2_1k_t_CPPFLAGS = -DTEST_PAGE_SIZE=1024
+
+# the generic lock manager may not be used in the end and lockman1-t crashes,
+# and lockman2-t takes at least a quarter of an hour,
+# so we don't build lockman-t and lockman1-t and lockman2-t
+CLEANFILES = aria_log_control page_cache_test_file_1 \
+ aria_log.????????
+
+# Don't update the files from BitKeeper
+%::SCCS/s.%
diff --git a/storage/maria/unittest/lockman-t.c b/storage/maria/unittest/lockman-t.c
new file mode 100644
index 00000000000..9b54a3d8ff9
--- /dev/null
+++ b/storage/maria/unittest/lockman-t.c
@@ -0,0 +1,308 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ lockman for row and table locks
+*/
+
+/* #define EXTRA_VERBOSE */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include "../lockman.h"
+
+#define Nlos 100
+LOCK_OWNER loarray[Nlos];
+pthread_mutex_t mutexes[Nlos];
+pthread_cond_t conds[Nlos];
+LOCKMAN lockman;
+
+#ifndef EXTRA_VERBOSE
+#define print_lockhash(X) /* no-op */
+#define DIAG(X) /* no-op */
+#else
+#define DIAG(X) diag X
+#endif
+
+LOCK_OWNER *loid2lo(uint16 loid)
+{
+ return loarray+loid-1;
+}
+
+#define unlock_all(O) diag("lo" #O "> release all locks"); \
+ lockman_release_locks(&lockman, loid2lo(O));print_lockhash(&lockman)
+#define test_lock(O, R, L, S, RES) \
+ ok(lockman_getlock(&lockman, loid2lo(O), R, L) == RES, \
+ "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \
+ print_lockhash(&lockman)
+#define lock_ok_a(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK)
+#define lock_ok_i(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE)
+#define lock_ok_l(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE)
+#define lock_conflict(O, R, L) \
+ test_lock(O, R, L, "cannot ", DIDNT_GET_THE_LOCK);
+
+void test_lockman_simple()
+{
+ /* simple */
+ lock_ok_a(1, 1, S);
+ lock_ok_i(2, 2, IS);
+ lock_ok_i(1, 2, IX);
+ /* lock escalation */
+ lock_ok_a(1, 1, X);
+ lock_ok_i(2, 2, IX);
+ /* failures */
+ lock_conflict(2, 1, X);
+ unlock_all(2);
+ lock_ok_a(1, 2, S);
+ lock_ok_a(1, 2, IS);
+ lock_ok_a(1, 2, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_a(2, 3, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_l(2, 3, IS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_conflict(2, 1, S);
+ lock_ok_a(1, 1, LS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_a(2, 1, LS);
+ lock_ok_a(1, 1, LS);
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(3, 1, IS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+
+ lock_ok_i(1, 4, IS);
+ lock_ok_i(2, 4, IS);
+ lock_ok_i(3, 4, IS);
+ lock_ok_a(3, 4, LS);
+ lock_ok_i(4, 4, IS);
+ lock_conflict(4, 4, IX);
+ lock_conflict(2, 4, IX);
+ lock_ok_a(1, 4, LS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+ unlock_all(4);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(2, 1, IX);
+ lock_conflict(1, 1, S);
+ lock_conflict(2, 1, X);
+ unlock_all(1);
+ unlock_all(2);
+}
+
+int rt_num_threads;
+int litmus;
+int thread_number= 0, timeouts= 0;
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+ pthread_t *threads;
+ ulonglong now= my_getsystime();
+ int i;
+
+ thread_number= timeouts= 0;
+ litmus= 0;
+
+ threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0));
+ if (!threads)
+ {
+ diag("Out of memory");
+ abort();
+ }
+
+ diag("Running %s with %d threads, %d iterations... ", test, n, m);
+ rt_num_threads= n;
+ for (i= 0; i < n ; i++)
+ if (pthread_create(threads+i, 0, handler, &m))
+ {
+ diag("Could not create thread");
+ abort();
+ }
+ for (i= 0 ; i < n ; i++)
+ pthread_join(threads[i], 0);
+ now= my_getsystime()-now;
+ ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+ my_free((void*)threads, MYF(0));
+}
+
+pthread_mutex_t rt_mutex;
+int Nrows= 100;
+int Ntables= 10;
+int table_lock_ratio= 10;
+enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX};
+char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"};
+char *res2str[4]= {
+ "DIDN'T GET THE LOCK",
+ "GOT THE LOCK",
+ "GOT THE LOCK NEED TO LOCK A SUBRESOURCE",
+ "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"};
+pthread_handler_t test_lockman(void *arg)
+{
+ int m= (*(int *)arg);
+ uint x, loid, row, table, res, locklevel, timeout= 0;
+ LOCK_OWNER *lo;
+
+ pthread_mutex_lock(&rt_mutex);
+ loid= ++thread_number;
+ pthread_mutex_unlock(&rt_mutex);
+ lo= loid2lo(loid);
+
+ for (x= ((int)(intptr)(&m)); m > 0; m--)
+ {
+ x= (x*3628273133 + 1500450271) % 9576890767; /* three prime numbers */
+ row= x % Nrows + Ntables;
+ table= row % Ntables;
+ locklevel= (x/Nrows) & 3;
+ if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0)
+ { /* table lock */
+ res= lockman_getlock(&lockman, lo, table, lock_array[locklevel]);
+ DIAG(("loid %2d, table %d, lock %s, res %s", loid, table,
+ lock2str[locklevel], res2str[res]));
+ if (res == DIDNT_GET_THE_LOCK)
+ {
+ lockman_release_locks(&lockman, lo);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ }
+ else
+ { /* row lock */
+ locklevel&= 1;
+ res= lockman_getlock(&lockman, lo, table, lock_array[locklevel + 4]);
+ DIAG(("loid %2d, row %d, lock %s, res %s", loid, row,
+ lock2str[locklevel+4], res2str[res]));
+ switch (res)
+ {
+ case DIDNT_GET_THE_LOCK:
+ lockman_release_locks(&lockman, lo);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ case GOT_THE_LOCK:
+ continue;
+ case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE:
+ /* not implemented, so take a regular lock */
+ case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE:
+ res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]);
+ DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row,
+ lock2str[locklevel], res2str[res]));
+ if (res == DIDNT_GET_THE_LOCK)
+ {
+ lockman_release_locks(&lockman, lo);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ continue;
+ default:
+ DBUG_ASSERT(0);
+ }
+ }
+ }
+
+ lockman_release_locks(&lockman, lo);
+
+ pthread_mutex_lock(&rt_mutex);
+ rt_num_threads--;
+ timeouts+= timeout;
+ if (!rt_num_threads)
+ diag("number of timeouts: %d", timeouts);
+ pthread_mutex_unlock(&rt_mutex);
+
+ return 0;
+}
+
+int main()
+{
+ int i;
+
+ my_init();
+ pthread_mutex_init(&rt_mutex, 0);
+
+ plan(35);
+
+ if (my_atomic_initialize())
+ return exit_status();
+
+
+ lockman_init(&lockman, &loid2lo, 50);
+
+ for (i= 0; i < Nlos; i++)
+ {
+ loarray[i].pins= lf_alloc_get_pins(&lockman.alloc);
+ loarray[i].all_locks= 0;
+ loarray[i].waiting_for= 0;
+ pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST);
+ pthread_cond_init (&conds[i], 0);
+ loarray[i].mutex= &mutexes[i];
+ loarray[i].cond= &conds[i];
+ loarray[i].loid= i+1;
+ }
+
+ test_lockman_simple();
+
+#define CYCLES 10000
+#define THREADS Nlos /* don't change this line */
+
+ /* mixed load, stress-test with random locks */
+ Nrows= 100;
+ Ntables= 10;
+ table_lock_ratio= 10;
+ run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES);
+
+ /* "real-life" simulation - many rows, no table locks */
+ Nrows= 1000000;
+ Ntables= 10;
+ table_lock_ratio= 0;
+ run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10);
+
+ for (i= 0; i < Nlos; i++)
+ {
+ lockman_release_locks(&lockman, &loarray[i]);
+ pthread_mutex_destroy(loarray[i].mutex);
+ pthread_cond_destroy(loarray[i].cond);
+ lf_pinbox_put_pins(loarray[i].pins);
+ }
+
+ {
+ ulonglong now= my_getsystime();
+ lockman_destroy(&lockman);
+ now= my_getsystime()-now;
+ diag("lockman_destroy: %g secs", ((double)now)/1e7);
+ }
+
+ pthread_mutex_destroy(&rt_mutex);
+ my_end(0);
+ return exit_status();
+}
+
diff --git a/storage/maria/unittest/lockman1-t.c b/storage/maria/unittest/lockman1-t.c
new file mode 100644
index 00000000000..ca959c6e6e3
--- /dev/null
+++ b/storage/maria/unittest/lockman1-t.c
@@ -0,0 +1,334 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ lockman for row locks, tablockman for table locks
+*/
+
+/* #define EXTRA_VERBOSE */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include "../lockman.h"
+#include "../tablockman.h"
+
+#define Nlos 100
+#define Ntbls 10
+LOCK_OWNER loarray[Nlos];
+TABLE_LOCK_OWNER loarray1[Nlos];
+pthread_mutex_t mutexes[Nlos];
+pthread_cond_t conds[Nlos];
+LOCKED_TABLE ltarray[Ntbls];
+LOCKMAN lockman;
+TABLOCKMAN tablockman;
+
+#ifndef EXTRA_VERBOSE
+#define print_lo1(X) /* no-op */
+#define DIAG(X) /* no-op */
+#else
+#define DIAG(X) diag X
+#endif
+
+LOCK_OWNER *loid2lo(uint16 loid)
+{
+ return loarray+loid-1;
+}
+TABLE_LOCK_OWNER *loid2lo1(uint16 loid)
+{
+ return loarray1+loid-1;
+}
+
+#define unlock_all(O) diag("lo" #O "> release all locks"); \
+ tablockman_release_locks(&tablockman, loid2lo1(O));
+#define test_lock(O, R, L, S, RES) \
+ ok(tablockman_getlock(&tablockman, loid2lo1(O), &ltarray[R], L) == RES, \
+ "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \
+ print_lo1(loid2lo1(O));
+#define lock_ok_a(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK)
+#define lock_ok_i(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE)
+#define lock_ok_l(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE)
+#define lock_conflict(O, R, L) \
+ test_lock(O, R, L, "cannot ", LOCK_TIMEOUT);
+
+void test_tablockman_simple()
+{
+ /* simple */
+ lock_ok_a(1, 1, S);
+ lock_ok_i(2, 2, IS);
+ lock_ok_i(1, 2, IX);
+ /* lock escalation */
+ lock_ok_a(1, 1, X);
+ lock_ok_i(2, 2, IX);
+ /* failures */
+ lock_conflict(2, 1, X);
+ unlock_all(2);
+ lock_ok_a(1, 2, S);
+ lock_ok_a(1, 2, IS);
+ lock_ok_a(1, 2, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_a(2, 3, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_l(2, 3, IS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_conflict(2, 1, S);
+ lock_ok_a(1, 1, LS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_a(2, 1, LS);
+ lock_ok_a(1, 1, LS);
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(3, 1, IS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+
+ lock_ok_i(1, 4, IS);
+ lock_ok_i(2, 4, IS);
+ lock_ok_i(3, 4, IS);
+ lock_ok_a(3, 4, LS);
+ lock_ok_i(4, 4, IS);
+ lock_conflict(4, 4, IX);
+ lock_conflict(2, 4, IX);
+ lock_ok_a(1, 4, LS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+ unlock_all(4);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(2, 1, IX);
+ lock_conflict(1, 1, S);
+ lock_conflict(2, 1, X);
+ unlock_all(1);
+ unlock_all(2);
+}
+
+int rt_num_threads;
+int litmus;
+int thread_number= 0, timeouts= 0;
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+ pthread_t *threads;
+ ulonglong now= my_getsystime();
+ int i;
+
+ thread_number= timeouts= 0;
+ litmus= 0;
+
+ threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0));
+ if (!threads)
+ {
+ diag("Out of memory");
+ abort();
+ }
+
+ diag("Running %s with %d threads, %d iterations... ", test, n, m);
+ rt_num_threads= n;
+ for (i= 0; i < n ; i++)
+ if (pthread_create(threads+i, 0, handler, &m))
+ {
+ diag("Could not create thread");
+ abort();
+ }
+ for (i= 0 ; i < n ; i++)
+ pthread_join(threads[i], 0);
+ now= my_getsystime()-now;
+ ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+ my_free((void*)threads, MYF(0));
+}
+
+pthread_mutex_t rt_mutex;
+int Nrows= 100;
+int Ntables= 10;
+int table_lock_ratio= 10;
+enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX};
+char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"};
+char *res2str[]= {
+ "DIDN'T GET THE LOCK",
+ "OUT OF MEMORY",
+ "DEADLOCK",
+ "LOCK TIMEOUT",
+ "GOT THE LOCK",
+ "GOT THE LOCK NEED TO LOCK A SUBRESOURCE",
+ "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"};
+pthread_handler_t test_lockman(void *arg)
+{
+ int m= (*(int *)arg);
+ uint x, loid, row, table, res, locklevel, timeout= 0;
+ LOCK_OWNER *lo; TABLE_LOCK_OWNER *lo1; DBUG_ASSERT(Ntables <= Ntbls);
+
+ pthread_mutex_lock(&rt_mutex);
+ loid= ++thread_number;
+ pthread_mutex_unlock(&rt_mutex);
+ lo= loid2lo(loid); lo1= loid2lo1(loid);
+
+ for (x= ((int)(intptr)(&m)); m > 0; m--)
+ {
+ x= (x*3628273133 + 1500450271) % 9576890767; /* three prime numbers */
+ row= x % Nrows + Ntables;
+ table= row % Ntables;
+ locklevel= (x/Nrows) & 3;
+ if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0)
+ { /* table lock */
+ res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel]);
+ DIAG(("loid %2d, table %d, lock %s, res %s", loid, table,
+ lock2str[locklevel], res2str[res]));
+ if (res < GOT_THE_LOCK)
+ {
+ lockman_release_locks(&lockman, lo); tablockman_release_locks(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ }
+ else
+ { /* row lock */
+ locklevel&= 1;
+ res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel + 4]);
+ DIAG(("loid %2d, row %d, lock %s, res %s", loid, row,
+ lock2str[locklevel+4], res2str[res]));
+ switch (res)
+ {
+ case GOT_THE_LOCK:
+ continue;
+ case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE:
+ /* not implemented, so take a regular lock */
+ case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE:
+ res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]);
+ DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row,
+ lock2str[locklevel], res2str[res]));
+ if (res == DIDNT_GET_THE_LOCK)
+ {
+ lockman_release_locks(&lockman, lo);
+ tablockman_release_locks(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ continue;
+ default:
+ lockman_release_locks(&lockman, lo); tablockman_release_locks(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ }
+ }
+
+ lockman_release_locks(&lockman, lo);
+ tablockman_release_locks(&tablockman, lo1);
+
+ pthread_mutex_lock(&rt_mutex);
+ rt_num_threads--;
+ timeouts+= timeout;
+ if (!rt_num_threads)
+ diag("number of timeouts: %d", timeouts);
+ pthread_mutex_unlock(&rt_mutex);
+
+ return 0;
+}
+
+int main()
+{
+ int i;
+
+ my_init();
+ pthread_mutex_init(&rt_mutex, 0);
+
+ plan(35);
+
+ if (my_atomic_initialize())
+ return exit_status();
+
+
+ lockman_init(&lockman, &loid2lo, 50);
+ tablockman_init(&tablockman, &loid2lo1, 50);
+
+ for (i= 0; i < Nlos; i++)
+ {
+ pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST);
+ pthread_cond_init (&conds[i], 0);
+
+ loarray[i].pins= lf_alloc_get_pins(&lockman.alloc);
+ loarray[i].all_locks= 0;
+ loarray[i].waiting_for= 0;
+ loarray[i].mutex= &mutexes[i];
+ loarray[i].cond= &conds[i];
+ loarray[i].loid= i+1;
+
+ loarray1[i].active_locks= 0;
+ loarray1[i].waiting_lock= 0;
+ loarray1[i].waiting_for= 0;
+ loarray1[i].mutex= &mutexes[i];
+ loarray1[i].cond= &conds[i];
+ loarray1[i].loid= i+1;
+ }
+
+ for (i= 0; i < Ntbls; i++)
+ {
+ tablockman_init_locked_table(ltarray+i, Nlos);
+ }
+
+ test_tablockman_simple();
+
+#define CYCLES 10000
+#define THREADS Nlos /* don't change this line */
+
+ /* mixed load, stress-test with random locks */
+ Nrows= 100;
+ Ntables= 10;
+ table_lock_ratio= 10;
+ run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES);
+
+ /* "real-life" simulation - many rows, no table locks */
+ Nrows= 1000000;
+ Ntables= 10;
+ table_lock_ratio= 0;
+ run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10);
+
+ for (i= 0; i < Nlos; i++)
+ {
+ lockman_release_locks(&lockman, &loarray[i]);
+ pthread_mutex_destroy(loarray[i].mutex);
+ pthread_cond_destroy(loarray[i].cond);
+ lf_pinbox_put_pins(loarray[i].pins);
+ }
+
+ {
+ ulonglong now= my_getsystime();
+ lockman_destroy(&lockman);
+ now= my_getsystime()-now;
+ diag("lockman_destroy: %g secs", ((double)now)/1e7);
+ }
+
+ pthread_mutex_destroy(&rt_mutex);
+ my_end(0);
+ return exit_status();
+}
+
diff --git a/storage/maria/unittest/lockman2-t.c b/storage/maria/unittest/lockman2-t.c
new file mode 100644
index 00000000000..c1d40159500
--- /dev/null
+++ b/storage/maria/unittest/lockman2-t.c
@@ -0,0 +1,361 @@
+/* Copyright (C) 2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ tablockman for row and table locks
+*/
+
+/* #define EXTRA_VERBOSE */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include "../tablockman.h"
+
+#define Nlos 100
+#define Ntbls 110
+TABLE_LOCK_OWNER loarray1[Nlos];
+pthread_mutex_t mutexes[Nlos];
+pthread_cond_t conds[Nlos];
+LOCKED_TABLE ltarray[Ntbls];
+TABLOCKMAN tablockman;
+
+#ifndef EXTRA_VERBOSE
+#define print_lo1(X) /* no-op */
+#define DIAG(X) /* no-op */
+#else
+#define DIAG(X) diag X
+#endif
+
+TABLE_LOCK_OWNER *loid2lo1(uint16 loid)
+{
+ return loarray1+loid-1;
+}
+
+#define unlock_all(O) diag("lo" #O "> release all locks"); \
+ tablockman_release_locks(&tablockman, loid2lo1(O));
+#define test_lock(O, R, L, S, RES) \
+ ok(tablockman_getlock(&tablockman, loid2lo1(O), &ltarray[R], L) == RES, \
+ "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \
+ print_lo1(loid2lo1(O));
+#define lock_ok_a(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK)
+#define lock_ok_i(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE)
+#define lock_ok_l(O, R, L) \
+ test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE)
+#define lock_conflict(O, R, L) \
+ test_lock(O, R, L, "cannot ", LOCK_TIMEOUT);
+
+void test_tablockman_simple()
+{
+ /* simple */
+ lock_ok_a(1, 1, S);
+ lock_ok_i(2, 2, IS);
+ lock_ok_i(1, 2, IX);
+ /* lock escalation */
+ lock_ok_a(1, 1, X);
+ lock_ok_i(2, 2, IX);
+ /* failures */
+ lock_conflict(2, 1, X);
+ unlock_all(2);
+ lock_ok_a(1, 2, S);
+ lock_ok_a(1, 2, IS);
+ lock_ok_a(1, 2, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_a(2, 3, LS);
+ lock_ok_i(1, 3, IX);
+ lock_ok_l(2, 3, IS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_conflict(2, 1, S);
+ lock_ok_a(1, 1, LS);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_a(2, 1, LS);
+ lock_ok_a(1, 1, LS);
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(3, 1, IS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+
+ lock_ok_i(1, 4, IS);
+ lock_ok_i(2, 4, IS);
+ lock_ok_i(3, 4, IS);
+ lock_ok_a(3, 4, LS);
+ lock_ok_i(4, 4, IS);
+ lock_conflict(4, 4, IX);
+ lock_conflict(2, 4, IX);
+ lock_ok_a(1, 4, LS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+ unlock_all(4);
+
+ lock_ok_i(1, 1, IX);
+ lock_ok_i(2, 1, IX);
+ lock_conflict(1, 1, S);
+ lock_conflict(2, 1, X);
+ unlock_all(1);
+ unlock_all(2);
+
+ lock_ok_i(1, 1, IS);
+ lock_conflict(2, 1, X);
+ lock_conflict(3, 1, IS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+
+ lock_ok_a(1, 1, S);
+ lock_conflict(2, 1, IX);
+ lock_conflict(3, 1, IS);
+ unlock_all(1);
+ unlock_all(2);
+ unlock_all(3);
+}
+
+int rt_num_threads;
+int litmus;
+int thread_number= 0, timeouts= 0;
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+ pthread_t *threads;
+ ulonglong now= my_getsystime();
+ int i;
+
+ thread_number= timeouts= 0;
+ litmus= 0;
+
+ threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0));
+ if (!threads)
+ {
+ diag("Out of memory");
+ abort();
+ }
+
+ diag("Running %s with %d threads, %d iterations... ", test, n, m);
+ rt_num_threads= n;
+ for (i= 0; i < n ; i++)
+ if (pthread_create(threads+i, 0, handler, &m))
+ {
+ diag("Could not create thread");
+ abort();
+ }
+ for (i= 0 ; i < n ; i++)
+ pthread_join(threads[i], 0);
+ now= my_getsystime()-now;
+ ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+ my_free((void*)threads, MYF(0));
+}
+
+static void reinit_tlo(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo)
+{
+#ifdef NOT_USED_YET
+ TABLE_LOCK_OWNER backup= *lo;
+#endif
+
+ tablockman_release_locks(lm, lo);
+#ifdef NOT_USED_YET
+ pthread_mutex_destroy(lo->mutex);
+ pthread_cond_destroy(lo->cond);
+ bzero(lo, sizeof(*lo));
+
+ lo->mutex= backup.mutex;
+ lo->cond= backup.cond;
+ lo->loid= backup.loid;
+ pthread_mutex_init(lo->mutex, MY_MUTEX_INIT_FAST);
+ pthread_cond_init(lo->cond, 0);
+#endif
+}
+
+pthread_mutex_t rt_mutex;
+int Nrows= 100;
+int Ntables= 10;
+int table_lock_ratio= 10;
+enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX};
+const char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"};
+const char *res2str[]= {
+ 0,
+ "OUT OF MEMORY",
+ "DEADLOCK",
+ "LOCK TIMEOUT",
+ "GOT THE LOCK",
+ "GOT THE LOCK NEED TO LOCK A SUBRESOURCE",
+ "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"};
+
+pthread_handler_t test_lockman(void *arg)
+{
+ int m= (*(int *)arg);
+ uint x, loid, row, table, res, locklevel, timeout= 0;
+ TABLE_LOCK_OWNER *lo1;
+ DBUG_ASSERT(Ntables <= Ntbls);
+ DBUG_ASSERT(Nrows + Ntables <= Ntbls);
+
+ pthread_mutex_lock(&rt_mutex);
+ loid= ++thread_number;
+ pthread_mutex_unlock(&rt_mutex);
+ lo1= loid2lo1(loid);
+
+ for (x= ((int)(intptr)(&m)); m > 0; m--)
+ {
+ /* three prime numbers */
+ x= (uint) ((x*LL(3628273133) + LL(1500450271)) % LL(9576890767));
+ row= x % Nrows + Ntables;
+ table= row % Ntables;
+ locklevel= (x/Nrows) & 3;
+ if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0)
+ {
+ /* table lock */
+ res= tablockman_getlock(&tablockman, lo1, ltarray+table,
+ lock_array[locklevel]);
+ DIAG(("loid %2d, table %d, lock %s, res %s", loid, table,
+ lock2str[locklevel], res2str[res]));
+ if (res < GOT_THE_LOCK)
+ {
+ reinit_tlo(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ }
+ else
+ { /* row lock */
+ locklevel&= 1;
+ res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel + 4]);
+ DIAG(("loid %2d, row %d, lock %s, res %s", loid, row,
+ lock2str[locklevel+4], res2str[res]));
+ switch (res)
+ {
+ case GOT_THE_LOCK:
+ continue;
+ case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE:
+ /* not implemented, so take a regular lock */
+ case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE:
+ res= tablockman_getlock(&tablockman, lo1, ltarray+row, lock_array[locklevel]);
+ DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row,
+ lock2str[locklevel], res2str[res]));
+ if (res < GOT_THE_LOCK)
+ {
+ reinit_tlo(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ DBUG_ASSERT(res == GOT_THE_LOCK);
+ continue;
+ default:
+ reinit_tlo(&tablockman, lo1);
+ DIAG(("loid %2d, release all locks", loid));
+ timeout++;
+ continue;
+ }
+ }
+ }
+
+ reinit_tlo(&tablockman, lo1);
+
+ pthread_mutex_lock(&rt_mutex);
+ rt_num_threads--;
+ timeouts+= timeout;
+ if (!rt_num_threads)
+ diag("number of timeouts: %d", timeouts);
+ pthread_mutex_unlock(&rt_mutex);
+
+ return 0;
+}
+
+int main(int argc __attribute__((unused)), char **argv)
+{
+ int i;
+ MY_INIT(argv[0]);
+
+ my_init();
+ pthread_mutex_init(&rt_mutex, 0);
+
+ plan(40);
+
+ if (my_atomic_initialize())
+ return exit_status();
+
+
+ tablockman_init(&tablockman, &loid2lo1, 50);
+
+ for (i= 0; i < Nlos; i++)
+ {
+ pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST);
+ pthread_cond_init (&conds[i], 0);
+
+ loarray1[i].active_locks= 0;
+ loarray1[i].waiting_lock= 0;
+ loarray1[i].waiting_for= 0;
+ loarray1[i].mutex= &mutexes[i];
+ loarray1[i].cond= &conds[i];
+ loarray1[i].loid= i+1;
+ }
+
+ for (i= 0; i < Ntbls; i++)
+ {
+ tablockman_init_locked_table(ltarray+i, Nlos);
+ }
+
+ test_tablockman_simple();
+
+#define CYCLES 10000
+#define THREADS Nlos /* don't change this line */
+
+ /* mixed load, stress-test with random locks */
+ Nrows= 100;
+ Ntables= 10;
+ table_lock_ratio= 10;
+ run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES);
+#if 0
+ /* "real-life" simulation - many rows, no table locks */
+ Nrows= 1000000;
+ Ntables= 10;
+ table_lock_ratio= 0;
+ run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10);
+#endif
+ for (i= 0; i < Nlos; i++)
+ {
+ tablockman_release_locks(&tablockman, &loarray1[i]);
+ pthread_mutex_destroy(loarray1[i].mutex);
+ pthread_cond_destroy(loarray1[i].cond);
+ }
+
+ {
+ ulonglong now= my_getsystime();
+ for (i= 0; i < Ntbls; i++)
+ {
+ tablockman_destroy_locked_table(ltarray+i);
+ }
+ tablockman_destroy(&tablockman);
+ now= my_getsystime()-now;
+ diag("lockman_destroy: %g secs", ((double)now)/1e7);
+ }
+
+ pthread_mutex_destroy(&rt_mutex);
+ my_end(0);
+ return exit_status();
+}
+
diff --git a/storage/maria/unittest/ma_control_file-t.c b/storage/maria/unittest/ma_control_file-t.c
new file mode 100644
index 00000000000..164ea284f31
--- /dev/null
+++ b/storage/maria/unittest/ma_control_file-t.c
@@ -0,0 +1,592 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Unit test of the control file module of the Aria engine WL#3234 */
+
+/*
+ Note that it is not possible to test the durability of the write (can't
+ pull the plug programmatically :)
+*/
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <tap.h>
+
+#ifndef WITH_ARIA_STORAGE_ENGINE
+/*
+ If Aria is not compiled in, normally we don't come to building this test.
+*/
+#error "Aria engine is not compiled in, test cannot be built"
+#endif
+
+#include "maria.h"
+#include "../../../storage/maria/maria_def.h"
+#include <my_getopt.h>
+
+#define EXTRACT_DEFINITIONS
+#include "../ma_control_file.c"
+#undef EXTRACT_DEFINITIONS
+
+char file_name[FN_REFLEN];
+
+/* The values we'll set and expect the control file module to return */
+LSN expect_checkpoint_lsn;
+uint32 expect_logno;
+TrID expect_max_trid;
+uint8 expect_recovery_failures;
+
+static int delete_file(myf my_flags);
+/*
+ Those are test-specific wrappers around the module's API functions: after
+ calling the module's API functions they perform checks on the result.
+*/
+static int close_file(void); /* wraps ma_control_file_end */
+/* wraps ma_control_file_open_or_create */
+static int open_file(void);
+/* wraps ma_control_file_write_and_force */
+static int write_file(LSN checkpoint_lsn, uint32 logno, TrID trid,
+ uint8 rec_failures);
+
+/* Tests */
+static int test_one_log_and_recovery_failures(void);
+static int test_five_logs_and_max_trid(void);
+static int test_3_checkpoints_and_2_logs(void);
+static int test_binary_content(void);
+static int test_start_stop(void);
+static int test_2_open_and_2_close(void);
+static int test_bad_magic_string(void);
+static int test_bad_checksum(void);
+static int test_bad_hchecksum(void);
+static int test_future_size(void);
+static int test_bad_blocksize(void);
+static int test_bad_size(void);
+
+/* Utility */
+static int verify_module_values_match_expected(void);
+static int verify_module_values_are_impossible(void);
+static void usage(void);
+static void get_options(int argc, char *argv[]);
+
+/*
+ If "expr" is FALSE, this macro will make the function print a diagnostic
+ message and immediately return 1.
+ This is inspired from assert() but does not crash the binary (sometimes we
+ may want to see how other tests go even if one fails).
+ RET_ERR means "return error".
+*/
+
+#define RET_ERR_UNLESS(expr) \
+ {if (!(expr)) {diag("line %d: failure: '%s'", __LINE__, #expr); assert(0);return 1;}}
+
+
+/* Used to ignore error messages from ma_control_file_open() */
+
+static int my_ignore_message(uint error __attribute__((unused)),
+ const char *str __attribute__((unused)),
+ myf MyFlags __attribute__((unused)))
+{
+ DBUG_ENTER("my_ignore_message");
+ DBUG_PRINT("enter",("message: %s",str));
+ DBUG_RETURN(0);
+}
+
+int (*default_error_handler_hook)(uint my_err, const char *str,
+ myf MyFlags) = 0;
+
+
+/* like ma_control_file_open(), but without error messages */
+
+static CONTROL_FILE_ERROR local_ma_control_file_open(void)
+{
+ CONTROL_FILE_ERROR error;
+ error_handler_hook= my_ignore_message;
+ error= ma_control_file_open(TRUE, TRUE);
+ error_handler_hook= default_error_handler_hook;
+ return error;
+}
+
+
+
+int main(int argc,char *argv[])
+{
+ MY_INIT(argv[0]);
+ my_init();
+
+ maria_data_root= (char *)".";
+ default_error_handler_hook= error_handler_hook;
+
+ plan(12);
+
+ diag("Unit tests for control file");
+
+ get_options(argc,argv);
+
+ diag("Deleting control file at startup, if there is an old one");
+ RET_ERR_UNLESS(0 == delete_file(0)); /* if fails, can't continue */
+
+ diag("Tests of normal conditions");
+ ok(0 == test_one_log_and_recovery_failures(),
+ "test of creating one log and recording recovery failures");
+ ok(0 == test_five_logs_and_max_trid(),
+ "test of creating five logs and many transactions");
+ ok(0 == test_3_checkpoints_and_2_logs(),
+ "test of creating three checkpoints and two logs");
+ ok(0 == test_binary_content(), "test of the binary content of the file");
+ ok(0 == test_start_stop(), "test of multiple starts and stops");
+ diag("Tests of abnormal conditions");
+ ok(0 == test_2_open_and_2_close(),
+ "test of two open and two close (strange call sequence)");
+ ok(0 == test_bad_magic_string(), "test of bad magic string");
+ ok(0 == test_bad_checksum(), "test of bad checksum");
+ ok(0 == test_bad_hchecksum(), "test of bad hchecksum");
+ ok(0 == test_future_size(), "test of ability to handle future versions");
+ ok(0 == test_bad_blocksize(), "test of bad blocksize");
+ ok(0 == test_bad_size(), "test of too small/big file");
+
+ return exit_status();
+}
+
+
+static int delete_file(myf my_flags)
+{
+ RET_ERR_UNLESS(fn_format(file_name, CONTROL_FILE_BASE_NAME,
+ maria_data_root, "", MYF(MY_WME)) != NullS);
+ /*
+ Maybe file does not exist, ignore error.
+ The error will however be printed on stderr.
+ */
+ my_delete(file_name, my_flags);
+ expect_checkpoint_lsn= LSN_IMPOSSIBLE;
+ expect_logno= FILENO_IMPOSSIBLE;
+ expect_max_trid= expect_recovery_failures= 0;
+
+ return 0;
+}
+
+/*
+ Verifies that global values last_checkpoint_lsn, last_logno,
+ max_trid_in_control_file (belonging to the module) match what we expect.
+*/
+static int verify_module_values_match_expected(void)
+{
+ RET_ERR_UNLESS(last_logno == expect_logno);
+ RET_ERR_UNLESS(last_checkpoint_lsn == expect_checkpoint_lsn);
+ RET_ERR_UNLESS(max_trid_in_control_file == expect_max_trid);
+ RET_ERR_UNLESS(recovery_failures == expect_recovery_failures);
+ return 0;
+}
+
+
+/*
+ Verifies that global values last_checkpoint_lsn and last_logno (belonging
+ to the module) are impossible (this is used when the file has been closed).
+*/
+static int verify_module_values_are_impossible(void)
+{
+ RET_ERR_UNLESS(last_logno == FILENO_IMPOSSIBLE);
+ RET_ERR_UNLESS(last_checkpoint_lsn == LSN_IMPOSSIBLE);
+ RET_ERR_UNLESS(max_trid_in_control_file == 0);
+ return 0;
+}
+
+
+static int close_file(void)
+{
+ /* Simulate shutdown */
+ ma_control_file_end();
+ /* Verify amnesia */
+ RET_ERR_UNLESS(verify_module_values_are_impossible() == 0);
+ return 0;
+}
+
+static int open_file(void)
+{
+ RET_ERR_UNLESS(local_ma_control_file_open() == CONTROL_FILE_OK);
+ /* Check that the module reports expected information */
+ RET_ERR_UNLESS(verify_module_values_match_expected() == 0);
+ return 0;
+}
+
+static int write_file(LSN checkpoint_lsn, uint32 logno, TrID trid,
+ uint8 rec_failures)
+{
+ RET_ERR_UNLESS(ma_control_file_write_and_force(checkpoint_lsn, logno, trid,
+ rec_failures)
+ == 0);
+ /* Check that the module reports expected information */
+ RET_ERR_UNLESS(verify_module_values_match_expected() == 0);
+ return 0;
+}
+
+static int test_one_log_and_recovery_failures(void)
+{
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ expect_logno= 123;
+ RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno,
+ max_trid_in_control_file,
+ recovery_failures) == 0);
+ expect_recovery_failures= 158;
+ RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno,
+ max_trid_in_control_file,
+ expect_recovery_failures) == 0);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_five_logs_and_max_trid(void)
+{
+ uint i;
+
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ expect_logno= 100;
+ expect_max_trid= ULL(14111978111);
+ for (i= 0; i<5; i++)
+ {
+ expect_logno*= 3;
+ RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno,
+ expect_max_trid,
+ recovery_failures) == 0);
+ }
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_3_checkpoints_and_2_logs(void)
+{
+ /*
+ Simulate one checkpoint, one log creation, two checkpoints, one
+ log creation.
+ */
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ expect_checkpoint_lsn= MAKE_LSN(5, 10000);
+ RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno,
+ max_trid_in_control_file,
+ recovery_failures) == 0);
+
+ expect_logno= 17;
+ RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno,
+ max_trid_in_control_file,
+ recovery_failures) == 0);
+
+ expect_checkpoint_lsn= MAKE_LSN(17, 20000);
+ RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno,
+ max_trid_in_control_file,
+ recovery_failures) == 0);
+
+ expect_checkpoint_lsn= MAKE_LSN(17, 45000);
+ RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno,
+ max_trid_in_control_file,
+ recovery_failures) == 0);
+
+ expect_logno= 19;
+ RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno,
+ max_trid_in_control_file,
+ recovery_failures) == 0);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_binary_content(void)
+{
+ uint i;
+ int fd;
+
+ /*
+ TEST4: actually check by ourselves the content of the file.
+ Note that constants (offsets) are hard-coded here, precisely to prevent
+ someone from changing them in the control file module and breaking
+ backward-compatibility.
+ TODO: when we reach the format-freeze state, we may even just do a
+ comparison with a raw binary string, to not depend on any uint4korr
+ future change/breakage.
+ */
+
+ uchar buffer[45];
+ RET_ERR_UNLESS((fd= my_open(file_name,
+ O_BINARY | O_RDWR,
+ MYF(MY_WME))) >= 0);
+ RET_ERR_UNLESS(my_read(fd, buffer, 45, MYF(MY_FNABP | MY_WME)) == 0);
+ RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ i= uint3korr(buffer + 34 );
+ RET_ERR_UNLESS(i == LSN_FILE_NO(last_checkpoint_lsn));
+ i= uint4korr(buffer + 37);
+ RET_ERR_UNLESS(i == LSN_OFFSET(last_checkpoint_lsn));
+ i= uint4korr(buffer + 41);
+ RET_ERR_UNLESS(i == last_logno);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_start_stop(void)
+{
+ /* TEST5: Simulate start/nothing/stop/start/nothing/stop/start */
+
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_2_open_and_2_close(void)
+{
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+
+static int test_bad_magic_string(void)
+{
+ uchar buffer[4];
+ int fd;
+
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+
+ /* Corrupt magic string */
+ RET_ERR_UNLESS((fd= my_open(file_name,
+ O_BINARY | O_RDWR,
+ MYF(MY_WME))) >= 0);
+ RET_ERR_UNLESS(my_pread(fd, buffer, 4, 0, MYF(MY_FNABP | MY_WME)) == 0);
+ RET_ERR_UNLESS(my_pwrite(fd, (const uchar *)"papa", 4, 0,
+ MYF(MY_FNABP | MY_WME)) == 0);
+
+ /* Check that control file module sees the problem */
+ RET_ERR_UNLESS(local_ma_control_file_open() ==
+ CONTROL_FILE_BAD_MAGIC_STRING);
+ /* Restore magic string */
+ RET_ERR_UNLESS(my_pwrite(fd, buffer, 4, 0, MYF(MY_FNABP | MY_WME)) == 0);
+ RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+static int test_bad_checksum(void)
+{
+ uchar buffer[4];
+ int fd;
+
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+
+ /* Corrupt checksum */
+ RET_ERR_UNLESS((fd= my_open(file_name,
+ O_BINARY | O_RDWR,
+ MYF(MY_WME))) >= 0);
+ RET_ERR_UNLESS(my_pread(fd, buffer, 1, 30, MYF(MY_FNABP | MY_WME)) == 0);
+ buffer[0]+= 3; /* mangle checksum */
+ RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 30, MYF(MY_FNABP | MY_WME)) == 0);
+ /* Check that control file module sees the problem */
+ RET_ERR_UNLESS(local_ma_control_file_open() ==
+ CONTROL_FILE_BAD_CHECKSUM);
+ /* Restore checksum */
+ buffer[0]-= 3;
+ RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 30, MYF(MY_FNABP | MY_WME)) == 0);
+ RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+
+ return 0;
+}
+
+
+static int test_bad_blocksize(void)
+{
+ maria_block_size<<= 1;
+ /* Check that control file module sees the problem */
+ RET_ERR_UNLESS(local_ma_control_file_open() ==
+ CONTROL_FILE_WRONG_BLOCKSIZE);
+ /* Restore blocksize */
+ maria_block_size>>= 1;
+
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+ return 0;
+}
+
+
+static int test_future_size(void)
+{
+ /*
+ Here we check ability to add fields only so we can use
+ defined constants
+ */
+ uint32 sum;
+ int fd;
+ uchar buffer[CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE + 2];
+ RET_ERR_UNLESS((fd= my_open(file_name,
+ O_BINARY | O_RDWR,
+ MYF(MY_WME))) >= 0);
+ RET_ERR_UNLESS(my_read(fd, buffer,
+ CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE,
+ MYF(MY_FNABP | MY_WME)) == 0);
+ RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+ /* "add" new field of 1 byte (value 1) to header and variable part */
+ memmove(buffer + CF_CREATE_TIME_TOTAL_SIZE + 1,
+ buffer + CF_CREATE_TIME_TOTAL_SIZE,
+ CF_CHANGEABLE_TOTAL_SIZE);
+ buffer[CF_CREATE_TIME_TOTAL_SIZE - CF_CHECKSUM_SIZE]= '\1';
+ buffer[CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE + 1]= '\1';
+ /* fix lengths */
+ int2store(buffer + CF_CREATE_TIME_SIZE_OFFSET, CF_CREATE_TIME_TOTAL_SIZE + 1);
+ int2store(buffer + CF_CHANGEABLE_SIZE_OFFSET, CF_CHANGEABLE_TOTAL_SIZE + 1);
+ /* recalculate checksums */
+ sum= (uint32) my_checksum(0, buffer, CF_CREATE_TIME_TOTAL_SIZE -
+ CF_CHECKSUM_SIZE + 1);
+ int4store(buffer + CF_CREATE_TIME_TOTAL_SIZE - CF_CHECKSUM_SIZE + 1, sum);
+ sum= (uint32) my_checksum(0, buffer + CF_CREATE_TIME_TOTAL_SIZE + 1 +
+ CF_CHECKSUM_SIZE,
+ CF_CHANGEABLE_TOTAL_SIZE - CF_CHECKSUM_SIZE + 1);
+ int4store(buffer + CF_CREATE_TIME_TOTAL_SIZE + 1, sum);
+ /* write new file and check it */
+ RET_ERR_UNLESS((fd= my_open(file_name,
+ O_BINARY | O_RDWR,
+ MYF(MY_WME))) >= 0);
+ RET_ERR_UNLESS(my_pwrite(fd, buffer,
+ CF_CREATE_TIME_TOTAL_SIZE +
+ CF_CHANGEABLE_TOTAL_SIZE + 2,
+ 0, MYF(MY_FNABP | MY_WME)) == 0);
+ RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+
+ return(0);
+}
+
+static int test_bad_hchecksum(void)
+{
+ uchar buffer[4];
+ int fd;
+
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+
+ /* Corrupt checksum */
+ RET_ERR_UNLESS((fd= my_open(file_name,
+ O_BINARY | O_RDWR,
+ MYF(MY_WME))) >= 0);
+ RET_ERR_UNLESS(my_pread(fd, buffer, 1, 26, MYF(MY_FNABP | MY_WME)) == 0);
+ buffer[0]+= 3; /* mangle checksum */
+ RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 26, MYF(MY_FNABP | MY_WME)) == 0);
+ /* Check that control file module sees the problem */
+ RET_ERR_UNLESS(local_ma_control_file_open() ==
+ CONTROL_FILE_BAD_HEAD_CHECKSUM);
+ /* Restore checksum */
+ buffer[0]-= 3;
+ RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 26, MYF(MY_FNABP | MY_WME)) == 0);
+ RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+
+ return 0;
+}
+
+
+static int test_bad_size(void)
+{
+ uchar buffer[]=
+ "123456789012345678901234567890123456789012345678901234567890123456";
+ int fd, i;
+
+ /* A too short file */
+ RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0);
+ RET_ERR_UNLESS((fd= my_open(file_name,
+ O_BINARY | O_RDWR | O_CREAT,
+ MYF(MY_WME))) >= 0);
+ RET_ERR_UNLESS(my_write(fd, buffer, 10, MYF(MY_FNABP | MY_WME)) == 0);
+ /* Check that control file module sees the problem */
+ RET_ERR_UNLESS(local_ma_control_file_open() ==
+ CONTROL_FILE_TOO_SMALL);
+ for (i= 0; i < 8; i++)
+ {
+ RET_ERR_UNLESS(my_write(fd, buffer, 66, MYF(MY_FNABP | MY_WME)) == 0);
+ }
+ /* Check that control file module sees the problem */
+ RET_ERR_UNLESS(local_ma_control_file_open() ==
+ CONTROL_FILE_TOO_BIG);
+ RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+
+ /* Leave a correct control file */
+ RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0);
+ RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+ RET_ERR_UNLESS(close_file() == 0);
+
+ return 0;
+}
+
+
+static struct my_option my_long_options[] =
+{
+#ifndef DBUG_OFF
+ {"debug", '#', "Debug log.",
+ 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"help", '?', "Display help and exit",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"version", 'V', "Print version number and exit",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+static void version(void)
+{
+ printf("ma_control_file_test: unit test for the control file "
+ "module of the Aria storage engine. Ver 1.0 \n");
+}
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+ char *argument __attribute__((unused)))
+{
+ switch(optid) {
+ case 'V':
+ version();
+ exit(0);
+ case '#':
+ DBUG_PUSH (argument);
+ break;
+ case '?':
+ version();
+ usage();
+ exit(0);
+ }
+ return 0;
+}
+
+
+/* Read options */
+
+static void get_options(int argc, char *argv[])
+{
+ int ho_error;
+
+ if ((ho_error=handle_options(&argc, &argv, my_long_options,
+ get_one_option)))
+ exit(ho_error);
+
+ return;
+} /* get options */
+
+
+static void usage(void)
+{
+ printf("Usage: %s [options]\n\n", my_progname);
+ my_print_help(my_long_options);
+ my_print_variables(my_long_options);
+}
diff --git a/storage/maria/unittest/ma_loghandler_examples.c b/storage/maria/unittest/ma_loghandler_examples.c
new file mode 100644
index 00000000000..0c11a3b9a8e
--- /dev/null
+++ b/storage/maria/unittest/ma_loghandler_examples.c
@@ -0,0 +1,65 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+
+static LOG_DESC INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE=
+{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0,
+ "fixed0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, NULL, NULL, 0,
+"variable0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 7, 7, NULL, NULL, NULL, 1,
+"fixed1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 12, NULL, NULL, NULL, 1,
+"variable1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 23, 23, NULL, NULL, NULL, 2,
+"fixed2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 19, NULL, NULL, NULL, 2,
+"variable2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+
+void translog_example_table_init()
+{
+ int i;
+ log_record_type_descriptor[LOGREC_FIXED_RECORD_0LSN_EXAMPLE]=
+ INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE;
+ log_record_type_descriptor[LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE]=
+ INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE;
+ log_record_type_descriptor[LOGREC_FIXED_RECORD_1LSN_EXAMPLE]=
+ INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE;
+ log_record_type_descriptor[LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE]=
+ INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE;
+ log_record_type_descriptor[LOGREC_FIXED_RECORD_2LSN_EXAMPLE]=
+ INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE;
+ log_record_type_descriptor[LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE]=
+ INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE;
+ for (i= LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE + 1;
+ i < LOGREC_NUMBER_OF_TYPES;
+ i++)
+ log_record_type_descriptor[i].rclass= LOGRECTYPE_NOT_ALLOWED;
+}
+
+
+
diff --git a/storage/maria/unittest/ma_maria_log_cleanup.c b/storage/maria/unittest/ma_maria_log_cleanup.c
new file mode 100644
index 00000000000..f85c75b1a88
--- /dev/null
+++ b/storage/maria/unittest/ma_maria_log_cleanup.c
@@ -0,0 +1,64 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+#include <my_dir.h>
+
+my_bool maria_log_remove()
+{
+ MY_DIR *dirp;
+ uint i;
+ MY_STAT stat_buff;
+ char file_name[FN_REFLEN];
+
+ /* Removes control file */
+ if (fn_format(file_name, CONTROL_FILE_BASE_NAME,
+ maria_data_root, "", MYF(MY_WME)) == NullS)
+ return 1;
+ if (my_stat(file_name, &stat_buff, MYF(0)) &&
+ my_delete(file_name, MYF(MY_WME)) != 0)
+ return 1;
+
+ /* Finds and removes transaction log files */
+ if (!(dirp = my_dir(maria_data_root, MYF(MY_DONT_SORT))))
+ return 1;
+
+ for (i= 0; i < dirp->number_off_files; i++)
+ {
+ char *file= dirp->dir_entry[i].name;
+ if (strncmp(file, "aria_log.", 9) == 0 &&
+ file[9] >= '0' && file[9] <= '9' &&
+ file[10] >= '0' && file[10] <= '9' &&
+ file[11] >= '0' && file[11] <= '9' &&
+ file[12] >= '0' && file[12] <= '9' &&
+ file[13] >= '0' && file[13] <= '9' &&
+ file[14] >= '0' && file[14] <= '9' &&
+ file[15] >= '0' && file[15] <= '9' &&
+ file[16] >= '0' && file[16] <= '9' &&
+ file[17] == '\0')
+ {
+ if (fn_format(file_name, file,
+ maria_data_root, "", MYF(MY_WME)) == NullS ||
+ my_delete(file_name, MYF(MY_WME)) != 0)
+ {
+ my_dirend(dirp);
+ return 1;
+ }
+ }
+ }
+ my_dirend(dirp);
+ return 0;
+}
+
diff --git a/storage/maria/unittest/ma_pagecache_consist.c b/storage/maria/unittest/ma_pagecache_consist.c
new file mode 100644
index 00000000000..7dbdba433c6
--- /dev/null
+++ b/storage/maria/unittest/ma_pagecache_consist.c
@@ -0,0 +1,498 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
+ my_atomic-t.c (see BUG#22320).
+*/
+
+#include <tap.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "test_file.h"
+#include <tap.h>
+
+#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8)
+
+#ifndef DBUG_OFF
+static const char* default_dbug_option;
+#endif
+
+static char *file1_name= (char*)"page_cache_test_file_1";
+static PAGECACHE_FILE file1;
+static pthread_cond_t COND_thread_count;
+static pthread_mutex_t LOCK_thread_count;
+static uint thread_count;
+static PAGECACHE pagecache;
+
+#ifdef TEST_HIGH_CONCURENCY
+static uint number_of_readers= 10;
+static uint number_of_writers= 20;
+static uint number_of_tests= 30000;
+static uint record_length_limit= TEST_PAGE_SIZE/200;
+static uint number_of_pages= 20;
+static uint flush_divider= 1000;
+#else /*TEST_HIGH_CONCURENCY*/
+#ifdef TEST_READERS
+static uint number_of_readers= 10;
+static uint number_of_writers= 1;
+static uint number_of_tests= 30000;
+static uint record_length_limit= TEST_PAGE_SIZE/200;
+static uint number_of_pages= 20;
+static uint flush_divider= 1000;
+#undef SKIP_BIG_TESTS
+#define SKIP_BIG_TESTS(X) /* no-op */
+#else /*TEST_READERS*/
+#ifdef TEST_WRITERS
+static uint number_of_readers= 0;
+static uint number_of_writers= 10;
+static uint number_of_tests= 30000;
+static uint record_length_limit= TEST_PAGE_SIZE/200;
+static uint number_of_pages= 20;
+static uint flush_divider= 1000;
+#undef SKIP_BIG_TESTS
+#define SKIP_BIG_TESTS(X) /* no-op */
+#else /*TEST_WRITERS*/
+static uint number_of_readers= 10;
+static uint number_of_writers= 10;
+static uint number_of_tests= 50000;
+static uint record_length_limit= TEST_PAGE_SIZE/200;
+static uint number_of_pages= 20000;
+static uint flush_divider= 1000;
+#endif /*TEST_WRITERS*/
+#endif /*TEST_READERS*/
+#endif /*TEST_HIGH_CONCURENCY*/
+
+
+/**
+ @brief Dummy pagecache callback.
+*/
+
+static my_bool
+dummy_callback(uchar *page __attribute__((unused)),
+ pgcache_page_no_t page_no __attribute__((unused)),
+ uchar* data_ptr __attribute__((unused)))
+{
+ /* No-op hook: this test needs no page post-processing; 0 means success */
+ return 0;
+}
+
+
+/**
+ @brief Dummy pagecache callback.
+*/
+
+static void
+dummy_fail_callback(uchar* data_ptr __attribute__((unused)))
+{
+ /* Nothing to undo when a write fails in this test */
+ return;
+}
+
+
+/*
+  Produce a pseudo-random length in [0, limit)
+
+  SYNOPSIS
+    get_len()
+    limit      exclusive upper bound for the generated value
+
+  RETURN
+    length where 0 <= length < limit
+*/
+
+static uint get_len(uint limit)
+{
+  ulonglong scaled= (ulonglong) rand() * (limit - 1);
+  return (uint) (scaled / RAND_MAX);
+}
+
+
+/*
+ Check page's consistency: layout is
+ 4 bytes: number 'num' of records in this page, then num occurrences of
+ { 4 bytes: record's length 'len'; then 4 bytes unchecked ('tag') then
+ 'len' bytes each equal to the record's sequential number in this page,
+ modulo 256 }, then zeroes.
+
+ RETURN
+   Offset of the first unused byte of the page.  On any inconsistency the
+   page is dumped to the file "wrong_page", removed from (or flushed out
+   of) the page cache, and the process exits with status 1 (never returns).
+ */
+uint check_page(uchar *buff, ulong offset, int page_locked, int page_no,
+ int tag)
+{
+ uint end= sizeof(uint);
+ uint num= uint4korr(buff);
+ uint i;
+ DBUG_ENTER("check_page");
+
+ /* Walk the records: 4-byte length + 4-byte tag + 'len' payload bytes */
+ for (i= 0; i < num; i++)
+ {
+ uint len= uint4korr(buff + end);
+ uint j;
+ end+= 4 + 4;
+ if (len + end > TEST_PAGE_SIZE)
+ {
+ diag("incorrect field header #%u by offset %lu\n", i, offset + end);
+ goto err;
+ }
+ for(j= 0; j < len; j++)
+ {
+ if (buff[end + j] != (uchar)((i+1) % 256))
+ {
+ diag("incorrect %lu byte\n", offset + end + j);
+ goto err;
+ }
+ }
+ end+= len;
+ }
+ /* Everything after the last record must still be zero */
+ for(i= end; i < TEST_PAGE_SIZE; i++)
+ {
+ if (buff[i] != 0)
+ {
+ int h;
+ DBUG_PRINT("err",
+ ("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n",
+ offset + i, offset, i, page_no,
+ (page_locked ? "locked" : "unlocked"),
+ end, num, tag));
+ diag("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n",
+ offset + i, offset, i, page_no,
+ (page_locked ? "locked" : "unlocked"),
+ end, num, tag);
+ /* Dump the offending page image for post-mortem inspection */
+ h= my_open("wrong_page", O_CREAT | O_TRUNC | O_RDWR, MYF(0));
+ my_pwrite(h, (uchar*) buff, TEST_PAGE_SIZE, 0, MYF(0));
+ my_close(h, MYF(0));
+ goto err;
+ }
+ }
+ DBUG_RETURN(end);
+err:
+ DBUG_PRINT("err", ("try to flush"));
+ if (page_locked)
+ {
+ pagecache_delete(&pagecache, &file1, page_no,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, 1);
+ }
+ else
+ {
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE);
+ }
+ exit(1);
+}
+
+/*
+  Append one record to the page image in 'buff' (layout as in check_page()).
+
+  SYNOPSIS
+    put_rec()
+    buff       page buffer; its first 4 bytes hold the record count
+    end        offset of the first free byte in the page
+    len        requested record length; 0 is promoted to 1
+    tag        stored in the 4 bytes after the length, ignored by check_page()
+
+  NOTE
+    Silently does nothing when the record would not fit in the page.
+*/
+void put_rec(uchar *buff, uint end, uint len, uint tag)
+{
+ uint i;
+ uint num;
+ num= uint4korr(buff);
+ if (!len)
+ len= 1;
+ if (end + 4*2 + len > TEST_PAGE_SIZE)
+ return;
+ int4store(buff + end, len);
+ end+= 4;
+ int4store(buff + end, tag);
+ end+= 4;
+ num++;
+ int4store(buff, num);
+ /* Payload bytes all carry the record's sequential number modulo 256 */
+ for (i= end; i < (len + end); i++)
+ {
+ buff[i]= (uchar) num % 256;
+ }
+}
+
+/*
+  Recreate and reopen a file for test
+
+  SYNOPSIS
+    reset_file()
+    file       File to reset (must refer to the global file1)
+    file_name  Path (and name) of file which should be reset
+
+  NOTE
+    The function flushes and closes the global file1, so the descriptor of
+    the re-created file is stored back into file1.file.  The original code
+    stored it into the by-value parameter 'file', which silently discarded
+    the new descriptor as soon as the function returned.
+*/
+
+void reset_file(PAGECACHE_FILE file, char *file_name)
+{
+  (void) file;                        /* kept only for API compatibility */
+  flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE);
+  if (my_close(file1.file, MYF(0)) != 0)
+  {
+    diag("Got error during %s closing from close() (errno: %d)\n",
+         file_name, errno);
+    exit(1);
+  }
+  my_delete(file_name, MYF(0));
+  /* Store the new descriptor where the rest of the test reads it */
+  if ((file1.file= my_open(file_name,
+                           O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+  {
+    diag("Got error during %s creation from open() (errno: %d)\n",
+         file_name, errno);
+    exit(1);
+  }
+}
+
+
+/*
+  Reader thread body: repeatedly read a random page with no lock and check
+  its consistency.
+
+  SYNOPSIS
+    reader()
+    num        thread number; passed negated as the tag so check_page()
+               diagnostics distinguish readers from writers
+*/
+
+void reader(int num)
+{
+  unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+  uint i;
+
+  if (buffr == NULL)                  /* was dereferenced unchecked before */
+  {
+    diag("Out of memory in reader %d", num);
+    exit(1);
+  }
+  for (i= 0; i < number_of_tests; i++)
+  {
+    uint page= get_len(number_of_pages);
+    pagecache_read(&pagecache, &file1, page, 3, buffr,
+                   PAGECACHE_PLAIN_PAGE,
+                   PAGECACHE_LOCK_LEFT_UNLOCKED,
+                   0);
+    check_page(buffr, page * TEST_PAGE_SIZE, 0, page, -num);
+  }
+  free(buffr);
+}
+
+
+/*
+  Writer thread body: read a random page under a write lock, verify it,
+  append one record, write it back, and periodically force a flush.
+
+  SYNOPSIS
+    writer()
+    num        thread number, used as record tag and in error reports
+*/
+
+void writer(int num)
+{
+  unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+  uint i;
+
+  if (buffr == NULL)                  /* was dereferenced unchecked before */
+  {
+    diag("Out of memory in writer %d", num);
+    exit(1);
+  }
+  for (i= 0; i < number_of_tests; i++)
+  {
+    uint end;
+    uint page= get_len(number_of_pages);
+    pagecache_read(&pagecache, &file1, page, 3, buffr,
+                   PAGECACHE_PLAIN_PAGE,
+                   PAGECACHE_LOCK_WRITE,
+                   0);
+    end= check_page(buffr, page * TEST_PAGE_SIZE, 1, page, num);
+    put_rec(buffr, end, get_len(record_length_limit), num);
+    pagecache_write(&pagecache, &file1, page, 3, buffr,
+                    PAGECACHE_PLAIN_PAGE,
+                    PAGECACHE_LOCK_WRITE_UNLOCK,
+                    PAGECACHE_UNPIN,
+                    PAGECACHE_WRITE_DELAY,
+                    0, LSN_IMPOSSIBLE);
+
+    /* Occasionally force everything to disk to exercise flushing */
+    if (i % flush_divider == 0)
+      flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+  }
+  free(buffr);
+}
+
+
+/* pthread start routine: run reader(), then report completion to main */
+static void *test_thread_reader(void *arg)
+{
+ int param=*((int*) arg);
+ my_thread_init();
+ {
+ DBUG_ENTER("test_reader");
+ DBUG_PRINT("enter", ("param: %d", param));
+
+ reader(param);
+
+ DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(1, "reader%d: done", param);
+ thread_count--;
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ }
+ return 0;
+}
+
+
+/* pthread start routine: run writer(), then report completion to main */
+static void *test_thread_writer(void *arg)
+{
+ int param=*((int*) arg);
+ my_thread_init();
+ {
+ DBUG_ENTER("test_writer");
+ DBUG_PRINT("enter", ("param: %d", param));
+
+ writer(param);
+
+ DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(1, "writer%d: done", param);
+ thread_count--;
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ }
+ return 0;
+}
+
+
+/*
+  Test driver: create the test file, zero-fill number_of_pages pages,
+  spawn detached reader/writer threads, wait for them all to signal
+  completion on COND_thread_count, then tear everything down.
+*/
+int main(int argc __attribute__((unused)),
+         char **argv __attribute__((unused)))
+{
+  pthread_t tid;
+  pthread_attr_t thr_attr;
+  int *param, error, pagen;
+
+  MY_INIT(argv[0]);
+
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace";
+#else
+  default_dbug_option= "d:t:i:o,/tmp/test_pagecache_consist.trace";
+#endif
+  if (argc > 1)
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  {
+  DBUG_ENTER("main");
+  DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name()));
+  plan(number_of_writers + number_of_readers);
+  SKIP_BIG_TESTS(number_of_writers + number_of_readers)
+  {
+    /* Create and initialize the shared test file */
+    if ((file1.file= my_open(file1_name,
+                             O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+    {
+      diag( "Got error during file1 creation from open() (errno: %d)\n",
+            errno);
+      exit(1);
+    }
+    pagecache_file_init(file1, &dummy_callback, &dummy_callback,
+                        &dummy_fail_callback, &dummy_callback, NULL);
+    DBUG_PRINT("info", ("file1: %d", file1.file));
+    if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
+      exit(1);
+    my_pwrite(file1.file, (const uchar *)"test file", 9, 0, MYF(0));
+
+    if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+    {
+      diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n",
+            error, errno);
+      exit(1);
+    }
+    if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+    {
+      diag( "LOCK_thread_count: %d from pthread_cond_init (errno: %d)\n",
+            error, errno);
+      exit(1);
+    }
+
+    if ((error= pthread_attr_init(&thr_attr)))
+    {
+      diag("Got error: %d from pthread_attr_init (errno: %d)\n",
+           error,errno);
+      exit(1);
+    }
+    if ((error= pthread_attr_setdetachstate(&thr_attr,
+                                            PTHREAD_CREATE_DETACHED)))
+    {
+      diag(
+           "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+           error,errno);
+      exit(1);
+    }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+    VOID(thr_setconcurrency(2));
+#endif
+
+    if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                               TEST_PAGE_SIZE, 0)) == 0)
+    {
+      diag("Got error: init_pagecache() (errno: %d)\n",
+           errno);
+      exit(1);
+    }
+    DBUG_PRINT("info", ("Page cache %d pages", pagen));
+    {
+      /* Zero-fill every test page so readers start from consistent data */
+      unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+      uint i;
+      if (buffr == NULL)              /* was dereferenced unchecked before */
+      {
+        diag("Got error: out of memory\n");
+        exit(1);
+      }
+      memset(buffr, '\0', TEST_PAGE_SIZE);
+      for (i= 0; i < number_of_pages; i++)
+      {
+        pagecache_write(&pagecache, &file1, i, 3, buffr,
+                        PAGECACHE_PLAIN_PAGE,
+                        PAGECACHE_LOCK_LEFT_UNLOCKED,
+                        PAGECACHE_PIN_LEFT_UNPINNED,
+                        PAGECACHE_WRITE_DELAY,
+                        0, LSN_IMPOSSIBLE);
+      }
+      flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+      free(buffr);
+    }
+    pthread_mutex_lock(&LOCK_thread_count);
+    /* Spawn detached readers and writers, interleaved */
+    while (number_of_readers != 0 || number_of_writers != 0)
+    {
+      if (number_of_readers != 0)
+      {
+        param=(int*) malloc(sizeof(int));
+        if (param == NULL)            /* was dereferenced unchecked before */
+        {
+          diag("Got error: out of memory\n");
+          exit(1);
+        }
+        *param= number_of_readers;
+        if ((error= pthread_create(&tid, &thr_attr, test_thread_reader,
+                                   (void*) param)))
+        {
+          diag("Got error: %d from pthread_create (errno: %d)\n",
+               error,errno);
+          exit(1);
+        }
+        thread_count++;
+        number_of_readers--;
+      }
+      if (number_of_writers != 0)
+      {
+        param=(int*) malloc(sizeof(int));
+        if (param == NULL)            /* was dereferenced unchecked before */
+        {
+          diag("Got error: out of memory\n");
+          exit(1);
+        }
+        *param= number_of_writers;
+        if ((error= pthread_create(&tid, &thr_attr, test_thread_writer,
+                                   (void*) param)))
+        {
+          diag("Got error: %d from pthread_create (errno: %d)\n",
+               error,errno);
+          exit(1);
+        }
+        thread_count++;
+        number_of_writers--;
+      }
+    }
+    DBUG_PRINT("info", ("Thread started"));
+    pthread_mutex_unlock(&LOCK_thread_count);
+
+    pthread_attr_destroy(&thr_attr);
+
+    /* wait finishing: each thread decrements thread_count and signals */
+    pthread_mutex_lock(&LOCK_thread_count);
+    while (thread_count)
+    {
+      if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count)))
+        diag("COND_thread_count: %d from pthread_cond_wait\n",error);
+    }
+    pthread_mutex_unlock(&LOCK_thread_count);
+    DBUG_PRINT("info", ("thread ended"));
+
+    end_pagecache(&pagecache, 1);
+    DBUG_PRINT("info", ("Page cache ended"));
+
+    if (my_close(file1.file, MYF(0)) != 0)
+    {
+      diag( "Got error during file1 closing from close() (errno: %d)\n",
+            errno);
+      exit(1);
+    }
+    my_delete(file1_name, MYF(0));
+
+    DBUG_PRINT("info", ("file1 (%d) closed", file1.file));
+    DBUG_PRINT("info", ("Program end"));
+
+  } /* SKIP_BIG_TESTS */
+  my_end(0);
+
+  return exit_status();
+  }
+}
diff --git a/storage/maria/unittest/ma_pagecache_rwconsist.c b/storage/maria/unittest/ma_pagecache_rwconsist.c
new file mode 100644
index 00000000000..a1a22b5e18d
--- /dev/null
+++ b/storage/maria/unittest/ma_pagecache_rwconsist.c
@@ -0,0 +1,362 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
+ my_atomic-t.c (see BUG#22320).
+*/
+
+#include <tap.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "test_file.h"
+#include <tap.h>
+
+#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8)
+
+#ifndef DBUG_OFF
+static const char* default_dbug_option;
+#endif
+
+
+#define SLEEP my_sleep(5)
+
+static char *file1_name= (char*)"page_cache_test_file_1";
+static PAGECACHE_FILE file1;
+static pthread_cond_t COND_thread_count;
+static pthread_mutex_t LOCK_thread_count;
+static uint thread_count= 0;
+static PAGECACHE pagecache;
+
+static uint number_of_readers= 5;
+static uint number_of_writers= 5;
+static uint number_of_read_tests= 2000;
+static uint number_of_write_tests= 1000;
+static uint read_sleep_limit= 3;
+static uint report_divisor= 50;
+
+/**
+ @brief Dummy pagecache callback.
+*/
+
+static my_bool
+dummy_callback(uchar *page __attribute__((unused)),
+ pgcache_page_no_t page_no __attribute__((unused)),
+ uchar* data_ptr __attribute__((unused)))
+{
+ /* No-op hook: this test needs no page post-processing; 0 means success */
+ return 0;
+}
+
+
+/**
+ @brief Dummy pagecache callback.
+*/
+
+static void
+dummy_fail_callback(uchar* data_ptr __attribute__((unused)))
+{
+ /* Nothing to undo when a write fails in this test */
+ return;
+}
+
+
+/**
+ @brief Checks page consistency
+
+ A consistent page has every byte equal to its first byte (writers fill
+ the whole page with a single value).  On mismatch the offending bytes
+ are reported and the process exits with status 1.
+
+ @param buff pointer to the page content
+ @param task task ID (used only in the error report)
+*/
+void check_page(uchar *buff, int task)
+{
+ uint i;
+ DBUG_ENTER("check_page");
+
+ for (i= 1; i < TEST_PAGE_SIZE; i++)
+ {
+ if (buff[0] != buff[i])
+ goto err;
+ }
+ DBUG_VOID_RETURN;
+err:
+ diag("Task %d char #%u '%u' != '%u'", task, i, (uint) buff[0],
+ (uint) buff[i]);
+ DBUG_PRINT("err", ("try to flush"));
+ exit(1);
+}
+
+
+
+/*
+  Reader thread body: repeatedly read page 0 under a read lock directly from
+  the cache buffer, verify that all its bytes are identical, then sleep for
+  a random short while.
+*/
+void reader(int num)
+{
+ unsigned char *buff;
+ uint i;
+ PAGECACHE_BLOCK_LINK *link;
+
+ for (i= 0; i < number_of_read_tests; i++)
+ {
+ if (i % report_divisor == 0)
+ diag("Reader %d - %u", num, i);
+ /* NULL buffer: read returns a pointer into the locked cache block */
+ buff= pagecache_read(&pagecache, &file1, 0, 3, NULL,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_READ,
+ &link);
+ check_page(buff, num);
+ pagecache_unlock_by_link(&pagecache, link,
+ PAGECACHE_LOCK_READ_UNLOCK,
+ PAGECACHE_UNPIN, 0, 0, 0, FALSE);
+ {
+ int lim= rand() % read_sleep_limit;
+ int j;
+ for (j= 0; j < lim; j++)
+ SLEEP;
+ }
+ }
+}
+
+
+/*
+  Writer thread body: under a write lock, fill page 0 with a random byte in
+  two halves with a sleep in between; the page must look consistent before
+  and after the rewrite since the write lock is held throughout.
+*/
+void writer(int num)
+{
+ uint i;
+ uchar *buff;
+ PAGECACHE_BLOCK_LINK *link;
+
+ for (i= 0; i < number_of_write_tests; i++)
+ {
+ uchar c= (uchar) rand() % 256;
+
+ if (i % report_divisor == 0)
+ diag("Writer %d - %u", num, i);
+ /* NULL buffer: read returns a pointer into the locked cache block */
+ buff= pagecache_read(&pagecache, &file1, 0, 3, NULL,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ &link);
+
+ check_page(buff, num);
+ bfill(buff, TEST_PAGE_SIZE / 2, c);
+ SLEEP;
+ bfill(buff + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE / 2, c);
+ check_page(buff, num);
+ pagecache_unlock_by_link(&pagecache, link,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN, 0, 0, 1, FALSE);
+ SLEEP;
+ }
+}
+
+
+/* pthread start routine: run reader(), then report completion to main */
+static void *test_thread_reader(void *arg)
+{
+ int param=*((int*) arg);
+ my_thread_init();
+ {
+ DBUG_ENTER("test_reader");
+
+ DBUG_PRINT("enter", ("param: %d", param));
+
+ reader(param);
+
+ DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(1, "reader%d: done", param);
+ thread_count--;
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ }
+ return 0;
+}
+
+
+/* pthread start routine: run writer(), then report completion to main */
+static void *test_thread_writer(void *arg)
+{
+ int param=*((int*) arg);
+ my_thread_init();
+ {
+ DBUG_ENTER("test_writer");
+
+ writer(param);
+
+ DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(1, "writer%d: done", param);
+ thread_count--;
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ }
+ return 0;
+}
+
+
+/*
+  Test driver: create the test file, zero-fill page 0, spawn detached
+  reader/writer threads, wait for them all to signal completion on
+  COND_thread_count, then tear everything down.
+*/
+int main(int argc __attribute__((unused)),
+         char **argv __attribute__((unused)))
+{
+  pthread_t tid;
+  pthread_attr_t thr_attr;
+  int *param, error, pagen;
+
+  MY_INIT(argv[0]);
+
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace";
+#else
+  default_dbug_option= "d:t:i:O,/tmp/test_pagecache_consist.trace";
+#endif
+  if (argc > 1)
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  {
+  DBUG_ENTER("main");
+  DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name()));
+  plan(number_of_writers + number_of_readers);
+  SKIP_BIG_TESTS(number_of_writers + number_of_readers)
+  {
+    /* Create and initialize the shared test file */
+    if ((file1.file= my_open(file1_name,
+                             O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+    {
+      diag( "Got error during file1 creation from open() (errno: %d)\n",
+            errno);
+      exit(1);
+    }
+    pagecache_file_init(file1, &dummy_callback, &dummy_callback,
+                        &dummy_fail_callback, &dummy_callback, NULL);
+    DBUG_PRINT("info", ("file1: %d", file1.file));
+    if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
+      exit(1);
+    my_pwrite(file1.file, (const uchar*) "test file", 9, 0, MYF(0));
+
+    if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+    {
+      diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n",
+            error, errno);
+      exit(1);
+    }
+    if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+    {
+      diag( "LOCK_thread_count: %d from pthread_cond_init (errno: %d)\n",
+            error, errno);
+      exit(1);
+    }
+
+    if ((error= pthread_attr_init(&thr_attr)))
+    {
+      diag("Got error: %d from pthread_attr_init (errno: %d)\n",
+           error,errno);
+      exit(1);
+    }
+    if ((error= pthread_attr_setdetachstate(&thr_attr,
+                                            PTHREAD_CREATE_DETACHED)))
+    {
+      diag(
+           "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+           error,errno);
+      exit(1);
+    }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+    VOID(thr_setconcurrency(2));
+#endif
+
+    if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                               TEST_PAGE_SIZE, 0)) == 0)
+    {
+      diag("Got error: init_pagecache() (errno: %d)\n",
+           errno);
+      exit(1);
+    }
+    DBUG_PRINT("info", ("Page cache %d pages", pagen));
+    {
+      /* Zero-fill page 0 so readers start from a consistent page */
+      unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+      if (buffr == NULL)              /* was dereferenced unchecked before */
+      {
+        diag("Got error: out of memory\n");
+        exit(1);
+      }
+      memset(buffr, '\0', TEST_PAGE_SIZE);
+      pagecache_write(&pagecache, &file1, 0, 3, buffr,
+                      PAGECACHE_PLAIN_PAGE,
+                      PAGECACHE_LOCK_LEFT_UNLOCKED,
+                      PAGECACHE_PIN_LEFT_UNPINNED,
+                      PAGECACHE_WRITE_DELAY,
+                      0, LSN_IMPOSSIBLE);
+      free(buffr);                    /* the cache copied it; was leaked */
+    }
+    pthread_mutex_lock(&LOCK_thread_count);
+
+    /* Spawn detached readers and writers; each gets a distinct id > 0 */
+    while (number_of_readers != 0 || number_of_writers != 0)
+    {
+      if (number_of_readers != 0)
+      {
+        param=(int*) malloc(sizeof(int));
+        if (param == NULL)            /* was dereferenced unchecked before */
+        {
+          diag("Got error: out of memory\n");
+          exit(1);
+        }
+        *param= number_of_readers + number_of_writers;
+        if ((error= pthread_create(&tid, &thr_attr, test_thread_reader,
+                                   (void*) param)))
+        {
+          diag("Got error: %d from pthread_create (errno: %d)\n",
+               error,errno);
+          exit(1);
+        }
+        thread_count++;
+        number_of_readers--;
+      }
+      if (number_of_writers != 0)
+      {
+        param=(int*) malloc(sizeof(int));
+        if (param == NULL)            /* was dereferenced unchecked before */
+        {
+          diag("Got error: out of memory\n");
+          exit(1);
+        }
+        *param= number_of_writers + number_of_readers;
+        if ((error= pthread_create(&tid, &thr_attr, test_thread_writer,
+                                   (void*) param)))
+        {
+          diag("Got error: %d from pthread_create (errno: %d)\n",
+               error,errno);
+          exit(1);
+        }
+        thread_count++;
+        number_of_writers--;
+      }
+    }
+    DBUG_PRINT("info", ("Thread started"));
+    pthread_mutex_unlock(&LOCK_thread_count);
+
+    pthread_attr_destroy(&thr_attr);
+
+    /* wait finishing: each thread decrements thread_count and signals */
+    pthread_mutex_lock(&LOCK_thread_count);
+    while (thread_count)
+    {
+      if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count)))
+        diag("COND_thread_count: %d from pthread_cond_wait\n", error);
+    }
+    pthread_mutex_unlock(&LOCK_thread_count);
+    DBUG_PRINT("info", ("thread ended"));
+
+    end_pagecache(&pagecache, 1);
+    DBUG_PRINT("info", ("Page cache ended"));
+
+    if (my_close(file1.file, MYF(0)) != 0)
+    {
+      diag( "Got error during file1 closing from close() (errno: %d)\n",
+            errno);
+      exit(1);
+    }
+    my_delete(file1_name, MYF(0));
+
+    DBUG_PRINT("info", ("file1 (%d) closed", file1.file));
+    DBUG_PRINT("info", ("Program end"));
+  } /* SKIP_BIG_TESTS */
+  my_end(0);
+
+  return exit_status();
+  }
+}
diff --git a/storage/maria/unittest/ma_pagecache_rwconsist2.c b/storage/maria/unittest/ma_pagecache_rwconsist2.c
new file mode 100644
index 00000000000..34183a2d0ab
--- /dev/null
+++ b/storage/maria/unittest/ma_pagecache_rwconsist2.c
@@ -0,0 +1,358 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
+/**
+  @file This unit tests the consistency of long block writes performed under
+  a write lock while the same block is simultaneously read by read requests
+  that do not require a read lock.
+*/
+
+/*
+ TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
+ my_atomic-t.c (see BUG#22320).
+*/
+
+#include <tap.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "test_file.h"
+#include <tap.h>
+
+#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8)
+
+#ifndef DBUG_OFF
+static const char* default_dbug_option;
+#endif
+
+
+#define SLEEP my_sleep(5)
+
+static char *file1_name= (char*)"page_cache_test_file_1";
+static PAGECACHE_FILE file1;
+static pthread_cond_t COND_thread_count;
+static pthread_mutex_t LOCK_thread_count;
+static uint thread_count= 0;
+static PAGECACHE pagecache;
+
+static uint number_of_readers= 5;
+static uint number_of_writers= 5;
+static uint number_of_read_tests= 20000;
+static uint number_of_write_tests= 1000;
+static uint report_divisor= 50;
+
+/**
+ @brief Dummy pagecache callback.
+*/
+
+static my_bool
+dummy_callback(uchar *page __attribute__((unused)),
+ pgcache_page_no_t page_no __attribute__((unused)),
+ uchar* data_ptr __attribute__((unused)))
+{
+ /* No-op hook: this test needs no page post-processing; 0 means success */
+ return 0;
+}
+
+
+/**
+ @brief Dummy pagecache callback.
+*/
+
+static void
+dummy_fail_callback(uchar* data_ptr __attribute__((unused)))
+{
+ /* Nothing to undo when a write fails in this test */
+ return;
+}
+
+
+/**
+ @brief Checks page consistency
+
+ A consistent page has every byte equal to its first byte (writers fill
+ the whole page with a single value).  On mismatch the offending bytes
+ are reported and the process exits with status 1.
+
+ @param buff pointer to the page content
+ @param task task ID (used only in the error report)
+*/
+void check_page(uchar *buff, int task)
+{
+ uint i;
+ DBUG_ENTER("check_page");
+
+ for (i= 1; i < TEST_PAGE_SIZE; i++)
+ {
+ if (buff[0] != buff[i])
+ goto err;
+ }
+ DBUG_VOID_RETURN;
+err:
+ diag("Task %d char #%u '%u' != '%u'", task, i, (uint) buff[0],
+ (uint) buff[i]);
+ DBUG_PRINT("err", ("try to flush"));
+ exit(1);
+}
+
+
+
+/*
+  Reader thread body: repeatedly copy page 0 into a local buffer WITHOUT
+  taking a read lock, then verify that all its bytes are identical — this
+  exercises lock-free reads racing with locked writers.
+*/
+void reader(int num)
+{
+ unsigned char buff[TEST_PAGE_SIZE];
+ uint i;
+
+ for (i= 0; i < number_of_read_tests; i++)
+ {
+ if (i % report_divisor == 0)
+ diag("Reader %d - %u", num, i);
+ pagecache_read(&pagecache, &file1, 0, 3, buff,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ NULL);
+ check_page(buff, num);
+ }
+}
+
+
+/*
+  Writer thread body: under a write lock, fill page 0 with a random byte in
+  two halves with a sleep in between; the page must look consistent before
+  and after the rewrite since the write lock is held throughout.
+*/
+void writer(int num)
+{
+ uint i;
+ uchar *buff;
+ PAGECACHE_BLOCK_LINK *link;
+
+ for (i= 0; i < number_of_write_tests; i++)
+ {
+ uchar c= (uchar) rand() % 256;
+
+ if (i % report_divisor == 0)
+ diag("Writer %d - %u", num, i);
+ /* NULL buffer: read returns a pointer into the locked cache block */
+ buff= pagecache_read(&pagecache, &file1, 0, 3, NULL,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ &link);
+
+ check_page(buff, num);
+ bfill(buff, TEST_PAGE_SIZE / 2, c);
+ SLEEP;
+ bfill(buff + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE / 2, c);
+ check_page(buff, num);
+ pagecache_unlock_by_link(&pagecache, link,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN, 0, 0, 1, FALSE);
+ SLEEP;
+ }
+}
+
+
+/* pthread start routine: run reader(), then report completion to main */
+static void *test_thread_reader(void *arg)
+{
+ int param=*((int*) arg);
+ my_thread_init();
+ {
+ DBUG_ENTER("test_reader");
+
+ DBUG_PRINT("enter", ("param: %d", param));
+
+ reader(param);
+
+ DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(1, "reader%d: done", param);
+ thread_count--;
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ }
+ return 0;
+}
+
+
+/* pthread start routine: run writer(), then report completion to main */
+static void *test_thread_writer(void *arg)
+{
+ int param=*((int*) arg);
+ my_thread_init();
+ {
+ DBUG_ENTER("test_writer");
+
+ writer(param);
+
+ DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(1, "writer%d: done", param);
+ thread_count--;
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ }
+ return 0;
+}
+
+
+/*
+  Test driver: create the test file, zero-fill page 0, spawn detached
+  reader/writer threads, wait for them all to signal completion on
+  COND_thread_count, then tear everything down.
+*/
+int main(int argc __attribute__((unused)),
+         char **argv __attribute__((unused)))
+{
+  pthread_t tid;
+  pthread_attr_t thr_attr;
+  int *param, error, pagen;
+
+  MY_INIT(argv[0]);
+
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace";
+#else
+  default_dbug_option= "d:t:i:O,/tmp/test_pagecache_consist.trace";
+#endif
+  if (argc > 1)
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  {
+  DBUG_ENTER("main");
+  DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name()));
+  plan(number_of_writers + number_of_readers);
+  SKIP_BIG_TESTS(number_of_writers + number_of_readers)
+  {
+    /* Create and initialize the shared test file */
+    if ((file1.file= my_open(file1_name,
+                             O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+    {
+      diag( "Got error during file1 creation from open() (errno: %d)\n",
+            errno);
+      exit(1);
+    }
+    pagecache_file_init(file1, &dummy_callback, &dummy_callback,
+                        &dummy_fail_callback, &dummy_callback, NULL);
+    DBUG_PRINT("info", ("file1: %d", file1.file));
+    if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
+      exit(1);
+    my_pwrite(file1.file, (const uchar*) "test file", 9, 0, MYF(0));
+
+    if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+    {
+      diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n",
+            error, errno);
+      exit(1);
+    }
+    if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+    {
+      diag( "LOCK_thread_count: %d from pthread_cond_init (errno: %d)\n",
+            error, errno);
+      exit(1);
+    }
+
+    if ((error= pthread_attr_init(&thr_attr)))
+    {
+      diag("Got error: %d from pthread_attr_init (errno: %d)\n",
+           error,errno);
+      exit(1);
+    }
+    if ((error= pthread_attr_setdetachstate(&thr_attr,
+                                            PTHREAD_CREATE_DETACHED)))
+    {
+      diag(
+           "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+           error,errno);
+      exit(1);
+    }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+    VOID(thr_setconcurrency(2));
+#endif
+
+    if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                               TEST_PAGE_SIZE, 0)) == 0)
+    {
+      diag("Got error: init_pagecache() (errno: %d)\n",
+           errno);
+      exit(1);
+    }
+    DBUG_PRINT("info", ("Page cache %d pages", pagen));
+    {
+      /* Zero-fill page 0 so readers start from a consistent page */
+      unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+      if (buffr == NULL)              /* was dereferenced unchecked before */
+      {
+        diag("Got error: out of memory\n");
+        exit(1);
+      }
+      memset(buffr, '\0', TEST_PAGE_SIZE);
+      pagecache_write(&pagecache, &file1, 0, 3, buffr,
+                      PAGECACHE_PLAIN_PAGE,
+                      PAGECACHE_LOCK_LEFT_UNLOCKED,
+                      PAGECACHE_PIN_LEFT_UNPINNED,
+                      PAGECACHE_WRITE_DELAY,
+                      0, LSN_IMPOSSIBLE);
+      free(buffr);                    /* the cache copied it; was leaked */
+    }
+    pthread_mutex_lock(&LOCK_thread_count);
+
+    /* Spawn detached readers and writers; each gets a distinct id > 0 */
+    while (number_of_readers != 0 || number_of_writers != 0)
+    {
+      if (number_of_readers != 0)
+      {
+        param=(int*) malloc(sizeof(int));
+        if (param == NULL)            /* was dereferenced unchecked before */
+        {
+          diag("Got error: out of memory\n");
+          exit(1);
+        }
+        *param= number_of_readers + number_of_writers;
+        if ((error= pthread_create(&tid, &thr_attr, test_thread_reader,
+                                   (void*) param)))
+        {
+          diag("Got error: %d from pthread_create (errno: %d)\n",
+               error,errno);
+          exit(1);
+        }
+        thread_count++;
+        number_of_readers--;
+      }
+      if (number_of_writers != 0)
+      {
+        param=(int*) malloc(sizeof(int));
+        if (param == NULL)            /* was dereferenced unchecked before */
+        {
+          diag("Got error: out of memory\n");
+          exit(1);
+        }
+        *param= number_of_writers + number_of_readers;
+        if ((error= pthread_create(&tid, &thr_attr, test_thread_writer,
+                                   (void*) param)))
+        {
+          diag("Got error: %d from pthread_create (errno: %d)\n",
+               error,errno);
+          exit(1);
+        }
+        thread_count++;
+        number_of_writers--;
+      }
+    }
+    DBUG_PRINT("info", ("Thread started"));
+    pthread_mutex_unlock(&LOCK_thread_count);
+
+    pthread_attr_destroy(&thr_attr);
+
+    /* wait finishing: each thread decrements thread_count and signals */
+    pthread_mutex_lock(&LOCK_thread_count);
+    while (thread_count)
+    {
+      if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count)))
+        diag("COND_thread_count: %d from pthread_cond_wait\n", error);
+    }
+    pthread_mutex_unlock(&LOCK_thread_count);
+    DBUG_PRINT("info", ("thread ended"));
+
+    end_pagecache(&pagecache, 1);
+    DBUG_PRINT("info", ("Page cache ended"));
+
+    if (my_close(file1.file, MYF(0)) != 0)
+    {
+      diag( "Got error during file1 closing from close() (errno: %d)\n",
+            errno);
+      exit(1);
+    }
+    my_delete(file1_name, MYF(0));
+
+    DBUG_PRINT("info", ("file1 (%d) closed", file1.file));
+    DBUG_PRINT("info", ("Program end"));
+  } /* SKIP_BIG_TESTS */
+  my_end(0);
+
+  return exit_status();
+  }
+}
diff --git a/storage/maria/unittest/ma_pagecache_single.c b/storage/maria/unittest/ma_pagecache_single.c
new file mode 100644
index 00000000000..32e588e165a
--- /dev/null
+++ b/storage/maria/unittest/ma_pagecache_single.c
@@ -0,0 +1,853 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
+ my_atomic-t.c (see BUG#22320).
+ Use diag() instead of fprintf(stderr).
+*/
+#include <tap.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "test_file.h"
+#include <tap.h>
+
+#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*10)
+
+#ifndef DBUG_OFF
+static const char* default_dbug_option;
+#endif
+
+#ifndef BIG
+#undef SKIP_BIG_TESTS
+#define SKIP_BIG_TESTS(X) /* no-op */
+#endif
+
+static char *file1_name= (char*)"page_cache_test_file_1";
+static char *file2_name= (char*)"page_cache_test_file_2";
+static PAGECACHE_FILE file1;
+static pthread_cond_t COND_thread_count;
+static pthread_mutex_t LOCK_thread_count;
+static uint thread_count;
+static PAGECACHE pagecache;
+
+/*
+ File content descriptors used to verify the on-disk image after each test
+*/
+static struct file_desc simple_read_write_test_file[]=
+{
+ { TEST_PAGE_SIZE, '\1'},
+ {0, 0}
+};
+static struct file_desc simple_read_change_write_read_test_file[]=
+{
+ { TEST_PAGE_SIZE/2, '\65'},
+ { TEST_PAGE_SIZE/2, '\1'},
+ {0, 0}
+};
+static struct file_desc simple_pin_test_file1[]=
+{
+ { TEST_PAGE_SIZE*2, '\1'},
+ {0, 0}
+};
+static struct file_desc simple_pin_test_file2[]=
+{
+ { TEST_PAGE_SIZE/2, '\1'},
+ { TEST_PAGE_SIZE/2, (unsigned char)129},
+ { TEST_PAGE_SIZE, '\1'},
+ {0, 0}
+};
+static struct file_desc simple_pin_no_lock_test_file1[]=
+{
+ { TEST_PAGE_SIZE, '\4'},
+ {0, 0}
+};
+static struct file_desc simple_pin_no_lock_test_file2[]=
+{
+ { TEST_PAGE_SIZE, '\5'},
+ {0, 0}
+};
+static struct file_desc simple_pin_no_lock_test_file3[]=
+{
+ { TEST_PAGE_SIZE, '\6'},
+ {0, 0}
+};
+static struct file_desc simple_delete_forget_test_file[]=
+{
+ { TEST_PAGE_SIZE, '\1'},
+ {0, 0}
+};
+static struct file_desc simple_delete_flush_test_file[]=
+{
+ { TEST_PAGE_SIZE, '\2'},
+ {0, 0}
+};
+
+
+/**
+ @brief Dummy pagecache callback.
+*/
+
+static my_bool
+dummy_callback(uchar *page __attribute__((unused)),
+ pgcache_page_no_t page_no __attribute__((unused)),
+ uchar* data_ptr __attribute__((unused)))
+{
+ /* The tests need no per-page read/write hook; always report success. */
+ return 0;
+}
+
+
+/**
+ @brief Dummy pagecache callback.
+*/
+
+static void
+dummy_fail_callback(uchar* data_ptr __attribute__((unused)))
+{
+ /* Nothing to undo on a failed page write in these tests. */
+ return;
+}
+
+
+/*
+ Recreate and reopen a file for test
+
+ SYNOPSIS
+ reset_file()
+ file File to reset
+ file_name Path (and name) of file which should be reset
+*/
+
+void reset_file(PAGECACHE_FILE *file, const char *file_name)
+{
+ /* Purge every cached block of this file so no stale pages survive. */
+ flush_pagecache_blocks(&pagecache, file, FLUSH_RELEASE);
+ if (my_close(file->file, MYF(MY_WME)))
+ exit(1);
+ my_delete(file_name, MYF(MY_WME));
+ /* Recreate the file empty; the next test rebuilds its content. */
+ if ((file->file= my_open(file_name,
+ O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+ {
+ diag("Got error during %s creation from open() (errno: %d)\n",
+ file_name, my_errno);
+ exit(1);
+ }
+}
+
+/*
+ Write then read page, check file on disk
+*/
+
+int simple_read_write_test()
+{
+ /* NOTE(review): malloc() results are unchecked; acceptable in a unit test. */
+ unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+ unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+ int res;
+ DBUG_ENTER("simple_read_write_test");
+ /* Write one page of '\1' bytes through the cache (delayed write). */
+ bfill(buffw, TEST_PAGE_SIZE, '\1');
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ /* Read the page back; it must match what was just written. */
+ pagecache_read(&pagecache, &file1, 0, 3, buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ 0);
+ ok((res= test(memcmp(buffr, buffw, TEST_PAGE_SIZE) == 0)),
+ "Simple write-read page ");
+ /* Force the page to disk and verify the on-disk image. */
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error during flushing pagecache\n");
+ exit(1);
+ }
+ ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+ simple_read_write_test_file))),
+ "Simple write-read page file");
+ if (res)
+ reset_file(&file1, file1_name);
+ free(buffw);
+ free(buffr);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Prepare page, then read (and lock), change (write new value and unlock),
+ then check the page in the cache and on the disk
+*/
+int simple_read_change_write_read_test()
+{
+ unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+ unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+ int res, res2;
+ DBUG_ENTER("simple_read_change_write_read_test");
+
+ /* prepare the file: one page of '\1' bytes, flushed to disk */
+ bfill(buffw, TEST_PAGE_SIZE, '\1');
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error during flushing pagecache\n");
+ exit(1);
+ }
+ /* test: read with a write lock, change the first half, write back */
+ pagecache_read(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ 0);
+ bfill(buffw, TEST_PAGE_SIZE/2, '\65');
+ /* write releases the lock and unpins the page */
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+
+ pagecache_read(&pagecache, &file1, 0, 3, buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ 0);
+ ok((res= test(memcmp(buffr, buffw, TEST_PAGE_SIZE) == 0)),
+ "Simple read-change-write-read page ");
+ /* exactly the one dirty block must be pending before the flush... */
+ DBUG_ASSERT(pagecache.blocks_changed == 1);
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error during flushing pagecache\n");
+ exit(1);
+ }
+ /* ...and none after it */
+ DBUG_ASSERT(pagecache.blocks_changed == 0);
+ ok((res2= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+ simple_read_change_write_read_test_file))),
+ "Simple read-change-write-read page file");
+ if (res && res2)
+ reset_file(&file1, file1_name);
+ free(buffw);
+ free(buffr);
+ DBUG_RETURN(res && res2);
+}
+
+
+/*
+ Prepare page, read page 0 (and pin) then write page 1 and page 0.
+ Flush the file (should flush only page 1 and return 1 (page 0 is
+ still pinned).
+ Check file on the disk.
+ Unpin and flush.
+ Check file on the disk.
+*/
+int simple_pin_test()
+{
+ unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+ int res;
+ DBUG_ENTER("simple_pin_test");
+ /* prepare the file */
+ bfill(buffw, TEST_PAGE_SIZE, '\1');
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ /* test */
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error during flushing pagecache\n");
+ exit(1);
+ }
+ /* read page 0 back with a write lock, leaving it pinned */
+ pagecache_read(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ 0);
+ /* page 1 is written unpinned, so it remains flushable */
+ pagecache_write(&pagecache, &file1, 1, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ bfill(buffw + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE/2, ((unsigned char) 129));
+ /* rewrite page 0 keeping its existing write lock and pin */
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ /*
+ We must get an error because page 0 of the file is still pinned;
+ only the other page should be flushed
+ */
+ if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Did not get error in flush_pagecache_blocks\n");
+ res= 0;
+ goto err;
+ }
+ ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE*2,
+ simple_pin_test_file1))),
+ "Simple pin page file with pin");
+ /* release the lock and pin; now the full flush must succeed */
+ pagecache_unlock(&pagecache,
+ &file1,
+ 0,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_UNPIN,
+ 0, 0, 0);
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error in flush_pagecache_blocks\n");
+ res= 0;
+ goto err;
+ }
+ ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE,
+ simple_pin_test_file2))),
+ "Simple pin page result file");
+ if (res)
+ reset_file(&file1, file1_name);
+err:
+ free(buffw);
+ DBUG_RETURN(res);
+}
+
+/*
+ Prepare page, read page 0 (and pin) then write page 1 and page 0.
+ Flush the file (should flush only page 1 and return 1 (page 0 is
+ still pinned).
+ Check file on the disk.
+ Unpin and flush.
+ Check file on the disk.
+*/
+int simple_pin_test2()
+{
+ unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+ int res;
+ DBUG_ENTER("simple_pin_test2");
+ /* prepare the file */
+ bfill(buffw, TEST_PAGE_SIZE, '\1');
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ /* test */
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error during flushing pagecache\n");
+ exit(1);
+ }
+ /* read page 0 back with a write lock, leaving it pinned */
+ pagecache_read(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ 0);
+ pagecache_write(&pagecache, &file1, 1, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ bfill(buffw + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE/2, ((unsigned char) 129));
+ /* unlike simple_pin_test(): downgrade the write lock to a read lock */
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE_TO_READ,
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ /*
+ A lazy flush must report an error because page 0 of the file is
+ still pinned; only the other page should be flushed
+ */
+ if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY))
+ {
+ diag("Did not get error in flush_pagecache_blocks 2\n");
+ res= 0;
+ goto err;
+ }
+ ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE*2,
+ simple_pin_test_file1))),
+ "Simple pin page file with pin 2");
+
+ /* Test that a normal flush goes through */
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error in flush_pagecache_blocks 3\n");
+ res= 0;
+ goto err;
+ }
+ /* release the read lock (it was downgraded above) and the pin */
+ pagecache_unlock(&pagecache,
+ &file1,
+ 0,
+ PAGECACHE_LOCK_READ_UNLOCK,
+ PAGECACHE_UNPIN,
+ 0, 0, 0);
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error in flush_pagecache_blocks 4\n");
+ res= 0;
+ goto err;
+ }
+ ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE,
+ simple_pin_test_file2))),
+ "Simple pin page result file 2");
+ if (res)
+ reset_file(&file1, file1_name);
+err:
+ free(buffw);
+ DBUG_RETURN(res);
+}
+
+/*
+ Checks pins without lock.
+*/
+int simple_pin_no_lock_test()
+{
+ unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+ PAGECACHE_BLOCK_LINK *link;
+ int res;
+ DBUG_ENTER("simple_pin_no_lock_test");
+ /* prepare the file */
+ bfill(buffw, TEST_PAGE_SIZE, '\4');
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ /* test */
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error during flushing pagecache 2\n");
+ exit(1);
+ }
+ /* rewrite the page taking a pin but no lock */
+ bfill(buffw, TEST_PAGE_SIZE, '\5');
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ /*
+ A lazy flush must report an error because the page is pinned,
+ even though it holds no lock
+ */
+ if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY))
+ {
+ diag("Did not get error in flush_pagecache_blocks 2\n");
+ res= 0;
+ goto err;
+ }
+ ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+ simple_pin_no_lock_test_file1))),
+ "Simple pin (no lock) page file with pin 2");
+ /* drop only the pin; no lock was taken */
+ pagecache_unlock(&pagecache,
+ &file1,
+ 0,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_UNPIN,
+ 0, 0, 0);
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error in flush_pagecache_blocks 2\n");
+ res= 0;
+ goto err;
+ }
+ ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+ simple_pin_no_lock_test_file2))),
+ "Simple pin (no lock) page result file 2");
+
+ /* same check again, this time addressing the block by its link */
+ bfill(buffw, TEST_PAGE_SIZE, '\6');
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ PAGECACHE_PIN,
+ PAGECACHE_WRITE_DELAY,
+ &link, LSN_IMPOSSIBLE);
+ /* unlock but keep the pin: a lazy flush must still fail */
+ pagecache_unlock_by_link(&pagecache, link,
+ PAGECACHE_LOCK_WRITE_UNLOCK,
+ PAGECACHE_PIN_LEFT_PINNED, 0, 0, 1, FALSE);
+ if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY))
+ {
+ diag("Did not get error in flush_pagecache_blocks 3\n");
+ res= 0;
+ goto err;
+ }
+ ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+ simple_pin_no_lock_test_file2))),
+ "Simple pin (no lock) page file with pin 3");
+ pagecache_unpin_by_link(&pagecache, link, 0);
+ if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+ {
+ diag("Got error in flush_pagecache_blocks 3\n");
+ res= 0;
+ goto err;
+ }
+ ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+ simple_pin_no_lock_test_file3))),
+ "Simple pin (no lock) page result file 3");
+ if (res)
+ reset_file(&file1, file1_name);
+err:
+ free(buffw);
+ DBUG_RETURN(res);
+}
+/*
+ Prepare page, write new value, then delete page from cache without flush,
+ on the disk should be page with old content written during preparation
+*/
+
+int simple_delete_forget_test()
+{
+ unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+ /* NOTE(review): buffr is allocated but never read/written, only freed */
+ unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+ int res;
+ DBUG_ENTER("simple_delete_forget_test");
+ /* prepare the file: '\1' page flushed to disk */
+ bfill(buffw, TEST_PAGE_SIZE, '\1');
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ /* test: overwrite in cache, then delete without flushing (flush=0) */
+ bfill(buffw, TEST_PAGE_SIZE, '\2');
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ pagecache_delete(&pagecache, &file1, 0,
+ PAGECACHE_LOCK_WRITE, 0);
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ /* the disk must still hold the old '\1' content */
+ ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+ simple_delete_forget_test_file))),
+ "Simple delete-forget page file");
+ if (res)
+ reset_file(&file1, file1_name);
+ free(buffw);
+ free(buffr);
+ DBUG_RETURN(res);
+}
+
+/*
+ Prepare page with locking, write new content to the page,
+ delete page with flush and on existing lock,
+ check that page on disk contain new value.
+*/
+
+int simple_delete_flush_test()
+{
+ unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+ /* NOTE(review): buffr is allocated but never used, only freed */
+ unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+ PAGECACHE_BLOCK_LINK *link;
+ int res;
+ DBUG_ENTER("simple_delete_flush_test");
+ /* prepare the file: write '\1' page, keep it write-locked and pinned */
+ bfill(buffw, TEST_PAGE_SIZE, '\1');
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_WRITE,
+ PAGECACHE_PIN,
+ PAGECACHE_WRITE_DELAY,
+ &link, LSN_IMPOSSIBLE);
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ /* test: overwrite under the existing lock, then delete with flush=1 */
+ bfill(buffw, TEST_PAGE_SIZE, '\2');
+ pagecache_write(&pagecache, &file1, 0, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED,
+ PAGECACHE_PIN_LEFT_PINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ if (pagecache_delete_by_link(&pagecache, link,
+ PAGECACHE_LOCK_LEFT_WRITELOCKED, 1))
+ {
+ diag("simple_delete_flush_test: error during delete");
+ exit(1);
+ }
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+ /* the delete flushed first, so the disk must hold the new '\2' content */
+ ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+ simple_delete_flush_test_file))),
+ "Simple delete flush (link) page file");
+ if (res)
+ reset_file(&file1, file1_name);
+ free(buffw);
+ free(buffr);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ write then read file bigger then cache
+*/
+
+int simple_big_test()
+{
+ unsigned char *buffw= (unsigned char *) my_malloc(TEST_PAGE_SIZE, MYF(MY_WME));
+ unsigned char *buffr= (unsigned char *) my_malloc(TEST_PAGE_SIZE, MYF(MY_WME));
+ struct file_desc *desc= ((struct file_desc *)
+ my_malloc((PCACHE_SIZE/(TEST_PAGE_SIZE/2) + 1) *
+ sizeof(struct file_desc), MYF(MY_WME)));
+ int res, i;
+ DBUG_ENTER("simple_big_test");
+
+ /* prepare a file twice as large as the cache; page i is filled with i&0xff */
+ for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE/2); i++)
+ {
+ bfill(buffw, TEST_PAGE_SIZE, (unsigned char) (i & 0xff));
+ desc[i].length= TEST_PAGE_SIZE;
+ desc[i].content= (i & 0xff);
+ pagecache_write(&pagecache, &file1, i, 3, buffw,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DELAY,
+ 0, LSN_IMPOSSIBLE);
+ }
+ /* terminate the descriptor array for test_file() */
+ desc[i].length= 0;
+ desc[i].content= '\0';
+ ok(1, "Simple big file write");
+ /* check written pages by sequential read */
+ for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE/2); i++)
+ {
+ int j;
+ pagecache_read(&pagecache, &file1, i, 3, buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ 0);
+ for(j= 0; j < TEST_PAGE_SIZE; j++)
+ {
+ if (buffr[j] != (i & 0xff))
+ {
+ diag("simple_big_test seq: page %u byte %u mismatch\n", i, j);
+ res= 0;
+ goto err;
+ }
+ }
+ }
+ ok(1, "Simple big file sequential read");
+ /* check random reads (forces eviction/reload since file > cache) */
+ for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE); i++)
+ {
+ int j, page;
+ page= rand() % (PCACHE_SIZE/(TEST_PAGE_SIZE/2));
+ pagecache_read(&pagecache, &file1, page, 3, buffr,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ 0);
+ for(j= 0; j < TEST_PAGE_SIZE; j++)
+ {
+ if (buffr[j] != (page & 0xff))
+ {
+ diag("simple_big_test rnd: page %u byte %u mismatch\n", page, j);
+ res= 0;
+ goto err;
+ }
+ }
+ }
+ ok(1, "Simple big file random read");
+ flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+
+ ok((res= test(test_file(file1, file1_name, PCACHE_SIZE*2, TEST_PAGE_SIZE,
+ desc))),
+ "Simple big file");
+ if (res)
+ reset_file(&file1, file1_name);
+
+err:
+ my_free(buffw, 0);
+ my_free(buffr, 0);
+ my_free(desc, 0);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Thread function
+*/
+
+static void *test_thread(void *arg)
+{
+#ifndef DBUG_OFF
+ int param= *((int*) arg);
+#endif
+
+ my_thread_init();
+ {
+ DBUG_ENTER("test_thread");
+ DBUG_PRINT("enter", ("param: %d", param));
+
+ /* run every single-threaded test; abort the process on first failure */
+ if (!simple_read_write_test() ||
+ !simple_read_change_write_read_test() ||
+ !simple_pin_test() ||
+ !simple_pin_test2() ||
+ !simple_pin_no_lock_test() ||
+ !simple_delete_forget_test() ||
+ !simple_delete_flush_test())
+ exit(1);
+
+ SKIP_BIG_TESTS(4)
+ {
+ if (!simple_big_test())
+ exit(1);
+ }
+
+ DBUG_PRINT("info", ("Thread %s ended\n", my_thread_name()));
+ pthread_mutex_lock(&LOCK_thread_count);
+ thread_count--;
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ DBUG_RETURN(0);
+ }
+}
+
+
+/*
+ Test driver: creates the test file and the page cache, spawns one
+ detached worker thread (test_thread) and waits on COND_thread_count
+ until it finishes, then tears everything down.
+*/
+int main(int argc __attribute__((unused)),
+ char **argv __attribute__((unused)))
+{
+ pthread_t tid;
+ pthread_attr_t thr_attr;
+ int *param, error, pagen;
+ File tmp_file;
+ MY_INIT(argv[0]);
+
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\test_pagecache_single.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/test_pagecache_single.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+ {
+ DBUG_ENTER("main");
+ DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name()));
+
+ plan(18);
+ SKIP_BIG_TESTS(18)
+ {
+
+ /* file2 is opened only transiently and removed again below */
+ if ((tmp_file= my_open(file2_name, O_CREAT | O_TRUNC | O_RDWR,
+ MYF(MY_WME))) < 0)
+ exit(1);
+
+ if ((file1.file= my_open(file1_name,
+ O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+ {
+ fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+ pagecache_file_init(file1, &dummy_callback, &dummy_callback,
+ &dummy_fail_callback, &dummy_callback, NULL);
+ my_close(tmp_file, MYF(0));
+ my_delete(file2_name, MYF(0));
+
+ DBUG_PRINT("info", ("file1: %d", file1.file));
+ if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
+ exit(1);
+ my_pwrite(file1.file, (const uchar*)"test file", 9, 0, MYF(MY_WME));
+
+ if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+ {
+ fprintf(stderr, "Got error: %d from pthread_cond_init (errno: %d)\n",
+ error, errno);
+ exit(1);
+ }
+ if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+ {
+ /* fixed: message previously said pthread_cond_init (copy-paste) */
+ fprintf(stderr, "Got error: %d from pthread_mutex_init (errno: %d)\n",
+ error, errno);
+ exit(1);
+ }
+
+ if ((error= pthread_attr_init(&thr_attr)))
+ {
+ fprintf(stderr,"Got error: %d from pthread_attr_init (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+ /* detached: completion is signalled via COND_thread_count, not join */
+ if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED)))
+ {
+ fprintf(stderr,
+ "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+ VOID(thr_setconcurrency(2));
+#endif
+
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ TEST_PAGE_SIZE, MYF(MY_WME))) == 0)
+ {
+ fprintf(stderr,"Got error: init_pagecache() (errno: %d)\n",
+ errno);
+ exit(1);
+ }
+ DBUG_PRINT("info", ("Page cache %d pages", pagen));
+
+ pthread_mutex_lock(&LOCK_thread_count);
+ /* param is freed by the worker thread */
+ param=(int*) malloc(sizeof(int));
+ *param= 1;
+ if ((error= pthread_create(&tid, &thr_attr, test_thread, (void*) param)))
+ {
+ fprintf(stderr,"Got error: %d from pthread_create (errno: %d)\n",
+ error,errno);
+ exit(1);
+ }
+ thread_count++;
+ DBUG_PRINT("info", ("Thread started"));
+ pthread_mutex_unlock(&LOCK_thread_count);
+
+ pthread_attr_destroy(&thr_attr);
+
+ /* wait for the worker to decrement thread_count and signal us */
+ pthread_mutex_lock(&LOCK_thread_count);
+ while (thread_count)
+ {
+ if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count)))
+ fprintf(stderr,"Got error: %d from pthread_cond_wait\n",error);
+ }
+ pthread_mutex_unlock(&LOCK_thread_count);
+ DBUG_PRINT("info", ("thread ended"));
+
+ end_pagecache(&pagecache, 1);
+ DBUG_PRINT("info", ("Page cache ended"));
+
+ if (my_close(file1.file, MYF(MY_WME)))
+ exit(1);
+
+ my_delete(file1_name, MYF(0));
+
+ } /* SKIP_BIG_TESTS */
+ DBUG_PRINT("info", ("file1 (%d) closed", file1.file));
+ DBUG_PRINT("info", ("Program end"));
+
+ my_end(0);
+
+ }
+ return exit_status();
+}
diff --git a/storage/maria/unittest/ma_test_all-t b/storage/maria/unittest/ma_test_all-t
new file mode 100755
index 00000000000..0b11daf7f98
--- /dev/null
+++ b/storage/maria/unittest/ma_test_all-t
@@ -0,0 +1,710 @@
+#!/usr/bin/env perl
+#
+# Run various unit tests.
+#
+
+use Getopt::Long;
+use File::Basename;
+
+$|= 1;
+$^W = 1; # warnings, because env cannot parse 'perl -w'
+$VER= "1.4";
+
+$opt_version= 0;
+$opt_help= 0;
+$opt_verbose= 0;
+$opt_abort_on_error= 0;
+$opt_valgrind= "valgrind --alignment=8 --leak-check=yes";
+$opt_silent= "-s";
+$opt_number_of_tests= 0;
+$opt_run_tests= undef();
+
+my $maria_path; # path to "storage/maria"
+my $maria_exe_path; # path to executables (ma_test1, aria_chk etc)
+my $my_progname= $0;
+$my_progname=~ s/.*[\/]//;
+my $runtime_error= 0; # Return 1 if error(s) occur during run
+my $NEW_TEST= 0; # Test group separator in an array of tests
+my $test_begin= 0;
+my $test_end= 0;
+my $test_counter= 0;
+
+run_tests();
+
+####
+#### Initialise variables, clean temporary files and run the tests
+####
+
+# Parse options, locate the test executables, count the expected number of
+# TAP tests, clean temporary files and run every test group in each row
+# format (dynamic, static, block, block+transactions, block+trans+versioning).
+sub run_tests
+{
+ my $nr_tests= 0;
+ my $flag_exit= 0;
+
+ if (!GetOptions("help" => \$opt_help,
+ "version" => \$opt_version,
+ "verbose" => \$opt_verbose,
+ "abort-on-error" => \$opt_abort_on_error,
+ "valgrind=s" => \$opt_valgrind,
+ "silent=s" => \$opt_silent,
+ "number-of-tests" => \$opt_number_of_tests,
+ "run-tests=s" => \$opt_run_tests,
+ "start-from=s" => \$opt_run_tests))
+ {
+ $flag_exit= 1;
+ }
+ if ($opt_version)
+ {
+ print "$my_progname version $VER\n";
+ exit(0);
+ }
+ $maria_path= dirname($0) . "/..";
+
+ my $suffix= ( $^O =~ /win/i && $^O !~ /darwin/i ) ? ".exe" : "";
+ # Probe the usual CMake build directories for the test binaries.
+ $maria_exe_path= "$maria_path/release";
+ # we use -f, sometimes -x is unexpectedly false in Cygwin
+ if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+ {
+ $maria_exe_path= "$maria_path/relwithdebinfo";
+ if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+ {
+ $maria_exe_path= "$maria_path/debug";
+ if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+ {
+ $maria_exe_path= $maria_path;
+ if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+ {
+ die("Cannot find ma_test1 executable\n");
+ }
+ }
+ }
+ }
+
+ usage() if ($opt_help || $flag_exit);
+
+ #
+ # IMPORTANT: If you modify this file, please read this:
+ #
+ # Count total number of tests. Make sure that the functions return
+ # number of unit tests correctly, e.g. calls to ok(). The last argument
+ # for each function is a flag counter and will return the number of
+ # unit tests in each. Please see comments on function ok() at the end.
+ #
+ # If you modify any functions or add any new ones, please make sure the
+ # unit tests are appropriately detected here. A wrong count will
+ # make the unit test fail during 'make test'. $nr_tests must be right.
+ #
+
+ $nr_tests+= run_check_tests(0, 0, 0, 0, 1) * 5; #
+ $nr_tests+= run_repair_tests(0, 0, 0, 0, 1) * 5; # called 4 times
+ $nr_tests+= run_pack_tests(0, 0, 0, 0, 1) * 5; #
+ $nr_tests+= run_tests_on_warnings_and_errors(0, 0, 0, 1);
+ $nr_tests+= run_ma_test_recovery(0, 1);
+ $nr_tests+= run_tests_on_clrs(0, 0, 1);
+
+ if ($opt_number_of_tests)
+ {
+ print "Total number of tests is $nr_tests\n";
+ exit(0);
+ }
+
+ # --run-tests accepts "N", "N..", or "N..M" (a 1-based test range).
+ if (defined($opt_run_tests))
+ {
+ if ($opt_run_tests =~ m/^(\d+)$/ ||
+ $opt_run_tests =~ m/^(\d+)\.+$/)
+ {
+ $test_begin= $1;
+ }
+ elsif ($opt_run_tests =~ m/^(\d+)\.+(\d+)$/)
+ {
+ $test_begin= $1;
+ $test_end= $2;
+ }
+ else
+ {
+ print "Wrong syntax for option --run-tests=$opt_run_tests\n";
+ print "Please use --run-tests=<begin>..<end>\nwhere 'begin' is the ";
+ print "first test to be run and 'end' is the last.\n";
+ exit(1);
+ }
+ if ($test_end > $nr_tests)
+ {
+ print "Test range ($test_begin..$test_end) out of range. ";
+ print "There are only $nr_tests in the test suite.\n";
+ exit(1);
+ }
+ $test_begin++ if (!$test_begin); # Handle zero, if user gave that
+ if ($test_end && $test_begin > $test_end)
+ {
+ print "Bad test range ($test_begin..$test_end)\n";
+ exit(1);
+ }
+ # Now adjust number of tests
+ $nr_tests= ($test_end ? $test_end : $nr_tests) - $test_begin + 1;
+ }
+
+ #
+ # clean-up
+ #
+
+ unlink <*.TMD aria_log*>; # Delete temporary files
+
+ #
+ # Run tests
+ #
+
+ if (!$opt_verbose)
+ {
+ print "1..$nr_tests\n";
+ }
+ else
+ {
+ print "Total tests: $nr_tests\n";
+ }
+
+ if ($opt_verbose)
+ {
+ print "Running tests with dynamic row format\n"
+ }
+ run_check_tests($suffix, $opt_silent, "", $opt_verbose, 0);
+ run_repair_tests($suffix, $opt_silent, "", $opt_verbose, 0);
+ run_pack_tests($suffix, $opt_silent, "", $opt_verbose, 0);
+
+ if ($opt_verbose)
+ {
+ print "\nRunning tests with static row format\n";
+ }
+ run_check_tests($suffix, $opt_silent, "-S", $opt_verbose, 0);
+ run_repair_tests($suffix, $opt_silent, "-S", $opt_verbose, 0);
+ run_pack_tests($suffix, $opt_silent, "-S", $opt_verbose, 0);
+
+ if ($opt_verbose)
+ {
+ print "\nRunning tests with block row format\n";
+ }
+ run_check_tests($suffix, $opt_silent, "-M", $opt_verbose, 0);
+ run_repair_tests($suffix, $opt_silent, "-M", $opt_verbose, 0);
+ run_pack_tests($suffix, $opt_silent, "-M", $opt_verbose, 0);
+
+ if ($opt_verbose)
+ {
+ print "\nRunning tests with block row format and transactions\n";
+ }
+ run_check_tests($suffix, $opt_silent, "-M -T", $opt_verbose, 0);
+ run_repair_tests($suffix, $opt_silent, "-M -T", $opt_verbose, 0);
+ run_pack_tests($suffix, $opt_silent, "-M -T", $opt_verbose, 0);
+
+ if ($opt_verbose)
+ {
+ print "\nRunning tests with block row format, transactions and versioning\n";
+ }
+ run_check_tests($suffix, $opt_silent, "-M -T -C", $opt_verbose, 0);
+ run_repair_tests($suffix, $opt_silent, "-M -T -C", $opt_verbose, 0);
+ run_pack_tests($suffix, $opt_silent, "-M -T -C", $opt_verbose, 0);
+
+
+ if ($opt_verbose)
+ {
+ print "\nRunning tests with warnings and recovery\n";
+ }
+ run_tests_on_warnings_and_errors($suffix, $opt_silent, $opt_verbose, 0);
+ run_ma_test_recovery($opt_verbose, 0);
+ run_tests_on_clrs($suffix, $opt_verbose, 0);
+
+ exit($runtime_error);
+}
+
+####
+#### regular tests
+####
+
+# Run ma_test1/ma_test2/ma_rt_test with many option combinations, checking
+# each produced table with aria_chk. With $count true, only return the number
+# of ok() calls this function would make (used for the TAP plan).
+sub run_check_tests
+{
+ my ($suffix, $silent, $row_type, $verbose, $count)= @_;
+ my ($i, $nr_tests);
+ # Each entry: [options for ma_test1, options for the aria_chk verify pass]
+ my @ma_test1_opt= ( ["","-se"],
+ ["-N","-se"],
+ ["-P --checksum","-se"],
+ ["-P -N","-se"],
+ ["-B -N -R2","-sm"],
+ ["-a -k 480 --unique","-sm"],
+ ["-a -N -R1 ","-sm"],
+ ["-p","-sm"],
+ ["-p -N --unique","-sm"],
+ ["-p -N --key_length=127 --checksum","-sm"],
+ ["-p -N --key_length=128","-sm"],
+ ["-p --key_length=480","-sm"],
+ ["-a -B","-sm"],
+ ["-a -B --key_length=64 --unique","-sm"],
+ ["-a -B -k 480 --checksum","-sm"],
+ ["-a -B -k 480 -N --unique --checksum","-sm"],
+ ["-a -m","-sm"],
+ ["-a -m -P --unique --checksum","-sm"],
+ ["-a -m -P --key_length=480 --key_cache","-sm"],
+ ["-m -p","-sm"],
+ ["-w --unique","-sm"],
+ ["-a -w --key_length=64 --checksum","-sm"],
+ ["-a -w -N --key_length=480","-sm"],
+ ["-a -w --key_length=480 --checksum","-sm"],
+ ["-a -b -N","-sm"],
+ ["-a -b --key_length=480","-sm"],
+ ["-p -B --key_length=480","-sm"],
+ ["--checksum --unique","-se"],
+ ["--unique","-se"],
+ ["--key_multiple -N -S","-sm"],
+ ["--key_multiple -a -p --key_length=480","-sm"],
+ ["--key_multiple -a -B --key_length=480","-sm"],
+ ["--key_multiple -P -S","-sm"] );
+ my @ma_test2_opt= ( ["-L -K -W -P","-sm"],
+ ["-L -K -W -P -A","-sm"],
+ ["-L -K -W -P -b32768", "-sm"],
+ ["-L -K -W -P -M -T -c -b32768 -t4 -m300", "-sm"],
+ ["-L -K -P -R3 -m50 -b1000000", "-sm"],
+ ["-L -B","-sm"],
+ ["-D -B -c","-sm"],
+ ["-m10000 -e4096 -K","-sm"],
+ ["-m10000 -e8192 -K","-sm"],
+ ["-m10000 -e16384 -E16384 -K -L","-sm"],
+ ["-L -K -W -P -b32768", "-se"],
+ ["-c -b65000","-se"] );
+ my @ma_rt_test_opt= ( ); # (["--checksum", "-se"] );
+
+
+ if ($count)
+ {
+ # Counting mode: 2 ok() calls per table entry plus 2 standalone ones.
+ $nr_tests= 2; # Number of tests outside loops
+ for ($i= 0; defined($ma_test1_opt[$i]); $i++) { $nr_tests+=2; }
+ for ($i= 0; defined($ma_test2_opt[$i]); $i++) { $nr_tests+=2; }
+ for ($i= 0; defined($ma_rt_test_opt[$i]); $i++) { $nr_tests+=2; }
+ return $nr_tests;
+ }
+
+ for ($i= 0; defined($ma_test1_opt[$i]); $i++)
+ {
+ unlink <aria_log_control aria_log.*>;
+ ok("$maria_exe_path/ma_test1$suffix $silent $ma_test1_opt[$i][0] $row_type",
+ $verbose, $i + 1);
+ ok("$maria_exe_path/aria_chk$suffix $ma_test1_opt[$i][1] test1",
+ $verbose, $i + 1);
+ }
+ #
+ # These tests are outside the loops. Make sure to include them in
+ # nr_tests manually
+ #
+ ok("$maria_exe_path/aria_pack$suffix --force -s test1", $verbose, 0);
+ ok("$maria_exe_path/aria_chk$suffix -ess test1", $verbose, 0);
+
+ for ($i= 0; defined($ma_test2_opt[$i]); $i++)
+ {
+ unlink <aria_log_control aria_log.*>;
+ ok("$maria_exe_path/ma_test2$suffix $silent $ma_test2_opt[$i][0] $row_type",
+ $verbose, $i + 1);
+ ok("$maria_exe_path/aria_chk$suffix $ma_test2_opt[$i][1] test2",
+ $verbose, $i + 1);
+ }
+
+ for ($i= 0; defined($ma_rt_test_opt[$i]); $i++)
+ {
+ unlink <aria_log_control aria_log.*>;
+ ok("$maria_exe_path/ma_rt_test$suffix $silent $ma_rt_test_opt[$i][0] $row_type",
+ $verbose, $i + 1);
+ ok("$maria_exe_path/aria_chk$suffix $ma_rt_test_opt[$i][1] rt_test",
+ $verbose, $i + 1);
+ }
+
+ unlink <aria_log_control aria_log.*>;
+
+ return 0;
+}
+
+####
+#### repair tests
+####
+
+# Exercise aria_chk repair modes (-r, -rq, -ro, --correct-checksum,
+# --parallel-recover, --zerofill) against tables built by ma_test1/ma_test2.
+# With $count true, only return the number of commands for the TAP plan.
+sub run_repair_tests()
+{
+ my ($suffix, $silent, $row_type, $verbose, $count)= @_;
+ my ($i);
+
+ my @t= ($NEW_TEST,
+ "$maria_exe_path/ma_test1$suffix $silent --checksum $row_type",
+ "$maria_exe_path/aria_chk$suffix -se test1",
+ "$maria_exe_path/aria_chk$suffix --silent -re --transaction-log test1",
+ "$maria_exe_path/aria_chk$suffix -rs test1",
+ "$maria_exe_path/aria_chk$suffix -se test1",
+ "$maria_exe_path/aria_chk$suffix -rqs test1",
+ "$maria_exe_path/aria_chk$suffix -se test1",
+ "$maria_exe_path/aria_chk$suffix -rs --correct-checksum test1",
+ "$maria_exe_path/aria_chk$suffix -se test1",
+ "$maria_exe_path/aria_chk$suffix -rqs --correct-checksum test1",
+ "$maria_exe_path/aria_chk$suffix -se test1",
+ "$maria_exe_path/aria_chk$suffix -ros --correct-checksum test1",
+ "$maria_exe_path/aria_chk$suffix -se test1",
+ "$maria_exe_path/aria_chk$suffix -rqos --correct-checksum test1",
+ "$maria_exe_path/aria_chk$suffix -se test1",
+ "$maria_exe_path/aria_chk$suffix -sz test1",
+ "$maria_exe_path/aria_chk$suffix -se test1",
+ "$maria_exe_path/ma_test2$suffix $silent -c -d1 $row_type",
+ "$maria_exe_path/aria_chk$suffix -s --parallel-recover test2",
+ "$maria_exe_path/aria_chk$suffix -se test2",
+ "$maria_exe_path/aria_chk$suffix -s --parallel-recover --quick test2",
+ "$maria_exe_path/aria_chk$suffix -se test2",
+ "$maria_exe_path/ma_test2$suffix $silent -c $row_type",
+ "$maria_exe_path/aria_chk$suffix -se test2",
+ "$maria_exe_path/aria_chk$suffix -sr test2",
+ "$maria_exe_path/aria_chk$suffix -se test2",
+ "$maria_exe_path/ma_test2$suffix $silent -c -t4 -b32768 $row_type",
+ "$maria_exe_path/aria_chk$suffix -s --zerofill test1",
+ "$maria_exe_path/aria_chk$suffix -se test1"
+ );
+
+ return &count_tests(\@t) if ($count);
+ &run_test_bunch(\@t, $verbose, 0);
+ return 0;
+}
+
+####
+#### pack tests
+####
+
+sub run_pack_tests()
+{
+ my ($suffix, $silent, $row_type, $verbose, $count)= @_;
+ my ($i);
+
+ my @t= ($NEW_TEST,
+ "$maria_exe_path/ma_test1$suffix $silent --checksum $row_type",
+ "$maria_exe_path/aria_pack$suffix --force -s test1",
+ "$maria_exe_path/aria_chk$suffix -ess test1",
+ "$maria_exe_path/aria_chk$suffix -rqs test1",
+ "$maria_exe_path/aria_chk$suffix -es test1",
+ "$maria_exe_path/aria_chk$suffix -rs test1",
+ "$maria_exe_path/aria_chk$suffix -es test1",
+ "$maria_exe_path/aria_chk$suffix -rus test1",
+ "$maria_exe_path/aria_chk$suffix -es test1",
+ $NEW_TEST,
+ "$maria_exe_path/ma_test1$suffix $silent --checksum $row_type",
+ "$maria_exe_path/aria_pack$suffix --force -s test1",
+ "$maria_exe_path/aria_chk$suffix -rus --safe-recover test1",
+ "$maria_exe_path/aria_chk$suffix -es test1",
+ $NEW_TEST,
+ "$maria_exe_path/ma_test1$suffix $silent --checksum -S $row_type",
+ "$maria_exe_path/aria_chk$suffix -se test1",
+ "$maria_exe_path/aria_chk$suffix -ros test1",
+ "$maria_exe_path/aria_chk$suffix -rqs test1",
+ "$maria_exe_path/aria_chk$suffix -se test1",
+ $NEW_TEST,
+ "$maria_exe_path/aria_pack$suffix --force -s test1",
+ "$maria_exe_path/aria_chk$suffix -rqs test1",
+ "$maria_exe_path/aria_chk$suffix -es test1",
+ "$maria_exe_path/aria_chk$suffix -rus test1",
+ "$maria_exe_path/aria_chk$suffix -es test1",
+ $NEW_TEST,
+ "$maria_exe_path/ma_test2$suffix $silent -c -d1 $row_type",
+ "$maria_exe_path/aria_chk$suffix -s --parallel-recover test2",
+ "$maria_exe_path/aria_chk$suffix -se test2",
+ "$maria_exe_path/aria_chk$suffix -s --unpack --parallel-recover test2",
+ "$maria_exe_path/aria_chk$suffix -se test2",
+ "$maria_exe_path/aria_pack$suffix --force -s test1",
+ "$maria_exe_path/aria_chk$suffix -s --unpack --parallel-recover test2",
+ "$maria_exe_path/aria_chk$suffix -se test2",
+ $NEW_TEST,
+ "$maria_exe_path/ma_test1$suffix $silent -c $row_type",
+ "cp test1.MAD test2.MAD",
+ "cp test1.MAI test2.MAI",
+ "$maria_exe_path/aria_pack$suffix --force -s --join=test3 test1 test2",
+ "$maria_exe_path/aria_chk -s test3",
+ "$maria_exe_path/aria_chk -s --safe-recover test3",
+ "$maria_exe_path/aria_chk -s test3"
+ );
+
+ return &count_tests(\@t) if ($count);
+ &run_test_bunch(\@t, $verbose, 0);
+ return 0;
+}
+
+####
+#### Tests that gives warnings or errors
+####
+
+sub run_tests_on_warnings_and_errors
+{
+ my ($suffix, $silent, $verbose, $count)= @_;
+ my ($com);
+
+ return 9 if ($count); # Number of tests in this function, e.g. calls to ok()
+
+ ok("$maria_exe_path/ma_test2$suffix $silent -L -K -W -P -S -R1 -m500",
+ $verbose, 0);
+ ok("$maria_exe_path/aria_chk$suffix -sm test2", $verbose, 0);
+ # ma_test2$suffix $silent -L -K -R1 -m2000 ; Should give error 135\n
+ # In the following a failure is a success and success is a failure
+ $com= "$maria_exe_path/ma_test2$suffix $silent -L -K -R1 -m2000 ";
+ $com.= ">ma_test2_message.txt 2>&1";
+ ok($com, $verbose, 0, 1);
+ ok("cat ma_test2_message.txt", $verbose, 0);
+ ok("grep \"Error: 135\" ma_test2_message.txt > /dev/null", $verbose, 0);
+ # maria_exe_path/aria_chk$suffix -sm test2 will warn that
+ # Datafile is almost full
+ ok("$maria_exe_path/aria_chk$suffix -sm test2 >ma_test2_message.txt 2>&1",
+ $verbose, 0);
+ ok("cat ma_test2_message.txt", $verbose, 0);
+ ok("grep \"warning: Datafile is almost full\" ma_test2_message.txt>/dev/null",
+ $verbose, 0);
+ unlink <ma_test2_message.txt>;
+ ok("$maria_exe_path/aria_chk$suffix -ssm test2", $verbose, 0);
+
+ return 0;
+}
+
+####
+#### Test that removing tables and applying the log leads to identical tables
+####
+
+sub run_ma_test_recovery
+{
+ my ($verbose, $count)= @_;
+
+ return 1 if ($count); # Number of tests in this function
+ ok("$maria_path/unittest/ma_test_recovery.pl", $verbose, 0);
+ return 0;
+}
+
+####
+#### Tests on CLR's
+####
+
+sub run_tests_on_clrs
+{
+ my ($suffix, $verbose, $count)= @_;
+ my ($i);
+
+ my @t= ($NEW_TEST,
+ "$maria_exe_path/ma_test2$suffix -s -L -K -W -P -M -T -c -b -t2 -A1",
+ "cp aria_log_control tmp",
+ "$maria_exe_path/aria_read_log$suffix -a -s",
+ "$maria_exe_path/aria_chk$suffix -s -e test2",
+ "cp tmp/aria_log_control .",
+ "rm test2.MA?",
+ "$maria_exe_path/aria_read_log$suffix -a -s",
+ "$maria_exe_path/aria_chk$suffix -s -e test2",
+ "rm test2.MA?",
+ $NEW_TEST,
+ "$maria_exe_path/ma_test2$suffix -s -L -K -W -P -M -T -c -b -t2 -A1",
+ "$maria_exe_path/aria_read_log$suffix -a -s",
+ "$maria_exe_path/aria_chk$suffix -s -e test2",
+ "rm test2.MA?",
+ "$maria_exe_path/aria_read_log$suffix -a -s",
+ "$maria_exe_path/aria_chk$suffix -e -s test2",
+ "rm test2.MA?",
+ $NEW_TEST,
+ "$maria_exe_path/ma_test2$suffix -s -L -K -W -P -M -T -c -b32768 -t4 -A1",
+ "$maria_exe_path/aria_read_log$suffix -a -s",
+ "$maria_exe_path/aria_chk$suffix -es test2",
+ "$maria_exe_path/aria_read_log$suffix -a -s",
+ "$maria_exe_path/aria_chk$suffix -es test2",
+ "rm test2.MA?",
+ "$maria_exe_path/aria_read_log$suffix -a -s",
+ "$maria_exe_path/aria_chk$suffix -es test2",
+ "rm test2.MA?"
+ );
+
+ return &count_tests(\@t) if ($count);
+ &run_test_bunch(\@t, $verbose, 1);
+ return 0;
+}
+
+#
+# Print "ok" on success and "not ok" on error
+#
+# Note: Every time this function is called it will be counted
+# as a unit test.
+#
+# Args: $com: The actual command run. Will be printed on a failure
+# $verbose: Be more verbose.
+# $iteration: Number of iterations in a loop when the error
+# occurred. If not in loop, this should be blank
+# (e.g. send zero).
+# $expected_error: Optional; put here expected error code. Test
+# will pass with this result only.
+#
+# Return value: Will return 1 on success and 0 on an error
+#
+
+sub ok
+{
+ my ($com, $verbose, $iteration, $expected_error)= @_;
+ my ($msg, $output, $err, $len);
+
+ $test_counter++;
+ if ($test_begin > $test_counter)
+ {
+ return 0;
+ }
+ if ($test_end && $test_end < $test_counter)
+ {
+ exit(0);
+ }
+
+ $msg= "";
+ $expected_error= 0 if (!defined($expected_error));
+
+ if ($verbose)
+ {
+ print "$com ";
+ }
+ $output= `$com 2>&1`;
+ $len= length($com);
+ if ($verbose)
+ {
+ print " " x (62 - $len);
+ }
+ $err= $?;
+ if ((!$err && !$expected_error) ||
+ (($err >> 8) == $expected_error && $expected_error))
+ {
+ print "[ " if ($verbose);
+ print "ok";
+ if ($verbose)
+ {
+ print " ]";
+ print " " x (5 - length("$test_counter"));
+ print "$test_counter";
+ }
+ else
+ {
+ print " $test_counter - $com"
+ }
+ print "\n";
+ return 1;
+ }
+ print "[ " if ($verbose);
+ print "not ok";
+ print " ]" if ($verbose);
+ print " $test_counter - $com" unless $verbose;
+ print "\n";
+ if ($verbose && defined($output) && length($output))
+ {
+ print "$output\n";
+ }
+ if (!$verbose)
+ {
+ $msg= "\n"; # Get a nicer output in perl unit test mode
+ }
+ $msg.= "Failed test '$com' ";
+ if ($iteration)
+ {
+ $msg.= "(loop iteration $iteration.) ";
+ }
+ $msg.= "at line ";
+ $msg.= (caller)[2];
+ $msg.= "\n(errcode: $err, test: $test_counter)\n";
+ if ($expected_error)
+ {
+ $msg.= "Was expecting errcode: $expected_error\n";
+ }
+ warn $msg;
+ $runtime_error= 1;
+ if ($opt_abort_on_error)
+ {
+ exit 1;
+ }
+ return 0;
+}
+
+#
+# Print "skip" and the reason
+#
+# Note: Every time this function is called it will be counted
+# as a unit test.
+#
+# Args: $com: The actual command run. Will be printed on a failure
+# $reason: The reason to skip a test
+# $verbose: Be more verbose.
+#
+
+sub skip
+{
+ my ($com, $reason, $verbose)= @_;
+
+ $test_counter++;
+ return 0 if $test_begin > $test_counter;
+ exit 0 if $test_end && $test_end < $test_counter;
+ printf '%-64s[ skipped ]%5d', $com, $test_counter if $verbose;
+ print "ok $test_counter # skip $reason" unless $verbose;
+ print "\n";
+ return 1;
+}
+
+####
+#### Count tests
+#### Arguments: $t: an array of the tests
+####
+
+sub count_tests
+{
+ my ($t)= @_;
+ my ($i, $nr_tests);
+
+ $nr_tests= 0;
+ for ($i= 0; defined(@$t[$i]); $i++) { $nr_tests++ if (@$t[$i]); }
+ return $nr_tests;
+}
+
+####
+#### Run a bunch of tests
+#### Arguments: $t: an array of the tests
+#### $verbose: to be passed to ok()
+#### $clear: clear log files if set
+####
+
+sub run_test_bunch
+{
+ my ($t, $verbose, $clear)= @_;
+ my ($i);
+
+ for ($i= 0; defined(@$t[$i]); $i++)
+ {
+ if ($clear && @$t[$i] eq $NEW_TEST)
+ {
+ unlink <aria_log.* aria_log_control>;
+ }
+ if (@$t[$i] ne $NEW_TEST)
+ {
+ ok(@$t[$i], $verbose, $i + 1);
+ }
+ }
+}
+
+####
+#### usage
+####
+
+sub usage
+{
+ print <<EOF;
+$my_progname version $VER
+
+Description:
+
+Run various Aria related tests. Typically used via make test as a unittest.
+
+Options
+--help Show this help and exit.
+--abort-on-error Abort at once in case of error.
+--number-of-tests Print the total number of tests and exit.
+--run-tests=... Test number(s) that should be run. You can give just
+ one number or a range. For example 45..89. To run a specific
+ test alone, for example test 215, use --run-tests=215..215
+ Use this option with caution, because some of the tests
+ might depend on previous ones.
+--start-from=... Alias for --run-tests
+--silent=... Silent option passed to ma_test* tests ('$opt_silent')
+--valgrind=... Options for valgrind.
+ ('$opt_valgrind')
+--verbose Be more verbose. Will print each unittest on a line
+ and result after. This mode cannot be used with unit.pl
+ when running in normal unit test mode.
+--version Show version number and exit.
+EOF
+ exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler-t.c b/storage/maria/unittest/ma_test_loghandler-t.c
new file mode 100644
index 00000000000..ffac9b04839
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler-t.c
@@ -0,0 +1,661 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void example_loghandler_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+static TRN *trn= &dummy_transaction_object;
+
+#define PCACHE_SIZE (1024*1024*10)
+
+#define LONG_BUFFER_SIZE (100 * 1024)
+
+#ifdef LONG_LOG_TEST
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE (1024L*1024L*8)
+#define ITERATIONS (1600*4)
+
+#else
+#undef SKIP_BIG_TESTS
+#define SKIP_BIG_TESTS(X) /* no-op */
+#define LOG_FLAGS (TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC)
+#define LOG_FILE_SIZE (1024L*1024L*8L)
+#define ITERATIONS 1600
+#endif
+
+/*
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE 1024L*1024L*1024L
+#define ITERATIONS 181000
+*/
+
+/*
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE 1024L*1024L*3L
+#define ITERATIONS 1600
+*/
+
+/*
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE 1024L*1024L*100L
+#define ITERATIONS 65000
+*/
+
+/*
+ Generate random value in the range (0,LONG_BUFFER_SIZE)
+*/
+static uint32 rand_buffer_size()
+{
+ return (uint32)((ulonglong)rand()*(LONG_BUFFER_SIZE + 1)/RAND_MAX);
+}
+
+/*
+ Check that the buffer filled correctly
+
+ SYNOPSIS
+ check_content()
+ ptr Pointer to the buffer
+ length length of the buffer
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+
+static my_bool check_content(uchar *ptr, ulong length)
+{
+ ulong i;
+ uchar buff[2];
+ for (i= 0; i < length; i++)
+ {
+ if (i % 2 == 0)
+ int2store(buff, i >> 1);
+ if (ptr[i] != buff[i % 2])
+ {
+ fprintf(stderr, "Byte # %lu is %x instead of %x",
+ i, (uint) ptr[i], (uint) buff[i % 2]);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+
+/*
+ Report OK for read operation
+
+ SYNOPSIS
+ read_ok()
+ rec the record header
+*/
+
+void read_ok(TRANSLOG_HEADER_BUFFER *rec)
+{
+ ok(1, "read record type: %u LSN: (%lu,0x%lx)",
+ rec->type, LSN_IN_PARTS(rec->lsn));
+}
+
+/*
+ Read whole record content, and check content (put with offset)
+
+ SYNOPSIS
+ read_and_check_content()
+ rec The record header buffer
+ buffer The buffer to read the record in
+ skip Skip this number of bytes ot the record content
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec,
+ uchar *buffer, uint skip)
+{
+ DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE * 2 + 7 * 2 + 2);
+ if (translog_read_record(rec->lsn, 0, rec->record_length, buffer, NULL) !=
+ rec->record_length)
+ return 1;
+ return check_content(buffer + skip, rec->record_length - skip);
+}
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ uint32 i;
+ uint32 rec_len;
+ uint pagen;
+ uchar long_tr_id[6];
+ uchar lsn_buff[23]=
+ {
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55
+ };
+ uchar long_buffer[LONG_BUFFER_SIZE * 2 + LSN_STORE_SIZE * 2 + 2];
+ PAGECACHE pagecache;
+ LSN lsn, lsn_base, first_lsn;
+ TRANSLOG_HEADER_BUFFER rec;
+ LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 3];
+ struct st_translog_scanner_data scanner;
+ int rc;
+
+ MY_INIT(argv[0]);
+
+ if (my_set_max_open_files(100) < 100)
+ {
+ fprintf(stderr, "can't allocate 100 file descriptors\n");
+ exit(1);
+ }
+ bzero(&pagecache, sizeof(pagecache));
+ maria_data_root= (char *)".";
+ if (maria_log_remove())
+ exit(1);
+
+ for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i+= 2)
+ {
+ int2store(long_buffer + i, (i >> 1));
+ /* long_buffer[i]= (i & 0xFF); */
+ }
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_open(TRUE, TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE, 0)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+ LOG_FLAGS, 0, &translog_example_table_init,
+ 0))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ exit(1);
+ }
+ /* Suppressing of automatic record writing */
+ trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ plan(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1 + 1);
+
+ SKIP_BIG_TESTS(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1 + 1)
+ {
+
+ srand(122334817L);
+
+ long_tr_id[5]= 0xff;
+
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ trn->short_id= 0;
+ trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ lsn_base= first_lsn= lsn;
+
+ for (i= 1; i < ITERATIONS; i++)
+ {
+ trn->short_id= i % 0xFFFF;
+ if (i % 2)
+ {
+ lsn_store(lsn_buff, lsn_base);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+ /* check auto-count feature */
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= NULL;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= 0;
+ if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_1LSN_EXAMPLE, trn,
+ NULL, LSN_STORE_SIZE, 0, parts, NULL, NULL))
+ {
+ fprintf(stderr, "1 Can't write reference defore record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+ lsn_store(lsn_buff, lsn_base);
+ if ((rec_len= rand_buffer_size()) < 12)
+ rec_len= 12;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+ /* check record length auto-counting */
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+ trn, NULL, 0, TRANSLOG_INTERNAL_PARTS + 2,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "1 Can't write var reference defore record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+ }
+ else
+ {
+ lsn_store(lsn_buff, lsn_base);
+ lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 23;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_2LSN_EXAMPLE,
+ trn, NULL, 23, TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "0 Can't write reference defore record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+ lsn_store(lsn_buff, lsn_base);
+ lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+ if ((rec_len= rand_buffer_size()) < 19)
+ rec_len= 19;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 14;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE,
+ trn, NULL, 14 + rec_len,
+ TRANSLOG_INTERNAL_PARTS + 2, parts, NULL,
+ NULL))
+ {
+ fprintf(stderr, "0 Can't write var reference defore record #%lu\n",
+ (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+ }
+ int4store(long_tr_id, i);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ trn, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+
+ lsn_base= lsn;
+
+ if ((rec_len= rand_buffer_size()) < 9)
+ rec_len= 9;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+ trn, NULL, rec_len,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+ if (translog_flush(lsn))
+ {
+ fprintf(stderr, "Can't flush #%lu\n", (ulong) i);
+ translog_destroy();
+ ok(0, "flush");
+ exit(1);
+ }
+ ok(1, "flush");
+ }
+
+ if (translog_flush(translog_get_horizon()))
+ {
+ fprintf(stderr, "Can't flush up to horizon\n");
+ translog_destroy();
+ ok(0, "flush");
+ exit(1);
+ }
+ ok(1, "flush");
+
+ srand(122334817L);
+
+ rc= 1;
+
+ {
+ int len= translog_read_record_header(first_lsn, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "translog_read_record_header failed (%d)\n", errno);
+ goto err;
+ }
+ if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 ||
+ rec.record_length != 6 || uint4korr(rec.header) != 0 ||
+ ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF ||
+ first_lsn != rec.lsn)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+ "data read(0)\n"
+ "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, "
+ "lsn(%lu,0x%lx)\n",
+ (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length,
+ (uint) uint4korr(rec.header), (uint) rec.header[4],
+ (uint) rec.header[5],
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ read_ok(&rec);
+ translog_free_record_header(&rec);
+ lsn= first_lsn;
+ if (translog_scanner_init(first_lsn, 1, &scanner, 0))
+ {
+ fprintf(stderr, "scanner init failed\n");
+ goto err;
+ }
+ for (i= 1;; i++)
+ {
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+ i, errno);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ if (i != ITERATIONS)
+ {
+ fprintf(stderr, "EOL met at iteration %u instead of %u\n",
+ i, ITERATIONS);
+ goto err;
+ }
+ break;
+ }
+ if (i % 2)
+ {
+ LSN ref;
+ ref= lsn_korr(rec.header);
+ if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != 7 || ref != lsn)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE "
+ "data read(%d) "
+ "type: %u strid: %u len: %u"
+ "ref: (%lu,0x%lx) (%lu,0x%lx) "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ LSN_IN_PARTS(ref), LSN_IN_PARTS(lsn),
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ }
+ else
+ {
+ LSN ref1, ref2;
+ ref1= lsn_korr(rec.header);
+ ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+ if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != 23 ||
+ ref1 != lsn ||
+ ref2 != first_lsn ||
+ ((uchar)rec.header[22]) != 0x55 ||
+ ((uchar)rec.header[21]) != 0xAA ||
+ ((uchar)rec.header[20]) != 0x55 ||
+ ((uchar)rec.header[19]) != 0xAA ||
+ ((uchar)rec.header[18]) != 0x55 ||
+ ((uchar)rec.header[17]) != 0xAA ||
+ ((uchar)rec.header[16]) != 0x55 ||
+ ((uchar)rec.header[15]) != 0xAA ||
+ ((uchar)rec.header[14]) != 0x55)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE "
+ "data read(%d) "
+ "type %u, strid %u, len %u, ref1(%lu,0x%lx), "
+ "ref2(%lu,0x%lx) %x%x%x%x%x%x%x%x%x "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+ (uint) rec.header[14], (uint) rec.header[15],
+ (uint) rec.header[16], (uint) rec.header[17],
+ (uint) rec.header[18], (uint) rec.header[19],
+ (uint) rec.header[20], (uint) rec.header[21],
+ (uint) rec.header[22],
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ }
+ read_ok(&rec);
+ translog_free_record_header(&rec);
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header (var) "
+ "failed (%d)\n", i, errno);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ fprintf(stderr, "EOL met at the middle of iteration (first var) %u "
+ "instead of beginning of %u\n", i, ITERATIONS);
+ goto err;
+ }
+ if (i % 2)
+ {
+ LSN ref;
+ ref= lsn_korr(rec.header);
+ if ((rec_len= rand_buffer_size()) < 12)
+ rec_len= 12;
+ if (rec.type != LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len + LSN_STORE_SIZE ||
+ len != 12 || ref != lsn ||
+ check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+ "data read(%d)"
+ "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), "
+ "hdr len: %u (%d), "
+ "ref(%lu,0x%lx), lsn(%lu,0x%lx) (%d), content: %d\n",
+ i, (uint) rec.type,
+ rec.type != LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+ (uint) rec.short_trid,
+ rec.short_trid != (i % 0xFFFF),
+ (ulong) rec.record_length, (ulong) rec_len,
+ rec.record_length != rec_len + LSN_STORE_SIZE,
+ (uint) len,
+ len != 12,
+ LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn),
+ (len != 12 || ref != lsn),
+ check_content(rec.header + LSN_STORE_SIZE,
+ len - LSN_STORE_SIZE));
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ }
+ else
+ {
+ LSN ref1, ref2;
+ ref1= lsn_korr(rec.header);
+ ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+ if ((rec_len= rand_buffer_size()) < 19)
+ rec_len= 19;
+ if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len + LSN_STORE_SIZE * 2 ||
+ len != 19 ||
+ ref1 != lsn ||
+ ref2 != first_lsn ||
+ check_content(rec.header + LSN_STORE_SIZE * 2,
+ len - LSN_STORE_SIZE * 2))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ "data read(%d) "
+ "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, "
+ "ref1(%lu,0x%lx), ref2(%lu,0x%lx), "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (ulong) rec.record_length, (ulong) rec_len,
+ len, LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE * 2))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ }
+ read_ok(&rec);
+ translog_free_record_header(&rec);
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+ i, errno);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ fprintf(stderr, "EOL met at the middle of iteration %u "
+ "instead of beginning of %u\n", i, ITERATIONS);
+ goto err;
+ }
+ if (rec.type != LOGREC_FIXED_RECORD_0LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != 6 || uint4korr(rec.header) != i ||
+ ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+ "data read(%d)\n"
+ "type %u, strid %u, len %u, i: %u, 4: %u 5: %u "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ (uint) uint4korr(rec.header), (uint) rec.header[4],
+ (uint) rec.header[5],
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ lsn= rec.lsn;
+ read_ok(&rec);
+ translog_free_record_header(&rec);
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ if ((rec_len= rand_buffer_size()) < 9)
+ rec_len= 9;
+ if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len ||
+ len != 9 || check_content(rec.header, (uint)len))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
+ "data read(%d) "
+ "type %u, strid %u, len %lu != %lu, hdr len: %d, "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (ulong) rec.record_length, (ulong) rec_len,
+ len, LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, 0))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ goto err;
+ }
+ read_ok(&rec);
+ translog_free_record_header(&rec);
+ }
+ }
+
+ rc= 0;
+err:
+ if (rc)
+ ok(0, "read record");
+ } /* SKIP_BIG_TESTS */
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+
+ if (maria_log_remove())
+ exit(1);
+
+ return(test(exit_status()));
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c
new file mode 100644
index 00000000000..06d9a00c04c
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c
@@ -0,0 +1,160 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void translog_example_table_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define LOG_FLAGS 0
+
+static char *first_translog_file= (char*)"maria_log.00000001";
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ uint pagen;
+ uchar long_tr_id[6];
+ PAGECACHE pagecache;
+ LSN lsn, first_lsn, theor_lsn;
+ LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+
+ MY_INIT(argv[0]);
+
+ plan(2);
+
+ bzero(&pagecache, sizeof(pagecache));
+ maria_data_root= (char *)".";
+ if (maria_log_remove())
+ exit(1);
+ /* be sure that we have no logs in the directory*/
+ my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
+ my_delete(first_translog_file, MYF(0));
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_open(TRUE, TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PCACHE_PAGE, 0)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+ LOG_FLAGS, 0, &translog_example_table_init,
+ 0))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ exit(1);
+ }
+ /* Suppressing of automatic record writing */
+ dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ theor_lsn= translog_first_theoretical_lsn();
+ if (theor_lsn == 1)
+ {
+ fprintf(stderr, "Error reading the first log file.");
+ translog_destroy();
+ exit(1);
+ }
+ if (theor_lsn == LSN_IMPOSSIBLE)
+ {
+ fprintf(stderr, "There is no first log file.");
+ translog_destroy();
+ exit(1);
+ }
+ first_lsn= translog_first_lsn_in_log();
+ if (first_lsn != LSN_IMPOSSIBLE)
+ {
+ fprintf(stderr, "Incorrect first lsn response (%lu,0x%lx).",
+ LSN_IN_PARTS(first_lsn));
+ translog_destroy();
+ exit(1);
+ }
+ ok(1, "Empty log response");
+
+
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ exit(1);
+ }
+
+ theor_lsn= translog_first_theoretical_lsn();
+ if (theor_lsn == 1)
+ {
+ fprintf(stderr, "Error reading the first log file\n");
+ translog_destroy();
+ exit(1);
+ }
+ if (theor_lsn == LSN_IMPOSSIBLE)
+ {
+ fprintf(stderr, "There is no first log file\n");
+ translog_destroy();
+ exit(1);
+ }
+ first_lsn= translog_first_lsn_in_log();
+ if (first_lsn != theor_lsn)
+ {
+ fprintf(stderr, "Incorrect first lsn: (%lu,0x%lx) "
+ " theoretical first: (%lu,0x%lx)\n",
+ LSN_IN_PARTS(first_lsn), LSN_IN_PARTS(theor_lsn));
+ translog_destroy();
+ exit(1);
+ }
+
+ ok(1, "Full log response");
+
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+ if (maria_log_remove())
+ exit(1);
+ exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c
new file mode 100644
index 00000000000..64f486b8cf1
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c
@@ -0,0 +1,156 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void translog_example_table_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (8*1024L*1024L)
+#define LOG_FLAGS 0
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ ulong i;
+ uint pagen;
+ uchar long_tr_id[6];
+ PAGECACHE pagecache;
+ LSN lsn, max_lsn, last_lsn= LSN_IMPOSSIBLE;
+ LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+
+ MY_INIT(argv[0]);
+
+ plan(2);
+
+ bzero(&pagecache, sizeof(pagecache));
+ maria_data_root= (char *)".";
+ if (maria_log_remove())
+ exit(1);
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_open(TRUE, TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PCACHE_PAGE, 0)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+ LOG_FLAGS, 0, &translog_example_table_init,
+ 0))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ exit(1);
+ }
+ /* Suppressing of automatic record writing */
+ dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ max_lsn= translog_get_file_max_lsn_stored(1);
+ if (max_lsn == 1)
+ {
+ fprintf(stderr, "Error reading the first log file.");
+ translog_destroy();
+ exit(1);
+ }
+ if (max_lsn != LSN_IMPOSSIBLE)
+ {
+ fprintf(stderr, "Incorrect first lsn response (%lu,0x%lx).",
+ LSN_IN_PARTS(max_lsn));
+ translog_destroy();
+ exit(1);
+ }
+ ok(1, "Empty log response");
+
+
+  /* write more than 1 file */
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ for(i= 0; i < LOG_FILE_SIZE/6; i++)
+ {
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ exit(1);
+ }
+ if (LSN_FILE_NO(lsn) == 1)
+ last_lsn= lsn;
+ }
+
+
+ max_lsn= translog_get_file_max_lsn_stored(1);
+ if (max_lsn == 1)
+ {
+ fprintf(stderr, "Error reading the first log file\n");
+ translog_destroy();
+ exit(1);
+ }
+ if (max_lsn == LSN_IMPOSSIBLE)
+ {
+ fprintf(stderr, "Isn't first file still finished?!!\n");
+ translog_destroy();
+ exit(1);
+ }
+ if (max_lsn != last_lsn)
+ {
+ fprintf(stderr, "Incorrect max lsn: (%lu,0x%lx) "
+ " last lsn on first file: (%lu,0x%lx)\n",
+ LSN_IN_PARTS(max_lsn), LSN_IN_PARTS(last_lsn));
+ translog_destroy();
+ exit(1);
+ }
+
+ ok(1, "First file max LSN");
+
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+ if (maria_log_remove())
+ exit(1);
+ exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_multigroup-t.c b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c
new file mode 100644
index 00000000000..7ba7ce3176d
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c
@@ -0,0 +1,746 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+#include "sequence_storage.h"
+#include <my_getopt.h>
+
+extern my_bool maria_log_remove();
+extern void translog_example_table_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+static TRN *trn= &dummy_transaction_object;
+
+
+#ifndef READONLY_TEST
+
+#define PCACHE_SIZE (1024*1024*10)
+#define LONG_BUFFER_SIZE ((1024L*1024L*1024L) + (1024L*1024L*512))
+#define MIN_REC_LENGTH (1024L*1024L + 1024L*512L + 1)
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define ITERATIONS 2
+#define READONLY 0
+
+#else
+
+#define PCACHE_SIZE (1024*1024*10)
+#define LONG_BUFFER_SIZE (1024L*1024L)
+#define MIN_REC_LENGTH (1024L)
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define ITERATIONS 2
+#define READONLY 1
+
+#endif /*READONLY_TEST*/
+
+
+/*
+#define LOG_FILE_SIZE 1024L*1024L*3L
+#define ITERATIONS 1600
+*/
+/*
+#define LOG_FILE_SIZE 1024L*1024L*100L
+#define ITERATIONS 65000
+*/
+
+
+/*
+ Check that the buffer filled correctly
+
+ SYNOPSIS
+ check_content()
+ ptr Pointer to the buffer
+ length length of the buffer
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool check_content(uchar *ptr, ulong length)
+{
+ ulong i;
+ uchar buff[4];
+ DBUG_ENTER("check_content");
+ for (i= 0; i < length; i++)
+ {
+ if (i % 4 == 0)
+ int4store(buff, (i >> 2));
+ if (ptr[i] != buff[i % 4])
+ {
+ fprintf(stderr, "Byte # %lu is %x instead of %x",
+ i, (uint) ptr[i], (uint) buff[i % 4]);
+ DBUG_DUMP("mem", ptr +(ulong) (i > 16 ? i - 16 : 0),
+ (i > 16 ? 16 : i) + (i + 16 < length ? 16 : length - i));
+ DBUG_RETURN(1);
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Read whole record content, and check content (put with offset)
+
+ SYNOPSIS
+ read_and_check_content()
+ rec The record header buffer
+ buffer The buffer to read the record in
+    skip             Skip this number of bytes of the record content
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec,
+ uchar *buffer, uint skip)
+{
+ int res= 0;
+ translog_size_t len;
+ DBUG_ENTER("read_and_check_content");
+ DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2);
+ if ((len= translog_read_record(rec->lsn, 0, rec->record_length,
+ buffer, NULL)) != rec->record_length)
+ {
+ fprintf(stderr, "Requested %lu byte, read %lu\n",
+ (ulong) rec->record_length, (ulong) len);
+ res= 1;
+ }
+ res|= check_content(buffer + skip, rec->record_length - skip);
+ DBUG_RETURN(res);
+}
+
+static const char *load_default_groups[]= {"ma_unit_loghandler", 0};
+#ifndef DBUG_OFF
+static const char *default_dbug_option=
+ IF_WIN("d:t:i:O,\\ma_test_loghandler.trace",
+ "d:t:i:o,/tmp/ma_test_loghandler.trace");
+#endif
+static const char *opt_wfile= NULL;
+static const char *opt_rfile= NULL;
+static struct my_option my_long_options[] =
+{
+#ifndef DBUG_OFF
+ {"debug", '#', "Output debug log. Often the argument is 'd:t:o,filename'.",
+ 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ {"write-seq", 'w', "Path to file in which \"random\" sequence used in the test will be written",
+ (uchar**) &opt_wfile, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"read-seq", 'r', "Path to file from which \"random\" sequence used in the test will be read",
+ (uchar**) &opt_rfile, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"help", '?', "Display this help and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+static SEQ_STORAGE seq;
+
+static uint32 get_len()
+{
+ uint32 res;
+ DBUG_ENTER("get_len");
+ if (opt_rfile)
+ res= seq_storage_next(&seq);
+ else
+ {
+ res= (uint32)
+ ((ulonglong) rand() *
+ (LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1) / RAND_MAX) + MIN_REC_LENGTH;
+ if (opt_wfile &&
+ seq_storage_write(opt_wfile, res))
+ exit(1);
+ }
+ DBUG_PRINT("info", ("length value : %lu", (ulong) res));
+ DBUG_RETURN(res);
+}
+
+static void usage(void)
+{
+ puts("Copyright (C) 2008 MySQL AB");
+ puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
+ puts("and you are welcome to modify and redistribute it under the GPL license\n");
+
+ puts("Unit test of maria engine");
+ VOID(printf("\nUsage: %s [OPTIONS]\n", my_progname_short));
+ my_print_help(my_long_options);
+ print_defaults("my", load_default_groups);
+ my_print_variables(my_long_options);
+}
+
+
+static my_bool
+get_one_option(int optid __attribute__((unused)),
+ const struct my_option *opt __attribute__((unused)),
+ char *argument __attribute__((unused)))
+{
+ switch (optid) {
+ case '?':
+ usage();
+ exit(0);
+#ifndef DBUG_OFF
+ case '#':
+ DBUG_SET_INITIAL(argument ? argument : default_dbug_option);
+ break;
+#endif
+ }
+ return 0;
+}
+
+
+static void get_options(int *argc,char ***argv)
+{
+ int ho_error;
+
+ if ((ho_error= handle_options(argc, argv, my_long_options, get_one_option)))
+ exit(ho_error);
+
+ if (opt_rfile && opt_wfile)
+ {
+ usage();
+ exit(1);
+ }
+}
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ uint32 i;
+ uint32 rec_len;
+ uint pagen;
+ uchar long_tr_id[6];
+ uchar lsn_buff[23]=
+ {
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+ 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55
+ };
+ uchar *long_buffer= malloc(LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2);
+ char **default_argv;
+ PAGECACHE pagecache;
+ LSN lsn, lsn_base, first_lsn;
+ TRANSLOG_HEADER_BUFFER rec;
+ LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 2];
+ struct st_translog_scanner_data scanner;
+ int rc;
+
+ MY_INIT(argv[0]);
+
+ bzero(&pagecache, sizeof(pagecache));
+ maria_data_root= (char *)".";
+ load_defaults("my", load_default_groups, &argc, &argv);
+ default_argv= argv;
+ get_options(&argc, &argv);
+
+ if (maria_log_remove())
+ exit(1);
+
+ {
+ uchar buff[4];
+ for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i++)
+ {
+ if (i % 4 == 0)
+ int4store(buff, (i >> 2));
+ long_buffer[i]= buff[i % 4];
+ }
+ }
+
+ bzero(long_tr_id, 6);
+
+ if (ma_control_file_open(TRUE, TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE, 0)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+ 0, 0, &translog_example_table_init, 0))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ exit(1);
+ }
+ /* Suppressing of automatic record writing */
+ trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ plan(((ITERATIONS - 1) * 4 + 1) * 2);
+
+ if (opt_rfile &&
+ seq_storage_reader_init(&seq, opt_rfile))
+ exit(1);
+ srand(122334817L);
+
+ long_tr_id[5]= 0xff;
+
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ trn->short_id= 0;
+ trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+ if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1, parts,
+ NULL, NULL))
+ {
+ fprintf(stderr, "Can't write record #%u\n", 0);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ lsn_base= first_lsn= lsn;
+
+ for (i= 1; i < ITERATIONS; i++)
+ {
+ if (i % 2)
+ {
+ lsn_store(lsn_buff, lsn_base);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_1LSN_EXAMPLE, trn, NULL,
+ LSN_STORE_SIZE, TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "1 Can't write reference before record #%u\n", i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+ lsn_store(lsn_buff, lsn_base);
+ rec_len= get_len();
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+ trn, NULL, LSN_STORE_SIZE + rec_len,
+ TRANSLOG_INTERNAL_PARTS + 2,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "1 Can't write var reference before record #%u\n", i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+ }
+ else
+ {
+ lsn_store(lsn_buff, lsn_base);
+ lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= 23;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_2LSN_EXAMPLE,
+ trn, NULL, 23, TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "0 Can't write reference before record #%u\n", i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+ lsn_store(lsn_buff, lsn_base);
+ lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+ rec_len= get_len();
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE * 2;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE,
+ trn, NULL, LSN_STORE_SIZE * 2 + rec_len,
+ TRANSLOG_INTERNAL_PARTS + 2,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "0 Can't write var reference before record #%u\n", i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+ }
+ int4store(long_tr_id, i);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ trn, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write record #%u\n", i);
+ translog_destroy();
+ ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+
+ lsn_base= lsn;
+
+ rec_len= get_len();
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len;
+ trn->short_id= i % 0xFFFF;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+ trn, NULL, rec_len,
+ TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write variable record #%u\n", i);
+ translog_destroy();
+ ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+ exit(1);
+ }
+ ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+ }
+
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+
+ if (ma_control_file_open(TRUE,TRUE))
+ {
+ fprintf(stderr, "pass2: Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ TRANSLOG_PAGE_SIZE, 0)) == 0)
+ {
+ fprintf(stderr, "pass2: Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+ 0, READONLY, &translog_example_table_init, 0))
+ {
+ fprintf(stderr, "pass2: Can't init loghandler (%d)\n", errno);
+ exit(1);
+ }
+
+
+ /* If we were writing sequence we need it only once */
+ opt_wfile= NULL;
+ if (opt_rfile)
+ seq_storage_rewind(&seq);
+ srand(122334817L);
+
+ rc= 1;
+
+ {
+ int len= translog_read_record_header(first_lsn, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "translog_read_record_header failed (%d)\n", errno);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 ||
+ rec.record_length != 6 || uint4korr(rec.header) != 0 ||
+ ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF ||
+ first_lsn != rec.lsn)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+ "data read(0)\n"
+ "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, "
+ "lsn(0x%lu,0x%lx)\n",
+ (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length,
+ (uint)uint4korr(rec.header), (uint) rec.header[4],
+ (uint) rec.header[5],
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ ok(1, "read record");
+ translog_free_record_header(&rec);
+ lsn= first_lsn;
+ if (translog_scanner_init(first_lsn, 1, &scanner, 0))
+ {
+ fprintf(stderr, "scanner init failed\n");
+ goto err;
+ }
+ for (i= 1;; i++)
+ {
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+ i, errno);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ if (i != ITERATIONS)
+ {
+ fprintf(stderr, "EOL met at iteration %u instead of %u\n",
+ i, ITERATIONS);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ break;
+ }
+
+ if (i % 2)
+ {
+ LSN ref;
+ ref= lsn_korr(rec.header);
+ if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != LSN_STORE_SIZE || ref != lsn)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE "
+ "data read(%d)"
+ "type %u, strid %u, len %u, ref(%lu,0x%lx), lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ }
+ else
+ {
+ LSN ref1, ref2;
+ ref1= lsn_korr(rec.header);
+ ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+ if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != 23 ||
+ ref1 != lsn ||
+ ref2 != first_lsn ||
+ ((uchar)rec.header[22]) != 0x55 ||
+ ((uchar)rec.header[21]) != 0xAA ||
+ ((uchar)rec.header[20]) != 0x55 ||
+ ((uchar)rec.header[19]) != 0xAA ||
+ ((uchar)rec.header[18]) != 0x55 ||
+ ((uchar)rec.header[17]) != 0xAA ||
+ ((uchar)rec.header[16]) != 0x55 ||
+ ((uchar)rec.header[15]) != 0xAA ||
+ ((uchar)rec.header[14]) != 0x55)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE "
+ "data read(%d) "
+ "type %u, strid %u, len %u, ref1(%lu,0x%lx), "
+ "ref2(%lu,0x%lx) %x%x%x%x%x%x%x%x%x "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+ (uint) rec.header[14], (uint) rec.header[15],
+ (uint) rec.header[16], (uint) rec.header[17],
+ (uint) rec.header[18], (uint) rec.header[19],
+ (uint) rec.header[20], (uint) rec.header[21],
+ (uint) rec.header[22],
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ DBUG_ASSERT(0);
+ goto err;
+ }
+ }
+ ok(1, "read record");
+ translog_free_record_header(&rec);
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header (var) "
+ "failed (%d)\n", i, errno);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ fprintf(stderr, "EOL met at the middle of iteration (first var) %u "
+ "instead of beginning of %u\n", i, ITERATIONS);
+ goto err;
+ }
+ if (i % 2)
+ {
+ LSN ref;
+ ref= lsn_korr(rec.header);
+ rec_len= get_len();
+ if (rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len + LSN_STORE_SIZE ||
+ len != 12 || ref != lsn ||
+ check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+ "data read(%d)"
+ "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), "
+ "hdr len: %d (%d), "
+ "ref(%lu,0x%lx), lsn(%lu,0x%lx) (%d), content: %d\n",
+ i, (uint) rec.type,
+ rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+ (uint) rec.short_trid,
+ rec.short_trid != (i % 0xFFFF),
+ (ulong) rec.record_length, (ulong) rec_len,
+ rec.record_length != rec_len + LSN_STORE_SIZE,
+ len,
+ len != 12,
+ LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn),
+ (ref != lsn),
+ check_content(rec.header + LSN_STORE_SIZE,
+ len - LSN_STORE_SIZE));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ }
+ else
+ {
+ LSN ref1, ref2;
+ ref1= lsn_korr(rec.header);
+ ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+ rec_len= get_len();
+ if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len + LSN_STORE_SIZE * 2 ||
+ len != 19 ||
+ ref1 != lsn ||
+ ref2 != first_lsn ||
+ check_content(rec.header + LSN_STORE_SIZE * 2,
+ len - LSN_STORE_SIZE * 2))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ " data read(%d) "
+ "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, "
+ "ref1(%lu,0x%lx), ref2(%lu,0x%lx), "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (ulong) rec.record_length, (ulong) rec_len,
+ len,
+ LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE * 2))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ }
+ ok(1, "read record");
+ translog_free_record_header(&rec);
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ if (len == RECHEADER_READ_ERROR)
+ {
+ fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+ i, errno);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (len == RECHEADER_READ_EOF)
+ {
+ fprintf(stderr, "EOL met at the middle of iteration %u "
+ "instead of beginning of %u\n", i, ITERATIONS);
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (rec.type != LOGREC_FIXED_RECORD_0LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != 6 || uint4korr(rec.header) != i ||
+ ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF)
+ {
+ fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+ "data read(%d)\n"
+ "type %u, strid %u, len %u, i: %u, 4: %u 5: %u "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (uint) rec.record_length,
+ (uint)uint4korr(rec.header), (uint) rec.header[4],
+ (uint) rec.header[5],
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ ok(1, "read record");
+ translog_free_record_header(&rec);
+
+ lsn= rec.lsn;
+
+ len= translog_read_next_record_header(&scanner, &rec);
+ rec_len= get_len();
+ if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE ||
+ rec.short_trid != (i % 0xFFFF) ||
+ rec.record_length != rec_len ||
+ len != 9 || check_content(rec.header, len))
+ {
+ fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
+ "data read(%d) "
+ "type %u, strid %u, len %lu != %lu, hdr len: %d, "
+ "lsn(%lu,0x%lx)\n",
+ i, (uint) rec.type, (uint) rec.short_trid,
+ (ulong) rec.record_length, (ulong) rec_len,
+ len, LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ if (read_and_check_content(&rec, long_buffer, 0))
+ {
+ fprintf(stderr,
+ "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+ "in whole rec read lsn(%lu,0x%lx)\n",
+ LSN_IN_PARTS(rec.lsn));
+ translog_free_record_header(&rec);
+ goto err;
+ }
+ ok(1, "read record");
+ translog_free_record_header(&rec);
+ }
+ }
+
+ rc= 0;
+err:
+ if (rc)
+ ok(0, "read record");
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+ free_defaults(default_argv);
+ seq_storage_destroy(&seq);
+ if (maria_log_remove())
+ exit(1);
+
+ return (test(exit_status()));
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_multithread-t.c b/storage/maria/unittest/ma_test_loghandler_multithread-t.c
new file mode 100644
index 00000000000..354f5d12e08
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_multithread-t.c
@@ -0,0 +1,556 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void translog_example_table_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+/*#define LOG_FLAGS TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC */
+#define LOG_FLAGS 0
+/*#define LONG_BUFFER_SIZE (1024L*1024L*1024L + 1024L*1024L*512)*/
+
+#ifdef MULTIFLUSH_TEST
+
+#define LONG_BUFFER_SIZE (16384L)
+#define MIN_REC_LENGTH 10
+#define SHOW_DIVIDER 20
+#define ITERATIONS 10000
+#define FLUSH_ITERATIONS 1000
+#define WRITERS 2
+#define FLUSHERS 10
+
+#else
+
+#define LONG_BUFFER_SIZE (512L*1024L*1024L)
+#define MIN_REC_LENGTH 30
+#define SHOW_DIVIDER 10
+#define ITERATIONS 3
+#define FLUSH_ITERATIONS 0
+#define WRITERS 3
+#define FLUSHERS 0
+
+#endif
+
+static uint number_of_writers= WRITERS;
+static uint number_of_flushers= FLUSHERS;
+
+static pthread_cond_t COND_thread_count;
+static pthread_mutex_t LOCK_thread_count;
+static uint thread_count;
+
+static ulong lens[WRITERS][ITERATIONS];
+static LSN lsns1[WRITERS][ITERATIONS];
+static LSN lsns2[WRITERS][ITERATIONS];
+static uchar *long_buffer;
+
+
+static LSN last_lsn; /* For test purposes the variable allows dirty read/write */
+
+/*
+ Get pseudo-random length of the field in
+ limits [MIN_REC_LENGTH..LONG_BUFFER_SIZE]
+
+ SYNOPSIS
+ get_len()
+
+ RETURN
+    length - MIN_REC_LENGTH <= length <= LONG_BUFFER_SIZE
+*/
+
+static uint32 get_len()
+{
+ return MIN_REC_LENGTH +
+ (uint32)(((ulonglong)rand())*
+ (LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1)/RAND_MAX);
+}
+
+
+/*
+ Check that the buffer filled correctly
+
+ SYNOPSIS
+ check_content()
+ ptr Pointer to the buffer
+ length length of the buffer
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+static my_bool check_content(uchar *ptr, ulong length)
+{
+ ulong i;
+ for (i= 0; i < length; i++)
+ {
+ if (((uchar)ptr[i]) != (i & 0xFF))
+ {
+ fprintf(stderr, "Byte # %lu is %x instead of %x",
+ i, (uint) ptr[i], (uint) (i & 0xFF));
+ return 1;
+ }
+ }
+ return 0;
+}
+
+
+/*
+ Read whole record content, and check content (put with offset)
+
+ SYNOPSIS
+ read_and_check_content()
+ rec The record header buffer
+ buffer The buffer to read the record in
+    skip             Skip this number of bytes of the record content
+
+ RETURN
+ 0 - OK
+ 1 - Error
+*/
+
+
+static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec,
+ uchar *buffer, uint skip)
+{
+ int res= 0;
+ translog_size_t len;
+
+ if ((len= translog_read_record(rec->lsn, 0, rec->record_length,
+ buffer, NULL)) != rec->record_length)
+ {
+ fprintf(stderr, "Requested %lu byte, read %lu\n",
+ (ulong) rec->record_length, (ulong) len);
+ res= 1;
+ }
+ res|= check_content(buffer + skip, rec->record_length - skip);
+ return(res);
+}
+
+void writer(int num)
+{
+ LSN lsn;
+ TRN trn;
+ uchar long_tr_id[6];
+ uint i;
+
+ trn.short_id= num;
+ trn.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+ for (i= 0; i < ITERATIONS; i++)
+ {
+ uint len= get_len();
+ LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+ lens[num][i]= len;
+
+ int2store(long_tr_id, num);
+ int4store(long_tr_id + 2, i);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write LOGREC_FIXED_RECORD_0LSN_EXAMPLE record #%lu "
+ "thread %i\n", (ulong) i, num);
+ translog_destroy();
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(0, "write records");
+ pthread_mutex_unlock(&LOCK_thread_count);
+ return;
+ }
+ lsns1[num][i]= lsn;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= len;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+ &trn, NULL,
+ len, TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i);
+ translog_destroy();
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(0, "write records");
+ pthread_mutex_unlock(&LOCK_thread_count);
+ return;
+ }
+ lsns2[num][i]= lsn;
+ last_lsn= lsn;
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(1, "write records");
+ pthread_mutex_unlock(&LOCK_thread_count);
+ }
+ return;
+}
+
+
+static void *test_thread_writer(void *arg)
+{
+ int param= *((int*) arg);
+
+ my_thread_init();
+
+ writer(param);
+
+ pthread_mutex_lock(&LOCK_thread_count);
+ thread_count--;
+ ok(1, "writer finished"); /* just to show progress */
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are
+ ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ return(0);
+}
+
+
+static void *test_thread_flusher(void *arg)
+{
+ int param= *((int*) arg);
+ int i;
+
+ my_thread_init();
+
+ for(i= 0; i < FLUSH_ITERATIONS; i++)
+ {
+ translog_flush(last_lsn);
+ pthread_mutex_lock(&LOCK_thread_count);
+ ok(1, "-- flush %d", param);
+ pthread_mutex_unlock(&LOCK_thread_count);
+ }
+
+ pthread_mutex_lock(&LOCK_thread_count);
+ thread_count--;
+ ok(1, "flusher finished"); /* just to show progress */
+ VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are
+ ready */
+ pthread_mutex_unlock(&LOCK_thread_count);
+ free((uchar*) arg);
+ my_thread_end();
+ return(0);
+}
+
+
/*
  Multi-threaded log handler test driver.

  Starts WRITERS writer threads and FLUSHERS flusher threads against one
  transaction log, waits for them all, flushes up to the highest LSN the
  writers recorded, then scans the whole log and verifies every record
  (two records per writer iteration: one fixed-size, one variable-size)
  against the per-thread lsns1/lsns2/lens arrays filled in by writer().

  NOTE: the test is currently DISABLED — the plan(1)/ok(1)/exit(0) sequence
  right below returns success unconditionally; everything after exit(0) is
  dead code kept for when the test is re-enabled.
*/
int main(int argc __attribute__((unused)),
         char **argv __attribute__ ((unused)))
{
  uint32 i;
  uint pagen;
  PAGECACHE pagecache;
  LSN first_lsn;
  TRANSLOG_HEADER_BUFFER rec;
  struct st_translog_scanner_data scanner;
  pthread_t tid;
  pthread_attr_t thr_attr;
  int *param, error;
  int rc;

  /* Disabled until Sanja tests */
  plan(1);
  ok(1, "disabled");
  exit(0);

  /* Dead code from here on (see note above). */
  plan(WRITERS + FLUSHERS +
       ITERATIONS * WRITERS * 3 + FLUSH_ITERATIONS * FLUSHERS );

  bzero(&pagecache, sizeof(pagecache));
  maria_data_root= (char *)".";
  /* Extra bytes allow records to be padded/aligned past LONG_BUFFER_SIZE. */
  long_buffer= malloc(LONG_BUFFER_SIZE + 7 * 2 + 2);
  if (long_buffer == 0)
  {
    fprintf(stderr, "End of memory\n");
    exit(1);
  }
  /* Deterministic byte pattern so readers can verify record contents. */
  for (i= 0; i < (LONG_BUFFER_SIZE + 7 * 2 + 2); i++)
    long_buffer[i]= (i & 0xFF);

  MY_INIT(argv[0]);
  if (maria_log_remove())
    exit(1);


#ifndef DBUG_OFF
#if defined(__WIN__)
  default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
#else
  default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
#endif
  if (argc > 1)
  {
    DBUG_SET(default_dbug_option);
    DBUG_SET_INITIAL(default_dbug_option);
  }
#endif


  if ((error= pthread_cond_init(&COND_thread_count, NULL)))
  {
    fprintf(stderr, "COND_thread_count: %d from pthread_cond_init "
            "(errno: %d)\n", error, errno);
    exit(1);
  }
  if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
  {
    fprintf(stderr, "LOCK_thread_count: %d from pthread_cond_init "
            "(errno: %d)\n", error, errno);
    exit(1);
  }
  if ((error= pthread_attr_init(&thr_attr)))
  {
    fprintf(stderr, "Got error: %d from pthread_attr_init "
            "(errno: %d)\n", error, errno);
    exit(1);
  }
  /* Workers are detached: completion is tracked via thread_count/cond var,
     not pthread_join(). */
  if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED)))
  {
    fprintf(stderr,
            "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
            error, errno);
    exit(1);
  }

#ifdef HAVE_THR_SETCONCURRENCY
  VOID(thr_setconcurrency(2));
#endif

  my_thread_global_init();

  if (ma_control_file_open(TRUE, TRUE))
  {
    fprintf(stderr, "Can't init control file (%d)\n", errno);
    exit(1);
  }
  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
                             TRANSLOG_PAGE_SIZE, 0)) == 0)
  {
    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
    exit(1);
  }
  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
                               LOG_FLAGS, 0, &translog_example_table_init,
                               0))
  {
    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
    exit(1);
  }
  /* Suppressing of automatic record writing */
  dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;

  srand(122334817L); /* fixed seed: run must be reproducible */
  {
    LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
    uchar long_tr_id[6]=
    {
      0x11, 0x22, 0x33, 0x44, 0x55, 0x66
    };

    /* Write a sentinel first record; its LSN is the scan starting point. */
    parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
    parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
    dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
    if (translog_write_record(&first_lsn,
                              LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
                              &dummy_transaction_object, NULL, 6,
                              TRANSLOG_INTERNAL_PARTS + 1,
                              parts, NULL, NULL))
    {
      fprintf(stderr, "Can't write the first record\n");
      translog_destroy();
      exit(1);
    }
  }


  /* Spawn all workers; each gets its id via a heap int it will free. */
  pthread_mutex_lock(&LOCK_thread_count);
  while (number_of_writers != 0 || number_of_flushers != 0)
  {
    if (number_of_writers)
    {
      param= (int*) malloc(sizeof(int));
      *param= number_of_writers - 1;
      if ((error= pthread_create(&tid, &thr_attr, test_thread_writer,
                                 (void*) param)))
      {
        fprintf(stderr, "Got error: %d from pthread_create (errno: %d)\n",
                error, errno);
        exit(1);
      }
      thread_count++;
      number_of_writers--;
    }
    if (number_of_flushers)
    {
      param= (int*) malloc(sizeof(int));
      *param= number_of_flushers - 1;
      if ((error= pthread_create(&tid, &thr_attr, test_thread_flusher,
                                 (void*) param)))
      {
        fprintf(stderr, "Got error: %d from pthread_create (errno: %d)\n",
                error, errno);
        exit(1);
      }
      thread_count++;
      number_of_flushers--;
    }
  }
  pthread_mutex_unlock(&LOCK_thread_count);

  pthread_attr_destroy(&thr_attr);

  /* wait finishing */
  pthread_mutex_lock(&LOCK_thread_count);
  while (thread_count)
  {
    if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count)))
      fprintf(stderr, "COND_thread_count: %d from pthread_cond_wait\n", error);
  }
  pthread_mutex_unlock(&LOCK_thread_count);

  /* Find last LSN and flush up to it (all our log) */
  {
    LSN max= 0;
    for (i= 0; i < WRITERS; i++)
    {
      if (cmp_translog_addr(lsns2[i][ITERATIONS - 1], max) > 0)
        max= lsns2[i][ITERATIONS - 1];
    }
    translog_flush(max);
  }

  rc= 1; /* assume failure until the scan completes */

  {
    /* indeces[trid] counts records seen per writer; /2 gives the iteration,
       %2 the stage (0 = fixed record, 1 = variable record). */
    uint indeces[WRITERS];
    uint index, stage;
    int len;
    bzero(indeces, sizeof(uint) * WRITERS);

    /* NOTE(review): this second bzero duplicates the one above (same span);
       one of the two is redundant. */
    bzero(indeces, sizeof(indeces));

    if (translog_scanner_init(first_lsn, 1, &scanner, 0))
    {
      fprintf(stderr, "scanner init failed\n");
      goto err;
    }
    /* Scan every record written by the workers and verify it against the
       values the writers stored in lsns1/lsns2/lens. */
    for (i= 0;; i++)
    {
      len= translog_read_next_record_header(&scanner, &rec);

      if (len == RECHEADER_READ_ERROR)
      {
        fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
                i, errno);
        translog_free_record_header(&rec);
        goto err;
      }
      if (len == RECHEADER_READ_EOF)
      {
        /* Each writer wrote 2 records per iteration. */
        if (i != WRITERS * ITERATIONS * 2)
        {
          fprintf(stderr, "EOL met at iteration %u instead of %u\n",
                  i, ITERATIONS * WRITERS * 2);
          translog_free_record_header(&rec);
          goto err;
        }
        break;
      }
      index= indeces[rec.short_trid] / 2;
      stage= indeces[rec.short_trid] % 2;
      if (stage == 0)
      {
        /* Fixed record: header encodes (short_trid, iteration index). */
        if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE ||
            rec.record_length != 6 ||
            uint2korr(rec.header) != rec.short_trid ||
            index != uint4korr(rec.header + 2) ||
            cmp_translog_addr(lsns1[rec.short_trid][index], rec.lsn) != 0)
        {
          fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
                  "data read(%d)\n"
                  "type %u, strid %u %u, len %u, i: %u %u, "
                  "lsn(%lu,0x%lx) (%lu,0x%lx)\n",
                  i, (uint) rec.type,
                  (uint) rec.short_trid, (uint) uint2korr(rec.header),
                  (uint) rec.record_length,
                  (uint) index, (uint) uint4korr(rec.header + 2),
                  LSN_IN_PARTS(rec.lsn),
                  LSN_IN_PARTS(lsns1[rec.short_trid][index]));
          translog_free_record_header(&rec);
          goto err;
        }
      }
      else
      {
        /* Variable record: length and content must match what the writer
           recorded; check_content verifies the byte pattern. */
        if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE ||
            len != 9 ||
            rec.record_length != lens[rec.short_trid][index] ||
            cmp_translog_addr(lsns2[rec.short_trid][index], rec.lsn) != 0 ||
            check_content(rec.header, (uint)len))
        {
          fprintf(stderr,
                  "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
                  "data read(%d) "
                  "thread: %d, iteration %d, stage %d\n"
                  "type %u (%d), len %d, length %lu %lu (%d) "
                  "lsn(%lu,0x%lx) (%lu,0x%lx)\n",
                  i, (uint) rec.short_trid, index, stage,
                  (uint) rec.type, (rec.type !=
                                    LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE),
                  len,
                  (ulong) rec.record_length, lens[rec.short_trid][index],
                  (rec.record_length != lens[rec.short_trid][index]),
                  LSN_IN_PARTS(rec.lsn),
                  LSN_IN_PARTS(lsns2[rec.short_trid][index]));
          translog_free_record_header(&rec);
          goto err;
        }
        if (read_and_check_content(&rec, long_buffer, 0))
        {
          fprintf(stderr,
                  "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
                  "in whole rec read lsn(%lu,0x%lx)\n",
                  LSN_IN_PARTS(rec.lsn));
          translog_free_record_header(&rec);
          goto err;
        }
      }
      ok(1, "record read");
      /* rec.short_trid is a plain struct field, still valid after the
         header buffer is freed. */
      translog_free_record_header(&rec);
      indeces[rec.short_trid]++;
    }
  }

  rc= 0;
err:
  if (rc)
    ok(0, "record read");
  translog_destroy();
  end_pagecache(&pagecache, 1);
  ma_control_file_end();
  if (maria_log_remove())
    exit(1);

  return(exit_status());
}
diff --git a/storage/maria/unittest/ma_test_loghandler_noflush-t.c b/storage/maria/unittest/ma_test_loghandler_noflush-t.c
new file mode 100644
index 00000000000..973dfd03bcf
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_noflush-t.c
@@ -0,0 +1,146 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void translog_example_table_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define LOG_FLAGS 0
+
+static char *first_translog_file= (char*)"maria_log.00000001";
+
/*
  Test reading a log record back WITHOUT an intervening flush: writes one
  fixed-size LOGREC_FIXED_RECORD_0LSN_EXAMPLE record and immediately reads
  its header via translog_read_record_header(), verifying type, short_trid,
  length, payload bytes and LSN.

  Exit status: 0 on success, 1 on any failure (rc doubles as the exit code).
*/
int main(int argc __attribute__((unused)), char *argv[])
{
  uint pagen;
  int rc= 1; /* pessimistic default; cleared only after a successful read */
  uchar long_tr_id[6];
  PAGECACHE pagecache;
  LSN first_lsn;
  TRANSLOG_HEADER_BUFFER rec;
  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
  translog_size_t len;

  MY_INIT(argv[0]);

  plan(1);

  bzero(&pagecache, sizeof(pagecache));
  maria_data_root= (char *)".";
  if (maria_log_remove())
    exit(1);
  /* be sure that we have no logs in the directory*/
  /* NOTE(review): this file deletes "maria_log.00000001" while the sibling
     pagecache test uses "aria_log.00000001" — verify which prefix matches
     the current log file naming. */
  my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
  my_delete(first_translog_file, MYF(0));

  bzero(long_tr_id, 6);
#ifndef DBUG_OFF
#if defined(__WIN__)
  default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
#else
  default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
#endif
  if (argc > 1)
  {
    DBUG_SET(default_dbug_option);
    DBUG_SET_INITIAL(default_dbug_option);
  }
#endif

  if (ma_control_file_open(TRUE, TRUE))
  {
    fprintf(stderr, "Can't init control file (%d)\n", errno);
    exit(1);
  }
  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
                             PCACHE_PAGE, 0)) == 0)
  {
    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
    exit(1);
  }
  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
                               LOG_FLAGS, 0, &translog_example_table_init,
                               0))
  {
    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
    exit(1);
  }
  /* Suppressing of automatic record writing */
  dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;

  /* Payload: 4 zero bytes, byte[4]=0, byte[5]=0xff — checked on read-back. */
  int4store(long_tr_id, 0);
  long_tr_id[5]= 0xff;
  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
  if (translog_write_record(&first_lsn,
                            LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
                            &dummy_transaction_object, NULL, 6,
                            TRANSLOG_INTERNAL_PARTS + 1,
                            parts, NULL, NULL))
  {
    fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
    translog_destroy();
    exit(1);
  }

  /* Read the record back right away — no translog_flush() in between. */
  len= translog_read_record_header(first_lsn, &rec);
  if (len == 0)
  {
    fprintf(stderr, "translog_read_record_header failed (%d)\n", errno);
    goto err;
  }
  if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 ||
      rec.record_length != 6 || uint4korr(rec.header) != 0 ||
      ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF ||
      first_lsn != rec.lsn)
  {
    fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
            "data read(0)\n"
            "type: %u (%d) strid: %u (%d) len: %u (%d) i: %u (%d), "
            "4: %u (%d) 5: %u (%d) "
            "lsn(%lu,0x%lx) (%d)\n",
            (uint) rec.type, (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE),
            (uint) rec.short_trid, (rec.short_trid != 0),
            (uint) rec.record_length, (rec.record_length != 6),
            (uint) uint4korr(rec.header), (uint4korr(rec.header) != 0),
            (uint) rec.header[4], (((uchar)rec.header[4]) != 0),
            (uint) rec.header[5], (((uchar)rec.header[5]) != 0xFF),
            LSN_IN_PARTS(rec.lsn), (first_lsn != rec.lsn));
    goto err;
  }

  ok(1, "read OK");
  rc= 0;

err:
  translog_destroy();
  end_pagecache(&pagecache, 1);
  ma_control_file_end();
  if (maria_log_remove())
    exit(1);

  exit(rc);
}
diff --git a/storage/maria/unittest/ma_test_loghandler_nologs-t.c b/storage/maria/unittest/ma_test_loghandler_nologs-t.c
new file mode 100644
index 00000000000..34508d1d751
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_nologs-t.c
@@ -0,0 +1,195 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void example_loghandler_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (8*1024L*1024L)
+#define LOG_FLAGS 0
+#define LONG_BUFFER_SIZE (LOG_FILE_SIZE + LOG_FILE_SIZE / 2)
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ ulong i;
+ uint pagen;
+ uchar long_tr_id[6];
+ PAGECACHE pagecache;
+ LSN lsn;
+ LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+ uchar *long_buffer= malloc(LONG_BUFFER_SIZE);
+
+ MY_INIT(argv[0]);
+
+ plan(2);
+
+ bzero(&pagecache, sizeof(pagecache));
+ bzero(long_buffer, LONG_BUFFER_SIZE);
+ maria_data_root= (char *)".";
+ if (maria_log_remove())
+ exit(1);
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_open(TRUE, TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PCACHE_PAGE, 0)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+ LOG_FLAGS, 0, &translog_example_table_init,
+ 0))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ exit(1);
+ }
+ /* Suppressing of automatic record writing */
+ dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ /* write more then 1 file */
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write record #0\n");
+ translog_destroy();
+ exit(1);
+ }
+
+ for(i= 0; i < LOG_FILE_SIZE/6 && LSN_FILE_NO(lsn) == 1; i++)
+ {
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write record #0\n");
+ translog_destroy();
+ exit(1);
+ }
+ }
+
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+
+ {
+ char file_name[FN_REFLEN];
+ for (i= 1; i <= 2; i++)
+ {
+ translog_filename_by_fileno(i, file_name);
+ if (my_access(file_name, W_OK))
+ {
+ fprintf(stderr, "No file '%s'\n", file_name);
+ exit(1);
+ }
+ if (my_delete(file_name, MYF(MY_WME)) != 0)
+ {
+ fprintf(stderr, "Error %d during removing file'%s'\n",
+ errno, file_name);
+ exit(1);
+ }
+ }
+ }
+
+ if (ma_control_file_open(TRUE, TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PCACHE_PAGE, 0)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+ LOG_FLAGS, 0, &translog_example_table_init,
+ 1))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ exit(1);
+ }
+ /* Suppressing of automatic record writing */
+ dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ ok(1, "Log init OK");
+
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write record #0\n");
+ translog_destroy();
+ exit(1);
+ }
+
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+
+ if (!translog_is_file(3))
+ {
+ fprintf(stderr, "No file #3\n");
+ exit(1);
+ }
+
+ ok(1, "New log is OK");
+
+ if (maria_log_remove())
+ exit(1);
+ exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_pagecache-t.c b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c
new file mode 100644
index 00000000000..1644aa4885c
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c
@@ -0,0 +1,200 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void translog_example_table_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define LOG_FLAGS 0
+
+static char *first_translog_file= (char*)"aria_log.00000001";
+static char *file1_name= (char*)"page_cache_test_file_1";
+static PAGECACHE_FILE file1;
+
+
+/**
+ @brief Dummy pagecache callback.
+*/
+
+static my_bool
+dummy_callback(uchar *page __attribute__((unused)),
+ pgcache_page_no_t page_no __attribute__((unused)),
+ uchar* data_ptr __attribute__((unused)))
+{
+ return 0;
+}
+
+
+/**
+ @brief Dummy pagecache callback.
+*/
+
+static void
+dummy_fail_callback(uchar* data_ptr __attribute__((unused)))
+{
+ return;
+}
+
+
/*
  Test that the pagecache triggers a log flush before writing an LSN-marked
  page (WAL rule): a record is written but not explicitly flushed; then a
  data page stamped with that record's LSN is pushed through the pagecache
  (whose flush hook is maria_flush_log_for_page). If the hook works, the
  log file must have grown from one page to two by the time the data page
  hits disk.

  Produces 1 TAP test point. Exit status: 0 on success, 1 on failure.
*/
int main(int argc __attribute__((unused)), char *argv[])
{
  uint pagen;
  uchar long_tr_id[6];
  PAGECACHE pagecache;
  LSN lsn;
  my_off_t file_size;
  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];

  MY_INIT(argv[0]);

  plan(1);

  bzero(&pagecache, sizeof(pagecache));
  maria_data_root= (char *)".";
  if (maria_log_remove())
    exit(1);
  /* be sure that we have no logs in the directory*/
  my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
  my_delete(first_translog_file, MYF(0));

  bzero(long_tr_id, 6);
#ifndef DBUG_OFF
#if defined(__WIN__)
  default_dbug_option= "d:t:i:O,\\ma_test_loghandler_pagecache.trace";
#else
  default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler_pagecache.trace";
#endif
  if (argc > 1)
  {
    DBUG_SET(default_dbug_option);
    DBUG_SET_INITIAL(default_dbug_option);
  }
#endif

  if (ma_control_file_open(TRUE, TRUE))
  {
    fprintf(stderr, "Can't init control file (%d)\n", errno);
    exit(1);
  }
  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
                             PCACHE_PAGE, 0)) == 0)
  {
    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
    exit(1);
  }
  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
                               LOG_FLAGS, 0, &translog_example_table_init,
                               0))
  {
    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
    exit(1);
  }
  /* Suppressing of automatic record writing */
  dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;

  /* Baseline: a freshly initialized log is exactly one page long. */
  if ((file1.file= my_open(first_translog_file, O_RDONLY, MYF(MY_WME))) < 0)
  {
    fprintf(stderr, "There is no %s (%d)\n", first_translog_file, errno);
    exit(1);
  }
  file_size= my_seek(file1.file, 0, SEEK_END, MYF(MY_WME));
  if (file_size != TRANSLOG_PAGE_SIZE)
  {
    fprintf(stderr,
            "incorrect initial size of %s: %ld instead of %ld\n",
            first_translog_file, (long)file_size, (long)TRANSLOG_PAGE_SIZE);
    exit(1);
  }
  my_close(file1.file, MYF(MY_WME));
  /* Write one record; deliberately do NOT call translog_flush() — the
     pagecache flush hook must do it for us. */
  int4store(long_tr_id, 0);
  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
  dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
  if (translog_write_record(&lsn,
                            LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
                            &dummy_transaction_object, NULL, 6,
                            TRANSLOG_INTERNAL_PARTS + 1,
                            parts, NULL, NULL))
  {
    fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
    translog_destroy();
    exit(1);
  }

  if ((file1.file= my_open(file1_name,
                           O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
  {
    fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n",
            errno);
    exit(1);
  }
  /* maria_flush_log_for_page is the hook under test here. */
  pagecache_file_init(file1, &dummy_callback, &dummy_callback,
                      &dummy_fail_callback, maria_flush_log_for_page, NULL);
  if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
    exit(1);

  {
    uchar page[PCACHE_PAGE];

    /* Stamp the data page with the unflushed record's LSN, then force it
       to disk; the WAL hook must flush the log first. */
    bzero(page, PCACHE_PAGE);
    lsn_store(page, lsn);
    pagecache_write(&pagecache, &file1, 0, 3, page,
                    PAGECACHE_LSN_PAGE,
                    PAGECACHE_LOCK_LEFT_UNLOCKED,
                    PAGECACHE_PIN_LEFT_UNPINNED,
                    PAGECACHE_WRITE_DELAY,
                    0, LSN_IMPOSSIBLE);
    flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
  }
  my_close(file1.file, MYF(MY_WME));
  /* The log must now be two pages: the flush hook was triggered. */
  if ((file1.file= my_open(first_translog_file, O_RDONLY, MYF(MY_WME))) < 0)
  {
    fprintf(stderr, "can't open %s (%d)\n", first_translog_file, errno);
    exit(1);
  }
  file_size= my_seek(file1.file, 0, SEEK_END, MYF(MY_WME));
  if (file_size != TRANSLOG_PAGE_SIZE * 2)
  {
    fprintf(stderr,
            "incorrect initial size of %s: %ld instead of %ld\n",
            first_translog_file,
            (long)file_size, (long)(TRANSLOG_PAGE_SIZE * 2));
    ok(0, "log triggered");
    exit(1);
  }
  my_close(file1.file, MYF(MY_WME));
  ok(1, "log triggered");

  translog_destroy();
  end_pagecache(&pagecache, 1);
  ma_control_file_end();
  my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
  my_delete(first_translog_file, MYF(0));
  my_delete(file1_name, MYF(0));

  exit(0);
}
diff --git a/storage/maria/unittest/ma_test_loghandler_purge-t.c b/storage/maria/unittest/ma_test_loghandler_purge-t.c
new file mode 100644
index 00000000000..d37b45bc3ca
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_purge-t.c
@@ -0,0 +1,192 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void translog_example_table_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (8*1024L*1024L)
+#define LOG_FLAGS 0
+#define LONG_BUFFER_SIZE (LOG_FILE_SIZE + LOG_FILE_SIZE / 2)
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+ ulong i;
+ uint pagen;
+ uchar long_tr_id[6];
+ PAGECACHE pagecache;
+ LSN lsn;
+ LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+ uchar *long_buffer= malloc(LONG_BUFFER_SIZE);
+
+ MY_INIT(argv[0]);
+
+ plan(4);
+
+ bzero(&pagecache, sizeof(pagecache));
+ bzero(long_buffer, LONG_BUFFER_SIZE);
+ maria_data_root= (char *)".";
+ if (maria_log_remove())
+ exit(1);
+
+ bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+ default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+ default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+ if (argc > 1)
+ {
+ DBUG_SET(default_dbug_option);
+ DBUG_SET_INITIAL(default_dbug_option);
+ }
+#endif
+
+ if (ma_control_file_open(TRUE, TRUE))
+ {
+ fprintf(stderr, "Can't init control file (%d)\n", errno);
+ exit(1);
+ }
+ if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+ PCACHE_PAGE, 0)) == 0)
+ {
+ fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+ exit(1);
+ }
+ if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+ LOG_FLAGS, 0, &translog_example_table_init,
+ 0))
+ {
+ fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+ exit(1);
+ }
+ /* Suppressing of automatic record writing */
+ dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+ /* write more then 1 file */
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ exit(1);
+ }
+
+ translog_purge(lsn);
+ if (!translog_is_file(1))
+ {
+ fprintf(stderr, "First file was removed after first record\n");
+ translog_destroy();
+ exit(1);
+ }
+ ok(1, "First is not removed");
+
+ for(i= 0; i < LOG_FILE_SIZE/6 && LSN_FILE_NO(lsn) == 1; i++)
+ {
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+ translog_destroy();
+ exit(1);
+ }
+ }
+
+ translog_purge(lsn);
+ if (translog_is_file(1))
+ {
+ fprintf(stderr, "First file was not removed.\n");
+ translog_destroy();
+ exit(1);
+ }
+
+ ok(1, "First file is removed");
+
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= LONG_BUFFER_SIZE;
+ if (translog_write_record(&lsn,
+ LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, LONG_BUFFER_SIZE,
+ TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write variable record\n");
+ translog_destroy();
+ exit(1);
+ }
+
+ translog_purge(lsn);
+ if (!translog_is_file(2) || !translog_is_file(3))
+ {
+ fprintf(stderr, "Second file (%d) or third file (%d) is not present.\n",
+ translog_is_file(2), translog_is_file(3));
+ translog_destroy();
+ exit(1);
+ }
+
+ ok(1, "Second and third files are not removed");
+
+ int4store(long_tr_id, 0);
+ parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+ parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+ if (translog_write_record(&lsn,
+ LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+ &dummy_transaction_object, NULL, 6,
+ TRANSLOG_INTERNAL_PARTS + 1,
+ parts, NULL, NULL))
+ {
+ fprintf(stderr, "Can't write last record\n");
+ translog_destroy();
+ exit(1);
+ }
+
+ translog_purge(lsn);
+ if (translog_is_file(2))
+ {
+ fprintf(stderr, "Second file is not removed\n");
+ translog_destroy();
+ exit(1);
+ }
+
+ ok(1, "Second file is removed");
+
+ translog_destroy();
+ end_pagecache(&pagecache, 1);
+ ma_control_file_end();
+ if (maria_log_remove())
+ exit(1);
+ exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_recovery.expected b/storage/maria/unittest/ma_test_recovery.expected
new file mode 100644
index 00000000000..5f7dd54e673
--- /dev/null
+++ b/storage/maria/unittest/ma_test_recovery.expected
@@ -0,0 +1,1578 @@
+Testing the REDO PHASE ALONE
+TEST WITH ma_test1 -s -M -T -c
+applying log
+testing idempotency
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -d500
+applying log
+testing idempotency
+applying log
+TEST WITH ma_test2 -s -M -T -c -b65000
+applying log
+testing idempotency
+applying log
+TEST WITH ma_test2 -s -M -T -c -b65000 -d800
+applying log
+testing idempotency
+applying log
+TEST WITH ma_test1 -s -M -T -c -C
+applying log
+testing idempotency
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -d500 -C
+applying log
+testing idempotency
+applying log
+Testing the REDO AND UNDO PHASE
+TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=1 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=1 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=2 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=2 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=3 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=3 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=4 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=4 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=1 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=1 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=2 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=2 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=3 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=3 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=4 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=4 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=1 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=1 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=2 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=2 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=3 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=3 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=4 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=4 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=1 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=1 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=2 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=2 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=3 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=3 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=4 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=4 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status: changed
+========DIFF END=======
diff --git a/storage/maria/unittest/ma_test_recovery.pl b/storage/maria/unittest/ma_test_recovery.pl
new file mode 100755
index 00000000000..d9be82f4e58
--- /dev/null
+++ b/storage/maria/unittest/ma_test_recovery.pl
@@ -0,0 +1,481 @@
+#!/usr/bin/env perl
+
+use Getopt::Long;
+use File::Copy;
+use File::Compare;
+use File::Basename;
+use Digest::MD5;
+
+$|= 1;
+$^W = 1; # warnings, because env cannot parse 'perl -w'
+$VER= "1.2";
+
+$opt_version= 0;
+$opt_help= 0;
+$opt_verbose= 0;
+$opt_abort_on_error=0;
+
+my $silent= "-s";
+my $maria_path; # path to "storage/maria"
+my $maria_exe_path; # path to executables (ma_test1, aria_chk etc)
+my $tmp= "./tmp";
+my $my_progname= $0;
+my $suffix;
+my $zerofilled_tables= 0;
+
+$my_progname=~ s/.*[\/]//;
+$maria_path= dirname($0) . "/..";
+
+main();
+
+####
+#### main function
+####
+
+sub main
+{
+ my ($res, $table);
+
+ if (!GetOptions("abort-on-error", "help", "version", "verbose"))
+ {
+ $flag_exit= 1;
+ }
+ if ($opt_version)
+ {
+ print "$my_progname version $VER\n";
+ exit(0);
+ }
+ usage() if ($opt_help || $flag_exit);
+
+ $suffix= ( $^O =~ /win/i && $^O !~ /darwin/i ) ? ".exe" : "";
+ $maria_exe_path= "$maria_path/release";
+ # we use -f, sometimes -x is unexpectedly false in Cygwin
+ if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+ {
+ $maria_exe_path= "$maria_path/relwithdebinfo";
+ if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+ {
+ $maria_exe_path= "$maria_path/debug";
+ if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+ {
+ $maria_exe_path= $maria_path;
+ if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+ {
+ die("Cannot find ma_test1 executable\n");
+ }
+ }
+ }
+ }
+
+ # test data is always put in the current directory or a tmp subdirectory
+ # of it
+
+ if (! -d "$tmp")
+ {
+ mkdir $tmp;
+ }
+ print "ARIA RECOVERY TESTS\n";
+
+ # To not flood the screen, we redirect all the commands below to a text file
+ # and just give a final error if their output is not as expected
+
+ open (MY_LOG, ">$tmp/ma_test_recovery.output") or die "Can't open log file\n";
+ print MY_LOG "Testing the REDO PHASE ALONE\n";
+
+ # runs a program inserting/deleting rows, then moves the resulting table
+ # elsewhere; applies the log and checks that the data file is
+ # identical to the saved original.
+
+ my @t= ("ma_test1$suffix $silent -M -T -c",
+ "ma_test2$suffix $silent -L -K -W -P -M -T -c -d500",
+ "ma_test2$suffix $silent -M -T -c -b65000",
+ "ma_test2$suffix $silent -M -T -c -b65000 -d800",
+ "ma_test1$suffix $silent -M -T -c -C",
+ "ma_test2$suffix $silent -L -K -W -P -M -T -c -d500 -C",
+ #"ma_rt_test$suffix $silent -M -T -c -C",
+ # @todo: also add to @t2
+ );
+
+ foreach my $prog (@t)
+ {
+ unlink <aria_log.* aria_log_control>;
+ my $prog_no_suffix= $prog;
+ $prog_no_suffix=~ s/$suffix// if ($suffix);
+ print MY_LOG "TEST WITH $prog_no_suffix\n";
+ $res= my_exec("$maria_exe_path/$prog");
+ print MY_LOG $res;
+ # derive table's name from program's name
+ if ($prog =~ m/^ma_(\S+)\s.*/)
+ {
+ $table= $1;
+ }
+ else
+ {
+ die("can't guess table name");
+ }
+ $com= "$maria_exe_path/aria_chk$suffix -dvv $table ";
+ $com.= "| grep -v \"Creation time:\" | grep -v \"file length\" | grep -v \"LSNs:\" | grep -v \"UUID:\"";
+ $com.= "> $tmp/aria_chk_message.good.txt 2>&1";
+ my_exec($com);
+ my $checksum= my_exec("$maria_exe_path/aria_chk$suffix -dss $table");
+ move("$table.MAD", "$tmp/$table-good.MAD") ||
+ die "Can't move $table.MAD to $tmp/$table-good.MAD\n";
+ move("$table.MAI", "$tmp/$table-good.MAI") ||
+ die "Can't move $table.MAI to $tmp/$table-good.MAI\n";
+ apply_log($table, "shouldnotchangelog");
+ check_table_is_same($table, $checksum);
+ $res= physical_cmp($table, "$tmp/$table-good");
+ print MY_LOG $res;
+ print MY_LOG "testing idempotency\n";
+ apply_log($table, "shouldnotchangelog");
+ check_table_is_same($table, $checksum);
+ $res= physical_cmp($table, "$tmp/$table-good");
+ print MY_LOG $res;
+ }
+
+ print MY_LOG "Testing the REDO AND UNDO PHASE\n";
+ # The test programs look like:
+ # work; commit (time T1); work; exit-without-commit (time T2)
+ # We first run the test program and let it exit after T1's commit.
+ # Then we run it again and let it exit at T2. Then we compare
+ # and expect identity.
+
+ my @take_checkpoints= ("no", "yes");
+ my @blobs= ("", "-b32768");
+ my @test_undo= (1, 2, 3, 4);
+ my @t2= ("ma_test1$suffix $silent -M -T -c -N blob -H1",
+ "--testflag=1",
+ "--testflag=2 --test-undo=",
+ "ma_test1$suffix $silent -M -T -c -N blob -H2",
+ "--testflag=3",
+ "--testflag=4 --test-undo=",
+ "ma_test1$suffix $silent -M -T -c -N blob -H2 --versioning",
+ "--testflag=3",
+ "--testflag=4 --test-undo=",
+ "ma_test1$suffix $silent -M -T -c -N blob -H2",
+ "--testflag=2",
+ "--testflag=3 --test-undo=",
+ "ma_test2$suffix $silent -L -K -W -P -M -T -c blob -H1",
+ "-t1",
+ "-t2 -A",
+ "ma_test2$suffix $silent -L -K -W -P -M -T -c blob -H1",
+ "-t1",
+ "-t6 -A");
+
+ foreach my $take_checkpoint (@take_checkpoints)
+ {
+ my ($i, $j, $k, $commit_run_args, $abort_run_args);
+ # we test table without blobs and then table with blobs
+ for ($i= 0; defined($blobs[$i]); $i++)
+ {
+ for ($j= 0; defined($test_undo[$j]); $j++)
+ {
+ # first iteration tests rollback of insert, second tests rollback of delete
+ # -N (create NULL fields) is needed because --test-undo adds it anyway
+ for ($k= 0; defined($t2[$k]); $k+= 3)
+ {
+ $prog= $t2[$k];
+ $prog=~ s/blob/$blobs[$i]/;
+ if ("$take_checkpoint" eq "no") {
+ $prog=~ s/\s+\-H[0-9]+//;
+ }
+ $commit_run_args= $t2[$k + 1];
+ $abort_run_args= $t2[$k + 2];
+ unlink <aria_log.* aria_log_control>;
+ my $prog_no_suffix= $prog;
+ $prog_no_suffix=~ s/$suffix// if ($suffix);
+ print MY_LOG "TEST WITH $prog_no_suffix $commit_run_args (commit at end)\n";
+ $res= my_exec("$maria_exe_path/$prog $commit_run_args");
+ print MY_LOG $res;
+ # derive table's name from program's name
+ if ($prog =~ m/^ma_(\S+)\s.*/)
+ {
+ $table= $1;
+ }
+ else
+ {
+ die("can't guess table name");
+ }
+ $com= "$maria_exe_path/aria_chk$suffix -dvv $table ";
+ $com.= "| grep -v \"Creation time:\" | grep -v \"file length\" | grep -v \"LSNs:\" | grep -v \"UUID:\" ";
+ $com.= "> $tmp/aria_chk_message.good.txt 2>&1";
+ $res= my_exec($com);
+ print MY_LOG $res;
+ $checksum= my_exec("$maria_exe_path/aria_chk$suffix -dss $table");
+ move("$table.MAD", "$tmp/$table-good.MAD") ||
+ die "Can't move $table.MAD to $tmp/$table-good.MAD\n";
+ move("$table.MAI", "$tmp/$table-good.MAI") ||
+ die "Can't move $table.MAI to $tmp/$table-good.MAI\n";
+ unlink <aria_log.* aria_log_control>;
+ print MY_LOG "TEST WITH $prog_no_suffix $abort_run_args$test_undo[$j] (additional aborted work)\n";
+ $res= my_exec("$maria_exe_path/$prog $abort_run_args$test_undo[$j]");
+ print MY_LOG $res;
+ copy("$table.MAD", "$tmp/$table-before_undo.MAD") ||
+ die "Can't copy $table.MAD to $tmp/$table-before_undo.MAD\n";
+ copy("$table.MAI", "$tmp/$table-before_undo.MAI") ||
+ die "Can't copy $table.MAI to $tmp/$table-before_undo.MAI\n";
+
+ # The lines below seem unneeded, will be removed soon
+ # We have to copy and restore logs, as running aria_read_log will
+ # change the aria_control_file
+ # rm -f $tmp/aria_log.* $tmp/aria_log_control
+ # cp $maria_path/aria_log* $tmp
+
+ if ($test_undo[$j] != 3) {
+ apply_log($table, "shouldchangelog"); # should undo aborted work
+ } else {
+ # probably nothing to undo went to log or data file
+ apply_log($table, "dontknow");
+ }
+ copy("$table.MAD", "$tmp/$table-after_undo.MAD") ||
+ die "Can't copy $table.MAD to $tmp/$table-after_undo.MAD\n";
+ copy("$table.MAI", "$tmp/$table-after_undo.MAI") ||
+ die "Can't copy $table.MAI to $tmp/$table-after_undo.MAI\n";
+
+ # It is impossible to do a "cmp" between .good and .after_undo,
+ # because the UNDO phase generated log
+ # records whose LSN tagged pages. Another reason is that rolling back
+ # INSERT only marks the rows free, does not empty them (optimization), so
+ # traces of the INSERT+rollback remain.
+
+ check_table_is_same($table, $checksum);
+ print MY_LOG "testing idempotency\n";
+ apply_log($table, "shouldnotchangelog");
+ check_table_is_same($table, $checksum);
+ $res= physical_cmp($table, "$tmp/$table-after_undo");
+ print MY_LOG $res;
+ print MY_LOG "testing applying of CLRs to recreate table\n";
+ unlink <$table.MA?>;
+ # cp $tmp/aria_log* $maria_path #unneeded
+ apply_log($table, "shouldnotchangelog");
+ check_table_is_same($table, $checksum);
+ $res= physical_cmp($table, "$tmp/$table-after_undo");
+ print MY_LOG $res;
+ }
+ unlink <$table.* $tmp/$table* $tmp/aria_chk_*.txt $tmp/aria_read_log_$table.txt>;
+ }
+ }
+ }
+
+ if ($? >> 8) {
+ print "Some test failed\n";
+ exit(1);
+ }
+
+ close(MY_LOG);
+ # also note that aria_chk -dvv shows differences for ma_test2 in UNDO phase,
+ # this is normal: removing records does not shrink the data/key file,
+ # does not put back the "analyzed,optimized keys"(etc) index state.
+ `diff -b $maria_path/unittest/ma_test_recovery.expected $tmp/ma_test_recovery.output`;
+ if ($? >> 8) {
+ print "UNEXPECTED OUTPUT OF TESTS, FAILED";
+ print " (zerofilled $zerofilled_tables tables)\n";
+ print "For more info, do diff -b $maria_path/unittest/ma_test_recovery.expected ";
+ print "$tmp/ma_test_recovery.output\n";
+ exit(1);
+ }
+ print "ALL RECOVERY TESTS OK (zerofilled $zerofilled_tables tables)\n";
+}
+
+####
+#### check_table_is_same
+####
+
+sub check_table_is_same
+{
+  my ($table, $checksum)= @_;
+  my ($com, $checksum2, $res);
+
+  # Computes checksum of new table and compares to checksum of old table
+  # Shows any difference in table's state (info from the index's header)
+  # Data/key file length is random in ma_test2 (as it uses srand() which
+  # may differ between machines).
+
+  if ($opt_verbose)
+  {
+    print "checking if table $table has changed\n";
+  }
+
+  # Dump the table description, filtering out fields that legitimately
+  # differ between runs (creation time, file lengths, LSNs, UUID), for a
+  # diff against the reference dump taken before the crash/recovery cycle.
+  $com= "$maria_exe_path/aria_chk$suffix -dvv $table | grep -v \"Creation time:\" ";
+  $com.= "| grep -v \"file length\" | grep -v \"LSNs:\" | grep -v \"UUID:\" > $tmp/aria_chk_message.txt 2>&1";
+  $res= `$com`;
+  print MY_LOG $res;
+  # Extended read-only consistency check; output goes to the test log
+  $res= `$maria_exe_path/aria_chk$suffix -ss -e --read-only $table`;
+  print MY_LOG $res;
+  $checksum2= `$maria_exe_path/aria_chk$suffix -dss $table`;
+  if ("$checksum" ne "$checksum2")
+  {
+    print MY_LOG "checksum differs for $table before and after recovery\n";
+    return 1;
+  }
+
+  # "|| true" so that a non-empty diff (diff exits 1) is not treated as a
+  # command failure by the shell
+  $com= "diff $tmp/aria_chk_message.good.txt $tmp/aria_chk_message.txt ";
+  $com.= "> $tmp/aria_chk_diff.txt || true";
+  $res= `$com`;
+  print MY_LOG $res;
+
+  # A non-empty diff file means recovery left the table's state detectably
+  # different; copy the diff into the log for later inspection.
+  if (-s "$tmp/aria_chk_diff.txt")
+  {
+    print MY_LOG "Differences in aria_chk -dvv, recovery not yet perfect !\n";
+    print MY_LOG "========DIFF START=======\n";
+    open(MY_FILE, "<$tmp/aria_chk_diff.txt") || die "Can't open file aria_chk_diff.txt\n";
+    while (<MY_FILE>)
+    {
+      print MY_LOG $_;
+    }
+    close(MY_FILE);
+    print MY_LOG "========DIFF END=======\n";
+  }
+}
+
+####
+#### apply_log
+####
+
+sub apply_log
+{
+  my ($table, $shouldchangelog)= @_;
+  # Start from empty strings: if no aria_log.* files exist, the md5
+  # accumulators would otherwise stay undef and trigger "uninitialized
+  # value" warnings when interpolated in the comparison below.
+  my ($log_md5, $log_md5_2)= ("", "");
+
+  # applies log, can verify if applying did write to log or not
+
+  if ("$shouldchangelog" ne "shouldnotchangelog" &&
+      "$shouldchangelog" ne "shouldchangelog" &&
+      "$shouldchangelog" ne "dontknow" )
+  {
+    print MY_LOG "bad argument '$shouldchangelog'\n";
+    return 1;
+  }
+  # Fingerprint all log files before applying...
+  foreach (<aria_log.*>)
+  {
+    $log_md5.= md5_conv($_);
+  }
+  print MY_LOG "applying log\n";
+  my_exec("$maria_exe_path/aria_read_log$suffix -a > $tmp/aria_read_log_$table.txt");
+  # ...and after, to detect whether aria_read_log wrote to the log
+  foreach (<aria_log.*>)
+  {
+    $log_md5_2.= md5_conv($_);
+  }
+  if ("$log_md5" ne "$log_md5_2" )
+  {
+    if ("$shouldchangelog" eq "shouldnotchangelog")
+    {
+      print MY_LOG "aria_read_log should not have modified the log\n";
+      return 1;
+    }
+  }
+  elsif ("$shouldchangelog" eq "shouldchangelog")
+  {
+    print MY_LOG "aria_read_log should have modified the log\n";
+    return 1;
+  }
+}
+
+####
+#### md5_conv
+####
+
+# Returns the hex MD5 digest of the given file, newline-terminated.
+sub md5_conv
+{
+  my ($file)= @_;
+
+  open(my $fh, '<', $file) or die "Can't open '$file': $!\n";
+  binmode($fh);
+  my $digest= Digest::MD5->new->addfile($fh)->hexdigest;
+  close($fh);
+  return $digest . "\n";
+}
+
+####
+#### physical_cmp: compares two tables (MAI and MAD) physically;
+#### uses zerofill-keep-lsn to reduce irrelevant differences.
+####
+
+# Compares two tables (MAI and MAD) physically; on a mismatch, retries once
+# after running aria_chk --zerofill-keep-lsn on both tables to wipe
+# irrelevant differences (LSN-tagged pages etc.). The original table files
+# are saved before zerofilling and restored afterwards.
+# Returns a text describing which files differ ("" if none).
+sub physical_cmp
+{
+  my ($table1, $table2)= @_;
+  my ($zerofilled, $ret_text)= (0, "");
+  my $com;   # lexical; was previously an undeclared global
+  #return `cmp $table1.MAD $table2.MAD`.`cmp $table1.MAI $table2.MAI`;
+  foreach my $file_suffix ("MAD", "MAI")
+  {
+    my $file1= "$table1.$file_suffix";
+    my $file2= "$table2.$file_suffix";
+    my $res= File::Compare::compare($file1, $file2);
+    die() if ($res == -1);
+    if ($res == 1 # they differ
+        and !$zerofilled)
+    {
+      # let's try with --zerofill-keep-lsn
+      $zerofilled= 1; # but no need to do it twice
+      $zerofilled_tables= $zerofilled_tables + 1;
+      my $table_no= 1;
+      foreach my $table ($table1, $table2)
+      {
+        # save original tables to restore them later
+        copy("$table.MAD", "$tmp/before_zerofill$table_no.MAD") || die();
+        copy("$table.MAI", "$tmp/before_zerofill$table_no.MAI") || die();
+        $com= "$maria_exe_path/aria_chk$suffix -ss --zerofill-keep-lsn $table";
+        $res= `$com`;
+        print MY_LOG $res;
+        $table_no= $table_no + 1;
+      }
+      $res= File::Compare::compare($file1, $file2);
+      die() if ($res == -1);
+    }
+    $ret_text.= "$file1 and $file2 differ\n" if ($res != 0);
+  }
+  if ($zerofilled)
+  {
+    # put the pre-zerofill originals back in place
+    my $table_no= 1;
+    foreach my $table ($table1, $table2)
+    {
+      move("$tmp/before_zerofill$table_no.MAD", "$table.MAD") || die();
+      move("$tmp/before_zerofill$table_no.MAI", "$table.MAI") || die();
+      $table_no= $table_no + 1;
+    }
+  }
+  return $ret_text;
+}
+
+
+# Runs a shell command, optionally echoing it first (--verbose), and
+# returns its captured stdout. With --abort-on-error, a failing command
+# aborts the whole test run.
+sub my_exec
+{
+  my($command)= @_;
+  my $res;
+  if ($opt_verbose)
+  {
+    print "$command\n";
+  }
+  $res= `$command`;
+  if ($? != 0 && $opt_abort_on_error)
+  {
+    # Report which command failed before aborting; previously we exited
+    # silently, which made failures hard to diagnose.
+    print STDERR "Command '$command' failed (exit status " . ($? >> 8) . ")\n";
+    exit(1);
+  }
+  return $res;
+}
+
+
+####
+#### usage
+####
+
+# Prints the help text and exits successfully.
+sub usage
+{
+  print <<EOF;
+$my_progname version $VER
+
+Description:
+
+Run various Aria recovery tests and print the results
+
+Options
+--help Show this help and exit.
+
+--abort-on-error Abort at once in case of error.
+--verbose Show commands while they are executing.
+--version Show version number and exit.
+
+EOF
+  exit(0);
+}
diff --git a/storage/maria/unittest/sequence_storage.c b/storage/maria/unittest/sequence_storage.c
new file mode 100644
index 00000000000..d5db20d31ca
--- /dev/null
+++ b/storage/maria/unittest/sequence_storage.c
@@ -0,0 +1,110 @@
+/* Copyright (C) 2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "../maria_def.h"
+#include "sequence_storage.h"
+
+
+/**
+  @brief Initializes the sequence from the sequence file.
+
+  @param seq  Reference on the sequence storage.
+  @param file Path to the file from which to read the sequence
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+my_bool seq_storage_reader_init(SEQ_STORAGE *seq, const char *file)
+{
+  FILE *fd;
+  seq->pos= 0;
+  if ((fd= my_fopen(file, O_RDONLY, MYF(MY_WME))) == NULL)
+    return 1;
+  if (my_init_dynamic_array(&seq->seq, sizeof(ulong), 10, 10))
+  {
+    /* don't leak the file on error; pair my_fopen() with my_fclose() */
+    my_fclose(fd, MYF(0));
+    return 1;
+  }
+
+  /* one number per line; lines longer than the buffer are truncated */
+  for(;;)
+  {
+    ulong num;
+    char line[22];
+    if (fgets(line, sizeof(line), fd) == NULL)
+      break;
+    num= atol(line);
+    if (insert_dynamic(&seq->seq, (uchar*) &num))
+    {
+      my_fclose(fd, MYF(0));
+      return 1;
+    }
+  }
+  my_fclose(fd, MYF(0));
+  return 0;
+}
+
+
+/**
+  @brief Gets next number from the sequence storage
+
+  @param seq Reference on the sequence storage.
+
+  @return Next number from the sequence.
+*/
+
+ulong seq_storage_next(SEQ_STORAGE *seq)
+{
+  ulong *entry;
+
+  DBUG_ASSERT(seq->seq.elements > 0);
+  DBUG_ASSERT(seq->pos < seq->seq.elements);
+  /* fetch the current element, then advance the cursor */
+  entry= dynamic_element(&seq->seq, seq->pos, ulong *);
+  seq->pos++;
+  return *entry;
+}
+
+
+/**
+  @brief Frees resources allocated for the storage
+
+  @param seq Reference on the sequence storage.
+*/
+
+void seq_storage_destroy(SEQ_STORAGE *seq)
+{
+  /* frees the dynamic array only; the SEQ_STORAGE struct is caller-owned */
+  delete_dynamic(&seq->seq);
+}
+
+
+/**
+  @brief Starts the sequence from the beginning
+
+  @param seq Reference on the sequence storage.
+*/
+
+void seq_storage_rewind(SEQ_STORAGE *seq)
+{
+  /* only resets the read cursor; the stored numbers are kept */
+  seq->pos= 0;
+}
+
+/**
+  @brief Writes a number to the sequence file.
+
+  @param file Path to the file where to write the sequence
+  @param num  Number to be written
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+my_bool seq_storage_write(const char *file, ulong num)
+{
+  FILE *fd;
+  my_bool error;
+
+  if ((fd= my_fopen(file, O_CREAT | O_APPEND | O_WRONLY, MYF(MY_WME))) ==
+      NULL)
+    return 1;
+  /*
+    Close even when fprintf() fails (the old short-circuit expression
+    skipped fclose() in that case, leaking the file), and pair my_fopen()
+    with my_fclose() so mysys' open-file accounting stays consistent.
+  */
+  error= (fprintf(fd, "%lu\n", num) < 0);
+  error|= (my_fclose(fd, MYF(0)) != 0);
+  return error;
+}
diff --git a/storage/maria/unittest/sequence_storage.h b/storage/maria/unittest/sequence_storage.h
new file mode 100644
index 00000000000..78ce15a6253
--- /dev/null
+++ b/storage/maria/unittest/sequence_storage.h
@@ -0,0 +1,28 @@
+/* Copyright (C) 2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
+/*
+  In-memory sequence of numbers, read from / appended to a text file with
+  one number per line. Used by unittests to replay recorded sequences.
+*/
+typedef struct st_seq_storage
+{
+  uint pos;          /* index of the next element seq_storage_next() returns */
+  DYNAMIC_ARRAY seq; /* the numbers themselves, one ulong per element */
+} SEQ_STORAGE;
+
+extern my_bool seq_storage_reader_init(SEQ_STORAGE *seq, const char *file);
+extern ulong seq_storage_next(SEQ_STORAGE *seq);
+extern void seq_storage_destroy(SEQ_STORAGE *seq);
+extern void seq_storage_rewind(SEQ_STORAGE *seq);
+extern my_bool seq_storage_write(const char *file, ulong num);
+
diff --git a/storage/maria/unittest/test_file.c b/storage/maria/unittest/test_file.c
new file mode 100644
index 00000000000..5f7e3939592
--- /dev/null
+++ b/storage/maria/unittest/test_file.c
@@ -0,0 +1,118 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include <tap.h>
+#include <my_sys.h>
+#include <my_dir.h>
+#include "test_file.h"
+
+
+/*
+  Check that file contents correspond to descriptor
+
+  SYNOPSIS
+    test_file()
+    file       File to test
+    file_name  Path (and name) of file which is tested
+    size       size of file
+    buff_size  size of buffer which is enough to check the file
+    desc       file descriptor to check with
+
+  RETURN
+    1  file if OK
+    0  error
+*/
+
+int test_file(PAGECACHE_FILE file, char *file_name,
+              off_t size, size_t buff_size, struct file_desc *desc)
+{
+  unsigned char *buffr= my_malloc(buff_size, MYF(0));
+  off_t pos= 0;
+  size_t byte;
+  int step= 0;
+  int res= 1;                           /* ok */
+
+#ifdef __WIN__
+  /*
+    On Windows, the info returned by stat(), specifically file length
+    is not necessarily current, because this is the behavior of
+    underlying FindFirstFile() function.
+  */
+  WIN32_FILE_ATTRIBUTE_DATA file_attr;
+  LARGE_INTEGER li;
+  if(GetFileAttributesEx(file_name, GetFileExInfoStandard, &file_attr) == 0)
+  {
+    diag("Can't GetFileAttributesEx %s (errno: %d)\n", file_name,
+         GetLastError());
+    res= 0;
+    goto err;
+  }
+  li.HighPart= file_attr.nFileSizeHigh;
+  li.LowPart= file_attr.nFileSizeLow;
+  if(li.QuadPart !=  size)
+  {
+    /*
+      Print actual size first, expected second, consistent with the
+      POSIX branch below (the arguments were previously swapped).
+    */
+    diag("file %s size is %llu (should be %llu)\n",
+         file_name, (ulonglong)li.QuadPart, (ulonglong)size);
+    res= 0;                             /* failed */
+    /* continue to get more information */
+  }
+#else
+  MY_STAT stat_buff, *stat;
+  if ((stat= my_stat(file_name, &stat_buff, MYF(0))) == NULL)
+  {
+    diag("Can't stat() %s (errno: %d)\n", file_name, errno);
+    res= 0;
+    goto err;
+  }
+  if (stat->st_size != size)
+  {
+    diag("file %s size is %lu (should be %lu)\n",
+         file_name, (ulong) stat->st_size, (ulong) size);
+    res= 0;                             /* failed */
+    /* continue to get more information */
+  }
+#endif
+
+  /* the read buffer is needed from here on; bail out if allocation failed */
+  if (buffr == NULL)
+  {
+    diag("Out of memory\n");
+    return 0;
+  }
+
+  /* check content */
+  my_seek(file.file, 0, SEEK_SET, MYF(MY_WME));
+  while (desc[step].length != 0)
+  {
+    if (my_read(file.file, buffr, desc[step].length, MYF(0)) !=
+        desc[step].length)
+    {
+      diag("Can't read %u bytes from %s (file: %d  errno: %d)\n",
+           (uint)desc[step].length, file_name, file.file, errno);
+      res= 0;
+      goto err;
+    }
+    /* every byte of this region must equal the expected fill byte */
+    for (byte= 0; byte < desc[step].length; byte++)
+    {
+      if (buffr[byte] != desc[step].content)
+      {
+        diag("content of %s mismatch 0x%x in position %lu instead of 0x%x\n",
+             file_name, (uint) buffr[byte], (ulong) (pos + byte),
+             desc[step].content);
+        res= 0;
+        goto err;
+      }
+    }
+    pos+= desc[step].length;
+    step++;
+  }
+
+err:
+  my_free(buffr, 0);
+  return res;
+}
diff --git a/storage/maria/unittest/test_file.h b/storage/maria/unittest/test_file.h
new file mode 100644
index 00000000000..0a1ccf4ab54
--- /dev/null
+++ b/storage/maria/unittest/test_file.h
@@ -0,0 +1,29 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include <m_string.h>
+#include "../ma_pagecache.h"
+
+/*
+  File content descriptor: one run of identical bytes. An array of these
+  (terminated by an entry with length == 0) describes a file's expected
+  contents for test_file().
+*/
+struct file_desc
+{
+  unsigned int length;   /* number of bytes in this run */
+  unsigned char content; /* the byte value each of them must have */
+};
+
+int test_file(PAGECACHE_FILE file, char *file_name,
+              off_t size, size_t buff_size, struct file_desc *desc);
diff --git a/storage/maria/unittest/trnman-t.c b/storage/maria/unittest/trnman-t.c
new file mode 100644
index 00000000000..43cf982a7f2
--- /dev/null
+++ b/storage/maria/unittest/trnman-t.c
@@ -0,0 +1,175 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include <m_string.h>
+#include "../trnman.h"
+
+pthread_mutex_t rt_mutex;
+pthread_attr_t attr;
+size_t stacksize= 0;
+#define STACK_SIZE (((int)stacksize-2048)*STACK_DIRECTION)
+
+int rt_num_threads;
+int litmus;
+
+/*
+  create and end (commit or rollback) transactions randomly
+*/
+#define MAX_ITER 100
+pthread_handler_t test_trnman(void *arg)
+{
+  uint x, y, i, n;
+  TRN *trn[MAX_ITER];
+  int m= (*(int *)arg);
+
+  if (my_thread_init())
+    BAIL_OUT("my_thread_init failed!");
+
+  /*
+    Seed x from this thread's stack address so each thread walks a
+    different pseudo-random sequence; m counts down the total number of
+    transactions still to run, in batches of n <= MAX_ITER.
+  */
+  for (x= ((int)(intptr)(&m)); m > 0; )
+  {
+    y= x= (x*LL(3628273133) + LL(1500450271)) % LL(9576890767); /* three prime numbers */
+    m-= n= x % MAX_ITER;
+    /* start a batch of transactions */
+    for (i= 0; i < n; i++)
+    {
+      trn[i]= trnman_new_trn(0);
+      if (!trn[i])
+      {
+        diag("trnman_new_trn() failed");
+        litmus++;
+      }
+    }
+    /* end them all, committing or aborting pseudo-randomly via y & 1 */
+    /* NOTE(review): trn[i] may be NULL here if trnman_new_trn() failed
+       above -- verify trnman_end_trn() tolerates that */
+    for (i= 0; i < n; i++)
+    {
+      y= (y*19 + 7) % 31;
+      trnman_end_trn(trn[i], y & 1);
+    }
+  }
+  pthread_mutex_lock(&rt_mutex);
+  rt_num_threads--;
+  pthread_mutex_unlock(&rt_mutex);
+
+  my_thread_end();
+
+  return 0;
+}
+#undef MAX_ITER
+
+/*
+  Spawns n threads running handler (each told to do m iterations), joins
+  them, and reports via ok() whether any thread flagged an error in the
+  global litmus counter; also prints the wall-clock duration.
+*/
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+  pthread_t *threads;
+  ulonglong now= my_getsystime();
+  int i;
+
+  litmus= 0;
+
+  threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0));
+  if (!threads)
+  {
+    diag("Out of memory");
+    abort();
+  }
+
+  diag("Testing %s with %d threads, %d iterations... ", test, n, m);
+  rt_num_threads= n;
+  for (i= 0; i < n ; i++)
+    if (pthread_create(threads+i, &attr, handler, &m))
+    {
+      diag("Could not create thread");
+      abort();
+    }
+  for (i= 0 ; i < n ; i++)
+    pthread_join(threads[i], 0);
+  /* my_getsystime() is in 100ns units, hence the /1e7 to get seconds */
+  now= my_getsystime()-now;
+  ok(litmus == 0, "Tested %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+  my_free((void*)threads, MYF(0));
+}
+
+/* Shorthands for the visibility test below: check/record whether trn T1
+   may read data written by the transaction whose id is trid[T2] */
+#define ok_read_from(T1, T2, RES) \
+  i= trnman_can_read_from(trn[T1], trid[T2]); \
+  ok(i == RES, "trn" #T1 " %s read from trn" #T2, i ? "can" : "cannot")
+#define start_transaction(T) \
+  trn[T]= trnman_new_trn(0); \
+  trid[T]= trn[T]->trid
+#define commit(T) trnman_commit_trn(trn[T])
+/* NOTE: shadows libc abort() for the remainder of this file */
+#define abort(T) trnman_abort_trn(trn[T])
+#define Ntrns 4
+/*
+  Deterministic visibility test: a transaction must see data of
+  transactions committed before it started, and must not see data of
+  concurrent, aborted, or later transactions. The trnN numbers in the
+  comments refer to internal transaction ids, not to array slots.
+*/
+void test_trnman_read_from()
+{
+  TRN *trn[Ntrns];
+  TrID trid[Ntrns];
+  int i;
+
+  start_transaction(0);                    /* start trn1 */
+  start_transaction(1);                    /* start trn2 */
+  ok_read_from(1, 0, 0);                   /* concurrent: not visible */
+  commit(0);                               /* commit trn1 */
+  start_transaction(2);                    /* start trn4 */
+  abort(2);                                /* abort trn4 */
+  start_transaction(3);                    /* start trn5 */
+  ok_read_from(3, 0, 1);                   /* committed before start: visible */
+  ok_read_from(3, 1, 0);                   /* still active: not visible */
+  ok_read_from(3, 2, 0);                   /* aborted: not visible */
+  ok_read_from(3, 3, 1);                   /* own changes: visible */
+  commit(1);                               /* commit trn2 */
+  ok_read_from(3, 1, 0);                   /* committed after start: not visible */
+  commit(3);                               /* commit trn5 */
+
+}
+
+int main(int argc __attribute__((unused)), char **argv)
+{
+  MY_INIT(argv[0]);
+
+  plan(7);
+
+  if (my_atomic_initialize())
+    return exit_status();
+
+  pthread_mutex_init(&rt_mutex, 0);
+  pthread_attr_init(&attr);
+#ifdef HAVE_PTHREAD_ATTR_GETSTACKSIZE
+  pthread_attr_getstacksize(&attr, &stacksize);
+  if (stacksize == 0)
+#endif
+    /* fall back to the minimum when the platform reports no default */
+    stacksize= PTHREAD_STACK_MIN;
+
+#define CYCLES 10000
+#define THREADS 10
+
+  trnman_init(0);
+
+  /* first the deterministic visibility checks, then the stress test */
+  test_trnman_read_from();
+  run_test("trnman", test_trnman, THREADS, CYCLES);
+
+  diag("mallocs: %d", trnman_allocated_transactions);
+  {
+    /* time trnman_destroy(); my_getsystime() is in 100ns units */
+    ulonglong now= my_getsystime();
+    trnman_destroy();
+    now= my_getsystime()-now;
+    diag("trnman_destroy: %g", ((double)now)/1e7);
+  }
+
+  pthread_mutex_destroy(&rt_mutex);
+  my_end(0);
+  return exit_status();
+}
+