diff options
author | Sergei Golubchik <sergii@pisem.net> | 2010-11-25 18:17:28 +0100 |
---|---|---|
committer | Sergei Golubchik <sergii@pisem.net> | 2010-11-25 18:17:28 +0100 |
commit | 65ca700def99289cc31a7040537f5aa6e12bf485 (patch) | |
tree | 97b3a07299b626c519da0e80c122b5b79b933914 /storage/maria | |
parent | 2ab57de38d13d927ddff2d51aed4af34e13998f5 (diff) | |
parent | 6e5bcca7935d3c62f84bb640e5357664a210ee12 (diff) | |
download | mariadb-git-65ca700def99289cc31a7040537f5aa6e12bf485.tar.gz |
merge.
checkpoint.
does not compile.
Diffstat (limited to 'storage/maria')
150 files changed, 98895 insertions, 0 deletions
diff --git a/storage/maria/CMakeLists.txt b/storage/maria/CMakeLists.txt new file mode 100644 index 00000000000..7b5b190bd57 --- /dev/null +++ b/storage/maria/CMakeLists.txt @@ -0,0 +1,84 @@ +# Copyright (C) 2007 MySQL AB +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +SET(ARIA_SOURCES ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c + ma_rnext.c ma_rnext_same.c + ma_search.c ma_page.c ma_key_recover.c ma_key.c + ma_locking.c ma_state.c + ma_rrnd.c ma_scan.c ma_cache.c + ma_statrec.c ma_packrec.c ma_dynrec.c + ma_blockrec.c ma_bitmap.c + ma_update.c ma_write.c ma_unique.c + ma_delete.c + ma_rprev.c ma_rfirst.c ma_rlast.c ma_rsame.c + ma_rsamepos.c ma_panic.c ma_close.c ma_create.c + ma_range.c ma_dbug.c ma_checksum.c + ma_changed.c ma_static.c ma_delete_all.c + ma_delete_table.c ma_rename.c ma_check.c + ma_keycache.c ma_preload.c ma_ft_parser.c + ma_ft_update.c ma_ft_boolean_search.c + ma_ft_nlq_search.c ft_maria.c ma_sort.c + ha_maria.cc trnman.c lockman.c tablockman.c + ma_rt_index.c ma_rt_key.c ma_rt_mbr.c ma_rt_split.c + ma_sp_key.c ma_control_file.c ma_loghandler.c + ma_pagecache.c ma_pagecaches.c compat_aliases.cc compat_aliases.h + ma_checkpoint.c ma_recovery.c ma_commit.c ma_pagecrc.c + ha_maria.h maria_def.h ma_recovery_util.c ma_servicethread.c +) + +MYSQL_ADD_PLUGIN(aria ${ARIA_SOURCES} + STORAGE_ENGINE + MANDATORY + RECOMPILE_FOR_EMBEDDED) + 
+TARGET_LINK_LIBRARIES(aria myisam) + +MYSQL_ADD_EXECUTABLE(aria_ftdump maria_ftdump.c) +TARGET_LINK_LIBRARIES(aria_ftdump aria) + +MYSQL_ADD_EXECUTABLE(aria_chk maria_chk.c) +TARGET_LINK_LIBRARIES(aria_chk aria) + +MYSQL_ADD_EXECUTABLE(aria_read_log maria_read_log.c) +TARGET_LINK_LIBRARIES(aria_read_log aria) + +MYSQL_ADD_EXECUTABLE(aria_dump_log ma_loghandler.c unittest/ma_loghandler_examples.c) +TARGET_LINK_LIBRARIES(aria_dump_log aria) +SET_TARGET_PROPERTIES(aria_dump_log PROPERTIES COMPILE_FLAGS "-DMARIA_DUMP_LOG") + +MYSQL_ADD_EXECUTABLE(aria_pack maria_pack.c) +TARGET_LINK_LIBRARIES(aria_pack aria) + +IF(WITH_UNIT_TESTS AND FALSE) + ADD_EXECUTABLE(ma_test1 ma_test1.c) + TARGET_LINK_LIBRARIES(ma_test1 aria) + + ADD_EXECUTABLE(ma_test2 ma_test2.c) + TARGET_LINK_LIBRARIES(ma_test2 aria) + + ADD_EXECUTABLE(ma_test3 ma_test3.c) + TARGET_LINK_LIBRARIES(ma_test3 aria) + + ADD_EXECUTABLE(ma_rt_test ma_rt_test.c) + TARGET_LINK_LIBRARIES(ma_rt_test aria) + + ADD_EXECUTABLE(ma_sp_test ma_sp_test.c) + TARGET_LINK_LIBRARIES(ma_sp_test aria) +ENDIF() + +IF (MSVC) + SET_TARGET_PROPERTIES(aria_chk aria_pack PROPERTIES LINK_FLAGS "setargv.obj") +ENDIF() + diff --git a/storage/maria/Makefile.am b/storage/maria/Makefile.am new file mode 100644 index 00000000000..a83063a0226 --- /dev/null +++ b/storage/maria/Makefile.am @@ -0,0 +1,202 @@ +# Copyright (C) 2000-2008 MySQL AB +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +MYSQLDATAdir = $(localstatedir) +MYSQLSHAREdir = $(pkgdatadir) +MYSQLBASEdir= $(prefix) +MYSQLLIBdir= $(pkglibdir) +INCLUDES = -I$(top_srcdir)/include -I$(top_builddir)/include \ + -I$(top_srcdir)/regex \ + -I$(top_srcdir)/sql \ + -I$(srcdir) +WRAPLIBS= + +LDADD = + +DEFS = @DEFS@ + +# "." is needed first because tests in unittest need libaria +SUBDIRS = . unittest + +EXTRA_DIST = ma_test_all.sh ma_test_all.res ma_test_big.sh \ + ma_ft_stem.c CMakeLists.txt plug.in ma_test_recovery +pkgdata_DATA = +pkglib_LIBRARIES = libaria.a +bin_PROGRAMS = aria_chk aria_pack aria_ftdump aria_read_log \ + aria_dump_log +aria_chk_DEPENDENCIES= $(LIBRARIES) +# Only reason to link with libmyisam.a here is that it's where some fulltext +# pieces are (but soon we'll remove fulltext dependencies from Aria). +# For now, it imposes that storage/myisam be built before storage/maria. 
+aria_chk_SOURCES= maria_chk.c +aria_chk_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +aria_pack_SOURCES= maria_pack.c +aria_pack_DEPENDENCIES=$(LIBRARIES) +aria_pack_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +aria_read_log_SOURCES= maria_read_log.c +aria_read_log_DEPENDENCIES=$(LIBRARIES) +aria_read_log_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +aria_dump_log_DEPENDENCIES=$(LIBRARIES) ma_loghandler.c +aria_dump_log_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +aria_dump_log_SOURCES= ma_loghandler.c unittest/ma_loghandler_examples.c +aria_dump_log_CPPFLAGS= -DMARIA_DUMP_LOG +noinst_PROGRAMS = ma_test1 ma_test2 ma_test3 ma_rt_test ma_sp_test +noinst_HEADERS = maria_def.h ma_rt_index.h ma_rt_key.h ma_rt_mbr.h \ + ma_sp_defs.h ma_fulltext.h ma_ftdefs.h ma_ft_test1.h \ + ma_ft_eval.h trnman.h lockman.h tablockman.h \ + ma_control_file.h ha_maria.h ma_blockrec.h \ + ma_loghandler.h ma_loghandler_lsn.h ma_pagecache.h \ + ma_checkpoint.h ma_recovery.h ma_commit.h ma_state.h \ + trnman_public.h ma_check_standalone.h \ + ma_key_recover.h ma_recovery_util.h \ + ma_servicethread.h compat_aliases.h +ma_test1_DEPENDENCIES= $(LIBRARIES) +ma_test1_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + 
$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +ma_test2_DEPENDENCIES= $(LIBRARIES) +ma_test2_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +ma_test3_DEPENDENCIES= $(LIBRARIES) +ma_test3_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +#ma_ft_test1_DEPENDENCIES= $(LIBRARIES) +#ma_ft_eval_DEPENDENCIES= $(LIBRARIES) +aria_ftdump_SOURCES= maria_ftdump.c +aria_ftdump_DEPENDENCIES= $(LIBRARIES) +aria_ftdump_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +ma_rt_test_DEPENDENCIES= $(LIBRARIES) +ma_rt_test_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +ma_sp_test_DEPENDENCIES= $(LIBRARIES) +ma_sp_test_LDADD= @CLIENT_EXTRA_LDFLAGS@ libaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +libaria_a_SOURCES = ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c \ + ma_rnext.c ma_rnext_same.c \ + ma_search.c ma_page.c ma_key_recover.c ma_key.c \ + ma_locking.c ma_state.c \ + ma_rrnd.c ma_scan.c ma_cache.c \ + ma_statrec.c ma_packrec.c ma_dynrec.c \ + ma_blockrec.c ma_bitmap.c \ + ma_update.c ma_write.c ma_unique.c \ + ma_delete.c \ + ma_rprev.c ma_rfirst.c ma_rlast.c ma_rsame.c \ + ma_rsamepos.c ma_panic.c ma_close.c ma_create.c\ + ma_range.c ma_dbug.c ma_checksum.c \ + ma_changed.c 
ma_static.c ma_delete_all.c \ + ma_delete_table.c ma_rename.c ma_check.c \ + ma_keycache.c ma_preload.c ma_ft_parser.c \ + ma_ft_update.c ma_ft_boolean_search.c \ + ma_ft_nlq_search.c ft_maria.c ma_sort.c \ + trnman.c lockman.c tablockman.c \ + ma_rt_index.c ma_rt_key.c ma_rt_mbr.c ma_rt_split.c \ + ma_sp_key.c ma_control_file.c ma_loghandler.c \ + ma_pagecache.c ma_pagecaches.c \ + ma_checkpoint.c ma_recovery.c ma_commit.c \ + ma_pagecrc.c ma_recovery_util.c \ + ha_maria.cc compat_aliases.cc ma_servicethread.c +CLEANFILES = test?.MA? FT?.MA? isam.log ma_test_all ma_rt_test.MA? sp_test.MA? aria_log_control aria_log.0000* + +SUFFIXES = .sh + +.sh: + @RM@ -f $@ $@-t + @SED@ \ + -e 's!@''bindir''@!$(bindir)!g' \ + -e 's!@''scriptdir''@!$(bindir)!g' \ + -e 's!@''prefix''@!$(prefix)!g' \ + -e 's!@''datadir''@!$(datadir)!g' \ + -e 's!@''localstatedir''@!$(localstatedir)!g' \ + -e 's!@''libexecdir''@!$(libexecdir)!g' \ + -e 's!@''CC''@!@CC@!'\ + -e 's!@''CXX''@!@CXX@!'\ + -e 's!@''GXX''@!@GXX@!'\ + -e 's!@''PERL''@!@PERL@!' \ + -e 's!@''CFLAGS''@!@SAVE_CFLAGS@!'\ + -e 's!@''CXXFLAGS''@!@SAVE_CXXFLAGS@!'\ + -e 's!@''LDFLAGS''@!@SAVE_LDFLAGS@!'\ + -e 's!@''VERSION''@!@VERSION@!' \ + -e 's!@''MYSQL_SERVER_SUFFIX''@!@MYSQL_SERVER_SUFFIX@!' \ + -e 's!@''COMPILATION_COMMENT''@!@COMPILATION_COMMENT@!' \ + -e 's!@''MACHINE_TYPE''@!@MACHINE_TYPE@!' \ + -e 's!@''HOSTNAME''@!@HOSTNAME@!' \ + -e 's!@''SYSTEM_TYPE''@!@SYSTEM_TYPE@!' \ + -e 's!@''CHECK_PID''@!@CHECK_PID@!' \ + -e 's!@''FIND_PROC''@!@FIND_PROC@!' \ + -e 's!@''MYSQLD_DEFAULT_SWITCHES''@!@MYSQLD_DEFAULT_SWITCHES@!' \ + -e 's!@''MYSQL_UNIX_ADDR''@!@MYSQL_UNIX_ADDR@!' \ + -e 's!@''TARGET_LINUX''@!@TARGET_LINUX@!' \ + -e "s!@""CONF_COMMAND""@!@CONF_COMMAND@!" \ + -e 's!@''MYSQLD_USER''@!@MYSQLD_USER@!' \ + -e 's!@''sysconfdir''@!@sysconfdir@!' \ + -e 's!@''SHORT_MYSQL_INTRO''@!@SHORT_MYSQL_INTRO@!' \ + -e 's!@''SHARED_LIB_VERSION''@!@SHARED_LIB_VERSION@!' \ + -e 's!@''MYSQL_BASE_VERSION''@!@MYSQL_BASE_VERSION@!' 
\ + -e 's!@''MYSQL_NO_DASH_VERSION''@!@MYSQL_NO_DASH_VERSION@!' \ + -e 's!@''MYSQL_TCP_PORT''@!@MYSQL_TCP_PORT@!' \ + -e 's!@''PERL_DBI_VERSION''@!@PERL_DBI_VERSION@!' \ + -e 's!@''PERL_DBD_VERSION''@!@PERL_DBD_VERSION@!' \ + -e 's!@''PERL_DATA_DUMPER''@!@PERL_DATA_DUMPER@!' \ + $< > $@-t + @CHMOD@ +x $@-t + @MV@ $@-t $@ + +tags: + etags *.h *.c *.cc + +unittests = unittest + +test: + perl $(top_srcdir)/unittest/unit.pl run $(unittests) + +test-verbose: + HARNESS_VERBOSE=1 perl $(top_srcdir)/unittest/unit.pl run $(unittests) + +# Don't update the files from bitkeeper +%::SCCS/s.% diff --git a/storage/maria/compat_aliases.cc b/storage/maria/compat_aliases.cc new file mode 100644 index 00000000000..2d3c67d69a7 --- /dev/null +++ b/storage/maria/compat_aliases.cc @@ -0,0 +1,245 @@ +/* Copyright (C) 2010 Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + compatibility aliases for system and static variables +*/ +#include <my_global.h> +#include <maria.h> +#include <mysql/plugin.h> +#include "ma_loghandler.h" +#include "compat_aliases.h" + +ulong block_size_alias; +static MYSQL_SYSVAR_ULONG(block_size, block_size_alias, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Deprecated, use --aria-block-size instead", 0, 0, + MARIA_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH, + MARIA_MAX_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH); + +ulong checkpoint_interval_alias; +static MYSQL_SYSVAR_ULONG(checkpoint_interval, checkpoint_interval_alias, + PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-checkpoint-interval instead", + NULL, NULL, 30, 0, UINT_MAX, 1); + +ulong force_start_after_recovery_failures_alias; +static MYSQL_SYSVAR_ULONG(force_start_after_recovery_failures, force_start_after_recovery_failures_alias, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Deprecated, use --aria-force-start-after-recovery-failures instead", + NULL, NULL, 0, 0, UINT_MAX8, 1); + +my_bool page_checksum_alias; +static MYSQL_SYSVAR_BOOL(page_checksum, page_checksum_alias, 0, + "Deprecated, use --aria-page-checksum instead", 0, 0, 1); + +char *log_dir_path_alias; +static MYSQL_SYSVAR_STR(log_dir_path, log_dir_path_alias, + PLUGIN_VAR_NOSYSVAR | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Deprecated, use --aria-log-dir-path instead", + NULL, NULL, mysql_real_data_home); + +ulong log_file_size_alias; +static MYSQL_SYSVAR_ULONG(log_file_size, log_file_size_alias, + PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-log-file-size instead", + NULL, NULL, TRANSLOG_FILE_SIZE, + TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE); + +ulong group_commit_alias; +static MYSQL_SYSVAR_ENUM(group_commit, group_commit_alias, + 
PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-group-commit instead", + NULL, NULL, + TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib); + +ulong group_commit_interval_alias; +static MYSQL_SYSVAR_ULONG(group_commit_interval, group_commit_interval_alias, + PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-group-commit-interval instead", + NULL, NULL, 0, 0, UINT_MAX, 1); + +ulong log_purge_type_alias; +static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type_alias, + PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-log-purge-type instead", + NULL, NULL, TRANSLOG_PURGE_IMMIDIATE, + &maria_translog_purge_type_typelib); + +ulonglong max_sort_file_size_alias; +static MYSQL_SYSVAR_ULONGLONG(max_sort_file_size, max_sort_file_size_alias, + PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-max-temp-length instead", + 0, 0, MAX_FILE_SIZE, 0, MAX_FILE_SIZE, 1024*1024); + +ulong pagecache_age_threshold_alias; +static MYSQL_SYSVAR_ULONG(pagecache_age_threshold, pagecache_age_threshold_alias, + PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-pagecache-age-threshold instead", + 0, 0, 300, 100, ~0L, 100); + +ulonglong pagecache_buffer_size_alias; +static MYSQL_SYSVAR_ULONGLONG(pagecache_buffer_size, pagecache_buffer_size_alias, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Deprecated, use --aria-pagecache-buffer-size instead", + 0, 0, KEY_CACHE_SIZE, MALLOC_OVERHEAD, ~0UL, IO_SIZE); + +ulong pagecache_division_limit_alias; +static MYSQL_SYSVAR_ULONG(pagecache_division_limit, pagecache_division_limit_alias, + PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-pagecache-division-limit instead", + 0, 0, 100, 1, 100, 1); + +ulong recover_alias; +static MYSQL_SYSVAR_ENUM(recover, recover_alias, PLUGIN_VAR_OPCMDARG, + "Deprecated, use --aria-recover instead", + NULL, NULL, HA_RECOVER_DEFAULT, &maria_recover_typelib); + +ulong repair_threads_alias; +static MYSQL_THDVAR_ULONG(repair_threads, PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-repair-threads instead", + 0, 0, 1, 1, ~0L, 1); + +ulong 
sort_buffer_size_alias; +static MYSQL_THDVAR_ULONG(sort_buffer_size, PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-sort-buffer-size instead", + 0, 0, 128L*1024L*1024L, 4, ~0L, 1); + +ulong stats_method_alias; +static MYSQL_THDVAR_ENUM(stats_method, PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-stats-method instead", + 0, 0, 0, &maria_stats_method_typelib); + +ulong sync_log_dir_alias; +static MYSQL_SYSVAR_ENUM(sync_log_dir, sync_log_dir_alias, + PLUGIN_VAR_RQCMDARG, + "Deprecated, use --aria-sync-log-dir instead", + NULL, NULL, TRANSLOG_SYNC_DIR_NEWFILE, + &maria_sync_log_dir_typelib); + +my_bool used_for_temp_tables_alias= 1; +static MYSQL_SYSVAR_BOOL(used_for_temp_tables, + used_for_temp_tables_alias, PLUGIN_VAR_READONLY | PLUGIN_VAR_NOCMDOPT, + NULL, 0, 0, 1); + +static struct st_mysql_show_var status_variables_aliases[]= { + {"Maria", (char*) &status_variables, SHOW_ARRAY}, + {NullS, NullS, SHOW_LONG} +}; + +/* + There is one problem with aliases for command-line options. + Plugin initialization works like this + + for all plugins: + prepare command-line options + initialize command-line option variables to the default values + parse command line, assign values as necessary + + for all plugins: + call the plugin initialization function + + it means, we cannot have maria* and aria* command-line options to use + the same underlying variables - because after assigning maria* values, + MySQL will put there default values again preparing for parsing aria* + values. So, maria* values will be lost. + + So, we create separate set of variables for maria* options, + and take both values into account in ha_maria_init(). + + When the command line was parsed, we patch maria* options + to use the same variables as aria* options so that + set @@maria_some_var would have the same value as @@aria_some_var + without forcing us to copy the values around all the time. 
+*/ + +static struct st_mysql_sys_var* system_variables_aliases[]= { + MYSQL_SYSVAR(block_size), + MYSQL_SYSVAR(checkpoint_interval), + MYSQL_SYSVAR(force_start_after_recovery_failures), + MYSQL_SYSVAR(group_commit), + MYSQL_SYSVAR(group_commit_interval), + MYSQL_SYSVAR(log_dir_path), + MYSQL_SYSVAR(log_file_size), + MYSQL_SYSVAR(log_purge_type), + MYSQL_SYSVAR(max_sort_file_size), + MYSQL_SYSVAR(page_checksum), + MYSQL_SYSVAR(pagecache_age_threshold), + MYSQL_SYSVAR(pagecache_buffer_size), + MYSQL_SYSVAR(pagecache_division_limit), + MYSQL_SYSVAR(recover), + MYSQL_SYSVAR(repair_threads), + MYSQL_SYSVAR(sort_buffer_size), + MYSQL_SYSVAR(stats_method), + MYSQL_SYSVAR(sync_log_dir), + MYSQL_SYSVAR(used_for_temp_tables), + NULL +}; +/* COPY_SYSVAR(name): overwrite the alias descriptor of 'name' with the next entry of system_variables[] (advancing i); then, if name_alias holds a non-default value while the copied descriptor's variable still holds its default, store the alias value there. Wrapped in do/while(0) so that one invocation is exactly one statement. */ +#define COPY_SYSVAR(name) \ + do { memcpy(&MYSQL_SYSVAR_NAME(name), system_variables[i++], \ + sizeof(MYSQL_SYSVAR_NAME(name))); \ + if (name ## _alias != MYSQL_SYSVAR_NAME(name).def_val && \ + *MYSQL_SYSVAR_NAME(name).value == MYSQL_SYSVAR_NAME(name).def_val) \ + *MYSQL_SYSVAR_NAME(name).value= name ## _alias; } while (0) +/* COPY_THDVAR(name): same as COPY_SYSVAR, but the alias value is first read with THDVAR(0, name) into name_alias before the descriptor is overwritten, and the result is written back through THDVAR(0, name). */ +#define COPY_THDVAR(name) \ + do { name ## _alias= THDVAR(0, name); \ + memcpy(&MYSQL_SYSVAR_NAME(name), system_variables[i++], \ + sizeof(MYSQL_SYSVAR_NAME(name))); \ + if (name ## _alias != MYSQL_SYSVAR_NAME(name).def_val && \ + THDVAR(0, name) == MYSQL_SYSVAR_NAME(name).def_val) \ + THDVAR(0, name)= name ## _alias; } while (0) +/* Every COPY_* line below consumes system_variables[i++], so the variables have to be listed in the same order as the entries of system_variables[] (defined outside this file - NOTE(review): confirm the order there). */ +void copy_variable_aliases() +{ + int i= 0; /* index of the next system_variables[] entry to consume */ + COPY_SYSVAR(block_size); + COPY_SYSVAR(checkpoint_interval); + COPY_SYSVAR(force_start_after_recovery_failures); + COPY_SYSVAR(group_commit); + COPY_SYSVAR(group_commit_interval); + COPY_SYSVAR(log_dir_path); + COPY_SYSVAR(log_file_size); + COPY_SYSVAR(log_purge_type); + COPY_SYSVAR(max_sort_file_size); + COPY_SYSVAR(page_checksum); + COPY_SYSVAR(pagecache_age_threshold); + COPY_SYSVAR(pagecache_buffer_size); + COPY_SYSVAR(pagecache_division_limit); + COPY_SYSVAR(recover); + COPY_THDVAR(repair_threads); + COPY_THDVAR(sort_buffer_size); + COPY_THDVAR(stats_method); + 
COPY_SYSVAR(sync_log_dir); + COPY_SYSVAR(used_for_temp_tables); +} + +struct st_maria_plugin compat_aliases= { + MYSQL_DAEMON_PLUGIN, + &maria_storage_engine, + "Maria", + "Monty Program Ab", + "Compatibility aliases for the Aria engine", + PLUGIN_LICENSE_GPL, + NULL, + NULL, + 0x0105, + status_variables_aliases, + system_variables_aliases, + "1.5", + MariaDB_PLUGIN_MATURITY_GAMMA +}; + diff --git a/storage/maria/compat_aliases.h b/storage/maria/compat_aliases.h new file mode 100644 index 00000000000..46a4da74eec --- /dev/null +++ b/storage/maria/compat_aliases.h @@ -0,0 +1,27 @@ +/* Copyright (C) 2010 Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +extern struct st_maria_plugin compat_aliases; +extern char mysql_real_data_home[FN_REFLEN]; +extern TYPELIB maria_recover_typelib; +extern TYPELIB maria_stats_method_typelib; +extern TYPELIB maria_translog_purge_type_typelib; +extern TYPELIB maria_sync_log_dir_typelib; +extern TYPELIB maria_group_commit_typelib; +extern struct st_mysql_storage_engine maria_storage_engine; +extern my_bool use_maria_for_temp_tables; +extern struct st_mysql_sys_var* system_variables[]; +extern st_mysql_show_var status_variables[]; +void copy_variable_aliases(); diff --git a/storage/maria/file_formats.txt b/storage/maria/file_formats.txt new file mode 100644 index 00000000000..927e8ad985e --- /dev/null +++ b/storage/maria/file_formats.txt @@ -0,0 +1,71 @@ +# +# This should contain a description of the file format for most Maria files +# + +# Description of the header in the index file + +Header, 24 bytes + +Pos Length + +0 4 file_version +4 2 options +6 2 header_length +8 2 state_info_length +10 2 base_info_length +12 2 base_pos +14 2 key_parts +16 2 unique_key_parts +18 1 keys +19 1 uniques +20 1 language +21 1 fulltext_keys +22 1 data_file_type +23 1 org_data_file_type + + +Status part + +24 2 open_count +26 2 state_changed +28 7 create_rename_lsn + 7 is_of_horizon + 7 skip_redo_lsn + 8 state.records + 8 state->state.del + 8 state->split + 8 state->dellink + 8 state->first_bitmap_with_space + 8 state->state.key_file_length + 8 state->state.data_file_length + 8 state->state.empty + 8 state->state.key_empty + 8 state->auto_increment + 8 state->state.checksum + 4 state->process + 4 state->unique + 4 state->status + 4 state->update_count + + 1 state->sortkey + 1 reserved + +for each key + 8 state->key_root[i] + + 8 state->key_del + 4 state->sec_index_changed + 4 
state->sec_index_used + 4 state->version + 8 state->key_map + 8 state->create_time + 8 state->recover_time + 8 state->check_time + 8 state->records_at_analyze + +for each key + 4 reserved + +for each key part + 8 state->rec_per_key_part[i] + 4 state->nulls_per_key_part[i] diff --git a/storage/maria/ft_maria.c b/storage/maria/ft_maria.c new file mode 100644 index 00000000000..b1b24592593 --- /dev/null +++ b/storage/maria/ft_maria.c @@ -0,0 +1,48 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +/* + Glue between the generic fulltext interface and maria: maria_ft_init_search() starts either the boolean search or the nlq search, depending on the FT_BOOL bit of 'flags'; the two _ft_vft tables below bundle the nlq and the boolean callbacks. +*/ + +#include "ma_ftdefs.h" + +FT_INFO *maria_ft_init_search(uint flags, void *info, uint keynr, + uchar *query, size_t query_len, + CHARSET_INFO *cs, uchar *record) +{ + MARIA_HA *handler= (MARIA_HA *) info; /* info arrives as void*, used as MARIA_HA */ + + if (flags & FT_BOOL) /* boolean search: gets cs, not flags/record */ + return maria_ft_init_boolean_search(handler, keynr, query, + query_len, cs); + /* otherwise nlq search: gets flags and record, not cs */ + return maria_ft_init_nlq_search(handler, keynr, query, query_len, + flags, record); +} + +const struct _ft_vft _ma_ft_vft_nlq = { + maria_ft_nlq_read_next, maria_ft_nlq_find_relevance, + maria_ft_nlq_close_search, maria_ft_nlq_get_relevance, + maria_ft_nlq_reinit_search +}; +const struct _ft_vft _ma_ft_vft_boolean = { + maria_ft_boolean_read_next, maria_ft_boolean_find_relevance, + maria_ft_boolean_close_search, maria_ft_boolean_get_relevance, + maria_ft_boolean_reinit_search +}; + diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc new file mode 100644 index 00000000000..27958285a2e --- /dev/null +++ b/storage/maria/ha_maria.cc @@ -0,0 +1,3686 @@ +/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (C) 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + +#ifdef USE_PRAGMA_IMPLEMENTATION +#pragma implementation // gcc: Class implementation +#endif + +#define MYSQL_SERVER 1 +#include "mysql_priv.h" +#include <mysql/plugin.h> +#include <m_ctype.h> +#include <my_dir.h> +#include <myisampack.h> +#include <my_bit.h> +#include "ha_maria.h" +#include "trnman_public.h" +#include "trnman.h" +#include "compat_aliases.h" + +C_MODE_START +#include "maria_def.h" +#include "ma_rt_index.h" +#include "ma_blockrec.h" +#include "ma_checkpoint.h" +#include "ma_recovery.h" +C_MODE_END + +/* + Note that in future versions, only *transactional* Maria tables can + rollback, so this flag should be up or down conditionally. +*/ +#ifdef MARIA_CANNOT_ROLLBACK +#define CANNOT_ROLLBACK_FLAG HA_NO_TRANSACTIONS +#define trans_register_ha(A, B, C) do { /* nothing */ } while(0) +#else +#define CANNOT_ROLLBACK_FLAG 0 +#endif +#define THD_TRN (*(TRN **)thd_ha_data(thd, maria_hton)) + +ulong pagecache_division_limit, pagecache_age_threshold; +ulonglong pagecache_buffer_size; + +/** + As the auto-repair is initiated when opened from the SQL layer + (open_unireg_entry(), check_and_repair()), it does not happen when Maria's + Recovery internally opens the table to apply log records to it, which is + good. It would happen only after Recovery, if the table is still + corrupted. +*/ +ulong maria_recover_options= HA_RECOVER_NONE; +handlerton *maria_hton; + +/* bits in maria_recover_options */ +const char *maria_recover_names[]= +{ + /* + Compared to MyISAM, "default" was renamed to "normal" as it collided with + SET var=default which sets to the var's default i.e. what happens when the + var is not set i.e. HA_RECOVER_NONE. 
+ Another change is that OFF is used to disable, not ""; this is to have OFF + display in SHOW VARIABLES which is better than "". + */ + "OFF", "NORMAL", "BACKUP", "FORCE", "QUICK", NullS +}; +TYPELIB maria_recover_typelib= +{ + array_elements(maria_recover_names) - 1, "", + maria_recover_names, NULL +}; + +const char *maria_stats_method_names[]= +{ + "nulls_unequal", "nulls_equal", + "nulls_ignored", NullS +}; +TYPELIB maria_stats_method_typelib= +{ + array_elements(maria_stats_method_names) - 1, "", + maria_stats_method_names, NULL +}; + +/* transactions log purge mode */ +const char *maria_translog_purge_type_names[]= +{ + "immediate", "external", "at_flush", NullS +}; +TYPELIB maria_translog_purge_type_typelib= +{ + array_elements(maria_translog_purge_type_names) - 1, "", + maria_translog_purge_type_names, NULL +}; + +/* transactional log directory sync */ +const char *maria_sync_log_dir_names[]= +{ + "NEVER", "NEWFILE", "ALWAYS", NullS +}; +TYPELIB maria_sync_log_dir_typelib= +{ + array_elements(maria_sync_log_dir_names) - 1, "", + maria_sync_log_dir_names, NULL +}; + +/* transactional log group commit */ +const char *maria_group_commit_names[]= +{ + "none", "hard", "soft", NullS +}; +TYPELIB maria_group_commit_typelib= +{ + array_elements(maria_group_commit_names) - 1, "", + maria_group_commit_names, NULL +}; + +/** Interval between background checkpoints in seconds */ +static ulong checkpoint_interval; +static void update_checkpoint_interval(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save); +static void update_maria_group_commit(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save); +static void update_maria_group_commit_interval(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save); +/** After that many consecutive recovery failures, remove logs */ +static ulong force_start_after_recovery_failures; +static void update_log_file_size(MYSQL_THD thd, + struct 
st_mysql_sys_var *var, + void *var_ptr, const void *save); + +static MYSQL_SYSVAR_ULONG(block_size, maria_block_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Block size to be used for Aria index pages.", 0, 0, + MARIA_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH, + MARIA_MAX_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH); + +static MYSQL_SYSVAR_ULONG(checkpoint_interval, checkpoint_interval, + PLUGIN_VAR_RQCMDARG, + "Interval between automatic checkpoints, in seconds; 0 means" + " 'no automatic checkpoints' which makes sense only for testing.", + NULL, update_checkpoint_interval, 30, 0, UINT_MAX, 1); + +static MYSQL_SYSVAR_ULONG(force_start_after_recovery_failures, + force_start_after_recovery_failures, + /* + Read-only because setting it on the fly has no useful effect, + should be set on command-line. + */ + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of consecutive log recovery failures after which logs will be" + " automatically deleted to cure the problem; 0 (the default) disables" + " the feature.", NULL, NULL, 0, 0, UINT_MAX8, 1); + +static MYSQL_SYSVAR_BOOL(page_checksum, maria_page_checksums, 0, + "Maintain page checksums (can be overridden per table " + "with PAGE_CHECKSUM clause in CREATE TABLE)", 0, 0, 1); + +/* This is a command-line-only option (PLUGIN_VAR_NOSYSVAR) */ +static MYSQL_SYSVAR_STR(log_dir_path, maria_data_root, + PLUGIN_VAR_NOSYSVAR | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to the directory where to store transactional log", + NULL, NULL, mysql_real_data_home); + + +static MYSQL_SYSVAR_ULONG(log_file_size, log_file_size, + PLUGIN_VAR_RQCMDARG, + "Limit for transaction log size", + NULL, update_log_file_size, TRANSLOG_FILE_SIZE, + TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE); + +static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit, + PLUGIN_VAR_RQCMDARG, + "Specifies Aria group commit mode. 
" + "Possible values are \"none\" (no group commit), " + "\"hard\" (with waiting to actual commit), " + "\"soft\" (no wait for commit (DANGEROUS!!!))", + NULL, update_maria_group_commit, + TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib); + +static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval, + PLUGIN_VAR_RQCMDARG, /* no PLUGIN_VAR_READONLY; update_maria_group_commit_interval is the update callback */ + "Interval between commits in microseconds (1/1000000 sec)." + " 0 stands for no waiting" + " for other threads to come and do a commit in \"hard\" mode and no" + " sync()/commit at all in \"soft\" mode. Option has only an effect" + " if aria_group_commit is used", + NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1); + +static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type, + PLUGIN_VAR_RQCMDARG, + "Specifies how Aria transactional log will be purged. " + "Possible values of name are \"immediate\", \"external\" " + "and \"at_flush\"", + NULL, NULL, TRANSLOG_PURGE_IMMIDIATE, + &maria_translog_purge_type_typelib); + +static MYSQL_SYSVAR_ULONGLONG(max_sort_file_size, + maria_max_temp_length, PLUGIN_VAR_RQCMDARG, + "Don't use the fast sort index method to created index if the " + "temporary file would get bigger than this.", + 0, 0, MAX_FILE_SIZE & ~(1*MB-1), 0, MAX_FILE_SIZE, 1*MB); + +static MYSQL_SYSVAR_ULONG(pagecache_age_threshold, + pagecache_age_threshold, PLUGIN_VAR_RQCMDARG, + "This characterizes the number of hits a hot block has to be untouched " + "until it is considered aged enough to be downgraded to a warm block. " + "This specifies the percentage ratio of that number of hits to the " + "total number of blocks in the page cache.", 0, 0, + 300, 100, ~0L, 100); + +static MYSQL_SYSVAR_ULONGLONG(pagecache_buffer_size, pagecache_buffer_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The size of the buffer used for index blocks for Aria tables. 
" + "Increase this to get better index handling (for all reads and " + "multiple writes) to as much as you can afford.", 0, 0, + KEY_CACHE_SIZE, 0, ~(ulong) 0, 1); + +static MYSQL_SYSVAR_ULONG(pagecache_division_limit, pagecache_division_limit, + PLUGIN_VAR_RQCMDARG, + "The minimum percentage of warm blocks in key cache", 0, 0, + 100, 1, 100, 1); + +static MYSQL_SYSVAR_ENUM(recover, maria_recover_options, PLUGIN_VAR_OPCMDARG, + "Specifies how corrupted tables should be automatically repaired." + " Possible values are \"NORMAL\" (the default), \"BACKUP\", \"FORCE\"," + " \"QUICK\", or \"OFF\" which is like not using the option.", + NULL, NULL, HA_RECOVER_DEFAULT, &maria_recover_typelib); + +static MYSQL_THDVAR_ULONG(repair_threads, PLUGIN_VAR_RQCMDARG, + "Number of threads to use when repairing Aria tables. The value of 1 " + "disables parallel repair.", + 0, 0, 1, 1, ~0L, 1); + +static MYSQL_THDVAR_ULONG(sort_buffer_size, PLUGIN_VAR_RQCMDARG, + "The buffer that is allocated when sorting the index when doing a " + "REPAIR or when creating indexes with CREATE INDEX or ALTER TABLE.", + 0, 0, 128L*1024L*1024L, 4, ~0L, 1); + +static MYSQL_THDVAR_ENUM(stats_method, PLUGIN_VAR_RQCMDARG, + "Specifies how Aria index statistics collection code should treat " + "NULLs. Possible values are \"nulls_unequal\", \"nulls_equal\", " + "and \"nulls_ignored\".", 0, 0, 0, &maria_stats_method_typelib); + +static MYSQL_SYSVAR_ENUM(sync_log_dir, sync_log_dir, PLUGIN_VAR_RQCMDARG, + "Controls syncing directory after log file growth and new file " + "creation. 
Possible values are \"never\", \"newfile\" and " + "\"always\").", NULL, NULL, TRANSLOG_SYNC_DIR_NEWFILE, + &maria_sync_log_dir_typelib); + +#ifdef USE_MARIA_FOR_TMP_TABLES +#define USE_MARIA_FOR_TMP_TABLES_VAL 1 +#else +#define USE_MARIA_FOR_TMP_TABLES_VAL 0 +#endif +my_bool use_maria_for_temp_tables= USE_MARIA_FOR_TMP_TABLES_VAL; + +static MYSQL_SYSVAR_BOOL(used_for_temp_tables, + use_maria_for_temp_tables, PLUGIN_VAR_READONLY | PLUGIN_VAR_NOCMDOPT, + "Whether temporary tables should be MyISAM or Aria", 0, 0, + 1); + +/***************************************************************************** +** MARIA tables +*****************************************************************************/ + +static handler *maria_create_handler(handlerton *hton, + TABLE_SHARE * table, + MEM_ROOT *mem_root) +{ + return new (mem_root) ha_maria(hton, table); +} + + +// collect errors printed by maria_check routines + +static void _ma_check_print_msg(HA_CHECK *param, const char *msg_type, + const char *fmt, va_list args) +{ + THD *thd= (THD *) param->thd; + Protocol *protocol= thd->protocol; + uint length, msg_length; + char msgbuf[HA_MAX_MSG_BUF]; + char name[NAME_LEN * 2 + 2]; + + msg_length= my_vsnprintf(msgbuf, sizeof(msgbuf), fmt, args); + msgbuf[sizeof(msgbuf) - 1]= 0; // healthy paranoia + + DBUG_PRINT(msg_type, ("message: %s", msgbuf)); + + if (!thd->vio_ok()) + { + sql_print_error(fmt, args); + return; + } + + if (param->testflag & + (T_CREATE_MISSING_KEYS | T_SAFE_REPAIR | T_AUTO_REPAIR)) + { + my_message(ER_NOT_KEYFILE, msgbuf, MYF(MY_WME)); + return; + } + length= (uint) (strxmov(name, param->db_name, ".", param->table_name, + NullS) - name); + /* + TODO: switch from protocol to push_warning here. The main reason we didn't + it yet is parallel repair. Due to following trace: + ma_check_print_msg/push_warning/sql_alloc/my_pthread_getspecific_ptr. + + Also we likely need to lock mutex here (in both cases with protocol and + push_warning). 
+ */ + protocol->prepare_for_resend(); + protocol->store(name, length, system_charset_info); + protocol->store(param->op_name, system_charset_info); + protocol->store(msg_type, system_charset_info); + protocol->store(msgbuf, msg_length, system_charset_info); + if (protocol->write()) + sql_print_error("Failed on my_net_write, writing to stderr instead: %s\n", + msgbuf); + return; +} + + +/* + Convert TABLE object to Maria key and column definition + + SYNOPSIS + table2maria() + table_arg in TABLE object. + keydef_out out Maria key definition. + recinfo_out out Maria column definition. + records_out out Number of fields. + + DESCRIPTION + This function will allocate and initialize Maria key and column + definition for further use in ma_create or for a check for underlying + table conformance in merge engine. + + The caller needs to free *recinfo_out after use. Since *recinfo_out + and *keydef_out are allocated with a my_multi_malloc, *keydef_out + is freed automatically when *recinfo_out is freed. 
+ + RETURN VALUE + 0 OK + # error code +*/ + +static int table2maria(TABLE *table_arg, data_file_type row_type, + MARIA_KEYDEF **keydef_out, + MARIA_COLUMNDEF **recinfo_out, uint *records_out, + MARIA_CREATE_INFO *create_info) +{ + uint i, j, recpos, minpos, fieldpos, temp_length, length; + enum ha_base_keytype type= HA_KEYTYPE_BINARY; + uchar *record; + KEY *pos; + MARIA_KEYDEF *keydef; + MARIA_COLUMNDEF *recinfo, *recinfo_pos; + HA_KEYSEG *keyseg; + TABLE_SHARE *share= table_arg->s; + uint options= share->db_options_in_use; + DBUG_ENTER("table2maria"); + + if (row_type == BLOCK_RECORD) + options|= HA_OPTION_PACK_RECORD; + + if (!(my_multi_malloc(MYF(MY_WME), + recinfo_out, (share->fields * 2 + 2) * sizeof(MARIA_COLUMNDEF), + keydef_out, share->keys * sizeof(MARIA_KEYDEF), + &keyseg, + (share->key_parts + share->keys) * sizeof(HA_KEYSEG), + NullS))) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); /* purecov: inspected */ + keydef= *keydef_out; + recinfo= *recinfo_out; + pos= table_arg->key_info; + for (i= 0; i < share->keys; i++, pos++) + { + keydef[i].flag= (uint16) (pos->flags & (HA_NOSAME | HA_FULLTEXT | + HA_SPATIAL)); + keydef[i].key_alg= pos->algorithm == HA_KEY_ALG_UNDEF ? + (pos->flags & HA_SPATIAL ? 
HA_KEY_ALG_RTREE : HA_KEY_ALG_BTREE) : + pos->algorithm; + keydef[i].block_length= pos->block_size; + keydef[i].seg= keyseg; + keydef[i].keysegs= pos->key_parts; + for (j= 0; j < pos->key_parts; j++) + { + Field *field= pos->key_part[j].field; + type= field->key_type(); + keydef[i].seg[j].flag= pos->key_part[j].key_part_flag; + + if (options & HA_OPTION_PACK_KEYS || + (pos->flags & (HA_PACK_KEY | HA_BINARY_PACK_KEY | + HA_SPACE_PACK_USED))) + { + if (pos->key_part[j].length > 8 && + (type == HA_KEYTYPE_TEXT || + type == HA_KEYTYPE_NUM || + (type == HA_KEYTYPE_BINARY && !field->zero_pack()))) + { + /* No blobs here */ + if (j == 0) + keydef[i].flag|= HA_PACK_KEY; + if (!(field->flags & ZEROFILL_FLAG) && + (field->type() == MYSQL_TYPE_STRING || + field->type() == MYSQL_TYPE_VAR_STRING || + ((int) (pos->key_part[j].length - field->decimals())) >= 4)) + keydef[i].seg[j].flag|= HA_SPACE_PACK; + } + else if (j == 0 && (!(pos->flags & HA_NOSAME) || pos->key_length > 16)) + keydef[i].flag|= HA_BINARY_PACK_KEY; + } + keydef[i].seg[j].type= (int) type; + keydef[i].seg[j].start= pos->key_part[j].offset; + keydef[i].seg[j].length= pos->key_part[j].length; + keydef[i].seg[j].bit_start= keydef[i].seg[j].bit_end= + keydef[i].seg[j].bit_length= 0; + keydef[i].seg[j].bit_pos= 0; + keydef[i].seg[j].language= field->charset()->number; + + if (field->null_ptr) + { + keydef[i].seg[j].null_bit= field->null_bit; + keydef[i].seg[j].null_pos= (uint) (field->null_ptr- + (uchar*) table_arg->record[0]); + } + else + { + keydef[i].seg[j].null_bit= 0; + keydef[i].seg[j].null_pos= 0; + } + if (field->type() == MYSQL_TYPE_BLOB || + field->type() == MYSQL_TYPE_GEOMETRY) + { + keydef[i].seg[j].flag|= HA_BLOB_PART; + /* save number of bytes used to pack length */ + keydef[i].seg[j].bit_start= (uint) (field->pack_length() - + share->blob_ptr_size); + } + else if (field->type() == MYSQL_TYPE_BIT) + { + keydef[i].seg[j].bit_length= ((Field_bit *) field)->bit_len; + keydef[i].seg[j].bit_start= 
((Field_bit *) field)->bit_ofs; + keydef[i].seg[j].bit_pos= (uint) (((Field_bit *) field)->bit_ptr - + (uchar*) table_arg->record[0]); + } + } + keyseg+= pos->key_parts; + } + if (table_arg->found_next_number_field) + keydef[share->next_number_index].flag|= HA_AUTO_KEY; + record= table_arg->record[0]; + recpos= 0; + recinfo_pos= recinfo; + create_info->null_bytes= table_arg->s->null_bytes; + + while (recpos < (uint) share->stored_rec_length) + { + Field **field, *found= 0; + minpos= share->reclength; + length= 0; + + for (field= table_arg->field; *field; field++) + { + if ((fieldpos= (*field)->offset(record)) >= recpos && + fieldpos <= minpos) + { + /* skip null fields */ + if (!(temp_length= (*field)->pack_length_in_rec())) + continue; /* Skip null-fields */ + if (! found || fieldpos < minpos || + (fieldpos == minpos && temp_length < length)) + { + minpos= fieldpos; + found= *field; + length= temp_length; + } + } + } + DBUG_PRINT("loop", ("found: 0x%lx recpos: %d minpos: %d length: %d", + (long) found, recpos, minpos, length)); + if (!found) + break; + + if (found->flags & BLOB_FLAG) + recinfo_pos->type= FIELD_BLOB; + else if (found->type() == MYSQL_TYPE_VARCHAR) + recinfo_pos->type= FIELD_VARCHAR; + else if (!(options & HA_OPTION_PACK_RECORD) || + (found->zero_pack() && (found->flags & PRI_KEY_FLAG))) + recinfo_pos->type= FIELD_NORMAL; + else if (found->zero_pack()) + recinfo_pos->type= FIELD_SKIP_ZERO; + else + recinfo_pos->type= ((length <= 3 || + (found->flags & ZEROFILL_FLAG)) ? + FIELD_NORMAL : + found->type() == MYSQL_TYPE_STRING || + found->type() == MYSQL_TYPE_VAR_STRING ? 
+ FIELD_SKIP_ENDSPACE : + FIELD_SKIP_PRESPACE); + if (found->null_ptr) + { + recinfo_pos->null_bit= found->null_bit; + recinfo_pos->null_pos= (uint) (found->null_ptr - + (uchar*) table_arg->record[0]); + } + else + { + recinfo_pos->null_bit= 0; + recinfo_pos->null_pos= 0; + } + (recinfo_pos++)->length= (uint16) length; + recpos= minpos + length; + DBUG_PRINT("loop", ("length: %d type: %d", + recinfo_pos[-1].length,recinfo_pos[-1].type)); + } + *records_out= (uint) (recinfo_pos - recinfo); + DBUG_RETURN(0); +} + + +/* + Check for underlying table conformance + + SYNOPSIS + maria_check_definition() + t1_keyinfo in First table key definition + t1_recinfo in First table record definition + t1_keys in Number of keys in first table + t1_recs in Number of records in first table + t2_keyinfo in Second table key definition + t2_recinfo in Second table record definition + t2_keys in Number of keys in second table + t2_recs in Number of records in second table + strict in Strict check switch + + DESCRIPTION + This function compares two Maria definitions. By intention it was done + to compare merge table definition against underlying table definition. + It may also be used to compare dot-frm and MAI definitions of Maria + table as well to compare different Maria table definitions. + + For merge table it is not required that number of keys in merge table + must exactly match number of keys in underlying table. When calling this + function for underlying table conformance check, 'strict' flag must be + set to false, and converted merge definition must be passed as t1_*. + + Otherwise 'strict' flag must be set to 1 and it is not required to pass + converted dot-frm definition as t1_*. + + RETURN VALUE + 0 - Equal definitions. + 1 - Different definitions. + + TODO + - compare FULLTEXT keys; + - compare SPATIAL keys; + - compare FIELD_SKIP_ZERO which is converted to FIELD_NORMAL correctly + (should be correctly detected in table2maria). 
+*/ + +int maria_check_definition(MARIA_KEYDEF *t1_keyinfo, + MARIA_COLUMNDEF *t1_recinfo, + uint t1_keys, uint t1_recs, + MARIA_KEYDEF *t2_keyinfo, + MARIA_COLUMNDEF *t2_recinfo, + uint t2_keys, uint t2_recs, bool strict) +{ + uint i, j; + DBUG_ENTER("maria_check_definition"); + if ((strict ? t1_keys != t2_keys : t1_keys > t2_keys)) + { + DBUG_PRINT("error", ("Number of keys differs: t1_keys=%u, t2_keys=%u", + t1_keys, t2_keys)); + DBUG_RETURN(1); + } + if (t1_recs != t2_recs) + { + DBUG_PRINT("error", ("Number of recs differs: t1_recs=%u, t2_recs=%u", + t1_recs, t2_recs)); + DBUG_RETURN(1); + } + for (i= 0; i < t1_keys; i++) + { + HA_KEYSEG *t1_keysegs= t1_keyinfo[i].seg; + HA_KEYSEG *t2_keysegs= t2_keyinfo[i].seg; + if (t1_keyinfo[i].flag & HA_FULLTEXT && t2_keyinfo[i].flag & HA_FULLTEXT) + continue; + else if (t1_keyinfo[i].flag & HA_FULLTEXT || + t2_keyinfo[i].flag & HA_FULLTEXT) + { + DBUG_PRINT("error", ("Key %d has different definition", i)); + DBUG_PRINT("error", ("t1_fulltext= %d, t2_fulltext=%d", + test(t1_keyinfo[i].flag & HA_FULLTEXT), + test(t2_keyinfo[i].flag & HA_FULLTEXT))); + DBUG_RETURN(1); + } + if (t1_keyinfo[i].flag & HA_SPATIAL && t2_keyinfo[i].flag & HA_SPATIAL) + continue; + else if (t1_keyinfo[i].flag & HA_SPATIAL || + t2_keyinfo[i].flag & HA_SPATIAL) + { + DBUG_PRINT("error", ("Key %d has different definition", i)); + DBUG_PRINT("error", ("t1_spatial= %d, t2_spatial=%d", + test(t1_keyinfo[i].flag & HA_SPATIAL), + test(t2_keyinfo[i].flag & HA_SPATIAL))); + DBUG_RETURN(1); + } + if (t1_keyinfo[i].keysegs != t2_keyinfo[i].keysegs || + t1_keyinfo[i].key_alg != t2_keyinfo[i].key_alg) + { + DBUG_PRINT("error", ("Key %d has different definition", i)); + DBUG_PRINT("error", ("t1_keysegs=%d, t1_key_alg=%d", + t1_keyinfo[i].keysegs, t1_keyinfo[i].key_alg)); + DBUG_PRINT("error", ("t2_keysegs=%d, t2_key_alg=%d", + t2_keyinfo[i].keysegs, t2_keyinfo[i].key_alg)); + DBUG_RETURN(1); + } + for (j= t1_keyinfo[i].keysegs; j--;) + { + uint8 
t1_keysegs_j__type= t1_keysegs[j].type; + /* + Table migration from 4.1 to 5.1. In 5.1 a *TEXT key part is + always HA_KEYTYPE_VARTEXT2. In 4.1 we had only the equivalent of + HA_KEYTYPE_VARTEXT1. Since we treat both the same on MyISAM + level, we can ignore a mismatch between these types. + */ + if ((t1_keysegs[j].flag & HA_BLOB_PART) && + (t2_keysegs[j].flag & HA_BLOB_PART)) + { + if ((t1_keysegs_j__type == HA_KEYTYPE_VARTEXT2) && + (t2_keysegs[j].type == HA_KEYTYPE_VARTEXT1)) + t1_keysegs_j__type= HA_KEYTYPE_VARTEXT1; /* purecov: tested */ + else if ((t1_keysegs_j__type == HA_KEYTYPE_VARBINARY2) && + (t2_keysegs[j].type == HA_KEYTYPE_VARBINARY1)) + t1_keysegs_j__type= HA_KEYTYPE_VARBINARY1; /* purecov: inspected */ + } + + if (t1_keysegs_j__type != t2_keysegs[j].type || + t1_keysegs[j].language != t2_keysegs[j].language || + t1_keysegs[j].null_bit != t2_keysegs[j].null_bit || + t1_keysegs[j].length != t2_keysegs[j].length) + { + DBUG_PRINT("error", ("Key segment %d (key %d) has different " + "definition", j, i)); + DBUG_PRINT("error", ("t1_type=%d, t1_language=%d, t1_null_bit=%d, " + "t1_length=%d", + t1_keysegs[j].type, t1_keysegs[j].language, + t1_keysegs[j].null_bit, t1_keysegs[j].length)); + DBUG_PRINT("error", ("t2_type=%d, t2_language=%d, t2_null_bit=%d, " + "t2_length=%d", + t2_keysegs[j].type, t2_keysegs[j].language, + t2_keysegs[j].null_bit, t2_keysegs[j].length)); + + DBUG_RETURN(1); + } + } + } + + for (i= 0; i < t1_recs; i++) + { + MARIA_COLUMNDEF *t1_rec= &t1_recinfo[i]; + MARIA_COLUMNDEF *t2_rec= &t2_recinfo[i]; + /* + FIELD_SKIP_ZERO can be changed to FIELD_NORMAL in maria_create, + see NOTE1 in ma_create.c + */ + if ((t1_rec->type != t2_rec->type && + !(t1_rec->type == (int) FIELD_SKIP_ZERO && + t1_rec->length == 1 && + t2_rec->type == (int) FIELD_NORMAL)) || + t1_rec->length != t2_rec->length || + t1_rec->null_bit != t2_rec->null_bit) + { + DBUG_PRINT("error", ("Field %d has different definition", i)); + DBUG_PRINT("error", ("t1_type=%d, 
t1_length=%d, t1_null_bit=%d", + t1_rec->type, t1_rec->length, t1_rec->null_bit)); + DBUG_PRINT("error", ("t2_type=%d, t2_length=%d, t2_null_bit=%d", + t2_rec->type, t2_rec->length, t2_rec->null_bit)); + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +extern "C" { + +int _ma_killed_ptr(HA_CHECK *param) +{ + return thd_killed((THD*)param->thd); +} + + +void _ma_check_print_error(HA_CHECK *param, const char *fmt, ...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_error"); + param->error_printed |= 1; + param->out_flag |= O_DATA_LOST; + va_start(args, fmt); + _ma_check_print_msg(param, "error", fmt, args); + va_end(args); + DBUG_VOID_RETURN; +} + + +void _ma_check_print_info(HA_CHECK *param, const char *fmt, ...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_info"); + va_start(args, fmt); + _ma_check_print_msg(param, "info", fmt, args); + va_end(args); + DBUG_VOID_RETURN; +} + + +void _ma_check_print_warning(HA_CHECK *param, const char *fmt, ...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_warning"); + param->warning_printed= 1; + param->out_flag |= O_DATA_LOST; + va_start(args, fmt); + _ma_check_print_msg(param, "warning", fmt, args); + va_end(args); + DBUG_VOID_RETURN; +} + +/* + Create a transaction object + + SYNOPSIS + info Maria handler + + RETURN + 0 ok + # Error number (HA_ERR_OUT_OF_MEM) +*/ + +static int maria_create_trn_for_mysql(MARIA_HA *info) +{ + THD *thd= (THD*) info->external_ptr; + TRN *trn= THD_TRN; + DBUG_ENTER("maria_create_trn_for_mysql"); + + if (!trn) /* no transaction yet - open it now */ + { + trn= trnman_new_trn(& thd->transaction.wt); + if (unlikely(!trn)) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + THD_TRN= trn; + if (thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) + trans_register_ha(thd, TRUE, maria_hton); + } + _ma_set_trn_for_table(info, trn); + if (!trnman_increment_locked_tables(trn)) + { + trans_register_ha(thd, FALSE, maria_hton); + trnman_new_statement(trn); + } +#ifdef EXTRA_DEBUG + if (info->lock_type == F_WRLCK && + 
! (trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED)) + { + trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED | + TRN_STATE_TABLES_CAN_CHANGE); + (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), + thd->query_length()); + } + else + { + DBUG_PRINT("info", ("lock_type: %d trnman_flags: %u", + info->lock_type, trnman_get_flags(trn))); + } + +#endif + DBUG_RETURN(0); +} + +} /* extern "C" */ + +/** + Transactional table doing bulk insert with one single UNDO + (UNDO_BULK_INSERT) and with repair. +*/ +#define BULK_INSERT_SINGLE_UNDO_AND_REPAIR 1 +/** + Transactional table doing bulk insert with one single UNDO + (UNDO_BULK_INSERT) and without repair. +*/ +#define BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR 2 +/** + None of BULK_INSERT_SINGLE_UNDO_AND_REPAIR and + BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR. +*/ +#define BULK_INSERT_NONE 0 + +ha_maria::ha_maria(handlerton *hton, TABLE_SHARE *table_arg): +handler(hton, table_arg), file(0), +int_table_flags(HA_NULL_IN_KEY | HA_CAN_FULLTEXT | HA_CAN_SQL_HANDLER | + HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE | + HA_DUPLICATE_POS | HA_CAN_INDEX_BLOBS | HA_AUTO_PART_KEY | + HA_FILE_BASED | HA_CAN_GEOMETRY | CANNOT_ROLLBACK_FLAG | + HA_CAN_BIT_FIELD | HA_CAN_RTREEKEYS | + HA_HAS_RECORDS | HA_STATS_RECORDS_IS_EXACT), +can_enable_indexes(1), bulk_insert_single_undo(BULK_INSERT_NONE) +{} + + +handler *ha_maria::clone(MEM_ROOT *mem_root) +{ + ha_maria *new_handler= static_cast <ha_maria *>(handler::clone(mem_root)); + if (new_handler) + { + new_handler->file->state= file->state; + /* maria_create_trn_for_mysql() is never called for clone() tables */ + new_handler->file->trn= file->trn; + } + return new_handler; +} + + +static const char *ha_maria_exts[]= +{ + MARIA_NAME_IEXT, + MARIA_NAME_DEXT, + NullS +}; + + +const char **ha_maria::bas_ext() const +{ + return ha_maria_exts; +} + + +const char *ha_maria::index_type(uint key_number) +{ + return ((table->key_info[key_number].flags & 
HA_FULLTEXT) ? + "FULLTEXT" : + (table->key_info[key_number].flags & HA_SPATIAL) ? + "SPATIAL" : + (table->key_info[key_number].algorithm == HA_KEY_ALG_RTREE) ? + "RTREE" : "BTREE"); +} + + +double ha_maria::scan_time() +{ + if (file->s->data_file_type == BLOCK_RECORD) + return ulonglong2double(stats.data_file_length - file->s->block_size) / max(file->s->block_size / 2, IO_SIZE) + 2; + return handler::scan_time(); +} + +/* + We need to be able to store at least two keys on an index page as the + splitting algorithms depends on this. (With only one key on a page + we also can't use any compression, which may make the index file much + larger) + We use HA_MAX_KEY_BUFF as this is a stack restriction imposed by the + handler interface. + + We also need to reserve place for a record pointer (8) and 3 bytes + per key segment to store the length of the segment + possible null bytes. + These extra bytes are required here so that maria_create() will surely + accept any keys created which the returned key data storage length. 
+*/ + +uint ha_maria::max_supported_key_length() const +{ + uint tmp= (maria_max_key_length() - 8 - HA_MAX_KEY_SEG*3); + return min(HA_MAX_KEY_BUFF, tmp); +} + + +#ifdef HAVE_REPLICATION +int ha_maria::net_read_dump(NET * net) +{ + int data_fd= file->dfile.file; + int error= 0; + + my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME)); + for (;;) + { + ulong packet_len= my_net_read(net); + if (!packet_len) + break; // end of file + if (packet_len == packet_error) + { + sql_print_error("ha_maria::net_read_dump - read error "); + error= -1; + goto err; + } + if (my_write(data_fd, (uchar *) net->read_pos, (uint) packet_len, + MYF(MY_WME | MY_FNABP))) + { + error= errno; + goto err; + } + } +err: + return error; +} + + +int ha_maria::dump(THD * thd, int fd) +{ + MARIA_SHARE *share= file->s; + NET *net= &thd->net; + uint block_size= share->block_size; + my_off_t bytes_to_read= share->state.state.data_file_length; + int data_fd= file->dfile.file; + uchar *buf= (uchar *) my_malloc(block_size, MYF(MY_WME)); + if (!buf) + return ENOMEM; + + int error= 0; + my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME)); + for (; bytes_to_read > 0;) + { + size_t bytes= my_read(data_fd, buf, block_size, MYF(MY_WME)); + if (bytes == MY_FILE_ERROR) + { + error= errno; + goto err; + } + + if (fd >= 0) + { + if (my_write(fd, buf, bytes, MYF(MY_WME | MY_FNABP))) + { + error= errno ? errno : EPIPE; + goto err; + } + } + else + { + if (my_net_write(net, buf, bytes)) + { + error= errno ? errno : EPIPE; + goto err; + } + } + bytes_to_read -= bytes; + } + + if (fd < 0) + { + if (my_net_write(net, (uchar*) "", 0)) + error= errno ? errno : EPIPE; + net_flush(net); + } + +err: + my_free((uchar*) buf, MYF(0)); + return error; +} +#endif /* HAVE_REPLICATION */ + + /* Name is here without an extension */ + +int ha_maria::open(const char *name, int mode, uint test_if_locked) +{ + uint i; + +#ifdef NOT_USED + /* + If the user wants to have memory mapped data files, add an + open_flag. 
Do not memory map temporary tables because they are + expected to be inserted and thus extended a lot. Memory mapping is + efficient for files that keep their size, but very inefficient for + growing files. Using an open_flag instead of calling ma_extra(... + HA_EXTRA_MMAP ...) after maxs_open() has the advantage that the + mapping is not repeated for every open, but just done on the initial + open, when the MyISAM share is created. Every time the server + requires to open a new instance of a table it calls this method. We + will always supply HA_OPEN_MMAP for a permanent table. However, the + Maria storage engine will ignore this flag if this is a secondary + open of a table that is in use by other threads already (if the + Maria share exists already). + */ + if (!(test_if_locked & HA_OPEN_TMP_TABLE) && opt_maria_use_mmap) + test_if_locked|= HA_OPEN_MMAP; +#endif + + if (unlikely(maria_recover_options != HA_RECOVER_NONE)) + { + /* user asked to trigger a repair if table was not properly closed */ + test_if_locked|= HA_OPEN_ABORT_IF_CRASHED; + } + + if (!(file= maria_open(name, mode, test_if_locked | HA_OPEN_FROM_SQL_LAYER))) + return (my_errno ? my_errno : -1); + + file->s->chst_invalidator= query_cache_invalidate_by_MyISAM_filename_ref; + + if (test_if_locked & (HA_OPEN_IGNORE_IF_LOCKED | HA_OPEN_TMP_TABLE)) + VOID(maria_extra(file, HA_EXTRA_NO_WAIT_LOCK, 0)); + + info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); + if (!(test_if_locked & HA_OPEN_WAIT_IF_LOCKED)) + VOID(maria_extra(file, HA_EXTRA_WAIT_LOCK, 0)); + if ((data_file_type= file->s->data_file_type) != STATIC_RECORD) + int_table_flags |= HA_REC_NOT_IN_SEQ; + if (!file->s->base.born_transactional) + { + /* + INSERT DELAYED cannot work with transactional tables (because it cannot + stand up to "when client gets ok the data is safe on disk": the record + may not even be inserted). 
In the future, we could enable it back (as a + client doing INSERT DELAYED knows the specificities; but we then should + make sure to regularly commit in the delayed_insert thread). + */ + int_table_flags|= HA_CAN_INSERT_DELAYED; + } + if (file->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)) + int_table_flags |= HA_HAS_NEW_CHECKSUM; + + for (i= 0; i < table->s->keys; i++) + { + plugin_ref parser= table->key_info[i].parser; + if (table->key_info[i].flags & HA_USES_PARSER) + file->s->keyinfo[i].parser= + (struct st_mysql_ftparser *)plugin_decl(parser)->info; + table->key_info[i].block_size= file->s->keyinfo[i].block_length; + } + my_errno= 0; + return my_errno; +} + + +int ha_maria::close(void) +{ + MARIA_HA *tmp= file; + if (!tmp) + return 0; + file= 0; + return maria_close(tmp); +} + + +int ha_maria::write_row(uchar * buf) +{ + ha_statistic_increment(&SSV::ha_write_count); + + /* If we have a timestamp column, update it to the current time */ + if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT) + table->timestamp_field->set_time(); + + /* + If we have an auto_increment column and we are writing a changed row + or a new row, then update the auto_increment value in the record. 
+ */ + if (table->next_number_field && buf == table->record[0]) + { + int error; + if ((error= update_auto_increment())) + return error; + } + return maria_write(file, buf); +} + + +int ha_maria::check(THD * thd, HA_CHECK_OPT * check_opt) +{ + int error; + HA_CHECK ¶m= *(HA_CHECK*) thd->alloc(sizeof(param)); + MARIA_SHARE *share= file->s; + const char *old_proc_info= thd_proc_info(thd, "Checking table"); + TRN *old_trn= file->trn; + + if (!file || !¶m) return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "check"; + param.db_name= table->s->db.str; + param.table_name= table->alias; + param.testflag= check_opt->flags | T_CHECK | T_SILENT; + param.stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method); + + if (!(table->db_stat & HA_READ_ONLY)) + param.testflag |= T_STATISTICS; + param.using_global_keycache= 1; + + if (!maria_is_crashed(file) && + (((param.testflag & T_CHECK_ONLY_CHANGED) && + !(share->state.changed & (STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR | + STATE_IN_REPAIR)) && + share->state.open_count == 0) || + ((param.testflag & T_FAST) && (share->state.open_count == + (uint) (share->global_changed ? 
1 : + 0))))) + return HA_ADMIN_ALREADY_DONE; + + maria_chk_init_for_check(¶m, file); + (void) maria_chk_status(¶m, file); // Not fatal + error= maria_chk_size(¶m, file); + if (!error) + error|= maria_chk_del(¶m, file, param.testflag); + if (!error) + error= maria_chk_key(¶m, file); + if (!error) + { + if ((!(param.testflag & T_QUICK) && + ((share->options & + (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) || + (param.testflag & (T_EXTEND | T_MEDIUM)))) || maria_is_crashed(file)) + { + ulonglong old_testflag= param.testflag; + param.testflag |= T_MEDIUM; + if (!(error= init_io_cache(¶m.read_cache, file->dfile.file, + my_default_record_cache_size, READ_CACHE, + share->pack.header_length, 1, MYF(MY_WME)))) + { + error= maria_chk_data_link(¶m, file, + test(param.testflag & T_EXTEND)); + end_io_cache(&(param.read_cache)); + } + param.testflag= old_testflag; + } + } + if (!error) + { + if ((share->state.changed & (STATE_CHANGED | + STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR | + STATE_CRASHED | STATE_NOT_ANALYZED)) || + (param.testflag & T_STATISTICS) || maria_is_crashed(file)) + { + file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + pthread_mutex_lock(&share->intern_lock); + DBUG_PRINT("info", ("Reseting crashed state")); + share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR); + if (!(table->db_stat & HA_READ_ONLY)) + error= maria_update_state_info(¶m, file, + UPDATE_TIME | UPDATE_OPEN_COUNT | + UPDATE_STAT); + pthread_mutex_unlock(&share->intern_lock); + info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE | + HA_STATUS_CONST); + } + } + else if (!maria_is_crashed(file) && !thd->killed) + { + maria_mark_crashed(file); + file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + } + + /* Reset trn, that may have been set by repair */ + _ma_set_trn_for_table(file, old_trn); + thd_proc_info(thd, old_proc_info); + return error ? 
HA_ADMIN_CORRUPT : HA_ADMIN_OK; +} + + +/* + Analyze the key distribution in the table + As the table may be only locked for read, we have to take into account that + two threads may do an analyze at the same time! +*/ + +int ha_maria::analyze(THD *thd, HA_CHECK_OPT * check_opt) +{ + int error= 0; + HA_CHECK ¶m= *(HA_CHECK*) thd->alloc(sizeof(param)); + MARIA_SHARE *share= file->s; + + if (!¶m) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "analyze"; + param.db_name= table->s->db.str; + param.table_name= table->alias; + param.testflag= (T_FAST | T_CHECK | T_SILENT | T_STATISTICS | + T_DONT_CHECK_CHECKSUM); + param.using_global_keycache= 1; + param.stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method); + + if (!(share->state.changed & STATE_NOT_ANALYZED)) + return HA_ADMIN_ALREADY_DONE; + + error= maria_chk_key(¶m, file); + if (!error) + { + pthread_mutex_lock(&share->intern_lock); + error= maria_update_state_info(¶m, file, UPDATE_STAT); + pthread_mutex_unlock(&share->intern_lock); + } + else if (!maria_is_crashed(file) && !thd->killed) + maria_mark_crashed(file); + return error ? 
HA_ADMIN_CORRUPT : HA_ADMIN_OK; +} + + +int ha_maria::restore(THD * thd, HA_CHECK_OPT *check_opt) +{ + HA_CHECK_OPT tmp_check_opt; + char *backup_dir= thd->lex->backup_dir; + char src_path[FN_REFLEN], dst_path[FN_REFLEN]; + char table_name[FN_REFLEN]; + int error; + const char *errmsg; + DBUG_ENTER("restore"); + + VOID(tablename_to_filename(table->s->table_name.str, table_name, + sizeof(table_name))); + + if (fn_format_relative_to_data_home(src_path, table_name, backup_dir, + MARIA_NAME_DEXT)) + DBUG_RETURN(HA_ADMIN_INVALID); + + strxmov(dst_path, table->s->normalized_path.str, MARIA_NAME_DEXT, NullS); + if (my_copy(src_path, dst_path, MYF(MY_WME))) + { + error= HA_ADMIN_FAILED; + errmsg= "Failed in my_copy (Error %d)"; + goto err; + } + + tmp_check_opt.init(); + tmp_check_opt.flags |= T_VERY_SILENT | T_CALC_CHECKSUM | T_QUICK; + DBUG_RETURN(repair(thd, &tmp_check_opt)); + +err: + { + /* + Don't allocate param on stack here as this may be huge and it's + also allocated by repair() + */ + HA_CHECK *param; + if (!(param= (HA_CHECK*) my_malloc(sizeof(*param), MYF(MY_WME | MY_FAE)))) + DBUG_RETURN(error); + maria_chk_init(param); + param->thd= thd; + param->op_name= "restore"; + param->db_name= table->s->db.str; + param->table_name= table->s->table_name.str; + param->testflag= 0; + _ma_check_print_error(param, errmsg, my_errno); + my_free(param, MYF(0)); + DBUG_RETURN(error); + } +} + + +int ha_maria::backup(THD * thd, HA_CHECK_OPT *check_opt) +{ + char *backup_dir= thd->lex->backup_dir; + char src_path[FN_REFLEN], dst_path[FN_REFLEN]; + char table_name[FN_REFLEN]; + int error; + const char *errmsg; + DBUG_ENTER("ha_maria::backup"); + + VOID(tablename_to_filename(table->s->table_name.str, table_name, + sizeof(table_name))); + + if (fn_format_relative_to_data_home(dst_path, table_name, backup_dir, + reg_ext)) + { + errmsg= "Failed in fn_format() for .frm file (errno: %d)"; + error= HA_ADMIN_INVALID; + goto err; + } + + strxmov(src_path, table->s->normalized_path.str, 
reg_ext, NullS); + if (my_copy(src_path, dst_path, + MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_DONT_OVERWRITE_FILE))) + { + error= HA_ADMIN_FAILED; + errmsg= "Failed copying .frm file (errno: %d)"; + goto err; + } + + /* Change extension */ + if (fn_format_relative_to_data_home(dst_path, table_name, backup_dir, + MARIA_NAME_DEXT)) + { + errmsg= "Failed in fn_format() for .MYD file (errno: %d)"; + error= HA_ADMIN_INVALID; + goto err; + } + + strxmov(src_path, table->s->normalized_path.str, MARIA_NAME_DEXT, NullS); + if (_ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_FORCE_WRITE, + FLUSH_KEEP)) + { + error= HA_ADMIN_FAILED; + errmsg= "Failed in flush (Error %d)"; + goto err; + } + if (my_copy(src_path, dst_path, + MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_DONT_OVERWRITE_FILE))) + { + errmsg= "Failed copying .MYD file (errno: %d)"; + error= HA_ADMIN_FAILED; + goto err; + } + DBUG_RETURN(HA_ADMIN_OK); + +err: + { + HA_CHECK ¶m= *(HA_CHECK*) thd->alloc(sizeof(param)); + if (!¶m) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "backup"; + param.db_name= table->s->db.str; + param.table_name= table->s->table_name.str; + param.testflag= 0; + _ma_check_print_error(¶m, errmsg, my_errno); + DBUG_RETURN(error); + } +} + + +int ha_maria::repair(THD * thd, HA_CHECK_OPT *check_opt) +{ + int error; + HA_CHECK ¶m= *(HA_CHECK*) thd->alloc(sizeof(param)); + ha_rows start_records; + + if (!file || !¶m) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "repair"; + param.testflag= ((check_opt->flags & ~(T_EXTEND)) | + T_SILENT | T_FORCE_CREATE | T_CALC_CHECKSUM | + (check_opt->flags & T_EXTEND ? 
T_REP : T_REP_BY_SORT)); + param.sort_buffer_length= THDVAR(thd, sort_buffer_size); + start_records= file->state->records; + while ((error= repair(thd, ¶m, 0)) && param.retry_repair) + { + param.retry_repair= 0; + if (test_all_bits(param.testflag, + (uint) (T_RETRY_WITHOUT_QUICK | T_QUICK))) + { + param.testflag&= ~(T_RETRY_WITHOUT_QUICK | T_QUICK); + /* Ensure we don't loose any rows when retrying without quick */ + param.testflag|= T_SAFE_REPAIR; + if (thd->vio_ok()) + _ma_check_print_info(¶m, "Retrying repair without quick"); + else + sql_print_information("Retrying repair of: '%s' without quick", + table->s->path.str); + continue; + } + param.testflag &= ~T_QUICK; + if ((param.testflag & T_REP_BY_SORT)) + { + param.testflag= (param.testflag & ~T_REP_BY_SORT) | T_REP; + sql_print_information("Retrying repair of: '%s' with keycache", + table->s->path.str); + continue; + } + break; + } + if (!error && start_records != file->state->records && + !(check_opt->flags & T_VERY_SILENT)) + { + char llbuff[22], llbuff2[22]; + sql_print_information("Found %s of %s rows when repairing '%s'", + llstr(file->state->records, llbuff), + llstr(start_records, llbuff2), + table->s->path.str); + } + return error; +} + +int ha_maria::zerofill(THD * thd, HA_CHECK_OPT *check_opt) +{ + int error; + HA_CHECK ¶m= *(HA_CHECK*) thd->alloc(sizeof(param)); + TRN *old_trn; + MARIA_SHARE *share= file->s; + + if (!file || !¶m) + return HA_ADMIN_INTERNAL_ERROR; + + old_trn= file->trn; + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "zerofill"; + param.testflag= check_opt->flags | T_SILENT | T_ZEROFILL; + param.sort_buffer_length= THDVAR(thd, sort_buffer_size); + error=maria_zerofill(¶m, file, share->open_file_name.str); + + /* Reset trn, that may have been set by repair */ + _ma_set_trn_for_table(file, old_trn); + + if (!error) + { + pthread_mutex_lock(&share->intern_lock); + maria_update_state_info(¶m, file, UPDATE_TIME | UPDATE_OPEN_COUNT); + pthread_mutex_unlock(&share->intern_lock); + 
} + return error; +} + +int ha_maria::optimize(THD * thd, HA_CHECK_OPT *check_opt) +{ + int error; + HA_CHECK ¶m= *(HA_CHECK*) thd->alloc(sizeof(param)); + + if (!file || !¶m) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "optimize"; + param.testflag= (check_opt->flags | T_SILENT | T_FORCE_CREATE | + T_REP_BY_SORT | T_STATISTICS | T_SORT_INDEX); + param.sort_buffer_length= THDVAR(thd, sort_buffer_size); + if ((error= repair(thd, ¶m, 1)) && param.retry_repair) + { + sql_print_warning("Warning: Optimize table got errno %d on %s.%s, retrying", + my_errno, param.db_name, param.table_name); + param.testflag &= ~T_REP_BY_SORT; + error= repair(thd, ¶m, 1); + } + + return error; +} + + +int ha_maria::repair(THD *thd, HA_CHECK *param, bool do_optimize) +{ + int error= 0; + ulonglong local_testflag= param->testflag; + bool optimize_done= !do_optimize, statistics_done= 0; + const char *old_proc_info= thd->proc_info; + char fixed_name[FN_REFLEN]; + MARIA_SHARE *share= file->s; + ha_rows rows= file->state->records; + TRN *old_trn= file->trn; + DBUG_ENTER("ha_maria::repair"); + + /* + Normally this method is entered with a properly opened table. If the + repair fails, it can be repeated with more elaborate options. Under + special circumstances it can happen that a repair fails so that it + closed the data file and cannot re-open it. In this case file->dfile + is set to -1. We must not try another repair without an open data + file. (Bug #25289) + */ + if (file->dfile.file == -1) + { + sql_print_information("Retrying repair of: '%s' failed. " + "Please try REPAIR EXTENDED or aria_chk", + table->s->path.str); + DBUG_RETURN(HA_ADMIN_FAILED); + } + + /* + If transactions was not enabled for a transactional table then + file->s->status is not up to date. 
This is needed for repair_by_sort + to work + */ + if (share->base.born_transactional && !share->now_transactional) + _ma_copy_nontrans_state_information(file); + + param->db_name= table->s->db.str; + param->table_name= table->alias; + param->tmpfile_createflag= O_RDWR | O_TRUNC; + param->using_global_keycache= 1; + param->thd= thd; + param->tmpdir= &mysql_tmpdir_list; + param->out_flag= 0; + strmov(fixed_name, share->open_file_name.str); + + // Don't lock tables if we have used LOCK TABLE + if (!thd->locked_tables && + maria_lock_database(file, table->s->tmp_table ? F_EXTRA_LCK : F_WRLCK)) + { + _ma_check_print_error(param, ER(ER_CANT_LOCK), my_errno); + DBUG_RETURN(HA_ADMIN_FAILED); + } + + if (!do_optimize || + (((share->data_file_type == BLOCK_RECORD) ? + (share->state.changed & STATE_NOT_OPTIMIZED_ROWS) : + (file->state->del || + share->state.split != file->state->records)) && + (!(param->testflag & T_QUICK) || + (share->state.changed & (STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_OPTIMIZED_ROWS))))) + { + ulonglong key_map= ((local_testflag & T_CREATE_MISSING_KEYS) ? 
+ maria_get_mask_all_keys_active(share->base.keys) : + share->state.key_map); + ulonglong save_testflag= param->testflag; + if (maria_test_if_sort_rep(file, file->state->records, key_map, 0) && + (local_testflag & T_REP_BY_SORT)) + { + local_testflag |= T_STATISTICS; + param->testflag |= T_STATISTICS; // We get this for free + statistics_done= 1; + /* TODO: Remove BLOCK_RECORD test when parallel works with blocks */ + if (THDVAR(thd,repair_threads) > 1 && + share->data_file_type != BLOCK_RECORD) + { + char buf[40]; + /* TODO: respect maria_repair_threads variable */ + my_snprintf(buf, 40, "Repair with %d threads", my_count_bits(key_map)); + thd_proc_info(thd, buf); + param->testflag|= T_REP_PARALLEL; + error= maria_repair_parallel(param, file, fixed_name, + test(param->testflag & T_QUICK)); + /* to reset proc_info, as it was pointing to local buffer */ + thd_proc_info(thd, "Repair done"); + } + else + { + thd_proc_info(thd, "Repair by sorting"); + param->testflag|= T_REP_BY_SORT; + error= maria_repair_by_sort(param, file, fixed_name, + test(param->testflag & T_QUICK)); + } + } + else + { + thd_proc_info(thd, "Repair with keycache"); + param->testflag &= ~(T_REP_BY_SORT | T_REP_PARALLEL); + error= maria_repair(param, file, fixed_name, + test(param->testflag & T_QUICK)); + } + param->testflag= save_testflag | (param->testflag & T_RETRY_WITHOUT_QUICK); + optimize_done= 1; + } + if (!error) + { + if ((local_testflag & T_SORT_INDEX) && + (share->state.changed & STATE_NOT_SORTED_PAGES)) + { + optimize_done= 1; + thd_proc_info(thd, "Sorting index"); + error= maria_sort_index(param, file, fixed_name); + } + if (!statistics_done && (local_testflag & T_STATISTICS)) + { + if (share->state.changed & STATE_NOT_ANALYZED) + { + optimize_done= 1; + thd_proc_info(thd, "Analyzing"); + error= maria_chk_key(param, file); + } + else + local_testflag &= ~T_STATISTICS; // Don't update statistics + } + } + thd_proc_info(thd, "Saving state"); + pthread_mutex_lock(&share->intern_lock); + if 
(!error) + { + if ((share->state.changed & STATE_CHANGED) || maria_is_crashed(file)) + { + DBUG_PRINT("info", ("Reseting crashed state")); + share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR); + file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + } + /* + repair updates share->state.state. Ensure that file->state is up to date + */ + if (file->state != &share->state.state) + *file->state= share->state.state; + if (share->base.auto_key) + _ma_update_auto_increment_key(param, file, 1); + if (optimize_done) + error= maria_update_state_info(param, file, + UPDATE_TIME | UPDATE_OPEN_COUNT | + (local_testflag & + T_STATISTICS ? UPDATE_STAT : 0)); + info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE | + HA_STATUS_CONST, 0); + if (rows != file->state->records && !(param->testflag & T_VERY_SILENT)) + { + char llbuff[22], llbuff2[22]; + _ma_check_print_warning(param, "Number of rows changed from %s to %s", + llstr(rows, llbuff), + llstr(file->state->records, llbuff2)); + /* Abort if warning was converted to error */ + if (current_thd->is_error()) + error= 1; + } + } + else + { + maria_mark_crashed_on_repair(file); + file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + maria_update_state_info(param, file, 0); + } + pthread_mutex_unlock(&share->intern_lock); + thd_proc_info(thd, old_proc_info); + if (!thd->locked_tables) + maria_lock_database(file, F_UNLCK); + + /* Reset trn, that may have been set by repair */ + _ma_set_trn_for_table(file, old_trn); + error= error ? HA_ADMIN_FAILED : + (optimize_done ? + (write_log_record_for_repair(param, file) ? HA_ADMIN_FAILED : + HA_ADMIN_OK) : HA_ADMIN_ALREADY_DONE); + DBUG_RETURN(error); +} + + +/* + Assign table indexes to a specific key cache. 
+*/ + +int ha_maria::assign_to_keycache(THD * thd, HA_CHECK_OPT *check_opt) +{ +#if 0 && NOT_IMPLEMENTED + PAGECACHE *new_pagecache= check_opt->pagecache; + const char *errmsg= 0; + int error= HA_ADMIN_OK; + ulonglong map; + TABLE_LIST *table_list= table->pos_in_table_list; + DBUG_ENTER("ha_maria::assign_to_keycache"); + + + table->keys_in_use_for_query.clear_all(); + + if (table_list->process_index_hints(table)) + DBUG_RETURN(HA_ADMIN_FAILED); + map= ~(ulonglong) 0; + if (!table->keys_in_use_for_query.is_clear_all()) + /* use all keys if there's no list specified by the user through hints */ + map= table->keys_in_use_for_query.to_ulonglong(); + + if ((error= maria_assign_to_pagecache(file, map, new_pagecache))) + { + char buf[STRING_BUFFER_USUAL_SIZE]; + my_snprintf(buf, sizeof(buf), + "Failed to flush to index file (errno: %d)", error); + errmsg= buf; + error= HA_ADMIN_CORRUPT; + } + + if (error != HA_ADMIN_OK) + { + /* Send error to user */ + HA_CHECK ¶m= *(HA_CHECK*) thd->alloc(sizeof(param)); + if (!¶m) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "assign_to_keycache"; + param.db_name= table->s->db.str; + param.table_name= table->s->table_name.str; + param.testflag= 0; + _ma_check_print_error(¶m, errmsg); + } + DBUG_RETURN(error); +#else + return HA_ADMIN_NOT_IMPLEMENTED; +#endif +} + + +/* + Preload pages of the index file for a table into the key cache. 
+*/ + +int ha_maria::preload_keys(THD * thd, HA_CHECK_OPT *check_opt) +{ + ulonglong map; + TABLE_LIST *table_list= table->pos_in_table_list; + + DBUG_ENTER("ha_maria::preload_keys"); + + table->keys_in_use_for_query.clear_all(); + + if (table_list->process_index_hints(table)) + DBUG_RETURN(HA_ADMIN_FAILED); + + map= ~(ulonglong) 0; + /* Check validity of the index references */ + if (!table->keys_in_use_for_query.is_clear_all()) + /* use all keys if there's no list specified by the user through hints */ + map= table->keys_in_use_for_query.to_ulonglong(); + + maria_extra(file, HA_EXTRA_PRELOAD_BUFFER_SIZE, + (void*) &thd->variables.preload_buff_size); + + int error; + + if ((error= maria_preload(file, map, table_list->ignore_leaves))) + { + char buf[MYSQL_ERRMSG_SIZE+20]; + const char *errmsg; + + switch (error) { + case HA_ERR_NON_UNIQUE_BLOCK_SIZE: + errmsg= "Indexes use different block sizes"; + break; + case HA_ERR_OUT_OF_MEM: + errmsg= "Failed to allocate buffer"; + break; + default: + my_snprintf(buf, sizeof(buf), + "Failed to read from index file (errno: %d)", my_errno); + errmsg= buf; + } + + HA_CHECK ¶m= *(HA_CHECK*) thd->alloc(sizeof(param)); + if (!¶m) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "preload_keys"; + param.db_name= table->s->db.str; + param.table_name= table->s->table_name.str; + param.testflag= 0; + _ma_check_print_error(¶m, "%s", errmsg); + DBUG_RETURN(HA_ADMIN_FAILED); + } + DBUG_RETURN(HA_ADMIN_OK); +} + + +/* + Disable indexes, making it persistent if requested. + + SYNOPSIS + disable_indexes() + mode mode of operation: + HA_KEY_SWITCH_NONUNIQ disable all non-unique keys + HA_KEY_SWITCH_ALL disable all keys + HA_KEY_SWITCH_NONUNIQ_SAVE dis. non-uni. and make persistent + HA_KEY_SWITCH_ALL_SAVE dis. all keys and make persistent + + IMPLEMENTATION + HA_KEY_SWITCH_NONUNIQ is not implemented. + HA_KEY_SWITCH_ALL_SAVE is not implemented. 
+ + RETURN + 0 ok + HA_ERR_WRONG_COMMAND mode not implemented. +*/ + +int ha_maria::disable_indexes(uint mode) +{ + int error; + + if (mode == HA_KEY_SWITCH_ALL) + { + /* call a storage engine function to switch the key map */ + error= maria_disable_indexes(file); + } + else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE) + { + maria_extra(file, HA_EXTRA_NO_KEYS, 0); + info(HA_STATUS_CONST); // Read new key info + error= 0; + } + else + { + /* mode not implemented */ + error= HA_ERR_WRONG_COMMAND; + } + return error; +} + + +/* + Enable indexes, making it persistent if requested. + + SYNOPSIS + enable_indexes() + mode mode of operation: + HA_KEY_SWITCH_NONUNIQ enable all non-unique keys + HA_KEY_SWITCH_ALL enable all keys + HA_KEY_SWITCH_NONUNIQ_SAVE en. non-uni. and make persistent + HA_KEY_SWITCH_ALL_SAVE en. all keys and make persistent + + DESCRIPTION + Enable indexes, which might have been disabled by disable_index() before. + The modes without _SAVE work only if both data and indexes are empty, + since the MARIA repair would enable them persistently. + To be sure in these cases, call handler::delete_all_rows() before. + + IMPLEMENTATION + HA_KEY_SWITCH_NONUNIQ is not implemented. + HA_KEY_SWITCH_ALL_SAVE is not implemented. + + RETURN + 0 ok + !=0 Error, among others: + HA_ERR_CRASHED data or index is non-empty. Delete all rows and retry. + HA_ERR_WRONG_COMMAND mode not implemented. +*/ + +int ha_maria::enable_indexes(uint mode) +{ + int error; + DBUG_PRINT("info", ("ha_maria::enable_indexes mode: %d", mode)); + if (maria_is_all_keys_active(file->s->state.key_map, file->s->base.keys)) + { + /* All indexes are enabled already. */ + return 0; + } + + if (mode == HA_KEY_SWITCH_ALL) + { + error= maria_enable_indexes(file); + /* + Do not try to repair on error, + as this could make the enabled state persistent, + but mode==HA_KEY_SWITCH_ALL forbids it. 
+ */ + } + else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE) + { + THD *thd= current_thd; + HA_CHECK ¶m= *(HA_CHECK*) thd->alloc(sizeof(param)); + if (!¶m) + return HA_ADMIN_INTERNAL_ERROR; + + const char *save_proc_info= thd_proc_info(thd, "Creating index"); + + maria_chk_init(¶m); + param.op_name= "recreating_index"; + param.testflag= (T_SILENT | T_REP_BY_SORT | T_QUICK | + T_CREATE_MISSING_KEYS | T_SAFE_REPAIR); + if (bulk_insert_single_undo == BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR) + { + bulk_insert_single_undo= BULK_INSERT_SINGLE_UNDO_AND_REPAIR; + /* + Don't bump create_rename_lsn, because UNDO_BULK_INSERT + should not be skipped in case of crash during repair. + */ + param.testflag|= T_NO_CREATE_RENAME_LSN; + } + param.myf_rw &= ~MY_WAIT_IF_FULL; + param.sort_buffer_length= THDVAR(thd,sort_buffer_size); + param.stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method); + param.tmpdir= &mysql_tmpdir_list; + if ((error= (repair(thd, ¶m, 0) != HA_ADMIN_OK)) && param.retry_repair) + { + sql_print_warning("Warning: Enabling keys got errno %d on %s.%s, " + "retrying", + my_errno, param.db_name, param.table_name); + /* This should never fail normally */ + DBUG_ASSERT(thd->killed != 0); + /* Repairing by sort failed. Now try standard repair method. */ + param.testflag &= ~T_REP_BY_SORT; + error= (repair(thd, ¶m, 0) != HA_ADMIN_OK); + /* + If the standard repair succeeded, clear all error messages which + might have been set by the first repair. They can still be seen + with SHOW WARNINGS then. 
+ */ + if (!error) + thd->clear_error(); + } + info(HA_STATUS_CONST); + thd_proc_info(thd, save_proc_info); + } + else + { + /* mode not implemented */ + error= HA_ERR_WRONG_COMMAND; + } + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + DBUG_EXECUTE_IF("maria_crash_enable_index", + { + DBUG_PRINT("maria_crash_enable_index", ("now")); + DBUG_ABORT(); + }); + return error; +} + + +/* + Test if indexes are disabled. + + + SYNOPSIS + indexes_are_disabled() + no parameters + + + RETURN + 0 indexes are not disabled + 1 all indexes are disabled + [2 non-unique indexes are disabled - NOT YET IMPLEMENTED] +*/ + +int ha_maria::indexes_are_disabled(void) +{ + return maria_indexes_are_disabled(file); +} + + +/* + prepare for a many-rows insert operation + e.g. - disable indexes (if they can be recreated fast) or + activate special bulk-insert optimizations + + SYNOPSIS + start_bulk_insert(rows) + rows Rows to be inserted + 0 if we don't know + + NOTICE + Do not forget to call end_bulk_insert() later! +*/ + +void ha_maria::start_bulk_insert(ha_rows rows) +{ + DBUG_ENTER("ha_maria::start_bulk_insert"); + THD *thd= current_thd; + ulong size= min(thd->variables.read_buff_size, + (ulong) (table->s->avg_row_length * rows)); + MARIA_SHARE *share= file->s; + DBUG_PRINT("info", ("start_bulk_insert: rows %lu size %lu", + (ulong) rows, size)); + + /* don't enable row cache if too few rows */ + if (!rows || (rows > MARIA_MIN_ROWS_TO_USE_WRITE_CACHE)) + maria_extra(file, HA_EXTRA_WRITE_CACHE, (void*) &size); + + can_enable_indexes= (maria_is_all_keys_active(share->state.key_map, + share->base.keys)); + bulk_insert_single_undo= BULK_INSERT_NONE; + + if (!(specialflag & SPECIAL_SAFE_MODE)) + { + /* + Only disable old index if the table was empty and we are inserting + a lot of rows. 
+ We should not do this for only a few rows as this is slower and + we don't want to update the key statistics based of only a few rows. + Index file rebuild requires an exclusive lock, so if versioning is on + don't do it (see how ha_maria::store_lock() tries to predict repair). + We can repair index only if we have an exclusive (TL_WRITE) lock. To + see if table is empty, we shouldn't rely on the old records' count from + our transaction's start (if that old count is 0 but now there are + records in the table, we would wrongly destroy them). + So we need to look at share->state.state.records. + As a safety net for now, we don't remove the test of + file->state->records, because there is uncertainty on what will happen + during repair if the two states disagree. + */ + if ((file->state->records == 0) && + (share->state.state.records == 0) && can_enable_indexes && + (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES) && + (file->lock.type == TL_WRITE)) + { + /** + @todo for a single-row INSERT SELECT, we will go into repair, which + is more costly (flushes, syncs) than a row write. + */ + maria_disable_non_unique_index(file, rows); + if (share->now_transactional) + { + bulk_insert_single_undo= BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR; + write_log_record_for_bulk_insert(file); + _ma_tmp_disable_logging_for_table(file, TRUE); + /* + Pages currently in the page cache have type PAGECACHE_LSN_PAGE, we + are not allowed to overwrite them with PAGECACHE_PLAIN_PAGE, so + throw them away. It is not losing data, because we just wrote and + forced an UNDO which will for sure empty the table if we crash. The + upcoming unique-key insertions however need a proper index, so we + cannot leave the corrupted on-disk index file, thus we truncate it. 
+ */ + maria_delete_all_rows(file); + } + } + else if (!file->bulk_insert && + (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT)) + { + maria_init_bulk_insert(file, thd->variables.bulk_insert_buff_size, rows); + } + } + DBUG_VOID_RETURN; +} + + +/* + end special bulk-insert optimizations, + which have been activated by start_bulk_insert(). + + SYNOPSIS + end_bulk_insert() + no arguments + + RETURN + 0 OK + != 0 Error +*/ + +int ha_maria::end_bulk_insert() +{ + int err; + DBUG_ENTER("ha_maria::end_bulk_insert"); + maria_end_bulk_insert(file); + if ((err= maria_extra(file, HA_EXTRA_NO_CACHE, 0))) + goto end; + if (can_enable_indexes && !file->s->deleting) + err= enable_indexes(HA_KEY_SWITCH_NONUNIQ_SAVE); +end: + if (bulk_insert_single_undo != BULK_INSERT_NONE) + { + DBUG_ASSERT(can_enable_indexes); + /* + Table was transactional just before start_bulk_insert(). + No need to flush pages if we did a repair (which already flushed). + */ + err|= + _ma_reenable_logging_for_table(file, + bulk_insert_single_undo == + BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR); + } + DBUG_RETURN(err); +} + + +bool ha_maria::check_and_repair(THD *thd) +{ + int error, crashed; + LEX_STRING old_query; + HA_CHECK_OPT check_opt; + DBUG_ENTER("ha_maria::check_and_repair"); + + check_opt.init(); + + error= 1; + if ((file->s->state.changed & + (STATE_CRASHED | STATE_CRASHED_ON_REPAIR | STATE_MOVED)) == + STATE_MOVED) + { + sql_print_information("Zerofilling moved table: '%s'", + table->s->path.str); + if (!(error= zerofill(thd, &check_opt))) + DBUG_RETURN(0); + } + + /* + if we got this far - the table is crashed. 
+ but don't auto-repair if maria_recover_options is not set + */ + if (!maria_recover_options) + DBUG_RETURN(error); + + error= 0; + check_opt.flags= T_MEDIUM | T_AUTO_REPAIR; + // Don't use quick if deleted rows + if (!file->state->del && (maria_recover_options & HA_RECOVER_QUICK)) + check_opt.flags |= T_QUICK; + + old_query= thd->query_string; + pthread_mutex_lock(&LOCK_thread_count); + thd->query_string= table->s->table_name; + pthread_mutex_unlock(&LOCK_thread_count); + + if (!(crashed= maria_is_crashed(file))) + { + sql_print_warning("Checking table: '%s'", table->s->path.str); + crashed= check(thd, &check_opt); + } + + if (crashed) + { + sql_print_warning("Recovering table: '%s'", table->s->path.str); + check_opt.flags= + ((maria_recover_options & HA_RECOVER_BACKUP ? T_BACKUP_DATA : 0) | + (maria_recover_options & HA_RECOVER_FORCE ? 0 : T_SAFE_REPAIR) | + T_AUTO_REPAIR); + if (repair(thd, &check_opt)) + error= 1; + } + pthread_mutex_lock(&LOCK_thread_count); + thd->query_string= old_query; + pthread_mutex_unlock(&LOCK_thread_count); + DBUG_RETURN(error); +} + + +bool ha_maria::is_crashed() const +{ + return (file->s->state.changed & (STATE_CRASHED | STATE_MOVED) || + (my_disable_locking && file->s->state.open_count)); +} + +#define CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING(msg) \ + do { \ + if (file->lock.type == TL_WRITE_CONCURRENT_INSERT) \ + { \ + my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), msg); \ + return 1; \ + } \ + } while(0) + +int ha_maria::update_row(const uchar * old_data, uchar * new_data) +{ + CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING("UPDATE in WRITE CONCURRENT"); + ha_statistic_increment(&SSV::ha_update_count); + if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) + table->timestamp_field->set_time(); + return maria_update(file, old_data, new_data); +} + + +int ha_maria::delete_row(const uchar * buf) +{ + CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING("DELETE in WRITE CONCURRENT"); + ha_statistic_increment(&SSV::ha_delete_count); + 
return maria_delete(file, buf); +} + +C_MODE_START + +ICP_RESULT index_cond_func_maria(void *arg) +{ + ha_maria *h= (ha_maria*)arg; + if (h->end_range) + { + if (h->compare_key2(h->end_range) > 0) + return ICP_OUT_OF_RANGE; /* caller should return HA_ERR_END_OF_FILE already */ + } + return h->pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH; +} + +C_MODE_END + +int ha_maria::index_read_map(uchar * buf, const uchar * key, + key_part_map keypart_map, + enum ha_rkey_function find_flag) +{ + DBUG_ASSERT(inited == INDEX); + ha_statistic_increment(&SSV::ha_read_key_count); + int error= maria_rkey(file, buf, active_index, key, keypart_map, find_flag); + table->status= error ? STATUS_NOT_FOUND : 0; + return error; +} + + +int ha_maria::index_read_idx_map(uchar * buf, uint index, const uchar * key, + key_part_map keypart_map, + enum ha_rkey_function find_flag) +{ + ha_statistic_increment(&SSV::ha_read_key_count); + int error= maria_rkey(file, buf, index, key, keypart_map, find_flag); + table->status= error ? STATUS_NOT_FOUND : 0; + return error; +} + + +int ha_maria::index_read_last_map(uchar * buf, const uchar * key, + key_part_map keypart_map) +{ + DBUG_ENTER("ha_maria::index_read_last_map"); + DBUG_ASSERT(inited == INDEX); + ha_statistic_increment(&SSV::ha_read_key_count); + int error= maria_rkey(file, buf, active_index, key, keypart_map, + HA_READ_PREFIX_LAST); + table->status= error ? STATUS_NOT_FOUND : 0; + DBUG_RETURN(error); +} + + +int ha_maria::index_next(uchar * buf) +{ + DBUG_ASSERT(inited == INDEX); + ha_statistic_increment(&SSV::ha_read_next_count); + int error= maria_rnext(file, buf, active_index); + table->status= error ? STATUS_NOT_FOUND : 0; + return error; +} + + +int ha_maria::index_prev(uchar * buf) +{ + DBUG_ASSERT(inited == INDEX); + ha_statistic_increment(&SSV::ha_read_prev_count); + int error= maria_rprev(file, buf, active_index); + table->status= error ? 
STATUS_NOT_FOUND : 0; + return error; +} + + +int ha_maria::index_first(uchar * buf) +{ + DBUG_ASSERT(inited == INDEX); + ha_statistic_increment(&SSV::ha_read_first_count); + int error= maria_rfirst(file, buf, active_index); + table->status= error ? STATUS_NOT_FOUND : 0; + return error; +} + + +int ha_maria::index_last(uchar * buf) +{ + DBUG_ASSERT(inited == INDEX); + ha_statistic_increment(&SSV::ha_read_last_count); + int error= maria_rlast(file, buf, active_index); + table->status= error ? STATUS_NOT_FOUND : 0; + return error; +} + + +int ha_maria::index_next_same(uchar * buf, + const uchar *key __attribute__ ((unused)), + uint length __attribute__ ((unused))) +{ + int error; + DBUG_ASSERT(inited == INDEX); + ha_statistic_increment(&SSV::ha_read_next_count); + /* + TODO: Delete this loop in Maria 1.5 as versioning will ensure this never + happens + */ + do + { + error= maria_rnext_same(file,buf); + } while (error == HA_ERR_RECORD_DELETED); + table->status= error ? STATUS_NOT_FOUND : 0; + return error; +} + + +int ha_maria::index_init(uint idx, bool sorted) +{ + active_index=idx; + if (pushed_idx_cond_keyno == idx) + ma_set_index_cond_func(file, index_cond_func_maria, this); + return 0; +} + + +int ha_maria::index_end() +{ + active_index=MAX_KEY; + ma_set_index_cond_func(file, NULL, 0); + in_range_check_pushed_down= FALSE; + ds_mrr.dsmrr_close(); + return 0; +} + + +int ha_maria::rnd_init(bool scan) +{ + if (scan) + return maria_scan_init(file); + return maria_reset(file); // Free buffers +} + + +int ha_maria::rnd_end() +{ + ds_mrr.dsmrr_close(); + /* Safe to call even if we don't have started a scan */ + maria_scan_end(file); + return 0; +} + + +int ha_maria::rnd_next(uchar *buf) +{ + ha_statistic_increment(&SSV::ha_read_rnd_next_count); + int error= maria_scan(file, buf); + table->status= error ? 
STATUS_NOT_FOUND : 0; + return error; +} + + +int ha_maria::remember_rnd_pos() +{ + return (*file->s->scan_remember_pos)(file, &remember_pos); +} + + +int ha_maria::restart_rnd_next(uchar *buf) +{ + (*file->s->scan_restore_pos)(file, remember_pos); + return rnd_next(buf); +} + + +int ha_maria::rnd_pos(uchar *buf, uchar *pos) +{ + ha_statistic_increment(&SSV::ha_read_rnd_count); + int error= maria_rrnd(file, buf, my_get_ptr(pos, ref_length)); + table->status= error ? STATUS_NOT_FOUND : 0; + return error; +} + + +void ha_maria::position(const uchar *record) +{ + my_off_t row_position= maria_position(file); + my_store_ptr(ref, ref_length, row_position); +} + + +int ha_maria::info(uint flag) +{ + return info(flag, table->s->tmp_table == NO_TMP_TABLE); +} + +int ha_maria::info(uint flag, my_bool lock_table_share) +{ + MARIA_INFO maria_info; + char name_buff[FN_REFLEN]; + + (void) maria_status(file, &maria_info, flag); + if (flag & HA_STATUS_VARIABLE) + { + stats.records= maria_info.records; + stats.deleted= maria_info.deleted; + stats.data_file_length= maria_info.data_file_length; + stats.index_file_length= maria_info.index_file_length; + stats.delete_length= maria_info.delete_length; + stats.check_time= maria_info.check_time; + stats.mean_rec_length= maria_info.mean_reclength; + } + if (flag & HA_STATUS_CONST) + { + TABLE_SHARE *share= table->s; + stats.max_data_file_length= maria_info.max_data_file_length; + stats.max_index_file_length= maria_info.max_index_file_length; + stats.create_time= maria_info.create_time; + ref_length= maria_info.reflength; + share->db_options_in_use= maria_info.options; + stats.block_size= maria_block_size; + stats.mrr_length_per_rec= maria_info.reflength + 8; // 8 = max(sizeof(void *)) + + /* Update share */ + if (lock_table_share) + pthread_mutex_lock(&share->mutex); + share->keys_in_use.set_prefix(share->keys); + share->keys_in_use.intersect_extended(maria_info.key_map); + share->keys_for_keyread.intersect(share->keys_in_use); + 
share->db_record_offset= maria_info.record_offset; + if (share->key_parts) + { + ulong *to= table->key_info[0].rec_per_key, *end; + double *from= maria_info.rec_per_key; + for (end= to+ share->key_parts ; to < end ; to++, from++) + *to= (ulong) (*from + 0.5); + } + if (lock_table_share) + pthread_mutex_unlock(&share->mutex); + + /* + Set data_file_name and index_file_name to point at the symlink value + if table is symlinked (Ie; Real name is not same as generated name) + */ + data_file_name= index_file_name= 0; + fn_format(name_buff, file->s->open_file_name.str, "", MARIA_NAME_DEXT, + MY_APPEND_EXT | MY_UNPACK_FILENAME); + if (strcmp(name_buff, maria_info.data_file_name)) + data_file_name =maria_info.data_file_name; + fn_format(name_buff, file->s->open_file_name.str, "", MARIA_NAME_IEXT, + MY_APPEND_EXT | MY_UNPACK_FILENAME); + if (strcmp(name_buff, maria_info.index_file_name)) + index_file_name=maria_info.index_file_name; + } + if (flag & HA_STATUS_ERRKEY) + { + errkey= maria_info.errkey; + my_store_ptr(dup_ref, ref_length, maria_info.dup_key_pos); + } + /* Faster to always update, than to do it based on flag */ + stats.update_time= maria_info.update_time; + stats.auto_increment_value= maria_info.auto_increment; + + return 0; +} + + +int ha_maria::extra(enum ha_extra_function operation) +{ + int tmp; + TRN *old_trn= file->trn; + if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_KEYREAD) + return 0; +#ifdef NOT_USED + if (operation == HA_EXTRA_MMAP && !opt_maria_use_mmap) + return 0; +#endif + + /* + We have to set file->trn here because in some cases we call + extern_lock(F_UNLOCK) (which resets file->trn) followed by maria_close() + without calling commit/rollback in between. If file->trn is not set + we can't remove file->share from the transaction list in the extra() call. + + table->in_use is not set in the case this is a done as part of closefrm() + as part of drop table. 
+ */ + + if (file->s->now_transactional && !file->trn && table->in_use && + (operation == HA_EXTRA_PREPARE_FOR_DROP || + operation == HA_EXTRA_PREPARE_FOR_RENAME)) + { + THD *thd= table->in_use; + TRN *trn= THD_TRN; + _ma_set_trn_for_table(file, trn); + } + tmp= maria_extra(file, operation, 0); + file->trn= old_trn; // Reset trn if was used + return tmp; +} + +int ha_maria::reset(void) +{ + pushed_idx_cond= NULL; + pushed_idx_cond_keyno= MAX_KEY; + ma_set_index_cond_func(file, NULL, 0); + ds_mrr.dsmrr_close(); + if (file->trn) + { + /* Next statement is a new statement. Ensure it's logged */ + trnman_set_flags(file->trn, + trnman_get_flags(file->trn) & ~TRN_STATE_INFO_LOGGED); + } + return maria_reset(file); +} + +/* To be used with WRITE_CACHE and EXTRA_CACHE */ + +int ha_maria::extra_opt(enum ha_extra_function operation, ulong cache_size) +{ + if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_WRITE_CACHE) + return 0; + return maria_extra(file, operation, (void*) &cache_size); +} + + +int ha_maria::delete_all_rows() +{ + THD *thd= current_thd; + (void) translog_log_debug_info(file->trn, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), thd->query_length()); + if (file->s->now_transactional && + ((table->in_use->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) || + table->in_use->locked_tables)) + { + /* + We are not in autocommit mode or user have done LOCK TABLES. 
+ We must do the delete row by row to be able to rollback the command + */ + return HA_ERR_WRONG_COMMAND; + } + return maria_delete_all_rows(file); +} + + +int ha_maria::delete_table(const char *name) +{ + THD *thd= current_thd; + (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), thd->query_length()); + return maria_delete_table(name); +} + + +/* This is mainly for temporary tables, so no logging necessary */ + +void ha_maria::drop_table(const char *name) +{ + (void) close(); + (void) maria_delete_table(name); +} + + +int ha_maria::external_lock(THD *thd, int lock_type) +{ + DBUG_ENTER("ha_maria::external_lock"); + /* + We don't test now_transactional because it may vary between lock/unlock + and thus confuse our reference counting. + It is critical to skip non-transactional tables: user-visible temporary + tables get an external_lock() when read/written for the first time, but no + corresponding unlock (they just stay locked and are later dropped while + locked); if a tmp table was transactional, "SELECT FROM non_tmp, tmp" + would never commit as its "locked_tables" count would stay 1. + When Maria has has_transactions()==TRUE, open_temporary_table() + (sql_base.cc) will use TRANSACTIONAL_TMP_TABLE and thus the + external_lock(F_UNLCK) will happen and we can then allow the user to + create transactional temporary tables. + */ + if (file->s->base.born_transactional) + { + /* Transactional table */ + if (lock_type != F_UNLCK) + { + file->external_ptr= thd; // For maria_register_trn() + + if (!file->s->lock_key_trees) // If we don't use versioning + { + /* + We come here in the following cases: + - The table is a temporary table + - It's a table which is crash safe but not yet versioned, for + example a table with fulltext or rtree keys + + Set the current state to point to save_state so that the + block_format code don't count the same record twice. + Copy also the current state. 
This may have been wrong if the + same file was used several times in the last statement + */ + file->state= file->state_start; + *file->state= file->s->state.state; + } + + if (file->trn) + { + /* This can only happen with tables created with clone() */ + DBUG_ASSERT(cloned); + trnman_increment_locked_tables(file->trn); + } + + if (!thd->transaction.on) + { + /* + No need to log REDOs/UNDOs. If this is an internal temporary table + which will be renamed to a permanent table (like in ALTER TABLE), + the rename happens after unlocking so will be durable (and the table + will get its create_rename_lsn). + Note: if we wanted to enable users to have an old backup and apply + tons of archived logs to roll-forward, we could then not disable + REDOs/UNDOs in this case. + */ + DBUG_PRINT("info", ("Disabling logging for table")); + _ma_tmp_disable_logging_for_table(file, TRUE); + } + } + else + { + TRN *trn= THD_TRN; + /* End of transaction */ + + /* + We always re-enable, don't rely on thd->transaction.on as it is + sometimes reset to true after unlocking (see mysql_truncate() for a + partitioned table based on Maria). + Note that we can come here without having an exclusive lock on the + table, for example in this case: + external_lock(F_(WR|RD)LCK); thr_lock() which fails due to lock + abortion; external_lock(F_UNLCK). Fortunately, the re-enabling happens + only if we were the thread which disabled logging. + */ + if (_ma_reenable_logging_for_table(file, TRUE)) + DBUG_RETURN(1); + /** @todo zero file->trn also in commit and rollback */ + _ma_set_trn_for_table(file, NULL); // Safety + /* + Ensure that file->state points to the current number of rows. 
This + is needed if someone calls maria_info() without first doing an + external lock of the table + */ + file->state= &file->s->state.state; + if (trn) + { + DBUG_PRINT("info", + ("locked_tables: %u", trnman_has_locked_tables(trn))); + if (trnman_has_locked_tables(trn) && + !trnman_decrement_locked_tables(trn)) + { + /* + OK should not have been sent to client yet (ACID). + This is a bit excessive, ACID requires this only if there are some + changes to commit (rollback shouldn't be tested). + */ + DBUG_ASSERT(!thd->main_da.is_sent || + thd->killed == THD::KILL_CONNECTION); + /* autocommit ? rollback a transaction */ +#ifdef MARIA_CANNOT_ROLLBACK + if (ma_commit(trn)) + DBUG_RETURN(1); + THD_TRN= 0; +#else + if (!(thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) + { + trnman_rollback_trn(trn); + DBUG_PRINT("info", ("THD_TRN set to 0x0")); + THD_TRN= 0; + } +#endif + } + trnman_set_flags(trn, trnman_get_flags(trn) & ~ TRN_STATE_INFO_LOGGED); + } + } + } /* if transactional table */ + DBUG_RETURN(maria_lock_database(file, !table->s->tmp_table ? + lock_type : ((lock_type == F_UNLCK) ? + F_UNLCK : F_EXTRA_LCK))); +} + +int ha_maria::start_stmt(THD *thd, thr_lock_type lock_type) +{ + TRN *trn; + if (file->s->base.born_transactional) + { + trn= THD_TRN; + DBUG_ASSERT(trn); // this may be called only after external_lock() + DBUG_ASSERT(trnman_has_locked_tables(trn)); + DBUG_ASSERT(lock_type != TL_UNLOCK); + DBUG_ASSERT(file->trn == trn); + + /* + If there was an implicit commit under this LOCK TABLES by a previous + statement (like a DDL), at least if that previous statement was about a + different ha_maria than 'this' then this->file->trn is a stale + pointer. We fix it: + */ + _ma_set_trn_for_table(file, trn); + /* + As external_lock() was already called, don't increment locked_tables. + Note that we call the function below possibly several times when + statement starts (once per table). This is ok as long as that function + does cheap operations. 
Otherwise, we will need to do it only on first + call to start_stmt(). + */ + trnman_new_statement(trn); + +#ifdef EXTRA_DEBUG + if (!(trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED) && + trnman_get_flags(trn) & TRN_STATE_TABLES_CAN_CHANGE) + { + trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED); + (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), + thd->query_length()); + } +#endif + } + return 0; +} + + +/** + Performs an implicit commit of the Maria transaction and creates a new + one. + + This can be considered a hack. When Maria loses HA_NO_TRANSACTIONS it will + be participant in the connection's transaction and so the implicit commits + (ha_commit()) (like in end_active_trans()) will do the implicit commit + without need to call this function which can then be removed. + + @param thd THD object + @param new_trn if a new transaction should be created; a new + transaction is not needed when we know that the + tables will be unlocked very soon. +*/ + +int ha_maria::implicit_commit(THD *thd, bool new_trn) +{ +#ifndef MARIA_CANNOT_ROLLBACK +#error this method should be removed +#endif + TRN *trn; + int error= 0; + TABLE *table; + DBUG_ENTER("ha_maria::implicit_commit"); + if (!new_trn && thd->locked_tables) + { + /* + "we are under LOCK TABLES" <=> "we shouldn't commit". + As thd->locked_tables is true, we are either under LOCK TABLES, or in + prelocking; prelocking can be under LOCK TABLES, or not (and in this + latter case only we should commit). 
+ Note that we come here only at the end of the top statement + (dispatch_command()), we are never committing inside a sub-statement./ + */ + enum prelocked_mode_type prelocked_mode= thd->prelocked_mode; + if ((prelocked_mode == NON_PRELOCKED) || + (prelocked_mode == PRELOCKED_UNDER_LOCK_TABLES)) + { + DBUG_PRINT("info", ("locked_tables, skipping")); + DBUG_RETURN(0); + } + } + if ((trn= THD_TRN) != NULL) + { + uint locked_tables= trnman_has_locked_tables(trn); + if (unlikely(ma_commit(trn))) + error= 1; + if (!new_trn) + { + THD_TRN= NULL; + goto end; + } + /* + We need to create a new transaction and put it in THD_TRN. Indeed, + tables may be under LOCK TABLES, and so they will start the next + statement assuming they have a trn (see ha_maria::start_stmt()). + */ + trn= trnman_new_trn(& thd->transaction.wt); + /* This is just a commit, tables stay locked if they were: */ + trnman_reset_locked_tables(trn, locked_tables); + THD_TRN= trn; + if (unlikely(trn == NULL)) + error= HA_ERR_OUT_OF_MEM; + + /* + Move all locked tables to the new transaction + We must do it here as otherwise file->thd and file->state may be + stale pointers. We can't do this in start_stmt() as we don't know + when we should call _ma_setup_live_state() and in some cases, like + in check table, we use the table without calling start_stmt(). 
+ */ + for (table=thd->open_tables; table ; table=table->next) + { + if (table->db_stat && table->file->ht == maria_hton) + { + MARIA_HA *handler= ((ha_maria*) table->file)->file; + if (handler->s->base.born_transactional) + { + _ma_set_trn_for_table(handler, trn); + /* If handler uses versioning */ + if (handler->s->lock_key_trees) + { + if (_ma_setup_live_state(handler)) + error= HA_ERR_OUT_OF_MEM; + } + } + } + } + } +end: + DBUG_RETURN(error); +} + + +THR_LOCK_DATA **ha_maria::store_lock(THD *thd, + THR_LOCK_DATA **to, + enum thr_lock_type lock_type) +{ + /* Test if we can fix test below */ + DBUG_ASSERT(lock_type != TL_UNLOCK && + (lock_type == TL_IGNORE || file->lock.type == TL_UNLOCK)); + if (lock_type != TL_IGNORE && file->lock.type == TL_UNLOCK) + { + const enum enum_sql_command sql_command= thd->lex->sql_command; + /* + We have to disable concurrent inserts for INSERT ... SELECT or + INSERT/UPDATE/DELETE with sub queries if we are using statement based + logging. We take the safe route here and disable this for all commands + that only does reading that are not SELECT. + */ + if (lock_type <= TL_READ_HIGH_PRIORITY && + !thd->current_stmt_binlog_row_based && + (sql_command != SQLCOM_SELECT && + sql_command != SQLCOM_LOCK_TABLES) && + (thd->options & OPTION_BIN_LOG) && + mysql_bin_log.is_open()) + lock_type= TL_READ_NO_INSERT; + else if (lock_type == TL_WRITE_CONCURRENT_INSERT) + { + const enum enum_duplicates duplicates= thd->lex->duplicates; + /* + Explanation for the 3 conditions below, in order: + + - Bulk insert may use repair, which will cause problems if other + threads try to read/insert to the table: disable versioning. + Note that our read of file->state->records is incorrect, as such + variable may have changed when we come to start_bulk_insert() (worse + case: we see != 0 so allow versioning, start_bulk_insert() sees 0 and + uses repair). This is prevented because start_bulk_insert() will not + try repair if we enabled versioning. 
+ - INSERT SELECT ON DUPLICATE KEY UPDATE comes here with + TL_WRITE_CONCURRENT_INSERT but shouldn't because it can do + update/delete of a row and versioning doesn't support that + - same for LOAD DATA CONCURRENT REPLACE. + */ + if ((file->state->records == 0) || + (sql_command == SQLCOM_INSERT_SELECT && duplicates == DUP_UPDATE) || + (sql_command == SQLCOM_LOAD && duplicates == DUP_REPLACE)) + lock_type= TL_WRITE; + } + file->lock.type= lock_type; + } + *to++= &file->lock; + return to; +} + + +void ha_maria::update_create_info(HA_CREATE_INFO *create_info) +{ + ha_maria::info(HA_STATUS_AUTO | HA_STATUS_CONST); + if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) + { + create_info->auto_increment_value= stats.auto_increment_value; + } + create_info->data_file_name= data_file_name; + create_info->index_file_name= index_file_name; + /* We need to restore the row type as Maria can change it */ + if (create_info->row_type != ROW_TYPE_DEFAULT && + !(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT)) + create_info->row_type= get_row_type(); + /* + Show always page checksums, as this can be forced with + maria_page_checksums variable + */ + if (create_info->page_checksum == HA_CHOICE_UNDEF) + create_info->page_checksum= + (file->s->options & HA_OPTION_PAGE_CHECKSUM) ? 
HA_CHOICE_YES : + HA_CHOICE_NO; +} + + +enum row_type ha_maria::get_row_type() const +{ + switch (file->s->data_file_type) { + case STATIC_RECORD: return ROW_TYPE_FIXED; + case DYNAMIC_RECORD: return ROW_TYPE_DYNAMIC; + case BLOCK_RECORD: return ROW_TYPE_PAGE; + case COMPRESSED_RECORD: return ROW_TYPE_COMPRESSED; + default: return ROW_TYPE_NOT_USED; + } +} + + +static enum data_file_type maria_row_type(HA_CREATE_INFO *info) +{ + if (info->transactional == HA_CHOICE_YES) + return BLOCK_RECORD; + switch (info->row_type) { + case ROW_TYPE_FIXED: return STATIC_RECORD; + case ROW_TYPE_DYNAMIC: return DYNAMIC_RECORD; + default: return BLOCK_RECORD; + } +} + + +int ha_maria::create(const char *name, register TABLE *table_arg, + HA_CREATE_INFO *ha_create_info) +{ + int error; + uint create_flags= 0, record_count, i; + char buff[FN_REFLEN]; + MARIA_KEYDEF *keydef; + MARIA_COLUMNDEF *recinfo; + MARIA_CREATE_INFO create_info; + TABLE_SHARE *share= table_arg->s; + uint options= share->db_options_in_use; + enum data_file_type row_type; + THD *thd= current_thd; + DBUG_ENTER("ha_maria::create"); + + for (i= 0; i < share->keys; i++) + { + if (table_arg->key_info[i].flags & HA_USES_PARSER) + { + create_flags|= HA_CREATE_RELIES_ON_SQL_LAYER; + break; + } + } + /* Note: BLOCK_RECORD is used if table is transactional */ + row_type= maria_row_type(ha_create_info); + if (ha_create_info->transactional == HA_CHOICE_YES && + ha_create_info->row_type != ROW_TYPE_PAGE && + ha_create_info->row_type != ROW_TYPE_NOT_USED && + ha_create_info->row_type != ROW_TYPE_DEFAULT) + push_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_NOTE, + ER_ILLEGAL_HA_CREATE_OPTION, + "Row format set to PAGE because of TRANSACTIONAL=1 option"); + + bzero((char*) &create_info, sizeof(create_info)); + if ((error= table2maria(table_arg, row_type, &keydef, &recinfo, + &record_count, &create_info))) + DBUG_RETURN(error); /* purecov: inspected */ + create_info.max_rows= share->max_rows; + create_info.reloc_rows= 
share->min_rows; + create_info.with_auto_increment= share->next_number_key_offset == 0; + create_info.auto_increment= (ha_create_info->auto_increment_value ? + ha_create_info->auto_increment_value -1 : + (ulonglong) 0); + create_info.data_file_length= ((ulonglong) share->max_rows * + share->avg_row_length); + create_info.data_file_name= ha_create_info->data_file_name; + create_info.index_file_name= ha_create_info->index_file_name; + create_info.language= share->table_charset->number; + + /* + Table is transactional: + - If the user specify that table is transactional (in this case + row type is forced to BLOCK_RECORD) + - If they specify BLOCK_RECORD without specifying transactional behaviour + + Shouldn't this test be pushed down to maria_create()? Because currently, + ma_test1 -T crashes: it creates a table with DYNAMIC_RECORD but has + born_transactional==1, which confuses some recovery-related code. + */ + create_info.transactional= (row_type == BLOCK_RECORD && + ha_create_info->transactional != HA_CHOICE_NO); + + if (ha_create_info->options & HA_LEX_CREATE_TMP_TABLE) + create_flags|= HA_CREATE_TMP_TABLE; + if (ha_create_info->options & HA_CREATE_KEEP_FILES) + create_flags|= HA_CREATE_KEEP_FILES; + if (options & HA_OPTION_PACK_RECORD) + create_flags|= HA_PACK_RECORD; + if (options & HA_OPTION_CHECKSUM) + create_flags|= HA_CREATE_CHECKSUM; + if (options & HA_OPTION_DELAY_KEY_WRITE) + create_flags|= HA_CREATE_DELAY_KEY_WRITE; + if ((ha_create_info->page_checksum == HA_CHOICE_UNDEF && + maria_page_checksums) || + ha_create_info->page_checksum == HA_CHOICE_YES) + create_flags|= HA_CREATE_PAGE_CHECKSUM; + + (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), thd->query_length()); + + /* TODO: Check that the following fn_format is really needed */ + error= + maria_create(fn_format(buff, name, "", "", + MY_UNPACK_FILENAME | MY_APPEND_EXT), + row_type, share->keys, keydef, + record_count, recinfo, + 0, (MARIA_UNIQUEDEF *) 0, + 
&create_info, create_flags); + + my_free((uchar*) recinfo, MYF(0)); + DBUG_RETURN(error); +} + + +int ha_maria::rename_table(const char *from, const char *to) +{ + THD *thd= current_thd; + (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), thd->query_length()); + return maria_rename(from, to); +} + + +void ha_maria::get_auto_increment(ulonglong offset, ulonglong increment, + ulonglong nb_desired_values, + ulonglong *first_value, + ulonglong *nb_reserved_values) +{ + ulonglong nr; + int error; + uchar key[HA_MAX_KEY_LENGTH]; + + if (!table->s->next_number_key_offset) + { // Autoincrement at key-start + ha_maria::info(HA_STATUS_AUTO); + *first_value= stats.auto_increment_value; + /* Maria has only table-level lock for now, so reserves to +inf */ + *nb_reserved_values= ULONGLONG_MAX; + return; + } + + /* it's safe to call the following if bulk_insert isn't on */ + maria_flush_bulk_insert(file, table->s->next_number_index); + + (void) extra(HA_EXTRA_KEYREAD); + key_copy(key, table->record[0], + table->key_info + table->s->next_number_index, + table->s->next_number_key_offset); + error= maria_rkey(file, table->record[1], (int) table->s->next_number_index, + key, make_prev_keypart_map(table->s->next_number_keypart), + HA_READ_PREFIX_LAST); + if (error) + nr= 1; + else + { + /* Get data from record[1] */ + nr= ((ulonglong) table->next_number_field-> + val_int_offset(table->s->rec_buff_length) + 1); + } + extra(HA_EXTRA_NO_KEYREAD); + *first_value= nr; + /* + MySQL needs to call us for next row: assume we are inserting ("a",null) + here, we return 3, and next this statement will want to insert ("b",null): + there is no reason why ("b",3+1) would be the good row to insert: maybe it + already exists, maybe 3+1 is too large... + */ + *nb_reserved_values= 1; +} + + +/* + Find out how many rows there is in the given range + + SYNOPSIS + records_in_range() + inx Index to use + min_key Start of range. 
Null pointer if from first key + max_key End of range. Null pointer if to last key + + NOTES + min_key.flag can have one of the following values: + HA_READ_KEY_EXACT Include the key in the range + HA_READ_AFTER_KEY Don't include key in range + + max_key.flag can have one of the following values: + HA_READ_BEFORE_KEY Don't include key in range + HA_READ_AFTER_KEY Include all 'end_key' values in the range + + RETURN + HA_POS_ERROR Something is wrong with the index tree. + 0 There is no matching keys in the given range + number > 0 There is approximately 'number' matching rows in + the range. +*/ + +ha_rows ha_maria::records_in_range(uint inx, key_range *min_key, + key_range *max_key) +{ + return (ha_rows) maria_records_in_range(file, (int) inx, min_key, max_key); +} + + +int ha_maria::ft_read(uchar * buf) +{ + int error; + + if (!ft_handler) + return -1; + + thread_safe_increment(table->in_use->status_var.ha_read_next_count, + &LOCK_status); // why ? + + error= ft_handler->please->read_next(ft_handler, (char*) buf); + + table->status= error ? 
STATUS_NOT_FOUND : 0; + return error; +} + + +uint ha_maria::checksum() const +{ + return (uint) file->state->checksum; +} + + +bool ha_maria::check_if_incompatible_data(HA_CREATE_INFO *create_info, + uint table_changes) +{ + DBUG_ENTER("check_if_incompatible_data"); + uint options= table->s->db_options_in_use; + + if (create_info->auto_increment_value != stats.auto_increment_value || + create_info->data_file_name != data_file_name || + create_info->index_file_name != index_file_name || + (maria_row_type(create_info) != data_file_type && + create_info->row_type != ROW_TYPE_DEFAULT) || + table_changes == IS_EQUAL_NO || + (table_changes & IS_EQUAL_PACK_LENGTH)) // Not implemented yet + DBUG_RETURN(COMPATIBLE_DATA_NO); + + if ((options & (HA_OPTION_CHECKSUM | + HA_OPTION_DELAY_KEY_WRITE)) != + (create_info->table_options & (HA_OPTION_CHECKSUM | + HA_OPTION_DELAY_KEY_WRITE))) + DBUG_RETURN(COMPATIBLE_DATA_NO); + DBUG_RETURN(COMPATIBLE_DATA_YES); +} + + +static int maria_hton_panic(handlerton *hton, ha_panic_function flag) +{ + /* If no background checkpoints, we need to do one now */ + return ((checkpoint_interval == 0) ? + ma_checkpoint_execute(CHECKPOINT_FULL, FALSE) : 0) | maria_panic(flag); +} + + +static int maria_commit(handlerton *hton __attribute__ ((unused)), + THD *thd, bool all) +{ + TRN *trn= THD_TRN; + DBUG_ENTER("maria_commit"); + trnman_reset_locked_tables(trn, 0); + trnman_set_flags(trn, trnman_get_flags(trn) & ~TRN_STATE_INFO_LOGGED); + + /* statement or transaction ? */ + if ((thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && !all) + DBUG_RETURN(0); // end of statement + DBUG_PRINT("info", ("THD_TRN set to 0x0")); + THD_TRN= 0; + DBUG_RETURN(ma_commit(trn)); // end of transaction +} + + +static int maria_rollback(handlerton *hton __attribute__ ((unused)), + THD *thd, bool all) +{ + TRN *trn= THD_TRN; + DBUG_ENTER("maria_rollback"); + trnman_reset_locked_tables(trn, 0); + /* statement or transaction ? 
*/ + if ((thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && !all) + { + trnman_rollback_statement(trn); + DBUG_RETURN(0); // end of statement + } + DBUG_PRINT("info", ("THD_TRN set to 0x0")); + THD_TRN= 0; + DBUG_RETURN(trnman_rollback_trn(trn) ? + HA_ERR_OUT_OF_MEM : 0); // end of transaction +} + + + +/** + @brief flush log handler + + @param hton maria handlerton (unused) + + @retval FALSE OK + @retval TRUE Error +*/ + +bool maria_flush_logs(handlerton *hton) +{ + return test(translog_purge_at_flush()); +} + + +#define SHOW_MSG_LEN (FN_REFLEN + 20) +/** + @brief show status handler + + @param hton maria handlerton + @param thd thread handler + @param print print function + @param stat type of status +*/ + +bool maria_show_status(handlerton *hton, + THD *thd, + stat_print_fn *print, + enum ha_stat_type stat) +{ + const LEX_STRING *engine_name= hton_name(hton); + switch (stat) { + case HA_ENGINE_LOGS: + { + TRANSLOG_ADDRESS horizon= translog_get_horizon(); + uint32 last_file= LSN_FILE_NO(horizon); + uint32 first_needed= translog_get_first_needed_file(); + uint32 first_file= translog_get_first_file(horizon); + uint32 i; + const char unknown[]= "unknown"; + const char needed[]= "in use"; + const char unneeded[]= "free"; + char path[FN_REFLEN]; + + if (first_file == 0) + { + const char error[]= "error"; + print(thd, engine_name->str, engine_name->length, + STRING_WITH_LEN(""), error, sizeof(error) - 1); + break; + } + + for (i= first_file; i <= last_file; i++) + { + char *file; + const char *status; + uint length, status_len; + MY_STAT stat_buff, *stat; + const char error[]= "can't stat"; + char object[SHOW_MSG_LEN]; + file= translog_filename_by_fileno(i, path); + if (!(stat= my_stat(file, &stat_buff, MYF(0)))) + { + status= error; + status_len= sizeof(error) - 1; + length= my_snprintf(object, SHOW_MSG_LEN, "Size unknown ; %s", file); + } + else + { + if (first_needed == 0) + { + status= unknown; + status_len= sizeof(unknown) - 1; + } + else if (i < 
first_needed) + { + status= unneeded; + status_len= sizeof(unneeded) - 1; + } + else + { + status= needed; + status_len= sizeof(needed) - 1; + } + length= my_snprintf(object, SHOW_MSG_LEN, "Size %12lu ; %s", + (ulong) stat->st_size, file); + } + + print(thd, engine_name->str, engine_name->length, + object, length, status, status_len); + } + break; + } + case HA_ENGINE_STATUS: + case HA_ENGINE_MUTEX: + default: + break; + } + return 0; +} + + +/** + Callback to delete all logs in directory. This is lower-level than other + functions in ma_loghandler.c which delete logs, as it does not rely on + translog_init() having been called first. + + @param directory directory where file is + @param filename base name of the file to delete +*/ + +static my_bool translog_callback_delete_all(const char *directory, + const char *filename) +{ + char complete_name[FN_REFLEN]; + fn_format(complete_name, filename, directory, "", MYF(MY_UNPACK_FILENAME)); + return my_delete(complete_name, MYF(MY_WME)); +} + + +/** + Helper function for option aria-force-start-after-recovery-failures. + Deletes logs if too many failures. Otherwise, increments the counter of + failures in the control file. + Notice how this has to be called _before_ translog_init() (if log is + corrupted, translog_init() might crash the server, so we need to remove logs + before). + + @param log_dir directory where logs to be deleted are +*/ + +static int mark_recovery_start(const char* log_dir) +{ + int res; + DBUG_ENTER("mark_recovery_start"); + if (unlikely(maria_recover_options == HA_RECOVER_NONE)) + ma_message_no_user(ME_JUST_WARNING, "Please consider using option" + " --aria-recover[=...] 
to automatically check and" + " repair tables when logs are removed by option" + " --aria-force-start-after-recovery-failures=#"); + if (recovery_failures >= force_start_after_recovery_failures) + { + /* + Remove logs which cause the problem; keep control file which has + critical info like uuid, max_trid (removing control file may make + correct tables look corrupted!). + */ + char msg[100]; + res= translog_walk_filenames(log_dir, &translog_callback_delete_all); + my_snprintf(msg, sizeof(msg), + "%s logs after %u consecutive failures of" + " recovery from logs", + (res ? "failed to remove some" : "removed all"), + recovery_failures); + ma_message_no_user((res ? 0 : ME_JUST_WARNING), msg); + } + else + res= ma_control_file_write_and_force(last_checkpoint_lsn, last_logno, + max_trid_in_control_file, + recovery_failures + 1); + DBUG_RETURN(res); +} + + +/** + Helper function for option aria-force-start-after-recovery-failures. + Records in the control file that recovery was a success, so that it's not + counted for aria-force-start-after-recovery-failures. 
+*/ + +static int mark_recovery_success(void) +{ + /* success of recovery, reset recovery_failures: */ + int res; + DBUG_ENTER("mark_recovery_success"); + res= ma_control_file_write_and_force(last_checkpoint_lsn, last_logno, + max_trid_in_control_file, 0); + DBUG_RETURN(res); +} + + +/* + Return 1 if table has changed during the current transaction +*/ + +bool ha_maria::is_changed() const +{ + return file->state->changed; +} + + +static int ha_maria_init(void *p) +{ + int res; + copy_variable_aliases(); + const char *log_dir= maria_data_root; + maria_hton= (handlerton *)p; + maria_hton->state= SHOW_OPTION_YES; + maria_hton->db_type= DB_TYPE_UNKNOWN; + maria_hton->create= maria_create_handler; + maria_hton->panic= maria_hton_panic; + maria_hton->commit= maria_commit; + maria_hton->rollback= maria_rollback; + maria_hton->flush_logs= maria_flush_logs; + maria_hton->show_status= maria_show_status; + /* TODO: decide if we support Maria being used for log tables */ + maria_hton->flags= HTON_CAN_RECREATE | HTON_SUPPORT_LOG_TABLES; + bzero(maria_log_pagecache, sizeof(*maria_log_pagecache)); + maria_tmpdir= &mysql_tmpdir_list; /* For REDO */ + res= maria_upgrade() || maria_init() || ma_control_file_open(TRUE, TRUE) || + ((force_start_after_recovery_failures != 0) && + mark_recovery_start(log_dir)) || + !init_pagecache(maria_pagecache, + (size_t) pagecache_buffer_size, pagecache_division_limit, + pagecache_age_threshold, maria_block_size, 0) || + !init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0) || + translog_init(maria_data_root, log_file_size, + MYSQL_VERSION_ID, server_id, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS, 0) || + maria_recovery_from_log() || + ((force_start_after_recovery_failures != 0 || + maria_recovery_changed_data) && mark_recovery_success()) || + ma_checkpoint_init(checkpoint_interval); + maria_multi_threaded= maria_in_ha_maria= TRUE; + maria_create_trn_hook= maria_create_trn_for_mysql; + +#if 
defined(HAVE_REALPATH) && !defined(HAVE_valgrind) && !defined(HAVE_BROKEN_REALPATH) + /* We can only test for sub paths if my_symlink.c is using realpath */ + maria_test_invalid_symlink= test_if_data_home_dir; +#endif + if (res) + maria_hton= 0; + return res ? HA_ERR_INITIALIZATION : 0; +} + + +#ifdef HAVE_QUERY_CACHE +/** + @brief Register a named table with a call back function to the query cache. + + @param thd The thread handle + @param table_key A pointer to the table name in the table cache + @param key_length The length of the table name + @param[out] engine_callback The pointer to the storage engine call back + function, currently 0 + @param[out] engine_data Engine data will be set to 0. + + @note Despite the name of this function, it is used to check each statement + before it is cached and not to register a table or callback function. + + @see handler::register_query_cache_table + + @return The error code. The engine_data and engine_callback will be set to 0. + @retval TRUE Success + @retval FALSE An error occurred +*/ + +my_bool ha_maria::register_query_cache_table(THD *thd, char *table_name, + uint table_name_len, + qc_engine_callback + *engine_callback, + ulonglong *engine_data) +{ + ulonglong actual_data_file_length; + ulonglong current_data_file_length; + DBUG_ENTER("ha_maria::register_query_cache_table"); + + /* + No call back function is needed to determine if a cached statement + is valid or not. + */ + *engine_callback= 0; + + /* + No engine data is needed. + */ + *engine_data= 0; + + if (file->s->now_transactional && file->s->have_versioning) + return (file->trn->trid >= file->s->state.last_change_trn); + + /* + If a concurrent INSERT has happened just before the currently processed + SELECT statement, the total size of the table is unknown. + + To determine if the table size is known, the current thread's snap shot of + the table size with the actual table size are compared. + + If the table size is unknown the SELECT statement can't be cached. 
+ */ + + /* + POSIX visibility rules specify that "2. Whatever memory values a + thread can see when it unlocks a mutex <...> can also be seen by any + thread that later locks the same mutex". In this particular case, + concurrent insert thread had modified the data_file_length in + MYISAM_SHARE before it has unlocked (or even locked) + structure_guard_mutex. So, here we're guaranteed to see at least that + value after we've locked the same mutex. We can see a later value + (modified by some other thread) though, but it's ok, as we only want + to know if the variable was changed, the actual new value doesn't matter + */ + actual_data_file_length= file->s->state.state.data_file_length; + current_data_file_length= file->state->data_file_length; + + /* Return whether is ok to try to cache current statement. */ + DBUG_RETURN(!(file->s->non_transactional_concurrent_insert && + current_data_file_length != actual_data_file_length)); +} +#endif + +struct st_mysql_sys_var* system_variables[]= { + MYSQL_SYSVAR(block_size), + MYSQL_SYSVAR(checkpoint_interval), + MYSQL_SYSVAR(force_start_after_recovery_failures), + MYSQL_SYSVAR(group_commit), + MYSQL_SYSVAR(group_commit_interval), + MYSQL_SYSVAR(log_dir_path), + MYSQL_SYSVAR(log_file_size), + MYSQL_SYSVAR(log_purge_type), + MYSQL_SYSVAR(max_sort_file_size), + MYSQL_SYSVAR(page_checksum), + MYSQL_SYSVAR(pagecache_age_threshold), + MYSQL_SYSVAR(pagecache_buffer_size), + MYSQL_SYSVAR(pagecache_division_limit), + MYSQL_SYSVAR(recover), + MYSQL_SYSVAR(repair_threads), + MYSQL_SYSVAR(sort_buffer_size), + MYSQL_SYSVAR(stats_method), + MYSQL_SYSVAR(sync_log_dir), + MYSQL_SYSVAR(used_for_temp_tables), + NULL +}; + + +/** + @brief Updates the checkpoint interval and restarts the background thread. 
+*/ + +static void update_checkpoint_interval(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save) +{ + ma_checkpoint_end(); + ma_checkpoint_init(*(ulong *)var_ptr= (ulong)(*(long *)save)); +} + +/** + @brief Updates group commit mode +*/ + +static void update_maria_group_commit(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save) +{ + ulong value= (ulong)*((long *)var_ptr); + DBUG_ENTER("update_maria_group_commit"); + DBUG_PRINT("enter", ("old value: %lu new value %lu rate %lu", + value, (ulong)(*(long *)save), + maria_group_commit_interval)); + /* old value */ + switch (value) { + case TRANSLOG_GCOMMIT_NONE: + break; + case TRANSLOG_GCOMMIT_HARD: + translog_hard_group_commit(FALSE); + break; + case TRANSLOG_GCOMMIT_SOFT: + translog_soft_sync(FALSE); + if (maria_group_commit_interval) + translog_soft_sync_end(); + break; + default: + DBUG_ASSERT(0); /* impossible */ + } + value= *(ulong *)var_ptr= (ulong)(*(long *)save); + translog_sync(); + /* new value */ + switch (value) { + case TRANSLOG_GCOMMIT_NONE: + break; + case TRANSLOG_GCOMMIT_HARD: + translog_hard_group_commit(TRUE); + break; + case TRANSLOG_GCOMMIT_SOFT: + translog_soft_sync(TRUE); + /* variable change made under global lock so we can just read it */ + if (maria_group_commit_interval) + translog_soft_sync_start(); + break; + default: + DBUG_ASSERT(0); /* impossible */ + } + DBUG_VOID_RETURN; +} + +/** + @brief Updates group commit interval +*/ + +static void update_maria_group_commit_interval(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save) +{ + ulong new_value= (ulong)*((long *)save); + ulong *value_ptr= (ulong*) var_ptr; + DBUG_ENTER("update_maria_group_commit_interval"); + DBUG_PRINT("enter", ("old value: %lu new value %lu group commit %lu", + *value_ptr, new_value, maria_group_commit)); + + /* variable change made under global lock so we can just read it */ + switch (maria_group_commit) { + case 
TRANSLOG_GCOMMIT_NONE: + *value_ptr= new_value; + translog_set_group_commit_interval(new_value); + break; + case TRANSLOG_GCOMMIT_HARD: + *value_ptr= new_value; + translog_set_group_commit_interval(new_value); + break; + case TRANSLOG_GCOMMIT_SOFT: + if (*value_ptr) + translog_soft_sync_end(); + translog_set_group_commit_interval(new_value); + if ((*value_ptr= new_value)) + translog_soft_sync_start(); + break; + default: + DBUG_ASSERT(0); /* impossible */ + } + DBUG_VOID_RETURN; +} + +/** + @brief Updates the transaction log file limit. +*/ + +static void update_log_file_size(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save) +{ + uint32 size= (uint32)((ulong)(*(long *)save)); + translog_set_file_size(size); + *(ulong *)var_ptr= size; +} + + +SHOW_VAR status_variables[]= { + {"pagecache_blocks_not_flushed", (char*) &maria_pagecache_var.global_blocks_changed, SHOW_LONG_NOFLUSH}, + {"pagecache_blocks_unused", (char*) &maria_pagecache_var.blocks_unused, SHOW_LONG_NOFLUSH}, + {"pagecache_blocks_used", (char*) &maria_pagecache_var.blocks_used, SHOW_LONG_NOFLUSH}, + {"pagecache_read_requests", (char*) &maria_pagecache_var.global_cache_r_requests, SHOW_LONGLONG}, + {"pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG}, + {"pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG}, + {"pagecache_writes", (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG}, + {"transaction_log_syncs", (char*) &translog_syncs, SHOW_LONGLONG}, + {NullS, NullS, SHOW_LONG} +}; + +static struct st_mysql_show_var aria_status_variables[]= { + {"Aria", (char*) &status_variables, SHOW_ARRAY}, + {NullS, NullS, SHOW_LONG} +}; + +/**************************************************************************** + * Maria MRR implementation: use DS-MRR + ***************************************************************************/ + +int ha_maria::multi_range_read_init(RANGE_SEQ_IF *seq, void 
*seq_init_param, + uint n_ranges, uint mode, + HANDLER_BUFFER *buf) +{ + return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf); +} + +int ha_maria::multi_range_read_next(char **range_info) +{ + return ds_mrr.dsmrr_next(range_info); +} + +ha_rows ha_maria::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, + uint n_ranges, uint *bufsz, + uint *flags, COST_VECT *cost) +{ + /* + This call is here because there is no location where this->table would + already be known. + TODO: consider moving it into some per-query initialization call. + */ + ds_mrr.init(this, table); + return ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges, bufsz, + flags, cost); +} + +ha_rows ha_maria::multi_range_read_info(uint keyno, uint n_ranges, uint keys, + uint *bufsz, uint *flags, + COST_VECT *cost) +{ + ds_mrr.init(this, table); + return ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost); +} + +/* MyISAM MRR implementation ends */ + + +/* Index condition pushdown implementation*/ + + +Item *ha_maria::idx_cond_push(uint keyno_arg, Item* idx_cond_arg) +{ + pushed_idx_cond_keyno= keyno_arg; + pushed_idx_cond= idx_cond_arg; + in_range_check_pushed_down= TRUE; + if (active_index == pushed_idx_cond_keyno) + ma_set_index_cond_func(file, index_cond_func_maria, this); + return NULL; +} + + + + +struct st_mysql_storage_engine maria_storage_engine= +{ MYSQL_HANDLERTON_INTERFACE_VERSION }; + +maria_declare_plugin(aria) +compat_aliases, +{ + MYSQL_STORAGE_ENGINE_PLUGIN, + &maria_storage_engine, + "Aria", + "Monty Program Ab", + "Crash-safe tables with MyISAM heritage", + PLUGIN_LICENSE_GPL, + ha_maria_init, /* Plugin Init */ + NULL, /* Plugin Deinit */ + 0x0105, /* 1.5 */ + aria_status_variables, /* status variables */ + system_variables, /* system variables */ + "1.5", /* string version */ + MariaDB_PLUGIN_MATURITY_GAMMA /* maturity */ +} +maria_declare_plugin_end; diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h 
new file mode 100644 index 00000000000..605ad1d3a20 --- /dev/null +++ b/storage/maria/ha_maria.h @@ -0,0 +1,197 @@ +/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef HA_MARIA_INCLUDED +#define HA_MARIA_INCLUDED + +#ifdef USE_PRAGMA_INTERFACE +#pragma interface /* gcc class implementation */ +#endif + +/* class for the maria handler */ + +#include <maria.h> + +#define HA_RECOVER_NONE 0 /* No automatic recover */ +#define HA_RECOVER_DEFAULT 1 /* Automatic recover active */ +#define HA_RECOVER_BACKUP 2 /* Make a backupfile on recover */ +#define HA_RECOVER_FORCE 4 /* Recover even if we loose rows */ +#define HA_RECOVER_QUICK 8 /* Don't check rows in data file */ + +C_MODE_START +ICP_RESULT index_cond_func_maria(void *arg); +C_MODE_END + +extern ulong maria_sort_buffer_size; +extern TYPELIB maria_recover_typelib; +extern ulong maria_recover_options; + +class ha_maria :public handler +{ + MARIA_HA *file; + ulonglong int_table_flags; + MARIA_RECORD_POS remember_pos; + char *data_file_name, *index_file_name; + enum data_file_type data_file_type; + bool can_enable_indexes; + /** + If a transactional table is doing bulk insert with a single + UNDO_BULK_INSERT with/without repair. 
+ */ + uint8 bulk_insert_single_undo; + int repair(THD * thd, HA_CHECK *param, bool optimize); + int zerofill(THD * thd, HA_CHECK_OPT *check_opt); + +public: + ha_maria(handlerton *hton, TABLE_SHARE * table_arg); + ~ha_maria() {} + handler *clone(MEM_ROOT *mem_root); + const char *table_type() const + { return "Aria"; } + const char *index_type(uint key_number); + const char **bas_ext() const; + ulonglong table_flags() const + { return int_table_flags; } + ulong index_flags(uint inx, uint part, bool all_parts) const + { + return ((table_share->key_info[inx].algorithm == HA_KEY_ALG_FULLTEXT) ? + 0 : HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE | + HA_READ_ORDER | HA_KEYREAD_ONLY | HA_DO_INDEX_COND_PUSHDOWN); + } + uint max_supported_keys() const + { return MARIA_MAX_KEY; } + uint max_supported_key_length() const; + uint max_supported_key_part_length() const + { return max_supported_key_length(); } + enum row_type get_row_type() const; + uint checksum() const; + virtual double scan_time(); + + int open(const char *name, int mode, uint test_if_locked); + int close(void); + int write_row(uchar * buf); + int update_row(const uchar * old_data, uchar * new_data); + int delete_row(const uchar * buf); + int index_read_map(uchar * buf, const uchar * key, key_part_map keypart_map, + enum ha_rkey_function find_flag); + int index_read_idx_map(uchar * buf, uint idx, const uchar * key, + key_part_map keypart_map, + enum ha_rkey_function find_flag); + int index_read_last_map(uchar * buf, const uchar * key, + key_part_map keypart_map); + int index_next(uchar * buf); + int index_prev(uchar * buf); + int index_first(uchar * buf); + int index_last(uchar * buf); + int index_next_same(uchar * buf, const uchar * key, uint keylen); + int ft_init() + { + if (!ft_handler) + return 1; + ft_handler->please->reinit_search(ft_handler); + return 0; + } + FT_INFO *ft_init_ext(uint flags, uint inx, String * key) + { + return maria_ft_init_search(flags, file, inx, + (uchar *) key->ptr(), 
key->length(), + key->charset(), table->record[0]); + } + int ft_read(uchar * buf); + int index_init(uint idx, bool sorted); + int index_end(); + int rnd_init(bool scan); + int rnd_end(void); + int rnd_next(uchar * buf); + int rnd_pos(uchar * buf, uchar * pos); + int remember_rnd_pos(); + int restart_rnd_next(uchar * buf); + void position(const uchar * record); + int info(uint); + int info(uint, my_bool); + int extra(enum ha_extra_function operation); + int extra_opt(enum ha_extra_function operation, ulong cache_size); + int reset(void); + int external_lock(THD * thd, int lock_type); + int start_stmt(THD *thd, thr_lock_type lock_type); + int delete_all_rows(void); + int disable_indexes(uint mode); + int enable_indexes(uint mode); + int indexes_are_disabled(void); + void start_bulk_insert(ha_rows rows); + int end_bulk_insert(); + ha_rows records_in_range(uint inx, key_range * min_key, key_range * max_key); + void update_create_info(HA_CREATE_INFO * create_info); + int create(const char *name, TABLE * form, HA_CREATE_INFO * create_info); + THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to, + enum thr_lock_type lock_type); + virtual void get_auto_increment(ulonglong offset, ulonglong increment, + ulonglong nb_desired_values, + ulonglong *first_value, + ulonglong *nb_reserved_values); + int rename_table(const char *from, const char *to); + int delete_table(const char *name); + void drop_table(const char *name); + int check(THD * thd, HA_CHECK_OPT * check_opt); + int analyze(THD * thd, HA_CHECK_OPT * check_opt); + int repair(THD * thd, HA_CHECK_OPT * check_opt); + bool check_and_repair(THD * thd); + bool is_crashed() const; + bool is_changed() const; + bool auto_repair() const { return maria_recover_options != HA_RECOVER_NONE; } + int optimize(THD * thd, HA_CHECK_OPT * check_opt); + int restore(THD * thd, HA_CHECK_OPT * check_opt); + int backup(THD * thd, HA_CHECK_OPT * check_opt); + int assign_to_keycache(THD * thd, HA_CHECK_OPT * check_opt); + int 
preload_keys(THD * thd, HA_CHECK_OPT * check_opt); + bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes); + bool check_if_supported_virtual_columns(void) { return TRUE;} +#ifdef HAVE_REPLICATION + int dump(THD * thd, int fd); + int net_read_dump(NET * net); +#endif +#ifdef HAVE_QUERY_CACHE + my_bool register_query_cache_table(THD *thd, char *table_key, + uint key_length, + qc_engine_callback + *engine_callback, + ulonglong *engine_data); +#endif + MARIA_HA *file_ptr(void) + { + return file; + } + static int implicit_commit(THD *thd, bool new_trn); + /** + * Multi Range Read interface + */ + int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param, + uint n_ranges, uint mode, HANDLER_BUFFER *buf); + int multi_range_read_next(char **range_info); + ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, + uint n_ranges, uint *bufsz, + uint *flags, COST_VECT *cost); + ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys, + uint *bufsz, uint *flags, COST_VECT *cost); + + /* Index condition pushdown implementation */ + Item *idx_cond_push(uint keyno, Item* idx_cond); +private: + DsMrr_impl ds_mrr; + friend ICP_RESULT index_cond_func_maria(void *arg); +}; + +#endif /* HA_MARIA_INCLUDED */ diff --git a/storage/maria/lockman.c b/storage/maria/lockman.c new file mode 100644 index 00000000000..d6d4dcd44e6 --- /dev/null +++ b/storage/maria/lockman.c @@ -0,0 +1,786 @@ +/* QQ: TODO - allocate everything from dynarrays !!! (benchmark) */ +/* QQ: TODO instant duration locks */ +/* QQ: #warning automatically place S instead of LS if possible */ + +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Generic Lock Manager + + Lock manager handles locks on "resources", a resource must be uniquely + identified by a 64-bit number. Lock manager itself does not imply + anything about the nature of a resource - it can be a row, a table, a + database, or just anything. + + Locks belong to "lock owners". A Lock owner is uniquely identified by a + 16-bit number. A function loid2lo must be provided by the application + that takes such a number as an argument and returns a LOCK_OWNER + structure. + + Lock levels are completely defined by three tables. Lock compatibility + matrix specifies which locks can be held at the same time on a resource. + Lock combining matrix specifies what lock level has the same behaviour as + a pair of two locks of given levels. getlock_result matrix simplifies + intention locking and lock escalation for an application, basically it + defines which locks are intention locks and which locks are "loose" + locks. It is only used to provide better diagnostics for the + application, lock manager itself does not differentiate between normal, + intention, and loose locks. + + Internally lock manager is based on a lock-free hash, see lf_hash.c for + details. All locks are stored in a hash, with a resource id as a search + key, so all locks for the same resource will be considered collisions and + will be put in a one (lock-free) linked list. The main lock-handling + logic is in the inner loop that searches for a lock in such a linked + list - lockfind(). + + This works as follows. 
Locks generally are added to the end of the list + (with one exception, see below). When scanning the list it is always + possible to determine what locks are granted (active) and what locks are + waiting - first lock is obviously active, the second is active if it's + compatible with the first, and so on, a lock is active if it's compatible + with all previous locks and all locks before it are also active. + To calculate the "compatible with all previous locks" all locks are + accumulated in prev_lock variable using lock_combining_matrix. + + Lock upgrades: when a thread that has a lock on a given resource, + requests a new lock on the same resource and the old lock is not enough + to satisfy new lock requirements (which is defined by + lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock is + placed in the list. Depending on other locks it is immediately active or + it will wait for other locks. Here's an exception to "locks are added + to the end" rule - upgraded locks are added after the last active lock + but before all waiting locks. Old lock (the one we upgraded from) is + not removed from the list, indeed it may be needed if the new lock was + in a savepoint that gets rolled back. So old lock is marked as "ignored" + (IGNORE_ME flag). New lock gets an UPGRADED flag. + + Loose locks add an important exception to the above. Loose locks do not + always commute with other locks. In the list IX-LS both locks are active, + while in the LS-IX list only the first lock is active. This creates a + problem in lock upgrades. If the list was IX-LS and the owner of the + first lock wants to place LS lock (which can be immediately granted), the + IX lock is upgraded to LSIX and the list becomes IX-LS-LSIX, which, + according to the lock compatibility matrix means that the last lock is + waiting - of course it all happened because IX and LS were swapped and + they don't commute. 
To work around this there's ACTIVE flag which is set + in every lock that never waited (was placed active), and this flag + overrides "compatible with all previous locks" rule. + + When a lock is placed to the end of the list it's either compatible with + all locks and all locks are active - new lock becomes active at once, or + it conflicts with some of the locks, in this case in the 'blocker' + variable a conflicting lock is returned and the calling thread waits on a + pthread condition in the LOCK_OWNER structure of the owner of the + conflicting lock. Or a new lock is compatible with all locks, but some + existing locks are not compatible with each other (example: request IS, + when the list is S-IX) - that is not all locks are active. In this case a + first waiting lock is returned in the 'blocker' variable, lockman_getlock() + notices that a "blocker" does not conflict with the requested lock, and + "dereferences" it, to find the lock that it's waiting on. The calling + thread than begins to wait on the same lock. + + To better support table-row relations where one needs to lock the table + with an intention lock before locking the row, extended diagnostics is + provided. When an intention lock (presumably on a table) is granted, + lockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row, + perhaps the thread already has a normal lock on this table), + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual), + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check + whether it's possible to lock the row, but no need to lock it - perhaps + the thread has a loose lock on this table). This is defined by + getlock_result[] table. +*/ + +#include <my_global.h> +#include <my_sys.h> +#include <my_bit.h> +#include <lf.h> +#include "lockman.h" + +/* + Lock compatibility matrix. + + It's asymmetric. Read it as "Somebody has the lock <value in the row + label>, can I set the lock <value in the column label> ?" 
+ + ') Though you can take LS lock while somebody has S lock, it makes no + sense - it's simpler to take S lock too. + + 1 - compatible + 0 - incompatible + -1 - "impossible", so that we can assert the impossibility. +*/ +static int lock_compatibility_matrix[10][10]= +{ /* N S X IS IX SIX LS LX SLX LSIX */ + { -1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */ + { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* S */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* X */ + { -1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, /* IS */ + { -1, 0, 0, 1, 1, 0, 1, 1, 0, 1 }, /* IX */ + { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 }, /* SIX */ + { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* LS */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* LX */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* SLX */ + { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 } /* LSIX */ +}; + +/* + Lock combining matrix. + + It's symmetric. Read it as "what lock level L is identical to the + set of two locks A and B" + + One should never get N from it, we assert the impossibility +*/ +static enum lockman_lock_type lock_combining_matrix[10][10]= +{/* N S X IS IX SIX LS LX SLX LSIX */ + { N, S, X, IS, IX, SIX, S, SLX, SLX, SIX}, /* N */ + { S, S, X, S, SIX, SIX, S, SLX, SLX, SIX}, /* S */ + { X, X, X, X, X, X, X, X, X, X}, /* X */ + { IS, S, X, IS, IX, SIX, LS, LX, SLX, LSIX}, /* IS */ + { IX, SIX, X, IX, IX, SIX, LSIX, LX, SLX, LSIX}, /* IX */ + { SIX, SIX, X, SIX, SIX, SIX, SIX, SLX, SLX, SIX}, /* SIX */ + { LS, S, X, LS, LSIX, SIX, LS, LX, SLX, LSIX}, /* LS */ + { LX, SLX, X, LX, LX, SLX, LX, LX, SLX, LX}, /* LX */ + { SLX, SLX, X, SLX, SLX, SLX, SLX, SLX, SLX, SLX}, /* SLX */ + { LSIX, SIX, X, LSIX, LSIX, SIX, LSIX, LX, SLX, LSIX} /* LSIX */ +}; + +#define REPEAT_ONCE_MORE 0 +#define OK_TO_PLACE_THE_LOCK 1 +#define OK_TO_PLACE_THE_REQUEST 2 +#define ALREADY_HAVE_THE_LOCK 4 +#define ALREADY_HAVE_THE_REQUEST 8 +#define PLACE_NEW_DISABLE_OLD 16 +#define REQUEST_NEW_DISABLE_OLD 32 +#define RESOURCE_WAS_UNLOCKED 64 + +#define NEED_TO_WAIT (OK_TO_PLACE_THE_REQUEST | ALREADY_HAVE_THE_REQUEST |\ + 
REQUEST_NEW_DISABLE_OLD) +#define ALREADY_HAVE (ALREADY_HAVE_THE_LOCK | ALREADY_HAVE_THE_REQUEST) +#define LOCK_UPGRADE (PLACE_NEW_DISABLE_OLD | REQUEST_NEW_DISABLE_OLD) + + +/* + the return codes for lockman_getlock + + It's asymmetric. Read it as "I have the lock <value in the row label>, + what value should be returned for <value in the column label> ?" + + 0 means impossible combination (assert!) + + Defines below help to preserve the table structure. + I/L/A values are self explanatory + x means the combination is possible (assert should not crash) + but it cannot happen in row locks, only in table locks (S,X), + or lock escalations (LS,LX) +*/ +#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE +#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE +#define A GOT_THE_LOCK +#define x GOT_THE_LOCK +static enum lockman_getlock_result getlock_result[10][10]= +{/* N S X IS IX SIX LS LX SLX LSIX */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* N */ + { 0, x, 0, A, 0, 0, x, 0, 0, 0}, /* S */ + { 0, x, x, A, A, 0, x, x, 0, 0}, /* X */ + { 0, 0, 0, I, 0, 0, 0, 0, 0, 0}, /* IS */ + { 0, 0, 0, I, I, 0, 0, 0, 0, 0}, /* IX */ + { 0, x, 0, A, I, 0, x, 0, 0, 0}, /* SIX */ + { 0, 0, 0, L, 0, 0, x, 0, 0, 0}, /* LS */ + { 0, 0, 0, L, L, 0, x, x, 0, 0}, /* LX */ + { 0, x, 0, A, L, 0, x, x, 0, 0}, /* SLX */ + { 0, 0, 0, L, I, 0, x, 0, 0, 0} /* LSIX */ +}; +#undef I +#undef L +#undef A +#undef x + +LF_REQUIRE_PINS(4) + +typedef struct lockman_lock { + uint64 resource; + struct lockman_lock *lonext; + intptr volatile link; + uint32 hashnr; + /* QQ: TODO - remove hashnr from LOCK */ + uint16 loid; + uchar lock; /* sizeof(uchar) <= sizeof(enum) */ + uchar flags; +} LOCK; + +#define IGNORE_ME 1 +#define UPGRADED 2 +#define ACTIVE 4 + +typedef struct { + intptr volatile *prev; + LOCK *curr, *next; + LOCK *blocker, *upgrade_from; +} CURSOR; + +#define PTR(V) (LOCK *)((V) & (~(intptr)1)) +#define DELETED(V) ((V) & 1) + +/* + NOTE + cursor is positioned in either case + pins[0..3] are used, 
/*
  Scan the lock list that starts at *head for locks on node->resource and
  decide what can be done with the lock described by node.

  The list is walked in increasing (hashnr, resource) order; the scan stops
  at the first live node whose key is greater than the key of node. Nodes
  that carry the deleted mark are unlinked (and freed) on the way.

  RETURN
    ALREADY_HAVE_THE_LOCK     the same owner already holds an active lock
                              that covers the requested one
    ALREADY_HAVE_THE_REQUEST  as above, but that lock is not active
    PLACE_NEW_DISABLE_OLD     upgrade, nothing conflicting was found
    REQUEST_NEW_DISABLE_OLD   upgrade, a conflicting lock was found
    RESOURCE_WAS_UNLOCKED     no conflict and no lock was accumulated
    OK_TO_PLACE_THE_LOCK      no conflict, other locks exist
    OK_TO_PLACE_THE_REQUEST   conflict, only a request can be placed

  NOTE
    cursor is positioned in either case
    pins[0..3] are used, they are NOT removed on return

    pin 0 - cursor->next
    pin 1 - cursor->curr
    pin 2 - the node whose link field cursor->prev points to
    pin 3 - cursor->blocker
*/
static int lockfind(LOCK * volatile *head, LOCK *node,
                    CURSOR *cursor, LF_PINS *pins)
{
  uint32        hashnr, cur_hashnr;
  uint64        resource, cur_resource;
  intptr        cur_link;
  my_bool       cur_active, compatible, upgrading, prev_active;
  enum lockman_lock_type lock, prev_lock, cur_lock;
  uint16        loid, cur_loid;
  int           cur_flags, flags;

  /* local copies of what we are looking for */
  hashnr= node->hashnr;
  resource= node->resource;
  lock= node->lock;
  loid= node->loid;
  flags= node->flags;

retry:
  /* (re)start from the head of the list with a clean state */
  cursor->prev= (intptr *)head;
  prev_lock= N;
  cur_active= TRUE;
  compatible= TRUE;
  upgrading= FALSE;
  cursor->blocker= cursor->upgrade_from= 0;
  _lf_unpin(pins, 3);
  /* pin the first node; repeat until the pinned value is still current */
  do {
    cursor->curr= PTR(*cursor->prev);
    _lf_pin(pins, 1, cursor->curr);
  } while(*cursor->prev != (intptr)cursor->curr && LF_BACKOFF);
  for (;;)
  {
    if (!cursor->curr)
      break;                                    /* end of the list */
    /* pin the successor the same way */
    do {
      cur_link= cursor->curr->link;
      cursor->next= PTR(cur_link);
      _lf_pin(pins, 0, cursor->next);
    } while (cur_link != cursor->curr->link && LF_BACKOFF);
    /* snapshot the fields of curr before re-validating its position */
    cur_hashnr= cursor->curr->hashnr;
    cur_resource= cursor->curr->resource;
    cur_lock= cursor->curr->lock;
    cur_loid= cursor->curr->loid;
    cur_flags= cursor->curr->flags;
    /* the predecessor no longer points at curr: the list changed, restart */
    if (*cursor->prev != (intptr)cursor->curr)
    {
      (void)LF_BACKOFF;
      goto retry;
    }
    if (!DELETED(cur_link))
    {
      if (cur_hashnr > hashnr ||
          (cur_hashnr == hashnr && cur_resource >= resource))
      {
        /* key of curr is greater than ours: we are past our resource */
        if (cur_hashnr > hashnr || cur_resource > resource)
          break;
        /* ok, we have a lock for this resource */
        DBUG_ASSERT(lock_compatibility_matrix[prev_lock][cur_lock] >= 0);
        DBUG_ASSERT(lock_compatibility_matrix[cur_lock][lock] >= 0);
        if ((cur_flags & IGNORE_ME) && ! (flags & IGNORE_ME))
        {
          /*
            curr is an ignored lock and we are not: it takes no part in the
            compatibility calculation, it is only remembered as the
            upgrade source when it belongs to the same owner
          */
          DBUG_ASSERT(cur_active);
          if (cur_loid == loid)
            cursor->upgrade_from= cursor->curr;
        }
        else
        {
          prev_active= cur_active;
          /*
            a lock flagged ACTIVE keeps the list active as is, otherwise
            curr must be compatible with everything accumulated so far
          */
          if (cur_flags & ACTIVE)
            DBUG_ASSERT(prev_active == TRUE);
          else
            cur_active&= lock_compatibility_matrix[prev_lock][cur_lock];
          /* when upgrading, stop in front of the first non-active lock */
          if (upgrading && !cur_active /*&& !(cur_flags & UPGRADED)*/)
            break;
          /* first non-active lock in the list: remember and pin it */
          if (prev_active && !cur_active)
          {
            cursor->blocker= cursor->curr;
            _lf_pin(pins, 3, cursor->curr);
          }
          if (cur_loid == loid)
          {
            /* we already have a lock on this resource */
            DBUG_ASSERT(lock_combining_matrix[cur_lock][lock] != N);
            DBUG_ASSERT(!upgrading || (flags & IGNORE_ME));
            if (lock_combining_matrix[cur_lock][lock] == cur_lock)
            {
              /* new lock is compatible */
              if (cur_active)
              {
                cursor->blocker= cursor->curr;  /* loose-locks! */
                _lf_unpin(pins, 3);             /* loose-locks! */
                return ALREADY_HAVE_THE_LOCK;
              }
              else
                return ALREADY_HAVE_THE_REQUEST;
            }
            /* not compatible, upgrading */
            upgrading= TRUE;
            cursor->upgrade_from= cursor->curr;
          }
          else
          {
            /* lock of another owner: does it conflict with ours ? */
            if (!lock_compatibility_matrix[cur_lock][lock])
            {
              compatible= FALSE;
              cursor->blocker= cursor->curr;
              _lf_pin(pins, 3, cursor->curr);
            }
          }
          /* accumulate curr into the combined level of all seen locks */
          prev_lock= lock_combining_matrix[prev_lock][cur_lock];
          DBUG_ASSERT(prev_lock != N);
        }
      }
      /* step forward: curr becomes the predecessor */
      cursor->prev= &(cursor->curr->link);
      _lf_pin(pins, 2, cursor->curr);
    }
    else
    {
      /* curr carries the deleted mark: unlink it, restart if that fails */
      if (my_atomic_casptr((void **)cursor->prev,
                           (void **)(char*) &cursor->curr, cursor->next))
        _lf_alloc_free(pins, cursor->curr);
      else
      {
        (void)LF_BACKOFF;
        goto retry;
      }
    }
    cursor->curr= cursor->next;
    _lf_pin(pins, 1, cursor->curr);
  }
  /*
    either the end of lock list - no more locks for this resource,
    or upgrading and the end of active lock list
  */
  if (upgrading)
  {
    if (compatible /*&& prev_active*/)
      return PLACE_NEW_DISABLE_OLD;
    else
      return REQUEST_NEW_DISABLE_OLD;
  }
  if (cur_active && compatible)
  {
    /*
      either no locks for this resource or all are compatible.
      ok to place the lock in any case.
    */
    return prev_lock == N ? RESOURCE_WAS_UNLOCKED
                          : OK_TO_PLACE_THE_LOCK;
  }
  /* we have a lock conflict. ok to place a lock request. And wait */
  return OK_TO_PLACE_THE_REQUEST;
}
/*
  Insert node into the lock list that starts at *head, at the position
  found by lockfind(). Nothing is inserted when lockfind() reports that the
  owner already has the lock.

  RETURN
    the result code of the last lockfind() call; *blocker is set to the
    blocker found by it (may be 0)

  NOTE
    it uses pins[0..3], on return pins 0..2 are removed, pin 3 (blocker) stays
*/
static int lockinsert(LOCK * volatile *head, LOCK *node, LF_PINS *pins,
                      LOCK **blocker)
{
  CURSOR         cursor;
  int            res;

  do
  {
    res= lockfind(head, node, &cursor, pins);
    DBUG_ASSERT(res != ALREADY_HAVE_THE_REQUEST);
    if (!(res & ALREADY_HAVE))
    {
      if (res & LOCK_UPGRADE)
      {
        /* the new node gets the combined level of the old and new lock */
        node->flags|= UPGRADED;
        node->lock= lock_combining_matrix[cursor.upgrade_from->lock][node->lock];
      }
      /* no waiting needed: the lock is placed active */
      if (!(res & NEED_TO_WAIT))
        node->flags|= ACTIVE;
      /* link node in front of cursor.curr */
      node->link= (intptr)cursor.curr;
      DBUG_ASSERT(node->link != (intptr)node);
      DBUG_ASSERT(cursor.prev != &node->link);
      if (!my_atomic_casptr((void **)cursor.prev,
                            (void **)(char*) &cursor.curr, node))
      {
        /* the list changed under us: undo ACTIVE and search again */
        res= REPEAT_ONCE_MORE;
        node->flags&= ~ACTIVE;
      }
      /* only reached with an upgrade code when the insert succeeded */
      if (res & LOCK_UPGRADE)
        cursor.upgrade_from->flags|= IGNORE_ME;
      /*
        QQ: is this OK ? if a reader has already read upgrade_from,
        it may find it conflicting with node :(
          - see the last test from test_lockman_simple()
      */
    }

  } while (res == REPEAT_ONCE_MORE);
  _lf_unpin(pins, 0);
  _lf_unpin(pins, 1);
  _lf_unpin(pins, 2);
  /*
    note that blocker is not necessarily pinned here (when it's == curr).
    this is ok as in such a case it's either a dummy node for
    initialize_bucket() and dummy nodes don't need pinning,
    or it's a lock of the same transaction for lockman_getlock,
    and it cannot be removed by another thread
  */
  *blocker= cursor.blocker;
  return res;
}

/*
  Run lockfind() for node without changing the list.

  RETURN
    the result code of lockfind(); if blocker is not 0, *blocker is set to
    the blocker found (may be 0)

  NOTE
    it uses pins[0..3], on return pins 0..2 are removed, pin 3 (blocker) stays
*/
static int lockpeek(LOCK * volatile *head, LOCK *node, LF_PINS *pins,
                    LOCK **blocker)
{
  CURSOR         cursor;
  int            res;

  res= lockfind(head, node, &cursor, pins);

  _lf_unpin(pins, 0);
  _lf_unpin(pins, 1);
  _lf_unpin(pins, 2);
  if (blocker)
    *blocker= cursor.blocker;
  return res;
}

/*
  Remove the lock found for node from the list that starts at *head: first
  the deleted mark is set in its link field, then the node is unlinked.

  NOTE
    it uses pins[0..3], on return all pins are removed.

    One _must_ have the lock (or request) to call this
*/
static int lockdelete(LOCK * volatile *head, LOCK *node, LF_PINS *pins)
{
  CURSOR         cursor;
  int            res;

  do
  {
    res= lockfind(head, node, &cursor, pins);
    DBUG_ASSERT(res & ALREADY_HAVE);

    /* the lock we were upgraded from takes part in the list again */
    if (cursor.upgrade_from)
      cursor.upgrade_from->flags&= ~IGNORE_ME;

    /*
      XXX this does not work with savepoints, as old lock is left ignored.
      It cannot be unignored, as would basically mean moving the lock back
      in the lock chain (from upgraded). And the latter is not allowed -
      because it breaks list scanning. So old ignored lock must be deleted,
      new - same - lock must be installed right after the lock we're deleting,
      then we can delete. Good news is - this is only required when rolling
      back a savepoint.
    */
    /* step 1: set the deleted mark (lowest bit of the link) */
    if (my_atomic_casptr((void **)(char*)&(cursor.curr->link),
                         (void **)(char*)&cursor.next, 1+(char *)cursor.next))
    {
      /*
        step 2: unlink the node; if that fails, another lockfind() pass
        removes the marked node
      */
      if (my_atomic_casptr((void **)cursor.prev,
                           (void **)(char*)&cursor.curr, cursor.next))
        _lf_alloc_free(pins, cursor.curr);
      else
        lockfind(head, node, &cursor, pins);
    }
    else
    {
      /* marking failed: restore IGNORE_ME and start over */
      res= REPEAT_ONCE_MORE;
      if (cursor.upgrade_from)
        cursor.upgrade_from->flags|= IGNORE_ME;
    }
  } while (res == REPEAT_ONCE_MORE);
  _lf_unpin(pins, 0);
  _lf_unpin(pins, 1);
  _lf_unpin(pins, 2);
  _lf_unpin(pins, 3);
  return res;
}

/*
  Initialize a lock manager: the LOCK allocator, the bucket array, an
  initial size of 1 with no elements, the loid-to-owner callback and the
  lock timeout.
*/
void lockman_init(LOCKMAN *lm, loid_to_lo_func *func, uint timeout)
{
  lf_alloc_init(&lm->alloc, sizeof(LOCK), offsetof(LOCK, lonext));
  lf_dynarray_init(&lm->array, sizeof(LOCK **));
  lm->size= 1;
  lm->count= 0;
  lm->loid_to_lo= func;
  lm->lock_timeout= timeout;
}

/*
  Free every node reachable from bucket 0, then the allocator and the
  bucket array.
*/
void lockman_destroy(LOCKMAN *lm)
{
  LOCK *el= *(LOCK **)_lf_dynarray_lvalue(&lm->array, 0);
  while (el)
  {
    intptr next= el->link;
    /*
      nodes with an odd hashnr come from the lock allocator, the others
      are the dummy nodes that initialize_bucket() got from my_malloc()
    */
    if (el->hashnr & 1)
      lf_alloc_direct_free(&lm->alloc, el);
    else
      my_free((void *)el, MYF(0));
    el= (LOCK *)next;
  }
  lf_alloc_destroy(&lm->alloc);
  lf_dynarray_destroy(&lm->array);
}

/* TODO: optimize it */
#define MAX_LOAD 1
&tmp, dummy); +} + +static inline uint calc_hash(uint64 resource) +{ + const uchar *pos= (uchar *)&resource; + ulong nr1= 1, nr2= 4, i; + for (i= 0; i < sizeof(resource) ; i++, pos++) + { + nr1^= (ulong) ((((uint) nr1 & 63)+nr2) * ((uint)*pos)) + (nr1 << 8); + nr2+= 3; + } + return nr1 & INT_MAX32; +} + +/* + RETURN + see enum lockman_getlock_result + NOTE + uses pins[0..3], they're removed on return +*/ +enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo, + uint64 resource, + enum lockman_lock_type lock) +{ + int res; + uint csize, bucket, hashnr; + LOCK *node, * volatile *el, *blocker; + LF_PINS *pins= lo->pins; + enum lockman_lock_type old_lock; + + DBUG_ASSERT(lo->loid); + lf_rwlock_by_pins(pins); + node= (LOCK *)_lf_alloc_new(pins); + node->flags= 0; + node->lock= lock; + node->loid= lo->loid; + node->resource= resource; + hashnr= calc_hash(resource); + bucket= hashnr % lm->size; + el= _lf_dynarray_lvalue(&lm->array, bucket); + if (*el == NULL) + initialize_bucket(lm, el, bucket, pins); + node->hashnr= my_reverse_bits(hashnr) | 1; + res= lockinsert(el, node, pins, &blocker); + if (res & ALREADY_HAVE) + { + int r; + old_lock= blocker->lock; + _lf_alloc_free(pins, node); + lf_rwunlock_by_pins(pins); + r= getlock_result[old_lock][lock]; + DBUG_ASSERT(r); + return r; + } + /* a new value was added to the hash */ + csize= lm->size; + if ((my_atomic_add32(&lm->count, 1)+1.0) / csize > MAX_LOAD) + my_atomic_cas32(&lm->size, (int*) &csize, csize*2); + node->lonext= lo->all_locks; + lo->all_locks= node; + for ( ; res & NEED_TO_WAIT; res= lockpeek(el, node, pins, &blocker)) + { + LOCK_OWNER *wait_for_lo; + ulonglong deadline; + struct timespec timeout; + + _lf_assert_pin(pins, 3); /* blocker must be pinned here */ + wait_for_lo= lm->loid_to_lo(blocker->loid); + + /* + now, this is tricky. blocker is not necessarily a LOCK + we're waiting for. 
If it's compatible with what we want, + then we're waiting for a lock that blocker is waiting for + (see two places where blocker is set in lockfind) + In the latter case, let's "dereference" it + */ + if (lock_compatibility_matrix[blocker->lock][lock]) + { + blocker= wait_for_lo->all_locks; + _lf_pin(pins, 3, blocker); + if (blocker != wait_for_lo->all_locks) + continue; + wait_for_lo= wait_for_lo->waiting_for; + } + + /* + note that the blocker transaction may have ended by now, + its LOCK_OWNER and short id were reused, so 'wait_for_lo' may point + to an unrelated - albeit valid - LOCK_OWNER + */ + if (!wait_for_lo) + continue; + + lo->waiting_for= wait_for_lo; + lf_rwunlock_by_pins(pins); + + /* + We lock a mutex - it may belong to a wrong LOCK_OWNER, but it must + belong to _some_ LOCK_OWNER. It means, we can never free() a LOCK_OWNER, + if there're other active LOCK_OWNERs. + */ + /* QQ: race condition here */ + pthread_mutex_lock(wait_for_lo->mutex); + if (DELETED(blocker->link)) + { + /* + blocker transaction was ended, or a savepoint that owned + the lock was rolled back. Either way - the lock was removed + */ + pthread_mutex_unlock(wait_for_lo->mutex); + lf_rwlock_by_pins(pins); + continue; + } + + /* yuck. waiting */ + deadline= my_getsystime() + lm->lock_timeout * 10000; + set_timespec_nsec(timeout,lm->lock_timeout * 1000000); + do + { + pthread_cond_timedwait(wait_for_lo->cond, wait_for_lo->mutex, &timeout); + } while (!DELETED(blocker->link) && my_getsystime() < deadline); + pthread_mutex_unlock(wait_for_lo->mutex); + lf_rwlock_by_pins(pins); + if (!DELETED(blocker->link)) + { + /* + timeout. + note that we _don't_ release the lock request here. 
+ Instead we're relying on the caller to abort the transaction, + and release all locks at once - see lockman_release_locks() + */ + _lf_unpin(pins, 3); + lf_rwunlock_by_pins(pins); + return DIDNT_GET_THE_LOCK; + } + } + lo->waiting_for= 0; + _lf_assert_unpin(pins, 3); /* unpin should not be needed */ + lf_rwunlock_by_pins(pins); + return getlock_result[lock][lock]; +} + +/* + RETURN + 0 - deleted + 1 - didn't (not found) + NOTE + see lockdelete() for pin usage notes +*/ +int lockman_release_locks(LOCKMAN *lm, LOCK_OWNER *lo) +{ + LOCK * volatile *el, *node, *next; + uint bucket; + LF_PINS *pins= lo->pins; + + pthread_mutex_lock(lo->mutex); + lf_rwlock_by_pins(pins); + for (node= lo->all_locks; node; node= next) + { + next= node->lonext; + bucket= calc_hash(node->resource) % lm->size; + el= _lf_dynarray_lvalue(&lm->array, bucket); + if (*el == NULL) + initialize_bucket(lm, el, bucket, pins); + lockdelete(el, node, pins); + my_atomic_add32(&lm->count, -1); + } + lf_rwunlock_by_pins(pins); + lo->all_locks= 0; + /* now signal all waiters */ + pthread_cond_broadcast(lo->cond); + pthread_mutex_unlock(lo->mutex); + return 0; +} + +#ifdef MY_LF_EXTRA_DEBUG +static const char *lock2str[]= +{ "N", "S", "X", "IS", "IX", "SIX", "LS", "LX", "SLX", "LSIX" }; +/* + NOTE + the function below is NOT thread-safe !!! 
+*/ +void print_lockhash(LOCKMAN *lm) +{ + LOCK *el= *(LOCK **)_lf_dynarray_lvalue(&lm->array, 0); + printf("hash: size %u count %u\n", lm->size, lm->count); + while (el) + { + intptr next= el->link; + if (el->hashnr & 1) + { + printf("0x%08lx { resource %lu, loid %u, lock %s", + (long) el->hashnr, (ulong) el->resource, el->loid, + lock2str[el->lock]); + if (el->flags & IGNORE_ME) printf(" IGNORE_ME"); + if (el->flags & UPGRADED) printf(" UPGRADED"); + if (el->flags & ACTIVE) printf(" ACTIVE"); + if (DELETED(next)) printf(" ***DELETED***"); + printf("}\n"); + } + else + { + /*printf("0x%08x { dummy }\n", el->hashnr);*/ + DBUG_ASSERT(el->resource == 0 && el->loid == 0 && el->lock == X); + } + el= PTR(next); + } +} +#endif diff --git a/storage/maria/lockman.h b/storage/maria/lockman.h new file mode 100644 index 00000000000..82ab483896f --- /dev/null +++ b/storage/maria/lockman.h @@ -0,0 +1,76 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _lockman_h +#define _lockman_h + +/* + Lock levels: + ^^^^^^^^^^^ + + N - "no lock", not a lock, used sometimes internally to simplify the code + S - Shared + X - eXclusive + IS - Intention Shared + IX - Intention eXclusive + SIX - Shared + Intention eXclusive + LS - Loose Shared + LX - Loose eXclusive + SLX - Shared + Loose eXclusive + LSIX - Loose Shared + Intention eXclusive +*/ +enum lockman_lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST }; + +struct lockman_lock; + +typedef struct st_lock_owner LOCK_OWNER; +struct st_lock_owner { + LF_PINS *pins; /* must be allocated from lockman's pinbox */ + struct lockman_lock *all_locks; /* a LIFO */ + LOCK_OWNER *waiting_for; + pthread_cond_t *cond; /* transactions waiting for this, wait on 'cond' */ + pthread_mutex_t *mutex; /* mutex is required to use 'cond' */ + uint16 loid; +}; + +typedef LOCK_OWNER *loid_to_lo_func(uint16); +typedef struct { + LF_DYNARRAY array; /* hash itself */ + LF_ALLOCATOR alloc; /* allocator for elements */ + int32 volatile size; /* size of array */ + int32 volatile count; /* number of elements in the hash */ + uint lock_timeout; + loid_to_lo_func *loid_to_lo; +} LOCKMAN; +#define DIDNT_GET_THE_LOCK 0 +enum lockman_getlock_result { + NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT, + GOT_THE_LOCK, + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE, + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE +}; + +void lockman_init(LOCKMAN *, loid_to_lo_func *, uint); +void lockman_destroy(LOCKMAN *); +enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo, + uint64 resource, + enum lockman_lock_type lock); +int lockman_release_locks(LOCKMAN *, LOCK_OWNER *); + +#ifdef EXTRA_DEBUG +void print_lockhash(LOCKMAN *lm); +#endif + +#endif diff --git 
a/storage/maria/ma_bitmap.c b/storage/maria/ma_bitmap.c new file mode 100644 index 00000000000..c0763b0612d --- /dev/null +++ b/storage/maria/ma_bitmap.c @@ -0,0 +1,2910 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Bitmap handling (for records in block) + + The data file starts with a bitmap page, followed by as many data + pages as the bitmap can cover. After this there is a new bitmap page + and more data pages etc. + + The bitmap code assumes there is always an active bitmap page and thus + that there is at least one bitmap page in the file + + Structure of bitmap page: + + Fixed size records (to be implemented later): + + 2 bits are used to indicate: + + 0 Empty + 1 0-75 % full (at least room for 2 records) + 2 75-100 % full (at least room for one record) + 3 100 % full (no more room for records) + + Assuming 8K pages, this will allow us to map: + 8192 (bytes per page) * 4 (pages mapped per byte) * 8192 (page size)= 256M + + (For Maria this will be 7*4 * 8192 = 224K smaller because of LSN) + + Note that for fixed size rows, we can't add more columns without doing + a full reorganization of the table. The user can always force a dynamic + size row format by specifying ROW_FORMAT=dynamic. 
+ + + Dynamic size records: + + 3 bits are used to indicate Bytes free in 8K page + + 0 Empty page 8176 (head or tail) + 1 0-30 % full (at least room for 3 records) 5724 + 2 30-60 % full (at least room for 2 records) 3271 + 3 60-90 % full (at least room for one record) 818 + 4 100 % full (no more room for records) 0 + 5 Tail page, 0-40 % full 4906 + 6 Tail page, 40-80 % full 1636 + 7 Full tail page or full blob page 0 + + Assuming 8K pages, this will allow us to map: + 8192 (bytes per page) * 8 bits/byte / 3 bits/page * 8192 (page size)= 170.7M + + Note that values 1-3 may be adjusted for each individual table based on + 'min record length'. Tail pages are for overflow data which can be of + any size and thus don't have to be adjusted for different tables. + If we add more columns to the table, some of the originally calculated + 'cut off' points may not be optimal, but they shouldn't be 'drastically + wrong'. + + When allocating data from the bitmap, we are trying to do it in a + 'best fit' manner. Blobs and varchar blocks are given out in large + continuous extents to allow fast access to these. Before allowing a + row to 'flow over' to other blocks, we will compact the page and use + all space on it. If there are many rows in the page, we will ensure + there are *LEFT_TO_GROW_ON_SPLIT* bytes left on the page to allow other + rows to grow. + + The bitmap format allows us to extend the row file in big chunks, if needed. + + When calculating the size for a packed row, we will calculate the following + things separately: + - Row header + null_bits + empty_bits fixed size segments etc. + - Size of all char/varchar fields + - Size of each blob field + + The bitmap handler will get all the above information and return + either one page or a set of pages to put the different parts. + + Bitmaps are read on demand in response to insert/delete/update operations. 
+ The following bitmap pointers will be cached and stored on disk on close: + - Current insert_bitmap; When inserting new data we will first try to + fill this one. + - First bitmap which is not completely full. This is updated when we + free data with an update or delete. + + While flushing out bitmaps, we will cache the status of the bitmap in memory + to avoid having to read a bitmap for insert of new data that will not + be of any use + - Total empty space + - Largest number of continuous pages + + Bitmap ONLY goes to disk in the following scenarios + - The file is closed (and we flush all changes to disk) + - On checkpoint + (I.e.: When we do a checkpoint, we have to ensure that all bitmaps are + put on disk even if they are not in the page cache). + - When explicitly requested (for example on backup or after recovery, + to simplify things) + + The flow of writing a row is that: + - Lock the bitmap + - Decide which data pages we will write to + - Mark them full in the bitmap page so that other threads do not try to + use the same data pages as us + - We unlock the bitmap + - Write the data pages + - Lock the bitmap + - Correct the bitmap page with the true final occupation of the data + pages (that is, we marked pages full but when we are done we realize + we didn't fill them) + - Unlock the bitmap. 
+*/ + +#include "maria_def.h" +#include "ma_blockrec.h" + +#define FULL_HEAD_PAGE 4 +#define FULL_TAIL_PAGE 7 + +/*#define WRONG_BITMAP_FLUSH 1*/ /*define only for provoking bugs*/ +#undef WRONG_BITMAP_FLUSH + +static my_bool _ma_read_bitmap_page(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page); +static my_bool _ma_bitmap_create_missing(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page); + +/* Write bitmap page to key cache */ + +static inline my_bool write_changed_bitmap(MARIA_SHARE *share, + MARIA_FILE_BITMAP *bitmap) +{ + DBUG_ENTER("write_changed_bitmap"); + DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size); + DBUG_ASSERT(bitmap->file.write_callback != 0); + DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable)); + + /* + Mark that a bitmap page has been written to page cache and we have + to flush it during checkpoint. + */ + bitmap->changed_not_flushed= 1; + + if ((bitmap->non_flushable == 0) +#ifdef WRONG_BITMAP_FLUSH + || 1 +#endif + ) + { + my_bool res= pagecache_write(share->pagecache, + &bitmap->file, bitmap->page, 0, + bitmap->map, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE); + DBUG_RETURN(res); + } + else + { + MARIA_PINNED_PAGE page_link; + int res= pagecache_write(share->pagecache, + &bitmap->file, bitmap->page, 0, + bitmap->map, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE); + page_link.unlock= PAGECACHE_LOCK_LEFT_UNLOCKED; + page_link.changed= 1; + push_dynamic(&bitmap->pinned_pages, (void*) &page_link); + DBUG_RETURN(res); + } +} + +/* + Initialize bitmap variables in share + + SYNOPSIS + _ma_bitmap_init() + share Share handler + file data file handler + + NOTES + This is called the first time a file is opened. 
+ + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_bitmap_init(MARIA_SHARE *share, File file) +{ + uint aligned_bit_blocks; + uint max_page_size; + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + uint size= share->block_size; +#ifndef DBUG_OFF + /* We want to have a copy of the bitmap to be able to print differences */ + size*= 2; +#endif + + if (((bitmap->map= (uchar*) my_malloc(size, MYF(MY_WME))) == NULL) || + my_init_dynamic_array(&bitmap->pinned_pages, + sizeof(MARIA_PINNED_PAGE), 1, 1)) + return 1; + + bitmap->block_size= share->block_size; + bitmap->file.file= file; + _ma_bitmap_set_pagecache_callbacks(&bitmap->file, share); + + /* Size needs to be aligned on 6 */ + aligned_bit_blocks= (share->block_size - PAGE_SUFFIX_SIZE) / 6; + bitmap->total_size= aligned_bit_blocks * 6; + /* + In each 6 bytes, we have 6*8/3 = 16 pages covered + The +1 is to add the bitmap page, as this doesn't have to be covered + */ + bitmap->pages_covered= aligned_bit_blocks * 16 + 1; + bitmap->flush_all_requested= 0; + bitmap->non_flushable= 0; + + /* Update size for bits */ + /* TODO; Make this dependent of the row size */ + max_page_size= share->block_size - PAGE_OVERHEAD_SIZE + DIR_ENTRY_SIZE; + bitmap->sizes[0]= max_page_size; /* Empty page */ + bitmap->sizes[1]= max_page_size - max_page_size * 30 / 100; + bitmap->sizes[2]= max_page_size - max_page_size * 60 / 100; + bitmap->sizes[3]= max_page_size - max_page_size * 90 / 100; + bitmap->sizes[4]= 0; /* Full page */ + bitmap->sizes[5]= max_page_size - max_page_size * 40 / 100; + bitmap->sizes[6]= max_page_size - max_page_size * 80 / 100; + bitmap->sizes[7]= 0; + + pthread_mutex_init(&share->bitmap.bitmap_lock, MY_MUTEX_INIT_SLOW); + pthread_cond_init(&share->bitmap.bitmap_cond, 0); + + _ma_bitmap_reset_cache(share); + + if (share->state.first_bitmap_with_space == ~(pgcache_page_no_t) 0) + { + /* Start scanning for free space from start of file */ + share->state.first_bitmap_with_space = 0; + } + return 0; +} + + +/* + Free data allocated 
by _ma_bitmap_init + + SYNOPSIS + _ma_bitmap_end() + share Share handler +*/ + +my_bool _ma_bitmap_end(MARIA_SHARE *share) +{ + my_bool res= _ma_bitmap_flush(share); + safe_mutex_assert_owner(&share->close_lock); + pthread_mutex_destroy(&share->bitmap.bitmap_lock); + pthread_cond_destroy(&share->bitmap.bitmap_cond); + delete_dynamic(&share->bitmap.pinned_pages); + my_free(share->bitmap.map, MYF(MY_ALLOW_ZERO_PTR)); + share->bitmap.map= 0; + return res; +} + + +/* + Send updated bitmap to the page cache + + SYNOPSIS + _ma_bitmap_flush() + share Share handler + + NOTES + In the future, _ma_bitmap_flush() will be called to flush changes done + by this thread (i.e., checking the changed flag is ok). The reason we + check it again in the mutex is that if someone else did a flush at the + same time, we don't have to do the write. + This is also ok for _ma_scan_init_block_record() which does not want to + miss rows: it cares only for committed rows, that is, rows for which there + was a commit before our transaction started; as commit and transaction's + start are protected by the same LOCK_trn_list mutex, we see memory at + least as new as at other transaction's commit time, so if the committed + rows caused bitmap->changed to be true, we see it; if we see 0 it really + means a flush happened since then. So, it's ok to read without bitmap's + mutex. 
+ + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_bitmap_flush(MARIA_SHARE *share) +{ + my_bool res= 0; + DBUG_ENTER("_ma_bitmap_flush"); + if (share->bitmap.changed) + { + pthread_mutex_lock(&share->bitmap.bitmap_lock); + if (share->bitmap.changed) + { + res= write_changed_bitmap(share, &share->bitmap); + share->bitmap.changed= 0; + } + pthread_mutex_unlock(&share->bitmap.bitmap_lock); + } + DBUG_RETURN(res); +} + + +/** + Dirty-page filtering criteria for bitmap pages + + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg pages_covered of bitmap +*/ + +static enum pagecache_flush_filter_result +filter_flush_bitmap_pages(enum pagecache_page_type type + __attribute__ ((unused)), + pgcache_page_no_t pageno, + LSN rec_lsn __attribute__ ((unused)), + void *arg) +{ + return ((pageno % (*(ulong*)arg)) == 0); +} + + +/** + Flushes current bitmap page to the pagecache, and then all bitmap pages + from pagecache to the file. Used by Checkpoint. + + @param share Table's share +*/ + +my_bool _ma_bitmap_flush_all(MARIA_SHARE *share) +{ + my_bool res= 0; + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + DBUG_ENTER("_ma_bitmap_flush_all"); + pthread_mutex_lock(&bitmap->bitmap_lock); + if (bitmap->changed || bitmap->changed_not_flushed) + { + bitmap->flush_all_requested++; +#ifndef WRONG_BITMAP_FLUSH + while (bitmap->non_flushable > 0) + { + DBUG_PRINT("info", ("waiting for bitmap to be flushable")); + pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock); + } +#endif + DBUG_ASSERT(bitmap->flush_all_requested == 1); + /* + Bitmap is in a flushable state: its contents in memory are reflected by + log records (complete REDO-UNDO groups) and all bitmap pages are + unpinned. We keep the mutex to preserve this situation, and flush to the + file. 
+ */ + if (bitmap->changed) + { + bitmap->changed= FALSE; + res= write_changed_bitmap(share, bitmap); + } + /* + We do NOT use FLUSH_KEEP_LAZY because we must be sure that bitmap + pages have been flushed. That's a condition of correctness of + Recovery: data pages may have been all flushed, if we write the + checkpoint record Recovery will start from after their REDOs. If + bitmap page was not flushed, as the REDOs about it will be skipped, it + will wrongly not be recovered. If bitmap pages had a rec_lsn it would + be different. + There should be no pinned pages as bitmap->non_flushable==0. + */ + if (flush_pagecache_blocks_with_filter(share->pagecache, + &bitmap->file, FLUSH_KEEP, + filter_flush_bitmap_pages, + &bitmap->pages_covered) & + PCFLUSH_PINNED_AND_ERROR) + res= TRUE; + bitmap->changed_not_flushed= FALSE; + bitmap->flush_all_requested--; + /* + Some well-behaved threads may be waiting for flush_all_requested to + become false, wake them up. + */ + DBUG_PRINT("info", ("bitmap flusher waking up others")); + pthread_cond_broadcast(&bitmap->bitmap_cond); + } + pthread_mutex_unlock(&bitmap->bitmap_lock); + DBUG_RETURN(res); +} + + +/** + @brief Lock bitmap from being used by another thread + + @fn _ma_bitmap_lock() + @param share Table's share + + @notes + This is a temporary solution for allowing someone to delete an inserted + duplicate-key row while someone else is doing concurrent inserts. + This is ok for now as duplicate key errors are not that common. + + In the future we will add locks for row-pages to ensure two threads doesn't + work at the same time on the same page. 
+*/ + +void _ma_bitmap_lock(MARIA_SHARE *share) +{ + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + DBUG_ENTER("_ma_bitmap_lock"); + + if (!share->now_transactional) + DBUG_VOID_RETURN; + + pthread_mutex_lock(&bitmap->bitmap_lock); + bitmap->flush_all_requested++; + while (bitmap->non_flushable) + { + DBUG_PRINT("info", ("waiting for bitmap to be flushable")); + pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock); + } + /* + Ensure that _ma_bitmap_flush_all() and _ma_bitmap_lock() are blocked. + ma_bitmap_flushable() is blocked thanks to 'flush_all_requested'. + */ + bitmap->non_flushable= 1; + pthread_mutex_unlock(&bitmap->bitmap_lock); + DBUG_VOID_RETURN; +} + +/** + @brief Unlock bitmap after _ma_bitmap_lock() + + @fn _ma_bitmap_unlock() + @param share Table's share +*/ + +void _ma_bitmap_unlock(MARIA_SHARE *share) +{ + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + DBUG_ENTER("_ma_bitmap_unlock"); + + if (!share->now_transactional) + DBUG_VOID_RETURN; + DBUG_ASSERT(bitmap->flush_all_requested > 0 && bitmap->non_flushable == 1); + + pthread_mutex_lock(&bitmap->bitmap_lock); + bitmap->flush_all_requested--; + bitmap->non_flushable= 0; + pthread_mutex_unlock(&bitmap->bitmap_lock); + pthread_cond_broadcast(&bitmap->bitmap_cond); + DBUG_VOID_RETURN; +} + + +/** + @brief Unpin all pinned bitmap pages + + @param share Table's share + + @return Operation status + @retval 0 ok + + @note This unpins pages pinned by other threads. 
+*/ + +static void _ma_bitmap_unpin_all(MARIA_SHARE *share) +{ + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*) + dynamic_array_ptr(&bitmap->pinned_pages, 0)); + MARIA_PINNED_PAGE *pinned_page= page_link + bitmap->pinned_pages.elements; + DBUG_ENTER("_ma_bitmap_unpin_all"); + DBUG_PRINT("info", ("pinned: %u", bitmap->pinned_pages.elements)); + while (pinned_page-- != page_link) + pagecache_unlock_by_link(share->pagecache, pinned_page->link, + pinned_page->unlock, PAGECACHE_UNPIN, + LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, TRUE, TRUE); + bitmap->pinned_pages.elements= 0; + DBUG_VOID_RETURN; +} + + +/* + Intialize bitmap in memory to a zero bitmap + + SYNOPSIS + _ma_bitmap_delete_all() + share Share handler + + NOTES + This is called on maria_delete_all_rows (truncate data file). +*/ + +void _ma_bitmap_delete_all(MARIA_SHARE *share) +{ + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + DBUG_ENTER("_ma_bitmap_delete_all"); + if (bitmap->map) /* Not in create */ + { + bzero(bitmap->map, bitmap->block_size); + bitmap->changed= 1; + bitmap->page= 0; + bitmap->used_size= bitmap->total_size; + } + DBUG_VOID_RETURN; +} + + +/** + @brief Reset bitmap caches + + @fn _ma_bitmap_reset_cache() + @param share Maria share + + @notes + This is called after we have swapped file descriptors and we want + bitmap to forget all cached information +*/ + +void _ma_bitmap_reset_cache(MARIA_SHARE *share) +{ + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + + if (bitmap->map) /* If using bitmap */ + { + /* Forget changes in current bitmap page */ + bitmap->changed= 0; + + /* + We can't read a page yet, as in some case we don't have an active + page cache yet. + Pretend we have a dummy, full and not changed bitmap page in memory. 
+ */ + bitmap->page= ~(ulonglong) 0; + bitmap->used_size= bitmap->total_size; + bfill(bitmap->map, share->block_size, 255); +#ifndef DBUG_OFF + memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size); +#endif + } +} + + +/* + Return bitmap pattern for the smallest head block that can hold 'size' + + SYNOPSIS + size_to_head_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0-3 For a description of the bitmap sizes, see the header +*/ + +static uint size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size <= bitmap->sizes[3]) + return 3; + if (size <= bitmap->sizes[2]) + return 2; + if (size <= bitmap->sizes[1]) + return 1; + DBUG_ASSERT(size <= bitmap->sizes[0]); + return 0; +} + + +/* + Return bitmap pattern for head block where there is size bytes free + + SYNOPSIS + _ma_free_size_to_head_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0-4 (Possible bitmap patterns for head block) +*/ + +uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size < bitmap->sizes[3]) + return 4; + if (size < bitmap->sizes[2]) + return 3; + if (size < bitmap->sizes[1]) + return 2; + return (size < bitmap->sizes[0]) ? 
1 : 0; +} + + +/* + Return bitmap pattern for the smallest tail block that can hold 'size' + + SYNOPSIS + size_to_tail_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0, 5 or 6 For a description of the bitmap sizes, see the header +*/ + +static uint size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size <= bitmap->sizes[6]) + return 6; + if (size <= bitmap->sizes[5]) + return 5; + DBUG_ASSERT(size <= bitmap->sizes[0]); + return 0; +} + + +/* + Return bitmap pattern for tail block where there is size bytes free + + SYNOPSIS + free_size_to_tail_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0, 5, 6, 7 For a description of the bitmap sizes, see the header +*/ + +static uint free_size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size >= bitmap->sizes[0]) + return 0; /* Revert to empty page */ + if (size < bitmap->sizes[6]) + return 7; + if (size < bitmap->sizes[5]) + return 6; + return 5; +} + + +/* + Return size guranteed to be available on a page + + SYNOPSIS + pattern_to_head_size() + bitmap Bitmap + pattern Pattern (0-7) + + RETURN + 0 - block_size +*/ + +static inline uint pattern_to_size(MARIA_FILE_BITMAP *bitmap, uint pattern) +{ + DBUG_ASSERT(pattern <= 7); + return bitmap->sizes[pattern]; +} + + +/* + Print bitmap for debugging + + SYNOPSIS + _ma_print_bitmap() + bitmap Bitmap to print + + IMPLEMENTATION + Prints all changed bits since last call to _ma_print_bitmap(). + This is done by having a copy of the last bitmap in + bitmap->map+bitmap->block_size. 
+*/ + +#ifndef DBUG_OFF + +const char *bits_to_txt[]= +{ + "empty", "00-30% full", "30-60% full", "60-90% full", "full", + "tail 00-40 % full", "tail 40-80 % full", "tail/blob full" +}; + +static void _ma_print_bitmap_changes(MARIA_FILE_BITMAP *bitmap) +{ + uchar *pos, *end, *org_pos; + ulong page; + DBUG_ENTER("_ma_print_bitmap_changes"); + + end= bitmap->map + bitmap->used_size; + DBUG_LOCK_FILE; + fprintf(DBUG_FILE,"\nBitmap page changes at page: %lu bitmap: 0x%lx\n", + (ulong) bitmap->page, (long) bitmap->map); + + page= (ulong) bitmap->page+1; + for (pos= bitmap->map, org_pos= bitmap->map + bitmap->block_size ; + pos < end ; + pos+= 6, org_pos+= 6) + { + ulonglong bits= uint6korr(pos); /* 6 bytes = 6*8/3= 16 patterns */ + ulonglong org_bits= uint6korr(org_pos); + uint i; + + /* + Test if there is any changes in the next 16 bitmaps (to not have to + loop through all bits if we know they are the same) + */ + if (bits != org_bits) + { + for (i= 0; i < 16 ; i++, bits>>= 3, org_bits>>= 3) + { + if ((bits & 7) != (org_bits & 7)) + fprintf(DBUG_FILE, "Page: %8lu %s -> %s\n", page+i, + bits_to_txt[org_bits & 7], bits_to_txt[bits & 7]); + } + } + page+= 16; + } + fputc('\n', DBUG_FILE); + DBUG_UNLOCK_FILE; + memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size); + DBUG_VOID_RETURN; +} + + +/* Print content of bitmap for debugging */ + +void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data, + pgcache_page_no_t page) +{ + uchar *pos, *end; + char llbuff[22]; + + end= bitmap->map + bitmap->used_size; + DBUG_LOCK_FILE; + fprintf(DBUG_FILE,"\nDump of bitmap page at %s\n", llstr(page, llbuff)); + + page++; /* Skip bitmap page */ + for (pos= data, end= pos + bitmap->total_size; + pos < end ; + pos+= 6) + { + ulonglong bits= uint6korr(pos); /* 6 bytes = 6*8/3= 16 patterns */ + + /* + Test if there is any changes in the next 16 bitmaps (to not have to + loop through all bits if we know they are the same) + */ + if (bits) + { + uint i; + for (i= 0; i < 
16 ; i++, bits>>= 3) + { + if (bits & 7) + fprintf(DBUG_FILE, "Page: %8s %s\n", llstr(page+i, llbuff), + bits_to_txt[bits & 7]); + } + } + page+= 16; + } + fputc('\n', DBUG_FILE); + DBUG_UNLOCK_FILE; +} + +#endif /* DBUG_OFF */ + + +/*************************************************************************** + Reading & writing bitmap pages +***************************************************************************/ + +/* + Read a given bitmap page + + SYNOPSIS + _ma_read_bitmap_page() + info Maria handler + bitmap Bitmap handler + page Page to read + + TODO + Update 'bitmap->used_size' to real size of used bitmap + + NOTE + We don't always have share->bitmap.bitmap_lock here + (when called from_ma_check_bitmap_data() for example). + + RETURN + 0 ok + 1 error (Error writing old bitmap or reading bitmap page) +*/ + +static my_bool _ma_read_bitmap_page(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page) +{ + MARIA_SHARE *share= info->s; + my_bool res; + DBUG_ENTER("_ma_read_bitmap_page"); + DBUG_ASSERT(page % bitmap->pages_covered == 0); + DBUG_ASSERT(!bitmap->changed); + + bitmap->page= page; + if (((page + 1) * bitmap->block_size) > share->state.state.data_file_length) + { + /* Inexistent or half-created page */ + res= _ma_bitmap_create_missing(info, bitmap, page); + DBUG_RETURN(res); + } + bitmap->used_size= bitmap->total_size; + DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size); + res= pagecache_read(share->pagecache, + &bitmap->file, page, 0, + bitmap->map, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == NULL; + + /* + We can't check maria_bitmap_marker here as if the bitmap page + previously had a true checksum and the user switched mode to not checksum + this may have any value, except maria_normal_page_marker. + + Using maria_normal_page_marker gives us a protection against bugs + when running without any checksums. 
+ */ + +#ifndef DBUG_OFF + if (!res) + memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size); +#endif + DBUG_RETURN(res); +} + + +/* + Change to another bitmap page + + SYNOPSIS + _ma_change_bitmap_page() + info Maria handler + bitmap Bitmap handler + page Bitmap page to read + + NOTES + If old bitmap was changed, write it out before reading new one + We return empty bitmap if page is outside of file size + + RETURN + 0 ok + 1 error (Error writing old bitmap or reading bitmap page) +*/ + +static my_bool _ma_change_bitmap_page(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page) +{ + DBUG_ENTER("_ma_change_bitmap_page"); + + if (bitmap->changed) + { + if (write_changed_bitmap(info->s, bitmap)) + DBUG_RETURN(1); + bitmap->changed= 0; + } + DBUG_RETURN(_ma_read_bitmap_page(info, bitmap, page)); +} + + +/* + Read next suitable bitmap + + SYNOPSIS + move_to_next_bitmap() + bitmap Bitmap handle + + NOTES + The found bitmap may be full, so calling function may need to call this + repeatedly until it finds enough space. 
+ + TODO + Add cache of bitmaps to not read something that is not usable + + RETURN + 0 ok + 1 error (either couldn't save old bitmap or read new one) +*/ + +static my_bool move_to_next_bitmap(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap) +{ + pgcache_page_no_t page= bitmap->page; + MARIA_STATE_INFO *state= &info->s->state; + DBUG_ENTER("move_to_next_bitmap"); + + /* + If a first bitmap with space is recorded (value is not ~0) and it is + not the current bitmap page, go to it and reset the record to ~0. + Otherwise advance by bitmap->pages_covered pages. + */ if (state->first_bitmap_with_space != ~(ulonglong) 0 && + state->first_bitmap_with_space != page) + { + page= state->first_bitmap_with_space; + state->first_bitmap_with_space= ~(ulonglong) 0; + } + else + page+= bitmap->pages_covered; + /* Writes out the current bitmap page first if it was changed */ DBUG_RETURN(_ma_change_bitmap_page(info, bitmap, page)); +} + + +/**************************************************************************** + Allocate data in bitmaps +****************************************************************************/ + +/* + Store data in 'block' and mark the place used in the bitmap + + SYNOPSIS + fill_block() + bitmap Bitmap handle + block Store data about what we found + best_data Pointer to best 6 uchar aligned area in bitmap->map + best_pos Which bit in *best_data the area starts + 0 = first bit pattern, 1 second bit pattern etc + best_bits The original value of the bits at best_pos + fill_pattern Bitmap pattern to store in best_data[best_pos] + + NOTES + We mark all pages to be 'TAIL's, which means that + block->page_count is really a row position inside the page. 
+*/ + +static void fill_block(MARIA_FILE_BITMAP *bitmap, + MARIA_BITMAP_BLOCK *block, + uchar *best_data, uint best_pos, uint best_bits, + uint fill_pattern) +{ + uint page, offset, tmp; + uchar *data; + DBUG_ENTER("fill_block"); + + /* + For each 6 bytes we have 6*8/3= 16 patterns, so 'page' is the index of + the chosen pattern counted from the start of bitmap->map. + block->page adds bitmap->page + 1 to it, as the first pattern belongs + to the page that follows the bitmap page. + */ + page= ((uint) (best_data - bitmap->map)) / 6 * 16 + best_pos; + DBUG_ASSERT(page + 1 < bitmap->pages_covered); + block->page= bitmap->page + 1 + page; + block->page_count= TAIL_PAGE_COUNT_MARKER; + block->empty_space= pattern_to_size(bitmap, best_bits); + block->sub_blocks= 0; + block->org_bitmap_value= best_bits; + block->used= BLOCKUSED_TAIL; /* See _ma_bitmap_release_unused() */ + + /* + A 3 bit pattern can straddle a byte boundary, so the pattern is + updated by reading and writing 2 bytes at a time + */ + best_pos*= 3; /* Bit offset of the pattern inside the 6 byte area */ + data= best_data+ best_pos / 8; + offset= best_pos & 7; /* Bit offset inside the first of the 2 bytes */ + tmp= uint2korr(data); + + /* we turn off the 3 bits and replace them with fill_pattern */ + tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset); + int2store(data, tmp); + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + DBUG_VOID_RETURN; +} + + +/* + Allocate data for head block + + SYNOPSIS + allocate_head() + bitmap bitmap + size Size of data region we need to store + block Store found information here + + IMPLEMENTATION + Find the best-fit page to put a region of 'size' + This is defined as the first page of the set of pages + with the smallest free space that can hold 'size'. 
+ + RETURN + 0 ok (block is updated) + 1 error (no space in bitmap; block is not touched) +*/ + + +static my_bool allocate_head(MARIA_FILE_BITMAP *bitmap, uint size, + MARIA_BITMAP_BLOCK *block) +{ + uint min_bits= size_to_head_pattern(bitmap, size); + uchar *data= bitmap->map, *end= data + bitmap->used_size; + uchar *best_data= 0; + uint best_bits= (uint) -1, best_pos; + DBUG_ENTER("allocate_head"); + + LINT_INIT(best_pos); + DBUG_ASSERT(size <= FULL_PAGE_SIZE(bitmap->block_size)); + + for (; data < end; data+= 6) + { + ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */ + uint i; + + /* + Skip common patterns + We can skip empty pages (if we already found a match) or + anything matching the following pattern as this will be either + a full page or a tail page + */ + if ((!bits && best_data) || + ((bits & LL(04444444444444444)) == LL(04444444444444444))) + continue; + for (i= 0; i < 16 ; i++, bits >>= 3) + { + uint pattern= (uint) (bits & 7); + if (pattern <= min_bits) + { + /* There is enough space here */ + if ((int) pattern > (int) best_bits) + { + /* + There is enough space here and the pattern is higher than the + best one found so far (best_bits starts as -1). Remember it, as + we will choose it unless a higher matching pattern is found + later in this bitmap page. 
+ */ + best_bits= pattern; + best_data= data; + best_pos= i; + if (pattern == min_bits) + goto found; /* Best possible match */ + } + } + } + } + if (!best_data) /* Found no place */ + { + if (data >= bitmap->map + bitmap->total_size) + DBUG_RETURN(1); /* No space in bitmap */ + /* + Allocate data at end of bitmap: grow used_size by one 6 byte area + (capped at total_size) and use the first pattern of that area + */ + bitmap->used_size+= 6; + set_if_smaller(bitmap->used_size, bitmap->total_size); + best_data= data; + best_pos= best_bits= 0; + } + +found: + fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_HEAD_PAGE); + DBUG_RETURN(0); +} + + +/* + Allocate data for tail block + + SYNOPSIS + allocate_tail() + bitmap bitmap + size Size of block we need to find + block Store found information here + + IMPLEMENTATION + Only patterns that are <= the pattern given by size_to_tail_pattern() + and that are either 0 or >= 5 are considered. The highest such + pattern wins and the page is then marked as FULL_TAIL_PAGE. + + RETURN + 0 ok (block is updated) + 1 error (no space in bitmap; block is not touched) +*/ + + +static my_bool allocate_tail(MARIA_FILE_BITMAP *bitmap, uint size, + MARIA_BITMAP_BLOCK *block) +{ + uint min_bits= size_to_tail_pattern(bitmap, size); + uchar *data= bitmap->map, *end= data + bitmap->used_size; + uchar *best_data= 0; + uint best_bits= (uint) -1, best_pos; + DBUG_ENTER("allocate_tail"); + DBUG_PRINT("enter", ("size: %u", size)); + + LINT_INIT(best_pos); + /* + We have to add DIR_ENTRY_SIZE here as this is not part of the data size + See call to allocate_tail() in find_tail(). + */ + DBUG_ASSERT(size <= MAX_TAIL_SIZE(bitmap->block_size) + DIR_ENTRY_SIZE); + + for (; data < end; data += 6) + { + ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */ + uint i; + + /* + Skip common patterns + We can skip empty pages (if we already found a match) or + the following patterns: 1-4 (head pages, not suitable for tail) or + 7 (full tail page). See 'Dynamic size records' comment at start of file. + + At the moment we only skip full head and tail pages (ie, all bits are + set) as this is easy to detect with one simple test and is a + quite common case if we have blobs. 
+ */ + + if ((!bits && best_data) || bits == LL(0xffffffffffff) || + bits == LL(04444444444444444)) + continue; + for (i= 0; i < 16; i++, bits >>= 3) + { + uint pattern= (uint) (bits & 7); + /* Candidates are pattern 0 and patterns 5-7 (1-4 are head pages, see above) */ if (pattern <= min_bits && (!pattern || pattern >= 5)) + { + if ((int) pattern > (int) best_bits) + { + best_bits= pattern; + best_data= data; + best_pos= i; + if (pattern == min_bits) + goto found; /* Can't be better */ + } + } + } + } + if (!best_data) /* Found no place */ + { + if (data >= bitmap->map + bitmap->total_size) + DBUG_RETURN(1); /* No space in bitmap */ + /* Allocate data at end of bitmap */ + best_data= data; + bitmap->used_size+= 6; + set_if_smaller(bitmap->used_size, bitmap->total_size); + best_pos= best_bits= 0; + } + +found: + fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_TAIL_PAGE); + DBUG_RETURN(0); +} + + +/* + Allocate data for full blocks + + SYNOPSIS + allocate_full_pages() + bitmap bitmap + pages_needed Total size in pages (bitmap->total_size) we would like to have + block Store found information here + full_page 1 if we are not allowed to split extent + + IMPLEMENTATION + We will return the smallest area >= size. If there is no such + block, we will return the biggest area that satisfies + area_size >= min(BLOB_SEGMENT_MIN_SIZE*full_page_size, size) + + To speed up searches, we will only consider areas that has at least 16 free + pages starting on an even boundary. When finding such an area, we will + extend it with all previous and following free pages. 
This will ensure + we don't get holes between areas + + RETURN + # Blocks used + 0 error (no space in bitmap; block is not touched) +*/ + +static ulong allocate_full_pages(MARIA_FILE_BITMAP *bitmap, + ulong pages_needed, + MARIA_BITMAP_BLOCK *block, my_bool full_page) +{ + uchar *data= bitmap->map, *data_end= data + bitmap->used_size; + uchar *page_end= data + bitmap->total_size; + uchar *best_data= 0; + uint min_size; + uint best_area_size, best_prefix_area_size, best_suffix_area_size; + uint page, size; + ulonglong best_prefix_bits; + DBUG_ENTER("allocate_full_pages"); + DBUG_PRINT("enter", ("pages_needed: %lu", pages_needed)); + + /* Following variables are only used if best_data is set */ + LINT_INIT(best_prefix_bits); + LINT_INIT(best_prefix_area_size); + LINT_INIT(best_suffix_area_size); + + min_size= pages_needed; + if (!full_page && min_size > BLOB_SEGMENT_MIN_SIZE) + min_size= BLOB_SEGMENT_MIN_SIZE; + best_area_size= ~(uint) 0; + + for (; data < page_end; data+= 6) + { + ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */ + uchar *data_start; + ulonglong prefix_bits= 0; + uint area_size, prefix_area_size, suffix_area_size; + + /* Find area with at least 16 free pages */ + if (bits) + continue; + data_start= data; + /* Find size of area */ + for (data+=6 ; data < data_end ; data+= 6) + { + if ((bits= uint6korr(data))) + break; + } + area_size= (uint) (data - data_start) / 6 * 16; + if (area_size >= best_area_size) + continue; + prefix_area_size= suffix_area_size= 0; + if (!bits) + { + /* + End of page; All the rest of the bits on page are part of area + This is needed because bitmap->used_size only covers the set bits + in the bitmap. 
+ */ + area_size+= (uint) (page_end - data) / 6 * 16; + if (area_size >= best_area_size) + break; + data= page_end; + } + else + { + /* Add bits at end of page */ + for (; !(bits & 7); bits >>= 3) + suffix_area_size++; + area_size+= suffix_area_size; + } + if (data_start != bitmap->map) + { + /* Add bits before page */ + bits= prefix_bits= uint6korr(data_start - 6); + DBUG_ASSERT(bits != 0); + /* 111 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 */ + if (!(bits & LL(07000000000000000))) + { + data_start-= 6; + do + { + prefix_area_size++; + bits<<= 3; + } while (!(bits & LL(07000000000000000))); + area_size+= prefix_area_size; + /* Calculate offset to page from data_start */ + prefix_area_size= 16 - prefix_area_size; + } + } + if (area_size >= min_size && area_size <= best_area_size) + { + best_data= data_start; + best_area_size= area_size; + best_prefix_bits= prefix_bits; + best_prefix_area_size= prefix_area_size; + best_suffix_area_size= suffix_area_size; + + /* Prefer to put data in biggest possible area */ + if (area_size <= pages_needed) + min_size= area_size; + else + min_size= pages_needed; + } + } + if (!best_data) + DBUG_RETURN(0); /* No room on page */ + + /* + Now allocate min(pages_needed, area_size), starting from + best_start + best_prefix_area_size + */ + if (best_area_size > pages_needed) + best_area_size= pages_needed; + + /* For each 6 bytes we have 6*8/3= 16 patterns */ + page= ((uint) (best_data - bitmap->map) * 8) / 3 + best_prefix_area_size; + block->page= bitmap->page + 1 + page; + block->page_count= best_area_size; + block->empty_space= 0; + block->sub_blocks= 0; + block->org_bitmap_value= 0; + block->used= 0; + DBUG_ASSERT(page + best_area_size < bitmap->pages_covered); + DBUG_PRINT("info", ("page: %lu page_count: %u", + (ulong) block->page, block->page_count)); + + if (best_prefix_area_size) + { + ulonglong tmp; + /* Convert offset back to bits */ + best_prefix_area_size= 16 - best_prefix_area_size; + if (best_area_size < 
best_prefix_area_size) + { + tmp= (LL(1) << best_area_size*3) - 1; + best_area_size= best_prefix_area_size; /* for easy end test */ + } + else + tmp= (LL(1) << best_prefix_area_size*3) - 1; + tmp<<= (16 - best_prefix_area_size) * 3; + DBUG_ASSERT((best_prefix_bits & tmp) == 0); + best_prefix_bits|= tmp; + int6store(best_data, best_prefix_bits); + if (!(best_area_size-= best_prefix_area_size)) + { + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + DBUG_RETURN(block->page_count); + } + best_data+= 6; + } + best_area_size*= 3; /* Bits to set */ + size= best_area_size/8; /* Bytes to set */ + bfill(best_data, size, 255); + best_data+= size; + if ((best_area_size-= size * 8)) + { + /* fill last uchar */ + *best_data|= (uchar) ((1 << best_area_size) -1); + best_data++; + } + if (data_end < best_data) + { + bitmap->used_size= (uint) (best_data - bitmap->map); + DBUG_ASSERT(bitmap->used_size <= bitmap->total_size); + } + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + DBUG_RETURN(block->page_count); +} + + +/**************************************************************************** + Find right bitmaps where to store data +****************************************************************************/ + +/* + Find right bitmap and position for head block + + SYNOPSIS + find_head() + info Maria handler + length Size of data region we need store + position Position in bitmap_blocks where to store the + information for the head block. + + RETURN + 0 ok + 1 error +*/ + +static my_bool find_head(MARIA_HA *info, uint length, uint position) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + MARIA_BITMAP_BLOCK *block; + /* + There is always place for the head block in bitmap_blocks as these are + preallocated at _ma_init_block_record(). 
+ */ + block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *); + + /* + We need to have DIRENTRY_SIZE here to take into account that we may + need an extra directory entry for the row + */ + while (allocate_head(bitmap, length + DIR_ENTRY_SIZE, block)) + if (move_to_next_bitmap(info, bitmap)) + return 1; + return 0; +} + + +/* + Find right bitmap and position for tail + + SYNOPSIS + find_tail() + info Maria handler + length Size of data region we need store + position Position in bitmap_blocks where to store the + information for the head block. + + RETURN + 0 ok + 1 error +*/ + +static my_bool find_tail(MARIA_HA *info, uint length, uint position) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + MARIA_BITMAP_BLOCK *block; + DBUG_ENTER("find_tail"); + DBUG_ASSERT(length <= info->s->block_size - PAGE_OVERHEAD_SIZE); + + /* Needed, as there is no error checking in dynamic_element */ + if (allocate_dynamic(&info->bitmap_blocks, position)) + DBUG_RETURN(1); + block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *); + + /* + We have to add DIR_ENTRY_SIZE to ensure we have space for the tail and + it's directroy entry on the page + */ + while (allocate_tail(bitmap, length + DIR_ENTRY_SIZE, block)) + if (move_to_next_bitmap(info, bitmap)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/* + Find right bitmap and position for full blocks in one extent + + SYNOPSIS + find_mid() + info Maria handler. + pages How many pages to allocate. + position Position in bitmap_blocks where to store the + information for the head block. 
+ NOTES + This is used to allocate the main extent after the 'head' block + (Ie, the middle part of the head-middle-tail entry) + + RETURN + 0 ok + 1 error +*/ + +static my_bool find_mid(MARIA_HA *info, ulong pages, uint position) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + MARIA_BITMAP_BLOCK *block; + block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *); + + /* full_page is 1 here: the extent may not be split */ while (!allocate_full_pages(bitmap, pages, block, 1)) + { + if (move_to_next_bitmap(info, bitmap)) + return 1; + } + return 0; +} + + +/* + Find right bitmap and position for putting a blob + + SYNOPSIS + find_blob() + info Maria handler. + length Length of the blob + + NOTES + The extents are stored last in info->bitmap_blocks + + IMPLEMENTATION + Allocate all full pages for the blob and optionally one tail. + A rest of MAX_TAIL_SIZE() bytes or more gets a full page of its own + instead of a tail. + The full pages may be spread over several extents, each of at most + 0x3fff pages. + sub_blocks of the first block used for the blob is set to the number + of blocks that were added for it. 
+ + RETURN + 0 ok + 1 error +*/ + +static my_bool find_blob(MARIA_HA *info, ulong length) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + uint full_page_size= FULL_PAGE_SIZE(info->s->block_size); + ulong pages; + uint rest_length, used; + uint first_block_pos; + MARIA_BITMAP_BLOCK *first_block= 0; + DBUG_ENTER("find_blob"); + DBUG_PRINT("enter", ("length: %lu", length)); + LINT_INIT(first_block_pos); + + pages= length / full_page_size; + rest_length= (uint) (length - pages * full_page_size); + if (rest_length >= MAX_TAIL_SIZE(info->s->block_size)) + { + pages++; + rest_length= 0; + } + + first_block_pos= info->bitmap_blocks.elements; + if (pages) + { + MARIA_BITMAP_BLOCK *block; + /* Make room for the extents before taking a pointer into the array */ if (allocate_dynamic(&info->bitmap_blocks, + info->bitmap_blocks.elements + + pages / BLOB_SEGMENT_MIN_SIZE + 2)) + DBUG_RETURN(1); + block= dynamic_element(&info->bitmap_blocks, info->bitmap_blocks.elements, + MARIA_BITMAP_BLOCK*); + do + { + /* + We use 0x3fff here as the two upmost bits are reserved for + TAIL_BIT and START_EXTENT_BIT + */ + used= allocate_full_pages(bitmap, + (pages >= 0x3fff ? 
0x3fff : (uint) pages), + block, 0); + if (!used) + { + /* Nothing usable in this bitmap; retry the same block in the next one */ if (move_to_next_bitmap(info, bitmap)) + DBUG_RETURN(1); + } + else + { + pages-= used; + info->bitmap_blocks.elements++; + block++; + } + } while (pages != 0); + } + if (rest_length && find_tail(info, rest_length, + info->bitmap_blocks.elements++)) + DBUG_RETURN(1); + first_block= dynamic_element(&info->bitmap_blocks, first_block_pos, + MARIA_BITMAP_BLOCK*); + first_block->sub_blocks= info->bitmap_blocks.elements - first_block_pos; + DBUG_RETURN(0); +} + + +/* + Find pages to put ALL blobs + + SYNOPSIS + allocate_blobs() + info Maria handler + row Information of what is in the row (from calc_record_size()) + + NOTES + row->extents_count is set to the number of blocks added to + info->bitmap_blocks for the blobs. Blobs of length 0 are skipped. + + RETURN + 0 ok + 1 error +*/ + +static my_bool allocate_blobs(MARIA_HA *info, MARIA_ROW *row) +{ + ulong *length, *end; + uint elements; + /* + Remember how many elements are in use before the blobs are allocated. + The callers in this file have already set this to + ELEMENTS_RESERVED_FOR_MAIN_PART, so only the blocks added for the + blobs are counted below. + */ + elements= info->bitmap_blocks.elements; + for (length= row->blob_lengths, end= length + info->s->base.blobs; + length < end; length++) + { + if (*length && find_blob(info, *length)) + return 1; + } + row->extents_count= (info->bitmap_blocks.elements - elements); + return 0; +} + + +/* + Store in the bitmap the new size for a head page + + SYNOPSIS + use_head() + info Maria handler + page Page number to update + (Note that caller guarantees this is in the active + bitmap) + size How much free space is left on the page + block_position In which info->bitmap_block we have the + information about the head block. 
+ + NOTES + This is used on update where we are updating an existing head page +*/ + +static void use_head(MARIA_HA *info, pgcache_page_no_t page, uint size, + uint block_position) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + MARIA_BITMAP_BLOCK *block; + uchar *data; + uint offset, tmp, offset_page; + DBUG_ENTER("use_head"); + + /* 'page' may not be a bitmap page (a multiple of pages_covered) */ DBUG_ASSERT(page % bitmap->pages_covered); + + block= dynamic_element(&info->bitmap_blocks, block_position, + MARIA_BITMAP_BLOCK*); + block->page= page; + block->page_count= 1 + TAIL_BIT; + block->empty_space= size; + block->used= BLOCKUSED_TAIL; + + /* + A 3 bit pattern can straddle a byte boundary, so the pattern is + updated by reading and writing 2 bytes at a time. + The old pattern is saved in block->org_bitmap_value and the page is + marked as FULL_HEAD_PAGE, whatever the value of 'size' is. + */ + offset_page= (uint) (page - bitmap->page - 1) * 3; + offset= offset_page & 7; + data= bitmap->map + offset_page / 8; + tmp= uint2korr(data); + block->org_bitmap_value= (tmp >> offset) & 7; + tmp= (tmp & ~(7 << offset)) | (FULL_HEAD_PAGE << offset); + int2store(data, tmp); + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + DBUG_VOID_RETURN; +} + + +/* + Find out where to split the row (ie, what goes in head, middle, tail etc) + + SYNOPSIS + find_where_to_split_row() + share Maria share + row Information of what is in the row (from calc_record_size()) + extents_length Number of bytes needed to store all extents + split_size Free size on the page (The head length must be less + than this) + + RETURN + row_length for the head block. 
+*/ + +static uint find_where_to_split_row(MARIA_SHARE *share, MARIA_ROW *row, + uint extents_length, uint split_size) +{ + uint *lengths, *lengths_end; + /* + Ensure we have the minimum required space on head page: + - Header + length of field lengths (row->min_length) + - Number of extents + - One extent + */ + uint row_length= (row->min_length + + size_to_store_key_length(extents_length) + + ROW_EXTENT_SIZE); + DBUG_ASSERT(row_length < split_size); + /* + Store in the three slots in front of row->null_field_lengths the + different parts that are written first to the row. This needs to be in + same order as in write_block_record() + (NOTE(review): presumably in ma_blockrec.c - confirm) + */ + row->null_field_lengths[-3]= extents_length; + row->null_field_lengths[-2]= share->base.fixed_not_null_fields_length; + row->null_field_lengths[-1]= row->field_lengths_length; + /* + Add lengths, starting with the extra slots stored above, for as long as + the sum stays within split_size. Blob fields are not part of the loop. + */ for (lengths= row->null_field_lengths - EXTRA_LENGTH_FIELDS, + lengths_end= (lengths + share->base.pack_fields - share->base.blobs + + EXTRA_LENGTH_FIELDS); lengths < lengths_end; lengths++) + { + if (row_length + *lengths > split_size) + break; + row_length+= *lengths; + } + return row_length; +} + + +/* + Find where to write the middle parts of the row and the tail + + SYNOPSIS + write_rest_of_head() + info Maria handler + position Position in bitmap_blocks. Is 0 for rows that needs + full blocks (ie, has a head, middle part and optional tail) + rest_length How much left of the head block to write. 
+ + RETURN + 0 ok + 1 error +*/ + +static my_bool write_rest_of_head(MARIA_HA *info, uint position, + ulong rest_length) +{ + MARIA_SHARE *share= info->s; + uint full_page_size= FULL_PAGE_SIZE(share->block_size); + MARIA_BITMAP_BLOCK *block; + DBUG_ENTER("write_rest_of_head"); + DBUG_PRINT("enter", ("position: %u rest_length: %lu", position, + rest_length)); + + if (position == 0) + { + /* Allocate full pages for the middle part; they go into element 1 */ + uint pages= rest_length / full_page_size; + + rest_length%= full_page_size; + if (rest_length >= MAX_TAIL_SIZE(share->block_size)) + { + /* Put tail on a full page */ + pages++; + rest_length= 0; + } + if (find_mid(info, pages, 1)) + DBUG_RETURN(1); + /* + Insert empty block after full pages, to allow write_block_record() to + split segment into used + free page + */ + block= dynamic_element(&info->bitmap_blocks, 2, MARIA_BITMAP_BLOCK*); + block->page_count= 0; + block->used= 0; + } + /* What is left, if anything, goes to a tail in the last reserved element */ if (rest_length) + { + if (find_tail(info, rest_length, ELEMENTS_RESERVED_FOR_MAIN_PART - 1)) + DBUG_RETURN(1); + } + else + { + /* Empty tail block */ + block= dynamic_element(&info->bitmap_blocks, + ELEMENTS_RESERVED_FOR_MAIN_PART - 1, + MARIA_BITMAP_BLOCK *); + block->page_count= 0; + block->used= 0; + } + DBUG_RETURN(0); +} + + +/* + Find where to store one row + + SYNPOSIS + _ma_bitmap_find_place() + info Maria handler + row Information about row to write + blocks Store data about allocated places here + + RETURN + 0 ok + row->space_on_head_page contains minimum number of bytes we + expect to put on the head page. 
+ 1 error + my_errno is set to error +*/ + +my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row, + MARIA_BITMAP_BLOCKS *blocks) +{ + MARIA_SHARE *share= info->s; + my_bool res= 1; + uint full_page_size, position, max_page_size; + uint head_length, row_length, rest_length, extents_length; + DBUG_ENTER("_ma_bitmap_find_place"); + + blocks->count= 0; + blocks->tail_page_skipped= blocks->page_skipped= 0; + row->extents_count= 0; + + /* + Reserve place for the following blocks: + - Head block + - Full page block + - Marker block to allow write_block_record() to split full page blocks + into full and free part + - Tail block + */ + + info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART; + max_page_size= (share->block_size - PAGE_OVERHEAD_SIZE); + + /* The lock is held until the 'abort' label, which all paths go through */ pthread_mutex_lock(&share->bitmap.bitmap_lock); + + if (row->total_length <= max_page_size) + { + /* Row fits in one page */ + position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1; + if (find_head(info, (uint) row->total_length, position)) + goto abort; + row->space_on_head_page= row->total_length; + goto end; + } + + /* + First allocate all blobs so that we can find out the needed size for + the main block. + */ + if (row->blob_length && allocate_blobs(info, row)) + goto abort; + + extents_length= row->extents_count * ROW_EXTENT_SIZE; + /* + The + 3 is reserved for storing the number of segments in the row header. 
+ */ + if ((head_length= (row->head_length + extents_length + 3)) <= + max_page_size) + { + /* Main row part fits into one page */ + position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1; + if (find_head(info, head_length, position)) + goto abort; + row->space_on_head_page= head_length; + goto end; + } + + /* Add room for one extent per block reserved for the main part */ + head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE; + + /* The first segment size is stored in 'row_length' */ + row_length= find_where_to_split_row(share, row, extents_length, + max_page_size); + + /* Despite its name, full_page_size holds the maximum tail size here */ full_page_size= MAX_TAIL_SIZE(share->block_size); + position= 0; + if (head_length - row_length <= full_page_size) + position= ELEMENTS_RESERVED_FOR_MAIN_PART -2; /* Only head and tail */ + if (find_head(info, row_length, position)) + goto abort; + row->space_on_head_page= row_length; + + rest_length= head_length - row_length; + if (write_rest_of_head(info, position, rest_length)) + goto abort; + +end: + blocks->block= dynamic_element(&info->bitmap_blocks, position, + MARIA_BITMAP_BLOCK*); + blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position; + /* First block's page_count is for all blocks */ + blocks->count= info->bitmap_blocks.elements - position; + res= 0; + +abort: + /* Reached on success (res == 0) and on error (res == 1) */ pthread_mutex_unlock(&share->bitmap.bitmap_lock); + DBUG_RETURN(res); +} + + +/* + Find where to put row on update (when head page is already defined) + + SYNPOSIS + _ma_bitmap_find_new_place() + info Maria handler + row Information about row to write + page On which page original row was stored + free_size Free size on head page + blocks Store data about allocated places here + + NOTES + This function is only called when the new row can't fit in the space of + the old row in the head page. + + This is essently same as _ma_bitmap_find_place() except that + we don't call find_head() to search in bitmaps where to put the page. 
+ + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *row, + pgcache_page_no_t page, uint free_size, + MARIA_BITMAP_BLOCKS *blocks) +{ + MARIA_SHARE *share= info->s; + my_bool res= 1; + uint position; + uint head_length, row_length, rest_length, extents_length; + ulonglong bitmap_page; + DBUG_ENTER("_ma_bitmap_find_new_place"); + + blocks->count= 0; + blocks->tail_page_skipped= blocks->page_skipped= 0; + row->extents_count= 0; + info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART; + + pthread_mutex_lock(&share->bitmap.bitmap_lock); + + /* + First allocate all blobs (so that we can find out the needed size for + the main block. + */ + if (row->blob_length && allocate_blobs(info, row)) + goto abort; + + /* Switch bitmap to current head page */ + bitmap_page= page / share->bitmap.pages_covered; + bitmap_page*= share->bitmap.pages_covered; + + if (share->bitmap.page != bitmap_page && + _ma_change_bitmap_page(info, &share->bitmap, bitmap_page)) + goto abort; + + extents_length= row->extents_count * ROW_EXTENT_SIZE; + if ((head_length= (row->head_length + extents_length + 3)) <= free_size) + { + /* Main row part fits into one page */ + position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1; + use_head(info, page, head_length, position); + row->space_on_head_page= head_length; + goto end; + } + + /* Allocate enough space */ + head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE; + + /* The first segment size is stored in 'row_length' */ + row_length= find_where_to_split_row(share, row, extents_length, free_size); + + position= 0; + if (head_length - row_length < MAX_TAIL_SIZE(share->block_size)) + position= ELEMENTS_RESERVED_FOR_MAIN_PART -2; /* Only head and tail */ + use_head(info, page, row_length, position); + row->space_on_head_page= row_length; + + rest_length= head_length - row_length; + if (write_rest_of_head(info, position, rest_length)) + goto abort; + +end: + blocks->block= 
dynamic_element(&info->bitmap_blocks, position, + MARIA_BITMAP_BLOCK*); + blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position; + /* First block's page_count is for all blocks */ + blocks->count= info->bitmap_blocks.elements - position; + res= 0; + +abort: + pthread_mutex_unlock(&share->bitmap.bitmap_lock); + DBUG_RETURN(res); +} + + +/**************************************************************************** + Clear and reset bits +****************************************************************************/ + +/* + Set fill pattern for a page + + set_page_bits() + info Maria handler + bitmap Bitmap handler + page Adress to page + fill_pattern Pattern (not size) for page + + NOTES + Page may not be part of active bitmap + + RETURN + 0 ok + 1 error +*/ + +static my_bool set_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page, uint fill_pattern) +{ + pgcache_page_no_t bitmap_page; + uint offset_page, offset, tmp, org_tmp; + uchar *data; + DBUG_ENTER("set_page_bits"); + DBUG_ASSERT(fill_pattern <= 7); + + bitmap_page= page - page % bitmap->pages_covered; + if (bitmap_page != bitmap->page && + _ma_change_bitmap_page(info, bitmap, bitmap_page)) + DBUG_RETURN(1); + + /* Find page number from start of bitmap */ + offset_page= (uint) (page - bitmap->page - 1); + /* + Mark place used by reading/writing 2 bytes at a time to handle + bitmaps in overlapping bytes + */ + offset_page*= 3; + offset= offset_page & 7; + data= bitmap->map + offset_page / 8; + org_tmp= tmp= uint2korr(data); + tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset); + if (tmp == org_tmp) + DBUG_RETURN(0); /* No changes */ + int2store(data, tmp); + + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + if (fill_pattern != 3 && fill_pattern != 7) + set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page); + /* + Note that if the condition above is false (page is full), and all pages of + this bitmap are now full, and 
that bitmap page was + first_bitmap_with_space, we don't modify first_bitmap_with_space, indeed + its value still tells us where to start our search for a bitmap with space + (which is for sure after this full one). + That does mean that first_bitmap_with_space is only a lower bound. + */ + DBUG_RETURN(0); +} + + +/* + Get bitmap pattern for a given page + + SYNOPSIS + get_page_bits() + info Maria handler + bitmap Bitmap handler + page Page number + + RETURN + 0-7 Bitmap pattern + ~0 Error (couldn't read page) +*/ + +uint _ma_bitmap_get_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page) +{ + pgcache_page_no_t bitmap_page; + uint offset_page, offset, tmp; + uchar *data; + DBUG_ENTER("_ma_bitmap_get_page_bits"); + + bitmap_page= page - page % bitmap->pages_covered; + if (bitmap_page != bitmap->page && + _ma_change_bitmap_page(info, bitmap, bitmap_page)) + DBUG_RETURN(~ (uint) 0); + + /* Find page number from start of bitmap */ + offset_page= (uint) (page - bitmap->page - 1); + /* + Mark place used by reading/writing 2 bytes at a time to handle + bitmaps in overlapping bytes + */ + offset_page*= 3; + offset= offset_page & 7; + data= bitmap->map + offset_page / 8; + tmp= uint2korr(data); + DBUG_RETURN((tmp >> offset) & 7); +} + + +/* + Mark all pages in a region as free + + SYNOPSIS + _ma_bitmap_reset_full_page_bits() + info Maria handler + bitmap Bitmap handler + page Start page + page_count Number of pages + + NOTES + We assume that all pages in region is covered by same bitmap + One must have a lock on info->s->bitmap.bitmap_lock + + RETURN + 0 ok + 1 Error (when reading bitmap) +*/ + +my_bool _ma_bitmap_reset_full_page_bits(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page, + uint page_count) +{ + ulonglong bitmap_page; + uint offset, bit_start, bit_count, tmp; + uchar *data; + DBUG_ENTER("_ma_bitmap_reset_full_page_bits"); + DBUG_PRINT("enter", ("page: %lu page_count: %u", (ulong) page, page_count)); + 
safe_mutex_assert_owner(&info->s->bitmap.bitmap_lock); + + bitmap_page= page - page % bitmap->pages_covered; + DBUG_ASSERT(page != bitmap_page); + + if (bitmap_page != bitmap->page && + _ma_change_bitmap_page(info, bitmap, bitmap_page)) + DBUG_RETURN(1); + + /* Find page number from start of bitmap */ + offset= (uint) (page - bitmap->page - 1); + + /* Clear bits from 'page * 3' -> '(page + page_count) * 3' */ + bit_start= offset * 3; + bit_count= page_count * 3; + + data= bitmap->map + bit_start / 8; + offset= bit_start & 7; + + tmp= (255 << offset); /* Bits to keep */ + if (bit_count + offset < 8) + { + /* Only clear bits between 'offset' and 'offset+bit_count-1' */ + tmp^= (255 << (offset + bit_count)); + } + *data&= ~tmp; + + if ((int) (bit_count-= (8 - offset)) > 0) + { + uint fill; + data++; + /* + -1 is here to avoid one 'if' statement and to let the following code + handle the last byte + */ + if ((fill= (bit_count - 1) / 8)) + { + bzero(data, fill); + data+= fill; + } + bit_count-= fill * 8; /* Bits left to clear */ + tmp= (1 << bit_count) - 1; + *data&= ~tmp; + } + set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page); + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + DBUG_RETURN(0); +} + +/* + Set all pages in a region as used + + SYNOPSIS + _ma_bitmap_set_full_page_bits() + info Maria handler + bitmap Bitmap handler + page Start page + page_count Number of pages + + NOTES + We assume that all pages in region is covered by same bitmap + One must have a lock on info->s->bitmap.bitmap_lock + + RETURN + 0 ok + 1 Error (when reading bitmap) +*/ + +my_bool _ma_bitmap_set_full_page_bits(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page, uint page_count) +{ + ulonglong bitmap_page; + uint offset, bit_start, bit_count, tmp; + uchar *data; + DBUG_ENTER("_ma_bitmap_set_full_page_bits"); + DBUG_PRINT("enter", ("page: %lu page_count: %u", (ulong) page, page_count)); + 
safe_mutex_assert_owner(&info->s->bitmap.bitmap_lock); + + bitmap_page= page - page % bitmap->pages_covered; + if (page == bitmap_page || + page + page_count >= bitmap_page + bitmap->pages_covered) + { + DBUG_ASSERT(0); /* Wrong in data */ + DBUG_RETURN(1); + } + + if (bitmap_page != bitmap->page && + _ma_change_bitmap_page(info, bitmap, bitmap_page)) + DBUG_RETURN(1); + + /* Find page number from start of bitmap */ + offset= (uint) (page - bitmap->page - 1); + + /* Set bits from 'page * 3' -> '(page + page_count) * 3' */ + bit_start= offset * 3; + bit_count= page_count * 3; + + data= bitmap->map + bit_start / 8; + offset= bit_start & 7; + + tmp= (255 << offset); /* Bits to keep */ + if (bit_count + offset < 8) + { + /* Only set bits between 'offset' and 'offset+bit_count-1' */ + tmp^= (255 << (offset + bit_count)); + } + *data|= tmp; + + if ((int) (bit_count-= (8 - offset)) > 0) + { + uint fill; + data++; + /* + -1 is here to avoid one 'if' statement and to let the following code + handle the last byte + */ + if ((fill= (bit_count - 1) / 8)) + { + bfill(data, fill, 255); + data+= fill; + } + bit_count-= fill * 8; /* Bits left to set */ + tmp= (1 << bit_count) - 1; + *data|= tmp; + } + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + DBUG_RETURN(0); +} + + +/** + @brief + Make a transition of MARIA_FILE_BITMAP::non_flushable. + If the bitmap becomes flushable, which requires that REDO-UNDO has been + logged and all bitmap pages touched by the thread have a correct + allocation, it unpins all bitmap pages, and if _ma_bitmap_flush_all() is + waiting (in practice it is a checkpoint), it wakes it up. + If the bitmap becomes or stays unflushable, the function merely records it + unless a concurrent _ma_bitmap_flush_all() is happening, in which case the + function first waits for the flush to be done. + + @note + this sets info->non_flushable_state to 1 if we have incremented + bitmap->non_flushable and not yet decremented it. 
+ + @param share Table's share + @param non_flushable_inc Increment of MARIA_FILE_BITMAP::non_flushable + (-1 or +1). +*/ + +void _ma_bitmap_flushable(MARIA_HA *info, int non_flushable_inc) +{ + MARIA_SHARE *share= info->s; + MARIA_FILE_BITMAP *bitmap; + DBUG_ENTER("_ma_bitmap_flushable"); + + /* + Not transactional tables are never automaticly flushed and needs no + protection + */ + if (!share->now_transactional) + DBUG_VOID_RETURN; + + bitmap= &share->bitmap; + pthread_mutex_lock(&bitmap->bitmap_lock); + + if (non_flushable_inc == -1) + { + DBUG_ASSERT((int) bitmap->non_flushable > 0); + DBUG_ASSERT(info->non_flushable_state == 1); + if (--bitmap->non_flushable == 0) + { + /* + We unlock and unpin pages locked and pinned by other threads. It does + not seem to be an issue as all bitmap changes are serialized with + the bitmap's mutex. + */ + _ma_bitmap_unpin_all(share); + if (unlikely(bitmap->flush_all_requested)) + { + DBUG_PRINT("info", ("bitmap flushable waking up flusher")); + pthread_cond_broadcast(&bitmap->bitmap_cond); + } + } + DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable)); + pthread_mutex_unlock(&bitmap->bitmap_lock); + info->non_flushable_state= 0; + DBUG_VOID_RETURN; + } + DBUG_ASSERT(non_flushable_inc == 1); + DBUG_ASSERT(info->non_flushable_state == 0); + while (unlikely(bitmap->flush_all_requested)) + { + /* + Some other thread is waiting for the bitmap to become + flushable. Not the moment to make the bitmap unflushable or more + unflushable; let's rather back off and wait. If we didn't do this, with + multiple writers, there may always be one thread causing the bitmap to + be unflushable and _ma_bitmap_flush_all() would wait for long. + There should not be a deadlock because if our thread increased + non_flushable (and thus _ma_bitmap_flush_all() is waiting for at least + our thread), it is not going to increase it more so is not going to come + here. 
+ */ + DBUG_PRINT("info", ("waiting for bitmap flusher")); + pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock); + } + bitmap->non_flushable++; + DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable)); + pthread_mutex_unlock(&bitmap->bitmap_lock); + info->non_flushable_state= 1; + DBUG_VOID_RETURN; +} + + +/* + Correct bitmap pages to reflect the true allocation + + SYNOPSIS + _ma_bitmap_release_unused() + info Maria handle + blocks Bitmap blocks + + IMPLEMENTATION + If block->used & BLOCKUSED_TAIL is set: + If block->used & BLOCKUSED_USED is set, then the bits for the + corresponding page is set according to block->empty_space + If block->used & BLOCKUSED_USED is not set, then the bits for + the corresponding page is set to org_bitmap_value; + + If block->used & BLOCKUSED_TAIL is not set: + if block->used is not set, the bits for the corresponding page are + cleared + + For the first block (head block) the logic is same as for a tail block + + Note that we may have 'filler blocks' that are used to split a block + in half; These can be recognized by that they have page_count == 0. 
+ + This code also reverse the effect of ma_bitmap_flushable(.., 1); + + RETURN + 0 ok + 1 error (Couldn't write or read bitmap page) +*/ + +my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks) +{ + MARIA_BITMAP_BLOCK *block= blocks->block, *end= block + blocks->count; + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + uint bits, current_bitmap_value; + DBUG_ENTER("_ma_bitmap_release_unused"); + + /* + We can skip FULL_HEAD_PAGE (4) as the page was marked as 'full' + when we allocated space in the page + */ + current_bitmap_value= FULL_HEAD_PAGE; + + pthread_mutex_lock(&bitmap->bitmap_lock); + + /* First handle head block */ + if (block->used & BLOCKUSED_USED) + { + DBUG_PRINT("info", ("head page: %lu empty_space: %u", + (ulong) block->page, block->empty_space)); + bits= _ma_free_size_to_head_pattern(bitmap, block->empty_space); + if (block->used & BLOCKUSED_USE_ORG_BITMAP) + current_bitmap_value= block->org_bitmap_value; + } + else + bits= block->org_bitmap_value; + if (bits != current_bitmap_value) + { + if (set_page_bits(info, bitmap, block->page, bits)) + goto err; + } + else + { + DBUG_ASSERT(current_bitmap_value == + _ma_bitmap_get_page_bits(info, bitmap, block->page)); + } + + /* Handle all full pages and tail pages (for head page and blob) */ + for (block++; block < end; block++) + { + uint page_count; + if (!block->page_count) + continue; /* Skip 'filler blocks' */ + + page_count= block->page_count; + if (block->used & BLOCKUSED_TAIL) + { + current_bitmap_value= FULL_TAIL_PAGE; + /* The bitmap page is only one page */ + page_count= 1; + if (block->used & BLOCKUSED_USED) + { + DBUG_PRINT("info", ("tail page: %lu empty_space: %u", + (ulong) block->page, block->empty_space)); + bits= free_size_to_tail_pattern(bitmap, block->empty_space); + if (block->used & BLOCKUSED_USE_ORG_BITMAP) + current_bitmap_value= block->org_bitmap_value; + } + else + bits= block->org_bitmap_value; + + /* + The page has all bits set; The following test is an 
optimization + to not set the bits to the same value as before. + */ + if (bits != current_bitmap_value) + { + if (set_page_bits(info, bitmap, block->page, bits)) + goto err; + } + else + { + DBUG_ASSERT(current_bitmap_value == + _ma_bitmap_get_page_bits(info, bitmap, block->page)); + } + } + else if (!(block->used & BLOCKUSED_USED) && + _ma_bitmap_reset_full_page_bits(info, bitmap, + block->page, page_count)) + goto err; + } + + /* This duplicates ma_bitmap_flushable(-1) except it already has mutex */ + if (info->non_flushable_state) + { + DBUG_ASSERT(((int) (bitmap->non_flushable)) > 0); + info->non_flushable_state= 0; + if (--bitmap->non_flushable == 0) + { + _ma_bitmap_unpin_all(info->s); + if (unlikely(bitmap->flush_all_requested)) + { + DBUG_PRINT("info", ("bitmap flushable waking up flusher")); + pthread_cond_broadcast(&bitmap->bitmap_cond); + } + } + } + DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable)); + + pthread_mutex_unlock(&bitmap->bitmap_lock); + DBUG_RETURN(0); + +err: + pthread_mutex_unlock(&bitmap->bitmap_lock); + DBUG_RETURN(1); +} + + +/* + Free full pages from bitmap and pagecache + + SYNOPSIS + _ma_bitmap_free_full_pages() + info Maria handle + extents Extents (as stored on disk) + count Number of extents + + IMPLEMENTATION + Mark all full pages (not tails) from extents as free, both in bitmap + and page cache. 
+ + RETURN + 0 ok + 1 error (Couldn't write or read bitmap page) +*/ + +my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents, + uint count) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + DBUG_ENTER("_ma_bitmap_free_full_pages"); + + pthread_mutex_lock(&bitmap->bitmap_lock); + for (; count--; extents+= ROW_EXTENT_SIZE) + { + pgcache_page_no_t page= uint5korr(extents); + uint page_count= (uint2korr(extents + ROW_EXTENT_PAGE_SIZE) & + ~START_EXTENT_BIT); + if (!(page_count & TAIL_BIT)) + { + if (page == 0 && page_count == 0) + continue; /* Not used extent */ + if (pagecache_delete_pages(info->s->pagecache, &info->dfile, page, + page_count, PAGECACHE_LOCK_WRITE, 1) || + _ma_bitmap_reset_full_page_bits(info, bitmap, page, page_count)) + { + pthread_mutex_unlock(&bitmap->bitmap_lock); + DBUG_RETURN(1); + } + } + } + pthread_mutex_unlock(&bitmap->bitmap_lock); + DBUG_RETURN(0); +} + + +/* + Mark in the bitmap how much free space there is on a page + + SYNOPSIS + _ma_bitmap_set() + info Maria handler + page Adress to page + head 1 if page is a head page, 0 if tail page + empty_space How much empty space there is on page + + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_bitmap_set(MARIA_HA *info, pgcache_page_no_t page, my_bool head, + uint empty_space) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + uint bits; + my_bool res; + DBUG_ENTER("_ma_bitmap_set"); + DBUG_PRINT("enter", ("page: %lu head: %d empty_space: %u", + (ulong) page, head, empty_space)); + + pthread_mutex_lock(&info->s->bitmap.bitmap_lock); + bits= (head ? 
+ _ma_free_size_to_head_pattern(bitmap, empty_space) : + free_size_to_tail_pattern(bitmap, empty_space)); + res= set_page_bits(info, bitmap, page, bits); + pthread_mutex_unlock(&info->s->bitmap.bitmap_lock); + DBUG_RETURN(res); +} + + +/* + Check that bitmap pattern is correct for a page + + NOTES + Used in maria_chk + + SYNOPSIS + _ma_check_bitmap_data() + info Maria handler + page_type What kind of page this is + page Adress to page + empty_space Empty space on page + bitmap_pattern Store here the pattern that was in the bitmap for the + page. This is always updated. + + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_check_bitmap_data(MARIA_HA *info, + enum en_page_type page_type, pgcache_page_no_t page, + uint empty_space, uint *bitmap_pattern) +{ + uint bits; + switch (page_type) { + case UNALLOCATED_PAGE: + case MAX_PAGE_TYPE: + bits= 0; + break; + case HEAD_PAGE: + bits= _ma_free_size_to_head_pattern(&info->s->bitmap, empty_space); + break; + case TAIL_PAGE: + bits= free_size_to_tail_pattern(&info->s->bitmap, empty_space); + break; + case BLOB_PAGE: + bits= FULL_TAIL_PAGE; + break; + default: + bits= 0; /* to satisfy compiler */ + DBUG_ASSERT(0); + } + return ((*bitmap_pattern= _ma_bitmap_get_page_bits(info, &info->s->bitmap, + page)) != bits); +} + + +/* + Check if the page type matches the one that we have in the bitmap + + SYNOPSIS + _ma_check_if_right_bitmap_type() + info Maria handler + page_type What kind of page this is + page Adress to page + bitmap_pattern Store here the pattern that was in the bitmap for the + page. This is always updated. 
+ + NOTES + Used in maria_chk + + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info, + enum en_page_type page_type, + pgcache_page_no_t page, + uint *bitmap_pattern) +{ + if ((*bitmap_pattern= _ma_bitmap_get_page_bits(info, &info->s->bitmap, + page)) > 7) + return 1; /* Couldn't read page */ + switch (page_type) { + case HEAD_PAGE: + return *bitmap_pattern < 1 || *bitmap_pattern > 4; + case TAIL_PAGE: + return *bitmap_pattern < 5; + case BLOB_PAGE: + return *bitmap_pattern != 7; + default: + break; + } + DBUG_ASSERT(0); + return 1; +} + + +/** + @brief create the first bitmap page of a freshly created data file + + @param share table's share + + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +int _ma_bitmap_create_first(MARIA_SHARE *share) +{ + uint block_size= share->bitmap.block_size; + File file= share->bitmap.file.file; + uchar marker[CRC_SIZE]; + + /* + Next write operation of the page will write correct CRC + if it is needed + */ + int4store(marker, MARIA_NO_CRC_BITMAP_PAGE); + + if (my_chsize(file, block_size - sizeof(marker), + 0, MYF(MY_WME)) || + my_pwrite(file, marker, sizeof(marker), + block_size - sizeof(marker), + MYF(MY_NABP | MY_WME))) + return 1; + share->state.state.data_file_length= block_size; + _ma_bitmap_delete_all(share); + return 0; +} + + +/** + @brief Pagecache callback to get the TRANSLOG_ADDRESS to flush up to, when a + bitmap page needs to be flushed. + + @param page Page's content + @param page_no Page's number (<offset>/<page length>) + @param data_ptr Callback data pointer (pointer to MARIA_SHARE) + + @retval TRANSLOG_ADDRESS to flush up to. 
+*/ + +static my_bool +flush_log_for_bitmap(uchar *page __attribute__((unused)), + pgcache_page_no_t page_no __attribute__((unused)), + uchar *data_ptr __attribute__((unused))) +{ +#ifndef DBUG_OFF + const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr; +#endif + DBUG_ENTER("flush_log_for_bitmap"); + DBUG_ASSERT(share->now_transactional); + /* + WAL imposes that UNDOs reach disk before bitmap is flushed. We don't know + the LSN of the last UNDO about this bitmap page, so we flush whole log. + */ + DBUG_RETURN(translog_flush(translog_get_horizon())); +} + + +/** + @brief Set callbacks for bitmap pages + + @note + We don't use pagecache_file_init here, as we want to keep the + code readable +*/ + +void _ma_bitmap_set_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share) +{ + file->callback_data= (uchar*) share; + file->flush_log_callback= maria_flush_log_for_page_none; + file->write_fail= maria_page_write_failure; + + if (share->temporary) + { + file->read_callback= &maria_page_crc_check_none; + file->write_callback= &maria_page_filler_set_none; + } + else + { + file->read_callback= &maria_page_crc_check_bitmap; + if (share->options & HA_OPTION_PAGE_CHECKSUM) + file->write_callback= &maria_page_crc_set_normal; + else + file->write_callback= &maria_page_filler_set_bitmap; + if (share->now_transactional) + file->flush_log_callback= flush_log_for_bitmap; + } +} + + +/** + Extends data file with zeroes and creates new bitmap pages into page cache. + + Writes all bitmap pages in [from, to]. + + Non-bitmap pages of zeroes are correct as they are marked empty in + bitmaps. Bitmap pages will not be zeroes: they will get their CRC fixed when + flushed. And if there is a crash before flush (so they are zeroes at + restart), a REDO will re-create them in page cache. 
+*/ + +static my_bool +_ma_bitmap_create_missing_into_pagecache(MARIA_SHARE *share, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t from, + pgcache_page_no_t to, + uchar *zeroes) +{ + pgcache_page_no_t i; + /* + We do not use my_chsize() because there can be a race between when it + reads the physical size and when it writes (assume data_file_length is 10, + physical length is 8 and two data pages are in cache, and here we do a + my_chsize: my_chsize sees physical length is 8, then the two data pages go + to disk then my_chsize writes from page 8 and so overwrites the two data + pages, wrongly). + We instead rely on the filesystem filling gaps with zeroes. + */ + for (i= from; i <= to; i+= bitmap->pages_covered) + { + /** + No need to keep them pinned, they are new so flushable. + @todo but we may want to keep them pinned, as an optimization: if they + are not pinned they may go to disk before the data pages go (so, the + physical pages would be in non-ascending "sparse" order on disk), or the + filesystem may fill gaps with zeroes physically which is a waste of + time. + */ + if (pagecache_write(share->pagecache, + &bitmap->file, i, 0, + zeroes, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE)) + goto err; + } + /* + Data pages after data_file_length are full of zeroes but that is allowed + as they are marked empty in the bitmap. + */ + return FALSE; +err: + return TRUE; +} + + +/** + Creates missing bitmaps when we extend the data file. + + At run-time, when we need a new bitmap page we come here; and only one bitmap + page at a time is created. + + In some recovery cases we insert at a large offset in the data file, way + beyond state.data_file_length, so can need to create more than one bitmap + page in one go. 
Known case is: + Start a transaction in Maria; + delete last row of very large table (with delete_row) + do a bulk insert + crash + Then UNDO_BULK_INSERT will truncate table files, and + UNDO_ROW_DELETE will want to put the row back to its original position, + extending the data file a lot: bitmap page*s* in the hole must be created, + or he table would look corrupted. + + We need to log REDOs for bitmap creation, consider: we apply a REDO for a + data page, which creates the first data page covered by a new bitmap + not yet created. If the data page is flushed but the bitmap page is not and + there is a crash, re-execution of the REDO will complain about the zeroed + bitmap page (see it as corruption). Thus a REDO is needed to re-create the + bitmap. + + @param info Maria handler + @param bitmap Bitmap handler + @param page Last bitmap page to create + + @note When this function is called this must be true: + ((page + 1) * bitmap->block_size > info->s->state.state.data_file_length) + +*/ + +static my_bool _ma_bitmap_create_missing(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page) +{ + MARIA_SHARE *share= info->s; + uint block_size= bitmap->block_size; + pgcache_page_no_t from, to; + my_off_t data_file_length= share->state.state.data_file_length; + DBUG_ENTER("_ma_bitmap_create_missing"); + + /* First (in offset order) bitmap page to create */ + if (data_file_length < block_size) + goto err; /* corrupted, should have first bitmap page */ + + from= (data_file_length / block_size - 1) / bitmap->pages_covered + 1; + from*= bitmap->pages_covered; + /* + page>=from because: + (page + 1) * bs > dfl, and page == k * pc so: + (k * pc + 1) * bs > dfl; k * pc + 1 > dfl / bs; k * pc > dfl / bs - 1 + k > (dfl / bs - 1) / pc; k >= (dfl / bs - 1) / pc + 1 + k * pc >= ((dfl / bs - 1) / pc + 1) * pc == from. 
+ */ + DBUG_ASSERT(page >= from); + + if (share->now_transactional) + { + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + page_store(log_data + FILEID_STORE_SIZE, from); + page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + /* + We don't use info->trn so that this REDO is always executed even though + the UNDO does not reach disk due to crash. This is also consistent with + the fact that the new bitmap pages are not pinned. + */ + if (translog_write_record(&lsn, LOGREC_REDO_BITMAP_NEW_PAGE, + &dummy_transaction_object, info, + (translog_size_t)sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data, NULL)) + goto err; + /* + No need to flush the log: the bitmap pages we are going to create will + flush it when they go to disk. + */ + } + + /* + Last bitmap page. It has special creation: will go to the page cache + only later as we are going to modify it very soon. + */ + bzero(bitmap->map, bitmap->block_size); + bitmap->used_size= 0; +#ifndef DBUG_OFF + memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size); +#endif + + /* Last bitmap page to create before 'page' */ + DBUG_ASSERT(page >= bitmap->pages_covered); + to= page - bitmap->pages_covered; + /* + In run-time situations, from>=to is always false, i.e. we always create + one bitmap at a time ('page'). 
+ */ + if ((from <= to) && + _ma_bitmap_create_missing_into_pagecache(share, bitmap, from, to, + bitmap->map)) + goto err; + + share->state.state.data_file_length= (page + 1) * bitmap->block_size; + + DBUG_RETURN(FALSE); +err: + DBUG_RETURN(TRUE); +} + + +my_bool _ma_apply_redo_bitmap_new_page(MARIA_HA *info, + LSN lsn __attribute__ ((unused)), + const uchar *header) +{ + MARIA_SHARE *share= info->s; + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + my_bool error; + pgcache_page_no_t from, to, min_from; + DBUG_ENTER("_ma_apply_redo_bitmap_new_page"); + + from= page_korr(header); + to= page_korr(header + PAGE_STORE_SIZE); + DBUG_PRINT("info", ("from: %lu to: %lu", (ulong)from, (ulong)to)); + if ((from > to) || + (from % bitmap->pages_covered) != 0 || + (to % bitmap->pages_covered) != 0) + { + error= TRUE; /* corrupted log record */ + goto err; + } + + min_from= (share->state.state.data_file_length / bitmap->block_size - 1) / + bitmap->pages_covered + 1; + min_from*= bitmap->pages_covered; + if (from < min_from) + { + DBUG_PRINT("info", ("overwrite bitmap pages from %lu", (ulong)min_from)); + /* + We have to overwrite. It could be that there was a bitmap page in + memory, covering a data page which went to disk, then crash: the + bitmap page is now full of zeros and is ==min_from, we have to overwrite + it with correct checksum. 
+ */ + } + share->state.changed|= STATE_CHANGED; + bzero(info->buff, bitmap->block_size); + if (!(error= + _ma_bitmap_create_missing_into_pagecache(share, bitmap, from, to, + info->buff))) + share->state.state.data_file_length= (to + 1) * bitmap->block_size; + +err: + DBUG_RETURN(error); +} diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c new file mode 100644 index 00000000000..fd02e2ac0ec --- /dev/null +++ b/storage/maria/ma_blockrec.c @@ -0,0 +1,7404 @@ +/* Copyright (C) 2007-2008 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Storage of records in block + + Some clarifications about the abbrev used: + + NULL fields -> Fields that may have contain a NULL value. + Not null fields -> Fields that may not contain a NULL value. + Critical fields -> Fields that can't be null and can't be dropped without + causing a table reorganization. + + + Maria will have a LSN at start of each page (excluding the bitmap pages) + + The different page types that are in a data file are: + + Bitmap pages Map of free pages in the next extent (8192 page size + gives us 256M of mapped pages / bitmap) + Head page Start of rows are stored on this page. 
A rowid always points to a head page + Blob page This page is totally filled with data from one blob or by + a set of long VARCHAR/CHAR fields + Tail page This contains the last part from different rows, blobs + or varchar fields. + + The data file starts with a bitmap page, followed by as many data + pages as the bitmap can cover. After this there is a new bitmap page + and more data pages etc. + + For information about the bitmap page, see ma_bitmap.c + + Structure of data and tail page: + + The page has a row directory at end of page to allow us to do deletes + without having to reorganize the page. It also allows us to later store + some more bytes after each row to allow them to grow without having to move + around other rows. + + Page header: + + LSN 7 bytes Log position for last page change + PAGE_TYPE 1 uchar 1 for head / 2 for tail / 3 for blob + DIR_COUNT 1 uchar Number of row/tail entries on page + FREE_DIR_LINK 1 uchar Pointer to first free directory entry or 255 if none + empty space 2 bytes Empty space on page + + The most significant bit in PAGE_TYPE is set to 1 if the data on the page + can be compacted to get more space. (PAGE_CAN_BE_COMPACTED) + + Row data + + Row directory of NO entries, that consists of the following for each row + (in reverse order; i.e., first record is stored last): + + Position 2 bytes Position of row on page + Length 2 bytes Length of entry + + For Position and Length, the 1 most significant bit of the position and + the 1 most significant bit of the length could be used for some states of + the row (in other words, we should try to keep these reserved) + + Position is 0 if the entry is not used. In this case length[0] points + to a previous free entry (255 if no previous entry) and length[1] + to the next free entry (or 255 if last free entry). This works because + the directory entry 255 can never be marked free (if the first directory + entry is freed, the directory is shrunk).
+ + checksum 4 bytes Reserved for full page read testing and live backup. + + ---------------- + + Structure of blob pages: + + LSN 7 bytes Log position for last page change + PAGE_TYPE 1 uchar 3 + + data + + ----------------- + + Row data structure: + + Flag 1 uchar Marker of which header field exists + TRANSID 6 bytes TRANSID of changing transaction + (optional, added on insert and first + update/delete) + VER_PTR 7 bytes Pointer to older version in log + (undo record) + (optional, added after first + update/delete) + DELETE_TRANSID 6 bytes (optional). TRANSID of original row. + Added on delete. + Nulls_extended 1 uchar To allow us to add new DEFAULT NULL + fields (optional, added after first + change of row after alter table) + Number of ROW_EXTENT's 1-3 uchar Length encoded, optional + This is the number of extents the + row is split into + First row_extent 7 uchar Pointer to first row extent (optional) + + Total length of length array 1-3 uchar Only used if we have + char/varchar/blob fields. + Row checksum 1 uchar Only if table created with checksums + Null_bits .. One bit for each NULL field (a field that may + have the value NULL) + Empty_bits .. One bit for each field that may be 'empty'. + (Both for null and not null fields). + This bit is 1 if the value for the field is + 0 or empty string. + + field_offsets 2 byte/offset + For each 32'th field, there is one offset + that points to where the field information + starts in the block. This is to provide + fast access to later field in the row + when we only need to return a small + set of fields. + TODO: Implement this. + + Things marked above as 'optional' will only be present if the + corresponding bit is set in 'Flag' field. Flag gives us a way to + get more space on a page when doing page compaction as we don't need + to store TRANSID that have committed before the smallest running + transaction we have in memory. 
+ + Data in the following order: + (Field order is precalculated when table is created) + + Critical fixed length, not null, fields. (Note, these can't be dropped) + Fixed length, null fields + + Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields. + Number of bytes used in length array per entry depends on the max length + for field. + + ROW_EXTENT's + CHAR data (space stripped) + VARCHAR data + BLOB data + + Fields marked in null_bits or empty_bits are not stored in data part or + length array. + + If row doesn't fit into the given block, then the first EXTENT will be + stored last on the row. This is done so that we don't break any field + data in the middle. + + We first try to store the full row into one block. If that's not possible + we move out each big blob into its own extent. If this is not enough we + move out a concatenation of all varchars to their own extent. + + Each blob and the concatenated char/varchar fields are stored in the following + way: + - Store the parts in as many full-contiguous pages as possible. + - The last part, that doesn't fill a full page, is stored in tail page. + + When doing an insert of a new row, we don't have to have + VER_PTR in the row. This will make rows that are not changed stored + efficiently. On update and delete we would add TRANSID (if it was an old + committed row) and VER_PTR to + the row. On row page compaction we can easily detect rows where + TRANSID was committed before the longest running transaction + started and we can then delete TRANSID and VER_PTR from the row to + gain more space. + + If a row is deleted in Maria, we change TRANSID to the deleting + transaction's id, change VER_PTR to point to the undo record for the delete, + and add DELETE_TRANSID (the id of the transaction which last + inserted/updated the row before its deletion).
DELETE_TRANSID allows an old + transaction to avoid reading the log to know if it can see the last version + before delete (in other words it reduces the probability of having to follow + VER_PTR). TODO: depending on a compilation option, evaluate the performance + impact of not storing DELETE_TRANSID (which would make the row smaller). + + Description of the different parts: + + Flag is coded as: + + Description bit + TRANS_ID_exists 0 + VER_PTR_exists 1 + Row is deleted 2 (Means that DELETE_TRANSID exists) + Nulls_extended_exists 3 + Row is split 7 This means that 'Number_of_row_extents' exists + + Nulls_extended is the number of new DEFAULT NULL fields in the row + compared to the number of DEFAULT NULL fields when the first version + of the table was created. If Nulls_extended doesn't exist in the row, + we know it's 0 as this must be one of the original rows from when the + table was created the first time. This coding allows us to add 255*8 = + 2040 new fields without requiring a full alter table. + + Empty_bits is used to allow us to store 0, 0.0, empty string, empty + varstring and empty blob efficiently. (This is very good for data + warehousing where NULL's are often regarded as evil). Having this + bitmap also allows us to drop information of a field during a future + delete if field was deleted with ALTER TABLE DROP COLUMN. To be able + to handle DROP COLUMN, we must store in the index header the fields + that have been dropped. When unpacking a row we will ignore dropped + fields. When storing a row, we will mark a dropped field either with a + null in the null bit map or in the empty_bits and not store any data + for it. + TODO: Add code for handling dropped fields. + + + A ROW EXTENT is a range of pages. One ROW_EXTENT is coded as: + + START_PAGE 5 bytes + PAGE_COUNT 2 bytes. Bit 16 is set if this is a tail page. + Bit 15 is set if this is start of a new + blob extent. + + With 8K pages, we can cover 256M in one extent.
+ This coding gives us a + maximum file size of 2^40*8192 = 8192 tera + + As an example of ROW_EXTENT handling, assume a row with one integer + field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2 + big BLOB fields that we have updated. + + The record format for storing this into an empty file would be: + + Page 1: + + 00 00 00 00 00 00 00 LSN + 01 Only one row in page + FF No free dir entry + xx xx Empty space on page + + 10 Flag: row split, VER_PTR exists + 01 00 00 00 00 00 TRANSID 1 + 00 00 00 00 00 01 00 VER_PTR to first block in LOG file 1 + 5 Number of row extents + 02 00 00 00 00 03 00 VARCHAR's are stored in full pages 2,3,4 + 0 No null fields + 0 No empty fields + 05 00 00 00 00 00 80 Tail page for VARCHAR, rowid 0 + 06 00 00 00 00 80 00 First blob, stored at page 6-133 + 05 00 00 00 00 01 80 Tail of first blob (896 bytes) at page 5 + 86 00 00 00 00 80 00 Second blob, stored at page 134-262 + 05 00 00 00 00 02 80 Tail of second blob (896 bytes) at page 5 + 05 00 5 integer + FA Length of first varchar field (size 250) + 00 60 Length of second varchar field (size 8192*3) + 00 60 10 First medium BLOB, 1M + 01 00 10 00 Second BLOB, 1M + xx xx xx xx xx xx Varchars are stored here until end of page + + ..... until end of page + + 09 00 F4 1F Start position 9, length 8180 + xx xx xx xx Checksum + + A data page is allowed to have a wrong CRC and header as long as it is + marked empty in the bitmap and its directory's count is 0. +*/ + +#include "maria_def.h" +#include "ma_blockrec.h" +#include "trnman.h" +#include "ma_key_recover.h" +#include "ma_recovery_util.h" +#include <lf.h> + +/* + Struct for having a cursor over a set of extents. + This is used to loop over all extents for a row when reading + the row data. It's also used to store the tail positions for + a read row to be used by a later update/delete command. +*/ + +typedef struct st_maria_extent_cursor +{ + /* + Pointer to packed uchar array of extents for the row. 
+ Format is described above in the header + */ + uchar *extent; + /* Where data starts on page; Only for debugging */ + uchar *data_start; + /* Positions of all tails in the row. Updated when reading a row */ + MARIA_RECORD_POS *tail_positions; + /* Current page */ + pgcache_page_no_t page; + /* How many pages in the page region */ + uint page_count; + /* What kind of lock to use for tail pages */ + enum pagecache_page_lock lock_for_tail_pages; + /* Total number of extents (i.e., entries in the 'extent' slot) */ + uint extent_count; + /* Non-zero if current extent is a tail page; Set while using cursor */ + uint tail; + /* Position for tail on tail page */ + uint tail_row_nr; + /* + == 1 if we are working on the first extent (i.e., the one that is stored in + the row header, not an extent that is stored as part of the row data). + */ + my_bool first_extent; +} MARIA_EXTENT_CURSOR; + + +/** + @brief Structure for passing down info to write_hook_for_clr_end(). + This hook needs to know the variation of the live checksum caused by the + current operation to update state.checksum under log's mutex, + needs to know the transaction's previous undo_lsn to set + trn->undo_lsn under log mutex, and needs to know the type of UNDO being + undone now to modify state.records under log mutex. 
+*/ + +/** S:share,D:checksum_delta,E:expression,P:pointer_into_record,L:length */ +#define store_checksum_in_rec(S,D,E,P,L) do \ + { \ + D= 0; \ + if ((S)->calc_checksum != NULL) \ + { \ + D= (E); \ + ha_checksum_store(P, D); \ + L+= HA_CHECKSUM_STORE_SIZE; \ + } \ + } while (0) + + +static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails); +static my_bool delete_head_or_tail(MARIA_HA *info, + pgcache_page_no_t page, uint record_number, + my_bool head, my_bool from_update); +#ifndef DBUG_OFF +static void _ma_print_directory(FILE *file, uchar *buff, uint block_size); +#endif +static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block, + uint block_size, ulong length, + uint *tot_ranges); +static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record, + LEX_CUSTRING *log_parts, + uint *log_parts_count); +static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec, + const uchar *newrec, + LEX_CUSTRING *log_parts, + uint *log_parts_count); + +/**************************************************************************** + Initialization +****************************************************************************/ + +/* + Initialize data needed for block structures +*/ + + +/* Size of the different header elements for a row */ + +static uchar header_sizes[]= +{ + TRANSID_SIZE, + VERPTR_SIZE, + TRANSID_SIZE, /* Delete transid */ + 1 /* Null extends */ +}; + +/* + Calculate array of all used headers + + Used to speed up: + + size= 1; + if (flag & 1) + size+= TRANSID_SIZE; + if (flag & 2) + size+= VERPTR_SIZE; + if (flag & 4) + size+= TRANSID_SIZE + if (flag & 8) + size+= 1; + + NOTES + This is called only once at startup of Maria +*/ + +static uchar total_header_size[1 << array_elements(header_sizes)]; +#define PRECALC_HEADER_BITMASK (array_elements(total_header_size) -1) + +void _ma_init_block_record_data(void) +{ + uint i; + bzero(total_header_size, sizeof(total_header_size)); + total_header_size[0]= FLAG_SIZE; /* Flag 
uchar */ + for (i= 1; i < array_elements(total_header_size); i++) + { + uint size= FLAG_SIZE, j, bit; + for (j= 0; (bit= (1 << j)) <= i; j++) + { + if (i & bit) + size+= header_sizes[j]; + } + total_header_size[i]= size; + } +} + + +my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file) +{ + + share->base.max_data_file_length= + (((ulonglong) 1 << ((share->base.rec_reflength-1)*8))-1) * + share->block_size; +#if SIZEOF_OFF_T == 4 + set_if_smaller(share->base.max_data_file_length, INT_MAX32); +#endif + return _ma_bitmap_init(share, data_file); +} + + +my_bool _ma_once_end_block_record(MARIA_SHARE *share) +{ + int res= _ma_bitmap_end(share); + if (share->bitmap.file.file >= 0) + { + if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file, + ((share->temporary || share->deleting) ? + FLUSH_IGNORE_CHANGED : + FLUSH_RELEASE))) + res= 1; + /* + File must be synced as it is going out of the maria_open_list and so + becoming unknown to Checkpoint. + */ + if (share->now_transactional && + my_sync(share->bitmap.file.file, MYF(MY_WME))) + res= 1; + if (my_close(share->bitmap.file.file, MYF(MY_WME))) + res= 1; + /* + Trivial assignment to guard against multiple invocations + (May happen if file are closed but we want to keep the maria object + around a bit longer) + */ + share->bitmap.file.file= -1; + } + if (share->id != 0) + { + /* + We de-assign the id even though index has not been flushed, this is ok + as close_lock serializes us with a Checkpoint looking at our share. 
+ */ + translog_deassign_id_from_share(share); + } + return res; +} + + +/* Init info->cur_row structure */ + +my_bool _ma_init_block_record(MARIA_HA *info) +{ + MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row; + MARIA_SHARE *share= info->s; + uint default_extents; + DBUG_ENTER("_ma_init_block_record"); + + if (!my_multi_malloc(MY_WME, + &row->empty_bits, share->base.pack_bytes, + &row->field_lengths, + share->base.max_field_lengths + 2, + &row->blob_lengths, sizeof(ulong) * share->base.blobs, + &row->null_field_lengths, (sizeof(uint) * + (share->base.fields - + share->base.blobs + + EXTRA_LENGTH_FIELDS)), + &row->tail_positions, (sizeof(MARIA_RECORD_POS) * + (share->base.blobs + 2)), + &new_row->empty_bits, share->base.pack_bytes, + &new_row->field_lengths, + share->base.max_field_lengths + 2, + &new_row->blob_lengths, + sizeof(ulong) * share->base.blobs, + &new_row->null_field_lengths, (sizeof(uint) * + (share->base.fields - + share->base.blobs + + EXTRA_LENGTH_FIELDS)), + &info->log_row_parts, + sizeof(*info->log_row_parts) * + (TRANSLOG_INTERNAL_PARTS + 3 + + share->base.fields + 3), + &info->update_field_data, + (share->base.fields * 4 + + share->base.max_field_lengths + 1 + 4), + NullS, 0)) + DBUG_RETURN(1); + /* Skip over bytes used to store length of field length for logging */ + row->field_lengths+= 2; + new_row->field_lengths+= 2; + + /* Reserve some initial space to avoid mallocs during execution */ + default_extents= (ELEMENTS_RESERVED_FOR_MAIN_PART + 1 + + (AVERAGE_BLOB_SIZE / + FULL_PAGE_SIZE(share->block_size) / + BLOB_SEGMENT_MIN_SIZE)); + + if (my_init_dynamic_array(&info->bitmap_blocks, + sizeof(MARIA_BITMAP_BLOCK), default_extents, + 64)) + goto err; + info->cur_row.extents_buffer_length= default_extents * ROW_EXTENT_SIZE; + if (!(info->cur_row.extents= my_malloc(info->cur_row.extents_buffer_length, + MYF(MY_WME)))) + goto err; + + info->row_base_length= share->base_length; + info->row_flag= share->base.default_row_flag; + + /* + We need 
to reserve 'EXTRA_LENGTH_FIELDS' number of parts in + null_field_lengths to allow splitting of rows in 'find_where_to_split_row' + */ + row->null_field_lengths+= EXTRA_LENGTH_FIELDS; + new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS; + + DBUG_RETURN(0); + +err: + _ma_end_block_record(info); + DBUG_RETURN(1); +} + + +void _ma_end_block_record(MARIA_HA *info) +{ + DBUG_ENTER("_ma_end_block_record"); + my_free(info->cur_row.empty_bits, MYF(MY_ALLOW_ZERO_PTR)); + delete_dynamic(&info->bitmap_blocks); + my_free(info->cur_row.extents, MYF(MY_ALLOW_ZERO_PTR)); + my_free(info->blob_buff, MYF(MY_ALLOW_ZERO_PTR)); + /* + The data file is closed, when needed, in ma_once_end_block_record(). + The following protects us from doing an extra, not allowed, close + in maria_close() + */ + info->dfile.file= -1; + DBUG_VOID_RETURN; +} + + +/**************************************************************************** + Helper functions +****************************************************************************/ + +/* + Return the next unused postion on the page after a directory entry. + + SYNOPSIS + start_of_next_entry() + dir Directory entry to be used. This can not be the + the last entry on the page! + + RETURN + # Position in page where next entry starts. + Everything between the '*dir' and this are free to be used. +*/ + +static inline uint start_of_next_entry(uchar *dir) +{ + uchar *prev; + /* + Find previous used entry. (There is always a previous entry as + the directory never starts with a deleted entry) + */ + for (prev= dir - DIR_ENTRY_SIZE ; + prev[0] == 0 && prev[1] == 0 ; + prev-= DIR_ENTRY_SIZE) + {} + return (uint) uint2korr(prev); +} + + +/* + Return the offset where the previous entry ends (before on page) + + SYNOPSIS + end_of_previous_entry() + dir Address for current directory entry + end Address to last directory entry + + RETURN + # Position where previous entry ends (smallest address on page) + Everything between # and current entry are free to be used. 
+*/ + + +static inline uint end_of_previous_entry(uchar *dir, uchar *end) +{ + uchar *pos; + for (pos= dir + DIR_ENTRY_SIZE ; pos < end ; pos+= DIR_ENTRY_SIZE) + { + uint offset; + if ((offset= uint2korr(pos))) + return offset + uint2korr(pos+2); + } + return PAGE_HEADER_SIZE; +} + + +#ifndef DBUG_OFF + +static void _ma_print_directory(FILE *file, uchar *buff, uint block_size) +{ + uint max_entry= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET], row= 0; + uint end_of_prev_row= PAGE_HEADER_SIZE; + uchar *dir, *end; + + dir= dir_entry_pos(buff, block_size, max_entry-1); + end= dir_entry_pos(buff, block_size, 0); + + DBUG_LOCK_FILE; /* If using DBUG_FILE */ + fprintf(file,"Directory dump (pos:length):\n"); + + for (row= 1; dir <= end ; end-= DIR_ENTRY_SIZE, row++) + { + uint offset= uint2korr(end); + uint length= uint2korr(end+2); + fprintf(file, " %4u:%4u", offset, offset ? length : 0); + if (!(row % (80/12))) + fputc('\n', file); + if (offset) + { + DBUG_ASSERT(offset >= end_of_prev_row); + end_of_prev_row= offset + length; + } + } + fputc('\n', file); + fflush(file); + DBUG_UNLOCK_FILE; +} + + +static void check_directory(uchar *buff, uint block_size, uint min_row_length, + uint real_empty_size) +{ + uchar *dir, *end; + uint max_entry= (uint) buff[DIR_COUNT_OFFSET]; + uint start_of_dir, deleted; + uint end_of_prev_row= PAGE_HEADER_SIZE; + uint empty_size_on_page; + uint empty_size; + uchar free_entry, prev_free_entry; + + dir= dir_entry_pos(buff, block_size, max_entry-1); + start_of_dir= (uint) (dir - buff); + end= dir_entry_pos(buff, block_size, 0); + deleted= empty_size= 0; + + empty_size_on_page= (real_empty_size != (uint) -1 ? 
real_empty_size : + uint2korr(buff + EMPTY_SPACE_OFFSET)); + + /* Ensure that all rows are in increasing order and no overlaps */ + for (; dir <= end ; end-= DIR_ENTRY_SIZE) + { + uint offset= uint2korr(end); + uint length= uint2korr(end+2); + if (offset) + { + DBUG_ASSERT(offset >= end_of_prev_row); + DBUG_ASSERT(!length || length >= min_row_length); + empty_size+= offset - end_of_prev_row; + end_of_prev_row= offset + length; + } + else + deleted++; + } + empty_size+= start_of_dir - end_of_prev_row; + DBUG_ASSERT(end_of_prev_row <= start_of_dir); + DBUG_ASSERT(empty_size == empty_size_on_page); + + /* check free links */ + free_entry= buff[DIR_FREE_OFFSET]; + prev_free_entry= END_OF_DIR_FREE_LIST; + while (free_entry != END_OF_DIR_FREE_LIST) + { + uchar *dir= dir_entry_pos(buff, block_size, free_entry); + DBUG_ASSERT(dir[0] == 0 && dir[1] == 0); + DBUG_ASSERT(dir[2] == prev_free_entry); + prev_free_entry= free_entry; + free_entry= dir[3]; + deleted--; + } + DBUG_ASSERT(deleted == 0); +} +#else +#define check_directory(A,B,C,D) +#endif /* DBUG_OFF */ + + +/** + @brief Calculate if there is enough entries on the page +*/ + +static my_bool enough_free_entries(uchar *buff, uint block_size, + uint wanted_entries) +{ + uint entries= (uint) buff[DIR_COUNT_OFFSET]; + uint needed_free_entries, free_entry; + + if (entries + wanted_entries <= MAX_ROWS_PER_PAGE) + return 1; + + /* Check if enough free entries in free list */ + needed_free_entries= entries + wanted_entries - MAX_ROWS_PER_PAGE; + + free_entry= (uint) buff[DIR_FREE_OFFSET]; + while (free_entry != END_OF_DIR_FREE_LIST) + { + uchar *dir; + if (!--needed_free_entries) + return 1; + dir= dir_entry_pos(buff, block_size, free_entry); + free_entry= dir[3]; + } + return 0; /* Not enough entries */ +} + + +/** + @brief Check if there is room for more rows on page + + @fn enough_free_entries_on_page + + @return 0 Directory is full + @return 1 There is room for more entries on the page +*/ + +my_bool 
enough_free_entries_on_page(MARIA_SHARE *share, + uchar *page_buff) +{ + enum en_page_type page_type; + page_type= (enum en_page_type) (page_buff[PAGE_TYPE_OFFSET] & + ~(uchar) PAGE_CAN_BE_COMPACTED); + + if (page_type == HEAD_PAGE) + { + uint row_count= (uint) page_buff[DIR_COUNT_OFFSET]; + return !(row_count == MAX_ROWS_PER_PAGE && + page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST); + } + return enough_free_entries(page_buff, share->block_size, + 1 + share->base.blobs); +} + + +/** + @brief Extend a record area to fit a given size block + + @fn extend_area_on_page() + @param info Handler if head page and 0 if tail page + @param buff Page buffer + @param dir Pointer to dir entry in buffer + @param rownr Row number we working on + @param block_size Block size of buffer + @param request_length How much data we want to put at [dir] + @param empty_space Total empty space in buffer + This is updated with length after dir + is allocated and current block freed + + @implementation + The logic is as follows (same as in _ma_update_block_record()) + - If new data fits in old block, use old block. + - Extend block with empty space before block. If enough, use it. + - Extend block with empty space after block. If enough, use it. + - Use _ma_compact_block_page() to get all empty space at dir. + + @note + The given directory entry is set to rec length. 
+ empty_space doesn't include the new directory entry + + + @return + @retval 0 ok + @retval ret_offset Pointer to store offset to found area + @retval ret_length Pointer to store length of found area + @retval [dir] rec_offset is store here too + + @retval 1 error (wrong info in block) +*/ + +static my_bool extend_area_on_page(MARIA_HA *info, + uchar *buff, uchar *dir, + uint rownr, uint block_size, + uint request_length, + uint *empty_space, uint *ret_offset, + uint *ret_length) +{ + uint rec_offset, length, org_rec_length; + uint max_entry= (uint) buff[DIR_COUNT_OFFSET]; + DBUG_ENTER("extend_area_on_page"); + + /* + We can't check for min length here as we may have called + extend_directory() to create a new (empty) entry just before + */ + check_directory(buff, block_size, 0, *empty_space); + + rec_offset= uint2korr(dir); + if (rec_offset) + { + /* Extending old row; Mark current space as 'free' */ + length= org_rec_length= uint2korr(dir + 2); + DBUG_PRINT("info", ("rec_offset: %u length: %u request_length: %u " + "empty_space: %u", + rec_offset, org_rec_length, request_length, + *empty_space)); + + *empty_space+= org_rec_length; + } + else + { + /* Reusing free directory entry; Free it from the directory list */ + if (dir[2] == END_OF_DIR_FREE_LIST) + buff[DIR_FREE_OFFSET]= dir[3]; + else + { + uchar *prev_dir= dir_entry_pos(buff, block_size, (uint) dir[2]); + DBUG_ASSERT(uint2korr(prev_dir) == 0 && prev_dir[3] == (uchar) rownr); + prev_dir[3]= dir[3]; + } + if (dir[3] != END_OF_DIR_FREE_LIST) + { + uchar *next_dir= dir_entry_pos(buff, block_size, (uint) dir[3]); + DBUG_ASSERT(uint2korr(next_dir) == 0 && next_dir[2] == (uchar) rownr); + next_dir[2]= dir[2]; + } + rec_offset= start_of_next_entry(dir); + length= 0; + } + if (length < request_length) + { + uint old_rec_offset; + /* + New data did not fit in old position. + Find first possible position where to put new data. 
+ */ + old_rec_offset= rec_offset; + rec_offset= end_of_previous_entry(dir, buff + block_size - + PAGE_SUFFIX_SIZE); + length+= (uint) (old_rec_offset - rec_offset); + DBUG_ASSERT(old_rec_offset); + /* + 'length' is 0 if we are doing an insert into a not allocated block. + This can only happen during "REDO of INSERT" or "UNDO of DELETE." + */ + if (length < request_length) + { + /* + Did not fit in current block + empty space. Extend with + empty space after block. + */ + if (rownr == max_entry - 1) + { + /* Last entry; Everything is free between this and directory */ + length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) - + rec_offset); + } + else + length= start_of_next_entry(dir) - rec_offset; + DBUG_ASSERT((int) length >= 0); + if (length < request_length) + { + /* Not enough continuous space, compact page to get more */ + int2store(dir, rec_offset); + /* Reset length, as this may be a deleted block */ + int2store(dir+2, 0); + _ma_compact_block_page(buff, block_size, rownr, 1, + info ? info->trn->min_read_from: 0, + info ? info->s->base.min_block_length : 0); + rec_offset= uint2korr(dir); + length= uint2korr(dir+2); + if (length < request_length) + { + DBUG_PRINT("error", ("Not enough space: " + "length: %u request_length: %u", + length, request_length)); + my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */ + DBUG_ASSERT(0); /* For debugging */ + DBUG_RETURN(1); /* Error in block */ + } + *empty_space= length; /* All space is here */ + } + } + } + int2store(dir, rec_offset); + int2store(dir + 2, length); + *ret_offset= rec_offset; + *ret_length= length; + + check_directory(buff, block_size, info ? info->s->base.min_block_length : 0, + *empty_space - length); + DBUG_RETURN(0); +} + + +/** + @brief Copy not changed fields from 'from' to 'to' + + @notes + Assumption is that most fields are not changed! 
+ (Which is why we don't test if all bits are set for some bytes in bitmap) +*/ + +void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields, + uchar *to, uchar *from) +{ + MARIA_COLUMNDEF *column, *end_column; + uchar *bitmap= (uchar*) changed_fields->bitmap; + MARIA_SHARE *share= info->s; + uint bit= 1; + + for (column= share->columndef, end_column= column+ share->base.fields; + column < end_column; column++) + { + if (!(*bitmap & bit)) + { + uint field_length= column->length; + if (column->type == FIELD_VARCHAR) + { + if (column->fill_length == 1) + field_length= (uint) from[column->offset] + 1; + else + field_length= uint2korr(from + column->offset) + 2; + } + memcpy(to + column->offset, from + column->offset, field_length); + } + if ((bit= (bit << 1)) == 256) + { + bitmap++; + bit= 1; + } + } +} + +#ifdef NOT_YET_NEEDED +/* Calculate empty space on a page */ + +static uint empty_space_on_page(uchar *buff, uint block_size) +{ + enum en_page_type; + page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] & + ~(uchar) PAGE_CAN_BE_COMPACTED); + if (page_type == UNALLOCATED_PAGE) + return block_size; + if ((uint) page_type <= TAIL_PAGE) + return uint2korr(buff+EMPTY_SPACE_OFFSET); + return 0; /* Blob page */ +} +#endif + + +/* + @brief Ensure we have space for new directory entries + + @fn make_space_for_directory() + @param buff Page buffer + @param block_size Block size for pages + @param max_entry Number of current entries in directory + @param count Number of new entries to be added to directory + @param first_dir First directory entry on page + @param empty_space Total empty space in buffer. 
It's updated + to reflect the new empty space + @param first_pos Store position to last data byte on page here + + @note + This function is inline as the argument passing is the biggest + part of the function + + @return + @retval 0 ok + @retval 1 error (No data on page, fatal error) +*/ + +static inline my_bool +make_space_for_directory(MARIA_HA *info, + uchar *buff, uint block_size, uint max_entry, + uint count, uchar *first_dir, uint *empty_space, + uint *first_pos) +{ + uint length_needed= DIR_ENTRY_SIZE * count; + + /* + The following is not true only in the case and UNDO is used to reinsert + a row on a previously not used page + */ + if (likely(max_entry)) + { + /* Check if there is place for the directory entry on the page */ + *first_pos= uint2korr(first_dir) + uint2korr(first_dir + 2); + + if ((uint) (first_dir - buff) < *first_pos + length_needed) + { + /* Create place for directory */ + _ma_compact_block_page(buff, block_size, max_entry - 1, 0, + info ? info->trn->min_read_from : 0, + info ? info->s->base.min_block_length : 0); + *first_pos= (uint2korr(first_dir) + uint2korr(first_dir + 2)); + *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + if (*empty_space < length_needed) + { + /* + We should always have space, as we only come here for + UNDO of DELETE (in which case we know the row was on the + page before) or if the bitmap told us there was space on page + */ + DBUG_ASSERT(0); + return(1); + } + } + } + else + *first_pos= PAGE_HEADER_SIZE; + + /* Reduce directory entry size from free space size */ + (*empty_space)-= length_needed; + buff[DIR_COUNT_OFFSET]= (uchar) (max_entry + count); + return(0); +} + + +/* + Find free position in directory + + SYNOPSIS + find_free_position() + info Handler if head page and 0 otherwise + buff Page + block_size Size of page + res_rownr Store index to free position here + res_length Store length of found segment here + empty_space Store length of empty space on disk here. 
This is + all empty space, including the found block. + + NOTES + If there is a free directory entry (entry with position == 0), + then use it and change it to be the size of the empty block + after the previous entry. This guarantees that all row entries + are stored on disk in inverse directory order, which makes life easier for + '_ma_compact_block_page()' and to know if there is free space after any + block. + + If there is no free entry (entry with position == 0), then we create + a new one. If there is not space for the directory entry (because + the last block overlapps with the directory), we compact the page. + + We will update the offset and the length of the found dir entry to + match the position and empty space found. + + buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller + + See start of file for description of how free directory entires are linked + + RETURN + 0 Error (directory full or last block goes over directory) + # Pointer to directory entry on page +*/ + +static uchar *find_free_position(MARIA_HA *info, + uchar *buff, uint block_size, uint *res_rownr, + uint *res_length, uint *empty_space) +{ + uint max_entry, free_entry; + uint length, first_pos; + uchar *dir, *first_dir; + DBUG_ENTER("find_free_position"); + + max_entry= (uint) buff[DIR_COUNT_OFFSET]; + free_entry= (uint) buff[DIR_FREE_OFFSET]; + *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + + DBUG_PRINT("info", ("max_entry: %u free_entry: %u", max_entry, free_entry)); + + first_dir= dir_entry_pos(buff, block_size, max_entry - 1); + + /* Search after first free position */ + if (free_entry != END_OF_DIR_FREE_LIST) + { + if (free_entry >= max_entry) + DBUG_RETURN(0); /* Consistency error */ + dir= dir_entry_pos(buff, block_size, free_entry); + DBUG_ASSERT(uint2korr(dir) == 0 && dir[2] == END_OF_DIR_FREE_LIST); + /* Relink free list */ + if ((buff[DIR_FREE_OFFSET]= dir[3]) != END_OF_DIR_FREE_LIST) + { + uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]); 
+ DBUG_ASSERT((uint) next_entry[2] == free_entry && + uint2korr(next_entry) == 0); + next_entry[2]= END_OF_DIR_FREE_LIST; /* Backlink */ + } + + first_pos= end_of_previous_entry(dir, buff + block_size - + PAGE_SUFFIX_SIZE); + length= start_of_next_entry(dir) - first_pos; + int2store(dir, first_pos); /* Update dir entry */ + int2store(dir + 2, 0); + *res_rownr= free_entry; + *res_length= length; + + check_directory(buff, block_size, + info ? info->s->base.min_block_length : 0, (uint) -1); + DBUG_RETURN(dir); + } + /* No free places in dir; create a new one */ + + /* Check if there is place for the directory entry */ + if (max_entry == MAX_ROWS_PER_PAGE) + DBUG_RETURN(0); + + if (make_space_for_directory(info, buff, block_size, max_entry, 1, + first_dir, empty_space, &first_pos)) + DBUG_RETURN(0); + + dir= first_dir - DIR_ENTRY_SIZE; + length= (uint) (dir - buff - first_pos); + DBUG_ASSERT(length <= *empty_space); + int2store(dir, first_pos); + int2store(dir + 2, 0); /* Max length of region */ + *res_rownr= max_entry; + *res_length= length; + + check_directory(buff, block_size, info ? info->s->base.min_block_length : 0, + *empty_space); + DBUG_RETURN(dir); +} + + +/** + @brief Enlarge page directory to hold more entries + + @fn extend_directory() + @param info Handler if head page and 0 otherwise + @param buff Page buffer + @param block_size Block size + @param max_entry Number of directory entries on page + @param new_entry Position for new entry + @param empty_space Total empty space in buffer. 
It's updated + to reflect the new empty space + + @note + This is only called on UNDO when we want to expand the directory + to be able to re-insert row in a given position + + The new directory entry will be set to cover the maximum possible space + + @return + @retval 0 ok + @retval 1 error (No data on page, fatal error) +*/ + +static my_bool extend_directory(MARIA_HA *info, uchar *buff, uint block_size, + uint max_entry, uint new_entry, + uint *empty_space) +{ + uint length, first_pos; + uchar *dir, *first_dir; + DBUG_ENTER("extend_directory"); + + /* + Note that in if max_entry is 0, then first_dir will point to + an illegal directory entry. This is ok, as in this case we will + not access anything through first_dir. + */ + first_dir= dir_entry_pos(buff, block_size, max_entry) + DIR_ENTRY_SIZE; + + if (make_space_for_directory(info, buff, block_size, max_entry, + new_entry - max_entry + 1, + first_dir, empty_space, &first_pos)) + DBUG_RETURN(1); + + /* Set the new directory entry to cover the max possible length */ + dir= first_dir - DIR_ENTRY_SIZE * (new_entry - max_entry + 1); + length= (uint) (dir - buff - first_pos); + int2store(dir, first_pos); + int2store(dir+2, length); + *empty_space-= length; + + if (new_entry-- > max_entry) + { + /* Link all row entries between new_entry and max_entry into free list */ + uint free_entry= (uint) buff[DIR_FREE_OFFSET]; + uint prev_entry= END_OF_DIR_FREE_LIST; + buff[DIR_FREE_OFFSET]= new_entry; + do + { + dir+= DIR_ENTRY_SIZE; + dir[0]= dir[1]= 0; + dir[2]= (uchar) prev_entry; + dir[3]= (uchar) new_entry-1; + prev_entry= new_entry; + } while (new_entry-- > max_entry); + if ((dir[3]= free_entry) != END_OF_DIR_FREE_LIST) + { + /* Relink next entry to point to newly freed entry */ + uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]); + DBUG_ASSERT(uint2korr(next_entry) == 0 && + next_entry[2] == END_OF_DIR_FREE_LIST); + next_entry[2]= max_entry; + } + } + + check_directory(buff, block_size, + info ? 
min(info->s->base.min_block_length, length) : 0, + *empty_space); + DBUG_RETURN(0); +} + + +/**************************************************************************** + Updating records +****************************************************************************/ + +/* + Calculate length of all the different field parts + + SYNOPSIS + calc_record_size() + info Maria handler + record Row to store + row Store statistics about row here + + NOTES + The statistics is used to find out how much space a row will need + and also where we can split a row when we need to split it into several + extents. +*/ + +static void calc_record_size(MARIA_HA *info, const uchar *record, + MARIA_ROW *row) +{ + MARIA_SHARE *share= info->s; + uchar *field_length_data; + MARIA_COLUMNDEF *column, *end_column; + uint *null_field_lengths= row->null_field_lengths; + ulong *blob_lengths= row->blob_lengths; + DBUG_ENTER("calc_record_size"); + + row->normal_length= row->char_length= row->varchar_length= + row->blob_length= row->extents_count= 0; + + /* Create empty bitmap and calculate length of each varlength/char field */ + bzero(row->empty_bits, share->base.pack_bytes); + field_length_data= row->field_lengths; + for (column= share->columndef + share->base.fixed_not_null_fields, + end_column= share->columndef + share->base.fields; + column < end_column; column++, null_field_lengths++) + { + if ((record[column->null_pos] & column->null_bit)) + { + if (column->type != FIELD_BLOB) + *null_field_lengths= 0; + else + *blob_lengths++= 0; + continue; + } + switch (column->type) { + case FIELD_CHECK: + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_ZERO: + DBUG_ASSERT(column->empty_bit == 0); + /* fall through */ + case FIELD_SKIP_PRESPACE: /* Not packed */ + row->normal_length+= column->length; + *null_field_lengths= column->length; + break; + case FIELD_SKIP_ZERO: /* Fixed length field */ + if (memcmp(record+ column->offset, maria_zero_string, + column->length) == 0) + { + 
row->empty_bits[column->empty_pos] |= column->empty_bit; + *null_field_lengths= 0; + } + else + { + row->normal_length+= column->length; + *null_field_lengths= column->length; + } + break; + case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + const uchar *pos, *end; + for (pos= record + column->offset, end= pos + column->length; + end > pos && end[-1] == ' '; end--) + ; + if (pos == end) /* If empty string */ + { + row->empty_bits[column->empty_pos]|= column->empty_bit; + *null_field_lengths= 0; + } + else + { + uint length= (uint) (end - pos); + if (column->length <= 255) + *field_length_data++= (uchar) length; + else + { + int2store(field_length_data, length); + field_length_data+= 2; + } + row->char_length+= length; + *null_field_lengths= length; + } + break; + } + case FIELD_VARCHAR: + { + uint length, field_length_data_length; + const uchar *field_pos= record + column->offset; + + /* 256 is correct as this includes the length uchar */ + field_length_data[0]= field_pos[0]; + if (column->length <= 256) + { + length= (uint) (uchar) *field_pos; + field_length_data_length= 1; + } + else + { + length= uint2korr(field_pos); + field_length_data[1]= field_pos[1]; + field_length_data_length= 2; + } + *null_field_lengths= length; + if (!length) + { + row->empty_bits[column->empty_pos]|= column->empty_bit; + break; + } + row->varchar_length+= length; + *null_field_lengths= length; + field_length_data+= field_length_data_length; + break; + } + case FIELD_BLOB: + { + const uchar *field_pos= record + column->offset; + uint size_length= column->length - portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(size_length, field_pos); + + *blob_lengths++= blob_length; + if (!blob_length) + row->empty_bits[column->empty_pos]|= column->empty_bit; + else + { + row->blob_length+= blob_length; + memcpy(field_length_data, field_pos, size_length); + field_length_data+= size_length; + } + break; + } + default: + DBUG_ASSERT(0); + } + } + row->field_lengths_length= (uint) 
(field_length_data - row->field_lengths); + /* + - info->row_base_length is base information we must have on a page in first + extent: + - flag byte (1) + is_nulls_extended (0 | 1) + null_bytes + pack_bytes + + table_checksum (0 | 1) + - row->min_length is minimum amount of data we must store on + a page. bitmap code will ensure we get at list this much + + total number of extents and one extent information + - fixed_not_null_fields_length is length of fixed length fields that can't + be compacted + - head_length is the amount of data for the head page + (ie, all fields except blobs) + */ + row->min_length= (info->row_base_length + + (share->base.max_field_lengths ? + size_to_store_key_length(row->field_lengths_length) : + 0)); + row->head_length= (row->min_length + + share->base.fixed_not_null_fields_length + + row->field_lengths_length + + row->normal_length + + row->char_length + row->varchar_length); + row->total_length= (row->head_length + row->blob_length); + if (row->total_length < share->base.min_block_length) + row->total_length= share->base.min_block_length; + DBUG_PRINT("exit", ("head_length: %lu total_length: %lu", + (ulong) row->head_length, (ulong) row->total_length)); + DBUG_VOID_RETURN; +} + + +/** + Compact page by removing all space between rows + + Moves up all rows to start of page. Moves blocks that are directly after + each other with one memmove. + + @note if rownr is the last row in the page, and extend_block is false, + caller has to make sure to update bitmap page afterwards to reflect freed + space. + + @param buff Page to compact + @param block_size Size of page + @param rownr Put empty data after this row + @param extend_block If 1, extend the block at 'rownr' to cover the + whole block. 
+ @param min_read_from If <> 0, remove all trid's that are less than this +*/ + +void _ma_compact_block_page(uchar *buff, uint block_size, uint rownr, + my_bool extend_block, TrID min_read_from, + uint min_row_length) +{ + uint max_entry= (uint) buff[DIR_COUNT_OFFSET]; + uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block; + uint freed_size= 0; + uchar *dir, *end; + DBUG_ENTER("_ma_compact_block_page"); + DBUG_PRINT("enter", ("rownr: %u min_read_from: %lu", rownr, + (ulong) min_read_from)); + DBUG_ASSERT(max_entry > 0 && + max_entry < (block_size - PAGE_HEADER_SIZE - + PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE); + + /* Move all entries before and including rownr up to start of page */ + dir= dir_entry_pos(buff, block_size, rownr); + end= dir_entry_pos(buff, block_size, 0); + page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE; + diff= 0; + for (; dir <= end ; end-= DIR_ENTRY_SIZE) + { + uint offset= uint2korr(end); + + if (offset) + { + uint row_length= uint2korr(end + 2); + DBUG_ASSERT(offset >= page_pos); + DBUG_ASSERT(buff + offset + row_length <= dir); + DBUG_ASSERT(row_length >= min_row_length || row_length == 0); + + /* Row length can be zero if row is to be deleted */ + if (min_read_from && row_length && (buff[offset] & ROW_FLAG_TRANSID)) + { + TrID transid= transid_korr(buff+offset+1); + if (transid < min_read_from) + { + /* Remove transid from row by moving the start point of the row up */ + buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID; + offset+= TRANSID_SIZE; + freed_size+= TRANSID_SIZE; + row_length-= TRANSID_SIZE; + int2store(end+2, row_length); + } + } + + if (offset != next_free_pos) + { + uint length= (next_free_pos - start_of_found_block); + /* + There was empty space before this and prev block + Check if we have to move previous block up to page start + */ + if (page_pos != start_of_found_block) + { + /* move up previous block */ + memmove(buff + page_pos, buff + start_of_found_block, length); + } + 
page_pos+= length; + /* next continuous block starts here */ + start_of_found_block= offset; + diff= offset - page_pos; + } + int2store(end, offset - diff); /* correct current pos */ + next_free_pos= offset + row_length; + + if (unlikely(row_length < min_row_length) && row_length) + { + /* + This can only happen in the case we compacted transid and + the row become 'too short' + + Move the current row down to it's right place and extend it + with 0. + */ + uint row_diff= min_row_length - row_length; + uint length= (next_free_pos - start_of_found_block); + + DBUG_ASSERT(page_pos != start_of_found_block); + bmove(buff + page_pos, buff + start_of_found_block, length); + bzero(buff+ page_pos + length, row_diff); + page_pos+= min_row_length; + int2store(end+2, min_row_length); + freed_size-= row_diff; + next_free_pos= start_of_found_block= page_pos; + diff= 0; + } + } + } + if (page_pos != start_of_found_block) + { + uint length= (next_free_pos - start_of_found_block); + memmove(buff + page_pos, buff + start_of_found_block, length); + } + start_of_found_block= uint2korr(dir); + + if (rownr != max_entry - 1) + { + /* Move all entries after rownr to end of page */ + uint rownr_length; + + DBUG_ASSERT(extend_block); /* Should always be true */ + next_free_pos= end_of_found_block= page_pos= + block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE; + diff= 0; + /* End points to entry before 'rownr' */ + for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE) + { + uint offset= uint2korr(dir); + uint row_length; + uint row_end; + if (!offset) + continue; + row_length= uint2korr(dir + 2); + row_end= offset + row_length; + DBUG_ASSERT(offset >= start_of_found_block && + row_end <= next_free_pos && row_length >= min_row_length); + + if (min_read_from && (buff[offset] & ROW_FLAG_TRANSID)) + { + TrID transid= transid_korr(buff + offset+1); + if (transid < min_read_from) + { + /* Remove transid from row */ + buff[offset + TRANSID_SIZE]= buff[offset] & 
~ROW_FLAG_TRANSID; + offset+= TRANSID_SIZE; + row_length-= TRANSID_SIZE; + int2store(dir+2, row_length); + } + if (unlikely(row_length < min_row_length)) + { + /* + This can only happen in the case we compacted transid and + the row become 'too short' + */ + uint row_diff= min_row_length - row_length; + if (next_free_pos < row_end + row_diff) + { + /* + Not enough space for extending next block with enough + end 0's. Move current data down to get place for them + */ + uint move_down= row_diff - (next_free_pos - row_end); + bmove(buff + offset - move_down, buff + offset, row_length); + offset-= move_down; + } + /* + Extend the next block with 0, which will be part of current + row when the blocks are joined together later + */ + bzero(buff + next_free_pos - row_diff, row_diff); + next_free_pos-= row_diff; + int2store(dir+2, min_row_length); + } + row_end= offset + row_length; + } + + if (row_end != next_free_pos) + { + uint length= (end_of_found_block - next_free_pos); + if (page_pos != end_of_found_block) + { + /* move next block down */ + memmove(buff + page_pos - length, buff + next_free_pos, length); + } + page_pos-= length; + /* next continuous block starts here */ + end_of_found_block= row_end; + diff= page_pos - row_end; + } + int2store(dir, offset + diff); /* correct current pos */ + next_free_pos= offset; + } + if (page_pos != end_of_found_block) + { + uint length= (end_of_found_block - next_free_pos); + memmove(buff + page_pos - length, buff + next_free_pos, length); + next_free_pos= page_pos- length; + } + + /* Extend rownr block to cover hole */ + rownr_length= next_free_pos - start_of_found_block; + int2store(dir+2, rownr_length); + DBUG_ASSERT(rownr_length >= min_row_length); + } + else + { + if (extend_block) + { + /* Extend last block to cover whole page */ + uint length= ((uint) (dir - buff) - start_of_found_block); + int2store(dir+2, length); + DBUG_ASSERT(length >= min_row_length); + } + else + { + /* Add length gained from freed transaction id's 
to this page */ + uint length= uint2korr(buff+ EMPTY_SPACE_OFFSET) + freed_size; + int2store(buff + EMPTY_SPACE_OFFSET, length); + } + buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED; + } + check_directory(buff, block_size, min_row_length, + extend_block ? 0 : (uint) -1); + DBUG_EXECUTE("directory", _ma_print_directory(DBUG_FILE, buff, block_size);); + DBUG_VOID_RETURN; +} + + +/* + Create an empty tail or head page + + SYNOPSIS + make_empty_page() + buff Page buffer + block_size Block size + page_type HEAD_PAGE or TAIL_PAGE + create_dir_entry TRUE of we should create a directory entry + + NOTES + EMPTY_SPACE is not updated +*/ + +static void make_empty_page(MARIA_HA *info, uchar *buff, uint page_type, + my_bool create_dir_entry) +{ + uint block_size= info->s->block_size; + DBUG_ENTER("make_empty_page"); + + bzero(buff, PAGE_HEADER_SIZE); + +#if !defined(DONT_ZERO_PAGE_BLOCKS) || defined(HAVE_valgrind) + /* + We zero the rest of the block to avoid getting old memory information + to disk and to allow the file to be compressed better if archived. + The code does not assume the block is zeroed. 
+ */ + if (page_type != BLOB_PAGE) + bzero(buff+ PAGE_HEADER_SIZE, block_size - PAGE_HEADER_SIZE); +#endif + buff[PAGE_TYPE_OFFSET]= (uchar) page_type; + buff[DIR_COUNT_OFFSET]= (int) create_dir_entry; + buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST; + if (create_dir_entry) + { + /* Create directory entry to point to start of page with size 0 */ + buff+= block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE; + int2store(buff, PAGE_HEADER_SIZE); + int2store(buff+2, 0); + } + DBUG_VOID_RETURN; +} + + +/* + Read or initialize new head or tail page + + SYNOPSIS + get_head_or_tail_page() + info Maria handler + block Block to read + buff Suggest this buffer to key cache + length Minimum space needed + page_type HEAD_PAGE || TAIL_PAGE + res Store result position here + + NOTES + We don't decremented buff[EMPTY_SPACE_OFFSET] with the allocated data + as we don't know how much data the caller will actually use. + + res->empty_space is set to length of empty space + + RETURN + 0 ok All slots in 'res' are updated + 1 error my_errno is set +*/ + +struct st_row_pos_info +{ + uchar *buff; /* page buffer */ + uchar *data; /* Place for data */ + uchar *dir; /* Directory */ + uint length; /* Length for data */ + uint rownr; /* Offset in directory */ + uint empty_space; /* Space left on page */ +}; + + +static my_bool get_head_or_tail_page(MARIA_HA *info, + MARIA_BITMAP_BLOCK *block, + uchar *buff, uint length, uint page_type, + enum pagecache_page_lock lock, + struct st_row_pos_info *res) +{ + uint block_size; + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + DBUG_ENTER("get_head_or_tail_page"); + DBUG_PRINT("enter", ("page_type: %u length: %u", page_type, length)); + + block_size= share->block_size; + if (block->org_bitmap_value == 0) /* Empty block */ + { + /* New page */ + make_empty_page(info, buff, page_type, 1); + res->buff= buff; + res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE); + res->data= (buff + PAGE_HEADER_SIZE); + res->dir= res->data + 
res->length; + res->rownr= 0; + DBUG_ASSERT(length <= res->length); + } + else + { + uchar *dir; + /* Read old page */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + res->buff= pagecache_read(share->pagecache, &info->dfile, + block->page, 0, 0, share->page_type, + lock, &page_link.link); + page_link.changed= res->buff != 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + if (!page_link.changed) + goto crashed; + + DBUG_ASSERT((uint) (res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == + page_type); + if (!(dir= find_free_position(page_type == HEAD_PAGE ? info : 0, + res->buff, block_size, &res->rownr, + &res->length, &res->empty_space))) + goto crashed; + + if (res->length < length) + { + if (res->empty_space + res->length >= length) + { + _ma_compact_block_page(res->buff, block_size, res->rownr, 1, + (page_type == HEAD_PAGE ? + info->trn->min_read_from : 0), + (page_type == HEAD_PAGE ? + share->base.min_block_length : + 0)); + /* All empty space are now after current position */ + dir= dir_entry_pos(res->buff, block_size, res->rownr); + res->length= res->empty_space= uint2korr(dir+2); + } + if (res->length < length) + { + DBUG_PRINT("error", ("length: %u res->length: %u empty_space: %u", + length, res->length, res->empty_space)); + goto crashed; /* Wrong bitmap information */ + } + } + res->dir= dir; + res->data= res->buff + uint2korr(dir); + } + DBUG_RETURN(0); + +crashed: + my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */ + DBUG_RETURN(1); +} + + +/* + @brief Create room for a head or tail row on a given page at given position + + @fn get_rowpos_in_head_or_tail_page() + @param info Maria handler + @param block Block to read + @param buff Suggest this buffer to key cache + @param length Minimum space needed + @param page_type HEAD_PAGE || TAIL_PAGE + @param rownr Rownr to use + @param res Store result position here + + @note + This is essential same as get_head_or_tail_page, with the difference + that the caller species at what position the row 
should be put. + This is used when restoring a row to it's original position as + part of UNDO DELETE or UNDO UPDATE + + @return + @retval 0 ok All slots in 'res' are updated + @retval 1 error my_errno is set +*/ + +static my_bool get_rowpos_in_head_or_tail_page(MARIA_HA *info, + MARIA_BITMAP_BLOCK *block, + uchar *buff, uint length, + uint page_type, + enum pagecache_page_lock lock, + uint rownr, + struct st_row_pos_info *res) +{ + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + uchar *dir; + uint block_size= share->block_size; + uint max_entry, max_length, rec_offset; + DBUG_ENTER("get_rowpos_in_head_or_tail_page"); + + if (block->org_bitmap_value == 0) /* Empty block */ + { + /* New page */ + make_empty_page(info, buff, page_type, 0); + res->empty_space= block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE; + } + else + { + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + buff= pagecache_read(share->pagecache, &info->dfile, + block->page, 0, 0, share->page_type, + lock, &page_link.link); + page_link.changed= buff != 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + if (!page_link.changed) /* Read error */ + goto err; + DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == + (uchar) page_type); + if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != (uchar) page_type) + goto err; + res->empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + } + + max_entry= (uint) buff[DIR_COUNT_OFFSET]; + if (max_entry <= rownr) + { + if (extend_directory(page_type == HEAD_PAGE ? info : 0, buff, block_size, + max_entry, rownr, &res->empty_space)) + goto err; + } + + /* + The following dir entry is unused in case of insert / update but + not in case of undo_update / undo_delete + */ + dir= dir_entry_pos(buff, block_size, rownr); + + if (extend_area_on_page(page_type == HEAD_PAGE ? 
info : 0, buff, dir, + rownr, block_size, length, + &res->empty_space, &rec_offset, &max_length)) + goto err; + + res->buff= buff; + res->rownr= rownr; + res->dir= dir; + res->data= buff + rec_offset; + res->length= length; + DBUG_RETURN(0); + +err: + my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */ + DBUG_RETURN(1); +} + + +/* + Write tail for head data or blob + + SYNOPSIS + write_tail() + info Maria handler + block Block to tail page + row_part Data to write to page + length Length of data + + NOTES + block->page_count is updated to the directory offset for the tail + so that we can store the position in the row extent information + + RETURN + 0 ok + block->page_count is set to point (dir entry + TAIL_BIT) + + 1 error; In this case my_errno is set to the error +*/ + +static my_bool write_tail(MARIA_HA *info, + MARIA_BITMAP_BLOCK *block, + uchar *row_part, uint org_length) +{ + MARIA_SHARE *share= info->s; + MARIA_PINNED_PAGE page_link; + uint block_size= share->block_size, empty_space, length= org_length; + struct st_row_pos_info row_pos; + my_off_t position; + my_bool res, block_is_read; + DBUG_ENTER("write_tail"); + DBUG_PRINT("enter", ("page: %lu length: %u", + (ulong) block->page, length)); + + info->keyread_buff_used= 1; + /* + Don't allocate smaller block than MIN_TAIL_SIZE (we want to give rows + some place to grow in the future) + */ + if (length < MIN_TAIL_SIZE) + length= MIN_TAIL_SIZE; + + if (block->page_count == TAIL_PAGE_COUNT_MARKER) + { + /* + Create new tail + page will be pinned & locked by get_head_or_tail_page + */ + if (get_head_or_tail_page(info, block, info->keyread_buff, length, + TAIL_PAGE, PAGECACHE_LOCK_WRITE, + &row_pos)) + DBUG_RETURN(1); + } + else + { + /* Write tail on predefined row position */ + if (get_rowpos_in_head_or_tail_page(info, block, info->keyread_buff, + length, TAIL_PAGE, + PAGECACHE_LOCK_WRITE, + block->page_count & ~TAIL_BIT, + &row_pos)) + DBUG_RETURN(1); + } + DBUG_PRINT("info", ("tailid: %lu (%lu:%u)", + 
(ulong) ma_recordpos(block->page, row_pos.rownr), + (ulong) block->page, row_pos.rownr)); + + block_is_read= block->org_bitmap_value != 0; + + memcpy(row_pos.data, row_part, org_length); + + if (share->now_transactional) + { + /* Log changes in tail block */ + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + LSN lsn; + + /* + Log REDO changes of tail page + Note that we have to log length, not org_length, to be sure that + REDO, which doesn't use write_tail, also creates a block of at least + MIN_TAIL_SIZE + */ + page_store(log_data + FILEID_STORE_SIZE, block->page); + dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + row_pos.rownr); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos.data; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length; + if (translog_write_record(&lsn, + (block_is_read ? LOGREC_REDO_INSERT_ROW_TAIL : + LOGREC_REDO_NEW_ROW_TAIL), + info->trn, info, + (translog_size_t) (sizeof(log_data) + length), + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data, NULL)) + DBUG_RETURN(1); + } + + int2store(row_pos.dir + 2, length); + empty_space= row_pos.empty_space - length; + int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space); + block->page_count= row_pos.rownr + TAIL_BIT; + /* + If there is less directory entries free than number of possible tails + we can write for a row, we mark the page full to ensure that we don't + during _ma_bitmap_find_place() allocate more entries on the tail page + than it can hold + */ + block->empty_space= (enough_free_entries(row_pos.buff, share->block_size, + 1 + share->base.blobs) ? 
+ empty_space : 0); + /* Keep BLOCKUSED_USE_ORG_BITMAP */ + block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL; + + if (block_is_read) + { + /* Current page link is last element in pinned_pages */ + MARIA_PINNED_PAGE *page_link; + page_link= dynamic_element(&info->pinned_pages, + info->pinned_pages.elements-1, + MARIA_PINNED_PAGE*); + pagecache_unlock_by_link(share->pagecache, page_link->link, + PAGECACHE_LOCK_WRITE_TO_READ, + PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 1, FALSE); + DBUG_ASSERT(page_link->changed); + page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK; + res= 0; + } + else + { + if (!(res= pagecache_write(share->pagecache, + &info->dfile, block->page, 0, + row_pos.buff,share->page_type, + PAGECACHE_LOCK_READ, + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE))) + { + page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + } + + /* Increase data file size, if extended */ + position= (my_off_t) block->page * block_size; + if (share->state.state.data_file_length <= position) + { + /* + We are modifying a state member before writing the UNDO; this is a WAL + violation. But for data_file_length this is ok, as long as we change + data_file_length after writing any log record (FILE_ID/REDO/UNDO) (see + collect_tables()). + */ + _ma_set_share_data_file_length(share, position + block_size); + } + } + DBUG_RETURN(res); +} + + +/* + Write full pages + + SYNOPSIS + write_full_pages() + info Maria handler + lsn LSN for the undo record + block Where to write data + data Data to write + length Length of data + + NOTES + Logging of the changes to the full pages are done in the caller + write_block_record(). 
+ + RETURN + 0 ok + 1 error on write +*/ + +static my_bool write_full_pages(MARIA_HA *info, + LSN lsn, + MARIA_BITMAP_BLOCK *block, + uchar *data, ulong length) +{ + pgcache_page_no_t page; + MARIA_SHARE *share= info->s; + uint block_size= share->block_size; + uint data_size= FULL_PAGE_SIZE(block_size); + uchar *buff= info->keyread_buff; + uint page_count, sub_blocks; + my_off_t position, max_position; + DBUG_ENTER("write_full_pages"); + DBUG_PRINT("enter", ("length: %lu page: %lu page_count: %lu", + (ulong) length, (ulong) block->page, + (ulong) block->page_count)); + DBUG_ASSERT((block->page_count & TAIL_BIT) == 0); + + info->keyread_buff_used= 1; + page= block->page; + page_count= block->page_count; + sub_blocks= block->sub_blocks; + + max_position= (my_off_t) (page + page_count) * block_size; + + /* Increase data file size, if extended */ + + for (; length; data+= data_size) + { + uint copy_length; + if (!page_count--) + { + if (!--sub_blocks) + { + DBUG_ASSERT(0); /* Wrong in bitmap or UNDO */ + my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */ + DBUG_RETURN(1); + } + + block++; + page= block->page; + page_count= block->page_count - 1; + DBUG_PRINT("info", ("page: %lu page_count: %lu", + (ulong) block->page, (ulong) block->page_count)); + + position= (page + page_count + 1) * block_size; + set_if_bigger(max_position, position); + } + lsn_store(buff, lsn); + buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE; + copy_length= min(data_size, length); + memcpy(buff + LSN_SIZE + PAGE_TYPE_SIZE, data, copy_length); + length-= copy_length; + + /* + Zero out old information from the block. This removes possible + sensitive information from the block and also makes the file + easier to compress and easier to compare after recovery. 
+ */ + if (copy_length != data_size) + bzero(buff + block_size - PAGE_SUFFIX_SIZE - (data_size - copy_length), + (data_size - copy_length) + PAGE_SUFFIX_SIZE); + + if (pagecache_write(share->pagecache, + &info->dfile, page, 0, + buff, share->page_type, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, info->trn->rec_lsn)) + DBUG_RETURN(1); + page++; + DBUG_ASSERT(block->used & BLOCKUSED_USED); + } + if (share->state.state.data_file_length < max_position) + _ma_set_share_data_file_length(share, max_position); + DBUG_RETURN(0); +} + + +/* + Store ranges of full pages in compact format for logging + + SYNOPSIS + store_page_range() + to Store data here + block Where pages are to be written + block_size block size + length Length of data to be written + Normally this is full pages, except for the last + tail block that may only partly fit the last page. + tot_ranges Add here the number of ranges used + + NOTES + The format of one entry is: + + Ranges SUB_RANGE_SIZE + Empty bytes at end of last byte BLOCK_FILLER_SIZE + For each range + Page number PAGE_STORE_SIZE + Number of pages PAGERANGE_STORE_SIZE + + RETURN + # end position for 'to' +*/ + +static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block, + uint block_size, ulong length, + uint *tot_ranges) +{ + uint data_size= FULL_PAGE_SIZE(block_size); + ulong pages_left= (length + data_size -1) / data_size; + uint page_count, ranges, empty_space; + uchar *to_start; + DBUG_ENTER("store_page_range"); + + to_start= to; + to+= SUB_RANGE_SIZE; + + /* Store number of unused bytes at last page */ + empty_space= (uint) (pages_left * data_size - length); + int2store(to, empty_space); + to+= BLOCK_FILLER_SIZE; + + ranges= 0; + do + { + pgcache_page_no_t page; + page= block->page; + page_count= block->page_count; + block++; + if (page_count > pages_left) + page_count= pages_left; + + page_store(to, page); + to+= PAGE_STORE_SIZE; + pagerange_store(to, page_count); + to+= 
PAGERANGE_STORE_SIZE; + ranges++; + } while ((pages_left-= page_count)); + /* Store number of ranges for this block */ + int2store(to_start, ranges); + (*tot_ranges)+= ranges; + + DBUG_RETURN(to); +} + + +/* + Store packed extent data + + SYNOPSIS + store_extent_info() + to Store first packed data here + row_extents_second_part Store rest here + first_block First block to store + count Number of blocks + + NOTES + We don't have to store the position for the head block + + We have to set the START_EXTENT_BIT for every extent where the + blob will be stored on a page of it's own. We need this in the + UNDO phase to generate MARIA_BITMAP_BLOCK's for undo-delete and + undo-update. +*/ + +static void store_extent_info(uchar *to, + uchar *row_extents_second_part, + MARIA_BITMAP_BLOCK *first_block, + uint count) +{ + MARIA_BITMAP_BLOCK *block, *end_block; + uint copy_length; + my_bool first_found= 0; + DBUG_ENTER("store_extent_info"); + DBUG_PRINT("enter", ("count: %u", count)); + + for (block= first_block, end_block= first_block+count ; + block < end_block; block++) + { + /* The following is only false for marker blocks */ + if (likely(block->used & BLOCKUSED_USED)) + { + uint page_count= block->page_count; + DBUG_ASSERT(page_count != 0); + page_store(to, block->page); + if (block->sub_blocks) + { + /* + Set a bit so that we later know that this was the first block + for a blob + */ + page_count|= START_EXTENT_BIT; + } + pagerange_store(to + PAGE_STORE_SIZE, page_count); + DBUG_DUMP("extent", to, ROW_EXTENT_SIZE); + to+= ROW_EXTENT_SIZE; + if (!first_found) + { + first_found= 1; + to= row_extents_second_part; + } + } + } + copy_length= (count - 1) * ROW_EXTENT_SIZE; + /* + In some unlikely cases we have allocated to many blocks. Clear this + data. 
+ */ + bzero(to, (size_t) (row_extents_second_part + copy_length - to)); + DBUG_VOID_RETURN; +} + + +/** + @brief + Convert extent info read from file to MARIA_BITMAP_BLOCKS suitable + for write_block_record + + @note + In case of blobs, this function marks all the blob pages in the bitmap + as full pages. The bitmap bits for other pages will be marked + when write_block_record() calls _ma_bitmap_release_unused(). + + This function will be removed in Maria 2.0 when we instead of delete rows + mark them as deleted and only remove them after commit. + + @return + @retval 0 ok + @retval 1 Error (out of memory or disk error changing bitmap) or + wrong information in extent information +*/ + +static my_bool extent_to_bitmap_blocks(MARIA_HA *info, + MARIA_BITMAP_BLOCKS *blocks, + pgcache_page_no_t head_page, + uint extent_count, + const uchar *extent_info) +{ + MARIA_BITMAP_BLOCK *block, *start_block; + MARIA_SHARE *share= info->s; + uint i, tail_page; + DBUG_ENTER("extent_to_bitmap_blocks"); + + if (allocate_dynamic(&info->bitmap_blocks, extent_count + 2)) + DBUG_RETURN(1); + block= blocks->block= dynamic_element(&info->bitmap_blocks, 0, + MARIA_BITMAP_BLOCK*); + blocks->count= extent_count + 1; + blocks->tail_page_skipped= blocks->page_skipped= 0; + block->page= head_page; + block->page_count= 1; + block->used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP; + /* Impossible value, will force storage of real value */ + block->org_bitmap_value= 255; + + start_block= block++; + for (i=0 ; + i++ < extent_count ; + block++, extent_info+= ROW_EXTENT_SIZE) + { + uint page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE); + if (page_count & START_EXTENT_BIT) + { + page_count&= ~START_EXTENT_BIT; + start_block->sub_blocks= (uint) (block - start_block); + start_block= block; + } + block->page= page_korr(extent_info); + block->page_count= page_count; + block->sub_blocks= 0; + if (block->page_count == 0) + { + /* Extend allocated but not used by write_block_record() */ + 
DBUG_ASSERT(block->page == 0); + /* This is the last block */ + blocks->count= i; + break; + } + if ((tail_page= page_count & TAIL_BIT)) + page_count= 1; + + /* Check if wrong data */ + if (block->page == 0 || page_count == 0 || + (block->page + page_count) * share->block_size > + share->state.state.data_file_length) + { + DBUG_PRINT("error", ("page: %lu page_count: %u tail: %u length: %ld data_length: %ld", + (ulong) block->page, + (block->page_count & ~TAIL_BIT), + (uint) test(block->page_count & TAIL_BIT), + (ulong) ((block->page + (page_count & ~TAIL_BIT)) * + share->block_size), + (ulong) share->state.state.data_file_length)); + DBUG_RETURN(1); + } + if (tail_page) + { + block->org_bitmap_value= _ma_bitmap_get_page_bits(info, &share->bitmap, + block->page); + block->used= (BLOCKUSED_TAIL | BLOCKUSED_USED | + BLOCKUSED_USE_ORG_BITMAP); + } + else + { + my_bool res; + pthread_mutex_lock(&share->bitmap.bitmap_lock); + res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, + block->page, page_count); + pthread_mutex_unlock(&share->bitmap.bitmap_lock); + if (res) + DBUG_RETURN(1); + block->used= BLOCKUSED_USED; + } + } + start_block->sub_blocks= (uint) (block - start_block); + DBUG_RETURN(0); +} + + +/* + Free regions of pages with logging + + NOTES + We are removing filler events and tail page events from + row->extents to get smaller log. 
+ + RETURN + 0 ok + 1 error +*/ + +static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row) +{ + uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + LSN lsn; + size_t extents_length; + uchar *extents= row->extents; + DBUG_ENTER("free_full_pages"); + + if (info->s->now_transactional) + { + /* Compact events by removing filler and tail events */ + uchar *new_block= 0; + uchar *end, *to, *compact_extent_info; + my_bool res; + uint extents_count; + + if (!(compact_extent_info= my_alloca(row->extents_count * + ROW_EXTENT_SIZE))) + DBUG_RETURN(1); + + to= compact_extent_info; + for (end= extents + row->extents_count * ROW_EXTENT_SIZE ; + extents < end ; + extents+= ROW_EXTENT_SIZE) + { + uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE); + page_count&= ~START_EXTENT_BIT; + if (! (page_count & TAIL_BIT) && page_count != 0) + { + /* Found correct extent */ + if (!new_block) + new_block= extents; /* First extent in range */ + continue; + } + /* Found extent to remove, copy everything found so far */ + if (new_block) + { + size_t length= (size_t) (extents - new_block); + memcpy(to, new_block, length); + to+= length; + new_block= 0; + } + } + if (new_block) + { + size_t length= (size_t) (extents - new_block); + memcpy(to, new_block, length); + to+= length; + } + + if (!unlikely(extents_length= (uint) (to - compact_extent_info))) + { + /* + No ranges. This happens in the rear case when we have a allocated + place for a blob on a tail page but it did fit into the main page. 
+ */ + my_afree(compact_extent_info); + DBUG_RETURN(0); + } + extents_count= (uint) (extents_length / ROW_EXTENT_SIZE); + pagerange_store(log_data + FILEID_STORE_SIZE, extents_count); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= compact_extent_info; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length; + res= translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, info->trn, + info, + (translog_size_t) (sizeof(log_data) + + extents_length), + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data, NULL); + my_afree(compact_extent_info); + if (res) + DBUG_RETURN(1); + } + + DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents, + row->extents_count)); +} + + +/* + Free one page range + + NOTES + This is very similar to free_full_pages() + + RETURN + 0 ok + 1 error +*/ + +static my_bool free_full_page_range(MARIA_HA *info, pgcache_page_no_t page, + uint count) +{ + my_bool res= 0; + uint delete_count; + MARIA_SHARE *share= info->s; + DBUG_ENTER("free_full_page_range"); + + delete_count= count; + if (share->state.state.data_file_length == + (page + count) * share->block_size) + { + /* + Don't delete last page from pagecache as this will make the file + shorter than expected if the last operation extended the file + */ + delete_count--; + } + if (delete_count && + pagecache_delete_pages(share->pagecache, &info->dfile, + page, delete_count, PAGECACHE_LOCK_WRITE, 0)) + res= 1; + + if (share->now_transactional) + { + LSN lsn; + /** @todo unify log_data's shape with delete_head_or_tail() */ + uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE + + ROW_EXTENT_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + DBUG_ASSERT(info->trn->rec_lsn); + pagerange_store(log_data + FILEID_STORE_SIZE, 1); + page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE, + page); + int2store(log_data + FILEID_STORE_SIZE + 
PAGERANGE_STORE_SIZE + + PAGE_STORE_SIZE, count); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + if (translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, + info->trn, info, + (translog_size_t) sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data, NULL)) + res= 1; + } + pthread_mutex_lock(&share->bitmap.bitmap_lock); + if (_ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, count)) + res= 1; + pthread_mutex_unlock(&share->bitmap.bitmap_lock); + DBUG_RETURN(res); +} + + +/** + @brief Write a record to a (set of) pages + + @fn write_block_record() + @param info Maria handler + @param old_record Original record in case of update; NULL in case of + insert + @param record Record we should write + @param row Statistics about record (calculated by + calc_record_size()) + @param map_blocks On which pages the record should be stored + @param row_pos Position on head page where to put head part of + record + @param undo_lsn <> LSN_ERROR if we are executing an UNDO + @param old_record_checksum Checksum of old_record: ignored if table does + not have live checksum; otherwise if + old_record==NULL it must be 0. + + @note + On return all pinned pages are released. 
+ + [page_buff + EMPTY_SPACE_OFFSET] is set to + row_pos->empty_space - head_length + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool write_block_record(MARIA_HA *info, + const uchar *old_record, + const uchar *record, + MARIA_ROW *row, + MARIA_BITMAP_BLOCKS *bitmap_blocks, + my_bool head_block_is_read, + struct st_row_pos_info *row_pos, + LSN undo_lsn, + ha_checksum old_record_checksum) +{ + uchar *data, *end_of_data, *tmp_data_used, *tmp_data; + uchar *row_extents_first_part, *row_extents_second_part; + uchar *field_length_data; + uchar *page_buff; + MARIA_BITMAP_BLOCK *block, *head_block; + MARIA_SHARE *share= info->s; + MARIA_COLUMNDEF *column, *end_column; + MARIA_PINNED_PAGE page_link; + uint block_size, flag, head_length; + ulong *blob_lengths; + my_bool row_extents_in_use, blob_full_pages_exists; + LSN lsn; + my_off_t position; + uint save_my_errno; + DBUG_ENTER("write_block_record"); + + LINT_INIT(row_extents_first_part); + LINT_INIT(row_extents_second_part); + + head_block= bitmap_blocks->block; + block_size= share->block_size; + + page_buff= row_pos->buff; + /* Position on head page where we should store the head part */ + data= row_pos->data; + end_of_data= data + row_pos->length; + + /* Write header */ + flag= info->row_flag; + row_extents_in_use= 0; + if (unlikely(row->total_length > row_pos->length)) + { + /* Need extent */ + DBUG_ASSERT(bitmap_blocks->count > 1); + if (bitmap_blocks->count <= 1) + goto crashed; /* Wrong in bitmap */ + flag|= ROW_FLAG_EXTENTS; + row_extents_in_use= 1; + } + /* For now we have only a minimum header */ + *data++= (uchar) flag; + if (flag & ROW_FLAG_TRANSID) + { + transid_store(data, info->trn->trid); + data+= TRANSID_SIZE; + } + + if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED)) + *data++= (uchar) (share->base.null_bytes - + share->base.original_null_bytes); + if (row_extents_in_use) + { + /* Store first extent in header */ + store_key_length_inc(data, bitmap_blocks->count - 1); + 
row_extents_first_part= data; + data+= ROW_EXTENT_SIZE; + } + if (share->base.max_field_lengths) + store_key_length_inc(data, row->field_lengths_length); + if (share->calc_checksum) + { + *(data++)= (uchar) (row->checksum); /* store least significant byte */ + DBUG_ASSERT(!((old_record_checksum != 0) && (old_record == NULL))); + } + memcpy(data, record, share->base.null_bytes); + data+= share->base.null_bytes; + memcpy(data, row->empty_bits, share->base.pack_bytes); + data+= share->base.pack_bytes; + + DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR || + (uint) (data - row_pos->data) == row->min_length); + + /* + Allocate a buffer of rest of data (except blobs) + + To avoid double copying of data, we copy as many columns that fits into + the page. The rest goes into info->packed_row. + + Using an extra buffer, instead of doing continuous writes to different + pages, uses less code and we don't need to have to do a complex call + for every data segment we want to store. + */ + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + row->head_length)) + DBUG_RETURN(1); + + tmp_data_used= 0; /* Either 0 or last used uchar in 'data' */ + tmp_data= data; + + if (row_extents_in_use) + { + uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE; + if (!tmp_data_used && tmp_data + copy_length > end_of_data) + { + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + row_extents_second_part= tmp_data; + /* + We will copy the extents here when we have figured out the tail + positions. 
+ */ + tmp_data+= copy_length; + } + + /* Copy fields that has fixed lengths (primary key etc) */ + for (column= share->columndef, + end_column= column + share->base.fixed_not_null_fields; + column < end_column; column++) + { + if (!tmp_data_used && tmp_data + column->length > end_of_data) + { + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + memcpy(tmp_data, record + column->offset, column->length); + tmp_data+= column->length; + } + + /* Copy length of data for variable length fields */ + if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data) + { + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + field_length_data= row->field_lengths; + memcpy(tmp_data, field_length_data, row->field_lengths_length); + tmp_data+= row->field_lengths_length; + + DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR || + (uint) (tmp_data - row_pos->data) == row->min_length + + share->base.fixed_not_null_fields_length + + row->field_lengths_length); + + /* Copy variable length fields and fields with null/zero */ + for (end_column= share->columndef + share->base.fields - share->base.blobs; + column < end_column ; + column++) + { + const uchar *field_pos; + ulong length; + if ((record[column->null_pos] & column->null_bit) || + (row->empty_bits[column->empty_pos] & column->empty_bit)) + continue; + + field_pos= record + column->offset; + switch (column->type) { + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_SKIP_PRESPACE: + case FIELD_SKIP_ZERO: /* Fixed length field */ + length= column->length; + break; + case FIELD_SKIP_ENDSPACE: /* CHAR */ + /* Char that is space filled */ + if (column->length <= 255) + length= (uint) (uchar) *field_length_data++; + else + { + length= uint2korr(field_length_data); + field_length_data+= 2; + } + break; + case FIELD_VARCHAR: + if (column->length <= 256) + { + length= (uint) (uchar) *field_length_data++; + field_pos++; /* Skip length uchar */ + } + else + { + length= uint2korr(field_length_data); + 
field_length_data+= 2; + field_pos+= 2; + } + DBUG_ASSERT(length <= column->length); + break; + default: /* Wrong data */ + DBUG_ASSERT(0); + length=0; + break; + } + if (!tmp_data_used && tmp_data + length > end_of_data) + { + /* Data didn't fit in page; Change to use tmp buffer */ + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + memcpy((char*) tmp_data, field_pos, length); + tmp_data+= length; + } + + block= head_block + head_block->sub_blocks; /* Point to first blob data */ + + end_column= column + share->base.blobs; + blob_lengths= row->blob_lengths; + if (!tmp_data_used) + { + /* Still room on page; Copy as many blobs as we can into this page */ + data= tmp_data; + for (; column < end_column && + *blob_lengths <= (ulong)(end_of_data - data); + column++, blob_lengths++) + { + uchar *tmp_pos; + uint length; + if (!*blob_lengths) /* Null or "" */ + continue; + length= column->length - portable_sizeof_char_ptr; + memcpy_fixed((uchar*) &tmp_pos, record + column->offset + length, + sizeof(char*)); + memcpy(data, tmp_pos, *blob_lengths); + data+= *blob_lengths; + /* + The following is not true when we want to insert data into original + place. In this case we don't have any extra blocks allocated + */ + if (likely(undo_lsn == LSN_ERROR)) + { + /* Skip over tail page that was prepared for storing blob */ + block++; + bitmap_blocks->tail_page_skipped= 1; + } + } + if (head_block->sub_blocks > 1) + { + /* We have allocated pages that were not used */ + bitmap_blocks->page_skipped= 1; + } + } + else + data= tmp_data_used; /* Get last used on page */ + + /* Update page directory */ + head_length= (uint) (data - row_pos->data); + DBUG_PRINT("info", ("Used head length on page: %u header_length: %u", + head_length, + (uint) (flag & ROW_FLAG_TRANSID ? 
TRANSID_SIZE : 0))); + DBUG_ASSERT(data <= end_of_data); + if (head_length < share->base.min_block_length) + { + /* Extend row to be of size min_block_length */ + uint diff_length= share->base.min_block_length - head_length; + bzero(data, diff_length); + data+= diff_length; + head_length= share->base.min_block_length; + } + /* + If this is a redo entry (ie, undo_lsn != LSN_ERROR) then we should have + written exactly head_length bytes (same as original record). + */ + DBUG_ASSERT(undo_lsn == LSN_ERROR || head_length == row_pos->length); + int2store(row_pos->dir + 2, head_length); + /* update empty space at start of block */ + row_pos->empty_space-= head_length; + int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space); + /* Mark in bitmaps how the current page was actually used */ + head_block->empty_space= row_pos->empty_space; + if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE && + page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST) + head_block->empty_space= 0; /* Page is full */ + head_block->used|= BLOCKUSED_USED; + + check_directory(page_buff, share->block_size, share->base.min_block_length, + (uint) -1); + + /* + Now we have to write tail pages, as we need to store the position + to them in the row extent header. + + We first write out all blob tails, to be able to store them in + the current page or 'tmp_data'. + + Then we write the tail of the non-blob fields (The position to the + tail page is stored either in row header, the extents in the head + page or in the first full page of the non-blob data. 
It's never in + the tail page of the non-blob data) + */ + + blob_full_pages_exists= 0; + if (row_extents_in_use) + { + if (column != end_column) /* If blob fields */ + { + MARIA_COLUMNDEF *save_column= column; + MARIA_BITMAP_BLOCK *save_block= block; + MARIA_BITMAP_BLOCK *end_block; + ulong *save_blob_lengths= blob_lengths; + + for (; column < end_column; column++, blob_lengths++) + { + uchar *blob_pos; + if (!*blob_lengths) /* Null or "" */ + continue; + if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL) + { + uint length; + length= column->length - portable_sizeof_char_ptr; + memcpy_fixed((uchar *) &blob_pos, record + column->offset + length, + sizeof(char*)); + length= *blob_lengths % FULL_PAGE_SIZE(block_size); /* tail size */ + if (length != *blob_lengths) + blob_full_pages_exists= 1; + if (write_tail(info, block + block->sub_blocks-1, + blob_pos + *blob_lengths - length, + length)) + goto disk_err; + } + else + blob_full_pages_exists= 1; + + for (end_block= block + block->sub_blocks; block < end_block; block++) + { + /* + Set only a bit, to not cause bitmap code to believe a block is full + when there is still a lot of entries in it. + */ + block->used|= BLOCKUSED_USED; + } + } + DBUG_ASSERT((undo_lsn == LSN_ERROR || + block == bitmap_blocks->block + bitmap_blocks->count)); + column= save_column; + block= save_block; + blob_lengths= save_blob_lengths; + } + + if (tmp_data_used) /* non blob data overflows */ + { + MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block; + MARIA_BITMAP_BLOCK *head_tail_block= 0; + ulong length; + ulong data_length= (ulong) (tmp_data - info->rec_buff); + +#ifdef SANITY_CHECKS + DBUG_ASSERT(head_block->sub_blocks != 1); + if (head_block->sub_blocks == 1) + goto crashed; /* no reserved full or tails */ +#endif + /* + Find out where to write tail for non-blob fields. + + Problem here is that the bitmap code may have allocated more + space than we need. 
We have to handle the following cases: + + - Bitmap code allocated a tail page we don't need. + - The last full page allocated needs to be changed to a tail page + (Because we were able to put more data on the head page than + the bitmap allocation assumed) + + The reserved pages in bitmap_blocks for the main page has one of + the following allocations: + - Full pages, with following blocks: + # * full pages + empty page ; To be used if we change last full to tail page. This + has 'count' = 0. + tail page (optional, if last full page was part full) + - One tail page + */ + + cur_block= head_block + 1; + end_block= head_block + head_block->sub_blocks; + /* + Loop until we have found a block bigger than we need or + we find the empty page block. + */ + while (data_length >= (length= (cur_block->page_count * + FULL_PAGE_SIZE(block_size))) && + cur_block->page_count) + { +#ifdef SANITY_CHECKS + DBUG_ASSERT(!((cur_block == end_block) || + (cur_block->used & BLOCKUSED_USED))); + if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED)) + goto crashed; +#endif + data_length-= length; + (cur_block++)->used|= BLOCKUSED_USED; + } + last_head_block= cur_block; + if (data_length) + { + if (cur_block->page_count == 0) + { + /* Skip empty filler block */ + cur_block++; + } +#ifdef SANITY_CHECKS + DBUG_ASSERT(!(cur_block >= end_block)); + if ((cur_block >= end_block)) + goto crashed; +#endif + if (cur_block->used & BLOCKUSED_TAIL) + { + DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size)); + /* tail written to tail page */ + cur_block->used|= BLOCKUSED_USED; + head_tail_block= cur_block; + } + else if (data_length > length - MAX_TAIL_SIZE(block_size)) + { + /* tail written to full page */ + cur_block->used|= BLOCKUSED_USED; + if ((cur_block != end_block - 1) && + (end_block[-1].used & BLOCKUSED_TAIL)) + bitmap_blocks->tail_page_skipped= 1; + } + else + { + /* + cur_block is a full block, followed by an empty and optional + tail block. 
Change cur_block to a tail block or split it + into full blocks and tail blocks. + + TODO: + If there is enough space on the following tail block, use + this instead of creating a new tail block. + */ + DBUG_ASSERT(cur_block[1].page_count == 0); + if (cur_block->page_count == 1) + { + /* convert full block to tail block */ + cur_block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL; + head_tail_block= cur_block; + } + else + { + DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(block_size)); + DBUG_PRINT("info", ("Splitting blocks into full and tail")); + cur_block[1].page= (cur_block->page + cur_block->page_count - 1); + cur_block[1].page_count= 1; /* Avoid DBUG_ASSERT */ + cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL; + cur_block->page_count--; + cur_block->used|= BLOCKUSED_USED; + last_head_block= head_tail_block= cur_block+1; + } + if (end_block[-1].used & BLOCKUSED_TAIL) + bitmap_blocks->tail_page_skipped= 1; + } + } + else + { + /* Must be an empty or tail page */ + DBUG_ASSERT(cur_block->page_count == 0 || + cur_block->used & BLOCKUSED_TAIL); + if (end_block[-1].used & BLOCKUSED_TAIL) + bitmap_blocks->tail_page_skipped= 1; + } + + /* + Write all extents into page or tmp_data + + Note that we still don't have a correct position for the tail + of the non-blob fields. 
+ */ + store_extent_info(row_extents_first_part, + row_extents_second_part, + head_block+1, bitmap_blocks->count - 1); + if (head_tail_block) + { + ulong block_length= (ulong) (tmp_data - info->rec_buff); + uchar *extent_data; + + length= (uint) (block_length % FULL_PAGE_SIZE(block_size)); + if (write_tail(info, head_tail_block, + info->rec_buff + block_length - length, + length)) + goto disk_err; + tmp_data-= length; /* Remove the tail */ + if (tmp_data == info->rec_buff) + { + /* We have no full blocks to write for the head part */ + tmp_data_used= 0; + } + + /* Store the tail position for the non-blob fields */ + if (head_tail_block == head_block + 1) + { + /* + We had a head block + tail block, which means that the + tail block is the first extent + */ + extent_data= row_extents_first_part; + } + else + { + /* + We have a head block + some full blocks + tail block + last_head_block is pointing after the last used extent + for the head block. + */ + extent_data= row_extents_second_part + + ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE; + } + DBUG_ASSERT(uint2korr(extent_data+5) & TAIL_BIT); + page_store(extent_data, head_tail_block->page); + int2store(extent_data + PAGE_STORE_SIZE, head_tail_block->page_count); + } + } + else + store_extent_info(row_extents_first_part, + row_extents_second_part, + head_block+1, bitmap_blocks->count - 1); + } + + if (share->now_transactional) + { + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + + /* Log REDO changes of head page */ + page_store(log_data + FILEID_STORE_SIZE, head_block->page); + dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + row_pos->rownr); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos->data; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= head_length; + if 
(translog_write_record(&lsn, + head_block_is_read ? + LOGREC_REDO_INSERT_ROW_HEAD : + LOGREC_REDO_NEW_ROW_HEAD, + info->trn, + info, + (translog_size_t) (sizeof(log_data) + + head_length), + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data, NULL)) + goto disk_err; + } + +#ifdef RECOVERY_EXTRA_DEBUG + if (info->trn->undo_lsn != LSN_IMPOSSIBLE) + { + /* Stop right after the REDO; testing incomplete log record groups */ + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + DBUG_EXECUTE_IF("maria_crash", + { DBUG_PRINT("maria_crash", ("now")); DBUG_ABORT(); }); + } +#endif + + if (head_block_is_read) + { + MARIA_PINNED_PAGE *page_link; + /* Head page is always the first pinned page */ + page_link= dynamic_element(&info->pinned_pages, 0, + MARIA_PINNED_PAGE*); + pagecache_unlock_by_link(share->pagecache, page_link->link, + PAGECACHE_LOCK_WRITE_TO_READ, + PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 1, FALSE); + page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK; + page_link->changed= 1; + } + else + { + if (pagecache_write(share->pagecache, + &info->dfile, head_block->page, 0, + page_buff, share->page_type, + head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ : + PAGECACHE_LOCK_READ, + head_block_is_read ? 
PAGECACHE_PIN_LEFT_PINNED : + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE)) + goto disk_err; + page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + + /* Increase data file size, if extended */ + position= (my_off_t) head_block->page * block_size; + if (share->state.state.data_file_length <= position) + _ma_set_share_data_file_length(share, position + block_size); + } + + if (share->now_transactional && (tmp_data_used || blob_full_pages_exists)) + { + /* + Log REDO writes for all full pages (head part and all blobs) + We write all here to be able to generate the UNDO record early + so that we can write the LSN for the UNDO record to all full pages. + */ + uchar tmp_log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE + + (ROW_EXTENT_SIZE + BLOCK_FILLER_SIZE + SUB_RANGE_SIZE) * + ROW_EXTENTS_ON_STACK]; + uchar *log_data, *log_pos; + LEX_CUSTRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 + + ROW_EXTENTS_ON_STACK]; + LEX_CUSTRING *log_array_pos, *log_array; + int error; + translog_size_t log_entry_length= 0; + uint ext_length, extents= 0, sub_extents= 0; + + /* If few extents, then allocate things on stack to avoid a malloc call */ + if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK) + { + log_array= tmp_log_array; + log_data= tmp_log_data; + } + else + { + if (!my_multi_malloc(MY_WME, &log_array, + (uint) ((bitmap_blocks->count + + TRANSLOG_INTERNAL_PARTS + 2) * + sizeof(*log_array)), + &log_data, FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE + + bitmap_blocks->count * (ROW_EXTENT_SIZE + + BLOCK_FILLER_SIZE + + SUB_RANGE_SIZE), + NullS)) + goto disk_err; + } + log_pos= log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE * 2; + log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1; + + if (tmp_data_used) + { + /* Full head page */ + translog_size_t block_length= (translog_size_t) (tmp_data - + info->rec_buff); + log_pos= store_page_range(log_pos, head_block+1, block_size, + 
(ulong) block_length, &extents); + log_array_pos->str= info->rec_buff; + log_array_pos->length= block_length; + log_entry_length+= block_length; + log_array_pos++; + sub_extents++; + } + if (blob_full_pages_exists) + { + MARIA_COLUMNDEF *tmp_column= column; + ulong *tmp_blob_lengths= blob_lengths; + MARIA_BITMAP_BLOCK *tmp_block= block; + + /* Full blob pages */ + for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++) + { + ulong blob_length; + uint length; + + if (!*tmp_blob_lengths) /* Null or "" */ + continue; + blob_length= *tmp_blob_lengths; + length= tmp_column->length - portable_sizeof_char_ptr; + /* + If last part of blob was on tail page, change blob_length to + reflect this + */ + if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL) + blob_length-= (blob_length % FULL_PAGE_SIZE(block_size)); + if (blob_length) + { + memcpy_fixed((uchar*) &log_array_pos->str, + record + tmp_column->offset + length, + sizeof(uchar*)); + log_array_pos->length= blob_length; + log_entry_length+= blob_length; + log_array_pos++; + sub_extents++; + + log_pos= store_page_range(log_pos, tmp_block, block_size, + blob_length, &extents); + } + tmp_block+= tmp_block->sub_blocks; + } + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + ext_length= (uint) (log_pos - log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= ext_length; + pagerange_store(log_data+ FILEID_STORE_SIZE, extents); + pagerange_store(log_data+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE, + sub_extents); + + log_entry_length+= ext_length; + /* trn->rec_lsn is already set earlier in this function */ + error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS, + info->trn, info, log_entry_length, + (uint) (log_array_pos - log_array), + log_array, log_data, NULL); + if (log_array != tmp_log_array) + my_free(log_array, MYF(0)); + if (error) + goto disk_err; + } + + /* Write UNDO or CLR record */ + lsn= LSN_IMPOSSIBLE; + if (share->now_transactional) + { + LEX_CUSTRING *log_array= 
info->log_row_parts; + + if (undo_lsn != LSN_ERROR) + { + /* + Store if this CLR is about UNDO_DELETE or UNDO_UPDATE; + in the first case, Recovery, when it sees the CLR_END in the + REDO phase, may decrement the records' count. + */ + if (_ma_write_clr(info, undo_lsn, + old_record ? LOGREC_UNDO_ROW_UPDATE : + LOGREC_UNDO_ROW_DELETE, + share->calc_checksum != 0, + row->checksum - old_record_checksum, + &lsn, (void*) 0)) + goto disk_err; + } + else + { + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 + + HA_CHECKSUM_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE + + ROW_EXTENT_SIZE]; + uchar *log_pos; + ha_checksum checksum_delta; + + /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_UPDATE share same header */ + lsn_store(log_data, info->trn->undo_lsn); + page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, + head_block->page); + dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE, + row_pos->rownr); + log_pos= (log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE); + store_checksum_in_rec(share, checksum_delta, + row->checksum - old_record_checksum, + log_pos, log_pos); + compile_time_assert(sizeof(ha_checksum) == HA_CHECKSUM_STORE_SIZE); + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + + if (!old_record) + { + /* Store undo_lsn in case we are aborting the insert */ + row->orig_undo_lsn= info->trn->undo_lsn; + /* Write UNDO log record for the INSERT */ + if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length, + TRANSLOG_INTERNAL_PARTS + 1, + log_array, + log_data + LSN_STORE_SIZE, &checksum_delta)) + goto disk_err; + } + else + { + /* Write UNDO log record for the UPDATE */ + size_t row_length, extents_length; + uint row_parts_count, cur_head_length; + + /* + Write head length and extents of 
the original row so that we + during UNDO can put it back in the original position. + We don't store size for TRANSID, as we don't write this during + UNDO. + */ + cur_head_length= (info->cur_row.head_length - + info->cur_row.header_length); + int2store(log_pos, cur_head_length); + pagerange_store(log_pos + 2, info->cur_row.extents_count); + log_pos+= 2 + PAGERANGE_STORE_SIZE; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length+= (2 + + PAGERANGE_STORE_SIZE); + info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str= + info->cur_row.extents; + info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length= + extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE; + + row_length= fill_update_undo_parts(info, old_record, record, + log_array + + TRANSLOG_INTERNAL_PARTS + 2, + &row_parts_count); + if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn, + info, + (translog_size_t) + (log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extents_length + + row_length), + TRANSLOG_INTERNAL_PARTS + 2 + + row_parts_count, + log_array, + log_data + LSN_STORE_SIZE, + &checksum_delta)) + goto disk_err; + } + } + } + /* Release not used space in used pages */ + if (_ma_bitmap_release_unused(info, bitmap_blocks)) + goto disk_err; + _ma_unpin_all_pages(info, lsn); + + if (tmp_data_used) + { + /* + Write data stored in info->rec_buff to pages + This is the char/varchar data that didn't fit into the head page. 
+ */ + DBUG_ASSERT(bitmap_blocks->count != 0); + if (write_full_pages(info, lsn, head_block + 1, + info->rec_buff, (ulong) (tmp_data - info->rec_buff))) + goto disk_err; + } + + /* Write rest of blobs (data, but no tails as they are already written) */ + for (; column < end_column; column++, blob_lengths++) + { + uchar *blob_pos; + uint length; + ulong blob_length; + if (!*blob_lengths) /* Null or "" */ + continue; + length= column->length - portable_sizeof_char_ptr; + memcpy_fixed((uchar*) &blob_pos, record + column->offset + length, + sizeof(char*)); + /* remove tail part */ + blob_length= *blob_lengths; + if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL) + blob_length-= (blob_length % FULL_PAGE_SIZE(block_size)); + + if (blob_length && write_full_pages(info, lsn, block, + blob_pos, blob_length)) + goto disk_err; + block+= block->sub_blocks; + } + + _ma_finalize_row(info); + DBUG_RETURN(0); + +crashed: + /* Something was wrong with data on page */ + my_errno= HA_ERR_WRONG_IN_RECORD; + +disk_err: + /** + @todo RECOVERY we are going to let dirty pages go to disk while we have + logged UNDO, this violates WAL. We must mark the table corrupted! + + @todo RECOVERY we have written some REDOs without a closing UNDO, + it's possible that a next operation by this transaction succeeds and then + Recovery would glue the "orphan REDOs" to the succeeded operation and + execute the failed REDOs. We need some mark "abort this group" in the + log, or mark the table corrupted (then user will repair it and thus REDOs + will be skipped). + + @todo RECOVERY to not let write errors go unnoticed, pagecache_write() + should take a MARIA_HA* in argument, and if it + fails when flushing a page to disk it should call + (*the_maria_ha->write_error_func)(the_maria_ha) + and this hook will mark the table corrupted. + Maybe hook should be stored in the pagecache's block structure, or in a + hash "file->maria_ha*". 
+ + @todo RECOVERY we should distinguish below between log write error and + table write error. The former should stop Maria immediately, the latter + should mark the table corrupted. + */ + /* + Unpin all pinned pages to not cause problems for disk cache. This is + safe to call even if we already called _ma_unpin_all_pages() above. + */ + save_my_errno= my_errno; + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); + my_errno= save_my_errno; + DBUG_RETURN(1); +} + + +/* + @brief Write a record + + @fn allocate_and_write_block_record() + @param info Maria handler + @param record Record to write + @param row Information about fields in 'record' + @param undo_lsn <> LSN_ERROR if we are executing an UNDO + + @return + @retval 0 ok + @retval 1 Error +*/ + +static my_bool allocate_and_write_block_record(MARIA_HA *info, + const uchar *record, + MARIA_ROW *row, + LSN undo_lsn) +{ + struct st_row_pos_info row_pos; + MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks; + int save_my_errno; + DBUG_ENTER("allocate_and_write_block_record"); + + _ma_bitmap_flushable(info, 1); + if (_ma_bitmap_find_place(info, row, blocks)) + goto err; /* Error reading bitmap */ + + /* + Sleep; a checkpoint will happen and should not send this over-allocated + bitmap to disk but rather wait. + */ + DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(10);); + + /* page will be pinned & locked by get_head_or_tail_page */ + if (get_head_or_tail_page(info, blocks->block, info->buff, + row->space_on_head_page, HEAD_PAGE, + PAGECACHE_LOCK_WRITE, &row_pos)) + goto err; + row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr); + if (info->s->calc_checksum) + { + if (undo_lsn == LSN_ERROR) + row->checksum= (info->s->calc_checksum)(info, record); + else + { + /* _ma_apply_undo_row_delete() already set row's checksum. Verify it. 
*/ + DBUG_ASSERT(row->checksum == (info->s->calc_checksum)(info, record)); + } + } + DBUG_PRINT("info", ("rowid: %lu (%lu:%u) length: %u", (ulong) row->lastpos, + (ulong) ma_recordpos_to_page(row->lastpos), + ma_recordpos_to_dir_entry(row->lastpos), + row_pos.length)); + if (write_block_record(info, (uchar*) 0, record, row, + blocks, blocks->block->org_bitmap_value != 0, + &row_pos, undo_lsn, 0)) + goto err; + /* Now let checkpoint happen but don't commit */ + DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(1000);); + DBUG_RETURN(0); + +err: + save_my_errno= my_errno; + if (info->non_flushable_state) + _ma_bitmap_flushable(info, -1); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); + my_errno= save_my_errno; + DBUG_RETURN(1); +} + + +/* + Write a record and return rowid for it + + SYNOPSIS + _ma_write_init_block_record() + info Maria handler + record Record to write + + NOTES + This is done BEFORE we write the keys to the row! + + RETURN + HA_OFFSET_ERROR Something went wrong + # Rowid for row +*/ + +MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info, + const uchar *record) +{ + DBUG_ENTER("_ma_write_init_block_record"); + + calc_record_size(info, record, &info->cur_row); + if (allocate_and_write_block_record(info, record, + &info->cur_row, LSN_ERROR)) + DBUG_RETURN(HA_OFFSET_ERROR); + DBUG_RETURN(info->cur_row.lastpos); +} + + +/* + Dummy function for (*info->s->write_record)() + + Nothing to do here, as we already wrote the record in + _ma_write_init_block_record() +*/ + +my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)), + const uchar *record __attribute__ ((unused))) +{ + return 0; /* Row already written */ +} + + +/** + @brief Remove row written by _ma_write_block_record() and log undo + + @param info Maria handler + + @note + This is called in case we got a duplicate unique key while + writing keys. 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool _ma_write_abort_block_record(MARIA_HA *info) +{ + my_bool res= 0; + MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks; + MARIA_BITMAP_BLOCK *block, *end; + LSN lsn= LSN_IMPOSSIBLE; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_write_abort_block_record"); + + _ma_bitmap_lock(share); /* Lock bitmap from other insert threads */ + if (delete_head_or_tail(info, + ma_recordpos_to_page(info->cur_row.lastpos), + ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1, + 0)) + res= 1; + for (block= blocks->block + 1, end= block + blocks->count - 1; block < end; + block++) + { + if (block->used & BLOCKUSED_USED) + { + if (block->used & BLOCKUSED_TAIL) + { + /* + block->page_count is set to the tail directory entry number in + write_block_record() + */ + if (delete_head_or_tail(info, block->page, + block->page_count & ~TAIL_BIT, + 0, 0)) + res= 1; + } + else + { + if (free_full_page_range(info, block->page, block->page_count)) + res= 1; + } + } + } + if (share->now_transactional) + { + if (_ma_write_clr(info, info->cur_row.orig_undo_lsn, + LOGREC_UNDO_ROW_INSERT, + share->calc_checksum != 0, + (ha_checksum) 0 - info->cur_row.checksum, + &lsn, (void*) 0)) + res= 1; + } + _ma_bitmap_unlock(share); + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); +} + + +/* + Update a record + + NOTES + For the moment, we assume that info->curr_row.extents is always updated + when a row is read. In the future we may decide to read this on demand + for rows split into many extents. 
+*/ + +static my_bool _ma_update_block_record2(MARIA_HA *info, + MARIA_RECORD_POS record_pos, + const uchar *oldrec, + const uchar *record, + LSN undo_lsn) +{ + MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks; + uchar *buff; + MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row; + MARIA_PINNED_PAGE page_link; + uint rownr, org_empty_size, head_length; + uint block_size= info->s->block_size; + uint errpos= 0; + uchar *dir; + pgcache_page_no_t page; + struct st_row_pos_info row_pos; + my_bool res; + ha_checksum old_checksum; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_update_block_record2"); + DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos)); + +#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE + DBUG_DUMP("oldrec", oldrec, share->base.reclength); + DBUG_DUMP("newrec", record, share->base.reclength); +#endif + + /* + Checksums of new and old rows were computed by callers already; new + row's was put into cur_row, old row's was put into new_row. + */ + old_checksum= new_row->checksum; + new_row->checksum= cur_row->checksum; + calc_record_size(info, record, new_row); + page= ma_recordpos_to_page(record_pos); + + _ma_bitmap_flushable(info, 1); + buff= pagecache_read(share->pagecache, + &info->dfile, (pgcache_page_no_t) page, 0, 0, + share->page_type, + PAGECACHE_LOCK_WRITE, &page_link.link); + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= buff != 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + if (!buff) + goto err; + + org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET); + rownr= ma_recordpos_to_dir_entry(record_pos); + dir= dir_entry_pos(buff, block_size, rownr); + + /* + We can't use cur_row->head_length as the block may have been compacted + since we read it. 
+ */ + head_length= uint2korr(dir + 2); + + if ((org_empty_size + head_length) >= new_row->total_length) + { + uint rec_offset, length; + MARIA_BITMAP_BLOCK block; + + DBUG_PRINT("info", ("org_empty_size: %u org_length: %u new_length: %lu", + org_empty_size, head_length, + new_row->total_length)); + + /* + We can fit the new row in the same page as the original head part + of the row + */ + block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap, + org_empty_size); + if (extend_area_on_page(info, buff, dir, rownr, block_size, + new_row->total_length, &org_empty_size, + &rec_offset, &length)) + { + errpos= 1; + goto err; + } + + row_pos.buff= buff; + row_pos.rownr= rownr; + row_pos.empty_space= org_empty_size; + row_pos.dir= dir; + row_pos.data= buff + rec_offset; + row_pos.length= length; + blocks->block= █ + blocks->count= 1; + block.page= page; + block.sub_blocks= 1; + block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP; + block.empty_space= row_pos.empty_space; + + if (*cur_row->tail_positions && + delete_tails(info, cur_row->tail_positions)) + { + errpos= 2; + goto err; + } + if (cur_row->extents_count && free_full_pages(info, cur_row)) + { + errpos= 3; + goto err; + } + res= write_block_record(info, oldrec, record, new_row, blocks, + 1, &row_pos, undo_lsn, old_checksum); + /* We can't update or delete this without re-reading it again */ + info->update&= ~HA_STATE_AKTIV; + DBUG_RETURN(res); + } + /* Delete old row */ + if (*cur_row->tail_positions && + delete_tails(info, cur_row->tail_positions)) + { + errpos= 4; + goto err; + } + if (cur_row->extents_count && free_full_pages(info, cur_row)) + { + errpos= 5; + goto err; + } + + head_length= uint2korr(dir + 2); + if (_ma_bitmap_find_new_place(info, new_row, page, head_length + + org_empty_size, blocks)) + { + errpos= 6; + goto err; + } + + /* + Allocate all size in block for record + TODO: + Need to improve this to do compact if we can fit one more blob into + the head page + */ + if ((head_length 
< new_row->space_on_head_page || + (new_row->total_length <= head_length && + org_empty_size + head_length >= new_row->total_length))) + { + _ma_compact_block_page(buff, block_size, rownr, 1, + info->trn->min_read_from, + share->base.min_block_length); + org_empty_size= 0; + head_length= uint2korr(dir + 2); + } + + row_pos.buff= buff; + row_pos.rownr= rownr; + row_pos.empty_space= org_empty_size + head_length; + row_pos.dir= dir; + row_pos.data= buff + uint2korr(dir); + row_pos.length= head_length; + if ((res= write_block_record(info, oldrec, record, new_row, blocks, 1, + &row_pos, undo_lsn, old_checksum))) + { + errpos= 7; + goto err; + } + DBUG_RETURN(0); + +err: + DBUG_PRINT("error", ("errpos: %d", errpos)); + if (info->non_flushable_state) + _ma_bitmap_flushable(info, -1); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); + DBUG_RETURN(1); +} + + +/* + @brief Store new row on it's original position + + @note + This is basicly a copy of _ma_update_block_record2 + When we have a purge thread for deleted row, we can remove this function + and use _ma_update_block_record2 instead. + + This is the main reason we don't make a lot of subfunctions that are + common between _ma_update_block_record2() and this function. 
+
+  Note: If something goes wrong we mark the file crashed
+*/
+
+static my_bool _ma_update_at_original_place(MARIA_HA *info,
+                                            pgcache_page_no_t page,
+                                            uint rownr,
+                                            uint length_on_head_page,
+                                            uint extent_count,
+                                            const uchar *extent_info,
+                                            const uchar *oldrec,
+                                            const uchar *record,
+                                            LSN undo_lsn)
+{
+  MARIA_BITMAP_BLOCKS *blocks;
+  MARIA_BITMAP_BLOCK *block;
+  MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
+  MARIA_PINNED_PAGE page_link;
+  MARIA_SHARE *share= info->s;
+  ha_checksum old_checksum;
+  uint org_empty_size, empty_size;
+  uint block_size= info->s->block_size;
+  uchar *dir, *buff;
+  struct st_row_pos_info row_pos;
+  my_bool res;
+  uint rec_offset, length;
+  DBUG_ENTER("_ma_update_at_original_place");
+
+#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
+  DBUG_DUMP("oldrec", oldrec, share->base.reclength);
+  DBUG_DUMP("newrec", record, share->base.reclength);
+#endif
+
+  /*
+    Checksums of new and old rows were computed by callers already; new
+    row's was put into cur_row, old row's was put into new_row.
+  */
+  old_checksum= new_row->checksum;
+  new_row->checksum= cur_row->checksum;
+  calc_record_size(info, record, new_row);
+
+  _ma_bitmap_flushable(info, 1);
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, (pgcache_page_no_t) page, 0, 0,
+                       share->page_type,
+                       PAGECACHE_LOCK_WRITE, &page_link.link);
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= buff != 0;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  if (!buff)
+    goto err;
+
+  org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
+  dir= dir_entry_pos(buff, block_size, rownr);
+
+  if ((org_empty_size + cur_row->head_length) < length_on_head_page)
+  {
+    DBUG_PRINT("error",
+               ("org_empty_size: %u  head_length: %u  length_on_page: %u",
+                org_empty_size, (uint) cur_row->head_length,
+                length_on_head_page));
+    my_errno= HA_ERR_WRONG_IN_RECORD; /* Requested head length can't fit */
+    goto err;
+  }
+
+  /*
+    We can fit the new row in the same page as the original head part
+    of the row
+  */
+  empty_size= org_empty_size;
+  if (extend_area_on_page(info, buff, dir, rownr, block_size,
+                          length_on_head_page, &empty_size,
+                          &rec_offset, &length))
+    goto err;
+
+  row_pos.buff= buff;
+  row_pos.rownr= rownr;
+  row_pos.empty_space= empty_size;
+  row_pos.dir= dir;
+  row_pos.data= buff + rec_offset;
+
+  /* Delete old row */
+  if (*cur_row->tail_positions &&
+      delete_tails(info, cur_row->tail_positions))
+    goto err;
+  if (cur_row->extents_count && free_full_pages(info, cur_row))
+    goto err;
+
+  /* Change extent information to be usable by write_block_record() */
+  blocks= &cur_row->insert_blocks;
+  if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
+    goto err;
+  block= blocks->block;
+  block->empty_space= row_pos.empty_space;
+  block->org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
+                                                         org_empty_size);
+  DBUG_ASSERT(block->org_bitmap_value ==
+              _ma_bitmap_get_page_bits(info, &info->s->bitmap, page));
+  block->used|= BLOCKUSED_USE_ORG_BITMAP;
+
+  /*
+    We have to use <= below as the new_row may be smaller than the original
+    row as the new row doesn't have transaction id
+  */
+
+  DBUG_ASSERT(blocks->count > 1 ||
+              max(new_row->total_length, share->base.min_block_length) <=
+              length_on_head_page);
+
+  /* Store same amount of data on head page as on original page */
+  row_pos.length= (length_on_head_page -
+                   (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
+  set_if_bigger(row_pos.length, share->base.min_block_length);
+  if ((res= write_block_record(info, oldrec, record, new_row, blocks,
+                               1, &row_pos, undo_lsn, old_checksum)))
+    goto err;
+  DBUG_RETURN(0);
+
+err:
+  _ma_mark_file_crashed(share);    /* Every failure above ends up here */
+  if (info->non_flushable_state)
+    _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Wrapper for _ma_update_block_record2() used by ma_update()
+
+  Forwards all arguments and passes LSN_ERROR as undo_lsn; the result of
+  _ma_update_block_record2() is returned unchanged.
+*/
+
+my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos,
+                                const uchar *orig_rec, const uchar *new_rec)
+{
+  return _ma_update_block_record2(info, record_pos, orig_rec, new_rec,
+                                  LSN_ERROR);
+}
+
+
+/*
+  Delete a directory entry
+
+  SYNOPSIS
+    delete_dir_entry()
+    buff		Page buffer
+    block_size		Block size
+    record_number	Record number to delete
+    empty_space_res	Out: empty space on page after delete. Set to
+                        block_size when the page became empty. Not set
+                        when -1 is returned.
+
+  NOTES
+    If the deleted entry is the last one in the directory, it and all
+    free entries directly following it in memory are removed from the
+    directory and unlinked from the free list. Otherwise the entry is
+    zeroed and put first in the directory free list.
+
+  RETURN
+    -1    Error on page (only checked when compiled with SANITY_CHECKS)
+    0     ok
+    1     Page is now empty
+*/
+
+static int delete_dir_entry(uchar *buff, uint block_size, uint record_number,
+                            uint *empty_space_res)
+{
+  uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
+  uint length, empty_space;
+  uchar *dir;
+  DBUG_ENTER("delete_dir_entry");
+
+#ifdef SANITY_CHECKS
+  if (record_number >= number_of_records ||
+      record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 -
+                        PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE))
+  {
+    DBUG_PRINT("error", ("record_number: %u  number_of_records: %u",
+                         record_number, number_of_records));
+
+    DBUG_RETURN(-1);
+  }
+#endif
+
+  check_directory(buff, block_size, 0, (uint) -1);
+  empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+  dir= dir_entry_pos(buff, block_size, record_number);
+  length= uint2korr(dir + 2);      /* Length of the data to be freed */
+
+  if (record_number == number_of_records - 1)
+  {
+    /* Delete this entry and all following free directory entries */
+    uchar *end= buff + block_size - PAGE_SUFFIX_SIZE;
+    number_of_records--;
+    dir+= DIR_ENTRY_SIZE;
+    empty_space+= DIR_ENTRY_SIZE;
+
+    /* Unlink and free the next empty ones */
+    while (dir < end && dir[0] == 0 && dir[1] == 0)
+    {
+      number_of_records--;
+      if (dir[2] == END_OF_DIR_FREE_LIST)
+        buff[DIR_FREE_OFFSET]= dir[3];   /* Entry was first in free list */
+      else
+      {
+        uchar *prev_entry= dir_entry_pos(buff, block_size, (uint) dir[2]);
+        DBUG_ASSERT(uint2korr(prev_entry) == 0 && prev_entry[3] ==
+                    number_of_records);
+        prev_entry[3]= dir[3];
+      }
+      if (dir[3] != END_OF_DIR_FREE_LIST)
+      {
+        uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
+        DBUG_ASSERT(uint2korr(next_entry) == 0 && next_entry[2] ==
+                    number_of_records);
+        next_entry[2]= dir[2];
+      }
+      dir+= DIR_ENTRY_SIZE;
+      empty_space+= DIR_ENTRY_SIZE;
+    }
+
+    if (number_of_records == 0)
+    {
+      /* All entries on page deleted */
+      DBUG_PRINT("info", ("Page marked as unallocated"));
+      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+      {
+        dir= dir_entry_pos(buff, block_size, record_number);
+        bzero(dir, (record_number+1) * DIR_ENTRY_SIZE);
+      }
+#endif
+      *empty_space_res= block_size;
+      DBUG_RETURN(1);
+    }
+    buff[DIR_COUNT_OFFSET]= (uchar) number_of_records;
+  }
+  else
+  {
+    /* Update directory */
+    dir[0]= dir[1]= 0;
+    dir[2]= END_OF_DIR_FREE_LIST;
+    if ((dir[3]= buff[DIR_FREE_OFFSET]) != END_OF_DIR_FREE_LIST)
+    {
+      /* Relink next entry to point to newly freed entry */
+      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
+      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
+                  next_entry[2] == END_OF_DIR_FREE_LIST);
+      next_entry[2]= record_number;
+    }
+    buff[DIR_FREE_OFFSET]= record_number;   /* New head of free list */
+  }
+  empty_space+= length;
+
+  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
+  buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED;
+
+  *empty_space_res= empty_space;
+
+  check_directory(buff, block_size, 0, empty_space);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Delete a head or a tail part
+
+  SYNOPSIS
+    delete_head_or_tail()
+    info                Maria handler
+    page                Page (not file offset!) on which the row is
+    record_number       Directory entry on the page to delete
+    head                1 if this is a head page
+    from_update		1 if we are called from update. In this case we
+			leave the page as write locked as we may put
+                        the new row into the old position.
+
+  NOTES
+    The page is added to info->pinned_pages. If the table is
+    transactional a REDO record is written: a purge-row record if the
+    page still has entries, a free-head-or-tail record if it became empty.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool delete_head_or_tail(MARIA_HA *info,
+                                   pgcache_page_no_t page, uint record_number,
+                                   my_bool head, my_bool from_update)
+{
+  MARIA_SHARE *share= info->s;
+  uint empty_space;
+  uint block_size= share->block_size;
+  uchar *buff;
+  LSN lsn;
+  MARIA_PINNED_PAGE page_link;
+  int res;
+  enum pagecache_page_lock lock_at_write, lock_at_unpin;
+  DBUG_ENTER("delete_head_or_tail");
+  DBUG_PRINT("enter", ("id: %lu (%lu:%u)",
+                       (ulong) ma_recordpos(page, record_number),
+                       (ulong) page, record_number));
+
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, page, 0, 0,
+                       share->page_type,
+                       PAGECACHE_LOCK_WRITE, &page_link.link);
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= buff != 0;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  if (!buff)
+    DBUG_RETURN(1);
+  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
+              (head ? HEAD_PAGE : TAIL_PAGE));
+
+  if (from_update)
+  {
+    lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+    lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK;
+  }
+  else
+  {
+    lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ;
+    lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK;
+  }
+
+  res= delete_dir_entry(buff, block_size, record_number, &empty_space);
+  if (res < 0)
+    DBUG_RETURN(1);
+  if (res == 0) /* after our deletion, page is still not empty */
+  {
+    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    if (share->now_transactional)
+    {
+      /* Log REDO data */
+      page_store(log_data + FILEID_STORE_SIZE, page);
+      dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+                   record_number);
+
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+      if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD :
+                                       LOGREC_REDO_PURGE_ROW_TAIL),
+                                info->trn, info,
+                                (translog_size_t) sizeof(log_data),
+                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                                log_data, NULL))
+        DBUG_RETURN(1);
+    }
+  }
+  else /* page is now empty */
+  {
+    if (share->now_transactional)
+    {
+      uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE];
+      LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+      page_store(log_data + FILEID_STORE_SIZE, page);
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+      if (translog_write_record(&lsn, LOGREC_REDO_FREE_HEAD_OR_TAIL,
+                                info->trn, info,
+                                (translog_size_t) sizeof(log_data),
+                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                                log_data, NULL))
+        DBUG_RETURN(1);
+    }
+    DBUG_ASSERT(empty_space >= share->bitmap.sizes[0]);
+  }
+
+  pagecache_unlock_by_link(share->pagecache, page_link.link,
+                           lock_at_write,
+                           PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
+                           LSN_IMPOSSIBLE, 1, FALSE);
+  page_link.unlock= lock_at_unpin;
+  /* Replace the entry pushed above (the last one) with the new unlock mode */
+  set_dynamic(&info->pinned_pages, (void*) &page_link,
+              info->pinned_pages.elements-1);
+
+  DBUG_PRINT("info", ("empty_space: %u", empty_space));
+
+  /*
+    If there is not enough space for all possible tails, mark the
+    page full
+  */
+  if (!head && !enough_free_entries(buff, share->block_size,
+                                    1 + share->base.blobs))
+    empty_space= 0;
+
+  DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space));
+}
+
+
+/*
+  delete all tails
+
+  SYNOPSIS
+    delete_tails()
+    info                Handler
+    tails               Pointer to vector of tail positions, ending with 0
+
+  NOTES
+    Each tail is deleted with delete_head_or_tail(head= 0, from_update= 1).
+    A failure is remembered but the remaining tails are still processed.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails)
+{
+  my_bool res= 0;
+  DBUG_ENTER("delete_tails");
+  for (; *tails; tails++)
+  {
+    if (delete_head_or_tail(info,
+                            ma_recordpos_to_page(*tails),
+                            ma_recordpos_to_dir_entry(*tails), 0, 1))
+      res= 1;
+  }
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Delete a record
+
+  NOTES
+    For the moment, we assume that info->cur_row.extents is always updated
+    when a row is read. In the future we may decide to read this on demand
+    for rows with many splits.
+*/
+
+my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
+{
+  pgcache_page_no_t page;
+  uint record_number;
+  MARIA_SHARE *share= info->s;
+  LSN lsn= LSN_IMPOSSIBLE;
+  DBUG_ENTER("_ma_delete_block_record");
+
+  page=          ma_recordpos_to_page(info->cur_row.lastpos);
+  record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos);
+  DBUG_PRINT("enter", ("rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
+                       (ulong) page, record_number));
+
+  _ma_bitmap_flushable(info, 1);
+  if (delete_head_or_tail(info, page, record_number, 1, 0) || /* head page */
+      delete_tails(info, info->cur_row.tail_positions))
+    goto err;
+
+  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
+    goto err;
+
+  if (share->now_transactional)
+  {
+    uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
+                   DIRPOS_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
+                   HA_CHECKSUM_STORE_SIZE];
+    uchar *log_pos;
+    size_t row_length;
+    uint row_parts_count, extents_length;
+    ha_checksum checksum_delta;
+
+    /* Write UNDO record */
+    lsn_store(log_data, info->trn->undo_lsn);
+    page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page);
+    log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+    dirpos_store(log_pos, record_number);
+    log_pos+= DIRPOS_STORE_SIZE;
+    int2store(log_pos, info->cur_row.head_length -
+              info->cur_row.header_length);
+    log_pos+= 2;
+    pagerange_store(log_pos, info->cur_row.extents_count);
+    log_pos+= PAGERANGE_STORE_SIZE;
+
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= log_data;
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length=
+      sizeof(log_data) - HA_CHECKSUM_STORE_SIZE; /* Without checksum bytes */
+    store_checksum_in_rec(share, checksum_delta,
+                          (ha_checksum) 0 - info->cur_row.checksum, log_pos,
+                          info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
+                                              0].length);
+    /* Second part: the extent list of the deleted row */
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
+      info->cur_row.extents;
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
+      extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;
+
+    /* Remaining parts are filled in from 'record' */
+    row_length= fill_insert_undo_parts(info, record,
+                                       (info->log_row_parts +
+                                        TRANSLOG_INTERNAL_PARTS + 2),
+                                       &row_parts_count);
+
+    if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn,
+                              info,
+                              (translog_size_t)
+                              (info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
+                                                   0].length + row_length +
+                               extents_length),
+                              TRANSLOG_INTERNAL_PARTS + 2 + row_parts_count,
+                              info->log_row_parts,
+                              log_data + LSN_STORE_SIZE,
+                              &checksum_delta))
+      goto err;
+  }
+
+  _ma_bitmap_flushable(info, -1);
+  /* lsn is LSN_IMPOSSIBLE if no UNDO record was written above */
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(0);
+
+err:
+  _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  DBUG_RETURN(1);
+}
+
+
+/****************************************************************************
+  Reading of records
+****************************************************************************/
+
+/*
+  Read position to record from record directory at end of page
+
+  SYNOPSIS
+   get_record_position()
+     buff                 page buffer
+     block_size           block size for page
+     record_number        Record number in index
+     end_of_data          pointer to end of data for record
+
+  RETURN
+    0  Error in data
+    #  Pointer to start of record.
+       In this case *end_of_data is set.
+*/
+
+static uchar *get_record_position(uchar *buff, uint block_size,
+                                  uint record_number, uchar **end_of_data)
+{
+  uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
+  uchar *dir;
+  uchar *data;
+  uint offset, length;
+
+#ifdef SANITY_CHECKS
+  if (record_number >= number_of_records ||
+      record_number > ((block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE) /
+                       DIR_ENTRY_SIZE))
+  {
+    DBUG_PRINT("error",
+               ("Wrong row number: record_number: %u  number_of_records: %u",
+                record_number, number_of_records));
+    return 0;
+  }
+#endif
+
+  dir= dir_entry_pos(buff, block_size, record_number);
+  offset= uint2korr(dir);          /* Start of row data, from start of page */
+  length= uint2korr(dir + 2);      /* Length of row data */
+#ifdef SANITY_CHECKS
+  if (offset < PAGE_HEADER_SIZE ||
+      offset + length > (block_size -
+                         number_of_records * DIR_ENTRY_SIZE -
+                         PAGE_SUFFIX_SIZE))
+  {
+    DBUG_PRINT("error",
+               ("Wrong row position:  record_number: %u  offset: %u  "
+                "length: %u  number_of_records: %u",
+                record_number, offset, length, number_of_records));
+    return 0;
+  }
+#endif
+  data= buff + offset;
+  *end_of_data= data + length;
+  return data;
+}
+
+
+/*
+  Init extent
+
+  SYNOPSIS
+    init_extent()
+    extent              Cursor to initialize
+    extent_info         Extent data; the first extent is decoded here
+    extents             Number of extents
+    tail_positions      Stored in the cursor; read_next_extent() stores
+                        the position of each tail it reads through it
+
+  NOTES
+    extent is a cursor over which pages to read
+
+    The page count word of the first extent is read with START_EXTENT_BIT
+    masked off. If TAIL_BIT is set, the extent is one tail page and the
+    remaining bits are the directory entry in that page; otherwise the
+    word is the number of pages in the extent.
+*/
+
+static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info,
+                        uint extents, MARIA_RECORD_POS *tail_positions)
+{
+  uint page_count;
+  extent->extent=       extent_info;
+  extent->extent_count= extents;
+  extent->page=         page_korr(extent_info);         /* First extent */
+  page_count=           (uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE) &
+                         ~START_EXTENT_BIT);
+  extent->tail=         page_count & TAIL_BIT;
+  if (extent->tail)
+  {
+    extent->page_count=   1;
+    extent->tail_row_nr=  page_count & ~TAIL_BIT;
+  }
+  else
+    extent->page_count=   page_count;
+  extent->tail_positions= tail_positions;
+  extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED;
+}
+
+
+/*
+  Read next extent
+
+  SYNOPSIS
+    read_next_extent()
+    info                Maria handler
+    extent              Pointer to current extent (this is updated to point
+                        to next)
+    end_of_data         Pointer to end of data in read block
+                        (out)
+
+  NOTES
+    New block is read into info->buff
+
+  RETURN
+    0   Error;  my_errno is set
+    #   Pointer to start of data in read block
+        In this case end_of_data is updated to point to end of data.
+*/
+
+static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
+                               uchar **end_of_data)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *buff, *data;
+  MARIA_PINNED_PAGE page_link;
+  enum pagecache_page_lock lock;
+  DBUG_ENTER("read_next_extent");
+
+  if (!extent->page_count)
+  {
+    /* Current extent is used up; step to the next one */
+    uint page_count;
+    if (!--extent->extent_count)
+      goto crashed;                     /* No more extents to read from */
+    extent->extent+=    ROW_EXTENT_SIZE;
+    extent->page=       page_korr(extent->extent);
+    page_count=         (uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE) &
+                         ~START_EXTENT_BIT);
+    if (!page_count)
+      goto crashed;
+    extent->tail=       page_count & TAIL_BIT;
+    if (extent->tail)
+      extent->tail_row_nr= page_count & ~TAIL_BIT;
+    else
+      extent->page_count= page_count;
+    DBUG_PRINT("info",("New extent.  Page: %lu  page_count: %u  tail_flag: %d",
+                       (ulong) extent->page, extent->page_count,
+                       extent->tail != 0));
+  }
+  extent->first_extent= 0;
+
+  lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
+  if (extent->tail)
+    lock= extent->lock_for_tail_pages;
+
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, extent->page, 0,
+                       info->buff, share->page_type,
+                       lock, &page_link.link);
+  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
+  {
+    /* Read during UNDO */
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    page_link.changed= buff != 0;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+  }
+  if (!buff)
+  {
+    /* check if we tried to read over end of file (ie: bad data in record) */
+    if ((extent->page + 1) * share->block_size >
+        share->state.state.data_file_length)
+      goto crashed;
+    DBUG_RETURN(0);           /* my_errno is left as the failed read set it */
+  }
+
+  if (!extent->tail)
+  {
+    /* Full data page */
+    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
+      goto crashed;
+    extent->page++;                             /* point to next page */
+    extent->page_count--;
+    *end_of_data= buff + share->block_size - PAGE_SUFFIX_SIZE;
+    info->cur_row.full_page_count++;            /* For maria_chk */
+    DBUG_RETURN(extent->data_start= buff + LSN_SIZE + PAGE_TYPE_SIZE);
+  }
+
+  /* Found tail */
+  if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
+    goto crashed;
+  *(extent->tail_positions++)= ma_recordpos(extent->page,
+                                            extent->tail_row_nr);
+  info->cur_row.tail_count++;                   /* For maria_chk */
+
+  if (!(data= get_record_position(buff, share->block_size,
+                                  extent->tail_row_nr,
+                                  end_of_data)))
+    goto crashed;
+  extent->data_start= data;
+  extent->page_count= 0;                        /* No more data in extent */
+  DBUG_RETURN(data);
+
+
+crashed:
+  my_errno= HA_ERR_WRONG_IN_RECORD;             /* File crashed */
+  DBUG_PRINT("error", ("wrong extent information"));
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Read data that may be split over many blocks
+
+  SYNOPSIS
+    read_long_data()
+    info                Maria handler
+    to                  Store result string here (this is allocated)
+    length              Number of bytes to copy to 'to'
+    extent              Pointer to current extent position
+    data                Current position in buffer
+    end_of_data         End of data in buffer
+
+  NOTES
+    When we have to read a new buffer, it's read into info->buff
+
+    read_long_data() handles the case where all requested bytes are in the
+    current buffer and otherwise calls read_long_data2(), which continues
+    reading from the following extents with read_next_extent().
+
+    An older version of this note said that the loop was implemented by
+    goto's instead of a for() loop as the code was notably smaller and
+    faster that way. The loop in read_long_data2() below is an endless
+    for(;;) that is left by returning when 'length' bytes have been copied
+    or by break when read_next_extent() fails.
+
+  RETURN
+    0   ok
+    1   error
+*/
+
+static my_bool read_long_data2(MARIA_HA *info, uchar *to, ulong length,
+                              MARIA_EXTENT_CURSOR *extent,
+                              uchar **data, uchar **end_of_data)
+{
+  uint left_length= (uint) (*end_of_data - *data);
+  DBUG_ENTER("read_long_data2");
+  DBUG_PRINT("enter", ("length: %lu  left_length: %u",
+                       length, left_length));
+  DBUG_ASSERT(*data <= *end_of_data);
+
+  /*
+    Fields are never split in middle. This means that if length > rest-of-data
+    we should start reading from the next extent.  The reason we may have
+    data left on the page is that if the fixed part of the row was less than
+    min_block_length the head block was extended to min_block_length.
+
+    This may change in the future, which is why we have the loop written
+    the way it's written.
+  */
+  if (extent->first_extent && length > left_length)
+  {
+    *end_of_data= *data;       /* Ignore what is left in the first block */
+    left_length= 0;
+  }
+
+  for(;;)
+  {
+    if (unlikely(left_length >= length))
+    {
+      memcpy(to, *data, length);
+      (*data)+= length;
+      DBUG_PRINT("info", ("left_length: %u", left_length - (uint) length));
+      DBUG_RETURN(0);
+    }
+    /* Copy what is left in this block and continue in the next extent */
+    memcpy(to, *data, left_length);
+    to+= left_length;
+    length-= left_length;
+    if (!(*data= read_next_extent(info, extent, end_of_data)))
+      break;
+    left_length= (uint) (*end_of_data - *data);
+  }
+  DBUG_RETURN(1);
+}
+
+static inline my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length,
+                              MARIA_EXTENT_CURSOR *extent,
+                              uchar **data, uchar **end_of_data)
+{
+  uint left_length= (uint) (*end_of_data - *data);
+  if (likely(left_length >= length))
+  {
+    /* Common case: everything is in the current buffer */
+    memcpy(to, *data, length);
+    (*data)+= length;
+    return 0;
+  }
+  return read_long_data2(info, to, length, extent, data, end_of_data);
+}
+
+
+/*
+  Read a record from page (helper function for _ma_read_block_record())
+
+  SYNOPSIS
+    _ma_read_block_record2()
+    info                Maria handler
+    record              Store record here
+    data                Start of head data for row
+    end_of_data         End of data for row
+
+  NOTES
+    The head page is already read by caller
+    Following data is updated in info->cur_row:
+
+    cur_row.head_length is set to size of entry in head block
+    cur_row.tail_positions is set to point to all tail blocks
+    cur_row.extents points to extents data
+    cur_row.extents_count contains number of extents
+    cur_row.empty_bits is set to empty bits
+    cur_row.field_lengths contains packed length of all fields
+    cur_row.blob_length contains total length of all blobs
+    cur_row.checksum contains checksum of read record.
+
+  RETURN
+    0   ok
+    #   Error code
+*/
+
+int _ma_read_block_record2(MARIA_HA *info, uchar *record,
+                           uchar *data, uchar *end_of_data)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *field_length_data, *blob_buffer, *start_of_data;
+  uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths;
+  my_bool found_blob= 0;
+  MARIA_EXTENT_CURSOR extent;
+  MARIA_COLUMNDEF *column, *end_column;
+  MARIA_ROW *cur_row= &info->cur_row;
+  DBUG_ENTER("_ma_read_block_record2");
+
+  LINT_INIT(field_length_data);
+  LINT_INIT(blob_buffer);
+
+  start_of_data= data;
+  flag= (uint) (uchar) data[0];        /* First byte of the row holds flags */
+  cur_null_bytes= share->base.original_null_bytes;
+  null_bytes=     share->base.null_bytes;
+  cur_row->head_length= (uint) (end_of_data - data);
+  cur_row->full_page_count= cur_row->tail_count= 0;
+  cur_row->blob_length= 0;
+  /* Number of bytes in header that we don't need to write during undo */
+  cur_row->header_length= total_header_size[(flag & PRECALC_HEADER_BITMASK)]-1;
+
+  if (flag & ROW_FLAG_TRANSID)
+  {
+    cur_row->trid= transid_korr(data+1);
+    if (!info->trn)
+      DBUG_RETURN(my_errno= HA_ERR_WRONG_IN_RECORD);    /* File crashed */
+    if (!trnman_can_read_from(info->trn, cur_row->trid))
+      DBUG_RETURN(my_errno= HA_ERR_ROW_NOT_VISIBLE);
+  }
+
+  /* Skip trans header (for now, until we have MVCC support) */
+  data+= cur_row->header_length + 1 ;
+  if (flag & ROW_FLAG_NULLS_EXTENDED)
+    cur_null_bytes+= data[-1];
+
+  row_extents= 0;
+  if (flag & ROW_FLAG_EXTENTS)
+  {
+    uint row_extent_size;
+    /*
+      Record is split over many data pages.
+      Get number of extents and first extent
+    */
+    get_key_length(row_extents, data);
+    cur_row->extents_count= row_extents;
+    row_extent_size= row_extents * ROW_EXTENT_SIZE;
+    if (cur_row->extents_buffer_length < row_extent_size &&
+        _ma_alloc_buffer(&cur_row->extents,
+                         &cur_row->extents_buffer_length,
+                         row_extent_size))
+      DBUG_RETURN(my_errno);
+    memcpy(cur_row->extents, data, ROW_EXTENT_SIZE);
+    data+= ROW_EXTENT_SIZE;
+    init_extent(&extent, cur_row->extents, row_extents,
+                cur_row->tail_positions);
+  }
+  else
+  {
+    /* Row has no extents: set up the cursor so no further page is read */
+    cur_row->extents_count= 0;
+    (*cur_row->tail_positions)= 0;
+    extent.page_count= 0;
+    extent.extent_count= 1;
+  }
+  extent.first_extent= 1;
+
+  field_lengths= 0;
+  if (share->base.max_field_lengths)
+  {
+    get_key_length(field_lengths, data);
+    cur_row->field_lengths_length= field_lengths;
+#ifdef SANITY_CHECKS
+    if (field_lengths > share->base.max_field_lengths)
+      goto err;
+#endif
+  }
+
+  if (share->calc_checksum)
+    cur_row->checksum= (uint) (uchar) *data++;
+  /* data now points on null bits */
+  memcpy(record, data, cur_null_bytes);
+  if (unlikely(cur_null_bytes != null_bytes))
+  {
+    /*
+      This only happens if we have added more NULL columns with
+      ALTER TABLE and are fetching an old, not yet modified old row
+    */
+    bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes));
+  }
+  data+= null_bytes;
+  /* We copy the empty bits to be able to use them for delete/update */
+  memcpy(cur_row->empty_bits, data, share->base.pack_bytes);
+  data+= share->base.pack_bytes;
+
+  /* TODO: Use field offsets, instead of just skipping them */
+  data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
+
+  /*
+    Read row extents (note that first extent was already read into
+    cur_row->extents above)
+  */
+  if (row_extents > 1)
+  {
+    if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE,
+                       (row_extents - 1) * ROW_EXTENT_SIZE,
+                       &extent, &data, &end_of_data))
+      DBUG_RETURN(my_errno);
+  }
+
+  /*
+    Data now points to start of fixed length field data that can't be null
+    or 'empty'. Note that these fields can't be split over blocks.
+  */
+  for (column= share->columndef,
+         end_column= column + share->base.fixed_not_null_fields;
+       column < end_column; column++)
+  {
+    uint column_length= column->length;
+    if (data + column_length > end_of_data &&
+        !(data= read_next_extent(info, &extent, &end_of_data)))
+      goto err;
+    memcpy(record + column->offset, data, column_length);
+    data+= column_length;
+  }
+
+  /* Read array of field lengths. This may be stored in several extents */
+  if (field_lengths)
+  {
+    field_length_data= cur_row->field_lengths;
+    if (read_long_data(info, field_length_data, field_lengths, &extent,
+                       &data, &end_of_data))
+      DBUG_RETURN(my_errno);
+  }
+
+  /* Read variable length data. Each of these may be split over many extents */
+  for (end_column= share->columndef + share->base.fields;
+       column < end_column; column++)   /* Continues after the fixed fields */
+  {
+    enum en_fieldtype type= column->type;
+    uchar *field_pos= record + column->offset;
+    /* First check if field is present in record */
+    if ((record[column->null_pos] & column->null_bit) ||
+        (cur_row->empty_bits[column->empty_pos] & column->empty_bit))
+    {
+      bfill(record + column->offset, column->fill_length,
+            type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+      continue;
+    }
+    switch (type) {
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_SKIP_PRESPACE:
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      if (data + column->length > end_of_data &&
+          !(data= read_next_extent(info, &extent, &end_of_data)))
+        goto err;
+      memcpy(field_pos, data, column->length);
+      data+= column->length;
+      break;
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      /* Char that is space filled */
+      uint length;
+      if (column->length <= 255)
+        length= (uint) (uchar) *field_length_data++;
+      else
+      {
+        length= uint2korr(field_length_data);
+        field_length_data+= 2;
+      }
+#ifdef SANITY_CHECKS
+      if (length > column->length)
+        goto err;
+#endif
+      if (read_long_data(info, field_pos, length, &extent, &data,
+                         &end_of_data))
+        DBUG_RETURN(my_errno);
+      bfill(field_pos + length, column->length - length, ' ');
+      break;
+    }
+    case FIELD_VARCHAR:
+    {
+      ulong length;
+      /* The stored length bytes are also copied to the start of the field */
+      if (column->length <= 256)
+      {
+        length= (uint) (uchar) (*field_pos++= *field_length_data++);
+      }
+      else
+      {
+        length= uint2korr(field_length_data);
+        field_pos[0]= field_length_data[0];
+        field_pos[1]= field_length_data[1];
+        field_pos+= 2;
+        field_length_data+= 2;
+      }
+#ifdef SANITY_CHECKS
+      if (length > column->length)
+        goto err;
+#endif
+      if (read_long_data(info, field_pos, length, &extent, &data,
+                         &end_of_data))
+        DBUG_RETURN(my_errno);
+      break;
+    }
+    case FIELD_BLOB:
+    {
+      uint column_size_length= column->length - portable_sizeof_char_ptr;
+      ulong blob_length= _ma_calc_blob_length(column_size_length,
+                                              field_length_data);
+
+      if (!found_blob)
+      {
+        /* Calculate total length for all blobs */
+        ulong blob_lengths= 0;
+        uchar *length_data= field_length_data;
+        MARIA_COLUMNDEF *blob_field= column;
+
+        found_blob= 1;
+        for (; blob_field < end_column; blob_field++)
+        {
+          uint size_length;
+          if ((record[blob_field->null_pos] & blob_field->null_bit) ||
+              (cur_row->empty_bits[blob_field->empty_pos] &
+               blob_field->empty_bit))
+            continue;
+          size_length= blob_field->length - portable_sizeof_char_ptr;
+          blob_lengths+= _ma_calc_blob_length(size_length, length_data);
+          length_data+= size_length;
+        }
+        cur_row->blob_length= blob_lengths;
+        DBUG_PRINT("info", ("Total blob length: %lu", blob_lengths));
+        if (_ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
+                             blob_lengths))
+          DBUG_RETURN(my_errno);
+        blob_buffer= info->blob_buff;
+      }
+
+      /* Store the blob length followed by a pointer into info->blob_buff */
+      memcpy(field_pos, field_length_data, column_size_length);
+      memcpy_fixed(field_pos + column_size_length, (uchar *) &blob_buffer,
+                   sizeof(char*));
+      field_length_data+= column_size_length;
+
+      /*
+        After we have read one extent, then each blob is in its own extent
+      */
+      if (!extent.first_extent || (ulong) (end_of_data - data) < blob_length)
+        end_of_data= data;                      /* Force read of next extent */
+
+      if (read_long_data(info, blob_buffer, blob_length, &extent, &data,
+                         &end_of_data))
+        DBUG_RETURN(my_errno);
+      blob_buffer+= blob_length;
+      break;
+    }
+    default:
+#ifdef EXTRA_DEBUG
+      DBUG_ASSERT(0);                           /* purecov: deadcode */
+#endif
+      goto err;
+    }
+    continue;
+  }
+
+  if (row_extents)
+  {
+    DBUG_PRINT("info", ("Row read:  page_count: %u  extent_count: %u",
+                        extent.page_count, extent.extent_count));
+    *extent.tail_positions= 0;                  /* End marker */
+    if (extent.page_count)
+      goto err;                         /* Pages left unread in last extent */
+    if (extent.extent_count > 1)
+    {
+      if (_ma_check_if_zero(extent.extent + ROW_EXTENT_SIZE,
+                            (extent.extent_count-1) * ROW_EXTENT_SIZE))
+      {
+        DBUG_PRINT("error", ("Data in extent is not zero"));
+        DBUG_DUMP("extent", extent.extent + ROW_EXTENT_SIZE,
+                  (extent.extent_count-1) * ROW_EXTENT_SIZE);
+        goto err;
+      }
+    }
+  }
+  else
+  {
+    DBUG_PRINT("info", ("Row read"));
+    /*
+      data should normally point to end_of_data. The only exception is if
+      the row is very short in which case we allocated 'min_block_length' data
+      for allowing the row to expand.
+    */
+    if (data != end_of_data && (uint) (end_of_data - start_of_data) >
+        share->base.min_block_length)
+      goto err;
+  }
+#ifdef EXTRA_DEBUG
+  if (share->calc_checksum)
+  {
+    /* Ensure that row checksum is correct */
+    DBUG_ASSERT(((share->calc_checksum)(info, record) & 255) ==
+                cur_row->checksum);
+  }
+#endif
+  info->update|= HA_STATE_AKTIV;	/* We have an active record */
+  DBUG_RETURN(0);
+
+err:
+  /* Something was wrong with data on record */
+  DBUG_PRINT("error", ("Found record with wrong data"));
+  DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
+}
+
+
+/** @brief Read positions to tail blocks and full blocks
+
+  @fn    read_row_extent_info()
+  @param info	Handler
+
+  @notes
+    This function is a simpler version of _ma_read_block_record2()
+    The data about the used pages is stored in info->cur_row.
+
+  @return Status
+  @retval 0   ok
+  @retval 1   Error. my_errno contains error number
+*/
+
+static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff,
+                                    uint record_number)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_EXTENT_CURSOR extent;
+  MARIA_RECORD_POS *tail_pos;
+  uchar *data, *end_of_data;
+  uint flag, row_extents, row_extents_size, field_lengths;
+  uchar *extents, *end;
+  DBUG_ENTER("read_row_extent_info");
+
+  if (!(data= get_record_position(buff, share->block_size,
+                                  record_number, &end_of_data)))
+    DBUG_RETURN(1);                             /* Wrong in record */
+
+  flag= (uint) (uchar) data[0];
+  /* Skip trans header */
+  data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];
+
+  row_extents= 0;
+  row_extents_size= 0;
+  if (flag & ROW_FLAG_EXTENTS)
+  {
+    /*
+      Record is split over many data pages.
+ Get number of extents and first extent + */ + get_key_length(row_extents, data); + row_extents_size= row_extents * ROW_EXTENT_SIZE; + if (info->cur_row.extents_buffer_length < row_extents_size && + _ma_alloc_buffer(&info->cur_row.extents, + &info->cur_row.extents_buffer_length, + row_extents_size)) + DBUG_RETURN(1); + memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE); + data+= ROW_EXTENT_SIZE; + init_extent(&extent, info->cur_row.extents, row_extents, + info->cur_row.tail_positions); + extent.first_extent= 1; + } + info->cur_row.extents_count= row_extents; + + if (share->base.max_field_lengths) + get_key_length(field_lengths, data); + + if (share->calc_checksum) + info->cur_row.checksum= (uint) (uchar) *data++; + if (row_extents > 1) + { + data+= share->base.null_bytes; + data+= share->base.pack_bytes; + data+= share->base.field_offsets * FIELD_OFFSET_SIZE; + + /* + Read row extents (note that first extent was already read into + info->cur_row.extents above) + Lock tails with write lock as we will delete them later. 
+ */ + extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED; + if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE, + row_extents_size - ROW_EXTENT_SIZE, + &extent, &data, &end_of_data)) + DBUG_RETURN(1); + } + + /* Update tail_positions with pointer to tails */ + tail_pos= info->cur_row.tail_positions; + for (extents= info->cur_row.extents, end= extents + row_extents_size; + extents < end; + extents+= ROW_EXTENT_SIZE) + { + pgcache_page_no_t page= uint5korr(extents); + uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE); + if (page_count & TAIL_BIT) + *(tail_pos++)= ma_recordpos(page, (page_count & ~ (TAIL_BIT | + START_EXTENT_BIT))); + } + *tail_pos= 0; /* End marker */ + DBUG_RETURN(0); +} + + +/* + Read a record based on record position + + @fn _ma_read_block_record() + @param info Maria handler + @param record Store record here + @param record_pos Record position + + @return Status + @retval 0 ok + @retval # Error number +*/ + +int _ma_read_block_record(MARIA_HA *info, uchar *record, + MARIA_RECORD_POS record_pos) +{ + MARIA_SHARE *share= info->s; + uchar *data, *end_of_data, *buff; + uint offset; + uint block_size= share->block_size; + DBUG_ENTER("_ma_read_block_record"); + DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u", + (ulong) record_pos, + (ulong) ma_recordpos_to_page(record_pos), + ma_recordpos_to_dir_entry(record_pos))); + + offset= ma_recordpos_to_dir_entry(record_pos); + + if (!(buff= pagecache_read(share->pagecache, + &info->dfile, ma_recordpos_to_page(record_pos), 0, + info->buff, share->page_type, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + DBUG_RETURN(my_errno); + DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == HEAD_PAGE); + if (!(data= get_record_position(buff, block_size, offset, &end_of_data))) + { + DBUG_PRINT("error", ("Wrong directory entry in data block")); + my_errno= HA_ERR_RECORD_DELETED; /* File crashed */ + DBUG_RETURN(HA_ERR_RECORD_DELETED); + } + DBUG_RETURN(_ma_read_block_record2(info, record, 
data, end_of_data)); +} + + +/* compare unique constraint between stored rows */ + +my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos) +{ + uchar *org_rec_buff, *old_record; + size_t org_rec_buff_size; + int error; + DBUG_ENTER("_ma_cmp_block_unique"); + + if (!(old_record= my_alloca(info->s->base.reclength))) + DBUG_RETURN(1); + + /* Don't let the compare destroy blobs that may be in use */ + org_rec_buff= info->rec_buff; + org_rec_buff_size= info->rec_buff_size; + if (info->s->base.blobs) + { + /* Force realloc of record buffer*/ + info->rec_buff= 0; + info->rec_buff_size= 0; + } + error= _ma_read_block_record(info, old_record, pos); + if (!error) + error= _ma_unique_comp(def, record, old_record, def->null_are_equal); + if (info->s->base.blobs) + { + my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + info->rec_buff= org_rec_buff; + info->rec_buff_size= org_rec_buff_size; + } + DBUG_PRINT("exit", ("result: %d", error)); + my_afree(old_record); + DBUG_RETURN(error != 0); +} + + +/**************************************************************************** + Table scan +****************************************************************************/ + +/* + Allocate buffers for table scan + + SYNOPSIS + _ma_scan_init_block_record(MARIA_HA *info) + + IMPLEMENTATION + We allocate one buffer for the current bitmap and one buffer for the + current page + + RETURN + 0 ok + 1 error (couldn't allocate memory or disk error) +*/ + +my_bool _ma_scan_init_block_record(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_scan_init_block_record"); + /* + bitmap_buff may already be allocated if this is the second call to + rnd_init() without a rnd_end() in between, see sql/handler.h + */ + if (!(info->scan.bitmap_buff || + ((info->scan.bitmap_buff= + (uchar *) my_malloc(share->block_size * 2, MYF(MY_WME)))))) + DBUG_RETURN(1); + info->scan.page_buff= info->scan.bitmap_buff + share->block_size; + 
info->scan.bitmap_end= info->scan.bitmap_buff + share->bitmap.total_size; + + /* Set scan variables to get _ma_scan_block() to start with reading bitmap */ + info->scan.number_of_rows= 0; + info->scan.bitmap_pos= info->scan.bitmap_end; + info->scan.bitmap_page= (pgcache_page_no_t) 0 - share->bitmap.pages_covered; + info->scan.max_page= share->state.state.data_file_length / share->block_size; + /* + We need to flush what's in memory (bitmap.map) to page cache otherwise, as + we are going to read bitmaps from page cache in table scan (see + _ma_scan_block_record()), we may miss recently inserted rows (bitmap page + in page cache would be too old). + */ + DBUG_RETURN(_ma_bitmap_flush(info->s)); +} + + +/* Free buffers allocated by _ma_scan_block_init() */ + +void _ma_scan_end_block_record(MARIA_HA *info) +{ + DBUG_ENTER("_ma_scan_end_block_record"); + my_free(info->scan.bitmap_buff, MYF(MY_ALLOW_ZERO_PTR)); + info->scan.bitmap_buff= 0; + if (info->scan_save) + { + my_free(info->scan_save, MYF(0)); + info->scan_save= 0; + } + DBUG_VOID_RETURN; +} + + +/** + @brief Save current scan position + + @note + For the moment we can only remember one position, but this is + good enough for MySQL usage + + @Warning + When this function is called, we assume that the thread is not deleting + or updating the current row before ma_scan_restore_block_record() + is called! 
+ + @return + @retval 0 ok + @retval HA_ERR_WRONG_IN_RECORD Could not allocate memory to hold position +*/ + +int _ma_scan_remember_block_record(MARIA_HA *info, + MARIA_RECORD_POS *lastpos) +{ + uchar *bitmap_buff; + DBUG_ENTER("_ma_scan_remember_block_record"); + if (!(info->scan_save)) + { + if (!(info->scan_save= my_malloc(ALIGN_SIZE(sizeof(*info->scan_save)) + + info->s->block_size * 2, + MYF(MY_WME)))) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + info->scan_save->bitmap_buff= ((uchar*) info->scan_save + + ALIGN_SIZE(sizeof(*info->scan_save))); + } + /* Point to the last read row */ + *lastpos= info->cur_row.nextpos - 1; + info->scan.dir+= DIR_ENTRY_SIZE; + + /* Remember used bitmap and used head page */ + bitmap_buff= info->scan_save->bitmap_buff; + memcpy(info->scan_save, &info->scan, sizeof(*info->scan_save)); + info->scan_save->bitmap_buff= bitmap_buff; + memcpy(bitmap_buff, info->scan.bitmap_buff, info->s->block_size * 2); + DBUG_RETURN(0); +} + + +/** + @brief restore scan block it's original values + + @note + In theory we could swap bitmap buffers instead of copy them. + For the moment we don't do that because there are variables pointing + inside the buffers and it's a bit of hassle to either make them relative + or repoint them. 
+*/ + +void _ma_scan_restore_block_record(MARIA_HA *info, + MARIA_RECORD_POS lastpos) +{ + uchar *bitmap_buff; + DBUG_ENTER("_ma_scan_restore_block_record"); + + info->cur_row.nextpos= lastpos; + bitmap_buff= info->scan.bitmap_buff; + memcpy(&info->scan, info->scan_save, sizeof(*info->scan_save)); + info->scan.bitmap_buff= bitmap_buff; + memcpy(bitmap_buff, info->scan_save->bitmap_buff, info->s->block_size * 2); + + DBUG_VOID_RETURN; +} + + +/* + Read next record while scanning table + + SYNOPSIS + _ma_scan_block_record() + info Maria handler + record Store found here + record_pos Value stored in info->cur_row.next_pos after last call + skip_deleted + + NOTES + - One must have called mi_scan() before this + - In this version, we don't actually need record_pos, we as easily + use a variable in info->scan + + IMPLEMENTATION + Current code uses a lot of goto's to separate the different kind of + states we may be in. This gives us a minimum of executed if's for + the normal cases. I tried several different ways to code this, but + the current one was in the end the most readable and fastest. 

  RETURN
    0   ok
    #   Error code. Values produced in this function are
        HA_ERR_END_OF_FILE (no more pages to scan) and
        HA_ERR_WRONG_IN_RECORD (bad page header or failed sanity check);
        a failed pagecache_read() returns my_errno, and any result of
        _ma_read_block_record2() other than HA_ERR_ROW_NOT_VISIBLE is
        passed through unchanged.
*/

int _ma_scan_block_record(MARIA_HA *info, uchar *record,
                          MARIA_RECORD_POS record_pos,
                          my_bool skip_deleted __attribute__ ((unused)))
{
  uint block_size;
  my_off_t filepos;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_scan_block_record");

restart_record_read:
  /* Find next row in current page */
  while (likely(record_pos < info->scan.number_of_rows))
  {
    uint length, offset;
    uchar *data, *end_of_data;
    int error;

    /*
      Step over directory entries whose offset is 0 (presumably unused or
      deleted slots - confirm against the page directory format). The
      directory is walked towards lower addresses.
    */
    while (!(offset= uint2korr(info->scan.dir)))
    {
      info->scan.dir-= DIR_ENTRY_SIZE;
      record_pos++;
#ifdef SANITY_CHECKS
      if (info->scan.dir < info->scan.dir_end)
      {
        DBUG_ASSERT(0);
        goto err;
      }
#endif
    }
    /* found row */
    info->cur_row.lastpos= info->scan.row_base_page + record_pos;
    info->cur_row.nextpos= record_pos + 1;
    data= info->scan.page_buff + offset;
    /* The row length is stored 2 bytes into the directory entry */
    length= uint2korr(info->scan.dir + 2);
    end_of_data= data + length;
    info->scan.dir-= DIR_ENTRY_SIZE;      /* Point to previous row */
#ifdef SANITY_CHECKS
    if (end_of_data > info->scan.dir_end ||
        offset < PAGE_HEADER_SIZE || length < share->base.min_block_length)
    {
      DBUG_ASSERT(!(end_of_data > info->scan.dir_end));
      DBUG_ASSERT(!(offset < PAGE_HEADER_SIZE));
      DBUG_ASSERT(!(length < share->base.min_block_length));
      goto err;
    }
#endif
    DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
    error= _ma_read_block_record2(info, record, data, end_of_data);
    /* A row reported as not visible is skipped; anything else is returned */
    if (error != HA_ERR_ROW_NOT_VISIBLE)
      DBUG_RETURN(error);
    record_pos++;
  }

  /* Find next head page in current bitmap */
restart_bitmap_scan:
  block_size= share->block_size;
  if (likely(info->scan.bitmap_pos < info->scan.bitmap_end))
  {
    uchar *data=    info->scan.bitmap_pos;
    longlong bits= info->scan.bits;
    uint bit_pos=  info->scan.bit_pos;

    do
    {
      /*
        'bits' holds the not yet examined part of one 6 byte bitmap group;
        it is consumed 3 bits (one page pattern) at a time.
      */
      while (likely(bits))
      {
        uint pattern= (uint) (bits & 7);
        bits >>= 3;
        bit_pos++;
        if (pattern > 0 && pattern <= 4)
        {
          /* Found head page; Read it */
          pgcache_page_no_t page;
          /* Save the bitmap position so the next call continues from here */
          info->scan.bitmap_pos= data;
          info->scan.bits= bits;
          info->scan.bit_pos= bit_pos;
          /*
            Each 6 byte group describes 16 pages (48 bits / 3 bits per page).
            bit_pos was already incremented above, hence the '- 1'.
          */
          page= (info->scan.bitmap_page + 1 +
                 (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1);
          info->scan.row_base_page= ma_recordpos(page, 0);
          if (page >= info->scan.max_page)
          {
            DBUG_PRINT("info", ("Found end of file"));
            DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
          }
          if (!(pagecache_read(share->pagecache,
                               &info->dfile,
                               page, 0, info->scan.page_buff,
                               share->page_type,
                               PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
            DBUG_RETURN(my_errno);
          if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) !=
               HEAD_PAGE))
          {
            /*
              This may happen if someone has been deleting all rows
              from a page since we read the bitmap, so it may be ok.
              Print warning in debug log and continue.
            */
            DBUG_PRINT("warning",
                       ("Found page of type %d when expecting head page",
                        (info->scan.page_buff[PAGE_TYPE_OFFSET] &
                         PAGE_TYPE_MASK)));
            /* Go on with the next pattern in 'bits' */
            continue;
          }
          if ((info->scan.number_of_rows=
               (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0)
          {
            DBUG_PRINT("error", ("Wrong page header"));
            DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
          }
          DBUG_PRINT("info", ("Page %lu has %u rows",
                              (ulong) page, info->scan.number_of_rows));
          /*
            'dir' starts at the directory entry placed directly before the
            page suffix; 'dir_end' is the entry number_of_rows - 1 entries
            below it.
          */
          info->scan.dir= (info->scan.page_buff + block_size -
                           PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
          info->scan.dir_end= (info->scan.dir -
                               (info->scan.number_of_rows - 1) *
                               DIR_ENTRY_SIZE);
          record_pos= 0;
          goto restart_record_read;
        }
      }
      /* Current group is exhausted; find the next group worth examining */
      for (data+= 6; data < info->scan.bitmap_end; data+= 6)
      {
        bits= uint6korr(data);
        /* Skip not allocated pages and blob / full tail pages */
        if (bits && bits != LL(07777777777777777))
          break;
      }
      bit_pos= 0;
    } while (data < info->scan.bitmap_end);
  }

  /* Read next bitmap */
  info->scan.bitmap_page+= share->bitmap.pages_covered;
  filepos= (my_off_t) info->scan.bitmap_page * block_size;
  if (unlikely(filepos >= share->state.state.data_file_length))
  {
    DBUG_PRINT("info", ("Found end of file"));
    DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
  }
  DBUG_PRINT("info", ("Reading bitmap at %lu",
                      (ulong) info->scan.bitmap_page));
  if (!(pagecache_read(share->pagecache, &info->s->bitmap.file,
                       info->scan.bitmap_page,
                       0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE,
                       PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
    DBUG_RETURN(my_errno);
  /*
    Skip scanning 'bits' in bitmap scan code: with bits == 0 the pattern
    loop is not entered, and the 'data+= 6' of the group search then lands
    on the first group of the newly read bitmap.
  */
  info->scan.bitmap_pos= info->scan.bitmap_buff - 6;
  info->scan.bits= 0;
  goto restart_bitmap_scan;

err:
  DBUG_PRINT("error", ("Wrong data on page"));
  DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
}


/*
  Compare a row against a stored one

  NOTES
    Not implemented, as block record is not supposed to be used in a shared
    global environment

  RETURN
    Always 0; both arguments are ignored
*/

my_bool _ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)),
                                 const uchar *record __attribute__ ((unused)))
{
  return 0;
}


/*
  Store an integer with simple packing

  SYNOPSIS
    ma_store_length()
    to                  Store the packed integer here
    nr                  Integer to store

  NOTES
    This is mostly used to store field numbers and lengths of strings.
    We have to cast the result for the LL() because of a bug in Forte CC
    compiler.

    Packing used is:
    nr < 251 is stored as is (in 1 byte)
    Numbers that require 1-4 bytes are stored as char(250+byte_length), data
    Bigger numbers are stored as 255, data as ulonglong (not yet done).
+ + RETURN + Position in 'to' after the packed length +*/ + +uchar *ma_store_length(uchar *to, ulong nr) +{ + if (nr < 251) + { + *to=(uchar) nr; + return to+1; + } + if (nr < 65536) + { + if (nr <= 255) + { + to[0]= (uchar) 251; + to[1]= (uchar) nr; + return to+2; + } + to[0]= (uchar) 252; + int2store(to+1, nr); + return to+3; + } + if (nr < 16777216) + { + *to++= (uchar) 253; + int3store(to, nr); + return to+3; + } + *to++= (uchar) 254; + int4store(to, nr); + return to+4; +} + + +/* Calculate how many bytes needed to store a number */ + +uint ma_calc_length_for_store_length(ulong nr) +{ + if (nr < 251) + return 1; + if (nr < 65536) + { + if (nr <= 255) + return 2; + return 3; + } + if (nr < 16777216) + return 4; + return 5; +} + + +/* Retrive a stored number */ + +static ulong ma_get_length(const uchar **packet) +{ + reg1 const uchar *pos= *packet; + if (*pos < 251) + { + (*packet)++; + return (ulong) *pos; + } + if (*pos == 251) + { + (*packet)+= 2; + return (ulong) pos[1]; + } + if (*pos == 252) + { + (*packet)+= 3; + return (ulong) uint2korr(pos+1); + } + if (*pos == 253) + { + (*packet)+= 4; + return (ulong) uint3korr(pos+1); + } + DBUG_ASSERT(*pos == 254); + (*packet)+= 5; + return (ulong) uint4korr(pos+1); +} + + +/* + Fill array with pointers to field parts to be stored in log for insert + + SYNOPSIS + fill_insert_undo_parts() + info Maria handler + record Inserted row + log_parts Store pointers to changed memory areas here + log_parts_count See RETURN + + NOTES + We have information in info->cur_row about the read row. + + RETURN + length of data in log_parts. 
    log_parts_count contains number of used log_parts
*/

static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column, *end_column;
  uchar *field_lengths= info->cur_row.field_lengths;
  size_t row_length;
  MARIA_ROW *cur_row= &info->cur_row;
  LEX_CUSTRING *start_log_parts;
  DBUG_ENTER("fill_insert_undo_parts");

  /* Remembered so that the number of used parts can be computed at the end */
  start_log_parts= log_parts;

  /* Store null bits */
  log_parts->str=      record;
  log_parts->length=   share->base.null_bytes;
  row_length=          log_parts->length;
  log_parts++;

  /* Stored bitmap over packed (zero length or all-zero fields) */
  log_parts->str= info->cur_row.empty_bits;
  log_parts->length= share->base.pack_bytes;
  row_length+=       log_parts->length;
  log_parts++;

  if (share->base.max_field_lengths)
  {
    /*
      Store length of all not empty char, varchar and blob fields.
      The total length is written into the 2 bytes directly in front of
      the field_lengths array, and the part covers those 2 bytes plus the
      array itself.
    */
    log_parts->str= field_lengths - 2;
    log_parts->length=   info->cur_row.field_lengths_length+2;
    int2store(log_parts->str, info->cur_row.field_lengths_length);
    row_length+= log_parts->length;
    log_parts++;
  }

  if (share->base.blobs)
  {
    /*
      Store total blob length to make buffer allocation easier during UNDO
      (packed with ma_store_length() into info->length_buff; the part length
      is the number of bytes the packed value occupies)
     */
    log_parts->str=  info->length_buff;
    log_parts->length= (uint) (ma_store_length(info->length_buff,
                                                 info->cur_row.blob_length) -
                                 (uchar*) log_parts->str);
    row_length+=          log_parts->length;
    log_parts++;
  }

  /* Handle constant length fields that are always present */
  for (column= share->columndef,
       end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    log_parts->str= record + column->offset;
    log_parts->length= column->length;
    row_length+= log_parts->length;
    log_parts++;
  }

  /*
    Handle NULL fields and CHAR/VARCHAR fields
    ('column' continues from where the previous loop stopped; the last
    share->base.blobs columns are left for the blob loop below)
  */
  for (end_column= share->columndef + share->base.fields - share->base.blobs;
       column < end_column;
       column++)
  {
    const uchar *column_pos;
    size_t column_length;
    /* NULL or empty columns get no log part and use no field length entry */
    if ((record[column->null_pos] & column->null_bit) ||
        cur_row->empty_bits[column->empty_pos] & column->empty_bit)
      continue;

    column_pos=    record+ column->offset;
    column_length= column->length;

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Used length is taken from field_lengths: 1 or 2 bytes per entry */
      if (column->length <= 255)
        column_length= *field_lengths++;
      else
      {
        column_length= uint2korr(field_lengths);
        field_lengths+= 2;
      }
      break;
    }
    case FIELD_VARCHAR:
    {
      if (column->fill_length == 1)
        column_length= *field_lengths;
      else
        column_length= uint2korr(field_lengths);
      field_lengths+= column->fill_length;
      /* Skip the length prefix in the record; only the data is logged */
      column_pos+= column->fill_length;
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
    log_parts->str= column_pos;
    log_parts->length= column_length;
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Add blobs */
  for (end_column+= share->base.blobs; column < end_column; column++)
  {
    const uchar *field_pos= record + column->offset;
    uint size_length= column->length - portable_sizeof_char_ptr;
    ulong blob_length= _ma_calc_blob_length(size_length, field_pos);

    /*
      We don't have to check for null, as blob_length is guaranteed to be 0
      if the blob is null
    */
    if (blob_length)
    {
      uchar *blob_pos;
      /* The pointer to the blob data follows the length bytes in the record */
      memcpy_fixed(&blob_pos, record + column->offset + size_length,
                   sizeof(blob_pos));
      log_parts->str= blob_pos;
      log_parts->length= blob_length;
      row_length+= log_parts->length;
      log_parts++;
    }
  }
  *log_parts_count= (uint) (log_parts - start_log_parts);
  DBUG_RETURN(row_length);
}


/*
   Fill array with pointers to field parts to be stored in log for update

  SYNOPSIS
    fill_update_undo_parts()
    info                Maria handler
    oldrec		Original row
    newrec              New row
    log_parts           Store pointers to changed memory areas here
    log_parts_count     See RETURN

  IMPLEMENTATION
    Format of undo record:

    Fields are stored in same order as the field array.

    Offset to changed field data (packed)

    For each changed field
      Fieldnumber (packed)
      Length, if variable length field (packed)

    For each changed field
     Data

   Packing is using ma_store_length()

   The reason we store field numbers & length separated from data (ie, not
   after each other) is to get better cpu caching when we loop over
   fields (as we probably don't have to access data for each field when we
   want to read an old row through the undo log record).

   As a special case, we use '255' for the field number of the null bitmap.

  RETURN
    length of data in log_parts.
    log_parts_count contains number of used log_parts
*/

static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
                                     const uchar *newrec,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column, *end_column;
  MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row;
  uchar *field_data, *start_field_data, *length_str;
  uchar *old_field_lengths= old_row->field_lengths;
  uchar *new_field_lengths= new_row->field_lengths;
  size_t row_length= 0;
  uint field_lengths;
  LEX_CUSTRING *start_log_parts;
  my_bool new_column_is_empty;
  DBUG_ENTER("fill_update_undo_parts");

  start_log_parts= log_parts;

  /*
    First log part is for number of fields, field numbers and lengths
    The +4 is to reserve place for the number of changed fields.
    (The first part itself is filled in at the end of the function, when
    the size of the field number/length data is known.)
    NOTE(review): ma_calc_length_for_store_length() returns 5 for values
    >= 16777216, so the 4 reserved bytes rely on the field data staying
    below that size - confirm.
  */
  start_field_data= field_data= info->update_field_data + 4;
  log_parts++;

  if (memcmp(oldrec, newrec, share->base.null_bytes))
  {
    /* Store changed null bits */
    *field_data++=       (uchar) 255;           /* Special case */
    log_parts->str=      oldrec;
    log_parts->length=   share->base.null_bytes;
    row_length=          log_parts->length;
    log_parts++;
  }

  /* Handle constant length fields */
  for (column= share->columndef,
       end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    if (memcmp(oldrec + column->offset, newrec + column->offset,
               column->length))
    {
      /* Only the field number is stored; the length is given by the column */
      field_data= ma_store_length(field_data,
                                  (uint) (column - share->columndef));
      log_parts->str=      oldrec + column->offset;
      log_parts->length=   column->length;
      row_length+=         column->length;
      log_parts++;
    }
  }

  /* Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's */
  for (end_column= share->columndef + share->base.fields;
       column < end_column;
       column++)
  {
    const uchar *new_column_pos, *old_column_pos;
    size_t new_column_length, old_column_length;

    /* First check if old column is null or empty */
    if (oldrec[column->null_pos] & column->null_bit)
    {
      /*
        It's safe to skip this one as either the new column is also null
        (no change) or the new_column is not null, in which case the null-bit
        maps differed and we have already stored the null bitmap.

        NOTE(review): new_field_lengths is not advanced on this path (nor on
        the 'old is empty' path below). If the new value is a non-empty
        CHAR/VARCHAR that has an entry in new_row->field_lengths, later
        columns would read the wrong new length - verify.
      */
      continue;
    }
    if (old_row->empty_bits[column->empty_pos] & column->empty_bit)
    {
      if (new_row->empty_bits[column->empty_pos] & column->empty_bit)
        continue;                               /* Both are empty; skip */

      /* Store null length column */
      field_data= ma_store_length(field_data,
                                  (uint) (column - share->columndef));
      field_data= ma_store_length(field_data, 0);
      continue;
    }
    /*
      Remember if the 'new' value is empty (as in this case we must always
      log the original value)
    */
    new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) ||
                          (new_row->empty_bits[column->empty_pos] &
                           column->empty_bit));

    old_column_pos=      oldrec + column->offset;
    new_column_pos=      newrec + column->offset;
    old_column_length= new_column_length= column->length;

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      break;
    case FIELD_VARCHAR:
      new_column_length--;                      /* Skip length prefix */
      old_column_pos+= column->fill_length;
      new_column_pos+= column->fill_length;
      /* Fall through */
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /*
        At this point new_column_length still holds the declared column
        length (minus one for VARCHAR); it selects between 1 and 2 byte
        entries in the field length arrays. An empty new column has no
        entry, so new_field_lengths is only advanced when it is not empty.
      */
      if (new_column_length <= 255)
      {
        old_column_length= *old_field_lengths++;
        if (!new_column_is_empty)
          new_column_length= *new_field_lengths++;
      }
      else
      {
        old_column_length= uint2korr(old_field_lengths);
        old_field_lengths+= 2;
        if (!new_column_is_empty)
        {
          new_column_length= uint2korr(new_field_lengths);
          new_field_lengths+= 2;
        }
      }
      break;
    }
    case FIELD_BLOB:
    {
      uint size_length= column->length - portable_sizeof_char_ptr;
      /*
        Read the blob length from the record, then replace the position
        with the blob data pointer stored after the length bytes.
      */
      old_column_length= _ma_calc_blob_length(size_length, old_column_pos);
      memcpy_fixed((uchar*) &old_column_pos,
                   oldrec + column->offset + size_length,
                   sizeof(old_column_pos));
      if (!new_column_is_empty)
      {
        new_column_length= _ma_calc_blob_length(size_length, new_column_pos);
        /* sizeof(old_column_pos): both positions are 'const uchar *' */
        memcpy_fixed((uchar*) &new_column_pos,
                     newrec + column->offset + size_length,
                     sizeof(old_column_pos));
      }
      break;
    }
    default:
      DBUG_ASSERT(0);
    }

    /* Log the old value if the column was emptied or its content changed */
    if (new_column_is_empty || new_column_length != old_column_length ||
        memcmp(old_column_pos, new_column_pos, new_column_length))
    {
      field_data= ma_store_length(field_data,
                                  (ulong) (column - share->columndef));
      field_data= ma_store_length(field_data, (ulong) old_column_length);

      log_parts->str=     old_column_pos;
      log_parts->length=  old_column_length;
      row_length+=        old_column_length;
      log_parts++;
    }
  }

  *log_parts_count= (uint) (log_parts - start_log_parts);

  /*
    Store length of field length data before the field/field_lengths.
    The packed length is placed so that it ends exactly where the field
    data starts; the first log part then covers the packed length plus
    the field numbers/lengths.
  */
  field_lengths= (uint) (field_data - start_field_data);
  length_str= start_field_data - ma_calc_length_for_store_length(field_lengths);
  start_log_parts->str= length_str;
  ma_store_length(length_str, field_lengths);
  start_log_parts->length= (size_t) (field_data - start_log_parts->str);
  row_length+= start_log_parts->length;
  DBUG_RETURN(row_length);
}

/***************************************************************************
  In-write hooks called under log's lock when log record is written
***************************************************************************/

/**
   @brief Sets transaction's rec_lsn if needed

   A transaction sometimes writes a REDO even before the page is in the
   pagecache (example: brand new head or tail pages; full pages). So, if
   Checkpoint happens just after the REDO write, it needs to know that the
   REDO phase must start before this REDO. Scanning the pagecache cannot
   tell that as the page is not in the cache. So, transaction sets its rec_lsn
   to the REDO's LSN or somewhere before, and Checkpoint reads the
   transaction's rec_lsn.
+ + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_redo(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info + __attribute__ ((unused)), + LSN *lsn, void *hook_arg + __attribute__ ((unused))) +{ + /* + Users of dummy_transaction_object must keep this TRN clean as it + is used by many threads (like those manipulating non-transactional + tables). It might be dangerous if one user sets rec_lsn or some other + member and it is picked up by another user (like putting this rec_lsn into + a page of a non-transactional table); it's safer if all members stay 0. So + non-transactional log records (REPAIR, CREATE, RENAME, DROP) should not + call this hook; we trust them but verify ;) + */ + DBUG_ASSERT(trn->trid != 0); + /* + If the hook stays so simple, it would be faster to pass + !trn->rec_lsn ? trn->rec_lsn : some_dummy_lsn + to translog_write_record(), like Monty did in his original code, and not + have a hook. For now we keep it like this. + */ + if (trn->rec_lsn == 0) + trn->rec_lsn= *lsn; + return 0; +} + + +/** + @brief Sets transaction's undo_lsn, first_undo_lsn if needed + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info + __attribute__ ((unused)), + LSN *lsn, void *hook_arg + __attribute__ ((unused))) +{ + DBUG_ASSERT(trn->trid != 0); + trn->undo_lsn= *lsn; + if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0)) + trn->first_undo_lsn= + trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn); + return 0; + /* + when we implement purging, we will specialize this hook: UNDO_PURGE + records will additionally set trn->undo_purge_lsn + */ +} + + +/** + @brief Sets the table's records count and checksum and others to 0, then + calls the generic REDO hook. 
+ + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_redo_delete_all(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info + __attribute__ ((unused)), + LSN *lsn, void *hook_arg) +{ + _ma_reset_status(tbl_info); + return write_hook_for_redo(type, trn, tbl_info, lsn, hook_arg); +} + + +/** + @brief Updates "records" and "checksum" and calls the generic UNDO hook + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo_row_insert(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + MARIA_SHARE *share= tbl_info->s; + share->state.state.records++; + share->state.state.checksum+= *(ha_checksum *)hook_arg; + return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); +} + + +/** + @brief Upates "records" and calls the generic UNDO hook + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo_row_delete(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + MARIA_SHARE *share= tbl_info->s; + share->state.state.records--; + share->state.state.checksum+= *(ha_checksum *)hook_arg; + return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); +} + + +/** + @brief Upates "records" and "checksum" and calls the generic UNDO hook + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo_row_update(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + MARIA_SHARE *share= tbl_info->s; + share->state.state.checksum+= *(ha_checksum *)hook_arg; + return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); +} + + +my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + /* + We are going to call maria_delete_all_rows(), but 
without logging and + syncing, as an optimization (if we crash before commit, the UNDO will + empty; if we crash after commit, we have flushed and forced the files). + Status still needs to be reset under log mutex, in case of a concurrent + checkpoint. + */ + _ma_reset_status(tbl_info); + return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); +} + + +/** + @brief Updates table's lsn_of_file_id. + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_file_id(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn + __attribute__ ((unused)), + MARIA_HA *tbl_info, + LSN *lsn, + void *hook_arg + __attribute__ ((unused))) +{ + DBUG_ASSERT(cmp_translog_addr(tbl_info->s->lsn_of_file_id, *lsn) < 0); + tbl_info->s->lsn_of_file_id= *lsn; + return 0; +} + + +/** + Updates transaction's rec_lsn when committing. + + A transaction writes its commit record before being committed in trnman, so + if Checkpoint happens just between the COMMIT record log write and the + commit in trnman, it will record that transaction is not committed. Assume + the transaction (trn1) did an INSERT; after the checkpoint, a second + transaction (trn2) does a DELETE of what trn1 has inserted. Then crash, + Checkpoint record says that trn1 was not committed, and REDO phase starts + from Checkpoint record's LSN. So it will not find the COMMIT record of + trn1, will want to roll back trn1, which will fail because the row/key + which it wants to delete does not exist anymore. + To avoid this, Checkpoint needs to know that the REDO phase must start + before this COMMIT record, so transaction sets its rec_lsn to the COMMIT's + record LSN, and as Checkpoint reads the transaction's rec_lsn, Checkpoint + will know. + + @note so after commit trn->rec_lsn is a "commit LSN", which could be of + use later. 
@return Operation status, always 0 (success)
*/

my_bool write_hook_for_commit(enum translog_record_type type
                              __attribute__ ((unused)),
                              TRN *trn,
                              MARIA_HA *tbl_info
                              __attribute__ ((unused)),
                              LSN *lsn,
                              void *hook_arg
                              __attribute__ ((unused)))
{
  /* Remember the COMMIT record's LSN in the transaction */
  trn->rec_lsn= *lsn;
  return 0;
}


/***************************************************************************
  Applying of REDO log records
***************************************************************************/

/*
  Apply changes to head and tail pages

  SYNOPSIS
    _ma_apply_redo_insert_row_head_or_tail()
    info                Maria handler
    lsn                 LSN to put on page
    page_type           HEAD_PAGE or TAIL_PAGE
    new_page            True if this is first entry on page
    header              Header (without FILEID)
    data                Data to be put on page
    data_length         Length of data

  NOTE
    Handles LOGREC_REDO_INSERT_ROW_HEAD, LOGREC_REDO_INSERT_ROW_TAIL
    LOGREC_REDO_NEW_ROW_HEAD and LOGREC_REDO_NEW_ROW_TAIL

    The page number and row number are decoded from 'header'.
    If the page lies beyond data_file_length a fresh page is built in
    info->keyread_buff and written to the page cache; otherwise the page is
    read with a write lock and, unless its LSN shows the record was already
    applied, modified in place. On success the page is left pinned and is
    registered in info->pinned_pages.

    On error the file is marked as crashed and my_errno is set to the
    returned error.

  RETURN
    0   ok
    #   Error number
*/

uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
                                            uint page_type,
                                            my_bool new_page,
                                            const uchar *header,
                                            const uchar *data,
                                            size_t data_length)
{
  MARIA_SHARE *share= info->s;
  pgcache_page_no_t page;
  uint rownr, empty_space;
  uint block_size= share->block_size;
  uint rec_offset;
  uchar *buff, *dir;
  uint result;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock unlock_method;
  enum pagecache_page_pin unpin_method;
  my_off_t end_of_page;
  uint error;
  DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail");

  /* Header layout: page number followed by the directory position */
  page= page_korr(header);
  rownr= dirpos_korr(header + PAGE_STORE_SIZE);

  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u data_length: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr, (uint) data_length));

  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  end_of_page= (page + 1) * share->block_size;
  if (end_of_page > share->state.state.data_file_length)
  {
    DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
                        (ulong) share->state.state.data_file_length,
                        (ulong) end_of_page));
    /*
      New page at end of file. Note that the test above is also positive if
      data_file_length is not a multiple of block_size (system crashed while
      writing the last page): in this case we just extend the last page and
      fill it entirely with zeroes, then the REDO will put correct data on
      it.
    */
    unlock_method= PAGECACHE_LOCK_WRITE;
    unpin_method= PAGECACHE_PIN;

    /* Only the first row of a new page may land beyond the end of file */
    DBUG_ASSERT(rownr == 0 && new_page);
    if (rownr != 0 || !new_page)
      goto crashed_file;

    buff= info->keyread_buff;
    info->keyread_buff_used= 1;
    make_empty_page(info, buff, page_type, 1);
    empty_space= (block_size - PAGE_OVERHEAD_SIZE);
    rec_offset= PAGE_HEADER_SIZE;
    dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
  }
  else
  {
    unlock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    unpin_method= PAGECACHE_PIN_LEFT_PINNED;

    /* Clear MY_WME around the read; the original flags are restored below */
    share->pagecache->readwrite_flags&= ~MY_WME;
    buff= pagecache_read(share->pagecache, &info->dfile,
                         page, 0, 0,
                         PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
                         &page_link.link);
    share->pagecache->readwrite_flags= share->pagecache->org_readwrite_flags;
    if (!buff)
    {
      /* Skip errors when reading outside of file and uninitialized pages */
      if (!new_page || (my_errno != HA_ERR_FILE_TOO_SHORT &&
                        my_errno != HA_ERR_WRONG_CRC))
      {
        DBUG_PRINT("error", ("Error %d when reading page", (int) my_errno));
        goto err;
      }
      /* Create new page */
      buff= pagecache_block_link_to_buffer(page_link.link);
      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
    }
    else if (lsn_korr(buff) >= lsn) /* Test if already applied */
    {
      /* Fix bitmap, just in case */
      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
      if (!enough_free_entries_on_page(share, buff))
        empty_space= 0; /* Page is full */

      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
        goto err;
      pagecache_unlock_by_link(share->pagecache, page_link.link,
                               PAGECACHE_LOCK_WRITE_UNLOCK,
                               PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                               LSN_IMPOSSIBLE, 0, FALSE);
      DBUG_RETURN(0);
    }

    if (((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type))
    {
      /*
        This is a page that has been freed before and now should be
        changed to new type.
      */
      if (!new_page)
      {
        DBUG_PRINT("error",
                   ("Found page of wrong type: %u, should have been %u",
                    (uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK),
                    page_type));
        goto crashed_file;
      }
      make_empty_page(info, buff, page_type, 0);
      empty_space= block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE;
      (void) extend_directory(page_type == HEAD_PAGE ? info: 0, buff,
                              block_size, 0, rownr, &empty_space);
      rec_offset= PAGE_HEADER_SIZE;
      dir= dir_entry_pos(buff, block_size, rownr);
      empty_space+= uint2korr(dir+2);
    }
    else
    {
      uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
      uint length;

      DBUG_ASSERT(!new_page);
      dir= dir_entry_pos(buff, block_size, rownr);
      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);

      if (max_entry <= rownr)
      {
        /* Add directory entry first in directory and data last on page */
        if (extend_directory(page_type == HEAD_PAGE ? info : 0, buff,
                             block_size, max_entry, rownr, &empty_space))
          goto crashed_file;
      }
      if (extend_area_on_page(page_type == HEAD_PAGE ? info : 0, buff,
                              dir, rownr, block_size,
                              (uint) data_length, &empty_space,
                              &rec_offset, &length))
        goto crashed_file;
    }
  }
  /* Copy data */
  int2store(dir+2, data_length);
  memcpy(buff + rec_offset, data, data_length);
  empty_space-= (uint) data_length;
  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);

  /*
    If page was not read before, write it but keep it pinned.
    We don't update its LSN now. When we have processed all REDOs for this
    page in the current REDO's group, we will stamp page with UNDO's LSN
    (if we stamped it now, a next REDO, in this group, for this page, would
    be skipped) and unpin then.
  */
  result= 0;
  if (unlock_method == PAGECACHE_LOCK_WRITE &&
      pagecache_write(share->pagecache,
                      &info->dfile, page, 0,
                      buff, PAGECACHE_PLAIN_PAGE,
                      unlock_method, unpin_method,
                      PAGECACHE_WRITE_DELAY, &page_link.link,
                      LSN_IMPOSSIBLE))
    result= my_errno;

  /* Fix bitmap */
  if (!enough_free_entries_on_page(share, buff))
    empty_space= 0; /* Page is full */
  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
    goto err;

  /* Register the page so that it is unlocked when pinned pages are released */
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= 1;
  push_dynamic(&info->pinned_pages, (void*) &page_link);

  /*
    Data page and bitmap page are in place, we can update data_file_length in
    case we extended the file. We could not do it earlier: bitmap code tests
    data_file_length to know if it has to create a new page or not.
  */
  set_if_bigger(share->state.state.data_file_length, end_of_page);
  DBUG_RETURN(result);

crashed_file:
  my_errno= HA_ERR_WRONG_IN_RECORD;
err:
  /* Save my_errno: it is restored on return after the calls below */
  error= my_errno;
  /* Only a page obtained through pagecache_read() holds a lock to release */
  if (unlock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED)
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(0); /* catch recovery errors early */
  DBUG_RETURN((my_errno= error));
}


/*
  Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL

  SYNOPSIS
    _ma_apply_redo_purge_row_head_or_tail()
    info                Maria handler
    lsn                 LSN to put on page
    page_type           HEAD_PAGE or TAIL_PAGE
    header              Header (without FILEID)

  NOTES
    This function is very similar to delete_head_or_tail()

    The page is read with a write lock. If its LSN is >= lsn the record is
    treated as already applied: only the bitmap is refreshed (and only when
    the page still has the expected type) and the page is unlocked.
    Otherwise the directory entry is deleted, the page is registered in
    info->pinned_pages and the bitmap is updated.

  RETURN
    0   ok
    #   Error number
*/

uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
                                           uint page_type,
                                           const uchar *header)
{
  MARIA_SHARE *share= info->s;
  pgcache_page_no_t page;
  uint rownr, empty_space;
  uint block_size= share->block_size;
  uchar *buff;
  int result;
  uint error;
  MARIA_PINNED_PAGE page_link;
  DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail");

  /* Header layout: page number followed by the directory position */
  page= page_korr(header);
  rownr= dirpos_korr(header+PAGE_STORE_SIZE);
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  if (!(buff= pagecache_read(share->pagecache, &info->dfile,
                             page, 0, 0,
                             PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
                             &page_link.link)))
    goto err;

  if (lsn_korr(buff) >= lsn)
  {
    /*
      Already applied
      Note that in case the page is not anymore a head or tail page
      a future redo will fix the bitmap.
    */
    if ((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type)
    {
      empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET);
      if (!enough_free_entries_on_page(share, buff))
        empty_space= 0; /* Page is full */
      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE,
                         empty_space))
        goto err;
    }
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
    DBUG_RETURN(0);
  }

  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type);

  /* delete_dir_entry() reports the resulting free space in empty_space */
  if (delete_dir_entry(buff, block_size, rownr, &empty_space) < 0)
  {
    my_errno= HA_ERR_WRONG_IN_RECORD;
    goto err;
  }

  /* Register the page so that it is unlocked when pinned pages are released */
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= 1;
  push_dynamic(&info->pinned_pages, (void*) &page_link);

  result= 0;
  if (!enough_free_entries_on_page(share, buff))
    empty_space= 0; /* Page is full */
  /* This will work even if the page was marked as UNALLOCATED_PAGE */
  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
    result= my_errno;

  DBUG_RETURN(result);

err:
  /* Save my_errno: it is restored on return after the calls below */
  error= my_errno;
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           PAGECACHE_LOCK_WRITE_UNLOCK,
                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 0, FALSE);
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(0);
  DBUG_RETURN((my_errno= error));

}


/**
   @brief Apply LOGREC_REDO_FREE_BLOCKS

   @param  info            Maria handler
   @param  lsn             Unused
   @param  header          Header (without FILEID)

   @note It marks the pages free in the bitmap

   @note The header holds a count of ranges followed, for each range, by a
   start page and a page count; TAIL_BIT and START_EXTENT_BIT are masked
   out of the page count.

   @return Operation status
     @retval 0      OK
     @retval !=0    Error (value returned by
                    _ma_bitmap_reset_full_page_bits()); the file has been
                    marked as crashed
*/

uint _ma_apply_redo_free_blocks(MARIA_HA *info,
                                LSN lsn __attribute__((unused)),
                                const uchar *header)
{
  MARIA_SHARE *share= info->s;
  uint ranges;
  DBUG_ENTER("_ma_apply_redo_free_blocks");

  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  ranges= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  DBUG_ASSERT(ranges > 0);

  while (ranges--)
  {
    my_bool res;
    uint page_range;
    pgcache_page_no_t page, start_page;

    start_page= page= page_korr(header);
    header+= PAGE_STORE_SIZE;
    /* Page range may have this bit set to indicate a tail page */
    page_range= pagerange_korr(header) & ~(TAIL_BIT | START_EXTENT_BIT);
    DBUG_ASSERT(page_range > 0);

    header+= PAGERANGE_STORE_SIZE;

    DBUG_PRINT("info", ("page: %lu pages: %u", (long) page, page_range));

    /** @todo leave bitmap lock to the bitmap code... */
    pthread_mutex_lock(&share->bitmap.bitmap_lock);
    res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, start_page,
                                         page_range);
    pthread_mutex_unlock(&share->bitmap.bitmap_lock);
    if (res)
    {
      _ma_mark_file_crashed(share);
      DBUG_ASSERT(0);
      DBUG_RETURN(res);
    }
  }
  DBUG_RETURN(0);
}


/**
   @brief Apply LOGREC_REDO_FREE_HEAD_OR_TAIL

   @param  info            Maria handler
   @param  header          Header (without FILEID)

   @note It marks the page free in the bitmap, and sets the directory's count
   to 0.
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn, + const uchar *header) +{ + MARIA_SHARE *share= info->s; + uchar *buff; + pgcache_page_no_t page; + MARIA_PINNED_PAGE page_link; + my_bool res; + DBUG_ENTER("_ma_apply_redo_free_head_or_tail"); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + page= page_korr(header); + + if (!(buff= pagecache_read(share->pagecache, + &info->dfile, + page, 0, 0, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, &page_link.link))) + { + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + } + else + { + buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE; +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY + { + uint number_of_records= (uint) buff[DIR_COUNT_OFFSET]; + uchar *dir= dir_entry_pos(buff, share->block_size, + number_of_records-1); + buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST; + bzero(dir, number_of_records * DIR_ENTRY_SIZE); + } +#endif + + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + } + /** @todo leave bitmap lock to the bitmap code... 
*/ + pthread_mutex_lock(&share->bitmap.bitmap_lock); + res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, 1); + pthread_mutex_unlock(&share->bitmap.bitmap_lock); + if (res) + goto err; + DBUG_RETURN(0); + +err: + _ma_mark_file_crashed(share); + DBUG_ASSERT(0); + DBUG_RETURN(1); +} + + +/** + @brief Apply LOGREC_REDO_INSERT_ROW_BLOBS + + @param info Maria handler + @parma lsn LSN to put on pages + @param header Header (with FILEID) + @param redo_lsn REDO record's LSN + @param[out] number_of_blobs Number of blobs found in log record + @param[out] number_of_ranges Number of ranges found + @param[out] first_page First page touched + @param[out] last_page Last page touched + + @note Write full pages (full head & blob pages) + + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info, + LSN lsn, const uchar *header, + LSN redo_lsn, + uint * const number_of_blobs, + uint * const number_of_ranges, + pgcache_page_no_t * const first_page, + pgcache_page_no_t * const last_page) +{ + MARIA_SHARE *share= info->s; + const uchar *data; + uint data_size= FULL_PAGE_SIZE(share->block_size); + uint blob_count, ranges; + uint16 sid; + pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0; + DBUG_ENTER("_ma_apply_redo_insert_row_blobs"); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + sid= fileid_korr(header); + header+= FILEID_STORE_SIZE; + *number_of_ranges= ranges= pagerange_korr(header); + header+= PAGERANGE_STORE_SIZE; + *number_of_blobs= blob_count= pagerange_korr(header); + header+= PAGERANGE_STORE_SIZE; + DBUG_ASSERT(ranges >= blob_count); + + data= (header + ranges * ROW_EXTENT_SIZE + + blob_count * (SUB_RANGE_SIZE + BLOCK_FILLER_SIZE)); + + while (blob_count--) + { + uint sub_ranges, empty_space; + + sub_ranges= uint2korr(header); + header+= SUB_RANGE_SIZE; + empty_space= uint2korr(header); + header+= BLOCK_FILLER_SIZE; + DBUG_ASSERT(sub_ranges <= 
ranges && empty_space < data_size); + ranges-= sub_ranges; + + while (sub_ranges--) + { + uint i; + uint res; + uint page_range; + pgcache_page_no_t page, start_page; + uchar *buff; + + start_page= page= page_korr(header); + header+= PAGE_STORE_SIZE; + page_range= pagerange_korr(header); + header+= PAGERANGE_STORE_SIZE; + + for (i= page_range; i-- > 0 ; page++) + { + MARIA_PINNED_PAGE page_link; + enum pagecache_page_lock unlock_method; + enum pagecache_page_pin unpin_method; + uint length; + + set_if_smaller(first_page2, page); + set_if_bigger(last_page2, page); + if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE)) + continue; + + if (((page + 1) * share->block_size) > + share->state.state.data_file_length) + { + /* New page or half written page at end of file */ + DBUG_PRINT("info", ("Enlarging data file from %lu to %lu", + (ulong) share->state.state.data_file_length, + (ulong) ((page + 1 ) * share->block_size))); + share->state.state.data_file_length= (page + 1) * share->block_size; + buff= info->keyread_buff; + info->keyread_buff_used= 1; + make_empty_page(info, buff, BLOB_PAGE, 0); + unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED; + unpin_method= PAGECACHE_PIN_LEFT_UNPINNED; + } + else + { + share->pagecache->readwrite_flags&= ~MY_WME; + buff= pagecache_read(share->pagecache, + &info->dfile, + page, 0, 0, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, &page_link.link); + share->pagecache->readwrite_flags= share->pagecache-> + org_readwrite_flags; + if (!buff) + { + if (my_errno != HA_ERR_FILE_TOO_SHORT && + my_errno != HA_ERR_WRONG_CRC) + { + /* If not read outside of file */ + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + goto err; + } + /* + Physical file was too short, create new page. 
It can be that + recovery started with a file with N pages, wrote page N+2 into + pagecache (increased data_file_length but not physical file + length), now reads page N+1: the read fails. + */ + buff= pagecache_block_link_to_buffer(page_link.link); + make_empty_page(info, buff, BLOB_PAGE, 0); + } + else + { +#ifndef DBUG_OFF + uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK); +#endif + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + DBUG_PRINT("info", ("already applied %llu >= %llu", + lsn_korr(buff), lsn)); + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + continue; + } + DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) || + (found_page_type == (uchar) UNALLOCATED_PAGE)); + } + unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK; + unpin_method= PAGECACHE_UNPIN; + } + + /* + Blob pages are never updated twice in same redo-undo chain, so + it's safe to update lsn for them here + */ + lsn_store(buff, lsn); + buff[PAGE_TYPE_OFFSET]= BLOB_PAGE; + + length= data_size; + if (i == 0 && sub_ranges == 0) + { + /* + Last page may be only partly filled. We zero the rest, like + write_full_pages() does. + */ + length-= empty_space; + bzero(buff + share->block_size - PAGE_SUFFIX_SIZE - empty_space, + empty_space); + } + memcpy(buff+ PAGE_TYPE_OFFSET + 1, data, length); + data+= length; + if (pagecache_write(share->pagecache, + &info->dfile, page, 0, + buff, PAGECACHE_PLAIN_PAGE, + unlock_method, unpin_method, + PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE)) + goto err; + } + /** @todo leave bitmap lock to the bitmap code... 
*/ + pthread_mutex_lock(&share->bitmap.bitmap_lock); + res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, start_page, + page_range); + pthread_mutex_unlock(&share->bitmap.bitmap_lock); + if (res) + goto err; + } + } + *first_page= first_page2; + *last_page= last_page2; + DBUG_RETURN(0); + +err: + _ma_mark_file_crashed(share); + DBUG_ASSERT(0); + DBUG_RETURN(1); +} + + +/**************************************************************************** + Applying of UNDO entries +****************************************************************************/ + +/** Execute undo of a row insert (delete the inserted row) */ + +my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn, + const uchar *header) +{ + pgcache_page_no_t page; + uint rownr; + uchar *buff; + my_bool res; + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + ha_checksum checksum; + LSN lsn; + DBUG_ENTER("_ma_apply_undo_row_insert"); + + page= page_korr(header); + header+= PAGE_STORE_SIZE; + rownr= dirpos_korr(header); + header+= DIRPOS_STORE_SIZE; + DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u", + (ulong) ma_recordpos(page, rownr), + (ulong) page, rownr)); + + buff= pagecache_read(share->pagecache, + &info->dfile, page, 0, + 0, share->page_type, + PAGECACHE_LOCK_WRITE, + &page_link.link); + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= buff != 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + if (!buff) + goto err; + + if (read_row_extent_info(info, buff, rownr)) + goto err; + + _ma_bitmap_flushable(info, 1); + if (delete_head_or_tail(info, page, rownr, 1, 1) || + delete_tails(info, info->cur_row.tail_positions)) + goto err; + + if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row)) + goto err; + + checksum= 0; + if (share->calc_checksum) + checksum= (ha_checksum) 0 - ha_checksum_korr(header); + info->last_auto_increment= ~ (ulonglong) 0; + if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_ROW_INSERT, + share->calc_checksum 
!= 0, checksum, &lsn, (void*) 0)) + goto err; + + res= 0; +end: + if (info->non_flushable_state) + _ma_bitmap_flushable(info, -1); + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); + +err: + res= 1; + _ma_mark_file_crashed(share); + goto end; +} + + +/** Execute undo of a row delete (insert the row back where it was) */ + +my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn, + const uchar *header, size_t header_length + __attribute__((unused))) +{ + MARIA_SHARE *share= info->s; + MARIA_ROW row; + MARIA_COLUMNDEF *column, *end_column; + MARIA_BITMAP_BLOCKS *blocks; + struct st_row_pos_info row_pos; + uchar *record; + const uchar *null_bits, *field_length_data, *extent_info; + pgcache_page_no_t page; + ulong *blob_lengths; + uint *null_field_lengths, extent_count, rownr, length_on_head_page; + DBUG_ENTER("_ma_apply_undo_row_delete"); + + /* + Use cur row as a base; We need to make a copy as we will change + some buffers to point directly to 'header' + */ + memcpy(&row, &info->cur_row, sizeof(row)); + + page= page_korr(header); + header+= PAGE_STORE_SIZE; + rownr= dirpos_korr(header); + header+= DIRPOS_STORE_SIZE; + length_on_head_page= uint2korr(header); + header+= 2; + extent_count= pagerange_korr(header); + header+= PAGERANGE_STORE_SIZE; + DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u", + (ulong) ma_recordpos(page, rownr), + (ulong) page, rownr)); + + if (share->calc_checksum) + { + /* + We extract the checksum delta here, saving a recomputation in + allocate_and_write_block_record(). It's only an optimization. 
+ */ + row.checksum= (ha_checksum) 0 - ha_checksum_korr(header); + header+= HA_CHECKSUM_STORE_SIZE; + } + extent_info= header; + header+= extent_count * ROW_EXTENT_SIZE; + + null_field_lengths= row.null_field_lengths; + blob_lengths= row.blob_lengths; + + /* + Fill in info->cur_row with information about the row, like in + calc_record_size(), to be used by write_block_record() + */ + + row.normal_length= row.char_length= row.varchar_length= + row.blob_length= row.extents_count= row.field_lengths_length= 0; + + null_bits= header; + header+= share->base.null_bytes; + /* This will not be changed */ + row.empty_bits= (uchar*) header; + header+= share->base.pack_bytes; + if (share->base.max_field_lengths) + { + row.field_lengths_length= uint2korr(header); + row.field_lengths= (uchar*) header + 2 ; + header+= 2 + row.field_lengths_length; + } + if (share->base.blobs) + row.blob_length= ma_get_length(&header); + + /* We need to build up a record (without blobs) in rec_buff */ + if (!(record= my_malloc(share->base.reclength, MYF(MY_WME)))) + DBUG_RETURN(1); + + memcpy(record, null_bits, share->base.null_bytes); + + /* Copy field information from header to record */ + + /* Handle constant length fields that are always present */ + for (column= share->columndef, + end_column= column+ share->base.fixed_not_null_fields; + column < end_column; + column++) + { + memcpy(record + column->offset, header, column->length); + header+= column->length; + } + + /* Handle NULL fields and CHAR/VARCHAR fields */ + field_length_data= row.field_lengths; + for (end_column= share->columndef + share->base.fields; + column < end_column; + column++, null_field_lengths++) + { + if ((record[column->null_pos] & column->null_bit) || + row.empty_bits[column->empty_pos] & column->empty_bit) + { + if (column->type != FIELD_BLOB) + *null_field_lengths= 0; + else + *blob_lengths++= 0; + if (share->calc_checksum) + bfill(record + column->offset, column->fill_length, + column->type == FIELD_SKIP_ENDSPACE ? 
' ' : 0); + continue; + } + switch (column->type) { + case FIELD_CHECK: + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_ZERO: + case FIELD_SKIP_PRESPACE: /* Not packed */ + case FIELD_SKIP_ZERO: /* Fixed length field */ + row.normal_length+= column->length; + *null_field_lengths= column->length; + memcpy(record + column->offset, header, column->length); + header+= column->length; + break; + case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + uint length; + if (column->length <= 255) + length= (uint) *field_length_data++; + else + { + length= uint2korr(field_length_data); + field_length_data+= 2; + } + row.char_length+= length; + *null_field_lengths= length; + memcpy(record + column->offset, header, length); + if (share->calc_checksum) + bfill(record + column->offset + length, (column->length - length), + ' '); + header+= length; + break; + } + case FIELD_VARCHAR: + { + uint length; + uchar *field_pos= record + column->offset; + + /* 256 is correct as this includes the length uchar */ + if (column->fill_length == 1) + { + field_pos[0]= *field_length_data; + length= (uint) *field_length_data; + } + else + { + field_pos[0]= field_length_data[0]; + field_pos[1]= field_length_data[1]; + length= uint2korr(field_length_data); + } + field_length_data+= column->fill_length; + field_pos+= column->fill_length; + row.varchar_length+= length; + *null_field_lengths= length; + memcpy(field_pos, header, length); + header+= length; + break; + } + case FIELD_BLOB: + { + /* Copy length of blob and pointer to blob data to record */ + uchar *field_pos= record + column->offset; + uint size_length= column->length - portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(size_length, field_length_data); + + memcpy(field_pos, field_length_data, size_length); + field_length_data+= size_length; + memcpy(field_pos + size_length, &header, sizeof(&header)); + header+= blob_length; + *blob_lengths++= blob_length; + break; + } + default: + DBUG_ASSERT(0); + } + } + row.head_length= 
(info->row_base_length + + share->base.fixed_not_null_fields_length + + row.field_lengths_length + + size_to_store_key_length(row.field_lengths_length) + + row.normal_length + + row.char_length + row.varchar_length); + row.total_length= (row.head_length + row.blob_length); + if (row.total_length < share->base.min_block_length) + row.total_length= share->base.min_block_length; + + /* + Row is now generated. Now we need to insert record on the original + pages with original size on each page. + */ + + _ma_bitmap_flushable(info, 1); + /* Change extent information to be usable by write_block_record() */ + blocks= &row.insert_blocks; + if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info)) + goto err; + blocks->block->org_bitmap_value= _ma_bitmap_get_page_bits(info, + &share->bitmap, + page); + blocks->block->used|= BLOCKUSED_USE_ORG_BITMAP; + + /* Read head page and allocate data for rowid */ + if (get_rowpos_in_head_or_tail_page(info, blocks->block, + info->buff, + length_on_head_page, + HEAD_PAGE, PAGECACHE_LOCK_WRITE, + rownr, &row_pos)) + goto err; + + if (share->calc_checksum) + { + DBUG_ASSERT(row.checksum == (share->calc_checksum)(info, record)); + } + /* Store same amount of data on head page as on original page */ + row_pos.length= (length_on_head_page - + (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE); + set_if_bigger(row_pos.length, share->base.min_block_length); + if (write_block_record(info, (uchar*) 0, record, &row, + blocks, blocks->block->org_bitmap_value != 0, + &row_pos, undo_lsn, 0)) + goto err; + + my_free(record, MYF(0)); + DBUG_RETURN(0); + +err: + _ma_mark_file_crashed(share); + if (info->non_flushable_state) + _ma_bitmap_flushable(info, -1); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); + my_free(record, MYF(0)); + DBUG_RETURN(1); +} + + +/** + Execute undo of a row update + + @fn _ma_apply_undo_row_update() + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool 
_ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn, + const uchar *header, + size_t header_length + __attribute__((unused))) +{ + MARIA_SHARE *share= info->s; + MARIA_RECORD_POS record_pos; + const uchar *field_length_data, *field_length_data_end, *extent_info; + uchar *current_record, *orig_record; + pgcache_page_no_t page; + ha_checksum checksum_delta; + uint rownr, field_length_header, extent_count, length_on_head_page; + int error; + DBUG_ENTER("_ma_apply_undo_row_update"); + LINT_INIT(checksum_delta); + + page= page_korr(header); + header+= PAGE_STORE_SIZE; + rownr= dirpos_korr(header); + header+= DIRPOS_STORE_SIZE; + + record_pos= ma_recordpos(page, rownr); + DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u", + (ulong) record_pos, (ulong) page, rownr)); + + if (share->calc_checksum) + { + checksum_delta= ha_checksum_korr(header); + header+= HA_CHECKSUM_STORE_SIZE; + } + length_on_head_page= uint2korr(header); + set_if_bigger(length_on_head_page, share->base.min_block_length); + header+= 2; + extent_count= pagerange_korr(header); + header+= PAGERANGE_STORE_SIZE; + extent_info= header; + header+= extent_count * ROW_EXTENT_SIZE; + + /* + Set header to point to old field values, generated by + fill_update_undo_parts() + */ + field_length_header= ma_get_length(&header); + field_length_data= (uchar*) header; + header+= field_length_header; + field_length_data_end= header; + + /* Allocate buffer for current row & original row */ + if (!(current_record= my_malloc(share->base.reclength * 2, MYF(MY_WME)))) + DBUG_RETURN(1); + orig_record= current_record+ share->base.reclength; + + /* Read current record */ + if (_ma_read_block_record(info, current_record, record_pos)) + goto err; + + if (*field_length_data == 255) + { + /* Bitmap changed */ + field_length_data++; + memcpy(orig_record, header, share->base.null_bytes); + header+= share->base.null_bytes; + } + else + memcpy(orig_record, current_record, share->base.null_bytes); + 
bitmap_clear_all(&info->changed_fields); + + while (field_length_data < field_length_data_end) + { + uint field_nr= ma_get_length(&field_length_data), field_length; + MARIA_COLUMNDEF *column= share->columndef + field_nr; + uchar *orig_field_pos= orig_record + column->offset; + + bitmap_set_bit(&info->changed_fields, field_nr); + if (field_nr >= share->base.fixed_not_null_fields) + { + if (!(field_length= ma_get_length(&field_length_data))) + { + /* Null field or empty field */ + bfill(orig_field_pos, column->fill_length, + column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0); + continue; + } + } + else + field_length= column->length; + + switch (column->type) { + case FIELD_CHECK: + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_ZERO: + case FIELD_SKIP_PRESPACE: /* Not packed */ + memcpy(orig_field_pos, header, column->length); + header+= column->length; + break; + case FIELD_SKIP_ZERO: /* Number */ + case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + uint diff; + memcpy(orig_field_pos, header, field_length); + if ((diff= (column->length - field_length))) + bfill(orig_field_pos + column->length - diff, diff, + column->type == FIELD_SKIP_ENDSPACE ? 
' ' : 0); + header+= field_length; + } + break; + case FIELD_VARCHAR: + if (column->length <= 256) + { + *orig_field_pos++= (uchar) field_length; + } + else + { + int2store(orig_field_pos, field_length); + orig_field_pos+= 2; + } + memcpy(orig_field_pos, header, field_length); + header+= field_length; + break; + case FIELD_BLOB: + { + uint size_length= column->length - portable_sizeof_char_ptr; + _ma_store_blob_length(orig_field_pos, size_length, field_length); + memcpy_fixed(orig_field_pos + size_length, &header, sizeof(header)); + header+= field_length; + break; + } + default: + DBUG_ASSERT(0); + } + } + copy_not_changed_fields(info, &info->changed_fields, + orig_record, current_record); + + if (share->calc_checksum) + { + info->new_row.checksum= checksum_delta + + (info->cur_row.checksum= (*share->calc_checksum)(info, orig_record)); + /* verify that record's content is sane */ + DBUG_ASSERT(info->new_row.checksum == + (*share->calc_checksum)(info, current_record)); + } + + info->last_auto_increment= ~ (ulonglong) 0; + /* Now records are up to date, execute the update to original values */ + if (_ma_update_at_original_place(info, page, rownr, length_on_head_page, + extent_count, extent_info, + current_record, orig_record, undo_lsn)) + goto err; + + error= 0; +end: + my_free(current_record, MYF(0)); + DBUG_RETURN(error); + +err: + error= 1; + _ma_mark_file_crashed(share); + goto end; +} + + +/** + Execute undo of a bulk insert which used repair + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn) +{ + my_bool error; + LSN lsn; + DBUG_ENTER("_ma_apply_undo_bulk_insert"); + /* + We delete all rows, re-enable indices as bulk insert had disabled + non-unique ones. 
+ */ + error= (maria_delete_all_rows(info) || + maria_enable_indexes(info) || + /* we enabled indices so need '2' below */ + _ma_state_info_write(info->s, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO | + MA_STATE_INFO_WRITE_LOCK) || + _ma_write_clr(info, undo_lsn, LOGREC_UNDO_BULK_INSERT, + FALSE, 0, &lsn, NULL)); + DBUG_RETURN(error); +} + + +/** + @brief Get the TRANSLOG_ADDRESS to flush up to + + @param page Page's content + @param page_no Page's number (<offset>/<page length>) + @param data_ptr Callback data pointer (pointer to MARIA_SHARE) + + @note + Usable for data (non-bitmap) and index pages + + @retval LSN to flush up to +*/ + +TRANSLOG_ADDRESS +maria_page_get_lsn(uchar *page, + pgcache_page_no_t page_no __attribute__((unused)), + uchar* data_ptr __attribute__((unused))) +{ +#ifndef DBUG_OFF + const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr; + DBUG_ASSERT(share->page_type == PAGECACHE_LSN_PAGE && + share->now_transactional); +#endif + return lsn_korr(page); +} + + +/** + @brief Enable reading of all rows, ignoring versioning + + @note + This is mainly useful in single user applications, like maria_pack, + where we want to be able to read all rows without having to read the + transaction id from the control file +*/ + +void maria_ignore_trids(MARIA_HA *info) +{ + if (info->s->base.born_transactional) + { + if (!info->trn) + _ma_set_trn_for_table(info, &dummy_transaction_object); + /* Ignore transaction id when row is read */ + info->trn->min_read_from= ~(TrID) 0; + } +} + + +#ifndef DBUG_OFF + +/* The following functions are useful to call from debugger */ + +void _ma_print_block_info(uchar *buff) +{ + LSN lsn= lsn_korr(buff); + + printf("LSN: %lu,0x%lx type: %u dir_entries: %u dir_free: %u empty_space: %u\n", + LSN_IN_PARTS(lsn), + (uint)buff[PAGE_TYPE_OFFSET], + (uint)buff[DIR_COUNT_OFFSET], + (uint)buff[DIR_FREE_OFFSET], + (uint) uint2korr(buff + EMPTY_SPACE_OFFSET)); + printf("Start of directory: %lu\n", + maria_block_size 
- PAGE_SUFFIX_SIZE - + (uint) buff[DIR_COUNT_OFFSET] * DIR_ENTRY_SIZE); + _ma_print_directory(stdout, buff, maria_block_size); +} +#endif diff --git a/storage/maria/ma_blockrec.h b/storage/maria/ma_blockrec.h new file mode 100644 index 00000000000..a5858880dd0 --- /dev/null +++ b/storage/maria/ma_blockrec.h @@ -0,0 +1,290 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Storage of records in block +*/ + +#define LSN_SIZE 7 +#define DIR_COUNT_SIZE 1 /* Stores number of rows on page */ +#define DIR_FREE_SIZE 1 /* Pointer to first free dir entry */ +#define EMPTY_SPACE_SIZE 2 /* Stores empty space on page */ +#define PAGE_TYPE_SIZE 1 +#define PAGE_SUFFIX_SIZE 4 /* Bytes for checksum */ +#define PAGE_HEADER_SIZE (LSN_SIZE + DIR_COUNT_SIZE + DIR_FREE_SIZE +\ + EMPTY_SPACE_SIZE + PAGE_TYPE_SIZE) +#define PAGE_OVERHEAD_SIZE (PAGE_HEADER_SIZE + DIR_ENTRY_SIZE + \ + PAGE_SUFFIX_SIZE) +#define BLOCK_RECORD_POINTER_SIZE 6 + +#define FULL_PAGE_SIZE(block_size) ((block_size) - LSN_SIZE - \ + PAGE_TYPE_SIZE - PAGE_SUFFIX_SIZE) + +#define ROW_EXTENT_PAGE_SIZE 5 +#define ROW_EXTENT_COUNT_SIZE 2 +#define SUB_RANGE_SIZE 2 +#define BLOCK_FILLER_SIZE 2 +#define ROW_EXTENT_SIZE (ROW_EXTENT_PAGE_SIZE + ROW_EXTENT_COUNT_SIZE) +#define TAIL_BIT 0x8000 /* Bit in page_count to signify tail */ +#define START_EXTENT_BIT 0x4000 /* Bit in page_count 
to signify start*/ +/* page_count set by bitmap code for tail pages */ +#define TAIL_PAGE_COUNT_MARKER 0xffff +/* Number of extents reserved in MARIA_BITMAP_BLOCKS to store head part */ +#define ELEMENTS_RESERVED_FOR_MAIN_PART 4 +/* This is just used to prealloc a dynamic array */ +#define AVERAGE_BLOB_SIZE 1024L*1024L +/* Number of pages to store continuous blob parts */ +#define BLOB_SEGMENT_MIN_SIZE 128 + +/* Fields before 'row->null_field_lengths' used by find_where_to_split_row */ +#define EXTRA_LENGTH_FIELDS 3 + +/* Size for the different parts in the row header (and head page) */ +#define FLAG_SIZE 1 +#define VERPTR_SIZE 7 +#define DIR_ENTRY_SIZE 4 +#define FIELD_OFFSET_SIZE 2 /* size of pointers to field starts */ + +/* Minimum header size needed for a new row */ +#define BASE_ROW_HEADER_SIZE FLAG_SIZE +#define TRANS_ROW_EXTRA_HEADER_SIZE TRANSID_SIZE + +#define PAGE_TYPE_MASK 7 +enum en_page_type { UNALLOCATED_PAGE, HEAD_PAGE, TAIL_PAGE, BLOB_PAGE, MAX_PAGE_TYPE }; +#define PAGE_CAN_BE_COMPACTED 128 /* Bit in PAGE_TYPE */ + +#define PAGE_TYPE_OFFSET LSN_SIZE +#define DIR_COUNT_OFFSET (LSN_SIZE+PAGE_TYPE_SIZE) +#define DIR_FREE_OFFSET (DIR_COUNT_OFFSET+DIR_COUNT_SIZE) +#define EMPTY_SPACE_OFFSET (DIR_FREE_OFFSET+DIR_FREE_SIZE) + +/* Bits used for flag uchar (one byte, first in record) */ +#define ROW_FLAG_TRANSID 1 +#define ROW_FLAG_VER_PTR 2 +#define ROW_FLAG_DELETE_TRANSID 4 +#define ROW_FLAG_NULLS_EXTENDED 8 +#define ROW_FLAG_EXTENTS 128 +#define ROW_FLAG_ALL (1+2+4+8+128) + +/******** Variables that affect how data pages are utilized ********/ + +/* Minimum size of tail segment */ +#define MIN_TAIL_SIZE 32 + +/* + Fixed length part of Max possible header size; See row data structure + table in ma_blockrec.c. 
+*/ +#define MAX_FIXED_HEADER_SIZE (FLAG_SIZE + 3 + ROW_EXTENT_SIZE + 3) +#define TRANS_MAX_FIXED_HEADER_SIZE (MAX_FIXED_HEADER_SIZE + \ + TRANSID_SIZE + VERPTR_SIZE + \ + TRANSID_SIZE) + +/* We use 1 uchar in record header to store number of directory entries */ +#define MAX_ROWS_PER_PAGE 255 +#define END_OF_DIR_FREE_LIST ((uchar) 255) + +/* Bits for MARIA_BITMAP_BLOCKS->used */ +/* We stored data on disk in the block */ +#define BLOCKUSED_USED 1 +/* Bitmap on disk is block->org_bitmap_value ; Happens only on update */ +#define BLOCKUSED_USE_ORG_BITMAP 2 +/* We stored tail data on disk for the block */ +#define BLOCKUSED_TAIL 4 + +/******* defines that affect allocation (density) of data *******/ + +/* + If the tail part (from the main block or a blob) would use more than 75 % of + the size of page, store the tail on a full page instead of a shared + tail page. +*/ +#define MAX_TAIL_SIZE(block_size) ((block_size) *3 / 4) + +/* Don't allocate memory for too many row extents on the stack */ +#define ROW_EXTENTS_ON_STACK 32 + +/* Functions to convert MARIA_RECORD_POS to/from page:offset */ + +static inline MARIA_RECORD_POS ma_recordpos(pgcache_page_no_t page, + uint dir_entry) +{ + DBUG_ASSERT(dir_entry <= 255); + DBUG_ASSERT(page > 0); /* page 0 is bitmap, not data page */ + return (MARIA_RECORD_POS) (((ulonglong) page << 8) | dir_entry); +} + +static inline pgcache_page_no_t ma_recordpos_to_page(MARIA_RECORD_POS record_pos) +{ + return (pgcache_page_no_t) (record_pos >> 8); +} + +static inline uint ma_recordpos_to_dir_entry(MARIA_RECORD_POS record_pos) +{ + return (uint) (record_pos & 255); +} + +static inline uchar *dir_entry_pos(uchar *buff, uint block_size, uint pos) +{ + return (buff + block_size - DIR_ENTRY_SIZE * pos - PAGE_SUFFIX_SIZE - + DIR_ENTRY_SIZE); +} + +/* ma_blockrec.c */ +void _ma_init_block_record_data(void); +my_bool _ma_once_init_block_record(MARIA_SHARE *share, File dfile); +my_bool _ma_once_end_block_record(MARIA_SHARE *share); +my_bool 
_ma_init_block_record(MARIA_HA *info); +void _ma_end_block_record(MARIA_HA *info); + +my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec, const uchar *newrec); +my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record); +int _ma_read_block_record(MARIA_HA *info, uchar *record, + MARIA_RECORD_POS record_pos); +int _ma_read_block_record2(MARIA_HA *info, uchar *record, + uchar *data, uchar *end_of_data); +int _ma_scan_block_record(MARIA_HA *info, uchar *record, + MARIA_RECORD_POS, my_bool); +my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos); +my_bool _ma_scan_init_block_record(MARIA_HA *info); +void _ma_scan_end_block_record(MARIA_HA *info); +int _ma_scan_remember_block_record(MARIA_HA *info, + MARIA_RECORD_POS *lastpos); +void _ma_scan_restore_block_record(MARIA_HA *info, + MARIA_RECORD_POS lastpos); + +MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info, + const uchar *record); +my_bool _ma_write_block_record(MARIA_HA *info, const uchar *record); +my_bool _ma_write_abort_block_record(MARIA_HA *info); +my_bool _ma_compare_block_record(register MARIA_HA *info, + register const uchar *record); +void _ma_compact_block_page(uchar *buff, uint block_size, uint rownr, + my_bool extend_block, TrID min_read_from, + uint min_row_length); +my_bool enough_free_entries_on_page(MARIA_SHARE *share, uchar *page_buff); +TRANSLOG_ADDRESS +maria_page_get_lsn(uchar *page, pgcache_page_no_t page_no, uchar* data_ptr); + +/* ma_bitmap.c */ +my_bool _ma_bitmap_init(MARIA_SHARE *share, File file); +my_bool _ma_bitmap_end(MARIA_SHARE *share); +my_bool _ma_bitmap_flush(MARIA_SHARE *share); +my_bool _ma_bitmap_flush_all(MARIA_SHARE *share); +void _ma_bitmap_reset_cache(MARIA_SHARE *share); +my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row, + MARIA_BITMAP_BLOCKS *result_blocks); +my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks); 
+my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents, + uint count); +my_bool _ma_bitmap_set(MARIA_HA *info, pgcache_page_no_t pos, my_bool head, + uint empty_space); +my_bool _ma_bitmap_reset_full_page_bits(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page, + uint page_count); +my_bool _ma_bitmap_set_full_page_bits(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page, uint page_count); +uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size); +my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *new_row, + pgcache_page_no_t page, uint free_size, + MARIA_BITMAP_BLOCKS *result_blocks); +my_bool _ma_check_bitmap_data(MARIA_HA *info, + enum en_page_type page_type, + pgcache_page_no_t page, + uint empty_space, uint *bitmap_pattern); +my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info, + enum en_page_type page_type, + pgcache_page_no_t page, + uint *bitmap_pattern); +uint _ma_bitmap_get_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page); +void _ma_bitmap_delete_all(MARIA_SHARE *share); +int _ma_bitmap_create_first(MARIA_SHARE *share); +void _ma_bitmap_flushable(MARIA_HA *info, int non_flushable_inc); +void _ma_bitmap_lock(MARIA_SHARE *share); +void _ma_bitmap_unlock(MARIA_SHARE *share); +void _ma_bitmap_set_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share); +#ifndef DBUG_OFF +void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data, + pgcache_page_no_t page); +#endif + +uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn, + uint page_type, + my_bool new_page, + const uchar *header, + const uchar *data, + size_t data_length); +uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn, + uint page_type, + const uchar *header); +uint _ma_apply_redo_free_blocks(MARIA_HA *info, LSN lsn, + const uchar *header); +uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn, + const uchar *header); +uint 
_ma_apply_redo_insert_row_blobs(MARIA_HA *info, LSN lsn, + const uchar *header, LSN redo_lsn, + uint * const number_of_blobs, + uint * const number_of_ranges, + pgcache_page_no_t * const first_page, + pgcache_page_no_t * const last_page); +my_bool _ma_apply_redo_bitmap_new_page(MARIA_HA *info, LSN lsn, + const uchar *header); +my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn, + const uchar *header); +my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn, + const uchar *header, size_t length); +my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn, + const uchar *header, size_t length); +my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn); + +my_bool write_hook_for_redo(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + void *hook_arg); +my_bool write_hook_for_undo(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + void *hook_arg); +my_bool write_hook_for_redo_delete_all(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +my_bool write_hook_for_undo_row_insert(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +my_bool write_hook_for_undo_row_delete(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +my_bool write_hook_for_undo_row_update(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +my_bool write_hook_for_file_id(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + void *hook_arg); +my_bool write_hook_for_commit(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + void *hook_arg); +void _ma_block_get_status(void *param, my_bool concurrent_insert); +my_bool _ma_block_start_trans(void* param); +my_bool 
_ma_block_start_trans_no_versioning(void *param); +void _ma_block_update_status(void *param); +void _ma_block_restore_status(void *param); +my_bool _ma_block_check_status(void *param); diff --git a/storage/maria/ma_cache.c b/storage/maria/ma_cache.c new file mode 100644 index 00000000000..82b5ddd8047 --- /dev/null +++ b/storage/maria/ma_cache.c @@ -0,0 +1,107 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Functions for read record caching with maria + Used for reading dynamic/compressed records from datafile. + + Can fetch data directly from file (outside cache), + if reading a small chunk straight before the cached part (with possible + overlap). + + Can be explicitly asked not to use cache (by not setting READING_NEXT in + flag) - useful for occasional out-of-cache reads, when the next read is + expected to hit the cache again. 
+ + Allows "partial read" errors in the record header (when READING_HEADER flag + is set) - unread part is bzero'ed + + Note: out-of-cache reads are enabled for shared IO_CACHE's too, + as these reads will be cached by OS cache (and my_pread is always atomic) +*/ + + +#include "maria_def.h" + +my_bool _ma_read_cache(IO_CACHE *info, uchar *buff, my_off_t pos, + size_t length, uint flag) +{ + size_t read_length,in_buff_length; + my_off_t offset; + uchar *in_buff_pos; + DBUG_ENTER("_ma_read_cache"); + + if (pos < info->pos_in_file) + { + read_length=length; + if ((my_off_t) read_length > (my_off_t) (info->pos_in_file-pos)) + read_length=(uint) (info->pos_in_file-pos); + info->seek_not_done=1; + if (my_pread(info->file,buff,read_length,pos,MYF(MY_NABP))) + DBUG_RETURN(1); + if (!(length-=read_length)) + DBUG_RETURN(0); + pos+=read_length; + buff+=read_length; + } + if (pos >= info->pos_in_file && + (offset= (my_off_t) (pos - info->pos_in_file)) < + (my_off_t) (info->read_end - info->request_pos)) + { + in_buff_pos=info->request_pos+(uint) offset; + in_buff_length= min(length,(size_t) (info->read_end-in_buff_pos)); + memcpy(buff,info->request_pos+(uint) offset,(size_t) in_buff_length); + if (!(length-=in_buff_length)) + DBUG_RETURN(0); + pos+=in_buff_length; + buff+=in_buff_length; + } + else + in_buff_length=0; + if (flag & READING_NEXT) + { + if (pos != (info->pos_in_file + + (uint) (info->read_end - info->request_pos))) + { + info->pos_in_file=pos; /* Force start here */ + info->read_pos=info->read_end=info->request_pos; /* Everything used */ + info->seek_not_done=1; + } + else + info->read_pos=info->read_end; /* All block used */ + if (!(*info->read_function)(info,buff,length)) + DBUG_RETURN(0); + read_length=info->error; + } + else + { + info->seek_not_done=1; + if ((read_length=my_pread(info->file,buff,length,pos,MYF(0))) == length) + DBUG_RETURN(0); + } + if (!(flag & READING_HEADER) || (int) read_length == -1 || + read_length+in_buff_length < 3) + { + 
DBUG_PRINT("error", + ("Error %d reading next-multi-part block (Got %d bytes)", + my_errno, (int) read_length)); + if (!my_errno || my_errno == HA_ERR_FILE_TOO_SHORT) + my_errno= HA_ERR_WRONG_IN_RECORD; + DBUG_RETURN(1); + } + bzero(buff+read_length,MARIA_BLOCK_INFO_HEADER_LENGTH - in_buff_length - + read_length); + DBUG_RETURN(0); +} /* _ma_read_cache */ diff --git a/storage/maria/ma_changed.c b/storage/maria/ma_changed.c new file mode 100644 index 00000000000..4d0964581f6 --- /dev/null +++ b/storage/maria/ma_changed.c @@ -0,0 +1,33 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Check if somebody has changed table since last check. 
*/ + +#include "maria_def.h" + + /* Return 0 if table isn't changed */ + +int maria_is_changed(MARIA_HA *info) +{ + int result; + DBUG_ENTER("maria_is_changed"); + if (fast_ma_readinfo(info)) + DBUG_RETURN(-1); + VOID(_ma_writeinfo(info,0)); + result=(int) info->data_changed; + info->data_changed=0; + DBUG_PRINT("exit",("result: %d",result)); + DBUG_RETURN(result); +} diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c new file mode 100644 index 00000000000..307befab5c7 --- /dev/null +++ b/storage/maria/ma_check.c @@ -0,0 +1,6805 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Describe, check and repair of MARIA tables */ + +/* + About checksum calculation. + + There are two types of checksums. Table checksum and row checksum. + + Row checksum is an additional uchar at the end of dynamic length + records. It must be calculated if the table is configured for them. + Otherwise they must not be used. The variable + MYISAM_SHARE::calc_checksum determines if row checksums are used. + MI_INFO::checksum is used as temporary storage during row handling. + For parallel repair we must assure that only one thread can use this + variable. There is no problem on the write side as this is done by one + thread only. But when checking a record after read this could go + wrong. 
But since all threads read through a common read buffer, it is + sufficient if only one thread checks it. + + Table checksum is an eight uchar value in the header of the index file. + It can be calculated even if row checksums are not used. The variable + MI_CHECK::glob_crc is calculated over all records. + MI_SORT_PARAM::calc_checksum determines if this should be done. This + variable is not part of MI_CHECK because it must be set per thread for + parallel repair. The global glob_crc must be changed by one thread + only. And it is sufficient to calculate the checksum once only. +*/ + +#include "ma_ftdefs.h" +#include "ma_rt_index.h" +#include "ma_blockrec.h" +#include "trnman.h" +#include "ma_key_recover.h" + +#include <stdarg.h> +#include <my_getopt.h> +#ifdef HAVE_SYS_VADVISE_H +#include <sys/vadvise.h> +#endif +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif + +/* Functions defined in this file */ + +static int check_k_link(HA_CHECK *param, MARIA_HA *info, my_off_t next_link); +static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo, + MARIA_PAGE *page, ha_rows *keys, + ha_checksum *key_checksum, uint level); +static uint isam_key_length(MARIA_HA *info,MARIA_KEYDEF *keyinfo); +static ha_checksum calc_checksum(ha_rows count); +static int writekeys(MARIA_SORT_PARAM *sort_param); +static int sort_one_index(HA_CHECK *param, MARIA_HA *info, + MARIA_KEYDEF *keyinfo, + my_off_t pagepos, File new_file); +static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key); +static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key); +static int sort_get_next_record(MARIA_SORT_PARAM *sort_param); +static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a, + const void *b); +static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param, + const uchar *a); +static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a); +static my_off_t get_record_for_key(MARIA_KEYDEF *keyinfo, const uchar *key); +static int 
sort_insert_key(MARIA_SORT_PARAM *sort_param, + reg1 SORT_KEY_BLOCKS *key_block, + const uchar *key, my_off_t prev_block); +static int sort_delete_record(MARIA_SORT_PARAM *sort_param); +/*static int _ma_flush_pending_blocks(HA_CHECK *param);*/ +static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks, + uint buffer_length); +static ha_checksum maria_byte_checksum(const uchar *buf, uint length); +static void set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share); +static void restore_data_file_type(MARIA_SHARE *share); +static void change_data_file_descriptor(MARIA_HA *info, File new_file); +static void unuse_data_file_descriptor(MARIA_HA *info); +static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info, + MARIA_HA *info, uchar *record); +static void copy_data_file_state(MARIA_STATE_INFO *to, + MARIA_STATE_INFO *from); +static void report_keypage_fault(HA_CHECK *param, MARIA_HA *info, + my_off_t position); +static my_bool create_new_data_handle(MARIA_SORT_PARAM *param, File new_file); +static my_bool _ma_flush_table_files_before_swap(HA_CHECK *param, + MARIA_HA *info); +static TrID max_trid_in_system(void); +static void _ma_check_print_not_visible_error(HA_CHECK *param, TrID used_trid); +void retry_if_quick(MARIA_SORT_PARAM *param, int error); + + +/* Initialize check param with default values */ + +void maria_chk_init(HA_CHECK *param) +{ + bzero((uchar*) param,sizeof(*param)); + param->opt_follow_links=1; + param->keys_in_use= ~(ulonglong) 0; + param->search_after_block=HA_OFFSET_ERROR; + param->auto_increment_value= 0; + param->use_buffers=USE_BUFFER_INIT; + param->read_buffer_length=READ_BUFFER_INIT; + param->write_buffer_length=READ_BUFFER_INIT; + param->sort_buffer_length=SORT_BUFFER_INIT; + param->sort_key_blocks=BUFFERS_WHEN_SORTING; + param->tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL; + param->myf_rw=MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL); + param->start_check_pos=0; + param->max_record_length= LONGLONG_MAX; + 
param->pagecache_block_size= KEY_CACHE_BLOCK_SIZE; + param->stats_method= MI_STATS_METHOD_NULLS_NOT_EQUAL; +} + + +/* Initialize check param and maria handler for check of table */ + +void maria_chk_init_for_check(HA_CHECK *param, MARIA_HA *info) +{ + param->not_visible_rows_found= 0; + param->max_found_trid= 0; + + /* + Set up transaction handler so that we can see all rows. When rows are read + we will check the found id against param->max_trid + */ + if (param->max_trid == 0) + { + if (!ma_control_file_inited()) + param->max_trid= 0; /* Give warning for first trid found */ + else + param->max_trid= max_trid_in_system(); + } + maria_ignore_trids(info); +} + + + /* Check the status flags for the table */ + +int maria_chk_status(HA_CHECK *param, MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + + if (maria_is_crashed_on_repair(info)) + _ma_check_print_warning(param, + "Table is marked as crashed and last repair failed"); + else if (maria_in_repair(info)) + _ma_check_print_warning(param, + "Last repair was aborted before finishing"); + else if (maria_is_crashed(info)) + _ma_check_print_warning(param, + "Table is marked as crashed"); + if (share->state.open_count != (uint) (share->global_changed ? 1 : 0)) + { + /* Don't count this as a real warning, as check can correct this ! */ + uint save=param->warning_printed; + _ma_check_print_warning(param, + share->state.open_count==1 ? 
+ "%d client is using or hasn't closed the table properly" : + "%d clients are using or haven't closed the table properly", + share->state.open_count); + /* If this will be fixed by the check, forget the warning */ + if (param->testflag & T_UPDATE_STATE) + param->warning_printed=save; + } + return 0; +} + +/* + Check delete links in row data +*/ + +int maria_chk_del(HA_CHECK *param, register MARIA_HA *info, + ulonglong test_flag) +{ + MARIA_SHARE *share= info->s; + reg2 ha_rows i; + uint delete_link_length; + my_off_t empty,next_link,old_link; + char buff[22],buff2[22]; + DBUG_ENTER("maria_chk_del"); + + LINT_INIT(old_link); + + param->record_checksum=0; + + if (share->data_file_type == BLOCK_RECORD) + DBUG_RETURN(0); /* No delete links here */ + + delete_link_length=((share->options & HA_OPTION_PACK_RECORD) ? 20 : + share->rec_reflength+1); + + if (!(test_flag & T_SILENT)) + puts("- check record delete-chain"); + + next_link=share->state.dellink; + if (share->state.state.del == 0) + { + if (test_flag & T_VERBOSE) + { + puts("No recordlinks"); + } + } + else + { + if (test_flag & T_VERBOSE) + printf("Recordlinks: "); + empty=0; + for (i= share->state.state.del ; i > 0L && next_link != HA_OFFSET_ERROR ; i--) + { + if (_ma_killed_ptr(param)) + DBUG_RETURN(1); + if (test_flag & T_VERBOSE) + printf(" %9s",llstr(next_link,buff)); + if (next_link >= share->state.state.data_file_length) + goto wrong; + if (my_pread(info->dfile.file, (uchar*) buff, delete_link_length, + next_link,MYF(MY_NABP))) + { + if (test_flag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"Can't read delete-link at filepos: %s", + llstr(next_link,buff)); + DBUG_RETURN(1); + } + if (*buff != '\0') + { + if (test_flag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"Record at pos: %s is not remove-marked", + llstr(next_link,buff)); + goto wrong; + } + if (share->options & HA_OPTION_PACK_RECORD) + { + my_off_t prev_link=mi_sizekorr(buff+12); + if (empty && prev_link != old_link) + { + if 
(test_flag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"Deleted block at %s doesn't point back at previous delete link",llstr(next_link,buff2)); + goto wrong; + } + old_link=next_link; + next_link=mi_sizekorr(buff+4); + empty+=mi_uint3korr(buff+1); + } + else + { + param->record_checksum+=(ha_checksum) next_link; + next_link= _ma_rec_pos(share, (uchar *) buff + 1); + empty+=share->base.pack_reclength; + } + } + if (share->state.state.del && (test_flag & T_VERBOSE)) + puts("\n"); + if (empty != share->state.state.empty) + { + _ma_check_print_warning(param, + "Found %s deleted space in delete link chain. Should be %s", + llstr(empty,buff2), + llstr(share->state.state.empty,buff)); + } + if (next_link != HA_OFFSET_ERROR) + { + _ma_check_print_error(param, + "Found more than the expected %s deleted rows in delete link chain", + llstr(share->state.state.del, buff)); + goto wrong; + } + if (i != 0) + { + _ma_check_print_error(param, + "Found %s deleted rows in delete link chain. Should be %s", + llstr(share->state.state.del - i, buff2), + llstr(share->state.state.del, buff)); + goto wrong; + } + } + DBUG_RETURN(0); + +wrong: + param->testflag|=T_RETRY_WITHOUT_QUICK; + if (test_flag & T_VERBOSE) + puts(""); + _ma_check_print_error(param,"record delete-link-chain corrupted"); + DBUG_RETURN(1); +} /* maria_chk_del */ + + +/* Check delete links in index file */ + +static int check_k_link(HA_CHECK *param, register MARIA_HA *info, + my_off_t next_link) +{ + MARIA_SHARE *share= info->s; + uint block_size= share->block_size; + ha_rows records; + char llbuff[21], llbuff2[21]; + uchar *buff; + DBUG_ENTER("check_k_link"); + + if (next_link == HA_OFFSET_ERROR) + DBUG_RETURN(0); /* Avoid printing empty line */ + + records= (ha_rows) (share->state.state.key_file_length / block_size); + while (next_link != HA_OFFSET_ERROR && records > 0) + { + if (_ma_killed_ptr(param)) + DBUG_RETURN(1); + if (param->testflag & T_VERBOSE) + printf("%16s",llstr(next_link,llbuff)); + + /* Key 
blocks must lay within the key file length entirely. */ + if (next_link + block_size > share->state.state.key_file_length) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "Invalid key block position: %s " + "key block size: %u file_length: %s", + llstr(next_link, llbuff), block_size, + llstr(share->state.state.key_file_length, llbuff2)); + DBUG_RETURN(1); + /* purecov: end */ + } + + /* Key blocks must be aligned at block_size */ + if (next_link & (block_size -1)) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "Mis-aligned key block: %s " + "minimum key block length: %u", + llstr(next_link, llbuff), + block_size); + DBUG_RETURN(1); + /* purecov: end */ + } + + DBUG_ASSERT(share->pagecache->block_size == block_size); + if (!(buff= pagecache_read(share->pagecache, + &share->kfile, + (pgcache_page_no_t) (next_link / block_size), + DFLT_INIT_HITS, + info->buff, PAGECACHE_READ_UNKNOWN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "key cache read error for block: %s", + llstr(next_link,llbuff)); + DBUG_RETURN(1); + /* purecov: end */ + } + if (_ma_get_keynr(info->s, buff) != MARIA_DELETE_KEY_NR) + _ma_check_print_error(param, "Page at %s is not delete marked", + llstr(next_link, llbuff)); + + next_link= mi_sizekorr(buff + share->keypage_header); + records--; + param->key_file_blocks+=block_size; + } + if (param->testflag & T_VERBOSE) + { + if (next_link != HA_OFFSET_ERROR) + printf("%16s\n",llstr(next_link,llbuff)); + else + puts(""); + } + DBUG_RETURN (next_link != HA_OFFSET_ERROR); +} /* check_k_link */ + + + /* Check sizes of files */ + +int maria_chk_size(HA_CHECK *param, register MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + int error; + register my_off_t skr,size; + char buff[22],buff2[22]; + DBUG_ENTER("maria_chk_size"); + + if (!(param->testflag & T_SILENT)) + puts("- check file-size"); + + /* + The following is needed if called externally (not from maria_chk). 
+ To get a correct physical size we need to flush them. + */ + if ((error= _ma_flush_table_files(info, + MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE))) + _ma_check_print_error(param, "Failed to flush data or index file"); + + size= my_seek(share->kfile.file, 0L, MY_SEEK_END, MYF(MY_THREADSAFE)); + if ((skr=(my_off_t) share->state.state.key_file_length) != size) + { + /* Don't give error if file generated by mariapack */ + if (skr > size && maria_is_any_key_active(share->state.key_map)) + { + error=1; + _ma_check_print_error(param, + "Size of indexfile is: %-8s Should be: %s", + llstr(size,buff), llstr(skr,buff2)); + } + else if (!(param->testflag & T_VERY_SILENT)) + _ma_check_print_warning(param, + "Size of indexfile is: %-8s Should be: %s", + llstr(size,buff), llstr(skr,buff2)); + } + if (!(param->testflag & T_VERY_SILENT) && + ! (share->options & HA_OPTION_COMPRESS_RECORD) && + ulonglong2double(share->state.state.key_file_length) > + ulonglong2double(share->base.margin_key_file_length)*0.9) + _ma_check_print_warning(param,"Keyfile is almost full, %10s of %10s used", + llstr(share->state.state.key_file_length,buff), + llstr(share->base.max_key_file_length-1,buff)); + + size= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)); + skr=(my_off_t) share->state.state.data_file_length; + if (share->options & HA_OPTION_COMPRESS_RECORD) + skr+= MEMMAP_EXTRA_MARGIN; +#ifdef USE_RELOC + if (share->data_file_type == STATIC_RECORD && + skr < (my_off_t) share->base.reloc*share->base.min_pack_length) + skr=(my_off_t) share->base.reloc*share->base.min_pack_length; +#endif + if (skr != size) + { + if (skr > size && skr != size + MEMMAP_EXTRA_MARGIN) + { + share->state.state.data_file_length=size; /* Skip other errors */ + error=1; + _ma_check_print_error(param,"Size of datafile is: %-9s Should be: %s", + llstr(size,buff), llstr(skr,buff2)); + param->testflag|=T_RETRY_WITHOUT_QUICK; + } + else + { + _ma_check_print_warning(param, + "Size of datafile 
is: %-9s Should be: %s", + llstr(size,buff), llstr(skr,buff2)); + } + } + if (!(param->testflag & T_VERY_SILENT) && + !(share->options & HA_OPTION_COMPRESS_RECORD) && + ulonglong2double(share->state.state.data_file_length) > + (ulonglong2double(share->base.max_data_file_length)*0.9)) + _ma_check_print_warning(param, "Datafile is almost full, %10s of %10s used", + llstr(share->state.state.data_file_length,buff), + llstr(share->base.max_data_file_length-1,buff2)); + DBUG_RETURN(error); +} /* maria_chk_size */ + + +/* Check keys */ + +int maria_chk_key(HA_CHECK *param, register MARIA_HA *info) +{ + uint key,found_keys=0,full_text_keys=0,result=0; + ha_rows keys; + ha_checksum old_record_checksum,init_checksum; + my_off_t all_keydata,all_totaldata,key_totlength,length; + double *rec_per_key_part; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + char buff[22],buff2[22]; + MARIA_PAGE page; + DBUG_ENTER("maria_chk_key"); + + if (!(param->testflag & T_SILENT)) + puts("- check key delete-chain"); + + param->key_file_blocks=share->base.keystart; + if (check_k_link(param, info, share->state.key_del)) + { + if (param->testflag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"key delete-link-chain corrupted"); + DBUG_RETURN(-1); + } + + if (!(param->testflag & T_SILENT)) + puts("- check index reference"); + + all_keydata=all_totaldata=key_totlength=0; + init_checksum=param->record_checksum; + old_record_checksum=0; + if (share->data_file_type == STATIC_RECORD) + old_record_checksum= (calc_checksum(share->state.state.records + + share->state.state.del-1) * + share->base.pack_reclength); + rec_per_key_part= param->new_rec_per_key_part; + for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ; + rec_per_key_part+=keyinfo->keysegs, key++, keyinfo++) + { + param->key_crc[key]=0; + if (! 
maria_is_key_active(share->state.key_map, key)) + { + /* Remember old statistics for key */ + memcpy((char*) rec_per_key_part, + (char*) (share->state.rec_per_key_part + + (uint) (rec_per_key_part - param->new_rec_per_key_part)), + keyinfo->keysegs*sizeof(*rec_per_key_part)); + continue; + } + found_keys++; + + param->record_checksum=init_checksum; + + bzero((char*) ¶m->unique_count,sizeof(param->unique_count)); + bzero((char*) ¶m->notnull_count,sizeof(param->notnull_count)); + + if ((!(param->testflag & T_SILENT))) + printf ("- check data record references index: %d\n",key+1); + if (keyinfo->flag & (HA_FULLTEXT | HA_SPATIAL)) + full_text_keys++; + if (share->state.key_root[key] == HA_OFFSET_ERROR) + { + if (share->state.state.records != 0 && !(keyinfo->flag & HA_FULLTEXT)) + _ma_check_print_error(param, "Key tree %u is empty", key + 1); + goto do_stat; + } + if (_ma_fetch_keypage(&page, info, keyinfo, share->state.key_root[key], + PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, + info->buff, 0)) + { + report_keypage_fault(param, info, share->state.key_root[key]); + if (!(param->testflag & T_INFO)) + DBUG_RETURN(-1); + result= -1; + continue; + } + param->key_file_blocks+=keyinfo->block_length; + keys=0; + param->keydata=param->totaldata=0; + param->key_blocks=0; + param->max_level=0; + if (chk_index(param, info,keyinfo, &page, &keys, param->key_crc+key,1)) + DBUG_RETURN(-1); + if (!(keyinfo->flag & (HA_FULLTEXT | HA_SPATIAL | HA_RTREE_INDEX))) + { + if (keys != share->state.state.records) + { + _ma_check_print_error(param,"Found %s keys of %s",llstr(keys,buff), + llstr(share->state.state.records,buff2)); + if (!(param->testflag & T_INFO)) + DBUG_RETURN(-1); + result= -1; + continue; + } + if ((found_keys - full_text_keys == 1 && + !(share->data_file_type == STATIC_RECORD)) || + (param->testflag & T_DONT_CHECK_CHECKSUM)) + old_record_checksum= param->record_checksum; + else if (old_record_checksum != param->record_checksum) + { + if (key) + 
_ma_check_print_error(param, + "Key %u doesn't point at same records as " + "key 1", + key+1); + else + _ma_check_print_error(param,"Key 1 doesn't point at all records"); + if (!(param->testflag & T_INFO)) + DBUG_RETURN(-1); + result= -1; + continue; + } + } + if ((uint) share->base.auto_key -1 == key) + { + /* Check that auto_increment key is bigger than max key value */ + ulonglong auto_increment; + const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg; + info->lastinx=key; + _ma_read_key_record(info, info->rec_buff, 0); + auto_increment= + ma_retrieve_auto_increment(info->rec_buff + keyseg->start, + keyseg->type); + if (auto_increment > share->state.auto_increment) + { + _ma_check_print_warning(param, "Auto-increment value: %s is smaller " + "than max used value: %s", + llstr(share->state.auto_increment,buff2), + llstr(auto_increment, buff)); + } + if (param->testflag & T_AUTO_INC) + { + set_if_bigger(share->state.auto_increment, + auto_increment); + set_if_bigger(share->state.auto_increment, + param->auto_increment_value); + } + + /* Check that there isn't a row with auto_increment = 0 in the table */ + maria_extra(info,HA_EXTRA_KEYREAD,0); + bzero(info->lastkey_buff, keyinfo->seg->length); + if (!maria_rkey(info, info->rec_buff, key, + info->lastkey_buff, + (key_part_map) 1, HA_READ_KEY_EXACT)) + { + /* Don't count this as a real warning, as maria_chk can't correct it */ + uint save=param->warning_printed; + _ma_check_print_warning(param, "Found row where the auto_increment " + "column has the value 0"); + param->warning_printed=save; + } + maria_extra(info,HA_EXTRA_NO_KEYREAD,0); + } + + length=(my_off_t) isam_key_length(info,keyinfo)*keys + param->key_blocks*2; + if (param->testflag & T_INFO && param->totaldata != 0L && keys != 0L) + printf("Key: %2d: Keyblocks used: %3d%% Packed: %4d%% Max levels: %2d\n", + key+1, + (int) (my_off_t2double(param->keydata)*100.0/my_off_t2double(param->totaldata)), + (int) ((my_off_t2double(length) - 
my_off_t2double(param->keydata))*100.0/ + my_off_t2double(length)), + param->max_level); + all_keydata+=param->keydata; all_totaldata+=param->totaldata; key_totlength+=length; + +do_stat: + if (param->testflag & T_STATISTICS) + maria_update_key_parts(keyinfo, rec_per_key_part, param->unique_count, + param->stats_method == MI_STATS_METHOD_IGNORE_NULLS? + param->notnull_count: NULL, + (ulonglong)share->state.state.records); + } + if (param->testflag & T_INFO) + { + if (all_totaldata != 0L && found_keys > 0) + printf("Total: Keyblocks used: %3d%% Packed: %4d%%\n\n", + (int) (my_off_t2double(all_keydata)*100.0/ + my_off_t2double(all_totaldata)), + (int) ((my_off_t2double(key_totlength) - + my_off_t2double(all_keydata))*100.0/ + my_off_t2double(key_totlength))); + else if (all_totaldata != 0L && maria_is_any_key_active(share->state.key_map)) + puts(""); + } + if (param->key_file_blocks != share->state.state.key_file_length && + share->state.key_map == ~(ulonglong) 0) + _ma_check_print_warning(param, "Some data are unreferenced in keyfile"); + if (found_keys != full_text_keys) + param->record_checksum=old_record_checksum-init_checksum; /* Remove delete links */ + else + param->record_checksum=0; + DBUG_RETURN(result); +} /* maria_chk_key */ + + + +static int chk_index_down(HA_CHECK *param, MARIA_HA *info, + MARIA_KEYDEF *keyinfo, + my_off_t page, uchar *buff, ha_rows *keys, + ha_checksum *key_checksum, uint level) +{ + char llbuff[22],llbuff2[22]; + MARIA_SHARE *share= info->s; + MARIA_PAGE ma_page; + DBUG_ENTER("chk_index_down"); + + /* Key blocks must lay within the key file length entirely. */ + if (page + keyinfo->block_length > share->state.state.key_file_length) + { + /* purecov: begin tested */ + /* Give it a chance to fit in the real file size. 
*/ + my_off_t max_length= my_seek(info->s->kfile.file, 0L, MY_SEEK_END, + MYF(MY_THREADSAFE)); + _ma_check_print_error(param, "Invalid key block position: %s " + "key block size: %u file_length: %s", + llstr(page, llbuff), keyinfo->block_length, + llstr(share->state.state.key_file_length, llbuff2)); + if (page + keyinfo->block_length > max_length) + goto err; + /* Fix the remembered key file length. */ + share->state.state.key_file_length= (max_length & + ~ (my_off_t) (keyinfo->block_length - + 1)); + /* purecov: end */ + } + + /* Key blocks must be aligned at block length */ + if (page & (info->s->block_size -1)) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "Mis-aligned key block: %s " + "key block length: %u", + llstr(page, llbuff), info->s->block_size); + goto err; + /* purecov: end */ + } + + if (_ma_fetch_keypage(&ma_page, info, keyinfo, page, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, buff, 0)) + { + report_keypage_fault(param, info, page); + goto err; + } + param->key_file_blocks+=keyinfo->block_length; + if (chk_index(param, info, keyinfo, &ma_page, keys, key_checksum,level)) + goto err; + + DBUG_RETURN(0); + + /* purecov: begin tested */ +err: + DBUG_RETURN(1); + /* purecov: end */ +} + + +/* + "Ignore NULLs" statistics collection method: process first index tuple. + + SYNOPSIS + maria_collect_stats_nonulls_first() + keyseg IN Array of key part descriptions + notnull INOUT Array, notnull[i] = (number of {keypart1...keypart_i} + tuples that don't contain NULLs) + key IN Key values tuple + + DESCRIPTION + Process the first index tuple - find out which prefix tuples don't + contain NULLs, and update the array of notnull counters accordingly. 
+*/ + +static +void maria_collect_stats_nonulls_first(HA_KEYSEG *keyseg, ulonglong *notnull, + const uchar *key) +{ + uint first_null, kp; + first_null= ha_find_null(keyseg, key) - keyseg; + /* + All prefix tuples that don't include keypart_{first_null} are not-null + tuples (and all others aren't), increment counters for them. + */ + for (kp= 0; kp < first_null; kp++) + notnull[kp]++; +} + + +/* + "Ignore NULLs" statistics collection method: process next index tuple. + + SYNOPSIS + maria_collect_stats_nonulls_next() + keyseg IN Array of key part descriptions + notnull INOUT Array, notnull[i] = (number of {keypart1...keypart_i} + tuples that don't contain NULLs) + prev_key IN Previous key values tuple + last_key IN Next key values tuple + + DESCRIPTION + Process the next index tuple: + 1. Find out which prefix tuples of last_key don't contain NULLs, and + update the array of notnull counters accordingly. + 2. Find the first keypart number where the prev_key and last_key tuples + are different(A), or last_key has NULL value(B), and return it, so the + caller can count number of unique tuples for each key prefix. We don't + need (B) to be counted, and that is compensated back in + maria_update_key_parts(). + + RETURN + 1 + number of first keypart where values differ or last_key tuple has NULL +*/ + +static +int maria_collect_stats_nonulls_next(HA_KEYSEG *keyseg, ulonglong *notnull, + const uchar *prev_key, + const uchar *last_key) +{ + uint diffs[2]; + uint first_null_seg, kp; + HA_KEYSEG *seg; + + /* + Find the first keypart where values are different or either of them is + NULL. We get results in diffs array: + diffs[0]= 1 + number of first different keypart + diffs[1]=offset: (last_key + diffs[1]) points to first value in + last_key that is NULL or different from corresponding + value in prev_key. 
/*
  Check if index is ok

  Checks one key page and, through chk_index_down(), every page below it.

  param         Check parameters; keydata, totaldata, key_blocks, max_level,
                record_checksum and (with T_STATISTICS) the unique/notnull
                counters are updated here
  info          Maria handler; info->last_key is used to remember the
                previously seen key
  keyinfo       Definition of the key being checked
  anc_page      The already fetched page to check
  keys          In/out: number of keys found so far in this tree
  key_checksum  In/out: sum of maria_byte_checksum() over all keys
  level         Depth of this page in the tree (1 for the root call)

  RETURN
    0   ok (spatial / rtree keys are not checked and always give 0)
    1   Error found (already reported)
    -1  Could not allocate the work buffer for child pages
*/

static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo,
                     MARIA_PAGE *anc_page, ha_rows *keys,
                     ha_checksum *key_checksum, uint level)
{
  int flag;
  uint comp_flag, page_flag, nod_flag;
  uchar *temp_buff, *keypos, *old_keypos, *endpos;
  my_off_t next_page,record;
  MARIA_SHARE *share= info->s;
  char llbuff[22];
  uint diff_pos[2];
  uchar tmp_key_buff[MARIA_MAX_KEY_BUFF];
  MARIA_KEY tmp_key;
  DBUG_ENTER("chk_index");
  DBUG_DUMP("buff", anc_page->buff, anc_page->size);

  /* TODO: implement appropriate check for RTree keys */
  if (keyinfo->flag & (HA_SPATIAL | HA_RTREE_INDEX))
    DBUG_RETURN(0);

  /* Buffer that chk_index_down() reads child pages into */
  if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length)))
  {
    _ma_check_print_error(param,"Not enough memory for keyblock");
    DBUG_RETURN(-1);
  }

  if (keyinfo->flag & HA_NOSAME)
  {
    /* Not real duplicates */
    comp_flag=SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT;
  }
  else
    comp_flag=SEARCH_SAME; /* Keys in positionorder */

  page_flag= anc_page->flag;
  nod_flag= anc_page->node;
  old_keypos= anc_page->buff + share->keypage_header;
  keypos= old_keypos + nod_flag;
  endpos= anc_page->buff + anc_page->size;

  param->keydata+= anc_page->size;
  param->totaldata+= keyinfo->block_length; /* INFO */
  param->key_blocks++;
  if (level > param->max_level)
    param->max_level=level;

  /* The next two problems are reported but do not stop the check */
  if (_ma_get_keynr(share, anc_page->buff) !=
      (uint) (keyinfo - share->keyinfo))
    _ma_check_print_error(param, "Page at %s is not marked for index %u",
                          llstr(anc_page->pos, llbuff),
                          (uint) (keyinfo - share->keyinfo));
  if ((page_flag & KEYPAGE_FLAG_HAS_TRANSID) &&
      !share->base.born_transactional)
  {
    _ma_check_print_error(param,
                          "Page at %s is marked with HAS_TRANSID even if "
                          "table is not transactional",
                          llstr(anc_page->pos, llbuff));
  }

  if (anc_page->size > share->max_index_block_size)
  {
    _ma_check_print_error(param,
                          "Page at %s has impossible (too big) pagelength",
                          llstr(anc_page->pos, llbuff));
    goto err;
  }

  info->last_key.keyinfo= tmp_key.keyinfo= keyinfo;
  tmp_key.data= tmp_key_buff;
  for ( ;; )
  {
    /* On a node page, descend into the child stored before the next key */
    if (nod_flag)
    {
      if (_ma_killed_ptr(param))
        goto err;
      next_page= _ma_kpos(nod_flag,keypos);
      if (chk_index_down(param,info,keyinfo,next_page,
                         temp_buff,keys,key_checksum,level+1))
      {
        DBUG_DUMP("page_data", old_keypos, (uint) (keypos - old_keypos));
        goto err;
      }
    }
    old_keypos=keypos;
    /* Stop at the end of the page or when no further key can be unpacked */
    if (keypos >= endpos ||
        !(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &keypos))
      break;
    if (keypos > endpos)
    {
      _ma_check_print_error(param,
                            "Page length and length of keys don't match at "
                            "page: %s",
                            llstr(anc_page->pos,llbuff));
      goto err;
    }
    if (share->data_file_type == BLOCK_RECORD &&
        !(page_flag & KEYPAGE_FLAG_HAS_TRANSID) &&
        key_has_transid(tmp_key.data + tmp_key.data_length +
                        share->rec_reflength-1))
    {
      _ma_check_print_error(param,
                            "Found key marked for transid on page that is not "
                            "marked for transid at: %s",
                            llstr(anc_page->pos,llbuff));
      goto err;
    }

    /*
      Every key except the very first one of the tree must sort after the
      previous key. The comparison also fills diff_pos, which the
      statistics code below relies on when it does not redo the compare.
    */
    if ((*keys)++ &&
        (flag=ha_key_cmp(keyinfo->seg, info->last_key.data, tmp_key.data,
                         tmp_key.data_length + tmp_key.ref_length,
                         (comp_flag | SEARCH_INSERT | (tmp_key.flag >> 1) |
                          info->last_key.flag), diff_pos)) >=0)
    {
      DBUG_DUMP_KEY("old", &info->last_key);
      DBUG_DUMP_KEY("new", &tmp_key);
      DBUG_DUMP("new_in_page", old_keypos, (uint) (keypos-old_keypos));

      if ((comp_flag & SEARCH_FIND) && flag == 0)
        _ma_check_print_error(param,"Found duplicated key at page %s",
                              llstr(anc_page->pos,llbuff));
      else
        _ma_check_print_error(param,"Key in wrong position at page %s",
                              llstr(anc_page->pos,llbuff));
      goto err;
    }

    if (param->testflag & T_STATISTICS)
    {
      if (*keys != 1L) /* not first_key */
      {
        if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL)
          ha_key_cmp(keyinfo->seg, info->last_key.data,
                     tmp_key.data, tmp_key.data_length,
                     SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL,
                     diff_pos);
        else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
        {
          diff_pos[0]= maria_collect_stats_nonulls_next(keyinfo->seg,
                                                        param->notnull_count,
                                                        info->last_key.data,
                                                        tmp_key.data);
        }
        /* diff_pos[0] is 1 + index of the first key part that differs */
        param->unique_count[diff_pos[0]-1]++;
      }
      else
      {
        if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
          maria_collect_stats_nonulls_first(keyinfo->seg, param->notnull_count,
                                            tmp_key.data);
      }
    }
    _ma_copy_key(&info->last_key, &tmp_key);
    (*key_checksum)+= maria_byte_checksum(tmp_key.data, tmp_key.data_length);
    record= _ma_row_pos_from_key(&tmp_key);

    if (keyinfo->flag & HA_FULLTEXT) /* special handling for ft2 */
    {
      uint off;
      int subkeys;
      get_key_full_length_rdonly(off, tmp_key.data);
      subkeys= ft_sintXkorr(tmp_key.data + off);
      if (subkeys < 0)
      {
        /*
          A negative count means 'record' is the root of a second level
          tree; -subkeys must equal the number of keys found in it.
        */
        ha_rows tmp_keys=0;
        if (chk_index_down(param,info,&share->ft2_keyinfo,record,
                           temp_buff,&tmp_keys,key_checksum,1))
          goto err;
        if (tmp_keys + subkeys)
        {
          _ma_check_print_error(param,
                                "Number of words in the 2nd level tree "
                                "does not match the number in the header. "
                                "Parent word in on the page %s, offset %u",
                                llstr(anc_page->pos,llbuff),
                                (uint) (old_keypos - anc_page->buff));
          goto err;
        }
        /* The parent word itself was already counted by (*keys)++ above */
        (*keys)+=tmp_keys-1;
        continue;
      }
      /* fall through */
    }
    /* The key must reference a position inside the data file */
    if ((share->data_file_type != BLOCK_RECORD &&
         record >= share->state.state.data_file_length) ||
        (share->data_file_type == BLOCK_RECORD &&
         ma_recordpos_to_page(record) * share->base.min_block_length >=
         share->state.state.data_file_length))
    {
#ifndef DBUG_OFF
      char llbuff2[22], llbuff3[22];
#endif
      _ma_check_print_error(param,
                            "Found key at page %s that points to record "
                            "outside datafile",
                            llstr(anc_page->pos,llbuff));
      DBUG_PRINT("test",("page: %s record: %s filelength: %s",
                         llstr(anc_page->pos,llbuff),llstr(record,llbuff2),
                         llstr(share->state.state.data_file_length,llbuff3)));
      DBUG_DUMP_KEY("key", &tmp_key);
      DBUG_DUMP("new_in_page", old_keypos, (uint) (keypos-old_keypos));
      goto err;
    }
    param->record_checksum+= (ha_checksum) record;
  }
  /* The keys must fill the used part of the page exactly */
  if (keypos != endpos)
  {
    _ma_check_print_error(param,
                          "Keyblock size at page %s is not correct. "
                          "Block length: %u key length: %u",
                          llstr(anc_page->pos, llbuff), anc_page->size,
                          (uint) (keypos - anc_page->buff));
    goto err;
  }
  my_afree(temp_buff);
  DBUG_RETURN(0);
 err:
  my_afree(temp_buff);
  DBUG_RETURN(1);
} /* chk_index */
" + "Block length: %u key length: %u", + llstr(anc_page->pos, llbuff), anc_page->size, + (uint) (keypos - anc_page->buff)); + goto err; + } + my_afree(temp_buff); + DBUG_RETURN(0); + err: + my_afree(temp_buff); + DBUG_RETURN(1); +} /* chk_index */ + + + /* Calculate a checksum of 1+2+3+4...N = N*(N+1)/2 without overflow */ + +static ha_checksum calc_checksum(ha_rows count) +{ + ulonglong sum,a,b; + DBUG_ENTER("calc_checksum"); + + sum=0; + a=count; b=count+1; + if (a & 1) + b>>=1; + else + a>>=1; + while (b) + { + if (b & 1) + sum+=a; + a<<=1; b>>=1; + } + DBUG_PRINT("exit",("sum: %lx",(ulong) sum)); + DBUG_RETURN((ha_checksum) sum); +} /* calc_checksum */ + + + /* Calc length of key in normal isam */ + +static uint isam_key_length(MARIA_HA *info, register MARIA_KEYDEF *keyinfo) +{ + uint length; + HA_KEYSEG *keyseg; + DBUG_ENTER("isam_key_length"); + + length= info->s->rec_reflength; + for (keyseg=keyinfo->seg ; keyseg->type ; keyseg++) + length+= keyseg->length; + + DBUG_PRINT("exit",("length: %d",length)); + DBUG_RETURN(length); +} /* key_length */ + + + +static void record_pos_to_txt(MARIA_HA *info, my_off_t recpos, + char *buff) +{ + if (info->s->data_file_type != BLOCK_RECORD) + llstr(recpos, buff); + else + { + my_off_t page= ma_recordpos_to_page(recpos); + uint row= ma_recordpos_to_dir_entry(recpos); + char *end= longlong10_to_str(page, buff, 10); + *(end++)= ':'; + longlong10_to_str(row, end, 10); + } +} + + +/* + Check that keys in records exist in index tree + + SYNOPSIS + check_keys_in_record() + param Check paramenter + info Maria handler + extend Type of check (extended or normal) + start_recpos Position to row + record Record buffer + + NOTES + This function also calculates record checksum & number of rows +*/ + +static int check_keys_in_record(HA_CHECK *param, MARIA_HA *info, int extend, + my_off_t start_recpos, uchar *record) +{ + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + char llbuff[22+4]; + uint keynr; + + 
param->tmp_record_checksum+= (ha_checksum) start_recpos; + param->records++; + if (param->testflag & T_WRITE_LOOP && param->records % WRITE_COUNT == 0) + { + printf("%s\r", llstr(param->records, llbuff)); + VOID(fflush(stdout)); + } + + /* Check if keys match the record */ + for (keynr=0, keyinfo= share->keyinfo; keynr < share->base.keys; + keynr++, keyinfo++) + { + if (maria_is_key_active(share->state.key_map, keynr)) + { + MARIA_KEY key; + if (!(keyinfo->flag & HA_FULLTEXT)) + { + (*keyinfo->make_key)(info, &key, keynr, info->lastkey_buff, record, + start_recpos, 0); + if (extend) + { + /* We don't need to lock the key tree here as we don't allow + concurrent threads when running maria_chk + */ + int search_result= +#ifdef HAVE_RTREE_KEYS + (keyinfo->flag & (HA_SPATIAL | HA_RTREE_INDEX)) ? + maria_rtree_find_first(info, &key, MBR_EQUAL | MBR_DATA) : +#endif + _ma_search(info, &key, SEARCH_SAME, share->state.key_root[keynr]); + if (search_result) + { + record_pos_to_txt(info, start_recpos, llbuff); + _ma_check_print_error(param, + "Record at: %14s " + "Can't find key for index: %2d", + llbuff, keynr+1); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + return -1; + } + } + else + param->tmp_key_crc[keynr]+= + maria_byte_checksum(key.data, key.data_length); + } + } + } + return 0; +} + + +/* + Functions to loop through all rows and check if they are ok + + NOTES + One function for each record format + + RESULT + 0 ok + -1 Interrupted by user + 1 Error +*/ + +static int check_static_record(HA_CHECK *param, MARIA_HA *info, int extend, + uchar *record) +{ + MARIA_SHARE *share= info->s; + my_off_t start_recpos, pos; + char llbuff[22]; + + pos= 0; + while (pos < share->state.state.data_file_length) + { + if (_ma_killed_ptr(param)) + return -1; + if (my_b_read(¶m->read_cache, record, + share->base.pack_reclength)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at position: " + "%s", + my_errno, llstr(pos, llbuff)); + return 
1; + } + start_recpos= pos; + pos+= share->base.pack_reclength; + param->splits++; + if (*record == '\0') + { + param->del_blocks++; + param->del_length+= share->base.pack_reclength; + continue; /* Record removed */ + } + param->glob_crc+= _ma_static_checksum(info,record); + param->used+= share->base.pack_reclength; + if (check_keys_in_record(param, info, extend, start_recpos, record)) + return 1; + } + return 0; +} + + +static int check_dynamic_record(HA_CHECK *param, MARIA_HA *info, int extend, + uchar *record) +{ + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share= info->s; + my_off_t start_recpos, start_block, pos; + uchar *to; + ulong left_length; + uint b_type; + char llbuff[22],llbuff2[22],llbuff3[22]; + DBUG_ENTER("check_dynamic_record"); + + LINT_INIT(left_length); + LINT_INIT(start_recpos); + LINT_INIT(to); + + pos= 0; + while (pos < share->state.state.data_file_length) + { + my_bool got_error= 0; + int flag; + if (_ma_killed_ptr(param)) + DBUG_RETURN(-1); + + flag= block_info.second_read=0; + block_info.next_filepos=pos; + do + { + if (_ma_read_cache(¶m->read_cache, block_info.header, + (start_block=block_info.next_filepos), + sizeof(block_info.header), + (flag ? 
0 : READING_NEXT) | READING_HEADER)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at " + "position: %s", + my_errno, llstr(start_block, llbuff)); + DBUG_RETURN(1); + } + + if (start_block & (MARIA_DYN_ALIGN_SIZE-1)) + { + _ma_check_print_error(param,"Wrong aligned block at %s", + llstr(start_block,llbuff)); + DBUG_RETURN(1); + } + b_type= _ma_get_block_info(&block_info,-1,start_block); + if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if (b_type & BLOCK_SYNC_ERROR) + { + if (flag) + { + _ma_check_print_error(param,"Unexpected byte: %d at link: %s", + (int) block_info.header[0], + llstr(start_block,llbuff)); + DBUG_RETURN(1); + } + pos=block_info.filepos+block_info.block_len; + goto next; + } + if (b_type & BLOCK_DELETED) + { + if (block_info.block_len < share->base.min_block_length) + { + _ma_check_print_error(param, + "Deleted block with impossible length %lu " + "at %s", + block_info.block_len,llstr(pos,llbuff)); + DBUG_RETURN(1); + } + if ((block_info.next_filepos != HA_OFFSET_ERROR && + block_info.next_filepos >= share->state.state.data_file_length) || + (block_info.prev_filepos != HA_OFFSET_ERROR && + block_info.prev_filepos >= share->state.state.data_file_length)) + { + _ma_check_print_error(param,"Delete link points outside datafile " + "at %s", + llstr(pos,llbuff)); + DBUG_RETURN(1); + } + param->del_blocks++; + param->del_length+= block_info.block_len; + param->splits++; + pos= block_info.filepos+block_info.block_len; + goto next; + } + _ma_check_print_error(param,"Wrong bytesec: %d-%d-%d at linkstart: %s", + block_info.header[0],block_info.header[1], + block_info.header[2], + llstr(start_block,llbuff)); + DBUG_RETURN(1); + } + if (share->state.state.data_file_length < block_info.filepos+ + block_info.block_len) + { + _ma_check_print_error(param, + "Recordlink that points outside datafile at %s", + llstr(pos,llbuff)); + got_error=1; + break; + } + param->splits++; + if (!flag++) /* 
First block */ + { + start_recpos=pos; + pos=block_info.filepos+block_info.block_len; + if (block_info.rec_len > (uint) share->base.max_pack_length) + { + _ma_check_print_error(param,"Found too long record (%lu) at %s", + (ulong) block_info.rec_len, + llstr(start_recpos,llbuff)); + got_error=1; + break; + } + if (share->base.blobs) + { + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + block_info.rec_len + + share->base.extra_rec_buff_size)) + + { + _ma_check_print_error(param, + "Not enough memory (%lu) for blob at %s", + (ulong) block_info.rec_len, + llstr(start_recpos,llbuff)); + got_error=1; + break; + } + } + to= info->rec_buff; + left_length= block_info.rec_len; + } + if (left_length < block_info.data_len) + { + _ma_check_print_error(param,"Found too long record (%lu) at %s", + (ulong) block_info.data_len, + llstr(start_recpos,llbuff)); + got_error=1; + break; + } + if (_ma_read_cache(¶m->read_cache, to, block_info.filepos, + (uint) block_info.data_len, + flag == 1 ? READING_NEXT : 0)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at " + "position: %s", my_errno, + llstr(block_info.filepos, llbuff)); + + DBUG_RETURN(1); + } + to+=block_info.data_len; + param->link_used+= block_info.filepos-start_block; + param->used+= block_info.filepos - start_block + block_info.data_len; + param->empty+= block_info.block_len-block_info.data_len; + left_length-= block_info.data_len; + if (left_length) + { + if (b_type & BLOCK_LAST) + { + _ma_check_print_error(param, + "Wrong record length %s of %s at %s", + llstr(block_info.rec_len-left_length,llbuff), + llstr(block_info.rec_len, llbuff2), + llstr(start_recpos,llbuff3)); + got_error=1; + break; + } + if (share->state.state.data_file_length < block_info.next_filepos) + { + _ma_check_print_error(param, + "Found next-recordlink that points outside " + "datafile at %s", + llstr(block_info.filepos,llbuff)); + got_error=1; + break; + } + } + } while (left_length); + + if (! 
got_error) + { + if (_ma_rec_unpack(info,record,info->rec_buff,block_info.rec_len) == + MY_FILE_ERROR) + { + _ma_check_print_error(param,"Found wrong record at %s", + llstr(start_recpos,llbuff)); + got_error=1; + } + else + { + ha_checksum checksum= 0; + if (share->calc_checksum) + checksum= (*share->calc_checksum)(info, record); + + if (param->testflag & (T_EXTEND | T_MEDIUM | T_VERBOSE)) + { + if (_ma_rec_check(info,record, info->rec_buff,block_info.rec_len, + test(share->calc_checksum), checksum)) + { + _ma_check_print_error(param,"Found wrong packed record at %s", + llstr(start_recpos,llbuff)); + got_error= 1; + } + } + param->glob_crc+= checksum; + } + + if (! got_error) + { + if (check_keys_in_record(param, info, extend, start_recpos, record)) + DBUG_RETURN(1); + } + else + { + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + } + } + else if (!flag) + pos= block_info.filepos+block_info.block_len; +next:; + } + DBUG_RETURN(0); +} + + +static int check_compressed_record(HA_CHECK *param, MARIA_HA *info, int extend, + uchar *record) +{ + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share= info->s; + my_off_t start_recpos, pos; + char llbuff[22]; + my_bool got_error= 0; + DBUG_ENTER("check_compressed_record"); + + pos= share->pack.header_length; /* Skip header */ + while (pos < share->state.state.data_file_length) + { + if (_ma_killed_ptr(param)) + DBUG_RETURN(-1); + + if (_ma_read_cache(¶m->read_cache, block_info.header, pos, + share->pack.ref_length, READING_NEXT)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at position: " + "%s", + my_errno, llstr(pos, llbuff)); + DBUG_RETURN(1); + } + + start_recpos= pos; + param->splits++; + VOID(_ma_pack_get_block_info(info, &info->bit_buff, &block_info, + &info->rec_buff, &info->rec_buff_size, -1, + start_recpos)); + pos=block_info.filepos+block_info.rec_len; + if (block_info.rec_len < (uint) share->min_pack_length || + block_info.rec_len > (uint) 
share->max_pack_length) + { + _ma_check_print_error(param, + "Found block with wrong recordlength: %lu at %s", + block_info.rec_len, llstr(start_recpos,llbuff)); + got_error=1; + goto end; + } + if (_ma_read_cache(¶m->read_cache, info->rec_buff, + block_info.filepos, block_info.rec_len, READING_NEXT)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at position: " + "%s", + my_errno, llstr(block_info.filepos, llbuff)); + DBUG_RETURN(1); + } + if (_ma_pack_rec_unpack(info, &info->bit_buff, record, + info->rec_buff, block_info.rec_len)) + { + _ma_check_print_error(param,"Found wrong record at %s", + llstr(start_recpos,llbuff)); + got_error=1; + goto end; + } + param->glob_crc+= (*share->calc_checksum)(info,record); + param->link_used+= (block_info.filepos - start_recpos); + param->used+= (pos-start_recpos); + +end: + if (! got_error) + { + if (check_keys_in_record(param, info, extend, start_recpos, record)) + DBUG_RETURN(1); + } + else + { + got_error= 0; /* Reset for next loop */ + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +/* + Check if layout on head or tail page is ok + + NOTES + This is for rows-in-block format. 
+*/ + +static int check_page_layout(HA_CHECK *param, MARIA_HA *info, + my_off_t page_pos, uchar *page, + uint row_count, uint head_empty, + uint *real_rows_found, uint *free_slots_found) +{ + uint empty, last_row_end, row, first_dir_entry, free_entry, block_size; + uint free_entries, prev_free_entry; + uchar *dir_entry; + char llbuff[22]; + my_bool error_in_free_list= 0; + DBUG_ENTER("check_page_layout"); + + block_size= info->s->block_size; + empty= 0; + last_row_end= PAGE_HEADER_SIZE; + *real_rows_found= 0; + + /* Check free directory list */ + free_entry= (uint) page[DIR_FREE_OFFSET]; + free_entries= 0; + prev_free_entry= END_OF_DIR_FREE_LIST; + while (free_entry != END_OF_DIR_FREE_LIST) + { + uchar *dir; + if (free_entry > row_count) + { + _ma_check_print_error(param, + "Page %9s: Directory free entry points outside " + "directory", + llstr(page_pos, llbuff)); + error_in_free_list= 1; + break; + } + dir= dir_entry_pos(page, block_size, free_entry); + if (uint2korr(dir) != 0) + { + _ma_check_print_error(param, + "Page %9s: Directory free entry points to " + "not deleted entry", + llstr(page_pos, llbuff)); + error_in_free_list= 1; + break; + } + if (dir[2] != prev_free_entry) + { + _ma_check_print_error(param, + "Page %9s: Directory free list back pointer " + "points to wrong entry", + llstr(page_pos, llbuff)); + error_in_free_list= 1; + break; + } + prev_free_entry= free_entry; + free_entry= dir[3]; + free_entries++; + } + *free_slots_found= free_entries; + + /* Check directry */ + dir_entry= page+ block_size - PAGE_SUFFIX_SIZE; + first_dir_entry= (block_size - row_count * DIR_ENTRY_SIZE - + PAGE_SUFFIX_SIZE); + for (row= 0 ; row < row_count ; row++) + { + uint pos, length; + dir_entry-= DIR_ENTRY_SIZE; + pos= uint2korr(dir_entry); + if (!pos) + { + free_entries--; + if (row == row_count -1) + { + _ma_check_print_error(param, + "Page %9s: First entry in directory is 0", + llstr(page_pos, llbuff)); + if (param->err_count++ > MAXERR || !(param->testflag & 
T_VERBOSE)) + DBUG_RETURN(1); + } + continue; /* Deleted row */ + } + (*real_rows_found)++; + length= uint2korr(dir_entry+2); + param->used+= length; + if (pos < last_row_end) + { + _ma_check_print_error(param, + "Page %9s: Row %3u overlapps with previous row", + llstr(page_pos, llbuff), row); + DBUG_RETURN(1); + } + empty+= (pos - last_row_end); + last_row_end= pos + length; + if (last_row_end > first_dir_entry) + { + _ma_check_print_error(param, + "Page %9s: Row %3u overlapps with directory", + llstr(page_pos, llbuff), row); + DBUG_RETURN(1); + } + } + empty+= (first_dir_entry - last_row_end); + + if (empty != head_empty) + { + _ma_check_print_error(param, + "Page %9s: Wrong empty size. Stored: %5u " + "Actual: %5u", + llstr(page_pos, llbuff), head_empty, empty); + param->err_count++; + } + if (free_entries != 0 && !error_in_free_list) + { + _ma_check_print_error(param, + "Page %9s: Directory free link don't include " + "all free entries", + llstr(page_pos, llbuff)); + param->err_count++; + } + DBUG_RETURN(param->err_count && + (param->err_count >= MAXERR || !(param->testflag & T_VERBOSE))); +} + + +/* + Check all rows on head page + + NOTES + This is for rows-in-block format. 
/*
  Check all rows on head page

  SYNOPSIS
    check_head_page()
    param       Check handler; max_found_trid, glob_crc, full_page_count,
                tail_count and err_count are updated
    info        Table handler; info->cur_row is filled in for each row
    record      Buffer that each row is unpacked into
    extend      Passed on unchanged to check_keys_in_record()
    page_pos    File position of the page
    page_buff   Page contents
    row_count   Number of entries in the page directory

  NOTES
    This is for rows-in-block format.

    Before this, we have already called check_page_layout(), so
    we know the block is logically correct (even if the rows may not be that)

  RETURN
    0  ok
    1  error
*/


static my_bool check_head_page(HA_CHECK *param, MARIA_HA *info, uchar *record,
                               int extend, my_off_t page_pos, uchar *page_buff,
                               uint row_count)
{
  MARIA_SHARE *share= info->s;
  uchar *dir_entry;
  uint row;
  char llbuff[22], llbuff2[22];
  /* Page number; used for row ids and in error messages */
  ulonglong page= page_pos / share->block_size;
  DBUG_ENTER("check_head_page");

  /* The directory is walked backwards from the page suffix */
  dir_entry= page_buff+ share->block_size - PAGE_SUFFIX_SIZE;
  for (row= 0 ; row < row_count ; row++)
  {
    uint pos, length, flag;
    dir_entry-= DIR_ENTRY_SIZE;
    pos= uint2korr(dir_entry);
    if (!pos)
      continue;                                 /* Position 0: skip entry */
    length= uint2korr(dir_entry+2);
    if (length < share->base.min_block_length)
    {
      _ma_check_print_error(param,
                            "Page %9s: Row %3u is too short "
                            "(%d of min %d bytes)",
                            llstr(page, llbuff), row, length,
                            (uint) share->base.min_block_length);
      DBUG_RETURN(1);
    }
    /* First byte of the row holds the row flags */
    flag= (uint) (uchar) page_buff[pos];
    if (flag & ~(ROW_FLAG_ALL))
      _ma_check_print_error(param,
                            "Page %9s: Row %3u has wrong flag: %u",
                            llstr(page, llbuff), row, flag);

    DBUG_PRINT("info", ("rowid: %s  page: %lu  row: %u",
                        llstr(ma_recordpos(page, row), llbuff),
                        (ulong) page, row));
    /* Reset so that a trid left from a previous row is not compared below */
    info->cur_row.trid= 0;
    if (_ma_read_block_record2(info, record, page_buff+pos,
                               page_buff+pos+length))
    {
      _ma_check_print_error(param,
                            "Page %9s: Row %3d is crashed",
                            llstr(page, llbuff), row);
      if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
        DBUG_RETURN(1);
      continue;
    }
    set_if_bigger(param->max_found_trid, info->cur_row.trid);
    if (info->cur_row.trid > param->max_trid)
      _ma_check_print_not_visible_error(param, info->cur_row.trid);

    if (share->calc_checksum)
    {
      ha_checksum checksum= (*share->calc_checksum)(info, record);
      /* Only the low 8 bits of the checksum are compared with the row */
      if (info->cur_row.checksum != (checksum & 255))
        _ma_check_print_error(param, "Page %9s: Row %3d has wrong checksum",
                              llstr(page, llbuff), row);
      param->glob_crc+= checksum;
    }
    if (info->cur_row.extents_count)
    {
      uchar *extents= info->cur_row.extents;
      uint i;
      /* Check that bitmap has the right marker for the found extents */
      for (i= 0 ; i < info->cur_row.extents_count ; i++)
      {
        pgcache_page_no_t extent_page;
        uint page_count, page_type;
        /* Extent entry: 5 byte page number followed by 2 byte page count */
        extent_page= uint5korr(extents);
        page_count= uint2korr(extents+5) & ~START_EXTENT_BIT;
        extents+= ROW_EXTENT_SIZE;
        page_type= BLOB_PAGE;
        if (page_count & TAIL_BIT)
        {
          /* Extent refers to a tail; check exactly one page */
          page_count= 1;
          page_type= TAIL_PAGE;
        }
        /*
          TODO OPTIMIZE:
          Check the whole extent with one test and only do the loop if
          something is wrong (for exact error reporting)
        */
        for ( ; page_count--; extent_page++)
        {
          uint bitmap_pattern;
          if (_ma_check_if_right_bitmap_type(info, page_type, extent_page,
                                             &bitmap_pattern))
          {
            _ma_check_print_error(param,
                                  "Page %9s: Row: %3d has an extent with "
                                  "wrong information in bitmap: "
                                  "Page: %9s Page_type: %d Bitmap: %d",
                                  llstr(page, llbuff), row,
                                  llstr(extent_page, llbuff2),
                                  page_type, bitmap_pattern);
            if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
              DBUG_RETURN(1);
          }
        }
      }
    }
    /*
      Sum the page counts seen through the rows; check_block_record()
      compares them with what it finds while scanning the pages.
    */
    param->full_page_count+= info->cur_row.full_page_count;
    param->tail_count+= info->cur_row.tail_count;
    if (check_keys_in_record(param, info, extend,
                             ma_recordpos(page, row), record))
      DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}


/*
  Check if rows-in-block data file is consistent

  SYNOPSIS
    check_block_record()
    param     Check handler; space statistics and error counters are updated
    info      Table handler
    extend    Passed on unchanged to check_head_page()
    record    Buffer for unpacked rows

  NOTES
    The data file is scanned one block at a time.  Bitmap pages are read
    into the scan bitmap buffer, pages marked empty in the bitmap are
    skipped, and every other page is checked according to its page type.
    After the scan the rest of the last bitmap is verified to be zero and
    the full page / tail counts found by scanning are compared with the
    counts collected from the rows.

  RETURN
    0   ok
    1   error
    -1  check was killed
*/

static int check_block_record(HA_CHECK *param, MARIA_HA *info, int extend,
                              uchar *record)
{
  MARIA_SHARE *share= info->s;
  my_off_t pos;
  pgcache_page_no_t page;
  uchar *page_buff, *bitmap_buff, *data;
  char llbuff[22], llbuff2[22];
  uint block_size= share->block_size;
  ha_rows full_page_count, tail_count;
  my_bool full_dir;
  uint offset_page, offset, free_count;

  LINT_INIT(full_dir);

  if (_ma_scan_init_block_record(info))
  {
    _ma_check_print_error(param, "got error %d when initializing scan",
                          my_errno);
    return 1;
  }
  bitmap_buff= info->scan.bitmap_buff;
  page_buff= info->scan.page_buff;
  full_page_count= tail_count= 0;
  param->full_page_count= param->tail_count= 0;
  param->used= param->link_used= 0;
  param->splits= share->state.state.data_file_length / block_size;

  for (pos= 0, page= 0;
       pos < share->state.state.data_file_length;
       pos+= block_size, page++)
  {
    uint row_count, real_row_count, empty_space, page_type, bitmap_pattern;
    LINT_INIT(row_count);
    LINT_INIT(empty_space);

    if (_ma_killed_ptr(param))
    {
      _ma_scan_end_block_record(info);
      return -1;
    }
    if ((page % share->bitmap.pages_covered) == 0)
    {
      /* Bitmap page */
      if (pagecache_read(share->pagecache,
                         &info->s->bitmap.file,
                         page, 1,
                         bitmap_buff,
                         PAGECACHE_PLAIN_PAGE,
                         PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0)
      {
        _ma_check_print_error(param,
                              "Page %9s:  Got error: %d when reading datafile",
                              llstr(page, llbuff), my_errno);
        goto err;
      }
      /* The whole bitmap page is accounted as used link data */
      param->used+= block_size;
      param->link_used+= block_size;
      continue;
    }
    /* Skip pages marked as empty in bitmap */
    /* Each page has 3 bits in the bitmap; find the bit offset of this page */
    offset_page= (uint) ((page % share->bitmap.pages_covered) -1) * 3;
    offset= offset_page & 7;
    data= bitmap_buff + offset_page / 8;
    bitmap_pattern= uint2korr(data);
    if (!((bitmap_pattern >> offset) & 7))
    {
      param->empty+= block_size;
      param->del_blocks++;
      continue;
    }

    if (pagecache_read(share->pagecache,
                       &info->dfile,
                       page, 1,
                       page_buff,
                       share->page_type,
                       PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0)
    {
      _ma_check_print_error(param,
                            "Page %9s:  Got error: %d when reading datafile",
                            llstr(page, llbuff), my_errno);
      goto err;
    }
    page_type= page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK;
    if (page_type == UNALLOCATED_PAGE || page_type >= MAX_PAGE_TYPE)
    {
      _ma_check_print_error(param,
                            "Page: %9s  Found wrong page type %d",
                            llstr(page, llbuff), page_type);
      if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
        goto err;
      continue;
    }
    switch ((enum en_page_type) page_type) {
    case UNALLOCATED_PAGE:
    case MAX_PAGE_TYPE:
    default:
      DBUG_ASSERT(0);                           /* Impossible */
      break;
    case HEAD_PAGE:
      row_count= page_buff[DIR_COUNT_OFFSET];
      empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET);
      param->used+= block_size - empty_space;
      param->link_used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE +
                          row_count * DIR_ENTRY_SIZE);
      /*
        NOTE(review): presumably free space below this bitmap size limit
        cannot be reused for new rows and is therefore counted as lost
      */
      if (empty_space < share->bitmap.sizes[3])
        param->lost+= empty_space;
      if (check_page_layout(param, info, pos, page_buff, row_count,
                            empty_space, &real_row_count, &free_count))
        goto err;
      /* Directory is full if it is at max size and has no free entries */
      full_dir= (row_count == MAX_ROWS_PER_PAGE &&
                 page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST);
      break;
    case TAIL_PAGE:
      row_count= page_buff[DIR_COUNT_OFFSET];
      empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET);
      param->used+= block_size - empty_space;
      param->link_used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE +
                          row_count * DIR_ENTRY_SIZE);
      if (empty_space < share->bitmap.sizes[6])
        param->lost+= empty_space;
      if (check_page_layout(param, info, pos, page_buff, row_count,
                            empty_space, &real_row_count, &free_count))
        goto err;
      full_dir= (row_count - free_count >= MAX_ROWS_PER_PAGE -
                 share->base.blobs);
      break;
    case BLOB_PAGE:
      full_page_count++;
      full_dir= 0;
      empty_space= block_size;                  /* for error reporting */
      param->link_used+= (LSN_SIZE + PAGE_TYPE_SIZE);
      param->used+= block_size;
      break;
    }
    /* A page with a full directory is checked as having no empty space */
    if (_ma_check_bitmap_data(info, page_type, page,
                              full_dir ? 0 : empty_space,
                              &bitmap_pattern))
    {
      if (bitmap_pattern == ~(uint) 0)
        _ma_check_print_error(param,
                              "Page %9s: Wrong bitmap for data on page",
                              llstr(page, llbuff));
      else
        _ma_check_print_error(param,
                              "Page %9s:  Wrong data in bitmap.  Page_type: "
                              "%d  full: %d  empty_space: %u  Bitmap-bits: %d",
                              llstr(page, llbuff), page_type, full_dir,
                              empty_space, bitmap_pattern);
      if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
        goto err;
    }
    if ((enum en_page_type) page_type == BLOB_PAGE)
      continue;
    param->empty+= empty_space;
    if ((enum en_page_type) page_type == TAIL_PAGE)
    {
      tail_count+= real_row_count;
      continue;
    }
    /* Only head pages get here; check each row on the page */
    if (check_head_page(param, info, record, extend, pos, page_buff,
                        row_count))
      goto err;
  }

  /* Verify that rest of bitmap is zero */

  if (page % share->bitmap.pages_covered)
  {
    /* Not at end of bitmap */
    uint bitmap_pattern;
    offset_page= (uint) ((page % share->bitmap.pages_covered) -1) * 3;
    offset= offset_page & 7;
    data= bitmap_buff + offset_page / 8;
    bitmap_pattern= uint2korr(data);
    if (((bitmap_pattern >> offset)) ||
        (data + 2 < bitmap_buff + share->bitmap.total_size &&
         _ma_check_if_zero(data+2, bitmap_buff + share->bitmap.total_size -
                           data - 2)))
    {
      ulonglong bitmap_page;
      /* Round down to the page number of the bitmap covering 'page' */
      bitmap_page= page / share->bitmap.pages_covered;
      bitmap_page*= share->bitmap.pages_covered;

      _ma_check_print_error(param,
                            "Bitmap at page %s has pages reserved outside of "
                            "data file length",
                            llstr(bitmap_page, llbuff));
      DBUG_EXECUTE("bitmap", _ma_print_bitmap(&share->bitmap, bitmap_buff,
                                              bitmap_page););
    }
  }

  _ma_scan_end_block_record(info);

  if (full_page_count != param->full_page_count)
    _ma_check_print_error(param, "Full page count read through records was %s "
                          "but we found %s pages while scanning table",
                          llstr(param->full_page_count, llbuff),
                          llstr(full_page_count, llbuff2));
  if (tail_count != param->tail_count)
    _ma_check_print_error(param, "Tail count read through records was %s but "
                          "we found %s tails while scanning table",
                          llstr(param->tail_count, llbuff),
                          llstr(tail_count, llbuff2));

  return param->error_printed != 0;

err:
  _ma_scan_end_block_record(info);
  return 1;
}


record-link is ok */ + +int maria_chk_data_link(HA_CHECK *param, MARIA_HA *info, my_bool extend) +{ + MARIA_SHARE *share= info->s; + int error; + uchar *record; + char llbuff[22],llbuff2[22],llbuff3[22]; + DBUG_ENTER("maria_chk_data_link"); + + if (!(param->testflag & T_SILENT)) + { + if (extend) + puts("- check records and index references"); + else + puts("- check record links"); + } + + if (!(record= (uchar*) my_malloc(share->base.default_rec_buff_size, MYF(0)))) + { + _ma_check_print_error(param,"Not enough memory for record"); + DBUG_RETURN(-1); + } + param->records= param->del_blocks= 0; + param->used= param->link_used= param->splits= param->del_length= 0; + param->lost= 0; + param->tmp_record_checksum= param->glob_crc= 0; + param->err_count= 0; + + error= 0; + param->empty= share->pack.header_length; + + bzero((char*) param->tmp_key_crc, + share->base.keys * sizeof(param->tmp_key_crc[0])); + + switch (share->data_file_type) { + case BLOCK_RECORD: + error= check_block_record(param, info, extend, record); + break; + case STATIC_RECORD: + error= check_static_record(param, info, extend, record); + break; + case DYNAMIC_RECORD: + error= check_dynamic_record(param, info, extend, record); + break; + case COMPRESSED_RECORD: + error= check_compressed_record(param, info, extend, record); + break; + } /* switch */ + + if (error) + goto err; + + if (param->testflag & T_WRITE_LOOP) + { + VOID(fputs(" \r",stdout)); VOID(fflush(stdout)); + } + if (param->records != share->state.state.records) + { + _ma_check_print_error(param, + "Record-count is not ok; found %-10s Should be: %s", + llstr(param->records,llbuff), + llstr(share->state.state.records,llbuff2)); + error=1; + } + else if (param->record_checksum && + param->record_checksum != param->tmp_record_checksum) + { + _ma_check_print_error(param, + "Key pointers and record positions doesn't match"); + error=1; + } + else if (param->glob_crc != share->state.state.checksum && + (share->options & + (HA_OPTION_CHECKSUM | 
HA_OPTION_COMPRESS_RECORD))) + { + _ma_check_print_warning(param, + "Record checksum is not the same as checksum " + "stored in the index file"); + error=1; + } + else if (!extend) + { + uint key; + for (key=0 ; key < share->base.keys; key++) + { + if (param->tmp_key_crc[key] != param->key_crc[key] && + !(share->keyinfo[key].flag & + (HA_FULLTEXT | HA_SPATIAL | HA_RTREE_INDEX))) + { + _ma_check_print_error(param,"Checksum for key: %2d doesn't match " + "checksum for records", + key+1); + error=1; + } + } + } + + if (param->del_length != share->state.state.empty) + { + _ma_check_print_warning(param, + "Found %s deleted space. Should be %s", + llstr(param->del_length,llbuff2), + llstr(share->state.state.empty,llbuff)); + } + /* Skip following checks for BLOCK RECORD as they don't make any sence */ + if (share->data_file_type != BLOCK_RECORD) + { + if (param->used + param->empty + param->del_length != + share->state.state.data_file_length) + { + _ma_check_print_warning(param, + "Found %s record data and %s unused data and %s " + "deleted data", + llstr(param->used, llbuff), + llstr(param->empty,llbuff2), + llstr(param->del_length,llbuff3)); + _ma_check_print_warning(param, + "Total %s Should be: %s", + llstr((param->used+param->empty + + param->del_length), llbuff), + llstr(share->state.state.data_file_length, + llbuff2)); + } + if (param->del_blocks != share->state.state.del) + { + _ma_check_print_warning(param, + "Found %10s deleted blocks. Should be: %s", + llstr(param->del_blocks,llbuff), + llstr(share->state.state.del,llbuff2)); + } + if (param->splits != share->state.split) + { + _ma_check_print_warning(param, + "Found %10s parts. Should be: %s", + llstr(param->splits, llbuff), + llstr(share->state.split,llbuff2)); + } + } + if (param->testflag & T_INFO) + { + if (param->warning_printed || param->error_printed) + puts(""); + if (param->used != 0 && ! 
param->error_printed) + { + if (param->records) + { + printf("Records:%18s M.recordlength:%9lu Packed:%14.0f%%\n", + llstr(param->records,llbuff), + (long)((param->used - param->link_used)/param->records), + (share->base.blobs ? 0.0 : + (ulonglong2double((ulonglong) share->base.reclength * + param->records)- + my_off_t2double(param->used))/ + ulonglong2double((ulonglong) share->base.reclength * + param->records)*100.0)); + printf("Recordspace used:%9.0f%% Empty space:%12d%% " + "Blocks/Record: %6.2f\n", + (ulonglong2double(param->used - param->link_used)/ + ulonglong2double(param->used-param->link_used+param->empty) * + 100.0), + (!param->records ? 100 : + (int) (ulonglong2double(param->del_length+param->empty)/ + my_off_t2double(param->used)*100.0)), + ulonglong2double(param->splits - param->del_blocks) / + param->records); + } + else + printf("Records:%18s\n", "0"); + } + printf("Record blocks:%12s Delete blocks:%10s\n", + llstr(param->splits - param->del_blocks, llbuff), + llstr(param->del_blocks, llbuff2)); + printf("Record data: %12s Deleted data: %10s\n", + llstr(param->used - param->link_used,llbuff), + llstr(param->del_length, llbuff2)); + printf("Empty space: %12s Linkdata: %10s\n", + llstr(param->empty, llbuff),llstr(param->link_used, llbuff2)); + if (param->lost) + printf("Lost space: %12s", llstr(param->lost, llbuff)); + if (param->max_found_trid) + { + printf("Max trans. id: %11s\n", + llstr(param->max_found_trid, llbuff)); + } + } + my_free(record,MYF(0)); + DBUG_RETURN (error); + +err: + my_free(record,MYF(0)); + param->testflag|=T_RETRY_WITHOUT_QUICK; + DBUG_RETURN(1); +} /* maria_chk_data_link */ + + +/** + Prepares a table for a repair or index sort: flushes pages, records durably + in the table that it is undergoing the operation (if that op crashes, that + info will serve for Recovery and the user). + + If we start overwriting the index file, and crash then, old REDOs will + be tried and fail. 
To prevent that, we bump skip_redo_lsn, and thus we have + to flush and sync pages so that old REDOs can be skipped. + If this is not a bulk insert, which Recovery can handle gracefully (by + truncating files, see UNDO_BULK_INSERT) we also mark the table + crashed-on-repair, so that user knows it has to re-repair. If bulk insert we + shouldn't mark it crashed-on-repair, because if we did this, the UNDO phase + would skip the table (UNDO_BULK_INSERT would not be applied), + and maria_chk would not improve that. + If this is an OPTIMIZE which merely sorts index, we need to do the same + too: old REDOs should not apply to the new index file. + Only the flush is needed when in maria_chk which is not crash-safe. + + @param info table + @param param repair parameters + @param discard_index if index pages can be thrown away +*/ + +static my_bool protect_against_repair_crash(MARIA_HA *info, + const HA_CHECK *param, + my_bool discard_index) +{ + MARIA_SHARE *share= info->s; + + /* + There are other than recovery-related reasons to do the writes below: + - the physical size of the data file is sometimes used during repair: we + need to flush to have it exact + - we flush the state because maria_open(HA_OPEN_COPY) will want to read + it from disk. + */ + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_FORCE_WRITE, + discard_index ? 
FLUSH_IGNORE_CHANGED : + FLUSH_FORCE_WRITE) || + (share->changed && + _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO | + MA_STATE_INFO_WRITE_LOCK))) + return TRUE; + /* In maria_chk this is not needed: */ + if (maria_multi_threaded && share->base.born_transactional) + { + if ((param->testflag & T_NO_CREATE_RENAME_LSN) == 0) + { + /* this can be true only for a transactional table */ + maria_mark_in_repair(info); + if (_ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_LOCK)) + return TRUE; + } + if (translog_status == TRANSLOG_OK && + _ma_update_state_lsns(share, translog_get_horizon(), + share->state.create_trid, FALSE, FALSE)) + return TRUE; + if (_ma_sync_table_files(info)) + return TRUE; + } + return FALSE; +} + + +/** + @brief Initialize variables for repair +*/ + +static int initialize_variables_for_repair(HA_CHECK *param, + MARIA_SORT_INFO *sort_info, + MARIA_SORT_PARAM *sort_param, + MARIA_HA *info, + my_bool rep_quick, + MARIA_SHARE *org_share) +{ + MARIA_SHARE *share= info->s; + + /* Ro allow us to restore state and check how state changed */ + memcpy(org_share, share, sizeof(*share)); + + /* Repair code relies on share->state.state so we have to update it here */ + if (share->lock.update_status) + (*share->lock.update_status)(info); + + bzero((char*) sort_info, sizeof(*sort_info)); + bzero((char*) sort_param, sizeof(*sort_param)); + + param->testflag|= T_REP; /* for easy checking */ + if (share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)) + param->testflag|= T_CALC_CHECKSUM; + param->glob_crc= 0; + if (rep_quick) + param->testflag|= T_QUICK; + else + param->testflag&= ~T_QUICK; + param->org_key_map= share->state.key_map; + + sort_param->sort_info= sort_info; + sort_param->fix_datafile= ! 
rep_quick; + sort_param->calc_checksum= test(param->testflag & T_CALC_CHECKSUM); + sort_info->info= sort_info->new_info= info; + sort_info->param= param; + set_data_file_type(sort_info, info->s); + sort_info->org_data_file_type= share->data_file_type; + + bzero(&info->rec_cache, sizeof(info->rec_cache)); + info->rec_cache.file= info->dfile.file; + info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + if (protect_against_repair_crash(info, param, !test(param->testflag & + T_CREATE_MISSING_KEYS))) + return 1; + + /* calculate max_records */ + sort_info->filelength= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)); + if ((param->testflag & T_CREATE_MISSING_KEYS) || + sort_info->org_data_file_type == COMPRESSED_RECORD) + sort_info->max_records= share->state.state.records; + else + { + ulong rec_length; + rec_length= max(share->base.min_pack_length, + share->base.min_block_length); + sort_info->max_records= (ha_rows) (sort_info->filelength / rec_length); + } + + /* Set up transaction handler so that we can see all rows */ + if (param->max_trid == 0) + { + if (!ma_control_file_inited()) + param->max_trid= 0; /* Give warning for first trid found */ + else + param->max_trid= max_trid_in_system(); + } + maria_ignore_trids(info); + /* Don't write transid's during repair */ + maria_versioning(info, 0); + return 0; +} + + +/* + During initialize_variables_for_repair and related functions we set some + variables to values that makes sence during repair. + This function restores these values to their original values so that we can + use the handler in MariaDB without having to close and open the table. 
+*/ + +static void restore_table_state_after_repair(MARIA_HA *info, + MARIA_SHARE *org_share) +{ + maria_versioning(info, info->s->have_versioning); + info->s->lock_key_trees= org_share->lock_key_trees; +} + + + + +/** + @brief Drop all indexes + + @param[in] param check parameters + @param[in] info MARIA_HA handle + @param[in] force if to force drop all indexes + + @return status + @retval 0 OK + @retval != 0 Error + + @note + Once allocated, index blocks remain part of the key file forever. + When indexes are disabled, no block is freed. When enabling indexes, + no block is freed either. The new indexes are create from new + blocks. (Bug #4692) + + Before recreating formerly disabled indexes, the unused blocks + must be freed. There are two options to do this: + - Follow the tree of disabled indexes, add all blocks to the + deleted blocks chain. Would require a lot of random I/O. + - Drop all blocks by clearing all index root pointers and all + delete chain pointers and resetting key_file_length to the end + of the index file header. This requires to recreate all indexes, + even those that may still be intact. + The second method is probably faster in most cases. + + When disabling indexes, MySQL disables either all indexes or all + non-unique indexes. When MySQL [re-]enables disabled indexes + (T_CREATE_MISSING_KEYS), then we either have "lost" blocks in the + index file, or there are no non-unique indexes. In the latter case, + maria_repair*() would not be called as there would be no disabled + indexes. + + If there would be more unique indexes than disabled (non-unique) + indexes, we could do the first method. But this is not implemented + yet. By now we drop and recreate all indexes when repair is called. + + However, there is an exception. Sometimes MySQL disables non-unique + indexes when the table is empty (e.g. when copying a table in + mysql_alter_table()). When enabling the non-unique indexes, they + are still empty. 
So there is no index block that can be lost. This + optimization is implemented in this function. + + Note that in normal repair (T_CREATE_MISSING_KEYS not set) we + recreate all enabled indexes unconditonally. We do not change the + key_map. Otherwise we invert the key map temporarily (outside of + this function) and recreate the then "seemingly" enabled indexes. + When we cannot use the optimization, and drop all indexes, we + pretend that all indexes were disabled. By the inversion, we will + then recrate all indexes. +*/ + +static int maria_drop_all_indexes(HA_CHECK *param, MARIA_HA *info, + my_bool force) +{ + MARIA_SHARE *share= info->s; + MARIA_STATE_INFO *state= &share->state; + uint i; + DBUG_ENTER("maria_drop_all_indexes"); + + /* + If any of the disabled indexes has a key block assigned, we must + drop and recreate all indexes to avoid losing index blocks. + + If we want to recreate disabled indexes only _and_ all of these + indexes are empty, we don't need to recreate the existing indexes. + */ + if (!force && (param->testflag & T_CREATE_MISSING_KEYS)) + { + DBUG_PRINT("repair", ("creating missing indexes")); + for (i= 0; i < share->base.keys; i++) + { + DBUG_PRINT("repair", ("index #: %u key_root: 0x%lx active: %d", + i, (long) state->key_root[i], + maria_is_key_active(state->key_map, i))); + if ((state->key_root[i] != HA_OFFSET_ERROR) && + !maria_is_key_active(state->key_map, i)) + { + /* + This index has at least one key block and it is disabled. + We would lose its block(s) if would just recreate it. + So we need to drop and recreate all indexes. + */ + DBUG_PRINT("repair", ("nonempty and disabled: recreate all")); + break; + } + } + if (i >= share->base.keys) + goto end; + + /* + We do now drop all indexes and declare them disabled. With the + T_CREATE_MISSING_KEYS flag, maria_repair*() will recreate all + disabled indexes and enable them. 
+ */ + maria_clear_all_keys_active(state->key_map); + DBUG_PRINT("repair", ("declared all indexes disabled")); + } + + /* Clear index root block pointers. */ + for (i= 0; i < share->base.keys; i++) + state->key_root[i]= HA_OFFSET_ERROR; + + /* Drop the delete chain. */ + share->state.key_del= HA_OFFSET_ERROR; + + /* Reset index file length to end of index file header. */ + share->state.state.key_file_length= share->base.keystart; + +end: + DBUG_RETURN(0); +} + + +/* + Recover old table by reading each record and writing all keys + + NOTES + Save new datafile-name in temp_filename. + We overwrite the index file as we go (writekeys() for example), so if we + crash during this the table is unusable and user (or Recovery in the + future) must repeat the REPAIR/OPTIMIZE operation. We could use a + temporary index file in the future (drawback: more disk space). + + IMPLEMENTATION (for hard repair with block format) + - Create new, unrelated MARIA_HA of the table + - Create new datafile and associate it with new handler + - Reset all statistic information in new handler + - Copy all data to new handler with normal write operations + - Move state of new handler to old handler + - Close new handler + - Close data file in old handler + - Rename old data file to new data file. + - Reopen data file in old handler +*/ + +int maria_repair(HA_CHECK *param, register MARIA_HA *info, + char *name, my_bool rep_quick) +{ + int error, got_error; + ha_rows start_records,new_header_length; + my_off_t del; + File new_file; + MARIA_SHARE *share= info->s; + char llbuff[22],llbuff2[22]; + MARIA_SORT_INFO sort_info; + MARIA_SORT_PARAM sort_param; + my_bool block_record, scan_inited= 0, reenable_logging= 0; + enum data_file_type org_data_file_type= share->data_file_type; + myf sync_dir= ((share->now_transactional && !share->temporary) ? 
+ MY_SYNC_DIR : 0); + MARIA_SHARE backup_share; + DBUG_ENTER("maria_repair"); + + got_error= 1; + new_file= -1; + start_records= share->state.state.records; + if (!(param->testflag & T_SILENT)) + { + printf("- recovering (with keycache) Aria-table '%s'\n",name); + printf("Data records: %s\n", llstr(start_records, llbuff)); + } + + if (initialize_variables_for_repair(param, &sort_info, &sort_param, info, + rep_quick, &backup_share)) + goto err; + + if ((reenable_logging= share->now_transactional)) + _ma_tmp_disable_logging_for_table(info, 0); + + sort_param.current_filepos= sort_param.filepos= new_header_length= + ((param->testflag & T_UNPACK) ? 0L : share->pack.header_length); + + if (!rep_quick) + { + /* Get real path for data file */ + if ((new_file= my_create(fn_format(param->temp_filename, + share->data_file_name.str, "", + DATA_TMP_EXT, 2+4), + 0,param->tmpfile_createflag, + MYF(0))) < 0) + { + _ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + goto err; + } + if (new_header_length && + maria_filecopy(param, new_file, info->dfile.file, 0L, + new_header_length, "datafile-header")) + goto err; + share->state.dellink= HA_OFFSET_ERROR; + info->rec_cache.file= new_file; /* For sort_delete_record */ + if (share->data_file_type == BLOCK_RECORD || + (param->testflag & T_UNPACK)) + { + if (create_new_data_handle(&sort_param, new_file)) + goto err; + sort_info.new_info->rec_cache.file= new_file; + } + } + + block_record= sort_info.new_info->s->data_file_type == BLOCK_RECORD; + + if (org_data_file_type != BLOCK_RECORD) + { + /* We need a read buffer to read rows in big blocks */ + if (init_io_cache(¶m->read_cache, info->dfile.file, + (uint) param->read_buffer_length, + READ_CACHE, share->pack.header_length, 1, MYF(MY_WME))) + goto err; + } + if (sort_info.new_info->s->data_file_type != BLOCK_RECORD) + { + /* When writing to not block records, we need a write buffer */ + if (!rep_quick) + { + if 
(init_io_cache(&sort_info.new_info->rec_cache, new_file, + (uint) param->write_buffer_length, + WRITE_CACHE, new_header_length, 1, + MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw)) + goto err; + sort_info.new_info->opt_flag|=WRITE_CACHE_USED; + } + } + else if (block_record) + { + scan_inited= 1; + if (maria_scan_init(sort_info.info)) + goto err; + } + + if (!(sort_param.record= + (uchar *) my_malloc((uint) + share->base.default_rec_buff_size, MYF(0))) || + _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size, + share->base.default_rec_buff_size)) + { + _ma_check_print_error(param, "Not enough memory for extra record"); + goto err; + } + + sort_param.read_cache=param->read_cache; + sort_param.pos=sort_param.max_pos=share->pack.header_length; + param->read_cache.end_of_file= sort_info.filelength; + sort_param.master=1; + sort_info.max_records= ~(ha_rows) 0; + + del= share->state.state.del; + share->state.state.records= share->state.state.del= share->state.split= 0; + share->state.state.empty= 0; + + if (param->testflag & T_CREATE_MISSING_KEYS) + maria_set_all_keys_active(share->state.key_map, share->base.keys); + maria_drop_all_indexes(param, info, TRUE); + + maria_lock_memory(param); /* Everything is alloced */ + + /* Re-create all keys, which are set in key_map. 
*/ + while (!(error=sort_get_next_record(&sort_param))) + { + if (block_record && _ma_sort_write_record(&sort_param)) + goto err; + + if (writekeys(&sort_param)) + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY) + goto err; + DBUG_DUMP("record", sort_param.record, + share->base.default_rec_buff_size); + _ma_check_print_warning(param, + "Duplicate key %2d for record at %10s against " + "new record at %10s", + info->errkey+1, + llstr(sort_param.current_filepos, llbuff), + llstr(info->dup_key_pos,llbuff2)); + if (param->testflag & T_VERBOSE) + { + MARIA_KEY tmp_key; + MARIA_KEYDEF *keyinfo= share->keyinfo + info->errkey; + (*keyinfo->make_key)(info, &tmp_key, (uint) info->errkey, + info->lastkey_buff, + sort_param.record, 0L, 0); + _ma_print_key(stdout, &tmp_key); + } + sort_info.dupp++; + if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK) + { + param->testflag|=T_RETRY_WITHOUT_QUICK; + param->error_printed=1; + goto err; + } + /* purecov: begin tested */ + if (block_record) + { + sort_info.new_info->s->state.state.records--; + if ((*sort_info.new_info->s->write_record_abort)(sort_info.new_info)) + { + _ma_check_print_error(param,"Couldn't delete duplicate row"); + goto err; + } + } + /* purecov: end */ + continue; + } + if (!block_record) + { + if (_ma_sort_write_record(&sort_param)) + goto err; + /* Filepos is pointer to where next row will be stored */ + sort_param.current_filepos= sort_param.filepos; + } + } + if (error > 0 || maria_write_data_suffix(&sort_info, !rep_quick) || + flush_io_cache(&sort_info.new_info->rec_cache) || + param->read_cache.error < 0) + goto err; + + if (param->testflag & T_WRITE_LOOP) + { + VOID(fputs(" \r",stdout)); VOID(fflush(stdout)); + } + if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0))) + { + _ma_check_print_warning(param, + "Can't change size of indexfile, error: %d", + my_errno); + goto err; + } + + if (rep_quick && del+sort_info.dupp != share->state.state.del) + { + 
_ma_check_print_error(param,"Couldn't fix table with quick recovery: " + "Found wrong number of deleted records"); + _ma_check_print_error(param,"Run recovery again without -q"); + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + goto err; + } + + if (param->testflag & T_SAFE_REPAIR) + { + /* Don't repair if we loosed more than one row */ + if (sort_info.new_info->s->state.state.records+1 < start_records) + { + share->state.state.records= start_records; + goto err; + } + } + + VOID(end_io_cache(&sort_info.new_info->rec_cache)); + info->opt_flag&= ~WRITE_CACHE_USED; + + /* + As we have read the data file (sort_get_next_record()) we may have + cached, non-changed blocks of it in the page cache. We must throw them + away as we are going to close their descriptor ('new_file'). We also want + to flush any index block, so that it is ready for the upcoming sync. + */ + if (_ma_flush_table_files_before_swap(param, info)) + goto err; + + if (!rep_quick) + { + sort_info.new_info->s->state.state.data_file_length= sort_param.filepos; + if (sort_info.new_info != sort_info.info) + { + MARIA_STATE_INFO save_state= sort_info.new_info->s->state; + if (maria_close(sort_info.new_info)) + { + _ma_check_print_error(param, "Got error %d on close", my_errno); + goto err; + } + copy_data_file_state(&share->state, &save_state); + new_file= -1; + sort_info.new_info= info; + } + share->state.version=(ulong) time((time_t*) 0); /* Force reopen */ + + /* Replace the actual file with the temporary file */ + if (new_file >= 0) + my_close(new_file, MYF(MY_WME)); + new_file= -1; + change_data_file_descriptor(info, -1); + if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT, + DATA_TMP_EXT, + (param->testflag & T_BACKUP_DATA ? 
+ MYF(MY_REDEL_MAKE_BACKUP): MYF(0)) | + sync_dir) || + _ma_open_datafile(info, share, NullS, -1)) + { + goto err; + } + } + else + { + share->state.state.data_file_length= sort_param.max_pos; + } + if (param->testflag & T_CALC_CHECKSUM) + share->state.state.checksum= param->glob_crc; + + if (!(param->testflag & T_SILENT)) + { + if (start_records != share->state.state.records) + printf("Data records: %s\n", llstr(share->state.state.records,llbuff)); + } + if (sort_info.dupp) + _ma_check_print_warning(param, + "%s records have been removed", + llstr(sort_info.dupp,llbuff)); + + got_error= 0; + /* If invoked by external program that uses thr_lock */ + if (&share->state.state != info->state) + *info->state= *info->state_start= share->state.state; + +err: + if (scan_inited) + maria_scan_end(sort_info.info); + _ma_reset_state(info); + + VOID(end_io_cache(&param->read_cache)); + VOID(end_io_cache(&sort_info.new_info->rec_cache)); + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + /* this below could fail, shouldn't we detect error? */ + if (got_error) + { + if (!
param->error_printed) + _ma_check_print_error(param,"%d for record at pos %s",my_errno, + llstr(sort_param.start_recpos,llbuff)); + (void)_ma_flush_table_files_before_swap(param, info); + if (sort_info.new_info && sort_info.new_info != sort_info.info) + { + unuse_data_file_descriptor(sort_info.new_info); + maria_close(sort_info.new_info); + } + if (new_file >= 0) + { + VOID(my_close(new_file,MYF(0))); + VOID(my_delete(param->temp_filename, MYF(MY_WME))); + } + maria_mark_crashed_on_repair(info); + } + /* If caller had disabled logging it's not up to us to re-enable it */ + if (reenable_logging) + _ma_reenable_logging_for_table(info, FALSE); + restore_table_state_after_repair(info, &backup_share); + + my_free(sort_param.rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR)); + my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR)); + if (!got_error && (param->testflag & T_UNPACK)) + restore_data_file_type(share); + share->state.changed|= (STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES | + STATE_NOT_ANALYZED | STATE_NOT_ZEROFILLED); + if (!rep_quick) + share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_MOVABLE); + DBUG_RETURN(got_error); +} + + +/* Uppdate keyfile when doing repair */ + +static int writekeys(MARIA_SORT_PARAM *sort_param) +{ + uint i; + MARIA_HA *info= sort_param->sort_info->info; + MARIA_SHARE *share= info->s; + uchar *record= sort_param->record; + uchar *key_buff; + my_off_t filepos= sort_param->current_filepos; + MARIA_KEY key; + DBUG_ENTER("writekeys"); + + key_buff= info->lastkey_buff+share->base.max_key_length; + + for (i=0 ; i < share->base.keys ; i++) + { + if (maria_is_key_active(share->state.key_map, i)) + { + if (share->keyinfo[i].flag & HA_FULLTEXT ) + { + if (_ma_ft_add(info, i, key_buff, record, filepos)) + goto err; + } + else + { + if (!(*share->keyinfo[i].make_key)(info, &key, i, key_buff, record, + filepos, 0)) + goto err; + if ((*share->keyinfo[i].ck_insert)(info, &key)) + goto err; + } + } + } 
+ DBUG_RETURN(0); + + err: + if (my_errno == HA_ERR_FOUND_DUPP_KEY) + { + info->errkey=(int) i; /* This key was found */ + while ( i-- > 0 ) + { + if (maria_is_key_active(share->state.key_map, i)) + { + if (share->keyinfo[i].flag & HA_FULLTEXT) + { + if (_ma_ft_del(info,i,key_buff,record,filepos)) + break; + } + else + { + (*share->keyinfo[i].make_key)(info, &key, i, key_buff, record, + filepos, 0); + if (_ma_ck_delete(info, &key)) + break; + } + } + } + } + /* Remove checksum that was added to glob_crc in sort_get_next_record */ + if (sort_param->calc_checksum) + sort_param->sort_info->param->glob_crc-= info->cur_row.checksum; + DBUG_PRINT("error",("errno: %d",my_errno)); + DBUG_RETURN(-1); +} /* writekeys */ + + + /* Change all key-pointers that points to a records */ + +int maria_movepoint(register MARIA_HA *info, uchar *record, + MARIA_RECORD_POS oldpos, MARIA_RECORD_POS newpos, + uint prot_key) +{ + uint i; + uchar *key_buff; + MARIA_SHARE *share= info->s; + MARIA_PAGE page; + DBUG_ENTER("maria_movepoint"); + + key_buff= info->lastkey_buff + share->base.max_key_length; + for (i=0 ; i < share->base.keys; i++) + { + if (i != prot_key && maria_is_key_active(share->state.key_map, i)) + { + MARIA_KEY key; + (*share->keyinfo[i].make_key)(info, &key, i, key_buff, record, oldpos, + 0); + if (key.keyinfo->flag & HA_NOSAME) + { /* Change pointer direct */ + MARIA_KEYDEF *keyinfo; + keyinfo=share->keyinfo+i; + if (_ma_search(info, &key, (uint32) (SEARCH_SAME | SEARCH_SAVE_BUFF), + share->state.key_root[i])) + DBUG_RETURN(-1); + _ma_page_setup(&page, info, keyinfo, info->last_keypage, + info->keyread_buff); + + _ma_dpointer(share, info->int_keypos - page.node - + share->rec_reflength,newpos); + + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS)) + DBUG_RETURN(-1); + } + else + { /* Change old key to new */ + if (_ma_ck_delete(info, &key)) + DBUG_RETURN(-1); + (*share->keyinfo[i].make_key)(info, &key, i, key_buff, record, newpos, + 0); + if 
(_ma_ck_write(info, &key)) + DBUG_RETURN(-1); + } + } + } + DBUG_RETURN(0); +} /* maria_movepoint */ + + + /* Tell system that we want all memory for our cache */ + +void maria_lock_memory(HA_CHECK *param __attribute__((unused))) +{ +#ifdef SUN_OS /* Key-cacheing thrases on sun 4.1 */ + if (param->opt_maria_lock_memory) + { + int success = mlockall(MCL_CURRENT); /* or plock(DATLOCK); */ + if (geteuid() == 0 && success != 0) + _ma_check_print_warning(param, + "Failed to lock memory. errno %d",my_errno); + } +#endif +} /* maria_lock_memory */ + + +/** + Flush all changed blocks to disk. + + We release blocks as it's unlikely that they would all be needed soon. + This function needs to be called before swapping data or index files or + syncing them. + + @param param description of the repair operation + @param info table +*/ + +static my_bool _ma_flush_table_files_before_swap(HA_CHECK *param, + MARIA_HA *info) +{ + DBUG_ENTER("_ma_flush_table_files_before_swap"); + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_RELEASE, FLUSH_RELEASE)) + { + _ma_check_print_error(param, "%d when trying to write buffers", my_errno); + DBUG_RETURN(TRUE); + } + DBUG_RETURN(FALSE); +} + + + /* Sort index for more efficent reads */ + +int maria_sort_index(HA_CHECK *param, register MARIA_HA *info, char *name) +{ + reg2 uint key; + reg1 MARIA_KEYDEF *keyinfo; + File new_file; + my_off_t index_pos[HA_MAX_POSSIBLE_KEY]; + uint r_locks,w_locks; + int old_lock; + MARIA_SHARE *share= info->s; + MARIA_STATE_INFO old_state; + myf sync_dir= ((share->now_transactional && !share->temporary) ? 
+ MY_SYNC_DIR : 0); + DBUG_ENTER("maria_sort_index"); + + /* cannot sort index files with R-tree indexes */ + for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ; + key++,keyinfo++) + if (keyinfo->key_alg == HA_KEY_ALG_RTREE) + DBUG_RETURN(0); + + if (!(param->testflag & T_SILENT)) + printf("- Sorting index for Aria-table '%s'\n",name); + + if (protect_against_repair_crash(info, param, FALSE)) + DBUG_RETURN(1); + + /* Get real path for index file */ + fn_format(param->temp_filename,name,"", MARIA_NAME_IEXT,2+4+32); + if ((new_file=my_create(fn_format(param->temp_filename,param->temp_filename, + "", INDEX_TMP_EXT,2+4), + 0,param->tmpfile_createflag,MYF(0))) <= 0) + { + _ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + DBUG_RETURN(-1); + } + if (maria_filecopy(param, new_file, share->kfile.file, 0L, + (ulong) share->base.keystart, "headerblock")) + goto err; + + param->new_file_pos=share->base.keystart; + for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ; + key++,keyinfo++) + { + if (! 
maria_is_key_active(share->state.key_map, key)) + continue; + + if (share->state.key_root[key] != HA_OFFSET_ERROR) + { + index_pos[key]=param->new_file_pos; /* Write first block here */ + if (sort_one_index(param,info,keyinfo,share->state.key_root[key], + new_file)) + goto err; + } + else + index_pos[key]= HA_OFFSET_ERROR; /* No blocks */ + } + + /* Flush key cache for this file if we are calling this outside maria_chk */ + flush_pagecache_blocks(share->pagecache, &share->kfile, + FLUSH_IGNORE_CHANGED); + + share->state.version=(ulong) time((time_t*) 0); + old_state= share->state; /* save state if not stored */ + r_locks= share->r_locks; + w_locks= share->w_locks; + old_lock= info->lock_type; + + /* Put same locks as old file */ + share->r_locks= share->w_locks= share->tot_locks= 0; + (void) _ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE); + pthread_mutex_lock(&share->intern_lock); + VOID(my_close(share->kfile.file, MYF(MY_WME))); + share->kfile.file = -1; + pthread_mutex_unlock(&share->intern_lock); + VOID(my_close(new_file,MYF(MY_WME))); + if (maria_change_to_newfile(share->index_file_name.str, MARIA_NAME_IEXT, + INDEX_TMP_EXT, sync_dir) || + _ma_open_keyfile(share)) + goto err2; + info->lock_type= F_UNLCK; /* Force maria_readinfo to lock */ + _ma_readinfo(info,F_WRLCK,0); /* Will lock the table */ + info->lock_type= old_lock; + share->r_locks= r_locks; + share->w_locks= w_locks; + share->tot_locks= r_locks+w_locks; + share->state= old_state; /* Restore old state */ + + share->state.state.key_file_length=param->new_file_pos; + info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + for (key=0 ; key < share->base.keys ; key++) + share->state.key_root[key]=index_pos[key]; + share->state.key_del= HA_OFFSET_ERROR; + + share->state.changed&= ~STATE_NOT_SORTED_PAGES; + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + DBUG_EXECUTE_IF("maria_crash_sort_index", + { + 
DBUG_PRINT("maria_crash_sort_index", ("now")); + DBUG_ABORT(); + }); + DBUG_RETURN(0); + +err: + VOID(my_close(new_file,MYF(MY_WME))); +err2: + VOID(my_delete(param->temp_filename,MYF(MY_WME))); + DBUG_RETURN(-1); +} /* maria_sort_index */ + + +/** + @brief put CRC on the page + + @param buff reference on the page buffer. + @param pos position of the page in the file. + @param length length of the page +*/ + +static void put_crc(uchar *buff, my_off_t pos, MARIA_SHARE *share) +{ + maria_page_crc_set_index(buff, (pgcache_page_no_t) (pos / share->block_size), + (uchar*) share); +} + + +/* Sort index blocks recursive using one index */ + +static int sort_one_index(HA_CHECK *param, MARIA_HA *info, + MARIA_KEYDEF *keyinfo, + my_off_t pagepos, File new_file) +{ + uint length,nod_flag; + uchar *buff,*keypos,*endpos; + my_off_t new_page_pos,next_page; + MARIA_SHARE *share= info->s; + MARIA_KEY key; + MARIA_PAGE page; + DBUG_ENTER("sort_one_index"); + + /* cannot walk over R-tree indices */ + DBUG_ASSERT(keyinfo->key_alg != HA_KEY_ALG_RTREE); + new_page_pos=param->new_file_pos; + param->new_file_pos+=keyinfo->block_length; + key.keyinfo= keyinfo; + + if (!(buff= (uchar*) my_alloca((uint) keyinfo->block_length + + keyinfo->maxlength))) + { + _ma_check_print_error(param,"Not enough memory for key block"); + DBUG_RETURN(-1); + } + key.data= buff + keyinfo->block_length; + + if (_ma_fetch_keypage(&page, info, keyinfo, pagepos, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, buff, 0)) + { + report_keypage_fault(param, info, pagepos); + goto err; + } + + if ((nod_flag= page.node) || keyinfo->flag & HA_FULLTEXT) + { + keypos= page.buff + share->keypage_header + nod_flag; + endpos= page.buff + page.size; + + for ( ;; ) + { + if (nod_flag) + { + next_page= _ma_kpos(nod_flag,keypos); + /* Save new pos */ + _ma_kpointer(info,keypos-nod_flag,param->new_file_pos); + if (sort_one_index(param,info,keyinfo,next_page,new_file)) + { + DBUG_PRINT("error", + ("From page: %ld, keyoffset: %lu 
used_length: %d", + (ulong) pagepos, (ulong) (keypos - buff), + (int) page.size)); + DBUG_DUMP("buff", page.buff, page.size); + goto err; + } + } + if (keypos >= endpos || + !(*keyinfo->get_key)(&key, page.flag, nod_flag, &keypos)) + break; + DBUG_ASSERT(keypos <= endpos); + if (keyinfo->flag & HA_FULLTEXT) + { + uint off; + int subkeys; + get_key_full_length_rdonly(off, key.data); + subkeys= ft_sintXkorr(key.data + off); + if (subkeys < 0) + { + next_page= _ma_row_pos_from_key(&key); + _ma_dpointer(share, keypos - nod_flag - share->rec_reflength, + param->new_file_pos); /* Save new pos */ + if (sort_one_index(param,info,&share->ft2_keyinfo, + next_page,new_file)) + goto err; + } + } + } + } + + /* Fill block with zero and write it to the new index file */ + length= page.size; + bzero(buff+length,keyinfo->block_length-length); + put_crc(buff, new_page_pos, share); + if (my_pwrite(new_file, buff,(uint) keyinfo->block_length, + new_page_pos,MYF(MY_NABP | MY_WAIT_IF_FULL))) + { + _ma_check_print_error(param,"Can't write indexblock, error: %d",my_errno); + goto err; + } + my_afree(buff); + DBUG_RETURN(0); +err: + my_afree(buff); + DBUG_RETURN(1); +} /* sort_one_index */ + + +/** + @brief Fill empty space in index file with zeroes + + @return + @retval 0 Ok + @retval 1 Error +*/ + +static my_bool maria_zerofill_index(HA_CHECK *param, MARIA_HA *info, + const char *name) +{ + MARIA_SHARE *share= info->s; + MARIA_PINNED_PAGE page_link; + char llbuff[21]; + uchar *buff; + pgcache_page_no_t page; + my_off_t pos; + my_off_t key_file_length= share->state.state.key_file_length; + uint block_size= share->block_size; + my_bool zero_lsn= (share->base.born_transactional && + !(param->testflag & T_ZEROFILL_KEEP_LSN)); + DBUG_ENTER("maria_zerofill_index"); + + if (!(param->testflag & T_SILENT)) + printf("- Zerofilling index for Aria-table '%s'\n",name); + + /* Go through the index file */ + for (pos= share->base.keystart, page= (ulonglong) (pos / block_size); + pos < key_file_length; 
+ pos+= block_size, page++) + { + uint length; + if (!(buff= pagecache_read(share->pagecache, + &share->kfile, page, + DFLT_INIT_HITS, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + _ma_check_print_error(param, + "Page %9s: Got error %d when reading index file", + llstr(pos, llbuff), my_errno); + DBUG_RETURN(1); + } + if (zero_lsn) + bzero(buff, LSN_SIZE); + + if (share->base.born_transactional) + { + uint keynr= _ma_get_keynr(share, buff); + if (keynr != MARIA_DELETE_KEY_NR) + { + MARIA_PAGE page; + DBUG_ASSERT(keynr < share->base.keys); + + _ma_page_setup(&page, info, share->keyinfo + keynr, pos, buff); + if (_ma_compact_keypage(&page, ~(TrID) 0)) + { + _ma_check_print_error(param, + "Page %9s: Got error %d when reading index " + "file", + llstr(pos, llbuff), my_errno); + DBUG_RETURN(1); + } + } + } + + length= _ma_get_page_used(share, buff); + DBUG_ASSERT(length <= block_size); + if (length < block_size) + bzero(buff + length, block_size - length); + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 1, FALSE); + } + if (flush_pagecache_blocks(share->pagecache, &share->kfile, + FLUSH_FORCE_WRITE)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/** + @brief Fill empty space in data file with zeroes + + @todo + Zerofill all pages marked in bitmap as empty and change them to + be of type UNALLOCATED_PAGE + + @return + @retval 0 Ok + @retval 1 Error +*/ + +static my_bool maria_zerofill_data(HA_CHECK *param, MARIA_HA *info, + const char *name) +{ + MARIA_SHARE *share= info->s; + MARIA_PINNED_PAGE page_link; + char llbuff[21]; + my_off_t pos; + pgcache_page_no_t page; + uint block_size= share->block_size; + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + my_bool zero_lsn= !(param->testflag & 
T_ZEROFILL_KEEP_LSN), error; + DBUG_ENTER("maria_zerofill_data"); + + /* This works only with BLOCK_RECORD files */ + if (share->data_file_type != BLOCK_RECORD) + DBUG_RETURN(0); + + if (!(param->testflag & T_SILENT)) + printf("- Zerofilling data for Aria-table '%s'\n",name); + + /* Go through the record file */ + for (page= 1, pos= block_size; + pos < share->state.state.data_file_length; + pos+= block_size, page++) + { + uchar *buff; + enum en_page_type page_type; + + /* Ignore bitmap pages */ + if ((page % share->bitmap.pages_covered) == 0) + continue; + if (!(buff= pagecache_read(share->pagecache, + &info->dfile, + page, 1, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + _ma_check_print_error(param, + "Page %9s: Got error: %d when reading datafile", + llstr(pos, llbuff), my_errno); + goto err; + } + page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK); + switch (page_type) { + case UNALLOCATED_PAGE: + if (zero_lsn) + bzero(buff, block_size); + else + bzero(buff + LSN_SIZE, block_size - LSN_SIZE); + break; + case BLOB_PAGE: + if (_ma_bitmap_get_page_bits(info, bitmap, page) == 0) + { + /* Unallocated page */ + if (zero_lsn) + bzero(buff, block_size); + else + bzero(buff + LSN_SIZE, block_size - LSN_SIZE); + } + else + if (zero_lsn) + bzero(buff, LSN_SIZE); + break; + case HEAD_PAGE: + case TAIL_PAGE: + { + uint max_entry= (uint) buff[DIR_COUNT_OFFSET]; + uint offset, dir_start, empty_space; + uchar *dir; + + if (zero_lsn) + bzero(buff, LSN_SIZE); + if (max_entry != 0) + { + my_bool is_head_page= (page_type == HEAD_PAGE); + dir= dir_entry_pos(buff, block_size, max_entry - 1); + _ma_compact_block_page(buff, block_size, max_entry -1, 0, + is_head_page ? ~(TrID) 0 : 0, + is_head_page ? 
+ share->base.min_block_length : 0); + + /* compactation may have increased free space */ + empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + if (!enough_free_entries_on_page(share, buff)) + empty_space= 0; /* Page is full */ + if (_ma_bitmap_set(info, page, is_head_page, + empty_space)) + goto err; + + /* Zerofill the not used part */ + offset= uint2korr(dir) + uint2korr(dir+2); + dir_start= (uint) (dir - buff); + DBUG_ASSERT(dir_start >= offset); + if (dir_start > offset) + bzero(buff + offset, dir_start - offset); + } + break; + } + default: + _ma_check_print_error(param, + "Page %9s: Found unrecognizable block of type %d", + llstr(pos, llbuff), page_type); + goto err; + } + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 1, FALSE); + } + error= _ma_bitmap_flush(share); + if (flush_pagecache_blocks(share->pagecache, &info->dfile, + FLUSH_FORCE_WRITE)) + error= 1; + DBUG_RETURN(error); + +err: + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + /* flush what was changed so far */ + (void) _ma_bitmap_flush(share); + (void) flush_pagecache_blocks(share->pagecache, &info->dfile, + FLUSH_FORCE_WRITE); + + DBUG_RETURN(1); +} + + +/** + @brief Fill empty space in index and data files with zeroes + + @return + @retval 0 Ok + @retval 1 Error +*/ + +int maria_zerofill(HA_CHECK *param, MARIA_HA *info, const char *name) +{ + my_bool error, reenable_logging, + zero_lsn= !(param->testflag & T_ZEROFILL_KEEP_LSN); + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_zerofill"); + if ((reenable_logging= share->now_transactional)) + _ma_tmp_disable_logging_for_table(info, 0); + if (!(error= (maria_zerofill_index(param, info, name) || + maria_zerofill_data(param, info, name) || + _ma_set_uuid(info, 0)))) + { + /* + Mark that we have done zerofill of data and index. 
If we zeroed pages' + LSN, table is movable. + */ + share->state.changed&= ~STATE_NOT_ZEROFILLED; + if (zero_lsn) + { + share->state.changed&= ~(STATE_NOT_MOVABLE | STATE_MOVED); + /* Table should get new LSNs */ + share->state.create_rename_lsn= share->state.is_of_horizon= + share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS; + } + /* Ensure state is later flushed to disk, if within maria_chk */ + info->update= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + /* Reset create_trid to make file comparable */ + share->state.create_trid= 0; + } + if (reenable_logging) + _ma_reenable_logging_for_table(info, FALSE); + DBUG_RETURN(error); +} + + +/* + Let temporary file replace old file. + This assumes that the new file was created in the same + directory as given by realpath(filename). + This will ensure that any symlinks that are used will still work. + Copy stats from old file to new file, deletes orignal and + changes new file name to old file name +*/ + +int maria_change_to_newfile(const char * filename, const char * old_ext, + const char * new_ext, myf MyFlags) +{ + char old_filename[FN_REFLEN],new_filename[FN_REFLEN]; +#ifdef USE_RAID + if (raid_chunks) + return my_raid_redel(fn_format(old_filename,filename,"",old_ext,2+4), + fn_format(new_filename,filename,"",new_ext,2+4), + raid_chunks, + MYF(MY_WME | MY_LINK_WARNING | MyFlags)); +#endif + /* Get real path to filename */ + (void) fn_format(old_filename,filename,"",old_ext,2+4+32); + return my_redel(old_filename, + fn_format(new_filename,old_filename,"",new_ext,2+4), + MYF(MY_WME | MY_LINK_WARNING | MyFlags)); +} /* maria_change_to_newfile */ + + +/* Copy a block between two files */ + +int maria_filecopy(HA_CHECK *param, File to,File from,my_off_t start, + my_off_t length, const char *type) +{ + uchar tmp_buff[IO_SIZE], *buff; + ulong buff_length; + DBUG_ENTER("maria_filecopy"); + + buff_length=(ulong) min(param->write_buffer_length,length); + if (!(buff=my_malloc(buff_length,MYF(0)))) + { + buff=tmp_buff; 
buff_length=IO_SIZE; + } + + VOID(my_seek(from,start,MY_SEEK_SET,MYF(0))); + while (length > buff_length) + { + if (my_read(from, buff, buff_length, MYF(MY_NABP)) || + my_write(to, buff, buff_length, param->myf_rw)) + goto err; + length-= buff_length; + } + if (my_read(from, buff, (size_t) length,MYF(MY_NABP)) || + my_write(to, buff, (size_t) length,param->myf_rw)) + goto err; + if (buff != tmp_buff) + my_free(buff,MYF(0)); + DBUG_RETURN(0); +err: + if (buff != tmp_buff) + my_free(buff,MYF(0)); + _ma_check_print_error(param,"Can't copy %s to tempfile, error %d", + type,my_errno); + DBUG_RETURN(1); +} + + +/* + Repair table or given index using sorting + + SYNOPSIS + maria_repair_by_sort() + param Repair parameters + info MARIA handler to repair + name Name of table (for warnings) + rep_quick set to <> 0 if we should not change data file + + RESULT + 0 ok + <>0 Error +*/ + +int maria_repair_by_sort(HA_CHECK *param, register MARIA_HA *info, + const char * name, my_bool rep_quick) +{ + int got_error; + uint i; + ha_rows start_records; + my_off_t new_header_length, org_header_length, del; + File new_file; + MARIA_SORT_PARAM sort_param; + MARIA_SHARE *share= info->s; + HA_KEYSEG *keyseg; + double *rec_per_key_part; + char llbuff[22]; + MARIA_SORT_INFO sort_info; + ulonglong key_map; + myf sync_dir= ((share->now_transactional && !share->temporary) ? 
+ MY_SYNC_DIR : 0); + my_bool scan_inited= 0, reenable_logging= 0; + MARIA_SHARE backup_share; + DBUG_ENTER("maria_repair_by_sort"); + LINT_INIT(key_map); + + got_error= 1; + new_file= -1; + start_records= share->state.state.records; + if (!(param->testflag & T_SILENT)) + { + printf("- recovering (with sort) Aria-table '%s'\n",name); + printf("Data records: %s\n", llstr(start_records,llbuff)); + } + + if (initialize_variables_for_repair(param, &sort_info, &sort_param, info, + rep_quick, &backup_share)) + goto err; + + if ((reenable_logging= share->now_transactional)) + _ma_tmp_disable_logging_for_table(info, 0); + + org_header_length= share->pack.header_length; + new_header_length= (param->testflag & T_UNPACK) ? 0 : org_header_length; + sort_param.filepos= new_header_length; + + if (!rep_quick) + { + /* Get real path for data file */ + if ((new_file=my_create(fn_format(param->temp_filename, + share->data_file_name.str, "", + DATA_TMP_EXT, 2+4), + 0,param->tmpfile_createflag, + MYF(0))) < 0) + { + _ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + goto err; + } + if (new_header_length && + maria_filecopy(param, new_file, info->dfile.file, 0L, + new_header_length, "datafile-header")) + goto err; + + share->state.dellink= HA_OFFSET_ERROR; + info->rec_cache.file= new_file; /* For sort_delete_record */ + if (share->data_file_type == BLOCK_RECORD || + (param->testflag & T_UNPACK)) + { + if (create_new_data_handle(&sort_param, new_file)) + goto err; + sort_info.new_info->rec_cache.file= new_file; + } + } + + if (!(sort_info.key_block= + alloc_key_blocks(param, + (uint) param->sort_key_blocks, + share->base.max_key_block_length))) + goto err; + sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks; + + if (share->data_file_type != BLOCK_RECORD) + { + /* We need a read buffer to read rows in big blocks */ + if (init_io_cache(&param->read_cache, info->dfile.file, + (uint) param->read_buffer_length, + READ_CACHE,
org_header_length, 1, MYF(MY_WME))) + goto err; + } + if (sort_info.new_info->s->data_file_type != BLOCK_RECORD) + { + /* When writing to not block records, we need a write buffer */ + if (!rep_quick) + { + if (init_io_cache(&sort_info.new_info->rec_cache, new_file, + (uint) param->write_buffer_length, + WRITE_CACHE, new_header_length, 1, + MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw)) + goto err; + sort_info.new_info->opt_flag|= WRITE_CACHE_USED; + } + } + + if (!(sort_param.record= + (uchar*) my_malloc((size_t) share->base.default_rec_buff_size, + MYF(0))) || + _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size, + share->base.default_rec_buff_size)) + { + _ma_check_print_error(param, "Not enough memory for extra record"); + goto err; + } + + /* Optionally drop indexes and optionally modify the key_map */ + maria_drop_all_indexes(param, info, FALSE); + key_map= share->state.key_map; + if (param->testflag & T_CREATE_MISSING_KEYS) + { + /* Invert the copied key_map to recreate all disabled indexes. */ + key_map= ~key_map; + } + + param->read_cache.end_of_file= sort_info.filelength; + sort_param.wordlist=NULL; + init_alloc_root(&sort_param.wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0); + + sort_param.key_cmp=sort_key_cmp; + sort_param.lock_in_memory=maria_lock_memory; + sort_param.tmpdir=param->tmpdir; + sort_param.master =1; + + del=share->state.state.del; + + rec_per_key_part= param->new_rec_per_key_part; + for (sort_param.key=0 ; sort_param.key < share->base.keys ; + rec_per_key_part+=sort_param.keyinfo->keysegs, sort_param.key++) + { + sort_param.keyinfo=share->keyinfo+sort_param.key; + /* + Skip this index if it is marked disabled in the copied + (and possibly inverted) key_map. + */ + if (! 
maria_is_key_active(key_map, sort_param.key)) + { + /* Remember old statistics for key */ + memcpy((char*) rec_per_key_part, + (char*) (share->state.rec_per_key_part + + (uint) (rec_per_key_part - param->new_rec_per_key_part)), + sort_param.keyinfo->keysegs*sizeof(*rec_per_key_part)); + DBUG_PRINT("repair", ("skipping seemingly disabled index #: %u", + sort_param.key)); + continue; + } + + if ((!(param->testflag & T_SILENT))) + printf ("- Fixing index %d\n",sort_param.key+1); + + sort_param.read_cache=param->read_cache; + sort_param.seg=sort_param.keyinfo->seg; + sort_param.max_pos= sort_param.pos= org_header_length; + keyseg=sort_param.seg; + bzero((char*) sort_param.unique,sizeof(sort_param.unique)); + sort_param.key_length=share->rec_reflength; + for (i=0 ; keyseg[i].type != HA_KEYTYPE_END; i++) + { + sort_param.key_length+=keyseg[i].length; + if (keyseg[i].flag & HA_SPACE_PACK) + sort_param.key_length+=get_pack_length(keyseg[i].length); + if (keyseg[i].flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART)) + sort_param.key_length+=2 + test(keyseg[i].length >= 127); + if (keyseg[i].flag & HA_NULL_PART) + sort_param.key_length++; + } + share->state.state.records=share->state.state.del=share->state.split=0; + share->state.state.empty=0; + + if (sort_param.keyinfo->flag & HA_FULLTEXT) + { + uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT* + sort_param.keyinfo->seg->charset->mbmaxlen; + sort_param.key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN; + /* + fulltext indexes may have much more entries than the + number of rows in the table. We estimate the number here. + + Note, built-in parser is always nr. 
0 - see ftparser_call_initializer() + */ + if (sort_param.keyinfo->ftkey_nr == 0) + { + /* + for built-in parser the number of generated index entries + cannot be larger than the size of the data file divided + by the minimal word's length + */ + sort_info.max_records= + (ha_rows) (sort_info.filelength/ft_min_word_len+1); + } + else + { + /* + for external plugin parser we cannot tell anything at all :( + so, we'll use all the sort memory and start from ~10 buffpeks. + (see _ma_create_index_by_sort) + */ + sort_info.max_records= + 10*param->sort_buffer_length/sort_param.key_length; + } + + sort_param.key_read= sort_maria_ft_key_read; + sort_param.key_write= sort_maria_ft_key_write; + } + else + { + sort_param.key_read= sort_key_read; + sort_param.key_write= sort_key_write; + } + + if (sort_info.new_info->s->data_file_type == BLOCK_RECORD) + { + scan_inited= 1; + if (maria_scan_init(sort_info.info)) + goto err; + } + if (_ma_create_index_by_sort(&sort_param, + (my_bool) (!(param->testflag & T_VERBOSE)), + (size_t) param->sort_buffer_length)) + { + param->retry_repair=1; + _ma_check_print_error(param, "Create index by sort failed"); + goto err; + } + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + DBUG_EXECUTE_IF("maria_crash_create_index_by_sort", + { + DBUG_PRINT("maria_crash_create_index_by_sort", ("now")); + DBUG_ABORT(); + }); + if (scan_inited) + { + scan_inited= 0; + maria_scan_end(sort_info.info); + } + + /* No need to calculate checksum again. */ + sort_param.calc_checksum= 0; + free_root(&sort_param.wordroot, MYF(0)); + + /* Set for next loop */ + sort_info.max_records= (ha_rows) sort_info.new_info->s->state.state.records; + if (param->testflag & T_STATISTICS) + maria_update_key_parts(sort_param.keyinfo, rec_per_key_part, + sort_param.unique, + (param->stats_method == + MI_STATS_METHOD_IGNORE_NULLS ? 
+ sort_param.notnull : NULL), + (ulonglong) share->state.state.records); + maria_set_key_active(share->state.key_map, sort_param.key); + DBUG_PRINT("repair", ("set enabled index #: %u", sort_param.key)); + + if (_ma_flush_table_files_before_swap(param, info)) + goto err; + + if (sort_param.fix_datafile) + { + param->read_cache.end_of_file=sort_param.filepos; + if (maria_write_data_suffix(&sort_info,1) || + end_io_cache(&sort_info.new_info->rec_cache)) + { + _ma_check_print_error(param, "Got error when flushing row cache"); + goto err; + } + sort_info.new_info->opt_flag&= ~WRITE_CACHE_USED; + + if (param->testflag & T_SAFE_REPAIR) + { + /* Don't repair if we loosed more than one row */ + if (share->state.state.records+1 < start_records) + { + _ma_check_print_error(param, + "Rows lost; Aborting because safe repair was " + "requested"); + share->state.state.records=start_records; + goto err; + } + } + + sort_info.new_info->s->state.state.data_file_length= sort_param.filepos; + if (sort_info.new_info != sort_info.info) + { + MARIA_STATE_INFO save_state= sort_info.new_info->s->state; + if (maria_close(sort_info.new_info)) + { + _ma_check_print_error(param, "Got error %d on close", my_errno); + goto err; + } + copy_data_file_state(&share->state, &save_state); + new_file= -1; + sort_info.new_info= info; + info->rec_cache.file= info->dfile.file; + } + + share->state.version=(ulong) time((time_t*) 0); /* Force reopen */ + + /* Replace the actual file with the temporary file */ + if (new_file >= 0) + { + my_close(new_file, MYF(MY_WME)); + new_file= -1; + } + change_data_file_descriptor(info, -1); + if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT, + DATA_TMP_EXT, + (param->testflag & T_BACKUP_DATA ? 
+ MYF(MY_REDEL_MAKE_BACKUP): MYF(0)) | + sync_dir) || + _ma_open_datafile(info, share, NullS, -1)) + { + _ma_check_print_error(param, "Couldn't change to new data file"); + goto err; + } + if (param->testflag & T_UNPACK) + restore_data_file_type(share); + + org_header_length= share->pack.header_length; + sort_info.org_data_file_type= share->data_file_type; + sort_info.filelength= share->state.state.data_file_length; + sort_param.fix_datafile=0; + } + else + share->state.state.data_file_length=sort_param.max_pos; + + param->read_cache.file= info->dfile.file; /* re-init read cache */ + reinit_io_cache(¶m->read_cache,READ_CACHE,share->pack.header_length, + 1,1); + } + + if (param->testflag & T_WRITE_LOOP) + { + VOID(fputs(" \r",stdout)); VOID(fflush(stdout)); + } + + if (rep_quick && del+sort_info.dupp != share->state.state.del) + { + _ma_check_print_error(param,"Couldn't fix table with quick recovery: " + "Found wrong number of deleted records"); + _ma_check_print_error(param,"Run recovery again without -q"); + got_error=1; + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + goto err; + } + + if (rep_quick && (param->testflag & T_FORCE_UNIQUENESS)) + { + my_off_t skr= (share->state.state.data_file_length + + (sort_info.org_data_file_type == COMPRESSED_RECORD) ? 
+ MEMMAP_EXTRA_MARGIN : 0); +#ifdef USE_RELOC + if (sort_info.org_data_file_type == STATIC_RECORD && + skr < share->base.reloc*share->base.min_pack_length) + skr=share->base.reloc*share->base.min_pack_length; +#endif + if (skr != sort_info.filelength) + if (my_chsize(info->dfile.file, skr, 0, MYF(0))) + _ma_check_print_warning(param, + "Can't change size of datafile, error: %d", + my_errno); + } + + if (param->testflag & T_CALC_CHECKSUM) + share->state.state.checksum=param->glob_crc; + + if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0, + MYF(0))) + _ma_check_print_warning(param, + "Can't change size of indexfile, error: %d", + my_errno); + + if (!(param->testflag & T_SILENT)) + { + if (start_records != share->state.state.records) + printf("Data records: %s\n", llstr(share->state.state.records,llbuff)); + } + if (sort_info.dupp) + _ma_check_print_warning(param, + "%s records have been removed", + llstr(sort_info.dupp,llbuff)); + got_error=0; + /* If invoked by external program that uses thr_lock */ + if (&share->state.state != info->state) + *info->state= *info->state_start= share->state.state; + +err: + if (scan_inited) + maria_scan_end(sort_info.info); + _ma_reset_state(info); + + VOID(end_io_cache(&sort_info.new_info->rec_cache)); + VOID(end_io_cache(¶m->read_cache)); + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + if (got_error) + { + if (! 
param->error_printed) + _ma_check_print_error(param,"%d when fixing table",my_errno); + (void)_ma_flush_table_files_before_swap(param, info); + if (sort_info.new_info && sort_info.new_info != sort_info.info) + { + unuse_data_file_descriptor(sort_info.new_info); + maria_close(sort_info.new_info); + } + if (new_file >= 0) + { + VOID(my_close(new_file,MYF(0))); + VOID(my_delete(param->temp_filename, MYF(MY_WME))); + } + maria_mark_crashed_on_repair(info); + } + else + { + if (key_map == share->state.key_map) + share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS; + /* + Now that we have flushed and forced everything, we can bump + create_rename_lsn: + */ + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + DBUG_EXECUTE_IF("maria_crash_repair", + { + DBUG_PRINT("maria_crash_repair", ("now")); + DBUG_ABORT(); + }); + } + share->state.changed|= STATE_NOT_SORTED_PAGES; + if (!rep_quick) + share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + /* If caller had disabled logging it's not up to us to re-enable it */ + if (reenable_logging) + _ma_reenable_logging_for_table(info, FALSE); + restore_table_state_after_repair(info, &backup_share); + + my_free(sort_param.rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR)); + my_free(sort_info.key_block, MYF(MY_ALLOW_ZERO_PTR)); + my_free(sort_info.ft_buf, MYF(MY_ALLOW_ZERO_PTR)); + my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR)); + DBUG_RETURN(got_error); +} + +


/*
  Threaded repair of table using sorting

  SYNOPSIS
    maria_repair_parallel()
    param               Repair parameters
    info                MARIA handler to repair
    name                Name of table (for warnings)
    rep_quick           set to <> 0 if we should not change data file

  DESCRIPTION
    Same as maria_repair_by_sort but do it multithreaded.
    Each key is handled by a separate thread.
    TODO: make a number of threads a parameter

    When compiled without THREAD this simply calls
    maria_repair_by_sort() with the same arguments.

    In parallel repair we use one thread per index. There are two modes:

    Quick

      Only the indexes are rebuilt. All threads share a read buffer.
      Every thread that needs fresh data in the buffer enters the shared
      cache lock. The last thread joining the lock reads the buffer from
      the data file and wakes all other threads.

    Non-quick

      The data file is rebuilt and all indexes are rebuilt to point to
      the new record positions. One thread is the master thread. It
      reads from the old data file and writes to the new data file. It
      also creates one of the indexes. The other threads read from a
      buffer which is filled by the master. If they need fresh data,
      they enter the shared cache lock. If the master's write buffer is
      full, it flushes it to the new data file and enters the shared
      cache lock too. When all threads joined in the lock, the master
      copies its write buffer to the read buffer for the other threads
      and wakes them.

  NOTES
    On failure the table is marked with maria_mark_crashed_on_repair()
    and, for some failures, param->retry_repair is set so that the
    caller can retry.

  RESULT
    0   ok
    <>0 Error
*/

int maria_repair_parallel(HA_CHECK *param, register MARIA_HA *info,
                          const char * name, my_bool rep_quick)
{
#ifndef THREAD
  return maria_repair_by_sort(param, info, name, rep_quick);
#else
  int got_error;
  uint i,key, total_key_length, istep;
  ha_rows start_records;
  my_off_t new_header_length,del;
  File new_file;
  MARIA_SORT_PARAM *sort_param=0, tmp_sort_param;
  MARIA_SHARE *share= info->s;
  double *rec_per_key_part;
  HA_KEYSEG *keyseg;
  char llbuff[22];
  IO_CACHE new_data_cache; /* For non-quick repair. */
  IO_CACHE_SHARE io_share;
  MARIA_SORT_INFO sort_info;
  MARIA_SHARE backup_share;
  ulonglong key_map;
  pthread_attr_t thr_attr;
  myf sync_dir= ((share->now_transactional && !share->temporary) ?
                 MY_SYNC_DIR : 0);
  my_bool reenable_logging= 0;
  DBUG_ENTER("maria_repair_parallel");
  LINT_INIT(key_map);

  /* Assume failure until everything below has succeeded */
  got_error= 1;
  new_file= -1;
  start_records= share->state.state.records;
  if (!(param->testflag & T_SILENT))
  {
    printf("- parallel recovering (with sort) Aria-table '%s'\n",name);
    printf("Data records: %s\n", llstr(start_records, llbuff));
  }

  /*
    NOTE(review): a failure here jumps to 'err' before sort_info.mutex,
    sort_info.cond and new_data_cache have been set up, although the
    'err' path destroys/ends them. Confirm that
    initialize_variables_for_repair() leaves things in a state where
    that is safe.
  */
  if (initialize_variables_for_repair(param, &sort_info, &tmp_sort_param, info,
                                      rep_quick, &backup_share))
    goto err;

  if ((reenable_logging= share->now_transactional))
    _ma_tmp_disable_logging_for_table(info, 0);

  /* When unpacking, the new data file gets no pack header */
  new_header_length= ((param->testflag & T_UNPACK) ? 0 :
                      share->pack.header_length);

  /*
    Quick repair (not touching data file, rebuilding indexes):
    {
      Read  cache is (HA_CHECK *param)->read_cache using info->dfile.file.
    }

    Non-quick repair (rebuilding data file and indexes):
    {
      Master thread:

        Read  cache is (HA_CHECK *param)->read_cache using info->dfile.file.
        Write cache is (MARIA_INFO *info)->rec_cache using new_file.

      Slave threads:

        Read  cache is new_data_cache synced to master rec_cache.

      The final assignment of the filedescriptor for rec_cache is done
      after the cache creation.

      Don't check file size on new_data_cache, as the resulting file size
      is not known yet.

      As rec_cache and new_data_cache are synced, write_buffer_length is
      used for the read cache 'new_data_cache'. Both start at the same
      position 'new_header_length'.
    }
  */
  DBUG_PRINT("info", ("is quick repair: %d", (int) rep_quick));

  /* Initialize pthread structures before goto err. */
  pthread_mutex_init(&sort_info.mutex, MY_MUTEX_INIT_FAST);
  pthread_cond_init(&sort_info.cond, 0);

  if (!(sort_info.key_block=
        alloc_key_blocks(param, (uint) param->sort_key_blocks,
                         share->base.max_key_block_length)) ||
      init_io_cache(&param->read_cache, info->dfile.file,
                    (uint) param->read_buffer_length,
                    READ_CACHE, share->pack.header_length, 1, MYF(MY_WME)) ||
      (!rep_quick &&
       (init_io_cache(&info->rec_cache, info->dfile.file,
                      (uint) param->write_buffer_length,
                      WRITE_CACHE, new_header_length, 1,
                      MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw) ||
        init_io_cache(&new_data_cache, -1,
                      (uint) param->write_buffer_length,
                      READ_CACHE, new_header_length, 1,
                      MYF(MY_WME | MY_DONT_CHECK_FILESIZE)))))
    goto err;
  sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks;
  info->opt_flag|=WRITE_CACHE_USED;
  info->rec_cache.file= info->dfile.file;         /* for sort_delete_record */

  if (!rep_quick)
  {
    /* Get real path for data file */
    if ((new_file= my_create(fn_format(param->temp_filename,
                                       share->data_file_name.str, "",
                                       DATA_TMP_EXT,
                                       2+4),
                             0,param->tmpfile_createflag,
                             MYF(0))) < 0)
    {
      _ma_check_print_error(param,"Can't create new tempfile: '%s'",
                            param->temp_filename);
      goto err;
    }
    /* Carry the existing header over to the new data file */
    if (new_header_length &&
        maria_filecopy(param, new_file, info->dfile.file,0L,new_header_length,
                       "datafile-header"))
      goto err;
    if (param->testflag & T_UNPACK)
      restore_data_file_type(share);
    share->state.dellink= HA_OFFSET_ERROR;
    /* From here on the write cache targets the new data file */
    info->rec_cache.file=new_file;
  }

  /* Optionally drop indexes and optionally modify the key_map. */
  maria_drop_all_indexes(param, info, FALSE);
  key_map= share->state.key_map;
  if (param->testflag & T_CREATE_MISSING_KEYS)
  {
    /* Invert the copied key_map to recreate all disabled indexes. */
    key_map= ~key_map;
  }

  param->read_cache.end_of_file= sort_info.filelength;

  /*
    +1 below is required hack for parallel repair mode.
    The share->state.state.records value, that is compared later
    to sort_info.max_records and cannot exceed it, is
    increased in sort_key_write. In maria_repair_by_sort, sort_key_write
    is called after sort_key_read, where the comparison is performed,
    but in parallel mode master thread can call sort_key_write
    before some other repair thread calls sort_key_read.
    Furthermore I'm not even sure +1 would be enough.
    Maybe sort_info.max_records should be always set to max value in
    parallel mode.
  */
  sort_info.max_records++;

  /* Remember the delete count for the quick-repair sanity check below */
  del=share->state.state.del;

  /*
    One allocation holds the array of per-key sort parameters followed
    by one record buffer of pack_reclength bytes per key.
  */
  if (!(sort_param=(MARIA_SORT_PARAM *)
        my_malloc((uint) share->base.keys *
                  (sizeof(MARIA_SORT_PARAM) + share->base.pack_reclength),
                  MYF(MY_ZEROFILL))))
  {
    _ma_check_print_error(param,"Not enough memory for key!");
    goto err;
  }
  total_key_length=0;
  rec_per_key_part= param->new_rec_per_key_part;
  share->state.state.records=share->state.state.del=share->state.split=0;
  share->state.state.empty=0;

  /*
    'key' walks over all indexes of the table, 'i' only over those that
    are going to be rebuilt: istep is 0 after a skipped index so that
    its sort_param slot is reused for the next one.
  */
  for (i=key=0, istep=1 ; key < share->base.keys ;
       rec_per_key_part+=sort_param[i].keyinfo->keysegs, i+=istep, key++)
  {
    sort_param[i].key=key;
    sort_param[i].keyinfo=share->keyinfo+key;
    sort_param[i].seg=sort_param[i].keyinfo->seg;
    /*
      Skip this index if it is marked disabled in the copied
      (and possibly inverted) key_map.
    */
    if (! maria_is_key_active(key_map, key))
    {
      /* Remember old statistics for key */
      memcpy((char*) rec_per_key_part,
             (char*) (share->state.rec_per_key_part+
                      (uint) (rec_per_key_part - param->new_rec_per_key_part)),
             sort_param[i].keyinfo->keysegs*sizeof(*rec_per_key_part));
      istep=0;
      continue;
    }
    istep=1;
    if ((!(param->testflag & T_SILENT)))
      printf ("- Fixing index %d\n",key+1);
    if (sort_param[i].keyinfo->flag & HA_FULLTEXT)
    {
      sort_param[i].key_read=sort_maria_ft_key_read;
      sort_param[i].key_write=sort_maria_ft_key_write;
    }
    else
    {
      sort_param[i].key_read=sort_key_read;
      sort_param[i].key_write=sort_key_write;
    }
    sort_param[i].key_cmp=sort_key_cmp;
    sort_param[i].lock_in_memory=maria_lock_memory;
    sort_param[i].tmpdir=param->tmpdir;
    sort_param[i].sort_info=&sort_info;
    sort_param[i].master=0;
    sort_param[i].fix_datafile=0;
    sort_param[i].calc_checksum= 0;

    sort_param[i].filepos=new_header_length;
    sort_param[i].max_pos=sort_param[i].pos=share->pack.header_length;

    /* Record buffer number i in the area behind the sort_param array */
    sort_param[i].record= (((uchar *)(sort_param+share->base.keys))+
                           (share->base.pack_reclength * i));
    if (_ma_alloc_buffer(&sort_param[i].rec_buff, &sort_param[i].rec_buff_size,
                         share->base.default_rec_buff_size))
    {
      _ma_check_print_error(param,"Not enough memory!");
      goto err;
    }
    /* Key length: row reference plus the length of every key segment */
    sort_param[i].key_length=share->rec_reflength;
    for (keyseg=sort_param[i].seg; keyseg->type != HA_KEYTYPE_END;
         keyseg++)
    {
      sort_param[i].key_length+=keyseg->length;
      if (keyseg->flag & HA_SPACE_PACK)
        sort_param[i].key_length+=get_pack_length(keyseg->length);
      if (keyseg->flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART))
        sort_param[i].key_length+=2 + test(keyseg->length >= 127);
      if (keyseg->flag & HA_NULL_PART)
        sort_param[i].key_length++;
    }
    total_key_length+=sort_param[i].key_length;

    if (sort_param[i].keyinfo->flag & HA_FULLTEXT)
    {
      uint ft_max_word_len_for_sort=
        (FT_MAX_WORD_LEN_FOR_SORT *
         sort_param[i].keyinfo->seg->charset->mbmaxlen);
      sort_param[i].key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
      init_alloc_root(&sort_param[i].wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
    }
  }
  sort_info.total_keys=i;
  /* The first thread is the master; only it may rewrite the data file */
  sort_param[0].master= 1;
  sort_param[0].fix_datafile= ! rep_quick;
  sort_param[0].calc_checksum= test(param->testflag & T_CALC_CHECKSUM);

  if (!maria_ftparser_alloc_param(info))
    goto err;

  sort_info.got_error=0;
  /*
    Held while the threads are started; released inside
    pthread_cond_wait() below while waiting for them to finish.
  */
  pthread_mutex_lock(&sort_info.mutex);

  /*
    Initialize the I/O cache share for use with the read caches and, in
    case of non-quick repair, the write cache. When all threads join on
    the cache lock, the writer copies the write cache contents to the
    read caches.
  */
  if (i > 1)
  {
    if (rep_quick)
      init_io_cache_share(&param->read_cache, &io_share, NULL, i);
    else
      init_io_cache_share(&new_data_cache, &io_share, &info->rec_cache, i);
  }
  else
    io_share.total_threads= 0; /* share not used */

  (void) pthread_attr_init(&thr_attr);
  (void) pthread_attr_setdetachstate(&thr_attr,PTHREAD_CREATE_DETACHED);

  for (i=0 ; i < sort_info.total_keys ; i++)
  {
    /*
      Copy the properly initialized IO_CACHE structure so that every
      thread has its own copy. In quick mode param->read_cache is shared
      for use by all threads. In non-quick mode all threads but the
      first copy the shared new_data_cache, which is synchronized to the
      write cache of the first thread. The first thread copies
      param->read_cache, which is not shared.
    */
    sort_param[i].read_cache= ((rep_quick || !i) ? param->read_cache :
                               new_data_cache);
    DBUG_PRINT("io_cache_share", ("thread: %u read_cache: 0x%lx",
                                  i, (long) &sort_param[i].read_cache));

    /*
      two approaches: the same amount of memory for each thread
      or the memory for the same number of keys for each thread...
      In the second one all the threads will fill their sort_buffers
      (and call write_keys) at the same time, putting more stress on i/o.
    */
    sort_param[i].sortbuff_size=
#ifndef USING_SECOND_APPROACH
      param->sort_buffer_length/sort_info.total_keys;
#else
      param->sort_buffer_length*sort_param[i].key_length/total_key_length;
#endif
    if (pthread_create(&sort_param[i].thr, &thr_attr,
                       _ma_thr_find_all_keys,
                       (void *) (sort_param+i)))
    {
      _ma_check_print_error(param,"Cannot start a repair thread");
      /* Cleanup: Detach from the share. Avoid others to be blocked. */
      if (io_share.total_threads)
        remove_io_thread(&sort_param[i].read_cache);
      DBUG_PRINT("error", ("Cannot start a repair thread"));
      sort_info.got_error=1;
    }
    else
      sort_info.threads_running++;
  }
  (void) pthread_attr_destroy(&thr_attr);

  /* waiting for all threads to finish */
  while (sort_info.threads_running)
    pthread_cond_wait(&sort_info.cond, &sort_info.mutex);
  pthread_mutex_unlock(&sort_info.mutex);

  if ((got_error= _ma_thr_write_keys(sort_param)))
  {
    param->retry_repair=1;
    goto err;
  }
  got_error=1;                          /* Assume the following may go wrong */

  if (_ma_flush_table_files_before_swap(param, info))
    goto err;

  if (sort_param[0].fix_datafile)
  {
    /*
      Append some nulls to the end of a memory mapped file. Destroy the
      write cache. The master thread did already detach from the share
      by remove_io_thread() in sort.c:thr_find_all_keys().
    */
    if (maria_write_data_suffix(&sort_info,1) ||
        end_io_cache(&info->rec_cache))
    {
      /* Same message as maria_repair_by_sort() gives for this failure */
      _ma_check_print_error(param, "Got error when flushing row cache");
      goto err;
    }
    if (param->testflag & T_SAFE_REPAIR)
    {
      /* Don't repair if we lost more than one row */
      if (share->state.state.records+1 < start_records)
      {
        /* Same message as maria_repair_by_sort() gives for this failure */
        _ma_check_print_error(param,
                              "Rows lost; Aborting because safe repair was "
                              "requested");
        share->state.state.records=start_records;
        goto err;
      }
    }
    share->state.state.data_file_length= sort_param->filepos;
    /* Only whole records */
    share->state.version= (ulong) time((time_t*) 0);
    /*
      Exchange the data file descriptor of the table, so that we use the
      new file from now on.
    */
    my_close(info->dfile.file, MYF(0));
    info->dfile.file= new_file;
    share->pack.header_length=(ulong) new_header_length;
  }
  else
    share->state.state.data_file_length=sort_param->max_pos;

  if (rep_quick && del+sort_info.dupp != share->state.state.del)
  {
    _ma_check_print_error(param,"Couldn't fix table with quick recovery: "
                          "Found wrong number of deleted records");
    _ma_check_print_error(param,"Run recovery again without -q");
    param->retry_repair=1;
    param->testflag|=T_RETRY_WITHOUT_QUICK;
    goto err;
  }

  if (rep_quick && (param->testflag & T_FORCE_UNIQUENESS))
  {
    /*
      The conditional must be parenthesised: '+' binds tighter than '?:',
      so without the inner parentheses the whole sum would be used as the
      condition and skr would be just MEMMAP_EXTRA_MARGIN or 0.
    */
    my_off_t skr= (share->state.state.data_file_length +
                   ((sort_info.org_data_file_type == COMPRESSED_RECORD) ?
                    MEMMAP_EXTRA_MARGIN : 0));
#ifdef USE_RELOC
    if (sort_info.org_data_file_type == STATIC_RECORD &&
        skr < share->base.reloc*share->base.min_pack_length)
      skr=share->base.reloc*share->base.min_pack_length;
#endif
    if (skr != sort_info.filelength)
      if (my_chsize(info->dfile.file, skr, 0, MYF(0)))
        _ma_check_print_warning(param,
                                "Can't change size of datafile,  error: %d",
                                my_errno);
  }
  if (param->testflag & T_CALC_CHECKSUM)
    share->state.state.checksum=param->glob_crc;

  if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0,
                MYF(0)))
    _ma_check_print_warning(param,
                            "Can't change size of indexfile, error: %d",
                            my_errno);

  if (!(param->testflag & T_SILENT))
  {
    if (start_records != share->state.state.records)
      printf("Data records: %s\n", llstr(share->state.state.records,llbuff));
  }
  if (sort_info.dupp)
    _ma_check_print_warning(param,
                            "%s records have been removed",
                            llstr(sort_info.dupp,llbuff));
  got_error=0;
  /* If invoked by external program that uses thr_lock */
  if (&share->state.state != info->state)
    *info->state= *info->state_start= share->state.state;

err:
  _ma_reset_state(info);

  /*
    Destroy the write cache. The master thread did already detach from
    the share by remove_io_thread() or it was not yet started (if the
    error happened before creating the thread).
  */
  VOID(end_io_cache(&sort_info.new_info->rec_cache));
  VOID(end_io_cache(&param->read_cache));
  info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
  sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
  /*
    Destroy the new data cache in case of non-quick repair. All slave
    threads did either detach from the share by remove_io_thread()
    already or they were not yet started (if the error happened before
    creating the threads).
  */
  if (!rep_quick)
    VOID(end_io_cache(&new_data_cache));
  if (!got_error)
  {
    /* Replace the actual file with the temporary file */
    if (new_file >= 0)
    {
      my_close(new_file,MYF(0));
      info->dfile.file= new_file= -1;
      if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT,
                                  DATA_TMP_EXT,
                                  MYF((param->testflag & T_BACKUP_DATA ?
                                       MY_REDEL_MAKE_BACKUP : 0) |
                                      sync_dir)) ||
          _ma_open_datafile(info,share, NullS, -1))
        got_error=1;
    }
  }
  if (got_error)
  {
    if (! param->error_printed)
      _ma_check_print_error(param,"%d when fixing table",my_errno);
    (void)_ma_flush_table_files_before_swap(param, info);
    if (new_file >= 0)
    {
      /* Drop the half-written temporary data file */
      VOID(my_close(new_file,MYF(0)));
      VOID(my_delete(param->temp_filename, MYF(MY_WME)));
      if (info->dfile.file == new_file)
        info->dfile.file= -1;
    }
    maria_mark_crashed_on_repair(info);
  }
  else if (key_map == share->state.key_map)
    share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS;
  share->state.changed|= STATE_NOT_SORTED_PAGES;
  if (!rep_quick)
    share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED |
                             STATE_NOT_MOVABLE);

  pthread_cond_destroy (&sort_info.cond);
  pthread_mutex_destroy(&sort_info.mutex);

  /* If caller had disabled logging it's not up to us to re-enable it */
  if (reenable_logging)
    _ma_reenable_logging_for_table(info, FALSE);
  restore_table_state_after_repair(info, &backup_share);

  my_free(sort_info.ft_buf, MYF(MY_ALLOW_ZERO_PTR));
  my_free(sort_info.key_block,MYF(MY_ALLOW_ZERO_PTR));
  my_free(sort_param,MYF(MY_ALLOW_ZERO_PTR));
  my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
  if (!got_error && (param->testflag & T_UNPACK))
    restore_data_file_type(share);
  DBUG_RETURN(got_error);
#endif /* THREAD */
}


/*
  Read next record and return next key

  SYNOPSIS
    sort_key_read()
    sort_param          Sort state for the index that is being built
    key                 Buffer that receives the generated key

  DESCRIPTION
    Fetches the next row with sort_get_next_record(), passes it to
    _ma_sort_write_record() and then builds the key for index
    sort_param->key with that index's make_key() function. The length
    of the generated key (data + row reference) is stored in
    sort_param->real_key_length.

  RETURN
    0      ok, 'key' holds the next key
    1      more rows found than sort_info->max_records allows, or
           _ma_sort_write_record() failed
    other  value returned by sort_get_next_record()
*/

static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key)
{
  int error;
  MARIA_SORT_INFO *sort_info= sort_param->sort_info;
  MARIA_HA *info= sort_info->info;
  MARIA_KEY int_key;
  DBUG_ENTER("sort_key_read");

  if ((error=sort_get_next_record(sort_param)))
    DBUG_RETURN(error);
  if (info->s->state.state.records == sort_info->max_records)
  {
    _ma_check_print_error(sort_info->param,
                          "Key %d - Found too many records; Can't continue",
                          sort_param->key+1);
    DBUG_RETURN(1);
  }
  if (_ma_sort_write_record(sort_param))
    DBUG_RETURN(1);

  (*info->s->keyinfo[sort_param->key].make_key)(info, &int_key,
                                                sort_param->key, key,
                                                sort_param->record,
                                                sort_param->current_filepos,
                                                0);
  sort_param->real_key_length= int_key.data_length + int_key.ref_length;
#ifdef HAVE_valgrind
  /*
    Clear the unused tail of the key buffer. Guarded like in
    sort_maria_ft_key_read() so that the unsigned length cannot wrap.
  */
  if (sort_param->key_length > sort_param->real_key_length)
    bzero(key+sort_param->real_key_length,
          (sort_param->key_length-sort_param->real_key_length));
#endif
  DBUG_RETURN(0);
} /* sort_key_read */


/*
  Read next fulltext word and return it as the next key

  SYNOPSIS
    sort_maria_ft_key_read()
    sort_param          Sort state for the fulltext index being built
    key                 Buffer that receives the generated key

  DESCRIPTION
    A row produces one key per parsed word. When no word list is
    pending (sort_param->wordlist is not set), rows are read with
    sort_get_next_record(), written with _ma_sort_write_record() and
    parsed with _ma_ft_parserecord() until a row gives a list whose
    first entry has a non-zero 'pos'. Later calls continue from
    sort_param->wordptr. When the entry after the returned word has
    'pos' == 0 the list is finished: wordroot is marked free and
    wordlist is cleared, so the next call reads a new row.

  RETURN
    0      ok, 'key' holds the next key
    1      _ma_ft_parserecord() failed
    other  value returned by sort_get_next_record() or
           _ma_sort_write_record()
*/

static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key)
{
  int error;
  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
  MARIA_HA *info=sort_info->info;
  FT_WORD *wptr=0;
  MARIA_KEY int_key;
  DBUG_ENTER("sort_maria_ft_key_read");

  if (!sort_param->wordlist)
  {
    /* No pending words: find the next row that yields at least one */
    for (;;)
    {
      free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE));
      if ((error=sort_get_next_record(sort_param)))
        DBUG_RETURN(error);
      if ((error= _ma_sort_write_record(sort_param)))
        DBUG_RETURN(error);
      if (!(wptr= _ma_ft_parserecord(info,sort_param->key,sort_param->record,
                                     &sort_param->wordroot)))

        DBUG_RETURN(1);
      if (wptr->pos)
        break;
    }
    sort_param->wordptr=sort_param->wordlist=wptr;
  }
  else
  {
    /* Continue with the word list of the row read by an earlier call */
    error=0;
    wptr=(FT_WORD*)(sort_param->wordptr);
  }

  _ma_ft_make_key(info, &int_key, sort_param->key, key, wptr++,
                  sort_param->current_filepos);
  sort_param->real_key_length= int_key.data_length + int_key.ref_length;

#ifdef HAVE_valgrind
  if (sort_param->key_length > sort_param->real_key_length)
    bzero(key+sort_param->real_key_length,
          (sort_param->key_length-sort_param->real_key_length));
#endif
  if (!wptr->pos)
  {
    /* That was the last word of this row */
    free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE));
    sort_param->wordlist=0;
  }
  else
    sort_param->wordptr=(void*)wptr;

  DBUG_RETURN(error);
} /* sort_maria_ft_key_read */


/* + Read next record from file using parameters in sort_info. + + SYNOPSIS + sort_get_next_record() + sort_param Information about and for the sort process + + NOTES + Dynamic Records With Non-Quick Parallel Repair + + For non-quick parallel repair we use a synchronized read/write + cache. 
This means that one thread is the master who fixes the data + file by reading each record from the old data file and writing it + to the new data file. By doing this the records in the new data + file are written contiguously. Whenever the write buffer is full, + it is copied to the read buffer. The slaves read from the read + buffer, which is not associated with a file. Thus read_cache.file + is -1. When using _mi_read_cache(), the slaves must always set + flag to READING_NEXT so that the function never tries to read from + file. This is safe because the records are contiguous. There is no + need to read outside the cache. This condition is evaluated in the + variable 'parallel_flag' for quick reference. read_cache.file must + be >= 0 in every other case. + + RETURN + -1 end of file + 0 ok + sort_param->current_filepos points to record position. + sort_param->record contains record + sort_param->max_pos contains position to last byte read + > 0 error +*/ + +static int sort_get_next_record(MARIA_SORT_PARAM *sort_param) +{ + int searching; + int parallel_flag; + uint found_record,b_type,left_length; + my_off_t pos; + MARIA_BLOCK_INFO block_info; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; + MARIA_HA *info=sort_info->info; + MARIA_SHARE *share= info->s; + char llbuff[22],llbuff2[22]; + DBUG_ENTER("sort_get_next_record"); + + if (_ma_killed_ptr(param)) + DBUG_RETURN(1); + + switch (sort_info->org_data_file_type) { + case BLOCK_RECORD: + { + for (;;) + { + int flag; + /* + Assume table is transactional and it had LSN pages in the + cache. Repair has flushed them, left data pages stay in + cache, and disabled transactionality (so share's current page + type is PLAIN); page cache would assert if it finds a cached LSN page + while _ma_scan_block_record() requested a PLAIN page. So we use + UNKNOWN. 
+ */ + enum pagecache_page_type save_page_type= share->page_type; + share->page_type= PAGECACHE_READ_UNKNOWN_PAGE; + if (info != sort_info->new_info) + { + /* Safe scanning */ + flag= _ma_safe_scan_block_record(sort_info, info, + sort_param->record); + } + else + { + /* + Scan on clean table. + It requires a reliable data_file_length so we set it. + */ + share->state.state.data_file_length= sort_info->filelength; + info->cur_row.trid= 0; + flag= _ma_scan_block_record(info, sort_param->record, + info->cur_row.nextpos, 1); + set_if_bigger(param->max_found_trid, info->cur_row.trid); + if (info->cur_row.trid > param->max_trid) + { + _ma_check_print_not_visible_error(param, info->cur_row.trid); + flag= HA_ERR_ROW_NOT_VISIBLE; + } + } + share->page_type= save_page_type; + if (!flag) + { + if (sort_param->calc_checksum) + { + ha_checksum checksum; + checksum= (*share->calc_check_checksum)(info, sort_param->record); + if (share->calc_checksum && + info->cur_row.checksum != (checksum & 255)) + { + if (param->testflag & T_VERBOSE) + { + record_pos_to_txt(info, info->cur_row.lastpos, llbuff); + _ma_check_print_info(param, + "Found record with wrong checksum at %s", + llbuff); + } + continue; + } + info->cur_row.checksum= checksum; + param->glob_crc+= checksum; + } + sort_param->start_recpos= sort_param->current_filepos= + info->cur_row.lastpos; + DBUG_RETURN(0); + } + if (flag == HA_ERR_END_OF_FILE) + { + sort_param->max_pos= share->state.state.data_file_length; + DBUG_RETURN(-1); + } + /* Retry only if wrong record, not if disk error */ + if (flag != HA_ERR_WRONG_IN_RECORD) + { + retry_if_quick(sort_param, flag); + DBUG_RETURN(flag); + } + } + break; /* Impossible */ + } + case STATIC_RECORD: + for (;;) + { + if (my_b_read(&sort_param->read_cache,sort_param->record, + share->base.pack_reclength)) + { + if (sort_param->read_cache.error) + param->out_flag |= O_DATA_LOST; + retry_if_quick(sort_param, my_errno); + DBUG_RETURN(-1); + } + sort_param->start_recpos=sort_param->pos; 
+ if (!sort_param->fix_datafile) + { + sort_param->current_filepos= sort_param->pos; + if (sort_param->master) + share->state.split++; + } + sort_param->max_pos=(sort_param->pos+=share->base.pack_reclength); + if (*sort_param->record) + { + if (sort_param->calc_checksum) + param->glob_crc+= (info->cur_row.checksum= + _ma_static_checksum(info,sort_param->record)); + DBUG_RETURN(0); + } + if (!sort_param->fix_datafile && sort_param->master) + { + share->state.state.del++; + share->state.state.empty+=share->base.pack_reclength; + } + } + case DYNAMIC_RECORD: + { + uchar *to; + ha_checksum checksum= 0; + LINT_INIT(to); + + pos=sort_param->pos; + searching=(sort_param->fix_datafile && (param->testflag & T_EXTEND)); + parallel_flag= (sort_param->read_cache.file < 0) ? READING_NEXT : 0; + for (;;) + { + found_record=block_info.second_read= 0; + left_length=1; + if (searching) + { + pos=MY_ALIGN(pos,MARIA_DYN_ALIGN_SIZE); + param->testflag|=T_RETRY_WITHOUT_QUICK; + sort_param->start_recpos=pos; + } + do + { + if (pos > sort_param->max_pos) + sort_param->max_pos=pos; + if (pos & (MARIA_DYN_ALIGN_SIZE-1)) + { + if ((param->testflag & T_VERBOSE) || searching == 0) + _ma_check_print_info(param,"Wrong aligned block at %s", + llstr(pos,llbuff)); + if (searching) + goto try_next; + } + if (found_record && pos == param->search_after_block) + _ma_check_print_info(param,"Block: %s used by record at %s", + llstr(param->search_after_block,llbuff), + llstr(sort_param->start_recpos,llbuff2)); + if (_ma_read_cache(&sort_param->read_cache, + block_info.header, pos, + MARIA_BLOCK_INFO_HEADER_LENGTH, + (! found_record ? READING_NEXT : 0) | + parallel_flag | READING_HEADER)) + { + if (found_record) + { + _ma_check_print_info(param, + "Can't read whole record at %s (errno: %d)", + llstr(sort_param->start_recpos,llbuff),errno); + goto try_next; + } + DBUG_RETURN(-1); + } + if (searching && ! 
sort_param->fix_datafile) + { + param->error_printed=1; + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + DBUG_RETURN(1); /* Something wrong with data */ + } + b_type= _ma_get_block_info(&block_info,-1,pos); + if ((b_type & (BLOCK_ERROR | BLOCK_FATAL_ERROR)) || + ((b_type & BLOCK_FIRST) && + (block_info.rec_len < (uint) share->base.min_pack_length || + block_info.rec_len > (uint) share->base.max_pack_length))) + { + uint i; + if (param->testflag & T_VERBOSE || searching == 0) + _ma_check_print_info(param, + "Wrong bytesec: %3d-%3d-%3d at %10s; Skipped", + block_info.header[0],block_info.header[1], + block_info.header[2],llstr(pos,llbuff)); + if (found_record) + goto try_next; + block_info.second_read=0; + searching=1; + /* Search after block in read header string */ + for (i=MARIA_DYN_ALIGN_SIZE ; + i < MARIA_BLOCK_INFO_HEADER_LENGTH ; + i+= MARIA_DYN_ALIGN_SIZE) + if (block_info.header[i] >= 1 && + block_info.header[i] <= MARIA_MAX_DYN_HEADER_BYTE) + break; + pos+=(ulong) i; + sort_param->start_recpos=pos; + continue; + } + if (b_type & BLOCK_DELETED) + { + my_bool error=0; + if (block_info.block_len+ (uint) (block_info.filepos-pos) < + share->base.min_block_length) + { + if (!searching) + _ma_check_print_info(param, + "Deleted block with impossible length %lu " + "at %s", + block_info.block_len,llstr(pos,llbuff)); + error=1; + } + else + { + if ((block_info.next_filepos != HA_OFFSET_ERROR && + block_info.next_filepos >= + share->state.state.data_file_length) || + (block_info.prev_filepos != HA_OFFSET_ERROR && + block_info.prev_filepos >= + share->state.state.data_file_length)) + { + if (!searching) + _ma_check_print_info(param, + "Delete link points outside datafile at " + "%s", + llstr(pos,llbuff)); + error=1; + } + } + if (error) + { + if (found_record) + goto try_next; + searching=1; + pos+= MARIA_DYN_ALIGN_SIZE; + sort_param->start_recpos=pos; + block_info.second_read=0; + continue; + } + } + else + { + if (block_info.block_len+ (uint) 
(block_info.filepos-pos) < + share->base.min_block_length || + block_info.block_len > (uint) share->base.max_pack_length+ + MARIA_SPLIT_LENGTH) + { + if (!searching) + _ma_check_print_info(param, + "Found block with impossible length %lu " + "at %s; Skipped", + block_info.block_len+ + (uint) (block_info.filepos-pos), + llstr(pos,llbuff)); + if (found_record) + goto try_next; + searching=1; + pos+= MARIA_DYN_ALIGN_SIZE; + sort_param->start_recpos=pos; + block_info.second_read=0; + continue; + } + } + if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR)) + { + if (!sort_param->fix_datafile && sort_param->master && + (b_type & BLOCK_DELETED)) + { + share->state.state.empty+=block_info.block_len; + share->state.state.del++; + share->state.split++; + } + if (found_record) + goto try_next; + if (searching) + { + pos+=MARIA_DYN_ALIGN_SIZE; + sort_param->start_recpos=pos; + } + else + pos=block_info.filepos+block_info.block_len; + block_info.second_read=0; + continue; + } + + if (!sort_param->fix_datafile && sort_param->master) + share->state.split++; + if (! 
found_record++) + { + sort_param->find_length=left_length=block_info.rec_len; + sort_param->start_recpos=pos; + if (!sort_param->fix_datafile) + sort_param->current_filepos= sort_param->start_recpos; + if (sort_param->fix_datafile && (param->testflag & T_EXTEND)) + sort_param->pos=block_info.filepos+1; + else + sort_param->pos=block_info.filepos+block_info.block_len; + if (share->base.blobs) + { + if (_ma_alloc_buffer(&sort_param->rec_buff, + &sort_param->rec_buff_size, + block_info.rec_len + + share->base.extra_rec_buff_size)) + + { + if (param->max_record_length >= block_info.rec_len) + { + _ma_check_print_error(param,"Not enough memory for blob at %s " + "(need %lu)", + llstr(sort_param->start_recpos,llbuff), + (ulong) block_info.rec_len); + DBUG_RETURN(1); + } + else + { + _ma_check_print_info(param,"Not enough memory for blob at %s " + "(need %lu); Row skipped", + llstr(sort_param->start_recpos,llbuff), + (ulong) block_info.rec_len); + goto try_next; + } + } + } + to= sort_param->rec_buff; + } + if (left_length < block_info.data_len || ! block_info.data_len) + { + _ma_check_print_info(param, + "Found block with too small length at %s; " + "Skipped", + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + if (block_info.filepos + block_info.data_len > + sort_param->read_cache.end_of_file) + { + _ma_check_print_info(param, + "Found block that points outside data file " + "at %s", + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + /* + Copy information that is already read. Avoid accessing data + below the cache start. This could happen if the header + streched over the end of the previous buffer contents. 
+ */ + { + uint header_len= (uint) (block_info.filepos - pos); + uint prefetch_len= (MARIA_BLOCK_INFO_HEADER_LENGTH - header_len); + + if (prefetch_len > block_info.data_len) + prefetch_len= block_info.data_len; + if (prefetch_len) + { + memcpy(to, block_info.header + header_len, prefetch_len); + block_info.filepos+= prefetch_len; + block_info.data_len-= prefetch_len; + left_length-= prefetch_len; + to+= prefetch_len; + } + } + if (block_info.data_len && + _ma_read_cache(&sort_param->read_cache,to,block_info.filepos, + block_info.data_len, + (found_record == 1 ? READING_NEXT : 0) | + parallel_flag)) + { + _ma_check_print_info(param, + "Read error for block at: %s (error: %d); " + "Skipped", + llstr(block_info.filepos,llbuff),my_errno); + goto try_next; + } + left_length-=block_info.data_len; + to+=block_info.data_len; + pos=block_info.next_filepos; + if (pos == HA_OFFSET_ERROR && left_length) + { + _ma_check_print_info(param, + "Wrong block with wrong total length " + "starting at %s", + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + if (pos + MARIA_BLOCK_INFO_HEADER_LENGTH > + sort_param->read_cache.end_of_file) + { + _ma_check_print_info(param, + "Found link that points at %s (outside data " + "file) at %s", + llstr(pos,llbuff2), + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + } while (left_length); + + if (_ma_rec_unpack(info,sort_param->record,sort_param->rec_buff, + sort_param->find_length) != MY_FILE_ERROR) + { + if (sort_param->read_cache.error < 0) + DBUG_RETURN(1); + if (sort_param->calc_checksum) + checksum= (share->calc_check_checksum)(info, sort_param->record); + if ((param->testflag & (T_EXTEND | T_REP)) || searching) + { + if (_ma_rec_check(info, sort_param->record, sort_param->rec_buff, + sort_param->find_length, + (param->testflag & T_QUICK) && + sort_param->calc_checksum && + test(share->calc_checksum), checksum)) + { + _ma_check_print_info(param,"Found wrong packed record at %s", + 
llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + } + if (sort_param->calc_checksum) + param->glob_crc+= checksum; + DBUG_RETURN(0); + } + if (!searching) + _ma_check_print_info(param,"Key %d - Found wrong stored record at %s", + sort_param->key+1, + llstr(sort_param->start_recpos,llbuff)); + try_next: + pos=(sort_param->start_recpos+=MARIA_DYN_ALIGN_SIZE); + searching=1; + } + } + case COMPRESSED_RECORD: + for (searching=0 ;; searching=1, sort_param->pos++) + { + if (_ma_read_cache(&sort_param->read_cache, block_info.header, + sort_param->pos, + share->pack.ref_length,READING_NEXT)) + DBUG_RETURN(-1); + if (searching && ! sort_param->fix_datafile) + { + param->error_printed=1; + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + DBUG_RETURN(1); /* Something wrong with data */ + } + sort_param->start_recpos=sort_param->pos; + if (_ma_pack_get_block_info(info, &sort_param->bit_buff, &block_info, + &sort_param->rec_buff, + &sort_param->rec_buff_size, -1, + sort_param->pos)) + DBUG_RETURN(-1); + if (!block_info.rec_len && + sort_param->pos + MEMMAP_EXTRA_MARGIN == + sort_param->read_cache.end_of_file) + DBUG_RETURN(-1); + if (block_info.rec_len < (uint) share->min_pack_length || + block_info.rec_len > (uint) share->max_pack_length) + { + if (! searching) + _ma_check_print_info(param, + "Found block with wrong recordlength: %lu " + "at %s\n", + block_info.rec_len, + llstr(sort_param->pos,llbuff)); + continue; + } + if (_ma_read_cache(&sort_param->read_cache, sort_param->rec_buff, + block_info.filepos, block_info.rec_len, + READING_NEXT)) + { + if (! searching) + _ma_check_print_info(param,"Couldn't read whole record from %s", + llstr(sort_param->pos,llbuff)); + continue; + } +#ifdef HAVE_valgrind + bzero(sort_param->rec_buff + block_info.rec_len, + share->base.extra_rec_buff_size); +#endif + if (_ma_pack_rec_unpack(info, &sort_param->bit_buff, sort_param->record, + sort_param->rec_buff, block_info.rec_len)) + { + if (! 
searching) + _ma_check_print_info(param,"Found wrong record at %s", + llstr(sort_param->pos,llbuff)); + continue; + } + if (!sort_param->fix_datafile) + { + sort_param->current_filepos= sort_param->pos; + if (sort_param->master) + share->state.split++; + } + sort_param->max_pos= (sort_param->pos=block_info.filepos+ + block_info.rec_len); + info->packed_length=block_info.rec_len; + + if (sort_param->calc_checksum) + { + info->cur_row.checksum= (*share->calc_check_checksum)(info, + sort_param-> + record); + param->glob_crc+= info->cur_row.checksum; + } + DBUG_RETURN(0); + } + } + DBUG_RETURN(1); /* Impossible */ +} + + +/** + @brief Write record to new file. + + @fn _ma_sort_write_record() + @param sort_param Sort parameters. + + @note + This is only called by a master thread if parallel repair is used. + + @return + @retval 0 OK + sort_param->current_filepos points to inserted record for + block_records and to the place for the next record for + other row types. + sort_param->filepos points to end of file + @retval 1 Error +*/ + +int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param) +{ + int flag; + uint length; + ulong block_length,reclength; + uchar *from; + uchar block_buff[8]; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param= sort_info->param; + MARIA_HA *info= sort_info->new_info; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_sort_write_record"); + + if (sort_param->fix_datafile) + { + sort_param->current_filepos= sort_param->filepos; + switch (sort_info->new_data_file_type) { + case BLOCK_RECORD: + if ((sort_param->current_filepos= + (*share->write_record_init)(info, sort_param->record)) == + HA_OFFSET_ERROR) + DBUG_RETURN(1); + /* Pointer to end of file */ + sort_param->filepos= share->state.state.data_file_length; + break; + case STATIC_RECORD: + if (my_b_write(&info->rec_cache,sort_param->record, + share->base.pack_reclength)) + { + _ma_check_print_error(param,"%d when writing to datafile",my_errno); + DBUG_RETURN(1); + } + 
sort_param->filepos+=share->base.pack_reclength; + share->state.split++; + break; + case DYNAMIC_RECORD: + if (! info->blobs) + from=sort_param->rec_buff; + else + { + /* must be sure that local buffer is big enough */ + reclength=share->base.pack_reclength+ + _ma_calc_total_blob_length(info,sort_param->record)+ + ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+ + MARIA_DYN_DELETE_BLOCK_HEADER; + if (sort_info->buff_length < reclength) + { + if (!(sort_info->buff=my_realloc(sort_info->buff, (uint) reclength, + MYF(MY_FREE_ON_ERROR | + MY_ALLOW_ZERO_PTR)))) + DBUG_RETURN(1); + sort_info->buff_length=reclength; + } + from= (uchar *) sort_info->buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER); + } + /* We can use info->checksum here as only one thread calls this */ + info->cur_row.checksum= (*share->calc_check_checksum)(info, + sort_param-> + record); + reclength= _ma_rec_pack(info,from,sort_param->record); + flag=0; + + do + { + block_length=reclength+ 3 + test(reclength >= (65520-3)); + if (block_length < share->base.min_block_length) + block_length=share->base.min_block_length; + info->update|=HA_STATE_WRITE_AT_END; + block_length=MY_ALIGN(block_length,MARIA_DYN_ALIGN_SIZE); + if (block_length > MARIA_MAX_BLOCK_LENGTH) + block_length=MARIA_MAX_BLOCK_LENGTH; + if (_ma_write_part_record(info,0L,block_length, + sort_param->filepos+block_length, + &from,&reclength,&flag)) + { + _ma_check_print_error(param,"%d when writing to datafile",my_errno); + DBUG_RETURN(1); + } + sort_param->filepos+=block_length; + share->state.split++; + } while (reclength); + break; + case COMPRESSED_RECORD: + reclength=info->packed_length; + length= _ma_save_pack_length((uint) share->pack.version, block_buff, + reclength); + if (share->base.blobs) + length+= _ma_save_pack_length((uint) share->pack.version, + block_buff + length, info->blob_length); + if (my_b_write(&info->rec_cache,block_buff,length) || + my_b_write(&info->rec_cache, sort_param->rec_buff, reclength)) + { + 
_ma_check_print_error(param,"%d when writing to datafile",my_errno); + DBUG_RETURN(1); + } + sort_param->filepos+=reclength+length; + share->state.split++; + break; + } + } + if (sort_param->master) + { + share->state.state.records++; + if ((param->testflag & T_WRITE_LOOP) && + (share->state.state.records % WRITE_COUNT) == 0) + { + char llbuff[22]; + printf("%s\r", llstr(share->state.state.records,llbuff)); + VOID(fflush(stdout)); + } + } + DBUG_RETURN(0); +} /* _ma_sort_write_record */ + + +/* Compare two keys from _ma_create_index_by_sort */ + +static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a, + const void *b) +{ + uint not_used[2]; + return (ha_key_cmp(sort_param->seg, *((uchar* const *) a), + *((uchar* const *) b), + USE_WHOLE_KEY, SEARCH_SAME, not_used)); +} /* sort_key_cmp */ + + +static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a) +{ + uint diff_pos[2]; + char llbuff[22],llbuff2[22]; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param= sort_info->param; + int cmp; + + if (sort_info->key_block->inited) + { + cmp= ha_key_cmp(sort_param->seg, sort_info->key_block->lastkey, + a, USE_WHOLE_KEY, + SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT, + diff_pos); + if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL) + ha_key_cmp(sort_param->seg, sort_info->key_block->lastkey, + a, USE_WHOLE_KEY, + SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diff_pos); + else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS) + { + diff_pos[0]= maria_collect_stats_nonulls_next(sort_param->seg, + sort_param->notnull, + sort_info->key_block->lastkey, + a); + } + sort_param->unique[diff_pos[0]-1]++; + } + else + { + cmp= -1; + if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS) + maria_collect_stats_nonulls_first(sort_param->seg, sort_param->notnull, + a); + } + if ((sort_param->keyinfo->flag & HA_NOSAME) && cmp == 0) + { + sort_info->dupp++; + sort_info->info->cur_row.lastpos= get_record_for_key(sort_param->keyinfo, + 
a); + _ma_check_print_warning(param, + "Duplicate key %2u for record at %10s against " + "record at %10s", + sort_param->key + 1, + llstr(sort_info->info->cur_row.lastpos, llbuff), + llstr(get_record_for_key(sort_param->keyinfo, + sort_info->key_block-> + lastkey), + llbuff2)); + param->testflag|=T_RETRY_WITHOUT_QUICK; + if (sort_info->param->testflag & T_VERBOSE) + _ma_print_keydata(stdout,sort_param->seg, a, USE_WHOLE_KEY); + return (sort_delete_record(sort_param)); + } +#ifndef DBUG_OFF + if (cmp > 0) + { + _ma_check_print_error(param, + "Internal error: Keys are not in order from sort"); + return(1); + } +#endif + return (sort_insert_key(sort_param, sort_info->key_block, + a, HA_OFFSET_ERROR)); +} /* sort_key_write */ + + +int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param) +{ + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + SORT_KEY_BLOCKS *key_block=sort_info->key_block; + MARIA_SHARE *share=sort_info->info->s; + uint val_off, val_len; + int error; + SORT_FT_BUF *maria_ft_buf=sort_info->ft_buf; + uchar *from, *to; + + val_len=share->ft2_keyinfo.keylength; + get_key_full_length_rdonly(val_off, maria_ft_buf->lastkey); + to= maria_ft_buf->lastkey+val_off; + + if (maria_ft_buf->buf) + { + /* flushing first-level tree */ + error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey, + HA_OFFSET_ERROR); + for (from=to+val_len; + !error && from < maria_ft_buf->buf; + from+= val_len) + { + memcpy(to, from, val_len); + error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey, + HA_OFFSET_ERROR); + } + return error; + } + /* flushing second-level tree keyblocks */ + error=_ma_flush_pending_blocks(sort_param); + /* updating lastkey with second-level tree info */ + ft_intXstore(maria_ft_buf->lastkey+val_off, -maria_ft_buf->count); + _ma_dpointer(sort_info->info->s, maria_ft_buf->lastkey+val_off+HA_FT_WLEN, + share->state.key_root[sort_param->key]); + /* restoring first level tree data in sort_info/sort_param */ + 
sort_info->key_block=sort_info->key_block_end- sort_info->param->sort_key_blocks; + sort_param->keyinfo=share->keyinfo+sort_param->key; + share->state.key_root[sort_param->key]=HA_OFFSET_ERROR; + /* writing lastkey in first-level tree */ + return error ? error : + sort_insert_key(sort_param,sort_info->key_block, + maria_ft_buf->lastkey,HA_OFFSET_ERROR); +} + + +static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param, + const uchar *a) +{ + uint a_len, val_off, val_len, error; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + SORT_FT_BUF *ft_buf= sort_info->ft_buf; + SORT_KEY_BLOCKS *key_block= sort_info->key_block; + MARIA_SHARE *share= sort_info->info->s; + + val_len=HA_FT_WLEN+share->base.rec_reflength; + get_key_full_length_rdonly(a_len, a); + + if (!ft_buf) + { + /* + use two-level tree only if key_reflength fits in rec_reflength place + and row format is NOT static - for _ma_dpointer not to garble offsets + */ + if ((share->base.key_reflength <= + share->base.rec_reflength) && + (share->options & + (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD))) + ft_buf= (SORT_FT_BUF *)my_malloc(sort_param->keyinfo->block_length + + sizeof(SORT_FT_BUF), MYF(MY_WME)); + + if (!ft_buf) + { + sort_param->key_write=sort_key_write; + return sort_key_write(sort_param, a); + } + sort_info->ft_buf= ft_buf; + goto word_init_ft_buf; /* no need to duplicate the code */ + } + get_key_full_length_rdonly(val_off, ft_buf->lastkey); + + if (ha_compare_text(sort_param->seg->charset, + a+1,a_len-1, + ft_buf->lastkey+1,val_off-1, 0, 0)==0) + { + uchar *p; + if (!ft_buf->buf) /* store in second-level tree */ + { + ft_buf->count++; + return sort_insert_key(sort_param,key_block, + a + a_len, HA_OFFSET_ERROR); + } + + /* storing the key in the buffer. 
*/ + memcpy (ft_buf->buf, (const char *)a+a_len, val_len); + ft_buf->buf+=val_len; + if (ft_buf->buf < ft_buf->end) + return 0; + + /* converting to two-level tree */ + p=ft_buf->lastkey+val_off; + + while (key_block->inited) + key_block++; + sort_info->key_block=key_block; + sort_param->keyinfo= &share->ft2_keyinfo; + ft_buf->count=(ft_buf->buf - p)/val_len; + + /* flushing buffer to second-level tree */ + for (error=0; !error && p < ft_buf->buf; p+= val_len) + error=sort_insert_key(sort_param,key_block,p,HA_OFFSET_ERROR); + ft_buf->buf=0; + return error; + } + + /* flushing buffer */ + if ((error=_ma_sort_ft_buf_flush(sort_param))) + return error; + +word_init_ft_buf: + a_len+=val_len; + memcpy(ft_buf->lastkey, a, a_len); + ft_buf->buf=ft_buf->lastkey+a_len; + /* + 32 is just a safety margin here + (at least max(val_len, sizeof(nod_flag)) should be there). + May be better performance could be achieved if we'd put + (sort_info->keyinfo->block_length-32)/XXX + instead. + TODO: benchmark the best value for XXX. 
+ */ + ft_buf->end= ft_buf->lastkey+ (sort_param->keyinfo->block_length-32); + return 0; +} /* sort_maria_ft_key_write */ + + +/* get pointer to record from a key */ + +static my_off_t get_record_for_key(MARIA_KEYDEF *keyinfo, + const uchar *key_data) +{ + MARIA_KEY key; + key.keyinfo= keyinfo; + key.data= (uchar*) key_data; + key.data_length= _ma_keylength(keyinfo, key_data); + return _ma_row_pos_from_key(&key); +} /* get_record_for_key */ + + +/* Insert a key in sort-key-blocks */ + +static int sort_insert_key(MARIA_SORT_PARAM *sort_param, + register SORT_KEY_BLOCKS *key_block, + const uchar *key, + my_off_t prev_block) +{ + uint a_length,t_length,nod_flag; + my_off_t filepos,key_file_length; + uchar *anc_buff,*lastkey; + MARIA_KEY_PARAM s_temp; + MARIA_KEYDEF *keyinfo=sort_param->keyinfo; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + HA_CHECK *param=sort_info->param; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + MARIA_KEY tmp_key; + MARIA_HA *info= sort_info->info; + MARIA_SHARE *share= info->s; + DBUG_ENTER("sort_insert_key"); + + anc_buff= key_block->buff; + lastkey=key_block->lastkey; + nod_flag= (key_block == sort_info->key_block ? 
0 : + share->base.key_reflength); + + if (!key_block->inited) + { + key_block->inited=1; + if (key_block == sort_info->key_block_end) + { + _ma_check_print_error(param, + "To many key-block-levels; " + "Try increasing sort_key_blocks"); + DBUG_RETURN(1); + } + a_length= share->keypage_header + nod_flag; + key_block->end_pos= anc_buff + share->keypage_header; + bzero(anc_buff, share->keypage_header); + _ma_store_keynr(share, anc_buff, (uint) (sort_param->keyinfo - + share->keyinfo)); + lastkey=0; /* No previous key in block */ + } + else + a_length= _ma_get_page_used(share, anc_buff); + + /* Save pointer to previous block */ + if (nod_flag) + { + _ma_store_keypage_flag(share, anc_buff, KEYPAGE_FLAG_ISNOD); + _ma_kpointer(info,key_block->end_pos,prev_block); + } + + tmp_key.keyinfo= keyinfo; + tmp_key.data= (uchar*) key; + tmp_key.data_length= _ma_keylength(keyinfo, key) - share->base.rec_reflength; + tmp_key.ref_length= share->base.rec_reflength; + + t_length= (*keyinfo->pack_key)(&tmp_key, nod_flag, + (uchar*) 0, lastkey, lastkey, &s_temp); + (*keyinfo->store_key)(keyinfo, key_block->end_pos+nod_flag,&s_temp); + a_length+=t_length; + _ma_store_page_used(share, anc_buff, a_length); + key_block->end_pos+=t_length; + if (a_length <= share->max_index_block_size) + { + MARIA_KEY tmp_key2; + tmp_key2.data= key_block->lastkey; + _ma_copy_key(&tmp_key2, &tmp_key); + key_block->last_length=a_length-t_length; + DBUG_RETURN(0); + } + + /* Fill block with end-zero and write filled block */ + _ma_store_page_used(share, anc_buff, key_block->last_length); + bzero(anc_buff+key_block->last_length, + keyinfo->block_length- key_block->last_length); + key_file_length=share->state.state.key_file_length; + if ((filepos= _ma_new(info, DFLT_INIT_HITS, &page_link)) == HA_OFFSET_ERROR) + DBUG_RETURN(1); + _ma_fast_unlock_key_del(info); + + /* If we read the page from the key cache, we have to write it back to it */ + if (page_link->changed) + { + MARIA_PAGE page; + 
pop_dynamic(&info->pinned_pages); + _ma_page_setup(&page, info, keyinfo, filepos, anc_buff); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_WRITE_UNLOCK, DFLT_INIT_HITS)) + DBUG_RETURN(1); + } + else + { + put_crc(anc_buff, filepos, share); + if (my_pwrite(share->kfile.file, anc_buff, + (uint) keyinfo->block_length, filepos, param->myf_rw)) + DBUG_RETURN(1); + } + DBUG_DUMP("buff", anc_buff, _ma_get_page_used(share, anc_buff)); + + /* Write separator-key to block in next level */ + if (sort_insert_key(sort_param,key_block+1,key_block->lastkey,filepos)) + DBUG_RETURN(1); + + /* clear old block and write new key in it */ + key_block->inited=0; + DBUG_RETURN(sort_insert_key(sort_param, key_block,key,prev_block)); +} /* sort_insert_key */ + + +/* Delete record when we found a duplicated key */ + +static int sort_delete_record(MARIA_SORT_PARAM *sort_param) +{ + uint i; + int old_file,error; + uchar *key; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; + MARIA_HA *row_info= sort_info->new_info, *key_info= sort_info->info; + DBUG_ENTER("sort_delete_record"); + + if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK) + { + _ma_check_print_error(param, + "Quick-recover aborted; Run recovery without switch " + "-q or with switch -qq"); + DBUG_RETURN(1); + } + if (key_info->s->options & HA_OPTION_COMPRESS_RECORD) + { + _ma_check_print_error(param, + "Recover aborted; Can't run standard recovery on " + "compressed tables with errors in data-file. 
" + "Use 'aria_chk --safe-recover' to fix it"); + DBUG_RETURN(1); + } + + old_file= row_info->dfile.file; + /* This only affects static and dynamic row formats */ + row_info->dfile.file= row_info->rec_cache.file; + if (flush_io_cache(&row_info->rec_cache)) + DBUG_RETURN(1); + + key= key_info->lastkey_buff + key_info->s->base.max_key_length; + if ((error=(*row_info->s->read_record)(row_info, sort_param->record, + key_info->cur_row.lastpos)) && + error != HA_ERR_RECORD_DELETED) + { + _ma_check_print_error(param,"Can't read record to be removed"); + row_info->dfile.file= old_file; + DBUG_RETURN(1); + } + row_info->cur_row.lastpos= key_info->cur_row.lastpos; + + for (i=0 ; i < sort_info->current_key ; i++) + { + MARIA_KEY tmp_key; + (*key_info->s->keyinfo[i].make_key)(key_info, &tmp_key, i, key, + sort_param->record, + key_info->cur_row.lastpos, 0); + if (_ma_ck_delete(key_info, &tmp_key)) + { + _ma_check_print_error(param, + "Can't delete key %d from record to be removed", + i+1); + row_info->dfile.file= old_file; + DBUG_RETURN(1); + } + } + if (sort_param->calc_checksum) + param->glob_crc-=(*key_info->s->calc_check_checksum)(key_info, + sort_param->record); + error= (*row_info->s->delete_record)(row_info, sort_param->record); + if (error) + _ma_check_print_error(param,"Got error %d when deleting record", + my_errno); + row_info->dfile.file= old_file; /* restore actual value */ + row_info->s->state.state.records--; + DBUG_RETURN(error); +} /* sort_delete_record */ + + +/* Fix all pending blocks and flush everything to disk */ + +int _ma_flush_pending_blocks(MARIA_SORT_PARAM *sort_param) +{ + uint nod_flag,length; + my_off_t filepos,key_file_length; + SORT_KEY_BLOCKS *key_block; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + myf myf_rw=sort_info->param->myf_rw; + MARIA_HA *info=sort_info->info; + MARIA_KEYDEF *keyinfo=sort_param->keyinfo; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + DBUG_ENTER("_ma_flush_pending_blocks"); + + filepos= 
HA_OFFSET_ERROR; /* if empty file */ + nod_flag=0; + for (key_block=sort_info->key_block ; key_block->inited ; key_block++) + { + key_block->inited=0; + length= _ma_get_page_used(info->s, key_block->buff); + if (nod_flag) + _ma_kpointer(info,key_block->end_pos,filepos); + key_file_length= info->s->state.state.key_file_length; + bzero(key_block->buff+length, keyinfo->block_length-length); + if ((filepos= _ma_new(info, DFLT_INIT_HITS, &page_link)) == + HA_OFFSET_ERROR) + goto err; + + /* If we read the page from the key cache, we have to write it back */ + if (page_link->changed) + { + MARIA_PAGE page; + pop_dynamic(&info->pinned_pages); + + _ma_page_setup(&page, info, keyinfo, filepos, key_block->buff); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_WRITE_UNLOCK, + DFLT_INIT_HITS)) + goto err; + } + else + { + put_crc(key_block->buff, filepos, info->s); + if (my_pwrite(info->s->kfile.file, key_block->buff, + (uint) keyinfo->block_length,filepos, myf_rw)) + goto err; + } + DBUG_DUMP("buff",key_block->buff,length); + nod_flag=1; + } + info->s->state.key_root[sort_param->key]=filepos; /* Last is root for tree */ + _ma_fast_unlock_key_del(info); + DBUG_RETURN(0); + +err: + _ma_fast_unlock_key_del(info); + DBUG_RETURN(1); +} /* _ma_flush_pending_blocks */ + + /* alloc space and pointers for key_blocks */ + +static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks, + uint buffer_length) +{ + reg1 uint i; + SORT_KEY_BLOCKS *block; + DBUG_ENTER("alloc_key_blocks"); + + if (!(block= (SORT_KEY_BLOCKS*) my_malloc((sizeof(SORT_KEY_BLOCKS)+ + buffer_length+IO_SIZE)*blocks, + MYF(0)))) + { + _ma_check_print_error(param,"Not enough memory for sort-key-blocks"); + return(0); + } + for (i=0 ; i < blocks ; i++) + { + block[i].inited=0; + block[i].buff= (uchar*) (block+blocks)+(buffer_length+IO_SIZE)*i; + } + DBUG_RETURN(block); +} /* alloc_key_blocks */ + + + /* Check if file is almost full */ + +int maria_test_if_almost_full(MARIA_HA *info) +{ + MARIA_SHARE *share= 
info->s; + + if (share->options & HA_OPTION_COMPRESS_RECORD) + return 0; + return my_seek(share->kfile.file, 0L, MY_SEEK_END, + MYF(MY_THREADSAFE))/10*9 > + (my_off_t) share->base.max_key_file_length || + my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)) / 10 * 9 > + (my_off_t) share->base.max_data_file_length; +} + + +/* Recreate table with bigger more alloced record-data */ + +int maria_recreate_table(HA_CHECK *param, MARIA_HA **org_info, char *filename) +{ + int error; + MARIA_HA info; + MARIA_SHARE share; + MARIA_KEYDEF *keyinfo,*key,*key_end; + HA_KEYSEG *keysegs,*keyseg; + MARIA_COLUMNDEF *columndef,*column,*end; + MARIA_UNIQUEDEF *uniquedef,*u_ptr,*u_end; + MARIA_STATUS_INFO status_info; + uint unpack,key_parts; + ha_rows max_records; + ulonglong file_length,tmp_length; + MARIA_CREATE_INFO create_info; + DBUG_ENTER("maria_recreate_table"); + + error=1; /* Default error */ + info= **org_info; + status_info= (*org_info)->state[0]; + info.state= &status_info; + share= *(*org_info)->s; + unpack= ((share.data_file_type == COMPRESSED_RECORD) && + (param->testflag & T_UNPACK)); + if (!(keyinfo=(MARIA_KEYDEF*) my_alloca(sizeof(MARIA_KEYDEF) * + share.base.keys))) + DBUG_RETURN(0); + memcpy((uchar*) keyinfo,(uchar*) share.keyinfo, + (size_t) (sizeof(MARIA_KEYDEF)*share.base.keys)); + + key_parts= share.base.all_key_parts; + if (!(keysegs=(HA_KEYSEG*) my_alloca(sizeof(HA_KEYSEG)* + (key_parts+share.base.keys)))) + { + my_afree(keyinfo); + DBUG_RETURN(1); + } + if (!(columndef=(MARIA_COLUMNDEF*) + my_alloca(sizeof(MARIA_COLUMNDEF)*(share.base.fields+1)))) + { + my_afree(keyinfo); + my_afree(keysegs); + DBUG_RETURN(1); + } + if (!(uniquedef=(MARIA_UNIQUEDEF*) + my_alloca(sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques+1)))) + { + my_afree(columndef); + my_afree(keyinfo); + my_afree(keysegs); + DBUG_RETURN(1); + } + + /* Copy the column definitions in their original order */ + for (column= share.columndef, end= share.columndef+share.base.fields; + column != end ; + 
column++) + columndef[column->column_nr]= *column; + + /* Change the new key to point at the saved key segments */ + memcpy((uchar*) keysegs,(uchar*) share.keyparts, + (size_t) (sizeof(HA_KEYSEG)*(key_parts+share.base.keys+ + share.state.header.uniques))); + keyseg=keysegs; + for (key=keyinfo,key_end=keyinfo+share.base.keys; key != key_end ; key++) + { + key->seg=keyseg; + for (; keyseg->type ; keyseg++) + { + if (param->language) + keyseg->language=param->language; /* change language */ + } + keyseg++; /* Skip end pointer */ + } + + /* + Copy the unique definitions and change them to point at the new key + segments + */ + memcpy((uchar*) uniquedef,(uchar*) share.uniqueinfo, + (size_t) (sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques))); + for (u_ptr=uniquedef,u_end=uniquedef+share.state.header.uniques; + u_ptr != u_end ; u_ptr++) + { + u_ptr->seg=keyseg; + keyseg+=u_ptr->keysegs+1; + } + + file_length=(ulonglong) my_seek(info.dfile.file, 0L, MY_SEEK_END, MYF(0)); + if (share.options & HA_OPTION_COMPRESS_RECORD) + share.base.records=max_records=info.state->records; + else if (share.base.min_pack_length) + max_records=(ha_rows) (file_length / share.base.min_pack_length); + else + max_records=0; + share.options&= ~HA_OPTION_TEMP_COMPRESS_RECORD; + + tmp_length= file_length+file_length/10; + set_if_bigger(file_length,param->max_data_file_length); + set_if_bigger(file_length,tmp_length); + set_if_bigger(file_length,(ulonglong) share.base.max_data_file_length); + + VOID(maria_close(*org_info)); + + bzero((char*) &create_info,sizeof(create_info)); + create_info.max_rows=max(max_records,share.base.records); + create_info.reloc_rows=share.base.reloc; + create_info.old_options=(share.options | + (unpack ? HA_OPTION_TEMP_COMPRESS_RECORD : 0)); + + create_info.data_file_length=file_length; + create_info.auto_increment=share.state.auto_increment; + create_info.language = (param->language ? 
param->language : + share.state.header.language); + create_info.key_file_length= status_info.key_file_length; + create_info.org_data_file_type= ((enum data_file_type) + share.state.header.org_data_file_type); + + /* + Allow for creating an auto_increment key. This has an effect only if + an auto_increment key exists in the original table. + */ + create_info.with_auto_increment= TRUE; + create_info.null_bytes= share.base.null_bytes; + create_info.transactional= share.base.born_transactional; + + /* + We don't have to handle symlinks here because we are using + HA_DONT_TOUCH_DATA + */ + if (maria_create(filename, share.data_file_type, + share.base.keys - share.state.header.uniques, + keyinfo, share.base.fields, columndef, + share.state.header.uniques, uniquedef, + &create_info, + HA_DONT_TOUCH_DATA)) + { + _ma_check_print_error(param, + "Got error %d when trying to recreate indexfile", + my_errno); + goto end; + } + *org_info= maria_open(filename,O_RDWR, + (HA_OPEN_FOR_REPAIR | + ((param->testflag & T_WAIT_FOREVER) ? + HA_OPEN_WAIT_IF_LOCKED : + (param->testflag & T_DESCRIPT) ? 
+ HA_OPEN_IGNORE_IF_LOCKED : + HA_OPEN_ABORT_IF_LOCKED))); + if (!*org_info) + { + _ma_check_print_error(param, + "Got error %d when trying to open re-created " + "indexfile", my_errno); + goto end; + } + /* We are modifing */ + (*org_info)->s->options&= ~HA_OPTION_READ_ONLY_DATA; + VOID(_ma_readinfo(*org_info,F_WRLCK,0)); + (*org_info)->s->state.state.records= info.state->records; + if (share.state.create_time) + (*org_info)->s->state.create_time=share.state.create_time; +#ifdef EXTERNAL_LOCKING + (*org_info)->s->state.unique= (*org_info)->this_unique= share.state.unique; +#endif + (*org_info)->s->state.state.checksum= info.state->checksum; + (*org_info)->s->state.state.del= info.state->del; + (*org_info)->s->state.dellink= share.state.dellink; + (*org_info)->s->state.state.empty= info.state->empty; + (*org_info)->s->state.state.data_file_length= info.state->data_file_length; + *(*org_info)->state= (*org_info)->s->state.state; + if (maria_update_state_info(param,*org_info,UPDATE_TIME | UPDATE_STAT | + UPDATE_OPEN_COUNT)) + goto end; + error=0; +end: + my_afree(uniquedef); + my_afree(keyinfo); + my_afree(columndef); + my_afree(keysegs); + DBUG_RETURN(error); +} + + + /* write suffix to data file if neaded */ + +int maria_write_data_suffix(MARIA_SORT_INFO *sort_info, my_bool fix_datafile) +{ + MARIA_HA *info=sort_info->new_info; + + if (info->s->data_file_type == COMPRESSED_RECORD && fix_datafile) + { + uchar buff[MEMMAP_EXTRA_MARGIN]; + bzero(buff,sizeof(buff)); + if (my_b_write(&info->rec_cache,buff,sizeof(buff))) + { + _ma_check_print_error(sort_info->param, + "%d when writing to datafile",my_errno); + return 1; + } + sort_info->param->read_cache.end_of_file+=sizeof(buff); + } + return 0; +} + + +/* Update state and maria_chk time of indexfile */ + +int maria_update_state_info(HA_CHECK *param, MARIA_HA *info,uint update) +{ + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_update_state_info"); + + if (update & UPDATE_OPEN_COUNT) + { + share->state.open_count=0; 
+ share->global_changed=0; + } + if (update & UPDATE_STAT) + { + uint i, key_parts= mi_uint2korr(share->state.header.key_parts); + share->state.records_at_analyze= share->state.state.records; + share->state.changed&= ~STATE_NOT_ANALYZED; + if (share->state.state.records) + { + for (i=0; i<key_parts; i++) + { + if (!(share->state.rec_per_key_part[i]=param->new_rec_per_key_part[i])) + share->state.changed|= STATE_NOT_ANALYZED; + } + } + } + if (update & (UPDATE_STAT | UPDATE_SORT | UPDATE_TIME | UPDATE_AUTO_INC)) + { + if (update & UPDATE_TIME) + { + share->state.check_time= time((time_t*) 0); + if (!share->state.create_time) + share->state.create_time= share->state.check_time; + } + if (_ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO)) + goto err; + share->changed=0; + } + { /* Force update of status */ + int error; + uint r_locks=share->r_locks,w_locks=share->w_locks; + share->r_locks= share->w_locks= share->tot_locks= 0; + error= _ma_writeinfo(info,WRITEINFO_NO_UNLOCK); + share->r_locks=r_locks; + share->w_locks=w_locks; + share->tot_locks=r_locks+w_locks; + if (!error) + DBUG_RETURN(0); + } +err: + _ma_check_print_error(param,"%d when updating keyfile",my_errno); + DBUG_RETURN(1); +} + +/* + Update auto increment value for a table + When setting the 'repair_only' flag we only want to change the + old auto_increment value if its wrong (smaller than some given key). + The reason is that we shouldn't change the auto_increment value + for a table without good reason when only doing a repair; If the + user have inserted and deleted rows, the auto_increment value + may be bigger than the biggest current row and this is ok. + + If repair_only is not set, we will update the flag to the value in + param->auto_increment is bigger than the biggest key. 
+*/
+
+/*
+  Implementation notes for _ma_update_auto_increment_key():
+
+  - If the table has no active auto_increment key, only an informational
+    message is printed (unless T_VERY_SILENT is set) and nothing is changed.
+  - The last entry of the auto_increment index is read, with
+    HA_EXTRA_KEYREAD enabled, into a separately allocated record buffer.
+  - An empty index (HA_ERR_END_OF_FILE) is not an error: the stored value is
+    then set to param->auto_increment_value unless repair_only is set.
+  - Any other read error is reported and the function returns without
+    touching the state.
+  - The function finishes by calling maria_update_state_info() with
+    UPDATE_AUTO_INC; the result of that call is not checked here.
+*/
+
+void _ma_update_auto_increment_key(HA_CHECK *param, MARIA_HA *info,
+                                   my_bool repair_only)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *record;
+  DBUG_ENTER("_ma_update_auto_increment_key");
+
+  if (!share->base.auto_key ||
+      ! maria_is_key_active(share->state.key_map, share->base.auto_key - 1))
+  {
+    if (!(param->testflag & T_VERY_SILENT))
+      _ma_check_print_info(param,
+                           "Table: %s doesn't have an auto increment key\n",
+                           param->isam_file_name);
+    DBUG_VOID_RETURN;
+  }
+  if (!(param->testflag & T_SILENT) &&
+      !(param->testflag & T_REP))
+    printf("Updating Aria file: %s\n", param->isam_file_name);
+  /*
+    We have to use an allocated buffer instead of info->rec_buff as
+    _ma_put_key_in_record() may use info->rec_buff
+  */
+  if (!(record= (uchar*) my_malloc((size_t) share->base.default_rec_buff_size,
+                                   MYF(0))))
+  {
+    _ma_check_print_error(param,"Not enough memory for extra record");
+    DBUG_VOID_RETURN;
+  }
+
+  maria_extra(info,HA_EXTRA_KEYREAD,0);
+  if (maria_rlast(info, record, share->base.auto_key-1))
+  {
+    /*
+      Remember the error code right away: the cleanup calls below run before
+      the message is printed and must not be able to change what is reported.
+    */
+    const int read_error= my_errno;
+    if (read_error != HA_ERR_END_OF_FILE)
+    {
+      maria_extra(info,HA_EXTRA_NO_KEYREAD,0);
+      my_free((char*) record, MYF(0));
+      _ma_check_print_error(param,"%d when reading last record",read_error);
+      DBUG_VOID_RETURN;
+    }
+    /* Empty index: there is no key value to compare against */
+    if (!repair_only)
+      share->state.auto_increment=param->auto_increment_value;
+  }
+  else
+  {
+    const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg;
+    ulonglong auto_increment=
+      ma_retrieve_auto_increment(record + keyseg->start, keyseg->type);
+    /* Never lower the stored value, only raise it to the biggest key */
+    set_if_bigger(share->state.auto_increment,auto_increment);
+    if (!repair_only)
+      set_if_bigger(share->state.auto_increment, param->auto_increment_value);
+  }
+  maria_extra(info,HA_EXTRA_NO_KEYREAD,0);
+  my_free((char*) record, MYF(0));
+  maria_update_state_info(param, info, UPDATE_AUTO_INC);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Update statistics for each part of an index
+
+  SYNOPSIS
+    maria_update_key_parts()
+      keyinfo           IN  Index information (only key->keysegs used)
+      rec_per_key_part  OUT Store
statistics here
+      unique            IN  Array of (#distinct tuples)
+      notnull_tuples    IN  Array of (#tuples), or NULL
+      records               Number of records in the table
+
+  DESCRIPTION
+    This function is called to produce index statistics values from the
+    unique and notnull_tuples arrays after these arrays were produced with a
+    sequential index scan (the scan is done in two places: chk_index() and
+    sort_key_write()).
+
+    This function handles all 3 index statistics collection methods.
+
+    Unique is an array:
+      unique[0]= (#different values of {keypart1}) - 1
+      unique[1]= (#different values of {keypart1,keypart2} tuple)-unique[0]-1
+      ...
+
+    For MI_STATS_METHOD_IGNORE_NULLS method, notnull_tuples is an array too:
+      notnull_tuples[0]= (#of {keypart1} tuples such that keypart1 is not NULL)
+      notnull_tuples[1]= (#of {keypart1,keypart2} tuples such that all
+                          keypart{i} are not NULL)
+      ...
+    For all other statistics collection methods notnull_tuples==NULL.
+
+    Output is an array:
+    rec_per_key_part[k] =
+     = E(#records in the table such that keypart_1=c_1 AND ... AND
+         keypart_k=c_k for arbitrary constants c_1 ... c_k)
+
+     = {assuming that values have uniform distribution and index contains all
+        tuples from the domain (or that {c_1, ..., c_k} tuple is chosen from
+        index tuples)}
+
+     = #tuples-in-the-index / #distinct-tuples-in-the-index.
+
+    The #tuples-in-the-index and #distinct-tuples-in-the-index have different
+    meaning depending on which statistics collection method is used:
+
+    MI_STATS_METHOD_*  how are nulls compared?  which tuples are counted?
+ NULLS_EQUAL NULL == NULL all tuples in table + NULLS_NOT_EQUAL NULL != NULL all tuples in table + IGNORE_NULLS n/a tuples that don't have NULLs +*/ + +void maria_update_key_parts(MARIA_KEYDEF *keyinfo, double *rec_per_key_part, + ulonglong *unique, ulonglong *notnull, + ulonglong records) +{ + ulonglong count=0, unique_tuples; + ulonglong tuples= records; + uint parts; + double tmp; + for (parts=0 ; parts < keyinfo->keysegs ; parts++) + { + count+=unique[parts]; + unique_tuples= count + 1; + if (notnull) + { + tuples= notnull[parts]; + /* + #(unique_tuples not counting tuples with NULLs) = + #(unique_tuples counting tuples with NULLs as different) - + #(tuples with NULLs) + */ + unique_tuples -= (records - notnull[parts]); + } + + if (unique_tuples == 0) + tmp= 1; + else if (count == 0) + tmp= ulonglong2double(tuples); /* 1 unique tuple */ + else + tmp= ulonglong2double(tuples) / ulonglong2double(unique_tuples); + + /* + for some weird keys (e.g. FULLTEXT) tmp can be <1 here. + let's ensure it is not + */ + set_if_bigger(tmp,1); + + *rec_per_key_part++= tmp; + } +} + + +static ha_checksum maria_byte_checksum(const uchar *buf, uint length) +{ + ha_checksum crc; + const uchar *end=buf+length; + for (crc=0; buf != end; buf++) + crc=((crc << 1) + *buf) + + test(crc & (((ha_checksum) 1) << (8*sizeof(ha_checksum)-1))); + return crc; +} + +static my_bool maria_too_big_key_for_sort(MARIA_KEYDEF *key, ha_rows rows) +{ + uint key_maxlength=key->maxlength; + if (key->flag & HA_FULLTEXT) + { + uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT* + key->seg->charset->mbmaxlen; + key_maxlength+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN; + } + return (key->flag & HA_SPATIAL) || + (key->flag & (HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY | HA_FULLTEXT) && + ((ulonglong) rows * key_maxlength > + (ulonglong) maria_max_temp_length)); +} + +/* + Deactivate all not unique index that can be recreated fast + These include packed keys on which sorting will use more temporary + space than 
the max allowed file length or for which the unpacked keys
+  will take much more space than packed keys.
+  Note that 'rows' may be zero for the case when we don't know how many
+  rows we will put into the file.
+ */
+
+void maria_disable_non_unique_index(MARIA_HA *info, ha_rows rows)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF    *key=share->keyinfo;
+  uint          i;
+
+  DBUG_ASSERT(share->state.state.records == 0 &&
+              (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES));
+  for (i=0 ; i < share->base.keys ; i++,key++)
+  {
+    /*
+      A key is switched off only if it carries none of the HA_NOSAME,
+      HA_SPATIAL, HA_AUTO_KEY, HA_RTREE_INDEX flags, is not too big to be
+      rebuilt by sorting, and is not the key numbered base.auto_key.
+    */
+    if (!(key->flag &
+          (HA_NOSAME | HA_SPATIAL | HA_AUTO_KEY | HA_RTREE_INDEX)) &&
+        ! maria_too_big_key_for_sort(key,rows) && share->base.auto_key != i+1)
+    {
+      maria_clear_key_active(share->state.key_map, i);
+      info->update|= HA_STATE_CHANGED;
+    }
+  }
+}
+
+
+/*
+  Return TRUE if we can use repair by sorting
+  One can set the force argument to force to use sorting
+  even if the temporary file would be quite big!
+
+  FALSE is returned when no key in key_map is active, or when force is not
+  set and at least one of the share's keys is too big for sorting.
+*/
+
+my_bool maria_test_if_sort_rep(MARIA_HA *info, ha_rows rows,
+                               ulonglong key_map, my_bool force)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *key=share->keyinfo;
+  uint i;
+
+  /*
+    maria_repair_by_sort only works if we have at least one key. If we don't
+    have any keys, we should use the normal repair.
+  */
+  if (! maria_is_any_key_active(key_map))
+    return FALSE;				/* Can't use sort */
+  /* All keys of the share are examined, not only those set in key_map */
+  for (i=0 ; i < share->base.keys ; i++,key++)
+  {
+    if (!force && maria_too_big_key_for_sort(key,rows))
+      return FALSE;
+  }
+  return TRUE;
+}
+
+
+/**
+  @brief Create a new handle for manipulating the new record file
+
+  The table is opened a second time (HA_OPEN_COPY | HA_OPEN_FOR_REPAIR),
+  the new handle is stored in sort_info->new_info and its data file
+  descriptor is replaced by new_file.
+
+  @param  param     sort parameters; param->filepos is set from the new
+                    handle's data_file_length on success
+  @param  new_file  descriptor of the new data file
+
+  @note
+  It's ok for Recovery to have two MARIA_SHARE on the same index file
+  because the one we create here is not transactional
+
+  @return 0 on success, 1 on error
+*/
+
+static my_bool create_new_data_handle(MARIA_SORT_PARAM *param, File new_file)
+{
+
+  MARIA_SORT_INFO *sort_info= param->sort_info;
+  MARIA_HA *info= sort_info->info;
+  MARIA_HA *new_info;
+  DBUG_ENTER("create_new_data_handle");
+
+  if (!(sort_info->new_info= maria_open(info->s->open_file_name.str, O_RDWR,
+                                        HA_OPEN_COPY | HA_OPEN_FOR_REPAIR)))
+    DBUG_RETURN(1);
+
+  new_info= sort_info->new_info;
+  _ma_bitmap_set_pagecache_callbacks(&new_info->s->bitmap.file,
+                                     new_info->s);
+  _ma_set_data_pagecache_callbacks(&new_info->dfile, new_info->s);
+  change_data_file_descriptor(new_info, new_file);
+  maria_lock_database(new_info, F_EXTRA_LCK);
+  if ((sort_info->param->testflag & T_UNPACK) &&
+      info->s->data_file_type == COMPRESSED_RECORD)
+  {
+    /*
+      Unpacking a compressed table: shut down the handlers installed for
+      the compressed format, switch the new share back to its original
+      data file type and initialize it again for that type.
+    */
+    (*new_info->s->once_end)(new_info->s);
+    (*new_info->s->end)(new_info);
+    restore_data_file_type(new_info->s);
+    _ma_setup_functions(new_info->s);
+    if ((*new_info->s->once_init)(new_info->s, new_file) ||
+        (*new_info->s->init)(new_info))
+      DBUG_RETURN(1);
+  }
+  _ma_reset_status(new_info);
+  if (_ma_initialize_data_file(new_info->s, new_file))
+    DBUG_RETURN(1);
+
+  /* Take into account any bitmap page created above: */
+  param->filepos= new_info->s->state.state.data_file_length;
+
+  /* Use new virtual functions for key generation */
+  info->s->keypos_to_recpos= new_info->s->keypos_to_recpos;
+  info->s->recpos_to_keypos= new_info->s->recpos_to_keypos;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Store in sort_info->new_data_file_type the data file type the repaired
+  table will have. Normally this is the share's current type; when a
+  compressed table is being unpacked (T_UNPACK) it is the original type
+  from the state header, and share->delete_record is replaced by the
+  function that _ma_setup_functions() selects for a temporary copy of the
+  share.
+*/
+
+static void
+set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share)
+{
+  if ((sort_info->new_data_file_type=share->data_file_type) ==
+      COMPRESSED_RECORD && sort_info->param->testflag & T_UNPACK)
+  {
+    MARIA_SHARE tmp;
+    sort_info->new_data_file_type= share->state.header.org_data_file_type;
+    /* Set delete_function for sort_delete_record() */
+    tmp= *share;
+    tmp.state.header.data_file_type= tmp.state.header.org_data_file_type;
+    /*
+      NOTE(review): this assigns the complement (every option bit except
+      HA_OPTION_COMPRESS_RECORD becomes set) whereas
+      restore_data_file_type() clears the bit with '&='. Confirm that plain
+      assignment is intended.
+    */
+    tmp.options= ~HA_OPTION_COMPRESS_RECORD;
+    _ma_setup_functions(&tmp);
+    share->delete_record=tmp.delete_record;
+  }
+}
+
+/*
+  Switch a share from compressed records back to its original data file
+  type: clear HA_OPTION_COMPRESS_RECORD (in memory and in the state
+  header), copy org_data_file_type into data_file_type, reset the pack
+  header length and take keypos_to_recpos / recpos_to_keypos from a
+  temporary copy of the share set up for that type.
+*/
+
+static void restore_data_file_type(MARIA_SHARE *share)
+{
+  MARIA_SHARE tmp_share;
+  share->options&= ~HA_OPTION_COMPRESS_RECORD;
+  mi_int2store(share->state.header.options,share->options);
+  share->state.header.data_file_type=
+    share->state.header.org_data_file_type;
+  share->data_file_type= share->state.header.data_file_type;
+  share->pack.header_length= 0;
+
+  /* Use new virtual functions for key generation */
+  tmp_share= *share;
+  _ma_setup_functions(&tmp_share);
+  share->keypos_to_recpos= tmp_share.keypos_to_recpos;
+  share->recpos_to_keypos= tmp_share.recpos_to_keypos;
+}
+
+
+/*
+  Close the handle's current data file descriptor and make both the data
+  file and the share's bitmap file use new_file; the bitmap cache is reset
+  afterwards.
+*/
+
+static void change_data_file_descriptor(MARIA_HA *info, File new_file)
+{
+  my_close(info->dfile.file, MYF(MY_WME));
+  info->dfile.file= info->s->bitmap.file.file= new_file;
+  _ma_bitmap_reset_cache(info->s);
+}
+
+
+/**
+  @brief Mark the data file to not be used
+
+  Sets the data file and bitmap file descriptors to -1 without closing
+  anything, then resets the bitmap cache.
+
+  @note
+  This is used in repair when we want to ensure the handler will not
+  write anything to the data file anymore
+*/
+
+static void unuse_data_file_descriptor(MARIA_HA *info)
+{
+  info->dfile.file= info->s->bitmap.file.file= -1;
+  _ma_bitmap_reset_cache(info->s);
+}
+
+
+/*
+  Copy all states that have to do with the data file
+
+  NOTES
+    This is done to copy the state from the data file generated from
+    repair to the original handler
+*/
+
+static void copy_data_file_state(MARIA_STATE_INFO *to,
+                                 MARIA_STATE_INFO *from)
+{
+  to->state.records=     from->state.records;
+  to->state.del=         from->state.del;
+  to->state.empty=       from->state.empty;
+  to->state.data_file_length= from->state.data_file_length;
+  to->split=             from->split;
+  to->dellink=           from->dellink;
+  to->first_bitmap_with_space= from->first_bitmap_with_space;
+}
+
+
+/*
+  Read 'safely' next record while scanning table.
+
+  SYNOPSIS
+    _ma_safe_scan_block_record()
+    sort_info           Sort state; 'page' is the page being scanned,
+                        'filelength' bounds the scan and 'param' receives
+                        the diagnostics
+    info                Maria handler
+    record              Store found here
+
+  NOTES
+    - One must have called mi_scan() before this
+
+    Differences compared to  _ma_scan_block_records() are:
+    - We read all blocks, not only blocks marked by the bitmap to be safe
+    - In case of errors, next read will read next record.
+    - More sanity checks
+
+  RETURN
+    0   ok
+    HA_ERR_END_OF_FILE  End of file
+    #   error number
+*/
+
+
+static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info,
+                                      MARIA_HA *info, uchar *record)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_RECORD_POS record_pos= info->cur_row.nextpos;
+  pgcache_page_no_t page= sort_info->page;
+  DBUG_ENTER("_ma_safe_scan_block_record");
+
+  for (;;)
+  {
+    /* Find next row in current page */
+    if (likely(record_pos < info->scan.number_of_rows))
+    {
+      uint length, offset;
+      uchar *data, *end_of_data;
+      char llbuff[22];
+
+      /*
+        Skip directory entries whose offset is 0; the directory is walked
+        towards lower addresses, and stepping below dir_end means the
+        directory is not consistent with number_of_rows.
+      */
+      while (!(offset= uint2korr(info->scan.dir)))
+      {
+        info->scan.dir-= DIR_ENTRY_SIZE;
+        record_pos++;
+        if (info->scan.dir < info->scan.dir_end)
+        {
+          _ma_check_print_info(sort_info->param,
+                               "Wrong directory on page %s",
+                               llstr(page, llbuff));
+          goto read_next_page;
+        }
+      }
+      /* found row */
+      info->cur_row.lastpos= info->scan.row_base_page + record_pos;
+      info->cur_row.nextpos= record_pos + 1;
+      data= info->scan.page_buff + offset;
+      length= uint2korr(info->scan.dir + 2);
+      end_of_data= data + length;
+      info->scan.dir-= DIR_ENTRY_SIZE;      /* Point to previous row */
+
+      /* Sanity checks on the entry; a bad entry is reported and skipped */
+      if (end_of_data > info->scan.dir_end ||
+          offset < PAGE_HEADER_SIZE || length < share->base.min_block_length)
+      {
+        _ma_check_print_info(sort_info->param,
+                             "Wrong directory entry %3u at page %s",
+                             (uint) record_pos, llstr(page, llbuff));
+        record_pos++;
+        continue;
+      }
+      else
+      {
+        DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
+        DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data));
+      }
+    }
+
+read_next_page:
+    /* Read until we find next head page */
+    for (;;)
+    {
+      uint page_type;
+      char llbuff[22];
+
+      sort_info->page++;                        /* In case of errors */
+      page++;
+      if (!(page % share->bitmap.pages_covered))
+      {
+        /* Skip bitmap */
+        page++;
+        sort_info->page++;
+      }
+      /* Stop when the page would extend past sort_info->filelength */
+      if ((my_off_t) (page + 1) * share->block_size > sort_info->filelength)
+        DBUG_RETURN(HA_ERR_END_OF_FILE);
+      if (!(pagecache_read(share->pagecache,
+                           &info->dfile,
+                           page, 0, info->scan.page_buff,
+                           PAGECACHE_READ_UNKNOWN_PAGE,
+                           PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+      {
+        /* A CRC failure is reported and the page skipped; others abort */
+        if (my_errno == HA_ERR_WRONG_CRC)
+        {
+          _ma_check_print_info(sort_info->param,
+                               "Wrong CRC on datapage at %s",
+                               llstr(page, llbuff));
+          continue;
+        }
+        DBUG_RETURN(my_errno);
+      }
+      page_type= (info->scan.page_buff[PAGE_TYPE_OFFSET] &
+                  PAGE_TYPE_MASK);
+      if (page_type == HEAD_PAGE)
+      {
+        /* A head page with a zero row count is reported and skipped */
+        if ((info->scan.number_of_rows=
+             (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) != 0)
+          break;
+        _ma_check_print_info(sort_info->param,
+                             "Wrong head page at page %s",
+                             llstr(page, llbuff));
+      }
+      else if (page_type >= MAX_PAGE_TYPE)
+      {
+        _ma_check_print_info(sort_info->param,
+                             "Found wrong page type: %d at page %s",
+                             page_type, llstr(page, llbuff));
+      }
+    }
+
+    /* New head page */
+    info->scan.dir= (info->scan.page_buff + share->block_size -
+                     PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
+    info->scan.dir_end= (info->scan.dir -
+                         (info->scan.number_of_rows - 1) *
+                         DIR_ENTRY_SIZE);
+    info->scan.row_base_page= ma_recordpos(page, 0);
+    record_pos= 0;
+  }
+}
+
+
+/**
+  @brief Writes a LOGREC_REPAIR_TABLE record and updates create_rename_lsn
+  if needed (so that maria_read_log does not redo the repair).
+ + @param param description of the REPAIR operation + @param info table + + @return Operation status + @retval 0 ok + @retval 1 error (disk problem) +*/ + +my_bool write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + /* in case this is maria_chk or recovery... */ + if (translog_status == TRANSLOG_OK && !maria_in_recovery && + share->base.born_transactional) + { + my_bool save_now_transactional= share->now_transactional; + + /* + For now this record is only informative. It could serve when applying + logs to a backup, but that needs more thought. Assume table became + corrupted. It is repaired, then some writes happen to it. + Later we restore an old backup, and want to apply this REDO_REPAIR_TABLE + record. For it to give the same result as originally, the table should + be corrupted the same way, so applying previous REDOs should produce the + same corruption; that's really not guaranteed (different execution paths + in execution of REDOs vs runtime code so not same bugs hit, temporary + hardware issues not repeatable etc). Corruption may not be repeatable. + A reasonable solution is to execute the REDO_REPAIR_TABLE record and + check if the checksum of the resulting table matches what it was at the + end of the original repair (should be stored in log record); or execute + the REDO_REPAIR_TABLE if the checksum of the table-before-repair matches + was it was at the start of the original repair (should be stored in log + record). + */ + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + uchar log_data[FILEID_STORE_SIZE + 8 + 8]; + LSN lsn; + + /* + testflag gives an idea of what REPAIR did (in particular T_QUICK + or not: did it touch the data file or not?). 
+ */ + int8store(log_data + FILEID_STORE_SIZE, param->testflag); + /* org_key_map is used when recreating index after a load data infile */ + int8store(log_data + FILEID_STORE_SIZE + 8, param->org_key_map); + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + share->now_transactional= 1; + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_REPAIR_TABLE, + &dummy_transaction_object, info, + (translog_size_t) sizeof(log_data), + sizeof(log_array)/sizeof(log_array[0]), + log_array, log_data, NULL) || + translog_flush(lsn))) + return TRUE; + /* + The table's existence was made durable earlier (MY_SYNC_DIR passed to + maria_change_to_newfile()). All pages have been flushed, state too, we + need to force it to disk. Old REDOs should not be applied to the table, + which is already enforced as skip_redos_lsn was increased in + protect_against_repair_crash(). But if this is an explicit repair, + even UNDO phase should ignore this table: create_rename_lsn should be + increased, and this also serves for the REDO_REPAIR to be ignored by + maria_read_log. + The fully correct order would be: sync data and index file, remove crash + mark and update LSNs then write state and sync index file. But at this + point state (without crash mark) is already written. + */ + if ((!(param->testflag & T_NO_CREATE_RENAME_LSN) && + _ma_update_state_lsns(share, lsn, share->state.create_trid, FALSE, + FALSE)) || + _ma_sync_table_files(info)) + return TRUE; + share->now_transactional= save_now_transactional; + } + return FALSE; +} + + +/** + Writes an UNDO record which if executed in UNDO phase, will empty the + table. Such record is thus logged only in certain cases of bulk insert + (table needs to be empty etc). 
+*/
+my_bool write_log_record_for_bulk_insert(MARIA_HA *info)
+{
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+  uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE];
+  LSN lsn;
+  /* The record starts with the transaction's current undo_lsn */
+  lsn_store(log_data, info->trn->undo_lsn);
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+  /* Non-zero is returned if either writing or flushing the record fails */
+  return translog_write_record(&lsn, LOGREC_UNDO_BULK_INSERT,
+                               info->trn, info,
+                               (translog_size_t)
+                               log_array[TRANSLOG_INTERNAL_PARTS +
+                                         0].length,
+                               TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                               log_data + LSN_STORE_SIZE, NULL) ||
+    translog_flush(lsn); /* WAL */
+}
+
+
+/*
+  Give error message why reading of key page failed
+
+  The message is chosen from my_errno: HA_ERR_CRASHED is reported as wrong
+  base information on the page, anything else as a read failure together
+  with the error number. 'position' is a byte offset; it is divided by the
+  share's block size to print a page number.
+*/
+
+static void report_keypage_fault(HA_CHECK *param, MARIA_HA *info,
+                                 my_off_t position)
+{
+  /*
+    llstr() is handed a 64-bit page number, which can need 20 digits plus
+    the terminating NUL. Use the same 22 byte buffer as the other llstr()
+    callers in this file; the former 11 byte buffer could be overrun.
+  */
+  char buff[22];
+  uint32 block_size= info->s->block_size;
+
+  if (my_errno == HA_ERR_CRASHED)
+    _ma_check_print_error(param,
+                          "Wrong base information on indexpage at page: %s",
+                          llstr(position / block_size, buff));
+  else
+    _ma_check_print_error(param,
+                          "Can't read indexpage from page: %s, "
+                          "error: %d",
+                          llstr(position / block_size, buff), my_errno);
+}
+
+
+/**
+  When we want to check a table, we verify that the transaction ids of rows
+  and keys are not bigger than the biggest id generated by Maria so far, which
+  is returned by the function below.
+
+  @note If control file is not open, 0 may be returned; to not confuse
+  this with a valid max trid of 0, the caller should notice that it failed to
+  open the control file (ma_control_file_inited() can serve for that).
+*/ + +static TrID max_trid_in_system(void) +{ + TrID id= trnman_get_max_trid(); /* 0 if transac manager not initialized */ + /* 'id' may be far bigger, if last shutdown is old */ + return max(id, max_trid_in_control_file); +} + + +static void _ma_check_print_not_visible_error(HA_CHECK *param, TrID used_trid) +{ + char buff[22], buff2[22]; + if (!param->not_visible_rows_found++) + { + if (!ma_control_file_inited()) + { + _ma_check_print_warning(param, + "Found row with transaction id %s but no " + "aria_control_file was used or specified. " + "The table may be corrupted", + llstr(used_trid, buff)); + } + else + { + _ma_check_print_error(param, + "Found row with transaction id %s when max " + "transaction id according to aria_control_file " + "is %s", + llstr(used_trid, buff), + llstr(param->max_trid, buff2)); + } + } +} + + +/** + Mark that we can retry normal repair if we used quick repair + + We shouldn't do this in case of disk error as in this case we are likely + to loose much more than expected. +*/ + +void retry_if_quick(MARIA_SORT_PARAM *sort_param, int error) +{ + HA_CHECK *param=sort_param->sort_info->param; + + if (!sort_param->fix_datafile && error >= HA_ERR_FIRST) + { + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + } +} diff --git a/storage/maria/ma_check_standalone.h b/storage/maria/ma_check_standalone.h new file mode 100644 index 00000000000..8cda285bb99 --- /dev/null +++ b/storage/maria/ma_check_standalone.h @@ -0,0 +1,104 @@ +/* Copyright (C) 2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + All standalone programs which need to use functions from ma_check.c + (like maria_repair()) must define their version of _ma_killed_ptr() + and _ma_check_print_info|warning|error(). Indeed, linking with ma_check.o + brings in the dependencies of ma_check.o which are definitions of the above + functions; if the program does not define them then the ones of + ha_maria.o are used i.e. ha_maria.o is linked into the program, and this + brings dependencies of ha_maria.o on mysqld.o into the program's linking + which thus fails, as the program is not linked with mysqld.o. + This file contains the versions of these functions used by maria_chk and + maria_read_log. +*/ + +/* + Check if check/repair operation was killed by a signal +*/ + +int _ma_killed_ptr(HA_CHECK *param __attribute__((unused))) +{ + return 0; +} + + /* print warnings and errors */ + /* VARARGS */ + +void _ma_check_print_info(HA_CHECK *param __attribute__((unused)), + const char *fmt,...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_info"); + DBUG_PRINT("enter", ("format: %s", fmt)); + + va_start(args,fmt); + VOID(vfprintf(stdout, fmt, args)); + VOID(fputc('\n',stdout)); + va_end(args); + DBUG_VOID_RETURN; +} + +/* VARARGS */ + +void _ma_check_print_warning(HA_CHECK *param, const char *fmt,...) 
+{ + va_list args; + DBUG_ENTER("_ma_check_print_warning"); + DBUG_PRINT("enter", ("format: %s", fmt)); + + fflush(stdout); + if (!param->warning_printed && !param->error_printed) + { + if (param->testflag & T_SILENT) + fprintf(stderr,"%s: Aria file %s\n",my_progname_short, + param->isam_file_name); + param->out_flag|= O_DATA_LOST; + } + param->warning_printed=1; + va_start(args,fmt); + fprintf(stderr,"%s: warning: ",my_progname_short); + VOID(vfprintf(stderr, fmt, args)); + VOID(fputc('\n',stderr)); + fflush(stderr); + va_end(args); + DBUG_VOID_RETURN; +} + +/* VARARGS */ + +void _ma_check_print_error(HA_CHECK *param, const char *fmt,...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_error"); + DBUG_PRINT("enter", ("format: %s", fmt)); + + fflush(stdout); + if (!param->warning_printed && !param->error_printed) + { + if (param->testflag & T_SILENT) + fprintf(stderr,"%s: Aria file %s\n",my_progname_short,param->isam_file_name); + param->out_flag|= O_DATA_LOST; + } + param->error_printed|=1; + va_start(args,fmt); + fprintf(stderr,"%s: error: ",my_progname_short); + VOID(vfprintf(stderr, fmt, args)); + VOID(fputc('\n',stderr)); + fflush(stderr); + va_end(args); + DBUG_VOID_RETURN; +} diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c new file mode 100644 index 00000000000..cf13cee9452 --- /dev/null +++ b/storage/maria/ma_checkpoint.c @@ -0,0 +1,1196 @@ +/* Copyright (C) 2006,2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3071 Maria checkpoint + First version written by Guilhem Bichot on 2006-04-27. +*/ + +/* Here is the implementation of this module */ + +/** @todo RECOVERY BUG this is unreviewed code */ +/* + Summary: + checkpoints are done either by a background thread (checkpoint every Nth + second) or by a client. + In ha_maria, it's not made available to clients, and will soon be done by a + background thread (periodically taking checkpoints and flushing dirty + pages). +*/ + +#include "maria_def.h" +#include "ma_pagecache.h" +#include "ma_blockrec.h" +#include "ma_checkpoint.h" +#include "ma_loghandler_lsn.h" +#include "ma_servicethread.h" + + +/** @brief type of checkpoint currently running */ +static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE; +/** @brief protects checkpoint_in_progress */ +static pthread_mutex_t LOCK_checkpoint; +/** @brief for killing the background checkpoint thread */ +static pthread_cond_t COND_checkpoint; +/** @brief control structure for checkpoint background thread */ +static MA_SERVICE_THREAD_CONTROL checkpoint_control= + {THREAD_DEAD, FALSE, &LOCK_checkpoint, &COND_checkpoint}; +/* is ulong like pagecache->blocks_changed */ +static ulong pages_to_flush_before_next_checkpoint; +static PAGECACHE_FILE *dfiles, /**< data files to flush in background */ + *dfiles_end; /**< list of data files ends here */ +static PAGECACHE_FILE *kfiles, /**< index files to flush in background */ + *kfiles_end; /**< list of index files ends here */ +/* those two statistics below could serve in SHOW GLOBAL STATUS */ +static uint checkpoints_total= 0, /**< all checkpoint requests made */ + checkpoints_ok_total= 0; /**< all checkpoints which succeeded */ + +struct st_filter_param +{ + LSN up_to_lsn; /**< only pages with rec_lsn < this LSN */ + 
uint max_pages; /**< stop after flushing this number pages */ +}; /**< information to determine which dirty pages should be flushed */ + +static enum pagecache_flush_filter_result +filter_flush_file_medium(enum pagecache_page_type type, + pgcache_page_no_t page, + LSN rec_lsn, void *arg); +static enum pagecache_flush_filter_result +filter_flush_file_full(enum pagecache_page_type type, + pgcache_page_no_t page, + LSN rec_lsn, void *arg); +static enum pagecache_flush_filter_result +filter_flush_file_evenly(enum pagecache_page_type type, + pgcache_page_no_t pageno, + LSN rec_lsn, void *arg); +static int really_execute_checkpoint(void); +pthread_handler_t ma_checkpoint_background(void *arg); +static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon); + +/** + @brief Does a checkpoint + + @param level what level of checkpoint to do + @param no_wait if another checkpoint of same or stronger level + is already running, consider our job done + + @note In ha_maria, there can never be two threads trying a checkpoint at + the same time. + + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait) +{ + int result= 0; + DBUG_ENTER("ma_checkpoint_execute"); + + if (!checkpoint_control.inited) + { + /* + If ha_maria failed to start, maria_panic_hton is called, we come here. + */ + DBUG_RETURN(0); + } + DBUG_ASSERT(level > CHECKPOINT_NONE); + + /* look for already running checkpoints */ + pthread_mutex_lock(&LOCK_checkpoint); + while (checkpoint_in_progress != CHECKPOINT_NONE) + { + if (no_wait && (checkpoint_in_progress >= level)) + { + /* + If we are the checkpoint background thread, we don't wait (it's + smarter to flush pages instead of waiting here while the other thread + finishes its checkpoint). 
+ */ + pthread_mutex_unlock(&LOCK_checkpoint); + goto end; + } + pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint); + } + + checkpoint_in_progress= level; + pthread_mutex_unlock(&LOCK_checkpoint); + /* from then on, we are sure to be and stay the only checkpointer */ + + result= really_execute_checkpoint(); + pthread_cond_broadcast(&COND_checkpoint); +end: + DBUG_RETURN(result); +} + + +/** + @brief Does a checkpoint, really; expects no other checkpoints + running. + + Checkpoint level requested is read from checkpoint_in_progress. + + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +static int really_execute_checkpoint(void) +{ + uint i, error= 0; + /** @brief checkpoint_start_log_horizon will be stored there */ + char *ptr; + LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */ + LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn; + TRANSLOG_ADDRESS checkpoint_start_log_horizon; + char checkpoint_start_log_horizon_char[LSN_STORE_SIZE]; + DBUG_ENTER("really_execute_checkpoint"); + DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress)); + bzero(&record_pieces, sizeof(record_pieces)); + + /* + STEP 1: record current end-of-log position using log's lock. It is + critical for the correctness of Checkpoint (related to memory visibility + rules, the log's lock is a mutex). + "Horizon" is a lower bound of the LSN of the next log record. + */ + checkpoint_start_log_horizon= translog_get_horizon(); + DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)", + LSN_IN_PARTS(checkpoint_start_log_horizon))); + lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon); + + /* + STEP 2: fetch information about transactions. + We must fetch transactions before dirty pages. Indeed, a transaction + first sets its rec_lsn then sets the page's rec_lsn then sets its rec_lsn + to 0. 
If we fetched pages first, we may see no dirty page yet, then we + fetch transactions but the transaction has already reset its rec_lsn to 0 + so we miss rec_lsn again. + For a similar reason (over-allocated bitmap pages) we have to fetch + transactions before flushing bitmap pages. + + min_trn_rec_lsn will serve to lower the starting point of the REDO phase + (down from checkpoint_start_log_horizon). + */ + if (unlikely(trnman_collect_transactions(&record_pieces[0], + &record_pieces[1], + &min_trn_rec_lsn, + &min_first_undo_lsn))) + goto err; + + + /* STEP 3: fetch information about table files */ + if (unlikely(collect_tables(&record_pieces[2], + checkpoint_start_log_horizon))) + goto err; + + + /* STEP 4: fetch information about dirty pages */ + /* + It's better to do it _after_ having flushed some data pages (which + collect_tables() may have done), because those are now non-dirty and so we + have a more up-to-date dirty pages list to put into the checkpoint record, + and thus we will have less work at Recovery. + */ + /* Using default pagecache for now */ + if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache, + &record_pieces[3], + &min_page_rec_lsn))) + goto err; + + + /* LAST STEP: now write the checkpoint log record */ + { + LSN lsn; + translog_size_t total_rec_length; + /* + the log handler is allowed to modify "str" and "length" (but not "*str") + of its argument, so we must not pass it record_pieces directly, + otherwise we would later not know what memory pieces to my_free(). 
+ */ + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 5]; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= + (uchar*) checkpoint_start_log_horizon_char; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length= + sizeof(checkpoint_start_log_horizon_char); + for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++) + { + log_array[TRANSLOG_INTERNAL_PARTS + 1 + i]= + *(LEX_CUSTRING *)&record_pieces[i]; + total_rec_length+= (translog_size_t) record_pieces[i].length; + } + if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT, + &dummy_transaction_object, NULL, + total_rec_length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL) || + translog_flush(lsn))) + goto err; + translog_lock(); + /* + This cannot be done as a inwrite_rec_hook of LOGREC_CHECKPOINT, because + such hook would be called before translog_flush (and we must be sure + that log was flushed before we write to the control file). + */ + if (unlikely(ma_control_file_write_and_force(lsn, last_logno, + max_trid_in_control_file, + recovery_failures))) + { + translog_unlock(); + goto err; + } + translog_unlock(); + } + + /* + Note that we should not alter memory structures until we have successfully + written the checkpoint record and control file. + */ + /* checkpoint succeeded */ + ptr= record_pieces[3].str; + pages_to_flush_before_next_checkpoint= uint4korr(ptr); + DBUG_PRINT("checkpoint",("%u pages to flush before next checkpoint", + (uint)pages_to_flush_before_next_checkpoint)); + + /* compute log's low-water mark */ + { + TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn; + set_if_smaller(log_low_water_mark, min_trn_rec_lsn); + set_if_smaller(log_low_water_mark, min_first_undo_lsn); + set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon); + /** + Now purge unneeded logs. 
+ As some systems have an unreliable fsync (drive lying), we could try to + be robust against that: remember a few previous checkpoints in the + control file, and not purge logs immediately... Think about it. + */ + if (translog_purge(log_low_water_mark)) + ma_message_no_user(0, "log purging failed"); + } + + goto end; + +err: + error= 1; + ma_message_no_user(0, "checkpoint failed"); + /* we were possibly not able to determine what pages to flush */ + pages_to_flush_before_next_checkpoint= 0; + +end: + for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++) + my_free(record_pieces[i].str, MYF(MY_ALLOW_ZERO_PTR)); + pthread_mutex_lock(&LOCK_checkpoint); + checkpoint_in_progress= CHECKPOINT_NONE; + checkpoints_total++; + checkpoints_ok_total+= !error; + pthread_mutex_unlock(&LOCK_checkpoint); + DBUG_RETURN(error); +} + + +/** + @brief Initializes the checkpoint module + + @param interval If one wants the module to create a + thread which will periodically do + checkpoints, and flush dirty pages, in the + background, it should specify a non-zero + interval in seconds. The thread will then be + created and will take checkpoints separated by + approximately 'interval' second. + + @note A checkpoint is taken only if there has been some significant + activity since the previous checkpoint. Between checkpoint N and N+1 the + thread flushes all dirty pages which were already dirty at the time of + checkpoint N. 
+ + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +int ma_checkpoint_init(ulong interval) +{ + pthread_t th; + int res= 0; + DBUG_ENTER("ma_checkpoint_init"); + if (ma_service_thread_control_init(&checkpoint_control)) + res= 1; + else if (interval > 0) + { + compile_time_assert(sizeof(void *) >= sizeof(ulong)); + if (!(res= pthread_create(&th, NULL, ma_checkpoint_background, + (void *)interval))) + { + /* thread lives, will have to be killed */ + checkpoint_control.status= THREAD_RUNNING; + } + } + DBUG_RETURN(res); +} + + +#ifndef DBUG_OFF +/** + Function used to test recovery: flush some table pieces and then caller + crashes. + + @param what_to_flush 0: current bitmap and all data pages + 1: state + 2: all bitmap pages +*/ +static void flush_all_tables(int what_to_flush) +{ + int res= 0; + LIST *pos; /**< to iterate over open tables */ + pthread_mutex_lock(&THR_LOCK_maria); + for (pos= maria_open_list; pos; pos= pos->next) + { + MARIA_HA *info= (MARIA_HA*)pos->data; + if (info->s->now_transactional) + { + switch (what_to_flush) + { + case 0: + res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_KEEP, FLUSH_KEEP); + break; + case 1: + res= _ma_state_info_write(info->s, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET| + MA_STATE_INFO_WRITE_LOCK); + DBUG_PRINT("maria_flush_states", + ("is_of_horizon: LSN (%lu,0x%lx)", + LSN_IN_PARTS(info->s->state.is_of_horizon))); + break; + case 2: + res= _ma_bitmap_flush_all(info->s); + break; + } + } + DBUG_ASSERT(res == 0); + } + pthread_mutex_unlock(&THR_LOCK_maria); +} +#endif + + +/** + @brief Destroys the checkpoint module +*/ + +void ma_checkpoint_end(void) +{ + DBUG_ENTER("ma_checkpoint_end"); + /* + Some intentional crash methods, usually triggered by + SET MARIA_CHECKPOINT_INTERVAL=X + */ + DBUG_EXECUTE_IF("maria_flush_bitmap", + { + DBUG_PRINT("maria_flush_bitmap", ("now")); + flush_all_tables(2); + }); + DBUG_EXECUTE_IF("maria_flush_whole_page_cache", + { + 
DBUG_PRINT("maria_flush_whole_page_cache", ("now")); + flush_all_tables(0); + }); + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + /* + Note that for WAL reasons, maria_flush_states requires + maria_flush_whole_log. + */ + DBUG_EXECUTE_IF("maria_flush_states", + { + DBUG_PRINT("maria_flush_states", ("now")); + flush_all_tables(1); + }); + DBUG_EXECUTE_IF("maria_crash", + { DBUG_PRINT("maria_crash", ("now")); DBUG_ABORT(); }); + + if (checkpoint_control.inited) + { + ma_service_thread_control_end(&checkpoint_control); + my_free((uchar *)dfiles, MYF(MY_ALLOW_ZERO_PTR)); + my_free((uchar *)kfiles, MYF(MY_ALLOW_ZERO_PTR)); + dfiles= kfiles= NULL; + } + DBUG_VOID_RETURN; +} + + +/** + @brief dirty-page filtering criteria for MEDIUM checkpoint. + + We flush data/index pages which have been dirty since the previous + checkpoint (this is the two-checkpoint rule: the REDO phase will not have + to start from earlier than the next-to-last checkpoint). + Bitmap pages are handled by _ma_bitmap_flush_all(). + + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg filter_param +*/ + +static enum pagecache_flush_filter_result +filter_flush_file_medium(enum pagecache_page_type type, + pgcache_page_no_t pageno __attribute__ ((unused)), + LSN rec_lsn, void *arg) +{ + struct st_filter_param *param= (struct st_filter_param *)arg; + return (type == PAGECACHE_LSN_PAGE) && + (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0); +} + + +/** + @brief dirty-page filtering criteria for FULL checkpoint. + + We flush all dirty data/index pages. + Bitmap pages are handled by _ma_bitmap_flush_all(). 
+ + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg filter_param +*/ + +static enum pagecache_flush_filter_result +filter_flush_file_full(enum pagecache_page_type type, + pgcache_page_no_t pageno __attribute__ ((unused)), + LSN rec_lsn __attribute__ ((unused)), + void *arg __attribute__ ((unused))) +{ + return (type == PAGECACHE_LSN_PAGE); +} + + +/** + @brief dirty-page filtering criteria for background flushing thread. + + We flush data/index pages which have been dirty since the previous + checkpoint (this is the two-checkpoint rule: the REDO phase will not have + to start from earlier than the next-to-last checkpoint), and no + bitmap pages. But we flush no more than a certain number of pages (to have + an even flushing, no write burst). + The reason to not flush bitmap pages is that they may not be in a flushable + state at this moment and we don't want to wait for them. + + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg filter_param +*/ + +static enum pagecache_flush_filter_result +filter_flush_file_evenly(enum pagecache_page_type type, + pgcache_page_no_t pageno __attribute__ ((unused)), + LSN rec_lsn, void *arg) +{ + struct st_filter_param *param= (struct st_filter_param *)arg; + if (unlikely(param->max_pages == 0)) /* all flushed already */ + return FLUSH_FILTER_SKIP_ALL; + if ((type == PAGECACHE_LSN_PAGE) && + (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0)) + { + param->max_pages--; + return FLUSH_FILTER_OK; + } + return FLUSH_FILTER_SKIP_TRY_NEXT; +} + + +/** + @brief Background thread which does checkpoints and flushes periodically. + + Takes a checkpoint. After this, all pages dirty at the time of that + checkpoint are flushed evenly until it is time to take another checkpoint. + This ensures that the REDO phase starts at earliest (in LSN time) at the + next-to-last checkpoint record ("two-checkpoint rule"). 
+ + @note MikaelR questioned why the same thread does two different jobs, the + risk could be that while a checkpoint happens no LRD flushing happens. +*/ + +pthread_handler_t ma_checkpoint_background(void *arg) +{ + /** @brief At least this of log/page bytes written between checkpoints */ + const uint checkpoint_min_activity= 2*1024*1024; + /* + If the interval could be changed by the user while we are in this thread, + it could be annoying: for example it could cause "case 2" to be executed + right after "case 0", thus having 'dfile' unset. So the thread cares only + about the interval's value when it started. + */ + const ulong interval= (ulong)arg; + uint sleeps, sleep_time; + TRANSLOG_ADDRESS log_horizon_at_last_checkpoint= + translog_get_horizon(); + ulonglong pagecache_flushes_at_last_checkpoint= + maria_pagecache->global_cache_write; + uint pages_bunch_size; + struct st_filter_param filter_param; + PAGECACHE_FILE *dfile; /**< data file currently being flushed */ + PAGECACHE_FILE *kfile; /**< index file currently being flushed */ + LINT_INIT(kfile); + LINT_INIT(dfile); + LINT_INIT(pages_bunch_size); + + my_thread_init(); + DBUG_PRINT("info",("Maria background checkpoint thread starts")); + DBUG_ASSERT(interval > 0); + + /* + Recovery ended with all tables closed and a checkpoint: no need to take + one immediately. + */ + sleeps= 1; + pages_to_flush_before_next_checkpoint= 0; + + for(;;) /* iterations of checkpoints and dirty page flushing */ + { +#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */ + sleeps=0; +#endif + switch (sleeps % interval) + { + case 0: + /* + With background flushing evenly distributed over the time + between two checkpoints, we should have only little flushing to do + in the checkpoint. + */ + /* + No checkpoint if little work of interest for recovery was done + since last checkpoint. 
Such work includes log writing (lengthens + recovery, checkpoint would shorten it), page flushing (checkpoint + would decrease the amount of read pages in recovery). + In case of one short statement per minute (very low load), we don't + want to checkpoint every minute, hence the positive + checkpoint_min_activity. + */ + if (((translog_get_horizon() - log_horizon_at_last_checkpoint) + + (maria_pagecache->global_cache_write - + pagecache_flushes_at_last_checkpoint) * + maria_pagecache->block_size) < checkpoint_min_activity) + { + /* don't take checkpoint, so don't know what to flush */ + pages_to_flush_before_next_checkpoint= 0; + sleep_time= interval; + break; + } + sleep_time= 1; + ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE); + /* + Snapshot this kind of "state" of the engine. Note that the value below + is possibly greater than last_checkpoint_lsn. + */ + log_horizon_at_last_checkpoint= translog_get_horizon(); + pagecache_flushes_at_last_checkpoint= + maria_pagecache->global_cache_write; + /* + If the checkpoint above succeeded it has set d|kfiles and + d|kfiles_end. If is has failed, it has set + pages_to_flush_before_next_checkpoint to 0 so we will skip flushing + and sleep until the next checkpoint. + */ + break; + case 1: + /* set up parameters for background page flushing */ + filter_param.up_to_lsn= last_checkpoint_lsn; + pages_bunch_size= pages_to_flush_before_next_checkpoint / interval; + dfile= dfiles; + kfile= kfiles; + /* fall through */ + default: + if (pages_bunch_size > 0) + { + DBUG_PRINT("checkpoint", + ("Maria background checkpoint thread: %u pages", + pages_bunch_size)); + /* flush a bunch of dirty pages */ + filter_param.max_pages= pages_bunch_size; + while (dfile != dfiles_end) + { + /* + We use FLUSH_KEEP_LAZY: if a file is already in flush, it's + smarter to move to the next file than wait for this one to be + completely flushed, which may take long. 
+ StaleFilePointersInFlush: notice how below we use "dfile" which + is an OS file descriptor plus some function and MARIA_SHARE + pointers; this data dates from a previous checkpoint; since then, + the table may have been closed (so MARIA_SHARE* became stale), and + the file descriptor reassigned to another table which does not + have the same CRC-read-set callbacks: it is thus important that + flush_pagecache_blocks_with_filter() does not use the pointers, + only the OS file descriptor. + */ + int res= + flush_pagecache_blocks_with_filter(maria_pagecache, + dfile, FLUSH_KEEP_LAZY, + filter_flush_file_evenly, + &filter_param); + if (unlikely(res & PCFLUSH_ERROR)) + ma_message_no_user(0, "background data page flush failed"); + if (filter_param.max_pages == 0) /* bunch all flushed, sleep */ + break; /* and we will continue with the same file */ + dfile++; /* otherwise all this file is flushed, move to next file */ + /* + MikaelR noted that he observed that Linux's file cache may never + fsync to disk until this cache is full, at which point it decides + to empty the cache, making the machine very slow. A solution was + to fsync after writing 2 MB. So we might want to fsync() here if + we wrote enough pages. 
+ */ + } + while (kfile != kfiles_end) + { + int res= + flush_pagecache_blocks_with_filter(maria_pagecache, + kfile, FLUSH_KEEP_LAZY, + filter_flush_file_evenly, + &filter_param); + if (unlikely(res & PCFLUSH_ERROR)) + ma_message_no_user(0, "background index page flush failed"); + if (filter_param.max_pages == 0) /* bunch all flushed, sleep */ + break; /* and we will continue with the same file */ + kfile++; /* otherwise all this file is flushed, move to next file */ + } + sleep_time= 1; + } + else + { + /* Can directly sleep until the next checkpoint moment */ + sleep_time= interval - (sleeps % interval); + } + } + if (my_service_thread_sleep(&checkpoint_control, + sleep_time * 1000000000ULL)) + break; + sleeps+= sleep_time; + } + DBUG_PRINT("info",("Maria background checkpoint thread ends")); + { + CHECKPOINT_LEVEL level= CHECKPOINT_FULL; + /* + That's the final one, which guarantees that a clean shutdown always ends + with a checkpoint. + */ + DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;); + ma_checkpoint_execute(level, FALSE); + } + my_service_thread_signal_end(&checkpoint_control); + my_thread_end(); + return 0; +} + + +/** + @brief Allocates buffer and stores in it some info about open tables, + does some flushing on those. + + Does the allocation because the caller cannot know the size itself. + Memory freeing is to be done by the caller (if the "str" member of the + LEX_STRING is not NULL). + The caller is taking a checkpoint. + + @param[out] str pointer to where the allocated buffer, + and its size, will be put; buffer will be filled + with info about open tables + @param checkpoint_start_log_horizon Of the in-progress checkpoint + record. 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon) +{ + MARIA_SHARE **distinct_shares= NULL; + char *ptr; + uint error= 1, sync_error= 0, nb, nb_stored, i; + my_bool unmark_tables= TRUE; + uint total_names_length; + LIST *pos; /**< to iterate over open tables */ + struct st_state_copy { + uint index; + MARIA_STATE_INFO state; + }; + struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */ + *state_copies_end, /**< cache ends here */ + *state_copy; /**< iterator in cache */ + TRANSLOG_ADDRESS state_copies_horizon; /**< horizon of states' _copies_ */ + struct st_filter_param filter_param; + PAGECACHE_FLUSH_FILTER filter; + DBUG_ENTER("collect_tables"); + + LINT_INIT(state_copies_horizon); + /* let's make a list of distinct shares */ + pthread_mutex_lock(&THR_LOCK_maria); + for (nb= 0, pos= maria_open_list; pos; pos= pos->next) + { + MARIA_HA *info= (MARIA_HA*)pos->data; + MARIA_SHARE *share= info->s; + /* the first three variables below can never change */ + if (share->base.born_transactional && !share->temporary && + share->mode != O_RDONLY && + !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)) + { + /* + Apart from us, only maria_close() reads/sets in_checkpoint but cannot + run now as we hold THR_LOCK_maria. + */ + /* + This table is relevant for checkpoint and not already seen. Mark it, + so that it is not seen again in the loop. + */ + nb++; + DBUG_ASSERT(share->in_checkpoint == 0); + /* This flag ensures that we count only _distinct_ shares. 
*/ + share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP; + } + } + if (unlikely((distinct_shares= + (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *), + MYF(MY_WME))) == NULL)) + goto err; + for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next) + { + MARIA_HA *info= (MARIA_HA*)pos->data; + MARIA_SHARE *share= info->s; + if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP) + { + distinct_shares[i++]= share; + /* + With this we prevent the share from going away while we later flush + and force it without holding THR_LOCK_maria. For example if the share + could be my_free()d by maria_close() we would have a problem when we + access it to flush the table. We "pin" the share pointer. + And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is + not seen again in the loop. + */ + share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME; + total_names_length+= share->open_file_name.length; + } + } + + DBUG_ASSERT(i == nb); + pthread_mutex_unlock(&THR_LOCK_maria); + DBUG_PRINT("info",("found %u table shares", nb)); + + str->length= + 4 + /* number of tables */ + (2 + /* short id */ + LSN_STORE_SIZE + /* first_log_write_at_lsn */ + 1 /* end-of-name 0 */ + ) * nb + total_names_length; + if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL)) + goto err; + + ptr= str->str; + ptr+= 4; /* real number of stored tables is not yet know */ + + /* only possible checkpointer, so can do the read below without mutex */ + filter_param.up_to_lsn= last_checkpoint_lsn; + switch(checkpoint_in_progress) + { + case CHECKPOINT_MEDIUM: + filter= &filter_flush_file_medium; + break; + case CHECKPOINT_FULL: + filter= &filter_flush_file_full; + break; + case CHECKPOINT_INDIRECT: + filter= NULL; + break; + default: + DBUG_ASSERT(0); + goto err; + } + + /* + The principle of reading/writing the state below is explained in + ma_recovery.c, look for "Recovery of the state". 
+ */ +#define STATE_COPIES 1024 + state_copies= (struct st_state_copy *) + my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME)); + dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles, + /* avoid size of 0 for my_realloc */ + max(1, nb) * sizeof(PAGECACHE_FILE), + MYF(MY_WME | MY_ALLOW_ZERO_PTR)); + kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles, + /* avoid size of 0 for my_realloc */ + max(1, nb) * sizeof(PAGECACHE_FILE), + MYF(MY_WME | MY_ALLOW_ZERO_PTR)); + if (unlikely((state_copies == NULL) || + (dfiles == NULL) || (kfiles == NULL))) + goto err; + state_copy= state_copies_end= NULL; + dfiles_end= dfiles; + kfiles_end= kfiles; + + for (nb_stored= 0, i= 0; i < nb; i++) + { + MARIA_SHARE *share= distinct_shares[i]; + PAGECACHE_FILE kfile, dfile; + my_bool ignore_share; + if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME)) + { + /* + No need for a mutex to read the above, only us can write *this* bit of + the in_checkpoint bitmap + */ + continue; + } + /** + @todo We should not look at tables which didn't change since last + checkpoint. + */ + DBUG_PRINT("info",("looking at table '%s'", share->open_file_name.str)); + if (state_copy == state_copies_end) /* we have no more cached states */ + { + /* + Collect and cache a bunch of states. We do this for many states at a + time, to not lock/unlock the log's lock too often. 
+ */ + uint j, bound= min(nb, i + STATE_COPIES); + state_copy= state_copies; + /* part of the state is protected by log's lock */ + translog_lock(); + state_copies_horizon= translog_get_horizon_no_lock(); + for (j= i; j < bound; j++) + { + MARIA_SHARE *share2= distinct_shares[j]; + if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME)) + continue; + state_copy->index= j; + state_copy->state= share2->state; /* we copy the state */ + state_copy++; + /* + data_file_length is not updated under log's lock by the bitmap + code, but writing a wrong data_file_length is ok: a next + maria_close() will correct it; if we crash before, Recovery will + set it to the true physical size. + */ + } + translog_unlock(); + /** + We are going to flush these states. + Before, all records describing how to undo such state must be + in the log (WAL). Usually this means UNDOs. In the special case of + data|key_file_length, recovery just needs to open the table to fix the + length, so any LOGREC_FILE_ID/REDO/UNDO allowing recovery to + understand it must open a table, is enough; so as long as + data|key_file_length is updated after writing any log record it's ok: + if we copied new value above, it means the record was before + state_copies_horizon and we flush such record below. + Apart from data|key_file_length which are easily recoverable from the + real file's size, all other state members must be updated only when + writing the UNDO; otherwise, if updated before, if their new value is + flushed by a checkpoint and there is a crash before UNDO is written, + their REDO group will be missing or at least incomplete and skipped + by recovery, so bad state value will stay. For example, setting + key_root before writing the UNDO: the table would have old index + pages (they were pinned at time of crash) and a new, thus wrong, + key_root. + @todo RECOVERY BUG check that all code honours that. 
+ */ + if (translog_flush(state_copies_horizon)) + goto err; + /* now we have cached states and they are WAL-safe*/ + state_copies_end= state_copy; + state_copy= state_copies; + } + + /* locate our state among these cached ones */ + for ( ; state_copy->index != i; state_copy++) + DBUG_ASSERT(state_copy < state_copies_end); + + /* OS file descriptors are ints which we stored in 4 bytes */ + compile_time_assert(sizeof(int) <= 4); + /* + Protect against maria_close() (which does some memory freeing in + MARIA_FILE_BITMAP) with close_lock. intern_lock is not + sufficient as we, as well as maria_close(), are going to unlock + intern_lock in the middle of manipulating the table. Serializing us and + maria_close() should help avoid problems. + */ + pthread_mutex_lock(&share->close_lock); + pthread_mutex_lock(&share->intern_lock); + /* + Tables in a normal state have their two file descriptors open. + In some rare cases like REPAIR, some descriptor may be closed or even + -1. If that happened, the _ma_state_info_write() may fail. This is + prevented by enclosing all all places which close/change kfile.file with + intern_lock. + */ + kfile= share->kfile; + dfile= share->bitmap.file; + /* + Ignore table which has no logged writes (all its future log records will + be found naturally by Recovery). Ignore obsolete shares (_before_ + setting themselves to last_version=0 they already did all flush and + sync; if we flush their state now we may be flushing an obsolete state + onto a newer one (assuming the table has been reopened with a different + share but of course same physical index file). 
+ */ + ignore_share= (share->id == 0) | (share->last_version == 0); + DBUG_PRINT("info", ("ignore_share: %d", ignore_share)); + if (!ignore_share) + { + uint open_file_name_len= share->open_file_name.length + 1; + /* remember the descriptors for background flush */ + *(dfiles_end++)= dfile; + *(kfiles_end++)= kfile; + /* we will store this table in the record */ + nb_stored++; + int2store(ptr, share->id); + ptr+= 2; + lsn_store(ptr, share->lsn_of_file_id); + ptr+= LSN_STORE_SIZE; + /* + first_bitmap_with_space is not updated under log's lock, and is + important. We would need the bitmap's lock to get it right. Recovery + of this is not clear, so we just play safe: write it out as + unknown: if crash, _ma_bitmap_init() at next open (for example in + Recovery) will convert it to 0 and thus the first insertion will + search for free space from the file's first bitmap (0) - + under-optimal but safe. + If no crash, maria_close() will write the exact value. + */ + state_copy->state.first_bitmap_with_space= ~(ulonglong)0; + memcpy(ptr, share->open_file_name.str, open_file_name_len); + ptr+= open_file_name_len; + if (cmp_translog_addr(share->state.is_of_horizon, + checkpoint_start_log_horizon) >= 0) + { + /* + State was flushed recently, it does not hold down the log's + low-water mark and will not give avoidable work to Recovery. So we + needn't flush it. Also, it is possible that while we copied the + state above (under log's lock, without intern_lock) it was being + modified in memory or flushed to disk (without log's lock, under + intern_lock, like in maria_extra()), so our copy may be incorrect + and we should not flush it. + It may also be a share which got last_version==0 since we checked + last_version; in this case, it flushed its state and the LSN test + above will catch it. + */ + } + else + { + /* + We could do the state flush only if share->changed, but it's + tricky. 
+ Consider a maria_write() which has written REDO,UNDO, and before it + calls _ma_writeinfo() (setting share->changed=1), checkpoint + happens and sees share->changed=0, does not flush state. It is + possible that Recovery does not start from before the REDO and thus + the state is not recovered. A solution may be to set + share->changed=1 under log mutex when writing log records. + But as anyway we have another problem below, this optimization would + be of little use. + */ + /** @todo flush state only if changed since last checkpoint */ + DBUG_ASSERT(share->last_version != 0); + state_copy->state.is_of_horizon= share->state.is_of_horizon= + state_copies_horizon; + if (kfile.file >= 0) + sync_error|= + _ma_state_info_write_sub(kfile.file, &state_copy->state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET); + /* + We don't set share->changed=0 because it may interfere with a + concurrent _ma_writeinfo() doing share->changed=1 (cancel its + effect). The sad consequence is that we will flush the same state at + each checkpoint if the table was once written and then not anymore. + */ + } + } + /* + _ma_bitmap_flush_all() may wait, so don't keep intern_lock as + otherwise this would deadlock with allocate_and_write_block_record() + calling _ma_set_share_data_file_length() + */ + pthread_mutex_unlock(&share->intern_lock); + + if (!ignore_share) + { + /* + share->bitmap is valid because it's destroyed under close_lock which + we hold. + */ + if (_ma_bitmap_flush_all(share)) + { + sync_error= 1; + /** @todo all write failures should mark table corrupted */ + ma_message_no_user(0, "checkpoint bitmap page flush failed"); + } + DBUG_ASSERT(share->pagecache == maria_pagecache); + } + /* + Clean up any unused states. + TODO: Only do this call if there has been # (10?) ended transactions + since last call. + We had to release intern_lock to respect lock order with LOCK_trn_list. 
+ */ + _ma_remove_not_visible_states_with_lock(share, FALSE); + + if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME) + { + /* + maria_close() left us free the share. When it run it set share->id + to 0. As it run before we locked close_lock, we should have seen this + and so this assertion should be true: + */ + DBUG_ASSERT(ignore_share); + pthread_mutex_destroy(&share->intern_lock); + pthread_mutex_unlock(&share->close_lock); + pthread_mutex_destroy(&share->close_lock); + my_free((uchar *)share, MYF(0)); + } + else + { + /* share goes back to normal state */ + share->in_checkpoint= 0; + pthread_mutex_unlock(&share->close_lock); + } + + /* + We do the big disk writes out of intern_lock to not block other + users of this table (intern_lock is taken at the start and end of + every statement). This means that file descriptors may be invalid + (files may have been closed for example by HA_EXTRA_PREPARE_FOR_* + under Windows, or REPAIR). This should not be a problem as we use + MY_IGNORE_BADFD. Descriptors may even point to other files but then + the old blocks (of before the close) must have been flushed for sure, + so our flush will flush new blocks (of after the latest open) and that + should do no harm. + */ + /* + If CHECKPOINT_MEDIUM, this big flush below may result in a + serious write burst. Realize that all pages dirtied between the + last checkpoint and the one we are doing now, will be flushed at + next checkpoint, except those evicted by LRU eviction (depending on + the size of the page cache compared to the size of the working data + set, eviction may be rare or frequent). + We avoid that burst by anticipating: those pages are flushed + in bunches spanned regularly over the time interval between now and + the next checkpoint, by a background thread. Thus the next checkpoint + will have only little flushing to do (CHECKPOINT_MEDIUM should thus be + only a little slower than CHECKPOINT_INDIRECT). 
+ */ + + /* + PageCacheFlushConcurrencyBugs + Inside the page cache, calls to flush_pagecache_blocks_int() on the same + file are serialized. Examples of concurrency bugs which happened when we + didn't have this serialization: + - maria_chk_size() (via CHECK TABLE) happens concurrently with + Checkpoint: Checkpoint is flushing a page: it pins the page and is + pre-empted, maria_chk_size() wants to flush this page too so gets an + error because Checkpoint pinned this page. Such error makes + maria_chk_size() mark the table as corrupted. + - maria_close() happens concurrently with Checkpoint: + Checkpoint is flushing a page: it registers a request on the page, is + pre-empted ; maria_close() flushes this page too with FLUSH_RELEASE: + FLUSH_RELEASE will cause a free_block() which assumes the page is in the + LRU, but it is not (as Checkpoint registered a request). Crash. + - one thread is evicting a page of the file out of the LRU: it marks it + iPC_BLOCK_IN_SWITCH and is pre-empted. Then two other threads do flushes + of the same file concurrently (like above). Then one flusher sees the + page is in switch, removes it from changed_blocks[] and puts it in its + first_in_switch, so the other flusher will not see the page at all and + return too early. If it's maria_close() which returns too early, then + maria_close() may close the file descriptor, and the other flusher, and + the evicter will fail to write their page: corruption. + */ + + if (!ignore_share) + { + if (filter != NULL) + { + if ((flush_pagecache_blocks_with_filter(maria_pagecache, + &dfile, FLUSH_KEEP_LAZY, + filter, &filter_param) & + PCFLUSH_ERROR)) + ma_message_no_user(0, "checkpoint data page flush failed"); + if ((flush_pagecache_blocks_with_filter(maria_pagecache, + &kfile, FLUSH_KEEP_LAZY, + filter, &filter_param) & + PCFLUSH_ERROR)) + ma_message_no_user(0, "checkpoint index page flush failed"); + } + /* + fsyncs the fd, that's the loooong operation (e.g. 
max 150 fsync + per second, so if you have touched 1000 files it's 7 seconds). + */ + sync_error|= + my_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) | + my_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD)); + /* + in case of error, we continue because writing other tables to disk is + still useful. + */ + } + } + + if (sync_error) + goto err; + /* We maybe over-estimated (due to share->id==0 or last_version==0) */ + DBUG_ASSERT(str->length >= (uint)(ptr - str->str)); + str->length= (uint)(ptr - str->str); + /* + As we support max 65k tables open at a time (2-byte short id), we + assume uint is enough for the cumulated length of table names; and + LEX_STRING::length is uint. + */ + int4store(str->str, nb_stored); + error= unmark_tables= 0; + +err: + if (unlikely(unmark_tables)) + { + /* maria_close() uses THR_LOCK_maria from start to end */ + pthread_mutex_lock(&THR_LOCK_maria); + for (i= 0; i < nb; i++) + { + MARIA_SHARE *share= distinct_shares[i]; + if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME) + { + /* maria_close() left us to free the share */ + pthread_mutex_destroy(&share->intern_lock); + my_free((uchar *)share, MYF(0)); + } + else + { + /* share goes back to normal state */ + share->in_checkpoint= 0; + } + } + pthread_mutex_unlock(&THR_LOCK_maria); + } + my_free((uchar *)distinct_shares, MYF(MY_ALLOW_ZERO_PTR)); + my_free((uchar *)state_copies, MYF(MY_ALLOW_ZERO_PTR)); + DBUG_RETURN(error); +} diff --git a/storage/maria/ma_checkpoint.h b/storage/maria/ma_checkpoint.h new file mode 100644 index 00000000000..126f8111a23 --- /dev/null +++ b/storage/maria/ma_checkpoint.h @@ -0,0 +1,92 @@ +/* Copyright (C) 2006,2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3071 Maria checkpoint + First version written by Guilhem Bichot on 2006-04-27. + Does not compile yet. +*/ + +/* This is the interface of this module. */ + +typedef enum enum_ma_checkpoint_level { + CHECKPOINT_NONE= 0, + /* just write dirty_pages, transactions table and sync files */ + CHECKPOINT_INDIRECT, + /* also flush all dirty pages which were already dirty at prev checkpoint */ + CHECKPOINT_MEDIUM, + /* also flush all dirty pages */ + CHECKPOINT_FULL +} CHECKPOINT_LEVEL; + +C_MODE_START +int ma_checkpoint_init(ulong interval); +void ma_checkpoint_end(void); +int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait); +C_MODE_END + +/** + @brief reads some LSNs with special trickery + + If a 64-bit variable transitions between both halves being zero to both + halves being non-zero, and back, this function can be used to do a read of + it (without mutex, without atomic load) which always produces a correct + (though maybe slightly old) value (even on 32-bit CPUs). The value is at + least as new as the latest mutex unlock done by the calling thread. + The assumption is that the system sets both 4-byte halves either at the + same time, or one after the other (in any order), but NOT some bytes of the + first half then some bytes of the second half then the rest of bytes of the + first half. With this assumption, the function can detect when it is + seeing an inconsistent value. 
+ + @param LSN pointer to the LSN variable to read + + @return LSN part (most significant byte always 0) +*/ +#if ( SIZEOF_CHARP >= 8 ) +/* 64-bit CPU, 64-bit reads are atomic */ +#define lsn_read_non_atomic LSN_WITH_FLAGS_TO_LSN +#else +static inline LSN lsn_read_non_atomic_32(const volatile LSN *x) +{ + /* + 32-bit CPU, 64-bit reads may give a mixed of old half and new half (old + low bits and new high bits, or the contrary). + */ + for (;;) /* loop until no atomicity problems */ + { + /* + Remove most significant byte in case this is a LSN_WITH_FLAGS object. + Those flags in TRN::first_undo_lsn break the condition on transitions so + they must be removed below. + */ + LSN y= LSN_WITH_FLAGS_TO_LSN(*x); + if (likely((y == LSN_IMPOSSIBLE) || LSN_VALID(y))) + return y; + } +} +#define lsn_read_non_atomic(x) lsn_read_non_atomic_32(&x) +#endif + +/** + prints a message from a task not connected to any user (checkpoint + and recovery for example). + + @param level 0 if error, ME_JUST_WARNING if warning, + ME_JUST_INFO if info + @param sentence text to write +*/ +#define ma_message_no_user(level, sentence) \ + my_printf_error(HA_ERR_GENERIC, "Aria engine: %s", MYF(level), sentence) diff --git a/storage/maria/ma_checksum.c b/storage/maria/ma_checksum.c new file mode 100644 index 00000000000..61ec638053a --- /dev/null +++ b/storage/maria/ma_checksum.c @@ -0,0 +1,89 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Calculate a checksum for a row */ + +#include "maria_def.h" + +/** + Calculate a checksum for the record + + _ma_checksum() + @param info Maria handler + @param record Record + + @note + To ensure that the checksum is independent of the row format + we need to always calculate the checksum in the original field order. + + @return checksum +*/ + +ha_checksum _ma_checksum(MARIA_HA *info, const uchar *record) +{ + ha_checksum crc=0; + uint i,end; + MARIA_COLUMNDEF *base_column= info->s->columndef; + uint16 *column_nr= info->s->column_nr; + + if (info->s->base.null_bytes) + crc= my_checksum(crc, record, info->s->base.null_bytes); + + for (i= 0, end= info->s->base.fields ; i < end ; i++) + { + MARIA_COLUMNDEF *column= base_column + column_nr[i]; + const uchar *pos; + ulong length; + + if (record[column->null_pos] & column->null_bit) + continue; /* Null field */ + + pos= record + column->offset; + switch (column->type) { + case FIELD_BLOB: + { + uint blob_size_length= column->length- portable_sizeof_char_ptr; + length= _ma_calc_blob_length(blob_size_length, pos); + if (length) + { + memcpy((char*) &pos, pos + blob_size_length, sizeof(char*)); + crc= my_checksum(crc, pos, length); + } + continue; + } + case FIELD_VARCHAR: + { + uint pack_length= column->fill_length; + if (pack_length == 1) + length= (ulong) *pos; + else + length= uint2korr(pos); + pos+= pack_length; /* Skip length information */ + break; + } + default: + length= column->length; + break; + } + crc= my_checksum(crc, pos, length); + } + return crc; +} + + +ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *pos) +{ + return my_checksum(0, pos, info->s->base.reclength); +} diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c new file mode 100644 index 00000000000..df525d45d14 
--- /dev/null +++ b/storage/maria/ma_close.c @@ -0,0 +1,208 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* close a isam-database */ +/* + TODO: + We need to have a separate mutex on the closed file to allow other threads + to open other files during the time we flush the cache and close this file +*/ + +#include "maria_def.h" + +int maria_close(register MARIA_HA *info) +{ + int error=0,flag; + my_bool share_can_be_freed= FALSE; + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_close"); + DBUG_PRINT("enter",("base: 0x%lx reopen: %u locks: %u", + (long) info, (uint) share->reopen, + (uint) share->tot_locks)); + + /* Check that we have unlocked key delete-links properly */ + DBUG_ASSERT(info->key_del_used == 0); + + pthread_mutex_lock(&THR_LOCK_maria); + if (info->lock_type == F_EXTRA_LCK) + info->lock_type=F_UNLCK; /* HA_EXTRA_NO_USER_CHANGE */ + + if (share->reopen == 1 && share->kfile.file >= 0) + _ma_decrement_open_count(info); + + if (info->lock_type != F_UNLCK) + { + if (maria_lock_database(info,F_UNLCK)) + error=my_errno; + } + pthread_mutex_lock(&share->close_lock); + pthread_mutex_lock(&share->intern_lock); + + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + share->r_locks--; + share->tot_locks--; + } + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + if 
(end_io_cache(&info->rec_cache)) + error=my_errno; + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + } + flag= !--share->reopen; + maria_open_list=list_delete(maria_open_list,&info->open_list); + + my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + (*share->end)(info); + + if (flag) + { + /* Last close of file; Flush everything */ + + /* Check that we don't have any dangling pointers from the transaction */ + DBUG_ASSERT(share->in_trans == 0); + + if (share->kfile.file >= 0) + { + if ((*share->once_end)(share)) + error= my_errno; + if (flush_pagecache_blocks(share->pagecache, &share->kfile, + ((share->temporary || share->deleting) ? + FLUSH_IGNORE_CHANGED : + FLUSH_RELEASE))) + error= my_errno; +#ifdef HAVE_MMAP + if (share->file_map) + _ma_unmap_file(info); +#endif + /* + If we are crashed, we can safely flush the current state as it will + not change the crashed state. + We can NOT write the state in other cases as other threads + may be using the file at this point + IF using --external-locking, which does not apply to Maria. + */ + if (((share->changed && share->base.born_transactional) || + maria_is_crashed(info))) + { + /* + State must be written to file as it was not done at table's + unlocking. + */ + if (_ma_state_info_write(share, MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)) + error= my_errno; + } + /* + File must be synced as it is going out of the maria_open_list and so + becoming unknown to future Checkpoints. 
+ */ + if (share->now_transactional && my_sync(share->kfile.file, MYF(MY_WME))) + error= my_errno; + if (my_close(share->kfile.file, MYF(0))) + error= my_errno; + } +#ifdef THREAD + thr_lock_delete(&share->lock); + (void) pthread_mutex_destroy(&share->key_del_lock); + { + int i,keys; + keys = share->state.header.keys; + VOID(rwlock_destroy(&share->mmap_lock)); + for(i=0; i<keys; i++) { + VOID(rwlock_destroy(&share->keyinfo[i].root_lock)); + } + } +#endif + DBUG_ASSERT(share->now_transactional == share->base.born_transactional); + /* + We assign -1 because checkpoint does not need to flush (in case we + have concurrent checkpoint if no then we do not need it here also) + */ + share->kfile.file= -1; + + /* + Remember share->history for future opens + + We have to unlock share->intern_lock then lock it after + LOCK_trn_list (trnman_lock()) to avoid dead locks. + */ + pthread_mutex_unlock(&share->intern_lock); + _ma_remove_not_visible_states_with_lock(share, TRUE); + pthread_mutex_lock(&share->intern_lock); + + if (share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME) + { + /* we cannot my_free() the share, Checkpoint would see a bad pointer */ + share->in_checkpoint|= MARIA_CHECKPOINT_SHOULD_FREE_ME; + } + else + share_can_be_freed= TRUE; + + if (share->state_history) + { + MARIA_STATE_HISTORY_CLOSED *history; + /* + Here we ignore the unlikely case that we don't have memory to + store the state. In the worst case what happens is that any transaction + that tries to access this table will get a wrong status information. 
+ */ + if ((history= (MARIA_STATE_HISTORY_CLOSED *) + my_malloc(sizeof(*history), MYF(MY_WME)))) + { + history->create_rename_lsn= share->state.create_rename_lsn; + history->state_history= share->state_history; + if (my_hash_insert(&maria_stored_state, (uchar*) history)) + my_free(history, MYF(0)); + } + /* Marker for concurrent checkpoint */ + share->state_history= 0; + } + } + pthread_mutex_unlock(&THR_LOCK_maria); + pthread_mutex_unlock(&share->intern_lock); + pthread_mutex_unlock(&share->close_lock); + if (share_can_be_freed) + { + (void) pthread_mutex_destroy(&share->intern_lock); + (void) pthread_mutex_destroy(&share->close_lock); + (void) pthread_cond_destroy(&share->key_del_cond); + my_free((uchar *)share, MYF(0)); + /* + If share cannot be freed, it's because checkpoint has previously + recorded to include this share in the checkpoint and so is soon going to + look at some of its content (share->in_checkpoint/id/last_version). + */ + } + my_free(info->ftparser_param, MYF(MY_ALLOW_ZERO_PTR)); + if (info->dfile.file >= 0) + { + /* + This is outside of mutex so would confuse a concurrent + Checkpoint. Fortunately in BLOCK_RECORD we close earlier under mutex. + */ + if (my_close(info->dfile.file, MYF(0))) + error= my_errno; + } + + delete_dynamic(&info->pinned_pages); + my_free(info, MYF(0)); + + if (error) + { + DBUG_PRINT("error", ("Got error on close: %d", my_errno)); + DBUG_RETURN(my_errno= error); + } + DBUG_RETURN(0); +} /* maria_close */ diff --git a/storage/maria/ma_commit.c b/storage/maria/ma_commit.c new file mode 100644 index 00000000000..70bc668a220 --- /dev/null +++ b/storage/maria/ma_commit.c @@ -0,0 +1,129 @@ +/* Copyright (C) 2007-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#include "trnman.h" + +/** + writes a COMMIT record to log and commits transaction in memory + + @param trn transaction + + @return Operation status + @retval 0 ok + @retval 1 error (disk error or out of memory) +*/ + +int ma_commit(TRN *trn) +{ + int res; + LSN commit_lsn; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS]; + DBUG_ENTER("ma_commit"); + + DBUG_ASSERT(trn->rec_lsn == LSN_IMPOSSIBLE); + if (trn->undo_lsn == 0) /* no work done, rollback (cheaper than commit) */ + DBUG_RETURN(trnman_rollback_trn(trn)); + /* + - if COMMIT record is written before trnman_commit_trn(): + if Checkpoint comes in the middle it will see trn is not committed, + then if crash, Recovery might roll back trn (if min(rec_lsn) is after + COMMIT record) and this is not an issue as + * transaction's updates were not made visible to other transactions + * "commit ok" was not sent to client + Alternatively, Recovery might commit trn (if min(rec_lsn) is before COMMIT + record), which is ok too. All in all it means that "trn committed" is not + 100% equal to "COMMIT record written". + - if COMMIT record is written after trnman_commit_trn(): + if crash happens between the two, trn will be rolled back which is an + issue (transaction's updates were made visible to other transactions). + So we need to go the first way. + + Note that we have to use | here to ensure that all calls are made. + */ + + /* + We do not store "thd->transaction.xid_state.xid" for now, it will be + needed only when we support XA. 
+ */ + res= (translog_write_record(&commit_lsn, LOGREC_COMMIT, + trn, NULL, 0, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL) | + translog_flush(commit_lsn)); + + DBUG_EXECUTE_IF("maria_sleep_in_commit", + { + DBUG_PRINT("info", ("maria_sleep_in_commit")); + sleep(3); + }); + res|= trnman_commit_trn(trn); + + + /* + Note: if trnman_commit_trn() fails above, we have already + written the COMMIT record, so Checkpoint and Recovery will see the + transaction as committed. + */ + DBUG_RETURN(res); +} + + +/** + Writes a COMMIT record for a transaciton associated with a file + + @param info Maria handler + + @return Operation status + @retval 0 ok + @retval # error (disk error or out of memory) +*/ + +int maria_commit(MARIA_HA *info) +{ + return info->s->now_transactional ? ma_commit(info->trn) : 0; +} + + +/** + Starts a transaction on a file handle + + @param info Maria handler + + @return Operation status + @retval 0 ok + @retval # Error code. + + @note this can be used only in single-threaded programs (tests), + because we create a transaction (trnman_new_trn) with WT_THD=0. + XXX it needs to be fixed when we'll start using maria_begin from SQL. +*/ + +int maria_begin(MARIA_HA *info) +{ + DBUG_ENTER("maria_begin"); + + if (info->s->now_transactional) + { + TRN *trn= trnman_new_trn(0); + if (unlikely(!trn)) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + + DBUG_PRINT("info", ("TRN set to 0x%lx", (ulong) trn)); + _ma_set_trn_for_table(info, trn); + } + DBUG_RETURN(0); +} + diff --git a/storage/maria/ma_commit.h b/storage/maria/ma_commit.h new file mode 100644 index 00000000000..2c57c73fd7a --- /dev/null +++ b/storage/maria/ma_commit.h @@ -0,0 +1,18 @@ +/* Copyright (C) 2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +C_MODE_START +int ma_commit(TRN *trn); +C_MODE_END diff --git a/storage/maria/ma_control_file.c b/storage/maria/ma_control_file.c new file mode 100644 index 00000000000..6f9018885e9 --- /dev/null +++ b/storage/maria/ma_control_file.c @@ -0,0 +1,607 @@ +/* Copyright (C) 2007 MySQL AB & Guilhem Bichot & Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3234 Maria control file + First version written by Guilhem Bichot on 2006-04-27. +*/ + +#ifndef EXTRACT_DEFINITIONS +#include "maria_def.h" +#include "ma_checkpoint.h" +#endif + +/* + A control file contains the following objects: + +Start of create time variables (at start of file): + - Magic string (including version number of Maria control file) + - Uuid + - Size of create time part + - Size of dynamic part + - Maria block size +..... 
Here we can add new variables without changing format + - Checksum of create time part (last of block) + +Start of changeable part: + - Checksum of changeable part + - LSN of last checkpoint + - Number of last log file + - Max trid in control file (since Maria 1.5 May 2008) + - Number of consecutive recovery failures (since Maria 1.5 May 2008) +..... Here we can add new variables without changing format + +The idea is that one can add new variables to the control file and still +use it with old program versions. If one needs to do an incompatible change +one should increment the control file version number. +*/ + +/* Total size should be < sector size for atomic write operation */ +#define CF_MAX_SIZE 512 +#define CF_MIN_SIZE (CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + \ + CF_CHECKSUM_SIZE * 2 + CF_LSN_SIZE + CF_FILENO_SIZE) + +/* Create time variables */ +#define CF_MAGIC_STRING "\xfe\xfe\xc" +#define CF_MAGIC_STRING_OFFSET 0 +#define CF_MAGIC_STRING_SIZE (sizeof(CF_MAGIC_STRING)-1) +#define CF_VERSION_OFFSET (CF_MAGIC_STRING_OFFSET + CF_MAGIC_STRING_SIZE) +#define CF_VERSION_SIZE 1 +#define CF_UUID_OFFSET (CF_VERSION_OFFSET + CF_VERSION_SIZE) +#define CF_UUID_SIZE MY_UUID_SIZE +#define CF_CREATE_TIME_SIZE_OFFSET (CF_UUID_OFFSET + CF_UUID_SIZE) +#define CF_SIZE_SIZE 2 +#define CF_CHANGEABLE_SIZE_OFFSET (CF_CREATE_TIME_SIZE_OFFSET + CF_SIZE_SIZE) +#define CF_BLOCKSIZE_OFFSET (CF_CHANGEABLE_SIZE_OFFSET + CF_SIZE_SIZE) +#define CF_BLOCKSIZE_SIZE 2 + +#define CF_CREATE_TIME_TOTAL_SIZE (CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + \ + CF_CHECKSUM_SIZE) + +/* + Start of the part that changes during execution + This is stored at offset uint2korr(file[CF_CHANGEABLE_SIZE]) +*/ +#define CF_CHECKSUM_OFFSET 0 +#define CF_CHECKSUM_SIZE 4 +#define CF_LSN_OFFSET (CF_CHECKSUM_OFFSET + CF_CHECKSUM_SIZE) +#define CF_LSN_SIZE LSN_STORE_SIZE +#define CF_FILENO_OFFSET (CF_LSN_OFFSET + CF_LSN_SIZE) +#define CF_FILENO_SIZE 4 +#define CF_MAX_TRID_OFFSET (CF_FILENO_OFFSET + CF_FILENO_SIZE) 
+#define CF_MAX_TRID_SIZE TRANSID_SIZE +#define CF_RECOV_FAIL_OFFSET (CF_MAX_TRID_OFFSET + CF_MAX_TRID_SIZE) +#define CF_RECOV_FAIL_SIZE 1 +#define CF_CHANGEABLE_TOTAL_SIZE (CF_RECOV_FAIL_OFFSET + CF_RECOV_FAIL_SIZE) + +/* + The following values should not be changed, except when changing version + number of the maria control file. These are the minimum sizes of the + parts the code can handle. +*/ + +#define CF_MIN_CREATE_TIME_TOTAL_SIZE \ +(CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + CF_CHECKSUM_SIZE) +#define CF_MIN_CHANGEABLE_TOTAL_SIZE \ +(CF_FILENO_OFFSET + CF_FILENO_SIZE) + +#ifndef EXTRACT_DEFINITIONS + +/* This module owns these two vars. */ +/** + This LSN serves for the two-checkpoint rule, and also to find the + checkpoint record when doing a recovery. +*/ +LSN last_checkpoint_lsn= LSN_IMPOSSIBLE; +uint32 last_logno= FILENO_IMPOSSIBLE; +/** + The maximum transaction id given to a transaction. It is only updated at + clean shutdown (in case of crash, logs have better information). +*/ +TrID max_trid_in_control_file= 0; + +/** + Number of consecutive log or recovery failures. Reset to 0 after recovery's + success. +*/ +uint8 recovery_failures= 0; + +/** + @brief If log's lock should be asserted when writing to control file. + + Can be re-used by any function which needs to be thread-safe except when + it is called at startup. 
+*/ +my_bool maria_multi_threaded= FALSE; +/** @brief if currently doing a recovery */ +my_bool maria_in_recovery= FALSE; + +/** + Control file is less then 512 bytes (a disk sector), + to be as atomic as possible +*/ +static int control_file_fd= -1; + +static uint cf_create_time_size; +static uint cf_changeable_size; + +/** + @brief Create Maria control file +*/ + +static CONTROL_FILE_ERROR create_control_file(const char *name, + int open_flags) +{ + uint32 sum; + uchar buffer[CF_CREATE_TIME_TOTAL_SIZE]; + DBUG_ENTER("maria_create_control_file"); + + if ((control_file_fd= my_create(name, 0, + open_flags, + MYF(MY_SYNC_DIR | MY_WME))) < 0) + DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR); + + /* Reset variables, as we are creating the file */ + cf_create_time_size= CF_CREATE_TIME_TOTAL_SIZE; + cf_changeable_size= CF_CHANGEABLE_TOTAL_SIZE; + + /* Create unique uuid for the control file */ + my_uuid_init((ulong) &buffer, (ulong) &maria_uuid); + my_uuid(maria_uuid); + + /* Prepare and write the file header */ + memcpy(buffer, CF_MAGIC_STRING, CF_MAGIC_STRING_SIZE); + buffer[CF_VERSION_OFFSET]= CONTROL_FILE_VERSION; + memcpy(buffer + CF_UUID_OFFSET, maria_uuid, CF_UUID_SIZE); + int2store(buffer + CF_CREATE_TIME_SIZE_OFFSET, cf_create_time_size); + int2store(buffer + CF_CHANGEABLE_SIZE_OFFSET, cf_changeable_size); + + /* Write create time variables */ + int2store(buffer + CF_BLOCKSIZE_OFFSET, maria_block_size); + + /* Store checksum for create time parts */ + sum= (uint32) my_checksum(0, buffer, cf_create_time_size - + CF_CHECKSUM_SIZE); + int4store(buffer + cf_create_time_size - CF_CHECKSUM_SIZE, sum); + + if (my_pwrite(control_file_fd, buffer, cf_create_time_size, + 0, MYF(MY_FNABP | MY_WME))) + DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR); + + /* + To be safer we should make sure that there are no logs or data/index + files around (indeed it could be that the control file alone was deleted + or not restored, and we should not go on with life at this point). 
+ + Things should still be relatively safe as if someone tries to use + an old table with a new control file the different uuid:s between + the files will cause ma_open() to generate an HA_ERR_OLD_FILE + error. When used from mysqld this will cause the table to be open + in repair mode which will remove all dependencies between the + table and the old control file. + + We could have a tool which can rebuild the control file, by reading the + directory of logs, finding the newest log, reading it to find last + checkpoint... Slow but can save your db. For this to be possible, we + must always write to the control file right after writing the checkpoint + log record, and do nothing in between (i.e. the checkpoint must be + usable as soon as it has been written to the log). + */ + + /* init the file with these "undefined" values */ + DBUG_RETURN(ma_control_file_write_and_force(LSN_IMPOSSIBLE, + FILENO_IMPOSSIBLE, 0, 0)); +} + + +/** + Locks control file exclusively. This is kept for the duration of the engine + process, to prevent another Maria instance to write to our logs or control + file. +*/ + +static int lock_control_file(const char *name) +{ + uint retry= 0; + /* + On Windows, my_lock() uses locking() which is mandatory locking and so + prevents maria-recovery.test from copying the control file. And in case of + crash, it may take a while for Windows to unlock file, causing downtime. + */ + /** + @todo BUG We should explore my_sopen(_SH_DENYWRD) to open or create the + file under Windows. + */ +#ifndef __WIN__ + /* + We can't here use the automatic wait in my_lock() as the alarm thread + may not yet exists. + */ + while (my_lock(control_file_fd, F_WRLCK, 0L, F_TO_EOF, + MYF(MY_SEEK_NOT_DONE | MY_FORCE_LOCK | MY_NO_WAIT))) + { + if (retry == 0) + my_printf_error(HA_ERR_INITIALIZATION, + "Can't lock aria control file '%s' for exclusive use, " + "error: %d. 
Will retry for %d seconds", 0, + name, my_errno, MARIA_MAX_CONTROL_FILE_LOCK_RETRY); + if (retry++ > MARIA_MAX_CONTROL_FILE_LOCK_RETRY) + return 1; + sleep(1); + } +#endif + return 0; +} + + +/* + @brief Initialize control file subsystem + + Looks for the control file. If none and creation is requested, creates file. + If present, reads it to find out last checkpoint's LSN and last log, updates + the last_checkpoint_lsn and last_logno global variables. + Called at engine's start. + + @note + The format of the control file is defined in the comments and defines + at the start of this file. + + @param create_if_missing create file if not found + + @return Operation status + @retval 0 OK + @retval 1 Error (in which case the file is left closed) +*/ + +CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing, + my_bool print_error) +{ + uchar buffer[CF_MAX_SIZE]; + char name[FN_REFLEN], errmsg_buff[256]; + const char *errmsg, *lock_failed_errmsg= "Could not get an exclusive lock;" + " file is probably in use by another process"; + uint new_cf_create_time_size, new_cf_changeable_size, new_block_size; + my_off_t file_size; + int open_flags= O_BINARY | /*O_DIRECT |*/ O_RDWR; + int error= CONTROL_FILE_UNKNOWN_ERROR; + DBUG_ENTER("ma_control_file_open"); + + /* + If you change sizes in the #defines, you at least have to change the + "*store" and "*korr" calls in this file, and can even create backward + compatibility problems. Beware! 
+ */ + DBUG_ASSERT(CF_LSN_SIZE == (3+4)); + DBUG_ASSERT(CF_FILENO_SIZE == 4); + + if (control_file_fd >= 0) /* already open */ + DBUG_RETURN(0); + + if (fn_format(name, CONTROL_FILE_BASE_NAME, + maria_data_root, "", MYF(MY_WME)) == NullS) + DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR); + + if (my_access(name,F_OK)) + { + CONTROL_FILE_ERROR create_error; + if (!create_if_missing) + { + error= CONTROL_FILE_MISSING; + errmsg= "Can't find file"; + goto err; + } + if ((create_error= create_control_file(name, open_flags))) + { + error= create_error; + errmsg= "Can't create file"; + goto err; + } + if (lock_control_file(name)) + { + errmsg= lock_failed_errmsg; + goto err; + } + goto ok; + } + + /* Otherwise, file exists */ + + if ((control_file_fd= my_open(name, open_flags, MYF(MY_WME))) < 0) + { + errmsg= "Can't open file"; + goto err; + } + + if (lock_control_file(name)) /* lock it before reading content */ + { + errmsg= lock_failed_errmsg; + goto err; + } + + file_size= my_seek(control_file_fd, 0, SEEK_END, MYF(MY_WME)); + if (file_size == MY_FILEPOS_ERROR) + { + errmsg= "Can't read size"; + goto err; + } + if (file_size < CF_MIN_SIZE) + { + /* + Given that normally we write only a sector and it's atomic, the only + possibility for a file to be of too short size is if we crashed at the + very first startup, between file creation and file write. Quite unlikely + (and can be made even more unlikely by doing this: create a temp file, + write it, and then rename it to be the control file). + What's more likely is if someone forgot to restore the control file, + just did a "touch control" to try to get Maria to start, or if the + disk/filesystem has a problem. + So let's be rigid. 
+ */ + error= CONTROL_FILE_TOO_SMALL; + errmsg= "Size of control file is smaller than expected"; + goto err; + } + + /* Check if control file is unexpectedly big */ + if (file_size > CF_MAX_SIZE) + { + error= CONTROL_FILE_TOO_BIG; + errmsg= "File size bigger than expected"; + goto err; + } + + if (my_pread(control_file_fd, buffer, (size_t)file_size, 0, MYF(MY_FNABP))) + { + errmsg= "Can't read file"; + goto err; + } + + if (memcmp(buffer + CF_MAGIC_STRING_OFFSET, + CF_MAGIC_STRING, CF_MAGIC_STRING_SIZE)) + { + error= CONTROL_FILE_BAD_MAGIC_STRING; + errmsg= "Missing valid id at start of file. File is not a valid aria control file"; + goto err; + } + + if (buffer[CF_VERSION_OFFSET] > CONTROL_FILE_VERSION) + { + error= CONTROL_FILE_BAD_VERSION; + sprintf(errmsg_buff, "File is from a future aria system: %d. Current version is: %d", + (int) buffer[CF_VERSION_OFFSET], CONTROL_FILE_VERSION); + errmsg= errmsg_buff; + goto err; + } + + new_cf_create_time_size= uint2korr(buffer + CF_CREATE_TIME_SIZE_OFFSET); + new_cf_changeable_size= uint2korr(buffer + CF_CHANGEABLE_SIZE_OFFSET); + + if (new_cf_create_time_size < CF_MIN_CREATE_TIME_TOTAL_SIZE || + new_cf_changeable_size < CF_MIN_CHANGEABLE_TOTAL_SIZE || + new_cf_create_time_size + new_cf_changeable_size != file_size) + { + error= CONTROL_FILE_INCONSISTENT_INFORMATION; + errmsg= "Sizes stored in control file are inconsistent"; + goto err; + } + + new_block_size= uint2korr(buffer + CF_BLOCKSIZE_OFFSET); + if (new_block_size != maria_block_size && maria_block_size) + { + error= CONTROL_FILE_WRONG_BLOCKSIZE; + sprintf(errmsg_buff, + "Block size in control file (%u) is different than given aria_block_size: %u", + new_block_size, (uint) maria_block_size); + errmsg= errmsg_buff; + goto err; + } + maria_block_size= new_block_size; + + if (my_checksum(0, buffer, new_cf_create_time_size - CF_CHECKSUM_SIZE) != + uint4korr(buffer + new_cf_create_time_size - CF_CHECKSUM_SIZE)) + { + error= CONTROL_FILE_BAD_HEAD_CHECKSUM; + errmsg= 
"Fixed part checksum mismatch"; + goto err; + } + + if (my_checksum(0, buffer + new_cf_create_time_size + CF_CHECKSUM_SIZE, + new_cf_changeable_size - CF_CHECKSUM_SIZE) != + uint4korr(buffer + new_cf_create_time_size)) + { + error= CONTROL_FILE_BAD_CHECKSUM; + errmsg= "Changeable part (end of control file) checksum mismatch"; + goto err; + } + + memcpy(maria_uuid, buffer + CF_UUID_OFFSET, CF_UUID_SIZE); + cf_create_time_size= new_cf_create_time_size; + cf_changeable_size= new_cf_changeable_size; + last_checkpoint_lsn= lsn_korr(buffer + new_cf_create_time_size + + CF_LSN_OFFSET); + last_logno= uint4korr(buffer + new_cf_create_time_size + CF_FILENO_OFFSET); + if (new_cf_changeable_size >= (CF_MAX_TRID_OFFSET + CF_MAX_TRID_SIZE)) + max_trid_in_control_file= + transid_korr(buffer + new_cf_create_time_size + CF_MAX_TRID_OFFSET); + if (new_cf_changeable_size >= (CF_RECOV_FAIL_OFFSET + CF_RECOV_FAIL_SIZE)) + recovery_failures= + (buffer + new_cf_create_time_size + CF_RECOV_FAIL_OFFSET)[0]; + +ok: + DBUG_RETURN(0); + +err: + if (print_error) + my_printf_error(HA_ERR_INITIALIZATION, + "Got error '%s' when trying to use aria control file " + "'%s'", 0, errmsg, name); + ma_control_file_end(); /* will unlock file if needed */ + DBUG_RETURN(error); +} + + +/* + Write information durably to the control file; stores this information into + the last_checkpoint_lsn, last_logno, max_trid_in_control_file, + recovery_failures global variables. + Called when we have created a new log (after syncing this log's creation), + when we have written a checkpoint (after syncing this log record), at + shutdown (for storing trid in case logs are soon removed by user), and + before and after recovery (to store recovery_failures). + Variables last_checkpoint_lsn and last_logno must be protected by caller + using log's lock, unless this function is called at startup. 
+ + SYNOPSIS + ma_control_file_write_and_force() + last_checkpoint_lsn_arg LSN of last checkpoint + last_logno_arg last log file number + max_trid_arg maximum transaction longid + recovery_failures_arg consecutive recovery failures + + NOTE + We always want to do one single my_pwrite() here to be as atomic as + possible. + + RETURN + 0 - OK + 1 - Error +*/ + +int ma_control_file_write_and_force(LSN last_checkpoint_lsn_arg, + uint32 last_logno_arg, + TrID max_trid_arg, + uint8 recovery_failures_arg) +{ + uchar buffer[CF_MAX_SIZE]; + uint32 sum; + my_bool no_need_sync; + DBUG_ENTER("ma_control_file_write_and_force"); + + /* + We don't need to sync if this is just an increase of + recovery_failures: it's even good if that counter is not increased on disk + in case of power or hardware failure (less false positives when removing + logs). + */ + no_need_sync= ((last_checkpoint_lsn == last_checkpoint_lsn_arg) && + (last_logno == last_logno_arg) && + (max_trid_in_control_file == max_trid_arg) && + (recovery_failures_arg > 0)); + + if (control_file_fd < 0) + DBUG_RETURN(1); + +#ifndef DBUG_OFF + if (maria_multi_threaded) + translog_lock_handler_assert_owner(); +#endif + + lsn_store(buffer + CF_LSN_OFFSET, last_checkpoint_lsn_arg); + int4store(buffer + CF_FILENO_OFFSET, last_logno_arg); + transid_store(buffer + CF_MAX_TRID_OFFSET, max_trid_arg); + (buffer + CF_RECOV_FAIL_OFFSET)[0]= recovery_failures_arg; + + if (cf_changeable_size > CF_CHANGEABLE_TOTAL_SIZE) + { + /* + More room than needed for us. Must be a newer version. Clear part which + we cannot maintain, so that any future version notices we didn't + maintain its extra data. 
+ */ + uint zeroed= cf_changeable_size - CF_CHANGEABLE_TOTAL_SIZE; + char msg[150]; + bzero(buffer + CF_CHANGEABLE_TOTAL_SIZE, zeroed); + my_snprintf(msg, sizeof(msg), + "Control file must be from a newer version; zero-ing out %u" + " unknown bytes in control file at offset %u", zeroed, + cf_changeable_size + cf_create_time_size); + ma_message_no_user(ME_JUST_WARNING, msg); + } + else + { + /* not enough room for what we need to store: enlarge */ + cf_changeable_size= CF_CHANGEABLE_TOTAL_SIZE; + } + /* Note that the create-time portion is not touched */ + + /* Checksum is stored first */ + compile_time_assert(CF_CHECKSUM_OFFSET == 0); + sum= my_checksum(0, buffer + CF_CHECKSUM_SIZE, + cf_changeable_size - CF_CHECKSUM_SIZE); + int4store(buffer, sum); + + if (my_pwrite(control_file_fd, buffer, cf_changeable_size, + cf_create_time_size, MYF(MY_FNABP | MY_WME)) || + (!no_need_sync && my_sync(control_file_fd, MYF(MY_WME)))) + DBUG_RETURN(1); + + last_checkpoint_lsn= last_checkpoint_lsn_arg; + last_logno= last_logno_arg; + max_trid_in_control_file= max_trid_arg; + recovery_failures= recovery_failures_arg; + + cf_changeable_size= CF_CHANGEABLE_TOTAL_SIZE; /* no more warning */ + DBUG_RETURN(0); +} + + +/* + Free resources taken by control file subsystem + + SYNOPSIS + ma_control_file_end() +*/ + +int ma_control_file_end(void) +{ + int close_error; + DBUG_ENTER("ma_control_file_end"); + + if (control_file_fd < 0) /* already closed */ + DBUG_RETURN(0); + +#ifndef __WIN__ + (void) my_lock(control_file_fd, F_UNLCK, 0L, F_TO_EOF, + MYF(MY_SEEK_NOT_DONE | MY_FORCE_LOCK)); +#endif + + close_error= my_close(control_file_fd, MYF(MY_WME)); + /* + As my_close() frees structures even if close() fails, we do the same, + i.e. we mark the file as closed in all cases. 
+ */ + control_file_fd= -1; + /* + As this module owns these variables, closing the module forbids access to + them (just a safety): + */ + last_checkpoint_lsn= LSN_IMPOSSIBLE; + last_logno= FILENO_IMPOSSIBLE; + max_trid_in_control_file= recovery_failures= 0; + + DBUG_RETURN(close_error); +} + + +/** + Tells if control file is initialized. +*/ + +my_bool ma_control_file_inited(void) +{ + return (control_file_fd >= 0); +} + +#endif /* EXTRACT_DEFINITIONS */ diff --git a/storage/maria/ma_control_file.h b/storage/maria/ma_control_file.h new file mode 100644 index 00000000000..f828ae69c6d --- /dev/null +++ b/storage/maria/ma_control_file.h @@ -0,0 +1,74 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3234 Maria control file + First version written by Guilhem Bichot on 2006-04-27. +*/ + +#ifndef _ma_control_file_h +#define _ma_control_file_h + +#define CONTROL_FILE_BASE_NAME "aria_log_control" +/* + Major version for control file. Should only be changed when doing + big changes that made the new control file incompatible with all + older versions of Maria. 
+*/ +#define CONTROL_FILE_VERSION 1 + +/* Here is the interface of this module */ + +/* + LSN of the last checkpoint + (if last_checkpoint_lsn == LSN_IMPOSSIBLE then there was never a checkpoint) +*/ +extern LSN last_checkpoint_lsn; +/* + Last log number (if last_logno == FILENO_IMPOSSIBLE then there is no log + file yet) +*/ +extern uint32 last_logno; + +extern TrID max_trid_in_control_file; + +extern uint8 recovery_failures; + +extern my_bool maria_multi_threaded, maria_in_recovery; + +typedef enum enum_control_file_error { + CONTROL_FILE_OK= 0, + CONTROL_FILE_TOO_SMALL, + CONTROL_FILE_TOO_BIG, + CONTROL_FILE_BAD_MAGIC_STRING, + CONTROL_FILE_BAD_VERSION, + CONTROL_FILE_BAD_CHECKSUM, + CONTROL_FILE_BAD_HEAD_CHECKSUM, + CONTROL_FILE_MISSING, + CONTROL_FILE_INCONSISTENT_INFORMATION, + CONTROL_FILE_WRONG_BLOCKSIZE, + CONTROL_FILE_UNKNOWN_ERROR /* any other error */ +} CONTROL_FILE_ERROR; + +C_MODE_START +CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing, + my_bool print_error); +int ma_control_file_write_and_force(LSN last_checkpoint_lsn_arg, + uint32 last_logno_arg, TrID max_trid_arg, + uint8 recovery_failures_arg); +int ma_control_file_end(void); +my_bool ma_control_file_inited(void); +C_MODE_END +#endif diff --git a/storage/maria/ma_create.c b/storage/maria/ma_create.c new file mode 100644 index 00000000000..9cf042ed21e --- /dev/null +++ b/storage/maria/ma_create.c @@ -0,0 +1,1419 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Create a MARIA table */ + +#include "ma_ftdefs.h" +#include "ma_sp_defs.h" +#include <my_bit.h> +#include "ma_blockrec.h" +#include "trnman_public.h" + +#if defined(MSDOS) || defined(__WIN__) +#ifdef __WIN__ +#include <fcntl.h> +#else +#include <process.h> /* Prototype for getpid */ +#endif +#endif +#include <m_ctype.h> + +static int compare_columns(MARIA_COLUMNDEF **a, MARIA_COLUMNDEF **b); + +/* + Old options is used when recreating database, from maria_chk +*/ + +int maria_create(const char *name, enum data_file_type datafile_type, + uint keys,MARIA_KEYDEF *keydefs, + uint columns, MARIA_COLUMNDEF *columndef, + uint uniques, MARIA_UNIQUEDEF *uniquedefs, + MARIA_CREATE_INFO *ci,uint flags) +{ + register uint i,j; + File dfile,file; + int errpos,save_errno, create_mode= O_RDWR | O_TRUNC, res; + myf create_flag; + uint length,max_key_length,packed,pack_bytes,pointer,real_length_diff, + key_length,info_length,key_segs,options,min_key_length, + base_pos,long_varchar_count,varchar_length, + unique_key_parts,fulltext_keys,offset, not_block_record_extra_length; + uint max_field_lengths, extra_header_size, column_nr; + ulong reclength, real_reclength,min_pack_length; + char filename[FN_REFLEN], linkname[FN_REFLEN], *linkname_ptr; + ulong pack_reclength; + ulonglong tot_length,max_rows, tmp; + enum en_fieldtype type; + enum data_file_type org_datafile_type= datafile_type; + MARIA_SHARE share; + MARIA_KEYDEF *keydef,tmp_keydef; + MARIA_UNIQUEDEF *uniquedef; + HA_KEYSEG *keyseg,tmp_keyseg; + MARIA_COLUMNDEF *column, *end_column; + double *rec_per_key_part; + ulong *nulls_per_key_part; + uint16 *column_array; + my_off_t key_root[HA_MAX_POSSIBLE_KEY], kfile_size_before_extension; + MARIA_CREATE_INFO tmp_create_info; + my_bool tmp_table= FALSE; /* cache for presence 
of HA_OPTION_TMP_TABLE */ + my_bool forced_packed; + myf sync_dir= 0; + uchar *log_data= NULL; + DBUG_ENTER("maria_create"); + DBUG_PRINT("enter", ("keys: %u columns: %u uniques: %u flags: %u", + keys, columns, uniques, flags)); + + DBUG_ASSERT(maria_inited); + LINT_INIT(dfile); + LINT_INIT(file); + + if (!ci) + { + bzero((char*) &tmp_create_info,sizeof(tmp_create_info)); + ci=&tmp_create_info; + } + + if (keys + uniques > MARIA_MAX_KEY) + { + DBUG_RETURN(my_errno=HA_WRONG_CREATE_OPTION); + } + errpos=0; + options=0; + bzero((uchar*) &share,sizeof(share)); + + if (flags & HA_DONT_TOUCH_DATA) + { + /* We come here from recreate table */ + org_datafile_type= ci->org_data_file_type; + if (!(ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD)) + options= (ci->old_options & + (HA_OPTION_COMPRESS_RECORD | HA_OPTION_PACK_RECORD | + HA_OPTION_READ_ONLY_DATA | HA_OPTION_CHECKSUM | + HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE | + HA_OPTION_LONG_BLOB_PTR | HA_OPTION_PAGE_CHECKSUM)); + else + { + /* Uncompressing rows */ + options= (ci->old_options & + (HA_OPTION_CHECKSUM | HA_OPTION_TMP_TABLE | + HA_OPTION_DELAY_KEY_WRITE | HA_OPTION_LONG_BLOB_PTR | + HA_OPTION_PAGE_CHECKSUM)); + } + } + else + { + /* Transactional tables must be of type BLOCK_RECORD */ + if (ci->transactional) + datafile_type= BLOCK_RECORD; + } + + if (ci->reloc_rows > ci->max_rows) + ci->reloc_rows=ci->max_rows; /* Check if wrong parameter */ + + if (!(rec_per_key_part= + (double*) my_malloc((keys + uniques)*HA_MAX_KEY_SEG*sizeof(double) + + (keys + uniques)*HA_MAX_KEY_SEG*sizeof(ulong) + + sizeof(uint16) * columns, + MYF(MY_WME | MY_ZEROFILL)))) + DBUG_RETURN(my_errno); + nulls_per_key_part= (ulong*) (rec_per_key_part + + (keys + uniques) * HA_MAX_KEY_SEG); + column_array= (uint16*) (nulls_per_key_part + + (keys + uniques) * HA_MAX_KEY_SEG); + + + /* Start by checking fields and field-types used */ + varchar_length=long_varchar_count=packed= not_block_record_extra_length= + pack_reclength= 
max_field_lengths= 0; + reclength= min_pack_length= ci->null_bytes; + forced_packed= 0; + column_nr= 0; + + for (column= columndef, end_column= column + columns ; + column != end_column ; + column++) + { + /* Fill in not used struct parts */ + column->column_nr= column_nr++; + column->offset= reclength; + column->empty_pos= 0; + column->empty_bit= 0; + column->fill_length= column->length; + if (column->null_bit) + options|= HA_OPTION_NULL_FIELDS; + + reclength+= column->length; + type= column->type; + if (datafile_type == BLOCK_RECORD) + { + if (type == FIELD_SKIP_PRESPACE) + type= column->type= FIELD_NORMAL; /* SKIP_PRESPACE not supported */ + if (type == FIELD_NORMAL && + column->length > FULL_PAGE_SIZE(maria_block_size)) + { + /* FIELD_NORMAL can't be split over many blocks, convert to a CHAR */ + type= column->type= FIELD_SKIP_ENDSPACE; + } + } + + if (type != FIELD_NORMAL && type != FIELD_CHECK) + { + column->empty_pos= packed/8; + column->empty_bit= (1 << (packed & 7)); + if (type == FIELD_BLOB) + { + forced_packed= 1; + packed++; + share.base.blobs++; + if (pack_reclength != INT_MAX32) + { + if (column->length == 4+portable_sizeof_char_ptr) + pack_reclength= INT_MAX32; + else + { + /* Add max possible blob length */ + pack_reclength+= (1 << ((column->length- + portable_sizeof_char_ptr)*8)); + } + } + max_field_lengths+= (column->length - portable_sizeof_char_ptr); + } + else if (type == FIELD_SKIP_PRESPACE || + type == FIELD_SKIP_ENDSPACE) + { + forced_packed= 1; + max_field_lengths+= column->length > 255 ? 
2 : 1; + not_block_record_extra_length++; + packed++; + } + else if (type == FIELD_VARCHAR) + { + varchar_length+= column->length-1; /* Used for min_pack_length */ + pack_reclength++; + not_block_record_extra_length++; + max_field_lengths++; + packed++; + column->fill_length= 1; + options|= HA_OPTION_NULL_FIELDS; /* Use ma_checksum() */ + + /* We must test for 257 as length includes pack-length */ + if (test(column->length >= 257)) + { + long_varchar_count++; + max_field_lengths++; + column->fill_length= 2; + } + } + else if (type == FIELD_SKIP_ZERO) + packed++; + else + { + if (!column->null_bit) + min_pack_length+= column->length; + else + { + /* Only BLOCK_RECORD skips NULL fields for all field values */ + not_block_record_extra_length+= column->length; + } + column->empty_pos= 0; + column->empty_bit= 0; + } + } + else /* FIELD_NORMAL */ + { + if (!column->null_bit) + { + min_pack_length+= column->length; + share.base.fixed_not_null_fields++; + share.base.fixed_not_null_fields_length+= column->length; + } + else + not_block_record_extra_length+= column->length; + } + } + + if (datafile_type == STATIC_RECORD && forced_packed) + { + /* Can't use fixed length records, revert to block records */ + datafile_type= BLOCK_RECORD; + } + + if (datafile_type == DYNAMIC_RECORD) + options|= HA_OPTION_PACK_RECORD; /* Must use packed records */ + + if (datafile_type == STATIC_RECORD) + { + /* We can't use checksum with static length rows */ + flags&= ~HA_CREATE_CHECKSUM; + options&= ~HA_OPTION_CHECKSUM; + min_pack_length= reclength; + packed= 0; + } + else if (datafile_type != BLOCK_RECORD) + min_pack_length+= not_block_record_extra_length; + else + min_pack_length+= 5; /* Min row overhead */ + + if (flags & HA_CREATE_TMP_TABLE) + { + options|= HA_OPTION_TMP_TABLE; + tmp_table= TRUE; + create_mode|= O_NOFOLLOW; + /* "CREATE TEMPORARY" tables are not crash-safe (dropped at restart) */ + ci->transactional= FALSE; + flags&= ~HA_CREATE_PAGE_CHECKSUM; + } + share.base.null_bytes= 
ci->null_bytes; + share.base.original_null_bytes= ci->null_bytes; + share.base.born_transactional= ci->transactional; + share.base.max_field_lengths= max_field_lengths; + share.base.field_offsets= 0; /* for future */ + + if (flags & HA_CREATE_CHECKSUM || (options & HA_OPTION_CHECKSUM)) + { + options|= HA_OPTION_CHECKSUM; + min_pack_length++; + pack_reclength++; + } + if (pack_reclength < INT_MAX32) + pack_reclength+= max_field_lengths + long_varchar_count; + else + pack_reclength= INT_MAX32; + + if (flags & HA_CREATE_DELAY_KEY_WRITE) + options|= HA_OPTION_DELAY_KEY_WRITE; + if (flags & HA_CREATE_RELIES_ON_SQL_LAYER) + options|= HA_OPTION_RELIES_ON_SQL_LAYER; + if (flags & HA_CREATE_PAGE_CHECKSUM) + options|= HA_OPTION_PAGE_CHECKSUM; + + pack_bytes= (packed + 7) / 8; + if (pack_reclength != INT_MAX32) + pack_reclength+= reclength+pack_bytes + + test(test_all_bits(options, HA_OPTION_CHECKSUM | HA_OPTION_PACK_RECORD)); + min_pack_length+= pack_bytes; + /* Calculate min possible row length for rows-in-block */ + extra_header_size= MAX_FIXED_HEADER_SIZE; + if (ci->transactional) + { + extra_header_size= TRANS_MAX_FIXED_HEADER_SIZE; + DBUG_PRINT("info",("creating a transactional table")); + } + share.base.min_block_length= (extra_header_size + share.base.null_bytes + + pack_bytes); + if (!ci->data_file_length && ci->max_rows) + { + if (pack_reclength == INT_MAX32 || + (~(ulonglong) 0)/ci->max_rows < (ulonglong) pack_reclength) + ci->data_file_length= ~(ulonglong) 0; + else + ci->data_file_length=(ulonglong) ci->max_rows*pack_reclength; + } + else if (!ci->max_rows) + { + if (datafile_type == BLOCK_RECORD) + { + uint rows_per_page= ((maria_block_size - PAGE_OVERHEAD_SIZE) / + (min_pack_length + extra_header_size + + DIR_ENTRY_SIZE)); + ulonglong data_file_length= ci->data_file_length; + if (!data_file_length) + data_file_length= ((((ulonglong) 1 << ((BLOCK_RECORD_POINTER_SIZE-1) * + 8)) -1) * maria_block_size); + if (rows_per_page > 0) + { + set_if_smaller(rows_per_page, 
MAX_ROWS_PER_PAGE); + ci->max_rows= data_file_length / maria_block_size * rows_per_page; + } + else + ci->max_rows= data_file_length / (min_pack_length + + extra_header_size + + DIR_ENTRY_SIZE); + } + else + ci->max_rows=(ha_rows) (ci->data_file_length/(min_pack_length + + ((options & + HA_OPTION_PACK_RECORD) ? + 3 : 0))); + } + max_rows= (ulonglong) ci->max_rows; + if (datafile_type == BLOCK_RECORD) + { + /* + The + 1 is for record position within page + The / 2 is because we need one bit for knowing if there is transid's + after the row pointer + */ + pointer= maria_get_pointer_length((ci->data_file_length / + (maria_block_size * 2)), 3) + 1; + set_if_smaller(pointer, BLOCK_RECORD_POINTER_SIZE); + + if (!max_rows) + max_rows= (((((ulonglong) 1 << ((pointer-1)*8)) -1) * maria_block_size) / + min_pack_length / 2); + } + else + { + if (datafile_type != STATIC_RECORD) + pointer= maria_get_pointer_length(ci->data_file_length, + maria_data_pointer_size); + else + pointer= maria_get_pointer_length(ci->max_rows, maria_data_pointer_size); + if (!max_rows) + max_rows= ((((ulonglong) 1 << (pointer*8)) -1) / min_pack_length); + } + + real_reclength=reclength; + if (datafile_type == STATIC_RECORD) + { + if (reclength <= pointer) + reclength=pointer+1; /* reserve place for delete link */ + } + else + reclength+= long_varchar_count; /* We need space for varchar! 
*/ + + max_key_length=0; tot_length=0 ; key_segs=0; + fulltext_keys=0; + share.state.rec_per_key_part= rec_per_key_part; + share.state.nulls_per_key_part= nulls_per_key_part; + share.state.key_root=key_root; + share.state.key_del= HA_OFFSET_ERROR; + if (uniques) + max_key_length= MARIA_UNIQUE_HASH_LENGTH + pointer; + + for (i=0, keydef=keydefs ; i < keys ; i++ , keydef++) + { + share.state.key_root[i]= HA_OFFSET_ERROR; + length= real_length_diff= 0; + min_key_length= key_length= pointer; + + if (keydef->key_alg == HA_KEY_ALG_RTREE) + keydef->flag|= HA_RTREE_INDEX; /* For easier tests */ + + if (keydef->flag & HA_SPATIAL) + { +#ifdef HAVE_SPATIAL + /* BAR TODO to support 3D and more dimensions in the future */ + uint sp_segs=SPDIMS*2; + keydef->flag=HA_SPATIAL; + + if (flags & HA_DONT_TOUCH_DATA) + { + /* + Called by maria_chk - i.e. table structure was taken from + MYI file and SPATIAL key *does have* additional sp_segs keysegs. + keydef->seg here points right at the GEOMETRY segment, + so we only need to decrease keydef->keysegs. 
+ (see maria_recreate_table() in _ma_check.c) + */ + keydef->keysegs-=sp_segs-1; + } + + for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ; + j++, keyseg++) + { + if (keyseg->type != HA_KEYTYPE_BINARY && + keyseg->type != HA_KEYTYPE_VARBINARY1 && + keyseg->type != HA_KEYTYPE_VARBINARY2) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + } + keydef->keysegs+=sp_segs; + key_length+=SPLEN*sp_segs; + length++; /* At least one length uchar */ + min_key_length++; +#else + my_errno= HA_ERR_UNSUPPORTED; + goto err_no_lock; +#endif /*HAVE_SPATIAL*/ + } + else if (keydef->flag & HA_FULLTEXT) + { + keydef->flag=HA_FULLTEXT | HA_PACK_KEY | HA_VAR_LENGTH_KEY; + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + + for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ; + j++, keyseg++) + { + if (keyseg->type != HA_KEYTYPE_TEXT && + keyseg->type != HA_KEYTYPE_VARTEXT1 && + keyseg->type != HA_KEYTYPE_VARTEXT2) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + if (!(keyseg->flag & HA_BLOB_PART) && + (keyseg->type == HA_KEYTYPE_VARTEXT1 || + keyseg->type == HA_KEYTYPE_VARTEXT2)) + { + /* Make a flag that this is a VARCHAR */ + keyseg->flag|= HA_VAR_LENGTH_PART; + /* Store in bit_start number of bytes used to pack the length */ + keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1)? 
+ 1 : 2); + } + } + + fulltext_keys++; + key_length+= HA_FT_MAXBYTELEN+HA_FT_WLEN; + length++; /* At least one length uchar */ + min_key_length+= 1 + HA_FT_WLEN; + real_length_diff=HA_FT_MAXBYTELEN-FT_MAX_WORD_LEN_FOR_SORT; + } + else + { + /* Test if prefix compression */ + if (keydef->flag & HA_PACK_KEY) + { + /* Can't use space_compression on number keys */ + if ((keydef->seg[0].flag & HA_SPACE_PACK) && + keydef->seg[0].type == (int) HA_KEYTYPE_NUM) + keydef->seg[0].flag&= ~HA_SPACE_PACK; + + /* Only use HA_PACK_KEY when first segment is a variable length key */ + if (!(keydef->seg[0].flag & (HA_SPACE_PACK | HA_BLOB_PART | + HA_VAR_LENGTH_PART))) + { + /* pack relative to previous key */ + keydef->flag&= ~HA_PACK_KEY; + keydef->flag|= HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY; + } + else + { + keydef->seg[0].flag|=HA_PACK_KEY; /* for easier internal test */ + keydef->flag|=HA_VAR_LENGTH_KEY; + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + } + } + if (keydef->flag & HA_BINARY_PACK_KEY) + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + + if (keydef->flag & HA_AUTO_KEY && ci->with_auto_increment) + share.base.auto_key=i+1; + for (j=0, keyseg=keydef->seg ; j < keydef->keysegs ; j++, keyseg++) + { + /* numbers are stored with high byte first to make compression easier */ + switch (keyseg->type) { + case HA_KEYTYPE_SHORT_INT: + case HA_KEYTYPE_LONG_INT: + case HA_KEYTYPE_FLOAT: + case HA_KEYTYPE_DOUBLE: + case HA_KEYTYPE_USHORT_INT: + case HA_KEYTYPE_ULONG_INT: + case HA_KEYTYPE_LONGLONG: + case HA_KEYTYPE_ULONGLONG: + case HA_KEYTYPE_INT24: + case HA_KEYTYPE_UINT24: + case HA_KEYTYPE_INT8: + keyseg->flag|= HA_SWAP_KEY; + break; + case HA_KEYTYPE_VARTEXT1: + case HA_KEYTYPE_VARTEXT2: + case HA_KEYTYPE_VARBINARY1: + case HA_KEYTYPE_VARBINARY2: + if (!(keyseg->flag & HA_BLOB_PART)) + { + /* Make a flag that this is a VARCHAR */ + keyseg->flag|= HA_VAR_LENGTH_PART; + /* Store in bit_start number of bytes used to pack the length */ + keyseg->bit_start= 
((keyseg->type == HA_KEYTYPE_VARTEXT1 || + keyseg->type == HA_KEYTYPE_VARBINARY1) ? + 1 : 2); + } + break; + default: + break; + } + if (keyseg->flag & HA_SPACE_PACK) + { + DBUG_ASSERT(!(keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))); + keydef->flag |= HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY; + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + length++; /* At least one length uchar */ + if (!keyseg->null_bit) + min_key_length++; + key_length+= keyseg->length; + if (keyseg->length >= 255) + { + /* prefix may be 3 bytes */ + length+= 2; + } + } + else if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART)) + { + DBUG_ASSERT(!test_all_bits(keyseg->flag, + (HA_VAR_LENGTH_PART | HA_BLOB_PART))); + keydef->flag|=HA_VAR_LENGTH_KEY; + length++; /* At least one length uchar */ + if (!keyseg->null_bit) + min_key_length++; + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + key_length+= keyseg->length; + if (keyseg->length >= 255) + { + /* prefix may be 3 bytes */ + length+= 2; + } + } + else + { + key_length+= keyseg->length; + if (!keyseg->null_bit) + min_key_length+= keyseg->length; + } + if (keyseg->null_bit) + { + key_length++; + /* min key part is 1 byte */ + min_key_length++; + options|=HA_OPTION_PACK_KEYS; + keyseg->flag|=HA_NULL_PART; + keydef->flag|=HA_VAR_LENGTH_KEY | HA_NULL_PART_KEY; + } + } + } /* if HA_FULLTEXT */ + key_segs+=keydef->keysegs; + if (keydef->keysegs > HA_MAX_KEY_SEG) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + /* + key_segs may be 0 in the case when we only want to be able to + add one row into the table. This can happen with some DISTINCT queries + in MySQL + */ + if ((keydef->flag & (HA_NOSAME | HA_NULL_PART_KEY)) == HA_NOSAME && + key_segs) + share.state.rec_per_key_part[key_segs-1]=1L; + length+=key_length; + /* + A key can't be longer than half an index block (as we have + to be able to put at least 2 keys on an index block for the key + algorithms to work). 
+ */ + if (length > maria_max_key_length()) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + keydef->block_length= (uint16) maria_block_size; + keydef->keylength= (uint16) key_length; + keydef->minlength= (uint16) min_key_length; + keydef->maxlength= (uint16) length; + + if (length > max_key_length) + max_key_length= length; + tot_length+= ((max_rows/(ulong) (((uint) maria_block_size - + MAX_KEYPAGE_HEADER_SIZE - + KEYPAGE_CHECKSUM_SIZE)/ + (length*2))) * + maria_block_size); + } + + unique_key_parts=0; + for (i=0, uniquedef=uniquedefs ; i < uniques ; i++ , uniquedef++) + { + uniquedef->key=keys+i; + unique_key_parts+=uniquedef->keysegs; + share.state.key_root[keys+i]= HA_OFFSET_ERROR; + tot_length+= (max_rows/(ulong) (((uint) maria_block_size - + MAX_KEYPAGE_HEADER_SIZE - + KEYPAGE_CHECKSUM_SIZE) / + ((MARIA_UNIQUE_HASH_LENGTH + pointer)*2)))* + (ulong) maria_block_size; + } + keys+=uniques; /* Each unique has 1 key */ + key_segs+=uniques; /* Each unique has 1 key seg */ + + base_pos=(MARIA_STATE_INFO_SIZE + keys * MARIA_STATE_KEY_SIZE + + key_segs * MARIA_STATE_KEYSEG_SIZE); + info_length= base_pos+(uint) (MARIA_BASE_INFO_SIZE+ + keys * MARIA_KEYDEF_SIZE+ + uniques * MARIA_UNIQUEDEF_SIZE + + (key_segs + unique_key_parts)*HA_KEYSEG_SIZE+ + columns*(MARIA_COLUMNDEF_SIZE + 2)); + + DBUG_PRINT("info", ("info_length: %u", info_length)); + /* There are only 16 bits for the total header length. */ + if (info_length > 65535) + { + my_printf_error(HA_WRONG_CREATE_OPTION, + "Aria table '%s' has too many columns and/or " + "indexes and/or unique constraints.", + MYF(0), name + dirname_length(name)); + my_errno= HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + + bmove(share.state.header.file_version, maria_file_magic, 4); + ci->old_options=options | (ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD ? 
+ HA_OPTION_COMPRESS_RECORD | + HA_OPTION_TEMP_COMPRESS_RECORD: 0); + mi_int2store(share.state.header.options,ci->old_options); + mi_int2store(share.state.header.header_length,info_length); + mi_int2store(share.state.header.state_info_length,MARIA_STATE_INFO_SIZE); + mi_int2store(share.state.header.base_info_length,MARIA_BASE_INFO_SIZE); + mi_int2store(share.state.header.base_pos,base_pos); + share.state.header.data_file_type= share.data_file_type= datafile_type; + share.state.header.org_data_file_type= org_datafile_type; + share.state.header.language= (ci->language ? + ci->language : default_charset_info->number); + + share.state.dellink = HA_OFFSET_ERROR; + share.state.first_bitmap_with_space= 0; +#ifdef EXTERNAL_LOCKING + share.state.process= (ulong) getpid(); +#endif + share.state.version= (ulong) time((time_t*) 0); + share.state.sortkey= (ushort) ~0; + share.state.auto_increment=ci->auto_increment; + share.options=options; + share.base.rec_reflength=pointer; + share.base.block_size= maria_block_size; + + /* + Get estimate for index file length (this may be wrong for FT keys) + This is used for pointers to other key pages. 
+ */ + tmp= (tot_length + maria_block_size * keys * + MARIA_INDEX_BLOCK_MARGIN) / maria_block_size; + + /* + use maximum of key_file_length we calculated and key_file_length value we + got from MAI file header (see also mariapack.c:save_state) + */ + share.base.key_reflength= + maria_get_pointer_length(max(ci->key_file_length,tmp),3); + share.base.keys= share.state.header.keys= keys; + share.state.header.uniques= uniques; + share.state.header.fulltext_keys= fulltext_keys; + mi_int2store(share.state.header.key_parts,key_segs); + mi_int2store(share.state.header.unique_key_parts,unique_key_parts); + + maria_set_all_keys_active(share.state.key_map, keys); + + share.base.keystart = share.state.state.key_file_length= + MY_ALIGN(info_length, maria_block_size); + share.base.max_key_block_length= maria_block_size; + share.base.max_key_length=ALIGN_SIZE(max_key_length+4); + share.base.records=ci->max_rows; + share.base.reloc= ci->reloc_rows; + share.base.reclength=real_reclength; + share.base.pack_reclength=reclength+ test(options & HA_OPTION_CHECKSUM); + share.base.max_pack_length=pack_reclength; + share.base.min_pack_length=min_pack_length; + share.base.pack_bytes= pack_bytes; + share.base.fields= columns; + share.base.pack_fields= packed; + + if (share.data_file_type == BLOCK_RECORD) + { + /* + we are going to create a first bitmap page, set data_file_length + to reflect this, before the state goes to disk + */ + share.state.state.data_file_length= maria_block_size; + /* Add length of packed fields + length */ + share.base.pack_reclength+= share.base.max_field_lengths+3; + + /* Adjust max_pack_length, to be used if we have short rows */ + if (share.base.max_pack_length < maria_block_size) + { + share.base.max_pack_length+= FLAG_SIZE; + if (ci->transactional) + share.base.max_pack_length+= TRANSID_SIZE * 2; + } + } + + /* max_data_file_length and max_key_file_length are recalculated on open */ + if (tmp_table) + share.base.max_data_file_length= (my_off_t) 
ci->data_file_length; + else if (ci->transactional && translog_status == TRANSLOG_OK && + !maria_in_recovery) + { + /* + we have checked translog_inited above, because maria_chk may call us + (via maria_recreate_table()) and it does not have a log. + */ + sync_dir= MY_SYNC_DIR; + /* + If crash between _ma_state_info_write_sub() and + _ma_update_state__lsns_sub(), table should be ignored by Recovery (or + old REDOs would fail), so we cannot let LSNs be 0: + */ + share.state.skip_redo_lsn= share.state.is_of_horizon= + share.state.create_rename_lsn= LSN_MAX; + } + + if (datafile_type == DYNAMIC_RECORD) + { + share.base.min_block_length= + (share.base.pack_reclength+3 < MARIA_EXTEND_BLOCK_LENGTH && + ! share.base.blobs) ? + max(share.base.pack_reclength,MARIA_MIN_BLOCK_LENGTH) : + MARIA_EXTEND_BLOCK_LENGTH; + } + else if (datafile_type == STATIC_RECORD) + share.base.min_block_length= share.base.pack_reclength; + + if (! (flags & HA_DONT_TOUCH_DATA)) + share.state.create_time= time((time_t*) 0); + + pthread_mutex_lock(&THR_LOCK_maria); + + /* + NOTE: For test_if_reopen() we need a real path name. Hence we need + MY_RETURN_REAL_PATH for every fn_format(filename, ...). + */ + if (ci->index_file_name) + { + char *iext= strrchr(ci->index_file_name, '.'); + int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT); + if (tmp_table) + { + char *path; + /* chop off the table name, tempory tables use generated name */ + if ((path= strrchr(ci->index_file_name, FN_LIBCHAR))) + *path= '\0'; + fn_format(filename, name, ci->index_file_name, MARIA_NAME_IEXT, + MY_REPLACE_DIR | MY_UNPACK_FILENAME | + MY_RETURN_REAL_PATH | MY_APPEND_EXT); + } + else + { + fn_format(filename, ci->index_file_name, "", MARIA_NAME_IEXT, + MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH | + (have_iext ? 
MY_REPLACE_EXT : MY_APPEND_EXT)); + } + fn_format(linkname, name, "", MARIA_NAME_IEXT, + MY_UNPACK_FILENAME|MY_APPEND_EXT); + linkname_ptr= linkname; + /* + Don't create the table if the link or file exists to ensure that one + doesn't accidently destroy another table. + Don't sync dir now if the data file has the same path. + */ + create_flag= + (ci->data_file_name && + !strcmp(ci->index_file_name, ci->data_file_name)) ? 0 : sync_dir; + } + else + { + char *iext= strrchr(name, '.'); + int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT); + fn_format(filename, name, "", MARIA_NAME_IEXT, + MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH | + (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT)); + linkname_ptr= NullS; + /* + Replace the current file. + Don't sync dir now if the data file has the same path. + */ + create_flag= (flags & HA_CREATE_KEEP_FILES) ? 0 : MY_DELETE_OLD; + create_flag|= (!ci->data_file_name ? 0 : sync_dir); + } + + /* + If a MRG_MARIA table is in use, the mapped MARIA tables are open, + but no entry is made in the table cache for them. + A TRUNCATE command checks for the table in the cache only and could + be fooled to believe, the table is not open. + Pull the emergency brake in this situation. (Bug #8306) + + + NOTE: The filename is compared against unique_file_name of every + open table. Hence we need a real path here. + */ + if (_ma_test_if_reopen(filename)) + { + my_printf_error(0, "Aria table '%s' is in use " + "(most likely by a MERGE table). 
Try FLUSH TABLES.", + MYF(0), name + dirname_length(name)); + my_errno= HA_ERR_TABLE_EXIST; + goto err; + } + + if ((file= my_create_with_symlink(linkname_ptr, filename, 0, create_mode, + MYF(MY_WME|create_flag))) < 0) + goto err; + errpos=1; + + DBUG_PRINT("info", ("write state info and base info")); + if (_ma_state_info_write_sub(file, &share.state, + MA_STATE_INFO_WRITE_FULL_INFO) || + _ma_base_info_write(file, &share.base)) + goto err; + DBUG_PRINT("info", ("base_pos: %d base_info_size: %d", + base_pos, MARIA_BASE_INFO_SIZE)); + DBUG_ASSERT(my_tell(file,MYF(0)) == base_pos+ MARIA_BASE_INFO_SIZE); + + /* Write key and keyseg definitions */ + DBUG_PRINT("info", ("write key and keyseg definitions")); + for (i=0 ; i < share.base.keys - uniques; i++) + { + uint sp_segs=(keydefs[i].flag & HA_SPATIAL) ? 2*SPDIMS : 0; + + if (_ma_keydef_write(file, &keydefs[i])) + goto err; + for (j=0 ; j < keydefs[i].keysegs-sp_segs ; j++) + if (_ma_keyseg_write(file, &keydefs[i].seg[j])) + goto err; +#ifdef HAVE_SPATIAL + for (j=0 ; j < sp_segs ; j++) + { + HA_KEYSEG sseg; + sseg.type=SPTYPE; + sseg.language= 7; /* Binary */ + sseg.null_bit=0; + sseg.bit_start=0; + sseg.bit_end=0; + sseg.bit_length= 0; + sseg.bit_pos= 0; + sseg.length=SPLEN; + sseg.null_pos=0; + sseg.start=j*SPLEN; + sseg.flag= HA_SWAP_KEY; + if (_ma_keyseg_write(file, &sseg)) + goto err; + } +#endif + } + /* Create extra keys for unique definitions */ + offset= real_reclength - uniques*MARIA_UNIQUE_HASH_LENGTH; + bzero((char*) &tmp_keydef,sizeof(tmp_keydef)); + bzero((char*) &tmp_keyseg,sizeof(tmp_keyseg)); + for (i=0; i < uniques ; i++) + { + tmp_keydef.keysegs=1; + tmp_keydef.flag= HA_UNIQUE_CHECK; + tmp_keydef.block_length= (uint16) maria_block_size; + tmp_keydef.keylength= MARIA_UNIQUE_HASH_LENGTH + pointer; + tmp_keydef.minlength=tmp_keydef.maxlength=tmp_keydef.keylength; + tmp_keyseg.type= MARIA_UNIQUE_HASH_TYPE; + tmp_keyseg.length= MARIA_UNIQUE_HASH_LENGTH; + tmp_keyseg.start= offset; + offset+= 
MARIA_UNIQUE_HASH_LENGTH; + if (_ma_keydef_write(file,&tmp_keydef) || + _ma_keyseg_write(file,(&tmp_keyseg))) + goto err; + } + + /* Save unique definition */ + DBUG_PRINT("info", ("write unique definitions")); + for (i=0 ; i < share.state.header.uniques ; i++) + { + HA_KEYSEG *keyseg_end; + keyseg= uniquedefs[i].seg; + if (_ma_uniquedef_write(file, &uniquedefs[i])) + goto err; + for (keyseg= uniquedefs[i].seg, keyseg_end= keyseg+ uniquedefs[i].keysegs; + keyseg < keyseg_end; + keyseg++) + { + switch (keyseg->type) { + case HA_KEYTYPE_VARTEXT1: + case HA_KEYTYPE_VARTEXT2: + case HA_KEYTYPE_VARBINARY1: + case HA_KEYTYPE_VARBINARY2: + if (!(keyseg->flag & HA_BLOB_PART)) + { + keyseg->flag|= HA_VAR_LENGTH_PART; + keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1 || + keyseg->type == HA_KEYTYPE_VARBINARY1) ? + 1 : 2); + } + break; + default: + DBUG_ASSERT((keyseg->flag & HA_VAR_LENGTH_PART) == 0); + break; + } + if (_ma_keyseg_write(file, keyseg)) + goto err; + } + } + DBUG_PRINT("info", ("write field definitions")); + if (datafile_type == BLOCK_RECORD) + { + /* Store columns in a more efficent order */ + MARIA_COLUMNDEF **col_order, **pos; + if (!(col_order= (MARIA_COLUMNDEF**) my_malloc(share.base.fields * + sizeof(MARIA_COLUMNDEF*), + MYF(MY_WME)))) + goto err; + for (column= columndef, pos= col_order ; + column != end_column ; + column++, pos++) + *pos= column; + qsort(col_order, share.base.fields, sizeof(*col_order), + (qsort_cmp) compare_columns); + for (i=0 ; i < share.base.fields ; i++) + { + column_array[col_order[i]->column_nr]= i; + if (_ma_columndef_write(file, col_order[i])) + { + my_free(col_order, MYF(0)); + goto err; + } + } + my_free(col_order, MYF(0)); + } + else + { + for (i=0 ; i < share.base.fields ; i++) + { + column_array[i]= (uint16) i; + if (_ma_columndef_write(file, &columndef[i])) + goto err; + } + } + if (_ma_column_nr_write(file, column_array, columns)) + goto err; + + if ((kfile_size_before_extension= my_tell(file,MYF(0))) == 
MY_FILEPOS_ERROR) + goto err; +#ifndef DBUG_OFF + if (kfile_size_before_extension != info_length) + DBUG_PRINT("warning",("info_length: %u != used_length: %u", + info_length, (uint)kfile_size_before_extension)); +#endif + + if (sync_dir) + { + /* + we log the first bytes and then the size to which we extend; this is + not log 1 KB of mostly zeroes if this is a small table. + */ + char empty_string[]= ""; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + translog_size_t total_rec_length= 0; + uint k; + LSN lsn; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= 1 + 2 + 2 + + (uint) kfile_size_before_extension; + /* we are needing maybe 64 kB, so don't use the stack */ + log_data= my_malloc(log_array[TRANSLOG_INTERNAL_PARTS + 1].length, MYF(0)); + if ((log_data == NULL) || + my_pread(file, 1 + 2 + 2 + log_data, + (size_t) kfile_size_before_extension, 0, MYF(MY_NABP))) + goto err; + /* + remember if the data file was created or not, to know if Recovery can + do it or not, in the future + */ + log_data[0]= test(flags & HA_DONT_TOUCH_DATA); + int2store(log_data + 1, kfile_size_before_extension); + int2store(log_data + 1 + 2, share.base.keystart); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar *)name; + /* we store the end-zero, for Recovery to just pass it to my_create() */ + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= strlen(name) + 1; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= log_data; + /* symlink description is also needed for re-creation by Recovery: */ + { + const char *s= ci->data_file_name ? ci->data_file_name : empty_string; + log_array[TRANSLOG_INTERNAL_PARTS + 2].str= (uchar*)s; + log_array[TRANSLOG_INTERNAL_PARTS + 2].length= strlen(s) + 1; + s= ci->index_file_name ? 
ci->index_file_name : empty_string; + log_array[TRANSLOG_INTERNAL_PARTS + 3].str= (uchar*)s; + log_array[TRANSLOG_INTERNAL_PARTS + 3].length= strlen(s) + 1; + } + for (k= TRANSLOG_INTERNAL_PARTS; + k < (sizeof(log_array)/sizeof(log_array[0])); k++) + total_rec_length+= (translog_size_t) log_array[k].length; + /** + For this record to be of any use for Recovery, we need the upper + MySQL layer to be crash-safe, which it is not now (that would require + work using the ddl_log of sql/sql_table.cc); when it is, we should + reconsider the moment of writing this log record (before or after op, + under THR_LOCK_maria or not...), how to use it in Recovery. + For now this record can serve when we apply logs to a backup, + so we sync it. This happens before the data file is created. If the + data file was created before, and we crashed before writing the log + record, at restart the table may be used, so we would not have a + trustable history in the log (impossible to apply this log to a + backup). The way we do it, if we crash before writing the log record + then there is no data file and the table cannot be used. + @todo Note that in case of TRUNCATE TABLE we also come here; for + Recovery to be able to finish TRUNCATE TABLE, instead of leaving a + half-truncated table, we should log the record at start of + maria_create(); for that we shouldn't write to the index file but to a + buffer (DYNAMIC_STRING), put the buffer into the record, then put the + buffer into the index file (so, change _ma_keydef_write() etc). That + would also enable Recovery to finish a CREATE TABLE. The final result + would be that we would be able to finish what the SQL layer has asked + for: it would be atomic. + When in CREATE/TRUNCATE (or DROP or RENAME or REPAIR) we have not + called external_lock(), so have no TRN. It does not matter, as all + these operations are non-transactional and sync their files. 
+ */ + if (unlikely(translog_write_record(&lsn, + LOGREC_REDO_CREATE_TABLE, + &dummy_transaction_object, NULL, + total_rec_length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL) || + translog_flush(lsn))) + goto err; + share.kfile.file= file; + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + DBUG_EXECUTE_IF("maria_crash_create_table", + { + DBUG_PRINT("maria_crash_create_table", ("now")); + DBUG_ABORT(); + }); + /* + store LSN into file, needed for Recovery to not be confused if a + DROP+CREATE happened (applying REDOs to the wrong table). + */ + if (_ma_update_state_lsns_sub(&share, lsn, trnman_get_min_safe_trid(), + FALSE, TRUE)) + goto err; + my_free(log_data, MYF(0)); + } + + if (!(flags & HA_DONT_TOUCH_DATA)) + { + if (ci->data_file_name) + { + char *dext= strrchr(ci->data_file_name, '.'); + int have_dext= dext && !strcmp(dext, MARIA_NAME_DEXT); + + if (tmp_table) + { + char *path; + /* chop off the table name, tempory tables use generated name */ + if ((path= strrchr(ci->data_file_name, FN_LIBCHAR))) + *path= '\0'; + fn_format(filename, name, ci->data_file_name, MARIA_NAME_DEXT, + MY_REPLACE_DIR | MY_UNPACK_FILENAME | MY_APPEND_EXT); + } + else + { + fn_format(filename, ci->data_file_name, "", MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | + (have_dext ? MY_REPLACE_EXT : MY_APPEND_EXT)); + } + fn_format(linkname, name, "",MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT); + linkname_ptr= linkname; + create_flag=0; + } + else + { + fn_format(filename,name,"", MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT); + linkname_ptr= NullS; + create_flag= (flags & HA_CREATE_KEEP_FILES) ? 
0 : MY_DELETE_OLD; + } + if ((dfile= + my_create_with_symlink(linkname_ptr, filename, 0, create_mode, + MYF(MY_WME | create_flag | sync_dir))) < 0) + goto err; + errpos=3; + + if (_ma_initialize_data_file(&share, dfile)) + goto err; + } + + /* Enlarge files */ + DBUG_PRINT("info", ("enlarge to keystart: %lu", + (ulong) share.base.keystart)); + if (my_chsize(file,(ulong) share.base.keystart,0,MYF(0))) + goto err; + + if (sync_dir && my_sync(file, MYF(0))) + goto err; + + if (! (flags & HA_DONT_TOUCH_DATA)) + { +#ifdef USE_RELOC + if (my_chsize(dfile,share.base.min_pack_length*ci->reloc_rows,0,MYF(0))) + goto err; +#endif + if (sync_dir && my_sync(dfile, MYF(0))) + goto err; + if (my_close(dfile,MYF(0))) + goto err; + } + pthread_mutex_unlock(&THR_LOCK_maria); + res= 0; + my_free((char*) rec_per_key_part,MYF(0)); + errpos=0; + if (my_close(file,MYF(0))) + res= my_errno; + DBUG_RETURN(res); + +err: + pthread_mutex_unlock(&THR_LOCK_maria); + +err_no_lock: + save_errno=my_errno; + switch (errpos) { + case 3: + VOID(my_close(dfile,MYF(0))); + /* fall through */ + case 2: + if (! (flags & HA_DONT_TOUCH_DATA)) + my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT), + sync_dir); + /* fall through */ + case 1: + VOID(my_close(file,MYF(0))); + if (! 
(flags & HA_DONT_TOUCH_DATA)) + my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_IEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT), + sync_dir); + } + my_free(log_data, MYF(MY_ALLOW_ZERO_PTR)); + my_free((char*) rec_per_key_part, MYF(0)); + DBUG_RETURN(my_errno=save_errno); /* return the fatal errno */ +} + + +uint maria_get_pointer_length(ulonglong file_length, uint def) +{ + DBUG_ASSERT(def >= 2 && def <= 7); + if (file_length) /* If not default */ + { +#ifdef NOT_YET_READY_FOR_8_BYTE_POINTERS + if (file_length >= (ULL(1) << 56)) + def=8; + else +#endif + if (file_length >= (ULL(1) << 48)) + def=7; + else if (file_length >= (ULL(1) << 40)) + def=6; + else if (file_length >= (ULL(1) << 32)) + def=5; + else if (file_length >= (ULL(1) << 24)) + def=4; + else if (file_length >= (ULL(1) << 16)) + def=3; + else + def=2; + } + return def; +} + + +/* + Sort columns for records-in-block + + IMPLEMENTATION + Sort columns in following order: + + Fixed size, not null columns + Fixed length, null fields + Numbers (zero fill fields) + Variable length fields (CHAR, VARCHAR) according to length + Blobs + + For same kind of fields, keep fields in original order +*/ + +static inline int sign(long a) +{ + return a < 0 ? -1 : (a > 0 ? 1 : 0); +} + + +static int compare_columns(MARIA_COLUMNDEF **a_ptr, MARIA_COLUMNDEF **b_ptr) +{ + MARIA_COLUMNDEF *a= *a_ptr, *b= *b_ptr; + enum en_fieldtype a_type, b_type; + + a_type= (a->type == FIELD_CHECK) ? FIELD_NORMAL : a->type; + b_type= (b->type == FIELD_CHECK) ? 
FIELD_NORMAL : b->type; + + if (a_type == FIELD_NORMAL && !a->null_bit) + { + if (b_type != FIELD_NORMAL || b->null_bit) + return -1; + return sign((long) a->offset - (long) b->offset); + } + if (b_type == FIELD_NORMAL && !b->null_bit) + return 1; + if (a_type == b_type) + return sign((long) a->offset - (long) b->offset); + if (a_type == FIELD_NORMAL) + return -1; + if (b_type == FIELD_NORMAL) + return 1; + if (a_type == FIELD_SKIP_ZERO) + return -1; + if (b_type == FIELD_SKIP_ZERO) + return 1; + if (a->type != FIELD_BLOB && b->type != FIELD_BLOB) + if (a->length != b->length) + return sign((long) a->length - (long) b->length); + if (a_type == FIELD_BLOB) + return 1; + if (b_type == FIELD_BLOB) + return -1; + return sign((long) a->offset - (long) b->offset); +} + + +/** + @brief Initialize data file + + @note + In BLOCK_RECORD, a freshly created datafile is one page long; while in + other formats it is 0-byte long. + */ + +int _ma_initialize_data_file(MARIA_SHARE *share, File dfile) +{ + if (share->data_file_type == BLOCK_RECORD) + { + share->bitmap.block_size= share->base.block_size; + share->bitmap.file.file = dfile; + return _ma_bitmap_create_first(share); + } + return 0; +} + + +/** + @brief Writes create_rename_lsn, skip_redo_lsn and is_of_horizon to disk, + can force. + + This is for special cases where: + - we don't want to write the full state to disk (so, not call + _ma_state_info_write()) because some parts of the state may be + currently inconsistent, or because it would be overkill + - we must sync these LSNs immediately for correctness. + It acquires intern_lock to protect the LSNs and state write. 
+ + @param share table's share + @param lsn LSN to write to log files + @param create_trid Trid to be used as state.create_trid + @param do_sync if the write should be forced to disk + @param update_create_rename_lsn if this LSN should be updated or not + + @return Operation status + @retval 0 ok + @retval 1 error (disk problem) +*/ + +int _ma_update_state_lsns(MARIA_SHARE *share, LSN lsn, TrID create_trid, + my_bool do_sync, my_bool update_create_rename_lsn) +{ + int res; + pthread_mutex_lock(&share->intern_lock); + res= _ma_update_state_lsns_sub(share, lsn, create_trid, do_sync, + update_create_rename_lsn); + pthread_mutex_unlock(&share->intern_lock); + return res; +} + + +/** + @brief Writes create_rename_lsn, skip_redo_lsn and is_of_horizon to disk, + can force. + + Shortcut of _ma_update_state_lsns() when we know that intern_lock is not + needed (when creating a table or opening it for the first time). + + @param share table's share + @param lsn LSN to write to state; if LSN_IMPOSSIBLE, write + a LOGREC_IMPORTED_TABLE and use its LSN as lsn. + @param create_trid Trid to be used as state.create_trid + @param do_sync if the write should be forced to disk + @param update_create_rename_lsn if this LSN should be updated or not + + @return Operation status + @retval 0 ok + @retval 1 error (disk problem) +*/ + +#if (_MSC_VER == 1310) +/* + Visual Studio 2003 compiler produces internal compiler error + in this function. Disable optimizations to workaround. 
+*/ +#pragma optimize("",off) +#endif +int _ma_update_state_lsns_sub(MARIA_SHARE *share, LSN lsn, TrID create_trid, + my_bool do_sync, + my_bool update_create_rename_lsn) +{ + uchar buf[LSN_STORE_SIZE * 3], *ptr; + uchar trid_buff[8]; + File file= share->kfile.file; + DBUG_ASSERT(file >= 0); + + if (lsn == LSN_IMPOSSIBLE) + { + int res; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + /* table name is logged only for information */ + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= + (uchar *)(share->open_file_name.str); + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= + share->open_file_name.length + 1; + if ((res= translog_write_record(&lsn, LOGREC_IMPORTED_TABLE, + &dummy_transaction_object, NULL, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL))) + return res; + } + + for (ptr= buf; ptr < (buf + sizeof(buf)); ptr+= LSN_STORE_SIZE) + lsn_store(ptr, lsn); + share->state.skip_redo_lsn= share->state.is_of_horizon= lsn; + share->state.create_trid= create_trid; + mi_int8store(trid_buff, create_trid); + if (update_create_rename_lsn) + { + share->state.create_rename_lsn= lsn; + if (share->id != 0) + { + /* + If OP is the operation which is calling us, if table is later written, + we could see in the log: + FILE_ID ... REDO_OP ... REDO_INSERT. + (that can happen in real life at least with OP=REPAIR). + As FILE_ID will be ignored by Recovery because it is < + create_rename_lsn, REDO_INSERT would be ignored too, wrongly. 
+ To avoid that, we force a LOGREC_FILE_ID to be logged at next write: + */ + translog_deassign_id_from_share(share); + } + } + else + lsn_store(buf, share->state.create_rename_lsn); + return (my_pwrite(file, buf, sizeof(buf), + sizeof(share->state.header) + + MARIA_FILE_CREATE_RENAME_LSN_OFFSET, MYF(MY_NABP)) || + my_pwrite(file, trid_buff, sizeof(trid_buff), + sizeof(share->state.header) + + MARIA_FILE_CREATE_TRID_OFFSET, MYF(MY_NABP)) || + (do_sync && my_sync(file, MYF(0)))); +} +#if (_MSC_VER == 1310) +#pragma optimize("",on) +#endif /*VS2003 compiler bug workaround*/ diff --git a/storage/maria/ma_dbug.c b/storage/maria/ma_dbug.c new file mode 100644 index 00000000000..af90a108e2a --- /dev/null +++ b/storage/maria/ma_dbug.c @@ -0,0 +1,201 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Support rutiner with are using with dbug */ + +#include "maria_def.h" + +void _ma_print_key(FILE *stream, MARIA_KEY *key) +{ + _ma_print_keydata(stream, key->keyinfo->seg, key->data, key->data_length); +} + + +/* Print a key in a user understandable format */ + +void _ma_print_keydata(FILE *stream, register HA_KEYSEG *keyseg, + const uchar *key, uint length) +{ + int flag; + short int s_1; + long int l_1; + float f_1; + double d_1; + const uchar *end; + const uchar *key_end= key + length; + + VOID(fputs("Key: \"",stream)); + flag=0; + for (; keyseg->type && key < key_end ;keyseg++) + { + if (flag++) + VOID(putc('-',stream)); + end= key+ keyseg->length; + if (keyseg->flag & HA_NULL_PART) + { + /* A NULL value is encoded by a 1-byte flag. Zero means NULL. */ + if (! *(key++)) + { + fprintf(stream,"NULL"); + continue; + } + end++; + } + + switch (keyseg->type) { + case HA_KEYTYPE_BINARY: + if (!(keyseg->flag & HA_SPACE_PACK) && keyseg->length == 1) + { /* packed binary digit */ + VOID(fprintf(stream,"%d",(uint) *key++)); + break; + } + /* fall through */ + case HA_KEYTYPE_TEXT: + case HA_KEYTYPE_NUM: + if (keyseg->flag & HA_SPACE_PACK) + { + VOID(fprintf(stream,"%.*s",(int) *key,key+1)); + key+= (int) *key+1; + } + else + { + VOID(fprintf(stream,"%.*s",(int) keyseg->length,key)); + key=end; + } + break; + case HA_KEYTYPE_INT8: + VOID(fprintf(stream,"%d",(int) *((const signed char*) key))); + key=end; + break; + case HA_KEYTYPE_SHORT_INT: + s_1= mi_sint2korr(key); + VOID(fprintf(stream,"%d",(int) s_1)); + key=end; + break; + case HA_KEYTYPE_USHORT_INT: + { + ushort u_1; + u_1= mi_uint2korr(key); + VOID(fprintf(stream,"%u",(uint) u_1)); + key=end; + break; + } + case HA_KEYTYPE_LONG_INT: + l_1=mi_sint4korr(key); + VOID(fprintf(stream,"%ld",l_1)); + key=end; + 
break; + case HA_KEYTYPE_ULONG_INT: + l_1=mi_uint4korr(key); + VOID(fprintf(stream,"%lu",(ulong) l_1)); + key=end; + break; + case HA_KEYTYPE_INT24: + VOID(fprintf(stream,"%ld",(long) mi_sint3korr(key))); + key=end; + break; + case HA_KEYTYPE_UINT24: + VOID(fprintf(stream,"%lu",(ulong) mi_uint3korr(key))); + key=end; + break; + case HA_KEYTYPE_FLOAT: + mi_float4get(f_1,key); + VOID(fprintf(stream,"%g",(double) f_1)); + key=end; + break; + case HA_KEYTYPE_DOUBLE: + mi_float8get(d_1,key); + VOID(fprintf(stream,"%g",d_1)); + key=end; + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + { + char buff[21]; + longlong10_to_str(mi_sint8korr(key),buff,-10); + VOID(fprintf(stream,"%s",buff)); + key=end; + break; + } + case HA_KEYTYPE_ULONGLONG: + { + char buff[21]; + longlong10_to_str(mi_sint8korr(key),buff,10); + VOID(fprintf(stream,"%s",buff)); + key=end; + break; + } +#endif + case HA_KEYTYPE_BIT: + { + uint i; + fputs("0x",stream); + for (i=0 ; i < keyseg->length ; i++) + fprintf(stream, "%02x", (uint) *key++); + key= end; + break; + } + case HA_KEYTYPE_VARTEXT1: /* VARCHAR and TEXT */ + case HA_KEYTYPE_VARTEXT2: /* VARCHAR and TEXT */ + case HA_KEYTYPE_VARBINARY1: /* VARBINARY and BLOB */ + case HA_KEYTYPE_VARBINARY2: /* VARBINARY and BLOB */ + { + uint tmp_length; + get_key_length(tmp_length,key); + /* + The following command sometimes gives a warning from valgrind. 
+ Not yet sure if the bug is in valgrind, glibc or mysqld + */ + VOID(fprintf(stream,"%.*s",(int) tmp_length,key)); + key+=tmp_length; + break; + } + default: break; /* This never happens */ + } + } + VOID(fputs("\"\n",stream)); + return; +} /* print_key */ + + +#ifdef EXTRA_DEBUG + +my_bool _ma_check_table_is_closed(const char *name, const char *where) +{ + char filename[FN_REFLEN]; + LIST *pos; + DBUG_ENTER("_ma_check_table_is_closed"); + + (void) fn_format(filename,name,"",MARIA_NAME_IEXT,4+16+32); + pthread_mutex_lock(&THR_LOCK_maria); + for (pos=maria_open_list ; pos ; pos=pos->next) + { + MARIA_HA *info=(MARIA_HA*) pos->data; + MARIA_SHARE *share= info->s; + if (!strcmp(share->unique_file_name.str, filename)) + { + if (share->last_version) + { + fprintf(stderr,"Warning: Table: %s is open on %s\n", name,where); + DBUG_PRINT("warning",("Table: %s is open on %s", name,where)); + pthread_mutex_unlock(&THR_LOCK_maria); + DBUG_RETURN(1); + } + } + } + pthread_mutex_unlock(&THR_LOCK_maria); + DBUG_RETURN(0); +} +#endif /* EXTRA_DEBUG */ diff --git a/storage/maria/ma_delete.c b/storage/maria/ma_delete.c new file mode 100644 index 00000000000..5c04f358b14 --- /dev/null +++ b/storage/maria/ma_delete.c @@ -0,0 +1,1650 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (C) 2009-2010 Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "ma_fulltext.h" +#include "ma_rt_index.h" +#include "trnman.h" +#include "ma_key_recover.h" + +static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag, + MARIA_PAGE *page); +static int del(MARIA_HA *info, MARIA_KEY *key, + MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page, + uchar *keypos, my_off_t next_block, uchar *ret_key_buff); +static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page, + uchar *keypos); +static uint remove_key(MARIA_KEYDEF *keyinfo, uint page_flag, uint nod_flag, + uchar *keypos, uchar *lastkey, uchar *page_end, + my_off_t *next_block, MARIA_KEY_PARAM *s_temp); + +/* @breif Remove a row from a MARIA table */ + +int maria_delete(MARIA_HA *info,const uchar *record) +{ + uint i; + uchar *old_key; + int save_errno; + char lastpos[8]; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + DBUG_ENTER("maria_delete"); + + /* Test if record is in datafile */ + DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage", + maria_print_error(share, HA_ERR_CRASHED); + DBUG_RETURN(my_errno= HA_ERR_CRASHED);); + DBUG_EXECUTE_IF("my_error_test_undefined_error", + maria_print_error(share, INT_MAX); + DBUG_RETURN(my_errno= INT_MAX);); + if (!(info->update & HA_STATE_AKTIV)) + { + DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); /* No database read */ + } + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + DBUG_RETURN(my_errno=EACCES); + } + if (_ma_readinfo(info,F_WRLCK,1)) + DBUG_RETURN(my_errno); + if ((*share->compare_record)(info,record)) + goto err; /* Error on read-check */ + + if (_ma_mark_file_changed(info)) + goto err; + + /* Ensure we don't change the autoincrement value */ + info->last_auto_increment= ~(ulonglong) 0; + /* Remove all keys from the index file */ + + old_key= 
info->lastkey_buff2; + + for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++) + { + if (maria_is_key_active(share->state.key_map, i)) + { + keyinfo->version++; + if (keyinfo->flag & HA_FULLTEXT) + { + if (_ma_ft_del(info, i, old_key, record, info->cur_row.lastpos)) + goto err; + } + else + { + MARIA_KEY key; + if (keyinfo->ck_delete(info, + (*keyinfo->make_key)(info, &key, i, old_key, + record, + info->cur_row.lastpos, + info->cur_row.trid))) + goto err; + } + /* The above changed info->lastkey2. Inform maria_rnext_same(). */ + info->update&= ~HA_STATE_RNEXT_SAME; + } + } + + if (share->calc_checksum) + { + /* + We can't use the row based checksum as this doesn't have enough + precision. + */ + info->cur_row.checksum= (*share->calc_checksum)(info, record); + } + + if ((*share->delete_record)(info, record)) + goto err; /* Remove record from database */ + + info->state->checksum-= info->cur_row.checksum; + info->state->records--; + info->update= HA_STATE_CHANGED+HA_STATE_DELETED+HA_STATE_ROW_CHANGED; + share->state.changed|= (STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_MOVABLE | + STATE_NOT_ZEROFILLED); + info->state->changed=1; + + mi_sizestore(lastpos, info->cur_row.lastpos); + VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); + allow_break(); /* Allow SIGHUP & SIGINT */ + if (info->invalidator != 0) + { + DBUG_PRINT("info", ("invalidator... 
'%s' (delete)", + share->open_file_name.str)); + (*info->invalidator)(share->open_file_name.str); + info->invalidator=0; + } + DBUG_RETURN(0); + +err: + save_errno= my_errno; + DBUG_ASSERT(save_errno); + if (!save_errno) + save_errno= HA_ERR_INTERNAL_ERROR; /* Should never happen */ + + mi_sizestore(lastpos, info->cur_row.lastpos); + if (save_errno != HA_ERR_RECORD_CHANGED) + { + maria_print_error(share, HA_ERR_CRASHED); + maria_mark_crashed(info); /* mark table crashed */ + } + VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); + info->update|=HA_STATE_WRITTEN; /* Buffer changed */ + allow_break(); /* Allow SIGHUP & SIGINT */ + if (save_errno == HA_ERR_KEY_NOT_FOUND) + { + maria_print_error(share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + } + DBUG_RETURN(my_errno= save_errno); +} /* maria_delete */ + + +/* + Remove a key from the btree index + + TODO: + Change ma_ck_real_delete() to use another buffer for changed keys instead + of key->data. This would allows us to remove the copying of the key here. 
+*/ + +my_bool _ma_ck_delete(MARIA_HA *info, MARIA_KEY *key) +{ + MARIA_SHARE *share= info->s; + int res; + LSN lsn= LSN_IMPOSSIBLE; + my_off_t new_root= share->state.key_root[key->keyinfo->key_nr]; + uchar key_buff[MARIA_MAX_KEY_BUFF], *save_key_data; + MARIA_KEY org_key; + DBUG_ENTER("_ma_ck_delete"); + + LINT_INIT_STRUCT(org_key); + + save_key_data= key->data; + if (share->now_transactional) + { + /* Save original value as the key may change */ + memcpy(key_buff, key->data, key->data_length + key->ref_length); + org_key= *key; + key->data= key_buff; + } + + if ((res= _ma_ck_real_delete(info, key, &new_root))) + { + /* We have to mark the table crashed before unpin_all_pages() */ + maria_mark_crashed(info); + } + + key->data= save_key_data; + if (!res && share->now_transactional) + res= _ma_write_undo_key_delete(info, &org_key, new_root, &lsn); + else + { + share->state.key_root[key->keyinfo->key_nr]= new_root; + _ma_fast_unlock_key_del(info); + } + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res != 0); +} /* _ma_ck_delete */ + + +my_bool _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEY *key, + my_off_t *root) +{ + int error; + my_bool result= 0; + my_off_t old_root; + uchar *root_buff; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + DBUG_ENTER("_ma_ck_real_delete"); + + if ((old_root=*root) == HA_OFFSET_ERROR) + { + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(1); + } + if (!(root_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ + MARIA_MAX_KEY_BUFF*2))) + { + DBUG_PRINT("error",("Couldn't allocate memory")); + my_errno=ENOMEM; + DBUG_RETURN(1); + } + DBUG_PRINT("info",("root_page: %lu", + (ulong) (old_root / keyinfo->block_length))); + if (_ma_fetch_keypage(&page, info, keyinfo, old_root, + PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, root_buff, 0)) + { + result= 1; + goto err; + } + if ((error= d_search(info, key, (keyinfo->flag & HA_FULLTEXT ? 
+ SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT: + SEARCH_SAME), + &page))) + { + if (error < 0) + result= 1; + else if (error == 2) + { + DBUG_PRINT("test",("Enlarging of root when deleting")); + if (_ma_enlarge_root(info, key, root)) + result= 1; + } + else /* error == 1 */ + { + MARIA_SHARE *share= info->s; + + page_mark_changed(info, &page); + + if (page.size <= page.node + share->keypage_header + 1) + { + if (page.node) + *root= _ma_kpos(page.node, root_buff +share->keypage_header + + page.node); + else + *root=HA_OFFSET_ERROR; + if (_ma_dispose(info, old_root, 0)) + result= 1; + } + else if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + result= 1; + } + } +err: + my_afree(root_buff); + DBUG_PRINT("exit",("Return: %d",result)); + DBUG_RETURN(result); +} /* _ma_ck_real_delete */ + + +/** + @brief Remove key below key root + + @param key Key to delete. Will contain new key if block was enlarged + + @return + @retval 0 ok (anc_page is not changed) + @retval 1 If data on page is too small; In this case anc_buff is not saved + @retval 2 If data on page is too big + @retval -1 On errors +*/ + +static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag, + MARIA_PAGE *anc_page) +{ + int flag,ret_value,save_flag; + uint nod_flag, page_flag; + my_bool last_key; + uchar *leaf_buff,*keypos; + uchar lastkey[MARIA_MAX_KEY_BUFF]; + MARIA_KEY_PARAM s_temp; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE leaf_page; + DBUG_ENTER("d_search"); + DBUG_DUMP("page", anc_page->buff, anc_page->size); + + flag=(*keyinfo->bin_search)(key, anc_page, comp_flag, &keypos, lastkey, + &last_key); + if (flag == MARIA_FOUND_WRONG_KEY) + { + DBUG_PRINT("error",("Found wrong key")); + DBUG_RETURN(-1); + } + page_flag= anc_page->flag; + nod_flag= anc_page->node; + + if (!flag && (keyinfo->flag & HA_FULLTEXT)) + { + uint off; + int subkeys; + + get_key_full_length_rdonly(off, lastkey); + subkeys=ft_sintXkorr(lastkey+off); + 
DBUG_ASSERT(info->ft1_to_ft2==0 || subkeys >=0); + comp_flag=SEARCH_SAME; + if (subkeys >= 0) + { + /* normal word, one-level tree structure */ + if (info->ft1_to_ft2) + { + /* we're in ft1->ft2 conversion mode. Saving key data */ + insert_dynamic(info->ft1_to_ft2, (lastkey+off)); + } + else + { + /* we need exact match only if not in ft1->ft2 conversion mode */ + flag=(*keyinfo->bin_search)(key, anc_page, comp_flag, &keypos, + lastkey, &last_key); + } + /* fall through to normal delete */ + } + else + { + /* popular word. two-level tree. going down */ + uint tmp_key_length; + my_off_t root; + uchar *kpos=keypos; + MARIA_KEY tmp_key; + + tmp_key.data= lastkey; + tmp_key.keyinfo= keyinfo; + + if (!(tmp_key_length=(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, + &kpos))) + { + my_errno= HA_ERR_CRASHED; + DBUG_RETURN(-1); + } + root= _ma_row_pos_from_key(&tmp_key); + if (subkeys == -1) + { + /* the last entry in sub-tree */ + if (_ma_dispose(info, root, 1)) + DBUG_RETURN(-1); + /* fall through to normal delete */ + } + else + { + MARIA_KEY word_key; + keyinfo=&share->ft2_keyinfo; + /* we'll modify key entry 'in vivo' */ + kpos-=keyinfo->keylength+nod_flag; + get_key_full_length_rdonly(off, key->data); + + word_key.data= key->data + off; + word_key.keyinfo= &share->ft2_keyinfo; + word_key.data_length= HA_FT_WLEN; + word_key.ref_length= 0; + word_key.flag= 0; + ret_value= _ma_ck_real_delete(info, &word_key, &root); + _ma_dpointer(share, kpos+HA_FT_WLEN, root); + subkeys++; + ft_intXstore(kpos, subkeys); + if (!ret_value) + { + page_mark_changed(info, anc_page); + ret_value= _ma_write_keypage(anc_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS); + } + DBUG_PRINT("exit",("Return: %d",ret_value)); + DBUG_RETURN(ret_value); + } + } + } + leaf_buff=0; + if (nod_flag) + { + /* Read left child page */ + leaf_page.pos= _ma_kpos(nod_flag,keypos); + if (!(leaf_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ + MARIA_MAX_KEY_BUFF*2))) + { + DBUG_PRINT("error", 
("Couldn't allocate memory")); + my_errno=ENOMEM; + DBUG_RETURN(-1); + } + if (_ma_fetch_keypage(&leaf_page, info,keyinfo, leaf_page.pos, + PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, leaf_buff, + 0)) + goto err; + } + + if (flag != 0) + { + if (!nod_flag) + { + DBUG_PRINT("error",("Didn't find key")); + my_errno=HA_ERR_CRASHED; /* This should newer happend */ + goto err; + } + save_flag=0; + ret_value= d_search(info, key, comp_flag, &leaf_page); + } + else + { /* Found key */ + uint tmp; + uint anc_buff_length= anc_page->size; + uint anc_page_flag= anc_page->flag; + my_off_t next_block; + + if (!(tmp= remove_key(keyinfo, anc_page_flag, nod_flag, keypos, lastkey, + anc_page->buff + anc_buff_length, + &next_block, &s_temp))) + goto err; + + page_mark_changed(info, anc_page); + anc_buff_length-= tmp; + anc_page->size= anc_buff_length; + page_store_size(share, anc_page); + + /* + Log initial changes on pages + If there is an underflow, there will be more changes logged to the + page + */ + if (share->now_transactional && + _ma_log_delete(anc_page, s_temp.key_pos, + s_temp.changed_length, s_temp.move_length, + 0, KEY_OP_DEBUG_LOG_DEL_CHANGE_1)) + DBUG_RETURN(-1); + + if (!nod_flag) + { /* On leaf page */ + if (anc_buff_length <= (info->quick_mode ? 
+ MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length)) + { + /* Page will be written by caller if we return 1 */ + DBUG_RETURN(1); + } + if (_ma_write_keypage(anc_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS)) + DBUG_RETURN(-1); + DBUG_RETURN(0); + } + save_flag=1; /* Mark that anc_buff is changed */ + ret_value= del(info, key, anc_page, &leaf_page, + keypos, next_block, lastkey); + } + if (ret_value >0) + { + save_flag= 2; + if (ret_value == 1) + ret_value= underflow(info, keyinfo, anc_page, &leaf_page, keypos); + else + { + /* This can only happen with variable length keys */ + MARIA_KEY last_key; + DBUG_PRINT("test",("Enlarging of key when deleting")); + + last_key.data= lastkey; + last_key.keyinfo= keyinfo; + if (!_ma_get_last_key(&last_key, anc_page, keypos)) + goto err; + ret_value= _ma_insert(info, key, anc_page, keypos, + last_key.data, + (MARIA_PAGE*) 0, (uchar*) 0, (my_bool) 0); + + if (_ma_write_keypage(&leaf_page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + ret_value= -1; + } + } + if (ret_value == 0 && anc_page->size > share->max_index_block_size) + { + /* + parent buffer got too big ; We have to split the page. 
+ The | 2 is there to force write of anc page below + */ + save_flag= 3; + ret_value= _ma_split_page(info, key, anc_page, + share->max_index_block_size, + (uchar*) 0, 0, 0, lastkey, 0) | 2; + DBUG_ASSERT(anc_page->org_size == anc_page->size); + } + if (save_flag && ret_value != 1) + { + page_mark_changed(info, anc_page); + if (_ma_write_keypage(anc_page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + ret_value= -1; + } + else + { + DBUG_DUMP("page", anc_page->buff, anc_page->size); + } + my_afree(leaf_buff); + DBUG_PRINT("exit",("Return: %d",ret_value)); + DBUG_RETURN(ret_value); + +err: + my_afree(leaf_buff); + DBUG_PRINT("exit",("Error: %d",my_errno)); + DBUG_RETURN (-1); +} /* d_search */ + + +/** + @brief Remove a key that has a page-reference + + @param info Maria handler + @param key Buffer for key to be inserted at upper level + @param anc_page Page address for page where deleted key was + @param anc_buff Page buffer (nod) where deleted key was + @param leaf_page Page address for nod before the deleted key + @param leaf_buff Buffer for leaf_page + @param leaf_buff_link Pinned page link for leaf_buff + @param keypos Pos to where deleted key was on anc_buff + @param next_block Page adress for nod after deleted key + @param ret_key_buff Key before keypos in anc_buff + + @notes + leaf_page must be written to disk if retval > 0 + anc_page is not updated on disk. Caller should do this + + @return + @retval < 0 Error + @retval 0 OK. 
leaf_buff is written to disk + + @retval 1 key contains key to upper level (from balance page) + leaf_buff has underflow + @retval 2 key contains key to upper level (from split space) +*/ + +static int del(MARIA_HA *info, MARIA_KEY *key, + MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page, + uchar *keypos, my_off_t next_block, uchar *ret_key_buff) +{ + int ret_value,length; + uint a_length, page_flag, nod_flag, leaf_length, new_leaf_length; + uchar keybuff[MARIA_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key; + uchar *anc_buff; + MARIA_KEY_PARAM s_temp; + MARIA_KEY tmp_key; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_KEY ret_key; + MARIA_PAGE next_page; + DBUG_ENTER("del"); + DBUG_PRINT("enter",("leaf_page: %lu keypos: 0x%lx", + (ulong) (leaf_page->pos / share->block_size), + (ulong) keypos)); + DBUG_DUMP("leaf_buff", leaf_page->buff, leaf_page->size); + + page_flag= leaf_page->flag; + leaf_length= leaf_page->size; + nod_flag= leaf_page->node; + + endpos= leaf_page->buff + leaf_length; + tmp_key.keyinfo= keyinfo; + tmp_key.data= keybuff; + + if (!(key_start= _ma_get_last_key(&tmp_key, leaf_page, endpos))) + DBUG_RETURN(-1); + + if (nod_flag) + { + next_page.pos= _ma_kpos(nod_flag,endpos); + if (!(next_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ + MARIA_MAX_KEY_BUFF*2))) + DBUG_RETURN(-1); + if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos, + PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, next_buff, 0)) + ret_value= -1; + else + { + DBUG_DUMP("next_page", next_page.buff, next_page.size); + if ((ret_value= del(info, key, anc_page, &next_page, + keypos, next_block, ret_key_buff)) >0) + { + /* Get new length after key was deleted */ + endpos= leaf_page->buff+ leaf_page->size; + if (ret_value == 1) + { + /* underflow writes "next_page" to disk */ + ret_value= underflow(info, keyinfo, leaf_page, &next_page, + endpos); + if (ret_value == 0 && leaf_page->size > + share->max_index_block_size) + { + ret_value= 
(_ma_split_page(info, key, leaf_page, + share->max_index_block_size, + (uchar*) 0, 0, 0, + ret_key_buff, 0) | 2); + } + } + else + { + if (_ma_write_keypage(&next_page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + DBUG_PRINT("test",("Inserting of key when deleting")); + if (!_ma_get_last_key(&tmp_key, leaf_page, endpos)) + goto err; + ret_value= _ma_insert(info, key, leaf_page, endpos, + tmp_key.data, (MARIA_PAGE *) 0, (uchar*) 0, + 0); + } + } + page_mark_changed(info, leaf_page); + /* + If ret_value <> 0, then leaf_page underflowed and caller will have + to handle underflow and write leaf_page to disk. + We can't write it here, as if leaf_page is empty we get an assert + in _ma_write_keypage. + */ + if (ret_value == 0 && _ma_write_keypage(leaf_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + my_afree(next_buff); + DBUG_RETURN(ret_value); + } + + /* + Remove last key from leaf page + Note that leaf_page page may only have had one key (can normally only + happen in quick mode), in which ase it will now temporary have 0 keys + on it. This will be corrected by the caller as we will return 0. + */ + new_leaf_length= (uint) (key_start - leaf_page->buff); + leaf_page->size= new_leaf_length; + page_store_size(share, leaf_page); + + if (share->now_transactional && + _ma_log_suffix(leaf_page, leaf_length, new_leaf_length)) + goto err; + + page_mark_changed(info, leaf_page); /* Safety */ + if (new_leaf_length <= (info->quick_mode ? 
MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length)) + { + /* Underflow, leaf_page will be written by caller */ + ret_value= 1; + } + else + { + ret_value= 0; + if (_ma_write_keypage(leaf_page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + + /* Place last key in ancestor page on deleted key position */ + a_length= anc_page->size; + anc_buff= anc_page->buff; + endpos= anc_buff + a_length; + + ret_key.keyinfo= keyinfo; + ret_key.data= ret_key_buff; + + prev_key= 0; + if (keypos != anc_buff+share->keypage_header + share->base.key_reflength) + { + if (!_ma_get_last_key(&ret_key, anc_page, keypos)) + goto err; + prev_key= ret_key.data; + } + length= (*keyinfo->pack_key)(&tmp_key, share->base.key_reflength, + keypos == endpos ? (uchar*) 0 : keypos, + prev_key, prev_key, + &s_temp); + if (length > 0) + bmove_upp(endpos+length,endpos,(uint) (endpos-keypos)); + else + bmove(keypos,keypos-length, (int) (endpos-keypos)+length); + (*keyinfo->store_key)(keyinfo,keypos,&s_temp); + key_start= keypos; + if (tmp_key.flag & (SEARCH_USER_KEY_HAS_TRANSID | + SEARCH_PAGE_KEY_HAS_TRANSID)) + { + _ma_mark_page_with_transid(share, anc_page); + } + + /* Save pointer to next leaf on parent page */ + if (!(*keyinfo->get_key)(&ret_key, page_flag, share->base.key_reflength, + &keypos)) + goto err; + _ma_kpointer(info,keypos - share->base.key_reflength,next_block); + anc_page->size= a_length + length; + page_store_size(share, anc_page); + + if (share->now_transactional && + _ma_log_add(anc_page, a_length, + key_start, s_temp.changed_length, s_temp.move_length, 1, + KEY_OP_DEBUG_LOG_ADD_2)) + goto err; + + DBUG_RETURN(new_leaf_length <= + (info->quick_mode ? 
MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length)); +err: + DBUG_RETURN(-1); +} /* del */ + + +/** + @brief Balances adjacent pages if underflow occours + + @fn underflow() + @param anc_buff Anchestor page data + @param leaf_page Leaf page (page that underflowed) + @param leaf_page_link Pointer to pin information about leaf page + @param keypos Position after current key in anc_buff + + @note + This function writes redo entries for all changes + leaf_page is saved to disk + Caller must save anc_buff + + @return + @retval 0 ok + @retval 1 ok, but anc_buff did underflow + @retval -1 error + */ + +static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page, + uchar *keypos) +{ + int t_length; + uint anc_length,buff_length,leaf_length,p_length,s_length,nod_flag; + uint next_buff_length, new_buff_length, key_reflength; + uint unchanged_leaf_length, new_leaf_length, new_anc_length; + uint anc_page_flag, page_flag; + uchar anc_key_buff[MARIA_MAX_KEY_BUFF], leaf_key_buff[MARIA_MAX_KEY_BUFF]; + uchar *endpos, *next_keypos, *anc_pos, *half_pos, *prev_key; + uchar *anc_buff, *leaf_buff; + uchar *after_key, *anc_end_pos; + MARIA_KEY_PARAM key_deleted, key_inserted; + MARIA_SHARE *share= info->s; + my_bool first_key; + MARIA_KEY tmp_key, anc_key, leaf_key; + MARIA_PAGE next_page; + DBUG_ENTER("underflow"); + DBUG_PRINT("enter",("leaf_page: %lu keypos: 0x%lx", + (ulong) (leaf_page->pos / share->block_size), + (ulong) keypos)); + DBUG_DUMP("anc_buff", anc_page->buff, anc_page->size); + DBUG_DUMP("leaf_buff", leaf_page->buff, leaf_page->size); + + anc_page_flag= anc_page->flag; + anc_buff= anc_page->buff; + leaf_buff= leaf_page->buff; + info->keyread_buff_used=1; + next_keypos=keypos; + nod_flag= leaf_page->node; + p_length= nod_flag+share->keypage_header; + anc_length= anc_page->size; + leaf_length= leaf_page->size; + key_reflength= share->base.key_reflength; + if (share->keyinfo+info->lastinx == keyinfo) + 
info->page_changed=1; + first_key= keypos == anc_buff + share->keypage_header + key_reflength; + + tmp_key.data= info->buff; + anc_key.data= anc_key_buff; + leaf_key.data= leaf_key_buff; + tmp_key.keyinfo= leaf_key.keyinfo= anc_key.keyinfo= keyinfo; + + if ((keypos < anc_buff + anc_length && (info->state->records & 1)) || + first_key) + { + size_t tmp_length; + uint next_page_flag; + /* Use page right of anc-page */ + DBUG_PRINT("test",("use right page")); + + /* + Calculate position after the current key. Note that keydata itself is + not used + */ + if (keyinfo->flag & HA_BINARY_PACK_KEY) + { + if (!(next_keypos= _ma_get_key(&tmp_key, anc_page, keypos))) + goto err; + } + else + { + /* Avoid length error check if packed key */ + tmp_key.data[0]= tmp_key.data[1]= 0; + /* Got to end of found key */ + if (!(*keyinfo->get_key)(&tmp_key, anc_page_flag, key_reflength, + &next_keypos)) + goto err; + } + next_page.pos= _ma_kpos(key_reflength, next_keypos); + if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos, + PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, info->buff, 0)) + goto err; + next_buff_length= next_page.size; + next_page_flag= next_page.flag; + DBUG_DUMP("next", next_page.buff, next_page.size); + + /* find keys to make a big key-page */ + bmove(next_keypos-key_reflength, next_page.buff + share->keypage_header, + key_reflength); + + if (!_ma_get_last_key(&anc_key, anc_page, next_keypos) || + !_ma_get_last_key(&leaf_key, leaf_page, leaf_buff+leaf_length)) + goto err; + + /* merge pages and put parting key from anc_page between */ + prev_key= (leaf_length == p_length ? 
(uchar*) 0 : leaf_key.data); + t_length= (*keyinfo->pack_key)(&anc_key, nod_flag, next_page.buff+p_length, + prev_key, prev_key, &key_inserted); + tmp_length= next_buff_length - p_length; + endpos= next_page.buff + tmp_length + leaf_length + t_length; + /* next_page.buff will always be larger than before !*/ + bmove_upp(endpos, next_page.buff + next_buff_length, tmp_length); + memcpy(next_page.buff, leaf_buff,(size_t) leaf_length); + (*keyinfo->store_key)(keyinfo, next_page.buff+leaf_length, &key_inserted); + buff_length= (uint) (endpos - next_page.buff); + + /* Set page flag from combination of both key pages and parting key */ + page_flag= next_page_flag | leaf_page->flag; + if (anc_key.flag & (SEARCH_USER_KEY_HAS_TRANSID | + SEARCH_PAGE_KEY_HAS_TRANSID)) + page_flag|= KEYPAGE_FLAG_HAS_TRANSID; + + next_page.size= buff_length; + next_page.flag= page_flag; + page_store_info(share, &next_page); + + /* remove key from anc_page */ + if (!(s_length=remove_key(keyinfo, anc_page_flag, key_reflength, keypos, + anc_key_buff, anc_buff+anc_length, + (my_off_t *) 0, &key_deleted))) + goto err; + + new_anc_length= anc_length - s_length; + anc_page->size= new_anc_length; + page_store_size(share, anc_page); + + if (buff_length <= share->max_index_block_size) + { + /* All keys fitted into one page */ + page_mark_changed(info, &next_page); + if (_ma_dispose(info, next_page.pos, 0)) + goto err; + + memcpy(leaf_buff, next_page.buff, (size_t) buff_length); + leaf_page->size= next_page.size; + leaf_page->flag= next_page.flag; + + if (share->now_transactional) + { + /* + Log changes to parent page. Note that this page may have been + temporarily bigger than block_size. + */ + if (_ma_log_delete(anc_page, key_deleted.key_pos, + key_deleted.changed_length, + key_deleted.move_length, + anc_length - anc_page->org_size, + KEY_OP_DEBUG_LOG_DEL_CHANGE_2)) + goto err; + /* + Log changes to leaf page. 
Data for leaf page is in leaf_buff + which contains original leaf_buff, parting key and next_buff + */ + if (_ma_log_suffix(leaf_page, leaf_length, buff_length)) + goto err; + } + } + else + { + /* + Balancing didn't free a page, so we have to split 'buff' into two + pages: + - Find key in middle of buffer + - Store everything before key in 'leaf_page' + - Pack key into anc_page at position of deleted key + Note that anc_page may overflow! (is handled by caller) + - Store remaining keys in next_page (buff) + */ + MARIA_KEY_PARAM anc_key_inserted; + + anc_end_pos= anc_buff + new_anc_length; + + DBUG_PRINT("test",("anc_buff: 0x%lx anc_end_pos: 0x%lx", + (long) anc_buff, (long) anc_end_pos)); + + if (!first_key && !_ma_get_last_key(&anc_key, anc_page, keypos)) + goto err; + if (!(half_pos= _ma_find_half_pos(&leaf_key, &next_page, &after_key))) + goto err; + new_leaf_length= (uint) (half_pos - next_page.buff); + memcpy(leaf_buff, next_page.buff, (size_t) new_leaf_length); + + leaf_page->size= new_leaf_length; + leaf_page->flag= page_flag; + page_store_info(share, leaf_page); + + /* Correct new keypointer to leaf_page */ + half_pos=after_key; + _ma_kpointer(info, + leaf_key.data + leaf_key.data_length + leaf_key.ref_length, + next_page.pos); + + /* Save key in anc_page */ + prev_key= (first_key ? (uchar*) 0 : anc_key.data); + t_length= (*keyinfo->pack_key)(&leaf_key, key_reflength, + (keypos == anc_end_pos ? 
(uchar*) 0 : + keypos), + prev_key, prev_key, &anc_key_inserted); + if (t_length >= 0) + bmove_upp(anc_end_pos+t_length, anc_end_pos, + (uint) (anc_end_pos - keypos)); + else + bmove(keypos,keypos-t_length,(uint) (anc_end_pos-keypos)+t_length); + (*keyinfo->store_key)(keyinfo,keypos, &anc_key_inserted); + new_anc_length+= t_length; + anc_page->size= new_anc_length; + page_store_size(share, anc_page); + + if (leaf_key.flag & (SEARCH_USER_KEY_HAS_TRANSID | + SEARCH_PAGE_KEY_HAS_TRANSID)) + _ma_mark_page_with_transid(share, anc_page); + + /* Store key first in new page */ + if (nod_flag) + bmove(next_page.buff + share->keypage_header, half_pos-nod_flag, + (size_t) nod_flag); + if (!(*keyinfo->get_key)(&leaf_key, page_flag, nod_flag, &half_pos)) + goto err; + t_length=(int) (*keyinfo->pack_key)(&leaf_key, nod_flag, (uchar*) 0, + (uchar*) 0, (uchar*) 0, + &key_inserted); + /* t_length will always be > 0 for a new page !*/ + tmp_length= (size_t) ((next_page.buff + buff_length) - half_pos); + bmove(next_page.buff + p_length + t_length, half_pos, tmp_length); + (*keyinfo->store_key)(keyinfo, next_page.buff + p_length, &key_inserted); + new_buff_length= tmp_length + t_length + p_length; + next_page.size= new_buff_length; + page_store_size(share, &next_page); + /* keypage flag is already up to date */ + + if (share->now_transactional) + { + /* + Log changes to parent page + This has one key deleted from it and one key inserted to it at + keypos + + ma_log_add ensures that we don't log changes that is outside of + key block size, as the REDO code can't handle that + */ + if (_ma_log_add(anc_page, anc_length, keypos, + anc_key_inserted.move_length + + max(anc_key_inserted.changed_length - + anc_key_inserted.move_length, + key_deleted.changed_length), + anc_key_inserted.move_length - + key_deleted.move_length, 1, + KEY_OP_DEBUG_LOG_ADD_3)) + goto err; + + /* + Log changes to leaf page. 
+ This contains original data with new data added at end + */ + DBUG_ASSERT(leaf_length <= new_leaf_length); + if (_ma_log_suffix(leaf_page, leaf_length, new_leaf_length)) + goto err; + /* + Log changes to next page + + This contains original data with some prefix data deleted and + some compressed data at start possible extended + + Data in buff was originally: + org_leaf_buff [leaf_length] + separator_key [buff_key_inserted.move_length] + next_key_changes [buff_key_inserted.changed_length -move_length] + next_page_data [next_buff_length - p_length - + (buff_key_inserted.changed_length -move_length)] + + After changes it's now: + unpacked_key [key_inserted.changed_length] + next_suffix [next_buff_length - key_inserted.changed_length] + + */ + DBUG_ASSERT(new_buff_length <= next_buff_length); + if (_ma_log_prefix(&next_page, key_inserted.changed_length, + (int) (new_buff_length - next_buff_length), + KEY_OP_DEBUG_LOG_PREFIX_1)) + goto err; + } + page_mark_changed(info, &next_page); + if (_ma_write_keypage(&next_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS)) + goto err; + } + + page_mark_changed(info, leaf_page); + if (_ma_write_keypage(leaf_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS)) + goto err; + DBUG_RETURN(new_anc_length <= + ((info->quick_mode ? 
MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length))); + } + + DBUG_PRINT("test",("use left page")); + + keypos= _ma_get_last_key(&anc_key, anc_page, keypos); + if (!keypos) + goto err; + next_page.pos= _ma_kpos(key_reflength,keypos); + if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos, + PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, info->buff, 0)) + goto err; + buff_length= next_page.size; + endpos= next_page.buff + buff_length; + DBUG_DUMP("prev", next_page.buff, next_page.size); + + /* find keys to make a big key-page */ + bmove(next_keypos - key_reflength, leaf_buff + share->keypage_header, + key_reflength); + next_keypos=keypos; + if (!(*keyinfo->get_key)(&anc_key, anc_page_flag, key_reflength, + &next_keypos)) + goto err; + if (!_ma_get_last_key(&leaf_key, &next_page, endpos)) + goto err; + + /* merge pages and put parting key from anc_page between */ + prev_key= (leaf_length == p_length ? (uchar*) 0 : leaf_key.data); + t_length=(*keyinfo->pack_key)(&anc_key, nod_flag, + (leaf_length == p_length ? 
+ (uchar*) 0 : leaf_buff+p_length), + prev_key, prev_key, + &key_inserted); + if (t_length >= 0) + bmove(endpos+t_length, leaf_buff+p_length, + (size_t) (leaf_length-p_length)); + else /* We gained space */ + bmove(endpos,leaf_buff+((int) p_length-t_length), + (size_t) (leaf_length-p_length+t_length)); + (*keyinfo->store_key)(keyinfo,endpos, &key_inserted); + + /* Remember for logging how many bytes of leaf_buff that are not changed */ + DBUG_ASSERT((int) key_inserted.changed_length >= key_inserted.move_length); + unchanged_leaf_length= (leaf_length - p_length - + (key_inserted.changed_length - + key_inserted.move_length)); + + new_buff_length= buff_length + leaf_length - p_length + t_length; + +#ifdef EXTRA_DEBUG + /* Ensure that unchanged_leaf_length is correct */ + DBUG_ASSERT(bcmp(next_page.buff + new_buff_length - unchanged_leaf_length, + leaf_buff + leaf_length - unchanged_leaf_length, + unchanged_leaf_length) == 0); +#endif + + page_flag= next_page.flag | leaf_page->flag; + if (anc_key.flag & (SEARCH_USER_KEY_HAS_TRANSID | + SEARCH_PAGE_KEY_HAS_TRANSID)) + page_flag|= KEYPAGE_FLAG_HAS_TRANSID; + + next_page.size= new_buff_length; + next_page.flag= page_flag; + page_store_info(share, &next_page); + + /* remove key from anc_page */ + if (!(s_length= remove_key(keyinfo, anc_page_flag, key_reflength, keypos, + anc_key_buff, + anc_buff+anc_length, (my_off_t *) 0, + &key_deleted))) + goto err; + + new_anc_length= anc_length - s_length; + anc_page->size= new_anc_length; + page_store_size(share, anc_page); + + if (new_buff_length <= share->max_index_block_size) + { + /* All keys fitted into one page */ + page_mark_changed(info, leaf_page); + if (_ma_dispose(info, leaf_page->pos, 0)) + goto err; + + if (share->now_transactional) + { + /* + Log changes to parent page. Note that this page may have been + temporarily bigger than block_size. 
+ */ + if (_ma_log_delete(anc_page, key_deleted.key_pos, + key_deleted.changed_length, key_deleted.move_length, + anc_length - anc_page->org_size, + KEY_OP_DEBUG_LOG_DEL_CHANGE_3)) + goto err; + /* + Log changes to next page. Data for leaf page is in buff + that contains original leaf_buff, parting key and next_buff + */ + if (_ma_log_suffix(&next_page, buff_length, new_buff_length)) + goto err; + } + } + else + { + /* + Balancing didn't free a page, so we have to split 'next_page' into two + pages + - Find key in middle of buffer (buff) + - Pack key at half_buff into anc_page at position of deleted key + Note that anc_page may overflow! (is handled by caller) + - Move everything after middlekey to 'leaf_buff' + - Shorten buff at 'endpos' + */ + MARIA_KEY_PARAM anc_key_inserted; + size_t tmp_length; + + if (keypos == anc_buff + share->keypage_header + key_reflength) + anc_pos= 0; /* First key */ + else + { + if (!_ma_get_last_key(&anc_key, anc_page, keypos)) + goto err; + anc_pos= anc_key.data; + } + if (!(endpos= _ma_find_half_pos(&leaf_key, &next_page, &half_pos))) + goto err; + + /* Correct new keypointer to leaf_page */ + _ma_kpointer(info,leaf_key.data + leaf_key.data_length + + leaf_key.ref_length, leaf_page->pos); + + /* Save key in anc_page */ + DBUG_DUMP("anc_buff", anc_buff, new_anc_length); + DBUG_DUMP_KEY("key_to_anc", &leaf_key); + anc_end_pos= anc_buff + new_anc_length; + t_length=(*keyinfo->pack_key)(&leaf_key, key_reflength, + keypos == anc_end_pos ? 
(uchar*) 0 + : keypos, + anc_pos, anc_pos, + &anc_key_inserted); + if (t_length >= 0) + bmove_upp(anc_end_pos+t_length, anc_end_pos, + (uint) (anc_end_pos-keypos)); + else + bmove(keypos,keypos-t_length,(uint) (anc_end_pos-keypos)+t_length); + (*keyinfo->store_key)(keyinfo,keypos, &anc_key_inserted); + new_anc_length+= t_length; + anc_page->size= new_anc_length; + page_store_size(share, anc_page); + + if (leaf_key.flag & (SEARCH_USER_KEY_HAS_TRANSID | + SEARCH_PAGE_KEY_HAS_TRANSID)) + _ma_mark_page_with_transid(share, anc_page); + + /* Store first key on new page */ + if (nod_flag) + bmove(leaf_buff + share->keypage_header, half_pos-nod_flag, + (size_t) nod_flag); + if (!(*keyinfo->get_key)(&leaf_key, page_flag, nod_flag, &half_pos)) + goto err; + DBUG_DUMP_KEY("key_to_leaf", &leaf_key); + t_length=(*keyinfo->pack_key)(&leaf_key, nod_flag, (uchar*) 0, + (uchar*) 0, (uchar*) 0, &key_inserted); + /* t_length will always be > 0 for a new page !*/ + tmp_length= (size_t) ((next_page.buff + new_buff_length) - half_pos); + DBUG_PRINT("info",("t_length: %d length: %d",t_length, (int) tmp_length)); + bmove(leaf_buff+p_length+t_length, half_pos, tmp_length); + (*keyinfo->store_key)(keyinfo,leaf_buff+p_length, &key_inserted); + new_leaf_length= tmp_length + t_length + p_length; + + leaf_page->size= new_leaf_length; + leaf_page->flag= page_flag; + page_store_info(share, leaf_page); + + new_buff_length= (uint) (endpos - next_page.buff); + next_page.size= new_buff_length; + page_store_size(share, &next_page); + + if (share->now_transactional) + { + /* + Log changes to parent page + This has one key deleted from it and one key inserted to it at + keypos + + ma_log_add() ensures that we don't log changes that is outside of + key block size, as the REDO code can't handle that + */ + if (_ma_log_add(anc_page, anc_length, keypos, + anc_key_inserted.move_length + + max(anc_key_inserted.changed_length - + anc_key_inserted.move_length, + key_deleted.changed_length), + 
anc_key_inserted.move_length - + key_deleted.move_length, 1,KEY_OP_DEBUG_LOG_ADD_4)) + goto err; + + /* + Log changes to leaf page. + This contains original data with new data added first + */ + DBUG_ASSERT(leaf_length <= new_leaf_length); + DBUG_ASSERT(new_leaf_length >= unchanged_leaf_length); + if (_ma_log_prefix(leaf_page, new_leaf_length - unchanged_leaf_length, + (int) (new_leaf_length - leaf_length), + KEY_OP_DEBUG_LOG_PREFIX_2)) + goto err; + /* + Log changes to next page + This contains original data with some suffix data deleted + + */ + DBUG_ASSERT(new_buff_length <= buff_length); + if (_ma_log_suffix(&next_page, buff_length, new_buff_length)) + goto err; + } + + page_mark_changed(info, leaf_page); + if (_ma_write_keypage(leaf_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS)) + goto err; + } + page_mark_changed(info, &next_page); + if (_ma_write_keypage(&next_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS)) + goto err; + + DBUG_RETURN(new_anc_length <= + ((info->quick_mode ? 
MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length))); + +err: + DBUG_RETURN(-1); +} /* underflow */ + + +/** + @brief Remove a key from page + + @fn remove_key() + keyinfo Key handle + nod_flag Length of node ptr + keypos Where on page key starts + lastkey Buffer for storing keys to be removed + page_end Pointer to end of page + next_block If <> 0 and node-page, this is set to address of + next page + s_temp Information about what changes was done one the page: + s_temp.key_pos Start of key + s_temp.move_length Number of bytes removed at keypos + s_temp.changed_length Number of bytes changed at keypos + + @todo + The current code doesn't handle the case that the next key may be + packed better against the previous key if there is a case difference + + @return + @retval 0 error + @retval # How many chars was removed +*/ + +static uint remove_key(MARIA_KEYDEF *keyinfo, uint page_flag, uint nod_flag, + uchar *keypos, uchar *lastkey, + uchar *page_end, my_off_t *next_block, + MARIA_KEY_PARAM *s_temp) +{ + int s_length; + uchar *start; + DBUG_ENTER("remove_key"); + DBUG_PRINT("enter", ("keypos: 0x%lx page_end: 0x%lx", + (long) keypos, (long) page_end)); + + start= s_temp->key_pos= keypos; + s_temp->changed_length= 0; + if (!(keyinfo->flag & + (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY | + HA_BINARY_PACK_KEY)) && + !(page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { + /* Static length key */ + s_length=(int) (keyinfo->keylength+nod_flag); + if (next_block && nod_flag) + *next_block= _ma_kpos(nod_flag,keypos+s_length); + } + else + { + /* Let keypos point at next key */ + MARIA_KEY key; + + /* Calculate length of key */ + key.keyinfo= keyinfo; + key.data= lastkey; + if (!(*keyinfo->get_key)(&key, page_flag, nod_flag, &keypos)) + DBUG_RETURN(0); /* Error */ + + if (next_block && nod_flag) + *next_block= _ma_kpos(nod_flag,keypos); + s_length=(int) (keypos-start); + if (keypos != page_end) + { + if (keyinfo->flag & HA_BINARY_PACK_KEY) + { + uchar 
*old_key= start; + uint next_length,prev_length,prev_pack_length; + + /* keypos points here on start of next key */ + get_key_length(next_length,keypos); + get_key_pack_length(prev_length,prev_pack_length,old_key); + if (next_length > prev_length) + { + uint diff= (next_length-prev_length); + /* We have to copy data from the current key to the next key */ + keypos-= diff + prev_pack_length; + store_key_length(keypos, prev_length); + bmove(keypos + prev_pack_length, lastkey + prev_length, diff); + s_length=(int) (keypos-start); + s_temp->changed_length= diff + prev_pack_length; + } + } + else + { + /* Check if a variable length first key part */ + if ((keyinfo->seg->flag & HA_PACK_KEY) && *keypos & 128) + { + /* Next key is packed against the current one */ + uint next_length,prev_length,prev_pack_length,lastkey_length, + rest_length; + if (keyinfo->seg[0].length >= 127) + { + if (!(prev_length=mi_uint2korr(start) & 32767)) + goto end; + next_length=mi_uint2korr(keypos) & 32767; + keypos+=2; + prev_pack_length=2; + } + else + { + if (!(prev_length= *start & 127)) + goto end; /* Same key as previous*/ + next_length= *keypos & 127; + keypos++; + prev_pack_length=1; + } + if (!(*start & 128)) + prev_length=0; /* prev key not packed */ + if (keyinfo->seg[0].flag & HA_NULL_PART) + lastkey++; /* Skip null marker */ + get_key_length(lastkey_length,lastkey); + if (!next_length) /* Same key after */ + { + next_length=lastkey_length; + rest_length=0; + } + else + get_key_length(rest_length,keypos); + + if (next_length >= prev_length) + { + /* Next key is based on deleted key */ + uint pack_length; + uint diff= (next_length-prev_length); + + /* keypos points to data of next key (after key length) */ + bmove(keypos - diff, lastkey + prev_length, diff); + rest_length+= diff; + pack_length= prev_length ? 
get_pack_length(rest_length): 0; + keypos-= diff + pack_length + prev_pack_length; + s_length=(int) (keypos-start); + if (prev_length) /* Pack against prev key */ + { + *keypos++= start[0]; + if (prev_pack_length == 2) + *keypos++= start[1]; + store_key_length(keypos,rest_length); + } + else + { + /* Next key is not packed anymore */ + if (keyinfo->seg[0].flag & HA_NULL_PART) + { + rest_length++; /* Mark not null */ + } + if (prev_pack_length == 2) + { + mi_int2store(keypos,rest_length); + } + else + *keypos= rest_length; + } + s_temp->changed_length= diff + pack_length + prev_pack_length; + } + } + } + } + } + end: + bmove(start, start+s_length, (uint) (page_end-start-s_length)); + s_temp->move_length= s_length; + DBUG_RETURN((uint) s_length); +} /* remove_key */ + + +/**************************************************************************** + Logging of redos +****************************************************************************/ + +/** + @brief + log entry where some parts are deleted and some things are changed + and some data could be added last. + + @fn _ma_log_delete() + @param info Maria handler + @param page Pageaddress for changed page + @param buff Page buffer + @param key_pos Start of change area + @param changed_length How many bytes where changed at key_pos + @param move_length How many bytes where deleted at key_pos + @param append_length Length of data added last + This is taken from end of ma_page->buff + + This is mainly used when a key is deleted. 
The append happens + when we delete a key from a page with data > block_size kept in + memory and we have to add back the data that was stored > block_size +*/ + +my_bool _ma_log_delete(MARIA_PAGE *ma_page, const uchar *key_pos, + uint changed_length, uint move_length, + uint append_length __attribute__((unused)), + enum en_key_debug debug_marker __attribute__((unused))) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 5+ 2 + 3 + 3 + 6 + 3 + 7]; + uchar *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 7]; + uint translog_parts, current_size, extra_length; + uint offset= (uint) (key_pos - ma_page->buff); + MARIA_HA *info= ma_page->info; + MARIA_SHARE *share= info->s; + my_off_t page= ma_page->pos / share->block_size; + DBUG_ENTER("_ma_log_delete"); + DBUG_PRINT("enter", ("page: %lu changed_length: %u move_length: %d", + (ulong) page, changed_length, move_length)); + DBUG_ASSERT(share->now_transactional && move_length); + DBUG_ASSERT(offset + changed_length <= ma_page->size); + DBUG_ASSERT(ma_page->org_size - move_length + append_length == ma_page->size); + DBUG_ASSERT(move_length <= ma_page->org_size - share->keypage_header); + + /* Store address of new root page */ + page_store(log_data + FILEID_STORE_SIZE, page); + log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE; + current_size= ma_page->org_size; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + *log_pos++= KEY_OP_DEBUG; + *log_pos++= debug_marker; + + *log_pos++= KEY_OP_DEBUG_2; + int2store(log_pos, ma_page->org_size); + int2store(log_pos+2, ma_page->size); + log_pos+=4; +#endif + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET]; + + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos+= 3; + translog_parts= TRANSLOG_INTERNAL_PARTS + 1; + extra_length= 0; + + if (changed_length) + { + if (offset + changed_length >= share->max_index_block_size) + { + changed_length= share->max_index_block_size - offset; 
+ move_length= 0; /* Nothing to move */ + current_size= share->max_index_block_size; + } + + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, changed_length); + log_pos+= 3; + log_array[translog_parts].str= ma_page->buff + offset; + log_array[translog_parts].length= changed_length; + translog_parts++; + + /* We only have to move things after offset+changed_length */ + offset+= changed_length; + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data); + + if (move_length) + { + uint log_length; + if (offset + move_length < share->max_index_block_size) + { + /* + Move down things that is on page. + page_offset in apply_redo_inxed() will be at original offset + + changed_length. + */ + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, - (int) move_length); + log_length= 3; + current_size-= move_length; + } + else + { + /* Delete to end of page */ + uint tmp= current_size - offset; + current_size= offset; + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, tmp); + log_length= 3; + } + log_array[translog_parts].str= log_pos; + log_array[translog_parts].length= log_length; + translog_parts++; + log_pos+= log_length; + extra_length+= log_length; + } + + if (current_size != ma_page->size && + current_size != share->max_index_block_size) + { + /* Append data that didn't fit on the page before */ + uint length= (min(ma_page->size, share->max_index_block_size) - + current_size); + uchar *data= ma_page->buff + current_size; + + DBUG_ASSERT(length <= append_length); + + log_pos[0]= KEY_OP_ADD_SUFFIX; + int2store(log_pos+1, length); + log_array[translog_parts].str= log_pos; + log_array[translog_parts].length= 3; + log_array[translog_parts + 1].str= data; + log_array[translog_parts + 1].length= length; + log_pos+= 3; + translog_parts+= 2; + current_size+= length; + extra_length+= 3 + length; + } + + _ma_log_key_changes(ma_page, + log_array + translog_parts, + log_pos, &extra_length, 
&translog_parts); + /* Remember new page length for future log entires for same page */ + ma_page->org_size= current_size; + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS].length + + changed_length + extra_length, translog_parts, + log_array, log_data, NULL)) + DBUG_RETURN(1); + + DBUG_RETURN(0); +} + + +/**************************************************************************** + Logging of undos +****************************************************************************/ + +my_bool _ma_write_undo_key_delete(MARIA_HA *info, const MARIA_KEY *key, + my_off_t new_root, LSN *res_lsn) +{ + MARIA_SHARE *share= info->s; + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE], *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + struct st_msg_to_write_hook_for_undo_key msg; + enum translog_record_type log_type= LOGREC_UNDO_KEY_DELETE; + uint keynr= key->keyinfo->key_nr; + + lsn_store(log_data, info->trn->undo_lsn); + key_nr_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, keynr); + log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE; + + /** + @todo BUG if we had concurrent insert/deletes, reading state's key_root + like this would be unsafe. + */ + if (new_root != share->state.key_root[keynr]) + { + my_off_t page; + page= ((new_root == HA_OFFSET_ERROR) ? 
IMPOSSIBLE_PAGE_NO : + new_root / share->block_size); + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + log_type= LOGREC_UNDO_KEY_DELETE_WITH_ROOT; + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key->data; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= (key->data_length + + key->ref_length); + + msg.root= &share->state.key_root[keynr]; + msg.value= new_root; + /* + set autoincrement to 1 if this is an auto_increment key + This is only used if we are now in a rollback of a duplicate key + */ + msg.auto_increment= share->base.auto_key == keynr + 1; + + return translog_write_record(res_lsn, log_type, + info->trn, info, + (translog_size_t) + (log_array[TRANSLOG_INTERNAL_PARTS + 0].length + + log_array[TRANSLOG_INTERNAL_PARTS + 1].length), + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data + LSN_STORE_SIZE, &msg) ? -1 : 0; +} diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c new file mode 100644 index 00000000000..4661ea0ab59 --- /dev/null +++ b/storage/maria/ma_delete_all.c @@ -0,0 +1,192 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Remove all rows from a MARIA table */ +/* This clears the status information and truncates files */ + +#include "maria_def.h" +#include "trnman.h" + +/** + @brief deletes all rows from a table + + @param info Maria handler + + @note It is important that this function does not rely on the state + information, as it may be called by ma_apply_undo_bulk_insert() on an + inconsistent table left by a crash. + + @return Operation status + @retval 0 ok + @retval 1 error +*/ + +int maria_delete_all_rows(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + my_bool log_record; + LSN lsn; + DBUG_ENTER("maria_delete_all_rows"); + + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + DBUG_RETURN(my_errno=EACCES); + } + /** + @todo LOCK take X-lock on table here. + When we have versioning, if some other thread is looking at this table, + we cannot shrink the file like this. + */ + if (_ma_readinfo(info,F_WRLCK,1)) + DBUG_RETURN(my_errno); + log_record= share->now_transactional && !share->temporary; + if (_ma_mark_file_changed(info)) + goto err; + + if (log_record) + { + /* + This record will be used by Recovery to finish the deletion if it + crashed. We force it to have a complete history in the log. + */ + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + uchar log_data[FILEID_STORE_SIZE]; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DELETE_ALL, + info->trn, info, 0, + sizeof(log_array)/sizeof(log_array[0]), + log_array, log_data, NULL) || + translog_flush(lsn))) + goto err; + /* + If we fail in this function after this point, log and table will be + inconsistent. 
+ */ + } + else + { + /* Other branch called function below when writing log record, in hook */ + _ma_reset_status(info); + } + /* Remove old history as the table is now empty for everyone */ + _ma_reset_state(info); + + /* + If we are using delayed keys or if the user has done changes to the tables + since it was locked then there may be key blocks in the page cache. Or + there may be data blocks there. We need to throw them away or they may + re-enter the emptied table or another table later. + */ + +#ifdef HAVE_MMAP + if (share->file_map) + _ma_unmap_file(info); +#endif + + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX, + FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED) || + my_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) || + my_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME))) + goto err; + + if (_ma_initialize_data_file(share, info->dfile.file)) + goto err; + + if (log_record) + { + /* + Because LOGREC_REDO_DELETE_ALL does not operate on pages, it has the + following problem: + delete_all; inserts (redo_insert); all pages get flushed; checkpoint: + the dirty pages list will be empty. In recovery, delete_all is executed, + but redo_insert are skipped (dirty pages list is empty). + To avoid this, we need to set skip_redo_lsn now, and thus need to sync + files. + Also fixes the problem of: + bulk insert; insert; delete_all; crash: + "bulk insert" is skipped (no REDOs), so if "insert" would not be skipped + (if we didn't update skip_redo_lsn below) then "insert" would be tried + and fail, saying that it sees that the first page has to be created + though the inserted row has rownr>0. 
+ */ + my_bool error= _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_LOCK) || + _ma_update_state_lsns(share, lsn, trnman_get_min_trid(), FALSE, FALSE) || + _ma_sync_table_files(info); + info->trn->rec_lsn= LSN_IMPOSSIBLE; + if (error) + goto err; + } + + VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); +#ifdef HAVE_MMAP + /* Map again */ + if (share->file_map) + _ma_dynmap_file(info, (my_off_t) 0); +#endif + allow_break(); /* Allow SIGHUP & SIGINT */ + DBUG_RETURN(0); + +err: + { + int save_errno=my_errno; + VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); + info->update|=HA_STATE_WRITTEN; /* Buffer changed */ + allow_break(); /* Allow SIGHUP & SIGINT */ + DBUG_RETURN(my_errno=save_errno); + } +} /* maria_delete_all_rows */ + + +/* + Reset status information + + SYNOPSIS + _ma_reset_status() + maria Maria handler + + DESCRIPTION + Resets data and index file information as if the file would be empty + Files are not touched. +*/ + +void _ma_reset_status(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + MARIA_STATE_INFO *state= &share->state; + uint i; + DBUG_ENTER("_ma_reset_status"); + + state->split= 0; + state->state.records= state->state.del= 0; + state->changed= 0; /* File is optimized */ + state->dellink= HA_OFFSET_ERROR; + state->sortkey= (ushort) ~0; + state->state.key_file_length= share->base.keystart; + state->state.data_file_length= 0; + state->state.empty= state->state.key_empty= 0; + state->state.checksum= 0; + + *info->state= state->state; + + /* Drop the delete key chain. 
*/ + state->key_del= HA_OFFSET_ERROR; + /* Clear all keys */ + for (i=0 ; i < share->base.keys ; i++) + state->key_root[i]= HA_OFFSET_ERROR; + DBUG_VOID_RETURN; +} diff --git a/storage/maria/ma_delete_table.c b/storage/maria/ma_delete_table.c new file mode 100644 index 00000000000..0237bb884c5 --- /dev/null +++ b/storage/maria/ma_delete_table.c @@ -0,0 +1,107 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "ma_fulltext.h" +#include "trnman_public.h" + +/** + @brief drops (deletes) a table + + @param name table's name + + @return Operation status + @retval 0 ok + @retval 1 error +*/ + +int maria_delete_table(const char *name) +{ + char from[FN_REFLEN]; +#ifdef USE_RAID + uint raid_type=0,raid_chunks=0; +#endif + MARIA_HA *info; + myf sync_dir; + DBUG_ENTER("maria_delete_table"); + +#ifdef EXTRA_DEBUG + _ma_check_table_is_closed(name,"delete"); +#endif + /** @todo LOCK take X-lock on table */ + /* + We need to know if this table is transactional. + When built with RAID support, we also need to determine if this table + makes use of the raid feature. If yes, we need to remove all raid + chunks. This is done with my_raid_delete(). Unfortunately it is + necessary to open the table just to check this. We use + 'open_for_repair' to be able to open even a crashed table. 
If even + this open fails, we assume no raid configuration for this table + and try to remove the normal data file only. This may however + leave the raid chunks behind. + */ + if (!(info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR))) + { +#ifdef USE_RAID + raid_type= 0; +#endif + sync_dir= 0; + } + else + { +#ifdef USE_RAID + raid_type= info->s->base.raid_type; + raid_chunks= info->s->base.raid_chunks; +#endif + sync_dir= (info->s->now_transactional && !info->s->temporary && + !maria_in_recovery) ? + MY_SYNC_DIR : 0; + maria_close(info); + } + + if (sync_dir) + { + /* + For this log record to be of any use for Recovery, we need the upper + MySQL layer to be crash-safe in DDLs. + For now this record can serve when we apply logs to a backup, so we sync + it. + */ + LSN lsn; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar*)name; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= strlen(name) + 1; + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DROP_TABLE, + &dummy_transaction_object, NULL, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL) || + translog_flush(lsn))) + DBUG_RETURN(1); + } + + fn_format(from,name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + if (my_delete_with_symlink(from, MYF(MY_WME | sync_dir))) + DBUG_RETURN(my_errno); + fn_format(from,name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); +#ifdef USE_RAID + if (raid_type) + DBUG_RETURN(my_raid_delete(from, raid_chunks, MYF(MY_WME | sync_dir)) ? + my_errno : 0); +#endif + DBUG_RETURN(my_delete_with_symlink(from, MYF(MY_WME | sync_dir)) ? 
+ my_errno : 0); +} diff --git a/storage/maria/ma_dynrec.c b/storage/maria/ma_dynrec.c new file mode 100644 index 00000000000..57b76b713f4 --- /dev/null +++ b/storage/maria/ma_dynrec.c @@ -0,0 +1,2042 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Functions to handle space-packed-records and blobs + + A row may be stored in one or more linked blocks. + The block size is between MARIA_MIN_BLOCK_LENGTH and MARIA_MAX_BLOCK_LENGTH. + Each block is aligned on MARIA_DYN_ALIGN_SIZE. + The reason for the max block size is to not have too many different types + of blocks. 
For the differnet block types, look at _ma_get_block_info() +*/ + +#include "maria_def.h" + +static my_bool write_dynamic_record(MARIA_HA *info,const uchar *record, + ulong reclength); +static int _ma_find_writepos(MARIA_HA *info,ulong reclength,my_off_t *filepos, + ulong *length); +static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos, + uchar *record, ulong reclength); +static my_bool delete_dynamic_record(MARIA_HA *info,MARIA_RECORD_POS filepos, + uint second_read); +static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos, + uint length); + +#ifdef THREAD +/* Play it safe; We have a small stack when using threads */ +#undef my_alloca +#undef my_afree +#define my_alloca(A) my_malloc((A),MYF(0)) +#define my_afree(A) my_free((A),MYF(0)) +#endif + + /* Interface function from MARIA_HA */ + +#ifdef HAVE_MMAP + +/* + Create mmaped area for MARIA handler + + SYNOPSIS + _ma_dynmap_file() + info MARIA handler + + RETURN + 0 ok + 1 error. +*/ + +my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size) +{ + DBUG_ENTER("_ma_dynmap_file"); + if (size > (my_off_t) (~((size_t) 0)) - MEMMAP_EXTRA_MARGIN) + { + DBUG_PRINT("warning", ("File is too large for mmap")); + DBUG_RETURN(1); + } + /* + Ingo wonders if it is good to use MAP_NORESERVE. From the Linux man page: + MAP_NORESERVE + Do not reserve swap space for this mapping. When swap space is + reserved, one has the guarantee that it is possible to modify the + mapping. When swap space is not reserved one might get SIGSEGV + upon a write if no physical memory is available. + */ + info->s->file_map= (uchar*) + my_mmap(0, (size_t)(size + MEMMAP_EXTRA_MARGIN), + info->s->mode==O_RDONLY ? 
PROT_READ : + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_NORESERVE, + info->dfile.file, 0L); + if (info->s->file_map == (uchar*) MAP_FAILED) + { + info->s->file_map= NULL; + DBUG_RETURN(1); + } +#if defined(HAVE_MADVISE) + madvise((char*) info->s->file_map, size, MADV_RANDOM); +#endif + info->s->mmaped_length= size; + DBUG_RETURN(0); +} + + +/* + Resize mmaped area for MARIA handler + + SYNOPSIS + _ma_remap_file() + info MARIA handler + + RETURN +*/ + +void _ma_remap_file(MARIA_HA *info, my_off_t size) +{ + if (info->s->file_map) + { + VOID(my_munmap((char*) info->s->file_map, + (size_t) info->s->mmaped_length + MEMMAP_EXTRA_MARGIN)); + _ma_dynmap_file(info, size); + } +} +#endif + + +/* + Read bytes from MySAM handler, using mmap or pread + + SYNOPSIS + _ma_mmap_pread() + info MARIA handler + Buffer Input buffer + Count Count of bytes for read + offset Start position + MyFlags + + RETURN + 0 ok +*/ + +size_t _ma_mmap_pread(MARIA_HA *info, uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags) +{ + DBUG_PRINT("info", ("maria_read with mmap %d\n", info->dfile.file)); + if (info->s->lock_key_trees) + rw_rdlock(&info->s->mmap_lock); + + /* + The following test may fail in the following cases: + - We failed to remap a memory area (fragmented memory?) + - This thread has done some writes, but not yet extended the + memory mapped area. 
+ */ + + if (info->s->mmaped_length >= offset + Count) + { + memcpy(Buffer, info->s->file_map + offset, Count); + if (info->s->lock_key_trees) + rw_unlock(&info->s->mmap_lock); + return 0; + } + else + { + if (info->s->lock_key_trees) + rw_unlock(&info->s->mmap_lock); + return my_pread(info->dfile.file, Buffer, Count, offset, MyFlags); + } +} + + + /* wrapper for my_pread in case if mmap isn't used */ + +size_t _ma_nommap_pread(MARIA_HA *info, uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags) +{ + return my_pread(info->dfile.file, Buffer, Count, offset, MyFlags); +} + + +/* + Write bytes to MySAM handler, using mmap or pwrite + + SYNOPSIS + _ma_mmap_pwrite() + info MARIA handler + Buffer Output buffer + Count Count of bytes for write + offset Start position + MyFlags + + RETURN + 0 ok + !=0 error. In this case return error from pwrite +*/ + +size_t _ma_mmap_pwrite(MARIA_HA *info, const uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags) +{ + DBUG_PRINT("info", ("maria_write with mmap %d\n", info->dfile.file)); + if (info->s->lock_key_trees) + rw_rdlock(&info->s->mmap_lock); + + /* + The following test may fail in the following cases: + - We failed to remap a memory area (fragmented memory?) + - This thread has done some writes, but not yet extended the + memory mapped area. 
+ */ + + if (info->s->mmaped_length >= offset + Count) + { + memcpy(info->s->file_map + offset, Buffer, Count); + if (info->s->lock_key_trees) + rw_unlock(&info->s->mmap_lock); + return 0; + } + else + { + info->s->nonmmaped_inserts++; + if (info->s->lock_key_trees) + rw_unlock(&info->s->mmap_lock); + return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags); + } + +} + + + /* wrapper for my_pwrite in case if mmap isn't used */ + +size_t _ma_nommap_pwrite(MARIA_HA *info, const uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags) +{ + return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags); +} + + +my_bool _ma_write_dynamic_record(MARIA_HA *info, const uchar *record) +{ + ulong reclength= _ma_rec_pack(info,info->rec_buff + MARIA_REC_BUFF_OFFSET, + record); + return (write_dynamic_record(info,info->rec_buff + MARIA_REC_BUFF_OFFSET, + reclength)); +} + +my_bool _ma_update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec __attribute__ ((unused)), + const uchar *record) +{ + uint length= _ma_rec_pack(info, info->rec_buff + MARIA_REC_BUFF_OFFSET, + record); + return (update_dynamic_record(info, pos, + info->rec_buff + MARIA_REC_BUFF_OFFSET, + length)); +} + + +my_bool _ma_write_blob_record(MARIA_HA *info, const uchar *record) +{ + uchar *rec_buff; + int error; + ulong reclength,reclength2,extra; + + extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+ + MARIA_DYN_DELETE_BLOCK_HEADER+1); + reclength= (info->s->base.pack_reclength + + _ma_calc_total_blob_length(info,record)+ extra); + if (!(rec_buff=(uchar*) my_alloca(reclength))) + { + my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */ + return(1); + } + reclength2= _ma_rec_pack(info, + rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + record); + DBUG_PRINT("info",("reclength: %lu reclength2: %lu", + reclength, reclength2)); + DBUG_ASSERT(reclength2 <= reclength); + error= write_dynamic_record(info, + rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + 
reclength2); + my_afree(rec_buff); + return(error != 0); +} + + +my_bool _ma_update_blob_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec __attribute__ ((unused)), + const uchar *record) +{ + uchar *rec_buff; + int error; + ulong reclength,extra; + + extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+ + MARIA_DYN_DELETE_BLOCK_HEADER); + reclength= (info->s->base.pack_reclength+ + _ma_calc_total_blob_length(info,record)+ extra); +#ifdef NOT_USED /* We now support big rows */ + if (reclength > MARIA_DYN_MAX_ROW_LENGTH) + { + my_errno=HA_ERR_TO_BIG_ROW; + return 1; + } +#endif + if (!(rec_buff=(uchar*) my_alloca(reclength))) + { + my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */ + return(1); + } + reclength= _ma_rec_pack(info,rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + record); + error=update_dynamic_record(info,pos, + rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + reclength); + my_afree(rec_buff); + return(error != 0); +} + + +my_bool _ma_delete_dynamic_record(MARIA_HA *info, + const uchar *record __attribute__ ((unused))) +{ + return delete_dynamic_record(info, info->cur_row.lastpos, 0); +} + + +/** + Write record to data-file. + + @todo it's cheating: it casts "const uchar*" to uchar*. +*/ + +static my_bool write_dynamic_record(MARIA_HA *info, const uchar *record, + ulong reclength) +{ + int flag; + ulong length; + my_off_t filepos; + DBUG_ENTER("write_dynamic_record"); + + flag=0; + + /* + Check if we have enough room for the new record. + First we do simplified check to make usual case faster. + Then we do more precise check for the space left. + Though it still is not absolutely precise, as + we always use MARIA_MAX_DYN_BLOCK_HEADER while it can be + less in the most of the cases. 
+ */ + + if (unlikely(info->s->base.max_data_file_length - + info->state->data_file_length < + reclength + MARIA_MAX_DYN_BLOCK_HEADER)) + { + if (info->s->base.max_data_file_length - info->state->data_file_length + + info->state->empty - info->state->del * MARIA_MAX_DYN_BLOCK_HEADER < + reclength + MARIA_MAX_DYN_BLOCK_HEADER) + { + my_errno=HA_ERR_RECORD_FILE_FULL; + DBUG_RETURN(1); + } + } + + do + { + if (_ma_find_writepos(info,reclength,&filepos,&length)) + goto err; + if (_ma_write_part_record(info,filepos,length, + (info->append_insert_at_end ? + HA_OFFSET_ERROR : info->s->state.dellink), + (uchar**) &record,&reclength,&flag)) + goto err; + } while (reclength); + + DBUG_RETURN(0); +err: + DBUG_RETURN(1); +} + + + /* Get a block for data ; The given data-area must be used !! */ + +static int _ma_find_writepos(MARIA_HA *info, + ulong reclength, /* record length */ + my_off_t *filepos, /* Return file pos */ + ulong *length) /* length of block at filepos */ +{ + MARIA_BLOCK_INFO block_info; + ulong tmp; + DBUG_ENTER("_ma_find_writepos"); + + if (info->s->state.dellink != HA_OFFSET_ERROR && + !info->append_insert_at_end) + { + /* Deleted blocks exists; Get last used block */ + *filepos=info->s->state.dellink; + block_info.second_read=0; + info->rec_cache.seek_not_done=1; + if (!(_ma_get_block_info(&block_info, info->dfile.file, + info->s->state.dellink) & + BLOCK_DELETED)) + { + DBUG_PRINT("error",("Delete link crashed")); + my_errno=HA_ERR_WRONG_IN_RECORD; + DBUG_RETURN(-1); + } + info->s->state.dellink=block_info.next_filepos; + info->state->del--; + info->state->empty-= block_info.block_len; + *length= block_info.block_len; + } + else + { + /* No deleted blocks; Allocate a new block */ + *filepos=info->state->data_file_length; + if ((tmp=reclength+3 + test(reclength >= (65520-3))) < + info->s->base.min_block_length) + tmp= info->s->base.min_block_length; + else + tmp= ((tmp+MARIA_DYN_ALIGN_SIZE-1) & + (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1))); + if 
(info->state->data_file_length > + (info->s->base.max_data_file_length - tmp)) + { + my_errno=HA_ERR_RECORD_FILE_FULL; + DBUG_RETURN(-1); + } + if (tmp > MARIA_MAX_BLOCK_LENGTH) + tmp=MARIA_MAX_BLOCK_LENGTH; + *length= tmp; + info->state->data_file_length+= tmp; + info->s->state.split++; + info->update|=HA_STATE_WRITE_AT_END; + } + DBUG_RETURN(0); +} /* _ma_find_writepos */ + + + +/* + Unlink a deleted block from the deleted list. + This block will be combined with the preceding or next block to form + a big block. +*/ + +static my_bool unlink_deleted_block(MARIA_HA *info, + MARIA_BLOCK_INFO *block_info) +{ + DBUG_ENTER("unlink_deleted_block"); + if (block_info->filepos == info->s->state.dellink) + { + /* First deleted block; We can just use this ! */ + info->s->state.dellink=block_info->next_filepos; + } + else + { + MARIA_BLOCK_INFO tmp; + tmp.second_read=0; + /* Unlink block from the previous block */ + if (!(_ma_get_block_info(&tmp, info->dfile.file, block_info->prev_filepos) + & BLOCK_DELETED)) + DBUG_RETURN(1); /* Something is wrong */ + mi_sizestore(tmp.header+4,block_info->next_filepos); + if (info->s->file_write(info, tmp.header+4,8, + block_info->prev_filepos+4, MYF(MY_NABP))) + DBUG_RETURN(1); + /* Unlink block from next block */ + if (block_info->next_filepos != HA_OFFSET_ERROR) + { + if (!(_ma_get_block_info(&tmp, info->dfile.file, + block_info->next_filepos) + & BLOCK_DELETED)) + DBUG_RETURN(1); /* Something is wrong */ + mi_sizestore(tmp.header+12,block_info->prev_filepos); + if (info->s->file_write(info, tmp.header+12,8, + block_info->next_filepos+12, + MYF(MY_NABP))) + DBUG_RETURN(1); + } + } + /* We now have one less deleted block */ + info->state->del--; + info->state->empty-= block_info->block_len; + info->s->state.split--; + + /* + If this was a block that we where accessing through table scan + (maria_rrnd() or maria_scan(), then ensure that we skip over this block + when doing next maria_rrnd() or maria_scan(). 
+ */ + if (info->cur_row.nextpos == block_info->filepos) + info->cur_row.nextpos+= block_info->block_len; + DBUG_RETURN(0); +} + + +/* + Add a backward link to delete block + + SYNOPSIS + update_backward_delete_link() + info MARIA handler + delete_block Position to delete block to update. + If this is 'HA_OFFSET_ERROR', nothing will be done + filepos Position to block that 'delete_block' should point to + + RETURN + 0 ok + 1 error. In this case my_error is set. +*/ + +static my_bool update_backward_delete_link(MARIA_HA *info, + my_off_t delete_block, + MARIA_RECORD_POS filepos) +{ + MARIA_BLOCK_INFO block_info; + DBUG_ENTER("update_backward_delete_link"); + + if (delete_block != HA_OFFSET_ERROR) + { + block_info.second_read=0; + if (_ma_get_block_info(&block_info, info->dfile.file, delete_block) + & BLOCK_DELETED) + { + uchar buff[8]; + mi_sizestore(buff,filepos); + if (info->s->file_write(info,buff, 8, delete_block+12, MYF(MY_NABP))) + DBUG_RETURN(1); /* Error on write */ + } + else + { + my_errno=HA_ERR_WRONG_IN_RECORD; + DBUG_RETURN(1); /* Wrong delete link */ + } + } + DBUG_RETURN(0); +} + +/* Delete datarecord from database */ +/* info->rec_cache.seek_not_done is updated in cmp_record */ + +static my_bool delete_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos, + uint second_read) +{ + uint length,b_type; + MARIA_BLOCK_INFO block_info,del_block; + int error; + my_bool remove_next_block; + DBUG_ENTER("delete_dynamic_record"); + + /* First add a link from the last block to the new one */ + error= update_backward_delete_link(info, info->s->state.dellink, filepos); + + block_info.second_read=second_read; + do + { + /* Remove block at 'filepos' */ + if ((b_type= _ma_get_block_info(&block_info, info->dfile.file, filepos)) + & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR) || + (length=(uint) (block_info.filepos-filepos) +block_info.block_len) < + MARIA_MIN_BLOCK_LENGTH) + { + my_errno=HA_ERR_WRONG_IN_RECORD; + DBUG_RETURN(1); + } + /* 
Check if next block is a delete block */ + del_block.second_read=0; + remove_next_block=0; + if (_ma_get_block_info(&del_block, info->dfile.file, filepos + length) & + BLOCK_DELETED && del_block.block_len+length < + MARIA_DYN_MAX_BLOCK_LENGTH) + { + /* We can't remove this yet as this block may be the head block */ + remove_next_block=1; + length+=del_block.block_len; + } + + block_info.header[0]=0; + mi_int3store(block_info.header+1,length); + mi_sizestore(block_info.header+4,info->s->state.dellink); + if (b_type & BLOCK_LAST) + bfill(block_info.header+12,8,255); + else + mi_sizestore(block_info.header+12,block_info.next_filepos); + if (info->s->file_write(info, block_info.header, 20, filepos, + MYF(MY_NABP))) + DBUG_RETURN(1); + info->s->state.dellink = filepos; + info->state->del++; + info->state->empty+=length; + filepos=block_info.next_filepos; + + /* Now it's safe to unlink the deleted block directly after this one */ + if (remove_next_block && unlink_deleted_block(info,&del_block)) + error=1; + } while (!(b_type & BLOCK_LAST)); + + DBUG_RETURN(error); +} + + + /* Write a block to datafile */ + +int _ma_write_part_record(MARIA_HA *info, + my_off_t filepos, /* points at empty block */ + ulong length, /* length of block */ + my_off_t next_filepos,/* Next empty block */ + uchar **record, /* pointer to record ptr */ + ulong *reclength, /* length of *record */ + int *flag) /* *flag == 0 if header */ +{ + ulong head_length,res_length,extra_length,long_block,del_length; + uchar *pos,*record_end; + my_off_t next_delete_block; + uchar temp[MARIA_SPLIT_LENGTH+MARIA_DYN_DELETE_BLOCK_HEADER]; + DBUG_ENTER("_ma_write_part_record"); + + next_delete_block=HA_OFFSET_ERROR; + + res_length=extra_length=0; + if (length > *reclength + MARIA_SPLIT_LENGTH) + { /* Splitt big block */ + res_length=MY_ALIGN(length- *reclength - MARIA_EXTEND_BLOCK_LENGTH, + MARIA_DYN_ALIGN_SIZE); + length-= res_length; /* Use this for first part */ + } + long_block= (length < 65520L && *reclength < 
65520L) ? 0 : 1; + if (length == *reclength+ 3 + long_block) + { + /* Block is exactly of the right length */ + temp[0]=(uchar) (1+ *flag)+(uchar) long_block; /* Flag is 0 or 6 */ + if (long_block) + { + mi_int3store(temp+1,*reclength); + head_length=4; + } + else + { + mi_int2store(temp+1,*reclength); + head_length=3; + } + } + else if (length-long_block < *reclength+4) + { /* To short block */ + if (next_filepos == HA_OFFSET_ERROR) + next_filepos= (info->s->state.dellink != HA_OFFSET_ERROR && + !info->append_insert_at_end ? + info->s->state.dellink : info->state->data_file_length); + if (*flag == 0) /* First block */ + { + if (*reclength > MARIA_MAX_BLOCK_LENGTH) + { + head_length= 16; + temp[0]=13; + mi_int4store(temp+1,*reclength); + mi_int3store(temp+5,length-head_length); + mi_sizestore(temp+8,next_filepos); + } + else + { + head_length=5+8+long_block*2; + temp[0]=5+(uchar) long_block; + if (long_block) + { + mi_int3store(temp+1,*reclength); + mi_int3store(temp+4,length-head_length); + mi_sizestore(temp+7,next_filepos); + } + else + { + mi_int2store(temp+1,*reclength); + mi_int2store(temp+3,length-head_length); + mi_sizestore(temp+5,next_filepos); + } + } + } + else + { + head_length=3+8+long_block; + temp[0]=11+(uchar) long_block; + if (long_block) + { + mi_int3store(temp+1,length-head_length); + mi_sizestore(temp+4,next_filepos); + } + else + { + mi_int2store(temp+1,length-head_length); + mi_sizestore(temp+3,next_filepos); + } + } + } + else + { /* Block with empty info last */ + head_length=4+long_block; + extra_length= length- *reclength-head_length; + temp[0]= (uchar) (3+ *flag)+(uchar) long_block; /* 3,4 or 9,10 */ + if (long_block) + { + mi_int3store(temp+1,*reclength); + temp[4]= (uchar) (extra_length); + } + else + { + mi_int2store(temp+1,*reclength); + temp[3]= (uchar) (extra_length); + } + length= *reclength+head_length; /* Write only what is needed */ + } + DBUG_DUMP("header", temp, head_length); + + /* Make a long block for one write */ + 
record_end= *record+length-head_length; + del_length=(res_length ? MARIA_DYN_DELETE_BLOCK_HEADER : 0); + bmove((*record-head_length), temp, head_length); + memcpy(temp,record_end,(size_t) (extra_length+del_length)); + bzero(record_end, extra_length); + + if (res_length) + { + /* Check first if we can join this block with the next one */ + MARIA_BLOCK_INFO del_block; + my_off_t next_block=filepos+length+extra_length+res_length; + + del_block.second_read=0; + if (next_block < info->state->data_file_length && + info->s->state.dellink != HA_OFFSET_ERROR) + { + if ((_ma_get_block_info(&del_block, info->dfile.file, next_block) + & BLOCK_DELETED) && + res_length + del_block.block_len < MARIA_DYN_MAX_BLOCK_LENGTH) + { + if (unlink_deleted_block(info,&del_block)) + goto err; + res_length+=del_block.block_len; + } + } + + /* Create a delete link of the last part of the block */ + pos=record_end+extra_length; + pos[0]= '\0'; + mi_int3store(pos+1,res_length); + mi_sizestore(pos+4,info->s->state.dellink); + bfill(pos+12,8,255); /* End link */ + next_delete_block=info->s->state.dellink; + info->s->state.dellink= filepos+length+extra_length; + info->state->del++; + info->state->empty+=res_length; + info->s->state.split++; + } + if (info->opt_flag & WRITE_CACHE_USED && + info->update & HA_STATE_WRITE_AT_END) + { + if (info->update & HA_STATE_EXTEND_BLOCK) + { + info->update&= ~HA_STATE_EXTEND_BLOCK; + if (my_block_write(&info->rec_cache, *record-head_length, + length+extra_length+del_length,filepos)) + goto err; + } + else if (my_b_write(&info->rec_cache, *record-head_length, + length+extra_length+del_length)) + goto err; + } + else + { + info->rec_cache.seek_not_done=1; + if (info->s->file_write(info, *record-head_length, + length+extra_length+ + del_length,filepos,info->s->write_flag)) + goto err; + } + memcpy(record_end,temp,(size_t) (extra_length+del_length)); + *record=record_end; + *reclength-=(length-head_length); + *flag=6; + + if (del_length) + { + /* link the next delete 
block to this */ + if (update_backward_delete_link(info, next_delete_block, + info->s->state.dellink)) + goto err; + } + + DBUG_RETURN(0); +err: + DBUG_PRINT("exit",("errno: %d",my_errno)); + DBUG_RETURN(1); +} /* _ma_write_part_record */ + + + /* update record from datafile */ + +static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos, + uchar *record, ulong reclength) +{ + int flag; + uint error; + ulong length; + MARIA_BLOCK_INFO block_info; + DBUG_ENTER("update_dynamic_record"); + + flag=block_info.second_read=0; + /* + Check if we have enough room for the record. + First we do simplified check to make usual case faster. + Then we do more precise check for the space left. + Though it still is not absolutely precise, as + we always use MARIA_MAX_DYN_BLOCK_HEADER while it can be + less in the most of the cases. + */ + + /* + compare with just the reclength as we're going + to get some space from the old replaced record + */ + if (unlikely(info->s->base.max_data_file_length - + info->state->data_file_length < reclength)) + { + /* If new record isn't longer, we can go on safely */ + if (info->cur_row.total_length < reclength) + { + if (info->s->base.max_data_file_length - info->state->data_file_length + + info->state->empty - info->state->del * MARIA_MAX_DYN_BLOCK_HEADER < + reclength - info->cur_row.total_length + MARIA_MAX_DYN_BLOCK_HEADER) + { + my_errno=HA_ERR_RECORD_FILE_FULL; + goto err; + } + } + } + /* Remember length for updated row if it's updated again */ + info->cur_row.total_length= reclength; + + while (reclength > 0) + { + if (filepos != info->s->state.dellink) + { + block_info.next_filepos= HA_OFFSET_ERROR; + if ((error= _ma_get_block_info(&block_info, info->dfile.file, filepos)) + & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + DBUG_PRINT("error",("Got wrong block info")); + if (!(error & BLOCK_FATAL_ERROR)) + my_errno=HA_ERR_WRONG_IN_RECORD; + goto err; + } + length=(ulong) 
(block_info.filepos-filepos) + block_info.block_len; + if (length < reclength) + { + uint tmp=MY_ALIGN(reclength - length + 3 + + test(reclength >= 65520L),MARIA_DYN_ALIGN_SIZE); + /* Don't create a block bigger than MARIA_MAX_BLOCK_LENGTH */ + tmp= min(length+tmp, MARIA_MAX_BLOCK_LENGTH)-length; + /* Check if we can extend this block */ + if (block_info.filepos + block_info.block_len == + info->state->data_file_length && + info->state->data_file_length < + info->s->base.max_data_file_length-tmp) + { + /* extend file */ + DBUG_PRINT("info",("Extending file with %d bytes",tmp)); + if (info->cur_row.nextpos == info->state->data_file_length) + info->cur_row.nextpos+= tmp; + info->state->data_file_length+= tmp; + info->update|= HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK; + length+=tmp; + } + else if (length < MARIA_MAX_BLOCK_LENGTH - MARIA_MIN_BLOCK_LENGTH) + { + /* + Check if next block is a deleted block + Above we have MARIA_MIN_BLOCK_LENGTH to avoid the problem where + the next block is so small it can't be splited which could + casue problems + */ + + MARIA_BLOCK_INFO del_block; + del_block.second_read=0; + if (_ma_get_block_info(&del_block, info->dfile.file, + block_info.filepos + block_info.block_len) & + BLOCK_DELETED) + { + /* Use; Unlink it and extend the current block */ + DBUG_PRINT("info",("Extending current block")); + if (unlink_deleted_block(info,&del_block)) + goto err; + if ((length+=del_block.block_len) > MARIA_MAX_BLOCK_LENGTH) + { + /* + New block was too big, link overflow part back to + delete list + */ + my_off_t next_pos; + ulong rest_length= length-MARIA_MAX_BLOCK_LENGTH; + set_if_bigger(rest_length, MARIA_MIN_BLOCK_LENGTH); + next_pos= del_block.filepos+ del_block.block_len - rest_length; + + if (update_backward_delete_link(info, info->s->state.dellink, + next_pos)) + DBUG_RETURN(1); + + /* create delete link for data that didn't fit into the page */ + del_block.header[0]=0; + mi_int3store(del_block.header+1, rest_length); + 
mi_sizestore(del_block.header+4,info->s->state.dellink); + bfill(del_block.header+12,8,255); + if (info->s->file_write(info, del_block.header, 20, + next_pos, MYF(MY_NABP))) + DBUG_RETURN(1); + info->s->state.dellink= next_pos; + info->s->state.split++; + info->state->del++; + info->state->empty+= rest_length; + length-= rest_length; + } + } + } + } + } + else + { + if (_ma_find_writepos(info,reclength,&filepos,&length)) + goto err; + } + if (_ma_write_part_record(info,filepos,length,block_info.next_filepos, + &record,&reclength,&flag)) + goto err; + if ((filepos=block_info.next_filepos) == HA_OFFSET_ERROR) + { + /* Start writing data on deleted blocks */ + filepos=info->s->state.dellink; + } + } + + if (block_info.next_filepos != HA_OFFSET_ERROR) + if (delete_dynamic_record(info,block_info.next_filepos,1)) + goto err; + + DBUG_RETURN(0); +err: + DBUG_RETURN(1); +} + + + /* Pack a record. Return new reclength */ + +uint _ma_rec_pack(MARIA_HA *info, register uchar *to, + register const uchar *from) +{ + uint length,new_length,flag,bit,i; + const uchar *pos,*end; + uchar *startpos,*packpos; + enum en_fieldtype type; + reg3 MARIA_COLUMNDEF *column; + MARIA_BLOB *blob; + DBUG_ENTER("_ma_rec_pack"); + + flag= 0; + bit= 1; + startpos= packpos=to; + to+= info->s->base.pack_bytes; + blob= info->blobs; + column= info->s->columndef; + if (info->s->base.null_bytes) + { + memcpy(to, from, info->s->base.null_bytes); + from+= info->s->base.null_bytes; + to+= info->s->base.null_bytes; + } + + for (i=info->s->base.fields ; i-- > 0; from+= length, column++) + { + length=(uint) column->length; + if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL) + { + if (type == FIELD_BLOB) + { + if (!blob->length) + flag|=bit; + else + { + char *temp_pos; + size_t tmp_length=length-portable_sizeof_char_ptr; + memcpy(to,from,tmp_length); + memcpy_fixed(&temp_pos,from+tmp_length,sizeof(char*)); + memcpy(to+tmp_length,temp_pos,(size_t) blob->length); + to+=tmp_length+blob->length; + } + 
blob++; + } + else if (type == FIELD_SKIP_ZERO) + { + if (memcmp(from, maria_zero_string, length) == 0) + flag|=bit; + else + { + memcpy(to, from, (size_t) length); + to+=length; + } + } + else if (type == FIELD_SKIP_ENDSPACE || + type == FIELD_SKIP_PRESPACE) + { + pos= from; end= from + length; + if (type == FIELD_SKIP_ENDSPACE) + { /* Pack trailing spaces */ + while (end > from && *(end-1) == ' ') + end--; + } + else + { /* Pack pref-spaces */ + while (pos < end && *pos == ' ') + pos++; + } + new_length=(uint) (end-pos); + if (new_length +1 + test(column->length > 255 && new_length > 127) + < length) + { + if (column->length > 255 && new_length > 127) + { + to[0]= (uchar) ((new_length & 127) + 128); + to[1]= (uchar) (new_length >> 7); + to+=2; + } + else + *to++= (uchar) new_length; + memcpy(to, pos, (size_t) new_length); to+=new_length; + flag|=bit; + } + else + { + memcpy(to,from,(size_t) length); to+=length; + } + } + else if (type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(column->length -1); + uint tmp_length; + if (pack_length == 1) + { + tmp_length= (uint) *from; + *to++= *from; + } + else + { + tmp_length= uint2korr(from); + store_key_length_inc(to,tmp_length); + } + memcpy(to, from+pack_length,tmp_length); + to+= tmp_length; + continue; + } + else + { + memcpy(to,from,(size_t) length); to+=length; + continue; /* Normal field */ + } + if ((bit= bit << 1) >= 256) + { + *packpos++ = (uchar) flag; + bit=1; flag=0; + } + } + else + { + memcpy(to,from,(size_t) length); to+=length; + } + } + if (bit != 1) + *packpos= (uchar) flag; + if (info->s->calc_checksum) + *to++= (uchar) info->cur_row.checksum; + DBUG_PRINT("exit",("packed length: %d",(int) (to-startpos))); + DBUG_RETURN((uint) (to-startpos)); +} /* _ma_rec_pack */ + + + +/* + Check if a record was correctly packed. Used only by maria_chk + Returns 0 if record is ok. 
+*/ + +my_bool _ma_rec_check(MARIA_HA *info,const uchar *record, uchar *rec_buff, + ulong packed_length, my_bool with_checksum, + ha_checksum checksum) +{ + uint length,new_length,flag,bit,i; + const uchar *pos,*end; + uchar *packpos,*to; + enum en_fieldtype type; + reg3 MARIA_COLUMNDEF *column; + DBUG_ENTER("_ma_rec_check"); + + packpos=rec_buff; to= rec_buff+info->s->base.pack_bytes; + column= info->s->columndef; + flag= *packpos; bit=1; + record+= info->s->base.null_bytes; + to+= info->s->base.null_bytes; + + for (i=info->s->base.fields ; i-- > 0; record+= length, column++) + { + length=(uint) column->length; + if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL) + { + if (type == FIELD_BLOB) + { + uint blob_length= + _ma_calc_blob_length(length-portable_sizeof_char_ptr,record); + if (!blob_length && !(flag & bit)) + goto err; + if (blob_length) + to+=length - portable_sizeof_char_ptr+ blob_length; + } + else if (type == FIELD_SKIP_ZERO) + { + if (memcmp(record, maria_zero_string, length) == 0) + { + if (!(flag & bit)) + goto err; + } + else + to+=length; + } + else if (type == FIELD_SKIP_ENDSPACE || + type == FIELD_SKIP_PRESPACE) + { + pos= record; end= record + length; + if (type == FIELD_SKIP_ENDSPACE) + { /* Pack trailing spaces */ + while (end > record && *(end-1) == ' ') + end--; + } + else + { /* Pack pre-spaces */ + while (pos < end && *pos == ' ') + pos++; + } + new_length=(uint) (end-pos); + if (new_length +1 + test(column->length > 255 && new_length > 127) + < length) + { + if (!(flag & bit)) + goto err; + if (column->length > 255 && new_length > 127) + { + /* purecov: begin inspected */ + if (to[0] != (uchar) ((new_length & 127) + 128) || + to[1] != (uchar) (new_length >> 7)) + goto err; + to+=2; + /* purecov: end */ + } + else if (*to++ != (uchar) new_length) + goto err; + to+=new_length; + } + else + to+=length; + } + else if (type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(column->length -1); + uint tmp_length; + if 
(pack_length == 1) + { + tmp_length= (uint) *record; + to+= 1+ tmp_length; + continue; + } + else + { + tmp_length= uint2korr(record); + to+= get_pack_length(tmp_length)+tmp_length; + } + continue; + } + else + { + to+=length; + continue; /* Normal field */ + } + if ((bit= bit << 1) >= 256) + { + flag= *++packpos; + bit=1; + } + } + else + to+= length; + } + if (packed_length != (uint) (to - rec_buff) + + test(info->s->calc_checksum) || (bit != 1 && (flag & ~(bit - 1)))) + goto err; + if (with_checksum && ((uchar) checksum != (uchar) *to)) + { + DBUG_PRINT("error",("wrong checksum for row")); + goto err; + } + DBUG_RETURN(0); + +err: + DBUG_RETURN(1); +} + + +/* + @brief Unpacks a record + + @return Recordlength + @retval >0 ok + @retval MY_FILE_ERROR (== -1) Error. + my_errno is set to HA_ERR_WRONG_IN_RECORD +*/ + +ulong _ma_rec_unpack(register MARIA_HA *info, register uchar *to, uchar *from, + ulong found_length) +{ + uint flag,bit,length,min_pack_length, column_length; + enum en_fieldtype type; + uchar *from_end,*to_end,*packpos; + reg3 MARIA_COLUMNDEF *column, *end_column; + DBUG_ENTER("_ma_rec_unpack"); + + to_end=to + info->s->base.reclength; + from_end=from+found_length; + flag= (uchar) *from; bit=1; packpos=from; + if (found_length < info->s->base.min_pack_length) + goto err; + from+= info->s->base.pack_bytes; + min_pack_length= info->s->base.min_pack_length - info->s->base.pack_bytes; + + if ((length= info->s->base.null_bytes)) + { + memcpy(to, from, length); + from+= length; + to+= length; + min_pack_length-= length; + } + + for (column= info->s->columndef, end_column= column + info->s->base.fields; + column < end_column ; to+= column_length, column++) + { + column_length= column->length; + if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL && + (type != FIELD_CHECK)) + { + if (type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(column_length-1); + if (pack_length == 1) + { + length= (uint) *(uchar*) from; + if (length > 
column_length-1) + goto err; + *to= *from++; + } + else + { + get_key_length(length, from); + if (length > column_length-2) + goto err; + int2store(to,length); + } + if (from+length > from_end) + goto err; + memcpy(to+pack_length, from, length); + from+= length; + min_pack_length--; + continue; + } + if (flag & bit) + { + if (type == FIELD_BLOB || type == FIELD_SKIP_ZERO) + bzero(to, column_length); + else if (type == FIELD_SKIP_ENDSPACE || + type == FIELD_SKIP_PRESPACE) + { + if (column->length > 255 && *from & 128) + { + if (from + 1 >= from_end) + goto err; + length= (*from & 127)+ ((uint) (uchar) *(from+1) << 7); from+=2; + } + else + { + if (from == from_end) + goto err; + length= (uchar) *from++; + } + min_pack_length--; + if (length >= column_length || + min_pack_length + length > (uint) (from_end - from)) + goto err; + if (type == FIELD_SKIP_ENDSPACE) + { + memcpy(to, from, (size_t) length); + bfill(to+length, column_length-length, ' '); + } + else + { + bfill(to, column_length-length, ' '); + memcpy(to+column_length-length, from, (size_t) length); + } + from+=length; + } + } + else if (type == FIELD_BLOB) + { + uint size_length=column_length- portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(size_length,from); + ulong from_left= (ulong) (from_end - from); + if (from_left < size_length || + from_left - size_length < blob_length || + from_left - size_length - blob_length < min_pack_length) + goto err; + memcpy(to, from, (size_t) size_length); + from+=size_length; + memcpy_fixed(to+size_length,(uchar*) &from,sizeof(char*)); + from+=blob_length; + } + else + { + if (type == FIELD_SKIP_ENDSPACE || type == FIELD_SKIP_PRESPACE) + min_pack_length--; + if (min_pack_length + column_length > (uint) (from_end - from)) + goto err; + memcpy(to, from, (size_t) column_length); from+=column_length; + } + if ((bit= bit << 1) >= 256) + { + flag= (uchar) *++packpos; bit=1; + } + } + else + { + if (min_pack_length > (uint) (from_end - from)) + goto err; + 
min_pack_length-=column_length; + memcpy(to, from, (size_t) column_length); + from+=column_length; + } + } + if (info->s->calc_checksum) + info->cur_row.checksum= (uint) (uchar) *from++; + if (to == to_end && from == from_end && (bit == 1 || !(flag & ~(bit-1)))) + DBUG_RETURN(found_length); + +err: + my_errno= HA_ERR_WRONG_IN_RECORD; + DBUG_PRINT("error",("to_end: 0x%lx -> 0x%lx from_end: 0x%lx -> 0x%lx", + (long) to, (long) to_end, (long) from, (long) from_end)); + DBUG_DUMP("from", info->rec_buff, info->s->base.min_pack_length); + DBUG_RETURN(MY_FILE_ERROR); +} /* _ma_rec_unpack */ + + + /* Calc length of blob. Update info in blobs->length */ + +ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record) +{ + ulong length; + MARIA_BLOB *blob,*end; + + for (length=0, blob= info->blobs, end=blob+info->s->base.blobs ; + blob != end; + blob++) + { + blob->length= _ma_calc_blob_length(blob->pack_length, + record + blob->offset); + length+=blob->length; + } + return length; +} + + +ulong _ma_calc_blob_length(uint length, const uchar *pos) +{ + switch (length) { + case 1: + return (uint) (uchar) *pos; + case 2: + return (uint) uint2korr(pos); + case 3: + return uint3korr(pos); + case 4: + return uint4korr(pos); + default: + break; + } + return 0; /* Impossible */ +} + + +void _ma_store_blob_length(uchar *pos,uint pack_length,uint length) +{ + switch (pack_length) { + case 1: + *pos= (uchar) length; + break; + case 2: + int2store(pos,length); + break; + case 3: + int3store(pos,length); + break; + case 4: + int4store(pos,length); + default: + break; + } + return; +} + + +/* + Read record from datafile. + + SYNOPSIS + _ma_read_dynamic_record() + info MARIA_HA pointer to table. + filepos From where to read the record. + buf Destination for record. + + NOTE + If a write buffer is active, it needs to be flushed if its contents + intersects with the record to read. 
We always check if the position + of the first uchar of the write buffer is lower than the position + past the last uchar to read. In theory this is also true if the write + buffer is completely below the read segment. That is, if there is no + intersection. But this case is unusual. We flush anyway. Only if the + first uchar in the write buffer is above the last uchar to read, we do + not flush. + + A dynamic record may need several reads. So this check must be done + before every read. Reading a dynamic record starts with reading the + block header. If the record does not fit into the free space of the + header, the block may be longer than the header. In this case a + second read is necessary. These one or two reads repeat for every + part of the record. + + RETURN + 0 OK + # Error number +*/ + +int _ma_read_dynamic_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos) +{ + int block_of_record; + uint b_type; + MARIA_BLOCK_INFO block_info; + File file; + uchar *to; + uint left_length; + DBUG_ENTER("_ma_read_dynamic_record"); + + if (filepos == HA_OFFSET_ERROR) + goto err; + + LINT_INIT(to); + LINT_INIT(left_length); + file= info->dfile.file; + block_of_record= 0; /* First block of record is numbered as zero. */ + block_info.second_read= 0; + do + { + /* A corrupted table can have wrong pointers. 
(Bug# 19835) */ + if (filepos == HA_OFFSET_ERROR) + goto panic; + if (info->opt_flag & WRITE_CACHE_USED && + (info->rec_cache.pos_in_file < filepos + + MARIA_BLOCK_INFO_HEADER_LENGTH) && + flush_io_cache(&info->rec_cache)) + goto err; + info->rec_cache.seek_not_done=1; + if ((b_type= _ma_get_block_info(&block_info, file, filepos)) & + (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED)) + my_errno=HA_ERR_RECORD_DELETED; + goto err; + } + if (block_of_record++ == 0) /* First block */ + { + info->cur_row.total_length= block_info.rec_len; + if (block_info.rec_len > (uint) info->s->base.max_pack_length) + goto panic; + if (info->s->base.blobs) + { + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + block_info.rec_len + + info->s->base.extra_rec_buff_size)) + goto err; + } + to= info->rec_buff; + left_length=block_info.rec_len; + } + if (left_length < block_info.data_len || ! block_info.data_len) + goto panic; /* Wrong linked record */ + /* copy information that is already read */ + { + uint offset= (uint) (block_info.filepos - filepos); + uint prefetch_len= (sizeof(block_info.header) - offset); + filepos+= sizeof(block_info.header); + + if (prefetch_len > block_info.data_len) + prefetch_len= block_info.data_len; + if (prefetch_len) + { + memcpy(to, block_info.header + offset, prefetch_len); + block_info.data_len-= prefetch_len; + left_length-= prefetch_len; + to+= prefetch_len; + } + } + /* read rest of record from file */ + if (block_info.data_len) + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file < filepos + block_info.data_len && + flush_io_cache(&info->rec_cache)) + goto err; + /* + What a pity that this method is not called 'file_pread' and that + there is no equivalent without seeking. We are at the right + position already. 
:( + */ + if (info->s->file_read(info, to, block_info.data_len, + filepos, MYF(MY_NABP))) + goto panic; + left_length-=block_info.data_len; + to+=block_info.data_len; + } + filepos= block_info.next_filepos; + } while (left_length); + + info->update|= HA_STATE_AKTIV; /* We have a aktive record */ + fast_ma_writeinfo(info); + DBUG_RETURN(_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) != + MY_FILE_ERROR ? 0 : my_errno); + +err: + fast_ma_writeinfo(info); + DBUG_RETURN(my_errno); + +panic: + my_errno=HA_ERR_WRONG_IN_RECORD; + goto err; +} + + /* compare unique constraint between stored rows */ + +my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos) +{ + uchar *old_rec_buff,*old_record; + size_t old_rec_buff_size; + my_bool error; + DBUG_ENTER("_ma_cmp_dynamic_unique"); + + if (!(old_record=my_alloca(info->s->base.reclength))) + DBUG_RETURN(1); + + /* Don't let the compare destroy blobs that may be in use */ + old_rec_buff= info->rec_buff; + old_rec_buff_size= info->rec_buff_size; + + if (info->s->base.blobs) + { + info->rec_buff= 0; + info->rec_buff_size= 0; + } + error= _ma_read_dynamic_record(info, old_record, pos) != 0; + if (!error) + error=_ma_unique_comp(def, record, old_record, def->null_are_equal) != 0; + if (info->s->base.blobs) + { + my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + info->rec_buff= old_rec_buff; + info->rec_buff_size= old_rec_buff_size; + } + my_afree(old_record); + DBUG_RETURN(error); +} + + + /* Compare of record on disk with packed record in memory */ + +my_bool _ma_cmp_dynamic_record(register MARIA_HA *info, + register const uchar *record) +{ + uint flag, reclength, b_type,cmp_length; + my_off_t filepos; + uchar *buffer; + MARIA_BLOCK_INFO block_info; + my_bool error= 1; + DBUG_ENTER("_ma_cmp_dynamic_record"); + + /* We are going to do changes; dont let anybody disturb */ + dont_break(); /* Dont allow SIGHUP or SIGINT */ + + if (info->opt_flag & WRITE_CACHE_USED) + 
{ + info->update&= ~(HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK); + if (flush_io_cache(&info->rec_cache)) + DBUG_RETURN(1); + } + info->rec_cache.seek_not_done=1; + + /* If nobody have touched the database we don't have to test rec */ + + buffer=info->rec_buff; + if ((info->opt_flag & READ_CHECK_USED)) + { /* If check isn't disabled */ + if (info->s->base.blobs) + { + if (!(buffer=(uchar*) my_alloca(info->s->base.pack_reclength+ + _ma_calc_total_blob_length(info,record)))) + DBUG_RETURN(1); + } + reclength= _ma_rec_pack(info,buffer,record); + record= buffer; + + filepos= info->cur_row.lastpos; + flag=block_info.second_read=0; + block_info.next_filepos=filepos; + while (reclength > 0) + { + if ((b_type= _ma_get_block_info(&block_info, info->dfile.file, + block_info.next_filepos)) + & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED)) + my_errno=HA_ERR_RECORD_CHANGED; + goto err; + } + if (flag == 0) /* First block */ + { + flag=1; + if (reclength != block_info.rec_len) + { + my_errno=HA_ERR_RECORD_CHANGED; + goto err; + } + } else if (reclength < block_info.data_len) + { + my_errno=HA_ERR_WRONG_IN_RECORD; + goto err; + } + reclength-= block_info.data_len; + cmp_length= block_info.data_len; + if (!reclength && info->s->calc_checksum) + cmp_length--; /* 'record' may not contain checksum */ + + if (_ma_cmp_buffer(info->dfile.file, record, block_info.filepos, + cmp_length)) + { + my_errno=HA_ERR_RECORD_CHANGED; + goto err; + } + flag=1; + record+=block_info.data_len; + } + } + my_errno=0; + error= 0; +err: + if (buffer != info->rec_buff) + my_afree(buffer); + DBUG_PRINT("exit", ("result: %d", error)); + DBUG_RETURN(error); +} + + + /* Compare file to buffert */ + +static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos, + uint length) +{ + uint next_length; + uchar temp_buff[IO_SIZE*2]; + DBUG_ENTER("_ma_cmp_buffer"); + + next_length= IO_SIZE*2 - (uint) (filepos & 
(IO_SIZE-1)); + + while (length > IO_SIZE*2) + { + if (my_pread(file,temp_buff,next_length,filepos, MYF(MY_NABP)) || + memcmp(buff, temp_buff, next_length)) + goto err; + filepos+=next_length; + buff+=next_length; + length-= next_length; + next_length=IO_SIZE*2; + } + if (my_pread(file,temp_buff,length,filepos,MYF(MY_NABP))) + goto err; + DBUG_RETURN(memcmp(buff, temp_buff, length) != 0); +err: + DBUG_RETURN(1); +} + + +/* + Read next record from datafile during table scan. + + SYNOPSIS + _ma_read_rnd_dynamic_record() + info MARIA_HA pointer to table. + buf Destination for record. + filepos From where to read the record. + skip_deleted_blocks If to repeat reading until a non-deleted + record is found. + + NOTE + This is identical to _ma_read_dynamic_record(), except the following + cases: + + - If there is no active row at 'filepos', continue scanning for + an active row. (This is becasue the previous + _ma_read_rnd_dynamic_record() call stored the next block position + in filepos, but this position may not be a start block for a row + - We may have READ_CACHING enabled, in which case we use the cache + to read rows. + + For other comments, check _ma_read_dynamic_record() + + RETURN + 0 OK + != 0 Error number +*/ + +int _ma_read_rnd_dynamic_record(MARIA_HA *info, + uchar *buf, + MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks) +{ + int block_of_record, info_read; + uint left_len,b_type; + uchar *to; + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_read_rnd_dynamic_record"); + + info_read=0; + LINT_INIT(to); + + if (info->lock_type == F_UNLCK) + { +#ifndef UNSAFE_LOCKING +#else + info->tmp_lock_type=F_RDLCK; +#endif + } + else + info_read=1; /* memory-keyinfoblock is ok */ + + block_of_record= 0; /* First block of record is numbered as zero. 
*/ + block_info.second_read= 0; + left_len=1; + do + { + if (filepos >= info->state->data_file_length) + { + if (!info_read) + { /* Check if changed */ + info_read=1; + info->rec_cache.seek_not_done=1; + if (_ma_state_info_read_dsk(share->kfile.file, &share->state)) + goto panic; + } + if (filepos >= info->state->data_file_length) + { + my_errno= HA_ERR_END_OF_FILE; + goto err; + } + } + if (info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(&info->rec_cache, block_info.header, filepos, + sizeof(block_info.header), + (!block_of_record && skip_deleted_blocks ? + READING_NEXT : 0) | READING_HEADER)) + goto panic; + b_type= _ma_get_block_info(&block_info,-1,filepos); + } + else + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file < filepos + MARIA_BLOCK_INFO_HEADER_LENGTH && + flush_io_cache(&info->rec_cache)) + DBUG_RETURN(my_errno); + info->rec_cache.seek_not_done=1; + b_type= _ma_get_block_info(&block_info, info->dfile.file, filepos); + } + + if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if ((b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR)) + && skip_deleted_blocks) + { + filepos=block_info.filepos+block_info.block_len; + block_info.second_read=0; + continue; /* Search after next_record */ + } + if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR)) + { + my_errno= HA_ERR_RECORD_DELETED; + info->cur_row.lastpos= block_info.filepos; + info->cur_row.nextpos= block_info.filepos+block_info.block_len; + } + goto err; + } + if (block_of_record == 0) /* First block */ + { + info->cur_row.total_length= block_info.rec_len; + if (block_info.rec_len > (uint) share->base.max_pack_length) + goto panic; + info->cur_row.lastpos= filepos; + if (share->base.blobs) + { + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + block_info.rec_len + + info->s->base.extra_rec_buff_size)) + goto err; + } + to= info->rec_buff; + left_len=block_info.rec_len; + } + if (left_len < block_info.data_len) + goto panic; /* 
Wrong linked record */ + + /* copy information that is already read */ + { + uint offset=(uint) (block_info.filepos - filepos); + uint tmp_length= (sizeof(block_info.header) - offset); + filepos=block_info.filepos; + + if (tmp_length > block_info.data_len) + tmp_length= block_info.data_len; + if (tmp_length) + { + memcpy(to, block_info.header+offset, tmp_length); + block_info.data_len-=tmp_length; + left_len-=tmp_length; + to+=tmp_length; + filepos+=tmp_length; + } + } + /* read rest of record from file */ + if (block_info.data_len) + { + if (info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(&info->rec_cache, to,filepos, + block_info.data_len, + (!block_of_record && skip_deleted_blocks) ? + READING_NEXT : 0)) + goto panic; + } + else + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file < + block_info.filepos + block_info.data_len && + flush_io_cache(&info->rec_cache)) + goto err; + /* VOID(my_seek(info->dfile.file, filepos, MY_SEEK_SET, MYF(0))); */ + if (my_read(info->dfile.file, to, block_info.data_len, MYF(MY_NABP))) + { + if (my_errno == HA_ERR_FILE_TOO_SHORT) + my_errno= HA_ERR_WRONG_IN_RECORD; /* Unexpected end of file */ + goto err; + } + } + } + /* + Increment block-of-record counter. If it was the first block, + remember the position behind the block for the next call. 
+ */ + if (block_of_record++ == 0) + { + info->cur_row.nextpos= block_info.filepos+block_info.block_len; + skip_deleted_blocks=0; + } + left_len-=block_info.data_len; + to+=block_info.data_len; + filepos=block_info.next_filepos; + } while (left_len); + + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + fast_ma_writeinfo(info); + if (_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) != + MY_FILE_ERROR) + DBUG_RETURN(0); + DBUG_RETURN(my_errno); /* Wrong record */ + +panic: + my_errno=HA_ERR_WRONG_IN_RECORD; /* Something is fatal wrong */ +err: + fast_ma_writeinfo(info); + DBUG_RETURN(my_errno); +} + + + /* Read and process header from a dynamic-record-file */ + +uint _ma_get_block_info(MARIA_BLOCK_INFO *info, File file, my_off_t filepos) +{ + uint return_val=0; + uchar *header=info->header; + + if (file >= 0) + { + /* + We do not use my_pread() here because we want to have the file + pointer set to the end of the header after this function. + my_pread() may leave the file pointer untouched. 
+ */ + VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0))); + if (my_read(file, header, sizeof(info->header),MYF(0)) != + sizeof(info->header)) + goto err; + } + DBUG_DUMP("header",header,MARIA_BLOCK_INFO_HEADER_LENGTH); + if (info->second_read) + { + if (info->header[0] <= 6 || info->header[0] == 13) + return_val=BLOCK_SYNC_ERROR; + } + else + { + if (info->header[0] > 6 && info->header[0] != 13) + return_val=BLOCK_SYNC_ERROR; + } + info->next_filepos= HA_OFFSET_ERROR; /* Dummy if no next block */ + + switch (info->header[0]) { + case 0: + if ((info->block_len=(uint) mi_uint3korr(header+1)) < + MARIA_MIN_BLOCK_LENGTH || + (info->block_len & (MARIA_DYN_ALIGN_SIZE -1))) + goto err; + info->filepos=filepos; + info->next_filepos=mi_sizekorr(header+4); + info->prev_filepos=mi_sizekorr(header+12); +#if SIZEOF_OFF_T == 4 + if ((mi_uint4korr(header+4) != 0 && + (mi_uint4korr(header+4) != (ulong) ~0 || + info->next_filepos != (ulong) ~0)) || + (mi_uint4korr(header+12) != 0 && + (mi_uint4korr(header+12) != (ulong) ~0 || + info->prev_filepos != (ulong) ~0))) + goto err; +#endif + return return_val | BLOCK_DELETED; /* Deleted block */ + + case 1: + info->rec_len=info->data_len=info->block_len=mi_uint2korr(header+1); + info->filepos=filepos+3; + return return_val | BLOCK_FIRST | BLOCK_LAST; + case 2: + info->rec_len=info->data_len=info->block_len=mi_uint3korr(header+1); + info->filepos=filepos+4; + return return_val | BLOCK_FIRST | BLOCK_LAST; + + case 13: + info->rec_len=mi_uint4korr(header+1); + info->block_len=info->data_len=mi_uint3korr(header+5); + info->next_filepos=mi_sizekorr(header+8); + info->second_read=1; + info->filepos=filepos+16; + return return_val | BLOCK_FIRST; + + case 3: + info->rec_len=info->data_len=mi_uint2korr(header+1); + info->block_len=info->rec_len+ (uint) header[3]; + info->filepos=filepos+4; + return return_val | BLOCK_FIRST | BLOCK_LAST; + case 4: + info->rec_len=info->data_len=mi_uint3korr(header+1); + info->block_len=info->rec_len+ (uint) header[4]; 
+ info->filepos=filepos+5; + return return_val | BLOCK_FIRST | BLOCK_LAST; + + case 5: + info->rec_len=mi_uint2korr(header+1); + info->block_len=info->data_len=mi_uint2korr(header+3); + info->next_filepos=mi_sizekorr(header+5); + info->second_read=1; + info->filepos=filepos+13; + return return_val | BLOCK_FIRST; + case 6: + info->rec_len=mi_uint3korr(header+1); + info->block_len=info->data_len=mi_uint3korr(header+4); + info->next_filepos=mi_sizekorr(header+7); + info->second_read=1; + info->filepos=filepos+15; + return return_val | BLOCK_FIRST; + + /* The following blocks are identical to 1-6 without rec_len */ + case 7: + info->data_len=info->block_len=mi_uint2korr(header+1); + info->filepos=filepos+3; + return return_val | BLOCK_LAST; + case 8: + info->data_len=info->block_len=mi_uint3korr(header+1); + info->filepos=filepos+4; + return return_val | BLOCK_LAST; + + case 9: + info->data_len=mi_uint2korr(header+1); + info->block_len=info->data_len+ (uint) header[3]; + info->filepos=filepos+4; + return return_val | BLOCK_LAST; + case 10: + info->data_len=mi_uint3korr(header+1); + info->block_len=info->data_len+ (uint) header[4]; + info->filepos=filepos+5; + return return_val | BLOCK_LAST; + + case 11: + info->data_len=info->block_len=mi_uint2korr(header+1); + info->next_filepos=mi_sizekorr(header+3); + info->second_read=1; + info->filepos=filepos+11; + return return_val; + case 12: + info->data_len=info->block_len=mi_uint3korr(header+1); + info->next_filepos=mi_sizekorr(header+4); + info->second_read=1; + info->filepos=filepos+12; + return return_val; + } + +err: + my_errno=HA_ERR_WRONG_IN_RECORD; /* Garbage */ + return BLOCK_ERROR; +} diff --git a/storage/maria/ma_extra.c b/storage/maria/ma_extra.c new file mode 100644 index 00000000000..7a30b613ea5 --- /dev/null +++ b/storage/maria/ma_extra.c @@ -0,0 +1,637 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under 
the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif +#include "ma_blockrec.h" + +static void maria_extra_keyflag(MARIA_HA *info, + enum ha_extra_function function); + +/** + @brief Set options and buffers to optimize table handling + + @param name table's name + @param info open table + @param function operation + @param extra_arg Pointer to extra argument (normally pointer to + ulong); used when function is one of: + HA_EXTRA_WRITE_CACHE + HA_EXTRA_CACHE + + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +int maria_extra(MARIA_HA *info, enum ha_extra_function function, + void *extra_arg) +{ + int error= 0; + ulong cache_size; + MARIA_SHARE *share= info->s; + my_bool block_records= share->data_file_type == BLOCK_RECORD; + DBUG_ENTER("maria_extra"); + DBUG_PRINT("enter",("function: %d",(int) function)); + + switch (function) { + case HA_EXTRA_RESET_STATE: /* Reset state (don't free buffers) */ + info->lastinx= 0; /* Use first index as def */ + info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR; + info->page_changed= 1; + /* Next/prev gives first/last */ + if (info->opt_flag & READ_CACHE_USED) + { + reinit_io_cache(&info->rec_cache,READ_CACHE,0, + (pbool) (info->lock_type != F_UNLCK), + (pbool) test(info->update & HA_STATE_ROW_CHANGED) + ); + } + info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND | + HA_STATE_PREV_FOUND); + 
break; + case HA_EXTRA_CACHE: + if (block_records) + break; /* Not supported */ + + if (info->lock_type == F_UNLCK && + (share->options & HA_OPTION_PACK_RECORD)) + { + error= 1; /* Not possibly if not locked */ + my_errno= EACCES; + break; + } + if (info->s->file_map) /* Don't use cache if mmap */ + break; +#if defined(HAVE_MMAP) && defined(HAVE_MADVISE) + if ((share->options & HA_OPTION_COMPRESS_RECORD)) + { + pthread_mutex_lock(&share->intern_lock); + if (_ma_memmap_file(info)) + { + /* We don't nead MADV_SEQUENTIAL if small file */ + madvise((char*) share->file_map, share->state.state.data_file_length, + share->state.state.data_file_length <= RECORD_CACHE_SIZE*16 ? + MADV_RANDOM : MADV_SEQUENTIAL); + pthread_mutex_unlock(&share->intern_lock); + break; + } + pthread_mutex_unlock(&share->intern_lock); + } +#endif + if (info->opt_flag & WRITE_CACHE_USED) + { + info->opt_flag&= ~WRITE_CACHE_USED; + if ((error= end_io_cache(&info->rec_cache))) + break; + } + if (!(info->opt_flag & + (READ_CACHE_USED | WRITE_CACHE_USED | MEMMAP_USED))) + { + cache_size= (extra_arg ? 
*(ulong*) extra_arg : + my_default_record_cache_size); + if (!(init_io_cache(&info->rec_cache, info->dfile.file, + (uint) min(share->state.state.data_file_length+1, + cache_size), + READ_CACHE,0L,(pbool) (info->lock_type != F_UNLCK), + MYF(share->write_flag & MY_WAIT_IF_FULL)))) + { + info->opt_flag|= READ_CACHE_USED; + info->update&= ~HA_STATE_ROW_CHANGED; + } + if (share->non_transactional_concurrent_insert) + info->rec_cache.end_of_file= info->state->data_file_length; + } + break; + case HA_EXTRA_REINIT_CACHE: + if (info->opt_flag & READ_CACHE_USED) + { + reinit_io_cache(&info->rec_cache, READ_CACHE, info->cur_row.nextpos, + (pbool) (info->lock_type != F_UNLCK), + (pbool) test(info->update & HA_STATE_ROW_CHANGED)); + info->update&= ~HA_STATE_ROW_CHANGED; + if (share->non_transactional_concurrent_insert) + info->rec_cache.end_of_file= info->state->data_file_length; + } + break; + case HA_EXTRA_WRITE_CACHE: + if (info->lock_type == F_UNLCK) + { + error= 1; /* Not possibly if not locked */ + break; + } + if (block_records) + break; /* Not supported */ + + cache_size= (extra_arg ? 
*(ulong*) extra_arg : + my_default_record_cache_size); + if (!(info->opt_flag & + (READ_CACHE_USED | WRITE_CACHE_USED | OPT_NO_ROWS)) && + !share->state.header.uniques) + if (!(init_io_cache(&info->rec_cache, info->dfile.file, cache_size, + WRITE_CACHE,share->state.state.data_file_length, + (pbool) (info->lock_type != F_UNLCK), + MYF(share->write_flag & MY_WAIT_IF_FULL)))) + { + info->opt_flag|= WRITE_CACHE_USED; + info->update&= ~(HA_STATE_ROW_CHANGED | + HA_STATE_WRITE_AT_END | + HA_STATE_EXTEND_BLOCK); + } + break; + case HA_EXTRA_PREPARE_FOR_UPDATE: + if (info->s->data_file_type != DYNAMIC_RECORD) + break; + /* Remove read/write cache if dynamic rows */ + case HA_EXTRA_NO_CACHE: + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + error= end_io_cache(&info->rec_cache); + /* Sergei will insert full text index caching here */ + } +#if defined(HAVE_MMAP) && defined(HAVE_MADVISE) + if (info->opt_flag & MEMMAP_USED) + madvise((char*) share->file_map, share->state.state.data_file_length, + MADV_RANDOM); +#endif + break; + case HA_EXTRA_FLUSH_CACHE: + if (info->opt_flag & WRITE_CACHE_USED) + { + if ((error= flush_io_cache(&info->rec_cache))) + { + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); /* Fatal error found */ + } + } + break; + case HA_EXTRA_NO_READCHECK: + info->opt_flag&= ~READ_CHECK_USED; /* No readcheck */ + break; + case HA_EXTRA_READCHECK: + info->opt_flag|= READ_CHECK_USED; + break; + case HA_EXTRA_KEYREAD: /* Read only keys to record */ + case HA_EXTRA_REMEMBER_POS: + info->opt_flag|= REMEMBER_OLD_POS; + bmove(info->last_key.data + share->base.max_key_length*2, + info->last_key.data, + info->last_key.data_length + info->last_key.ref_length); + info->save_update= info->update; + info->save_lastinx= info->lastinx; + info->save_lastpos= info->cur_row.lastpos; + info->save_lastkey_data_length= info->last_key.data_length; + info->save_lastkey_ref_length= 
info->last_key.ref_length; + if (function == HA_EXTRA_REMEMBER_POS) + break; + /* fall through */ + case HA_EXTRA_KEYREAD_CHANGE_POS: + info->opt_flag|= KEY_READ_USED; + info->read_record= _ma_read_key_record; + break; + case HA_EXTRA_NO_KEYREAD: + case HA_EXTRA_RESTORE_POS: + if (info->opt_flag & REMEMBER_OLD_POS) + { + bmove(info->last_key.data, + info->last_key.data + share->base.max_key_length*2, + info->save_lastkey_data_length + info->save_lastkey_ref_length); + info->update= info->save_update | HA_STATE_WRITTEN; + info->lastinx= info->save_lastinx; + info->cur_row.lastpos= info->save_lastpos; + info->last_key.data_length= info->save_lastkey_data_length; + info->last_key.ref_length= info->save_lastkey_ref_length; + info->last_key.flag= 0; + } + info->read_record= share->read_record; + info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS); + break; + case HA_EXTRA_NO_USER_CHANGE: /* Database is somehow locked agains changes */ + info->lock_type= F_EXTRA_LCK; /* Simulate as locked */ + break; + case HA_EXTRA_WAIT_LOCK: + info->lock_wait= 0; + break; + case HA_EXTRA_NO_WAIT_LOCK: + info->lock_wait= MY_SHORT_WAIT; + break; + case HA_EXTRA_NO_KEYS: + /* we're going to modify pieces of the state, stall Checkpoint */ + pthread_mutex_lock(&share->intern_lock); + if (info->lock_type == F_UNLCK) + { + pthread_mutex_unlock(&share->intern_lock); + error= 1; /* Not possibly if not lock */ + break; + } + if (maria_is_any_key_active(share->state.key_map)) + { + MARIA_KEYDEF *key= share->keyinfo; + uint i; + for (i =0 ; i < share->base.keys ; i++,key++) + { + if (!(key->flag & HA_NOSAME) && info->s->base.auto_key != i+1) + { + maria_clear_key_active(share->state.key_map, i); + info->update|= HA_STATE_CHANGED; + } + } + + if (!share->changed) + { + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; + share->changed= 1; /* Update on close */ + if (!share->global_changed) + { + share->global_changed= 1; + share->state.open_count++; + } + } + if 
(!share->now_transactional) + share->state.state= *info->state; + /* + That state write to disk must be done, even for transactional tables; + indeed the table's share is going to be lost (there was a + HA_EXTRA_FORCE_REOPEN before, which set share->last_version to + 0), and so the only way it leaves information (share->state.key_map) + for the posterity is by writing it to disk. + */ + DBUG_ASSERT(!maria_in_recovery); + error= _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO); + } + pthread_mutex_unlock(&share->intern_lock); + break; + case HA_EXTRA_FORCE_REOPEN: + /* + MySQL uses this case after it has closed all other instances + of this table. + We however do a flush here for additional safety. + */ + /** @todo consider porting these flush-es to MyISAM */ + DBUG_ASSERT(share->reopen == 1); + error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE); + if (!error && share->changed) + { + pthread_mutex_lock(&share->intern_lock); + if (!(error= _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET| + MA_STATE_INFO_WRITE_FULL_INFO))) + share->changed= 0; + pthread_mutex_unlock(&share->intern_lock); + } + pthread_mutex_lock(&THR_LOCK_maria); + pthread_mutex_lock(&share->intern_lock); /* protect against Checkpoint */ + /* this makes the share not be re-used next time the table is opened */ + share->last_version= 0L; /* Impossible version */ + pthread_mutex_unlock(&share->intern_lock); + pthread_mutex_unlock(&THR_LOCK_maria); + break; + case HA_EXTRA_PREPARE_FOR_DROP: + /* Signals about intent to delete this table */ + share->deleting= TRUE; + share->global_changed= FALSE; /* force writing changed flag */ + /* To force repair if reopened */ + _ma_mark_file_changed(info); + /* Fall trough */ + case HA_EXTRA_PREPARE_FOR_RENAME: + { + my_bool do_flush= test(function != HA_EXTRA_PREPARE_FOR_DROP); + enum flush_type type; + 
pthread_mutex_lock(&THR_LOCK_maria); + /* + This share, to have last_version=0, needs to save all its data/index + blocks to disk if this is not for a DROP TABLE. Otherwise they would be + invisible to future openers; and they could even go to disk late and + cancel the work of future openers. + */ + if (info->lock_type != F_UNLCK && !info->was_locked) + { + info->was_locked= info->lock_type; + if (maria_lock_database(info, F_UNLCK)) + error= my_errno; + info->lock_type= F_UNLCK; + } + /* + We don't need to call _mi_decrement_open_count() if we are + dropping the table, as the files will be removed anyway. If we + are aborted before the files is removed, it's better to not + call it as in that case the automatic repair on open will add + the missing index entries + */ + pthread_mutex_lock(&share->intern_lock); + if (share->kfile.file >= 0 && function != HA_EXTRA_PREPARE_FOR_DROP) + _ma_decrement_open_count(info); + if (info->trn) + { + _ma_remove_table_from_trnman(share, info->trn); + /* Ensure we don't point to the deleted data in trn */ + info->state= info->state_start= &share->state.state; + } + + type= do_flush ? FLUSH_RELEASE : FLUSH_IGNORE_CHANGED; + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + type, type)) + { + error=my_errno; + share->changed= 1; + } + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + if (end_io_cache(&info->rec_cache)) + error= 1; + } + if (share->kfile.file >= 0) + { + if (do_flush) + { + /* Save the state so that others can find it from disk. 
*/ + if ((share->changed && + _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO)) || + my_sync(share->kfile.file, MYF(0))) + error= my_errno; + else + share->changed= 0; + } + else + { + /* be sure that state is not tried for write as file may be closed */ + share->changed= 0; + } + } + if (share->data_file_type == BLOCK_RECORD && + share->bitmap.file.file >= 0) + { + if (do_flush && my_sync(share->bitmap.file.file, MYF(0))) + error= my_errno; + } + /* For protection against Checkpoint, we set under intern_lock: */ + share->last_version= 0L; /* Impossible version */ + pthread_mutex_unlock(&share->intern_lock); + pthread_mutex_unlock(&THR_LOCK_maria); + break; + } + case HA_EXTRA_PREPARE_FOR_FORCED_CLOSE: + if (info->trn) + { + pthread_mutex_lock(&share->intern_lock); + _ma_remove_table_from_trnman(share, info->trn); + /* Ensure we don't point to the deleted data in trn */ + info->state= info->state_start= &share->state.state; + pthread_mutex_unlock(&share->intern_lock); + } + break; + case HA_EXTRA_FLUSH: + if (!share->temporary) + error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_KEEP, FLUSH_KEEP); +#ifdef HAVE_PWRITE + _ma_decrement_open_count(info); +#endif + if (share->not_flushed) + { + share->not_flushed= 0; + if (_ma_sync_table_files(info)) + error= my_errno; + if (error) + { + share->changed= 1; + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); /* Fatal error found */ + } + } + break; + case HA_EXTRA_NORMAL: /* Theese isn't in use */ + info->quick_mode= 0; + break; + case HA_EXTRA_QUICK: + info->quick_mode= 1; + break; + case HA_EXTRA_NO_ROWS: + if (!share->state.header.uniques) + info->opt_flag|= OPT_NO_ROWS; + break; + case HA_EXTRA_PRELOAD_BUFFER_SIZE: + info->preload_buff_size= *((ulong *) extra_arg); + break; + case HA_EXTRA_CHANGE_KEY_TO_UNIQUE: + case HA_EXTRA_CHANGE_KEY_TO_DUP: + maria_extra_keyflag(info, function); + break; + case 
HA_EXTRA_MMAP: +#ifdef HAVE_MMAP + if (block_records) + break; /* Not supported */ + pthread_mutex_lock(&share->intern_lock); + /* + Memory map the data file if it is not already mapped. It is safe + to memory map a file while other threads are using file I/O on it. + Assigning a new address to a function pointer is an atomic + operation. intern_lock prevents that two or more mappings are done + at the same time. + */ + if (!share->file_map) + { + if (_ma_dynmap_file(info, share->state.state.data_file_length)) + { + DBUG_PRINT("warning",("mmap failed: errno: %d",errno)); + error= my_errno= errno; + } + else + { + share->file_read= _ma_mmap_pread; + share->file_write= _ma_mmap_pwrite; + } + } + pthread_mutex_unlock(&share->intern_lock); +#endif + break; + case HA_EXTRA_MARK_AS_LOG_TABLE: + pthread_mutex_lock(&share->intern_lock); + share->is_log_table= TRUE; + pthread_mutex_unlock(&share->intern_lock); + break; + case HA_EXTRA_KEY_CACHE: + case HA_EXTRA_NO_KEY_CACHE: + default: + break; + } + DBUG_RETURN(error); +} /* maria_extra */ + + +void ma_set_index_cond_func(MARIA_HA *info, index_cond_func_t func, + void *func_arg) +{ + info->index_cond_func= func; + info->index_cond_func_arg= func_arg; +} + + +/* + Start/Stop Inserting Duplicates Into a Table, WL#1648. +*/ + +static void maria_extra_keyflag(MARIA_HA *info, + enum ha_extra_function function) +{ + uint idx; + + for (idx= 0; idx< info->s->base.keys; idx++) + { + switch (function) { + case HA_EXTRA_CHANGE_KEY_TO_UNIQUE: + info->s->keyinfo[idx].flag|= HA_NOSAME; + break; + case HA_EXTRA_CHANGE_KEY_TO_DUP: + info->s->keyinfo[idx].flag&= ~(HA_NOSAME); + break; + default: + break; + } + } +} + + +int maria_reset(MARIA_HA *info) +{ + int error= 0; + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_reset"); + /* + Free buffers and reset the following flags: + EXTRA_CACHE, EXTRA_WRITE_CACHE, EXTRA_KEYREAD, EXTRA_QUICK + + If the row buffer cache is large (for dynamic tables), reduce it + to save memory. 
+ */ + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + error= end_io_cache(&info->rec_cache); + } + /* Free memory used for keeping blobs */ + if (share->base.blobs) + { + if (info->rec_buff_size > share->base.default_rec_buff_size) + { + info->rec_buff_size= 1; /* Force realloc */ + _ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + share->base.default_rec_buff_size); + } + if (info->blob_buff_size > MARIA_SMALL_BLOB_BUFFER) + { + info->blob_buff_size= 1; /* Force realloc */ + _ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size, + MARIA_SMALL_BLOB_BUFFER); + } + } +#if defined(HAVE_MMAP) && defined(HAVE_MADVISE) + if (info->opt_flag & MEMMAP_USED) + madvise((char*) share->file_map, share->state.state.data_file_length, + MADV_RANDOM); +#endif + info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS); + info->quick_mode= 0; + info->lastinx= 0; /* Use first index as def */ + info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR; + info->page_changed= 1; + info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND | + HA_STATE_PREV_FOUND); + DBUG_RETURN(error); +} + + +int _ma_sync_table_files(const MARIA_HA *info) +{ + return (my_sync(info->dfile.file, MYF(MY_WME)) || + my_sync(info->s->kfile.file, MYF(MY_WME))); +} + + +/** + @brief flushes the data and/or index file of a table + + This is useful when one wants to read a table using OS syscalls (like + my_copy()) and first wants to be sure that MySQL-level caches go down to + the OS so that OS syscalls can see all data. It can flush rec_cache, + bitmap, pagecache of data file, pagecache of index file. + + @param info table + @param flush_data_or_index one or two of these flags: + MARIA_FLUSH_DATA, MARIA_FLUSH_INDEX + @param flush_type_for_data + @param flush_type_for_index + + @note does not sync files (@see _ma_sync_table_files()). 
+ @note Progressively this function will be used in all places where we flush + the index but not the data file (probable bugs). + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index, + enum flush_type flush_type_for_data, + enum flush_type flush_type_for_index) +{ + int error= 0; + MARIA_SHARE *share= info->s; + /* flush data file first because it's more critical */ + if (flush_data_or_index & MARIA_FLUSH_DATA) + { + if ((info->opt_flag & WRITE_CACHE_USED) && + flush_type_for_data != FLUSH_IGNORE_CHANGED && + flush_io_cache(&info->rec_cache)) + error= 1; + if (share->data_file_type == BLOCK_RECORD) + { + if (flush_type_for_data != FLUSH_IGNORE_CHANGED) + { + if (_ma_bitmap_flush(share)) + error= 1; + } + else + { + pthread_mutex_lock(&share->bitmap.bitmap_lock); + share->bitmap.changed= 0; + pthread_mutex_unlock(&share->bitmap.bitmap_lock); + } + if (flush_pagecache_blocks(share->pagecache, &info->dfile, + flush_type_for_data)) + error= 1; + } + } + if ((flush_data_or_index & MARIA_FLUSH_INDEX) && + flush_pagecache_blocks(share->pagecache, &share->kfile, + flush_type_for_index)) + error= 1; + if (!error) + return 0; + + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); + return 1; +} + diff --git a/storage/maria/ma_ft_boolean_search.c b/storage/maria/ma_ft_boolean_search.c new file mode 100644 index 00000000000..0783f679843 --- /dev/null +++ b/storage/maria/ma_ft_boolean_search.c @@ -0,0 +1,1042 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +/* TODO: add caching - pre-read several index entries at once */ + +/* + Added optimization for full-text queries with plus-words. It was + implemented by sharing maximal document id (max_docid) variable + inside plus subtree. max_docid could be used by any word in plus + subtree, but it could be updated by plus-word only. + + Fulltext "smarter index merge" optimization assumes that rows + it gets are ordered by doc_id. That is not the case when we + search for a word with truncation operator. It may return + rows in random order. Thus we may not use "smarter index merge" + optimization with "trunc-words". + + The idea is: there is no need to search for docid smaller than + biggest docid inside current plus subtree or any upper plus subtree. + + Examples: + +word1 word2 + share same max_docid + max_docid updated by word1 + +word1 +(word2 word3) + share same max_docid + max_docid updated by word1 + +(word1 -word2) +(+word3 word4) + share same max_docid + max_docid updated by word3 + +word1 word2 (+word3 word4 (+word5 word6)) + three subexpressions (including the top-level one), + every one has its own max_docid, updated by its plus word. + but for the search word6 uses + max(word1.max_docid, word3.max_docid, word5.max_docid), + while word4 uses, accordingly, + max(word1.max_docid, word3.max_docid). 
+*/ + +#define FT_CORE +#include "ma_ftdefs.h" + +/* search with boolean queries */ + +static double _wghts[11]= +{ + 0.131687242798354, + 0.197530864197531, + 0.296296296296296, + 0.444444444444444, + 0.666666666666667, + 1.000000000000000, + 1.500000000000000, + 2.250000000000000, + 3.375000000000000, + 5.062500000000000, + 7.593750000000000}; +static double *wghts=_wghts+5; /* wghts[i] = 1.5**i */ + +static double _nwghts[11]= +{ + -0.065843621399177, + -0.098765432098766, + -0.148148148148148, + -0.222222222222222, + -0.333333333333334, + -0.500000000000000, + -0.750000000000000, + -1.125000000000000, + -1.687500000000000, + -2.531250000000000, + -3.796875000000000}; +static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */ + +#define FTB_FLAG_TRUNC 1 +/* At most one of the following flags can be set */ +#define FTB_FLAG_YES 2 +#define FTB_FLAG_NO 4 +#define FTB_FLAG_WONLY 8 + +typedef struct st_ftb_expr FTB_EXPR; +struct st_ftb_expr +{ + FTB_EXPR *up; + uint flags; +/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */ + my_off_t docid[2]; + my_off_t max_docid; + float weight; + float cur_weight; + LIST *phrase; /* phrase words */ + LIST *document; /* for phrase search */ + uint yesses; /* number of "yes" words matched */ + uint nos; /* number of "no" words matched */ + uint ythresh; /* number of "yes" words in expr */ + uint yweaks; /* number of "yes" words for scan only */ +}; + +typedef struct st_ftb_word +{ + FTB_EXPR *up; + uint flags; +/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */ + my_off_t docid[2]; /* for index search and for scan */ + my_off_t key_root; + FTB_EXPR *max_docid_expr; + MARIA_KEYDEF *keyinfo; + struct st_ftb_word *prev; + float weight; + uint ndepth; + uint len; + uchar off; + uchar word[1]; +} FTB_WORD; + +typedef struct st_ft_info +{ + struct _ft_vft *please; + MARIA_HA *info; + CHARSET_INFO *charset; + FTB_EXPR *root; + FTB_WORD **list; + FTB_WORD *last_word; + MEM_ROOT mem_root; + QUEUE queue; + TREE no_dupes; + 
my_off_t lastpos; + uint keynr; + uchar with_scan; + enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state; +} FTB; + +static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b) +{ + int i; + + /* if a==curdoc, take it as a < b */ + if (v && a->docid[0] == *v) + return -1; + + /* ORDER BY docid, ndepth DESC */ + i=CMP_NUM(a->docid[0], b->docid[0]); + if (!i) + i=CMP_NUM(b->ndepth,a->ndepth); + return i; +} + +static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b) +{ + /* ORDER BY word, ndepth */ + int i= ha_compare_text(cs, (uchar*) (*a)->word + 1,(*a)->len - 1, + (uchar*) (*b)->word + 1,(*b)->len - 1, 0, 0); + if (!i) + i=CMP_NUM((*a)->ndepth, (*b)->ndepth); + return i; +} + + +typedef struct st_my_ftb_param +{ + FTB *ftb; + FTB_EXPR *ftbe; + uchar *up_quot; + uint depth; +} MY_FTB_PARAM; + + +static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param, + const uchar *word, mysql_ft_size_t word_len, + MYSQL_FTPARSER_BOOLEAN_INFO *info) +{ + MY_FTB_PARAM *ftb_param= param->mysql_ftparam; + FTB_WORD *ftbw; + FTB_EXPR *ftbe, *tmp_expr; + FT_WORD *phrase_word; + LIST *tmp_element; + int r= info->weight_adjust; + float weight= (float) + (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)]; + + switch (info->type) { + case FT_TOKEN_WORD: + ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root, + sizeof(FTB_WORD) + + (info->trunc ? 
MARIA_MAX_KEY_BUFF : + word_len * ftb_param->ftb->charset->mbmaxlen + + HA_FT_WLEN + + ftb_param->ftb->info->s->rec_reflength)); + ftbw->len= word_len + 1; + ftbw->flags= 0; + ftbw->off= 0; + if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES; + if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO; + if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC; + ftbw->weight= weight; + ftbw->up= ftb_param->ftbe; + ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR; + ftbw->ndepth= (info->yesno < 0) + ftb_param->depth; + ftbw->key_root= HA_OFFSET_ERROR; + memcpy(ftbw->word + 1, word, word_len); + ftbw->word[0]= word_len; + if (info->yesno > 0) ftbw->up->ythresh++; + ftb_param->ftb->queue.max_elements++; + ftbw->prev= ftb_param->ftb->last_word; + ftb_param->ftb->last_word= ftbw; + ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC); + for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up) + if (! (tmp_expr->flags & FTB_FLAG_YES)) + break; + ftbw->max_docid_expr= tmp_expr; + /* fall through */ + case FT_TOKEN_STOPWORD: + if (! ftb_param->up_quot) break; + phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD)); + tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST)); + phrase_word->pos= (uchar*) word; + phrase_word->len= word_len; + tmp_element->data= (void *)phrase_word; + ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element); + /* Allocate document list at this point. 
+ It allows to avoid huge amount of allocs/frees for each row.*/ + tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST)); + tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD)); + ftb_param->ftbe->document= + list_add(ftb_param->ftbe->document, tmp_element); + break; + case FT_TOKEN_LEFT_PAREN: + ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR)); + ftbe->flags= 0; + if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES; + if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO; + ftbe->weight= weight; + ftbe->up= ftb_param->ftbe; + ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0; + ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR; + ftbe->phrase= NULL; + ftbe->document= 0; + if (info->quot) ftb_param->ftb->with_scan|= 2; + if (info->yesno > 0) ftbe->up->ythresh++; + ftb_param->ftbe= ftbe; + ftb_param->depth++; + ftb_param->up_quot= (uchar*) info->quot; + break; + case FT_TOKEN_RIGHT_PAREN: + if (ftb_param->ftbe->document) + { + /* Circuit document list */ + for (tmp_element= ftb_param->ftbe->document; + tmp_element->next; tmp_element= tmp_element->next) /* no-op */; + tmp_element->next= ftb_param->ftbe->document; + ftb_param->ftbe->document->prev= tmp_element; + } + info->quot= 0; + if (ftb_param->ftbe->up) + { + DBUG_ASSERT(ftb_param->depth); + ftb_param->ftbe= ftb_param->ftbe->up; + ftb_param->depth--; + ftb_param->up_quot= 0; + } + break; + case FT_TOKEN_EOF: + default: + break; + } + return(0); +} + + +static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param, + const uchar *query, mysql_ft_size_t len) +{ + MY_FTB_PARAM *ftb_param= param->mysql_ftparam; + MYSQL_FTPARSER_BOOLEAN_INFO info; + CHARSET_INFO *cs= ftb_param->ftb->charset; + const uchar **start= &query; + const uchar *end= query + len; + FT_WORD w; + + info.prev= ' '; + info.quot= 0; + while (maria_ft_get_word(cs, start, end, &w, &info)) + param->mysql_add_word(param, w.pos, w.len, &info); + return(0); +} + + +static int _ftb_parse_query(FTB *ftb, 
uchar *query, size_t len, + struct st_mysql_ftparser *parser) +{ + MYSQL_FTPARSER_PARAM *param; + MY_FTB_PARAM ftb_param; + DBUG_ENTER("_ftb_parse_query"); + DBUG_ASSERT(parser); + + if (ftb->state != UNINITIALIZED) + DBUG_RETURN(0); + if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0))) + DBUG_RETURN(1); + + ftb_param.ftb= ftb; + ftb_param.depth= 0; + ftb_param.ftbe= ftb->root; + ftb_param.up_quot= 0; + + param->mysql_parse= ftb_parse_query_internal; + param->mysql_add_word= ftb_query_add_word; + param->mysql_ftparam= (void *)&ftb_param; + param->cs= ftb->charset; + param->doc= query; + param->length= len; + param->flags= 0; + param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO; + DBUG_RETURN(parser->parse(param)); +} + + +static int _ftb_no_dupes_cmp(void* not_used __attribute__((unused)), + const void *a,const void *b) +{ + return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b))); +} + + +/* returns 1 if the search was finished (must-word wasn't found) */ + +static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search) +{ + int r; + int subkeys=1; + my_bool can_go_down; + MARIA_HA *info=ftb->info; + uint off, extra=HA_FT_WLEN+info->s->base.rec_reflength; + uchar *lastkey_buf= ftbw->word+ftbw->off; + MARIA_KEY key; + LINT_INIT(off); + + if (ftbw->flags & FTB_FLAG_TRUNC) + lastkey_buf+=ftbw->len; + + if (init_search) + { + ftbw->key_root=info->s->state.key_root[ftb->keynr]; + ftbw->keyinfo=info->s->keyinfo+ftb->keynr; + key.keyinfo= ftbw->keyinfo; + key.data= ftbw->word; + key.data_length= ftbw->len; + key.ref_length= 0; + key.flag= 0; + + r= _ma_search(info, &key, SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root); + } + else + { + uint sflag= SEARCH_BIGGER; + my_off_t max_docid=0; + FTB_EXPR *tmp; + + for (tmp= ftbw->max_docid_expr; tmp; tmp= tmp->up) + set_if_bigger(max_docid, tmp->max_docid); + + if (ftbw->docid[0] < max_docid) + { + sflag|= SEARCH_SAME; + _ma_dpointer(info->s, (uchar*) (ftbw->word + ftbw->len + HA_FT_WLEN), + max_docid); + } + + 
key.keyinfo= ftbw->keyinfo; + key.data= lastkey_buf; + key.data_length= USE_WHOLE_KEY; + key.ref_length= 0; + key.flag= 0; + + r= _ma_search(info, &key, sflag, ftbw->key_root); + } + + can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC))); + /* Skip rows inserted by concurrent insert */ + while (!r) + { + if (can_go_down) + { + /* going down ? */ + off= info->last_key.data_length + info->last_key.ref_length - extra; + subkeys=ft_sintXkorr(info->last_key.data + off); + } + if (subkeys<0 || info->cur_row.lastpos < info->state->data_file_length) + break; + r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, ftbw->key_root); + } + + if (!r && !ftbw->off) + { + r= ha_compare_text(ftb->charset, + info->last_key.data+1, + info->last_key.data_length + info->last_key.ref_length- + extra-1, + (uchar*) ftbw->word+1, + ftbw->len-1, + (my_bool) (ftbw->flags & FTB_FLAG_TRUNC), 0); + } + + if (r) /* not found */ + { + if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC)) + { + ftbw->docid[0]=HA_OFFSET_ERROR; + if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0) + { + /* + This word MUST BE present in every document returned, + so we can stop the search right now + */ + ftb->state=INDEX_DONE; + return 1; /* search is done */ + } + else + return 0; + } + + /* going up to the first-level tree to continue search there */ + _ma_dpointer(info->s, (lastkey_buf+HA_FT_WLEN), ftbw->key_root); + ftbw->key_root=info->s->state.key_root[ftb->keynr]; + ftbw->keyinfo=info->s->keyinfo+ftb->keynr; + ftbw->off=0; + return _ft2_search(ftb, ftbw, 0); + } + + /* matching key found */ + memcpy(lastkey_buf, info->last_key.data, + info->last_key.data_length + info->last_key.ref_length); + if (lastkey_buf == ftbw->word) + ftbw->len= info->last_key.data_length + info->last_key.ref_length - extra; + + /* going down ? 
*/ + if (subkeys<0) + { + /* + yep, going down, to the second-level tree + TODO here: subkey-based optimization + */ + ftbw->off=off; + ftbw->key_root= info->cur_row.lastpos; + ftbw->keyinfo=& info->s->ft2_keyinfo; + r= _ma_search_first(info, ftbw->keyinfo, ftbw->key_root); + DBUG_ASSERT(r==0); /* found something */ + memcpy(lastkey_buf+off, info->last_key.data, + info->last_key.data_length + info->last_key.ref_length); + } + ftbw->docid[0]= info->cur_row.lastpos; + if (ftbw->flags & FTB_FLAG_YES && !(ftbw->flags & FTB_FLAG_TRUNC)) + ftbw->max_docid_expr->max_docid= info->cur_row.lastpos; + return 0; +} + +static void _ftb_init_index_search(FT_INFO *ftb) +{ + int i; + FTB_WORD *ftbw; + + if (ftb->state == UNINITIALIZED || ftb->keynr == NO_SUCH_KEY) + return; + ftb->state=INDEX_SEARCH; + + for (i= queue_last_element(&ftb->queue); + (int) i >= (int) queue_first_element(&ftb->queue); + i--) + { + ftbw=(FTB_WORD *)(queue_element(&ftb->queue, i)); + + if (ftbw->flags & FTB_FLAG_TRUNC) + { + /* + special treatment for truncation operator + 1. there are some (besides this) +words + | no need to search in the index, it can never ADD new rows + | to the result, and to remove half-matched rows we do scan anyway + 2. -trunc* + | same as 1. + 3. in 1 and 2, +/- need not be on the same expr. level, + but can be on any upper level, as in +word +(trunc1* trunc2*) + 4. otherwise + | We have to index-search for this prefix. + | It may cause duplicates, as in the index (sorted by <word,docid>) + | <aaaa,row1> + | <aabb,row2> + | <aacc,row1> + | Searching for "aa*" will find row1 twice... 
+ */ + FTB_EXPR *ftbe; + for (ftbe=(FTB_EXPR*)ftbw; + ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC); + ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up) + { + if (ftbe->flags & FTB_FLAG_NO || /* 2 */ + ftbe->up->ythresh - ftbe->up->yweaks > + (uint) test(ftbe->flags & FTB_FLAG_YES)) /* 1 */ + { + FTB_EXPR *top_ftbe=ftbe->up; + ftbw->docid[0]=HA_OFFSET_ERROR; + for (ftbe=(FTB_EXPR *)ftbw; + ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO); + ftbe=ftbe->up) + ftbe->up->yweaks++; + ftbe=0; + break; + } + } + if (!ftbe) + continue; + /* 4 */ + if (!is_tree_inited(& ftb->no_dupes)) + init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t), + _ftb_no_dupes_cmp,0,0,0); + else + reset_tree(& ftb->no_dupes); + } + + ftbw->off=0; /* in case of reinit */ + if (_ft2_search(ftb, ftbw, 1)) + return; + } + queue_fix(& ftb->queue); +} + + +FT_INFO * maria_ft_init_boolean_search(MARIA_HA *info, uint keynr, + uchar *query, size_t query_len, + CHARSET_INFO *cs) +{ + FTB *ftb; + FTB_EXPR *ftbe; + FTB_WORD *ftbw; + + if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME)))) + return 0; + ftb->please= (struct _ft_vft *) & _ma_ft_vft_boolean; + ftb->state=UNINITIALIZED; + ftb->info=info; + ftb->keynr=keynr; + ftb->charset=cs; + DBUG_ASSERT(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset); + ftb->with_scan=0; + ftb->lastpos=HA_OFFSET_ERROR; + bzero(& ftb->no_dupes, sizeof(TREE)); + ftb->last_word= 0; + + init_alloc_root(&ftb->mem_root, 1024, 1024); + ftb->queue.max_elements= 0; + if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR)))) + goto err; + ftbe->weight=1; + ftbe->flags=FTB_FLAG_YES; + ftbe->nos=1; + ftbe->up=0; + ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0; + ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR; + ftbe->phrase= NULL; + ftbe->document= 0; + ftb->root=ftbe; + if (unlikely(_ftb_parse_query(ftb, query, query_len, + keynr == NO_SUCH_KEY ? 
&ft_default_parser : + info->s->keyinfo[keynr].parser))) + goto err; + /* + Hack: instead of init_queue, we'll use reinit queue to be able + to alloc queue with alloc_root() + */ + if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root, + (ftb->queue.max_elements + 1) * + sizeof(void *)))) + goto err; + reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0, + (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0, 0, 0); + for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev) + queue_insert(&ftb->queue, (uchar *)ftbw); + ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root, + sizeof(FTB_WORD *)*ftb->queue.elements); + memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements); + my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *), + (qsort2_cmp)FTB_WORD_cmp_list, (void*) ftb->charset); + if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC; + ftb->state=READY; + return ftb; +err: + free_root(& ftb->mem_root, MYF(0)); + my_free(ftb, MYF(0)); + return 0; +} + + +typedef struct st_my_ftb_phrase_param +{ + LIST *phrase; + LIST *document; + CHARSET_INFO *cs; + uint phrase_length; + uint document_length; + uint match; +} MY_FTB_PHRASE_PARAM; + + +static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param, + const uchar *word, mysql_ft_size_t word_len, + MYSQL_FTPARSER_BOOLEAN_INFO + *boolean_info __attribute__((unused))) +{ + MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam; + FT_WORD *w= (FT_WORD *)phrase_param->document->data; + LIST *phrase, *document; + w->pos= (uchar*) word; + w->len= word_len; + phrase_param->document= phrase_param->document->prev; + if (phrase_param->phrase_length > phrase_param->document_length) + { + phrase_param->document_length++; + return 0; + } + /* TODO: rewrite phrase search to avoid + comparing the same word twice. 
*/ + for (phrase= phrase_param->phrase, document= phrase_param->document->next; + phrase; phrase= phrase->next, document= document->next) + { + FT_WORD *phrase_word= (FT_WORD *)phrase->data; + FT_WORD *document_word= (FT_WORD *)document->data; + if (my_strnncoll(phrase_param->cs, (uchar*) phrase_word->pos, + phrase_word->len, + (uchar*) document_word->pos, document_word->len)) + return 0; + } + phrase_param->match++; + return 0; +} + + +static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param, + const uchar *document, + mysql_ft_size_t len) +{ + FT_WORD word; + MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam; + const uchar *docend= document + len; + while (maria_ft_simple_get_word(phrase_param->cs, &document, + docend, &word, FALSE)) + { + param->mysql_add_word(param, word.pos, word.len, 0); + if (phrase_param->match) + break; + } + return 0; +} + + +/* + Checks if given buffer matches phrase list. + + SYNOPSIS + _ftb_check_phrase() + s0 start of buffer + e0 end of buffer + phrase broken into list phrase + cs charset info + + RETURN VALUE + 1 is returned if phrase found, 0 else. + -1 is returned if error occurs. +*/ + +static int _ftb_check_phrase(FTB *ftb, const uchar *document, size_t len, + FTB_EXPR *ftbe, struct st_mysql_ftparser *parser) +{ + MY_FTB_PHRASE_PARAM ftb_param; + MYSQL_FTPARSER_PARAM *param; + DBUG_ENTER("_ftb_check_phrase"); + DBUG_ASSERT(parser); + + if (! 
(param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 1))) + DBUG_RETURN(0); + ftb_param.phrase= ftbe->phrase; + ftb_param.document= ftbe->document; + ftb_param.cs= ftb->charset; + ftb_param.phrase_length= list_length(ftbe->phrase); + ftb_param.document_length= 1; + ftb_param.match= 0; + + param->mysql_parse= ftb_check_phrase_internal; + param->mysql_add_word= ftb_phrase_add_word; + param->mysql_ftparam= (void *)&ftb_param; + param->cs= ftb->charset; + param->doc= document; + param->length= len; + param->flags= 0; + param->mode= MYSQL_FTPARSER_WITH_STOPWORDS; + if (unlikely(parser->parse(param))) + return -1; + DBUG_RETURN(ftb_param.match ? 1 : 0); +} + + +static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig) +{ + FT_SEG_ITERATOR ftsi; + FTB_EXPR *ftbe; + float weight=ftbw->weight; + int yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0); + my_off_t curdoc=ftbw->docid[mode]; + struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ? + &ft_default_parser : + ftb->info->s->keyinfo[ftb->keynr].parser; + + for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up) + { + ythresh = ftbe->ythresh - (mode ? 
0 : ftbe->yweaks); + if (ftbe->docid[mode] != curdoc) + { + ftbe->cur_weight=0; + ftbe->yesses=ftbe->nos=0; + ftbe->docid[mode]=curdoc; + } + if (ftbe->nos) + break; + if (yn_flag & FTB_FLAG_YES) + { + weight /= ftbe->ythresh; + ftbe->cur_weight += weight; + if ((int) ++ftbe->yesses == ythresh) + { + yn_flag=ftbe->flags; + weight=ftbe->cur_weight*ftbe->weight; + if (mode && ftbe->phrase) + { + int found= 0; + + memcpy(&ftsi, ftsi_orig, sizeof(ftsi)); + while (_ma_ft_segiterator(&ftsi) && !found) + { + if (!ftsi.pos) + continue; + found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser); + if (unlikely(found < 0)) + return 1; + } + if (!found) + break; + } /* ftbe->quot */ + } + else + break; + } + else + if (yn_flag & FTB_FLAG_NO) + { + /* + NOTE: special sort function of queue assures that all + (yn_flag & FTB_FLAG_NO) != 0 + events for every particular subexpression will + "auto-magically" happen BEFORE all the + (yn_flag & FTB_FLAG_YES) != 0 events. So no + already matched expression can become not-matched again. + */ + ++ftbe->nos; + break; + } + else + { + if (ftbe->ythresh) + weight/=3; + ftbe->cur_weight += weight; + if ((int) ftbe->yesses < ythresh) + break; + if (!(yn_flag & FTB_FLAG_WONLY)) + yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ; + weight*= ftbe->weight; + } + } + return 0; +} + + +int maria_ft_boolean_read_next(FT_INFO *ftb, char *record) +{ + FTB_EXPR *ftbe; + FTB_WORD *ftbw; + MARIA_HA *info=ftb->info; + my_off_t curdoc; + + if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE) + return -1; + + /* black magic ON */ + if ((int) _ma_check_index(info, ftb->keynr) < 0) + return my_errno; + if (_ma_readinfo(info, F_RDLCK, 1)) + return my_errno; + /* black magic OFF */ + + if (!ftb->queue.elements) + return my_errno=HA_ERR_END_OF_FILE; + + /* Attention!!! Address of a local variable is used here! 
See err: label */ + ftb->queue.first_cmp_arg=(void *)&curdoc; + + while (ftb->state == INDEX_SEARCH && + (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) != + HA_OFFSET_ERROR) + { + while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0]) + { + if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0))) + { + my_errno= HA_ERR_OUT_OF_MEM; + goto err; + } + + /* update queue */ + _ft2_search(ftb, ftbw, 0); + queue_replace_top(&ftb->queue); + } + + ftbe=ftb->root; + if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 && + ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos) + { + /* curdoc matched ! */ + if (is_tree_inited(&ftb->no_dupes) && + tree_insert(&ftb->no_dupes, &curdoc, 0, + ftb->no_dupes.custom_arg)->count >1) + /* but it managed already to get past this line once */ + continue; + + info->cur_row.lastpos= curdoc; + /* Clear all states, except that the table was updated */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + if (!(*info->read_record)(info, (uchar *) record, curdoc)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + if (ftb->with_scan && + maria_ft_boolean_find_relevance(ftb, (uchar *) record, 0)==0) + continue; /* no match */ + my_errno=0; + goto err; + } + goto err; + } + } + ftb->state=INDEX_DONE; + my_errno=HA_ERR_END_OF_FILE; +err: + ftb->queue.first_cmp_arg=(void *)0; + return my_errno; +} + + +typedef struct st_my_ftb_find_param +{ + FT_INFO *ftb; + FT_SEG_ITERATOR *ftsi; +} MY_FTB_FIND_PARAM; + + +static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param, + const uchar *word, mysql_ft_size_t len, + MYSQL_FTPARSER_BOOLEAN_INFO + *boolean_info __attribute__((unused))) +{ + MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam; + FT_INFO *ftb= ftb_param->ftb; + FTB_WORD *ftbw; + int a, b, c; + /* + Find right-most element in the array of query words matching this + word from a document. 
+ */ + for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2) + { + ftbw= ftb->list[c]; + if (ha_compare_text(ftb->charset, (uchar*)word, len, + (uchar*)ftbw->word+1, ftbw->len-1, + (my_bool)(ftbw->flags&FTB_FLAG_TRUNC), 0) < 0) + b= c; + else + a= c; + } + /* + If there were no words with truncation operator, we iterate to the + beginning of an array until array element is equal to the word from + a document. This is done mainly because the same word may be + mentioned twice (or more) in the query. + + In case query has words with truncation operator we must iterate + to the beginning of the array. There may be non-matching query words + between matching word with truncation operator and the right-most + matching element. E.g., if we're looking for 'aaa15' in an array of + 'aaa1* aaa14 aaa15 aaa16'. + + Worse of that there still may be match even if the binary search + above didn't find matching element. E.g., if we're looking for + 'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will + stop at 'aaa16'. 
+ */ + for (; c >= 0; c--) + { + ftbw= ftb->list[c]; + if (ha_compare_text(ftb->charset, (uchar*)word, len, + (uchar*)ftbw->word + 1,ftbw->len - 1, + (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0)) + { + if (ftb->with_scan & FTB_FLAG_TRUNC) + continue; + else + break; + } + if (ftbw->docid[1] == ftb->info->cur_row.lastpos) + continue; + ftbw->docid[1]= ftb->info->cur_row.lastpos; + if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi))) + return 1; + } + return(0); +} + + +static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param, + const uchar *doc, mysql_ft_size_t len) +{ + MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam; + FT_INFO *ftb= ftb_param->ftb; + const uchar *end= doc + len; + FT_WORD w; + while (maria_ft_simple_get_word(ftb->charset, &doc, end, &w, TRUE)) + param->mysql_add_word(param, w.pos, w.len, 0); + return(0); +} + + +float maria_ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length) +{ + FTB_EXPR *ftbe; + FT_SEG_ITERATOR ftsi, ftsi2; + MARIA_RECORD_POS docid= ftb->info->cur_row.lastpos; + MY_FTB_FIND_PARAM ftb_param; + MYSQL_FTPARSER_PARAM *param; + struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ? + &ft_default_parser : + ftb->info->s->keyinfo[ftb->keynr].parser; + + if (docid == HA_OFFSET_ERROR) + return -2.0; + if (!ftb->queue.elements) + return 0; + if (! 
(param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0))) + return 0; + + if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos) + { + FTB_EXPR *x; + uint i; + + for (i=0; i < ftb->queue.elements; i++) + { + ftb->list[i]->docid[1]=HA_OFFSET_ERROR; + for (x=ftb->list[i]->up; x; x=x->up) + x->docid[1]=HA_OFFSET_ERROR; + } + } + + ftb->lastpos=docid; + + if (ftb->keynr==NO_SUCH_KEY) + _ma_ft_segiterator_dummy_init(record, length, &ftsi); + else + _ma_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi); + memcpy(&ftsi2, &ftsi, sizeof(ftsi)); + + ftb_param.ftb= ftb; + ftb_param.ftsi= &ftsi2; + param->mysql_parse= ftb_find_relevance_parse; + param->mysql_add_word= ftb_find_relevance_add_word; + param->mysql_ftparam= (void *)&ftb_param; + param->flags= 0; + param->cs= ftb->charset; + param->mode= MYSQL_FTPARSER_SIMPLE_MODE; + + while (_ma_ft_segiterator(&ftsi)) + { + if (!ftsi.pos) + continue; + param->doc= ftsi.pos; + param->length= ftsi.len; + if (unlikely(parser->parse(param))) + return 0; + } + ftbe=ftb->root; + if (ftbe->docid[1]==docid && ftbe->cur_weight>0 && + ftbe->yesses>=ftbe->ythresh && !ftbe->nos) + { /* row matched ! */ + return ftbe->cur_weight; + } + else + { /* match failed ! 
*/ + return 0.0; + } +} + + +void maria_ft_boolean_close_search(FT_INFO *ftb) +{ + if (is_tree_inited(& ftb->no_dupes)) + { + delete_tree(& ftb->no_dupes); + } + free_root(& ftb->mem_root, MYF(0)); + my_free(ftb, MYF(0)); +} + + +float maria_ft_boolean_get_relevance(FT_INFO *ftb) +{ + return ftb->root->cur_weight; +} + + +void maria_ft_boolean_reinit_search(FT_INFO *ftb) +{ + _ftb_init_index_search(ftb); +} diff --git a/storage/maria/ma_ft_eval.c b/storage/maria/ma_ft_eval.c new file mode 100644 index 00000000000..5fc67c6c664 --- /dev/null +++ b/storage/maria/ma_ft_eval.c @@ -0,0 +1,254 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code + added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */ + +#include "ma_ftdefs.h" +#include "maria_ft_eval.h" +#include <stdarg.h> +#include <my_getopt.h> + +static void print_error(int exit_code, const char *fmt,...); +static void get_options(int argc, char *argv[]); +static int create_record(char *pos, FILE *file); +static void usage(); + +static struct my_option my_long_options[] = +{ + {"", 's', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'q', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '#', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +int main(int argc, char *argv[]) +{ + MARIA_HA *file; + int i,j; + + MY_INIT(argv[0]); + get_options(argc,argv); + bzero((char*)recinfo,sizeof(recinfo)); + + maria_init(); + /* First define 2 columns */ + recinfo[0].type=FIELD_SKIP_ENDSPACE; + recinfo[0].length=docid_length; + recinfo[1].type=FIELD_BLOB; + recinfo[1].length= 4+portable_sizeof_char_ptr; + + /* Define a key over the first column */ + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[0].seg[0].type= HA_KEYTYPE_TEXT; + keyinfo[0].seg[0].flag= HA_BLOB_PART; + keyinfo[0].seg[0].start=recinfo[0].length; + keyinfo[0].seg[0].length=key_length; + keyinfo[0].seg[0].null_bit=0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].bit_start=4; + keyinfo[0].seg[0].language=MY_CHARSET_CURRENT; + keyinfo[0].flag = HA_FULLTEXT; + + if (!silent) + printf("- Creating isam-file\n"); + if (maria_create(filename,1,keyinfo,2,recinfo,0,NULL,(MARIA_CREATE_INFO*) 0,0)) + goto err; + if 
(!(file=maria_open(filename,2,0))) + goto err; + if (!silent) + printf("Initializing stopwords\n"); + maria_ft_init_stopwords(stopwordlist); + + if (!silent) + printf("- Writing key:s\n"); + + my_errno=0; + i=0; + while (create_record(record,df)) + { + error=maria_write(file,record); + if (error) + printf("I= %2d maria_write: %d errno: %d\n",i,error,my_errno); + i++; + } + fclose(df); + + if (maria_close(file)) goto err; + if (!silent) + printf("- Reopening file\n"); + if (!(file=maria_open(filename,2,0))) goto err; + if (!silent) + printf("- Reading rows with key\n"); + for (i=1;create_record(record,qf);i++) + { + FT_DOCLIST *result; + double w; + int t, err; + + result=maria_ft_nlq_init_search(file,0,blob_record,(uint) strlen(blob_record),1); + if (!result) + { + printf("Query %d failed with errno %3d\n",i,my_errno); + goto err; + } + if (!silent) + printf("Query %d. Found: %d.\n",i,result->ndocs); + for (j=0;(err=maria_ft_nlq_read_next(result, read_record))==0;j++) + { + t=uint2korr(read_record); + w=maria_ft_nlq_get_relevance(result); + printf("%d %.*s %f\n",i,t,read_record+2,w); + } + if (err != HA_ERR_END_OF_FILE) + { + printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno); + goto err; + } + maria_ft_nlq_close_search(result); + } + + if (maria_close(file)) goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return (0); + + err: + printf("got error: %3d when using maria-database\n",my_errno); + return 1; /* skip warning */ + +} + + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument) +{ + switch (optid) { + case 's': + if (stopwordlist && stopwordlist != maria_ft_precompiled_stopwords) + break; + { + FILE *f; char s[HA_FT_MAXLEN]; int i=0,n=SWL_INIT; + + if (!(stopwordlist=(const char**) malloc(n*sizeof(char *)))) + print_error(1,"malloc(%d)",n*sizeof(char *)); + if (!(f=fopen(argument,"r"))) + print_error(1,"fopen(%s)",argument); + while (!feof(f)) + { + if 
(!(fgets(s,HA_FT_MAXLEN,f))) + print_error(1,"fgets(s,%d,%s)",HA_FT_MAXLEN,argument); + if (!(stopwordlist[i++]=strdup(s))) + print_error(1,"strdup(%s)",s); + if (i >= n) + { + n+=SWL_PLUS; + if (!(stopwordlist=(const char**) realloc((char*) stopwordlist, + n*sizeof(char *)))) + print_error(1,"realloc(%d)",n*sizeof(char *)); + } + } + fclose(f); + stopwordlist[i]=NULL; + break; + } + case 'q': silent=1; break; + case 'S': if (stopwordlist==maria_ft_precompiled_stopwords) stopwordlist=NULL; break; + case '#': + DBUG_PUSH (argument); + break; + case 'V': + case '?': + case 'h': + usage(); + exit(1); + } + return 0; +} + + +static void get_options(int argc, char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(ho_error); + + if (!(d_file=argv[optind])) print_error(1,"No d_file"); + if (!(df=fopen(d_file,"r"))) + print_error(1,"fopen(%s)",d_file); + if (!(q_file=argv[optind+1])) print_error(1,"No q_file"); + if (!(qf=fopen(q_file,"r"))) + print_error(1,"fopen(%s)",q_file); + return; +} /* get options */ + + +static int create_record(char *pos, FILE *file) +{ + uint tmp; char *ptr; + + bzero((char *)pos,MAX_REC_LENGTH); + + /* column 1 - VARCHAR */ + if (!(fgets(pos+2,MAX_REC_LENGTH-32,file))) + { + if (feof(file)) + return 0; + else + print_error(1,"fgets(docid) - 1"); + } + tmp=(uint) strlen(pos+2)-1; + int2store(pos,tmp); + pos+=recinfo[0].length; + + /* column 2 - BLOB */ + + if (!(fgets(blob_record,MAX_BLOB_LENGTH,file))) + print_error(1,"fgets(docid) - 2"); + tmp=(uint) strlen(blob_record); + int4store(pos,tmp); + ptr=blob_record; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + return 1; +} + +/* VARARGS */ + +static void print_error(int exit_code, const char *fmt,...) 
+{ + va_list args; + + va_start(args,fmt); + fprintf(stderr,"%s: error: ",my_progname); + VOID(vfprintf(stderr, fmt, args)); + VOID(fputc('\n',stderr)); + fflush(stderr); + va_end(args); + exit(exit_code); +} + + +static void usage() +{ + printf("%s [options]\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} diff --git a/storage/maria/ma_ft_eval.h b/storage/maria/ma_ft_eval.h new file mode 100644 index 00000000000..481943dfb0b --- /dev/null +++ b/storage/maria/ma_ft_eval.h @@ -0,0 +1,41 @@ +/* Copyright (C) 2006 MySQL AB & Sergei A. Golubchik + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */
+
+/*
+  Global state of the ma_ft_eval test program. These are definitions, not
+  declarations, so this header can be included by one source file only.
+  NOTE(review): ma_ft_eval.c includes "maria_ft_eval.h" while this file is
+  added as ma_ft_eval.h - confirm which name is intended.
+*/
+
+/* Stopword list in use; replaced by the -s option, set to NULL by -S */
+const char **stopwordlist=maria_ft_precompiled_stopwords;
+
+#define MAX_REC_LENGTH 128
+#define MAX_BLOB_LENGTH 60000
+/* Row buffer for writing, and buffer the search results are read into */
+char record[MAX_REC_LENGTH], read_record[MAX_REC_LENGTH+MAX_BLOB_LENGTH];
+/* Text of the blob column; the row stores only a pointer to this buffer */
+char blob_record[MAX_BLOB_LENGTH+20*20];
+
+/* Name passed to maria_create() / maria_open() */
+char *filename= (char*) "EVAL";
+
+/* silent is set by -q; error holds the last maria_write() result */
+int silent=0, error=0;
+
+uint key_length=MAX_BLOB_LENGTH,docid_length=32;
+/* Document file and query file names (positional arguments) and streams */
+char *d_file, *q_file;
+FILE *df,*qf;
+
+/* Column, key and key segment definitions filled in by main() */
+MARIA_COLUMNDEF recinfo[3];
+MARIA_KEYDEF keyinfo[2];
+HA_KEYSEG keyseg[10];
+
+/* Initial capacity and growth step of the stopword array read by -s */
+#define SWL_INIT 500
+#define SWL_PLUS 50
+
+#define MAX_LINE_LENGTH 128
+char line[MAX_LINE_LENGTH];
diff --git a/storage/maria/ma_ft_nlq_search.c b/storage/maria/ma_ft_nlq_search.c
new file mode 100644
index 00000000000..3bb7defcaaf
--- /dev/null
+++ b/storage/maria/ma_ft_nlq_search.c
@@ -0,0 +1,380 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A.
Golubchik, who has a shared copyright to this code */ + +#define FT_CORE +#include "ma_ftdefs.h" + +/* search with natural language queries */ + +typedef struct ft_doc_rec +{ + my_off_t dpos; + double weight; +} FT_DOC; + +struct st_ft_info +{ + struct _ft_vft *please; + MARIA_HA *info; + int ndocs; + int curdoc; + FT_DOC doc[1]; +}; + +typedef struct st_all_in_one +{ + MARIA_HA *info; + uint keynr; + CHARSET_INFO *charset; + uchar *keybuff; + TREE dtree; +} ALL_IN_ONE; + +typedef struct st_ft_superdoc +{ + FT_DOC doc; + FT_WORD *word_ptr; + double tmp_weight; +} FT_SUPERDOC; + + +static int FT_SUPERDOC_cmp(void* cmp_arg __attribute__((unused)), + FT_SUPERDOC *p1, FT_SUPERDOC *p2) +{ + if (p1->doc.dpos < p2->doc.dpos) + return -1; + if (p1->doc.dpos == p2->doc.dpos) + return 0; + return 1; +} + +static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio) +{ + FT_WEIGTH subkeys; + int r; + uint doc_cnt; + FT_SUPERDOC sdoc, *sptr; + TREE_ELEMENT *selem; + double gweight=1; + MARIA_HA *info= aio->info; + uchar *keybuff= aio->keybuff; + MARIA_KEYDEF *keyinfo= info->s->keyinfo+aio->keynr; + my_off_t key_root=info->s->state.key_root[aio->keynr]; + uint extra=HA_FT_WLEN+info->s->base.rec_reflength; + MARIA_KEY key; +#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT + float tmp_weight; +#else +#error +#endif + DBUG_ENTER("walk_and_match"); + + word->weight=LWS_FOR_QUERY; + + _ma_ft_make_key(info, &key, aio->keynr, keybuff, word, 0); + key.data_length-= HA_FT_WLEN; + doc_cnt=0; + + /* Skip rows inserted by current inserted */ + for (r= _ma_search(info, &key, SEARCH_FIND, key_root) ; + !r && + (subkeys.i= ft_sintXkorr(info->last_key.data + + info->last_key.data_length + + info->last_key.ref_length - extra)) > 0 && + info->cur_row.lastpos >= info->state->data_file_length ; + r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root)) + ; + + info->update|= HA_STATE_AKTIV; /* for _ma_test_if_changed() */ + + /* The following should be safe, even if we compare doubles */ 
+ while (!r && gweight) + { + + if (key.data_length && + ha_compare_text(aio->charset, + info->last_key.data+1, + info->last_key.data_length + + info->last_key.ref_length - extra - 1, + key.data+1, key.data_length-1, 0, 0)) + break; + + if (subkeys.i < 0) + { + if (doc_cnt) + DBUG_RETURN(1); /* index is corrupted */ + /* + TODO here: unsafe optimization, should this word + be skipped (based on subkeys) ? + */ + keybuff+= key.data_length; + keyinfo= &info->s->ft2_keyinfo; + key_root= info->cur_row.lastpos; + key.data_length= 0; + r= _ma_search_first(info, keyinfo, key_root); + goto do_skip; + } +#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT + /* The weight we read was actually a float */ + tmp_weight= subkeys.f; +#else +#error +#endif + /* The following should be safe, even if we compare doubles */ + if (tmp_weight==0) + DBUG_RETURN(doc_cnt); /* stopword, doc_cnt should be 0 */ + + sdoc.doc.dpos= info->cur_row.lastpos; + + /* saving document matched into dtree */ + if (!(selem=tree_insert(&aio->dtree, &sdoc, 0, aio->dtree.custom_arg))) + DBUG_RETURN(1); + + sptr=(FT_SUPERDOC *)ELEMENT_KEY((&aio->dtree), selem); + + if (selem->count==1) /* document's first match */ + sptr->doc.weight=0; + else + sptr->doc.weight+=sptr->tmp_weight*sptr->word_ptr->weight; + + sptr->word_ptr=word; + sptr->tmp_weight=tmp_weight; + + doc_cnt++; + + gweight=word->weight*GWS_IN_USE; + if (gweight < 0 || doc_cnt > 2000000) + gweight=0; + + if (_ma_test_if_changed(info) == 0) + r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root); + else + r= _ma_search(info, &info->last_key, SEARCH_BIGGER, key_root); +do_skip: + while ((subkeys.i= ft_sintXkorr(info->last_key.data + + info->last_key.data_length + + info->last_key.ref_length - extra)) > 0 && + !r && info->cur_row.lastpos >= info->state->data_file_length) + r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root); + + } + word->weight=gweight; + + DBUG_RETURN(0); +} + + +static int walk_and_copy(FT_SUPERDOC *from, + uint32 count 
__attribute__((unused)), FT_DOC **to) +{ + DBUG_ENTER("walk_and_copy"); + from->doc.weight+=from->tmp_weight*from->word_ptr->weight; + (*to)->dpos=from->doc.dpos; + (*to)->weight=from->doc.weight; + (*to)++; + DBUG_RETURN(0); +} + +static int walk_and_push(FT_SUPERDOC *from, + uint32 count __attribute__((unused)), QUEUE *best) +{ + DBUG_ENTER("walk_and_copy"); + from->doc.weight+=from->tmp_weight*from->word_ptr->weight; + set_if_smaller(best->elements, ft_query_expansion_limit-1); + queue_insert(best, (uchar *)& from->doc); + DBUG_RETURN(0); +} + + +static int FT_DOC_cmp(void *unused __attribute__((unused)), + FT_DOC *a, FT_DOC *b) +{ + return sgn(b->weight - a->weight); +} + + +FT_INFO *maria_ft_init_nlq_search(MARIA_HA *info, uint keynr, uchar *query, + size_t query_len, uint flags, uchar *record) +{ + TREE wtree; + ALL_IN_ONE aio; + FT_DOC *dptr; + FT_INFO *dlist=NULL; + MARIA_RECORD_POS saved_lastpos= info->cur_row.lastpos; + struct st_mysql_ftparser *parser; + MYSQL_FTPARSER_PARAM *ftparser_param; + DBUG_ENTER("maria_ft_init_nlq_search"); + + /* black magic ON */ + if ((int) (keynr = _ma_check_index(info,keynr)) < 0) + DBUG_RETURN(NULL); + if (_ma_readinfo(info,F_RDLCK,1)) + DBUG_RETURN(NULL); + /* black magic OFF */ + + aio.info=info; + aio.keynr=keynr; + aio.charset=info->s->keyinfo[keynr].seg->charset; + aio.keybuff= info->lastkey_buff2; + parser= info->s->keyinfo[keynr].parser; + if (! 
(ftparser_param= maria_ftparser_call_initializer(info, keynr, 0))) + goto err; + + bzero(&wtree,sizeof(wtree)); + + init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp,0, + NULL, NULL); + + maria_ft_parse_init(&wtree, aio.charset); + ftparser_param->flags= 0; + if (maria_ft_parse(&wtree, query, query_len, parser, ftparser_param, + &wtree.mem_root)) + goto err; + + if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio, + left_root_right)) + goto err; + + if (flags & FT_EXPAND && ft_query_expansion_limit) + { + QUEUE best; + init_queue(&best,ft_query_expansion_limit,0,0, (queue_compare) &FT_DOC_cmp, + 0, 0, 0); + tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push, + &best, left_root_right); + while (best.elements) + { + my_off_t docid= ((FT_DOC *)queue_remove_top(&best))->dpos; + if (!(*info->read_record)(info, record, docid)) + { + info->update|= HA_STATE_AKTIV; + ftparser_param->flags= MYSQL_FTFLAGS_NEED_COPY; + if (unlikely(_ma_ft_parse(&wtree, info, keynr, record, ftparser_param, + &wtree.mem_root))) + { + delete_queue(&best); + goto err; + } + } + } + delete_queue(&best); + reset_tree(&aio.dtree); + if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio, + left_root_right)) + goto err; + + } + + /* + If ndocs == 0, this will not allocate RAM for FT_INFO.doc[], + so if ndocs == 0, FT_INFO.doc[] must not be accessed. 
+ */ + dlist=(FT_INFO *)my_malloc(sizeof(FT_INFO)+ + sizeof(FT_DOC)* + (int)(aio.dtree.elements_in_tree-1), + MYF(0)); + if (!dlist) + goto err; + + dlist->please= (struct _ft_vft *) & _ma_ft_vft_nlq; + dlist->ndocs=aio.dtree.elements_in_tree; + dlist->curdoc=-1; + dlist->info=aio.info; + dptr=dlist->doc; + + tree_walk(&aio.dtree, (tree_walk_action) &walk_and_copy, + &dptr, left_root_right); + + if (flags & FT_SORTED) + my_qsort2(dlist->doc, dlist->ndocs, sizeof(FT_DOC), + (qsort2_cmp)&FT_DOC_cmp, 0); + +err: + delete_tree(&aio.dtree); + delete_tree(&wtree); + info->cur_row.lastpos= saved_lastpos; + DBUG_RETURN(dlist); +} + + +int maria_ft_nlq_read_next(FT_INFO *handler, char *record) +{ + MARIA_HA *info= (MARIA_HA *) handler->info; + + if (++handler->curdoc >= handler->ndocs) + { + --handler->curdoc; + return HA_ERR_END_OF_FILE; + } + + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + info->cur_row.lastpos= handler->doc[handler->curdoc].dpos; + if (!(*info->read_record)(info, (uchar *) record, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + return 0; + } + return my_errno; +} + + +float maria_ft_nlq_find_relevance(FT_INFO *handler, + uchar *record __attribute__((unused)), + uint length __attribute__((unused))) +{ + int a,b,c; + FT_DOC *docs=handler->doc; + MARIA_RECORD_POS docid= handler->info->cur_row.lastpos; + + if (docid == HA_POS_ERROR) + return -5.0; + + /* Assuming docs[] is sorted by dpos... 
*/ + + for (a=0, b=handler->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2) + { + if (docs[c].dpos > docid) + b=c; + else + a=c; + } + /* bounds check to avoid accessing unallocated handler->doc */ + if (a < handler->ndocs && docs[a].dpos == docid) + return (float) docs[a].weight; + else + return 0.0; +} + + +void maria_ft_nlq_close_search(FT_INFO *handler) +{ + my_free(handler, MYF(0)); +} + + +float maria_ft_nlq_get_relevance(FT_INFO *handler) +{ + return (float) handler->doc[handler->curdoc].weight; +} + + +void maria_ft_nlq_reinit_search(FT_INFO *handler) +{ + handler->curdoc=-1; +} + diff --git a/storage/maria/ma_ft_parser.c b/storage/maria/ma_ft_parser.c new file mode 100644 index 00000000000..b35c2227ca2 --- /dev/null +++ b/storage/maria/ma_ft_parser.c @@ -0,0 +1,417 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +#include "ma_ftdefs.h" + +typedef struct st_maria_ft_docstat { + FT_WORD *list; + uint uniq; + double sum; +} FT_DOCSTAT; + + +typedef struct st_my_maria_ft_parser_param +{ + TREE *wtree; + MEM_ROOT *mem_root; +} MY_FT_PARSER_PARAM; + + +static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2) +{ + return ha_compare_text(cs, (uchar*) w1->pos, w1->len, + (uchar*) w2->pos, w2->len, 0, 0); +} + +static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat) +{ + word->weight=LWS_IN_USE; + docstat->sum+=word->weight; + memcpy_fixed((docstat->list)++,word,sizeof(FT_WORD)); + return 0; +} + +/* transforms tree of words into the array, applying normalization */ + +FT_WORD * maria_ft_linearize(TREE *wtree, MEM_ROOT *mem_root) +{ + FT_WORD *wlist,*p; + FT_DOCSTAT docstat; + DBUG_ENTER("maria_ft_linearize"); + + if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)* + (1+wtree->elements_in_tree)))) + { + docstat.list=wlist; + docstat.uniq=wtree->elements_in_tree; + docstat.sum=0; + tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right); + } + delete_tree(wtree); + if (!wlist) + DBUG_RETURN(NULL); + + docstat.list->pos=NULL; + + for (p=wlist;p->pos;p++) + { + p->weight=PRENORM_IN_USE; + } + + for (p=wlist;p->pos;p++) + { + p->weight/=NORM_IN_USE; + } + + DBUG_RETURN(wlist); +} + +my_bool maria_ft_boolean_check_syntax_string(const uchar *str) +{ + uint i, j; + + if (!str || + (strlen((const char *) str) + 1 != sizeof(ft_boolean_syntax)) || + (str[0] != ' ' && str[1] != ' ')) + return 1; + for (i=0; i<sizeof(ft_boolean_syntax); i++) + { + /* limiting to 7-bit ascii only */ + if ((unsigned char)(str[i]) > 127 || + my_isalnum(default_charset_info, str[i])) + return 1; + for (j=0; j<i; j++) + if (str[i] == str[j] && (i != 11 || j != 10)) + return 1; + } + return 0; +} + +/* + RETURN VALUE + 0 - eof + 1 - word found + 2 - left bracket + 3 - right bracket + 4 - stopword found +*/ 
+uchar maria_ft_get_word(CHARSET_INFO *cs, const uchar **start, + const uchar *end, + FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) +{ + const uchar *doc= *start; + int ctype; + uint mwc, length; + int mbl; + + param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); + param->weight_adjust= param->wasign= 0; + param->type= FT_TOKEN_EOF; + + while (doc<end) + { + for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) + { + mbl= cs->cset->ctype(cs, &ctype, doc, end); + if (true_word_char(ctype, *doc)) + break; + if (*doc == FTB_RQUOT && param->quot) + { + param->quot= (char *) doc; + *start=doc+1; + param->type= FT_TOKEN_RIGHT_PAREN; + goto ret; + } + if (!param->quot) + { + if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) + { + /* param->prev=' '; */ + *start=doc+1; + if (*doc == FTB_LQUOT) + param->quot= (char *) *start; + param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN); + goto ret; + } + if (param->prev == ' ') + { + if (*doc == FTB_YES ) { param->yesno=+1; continue; } else + if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else + if (*doc == FTB_NO ) { param->yesno=-1; continue; } else + if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else + if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else + if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; } + } + } + param->prev=*doc; + param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); + param->weight_adjust= param->wasign= 0; + } + + mwc=length=0; + for (word->pos= doc; doc < end; length++, + doc+= (mbl > 0 ? mbl : (mbl < 0 ? 
-mbl : 1))) + { + mbl= cs->cset->ctype(cs, &ctype, doc, end); + if (true_word_char(ctype, *doc)) + mwc=0; + else if (!misc_word_char(*doc) || mwc) + break; + else + mwc++; + } + param->prev='A'; /* be sure *prev is true_word_char */ + word->len= (uint)(doc-word->pos) - mwc; + if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) + doc++; + + if (((length >= ft_min_word_len && !is_stopword((char *) word->pos, + word->len)) + || param->trunc) && length < ft_max_word_len) + { + *start=doc; + param->type= FT_TOKEN_WORD; + goto ret; + } + else if (length) /* make sure length > 0 (if start contains spaces only) */ + { + *start= doc; + param->type= FT_TOKEN_STOPWORD; + goto ret; + } + } + if (param->quot) + { + param->quot= (char *)(*start= doc); + param->type= 3; /* FT_RBR */ + goto ret; + } +ret: + return param->type; +} + +uchar maria_ft_simple_get_word(CHARSET_INFO *cs, const uchar **start, + const uchar *end, FT_WORD *word, + my_bool skip_stopwords) +{ + const uchar *doc= *start; + uint mwc, length; + int ctype, mbl; + DBUG_ENTER("maria_ft_simple_get_word"); + + do + { + for (;; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) + { + if (doc >= end) + DBUG_RETURN(0); + mbl= cs->cset->ctype(cs, &ctype, doc, end); + if (true_word_char(ctype, *doc)) + break; + } + + mwc= length= 0; + for (word->pos= doc; doc < end; length++, + doc+= (mbl > 0 ? mbl : (mbl < 0 ? 
-mbl : 1))) + { + mbl= cs->cset->ctype(cs, &ctype, doc, end); + if (true_word_char(ctype, *doc)) + mwc= 0; + else if (!misc_word_char(*doc) || mwc) + break; + else + mwc++; + } + + word->len= (uint)(doc-word->pos) - mwc; + + if (skip_stopwords == FALSE || + (length >= ft_min_word_len && length < ft_max_word_len && + !is_stopword((char *) word->pos, word->len))) + { + *start= doc; + DBUG_RETURN(1); + } + } while (doc < end); + DBUG_RETURN(0); +} + +void maria_ft_parse_init(TREE *wtree, CHARSET_INFO *cs) +{ + DBUG_ENTER("maria_ft_parse_init"); + if (!is_tree_inited(wtree)) + init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp,0, NULL, + (void*) cs); + DBUG_VOID_RETURN; +} + + +static int maria_ft_add_word(MYSQL_FTPARSER_PARAM *param, + const uchar *word, mysql_ft_size_t word_len, + MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info + __attribute__((unused))) +{ + TREE *wtree; + FT_WORD w; + MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; + DBUG_ENTER("maria_ft_add_word"); + wtree= ft_param->wtree; + if (param->flags & MYSQL_FTFLAGS_NEED_COPY) + { + uchar *ptr; + DBUG_ASSERT(wtree->with_delete == 0); + ptr= (uchar *)alloc_root(ft_param->mem_root, word_len); + memcpy(ptr, word, word_len); + w.pos= ptr; + } + else + w.pos= word; + w.len= word_len; + if (!tree_insert(wtree, &w, 0, wtree->custom_arg)) + { + delete_tree(wtree); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + +static int maria_ft_parse_internal(MYSQL_FTPARSER_PARAM *param, + const uchar *doc_arg, + mysql_ft_size_t doc_len) +{ + const uchar *doc= doc_arg; + const uchar *end= doc + doc_len; + MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; + TREE *wtree= ft_param->wtree; + FT_WORD w; + DBUG_ENTER("maria_ft_parse_internal"); + + while (maria_ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE)) + if (param->mysql_add_word(param, w.pos, w.len, 0)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +int maria_ft_parse(TREE *wtree, uchar *doc, size_t doclen, + struct st_mysql_ftparser *parser, + 
MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root) +{ + MY_FT_PARSER_PARAM my_param; + DBUG_ENTER("maria_ft_parse"); + DBUG_ASSERT(parser); + my_param.wtree= wtree; + my_param.mem_root= mem_root; + + param->mysql_parse= maria_ft_parse_internal; + param->mysql_add_word= maria_ft_add_word; + param->mysql_ftparam= &my_param; + param->cs= wtree->custom_arg; + param->doc= doc; + param->length= doclen; + param->mode= MYSQL_FTPARSER_SIMPLE_MODE; + DBUG_RETURN(parser->parse(param)); +} + + +#define MAX_PARAM_NR 2 + +MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info) +{ + if (!info->ftparser_param) + { + /* +. info->ftparser_param can not be zero after the initialization, + because it always includes built-in fulltext parser. And built-in + parser can be called even if the table has no fulltext indexes and + no varchar/text fields. + + ftb_find_relevance... parser (ftb_find_relevance_parse, + ftb_find_relevance_add_word) calls ftb_check_phrase... parser + (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2. + */ + info->ftparser_param= (MYSQL_FTPARSER_PARAM *) + my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) * + info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL)); + init_alloc_root(&info->ft_memroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0); + } + return info->ftparser_param; +} + + +MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info, + uint keynr, uint paramnr) +{ + uint32 ftparser_nr; + struct st_mysql_ftparser *parser; + + if (!maria_ftparser_alloc_param(info)) + return 0; + + if (keynr == NO_SUCH_KEY) + { + ftparser_nr= 0; + parser= &ft_default_parser; + } + else + { + ftparser_nr= info->s->keyinfo[keynr].ftkey_nr; + parser= info->s->keyinfo[keynr].parser; + } + DBUG_ASSERT(paramnr < MAX_PARAM_NR); + ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr; + if (! 
info->ftparser_param[ftparser_nr].mysql_add_word) + { + /* Note, that mysql_add_word is used here as a flag: + mysql_add_word == 0 - parser is not initialized + mysql_add_word != 0 - parser is initialized, or no + initialization needed. */ + info->ftparser_param[ftparser_nr].mysql_add_word= + (int (*)(struct st_mysql_ftparser_param *, const uchar *, + mysql_ft_size_t, MYSQL_FTPARSER_BOOLEAN_INFO *)) 1; + if (parser->init && parser->init(&info->ftparser_param[ftparser_nr])) + return 0; + } + return &info->ftparser_param[ftparser_nr]; +} + + +void maria_ftparser_call_deinitializer(MARIA_HA *info) +{ + uint i, j, keys= info->s->state.header.keys; + free_root(&info->ft_memroot, MYF(0)); + if (! info->ftparser_param) + return; + for (i= 0; i < keys; i++) + { + MARIA_KEYDEF *keyinfo= &info->s->keyinfo[i]; + for (j=0; j < MAX_PARAM_NR; j++) + { + MYSQL_FTPARSER_PARAM *ftparser_param= + &info->ftparser_param[keyinfo->ftkey_nr*MAX_PARAM_NR + j]; + if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word) + { + if (keyinfo->parser->deinit) + keyinfo->parser->deinit(ftparser_param); + ftparser_param->mysql_add_word= 0; + } + else + break; + } + } +} diff --git a/storage/maria/ma_ft_stem.c b/storage/maria/ma_ft_stem.c new file mode 100644 index 00000000000..06fc0b2df6c --- /dev/null +++ b/storage/maria/ma_ft_stem.c @@ -0,0 +1,18 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +/* multilingual stem */ diff --git a/storage/maria/ma_ft_test1.c b/storage/maria/ma_ft_test1.c new file mode 100644 index 00000000000..4c98e766234 --- /dev/null +++ b/storage/maria/ma_ft_test1.c @@ -0,0 +1,317 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code + added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */ + +#include "ma_ftdefs.h" +#include "maria_ft_test1.h" +#include <my_getopt.h> + +static int key_field=FIELD_VARCHAR,extra_field=FIELD_SKIP_ENDSPACE; +static uint key_length=200,extra_length=50; +static int key_type=HA_KEYTYPE_TEXT; +static int verbose=0,silent=0,skip_update=0, + no_keys=0,no_stopwords=0,no_search=0,no_fulltext=0; +static int create_flag=0,error=0; + +#define MAX_REC_LENGTH 300 +static char record[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH]; + +static int run_test(const char *filename); +static void get_options(int argc, char *argv[]); +static void create_record(char *, int); +static void usage(); + +static struct my_option my_long_options[] = +{ + {"", 'v', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'v', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 's', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'N', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'K', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'F', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'U', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '#', "", 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +int main(int argc, char *argv[]) +{ + MY_INIT(argv[0]); + + get_options(argc,argv); + maria_init(); + + exit(run_test("FT1")); +} + +static MARIA_COLUMNDEF recinfo[3]; +static MARIA_KEYDEF keyinfo[2]; +static HA_KEYSEG keyseg[10]; + +static int run_test(const char *filename) +{ + MARIA_HA *file; + int i,j; + my_off_t pos; + + bzero((char*) recinfo,sizeof(recinfo)); + 
+ /* First define 2 columns */ + recinfo[0].type=extra_field; + recinfo[0].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : + extra_length); + if (extra_field == FIELD_VARCHAR) + recinfo[0].length+= HA_VARCHAR_PACKLENGTH(extra_length); + recinfo[1].type=key_field; + recinfo[1].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr : + key_length); + if (key_field == FIELD_VARCHAR) + recinfo[1].length+= HA_VARCHAR_PACKLENGTH(key_length); + + /* Define a key over the first column */ + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[0].seg[0].type= key_type; + keyinfo[0].seg[0].flag= (key_field == FIELD_BLOB) ? HA_BLOB_PART: + (key_field == FIELD_VARCHAR) ? HA_VAR_LENGTH_PART:0; + keyinfo[0].seg[0].start=recinfo[0].length; + keyinfo[0].seg[0].length=key_length; + keyinfo[0].seg[0].null_bit= 0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].language= default_charset_info->number; + keyinfo[0].flag = (no_fulltext?HA_PACK_KEY:HA_FULLTEXT); + + if (!silent) + printf("- Creating isam-file\n"); + if (maria_create(filename,(no_keys?0:1),keyinfo,2,recinfo,0,NULL, + (MARIA_CREATE_INFO*) 0, create_flag)) + goto err; + if (!(file=maria_open(filename,2,0))) + goto err; + + if (!silent) + printf("- %s stopwords\n",no_stopwords?"Skipping":"Initializing"); + maria_ft_init_stopwords(no_stopwords?NULL:maria_ft_precompiled_stopwords); + + if (!silent) + printf("- Writing key:s\n"); + + my_errno=0; + for (i=NUPD ; i<NDATAS; i++ ) + { + create_record(record,i); + error=maria_write(file,record); + if (verbose || error) + printf("I= %2d maria_write: %d errno: %d, record: %s\n", + i,error,my_errno,data[i].f0); + } + + if (!skip_update) + { + if (!silent) + printf("- Updating rows\n"); + + /* Read through all rows and update them */ + pos=(ha_rows) 0; + i=0; + while ((error=maria_rrnd(file,read_record,pos)) == 0) + { + create_record(record,NUPD-i-1); + if (maria_update(file,read_record,record)) + 
{ + printf("Can't update row: %.*s, error: %d\n", + keyinfo[0].seg[0].length,record,my_errno); + } + if(++i == NUPD) break; + pos=HA_OFFSET_ERROR; + } + if (i != NUPD) + printf("Found %d of %d rows\n", i,NUPD); + } + + if (maria_close(file)) goto err; + if(no_search) return 0; + if (!silent) + printf("- Reopening file\n"); + if (!(file=maria_open(filename,2,0))) goto err; + if (!silent) + printf("- Reading rows with key\n"); + for (i=0 ; i < NQUERIES ; i++) + { + FT_DOCLIST *result; + result=maria_ft_nlq_init_search(file,0,(char*) query[i],strlen(query[i]),1); + if(!result) + { + printf("Query %d: `%s' failed with errno %3d\n",i,query[i],my_errno); + continue; + } + printf("Query %d: `%s'. Found: %d. Top five documents:\n", + i,query[i],result->ndocs); + for (j=0;j<5;j++) + { + double w; int err; + err= maria_ft_nlq_read_next(result, read_record); + if (err==HA_ERR_END_OF_FILE) + { + printf("No more matches!\n"); + break; + } + else if (err) + { + printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno); + break; + } + w=maria_ft_nlq_get_relevance(result); + if (key_field == FIELD_VARCHAR) + { + uint l; + char *p; + p=recinfo[0].length+read_record; + l=uint2korr(p); + printf("%10.7f: %.*s\n",w,(int) l,p+2); + } + else + printf("%10.7f: %.*s\n",w,recinfo[1].length, + recinfo[0].length+read_record); + } + maria_ft_nlq_close_search(result); + } + + if (maria_close(file)) goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return (0); +err: + printf("got error: %3d when using maria-database\n",my_errno); + return 1; /* skip warning */ +} + +static char blob_key[MAX_REC_LENGTH]; +/* static char blob_record[MAX_REC_LENGTH+20*20]; */ + +void create_record(char *pos, int n) +{ + bzero((char*) pos,MAX_REC_LENGTH); + if (recinfo[0].type == FIELD_BLOB) + { + uint tmp; + char *ptr; + strnmov(blob_key,data[n].f0,keyinfo[0].seg[0].length); + tmp=strlen(blob_key); + int4store(pos,tmp); + ptr=blob_key; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + 
pos+=recinfo[0].length; + } + else if (recinfo[0].type == FIELD_VARCHAR) + { + uint tmp; + /* -1 is here because pack_length is stored in seg->length */ + uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1); + strnmov(pos+pack_length,data[n].f0,keyinfo[0].seg[0].length); + tmp=strlen(pos+pack_length); + if (pack_length == 1) + *pos= (char) tmp; + else + int2store(pos,tmp); + pos+=recinfo[0].length; + } + else + { + strnmov(pos,data[n].f0,keyinfo[0].seg[0].length); + pos+=recinfo[0].length; + } + if (recinfo[1].type == FIELD_BLOB) + { + uint tmp; + char *ptr; + strnmov(blob_key,data[n].f2,keyinfo[0].seg[0].length); + tmp=strlen(blob_key); + int4store(pos,tmp); + ptr=blob_key; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + pos+=recinfo[1].length; + } + else if (recinfo[1].type == FIELD_VARCHAR) + { + uint tmp; + /* -1 is here because pack_length is stored in seg->length */ + uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1); + strnmov(pos+pack_length,data[n].f2,keyinfo[0].seg[0].length); + tmp=strlen(pos+1); + if (pack_length == 1) + *pos= (char) tmp; + else + int2store(pos,tmp); + pos+=recinfo[1].length; + } + else + { + strnmov(pos,data[n].f2,keyinfo[0].seg[0].length); + pos+=recinfo[1].length; + } +} + + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument) +{ + switch(optid) { + case 'v': verbose=1; break; + case 's': silent=1; break; + case 'F': no_fulltext=1; no_search=1; + case 'U': skip_update=1; break; + case 'K': no_keys=no_search=1; break; + case 'N': no_search=1; break; + case 'S': no_stopwords=1; break; + case '#': + DBUG_PUSH (argument); + break; + case 'V': + case '?': + case 'h': + usage(); + exit(1); + } + return 0; +} + +/* Read options */ + +static void get_options(int argc,char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(ho_error); + return; +} /* get options */ + + +static void usage() 
+{ + printf("%s [options]\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} diff --git a/storage/maria/ma_ft_test1.h b/storage/maria/ma_ft_test1.h new file mode 100644 index 00000000000..5883c42f5c5 --- /dev/null +++ b/storage/maria/ma_ft_test1.h @@ -0,0 +1,420 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +#define NUPD 20 +#define NDATAS 389 +struct { const char *f0, *f2; } data[NDATAS] = { + {"1", "General Information about MySQL"}, + {"1.1", "What is MySQL?"}, + {"1.2", "About this manual"}, + {"1.3", "History of MySQL"}, + {"1.4", "The main features of MySQL"}, + {"1.5", "General SQL information and tutorials"}, + {"1.6", "Useful MySQL-related links"}, + {"1.7", "What are stored procedures and triggers and so on?"}, + {"2", "MySQL mailing lists and how to ask questions/give error (bug) reports"}, + {"2.1", "Subscribing to/un-subscribing from the MySQL mailing list"}, + {"2.2", "Asking questions or reporting bugs"}, + {"2.3", "I think I have found a bug. 
What information do you need to help me?"}, + {"2.3.1", "MySQL keeps crashing"}, + {"2.4", "Guidelines for answering questions on the mailing list"}, + {"3", "Licensing or When do I have/want to pay for MySQL?"}, + {"3.1", "How much does MySQL cost?"}, + {"3.2", "How do I get commercial support?"}, + {"3.2.1", "Types of commercial support"}, + {"3.2.1.1", "Basic email support"}, + {"3.2.1.2", "Extended email support"}, +/*------------------------------- NUPD=20 -------------------------------*/ + {"3.2.1.3", "Asking: Login support"}, + {"3.2.1.4", "Extended login support"}, + {"3.3", "How do I pay for licenses/support?"}, + {"3.4", "Who do I contact when I want more information about licensing/support?"}, + {"3.5", "What Copyright does MySQL use?"}, + {"3.6", "When may I distribute MySQL commercially without a fee?"}, + {"3.7", "I want to sell a product that can be configured to use MySQL"}, + {"3.8", "I am running a commercial web server using MySQL"}, + {"3.9", "Do I need a license to sell commercial Perl/tcl/PHP/Web+ etc applications?"}, + {"3.10", "Possible future changes in the licensing"}, + {"4", "Compiling and installing MySQL"}, + {"4.1", "How do I get MySQL?"}, + {"4.2", "Which MySQL version should I use?"}, + {"4.3", "How/when will you release updates?"}, + {"4.4", "What operating systems does MySQL support?"}, + {"4.5", "Compiling MySQL from source code"}, + {"4.5.1", "Quick installation overview"}, + {"4.5.2", "Usual configure switches"}, + {"4.5.3", "Applying a patch"}, + {"4.6", "Problems compiling?"}, + {"4.7", "General compilation notes"}, + {"4.8", "MIT-pthreads notes (FreeBSD)"}, + {"4.9", "Perl installation comments"}, + {"4.10", "Special things to consider for some machine/OS combinations"}, + {"4.10.1", "Solaris notes"}, + {"4.10.2", "SunOS 4 notes"}, + {"4.10.3", "Linux notes for all versions"}, + {"4.10.3.1", "Linux-x86 notes"}, + {"4.10.3.2", "RedHat 5.0"}, + {"4.10.3.3", "RedHat 5.1"}, + {"4.10.3.4", "Linux-Sparc notes"}, + {"4.10.3.5", 
"Linux-Alpha notes"}, + {"4.10.3.6", "MkLinux notes"}, + {"4.10.4", "Alpha-DEC-Unix notes"}, + {"4.10.5", "Alpha-DEC-OSF1 notes"}, + {"4.10.6", "SGI-IRIX notes"}, + {"4.10.7", "FreeBSD notes"}, + {"4.10.7.1", "FreeBSD-3.0 notes"}, + {"4.10.8", "BSD/OS 2.# notes"}, + {"4.10.8.1", "BSD/OS 3.# notes"}, + {"4.10.9", "SCO notes"}, + {"4.10.10", "SCO Unixware 7.0 notes"}, + {"4.10.11", "IBM-AIX notes"}, + {"4.10.12", "HP-UX notes"}, + {"4.11", "TcX binaries"}, + {"4.12", "Win32 notes"}, + {"4.13", "Installation instructions for MySQL binary releases"}, + {"4.13.1", "How to get MySQL Perl support working"}, + {"4.13.2", "Linux notes"}, + {"4.13.3", "HP-UX notes"}, + {"4.13.4", "Linking client libraries"}, + {"4.14", "Problems running mysql_install_db"}, + {"4.15", "Problems starting MySQL"}, + {"4.16", "Automatic start/stop of MySQL"}, + {"4.17", "Option files"}, + {"5", "How standards-compatible is MySQL?"}, + {"5.1", "What extensions has MySQL to ANSI SQL92?"}, + {"5.2", "What functionality is missing in MySQL?"}, + {"5.2.1", "Sub-selects"}, + {"5.2.2", "SELECT INTO TABLE"}, + {"5.2.3", "Transactions"}, + {"5.2.4", "Triggers"}, + {"5.2.5", "Foreign Keys"}, + {"5.2.5.1", "Some reasons NOT to use FOREIGN KEYS"}, + {"5.2.6", "Views"}, + {"5.2.7", "-- as start of a comment"}, + {"5.3", "What standards does MySQL follow?"}, + {"5.4", "What functions exist only for compatibility?"}, + {"5.5", "Limitations of BLOB and TEXT types"}, + {"5.6", "How to cope without COMMIT-ROLLBACK"}, + {"6", "The MySQL access privilege system"}, + {"6.1", "What the privilege system does"}, + {"6.2", "Connecting to the MySQL server"}, + {"6.2.1", "Keeping your password secure"}, + {"6.3", "Privileges provided by MySQL"}, + {"6.4", "How the privilege system works"}, + {"6.5", "The privilege tables"}, + {"6.6", "Setting up the initial MySQL privileges"}, + {"6.7", "Adding new user privileges to MySQL"}, + {"6.8", "An example permission setup"}, + {"6.9", "Causes of Access denied errors"}, + {"6.10", 
"How to make MySQL secure against crackers"}, + {"7", "MySQL language reference"}, + {"7.1", "Literals: how to write strings and numbers"}, + {"7.1.1", "Strings"}, + {"7.1.2", "Numbers"}, + {"7.1.3", "NULL values"}, + {"7.1.4", "Database, table, index, column and alias names"}, + {"7.1.4.1", "Case sensitivity in names"}, + {"7.2", "Column types"}, + {"7.2.1", "Column type storage requirements"}, + {"7.2.5", "Numeric types"}, + {"7.2.6", "Date and time types"}, + {"7.2.6.1", "The DATE type"}, + {"7.2.6.2", "The TIME type"}, + {"7.2.6.3", "The DATETIME type"}, + {"7.2.6.4", "The TIMESTAMP type"}, + {"7.2.6.5", "The YEAR type"}, + {"7.2.6.6", "Miscellaneous date and time properties"}, + {"7.2.7", "String types"}, + {"7.2.7.1", "The CHAR and VARCHAR types"}, + {"7.2.7.2", "The BLOB and TEXT types"}, + {"7.2.7.3", "The ENUM type"}, + {"7.2.7.4", "The SET type"}, + {"7.2.8", "Choosing the right type for a column"}, + {"7.2.9", "Column indexes"}, + {"7.2.10", "Multiple-column indexes"}, + {"7.2.11", "Using column types from other database engines"}, + {"7.3", "Functions for use in SELECT and WHERE clauses"}, + {"7.3.1", "Grouping functions"}, + {"7.3.2", "Normal arithmetic operations"}, + {"7.3.3", "Bit functions"}, + {"7.3.4", "Logical operations"}, + {"7.3.5", "Comparison operators"}, + {"7.3.6", "String comparison functions"}, + {"7.3.7", "Control flow functions"}, + {"7.3.8", "Mathematical functions"}, + {"7.3.9", "String functions"}, + {"7.3.10", "Date and time functions"}, + {"7.3.11", "Miscellaneous functions"}, + {"7.3.12", "Functions for use with GROUP BY clauses"}, + {"7.4", "CREATE DATABASE syntax"}, + {"7.5", "DROP DATABASE syntax"}, + {"7.6", "CREATE TABLE syntax"}, + {"7.7", "ALTER TABLE syntax"}, + {"7.8", "OPTIMIZE TABLE syntax"}, + {"7.9", "DROP TABLE syntax"}, + {"7.10", "DELETE syntax"}, + {"7.11", "SELECT syntax"}, + {"7.12", "JOIN syntax"}, + {"7.13", "INSERT syntax"}, + {"7.14", "REPLACE syntax"}, + {"7.15", "LOAD DATA INFILE syntax"}, + {"7.16", 
"UPDATE syntax"}, + {"7.17", "USE syntax"}, + {"7.18", "SHOW syntax (Get information about tables, columns...)"}, + {"7.19", "EXPLAIN syntax (Get information about a SELECT)"}, + {"7.20", "DESCRIBE syntax (Get information about columns)"}, + {"7.21", "LOCK TABLES/UNLOCK TABLES syntax"}, + {"7.22", "SET OPTION syntax"}, + {"7.23", "GRANT syntax (Compatibility function)"}, + {"7.24", "CREATE INDEX syntax (Compatibility function)"}, + {"7.25", "DROP INDEX syntax (Compatibility function)"}, + {"7.26", "Comment syntax"}, + {"7.27", "CREATE FUNCTION/DROP FUNCTION syntax"}, + {"7.28", "Is MySQL picky about reserved words?"}, + {"8", "Example SQL queries"}, + {"8.1", "Queries from twin project"}, + {"8.1.1", "Find all non-distributed twins"}, + {"8.1.2", "Show a table on twin pair status"}, + {"9", "How safe/stable is MySQL?"}, + {"9.1", "How stable is MySQL?"}, + {"9.2", "Why are there is so many releases of MySQL?"}, + {"9.3", "Checking a table for errors"}, + {"9.4", "How to repair tables"}, + {"9.5", "Is there anything special to do when upgrading/downgrading MySQL?"}, + {"9.5.1", "Upgrading from a 3.21 version to 3.22"}, + {"9.5.2", "Upgrading from a 3.20 version to 3.21"}, + {"9.5.3", "Upgrading to another architecture"}, + {"9.6", "Year 2000 compliance"}, + {"10", "MySQL Server functions"}, + {"10.1", "What languages are supported by MySQL?"}, + {"10.1.1", "Character set used for data & sorting"}, + {"10.2", "The update log"}, + {"10.3", "How big can MySQL tables be?"}, + {"11", "Getting maximum performance from MySQL"}, + {"11.1", "How does one change the size of MySQL buffers?"}, + {"11.2", "How compiling and linking affects the speed of MySQL"}, + {"11.3", "How does MySQL use memory?"}, + {"11.4", "How does MySQL use indexes?"}, + {"11.5", "What optimizations are done on WHERE clauses?"}, + {"11.6", "How does MySQL open & close tables?"}, + {"11.6.0.1", "What are the drawbacks of creating possibly thousands of tables in a database?"}, + {"11.7", "How does MySQL 
lock tables?"}, + {"11.8", "How should I arrange my table to be as fast/small as possible?"}, + {"11.9", "What affects the speed of INSERT statements?"}, + {"11.10", "What affects the speed DELETE statements?"}, + {"11.11", "How do I get MySQL to run at full speed?"}, + {"11.12", "What are the different row formats? Or, when should VARCHAR/CHAR be used?"}, + {"11.13", "Why so many open tables?"}, + {"12", "MySQL benchmark suite"}, + {"13", "MySQL Utilites"}, + {"13.1", "Overview of the different MySQL programs"}, + {"13.2", "The MySQL table check, optimize and repair program"}, + {"13.2.1", "isamchk memory use"}, + {"13.2.2", "Getting low-level table information"}, + {"13.3", "The MySQL compressed read-only table generator"}, + {"14", "Adding new functions to MySQL"}, + {"15", "MySQL ODBC Support"}, + {"15.1", "Operating systems supported by MyODBC"}, + {"15.2", "How to report problems with MyODBC"}, + {"15.3", "Programs known to work with MyODBC"}, + {"15.4", "How to fill in the various fields in the ODBC administrator program"}, + {"15.5", "How to get the value of an AUTO_INCREMENT column in ODBC"}, + {"16", "Problems and common errors"}, + {"16.1", "Some common errors when using MySQL"}, + {"16.1.1", "MySQL server has gone away error"}, + {"16.1.2", "Can't connect to local MySQL server error"}, + {"16.1.3", "Out of memory error"}, + {"16.1.4", "Packet too large error"}, + {"16.1.5", "The table is full error"}, + {"16.1.6", "Commands out of sync error in client"}, + {"16.1.7", "Removing user error"}, + {"16.2", "How MySQL handles a full disk"}, + {"16.3", "How to run SQL commands from a text file"}, + {"16.4", "Where MySQL stores temporary files"}, + {"16.5", "Access denied error"}, + {"16.6", "How to run MySQL as a normal user"}, + {"16.7", "Problems with file permissions"}, + {"16.8", "File not found"}, + {"16.9", "Problems using DATE columns"}, + {"16.10", "Case sensitivity in searches"}, + {"16.11", "Problems with NULL values"}, + {"17", "Solving some common 
problems with MySQL"}, + {"17.1", "Database replication"}, + {"17.2", "Database backups"}, + {"18", "MySQL client tools and API's"}, + {"18.1", "MySQL C API"}, + {"18.2", "C API datatypes"}, + {"18.3", "C API function overview"}, + {"18.4", "C API function descriptions"}, + {"18.4.1", "mysql_affected_rows()"}, + {"18.4.2", "mysql_close()"}, + {"18.4.3", "mysql_connect()"}, + {"18.4.4", "mysql_create_db()"}, + {"18.4.5", "mysql_data_seek()"}, + {"18.4.6", "mysql_debug()"}, + {"18.4.7", "mysql_drop_db()"}, + {"18.4.8", "mysql_dump_debug_info()"}, + {"18.4.9", "mysql_eof()"}, + {"18.4.10", "mysql_errno()"}, + {"18.4.11", "mysql_error()"}, + {"18.4.12", "mysql_escape_string()"}, + {"18.4.13", "mysql_fetch_field()"}, + {"18.4.14", "mysql_fetch_fields()"}, + {"18.4.15", "mysql_fetch_field_direct()"}, + {"18.4.16", "mysql_fetch_lengths()"}, + {"18.4.17", "mysql_fetch_row()"}, + {"18.4.18", "mysql_field_seek()"}, + {"18.4.19", "mysql_field_tell()"}, + {"18.4.20", "mysql_free_result()"}, + {"18.4.21", "mysql_get_client_info()"}, + {"18.4.22", "mysql_get_host_info()"}, + {"18.4.23", "mysql_get_proto_info()"}, + {"18.4.24", "mysql_get_server_info()"}, + {"18.4.25", "mysql_info()"}, + {"18.4.26", "mysql_init()"}, + {"18.4.27", "mysql_insert_id()"}, + {"18.4.28", "mysql_kill()"}, + {"18.4.29", "mysql_list_dbs()"}, + {"18.4.30", "mysql_list_fields()"}, + {"18.4.31", "mysql_list_processes()"}, + {"18.4.32", "mysql_list_tables()"}, + {"18.4.33", "mysql_num_fields()"}, + {"18.4.34", "mysql_num_rows()"}, + {"18.4.35", "mysql_query()"}, + {"18.4.36", "mysql_real_connect()"}, + {"18.4.37", "mysql_real_query()"}, + {"18.4.38", "mysql_reload()"}, + {"18.4.39", "mysql_row_tell()"}, + {"18.4.40", "mysql_select_db()"}, + {"18.4.41", "mysql_shutdown()"}, + {"18.4.42", "mysql_stat()"}, + {"18.4.43", "mysql_store_result()"}, + {"18.4.44", "mysql_thread_id()"}, + {"18.4.45", "mysql_use_result()"}, + {"18.4.46", "Why is it that after mysql_query() returns success, mysql_store_result() sometimes 
returns NULL?"}, + {"18.4.47", "What results can I get from a query?"}, + {"18.4.48", "How can I get the unique ID for the last inserted row?"}, + {"18.4.49", "Problems linking with the C API"}, + {"18.4.50", "How to make a thread-safe client"}, + {"18.5", "MySQL Perl API's"}, + {"18.5.1", "DBI with DBD::mysql"}, + {"18.5.1.1", "The DBI interface"}, + {"18.5.1.2", "More DBI/DBD information"}, + {"18.6", "MySQL Java connectivity (JDBC)"}, + {"18.7", "MySQL PHP API's"}, + {"18.8", "MySQL C++ API's"}, + {"18.9", "MySQL Python API's"}, + {"18.10", "MySQL TCL API's"}, + {"19", "How MySQL compares to other databases"}, + {"19.1", "How MySQL compares to mSQL"}, + {"19.1.1", "How to convert mSQL tools for MySQL"}, + {"19.1.2", "How mSQL and MySQL client/server communications protocols differ"}, + {"19.1.3", "How mSQL 2.0 SQL syntax differs from MySQL"}, + {"19.2", "How MySQL compares to PostgreSQL"}, + {"A", "Some users of MySQL"}, + {"B", "Contributed programs"}, + {"C", "Contributors to MySQL"}, + {"D", "MySQL change history"}, + {"19.3", "Changes in release 3.22.x (Alpha version)"}, + {"19.3.1", "Changes in release 3.22.7"}, + {"19.3.2", "Changes in release 3.22.6"}, + {"19.3.3", "Changes in release 3.22.5"}, + {"19.3.4", "Changes in release 3.22.4"}, + {"19.3.5", "Changes in release 3.22.3"}, + {"19.3.6", "Changes in release 3.22.2"}, + {"19.3.7", "Changes in release 3.22.1"}, + {"19.3.8", "Changes in release 3.22.0"}, + {"19.4", "Changes in release 3.21.x"}, + {"19.4.1", "Changes in release 3.21.33"}, + {"19.4.2", "Changes in release 3.21.32"}, + {"19.4.3", "Changes in release 3.21.31"}, + {"19.4.4", "Changes in release 3.21.30"}, + {"19.4.5", "Changes in release 3.21.29"}, + {"19.4.6", "Changes in release 3.21.28"}, + {"19.4.7", "Changes in release 3.21.27"}, + {"19.4.8", "Changes in release 3.21.26"}, + {"19.4.9", "Changes in release 3.21.25"}, + {"19.4.10", "Changes in release 3.21.24"}, + {"19.4.11", "Changes in release 3.21.23"}, + {"19.4.12", "Changes in release 
3.21.22"}, + {"19.4.13", "Changes in release 3.21.21a"}, + {"19.4.14", "Changes in release 3.21.21"}, + {"19.4.15", "Changes in release 3.21.20"}, + {"19.4.16", "Changes in release 3.21.19"}, + {"19.4.17", "Changes in release 3.21.18"}, + {"19.4.18", "Changes in release 3.21.17"}, + {"19.4.19", "Changes in release 3.21.16"}, + {"19.4.20", "Changes in release 3.21.15"}, + {"19.4.21", "Changes in release 3.21.14b"}, + {"19.4.22", "Changes in release 3.21.14a"}, + {"19.4.23", "Changes in release 3.21.13"}, + {"19.4.24", "Changes in release 3.21.12"}, + {"19.4.25", "Changes in release 3.21.11"}, + {"19.4.26", "Changes in release 3.21.10"}, + {"19.4.27", "Changes in release 3.21.9"}, + {"19.4.28", "Changes in release 3.21.8"}, + {"19.4.29", "Changes in release 3.21.7"}, + {"19.4.30", "Changes in release 3.21.6"}, + {"19.4.31", "Changes in release 3.21.5"}, + {"19.4.32", "Changes in release 3.21.4"}, + {"19.4.33", "Changes in release 3.21.3"}, + {"19.4.34", "Changes in release 3.21.2"}, + {"19.4.35", "Changes in release 3.21.0"}, + {"19.5", "Changes in release 3.20.x"}, + {"19.5.1", "Changes in release 3.20.18"}, + {"19.5.2", "Changes in release 3.20.17"}, + {"19.5.3", "Changes in release 3.20.16"}, + {"19.5.4", "Changes in release 3.20.15"}, + {"19.5.5", "Changes in release 3.20.14"}, + {"19.5.6", "Changes in release 3.20.13"}, + {"19.5.7", "Changes in release 3.20.11"}, + {"19.5.8", "Changes in release 3.20.10"}, + {"19.5.9", "Changes in release 3.20.9"}, + {"19.5.10", "Changes in release 3.20.8"}, + {"19.5.11", "Changes in release 3.20.7"}, + {"19.5.12", "Changes in release 3.20.6"}, + {"19.5.13", "Changes in release 3.20.3"}, + {"19.5.14", "Changes in release 3.20.0"}, + {"19.6", "Changes in release 3.19.x"}, + {"19.6.1", "Changes in release 3.19.5"}, + {"19.6.2", "Changes in release 3.19.4"}, + {"19.6.3", "Changes in release 3.19.3"}, + {"E", "Known errors and design deficiencies in MySQL"}, + {"F", "List of things we want to add to MySQL in the future (The TODO)"}, 
+ {"19.7", "Things that must done in the real near future"}, + {"19.8", "Things that have to be done sometime"}, + {"19.9", "Some things we don't have any plans to do"}, + {"G", "Comments on porting to other systems"}, + {"19.10", "Debugging MySQL"}, + {"19.11", "Comments about RTS threads"}, + {"19.12", "What is the difference between different thread packages?"}, + {"H", "Description of MySQL regular expression syntax"}, + {"I", "What is Unireg?"}, + {"J", "The MySQL server license"}, + {"K", "The MySQL license for Microsoft operating systems"}, + {"*", "SQL command, type and function index"}, + {"*", "Concept Index"} +}; + +#define NQUERIES 5 +const char *query[NQUERIES]={ + "mysql information and manual", + "upgrading from previous version", + "column indexes", + "against about after more right the with/without", /* stopwords test */ + "mysql license and copyright" +}; diff --git a/storage/maria/ma_ft_update.c b/storage/maria/ma_ft_update.c new file mode 100644 index 00000000000..f38990efab9 --- /dev/null +++ b/storage/maria/ma_ft_update.c @@ -0,0 +1,379 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +/* functions to work with full-text indices */ + +#include "ma_ftdefs.h" +#include <math.h> + +void _ma_ft_segiterator_init(MARIA_HA *info, uint keynr, const uchar *record, + FT_SEG_ITERATOR *ftsi) +{ + DBUG_ENTER("_ma_ft_segiterator_init"); + + ftsi->num=info->s->keyinfo[keynr].keysegs; + ftsi->seg=info->s->keyinfo[keynr].seg; + ftsi->rec=record; + DBUG_VOID_RETURN; +} + +void _ma_ft_segiterator_dummy_init(const uchar *record, uint len, + FT_SEG_ITERATOR *ftsi) +{ + DBUG_ENTER("_ma_ft_segiterator_dummy_init"); + + ftsi->num=1; + ftsi->seg=0; + ftsi->pos=record; + ftsi->len=len; + DBUG_VOID_RETURN; +} + +/* + This function breaks convention "return 0 in success" + but it's easier to use like this + + while(_ma_ft_segiterator()) + + so "1" means "OK", "0" means "EOF" +*/ + +uint _ma_ft_segiterator(register FT_SEG_ITERATOR *ftsi) +{ + DBUG_ENTER("_ma_ft_segiterator"); + + if (!ftsi->num) + DBUG_RETURN(0); + + ftsi->num--; + if (!ftsi->seg) + DBUG_RETURN(1); + + ftsi->seg--; + + if (ftsi->seg->null_bit && + (ftsi->rec[ftsi->seg->null_pos] & ftsi->seg->null_bit)) + { + ftsi->pos=0; + DBUG_RETURN(1); + } + ftsi->pos= ftsi->rec+ftsi->seg->start; + if (ftsi->seg->flag & HA_VAR_LENGTH_PART) + { + uint pack_length= (ftsi->seg->bit_start); + ftsi->len= (pack_length == 1 ? (uint) * ftsi->pos : + uint2korr(ftsi->pos)); + ftsi->pos+= pack_length; /* Skip VARCHAR length */ + DBUG_RETURN(1); + } + if (ftsi->seg->flag & HA_BLOB_PART) + { + ftsi->len= _ma_calc_blob_length(ftsi->seg->bit_start,ftsi->pos); + memcpy_fixed((char*) &ftsi->pos, ftsi->pos+ftsi->seg->bit_start, + sizeof(char*)); + DBUG_RETURN(1); + } + ftsi->len=ftsi->seg->length; + DBUG_RETURN(1); +} + + +/* parses a document i.e. 
calls maria_ft_parse for every keyseg */ + +uint _ma_ft_parse(TREE *parsed, MARIA_HA *info, uint keynr, const uchar *record, + MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root) +{ + FT_SEG_ITERATOR ftsi; + struct st_mysql_ftparser *parser; + DBUG_ENTER("_ma_ft_parse"); + + _ma_ft_segiterator_init(info, keynr, record, &ftsi); + + maria_ft_parse_init(parsed, info->s->keyinfo[keynr].seg->charset); + parser= info->s->keyinfo[keynr].parser; + while (_ma_ft_segiterator(&ftsi)) + { + /** @todo this casts ftsi.pos (const) to non-const */ + if (ftsi.pos) + if (maria_ft_parse(parsed, (uchar *)ftsi.pos, ftsi.len, parser, param, + mem_root)) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + +FT_WORD * _ma_ft_parserecord(MARIA_HA *info, uint keynr, const uchar *record, + MEM_ROOT *mem_root) +{ + TREE ptree; + MYSQL_FTPARSER_PARAM *param; + DBUG_ENTER("_ma_ft_parserecord"); + if (! (param= maria_ftparser_call_initializer(info, keynr, 0))) + DBUG_RETURN(NULL); + bzero((char*) &ptree, sizeof(ptree)); + param->flags= 0; + if (_ma_ft_parse(&ptree, info, keynr, record, param, mem_root)) + DBUG_RETURN(NULL); + + DBUG_RETURN(maria_ft_linearize(&ptree, mem_root)); +} + +static int _ma_ft_store(MARIA_HA *info, uint keynr, uchar *keybuf, + FT_WORD *wlist, my_off_t filepos) +{ + DBUG_ENTER("_ma_ft_store"); + + for (; wlist->pos; wlist++) + { + MARIA_KEY key; + _ma_ft_make_key(info, &key, keynr, keybuf, wlist, filepos); + if (_ma_ck_write(info, &key)) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + +static int _ma_ft_erase(MARIA_HA *info, uint keynr, uchar *keybuf, + FT_WORD *wlist, my_off_t filepos) +{ + uint err=0; + DBUG_ENTER("_ma_ft_erase"); + + for (; wlist->pos; wlist++) + { + MARIA_KEY key; + _ma_ft_make_key(info, &key, keynr, keybuf, wlist, filepos); + if (_ma_ck_delete(info, &key)) + err=1; + } + DBUG_RETURN(err); +} + +/* + Compares an appropriate parts of two WORD_KEY keys directly out of records + returns 1 if they are different +*/ + +#define THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT 1 
+#define GEE_THEY_ARE_ABSOLUTELY_IDENTICAL 0 + +int _ma_ft_cmp(MARIA_HA *info, uint keynr, const uchar *rec1, const uchar *rec2) +{ + FT_SEG_ITERATOR ftsi1, ftsi2; + CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset; + DBUG_ENTER("_ma_ft_cmp"); + + _ma_ft_segiterator_init(info, keynr, rec1, &ftsi1); + _ma_ft_segiterator_init(info, keynr, rec2, &ftsi2); + + while (_ma_ft_segiterator(&ftsi1) && _ma_ft_segiterator(&ftsi2)) + { + if ((ftsi1.pos != ftsi2.pos) && + (!ftsi1.pos || !ftsi2.pos || + ha_compare_text(cs, ftsi1.pos,ftsi1.len, + ftsi2.pos,ftsi2.len,0,0))) + DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT); + } + DBUG_RETURN(GEE_THEY_ARE_ABSOLUTELY_IDENTICAL); +} + + +/* update a document entry */ + +int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf, + const uchar *oldrec, const uchar *newrec, my_off_t pos) +{ + int error= -1; + FT_WORD *oldlist,*newlist, *old_word, *new_word; + CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset; + int cmp, cmp2; + DBUG_ENTER("_ma_ft_update"); + + if (!(old_word=oldlist=_ma_ft_parserecord(info, keynr, oldrec, + &info->ft_memroot)) || + !(new_word=newlist=_ma_ft_parserecord(info, keynr, newrec, + &info->ft_memroot))) + goto err; + + error=0; + while(old_word->pos && new_word->pos) + { + cmp= ha_compare_text(cs, (uchar*) old_word->pos,old_word->len, + (uchar*) new_word->pos,new_word->len,0,0); + cmp2= cmp ? 
0 : (fabs(old_word->weight - new_word->weight) > 1.e-5); + + if (cmp < 0 || cmp2) + { + MARIA_KEY key; + _ma_ft_make_key(info, &key, keynr, keybuf, old_word, pos); + if (_ma_ck_delete(info, &key)) + { + error= -1; + goto err; + } + } + if (cmp > 0 || cmp2) + { + MARIA_KEY key; + _ma_ft_make_key(info, &key, keynr, keybuf, new_word,pos); + if ((error= _ma_ck_write(info, &key))) + goto err; + } + if (cmp<=0) old_word++; + if (cmp>=0) new_word++; + } + if (old_word->pos) + error= _ma_ft_erase(info,keynr,keybuf,old_word,pos); + else if (new_word->pos) + error= _ma_ft_store(info,keynr,keybuf,new_word,pos); + +err: + free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE)); + DBUG_RETURN(error); +} + + +/* adds a document to the collection */ + +int _ma_ft_add(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record, + my_off_t pos) +{ + int error= -1; + FT_WORD *wlist; + DBUG_ENTER("_ma_ft_add"); + DBUG_PRINT("enter",("keynr: %d",keynr)); + + if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot))) + error= _ma_ft_store(info,keynr,keybuf,wlist,pos); + free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE)); + DBUG_PRINT("exit",("Return: %d",error)); + DBUG_RETURN(error); +} + + +/* removes a document from the collection */ + +int _ma_ft_del(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record, + my_off_t pos) +{ + int error= -1; + FT_WORD *wlist; + DBUG_ENTER("_ma_ft_del"); + DBUG_PRINT("enter",("keynr: %d",keynr)); + + if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot))) + error= _ma_ft_erase(info,keynr,keybuf,wlist,pos); + free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE)); + DBUG_PRINT("exit",("Return: %d",error)); + DBUG_RETURN(error); +} + + +MARIA_KEY *_ma_ft_make_key(MARIA_HA *info, MARIA_KEY *key, uint keynr, + uchar *keybuf, + FT_WORD *wptr, my_off_t filepos) +{ + uchar buf[HA_FT_MAXBYTELEN+16]; + DBUG_ENTER("_ma_ft_make_key"); + +#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT + { + float weight=(float) 
((filepos==HA_OFFSET_ERROR) ? 0 : wptr->weight); + mi_float4store(buf,weight); + } +#else +#error +#endif + + int2store(buf+HA_FT_WLEN,wptr->len); + memcpy(buf+HA_FT_WLEN+2,wptr->pos,wptr->len); + /* Can't be spatial so it's ok to call _ma_make_key directly here */ + DBUG_RETURN(_ma_make_key(info, key, keynr, keybuf, buf, filepos, 0)); +} + + +/* + convert key value to ft2 +*/ + +my_bool _ma_ft_convert_to_ft2(MARIA_HA *info, MARIA_KEY *key) +{ + MARIA_SHARE *share= info->s; + my_off_t root; + DYNAMIC_ARRAY *da=info->ft1_to_ft2; + MARIA_KEYDEF *keyinfo=&share->ft2_keyinfo; + uchar *key_ptr= (uchar*) dynamic_array_ptr(da, 0), *end; + uint length, key_length; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + MARIA_KEY tmp_key; + MARIA_PAGE page; + DBUG_ENTER("_ma_ft_convert_to_ft2"); + + /* we'll generate one pageful at once, and insert the rest one-by-one */ + /* calculating the length of this page ...*/ + length=(keyinfo->block_length-2) / keyinfo->keylength; + set_if_smaller(length, da->elements); + length=length * keyinfo->keylength; + + get_key_full_length_rdonly(key_length, key->data); + while (_ma_ck_delete(info, key) == 0) + { + /* + nothing to do here. + _ma_ck_delete() will populate info->ft1_to_ft2 with deleted keys + */ + } + + /* creating pageful of keys */ + bzero(info->buff, share->keypage_header); + _ma_store_keynr(share, info->buff, keyinfo->key_nr); + _ma_store_page_used(share, info->buff, length + share->keypage_header); + memcpy(info->buff + share->keypage_header, key_ptr, length); + info->keyread_buff_used= info->page_changed=1; /* info->buff is used */ + /** + @todo RECOVERY BUG this is not logged yet. Ok as this code is never + called, but soon it will be. 
+ */ + if ((root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == HA_OFFSET_ERROR) + DBUG_RETURN(1); + + _ma_page_setup(&page, info, keyinfo, root, info->buff); + if (_ma_write_keypage(&page, page_link->write_lock, DFLT_INIT_HITS)) + DBUG_RETURN(1); + + /* inserting the rest of key values */ + end= (uchar*) dynamic_array_ptr(da, da->elements); + tmp_key.keyinfo= keyinfo; + tmp_key.data_length= keyinfo->keylength; + tmp_key.ref_length= 0; + tmp_key.flag= 0; + for (key_ptr+=length; key_ptr < end; key_ptr+=keyinfo->keylength) + { + tmp_key.data= key_ptr; + if (_ma_ck_real_write_btree(info, key, &root, SEARCH_SAME)) + DBUG_RETURN(1); + } + + /* now, writing the word key entry */ + ft_intXstore(key->data + key_length, - (int) da->elements); + _ma_dpointer(share, key->data + key_length + HA_FT_WLEN, root); + + DBUG_RETURN(_ma_ck_real_write_btree(info, key, + &share->state.key_root[key->keyinfo-> + key_nr], + SEARCH_SAME)); +} diff --git a/storage/maria/ma_ftdefs.h b/storage/maria/ma_ftdefs.h new file mode 100644 index 00000000000..4ce4e9e22ba --- /dev/null +++ b/storage/maria/ma_ftdefs.h @@ -0,0 +1,156 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */
+
+/* some definitions for full-text indices */
+
+#include "ma_fulltext.h"
+#include <m_ctype.h>
+#include <my_tree.h>
+#include <queues.h>
+#include <mysql/plugin.h>
+
+/*
+  true_word_char(): non-zero when 'ctype' has any of the _MY_U, _MY_L or
+  _MY_NMR bits set, or when 'character' is an underscore.
+  misc_word_char(): always 0.
+*/
+#define true_word_char(ctype, character) \
+                      ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \
+                       (character) == '_')
+#define misc_word_char(X) 0
+
+#define FT_MAX_WORD_LEN_FOR_SORT 31
+
+#define FTPARSER_MEMROOT_ALLOC_SIZE 65536
+
+#define COMPILE_STOPWORDS_IN
+
+/* Interested readers may consult SMART
+   (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z)
+   for an excellent implementation of the vector space model we use.
+   It also demonstrates the usage of different weighting techniques.
+   This code, though, is completely original and is not based on the
+   SMART code but was in some cases inspired by it.
+
+   NORM_PIVOT was taken from the article
+   A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization",
+   ACM SIGIR'96, 21-29, 1996
+ */
+
+/*
+  Selection of the formulas actually used: LWS_FOR_QUERY and each *_IN_USE
+  macro expand to one of the alternatives defined below.
+  The formulas refer to identifiers such as count, p, docstat, sum, sum2,
+  suml, doc_cnt and aio, which must be in scope where a macro is expanded.
+  NOTE(review): LWS/GWS presumably stand for local/global weighting scheme
+  - confirm.
+*/
+#define LWS_FOR_QUERY LWS_TF
+#define LWS_IN_USE LWS_LOG
+#define PRENORM_IN_USE PRENORM_AVG
+#define NORM_IN_USE NORM_PIVOT
+#define GWS_IN_USE GWS_PROB
+/*==============================================================*/
+#define LWS_TF (count)
+#define LWS_BINARY (count>0)
+#define LWS_SQUARE (count*count)
+#define LWS_LOG (count?(log( (double) count)+1):0)
+/*--------------------------------------------------------------*/
+#define PRENORM_NONE (p->weight)
+#define PRENORM_MAX (p->weight/docstat.max)
+#define PRENORM_AUG (0.4+0.6*p->weight/docstat.max)
+#define PRENORM_AVG (p->weight/docstat.sum*docstat.uniq)
+#define PRENORM_AVGLOG ((1+log(p->weight))/(1+log(docstat.sum/docstat.uniq)))
+/*--------------------------------------------------------------*/
+#define NORM_NONE (1)
+#define NORM_SUM (docstat.nsum)
+#define NORM_COS (sqrt(docstat.nsum2))
+
+#define PIVOT_VAL (0.0115)
+#define NORM_PIVOT (1+PIVOT_VAL*docstat.uniq)
+/*---------------------------------------------------------------*/
+#define GWS_NORM (1/sqrt(sum2))
+#define GWS_GFIDF (sum/doc_cnt)
+/* Mysterious, but w/o (double) GWS_IDF performs better :-o */
+#define GWS_IDF log(aio->info->state->records/doc_cnt)
+#define GWS_IDF1 log((double)aio->info->state->records/doc_cnt)
+#define GWS_PROB ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 )
+#define GWS_FREQ (1.0/doc_cnt)
+#define GWS_SQUARED pow(log((double)aio->info->state->records/doc_cnt),2)
+#define GWS_CUBIC pow(log((double)aio->info->state->records/doc_cnt),3)
+#define GWS_ENTROPY (1-(suml/sum-log(sum))/log(aio->info->state->records))
+/*=================================================================*/
+
+/*
+  Boolean search operators
+  Each macro names one character of ft_boolean_syntax[]; index 9 has no
+  macro here.
+*/
+#define FTB_YES (ft_boolean_syntax[0])
+#define FTB_EGAL (ft_boolean_syntax[1])
+#define FTB_NO (ft_boolean_syntax[2])
+#define FTB_INC (ft_boolean_syntax[3])
+#define FTB_DEC (ft_boolean_syntax[4])
+#define FTB_LBR (ft_boolean_syntax[5])
+#define FTB_RBR (ft_boolean_syntax[6])
+#define FTB_NEG (ft_boolean_syntax[7])
+#define FTB_TRUNC (ft_boolean_syntax[8])
+#define FTB_LQUOT (ft_boolean_syntax[10])
+#define FTB_RQUOT (ft_boolean_syntax[11])
+
+/*
+  One word of a parsed document. The word lists walked in ma_ft_update.c
+  end at the entry whose pos is NULL.
+*/
+typedef struct st_maria_ft_word {
+  /* start of the word's bytes */
+  const uchar * pos;
+  /* number of bytes at pos */
+  uint len;
+  /* weight of the word; _ma_ft_make_key() stores it in the key as a float */
+  double weight;
+} FT_WORD;
+
+int is_stopword(char *word, uint len);
+
+MARIA_KEY *_ma_ft_make_key(MARIA_HA *, MARIA_KEY *, uint , uchar *, FT_WORD *,
+                           my_off_t);
+
+uchar maria_ft_get_word(CHARSET_INFO *, const uchar **, const uchar *,
+                        FT_WORD *, MYSQL_FTPARSER_BOOLEAN_INFO *);
+uchar maria_ft_simple_get_word(CHARSET_INFO *, const uchar **, const uchar *,
+                               FT_WORD *, my_bool);
+
+/* State of an iteration over the key segments of a record */
+typedef struct _st_maria_ft_seg_iterator {
+  /* num: segments still to be delivered; len: length of the current data */
+  uint num, len;
+  /* segment pointer, decremented before each use; 0 for a dummy iterator */
+  HA_KEYSEG *seg;
+  /* rec: record being iterated; pos: current data, 0 for a NULL column */
+  const uchar *rec, *pos;
+} FT_SEG_ITERATOR;
+
+void _ma_ft_segiterator_init(MARIA_HA *, uint, const uchar *, FT_SEG_ITERATOR *);
+void _ma_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *);
+uint _ma_ft_segiterator(FT_SEG_ITERATOR *);
+
+void maria_ft_parse_init(TREE
*, CHARSET_INFO *); +int maria_ft_parse(TREE *, uchar *, size_t, struct st_mysql_ftparser *parser, + MYSQL_FTPARSER_PARAM *, MEM_ROOT *); +FT_WORD * maria_ft_linearize(TREE *, MEM_ROOT *); +FT_WORD * _ma_ft_parserecord(MARIA_HA *, uint, const uchar *, MEM_ROOT *); +uint _ma_ft_parse(TREE *, MARIA_HA *, uint, const uchar *, + MYSQL_FTPARSER_PARAM *, MEM_ROOT *); + +FT_INFO *maria_ft_init_nlq_search(MARIA_HA *, uint, uchar *, size_t, uint, + uchar *); +FT_INFO *maria_ft_init_boolean_search(MARIA_HA *, uint, uchar *, size_t, + CHARSET_INFO *); + +extern const struct _ft_vft _ma_ft_vft_nlq; +int maria_ft_nlq_read_next(FT_INFO *, char *); +float maria_ft_nlq_find_relevance(FT_INFO *, uchar *, uint); +void maria_ft_nlq_close_search(FT_INFO *); +float maria_ft_nlq_get_relevance(FT_INFO *); +my_off_t maria_ft_nlq_get_docid(FT_INFO *); +void maria_ft_nlq_reinit_search(FT_INFO *); + +extern const struct _ft_vft _ma_ft_vft_boolean; +int maria_ft_boolean_read_next(FT_INFO *, char *); +float maria_ft_boolean_find_relevance(FT_INFO *, uchar *, uint); +void maria_ft_boolean_close_search(FT_INFO *); +float maria_ft_boolean_get_relevance(FT_INFO *); +my_off_t maria_ft_boolean_get_docid(FT_INFO *); +void maria_ft_boolean_reinit_search(FT_INFO *); +MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info); +extern MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info, + uint keynr, + uint paramnr); +extern void maria_ftparser_call_deinitializer(MARIA_HA *info); diff --git a/storage/maria/ma_fulltext.h b/storage/maria/ma_fulltext.h new file mode 100644 index 00000000000..6e087990bd2 --- /dev/null +++ b/storage/maria/ma_fulltext.h @@ -0,0 +1,27 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +/* some definitions for full-text indices */ + +#include "maria_def.h" +#include "ft_global.h" + +int _ma_ft_cmp(MARIA_HA *, uint, const uchar *, const uchar *); +int _ma_ft_add(MARIA_HA *, uint, uchar *, const uchar *, my_off_t); +int _ma_ft_del(MARIA_HA *, uint, uchar *, const uchar *, my_off_t); + +my_bool _ma_ft_convert_to_ft2(MARIA_HA *, MARIA_KEY *); diff --git a/storage/maria/ma_info.c b/storage/maria/ma_info.c new file mode 100644 index 00000000000..1bbfa3cbf7e --- /dev/null +++ b/storage/maria/ma_info.c @@ -0,0 +1,142 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Return useful base information for an open table */ + +#include "maria_def.h" +#ifdef __WIN__ +#include <sys/stat.h> +#endif + + /* Get position to last record */ + +MARIA_RECORD_POS maria_position(MARIA_HA *info) +{ + return info->cur_row.lastpos; +} + + +/* Get information about the table */ +/* if flag == 2 one get current info (no sync from database */ + +int maria_status(MARIA_HA *info, register MARIA_INFO *x, uint flag) +{ + MY_STAT state; + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_status"); + + x->recpos= info->cur_row.lastpos; + if (flag == HA_STATUS_POS) + DBUG_RETURN(0); /* Compatible with ISAM */ + if (!(flag & HA_STATUS_NO_LOCK)) + { + pthread_mutex_lock(&share->intern_lock); + VOID(_ma_readinfo(info,F_RDLCK,0)); + fast_ma_writeinfo(info); + pthread_mutex_unlock(&share->intern_lock); + } + if (flag & HA_STATUS_VARIABLE) + { + x->records = info->state->records; + x->deleted = share->state.state.del; + x->delete_length = share->state.state.empty; + x->data_file_length = share->state.state.data_file_length; + x->index_file_length= share->state.state.key_file_length; + + x->keys = share->state.header.keys; + x->check_time = share->state.check_time; + x->mean_reclength = x->records ? 
+ (ulong) ((x->data_file_length - x->delete_length) /x->records) : + (ulong) share->min_pack_length; + } + if (flag & HA_STATUS_ERRKEY) + { + x->errkey= info->errkey; + x->dup_key_pos= info->dup_key_pos; + } + if (flag & HA_STATUS_CONST) + { + x->reclength = share->base.reclength; + x->max_data_file_length=share->base.max_data_file_length; + x->max_index_file_length=info->s->base.max_key_file_length; + x->filenr = info->dfile.file; + x->options = share->options; + x->create_time=share->state.create_time; + x->reflength= maria_get_pointer_length(share->base.max_data_file_length, + maria_data_pointer_size); + x->record_offset= (info->s->data_file_type == STATIC_RECORD ? + share->base.pack_reclength: 0); + x->sortkey= -1; /* No clustering */ + x->rec_per_key = share->state.rec_per_key_part; + x->key_map = share->state.key_map; + x->data_file_name = share->data_file_name.str; + x->index_file_name = share->index_file_name.str; + x->data_file_type = share->data_file_type; + } + if ((flag & HA_STATUS_TIME) && !my_fstat(info->dfile.file, &state, MYF(0))) + x->update_time=state.st_mtime; + else + x->update_time=0; + if (flag & HA_STATUS_AUTO) + { + x->auto_increment= share->state.auto_increment+1; + if (!x->auto_increment) /* This shouldn't happen */ + x->auto_increment= ~(ulonglong) 0; + } + DBUG_RETURN(0); +} + + +/* + Write a message to the error log. + + SYNOPSIS + _ma_report_error() + file_name Name of table file (e.g. index_file_name). + errcode Error number. + + DESCRIPTION + This function supplies my_error() with a table name. Most error + messages need one. Since string arguments in error messages are limited + to 64 characters by convention, we ensure that in case of truncation, + that the end of the index file path is in the message. This contains + the most valuable information (the table name and the database name). 
+ + RETURN + void +*/ + +void _ma_report_error(int errcode, const LEX_STRING *name) +{ + size_t length; + const char *file_name= name->str; + DBUG_ENTER("_ma_report_error"); + DBUG_PRINT("enter",("errcode %d, table '%s'", errcode, file_name)); + + if ((length= name->length) > 64) + { + /* we first remove the directory */ + size_t dir_length= dirname_length(file_name); + file_name+= dir_length; + if ((length-= dir_length) > 64) + { + /* still too long, chop start of table name */ + file_name+= length - 64; + } + } + + my_error(errcode, MYF(ME_NOREFRESH), file_name); + DBUG_VOID_RETURN; +} diff --git a/storage/maria/ma_init.c b/storage/maria/ma_init.c new file mode 100644 index 00000000000..902f06d93e5 --- /dev/null +++ b/storage/maria/ma_init.c @@ -0,0 +1,184 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Initialize an maria-database */ + +#include "maria_def.h" +#include <ft_global.h> +#include "ma_blockrec.h" +#include "trnman_public.h" +#include "ma_checkpoint.h" +#include <hash.h> + +void history_state_free(MARIA_STATE_HISTORY_CLOSED *closed_history) +{ + MARIA_STATE_HISTORY *history, *next; + + /* + Free all active history + In case of maria_open() this list should be empty as the history is moved + to handler->share. 
+ */ + for (history= closed_history->state_history; history ; history= next) + { + next= history->next; + my_free(history, MYF(0)); + } + my_free(closed_history, MYF(0)); +} + + +static int dummy_maria_create_trn_hook(MARIA_HA *info __attribute__((unused))) +{ + return 0; +} + +/* + Initialize maria + + SYNOPSIS + maria_init() + + TODO + Open log files and do recovery if need + + RETURN + 0 ok + # error number +*/ + +int maria_init(void) +{ + DBUG_ASSERT(maria_block_size && + maria_block_size % MARIA_MIN_KEY_BLOCK_LENGTH == 0); + if (!maria_inited) + { + maria_inited= TRUE; + pthread_mutex_init(&THR_LOCK_maria,MY_MUTEX_INIT_SLOW); + _ma_init_block_record_data(); + trnman_end_trans_hook= _ma_trnman_end_trans_hook; + maria_create_trn_hook= dummy_maria_create_trn_hook; + my_handler_error_register(); + } + hash_init(&maria_stored_state, &my_charset_bin, 32, + 0, sizeof(LSN), 0, (hash_free_key) history_state_free, 0); + DBUG_PRINT("info",("dummy_transaction_object: %p", + &dummy_transaction_object)); + return 0; +} + + +void maria_end(void) +{ + if (maria_inited) + { + TrID trid; + maria_inited= maria_multi_threaded= FALSE; + ft_free_stopwords(); + ma_checkpoint_end(); + if (translog_status == TRANSLOG_OK) + { + translog_soft_sync_end(); + translog_sync(); + } + if ((trid= trnman_get_max_trid()) > max_trid_in_control_file) + { + /* + Store max transaction id into control file, in case logs are removed + by user, or maria_chk wants to check tables (it cannot access max trid + from the log, as it cannot process REDOs). 
+ */ + (void)ma_control_file_write_and_force(last_checkpoint_lsn, last_logno, + trid, recovery_failures); + } + trnman_destroy(); + if (translog_status == TRANSLOG_OK) + translog_destroy(); + end_pagecache(maria_log_pagecache, TRUE); + end_pagecache(maria_pagecache, TRUE); + ma_control_file_end(); + pthread_mutex_destroy(&THR_LOCK_maria); + hash_free(&maria_stored_state); + } +} + +/** + Upgrade from older Aria versions: + + - In MariaDB 5.1, the name of the control file and log files had the + 'maria' prefix, now they have the 'aria' prefix. + + @return: 0 ok + 1 error + +*/ + +my_bool maria_upgrade() +{ + char name[FN_REFLEN], new_name[FN_REFLEN]; + DBUG_ENTER("maria_upgrade"); + + fn_format(name, "maria_log_control", maria_data_root, "", MYF(MY_WME)); + + if (!my_access(name,F_OK)) + { + /* + Old style control file found; Rename the control file and the log files. + We start by renaming all log files, so that if we get a crash + we will continue from where we left. + */ + uint i; + MY_DIR *dir= my_dir(maria_data_root, MYF(MY_WME)); + if (!dir) + DBUG_RETURN(1); + + my_message(HA_ERR_INITIALIZATION, + "Found old style Maria log files; " + "Converting them to Aria names", + MYF(ME_JUST_INFO)); + + for (i= 0; i < dir->number_off_files; i++) + { + const char *file= dir->dir_entry[i].name; + if (strncmp(file, "maria_log.", 10) == 0 && + file[10] >= '0' && file[10] <= '9' && + file[11] >= '0' && file[11] <= '9' && + file[12] >= '0' && file[12] <= '9' && + file[13] >= '0' && file[13] <= '9' && + file[14] >= '0' && file[14] <= '9' && + file[15] >= '0' && file[15] <= '9' && + file[16] >= '0' && file[16] <= '9' && + file[17] >= '0' && file[17] <= '9' && + file[18] == '\0') + { + /* Remove the 'm' in 'maria' */ + char old_logname[FN_REFLEN], new_logname[FN_REFLEN]; + fn_format(old_logname, file, maria_data_root, "", MYF(0)); + fn_format(new_logname, file+1, maria_data_root, "", MYF(0)); + if (my_rename(old_logname, new_logname, MYF(MY_WME))) + { + my_dirend(dir); + 
DBUG_RETURN(1); + } + } + } + my_dirend(dir); + + fn_format(new_name, CONTROL_FILE_BASE_NAME, maria_data_root, "", MYF(0)); + if (my_rename(name, new_name, MYF(MY_WME))) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} diff --git a/storage/maria/ma_key.c b/storage/maria/ma_key.c new file mode 100644 index 00000000000..ac23bf5fef6 --- /dev/null +++ b/storage/maria/ma_key.c @@ -0,0 +1,775 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Functions to handle keys */ + +#include "maria_def.h" +#include "m_ctype.h" +#include "ma_sp_defs.h" +#include "ma_blockrec.h" /* For ROW_FLAG_TRANSID */ +#include "trnman.h" +#ifdef HAVE_IEEEFP_H +#include <ieeefp.h> +#endif + +#define CHECK_KEYS /* Enable safety checks */ + +static int _ma_put_key_in_record(MARIA_HA *info, uint keynr, + my_bool unpack_blobs, uchar *record); + +#define FIX_LENGTH(cs, pos, length, char_length) \ + do { \ + if (length > char_length) \ + char_length= (uint) my_charpos(cs, pos, pos+length, char_length); \ + set_if_smaller(char_length,length); \ + } while(0) + + +/** + Store trid in a packed format as part of a key + + @fn transid_store_packed + @param info Maria handler + @param to End of key to which we should store a packed transid + @param trid Trid to be stored + + @notes + + Keys that have a transid has the lowest bit set for the 
last byte of the key + This function sets this bit for the key. + + Trid is max 6 bytes long + + First trid is converted to a smaller number by using + trid= trid - create_trid. + Trid is then shifted up one bit so that we can use the + lowest bit as a marker if it's followed by another trid. + + Trid is then stored as follows: + + if trid < 256-12 + one byte + else + one byte prefix length_of_trid_in_bytes + 249 followed by data + in high-byte-first order + + Prefix bytes 244 to 249 are reserved for negative transid, that can be used + when we pack transid relative to each other on a key block. + + We have to store transid in high-byte-first order so that we can compare + them unpacked byte per byte and as soon as we find a difference we know + which is smaller. + + For example, assuming we have the following data: + + key_data: 1 (4 byte integer) + pointer_to_row: 2 << 8 + 3 = 515 (page 2, row 3) + table_create_transid 1000 Defined at create table time and + stored in table definition + transid 1010 Transaction that created row + delete_transid 2011 Transaction that deleted row + + In addition we assume the table is created with a data pointer length + of 4 bytes (this is automatically calculated based on the average + length of rows and the given max number of rows) + + The binary data for the key would then look like this in hex: + + 00 00 00 01 Key data (1 stored high byte first) + 00 00 04 07 (515 << 1) + 1 ; The last 1 is marker that key cont. + 15 ((1010-1000) << 1) + 1 ; The last 1 is marker that key cont. 
+ FB 07 E6 Length byte (FE = 249 + 2 means 2 bytes) and + ((2011 - 1000) << 1) = 07 E6 +*/ + +uint transid_store_packed(MARIA_HA *info, uchar *to, ulonglong trid) +{ + uchar *start; + uint length; + uchar buff[8]; + DBUG_ASSERT(trid < (LL(1) << (MARIA_MAX_PACK_TRANSID_SIZE*8))); + DBUG_ASSERT(trid >= info->s->state.create_trid); + + trid= (trid - info->s->state.create_trid) << 1; + + /* Mark that key contains transid */ + to[-1]|= 1; + + if (trid < MARIA_MIN_TRANSID_PACK_OFFSET) + { + to[0]= (uchar) trid; + return 1; + } + start= to; + + /* store things in low-byte-first-order in buff */ + to= buff; + do + { + *to++= (uchar) trid; + trid= trid>>8; + } while (trid); + + length= (uint) (to - buff); + /* Store length prefix */ + start[0]= (uchar) (length + MARIA_TRANSID_PACK_OFFSET); + start++; + /* Copy things in high-byte-first order to output buffer */ + do + { + *start++= *--to; + } while (to != buff); + return length+1; +} + + +/** + Read packed transid + + @fn transid_get_packed + @param info Maria handler + @param from Transid is stored here + + See transid_store_packed() for how transid is packed + +*/ + +ulonglong transid_get_packed(MARIA_SHARE *share, const uchar *from) +{ + ulonglong value; + uint length; + + if (from[0] < MARIA_MIN_TRANSID_PACK_OFFSET) + value= (ulonglong) from[0]; + else + { + value= 0; + for (length= (uint) (from[0] - MARIA_TRANSID_PACK_OFFSET), + value= (ulonglong) from[1], from+=2; + --length ; + from++) + value= (value << 8) + ((ulonglong) *from); + } + return (value >> 1) + share->state.create_trid; +} + + +/* + Make a normal (not spatial or fulltext) intern key from a record + + SYNOPSIS + _ma_make_key() + info MyiSAM handler + int_key Store created key here + keynr key number + key Buffer used to store key data + record Record + filepos Position to record in the data file + + NOTES + This is used to generate keys from the record on insert, update and delete + + RETURN + key +*/ + +MARIA_KEY *_ma_make_key(MARIA_HA *info, MARIA_KEY 
*int_key, uint keynr, + uchar *key, const uchar *record, + MARIA_RECORD_POS filepos, ulonglong trid) +{ + const uchar *pos; + reg1 HA_KEYSEG *keyseg; + my_bool is_ft; + DBUG_ENTER("_ma_make_key"); + + int_key->data= key; + int_key->flag= 0; /* Always return full key */ + int_key->keyinfo= info->s->keyinfo + keynr; + + is_ft= int_key->keyinfo->flag & HA_FULLTEXT; + for (keyseg= int_key->keyinfo->seg ; keyseg->type ;keyseg++) + { + enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type; + uint length=keyseg->length; + uint char_length; + CHARSET_INFO *cs=keyseg->charset; + + if (keyseg->null_bit) + { + if (record[keyseg->null_pos] & keyseg->null_bit) + { + *key++= 0; /* NULL in key */ + continue; + } + *key++=1; /* Not NULL */ + } + + char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen : + length); + + pos= record+keyseg->start; + if (type == HA_KEYTYPE_BIT) + { + if (keyseg->bit_length) + { + uchar bits= get_rec_bits(record + keyseg->bit_pos, + keyseg->bit_start, keyseg->bit_length); + *key++= (char) bits; + length--; + } + memcpy(key, pos, length); + key+= length; + continue; + } + if (keyseg->flag & HA_SPACE_PACK) + { + if (type != HA_KEYTYPE_NUM) + { + length= (uint) cs->cset->lengthsp(cs, (const char*)pos, length); + } + else + { + const uchar *end= pos + length; + while (pos < end && pos[0] == ' ') + pos++; + length= (uint) (end-pos); + } + FIX_LENGTH(cs, pos, length, char_length); + store_key_length_inc(key,char_length); + memcpy(key, pos, (size_t) char_length); + key+=char_length; + continue; + } + if (keyseg->flag & HA_VAR_LENGTH_PART) + { + uint pack_length= (keyseg->bit_start == 1 ? 1 : 2); + uint tmp_length= (pack_length == 1 ? 
(uint) *pos : + uint2korr(pos)); + pos+= pack_length; /* Skip VARCHAR length */ + set_if_smaller(length,tmp_length); + FIX_LENGTH(cs, pos, length, char_length); + store_key_length_inc(key,char_length); + memcpy(key,pos,(size_t) char_length); + key+= char_length; + continue; + } + else if (keyseg->flag & HA_BLOB_PART) + { + uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos); + uchar *blob_pos; + memcpy_fixed(&blob_pos, pos+keyseg->bit_start,sizeof(char*)); + set_if_smaller(length,tmp_length); + FIX_LENGTH(cs, blob_pos, length, char_length); + store_key_length_inc(key,char_length); + memcpy(key, blob_pos, (size_t) char_length); + key+= char_length; + continue; + } + else if (keyseg->flag & HA_SWAP_KEY) + { /* Numerical column */ +#ifdef HAVE_ISNAN + if (type == HA_KEYTYPE_FLOAT) + { + float nr; + float4get(nr,pos); + if (isnan(nr)) + { + /* Replace NAN with zero */ + bzero(key,length); + key+=length; + continue; + } + } + else if (type == HA_KEYTYPE_DOUBLE) + { + double nr; + float8get(nr,pos); + if (isnan(nr)) + { + bzero(key,length); + key+=length; + continue; + } + } +#endif + pos+=length; + while (length--) + { + *key++ = *--pos; + } + continue; + } + FIX_LENGTH(cs, pos, length, char_length); + memcpy(key, pos, char_length); + if (length > char_length) + cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' '); + key+= length; + } + _ma_dpointer(info->s, key, filepos); + int_key->data_length= (key - int_key->data); + int_key->ref_length= info->s->rec_reflength; + int_key->flag= 0; + if (_ma_have_versioning(info) && trid) + { + int_key->ref_length+= transid_store_packed(info, + key + int_key->ref_length, + (TrID) trid); + int_key->flag|= SEARCH_USER_KEY_HAS_TRANSID; + } + + DBUG_PRINT("exit",("keynr: %d",keynr)); + DBUG_DUMP_KEY("key", int_key); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, int_key);); + DBUG_RETURN(int_key); +} /* _ma_make_key */ + + +/* + Pack a key to intern format from given format (c_rkey) + + SYNOPSIS + _ma_pack_key() 
+ info MARIA handler + int_key Store key here + keynr key number + key Buffer for key data + old Original not packed key + keypart_map bitmap of used keyparts + last_used_keyseg out parameter. May be NULL + + RETURN + int_key + + last_use_keyseg Store pointer to the keyseg after the last used one +*/ + +MARIA_KEY *_ma_pack_key(register MARIA_HA *info, MARIA_KEY *int_key, + uint keynr, uchar *key, + const uchar *old, key_part_map keypart_map, + HA_KEYSEG **last_used_keyseg) +{ + HA_KEYSEG *keyseg; + my_bool is_ft; + DBUG_ENTER("_ma_pack_key"); + + int_key->data= key; + int_key->keyinfo= info->s->keyinfo + keynr; + + /* "one part" rtree key is 2*SPDIMS part key in Maria */ + if (int_key->keyinfo->key_alg == HA_KEY_ALG_RTREE) + keypart_map= (((key_part_map)1) << (2*SPDIMS)) - 1; + + /* only key prefixes are supported */ + DBUG_ASSERT(((keypart_map+1) & keypart_map) == 0); + + is_ft= int_key->keyinfo->flag & HA_FULLTEXT; + for (keyseg=int_key->keyinfo->seg ; keyseg->type && keypart_map; + old+= keyseg->length, keyseg++) + { + enum ha_base_keytype type= (enum ha_base_keytype) keyseg->type; + uint length= keyseg->length; + uint char_length; + const uchar *pos; + CHARSET_INFO *cs=keyseg->charset; + + keypart_map>>= 1; + if (keyseg->null_bit) + { + if (!(*key++= (char) 1-*old++)) /* Copy null marker */ + { + if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART)) + old+= 2; + continue; /* Found NULL */ + } + } + char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? 
length/cs->mbmaxlen : + length); + pos= old; + if (keyseg->flag & HA_SPACE_PACK) + { + const uchar *end= pos + length; + if (type == HA_KEYTYPE_NUM) + { + while (pos < end && pos[0] == ' ') + pos++; + } + else if (type != HA_KEYTYPE_BINARY) + { + while (end > pos && end[-1] == ' ') + end--; + } + length=(uint) (end-pos); + FIX_LENGTH(cs, pos, length, char_length); + store_key_length_inc(key,char_length); + memcpy(key,pos,(size_t) char_length); + key+= char_length; + continue; + } + else if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART)) + { + /* Length of key-part used with maria_rkey() always 2 */ + uint tmp_length=uint2korr(pos); + pos+=2; + set_if_smaller(length,tmp_length); /* Safety */ + FIX_LENGTH(cs, pos, length, char_length); + store_key_length_inc(key,char_length); + old+=2; /* Skip length */ + memcpy(key, pos,(size_t) char_length); + key+= char_length; + continue; + } + else if (keyseg->flag & HA_SWAP_KEY) + { /* Numerical column */ + pos+=length; + while (length--) + *key++ = *--pos; + continue; + } + FIX_LENGTH(cs, pos, length, char_length); + memcpy(key, pos, char_length); + if (length > char_length) + cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' '); + key+= length; + } + if (last_used_keyseg) + *last_used_keyseg= keyseg; + + /* set flag to SEARCH_PART_KEY if we are not using all key parts */ + int_key->flag= keyseg->type ? 
SEARCH_PART_KEY : 0; + int_key->ref_length= 0; + int_key->data_length= (key - int_key->data); + + DBUG_PRINT("exit", ("length: %u", int_key->data_length)); + DBUG_RETURN(int_key); +} /* _ma_pack_key */ + + +/** + Copy a key +*/ + +void _ma_copy_key(MARIA_KEY *to, const MARIA_KEY *from) +{ + memcpy(to->data, from->data, from->data_length + from->ref_length); + to->keyinfo= from->keyinfo; + to->data_length= from->data_length; + to->ref_length= from->ref_length; + to->flag= from->flag; +} + + +/* + Store found key in record + + SYNOPSIS + _ma_put_key_in_record() + info MARIA handler + keynr Key number that was used + unpack_blobs TRUE <=> Unpack blob columns + FALSE <=> Skip them. This is used by index condition + pushdown check function + record Store key here + + Last read key is in info->lastkey + + NOTES + Used when only-keyread is wanted + + RETURN + 0 ok + 1 error +*/ + +static int _ma_put_key_in_record(register MARIA_HA *info, uint keynr, + my_bool unpack_blobs, uchar *record) +{ + reg2 uchar *key; + uchar *pos,*key_end; + reg1 HA_KEYSEG *keyseg; + uchar *blob_ptr; + DBUG_ENTER("_ma_put_key_in_record"); + + blob_ptr= info->lastkey_buff2; /* Place to put blob parts */ + key= info->last_key.data; /* Key that was read */ + key_end= key + info->last_key.data_length; + for (keyseg=info->s->keyinfo[keynr].seg ; keyseg->type ;keyseg++) + { + if (keyseg->null_bit) + { + if (!*key++) + { + record[keyseg->null_pos]|= keyseg->null_bit; + continue; + } + record[keyseg->null_pos]&= ~keyseg->null_bit; + } + if (keyseg->type == HA_KEYTYPE_BIT) + { + uint length= keyseg->length; + + if (keyseg->bit_length) + { + uchar bits= *key++; + set_rec_bits(bits, record + keyseg->bit_pos, keyseg->bit_start, + keyseg->bit_length); + length--; + } + else + { + clr_rec_bits(record + keyseg->bit_pos, keyseg->bit_start, + keyseg->bit_length); + } + memcpy(record + keyseg->start, key, length); + key+= length; + continue; + } + if (keyseg->flag & HA_SPACE_PACK) + { + uint length; + 
get_key_length(length,key); +#ifdef CHECK_KEYS + if (length > keyseg->length || key+length > key_end) + goto err; +#endif + pos= record+keyseg->start; + if (keyseg->type != (int) HA_KEYTYPE_NUM) + { + memcpy(pos,key,(size_t) length); + keyseg->charset->cset->fill(keyseg->charset, + (char*) pos + length, + keyseg->length - length, + ' '); + } + else + { + bfill(pos,keyseg->length-length,' '); + memcpy(pos+keyseg->length-length,key,(size_t) length); + } + key+=length; + continue; + } + + if (keyseg->flag & HA_VAR_LENGTH_PART) + { + uint length; + get_key_length(length,key); +#ifdef CHECK_KEYS + if (length > keyseg->length || key+length > key_end) + goto err; +#endif + /* Store key length */ + if (keyseg->bit_start == 1) + *(uchar*) (record+keyseg->start)= (uchar) length; + else + int2store(record+keyseg->start, length); + /* And key data */ + memcpy(record+keyseg->start + keyseg->bit_start, key, length); + key+= length; + } + else if (keyseg->flag & HA_BLOB_PART) + { + uint length; + get_key_length(length,key); +#ifdef CHECK_KEYS + if (length > keyseg->length || key+length > key_end) + goto err; +#endif + if (unpack_blobs) + { + memcpy(record+keyseg->start+keyseg->bit_start, + (char*) &blob_ptr,sizeof(char*)); + memcpy(blob_ptr,key,length); + blob_ptr+=length; + + /* The above changed info->lastkey2. Inform maria_rnext_same(). 
*/ + info->update&= ~HA_STATE_RNEXT_SAME; + + _ma_store_blob_length(record+keyseg->start, + (uint) keyseg->bit_start,length); + } + key+=length; + } + else if (keyseg->flag & HA_SWAP_KEY) + { + uchar *to= record+keyseg->start+keyseg->length; + uchar *end= key+keyseg->length; +#ifdef CHECK_KEYS + if (end > key_end) + goto err; +#endif + do + { + *--to= *key++; + } while (key != end); + continue; + } + else + { +#ifdef CHECK_KEYS + if (key+keyseg->length > key_end) + goto err; +#endif + memcpy(record+keyseg->start, key, (size_t) keyseg->length); + key+= keyseg->length; + } + } + DBUG_RETURN(0); + +err: + DBUG_PRINT("info",("error")); + DBUG_RETURN(1); /* Crashed row */ +} /* _ma_put_key_in_record */ + + + /* Here when key reads are used */ + +int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos) +{ + fast_ma_writeinfo(info); + if (filepos != HA_OFFSET_ERROR) + { + if (info->lastinx >= 0) + { /* Read only key */ + if (_ma_put_key_in_record(info, (uint)info->lastinx, TRUE, buf)) + { + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + return -1; + } + info->update|= HA_STATE_AKTIV; /* We should find a record */ + return 0; + } + my_errno=HA_ERR_WRONG_INDEX; + } + return(-1); /* Wrong data to read */ +} + + + +/* + Save current key tuple to record and call index condition check function + + SYNOPSIS + ma_check_index_cond() + info MyISAM handler + keynr Index we're running a scan on + record Record buffer to use (it is assumed that index check function + will look for column values there) + + RETURN + ICP_ERROR Error + ICP_NO_MATCH Index condition is not satisfied, continue scanning + ICP_MATCH Index condition is satisfied + ICP_OUT_OF_RANGE Index condition is not satisfied, end the scan. 
+*/ + +int ma_check_index_cond(register MARIA_HA *info, uint keynr, uchar *record) +{ + if (info->index_cond_func) + { + if (_ma_put_key_in_record(info, keynr, FALSE, record)) + { + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + return -1; + } + return info->index_cond_func(info->index_cond_func_arg); + } + return 1; +} + + +/* + Retrieve auto_increment info + + SYNOPSIS + retrieve_auto_increment() + key Auto-increment key + key_type Key's type + + NOTE + 'key' should in "record" format, that is, how it is packed in a record + (this matters with HA_SWAP_KEY). + + IMPLEMENTATION + For signed columns we don't retrieve the auto increment value if it's + less than zero. +*/ + +ulonglong ma_retrieve_auto_increment(const uchar *key, uint8 key_type) +{ + ulonglong value= 0; /* Store unsigned values here */ + longlong s_value= 0; /* Store signed values here */ + + switch (key_type) { + case HA_KEYTYPE_INT8: + s_value= (longlong) *(const char*)key; + break; + case HA_KEYTYPE_BINARY: + value=(ulonglong) *key; + break; + case HA_KEYTYPE_SHORT_INT: + s_value= (longlong) sint2korr(key); + break; + case HA_KEYTYPE_USHORT_INT: + value=(ulonglong) uint2korr(key); + break; + case HA_KEYTYPE_LONG_INT: + s_value= (longlong) sint4korr(key); + break; + case HA_KEYTYPE_ULONG_INT: + value=(ulonglong) uint4korr(key); + break; + case HA_KEYTYPE_INT24: + s_value= (longlong) sint3korr(key); + break; + case HA_KEYTYPE_UINT24: + value=(ulonglong) uint3korr(key); + break; + case HA_KEYTYPE_FLOAT: /* This shouldn't be used */ + { + float f_1; + float4get(f_1,key); + /* Ignore negative values */ + value = (f_1 < (float) 0.0) ? 0 : (ulonglong) f_1; + break; + } + case HA_KEYTYPE_DOUBLE: /* This shouldn't be used */ + { + double f_1; + float8get(f_1,key); + /* Ignore negative values */ + value = (f_1 < 0.0) ? 
0 : (ulonglong) f_1; + break; + } + case HA_KEYTYPE_LONGLONG: + s_value= sint8korr(key); + break; + case HA_KEYTYPE_ULONGLONG: + value= uint8korr(key); + break; + default: + DBUG_ASSERT(0); + value=0; /* Error */ + break; + } + + /* + The following code works becasue if s_value < 0 then value is 0 + and if s_value == 0 then value will contain either s_value or the + correct value. + */ + return (s_value > 0) ? (ulonglong) s_value : value; +} diff --git a/storage/maria/ma_key_recover.c b/storage/maria/ma_key_recover.c new file mode 100644 index 00000000000..6de5253a2dd --- /dev/null +++ b/storage/maria/ma_key_recover.c @@ -0,0 +1,1432 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Redo of index */ + +#include "maria_def.h" +#include "ma_blockrec.h" +#include "trnman.h" +#include "ma_key_recover.h" +#include "ma_rt_index.h" + +/**************************************************************************** + Some helper functions used both by key page loggin and block page loggin +****************************************************************************/ + +/** + @brief Unpin all pinned pages + + @fn _ma_unpin_all_pages() + @param info Maria handler + @param undo_lsn LSN for undo pages. LSN_IMPOSSIBLE if we shouldn't write + undo (like on duplicate key errors) + + info->pinned_pages is the list of pages to unpin. 
Each member of the list + must have its 'changed' saying if the page was changed or not. + + @note + We unpin pages in the reverse order as they where pinned; This is not + necessary now, but may simplify things in the future. + + @return + @retval 0 ok + @retval 1 error (fatal disk error) +*/ + +void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn) +{ + MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*) + dynamic_array_ptr(&info->pinned_pages, 0)); + MARIA_PINNED_PAGE *pinned_page= page_link + info->pinned_pages.elements; + DBUG_ENTER("_ma_unpin_all_pages"); + DBUG_PRINT("info", ("undo_lsn: %lu", (ulong) undo_lsn)); + + if (!info->s->now_transactional) + DBUG_ASSERT(undo_lsn == LSN_IMPOSSIBLE || maria_in_recovery); + + while (pinned_page-- != page_link) + { + /* + Note this assert fails if we got a disk error or the record file + is corrupted, which means we should have this enabled only in debug + builds. + */ +#ifdef EXTRA_DEBUG + DBUG_ASSERT((!pinned_page->changed || + undo_lsn != LSN_IMPOSSIBLE || !info->s->now_transactional) || + (info->s->state.changed & STATE_CRASHED)); +#endif + pagecache_unlock_by_link(info->s->pagecache, pinned_page->link, + pinned_page->unlock, PAGECACHE_UNPIN, + info->trn->rec_lsn, undo_lsn, + pinned_page->changed, FALSE); + } + + info->pinned_pages.elements= 0; + DBUG_VOID_RETURN; +} + + +my_bool _ma_write_clr(MARIA_HA *info, LSN undo_lsn, + enum translog_record_type undo_type, + my_bool store_checksum, ha_checksum checksum, + LSN *res_lsn, void *extra_msg) +{ + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + CLR_TYPE_STORE_SIZE + + HA_CHECKSUM_STORE_SIZE+ KEY_NR_STORE_SIZE + PAGE_STORE_SIZE]; + uchar *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + struct st_msg_to_write_hook_for_clr_end msg; + my_bool res; + DBUG_ENTER("_ma_write_clr"); + + /* undo_lsn must be first for compression to work */ + lsn_store(log_data, undo_lsn); + clr_type_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, undo_type); + 
log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + CLR_TYPE_STORE_SIZE; + + /* Extra_msg is handled in write_hook_for_clr_end() */ + msg.undone_record_type= undo_type; + msg.previous_undo_lsn= undo_lsn; + msg.extra_msg= extra_msg; + msg.checksum_delta= 0; + + if (store_checksum) + { + msg.checksum_delta= checksum; + ha_checksum_store(log_pos, checksum); + log_pos+= HA_CHECKSUM_STORE_SIZE; + } + else if (undo_type == LOGREC_UNDO_KEY_INSERT_WITH_ROOT || + undo_type == LOGREC_UNDO_KEY_DELETE_WITH_ROOT) + { + /* Key root changed. Store new key root */ + struct st_msg_to_write_hook_for_undo_key *undo_msg= extra_msg; + pgcache_page_no_t page; + key_nr_store(log_pos, undo_msg->keynr); + page= (undo_msg->value == HA_OFFSET_ERROR ? IMPOSSIBLE_PAGE_NO : + undo_msg->value / info->s->block_size); + page_store(log_pos + KEY_NR_STORE_SIZE, page); + log_pos+= KEY_NR_STORE_SIZE + PAGE_STORE_SIZE; + } + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data); + + + /* + We need intern_lock mutex for calling _ma_state_info_write in the trigger. 
+ We do it here to have the same sequence of mutexes locking everywhere + (first intern_lock then transactional log buffer lock) + */ + if (undo_type == LOGREC_UNDO_BULK_INSERT) + pthread_mutex_lock(&info->s->intern_lock); + + res= translog_write_record(res_lsn, LOGREC_CLR_END, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + 0].length, + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data + LSN_STORE_SIZE, &msg); + if (undo_type == LOGREC_UNDO_BULK_INSERT) + pthread_mutex_unlock(&info->s->intern_lock); + DBUG_RETURN(res); +} + + +/** + @brief Sets transaction's undo_lsn, first_undo_lsn if needed, and applies + the effect of the undone record type to the share (row count, + checksum, key root, or re-enabling of indexes) + + @param hook_arg Pointer to a struct st_msg_to_write_hook_for_clr_end + + @return Operation status + @retval 0 OK + @retval 1 Error; only possible for LOGREC_UNDO_BULK_INSERT, when + maria_enable_indexes() or _ma_state_info_write() fails +*/ + +my_bool write_hook_for_clr_end(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn __attribute__ ((unused)), + void *hook_arg) +{ + MARIA_SHARE *share= tbl_info->s; + struct st_msg_to_write_hook_for_clr_end *msg= + (struct st_msg_to_write_hook_for_clr_end *)hook_arg; + my_bool error= FALSE; + DBUG_ENTER("write_hook_for_clr_end"); + DBUG_ASSERT(trn->trid != 0); + trn->undo_lsn= msg->previous_undo_lsn; + + switch (msg->undone_record_type) { + case LOGREC_UNDO_ROW_DELETE: + share->state.state.records++; + share->state.state.checksum+= msg->checksum_delta; + break; + case LOGREC_UNDO_ROW_INSERT: + share->state.state.records--; + share->state.state.checksum+= msg->checksum_delta; + break; + case LOGREC_UNDO_ROW_UPDATE: + share->state.state.checksum+= msg->checksum_delta; + break; + case LOGREC_UNDO_KEY_INSERT_WITH_ROOT: + case LOGREC_UNDO_KEY_DELETE_WITH_ROOT: + { + /* Update key root */ + struct st_msg_to_write_hook_for_undo_key *extra_msg= + (struct st_msg_to_write_hook_for_undo_key *) msg->extra_msg; + *extra_msg->root= extra_msg->value; + break; + } + case LOGREC_UNDO_KEY_INSERT: + case LOGREC_UNDO_KEY_DELETE: + break; + case LOGREC_UNDO_BULK_INSERT: + safe_mutex_assert_owner(&share->intern_lock); + error= (maria_enable_indexes(tbl_info) || + 
/* we enabled indices, need '2' below */ + _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO)); + /* no need for _ma_reset_status(): REDO_DELETE_ALL is just before us */ + break; + default: + DBUG_ASSERT(0); + } + if (trn->undo_lsn == LSN_IMPOSSIBLE) /* has fully rolled back */ + trn->first_undo_lsn= LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn); + DBUG_RETURN(error); +} + + +/** + @brief write hook for undo key +*/ + +my_bool write_hook_for_undo_key(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + struct st_msg_to_write_hook_for_undo_key *msg= + (struct st_msg_to_write_hook_for_undo_key *) hook_arg; + + *msg->root= msg->value; + _ma_fast_unlock_key_del(tbl_info); + return write_hook_for_undo(type, trn, tbl_info, lsn, 0); +} + + +/** + Updates "auto_increment" and calls the generic UNDO_KEY hook + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo_key_insert(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + struct st_msg_to_write_hook_for_undo_key *msg= + (struct st_msg_to_write_hook_for_undo_key *) hook_arg; + MARIA_SHARE *share= tbl_info->s; + if (msg->auto_increment > 0) + { + /* + Only reason to set it here is to have a mutex protect from checkpoint + reading at the same time (would see a corrupted value). + + The purpose of the following code is to set auto_increment if the row + has a with auto_increment value higher than the current one. We also + want to be able to restore the old value, in case of rollback, + if no one else has tried to set the value. + + The logic used is that we only restore the auto_increment value if + tbl_info->last_auto_increment == share->last_auto_increment + when it's time to do the rollback. 
+ */ + DBUG_PRINT("info",("auto_inc: %lu new auto_inc: %lu", + (ulong)share->state.auto_increment, + (ulong)msg->auto_increment)); + if (share->state.auto_increment < msg->auto_increment) + { + /* Remember the original value, in case of rollback */ + tbl_info->last_auto_increment= share->last_auto_increment= + share->state.auto_increment; + share->state.auto_increment= msg->auto_increment; + } + else + { + /* + If the current value would have affected the original auto_increment + value, set it to an impossible value so that it's not restored on + rollback + */ + if (msg->auto_increment > share->last_auto_increment) + share->last_auto_increment= ~(ulonglong) 0; + } + } + return write_hook_for_undo_key(type, trn, tbl_info, lsn, hook_arg); +} + + +/** + @brief Updates "share->auto_increment" in case of abort and calls + generic UNDO_KEY hook + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo_key_delete(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + struct st_msg_to_write_hook_for_undo_key *msg= + (struct st_msg_to_write_hook_for_undo_key *) hook_arg; + MARIA_SHARE *share= tbl_info->s; + if (msg->auto_increment > 0) /* If auto increment key */ + { + /* Restore auto increment if no one has changed it in between */ + if (share->last_auto_increment == tbl_info->last_auto_increment && + tbl_info->last_auto_increment != ~(ulonglong) 0) + share->state.auto_increment= tbl_info->last_auto_increment; + } + return write_hook_for_undo_key(type, trn, tbl_info, lsn, hook_arg); +} + + +/***************************************************************************** + Functions for logging of key page changes +*****************************************************************************/ + +/** + @brief + Write log entry for page that has got data added or deleted at start of page +*/ + +my_bool _ma_log_prefix(MARIA_PAGE *ma_page, uint changed_length, + int move_length, + enum en_key_debug debug_marker 
__attribute__((unused))) +{ + uint translog_parts; + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 7 + 7 + 2 + 2]; + uchar *log_pos; + uchar *buff= ma_page->buff; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + MARIA_HA *info= ma_page->info; + pgcache_page_no_t page= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_prefix"); + DBUG_PRINT("enter", ("page: %lu changed_length: %u move_length: %d", + (ulong) page, changed_length, move_length)); + + DBUG_ASSERT(ma_page->size == ma_page->org_size + move_length); + + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + (*log_pos++)= KEY_OP_DEBUG; + (*log_pos++)= debug_marker; +#endif + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= buff[KEYPAGE_TRANSFLAG_OFFSET]; + + if (move_length < 0) + { + /* Delete prefix */ + log_pos[0]= KEY_OP_DEL_PREFIX; + int2store(log_pos+1, -move_length); + log_pos+= 3; + if (changed_length) + { + /* + We don't need a KEY_OP_OFFSET as KEY_OP_DEL_PREFIX has an implicit + offset + */ + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, changed_length); + log_pos+= 3; + } + } + else + { + /* Add prefix */ + DBUG_ASSERT(changed_length >0 && (int) changed_length >= move_length); + log_pos[0]= KEY_OP_ADD_PREFIX; + int2store(log_pos+1, move_length); + int2store(log_pos+3, changed_length); + log_pos+= 5; + } + + translog_parts= 1; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + if (changed_length) + { + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (buff + + info->s->keypage_header); + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= changed_length; + translog_parts= 2; + } + + _ma_log_key_changes(ma_page, log_array + TRANSLOG_INTERNAL_PARTS + + translog_parts, log_pos, &changed_length, + &translog_parts); + /* Remember new page length for future log entires for same page */ + 
ma_page->org_size= ma_page->size; + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + changed_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +/** + @brief + Write log entry for page that has got data added or deleted at end of page +*/ + +my_bool _ma_log_suffix(MARIA_PAGE *ma_page, uint org_length, uint new_length) +{ + LSN lsn; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 10 + 7 + 2], *log_pos; + uchar *buff= ma_page->buff; + int diff; + uint translog_parts, extra_length; + MARIA_HA *info= ma_page->info; + pgcache_page_no_t page= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_suffix"); + DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u", + (ulong) page, org_length, new_length)); + DBUG_ASSERT(ma_page->size == new_length); + DBUG_ASSERT(ma_page->org_size == org_length); + + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= buff[KEYPAGE_TRANSFLAG_OFFSET]; + + if ((diff= (int) (new_length - org_length)) < 0) + { + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, -diff); + log_pos+= 3; + translog_parts= 1; + extra_length= 0; + } + else + { + log_pos[0]= KEY_OP_ADD_SUFFIX; + int2store(log_pos+1, diff); + log_pos+= 3; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= buff + org_length; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= (uint) diff; + translog_parts= 2; + extra_length= (uint) diff; + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + + _ma_log_key_changes(ma_page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entires 
for same page */ + ma_page->org_size= ma_page->size; + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +/** + @brief Log that a key was added to the page + + @param ma_page Changed page + @param org_page_length Length of data in page before key was added + Final length in ma_page->size + @param key_pos Position in ma_page->buff where the change starts + @param changed_length Number of bytes at key_pos to store in the log + @param move_length How much the page length changed + (ma_page->size == org_page_length + move_length) + @param handle_overflow Set if the caller allows the change to go past the + end of the page (see note) + @param debug_marker Stored in the log only if EXTRA_DEBUG_KEY_CHANGES + is defined + + @note + If handle_overflow is set, then we have to protect against + logging changes that are outside of the page. + This may happen during underflow() handling where the buffer + in memory temporarily contains more data than block_size + + ma_page may be a page that was previously logged and cut down + because it's too big. (org_page_length > ma_page->org_size) + + @return Operation status + @retval 0 OK + @retval != 0 translog_write_record() failed +*/ + +my_bool _ma_log_add(MARIA_PAGE *ma_page, + uint org_page_length __attribute__ ((unused)), + uchar *key_pos, uint changed_length, int move_length, + my_bool handle_overflow __attribute__ ((unused)), + enum en_key_debug debug_marker __attribute__((unused))) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 3 + 3 + 3 + 3 + 7 + + 3 + 2]; + uchar *log_pos; + uchar *buff= ma_page->buff; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6]; + MARIA_HA *info= ma_page->info; + uint offset= (uint) (key_pos - buff); + uint max_page_size= info->s->max_index_block_size; + uint translog_parts, current_size; + pgcache_page_no_t page_pos= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_add"); + DBUG_PRINT("enter", ("page: %lu org_page_length: %u changed_length: %u " + "move_length: %d", + (ulong) page_pos, org_page_length, changed_length, + move_length)); + DBUG_ASSERT(info->s->now_transactional); + DBUG_ASSERT(move_length <= (int) changed_length); + DBUG_ASSERT(ma_page->org_size == min(org_page_length, max_page_size)); + DBUG_ASSERT(ma_page->size == org_page_length + move_length); + DBUG_ASSERT(offset 
<= ma_page->org_size); + + /* + Write REDO entry that contains the logical operations we need + to do the page + */ + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page_pos); + current_size= ma_page->org_size; + log_pos+= PAGE_STORE_SIZE; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + *log_pos++= KEY_OP_DEBUG; + *log_pos++= debug_marker; +#endif + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= buff[KEYPAGE_TRANSFLAG_OFFSET]; + + /* + Don't overwrite page boundary + It's ok to cut this as we will append the data at end of page + in the next log entry + */ + if (offset + changed_length > max_page_size) + { + DBUG_ASSERT(handle_overflow); + changed_length= max_page_size - offset; /* Update to end of page */ + move_length= 0; /* Nothing to move */ + /* Extend the page to max length on recovery */ + *log_pos++= KEY_OP_MAX_PAGELENGTH; + current_size= max_page_size; + } + + /* Check if adding the key made the page overflow */ + if (current_size + move_length > max_page_size) + { + /* + Adding the key caused an overflow. Cut away the part of the + page that doesn't fit. + */ + uint diff; + DBUG_ASSERT(handle_overflow); + diff= current_size + move_length - max_page_size; + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, diff); + log_pos+= 3; + current_size= max_page_size - move_length; + } + + if (offset == current_size) + { + log_pos[0]= KEY_OP_ADD_SUFFIX; + current_size+= changed_length; + } + else + { + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos+= 3; + if (move_length) + { + if (move_length < 0) + { + DBUG_ASSERT(offset - move_length <= org_page_length); + if (offset - move_length > current_size) + { + /* + Truncate to end of page. 
We will add data to it from + the page buffer below + */ + move_length= (int) offset - (int) current_size; + } + } + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, move_length); + log_pos+= 3; + current_size+= move_length; + } + /* + Handle case where page was shortend but 'changed_length' goes over + 'current_size'. This can only happen when there was a page overflow + and we will below add back the overflow part + */ + if (offset + changed_length > current_size) + { + DBUG_ASSERT(offset + changed_length <= ma_page->size); + changed_length= current_size - offset; + } + log_pos[0]= KEY_OP_CHANGE; + } + int2store(log_pos+1, changed_length); + log_pos+= 3; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= changed_length; + translog_parts= TRANSLOG_INTERNAL_PARTS + 2; + + /* + If page was originally > block_size before operation and now all data + fits, append the end data that was not part of the previous logged + page to it. + */ + DBUG_ASSERT(current_size <= max_page_size && current_size <= ma_page->size); + if (current_size != ma_page->size && current_size != max_page_size) + { + uint length= min(ma_page->size, max_page_size) - current_size; + uchar *data= ma_page->buff + current_size; + + log_pos[0]= KEY_OP_ADD_SUFFIX; + int2store(log_pos+1, length); + log_array[translog_parts].str= log_pos; + log_array[translog_parts].length= 3; + log_array[translog_parts+1].str= data; + log_array[translog_parts+1].length= length; + log_pos+= 3; + translog_parts+= 2; + current_size+= length; + changed_length+= length + 3; + } + + _ma_log_key_changes(ma_page, log_array + translog_parts, + log_pos, &changed_length, &translog_parts); + /* + Remember new page length for future log entries for same page + Note that this can be different from ma_page->size in case of page + overflow! 
+ */ + ma_page->org_size= current_size; + DBUG_ASSERT(ma_page->org_size == min(ma_page->size, max_page_size)); + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + 0].length + + changed_length, translog_parts, + log_array, log_data, NULL)) + DBUG_RETURN(-1); + DBUG_RETURN(0); +} + + +#ifdef EXTRA_DEBUG_KEY_CHANGES + +/* Log checksum and optionally key page to log */ + +void _ma_log_key_changes(MARIA_PAGE *ma_page, LEX_CUSTRING *log_array, + uchar *log_pos, uint *changed_length, + uint *translog_parts) +{ + MARIA_SHARE *share= ma_page->info->s; + int page_length= min(ma_page->size, share->max_index_block_size); + uint org_length; + ha_checksum crc; + + DBUG_ASSERT(ma_page->flag == (uint) ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET]); + + /* We have to change length as the page may have been shortened */ + org_length= _ma_get_page_used(share, ma_page->buff); + _ma_store_page_used(share, ma_page->buff, page_length); + crc= my_checksum(0, ma_page->buff + LSN_STORE_SIZE, + page_length - LSN_STORE_SIZE); + _ma_store_page_used(share, ma_page->buff, org_length); + + log_pos[0]= KEY_OP_CHECK; + int2store(log_pos+1, page_length); + int4store(log_pos+3, crc); + + log_array[0].str= log_pos; + log_array[0].length= 7; + (*changed_length)+= 7; + (*translog_parts)++; +#ifdef EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES + log_array[1].str= ma_page->buff; + log_array[1].length= page_length; + (*changed_length)+= page_length; + (*translog_parts)++; +#endif /* EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES */ +} + +#endif /* EXTRA_DEBUG_KEY_CHANGES */ + +/**************************************************************************** + Redo of key pages +****************************************************************************/ + +/** + @brief Apply LOGREC_REDO_INDEX_NEW_PAGE + + @param info Maria handler + @param header Header (without FILEID) + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint 
_ma_apply_redo_index_new_page(MARIA_HA *info, LSN lsn, + const uchar *header, uint length) +{ + pgcache_page_no_t root_page= page_korr(header); + pgcache_page_no_t free_page= page_korr(header + PAGE_STORE_SIZE); + uint key_nr= key_nr_korr(header + PAGE_STORE_SIZE * 2); + my_bool page_type_flag= header[PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE]; + enum pagecache_page_lock unlock_method; + enum pagecache_page_pin unpin_method; + MARIA_PINNED_PAGE page_link; + my_off_t file_size; + uchar *buff; + uint result; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_apply_redo_index_new_page"); + DBUG_PRINT("enter", ("root_page: %lu free_page: %lu", + (ulong) root_page, (ulong) free_page)); + + /* Set header to point at key data */ + + share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + header+= PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1; + length-= PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1; + + file_size= (my_off_t) (root_page + 1) * share->block_size; + if (cmp_translog_addr(lsn, share->state.is_of_horizon) >= 0) + { + /* free_page is 0 if we shouldn't set key_del */ + if (free_page) + { + if (free_page != IMPOSSIBLE_PAGE_NO) + share->state.key_del= (my_off_t) free_page * share->block_size; + else + share->state.key_del= HA_OFFSET_ERROR; + } + if (page_type_flag) /* root page */ + share->state.key_root[key_nr]= file_size - share->block_size; + } + + if (file_size > share->state.state.key_file_length) + { + share->state.state.key_file_length= file_size; + buff= info->keyread_buff; + info->keyread_buff_used= 1; + unlock_method= PAGECACHE_LOCK_WRITE; + unpin_method= PAGECACHE_PIN; + } + else + { + if (!(buff= pagecache_read(share->pagecache, &share->kfile, + root_page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + if (my_errno != HA_ERR_FILE_TOO_SHORT && + my_errno != HA_ERR_WRONG_CRC) + { + result= 1; + goto err; + } + buff= 
pagecache_block_link_to_buffer(page_link.link); + } + else if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + DBUG_PRINT("info", ("Page is up to date, skipping redo")); + result= 0; + goto err; + } + unlock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED; + unpin_method= PAGECACHE_PIN_LEFT_PINNED; + } + + /* Write modified page */ + bzero(buff, LSN_STORE_SIZE); + memcpy(buff + LSN_STORE_SIZE, header, length); + bzero(buff + LSN_STORE_SIZE + length, + share->max_index_block_size - LSN_STORE_SIZE - length); + bfill(buff + share->block_size - KEYPAGE_CHECKSUM_SIZE, + KEYPAGE_CHECKSUM_SIZE, (uchar) 255); + + result= 0; + if (unlock_method == PAGECACHE_LOCK_WRITE && + pagecache_write(share->pagecache, + &share->kfile, root_page, 0, + buff, PAGECACHE_PLAIN_PAGE, + unlock_method, unpin_method, + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE)) + result= 1; + + /* Mark page to be unlocked and written at _ma_unpin_all_pages() */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + DBUG_RETURN(result); + +err: + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + DBUG_RETURN(result); +} + + +/** + @brief Apply LOGREC_REDO_INDEX_FREE_PAGE + + @param info Maria handler + @param header Header (without FILEID) + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_apply_redo_index_free_page(MARIA_HA *info, + LSN lsn, + const uchar *header) +{ + pgcache_page_no_t page= page_korr(header); + pgcache_page_no_t free_page= page_korr(header + PAGE_STORE_SIZE); + my_off_t old_link; + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + uchar *buff; + int result; + DBUG_ENTER("_ma_apply_redo_index_free_page"); + DBUG_PRINT("enter", ("page: %lu free_page: %lu", + (ulong) page, (ulong) free_page)); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + 
STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + if (cmp_translog_addr(lsn, share->state.is_of_horizon) >= 0) + share->state.key_del= (my_off_t) page * share->block_size; + + old_link= ((free_page != IMPOSSIBLE_PAGE_NO) ? + (my_off_t) free_page * share->block_size : + HA_OFFSET_ERROR); + if (!(buff= pagecache_read(share->pagecache, &share->kfile, + page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + result= (uint) my_errno; + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + result= 0; + goto err; + } + /* Free page */ + bzero(buff + LSN_STORE_SIZE, share->keypage_header - LSN_STORE_SIZE); + _ma_store_keynr(share, buff, (uchar) MARIA_DELETE_KEY_NR); + _ma_store_page_used(share, buff, share->keypage_header + 8); + mi_sizestore(buff + share->keypage_header, old_link); + +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY + { + bzero(buff + share->keypage_header + 8, + share->block_size - share->keypage_header - 8 - + KEYPAGE_CHECKSUM_SIZE); + } +#endif + + /* Mark page to be unlocked and written at _ma_unpin_all_pages() */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + DBUG_RETURN(0); + +err: + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + DBUG_RETURN(result); +} + + +/** + @brief Apply LOGREC_REDO_INDEX + + @fn ma_apply_redo_index() + @param info Maria handler + @param header Header (without FILEID) + + @notes + Data for this part is a set of logical instructions of how to + construct the key page. 
+ + Information of the layout of the components for REDO_INDEX: + + Name Parameters (in byte) Information + KEY_OP_OFFSET 2 Set position for next operations + KEY_OP_SHIFT 2 (signed int) How much to shift down or up + KEY_OP_CHANGE 2 length, data Data to replace at 'pos' + KEY_OP_ADD_PREFIX 2 move-length How much data should be moved up + 2 change-length Data to be replaced at page start + KEY_OP_DEL_PREFIX 2 length Bytes to be deleted at page start + KEY_OP_ADD_SUFFIX 2 length, data Add data to end of page + KEY_OP_DEL_SUFFIX 2 length Reduce page length with this + Sets position to start of page + KEY_OP_CHECK 6 page_length[2],CRC Used only when debugging + This may be followed by page_length + of data (until end of log record) + KEY_OP_COMPACT_PAGE 6 transid + KEY_OP_SET_PAGEFLAG 1 flag for page + KEY_OP_MAX_PAGELENGTH 0 Set page to max length + KEY_OP_DEBUG 1 Info where logging was done + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +long my_counter= 0; + +uint _ma_apply_redo_index(MARIA_HA *info, + LSN lsn, const uchar *header, uint head_length) +{ + MARIA_SHARE *share= info->s; + pgcache_page_no_t page_pos= page_korr(header); + MARIA_PINNED_PAGE page_link; + uchar *buff; + const uchar *header_end= header + head_length; + uint page_offset= 0, org_page_length; + uint nod_flag, page_length, keypage_header, keynr; + uint max_page_size= share->max_index_block_size; + int result; + MARIA_PAGE page; + DBUG_ENTER("_ma_apply_redo_index"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page_pos)); + + /* Set header to point at key data */ + header+= PAGE_STORE_SIZE; + + if (!(buff= pagecache_read(share->pagecache, &share->kfile, + page_pos, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + result= 1; + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + DBUG_PRINT("info", ("Page is up to date, skipping redo")); + result= 0; + goto err; + } + + keynr= _ma_get_keynr(share, buff); + _ma_page_setup(&page, 
info, share->keyinfo + keynr, page_pos, buff); + nod_flag= page.node; + org_page_length= page_length= page.size; + + keypage_header= share->keypage_header; + DBUG_PRINT("redo", ("page_length: %u", page_length)); + + /* Apply modifications to page */ + do + { + switch ((enum en_key_op) (*header++)) { + case KEY_OP_OFFSET: /* 1 */ + page_offset= uint2korr(header); + header+= 2; + DBUG_PRINT("redo", ("key_op_offset: %u", page_offset)); + DBUG_ASSERT(page_offset >= keypage_header && page_offset <= page_length); + break; + case KEY_OP_SHIFT: /* 2 */ + { + int length= sint2korr(header); + header+= 2; + DBUG_PRINT("redo", ("key_op_shift: %d", length)); + DBUG_ASSERT(page_offset != 0 && page_offset <= page_length && + page_length + length <= max_page_size); + + if (length < 0) + { + DBUG_ASSERT(page_offset - length <= page_length); + bmove(buff + page_offset, buff + page_offset - length, + page_length - page_offset + length); + } + else if (page_length != page_offset) + bmove_upp(buff + page_length + length, buff + page_length, + page_length - page_offset); + page_length+= length; + break; + } + case KEY_OP_CHANGE: /* 3 */ + { + uint length= uint2korr(header); + DBUG_PRINT("redo", ("key_op_change: %u", length)); + DBUG_ASSERT(page_offset != 0 && page_offset + length <= page_length); + + memcpy(buff + page_offset, header + 2 , length); + page_offset+= length; /* Put offset after changed length */ + header+= 2 + length; + break; + } + case KEY_OP_ADD_PREFIX: /* 4 */ + { + uint insert_length= uint2korr(header); + uint changed_length= uint2korr(header+2); + DBUG_PRINT("redo", ("key_op_add_prefix: %u %u", + insert_length, changed_length)); + + DBUG_ASSERT(insert_length <= changed_length && + page_length + changed_length <= max_page_size); + + bmove_upp(buff + page_length + insert_length, buff + page_length, + page_length - keypage_header); + memcpy(buff + keypage_header, header + 4 , changed_length); + header+= 4 + changed_length; + page_length+= insert_length; + break; + } + 
case KEY_OP_DEL_PREFIX: /* 5 */ + { + uint length= uint2korr(header); + header+= 2; + DBUG_PRINT("redo", ("key_op_del_prefix: %u", length)); + DBUG_ASSERT(length <= page_length - keypage_header); + + bmove(buff + keypage_header, buff + keypage_header + + length, page_length - keypage_header - length); + page_length-= length; + + page_offset= keypage_header; /* Prepare for change */ + break; + } + case KEY_OP_ADD_SUFFIX: /* 6 */ + { + uint insert_length= uint2korr(header); + DBUG_PRINT("redo", ("key_op_add_suffix: %u", insert_length)); + DBUG_ASSERT(page_length + insert_length <= max_page_size); + memcpy(buff + page_length, header+2, insert_length); + + page_length+= insert_length; + header+= 2 + insert_length; + break; + } + case KEY_OP_DEL_SUFFIX: /* 7 */ + { + uint del_length= uint2korr(header); + header+= 2; + DBUG_PRINT("redo", ("key_op_del_suffix: %u", del_length)); + DBUG_ASSERT(page_length - del_length >= keypage_header); + page_length-= del_length; + break; + } + case KEY_OP_CHECK: /* 8 */ + { +#ifdef EXTRA_DEBUG_KEY_CHANGES + uint check_page_length; + ha_checksum crc; + check_page_length= uint2korr(header); + crc= uint4korr(header+2); + _ma_store_page_used(share, buff, page_length); + if (check_page_length != page_length || + crc != (uint32) my_checksum(0, buff + LSN_STORE_SIZE, + page_length - LSN_STORE_SIZE)) + { + DBUG_DUMP("KEY_OP_CHECK bad page", buff, page_length); + if (header + 6 + check_page_length <= header_end) + { + DBUG_DUMP("KEY_OP_CHECK org page", header + 6, check_page_length); + } + DBUG_ASSERT("crc failure in REDO_INDEX" == 0); + } +#endif + DBUG_PRINT("redo", ("key_op_check")); + /* + This is the last entry in the block and it can contain page_length + data or not + */ + DBUG_ASSERT(header + 6 == header_end || + header + 6 + page_length == header_end); + header= header_end; + break; + } + case KEY_OP_DEBUG: + DBUG_PRINT("redo", ("Debug: %u", (uint) header[0])); + header++; + break; + case KEY_OP_DEBUG_2: + DBUG_PRINT("redo", 
("org_page_length: %u new_page_length: %u", + uint2korr(header), uint2korr(header+2))); + header+= 4; + break; + case KEY_OP_MAX_PAGELENGTH: + DBUG_PRINT("redo", ("key_op_max_page_length")); + page_length= max_page_size; + break; + case KEY_OP_MULTI_COPY: /* 9 */ + { + /* + List of fixed-len memcpy() operations with their source located inside + the page. The log record's piece looks like: + first the length 'full_length' to be used by memcpy() + then the number of bytes used by the list of (to,from) pairs + then the (to,from) pairs, so we do: + for (t,f) in [list of (to,from) pairs]: + memcpy(t, f, full_length). + */ + uint full_length, log_memcpy_length; + const uchar *log_memcpy_end; + + DBUG_PRINT("redo", ("key_op_multi_copy")); + full_length= uint2korr(header); + header+= 2; + log_memcpy_length= uint2korr(header); + header+= 2; + log_memcpy_end= header + log_memcpy_length; + DBUG_ASSERT(full_length <= max_page_size); + while (header < log_memcpy_end) + { + uint to, from; + to= uint2korr(header); + header+= 2; + from= uint2korr(header); + header+= 2; + /* "from" is a place in the existing page */ + DBUG_ASSERT(max(from, to) < max_page_size); + memcpy(buff + to, buff + from, full_length); + } + break; + } + case KEY_OP_SET_PAGEFLAG: + DBUG_PRINT("redo", ("key_op_set_pageflag")); + buff[KEYPAGE_TRANSFLAG_OFFSET]= *header++; + break; + case KEY_OP_COMPACT_PAGE: + { + TrID transid= transid_korr(header); + + DBUG_PRINT("redo", ("key_op_compact_page")); + header+= TRANSID_SIZE; + if (_ma_compact_keypage(&page, transid)) + { + result= 1; + goto err; + } + page_length= page.size; + } + case KEY_OP_NONE: + default: + DBUG_ASSERT(0); + result= 1; + goto err; + } + } while (header < header_end); + DBUG_ASSERT(header == header_end); + + /* Write modified page */ + page.size= page_length; + _ma_store_page_used(share, buff, page_length); + + /* + Clean old stuff up. 
Gives us better compression of we archive things + and makes things easer to debug + */ + if (page_length < org_page_length) + bzero(buff + page_length, org_page_length-page_length); + + /* Mark page to be unlocked and written at _ma_unpin_all_pages() */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + DBUG_RETURN(0); + +err: + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + if (result) + _ma_mark_file_crashed(share); + DBUG_RETURN(result); +} + + +/**************************************************************************** + Undo of key block changes +****************************************************************************/ + +/** + @brief Undo of insert of key (ie, delete the inserted key) + + @param info Maria handler + @param undo_lsn LSN passed on to _ma_write_clr() + @param header Key number followed by the key data + @param length Length of header, including the key number + + @return 0 on success, non-zero if the key delete or the CLR write failed +*/ + +my_bool _ma_apply_undo_key_insert(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length) +{ + LSN lsn; + my_bool res; + uint keynr; + uchar key_buff[MARIA_MAX_KEY_BUFF]; + MARIA_SHARE *share= info->s; + MARIA_KEY key; + my_off_t new_root; + struct st_msg_to_write_hook_for_undo_key msg; + DBUG_ENTER("_ma_apply_undo_key_insert"); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + keynr= key_nr_korr(header); + length-= KEY_NR_STORE_SIZE; + + /* We have to copy key as _ma_ck_real_delete() may change it */ + memcpy(key_buff, header + KEY_NR_STORE_SIZE, length); + DBUG_DUMP("key_buff", key_buff, length); + + new_root= share->state.key_root[keynr]; + /* + Change the key to an internal structure. + It's safe to have SEARCH_USER_KEY_HAS_TRANSID even if there isn't + a transaction id, as ha_key_cmp() will stop comparison when key length + is reached. + For index with transid flag, the ref_length of the key is not correct. 
+ This should however be safe as long as this key is only used for + comparison against other keys (not for packing or for read-next etc as + in this case we use data_length + ref_length, which is correct). + */ + key.keyinfo= share->keyinfo + keynr; + key.data= key_buff; + key.data_length= length - share->rec_reflength; + key.ref_length= share->rec_reflength; + key.flag= SEARCH_USER_KEY_HAS_TRANSID; + + res= ((share->keyinfo[keynr].key_alg == HA_KEY_ALG_RTREE) ? + maria_rtree_real_delete(info, &key, &new_root) : + _ma_ck_real_delete(info, &key, &new_root)); + if (res) + _ma_mark_file_crashed(share); + msg.root= &share->state.key_root[keynr]; + msg.value= new_root; + msg.keynr= keynr; + + if (_ma_write_clr(info, undo_lsn, *msg.root == msg.value ? + LOGREC_UNDO_KEY_INSERT : LOGREC_UNDO_KEY_INSERT_WITH_ROOT, + 0, 0, &lsn, (void*) &msg)) + res= 1; + + _ma_fast_unlock_key_del(info); + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); +} + + +/** + @brief Undo of delete of key (ie, insert the deleted key) + + @param info Maria handler + @param undo_lsn LSN passed on to _ma_write_clr() + @param header Key number, then PAGE_STORE_SIZE bytes that are skipped + if with_root is set, then the key data + @param length Length of header, including the skipped bytes + @param with_root If the UNDO is UNDO_KEY_DELETE_WITH_ROOT + + @return 0 on success, non-zero if the key insert or the CLR write failed +*/ + +my_bool _ma_apply_undo_key_delete(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length, + my_bool with_root) +{ + LSN lsn; + my_bool res; + uint keynr, skip_bytes; + uchar key_buff[MARIA_MAX_KEY_BUFF]; + MARIA_SHARE *share= info->s; + my_off_t new_root; + struct st_msg_to_write_hook_for_undo_key msg; + MARIA_KEY key; + DBUG_ENTER("_ma_apply_undo_key_delete"); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + keynr= key_nr_korr(header); + skip_bytes= KEY_NR_STORE_SIZE + (with_root ? 
PAGE_STORE_SIZE : 0); + header+= skip_bytes; + length-= skip_bytes; + + /* We have to copy key as _ma_ck_real_write_btree() may change it */ + memcpy(key_buff, header, length); + DBUG_DUMP("key", key_buff, length); + + key.keyinfo= share->keyinfo + keynr; + key.data= key_buff; + key.data_length= length - share->rec_reflength; + key.ref_length= share->rec_reflength; + key.flag= SEARCH_USER_KEY_HAS_TRANSID; + + new_root= share->state.key_root[keynr]; + res= (share->keyinfo[keynr].key_alg == HA_KEY_ALG_RTREE) ? + maria_rtree_insert_level(info, &key, -1, &new_root) : + _ma_ck_real_write_btree(info, &key, &new_root, + share->keyinfo[keynr].write_comp_flag | + key.flag); + if (res) + _ma_mark_file_crashed(share); + + msg.root= &share->state.key_root[keynr]; + msg.value= new_root; + msg.keynr= keynr; + if (_ma_write_clr(info, undo_lsn, + *msg.root == msg.value ? + LOGREC_UNDO_KEY_DELETE : LOGREC_UNDO_KEY_DELETE_WITH_ROOT, + 0, 0, &lsn, + (void*) &msg)) + res= 1; + + _ma_fast_unlock_key_del(info); + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); +} + + +/**************************************************************************** + Handle some local variables +****************************************************************************/ + +/** + @brief lock key_del for other threads usage + + @fn _ma_lock_key_del() + @param info Maria handler + @param insert_at_end Set to 1 if we are doing an insert + + @note + To allow higher concurrency in the common case where we do inserts + and we don't have any linked blocks we do the following: + - Mark in info->key_del_used that we are not using key_del + - Return at once (without marking key_del as used) + + This is safe as we in this case don't write key_del_current into + the redo log and during recover we are not updating key_del. 
+ + @retval 1 Use page at end of file + @retval 0 Use page at share->key_del_current +*/ + +my_bool _ma_lock_key_del(MARIA_HA *info, my_bool insert_at_end) +{ + MARIA_SHARE *share= info->s; + + /* + info->key_del_used is 0 initially. + If the caller needs a block (_ma_new()), we look at the free list: + - looks empty? then caller will create a new block at end of file and + remember (through info->key_del_used==2) that it will not change + state.key_del and does not need to wake up waiters as nobody will wait for + it. + - non-empty? then we wait for other users of the state.key_del list to + have finished, then we lock this list (through share->key_del_used==1) + because we need to prevent some other thread from also reading state.key_del + and using the same page as ours. We remember through info->key_del_used==1 + that we will have to set state.key_del at unlock time and wake up + waiters. + If the caller wants to free a block (_ma_dispose()), "empty" and + "non-empty" are treated as "non-empty" is treated above. + When we are ready to unlock, we copy share->key_del_current into + state.key_del. Unlocking happens when writing the UNDO log record, that + can make a long lock time. + Why we wrote "*looks* empty": because we are looking at state.key_del + which may be slightly old (share->key_del_current may be more recent and + exact): when we want a new page, we tolerate treating "there was no free + page 1 millisecond ago" as "there is no free page". It's ok to non-pop + (_ma_new(), page will be found later anyway) but it's not ok to non-push + (_ma_dispose(), page would be lost). + When we leave this function, info->key_del_used is always 1 or 2. 
+ */ + if (info->key_del_used != 1) + { + pthread_mutex_lock(&share->key_del_lock); + if (share->state.key_del == HA_OFFSET_ERROR && insert_at_end) + { + pthread_mutex_unlock(&share->key_del_lock); + info->key_del_used= 2; /* insert-with-append */ + return 1; + } +#ifdef THREAD + while (share->key_del_used) + pthread_cond_wait(&share->key_del_cond, &share->key_del_lock); +#endif + info->key_del_used= 1; + share->key_del_used= 1; + share->key_del_current= share->state.key_del; + pthread_mutex_unlock(&share->key_del_lock); + } + return share->key_del_current == HA_OFFSET_ERROR; +} + + +/** + @brief copy changes to key_del and unlock it + + @param info Maria handler + + @note + In case of many threads using the maria table, we always have a lock + on the translog when coming here. + For insert-with-append (info->key_del_used == 2) nothing is copied or + signalled; only info->key_del_used is reset to 0. +*/ + +void _ma_unlock_key_del(MARIA_HA *info) +{ + DBUG_ASSERT(info->key_del_used); + if (info->key_del_used == 1) /* Ignore insert-with-append */ + { + MARIA_SHARE *share= info->s; + pthread_mutex_lock(&share->key_del_lock); + share->key_del_used= 0; + share->state.key_del= share->key_del_current; + pthread_mutex_unlock(&share->key_del_lock); + pthread_cond_signal(&share->key_del_cond); + } + info->key_del_used= 0; +} diff --git a/storage/maria/ma_key_recover.h b/storage/maria/ma_key_recover.h new file mode 100644 index 00000000000..d6b69010d5d --- /dev/null +++ b/storage/maria/ma_key_recover.h @@ -0,0 +1,122 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + When we have finished the write/update/delete of a row, we have cleanups to + do. For now it is signalling to Checkpoint that all dirtied pages have + their rec_lsn set and page LSN set (_ma_unpin_all_pages() has been called), + and that bitmap pages are correct (_ma_bitmap_release_unused() has been + called). +*/ + +/* Struct for clr_end */ + +struct st_msg_to_write_hook_for_clr_end +{ + LSN previous_undo_lsn; + enum translog_record_type undone_record_type; + ha_checksum checksum_delta; + void *extra_msg; +}; + +struct st_msg_to_write_hook_for_undo_key +{ + my_off_t *root; /* Address of the key root (state.key_root[keynr]) */ + my_off_t value; /* Root value after the operation (new_root) */ + uint keynr; /* Key number */ + ulonglong auto_increment; +}; + + +/* Function declarations for some redo and undo functions */ + +my_bool _ma_write_clr(MARIA_HA *info, LSN undo_lsn, + enum translog_record_type undo_type, + my_bool store_checksum, ha_checksum checksum, + LSN *res_lsn, void *extra_msg); +int _ma_write_undo_key_insert(MARIA_HA *info, const MARIA_KEY *key, + my_off_t *root, my_off_t new_root, + LSN *res_lsn); +my_bool _ma_write_undo_key_delete(MARIA_HA *info, const MARIA_KEY *key, + my_off_t new_root, LSN *res_lsn); +my_bool write_hook_for_clr_end(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + void *hook_arg); +extern my_bool write_hook_for_undo_key(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +extern my_bool write_hook_for_undo_key_insert(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +extern my_bool write_hook_for_undo_key_delete(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); + +my_bool _ma_log_prefix(MARIA_PAGE *page, uint changed_length, int move_length, + enum en_key_debug debug_marker); +my_bool 
_ma_log_suffix(MARIA_PAGE *page, uint org_length, + uint new_length); +my_bool _ma_log_add(MARIA_PAGE *page, uint buff_length, uchar *key_pos, + uint changed_length, int move_length, + my_bool handle_overflow, + enum en_key_debug debug_marker); +my_bool _ma_log_delete(MARIA_PAGE *page, const uchar *key_pos, + uint changed_length, uint move_length, + uint append_length, enum en_key_debug debug_marker); +my_bool _ma_log_change(MARIA_PAGE *page, const uchar *key_pos, uint length, + enum en_key_debug debug_marker); +my_bool _ma_log_new(MARIA_PAGE *page, my_bool root_page); +#ifdef EXTRA_DEBUG_KEY_CHANGES +void _ma_log_key_changes(MARIA_PAGE *ma_page, LEX_CUSTRING *log_array, + uchar *log_pos, uint *changed_length, + uint *translog_parts); +#else +#define _ma_log_key_changes(A,B,C,D,E) +#endif + +uint _ma_apply_redo_index_new_page(MARIA_HA *info, LSN lsn, + const uchar *header, uint length); +uint _ma_apply_redo_index_free_page(MARIA_HA *info, LSN lsn, + const uchar *header); +uint _ma_apply_redo_index(MARIA_HA *info, + LSN lsn, const uchar *header, uint length); + +my_bool _ma_apply_undo_key_insert(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length); +my_bool _ma_apply_undo_key_delete(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length, + my_bool with_root); + +static inline void _ma_finalize_row(MARIA_HA *info) +{ + info->trn->rec_lsn= LSN_IMPOSSIBLE; +} + +/* unpinning is often the last operation before finalizing */ + +static inline void _ma_unpin_all_pages_and_finalize_row(MARIA_HA *info, + LSN undo_lsn) +{ + _ma_unpin_all_pages(info, undo_lsn); + _ma_finalize_row(info); +} + +extern my_bool _ma_lock_key_del(MARIA_HA *info, my_bool insert_at_end); +extern void _ma_unlock_key_del(MARIA_HA *info); +static inline void _ma_fast_unlock_key_del(MARIA_HA *info) +{ + if (info->key_del_used) /* _ma_unlock_key_del() asserts that this is set */ + _ma_unlock_key_del(info); +} diff --git a/storage/maria/ma_keycache.c b/storage/maria/ma_keycache.c new file mode 100644 index 00000000000..39fc7d421ae --- 
/dev/null +++ b/storage/maria/ma_keycache.c @@ -0,0 +1,164 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Key cache assignments +*/ + +#include "maria_def.h" + +/* + Assign pages of the index file for a table to a key cache + + SYNOPSIS + maria_assign_to_pagecache() + info open table + key_map map of indexes to assign to the key cache (unused) + pagecache pointer to the key cache handle + + PREREQUISITES + One must have a READ lock or a WRITE lock on the table when calling + the function to ensure that there are no other writers to it. + + The caller must also ensure that one doesn't call this function from + two different threads with the same table. + + NOTES + At present pages for all indexes must be assigned to the same key cache. + In future only pages for indexes specified in the key_map parameter + of the table will be assigned to the specified key cache. 
+ + RETURN VALUE + 0 On success (also when the table already uses pagecache) + # Error code (my_errno of the failing step) +*/ + +int maria_assign_to_pagecache(MARIA_HA *info, + ulonglong key_map __attribute__((unused)), + PAGECACHE *pagecache) +{ + int error= 0; + MARIA_SHARE* share= info->s; + DBUG_ENTER("maria_assign_to_pagecache"); + DBUG_PRINT("enter", + ("old_pagecache_handle: 0x%lx new_pagecache_handle: 0x%lx", + (long) share->pagecache, (long) pagecache)); + + /* + Skip operation if we didn't change key cache. This can happen if we + call this for all open instances of the same table + */ + if (share->pagecache == pagecache) + DBUG_RETURN(0); + + /* + First flush all blocks for the table in the old key cache. + This is to ensure that the disk is consistent with the data pages + in memory (which may not be the case if the table uses delayed_key_write) + + Note that some other read thread may still fill in the key cache with + new blocks during this call and after, but this doesn't matter as + all threads will start using the new key cache for their next call to + maria library and we know that there will not be any changed blocks + in the old key cache. + */ + + if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE)) + { + error= my_errno; + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); /* Mark that table must be checked */ + } + + /* + Flush the new key cache for this file. This is needed to ensure + that there are no old blocks (with outdated data) left in the new key + cache from an earlier assign_to_keycache operation + + (This can never fail as there is never any not written data in the + new key cache) + */ + (void) flush_pagecache_blocks(pagecache, &share->kfile, FLUSH_RELEASE); + + /* + ensure that setting the key cache and changing the multi_pagecache + is done atomically + */ + pthread_mutex_lock(&share->intern_lock); + /* + Tell all threads to use the new key cache + This should be seen at the latest for the next call to a maria function. 
+ */ + share->pagecache= pagecache; + + /* store the key cache in the global hash structure for future opens */ + if (multi_pagecache_set((uchar*) share->unique_file_name.str, + share->unique_file_name.length, + share->pagecache)) + error= my_errno; + pthread_mutex_unlock(&share->intern_lock); + DBUG_RETURN(error); +} + + +/* + Change all MARIA entries that use one key cache to another key cache + + SYNOPSIS + maria_change_pagecache() + old_pagecache Old key cache + new_pagecache New key cache + + NOTES + This is used when we delete one key cache. + + To handle the case where some other thread tries to open a MARIA + table associated with the to-be-deleted key cache while this operation + is running, we have to call 'multi_pagecache_change()' from this + function while we have a lock on the MARIA table list structure. + + This is safe as long as it's only MARIA that is using this specific + key cache. +*/ + + +void maria_change_pagecache(PAGECACHE *old_pagecache, + PAGECACHE *new_pagecache) +{ + LIST *pos; + DBUG_ENTER("maria_change_pagecache"); + + /* + Lock list to ensure that no one can close the table while we manipulate it + */ + pthread_mutex_lock(&THR_LOCK_maria); + for (pos=maria_open_list ; pos ; pos=pos->next) + { + MARIA_HA *info= (MARIA_HA*) pos->data; + MARIA_SHARE *share= info->s; + if (share->pagecache == old_pagecache) + maria_assign_to_pagecache(info, (ulonglong) ~0, new_pagecache); + } + + /* + We have to do the following call while we have the lock on the + MARIA list structure to ensure that another thread is not trying to + open a new table that will be associated with the old key cache + */ + multi_pagecache_change(old_pagecache, new_pagecache); + pthread_mutex_unlock(&THR_LOCK_maria); + DBUG_VOID_RETURN; +} diff --git a/storage/maria/ma_locking.c b/storage/maria/ma_locking.c new file mode 100644 index 00000000000..6bb308e5959 --- /dev/null +++ b/storage/maria/ma_locking.c @@ -0,0 +1,554 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB 
& TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Locking of Maria-tables. + Must be first request before doing any further calls to any Maria function. + Is used to allow many processes to use the same non transactional Maria table +*/ + +#include "ma_ftdefs.h" + + /* + lock table by F_UNLCK, F_RDLCK or F_WRLCK + + Returns 0 at once if the table is read-only or already holds lock_type. + F_EXTRA_LCK (used by TMP tables) only increments w_locks and tot_locks, + without taking share->intern_lock. + Otherwise returns 0 on success or the error code of the failing step. + */ + +int maria_lock_database(MARIA_HA *info, int lock_type) +{ + int error; + uint count; + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_lock_database"); + DBUG_PRINT("enter",("lock_type: %d old lock %d r_locks: %u w_locks: %u " + "global_changed: %d open_count: %u name: '%s'", + lock_type, info->lock_type, share->r_locks, + share->w_locks, + share->global_changed, share->state.open_count, + share->index_file_name.str)); + if (share->options & HA_OPTION_READ_ONLY_DATA || + info->lock_type == lock_type) + DBUG_RETURN(0); + if (lock_type == F_EXTRA_LCK) /* Used by TMP tables */ + { + ++share->w_locks; + ++share->tot_locks; + info->lock_type= lock_type; + DBUG_RETURN(0); + } + + error=0; + pthread_mutex_lock(&share->intern_lock); + if (share->kfile.file >= 0) /* May only be false on windows */ + { + switch (lock_type) { + case F_UNLCK: + maria_ftparser_call_deinitializer(info); + if (info->lock_type == F_RDLCK) + { + count= --share->r_locks; + if (share->lock_restore_status) + (*share->lock_restore_status)(info); + } + else + { + count= 
--share->w_locks; + if (share->lock.update_status) + _ma_update_status_with_lock(info); + } + --share->tot_locks; + if (info->lock_type == F_WRLCK && !share->w_locks) + { + /* pages of transactional tables get flushed at Checkpoint */ + if (!share->base.born_transactional && !share->temporary && + _ma_flush_table_files(info, + share->delay_key_write ? MARIA_FLUSH_DATA : + MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_KEEP, FLUSH_KEEP)) + error= my_errno; + } + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + if (end_io_cache(&info->rec_cache)) + { + error=my_errno; + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); + } + } + if (!count) + { + DBUG_PRINT("info",("changed: %u w_locks: %u", + (uint) share->changed, share->w_locks)); + if (share->changed && !share->w_locks) + { +#ifdef HAVE_MMAP + if ((share->mmaped_length != + share->state.state.data_file_length) && + (share->nonmmaped_inserts > MAX_NONMAPPED_INSERTS)) + { + if (share->lock_key_trees) + rw_wrlock(&share->mmap_lock); + _ma_remap_file(info, share->state.state.data_file_length); + share->nonmmaped_inserts= 0; + if (share->lock_key_trees) + rw_unlock(&share->mmap_lock); + } +#endif +#ifdef EXTERNAL_LOCKING + share->state.process= share->last_process=share->this_process; + share->state.unique= info->last_unique= info->this_unique; + share->state.update_count= info->last_loop= ++info->this_loop; +#endif + /* transactional tables rather flush their state at Checkpoint */ + if (!share->base.born_transactional) + { + if (_ma_state_info_write_sub(share->kfile.file, &share->state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)) + error= my_errno; + else + { + /* A value of 0 below means "state flushed" */ + share->changed= 0; + } + } + if (maria_flush) + { + if (_ma_sync_table_files(info)) + error= my_errno; + } + else + share->not_flushed=1; + if (error) + { + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); + } + } + } + info->opt_flag&= 
~(READ_CACHE_USED | WRITE_CACHE_USED); + info->lock_type= F_UNLCK; + break; + case F_RDLCK: + if (info->lock_type == F_WRLCK) + { + /* + Change RW to READONLY + + mysqld does not turn write locks to read locks, + so we're never here in mysqld. + */ + share->w_locks--; + share->r_locks++; + info->lock_type=lock_type; + break; + } +#ifdef MARIA_EXTERNAL_LOCKING + if (!share->r_locks && !share->w_locks) + { + /* note that a transactional table should not do this */ + if (_ma_state_info_read_dsk(share->kfile.file, &share->state)) + { + error=my_errno; + break; + } + } +#endif + VOID(_ma_test_if_changed(info)); + share->r_locks++; + share->tot_locks++; + info->lock_type=lock_type; + break; + case F_WRLCK: + if (info->lock_type == F_RDLCK) + { /* Change READONLY to RW */ + if (share->r_locks == 1) + { + share->r_locks--; + share->w_locks++; + info->lock_type=lock_type; + break; + } + } +#ifdef MARIA_EXTERNAL_LOCKING + if (!(share->options & HA_OPTION_READ_ONLY_DATA)) + { + if (!share->w_locks) + { + if (!share->r_locks) + { + /* + Note that transactional tables should not do this. + If we enabled this code, we should make sure to skip it if + born_transactional is true. We should not test + now_transactional to decide if we can call + _ma_state_info_read_dsk(), because it can temporarily be 0 + (TRUNCATE on a partitioned table) and thus it would make a state + modification below without mutex, confusing a concurrent + checkpoint running. + Even if this code was enabled only for non-transactional tables: + in scenario LOCK TABLE t1 WRITE; INSERT INTO t1; DELETE FROM t1; + state on disk read by DELETE is obsolete as it was not flushed + at the end of INSERT. MyISAM same. It however causes no issue as + maria_delete_all_rows() calls _ma_reset_status() thus is not + influenced by the obsolete read values. 
+ */ + if (_ma_state_info_read_dsk(share->kfile.file, &share->state)) + { + error=my_errno; + break; + } + } + } + } +#endif /* defined(MARIA_EXTERNAL_LOCKING) */ + VOID(_ma_test_if_changed(info)); + + info->lock_type=lock_type; + info->invalidator=share->invalidator; + share->w_locks++; + share->tot_locks++; + break; + default: + DBUG_ASSERT(0); + break; /* Impossible */ + } + } +#ifdef __WIN__ + else + { + /* + Check for bad file descriptors if this table is part + of a merge union. Failing to capture this may cause + a crash on windows if the table is renamed and + later on referenced by the merge table. + */ + if( info->owned_by_merge && (info->s)->kfile.file < 0 ) + { + error = HA_ERR_NO_SUCH_TABLE; + } + } +#endif + pthread_mutex_unlock(&share->intern_lock); + DBUG_RETURN(error); +} /* maria_lock_database */ + + +/**************************************************************************** + ** functions to read / write the state +****************************************************************************/ + +int _ma_readinfo(register MARIA_HA *info __attribute__ ((unused)), + int lock_type __attribute__ ((unused)), + int check_keybuffer __attribute__ ((unused))) +{ +#ifdef MARIA_EXTERNAL_LOCKING + DBUG_ENTER("_ma_readinfo"); + + if (info->lock_type == F_UNLCK) + { + MARIA_SHARE *share= info->s; + if (!share->tot_locks) + { + /* should not be done for transactional tables */ + if (_ma_state_info_read_dsk(share->kfile.file, &share->state)) + { + if (!my_errno) + my_errno= HA_ERR_FILE_TOO_SHORT; + DBUG_RETURN(1); + } + } + if (check_keybuffer) + VOID(_ma_test_if_changed(info)); + info->invalidator=share->invalidator; + } + else if (lock_type == F_WRLCK && info->lock_type == F_RDLCK) + { + my_errno=EACCES; /* Not allowed to change */ + DBUG_RETURN(-1); /* when have read_lock() */ + } + DBUG_RETURN(0); +#else + return 0; /* Nothing to do without MARIA_EXTERNAL_LOCKING */ +#endif /* defined(MARIA_EXTERNAL_LOCKING) */ +} /* _ma_readinfo */ + + +/* + Every isam-function that uppdates the isam-database MUST end with 
this + request + + NOTES + my_errno is not changed if this succeeds! +*/ + +int _ma_writeinfo(register MARIA_HA *info, uint operation) +{ + int error,olderror; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_writeinfo"); + DBUG_PRINT("info",("operation: %u tot_locks: %u", operation, + share->tot_locks)); + + error=0; + if (share->tot_locks == 0 && !share->base.born_transactional) + { + /* transactional tables flush their state at Checkpoint */ + if (operation) + { /* Two threads can't be here */ + olderror= my_errno; /* Remember last error */ + +#ifdef EXTERNAL_LOCKING + /* + The following only makes sense if we want to allow two different + processes to access the same table at the same time + */ + share->state.process= share->last_process= share->this_process; + share->state.unique= info->last_unique= info->this_unique; + share->state.update_count= info->last_loop= ++info->this_loop; +#endif + + if ((error= + _ma_state_info_write_sub(share->kfile.file, + &share->state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET))) + olderror=my_errno; +#ifdef __WIN__ + if (maria_flush) + { + _commit(share->kfile.file); + _commit(info->dfile.file); + } +#endif + my_errno=olderror; + } + } + else if (operation) + share->changed= 1; /* Mark keyfile changed */ + DBUG_RETURN(error); +} /* _ma_writeinfo */ + + +/* + Test if an external process has changed the database + (Should be called after readinfo) +*/ + +int _ma_test_if_changed(register MARIA_HA *info) +{ +#ifdef EXTERNAL_LOCKING + MARIA_SHARE *share= info->s; + if (share->state.process != share->last_process || + share->state.unique != info->last_unique || + share->state.update_count != info->last_loop) + { /* Keyfile has changed */ + DBUG_PRINT("info",("index file changed")); + if (share->state.process != share->this_process) + VOID(flush_pagecache_blocks(share->pagecache, &share->kfile, + FLUSH_RELEASE)); + share->last_process=share->state.process; + info->last_unique= share->state.unique; + info->last_loop= 
share->state.update_count; + info->update|= HA_STATE_WRITTEN; /* Must use file on next */ + info->data_changed= 1; /* For maria_is_changed */ + return 1; + } +#endif + return (!(info->update & HA_STATE_AKTIV) || + (info->update & (HA_STATE_WRITTEN | HA_STATE_DELETED | + HA_STATE_KEY_CHANGED))); +} /* _ma_test_if_changed */ + + +/* + Put a mark in the .MAI file that someone is updating the table + + DOCUMENTATION + state.open_count in the .MAI file is used the following way: + - For the first change of the .MAI file in this process open_count is + incremented by _ma_mark_file_changed(). (We have a write lock on the file + when this happens) + - In maria_close() it's decremented by _ma_decrement_open_count() if it + was incremented in the same process. + + This means that if we are the only process using the file, the open_count + tells us if the MARIA file wasn't properly closed. (This is true if + my_disable_locking is set). + + open_count is not maintained on disk for temporary tables. +*/ + +#define _MA_ALREADY_MARKED_FILE_CHANGED \ + ((share->state.changed & STATE_CHANGED) && share->global_changed) + +int _ma_mark_file_changed(MARIA_HA *info) +{ + uchar buff[3]; + register MARIA_SHARE *share= info->s; + int error= 1; + DBUG_ENTER("_ma_mark_file_changed"); + + if (_MA_ALREADY_MARKED_FILE_CHANGED) + DBUG_RETURN(0); + pthread_mutex_lock(&share->intern_lock); /* recheck under mutex */ + if (! _MA_ALREADY_MARKED_FILE_CHANGED) + { + share->state.changed|=(STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS); + if (!share->global_changed) + { + share->global_changed=1; + share->state.open_count++; + } + /* + Temp tables don't need an open_count as they are removed on crash. + In theory transactional tables are fixed by log-based recovery, so don't + need an open_count either, but if recovery has failed and logs have been + removed (by maria-force-start-after-recovery-failures), we still need to + detect dubious tables. 
+ If we didn't maintain open_count on disk for a table, after a crash + we wouldn't know if it was closed at crash time (thus does not need a + check) or not. So we would have to check all tables: overkill. + */ + if (!share->temporary) + { + mi_int2store(buff,share->state.open_count); + buff[2]=1; /* Mark that it's changed */ + if (my_pwrite(share->kfile.file, buff, sizeof(buff), + sizeof(share->state.header) + + MARIA_FILE_OPEN_COUNT_OFFSET, + MYF(MY_NABP))) + goto err; + } + /* Set uuid of file if not yet set (zerofilled file) */ + if (share->base.born_transactional && + !(share->state.changed & STATE_NOT_MOVABLE)) + { + /* Lock table to current installation */ + if (_ma_set_uuid(info, 0) || + (share->state.create_rename_lsn == LSN_NEEDS_NEW_STATE_LSNS && + _ma_update_state_lsns_sub(share, LSN_IMPOSSIBLE, + trnman_get_min_trid(), + TRUE, TRUE))) + goto err; + share->state.changed|= STATE_NOT_MOVABLE; + } + } + error= 0; +err: + pthread_mutex_unlock(&share->intern_lock); + DBUG_RETURN(error); +#undef _MA_ALREADY_MARKED_FILE_CHANGED +} + +/* + Check that a region is all zero + + SYNOPSIS + _ma_check_if_zero() + pos Start of memory to check + length length of memory region + + RETURN + 0 All bytes in the region are zero + 1 A non-zero byte was found + + NOTES + Used mainly to detect rows with wrong extent information +*/ + +my_bool _ma_check_if_zero(uchar *pos, size_t length) +{ + uchar *end; + for (end= pos+ length; pos != end ; pos++) + if (pos[0] != 0) + return 1; + return 0; +} + +/* + This is only called by close or by extra(HA_FLUSH) if the OS has the pwrite() + call. In this context the following code should be safe! + + Returns 0 if both locking and the write succeeded, 1 otherwise. + */ + +int _ma_decrement_open_count(MARIA_HA *info) +{ + uchar buff[2]; + register MARIA_SHARE *share= info->s; + int lock_error=0,write_error=0; + if (share->global_changed) + { + uint old_lock=info->lock_type; + share->global_changed=0; + lock_error= my_disable_locking ? 0 : maria_lock_database(info, F_WRLCK); + /* It's not fatal even if we couldn't get the lock ! 
*/ + if (share->state.open_count > 0) + { + share->state.open_count--; + share->changed= 1; /* We have to update state */ + if (!share->temporary) + { + mi_int2store(buff,share->state.open_count); + write_error= (int) my_pwrite(share->kfile.file, buff, sizeof(buff), + sizeof(share->state.header) + + MARIA_FILE_OPEN_COUNT_OFFSET, + MYF(MY_NABP)); + } + } + if (!lock_error && !my_disable_locking) + lock_error=maria_lock_database(info,old_lock); + } + return test(lock_error || write_error); +} + + +/** @brief mark file as crashed */ + +void _ma_mark_file_crashed(MARIA_SHARE *share) +{ + uchar buff[2]; + DBUG_ENTER("_ma_mark_file_crashed"); + + share->state.changed|= STATE_CRASHED; + mi_int2store(buff, share->state.changed); + /* + We can ignore the errors, as if the mark failed, there isn't anything + else we can do; The user should already have got an error that the + table was crashed. + */ + (void) my_pwrite(share->kfile.file, buff, sizeof(buff), + sizeof(share->state.header) + + MARIA_FILE_CHANGED_OFFSET, + MYF(MY_NABP)); + DBUG_VOID_RETURN; +} + + +/** + @brief Set uuid for a Maria file + + @fn _ma_set_uuid() + @param info Maria handler + @param reset_uuid Instead of setting file to maria_uuid, set it to + 0 to mark it as movable + + @return Result of my_pwrite() cast to my_bool; non-zero is treated as + an error by _ma_mark_file_changed() +*/ + +my_bool _ma_set_uuid(MARIA_HA *info, my_bool reset_uuid) +{ + uchar buff[MY_UUID_SIZE], *uuid; + + uuid= maria_uuid; + if (reset_uuid) + { + bzero(buff, sizeof(buff)); + uuid= buff; + } + return (my_bool) my_pwrite(info->s->kfile.file, uuid, MY_UUID_SIZE, + mi_uint2korr(info->s->state.header.base_pos), + MYF(MY_NABP)); +} diff --git a/storage/maria/ma_loghandler.c b/storage/maria/ma_loghandler.c new file mode 100644 index 00000000000..dc99554a08d --- /dev/null +++ b/storage/maria/ma_loghandler.c @@ -0,0 +1,9316 @@ +/* Copyright (C) 2007 MySQL AB & Sanja Belkin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software 
/*
  Transaction log file descriptor.

  One such object describes one log file; pointers to them are kept in
  log_descriptor.open_files.
*/
typedef struct st_translog_file
{
  /* Number of the log file (get_logfile_by_number() looks files up by it) */
  uint32 number;
  /* Page cache file; handler.file is the OS level file descriptor */
  PAGECACHE_FILE handler;
  /* NOTE(review): presumably set when the file went through recovery - confirm */
  my_bool was_recovered;
  /* NOTE(review): presumably true when the file needs no further sync - confirm */
  my_bool is_sync;
} TRANSLOG_FILE;
/* Log write buffer descriptor */
struct st_translog_buffer
{
  /*
    Cache for the current log. Comes first in the structure so that it is
    aligned for bmove512() in pagecache_inject()
  */
  uchar buffer[TRANSLOG_WRITE_BUFFER];
  /*
    Maximum LSN of the records which end in this buffer (or LSN_IMPOSSIBLE
    if no record ends here)
  */
  LSN last_lsn;
  /* last_lsn of the previous buffer, or LSN_IMPOSSIBLE for the very first one */
  LSN prev_last_lsn;
  /* Offset of this buffer in the file */
  TRANSLOG_ADDRESS offset;
  /*
    Offset of the next buffer in the file (it is not always offset + size;
    in case of flush by LSN it can be offset + size - TRANSLOG_PAGE_SIZE)
  */
  TRANSLOG_ADDRESS next_buffer_offset;
  /* Offset of the previous buffer; used to detect that its flush has finished */
  TRANSLOG_ADDRESS prev_buffer_offset;
  /*
    If the buffer was forced to close, the value of its horizon at that
    moment; otherwise LSN_IMPOSSIBLE
  */
  TRANSLOG_ADDRESS pre_force_close_horizon;
  /*
     How much is written to this buffer (or will be written by the time
     copy_to_buffer_in_progress becomes 0)
  */
  translog_size_t size;
  /*
    When moving from one log buffer to another, we write the last page of
    the previous buffer to file and then start using the new log buffer.
    If that last page was only partly filled, it is not moved to the start
    of the new buffer; instead 'skipped_data' tells how much data at the
    beginning of this buffer is not relevant.
  */
  uint skipped_data;
  /* File handler for this buffer */
  TRANSLOG_FILE *file;
  /* Threads which are waiting for buffer filling/freeing */
  pthread_cond_t waiting_filling_buffer;
  /* Number of records which are currently being copied into the buffer */
  uint copy_to_buffer_in_progress;
  /* List of threads waiting for the buffer to become ready */
  struct st_my_thread_var *waiting_flush;
  /*
    If true, the previous buffer overlaps with this one (due to a flush of
    the loghandler the last page of that buffer is the same as the first
    page of this buffer) and has to be written first, because it contains
    the old content of the page which is present in both buffers
  */
  my_bool overlay;
  /*
    Number of this buffer; translog_check_cursor() asserts that it equals
    the buffer_no stored in the cursor which points to this buffer
  */
  uint buffer_no;
  /*
    Lock for the buffer.

    The current buffer also locks the whole handler (to lock the handler
    one should lock the current buffer).

    Buffers are locked in one direction only (with wrap-around, beginning
    from the first buffer): while holding the lock on buffer N we may lock
    only buffer N+1 (never N-1).

    One thread never locks more than 2 buffers at a time, so a deadlock
    would need N threads (N being the number of buffers) each holding one
    buffer and trying to lock the next. That cannot happen because there
    are only 2 cases in which a thread takes 2 buffers: 1) a thread
    finishes the current buffer (where the horizon is) and starts the next
    one (to which the horizon moves); 2) a flush starts from the buffer
    after the current one (the oldest) and goes up to the current one,
    crabbing along the buffer sequence. And there is only one flush at any
    moment (flushes are serialised).

    Because of the above, all buffers can never be locked simultaneously.
    NOTE(review): the original text of this comment said the number of
    buffers is 5, while TRANSLOG_BUFFERS_NO is defined as 8 in this file.
  */
  pthread_mutex_t mutex;
  /*
    Some thread is going to close the buffer, and the closing should be
    done only by that thread
  */
  my_bool is_closing_buffer;
  /*
    Version of the buffer; increased every time the buffer is flushed.
    Together with file and offset it allows detecting buffer changes
  */
  uint8 ver;

  /*
    When the previous buffer is sent to disk it stores its address here, so
    that it is possible to detect when that is done (it has to be kept in
    this buffer so that buffers are locked in one direction only).
  */
  TRANSLOG_ADDRESS prev_sent_to_disk;
  /* NOTE(review): presumably signalled when prev_sent_to_disk changes - confirm */
  pthread_cond_t prev_sent_to_disk_cond;
};
+ */ + uint32 half_buffer_capacity_chunk_2; + /* Page overhead calculated by flags (whether CRC is enabled, etc) */ + uint16 page_overhead; + /* + Page capacity ("useful load") calculated by flags + (TRANSLOG_PAGE_SIZE - page_overhead-1) + */ + uint16 page_capacity_chunk_2; + /* Path to the directory where we store log store files */ + char directory[FN_REFLEN]; + + /* *** Current state of the log handler *** */ + /* list of opened files */ + DYNAMIC_ARRAY open_files; + /* min/max number of file in the array */ + uint32 max_file, min_file; + /* the opened files list guard */ + rw_lock_t open_files_lock; + + /* + File descriptor of the directory where we store log files for syncing + it. + */ + File directory_fd; + /* buffers for log writing */ + struct st_translog_buffer buffers[TRANSLOG_BUFFERS_NO]; + /* Mask where 1 in position N mean that buffer N is not flushed */ + dirty_buffer_mask_t dirty_buffer_mask; + /* The above variable protection */ + pthread_mutex_t dirty_buffer_mask_lock; + /* + horizon - visible end of the log (here is absolute end of the log: + position where next chunk can start + */ + TRANSLOG_ADDRESS horizon; + /* horizon buffer cursor */ + struct st_buffer_cursor bc; + /* maximum LSN of the current (not finished) file */ + LSN max_lsn; + + /* + Last flushed LSN (protected by log_flush_lock). 
+ Pointers in the log ordered like this: + last_lsn_checked <= flushed <= sent_to_disk <= in_buffers_only <= + max_lsn <= horizon + */ + LSN flushed; + /* Last LSN sent to the disk (but maybe not written yet) */ + LSN sent_to_disk; + /* Horizon from which log started after initialization */ + TRANSLOG_ADDRESS log_start; + TRANSLOG_ADDRESS previous_flush_horizon; + /* All what is after this address is not sent to disk yet */ + TRANSLOG_ADDRESS in_buffers_only; + /* protection of sent_to_disk and in_buffers_only */ + pthread_mutex_t sent_to_disk_lock; + /* + Protect flushed (see above) and for flush serialization (will + be removed in v1.5 + */ + pthread_mutex_t log_flush_lock; + pthread_cond_t log_flush_cond; + pthread_cond_t new_goal_cond; + + /* Protects changing of headers of finished files (max_lsn) */ + pthread_mutex_t file_header_lock; + + /* + Sorted array (with protection) of files where we started writing process + and so we can't give last LSN yet + */ + pthread_mutex_t unfinished_files_lock; + DYNAMIC_ARRAY unfinished_files; + + /* + minimum number of still need file calculeted during last + translog_purge call + */ + uint32 min_need_file; + /* Purger data: minimum file in the log (or 0 if unknown) */ + uint32 min_file_number; + /* Protect purger from many calls and it's data */ + pthread_mutex_t purger_lock; + /* last low water mark checked */ + LSN last_lsn_checked; + /** + Must be set to 0 under loghandler lock every time a new LSN + is generated. 
+ */ + my_bool is_everything_flushed; + /* True when flush pass is in progress */ + my_bool flush_in_progress; + /* The flush number (used to distinguish two flushes goes one by one) */ + volatile int flush_no; + /* Next flush pass variables */ + TRANSLOG_ADDRESS next_pass_max_lsn; + pthread_t max_lsn_requester; +}; + +static struct st_translog_descriptor log_descriptor; + +ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE; +ulong log_file_size= TRANSLOG_FILE_SIZE; +/* sync() of log files directory mode */ +ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE; +ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE; +ulong maria_group_commit_interval= 0; + +/* Marker for end of log */ +static uchar end_of_log= 0; +#define END_OF_LOG &end_of_log +/** + Switch for "soft" sync (no real sync() but periodical sync by service + thread) +*/ +static volatile my_bool soft_sync= FALSE; +/** + Switch for "hard" group commit mode +*/ +static volatile my_bool hard_group_commit= FALSE; +/** + File numbers interval which have to be sync() +*/ +static uint32 soft_sync_min= 0; +static uint32 soft_sync_max= 0; +static uint32 soft_need_sync= 1; +/** + stores interval in microseconds +*/ +static uint32 group_commit_wait= 0; + +enum enum_translog_status translog_status= TRANSLOG_UNINITED; +ulonglong translog_syncs= 0; /* Number of sync()s */ + +/* time of last flush */ +static ulonglong flush_start= 0; + +/* chunk types */ +#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */ +#define TRANSLOG_CHUNK_FIXED (1 << 6) /* 1 (pseudo)fixed record (also LSN) */ +#define TRANSLOG_CHUNK_NOHDR (2 << 6) /* 2 no head chunk (till page end) */ +#define TRANSLOG_CHUNK_LNGTH (3 << 6) /* 3 chunk with chunk length */ +#define TRANSLOG_CHUNK_TYPE (3 << 6) /* Mask to get chunk type */ +#define TRANSLOG_REC_TYPE 0x3F /* Mask to get record type */ +#define TRANSLOG_CHUNK_0_CONT 0x3F /* the type to mark chunk 0 continue */ + +/* compressed (relative) LSN constants */ +#define TRANSLOG_CLSN_LEN_BITS 0xC0 /* 
Mask to get compressed LSN length */ + + +#include <my_atomic.h> +/* an array that maps id of a MARIA_SHARE to this MARIA_SHARE */ +static MARIA_SHARE **id_to_share= NULL; +/* lock for id_to_share */ +static my_atomic_rwlock_t LOCK_id_to_share; + +static my_bool translog_dummy_callback(uchar *page, + pgcache_page_no_t page_no, + uchar* data_ptr); +static my_bool translog_page_validator(uchar *page, + pgcache_page_no_t page_no, + uchar* data_ptr); + +static my_bool translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner); +static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected); +LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon); + + +/* + Initialize log_record_type_descriptors +*/ + +LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES]; + + +#ifndef DBUG_OFF + +#define translog_buffer_lock_assert_owner(B) \ + safe_mutex_assert_owner(&(B)->mutex) +#define translog_lock_assert_owner() \ + safe_mutex_assert_owner(&log_descriptor.bc.buffer->mutex) +void translog_lock_handler_assert_owner() +{ + translog_lock_assert_owner(); +} + +/** + @brief check the description table validity + + @param num how many records should be filled +*/ + +static void check_translog_description_table(int num) +{ + int i; + DBUG_ENTER("check_translog_description_table"); + DBUG_PRINT("enter", ("last record: %d", num)); + DBUG_ASSERT(num > 0); + /* last is reserved for extending the table */ + DBUG_ASSERT(num < LOGREC_NUMBER_OF_TYPES - 1); + DBUG_ASSERT(log_record_type_descriptor[0].rclass == LOGRECTYPE_NOT_ALLOWED); + + for (i= 0; i <= num; i++) + { + DBUG_PRINT("info", + ("record type: %d class: %d fixed: %u header: %u LSNs: %u " + "name: %s", + i, log_record_type_descriptor[i].rclass, + (uint)log_record_type_descriptor[i].fixed_length, + (uint)log_record_type_descriptor[i].read_header_len, + (uint)log_record_type_descriptor[i].compressed_LSN, + log_record_type_descriptor[i].name)); + switch (log_record_type_descriptor[i].rclass) { + case 
LOGRECTYPE_NOT_ALLOWED: + DBUG_ASSERT(i == 0); + break; + case LOGRECTYPE_VARIABLE_LENGTH: + DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == 0); + DBUG_ASSERT((log_record_type_descriptor[i].compressed_LSN == 0) || + ((log_record_type_descriptor[i].compressed_LSN == 1) && + (log_record_type_descriptor[i].read_header_len >= + LSN_STORE_SIZE)) || + ((log_record_type_descriptor[i].compressed_LSN == 2) && + (log_record_type_descriptor[i].read_header_len >= + LSN_STORE_SIZE * 2))); + break; + case LOGRECTYPE_PSEUDOFIXEDLENGTH: + DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == + log_record_type_descriptor[i].read_header_len); + DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN > 0); + DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN <= 2); + break; + case LOGRECTYPE_FIXEDLENGTH: + DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == + log_record_type_descriptor[i].read_header_len); + DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN == 0); + break; + default: + DBUG_ASSERT(0); + } + } + for (i= num + 1; i < LOGREC_NUMBER_OF_TYPES; i++) + { + DBUG_ASSERT(log_record_type_descriptor[i].rclass == + LOGRECTYPE_NOT_ALLOWED); + } + DBUG_VOID_RETURN; +} +#else +#define translog_buffer_lock_assert_owner(B) {} +#define translog_lock_assert_owner() {} +#endif + +static LOG_DESC INIT_LOGREC_RESERVED_FOR_CHUNKS23= +{LOGRECTYPE_NOT_ALLOWED, 0, 0, NULL, NULL, NULL, 0, + "reserved", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL }; + +static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_HEAD= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_insert_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_TAIL= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_insert_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC 
INIT_LOGREC_REDO_NEW_ROW_HEAD= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_new_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_NEW_ROW_TAIL= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_new_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_BLOBS= +{LOGRECTYPE_VARIABLE_LENGTH, 0, FILEID_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_insert_row_blobs", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_HEAD= +{LOGRECTYPE_FIXEDLENGTH, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_redo, NULL, 0, + "redo_purge_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_TAIL= +{LOGRECTYPE_FIXEDLENGTH, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_redo, NULL, 0, + "redo_purge_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_FREE_BLOCKS= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE, + NULL, write_hook_for_redo, NULL, 0, + "redo_free_blocks", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_FREE_HEAD_OR_TAIL= +{LOGRECTYPE_FIXEDLENGTH, + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + NULL, write_hook_for_redo, NULL, 0, + "redo_free_head_or_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +/* not yet used; for when we have versioning */ +static LOG_DESC INIT_LOGREC_REDO_DELETE_ROW= +{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0, + "redo_delete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +/** @todo 
RECOVERY BUG unused, remove? */ +static LOG_DESC INIT_LOGREC_REDO_UPDATE_ROW_HEAD= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0, + "redo_update_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INDEX= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0, + "redo_index", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INDEX_NEW_PAGE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1, + NULL, write_hook_for_redo, NULL, 0, + "redo_index_new_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INDEX_FREE_PAGE= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2, + FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2, + NULL, write_hook_for_redo, NULL, 0, + "redo_index_free_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_UNDELETE_ROW= +{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0, + "redo_undelete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_CLR_END= +{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE + FILEID_STORE_SIZE + + CLR_TYPE_STORE_SIZE, NULL, write_hook_for_clr_end, NULL, 1, + "clr_end", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_PURGE_END= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, NULL, NULL, 1, + "purge_end", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_ROW_INSERT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_undo_row_insert, NULL, 1, + "undo_row_insert", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_ROW_DELETE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_undo_row_delete, NULL, 1, + "undo_row_delete", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static 
LOG_DESC INIT_LOGREC_UNDO_ROW_UPDATE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_undo_row_update, NULL, 1, + "undo_row_update", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE, + NULL, write_hook_for_undo_key_insert, NULL, 1, + "undo_key_insert", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +/* This will never be in the log, only in the clr */ +static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT_WITH_ROOT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE, + NULL, write_hook_for_undo_key, NULL, 1, + "undo_key_insert_with_root", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE, + NULL, write_hook_for_undo_key_delete, NULL, 1, + "undo_key_delete", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE, + NULL, write_hook_for_undo_key_delete, NULL, 1, + "undo_key_delete_with_root", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_PREPARE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "prepare", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_PREPARE_WITH_UNDO_PURGE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE, NULL, NULL, NULL, 1, + "prepare_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_COMMIT= +{LOGRECTYPE_FIXEDLENGTH, 0, 0, NULL, + write_hook_for_commit, NULL, 0, "commit", LOGREC_IS_GROUP_ITSELF, NULL, + NULL}; + +static LOG_DESC INIT_LOGREC_COMMIT_WITH_UNDO_PURGE= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, write_hook_for_commit, NULL, 1, + 
"commit_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_CHECKPOINT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "checkpoint", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_CREATE_TABLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 1 + 2, NULL, NULL, NULL, 0, +"redo_create_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_RENAME_TABLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "redo_rename_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_DROP_TABLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "redo_drop_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_DELETE_ALL= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE, + NULL, write_hook_for_redo_delete_all, NULL, 0, + "redo_delete_all", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_REPAIR_TABLE= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + 8 + 8, FILEID_STORE_SIZE + 8 + 8, + NULL, NULL, NULL, 0, + "redo_repair_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_FILE_ID= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 2, NULL, write_hook_for_file_id, NULL, 0, + "file_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_LONG_TRANSACTION_ID= +{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0, + "long_transaction_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_INCOMPLETE_LOG= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE, + NULL, NULL, NULL, 0, + "incomplete_log", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_INCOMPLETE_GROUP= +{LOGRECTYPE_FIXEDLENGTH, 0, 0, + NULL, NULL, NULL, 0, + "incomplete_group", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_BULK_INSERT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE, + NULL, write_hook_for_undo_bulk_insert, 
NULL, 1, + "undo_bulk_insert", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_BITMAP_NEW_PAGE= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2, + FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2, + NULL, NULL, NULL, 0, + "redo_create_bitmap", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_IMPORTED_TABLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "imported_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_DEBUG_INFO= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "info", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +const myf log_write_flags= MY_WME | MY_NABP | MY_WAIT_IF_FULL; + +void translog_table_init() +{ + int i; + log_record_type_descriptor[LOGREC_RESERVED_FOR_CHUNKS23]= + INIT_LOGREC_RESERVED_FOR_CHUNKS23; + log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_HEAD]= + INIT_LOGREC_REDO_INSERT_ROW_HEAD; + log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_TAIL]= + INIT_LOGREC_REDO_INSERT_ROW_TAIL; + log_record_type_descriptor[LOGREC_REDO_NEW_ROW_HEAD]= + INIT_LOGREC_REDO_NEW_ROW_HEAD; + log_record_type_descriptor[LOGREC_REDO_NEW_ROW_TAIL]= + INIT_LOGREC_REDO_NEW_ROW_TAIL; + log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_BLOBS]= + INIT_LOGREC_REDO_INSERT_ROW_BLOBS; + log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_HEAD]= + INIT_LOGREC_REDO_PURGE_ROW_HEAD; + log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_TAIL]= + INIT_LOGREC_REDO_PURGE_ROW_TAIL; + log_record_type_descriptor[LOGREC_REDO_FREE_BLOCKS]= + INIT_LOGREC_REDO_FREE_BLOCKS; + log_record_type_descriptor[LOGREC_REDO_FREE_HEAD_OR_TAIL]= + INIT_LOGREC_REDO_FREE_HEAD_OR_TAIL; + log_record_type_descriptor[LOGREC_REDO_DELETE_ROW]= + INIT_LOGREC_REDO_DELETE_ROW; + log_record_type_descriptor[LOGREC_REDO_UPDATE_ROW_HEAD]= + INIT_LOGREC_REDO_UPDATE_ROW_HEAD; + log_record_type_descriptor[LOGREC_REDO_INDEX]= + INIT_LOGREC_REDO_INDEX; + log_record_type_descriptor[LOGREC_REDO_INDEX_NEW_PAGE]= + 
INIT_LOGREC_REDO_INDEX_NEW_PAGE; + log_record_type_descriptor[LOGREC_REDO_INDEX_FREE_PAGE]= + INIT_LOGREC_REDO_INDEX_FREE_PAGE; + log_record_type_descriptor[LOGREC_REDO_UNDELETE_ROW]= + INIT_LOGREC_REDO_UNDELETE_ROW; + log_record_type_descriptor[LOGREC_CLR_END]= + INIT_LOGREC_CLR_END; + log_record_type_descriptor[LOGREC_PURGE_END]= + INIT_LOGREC_PURGE_END; + log_record_type_descriptor[LOGREC_UNDO_ROW_INSERT]= + INIT_LOGREC_UNDO_ROW_INSERT; + log_record_type_descriptor[LOGREC_UNDO_ROW_DELETE]= + INIT_LOGREC_UNDO_ROW_DELETE; + log_record_type_descriptor[LOGREC_UNDO_ROW_UPDATE]= + INIT_LOGREC_UNDO_ROW_UPDATE; + log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT]= + INIT_LOGREC_UNDO_KEY_INSERT; + log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT_WITH_ROOT]= + INIT_LOGREC_UNDO_KEY_INSERT_WITH_ROOT; + log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE]= + INIT_LOGREC_UNDO_KEY_DELETE; + log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE_WITH_ROOT]= + INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT; + log_record_type_descriptor[LOGREC_PREPARE]= + INIT_LOGREC_PREPARE; + log_record_type_descriptor[LOGREC_PREPARE_WITH_UNDO_PURGE]= + INIT_LOGREC_PREPARE_WITH_UNDO_PURGE; + log_record_type_descriptor[LOGREC_COMMIT]= + INIT_LOGREC_COMMIT; + log_record_type_descriptor[LOGREC_COMMIT_WITH_UNDO_PURGE]= + INIT_LOGREC_COMMIT_WITH_UNDO_PURGE; + log_record_type_descriptor[LOGREC_CHECKPOINT]= + INIT_LOGREC_CHECKPOINT; + log_record_type_descriptor[LOGREC_REDO_CREATE_TABLE]= + INIT_LOGREC_REDO_CREATE_TABLE; + log_record_type_descriptor[LOGREC_REDO_RENAME_TABLE]= + INIT_LOGREC_REDO_RENAME_TABLE; + log_record_type_descriptor[LOGREC_REDO_DROP_TABLE]= + INIT_LOGREC_REDO_DROP_TABLE; + log_record_type_descriptor[LOGREC_REDO_DELETE_ALL]= + INIT_LOGREC_REDO_DELETE_ALL; + log_record_type_descriptor[LOGREC_REDO_REPAIR_TABLE]= + INIT_LOGREC_REDO_REPAIR_TABLE; + log_record_type_descriptor[LOGREC_FILE_ID]= + INIT_LOGREC_FILE_ID; + log_record_type_descriptor[LOGREC_LONG_TRANSACTION_ID]= + 
INIT_LOGREC_LONG_TRANSACTION_ID; + log_record_type_descriptor[LOGREC_INCOMPLETE_LOG]= + INIT_LOGREC_INCOMPLETE_LOG; + log_record_type_descriptor[LOGREC_INCOMPLETE_GROUP]= + INIT_LOGREC_INCOMPLETE_GROUP; + log_record_type_descriptor[LOGREC_UNDO_BULK_INSERT]= + INIT_LOGREC_UNDO_BULK_INSERT; + log_record_type_descriptor[LOGREC_REDO_BITMAP_NEW_PAGE]= + INIT_LOGREC_REDO_BITMAP_NEW_PAGE; + log_record_type_descriptor[LOGREC_IMPORTED_TABLE]= + INIT_LOGREC_IMPORTED_TABLE; + log_record_type_descriptor[LOGREC_DEBUG_INFO]= + INIT_LOGREC_DEBUG_INFO; + + for (i= LOGREC_FIRST_FREE; i < LOGREC_NUMBER_OF_TYPES; i++) + log_record_type_descriptor[i].rclass= LOGRECTYPE_NOT_ALLOWED; +#ifndef DBUG_OFF + check_translog_description_table(LOGREC_FIRST_FREE -1); +#endif +} + + +/* all possible flags page overheads */ +static uint page_overhead[TRANSLOG_FLAGS_NUM]; + +typedef struct st_translog_validator_data +{ + TRANSLOG_ADDRESS *addr; + my_bool was_recovered; +} TRANSLOG_VALIDATOR_DATA; + + +/* + Check cursor/buffer consistence + + SYNOPSIS + translog_check_cursor + cursor cursor which will be checked +*/ + +static void translog_check_cursor(struct st_buffer_cursor *cursor + __attribute__((unused))) +{ + DBUG_ASSERT(cursor->chaser || + ((ulong) (cursor->ptr - cursor->buffer->buffer) == + cursor->buffer->size)); + DBUG_ASSERT(cursor->buffer->buffer_no == cursor->buffer_no); + DBUG_ASSERT((cursor->ptr -cursor->buffer->buffer) %TRANSLOG_PAGE_SIZE == + cursor->current_page_fill % TRANSLOG_PAGE_SIZE); + DBUG_ASSERT(cursor->current_page_fill <= TRANSLOG_PAGE_SIZE); +} + + +/** + @brief switch the loghandler in read only mode in case of write error +*/ + +void translog_stop_writing() +{ + DBUG_ENTER("translog_stop_writing"); + DBUG_PRINT("error", ("errno: %d my_errno: %d", errno, my_errno)); + translog_status= (translog_status == TRANSLOG_SHUTDOWN ? 
+ TRANSLOG_UNINITED : + TRANSLOG_READONLY); + log_descriptor.is_everything_flushed= 1; + log_descriptor.open_flags= O_BINARY | O_RDONLY; + DBUG_ASSERT(0); + DBUG_VOID_RETURN; +} + + +/* + @brief Get file name of the log by log number + + @param file_no Number of the log we want to open + @param path Pointer to buffer where file name will be + stored (must be FN_REFLEN bytes at least) + + @return pointer to path +*/ + +char *translog_filename_by_fileno(uint32 file_no, char *path) +{ + char buff[11], *end; + uint length; + DBUG_ENTER("translog_filename_by_fileno"); + DBUG_ASSERT(file_no <= 0xfffffff); + + /* log_descriptor.directory is already formated */ + end= strxmov(path, log_descriptor.directory, "aria_log.0000000", NullS); + length= (uint) (int10_to_str(file_no, buff, 10) - buff); + strmov(end - length +1, buff); + + DBUG_PRINT("info", ("Path: '%s' path: 0x%lx", path, (ulong) path)); + DBUG_RETURN(path); +} + + +/** + @brief Create log file with given number without cache + + @param file_no Number of the log we want to open + + retval -1 error + retval # file descriptor number +*/ + +static File create_logfile_by_number_no_cache(uint32 file_no) +{ + File file; + char path[FN_REFLEN]; + DBUG_ENTER("create_logfile_by_number_no_cache"); + + if (translog_status != TRANSLOG_OK) + DBUG_RETURN(-1); + + /* TODO: add O_DIRECT to open flags (when buffer is aligned) */ + if ((file= my_create(translog_filename_by_fileno(file_no, path), + 0, O_BINARY | O_RDWR, MYF(MY_WME))) < 0) + { + DBUG_PRINT("error", ("Error %d during creating file '%s'", errno, path)); + translog_stop_writing(); + DBUG_RETURN(-1); + } + if (sync_log_dir >= TRANSLOG_SYNC_DIR_NEWFILE && + sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))) + { + DBUG_PRINT("error", ("Error %d during syncing directory '%s'", + errno, log_descriptor.directory)); + translog_stop_writing(); + DBUG_RETURN(-1); + } + DBUG_PRINT("info", ("File: '%s' handler: %d", path, file)); + DBUG_RETURN(file); +} + +/** + 
@brief Open (not create) log file with given number without cache + + @param file_no Number of the log we want to open + + retval -1 error + retval # file descriptor number +*/ + +static File open_logfile_by_number_no_cache(uint32 file_no) +{ + File file; + char path[FN_REFLEN]; + DBUG_ENTER("open_logfile_by_number_no_cache"); + + /* TODO: add O_DIRECT to open flags (when buffer is aligned) */ + /* TODO: use my_create() */ + if ((file= my_open(translog_filename_by_fileno(file_no, path), + log_descriptor.open_flags, + MYF(MY_WME))) < 0) + { + DBUG_PRINT("error", ("Error %d during opening file '%s'", errno, path)); + DBUG_RETURN(-1); + } + DBUG_PRINT("info", ("File: '%s' handler: %d", path, file)); + DBUG_RETURN(file); +} + + +/** + @brief get file descriptor by given number using cache + + @param file_no Number of the log we want to open + + retval # file descriptor + retval NULL file is not opened +*/ + +static TRANSLOG_FILE *get_logfile_by_number(uint32 file_no) +{ + TRANSLOG_FILE *file; + DBUG_ENTER("get_logfile_by_number"); + rw_rdlock(&log_descriptor.open_files_lock); + if (log_descriptor.max_file - file_no >= + log_descriptor.open_files.elements) + { + DBUG_PRINT("info", ("File #%u is not opened", file_no)); + rw_unlock(&log_descriptor.open_files_lock); + DBUG_RETURN(NULL); + } + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + DBUG_ASSERT(log_descriptor.max_file >= file_no); + DBUG_ASSERT(log_descriptor.min_file <= file_no); + + file= *dynamic_element(&log_descriptor.open_files, + log_descriptor.max_file - file_no, TRANSLOG_FILE **); + rw_unlock(&log_descriptor.open_files_lock); + DBUG_PRINT("info", ("File 0x%lx File no: %lu, File handler: %d", + (ulong)file, (ulong)file_no, + (file ? 
file->handler.file : -1))); + DBUG_ASSERT(!file || file->number == file_no); + DBUG_RETURN(file); +} + + +/** + @brief get current file descriptor + + retval # file descriptor +*/ + +static TRANSLOG_FILE *get_current_logfile() +{ + TRANSLOG_FILE *file; + DBUG_ENTER("get_current_logfile"); + rw_rdlock(&log_descriptor.open_files_lock); + DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu", + (ulong) log_descriptor.max_file, + (ulong) log_descriptor.min_file, + (ulong) log_descriptor.open_files.elements)); + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **); + rw_unlock(&log_descriptor.open_files_lock); + DBUG_RETURN(file); +} + +uchar NEAR maria_trans_file_magic[]= +{ (uchar) 254, (uchar) 254, (uchar) 11, '\001', 'M', 'A', 'R', 'I', 'A', + 'L', 'O', 'G' }; +#define LOG_HEADER_DATA_SIZE (sizeof(maria_trans_file_magic) + \ + 8 + 4 + 4 + 4 + 2 + 3 + \ + LSN_STORE_SIZE) + + +/* + Write log file page header in the just opened new log file + + SYNOPSIS + translog_write_file_header(); + + NOTES + First page is just a marker page; We don't store any real log data in it. 
+ + RETURN + 0 OK + 1 ERROR +*/ + +static my_bool translog_write_file_header() +{ + TRANSLOG_FILE *file; + ulonglong timestamp; + uchar page_buff[TRANSLOG_PAGE_SIZE], *page= page_buff; + my_bool rc; + DBUG_ENTER("translog_write_file_header"); + + /* file tag */ + memcpy(page, maria_trans_file_magic, sizeof(maria_trans_file_magic)); + page+= sizeof(maria_trans_file_magic); + /* timestamp */ + timestamp= my_getsystime(); + int8store(page, timestamp); + page+= 8; + /* maria version */ + int4store(page, TRANSLOG_VERSION_ID); + page+= 4; + /* mysql version (MYSQL_VERSION_ID) */ + int4store(page, log_descriptor.server_version); + page+= 4; + /* server ID */ + int4store(page, log_descriptor.server_id); + page+= 4; + /* loghandler page_size */ + int2store(page, TRANSLOG_PAGE_SIZE - 1); + page+= 2; + /* file number */ + int3store(page, LSN_FILE_NO(log_descriptor.horizon)); + page+= 3; + lsn_store(page, LSN_IMPOSSIBLE); + page+= LSN_STORE_SIZE; + memset(page, TRANSLOG_FILLER, sizeof(page_buff) - (page- page_buff)); + + file= get_current_logfile(); + rc= my_pwrite(file->handler.file, page_buff, sizeof(page_buff), 0, + log_write_flags) != 0; + /* + Dropping the flag in such way can make false alarm: signalling than the + file in not sync when it is sync, but the situation is quite rare and + protections with mutexes give much more overhead to the whole engine + */ + file->is_sync= 0; + DBUG_RETURN(rc); +} + +/* + @brief write the new LSN on the given file header + + @param file The file descriptor + @param lsn That LSN which should be written + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_max_lsn_to_header(File file, LSN lsn) +{ + uchar lsn_buff[LSN_STORE_SIZE]; + my_bool rc; + DBUG_ENTER("translog_max_lsn_to_header"); + DBUG_PRINT("enter", ("File descriptor: %ld " + "lsn: (%lu,0x%lx)", + (long) file, + LSN_IN_PARTS(lsn))); + + lsn_store(lsn_buff, lsn); + + rc= (my_pwrite(file, lsn_buff, + LSN_STORE_SIZE, + (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE), + 
log_write_flags) != 0 || + my_sync(file, MYF(MY_WME)) != 0); + /* + We should not increase counter in case of error above, but it is so + unlikely that we can ignore this case + */ + translog_syncs++; + DBUG_RETURN(rc); +} + + +/* + Information from transaction log file header +*/ + +typedef struct st_loghandler_file_info +{ + /* + LSN_IMPOSSIBLE for current file (not finished file). + Maximum LSN of the record which parts stored in the + file. + */ + LSN max_lsn; + ulonglong timestamp; /* Time stamp */ + ulong maria_version; /* Version of maria loghandler */ + ulong mysql_version; /* Version of mysql server */ + ulong server_id; /* Server ID */ + ulong page_size; /* Loghandler page size */ + ulong file_number; /* Number of the file (from the file header) */ +} LOGHANDLER_FILE_INFO; + +/* + @brief Extract hander file information from loghandler file page + + @param desc header information descriptor to be filled with information + @param page_buff buffer with the page content +*/ + +static void translog_interpret_file_header(LOGHANDLER_FILE_INFO *desc, + uchar *page_buff) +{ + uchar *ptr; + + ptr= page_buff + sizeof(maria_trans_file_magic); + desc->timestamp= uint8korr(ptr); + ptr+= 8; + desc->maria_version= uint4korr(ptr); + ptr+= 4; + desc->mysql_version= uint4korr(ptr); + ptr+= 4; + desc->server_id= uint4korr(ptr + 4); + ptr+= 4; + desc->page_size= uint2korr(ptr) + 1; + ptr+= 2; + desc->file_number= uint3korr(ptr); + ptr+=3; + desc->max_lsn= lsn_korr(ptr); +} + + +/* + @brief Read hander file information from loghandler file + + @param desc header information descriptor to be filled with information + @param file file descriptor to read + + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_read_file_header(LOGHANDLER_FILE_INFO *desc, File file) +{ + uchar page_buff[LOG_HEADER_DATA_SIZE]; + DBUG_ENTER("translog_read_file_header"); + + if (my_pread(file, page_buff, + sizeof(page_buff), 0, MYF(MY_FNABP | MY_WME))) + { + DBUG_PRINT("info", ("log read fail 
error: %d", my_errno)); + DBUG_RETURN(1); + } + translog_interpret_file_header(desc, page_buff); + DBUG_PRINT("info", ("timestamp: %llu aria ver: %lu mysql ver: %lu " + "server id %lu page size %lu file number %lu " + "max lsn: (%lu,0x%lx)", + (ulonglong) desc->timestamp, + (ulong) desc->maria_version, + (ulong) desc->mysql_version, + (ulong) desc->server_id, + desc->page_size, (ulong) desc->file_number, + LSN_IN_PARTS(desc->max_lsn))); + DBUG_RETURN(0); +} + + +/* + @brief set the lsn to the files from_file - to_file if it is greater + then written in the file + + @param from_file first file number (min) + @param to_file last file number (max) + @param lsn the lsn for writing + @param is_locked true if current thread locked the log handler + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_set_lsn_for_files(uint32 from_file, uint32 to_file, + LSN lsn, my_bool is_locked) +{ + uint32 file; + DBUG_ENTER("translog_set_lsn_for_files"); + DBUG_PRINT("enter", ("From: %lu to: %lu lsn: (%lu,0x%lx) locked: %d", + (ulong) from_file, (ulong) to_file, + LSN_IN_PARTS(lsn), + is_locked)); + DBUG_ASSERT(from_file <= to_file); + DBUG_ASSERT(from_file > 0); /* we have not file 0 */ + + /* Checks the current file (not finished yet file) */ + if (!is_locked) + translog_lock(); + if (to_file == (uint32) LSN_FILE_NO(log_descriptor.horizon)) + { + if (likely(cmp_translog_addr(lsn, log_descriptor.max_lsn) > 0)) + log_descriptor.max_lsn= lsn; + to_file--; + } + if (!is_locked) + translog_unlock(); + + /* Checks finished files if they are */ + pthread_mutex_lock(&log_descriptor.file_header_lock); + for (file= from_file; file <= to_file; file++) + { + LOGHANDLER_FILE_INFO info; + File fd; + LINT_INIT(info.max_lsn); + + fd= open_logfile_by_number_no_cache(file); + if ((fd < 0) || + ((translog_read_file_header(&info, fd) || + (cmp_translog_addr(lsn, info.max_lsn) > 0 && + translog_max_lsn_to_header(fd, lsn))) | + my_close(fd, MYF(MY_WME)))) + { + translog_stop_writing(); + 
DBUG_RETURN(1); + } + } + pthread_mutex_unlock(&log_descriptor.file_header_lock); + + DBUG_RETURN(0); +} + + +/* descriptor of file in unfinished_files */ +struct st_file_counter +{ + uint32 file; /* file number */ + uint32 counter; /* counter for started writes */ +}; + + +/* + @brief mark file "in progress" (for multi-group records) + + @param file log file number +*/ + +static void translog_mark_file_unfinished(uint32 file) +{ + int place, i; + struct st_file_counter fc, *fc_ptr; + + DBUG_ENTER("translog_mark_file_unfinished"); + DBUG_PRINT("enter", ("file: %lu", (ulong) file)); + + fc.file= file; fc.counter= 1; + pthread_mutex_lock(&log_descriptor.unfinished_files_lock); + + if (log_descriptor.unfinished_files.elements == 0) + { + insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc); + DBUG_PRINT("info", ("The first element inserted")); + goto end; + } + + for (place= log_descriptor.unfinished_files.elements - 1; + place >= 0; + place--) + { + fc_ptr= dynamic_element(&log_descriptor.unfinished_files, + place, struct st_file_counter *); + if (fc_ptr->file <= file) + break; + } + + if (place >= 0 && fc_ptr->file == file) + { + fc_ptr->counter++; + DBUG_PRINT("info", ("counter increased")); + goto end; + } + + if (place == (int)log_descriptor.unfinished_files.elements) + { + insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc); + DBUG_PRINT("info", ("The last element inserted")); + goto end; + } + /* shift and assign new element */ + insert_dynamic(&log_descriptor.unfinished_files, + (uchar*) + dynamic_element(&log_descriptor.unfinished_files, + log_descriptor.unfinished_files.elements- 1, + struct st_file_counter *)); + for(i= log_descriptor.unfinished_files.elements - 1; i > place; i--) + { + /* we do not use set_dynamic() to avoid unneeded checks */ + memcpy(dynamic_element(&log_descriptor.unfinished_files, + i, struct st_file_counter *), + dynamic_element(&log_descriptor.unfinished_files, + i + 1, struct st_file_counter *), + sizeof(struct 
st_file_counter)); + } + memcpy(dynamic_element(&log_descriptor.unfinished_files, + place + 1, struct st_file_counter *), + &fc, sizeof(struct st_file_counter)); +end: + pthread_mutex_unlock(&log_descriptor.unfinished_files_lock); + DBUG_VOID_RETURN; +} + + +/* + @brief remove file mark "in progress" (for multi-group records) + + @param file log file number +*/ + +static void translog_mark_file_finished(uint32 file) +{ + int i; + struct st_file_counter *fc_ptr; + DBUG_ENTER("translog_mark_file_finished"); + DBUG_PRINT("enter", ("file: %lu", (ulong) file)); + + LINT_INIT(fc_ptr); + + pthread_mutex_lock(&log_descriptor.unfinished_files_lock); + + DBUG_ASSERT(log_descriptor.unfinished_files.elements > 0); + for (i= 0; + i < (int) log_descriptor.unfinished_files.elements; + i++) + { + fc_ptr= dynamic_element(&log_descriptor.unfinished_files, + i, struct st_file_counter *); + if (fc_ptr->file == file) + { + break; + } + } + DBUG_ASSERT(i < (int) log_descriptor.unfinished_files.elements); + + if (! 
--fc_ptr->counter) + delete_dynamic_element(&log_descriptor.unfinished_files, i); + pthread_mutex_unlock(&log_descriptor.unfinished_files_lock); + DBUG_VOID_RETURN; +} + + +/* + @brief get max LSN of the record which parts stored in this file + + @param file file number + + @return requested LSN or LSN_IMPOSSIBLE/LSN_ERROR + @retval LSN_IMPOSSIBLE File is still not finished + @retval LSN_ERROR Error opening file + @retval # LSN of the record which parts stored in this file +*/ + +LSN translog_get_file_max_lsn_stored(uint32 file) +{ + uint32 limit= FILENO_IMPOSSIBLE; + DBUG_ENTER("translog_get_file_max_lsn_stored"); + DBUG_PRINT("enter", ("file: %lu", (ulong)file)); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + pthread_mutex_lock(&log_descriptor.unfinished_files_lock); + + /* find file with minimum file number "in progress" */ + if (log_descriptor.unfinished_files.elements > 0) + { + struct st_file_counter *fc_ptr; + fc_ptr= dynamic_element(&log_descriptor.unfinished_files, + 0, struct st_file_counter *); + limit= fc_ptr->file; /* minimal file number "in progress" */ + } + pthread_mutex_unlock(&log_descriptor.unfinished_files_lock); + + /* + if there is no "in progress file" then unfinished file is in progress + for sure + */ + if (limit == FILENO_IMPOSSIBLE) + { + TRANSLOG_ADDRESS horizon= translog_get_horizon(); + limit= LSN_FILE_NO(horizon); + } + + if (file >= limit) + { + DBUG_PRINT("info", ("The file in in progress")); + DBUG_RETURN(LSN_IMPOSSIBLE); + } + + { + LOGHANDLER_FILE_INFO info; + File fd; + LINT_INIT_STRUCT(info); + fd= open_logfile_by_number_no_cache(file); + if ((fd < 0) || + (translog_read_file_header(&info, fd) | my_close(fd, MYF(MY_WME)))) + { + DBUG_PRINT("error", ("Can't read file header")); + DBUG_RETURN(LSN_ERROR); + } + DBUG_PRINT("info", ("Max lsn: (%lu,0x%lx)", + LSN_IN_PARTS(info.max_lsn))); + DBUG_RETURN(info.max_lsn); + } +} + +/* + Initialize transaction log file buffer + + SYNOPSIS + 
translog_buffer_init() + buffer The buffer to initialize + num Number of this buffer + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num) +{ + DBUG_ENTER("translog_buffer_init"); + buffer->pre_force_close_horizon= + buffer->prev_last_lsn= buffer->last_lsn= + LSN_IMPOSSIBLE; + DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx", + (ulong) buffer)); + + buffer->buffer_no= (uint8) num; + /* This Buffer File */ + buffer->file= NULL; + buffer->overlay= 0; + /* cache for current log */ + memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER); + /* Buffer size */ + buffer->size= 0; + buffer->skipped_data= 0; + /* cond of thread which is waiting for buffer filling */ + if (pthread_cond_init(&buffer->waiting_filling_buffer, 0)) + DBUG_RETURN(1); + /* Number of records which are in copy progress */ + buffer->copy_to_buffer_in_progress= 0; + /* list of waiting buffer ready threads */ + buffer->waiting_flush= 0; + /* + Buffers locked by fallowing mutex. As far as buffers create logical + circle (after last buffer goes first) it trigger false alarm of deadlock + detect system, so we remove check of deadlock for this buffers. In deed + all mutex locks concentrated around current buffer except flushing + thread (but it is only one thread). One thread can't take more then + 2 buffer locks at once. So deadlock is impossible here. + + To prevent false alarm of dead lock detection we switch dead lock + detection for one buffer in the middle of the buffers chain. Excluding + only one of eight buffers from deadlock detection hardly can hide other + possible problems which include this mutexes. + */ + if (my_pthread_mutex_init(&buffer->mutex, MY_MUTEX_INIT_FAST, + "translog_buffer->mutex", + (num == TRANSLOG_BUFFERS_NO - 2 ? 
+ MYF_NO_DEADLOCK_DETECTION : 0)) || + pthread_cond_init(&buffer->prev_sent_to_disk_cond, 0)) + DBUG_RETURN(1); + buffer->is_closing_buffer= 0; + buffer->prev_sent_to_disk= LSN_IMPOSSIBLE; + buffer->prev_buffer_offset= LSN_IMPOSSIBLE; + buffer->ver= 0; + DBUG_RETURN(0); +} + + +/* + @brief close transaction log file by descriptor + + @param file pagegecache file descriptor reference + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_close_log_file(TRANSLOG_FILE *file) +{ + int rc= 0; + flush_pagecache_blocks(log_descriptor.pagecache, &file->handler, + FLUSH_RELEASE); + /* + Sync file when we close it + TODO: sync only we have changed the log + */ + if (!file->is_sync) + { + rc= my_sync(file->handler.file, MYF(MY_WME)); + translog_syncs++; + } + rc|= my_close(file->handler.file, MYF(MY_WME)); + my_free(file, MYF(0)); + return test(rc); +} + + +/** + @brief Dummy function for write failure (the log to not use + pagecache writing) +*/ + +void translog_dummy_write_failure(uchar *data __attribute__((unused))) +{ + return; +} + + +/** + @brief Initializes TRANSLOG_FILE structure + + @param file reference on the file to initialize + @param number file number + @param is_sync is file synced on disk +*/ + +static void translog_file_init(TRANSLOG_FILE *file, uint32 number, + my_bool is_sync) +{ + pagecache_file_init(file->handler, &translog_page_validator, + &translog_dummy_callback, + &translog_dummy_write_failure, + maria_flush_log_for_page_none, file); + file->number= number; + file->was_recovered= 0; + file->is_sync= is_sync; +} + + +/** + @brief Create and fill header of new file. 
+ + @note the caller must call it right after it has increased + log_descriptor.horizon to the new file + (log_descriptor.horizon+= LSN_ONE_FILE) + + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_create_new_file() +{ + TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(sizeof(TRANSLOG_FILE), + MYF(0)); + + TRANSLOG_FILE *old= get_current_logfile(); + uint32 file_no= LSN_FILE_NO(log_descriptor.horizon); + DBUG_ENTER("translog_create_new_file"); + + if (file == NULL) + goto error; + + /* + Writes max_lsn to the file header before finishing it (there is no need + to lock file header buffer because it is still unfinished file, so only + one thread can finish the file and nobody interested of LSN of current + (unfinished) file, because no one can purge it). + */ + if (translog_max_lsn_to_header(old->handler.file, log_descriptor.max_lsn)) + goto error; + + rw_wrlock(&log_descriptor.open_files_lock); + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + DBUG_ASSERT(file_no == log_descriptor.max_file + 1); + if (allocate_dynamic(&log_descriptor.open_files, + log_descriptor.max_file - log_descriptor.min_file + 2)) + goto error_lock; + if ((file->handler.file= + create_logfile_by_number_no_cache(file_no)) == -1) + goto error_lock; + translog_file_init(file, file_no, 0); + + /* this call just expand the array */ + insert_dynamic(&log_descriptor.open_files, (uchar*)&file); + log_descriptor.max_file++; + { + char *start= (char*) dynamic_element(&log_descriptor.open_files, 0, + TRANSLOG_FILE**); + memmove(start + sizeof(TRANSLOG_FILE*), start, + sizeof(TRANSLOG_FILE*) * + (log_descriptor.max_file - log_descriptor.min_file + 1 - 1)); + } + /* can't fail we because we expanded array */ + set_dynamic(&log_descriptor.open_files, (uchar*)&file, 0); + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + rw_unlock(&log_descriptor.open_files_lock); + + 
DBUG_PRINT("info", ("file_no: %lu", (ulong)file_no)); + + if (translog_write_file_header()) + DBUG_RETURN(1); + + if (ma_control_file_write_and_force(last_checkpoint_lsn, file_no, + max_trid_in_control_file, + recovery_failures)) + { + translog_stop_writing(); + DBUG_RETURN(1); + } + + DBUG_RETURN(0); + +error_lock: + rw_unlock(&log_descriptor.open_files_lock); +error: + translog_stop_writing(); + DBUG_RETURN(1); +} + + +/** + @brief Locks the loghandler buffer. + + @param buffer This buffer which should be locked + + @note See comment before buffer 'mutex' variable. + + @retval 0 OK + @retval 1 Error +*/ + +static void translog_buffer_lock(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_lock"); + DBUG_PRINT("enter", + ("Lock buffer #%u: (0x%lx)", (uint) buffer->buffer_no, + (ulong) buffer)); + pthread_mutex_lock(&buffer->mutex); + DBUG_VOID_RETURN; +} + + +/* + Unlock the loghandler buffer + + SYNOPSIS + translog_buffer_unlock() + buffer This buffer which should be unlocked + + RETURN + 0 OK + 1 Error +*/ + +static void translog_buffer_unlock(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_unlock"); + DBUG_PRINT("enter", ("Unlock buffer... 
#%u (0x%lx)", + (uint) buffer->buffer_no, (ulong) buffer)); + + pthread_mutex_unlock(&buffer->mutex); + DBUG_VOID_RETURN; +} + + +/* + Write a header on the page + + SYNOPSIS + translog_new_page_header() + horizon Where to write the page + cursor Where to write the page + + NOTE + - space for page header should be checked before +*/ + +static uchar translog_sector_random; + +static void translog_new_page_header(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor) +{ + uchar *ptr; + + DBUG_ENTER("translog_new_page_header"); + DBUG_ASSERT(cursor->ptr); + + cursor->protected= 0; + + ptr= cursor->ptr; + /* Page number */ + int3store(ptr, LSN_OFFSET(*horizon) / TRANSLOG_PAGE_SIZE); + ptr+= 3; + /* File number */ + int3store(ptr, LSN_FILE_NO(*horizon)); + ptr+= 3; + DBUG_ASSERT(TRANSLOG_PAGE_FLAGS == (ptr - cursor->ptr)); + cursor->ptr[TRANSLOG_PAGE_FLAGS]= (uchar) log_descriptor.flags; + ptr++; + if (log_descriptor.flags & TRANSLOG_PAGE_CRC) + { +#ifndef DBUG_OFF + DBUG_PRINT("info", ("write 0x11223344 CRC to (%lu,0x%lx)", + LSN_IN_PARTS(*horizon))); + /* This will be overwritten by real CRC; This is just for debugging */ + int4store(ptr, 0x11223344); +#endif + /* CRC will be put when page is finished */ + ptr+= CRC_SIZE; + } + if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION) + { + /* + translog_sector_randmo works like "random" values producer because + it is enough to have such "random" for this purpose and it will + not interfere with higher level pseudo random value generator + */ + ptr[0]= translog_sector_random++; + ptr+= TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + } + { + uint len= (ptr - cursor->ptr); + (*horizon)+= len; /* increasing the offset part of the address */ + cursor->current_page_fill= len; + if (!cursor->chaser) + cursor->buffer->size+= len; + } + cursor->ptr= ptr; + DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx chaser: %d Size: %lu (%lu) " + "Horizon: (%lu,0x%lx)", + (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer, + 
cursor->chaser, (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer), + LSN_IN_PARTS(*horizon))); + translog_check_cursor(cursor); + DBUG_VOID_RETURN; +} + + +/* + Put sector protection on the page image + + SYNOPSIS + translog_put_sector_protection() + page reference on the page content + cursor cursor of the buffer + + NOTES + We put a sector protection on all following sectors on the page, + except the first sector that is protected by page header. +*/ + +static void translog_put_sector_protection(uchar *page, + struct st_buffer_cursor *cursor) +{ + uchar *table= page + log_descriptor.page_overhead - + TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + uint i, offset; + uint16 last_protected_sector= ((cursor->previous_offset - 1) / + DISK_DRIVE_SECTOR_SIZE); + uint16 start_sector= cursor->previous_offset / DISK_DRIVE_SECTOR_SIZE; + uint8 value= table[0] + cursor->write_counter; + DBUG_ENTER("translog_put_sector_protection"); + + if (start_sector == 0) + { + /* First sector is protected by file & page numbers in the page header. 
*/ + start_sector= 1; + } + + DBUG_PRINT("enter", ("Write counter:%u value:%u offset:%u, " + "last protected:%u start sector:%u", + (uint) cursor->write_counter, + (uint) value, + (uint) cursor->previous_offset, + (uint) last_protected_sector, (uint) start_sector)); + if (last_protected_sector == start_sector) + { + i= last_protected_sector; + offset= last_protected_sector * DISK_DRIVE_SECTOR_SIZE; + /* restore data, because we modified sector which was protected */ + if (offset < cursor->previous_offset) + page[offset]= table[i]; + } + for (i= start_sector, offset= start_sector * DISK_DRIVE_SECTOR_SIZE; + i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + i++, (offset+= DISK_DRIVE_SECTOR_SIZE)) + { + DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x", + i, offset, (uint) page[offset])); + table[i]= page[offset]; + page[offset]= value; + DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x", + i, offset, (uint) page[offset])); + } + DBUG_VOID_RETURN; +} + + +/* + Calculate CRC32 of given area + + SYNOPSIS + translog_crc() + area Pointer of the area beginning + length The Area length + + RETURN + CRC32 +*/ + +static uint32 translog_crc(uchar *area, uint length) +{ + DBUG_ENTER("translog_crc"); + DBUG_RETURN(crc32(0L, (unsigned char*) area, length)); +} + + +/* + Finish current page with zeros + + SYNOPSIS + translog_finish_page() + horizon \ horizon & buffer pointers + cursor / +*/ + +static void translog_finish_page(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor) +{ + uint16 left= TRANSLOG_PAGE_SIZE - cursor->current_page_fill; + uchar *page= cursor->ptr - cursor->current_page_fill; + DBUG_ENTER("translog_finish_page"); + DBUG_PRINT("enter", ("Buffer: #%u 0x%lx " + "Buffer addr: (%lu,0x%lx) " + "Page addr: (%lu,0x%lx) " + "size:%lu (%lu) Pg:%u left:%u", + (uint) cursor->buffer_no, (ulong) cursor->buffer, + LSN_IN_PARTS(cursor->buffer->offset), + (ulong) LSN_FILE_NO(*horizon), + (ulong) (LSN_OFFSET(*horizon) - + cursor->current_page_fill), + (ulong) 
cursor->buffer->size, + (ulong) (cursor->ptr -cursor->buffer->buffer), + (uint) cursor->current_page_fill, (uint) left)); + DBUG_ASSERT(LSN_FILE_NO(*horizon) == LSN_FILE_NO(cursor->buffer->offset)); + translog_check_cursor(cursor); + if (cursor->protected) + { + DBUG_PRINT("info", ("Already protected and finished")); + DBUG_VOID_RETURN; + } + cursor->protected= 1; + + DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE); + if (left != 0) + { + DBUG_PRINT("info", ("left: %u", (uint) left)); + memset(cursor->ptr, TRANSLOG_FILLER, left); + cursor->ptr+= left; + (*horizon)+= left; /* offset increasing */ + if (!cursor->chaser) + cursor->buffer->size+= left; + /* We are finishing the page so reset the counter */ + cursor->current_page_fill= 0; + DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx " + "chaser: %d Size: %lu (%lu)", + (uint) cursor->buffer->buffer_no, + (ulong) cursor->buffer, cursor->chaser, + (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + translog_check_cursor(cursor); + } + /* + When we are finishing the page other thread might not finish the page + header yet (in case if we started from the middle of the page) so we + have to read log_descriptor.flags but not the flags from the page. + */ + if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION) + { + translog_put_sector_protection(page, cursor); + DBUG_PRINT("info", ("drop write_counter")); + cursor->write_counter= 0; + cursor->previous_offset= 0; + } + if (log_descriptor.flags & TRANSLOG_PAGE_CRC) + { + uint32 crc= translog_crc(page + log_descriptor.page_overhead, + TRANSLOG_PAGE_SIZE - + log_descriptor.page_overhead); + DBUG_PRINT("info", ("CRC: %lx", (ulong) crc)); + /* We have page number, file number and flag before crc */ + int4store(page + 3 + 3 + 1, crc); + } + DBUG_VOID_RETURN; +} + + +/* + @brief Wait until all threads have finished closing this buffer. 
+ + @param buffer This buffer should be check +*/ + +static void translog_wait_for_closing(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_wait_for_closing"); + DBUG_PRINT("enter", ("Buffer #%u 0x%lx copies in progress: %u " + "is closing %u File: %d size: %lu", + (uint) buffer->buffer_no, (ulong) buffer, + (uint) buffer->copy_to_buffer_in_progress, + (uint) buffer->is_closing_buffer, + (buffer->file ? buffer->file->handler.file : -1), + (ulong) buffer->size)); + translog_buffer_lock_assert_owner(buffer); + + while (buffer->is_closing_buffer) + { + DBUG_PRINT("info", ("wait for writers... buffer: #%u 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer)); + DBUG_ASSERT(buffer->file != NULL); + pthread_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex); + DBUG_PRINT("info", ("wait for writers done buffer: #%u 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer)); + } + + DBUG_VOID_RETURN; +} + + +/* + @brief Wait until all threads have finished filling this buffer. + + @param buffer This buffer should be check +*/ + +static void translog_wait_for_writers(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_wait_for_writers"); + DBUG_PRINT("enter", ("Buffer #%u 0x%lx copies in progress: %u " + "is closing %u File: %d size: %lu", + (uint) buffer->buffer_no, (ulong) buffer, + (uint) buffer->copy_to_buffer_in_progress, + (uint) buffer->is_closing_buffer, + (buffer->file ? buffer->file->handler.file : -1), + (ulong) buffer->size)); + translog_buffer_lock_assert_owner(buffer); + + while (buffer->copy_to_buffer_in_progress) + { + DBUG_PRINT("info", ("wait for writers... 
buffer: #%u 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer)); + DBUG_ASSERT(buffer->file != NULL); + pthread_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex); + DBUG_PRINT("info", ("wait for writers done buffer: #%u 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer)); + } + + DBUG_VOID_RETURN; +} + + +/* + + Wait for buffer to become free + + SYNOPSIS + translog_wait_for_buffer_free() + buffer The buffer we are waiting for + + NOTE + - this buffer should be locked +*/ + +static void translog_wait_for_buffer_free(struct st_translog_buffer *buffer) +{ + TRANSLOG_ADDRESS offset= buffer->offset; + TRANSLOG_FILE *file= buffer->file; + uint8 ver= buffer->ver; + DBUG_ENTER("translog_wait_for_buffer_free"); + DBUG_PRINT("enter", ("Buffer #%u 0x%lx copies in progress: %u " + "is closing %u File: %d size: %lu", + (uint) buffer->buffer_no, (ulong) buffer, + (uint) buffer->copy_to_buffer_in_progress, + (uint) buffer->is_closing_buffer, + (buffer->file ? buffer->file->handler.file : -1), + (ulong) buffer->size)); + + translog_wait_for_writers(buffer); + + if (offset != buffer->offset || file != buffer->file || ver != buffer->ver) + DBUG_VOID_RETURN; /* the buffer if already freed */ + + while (buffer->file != NULL) + { + DBUG_PRINT("info", ("wait for writers... buffer: #%u 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer)); + pthread_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex); + DBUG_PRINT("info", ("wait for writers done. 
buffer: #%u 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer)); + } + DBUG_ASSERT(buffer->copy_to_buffer_in_progress == 0); + DBUG_VOID_RETURN; +} + + +/* + Initialize the cursor for a buffer + + SYNOPSIS + translog_cursor_init() + buffer The buffer + cursor It's cursor + buffer_no Number of buffer +*/ + +static void translog_cursor_init(struct st_buffer_cursor *cursor, + struct st_translog_buffer *buffer, + uint8 buffer_no) +{ + DBUG_ENTER("translog_cursor_init"); + cursor->ptr= buffer->buffer; + cursor->buffer= buffer; + cursor->buffer_no= buffer_no; + cursor->current_page_fill= 0; + cursor->chaser= (cursor != &log_descriptor.bc); + cursor->write_counter= 0; + cursor->previous_offset= 0; + cursor->protected= 0; + DBUG_VOID_RETURN; +} + + +/* + @brief Initialize buffer for the current file, and a cursor for this buffer. + + @param buffer The buffer + @param cursor It's cursor + @param buffer_no Number of buffer +*/ + +static void translog_start_buffer(struct st_translog_buffer *buffer, + struct st_buffer_cursor *cursor, + uint buffer_no) +{ + DBUG_ENTER("translog_start_buffer"); + DBUG_PRINT("enter", + ("Assign buffer: #%u (0x%lx) offset: 0x%lx(%lu)", + (uint) buffer->buffer_no, (ulong) buffer, + (ulong) LSN_OFFSET(log_descriptor.horizon), + (ulong) LSN_OFFSET(log_descriptor.horizon))); + DBUG_ASSERT(buffer_no == buffer->buffer_no); + buffer->pre_force_close_horizon= + buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE; + DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx", + (ulong) buffer)); + buffer->offset= log_descriptor.horizon; + buffer->next_buffer_offset= LSN_IMPOSSIBLE; + buffer->file= get_current_logfile(); + buffer->overlay= 0; + buffer->size= 0; + buffer->skipped_data= 0; + translog_cursor_init(cursor, buffer, buffer_no); + DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: 0x%lx " + "chaser: %d Size: %lu (%lu)", + (long) (buffer->file ? buffer->file->number : 0), + (buffer->file ? 
buffer->file->handler.file : -1), + (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer, + cursor->chaser, (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + translog_check_cursor(cursor); + pthread_mutex_lock(&log_descriptor.dirty_buffer_mask_lock); + log_descriptor.dirty_buffer_mask|= (1 << buffer->buffer_no); + pthread_mutex_unlock(&log_descriptor.dirty_buffer_mask_lock); + + DBUG_VOID_RETURN; +} + + +/* + @brief Switch to the next buffer in a chain. + + @param horizon \ Pointers on current position in file and buffer + @param cursor / + @param new_file Also start new file + + @note + - loghandler should be locked + - after return new and old buffer still are locked + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_buffer_next(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor, + my_bool new_file) +{ + uint old_buffer_no= cursor->buffer_no; + uint new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO; + struct st_translog_buffer *new_buffer= log_descriptor.buffers + new_buffer_no; + my_bool chasing= cursor->chaser; + DBUG_ENTER("translog_buffer_next"); + + DBUG_PRINT("info", ("horizon: (%lu,0x%lx) chasing: %d", + LSN_IN_PARTS(log_descriptor.horizon), chasing)); + + DBUG_ASSERT(cmp_translog_addr(log_descriptor.horizon, *horizon) >= 0); + + translog_finish_page(horizon, cursor); + + if (!chasing) + { + translog_buffer_lock(new_buffer); +#ifndef DBUG_OFF + { + TRANSLOG_ADDRESS offset= new_buffer->offset; + TRANSLOG_FILE *file= new_buffer->file; + uint8 ver= new_buffer->ver; + translog_lock_assert_owner(); +#endif + translog_wait_for_buffer_free(new_buffer); +#ifndef DBUG_OFF + /* We keep the handler locked so nobody can start this new buffer */ + DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL && + (file == NULL ? 
ver : (uint8)(ver + 1)) == new_buffer->ver); + } +#endif + } + else + DBUG_ASSERT(new_buffer->file != NULL); + + if (new_file) + { + /* move the horizon to the next file and its header page */ + (*horizon)+= LSN_ONE_FILE; + (*horizon)= LSN_REPLACE_OFFSET(*horizon, TRANSLOG_PAGE_SIZE); + if (!chasing && translog_create_new_file()) + { + DBUG_RETURN(1); + } + } + + /* prepare next page */ + if (chasing) + translog_cursor_init(cursor, new_buffer, new_buffer_no); + else + { + translog_lock_assert_owner(); + translog_start_buffer(new_buffer, cursor, new_buffer_no); + new_buffer->prev_buffer_offset= + log_descriptor.buffers[old_buffer_no].offset; + new_buffer->prev_last_lsn= + BUFFER_MAX_LSN(log_descriptor.buffers + old_buffer_no); + } + log_descriptor.buffers[old_buffer_no].next_buffer_offset= new_buffer->offset; + DBUG_PRINT("info", ("prev_last_lsn set to (%lu,0x%lx) buffer: 0x%lx", + LSN_IN_PARTS(new_buffer->prev_last_lsn), + (ulong) new_buffer)); + translog_new_page_header(horizon, cursor); + DBUG_RETURN(0); +} + + +/* + Sets max LSN sent to file, and address from which data is only in the buffer + + SYNOPSIS + translog_set_sent_to_disk() + buffer buffer which we have sent to disk + + TODO: use atomic operations if possible (64bit architectures?) 
+*/
+
+static void translog_set_sent_to_disk(struct st_translog_buffer *buffer)
+{
+  LSN lsn= buffer->last_lsn;
+  TRANSLOG_ADDRESS in_buffers= buffer->next_buffer_offset;
+
+  DBUG_ENTER("translog_set_sent_to_disk");
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  /* Values are passed in the same order as the labels of the format */
+  DBUG_PRINT("enter", ("lsn: (%lu,0x%lx) in_buffers: (%lu,0x%lx) "
+                       "in_buffers_only: (%lu,0x%lx) start: (%lu,0x%lx) "
+                       "sent_to_disk: (%lu,0x%lx)",
+                       LSN_IN_PARTS(lsn),
+                       LSN_IN_PARTS(in_buffers),
+                       LSN_IN_PARTS(log_descriptor.in_buffers_only),
+                       LSN_IN_PARTS(log_descriptor.log_start),
+                       LSN_IN_PARTS(log_descriptor.sent_to_disk)));
+  /*
+    We write sequentially (first part of following assert) but we rewrite
+    the same page in case we started mysql and shut it down immediately
+    (second part of the following assert)
+  */
+  DBUG_ASSERT(cmp_translog_addr(lsn, log_descriptor.sent_to_disk) >= 0 ||
+              cmp_translog_addr(lsn, log_descriptor.log_start) < 0);
+  log_descriptor.sent_to_disk= lsn;
+  /* LSN_IMPOSSIBLE == 0 => it will work for very first time */
+  if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0)
+  {
+    log_descriptor.in_buffers_only= in_buffers;
+    DBUG_PRINT("info", ("set new in_buffers_only"));
+  }
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Sets address from which data is only in the buffer
+
+  SYNOPSIS
+    translog_set_only_in_buffers()
+    in_buffers           to assign to in_buffers_only
+
+  NOTE
+    - in_buffers_only is changed only if in_buffers is bigger than the
+      current value and translog_status is TRANSLOG_OK
+    - log_descriptor.sent_to_disk_lock is taken for the duration of the
+      call and is released on every path
+*/
+
+static void translog_set_only_in_buffers(TRANSLOG_ADDRESS in_buffers)
+{
+  DBUG_ENTER("translog_set_only_in_buffers");
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  DBUG_PRINT("enter", ("in_buffers: (%lu,0x%lx) "
+                       "in_buffers_only: (%lu,0x%lx)",
+                       LSN_IN_PARTS(in_buffers),
+                       LSN_IN_PARTS(log_descriptor.in_buffers_only)));
+  /*
+    LSN_IMPOSSIBLE == 0 => it will work for very first time.
+    The status test is part of the condition (instead of an early return)
+    so that the mutex taken above is always released below.
+  */
+  if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0 &&
+      translog_status == TRANSLOG_OK)
+  {
+    log_descriptor.in_buffers_only= in_buffers;
+    DBUG_PRINT("info", ("set new in_buffers_only"));
+  }
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Gets address from which data is only in the buffer
+
+  SYNOPSIS
+    translog_only_in_buffers()
+
+  RETURN
+    address from which data is only in the buffer
+*/
+
+static TRANSLOG_ADDRESS translog_only_in_buffers()
+{
+  register TRANSLOG_ADDRESS addr;
+  DBUG_ENTER("translog_only_in_buffers");
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  addr= log_descriptor.in_buffers_only;
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+  DBUG_RETURN(addr);
+}
+
+
+/*
+  Get max LSN sent to file
+
+  SYNOPSIS
+    translog_get_sent_to_disk()
+
+  RETURN
+    max LSN send to file
+*/
+
+static LSN translog_get_sent_to_disk()
+{
+  register LSN lsn;
+  DBUG_ENTER("translog_get_sent_to_disk");
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  lsn= log_descriptor.sent_to_disk;
+  DBUG_PRINT("info", ("sent to disk up to (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+  DBUG_RETURN(lsn);
+}
+
+
+/*
+  Get first chunk address on the given page
+
+  SYNOPSIS
+    translog_get_first_chunk_offset()
+    page                 The page where to find first chunk
+
+  RETURN
+    first chunk offset
+*/
+
+static my_bool translog_get_first_chunk_offset(uchar *page)
+{
+  DBUG_ENTER("translog_get_first_chunk_offset");
+  DBUG_ASSERT(page[TRANSLOG_PAGE_FLAGS] < TRANSLOG_FLAGS_NUM);
+  DBUG_RETURN(page_overhead[page[TRANSLOG_PAGE_FLAGS]]);
+}
+
+
+/*
+  Write coded length of record
+
+  SYNOPSIS
+    translog_write_variable_record_1group_code_len
+    dst                  Destination buffer pointer
+    length               Length which should be coded
+    header_len           Calculated total header length
+*/
+
+static void
+translog_write_variable_record_1group_code_len(uchar *dst,
+                                               translog_size_t length,
+                                               uint16 header_len)
+{
+  switch (header_len) {
+  case 6:                                      /* (5 + 1) */
+    DBUG_ASSERT(length <= 250);
+
*dst= (uint8) length; + return; + case 8: /* (5 + 3) */ + DBUG_ASSERT(length <= 0xFFFF); + *dst= 251; + int2store(dst + 1, length); + return; + case 9: /* (5 + 4) */ + DBUG_ASSERT(length <= (ulong) 0xFFFFFF); + *dst= 252; + int3store(dst + 1, length); + return; + case 10: /* (5 + 5) */ + *dst= 253; + int4store(dst + 1, length); + return; + default: + DBUG_ASSERT(0); + } + return; +} + + +/* + Decode record data length and advance given pointer to the next field + + SYNOPSIS + translog_variable_record_1group_decode_len() + src The pointer to the pointer to the length beginning + + RETURN + decoded length +*/ + +static translog_size_t translog_variable_record_1group_decode_len(uchar **src) +{ + uint8 first= (uint8) (**src); + switch (first) { + case 251: + (*src)+= 3; + return (uint2korr((*src) - 2)); + case 252: + (*src)+= 4; + return (uint3korr((*src) - 3)); + case 253: + (*src)+= 5; + return (uint4korr((*src) - 4)); + case 254: + case 255: + DBUG_ASSERT(0); /* reserved for future use */ + return (0); + default: + (*src)++; + return (first); + } +} + + +/* + Get total length of this chunk (not only body) + + SYNOPSIS + translog_get_total_chunk_length() + page The page where chunk placed + offset Offset of the chunk on this place + + RETURN + total length of the chunk +*/ + +static uint16 translog_get_total_chunk_length(uchar *page, uint16 offset) +{ + DBUG_ENTER("translog_get_total_chunk_length"); + switch (page[offset] & TRANSLOG_CHUNK_TYPE) { + case TRANSLOG_CHUNK_LSN: + { + /* 0 chunk referred as LSN (head or tail) */ + translog_size_t rec_len; + uchar *start= page + offset; + uchar *ptr= start + 1 + 2; /* chunk type and short trid */ + uint16 chunk_len, header_len, page_rest; + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN")); + rec_len= translog_variable_record_1group_decode_len(&ptr); + chunk_len= uint2korr(ptr); + header_len= (uint16) (ptr -start) + 2; + DBUG_PRINT("info", ("rec len: %lu chunk len: %u header len: %u", + (ulong) rec_len, (uint) chunk_len, (uint) 
header_len)); + if (chunk_len) + { + DBUG_PRINT("info", ("chunk len: %u + %u = %u", + (uint) header_len, (uint) chunk_len, + (uint) (chunk_len + header_len))); + DBUG_RETURN(chunk_len + header_len); + } + page_rest= TRANSLOG_PAGE_SIZE - offset; + DBUG_PRINT("info", ("page_rest %u", (uint) page_rest)); + if (rec_len + header_len < page_rest) + DBUG_RETURN(rec_len + header_len); + DBUG_RETURN(page_rest); + } + case TRANSLOG_CHUNK_FIXED: + { + uchar *ptr; + uint type= page[offset] & TRANSLOG_REC_TYPE; + uint length; + int i; + /* 1 (pseudo)fixed record (also LSN) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED")); + DBUG_ASSERT(log_record_type_descriptor[type].rclass == + LOGRECTYPE_FIXEDLENGTH || + log_record_type_descriptor[type].rclass == + LOGRECTYPE_PSEUDOFIXEDLENGTH); + if (log_record_type_descriptor[type].rclass == LOGRECTYPE_FIXEDLENGTH) + { + DBUG_PRINT("info", + ("Fixed length: %u", + (uint) (log_record_type_descriptor[type].fixed_length + 3))); + DBUG_RETURN(log_record_type_descriptor[type].fixed_length + 3); + } + + ptr= page + offset + 3; /* first compressed LSN */ + length= log_record_type_descriptor[type].fixed_length + 3; + for (i= 0; i < log_record_type_descriptor[type].compressed_LSN; i++) + { + /* first 2 bits is length - 2 */ + uint len= (((uint8) (*ptr)) >> 6) + 2; + if (ptr[0] == 0 && ((uint8) ptr[1]) == 1) + len+= LSN_STORE_SIZE; /* case of full LSN storing */ + ptr+= len; + /* subtract saved bytes */ + length-= (LSN_STORE_SIZE - len); + } + DBUG_PRINT("info", ("Pseudo-fixed length: %u", length)); + DBUG_RETURN(length); + } + case TRANSLOG_CHUNK_NOHDR: + /* 2 no header chunk (till page end) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR length: %u", + (uint) (TRANSLOG_PAGE_SIZE - offset))); + DBUG_RETURN(TRANSLOG_PAGE_SIZE - offset); + case TRANSLOG_CHUNK_LNGTH: /* 3 chunk with chunk length */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH")); + DBUG_ASSERT(TRANSLOG_PAGE_SIZE - offset >= 3); + DBUG_PRINT("info", ("length: %u", uint2korr(page + 
offset + 1) + 3)); + DBUG_RETURN(uint2korr(page + offset + 1) + 3); + default: + DBUG_ASSERT(0); + DBUG_RETURN(0); + } +} + +/* + @brief Waits previous buffer flush finish + + @param buffer buffer for check + + @retval 0 previous buffer flushed and this thread have to flush this one + @retval 1 previous buffer flushed and this buffer flushed by other thread too +*/ + +my_bool translog_prev_buffer_flush_wait(struct st_translog_buffer *buffer) +{ + TRANSLOG_ADDRESS offset= buffer->offset; + TRANSLOG_FILE *file= buffer->file; + uint8 ver= buffer->ver; + DBUG_ENTER("translog_prev_buffer_flush_wait"); + DBUG_PRINT("enter", ("buffer: 0x%lx #%u offset: (%lu,0x%lx) " + "prev sent: (%lu,0x%lx) prev offset: (%lu,0x%lx)", + (ulong) buffer, (uint) buffer->buffer_no, + LSN_IN_PARTS(buffer->offset), + LSN_IN_PARTS(buffer->prev_sent_to_disk), + LSN_IN_PARTS(buffer->prev_buffer_offset))); + translog_buffer_lock_assert_owner(buffer); + /* + if prev_sent_to_disk == LSN_IMPOSSIBLE then + prev_buffer_offset should be LSN_IMPOSSIBLE + because it means that this buffer was never used + */ + DBUG_ASSERT((buffer->prev_sent_to_disk == LSN_IMPOSSIBLE && + buffer->prev_buffer_offset == LSN_IMPOSSIBLE) || + buffer->prev_sent_to_disk != LSN_IMPOSSIBLE); + if (buffer->prev_buffer_offset != buffer->prev_sent_to_disk) + { + do { + pthread_cond_wait(&buffer->prev_sent_to_disk_cond, &buffer->mutex); + if (buffer->file != file || buffer->offset != offset || + buffer->ver != ver) + { + translog_buffer_unlock(buffer); + DBUG_RETURN(1); /* some the thread flushed the buffer already */ + } + } while(buffer->prev_buffer_offset != buffer->prev_sent_to_disk); + } + DBUG_RETURN(0); +} + + +/* + Flush given buffer + + SYNOPSIS + translog_buffer_flush() + buffer This buffer should be flushed + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_buffer_flush(struct st_translog_buffer *buffer) +{ + uint32 i, pg; + TRANSLOG_ADDRESS offset= buffer->offset; + TRANSLOG_FILE *file= buffer->file; + uint8 ver= 
buffer->ver;
+  uint skipped_data;
+  DBUG_ENTER("translog_buffer_flush");
+  /*
+    buffer->file may be NULL here (an already freed buffer is handled just
+    below), so it must not be dereferenced unconditionally in the trace.
+  */
+  DBUG_PRINT("enter",
+             ("Buffer: #%u 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
+              (uint) buffer->buffer_no, (ulong) buffer,
+              (buffer->file ? buffer->file->handler.file : -1),
+              LSN_IN_PARTS(buffer->offset),
+              (ulong) buffer->size));
+  translog_buffer_lock_assert_owner(buffer);
+
+  /* Nothing to do for a buffer which is not assigned to a file */
+  if (buffer->file == NULL)
+    DBUG_RETURN(0);
+
+  translog_wait_for_writers(buffer);
+
+  /*
+    file/offset/ver were saved on entry; a change after a wait means that
+    the buffer was flushed (and maybe reused) in the meantime.
+  */
+  if (buffer->file != file || buffer->offset != offset || buffer->ver != ver)
+    DBUG_RETURN(0); /* some the thread flushed the buffer already */
+
+  if (buffer->is_closing_buffer)
+  {
+    /* some other flush in progress */
+    translog_wait_for_closing(buffer);
+  }
+
+  if (buffer->file != file || buffer->offset != offset || buffer->ver != ver)
+    DBUG_RETURN(0); /* some the thread flushed the buffer already */
+
+  /*
+    NOTE(review): translog_prev_buffer_flush_wait() unlocks the buffer
+    before returning 1, while the other "already flushed" returns above
+    leave it locked - confirm that callers expect this.
+  */
+  if (buffer->overlay && translog_prev_buffer_flush_wait(buffer))
+    DBUG_RETURN(0); /* some the thread flushed the buffer already */
+
+  /*
+    Send page by page in the pagecache what we are going to write on the
+    disk
+  */
+  file= buffer->file;
+  skipped_data= buffer->skipped_data;
+  DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
+  for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
+       i < buffer->size;
+       i+= TRANSLOG_PAGE_SIZE, pg++)
+  {
+    TRANSLOG_ADDRESS addr= (buffer->offset + i);
+    TRANSLOG_VALIDATOR_DATA data;
+    DBUG_PRINT("info", ("send log form %lu till %lu address: (%lu,0x%lx) "
+                        "page #: %lu buffer size: %lu buffer: 0x%lx",
+                        (ulong) i, (ulong) (i + TRANSLOG_PAGE_SIZE),
+                        LSN_IN_PARTS(addr), (ulong) pg, (ulong) buffer->size,
+                        (ulong) buffer));
+    data.addr= &addr;
+    DBUG_ASSERT(log_descriptor.pagecache->block_size == TRANSLOG_PAGE_SIZE);
+    DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
+    /* Writing is allowed only in the OK and SHUTDOWN states */
+    if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
+      DBUG_RETURN(1);
+    if (pagecache_write_part(log_descriptor.pagecache,
+                             &file->handler, pg, 3,
+                             buffer->buffer + i,
+                             PAGECACHE_PLAIN_PAGE,
+
PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DONE, 0, + LSN_IMPOSSIBLE, + skipped_data, + TRANSLOG_PAGE_SIZE - skipped_data)) + { + DBUG_PRINT("error", + ("Can't write page (%lu,0x%lx) to pagecache, error: %d", + (ulong) buffer->file->number, + (ulong) (LSN_OFFSET(buffer->offset)+ i), + my_errno)); + translog_stop_writing(); + DBUG_RETURN(1); + } + skipped_data= 0; + } + file->is_sync= 0; + if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data, + buffer->size - buffer->skipped_data, + LSN_OFFSET(buffer->offset) + buffer->skipped_data, + log_write_flags)) + { + DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu " + "to the disk (%d)", + (ulong) file->handler.file, + (ulong) LSN_OFFSET(buffer->offset), + (ulong) buffer->size, errno)); + translog_stop_writing(); + DBUG_RETURN(1); + } + /* + Dropping the flag in such way can make false alarm: signalling than the + file in not sync when it is sync, but the situation is quite rare and + protections with mutexes give much more overhead to the whole engine + */ + file->is_sync= 0; + + if (LSN_OFFSET(buffer->last_lsn) != 0) /* if buffer->last_lsn is set */ + { + if (translog_prev_buffer_flush_wait(buffer)) + DBUG_RETURN(0); /* some the thread flushed the buffer already */ + translog_set_sent_to_disk(buffer); + } + else + translog_set_only_in_buffers(buffer->next_buffer_offset); + + /* say to next buffer that we are finished */ + { + struct st_translog_buffer *next_buffer= + log_descriptor.buffers + ((buffer->buffer_no + 1) % TRANSLOG_BUFFERS_NO); + if (likely(translog_status == TRANSLOG_OK)){ + translog_buffer_lock(next_buffer); + next_buffer->prev_sent_to_disk= buffer->offset; + translog_buffer_unlock(next_buffer); + pthread_cond_broadcast(&next_buffer->prev_sent_to_disk_cond); + } + else + { + /* + It is shutdown => + 1) there is only one thread + 2) mutexes of other buffers can be destroyed => we can't use them + */ + next_buffer->prev_sent_to_disk= 
buffer->offset; + } + } + /* Free buffer */ + buffer->file= NULL; + buffer->overlay= 0; + buffer->ver++; + pthread_mutex_lock(&log_descriptor.dirty_buffer_mask_lock); + log_descriptor.dirty_buffer_mask&= ~(1 << buffer->buffer_no); + pthread_mutex_unlock(&log_descriptor.dirty_buffer_mask_lock); + pthread_cond_broadcast(&buffer->waiting_filling_buffer); + DBUG_RETURN(0); +} + + +/* + Recover page with sector protection (wipe out failed chunks) + + SYNOPSYS + translog_recover_page_up_to_sector() + page reference on the page + offset offset of failed sector + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_recover_page_up_to_sector(uchar *page, uint16 offset) +{ + uint16 chunk_offset= translog_get_first_chunk_offset(page), valid_chunk_end; + DBUG_ENTER("translog_recover_page_up_to_sector"); + DBUG_PRINT("enter", ("offset: %u first chunk: %u", + (uint) offset, (uint) chunk_offset)); + + while (page[chunk_offset] != TRANSLOG_FILLER && chunk_offset < offset) + { + uint16 chunk_length; + if ((chunk_length= + translog_get_total_chunk_length(page, chunk_offset)) == 0) + { + DBUG_PRINT("error", ("cant get chunk length (offset %u)", + (uint) chunk_offset)); + DBUG_RETURN(1); + } + DBUG_PRINT("info", ("chunk: offset: %u length %u", + (uint) chunk_offset, (uint) chunk_length)); + if (((ulong) chunk_offset) + ((ulong) chunk_length) > TRANSLOG_PAGE_SIZE) + { + DBUG_PRINT("error", ("damaged chunk (offset %u) in trusted area", + (uint) chunk_offset)); + DBUG_RETURN(1); + } + chunk_offset+= chunk_length; + } + + valid_chunk_end= chunk_offset; + /* end of trusted area - sector parsing */ + while (page[chunk_offset] != TRANSLOG_FILLER) + { + uint16 chunk_length; + if ((chunk_length= + translog_get_total_chunk_length(page, chunk_offset)) == 0) + break; + + DBUG_PRINT("info", ("chunk: offset: %u length %u", + (uint) chunk_offset, (uint) chunk_length)); + if (((ulong) chunk_offset) + ((ulong) chunk_length) > + (uint) (offset + DISK_DRIVE_SECTOR_SIZE)) + break; + + chunk_offset+= 
chunk_length; + valid_chunk_end= chunk_offset; + } + DBUG_PRINT("info", ("valid chunk end offset: %u", (uint) valid_chunk_end)); + + memset(page + valid_chunk_end, TRANSLOG_FILLER, + TRANSLOG_PAGE_SIZE - valid_chunk_end); + + DBUG_RETURN(0); +} + + +/** + @brief Dummy write callback. +*/ + +static my_bool +translog_dummy_callback(uchar *page __attribute__((unused)), + pgcache_page_no_t page_no __attribute__((unused)), + uchar* data_ptr __attribute__((unused))) +{ + return 0; +} + + +/** + @brief Checks and removes sector protection. + + @param page reference on the page content. + @param file transaction log descriptor. + + @retvat 0 OK + @retval 1 Error +*/ + +static my_bool +translog_check_sector_protection(uchar *page, TRANSLOG_FILE *file) +{ + uint i, offset; + uchar *table= page + page_overhead[page[TRANSLOG_PAGE_FLAGS]] - + TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + uint8 current= table[0]; + DBUG_ENTER("translog_check_sector_protection"); + + for (i= 1, offset= DISK_DRIVE_SECTOR_SIZE; + i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + i++, offset+= DISK_DRIVE_SECTOR_SIZE) + { + /* + TODO: add chunk counting for "suspecting" sectors (difference is + more than 1-2), if difference more then present chunks then it is + the problem. + */ + uint8 test= page[offset]; + DBUG_PRINT("info", ("sector: #%u offset: %u current: %lx " + "read: 0x%x stored: 0x%x%x", + i, offset, (ulong) current, + (uint) uint2korr(page + offset), (uint) table[i], + (uint) table[i + 1])); + /* + 3 is minimal possible record length. So we can have "distance" + between 2 sectors value more then DISK_DRIVE_SECTOR_SIZE / 3 + only if it is old value, i.e. the sector was not written. 
+ */ + if (((test < current) && + ((uint)(0xFFL - current + test) > DISK_DRIVE_SECTOR_SIZE / 3)) || + ((test >= current) && + ((uint)(test - current) > DISK_DRIVE_SECTOR_SIZE / 3))) + { + if (translog_recover_page_up_to_sector(page, offset)) + DBUG_RETURN(1); + file->was_recovered= 1; + DBUG_RETURN(0); + } + + /* Restore value on the page */ + page[offset]= table[i]; + current= test; + DBUG_PRINT("info", ("sector: #%u offset: %u current: %lx " + "read: 0x%x stored: 0x%x", + i, offset, (ulong) current, + (uint) page[offset], (uint) table[i])); + } + DBUG_RETURN(0); +} + + +/** + @brief Log page validator (read callback) + + @param page The page data to check + @param page_no The page number (<offset>/<page length>) + @param data_ptr Read callback data pointer (pointer to TRANSLOG_FILE) + + @todo: add turning loghandler to read-only mode after merging with + that patch. + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_page_validator(uchar *page, + pgcache_page_no_t page_no, + uchar* data_ptr) +{ + uint this_page_page_overhead; + uint flags; + uchar *page_pos; + TRANSLOG_FILE *data= (TRANSLOG_FILE *) data_ptr; +#ifndef DBUG_OFF + pgcache_page_no_t offset= page_no * TRANSLOG_PAGE_SIZE; +#endif + DBUG_ENTER("translog_page_validator"); + + data->was_recovered= 0; + + if ((pgcache_page_no_t) uint3korr(page) != page_no || + (uint32) uint3korr(page + 3) != data->number) + { + DBUG_PRINT("error", ("Page (%lu,0x%lx): " + "page address written in the page is incorrect: " + "File %lu instead of %lu or page %lu instead of %lu", + (ulong) data->number, (ulong) offset, + (ulong) uint3korr(page + 3), (ulong) data->number, + (ulong) uint3korr(page), + (ulong) page_no)); + DBUG_RETURN(1); + } + flags= (uint)(page[TRANSLOG_PAGE_FLAGS]); + this_page_page_overhead= page_overhead[flags]; + if (flags & ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | + TRANSLOG_RECORD_CRC)) + { + DBUG_PRINT("error", ("Page (%lu,0x%lx): " + "Garbage in the page flags field detected : 
%x", + (ulong) data->number, (ulong) offset, + (uint) flags)); + DBUG_RETURN(1); + } + page_pos= page + (3 + 3 + 1); + if (flags & TRANSLOG_PAGE_CRC) + { + uint32 crc= translog_crc(page + this_page_page_overhead, + TRANSLOG_PAGE_SIZE - + this_page_page_overhead); + if (crc != uint4korr(page_pos)) + { + DBUG_PRINT("error", ("Page (%lu,0x%lx): " + "CRC mismatch: calculated: %lx on the page %lx", + (ulong) data->number, (ulong) offset, + (ulong) crc, (ulong) uint4korr(page_pos))); + DBUG_RETURN(1); + } + page_pos+= CRC_SIZE; /* Skip crc */ + } + if (flags & TRANSLOG_SECTOR_PROTECTION && + translog_check_sector_protection(page, data)) + { + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + +/** + @brief Locks the loghandler. +*/ + +void translog_lock() +{ + uint8 current_buffer; + DBUG_ENTER("translog_lock"); + + /* + Locking the loghandler mean locking current buffer, but it can change + during locking, so we should check it + */ + for (;;) + { + /* + log_descriptor.bc.buffer_no is only one byte so its reading is + an atomic operation + */ + current_buffer= log_descriptor.bc.buffer_no; + translog_buffer_lock(log_descriptor.buffers + current_buffer); + if (log_descriptor.bc.buffer_no == current_buffer) + break; + translog_buffer_unlock(log_descriptor.buffers + current_buffer); + } + DBUG_VOID_RETURN; +} + + +/* + Unlock the loghandler + + SYNOPSIS + translog_unlock() + + RETURN + 0 OK + 1 Error +*/ + +void translog_unlock() +{ + translog_buffer_unlock(log_descriptor.bc.buffer); +} + + +/** + @brief Get log page by file number and offset of the beginning of the page + + @param data validator data, which contains the page address + @param buffer buffer for page placing + (might not be used in some cache implementations) + @param direct_link if it is not NULL then caller can accept direct + link to the page cache + + @retval NULL Error + @retval # pointer to the page cache which should be used to read this page +*/ + +static uchar *translog_get_page(TRANSLOG_VALIDATOR_DATA 
*data, uchar *buffer, + PAGECACHE_BLOCK_LINK **direct_link) +{ + TRANSLOG_ADDRESS addr= *(data->addr), in_buffers; + uint32 file_no= LSN_FILE_NO(addr); + TRANSLOG_FILE *file; + DBUG_ENTER("translog_get_page"); + DBUG_PRINT("enter", ("File: %lu Offset: %lu(0x%lx)", + (ulong) file_no, + (ulong) LSN_OFFSET(addr), + (ulong) LSN_OFFSET(addr))); + + /* it is really page address */ + DBUG_ASSERT(LSN_OFFSET(addr) % TRANSLOG_PAGE_SIZE == 0); + if (direct_link) + *direct_link= NULL; + +restart: + + in_buffers= translog_only_in_buffers(); + DBUG_PRINT("info", ("in_buffers: (%lu,0x%lx)", + LSN_IN_PARTS(in_buffers))); + if (in_buffers != LSN_IMPOSSIBLE && + cmp_translog_addr(addr, in_buffers) >= 0) + { + translog_lock(); + DBUG_ASSERT(cmp_translog_addr(addr, log_descriptor.horizon) < 0); + /* recheck with locked loghandler */ + in_buffers= translog_only_in_buffers(); + if (cmp_translog_addr(addr, in_buffers) >= 0) + { + uint16 buffer_no= log_descriptor.bc.buffer_no; +#ifndef DBUG_OFF + uint16 buffer_start= buffer_no; +#endif + struct st_translog_buffer *buffer_unlock= log_descriptor.bc.buffer; + struct st_translog_buffer *curr_buffer= log_descriptor.bc.buffer; + for (;;) + { + /* + if the page is in the buffer and it is the last version of the + page (in case of division the page by buffer flush) + */ + if (curr_buffer->file != NULL && + cmp_translog_addr(addr, curr_buffer->offset) >= 0 && + cmp_translog_addr(addr, + (curr_buffer->next_buffer_offset ? 
+                               curr_buffer->next_buffer_offset:
+                               curr_buffer->offset + curr_buffer->size)) < 0)
+        {
+          TRANSLOG_ADDRESS offset= curr_buffer->offset;
+          TRANSLOG_FILE *fl= curr_buffer->file;
+          uchar *from, *table= NULL;
+          int is_last_unfinished_page;
+          uint last_protected_sector= 0;
+          uint skipped_data= curr_buffer->skipped_data;
+          TRANSLOG_FILE file_copy;
+          uint8 ver= curr_buffer->ver;
+          translog_wait_for_writers(curr_buffer);
+          /*
+            offset/fl/ver were saved before the wait; if any of them
+            changed, the buffer was flushed meanwhile and the search has
+            to be started again.
+          */
+          if (offset != curr_buffer->offset || fl != curr_buffer->file ||
+              ver != curr_buffer->ver)
+          {
+            DBUG_ASSERT(buffer_unlock == curr_buffer);
+            translog_buffer_unlock(buffer_unlock);
+            goto restart;
+          }
+          DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
+          from= curr_buffer->buffer + (addr - curr_buffer->offset);
+          if (skipped_data && addr == curr_buffer->offset)
+          {
+            /*
+              We read page part of which is not present in buffer,
+              so we should read absent part from file (page cache actually)
+            */
+            file= get_logfile_by_number(file_no);
+            DBUG_ASSERT(file != NULL);
+            /*
+              it's ok to not lock the page because:
+                - The log handler has it's own page cache.
+                - There is only one thread that can access the log
+                  cache at a time
+            */
+            if (!(buffer= pagecache_read(log_descriptor.pagecache,
+                                         &file->handler,
+                                         LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+                                         3, buffer,
+                                         PAGECACHE_PLAIN_PAGE,
+                                         PAGECACHE_LOCK_LEFT_UNLOCKED,
+                                         NULL)))
+            {
+              /*
+                The mutex of the current buffer is still held at this
+                point; release it before reporting the error, otherwise
+                the buffer stays locked forever.
+              */
+              DBUG_ASSERT(buffer_unlock == curr_buffer);
+              translog_buffer_unlock(buffer_unlock);
+              DBUG_RETURN(NULL);
+            }
+          }
+          else
+            skipped_data= 0;  /* Read after skipped in buffer data */
+          /*
+            Now we have correct data in buffer up to 'skipped_data'. The
+            following memcpy() will move the data from the internal buffer
+            that was not yet on disk.
+          */
+          memcpy(buffer + skipped_data, from + skipped_data,
+                 TRANSLOG_PAGE_SIZE - skipped_data);
+          /*
+            We can use copy then in translog_page_validator() because it
+            do not put it permanently somewhere.
+ We have to use copy because after releasing log lock we can't + guaranty that the file still be present (in real life it will be + present but theoretically possible that it will be released + already from last files cache); + */ + file_copy= *(curr_buffer->file); + file_copy.handler.callback_data= (uchar*) &file_copy; + is_last_unfinished_page= ((log_descriptor.bc.buffer == + curr_buffer) && + (log_descriptor.bc.ptr >= from) && + (log_descriptor.bc.ptr < + from + TRANSLOG_PAGE_SIZE)); + if (is_last_unfinished_page && + (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION)) + { + last_protected_sector= ((log_descriptor.bc.previous_offset - 1) / + DISK_DRIVE_SECTOR_SIZE); + table= buffer + log_descriptor.page_overhead - + TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + } + + DBUG_ASSERT(buffer_unlock == curr_buffer); + translog_buffer_unlock(buffer_unlock); + if (is_last_unfinished_page) + { + uint i; + /* + This is last unfinished page => we should not check CRC and + remove only that protection which already installed (no need + to check it) + + We do not check the flag of sector protection, because if + (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) is + not set then last_protected_sector will be 0 so following loop + will be never executed + */ + DBUG_PRINT("info", ("This is last unfinished page, " + "last protected sector %u", + last_protected_sector)); + for (i= 1; i <= last_protected_sector; i++) + { + uint offset= i * DISK_DRIVE_SECTOR_SIZE; + DBUG_PRINT("info", ("Sector %u: 0x%02x <- 0x%02x", + i, buffer[offset], + table[i])); + buffer[offset]= table[i]; + } + } + else + { + /* + This IF should be true because we use in-memory data which + supposed to be correct. 
+ */ + if (translog_page_validator(buffer, + LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE, + (uchar*) &file_copy)) + { + DBUG_ASSERT(0); + buffer= NULL; + } + } + DBUG_RETURN(buffer); + } + buffer_no= (buffer_no + 1) % TRANSLOG_BUFFERS_NO; + curr_buffer= log_descriptor.buffers + buffer_no; + translog_buffer_lock(curr_buffer); + translog_buffer_unlock(buffer_unlock); + buffer_unlock= curr_buffer; + /* we can't make a full circle */ + DBUG_ASSERT(buffer_start != buffer_no); + } + } + translog_unlock(); + } + file= get_logfile_by_number(file_no); + DBUG_ASSERT(file != NULL); + buffer= pagecache_read(log_descriptor.pagecache, &file->handler, + LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE, + 3, (direct_link ? NULL : buffer), + PAGECACHE_PLAIN_PAGE, + (direct_link ? + PAGECACHE_LOCK_READ : + PAGECACHE_LOCK_LEFT_UNLOCKED), + direct_link); + DBUG_PRINT("info", ("Direct link is assigned to : 0x%lx * 0x%lx", + (ulong) direct_link, + (ulong)(direct_link ? *direct_link : NULL))); + data->was_recovered= file->was_recovered; + DBUG_RETURN(buffer); +} + + +/** + @brief free direct log page link + + @param direct_link the direct log page link to be freed + +*/ + +static void translog_free_link(PAGECACHE_BLOCK_LINK *direct_link) +{ + DBUG_ENTER("translog_free_link"); + DBUG_PRINT("info", ("Direct link: 0x%lx", + (ulong) direct_link)); + if (direct_link) + pagecache_unlock_by_link(log_descriptor.pagecache, direct_link, + PAGECACHE_LOCK_READ_UNLOCK, PAGECACHE_UNPIN, + LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, 0, FALSE); + DBUG_VOID_RETURN; +} + + +/** + @brief Finds last full page of the given log file. + + @param addr address structure to fill with data, which contain + file number of the log file + @param last_page_ok Result of the check whether last page OK. + (for now only we check only that file length + divisible on page length). 
+ @param no_errors suppress messages about non-critical errors + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_get_last_page_addr(TRANSLOG_ADDRESS *addr, + my_bool *last_page_ok, + my_bool no_errors) +{ + char path[FN_REFLEN]; + uint32 rec_offset; + my_off_t file_size; + uint32 file_no= LSN_FILE_NO(*addr); + TRANSLOG_FILE *file; +#ifndef DBUG_OFF + char buff[21]; +#endif + DBUG_ENTER("translog_get_last_page_addr"); + + if (likely((file= get_logfile_by_number(file_no)) != NULL)) + { + /* + This function used only during initialization of loghandler or in + scanner (which mean we need read that part of the log), so the + requested log file have to be opened and can't be freed after + returning pointer on it (file_size). + */ + file_size= my_seek(file->handler.file, 0, SEEK_END, MYF(0)); + } + else + { + /* + This branch is used only during very early initialization + when files are not opened. + */ + File fd; + if ((fd= my_open(translog_filename_by_fileno(file_no, path), + O_RDONLY, (no_errors ? 
MYF(0) : MYF(MY_WME)))) < 0) + { + my_errno= errno; + DBUG_PRINT("error", ("Error %d during opening file #%d", + errno, file_no)); + DBUG_RETURN(1); + } + file_size= my_seek(fd, 0, SEEK_END, MYF(0)); + my_close(fd, MYF(0)); + } + DBUG_PRINT("info", ("File size: %s", llstr(file_size, buff))); + if (file_size == MY_FILEPOS_ERROR) + DBUG_RETURN(1); + DBUG_ASSERT(file_size < ULL(0xffffffff)); + if (((uint32)file_size) > TRANSLOG_PAGE_SIZE) + { + rec_offset= (((((uint32)file_size) / TRANSLOG_PAGE_SIZE) - 1) * + TRANSLOG_PAGE_SIZE); + *last_page_ok= (((uint32)file_size) == rec_offset + TRANSLOG_PAGE_SIZE); + } + else + { + *last_page_ok= 0; + rec_offset= 0; + } + *addr= MAKE_LSN(file_no, rec_offset); + DBUG_PRINT("info", ("Last page: 0x%lx ok: %d", (ulong) rec_offset, + *last_page_ok)); + DBUG_RETURN(0); +} + + +/** + @brief Get number bytes for record length storing + + @param length Record length which will be encoded + + @return 1,3,4,5 - number of bytes to store given length +*/ + +static uint translog_variable_record_length_bytes(translog_size_t length) +{ + if (length < 250) + return 1; + if (length < 0xFFFF) + return 3; + if (length < (ulong) 0xFFFFFF) + return 4; + return 5; +} + + +/** + @brief Gets header of this chunk. 
+ + @param chunk The pointer to the chunk beginning + + @retval # total length of the chunk + @retval 0 Error +*/ + +static uint16 translog_get_chunk_header_length(uchar *chunk) +{ + DBUG_ENTER("translog_get_chunk_header_length"); + switch (*chunk & TRANSLOG_CHUNK_TYPE) { + case TRANSLOG_CHUNK_LSN: + { + /* 0 chunk referred as LSN (head or tail) */ + translog_size_t rec_len; + uchar *start= chunk; + uchar *ptr= start + 1 + 2; + uint16 chunk_len, header_len; + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN")); + rec_len= translog_variable_record_1group_decode_len(&ptr); + chunk_len= uint2korr(ptr); + header_len= (uint16) (ptr - start) +2; + DBUG_PRINT("info", ("rec len: %lu chunk len: %u header len: %u", + (ulong) rec_len, (uint) chunk_len, (uint) header_len)); + if (chunk_len) + { + /* TODO: fine header end */ + /* + The last chunk of multi-group record can be base for it header + calculation (we skip to the first group to read the header) so if we + stuck here something is wrong. + */ + DBUG_ASSERT(0); + DBUG_RETURN(0); /* Keep compiler happy */ + } + DBUG_RETURN(header_len); + } + case TRANSLOG_CHUNK_FIXED: + { + /* 1 (pseudo)fixed record (also LSN) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED = 3")); + DBUG_RETURN(3); + } + case TRANSLOG_CHUNK_NOHDR: + /* 2 no header chunk (till page end) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR = 1")); + DBUG_RETURN(1); + break; + case TRANSLOG_CHUNK_LNGTH: + /* 3 chunk with chunk length */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH = 3")); + DBUG_RETURN(3); + break; + default: + DBUG_ASSERT(0); + DBUG_RETURN(0); /* Keep compiler happy */ + } +} + + +/** + @brief Truncate the log to the given address. Used during the startup if the + end of log if corrupted. 
+ + @param addr new horizon + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_truncate_log(TRANSLOG_ADDRESS addr) +{ + uchar *page; + TRANSLOG_ADDRESS current_page; + uint32 next_page_offset, page_rest; + uint32 i; + File fd; + int rc; + TRANSLOG_VALIDATOR_DATA data; + char path[FN_REFLEN]; + uchar page_buff[TRANSLOG_PAGE_SIZE]; + DBUG_ENTER("translog_truncate_log"); + /* TODO: write warning to the client */ + DBUG_PRINT("warning", ("removing all records from (%lu,0x%lx) " + "till (%lu,0x%lx)", + LSN_IN_PARTS(addr), + LSN_IN_PARTS(log_descriptor.horizon))); + DBUG_ASSERT(cmp_translog_addr(addr, log_descriptor.horizon) < 0); + /* remove files between the address and horizon */ + for (i= LSN_FILE_NO(addr) + 1; i <= LSN_FILE_NO(log_descriptor.horizon); i++) + if (my_delete(translog_filename_by_fileno(i, path), MYF(MY_WME))) + { + translog_unlock(); + DBUG_RETURN(1); + } + + /* truncate the last file up to the last page */ + next_page_offset= LSN_OFFSET(addr); + next_page_offset= (next_page_offset - + ((next_page_offset - 1) % TRANSLOG_PAGE_SIZE + 1) + + TRANSLOG_PAGE_SIZE); + page_rest= next_page_offset - LSN_OFFSET(addr); + memset(page_buff, TRANSLOG_FILLER, page_rest); + rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 || + ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) || + (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr), + log_write_flags)) || + my_sync(fd, MYF(MY_WME))))); + translog_syncs++; + rc|= (fd > 0 && my_close(fd, MYF(MY_WME))); + if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS) + { + rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD)); + translog_syncs++; + } + if (rc) + DBUG_RETURN(1); + + /* fix the horizon */ + log_descriptor.horizon= addr; + /* fix the buffer data */ + current_page= MAKE_LSN(LSN_FILE_NO(addr), (next_page_offset - + TRANSLOG_PAGE_SIZE)); + data.addr= ¤t_page; + if ((page= translog_get_page(&data, log_descriptor.buffers->buffer, NULL)) == + NULL) + 
DBUG_RETURN(1); + if (page != log_descriptor.buffers->buffer) + memcpy(log_descriptor.buffers->buffer, page, TRANSLOG_PAGE_SIZE); + log_descriptor.bc.buffer->offset= current_page; + log_descriptor.bc.buffer->size= LSN_OFFSET(addr) - LSN_OFFSET(current_page); + log_descriptor.bc.ptr= + log_descriptor.buffers->buffer + log_descriptor.bc.buffer->size; + log_descriptor.bc.current_page_fill= log_descriptor.bc.buffer->size; + DBUG_RETURN(0); +} + + +/** + Applies function 'callback' to all files (in a directory) which + name looks like a log's name (aria_log.[0-9]{7}). + If 'callback' returns TRUE this interrupts the walk and returns + TRUE. Otherwise FALSE is returned after processing all log files. + It cannot just use log_descriptor.directory because that may not yet have + been initialized. + + @param directory directory to scan + @param callback function to apply; is passed directory and base + name of found file +*/ + +my_bool translog_walk_filenames(const char *directory, + my_bool (*callback)(const char *, + const char *)) +{ + MY_DIR *dirp; + uint i; + my_bool rc= FALSE; + + /* Finds and removes transaction log files */ + if (!(dirp = my_dir(directory, MYF(MY_DONT_SORT)))) + return FALSE; + + for (i= 0; i < dirp->number_off_files; i++) + { + char *file= dirp->dir_entry[i].name; + if (strncmp(file, "aria_log.", 10) == 0 && + file[10] >= '0' && file[10] <= '9' && + file[11] >= '0' && file[11] <= '9' && + file[12] >= '0' && file[12] <= '9' && + file[13] >= '0' && file[13] <= '9' && + file[14] >= '0' && file[14] <= '9' && + file[15] >= '0' && file[15] <= '9' && + file[16] >= '0' && file[16] <= '9' && + file[17] >= '0' && file[17] <= '9' && + file[18] == '\0' && (*callback)(directory, file)) + { + rc= TRUE; + break; + } + } + my_dirend(dirp); + return rc; +} + + +/** + @brief Fills table of dependence length of page header from page flags +*/ + +static void translog_fill_overhead_table() +{ + uint i; + for (i= 0; i < TRANSLOG_FLAGS_NUM; i++) + { + page_overhead[i]= 
7; + if (i & TRANSLOG_PAGE_CRC) + page_overhead[i]+= CRC_SIZE; + if (i & TRANSLOG_SECTOR_PROTECTION) + page_overhead[i]+= TRANSLOG_PAGE_SIZE / + DISK_DRIVE_SECTOR_SIZE; + } +} + + +/** + Callback to find first log in directory. +*/ + +static my_bool translog_callback_search_first(const char *directory + __attribute__((unused)), + const char *filename + __attribute__((unused))) +{ + return TRUE; +} + + +/** + @brief Checks that chunk is LSN one + + @param type type of the chunk + + @retval 1 the chunk is LNS + @retval 0 the chunk is not LSN +*/ + +static my_bool translog_is_LSN_chunk(uchar type) +{ + DBUG_ENTER("translog_is_LSN_chunk"); + DBUG_PRINT("info", ("byte: %x chunk type: %u record type: %u", + type, type >> 6, type & TRANSLOG_REC_TYPE)); + DBUG_RETURN(((type & TRANSLOG_CHUNK_TYPE) == TRANSLOG_CHUNK_FIXED) || + (((type & TRANSLOG_CHUNK_TYPE) == TRANSLOG_CHUNK_LSN) && + ((type & TRANSLOG_REC_TYPE)) != TRANSLOG_CHUNK_0_CONT)); +} + + +/** + @brief Initialize transaction log + + @param directory Directory where log files are put + @param log_file_max_size max size of one log size (for new logs creation) + @param server_version version of MySQL server (MYSQL_VERSION_ID) + @param server_id server ID (replication & Co) + @param pagecache Page cache for the log reads + @param flags flags (TRANSLOG_PAGE_CRC, TRANSLOG_SECTOR_PROTECTION + TRANSLOG_RECORD_CRC) + @param read_only Put transaction log in read-only mode + @param init_table_func function to initialize record descriptors table + @param no_errors suppress messages about non-critical errors + + @todo + Free used resources in case of error. 
+ + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_init_with_table(const char *directory, + uint32 log_file_max_size, + uint32 server_version, + uint32 server_id, PAGECACHE *pagecache, + uint flags, my_bool readonly, + void (*init_table_func)(), + my_bool no_errors) +{ + int i; + int old_log_was_recovered= 0, logs_found= 0; + uint old_flags= flags; + uint32 start_file_num= 1; + TRANSLOG_ADDRESS sure_page, last_page, last_valid_page, checkpoint_lsn; + my_bool version_changed= 0; + DBUG_ENTER("translog_init_with_table"); + + translog_syncs= 0; + flush_start= 0; + id_to_share= NULL; + + log_descriptor.directory_fd= -1; + log_descriptor.is_everything_flushed= 1; + log_descriptor.flush_in_progress= 0; + log_descriptor.flush_no= 0; + log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE; + + (*init_table_func)(); + compile_time_assert(sizeof(log_descriptor.dirty_buffer_mask) * 8 >= + TRANSLOG_BUFFERS_NO); + log_descriptor.dirty_buffer_mask= 0; + if (readonly) + log_descriptor.open_flags= O_BINARY | O_RDONLY; + else + log_descriptor.open_flags= O_BINARY | O_RDWR; + if (pthread_mutex_init(&log_descriptor.sent_to_disk_lock, + MY_MUTEX_INIT_FAST) || + pthread_mutex_init(&log_descriptor.file_header_lock, + MY_MUTEX_INIT_FAST) || + pthread_mutex_init(&log_descriptor.unfinished_files_lock, + MY_MUTEX_INIT_FAST) || + pthread_mutex_init(&log_descriptor.purger_lock, + MY_MUTEX_INIT_FAST) || + pthread_mutex_init(&log_descriptor.log_flush_lock, + MY_MUTEX_INIT_FAST) || + pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock, + MY_MUTEX_INIT_FAST) || + pthread_cond_init(&log_descriptor.log_flush_cond, 0) || + pthread_cond_init(&log_descriptor.new_goal_cond, 0) || + my_rwlock_init(&log_descriptor.open_files_lock, + NULL) || + my_init_dynamic_array(&log_descriptor.open_files, + sizeof(TRANSLOG_FILE*), 10, 10) || + my_init_dynamic_array(&log_descriptor.unfinished_files, + sizeof(struct st_file_counter), + 10, 10)) + goto err; + log_descriptor.min_need_file= 0; + 
log_descriptor.min_file_number= 0; + log_descriptor.last_lsn_checked= LSN_IMPOSSIBLE; + + /* Directory to store files */ + unpack_dirname(log_descriptor.directory, directory); +#ifndef __WIN__ + if ((log_descriptor.directory_fd= my_open(log_descriptor.directory, + O_RDONLY, MYF(MY_WME))) < 0) + { + my_errno= errno; + DBUG_PRINT("error", ("Error %d during opening directory '%s'", + errno, log_descriptor.directory)); + goto err; + } +#endif + log_descriptor.in_buffers_only= LSN_IMPOSSIBLE; + DBUG_ASSERT(log_file_max_size % TRANSLOG_PAGE_SIZE == 0 && + log_file_max_size >= TRANSLOG_MIN_FILE_SIZE); + /* max size of one log size (for new logs creation) */ + log_file_size= log_descriptor.log_file_max_size= + log_file_max_size; + /* server version */ + log_descriptor.server_version= server_version; + /* server ID */ + log_descriptor.server_id= server_id; + /* Page cache for the log reads */ + log_descriptor.pagecache= pagecache; + /* Flags */ + DBUG_ASSERT((flags & + ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | + TRANSLOG_RECORD_CRC)) == 0); + log_descriptor.flags= flags; + translog_fill_overhead_table(); + log_descriptor.page_overhead= page_overhead[flags]; + log_descriptor.page_capacity_chunk_2= + TRANSLOG_PAGE_SIZE - log_descriptor.page_overhead - 1; + compile_time_assert(TRANSLOG_WRITE_BUFFER % TRANSLOG_PAGE_SIZE == 0); + log_descriptor.buffer_capacity_chunk_2= + (TRANSLOG_WRITE_BUFFER / TRANSLOG_PAGE_SIZE) * + log_descriptor.page_capacity_chunk_2; + log_descriptor.half_buffer_capacity_chunk_2= + log_descriptor.buffer_capacity_chunk_2 / 2; + DBUG_PRINT("info", + ("Overhead: %u pc2: %u bc2: %u, bc2/2: %u", + log_descriptor.page_overhead, + log_descriptor.page_capacity_chunk_2, + log_descriptor.buffer_capacity_chunk_2, + log_descriptor.half_buffer_capacity_chunk_2)); + + /* Just to init it somehow (hack for bootstrap)*/ + { + TRANSLOG_FILE *file= 0; + log_descriptor.min_file = log_descriptor.max_file= 1; + insert_dynamic(&log_descriptor.open_files, (uchar 
*)&file); + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + pop_dynamic(&log_descriptor.open_files); + } + + /* Buffers for log writing */ + for (i= 0; i < TRANSLOG_BUFFERS_NO; i++) + { + if (translog_buffer_init(log_descriptor.buffers + i, i)) + goto err; + DBUG_PRINT("info", ("translog_buffer buffer #%u: 0x%lx", + i, (ulong) log_descriptor.buffers + i)); + } + + /* + last_logno and last_checkpoint_lsn were set in + ma_control_file_create_or_open() + */ + logs_found= (last_logno != FILENO_IMPOSSIBLE); + + translog_status= (readonly ? TRANSLOG_READONLY : TRANSLOG_OK); + checkpoint_lsn= last_checkpoint_lsn; + + if (logs_found) + { + my_bool pageok; + DBUG_PRINT("info", ("log found...")); + /* + TODO: scan directory for aria_log.XXXXXXXX files and find + highest XXXXXXXX & set logs_found + TODO: check that last checkpoint within present log addresses space + + find the log end + */ + if (LSN_FILE_NO(last_checkpoint_lsn) == FILENO_IMPOSSIBLE) + { + DBUG_ASSERT(LSN_OFFSET(last_checkpoint_lsn) == 0); + /* only last log needs to be checked */ + sure_page= MAKE_LSN(last_logno, TRANSLOG_PAGE_SIZE); + } + else + { + sure_page= last_checkpoint_lsn; + DBUG_ASSERT(LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE != 0); + sure_page-= LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE; + } + /* Set horizon to the beginning of the last file first */ + log_descriptor.horizon= last_page= MAKE_LSN(last_logno, 0); + if (translog_get_last_page_addr(&last_page, &pageok, no_errors)) + { + if (!translog_walk_filenames(log_descriptor.directory, + &translog_callback_search_first)) + { + /* + Files was deleted, just start from the next log number, so that + existing tables are in the past. 
+ */ + start_file_num= last_logno + 1; + checkpoint_lsn= LSN_IMPOSSIBLE; /* no log so no checkpoint */ + logs_found= 0; + } + else + goto err; + } + else if (LSN_OFFSET(last_page) == 0) + { + if (LSN_FILE_NO(last_page) == 1) + { + logs_found= 0; /* file #1 has no pages */ + DBUG_PRINT("info", ("log found. But is is empty => no log assumed")); + } + else + { + last_page-= LSN_ONE_FILE; + if (translog_get_last_page_addr(&last_page, &pageok, 0)) + goto err; + } + } + if (logs_found) + { + uint32 i; + log_descriptor.min_file= translog_first_file(log_descriptor.horizon, 1); + log_descriptor.max_file= last_logno; + /* Open all files */ + if (allocate_dynamic(&log_descriptor.open_files, + log_descriptor.max_file - + log_descriptor.min_file + 1)) + goto err; + for (i = log_descriptor.max_file; i >= log_descriptor.min_file; i--) + { + /* + We can't allocate all file together because they will be freed + one by one + */ + TRANSLOG_FILE *file= (TRANSLOG_FILE *)my_malloc(sizeof(TRANSLOG_FILE), + MYF(0)); + + compile_time_assert(MY_FILEPOS_ERROR > ULL(0xffffffff)); + if (file == NULL || + (file->handler.file= + open_logfile_by_number_no_cache(i)) < 0 || + my_seek(file->handler.file, 0, SEEK_END, MYF(0)) >= + ULL(0xffffffff)) + { + int j; + for (j= i - log_descriptor.min_file - 1; j > 0; j--) + { + TRANSLOG_FILE *el= + *dynamic_element(&log_descriptor.open_files, j, + TRANSLOG_FILE **); + my_close(el->handler.file, MYF(MY_WME)); + my_free(el, MYF(0)); + } + if (file) + { + free(file); + goto err; + } + else + goto err; + } + translog_file_init(file, i, 1); + /* we allocated space so it can't fail */ + insert_dynamic(&log_descriptor.open_files, (uchar *)&file); + } + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + } + } + else if (readonly) + { + /* There is no logs and there is read-only mode => nothing to read */ + DBUG_PRINT("error", ("No logs and read-only mode")); + goto err; + } + + if (logs_found) + { + 
TRANSLOG_ADDRESS current_page= sure_page; + my_bool pageok; + + DBUG_PRINT("info", ("The log is really present")); + DBUG_ASSERT(sure_page <= last_page); + + /* TODO: check page size */ + + last_valid_page= LSN_IMPOSSIBLE; + /* + Scans and validate pages. We need it to show "outside" only for sure + valid part of the log. If the log was damaged then fixed we have to + cut off damaged part before some other process start write something + in the log. + */ + do + { + TRANSLOG_ADDRESS current_file_last_page; + current_file_last_page= current_page; + if (translog_get_last_page_addr(¤t_file_last_page, &pageok, 0)) + goto err; + if (!pageok) + { + DBUG_PRINT("error", ("File %lu have no complete last page", + (ulong) LSN_FILE_NO(current_file_last_page))); + old_log_was_recovered= 1; + /* This file is not written till the end so it should be last */ + last_page= current_file_last_page; + /* TODO: issue warning */ + } + do + { + TRANSLOG_VALIDATOR_DATA data; + TRANSLOG_PAGE_SIZE_BUFF psize_buff; + uchar *page; + data.addr= ¤t_page; + if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL) + goto err; + if (data.was_recovered) + { + DBUG_PRINT("error", ("file no: %lu (%d) " + "rec_offset: 0x%lx (%lu) (%d)", + (ulong) LSN_FILE_NO(current_page), + (uint3korr(page + 3) != + LSN_FILE_NO(current_page)), + (ulong) LSN_OFFSET(current_page), + (ulong) (LSN_OFFSET(current_page) / + TRANSLOG_PAGE_SIZE), + (uint3korr(page) != + LSN_OFFSET(current_page) / + TRANSLOG_PAGE_SIZE))); + old_log_was_recovered= 1; + break; + } + old_flags= page[TRANSLOG_PAGE_FLAGS]; + last_valid_page= current_page; + current_page+= TRANSLOG_PAGE_SIZE; /* increase offset */ + } while (current_page <= current_file_last_page); + current_page+= LSN_ONE_FILE; + current_page= LSN_REPLACE_OFFSET(current_page, TRANSLOG_PAGE_SIZE); + } while (LSN_FILE_NO(current_page) <= LSN_FILE_NO(last_page) && + !old_log_was_recovered); + if (last_valid_page == LSN_IMPOSSIBLE) + { + /* Panic!!! 
Even page which should be valid is invalid */ + /* TODO: issue error */ + goto err; + } + DBUG_PRINT("info", ("Last valid page is in file: %lu " + "offset: %lu (0x%lx) " + "Logs found: %d was recovered: %d " + "flags match: %d", + (ulong) LSN_FILE_NO(last_valid_page), + (ulong) LSN_OFFSET(last_valid_page), + (ulong) LSN_OFFSET(last_valid_page), + logs_found, old_log_was_recovered, + (old_flags == flags))); + + /* TODO: check server ID */ + if (logs_found && !old_log_was_recovered && old_flags == flags) + { + TRANSLOG_VALIDATOR_DATA data; + TRANSLOG_PAGE_SIZE_BUFF psize_buff; + uchar *page; + uint16 chunk_offset; + data.addr= &last_valid_page; + /* continue old log */ + DBUG_ASSERT(LSN_FILE_NO(last_valid_page)== + LSN_FILE_NO(log_descriptor.horizon)); + if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL || + (chunk_offset= translog_get_first_chunk_offset(page)) == 0) + goto err; + + /* Puts filled part of old page in the buffer */ + log_descriptor.horizon= last_valid_page; + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + /* + Free space if filled with TRANSLOG_FILLER and first uchar of + real chunk can't be TRANSLOG_FILLER + */ + while (chunk_offset < TRANSLOG_PAGE_SIZE && + page[chunk_offset] != TRANSLOG_FILLER) + { + uint16 chunk_length; + if ((chunk_length= + translog_get_total_chunk_length(page, chunk_offset)) == 0) + goto err; + DBUG_PRINT("info", ("chunk: offset: %u length: %u", + (uint) chunk_offset, (uint) chunk_length)); + chunk_offset+= chunk_length; + + /* chunk can't cross the page border */ + DBUG_ASSERT(chunk_offset <= TRANSLOG_PAGE_SIZE); + } + memcpy(log_descriptor.buffers->buffer, page, chunk_offset); + log_descriptor.bc.buffer->size+= chunk_offset; + log_descriptor.bc.ptr+= chunk_offset; + log_descriptor.bc.current_page_fill= chunk_offset; + log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon, + (chunk_offset + + LSN_OFFSET(last_valid_page))); + DBUG_PRINT("info", ("Move Page #%u: 0x%lx 
chaser: %d Size: %lu (%lu)", + (uint) log_descriptor.bc.buffer_no, + (ulong) log_descriptor.bc.buffer, + log_descriptor.bc.chaser, + (ulong) log_descriptor.bc.buffer->size, + (ulong) (log_descriptor.bc.ptr - log_descriptor.bc. + buffer->buffer))); + translog_check_cursor(&log_descriptor.bc); + } + if (!old_log_was_recovered && old_flags == flags) + { + LOGHANDLER_FILE_INFO info; + LINT_INIT_STRUCT(info); + + /* + Accessing &log_descriptor.open_files without mutex is safe + because it is initialization + */ + if (translog_read_file_header(&info, + (*dynamic_element(&log_descriptor. + open_files, + 0, TRANSLOG_FILE **))-> + handler.file)) + goto err; + version_changed= (info.maria_version != TRANSLOG_VERSION_ID); + } + } + DBUG_PRINT("info", ("Logs found: %d was recovered: %d", + logs_found, old_log_was_recovered)); + if (!logs_found) + { + TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(sizeof(TRANSLOG_FILE), + MYF(0)); + DBUG_PRINT("info", ("The log is not found => we will create new log")); + if (file == NULL) + goto err; + /* Start new log system from scratch */ + log_descriptor.horizon= MAKE_LSN(start_file_num, + TRANSLOG_PAGE_SIZE); /* header page */ + if ((file->handler.file= + create_logfile_by_number_no_cache(start_file_num)) == -1) + goto err; + translog_file_init(file, start_file_num, 0); + if (insert_dynamic(&log_descriptor.open_files, (uchar*)&file)) + goto err; + log_descriptor.min_file= log_descriptor.max_file= start_file_num; + if (translog_write_file_header()) + goto err; + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + + if (ma_control_file_write_and_force(checkpoint_lsn, start_file_num, + max_trid_in_control_file, + recovery_failures)) + goto err; + /* assign buffer 0 */ + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc); + } + else if ((old_log_was_recovered || old_flags != flags || 
version_changed) && + !readonly) + { + /* leave the damaged file untouched */ + log_descriptor.horizon+= LSN_ONE_FILE; + /* header page */ + log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon, + TRANSLOG_PAGE_SIZE); + if (translog_create_new_file()) + goto err; + /* + Buffer system left untouched after recovery => we should init it + (starting from buffer 0) + */ + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc); + } + + /* all LSNs that are on disk are flushed */ + log_descriptor.log_start= log_descriptor.sent_to_disk= + log_descriptor.flushed= log_descriptor.horizon; + log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset; + log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */ + /* + Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially) + address of the next LSN and we want indicate that all LSNs that are + already on the disk are flushed so we need decrease horizon on 1 (we are + sure that there is no LSN on the disk which is greater then 'flushed' + and there will not be LSN created that is equal or less then the value + of the 'flushed'). 
+ */ + log_descriptor.flushed--; /* offset decreased */ + log_descriptor.sent_to_disk--; /* offset decreased */ + /* + Log records will refer to a MARIA_SHARE by a unique 2-byte id; set up + structures for generating 2-byte ids: + */ + my_atomic_rwlock_init(&LOCK_id_to_share); + id_to_share= (MARIA_SHARE **) my_malloc(SHARE_ID_MAX * sizeof(MARIA_SHARE*), + MYF(MY_WME | MY_ZEROFILL)); + if (unlikely(!id_to_share)) + goto err; + id_to_share--; /* min id is 1 */ + + /* Check the last LSN record integrity */ + if (logs_found) + { + TRANSLOG_SCANNER_DATA scanner; + TRANSLOG_ADDRESS page_addr; + LSN last_lsn= LSN_IMPOSSIBLE; + /* + take very last page address and try to find LSN record on it + if it fail take address of previous page and so on + */ + page_addr= (log_descriptor.horizon - + ((log_descriptor.horizon - 1) % TRANSLOG_PAGE_SIZE + 1)); + if (translog_scanner_init(page_addr, 1, &scanner, 1)) + goto err; + scanner.page_offset= page_overhead[scanner.page[TRANSLOG_PAGE_FLAGS]]; + for (;;) + { + uint chunk_1byte; + chunk_1byte= scanner.page[scanner.page_offset]; + while (!translog_is_LSN_chunk(chunk_1byte) && + scanner.page != END_OF_LOG && + scanner.page[scanner.page_offset] != TRANSLOG_FILLER && + scanner.page_addr == page_addr) + { + if (translog_get_next_chunk(&scanner)) + { + translog_destroy_scanner(&scanner); + goto err; + } + if (scanner.page != END_OF_LOG) + chunk_1byte= scanner.page[scanner.page_offset]; + } + if (translog_is_LSN_chunk(chunk_1byte)) + { + last_lsn= scanner.page_addr + scanner.page_offset; + if (translog_get_next_chunk(&scanner)) + { + translog_destroy_scanner(&scanner); + goto err; + } + if (scanner.page == END_OF_LOG) + break; /* it was the last record */ + chunk_1byte= scanner.page[scanner.page_offset]; + continue; /* try to find other record on this page */ + } + + if (last_lsn != LSN_IMPOSSIBLE) + break; /* there is no more records on the page */ + + /* We have to make step back */ + if (unlikely(LSN_OFFSET(page_addr) == 
TRANSLOG_PAGE_SIZE)) + { + uint32 file_no= LSN_FILE_NO(page_addr); + my_bool last_page_ok; + /* it is beginning of the current file */ + if (unlikely(file_no == 1)) + { + /* + It is beginning of the log => there is no LSNs in the log => + There is no harm in leaving it "as-is". + */ + log_descriptor.previous_flush_horizon= log_descriptor.horizon; + DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor. + previous_flush_horizon))); + DBUG_RETURN(0); + } + file_no--; + page_addr= MAKE_LSN(file_no, TRANSLOG_PAGE_SIZE); + translog_get_last_page_addr(&page_addr, &last_page_ok, 0); + /* page should be OK as it is not the last file */ + DBUG_ASSERT(last_page_ok); + } + else + { + page_addr-= TRANSLOG_PAGE_SIZE; + } + translog_destroy_scanner(&scanner); + if (translog_scanner_init(page_addr, 1, &scanner, 1)) + goto err; + scanner.page_offset= page_overhead[scanner.page[TRANSLOG_PAGE_FLAGS]]; + } + translog_destroy_scanner(&scanner); + + /* Now scanner points to the last LSN chunk, lets check it */ + { + TRANSLOG_HEADER_BUFFER rec; + translog_size_t rec_len; + int len; + uchar buffer[1]; + DBUG_PRINT("info", ("going to check the last found record (%lu,0x%lx)", + LSN_IN_PARTS(last_lsn))); + + len= + translog_read_record_header(last_lsn, &rec); + if (unlikely (len == RECHEADER_READ_ERROR || + len == RECHEADER_READ_EOF)) + { + DBUG_PRINT("error", ("unexpected end of log or record during " + "reading record header: (%lu,0x%lx) len: %d", + LSN_IN_PARTS(last_lsn), len)); + if (readonly) + log_descriptor.log_start= log_descriptor.horizon= last_lsn; + else if (translog_truncate_log(last_lsn)) + { + translog_free_record_header(&rec); + goto err; + } + } + else + { + DBUG_ASSERT(last_lsn == rec.lsn); + if (likely(rec.record_length != 0)) + { + /* + Reading the last byte of record will trigger scanning all + record chunks for now + */ + rec_len= translog_read_record(rec.lsn, rec.record_length - 1, 1, + buffer, NULL); + if (rec_len != 1) + { + 
DBUG_PRINT("error", ("unexpected end of log or record during " + "reading record body: (%lu,0x%lx) len: %d", + LSN_IN_PARTS(rec.lsn), + len)); + if (readonly) + log_descriptor.log_start= log_descriptor.horizon= last_lsn; + + else if (translog_truncate_log(last_lsn)) + { + translog_free_record_header(&rec); + goto err; + } + } + } + } + translog_free_record_header(&rec); + } + } + log_descriptor.previous_flush_horizon= log_descriptor.horizon; + DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.previous_flush_horizon))); + DBUG_RETURN(0); +err: + ma_message_no_user(0, "log initialization failed"); + DBUG_RETURN(1); +} + + +/* + @brief Free transaction log file buffer. + + @param buffer_no The buffer to free +*/ + +static void translog_buffer_destroy(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_destroy"); + DBUG_PRINT("enter", + ("Buffer #%u: 0x%lx file: %d offset: (%lu,0x%lx) size: %lu", + (uint) buffer->buffer_no, (ulong) buffer, + (buffer->file ? buffer->file->handler.file : -1), + LSN_IN_PARTS(buffer->offset), + (ulong) buffer->size)); + if (buffer->file != NULL) + { + /* + We ignore errors here, because we can't do something about it + (it is shutting down) + + We also have to take the locks even if there can't be any other + threads running, because translog_buffer_flush() + requires that we have the buffer locked. 
+ */ + translog_buffer_lock(buffer); + translog_buffer_flush(buffer); + translog_buffer_unlock(buffer); + } + DBUG_PRINT("info", ("Destroy mutex: 0x%lx", (ulong) &buffer->mutex)); + pthread_mutex_destroy(&buffer->mutex); + pthread_cond_destroy(&buffer->waiting_filling_buffer); + DBUG_VOID_RETURN; +} + + +/* + Free log handler resources + + SYNOPSIS + translog_destroy() +*/ + +void translog_destroy() +{ + TRANSLOG_FILE **file; + uint i; + uint8 current_buffer; + DBUG_ENTER("translog_destroy"); + + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + translog_lock(); + current_buffer= log_descriptor.bc.buffer_no; + translog_status= (translog_status == TRANSLOG_READONLY ? + TRANSLOG_UNINITED : + TRANSLOG_SHUTDOWN); + if (log_descriptor.bc.buffer->file != NULL) + translog_finish_page(&log_descriptor.horizon, &log_descriptor.bc); + translog_unlock(); + + for (i= 0; i < TRANSLOG_BUFFERS_NO; i++) + { + struct st_translog_buffer *buffer= (log_descriptor.buffers + + ((i + current_buffer + 1) % + TRANSLOG_BUFFERS_NO)); + translog_buffer_destroy(buffer); + } + translog_status= TRANSLOG_UNINITED; + + /* close files */ + while ((file= (TRANSLOG_FILE **)pop_dynamic(&log_descriptor.open_files))) + translog_close_log_file(*file); + pthread_mutex_destroy(&log_descriptor.sent_to_disk_lock); + pthread_mutex_destroy(&log_descriptor.file_header_lock); + pthread_mutex_destroy(&log_descriptor.unfinished_files_lock); + pthread_mutex_destroy(&log_descriptor.purger_lock); + pthread_mutex_destroy(&log_descriptor.log_flush_lock); + pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock); + pthread_cond_destroy(&log_descriptor.log_flush_cond); + pthread_cond_destroy(&log_descriptor.new_goal_cond); + rwlock_destroy(&log_descriptor.open_files_lock); + delete_dynamic(&log_descriptor.open_files); + delete_dynamic(&log_descriptor.unfinished_files); + + if (log_descriptor.directory_fd >= 0) + my_close(log_descriptor.directory_fd, MYF(MY_WME)); + 
my_atomic_rwlock_destroy(&LOCK_id_to_share); + if (id_to_share != NULL) + my_free((id_to_share + 1), MYF(MY_WME)); + DBUG_VOID_RETURN; +} + + +/* + @brief Starts new page. + + @param horizon \ Position in file and buffer where we are + @param cursor / + @param prev_buffer Buffer which should be flushed will be assigned here. + This is always set (to NULL if nothing to flush). + + @note We do not want to flush the buffer immediately because we want to + let caller of this function first advance 'horizon' pointer and unlock the + loghandler and only then flush the log which can take some time. + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_page_next(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor, + struct st_translog_buffer **prev_buffer) +{ + struct st_translog_buffer *buffer= cursor->buffer; + DBUG_ENTER("translog_page_next"); + + *prev_buffer= NULL; + if ((cursor->ptr + TRANSLOG_PAGE_SIZE > + cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER) || + (LSN_OFFSET(*horizon) > + log_descriptor.log_file_max_size - TRANSLOG_PAGE_SIZE)) + { + DBUG_PRINT("info", ("Switch to next buffer Buffer Size: %lu (%lu) => %d " + "File size: %lu max: %lu => %d", + (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer), + (cursor->ptr + TRANSLOG_PAGE_SIZE > + cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER), + (ulong) LSN_OFFSET(*horizon), + (ulong) log_descriptor.log_file_max_size, + (LSN_OFFSET(*horizon) > + (log_descriptor.log_file_max_size - + TRANSLOG_PAGE_SIZE)))); + if (translog_buffer_next(horizon, cursor, + LSN_OFFSET(*horizon) > + (log_descriptor.log_file_max_size - + TRANSLOG_PAGE_SIZE))) + DBUG_RETURN(1); + *prev_buffer= buffer; + DBUG_PRINT("info", ("Buffer #%u (0x%lu): have to be flushed", + (uint) buffer->buffer_no, (ulong) buffer)); + } + else + { + DBUG_PRINT("info", ("Use the same buffer #%u (0x%lu): " + "Buffer Size: %lu (%lu)", + (uint) buffer->buffer_no, + (ulong) buffer, + (ulong) cursor->buffer->size, + (ulong) 
(cursor->ptr - cursor->buffer->buffer))); + translog_finish_page(horizon, cursor); + translog_new_page_header(horizon, cursor); + } + DBUG_RETURN(0); +} + + +/* + Write data of given length to the current page + + SYNOPSIS + translog_write_data_on_page() + horizon \ Pointers on file and buffer + cursor / + length IN length of the chunk + buffer buffer with data + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_write_data_on_page(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor, + translog_size_t length, + uchar *buffer) +{ + DBUG_ENTER("translog_write_data_on_page"); + DBUG_PRINT("enter", ("Chunk length: %lu Page size %u", + (ulong) length, (uint) cursor->current_page_fill)); + DBUG_ASSERT(length > 0); + DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE); + DBUG_ASSERT(length + cursor->ptr <= cursor->buffer->buffer + + TRANSLOG_WRITE_BUFFER); + + memcpy(cursor->ptr, buffer, length); + cursor->ptr+= length; + (*horizon)+= length; /* adds offset */ + cursor->current_page_fill+= length; + if (!cursor->chaser) + cursor->buffer->size+= length; + DBUG_PRINT("info", ("Write data buffer #%u: 0x%lx " + "chaser: %d Size: %lu (%lu)", + (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer, + cursor->chaser, (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + translog_check_cursor(cursor); + + DBUG_RETURN(0); +} + + +/* + Write data from parts of given length to the current page + + SYNOPSIS + translog_write_parts_on_page() + horizon \ Pointers on file and buffer + cursor / + length IN length of the chunk + parts IN/OUT chunk source + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_write_parts_on_page(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor, + translog_size_t length, + struct st_translog_parts *parts) +{ + translog_size_t left= length; + uint cur= (uint) parts->current; + DBUG_ENTER("translog_write_parts_on_page"); + DBUG_PRINT("enter", ("Chunk length: %lu parts: %u of %u. 
Page size: %u " + "Buffer size: %lu (%lu)", + (ulong) length, + (uint) (cur + 1), (uint) parts->elements, + (uint) cursor->current_page_fill, + (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + DBUG_ASSERT(length > 0); + DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE); + DBUG_ASSERT(length + cursor->ptr <= cursor->buffer->buffer + + TRANSLOG_WRITE_BUFFER); + + do + { + translog_size_t len; + LEX_CUSTRING *part; + const uchar *buff; + + DBUG_ASSERT(cur < parts->elements); + part= parts->parts + cur; + buff= part->str; + DBUG_PRINT("info", ("Part: %u Length: %lu left: %lu buff: 0x%lx", + (uint) (cur + 1), (ulong) part->length, (ulong) left, + (ulong) buff)); + + if (part->length > left) + { + /* we should write less than the current part */ + len= left; + part->length-= len; + part->str+= len; + DBUG_PRINT("info", ("Set new part: %u Length: %lu", + (uint) (cur + 1), (ulong) part->length)); + } + else + { + len= (translog_size_t) part->length; + cur++; + DBUG_PRINT("info", ("moved to next part (len: %lu)", (ulong) len)); + } + DBUG_PRINT("info", ("copy: 0x%lx <- 0x%lx %u", + (ulong) cursor->ptr, (ulong)buff, (uint)len)); + if (likely(len)) + { + memcpy(cursor->ptr, buff, len); + left-= len; + cursor->ptr+= len; + } + } while (left); + + DBUG_PRINT("info", ("Horizon: (%lu,0x%lx) Length %lu(0x%lx)", + LSN_IN_PARTS(*horizon), + (ulong) length, (ulong) length)); + parts->current= cur; + (*horizon)+= length; /* offset increasing */ + cursor->current_page_fill+= length; + if (!cursor->chaser) + cursor->buffer->size+= length; + /* + We do not update parts->total_record_length here because it is + needed only before writing the record, to know its total length + */ + DBUG_PRINT("info", ("Write parts buffer #%u: 0x%lx " + "chaser: %d Size: %lu (%lu) " + "Horizon: (%lu,0x%lx) buff offset: 0x%lx", + (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer, + cursor->chaser, (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - 
cursor->buffer->buffer), + LSN_IN_PARTS(*horizon), + (ulong) (LSN_OFFSET(cursor->buffer->offset) + + cursor->buffer->size))); + translog_check_cursor(cursor); + + DBUG_RETURN(0); +} + + +/* + Put 1 group chunk type 0 header into parts array + + SYNOPSIS + translog_write_variable_record_1group_header() + parts Descriptor of record source parts + type The log record type + short_trid Short transaction ID or 0 if it has no sense + header_length Calculated header length of chunk type 0 + chunk0_header Buffer for the chunk header writing +*/ + +static void +translog_write_variable_record_1group_header(struct st_translog_parts *parts, + enum translog_record_type type, + SHORT_TRANSACTION_ID short_trid, + uint16 header_length, + uchar *chunk0_header) +{ + LEX_CUSTRING *part; + DBUG_ASSERT(parts->current != 0); /* first part is left for header */ + part= parts->parts + (--parts->current); + parts->total_record_length+= (translog_size_t) (part->length= header_length); + part->str= chunk0_header; + /* puts chunk type */ + *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN); + int2store(chunk0_header + 1, short_trid); + /* puts record length */ + translog_write_variable_record_1group_code_len(chunk0_header + 3, + parts->record_length, + header_length); + /* puts 0 as chunk length which indicate 1 group record */ + int2store(chunk0_header + header_length - 2, 0); +} + + +/* + Increase number of writers for this buffer + + SYNOPSIS + translog_buffer_increase_writers() + buffer target buffer +*/ + +static inline void +translog_buffer_increase_writers(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_increase_writers"); + translog_buffer_lock_assert_owner(buffer); + buffer->copy_to_buffer_in_progress++; + DBUG_PRINT("info", ("copy_to_buffer_in_progress. 
Buffer #%u 0x%lx progress: %d", + (uint) buffer->buffer_no, (ulong) buffer, + buffer->copy_to_buffer_in_progress)); + DBUG_VOID_RETURN; +} + + +/* + Decrease number of writers for this buffer + + SYNOPSIS + translog_buffer_decrease_writers() + buffer target buffer +*/ + +static void translog_buffer_decrease_writers(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_decrease_writers"); + translog_buffer_lock_assert_owner(buffer); + buffer->copy_to_buffer_in_progress--; + DBUG_PRINT("info", + ("copy_to_buffer_in_progress. Buffer #%u 0x%lx progress: %d", + (uint) buffer->buffer_no, (ulong) buffer, + buffer->copy_to_buffer_in_progress)); + if (buffer->copy_to_buffer_in_progress == 0) + pthread_cond_broadcast(&buffer->waiting_filling_buffer); + DBUG_VOID_RETURN; +} + + +/** + @brief Skip to the next page for chaser (thread which advanced horizon + pointer and now filling the buffer) + + @param horizon \ Pointers on file position and buffer + @param cursor / + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_chaser_page_next(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor) +{ + struct st_translog_buffer *buffer_to_flush; + my_bool rc; + DBUG_ENTER("translog_chaser_page_next"); + DBUG_ASSERT(cursor->chaser); + rc= translog_page_next(horizon, cursor, &buffer_to_flush); + if (buffer_to_flush != NULL) + { + translog_buffer_lock(buffer_to_flush); + translog_buffer_decrease_writers(buffer_to_flush); + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + } + DBUG_RETURN(rc); +} + +/* + Put chunk 2 from new page beginning + + SYNOPSIS + translog_write_variable_record_chunk2_page() + parts Descriptor of record source parts + horizon \ Pointers on file position and buffer + cursor / + + RETURN + 0 OK + 1 Error +*/ + +static my_bool +translog_write_variable_record_chunk2_page(struct st_translog_parts *parts, + TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor) +{ + uchar 
chunk2_header[1]; + DBUG_ENTER("translog_write_variable_record_chunk2_page"); + chunk2_header[0]= TRANSLOG_CHUNK_NOHDR; + + if (translog_chaser_page_next(horizon, cursor)) + DBUG_RETURN(1); + + /* Puts chunk type */ + translog_write_data_on_page(horizon, cursor, 1, chunk2_header); + /* Puts chunk body */ + translog_write_parts_on_page(horizon, cursor, + log_descriptor.page_capacity_chunk_2, parts); + DBUG_RETURN(0); +} + + +/* + Put chunk 3 of requested length in the buffer from new page beginning + + SYNOPSIS + translog_write_variable_record_chunk3_page() + parts Descriptor of record source parts + length Length of this chunk + horizon \ Pointers on file position and buffer + cursor / + + RETURN + 0 OK + 1 Error +*/ + +static my_bool +translog_write_variable_record_chunk3_page(struct st_translog_parts *parts, + uint16 length, + TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor) +{ + LEX_CUSTRING *part; + uchar chunk3_header[1 + 2]; + DBUG_ENTER("translog_write_variable_record_chunk3_page"); + + if (translog_chaser_page_next(horizon, cursor)) + DBUG_RETURN(1); + + if (length == 0) + { + /* It was call to write page header only (no data for chunk 3) */ + DBUG_PRINT("info", ("It is a call to make page header only")); + DBUG_RETURN(0); + } + + DBUG_ASSERT(parts->current != 0); /* first part is left for header */ + part= parts->parts + (--parts->current); + parts->total_record_length+= (translog_size_t) (part->length= 1 + 2); + part->str= chunk3_header; + /* Puts chunk type */ + *chunk3_header= (uchar) (TRANSLOG_CHUNK_LNGTH); + /* Puts chunk length */ + int2store(chunk3_header + 1, length); + + translog_write_parts_on_page(horizon, cursor, length + 1 + 2, parts); + DBUG_RETURN(0); +} + +/* + Move log pointer (horizon) on given number pages starting from next page, + and given offset on the last page + + SYNOPSIS + translog_advance_pointer() + pages Number of full pages starting from the next one + last_page_data Plus this data on the last page + + RETURN + 0 
OK + 1 Error +*/ + +static my_bool translog_advance_pointer(int pages, uint16 last_page_data) +{ + translog_size_t last_page_offset= (log_descriptor.page_overhead + + last_page_data); + translog_size_t offset= (TRANSLOG_PAGE_SIZE - + log_descriptor.bc.current_page_fill + + pages * TRANSLOG_PAGE_SIZE + last_page_offset); + translog_size_t buffer_end_offset, file_end_offset, min_offset; + DBUG_ENTER("translog_advance_pointer"); + DBUG_PRINT("enter", ("Pointer: (%lu, 0x%lx) + %u + %u pages + %u + %u", + LSN_IN_PARTS(log_descriptor.horizon), + (uint) (TRANSLOG_PAGE_SIZE - + log_descriptor.bc.current_page_fill), + pages, (uint) log_descriptor.page_overhead, + (uint) last_page_data)); + translog_lock_assert_owner(); + + if (pages == -1) + { + /* + It is special case when we advance the pointer on the same page. + It can happened when we write last part of multi-group record. + */ + DBUG_ASSERT(last_page_data + log_descriptor.bc.current_page_fill <= + TRANSLOG_PAGE_SIZE); + offset= last_page_data; + last_page_offset= log_descriptor.bc.current_page_fill + last_page_data; + goto end; + } + DBUG_PRINT("info", ("last_page_offset %lu", (ulong) last_page_offset)); + DBUG_ASSERT(last_page_offset <= TRANSLOG_PAGE_SIZE); + + /* + The loop will be executed 1-3 times. Usually we advance the + pointer to fill only the current buffer (if we have more then 1/2 of + buffer free or 2 buffers (rest of current and all next). In case of + really huge record end where we write last group with "table of + content" of all groups and ignore buffer borders we can occupy + 3 buffers. 
+ */ + for (;;) + { + uint8 new_buffer_no; + struct st_translog_buffer *new_buffer; + struct st_translog_buffer *old_buffer; + buffer_end_offset= TRANSLOG_WRITE_BUFFER - log_descriptor.bc.buffer->size; + if (likely(log_descriptor.log_file_max_size >= + LSN_OFFSET(log_descriptor.horizon))) + file_end_offset= (log_descriptor.log_file_max_size - + LSN_OFFSET(log_descriptor.horizon)); + else + { + /* + We already have written more then current file limit allow, + So we will finish this page and start new file + */ + file_end_offset= (TRANSLOG_PAGE_SIZE - + log_descriptor.bc.current_page_fill); + } + DBUG_PRINT("info", ("offset: %lu buffer_end_offs: %lu, " + "file_end_offs: %lu", + (ulong) offset, (ulong) buffer_end_offset, + (ulong) file_end_offset)); + DBUG_PRINT("info", ("Buff #%u %u (0x%lx) offset 0x%lx + size 0x%lx = " + "0x%lx (0x%lx)", + (uint) log_descriptor.bc.buffer->buffer_no, + (uint) log_descriptor.bc.buffer_no, + (ulong) log_descriptor.bc.buffer, + (ulong) LSN_OFFSET(log_descriptor.bc.buffer->offset), + (ulong) log_descriptor.bc.buffer->size, + (ulong) (LSN_OFFSET(log_descriptor.bc.buffer->offset) + + log_descriptor.bc.buffer->size), + (ulong) LSN_OFFSET(log_descriptor.horizon))); + DBUG_ASSERT(LSN_OFFSET(log_descriptor.bc.buffer->offset) + + log_descriptor.bc.buffer->size == + LSN_OFFSET(log_descriptor.horizon)); + + if (offset <= buffer_end_offset && offset <= file_end_offset) + break; + old_buffer= log_descriptor.bc.buffer; + new_buffer_no= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO; + new_buffer= log_descriptor.buffers + new_buffer_no; + + translog_buffer_lock(new_buffer); +#ifndef DBUG_OFF + { + TRANSLOG_ADDRESS offset= new_buffer->offset; + TRANSLOG_FILE *file= new_buffer->file; + uint8 ver= new_buffer->ver; + translog_lock_assert_owner(); +#endif + translog_wait_for_buffer_free(new_buffer); +#ifndef DBUG_OFF + /* We keep the handler locked so nobody can start this new buffer */ + DBUG_ASSERT(offset == new_buffer->offset && 
new_buffer->file == NULL && + (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver); + } +#endif + + min_offset= min(buffer_end_offset, file_end_offset); + /* TODO: check is it ptr or size enough */ + log_descriptor.bc.buffer->size+= min_offset; + log_descriptor.bc.ptr+= min_offset; + DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx chaser: %d Size: %lu (%lu)", + (uint) log_descriptor.bc.buffer->buffer_no, + (ulong) log_descriptor.bc.buffer, + log_descriptor.bc.chaser, + (ulong) log_descriptor.bc.buffer->size, + (ulong) (log_descriptor.bc.ptr -log_descriptor.bc. + buffer->buffer))); + DBUG_ASSERT((ulong) (log_descriptor.bc.ptr - + log_descriptor.bc.buffer->buffer) == + log_descriptor.bc.buffer->size); + DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no == + log_descriptor.bc.buffer_no); + translog_buffer_increase_writers(log_descriptor.bc.buffer); + + if (file_end_offset <= buffer_end_offset) + { + log_descriptor.horizon+= LSN_ONE_FILE; + log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon, + TRANSLOG_PAGE_SIZE); + DBUG_PRINT("info", ("New file: %lu", + (ulong) LSN_FILE_NO(log_descriptor.horizon))); + if (translog_create_new_file()) + { + DBUG_RETURN(1); + } + } + else + { + DBUG_PRINT("info", ("The same file")); + log_descriptor.horizon+= min_offset; /* offset increasing */ + } + translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no); + old_buffer->next_buffer_offset= new_buffer->offset; + new_buffer->prev_buffer_offset= old_buffer->offset; + translog_buffer_unlock(old_buffer); + offset-= min_offset; + } + DBUG_PRINT("info", ("drop write_counter")); + log_descriptor.bc.write_counter= 0; + log_descriptor.bc.previous_offset= 0; +end: + log_descriptor.bc.ptr+= offset; + log_descriptor.bc.buffer->size+= offset; + translog_buffer_increase_writers(log_descriptor.bc.buffer); + log_descriptor.horizon+= offset; /* offset increasing */ + log_descriptor.bc.current_page_fill= last_page_offset; + DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx chaser: %d 
Size: %lu (%lu) " + "offset: %u last page: %u", + (uint) log_descriptor.bc.buffer->buffer_no, + (ulong) log_descriptor.bc.buffer, + log_descriptor.bc.chaser, + (ulong) log_descriptor.bc.buffer->size, + (ulong) (log_descriptor.bc.ptr - + log_descriptor.bc.buffer-> + buffer), (uint) offset, + (uint) last_page_offset)); + DBUG_PRINT("info", + ("pointer moved to: (%lu, 0x%lx)", + LSN_IN_PARTS(log_descriptor.horizon))); + translog_check_cursor(&log_descriptor.bc); + log_descriptor.bc.protected= 0; + DBUG_RETURN(0); +} + + +/* + Get page rest + + SYNOPSIS + translog_get_current_page_rest() + + NOTE loghandler should be locked + + RETURN + number of bytes left on the current page +*/ + +static uint translog_get_current_page_rest() +{ + return (TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill); +} + + +/* + Get buffer rest in full pages + + SYNOPSIS + translog_get_current_buffer_rest() + + NOTE loghandler should be locked + + RETURN + number of full pages left on the current buffer +*/ + +static uint translog_get_current_buffer_rest() +{ + return ((log_descriptor.bc.buffer->buffer + TRANSLOG_WRITE_BUFFER - + log_descriptor.bc.ptr) / + TRANSLOG_PAGE_SIZE); +} + +/* + Calculate possible group size without first (current) page + + SYNOPSIS + translog_get_current_group_size() + + NOTE loghandler should be locked + + RETURN + group size without first (current) page +*/ + +static translog_size_t translog_get_current_group_size() +{ + /* buffer rest in full pages */ + translog_size_t buffer_rest= translog_get_current_buffer_rest(); + DBUG_ENTER("translog_get_current_group_size"); + DBUG_PRINT("info", ("buffer_rest in pages: %u", buffer_rest)); + + buffer_rest*= log_descriptor.page_capacity_chunk_2; + /* in case of only half of buffer free we can write this and next buffer */ + if (buffer_rest < log_descriptor.half_buffer_capacity_chunk_2) + { + DBUG_PRINT("info", ("buffer_rest: %lu -> add %lu", + (ulong) buffer_rest, + (ulong) log_descriptor.buffer_capacity_chunk_2)); + 
buffer_rest+= log_descriptor.buffer_capacity_chunk_2; + } + + DBUG_PRINT("info", ("buffer_rest: %lu", (ulong) buffer_rest)); + + DBUG_RETURN(buffer_rest); +} + + +static inline void set_lsn(LSN *lsn, LSN value) +{ + DBUG_ENTER("set_lsn"); + translog_lock_assert_owner(); + *lsn= value; + /* we generate LSN so something is not flushed in log */ + log_descriptor.is_everything_flushed= 0; + DBUG_PRINT("info", ("new LSN appeared: (%lu,0x%lx)", LSN_IN_PARTS(value))); + DBUG_VOID_RETURN; +} + + +/** + @brief Write variable record in 1 group. + + @param lsn LSN of the record will be written here + @param type the log record type + @param short_trid Short transaction ID or 0 if it has no sense + @param parts Descriptor of record source parts + @param buffer_to_flush Buffer which have to be flushed if it is not 0 + @param header_length Calculated header length of chunk type 0 + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param hook_arg Argument which will be passed to pre-write and + in-write hooks of this record. 
+ + @note + We must have a translog_lock() when entering this function + We must have buffer_to_flush locked (if not null) + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool +translog_write_variable_record_1group(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + struct st_translog_buffer + *buffer_to_flush, uint16 header_length, + TRN *trn, void *hook_arg) +{ + TRANSLOG_ADDRESS horizon; + struct st_buffer_cursor cursor; + int rc= 0; + uint i; + translog_size_t record_rest, full_pages, first_page; + uint additional_chunk3_page= 0; + uchar chunk0_header[1 + 2 + 5 + 2]; + DBUG_ENTER("translog_write_variable_record_1group"); + translog_lock_assert_owner(); + if (buffer_to_flush) + translog_buffer_lock_assert_owner(buffer_to_flush); + + set_lsn(lsn, horizon= log_descriptor.horizon); + if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn), + *lsn, TRUE) || + (log_record_type_descriptor[type].inwrite_hook && + (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info, + lsn, hook_arg))) + { + translog_unlock(); + DBUG_RETURN(1); + } + cursor= log_descriptor.bc; + cursor.chaser= 1; + + /* Advance pointer to be able unlock the loghandler */ + first_page= translog_get_current_page_rest(); + record_rest= parts->record_length - (first_page - header_length); + full_pages= record_rest / log_descriptor.page_capacity_chunk_2; + record_rest= (record_rest % log_descriptor.page_capacity_chunk_2); + + if (record_rest + 1 == log_descriptor.page_capacity_chunk_2) + { + DBUG_PRINT("info", ("2 chunks type 3 is needed")); + /* We will write 2 chunks type 3 at the end of this group */ + additional_chunk3_page= 1; + record_rest= 1; + } + + DBUG_PRINT("info", ("first_page: %u (%u) full_pages: %u (%lu) " + "additional: %u (%u) rest %u = %u", + first_page, first_page - header_length, + full_pages, + (ulong) full_pages * + 
log_descriptor.page_capacity_chunk_2, + additional_chunk3_page, + additional_chunk3_page * + (log_descriptor.page_capacity_chunk_2 - 1), + record_rest, parts->record_length)); + /* record_rest + 3 is chunk type 3 overhead + record_rest */ + rc|= translog_advance_pointer((int)(full_pages + additional_chunk3_page), + (record_rest ? record_rest + 3 : 0)); + log_descriptor.bc.buffer->last_lsn= *lsn; + DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx) buffer: 0x%lx", + LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn), + (ulong) log_descriptor.bc.buffer)); + + translog_unlock(); + + /* + Check if we switched buffer and need process it (current buffer is + unlocked already => we will not delay other threads + */ + if (buffer_to_flush != NULL) + { + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + } + if (rc) + DBUG_RETURN(1); + + translog_write_variable_record_1group_header(parts, type, short_trid, + header_length, chunk0_header); + + /* fill the pages */ + translog_write_parts_on_page(&horizon, &cursor, first_page, parts); + + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon))); + + for (i= 0; i < full_pages; i++) + { + if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor)) + DBUG_RETURN(1); + + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon))); + } + + if (additional_chunk3_page) + { + if (translog_write_variable_record_chunk3_page(parts, + log_descriptor. 
+ page_capacity_chunk_2 - 2, + &horizon, &cursor)) + DBUG_RETURN(1); + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon))); + DBUG_ASSERT(cursor.current_page_fill == TRANSLOG_PAGE_SIZE); + } + + if (translog_write_variable_record_chunk3_page(parts, + record_rest, + &horizon, &cursor)) + DBUG_RETURN(1); + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)", + (ulong) LSN_FILE_NO(log_descriptor.horizon), + (ulong) LSN_OFFSET(log_descriptor.horizon), + (ulong) LSN_FILE_NO(horizon), + (ulong) LSN_OFFSET(horizon))); + + translog_buffer_lock(cursor.buffer); + translog_buffer_decrease_writers(cursor.buffer); + translog_buffer_unlock(cursor.buffer); + DBUG_RETURN(rc); +} + + +/** + @brief Write variable record in 1 chunk. + + @param lsn LSN of the record will be written here + @param type the log record type + @param short_trid Short transaction ID or 0 if it has no sense + @param parts Descriptor of record source parts + @param buffer_to_flush Buffer which have to be flushed if it is not 0 + @param header_length Calculated header length of chunk type 0 + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param hook_arg Argument which will be passed to pre-write and + in-write hooks of this record. 
+ + @note + We must have a translog_lock() when entering this function + We must have buffer_to_flush locked (if not null) + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool +translog_write_variable_record_1chunk(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + struct st_translog_buffer + *buffer_to_flush, uint16 header_length, + TRN *trn, void *hook_arg) +{ + int rc; + uchar chunk0_header[1 + 2 + 5 + 2]; + DBUG_ENTER("translog_write_variable_record_1chunk"); + translog_lock_assert_owner(); + if (buffer_to_flush) + translog_buffer_lock_assert_owner(buffer_to_flush); + + translog_write_variable_record_1group_header(parts, type, short_trid, + header_length, chunk0_header); + set_lsn(lsn, log_descriptor.horizon); + if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn), + *lsn, TRUE) || + (log_record_type_descriptor[type].inwrite_hook && + (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info, + lsn, hook_arg))) + { + translog_unlock(); + DBUG_RETURN(1); + } + + rc= translog_write_parts_on_page(&log_descriptor.horizon, + &log_descriptor.bc, + parts->total_record_length, parts); + log_descriptor.bc.buffer->last_lsn= *lsn; + DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx) buffer: 0x%lx", + LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn), + (ulong) log_descriptor.bc.buffer)); + translog_unlock(); + + /* + check if we switched buffer and need process it (current buffer is + unlocked already => we will not delay other threads + */ + if (buffer_to_flush != NULL) + { + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + } + + DBUG_RETURN(rc); +} + + +/* + @brief Calculates and write LSN difference (compressed LSN). + + @param base_lsn LSN from which we calculate difference + @param lsn LSN for codding + @param dst Result will be written to dst[-pack_length] .. 
dst[-1] + + @note To store an LSN in a compact way we will use the following compression: + If a log record has LSN1, and it contains the LSN2 as a back reference, + Instead of LSN2 we write LSN1-LSN2, encoded as: + two bits the number N (see below) + 14 bits + N bytes + That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2 + is stored in the first two bits. + + @note function made to write the result in backward direction with no + special sense or tricks both directions are equal in complexity + + @retval # pointer on coded LSN +*/ + +static uchar *translog_put_LSN_diff(LSN base_lsn, LSN lsn, uchar *dst) +{ + uint64 diff; + DBUG_ENTER("translog_put_LSN_diff"); + DBUG_PRINT("enter", ("Base: (%lu,0x%lx) val: (%lu,0x%lx) dst: 0x%lx", + LSN_IN_PARTS(base_lsn), LSN_IN_PARTS(lsn), + (ulong) dst)); + DBUG_ASSERT(base_lsn > lsn); + diff= base_lsn - lsn; + DBUG_PRINT("info", ("Diff: 0x%llx", (ulonglong) diff)); + if (diff <= 0x3FFF) + { + dst-= 2; + /* + Note we store this high uchar first to ensure that first uchar has + 0 in the 2 upper bits. 
+ */ + dst[0]= (uchar)(diff >> 8); + dst[1]= (uchar)(diff & 0xFF); + } + else if (diff <= 0x3FFFFFL) + { + dst-= 3; + dst[0]= (uchar)(0x40 | (diff >> 16)); + int2store(dst + 1, diff & 0xFFFF); + } + else if (diff <= 0x3FFFFFFFL) + { + dst-= 4; + dst[0]= (uchar)(0x80 | (diff >> 24)); + int3store(dst + 1, diff & 0xFFFFFFL); + } + else if (diff <= LL(0x3FFFFFFFFF)) + + { + dst-= 5; + dst[0]= (uchar)(0xC0 | (diff >> 32)); + int4store(dst + 1, diff & 0xFFFFFFFFL); + } + else + { + /* + It is full LSN after special 1 diff (which is impossible + in real life) + */ + dst-= 2 + LSN_STORE_SIZE; + dst[0]= 0; + dst[1]= 1; + lsn_store(dst + 2, lsn); + } + DBUG_PRINT("info", ("new dst: 0x%lx", (ulong) dst)); + DBUG_RETURN(dst); +} + + +/* + Get LSN from LSN-difference (compressed LSN) + + SYNOPSIS + translog_get_LSN_from_diff() + base_lsn LSN from which we calculate difference + src pointer to coded lsn + dst pointer to buffer where to write 7byte LSN + + NOTE: + To store an LSN in a compact way we will use the following compression: + + If a log record has LSN1, and it contains the lSN2 as a back reference, + Instead of LSN2 we write LSN1-LSN2, encoded as: + + two bits the number N (see below) + 14 bits + N bytes + + That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2 + is stored in the first two bits. 
+ + RETURN + pointer to buffer after decoded LSN +*/ + +static uchar *translog_get_LSN_from_diff(LSN base_lsn, uchar *src, uchar *dst) +{ + LSN lsn; + uint32 diff; + uint32 first_byte; + uint32 file_no, rec_offset; + uint8 code; + DBUG_ENTER("translog_get_LSN_from_diff"); + DBUG_PRINT("enter", ("Base: (%lu,0x%lx) src: 0x%lx dst 0x%lx", + LSN_IN_PARTS(base_lsn), (ulong) src, (ulong) dst)); + first_byte= *((uint8*) src); + code= first_byte >> 6; /* Length is in 2 most significant bits */ + first_byte&= 0x3F; + src++; /* Skip length + encode */ + file_no= LSN_FILE_NO(base_lsn); /* Assume relative */ + DBUG_PRINT("info", ("code: %u first byte: %lu", + (uint) code, (ulong) first_byte)); + switch (code) { + case 0: + if (first_byte == 0 && *((uint8*)src) == 1) + { + /* + It is full LSN after special 1 diff (which is impossible + in real life) + */ + memcpy(dst, src + 1, LSN_STORE_SIZE); + DBUG_PRINT("info", ("Special case of full LSN, new src: 0x%lx", + (ulong) (src + 1 + LSN_STORE_SIZE))); + DBUG_RETURN(src + 1 + LSN_STORE_SIZE); + } + rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 8) + *((uint8*)src)); + break; + case 1: + diff= uint2korr(src); + rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 16) + diff); + break; + case 2: + diff= uint3korr(src); + rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 24) + diff); + break; + case 3: + { + ulonglong base_offset= LSN_OFFSET(base_lsn); + diff= uint4korr(src); + if (diff > LSN_OFFSET(base_lsn)) + { + /* take 1 from file offset */ + first_byte++; + base_offset+= LL(0x100000000); + } + file_no= LSN_FILE_NO(base_lsn) - first_byte; + DBUG_ASSERT(base_offset - diff <= UINT_MAX); + rec_offset= (uint32)(base_offset - diff); + break; + } + default: + DBUG_ASSERT(0); + DBUG_RETURN(NULL); + } + lsn= MAKE_LSN(file_no, rec_offset); + src+= code + 1; + lsn_store(dst, lsn); + DBUG_PRINT("info", ("new src: 0x%lx", (ulong) src)); + DBUG_RETURN(src); +} + + +/** + @brief Encodes relative LSNs listed in the parameters. 
+ + @param parts Parts list with encoded LSN(s) + @param base_lsn LSN which is base for encoding + @param lsns number of LSN(s) to encode + @param compressed_LSNs buffer which can be used for storing compressed LSN(s) +*/ + +static void translog_relative_LSN_encode(struct st_translog_parts *parts, + LSN base_lsn, + uint lsns, uchar *compressed_LSNs) +{ + LEX_CUSTRING *part; + uint lsns_len= lsns * LSN_STORE_SIZE; + uchar buffer_src[MAX_NUMBER_OF_LSNS_PER_RECORD * LSN_STORE_SIZE]; + uchar *buffer= buffer_src; + const uchar *cbuffer; + + DBUG_ENTER("translog_relative_LSN_encode"); + + DBUG_ASSERT(parts->current != 0); + part= parts->parts + parts->current; + + /* collect all LSN(s) in one chunk if it (they) is (are) divided */ + if (part->length < lsns_len) + { + uint copied= part->length; + LEX_CUSTRING *next_part; + DBUG_PRINT("info", ("Using buffer: 0x%lx", (ulong) compressed_LSNs)); + memcpy(buffer, part->str, part->length); + next_part= parts->parts + parts->current + 1; + do + { + DBUG_ASSERT(next_part < parts->parts + parts->elements); + if ((next_part->length + copied) < lsns_len) + { + memcpy(buffer + copied, next_part->str, + next_part->length); + copied+= next_part->length; + next_part->length= 0; next_part->str= 0; + /* delete_dynamic_element(&parts->parts, parts->current + 1); */ + next_part++; + parts->current++; + part= parts->parts + parts->current; + } + else + { + uint len= lsns_len - copied; + memcpy(buffer + copied, next_part->str, len); + copied= lsns_len; + next_part->str+= len; + next_part->length-= len; + } + } while (copied < lsns_len); + cbuffer= buffer; + } + else + { + cbuffer= part->str; + part->str+= lsns_len; + part->length-= lsns_len; + parts->current--; + part= parts->parts + parts->current; + } + + { + /* Compress */ + LSN ref; + int economy; + const uchar *src_ptr; + uchar *dst_ptr= compressed_LSNs + (MAX_NUMBER_OF_LSNS_PER_RECORD * + COMPRESSED_LSN_MAX_STORE_SIZE); + /* + We write the result in backward direction with no special 
sense or + tricks both directions are equal in complicity + */ + for (src_ptr= cbuffer + lsns_len - LSN_STORE_SIZE; + src_ptr >= (const uchar*)cbuffer; + src_ptr-= LSN_STORE_SIZE) + { + ref= lsn_korr(src_ptr); + dst_ptr= translog_put_LSN_diff(base_lsn, ref, dst_ptr); + } + part->length= (uint)((compressed_LSNs + + (MAX_NUMBER_OF_LSNS_PER_RECORD * + COMPRESSED_LSN_MAX_STORE_SIZE)) - + dst_ptr); + parts->record_length-= (economy= lsns_len - part->length); + DBUG_PRINT("info", ("new length of LSNs: %lu economy: %d", + (ulong)part->length, economy)); + parts->total_record_length-= economy; + part->str= dst_ptr; + } + DBUG_VOID_RETURN; +} + + +/** + @brief Write multi-group variable-size record. + + @param lsn LSN of the record will be written here + @param type the log record type + @param short_trid Short transaction ID or 0 if it has no sense + @param parts Descriptor of record source parts + @param buffer_to_flush Buffer which have to be flushed if it is not 0 + @param header_length Header length calculated for 1 group + @param buffer_rest Beginning from which we plan to write in full pages + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param hook_arg Argument which will be passed to pre-write and + in-write hooks of this record. + + @note + We must have a translog_lock() when entering this function + + We must have buffer_to_flush locked (if not null) + buffer_to_flush should *NOT* be locked when calling this function. 
+ (This is note is here as this is different from most other + translog_write...() functions which require the buffer to be locked) + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool +translog_write_variable_record_mgroup(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + struct st_translog_buffer + *buffer_to_flush, + uint16 header_length, + translog_size_t buffer_rest, + TRN *trn, void *hook_arg) +{ + TRANSLOG_ADDRESS horizon; + struct st_buffer_cursor cursor; + int rc= 0; + uint i, chunk2_page, full_pages; + uint curr_group= 0; + translog_size_t record_rest, first_page, chunk3_pages, chunk0_pages= 1; + translog_size_t done= 0; + struct st_translog_group_descriptor group; + DYNAMIC_ARRAY groups; + uint16 chunk3_size; + uint16 page_capacity= log_descriptor.page_capacity_chunk_2 + 1; + uint16 last_page_capacity; + my_bool new_page_before_chunk0= 1, first_chunk0= 1; + uchar chunk0_header[1 + 2 + 5 + 2 + 2], group_desc[7 + 1]; + uchar chunk2_header[1]; + uint header_fixed_part= header_length + 2; + uint groups_per_page= (page_capacity - header_fixed_part) / (7 + 1); + uint file_of_the_first_group; + int pages_to_skip; + struct st_translog_buffer *buffer_of_last_lsn; + DBUG_ENTER("translog_write_variable_record_mgroup"); + translog_lock_assert_owner(); + + chunk2_header[0]= TRANSLOG_CHUNK_NOHDR; + + if (my_init_dynamic_array(&groups, + sizeof(struct st_translog_group_descriptor), + 10, 10)) + { + translog_unlock(); + DBUG_PRINT("error", ("init array failed")); + DBUG_RETURN(1); + } + + first_page= translog_get_current_page_rest(); + record_rest= parts->record_length - (first_page - 1); + DBUG_PRINT("info", ("Record Rest: %lu", (ulong) record_rest)); + + if (record_rest < buffer_rest) + { + /* + The record (group 1 type) is larger than the free space on the page + - we need to split it in two. 
But when we split it in two, the first + part is big enough to hold all the data of the record (because the + header of the first part of the split is smaller than the header of + the record as a whole when it takes only one chunk) + */ + DBUG_PRINT("info", ("too many free space because changing header")); + buffer_rest-= log_descriptor.page_capacity_chunk_2; + DBUG_ASSERT(record_rest >= buffer_rest); + } + + file_of_the_first_group= LSN_FILE_NO(log_descriptor.horizon); + translog_mark_file_unfinished(file_of_the_first_group); + do + { + group.addr= horizon= log_descriptor.horizon; + cursor= log_descriptor.bc; + cursor.chaser= 1; + if ((full_pages= buffer_rest / log_descriptor.page_capacity_chunk_2) > 255) + { + /* sizeof(uint8) == 256 is max number of chunk in multi-chunks group */ + full_pages= 255; + buffer_rest= full_pages * log_descriptor.page_capacity_chunk_2; + } + /* + group chunks = + full pages + first page (which actually can be full, too). + But here we assign number of chunks - 1 + */ + group.num= full_pages; + if (insert_dynamic(&groups, (uchar*) &group)) + { + DBUG_PRINT("error", ("insert into array failed")); + goto err_unlock; + } + + DBUG_PRINT("info", ("chunk: #%u first_page: %u (%u) " + "full_pages: %lu (%lu) " + "Left %lu", + groups.elements, + first_page, first_page - 1, + (ulong) full_pages, + (ulong) (full_pages * + log_descriptor.page_capacity_chunk_2), + (ulong)(parts->record_length - (first_page - 1 + + buffer_rest) - + done))); + rc|= translog_advance_pointer((int)full_pages, 0); + + translog_unlock(); + + if (buffer_to_flush != NULL) + { + translog_buffer_decrease_writers(buffer_to_flush); + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + buffer_to_flush= NULL; + } + if (rc) + { + DBUG_PRINT("error", ("flush of unlock buffer failed")); + goto err; + } + + translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header); + translog_write_parts_on_page(&horizon, &cursor, first_page - 1, 
parts); + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) " + "Left %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) - + done))); + + for (i= 0; i < full_pages; i++) + { + if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor)) + goto err; + + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) " + "local: (%lu,0x%lx) " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) - + i * log_descriptor.page_capacity_chunk_2 - + done))); + } + + done+= (first_page - 1 + buffer_rest); + + if (translog_chaser_page_next(&horizon, &cursor)) + { + DBUG_PRINT("error", ("flush of unlock buffer failed")); + goto err; + } + translog_buffer_lock(cursor.buffer); + translog_buffer_decrease_writers(cursor.buffer); + translog_buffer_unlock(cursor.buffer); + + translog_lock(); + + /* Check that we have place for chunk type 2 */ + first_page= translog_get_current_page_rest(); + if (first_page <= 1) + { + if (translog_page_next(&log_descriptor.horizon, &log_descriptor.bc, + &buffer_to_flush)) + goto err_unlock; + first_page= translog_get_current_page_rest(); + } + buffer_rest= translog_get_current_group_size(); + } while ((translog_size_t)(first_page + buffer_rest) < + (translog_size_t)(parts->record_length - done)); + + group.addr= horizon= log_descriptor.horizon; + cursor= log_descriptor.bc; + cursor.chaser= 1; + group.num= 0; /* 0 because it does not matter */ + if (insert_dynamic(&groups, (uchar*) &group)) + { + DBUG_PRINT("error", ("insert into array failed")); + goto err_unlock; + } + record_rest= parts->record_length - done; + DBUG_PRINT("info", ("Record rest: %lu", (ulong) record_rest)); + if (first_page > record_rest + 1) + { + /* + We have not so much data to fill all first page + (no speaking about full pages) + so it will be: + <chunk0 <data>> + or + <chunk0>...<chunk0><chunk0 <data>> + or + 
<chunk3 <data>><chunk0>...<chunk0><chunk0 <possible data of 1 byte>> + */ + chunk2_page= full_pages= 0; + last_page_capacity= first_page; + pages_to_skip= -1; + } + else + { + /* + We will have: + <chunk2 <data>>...<chunk2 <data>><chunk0 <data>> + or + <chunk2 <data>>...<chunk2 <data>><chunk0>...<chunk0><chunk0 <data>> + or + <chunk3 <data>><chunk0>...<chunk0><chunk0 <possible data of 1 byte>> + */ + chunk2_page= 1; + record_rest-= (first_page - 1); + pages_to_skip= full_pages= + record_rest / log_descriptor.page_capacity_chunk_2; + record_rest= (record_rest % log_descriptor.page_capacity_chunk_2); + last_page_capacity= page_capacity; + } + chunk3_size= 0; + chunk3_pages= 0; + if (last_page_capacity > record_rest + 1 && record_rest != 0) + { + if (last_page_capacity > + record_rest + header_fixed_part + groups.elements * (7 + 1)) + { + /* 1 record of type 0 */ + chunk3_pages= 0; + } + else + { + pages_to_skip++; + chunk3_pages= 1; + if (record_rest + 2 == last_page_capacity) + { + chunk3_size= record_rest - 1; + record_rest= 1; + } + else + { + chunk3_size= record_rest; + record_rest= 0; + } + } + } + /* + A first non-full page will hold type 0 chunk only if it fit in it with + all its headers + */ + while (page_capacity < + record_rest + header_fixed_part + + (groups.elements - groups_per_page * (chunk0_pages - 1)) * (7 + 1)) + chunk0_pages++; + DBUG_PRINT("info", ("chunk0_pages: %u groups %u groups per full page: %u " + "Group on last page: %u", + chunk0_pages, groups.elements, + groups_per_page, + (groups.elements - + ((page_capacity - header_fixed_part) / (7 + 1)) * + (chunk0_pages - 1)))); + DBUG_PRINT("info", ("first_page: %u chunk2: %u full_pages: %u (%lu) " + "chunk3: %u (%u) rest: %u", + first_page, + chunk2_page, full_pages, + (ulong) full_pages * + log_descriptor.page_capacity_chunk_2, + chunk3_pages, (uint) chunk3_size, (uint) record_rest)); + rc= translog_advance_pointer(pages_to_skip + (int)(chunk0_pages - 1), + record_rest + header_fixed_part + + 
(groups.elements - + ((page_capacity - + header_fixed_part) / (7 + 1)) * + (chunk0_pages - 1)) * (7 + 1)); + buffer_of_last_lsn= log_descriptor.bc.buffer; + translog_unlock(); + + if (buffer_to_flush != NULL) + { + translog_buffer_decrease_writers(buffer_to_flush); + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + buffer_to_flush= NULL; + } + if (rc) + { + DBUG_PRINT("error", ("flush of unlock buffer failed")); + goto err; + } + + if (rc) + goto err; + + if (chunk2_page) + { + DBUG_PRINT("info", ("chunk 2 to finish first page")); + translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header); + translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts); + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) - + done))); + } + else if (chunk3_pages) + { + uchar chunk3_header[3]; + DBUG_PRINT("info", ("chunk 3")); + DBUG_ASSERT(full_pages == 0); + chunk3_pages= 0; + chunk3_header[0]= TRANSLOG_CHUNK_LNGTH; + int2store(chunk3_header + 1, chunk3_size); + translog_write_data_on_page(&horizon, &cursor, 3, chunk3_header); + translog_write_parts_on_page(&horizon, &cursor, chunk3_size, parts); + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - chunk3_size - done))); + } + else + { + DBUG_PRINT("info", ("no new_page_before_chunk0")); + new_page_before_chunk0= 0; + } + + for (i= 0; i < full_pages; i++) + { + DBUG_ASSERT(chunk2_page != 0); + if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor)) + goto err; + + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) 
- + i * log_descriptor.page_capacity_chunk_2 - + done))); + } + + if (chunk3_pages && + translog_write_variable_record_chunk3_page(parts, + chunk3_size, + &horizon, &cursor)) + goto err; + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon))); + + *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN); + int2store(chunk0_header + 1, short_trid); + translog_write_variable_record_1group_code_len(chunk0_header + 3, + parts->record_length, + header_length); + do + { + int limit; + if (new_page_before_chunk0 && + translog_chaser_page_next(&horizon, &cursor)) + { + DBUG_PRINT("error", ("flush of unlock buffer failed")); + goto err; + } + new_page_before_chunk0= 1; + + if (first_chunk0) + { + first_chunk0= 0; + + /* + We can drop "log_descriptor.is_everything_flushed" earlier when have + lock on loghandler and assign initial value of "horizon" variable or + before unlocking loghandler (because we will increase writers + counter on the buffer and every thread which wanted flush the buffer + will wait till we finish with it). But IMHO better here take short + lock and do not bother other threads with waiting. + */ + translog_lock(); + set_lsn(lsn, horizon); + buffer_of_last_lsn->last_lsn= *lsn; + DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx) buffer: 0x%lx", + LSN_IN_PARTS(buffer_of_last_lsn->last_lsn), + (ulong) buffer_of_last_lsn)); + if (log_record_type_descriptor[type].inwrite_hook && + (*log_record_type_descriptor[type].inwrite_hook) (type, trn, + tbl_info, + lsn, hook_arg)) + goto err_unlock; + translog_unlock(); + } + + /* + A first non-full page will hold type 0 chunk only if it fit in it with + all its headers => the fist page is full or number of groups less then + possible number of full page. + */ + limit= (groups_per_page < groups.elements - curr_group ? 
+ groups_per_page : groups.elements - curr_group); + DBUG_PRINT("info", ("Groups: %u curr: %u limit: %u", + (uint) groups.elements, (uint) curr_group, + (uint) limit)); + + if (chunk0_pages == 1) + { + DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) + %u = %u", + (uint) limit, (uint) record_rest, + (uint) (2 + limit * (7 + 1) + record_rest))); + int2store(chunk0_header + header_length - 2, + 2 + limit * (7 + 1) + record_rest); + } + else + { + DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) = %u", + (uint) limit, (uint) (2 + limit * (7 + 1)))); + int2store(chunk0_header + header_length - 2, 2 + limit * (7 + 1)); + } + int2store(chunk0_header + header_length, groups.elements - curr_group); + translog_write_data_on_page(&horizon, &cursor, header_fixed_part, + chunk0_header); + for (i= curr_group; i < limit + curr_group; i++) + { + struct st_translog_group_descriptor *grp_ptr; + grp_ptr= dynamic_element(&groups, i, + struct st_translog_group_descriptor *); + lsn_store(group_desc, grp_ptr->addr); + group_desc[7]= grp_ptr->num; + translog_write_data_on_page(&horizon, &cursor, (7 + 1), group_desc); + } + + if (chunk0_pages == 1 && record_rest != 0) + translog_write_parts_on_page(&horizon, &cursor, record_rest, parts); + + chunk0_pages--; + curr_group+= limit; + /* put special type to indicate that it is not LSN chunk */ + *chunk0_header= (uchar) (TRANSLOG_CHUNK_LSN | TRANSLOG_CHUNK_0_CONT); + } while (chunk0_pages != 0); + translog_buffer_lock(cursor.buffer); + translog_buffer_decrease_writers(cursor.buffer); + translog_buffer_unlock(cursor.buffer); + rc= 0; + + if (translog_set_lsn_for_files(file_of_the_first_group, LSN_FILE_NO(*lsn), + *lsn, FALSE)) + goto err; + + translog_mark_file_finished(file_of_the_first_group); + + delete_dynamic(&groups); + DBUG_RETURN(rc); + +err_unlock: + + translog_unlock(); + +err: + if (buffer_to_flush != NULL) + { + /* This is to prevent locking buffer forever in case of error */ + translog_buffer_decrease_writers(buffer_to_flush); + if 
(!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + buffer_to_flush= NULL; + } + + + translog_mark_file_finished(file_of_the_first_group); + + delete_dynamic(&groups); + DBUG_RETURN(1); +} + + +/** + @brief Write the variable length log record. + + @param lsn LSN of the record will be written here + @param type the log record type + @param short_trid Short transaction ID or 0 if it has no sense + @param parts Descriptor of record source parts + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param hook_arg Argument which will be passed to pre-write and + in-write hooks of this record. + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_write_variable_record(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + TRN *trn, void *hook_arg) +{ + struct st_translog_buffer *buffer_to_flush= NULL; + uint header_length1= 1 + 2 + 2 + + translog_variable_record_length_bytes(parts->record_length); + ulong buffer_rest; + uint page_rest; + /* Max number of such LSNs per record is 2 */ + uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD * + COMPRESSED_LSN_MAX_STORE_SIZE]; + my_bool res; + DBUG_ENTER("translog_write_variable_record"); + + translog_lock(); + DBUG_PRINT("info", ("horizon: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.horizon))); + page_rest= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill; + DBUG_PRINT("info", ("header length: %u page_rest: %u", + header_length1, page_rest)); + + /* + header and part which we should read have to fit in one chunk + TODO: allow to divide readable header + */ + if (page_rest < + (header_length1 + log_record_type_descriptor[type].read_header_len)) + { + DBUG_PRINT("info", + ("Next page, size: %u header: %u + %u", + log_descriptor.bc.current_page_fill, + header_length1, + 
log_record_type_descriptor[type].read_header_len)); + translog_page_next(&log_descriptor.horizon, &log_descriptor.bc, + &buffer_to_flush); + /* Chunk 2 header is 1 byte, so full page capacity will be one uchar more */ + page_rest= log_descriptor.page_capacity_chunk_2 + 1; + DBUG_PRINT("info", ("page_rest: %u", page_rest)); + } + + /* + To minimize compressed size we will compress always relative to + very first chunk address (log_descriptor.horizon for now) + */ + if (log_record_type_descriptor[type].compressed_LSN > 0) + { + translog_relative_LSN_encode(parts, log_descriptor.horizon, + log_record_type_descriptor[type]. + compressed_LSN, compressed_LSNs); + /* recalculate header length after compression */ + header_length1= 1 + 2 + 2 + + translog_variable_record_length_bytes(parts->record_length); + DBUG_PRINT("info", ("after compressing LSN(s) header length: %u " + "record length: %lu", + header_length1, (ulong)parts->record_length)); + } + + /* TODO: check space on current page for header + few bytes */ + if (page_rest >= parts->record_length + header_length1) + { + /* following function makes translog_unlock(); */ + res= translog_write_variable_record_1chunk(lsn, type, tbl_info, + short_trid, + parts, buffer_to_flush, + header_length1, trn, hook_arg); + DBUG_RETURN(res); + } + + buffer_rest= translog_get_current_group_size(); + + if (buffer_rest >= parts->record_length + header_length1 - page_rest) + { + /* following function makes translog_unlock(); */ + res= translog_write_variable_record_1group(lsn, type, tbl_info, + short_trid, + parts, buffer_to_flush, + header_length1, trn, hook_arg); + DBUG_RETURN(res); + } + /* following function makes translog_unlock(); */ + res= translog_write_variable_record_mgroup(lsn, type, tbl_info, + short_trid, + parts, buffer_to_flush, + header_length1, + buffer_rest, trn, hook_arg); + DBUG_RETURN(res); +} + + +/** + @brief Write the fixed and pseudo-fixed log record. 
+ + @param lsn LSN of the record will be written here + @param type the log record type + @param short_trid Short transaction ID or 0 if it has no sense + @param parts Descriptor of record source parts + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param hook_arg Argument which will be passed to pre-write and + in-write hooks of this record. + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_write_fixed_record(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + TRN *trn, void *hook_arg) +{ + struct st_translog_buffer *buffer_to_flush= NULL; + uchar chunk1_header[1 + 2]; + /* Max number of such LSNs per record is 2 */ + uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD * + COMPRESSED_LSN_MAX_STORE_SIZE]; + LEX_CUSTRING *part; + int rc= 1; + DBUG_ENTER("translog_write_fixed_record"); + DBUG_ASSERT((log_record_type_descriptor[type].rclass == + LOGRECTYPE_FIXEDLENGTH && + parts->record_length == + log_record_type_descriptor[type].fixed_length) || + (log_record_type_descriptor[type].rclass == + LOGRECTYPE_PSEUDOFIXEDLENGTH && + parts->record_length == + log_record_type_descriptor[type].fixed_length)); + + translog_lock(); + DBUG_PRINT("info", ("horizon: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.horizon))); + + DBUG_ASSERT(log_descriptor.bc.current_page_fill <= TRANSLOG_PAGE_SIZE); + DBUG_PRINT("info", + ("Page size: %u record: %u next cond: %d", + log_descriptor.bc.current_page_fill, + (parts->record_length + + log_record_type_descriptor[type].compressed_LSN * 2 + 3), + ((((uint) log_descriptor.bc.current_page_fill) + + (parts->record_length + + log_record_type_descriptor[type].compressed_LSN * 2 + 3)) > + TRANSLOG_PAGE_SIZE))); + /* + check that there is enough place on current page. 
+ NOTE: compressing may increase page LSN size on two bytes for every LSN + */ + if ((((uint) log_descriptor.bc.current_page_fill) + + (parts->record_length + + log_record_type_descriptor[type].compressed_LSN * 2 + 3)) > + TRANSLOG_PAGE_SIZE) + { + DBUG_PRINT("info", ("Next page")); + if (translog_page_next(&log_descriptor.horizon, &log_descriptor.bc, + &buffer_to_flush)) + goto err; /* rc == 1 */ + if (buffer_to_flush) + translog_buffer_lock_assert_owner(buffer_to_flush); + } + + set_lsn(lsn, log_descriptor.horizon); + if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn), + *lsn, TRUE) || + (log_record_type_descriptor[type].inwrite_hook && + (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info, + lsn, hook_arg))) + goto err; + + /* compress LSNs */ + if (log_record_type_descriptor[type].rclass == + LOGRECTYPE_PSEUDOFIXEDLENGTH) + { + DBUG_ASSERT(log_record_type_descriptor[type].compressed_LSN > 0); + translog_relative_LSN_encode(parts, *lsn, + log_record_type_descriptor[type]. 
+ compressed_LSN, compressed_LSNs); + } + + /* + Write the whole record at once (we know that there is enough place on + the destination page) + */ + DBUG_ASSERT(parts->current != 0); /* first part is left for header */ + part= parts->parts + (--parts->current); + parts->total_record_length+= (translog_size_t) (part->length= 1 + 2); + part->str= chunk1_header; + *chunk1_header= (uchar) (type | TRANSLOG_CHUNK_FIXED); + int2store(chunk1_header + 1, short_trid); + + rc= translog_write_parts_on_page(&log_descriptor.horizon, + &log_descriptor.bc, + parts->total_record_length, parts); + + log_descriptor.bc.buffer->last_lsn= *lsn; + DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx) buffer: 0x%lx", + LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn), + (ulong) log_descriptor.bc.buffer)); + +err: + translog_unlock(); + + /* + check if we switched buffer and need process it (current buffer is + unlocked already => we will not delay other threads + */ + if (buffer_to_flush != NULL) + { + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + } + + DBUG_RETURN(rc); +} + + +/** + @brief Writes the log record + + If share has no 2-byte-id yet, gives an id to the share and logs + LOGREC_FILE_ID. If transaction has not logged LOGREC_LONG_TRANSACTION_ID + yet, logs it. 
+ + @param lsn LSN of the record will be written here + @param type the log record type + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param tbl_info MARIA_HA of table or NULL + @param rec_len record length or 0 (count it) + @param part_no number of parts or 0 (count it) + @param parts_data zero ended (in case of number of parts is 0) + array of LEX_STRINGs (parts), first + TRANSLOG_INTERNAL_PARTS positions in the log + should be unused (need for loghandler) + @param store_share_id if tbl_info!=NULL then share's id will + automatically be stored in the two first bytes + pointed (so pointer is assumed to be !=NULL) + @param hook_arg argument which will be passed to pre-write and + in-write hooks of this record. + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_write_record(LSN *lsn, + enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + translog_size_t rec_len, + uint part_no, + LEX_CUSTRING *parts_data, + uchar *store_share_id, + void *hook_arg) +{ + struct st_translog_parts parts; + LEX_CUSTRING *part; + int rc; + uint short_trid= trn->short_id; + DBUG_ENTER("translog_write_record"); + DBUG_PRINT("enter", ("type: %u (%s) ShortTrID: %u rec_len: %lu", + (uint) type, log_record_type_descriptor[type].name, + (uint) short_trid, (ulong) rec_len)); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + if (unlikely(translog_status != TRANSLOG_OK)) + { + DBUG_PRINT("error", ("Transaction log is write protected")); + DBUG_RETURN(1); + } + + if (tbl_info) + { + MARIA_SHARE *share= tbl_info->s; + DBUG_ASSERT(share->now_transactional); + if (unlikely(share->id == 0)) + { + /* + First log write for this MARIA_SHARE; give it a short id. 
+ When the lock manager is enabled and needs a short id, it should be + assigned in the lock manager (because row locks will be taken before + log records are written; for example SELECT FOR UPDATE takes locks but + writes no log record. + */ + if (unlikely(translog_assign_id_to_share(tbl_info, trn))) + DBUG_RETURN(1); + } + fileid_store(store_share_id, share->id); + } + if (unlikely(!(trn->first_undo_lsn & TRANSACTION_LOGGED_LONG_ID))) + { + LSN dummy_lsn; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + uchar log_data[6]; + DBUG_ASSERT(trn->undo_lsn == LSN_IMPOSSIBLE); + int6store(log_data, trn->trid); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; /* no recursion */ + if (unlikely(translog_write_record(&dummy_lsn, LOGREC_LONG_TRANSACTION_ID, + trn, NULL, sizeof(log_data), + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL))) + DBUG_RETURN(1); + } + + parts.parts= parts_data; + + /* count parts if they are not counted by upper level */ + if (part_no == 0) + { + for (part_no= TRANSLOG_INTERNAL_PARTS; + parts_data[part_no].length != 0; + part_no++); + } + parts.elements= part_no; + parts.current= TRANSLOG_INTERNAL_PARTS; + + /* clear TRANSLOG_INTERNAL_PARTS */ + compile_time_assert(TRANSLOG_INTERNAL_PARTS != 0); + parts_data[0].str= 0; + parts_data[0].length= 0; + + /* count length of the record */ + if (rec_len == 0) + { + for(part= parts_data + TRANSLOG_INTERNAL_PARTS;\ + part < parts_data + part_no; + part++) + { + rec_len+= (translog_size_t) part->length; + } + } + parts.record_length= rec_len; + +#ifndef DBUG_OFF + { + uint i; + uint len= 0; +#ifdef HAVE_valgrind + ha_checksum checksum= 0; +#endif + for (i= TRANSLOG_INTERNAL_PARTS; i < part_no; i++) + { +#ifdef HAVE_valgrind + /* Find unitialized bytes early */ + checksum+= my_checksum(checksum, parts_data[i].str, + parts_data[i].length); +#endif + len+= 
parts_data[i].length; + } + DBUG_ASSERT(len == rec_len); + } +#endif + /* + Start total_record_length from record_length then overhead will + be add + */ + parts.total_record_length= parts.record_length; + DBUG_PRINT("info", ("record length: %lu", (ulong) parts.record_length)); + + /* process this parts */ + if (!(rc= (log_record_type_descriptor[type].prewrite_hook && + (*log_record_type_descriptor[type].prewrite_hook) (type, trn, + tbl_info, + hook_arg)))) + { + switch (log_record_type_descriptor[type].rclass) { + case LOGRECTYPE_VARIABLE_LENGTH: + rc= translog_write_variable_record(lsn, type, tbl_info, + short_trid, &parts, trn, hook_arg); + break; + case LOGRECTYPE_PSEUDOFIXEDLENGTH: + case LOGRECTYPE_FIXEDLENGTH: + rc= translog_write_fixed_record(lsn, type, tbl_info, + short_trid, &parts, trn, hook_arg); + break; + case LOGRECTYPE_NOT_ALLOWED: + default: + DBUG_ASSERT(0); + rc= 1; + } + } + + DBUG_PRINT("info", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(*lsn))); + DBUG_RETURN(rc); +} + + +/* + Decode compressed (relative) LSN(s) + + SYNOPSIS + translog_relative_lsn_decode() + base_lsn LSN for encoding + src Decode LSN(s) from here + dst Put decoded LSNs here + lsns number of LSN(s) + + RETURN + position in sources after decoded LSN(s) +*/ + +static uchar *translog_relative_LSN_decode(LSN base_lsn, + uchar *src, uchar *dst, uint lsns) +{ + uint i; + for (i= 0; i < lsns; i++, dst+= LSN_STORE_SIZE) + { + src= translog_get_LSN_from_diff(base_lsn, src, dst); + } + return src; +} + +/** + @brief Get header of fixed/pseudo length record and call hook for + it processing + + @param page Pointer to the buffer with page where LSN chunk is + placed + @param page_offset Offset of the first chunk in the page + @param buff Buffer to be filled with header data + + @return Length of header or operation status + @retval # number of bytes in TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +static int translog_fixed_length_header(uchar *page, + 
translog_size_t page_offset, + TRANSLOG_HEADER_BUFFER *buff) +{ + struct st_log_record_type_descriptor *desc= + log_record_type_descriptor + buff->type; + uchar *src= page + page_offset + 3; + uchar *dst= buff->header; + uchar *start= src; + int lsns= desc->compressed_LSN; + uint length= desc->fixed_length; + DBUG_ENTER("translog_fixed_length_header"); + + buff->record_length= length; + + if (desc->rclass == LOGRECTYPE_PSEUDOFIXEDLENGTH) + { + DBUG_ASSERT(lsns > 0); + src= translog_relative_LSN_decode(buff->lsn, src, dst, lsns); + lsns*= LSN_STORE_SIZE; + dst+= lsns; + length-= lsns; + buff->compressed_LSN_economy= (lsns - (int) (src - start)); + } + else + buff->compressed_LSN_economy= 0; + + memcpy(dst, src, length); + buff->non_header_data_start_offset= (uint16) (page_offset + + ((src + length) - + (page + page_offset))); + buff->non_header_data_len= 0; + DBUG_RETURN(buff->record_length); +} + + +/* + Free resources used by TRANSLOG_HEADER_BUFFER + + SYNOPSIS + translog_free_record_header(); +*/ + +void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff) +{ + DBUG_ENTER("translog_free_record_header"); + if (buff->groups_no != 0) + { + my_free(buff->groups, MYF(0)); + buff->groups_no= 0; + } + DBUG_VOID_RETURN; +} + + +/** + @brief Returns the current horizon at the end of the current log + + @return Horizon + @retval LSN_ERROR error + @retvar # Horizon +*/ + +TRANSLOG_ADDRESS translog_get_horizon() +{ + TRANSLOG_ADDRESS res; + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + translog_lock(); + res= log_descriptor.horizon; + translog_unlock(); + return res; +} + + +/** + @brief Returns the current horizon at the end of the current log, caller is + assumed to already hold the lock + + @return Horizon + @retval LSN_ERROR error + @retvar # Horizon +*/ + +TRANSLOG_ADDRESS translog_get_horizon_no_lock() +{ + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + 
translog_lock_assert_owner(); + return log_descriptor.horizon; +} + + +/* + Set last page in the scanner data structure + + SYNOPSIS + translog_scanner_set_last_page() + scanner Information about current chunk during scanning + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_scanner_set_last_page(TRANSLOG_SCANNER_DATA *scanner) +{ + my_bool page_ok; + if (LSN_FILE_NO(scanner->page_addr) == LSN_FILE_NO(scanner->horizon)) + { + /* It is last file => we can easy find last page address by horizon */ + uint pagegrest= LSN_OFFSET(scanner->horizon) % TRANSLOG_PAGE_SIZE; + scanner->last_file_page= (scanner->horizon - + (pagegrest ? pagegrest : TRANSLOG_PAGE_SIZE)); + return (0); + } + scanner->last_file_page= scanner->page_addr; + return (translog_get_last_page_addr(&scanner->last_file_page, &page_ok, 0)); +} + + +/** + @brief Get page from page cache according to requested method + + @param scanner The scanner data + + @return operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool +translog_scanner_get_page(TRANSLOG_SCANNER_DATA *scanner) +{ + TRANSLOG_VALIDATOR_DATA data; + DBUG_ENTER("translog_scanner_get_page"); + data.addr= &scanner->page_addr; + data.was_recovered= 0; + DBUG_RETURN((scanner->page= + translog_get_page(&data, scanner->buffer, + (scanner->use_direct_link ? + &scanner->direct_link : + NULL))) == + NULL); +} + + +/** + @brief Initialize reader scanner. + + @param lsn LSN with which it have to be inited + @param fixed_horizon true if it is OK do not read records which was written + after scanning beginning + @param scanner scanner which have to be inited + @param use_direct prefer using direct lings from page handler + where it is possible. 
+ + @note If direct link was used translog_destroy_scanner should be + called after it using + + @return status of the operation + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_scanner_init(LSN lsn, + my_bool fixed_horizon, + TRANSLOG_SCANNER_DATA *scanner, + my_bool use_direct) +{ + TRANSLOG_VALIDATOR_DATA data; + DBUG_ENTER("translog_scanner_init"); + DBUG_PRINT("enter", ("Scanner: 0x%lx LSN: (%lu,0x%lx)", + (ulong) scanner, LSN_IN_PARTS(lsn))); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + data.addr= &scanner->page_addr; + data.was_recovered= 0; + + scanner->page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE; + + scanner->fixed_horizon= fixed_horizon; + scanner->use_direct_link= use_direct; + scanner->direct_link= NULL; + + scanner->horizon= translog_get_horizon(); + DBUG_PRINT("info", ("horizon: (%lu,0x%lx)", LSN_IN_PARTS(scanner->horizon))); + + /* lsn < horizon */ + DBUG_ASSERT(lsn <= scanner->horizon); + + scanner->page_addr= lsn; + scanner->page_addr-= scanner->page_offset; /*decrease offset */ + + if (translog_scanner_set_last_page(scanner)) + DBUG_RETURN(1); + + if (translog_scanner_get_page(scanner)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/** + @brief Destroy scanner object; + + @param scanner The scanner object to destroy +*/ + +void translog_destroy_scanner(TRANSLOG_SCANNER_DATA *scanner) +{ + DBUG_ENTER("translog_destroy_scanner"); + DBUG_PRINT("enter", ("Scanner: 0x%lx", (ulong)scanner)); + translog_free_link(scanner->direct_link); + DBUG_VOID_RETURN; +} + + +/* + Checks End of the Log + + SYNOPSIS + translog_scanner_eol() + scanner Information about current chunk during scanning + + RETURN + 1 End of the Log + 0 OK +*/ + +static my_bool translog_scanner_eol(TRANSLOG_SCANNER_DATA *scanner) +{ + DBUG_ENTER("translog_scanner_eol"); + DBUG_PRINT("enter", + ("Horizon: (%lu, 0x%lx) Current: (%lu, 0x%lx+0x%x=0x%lx)", + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->page_addr), + (uint) 
scanner->page_offset, + (ulong) (LSN_OFFSET(scanner->page_addr) + scanner->page_offset))); + if (scanner->horizon > (scanner->page_addr + + scanner->page_offset)) + { + DBUG_PRINT("info", ("Horizon is not reached")); + DBUG_RETURN(0); + } + if (scanner->fixed_horizon) + { + DBUG_PRINT("info", ("Horizon is fixed and reached")); + DBUG_RETURN(1); + } + scanner->horizon= translog_get_horizon(); + DBUG_PRINT("info", + ("Horizon is re-read, EOL: %d", + scanner->horizon <= (scanner->page_addr + + scanner->page_offset))); + DBUG_RETURN(scanner->horizon <= (scanner->page_addr + + scanner->page_offset)); +} + + +/** + @brief Cheks End of the Page + + @param scanner Information about current chunk during scanning + + @retval 1 End of the Page + @retval 0 OK +*/ + +static my_bool translog_scanner_eop(TRANSLOG_SCANNER_DATA *scanner) +{ + DBUG_ENTER("translog_scanner_eop"); + DBUG_RETURN(scanner->page_offset >= TRANSLOG_PAGE_SIZE || + scanner->page[scanner->page_offset] == TRANSLOG_FILLER); +} + + +/** + @brief Checks End of the File (i.e. 
we are scanning last page, which do not + mean end of this page) + + @param scanner Information about current chunk during scanning + + @retval 1 End of the File + @retval 0 OK +*/ + +static my_bool translog_scanner_eof(TRANSLOG_SCANNER_DATA *scanner) +{ + DBUG_ENTER("translog_scanner_eof"); + DBUG_ASSERT(LSN_FILE_NO(scanner->page_addr) == + LSN_FILE_NO(scanner->last_file_page)); + DBUG_PRINT("enter", ("curr Page: 0x%lx last page: 0x%lx " + "normal EOF: %d", + (ulong) LSN_OFFSET(scanner->page_addr), + (ulong) LSN_OFFSET(scanner->last_file_page), + LSN_OFFSET(scanner->page_addr) == + LSN_OFFSET(scanner->last_file_page))); + /* + TODO: detect damaged file EOF, + TODO: issue warning if damaged file EOF detected + */ + DBUG_RETURN(scanner->page_addr == + scanner->last_file_page); +} + +/* + Move scanner to the next chunk + + SYNOPSIS + translog_get_next_chunk() + scanner Information about current chunk during scanning + + RETURN + 0 OK + 1 Error +*/ + +static my_bool +translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner) +{ + uint16 len; + DBUG_ENTER("translog_get_next_chunk"); + + if (translog_scanner_eop(scanner)) + len= TRANSLOG_PAGE_SIZE - scanner->page_offset; + else if ((len= translog_get_total_chunk_length(scanner->page, + scanner->page_offset)) == 0) + DBUG_RETURN(1); + scanner->page_offset+= len; + + if (translog_scanner_eol(scanner)) + { + scanner->page= END_OF_LOG; + scanner->page_offset= 0; + DBUG_RETURN(0); + } + if (translog_scanner_eop(scanner)) + { + /* before reading next page we should unpin current one if it was pinned */ + translog_free_link(scanner->direct_link); + if (translog_scanner_eof(scanner)) + { + DBUG_PRINT("info", ("horizon: (%lu,0x%lx) pageaddr: (%lu,0x%lx)", + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->page_addr))); + /* if it is log end it have to be caught before */ + DBUG_ASSERT(LSN_FILE_NO(scanner->horizon) > + LSN_FILE_NO(scanner->page_addr)); + scanner->page_addr+= LSN_ONE_FILE; + scanner->page_addr= 
LSN_REPLACE_OFFSET(scanner->page_addr, + TRANSLOG_PAGE_SIZE); + if (translog_scanner_set_last_page(scanner)) + DBUG_RETURN(1); + } + else + { + scanner->page_addr+= TRANSLOG_PAGE_SIZE; /* offset increased */ + } + + if (translog_scanner_get_page(scanner)) + DBUG_RETURN(1); + + scanner->page_offset= translog_get_first_chunk_offset(scanner->page); + if (translog_scanner_eol(scanner)) + { + scanner->page= END_OF_LOG; + scanner->page_offset= 0; + DBUG_RETURN(0); + } + DBUG_ASSERT(scanner->page[scanner->page_offset] != TRANSLOG_FILLER); + } + DBUG_RETURN(0); +} + + +/** + @brief Get header of variable length record and call hook for it processing + + @param page Pointer to the buffer with page where LSN chunk is + placed + @param page_offset Offset of the first chunk in the page + @param buff Buffer to be filled with header data + @param scanner If present should be moved to the header page if + it differ from LSN page + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval RECHEADER_READ_EOF End of the log reached during the read + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +static int +translog_variable_length_header(uchar *page, translog_size_t page_offset, + TRANSLOG_HEADER_BUFFER *buff, + TRANSLOG_SCANNER_DATA *scanner) +{ + struct st_log_record_type_descriptor *desc= (log_record_type_descriptor + + buff->type); + uchar *src= page + page_offset + 1 + 2; + uchar *dst= buff->header; + LSN base_lsn; + uint lsns= desc->compressed_LSN; + uint16 chunk_len; + uint16 length= desc->read_header_len; + uint16 buffer_length= length; + uint16 body_len; + int rc; + TRANSLOG_SCANNER_DATA internal_scanner; + DBUG_ENTER("translog_variable_length_header"); + + buff->record_length= translog_variable_record_1group_decode_len(&src); + chunk_len= uint2korr(src); + DBUG_PRINT("info", ("rec len: %lu chunk len: %u length: %u bufflen: %u", + (ulong) buff->record_length, (uint) 
chunk_len, + (uint) length, (uint) buffer_length)); + if (chunk_len == 0) + { + uint16 page_rest; + DBUG_PRINT("info", ("1 group")); + src+= 2; + page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); + + base_lsn= buff->lsn; + body_len= min(page_rest, buff->record_length); + } + else + { + uint grp_no, curr; + uint header_to_skip; + uint16 page_rest; + + DBUG_PRINT("info", ("multi-group")); + grp_no= buff->groups_no= uint2korr(src + 2); + if (!(buff->groups= + (TRANSLOG_GROUP*) my_malloc(sizeof(TRANSLOG_GROUP) * grp_no, + MYF(0)))) + DBUG_RETURN(RECHEADER_READ_ERROR); + DBUG_PRINT("info", ("Groups: %u", (uint) grp_no)); + src+= (2 + 2); + page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); + curr= 0; + header_to_skip= src - (page + page_offset); + buff->chunk0_pages= 0; + + for (;;) + { + uint i, read_length= grp_no; + + buff->chunk0_pages++; + if (page_rest < grp_no * (7 + 1)) + read_length= page_rest / (7 + 1); + DBUG_PRINT("info", ("Read chunk0 page#%u read: %u left: %u " + "start from: %u", + buff->chunk0_pages, read_length, grp_no, curr)); + for (i= 0; i < read_length; i++, curr++) + { + DBUG_ASSERT(curr < buff->groups_no); + buff->groups[curr].addr= lsn_korr(src + i * (7 + 1)); + buff->groups[curr].num= src[i * (7 + 1) + 7]; + DBUG_PRINT("info", ("group #%u (%lu,0x%lx) chunks: %u", + curr, + LSN_IN_PARTS(buff->groups[curr].addr), + (uint) buff->groups[curr].num)); + } + grp_no-= read_length; + if (grp_no == 0) + { + if (scanner) + { + buff->chunk0_data_addr= scanner->page_addr; + /* offset increased */ + buff->chunk0_data_addr+= (page_offset + header_to_skip + + read_length * (7 + 1)); + } + else + { + buff->chunk0_data_addr= buff->lsn; + /* offset increased */ + buff->chunk0_data_addr+= (header_to_skip + read_length * (7 + 1)); + } + buff->chunk0_data_len= chunk_len - 2 - read_length * (7 + 1); + DBUG_PRINT("info", ("Data address: (%lu,0x%lx) len: %u", + LSN_IN_PARTS(buff->chunk0_data_addr), + buff->chunk0_data_len)); + break; + } + if (scanner == 
NULL) + { + DBUG_PRINT("info", ("use internal scanner for header reading")); + scanner= &internal_scanner; + if (translog_scanner_init(buff->lsn, 1, scanner, 0)) + { + rc= RECHEADER_READ_ERROR; + goto exit_and_free; + } + } + if (translog_get_next_chunk(scanner)) + { + if (scanner == &internal_scanner) + translog_destroy_scanner(scanner); + rc= RECHEADER_READ_ERROR; + goto exit_and_free; + } + if (scanner->page == END_OF_LOG) + { + if (scanner == &internal_scanner) + translog_destroy_scanner(scanner); + rc= RECHEADER_READ_EOF; + goto exit_and_free; + } + page= scanner->page; + page_offset= scanner->page_offset; + src= page + page_offset + header_to_skip; + chunk_len= uint2korr(src - 2 - 2); + DBUG_PRINT("info", ("Chunk len: %u", (uint) chunk_len)); + page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); + } + + if (scanner == NULL) + { + DBUG_PRINT("info", ("use internal scanner")); + scanner= &internal_scanner; + } + else + { + translog_destroy_scanner(scanner); + } + base_lsn= buff->groups[0].addr; + translog_scanner_init(base_lsn, 1, scanner, scanner == &internal_scanner); + /* first group chunk is always chunk type 2 */ + page= scanner->page; + page_offset= scanner->page_offset; + src= page + page_offset + 1; + page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); + body_len= page_rest; + if (scanner == &internal_scanner) + translog_destroy_scanner(scanner); + } + if (lsns) + { + uchar *start= src; + src= translog_relative_LSN_decode(base_lsn, src, dst, lsns); + lsns*= LSN_STORE_SIZE; + dst+= lsns; + length-= lsns; + buff->record_length+= (buff->compressed_LSN_economy= + (int) (lsns - (src - start))); + DBUG_PRINT("info", ("lsns: %u length: %u economy: %d new length: %lu", + lsns / LSN_STORE_SIZE, (uint) length, + (int) buff->compressed_LSN_economy, + (ulong) buff->record_length)); + body_len-= (uint16) (src - start); + } + else + buff->compressed_LSN_economy= 0; + + DBUG_ASSERT(body_len >= length); + body_len-= length; + memcpy(dst, src, length); + 
buff->non_header_data_start_offset= (uint16) (src + length - page); + buff->non_header_data_len= body_len; + DBUG_PRINT("info", ("non_header_data_start_offset: %u len: %u buffer: %u", + buff->non_header_data_start_offset, + buff->non_header_data_len, buffer_length)); + DBUG_RETURN(buffer_length); + +exit_and_free: + my_free(buff->groups, MYF(0)); + buff->groups_no= 0; /* prevent try to use of buff->groups */ + DBUG_RETURN(rc); +} + + +/** + @brief Read record header from the given buffer + + @param page page content buffer + @param page_offset offset of the chunk in the page + @param buff destination buffer + @param scanner If this is set the scanner will be moved to the + record header page (differ from LSN page in case of + multi-group records) + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +int translog_read_record_header_from_buffer(uchar *page, + uint16 page_offset, + TRANSLOG_HEADER_BUFFER *buff, + TRANSLOG_SCANNER_DATA *scanner) +{ + translog_size_t res; + DBUG_ENTER("translog_read_record_header_from_buffer"); + DBUG_PRINT("info", ("page byte: 0x%x offset: %u", + (uint) page[page_offset], (uint) page_offset)); + DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset])); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + buff->type= (page[page_offset] & TRANSLOG_REC_TYPE); + buff->short_trid= uint2korr(page + page_offset + 1); + DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)", + (uint) buff->type, (uint)buff->short_trid, + LSN_IN_PARTS(buff->lsn))); + /* Read required bytes from the header and call hook */ + switch (log_record_type_descriptor[buff->type].rclass) { + case LOGRECTYPE_VARIABLE_LENGTH: + res= translog_variable_length_header(page, page_offset, buff, + scanner); + break; + case LOGRECTYPE_PSEUDOFIXEDLENGTH: + case LOGRECTYPE_FIXEDLENGTH: + res= 
translog_fixed_length_header(page, page_offset, buff); + break; + default: + DBUG_ASSERT(0); /* we read some junk (got no LSN) */ + res= RECHEADER_READ_ERROR; + } + DBUG_RETURN(res); +} + + +/** + @brief Read record header and some fixed part of a record (the part depend + on record type). + + @param lsn log record serial number (address of the record) + @param buff log record header buffer + + @note Some type of record can be read completely by this call + @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative + LSN can be translated to absolute one), some fields can be added (like + actual header length in the record if the header has variable length) + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff) +{ + TRANSLOG_PAGE_SIZE_BUFF psize_buff; + uchar *page; + translog_size_t res, page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE; + PAGECACHE_BLOCK_LINK *direct_link; + TRANSLOG_ADDRESS addr; + TRANSLOG_VALIDATOR_DATA data; + DBUG_ENTER("translog_read_record_header"); + DBUG_PRINT("enter", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn))); + DBUG_ASSERT(LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE != 0); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + buff->lsn= lsn; + buff->groups_no= 0; + data.addr= &addr; + data.was_recovered= 0; + addr= lsn; + addr-= page_offset; /* offset decreasing */ + res= (!(page= translog_get_page(&data, psize_buff.buffer, &direct_link))) ? + RECHEADER_READ_ERROR : + translog_read_record_header_from_buffer(page, page_offset, buff, 0); + translog_free_link(direct_link); + DBUG_RETURN(res); +} + + +/** + @brief Read record header and some fixed part of a record (the part depend + on record type). 
+ + @param scan scanner position to read + @param buff log record header buffer + @param move_scanner request to move scanner to the header position + + @note Some type of record can be read completely by this call + @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative + LSN can be translated to absolute one), some fields can be added (like + actual header length in the record if the header has variable length) + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where stored + decoded part of the header +*/ + +int translog_read_record_header_scan(TRANSLOG_SCANNER_DATA *scanner, + TRANSLOG_HEADER_BUFFER *buff, + my_bool move_scanner) +{ + translog_size_t res; + DBUG_ENTER("translog_read_record_header_scan"); + DBUG_PRINT("enter", ("Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) " + "Lst: (%lu,0x%lx) Offset: %u(%x) fixed %d", + LSN_IN_PARTS(scanner->page_addr), + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->last_file_page), + (uint) scanner->page_offset, + (uint) scanner->page_offset, scanner->fixed_horizon)); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + buff->groups_no= 0; + buff->lsn= scanner->page_addr; + buff->lsn+= scanner->page_offset; /* offset increasing */ + res= translog_read_record_header_from_buffer(scanner->page, + scanner->page_offset, + buff, + (move_scanner ? + scanner : 0)); + DBUG_RETURN(res); +} + + +/** + @brief Read record header and some fixed part of the next record (the part + depend on record type). + + @param scanner data for scanning if lsn is NULL scanner data + will be used for continue scanning. + The scanner can be NULL. 
+ + @param buff log record header buffer + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval RECHEADER_READ_EOF EOF + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner, + TRANSLOG_HEADER_BUFFER *buff) +{ + translog_size_t res; + + DBUG_ENTER("translog_read_next_record_header"); + buff->groups_no= 0; /* to be sure that we will free it right */ + DBUG_PRINT("enter", ("scanner: 0x%lx", (ulong) scanner)); + DBUG_PRINT("info", ("Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) " + "Lst: (%lu,0x%lx) Offset: %u(%x) fixed: %d", + LSN_IN_PARTS(scanner->page_addr), + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->last_file_page), + (uint) scanner->page_offset, + (uint) scanner->page_offset, scanner->fixed_horizon)); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + do + { + if (translog_get_next_chunk(scanner)) + DBUG_RETURN(RECHEADER_READ_ERROR); + if (scanner->page == END_OF_LOG) + { + DBUG_PRINT("info", ("End of file from the scanner")); + /* Last record was read */ + buff->lsn= LSN_IMPOSSIBLE; + DBUG_RETURN(RECHEADER_READ_EOF); + } + DBUG_PRINT("info", ("Page: (%lu,0x%lx) offset: %lu byte: %x", + LSN_IN_PARTS(scanner->page_addr), + (ulong) scanner->page_offset, + (uint) scanner->page[scanner->page_offset])); + } while (!translog_is_LSN_chunk(scanner->page[scanner->page_offset]) && + scanner->page[scanner->page_offset] != TRANSLOG_FILLER); + + if (scanner->page[scanner->page_offset] == TRANSLOG_FILLER) + { + DBUG_PRINT("info", ("End of file")); + /* Last record was read */ + buff->lsn= LSN_IMPOSSIBLE; + /* Return 'end of log' marker */ + res= RECHEADER_READ_EOF; + } + else + res= translog_read_record_header_scan(scanner, buff, 0); + DBUG_RETURN(res); +} + + +/* + Moves record data reader to the next chunk and fill the data reader + information about that 
chunk. + + SYNOPSIS + translog_record_read_next_chunk() + data data cursor + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_record_read_next_chunk(TRANSLOG_READER_DATA *data) +{ + translog_size_t new_current_offset= data->current_offset + data->chunk_size; + uint16 chunk_header_len, chunk_len; + uint8 type; + DBUG_ENTER("translog_record_read_next_chunk"); + + if (data->eor) + { + DBUG_PRINT("info", ("end of the record flag set")); + DBUG_RETURN(1); + } + + if (data->header.groups_no && + data->header.groups_no - 1 != data->current_group && + data->header.groups[data->current_group].num == data->current_chunk) + { + /* Goto next group */ + data->current_group++; + data->current_chunk= 0; + DBUG_PRINT("info", ("skip to group: #%u", data->current_group)); + translog_destroy_scanner(&data->scanner); + translog_scanner_init(data->header.groups[data->current_group].addr, + 1, &data->scanner, 1); + } + else + { + data->current_chunk++; + if (translog_get_next_chunk(&data->scanner)) + DBUG_RETURN(1); + if (data->scanner.page == END_OF_LOG) + { + /* + Actually it should not happened, but we want to quit nicely in case + of a truncated log + */ + DBUG_RETURN(1); + } + } + type= data->scanner.page[data->scanner.page_offset] & TRANSLOG_CHUNK_TYPE; + + if (type == TRANSLOG_CHUNK_LSN && data->header.groups_no) + { + DBUG_PRINT("info", + ("Last chunk: data len: %u offset: %u group: %u of %u", + data->header.chunk0_data_len, data->scanner.page_offset, + data->current_group, data->header.groups_no - 1)); + DBUG_ASSERT(data->header.groups_no - 1 == data->current_group); + DBUG_ASSERT(data->header.lsn == + data->scanner.page_addr + data->scanner.page_offset); + translog_destroy_scanner(&data->scanner); + translog_scanner_init(data->header.chunk0_data_addr, 1, &data->scanner, 1); + data->chunk_size= data->header.chunk0_data_len; + data->body_offset= data->scanner.page_offset; + data->current_offset= new_current_offset; + data->eor= 1; + DBUG_RETURN(0); + } + + if (type == 
TRANSLOG_CHUNK_LSN || type == TRANSLOG_CHUNK_FIXED) + { + data->eor= 1; + DBUG_RETURN(1); /* End of record */ + } + + chunk_header_len= + translog_get_chunk_header_length(data->scanner.page + + data->scanner.page_offset); + chunk_len= translog_get_total_chunk_length(data->scanner.page, + data->scanner.page_offset); + data->chunk_size= chunk_len - chunk_header_len; + data->body_offset= data->scanner.page_offset + chunk_header_len; + data->current_offset= new_current_offset; + DBUG_PRINT("info", ("grp: %u chunk: %u body_offset: %u chunk_size: %u " + "current_offset: %lu", + (uint) data->current_group, + (uint) data->current_chunk, + (uint) data->body_offset, + (uint) data->chunk_size, (ulong) data->current_offset)); + DBUG_RETURN(0); +} + + +/* + Initialize record reader data from LSN + + SYNOPSIS + translog_init_reader_data() + lsn reference to LSN we should start from + data reader data to initialize + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_init_reader_data(LSN lsn, + TRANSLOG_READER_DATA *data) +{ + int read_header; + DBUG_ENTER("translog_init_reader_data"); + if (translog_scanner_init(lsn, 1, &data->scanner, 1) || + ((read_header= + translog_read_record_header_scan(&data->scanner, &data->header, 1)) + == RECHEADER_READ_ERROR)) + DBUG_RETURN(1); + data->read_header= read_header; + data->body_offset= data->header.non_header_data_start_offset; + data->chunk_size= data->header.non_header_data_len; + data->current_offset= data->read_header; + data->current_group= 0; + data->current_chunk= 0; + data->eor= 0; + DBUG_PRINT("info", ("read_header: %u " + "body_offset: %u chunk_size: %u current_offset: %lu", + (uint) data->read_header, + (uint) data->body_offset, + (uint) data->chunk_size, (ulong) data->current_offset)); + DBUG_RETURN(0); +} + + +/** + @brief Destroy reader data object +*/ + +static void translog_destroy_reader_data(TRANSLOG_READER_DATA *data) +{ + translog_destroy_scanner(&data->scanner); + translog_free_record_header(&data->header); +} + 


/*
  Read a part of the record.

  SYNOPSIS
    translog_read_record()
    lsn                  log record serial number (address of the record)
    offset               From the beginning of the record beginning (read
                         by translog_read_record_header).
    length               Length of record part which have to be read.
    buffer               Buffer where to read the record part (have to be at
                         least 'length' bytes length)
    data                 Reader state to use; if NULL a reader local to this
                         call is used (and 'lsn' must then be given)

  NOTES
    The reader is (re)initialized when 'lsn' is non-zero, or when the
    requested offset lies before the reader's current position and the
    request is not served entirely from the decoded header.

    NOTE(review): every return path below calls
    translog_destroy_reader_data(), also for a caller-supplied 'data' -
    confirm that callers do not expect the reader to stay usable.

  RETURN
    length of data actually read
*/

translog_size_t translog_read_record(LSN lsn,
                                     translog_size_t offset,
                                     translog_size_t length,
                                     uchar *buffer,
                                     TRANSLOG_READER_DATA *data)
{
  translog_size_t requested_length= length;
  translog_size_t end= offset + length;
  TRANSLOG_READER_DATA internal_data;
  DBUG_ENTER("translog_read_record");
  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
              translog_status == TRANSLOG_READONLY);

  if (data == NULL)
  {
    DBUG_ASSERT(lsn != LSN_IMPOSSIBLE);
    data= &internal_data;
  }
  if (lsn ||
      (offset < data->current_offset &&
       !(offset < data->read_header && offset + length < data->read_header)))
  {
    if (translog_init_reader_data(lsn, data))
      DBUG_RETURN(0);
  }
  DBUG_PRINT("info", ("Offset: %lu length: %lu "
                      "Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) "
                      "Lst: (%lu,0x%lx) Offset: %u(%x) fixed: %d",
                      (ulong) offset, (ulong) length,
                      LSN_IN_PARTS(data->scanner.page_addr),
                      LSN_IN_PARTS(data->scanner.horizon),
                      LSN_IN_PARTS(data->scanner.last_file_page),
                      (uint) data->scanner.page_offset,
                      (uint) data->scanner.page_offset,
                      data->scanner.fixed_horizon));
  /* First serve whatever part of the request lies in the decoded header */
  if (offset < data->read_header)
  {
    uint16 len= min(data->read_header, end) - offset;
    DBUG_PRINT("info",
               ("enter header offset: %lu length: %lu",
                (ulong) offset, (ulong) length));
    memcpy(buffer, data->header.header + offset, len);
    length-= len;
    if (length == 0)
    {
      translog_destroy_reader_data(data);
      DBUG_RETURN(requested_length);
    }
    offset+= len;
    buffer+= len;
    DBUG_PRINT("info",
               ("len: %u offset: %lu curr: %lu length: %lu",
                len, (ulong) offset, (ulong) data->current_offset,
                (ulong) length));
  }
  /* TODO: find first page which we should read by offset */

  /* read the record chunk by chunk */
  for(;;)
  {
    /* record offset just past the end of the current chunk's data */
    uint page_end= data->current_offset + data->chunk_size;
    DBUG_PRINT("info",
               ("enter body offset: %lu curr: %lu "
                "length: %lu page_end: %lu",
                (ulong) offset, (ulong) data->current_offset, (ulong) length,
                (ulong) page_end));
    if (offset < page_end)
    {
      uint len= page_end - offset;
      set_if_smaller(len, length); /* in case we read beyond record's end */
      DBUG_ASSERT(offset >= data->current_offset);
      memcpy(buffer,
             data->scanner.page + data->body_offset +
             (offset - data->current_offset), len);
      length-= len;
      if (length == 0)
      {
        translog_destroy_reader_data(data);
        DBUG_RETURN(requested_length);
      }
      offset+= len;
      buffer+= len;
      DBUG_PRINT("info",
                 ("len: %u offset: %lu curr: %lu length: %lu",
                  len, (ulong) offset, (ulong) data->current_offset,
                  (ulong) length));
    }
    /* No more chunks (or an error): report what was read so far */
    if (translog_record_read_next_chunk(data))
    {
      translog_destroy_reader_data(data);
      DBUG_RETURN(requested_length - length);
    }
  }
}


/*
  @brief Force skipping to the next buffer

  Closes the buffer the log cursor (log_descriptor.bc) currently writes
  to and makes the next buffer in the ring the current one.  If the last
  page of the old buffer is only partly filled, its rest is filled with
  TRANSLOG_FILLER and the new buffer starts with an overlay of that same
  page.

  The caller must hold the log lock (see translog_lock_assert_owner()
  below).

  @todo Do not copy old page content if all page protections are switched off
  (because we do not need calculate something or change old parts of the page)
*/

static void translog_force_current_buffer_to_finish()
{
  TRANSLOG_ADDRESS new_buff_beginning;
  uint16 old_buffer_no= log_descriptor.bc.buffer_no;
  uint16 new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
  struct st_translog_buffer *new_buffer= (log_descriptor.buffers +
                                          new_buffer_no);
  struct st_translog_buffer *old_buffer= log_descriptor.bc.buffer;
  /* start of the page the cursor is currently filling */
  uchar *data= log_descriptor.bc.ptr - log_descriptor.bc.current_page_fill;
  /* free bytes left on that page */
  uint16 left= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill;
  uint16 current_page_fill, write_counter, previous_offset;
  DBUG_ENTER("translog_force_current_buffer_to_finish");
  DBUG_PRINT("enter", ("Buffer #%u 0x%lx "
                       "Buffer addr: (%lu,0x%lx) "
                       "Page addr: (%lu,0x%lx) "
                       "size: %lu (%lu) Pg: %u left: %u in progress %u",
                       (uint) old_buffer_no,
                       (ulong) old_buffer,
                       LSN_IN_PARTS(old_buffer->offset),
                       (ulong) LSN_FILE_NO(log_descriptor.horizon),
                       (ulong) (LSN_OFFSET(log_descriptor.horizon) -
                                log_descriptor.bc.current_page_fill),
                       (ulong) old_buffer->size,
                       (ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
                                buffer->buffer),
                       (uint) log_descriptor.bc.current_page_fill,
                       (uint) left,
                       (uint) old_buffer->
                       copy_to_buffer_in_progress));
  translog_lock_assert_owner();
  LINT_INIT(current_page_fill);
  /* By default the new buffer starts where the old one ends */
  new_buff_beginning= old_buffer->offset;
  new_buff_beginning+= old_buffer->size; /* increase offset */

  DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
  DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
              LSN_FILE_NO(old_buffer->offset));
  translog_check_cursor(&log_descriptor.bc);
  DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
  if (left)
  {
    /*
      TODO: if 'left' is so small that can't hold any other record
      then do not move the page
    */
    DBUG_PRINT("info", ("left: %u", (uint) left));

    /* Remember where the real data ended before the page is padded */
    old_buffer->pre_force_close_horizon=
      old_buffer->offset + old_buffer->size;
    /* decrease offset */
    new_buff_beginning-= log_descriptor.bc.current_page_fill;
    current_page_fill= log_descriptor.bc.current_page_fill;

    /* Pad the rest of the page so the old buffer ends on a page boundary */
    memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
    old_buffer->size+= left;
    DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
                        "Size: %lu",
                        (uint) old_buffer->buffer_no,
                        (ulong) old_buffer,
                        (ulong) old_buffer->size));
    DBUG_ASSERT(old_buffer->buffer_no ==
                log_descriptor.bc.buffer_no);
  }
  else
  {
    log_descriptor.bc.current_page_fill= 0;
  }

  translog_buffer_lock(new_buffer);
#ifndef DBUG_OFF
  {
    /* Debug builds only: snapshot the buffer state for the assert below */
    TRANSLOG_ADDRESS offset= new_buffer->offset;
    TRANSLOG_FILE *file= new_buffer->file;
    uint8 ver= new_buffer->ver;
    translog_lock_assert_owner();
#endif
    translog_wait_for_buffer_free(new_buffer);
#ifndef DBUG_OFF
    /* We keep the handler locked so nobody can start this new buffer */
    DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL &&
                (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver);
  }
#endif

  /* These cursor fields are saved here and restored after the call below */
  write_counter= log_descriptor.bc.write_counter;
  previous_offset= log_descriptor.bc.previous_offset;
  translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no);
  /* Fix buffer offset (which was incorrectly set to horizon) */
  log_descriptor.bc.buffer->offset= new_buff_beginning;
  log_descriptor.bc.write_counter= write_counter;
  log_descriptor.bc.previous_offset= previous_offset;
  new_buffer->prev_last_lsn= BUFFER_MAX_LSN(old_buffer);
  DBUG_PRINT("info", ("prev_last_lsn set to (%lu,0x%lx) buffer: 0x%lx",
                      LSN_IN_PARTS(new_buffer->prev_last_lsn),
                      (ulong) new_buffer));

  /*
    Advances this log pointer, increases writers and let other threads to
    write to the log while we process old page content
  */
  if (left)
  {
    log_descriptor.bc.ptr+= current_page_fill;
    log_descriptor.bc.buffer->size= log_descriptor.bc.current_page_fill=
      current_page_fill;
    new_buffer->overlay= 1;
  }
  else
    translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
  translog_buffer_increase_writers(new_buffer);
  translog_buffer_unlock(new_buffer);

  /*
    We have to wait until all writers finish before start changing the
    pages by applying protection and copying the page content in the
    new buffer.
  */
#ifndef DBUG_OFF
  {
    /* Debug builds only: snapshot the buffer state for the assert below */
    TRANSLOG_ADDRESS offset= old_buffer->offset;
    TRANSLOG_FILE *file= old_buffer->file;
    uint8 ver= old_buffer->ver;
#endif
    /*
      Only one thread at a time can flush the log (a buffer can be flushed
      by many threads, but the log flush in which this function is used is
      done by one thread only), so no other thread can set
      is_closing_buffer.
    */
    DBUG_ASSERT(!old_buffer->is_closing_buffer);
    old_buffer->is_closing_buffer= 1; /* Other flushes will wait */
    DBUG_PRINT("enter", ("Buffer #%u 0x%lx is_closing_buffer set",
                         (uint) old_buffer->buffer_no, (ulong) old_buffer));
    translog_wait_for_writers(old_buffer);
#ifndef DBUG_OFF
    /* We blocked flushing this buffer so the buffer should not changed */
    DBUG_ASSERT(offset == old_buffer->offset && file == old_buffer->file &&
                ver == old_buffer->ver);
  }
#endif

  if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION)
  {
    translog_put_sector_protection(data, &log_descriptor.bc);
    if (left)
    {
      log_descriptor.bc.write_counter++;
      log_descriptor.bc.previous_offset= current_page_fill;
    }
    else
    {
      DBUG_PRINT("info", ("drop write_counter"));
      log_descriptor.bc.write_counter= 0;
      log_descriptor.bc.previous_offset= 0;
    }
  }

  if (log_descriptor.flags & TRANSLOG_PAGE_CRC)
  {
    /* CRC of the page content after the page overhead */
    uint32 crc= translog_crc(data + log_descriptor.page_overhead,
                             TRANSLOG_PAGE_SIZE -
                             log_descriptor.page_overhead);
    DBUG_PRINT("info", ("CRC: 0x%lx", (ulong) crc));
    int4store(data + 3 + 3 + 1, crc);
  }
  old_buffer->is_closing_buffer= 0;
  DBUG_PRINT("enter", ("Buffer #%u 0x%lx is_closing_buffer cleared",
                       (uint) old_buffer->buffer_no, (ulong) old_buffer));
  pthread_cond_broadcast(&old_buffer->waiting_filling_buffer);

  if (left)
  {
    if (log_descriptor.flags &
        (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
      memcpy(new_buffer->buffer, data, current_page_fill);
    else
    {
      /*
        This page header does not change if we add more data to the page so
        we can not copy it and will not overwrite later
      */
      new_buffer->skipped_data= current_page_fill;
#ifndef DBUG_OFF
      memset(new_buffer->buffer, 0xa5, current_page_fill);
#endif
      DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
    }
  }
  /* Link the two buffers and release the writer reference taken above */
  old_buffer->next_buffer_offset= new_buffer->offset;
  translog_buffer_lock(new_buffer);
  new_buffer->prev_buffer_offset= old_buffer->offset;
  translog_buffer_decrease_writers(new_buffer);
  translog_buffer_unlock(new_buffer);

  DBUG_VOID_RETURN;
}


/**
  @brief Waits while given lsn will be flushed

  The caller must own log_descriptor.log_flush_lock; the wait is done on
  log_descriptor.log_flush_cond until log_descriptor.flushed is not less
  than 'lsn'.

  @param  lsn            log record serial number up to which (inclusive)
                         the log has to be flushed
*/

void translog_flush_wait_for_end(LSN lsn)
{
  DBUG_ENTER("translog_flush_wait_for_end");
  DBUG_PRINT("enter", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
  safe_mutex_assert_owner(&log_descriptor.log_flush_lock);
  while (cmp_translog_addr(log_descriptor.flushed, lsn) < 0)
    pthread_cond_wait(&log_descriptor.log_flush_cond,
                      &log_descriptor.log_flush_lock);
  DBUG_VOID_RETURN;
}


/**
  @brief Sets goal for the next flush pass and waits for this pass end.

  The caller must own log_descriptor.log_flush_lock.  The goal is raised
  only if 'lsn' is above the goal already registered; the function then
  waits until log_descriptor.flush_no changes from the value it had on
  entry.

  @param  lsn            log record serial number up to which (inclusive)
                         the log has to be flushed
*/

void translog_flush_set_new_goal_and_wait(TRANSLOG_ADDRESS lsn)
{
  int flush_no= log_descriptor.flush_no;
  DBUG_ENTER("translog_flush_set_new_goal_and_wait");
  DBUG_PRINT("enter", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
  safe_mutex_assert_owner(&log_descriptor.log_flush_lock);
  if (cmp_translog_addr(lsn, log_descriptor.next_pass_max_lsn) > 0)
  {
    log_descriptor.next_pass_max_lsn= lsn;
    log_descriptor.max_lsn_requester= pthread_self();
    pthread_cond_broadcast(&log_descriptor.new_goal_cond);
  }
  while (flush_no == log_descriptor.flush_no)
  {
    pthread_cond_wait(&log_descriptor.log_flush_cond,
                      &log_descriptor.log_flush_lock);
  }
  DBUG_VOID_RETURN;
}


/**
  @brief sync() range of files (inclusive) and directory (by request)

  @param min             min internal file number to flush
  @param max             max internal file number to flush
  @param sync_dir        need sync directory

  @return Operation status
    @retval 0  OK
    @retval 1  Error
*/

static my_bool translog_sync_files(uint32 min, uint32 max,
                                   my_bool sync_dir)
{
  uint fn;
  my_bool rc= 0;
  ulonglong flush_interval;
  DBUG_ENTER("translog_sync_files");
  DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d",
                      (ulong) min, (ulong) max, (int) sync_dir));
  DBUG_ASSERT(min <= max);

  /*
    NOTE(review): flush_start is not declared in this function and
    flush_interval is not used further here; presumably flush_start is a
    file-scope variable - confirm against the rest of the file.
  */
  flush_interval= group_commit_wait;
  if (flush_interval)
    flush_start= my_micro_time();
  for (fn= min; fn <= max; fn++)
  {
    TRANSLOG_FILE *file= get_logfile_by_number(fn);
    DBUG_ASSERT(file != NULL);
    if (!file->is_sync)
    {
      if (my_sync(file->handler.file, MYF(MY_WME)))
      {
        /* A failed sync stops log writing altogether */
        rc= 1;
        translog_stop_writing();
        DBUG_RETURN(rc);
      }
      translog_syncs++;
      file->is_sync= 1;
    }
  }

  if (sync_dir)
  {
    /*
      NOTE(review): 'sync_dir' is also the name of the my_bool parameter;
      the call below presumably resolves to a function-like macro of the
      same name - verify it is defined for this file.
    */
    if (!(rc= sync_dir(log_descriptor.directory_fd,
                       MYF(MY_WME | MY_IGNORE_BADFD))))
      translog_syncs++;
  }

  DBUG_RETURN(rc);
}


/*
  @brief Flushes buffers with LSNs in them less or equal address <lsn>

  @param lsn             address up to which all LSNs should be flushed,
                         can be reset to real last LSN address
  @param sent_to_disk    returns 'sent to disk' position
  @param flush_horizon   returns horizon of the flush

  @note About terminology see comment to translog_flush().
*/

void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
                            TRANSLOG_ADDRESS *sent_to_disk,
                            TRANSLOG_ADDRESS *flush_horizon)
{
  dirty_buffer_mask_t dirty_buffer_mask;
  uint i;
  uint8 last_buffer_no, start_buffer_no;
  DBUG_ENTER("translog_flush_buffers");

  /*
    We will recheck information when will lock buffers one by
    one so we can use unprotected read here (this is just for
    speed up buffers processing)
  */
  dirty_buffer_mask= log_descriptor.dirty_buffer_mask;
  DBUG_PRINT("info", ("Dirty buffer mask: %lx current buffer: %u",
                      (ulong) dirty_buffer_mask,
                      (uint) log_descriptor.bc.buffer_no));
  for (i= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO;
       i != log_descriptor.bc.buffer_no && !(dirty_buffer_mask & (1 << i));
       i= (i + 1) % TRANSLOG_BUFFERS_NO) {}
  start_buffer_no= i;

  DBUG_PRINT("info",
             ("start from: %u current: %u prev last lsn: (%lu,0x%lx)",
              (uint) start_buffer_no, (uint) log_descriptor.bc.buffer_no,
LSN_IN_PARTS(log_descriptor.bc.buffer->prev_last_lsn))); + + + /* + if LSN up to which we have to flush bigger then maximum LSN of previous + buffer and at least one LSN was saved in the current buffer (last_lsn != + LSN_IMPOSSIBLE) then we have to close the current buffer. + */ + if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 && + log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE) + { + struct st_translog_buffer *buffer= log_descriptor.bc.buffer; + *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */ + DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn))); + last_buffer_no= log_descriptor.bc.buffer_no; + log_descriptor.is_everything_flushed= 1; + translog_force_current_buffer_to_finish(); + translog_buffer_unlock(buffer); + } + else + { + last_buffer_no= ((log_descriptor.bc.buffer_no + TRANSLOG_BUFFERS_NO -1) % + TRANSLOG_BUFFERS_NO); + translog_unlock(); + } + + /* flush buffers */ + *sent_to_disk= translog_get_sent_to_disk(); + if (cmp_translog_addr(*lsn, *sent_to_disk) > 0) + { + + DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u", + (uint) start_buffer_no, (uint) last_buffer_no)); + last_buffer_no= (last_buffer_no + 1) % TRANSLOG_BUFFERS_NO; + i= start_buffer_no; + do + { + struct st_translog_buffer *buffer= log_descriptor.buffers + i; + translog_buffer_lock(buffer); + DBUG_PRINT("info", ("Check buffer: 0x%lx #: %u " + "prev last LSN: (%lu,0x%lx) " + "last LSN: (%lu,0x%lx) status: %s", + (ulong)(buffer), + (uint) i, + LSN_IN_PARTS(buffer->prev_last_lsn), + LSN_IN_PARTS(buffer->last_lsn), + (buffer->file ? + "dirty" : "closed"))); + if (buffer->prev_last_lsn <= *lsn && + buffer->file != NULL) + { + DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size); + *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ? 
+ buffer->pre_force_close_horizon : + buffer->offset + buffer->size); + /* pre_force_close_horizon is reset during new buffer start */ + DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)", + LSN_IN_PARTS(*flush_horizon))); + DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon); + + translog_buffer_flush(buffer); + } + translog_buffer_unlock(buffer); + i= (i + 1) % TRANSLOG_BUFFERS_NO; + } while (i != last_buffer_no); + *sent_to_disk= translog_get_sent_to_disk(); + } + + DBUG_VOID_RETURN; +} + +/** + @brief Flush the log up to given LSN (included) + + @param lsn log record serial number up to which (inclusive) + the log has to be flushed + + @return Operation status + @retval 0 OK + @retval 1 Error + + @note + + - Non group commit logic: Commits made in passes. Thread which started + flush first is performing actual flush, other threads sets new goal (LSN) + of the next pass (if it is maximum) and waits for the pass end or just + wait for the pass end. + + - If hard group commit enabled and rate set to zero: + The first thread sends all changed buffers to disk. This is repeated + as long as there are new LSNs added. The process can not loop + forever because we have limited number of threads and they will wait + for the data to be synced. + Pseudo code: + + do + send changed buffers to disk + while new_goal + sync + + - If hard group commit switched ON and less than rate microseconds has + passed from last sync, then after buffers have been sent to disk + wait until rate microseconds has passed since last sync, do sync and return. + This ensures that if we call sync infrequently we don't do any waits. + + - If soft group commit enabled everything works as with 'non group commit' + but the thread doesn't do any real sync(). If rate is not zero the + sync() will be performed by a service thread with the given rate + when needed (new LSN appears). + + @note Terminology: + 'sent to disk' means written to disk but not sync()ed, + 'flushed' mean sent to disk and synced(). 
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+  struct timespec abstime;
+  ulonglong flush_interval;
+  ulonglong time_spent;
+  LSN sent_to_disk= LSN_IMPOSSIBLE;
+  TRANSLOG_ADDRESS flush_horizon;
+  my_bool rc= 0;
+  my_bool hgroup_commit_at_start;
+  DBUG_ENTER("translog_flush");
+  DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  LINT_INIT(sent_to_disk);
+  LINT_INIT(flush_interval);
+
+  pthread_mutex_lock(&log_descriptor.log_flush_lock);
+  DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.flushed)));
+  if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+  {
+    pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+    DBUG_RETURN(0); /* nothing to do: already flushed up to lsn */
+  }
+  if (log_descriptor.flush_in_progress)
+  {
+    translog_lock();
+    /* fix lsn if it was horizon */
+    if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+      lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+    translog_unlock();
+    translog_flush_set_new_goal_and_wait(lsn);
+    if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+    {
+      /*
+        translog_flush_wait_for_end() releases log_flush_lock while it is
+        waiting and then acquires it again
+      */
+      translog_flush_wait_for_end(lsn);
+      pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+      DBUG_RETURN(0);
+    }
+    log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE; /* this thread performs the next pass itself */
+  }
+  log_descriptor.flush_in_progress= 1;
+  flush_horizon= log_descriptor.previous_flush_horizon;
+  DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+                      LSN_IN_PARTS(flush_horizon)));
+  pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+  hgroup_commit_at_start= hard_group_commit; /* mode is sampled once and used for the whole call */
+  if (hgroup_commit_at_start)
+    flush_interval= group_commit_wait;
+
+  translog_lock();
+  if (log_descriptor.is_everything_flushed)
+  {
+    DBUG_PRINT("info", ("everything is flushed"));
+    translog_unlock();
+    pthread_mutex_lock(&log_descriptor.log_flush_lock);
+    goto out;
+  }
+
+  for (;;)
+  {
+    /* The following function flushes buffers and does translog_unlock() */
+    translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+    if (!hgroup_commit_at_start)
+      break;  /* flush pass is ended */
+
+retest:
+    /*
+      We do not check time here because pthread_mutex_lock rarely takes
+      a lot of time so we can sacrifice a bit precision to performance
+      (taking into account that my_micro_time() might be expensive call).
+    */
+    if (flush_interval == 0)
+      break;  /* flush pass is ended */
+
+    pthread_mutex_lock(&log_descriptor.log_flush_lock);
+    if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE)
+    {
+      if (flush_interval == 0 ||
+          (time_spent= (my_micro_time() - flush_start)) >= flush_interval)
+      {
+        pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+        break;
+      }
+      DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu",
+                          flush_interval - time_spent,
+                          flush_interval, time_spent));
+      /*
+        wait time or next goal.
+        NOTE(review): flush_interval and time_spent are derived from
+        my_micro_time(), whereas the name set_timespec_nsec suggests a
+        nanosecond argument -- confirm that the units match.
+      */
+      set_timespec_nsec(abstime, flush_interval - time_spent);
+      pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+                             &log_descriptor.log_flush_lock,
+                             &abstime);
+      pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+      DBUG_PRINT("info", ("retest conditions"));
+      goto retest;
+    }
+
+    /* take next goal */
+    lsn= log_descriptor.next_pass_max_lsn;
+    log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+    /* prevent other threads from continuing */
+    log_descriptor.max_lsn_requester= pthread_self();
+    DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+                        LSN_IN_PARTS(lsn)));
+    pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+    /* next flush pass */
+    DBUG_PRINT("info", ("next flush pass"));
+    translog_lock();
+  }
+
+  /*
+    sync() files from previous flush till current one.
+    The directory is synced too only when sync_log_dir requests it and the
+    flush horizon moved to another file or another page.
+  */
+  if (!soft_sync || hgroup_commit_at_start)
+  {
+    if ((rc=
+         translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+                             LSN_FILE_NO(lsn),
+                             sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+                             (LSN_FILE_NO(log_descriptor.
+                                          previous_flush_horizon) !=
+                              LSN_FILE_NO(flush_horizon) ||
+                              (LSN_OFFSET(log_descriptor.
+                                          previous_flush_horizon) /
+                               TRANSLOG_PAGE_SIZE) !=
+                              (LSN_OFFSET(flush_horizon) /
+                               TRANSLOG_PAGE_SIZE)))))
+    {
+      sent_to_disk= LSN_IMPOSSIBLE; /* on sync error 'flushed' is not advanced at out: */
+      pthread_mutex_lock(&log_descriptor.log_flush_lock);
+      goto out;
+    }
+    /* keep values for soft sync() and forced sync() actual */
+    {
+      uint32 fileno= LSN_FILE_NO(lsn);
+      soft_sync_min= fileno;
+      soft_sync_max= fileno;
+    }
+  }
+  else
+  {
+    soft_sync_max= LSN_FILE_NO(lsn); /* leave the real sync to the soft sync machinery */
+    soft_need_sync= 1;
+  }
+
+  DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+  pthread_mutex_lock(&log_descriptor.log_flush_lock);
+  log_descriptor.previous_flush_horizon= flush_horizon;
+out: /* log_flush_lock is held on every path that reaches this label */
+  if (sent_to_disk != LSN_IMPOSSIBLE)
+    log_descriptor.flushed= sent_to_disk;
+  log_descriptor.flush_in_progress= 0;
+  log_descriptor.flush_no++; /* lets waiters in translog_flush_set_new_goal_and_wait() leave their loop */
+  DBUG_PRINT("info", ("flush_in_progress is dropped"));
+  pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+  pthread_cond_broadcast(&log_descriptor.log_flush_cond);
+  DBUG_RETURN(rc);
+}
+
+
+/**
+  @brief Gives a 2-byte-id to MARIA_SHARE and logs this fact
+
+  If a MARIA_SHARE does not yet have a 2-byte-id (unique over all currently
+  open MARIA_SHAREs), give it one and record this assignment in the log
+  (LOGREC_FILE_ID log record).
+
+  @param  tbl_info        table
+  @param  trn             calling transaction
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error
+
+  @note Can be called even if share already has an id (then will do nothing)
+
+  @note share->intern_lock is taken inside and released before returning,
+  on both the success and the error path.
+*/
+
+int translog_assign_id_to_share(MARIA_HA *tbl_info, TRN *trn)
+{
+  MARIA_SHARE *share= tbl_info->s;
+  /*
+    If you give an id to a non-BLOCK_RECORD table, you also need to release
+    this id somewhere. Then you can change the assertion.
+  */
+  DBUG_ASSERT(share->data_file_type == BLOCK_RECORD);
+  /* re-check under mutex to avoid having 2 ids for the same share */
+  pthread_mutex_lock(&share->intern_lock);
+  if (unlikely(share->id == 0))
+  {
+    LSN lsn;
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+    uchar log_data[FILEID_STORE_SIZE];
+    /*
+      Inspired by set_short_trid() of trnman.c.
+      The first scan starts at a slot derived from the key file descriptor;
+      if no free slot is found up to SHARE_ID_MAX, the scan is repeated from
+      slot 1 until an id is obtained.
+    */
+    uint i= share->kfile.file % SHARE_ID_MAX + 1;
+    do
+    {
+      my_atomic_rwlock_wrlock(&LOCK_id_to_share);
+      for ( ; i <= SHARE_ID_MAX ; i++) /* the range is [1..SHARE_ID_MAX] */
+      {
+        void *tmp= NULL;
+        if (id_to_share[i] == NULL &&
+            my_atomic_casptr((void **)&id_to_share[i], &tmp, share))
+        {
+          share->id= (uint16)i;
+          break;
+        }
+      }
+      my_atomic_rwlock_wrunlock(&LOCK_id_to_share);
+      i= 1; /* scan the whole array */
+    } while (share->id == 0);
+    DBUG_PRINT("info", ("id_to_share: 0x%lx -> %u", (ulong)share, share->id));
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    /*
+      open_file_name is an unresolved name (symlinks are not resolved, datadir
+      is not realpath-ed, etc) which is good: the log can be moved to another
+      directory and continue working.
+    */
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=
+      (uchar *)share->open_file_name.str;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length=
+      share->open_file_name.length + 1; /* + 1: one byte more than the name length is logged */
+    /*
+      We can't unlock share->intern_lock before the log entry is written to
+      ensure no one uses the id before it's logged.
+    */
+    if (unlikely(translog_write_record(&lsn, LOGREC_FILE_ID, trn, tbl_info,
+                                       (translog_size_t)
+                                       (sizeof(log_data) +
+                                        log_array[TRANSLOG_INTERNAL_PARTS +
+                                                  1].length),
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, log_data, NULL)))
+    {
+      pthread_mutex_unlock(&share->intern_lock);
+      return 1;
+    }
+  }
+  pthread_mutex_unlock(&share->intern_lock);
+  return 0;
+}
+
+
+/**
+  @brief Recycles a MARIA_SHARE's short id.
+
+  @param  share           table
+
+  @note Must be called only if share has an id (i.e.
id != 0)
+
+  @note The caller must own share->intern_lock (asserted below). Besides
+  clearing the id_to_share[] slot and share->id, lsn_of_file_id is reset.
+*/
+
+void translog_deassign_id_from_share(MARIA_SHARE *share)
+{
+  DBUG_PRINT("info", ("id_to_share: 0x%lx id %u -> 0",
+                      (ulong)share, share->id));
+  /*
+    We don't need any mutex as we are called only when closing the last
+    instance of the table or at the end of REPAIR: no writes can be
+    happening. But a Checkpoint may be reading share->id, so we require this
+    mutex:
+  */
+  safe_mutex_assert_owner(&share->intern_lock);
+  my_atomic_rwlock_rdlock(&LOCK_id_to_share);
+  my_atomic_storeptr((void **)&id_to_share[share->id], 0);
+  my_atomic_rwlock_rdunlock(&LOCK_id_to_share);
+  share->id= 0;
+  /* useless but safety: */
+  share->lsn_of_file_id= LSN_IMPOSSIBLE;
+}
+
+/**
+  @brief Registers a share under the given short id (recovery variant)
+
+  @param  share           table share; asserted not to have an id yet
+  @param  id              short id to assign; its id_to_share[] slot is
+                          asserted to be empty
+
+  @note No locking and no logging is done here; the assertions require
+  maria_in_recovery to be set and maria_multi_threaded to be unset.
+*/
+void translog_assign_id_to_share_from_recovery(MARIA_SHARE *share,
+                                               uint16 id)
+{
+  DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded);
+  DBUG_ASSERT(share->data_file_type == BLOCK_RECORD);
+  DBUG_ASSERT(share->id == 0);
+  DBUG_ASSERT(id_to_share[id] == NULL);
+  id_to_share[share->id= id]= share; /* sets share->id and fills the slot in one statement */
+}
+
+
+/**
+  @brief check if such log file exists
+
+  @param  file_no         number of the file to test
+
+  @retval 0      no such file
+  @retval 1      there is file with such number
+
+  @note The check is a my_stat() on the name built by
+  translog_filename_by_fileno().
+*/
+
+my_bool translog_is_file(uint file_no)
+{
+  MY_STAT stat_buff;
+  char path[FN_REFLEN];
+  return (test(my_stat(translog_filename_by_fileno(file_no, path),
+                       &stat_buff, MYF(0))));
+}
+
+
+/**
+  @brief returns minimum log file number
+
+  @param  horizon         the end of the log
+  @param  is_protected    true if it is under purge_log protection
+
+  @retval minimum file number
+  @retval 0 no files found
+
+  @note When is_protected is false, log_descriptor.purger_lock is taken and
+  released inside. The result is cached in log_descriptor.min_file_number;
+  a cached value is returned as long as translog_is_file() confirms that
+  the file still exists.
+
+  @note NOTE(review): the final assertion requires the result to be >= 1,
+  so the documented 0 result can only be returned in builds where
+  DBUG_ASSERT is compiled out -- confirm that this is intended.
+*/
+
+static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected)
+{
+  uint min_file= 0, max_file;
+  DBUG_ENTER("translog_first_file");
+  if (!is_protected)
+    pthread_mutex_lock(&log_descriptor.purger_lock);
+  if (log_descriptor.min_file_number &&
+      translog_is_file(log_descriptor.min_file_number))
+  {
+    DBUG_PRINT("info", ("cached %lu",
+                        (ulong) log_descriptor.min_file_number));
+    if (!is_protected)
+      pthread_mutex_unlock(&log_descriptor.purger_lock);
+    DBUG_RETURN(log_descriptor.min_file_number);
+  }
+
+  max_file= LSN_FILE_NO(horizon);
+
+  /*
+    binary search for the first (lowest numbered) existing file:
+    max_file is moved down to every probed file that exists, min_file is
+    moved up to every probed file that does not exist
+  */
+  while (min_file != max_file && min_file != (max_file - 1))
+  {
+    uint test= (min_file + max_file) / 2;
+    DBUG_PRINT("info", ("min_file: %u test: %u max_file: %u",
+                        min_file, test, max_file));
+    if (test == max_file)
+      test--;
+    if (translog_is_file(test))
+      max_file= test;
+    else
+      min_file= test;
+  }
+  log_descriptor.min_file_number= max_file;
+  if (!is_protected)
+    pthread_mutex_unlock(&log_descriptor.purger_lock);
+  DBUG_PRINT("info", ("first file :%lu", (ulong) max_file));
+  DBUG_ASSERT(max_file >= 1);
+  DBUG_RETURN(max_file);
+}
+
+
+/**
+  @brief returns the closest LSN higher than the given chunk address
+
+  @param  addr            the chunk address to start from
+  @param  horizon         the horizon if it is known or LSN_IMPOSSIBLE
+
+  @retval LSN_ERROR       Error
+  @retval LSN_IMPOSSIBLE  no LSNs after the address
+  @retval #               LSN of the closest LSN higher than the given
+                          chunk address
+*/
+
+LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon)
+{
+  TRANSLOG_SCANNER_DATA scanner;
+  LSN result;
+  DBUG_ENTER("translog_next_LSN");
+
+  if (horizon == LSN_IMPOSSIBLE)
+    horizon= translog_get_horizon();
+
+  if (addr == horizon)
+    DBUG_RETURN(LSN_IMPOSSIBLE);
+
+  translog_scanner_init(addr, 0, &scanner, 1);
+  /*
+    addr can point not to a chunk beginning but page end so next
+    page beginning.
+  */
+  if (addr % TRANSLOG_PAGE_SIZE == 0)
+  {
+    /*
+      We are emulating the page end which caused such horizon value to
+      trigger translog_scanner_eop().
+
+      We can't just increase addr on page header overhead because it
+      can be file end so we allow translog_get_next_chunk() to skip
+      to the next page in correct way
+    */
+    scanner.page_addr-= TRANSLOG_PAGE_SIZE;
+    scanner.page_offset= TRANSLOG_PAGE_SIZE;
+#ifndef DBUG_OFF
+    scanner.page= NULL; /* prevent using incorrect page content */
+#endif
+  }
+  /* addr can point not to a chunk beginning but to a page end */
+  if (translog_scanner_eop(&scanner))
+  {
+    if (translog_get_next_chunk(&scanner))
+    {
+      result= LSN_ERROR;
+      goto out;
+    }
+    if (scanner.page == END_OF_LOG)
+    {
+      result= LSN_IMPOSSIBLE;
+      goto out;
+    }
+  }
+
+  while (!translog_is_LSN_chunk(scanner.page[scanner.page_offset]) &&
+         scanner.page[scanner.page_offset] != TRANSLOG_FILLER)
+  {
+    if (translog_get_next_chunk(&scanner))
+    {
+      result= LSN_ERROR;
+      goto out;
+    }
+    if (scanner.page == END_OF_LOG)
+    {
+      result= LSN_IMPOSSIBLE;
+      goto out;
+    }
+  }
+
+  if (scanner.page[scanner.page_offset] == TRANSLOG_FILLER)
+    result= LSN_IMPOSSIBLE; /* reached page filler */
+  else
+    result= scanner.page_addr + scanner.page_offset;
+out:
+  translog_destroy_scanner(&scanner);
+  DBUG_RETURN(result);
+}
+
+
+/**
+  @brief returns the LSN of the first record starting in this log
+
+  @retval LSN_ERROR       Error
+  @retval LSN_IMPOSSIBLE  no log or the log is empty
+  @retval #               LSN of the first record
+*/
+
+LSN translog_first_lsn_in_log()
+{
+  TRANSLOG_ADDRESS addr, horizon= translog_get_horizon();
+  TRANSLOG_VALIDATOR_DATA data;
+  uint file;
+  uint16 chunk_offset;
+  uchar *page;
+  DBUG_ENTER("translog_first_lsn_in_log");
+  DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)", LSN_IN_PARTS(horizon)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  if (!(file= translog_first_file(horizon, 0)))
+  {
+    /* log has no records yet */
+    DBUG_RETURN(LSN_IMPOSSIBLE);
+  }
+
+  addr= MAKE_LSN(file, TRANSLOG_PAGE_SIZE); /* the first page of the file */
+  data.addr= &addr;
+  {
+    TRANSLOG_PAGE_SIZE_BUFF psize_buff;
+    if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL ||
+        (chunk_offset= translog_get_first_chunk_offset(page)) == 0)
+      DBUG_RETURN(LSN_ERROR);
+  }
+  addr+= chunk_offset;
+
+  DBUG_RETURN(translog_next_LSN(addr, horizon));
+}
+
+
+/**
+  @brief Returns theoretical first LSN if first log is present
+
+  @retval LSN_ERROR       Error
+  @retval LSN_IMPOSSIBLE  no log
+  @retval #               LSN of the first record
+
+  @note Only log file number 1 is examined.
+*/
+
+LSN translog_first_theoretical_lsn()
+{
+  TRANSLOG_ADDRESS addr= translog_get_horizon();
+  TRANSLOG_PAGE_SIZE_BUFF psize_buff;
+  uchar *page;
+  TRANSLOG_VALIDATOR_DATA data;
+  DBUG_ENTER("translog_first_theoretical_lsn");
+  DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)", LSN_IN_PARTS(addr)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  if (!translog_is_file(1))
+    DBUG_RETURN(LSN_IMPOSSIBLE);
+  if (addr == MAKE_LSN(1, TRANSLOG_PAGE_SIZE))
+  {
+    /* log has no records yet */
+    DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE +
+                         log_descriptor.page_overhead));
+  }
+
+  addr= MAKE_LSN(1, TRANSLOG_PAGE_SIZE); /* the first page of the file */
+  data.addr= &addr;
+  if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL)
+    DBUG_RETURN(LSN_ERROR);
+
+  DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE +
+                       page_overhead[page[TRANSLOG_PAGE_FLAGS]]));
+}
+
+
+/**
+  @brief Checks given low water mark and purges files if it is needed
+
+  @param  low             the last (minimum) address which is needed
+
+  @retval 0      OK
+  @retval 1      Error
+
+  @note If soft sync is on and soft_sync_min is lower than the file of
+  'low', files starting from soft_sync_min are kept as well. Files are
+  deleted here only when log_purge_type is TRANSLOG_PURGE_IMMIDIATE; in any
+  case log_descriptor.min_need_file is updated (0 after an error).
+*/
+
+my_bool translog_purge(TRANSLOG_ADDRESS low)
+{
+  uint32 last_need_file= LSN_FILE_NO(low);
+  uint32 min_unsync;
+  int soft;
+  TRANSLOG_ADDRESS horizon= translog_get_horizon();
+  int rc= 0;
+  DBUG_ENTER("translog_purge");
+  DBUG_PRINT("enter", ("low: (%lu,0x%lx)", LSN_IN_PARTS(low)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  soft= soft_sync;
+  min_unsync= soft_sync_min;
+  DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
+  if (soft && min_unsync < last_need_file)
+  {
+    last_need_file= min_unsync;
+    DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
+  }
+
+  pthread_mutex_lock(&log_descriptor.purger_lock);
+  DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
+                      (ulong) log_descriptor.last_lsn_checked));
+  if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
+  {
+    uint32 i;
+    uint32 min_file= translog_first_file(horizon, 1);
+    DBUG_ASSERT(min_file != 0); /* log is already started */
+    DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file));
+    for(i= min_file; i < last_need_file && rc == 0; i++)
+    {
+      LSN lsn= translog_get_file_max_lsn_stored(i);
+      if (lsn == LSN_IMPOSSIBLE)
+        break; /* files are still in writing */
+      if (lsn == LSN_ERROR)
+      {
+        rc= 1;
+        break;
+      }
+      if (cmp_translog_addr(lsn, low) >= 0)
+        break; /* this file still holds an LSN at or above the low water mark */
+
+      DBUG_PRINT("info", ("purge file %lu", (ulong) i));
+
+      /* remove file descriptor from the cache */
+      /*
+        log_descriptor.min_file can be changed only here during execution
+        and the function is serialized, so we can access it without problems
+      */
+      if (i >= log_descriptor.min_file)
+      {
+        TRANSLOG_FILE *file;
+        rw_wrlock(&log_descriptor.open_files_lock);
+        DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+                    log_descriptor.open_files.elements);
+        DBUG_ASSERT(log_descriptor.min_file == i);
+        file= *((TRANSLOG_FILE **)pop_dynamic(&log_descriptor.open_files));
+        DBUG_PRINT("info", ("Files : %d", log_descriptor.open_files.elements));
+        DBUG_ASSERT(i == file->number);
+        log_descriptor.min_file++;
+        DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+                    log_descriptor.open_files.elements);
+        rw_unlock(&log_descriptor.open_files_lock);
+        translog_close_log_file(file);
+      }
+      if (log_purge_type == TRANSLOG_PURGE_IMMIDIATE)
+      {
+        char path[FN_REFLEN], *file_name;
+        file_name= translog_filename_by_fileno(i, path);
+        rc= test(my_delete(file_name, MYF(MY_WME)));
+      }
+    }
+    if (unlikely(rc == 1))
+      log_descriptor.min_need_file= 0; /* impossible value */
+    else
+      log_descriptor.min_need_file= i;
+  }
+
+  pthread_mutex_unlock(&log_descriptor.purger_lock);
+  DBUG_RETURN(rc);
+}
+
+
+/**
+  @brief Purges files by stored min need file in case of
+    "ondemand" purge type
+
+  @note This function does real work only if it is "ondemand" purge type
+    and translog_purge() was called at least once and last time without
+    errors (log_descriptor.min_need_file is not 0). It does nothing when
+    the log is read only.
+
+  @retval 0      OK
+  @retval 1      Error
+*/
+
+my_bool translog_purge_at_flush()
+{
+  uint32 i, min_file;
+  int rc= 0;
+  DBUG_ENTER("translog_purge_at_flush");
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  if (unlikely(translog_status == TRANSLOG_READONLY))
+  {
+    DBUG_PRINT("info", ("The log is read only => exit"));
+    DBUG_RETURN(0);
+  }
+
+  if (log_purge_type != TRANSLOG_PURGE_ONDEMAND)
+  {
+    DBUG_PRINT("info", ("It is not \"at_flush\" => exit"));
+    DBUG_RETURN(0);
+  }
+
+  pthread_mutex_lock(&log_descriptor.purger_lock);
+
+  if (unlikely(log_descriptor.min_need_file == 0))
+  {
+    DBUG_PRINT("info", ("No info about min need file => exit"));
+    pthread_mutex_unlock(&log_descriptor.purger_lock);
+    DBUG_RETURN(0);
+  }
+
+  min_file= translog_first_file(translog_get_horizon(), 1);
+  DBUG_ASSERT(min_file != 0); /* log is already started */
+  for(i= min_file; i < log_descriptor.min_need_file && rc == 0; i++)
+  {
+    char path[FN_REFLEN], *file_name;
+    DBUG_PRINT("info", ("purge file %lu\n", (ulong) i));
+    file_name= translog_filename_by_fileno(i, path);
+    rc= test(my_delete(file_name, MYF(MY_WME)));
+  }
+
+  pthread_mutex_unlock(&log_descriptor.purger_lock);
+  DBUG_RETURN(rc);
+}
+
+
+/**
+  @brief Gets min file number
+
+  @param  horizon         the end of the log
+
+  @retval minimum file number
+  @retval 0 no files found
+
+  @note Thin wrapper: calls translog_first_file() with is_protected = 0.
+*/
+
+uint32 translog_get_first_file(TRANSLOG_ADDRESS horizon)
+{
+  return translog_first_file(horizon, 0);
+}
+
+
+/**
+  @brief Gets min file number which is needed
+
+  @retval minimum file number
+  @retval 0
unknown
+
+  @note Reads log_descriptor.min_need_file under purger_lock.
+*/
+
+uint32 translog_get_first_needed_file()
+{
+  uint32 file_no;
+  pthread_mutex_lock(&log_descriptor.purger_lock);
+  file_no= log_descriptor.min_need_file;
+  pthread_mutex_unlock(&log_descriptor.purger_lock);
+  return file_no;
+}
+
+
+/**
+  @brief Gets transaction log file size
+
+  @return transaction log file size (log_descriptor.log_file_max_size,
+          read under the translog lock)
+*/
+
+uint32 translog_get_file_size()
+{
+  uint32 res;
+  translog_lock();
+  res= log_descriptor.log_file_max_size;
+  translog_unlock();
+  return (res);
+}
+
+
+/**
+  @brief Sets transaction log file size
+
+  @param  size            new maximum log file size; asserted to be a
+                          multiple of TRANSLOG_PAGE_SIZE and not less than
+                          TRANSLOG_MIN_FILE_SIZE
+
+  @note The function returns nothing. If the current horizon offset is
+  already at or beyond the new size, the current buffer is finished with
+  translog_buffer_next() and flushed after the translog lock has been
+  released.
+*/
+
+void translog_set_file_size(uint32 size)
+{
+  struct st_translog_buffer *old_buffer= NULL;
+  DBUG_ENTER("translog_set_file_size");
+  translog_lock();
+  DBUG_PRINT("enter", ("Size: %lu", (ulong) size));
+  DBUG_ASSERT(size % TRANSLOG_PAGE_SIZE == 0 &&
+              size >= TRANSLOG_MIN_FILE_SIZE);
+  log_descriptor.log_file_max_size= size;
+  /* if the current file is longer than the new size then finish it */
+  if (LSN_OFFSET(log_descriptor.horizon) >=  log_descriptor.log_file_max_size)
+  {
+    old_buffer= log_descriptor.bc.buffer;
+    translog_buffer_next(&log_descriptor.horizon, &log_descriptor.bc, 1);
+    translog_buffer_unlock(old_buffer);
+  }
+  translog_unlock();
+  if (old_buffer)
+  {
+    translog_buffer_lock(old_buffer);
+    translog_buffer_flush(old_buffer);
+    translog_buffer_unlock(old_buffer);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Write debug information to log if EXTRA_DEBUG is enabled
+
+  @param  trn             transaction; if NULL, dummy_transaction_object is
+                          used instead
+  @param  type            kind of debug information, stored as one byte
+  @param  info            data to log
+  @param  length          length of the data
+
+  @return result of translog_write_record() when EXTRA_DEBUG is defined,
+          otherwise always 0 (nothing is written)
+*/
+
+my_bool translog_log_debug_info(TRN *trn __attribute__((unused)),
+                                enum translog_debug_info_type type
+                                __attribute__((unused)),
+                                uchar *info __attribute__((unused)),
+                                size_t length __attribute__((unused)))
+{
+#ifdef EXTRA_DEBUG
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+  uchar debug_type;
+  LSN lsn;
+
+  if (!trn)
+  {
+    /*
+      We can't log the current transaction because we don't have
+      an active transaction. Use a temporary transaction object instead
+    */
+    trn= &dummy_transaction_object;
+  }
+  debug_type= (uchar) type;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= &debug_type;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= 1;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str= info;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
+  return translog_write_record(&lsn, LOGREC_DEBUG_INFO,
+                               trn, NULL,
+                               (translog_size_t) (1+ length),
+                               sizeof(log_array)/sizeof(log_array[0]),
+                               log_array, NULL, NULL);
+#else
+  return 0;
+#endif
+}
+
+
+
+/**
+  Sets soft sync mode
+
+  @param  mode            TRUE if we need switch soft sync on else off
+
+  @note Only stores the flag in soft_sync; nothing else is done here.
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+  soft_sync= mode;
+}
+
+
+/**
+  Sets hard group commit
+
+  @param  mode            TRUE if we need switch hard group commit on else off
+
+  @note Only stores the flag in hard_group_commit; nothing else is done here.
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+  hard_group_commit= mode;
+}
+
+
+/**
+  @brief forced log sync (used when we are switching modes)
+
+  @note Syncs files from soft_sync_min (or from the current log file if
+  soft_sync_min is 0) up to the current log file. The result of
+  translog_sync_files() is not checked and cannot be reported by this
+  void function. The DBUG_ENTER tag is "ma_translog_sync", which differs
+  from the function name.
+*/
+
+void translog_sync()
+{
+  uint32 max= get_current_logfile()->number;
+  uint32 min;
+  DBUG_ENTER("ma_translog_sync");
+
+  min= soft_sync_min;
+  if (!min)
+    min= max;
+
+  translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief set rate for group commit
+
+  @param interval  interval to set.
+
+  @note We use this function with an additional variable because we have to
+  restart the service thread with the new value, which we can't do inside
+  the variable changing routine (update_maria_group_commit_interval).
+  This function itself only stores the value in group_commit_wait.
+*/
+
+void translog_set_group_commit_interval(uint32 interval)
+{
+  DBUG_ENTER("translog_set_group_commit_interval");
+  group_commit_wait= interval;
+  DBUG_PRINT("info", ("wait: %llu",
+                      (ulonglong)group_commit_wait));
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief syncing service thread
+
+  Each loop iteration takes a snapshot of soft_sync_min, soft_sync_max and
+  soft_need_sync, advances soft_sync_min to the snapshot of soft_sync_max,
+  clears soft_need_sync and, if a sync was requested, syncs the files
+  [min, max] without syncing the directory. It then sleeps for what is left
+  of group_commit_wait via my_service_thread_sleep() and leaves the loop
+  when that call returns non-zero.
+
+  @note NOTE(review): the soft_sync_* variables are read and written here
+  without any lock visible in this function -- confirm that this is safe.
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+  my_thread_init();
+  {
+    DBUG_ENTER("ma_soft_sync_background");
+    for(;;)
+    {
+      ulonglong prev_loop= my_micro_time();
+      ulonglong time, sleep;
+      uint32 min, max, sync_request;
+      min= soft_sync_min;
+      max= soft_sync_max;
+      sync_request= soft_need_sync;
+      soft_sync_min= max;
+      soft_need_sync= 0;
+
+      sleep= group_commit_wait;
+      if (sync_request)
+        translog_sync_files(min, max, FALSE);
+      time= my_micro_time() - prev_loop;
+      if (time > sleep)
+        sleep= 0; /* the iteration already took longer than the interval */
+      else
+        sleep-= time;
+      if (my_service_thread_sleep(&soft_sync_control, sleep))
+        break;
+    }
+    my_service_thread_signal_end(&soft_sync_control);
+    my_thread_end();
+    DBUG_RETURN(0);
+  }
+}
+
+
+/**
+  @brief Starts syncing thread
+
+  @return 0 on success, otherwise the non-zero result of
+          ma_service_thread_control_init() or pthread_create()
+
+  @note Before the thread is created, soft_sync_max and soft_sync_min are
+  initialized from the current log file number if they are still 0, and
+  soft_need_sync is set.
+*/
+
+int translog_soft_sync_start(void)
+{
+  pthread_t th;
+  int res= 0;
+  uint32 min, max;
+  DBUG_ENTER("translog_soft_sync_start");
+
+  /* check and init variables */
+  min= soft_sync_min;
+  max= soft_sync_max;
+  if (!max)
+    soft_sync_max= max= get_current_logfile()->number;
+  if (!min)
+    soft_sync_min= max;
+  soft_need_sync= 1;
+
+  if (!(res= ma_service_thread_control_init(&soft_sync_control)))
+    if (!(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+      soft_sync_control.status= THREAD_RUNNING;
+  DBUG_RETURN(res);
+}
+
+
+/**
+  @brief Stops syncing thread
+
+  @note Does nothing unless soft_sync_control.inited is set.
+*/
+
+void translog_soft_sync_end(void)
+{
+  DBUG_ENTER("translog_soft_sync_end");
+  if (soft_sync_control.inited)
+  {
+    ma_service_thread_control_end(&soft_sync_control);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+#ifdef MARIA_DUMP_LOG
+#include <my_getopt.h>
+extern void translog_example_table_init();
+static const char *load_default_groups[]= { "aria_dump_log",0 };
+static void get_options(int *argc,char * * *argv);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+const char *default_dbug_option= "d:t:i:O,\\aria_dump_log.trace";
+#else
+const char *default_dbug_option= "d:t:i:o,/tmp/aria_dump_log.trace";
+#endif
+#endif
+static ulonglong opt_offset;
+static ulong opt_pages;
+static const char *opt_file= NULL; /* stays NULL unless -f/--file is given; get_options() then exits */
+static File handler= -1;
+static my_bool opt_unit= 0;
+static struct my_option my_long_options[] =
+{
+#ifdef IMPLTMENTED
+  {"body", 'b',
+   "Print chunk body dump",
+   (uchar **) &opt_body, (uchar **) &opt_body, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+#ifndef DBUG_OFF
+  {"debug", '#', "Output debug log. Often the argument is 'd:t:o,filename'.",
+   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"file", 'f', "Path to file which will be read",
+   (uchar**) &opt_file, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"help", '?', "Display this help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { "offset", 'o', "Start reading log from this offset",
+    (uchar**) &opt_offset, (uchar**) &opt_offset,
+    0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 },
+  { "pages", 'n', "Number of pages to read",
+    (uchar**) &opt_pages, (uchar**) &opt_pages, 0,
+    GET_ULONG, REQUIRED_ARG, (long) ~(ulong) 0,
+    (long) 1, (long) ~(ulong) 0, (long) 0,
+    (long) 1, 0},
+  {"unit-test", 'U',
+   "Use unit test record table (for logs created by unittests",
+   (uchar **) &opt_unit, (uchar **) &opt_unit, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"version", 'V', "Print version and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+static void print_version(void)
+{
+  VOID(printf("%s Ver 1.0 for %s on %s\n",
+              my_progname_short, SYSTEM_TYPE, MACHINE_TYPE));
+  NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+static void usage(void)
+{
+  print_version();
+  puts("Copyright (C) 2008 MySQL AB");
+  puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
+  puts("and you are welcome to modify and redistribute it under the GPL license\n");
+
+  puts("Dump content of aria log pages.");
+  VOID(printf("\nUsage: %s -f file OPTIONS\n", my_progname_short));
+  my_print_help(my_long_options);
+  print_defaults("my", load_default_groups);
+  my_print_variables(my_long_options);
+}
+
+
+static my_bool
+get_one_option(int optid __attribute__((unused)),
+               const struct my_option *opt __attribute__((unused)),
+               char *argument __attribute__((unused)))
+{
+  switch (optid) {
+  case '?':
+    usage();
+    exit(0);
+  case 'V':
+    print_version();
+    exit(0);
+#ifndef DBUG_OFF
+  case '#':
+    DBUG_SET_INITIAL(argument ? argument : default_dbug_option);
+    break;
+#endif
+  }
+  return 0; /* all other options need no extra handling here */
+}
+
+
+static void get_options(int *argc,char ***argv)
+{
+  int ho_error;
+
+  if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
+    exit(ho_error);
+
+  if (opt_file == NULL)
+  {
+    usage(); /* a file name is mandatory */
+    exit(1);
+  }
+}
+
+
+/**
+  @brief Dump information about file header page.
+*/ + +static void dump_header_page(uchar *buff) +{ + LOGHANDLER_FILE_INFO desc; + char strbuff[21]; + LINT_INIT_STRUCT(desc); + translog_interpret_file_header(&desc, buff); + printf(" This can be header page:\n" + " Timestamp: %s\n" + " Aria log version: %lu\n" + " Server version: %lu\n" + " Server id %lu\n" + " Page size %lu\n", + llstr(desc.timestamp, strbuff), + desc.maria_version, + desc.mysql_version, + desc.server_id, + desc.page_size); + if (desc.page_size != TRANSLOG_PAGE_SIZE) + printf(" WARNING: page size is not equal compiled in one %lu!!!\n", + (ulong) TRANSLOG_PAGE_SIZE); + printf(" File number %lu\n" + " Max lsn: (%lu,0x%lx)\n", + desc.file_number, + LSN_IN_PARTS(desc.max_lsn)); +} + +static const char *record_class_string[]= +{ + "LOGRECTYPE_NOT_ALLOWED", + "LOGRECTYPE_VARIABLE_LENGTH", + "LOGRECTYPE_PSEUDOFIXEDLENGTH", + "LOGRECTYPE_FIXEDLENGTH" +}; + + +/** + @brief dump information about transaction log chunk + + @param buffer reference to the whole page + @param ptr pointer to the chunk + + @reval # reference to the next chunk + @retval NULL can't interpret data +*/ + +static uchar *dump_chunk(uchar *buffer, uchar *ptr) +{ + uint length; + if (*ptr == TRANSLOG_FILLER) + { + printf(" Filler till the page end\n"); + for (; ptr < buffer + TRANSLOG_PAGE_SIZE; ptr++) + { + if (*ptr != TRANSLOG_FILLER) + { + printf(" WARNING: non filler character met before page end " + "(page + 0x%04x: 0x%02x) (stop interpretation)!!!", + (uint) (ptr - buffer), (uint) ptr[0]); + return NULL; + } + } + return ptr; + } + if (*ptr == 0 || *ptr == 0xFF) + { + printf(" WARNING: chunk can't start from 0x0 " + "(stop interpretation)!!!\n"); + return NULL; + } + switch (ptr[0] & TRANSLOG_CHUNK_TYPE) { + case TRANSLOG_CHUNK_LSN: + printf(" LSN chunk type 0 (variable length)\n"); + if (likely((ptr[0] & TRANSLOG_REC_TYPE) != TRANSLOG_CHUNK_0_CONT)) + { + printf(" Record type %u: %s record class %s compressed LSNs: %u\n", + ptr[0] & TRANSLOG_REC_TYPE, + 
(log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name ? + log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name : + "NULL"), + record_class_string[log_record_type_descriptor[ptr[0] & + TRANSLOG_REC_TYPE]. + rclass], + log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE]. + compressed_LSN); + if (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass != + LOGRECTYPE_VARIABLE_LENGTH) + { + printf(" WARNING: this record class here can't be used " + "(stop interpretation)!!!\n"); + break; + } + } + else + printf(" Continuation of previous chunk 0 header \n"); + printf(" Short transaction id: %u\n", (uint) uint2korr(ptr + 1)); + { + uchar *hdr_ptr= ptr + 1 + 2; /* chunk type and short trid */ + uint16 chunk_len; + printf (" Record length: %lu\n", + (ulong) translog_variable_record_1group_decode_len(&hdr_ptr)); + chunk_len= uint2korr(hdr_ptr); + if (chunk_len == 0) + printf (" It is 1 group record (chunk length == 0)\n"); + else + { + uint16 groups, i; + + printf (" Chunk length %u\n", (uint) chunk_len); + groups= uint2korr(hdr_ptr + 2); + hdr_ptr+= 4; + printf (" Number of groups left to the end %u:\n", (uint) groups); + for(i= 0; + i < groups && hdr_ptr < buffer + TRANSLOG_PAGE_SIZE; + i++, hdr_ptr+= LSN_STORE_SIZE + 1) + { + TRANSLOG_ADDRESS gpr_addr= lsn_korr(hdr_ptr); + uint pages= hdr_ptr[LSN_STORE_SIZE]; + printf (" Group +#%u: (%lu,0x%lx) pages: %u\n", + (uint) i, LSN_IN_PARTS(gpr_addr), pages); + } + } + } + break; + case TRANSLOG_CHUNK_FIXED: + printf(" LSN chunk type 1 (fixed size)\n"); + printf(" Record type %u: %s record class %s compressed LSNs: %u\n", + ptr[0] & TRANSLOG_REC_TYPE, + (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name ? + log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name : + "NULL"), + record_class_string[log_record_type_descriptor[ptr[0] & + TRANSLOG_REC_TYPE]. + rclass], + log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE]. 
+ compressed_LSN); + if (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass != + LOGRECTYPE_PSEUDOFIXEDLENGTH && + log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass != + LOGRECTYPE_FIXEDLENGTH) + { + printf(" WARNING: this record class here can't be used " + "(stop interpretation)!!!\n"); + } + printf(" Short transaction id: %u\n", (uint) uint2korr(ptr + 1)); + break; + case TRANSLOG_CHUNK_NOHDR: + printf(" No header chunk type 2(till the end of the page)\n"); + if (ptr[0] & TRANSLOG_REC_TYPE) + { + printf(" WARNING: chunk header content record type: 0x%02x " + "(dtop interpretation)!!!", + (uint) ptr[0]); + return NULL; + } + break; + case TRANSLOG_CHUNK_LNGTH: + printf(" Chunk with length type 3\n"); + if (ptr[0] & TRANSLOG_REC_TYPE) + { + printf(" WARNING: chunk header content record type: 0x%02x " + "(dtop interpretation)!!!", + (uint) ptr[0]); + return NULL; + } + break; + } + { + intptr offset= ptr - buffer; + DBUG_ASSERT(offset >= 0 && offset <= UINT_MAX16); + length= translog_get_total_chunk_length(buffer, (uint16)offset); + } + printf(" Length %u\n", length); + ptr+= length; + return ptr; +} + + +/** + @brief Dump information about page with data. 
+*/ + +static void dump_datapage(uchar *buffer) +{ + uchar *ptr; + ulong offset; + uint32 page, file; + uint header_len; + printf(" Page: %ld File number: %ld\n", + (ulong) (page= uint3korr(buffer)), + (ulong) (file= uint3korr(buffer + 3))); + if (page == 0) + printf(" WARNING: page == 0!!!\n"); + if (file == 0) + printf(" WARNING: file == 0!!!\n"); + offset= page * TRANSLOG_PAGE_SIZE; + printf(" Flags (0x%x):\n", (uint) buffer[TRANSLOG_PAGE_FLAGS]); + if (buffer[TRANSLOG_PAGE_FLAGS]) + { + if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC) + printf(" Page CRC\n"); + if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) + printf(" Sector protection\n"); + if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_RECORD_CRC) + printf(" Record CRC (WARNING: not yet implemented!!!)\n"); + if (buffer[TRANSLOG_PAGE_FLAGS] & ~(TRANSLOG_PAGE_CRC | + TRANSLOG_SECTOR_PROTECTION | + TRANSLOG_RECORD_CRC)) + { + printf(" WARNING: unknown flags (stop interpretation)!!!\n"); + return; + } + } + else + printf(" No flags\n"); + printf(" Page header length: %u\n", + (header_len= page_overhead[buffer[TRANSLOG_PAGE_FLAGS]])); + if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_RECORD_CRC) + { + uint32 crc= uint4korr(buffer + TRANSLOG_PAGE_FLAGS + 1); + uint32 ccrc; + printf (" Page CRC 0x%04lx\n", (ulong) crc); + ccrc= translog_crc(buffer + header_len, TRANSLOG_PAGE_SIZE - header_len); + if (crc != ccrc) + printf(" WARNING: calculated CRC: 0x%04lx!!!\n", (ulong) ccrc); + } + if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) + { + TRANSLOG_FILE tfile; + { + uchar *table= buffer + header_len - + TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + uint i; + printf(" Sector protection current value: 0x%02x\n", (uint) table[0]); + for (i= 1; i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; i++) + { + printf(" Sector protection in sector: 0x%02x saved value 0x%02x\n", + (uint)buffer[i * DISK_DRIVE_SECTOR_SIZE], + (uint)table[i]); + } + } + tfile.number= file; + tfile.handler.file= handler; + 
pagecache_file_init(tfile.handler, NULL, NULL, NULL, NULL, NULL); + tfile.was_recovered= 0; + tfile.is_sync= 1; + if (translog_check_sector_protection(buffer, &tfile)) + printf(" WARNING: sector protection found problems!!!\n"); + } + ptr= buffer + header_len; + while (ptr && ptr < buffer + TRANSLOG_PAGE_SIZE) + { + printf(" Chunk (%lu,0x%lx):\n", + (ulong)file, (ulong) offset + (ptr - buffer)); + ptr= dump_chunk(buffer, ptr); + } +} + + +/** + @brief Dump information about page. +*/ + +static void dump_page(uchar *buffer) +{ + printf("Page by offset %llu (0x%llx)\n", opt_offset, opt_offset); + if (strncmp((char*)maria_trans_file_magic, (char*)buffer, + sizeof(maria_trans_file_magic)) == 0) + { + dump_header_page(buffer); + } + dump_datapage(buffer); +} + + +/** + @brief maria_dump_log main function. +*/ + +int main(int argc, char **argv) +{ + char **default_argv; + uchar buffer[TRANSLOG_PAGE_SIZE]; + MY_INIT(argv[0]); + + load_defaults("my", load_default_groups, &argc, &argv); + default_argv= argv; + get_options(&argc, &argv); + + if (opt_unit) + translog_example_table_init(); + else + translog_table_init(); + translog_fill_overhead_table(); + + maria_data_root= (char *)"."; + + if ((handler= my_open(opt_file, O_RDONLY, MYF(MY_WME))) < 0) + { + fprintf(stderr, "Can't open file: '%s' errno: %d\n", + opt_file, my_errno); + goto err; + } + if (my_seek(handler, opt_offset, SEEK_SET, MYF(MY_WME)) != + opt_offset) + { + fprintf(stderr, "Can't set position %lld file: '%s' errno: %d\n", + opt_offset, opt_file, my_errno); + goto err; + } + for (; + opt_pages; + opt_offset+= TRANSLOG_PAGE_SIZE, opt_pages--) + { + if (my_pread(handler, buffer, TRANSLOG_PAGE_SIZE, opt_offset, + MYF(MY_NABP))) + { + if (my_errno == HA_ERR_FILE_TOO_SHORT) + goto end; + fprintf(stderr, "Can't read page at position %lld file: '%s' " + "errno: %d\n", opt_offset, opt_file, my_errno); + goto err; + } + dump_page(buffer); + } + +end: + my_close(handler, MYF(0)); + free_defaults(default_argv); + 
exit(0); + return 0; /* No compiler warning */ + +err: + my_close(handler, MYF(0)); + fprintf(stderr, "%s: FAILED\n", my_progname_short); + free_defaults(default_argv); + exit(1); +} +#endif diff --git a/storage/maria/ma_loghandler.h b/storage/maria/ma_loghandler.h new file mode 100644 index 00000000000..698a8ead7b6 --- /dev/null +++ b/storage/maria/ma_loghandler.h @@ -0,0 +1,506 @@ +/* Copyright (C) 2007 MySQL AB & Sanja Belkin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _ma_loghandler_h +#define _ma_loghandler_h + +#define MB (1024UL*1024) + +/* transaction log default cache size (TODO: make it global variable) */ +#define TRANSLOG_PAGECACHE_SIZE (2*MB) +/* transaction log default file size */ +#define TRANSLOG_FILE_SIZE (1024U*MB) +/* minimum possible transaction log size */ +#define TRANSLOG_MIN_FILE_SIZE (8*MB) +/* transaction log default flags (TODO: make it global variable) */ +#define TRANSLOG_DEFAULT_FLAGS 0 + +/* + Transaction log flags. + + We allow all kind protections to be switched on together for people who + really unsure in their hardware/OS. 
+*/ +#define TRANSLOG_PAGE_CRC 1 +#define TRANSLOG_SECTOR_PROTECTION (1<<1) +#define TRANSLOG_RECORD_CRC (1<<2) +#define TRANSLOG_FLAGS_NUM ((TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | \ + TRANSLOG_RECORD_CRC) + 1) + +#define RECHEADER_READ_ERROR -1 +#define RECHEADER_READ_EOF -2 + +/* + Page size in transaction log + It should be Power of 2 and multiple of DISK_DRIVE_SECTOR_SIZE + (DISK_DRIVE_SECTOR_SIZE * 2^N) +*/ +#define TRANSLOG_PAGE_SIZE (8U*1024) + +#include "ma_loghandler_lsn.h" +#include "trnman_public.h" + +/* short transaction ID type */ +typedef uint16 SHORT_TRANSACTION_ID; + +struct st_maria_handler; + +/* Changing one of the "SIZE" below will break backward-compatibility! */ +/* Length of CRC at end of pages */ +#define ROW_EXTENT_PAGE_SIZE 5 +#define ROW_EXTENT_COUNT_SIZE 2 +/* Size of file id in logs */ +#define FILEID_STORE_SIZE 2 +/* Size of page reference in log */ +#define PAGE_STORE_SIZE ROW_EXTENT_PAGE_SIZE +/* Size of page ranges in log */ +#define PAGERANGE_STORE_SIZE ROW_EXTENT_COUNT_SIZE +#define DIRPOS_STORE_SIZE 1 +#define CLR_TYPE_STORE_SIZE 1 +/* If table has live checksum we store its changes in UNDOs */ +#define HA_CHECKSUM_STORE_SIZE 4 +#define KEY_NR_STORE_SIZE 1 +#define PAGE_LENGTH_STORE_SIZE 2 + +/* Store methods to match the above sizes */ +#define fileid_store(T,A) int2store(T,A) +#define page_store(T,A) int5store(T,((ulonglong)(A))) +#define dirpos_store(T,A) ((*(uchar*) (T)) = A) +#define pagerange_store(T,A) int2store(T,A) +#define clr_type_store(T,A) ((*(uchar*) (T)) = A) +#define key_nr_store(T, A) ((*(uchar*) (T)) = A) +#define ha_checksum_store(T,A) int4store(T,A) +#define fileid_korr(P) uint2korr(P) +#define page_korr(P) uint5korr(P) +#define dirpos_korr(P) (*(const uchar *) (P)) +#define pagerange_korr(P) uint2korr(P) +#define clr_type_korr(P) (*(const uchar *) (P)) +#define key_nr_korr(P) (*(const uchar *) (P)) +#define ha_checksum_korr(P) uint4korr(P) + +/* + Length of disk drive sector size (we assume that 
writing it + to disk is an atomic operation) +*/ +#define DISK_DRIVE_SECTOR_SIZE 512U + +/* position reserved in an array of parts of a log record */ +#define TRANSLOG_INTERNAL_PARTS 2 + +/* types of records in the transaction log */ +/* TODO: Set numbers for these when we have all entries figured out */ + +enum translog_record_type +{ + LOGREC_RESERVED_FOR_CHUNKS23= 0, + LOGREC_REDO_INSERT_ROW_HEAD, + LOGREC_REDO_INSERT_ROW_TAIL, + LOGREC_REDO_NEW_ROW_HEAD, + LOGREC_REDO_NEW_ROW_TAIL, + LOGREC_REDO_INSERT_ROW_BLOBS, + LOGREC_REDO_PURGE_ROW_HEAD, + LOGREC_REDO_PURGE_ROW_TAIL, + LOGREC_REDO_FREE_BLOCKS, + LOGREC_REDO_FREE_HEAD_OR_TAIL, + LOGREC_REDO_DELETE_ROW, /* unused */ + LOGREC_REDO_UPDATE_ROW_HEAD, /* unused */ + LOGREC_REDO_INDEX, + LOGREC_REDO_INDEX_NEW_PAGE, + LOGREC_REDO_INDEX_FREE_PAGE, + LOGREC_REDO_UNDELETE_ROW, + LOGREC_CLR_END, + LOGREC_PURGE_END, + LOGREC_UNDO_ROW_INSERT, + LOGREC_UNDO_ROW_DELETE, + LOGREC_UNDO_ROW_UPDATE, + LOGREC_UNDO_KEY_INSERT, + LOGREC_UNDO_KEY_INSERT_WITH_ROOT, + LOGREC_UNDO_KEY_DELETE, + LOGREC_UNDO_KEY_DELETE_WITH_ROOT, + LOGREC_PREPARE, + LOGREC_PREPARE_WITH_UNDO_PURGE, + LOGREC_COMMIT, + LOGREC_COMMIT_WITH_UNDO_PURGE, + LOGREC_CHECKPOINT, + LOGREC_REDO_CREATE_TABLE, + LOGREC_REDO_RENAME_TABLE, + LOGREC_REDO_DROP_TABLE, + LOGREC_REDO_DELETE_ALL, + LOGREC_REDO_REPAIR_TABLE, + LOGREC_FILE_ID, + LOGREC_LONG_TRANSACTION_ID, + LOGREC_INCOMPLETE_LOG, + LOGREC_INCOMPLETE_GROUP, + LOGREC_UNDO_BULK_INSERT, + LOGREC_REDO_BITMAP_NEW_PAGE, + LOGREC_IMPORTED_TABLE, + LOGREC_DEBUG_INFO, + LOGREC_FIRST_FREE, + LOGREC_RESERVED_FUTURE_EXTENSION= 63 +}; +#define LOGREC_NUMBER_OF_TYPES 64 /* Maximum, can't be extended */ + +/* Type of operations in LOGREC_REDO_INDEX */ + +enum en_key_op +{ + KEY_OP_NONE, /* Not used */ + KEY_OP_OFFSET, /* Set current position */ + KEY_OP_SHIFT, /* Shift up/or down at current position */ + KEY_OP_CHANGE, /* Change data at current position */ + KEY_OP_ADD_PREFIX, /* Insert data at start of page */ + 
KEY_OP_DEL_PREFIX, /* Delete data at start of page */ + KEY_OP_ADD_SUFFIX, /* Insert data at end of page */ + KEY_OP_DEL_SUFFIX, /* Delete data at end of page */ + KEY_OP_CHECK, /* For debugging; CRC of used part of page */ + KEY_OP_MULTI_COPY, /* List of memcpy()s with fixed-len sources in page */ + KEY_OP_SET_PAGEFLAG, /* Set pageflag from next byte */ + KEY_OP_COMPACT_PAGE, /* Compact key page */ + KEY_OP_MAX_PAGELENGTH, /* Set page to max page length */ + KEY_OP_DEBUG, /* Entry for storing what triggered redo_index */ + KEY_OP_DEBUG_2 /* Entry for pagelengths */ +}; + +enum en_key_debug +{ + KEY_OP_DEBUG_RTREE_COMBINE, /* 0 */ + KEY_OP_DEBUG_RTREE_SPLIT, /* 1 */ + KEY_OP_DEBUG_RTREE_SET_KEY, /* 2 */ + KEY_OP_DEBUG_FATHER_CHANGED_1, /* 3 */ + KEY_OP_DEBUG_FATHER_CHANGED_2, /* 4 */ + KEY_OP_DEBUG_LOG_SPLIT, /* 5 */ + KEY_OP_DEBUG_LOG_ADD_1, /* 6 */ + KEY_OP_DEBUG_LOG_ADD_2, /* 7 */ + KEY_OP_DEBUG_LOG_ADD_3, /* 8 */ + KEY_OP_DEBUG_LOG_ADD_4, /* 9 */ + KEY_OP_DEBUG_LOG_PREFIX_1, /* 10 */ + KEY_OP_DEBUG_LOG_PREFIX_2, /* 11 */ + KEY_OP_DEBUG_LOG_PREFIX_3, /* 12 */ + KEY_OP_DEBUG_LOG_PREFIX_4, /* 13 */ + KEY_OP_DEBUG_LOG_PREFIX_5, /* 14 */ + KEY_OP_DEBUG_LOG_DEL_CHANGE_1, /* 15 */ + KEY_OP_DEBUG_LOG_DEL_CHANGE_2, /* 16 */ + KEY_OP_DEBUG_LOG_DEL_CHANGE_3, /* 17 */ + KEY_OP_DEBUG_LOG_DEL_CHANGE_RT, /* 18 */ + KEY_OP_DEBUG_LOG_DEL_PREFIX, /* 19 */ + KEY_OP_DEBUG_LOG_MIDDLE /* 20 */ +}; + + +enum translog_debug_info_type +{ + LOGREC_DEBUG_INFO_QUERY +}; + +/* Size of log file; One log file is restricted to 4G */ +typedef uint32 translog_size_t; + +#define TRANSLOG_RECORD_HEADER_MAX_SIZE 1024U + +typedef struct st_translog_group_descriptor +{ + TRANSLOG_ADDRESS addr; + uint8 num; +} TRANSLOG_GROUP; + + +typedef struct st_translog_header_buffer +{ + /* LSN of the read record */ + LSN lsn; + /* array of groups descriptors, can be used only if groups_no > 0 */ + TRANSLOG_GROUP *groups; + /* short transaction ID or 0 if it has no sense for the record */ + SHORT_TRANSACTION_ID 
short_trid; + /* + The Record length in buffer (including read header, but excluding + hidden part of record (type, short TrID, length) + */ + translog_size_t record_length; + /* + Buffer for write decoded header of the record (depend on the record + type) + */ + uchar header[TRANSLOG_RECORD_HEADER_MAX_SIZE]; + /* number of groups listed in */ + uint groups_no; + /* in multi-group number of chunk0 pages (valid only if groups_no > 0) */ + uint chunk0_pages; + /* type of the read record */ + enum translog_record_type type; + /* chunk 0 data address (valid only if groups_no > 0) */ + TRANSLOG_ADDRESS chunk0_data_addr; + /* + Real compressed LSN(s) size economy (<number of LSN(s)>*7 - <real_size>) + */ + int16 compressed_LSN_economy; + /* short transaction ID or 0 if it has no sense for the record */ + uint16 non_header_data_start_offset; + /* non read body data length in this first chunk */ + uint16 non_header_data_len; + /* chunk 0 data size (valid only if groups_no > 0) */ + uint16 chunk0_data_len; +} TRANSLOG_HEADER_BUFFER; + + +typedef struct st_translog_scanner_data +{ + uchar buffer[TRANSLOG_PAGE_SIZE]; /* buffer for page content */ + TRANSLOG_ADDRESS page_addr; /* current page address */ + /* end of the log which we saw last time */ + TRANSLOG_ADDRESS horizon; + TRANSLOG_ADDRESS last_file_page; /* Last page on in this file */ + uchar *page; /* page content pointer */ + /* direct link on the current page or NULL if not supported/requested */ + PAGECACHE_BLOCK_LINK *direct_link; + /* offset of the chunk in the page */ + translog_size_t page_offset; + /* set horizon only once at init */ + my_bool fixed_horizon; + /* try to get direct link on the page if it is possible */ + my_bool use_direct_link; +} TRANSLOG_SCANNER_DATA; + + +typedef struct st_translog_reader_data +{ + TRANSLOG_HEADER_BUFFER header; /* Header */ + TRANSLOG_SCANNER_DATA scanner; /* chunks scanner */ + translog_size_t body_offset; /* current chunk body offset */ + /* data offset from the record 
beginning */ + translog_size_t current_offset; + /* number of bytes read in header */ + uint16 read_header; + uint16 chunk_size; /* current chunk size */ + uint current_group; /* current group */ + uint current_chunk; /* current chunk in the group */ + my_bool eor; /* end of the record */ +} TRANSLOG_READER_DATA; + +C_MODE_START + +/* Records types for unittests */ +#define LOGREC_FIXED_RECORD_0LSN_EXAMPLE 1 +#define LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE 2 +#define LOGREC_FIXED_RECORD_1LSN_EXAMPLE 3 +#define LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE 4 +#define LOGREC_FIXED_RECORD_2LSN_EXAMPLE 5 +#define LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE 6 + +extern void translog_example_table_init(); +extern void translog_table_init(); +#define translog_init(D,M,V,I,C,F,R) \ + translog_init_with_table(D,M,V,I,C,F,R,&translog_table_init,0) +extern my_bool translog_init_with_table(const char *directory, + uint32 log_file_max_size, + uint32 server_version, + uint32 server_id, + PAGECACHE *pagecache, + uint flags, + my_bool readonly, + void (*init_table_func)(), + my_bool no_error); + +extern my_bool +translog_write_record(LSN *lsn, enum translog_record_type type, TRN *trn, + MARIA_HA *tbl_info, + translog_size_t rec_len, uint part_no, + LEX_CUSTRING *parts_data, uchar *store_share_id, + void *hook_arg); + +extern void translog_destroy(); + +extern int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff); + +extern void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff); + +extern translog_size_t translog_read_record(LSN lsn, + translog_size_t offset, + translog_size_t length, + uchar *buffer, + struct st_translog_reader_data + *data); + +extern my_bool translog_flush(TRANSLOG_ADDRESS lsn); + +extern my_bool translog_scanner_init(LSN lsn, + my_bool fixed_horizon, + struct st_translog_scanner_data *scanner, + my_bool use_direct_link); +extern void translog_destroy_scanner(TRANSLOG_SCANNER_DATA *scanner); + +extern int translog_read_next_record_header(TRANSLOG_SCANNER_DATA 
*scanner, + TRANSLOG_HEADER_BUFFER *buff); +extern LSN translog_get_file_max_lsn_stored(uint32 file); +extern my_bool translog_purge(TRANSLOG_ADDRESS low); +extern my_bool translog_is_file(uint file_no); +extern void translog_lock(); +extern void translog_unlock(); +extern void translog_lock_handler_assert_owner(); +extern TRANSLOG_ADDRESS translog_get_horizon(); +extern TRANSLOG_ADDRESS translog_get_horizon_no_lock(); +extern int translog_assign_id_to_share(struct st_maria_handler *tbl_info, + TRN *trn); +extern void translog_deassign_id_from_share(struct st_maria_share *share); +extern void +translog_assign_id_to_share_from_recovery(struct st_maria_share *share, + uint16 id); +extern my_bool translog_walk_filenames(const char *directory, + my_bool (*callback)(const char *, + const char *)); +extern my_bool translog_log_debug_info(TRN *trn, + enum translog_debug_info_type type, + uchar *info, size_t length); + +enum enum_translog_status +{ + TRANSLOG_UNINITED, /* no initialization done or error during initialization */ + TRANSLOG_OK, /* transaction log is functioning */ + TRANSLOG_READONLY, /* read only mode due to write errors */ + TRANSLOG_SHUTDOWN /* going to shutdown the loghandler */ +}; +extern enum enum_translog_status translog_status; +extern ulonglong translog_syncs; /* Number of sync()s */ + +void translog_soft_sync(my_bool mode); +void translog_hard_group_commit(my_bool mode); +int translog_soft_sync_start(void); +void translog_soft_sync_end(void); +void translog_sync(); +void translog_set_group_commit_interval(uint32 interval); + +/* + all the rest added because of recovery; should we make + ma_loghandler_for_recovery.h ? 
+*/ + +#define SHARE_ID_MAX 65535 /* array's size */ + +extern LSN translog_first_lsn_in_log(); +extern LSN translog_first_theoretical_lsn(); +extern LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon); +extern my_bool translog_purge_at_flush(); +extern uint32 translog_get_first_file(TRANSLOG_ADDRESS horizon); +extern uint32 translog_get_first_needed_file(); +extern char *translog_filename_by_fileno(uint32 file_no, char *path); +extern void translog_set_file_size(uint32 size); + +/* record parts descriptor */ +struct st_translog_parts +{ + /* full record length */ + translog_size_t record_length; + /* full record length with chunk headers */ + translog_size_t total_record_length; + /* current part index */ + uint current; + /* total number of elements in parts */ + uint elements; + /* array of parts */ + LEX_CUSTRING *parts; +}; + +typedef my_bool(*prewrite_rec_hook) (enum translog_record_type type, + TRN *trn, + struct st_maria_handler *tbl_info, + void *hook_arg); + +typedef my_bool(*inwrite_rec_hook) (enum translog_record_type type, + TRN *trn, + struct st_maria_handler *tbl_info, + LSN *lsn, void *hook_arg); + +typedef uint16(*read_rec_hook) (enum translog_record_type type, + uint16 read_length, uchar *read_buff, + uchar *decoded_buff); + + +/* record classes */ +enum record_class +{ + LOGRECTYPE_NOT_ALLOWED, + LOGRECTYPE_VARIABLE_LENGTH, + LOGRECTYPE_PSEUDOFIXEDLENGTH, + LOGRECTYPE_FIXEDLENGTH +}; + +enum enum_record_in_group { + LOGREC_NOT_LAST_IN_GROUP= 0, LOGREC_LAST_IN_GROUP, LOGREC_IS_GROUP_ITSELF +}; + +/* + Descriptor of log record type +*/ +typedef struct st_log_record_type_descriptor +{ + /* internal class of the record */ + enum record_class rclass; + /* + length for fixed-size record, pseudo-fixed record + length with uncompressed LSNs + */ + uint16 fixed_length; + /* how much record body (belonged to headers too) read with headers */ + uint16 read_header_len; + /* HOOK for writing the record called before lock */ + 
prewrite_rec_hook prewrite_hook; + /* HOOK for writing the record called when LSN is known, inside lock */ + inwrite_rec_hook inwrite_hook; + /* HOOK for reading headers */ + read_rec_hook read_hook; + /* + For pseudo fixed records number of compressed LSNs followed by + system header + */ + int16 compressed_LSN; + /* the rest is for maria_read_log & Recovery */ + /** @brief for debug error messages or "maria_read_log" command-line tool */ + const char *name; + enum enum_record_in_group record_in_group; + /* a function to execute when we see the record during the REDO phase */ + int (*record_execute_in_redo_phase)(const TRANSLOG_HEADER_BUFFER *); + /* a function to execute when we see the record during the UNDO phase */ + int (*record_execute_in_undo_phase)(const TRANSLOG_HEADER_BUFFER *, TRN *); +} LOG_DESC; + +extern LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES]; + +typedef enum +{ + TRANSLOG_GCOMMIT_NONE, + TRANSLOG_GCOMMIT_HARD, + TRANSLOG_GCOMMIT_SOFT +} enum_maria_group_commit; +extern ulong maria_group_commit; +extern ulong maria_group_commit_interval; +typedef enum +{ + TRANSLOG_PURGE_IMMIDIATE, + TRANSLOG_PURGE_EXTERNAL, + TRANSLOG_PURGE_ONDEMAND +} enum_maria_translog_purge_type; +extern ulong log_purge_type; +extern ulong log_file_size; + +typedef enum +{ + TRANSLOG_SYNC_DIR_NEVER, + TRANSLOG_SYNC_DIR_NEWFILE, + TRANSLOG_SYNC_DIR_ALWAYS +} enum_maria_sync_log_dir; +extern ulong sync_log_dir; + +C_MODE_END +#endif diff --git a/storage/maria/ma_loghandler_lsn.h b/storage/maria/ma_loghandler_lsn.h new file mode 100644 index 00000000000..7fa53bc0a50 --- /dev/null +++ b/storage/maria/ma_loghandler_lsn.h @@ -0,0 +1,111 @@ +/* Copyright (C) 2007 MySQL AB & Sanja Belkin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _ma_loghandler_lsn_h +#define _ma_loghandler_lsn_h + +/* + Transaction log record address: + file_no << 32 | offset + file_no is only 3 bytes so we can use signed integer to make + comparison simpler. +*/ +typedef int64 TRANSLOG_ADDRESS; + +/* + Compare addresses + A1 > A2 -> result > 0 + A1 == A2 -> 0 + A1 < A2 -> result < 0 +*/ +#define cmp_translog_addr(A1,A2) ((A1) - (A2)) + +/* + TRANSLOG_ADDRESS is just address of some byte in the log (usually some + chunk) + LSN used where address of some record in the log needed (not just any + address) +*/ +typedef TRANSLOG_ADDRESS LSN; + +/* Gets file number part of a LSN/log address */ +#define LSN_FILE_NO(L) (uint32) ((L) >> 32) + +/* Gets raw file number part of a LSN/log address */ +#define LSN_FILE_NO_PART(L) ((L) & ((int64)0xFFFFFF00000000LL)) + +/* Parts of LSN for printing */ +#define LSN_IN_PARTS(L) (ulong)LSN_FILE_NO(L),(ulong)LSN_OFFSET(L) + +/* Gets record offset of a LSN/log address */ +#define LSN_OFFSET(L) (ulong) ((L) & 0xFFFFFFFFL) + +/* Makes lsn/log address from file number and record offset */ +#define MAKE_LSN(F,S) ((LSN) ((((uint64)(F)) << 32) | (S))) + +/* checks LSN */ +#define LSN_VALID(L) \ + ((LSN_FILE_NO_PART(L) != FILENO_IMPOSSIBLE) && \ + (LSN_OFFSET(L) != LOG_OFFSET_IMPOSSIBLE)) + +/* size of stored LSN on a disk, don't change it! 
*/ +#define LSN_STORE_SIZE 7 + +/* Puts LSN into buffer (dst) */ +#define lsn_store(dst, lsn) \ + do { \ + int3store((dst), LSN_FILE_NO(lsn)); \ + int4store((char*)(dst) + 3, LSN_OFFSET(lsn)); \ + } while (0) + +/* Unpacks LSN from the buffer (P) */ +#define lsn_korr(P) MAKE_LSN(uint3korr(P), uint4korr((const char*)(P) + 3)) + +/* what we need to add to LSN to increase it on one file */ +#define LSN_ONE_FILE ((int64)0x100000000LL) + +#define LSN_REPLACE_OFFSET(L, S) (LSN_FILE_NO_PART(L) | (S)) + +/* + an 8-byte type whose most significant uchar is used for "flags"; 7 + other bytes are a LSN. +*/ +typedef LSN LSN_WITH_FLAGS; +#define LSN_WITH_FLAGS_TO_LSN(x) (x & ULL(0x00FFFFFFFFFFFFFF)) +#define LSN_WITH_FLAGS_TO_FLAGS(x) (x & ULL(0xFF00000000000000)) + +#define FILENO_IMPOSSIBLE 0 /**< log file's numbering starts at 1 */ +#define LOG_OFFSET_IMPOSSIBLE 0 /**< log always has a header */ +#define LSN_IMPOSSIBLE ((LSN)0) +/* following LSN also is impossible */ +#define LSN_ERROR ((LSN)1) + +/** @brief some impossible LSN serve as markers */ + +/** + When table is modified by maria_chk, or auto-zerofilled, old REDOs don't + apply, table is freshly born again somehow: its state's LSNs need to be + updated to the new instance which receives this table. +*/ +#define LSN_NEEDS_NEW_STATE_LSNS ((LSN)2) + +/** + @brief the maximum valid LSN. + Unlike ULONGLONG_MAX, it can be safely used in comparison with valid LSNs + (ULONGLONG_MAX is too big for correctness of cmp_translog_addr()). 
+*/ +#define LSN_MAX (LSN)ULL(0x00FFFFFFFFFFFFFF) + +#endif diff --git a/storage/maria/ma_open.c b/storage/maria/ma_open.c new file mode 100644 index 00000000000..63e1801a39a --- /dev/null +++ b/storage/maria/ma_open.c @@ -0,0 +1,1945 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* open a isam-database */ + +#include "ma_fulltext.h" +#include "ma_sp_defs.h" +#include "ma_rt_index.h" +#include "ma_blockrec.h" +#include <m_ctype.h> + +#if defined(MSDOS) || defined(__WIN__) +#ifdef __WIN__ +#include <fcntl.h> +#else +#include <process.h> /* Prototype for getpid */ +#endif +#endif + +static void setup_key_functions(MARIA_KEYDEF *keyinfo); +static my_bool maria_scan_init_dummy(MARIA_HA *info); +static void maria_scan_end_dummy(MARIA_HA *info); +static my_bool maria_once_init_dummy(MARIA_SHARE *, File); +static my_bool maria_once_end_dummy(MARIA_SHARE *); +static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base); +static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state); + +#define get_next_element(to,pos,size) { memcpy((char*) to,pos,(size_t) size); \ + pos+=size;} + + +#define disk_pos_assert(pos, end_pos) \ +if (pos > end_pos) \ +{ \ + my_errno=HA_ERR_CRASHED; \ + goto err; \ +} + + +/****************************************************************************** +** 
Return the shared struct if the table is already open. +** In MySQL the server will handle version issues. +******************************************************************************/ + +MARIA_HA *_ma_test_if_reopen(const char *filename) +{ + LIST *pos; + + for (pos=maria_open_list ; pos ; pos=pos->next) + { + MARIA_HA *info=(MARIA_HA*) pos->data; + MARIA_SHARE *share= info->s; + if (!strcmp(share->unique_file_name.str,filename) && share->last_version) + return info; + } + return 0; +} + + +/* + Open a new instance of an already opened Maria table + + SYNOPSIS + maria_clone_internal() + share Share of already open table + mode Mode of table (O_RDONLY | O_RDWR) + data_file Filedescriptor of data file to use < 0 if one should open + open it. + + RETURN + # Maria handler + 0 Error +*/ + + +static MARIA_HA *maria_clone_internal(MARIA_SHARE *share, const char *name, + int mode, File data_file) +{ + int save_errno; + uint errpos; + MARIA_HA info,*m_info; + my_bitmap_map *changed_fields_bitmap; + DBUG_ENTER("maria_clone_internal"); + + errpos= 0; + bzero((uchar*) &info,sizeof(info)); + + if (mode == O_RDWR && share->mode == O_RDONLY) + { + my_errno=EACCES; /* Can't open in write mode */ + goto err; + } + if (data_file >= 0) + info.dfile.file= data_file; + else if (_ma_open_datafile(&info, share, name, -1)) + goto err; + errpos= 5; + + /* alloc and set up private structure parts */ + if (!my_multi_malloc(MY_WME, + &m_info,sizeof(MARIA_HA), + &info.blobs,sizeof(MARIA_BLOB)*share->base.blobs, + &info.buff,(share->base.max_key_block_length*2+ + share->base.max_key_length), + &info.lastkey_buff,share->base.max_key_length*2+1, + &info.first_mbr_key, share->base.max_key_length, + &info.maria_rtree_recursion_state, + share->have_rtree ? 
1024 : 0, + &changed_fields_bitmap, + bitmap_buffer_size(share->base.fields), + NullS)) + goto err; + errpos= 6; + + memcpy(info.blobs,share->blobs,sizeof(MARIA_BLOB)*share->base.blobs); + info.lastkey_buff2= info.lastkey_buff + share->base.max_key_length; + info.last_key.data= info.lastkey_buff; + + info.s=share; + info.cur_row.lastpos= HA_OFFSET_ERROR; + info.update= (short) (HA_STATE_NEXT_FOUND+HA_STATE_PREV_FOUND); + info.opt_flag=READ_CHECK_USED; + info.this_unique= (ulong) info.dfile.file; /* Uniq number in process */ +#ifdef EXTERNAL_LOCKING + if (share->data_file_type == COMPRESSED_RECORD) + info.this_unique= share->state.unique; + info.this_loop=0; /* Update counter */ + info.last_unique= share->state.unique; + info.last_loop= share->state.update_count; +#endif + info.errkey= -1; + info.page_changed=1; + info.keyread_buff= info.buff + share->base.max_key_block_length; + + info.lock_type= F_UNLCK; + if (share->options & HA_OPTION_TMP_TABLE) + info.lock_type= F_WRLCK; + + _ma_set_data_pagecache_callbacks(&info.dfile, share); + bitmap_init(&info.changed_fields, changed_fields_bitmap, + share->base.fields, 0); + if ((*share->init)(&info)) + goto err; + + /* The following should be big enough for all pinning purposes */ + if (my_init_dynamic_array(&info.pinned_pages, + sizeof(MARIA_PINNED_PAGE), + max(share->base.blobs*2 + 4, + MARIA_MAX_TREE_LEVELS*3), 16)) + goto err; + + + pthread_mutex_lock(&share->intern_lock); + info.read_record= share->read_record; + share->reopen++; + share->write_flag=MYF(MY_NABP | MY_WAIT_IF_FULL); + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + info.lock_type=F_RDLCK; + share->r_locks++; + share->tot_locks++; + } + if ((share->options & HA_OPTION_DELAY_KEY_WRITE) && + maria_delay_key_write) + share->delay_key_write=1; + + if (!share->base.born_transactional) /* For transactional ones ... */ + { + /* ... 
force crash if no trn given */ + _ma_set_trn_for_table(&info, &dummy_transaction_object); + info.state= &share->state.state; /* Change global values by default */ + } + else + { + info.state= &share->state.common; + *info.state= share->state.state; /* Initial values */ + } + info.state_start= info.state; /* Initial values */ + + pthread_mutex_unlock(&share->intern_lock); + + /* Allocate buffer for one record */ + /* prerequisites: info->rec_buffer == 0 && info->rec_buff_size == 0 */ + if (_ma_alloc_buffer(&info.rec_buff, &info.rec_buff_size, + share->base.default_rec_buff_size)) + goto err; + + bzero(info.rec_buff, share->base.default_rec_buff_size); + + *m_info=info; +#ifdef THREAD + thr_lock_data_init(&share->lock,&m_info->lock,(void*) m_info); +#endif + m_info->open_list.data=(void*) m_info; + maria_open_list=list_add(maria_open_list,&m_info->open_list); + + DBUG_RETURN(m_info); + +err: + DBUG_PRINT("error", ("error: %d", my_errno)); + save_errno=my_errno ? my_errno : HA_ERR_END_OF_FILE; + if ((save_errno == HA_ERR_CRASHED) || + (save_errno == HA_ERR_CRASHED_ON_USAGE) || + (save_errno == HA_ERR_CRASHED_ON_REPAIR)) + _ma_report_error(save_errno, &share->open_file_name); + switch (errpos) { + case 6: + (*share->end)(&info); + delete_dynamic(&info.pinned_pages); + my_free(m_info, MYF(0)); + /* fall through */ + case 5: + if (data_file < 0) + VOID(my_close(info.dfile.file, MYF(0))); + break; + } + my_errno=save_errno; + DBUG_RETURN (NULL); +} /* maria_clone_internal */ + + +/* Make a clone of a maria table */ + +MARIA_HA *maria_clone(MARIA_SHARE *share, int mode) +{ + MARIA_HA *new_info; + pthread_mutex_lock(&THR_LOCK_maria); + new_info= maria_clone_internal(share, NullS, mode, + share->data_file_type == BLOCK_RECORD ? 
+ share->bitmap.file.file : -1); + pthread_mutex_unlock(&THR_LOCK_maria); + return new_info; +} + + +/****************************************************************************** + open a MARIA table + + See my_base.h for the handle_locking argument + if handle_locking and HA_OPEN_ABORT_IF_CRASHED then abort if the table + is marked crashed or if we are not using locking and the table doesn't + have an open count of 0. +******************************************************************************/ + +MARIA_HA *maria_open(const char *name, int mode, uint open_flags) +{ + int kfile,open_mode,save_errno; + uint i,j,len,errpos,head_length,base_pos,keys, realpath_err, + key_parts,unique_key_parts,fulltext_keys,uniques; + size_t info_length; + char name_buff[FN_REFLEN], org_name[FN_REFLEN], index_name[FN_REFLEN], + data_name[FN_REFLEN]; + uchar *disk_cache, *disk_pos, *end_pos; + MARIA_HA info,*m_info,*old_info; + MARIA_SHARE share_buff,*share; + double *rec_per_key_part; + ulong *nulls_per_key_part; + my_off_t key_root[HA_MAX_POSSIBLE_KEY]; + ulonglong max_key_file_length, max_data_file_length; + my_bool versioning= 1; + File data_file= -1; + DBUG_ENTER("maria_open"); + + LINT_INIT(m_info); + kfile= -1; + errpos= 0; + head_length=sizeof(share_buff.state.header); + bzero((uchar*) &info,sizeof(info)); + + realpath_err= my_realpath(name_buff, fn_format(org_name, name, "", + MARIA_NAME_IEXT, + MY_UNPACK_FILENAME),MYF(0)); + if (my_is_symlink(org_name) && + (realpath_err || (*maria_test_invalid_symlink)(name_buff))) + { + my_errno= HA_WRONG_CREATE_OPTION; + DBUG_RETURN(0); + } + + pthread_mutex_lock(&THR_LOCK_maria); + old_info= 0; + if ((open_flags & HA_OPEN_COPY) || + !(old_info=_ma_test_if_reopen(name_buff))) + { + share= &share_buff; + bzero((uchar*) &share_buff,sizeof(share_buff)); + share_buff.state.key_root=key_root; + share_buff.pagecache= multi_pagecache_search((uchar*) name_buff, + (uint) strlen(name_buff), + maria_pagecache); + + 
DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_open", + if (strstr(name, "/t1")) + { + my_errno= HA_ERR_CRASHED; + goto err; + }); + if ((kfile=my_open(name_buff,(open_mode=O_RDWR) | O_SHARE,MYF(0))) < 0) + { + if ((errno != EROFS && errno != EACCES) || + mode != O_RDONLY || + (kfile=my_open(name_buff,(open_mode=O_RDONLY) | O_SHARE,MYF(0))) < 0) + goto err; + } + share->mode=open_mode; + errpos= 1; + if (my_pread(kfile,share->state.header.file_version, head_length, 0, + MYF(MY_NABP))) + { + my_errno= HA_ERR_NOT_A_TABLE; + goto err; + } + if (memcmp(share->state.header.file_version, maria_file_magic, 4)) + { + DBUG_PRINT("error",("Wrong header in %s",name_buff)); + DBUG_DUMP("error_dump", share->state.header.file_version, + head_length); + my_errno=HA_ERR_NOT_A_TABLE; + goto err; + } + share->options= mi_uint2korr(share->state.header.options); + if (share->options & + ~(HA_OPTION_PACK_RECORD | HA_OPTION_PACK_KEYS | + HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA | + HA_OPTION_TEMP_COMPRESS_RECORD | HA_OPTION_CHECKSUM | + HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE | + HA_OPTION_RELIES_ON_SQL_LAYER | HA_OPTION_NULL_FIELDS | + HA_OPTION_PAGE_CHECKSUM)) + { + DBUG_PRINT("error",("wrong options: 0x%lx", share->options)); + my_errno=HA_ERR_NEW_FILE; + goto err; + } + if ((share->options & HA_OPTION_RELIES_ON_SQL_LAYER) && + ! 
(open_flags & HA_OPEN_FROM_SQL_LAYER)) + { + DBUG_PRINT("error", ("table cannot be opened from non-sql layer")); + my_errno= HA_ERR_UNSUPPORTED; + goto err; + } + /* Don't call realpath() if the name can't be a link */ + if (!strcmp(name_buff, org_name) || + my_readlink(index_name, org_name, MYF(0)) == -1) + (void) strmov(index_name, org_name); + *strrchr(org_name, FN_EXTCHAR)= '\0'; + (void) fn_format(data_name,org_name,"",MARIA_NAME_DEXT, + MY_APPEND_EXT|MY_UNPACK_FILENAME|MY_RESOLVE_SYMLINKS); + + info_length=mi_uint2korr(share->state.header.header_length); + base_pos= mi_uint2korr(share->state.header.base_pos); + + /* + Allocate space for header information and for data that is too + big to keep on stack + */ + if (!my_multi_malloc(MY_WME, + &disk_cache, info_length+128, + &rec_per_key_part, + (sizeof(*rec_per_key_part) * HA_MAX_POSSIBLE_KEY * + HA_MAX_KEY_SEG), + &nulls_per_key_part, + (sizeof(*nulls_per_key_part) * HA_MAX_POSSIBLE_KEY * + HA_MAX_KEY_SEG), + NullS)) + { + my_errno=ENOMEM; + goto err; + } + share_buff.state.rec_per_key_part= rec_per_key_part; + share_buff.state.nulls_per_key_part= nulls_per_key_part; + + end_pos=disk_cache+info_length; + errpos= 3; + if (my_pread(kfile, disk_cache, info_length, 0L, MYF(MY_NABP))) + { + my_errno=HA_ERR_CRASHED; + goto err; + } + len=mi_uint2korr(share->state.header.state_info_length); + keys= (uint) share->state.header.keys; + uniques= (uint) share->state.header.uniques; + fulltext_keys= (uint) share->state.header.fulltext_keys; + key_parts= mi_uint2korr(share->state.header.key_parts); + unique_key_parts= mi_uint2korr(share->state.header.unique_key_parts); + if (len != MARIA_STATE_INFO_SIZE) + { + DBUG_PRINT("warning", + ("saved_state_info_length: %d state_info_length: %d", + len,MARIA_STATE_INFO_SIZE)); + } + share->state_diff_length=len-MARIA_STATE_INFO_SIZE; + + _ma_state_info_read(disk_cache, &share->state); + len= mi_uint2korr(share->state.header.base_info_length); + if (len != MARIA_BASE_INFO_SIZE) + { + 
DBUG_PRINT("warning",("saved_base_info_length: %d base_info_length: %d", + len,MARIA_BASE_INFO_SIZE)); + } + disk_pos= _ma_base_info_read(disk_cache + base_pos, &share->base); + share->state.state_length=base_pos; + + if (!(open_flags & HA_OPEN_FOR_REPAIR) && + ((share->state.changed & STATE_CRASHED) || + ((open_flags & HA_OPEN_ABORT_IF_CRASHED) && + (my_disable_locking && share->state.open_count)))) + { + DBUG_PRINT("error",("Table is marked as crashed. open_flags: %u " + "changed: %u open_count: %u !locking: %d", + open_flags, share->state.changed, + share->state.open_count, my_disable_locking)); + my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ? + HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE); + goto err; + } + + /* + We can ignore testing uuid if STATE_NOT_MOVABLE is set, as in this + case the uuid will be set in _ma_mark_file_changed() + */ + if ((share->state.changed & STATE_NOT_MOVABLE) && + share->base.born_transactional && + ((!(open_flags & HA_OPEN_IGNORE_MOVED_STATE) && + memcmp(share->base.uuid, maria_uuid, MY_UUID_SIZE)) || + (share->state.create_trid > trnman_get_max_trid() && + !maria_in_recovery))) + { + DBUG_PRINT("warning", ("table is moved from another system. 
uuid_diff: %d create_trid: %lu max_trid: %lu", + memcmp(share->base.uuid, maria_uuid, + MY_UUID_SIZE) != 0, + (ulong) share->state.create_trid, + (ulong) trnman_get_max_trid())); + if (open_flags & HA_OPEN_FOR_REPAIR) + share->state.changed|= STATE_MOVED; + else + { + my_errno= HA_ERR_OLD_FILE; + goto err; + } + } + + /* sanity check */ + if (share->base.keystart > 65535 || share->base.rec_reflength > 8) + { + my_errno=HA_ERR_CRASHED; + goto err; + } + + key_parts+=fulltext_keys*FT_SEGS; + if (share->base.max_key_length > maria_max_key_length() || + keys > MARIA_MAX_KEY || key_parts > MARIA_MAX_KEY * HA_MAX_KEY_SEG) + { + DBUG_PRINT("error",("Wrong key info: Max_key_length: %d keys: %d key_parts: %d", share->base.max_key_length, keys, key_parts)); + my_errno=HA_ERR_UNSUPPORTED; + goto err; + } + + /* Ensure we have space in the key buffer for transaction id's */ + if (share->base.born_transactional) + share->base.max_key_length= ALIGN_SIZE(share->base.max_key_length + + MARIA_MAX_PACK_TRANSID_SIZE); + + /* + If page cache is not initialized, then assume we will create the + page_cache after the table is opened! + This is only used by maria_check to allow it to check/repair tables + with different block sizes. + */ + if (share->base.block_size != maria_block_size && + share_buff.pagecache->inited != 0) + { + DBUG_PRINT("error", ("Wrong block size %u; Expected %u", + (uint) share->base.block_size, + (uint) maria_block_size)); + my_errno=HA_ERR_UNSUPPORTED; + goto err; + } + + /* Correct max_file_length based on length of sizeof(off_t) */ + max_data_file_length= + (share->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) ? 
+ (((ulonglong) 1 << (share->base.rec_reflength*8))-1) : + (_ma_safe_mul(share->base.pack_reclength, + (ulonglong) 1 << (share->base.rec_reflength*8))-1); + + max_key_file_length= + _ma_safe_mul(maria_block_size, + ((ulonglong) 1 << (share->base.key_reflength*8))-1); +#if SIZEOF_OFF_T == 4 + set_if_smaller(max_data_file_length, INT_MAX32); + set_if_smaller(max_key_file_length, INT_MAX32); +#endif + share->base.max_data_file_length=(my_off_t) max_data_file_length; + share->base.max_key_file_length=(my_off_t) max_key_file_length; + + if (share->options & HA_OPTION_COMPRESS_RECORD) + share->base.max_key_length+=2; /* For safety */ + /* Add space for node pointer */ + share->base.max_key_length+= share->base.key_reflength; + + share->unique_file_name.length= strlen(name_buff); + share->index_file_name.length= strlen(index_name); + share->data_file_name.length= strlen(data_name); + share->open_file_name.length= strlen(name); + if (!my_multi_malloc(MY_WME, + &share,sizeof(*share), + &share->state.rec_per_key_part, + sizeof(double) * key_parts, + &share->state.nulls_per_key_part, + sizeof(long)* key_parts, + &share->keyinfo,keys*sizeof(MARIA_KEYDEF), + &share->uniqueinfo,uniques*sizeof(MARIA_UNIQUEDEF), + &share->keyparts, + (key_parts+unique_key_parts+keys+uniques) * + sizeof(HA_KEYSEG), + &share->columndef, + (share->base.fields+1)*sizeof(MARIA_COLUMNDEF), + &share->column_nr, share->base.fields*sizeof(uint16), + &share->blobs,sizeof(MARIA_BLOB)*share->base.blobs, + &share->unique_file_name.str, + share->unique_file_name.length+1, + &share->index_file_name.str, + share->index_file_name.length+1, + &share->data_file_name.str, + share->data_file_name.length+1, + &share->open_file_name.str, + share->open_file_name.length+1, + &share->state.key_root,keys*sizeof(my_off_t), + &share->mmap_lock,sizeof(rw_lock_t), + NullS)) + goto err; + errpos= 4; + + *share=share_buff; + memcpy((char*) share->state.rec_per_key_part, + (char*) rec_per_key_part, sizeof(double)*key_parts); + 
memcpy((char*) share->state.nulls_per_key_part, + (char*) nulls_per_key_part, sizeof(long)*key_parts); + memcpy((char*) share->state.key_root, + (char*) key_root, sizeof(my_off_t)*keys); + strmov(share->unique_file_name.str, name_buff); + strmov(share->index_file_name.str, index_name); + strmov(share->data_file_name.str, data_name); + strmov(share->open_file_name.str, name); + + share->block_size= share->base.block_size; /* Convenience */ + share->max_index_block_size= share->block_size - KEYPAGE_CHECKSUM_SIZE; + { + HA_KEYSEG *pos=share->keyparts; + uint32 ftkey_nr= 1; + for (i=0 ; i < keys ; i++) + { + share->keyinfo[i].share= share; + disk_pos=_ma_keydef_read(disk_pos, &share->keyinfo[i]); + share->keyinfo[i].key_nr= i; + disk_pos_assert(disk_pos + share->keyinfo[i].keysegs * HA_KEYSEG_SIZE, + end_pos); + if (share->keyinfo[i].key_alg == HA_KEY_ALG_RTREE) + share->have_rtree= 1; + share->keyinfo[i].seg=pos; + for (j=0 ; j < share->keyinfo[i].keysegs; j++,pos++) + { + disk_pos=_ma_keyseg_read(disk_pos, pos); + if (pos->type == HA_KEYTYPE_TEXT || + pos->type == HA_KEYTYPE_VARTEXT1 || + pos->type == HA_KEYTYPE_VARTEXT2) + { + if (!pos->language) + pos->charset=default_charset_info; + else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME)))) + { + my_errno=HA_ERR_UNKNOWN_CHARSET; + goto err; + } + } + else if (pos->type == HA_KEYTYPE_BINARY) + pos->charset= &my_charset_bin; + } + if (share->keyinfo[i].flag & HA_SPATIAL) + { +#ifdef HAVE_SPATIAL + uint sp_segs=SPDIMS*2; + share->keyinfo[i].seg=pos-sp_segs; + share->keyinfo[i].keysegs--; + versioning= 0; +#else + my_errno=HA_ERR_UNSUPPORTED; + goto err; +#endif + } + else if (share->keyinfo[i].flag & HA_FULLTEXT) + { + versioning= 0; + DBUG_ASSERT(fulltext_keys); + { + uint k; + share->keyinfo[i].seg=pos; + for (k=0; k < FT_SEGS; k++) + { + *pos= ft_keysegs[k]; + pos[0].language= pos[-1].language; + if (!(pos[0].charset= pos[-1].charset)) + { + my_errno=HA_ERR_CRASHED; + goto err; + } + pos++; + } + } + if 
(!share->ft2_keyinfo.seg) + { + memcpy(&share->ft2_keyinfo, &share->keyinfo[i], + sizeof(MARIA_KEYDEF)); + share->ft2_keyinfo.keysegs=1; + share->ft2_keyinfo.flag=0; + share->ft2_keyinfo.keylength= + share->ft2_keyinfo.minlength= + share->ft2_keyinfo.maxlength=HA_FT_WLEN+share->base.rec_reflength; + share->ft2_keyinfo.seg=pos-1; + share->ft2_keyinfo.end=pos; + setup_key_functions(& share->ft2_keyinfo); + } + share->keyinfo[i].ftkey_nr= ftkey_nr++; + } + setup_key_functions(share->keyinfo+i); + share->keyinfo[i].end=pos; + pos->type=HA_KEYTYPE_END; /* End */ + pos->length=share->base.rec_reflength; + pos->null_bit=0; + pos->flag=0; /* For purify */ + pos++; + } + for (i=0 ; i < uniques ; i++) + { + disk_pos=_ma_uniquedef_read(disk_pos, &share->uniqueinfo[i]); + disk_pos_assert(disk_pos + share->uniqueinfo[i].keysegs * + HA_KEYSEG_SIZE, end_pos); + share->uniqueinfo[i].seg=pos; + for (j=0 ; j < share->uniqueinfo[i].keysegs; j++,pos++) + { + disk_pos=_ma_keyseg_read(disk_pos, pos); + if (pos->type == HA_KEYTYPE_TEXT || + pos->type == HA_KEYTYPE_VARTEXT1 || + pos->type == HA_KEYTYPE_VARTEXT2) + { + if (!pos->language) + pos->charset=default_charset_info; + else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME)))) + { + my_errno=HA_ERR_UNKNOWN_CHARSET; + goto err; + } + } + } + share->uniqueinfo[i].end=pos; + pos->type=HA_KEYTYPE_END; /* End */ + pos->null_bit=0; + pos->flag=0; + pos++; + } + share->ftkeys= ftkey_nr; + } + share->data_file_type= share->state.header.data_file_type; + share->base_length= (BASE_ROW_HEADER_SIZE + + share->base.is_nulls_extended + + share->base.null_bytes + + share->base.pack_bytes + + test(share->options & HA_OPTION_CHECKSUM)); + share->keypage_header= ((share->base.born_transactional ? 
+ LSN_STORE_SIZE + TRANSID_SIZE : + 0) + KEYPAGE_KEYID_SIZE + KEYPAGE_FLAG_SIZE + + KEYPAGE_USED_SIZE); + share->kfile.file= kfile; + + if (open_flags & HA_OPEN_COPY) + { + /* + this instance will be a temporary one used just to create a data + file for REPAIR. Don't do logging. This base information will not go + to disk. + */ + share->base.born_transactional= FALSE; + } + if (share->base.born_transactional) + { + share->page_type= PAGECACHE_LSN_PAGE; + if (share->state.create_rename_lsn == LSN_NEEDS_NEW_STATE_LSNS) + { + /* + Was repaired with maria_chk, maybe later maria_pack-ed. Some sort of + import into the server. It starts its existence (from the point of + view of the server, including server's recovery) now. + */ + if (((open_flags & HA_OPEN_FROM_SQL_LAYER) && + (share->state.changed & STATE_NOT_MOVABLE)) || maria_in_recovery) + _ma_update_state_lsns_sub(share, LSN_IMPOSSIBLE, + trnman_get_min_safe_trid(), TRUE, TRUE); + } + else if ((!LSN_VALID(share->state.create_rename_lsn) || + !LSN_VALID(share->state.is_of_horizon) || + (cmp_translog_addr(share->state.create_rename_lsn, + share->state.is_of_horizon) > 0) || + !LSN_VALID(share->state.skip_redo_lsn) || + (cmp_translog_addr(share->state.create_rename_lsn, + share->state.skip_redo_lsn) > 0)) && + !(open_flags & HA_OPEN_FOR_REPAIR)) + { + /* + If in Recovery, it will not work. If LSN is invalid and not + LSN_NEEDS_NEW_STATE_LSNS, header must be corrupted. + In both cases, must repair. + */ + my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ? 
+ HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE); + goto err; + } + } + else + share->page_type= PAGECACHE_PLAIN_PAGE; + share->now_transactional= share->base.born_transactional; + + /* Use pack_reclength as we don't want to modify base.pack_recklength */ + if (share->state.header.org_data_file_type == DYNAMIC_RECORD) + { + /* add bits used to pack data to pack_reclength for faster allocation */ + share->base.pack_reclength+= share->base.pack_bytes; + share->base.extra_rec_buff_size= + (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER) + MARIA_SPLIT_LENGTH + + MARIA_REC_BUFF_OFFSET); + } + if (share->data_file_type == COMPRESSED_RECORD) + { + /* Need some extra bytes for decode_bytes */ + share->base.extra_rec_buff_size+= 7; + } + share->base.default_rec_buff_size= max(share->base.pack_reclength + + share->base.extra_rec_buff_size, + share->base.max_key_length); + + disk_pos_assert(disk_pos + share->base.fields *MARIA_COLUMNDEF_SIZE, + end_pos); + for (i= j= 0 ; i < share->base.fields ; i++) + { + disk_pos=_ma_columndef_read(disk_pos,&share->columndef[i]); + share->columndef[i].pack_type=0; + share->columndef[i].huff_tree=0; + if (share->columndef[i].type == FIELD_BLOB) + { + share->blobs[j].pack_length= + share->columndef[i].length-portable_sizeof_char_ptr; + share->blobs[j].offset= share->columndef[i].offset; + j++; + } + } + share->columndef[i].type= FIELD_LAST; /* End marker */ + disk_pos= _ma_column_nr_read(disk_pos, share->column_nr, + share->base.fields); + + if ((share->data_file_type == BLOCK_RECORD || + share->data_file_type == COMPRESSED_RECORD)) + { + if (_ma_open_datafile(&info, share, name, -1)) + goto err; + data_file= info.dfile.file; + } + errpos= 5; + + if (open_flags & HA_OPEN_DELAY_KEY_WRITE) + share->options|= HA_OPTION_DELAY_KEY_WRITE; + if (mode == O_RDONLY) + share->options|= HA_OPTION_READ_ONLY_DATA; + share->is_log_table= FALSE; + + if (open_flags & HA_OPEN_TMP_TABLE) + { + share->options|= HA_OPTION_TMP_TABLE; + share->temporary= 
share->delay_key_write= 1; + share->write_flag=MYF(MY_NABP); + share->w_locks++; /* We don't have to update status */ + share->tot_locks++; + } + + _ma_set_index_pagecache_callbacks(&share->kfile, share); + share->this_process=(ulong) getpid(); +#ifdef EXTERNAL_LOCKING + share->last_process= share->state.process; +#endif + share->base.key_parts=key_parts; + share->base.all_key_parts=key_parts+unique_key_parts; + if (!(share->last_version=share->state.version)) + share->last_version=1; /* Safety */ + share->rec_reflength=share->base.rec_reflength; /* May be changed */ + share->base.margin_key_file_length=(share->base.max_key_file_length - + (keys ? MARIA_INDEX_BLOCK_MARGIN * + share->block_size * keys : 0)); + share->block_size= share->base.block_size; + my_free(disk_cache, MYF(0)); + _ma_setup_functions(share); + if ((*share->once_init)(share, info.dfile.file)) + goto err; + if (share->now_transactional) + { + /* Setup initial state that is visible for all */ + MARIA_STATE_HISTORY_CLOSED *history; + if ((history= (MARIA_STATE_HISTORY_CLOSED *) + hash_search(&maria_stored_state, + (uchar*) &share->state.create_rename_lsn, 0))) + { + /* + Move history from hash to share. This is safe to do as we + don't have a lock on share->intern_lock. 
+ */ + share->state_history= + _ma_remove_not_visible_states(history->state_history, 0, 0); + history->state_history= 0; + (void) hash_delete(&maria_stored_state, (uchar*) history); + } + else + { + /* Table is not part of any active transaction; Create new history */ + if (!(share->state_history= (MARIA_STATE_HISTORY *) + my_malloc(sizeof(*share->state_history), MYF(MY_WME)))) + goto err; + share->state_history->trid= 0; /* Visible by all */ + share->state_history->state= share->state.state; + share->state_history->next= 0; + } + } +#ifdef THREAD + thr_lock_init(&share->lock); + pthread_mutex_init(&share->intern_lock, MY_MUTEX_INIT_FAST); + pthread_mutex_init(&share->key_del_lock, MY_MUTEX_INIT_FAST); + pthread_cond_init(&share->key_del_cond, 0); + pthread_mutex_init(&share->close_lock, MY_MUTEX_INIT_FAST); + for (i=0; i<keys; i++) + VOID(my_rwlock_init(&share->keyinfo[i].root_lock, NULL)); + VOID(my_rwlock_init(&share->mmap_lock, NULL)); + + share->row_is_visible= _ma_row_visible_always; + share->lock.get_status= _ma_reset_update_flag; + if (!thr_lock_inited) + { + /* Probably a single threaded program; Don't use concurrent inserts */ + maria_concurrent_insert=0; + } + else if (maria_concurrent_insert) + { + share->non_transactional_concurrent_insert= + ((share->options & (HA_OPTION_READ_ONLY_DATA | HA_OPTION_TMP_TABLE | + HA_OPTION_COMPRESS_RECORD | + HA_OPTION_TEMP_COMPRESS_RECORD)) || + (open_flags & HA_OPEN_TMP_TABLE) || + share->data_file_type == BLOCK_RECORD || + share->have_rtree) ? 
0 : 1; + if (share->non_transactional_concurrent_insert || + (!share->temporary && share->now_transactional && versioning)) + { + share->lock_key_trees= 1; + if (share->data_file_type == BLOCK_RECORD) + { + DBUG_ASSERT(share->now_transactional); + share->have_versioning= 1; + share->row_is_visible= _ma_row_visible_transactional_table; + share->lock.get_status= _ma_block_get_status; + share->lock.check_status= _ma_block_check_status; + share->lock.start_trans= _ma_block_start_trans; + /* + We can for the moment only allow multiple concurrent inserts + only if there is no auto-increment key. To lift this restriction + we have to: + - Extend statement base replication to support auto-increment + intervalls. + - Fix that we allocate auto-increment in intervals and that + it's properly reset if the interval was not used + */ + share->lock.allow_multiple_concurrent_insert= + share->base.auto_key == 0; + share->lock_restore_status= 0; + } + else + { + share->row_is_visible= _ma_row_visible_non_transactional_table; + share->lock.get_status= _ma_get_status; + share->lock.copy_status= _ma_copy_status; + share->lock.update_status= _ma_update_status; + share->lock.restore_status= _ma_restore_status; + share->lock.check_status= _ma_check_status; + share->lock_restore_status= _ma_restore_status; + } + } + else if (share->now_transactional) + { + DBUG_ASSERT(share->data_file_type == BLOCK_RECORD); + share->lock.start_trans= _ma_block_start_trans_no_versioning; + } + } +#endif + /* + Memory mapping can only be requested after initializing intern_lock. 
+ */ + if (open_flags & HA_OPEN_MMAP) + { + info.s= share; + maria_extra(&info, HA_EXTRA_MMAP, 0); + } + } + else + { + share= old_info->s; + if (share->data_file_type == BLOCK_RECORD) + data_file= share->bitmap.file.file; /* Only opened once */ + } + + if (!(m_info= maria_clone_internal(share, name, mode, data_file))) + goto err; + + if (maria_is_crashed(m_info)) + DBUG_PRINT("warning", ("table is crashed: changed: %u", + share->state.changed)); + + pthread_mutex_unlock(&THR_LOCK_maria); + DBUG_RETURN(m_info); + +err: + DBUG_PRINT("error", ("error: %d errpos: %d", my_errno, errpos)); + save_errno=my_errno ? my_errno : HA_ERR_END_OF_FILE; + if ((save_errno == HA_ERR_CRASHED) || + (save_errno == HA_ERR_CRASHED_ON_USAGE) || + (save_errno == HA_ERR_CRASHED_ON_REPAIR)) + { + LEX_STRING tmp_name; + tmp_name.str= (char*) name; + tmp_name.length= strlen(name); + _ma_report_error(save_errno, &tmp_name); + } + if (save_errno == HA_ERR_OLD_FILE) /* uuid is different ? */ + save_errno= HA_ERR_CRASHED_ON_USAGE; /* the code to trigger auto-repair */ + switch (errpos) { + case 5: + if (data_file >= 0) + VOID(my_close(data_file, MYF(0))); + if (old_info) + break; /* Don't remove open table */ + (*share->once_end)(share); + /* fall through */ + case 4: + my_free(share,MYF(0)); + /* fall through */ + case 3: + my_free(disk_cache, MYF(0)); + /* fall through */ + case 1: + VOID(my_close(kfile,MYF(0))); + /* fall through */ + case 0: + default: + break; + } + pthread_mutex_unlock(&THR_LOCK_maria); + my_errno= save_errno; + DBUG_RETURN (NULL); +} /* maria_open */ + + +/* + Reallocate a buffer, if the current buffer is not large enough +*/ + +my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size, + size_t new_size) +{ + if (*old_size < new_size) + { + uchar *addr; + if (!(addr= (uchar*) my_realloc(*old_addr, new_size, + MYF(MY_ALLOW_ZERO_PTR)))) + return 1; + *old_addr= addr; + *old_size= new_size; + } + return 0; +} + + +ulonglong _ma_safe_mul(ulonglong a, ulonglong b) +{ + 
ulonglong max_val= ~ (ulonglong) 0; /* my_off_t is unsigned */ + + if (!a || max_val / a < b) + return max_val; + return a*b; +} + + /* Set up functions in structs */ + +void _ma_setup_functions(register MARIA_SHARE *share) +{ + share->once_init= maria_once_init_dummy; + share->once_end= maria_once_end_dummy; + share->init= maria_scan_init_dummy; + share->end= maria_scan_end_dummy; + share->scan_init= maria_scan_init_dummy;/* Compat. dummy function */ + share->scan_end= maria_scan_end_dummy;/* Compat. dummy function */ + share->scan_remember_pos= _ma_def_scan_remember_pos; + share->scan_restore_pos= _ma_def_scan_restore_pos; + + share->write_record_init= _ma_write_init_default; + share->write_record_abort= _ma_write_abort_default; + share->keypos_to_recpos= _ma_transparent_recpos; + share->recpos_to_keypos= _ma_transparent_recpos; + + switch (share->data_file_type) { + case COMPRESSED_RECORD: + share->read_record= _ma_read_pack_record; + share->scan= _ma_read_rnd_pack_record; + share->once_init= _ma_once_init_pack_row; + share->once_end= _ma_once_end_pack_row; + /* + Calculate checksum according to data in the original, not compressed, + row. + */ + if (share->state.header.org_data_file_type == STATIC_RECORD && + ! 
(share->options & HA_OPTION_NULL_FIELDS)) + share->calc_checksum= _ma_static_checksum; + else + share->calc_checksum= _ma_checksum; + share->calc_write_checksum= share->calc_checksum; + break; + case DYNAMIC_RECORD: + share->read_record= _ma_read_dynamic_record; + share->scan= _ma_read_rnd_dynamic_record; + share->delete_record= _ma_delete_dynamic_record; + share->compare_record= _ma_cmp_dynamic_record; + share->compare_unique= _ma_cmp_dynamic_unique; + share->calc_checksum= share->calc_write_checksum= _ma_checksum; + if (share->base.blobs) + { + share->update_record= _ma_update_blob_record; + share->write_record= _ma_write_blob_record; + } + else + { + share->write_record= _ma_write_dynamic_record; + share->update_record= _ma_update_dynamic_record; + } + break; + case STATIC_RECORD: + share->read_record= _ma_read_static_record; + share->scan= _ma_read_rnd_static_record; + share->delete_record= _ma_delete_static_record; + share->compare_record= _ma_cmp_static_record; + share->update_record= _ma_update_static_record; + share->write_record= _ma_write_static_record; + share->compare_unique= _ma_cmp_static_unique; + share->keypos_to_recpos= _ma_static_keypos_to_recpos; + share->recpos_to_keypos= _ma_static_recpos_to_keypos; + if (share->state.header.org_data_file_type == STATIC_RECORD && + ! 
(share->options & HA_OPTION_NULL_FIELDS)) + share->calc_checksum= _ma_static_checksum; + else + share->calc_checksum= _ma_checksum; + break; + case BLOCK_RECORD: + share->once_init= _ma_once_init_block_record; + share->once_end= _ma_once_end_block_record; + share->init= _ma_init_block_record; + share->end= _ma_end_block_record; + share->write_record_init= _ma_write_init_block_record; + share->write_record_abort= _ma_write_abort_block_record; + share->scan_init= _ma_scan_init_block_record; + share->scan_end= _ma_scan_end_block_record; + share->scan= _ma_scan_block_record; + share->scan_remember_pos= _ma_scan_remember_block_record; + share->scan_restore_pos= _ma_scan_restore_block_record; + share->read_record= _ma_read_block_record; + share->delete_record= _ma_delete_block_record; + share->compare_record= _ma_compare_block_record; + share->update_record= _ma_update_block_record; + share->write_record= _ma_write_block_record; + share->compare_unique= _ma_cmp_block_unique; + share->calc_checksum= _ma_checksum; + share->keypos_to_recpos= _ma_transaction_keypos_to_recpos; + share->recpos_to_keypos= _ma_transaction_recpos_to_keypos; + + /* + write_block_record() will calculate the checksum; Tell maria_write() + that it doesn't have to do this. 
+ */ + share->calc_write_checksum= 0; + break; + } + share->file_read= _ma_nommap_pread; + share->file_write= _ma_nommap_pwrite; + share->calc_check_checksum= share->calc_checksum; + + if (!(share->options & HA_OPTION_CHECKSUM) && + share->data_file_type != COMPRESSED_RECORD) + share->calc_checksum= share->calc_write_checksum= 0; + return; +} + + +static void setup_key_functions(register MARIA_KEYDEF *keyinfo) +{ + if (keyinfo->key_alg == HA_KEY_ALG_RTREE) + { +#ifdef HAVE_RTREE_KEYS + keyinfo->ck_insert = maria_rtree_insert; + keyinfo->ck_delete = maria_rtree_delete; +#else + DBUG_ASSERT(0); /* maria_open should check it never happens */ +#endif + } + else + { + keyinfo->ck_insert = _ma_ck_write; + keyinfo->ck_delete = _ma_ck_delete; + } + if (keyinfo->flag & HA_SPATIAL) + keyinfo->make_key= _ma_sp_make_key; + else + keyinfo->make_key= _ma_make_key; + + if (keyinfo->flag & HA_BINARY_PACK_KEY) + { /* Simple prefix compression */ + keyinfo->bin_search= _ma_seq_search; + keyinfo->get_key= _ma_get_binary_pack_key; + keyinfo->skip_key= _ma_skip_binary_pack_key; + keyinfo->pack_key= _ma_calc_bin_pack_key_length; + keyinfo->store_key= _ma_store_bin_pack_key; + } + else if (keyinfo->flag & HA_VAR_LENGTH_KEY) + { + keyinfo->get_key= _ma_get_pack_key; + keyinfo->skip_key= _ma_skip_pack_key; + if (keyinfo->seg[0].flag & HA_PACK_KEY) + { /* Prefix compression */ + /* + _ma_prefix_search() compares end-space against ASCII blank (' '). + It cannot be used for character sets, that do not encode the + blank character like ASCII does. UCS2 is an example. All + character sets with a fixed width > 1 or a mimimum width > 1 + cannot represent blank like ASCII does. In these cases we have + to use _ma_seq_search() for the search. 
+ */ + if (!keyinfo->seg->charset || use_strnxfrm(keyinfo->seg->charset) || + (keyinfo->seg->flag & HA_NULL_PART) || + keyinfo->seg->charset->mbminlen > 1) + keyinfo->bin_search= _ma_seq_search; + else + keyinfo->bin_search= _ma_prefix_search; + keyinfo->pack_key= _ma_calc_var_pack_key_length; + keyinfo->store_key= _ma_store_var_pack_key; + } + else + { + keyinfo->bin_search= _ma_seq_search; + keyinfo->pack_key= _ma_calc_var_key_length; /* Variable length key */ + keyinfo->store_key= _ma_store_static_key; + } + } + else + { + keyinfo->bin_search= _ma_bin_search; + keyinfo->get_key= _ma_get_static_key; + keyinfo->skip_key= _ma_skip_static_key; + keyinfo->pack_key= _ma_calc_static_key_length; + keyinfo->store_key= _ma_store_static_key; + } + + /* set keyinfo->write_comp_flag */ + if (keyinfo->flag & HA_SORT_ALLOWS_SAME) + keyinfo->write_comp_flag=SEARCH_BIGGER; /* Put after same key */ + else if (keyinfo->flag & ( HA_NOSAME | HA_FULLTEXT)) + { + keyinfo->write_comp_flag= SEARCH_FIND | SEARCH_UPDATE; /* No duplicates */ + if (keyinfo->flag & HA_NULL_ARE_EQUAL) + keyinfo->write_comp_flag|= SEARCH_NULL_ARE_EQUAL; + } + else + keyinfo->write_comp_flag= SEARCH_SAME; /* Keys in rec-pos order */ + keyinfo->write_comp_flag|= SEARCH_INSERT; + return; +} + + +/** + @brief Function to save and store the header in the index file (.MYI) + + Operates under MARIA_SHARE::intern_lock if requested. + Sets MARIA_SHARE::MARIA_STATE_INFO::is_of_horizon if transactional table. + Then calls _ma_state_info_write_sub(). + + @param share table + @param pWrite bitmap: if 1 (MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET) + is set my_pwrite() is used otherwise my_write(); + if 2 (MA_STATE_INFO_WRITE_FULL_INFO) is set, info + about keys is written (should only be needed + after ALTER TABLE ENABLE/DISABLE KEYS, and + REPAIR/OPTIMIZE); if 4 (MA_STATE_INFO_WRITE_LOCK) + is set, MARIA_SHARE::intern_lock is taken. 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite) +{ + uint res; + if (share->options & HA_OPTION_READ_ONLY_DATA) + return 0; + + if (pWrite & MA_STATE_INFO_WRITE_LOCK) + pthread_mutex_lock(&share->intern_lock); + else if (maria_multi_threaded) + { + safe_mutex_assert_owner(&share->intern_lock); + } + if (share->base.born_transactional && translog_status == TRANSLOG_OK && + !maria_in_recovery) + { + /* + In a recovery, we want to set is_of_horizon to the LSN of the last + record executed by Recovery, not the current EOF of the log (which + is too new). Recovery does it by itself. + */ + share->state.is_of_horizon= translog_get_horizon(); + DBUG_PRINT("info", ("is_of_horizon set to LSN (%lu,0x%lx)", + LSN_IN_PARTS(share->state.is_of_horizon))); + } + res= _ma_state_info_write_sub(share->kfile.file, &share->state, pWrite); + if (pWrite & MA_STATE_INFO_WRITE_LOCK) + pthread_mutex_unlock(&share->intern_lock); + share->changed= 0; + return res; +} + + +/** + @brief Function to save and store the header in the index file (.MYI). + + Shortcut to use instead of _ma_state_info_write() when appropriate. + + @param file descriptor of the index file to write + @param state state information to write to the file + @param pWrite bitmap: if 1 (MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET) + is set my_pwrite() is used otherwise my_write(); + if 2 (MA_STATE_INFO_WRITE_FULL_INFO) is set, info + about keys is written (should only be needed + after ALTER TABLE ENABLE/DISABLE KEYS, and + REPAIR/OPTIMIZE). + + @notes + For transactional multiuser tables, this function is called + with intern_lock & translog_lock or when the last thread who + is using the table is closing it. + Because of the translog_lock we don't need to have a lock on + key_del_lock. 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite) +{ + uchar buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE]; + uchar *ptr=buff; + uint i, keys= (uint) state->header.keys; + size_t res; + DBUG_ENTER("_ma_state_info_write_sub"); + + memcpy_fixed(ptr,&state->header,sizeof(state->header)); + ptr+=sizeof(state->header); + + /* open_count must be first because of _ma_mark_file_changed ! */ + mi_int2store(ptr,state->open_count); ptr+= 2; + /* changed must be second, because of _ma_mark_file_crashed */ + mi_int2store(ptr,state->changed); ptr+= 2; + + /* + If you change the offset of these LSNs, note that some functions do a + direct write of them without going through this function. + */ + lsn_store(ptr, state->create_rename_lsn); ptr+= LSN_STORE_SIZE; + lsn_store(ptr, state->is_of_horizon); ptr+= LSN_STORE_SIZE; + lsn_store(ptr, state->skip_redo_lsn); ptr+= LSN_STORE_SIZE; + mi_rowstore(ptr,state->state.records); ptr+= 8; + mi_rowstore(ptr,state->state.del); ptr+= 8; + mi_rowstore(ptr,state->split); ptr+= 8; + mi_sizestore(ptr,state->dellink); ptr+= 8; + mi_sizestore(ptr,state->first_bitmap_with_space); ptr+= 8; + mi_sizestore(ptr,state->state.key_file_length); ptr+= 8; + mi_sizestore(ptr,state->state.data_file_length); ptr+= 8; + mi_sizestore(ptr,state->state.empty); ptr+= 8; + mi_sizestore(ptr,state->state.key_empty); ptr+= 8; + mi_int8store(ptr,state->auto_increment); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->state.checksum); ptr+= 8; + mi_int8store(ptr,state->create_trid); ptr+= 8; + mi_int4store(ptr,state->status); ptr+= 4; + mi_int4store(ptr,state->update_count); ptr+= 4; + *ptr++= state->sortkey; + *ptr++= 0; /* Reserved */ + ptr+= state->state_diff_length; + + for (i=0; i < keys; i++) + { + mi_sizestore(ptr,state->key_root[i]); ptr+= 8; + } + mi_sizestore(ptr,state->key_del); ptr+= 8; + if (pWrite & MA_STATE_INFO_WRITE_FULL_INFO) /* From maria_chk */ + { + 
uint key_parts= mi_uint2korr(state->header.key_parts); + mi_int4store(ptr,state->sec_index_changed); ptr+= 4; + mi_int4store(ptr,state->sec_index_used); ptr+= 4; + mi_int4store(ptr,state->version); ptr+= 4; + mi_int8store(ptr,state->key_map); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->create_time); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->recover_time); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->check_time); ptr+= 8; + mi_sizestore(ptr, state->records_at_analyze); ptr+= 8; + /* reserve place for some information per key */ + bzero(ptr, keys*4); ptr+= keys*4; + for (i=0 ; i < key_parts ; i++) + { + float8store(ptr, state->rec_per_key_part[i]); ptr+= 8; + mi_int4store(ptr, state->nulls_per_key_part[i]); ptr+= 4; + } + } + + res= (pWrite & MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET) ? + my_pwrite(file, buff, (size_t) (ptr-buff), 0L, + MYF(MY_NABP | MY_THREADSAFE)) : + my_write(file, buff, (size_t) (ptr-buff), + MYF(MY_NABP)); + DBUG_RETURN(res != 0); +} + + +static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state) +{ + uint i,keys,key_parts; + memcpy_fixed(&state->header,ptr, sizeof(state->header)); + ptr+= sizeof(state->header); + keys= (uint) state->header.keys; + key_parts= mi_uint2korr(state->header.key_parts); + + state->open_count = mi_uint2korr(ptr); ptr+= 2; + state->changed= mi_uint2korr(ptr); ptr+= 2; + state->create_rename_lsn= lsn_korr(ptr); ptr+= LSN_STORE_SIZE; + state->is_of_horizon= lsn_korr(ptr); ptr+= LSN_STORE_SIZE; + state->skip_redo_lsn= lsn_korr(ptr); ptr+= LSN_STORE_SIZE; + state->state.records= mi_rowkorr(ptr); ptr+= 8; + state->state.del = mi_rowkorr(ptr); ptr+= 8; + state->split = mi_rowkorr(ptr); ptr+= 8; + state->dellink= mi_sizekorr(ptr); ptr+= 8; + state->first_bitmap_with_space= mi_sizekorr(ptr); ptr+= 8; + state->state.key_file_length = mi_sizekorr(ptr); ptr+= 8; + state->state.data_file_length= mi_sizekorr(ptr); ptr+= 8; + state->state.empty = mi_sizekorr(ptr); ptr+= 8; + state->state.key_empty= mi_sizekorr(ptr); 
ptr+= 8; + state->auto_increment=mi_uint8korr(ptr); ptr+= 8; + state->state.checksum=(ha_checksum) mi_uint8korr(ptr);ptr+= 8; + state->create_trid= mi_uint8korr(ptr); ptr+= 8; + state->status = mi_uint4korr(ptr); ptr+= 4; + state->update_count=mi_uint4korr(ptr); ptr+= 4; + state->sortkey= (uint) *ptr++; + ptr++; /* reserved */ + + ptr+= state->state_diff_length; + + for (i=0; i < keys; i++) + { + state->key_root[i]= mi_sizekorr(ptr); ptr+= 8; + } + state->key_del= mi_sizekorr(ptr); ptr+= 8; + state->sec_index_changed = mi_uint4korr(ptr); ptr+= 4; + state->sec_index_used = mi_uint4korr(ptr); ptr+= 4; + state->version = mi_uint4korr(ptr); ptr+= 4; + state->key_map = mi_uint8korr(ptr); ptr+= 8; + state->create_time = (time_t) mi_sizekorr(ptr); ptr+= 8; + state->recover_time =(time_t) mi_sizekorr(ptr); ptr+= 8; + state->check_time = (time_t) mi_sizekorr(ptr); ptr+= 8; + state->records_at_analyze= mi_sizekorr(ptr); ptr+= 8; + ptr+= keys * 4; /* Skip reserved bytes */ + for (i=0 ; i < key_parts ; i++) + { + float8get(state->rec_per_key_part[i], ptr); ptr+= 8; + state->nulls_per_key_part[i]= mi_uint4korr(ptr); ptr+= 4; + } + return ptr; +} + + +/** + @brief Fills the state by reading its copy on disk. + + Should not be called for transactional tables, as their state on disk is + rarely current and so is often misleading for a reader. + Does nothing in single user mode. 
+ + @param file file to read from + @param state state which will be filled +*/ + +uint _ma_state_info_read_dsk(File file __attribute__((unused)), + MARIA_STATE_INFO *state __attribute__((unused))) +{ +#ifdef EXTERNAL_LOCKING + uchar buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE]; + + /* trick to detect transactional tables */ + DBUG_ASSERT(state->create_rename_lsn == LSN_IMPOSSIBLE); + if (!maria_single_user) + { + if (my_pread(file, buff, state->state_length, 0L, MYF(MY_NABP))) + return 1; + _ma_state_info_read(buff, state); + } +#endif + return 0; +} + + +/**************************************************************************** +** store and read of MARIA_BASE_INFO +****************************************************************************/ + +uint _ma_base_info_write(File file, MARIA_BASE_INFO *base) +{ + uchar buff[MARIA_BASE_INFO_SIZE], *ptr=buff; + + bmove(ptr, maria_uuid, MY_UUID_SIZE); + ptr+= MY_UUID_SIZE; + mi_sizestore(ptr,base->keystart); ptr+= 8; + mi_sizestore(ptr,base->max_data_file_length); ptr+= 8; + mi_sizestore(ptr,base->max_key_file_length); ptr+= 8; + mi_rowstore(ptr,base->records); ptr+= 8; + mi_rowstore(ptr,base->reloc); ptr+= 8; + mi_int4store(ptr,base->mean_row_length); ptr+= 4; + mi_int4store(ptr,base->reclength); ptr+= 4; + mi_int4store(ptr,base->pack_reclength); ptr+= 4; + mi_int4store(ptr,base->min_pack_length); ptr+= 4; + mi_int4store(ptr,base->max_pack_length); ptr+= 4; + mi_int4store(ptr,base->min_block_length); ptr+= 4; + mi_int2store(ptr,base->fields); ptr+= 2; + mi_int2store(ptr,base->fixed_not_null_fields); ptr+= 2; + mi_int2store(ptr,base->fixed_not_null_fields_length); ptr+= 2; + mi_int2store(ptr,base->max_field_lengths); ptr+= 2; + mi_int2store(ptr,base->pack_fields); ptr+= 2; + mi_int2store(ptr,base->extra_options) ptr+= 2; + mi_int2store(ptr,base->null_bytes); ptr+= 2; + mi_int2store(ptr,base->original_null_bytes); ptr+= 2; + mi_int2store(ptr,base->field_offsets); ptr+= 2; + mi_int2store(ptr,0); ptr+= 2; /* 
reserved */ + mi_int2store(ptr,base->block_size); ptr+= 2; + *ptr++= base->rec_reflength; + *ptr++= base->key_reflength; + *ptr++= base->keys; + *ptr++= base->auto_key; + *ptr++= base->born_transactional; + *ptr++= 0; /* Reserved */ + mi_int2store(ptr,base->pack_bytes); ptr+= 2; + mi_int2store(ptr,base->blobs); ptr+= 2; + mi_int2store(ptr,base->max_key_block_length); ptr+= 2; + mi_int2store(ptr,base->max_key_length); ptr+= 2; + mi_int2store(ptr,base->extra_alloc_bytes); ptr+= 2; + *ptr++= base->extra_alloc_procent; + bzero(ptr,16); ptr+= 16; /* extra */ + DBUG_ASSERT((ptr - buff) == MARIA_BASE_INFO_SIZE); + return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + + +static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base) +{ + bmove(base->uuid, ptr, MY_UUID_SIZE); ptr+= MY_UUID_SIZE; + base->keystart= mi_sizekorr(ptr); ptr+= 8; + base->max_data_file_length= mi_sizekorr(ptr); ptr+= 8; + base->max_key_file_length= mi_sizekorr(ptr); ptr+= 8; + base->records= (ha_rows) mi_sizekorr(ptr); ptr+= 8; + base->reloc= (ha_rows) mi_sizekorr(ptr); ptr+= 8; + base->mean_row_length= mi_uint4korr(ptr); ptr+= 4; + base->reclength= mi_uint4korr(ptr); ptr+= 4; + base->pack_reclength= mi_uint4korr(ptr); ptr+= 4; + base->min_pack_length= mi_uint4korr(ptr); ptr+= 4; + base->max_pack_length= mi_uint4korr(ptr); ptr+= 4; + base->min_block_length= mi_uint4korr(ptr); ptr+= 4; + base->fields= mi_uint2korr(ptr); ptr+= 2; + base->fixed_not_null_fields= mi_uint2korr(ptr); ptr+= 2; + base->fixed_not_null_fields_length= mi_uint2korr(ptr);ptr+= 2; + base->max_field_lengths= mi_uint2korr(ptr); ptr+= 2; + base->pack_fields= mi_uint2korr(ptr); ptr+= 2; + base->extra_options= mi_uint2korr(ptr); ptr+= 2; + base->null_bytes= mi_uint2korr(ptr); ptr+= 2; + base->original_null_bytes= mi_uint2korr(ptr); ptr+= 2; + base->field_offsets= mi_uint2korr(ptr); ptr+= 2; + ptr+= 2; + base->block_size= mi_uint2korr(ptr); ptr+= 2; + + base->rec_reflength= *ptr++; + base->key_reflength= 
*ptr++; + base->keys= *ptr++; + base->auto_key= *ptr++; + base->born_transactional= *ptr++; + ptr++; + base->pack_bytes= mi_uint2korr(ptr); ptr+= 2; + base->blobs= mi_uint2korr(ptr); ptr+= 2; + base->max_key_block_length= mi_uint2korr(ptr); ptr+= 2; + base->max_key_length= mi_uint2korr(ptr); ptr+= 2; + base->extra_alloc_bytes= mi_uint2korr(ptr); ptr+= 2; + base->extra_alloc_procent= *ptr++; + ptr+= 16; + return ptr; +} + +/*-------------------------------------------------------------------------- + maria_keydef +---------------------------------------------------------------------------*/ + +my_bool _ma_keydef_write(File file, MARIA_KEYDEF *keydef) +{ + uchar buff[MARIA_KEYDEF_SIZE]; + uchar *ptr=buff; + + *ptr++= (uchar) keydef->keysegs; + *ptr++= keydef->key_alg; /* Rtree or Btree */ + mi_int2store(ptr,keydef->flag); ptr+= 2; + mi_int2store(ptr,keydef->block_length); ptr+= 2; + mi_int2store(ptr,keydef->keylength); ptr+= 2; + mi_int2store(ptr,keydef->minlength); ptr+= 2; + mi_int2store(ptr,keydef->maxlength); ptr+= 2; + return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + +uchar *_ma_keydef_read(uchar *ptr, MARIA_KEYDEF *keydef) +{ + keydef->keysegs = (uint) *ptr++; + keydef->key_alg = *ptr++; /* Rtree or Btree */ + + keydef->flag = mi_uint2korr(ptr); ptr+= 2; + keydef->block_length = mi_uint2korr(ptr); ptr+= 2; + keydef->keylength = mi_uint2korr(ptr); ptr+= 2; + keydef->minlength = mi_uint2korr(ptr); ptr+= 2; + keydef->maxlength = mi_uint2korr(ptr); ptr+= 2; + keydef->underflow_block_length=keydef->block_length/3; + keydef->version = 0; /* Not saved */ + keydef->parser = &ft_default_parser; + keydef->ftkey_nr = 0; + return ptr; +} + +/*************************************************************************** +** maria_keyseg +***************************************************************************/ + +my_bool _ma_keyseg_write(File file, const HA_KEYSEG *keyseg) +{ + uchar buff[HA_KEYSEG_SIZE]; + uchar *ptr=buff; + ulong pos; + + *ptr++= 
keyseg->type; + *ptr++= keyseg->language; + *ptr++= keyseg->null_bit; + *ptr++= keyseg->bit_start; + *ptr++= keyseg->bit_end; + *ptr++= keyseg->bit_length; + mi_int2store(ptr,keyseg->flag); ptr+= 2; + mi_int2store(ptr,keyseg->length); ptr+= 2; + mi_int4store(ptr,keyseg->start); ptr+= 4; + pos= keyseg->null_bit ? keyseg->null_pos : keyseg->bit_pos; + mi_int4store(ptr, pos); + ptr+=4; + + return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + + +uchar *_ma_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg) +{ + keyseg->type = *ptr++; + keyseg->language = *ptr++; + keyseg->null_bit = *ptr++; + keyseg->bit_start = *ptr++; + keyseg->bit_end = *ptr++; + keyseg->bit_length = *ptr++; + keyseg->flag = mi_uint2korr(ptr); ptr+= 2; + keyseg->length = mi_uint2korr(ptr); ptr+= 2; + keyseg->start = mi_uint4korr(ptr); ptr+= 4; + keyseg->null_pos = mi_uint4korr(ptr); ptr+= 4; + keyseg->charset=0; /* Will be filled in later */ + if (keyseg->null_bit) + keyseg->bit_pos= (uint16)(keyseg->null_pos + (keyseg->null_bit == 7)); + else + { + keyseg->bit_pos= (uint16)keyseg->null_pos; + keyseg->null_pos= 0; + } + return ptr; +} + +/*-------------------------------------------------------------------------- + maria_uniquedef +---------------------------------------------------------------------------*/ + +my_bool _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *def) +{ + uchar buff[MARIA_UNIQUEDEF_SIZE]; + uchar *ptr=buff; + + mi_int2store(ptr,def->keysegs); ptr+=2; + *ptr++= (uchar) def->key; + *ptr++ = (uchar) def->null_are_equal; + + return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + +uchar *_ma_uniquedef_read(uchar *ptr, MARIA_UNIQUEDEF *def) +{ + def->keysegs = mi_uint2korr(ptr); + def->key = ptr[2]; + def->null_are_equal=ptr[3]; + return ptr+4; /* 1 extra uchar */ +} + +/*************************************************************************** +** MARIA_COLUMNDEF +***************************************************************************/ + +my_bool 
_ma_columndef_write(File file, MARIA_COLUMNDEF *columndef) +{ + uchar buff[MARIA_COLUMNDEF_SIZE]; + uchar *ptr=buff; + + mi_int2store(ptr,(ulong) columndef->column_nr); ptr+= 2; + mi_int2store(ptr,(ulong) columndef->offset); ptr+= 2; + mi_int2store(ptr,columndef->type); ptr+= 2; + mi_int2store(ptr,columndef->length); ptr+= 2; + mi_int2store(ptr,columndef->fill_length); ptr+= 2; + mi_int2store(ptr,columndef->null_pos); ptr+= 2; + mi_int2store(ptr,columndef->empty_pos); ptr+= 2; + + (*ptr++)= columndef->null_bit; + (*ptr++)= columndef->empty_bit; + ptr[0]= ptr[1]= ptr[2]= ptr[3]= 0; ptr+= 4; /* For future */ + return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + +uchar *_ma_columndef_read(uchar *ptr, MARIA_COLUMNDEF *columndef) +{ + columndef->column_nr= mi_uint2korr(ptr); ptr+= 2; + columndef->offset= mi_uint2korr(ptr); ptr+= 2; + columndef->type= mi_sint2korr(ptr); ptr+= 2; + columndef->length= mi_uint2korr(ptr); ptr+= 2; + columndef->fill_length= mi_uint2korr(ptr); ptr+= 2; + columndef->null_pos= mi_uint2korr(ptr); ptr+= 2; + columndef->empty_pos= mi_uint2korr(ptr); ptr+= 2; + columndef->null_bit= (uint8) *ptr++; + columndef->empty_bit= (uint8) *ptr++; + ptr+= 4; + return ptr; +} + +my_bool _ma_column_nr_write(File file, uint16 *offsets, uint columns) +{ + uchar *buff, *ptr, *end; + size_t size= columns*2; + my_bool res; + + if (!(buff= (uchar*) my_alloca(size))) + return 1; + for (ptr= buff, end= ptr + size; ptr < end ; ptr+= 2, offsets++) + int2store(ptr, *offsets); + res= my_write(file, buff, size, MYF(MY_NABP)) != 0; + my_afree(buff); + return res; +} + + +uchar *_ma_column_nr_read(uchar *ptr, uint16 *offsets, uint columns) +{ + uchar *end; + size_t size= columns*2; + for (end= ptr + size; ptr < end ; ptr+=2, offsets++) + *offsets= uint2korr(ptr); + return ptr; +} + +/** + @brief Set callbacks for data pages + + @note + We don't use pagecache_file_init here, as we want to keep the + code readable +*/ + +void 
_ma_set_data_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share) +{ + file->callback_data= (uchar*) share; + file->flush_log_callback= &maria_flush_log_for_page_none; /* Do nothing */ + + if (share->temporary) + { + file->read_callback= &maria_page_crc_check_none; + file->write_callback= &maria_page_filler_set_none; + } + else + { + file->read_callback= &maria_page_crc_check_data; + if (share->options & HA_OPTION_PAGE_CHECKSUM) + file->write_callback= &maria_page_crc_set_normal; + else + file->write_callback= &maria_page_filler_set_normal; + if (share->now_transactional) + file->flush_log_callback= maria_flush_log_for_page; + } +} + + +/** + @brief Set callbacks for index pages + + @note + We don't use pagecache_file_init here, as we want to keep the + code readable +*/ + +void _ma_set_index_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share) +{ + file->callback_data= (uchar*) share; + file->flush_log_callback= &maria_flush_log_for_page_none; /* Do nothing */ + file->write_fail= maria_page_write_failure; + + if (share->temporary) + { + file->read_callback= &maria_page_crc_check_none; + file->write_callback= &maria_page_filler_set_none; + } + else + { + file->read_callback= &maria_page_crc_check_index; + if (share->options & HA_OPTION_PAGE_CHECKSUM) + file->write_callback= &maria_page_crc_set_index; + else + file->write_callback= &maria_page_filler_set_normal; + + if (share->now_transactional) + file->flush_log_callback= maria_flush_log_for_page; + } +} + + +/************************************************************************** + Open data file + We can't use dup() here as the data file descriptors need to have different + active seek-positions. + + The argument file_to_dup is here for the future if there would on some OS + exist a dup()-like call that would give us two different file descriptors. 
+*************************************************************************/ + +int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share, const char *org_name, + File file_to_dup __attribute__((unused))) +{ + char *data_name= share->data_file_name.str; + char real_data_name[FN_REFLEN]; + + if (org_name) + { + fn_format(real_data_name, org_name, "", MARIA_NAME_DEXT, 4); + if (my_is_symlink(real_data_name)) + { + if (my_realpath(real_data_name, real_data_name, MYF(0)) || + (*maria_test_invalid_symlink)(real_data_name)) + { + my_errno= HA_WRONG_CREATE_OPTION; + return 1; + } + data_name= real_data_name; + } + } + + info->dfile.file= share->bitmap.file.file= + my_open(share->data_file_name.str, share->mode | O_SHARE, + MYF(MY_WME)); + return info->dfile.file >= 0 ? 0 : 1; +} + + +int _ma_open_keyfile(MARIA_SHARE *share) +{ + /* + Modifications to share->kfile should be under intern_lock to protect + against a concurrent checkpoint. + */ + pthread_mutex_lock(&share->intern_lock); + share->kfile.file= my_open(share->unique_file_name.str, + share->mode | O_SHARE, + MYF(MY_WME)); + pthread_mutex_unlock(&share->intern_lock); + return (share->kfile.file < 0); +} + + +/* + Disable all indexes. + + SYNOPSIS + maria_disable_indexes() + info A pointer to the MARIA storage engine MARIA_HA struct. + + DESCRIPTION + Disable all indexes. + + RETURN + 0 ok +*/ + +int maria_disable_indexes(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + + maria_clear_all_keys_active(share->state.key_map); + return 0; +} + + +/* + Enable all indexes + + SYNOPSIS + maria_enable_indexes() + info A pointer to the MARIA storage engine MARIA_HA struct. + + DESCRIPTION + Enable all indexes. The indexes might have been disabled + by maria_disable_index() before. + The function works only if both data and indexes are empty, + otherwise a repair is required. + To be sure, call handler::delete_all_rows() before. + + RETURN + 0 ok + HA_ERR_CRASHED data or index is non-empty. 
+*/ + +int maria_enable_indexes(MARIA_HA *info) +{ + int error= 0; + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_enable_indexes"); + + if ((share->state.state.data_file_length != + (share->data_file_type == BLOCK_RECORD ? share->block_size : 0)) || + (share->state.state.key_file_length != share->base.keystart)) + { + DBUG_PRINT("error", ("data_file_length: %lu key_file_length: %lu", + (ulong) share->state.state.data_file_length, + (ulong) share->state.state.key_file_length)); + maria_print_error(info->s, HA_ERR_CRASHED); + error= HA_ERR_CRASHED; + } + else + maria_set_all_keys_active(share->state.key_map, share->base.keys); + DBUG_RETURN(error); +} + + +/* + Test if indexes are disabled. + + SYNOPSIS + maria_indexes_are_disabled() + info A pointer to the MARIA storage engine MARIA_HA struct. + + DESCRIPTION + Test if indexes are disabled. + + RETURN + 0 indexes are not disabled + 1 all indexes are disabled + 2 non-unique indexes are disabled +*/ + +int maria_indexes_are_disabled(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + + /* + No keys or all are enabled. keys is the number of keys. Left shifted + gives us only one bit set. When decreased by one, gives us all all bits + up to this one set and it gets unset. + */ + if (!share->base.keys || + (maria_is_all_keys_active(share->state.key_map, share->base.keys))) + return 0; + + /* All are disabled */ + if (maria_is_any_key_active(share->state.key_map)) + return 1; + + /* + We have keys. Some enabled, some disabled. 
+ Don't check for any non-unique disabled but return directly 2 + */ + return 2; +} + + +static my_bool maria_scan_init_dummy(MARIA_HA *info __attribute__((unused))) +{ + return 0; +} + +static void maria_scan_end_dummy(MARIA_HA *info __attribute__((unused))) +{ +} + +static my_bool maria_once_init_dummy(MARIA_SHARE *share + __attribute__((unused)), + File dfile __attribute__((unused))) +{ + return 0; +} + +static my_bool maria_once_end_dummy(MARIA_SHARE *share __attribute__((unused))) +{ + return 0; +} diff --git a/storage/maria/ma_packrec.c b/storage/maria/ma_packrec.c new file mode 100644 index 00000000000..4df00d9bb88 --- /dev/null +++ b/storage/maria/ma_packrec.c @@ -0,0 +1,1723 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + /* Functions to compressed records */ + +#include "maria_def.h" + +#define IS_CHAR ((uint) 32768) /* Bit if char (not offset) in tree */ + +/* Some definitions to keep in sync with maria_pack.c */ +#define HEAD_LENGTH 32 /* Length of fixed header */ + +#if INT_MAX > 32767 +#define BITS_SAVED 32 +#define MAX_QUICK_TABLE_BITS 9 /* Because we may shift in 24 bits */ +#else +#define BITS_SAVED 16 +#define MAX_QUICK_TABLE_BITS 6 +#endif + +#define get_bit(BU) ((BU)->bits ? 
\ + (BU)->current_byte & ((maria_bit_type) 1 << --(BU)->bits) :\ + (fill_buffer(BU), (BU)->bits= BITS_SAVED-1,\ + (BU)->current_byte & ((maria_bit_type) 1 << (BITS_SAVED-1)))) +#define skip_to_next_byte(BU) ((BU)->bits&=~7) +#define get_bits(BU,count) (((BU)->bits >= count) ? (((BU)->current_byte >> ((BU)->bits-=count)) & mask[count]) : fill_and_get_bits(BU,count)) + +#define decode_bytes_test_bit(bit) \ + if (low_byte & (1 << (7-bit))) \ + pos++; \ + if (*pos & IS_CHAR) \ + { bits-=(bit+1); break; } \ + pos+= *pos + +/* + Size in uint16 of a Huffman tree for uchar compression of 256 uchar values +*/ +#define OFFSET_TABLE_SIZE 512 + +static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file, + pbool fix_keys); +static uint read_huff_table(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree, + uint16 **decode_table,uchar **intervall_buff, + uint16 *tmp_buff); +static void make_quick_table(uint16 *to_table,uint16 *decode_table, + uint *next_free,uint value,uint bits, + uint max_bits); +static void fill_quick_table(uint16 *table,uint bits, uint max_bits, + uint value); +static uint copy_decode_table(uint16 *to_pos,uint offset, + uint16 *decode_table); +static uint find_longest_bitstream(uint16 *table, uint16 *end); +static void (*get_unpack_function(MARIA_COLUMNDEF *rec))(MARIA_COLUMNDEF *field, + MARIA_BIT_BUFF *buff, + uchar *to, + uchar *end); +static void uf_zerofill_skip_zero(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_skip_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_endspace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar 
*to,uchar *end); +static void uf_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_prespace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_zerofill_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_constant(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_intervall(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static uint decode_pos(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree); +static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff,uchar *buffer, + uint length); +static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff,uint count); +static void fill_buffer(MARIA_BIT_BUFF *bit_buff); +static uint max_bit(uint value); +static uint read_pack_length(uint version, const uchar *buf, ulong *length); +#ifdef HAVE_MMAP +static uchar *_ma_mempack_get_block_info(MARIA_HA *maria, + MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO *info, + uchar **rec_buff_p, + size_t *rec_buff_size_p, + uchar *header); +#endif + +static maria_bit_type mask[]= +{ + 0x00000000, + 0x00000001, 
0x00000003, 0x00000007, 0x0000000f, + 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, + 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, + 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, +#if BITS_SAVED > 16 + 0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff, + 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff, + 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, + 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff, +#endif +}; + + +my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile) +{ + share->options|= HA_OPTION_READ_ONLY_DATA; + return (_ma_read_pack_info(share, dfile, + (pbool) + test(!(share->options & + (HA_OPTION_PACK_RECORD | + HA_OPTION_TEMP_COMPRESS_RECORD))))); +} + + +my_bool _ma_once_end_pack_row(MARIA_SHARE *share) +{ + if (share->decode_trees) + { + my_free(share->decode_trees,MYF(0)); + my_free(share->decode_tables,MYF(0)); + } + return 0; +} + + +/* Read all packed info, allocate memory and fix field structs */ + +static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file, + pbool fix_keys) +{ + int diff_length; + uint i,trees,huff_tree_bits,rec_reflength,length; + uint16 *decode_table,*tmp_buff; + ulong elements,intervall_length; + uchar *disk_cache; + uchar *intervall_buff; + uchar header[HEAD_LENGTH]; + MARIA_BIT_BUFF bit_buff; + DBUG_ENTER("_ma_read_pack_info"); + + if (maria_quick_table_bits < 4) + maria_quick_table_bits=4; + else if (maria_quick_table_bits > MAX_QUICK_TABLE_BITS) + maria_quick_table_bits=MAX_QUICK_TABLE_BITS; + + my_errno=0; + if (my_read(file, header, sizeof(header), MYF(MY_NABP))) + { + if (!my_errno) + my_errno=HA_ERR_END_OF_FILE; + goto err0; + } + /* Only the first three bytes of magic number are independent of version. 
*/ + if (memcmp(header, maria_pack_file_magic, 3)) + { + my_errno=HA_ERR_WRONG_IN_RECORD; + goto err0; + } + share->pack.version= header[3]; /* fourth uchar of magic number */ + share->pack.header_length= uint4korr(header+4); + share->min_pack_length=(uint) uint4korr(header+8); + share->max_pack_length=(uint) uint4korr(header+12); + set_if_bigger(share->base.default_rec_buff_size, + share->max_pack_length + 7); + elements=uint4korr(header+16); + intervall_length=uint4korr(header+20); + trees=uint2korr(header+24); + share->pack.ref_length=header[26]; + rec_reflength=header[27]; + diff_length=(int) rec_reflength - (int) share->base.rec_reflength; + if (fix_keys) + share->rec_reflength=rec_reflength; + DBUG_PRINT("info", ("fixed header length: %u", HEAD_LENGTH)); + DBUG_PRINT("info", ("total header length: %lu", share->pack.header_length)); + DBUG_PRINT("info", ("pack file version: %u", share->pack.version)); + DBUG_PRINT("info", ("min pack length: %lu", share->min_pack_length)); + DBUG_PRINT("info", ("max pack length: %lu", share->max_pack_length)); + DBUG_PRINT("info", ("elements of all trees: %lu", elements)); + DBUG_PRINT("info", ("distinct values bytes: %lu", intervall_length)); + DBUG_PRINT("info", ("number of code trees: %u", trees)); + DBUG_PRINT("info", ("bytes for record lgt: %u", share->pack.ref_length)); + DBUG_PRINT("info", ("record pointer length: %u", rec_reflength)); + + + /* + Memory segment #1: + - Decode tree heads + - Distinct column values + */ + if (!(share->decode_trees=(MARIA_DECODE_TREE*) + my_malloc((uint) (trees*sizeof(MARIA_DECODE_TREE)+ + intervall_length*sizeof(uchar)), + MYF(MY_WME)))) + goto err0; + intervall_buff=(uchar*) (share->decode_trees+trees); + + /* + Memory segment #2: + - Decode tables + - Quick decode tables + - Temporary decode table + - Compressed data file header cache + This segment will be reallocated after construction of the tables. 
+ */ + length=(uint) (elements*2+trees*(1 << maria_quick_table_bits)); + if (!(share->decode_tables=(uint16*) + my_malloc((length+OFFSET_TABLE_SIZE)*sizeof(uint16)+ + (uint) (share->pack.header_length - sizeof(header)) + + share->base.extra_rec_buff_size, + MYF(MY_WME | MY_ZEROFILL)))) + goto err1; + tmp_buff=share->decode_tables+length; + disk_cache=(uchar*) (tmp_buff+OFFSET_TABLE_SIZE); + + if (my_read(file,disk_cache, + (uint) (share->pack.header_length-sizeof(header)), + MYF(MY_NABP))) + goto err2; +#ifdef HAVE_valgrind + /* Zero bytes accessed by fill_buffer */ + bzero(disk_cache + (share->pack.header_length-sizeof(header)), + share->base.extra_rec_buff_size); +#endif + + huff_tree_bits=max_bit(trees ? trees-1 : 0); + init_bit_buffer(&bit_buff, disk_cache, + (uint) (share->pack.header_length-sizeof(header))); + /* Read new info for each field */ + for (i=0 ; i < share->base.fields ; i++) + { + share->columndef[i].base_type=(enum en_fieldtype) get_bits(&bit_buff,5); + share->columndef[i].pack_type=(uint) get_bits(&bit_buff,6); + share->columndef[i].space_length_bits=get_bits(&bit_buff,5); + share->columndef[i].huff_tree=share->decode_trees+(uint) get_bits(&bit_buff, + huff_tree_bits); + share->columndef[i].unpack= get_unpack_function(share->columndef + i); + DBUG_PRINT("info", ("col: %2u type: %2u pack: %u slbits: %2u", + i, share->columndef[i].base_type, + share->columndef[i].pack_type, + share->columndef[i].space_length_bits)); + } + skip_to_next_byte(&bit_buff); + /* + Construct the decoding tables from the file header. Keep track of + the used memory. + */ + decode_table=share->decode_tables; + for (i=0 ; i < trees ; i++) + if (read_huff_table(&bit_buff,share->decode_trees+i,&decode_table, + &intervall_buff,tmp_buff)) + goto err3; + /* Reallocate the decoding tables to the used size. 
*/ + decode_table=(uint16*) + my_realloc((uchar*) share->decode_tables, + (uint) ((uchar*) decode_table - (uchar*) share->decode_tables), + MYF(MY_HOLD_ON_ERROR)); + /* Fix the table addresses in the tree heads. */ + { + my_ptrdiff_t diff= PTR_BYTE_DIFF(decode_table,share->decode_tables); + share->decode_tables=decode_table; + for (i=0 ; i < trees ; i++) + share->decode_trees[i].table=ADD_TO_PTR(share->decode_trees[i].table, + diff, uint16*); + } + + /* Fix record-ref-length for keys */ + if (fix_keys) + { + for (i=0 ; i < share->base.keys ; i++) + { + MARIA_KEYDEF *keyinfo= &share->keyinfo[i]; + keyinfo->keylength+= (uint16) diff_length; + keyinfo->minlength+= (uint16) diff_length; + keyinfo->maxlength+= (uint16) diff_length; + keyinfo->seg[keyinfo->flag & HA_FULLTEXT ? + FT_SEGS : keyinfo->keysegs].length= (uint16) rec_reflength; + } + if (share->ft2_keyinfo.seg) + { + MARIA_KEYDEF *ft2_keyinfo= &share->ft2_keyinfo; + ft2_keyinfo->keylength+= (uint16) diff_length; + ft2_keyinfo->minlength+= (uint16) diff_length; + ft2_keyinfo->maxlength+= (uint16) diff_length; + } + } + + if (bit_buff.error || bit_buff.pos < bit_buff.end) + goto err3; + + DBUG_RETURN(0); + +err3: + my_errno=HA_ERR_WRONG_IN_RECORD; +err2: + my_free(share->decode_tables, MYF(0)); +err1: + my_free(share->decode_trees, MYF(0)); +err0: + DBUG_RETURN(1); +} + + +/* + Read a huff-code-table from datafile. + + SYNOPSIS + read_huff_table() + bit_buff Bit buffer pointing at start of the + decoding table in the file header cache. + decode_tree Pointer to the decode tree head. + decode_table IN/OUT Address of a pointer to the next free space. + intervall_buff IN/OUT Address of a pointer to the next unused values. + tmp_buff Buffer for temporary extraction of a full + decoding table as read from bit_buff. + + RETURN + 0 OK. + 1 Error. 
+*/ +static uint read_huff_table(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree, + uint16 **decode_table, uchar **intervall_buff, + uint16 *tmp_buff) +{ + uint min_chr,elements,char_bits,offset_bits,size,intervall_length,table_bits, + next_free_offset; + uint16 *ptr,*end; + DBUG_ENTER("read_huff_table"); + + if (!get_bits(bit_buff,1)) + { + /* Byte value compression. */ + min_chr=get_bits(bit_buff,8); + elements=get_bits(bit_buff,9); + char_bits=get_bits(bit_buff,5); + offset_bits=get_bits(bit_buff,5); + intervall_length=0; + ptr=tmp_buff; + ptr=tmp_buff; + DBUG_PRINT("info", ("byte value compression")); + DBUG_PRINT("info", ("minimum uchar value: %u", min_chr)); + DBUG_PRINT("info", ("number of tree nodes: %u", elements)); + DBUG_PRINT("info", ("bits for values: %u", char_bits)); + DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits)); + if (elements > 256) + { + DBUG_PRINT("error", ("ERROR: illegal number of tree elements: %u", + elements)); + DBUG_RETURN(1); + } + } + else + { + /* Distinct column value compression. 
*/ + min_chr=0; + elements=get_bits(bit_buff,15); + intervall_length=get_bits(bit_buff,16); + char_bits=get_bits(bit_buff,5); + offset_bits=get_bits(bit_buff,5); + decode_tree->quick_table_bits=0; + ptr= *decode_table; + DBUG_PRINT("info", ("distinct column value compression")); + DBUG_PRINT("info", ("number of tree nodes: %u", elements)); + DBUG_PRINT("info", ("value buffer length: %u", intervall_length)); + DBUG_PRINT("info", ("bits for value index: %u", char_bits)); + DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits)); + } + size=elements*2-2; + DBUG_PRINT("info", ("tree size in uint16: %u", size)); + DBUG_PRINT("info", ("tree size in bytes: %u", + size * (uint) sizeof(uint16))); + + for (end=ptr+size ; ptr < end ; ptr++) + { + if (get_bit(bit_buff)) + { + *ptr= (uint16) get_bits(bit_buff,offset_bits); + if ((ptr + *ptr >= end) || !*ptr) + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + DBUG_RETURN(1); + } + } + else + *ptr= (uint16) (IS_CHAR + (get_bits(bit_buff,char_bits) + min_chr)); + } + skip_to_next_byte(bit_buff); + + decode_tree->table= *decode_table; + decode_tree->intervalls= *intervall_buff; + if (! intervall_length) + { + /* Byte value compression. ptr started from tmp_buff. */ + /* Find longest Huffman code from begin to end of tree in bits. */ + table_bits= find_longest_bitstream(tmp_buff, ptr); + if (table_bits >= OFFSET_TABLE_SIZE) + DBUG_RETURN(1); + if (table_bits > maria_quick_table_bits) + table_bits=maria_quick_table_bits; + DBUG_PRINT("info", ("table bits: %u", table_bits)); + + next_free_offset= (1 << table_bits); + make_quick_table(*decode_table,tmp_buff,&next_free_offset,0,table_bits, + table_bits); + (*decode_table)+= next_free_offset; + decode_tree->quick_table_bits=table_bits; + } + else + { + /* Distinct column value compression. ptr started from *decode_table */ + (*decode_table)=end; + /* + get_bits() moves some bytes to a cache buffer in advance. May need + to step back. 
+ */ + bit_buff->pos-= bit_buff->bits/8; + /* Copy the distinct column values from the buffer. */ + memcpy(*intervall_buff,bit_buff->pos,(size_t) intervall_length); + (*intervall_buff)+=intervall_length; + bit_buff->pos+=intervall_length; + bit_buff->bits=0; + } + DBUG_RETURN(0); +} + + +/* + Make a quick_table for faster decoding. + + SYNOPSIS + make_quick_table() + to_table Target quick_table and remaining decode table. + decode_table Source Huffman (sub-)tree within tmp_buff. + next_free_offset IN/OUT Next free offset from to_table. + Starts behind quick_table on the top-level. + value Huffman bits found so far. + bits Remaining bits to be collected. + max_bits Total number of bits to collect (table_bits). + + DESCRIPTION + + The quick table is an array of 16-bit values. There exists one value + for each possible code representable by max_bits (table_bits) bits. + In most cases table_bits is 9. So there are 512 16-bit values. + + If the high-order bit (16) is set (IS_CHAR) then the array slot for + this value is a valid Huffman code for a resulting uchar value. + + The low-order 8 bits (1..8) are the resulting uchar value. + + Bits 9..14 are the length of the Huffman code for this uchar value. + This means so many bits from the input stream were needed to + represent this uchar value. The remaining bits belong to later + Huffman codes. This also means that for every Huffman code shorter + than table_bits there are multiple entires in the array, which + differ just in the unused bits. + + If the high-order bit (16) is clear (0) then the remaining bits are + the position of the remaining Huffman decode tree segment behind the + quick table. + + RETURN + void +*/ + +static void make_quick_table(uint16 *to_table, uint16 *decode_table, + uint *next_free_offset, uint value, uint bits, + uint max_bits) +{ + DBUG_ENTER("make_quick_table"); + + /* + When down the table to the requested maximum, copy the rest of the + Huffman table. 
+ */ + if (!bits--) + { + /* + Remaining left Huffman tree segment starts behind quick table. + Remaining right Huffman tree segment starts behind left segment. + */ + to_table[value]= (uint16) *next_free_offset; + /* + Re-construct the remaining Huffman tree segment at + next_free_offset in to_table. + */ + *next_free_offset=copy_decode_table(to_table, *next_free_offset, + decode_table); + DBUG_VOID_RETURN; + } + + /* Descent on the left side. Left side bits are clear (0). */ + if (!(*decode_table & IS_CHAR)) + { + /* Not a leaf. Follow the pointer. */ + make_quick_table(to_table,decode_table+ *decode_table, + next_free_offset,value,bits,max_bits); + } + else + { + /* + A leaf. A Huffman code is complete. Fill the quick_table + array for all possible bit strings starting with this Huffman + code. + */ + fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table); + } + + /* Descent on the right side. Right side bits are set (1). */ + decode_table++; + value|= (1 << bits); + if (!(*decode_table & IS_CHAR)) + { + /* Not a leaf. Follow the pointer. */ + make_quick_table(to_table,decode_table+ *decode_table, + next_free_offset,value,bits,max_bits); + } + else + { + /* + A leaf. A Huffman code is complete. Fill the quick_table + array for all possible bit strings starting with this Huffman + code. + */ + fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table); + } + + DBUG_VOID_RETURN; +} + + +/* + Fill quick_table for all possible values starting with this Huffman code. + + SYNOPSIS + fill_quick_table() + table Target quick_table position. + bits Unused bits from max_bits. + max_bits Total number of bits to collect (table_bits). + value The uchar encoded by the found Huffman code. + + DESCRIPTION + + Fill the segment (all slots) of the quick_table array with the + resulting value for the found Huffman code. There are as many slots + as there are combinations representable by the unused bits. + + In most cases we use 9 table bits. 
Assume a 3-bit Huffman code. Then + there are 6 unused bits. Hence we fill 2**6 = 64 slots with the + value. + + RETURN + void +*/ + +static void fill_quick_table(uint16 *table, uint bits, uint max_bits, + uint value) +{ + uint16 *end; + DBUG_ENTER("fill_quick_table"); + + /* + Bits 1..8 of value represent the decoded uchar value. + Bits 9..14 become the length of the Huffman code for this uchar value. + Bit 16 flags a valid code (IS_CHAR). + */ + value|= (max_bits - bits) << 8 | IS_CHAR; + + for (end= table + ((my_ptrdiff_t) 1 << bits); table < end; table++) + { + *table= (uint16) value; + } + DBUG_VOID_RETURN; +} + + +/* + Reconstruct a decode subtree at the target position. + + SYNOPSIS + copy_decode_table() + to_pos Target quick_table and remaining decode table. + offset Next free offset from to_pos. + decode_table Source Huffman subtree within tmp_buff. + + NOTE + Pointers in the decode tree are relative to the pointers position. + + RETURN + next free offset from to_pos. +*/ + +static uint copy_decode_table(uint16 *to_pos, uint offset, + uint16 *decode_table) +{ + uint prev_offset= offset; + DBUG_ENTER("copy_decode_table"); + + /* Descent on the left side. */ + if (!(*decode_table & IS_CHAR)) + { + /* Set a pointer to the next target node. */ + to_pos[offset]=2; + /* Copy the left hand subtree there. */ + offset=copy_decode_table(to_pos,offset+2,decode_table+ *decode_table); + } + else + { + /* Copy the uchar value. */ + to_pos[offset]= *decode_table; + /* Step behind this node. */ + offset+=2; + } + + /* Descent on the right side. */ + decode_table++; + if (!(*decode_table & IS_CHAR)) + { + /* Set a pointer to the next free target node. */ + to_pos[prev_offset+1]=(uint16) (offset-prev_offset-1); + /* Copy the right hand subtree to the entry of that node. */ + offset=copy_decode_table(to_pos,offset,decode_table+ *decode_table); + } + else + { + /* Copy the uchar value. 
*/ + to_pos[prev_offset+1]= *decode_table; + } + DBUG_RETURN(offset); +} + + +/* + Find the length of the longest Huffman code in this table in bits. + + SYNOPSIS + find_longest_bitstream() + table Code (sub-)table start. + end End of code table. + + IMPLEMENTATION + + Recursively follow the branch(es) of the code pair on every level of + the tree until two uchar values (and no branch) are found. Add one to + each level when returning back from each recursion stage. + + 'end' is used for error checking only. A clean tree terminates + before reaching 'end'. Hence the exact value of 'end' is not too + important. However having it higher than necessary could lead to + misbehaviour should 'next' jump into the dirty area. + + RETURN + length Length of longest Huffman code in bits. + >= OFFSET_TABLE_SIZE Error, broken tree. It does not end before 'end'. +*/ + +static uint find_longest_bitstream(uint16 *table, uint16 *end) +{ + uint length=1; + uint length2; + if (!(*table & IS_CHAR)) + { + uint16 *next= table + *table; + if (next > end || next == table) + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + return OFFSET_TABLE_SIZE; + } + length=find_longest_bitstream(next, end)+1; + } + table++; + if (!(*table & IS_CHAR)) + { + uint16 *next= table + *table; + if (next > end || next == table) + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + return OFFSET_TABLE_SIZE; + } + length2= find_longest_bitstream(next, end) + 1; + length=max(length,length2); + } + return length; +} + + +/* + Read record from datafile. + + SYNOPSIS + _ma_read_pack_record() + info A pointer to MARIA_HA. + filepos File offset of the record. + buf RETURN The buffer to receive the record. 
+ + RETURN + 0 On success + # Error number +*/ + +int _ma_read_pack_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos) +{ + MARIA_BLOCK_INFO block_info; + File file; + DBUG_ENTER("maria_read_pack_record"); + + if (filepos == HA_OFFSET_ERROR) + DBUG_RETURN(my_errno); /* _search() didn't find record */ + + file= info->dfile.file; + if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info, + &info->rec_buff, &info->rec_buff_size, file, + filepos)) + goto err; + if (my_read(file, info->rec_buff + block_info.offset , + block_info.rec_len - block_info.offset, MYF(MY_NABP))) + goto panic; + info->update|= HA_STATE_AKTIV; + DBUG_RETURN(_ma_pack_rec_unpack(info,&info->bit_buff, buf, + info->rec_buff, block_info.rec_len)); +panic: + my_errno=HA_ERR_WRONG_IN_RECORD; +err: + DBUG_RETURN(my_errno); +} + + + +int _ma_pack_rec_unpack(register MARIA_HA *info, MARIA_BIT_BUFF *bit_buff, + register uchar *to, uchar *from, ulong reclength) +{ + uchar *end_field; + reg3 MARIA_COLUMNDEF *end; + MARIA_COLUMNDEF *current_field; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_pack_rec_unpack"); + + if (info->s->base.null_bytes) + { + memcpy(to, from, info->s->base.null_bytes); + to+= info->s->base.null_bytes; + from+= info->s->base.null_bytes; + reclength-= info->s->base.null_bytes; + } + init_bit_buffer(bit_buff, from, reclength); + for (current_field=share->columndef, end=current_field+share->base.fields ; + current_field < end ; + current_field++,to=end_field) + { + end_field=to+current_field->length; + (*current_field->unpack)(current_field, bit_buff, to, end_field); + } + if (!bit_buff->error && + bit_buff->pos - bit_buff->bits / 8 == bit_buff->end) + DBUG_RETURN(0); + info->update&= ~HA_STATE_AKTIV; + DBUG_RETURN(my_errno=HA_ERR_WRONG_IN_RECORD); +} /* _ma_pack_rec_unpack */ + + + /* Return function to unpack field */ + +static void (*get_unpack_function(MARIA_COLUMNDEF *rec)) + (MARIA_COLUMNDEF *, MARIA_BIT_BUFF *, uchar *, uchar *) +{ + switch (rec->base_type) { + 
case FIELD_SKIP_ZERO: + if (rec->pack_type & PACK_TYPE_ZERO_FILL) + return &uf_zerofill_skip_zero; + return &uf_skip_zero; + case FIELD_NORMAL: + if (rec->pack_type & PACK_TYPE_SPACE_FIELDS) + return &uf_space_normal; + if (rec->pack_type & PACK_TYPE_ZERO_FILL) + return &uf_zerofill_normal; + return &decode_bytes; + case FIELD_SKIP_ENDSPACE: + if (rec->pack_type & PACK_TYPE_SPACE_FIELDS) + { + if (rec->pack_type & PACK_TYPE_SELECTED) + return &uf_space_endspace_selected; + return &uf_space_endspace; + } + if (rec->pack_type & PACK_TYPE_SELECTED) + return &uf_endspace_selected; + return &uf_endspace; + case FIELD_SKIP_PRESPACE: + if (rec->pack_type & PACK_TYPE_SPACE_FIELDS) + { + if (rec->pack_type & PACK_TYPE_SELECTED) + return &uf_space_prespace_selected; + return &uf_space_prespace; + } + if (rec->pack_type & PACK_TYPE_SELECTED) + return &uf_prespace_selected; + return &uf_prespace; + case FIELD_CONSTANT: + return &uf_constant; + case FIELD_INTERVALL: + return &uf_intervall; + case FIELD_ZERO: + case FIELD_CHECK: + return &uf_zero; + case FIELD_BLOB: + return &uf_blob; + case FIELD_VARCHAR: + if (rec->length <= 256) /* 255 + 1 uchar length */ + return &uf_varchar1; + return &uf_varchar2; + case FIELD_LAST: + default: + return 0; /* This should never happend */ + } +} + + /* The different functions to unpack a field */ + +static void uf_zerofill_skip_zero(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + if (get_bit(bit_buff)) + bzero((char*) to,(uint) (end-to)); + else + { + end-=rec->space_length_bits; + decode_bytes(rec,bit_buff,to,end); + bzero((char*) end,rec->space_length_bits); + } +} + +static void uf_skip_zero(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + if (get_bit(bit_buff)) + bzero((char*) to,(uint) (end-to)); + else + decode_bytes(rec,bit_buff,to,end); +} + +static void uf_space_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + if (get_bit(bit_buff)) + 
bfill(to, (end-to), ' '); + else + decode_bytes(rec,bit_buff,to,end); +} + +static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill(to, (end-to), ' '); + else + { + if (get_bit(bit_buff)) + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill(end - spaces, spaces, ' '); + } + else + decode_bytes(rec,bit_buff,to,end); + } +} + +static void uf_endspace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill(end - spaces, spaces, ' '); + } + else + decode_bytes(rec,bit_buff,to,end); +} + +static void uf_space_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill(to, (end-to), ' '); + else + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill(end - spaces, spaces, ' '); + } +} + +static void uf_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill(end - spaces, spaces, ' '); +} + +static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill(to, (end-to), ' '); + else + { + if (get_bit(bit_buff)) + { + if 
((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill(to, spaces, ' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); + } + else + decode_bytes(rec,bit_buff,to,end); + } +} + + +static void uf_prespace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill(to, spaces, ' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); + } + else + decode_bytes(rec,bit_buff,to,end); +} + + +static void uf_space_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill(to, (end-to), ' '); + else + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill(to, spaces, ' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); + } +} + +static void uf_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill(to, spaces, ' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); +} + +static void uf_zerofill_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + end-=rec->space_length_bits; + decode_bytes(rec,bit_buff, to, end); + bzero((char*) end,rec->space_length_bits); +} + +static void uf_constant(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff __attribute__((unused)), + uchar *to, uchar *end) +{ + memcpy(to,rec->huff_tree->intervalls,(size_t) (end-to)); +} + +static void uf_intervall(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, + uchar *end) +{ + reg1 uint field_length=(uint) (end-to); + 
memcpy(to,rec->huff_tree->intervalls+field_length*decode_pos(bit_buff, + rec->huff_tree), + (size_t) field_length); +} + + +/*ARGSUSED*/ +static void uf_zero(MARIA_COLUMNDEF *rec __attribute__((unused)), + MARIA_BIT_BUFF *bit_buff __attribute__((unused)), + uchar *to, uchar *end) +{ + bzero(to, (uint) (end-to)); +} + +static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + if (get_bit(bit_buff)) + bzero(to, (uint) (end-to)); + else + { + ulong length=get_bits(bit_buff,rec->space_length_bits); + uint pack_length=(uint) (end-to)-portable_sizeof_char_ptr; + if (bit_buff->blob_pos+length > bit_buff->blob_end) + { + bit_buff->error=1; + bzero(to, (end-to)); + return; + } + decode_bytes(rec, bit_buff, bit_buff->blob_pos, + bit_buff->blob_pos + length); + _ma_store_blob_length(to, pack_length, length); + memcpy_fixed((uchar*) to+pack_length,(uchar*) &bit_buff->blob_pos, + sizeof(uchar*)); + bit_buff->blob_pos+=length; + } +} + + +static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end __attribute__((unused))) +{ + if (get_bit(bit_buff)) + to[0]= 0; /* Zero lengths */ + else + { + ulong length=get_bits(bit_buff,rec->space_length_bits); + *to= (char) length; + decode_bytes(rec,bit_buff,to+1,to+1+length); + } +} + + +static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end __attribute__((unused))) +{ + if (get_bit(bit_buff)) + to[0]=to[1]=0; /* Zero lengths */ + else + { + ulong length=get_bits(bit_buff,rec->space_length_bits); + int2store(to,length); + decode_bytes(rec,bit_buff,to+2,to+2+length); + } +} + + /* Functions to decode of buffer of bits */ + +#if BITS_SAVED == 64 + +static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + reg1 uint bits,low_byte; + reg3 uint16 *pos; + reg4 uint table_bits,table_and; + MARIA_DECODE_TREE *decode_tree; + + decode_tree=rec->decode_tree; + bits=bit_buff->bits; /* Save in 
reg for quicker access */ + table_bits=decode_tree->quick_table_bits; + table_and= (1 << table_bits)-1; + + do + { + if (bits <= 32) + { + if (bit_buff->pos > bit_buff->end+4) + { + bit_buff->error=1; + return; /* Can't be right */ + } + bit_buff->current_byte= (bit_buff->current_byte << 32) + + ((((uint) bit_buff->pos[3])) + + (((uint) bit_buff->pos[2]) << 8) + + (((uint) bit_buff->pos[1]) << 16) + + (((uint) bit_buff->pos[0]) << 24)); + bit_buff->pos+=4; + bits+=32; + } + /* + First use info in quick_table. + + The quick table is an array of 16-bit values. There exists one + value for each possible code representable by table_bits bits. + In most cases table_bits is 9. So there are 512 16-bit values. + + If the high-order bit (16) is set (IS_CHAR) then the array slot + for this value is a valid Huffman code for a resulting uchar value. + + The low-order 8 bits (1..8) are the resulting uchar value. + + Bits 9..14 are the length of the Huffman code for this uchar value. + This means so many bits from the input stream were needed to + represent this uchar value. The remaining bits belong to later + Huffman codes. This also means that for every Huffman code shorter + than table_bits there are multiple entires in the array, which + differ just in the unused bits. + + If the high-order bit (16) is clear (0) then the remaining bits are + the position of the remaining Huffman decode tree segment behind the + quick table. + */ + low_byte=(uint) (bit_buff->current_byte >> (bits - table_bits)) & table_and; + low_byte=decode_tree->table[low_byte]; + if (low_byte & IS_CHAR) + { + /* + All Huffman codes of less or equal table_bits length are in the + quick table. This is one of them. + */ + *to++ = (char) (low_byte & 255); /* Found char in quick table */ + bits-= ((low_byte >> 8) & 31); /* Remove bits used */ + } + else + { /* Map through rest of decode-table */ + /* This means that the Huffman code must be longer than table_bits. 
*/ + pos=decode_tree->table+low_byte; + bits-=table_bits; + /* NOTE: decode_bytes_test_bit() is a macro wich contains a break !!! */ + for (;;) + { + low_byte=(uint) (bit_buff->current_byte >> (bits-8)); + decode_bytes_test_bit(0); + decode_bytes_test_bit(1); + decode_bytes_test_bit(2); + decode_bytes_test_bit(3); + decode_bytes_test_bit(4); + decode_bytes_test_bit(5); + decode_bytes_test_bit(6); + decode_bytes_test_bit(7); + bits-=8; + } + *to++ = (char) *pos; + } + } while (to != end); + + bit_buff->bits=bits; + return; +} + +#else + +static void decode_bytes(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + reg1 uint bits,low_byte; + reg3 uint16 *pos; + reg4 uint table_bits,table_and; + MARIA_DECODE_TREE *decode_tree; + + decode_tree=rec->huff_tree; + bits=bit_buff->bits; /* Save in reg for quicker access */ + table_bits=decode_tree->quick_table_bits; + table_and= (1 << table_bits)-1; + + do + { + if (bits < table_bits) + { + if (bit_buff->pos > bit_buff->end+1) + { + bit_buff->error=1; + return; /* Can't be right */ + } +#if BITS_SAVED == 32 + bit_buff->current_byte= (bit_buff->current_byte << 24) + + (((uint) ((uchar) bit_buff->pos[2]))) + + (((uint) ((uchar) bit_buff->pos[1])) << 8) + + (((uint) ((uchar) bit_buff->pos[0])) << 16); + bit_buff->pos+=3; + bits+=24; +#else + if (bits) /* We must have at leasts 9 bits */ + { + bit_buff->current_byte= (bit_buff->current_byte << 8) + + (uint) ((uchar) bit_buff->pos[0]); + bit_buff->pos++; + bits+=8; + } + else + { + bit_buff->current_byte= ((uint) ((uchar) bit_buff->pos[0]) << 8) + + ((uint) ((uchar) bit_buff->pos[1])); + bit_buff->pos+=2; + bits+=16; + } +#endif + } + /* First use info in quick_table */ + low_byte=(bit_buff->current_byte >> (bits - table_bits)) & table_and; + low_byte=decode_tree->table[low_byte]; + if (low_byte & IS_CHAR) + { + *to++ = (low_byte & 255); /* Found char in quick table */ + bits-= ((low_byte >> 8) & 31); /* Remove bits used */ + } + else + { /* Map through 
rest of decode-table */ + pos=decode_tree->table+low_byte; + bits-=table_bits; + for (;;) + { + if (bits < 8) + { /* We don't need to check end */ +#if BITS_SAVED == 32 + bit_buff->current_byte= (bit_buff->current_byte << 24) + + (((uint) ((uchar) bit_buff->pos[2]))) + + (((uint) ((uchar) bit_buff->pos[1])) << 8) + + (((uint) ((uchar) bit_buff->pos[0])) << 16); + bit_buff->pos+=3; + bits+=24; +#else + bit_buff->current_byte= (bit_buff->current_byte << 8) + + (uint) ((uchar) bit_buff->pos[0]); + bit_buff->pos+=1; + bits+=8; +#endif + } + low_byte=(uint) (bit_buff->current_byte >> (bits-8)); + decode_bytes_test_bit(0); + decode_bytes_test_bit(1); + decode_bytes_test_bit(2); + decode_bytes_test_bit(3); + decode_bytes_test_bit(4); + decode_bytes_test_bit(5); + decode_bytes_test_bit(6); + decode_bytes_test_bit(7); + bits-=8; + } + *to++ = (char) *pos; + } + } while (to != end); + + bit_buff->bits=bits; + return; +} +#endif /* BIT_SAVED == 64 */ + + +static uint decode_pos(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree) +{ + uint16 *pos=decode_tree->table; + for (;;) + { + if (get_bit(bit_buff)) + pos++; + if (*pos & IS_CHAR) + return (uint) (*pos & ~IS_CHAR); + pos+= *pos; + } +} + + +int _ma_read_rnd_pack_record(MARIA_HA *info, + uchar *buf, + register MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks) +{ + File file; + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_read_rnd_pack_record"); + + if (filepos >= info->state->data_file_length) + { + my_errno= HA_ERR_END_OF_FILE; + goto err; + } + + file= info->dfile.file; + if (info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(&info->rec_cache, block_info.header, + filepos, share->pack.ref_length, + skip_deleted_blocks ? 
READING_NEXT : 0)) + goto err; + file= -1; + } + if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info, + &info->rec_buff, &info->rec_buff_size, + file, filepos)) + goto err; /* Error code is already set */ +#ifndef DBUG_OFF + if (block_info.rec_len > share->max_pack_length) + { + my_errno=HA_ERR_WRONG_IN_RECORD; + goto err; + } +#endif + + if (info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(&info->rec_cache, info->rec_buff, + block_info.filepos, block_info.rec_len, + skip_deleted_blocks ? READING_NEXT : 0)) + goto err; + } + else + { + if (my_read(info->dfile.file, info->rec_buff + block_info.offset, + block_info.rec_len-block_info.offset, + MYF(MY_NABP))) + goto err; + } + info->packed_length= block_info.rec_len; + info->cur_row.lastpos= filepos; + info->cur_row.nextpos= block_info.filepos+block_info.rec_len; + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + + DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf, + info->rec_buff, block_info.rec_len)); + err: + DBUG_RETURN(my_errno); +} + + + /* Read and process header from a huff-record-file */ + +uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO *info, + uchar **rec_buff_p, size_t *rec_buff_size_p, + File file, my_off_t filepos) +{ + uchar *header= info->header; + uint head_length,ref_length; + LINT_INIT(ref_length); + + if (file >= 0) + { + ref_length=maria->s->pack.ref_length; + /* + We can't use my_pread() here because _ma_read_rnd_pack_record assumes + position is ok + */ + VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0))); + if (my_read(file, header,ref_length,MYF(MY_NABP))) + return BLOCK_FATAL_ERROR; + DBUG_DUMP("header", header, ref_length); + } + head_length= read_pack_length((uint) maria->s->pack.version, header, + &info->rec_len); + if (maria->s->base.blobs) + { + head_length+= read_pack_length((uint) maria->s->pack.version, + header + head_length, &info->blob_len); + /* + Ensure that the record buffer is big enough for the 
compressed + record plus all expanded blobs. [We do not have an extra buffer + for the resulting blobs. Sigh.] + */ + if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p, + info->rec_len + info->blob_len + + maria->s->base.extra_rec_buff_size)) + return BLOCK_FATAL_ERROR; /* not enough memory */ + bit_buff->blob_pos= *rec_buff_p + info->rec_len; + bit_buff->blob_end= bit_buff->blob_pos + info->blob_len; + maria->blob_length=info->blob_len; + } + info->filepos=filepos+head_length; + if (file > 0) + { + info->offset=min(info->rec_len, ref_length - head_length); + memcpy(*rec_buff_p, header + head_length, info->offset); + } + return 0; +} + + + /* rutines for bit buffer */ + /* Note buffer must be 6 uchar bigger than longest row */ + +static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff, uchar *buffer, + uint length) +{ + bit_buff->pos=buffer; + bit_buff->end=buffer+length; + bit_buff->bits=bit_buff->error=0; + bit_buff->current_byte=0; /* Avoid purify errors */ +} + +static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff, uint count) +{ + uint tmp; + count-=bit_buff->bits; + tmp=(bit_buff->current_byte & mask[bit_buff->bits]) << count; + fill_buffer(bit_buff); + bit_buff->bits=BITS_SAVED - count; + return tmp+(bit_buff->current_byte >> (BITS_SAVED - count)); +} + + /* Fill in empty bit_buff->current_byte from buffer */ + /* Sets bit_buff->error if buffer is exhausted */ + +static void fill_buffer(MARIA_BIT_BUFF *bit_buff) +{ + if (bit_buff->pos >= bit_buff->end) + { + bit_buff->error= 1; + bit_buff->current_byte=0; + return; + } +#if BITS_SAVED == 64 + bit_buff->current_byte= ((((uint) ((uchar) bit_buff->pos[7]))) + + (((uint) ((uchar) bit_buff->pos[6])) << 8) + + (((uint) ((uchar) bit_buff->pos[5])) << 16) + + (((uint) ((uchar) bit_buff->pos[4])) << 24) + + ((ulonglong) + ((((uint) ((uchar) bit_buff->pos[3]))) + + (((uint) ((uchar) bit_buff->pos[2])) << 8) + + (((uint) ((uchar) bit_buff->pos[1])) << 16) + + (((uint) ((uchar) bit_buff->pos[0])) << 24)) << 32)); + 
bit_buff->pos+=8; +#else +#if BITS_SAVED == 32 + bit_buff->current_byte= (((uint) ((uchar) bit_buff->pos[3])) + + (((uint) ((uchar) bit_buff->pos[2])) << 8) + + (((uint) ((uchar) bit_buff->pos[1])) << 16) + + (((uint) ((uchar) bit_buff->pos[0])) << 24)); + bit_buff->pos+=4; +#else + bit_buff->current_byte= (uint) (((uint) ((uchar) bit_buff->pos[1]))+ + (((uint) ((uchar) bit_buff->pos[0])) << 8)); + bit_buff->pos+=2; +#endif +#endif +} + + /* Get number of bits neaded to represent value */ + +static uint max_bit(register uint value) +{ + reg2 uint power=1; + + while ((value>>=1)) + power++; + return (power); +} + + +/***************************************************************************** + Some redefined functions to handle files when we are using memmap +*****************************************************************************/ +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif + +#ifdef HAVE_MMAP + +static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos); +static int _ma_read_rnd_mempack_record(MARIA_HA*, uchar *, MARIA_RECORD_POS, + my_bool); + +my_bool _ma_memmap_file(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_memmap_file"); + + if (!info->s->file_map) + { + if (my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)) < + share->state.state.data_file_length+MEMMAP_EXTRA_MARGIN) + { + DBUG_PRINT("warning",("File isn't extended for memmap")); + DBUG_RETURN(0); + } + if (_ma_dynmap_file(info, share->state.state.data_file_length)) + DBUG_RETURN(0); + } + info->opt_flag|= MEMMAP_USED; + info->read_record= share->read_record= _ma_read_mempack_record; + share->scan= _ma_read_rnd_mempack_record; + DBUG_RETURN(1); +} + + +void _ma_unmap_file(MARIA_HA *info) +{ + VOID(my_munmap((char*) info->s->file_map, + (size_t) info->s->mmaped_length + MEMMAP_EXTRA_MARGIN)); +} + + +static uchar * +_ma_mempack_get_block_info(MARIA_HA *maria, + MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO *info, + uchar **rec_buff_p, 
+                           size_t *rec_buff_size_p,
+                           uchar *header)
+{
+  header+= read_pack_length((uint) maria->s->pack.version, header,
+                            &info->rec_len);
+  if (maria->s->base.blobs)
+  {
+    header+= read_pack_length((uint) maria->s->pack.version, header,
+                              &info->blob_len);
+    /*
+      NOTE(review): both callers return my_errno when this function returns
+      0, so _ma_alloc_buffer() is presumably expected to set my_errno on
+      failure - confirm.
+    */
+    if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p,
+                         info->blob_len + maria->s->base.extra_rec_buff_size))
+      return 0;                         /* not enough memory */
+    bit_buff->blob_pos= *rec_buff_p;
+    bit_buff->blob_end= *rec_buff_p + info->blob_len;
+  }
+  return header;                        /* First byte after the length header */
+}
+
+/*
+  Read the packed record stored at offset filepos of the mapped data file
+  (share->file_map) and unpack it into buf.
+
+  Returns my_errno if filepos is HA_OFFSET_ERROR or if the block header
+  could not be processed, otherwise the result of _ma_pack_rec_unpack().
+*/
+static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf,
+                                   MARIA_RECORD_POS filepos)
+{
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SHARE *share= info->s;
+  uchar *pos;
+  DBUG_ENTER("maria_read_mempack_record");
+
+  if (filepos == HA_OFFSET_ERROR)
+    DBUG_RETURN(my_errno);              /* _search() didn't find record */
+
+  if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff,
+                                                 &block_info, &info->rec_buff,
+                                                 &info->rec_buff_size,
+                                                 (uchar*) share->file_map+
+                                                 filepos)))
+    DBUG_RETURN(my_errno);
+  DBUG_RETURN(_ma_pack_rec_unpack(info, &info->bit_buff, buf,
+                                  pos, block_info.rec_len));
+}
+
+/*
+  Scan variant of _ma_read_mempack_record().
+
+  Fails with HA_ERR_END_OF_FILE when filepos is at or beyond
+  share->state.state.data_file_length.  On success the position of this row
+  and of the following row are stored in info->cur_row before the record is
+  unpacked into buf.  skip_deleted_blocks is not used.
+*/
+/*ARGSUSED*/
+static int _ma_read_rnd_mempack_record(MARIA_HA *info,
+                                       uchar *buf,
+                                       register MARIA_RECORD_POS filepos,
+                                       my_bool skip_deleted_blocks
+                                       __attribute__((unused)))
+{
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SHARE *share= info->s;
+  uchar *pos,*start;
+  DBUG_ENTER("_ma_read_rnd_mempack_record");
+
+  if (filepos >= share->state.state.data_file_length)
+  {
+    my_errno=HA_ERR_END_OF_FILE;
+    goto err;
+  }
+  if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff,
+                                                 &block_info,
+                                                 &info->rec_buff,
+                                                 &info->rec_buff_size,
+                                                 (uchar*)
+                                                 (start= share->file_map +
+                                                  filepos))))
+    goto err;
+#ifndef DBUG_OFF
+  if (block_info.rec_len > info->s->max_pack_length)
+  {
+    my_errno=HA_ERR_WRONG_IN_RECORD;
+    goto err;
+  }
+#endif
+  info->packed_length=block_info.rec_len;
+  info->cur_row.lastpos= filepos;
+ 
info->cur_row.nextpos= filepos+(uint) (pos-start)+block_info.rec_len; + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + + DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf, + pos, block_info.rec_len)); + err: + DBUG_RETURN(my_errno); +} + +#endif /* HAVE_MMAP */ + + /* Save length of row */ + +uint _ma_save_pack_length(uint version, uchar *block_buff, ulong length) +{ + if (length < 254) + { + *(uchar*) block_buff= (uchar) length; + return 1; + } + if (length <= 65535) + { + *(uchar*) block_buff=254; + int2store(block_buff+1,(uint) length); + return 3; + } + *(uchar*) block_buff=255; + if (version == 1) /* old format */ + { + DBUG_ASSERT(length <= 0xFFFFFF); + int3store(block_buff + 1, (ulong) length); + return 4; + } + else + { + int4store(block_buff + 1, (ulong) length); + return 5; + } +} + + +static uint read_pack_length(uint version, const uchar *buf, ulong *length) +{ + if (buf[0] < 254) + { + *length= buf[0]; + return 1; + } + else if (buf[0] == 254) + { + *length= uint2korr(buf + 1); + return 3; + } + if (version == 1) /* old format */ + { + *length= uint3korr(buf + 1); + return 4; + } + else + { + *length= uint4korr(buf + 1); + return 5; + } +} + + +uint _ma_calc_pack_length(uint version, ulong length) +{ + return (length < 254) ? 1 : (length < 65536) ? 3 : (version == 1) ? 4 : 5; +} diff --git a/storage/maria/ma_page.c b/storage/maria/ma_page.c new file mode 100644 index 00000000000..a4423133270 --- /dev/null +++ b/storage/maria/ma_page.c @@ -0,0 +1,619 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Read and write key blocks + + The basic structure of a key block is as follows: + + LSN 7 (LSN_STORE_SIZE); Log number for last change; + Only for transactional pages + PACK_TRANSID 6 (TRANSID_SIZE); Relative transid to pack page transid's + Only for transactional pages + KEYNR 1 (KEYPAGE_KEYID_SIZE) Which index this page belongs to + FLAG 1 (KEYPAGE_FLAG_SIZE) Flags for page + PAGE_SIZE 2 (KEYPAGE_USED_SIZE) How much of the page is used. + high-byte-first + + The flag is a combination of the following values: + + KEYPAGE_FLAG_ISNOD Page is a node + KEYPAGE_FLAG_HAS_TRANSID There may be a transid on the page. + + After this we store key data, either packed or not packed, directly + after each other. If the page is a node flag, there is a pointer to + the next key page at page start and after each key. + + At end of page the last KEYPAGE_CHECKSUM_SIZE bytes are reserved for a + page checksum. +*/ + +#include "maria_def.h" +#include "trnman.h" +#include "ma_key_recover.h" + +/** + Fill MARIA_PAGE structure for usage with _ma_write_keypage +*/ + +void _ma_page_setup(MARIA_PAGE *page, MARIA_HA *info, + const MARIA_KEYDEF *keyinfo, my_off_t pos, + uchar *buff) +{ + MARIA_SHARE *share= info->s; + + page->info= info; + page->keyinfo= keyinfo; + page->buff= buff; + page->pos= pos; + page->size= _ma_get_page_used(share, buff); + page->org_size= page->size; + page->flag= _ma_get_keypage_flag(share, buff); + page->node= ((page->flag & KEYPAGE_FLAG_ISNOD) ? 
+               share->base.key_reflength : 0);
+}
+
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+/* Zero the page buffer from page->size up to share->block_size */
+void page_cleanup(MARIA_SHARE *share, MARIA_PAGE *page)
+{
+  uint length= page->size;
+  DBUG_ASSERT(length <= share->max_index_block_size);
+  bzero(page->buff + length, share->block_size - length);
+}
+#endif
+
+
+/**
+  Fetch a key-page in memory
+
+  @fn _ma_fetch_keypage()
+  @param page           Fill this struct with information about read page
+  @param info           Maria handler
+  @param keyinfo        Key definition for used key
+  @param pos            Position for page (in bytes)
+  @param lock           Lock type for page
+  @param level          Importance of page; Priority for page cache
+  @param buff           Buffer to use for page
+  @param return_buffer  Set to 1 if we want to force usage of buff
+                        (currently unused)
+
+  @note
+    Unless lock is PAGECACHE_LOCK_LEFT_UNLOCKED, the page is registered in
+    info->pinned_pages with the matching unlock type, and page->link_offset
+    is set to the index of that entry.
+
+  @return
+  @retval 0  ok
+  @retval 1  error; my_errno is set to HA_ERR_CRASHED
+*/
+
+my_bool _ma_fetch_keypage(MARIA_PAGE *page, MARIA_HA *info,
+                          const MARIA_KEYDEF *keyinfo,
+                          my_off_t pos, enum pagecache_page_lock lock,
+                          int level, uchar *buff,
+                          my_bool return_buffer __attribute__ ((unused)))
+{
+  uchar *tmp;
+  MARIA_PINNED_PAGE page_link;
+  MARIA_SHARE *share= info->s;
+  uint block_size= share->block_size;
+  DBUG_ENTER("_ma_fetch_keypage");
+  DBUG_PRINT("enter",("page: %lu", (ulong) (pos / block_size)));
+
+  tmp= pagecache_read(share->pagecache, &share->kfile,
+                      (pgcache_page_no_t) (pos / block_size), level, buff,
+                      share->page_type, lock, &page_link.link);
+
+  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
+  {
+    /* Both sides must compare 'lock'; a bare constant is always true */
+    DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE || lock == PAGECACHE_LOCK_READ);
+    page_link.unlock= (lock == PAGECACHE_LOCK_WRITE ?
+                       PAGECACHE_LOCK_WRITE_UNLOCK :
+                       PAGECACHE_LOCK_READ_UNLOCK);
+    page_link.changed= 0;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+    page->link_offset= info->pinned_pages.elements-1;
+  }
+
+  if (tmp == info->buff)
+    info->keyread_buff_used=1;
+  else if (!tmp)
+  {
+    DBUG_PRINT("error",("Got errno: %d from pagecache_read",my_errno));
+    info->last_keypage=HA_OFFSET_ERROR;
+    maria_print_error(share, HA_ERR_CRASHED);
+    my_errno=HA_ERR_CRASHED;
+    DBUG_RETURN(1);
+  }
+  info->last_keypage= pos;
+
+  /*
+    Setup page structure to make pages easy to use
+    This is the same as _ma_page_setup(), but inlined here as this is used
+    so often.
+  */
+  page->info=    info;
+  page->keyinfo= keyinfo;
+  page->buff=    tmp;
+  page->pos=     pos;
+  page->size=    _ma_get_page_used(share, tmp);
+  page->org_size= page->size;                    /* For debugging */
+  page->flag=    _ma_get_keypage_flag(share, tmp);
+  page->node=   ((page->flag & KEYPAGE_FLAG_ISNOD) ?
+                 share->base.key_reflength : 0);
+
+#ifdef EXTRA_DEBUG
+  {
+    uint page_size= page->size;
+    if (page_size < 4 || page_size > share->max_index_block_size ||
+        _ma_get_keynr(share, tmp) != keyinfo->key_nr)
+    {
+      DBUG_PRINT("error",("page %lu had wrong page length: %u  keynr: %u",
+                          (ulong) (pos / block_size), page_size,
+                          _ma_get_keynr(share, tmp)));
+      DBUG_DUMP("page", tmp, page_size);
+      info->last_keypage = HA_OFFSET_ERROR;
+      maria_print_error(share, HA_ERR_CRASHED);
+      my_errno= HA_ERR_CRASHED;
+      DBUG_RETURN(1);
+    }
+  }
+#endif
+  DBUG_RETURN(0);
+} /* _ma_fetch_keypage */
+
+
+/* Write a key-page on disk */
+
+my_bool _ma_write_keypage(MARIA_PAGE *page, enum pagecache_page_lock lock,
+                          int level)
+{
+  MARIA_SHARE *share= page->info->s;
+  uint block_size= share->block_size;
+  uchar *buff= page->buff;
+  my_bool res;
+  MARIA_PINNED_PAGE page_link;
+  DBUG_ENTER("_ma_write_keypage");
+
+  /*
+    The following ensures that for transactional tables we have logged
+    all changes that changes the page size (as the logging code sets
+    page->org_size)
+  */
+ 
DBUG_ASSERT(!share->now_transactional || page->size == page->org_size); + +#ifdef EXTRA_DEBUG /* Safety check */ + { + uint page_length, nod_flag; + page_length= _ma_get_page_used(share, buff); + nod_flag= _ma_test_if_nod(share, buff); + + DBUG_ASSERT(page->size == page_length); + DBUG_ASSERT(page->flag == _ma_get_keypage_flag(share, buff)); + + if (page->pos < share->base.keystart || + page->pos+block_size > share->state.state.key_file_length || + (page->pos & (maria_block_size-1))) + { + DBUG_PRINT("error",("Trying to write inside key status region: " + "key_start: %lu length: %lu page_pos: %lu", + (long) share->base.keystart, + (long) share->state.state.key_file_length, + (long) page->pos)); + my_errno=EINVAL; + DBUG_ASSERT(0); + DBUG_RETURN(1); + } + DBUG_PRINT("page",("write page at: %lu",(ulong) (page->pos / block_size))); + DBUG_DUMP("buff", buff, page_length); + DBUG_ASSERT(page_length >= share->keypage_header + nod_flag + + page->keyinfo->minlength || maria_in_recovery); + } +#endif + + /* Verify that keynr is correct */ + DBUG_ASSERT(_ma_get_keynr(share, buff) == page->keyinfo->key_nr); + +#if defined(EXTRA_DEBUG) && defined(HAVE_valgrind) && defined(NOT_ANYMORE) + { + /* This is here to catch uninitialized bytes */ + uint length= page->size; + ulong crc= my_checksum(0, buff, length); + int4store(buff + block_size - KEYPAGE_CHECKSUM_SIZE, crc); + } +#endif + + page_cleanup(share, page); + res= pagecache_write(share->pagecache, + &share->kfile, + (pgcache_page_no_t) (page->pos / block_size), + level, buff, share->page_type, + lock, + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED ? + PAGECACHE_PIN_LEFT_PINNED : + (lock == PAGECACHE_LOCK_WRITE_UNLOCK ? 
+ PAGECACHE_UNPIN : PAGECACHE_PIN), + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE); + + if (lock == PAGECACHE_LOCK_WRITE) + { + /* It was not locked before, we have to unlock it when we unpin pages */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&page->info->pinned_pages, (void*) &page_link); + } + DBUG_RETURN(res); +} + + +/** + @brief Put page in free list + + @fn _ma_dispose() + @param info Maria handle + @param pos Address to page + @param page_not_read 1 if page has not yet been read + + @note + The page at 'pos' must have been read with a write lock. + This function does logging (unlike _ma_new()). + + @return + @retval 0 ok + @retval 1 error + +*/ + +int _ma_dispose(register MARIA_HA *info, my_off_t pos, my_bool page_not_read) +{ + my_off_t old_link; + uchar buff[MAX_KEYPAGE_HEADER_SIZE+ 8 + 2]; + ulonglong page_no; + MARIA_SHARE *share= info->s; + MARIA_PINNED_PAGE page_link; + uint block_size= share->block_size; + int result= 0; + enum pagecache_page_lock lock_method; + enum pagecache_page_pin pin_method; + DBUG_ENTER("_ma_dispose"); + DBUG_PRINT("enter",("page: %lu", (ulong) (pos / block_size))); + DBUG_ASSERT(pos % block_size == 0); + + (void) _ma_lock_key_del(info, 0); + + old_link= share->key_del_current; + share->key_del_current= pos; + page_no= pos / block_size; + bzero(buff, share->keypage_header); + _ma_store_keynr(share, buff, (uchar) MARIA_DELETE_KEY_NR); + _ma_store_page_used(share, buff, share->keypage_header + 8); + mi_sizestore(buff + share->keypage_header, old_link); + share->state.changed|= STATE_NOT_SORTED_PAGES; + + if (share->now_transactional) + { + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + my_off_t page; + + /* Store address of deleted page */ + page_store(log_data + FILEID_STORE_SIZE, page_no); + + /* Store link to next unused page (the link that is written to page) */ + page= (old_link == 
HA_OFFSET_ERROR ? IMPOSSIBLE_PAGE_NO : + old_link / block_size); + page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page); + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX_FREE_PAGE, + info->trn, info, + (translog_size_t) sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data, NULL)) + result= 1; + } + + if (page_not_read) + { + lock_method= PAGECACHE_LOCK_WRITE; + pin_method= PAGECACHE_PIN; + } + else + { + lock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED; + pin_method= PAGECACHE_PIN_LEFT_PINNED; + } + + if (pagecache_write_part(share->pagecache, + &share->kfile, (pgcache_page_no_t) page_no, + PAGECACHE_PRIORITY_LOW, buff, + share->page_type, + lock_method, pin_method, + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE, + 0, share->keypage_header + 8)) + result= 1; + +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY + { + uchar *page_buff= pagecache_block_link_to_buffer(page_link.link); + bzero(page_buff + share->keypage_header + 8, + block_size - share->keypage_header - 8 - KEYPAGE_CHECKSUM_SIZE); + } +#endif + + if (page_not_read) + { + /* It was not locked before, we have to unlock it when we unpin pages */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + } + + DBUG_RETURN(result); +} /* _ma_dispose */ + + +/** + @brief Get address for free page to use + + @fn _ma_new() + @param info Maria handle + @param level Type of key block (caching priority for pagecache) + @param page_link Pointer to page in page cache if read. One can + check if this is used by checking if + page_link->changed != 0 + + @note Logging of this is left to the caller (so that the "new"ing and the + first changes done to this new page can be logged as one single entry - one + single _ma_log_new()) call). 
+ + @return + HA_OFFSET_ERROR File is full or page read error + # Page address to use +*/ + +my_off_t _ma_new(register MARIA_HA *info, int level, + MARIA_PINNED_PAGE **page_link) + +{ + my_off_t pos; + MARIA_SHARE *share= info->s; + uint block_size= share->block_size; + DBUG_ENTER("_ma_new"); + + if (_ma_lock_key_del(info, 1)) + { + pthread_mutex_lock(&share->intern_lock); + pos= share->state.state.key_file_length; + if (pos >= share->base.max_key_file_length - block_size) + { + my_errno=HA_ERR_INDEX_FILE_FULL; + pthread_mutex_unlock(&share->intern_lock); + DBUG_RETURN(HA_OFFSET_ERROR); + } + share->state.state.key_file_length+= block_size; + /* Following is for not transactional tables */ + info->state->key_file_length= share->state.state.key_file_length; + pthread_mutex_unlock(&share->intern_lock); + (*page_link)->changed= 0; + (*page_link)->write_lock= PAGECACHE_LOCK_WRITE; + } + else + { + uchar *buff; + pos= share->key_del_current; /* Protected */ + DBUG_ASSERT(share->pagecache->block_size == block_size); + if (!(buff= pagecache_read(share->pagecache, + &share->kfile, + (pgcache_page_no_t) (pos / block_size), level, + 0, share->page_type, + PAGECACHE_LOCK_WRITE, &(*page_link)->link))) + pos= HA_OFFSET_ERROR; + else + { + /* + Next deleted page's number is in the header of the present page + (single linked list): + */ +#ifndef DBUG_OFF + my_off_t key_del_current; +#endif + share->key_del_current= mi_sizekorr(buff+share->keypage_header); +#ifndef DBUG_OFF + key_del_current= share->key_del_current; + DBUG_ASSERT((key_del_current != 0) && + ((key_del_current == HA_OFFSET_ERROR) || + (key_del_current <= + (share->state.state.key_file_length - block_size)))); +#endif + } + + (*page_link)->unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + (*page_link)->write_lock= PAGECACHE_LOCK_WRITE; + /* + We have to mark it changed as _ma_flush_pending_blocks() uses + 'changed' to know if we used the page cache or not + */ + (*page_link)->changed= 1; + push_dynamic(&info->pinned_pages, 
(void*) *page_link);
+    *page_link= dynamic_element(&info->pinned_pages,
+                                info->pinned_pages.elements-1,
+                                MARIA_PINNED_PAGE *);
+  }
+  share->state.changed|= STATE_NOT_SORTED_PAGES;
+  DBUG_PRINT("exit",("Pos: %ld",(long) pos));
+  DBUG_RETURN(pos);
+} /* _ma_new */
+
+
+/**
+  Log compaction of an index page
+
+  Writes a LOGREC_REDO_INDEX record holding the page number,
+  KEY_OP_COMPACT_PAGE and min_read_from, followed by whatever
+  _ma_log_key_changes() appends.  Also sets ma_page->org_size to the
+  current ma_page->size.
+
+  @retval 0  Ok
+  @retval 1  translog_write_record() failed
+*/
+
+static my_bool _ma_log_compact_keypage(MARIA_PAGE *ma_page,
+                                       TrID min_read_from)
+{
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 1 + 7 + TRANSID_SIZE];
+  uchar *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  uint translog_parts, extra_length;
+  my_off_t page= ma_page->pos;
+  DBUG_ENTER("_ma_log_compact_keypage");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) (page / share->block_size)));
+
+  /* Store the number of the compacted page (byte position / block size) */
+  page/= share->block_size;
+  page_store(log_data + FILEID_STORE_SIZE, page);
+
+  log_pos= log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+
+  log_pos[0]= KEY_OP_COMPACT_PAGE;
+  transid_store(log_pos + 1, min_read_from);
+  log_pos+= 1 + TRANSID_SIZE;
+
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                         log_data);
+  translog_parts= 1;
+  extra_length= 0;
+
+  _ma_log_key_changes(ma_page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &extra_length, &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                            info->trn, info,
+                            log_array[TRANSLOG_INTERNAL_PARTS +
+                                      0].length + extra_length,
+                            TRANSLOG_INTERNAL_PARTS + translog_parts,
+                            log_array, log_data, NULL))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Remove all transaction id's less than given one from a key page
+
+  @fn    _ma_compact_keypage()
+  @param ma_page        Key page to compact; its buffer, size and flag are
+                        updated in place
+  @param min_read_from  Remove all trids from page less than this
+
+  @note
+    Returns at once if the page flag lacks KEYPAGE_FLAG_HAS_TRANSID.  If no
+    transid is left afterwards, the flag is cleared on the page.  For
+    transactional tables the compaction is logged.
+
+  @retval 0             Ok
+  @retval 1             Error; my_errno contains the error
+*/
+
+my_bool _ma_compact_keypage(MARIA_PAGE *ma_page, TrID min_read_from)
+{
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEY key;
+  uchar *page, *endpos, *start_of_empty_space;
+  uint page_flag, nod_flag, saved_space;
+  my_bool page_has_transid;
+  DBUG_ENTER("_ma_compact_keypage");
+
+  page_flag= ma_page->flag;
+  if (!(page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+    DBUG_RETURN(0);                    /* No transaction id on page */
+
+  nod_flag= ma_page->node;
+  page=    ma_page->buff;
+  endpos= page + ma_page->size;
+  key.data= info->lastkey_buff;
+  key.keyinfo= (MARIA_KEYDEF*) ma_page->keyinfo;
+
+  page_has_transid= 0;
+  page+= share->keypage_header + nod_flag;
+  key.data[0]= 0;                             /* safety */
+  start_of_empty_space= 0;
+  saved_space= 0;
+  do
+  {
+    if (!(page= (*ma_page->keyinfo->skip_key)(&key, 0, 0, page)))
+    {
+      DBUG_PRINT("error",("Couldn't find last key:  page_pos: 0x%lx",
+                          (long) page));
+      maria_print_error(share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(1);
+    }
+    if (key_has_transid(page-1))
+    {
+      uint transid_length;
+      transid_length= transid_packed_length(page);
+      /*
+        NOTE(review): the id is dropped when min_read_from is all ones or
+        when min_read_from < the id stored with the key, while the function
+        header speaks of ids less than min_read_from - confirm direction.
+      */
+      if (min_read_from == ~(TrID) 0 ||
+          min_read_from < transid_get_packed(share, page))
+      {
+        page[-1]&= 254;                           /* Remove transid marker */
+        transid_length= transid_packed_length(page);
+        if (start_of_empty_space)
+        {
+          /* Move block before the transid up in page */
+          uint copy_length= (uint) (page - start_of_empty_space) - saved_space;
+          memmove(start_of_empty_space, start_of_empty_space + saved_space,
+                  copy_length);
+          start_of_empty_space+= copy_length;
+        }
+        else
+          start_of_empty_space= page;
+        saved_space+= transid_length;
+      }
+      else
+        page_has_transid= 1;                /* At least one id left */
+      page+= transid_length;
+    }
+    page+= nod_flag;
+  } while (page < endpos);
+
+  DBUG_ASSERT(page == endpos);
+
+  if (start_of_empty_space)
+  {
+    /*
+      Move last block down
+      This is always true 
if any transid was removed + */ + uint copy_length= (uint) (endpos - start_of_empty_space) - saved_space; + + if (copy_length) + memmove(start_of_empty_space, start_of_empty_space + saved_space, + copy_length); + ma_page->size= (uint) (start_of_empty_space + copy_length - ma_page->buff); + page_store_size(share, ma_page); + } + + if (!page_has_transid) + { + ma_page->flag&= ~KEYPAGE_FLAG_HAS_TRANSID; + _ma_store_keypage_flag(share, ma_page->buff, ma_page->flag); + /* Clear packed transid (in case of zerofill) */ + bzero(ma_page->buff + LSN_STORE_SIZE, TRANSID_SIZE); + } + + if (share->now_transactional) + { + if (_ma_log_compact_keypage(ma_page, min_read_from)) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c new file mode 100644 index 00000000000..441310a60ea --- /dev/null +++ b/storage/maria/ma_pagecache.c @@ -0,0 +1,5104 @@ +/* Copyright (C) 2000-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + These functions handle page caching for Maria tables. + + One cache can handle many files. + It must contain buffers of the same blocksize. + init_pagecache() should be used to init cache handler. + + The free list (free_block_list) is a stack like structure. + When a block is freed by free_block(), it is pushed onto the stack. 
+ When a new block is required it is first tried to pop one from the stack. + If the stack is empty, it is tried to get a never-used block from the pool. + If this is empty too, then a block is taken from the LRU ring, flushing it + to disk, if necessary. This is handled in find_block(). + With the new free list, the blocks can have three temperatures: + hot, warm and cold (which is free). This is remembered in the block header + by the enum PCBLOCK_TEMPERATURE temperature variable. Remembering the + temperature is necessary to correctly count the number of warm blocks, + which is required to decide when blocks are allowed to become hot. Whenever + a block is inserted to another (sub-)chain, we take the old and new + temperature into account to decide if we got one more or less warm block. + blocks_unused is the sum of never used blocks in the pool and of currently + free blocks. blocks_used is the number of blocks fetched from the pool and + as such gives the maximum number of in-use blocks at any time. + + TODO: Write operation locks whole cache till the end of the operation. + Should be fixed. 
+*/ + +#include "maria_def.h" +#include <m_string.h> +#include "ma_pagecache.h" +#include "ma_blockrec.h" +#include <my_bit.h> +#include <errno.h> + +/* + Some compilation flags have been added specifically for this module + to control the following: + - not to let a thread to yield the control when reading directly + from page cache, which might improve performance in many cases; + to enable this add: + #define SERIALIZED_READ_FROM_CACHE + - to set an upper bound for number of threads simultaneously + using the page cache; this setting helps to determine an optimal + size for hash table and improve performance when the number of + blocks in the page cache much less than the number of threads + accessing it; + to set this number equal to <N> add + #define MAX_THREADS <N> + - to substitute calls of pthread_cond_wait for calls of + pthread_cond_timedwait (wait with timeout set up); + this setting should be used only when you want to trap a deadlock + situation, which theoretically should not happen; + to set timeout equal to <T> seconds add + #define PAGECACHE_TIMEOUT <T> + - to enable the module traps and to send debug information from + page cache module to a special debug log add: + #define PAGECACHE_DEBUG + the name of this debug log file <LOG NAME> can be set through: + #define PAGECACHE_DEBUG_LOG <LOG NAME> + if the name is not defined, it's set by default; + if the PAGECACHE_DEBUG flag is not set up and we are in a debug + mode, i.e. when ! defined(DBUG_OFF), the debug information from the + module is sent to the regular debug log. + + Example of the settings: + #define SERIALIZED_READ_FROM_CACHE + #define MAX_THREADS 100 + #define PAGECACHE_TIMEOUT 1 + #define PAGECACHE_DEBUG + #define PAGECACHE_DEBUG_LOG "my_pagecache_debug.log" +*/ + +/* + In key cache we have external raw locking here we use + SERIALIZED_READ_FROM_CACHE to avoid problem of reading + not consistent data from the page. 
+ (keycache functions (key_cache_read(), key_cache_insert() and + key_cache_write()) rely on external MyISAM lock, we don't) +*/ +#define SERIALIZED_READ_FROM_CACHE yes + +#define PCBLOCK_INFO(B) \ + DBUG_PRINT("info", \ + ("block: 0x%lx fd: %lu page: %lu s: %0x hshL: " \ + " 0x%lx req: %u/%u wrlocks: %u rdlocks %u " \ + "rdlocks_q: %u pins: %u status: %u type: %s", \ + (ulong)(B), \ + (ulong)((B)->hash_link ? \ + (B)->hash_link->file.file : \ + 0), \ + (ulong)((B)->hash_link ? \ + (B)->hash_link->pageno : \ + 0), \ + (B)->status, \ + (ulong)(B)->hash_link, \ + (uint) (B)->requests, \ + (uint)((B)->hash_link ? \ + (B)->hash_link->requests : \ + 0), \ + block->wlocks, block->rlocks, block->rlocks_queue, \ + (uint)(B)->pins, (uint)(B)->status, \ + page_cache_page_type_str[(B)->type])) + +/* TODO: put it to my_static.c */ +my_bool my_disable_flush_pagecache_blocks= 0; + +#define STRUCT_PTR(TYPE, MEMBER, a) \ + (TYPE *) ((char *) (a) - offsetof(TYPE, MEMBER)) + +/* types of condition variables */ +#define COND_FOR_REQUESTED 0 /* queue of thread waiting for read operation */ +#define COND_FOR_SAVED 1 /* queue of thread waiting for flush */ +#define COND_FOR_WRLOCK 2 /* queue of write lock */ +#define COND_SIZE 3 /* number of COND_* queues */ + +typedef pthread_cond_t KEYCACHE_CONDVAR; + +/* descriptor of the page in the page cache block buffer */ +struct st_pagecache_page +{ + PAGECACHE_FILE file; /* file to which the page belongs to */ + pgcache_page_no_t pageno; /* number of the page in the file */ +}; + +/* element in the chain of a hash table bucket */ +struct st_pagecache_hash_link +{ + struct st_pagecache_hash_link + *next, **prev; /* to connect links in the same bucket */ + struct st_pagecache_block_link + *block; /* reference to the block for the page: */ + PAGECACHE_FILE file; /* from such a file */ + pgcache_page_no_t pageno; /* this page */ + uint requests; /* number of requests for the page */ +}; + +/* simple states of a block */ +#define PCBLOCK_ERROR 1 /* 
an error occurred when performing disk i/o */ +#define PCBLOCK_READ 2 /* the is page in the block buffer */ +#define PCBLOCK_IN_SWITCH 4 /* block is preparing to read new page */ +#define PCBLOCK_REASSIGNED 8 /* block does not accept requests for old page */ +#define PCBLOCK_IN_FLUSH 16 /* block is in flush operation */ +#define PCBLOCK_CHANGED 32 /* block buffer contains a dirty page */ +#define PCBLOCK_DIRECT_W 64 /* possible direct write to the block */ + +/* page status, returned by find_block */ +#define PAGE_READ 0 +#define PAGE_TO_BE_READ 1 +#define PAGE_WAIT_TO_BE_READ 2 + +/* block temperature determines in which (sub-)chain the block currently is */ +enum PCBLOCK_TEMPERATURE { PCBLOCK_COLD /*free*/ , PCBLOCK_WARM , PCBLOCK_HOT }; + +/* debug info */ +#ifndef DBUG_OFF +static const char *page_cache_page_type_str[]= +{ + /* used only for control page type changing during debugging */ + "EMPTY", + "PLAIN", + "LSN", + "READ_UNKNOWN" +}; + +static const char *page_cache_page_write_mode_str[]= +{ + "DELAY", + "DONE" +}; + +static const char *page_cache_page_lock_str[]= +{ + "free -> free", + "read -> read", + "write -> write", + "free -> read", + "free -> write", + "read -> free", + "write -> free", + "write -> read" +}; + +static const char *page_cache_page_pin_str[]= +{ + "pinned -> pinned", + "unpinned -> unpinned", + "unpinned -> pinned", + "pinned -> unpinned" +}; + + +typedef struct st_pagecache_pin_info +{ + struct st_pagecache_pin_info *next, **prev; + struct st_my_thread_var *thread; +} PAGECACHE_PIN_INFO; + +/* + st_pagecache_lock_info structure should be kept in next, prev, thread part + compatible with st_pagecache_pin_info to be compatible in functions. 
+*/ + +typedef struct st_pagecache_lock_info +{ + struct st_pagecache_lock_info *next, **prev; + struct st_my_thread_var *thread; + my_bool write_lock; +} PAGECACHE_LOCK_INFO; + + +/* service functions maintain debugging info about pin & lock */ + + +/* + Links information about thread pinned/locked the block to the list + + SYNOPSIS + info_link() + list the list to link in + node the node which should be linked +*/ + +static void info_link(PAGECACHE_PIN_INFO **list, PAGECACHE_PIN_INFO *node) +{ + if ((node->next= *list)) + node->next->prev= &(node->next); + *list= node; + node->prev= list; +} + + +/* + Unlinks information about thread pinned/locked the block from the list + + SYNOPSIS + info_unlink() + node the node which should be unlinked +*/ + +static void info_unlink(PAGECACHE_PIN_INFO *node) +{ + if ((*node->prev= node->next)) + node->next->prev= node->prev; +} + + +/* + Finds information about given thread in the list of threads which + pinned/locked this block. + + SYNOPSIS + info_find() + list the list where to find the thread + thread thread ID (reference to the st_my_thread_var + of the thread) + any return any thread of the list + + RETURN + 0 - the thread was not found + pointer to the information node of the thread in the list, or, if 'any', + to any thread of the list. 
+*/ + +static PAGECACHE_PIN_INFO *info_find(PAGECACHE_PIN_INFO *list, + struct st_my_thread_var *thread, + my_bool any) +{ + register PAGECACHE_PIN_INFO *i= list; + if (any) + return i; + for(; i != 0; i= i->next) + if (i->thread == thread) + return i; + return 0; +} + +#endif /* !DBUG_OFF */ + +/* page cache block */ +struct st_pagecache_block_link +{ + struct st_pagecache_block_link + *next_used, **prev_used; /* to connect links in the LRU chain (ring) */ + struct st_pagecache_block_link + *next_changed, **prev_changed; /* for lists of file dirty/clean blocks */ + struct st_pagecache_hash_link + *hash_link; /* backward ptr to referring hash_link */ +#ifndef DBUG_OFF + PAGECACHE_PIN_INFO *pin_list; + PAGECACHE_LOCK_INFO *lock_list; +#endif + KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */ + uchar *buffer; /* buffer for the block page */ + pthread_t write_locker; + + ulonglong last_hit_time; /* timestamp of the last hit */ + WQUEUE + wqueue[COND_SIZE]; /* queues on waiting requests for new/old pages */ + uint32 requests; /* number of requests for the block */ + uint32 pins; /* pin counter */ + uint32 wlocks; /* write locks counter */ + uint32 rlocks; /* read locks counter */ + uint32 rlocks_queue; /* rd. locks waiting wr. 
lock of this thread */ + uint16 status; /* state of the block */ + int16 error; /* error code for block in case of error */ + enum PCBLOCK_TEMPERATURE temperature; /* block temperature: cold, warm, hot*/ + enum pagecache_page_type type; /* type of the block */ + uint hits_left; /* number of hits left until promotion */ + /** @brief LSN when first became dirty; LSN_MAX means "not yet set" */ + LSN rec_lsn; +}; + +/** @brief information describing a run of flush_pagecache_blocks_int() */ +struct st_file_in_flush +{ + File file; + /** + @brief threads waiting for the thread currently flushing this file to be + done + */ + WQUEUE flush_queue; + /** + @brief if the thread currently flushing the file has a non-empty + first_in_switch list. + */ + my_bool first_in_switch; +}; + +#ifndef DBUG_OFF +/* debug checks */ + +#ifdef NOT_USED +static my_bool info_check_pin(PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_pin mode + __attribute__((unused))) +{ + struct st_my_thread_var *thread= my_thread_var; + PAGECACHE_PIN_INFO *info= info_find(block->pin_list, thread); + DBUG_ENTER("info_check_pin"); + DBUG_PRINT("enter", ("thread: 0x%lx pin: %s", + (ulong) thread, page_cache_page_pin_str[mode])); + if (info) + { + if (mode == PAGECACHE_PIN_LEFT_UNPINNED) + { + DBUG_PRINT("info", + ("info_check_pin: thread: 0x%lx block: 0x%lx ; LEFT_UNPINNED!!!", + (ulong)thread, (ulong)block)); + DBUG_RETURN(1); + } + else if (mode == PAGECACHE_PIN) + { + DBUG_PRINT("info", + ("info_check_pin: thread: 0x%lx block: 0x%lx ; PIN!!!", + (ulong)thread, (ulong)block)); + DBUG_RETURN(1); + } + } + else + { + if (mode == PAGECACHE_PIN_LEFT_PINNED) + { + DBUG_PRINT("info", + ("info_check_pin: thread: 0x%lx block: 0x%lx ; LEFT_PINNED!!!", + (ulong)thread, (ulong)block)); + DBUG_RETURN(1); + } + else if (mode == PAGECACHE_UNPIN) + { + DBUG_PRINT("info", + ("info_check_pin: thread: 0x%lx block: 0x%lx ; UNPIN!!!", + (ulong)thread, (ulong)block)); + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +/* + 
Debug function which checks current lock/pin state and requested changes + + SYNOPSIS + info_check_lock() + lock requested lock changes + pin requested pin changes + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool info_check_lock(PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin) +{ + struct st_my_thread_var *thread= my_thread_var; + PAGECACHE_LOCK_INFO *info= + (PAGECACHE_LOCK_INFO *) info_find((PAGECACHE_PIN_INFO *) block->lock_list, + thread); + DBUG_ENTER("info_check_lock"); + switch(lock) { + case PAGECACHE_LOCK_LEFT_UNLOCKED: + if (pin != PAGECACHE_PIN_LEFT_UNPINNED || + info) + goto error; + break; + case PAGECACHE_LOCK_LEFT_READLOCKED: + if ((pin != PAGECACHE_PIN_LEFT_UNPINNED && + pin != PAGECACHE_PIN_LEFT_PINNED) || + info == 0 || info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_LEFT_WRITELOCKED: + if (pin != PAGECACHE_PIN_LEFT_PINNED || + info == 0 || !info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_READ: + if ((pin != PAGECACHE_PIN_LEFT_UNPINNED && + pin != PAGECACHE_PIN) || + info != 0) + goto error; + break; + case PAGECACHE_LOCK_WRITE: + if (pin != PAGECACHE_PIN || + info != 0) + goto error; + break; + case PAGECACHE_LOCK_READ_UNLOCK: + if ((pin != PAGECACHE_PIN_LEFT_UNPINNED && + pin != PAGECACHE_UNPIN) || + info == 0 || info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_WRITE_UNLOCK: + if (pin != PAGECACHE_UNPIN || + info == 0 || !info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_WRITE_TO_READ: + if ((pin != PAGECACHE_PIN_LEFT_PINNED && + pin != PAGECACHE_UNPIN) || + info == 0 || !info->write_lock) + goto error; + break; + } + DBUG_RETURN(0); +error: + DBUG_PRINT("info", + ("info_check_lock: thread: 0x%lx block 0x%lx: info: %d wrt: %d," + "to lock: %s, to pin: %s", + (ulong)thread, (ulong)block, test(info), + (info ? 
info->write_lock : 0), + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + DBUG_RETURN(1); +} +#endif /* NOT_USED */ +#endif /* !DBUG_OFF */ + +#define FLUSH_CACHE 2000 /* sort this many blocks at once */ + +static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block); +#ifndef DBUG_OFF +static void test_key_cache(PAGECACHE *pagecache, + const char *where, my_bool lock); +#endif + +#define PAGECACHE_HASH(p, f, pos) (((ulong) (pos) + \ + (ulong) (f).file) & (p->hash_entries-1)) +#define FILE_HASH(f) ((uint) (f).file & (PAGECACHE_CHANGED_BLOCKS_HASH - 1)) + +#define DEFAULT_PAGECACHE_DEBUG_LOG "pagecache_debug.log" + +#if defined(PAGECACHE_DEBUG) && ! defined(PAGECACHE_DEBUG_LOG) +#define PAGECACHE_DEBUG_LOG DEFAULT_PAGECACHE_DEBUG_LOG +#endif + +#if defined(PAGECACHE_DEBUG_LOG) +static FILE *pagecache_debug_log= NULL; +static void pagecache_debug_print _VARARGS((const char *fmt, ...)); +#define PAGECACHE_DEBUG_OPEN \ + if (!pagecache_debug_log) \ + { \ + pagecache_debug_log= fopen(PAGECACHE_DEBUG_LOG, "w"); \ + (void) setvbuf(pagecache_debug_log, NULL, _IOLBF, BUFSIZ); \ + } + +#define PAGECACHE_DEBUG_CLOSE \ + if (pagecache_debug_log) \ + { \ + fclose(pagecache_debug_log); \ + pagecache_debug_log= 0; \ + } +#else +#define PAGECACHE_DEBUG_OPEN +#define PAGECACHE_DEBUG_CLOSE +#endif /* defined(PAGECACHE_DEBUG_LOG) */ + +#if defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG) +#define KEYCACHE_DBUG_PRINT(l, m) \ + { if (pagecache_debug_log) \ + fprintf(pagecache_debug_log, "%s: ", l); \ + pagecache_debug_print m; } + +#define KEYCACHE_DBUG_ASSERT(a) \ + { if (! 
(a) && pagecache_debug_log) \ + fclose(pagecache_debug_log); \ + assert(a); } +#else +#define KEYCACHE_DBUG_PRINT(l, m) DBUG_PRINT(l, m) +#define KEYCACHE_DBUG_ASSERT(a) DBUG_ASSERT(a) +#endif /* defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG) */ + +#if defined(PAGECACHE_DEBUG) || !defined(DBUG_OFF) +#ifdef THREAD +static long pagecache_thread_id; +#define KEYCACHE_THREAD_TRACE(l) \ + KEYCACHE_DBUG_PRINT(l,("|thread %ld",pagecache_thread_id)) + +#define KEYCACHE_THREAD_TRACE_BEGIN(l) \ + { struct st_my_thread_var *thread_var= my_thread_var; \ + pagecache_thread_id= thread_var->id; \ + KEYCACHE_DBUG_PRINT(l,("[thread %ld",pagecache_thread_id)) } + +#define KEYCACHE_THREAD_TRACE_END(l) \ + KEYCACHE_DBUG_PRINT(l,("]thread %ld",pagecache_thread_id)) +#else /* THREAD */ +#define KEYCACHE_THREAD_TRACE(l) KEYCACHE_DBUG_PRINT(l,("")) +#define KEYCACHE_THREAD_TRACE_BEGIN(l) KEYCACHE_DBUG_PRINT(l,("")) +#define KEYCACHE_THREAD_TRACE_END(l) KEYCACHE_DBUG_PRINT(l,("")) +#endif /* THREAD */ +#else +#define KEYCACHE_THREAD_TRACE_BEGIN(l) +#define KEYCACHE_THREAD_TRACE_END(l) +#define KEYCACHE_THREAD_TRACE(l) +#endif /* defined(PAGECACHE_DEBUG) || !defined(DBUG_OFF) */ + +#define PCBLOCK_NUMBER(p, b) \ + ((uint) (((char*)(b)-(char *) p->block_root)/sizeof(PAGECACHE_BLOCK_LINK))) +#define PAGECACHE_HASH_LINK_NUMBER(p, h) \ + ((uint) (((char*)(h)-(char *) p->hash_link_root)/ \ + sizeof(PAGECACHE_HASH_LINK))) + +#if (defined(PAGECACHE_TIMEOUT) && !defined(__WIN__)) || defined(PAGECACHE_DEBUG) +static int pagecache_pthread_cond_wait(pthread_cond_t *cond, + pthread_mutex_t *mutex); +#else +#define pagecache_pthread_cond_wait pthread_cond_wait +#endif + +#if defined(PAGECACHE_DEBUG) +static int ___pagecache_pthread_mutex_lock(pthread_mutex_t *mutex); +static void ___pagecache_pthread_mutex_unlock(pthread_mutex_t *mutex); +static int ___pagecache_pthread_cond_signal(pthread_cond_t *cond); +#define pagecache_pthread_mutex_lock(M) \ +{ DBUG_PRINT("lock", ("mutex lock 0x%lx %u", 
(ulong)(M), __LINE__)); \ + ___pagecache_pthread_mutex_lock(M);} +#define pagecache_pthread_mutex_unlock(M) \ +{ DBUG_PRINT("lock", ("mutex unlock 0x%lx %u", (ulong)(M), __LINE__)); \ + ___pagecache_pthread_mutex_unlock(M);} +#define pagecache_pthread_cond_signal(M) \ +{ DBUG_PRINT("lock", ("signal 0x%lx %u", (ulong)(M), __LINE__)); \ + ___pagecache_pthread_cond_signal(M);} +#else +#define pagecache_pthread_mutex_lock pthread_mutex_lock +#define pagecache_pthread_mutex_unlock pthread_mutex_unlock +#define pagecache_pthread_cond_signal pthread_cond_signal +#endif /* defined(PAGECACHE_DEBUG) */ + +extern my_bool translog_flush(TRANSLOG_ADDRESS lsn); + +/* + Write page to the disk + + SYNOPSIS + pagecache_fwrite() + pagecache - page cache pointer + filedesc - pagecache file descriptor structure + buffer - buffer which we will write + type - page type (plain or with LSN) + flags - MYF() flags + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool pagecache_fwrite(PAGECACHE *pagecache, + PAGECACHE_FILE *filedesc, + uchar *buffer, + pgcache_page_no_t pageno, + enum pagecache_page_type type + __attribute__((unused)), + myf flags) +{ + DBUG_ENTER("pagecache_fwrite"); + DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE); + + /* Todo: Integrate this with write_callback so we have only one callback */ + if ((*filedesc->flush_log_callback)(buffer, pageno, filedesc->callback_data)) + DBUG_RETURN(1); + DBUG_PRINT("info", ("write_callback: 0x%lx data: 0x%lx", + (ulong) filedesc->write_callback, + (ulong) filedesc->callback_data)); + if ((*filedesc->write_callback)(buffer, pageno, filedesc->callback_data)) + { + DBUG_PRINT("error", ("write callback problem")); + DBUG_RETURN(1); + } + if (my_pwrite(filedesc->file, buffer, pagecache->block_size, + ((my_off_t) pageno << pagecache->shift), flags)) + { + (*filedesc->write_fail)(filedesc->callback_data); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + +/* + Read page from the disk + + SYNOPSIS + pagecache_fread() + pagecache - page cache 
pointer + filedesc - pagecache file descriptor structure + buffer - buffer in which we will read + pageno - page number + flags - MYF() flags +*/ +#define pagecache_fread(pagecache, filedesc, buffer, pageno, flags) \ + my_pread((filedesc)->file, buffer, pagecache->block_size, \ + ((my_off_t) pageno << pagecache->shift), flags) + + +/** + @brief set rec_lsn of pagecache block (if it is needed) + + @param block block where to set rec_lsn + @param first_REDO_LSN_for_page the LSN to set +*/ + +static inline void pagecache_set_block_rec_lsn(PAGECACHE_BLOCK_LINK *block, + LSN first_REDO_LSN_for_page) +{ + if (block->rec_lsn == LSN_MAX) + block->rec_lsn= first_REDO_LSN_for_page; + else + DBUG_ASSERT(cmp_translog_addr(block->rec_lsn, + first_REDO_LSN_for_page) <= 0); +} + + +/* + next_power(value) is 2 at the power of (1+floor(log2(value))); + e.g. next_power(2)=4, next_power(3)=4. +*/ +static inline uint next_power(uint value) +{ + return (uint) my_round_up_to_next_power((uint32) value) << 1; +} + + +/* + Initialize a page cache + + SYNOPSIS + init_pagecache() + pagecache pointer to a page cache data structure + key_cache_block_size size of blocks to keep cached data + use_mem total memory to use for the key cache + division_limit division limit (may be zero) + age_threshold age threshold (may be zero) + block_size size of block (should be power of 2) + my_read_flags Flags used for all pread/pwrite calls + Usually MY_WME in case of recovery + + RETURN VALUE + number of blocks in the key cache, if successful, + 0 - otherwise. + + NOTES. + if pagecache->inited != 0 we assume that the key cache + is already initialized. This is for now used by myisamchk, but shouldn't + be something that a program should rely on! + + It's assumed that no two threads call this function simultaneously + referring to the same key cache handle. 
+ +*/ + +ulong init_pagecache(PAGECACHE *pagecache, size_t use_mem, + uint division_limit, uint age_threshold, + uint block_size, myf my_readwrite_flags) +{ + ulong blocks, hash_links, length; + int error; + DBUG_ENTER("init_pagecache"); + DBUG_ASSERT(block_size >= 512); + + PAGECACHE_DEBUG_OPEN; + if (pagecache->inited && pagecache->disk_blocks > 0) + { + DBUG_PRINT("warning",("key cache already in use")); + DBUG_RETURN(0); + } + + pagecache->global_cache_w_requests= pagecache->global_cache_r_requests= 0; + pagecache->global_cache_read= pagecache->global_cache_write= 0; + pagecache->disk_blocks= -1; + if (! pagecache->inited) + { + if (pthread_mutex_init(&pagecache->cache_lock, MY_MUTEX_INIT_FAST) || + hash_init(&pagecache->files_in_flush, &my_charset_bin, 32, + offsetof(struct st_file_in_flush, file), + sizeof(((struct st_file_in_flush *)NULL)->file), + NULL, NULL, 0)) + goto err; + pagecache->inited= 1; + pagecache->in_init= 0; + pagecache->resize_queue.last_thread= NULL; + } + + pagecache->mem_size= use_mem; + pagecache->block_size= block_size; + pagecache->shift= my_bit_log2(block_size); + pagecache->readwrite_flags= my_readwrite_flags | MY_NABP | MY_WAIT_IF_FULL; + pagecache->org_readwrite_flags= pagecache->readwrite_flags; + DBUG_PRINT("info", ("block_size: %u", block_size)); + DBUG_ASSERT(((uint)(1 << pagecache->shift)) == block_size); + + blocks= (ulong) (use_mem / (sizeof(PAGECACHE_BLOCK_LINK) + + 2 * sizeof(PAGECACHE_HASH_LINK) + + sizeof(PAGECACHE_HASH_LINK*) * + 5/4 + block_size)); + /* + We need to support page cache with just one block to be able to do + scanning of rows-in-block files + */ + for ( ; ; ) + { + if (blocks < 8) + { + my_errno= ENOMEM; + goto err; + } + /* Set my_hash_entries to the next bigger 2 power */ + if ((pagecache->hash_entries= next_power(blocks)) < + (blocks) * 5/4) + pagecache->hash_entries<<= 1; + hash_links= 2 * blocks; +#if defined(MAX_THREADS) + if (hash_links < MAX_THREADS + blocks - 1) + hash_links= MAX_THREADS + blocks 
- 1; +#endif + while ((length= (ALIGN_SIZE(blocks * sizeof(PAGECACHE_BLOCK_LINK)) + + ALIGN_SIZE(hash_links * sizeof(PAGECACHE_HASH_LINK)) + + ALIGN_SIZE(sizeof(PAGECACHE_HASH_LINK*) * + pagecache->hash_entries))) + + (blocks << pagecache->shift) > use_mem) + blocks--; + /* Allocate memory for cache page buffers */ + if ((pagecache->block_mem= + my_large_malloc((ulong) blocks * pagecache->block_size, + MYF(MY_WME)))) + { + /* + Allocate memory for blocks, hash_links and hash entries; + For each block 2 hash links are allocated + */ + if ((pagecache->block_root= + (PAGECACHE_BLOCK_LINK*) my_malloc((size_t) length, MYF(0)))) + break; + my_large_free(pagecache->block_mem, MYF(0)); + pagecache->block_mem= 0; + } + blocks= blocks / 4*3; + } + pagecache->blocks_unused= blocks; + pagecache->disk_blocks= (long) blocks; + pagecache->hash_links= hash_links; + pagecache->hash_root= + (PAGECACHE_HASH_LINK**) ((char*) pagecache->block_root + + ALIGN_SIZE(blocks*sizeof(PAGECACHE_BLOCK_LINK))); + pagecache->hash_link_root= + (PAGECACHE_HASH_LINK*) ((char*) pagecache->hash_root + + ALIGN_SIZE((sizeof(PAGECACHE_HASH_LINK*) * + pagecache->hash_entries))); + bzero((uchar*) pagecache->block_root, + pagecache->disk_blocks * sizeof(PAGECACHE_BLOCK_LINK)); + bzero((uchar*) pagecache->hash_root, + pagecache->hash_entries * sizeof(PAGECACHE_HASH_LINK*)); + bzero((uchar*) pagecache->hash_link_root, + pagecache->hash_links * sizeof(PAGECACHE_HASH_LINK)); + pagecache->hash_links_used= 0; + pagecache->free_hash_list= NULL; + pagecache->blocks_used= pagecache->blocks_changed= 0; + + pagecache->global_blocks_changed= 0; + pagecache->blocks_available=0; /* For debugging */ + + /* The LRU chain is empty after initialization */ + pagecache->used_last= NULL; + pagecache->used_ins= NULL; + pagecache->free_block_list= NULL; + pagecache->time= 0; + pagecache->warm_blocks= 0; + pagecache->min_warm_blocks= (division_limit ? 
+ blocks * division_limit / 100 + 1 : + blocks); + pagecache->age_threshold= (age_threshold ? + blocks * age_threshold / 100 : + blocks); + + pagecache->cnt_for_resize_op= 0; + pagecache->resize_in_flush= 0; + pagecache->can_be_used= 1; + + pagecache->waiting_for_hash_link.last_thread= NULL; + pagecache->waiting_for_block.last_thread= NULL; + DBUG_PRINT("exit", + ("disk_blocks: %ld block_root: 0x%lx hash_entries: %ld\ + hash_root: 0x%lx hash_links: %ld hash_link_root: 0x%lx", + pagecache->disk_blocks, (long) pagecache->block_root, + pagecache->hash_entries, (long) pagecache->hash_root, + pagecache->hash_links, (long) pagecache->hash_link_root)); + bzero((uchar*) pagecache->changed_blocks, + sizeof(pagecache->changed_blocks[0]) * + PAGECACHE_CHANGED_BLOCKS_HASH); + bzero((uchar*) pagecache->file_blocks, + sizeof(pagecache->file_blocks[0]) * + PAGECACHE_CHANGED_BLOCKS_HASH); + + pagecache->blocks= pagecache->disk_blocks > 0 ? pagecache->disk_blocks : 0; + DBUG_RETURN((ulong) pagecache->disk_blocks); + +err: + error= my_errno; + pagecache->disk_blocks= 0; + pagecache->blocks= 0; + if (pagecache->block_mem) + { + my_large_free(pagecache->block_mem, MYF(0)); + pagecache->block_mem= NULL; + } + if (pagecache->block_root) + { + my_free(pagecache->block_root, MYF(0)); + pagecache->block_root= NULL; + } + my_errno= error; + pagecache->can_be_used= 0; + DBUG_RETURN(0); +} + + +/* + Flush all blocks in the key cache to disk +*/ + +#ifdef NOT_USED +static int flush_all_key_blocks(PAGECACHE *pagecache) +{ +#if defined(PAGECACHE_DEBUG) + uint cnt=0; +#endif + while (pagecache->blocks_changed > 0) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->used_last->next_used ; ; block=block->next_used) + { + if (block->hash_link) + { +#if defined(PAGECACHE_DEBUG) + cnt++; + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used); +#endif + if (flush_pagecache_blocks_int(pagecache, &block->hash_link->file, + FLUSH_RELEASE, NULL, NULL)) + return 1; + break; + } + if (block == 
pagecache->used_last) + break; + } + } + return 0; +} +#endif /* NOT_USED */ + +/* + Resize a key cache + + SYNOPSIS + resize_pagecache() + pagecache pointer to a page cache data structure + use_mem total memory to use for the new key cache + division_limit new division limit (if not zero) + age_threshold new age threshold (if not zero) + + RETURN VALUE + number of blocks in the key cache, if successful, + 0 - otherwise. + + NOTES. + The function first compares the memory size parameter + with the key cache value. + + If they differ the function free the the memory allocated for the + old key cache blocks by calling the end_pagecache function and + then rebuilds the key cache with new blocks by calling + init_key_cache. + + The function starts the operation only when all other threads + performing operations with the key cache let her to proceed + (when cnt_for_resize=0). + + Before being usable, this function needs: + - to receive fixes for BUG#17332 "changing key_buffer_size on a running + server can crash under load" similar to those done to the key cache + - to have us (Sanja) look at the additional constraints placed on + resizing, due to the page locking specific to this page cache. + So we disable it for now. +*/ +#if NOT_USED /* keep disabled until code is fixed see above !! 
*/ +ulong resize_pagecache(PAGECACHE *pagecache, + size_t use_mem, uint division_limit, + uint age_threshold) +{ + ulong blocks; +#ifdef THREAD + struct st_my_thread_var *thread; + WQUEUE *wqueue; + +#endif + DBUG_ENTER("resize_pagecache"); + + if (!pagecache->inited) + DBUG_RETURN(pagecache->disk_blocks); + + if(use_mem == pagecache->mem_size) + { + change_pagecache_param(pagecache, division_limit, age_threshold); + DBUG_RETURN(pagecache->disk_blocks); + } + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + +#ifdef THREAD + wqueue= &pagecache->resize_queue; + thread= my_thread_var; + wqueue_link_into_queue(wqueue, thread); + + while (wqueue->last_thread->next != thread) + { + pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock); + } +#endif + + pagecache->resize_in_flush= 1; + if (flush_all_key_blocks(pagecache)) + { + /* TODO: if this happens, we should write a warning in the log file ! */ + pagecache->resize_in_flush= 0; + blocks= 0; + pagecache->can_be_used= 0; + goto finish; + } + pagecache->resize_in_flush= 0; + pagecache->can_be_used= 0; +#ifdef THREAD + while (pagecache->cnt_for_resize_op) + { + KEYCACHE_DBUG_PRINT("resize_pagecache: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock); + } +#else + KEYCACHE_DBUG_ASSERT(pagecache->cnt_for_resize_op == 0); +#endif + + end_pagecache(pagecache, 0); /* Don't free mutex */ + /* The following will work even if use_mem is 0 */ + blocks= init_pagecache(pagecache, pagecache->block_size, use_mem, + division_limit, age_threshold, + pagecache->readwrite_flags); + +finish: +#ifdef THREAD + wqueue_unlink_from_queue(wqueue, thread); + /* Signal for the next resize request to proceeed if any */ + if (wqueue->last_thread) + { + KEYCACHE_DBUG_PRINT("resize_pagecache: signal", + ("thread %ld", wqueue->last_thread->next->id)); + pagecache_pthread_cond_signal(&wqueue->last_thread->next->suspend); + } +#endif + 
pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_RETURN(blocks); +} +#endif /* 0 */ + + +/* + Increment counter blocking resize key cache operation +*/ +static inline void inc_counter_for_resize_op(PAGECACHE *pagecache) +{ + pagecache->cnt_for_resize_op++; +} + + +/* + Decrement counter blocking resize key cache operation; + Signal the operation to proceed when counter becomes equal zero +*/ +static inline void dec_counter_for_resize_op(PAGECACHE *pagecache) +{ +#ifdef THREAD + struct st_my_thread_var *last_thread; + if (!--pagecache->cnt_for_resize_op && + (last_thread= pagecache->resize_queue.last_thread)) + { + KEYCACHE_DBUG_PRINT("dec_counter_for_resize_op: signal", + ("thread %ld", last_thread->next->id)); + pagecache_pthread_cond_signal(&last_thread->next->suspend); + } +#else + pagecache->cnt_for_resize_op--; +#endif +} + +/* + Change the page cache parameters + + SYNOPSIS + change_pagecache_param() + pagecache pointer to a page cache data structure + division_limit new division limit (if not zero) + age_threshold new age threshold (if not zero) + + RETURN VALUE + none + + NOTES. + Presently the function resets the key cache parameters + concerning midpoint insertion strategy - division_limit and + age_threshold. +*/ + +void change_pagecache_param(PAGECACHE *pagecache, uint division_limit, + uint age_threshold) +{ + DBUG_ENTER("change_pagecache_param"); + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (division_limit) + pagecache->min_warm_blocks= (pagecache->disk_blocks * + division_limit / 100 + 1); + if (age_threshold) + pagecache->age_threshold= (pagecache->disk_blocks * + age_threshold / 100); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_VOID_RETURN; +} + + +/* + Removes page cache from memory. Does NOT flush pages to disk. 
+ + SYNOPSIS + end_pagecache() + pagecache page cache handle + cleanup Complete free (Free also mutex for key cache) + + RETURN VALUE + none +*/ + +void end_pagecache(PAGECACHE *pagecache, my_bool cleanup) +{ + DBUG_ENTER("end_pagecache"); + DBUG_PRINT("enter", ("key_cache: 0x%lx", (long) pagecache)); + + if (!pagecache->inited) + DBUG_VOID_RETURN; + + if (pagecache->disk_blocks > 0) + { + if (pagecache->block_mem) + { + my_large_free(pagecache->block_mem, MYF(0)); + pagecache->block_mem= NULL; + my_free(pagecache->block_root, MYF(0)); + pagecache->block_root= NULL; + } + pagecache->disk_blocks= -1; + /* Reset blocks_changed to be safe if flush_all_key_blocks is called */ + pagecache->blocks_changed= 0; + } + + DBUG_PRINT("status", ("used: %lu changed: %lu w_requests: %lu " + "writes: %lu r_requests: %lu reads: %lu", + pagecache->blocks_used, + pagecache->global_blocks_changed, + (ulong) pagecache->global_cache_w_requests, + (ulong) pagecache->global_cache_write, + (ulong) pagecache->global_cache_r_requests, + (ulong) pagecache->global_cache_read)); + + if (cleanup) + { + hash_free(&pagecache->files_in_flush); + pthread_mutex_destroy(&pagecache->cache_lock); + pagecache->inited= pagecache->can_be_used= 0; + PAGECACHE_DEBUG_CLOSE; + } + DBUG_VOID_RETURN; +} /* end_pagecache */ + + +/* + Unlink a block from the chain of dirty/clean blocks +*/ + +static inline void unlink_changed(PAGECACHE_BLOCK_LINK *block) +{ + if (block->next_changed) + block->next_changed->prev_changed= block->prev_changed; + *block->prev_changed= block->next_changed; +} + + +/* + Link a block into the chain of dirty/clean blocks +*/ + +static inline void link_changed(PAGECACHE_BLOCK_LINK *block, + PAGECACHE_BLOCK_LINK **phead) +{ + block->prev_changed= phead; + if ((block->next_changed= *phead)) + (*phead)->prev_changed= &block->next_changed; + *phead= block; +} + + +/* + Unlink a block from the chain of dirty/clean blocks, if it's asked for, + and link it to the chain of clean blocks for the 
specified file +*/ + +static void link_to_file_list(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + PAGECACHE_FILE *file, my_bool unlink_flag) +{ + if (unlink_flag) + unlink_changed(block); + link_changed(block, &pagecache->file_blocks[FILE_HASH(*file)]); + if (block->status & PCBLOCK_CHANGED) + { + block->status&= ~PCBLOCK_CHANGED; + block->rec_lsn= LSN_MAX; + pagecache->blocks_changed--; + pagecache->global_blocks_changed--; + } +} + + +/* + Unlink a block from the chain of clean blocks for the specified + file and link it to the chain of dirty blocks for this file +*/ + +static inline void link_to_changed_list(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block) +{ + unlink_changed(block); + link_changed(block, + &pagecache->changed_blocks[FILE_HASH(block->hash_link->file)]); + block->status|=PCBLOCK_CHANGED; + pagecache->blocks_changed++; + pagecache->global_blocks_changed++; +} + + +/* + Link a block to the LRU chain at the beginning or at the end of + one of two parts. + + SYNOPSIS + link_block() + pagecache pointer to a page cache data structure + block pointer to the block to link to the LRU chain + hot <-> to link the block into the hot subchain + at_end <-> to link the block at the end of the subchain + + RETURN VALUE + none + + NOTES. + The LRU chain is represented by a circular list of block structures. + The list is double-linked of the type (**prev,*next) type. + The LRU chain is divided into two parts - hot and warm. + There are two pointers to access the last blocks of these two + parts. The beginning of the warm part follows right after the + end of the hot part. + Only blocks of the warm part can be used for replacement. 
+ The first block from the beginning of this subchain is always + taken for eviction (pagecache->last_used->next) + + LRU chain: +------+ H O T +------+ + +----| end |----...<----| beg |----+ + | +------+last +------+ | + v<-link in latest hot (new end) | + | link in latest warm (new end)->^ + | +------+ W A R M +------+ | + +----| beg |---->...----| end |----+ + +------+ +------+ins + first for eviction +*/ + +static void link_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block, + my_bool hot, my_bool at_end) +{ + PAGECACHE_BLOCK_LINK *ins; + PAGECACHE_BLOCK_LINK **ptr_ins; + + PCBLOCK_INFO(block); + KEYCACHE_DBUG_ASSERT(! (block->hash_link && block->hash_link->requests)); +#ifdef THREAD + if (!hot && pagecache->waiting_for_block.last_thread) + { + /* Signal that in the LRU warm sub-chain an available block has appeared */ + struct st_my_thread_var *last_thread= + pagecache->waiting_for_block.last_thread; + struct st_my_thread_var *first_thread= last_thread->next; + struct st_my_thread_var *next_thread= first_thread; + PAGECACHE_HASH_LINK *hash_link= + (PAGECACHE_HASH_LINK *) first_thread->opt_info; + struct st_my_thread_var *thread; + do + { + thread= next_thread; + next_thread= thread->next; + /* + We notify about the event all threads that ask + for the same page as the first thread in the queue + */ + if ((PAGECACHE_HASH_LINK *) thread->opt_info == hash_link) + { + KEYCACHE_DBUG_PRINT("link_block: signal", ("thread: %ld", thread->id)); + pagecache_pthread_cond_signal(&thread->suspend); + wqueue_unlink_from_queue(&pagecache->waiting_for_block, thread); + block->requests++; + } + } + while (thread != last_thread); + hash_link->block= block; + KEYCACHE_THREAD_TRACE("link_block: after signaling"); +#if defined(PAGECACHE_DEBUG) + KEYCACHE_DBUG_PRINT("link_block", + ("linked,unlinked block: %u status: %x #requests: %u #available: %u", + PCBLOCK_NUMBER(pagecache, block), block->status, + block->requests, pagecache->blocks_available)); +#endif + return; + } +#else 
/* THREAD */ + KEYCACHE_DBUG_ASSERT(! (!hot && pagecache->waiting_for_block.last_thread)); + /* Condition not transformed using DeMorgan, to keep the text identical */ +#endif /* THREAD */ + ptr_ins= hot ? &pagecache->used_ins : &pagecache->used_last; + ins= *ptr_ins; + if (ins) + { + ins->next_used->prev_used= &block->next_used; + block->next_used= ins->next_used; + block->prev_used= &ins->next_used; + ins->next_used= block; + if (at_end) + *ptr_ins= block; + } + else + { + /* The LRU chain is empty */ + pagecache->used_last= pagecache->used_ins= block->next_used= block; + block->prev_used= &block->next_used; + } + KEYCACHE_THREAD_TRACE("link_block"); +#if defined(PAGECACHE_DEBUG) + pagecache->blocks_available++; + KEYCACHE_DBUG_PRINT("link_block", + ("linked block: %u:%1u status: %x #requests: %u #available: %u", + PCBLOCK_NUMBER(pagecache, block), at_end, block->status, + block->requests, pagecache->blocks_available)); + KEYCACHE_DBUG_ASSERT((ulong) pagecache->blocks_available <= + pagecache->blocks_used); +#endif +} + + +/* + Unlink a block from the LRU chain + + SYNOPSIS + unlink_block() + pagecache pointer to a page cache data structure + block pointer to the block to unlink from the LRU chain + + RETURN VALUE + none + + NOTES. 
+ See NOTES for link_block +*/ + +static void unlink_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("unlink_block"); + DBUG_PRINT("unlink_block", ("unlink 0x%lx", (ulong)block)); + DBUG_ASSERT(block->next_used != NULL); + if (block->next_used == block) + { + /* The list contains only one member */ + pagecache->used_last= pagecache->used_ins= NULL; + } + else + { + block->next_used->prev_used= block->prev_used; + *block->prev_used= block->next_used; + if (pagecache->used_last == block) + pagecache->used_last= STRUCT_PTR(PAGECACHE_BLOCK_LINK, + next_used, block->prev_used); + if (pagecache->used_ins == block) + pagecache->used_ins= STRUCT_PTR(PAGECACHE_BLOCK_LINK, + next_used, block->prev_used); + } + block->next_used= NULL; + + KEYCACHE_THREAD_TRACE("unlink_block"); +#if defined(PAGECACHE_DEBUG) + KEYCACHE_DBUG_ASSERT(pagecache->blocks_available != 0); + pagecache->blocks_available--; + KEYCACHE_DBUG_PRINT("unlink_block", + ("unlinked block: 0x%lx (%u) status: %x #requests: %u #available: %u", + (ulong)block, PCBLOCK_NUMBER(pagecache, block), + block->status, + block->requests, pagecache->blocks_available)); + PCBLOCK_INFO(block); +#endif + DBUG_VOID_RETURN; +} + + +/* + Register requests for a block + + SYNOPSIS + reg_requests() + pagecache this page cache reference + block the block we request reference + count how many requests we register (it is 1 everywhere) + + NOTE + Registration of request means we are going to use this block so we exclude + it from the LRU if it is first request +*/ +static void reg_requests(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block, + int count) +{ + DBUG_ENTER("reg_requests"); + DBUG_PRINT("enter", ("block: 0x%lx (%u) status: %x reqs: %u", + (ulong)block, PCBLOCK_NUMBER(pagecache, block), + block->status, block->requests)); + PCBLOCK_INFO(block); + if (! 
block->requests) + /* First request for the block unlinks it */ + unlink_block(pagecache, block); + block->requests+= count; + DBUG_VOID_RETURN; +} + + +/* + Unregister request for a block + linking it to the LRU chain if it's the last request + + SYNOPSIS + unreg_request() + pagecache pointer to a page cache data structure + block pointer to the block to link to the LRU chain + at_end <-> to link the block at the end of the LRU chain + + RETURN VALUE + none + + NOTES. + Every linking to the LRU chain decrements by one a special block + counter (if it's positive). If the at_end parameter is TRUE the block is + added either at the end of warm sub-chain or at the end of hot sub-chain. + It is added to the hot subchain if its counter is zero and number of + blocks in warm sub-chain is not less than some low limit (determined by + the division_limit parameter). Otherwise the block is added to the warm + sub-chain. If the at_end parameter is FALSE the block is always added + at beginning of the warm sub-chain. + Thus a warm block can be promoted to the hot sub-chain when its counter + becomes zero for the first time. + At the same time the block at the very beginning of the hot subchain + might be moved to the beginning of the warm subchain if it stays untouched + for a too long time (this time is determined by parameter age_threshold). +*/ + +static void unreg_request(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, int at_end) +{ + DBUG_ENTER("unreg_request"); + DBUG_PRINT("enter", ("block 0x%lx (%u) status: %x reqs: %u", + (ulong)block, PCBLOCK_NUMBER(pagecache, block), + block->status, block->requests)); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->requests > 0); + if (! 
--block->requests) + { + my_bool hot; + if (block->hits_left) + block->hits_left--; + hot= !block->hits_left && at_end && + pagecache->warm_blocks > pagecache->min_warm_blocks; + if (hot) + { + if (block->temperature == PCBLOCK_WARM) + pagecache->warm_blocks--; + block->temperature= PCBLOCK_HOT; + KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu", + pagecache->warm_blocks)); + } + link_block(pagecache, block, hot, (my_bool)at_end); + block->last_hit_time= pagecache->time; + pagecache->time++; + + block= pagecache->used_ins; + /* Check if we should link a hot block to the warm block */ + if (block && pagecache->time - block->last_hit_time > + pagecache->age_threshold) + { + unlink_block(pagecache, block); + link_block(pagecache, block, 0, 0); + if (block->temperature != PCBLOCK_WARM) + { + pagecache->warm_blocks++; + block->temperature= PCBLOCK_WARM; + } + KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu", + pagecache->warm_blocks)); + } + } + DBUG_VOID_RETURN; +} + +/* + Remove a reader of the page in block +*/ + +static inline void remove_reader(PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("remove_reader"); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->hash_link->requests > 0); +#ifdef THREAD + if (! 
--block->hash_link->requests && block->condvar) + pagecache_pthread_cond_signal(block->condvar); +#else + --block->hash_link->requests; +#endif + DBUG_VOID_RETURN; +} + + +/* + Wait until the last reader of the page in block + signals on its termination +*/ + +static inline void wait_for_readers(PAGECACHE *pagecache + __attribute__((unused)), + PAGECACHE_BLOCK_LINK *block) +{ +#ifdef THREAD + struct st_my_thread_var *thread= my_thread_var; + while (block->hash_link->requests) + { + KEYCACHE_DBUG_PRINT("wait_for_readers: wait", + ("suspend thread: %ld block: %u", + thread->id, PCBLOCK_NUMBER(pagecache, block))); + block->condvar= &thread->suspend; + pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock); + block->condvar= NULL; + } +#else + KEYCACHE_DBUG_ASSERT(block->hash_link->requests == 0); +#endif +} + + +/* + Add a hash link to a bucket in the hash_table +*/ + +static inline void link_hash(PAGECACHE_HASH_LINK **start, + PAGECACHE_HASH_LINK *hash_link) +{ + if (*start) + (*start)->prev= &hash_link->next; + hash_link->next= *start; + hash_link->prev= start; + *start= hash_link; +} + + +/* + Remove a hash link from the hash table +*/ + +static void unlink_hash(PAGECACHE *pagecache, PAGECACHE_HASH_LINK *hash_link) +{ + KEYCACHE_DBUG_PRINT("unlink_hash", ("fd: %u pos_ %lu #requests=%u", + (uint) hash_link->file.file, (ulong) hash_link->pageno, + hash_link->requests)); + KEYCACHE_DBUG_ASSERT(hash_link->requests == 0); + if ((*hash_link->prev= hash_link->next)) + hash_link->next->prev= hash_link->prev; + hash_link->block= NULL; +#ifdef THREAD + if (pagecache->waiting_for_hash_link.last_thread) + { + /* Signal that a free hash link has appeared */ + struct st_my_thread_var *last_thread= + pagecache->waiting_for_hash_link.last_thread; + struct st_my_thread_var *first_thread= last_thread->next; + struct st_my_thread_var *next_thread= first_thread; + PAGECACHE_PAGE *first_page= (PAGECACHE_PAGE *) (first_thread->opt_info); + struct st_my_thread_var *thread; 
+ + hash_link->file= first_page->file; + DBUG_ASSERT(first_page->pageno < ((ULL(1)) << 40)); + hash_link->pageno= first_page->pageno; + do + { + PAGECACHE_PAGE *page; + thread= next_thread; + page= (PAGECACHE_PAGE *) thread->opt_info; + next_thread= thread->next; + /* + We notify about the event all threads that ask + for the same page as the first thread in the queue + */ + if (page->file.file == hash_link->file.file && + page->pageno == hash_link->pageno) + { + KEYCACHE_DBUG_PRINT("unlink_hash: signal", ("thread %ld", thread->id)); + pagecache_pthread_cond_signal(&thread->suspend); + wqueue_unlink_from_queue(&pagecache->waiting_for_hash_link, thread); + } + } + while (thread != last_thread); + link_hash(&pagecache->hash_root[PAGECACHE_HASH(pagecache, + hash_link->file, + hash_link->pageno)], + hash_link); + return; + } +#else /* THREAD */ + KEYCACHE_DBUG_ASSERT(! (pagecache->waiting_for_hash_link.last_thread)); +#endif /* THREAD */ + hash_link->next= pagecache->free_hash_list; + pagecache->free_hash_list= hash_link; +} + + +/* + Get the hash link for the page if it is in the cache (do not put the + page in the cache if it is absent there) + + SYNOPSIS + get_present_hash_link() + pagecache Pagecache reference + file file ID + pageno page number in the file + start where to put pointer to found hash bucket (for + direct referring it) + + RETURN + found hashlink pointer +*/ + +static PAGECACHE_HASH_LINK *get_present_hash_link(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + PAGECACHE_HASH_LINK ***start) +{ + reg1 PAGECACHE_HASH_LINK *hash_link; +#if defined(PAGECACHE_DEBUG) + int cnt; +#endif + DBUG_ENTER("get_present_hash_link"); + + KEYCACHE_DBUG_PRINT("get_present_hash_link", ("fd: %u pos: %lu", + (uint) file->file, (ulong) pageno)); + + /* + Find the bucket in the hash table for the pair (file, pageno); + start contains the head of the bucket list, + hash_link points to the first member of the list + */ + hash_link= *(*start= 
&pagecache->hash_root[PAGECACHE_HASH(pagecache, + *file, pageno)]); +#if defined(PAGECACHE_DEBUG) + cnt= 0; +#endif + /* Look for an element for the pair (file, pageno) in the bucket chain */ + while (hash_link && + (hash_link->pageno != pageno || + hash_link->file.file != file->file)) + { + hash_link= hash_link->next; +#if defined(PAGECACHE_DEBUG) + cnt++; + if (! (cnt <= pagecache->hash_links_used)) + { + int i; + for (i=0, hash_link= **start ; + i < cnt ; i++, hash_link= hash_link->next) + { + KEYCACHE_DBUG_PRINT("get_present_hash_link", ("fd: %u pos: %lu", + (uint) hash_link->file.file, (ulong) hash_link->pageno)); + } + } + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->hash_links_used); +#endif + } + if (hash_link) + { + /* Register the request for the page */ + hash_link->requests++; + } + /* + As soon as the caller will release the page cache's lock, "hash_link" + will be potentially obsolete (unusable) information. + */ + DBUG_RETURN(hash_link); +} + + +/* + Get the hash link for a page +*/ + +static PAGECACHE_HASH_LINK *get_hash_link(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno) +{ + reg1 PAGECACHE_HASH_LINK *hash_link; + PAGECACHE_HASH_LINK **start; + + KEYCACHE_DBUG_PRINT("get_hash_link", ("fd: %u pos: %lu", + (uint) file->file, (ulong) pageno)); + +restart: + /* try to find the page in the cache */ + hash_link= get_present_hash_link(pagecache, file, pageno, + &start); + if (!hash_link) + { + /* There is no hash link in the hash table for the pair (file, pageno) */ + if (pagecache->free_hash_list) + { + hash_link= pagecache->free_hash_list; + pagecache->free_hash_list= hash_link->next; + } + else if (pagecache->hash_links_used < pagecache->hash_links) + { + hash_link= &pagecache->hash_link_root[pagecache->hash_links_used++]; + } + else + { +#ifdef THREAD + /* Wait for a free hash link */ + struct st_my_thread_var *thread= my_thread_var; + PAGECACHE_PAGE page; + KEYCACHE_DBUG_PRINT("get_hash_link", ("waiting")); + page.file= *file; + 
page.pageno= pageno; + thread->opt_info= (void *) &page; + wqueue_link_into_queue(&pagecache->waiting_for_hash_link, thread); + KEYCACHE_DBUG_PRINT("get_hash_link: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + thread->opt_info= NULL; +#else + KEYCACHE_DBUG_ASSERT(0); +#endif + DBUG_PRINT("info", ("restarting...")); + goto restart; + } + hash_link->file= *file; + DBUG_ASSERT(pageno < ((ULL(1)) << 40)); + hash_link->pageno= pageno; + link_hash(start, hash_link); + /* Register the request for the page */ + hash_link->requests++; + } + + return hash_link; +} + + +/* + Get a block for the file page requested by a pagecache read/write operation; + If the page is not in the cache return a free block, if there is none + return the lru block after saving its buffer if the page is dirty. + + SYNOPSIS + + find_block() + pagecache pointer to a page cache data structure + file handler for the file to read page from + pageno number of the page in the file + init_hits_left how initialize the block counter for the page + wrmode <-> get for writing + reg_req Register request to thye page + page_st out {PAGE_READ,PAGE_TO_BE_READ,PAGE_WAIT_TO_BE_READ} + + RETURN VALUE + Pointer to the found block if successful, 0 - otherwise + + NOTES. + For the page from file positioned at pageno the function checks whether + the page is in the key cache specified by the first parameter. + If this is the case it immediately returns the block. + If not, the function first chooses a block for this page. If there is + no not used blocks in the key cache yet, the function takes the block + at the very beginning of the warm sub-chain. It saves the page in that + block if it's dirty before returning the pointer to it. 
+ The function returns in the page_st parameter the following values: + PAGE_READ - if page already in the block, + PAGE_TO_BE_READ - if it is to be read yet by the current thread + WAIT_TO_BE_READ - if it is to be read by another thread + If an error occurs THE PCBLOCK_ERROR bit is set in the block status. + It might happen that there are no blocks in LRU chain (in warm part) - + all blocks are unlinked for some read/write operations. Then the function + waits until first of this operations links any block back. +*/ + +static PAGECACHE_BLOCK_LINK *find_block(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + int init_hits_left, + my_bool wrmode, + my_bool reg_req, + int *page_st) +{ + PAGECACHE_HASH_LINK *hash_link; + PAGECACHE_BLOCK_LINK *block; + int error= 0; + int page_status; + + DBUG_ENTER("find_block"); + KEYCACHE_THREAD_TRACE("find_block:begin"); + DBUG_PRINT("enter", ("fd: %d pos: %lu wrmode: %d", + file->file, (ulong) pageno, wrmode)); + KEYCACHE_DBUG_PRINT("find_block", ("fd: %d pos: %lu wrmode: %d", + file->file, (ulong) pageno, + wrmode)); +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, "start of find_block", 0);); +#endif + +restart: + /* Find the hash link for the requested page (file, pageno) */ + hash_link= get_hash_link(pagecache, file, pageno); + + page_status= -1; + if ((block= hash_link->block) && + block->hash_link == hash_link && (block->status & PCBLOCK_READ)) + page_status= PAGE_READ; + + if (wrmode && pagecache->resize_in_flush) + { + /* This is a write request during the flush phase of a resize operation */ + + if (page_status != PAGE_READ) + { + /* We don't need the page in the cache: we are going to write on disk */ + DBUG_ASSERT(hash_link->requests > 0); + hash_link->requests--; + unlink_hash(pagecache, hash_link); + return 0; + } + if (!(block->status & PCBLOCK_IN_FLUSH)) + { + DBUG_ASSERT(hash_link->requests > 0); + hash_link->requests--; + /* + 
Remove block to invalidate the page in the block buffer + as we are going to write directly on disk. + Although we have an exclusive lock for the updated key part + the control can be yielded by the current thread as we might + have unfinished readers of other key parts in the block + buffer. Still we are guaranteed not to have any readers + of the key part we are writing into until the block is + removed from the cache as we set the PCBLOCK_REASSIGNED + flag (see the code below that handles reading requests). + */ + free_block(pagecache, block); + return 0; + } + /* Wait until the page is flushed on disk */ + DBUG_ASSERT(hash_link->requests > 0); + hash_link->requests--; + { +#ifdef THREAD + struct st_my_thread_var *thread= my_thread_var; + wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread); + do + { + KEYCACHE_DBUG_PRINT("find_block: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while(thread->next); +#else + KEYCACHE_DBUG_ASSERT(0); + /* + Given the use of "resize_in_flush", it seems impossible + that this whole branch is ever entered in single-threaded case + because "(wrmode && pagecache->resize_in_flush)" cannot be true. + TODO: Check this, and then put the whole branch into the + "#ifdef THREAD" guard. 
+ */ +#endif + } + /* Invalidate page in the block if it has not been done yet */ + if (block->status) + free_block(pagecache, block); + return 0; + } + + if (page_status == PAGE_READ && + (block->status & (PCBLOCK_IN_SWITCH | PCBLOCK_REASSIGNED))) + { + /* This is a request for a page to be removed from cache */ + + KEYCACHE_DBUG_PRINT("find_block", + ("request for old page in block: %u " + "wrmode: %d block->status: %d", + PCBLOCK_NUMBER(pagecache, block), wrmode, + block->status)); + /* + Only reading requests can proceed until the old dirty page is flushed, + all others are to be suspended, then resubmitted + */ + if (!wrmode && !(block->status & PCBLOCK_REASSIGNED)) + { + if (reg_req) + reg_requests(pagecache, block, 1); + } + else + { + DBUG_ASSERT(hash_link->requests > 0); + hash_link->requests--; + KEYCACHE_DBUG_PRINT("find_block", + ("request waiting for old page to be saved")); + { +#ifdef THREAD + struct st_my_thread_var *thread= my_thread_var; + /* Put the request into the queue of those waiting for the old page */ + wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread); + /* Wait until the request can be resubmitted */ + do + { + KEYCACHE_DBUG_PRINT("find_block: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while(thread->next); +#else + KEYCACHE_DBUG_ASSERT(0); + /* No parallel requests in single-threaded case */ +#endif + } + KEYCACHE_DBUG_PRINT("find_block", + ("request for old page resubmitted")); + DBUG_PRINT("info", ("restarting...")); + /* Resubmit the request */ + goto restart; + } + } + else + { + /* This is a request for a new page or for a page not to be removed */ + if (! block) + { + /* No block is assigned for the page yet */ + if (pagecache->blocks_unused) + { + if (pagecache->free_block_list) + { + /* There is a block in the free list. 
*/ + block= pagecache->free_block_list; + pagecache->free_block_list= block->next_used; + block->next_used= NULL; + } + else + { + /* There are some never used blocks, take first of them */ + block= &pagecache->block_root[pagecache->blocks_used]; + block->buffer= ADD_TO_PTR(pagecache->block_mem, + ((ulong) pagecache->blocks_used* + pagecache->block_size), + uchar*); + pagecache->blocks_used++; + } + pagecache->blocks_unused--; + DBUG_ASSERT(block->wlocks == 0); + DBUG_ASSERT(block->rlocks == 0); + DBUG_ASSERT(block->rlocks_queue == 0); + DBUG_ASSERT(block->pins == 0); + block->status= 0; +#ifndef DBUG_OFF + block->type= PAGECACHE_EMPTY_PAGE; +#endif + block->requests= 1; + block->temperature= PCBLOCK_COLD; + block->hits_left= init_hits_left; + block->last_hit_time= 0; + block->rec_lsn= LSN_MAX; + link_to_file_list(pagecache, block, file, 0); + block->hash_link= hash_link; + hash_link->block= block; + page_status= PAGE_TO_BE_READ; + DBUG_PRINT("info", ("page to be read set for page 0x%lx", + (ulong)block)); + KEYCACHE_DBUG_PRINT("find_block", + ("got free or never used block %u", + PCBLOCK_NUMBER(pagecache, block))); + } + else + { + /* There are no never used blocks, use a block from the LRU chain */ + + /* + Wait until a new block is added to the LRU chain; + several threads might wait here for the same page, + all of them must get the same block + */ + +#ifdef THREAD + if (! pagecache->used_last) + { + struct st_my_thread_var *thread= my_thread_var; + thread->opt_info= (void *) hash_link; + wqueue_link_into_queue(&pagecache->waiting_for_block, thread); + do + { + KEYCACHE_DBUG_PRINT("find_block: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); + thread->opt_info= NULL; + } +#else + KEYCACHE_DBUG_ASSERT(pagecache->used_last); +#endif + block= hash_link->block; + if (! 
block) + { + /* + Take the first block from the LRU chain + unlinking it from the chain + */ + block= pagecache->used_last->next_used; + block->hits_left= init_hits_left; + block->last_hit_time= 0; + if (reg_req) + reg_requests(pagecache, block, 1); + hash_link->block= block; + } + PCBLOCK_INFO(block); + DBUG_ASSERT(block->wlocks == 0); + DBUG_ASSERT(block->rlocks == 0); + DBUG_ASSERT(block->rlocks_queue == 0); + DBUG_ASSERT(block->pins == 0); + + if (block->hash_link != hash_link && + ! (block->status & PCBLOCK_IN_SWITCH) ) + { + /* this is a primary request for a new page */ + DBUG_ASSERT(block->wlocks == 0); + DBUG_ASSERT(block->rlocks == 0); + DBUG_ASSERT(block->rlocks_queue == 0); + DBUG_ASSERT(block->pins == 0); + block->status|= PCBLOCK_IN_SWITCH; + + KEYCACHE_DBUG_PRINT("find_block", + ("got block %u for new page", + PCBLOCK_NUMBER(pagecache, block))); + + if (block->status & PCBLOCK_CHANGED) + { + /* The block contains a dirty page - push it out of the cache */ + + KEYCACHE_DBUG_PRINT("find_block", ("block is dirty")); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + /* + The call is thread safe because only the current + thread might change the block->hash_link value + */ + DBUG_ASSERT(block->pins == 0); + error= pagecache_fwrite(pagecache, + &block->hash_link->file, + block->buffer, + block->hash_link->pageno, + block->type, + pagecache->readwrite_flags); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + pagecache->global_cache_write++; + } + + block->status|= PCBLOCK_REASSIGNED; + if (block->hash_link) + { + /* + Wait until all pending read requests + for this page are executed + (we could have avoided this waiting, if we had read + a page in the cache in a sweep, without yielding control) + */ + wait_for_readers(pagecache, block); + + /* Remove the hash link for this page from the hash table */ + unlink_hash(pagecache, block->hash_link); + /* All pending requests for this page must be resubmitted */ +#ifdef THREAD + if 
(block->wqueue[COND_FOR_SAVED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]); +#endif + } + link_to_file_list(pagecache, block, file, + (my_bool)(block->hash_link ? 1 : 0)); + PCBLOCK_INFO(block); + block->status= error ? PCBLOCK_ERROR : 0; + block->error= (int16) my_errno; +#ifndef DBUG_OFF + block->type= PAGECACHE_EMPTY_PAGE; + if (error) + my_debug_put_break_here(); +#endif + block->hash_link= hash_link; + page_status= PAGE_TO_BE_READ; + DBUG_PRINT("info", ("page to be read set for page 0x%lx", + (ulong)block)); + + KEYCACHE_DBUG_ASSERT(block->hash_link->block == block); + KEYCACHE_DBUG_ASSERT(hash_link->block->hash_link == hash_link); + } + else + { + /* This is for secondary requests for a new page only */ + KEYCACHE_DBUG_PRINT("find_block", + ("block->hash_link: %p hash_link: %p " + "block->status: %u", block->hash_link, + hash_link, block->status )); + page_status= (((block->hash_link == hash_link) && + (block->status & PCBLOCK_READ)) ? + PAGE_READ : PAGE_WAIT_TO_BE_READ); + } + } + } + else + { + if (reg_req) + reg_requests(pagecache, block, 1); + KEYCACHE_DBUG_PRINT("find_block", + ("block->hash_link: %p hash_link: %p " + "block->status: %u", block->hash_link, + hash_link, block->status )); + page_status= (((block->hash_link == hash_link) && + (block->status & PCBLOCK_READ)) ? 
+ PAGE_READ : PAGE_WAIT_TO_BE_READ); + } + } + + KEYCACHE_DBUG_ASSERT(page_status != -1); + *page_st= page_status; + DBUG_PRINT("info", + ("block: 0x%lx fd: %u pos: %lu block->status: %u page_status: %u", + (ulong) block, (uint) file->file, + (ulong) pageno, block->status, (uint) page_status)); + KEYCACHE_DBUG_PRINT("find_block", + ("block: 0x%lx fd: %d pos: %lu block->status: %u page_status: %d", + (ulong) block, + file->file, (ulong) pageno, block->status, + page_status)); + +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, "end of find_block",0);); +#endif + KEYCACHE_THREAD_TRACE("find_block:end"); + DBUG_RETURN(block); +} + + +static void add_pin(PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("add_pin"); + DBUG_PRINT("enter", ("block: 0x%lx pins: %u", + (ulong) block, + block->pins)); + PCBLOCK_INFO(block); + block->pins++; +#ifndef DBUG_OFF + { + PAGECACHE_PIN_INFO *info= + (PAGECACHE_PIN_INFO *)my_malloc(sizeof(PAGECACHE_PIN_INFO), MYF(0)); + info->thread= my_thread_var; + info_link(&block->pin_list, info); + } +#endif + DBUG_VOID_RETURN; +} + +static void remove_pin(PAGECACHE_BLOCK_LINK *block, my_bool any +#ifdef DBUG_OFF + __attribute__((unused)) +#endif + ) +{ + DBUG_ENTER("remove_pin"); + DBUG_PRINT("enter", ("block: 0x%lx pins: %u any: %d", + (ulong) block, + block->pins, (int)any)); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->pins > 0); + block->pins--; +#ifndef DBUG_OFF + { + PAGECACHE_PIN_INFO *info= info_find(block->pin_list, my_thread_var, any); + DBUG_ASSERT(info != 0); + info_unlink(info); + my_free(info, MYF(0)); + } +#endif + DBUG_VOID_RETURN; +} +#ifndef DBUG_OFF +static void info_add_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl) +{ + PAGECACHE_LOCK_INFO *info= + (PAGECACHE_LOCK_INFO *)my_malloc(sizeof(PAGECACHE_LOCK_INFO), MYF(0)); + info->thread= my_thread_var; + info->write_lock= wl; + info_link((PAGECACHE_PIN_INFO **)&block->lock_list, + (PAGECACHE_PIN_INFO *)info); +} +static void 
info_remove_lock(PAGECACHE_BLOCK_LINK *block) +{ + PAGECACHE_LOCK_INFO *info= + (PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list, + my_thread_var, FALSE); + DBUG_ASSERT(info != 0); + info_unlink((PAGECACHE_PIN_INFO *)info); + my_free(info, MYF(0)); +} +static void info_change_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl) +{ + PAGECACHE_LOCK_INFO *info= + (PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list, + my_thread_var, FALSE); + DBUG_ASSERT(info != 0); + DBUG_ASSERT(info->write_lock != wl); + info->write_lock= wl; +} +#else +#define info_add_lock(B,W) +#define info_remove_lock(B) +#define info_change_lock(B,W) +#endif + + +/** + @brief waiting for lock for read and write lock + + @parem pagecache pointer to a page cache data structure + @parem block the block to work with + @param file file of the block when it was locked + @param pageno page number of the block when it was locked + @param lock_type MY_PTHREAD_LOCK_READ or MY_PTHREAD_LOCK_WRITE + + @retval 0 OK + @retval 1 Can't lock this block, need retry +*/ + +static my_bool pagecache_wait_lock(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + PAGECACHE_FILE file, + pgcache_page_no_t pageno, + uint lock_type) +{ + /* Lock failed we will wait */ +#ifdef THREAD + struct st_my_thread_var *thread= my_thread_var; + DBUG_ENTER("pagecache_wait_lock"); + DBUG_PRINT("info", ("fail to lock, waiting... 
0x%lx", (ulong)block)); + thread->lock_type= lock_type; + wqueue_add_to_queue(&block->wqueue[COND_FOR_WRLOCK], thread); + dec_counter_for_resize_op(pagecache); + do + { + KEYCACHE_DBUG_PRINT("get_wrlock: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while(thread->next); +#else + DBUG_ASSERT(0); +#endif + PCBLOCK_INFO(block); + if ((block->status & (PCBLOCK_REASSIGNED | PCBLOCK_IN_SWITCH)) || + file.file != block->hash_link->file.file || + pageno != block->hash_link->pageno) + { + DBUG_PRINT("info", ("the block 0x%lx changed => need retry " + "status: %x files %d != %d or pages %lu != %lu", + (ulong)block, block->status, + file.file, block->hash_link->file.file, + (ulong) pageno, (ulong) block->hash_link->pageno)); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + +/** + @brief Put on the block write lock + + @parem pagecache pointer to a page cache data structure + @parem block the block to work with + + @note We have loose scheme for locking by the same thread: + * Downgrade to read lock if no other locks are taken + * Our scheme of locking allow for the same thread + - the same kind of lock + - taking read lock if write lock present + - downgrading to read lock if still other place the same + thread keep write lock + * But unlock operation number should be the same to lock operation. + * If we try to get read lock having active write locks we put read + locks to queue, and as soon as write lock(s) gone the read locks + from queue came in force. 
+ * If read lock is unlocked earlier then it came to force it + just removed from the queue + + @retval 0 OK + @retval 1 Can't lock this block, need retry +*/ + +static my_bool get_wrlock(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block) +{ + PAGECACHE_FILE file= block->hash_link->file; + pgcache_page_no_t pageno= block->hash_link->pageno; + pthread_t locker= pthread_self(); + DBUG_ENTER("get_wrlock"); + DBUG_PRINT("info", ("the block 0x%lx " + "files %d(%d) pages %lu(%lu)", + (ulong) block, + file.file, block->hash_link->file.file, + (ulong) pageno, (ulong) block->hash_link->pageno)); + PCBLOCK_INFO(block); + /* + We assume that the same thread will try write lock on block on which it + has already read lock. + */ + while ((block->wlocks && !pthread_equal(block->write_locker, locker)) || + block->rlocks) + { + /* Lock failed we will wait */ + if (pagecache_wait_lock(pagecache, block, file, pageno, + MY_PTHREAD_LOCK_WRITE)) + DBUG_RETURN(1); + } + /* we are doing it by global cache mutex protection, so it is OK */ + block->wlocks++; + block->write_locker= locker; + DBUG_PRINT("info", ("WR lock set, block 0x%lx", (ulong)block)); + DBUG_RETURN(0); +} + + +/* + @brief Put on the block read lock + + @param pagecache pointer to a page cache data structure + @param block the block to work with + @param user_file Unique handler per handler file. Used to check if + we request many write locks withing the same + statement + + @note see note for get_wrlock(). 
+ + @retvalue 0 OK + @retvalue 1 Can't lock this block, need retry +*/ + +static my_bool get_rdlock(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block) +{ + PAGECACHE_FILE file= block->hash_link->file; + pgcache_page_no_t pageno= block->hash_link->pageno; + pthread_t locker= pthread_self(); + DBUG_ENTER("get_rdlock"); + DBUG_PRINT("info", ("the block 0x%lx " + "files %d(%d) pages %lu(%lu)", + (ulong) block, + file.file, block->hash_link->file.file, + (ulong) pageno, (ulong) block->hash_link->pageno)); + PCBLOCK_INFO(block); + while (block->wlocks && !pthread_equal(block->write_locker, locker)) + { + /* Lock failed we will wait */ + if (pagecache_wait_lock(pagecache, block, file, pageno, + MY_PTHREAD_LOCK_READ)) + DBUG_RETURN(1); + } + /* we are doing it by global cache mutex protection, so it is OK */ + if (block->wlocks) + { + DBUG_ASSERT(pthread_equal(block->write_locker, locker)); + block->rlocks_queue++; + DBUG_PRINT("info", ("RD lock put into queue, block 0x%lx", (ulong)block)); + } + else + { + block->rlocks++; + DBUG_PRINT("info", ("RD lock set, block 0x%lx", (ulong)block)); + } + DBUG_RETURN(0); +} + + +/* + @brief Remove write lock from the block + + @param pagecache pointer to a page cache data structure + @param block the block to work with + @param read_lock downgrade to read lock + + @note see note for get_wrlock(). 
+*/ + +static void release_wrlock(PAGECACHE_BLOCK_LINK *block, my_bool read_lock) +{ + DBUG_ENTER("release_wrlock"); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->wlocks > 0); + DBUG_ASSERT(block->rlocks == 0); + DBUG_ASSERT(block->pins > 0); + /* Downgrade: count the caller's read lock together with the queued ones */ + if (read_lock) + block->rlocks_queue++; + /* Last write lock goes away: the queued read locks become real read locks */ + if (block->wlocks == 1) + { + block->rlocks= block->rlocks_queue; + block->rlocks_queue= 0; + } + block->wlocks--; + if (block->wlocks > 0) + DBUG_VOID_RETURN; /* Multiple write locked */ + DBUG_PRINT("info", ("WR lock reset, block 0x%lx", (ulong)block)); +#ifdef THREAD + /* release all threads waiting for read lock or one waiting for write */ + if (block->wqueue[COND_FOR_WRLOCK].last_thread) + wqueue_release_one_locktype_from_queue(&block->wqueue[COND_FOR_WRLOCK]); +#endif + PCBLOCK_INFO(block); + DBUG_VOID_RETURN; +} + +/* + @brief Remove read lock from the block + + @param block the block to work with + + @note see note for get_wrlock(). +*/ + +static void release_rdlock(PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("release_rdlock"); + PCBLOCK_INFO(block); + if (block->wlocks) + { + /* Write locked by this thread: the read lock was only counted in the queue */ + DBUG_ASSERT(pthread_equal(block->write_locker, pthread_self())); + DBUG_ASSERT(block->rlocks == 0); + DBUG_ASSERT(block->rlocks_queue > 0); + block->rlocks_queue--; + DBUG_PRINT("info", ("RD lock queue decreased, block 0x%lx", (ulong)block)); + DBUG_VOID_RETURN; + } + DBUG_ASSERT(block->rlocks > 0); + DBUG_ASSERT(block->rlocks_queue == 0); + block->rlocks--; + DBUG_PRINT("info", ("RD lock decreased, block 0x%lx", (ulong)block)); + if (block->rlocks > 0) + DBUG_VOID_RETURN; /* Multiple read locked */ + DBUG_PRINT("info", ("RD lock reset, block 0x%lx", (ulong)block)); +#ifdef THREAD + /* release all threads waiting for read lock or one waiting for write */ + if (block->wqueue[COND_FOR_WRLOCK].last_thread) + wqueue_release_one_locktype_from_queue(&block->wqueue[COND_FOR_WRLOCK]); +#endif + PCBLOCK_INFO(block); + DBUG_VOID_RETURN; +} + +/** + @brief 
Try to lock/unlock and pin/unpin the block + + @param pagecache pointer to a page cache data structure + @param block the block to work with + @param lock lock change mode + @param pin pin change mode + @param any allow unpinning block pinned by any thread; possible + only if not locked, see pagecache_unlock_by_link() + + @retval 0 OK + @retval 1 Failed to lock the block (the hash link request is + unregistered before returning); caller should retry +*/ + +static my_bool make_lock_and_pin(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + my_bool any) +{ + DBUG_ENTER("make_lock_and_pin"); + + DBUG_PRINT("enter", ("block: 0x%lx", (ulong)block)); +#ifndef DBUG_OFF + if (block) + { + DBUG_PRINT("enter", ("block: 0x%lx (%u) wrlocks: %u rdlocks: %u " + "rdlocks_q: %u pins: %u lock: %s pin: %s any %d", + (ulong)block, PCBLOCK_NUMBER(pagecache, block), + block->wlocks, block->rlocks, block->rlocks_queue, + block->pins, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin], (int)any)); + PCBLOCK_INFO(block); + } +#endif + + /* 'any' is accepted only for the "leave unlocked + unpin" combination */ DBUG_ASSERT(!any || + ((lock == PAGECACHE_LOCK_LEFT_UNLOCKED) && + (pin == PAGECACHE_UNPIN))); + + switch (lock) { + case PAGECACHE_LOCK_WRITE: /* free -> write */ + /* Writelock and pin the buffer */ + if (get_wrlock(pagecache, block)) + { + /* Couldn't lock because block changed status => need retry */ + goto retry; + } + + /* The cache is locked, so there is nothing to be afraid of */ + add_pin(block); + info_add_lock(block, 1); + break; + case PAGECACHE_LOCK_WRITE_TO_READ: /* write -> read */ + case PAGECACHE_LOCK_WRITE_UNLOCK: /* write -> free */ + /* Removes write lock; puts read lock if lock is WRITE_TO_READ */ + release_wrlock(block, lock == PAGECACHE_LOCK_WRITE_TO_READ); + /* fall through */ + case PAGECACHE_LOCK_READ_UNLOCK: /* read -> free */ + if (lock == PAGECACHE_LOCK_READ_UNLOCK) + release_rdlock(block); + /* fall through */ + case PAGECACHE_LOCK_LEFT_READLOCKED: /* read -> read */ + if (pin == PAGECACHE_UNPIN) + { + 
remove_pin(block, FALSE); + } + if (lock == PAGECACHE_LOCK_WRITE_TO_READ) + { + info_change_lock(block, 0); + } + else if (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_READ_UNLOCK) + { + info_remove_lock(block); + } + break; + case PAGECACHE_LOCK_READ: /* free -> read */ + if (get_rdlock(pagecache, block)) + { + /* Couldn't lock because block changed status => need retry */ + goto retry; + } + + if (pin == PAGECACHE_PIN) + { + /* The cache is locked, so there is nothing to be afraid of */ + add_pin(block); + } + info_add_lock(block, 0); + break; + case PAGECACHE_LOCK_LEFT_UNLOCKED: /* free -> free */ + if (pin == PAGECACHE_UNPIN) + { + remove_pin(block, any); + } + /* fall through */ + case PAGECACHE_LOCK_LEFT_WRITELOCKED: /* write -> write */ + break; /* do nothing */ + default: + DBUG_ASSERT(0); /* Should never happen */ + } + +#ifndef DBUG_OFF + if (block) + PCBLOCK_INFO(block); +#endif + DBUG_RETURN(0); +retry: + /* Lock attempt failed: drop one request from the block's hash link and report "retry" */ DBUG_PRINT("INFO", ("Retry block 0x%lx", (ulong)block)); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->hash_link->requests > 0); + block->hash_link->requests--; + PCBLOCK_INFO(block); + DBUG_RETURN(1); + +} + + +/* + Read into a key cache block buffer from disk. + + SYNOPSIS + + read_block() + pagecache pointer to a page cache data structure + block block to which buffer the data is to be read + primary <-> the current thread will read the data + + RETURN VALUE + None + + NOTES. + The function either reads a page data from file to the block buffer, + or waits until another thread reads it. What page to read is determined + by a block parameter - reference to a hash link for this page. + If an error occurs THE PCBLOCK_ERROR bit is set in the block status. 
+ + On entry cache_lock is locked +*/ + +static void read_block(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + my_bool primary) +{ + + DBUG_ENTER("read_block"); + DBUG_PRINT("enter", ("read block: 0x%lx primary: %d", + (ulong)block, primary)); + if (primary) + { + size_t error; + /* + This code is executed only by threads + that submitted primary requests + */ + + pagecache->global_cache_read++; + /* + Page is not in buffer yet, is to be read from disk; cache_lock is + released for the duration of the read and re-acquired afterwards + */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + /* + Here other threads may step in and register as secondary readers. + They will register in block->wqueue[COND_FOR_REQUESTED]. + */ + error= pagecache_fread(pagecache, &block->hash_link->file, + block->buffer, + block->hash_link->pageno, + pagecache->readwrite_flags); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (error) + { + block->status|= PCBLOCK_ERROR; + block->error= (int16) my_errno; + my_debug_put_break_here(); + } + else + { + /* Read succeeded: mark the block as read, then let the file's read_callback check the page */ block->status|= PCBLOCK_READ; + if ((*block->hash_link->file.read_callback)(block->buffer, + block->hash_link->pageno, + block->hash_link-> + file.callback_data)) + { + DBUG_PRINT("error", ("read callback problem")); + block->status|= PCBLOCK_ERROR; + block->error= (int16) my_errno; + my_debug_put_break_here(); + } + } + DBUG_PRINT("read_block", + ("primary request: new page in cache")); + /* Signal that all pending requests for this page now can be processed */ +#ifdef THREAD + if (block->wqueue[COND_FOR_REQUESTED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]); +#endif + } + else + { + /* + This code is executed only by threads + that submitted secondary requests + */ + +#ifdef THREAD + struct st_my_thread_var *thread= my_thread_var; + /* Put the request into a queue and wait until it can be processed */ + wqueue_add_to_queue(&block->wqueue[COND_FOR_REQUESTED], thread); + do + { + DBUG_PRINT("read_block: wait", + ("suspend thread %ld", thread->id)); + 
pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); +#else + KEYCACHE_DBUG_ASSERT(0); + /* No parallel requests in single-threaded case */ +#endif + DBUG_PRINT("read_block", + ("secondary request: new page in cache")); + } + DBUG_VOID_RETURN; +} + + +/** + @brief Set LSN on the page to the given one if the given LSN is bigger + + @param pagecache pointer to a page cache data structure + @param lsn LSN to set + @param block block to check and set + + @note The LSN is read from and stored at the start of block->buffer; + when it is stored the block is linked to the changed list unless it + is already marked PCBLOCK_CHANGED. +*/ + +static void check_and_set_lsn(PAGECACHE *pagecache, + LSN lsn, PAGECACHE_BLOCK_LINK *block) +{ + LSN old; + DBUG_ENTER("check_and_set_lsn"); + /* + In recovery, we can _ma_unpin_all_pages() to put a LSN on page, though + page would be PAGECACHE_PLAIN_PAGE (transactionality temporarily disabled + to not log REDOs). + */ + DBUG_ASSERT((block->type == PAGECACHE_LSN_PAGE) || maria_in_recovery); + old= lsn_korr(block->buffer); + DBUG_PRINT("info", ("old lsn: (%lu, 0x%lx) new lsn: (%lu, 0x%lx)", + LSN_IN_PARTS(old), LSN_IN_PARTS(lsn))); + if (cmp_translog_addr(lsn, old) > 0) + { + + DBUG_ASSERT(block->type != PAGECACHE_READ_UNKNOWN_PAGE); + lsn_store(block->buffer, lsn); + /* we stored LSN in page so we dirtied it */ + if (!(block->status & PCBLOCK_CHANGED)) + link_to_changed_list(pagecache, block); + } + DBUG_VOID_RETURN; +} + + +/** + @brief Unlock/unpin page and put LSN stamp if needed + + @param pagecache pointer to a page cache data structure + @param file handler for the file for the block of data to be read + @param pageno number of the block of data in the file + @param lock lock change + @param pin pin page + @param first_REDO_LSN_for_page do not set it if it is zero + @param lsn if it is not LSN_IMPOSSIBLE (0) and it + is bigger than LSN on the page it will be written on + the page + @param was_changed should be true if the page was write locked with + direct link giving and the page was changed + + @note + Pinning uses the request registration mechanism; it works 
the following way: + | beginning | ending | + | of func. | of func. | + ----------------------------+-------------+---------------+ + PAGECACHE_PIN_LEFT_PINNED | - | - | + PAGECACHE_PIN_LEFT_UNPINNED | reg request | unreg request | + PAGECACHE_PIN | reg request | - | + PAGECACHE_UNPIN | - | unreg request | + + +*/ + +void pagecache_unlock(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn, my_bool was_changed) +{ + PAGECACHE_BLOCK_LINK *block; + int page_st; + DBUG_ENTER("pagecache_unlock"); + DBUG_PRINT("enter", ("fd: %u page: %lu %s %s", + (uint) file->file, (ulong) pageno, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + /* we do not allow any lock/pin increasing here */ + DBUG_ASSERT(pin != PAGECACHE_PIN); + DBUG_ASSERT(lock != PAGECACHE_LOCK_READ); + DBUG_ASSERT(lock != PAGECACHE_LOCK_WRITE); + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* + As soon as we keep lock cache can be used, and we have lock because want + to unlock. 
+ */ + DBUG_ASSERT(pagecache->can_be_used); + + inc_counter_for_resize_op(pagecache); + /* See NOTE for pagecache_unlock about registering requests */ + block= find_block(pagecache, file, pageno, 0, 0, + pin == PAGECACHE_PIN_LEFT_UNPINNED, &page_st); + PCBLOCK_INFO(block); + DBUG_ASSERT(block != 0 && page_st == PAGE_READ); + /* A non-zero first_REDO_LSN_for_page is only accepted together with write-unlock + unpin */ if (first_REDO_LSN_for_page) + { + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK); + DBUG_ASSERT(pin == PAGECACHE_UNPIN); + pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page); + } + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + + /* if we lock for write we must link the block to changed blocks */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0 || + (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_WRITE_TO_READ || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED)); + /* + if was_changed then status should be PCBLOCK_DIRECT_W or marked + as dirty + */ + DBUG_ASSERT(!was_changed || (block->status & PCBLOCK_DIRECT_W) || + (block->status & PCBLOCK_CHANGED)); + /* Write lock is being given up on a directly written block: link it to the changed list if it was changed and drop the PCBLOCK_DIRECT_W mark */ if ((block->status & PCBLOCK_DIRECT_W) && + (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_WRITE_TO_READ)) + { + if (!(block->status & PCBLOCK_CHANGED) && was_changed) + link_to_changed_list(pagecache, block); + block->status&= ~PCBLOCK_DIRECT_W; + DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx", + (ulong) block)); + } + + if (make_lock_and_pin(pagecache, block, lock, pin, FALSE)) + { + DBUG_ASSERT(0); /* should not happen */ + } + + remove_reader(block); + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. 
+ */ + if (pin != PAGECACHE_PIN_LEFT_PINNED) + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + + +/* + Unpin page + + SYNOPSIS + pagecache_unpin() + pagecache pointer to a page cache data structure + file handler for the file for the block of data to be read + pageno number of the block of data in the file + lsn if it is not LSN_IMPOSSIBLE (0) and it + is bigger than LSN on the page it will be written on + the page +*/ + +void pagecache_unpin(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + LSN lsn) +{ + PAGECACHE_BLOCK_LINK *block; + int page_st; + DBUG_ENTER("pagecache_unpin"); + DBUG_PRINT("enter", ("fd: %u page: %lu", + (uint) file->file, (ulong) pageno)); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* + As soon as we keep lock cache can be used, and we have lock because we + want to unlock. + */ + DBUG_ASSERT(pagecache->can_be_used); + + inc_counter_for_resize_op(pagecache); + /* See NOTE for pagecache_unlock about registering requests */ + block= find_block(pagecache, file, pageno, 0, 0, 0, &page_st); + DBUG_ASSERT(block != 0); + DBUG_ASSERT(page_st == PAGE_READ); + /* we can't unpin such page without unlock */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0); + + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + + /* + we can just unpin only with keeping read lock because: + a) we can't pin without any lock + b) we can't unpin keeping write lock + */ + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_LEFT_READLOCKED, + PAGECACHE_UNPIN, FALSE)) + DBUG_ASSERT(0); /* should not happen */ + + remove_reader(block); + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. 
+ See NOTE for pagecache_unlock about registering requests + */ + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + + +/** + @brief Unlock/unpin page and put LSN stamp if needed + (uses direct block/page pointer) + + @param pagecache pointer to a page cache data structure + @param block direct link to page (returned by read or write) + @param lock lock change + @param pin pin page + @param first_REDO_LSN_for_page do not set it if it is LSN_IMPOSSIBLE (0) + @param lsn if it is not LSN_IMPOSSIBLE and it is bigger than + LSN on the page it will be written on the page + @param was_changed should be true if the page was write locked with + direct link giving and the page was changed + @param any allow unpinning block pinned by any thread; possible + only if not locked + + @note 'any' is a hack so that _ma_bitmap_unpin_all() is allowed to unpin + non-locked bitmap pages pinned by other threads. Because it always uses + PAGECACHE_LOCK_LEFT_UNLOCKED and PAGECACHE_UNPIN + (see write_changed_bitmap()), the hack is limited to these conditions. +*/ + +void pagecache_unlock_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn, my_bool was_changed, + my_bool any) +{ + DBUG_ENTER("pagecache_unlock_by_link"); + DBUG_PRINT("enter", ("block: 0x%lx fd: %u page: %lu changed: %d %s %s", + (ulong) block, + (uint) block->hash_link->file.file, + (ulong) block->hash_link->pageno, was_changed, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + /* + We do not allow any lock/pin increasing here and page can't be + unpinned because we use direct link. 
+ */ + DBUG_ASSERT(pin != PAGECACHE_PIN); + DBUG_ASSERT(pin != PAGECACHE_PIN_LEFT_UNPINNED); + DBUG_ASSERT(lock != PAGECACHE_LOCK_READ); + DBUG_ASSERT(lock != PAGECACHE_LOCK_WRITE); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* NOTE(review): the assert above already rejects PAGECACHE_PIN_LEFT_UNPINNED, so this shortcut (read-unlock only, no LSN/status/request handling) can only be taken when asserts are compiled out - confirm which of the two is intended */ if (pin == PAGECACHE_PIN_LEFT_UNPINNED && + lock == PAGECACHE_LOCK_READ_UNLOCK) + { + if (make_lock_and_pin(pagecache, block, lock, pin, FALSE)) + DBUG_ASSERT(0); /* should not happen */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_VOID_RETURN; + } + + /* + As soon as we keep lock cache can be used, and we have lock because want + unlock. + */ + DBUG_ASSERT(pagecache->can_be_used); + + inc_counter_for_resize_op(pagecache); + if (was_changed) + { + if (first_REDO_LSN_for_page != LSN_IMPOSSIBLE) + { + /* + LOCK_READ_UNLOCK is ok here as the page may have first locked + with WRITE lock that was temporarily converted to READ lock before + it's unpinned + */ + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_READ_UNLOCK); + DBUG_ASSERT(pin == PAGECACHE_UNPIN); + pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page); + } + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + /* + Reset error flag. 
Mark also that page is active; This may not have + been the case if there was an error reading the page + */ + block->status= (block->status & ~PCBLOCK_ERROR) | PCBLOCK_READ; + } + + /* if we lock for write we must link the block to changed blocks */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0 || + (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_WRITE_TO_READ || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED)); + /* + If was_changed then status should be PCBLOCK_DIRECT_W or marked + as dirty + */ + DBUG_ASSERT(!was_changed || (block->status & PCBLOCK_DIRECT_W) || + (block->status & PCBLOCK_CHANGED)); + /* Write lock is being given up on a directly written block: link it to the changed list if it was changed and drop the PCBLOCK_DIRECT_W mark */ if ((block->status & PCBLOCK_DIRECT_W) && + (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_WRITE_TO_READ)) + { + if (!(block->status & PCBLOCK_CHANGED) && was_changed) + link_to_changed_list(pagecache, block); + block->status&= ~PCBLOCK_DIRECT_W; + DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx", + (ulong) block)); + } + + if (make_lock_and_pin(pagecache, block, lock, pin, any)) + DBUG_ASSERT(0); /* should not happen */ + + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. 
+ */ + if (pin != PAGECACHE_PIN_LEFT_PINNED) + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + + +/* + Unpin page + (uses direct block/page pointer) + + SYNOPSIS + pagecache_unpin_by_link() + pagecache pointer to a page cache data structure + block direct link to page (returned by read or write) + lsn if it is not LSN_IMPOSSIBLE (0) and it + is bigger than LSN on the page it will be written on + the page +*/ + +void pagecache_unpin_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + LSN lsn) +{ + DBUG_ENTER("pagecache_unpin_by_link"); + DBUG_PRINT("enter", ("block: 0x%lx fd: %u page: %lu", + (ulong) block, + (uint) block->hash_link->file.file, + (ulong) block->hash_link->pageno)); + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* + As soon as we keep lock cache can be used, and we have lock because want + unlock. + */ + DBUG_ASSERT(pagecache->can_be_used); + /* we can't unpin such page without unlock */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0); + + inc_counter_for_resize_op(pagecache); + + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + + /* + We can just unpin only with keeping read lock because: + a) we can't pin without any lock + b) we can't unpin keeping write lock + */ + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_LEFT_READLOCKED, + PAGECACHE_UNPIN, FALSE)) + DBUG_ASSERT(0); /* should not happen */ + + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. 
+ */ + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + +/* description of how to change lock before and after read/write */ +struct rw_lock_change +{ + my_bool need_lock_change; /* non-zero: unlock_lock has to be applied at the end */ + enum pagecache_page_lock new_lock; /* lock change applied at the beginning */ + enum pagecache_page_lock unlock_lock; /* lock change applied at the end */ +}; + +/* description of how to change pin before and after read/write */ +struct rw_pin_change +{ + enum pagecache_page_pin new_pin; /* pin change applied at the beginning */ + enum pagecache_page_pin unlock_pin; /* pin change applied at the end */ +}; + +/** + Depending on the lock which the user wants in pagecache_read(), we + need to acquire a first type of lock at start of pagecache_read(), and + downgrade it to a second type of lock at end. For example, if user + asked for no lock (PAGECACHE_LOCK_LEFT_UNLOCKED) this translates into + taking first a read lock PAGECACHE_LOCK_READ (to rightfully block on + existing write locks) then read then unlock the lock i.e. change lock + to PAGECACHE_LOCK_READ_UNLOCK (the "1" below tells that a change is + needed). 
+*/ + +static struct rw_lock_change lock_to_read[8]= +{ + { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/ + 1, + PAGECACHE_LOCK_READ, PAGECACHE_LOCK_READ_UNLOCK + }, + { /*PAGECACHE_LOCK_LEFT_READLOCKED*/ + 0, + PAGECACHE_LOCK_LEFT_READLOCKED, PAGECACHE_LOCK_LEFT_READLOCKED + }, + { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/ + 0, + PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_LEFT_WRITELOCKED + }, + { /*PAGECACHE_LOCK_READ*/ + 1, + PAGECACHE_LOCK_READ, PAGECACHE_LOCK_LEFT_READLOCKED + }, + { /*PAGECACHE_LOCK_WRITE*/ + 1, + PAGECACHE_LOCK_WRITE, PAGECACHE_LOCK_LEFT_WRITELOCKED + }, + { /*PAGECACHE_LOCK_READ_UNLOCK*/ + 1, + PAGECACHE_LOCK_LEFT_READLOCKED, PAGECACHE_LOCK_READ_UNLOCK + }, + { /*PAGECACHE_LOCK_WRITE_UNLOCK*/ + 1, + PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_WRITE_UNLOCK + }, + { /*PAGECACHE_LOCK_WRITE_TO_READ*/ + 1, + PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_WRITE_TO_READ + } +}; + +/** + Two sets of pin modes (same idea as lock_to_read above, but for pinning). + The difference between the sets is whether we are going to provide the + caller with a reference to the block or not: pagecache_read() uses + (buff == 0) as the first index, so set 0 is used when the caller supplies + a buffer and set 1 when the caller gets a pointer into the cache. The + second index is the requested lock mode. +*/ + +static struct rw_pin_change lock_to_pin[2][8]= +{ + { + { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED + }, + { /*PAGECACHE_LOCK_LEFT_READLOCKED*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED, + }, + { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_PIN_LEFT_PINNED + }, + { /*PAGECACHE_LOCK_READ*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED + }, + { /*PAGECACHE_LOCK_WRITE*/ + PAGECACHE_PIN, + PAGECACHE_PIN_LEFT_PINNED + }, + { /*PAGECACHE_LOCK_READ_UNLOCK*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED + }, + { /*PAGECACHE_LOCK_WRITE_UNLOCK*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_UNPIN + }, + { /*PAGECACHE_LOCK_WRITE_TO_READ*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_UNPIN + } + }, + { + { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/ + PAGECACHE_PIN_LEFT_UNPINNED, + 
PAGECACHE_PIN_LEFT_UNPINNED + }, + { /*PAGECACHE_LOCK_LEFT_READLOCKED*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED, + }, + { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_PIN_LEFT_PINNED + }, + { /*PAGECACHE_LOCK_READ*/ + PAGECACHE_PIN, + PAGECACHE_PIN_LEFT_PINNED + }, + { /*PAGECACHE_LOCK_WRITE*/ + PAGECACHE_PIN, + PAGECACHE_PIN_LEFT_PINNED + }, + { /*PAGECACHE_LOCK_READ_UNLOCK*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED + }, + { /*PAGECACHE_LOCK_WRITE_UNLOCK*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_UNPIN + }, + { /*PAGECACHE_LOCK_WRITE_TO_READ*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_PIN_LEFT_PINNED, + } + } +}; + + +/* + @brief Read a block of data from a cached file into a buffer; + + @param pagecache pointer to a page cache data structure + @param file handler for the file for the block of data to be read + @param pageno number of the block of data in the file + @param level determines the weight of the data + @param buff buffer to where the data must be placed + @param type type of the page + @param lock lock change + @param page_link link to the page if we pin it (may be NULL) + + @return address from where the data is placed if successful, 0 - otherwise. + + @note Pin will be chosen according to lock parameter (see lock_to_pin) + + @note 'buff', if not NULL, must be long-aligned. + + @note If buff==0 then we provide reference on the page so should keep the + page pinned. 
+*/ + +uchar *pagecache_read(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint level, + uchar *buff, + enum pagecache_page_type type, + enum pagecache_page_lock lock, + PAGECACHE_BLOCK_LINK **page_link) +{ + my_bool error= 0; + enum pagecache_page_pin + new_pin= lock_to_pin[buff==0][lock].new_pin, + unlock_pin= lock_to_pin[buff==0][lock].unlock_pin; + PAGECACHE_BLOCK_LINK *fake_link; + my_bool reg_request; +#ifndef DBUG_OFF + char llbuf[22]; + DBUG_ENTER("pagecache_read"); + DBUG_PRINT("enter", ("fd: %u page: %s buffer: 0x%lx level: %u " + "t:%s (%d)%s->%s %s->%s", + (uint) file->file, ullstr(pageno, llbuf), + (ulong) buff, level, + page_cache_page_type_str[type], + lock_to_read[lock].need_lock_change, + page_cache_page_lock_str[lock_to_read[lock].new_lock], + page_cache_page_lock_str[lock_to_read[lock].unlock_lock], + page_cache_page_pin_str[new_pin], + page_cache_page_pin_str[unlock_pin])); + /* Without a caller buffer the page must stay pinned at the end */ DBUG_ASSERT(buff != 0 || (buff == 0 && (unlock_pin == PAGECACHE_PIN || + unlock_pin == PAGECACHE_PIN_LEFT_PINNED))); + DBUG_ASSERT(pageno < ((ULL(1)) << 40)); +#endif + + /* Callers that pass no page_link get a local dummy so *page_link is always writable */ if (!page_link) + page_link= &fake_link; + *page_link= 0; /* Catch errors */ + +restart: + + if (pagecache->can_be_used) + { + /* Key cache is used */ + PAGECACHE_BLOCK_LINK *block; + uint status; + int page_st; + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* Re-check can_be_used now that cache_lock is held */ if (!pagecache->can_be_used) + { + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + goto no_key_cache; + } + + inc_counter_for_resize_op(pagecache); + pagecache->global_cache_r_requests++; + /* See NOTE for pagecache_unlock about registering requests. 
*/ + reg_request= ((new_pin == PAGECACHE_PIN_LEFT_UNPINNED) || + (new_pin == PAGECACHE_PIN)); + block= find_block(pagecache, file, pageno, level, + lock == PAGECACHE_LOCK_WRITE, + reg_request, &page_st); + DBUG_PRINT("info", ("Block type: %s current type %s", + page_cache_page_type_str[block->type], + page_cache_page_type_str[type])); + if (((block->status & PCBLOCK_ERROR) == 0) && (page_st != PAGE_READ)) + { + /* The requested page is to be read into the block buffer */ + read_block(pagecache, block, + (my_bool)(page_st == PAGE_TO_BE_READ)); + DBUG_PRINT("info", ("read is done")); + } + /* + Assert after block is read. Imagine two concurrent SELECTs on same + table (thread1 and 2), which want to pagecache_read() the same + pageno/fileno. Thread1 calls find_block(), decides to evict a dirty + page from LRU; while it's writing this dirty page to disk, it is + pre-empted and thread2 runs its find_block(), gets the block (in + PAGE_TO_BE_READ state). This block still contains the in-eviction + dirty page, so it has that page's type, which cannot be tested. + So thread2 has to wait for read_block() to finish (when it wakes up in + read_block(), it's woken up by read_block() of thread1, which implies + that block's type was set to EMPTY by thread1 as part of find_block()). + */ + DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE || + block->type == type || + type == PAGECACHE_LSN_PAGE || + type == PAGECACHE_READ_UNKNOWN_PAGE || + block->type == PAGECACHE_READ_UNKNOWN_PAGE); + /* A read of "unknown" type only sets the block type when the block has none yet */ if (type != PAGECACHE_READ_UNKNOWN_PAGE || + block->type == PAGECACHE_EMPTY_PAGE) + block->type= type; + + if (make_lock_and_pin(pagecache, block, lock_to_read[lock].new_lock, + new_pin, FALSE)) + { + /* + We failed to write lock the block, cache is unlocked, + we will try to get the block again. 
+ */ + if (reg_request) + unreg_request(pagecache, block, 1); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_PRINT("info", ("restarting...")); + goto restart; + } + + /* Remember the status now; the error check after unlocking uses this copy */ status= block->status; + if (!buff) + { + /* No caller buffer: hand out a pointer to the cached page itself */ buff= block->buffer; + /* possibly we will write here (resolved on unlock) */ + if ((lock == PAGECACHE_LOCK_WRITE || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED) && + !(block->status & PCBLOCK_CHANGED)) + { + block->status|= PCBLOCK_DIRECT_W; + DBUG_PRINT("info", ("Set PCBLOCK_DIRECT_W for block: 0x%lx", + (ulong) block)); + } + } + else + { + if (!(status & PCBLOCK_ERROR)) + { +#if !defined(SERIALIZED_READ_FROM_CACHE) + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); +#endif + + DBUG_ASSERT((pagecache->block_size & 511) == 0); + /* Copy data from the cache buffer */ + bmove512(buff, block->buffer, pagecache->block_size); + +#if !defined(SERIALIZED_READ_FROM_CACHE) + pagecache_pthread_mutex_lock(&pagecache->cache_lock); +#endif + } + else + my_errno= block->error; + } + + remove_reader(block); + if (lock_to_read[lock].need_lock_change) + { + if (make_lock_and_pin(pagecache, block, + lock_to_read[lock].unlock_lock, + unlock_pin, FALSE)) + DBUG_ASSERT(0); /* should not happen */ + } + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. 
+ */ + if (unlock_pin == PAGECACHE_PIN_LEFT_UNPINNED || + unlock_pin == PAGECACHE_UNPIN) + unreg_request(pagecache, block, 1); + else + *page_link= block; + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + if (status & PCBLOCK_ERROR) + { + DBUG_ASSERT(my_errno != 0); + DBUG_PRINT("error", ("Got error %d when doing page read", my_errno)); + DBUG_RETURN((uchar *) 0); + } + + DBUG_RETURN(buff); + } + +no_key_cache: /* Key cache is not used */ + + /* We can't use mutex here as the key cache may not be initialized */ + pagecache->global_cache_r_requests++; + pagecache->global_cache_read++; + /* NOTE(review): buff is passed on as given, so a caller that passed buff == NULL reaches pagecache_fread() with a NULL buffer here - confirm callers never do that when the cache is disabled */ if (pagecache_fread(pagecache, file, buff, pageno, + pagecache->readwrite_flags)) + error= 1; + DBUG_RETURN(error ? (uchar*) 0 : buff); +} + + +/* + @brief Delete page from the buffer (common part for link and file/page) + + @param pagecache pointer to a page cache data structure + @param block direct link to page (returned by read or write) + @param page_link hash link of the block + @param flush flush page if it is dirty + + @retval 0 deleted or was not present at all + @retval 1 error + + @note dec_counter_for_resize_op() is called on both the success and the + error path. + +*/ + +static my_bool pagecache_delete_internal(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + PAGECACHE_HASH_LINK *page_link, + my_bool flush) +{ + my_bool error= 0; + if (block->status & PCBLOCK_CHANGED) + { + if (flush) + { + /* The block contains a dirty page - push it out of the cache */ + + KEYCACHE_DBUG_PRINT("find_block", ("block is dirty")); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + /* + The call is thread safe because only the current + thread might change the block->hash_link value + */ + DBUG_ASSERT(block->pins == 1); + error= pagecache_fwrite(pagecache, + &block->hash_link->file, + block->buffer, + block->hash_link->pageno, + block->type, + pagecache->readwrite_flags); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + pagecache->global_cache_write++; + + if (error) + { + block->status|= 
PCBLOCK_ERROR; + block->error= (int16) my_errno; + my_debug_put_break_here(); + goto err; + } + } + pagecache->blocks_changed--; + pagecache->global_blocks_changed--; + /* + free_block() will change the status and rec_lsn of the block so no + need to change them here. + */ + } + /* Cache is locked, so we can release page before freeing it */ + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, FALSE)) + DBUG_ASSERT(0); /* should not happen */ + DBUG_ASSERT(block->hash_link->requests > 0); + page_link->requests--; + /* See NOTE for pagecache_unlock about registering requests. */ + free_block(pagecache, block); + +err: + /* Reached on success (fall through) and on write error (goto) */ dec_counter_for_resize_op(pagecache); + return error; +} + + +/* + @brief Delete page from the buffer by link + + @param pagecache pointer to a page cache data structure + @param block direct link to page (returned by read or write) + @param lock lock change + @param flush flush page if it is dirty + + @retval 0 deleted or was not present at all + @retval 1 error + + @note lock can be only PAGECACHE_LOCK_LEFT_WRITELOCKED (page was + write locked before) or PAGECACHE_LOCK_WRITE (delete will write + lock page before delete) +*/ + +my_bool pagecache_delete_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_lock lock, + my_bool flush) +{ + my_bool error= 0; + enum pagecache_page_pin pin= PAGECACHE_PIN_LEFT_PINNED; + DBUG_ENTER("pagecache_delete_by_link"); + DBUG_PRINT("enter", ("fd: %d block 0x%lx %s %s", + block->hash_link->file.file, + (ulong) block, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED); + DBUG_ASSERT(block->pins != 0); /* should be pinned */ + + if (pagecache->can_be_used) + { + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* Re-check can_be_used now that cache_lock is held */ if (!pagecache->can_be_used) + goto end; + + /* + This block should be pinned (i.e. 
has not zero request counter) => + Such block can't be chosen for eviction. + */ + DBUG_ASSERT((block->status & + (PCBLOCK_IN_SWITCH | PCBLOCK_REASSIGNED)) == 0); + /* + make_lock_and_pin() can't fail here, because we are keeping pin on the + block and it can't be evicted (which is cause of lock fail and retry) + */ + if (make_lock_and_pin(pagecache, block, lock, pin, FALSE)) + DBUG_ASSERT(0); + + /* + get_present_hash_link() side effect emulation before call + pagecache_delete_internal() + */ + block->hash_link->requests++; + + error= pagecache_delete_internal(pagecache, block, block->hash_link, + flush); +end: + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + } + + DBUG_RETURN(error); +} + + +/** + @brief Returns "hits" for promotion + + @return "hits" for promotion +*/ + +uint pagecache_pagelevel(PAGECACHE_BLOCK_LINK *block) +{ + return block->hits_left; +} + +/* + @brief Adds "hits" to the page + + @param link direct link to page (returned by read or write) + @param level number of "hits" which we add to the page +*/ + +void pagecache_add_level_by_link(PAGECACHE_BLOCK_LINK *block, + uint level) +{ + DBUG_ASSERT(block->pins != 0); /* should be pinned */ + /* + Operation is just for statistics so it is not really important + if it interfere with other hit increasing => we are doing it without + locking the pagecache. 
+ */ + block->hits_left+= level; +} + +/* + @brief Delete page from the buffer + + @param pagecache pointer to a page cache data structure + @param file handler for the file for the block of data to be read + @param pageno number of the block of data in the file + @param lock lock change + @param flush flush page if it is dirty + + @retval 0 deleted or was not present at all + @retval 1 error + + @note lock can be only PAGECACHE_LOCK_LEFT_WRITELOCKED (page was + write locked before) or PAGECACHE_LOCK_WRITE (delete will write + lock page before delete) +*/ +static enum pagecache_page_pin lock_to_pin_one_phase[8]= +{ + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_UNLOCKED*/, + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_READLOCKED*/, + PAGECACHE_PIN_LEFT_PINNED /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/, + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ*/, + PAGECACHE_PIN /*PAGECACHE_LOCK_WRITE*/, + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ_UNLOCK*/, + PAGECACHE_UNPIN /*PAGECACHE_LOCK_WRITE_UNLOCK*/, + PAGECACHE_UNPIN /*PAGECACHE_LOCK_WRITE_TO_READ*/ +}; + +my_bool pagecache_delete(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + my_bool flush) +{ + my_bool error= 0; + enum pagecache_page_pin pin= lock_to_pin_one_phase[lock]; + DBUG_ENTER("pagecache_delete"); + DBUG_PRINT("enter", ("fd: %u page: %lu %s %s", + (uint) file->file, (ulong) pageno, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED); + DBUG_ASSERT(pin == PAGECACHE_PIN || + pin == PAGECACHE_PIN_LEFT_PINNED); +restart: + + DBUG_ASSERT(pageno < ((ULL(1)) << 40)); + if (pagecache->can_be_used) + { + /* Key cache is used */ + reg1 PAGECACHE_BLOCK_LINK *block; + PAGECACHE_HASH_LINK **unused_start, *page_link; + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (!pagecache->can_be_used) + goto end; + + 
inc_counter_for_resize_op(pagecache); + page_link= get_present_hash_link(pagecache, file, pageno, &unused_start); + if (!page_link) + { + DBUG_PRINT("info", ("There is no such page in the cache")); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_RETURN(0); + } + block= page_link->block; + if (block->status & (PCBLOCK_REASSIGNED | PCBLOCK_IN_SWITCH)) + { + DBUG_PRINT("info", ("Block 0x%0lx already is %s", + (ulong) block, + ((block->status & PCBLOCK_REASSIGNED) ? + "reassigned" : "in switch"))); + PCBLOCK_INFO(block); + page_link->requests--; + goto end; + } + /* See NOTE for pagecache_unlock about registering requests. */ + if (pin == PAGECACHE_PIN) + reg_requests(pagecache, block, 1); + DBUG_ASSERT(block != 0); + if (make_lock_and_pin(pagecache, block, lock, pin, FALSE)) + { + /* + We failed to writelock the block, cache is unlocked, and last write + lock is released, we will try to get the block again. + */ + if (pin == PAGECACHE_PIN) + unreg_request(pagecache, block, 1); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_PRINT("info", ("restarting...")); + goto restart; + } + + /* we can't delete with opened direct link for write */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0); + + error= pagecache_delete_internal(pagecache, block, page_link, flush); +end: + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + } + + DBUG_RETURN(error); +} + + +my_bool pagecache_delete_pages(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint page_count, + enum pagecache_page_lock lock, + my_bool flush) +{ + pgcache_page_no_t page_end; + DBUG_ENTER("pagecache_delete_pages"); + DBUG_ASSERT(page_count > 0); + + page_end= pageno + page_count; + do + { + if (pagecache_delete(pagecache, file, pageno, + lock, flush)) + DBUG_RETURN(1); + } while (++pageno != page_end); + DBUG_RETURN(0); +} + + +/** + @brief Writes a buffer into a cached file. 

  @param pagecache       pointer to a page cache data structure
  @param file            handler for the file to write data to
  @param pageno          number of the block of data in the file
  @param level           determines the weight of the data
  @param buff            buffer with the data
  @param type            type of the page
  @param lock            lock change
  @param pin             pin page
  @param write_mode      how to write page
  @param link            link to the page if we pin it
  @param first_REDO_LSN_for_page the lsn to set rec_lsn
  @param offset          offset in the page
  @param size            size of data
  @param validator       read page validator
  @param validator_data  the validator data

  @retval 0 if a success.
  @retval 1 Error.
*/

/*
  Lock transitions used by pagecache_write_part(), indexed by the caller's
  requested lock (enum pagecache_page_lock). Each entry is
  {need_lock_change, new_lock, unlock_lock}: new_lock is taken before the
  data is copied and, when need_lock_change is set, unlock_lock is applied
  afterwards.
*/
static struct rw_lock_change write_lock_change_table[]=
{
  {1,
   PAGECACHE_LOCK_WRITE,
   PAGECACHE_LOCK_WRITE_UNLOCK} /*PAGECACHE_LOCK_LEFT_UNLOCKED*/,
  {0, /*unsupported (we can't write having the block read locked) */
   PAGECACHE_LOCK_LEFT_UNLOCKED,
   PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_LEFT_READLOCKED*/,
  {0, PAGECACHE_LOCK_LEFT_WRITELOCKED, 0} /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/,
  {1,
   PAGECACHE_LOCK_WRITE,
   PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_READ*/,
  {0, PAGECACHE_LOCK_WRITE, 0} /*PAGECACHE_LOCK_WRITE*/,
  {0, /*unsupported (we can't write having the block read locked) */
   PAGECACHE_LOCK_LEFT_UNLOCKED,
   PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_READ_UNLOCK*/,
  {1,
   PAGECACHE_LOCK_LEFT_WRITELOCKED,
   PAGECACHE_LOCK_WRITE_UNLOCK } /*PAGECACHE_LOCK_WRITE_UNLOCK*/,
  {1,
   PAGECACHE_LOCK_LEFT_WRITELOCKED,
   PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_WRITE_TO_READ*/
};


/*
  Pin transitions used by pagecache_write_part() when a lock change is
  needed, indexed by the caller's requested pin (enum pagecache_page_pin).
  Each entry is {new_pin, unlock_pin}: new_pin is applied together with
  new_lock, unlock_pin together with unlock_lock.
*/
static struct rw_pin_change write_pin_change_table[]=
{
  {PAGECACHE_PIN_LEFT_PINNED,
   PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN_LEFT_PINNED*/,
  {PAGECACHE_PIN,
   PAGECACHE_UNPIN} /*PAGECACHE_PIN_LEFT_UNPINNED*/,
  {PAGECACHE_PIN,
   PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN*/,
  {PAGECACHE_PIN_LEFT_PINNED,
   PAGECACHE_UNPIN} /*PAGECACHE_UNPIN*/
};


/**
  @note 'buff', if not NULL, must
be long-aligned.
*/

my_bool pagecache_write_part(PAGECACHE *pagecache,
                             PAGECACHE_FILE *file,
                             pgcache_page_no_t pageno,
                             uint level,
                             uchar *buff,
                             enum pagecache_page_type type,
                             enum pagecache_page_lock lock,
                             enum pagecache_page_pin pin,
                             enum pagecache_write_mode write_mode,
                             PAGECACHE_BLOCK_LINK **page_link,
                             LSN first_REDO_LSN_for_page,
                             uint offset, uint size)
{
  PAGECACHE_BLOCK_LINK *block= NULL;
  PAGECACHE_BLOCK_LINK *fake_link;
  my_bool error= 0;
  int need_lock_change= write_lock_change_table[lock].need_lock_change;
  my_bool reg_request;
#ifndef DBUG_OFF
  char llbuf[22];
  DBUG_ENTER("pagecache_write_part");
  DBUG_PRINT("enter", ("fd: %u page: %s level: %u type: %s lock: %s "
                       "pin: %s mode: %s offset: %u size %u",
                       (uint) file->file, ullstr(pageno, llbuf), level,
                       page_cache_page_type_str[type],
                       page_cache_page_lock_str[lock],
                       page_cache_page_pin_str[pin],
                       page_cache_page_write_mode_str[write_mode],
                       offset, size));
  DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE);
  DBUG_ASSERT(lock != PAGECACHE_LOCK_LEFT_READLOCKED);
  DBUG_ASSERT(lock != PAGECACHE_LOCK_READ_UNLOCK);
  DBUG_ASSERT(offset + size <= pagecache->block_size);
  DBUG_ASSERT(pageno < ((ULL(1)) << 40));
#endif

  /* Callers that do not want the block link get a throw-away slot */
  if (!page_link)
    page_link= &fake_link;
  *page_link= 0;

restart:

#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
  DBUG_EXECUTE("check_pagecache",
               test_key_cache(pagecache, "start of key_cache_write", 1););
#endif

  if (pagecache->can_be_used)
  {
    /* Key cache is used */
    int page_st;
    my_bool need_page_ready_signal= FALSE;

    pagecache_pthread_mutex_lock(&pagecache->cache_lock);
    /* Re-check under the mutex: the flag was first tested without it */
    if (!pagecache->can_be_used)
    {
      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
      goto no_key_cache;
    }

    inc_counter_for_resize_op(pagecache);
    pagecache->global_cache_w_requests++;
    /* See NOTE for pagecache_unlock about registering requests. */
    reg_request= ((pin == PAGECACHE_PIN_LEFT_UNPINNED) ||
                  (pin == PAGECACHE_PIN));
    block= find_block(pagecache, file, pageno, level,
                      TRUE,
                      reg_request, &page_st);
    if (!block)
    {
      DBUG_ASSERT(write_mode != PAGECACHE_WRITE_DONE);
      /* It happens only for requests submitted during resize operation */
      dec_counter_for_resize_op(pagecache);
      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
      /* Write to the disk key cache is in resize at the moment*/
      goto no_key_cache;
    }
    DBUG_PRINT("info", ("page status: %d", page_st));
    /*
      A partial write into a page that is not in memory yet needs the old
      content first; a page another thread is already reading must be
      waited for in any case.
    */
    if (!(block->status & PCBLOCK_ERROR) &&
        ((page_st == PAGE_TO_BE_READ &&
          (offset || size < pagecache->block_size)) ||
         (page_st == PAGE_WAIT_TO_BE_READ)))
    {
      /* The requested page is to be read into the block buffer */
      read_block(pagecache, block,
                 (my_bool)(page_st == PAGE_TO_BE_READ));
      DBUG_PRINT("info", ("read is done"));
    }
    else if (page_st == PAGE_TO_BE_READ)
    {
      /* No read was done: waiters are released after the data is copied */
      need_page_ready_signal= TRUE;
    }

    DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE ||
                block->type == PAGECACHE_READ_UNKNOWN_PAGE ||
                block->type == type ||
                /* this is for when going to non-trans to trans */
                (block->type == PAGECACHE_PLAIN_PAGE &&
                 type == PAGECACHE_LSN_PAGE));
    block->type= type;
    /* we write to the page so it has no sense to keep the flag */
    block->status&= ~PCBLOCK_DIRECT_W;
    DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx",
                        (ulong) block));

    if (make_lock_and_pin(pagecache, block,
                          write_lock_change_table[lock].new_lock,
                          (need_lock_change ?
                           write_pin_change_table[pin].new_pin :
                           pin), FALSE))
    {
      /*
        We failed to writelock the block, cache is unlocked, and last write
        lock is released, we will try to get the block again.

        NOTE(review): the restart runs inc_counter_for_resize_op() and
        global_cache_w_requests++ a second time; whether the resize counter
        has to be decremented here depends on make_lock_and_pin(), which is
        not visible from this function - confirm.
      */
      if (reg_request)
        unreg_request(pagecache, block, 1);
      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
      DBUG_PRINT("info", ("restarting..."));
      goto restart;
    }

    if (write_mode == PAGECACHE_WRITE_DONE)
    {
      if (block->status & PCBLOCK_ERROR)
      {
        my_debug_put_break_here();
        DBUG_PRINT("warning", ("Writing on page with error"));
      }
      else
      {
        /* Copy data from buff */
        if (!(size & 511))
          bmove512(block->buffer + offset, buff, size);
        else
          memcpy(block->buffer + offset, buff, size);
        /* Assignment, not OR: every other status bit is cleared here */
        block->status= PCBLOCK_READ;
        /*
          The read_callback can change the page content (removing page
          protection) so it have to be called
        */
        DBUG_PRINT("info", ("read_callback: 0x%lx data: 0x%lx",
                            (ulong) block->hash_link->file.read_callback,
                            (ulong) block->hash_link->file.callback_data));
        if ((*block->hash_link->file.read_callback)(block->buffer,
                                                    block->hash_link->pageno,
                                                    block->hash_link->
                                                    file.callback_data))
        {
          DBUG_PRINT("error", ("read callback problem"));
          block->status|= PCBLOCK_ERROR;
          block->error= (int16) my_errno;
          my_debug_put_break_here();
        }
        KEYCACHE_DBUG_PRINT("key_cache_insert",
                            ("Page injection"));
#ifdef THREAD
        /* Signal that all pending requests for this now can be processed. */
        if (block->wqueue[COND_FOR_REQUESTED].last_thread)
          wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]);
#endif
      }
    }
    else
    {
      /* The page becomes dirty: put it on the changed list once */
      if (! (block->status & PCBLOCK_CHANGED))
        link_to_changed_list(pagecache, block);

      if (!(size & 511))
        bmove512(block->buffer + offset, buff, size);
      else
        memcpy(block->buffer + offset, buff, size);
      block->status|= PCBLOCK_READ;
      /* Page is correct again if we made a full write in it */
      if (size == pagecache->block_size)
        block->status&= ~PCBLOCK_ERROR;
    }

#ifdef THREAD
    if (need_page_ready_signal &&
        block->wqueue[COND_FOR_REQUESTED].last_thread)
      wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]);
#endif

    if (first_REDO_LSN_for_page)
    {
      /* single write action of the last write action */
      DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
                  lock == PAGECACHE_LOCK_LEFT_UNLOCKED);
      DBUG_ASSERT(pin == PAGECACHE_UNPIN ||
                  pin == PAGECACHE_PIN_LEFT_UNPINNED);
      pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page);
    }

    if (need_lock_change)
    {
      /*
        We don't set rec_lsn of the block; this is ok as for the
        Maria-block-record's pages, we always keep pages pinned here.
      */
      if (make_lock_and_pin(pagecache, block,
                            write_lock_change_table[lock].unlock_lock,
                            write_pin_change_table[pin].unlock_pin, FALSE))
        DBUG_ASSERT(0);
    }

    /* Unregister the request */
    DBUG_ASSERT(block->hash_link->requests > 0);
    block->hash_link->requests--;
    /* See NOTE for pagecache_unlock about registering requests. */
    if (pin == PAGECACHE_PIN_LEFT_UNPINNED || pin == PAGECACHE_UNPIN)
      unreg_request(pagecache, block, 1);
    else
      *page_link= block;                /* page stays pinned: hand out link */

    if (block->status & PCBLOCK_ERROR)
    {
      error= 1;
      my_debug_put_break_here();
    }

    dec_counter_for_resize_op(pagecache);

    pagecache_pthread_mutex_unlock(&pagecache->cache_lock);

    goto end;
  }

no_key_cache:
  /*
    We can't by pass the normal page cache operations because need
    whole page for calling callbacks & so on.
    This branch should not be used for now (but it is fixed as it
    should be just to avoid confusing)
  */
  DBUG_ASSERT(0);
  /* Key cache is not used */
  if (write_mode == PAGECACHE_WRITE_DELAY)
  {
    /* We can't use mutex here as the key cache may not be initialized */
    pagecache->global_cache_w_requests++;
    pagecache->global_cache_write++;
    if (offset != 0 || size != pagecache->block_size)
    {
      /* Partial write: read the page, patch it in memory, write it back */
      uchar *page_buffer= (uchar *) alloca(pagecache->block_size);

      pagecache->global_cache_read++;
      if ((error= (pagecache_fread(pagecache, file,
                                   page_buffer,
                                   pageno,
                                   pagecache->readwrite_flags) != 0)))
        goto end;
      if ((file->read_callback)(page_buffer, pageno, file->callback_data))
      {
        DBUG_PRINT("error", ("read callback problem"));
        error= 1;
        goto end;
      }
      memcpy((char *)page_buffer + offset, buff, size);
      buff= page_buffer;
    }
    if (pagecache_fwrite(pagecache, file, buff, pageno, type,
                         pagecache->readwrite_flags))
      error= 1;
  }

end:
#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
  DBUG_EXECUTE("exec",
               test_key_cache(pagecache, "end of key_cache_write", 1););
#endif
  if (block)
    PCBLOCK_INFO(block);
  else
    DBUG_PRINT("info", ("No block"));
  DBUG_RETURN(error);
}


/*
  Free block: remove reference to it from hash table,
  remove it from the chain file of dirty/clean blocks
  and add it to the free list.

  The block must no longer be locked or pinned (asserted below). Threads
  queued on block->wqueue[COND_FOR_SAVED] are released at the end.
*/

static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block)
{
  KEYCACHE_THREAD_TRACE("free block");
  KEYCACHE_DBUG_PRINT("free_block",
                      ("block: %u hash_link 0x%lx",
                       PCBLOCK_NUMBER(pagecache, block),
                       (long) block->hash_link));
  if (block->hash_link)
  {
    /*
      While waiting for readers to finish, new readers might request the
      block. But since we set block->status|= PCBLOCK_REASSIGNED, they
      will wait on block->wqueue[COND_FOR_SAVED]. They must be signalled
      later.
    */
    block->status|= PCBLOCK_REASSIGNED;
    wait_for_readers(pagecache, block);
    unlink_hash(pagecache, block->hash_link);
  }

  unlink_changed(block);
  DBUG_ASSERT(block->wlocks == 0);
  DBUG_ASSERT(block->rlocks == 0);
  DBUG_ASSERT(block->rlocks_queue == 0);
  DBUG_ASSERT(block->pins == 0);
  block->status= 0;
#ifndef DBUG_OFF
  block->type= PAGECACHE_EMPTY_PAGE;
#endif
  block->rec_lsn= LSN_MAX;
  KEYCACHE_THREAD_TRACE("free block");
  KEYCACHE_DBUG_PRINT("free_block",
                      ("block is freed"));
  unreg_request(pagecache, block, 0);
  block->hash_link= NULL;

  /* Remove the free block from the LRU ring. */
  unlink_block(pagecache, block);
  if (block->temperature == PCBLOCK_WARM)
    pagecache->warm_blocks--;
  block->temperature= PCBLOCK_COLD;
  /* Insert the free block in the free list. */
  block->next_used= pagecache->free_block_list;
  pagecache->free_block_list= block;
  /* Keep track of the number of currently unused blocks. */
  pagecache->blocks_unused++;

#ifdef THREAD
  /* All pending requests for this page must be resubmitted. */
  if (block->wqueue[COND_FOR_SAVED].last_thread)
    wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]);
#endif
}


/*
  qsort() comparator for an array of block pointers: orders blocks by the
  page number stored in their hash link, ascending.
*/
static int cmp_sec_link(PAGECACHE_BLOCK_LINK **a, PAGECACHE_BLOCK_LINK **b)
{
  return (((*a)->hash_link->pageno < (*b)->hash_link->pageno) ? -1 :
          ((*a)->hash_link->pageno > (*b)->hash_link->pageno) ? 1 : 0);
}


/**
  @brief Flush a portion of changed blocks to disk, free used blocks
  if requested

  @param pagecache       This page cache reference.
  @param file            File which should be flushed
  @param cache           Beginning of array of the block.
  @param end             Reference to the block after last in the array.
  @param flush_type      Type of the flush.
  @param first_errno     Where to store first errno of the flush.


  @return Operation status
  @retval PCFLUSH_OK OK
  @retval PCFLUSH_ERROR There was errors during the flush process.
  @retval PCFLUSH_PINNED Pinned blocks was met and skipped.
  @retval PCFLUSH_PINNED_AND_ERROR PCFLUSH_ERROR and PCFLUSH_PINNED.
*/

static int flush_cached_blocks(PAGECACHE *pagecache,
                               PAGECACHE_FILE *file,
                               PAGECACHE_BLOCK_LINK **cache,
                               PAGECACHE_BLOCK_LINK **end,
                               enum flush_type type,
                               int *first_errno)
{
  int rc= PCFLUSH_OK;
  my_bool error;
  uint count= (uint) (end-cache);
  DBUG_ENTER("flush_cached_blocks");
  *first_errno= 0;

  /* Don't lock the cache during the flush */
  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
  /*
     As all blocks referred in 'cache' are marked by PCBLOCK_IN_FLUSH
     we are guaranteed that no thread will change them
  */
  /* Sort by page number so that the writes below go in file order */
  qsort((uchar*) cache, count, sizeof(*cache), (qsort_cmp) cmp_sec_link);

  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
  for (; cache != end; cache++)
  {
    PAGECACHE_BLOCK_LINK *block= *cache;

    /*
      In the case of non_transactional tables we want to flush also
      block pinned with reads. This is because we may have other
      threads reading the block during flush, as non transactional
      tables can have many readers while the one writer is doing the
      flush.
      We don't want to do flush pinned blocks during checkpoint.
      We detect the checkpoint case by checking if type is LAZY.
    */
    if ((type == FLUSH_KEEP_LAZY && block->pins) || block->wlocks)
    {
      KEYCACHE_DBUG_PRINT("flush_cached_blocks",
                          ("block: %u (0x%lx) pinned",
                           PCBLOCK_NUMBER(pagecache, block), (ulong)block));
      DBUG_PRINT("info", ("block: %u (0x%lx) pinned",
                          PCBLOCK_NUMBER(pagecache, block), (ulong)block));
      PCBLOCK_INFO(block);
      /* undo the mark put by flush_pagecache_blocks_int(): */
      block->status&= ~PCBLOCK_IN_FLUSH;
      rc|= PCFLUSH_PINNED;
      DBUG_PRINT("warning", ("Page pinned"));
      unreg_request(pagecache, block, 1);
      if (!*first_errno)
        *first_errno= HA_ERR_INTERNAL_ERROR;
      continue;
    }
    /* Read-lock and pin the block for the time the cache mutex is released */
    if (make_lock_and_pin(pagecache, block,
                          PAGECACHE_LOCK_READ, PAGECACHE_PIN, FALSE))
      DBUG_ASSERT(0);

    KEYCACHE_DBUG_PRINT("flush_cached_blocks",
                        ("block: %u (0x%lx) to be flushed",
                         PCBLOCK_NUMBER(pagecache, block), (ulong)block));
    DBUG_PRINT("info", ("block: %u (0x%lx) to be flushed",
                        PCBLOCK_NUMBER(pagecache, block), (ulong)block));
    PCBLOCK_INFO(block);
    pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
    DBUG_PRINT("info", ("block: %u (0x%lx) pins: %u",
                        PCBLOCK_NUMBER(pagecache, block), (ulong)block,
                        block->pins));
    /**
       @todo IO If page is contiguous with next page to flush, group flushes
       in one single my_pwrite().
    */
    /**
      It is important to use block->hash_link->file below and not 'file', as
      the first one is right and the second may have different out-of-date
      content (see StaleFilePointersInFlush in ma_checkpoint.c).
      @todo change argument of functions to be File.
    */
    error= pagecache_fwrite(pagecache, &block->hash_link->file,
                            block->buffer,
                            block->hash_link->pageno,
                            block->type,
                            pagecache->readwrite_flags);
    pagecache_pthread_mutex_lock(&pagecache->cache_lock);

    if (make_lock_and_pin(pagecache, block,
                          PAGECACHE_LOCK_READ_UNLOCK,
                          PAGECACHE_UNPIN, FALSE))
      DBUG_ASSERT(0);

    pagecache->global_cache_write++;
    if (error)
    {
      block->status|= PCBLOCK_ERROR;
      block->error=   (int16) my_errno;
      my_debug_put_break_here();
      /* Only the first failure is reported; -1 stands in for a zero errno */
      if (!*first_errno)
        *first_errno= my_errno ? my_errno : -1;
      rc|= PCFLUSH_ERROR;
    }
#ifdef THREAD
    /*
      Let to proceed for possible waiting requests to write to the block page.
      It might happen only during an operation to resize the key cache.
    */
    if (block->wqueue[COND_FOR_SAVED].last_thread)
      wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]);
#endif
    /* type will never be FLUSH_IGNORE_CHANGED here */
    if (! (type == FLUSH_KEEP || type == FLUSH_KEEP_LAZY ||
           type == FLUSH_FORCE_WRITE))
    {
      /* Releasing flush: the written block leaves the cache */
      pagecache->blocks_changed--;
      pagecache->global_blocks_changed--;
      free_block(pagecache, block);
    }
    else
    {
      /* Keeping flush: the block stays cached and goes to the file list */
      block->status&= ~PCBLOCK_IN_FLUSH;
      link_to_file_list(pagecache, block, file, 1);
      unreg_request(pagecache, block, 1);
    }
  }
  DBUG_RETURN(rc);
}


/**
   @brief flush all blocks for a file to disk but don't do any mutex locks

   @param pagecache       pointer to a pagecache data structure
   @param file            handler for the file to flush to
   @param flush_type      type of the flush
   @param filter          optional function which tells what blocks to flush;
                          can be non-NULL only if FLUSH_KEEP, FLUSH_KEEP_LAZY
                          or FLUSH_FORCE_WRITE.
   @param filter_arg      an argument to pass to 'filter'. Information about
                          the block will be passed too.

   @note
   Flushes all blocks having the same OS file descriptor as 'file->file', so
   can flush blocks having '*block->hash_link->file' != '*file'.

   @note
   This function doesn't do any mutex locks because it needs to be called
   both from flush_pagecache_blocks and flush_all_key_blocks (the later one
   does the mutex lock in the resize_pagecache() function).

   @note
   This function can cause problems if two threads call it
   concurrently on the same file (look for "PageCacheFlushConcurrencyBugs"
   in ma_checkpoint.c); to avoid them, it has internal logic to serialize in
   this situation.

   @return Operation status
   @retval PCFLUSH_OK OK
   @retval PCFLUSH_ERROR There was errors during the flush process.
   @retval PCFLUSH_PINNED Pinned blocks was met and skipped.
   @retval PCFLUSH_PINNED_AND_ERROR PCFLUSH_ERROR and PCFLUSH_PINNED.
*/

static int flush_pagecache_blocks_int(PAGECACHE *pagecache,
                                      PAGECACHE_FILE *file,
                                      enum flush_type type,
                                      PAGECACHE_FLUSH_FILTER filter,
                                      void *filter_arg)
{
  PAGECACHE_BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache;
  int last_errno= 0;
  int rc= PCFLUSH_OK;
  DBUG_ENTER("flush_pagecache_blocks_int");
  DBUG_PRINT("enter",
             ("fd: %d blocks_used: %lu blocks_changed: %lu type: %d",
              file->file, pagecache->blocks_used, pagecache->blocks_changed,
              type));

#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
  DBUG_EXECUTE("check_pagecache",
               test_key_cache(pagecache,
                              "start of flush_pagecache_blocks", 0););
#endif

  /* Start with the on-stack array; a heap array may replace it below */
  cache= cache_buff;
  if (pagecache->disk_blocks > 0 &&
      (!my_disable_flush_pagecache_blocks ||
       (type != FLUSH_KEEP && type != FLUSH_KEEP_LAZY)))
  {
    /*
      Key cache exists. If my_disable_flush_pagecache_blocks is true it
      disables the operation but only FLUSH_KEEP[_LAZY]: other flushes still
      need to be allowed: FLUSH_RELEASE has to free blocks, and
      FLUSH_FORCE_WRITE is to overrule my_disable_flush_pagecache_blocks.
    */
    int error= 0;
    uint count= 0;
    PAGECACHE_BLOCK_LINK **pos, **end;
    PAGECACHE_BLOCK_LINK *first_in_switch= NULL;
    PAGECACHE_BLOCK_LINK *block, *next;
#if defined(PAGECACHE_DEBUG)
    uint cnt= 0;
#endif

#ifdef THREAD
    struct st_file_in_flush us_flusher, *other_flusher;
    us_flusher.file= file->file;
    us_flusher.flush_queue.last_thread= NULL;
    us_flusher.first_in_switch= FALSE;
    while ((other_flusher= (struct st_file_in_flush *)
            hash_search(&pagecache->files_in_flush, (uchar *)&file->file,
                        sizeof(file->file))))
    {
      /*
        File is in flush already: wait, unless FLUSH_KEEP_LAZY. "Flusher"
        means "who can mark PCBLOCK_IN_FLUSH", i.e. caller of
        flush_pagecache_blocks_int().
      */
      struct st_my_thread_var *thread;
      if (type == FLUSH_KEEP_LAZY)
      {
        DBUG_PRINT("info",("FLUSH_KEEP_LAZY skips"));
        DBUG_RETURN(0);
      }
      thread= my_thread_var;
      wqueue_add_to_queue(&other_flusher->flush_queue, thread);
      do
      {
        KEYCACHE_DBUG_PRINT("flush_pagecache_blocks_int: wait1",
                            ("suspend thread %ld", thread->id));
        pagecache_pthread_cond_wait(&thread->suspend,
                                    &pagecache->cache_lock);
      }
      while (thread->next);
    }
    /* we are the only flusher of this file now */
    while (my_hash_insert(&pagecache->files_in_flush, (uchar *)&us_flusher))
    {
      /*
        Out of memory, wait for flushers to empty the hash and retry; should
        rarely happen. Other threads are flushing the file; when done, they
        are going to remove themselves from the hash, and thus memory will
        appear again. However, this memory may be stolen by yet another thread
        (for a purpose unrelated to page cache), before we retry
        hash_insert(). So the loop may run for long. Only if the thread was
        killed do we abort the loop, returning 1 (error) which can cause the
        table to be marked as corrupted (cf maria_chk_size(), maria_close())
        and thus require a table check.
      */
      DBUG_ASSERT(0);
      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
      /*
        NOTE(review): this return leaves cache_lock released, while
        flush_pagecache_blocks_with_filter() unlocks it again after the
        call - confirm.
      */
      if (my_thread_var->abort)
        DBUG_RETURN(1);		/* End if aborted by user */
      sleep(10);
      pagecache_pthread_mutex_lock(&pagecache->cache_lock);
    }
#endif

    if (type != FLUSH_IGNORE_CHANGED)
    {
      /*
        Count how many key blocks we have to cache to be able
        to flush all dirty pages with minimum seek moves.
      */
      for (block= pagecache->changed_blocks[FILE_HASH(*file)] ;
           block;
           block= block->next_changed)
      {
        if (block->hash_link->file.file == file->file)
        {
          count++;
          KEYCACHE_DBUG_ASSERT(count<= pagecache->blocks_used);
        }
      }
      /* Allocate a new buffer only if its bigger than the one we have */
      if (count > FLUSH_CACHE &&
          !(cache=
            (PAGECACHE_BLOCK_LINK**)
            my_malloc(sizeof(PAGECACHE_BLOCK_LINK*)*count, MYF(0))))
      {
        /* Allocation failed: fall back to flushing in FLUSH_CACHE batches */
        cache= cache_buff;
        count= FLUSH_CACHE;
      }
    }

    /* Retrieve the blocks and write them to a buffer to be flushed */
restart:
    end= (pos= cache)+count;
    for (block= pagecache->changed_blocks[FILE_HASH(*file)] ;
         block;
         block= next)
    {
#if defined(PAGECACHE_DEBUG)
      cnt++;
      KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
#endif
      /* Saved first: the block may be unlinked or freed further down */
      next= block->next_changed;
      if (block->hash_link->file.file != file->file)
        continue;
      if (filter != NULL)
      {
        int filter_res= (*filter)(block->type, block->hash_link->pageno,
                                  block->rec_lsn, filter_arg);
        DBUG_PRINT("info",("filter returned %d", filter_res));
        if (filter_res == FLUSH_FILTER_SKIP_TRY_NEXT)
          continue;
        if (filter_res == FLUSH_FILTER_SKIP_ALL)
          break;
        DBUG_ASSERT(filter_res == FLUSH_FILTER_OK);
      }
      {
        /*
          Mark the block with BLOCK_IN_FLUSH in order not to let
          other threads to use it for new pages and interfere with
          our sequence of flushing dirty file pages
        */
        block->status|= PCBLOCK_IN_FLUSH;

        if (! (block->status & PCBLOCK_IN_SWITCH))
        {
          /*
            We care only for the blocks for which flushing was not
            initiated by other threads as a result of page swapping
          */
          reg_requests(pagecache, block, 1);
          if (type != FLUSH_IGNORE_CHANGED)
          {
            /* It's not a temporary file */
            if (pos == end)
            {
              /*
                This happens only if there is not enough
                memory for the big block
              */
              if ((rc|= flush_cached_blocks(pagecache, file, cache,
                                            end, type, &error)) &
                  (PCFLUSH_ERROR | PCFLUSH_PINNED))
                last_errno=error;
              DBUG_PRINT("info", ("restarting..."));
              /*
                Restart the scan as some other thread might have changed
                the changed blocks chain: the blocks that were in switch
                state before the flush started have to be excluded
              */
              goto restart;
            }
            *pos++= block;
          }
          else
          {
            /* It's a temporary file */
            pagecache->blocks_changed--;
            pagecache->global_blocks_changed--;
            free_block(pagecache, block);
          }
        }
        else if (type != FLUSH_KEEP_LAZY)
        {
          /*
            Link the block into a list of blocks 'in switch', and then we will
            wait for this list to be empty, which means they have been flushed
          */
          unlink_changed(block);
          link_changed(block, &first_in_switch);
          /*
            NOTE(review): us_flusher is declared under #ifdef THREAD only
            but is used here unconditionally - confirm non-THREAD builds.
          */
          us_flusher.first_in_switch= TRUE;
        }
      }
    }
    /* Flush whatever is left in the (last) batch */
    if (pos != cache)
    {
      if ((rc|= flush_cached_blocks(pagecache, file, cache, pos, type,
                                    &error)) &
          (PCFLUSH_ERROR | PCFLUSH_PINNED))
        last_errno= error;
    }
    /* Wait until list of blocks in switch is empty */
    while (first_in_switch)
    {
#if defined(PAGECACHE_DEBUG)
      cnt= 0;
#endif
      block= first_in_switch;
      {
#ifdef THREAD
        struct st_my_thread_var *thread= my_thread_var;
        wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread);
        do
        {
          KEYCACHE_DBUG_PRINT("flush_pagecache_blocks_int: wait2",
                              ("suspend thread %ld", thread->id));
          pagecache_pthread_cond_wait(&thread->suspend,
                                      &pagecache->cache_lock);
        }
        while (thread->next);
#else
        KEYCACHE_DBUG_ASSERT(0);
        /* No parallel requests in single-threaded case */
#endif
      }
#if defined(PAGECACHE_DEBUG)
      cnt++;
      KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
#endif
    }
    us_flusher.first_in_switch= FALSE;
    /* The following happens very seldom */
    if (! (type == FLUSH_KEEP || type == FLUSH_KEEP_LAZY ||
           type == FLUSH_FORCE_WRITE))
    {
      /*
        this code would free all blocks while filter maybe handled only a
        few, that is not possible.
      */
      DBUG_ASSERT(filter == NULL);
#if defined(PAGECACHE_DEBUG)
      cnt=0;
#endif
      for (block= pagecache->file_blocks[FILE_HASH(*file)] ;
           block;
           block= next)
      {
#if defined(PAGECACHE_DEBUG)
        cnt++;
        KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
#endif
        next= block->next_changed;
        if (block->hash_link->file.file == file->file &&
            (! (block->status & PCBLOCK_CHANGED)
             || type == FLUSH_IGNORE_CHANGED))
        {
          reg_requests(pagecache, block, 1);
          free_block(pagecache, block);
        }
      }
    }
#ifdef THREAD
    /* wake up others waiting to flush this file */
    hash_delete(&pagecache->files_in_flush, (uchar *)&us_flusher);
    if (us_flusher.flush_queue.last_thread)
      wqueue_release_queue(&us_flusher.flush_queue);
#endif
  }

#ifndef DBUG_OFF
  DBUG_EXECUTE("check_pagecache",
               test_key_cache(pagecache, "end of flush_pagecache_blocks", 0););
#endif
  /* Release the heap batch array if one was allocated */
  if (cache != cache_buff)
    my_free(cache, MYF(0));
  if (rc != 0)
  {
    if (last_errno)
      my_errno= last_errno;                /* Return first error */
    DBUG_PRINT("error", ("Got error: %d", my_errno));
  }
  DBUG_RETURN(rc);
}


/**
   @brief flush all blocks for a file to disk

   @param pagecache       pointer to a pagecache data structure
   @param file            handler for the file to flush to
   @param flush_type      type of the flush
   @param filter          optional function which tells what blocks to flush;
                          can be non-NULL only if FLUSH_KEEP, FLUSH_KEEP_LAZY
                          or FLUSH_FORCE_WRITE.
   @param filter_arg      an argument to pass to 'filter'. Information about
                          the block will be passed too.
+ + @return Operation status + @retval PCFLUSH_OK OK + @retval PCFLUSH_ERROR There was errors during the flush process. + @retval PCFLUSH_PINNED Pinned blocks was met and skipped. + @retval PCFLUSH_PINNED_AND_ERROR PCFLUSH_ERROR and PCFLUSH_PINNED. +*/ + +int flush_pagecache_blocks_with_filter(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + enum flush_type type, + PAGECACHE_FLUSH_FILTER filter, + void *filter_arg) +{ + int res; + DBUG_ENTER("flush_pagecache_blocks_with_filter"); + DBUG_PRINT("enter", ("pagecache: 0x%lx", (long) pagecache)); + + if (pagecache->disk_blocks <= 0) + DBUG_RETURN(0); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + inc_counter_for_resize_op(pagecache); + res= flush_pagecache_blocks_int(pagecache, file, type, filter, filter_arg); + dec_counter_for_resize_op(pagecache); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_RETURN(res); +} + + +/* + Reset the counters of a key cache. + + SYNOPSIS + reset_pagecache_counters() + name the name of a key cache + pagecache pointer to the pagecache to be reset + + DESCRIPTION + This procedure is used to reset the counters of all currently used key + caches, both the default one and the named ones. 
+ + RETURN + 0 on success (always because it can't fail) +*/ + +int reset_pagecache_counters(const char *name __attribute__((unused)), + PAGECACHE *pagecache) +{ + DBUG_ENTER("reset_pagecache_counters"); + if (!pagecache->inited) + { + DBUG_PRINT("info", ("Key cache %s not initialized.", name)); + DBUG_RETURN(0); + } + DBUG_PRINT("info", ("Resetting counters for key cache %s.", name)); + + pagecache->global_blocks_changed= 0; /* Key_blocks_not_flushed */ + pagecache->global_cache_r_requests= 0; /* Key_read_requests */ + pagecache->global_cache_read= 0; /* Key_reads */ + pagecache->global_cache_w_requests= 0; /* Key_write_requests */ + pagecache->global_cache_write= 0; /* Key_writes */ + DBUG_RETURN(0); +} + + +/** + @brief Allocates a buffer and stores in it some info about all dirty pages + + Does the allocation because the caller cannot know the size itself. + Memory freeing is to be done by the caller (if the "str" member of the + LEX_STRING is not NULL). + Ignores all pages of another type than PAGECACHE_LSN_PAGE, because they + are not interesting for a checkpoint record. + The caller has the intention of doing checkpoints. + + @param pagecache pointer to the page cache + @param[out] str pointer to where the allocated buffer, and + its size, will be put + @param[out] min_rec_lsn pointer to where the minimum rec_lsn of all + relevant dirty pages will be put + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache, + LEX_STRING *str, + LSN *min_rec_lsn) +{ + my_bool error= 0; + ulong stored_list_size= 0; + uint file_hash; + char *ptr; + LSN minimum_rec_lsn= LSN_MAX; + DBUG_ENTER("pagecache_collect_changed_blocks_with_LSN"); + + DBUG_ASSERT(NULL == str->str); + /* + We lock the entire cache but will be quick, just reading/writing a few MBs + of memory at most. 
+ */ + pagecache_pthread_mutex_lock(&pagecache->cache_lock); +#ifdef THREAD + for (;;) + { + struct st_file_in_flush *other_flusher; + for (file_hash= 0; + (other_flusher= (struct st_file_in_flush *) + hash_element(&pagecache->files_in_flush, file_hash)) != NULL && + !other_flusher->first_in_switch; + file_hash++) + {} + if (other_flusher == NULL) + break; + /* + other_flusher.first_in_switch is true: some thread is flushing a file + and has removed dirty blocks from changed_blocks[] while they were still + dirty (they were being evicted (=>flushed) by yet another thread, which + may not have flushed the block yet so it may still be dirty). + If Checkpoint proceeds now, it will not see the page. If there is a + crash right after writing the checkpoint record, before the page is + flushed, at recovery the page will be wrongly ignored because it won't + be in the dirty pages list in the checkpoint record. So wait. + */ + { + struct st_my_thread_var *thread= my_thread_var; + wqueue_add_to_queue(&other_flusher->flush_queue, thread); + do + { + KEYCACHE_DBUG_PRINT("pagecache_collect_changed_blocks_with_lsn: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); + } + } +#endif + + /* Count how many dirty pages are interesting */ + for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[file_hash] ; + block; + block= block->next_changed) + { + /* + Q: is there something subtle with block->hash_link: can it be NULL? + does it have to be == hash_link->block... ? + */ + DBUG_ASSERT(block->hash_link != NULL); + DBUG_ASSERT(block->status & PCBLOCK_CHANGED); + /* + Note that we don't store bitmap pages, or pages from non-transactional + (like temporary) tables. Don't checkpoint during Recovery which uses + PAGECACHE_PLAIN_PAGE. 
+ */ + if (block->type != PAGECACHE_LSN_PAGE) + continue; /* no need to store it */ + stored_list_size++; + } + } + + compile_time_assert(sizeof(pagecache->blocks) <= 8); + str->length= 8 + /* number of dirty pages */ + (2 + /* table id */ + 1 + /* data or index file */ + 5 + /* pageno */ + LSN_STORE_SIZE /* rec_lsn */ + ) * stored_list_size; + if (NULL == (str->str= my_malloc(str->length, MYF(MY_WME)))) + goto err; + ptr= str->str; + int8store(ptr, (ulonglong)stored_list_size); + ptr+= 8; + DBUG_PRINT("info", ("found %lu dirty pages", stored_list_size)); + if (stored_list_size == 0) + goto end; + for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[file_hash] ; + block; + block= block->next_changed) + { + uint16 table_id; + MARIA_SHARE *share; + if (block->type != PAGECACHE_LSN_PAGE) + continue; /* no need to store it in the checkpoint record */ + share= (MARIA_SHARE *)(block->hash_link->file.callback_data); + table_id= share->id; + int2store(ptr, table_id); + ptr+= 2; + ptr[0]= (share->kfile.file == block->hash_link->file.file); + ptr++; + DBUG_ASSERT(block->hash_link->pageno < ((ULL(1)) << 40)); + page_store(ptr, block->hash_link->pageno); + ptr+= PAGE_STORE_SIZE; + lsn_store(ptr, block->rec_lsn); + ptr+= LSN_STORE_SIZE; + if (block->rec_lsn != LSN_MAX) + { + DBUG_ASSERT(LSN_VALID(block->rec_lsn)); + if (cmp_translog_addr(block->rec_lsn, minimum_rec_lsn) < 0) + minimum_rec_lsn= block->rec_lsn; + } /* otherwise, some trn->rec_lsn should hold the correct info */ + } + } +end: + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + *min_rec_lsn= minimum_rec_lsn; + DBUG_RETURN(error); + +err: + error= 1; + goto end; +} + + +#ifndef DBUG_OFF + +/** + Verifies that a file has no dirty pages. 
+*/ + +void pagecache_file_no_dirty_page(PAGECACHE *pagecache, PAGECACHE_FILE *file) +{ + File fd= file->file; + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[FILE_HASH(*file)]; + block != NULL; + block= block->next_changed) + if (block->hash_link->file.file == fd) + { + DBUG_PRINT("info", ("pagecache_file_not_in error")); + PCBLOCK_INFO(block); + DBUG_ASSERT(0); + } +} + + +/* + Test if disk-cache is ok +*/ +static void test_key_cache(PAGECACHE *pagecache __attribute__((unused)), + const char *where __attribute__((unused)), + my_bool lock __attribute__((unused))) +{ + /* TODO */ +} +#endif + +uchar *pagecache_block_link_to_buffer(PAGECACHE_BLOCK_LINK *block) +{ + return block->buffer; +} + +#if defined(PAGECACHE_TIMEOUT) + +#define KEYCACHE_DUMP_FILE "pagecache_dump.txt" +#define MAX_QUEUE_LEN 100 + + +static void pagecache_dump(PAGECACHE *pagecache) +{ + FILE *pagecache_dump_file=fopen(KEYCACHE_DUMP_FILE, "w"); + struct st_my_thread_var *last; + struct st_my_thread_var *thread; + PAGECACHE_BLOCK_LINK *block; + PAGECACHE_HASH_LINK *hash_link; + PAGECACHE_PAGE *page; + uint i; + + fprintf(pagecache_dump_file, "thread:%u\n", thread->id); + + i=0; + thread=last=waiting_for_hash_link.last_thread; + fprintf(pagecache_dump_file, "queue of threads waiting for hash link\n"); + if (thread) + do + { + thread= thread->next; + page= (PAGECACHE_PAGE *) thread->opt_info; + fprintf(pagecache_dump_file, + "thread:%u, (file,pageno)=(%u,%lu)\n", + thread->id,(uint) page->file.file,(ulong) page->pageno); + if (++i == MAX_QUEUE_LEN) + break; + } + while (thread != last); + + i=0; + thread=last=waiting_for_block.last_thread; + fprintf(pagecache_dump_file, "queue of threads waiting for block\n"); + if (thread) + do + { + thread=thread->next; + hash_link= (PAGECACHE_HASH_LINK *) thread->opt_info; + fprintf(pagecache_dump_file, + "thread:%u hash_link:%u (file,pageno)=(%u,%lu)\n", + thread->id, (uint) PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link), + (uint) 
#if defined(PAGECACHE_TIMEOUT)

#define KEYCACHE_DUMP_FILE  "pagecache_dump.txt"
#define MAX_QUEUE_LEN  100


/*
  Write a textual snapshot of the page cache to KEYCACHE_DUMP_FILE.

  The dump lists the calling thread, the threads queued for a free hash
  link and for a free block, every used block together with its wait
  queues, and finally the LRU chain. At most MAX_QUEUE_LEN entries are
  printed per wait queue.

  If the dump file cannot be opened, nothing is written.

  NOTE(review): the wait queues are read without taking any lock here;
  presumably the caller holds pagecache->cache_lock -- confirm.
*/
static void pagecache_dump(PAGECACHE *pagecache)
{
  FILE *pagecache_dump_file= fopen(KEYCACHE_DUMP_FILE, "w");
  struct st_my_thread_var *last;
  struct st_my_thread_var *thread;
  PAGECACHE_BLOCK_LINK *block;
  PAGECACHE_HASH_LINK *hash_link;
  PAGECACHE_PAGE *page;
  uint i;
  uint queue_len;                 /* entries printed for the current queue */

  /* Without a dump file every fprintf() below would use a NULL stream */
  if (pagecache_dump_file == NULL)
    return;

  /* The header line identifies the thread that produces the dump */
  thread= my_thread_var;
  fprintf(pagecache_dump_file, "thread:%ld\n", (long) thread->id);

  queue_len= 0;
  thread= last= pagecache->waiting_for_hash_link.last_thread;
  fprintf(pagecache_dump_file, "queue of threads waiting for hash link\n");
  if (thread)
    do
    {
      thread= thread->next;
      page= (PAGECACHE_PAGE *) thread->opt_info;
      fprintf(pagecache_dump_file,
              "thread:%ld, (file,pageno)=(%u,%lu)\n",
              (long) thread->id, (uint) page->file.file,
              (ulong) page->pageno);
      if (++queue_len == MAX_QUEUE_LEN)
        break;
    }
    while (thread != last);

  queue_len= 0;
  thread= last= pagecache->waiting_for_block.last_thread;
  fprintf(pagecache_dump_file, "queue of threads waiting for block\n");
  if (thread)
    do
    {
      thread= thread->next;
      hash_link= (PAGECACHE_HASH_LINK *) thread->opt_info;
      fprintf(pagecache_dump_file,
              "thread:%ld hash_link:%u (file,pageno)=(%u,%lu)\n",
              (long) thread->id,
              (uint) PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link),
              (uint) hash_link->file.file, (ulong) hash_link->pageno);
      if (++queue_len == MAX_QUEUE_LEN)
        break;
    }
    while (thread != last);

  for (i= 0 ; i < pagecache->blocks_used ; i++)
  {
    int j;
    block= &pagecache->block_root[i];
    hash_link= block->hash_link;
    fprintf(pagecache_dump_file,
            "block:%u hash_link:%d status:%x #requests=%u "
            "waiting_for_readers:%d\n",
            i, (int) (hash_link ?
                      PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link) :
                      -1),
            block->status, block->requests, block->condvar ? 1 : 0);
    for (j= 0 ; j < COND_SIZE; j++)
    {
      WQUEUE *wqueue= &block->wqueue[j];
      thread= last= wqueue->last_thread;
      fprintf(pagecache_dump_file, "queue #%d\n", j);
      if (thread)
      {
        /*
          Count queue entries separately from 'i': reusing the block
          index as the counter would make the outer loop skip blocks.
        */
        queue_len= 0;
        do
        {
          thread= thread->next;
          fprintf(pagecache_dump_file,
                  "thread:%ld\n", (long) thread->id);
          if (++queue_len == MAX_QUEUE_LEN)
            break;
        }
        while (thread != last);
      }
    }
  }

  fprintf(pagecache_dump_file, "LRU chain:");
  block= pagecache->used_last;
  if (block)
  {
    do
    {
      block= block->next_used;
      fprintf(pagecache_dump_file,
              "block:%u, ", (uint) PCBLOCK_NUMBER(pagecache, block));
    }
    while (block != pagecache->used_last);
  }
  fprintf(pagecache_dump_file, "\n");

  fclose(pagecache_dump_file);
}

#endif /* defined(PAGECACHE_TIMEOUT) */
#if defined(PAGECACHE_TIMEOUT) && !defined(__WIN__)


/*
  Wait on a condition variable, but for at most PAGECACHE_TIMEOUT seconds.

  The deadline is the current time plus PAGECACHE_TIMEOUT. A timeout is
  treated as a fatal condition: with PAGECACHE_DEBUG the debug log is
  closed and the process aborts, otherwise a cache dump is requested and
  the final assertion fails.

  NOTE(review): pagecache_dump() is called here without an argument while
  its definition takes a PAGECACHE pointer; this function has no such
  pointer to pass -- needs an interface decision before
  PAGECACHE_TIMEOUT builds can compile.

  RETURN
    the result of pthread_cond_timedwait()
*/
static int pagecache_pthread_cond_wait(pthread_cond_t *cond,
                                      pthread_mutex_t *mutex)
{
  int wait_result;
  struct timeval start_time;      /* moment the wait begins */
  struct timespec deadline;       /* absolute time at which we give up */
  struct timezone zone;
#if defined(PAGECACHE_DEBUG)
  int wait_count= 0;
#endif

  /* Deadline = now + PAGECACHE_TIMEOUT seconds */
  gettimeofday(&start_time, &zone);
  deadline.tv_sec= start_time.tv_sec + PAGECACHE_TIMEOUT;
  /*
    timeval counts microseconds, timespec counts nanoseconds:
    1 microsecond = 1000 nanoseconds
  */
  deadline.tv_nsec= start_time.tv_usec * 1000;
  KEYCACHE_THREAD_TRACE_END("started waiting");
#if defined(PAGECACHE_DEBUG)
  wait_count++;
  if (wait_count % 100 == 0)
    fprintf(pagecache_debug_log, "waiting...\n");
  fflush(pagecache_debug_log);
#endif
  wait_result= pthread_cond_timedwait(cond, mutex, &deadline);
  KEYCACHE_THREAD_TRACE_BEGIN("finished waiting");
  if (wait_result == ETIMEDOUT || wait_result == ETIME)
  {
#if defined(PAGECACHE_DEBUG)
    fprintf(pagecache_debug_log,"aborted by pagecache timeout\n");
    fclose(pagecache_debug_log);
    abort();
#endif
    pagecache_dump();
  }

#if defined(PAGECACHE_DEBUG)
  KEYCACHE_DBUG_ASSERT(wait_result != ETIMEDOUT);
#else
  assert(wait_result != ETIMEDOUT);
#endif
  return wait_result;
}
#else
#if defined(PAGECACHE_DEBUG)
/*
  Debug variant without timeout: plain pthread_cond_wait() bracketed by
  thread trace markers.
*/
static int pagecache_pthread_cond_wait(pthread_cond_t *cond,
                                      pthread_mutex_t *mutex)
{
  int wait_result;
  KEYCACHE_THREAD_TRACE_END("started waiting");
  wait_result= pthread_cond_wait(cond, mutex);
  KEYCACHE_THREAD_TRACE_BEGIN("finished waiting");
  return wait_result;
}
#endif
#endif /* defined(PAGECACHE_TIMEOUT) && !defined(__WIN__) */

#if defined(PAGECACHE_DEBUG)
/* Lock a mutex, then emit the "begin" thread trace marker */
static int ___pagecache_pthread_mutex_lock(pthread_mutex_t *mutex)
{
  int lock_result= pthread_mutex_lock(mutex);
  KEYCACHE_THREAD_TRACE_BEGIN("");
  return lock_result;
}


/* Emit the "end" thread trace marker, then unlock the mutex */
static void ___pagecache_pthread_mutex_unlock(pthread_mutex_t *mutex)
{
  KEYCACHE_THREAD_TRACE_END("");
  pthread_mutex_unlock(mutex);
}


/* Trace the signal, then signal the condition variable */
static int ___pagecache_pthread_cond_signal(pthread_cond_t *cond)
{
  KEYCACHE_THREAD_TRACE("signal");
  return pthread_cond_signal(cond);
}
#endif /* defined(PAGECACHE_DEBUG) */
+{ + va_list args; + va_start(args,fmt); + if (pagecache_debug_log) + { + VOID(vfprintf(pagecache_debug_log, fmt, args)); + VOID(fputc('\n',pagecache_debug_log)); + } + va_end(args); +} +#endif /* defined(PAGECACHE_DEBUG_LOG) */ + +#if defined(PAGECACHE_DEBUG_LOG) + + +void pagecache_debug_log_close(void) +{ + if (pagecache_debug_log) + fclose(pagecache_debug_log); +} +#endif /* defined(PAGECACHE_DEBUG_LOG) */ + +#endif /* defined(PAGECACHE_DEBUG) */ diff --git a/storage/maria/ma_pagecache.h b/storage/maria/ma_pagecache.h new file mode 100644 index 00000000000..821728ef374 --- /dev/null +++ b/storage/maria/ma_pagecache.h @@ -0,0 +1,325 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Page cache variable structures */ + +#ifndef _ma_pagecache_h +#define _ma_pagecache_h +C_MODE_START + +#include "ma_loghandler_lsn.h" +#include <m_string.h> +#include <hash.h> + +/* Type of the page */ +enum pagecache_page_type +{ + /* + Used only for control page type changing during debugging. This define + should only be using when using DBUG. 
+ */ + PAGECACHE_EMPTY_PAGE, + /* the page does not contain LSN */ + PAGECACHE_PLAIN_PAGE, + /* the page contain LSN (maria tablespace page) */ + PAGECACHE_LSN_PAGE, + /* Page type used when scanning file and we don't care about the type */ + PAGECACHE_READ_UNKNOWN_PAGE +}; + +/* + This enum describe lock status changing. every type of page cache will + interpret WRITE/READ lock as it need. +*/ +enum pagecache_page_lock +{ + PAGECACHE_LOCK_LEFT_UNLOCKED, /* free -> free */ + PAGECACHE_LOCK_LEFT_READLOCKED, /* read -> read */ + PAGECACHE_LOCK_LEFT_WRITELOCKED, /* write -> write */ + PAGECACHE_LOCK_READ, /* free -> read */ + PAGECACHE_LOCK_WRITE, /* free -> write */ + PAGECACHE_LOCK_READ_UNLOCK, /* read -> free */ + PAGECACHE_LOCK_WRITE_UNLOCK, /* write -> free */ + PAGECACHE_LOCK_WRITE_TO_READ /* write -> read */ +}; +/* + This enum describe pin status changing +*/ +enum pagecache_page_pin +{ + PAGECACHE_PIN_LEFT_PINNED, /* pinned -> pinned */ + PAGECACHE_PIN_LEFT_UNPINNED, /* unpinned -> unpinned */ + PAGECACHE_PIN, /* unpinned -> pinned */ + PAGECACHE_UNPIN /* pinned -> unpinned */ +}; +/* How to write the page */ +enum pagecache_write_mode +{ + /* do not write immediately, i.e. it will be dirty page */ + PAGECACHE_WRITE_DELAY, + /* page already is in the file. 
(key cache insert analogue) */ + PAGECACHE_WRITE_DONE +}; + +/* page number for maria */ +typedef ulonglong pgcache_page_no_t; + +/* file descriptor for Maria */ +typedef struct st_pagecache_file +{ + File file; + /** Cannot be NULL */ + my_bool (*read_callback)(uchar *page, pgcache_page_no_t offset, + uchar *data); + /** Cannot be NULL */ + my_bool (*write_callback)(uchar *page, pgcache_page_no_t offset, + uchar *data); + void (*write_fail)(uchar *data); + /** Cannot be NULL */ + my_bool (*flush_log_callback)(uchar *page, pgcache_page_no_t offset, + uchar *data); + uchar *callback_data; +} PAGECACHE_FILE; + +/* declare structures that is used by st_pagecache */ + +struct st_pagecache_block_link; +typedef struct st_pagecache_block_link PAGECACHE_BLOCK_LINK; +struct st_pagecache_page; +typedef struct st_pagecache_page PAGECACHE_PAGE; +struct st_pagecache_hash_link; +typedef struct st_pagecache_hash_link PAGECACHE_HASH_LINK; + +#include <wqueue.h> + +#define PAGECACHE_CHANGED_BLOCKS_HASH 128 /* must be power of 2 */ +#define PAGECACHE_PRIORITY_LOW 0 +#define PAGECACHE_PRIORITY_DEFAULT 3 +#define PAGECACHE_PRIORITY_HIGH 6 + +/* + The page cache structure + It also contains read-only statistics parameters. 
+*/ + +typedef struct st_pagecache +{ + size_t mem_size; /* specified size of the cache memory */ + ulong min_warm_blocks; /* min number of warm blocks; */ + ulong age_threshold; /* age threshold for hot blocks */ + ulonglong time; /* total number of block link operations */ + ulong hash_entries; /* max number of entries in the hash table */ + long hash_links; /* max number of hash links */ + long hash_links_used; /* number of hash links taken from free links pool */ + long disk_blocks; /* max number of blocks in the cache */ + ulong blocks_used; /* maximum number of concurrently used blocks */ + ulong blocks_unused; /* number of currently unused blocks */ + ulong blocks_changed; /* number of currently dirty blocks */ + ulong warm_blocks; /* number of blocks in warm sub-chain */ + ulong cnt_for_resize_op; /* counter to block resize operation */ + ulong blocks_available; /* number of blocks available in the LRU chain */ + long blocks; /* max number of blocks in the cache */ + uint32 block_size; /* size of the page buffer of a cache block */ + PAGECACHE_HASH_LINK **hash_root;/* arr. 
of entries into hash table buckets */ + PAGECACHE_HASH_LINK *hash_link_root;/* memory for hash table links */ + PAGECACHE_HASH_LINK *free_hash_list;/* list of free hash links */ + PAGECACHE_BLOCK_LINK *free_block_list;/* list of free blocks */ + PAGECACHE_BLOCK_LINK *block_root;/* memory for block links */ + uchar HUGE_PTR *block_mem; /* memory for block buffers */ + PAGECACHE_BLOCK_LINK *used_last;/* ptr to the last block of the LRU chain */ + PAGECACHE_BLOCK_LINK *used_ins;/* ptr to the insertion block in LRU chain */ + pthread_mutex_t cache_lock; /* to lock access to the cache structure */ + WQUEUE resize_queue; /* threads waiting during resize operation */ + WQUEUE waiting_for_hash_link;/* waiting for a free hash link */ + WQUEUE waiting_for_block; /* requests waiting for a free block */ + /* hash for dirty file bl.*/ + PAGECACHE_BLOCK_LINK *changed_blocks[PAGECACHE_CHANGED_BLOCKS_HASH]; + /* hash for other file bl.*/ + PAGECACHE_BLOCK_LINK *file_blocks[PAGECACHE_CHANGED_BLOCKS_HASH]; + + /* + The following variables are and variables used to hold parameters for + initializing the key cache. + */ + + ulonglong param_buff_size; /* size the memory allocated for the cache */ + ulong param_block_size; /* size of the blocks in the key cache */ + ulong param_division_limit; /* min. percentage of warm blocks */ + ulong param_age_threshold; /* determines when hot block is downgraded */ + + /* Statistics variables. These are reset in reset_pagecache_counters(). 
*/ + ulong global_blocks_changed; /* number of currently dirty blocks */ + ulonglong global_cache_w_requests;/* number of write requests (write hits) */ + ulonglong global_cache_write; /* number of writes from cache to files */ + ulonglong global_cache_r_requests;/* number of read requests (read hits) */ + ulonglong global_cache_read; /* number of reads from files to cache */ + + uint shift; /* block size = 2 ^ shift */ + myf readwrite_flags; /* Flags to pread/pwrite() */ + myf org_readwrite_flags; /* Flags to pread/pwrite() at init */ + my_bool inited; + my_bool resize_in_flush; /* true during flush of resize operation */ + my_bool can_be_used; /* usage of cache for read/write is allowed */ + my_bool in_init; /* Set to 1 in MySQL during init/resize */ + HASH files_in_flush; /**< files in flush_pagecache_blocks_int() */ +} PAGECACHE; + +/** @brief Return values for PAGECACHE_FLUSH_FILTER */ +enum pagecache_flush_filter_result +{ + FLUSH_FILTER_SKIP_TRY_NEXT= 0,/**< skip page and move on to next one */ + FLUSH_FILTER_OK, /**< flush page and move on to next one */ + FLUSH_FILTER_SKIP_ALL /**< skip page and all next ones */ +}; +/** @brief a filter function type for flush_pagecache_blocks_with_filter() */ +typedef enum pagecache_flush_filter_result +(*PAGECACHE_FLUSH_FILTER)(enum pagecache_page_type type, + pgcache_page_no_t page, + LSN rec_lsn, void *arg); + +/* The default key cache */ +extern PAGECACHE dflt_pagecache_var, *dflt_pagecache; + +extern ulong init_pagecache(PAGECACHE *pagecache, size_t use_mem, + uint division_limit, uint age_threshold, + uint block_size, myf my_read_flags); +extern ulong resize_pagecache(PAGECACHE *pagecache, + size_t use_mem, uint division_limit, + uint age_threshold); +extern void change_pagecache_param(PAGECACHE *pagecache, uint division_limit, + uint age_threshold); + +extern uchar *pagecache_read(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint level, + uchar *buff, + enum pagecache_page_type type, 
+ enum pagecache_page_lock lock, + PAGECACHE_BLOCK_LINK **link); + +#define pagecache_write(P,F,N,L,B,T,O,I,M,K,R) \ + pagecache_write_part(P,F,N,L,B,T,O,I,M,K,R,0,(P)->block_size) + +#define pagecache_inject(P,F,N,L,B,T,O,I,K,R) \ + pagecache_write_part(P,F,N,L,B,T,O,I,PAGECACHE_WRITE_DONE, \ + K,R,0,(P)->block_size) + +extern my_bool pagecache_write_part(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint level, + uchar *buff, + enum pagecache_page_type type, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + enum pagecache_write_mode write_mode, + PAGECACHE_BLOCK_LINK **link, + LSN first_REDO_LSN_for_page, + uint offset, + uint size); +extern void pagecache_unlock(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn, my_bool was_changed); +extern void pagecache_unlock_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn, my_bool was_changed, + my_bool any); +extern void pagecache_unpin(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + LSN lsn); +extern void pagecache_unpin_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *link, + LSN lsn); + + +/* Results of flush operation (bit field in fact) */ + +/* The flush is done. */ +#define PCFLUSH_OK 0 +/* There was errors during the flush process. */ +#define PCFLUSH_ERROR 1 +/* Pinned blocks was met and skipped. */ +#define PCFLUSH_PINNED 2 +/* PCFLUSH_ERROR and PCFLUSH_PINNED. 
*/ +#define PCFLUSH_PINNED_AND_ERROR (PCFLUSH_ERROR|PCFLUSH_PINNED) + +#define pagecache_file_init(F,RC,WC,WF,GLC,D) \ + do{ \ + (F).read_callback= (RC); (F).write_callback= (WC); \ + (F).write_fail= (WF); \ + (F).flush_log_callback= (GLC); (F).callback_data= (uchar*)(D); \ + } while(0) + +#define flush_pagecache_blocks(A,B,C) \ + flush_pagecache_blocks_with_filter(A,B,C,NULL,NULL) +extern int flush_pagecache_blocks_with_filter(PAGECACHE *keycache, + PAGECACHE_FILE *file, + enum flush_type type, + PAGECACHE_FLUSH_FILTER filter, + void *filter_arg); +extern my_bool pagecache_delete(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + my_bool flush); +extern my_bool pagecache_delete_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *link, + enum pagecache_page_lock lock, + my_bool flush); +extern my_bool pagecache_delete_pages(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint page_count, + enum pagecache_page_lock lock, + my_bool flush); +extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup); +extern my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache, + LEX_STRING *str, + LSN *min_lsn); +extern int reset_pagecache_counters(const char *name, PAGECACHE *pagecache); +extern uchar *pagecache_block_link_to_buffer(PAGECACHE_BLOCK_LINK *block); + +extern uint pagecache_pagelevel(PAGECACHE_BLOCK_LINK *block); +extern void pagecache_add_level_by_link(PAGECACHE_BLOCK_LINK *block, + uint level); + +/* Functions to handle multiple key caches */ +extern my_bool multi_pagecache_init(void); +extern void multi_pagecache_free(void); +extern PAGECACHE *multi_pagecache_search(uchar *key, uint length, + PAGECACHE *def); +extern my_bool multi_pagecache_set(const uchar *key, uint length, + PAGECACHE *pagecache); +extern void multi_pagecache_change(PAGECACHE *old_data, + PAGECACHE *new_data); +extern int reset_pagecache_counters(const char *name, + PAGECACHE *pagecache); 
+#ifndef DBUG_OFF +void pagecache_file_no_dirty_page(PAGECACHE *pagecache, PAGECACHE_FILE *file); +#else +#define pagecache_file_no_dirty_page(A,B) {} +#endif + +C_MODE_END +#endif /* _keycache_h */ diff --git a/storage/maria/ma_pagecaches.c b/storage/maria/ma_pagecaches.c new file mode 100644 index 00000000000..8a1423ee0d7 --- /dev/null +++ b/storage/maria/ma_pagecaches.c @@ -0,0 +1,104 @@ +/* Copyright (C) 2003-2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Handling of multiple key caches + + The idea is to have a thread safe hash on the table name, + with a default key cache value that is returned if the table name is not in + the cache. +*/ + +#include "maria_def.h" +#include "ma_pagecache.h" +#include <hash.h> +#include <m_string.h> +#include "../../mysys/my_safehash.h" + +/***************************************************************************** + Functions to handle the pagecache objects +*****************************************************************************/ + +/* Variable to store all key cache objects */ +static SAFE_HASH pagecache_hash; + + +my_bool multi_pagecache_init(void) +{ + return safe_hash_init(&pagecache_hash, 16, (uchar*) maria_pagecache); +} + + +void multi_pagecache_free(void) +{ + safe_hash_free(&pagecache_hash); +} + +/* + Get a key cache to be used for a specific table. 
+ + SYNOPSIS + multi_pagecache_search() + key key to find (usually table path) + uint length Length of key. + def Default value if no key cache + + NOTES + This function is coded in such a way that we will return the + default key cache even if one never called multi_pagecache_init. + This will ensure that it works with old MyISAM clients. + + RETURN + key cache to use +*/ + +PAGECACHE *multi_pagecache_search(uchar *key, uint length, + PAGECACHE *def) +{ + if (!pagecache_hash.hash.records) + return def; + return (PAGECACHE*) safe_hash_search(&pagecache_hash, key, length, + (void*) def); +} + + +/* + Assosiate a key cache with a key + + + SYONOPSIS + multi_pagecache_set() + key key (path to table etc..) + length Length of key + pagecache cache to assococite with the table + + NOTES + This can be used both to insert a new entry and change an existing + entry +*/ + + +my_bool multi_pagecache_set(const uchar *key, uint length, + PAGECACHE *pagecache) +{ + return safe_hash_set(&pagecache_hash, key, length, (uchar*) pagecache); +} + + +void multi_pagecache_change(PAGECACHE *old_data, + PAGECACHE *new_data) +{ + safe_hash_change(&pagecache_hash, (uchar*) old_data, (uchar*) new_data); +} diff --git a/storage/maria/ma_pagecrc.c b/storage/maria/ma_pagecrc.c new file mode 100644 index 00000000000..640bb8880f4 --- /dev/null +++ b/storage/maria/ma_pagecrc.c @@ -0,0 +1,378 @@ +/* Copyright (C) 2007-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + + +/** + @brief calculate crc of the page avoiding special values + + @param start The value to start CRC (we use page number here) + @param data data pointer + @param length length of the data + + @return crc of the page without special values +*/ + +static uint32 maria_page_crc(uint32 start, uchar *data, uint length) +{ + uint32 crc= crc32(start, data, length); + + /* we need this assert to get following comparison working */ + compile_time_assert(MARIA_NO_CRC_BITMAP_PAGE == + MARIA_NO_CRC_NORMAL_PAGE - 1 && + MARIA_NO_CRC_NORMAL_PAGE == 0xffffffff); + if (crc >= MARIA_NO_CRC_BITMAP_PAGE) + crc= MARIA_NO_CRC_BITMAP_PAGE - 1; + + return(crc); +} + +/** + @brief Maria pages read callback (checks the page CRC) + + @param page The page data to check + @param page_no The page number (<offset>/<page length>) + @param data_ptr pointer to MARIA_SHARE + @param no_crc_val Value which means CRC absence + (MARIA_NO_CRC_NORMAL_PAGE or MARIA_NO_CRC_BITMAP_PAGE) + @param data_length length of data to calculate CRC + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool maria_page_crc_check(uchar *page, + pgcache_page_no_t page_no, + MARIA_SHARE *share, + uint32 no_crc_val, + int data_length) +{ + uint32 crc= uint4korr(page + share->block_size - CRC_SIZE), new_crc; + my_bool res; + DBUG_ENTER("maria_page_crc_check"); + + DBUG_ASSERT((uint)data_length <= share->block_size - CRC_SIZE); + + /* we need this assert to get following comparison working */ + compile_time_assert(MARIA_NO_CRC_BITMAP_PAGE == + MARIA_NO_CRC_NORMAL_PAGE - 1 && + MARIA_NO_CRC_NORMAL_PAGE == 0xffffffff); + /* + If crc is no_crc_val then + the page has no crc, so there is nothing to check. 
+ */ + if (crc >= MARIA_NO_CRC_BITMAP_PAGE) + { + DBUG_PRINT("info", ("No crc: %lu crc: %lu page: %lu ", + (ulong) no_crc_val, (ulong) crc, (ulong) page_no)); + if (crc != no_crc_val) + { + my_errno= HA_ERR_WRONG_CRC; + DBUG_PRINT("error", ("Wrong no CRC value")); + DBUG_RETURN(1); + } + DBUG_RETURN(0); + } + new_crc= maria_page_crc((uint32) page_no, page, data_length); + DBUG_ASSERT(new_crc != no_crc_val); + res= test(new_crc != crc); + if (res) + { + /* + Bitmap pages may be totally zero filled in some cases. + This happens when we get a crash after the pagecache has written + out a page that is on a newly created bitmap page and we get + a crash before the bitmap page is written out. + + We handle this case with the following logic: + When reading, approve of bitmap pages where all bytes are zero + (This is after all a bitmap pages where no data is reserved and + the CRC will be corrected at next write) + */ + if (no_crc_val == MARIA_NO_CRC_BITMAP_PAGE && + crc == 0 && _ma_check_if_zero(page, data_length)) + { + DBUG_PRINT("warning", ("Found bitmap page that was not initialized")); + DBUG_RETURN(0); + } + + DBUG_PRINT("error", ("Page: %lu crc: %lu calculated crc: %lu", + (ulong) page_no, (ulong) crc, (ulong) new_crc)); + my_errno= HA_ERR_WRONG_CRC; + } + DBUG_RETURN(res); +} + + +/** + @brief Maria pages write callback (sets the page CRC for data and index + files) + + @param page The page data to set + @param page_no The page number (<offset>/<page length>) + @param data_ptr Write callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK +*/ + +my_bool maria_page_crc_set_normal(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr) +{ + MARIA_SHARE *share= (MARIA_SHARE *)data_ptr; + int data_length= share->block_size - CRC_SIZE; + uint32 crc= maria_page_crc((uint32) page_no, page, data_length); + DBUG_ENTER("maria_page_crc_set_normal"); + DBUG_PRINT("info", ("Page %lu crc: %lu", (ulong) page_no, (ulong)crc)); + + /* crc is on the stack so it is 
aligned, pagecache buffer is aligned, too */ + int4store_aligned(page + data_length, crc); + DBUG_RETURN(0); +} + + +/** + @brief Maria pages write callback (sets the page CRC for keys) + + @param page The page data to set + @param page_no The page number (<offset>/<page length>) + @param data_ptr Write callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK +*/ + +my_bool maria_page_crc_set_index(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr) +{ + MARIA_SHARE *share= (MARIA_SHARE *)data_ptr; + int data_length= _ma_get_page_used(share, page); + uint32 crc= maria_page_crc((uint32) page_no, page, data_length); + DBUG_ENTER("maria_page_crc_set_index"); + DBUG_PRINT("info", ("Page %lu crc: %lu", + (ulong) page_no, (ulong) crc)); + DBUG_ASSERT((uint)data_length <= share->block_size - CRC_SIZE); + /* crc is on the stack so it is aligned, pagecache buffer is aligned, too */ + int4store_aligned(page + share->block_size - CRC_SIZE, crc); + DBUG_RETURN(0); +} + + +/* interface functions */ + + +/** + @brief Maria pages read callback (checks the page CRC) for index/data pages + + @param page The page data to check + @param page_no The page number (<offset>/<page length>) + @param data_ptr Read callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK + @retval 1 Error +*/ + +my_bool maria_page_crc_check_data(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr) +{ + MARIA_SHARE *share= (MARIA_SHARE *)data_ptr; + return (maria_page_crc_check(page, (uint32) page_no, share, + MARIA_NO_CRC_NORMAL_PAGE, + share->block_size - CRC_SIZE)); +} + + +/** + @brief Maria pages read callback (checks the page CRC) for bitmap pages + + @param page The page data to check + @param page_no The page number (<offset>/<page length>) + @param data_ptr Read callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK + @retval 1 Error +*/ + +my_bool maria_page_crc_check_bitmap(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr) +{ + MARIA_SHARE *share= 
(MARIA_SHARE *)data_ptr; + return (maria_page_crc_check(page, (uint32) page_no, share, + MARIA_NO_CRC_BITMAP_PAGE, + share->block_size - CRC_SIZE)); +} + + +/** + @brief Maria pages read callback (checks the page CRC) for index pages + + @param page The page data to check + @param page_no The page number (<offset>/<page length>) + @param data_ptr Read callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK + @retval 1 Error +*/ + +my_bool maria_page_crc_check_index(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr) +{ + MARIA_SHARE *share= (MARIA_SHARE *)data_ptr; + uint length= _ma_get_page_used(share, page); + if (length > share->block_size - CRC_SIZE) + { + DBUG_PRINT("error", ("Wrong page length: %u", length)); + return (my_errno= HA_ERR_WRONG_CRC); + } + return maria_page_crc_check(page, (uint32) page_no, share, + MARIA_NO_CRC_NORMAL_PAGE, + length); +} + + +/** + @brief Maria pages dumme read callback for temporary tables + + @retval 0 OK + @retval 1 Error +*/ + +my_bool maria_page_crc_check_none(uchar *page __attribute__((unused)), + pgcache_page_no_t page_no + __attribute__((unused)), + uchar *data_ptr __attribute__((unused))) +{ + return 0; +} + + +/** + @brief Maria pages write callback (sets the page filler for index/data) + + @param page The page data to set + @param page_no The page number (<offset>/<page length>) + @param data_ptr Write callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK +*/ + +my_bool maria_page_filler_set_normal(uchar *page, + pgcache_page_no_t page_no + __attribute__((unused)), + uchar *data_ptr) +{ + DBUG_ENTER("maria_page_filler_set_normal"); + DBUG_ASSERT(page_no != 0); /* Catches some simple bugs */ + int4store_aligned(page + ((MARIA_SHARE *)data_ptr)->block_size - CRC_SIZE, + MARIA_NO_CRC_NORMAL_PAGE); + DBUG_RETURN(0); +} + + +/** + @brief Maria pages write callback (sets the page filler for bitmap) + + @param page The page data to set + @param page_no The page number (<offset>/<page length>) + 
@param data_ptr Write callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK +*/ + +my_bool maria_page_filler_set_bitmap(uchar *page, + pgcache_page_no_t page_no + __attribute__((unused)), + uchar *data_ptr) +{ + DBUG_ENTER("maria_page_filler_set_bitmap"); + int4store_aligned(page + ((MARIA_SHARE *)data_ptr)->block_size - CRC_SIZE, + MARIA_NO_CRC_BITMAP_PAGE); + DBUG_RETURN(0); +} + + +/** + @brief Maria pages dummy write callback for temporary tables + + @retval 0 OK +*/ + +my_bool maria_page_filler_set_none(uchar *page __attribute__((unused)), + pgcache_page_no_t page_no + __attribute__((unused)), + uchar *data_ptr __attribute__((unused))) +{ +#ifdef HAVE_valgrind + int4store_aligned(page + ((MARIA_SHARE *)data_ptr)->block_size - CRC_SIZE, + 0); +#endif + return 0; +} + + +/** + @brief Write failure callback (mark table as corrupted) + + @param data_ptr Write callback data pointer (pointer to MARIA_SHARE) +*/ + +void maria_page_write_failure(uchar* data_ptr) +{ + maria_mark_crashed_share((MARIA_SHARE *)data_ptr); +} + + +/** + @brief Maria flush log log if needed + + @param page The page data to set + @param page_no The page number (<offset>/<page length>) + @param data_ptr Write callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK + @retval 1 error +*/ + +my_bool maria_flush_log_for_page(uchar *page, + pgcache_page_no_t page_no + __attribute__((unused)), + uchar *data_ptr __attribute__((unused))) +{ + LSN lsn; +#ifndef DBUG_OFF + const MARIA_SHARE *share= (MARIA_SHARE*) data_ptr; +#endif + DBUG_ENTER("maria_flush_log_for_page"); + /* share is 0 here only in unittest */ + DBUG_ASSERT(!share || (share->page_type == PAGECACHE_LSN_PAGE && + share->now_transactional)); + lsn= lsn_korr(page); + if (translog_flush(lsn)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +my_bool maria_flush_log_for_page_none(uchar *page __attribute__((unused)), + pgcache_page_no_t page_no + __attribute__((unused)), + uchar *data_ptr __attribute__((unused))) +{ + return 0; 
+} diff --git a/storage/maria/ma_panic.c b/storage/maria/ma_panic.c new file mode 100644 index 00000000000..a86563f31fb --- /dev/null +++ b/storage/maria/ma_panic.c @@ -0,0 +1,140 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "ma_fulltext.h" + +/* + Stop usage of Maria + + SYNOPSIS + maria_panic() + flag HA_PANIC_CLOSE: All maria files (tables and log) are closed. + maria_end() is called. + HA_PANIC_WRITE: All misam files are unlocked and + all changed data in single user maria is + written to file + HA_PANIC_READ All maria files that was locked when + maria_panic(HA_PANIC_WRITE) was done is + locked. 
A maria_readinfo() is done for + all single user files to get changes + in database + + RETURN + 0 ok + # error number in case of error +*/ + +int maria_panic(enum ha_panic_function flag) +{ + int error=0; + LIST *list_element,*next_open; + MARIA_HA *info; + DBUG_ENTER("maria_panic"); + + if (!maria_inited) + DBUG_RETURN(0); + pthread_mutex_lock(&THR_LOCK_maria); + for (list_element=maria_open_list ; list_element ; list_element=next_open) + { + next_open=list_element->next; /* Save if close */ + info=(MARIA_HA*) list_element->data; + switch (flag) { + case HA_PANIC_CLOSE: + /* + If bad luck (if some tables would be used now, which normally does not + happen in MySQL), as we release the mutex, the list may change and so + we may crash. + */ + pthread_mutex_unlock(&THR_LOCK_maria); + if (maria_close(info)) + error=my_errno; + pthread_mutex_lock(&THR_LOCK_maria); + break; + case HA_PANIC_WRITE: /* Do this to free databases */ +#ifdef CANT_OPEN_FILES_TWICE + if (info->s->options & HA_OPTION_READ_ONLY_DATA) + break; +#endif + if (flush_pagecache_blocks(info->s->pagecache, &info->s->kfile, + FLUSH_RELEASE)) + error=my_errno; + if (info->opt_flag & WRITE_CACHE_USED) + if (flush_io_cache(&info->rec_cache)) + error=my_errno; + if (info->opt_flag & READ_CACHE_USED) + { + if (flush_io_cache(&info->rec_cache)) + error=my_errno; + reinit_io_cache(&info->rec_cache,READ_CACHE,0, + (pbool) (info->lock_type != F_UNLCK),1); + } + if (info->lock_type != F_UNLCK && ! 
info->was_locked) + { + info->was_locked=info->lock_type; + if (maria_lock_database(info,F_UNLCK)) + error=my_errno; + } +#ifdef CANT_OPEN_FILES_TWICE + if (info->s->kfile.file >= 0 && my_close(info->s->kfile.file, MYF(0))) + error = my_errno; + if (info->dfile.file >= 0 && my_close(info->dfile.file, MYF(0))) + error = my_errno; + info->s->kfile.file= info->dfile.file= -1;/* Files aren't open anymore */ + break; +#endif + case HA_PANIC_READ: /* Restore to before WRITE */ +#ifdef CANT_OPEN_FILES_TWICE + { /* Open closed files */ + char name_buff[FN_REFLEN]; + MARIA_SHARE *share= info->s; + if (share->kfile.file < 0) + { + + if ((share->kfile.file= my_open(fn_format(name_buff, + info->filename, "", + N_NAME_IEXT,4), + info->mode, + MYF(MY_WME))) < 0) + error = my_errno; + } + if (info->dfile.file < 0) + { + if ((info->dfile.file= my_open(fn_format(name_buff, info->filename, + "", N_NAME_DEXT, 4), + info->mode, + MYF(MY_WME))) < 0) + error = my_errno; + info->rec_cache.file= info->dfile.file; + } + if (share->bitmap.file.file < 0) + share->bitmap.file.file= info->dfile.file; + } +#endif + if (info->was_locked) + { + if (maria_lock_database(info, info->was_locked)) + error=my_errno; + info->was_locked=0; + } + break; + } + } + pthread_mutex_unlock(&THR_LOCK_maria); + if (flag == HA_PANIC_CLOSE) + maria_end(); + if (!error) + DBUG_RETURN(0); + DBUG_RETURN(my_errno=error); +} /* maria_panic */ diff --git a/storage/maria/ma_preload.c b/storage/maria/ma_preload.c new file mode 100644 index 00000000000..6dfb4e437b6 --- /dev/null +++ b/storage/maria/ma_preload.c @@ -0,0 +1,116 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Preload indexes into key cache +*/ + +#include "maria_def.h" + + +/* + Preload pages of the index file for a table into the key cache + + SYNOPSIS + maria_preload() + info open table + map map of indexes to preload into key cache + ignore_leaves only non-leaves pages are to be preloaded + + RETURN VALUE + 0 if a success. error code - otherwise. + + NOTES. + At present pages for all indexes are preloaded. + In future only pages for indexes specified in the key_map parameter + of the table will be preloaded. + We don't yet use preload_buff_size (we read page after page). +*/ + +int maria_preload(MARIA_HA *info, ulonglong key_map, my_bool ignore_leaves) +{ + ulong block_length= 0; + uchar *buff; + MARIA_SHARE* share= info->s; + uint keynr; + my_off_t key_file_length= share->state.state.key_file_length; + pgcache_page_no_t page_no, page_no_max; + PAGECACHE_BLOCK_LINK *page_link; + DBUG_ENTER("maria_preload"); + + if (!share->state.header.keys || !maria_is_any_key_active(key_map) || + (key_file_length == share->base.keystart)) + DBUG_RETURN(0); + + block_length= share->pagecache->block_size; + + if (!(buff= (uchar *) my_malloc(block_length, MYF(MY_WME)))) + DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM); + + if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE)) + goto err; + + /* + Currently when we come here all other open instances of the table have + been closed, and we flushed all pages of our own instance, so there + cannot be any page of this table in the cache. Thus my_pread() would be + safe. 
But in the future, we will allow more concurrency during + preloading, so we use pagecache_read() instead of my_pread() because we + observed that on some Linux, concurrent pread() and pwrite() (which + could be from a page eviction by another thread) to the same page can + make pread() see an half-written page. + In this future, we should find a way to read state.key_file_length + reliably, handle concurrent shrinks (delete_all_rows()) etc. + */ + for ((page_no= share->base.keystart / block_length), + (page_no_max= key_file_length / block_length); + page_no < page_no_max; page_no++) + { + /** + @todo instead of reading pages one by one we could have a call + pagecache_read_several_pages() which does a single my_pread() for many + consecutive pages (like the my_pread() in mi_preload()). + */ + if (pagecache_read(share->pagecache, &share->kfile, page_no, + DFLT_INIT_HITS, buff, share->page_type, + PAGECACHE_LOCK_WRITE, &page_link) == NULL) + goto err; + keynr= _ma_get_keynr(share, buff); + if (((ignore_leaves && !_ma_test_if_nod(share, buff)) || + keynr == MARIA_DELETE_KEY_NR || + !(key_map & ((ulonglong) 1 << keynr))) && + (pagecache_pagelevel(page_link) == DFLT_INIT_HITS)) + { + /* + This page is not interesting, and (last condition above) we are the + ones who put it in the cache, so nobody else is interested in it. 
+ */ + if (pagecache_delete_by_link(share->pagecache, page_link, + PAGECACHE_LOCK_LEFT_WRITELOCKED, FALSE)) + goto err; + } + else /* otherwise it stays in cache: */ + pagecache_unlock_by_link(share->pagecache, page_link, + PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, + LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, FALSE, FALSE); + } + + my_free(buff, MYF(0)); + DBUG_RETURN(0); + +err: + my_free(buff, MYF(MY_ALLOW_ZERO_PTR)); + DBUG_RETURN(my_errno= errno); +} diff --git a/storage/maria/ma_range.c b/storage/maria/ma_range.c new file mode 100644 index 00000000000..5dc4e3a9959 --- /dev/null +++ b/storage/maria/ma_range.c @@ -0,0 +1,312 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Gives a approximated number of how many records there is between two keys. + Used when optimizing querries. + */ + +#include "maria_def.h" +#include "ma_rt_index.h" + +static ha_rows _ma_record_pos(MARIA_HA *,const uchar *, key_part_map, + enum ha_rkey_function); +static double _ma_search_pos(MARIA_HA *, MARIA_KEY *, uint32, my_off_t); +static uint _ma_keynr(MARIA_PAGE *page, uchar *keypos, uint *ret_max_key); + + +/** + @brief Estimate how many records there is in a given range + + @param info MARIA handler + @param inx Index to use + @param min_key Min key. Is = 0 if no min range + @param max_key Max key. 
Is = 0 if no max range + + @note + We should ONLY return 0 if there is no rows in range + + @return Estimated number of rows or error + @retval HA_POS_ERROR error (or we can't estimate number of rows) + @retval number Estimated number of rows +*/ + +ha_rows maria_records_in_range(MARIA_HA *info, int inx, key_range *min_key, + key_range *max_key) +{ + ha_rows start_pos,end_pos,res; + MARIA_SHARE *share= info->s; + MARIA_KEY key; + MARIA_KEYDEF *keyinfo; + DBUG_ENTER("maria_records_in_range"); + + if ((inx = _ma_check_index(info,inx)) < 0) + DBUG_RETURN(HA_POS_ERROR); + + if (fast_ma_readinfo(info)) + DBUG_RETURN(HA_POS_ERROR); + info->update&= (HA_STATE_CHANGED+HA_STATE_ROW_CHANGED); + keyinfo= share->keyinfo + inx; + if (share->lock_key_trees) + rw_rdlock(&keyinfo->root_lock); + + switch (keyinfo->key_alg) { +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + { + uchar *key_buff; + + /* + The problem is that the optimizer doesn't support + RTree keys properly at the moment. + Hope this will be fixed some day. + But now NULL in the min_key means that we + didn't make the task for the RTree key + and expect BTree functionality from it. + As it's not able to handle such request + we return the error. + */ + if (!min_key) + { + res= HA_POS_ERROR; + break; + } + key_buff= info->last_key.data + share->base.max_key_length; + _ma_pack_key(info, &key, inx, key_buff, + min_key->key, min_key->keypart_map, + (HA_KEYSEG**) 0); + res= maria_rtree_estimate(info, &key, maria_read_vec[min_key->flag]); + res= res ? res : 1; /* Don't return 0 */ + break; + } +#endif + case HA_KEY_ALG_BTREE: + default: + start_pos= (min_key ? + _ma_record_pos(info, min_key->key, min_key->keypart_map, + min_key->flag) : + (ha_rows) 0); + end_pos= (max_key ? + _ma_record_pos(info, max_key->key, max_key->keypart_map, + max_key->flag) : + info->state->records + (ha_rows) 1); + res= (end_pos < start_pos ? (ha_rows) 0 : + (end_pos == start_pos ? 
(ha_rows) 1 : end_pos-start_pos)); + if (start_pos == HA_POS_ERROR || end_pos == HA_POS_ERROR) + res=HA_POS_ERROR; + } + + if (share->lock_key_trees) + rw_unlock(&keyinfo->root_lock); + fast_ma_writeinfo(info); + + /** + @todo LOCK + If res==0 (no rows), if we need to guarantee repeatability of the search, + we will need to set a next-key lock in this statement. + Also SELECT COUNT(*)... + */ + + DBUG_PRINT("info",("records: %ld",(ulong) (res))); + DBUG_RETURN(res); +} + + + /* Find relative position (in records) for key in index-tree */ + +static ha_rows _ma_record_pos(MARIA_HA *info, const uchar *key_data, + key_part_map keypart_map, + enum ha_rkey_function search_flag) +{ + uint inx= (uint) info->lastinx; + uint32 nextflag; + uchar *key_buff; + double pos; + MARIA_KEY key; + DBUG_ENTER("_ma_record_pos"); + DBUG_PRINT("enter",("search_flag: %d",search_flag)); + DBUG_ASSERT(keypart_map); + + key_buff= info->lastkey_buff+info->s->base.max_key_length; + _ma_pack_key(info, &key, inx, key_buff, key_data, keypart_map, + (HA_KEYSEG**) 0); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, &key);); + nextflag=maria_read_vec[search_flag]; + + /* + my_handler.c:ha_compare_text() has a flag 'skip_end_space'. + This is set in my_handler.c:ha_key_cmp() in dependence on the + compare flags 'nextflag' and the column type. + + TEXT columns are of type HA_KEYTYPE_VARTEXT. In this case the + condition is skip_end_space= ((nextflag & (SEARCH_FIND | + SEARCH_UPDATE)) == SEARCH_FIND). + + SEARCH_FIND is used for an exact key search. The combination + SEARCH_FIND | SEARCH_UPDATE is used in write/update/delete + operations with a comment like "Not real duplicates", whatever this + means. From the condition above we can see that 'skip_end_space' is + always false for these operations. 
The result is that trailing space + counts in key comparison and hence, emtpy strings ('', string length + zero, but not NULL) compare less that strings starting with control + characters and these in turn compare less than strings starting with + blanks. + + When estimating the number of records in a key range, we request an + exact search for the minimum key. This translates into a plain + SEARCH_FIND flag. Using this alone would lead to a 'skip_end_space' + compare. Empty strings would be expected above control characters. + Their keys would not be found because they are located below control + characters. + + This is the reason that we add the SEARCH_UPDATE flag here. It makes + the key estimation compare in the same way like key write operations + do. Olny so we will find the keys where they have been inserted. + + Adding the flag unconditionally does not hurt as it is used in the + above mentioned condition only. So it can safely be used together + with other flags. + */ + pos= _ma_search_pos(info, &key, + nextflag | SEARCH_SAVE_BUFF | SEARCH_UPDATE, + info->s->state.key_root[inx]); + if (pos >= 0.0) + { + DBUG_PRINT("exit",("pos: %ld",(ulong) (pos*info->state->records))); + DBUG_RETURN((ulong) (pos*info->state->records+0.5)); + } + DBUG_RETURN(HA_POS_ERROR); +} + + +/** + Find offset for key on index page + + @notes + Modified version of _ma_search() + + @return + @retval 0.0 <= x <= 1.0 +*/ + +static double _ma_search_pos(MARIA_HA *info, MARIA_KEY *key, + uint32 nextflag, my_off_t pos) +{ + int flag; + uint keynr, max_keynr; + my_bool after_key; + uchar *keypos; + double offset; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + DBUG_ENTER("_ma_search_pos"); + LINT_INIT(max_keynr); + + if (pos == HA_OFFSET_ERROR) + DBUG_RETURN(0.5); + + if (_ma_fetch_keypage(&page, info, keyinfo, pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, + info->buff, 1)) + goto err; + flag= (*keyinfo->bin_search)(key, &page, nextflag, &keypos, + info->lastkey_buff, 
&after_key); + keynr= _ma_keynr(&page, keypos, &max_keynr); + + if (flag) + { + if (flag == MARIA_FOUND_WRONG_KEY) + DBUG_RETURN(-1); /* error */ + /* + Didn't found match. keypos points at next (bigger) key + Try to find a smaller, better matching key. + Matches keynr + [0-1] + */ + if (flag > 0 && ! page.node) + offset= 1.0; + else if ((offset= _ma_search_pos(info, key, nextflag, + _ma_kpos(page.node,keypos))) < 0) + DBUG_RETURN(offset); + } + else + { + /* + Found match. Keypos points at the start of the found key + Matches keynr+1 + */ + offset=1.0; /* Matches keynr+1 */ + if ((nextflag & SEARCH_FIND) && page.node && + ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME || + (nextflag & (SEARCH_PREFIX | SEARCH_NO_FIND | SEARCH_LAST | + SEARCH_PART_KEY)))) + { + /* + There may be identical keys in the tree. Try to match on of those. + Matches keynr + [0-1] + */ + if ((offset= _ma_search_pos(info, key, SEARCH_FIND, + _ma_kpos(page.node,keypos))) < 0) + DBUG_RETURN(offset); /* Read error */ + } + } + DBUG_PRINT("info",("keynr: %d offset: %g max_keynr: %d nod: %d flag: %d", + keynr,offset,max_keynr,page.node,flag)); + DBUG_RETURN((keynr+offset)/(max_keynr+1)); +err: + DBUG_PRINT("exit",("Error: %d",my_errno)); + DBUG_RETURN (-1.0); +} + + +/* Get keynummer of current key and max number of keys in nod */ + +static uint _ma_keynr(MARIA_PAGE *page, uchar *keypos, uint *ret_max_key) +{ + uint page_flag, nod_flag, keynr, max_key; + uchar t_buff[MARIA_MAX_KEY_BUFF], *pos, *end; + const MARIA_KEYDEF *keyinfo= page->keyinfo; + MARIA_KEY key; + + page_flag= page->flag; + nod_flag= page->node; + pos= page->buff + page->info->s->keypage_header + nod_flag; + end= page->buff + page->size; + + if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) && + ! 
(page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { + *ret_max_key= (uint) (end - pos)/(keyinfo->keylength+nod_flag); + return (uint) (keypos - pos)/(keyinfo->keylength+nod_flag); + } + + max_key=keynr=0; + t_buff[0]=0; /* Safety */ + key.data= t_buff; + key.keyinfo= (MARIA_KEYDEF*) keyinfo; + + while (pos < end) + { + if (!(pos= (*keyinfo->skip_key)(&key, page_flag, nod_flag, pos))) + { + DBUG_ASSERT(0); + return 0; /* Error */ + } + max_key++; + if (pos == keypos) + keynr= max_key; + } + *ret_max_key=max_key; + return(keynr); +} diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c new file mode 100644 index 00000000000..7a7286e26f9 --- /dev/null +++ b/storage/maria/ma_recovery.c @@ -0,0 +1,3755 @@ +/* Copyright (C) 2006, 2007 MySQL AB + Copyright (C) 2010 Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3072 Maria recovery + First version written by Guilhem Bichot on 2006-04-27. 
+*/ + +/* Here is the implementation of this module */ + +#include "maria_def.h" +#include "ma_recovery.h" +#include "ma_blockrec.h" +#include "ma_checkpoint.h" +#include "trnman.h" +#include "ma_key_recover.h" +#include "ma_recovery_util.h" + +struct st_trn_for_recovery /* used only in the REDO phase */ +{ + LSN group_start_lsn, undo_lsn, first_undo_lsn; + TrID long_trid; +}; +struct st_table_for_recovery /* used in the REDO and UNDO phase */ +{ + MARIA_HA *info; +}; +/* Variables used by all functions of this module. Ok as single-threaded */ +static struct st_trn_for_recovery *all_active_trans; +static struct st_table_for_recovery *all_tables; +static struct st_dirty_page *dirty_pages_pool; +static LSN current_group_end_lsn; +#ifndef DBUG_OFF +/** Current group of REDOs is about this table and only this one */ +static MARIA_HA *current_group_table; +#endif +static TrID max_long_trid= 0; /**< max long trid seen by REDO phase */ +static my_bool skip_DDLs; /**< if REDO phase should skip DDL records */ +/** @brief to avoid writing a checkpoint if recovery did nothing. 
*/ +static my_bool checkpoint_useful; +static my_bool in_redo_phase; +static my_bool trns_created; +static ulong skipped_undo_phase; +static ulonglong now; /**< for tracking execution time of phases */ +static int (*save_error_handler_hook)(uint, const char *,myf); +static uint recovery_warnings; /**< count of warnings */ +static uint recovery_found_crashed_tables; + +#define prototype_redo_exec_hook(R) \ + static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec) + +#define prototype_redo_exec_hook_dummy(R) \ + static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec \ + __attribute__ ((unused))) + +#define prototype_undo_exec_hook(R) \ + static int exec_UNDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec, TRN *trn) + +prototype_redo_exec_hook(LONG_TRANSACTION_ID); +prototype_redo_exec_hook_dummy(CHECKPOINT); +prototype_redo_exec_hook(REDO_CREATE_TABLE); +prototype_redo_exec_hook(REDO_RENAME_TABLE); +prototype_redo_exec_hook(REDO_REPAIR_TABLE); +prototype_redo_exec_hook(REDO_DROP_TABLE); +prototype_redo_exec_hook(FILE_ID); +prototype_redo_exec_hook(INCOMPLETE_LOG); +prototype_redo_exec_hook_dummy(INCOMPLETE_GROUP); +prototype_redo_exec_hook(UNDO_BULK_INSERT); +prototype_redo_exec_hook(IMPORTED_TABLE); +prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD); +prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL); +prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD); +prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD); +prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL); +prototype_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL); +prototype_redo_exec_hook(REDO_FREE_BLOCKS); +prototype_redo_exec_hook(REDO_DELETE_ALL); +prototype_redo_exec_hook(REDO_INDEX); +prototype_redo_exec_hook(REDO_INDEX_NEW_PAGE); +prototype_redo_exec_hook(REDO_INDEX_FREE_PAGE); +prototype_redo_exec_hook(REDO_BITMAP_NEW_PAGE); +prototype_redo_exec_hook(UNDO_ROW_INSERT); +prototype_redo_exec_hook(UNDO_ROW_DELETE); +prototype_redo_exec_hook(UNDO_ROW_UPDATE); +prototype_redo_exec_hook(UNDO_KEY_INSERT); 
+prototype_redo_exec_hook(UNDO_KEY_DELETE); +prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT); +prototype_redo_exec_hook(COMMIT); +prototype_redo_exec_hook(CLR_END); +prototype_redo_exec_hook(DEBUG_INFO); +prototype_undo_exec_hook(UNDO_ROW_INSERT); +prototype_undo_exec_hook(UNDO_ROW_DELETE); +prototype_undo_exec_hook(UNDO_ROW_UPDATE); +prototype_undo_exec_hook(UNDO_KEY_INSERT); +prototype_undo_exec_hook(UNDO_KEY_DELETE); +prototype_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT); +prototype_undo_exec_hook(UNDO_BULK_INSERT); + +static int run_redo_phase(LSN lsn, LSN end_lsn, + enum maria_apply_log_way apply); +static uint end_of_redo_phase(my_bool prepare_for_undo_phase); +static int run_undo_phase(uint uncommitted); +static void display_record_position(const LOG_DESC *log_desc, + const TRANSLOG_HEADER_BUFFER *rec, + uint number); +static int display_and_apply_record(const LOG_DESC *log_desc, + const TRANSLOG_HEADER_BUFFER *rec); +static MARIA_HA *get_MARIA_HA_from_REDO_record(const + TRANSLOG_HEADER_BUFFER *rec); +static MARIA_HA *get_MARIA_HA_from_UNDO_record(const + TRANSLOG_HEADER_BUFFER *rec); +static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon); +static LSN parse_checkpoint_record(LSN lsn); +static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn, + LSN first_undo_lsn); +static int new_table(uint16 sid, const char *name, LSN lsn_of_file_id); +static int new_page(uint32 fileid, pgcache_page_no_t pageid, LSN rec_lsn, + struct st_dirty_page *dirty_page); +static int close_all_tables(void); +static my_bool close_one_table(const char *name, TRANSLOG_ADDRESS addr); +static void print_redo_phase_progress(TRANSLOG_ADDRESS addr); +static void delete_all_transactions(); + +/** @brief global [out] buffer for translog_read_record(); never shrinks */ +static struct +{ + /* + uchar* is more adapted (less casts) than char*, thus we don't use + LEX_STRING. 
+ */ + uchar *str; + size_t length; +} log_record_buffer; +static void enlarge_buffer(const TRANSLOG_HEADER_BUFFER *rec) +{ + if (log_record_buffer.length < rec->record_length) + { + log_record_buffer.length= rec->record_length; + log_record_buffer.str= my_realloc(log_record_buffer.str, + rec->record_length, + MYF(MY_WME | MY_ALLOW_ZERO_PTR)); + } +} +/** @brief Tells what kind of progress message was printed to the error log */ +static enum recovery_message_type +{ + REC_MSG_NONE= 0, REC_MSG_REDO, REC_MSG_UNDO, REC_MSG_FLUSH +} recovery_message_printed; + + +/* Hook to ensure we get nicer output if we get an error */ + +int maria_recover_error_handler_hook(uint error, const char *str, + myf flags) +{ + if (procent_printed) + { + procent_printed= 0; + fputc('\n', stderr); + fflush(stderr); + } + return (*save_error_handler_hook)(error, str, flags); +} + +/* Define this if you want gdb to break in some interesting situations */ +#define ALERT_USER() + +static void print_preamble() +{ + ma_message_no_user(ME_JUST_INFO, "starting recovery"); +} + + +/** + @brief Recovers from the last checkpoint. + + Runs the REDO phase using special structures, then sets up the playground + of runtime: recreates transactions inside trnman, open tables with their + two-byte-id mapping; takes a checkpoint and runs the UNDO phase. Closes all + tables. 
+ + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +int maria_recovery_from_log(void) +{ + int res= 1; + FILE *trace_file; + uint warnings_count; +#ifdef EXTRA_DEBUG + char name_buff[FN_REFLEN]; +#endif + DBUG_ENTER("maria_recovery_from_log"); + + DBUG_ASSERT(!maria_in_recovery); + maria_in_recovery= TRUE; + +#ifdef EXTRA_DEBUG + fn_format(name_buff, "aria_recovery.trace", maria_data_root, "", MYF(0)); + trace_file= my_fopen(name_buff, O_WRONLY|O_APPEND|O_CREAT, MYF(MY_WME)); +#else + trace_file= NULL; /* no trace file for being fast */ +#endif + tprint(trace_file, "TRACE of the last Aria recovery from mysqld\n"); + DBUG_ASSERT(maria_pagecache->inited); + res= maria_apply_log(LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, MARIA_LOG_APPLY, + trace_file, TRUE, TRUE, TRUE, &warnings_count); + if (!res) + { + if (warnings_count == 0 && recovery_found_crashed_tables == 0) + tprint(trace_file, "SUCCESS\n"); + else + tprint(trace_file, "DOUBTFUL (%u warnings, check previous output)\n", + warnings_count); + } + if (trace_file) + my_fclose(trace_file, MYF(0)); + maria_in_recovery= FALSE; + DBUG_RETURN(res); +} + + +/** + @brief Displays and/or applies the log + + @param from_lsn LSN from which log reading/applying should start; + LSN_IMPOSSIBLE means "use last checkpoint" + @param end_lsn Apply until this. LSN_IMPOSSIBLE means until end. + @param apply how log records should be applied or not + @param trace_file trace file where progress/debug messages will go + @param skip_DDLs_arg Should DDL records (CREATE/RENAME/DROP/REPAIR) + be skipped by the REDO phase or not + @param take_checkpoints Should we take checkpoints or not. + @param[out] warnings_count Count of warnings will be put there + + @todo This trace_file thing is primitive; soon we will make it similar to + ma_check_print_warning() etc, and a successful recovery does not need to + create a trace file. But for debugging now it is useful. 
+ + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +int maria_apply_log(LSN from_lsn, LSN end_lsn, + enum maria_apply_log_way apply, + FILE *trace_file, + my_bool should_run_undo_phase, my_bool skip_DDLs_arg, + my_bool take_checkpoints, uint *warnings_count) +{ + int error= 0; + uint uncommitted_trans; + ulonglong old_now; + my_bool abort_message_printed= 0; + DBUG_ENTER("maria_apply_log"); + + DBUG_ASSERT(apply == MARIA_LOG_APPLY || !should_run_undo_phase); + DBUG_ASSERT(!maria_multi_threaded); + recovery_warnings= recovery_found_crashed_tables= 0; + maria_recovery_changed_data= 0; + /* checkpoints can happen only if TRNs have been built */ + DBUG_ASSERT(should_run_undo_phase || !take_checkpoints); + DBUG_ASSERT(end_lsn == LSN_IMPOSSIBLE || should_run_undo_phase == 0); + all_active_trans= (struct st_trn_for_recovery *) + my_malloc((SHORT_TRID_MAX + 1) * sizeof(struct st_trn_for_recovery), + MYF(MY_ZEROFILL)); + all_tables= (struct st_table_for_recovery *) + my_malloc((SHARE_ID_MAX + 1) * sizeof(struct st_table_for_recovery), + MYF(MY_ZEROFILL)); + + save_error_handler_hook= error_handler_hook; + error_handler_hook= maria_recover_error_handler_hook; + + if (!all_active_trans || !all_tables) + goto err; + + if (take_checkpoints && ma_checkpoint_init(0)) + goto err; + + recovery_message_printed= REC_MSG_NONE; + checkpoint_useful= trns_created= FALSE; + tracef= trace_file; +#ifdef INSTANT_FLUSH_OF_MESSAGES + /* enable this for instant flush of messages to trace file */ + setbuf(tracef, NULL); +#endif + skip_DDLs= skip_DDLs_arg; + skipped_undo_phase= 0; + + if (from_lsn == LSN_IMPOSSIBLE) + { + if (last_checkpoint_lsn == LSN_IMPOSSIBLE) + { + from_lsn= translog_first_lsn_in_log(); + if (unlikely(from_lsn == LSN_ERROR)) + goto err; + } + else + { + from_lsn= parse_checkpoint_record(last_checkpoint_lsn); + if (from_lsn == LSN_ERROR) + goto err; + } + } + + now= my_getsystime(); + in_redo_phase= TRUE; + trnman_init(max_trid_in_control_file); + if 
(run_redo_phase(from_lsn, end_lsn, apply)) + { + ma_message_no_user(0, "Redo phase failed"); + trnman_destroy(); + goto err; + } + trnman_destroy(); + + if (end_lsn != LSN_IMPOSSIBLE) + { + abort_message_printed= 1; + if (!trace_file) + fputc('\n', stderr); + my_message(HA_ERR_INITIALIZATION, + "Maria recovery aborted as end_lsn/end of file was reached", + MYF(0)); + goto err2; + } + + if ((uncommitted_trans= + end_of_redo_phase(should_run_undo_phase)) == (uint)-1) + { + ma_message_no_user(0, "End of redo phase failed"); + goto err; + } + in_redo_phase= FALSE; + + old_now= now; + now= my_getsystime(); + if (recovery_message_printed == REC_MSG_REDO) + { + double phase_took= (now - old_now)/10000000.0; + /* + Detailed progress info goes to stderr, because ma_message_no_user() + cannot put several messages on one line. + */ + procent_printed= 1; + fprintf(stderr, " (%.1f seconds); ", phase_took); + fflush(stderr); + } + + /** + REDO phase does not fill blocks' rec_lsn, so a checkpoint now would be + wrong: if a future recovery used it, the REDO phase would always + start from the checkpoint and never from before, wrongly skipping REDOs + (tested). Another problem is that the REDO phase uses + PAGECACHE_PLAIN_PAGE, while Checkpoint only collects PAGECACHE_LSN_PAGE. + + @todo fix this. pagecache_write() now can have a rec_lsn argument. And we + could make a function which goes through pages at end of REDO phase and + changes their type. + */ +#ifdef FIX_AND_ENABLE_LATER + if (take_checkpoints && checkpoint_useful) + { + /* + We take a checkpoint as it can save future recovery work if we crash + during the UNDO phase. But we don't flush pages, as UNDOs will change + them again probably. 
+ If we wanted to take checkpoints in the middle of the REDO phase, at a + moment when we haven't reached the end of log so don't have exact data + about transactions, we could write a special checkpoint: containing only + the list of dirty pages, otherwise to be treated as if it was at the + same LSN as the last checkpoint. + */ + if (ma_checkpoint_execute(CHECKPOINT_INDIRECT, FALSE)) + goto err; + } +#endif + + if (should_run_undo_phase) + { + if (run_undo_phase(uncommitted_trans)) + { + ma_message_no_user(0, "Undo phase failed"); + goto err; + } + } + else if (uncommitted_trans > 0) + { + eprint(tracef, "***WARNING: %u uncommitted transactions; some tables may" + " be left inconsistent!***", uncommitted_trans); + recovery_warnings++; + } + + if (skipped_undo_phase) + { + /* + We could want to print a list of tables for which UNDOs were skipped, + but not one line per skipped UNDO. + */ + eprint(tracef, "***WARNING: %lu UNDO records skipped in UNDO phase; some" + " tables may be left inconsistent!***", skipped_undo_phase); + recovery_warnings++; + } + + old_now= now; + now= my_getsystime(); + if (recovery_message_printed == REC_MSG_UNDO) + { + double phase_took= (now - old_now)/10000000.0; + procent_printed= 1; + fprintf(stderr, " (%.1f seconds); ", phase_took); + fflush(stderr); + } + + /* + we don't use maria_panic() because it would maria_end(), and Recovery does + not want that (we want to keep some modules initialized for runtime). 
+ */ + if (close_all_tables()) + { + ma_message_no_user(0, "closing of tables failed"); + goto err; + } + + old_now= now; + now= my_getsystime(); + if (recovery_message_printed == REC_MSG_FLUSH) + { + double phase_took= (now - old_now)/10000000.0; + procent_printed= 1; + fprintf(stderr, " (%.1f seconds); ", phase_took); + fflush(stderr); + } + + if (take_checkpoints && checkpoint_useful) + { + /* No dirty pages, all tables are closed, no active transactions, save: */ + if (ma_checkpoint_execute(CHECKPOINT_FULL, FALSE)) + goto err; + } + + goto end; +err: + tprint(tracef, "\nRecovery of tables with transaction logs FAILED\n"); +err2: + if (trns_created) + delete_all_transactions(); + error= 1; + if (close_all_tables()) + { + ma_message_no_user(0, "closing of tables failed"); + } +end: + error_handler_hook= save_error_handler_hook; + hash_free(&all_dirty_pages); + bzero(&all_dirty_pages, sizeof(all_dirty_pages)); + my_free(dirty_pages_pool, MYF(MY_ALLOW_ZERO_PTR)); + dirty_pages_pool= NULL; + my_free(all_tables, MYF(MY_ALLOW_ZERO_PTR)); + all_tables= NULL; + my_free(all_active_trans, MYF(MY_ALLOW_ZERO_PTR)); + all_active_trans= NULL; + my_free(log_record_buffer.str, MYF(MY_ALLOW_ZERO_PTR)); + log_record_buffer.str= NULL; + log_record_buffer.length= 0; + ma_checkpoint_end(); + *warnings_count= recovery_warnings + recovery_found_crashed_tables; + if (recovery_message_printed != REC_MSG_NONE) + { + if (procent_printed) + { + procent_printed= 0; + fprintf(stderr, "\n"); + fflush(stderr); + } + if (!error) + { + ma_message_no_user(ME_JUST_INFO, "recovery done"); + maria_recovery_changed_data= 1; + } + } + else if (!error && max_trid_in_control_file != max_long_trid) + { + /* + maria_end() will set max trid in log file so that one can run + maria_chk on the tables + */ + maria_recovery_changed_data= 1; + } + + if (error && !abort_message_printed) + { + if (!trace_file) + fputc('\n', stderr); + my_message(HA_ERR_INITIALIZATION, + "Aria recovery failed. 
Please run aria_chk -r on all Aria " + "tables and delete all aria_log.######## files", MYF(0)); + } + procent_printed= 0; + /* + We don't cleanly close tables if we hit some error (may corrupt them by + flushing some wrong blocks made from wrong REDOs). It also leaves their + open_count>0, which ensures that --aria-recover, if used, will try to + repair them. + */ + DBUG_RETURN(error); +} + + +/* very basic info about the record's header */ +static void display_record_position(const LOG_DESC *log_desc, + const TRANSLOG_HEADER_BUFFER *rec, + uint number) +{ + /* + if number==0, we're going over records which we had already seen and which + form a group, so we indent below the group's end record + */ + tprint(tracef, + "%sRec#%u LSN (%lu,0x%lx) short_trid %u %s(num_type:%u) len %lu\n", + number ? "" : " ", number, LSN_IN_PARTS(rec->lsn), + rec->short_trid, log_desc->name, rec->type, + (ulong)rec->record_length); + if (rec->type == LOGREC_DEBUG_INFO) + { + /* Print some extra information */ + (*log_desc->record_execute_in_redo_phase)(rec); + } +} + + +static int display_and_apply_record(const LOG_DESC *log_desc, + const TRANSLOG_HEADER_BUFFER *rec) +{ + int error; + if (log_desc->record_execute_in_redo_phase == NULL) + { + /* die on all not-yet-handled records :) */ + DBUG_ASSERT("one more hook to write" == 0); + return 1; + } + if (rec->type == LOGREC_DEBUG_INFO) + { + /* Query already printed by display_record_position() */ + return 0; + } + if ((error= (*log_desc->record_execute_in_redo_phase)(rec))) + eprint(tracef, "Got error %d when executing record %s", + my_errno, log_desc->name); + return error; +} + + +prototype_redo_exec_hook(LONG_TRANSACTION_ID) +{ + uint16 sid= rec->short_trid; + TrID long_trid= all_active_trans[sid].long_trid; + /* + Any incomplete group should be of an old crash which already had a + recovery and thus has logged INCOMPLETE_GROUP which we must have seen. 
+ */ + DBUG_ASSERT(all_active_trans[sid].group_start_lsn == LSN_IMPOSSIBLE); + if (long_trid != 0) + { + LSN ulsn= all_active_trans[sid].undo_lsn; + /* + If the first record of that transaction is after 'rec', it's probably + because that transaction was found in the checkpoint record, and then + it's ok, we can forget about that transaction (we'll meet it later + again in the REDO phase) and replace it with the one in 'rec'. + */ + if ((ulsn != LSN_IMPOSSIBLE) && + (cmp_translog_addr(ulsn, rec->lsn) < 0)) + { + char llbuf[22]; + llstr(long_trid, llbuf); + eprint(tracef, "Found an old transaction long_trid %s short_trid %u" + " with same short id as this new transaction, and has neither" + " committed nor rollback (undo_lsn: (%lu,0x%lx))", + llbuf, sid, LSN_IN_PARTS(ulsn)); + goto err; + } + } + long_trid= uint6korr(rec->header); + new_transaction(sid, long_trid, LSN_IMPOSSIBLE, LSN_IMPOSSIBLE); + goto end; +err: + ALERT_USER(); + return 1; +end: + return 0; +} + + +static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn, + LSN first_undo_lsn) +{ + char llbuf[22]; + all_active_trans[sid].long_trid= long_id; + llstr(long_id, llbuf); + tprint(tracef, "Transaction long_trid %s short_trid %u starts," + " undo_lsn (%lu,0x%lx) first_undo_lsn (%lu,0x%lx)\n", + llbuf, sid, LSN_IN_PARTS(undo_lsn), LSN_IN_PARTS(first_undo_lsn)); + all_active_trans[sid].undo_lsn= undo_lsn; + all_active_trans[sid].first_undo_lsn= first_undo_lsn; + set_if_bigger(max_long_trid, long_id); +} + + +prototype_redo_exec_hook_dummy(CHECKPOINT) +{ + /* the only checkpoint we care about was found via control file, ignore */ + return 0; +} + + +prototype_redo_exec_hook_dummy(INCOMPLETE_GROUP) +{ + /* abortion was already made */ + return 0; +} + + +prototype_redo_exec_hook(INCOMPLETE_LOG) +{ + MARIA_HA *info; + if (skip_DDLs) + { + tprint(tracef, "we skip DDLs\n"); + return 0; + } + if ((info= get_MARIA_HA_from_REDO_record(rec)) == NULL) + { + /* no such table, don't need to warn */ + return 
0; + } + + if (maria_is_crashed(info)) + return 0; + + if (info->s->state.is_of_horizon > rec->lsn) + { + /* + This table was repaired at a time after this log entry. + We can assume that all rows was inserted sucessfully and we don't + have to warn about that the inserted data was not logged + */ + return 0; + } + + /* + Example of what can go wrong when replaying DDLs: + CREATE TABLE t (logged); INSERT INTO t VALUES(1) (logged); + ALTER TABLE t ... which does + CREATE a temporary table #sql... (logged) + INSERT data from t into #sql... (not logged) + RENAME #sql TO t (logged) + Removing tables by hand and replaying the log will leave in the + end an empty table "t": missing records. If after the RENAME an INSERT + into t was done, that row had number 1 in its page, executing the + REDO_INSERT_ROW_HEAD on the recreated empty t will fail (assertion + failure in _ma_apply_redo_insert_row_head_or_tail(): new data page is + created whereas rownr is not 0). + So when the server disables logging for ALTER TABLE or CREATE SELECT, it + logs LOGREC_INCOMPLETE_LOG to warn aria_read_log and then the user. + + Another issue is that replaying of DDLs is not correct enough to work if + there was a crash during a DDL (see comment in execution of + REDO_RENAME_TABLE ). + */ + + eprint(tracef, "***WARNING: Aria engine currently logs no records " + "about insertion of data by ALTER TABLE and CREATE SELECT, " + "as they are not necessary for recovery; " + "present applying of log records to table '%s' may well not work." 
+ "***", info->s->index_file_name.str); + + /* Prevent using the table for anything else than undo repair */ + _ma_mark_file_crashed(info->s); + recovery_warnings++; + return 0; +} + + +static my_bool create_database_if_not_exists(const char *name) +{ + char dirname[FN_REFLEN]; + size_t length; + MY_STAT stat_info; + DBUG_ENTER("create_database_if_not_exists"); + + dirname_part(dirname, name, &length); + if (!length) + { + /* Skip files without directores */ + DBUG_RETURN(0); + } + /* + Safety; Don't create files with hard path; + Should never happen with MariaDB + If hard path, then error will be detected when trying to create index file + */ + if (test_if_hard_path(dirname)) + DBUG_RETURN(0); + + if (my_stat(dirname,&stat_info,MYF(0))) + DBUG_RETURN(0); + + + tprint(tracef, "Creating not existing database '%s'\n", dirname); + if (my_mkdir(dirname, 0777, MYF(MY_WME))) + { + eprint(tracef, "***WARNING: Can't create not existing database '%s'", + dirname); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + + + + +prototype_redo_exec_hook(REDO_CREATE_TABLE) +{ + File dfile= -1, kfile= -1; + char *linkname_ptr, filename[FN_REFLEN], *name, *ptr, *ptr2, + *data_file_name, *index_file_name; + uchar *kfile_header; + myf create_flag; + uint flags; + int error= 1, create_mode= O_RDWR | O_TRUNC, i; + MARIA_HA *info= NULL; + uint kfile_size_before_extension, keystart; + DBUG_ENTER("exec_REDO_LOGREC_REDO_CREATE_TABLE"); + + if (skip_DDLs) + { + tprint(tracef, "we skip DDLs\n"); + DBUG_RETURN(0); + } + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + name= (char *)log_record_buffer.str; + /* + TRUNCATE TABLE and REPAIR USE_FRM call maria_create(), so below we can + find a REDO_CREATE_TABLE for a table which we have open, that's why we + need to look for any open instances and close them first. 
+ */ + if (close_one_table(name, rec->lsn)) + { + eprint(tracef, "Table '%s' got error %d on close", name, my_errno); + ALERT_USER(); + goto end; + } + /* we try hard to get create_rename_lsn, to avoid mistakes if possible */ + info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR); + if (info) + { + MARIA_SHARE *share= info->s; + /* check that we're not already using it */ + if (share->reopen != 1) + { + eprint(tracef, "Table '%s is already open (reopen=%u)", + name, share->reopen); + ALERT_USER(); + goto end; + } + DBUG_ASSERT(share->now_transactional == share->base.born_transactional); + if (!share->base.born_transactional) + { + /* + could be that transactional table was later dropped, and a non-trans + one was renamed to its name, thus create_rename_lsn is 0 and should + not be trusted. + */ + tprint(tracef, "Table '%s' is not transactional, ignoring creation\n", + name); + ALERT_USER(); + error= 0; + goto end; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, "Table '%s' has create_rename_lsn (%lu,0x%lx) more " + "recent than record, ignoring creation", + name, LSN_IN_PARTS(share->state.create_rename_lsn)); + error= 0; + goto end; + } + if (maria_is_crashed(info)) + { + eprint(tracef, "Table '%s' is crashed, can't recreate it", name); + ALERT_USER(); + goto end; + } + maria_close(info); + info= NULL; + } + else + { + /* one or two files absent, or header corrupted... */ + tprint(tracef, "Table '%s' can't be opened (Error: %d)\n", + name, my_errno); + } + /* if does not exist, or is older, overwrite it */ + ptr= name + strlen(name) + 1; + if ((flags= ptr[0] ? 
HA_DONT_TOUCH_DATA : 0)) + tprint(tracef, ", we will only touch index file"); + ptr++; + kfile_size_before_extension= uint2korr(ptr); + ptr+= 2; + keystart= uint2korr(ptr); + ptr+= 2; + kfile_header= (uchar *)ptr; + ptr+= kfile_size_before_extension; + /* set header lsns */ + ptr2= (char *) kfile_header + sizeof(info->s->state.header) + + MARIA_FILE_CREATE_RENAME_LSN_OFFSET; + for (i= 0; i<3; i++) + { + lsn_store(ptr2, rec->lsn); + ptr2+= LSN_STORE_SIZE; + } + data_file_name= ptr; + ptr+= strlen(data_file_name) + 1; + index_file_name= ptr; + ptr+= strlen(index_file_name) + 1; + /** @todo handle symlinks */ + if (data_file_name[0] || index_file_name[0]) + { + eprint(tracef, "Table '%s' DATA|INDEX DIRECTORY clauses are not handled", + name); + goto end; + } + if (create_database_if_not_exists(name)) + goto end; + fn_format(filename, name, "", MARIA_NAME_IEXT, + (MY_UNPACK_FILENAME | + (flags & HA_DONT_TOUCH_DATA) ? MY_RETURN_REAL_PATH : 0) | + MY_APPEND_EXT); + linkname_ptr= NULL; + create_flag= MY_DELETE_OLD; + tprint(tracef, "Table '%s' creating as '%s'\n", name, filename); + if ((kfile= my_create_with_symlink(linkname_ptr, filename, 0, create_mode, + MYF(MY_WME|create_flag))) < 0) + { + eprint(tracef, "Failed to create index file"); + goto end; + } + if (my_pwrite(kfile, kfile_header, + kfile_size_before_extension, 0, MYF(MY_NABP|MY_WME)) || + my_chsize(kfile, keystart, 0, MYF(MY_WME))) + { + eprint(tracef, "Failed to write to index file"); + goto end; + } + if (!(flags & HA_DONT_TOUCH_DATA)) + { + fn_format(filename,name,"", MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT); + linkname_ptr= NULL; + create_flag=MY_DELETE_OLD; + if (((dfile= + my_create_with_symlink(linkname_ptr, filename, 0, create_mode, + MYF(MY_WME | create_flag))) < 0) || + my_close(dfile, MYF(MY_WME))) + { + eprint(tracef, "Failed to create data file"); + goto end; + } + /* + we now have an empty data file. 
To be able to + _ma_initialize_data_file() we need some pieces of the share to be + correctly filled. So we just open the table (fortunately, an empty + data file does not preclude this). + */ + if (((info= maria_open(name, O_RDONLY, 0)) == NULL) || + _ma_initialize_data_file(info->s, info->dfile.file)) + { + eprint(tracef, "Failed to open new table or write to data file"); + goto end; + } + } + error= 0; +end: + if (kfile >= 0) + error|= my_close(kfile, MYF(MY_WME)); + if (info != NULL) + error|= maria_close(info); + DBUG_RETURN(error); +} + + +prototype_redo_exec_hook(REDO_RENAME_TABLE) +{ + char *old_name, *new_name; + int error= 1; + MARIA_HA *info= NULL; + DBUG_ENTER("exec_REDO_LOGREC_REDO_RENAME_TABLE"); + + if (skip_DDLs) + { + tprint(tracef, "we skip DDLs\n"); + DBUG_RETURN(0); + } + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + old_name= (char *)log_record_buffer.str; + new_name= old_name + strlen(old_name) + 1; + tprint(tracef, "Table '%s' to rename to '%s'; old-name table ", old_name, + new_name); + /* + Here is why we skip CREATE/DROP/RENAME when doing a recovery from + ha_maria (whereas we do when called from aria_read_log). Consider: + CREATE TABLE t; + RENAME TABLE t to u; + DROP TABLE u; + RENAME TABLE v to u; # crash between index rename and data rename. + And do a Recovery (not removing tables beforehand). + Recovery replays CREATE, then RENAME: the maria_open("t") works, + maria_open("u") does not (no data file) so table "u" is considered + inexistent and so maria_rename() is done which overwrites u's index file, + which is lost. Ok, the data file (v.MAD) is still available, but only a + REPAIR USE_FRM can rebuild the index, which is unsafe and downtime. 
+ So it is preferrable to not execute RENAME, and leave the "mess" of files, + rather than possibly destroy a file. DBA will manually rename files. + A safe recovery method would probably require checking the existence of + the index file and of the data file separately (not via maria_open()), and + maybe also to store a create_rename_lsn in the data file too + For now, all we risk is to leave the mess (half-renamed files) left by the + crash. We however sync files and directories at each file rename. The SQL + layer is anyway not crash-safe for DDLs (except the repartioning-related + ones). + We replay DDLs in aria_read_log to be able to recreate tables from + scratch. It means that "aria_read_log -a" should not be used on a + database which just crashed during a DDL. And also ALTER TABLE does not + log insertions of records into the temporary table, so replaying may + fail (grep for INCOMPLETE_LOG in files). + */ + info= maria_open(old_name, O_RDONLY, HA_OPEN_FOR_REPAIR); + if (info) + { + MARIA_SHARE *share= info->s; + if (!share->base.born_transactional) + { + tprint(tracef, ", is not transactional, ignoring renaming\n"); + ALERT_USER(); + error= 0; + goto end; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than" + " record, ignoring renaming", + LSN_IN_PARTS(share->state.create_rename_lsn)); + error= 0; + goto end; + } + if (maria_is_crashed(info)) + { + tprint(tracef, ", is crashed, can't rename it"); + ALERT_USER(); + goto end; + } + if (close_one_table(info->s->open_file_name.str, rec->lsn) || + maria_close(info)) + goto end; + info= NULL; + tprint(tracef, ", is ok for renaming; new-name table "); + } + else /* one or two files absent, or header corrupted... 
*/ + { + tprint(tracef, ", can't be opened, probably does not exist"); + error= 0; + goto end; + } + /* + We must also check the create_rename_lsn of the 'new_name' table if it + exists: otherwise we may, with our rename which overwrites, destroy + another table. For example: + CREATE TABLE t; + RENAME t to u; + DROP TABLE u; + RENAME v to u; # v is an old table, its creation/insertions not in log + And start executing the log (without removing tables beforehand): creates + t, renames it to u (if not testing create_rename_lsn) thus overwriting + old-named v, drops u, and we are stuck, we have lost data. + */ + info= maria_open(new_name, O_RDONLY, HA_OPEN_FOR_REPAIR); + if (info) + { + MARIA_SHARE *share= info->s; + /* We should not have open instances on this table. */ + if (share->reopen != 1) + { + tprint(tracef, ", is already open (reopen=%u)\n", share->reopen); + ALERT_USER(); + goto end; + } + if (!share->base.born_transactional) + { + tprint(tracef, ", is not transactional, ignoring renaming\n"); + ALERT_USER(); + goto drop; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than" + " record, ignoring renaming", + LSN_IN_PARTS(share->state.create_rename_lsn)); + /* + We have to drop the old_name table. Consider: + CREATE TABLE t; + CREATE TABLE v; + RENAME TABLE t to u; + DROP TABLE u; + RENAME TABLE v to u; + and apply the log without removing tables beforehand. t will be + created, v too; in REDO_RENAME u will be more recent, but we still + have to drop t otherwise it stays. + */ + goto drop; + } + if (maria_is_crashed(info)) + { + tprint(tracef, ", is crashed, can't rename it"); + ALERT_USER(); + goto end; + } + if (maria_close(info)) + goto end; + info= NULL; + /* abnormal situation */ + tprint(tracef, ", exists but is older than record, can't rename it"); + goto end; + } + else /* one or two files absent, or header corrupted... 
*/ + tprint(tracef, ", can't be opened, probably does not exist"); + tprint(tracef, ", renaming '%s'", old_name); + if (maria_rename(old_name, new_name)) + { + eprint(tracef, "Failed to rename table"); + goto end; + } + info= maria_open(new_name, O_RDONLY, 0); + if (info == NULL) + { + eprint(tracef, "Failed to open renamed table"); + goto end; + } + if (_ma_update_state_lsns(info->s, rec->lsn, info->s->state.create_trid, + TRUE, TRUE)) + goto end; + if (maria_close(info)) + goto end; + info= NULL; + error= 0; + goto end; +drop: + tprint(tracef, ", only dropping '%s'", old_name); + if (maria_delete_table(old_name)) + { + eprint(tracef, "Failed to drop table"); + goto end; + } + error= 0; + goto end; +end: + tprint(tracef, "\n"); + if (info != NULL) + error|= maria_close(info); + DBUG_RETURN(error); +} + + +/* + The record may come from REPAIR, ALTER TABLE ENABLE KEYS, OPTIMIZE. +*/ +prototype_redo_exec_hook(REDO_REPAIR_TABLE) +{ + int error= 1; + MARIA_HA *info; + HA_CHECK param; + char *name; + my_bool quick_repair; + DBUG_ENTER("exec_REDO_LOGREC_REDO_REPAIR_TABLE"); + + if (skip_DDLs) + { + /* + REPAIR is not exactly a DDL, but it manipulates files without logging + insertions into them. + */ + tprint(tracef, "we skip DDLs\n"); + DBUG_RETURN(0); + } + if ((info= get_MARIA_HA_from_REDO_record(rec)) == NULL) + DBUG_RETURN(0); + if (maria_is_crashed(info)) + { + tprint(tracef, "we skip repairing crashed table\n"); + DBUG_RETURN(0); + } + /* + Otherwise, the mapping is newer than the table, and our record is newer + than the mapping, so we can repair. 
+ */ + tprint(tracef, " repairing...\n"); + + maria_chk_init(¶m); + param.isam_file_name= name= info->s->open_file_name.str; + param.testflag= uint8korr(rec->header + FILEID_STORE_SIZE); + param.tmpdir= maria_tmpdir; + param.max_trid= max_long_trid; + DBUG_ASSERT(maria_tmpdir); + + info->s->state.key_map= uint8korr(rec->header + FILEID_STORE_SIZE + 8); + quick_repair= test(param.testflag & T_QUICK); + + if (param.testflag & T_REP_PARALLEL) + { + if (maria_repair_parallel(¶m, info, name, quick_repair)) + goto end; + } + else if (param.testflag & T_REP_BY_SORT) + { + if (maria_repair_by_sort(¶m, info, name, quick_repair)) + goto end; + } + else if (maria_repair(¶m, info, name, quick_repair)) + goto end; + + if (_ma_update_state_lsns(info->s, rec->lsn, trnman_get_min_safe_trid(), + TRUE, !(param.testflag & T_NO_CREATE_RENAME_LSN))) + goto end; + error= 0; + +end: + DBUG_RETURN(error); +} + + +prototype_redo_exec_hook(REDO_DROP_TABLE) +{ + char *name; + int error= 1; + MARIA_HA *info; + if (skip_DDLs) + { + tprint(tracef, "we skip DDLs\n"); + return 0; + } + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + name= (char *)log_record_buffer.str; + tprint(tracef, "Table '%s'", name); + info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR); + if (info) + { + MARIA_SHARE *share= info->s; + if (!share->base.born_transactional) + { + tprint(tracef, ", is not transactional, ignoring removal\n"); + ALERT_USER(); + error= 0; + goto end; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than" + " record, ignoring removal", + LSN_IN_PARTS(share->state.create_rename_lsn)); + error= 0; + goto end; + } + if (maria_is_crashed(info)) + { + tprint(tracef, ", is crashed, can't drop it"); + ALERT_USER(); + goto 
end; + } + if (close_one_table(info->s->open_file_name.str, rec->lsn) || + maria_close(info)) + goto end; + info= NULL; + /* if it is older, or its header is corrupted, drop it */ + tprint(tracef, ", dropping '%s'", name); + if (maria_delete_table(name)) + { + eprint(tracef, "Failed to drop table"); + goto end; + } + } + else /* one or two files absent, or header corrupted... */ + tprint(tracef,", can't be opened, probably does not exist"); + error= 0; +end: + tprint(tracef, "\n"); + if (info != NULL) + error|= maria_close(info); + return error; +} + + +prototype_redo_exec_hook(FILE_ID) +{ + uint16 sid; + int error= 1; + const char *name; + MARIA_HA *info; + DBUG_ENTER("exec_REDO_LOGREC_FILE_ID"); + + if (cmp_translog_addr(rec->lsn, checkpoint_start) < 0) + { + /* + If that mapping was still true at checkpoint time, it was found in + checkpoint record, no need to recreate it. If that mapping had ended at + checkpoint time (table was closed or repaired), a flush and force + happened and so mapping is not needed. 
+ */ + tprint(tracef, "ignoring because before checkpoint\n"); + DBUG_RETURN(0); + } + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + sid= fileid_korr(log_record_buffer.str); + info= all_tables[sid].info; + if (info != NULL) + { + tprint(tracef, " Closing table '%s'\n", info->s->open_file_name.str); + prepare_table_for_close(info, rec->lsn); + if (maria_close(info)) + { + eprint(tracef, "Failed to close table"); + goto end; + } + all_tables[sid].info= NULL; + } + name= (char *)log_record_buffer.str + FILEID_STORE_SIZE; + if (new_table(sid, name, rec->lsn)) + goto end; + error= 0; +end: + DBUG_RETURN(error); +} + + +static int new_table(uint16 sid, const char *name, LSN lsn_of_file_id) +{ + /* + -1 (skip table): close table and return 0; + 1 (error): close table and return 1; + 0 (success): leave table open and return 0. + */ + int error= 1; + MARIA_HA *info; + MARIA_SHARE *share; + my_off_t dfile_len, kfile_len; + DBUG_ENTER("new_table"); + + checkpoint_useful= TRUE; + if ((name == NULL) || (name[0] == 0)) + { + /* + we didn't use DBUG_ASSERT() because such record corruption could + silently pass in the "info == NULL" test below. 
+ */ + tprint(tracef, ", record is corrupted"); + info= NULL; + recovery_warnings++; + goto end; + } + tprint(tracef, "Table '%s', id %u", name, sid); + info= maria_open(name, O_RDWR, HA_OPEN_FOR_REPAIR); + if (info == NULL) + { + tprint(tracef, ", is absent (must have been dropped later?)" + " or its header is so corrupted that we cannot open it;" + " we skip it"); + if (my_errno != ENOENT) + recovery_found_crashed_tables++; + error= 0; + goto end; + } + share= info->s; + /* check that we're not already using it */ + if (share->reopen != 1) + { + tprint(tracef, ", is already open (reopen=%u)\n", share->reopen); + /* + It could be that we have in the log + FILE_ID(t1,10) ... (t1 was flushed) ... FILE_ID(t1,12); + */ + if (close_one_table(share->open_file_name.str, lsn_of_file_id)) + goto end; + /* + We should not try to get length of data/index files as the files + are not on disk yet. + */ + _ma_tmp_disable_logging_for_table(info, FALSE); + goto set_lsn_of_file_id; + } + if (!share->base.born_transactional) + { + /* + This can happen if one converts a transactional table to a + not transactional table + */ + tprint(tracef, ", is not transactional. Ignoring open request"); + error= -1; + recovery_warnings++; + goto end; + } + if (cmp_translog_addr(lsn_of_file_id, share->state.create_rename_lsn) <= 0) + { + tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than" + " LOGREC_FILE_ID's LSN (%lu,0x%lx), ignoring open request", + LSN_IN_PARTS(share->state.create_rename_lsn), + LSN_IN_PARTS(lsn_of_file_id)); + recovery_warnings++; + error= -1; + goto end; + /* + Note that we tested that before testing corruption; a recent corrupted + table is not a blocker for the present log record. + */ + } + if (maria_is_crashed(info)) + { + eprint(tracef, "Table '%s' is crashed, skipping it. 
Please repair it with" + " aria_chk -r", share->open_file_name.str); + recovery_found_crashed_tables++; + error= -1; /* not fatal, try with other tables */ + goto end; + /* + Note that if a first recovery fails to apply a REDO, it marks the table + corrupted and stops the entire recovery. A second recovery will find the + table is marked corrupted and skip it (and thus possibly handle other + tables). + */ + } + /* don't log any records for this work */ + _ma_tmp_disable_logging_for_table(info, FALSE); + /* execution of some REDO records relies on data_file_length */ + dfile_len= my_seek(info->dfile.file, 0, SEEK_END, MYF(MY_WME)); + kfile_len= my_seek(info->s->kfile.file, 0, SEEK_END, MYF(MY_WME)); + if ((dfile_len == MY_FILEPOS_ERROR) || + (kfile_len == MY_FILEPOS_ERROR)) + { + tprint(tracef, ", length unknown\n"); + recovery_warnings++; + goto end; + } + if (share->state.state.data_file_length != dfile_len) + { + tprint(tracef, ", has wrong state.data_file_length (fixing it)"); + share->state.state.data_file_length= dfile_len; + } + if (share->state.state.key_file_length != kfile_len) + { + tprint(tracef, ", has wrong state.key_file_length (fixing it)"); + share->state.state.key_file_length= kfile_len; + } + if ((dfile_len % share->block_size) || (kfile_len % share->block_size)) + { + tprint(tracef, ", has too short last page\n"); + /* Recovery will fix this, no error */ + ALERT_USER(); + } + +set_lsn_of_file_id: + /* + This LSN serves in this situation; assume log is: + FILE_ID(6->"t2") REDO_INSERT(6) FILE_ID(6->"t1") CHECKPOINT(6->"t1") + then crash, checkpoint record is parsed and opens "t1" with id 6; assume + REDO phase starts from the REDO_INSERT above: it will wrongly try to + update a page of "t1". With this LSN below, REDO_INSERT can realize the + mapping is newer than itself, and not execute. + Same example is possible with UNDO_INSERT (update of the state). 
+ */ + info->s->lsn_of_file_id= lsn_of_file_id; + all_tables[sid].info= info; + /* + We don't set info->s->id, it would be useless (no logging in REDO phase); + if you change that, know that some records in REDO phase call + _ma_update_state_lsns() which resets info->s->id. + */ + tprint(tracef, ", opened"); + error= 0; +end: + tprint(tracef, "\n"); + if (error) + { + if (info != NULL) + maria_close(info); + if (error == -1) + error= 0; + } + DBUG_RETURN(error); +} + +/* + NOTE + This is called for REDO_INSERT_ROW_HEAD and READ_NEW_ROW_HEAD +*/ + +prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD) +{ + int error= 1; + uchar *buff= NULL; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + + { + /* + Table was skipped at open time (because later dropped/renamed, not + transactional, or create_rename_lsn newer than LOGREC_FILE_ID), or + record was skipped due to skip_redo_lsn; it is not an error. + */ + return 0; + } + /* + Note that REDO is per page, we still consider it if its transaction + committed long ago and is unknown. + */ + /* + If REDO's LSN is > page's LSN (read from disk), we are going to modify the + page and change its LSN. The normal runtime code stores the UNDO's LSN + into the page. Here storing the REDO's LSN (rec->lsn) would work + (we are not writing to the log here, so don't have to "flush up to UNDO's + LSN"). But in a test scenario where we do updates at runtime, then remove + tables, apply the log and check that this results in the same table as at + runtime, putting the same LSN as runtime had done will decrease + differences. So we use the UNDO's LSN which is current_group_end_lsn. 
+ */ + enlarge_buffer(rec); + if (log_record_buffer.str == NULL) + { + eprint(tracef, "Failed to read allocate buffer for record"); + goto end; + } + if (translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + buff= log_record_buffer.str; + if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn, + HEAD_PAGE, + (rec->type == + LOGREC_REDO_NEW_ROW_HEAD), + buff + FILEID_STORE_SIZE, + buff + + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE, + rec->record_length - + (FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE))) + goto end; + error= 0; +end: + return error; +} + +/* + NOTE + This is called for REDO_INSERT_ROW_TAIL and READ_NEW_ROW_TAIL +*/ + +prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL) +{ + int error= 1; + uchar *buff; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + buff= log_record_buffer.str; + if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn, + TAIL_PAGE, + (rec->type == + LOGREC_REDO_NEW_ROW_TAIL), + buff + FILEID_STORE_SIZE, + buff + + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE, + rec->record_length - + (FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE))) + goto end; + error= 0; + +end: + return error; +} + + +prototype_redo_exec_hook(REDO_INSERT_ROW_BLOBS) +{ + int error= 1; + uchar *buff; + uint number_of_blobs, number_of_ranges; + pgcache_page_no_t first_page, last_page; + char llbuf1[22], llbuf2[22]; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + if 
(log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + buff= log_record_buffer.str; + if (_ma_apply_redo_insert_row_blobs(info, current_group_end_lsn, + buff, rec->lsn, &number_of_blobs, + &number_of_ranges, + &first_page, &last_page)) + goto end; + llstr(first_page, llbuf1); + llstr(last_page, llbuf2); + tprint(tracef, " %u blobs %u ranges, first page %s last %s", + number_of_blobs, number_of_ranges, llbuf1, llbuf2); + + error= 0; + +end: + tprint(tracef, " \n"); + return error; +} + + +prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + if (_ma_apply_redo_purge_row_head_or_tail(info, current_group_end_lsn, + HEAD_PAGE, + rec->header + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + if (_ma_apply_redo_purge_row_head_or_tail(info, current_group_end_lsn, + TAIL_PAGE, + rec->header + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_FREE_BLOCKS) +{ + int error= 1; + uchar *buff; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + + buff= log_record_buffer.str; + if (_ma_apply_redo_free_blocks(info, current_group_end_lsn, + buff + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + 
+prototype_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + + if (_ma_apply_redo_free_head_or_tail(info, current_group_end_lsn, + rec->header + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_DELETE_ALL) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL) + return 0; + tprint(tracef, " deleting all %lu rows\n", + (ulong)info->s->state.state.records); + if (maria_delete_all_rows(info)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_INDEX) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + + if (_ma_apply_redo_index(info, current_group_end_lsn, + log_record_buffer.str + FILEID_STORE_SIZE, + rec->record_length - FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + +prototype_redo_exec_hook(REDO_INDEX_NEW_PAGE) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + + if (_ma_apply_redo_index_new_page(info, current_group_end_lsn, + log_record_buffer.str + FILEID_STORE_SIZE, + rec->record_length - FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_INDEX_FREE_PAGE) +{ + int error= 1; + MARIA_HA *info= 
get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + + if (_ma_apply_redo_index_free_page(info, current_group_end_lsn, + rec->header + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_BITMAP_NEW_PAGE) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + + if (cmp_translog_addr(rec->lsn, checkpoint_start) >= 0) + { + /* + Record is potentially after the bitmap flush made by Checkpoint, so has + to be replayed. It may overwrite a more recent state but that will be + corrected by all upcoming REDOs for data pages. + If the condition is false, we must not apply the record: it is unneeded + and nocive (may not be corrected as REDOs can be skipped due to + dirty-pages list). + */ + if (_ma_apply_redo_bitmap_new_page(info, current_group_end_lsn, + log_record_buffer.str + + FILEID_STORE_SIZE)) + goto end; + } + error= 0; +end: + return error; +} + + +static inline void set_undo_lsn_for_active_trans(uint16 short_trid, LSN lsn) +{ + if (all_active_trans[short_trid].long_trid == 0) + { + /* transaction unknown, so has committed or fully rolled back long ago */ + return; + } + all_active_trans[short_trid].undo_lsn= lsn; + if (all_active_trans[short_trid].first_undo_lsn == LSN_IMPOSSIBLE) + all_active_trans[short_trid].first_undo_lsn= lsn; +} + + +prototype_redo_exec_hook(UNDO_ROW_INSERT) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (info == NULL) + { + /* + Note that we set undo_lsn anyway. 
So that if the transaction is later + rolled back, this UNDO is tried for execution and we get a warning (as + it would then be abnormal that info==NULL). + */ + return 0; + } + share= info->s; + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + tprint(tracef, " state has LSN (%lu,0x%lx) older than record, updating" + " rows' count\n", LSN_IN_PARTS(share->state.is_of_horizon)); + share->state.state.records++; + if (share->calc_checksum) + { + uchar buff[HA_CHECKSUM_STORE_SIZE]; + if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + HA_CHECKSUM_STORE_SIZE, buff, NULL) != + HA_CHECKSUM_STORE_SIZE) + { + eprint(tracef, "Failed to read record"); + return 1; + } + share->state.state.checksum+= ha_checksum_korr(buff); + } + info->s->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + } + tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records); + /* Unpin all pages, stamp them with UNDO's LSN */ + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_ROW_DELETE) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (info == NULL) + return 0; + share= info->s; + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + tprint(tracef, " state older than record\n"); + share->state.state.records--; + if (share->calc_checksum) + { + uchar buff[HA_CHECKSUM_STORE_SIZE]; + if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 + + PAGERANGE_STORE_SIZE, + HA_CHECKSUM_STORE_SIZE, buff, NULL) != + HA_CHECKSUM_STORE_SIZE) + { + eprint(tracef, "Failed to read record"); + return 1; + } + share->state.state.checksum+= ha_checksum_korr(buff); + } + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_ROWS | 
STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + } + tprint(tracef, " rows' count %lu\n", (ulong)share->state.state.records); + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_ROW_UPDATE) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (info == NULL) + return 0; + share= info->s; + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + if (share->calc_checksum) + { + uchar buff[HA_CHECKSUM_STORE_SIZE]; + if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + HA_CHECKSUM_STORE_SIZE, buff, NULL) != + HA_CHECKSUM_STORE_SIZE) + { + eprint(tracef, "Failed to read record"); + return 1; + } + share->state.state.checksum+= ha_checksum_korr(buff); + } + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + } + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_KEY_INSERT) +{ + MARIA_HA *info; + MARIA_SHARE *share; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (!(info= get_MARIA_HA_from_UNDO_record(rec))) + return 0; + share= info->s; + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + const uchar *ptr= rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE; + uint keynr= key_nr_korr(ptr); + if (share->base.auto_key == (keynr + 1)) /* it's auto-increment */ + { + const HA_KEYSEG *keyseg= info->s->keyinfo[keynr].seg; + ulonglong value; + char llbuf[22]; + uchar *to; + tprint(tracef, " state older than record\n"); + /* we read the record to find the auto_increment value */ + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + to= log_record_buffer.str + 
LSN_STORE_SIZE + FILEID_STORE_SIZE + + KEY_NR_STORE_SIZE; + if (keyseg->flag & HA_SWAP_KEY) + { + /* We put key from log record to "data record" packing format... */ + uchar reversed[MARIA_MAX_KEY_BUFF]; + uchar *key_ptr= to; + uchar *key_end= key_ptr + keyseg->length; + to= reversed + keyseg->length; + do + { + *--to= *key_ptr++; + } while (key_ptr != key_end); + /* ... so that we can read it with: */ + } + value= ma_retrieve_auto_increment(to, keyseg->type); + set_if_bigger(share->state.auto_increment, value); + llstr(share->state.auto_increment, llbuf); + tprint(tracef, " auto-inc %s\n", llbuf); + } + } + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_KEY_DELETE) +{ + MARIA_HA *info; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (!(info= get_MARIA_HA_from_UNDO_record(rec))) + return 0; + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (info == NULL) + return 0; + share= info->s; + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + uint key_nr; + my_off_t page; + key_nr= key_nr_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE); + page= page_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE + + KEY_NR_STORE_SIZE); + share->state.key_root[key_nr]= (page == IMPOSSIBLE_PAGE_NO ? + HA_OFFSET_ERROR : + page * share->block_size); + } + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_BULK_INSERT) +{ + /* + If the repair finished it wrote and sync the state. If it didn't finish, + we are going to empty the table and that will fix the state. 
+ */ + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(IMPORTED_TABLE) +{ + char *name; + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + name= (char *)log_record_buffer.str; + tprint(tracef, "Table '%s' was imported (auto-zerofilled) in this Aria instance\n", name); + return 0; +} + + +prototype_redo_exec_hook(COMMIT) +{ + uint16 sid= rec->short_trid; + TrID long_trid= all_active_trans[sid].long_trid; + char llbuf[22]; + if (long_trid == 0) + { + tprint(tracef, "We don't know about transaction with short_trid %u;" + "it probably committed long ago, forget it\n", sid); + bzero(&all_active_trans[sid], sizeof(all_active_trans[sid])); + return 0; + } + llstr(long_trid, llbuf); + tprint(tracef, "Transaction long_trid %s short_trid %u committed\n", + llbuf, sid); + bzero(&all_active_trans[sid], sizeof(all_active_trans[sid])); +#ifdef MARIA_VERSIONING + /* + if real recovery: + transaction was committed, move it to some separate list for later + purging (but don't purge now! purging may have been started before, we + may find REDO_PURGE records soon). 
+ */ +#endif + return 0; +} + +prototype_redo_exec_hook(CLR_END) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + LSN previous_undo_lsn; + enum translog_record_type undone_record_type; + const LOG_DESC *log_desc; + my_bool row_entry= 0; + uchar *logpos; + DBUG_ENTER("exec_REDO_LOGREC_CLR_END"); + + previous_undo_lsn= lsn_korr(rec->header); + undone_record_type= + clr_type_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE); + log_desc= &log_record_type_descriptor[undone_record_type]; + + set_undo_lsn_for_active_trans(rec->short_trid, previous_undo_lsn); + if (info == NULL) + DBUG_RETURN(0); + share= info->s; + tprint(tracef, " CLR_END was about %s, undo_lsn now LSN (%lu,0x%lx)\n", + log_desc->name, LSN_IN_PARTS(previous_undo_lsn)); + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + logpos= (log_record_buffer.str + LSN_STORE_SIZE + FILEID_STORE_SIZE + + CLR_TYPE_STORE_SIZE); + + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + tprint(tracef, " state older than record\n"); + switch (undone_record_type) { + case LOGREC_UNDO_ROW_DELETE: + row_entry= 1; + share->state.state.records++; + break; + case LOGREC_UNDO_ROW_INSERT: + share->state.state.records--; + share->state.changed|= STATE_NOT_OPTIMIZED_ROWS; + row_entry= 1; + break; + case LOGREC_UNDO_ROW_UPDATE: + row_entry= 1; + break; + case LOGREC_UNDO_KEY_INSERT: + case LOGREC_UNDO_KEY_DELETE: + break; + case LOGREC_UNDO_KEY_INSERT_WITH_ROOT: + case LOGREC_UNDO_KEY_DELETE_WITH_ROOT: + { + uint key_nr; + my_off_t page; + key_nr= key_nr_korr(logpos); + page= page_korr(logpos + KEY_NR_STORE_SIZE); + share->state.key_root[key_nr]= (page == IMPOSSIBLE_PAGE_NO ? 
+ HA_OFFSET_ERROR : + page * share->block_size); + break; + } + case LOGREC_UNDO_BULK_INSERT: + break; + default: + DBUG_ASSERT(0); + } + if (row_entry && share->calc_checksum) + share->state.state.checksum+= ha_checksum_korr(logpos); + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + } + if (row_entry) + tprint(tracef, " rows' count %lu\n", (ulong)share->state.state.records); + _ma_unpin_all_pages(info, rec->lsn); + DBUG_RETURN(0); +} + + +/** + Hock to print debug information (like MySQL query) +*/ + +prototype_redo_exec_hook(DEBUG_INFO) +{ + uchar *data; + enum translog_debug_info_type debug_info; + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record debug record"); + return 1; + } + debug_info= (enum translog_debug_info_type) log_record_buffer.str[0]; + data= log_record_buffer.str + 1; + switch (debug_info) { + case LOGREC_DEBUG_INFO_QUERY: + tprint(tracef, "Query: %.*s\n", rec->record_length - 1, + (char*) data); + break; + default: + DBUG_ASSERT(0); + } + return 0; +} + + +/** + In some cases we have to skip execution of an UNDO record during the UNDO + phase. 
+*/ + +static void skip_undo_record(LSN previous_undo_lsn, TRN *trn) +{ + trn->undo_lsn= previous_undo_lsn; + if (previous_undo_lsn == LSN_IMPOSSIBLE) /* has fully rolled back */ + trn->first_undo_lsn= LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn); + skipped_undo_phase++; +} + + +prototype_undo_exec_hook(UNDO_ROW_INSERT) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + const uchar *record_ptr; + + if (info == NULL || maria_is_crashed(info)) + { + /* + Unlike for REDOs, if the table was skipped it is abnormal; we have a + transaction to rollback which used this table, as it is not rolled back + it was supposed to hold this table and so the table should still be + there. Skip it (user may have repaired the table with maria_chk because + it was so badly corrupted that a previous recovery failed) but warn. + */ + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + record_ptr= rec->header; + if (share->calc_checksum) + { + /* + We need to read more of the record to put the checksum into the record + buffer used by _ma_apply_undo_row_insert(). + If the table has no live checksum, rec->header will be enough. 
+ */ + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + record_ptr= log_record_buffer.str; + } + + info->trn= trn; + error= _ma_apply_undo_row_insert(info, previous_undo_lsn, + record_ptr + LSN_STORE_SIZE + + FILEID_STORE_SIZE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records); + tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_ROW_DELETE) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + if (info == NULL || maria_is_crashed(info)) + { + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_row_delete(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - + (LSN_STORE_SIZE + FILEID_STORE_SIZE)); + info->trn= 0; + tprint(tracef, " rows' count %lu\n undo_lsn now LSN (%lu,0x%lx)\n", + (ulong)share->state.state.records, LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_ROW_UPDATE) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + if (info == NULL || maria_is_crashed(info)) + { + 
skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_row_update(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - + (LSN_STORE_SIZE + FILEID_STORE_SIZE)); + info->trn= 0; + tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_KEY_INSERT) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + if (info == NULL || maria_is_crashed(info)) + { + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_key_insert(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - LSN_STORE_SIZE - + FILEID_STORE_SIZE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_KEY_DELETE) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + 
MARIA_SHARE *share; + + if (info == NULL || maria_is_crashed(info)) + { + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_key_delete(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - LSN_STORE_SIZE - + FILEID_STORE_SIZE, FALSE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + if (info == NULL || maria_is_crashed(info)) + { + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_key_delete(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - LSN_STORE_SIZE - + FILEID_STORE_SIZE, TRUE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + 
+prototype_undo_exec_hook(UNDO_BULK_INSERT) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + /* Here we don't check for crashed as we can undo the bulk insert */ + if (info == NULL) + { + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + + info->trn= trn; + error= _ma_apply_undo_bulk_insert(info, previous_undo_lsn); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +static int run_redo_phase(LSN lsn, LSN lsn_end, enum maria_apply_log_way apply) +{ + TRANSLOG_HEADER_BUFFER rec; + struct st_translog_scanner_data scanner; + int len; + uint i; + DBUG_ENTER("run_redo_phase"); + + /* install hooks for execution */ +#define install_redo_exec_hook(R) \ + log_record_type_descriptor[LOGREC_ ## R].record_execute_in_redo_phase= \ + exec_REDO_LOGREC_ ## R; +#define install_redo_exec_hook_shared(R,S) \ + log_record_type_descriptor[LOGREC_ ## R].record_execute_in_redo_phase= \ + exec_REDO_LOGREC_ ## S; +#define install_undo_exec_hook(R) \ + log_record_type_descriptor[LOGREC_ ## R].record_execute_in_undo_phase= \ + exec_UNDO_LOGREC_ ## R; + install_redo_exec_hook(LONG_TRANSACTION_ID); + install_redo_exec_hook(CHECKPOINT); + install_redo_exec_hook(REDO_CREATE_TABLE); + install_redo_exec_hook(REDO_RENAME_TABLE); + install_redo_exec_hook(REDO_REPAIR_TABLE); + install_redo_exec_hook(REDO_DROP_TABLE); + install_redo_exec_hook(FILE_ID); + install_redo_exec_hook(INCOMPLETE_LOG); + install_redo_exec_hook(INCOMPLETE_GROUP); + install_redo_exec_hook(REDO_INSERT_ROW_HEAD); + install_redo_exec_hook(REDO_INSERT_ROW_TAIL); + install_redo_exec_hook(REDO_INSERT_ROW_BLOBS); + 
install_redo_exec_hook(REDO_PURGE_ROW_HEAD); + install_redo_exec_hook(REDO_PURGE_ROW_TAIL); + install_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL); + install_redo_exec_hook(REDO_FREE_BLOCKS); + install_redo_exec_hook(REDO_DELETE_ALL); + install_redo_exec_hook(REDO_INDEX); + install_redo_exec_hook(REDO_INDEX_NEW_PAGE); + install_redo_exec_hook(REDO_INDEX_FREE_PAGE); + install_redo_exec_hook(REDO_BITMAP_NEW_PAGE); + install_redo_exec_hook(UNDO_ROW_INSERT); + install_redo_exec_hook(UNDO_ROW_DELETE); + install_redo_exec_hook(UNDO_ROW_UPDATE); + install_redo_exec_hook(UNDO_KEY_INSERT); + install_redo_exec_hook(UNDO_KEY_DELETE); + install_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT); + install_redo_exec_hook(COMMIT); + install_redo_exec_hook(CLR_END); + install_undo_exec_hook(UNDO_ROW_INSERT); + install_undo_exec_hook(UNDO_ROW_DELETE); + install_undo_exec_hook(UNDO_ROW_UPDATE); + install_undo_exec_hook(UNDO_KEY_INSERT); + install_undo_exec_hook(UNDO_KEY_DELETE); + install_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT); + /* REDO_NEW_ROW_HEAD shares entry with REDO_INSERT_ROW_HEAD */ + install_redo_exec_hook_shared(REDO_NEW_ROW_HEAD, REDO_INSERT_ROW_HEAD); + /* REDO_NEW_ROW_TAIL shares entry with REDO_INSERT_ROW_TAIL */ + install_redo_exec_hook_shared(REDO_NEW_ROW_TAIL, REDO_INSERT_ROW_TAIL); + install_redo_exec_hook(UNDO_BULK_INSERT); + install_undo_exec_hook(UNDO_BULK_INSERT); + install_redo_exec_hook(IMPORTED_TABLE); + install_redo_exec_hook(DEBUG_INFO); + + current_group_end_lsn= LSN_IMPOSSIBLE; +#ifndef DBUG_OFF + current_group_table= NULL; +#endif + + if (unlikely(lsn == LSN_IMPOSSIBLE || lsn == translog_get_horizon())) + { + tprint(tracef, "checkpoint address refers to the log end log or " + "log is empty, nothing to do.\n"); + DBUG_RETURN(0); + } + + len= translog_read_record_header(lsn, &rec); + + if (len == RECHEADER_READ_ERROR) + { + eprint(tracef, "Failed to read header of the first record."); + DBUG_RETURN(1); + } + if (translog_scanner_init(lsn, 1, &scanner, 1)) + { + 
tprint(tracef, "Scanner init failed\n"); + DBUG_RETURN(1); + } + for (i= 1;;i++) + { + uint16 sid= rec.short_trid; + const LOG_DESC *log_desc= &log_record_type_descriptor[rec.type]; + display_record_position(log_desc, &rec, i); + /* + A complete group is a set of log records with an "end mark" record + (e.g. a set of REDOs for an operation, terminated by an UNDO for this + operation); if there is no "end mark" record the group is incomplete and + won't be executed. + */ + if ((log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF) || + (log_desc->record_in_group == LOGREC_LAST_IN_GROUP)) + { + if (all_active_trans[sid].group_start_lsn != LSN_IMPOSSIBLE) + { + if (log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF) + { + /* + Can happen if the transaction got a table write error, then + unlocked tables thus wrote a COMMIT record. Or can be an + INCOMPLETE_GROUP record written by a previous recovery. + */ + tprint(tracef, "\nDiscarding incomplete group before this record\n"); + all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE; + } + else + { + struct st_translog_scanner_data scanner2; + TRANSLOG_HEADER_BUFFER rec2; + /* + There is a complete group for this transaction, containing more + than this event. + */ + tprint(tracef, " ends a group:\n"); + len= + translog_read_record_header(all_active_trans[sid].group_start_lsn, + &rec2); + if (len < 0) /* EOF or error */ + { + tprint(tracef, "Cannot find record where it should be\n"); + goto err; + } + if (lsn_end != LSN_IMPOSSIBLE && rec2.lsn >= lsn_end) + { + tprint(tracef, + "lsn_end reached at (%lu,0x%lx). 
" + "Skipping rest of redo entries", + LSN_IN_PARTS(rec2.lsn)); + translog_destroy_scanner(&scanner); + translog_free_record_header(&rec); + DBUG_RETURN(0); + } + + if (translog_scanner_init(rec2.lsn, 1, &scanner2, 1)) + { + tprint(tracef, "Scanner2 init failed\n"); + goto err; + } + current_group_end_lsn= rec.lsn; + do + { + if (rec2.short_trid == sid) /* it's in our group */ + { + const LOG_DESC *log_desc2= &log_record_type_descriptor[rec2.type]; + display_record_position(log_desc2, &rec2, 0); + if (apply == MARIA_LOG_CHECK) + { + translog_size_t read_len; + enlarge_buffer(&rec2); + read_len= + translog_read_record(rec2.lsn, 0, rec2.record_length, + log_record_buffer.str, NULL); + if (read_len != rec2.record_length) + { + tprint(tracef, "Cannot read record's body: read %u of" + " %u bytes\n", read_len, rec2.record_length); + translog_destroy_scanner(&scanner2); + translog_free_record_header(&rec2); + goto err; + } + } + if (apply == MARIA_LOG_APPLY && + display_and_apply_record(log_desc2, &rec2)) + { + translog_destroy_scanner(&scanner2); + translog_free_record_header(&rec2); + goto err; + } + } + translog_free_record_header(&rec2); + len= translog_read_next_record_header(&scanner2, &rec2); + if (len < 0) /* EOF or error */ + { + tprint(tracef, "Cannot find record where it should be\n"); + translog_destroy_scanner(&scanner2); + translog_free_record_header(&rec2); + goto err; + } + } + while (rec2.lsn < rec.lsn); + /* group finished */ + all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE; + current_group_end_lsn= LSN_IMPOSSIBLE; /* for debugging */ + display_record_position(log_desc, &rec, 0); + translog_destroy_scanner(&scanner2); + translog_free_record_header(&rec2); + } + } + if (apply == MARIA_LOG_APPLY && + display_and_apply_record(log_desc, &rec)) + goto err; +#ifndef DBUG_OFF + current_group_table= NULL; +#endif + } + else /* record does not end group */ + { + /* just record the fact, can't know if can execute yet */ + if 
(all_active_trans[sid].group_start_lsn == LSN_IMPOSSIBLE) + { + /* group not yet started */ + all_active_trans[sid].group_start_lsn= rec.lsn; + } + } + translog_free_record_header(&rec); + len= translog_read_next_record_header(&scanner, &rec); + if (len < 0) + { + switch (len) + { + case RECHEADER_READ_EOF: + tprint(tracef, "EOF on the log\n"); + break; + case RECHEADER_READ_ERROR: + tprint(tracef, "Error reading log\n"); + goto err; + } + break; + } + } + translog_destroy_scanner(&scanner); + translog_free_record_header(&rec); + if (recovery_message_printed == REC_MSG_REDO) + { + fprintf(stderr, " 100%%"); + fflush(stderr); + procent_printed= 1; + } + DBUG_RETURN(0); + +err: + translog_destroy_scanner(&scanner); + translog_free_record_header(&rec); + DBUG_RETURN(1); +} + + +/** + @brief Informs about any aborted groups or uncommitted transactions, + prepares for the UNDO phase if needed. + + @note Observe that it may init trnman. +*/ +static uint end_of_redo_phase(my_bool prepare_for_undo_phase) +{ + uint sid, uncommitted= 0; + char llbuf[22]; + LSN addr; + + hash_free(&all_dirty_pages); + /* + hash_free() can be called multiple times probably, but be safe if that + changes + */ + bzero(&all_dirty_pages, sizeof(all_dirty_pages)); + my_free(dirty_pages_pool, MYF(MY_ALLOW_ZERO_PTR)); + dirty_pages_pool= NULL; + + llstr(max_long_trid, llbuf); + tprint(tracef, "Maximum transaction long id seen: %s\n", llbuf); + llstr(max_trid_in_control_file, llbuf); + tprint(tracef, "Maximum transaction long id seen in control file: %s\n", + llbuf); + /* + If logs were deleted, or lost, trid in control file is needed to set + trnman's generator: + */ + set_if_bigger(max_long_trid, max_trid_in_control_file); + if (prepare_for_undo_phase && trnman_init(max_long_trid)) + return -1; + + trns_created= TRUE; + + for (sid= 0; sid <= SHORT_TRID_MAX; sid++) + { + TrID long_trid= all_active_trans[sid].long_trid; + LSN gslsn= all_active_trans[sid].group_start_lsn; + TRN *trn; + if (gslsn != 
LSN_IMPOSSIBLE) + { + tprint(tracef, "Group at LSN (%lu,0x%lx) short_trid %u incomplete\n", + LSN_IN_PARTS(gslsn), sid); + all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE; + } + if (all_active_trans[sid].undo_lsn != LSN_IMPOSSIBLE) + { + llstr(long_trid, llbuf); + tprint(tracef, "Transaction long_trid %s short_trid %u uncommitted\n", + llbuf, sid); + /* + dummy_transaction_object serves only for DDLs, where there is never a + rollback or incomplete group. And unknown transactions (which have + long_trid==0) should have undo_lsn==LSN_IMPOSSIBLE. + */ + if (long_trid ==0) + { + eprint(tracef, "Transaction with long_trid 0 should not roll back"); + ALERT_USER(); + return -1; + } + if (prepare_for_undo_phase) + { + if ((trn= trnman_recreate_trn_from_recovery(sid, long_trid)) == NULL) + return -1; + trn->undo_lsn= all_active_trans[sid].undo_lsn; + trn->first_undo_lsn= all_active_trans[sid].first_undo_lsn | + TRANSACTION_LOGGED_LONG_ID; /* because trn is known in log */ + if (gslsn != LSN_IMPOSSIBLE) + { + /* + UNDO phase will log some records. So, a future recovery may see: + REDO(from incomplete group) - REDO(from rollback) - CLR_END + and thus execute the first REDO (finding it in "a complete + group"). To prevent that: + */ + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS]; + LSN lsn; + if (translog_write_record(&lsn, LOGREC_INCOMPLETE_GROUP, + trn, NULL, 0, + TRANSLOG_INTERNAL_PARTS, log_array, + NULL, NULL)) + return -1; + } + } + uncommitted++; + } +#ifdef MARIA_VERSIONING + /* + If real recovery: if transaction was committed, move it to some separate + list for soon purging. 
+ */ +#endif + } + + my_free(all_active_trans, MYF(MY_ALLOW_ZERO_PTR)); + all_active_trans= NULL; + + /* + The UNDO phase uses some normal run-time code of ROLLBACK: generates log + records, etc; prepare tables for that + */ + addr= translog_get_horizon(); + for (sid= 0; sid <= SHARE_ID_MAX; sid++) + { + MARIA_HA *info= all_tables[sid].info; + if (info != NULL) + { + prepare_table_for_close(info, addr); + /* + But we don't close it; we leave it available for the UNDO phase; + it's likely that the UNDO phase will need it. + */ + if (prepare_for_undo_phase) + translog_assign_id_to_share_from_recovery(info->s, sid); + } + } + return uncommitted; +} + + +static int run_undo_phase(uint uncommitted) +{ + LSN last_undo; + DBUG_ENTER("run_undo_phase"); + + if (uncommitted > 0) + { + checkpoint_useful= TRUE; + if (tracef != stdout) + { + if (recovery_message_printed == REC_MSG_NONE) + print_preamble(); + fprintf(stderr, "transactions to roll back:"); + recovery_message_printed= REC_MSG_UNDO; + } + tprint(tracef, "%u transactions will be rolled back\n", uncommitted); + procent_printed= 1; + for( ; ; ) + { + char llbuf[22]; + TRN *trn; + if (recovery_message_printed == REC_MSG_UNDO) + { + fprintf(stderr, " %u", uncommitted); + fflush(stderr); + } + if ((uncommitted--) == 0) + break; + trn= trnman_get_any_trn(); + DBUG_ASSERT(trn != NULL); + llstr(trn->trid, llbuf); + tprint(tracef, "Rolling back transaction of long id %s\n", llbuf); + last_undo= trn->undo_lsn + 1; + + /* Execute all undo entries */ + while (trn->undo_lsn) + { + TRANSLOG_HEADER_BUFFER rec; + LOG_DESC *log_desc; + DBUG_ASSERT(trn->undo_lsn < last_undo); + last_undo= trn->undo_lsn; + + if (translog_read_record_header(trn->undo_lsn, &rec) == + RECHEADER_READ_ERROR) + DBUG_RETURN(1); + log_desc= &log_record_type_descriptor[rec.type]; + display_record_position(log_desc, &rec, 0); + if (log_desc->record_execute_in_undo_phase(&rec, trn)) + { + eprint(tracef, "Got error %d when executing undo %s", my_errno, + 
log_desc->name); + translog_free_record_header(&rec); + DBUG_RETURN(1); + } + translog_free_record_header(&rec); + } + + if (trnman_rollback_trn(trn)) + DBUG_RETURN(1); + /* We could want to span a few threads (4?) instead of 1 */ + /* In the future, we want to have this phase *online* */ + } + } + procent_printed= 0; + DBUG_RETURN(0); +} + + +/** + In case of error in recovery, deletes all transactions from the transaction + manager so that this module does not assert. + + @note no checkpoint should be taken as those transactions matter for the + next recovery (they still haven't been properly dealt with). +*/ + +static void delete_all_transactions() +{ + for( ; ; ) + { + TRN *trn= trnman_get_any_trn(); + if (trn == NULL) + break; + trn->undo_lsn= trn->first_undo_lsn= LSN_IMPOSSIBLE; + trnman_rollback_trn(trn); /* ignore error */ + } +} + + +/** + @brief re-enables transactionality, updates is_of_horizon + + @param info table + @param horizon address to set is_of_horizon +*/ + +static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon) +{ + MARIA_SHARE *share= info->s; + /* + In a fully-forward REDO phase (no checkpoint record), + state is now at least as new as the LSN of the current record. It may be + newer, in case we are seeing a LOGREC_FILE_ID which tells us to close a + table, but that table was later modified further in the log. + But if we parsed a checkpoint record, it may be this way in the log: + FILE_ID(6->t2)... FILE_ID(6->t1)... CHECKPOINT(6->t1) + Checkpoint parsing opened t1 with id 6; first FILE_ID above is going to + make t1 close; the first condition below is however false (when checkpoint + was taken it increased is_of_horizon) and so it works. For safety we + add the second condition. 
+ */ + if (cmp_translog_addr(share->state.is_of_horizon, horizon) < 0 && + cmp_translog_addr(share->lsn_of_file_id, horizon) < 0) + { + share->state.is_of_horizon= horizon; + _ma_state_info_write_sub(share->kfile.file, &share->state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET); + } + + /* + Ensure that info->state is up to date as + _ma_renable_logging_for_table() is depending on this + */ + *info->state= info->s->state.state; + + /* + This leaves PAGECACHE_PLAIN_PAGE pages into the cache, while the table is + going to switch back to transactional. So the table will be a mix of + pages, which is ok as long as we don't take any checkpoints until all + tables get closed at the end of the UNDO phase. + */ + _ma_reenable_logging_for_table(info, FALSE); + info->trn= NULL; /* safety */ +} + + +static MARIA_HA *get_MARIA_HA_from_REDO_record(const + TRANSLOG_HEADER_BUFFER *rec) +{ + uint16 sid; + pgcache_page_no_t page; + MARIA_HA *info; + MARIA_SHARE *share; + char llbuf[22]; + my_bool index_page_redo_entry= FALSE, page_redo_entry= FALSE; + LINT_INIT(page); + + print_redo_phase_progress(rec->lsn); + sid= fileid_korr(rec->header); + switch (rec->type) { + /* not all REDO records have a page: */ + case LOGREC_REDO_INDEX_NEW_PAGE: + case LOGREC_REDO_INDEX: + case LOGREC_REDO_INDEX_FREE_PAGE: + index_page_redo_entry= 1; + /* Fall trough*/ + case LOGREC_REDO_INSERT_ROW_HEAD: + case LOGREC_REDO_INSERT_ROW_TAIL: + case LOGREC_REDO_PURGE_ROW_HEAD: + case LOGREC_REDO_PURGE_ROW_TAIL: + case LOGREC_REDO_NEW_ROW_HEAD: + case LOGREC_REDO_NEW_ROW_TAIL: + case LOGREC_REDO_FREE_HEAD_OR_TAIL: + page_redo_entry= TRUE; + page= page_korr(rec->header + FILEID_STORE_SIZE); + llstr(page, llbuf); + break; + /* + For REDO_FREE_BLOCKS, no need to look at dirty pages list: it does not + read data pages, only reads/modifies bitmap page(s) which is cheap. 
+ */ + default: + break; + } + tprint(tracef, " For table of short id %u", sid); + info= all_tables[sid].info; +#ifndef DBUG_OFF + DBUG_ASSERT(current_group_table == NULL || current_group_table == info); + current_group_table= info; +#endif + if (info == NULL) + { + tprint(tracef, ", table skipped, so skipping record\n"); + return NULL; + } + share= info->s; + tprint(tracef, ", '%s'", share->open_file_name.str); + DBUG_ASSERT(in_redo_phase); + if (cmp_translog_addr(rec->lsn, share->lsn_of_file_id) <= 0) + { + /* + This can happen only if processing a record before the checkpoint + record. + id->name mapping is newer than REDO record: for sure the table subject + of the REDO has been flushed and forced (id re-assignment implies this); + REDO can be ignored (and must be, as we don't know what this subject + table was). + */ + DBUG_ASSERT(cmp_translog_addr(rec->lsn, checkpoint_start) < 0); + tprint(tracef, ", table's LOGREC_FILE_ID has LSN (%lu,0x%lx) more recent" + " than record, skipping record", + LSN_IN_PARTS(share->lsn_of_file_id)); + return NULL; + } + if (cmp_translog_addr(rec->lsn, share->state.skip_redo_lsn) <= 0) + { + /* probably a bulk insert repair */ + tprint(tracef, ", has skip_redo_lsn (%lu,0x%lx) more recent than" + " record, skipping record\n", + LSN_IN_PARTS(share->state.skip_redo_lsn)); + return NULL; + } + /* detect if an open instance of a dropped table (internal bug) */ + DBUG_ASSERT(share->last_version != 0); + if (page_redo_entry) + { + /* + Consult dirty pages list. + REDO_INSERT_ROW_BLOBS will consult list by itself, as it covers several + pages. 
+ */ + tprint(tracef, " page %s", llbuf); + if (_ma_redo_not_needed_for_page(sid, rec->lsn, page, + index_page_redo_entry)) + return NULL; + } + /* + So we are going to read the page, and if its LSN is older than the + record's we will modify the page + */ + tprint(tracef, ", applying record\n"); + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */ + return info; +} + + +static MARIA_HA *get_MARIA_HA_from_UNDO_record(const + TRANSLOG_HEADER_BUFFER *rec) +{ + uint16 sid; + MARIA_HA *info; + MARIA_SHARE *share; + + sid= fileid_korr(rec->header + LSN_STORE_SIZE); + tprint(tracef, " For table of short id %u", sid); + info= all_tables[sid].info; +#ifndef DBUG_OFF + DBUG_ASSERT(!in_redo_phase || + current_group_table == NULL || current_group_table == info); + current_group_table= info; +#endif + if (info == NULL) + { + tprint(tracef, ", table skipped, so skipping record\n"); + return NULL; + } + share= info->s; + tprint(tracef, ", '%s'", share->open_file_name.str); + if (cmp_translog_addr(rec->lsn, share->lsn_of_file_id) <= 0) + { + tprint(tracef, ", table's LOGREC_FILE_ID has LSN (%lu,0x%lx) more recent" + " than record, skipping record", + LSN_IN_PARTS(share->lsn_of_file_id)); + return NULL; + } + if (in_redo_phase && + cmp_translog_addr(rec->lsn, share->state.skip_redo_lsn) <= 0) + { + /* probably a bulk insert repair */ + tprint(tracef, ", has skip_redo_lsn (%lu,0x%lx) more recent than" + " record, skipping record\n", + LSN_IN_PARTS(share->state.skip_redo_lsn)); + return NULL; + } + DBUG_ASSERT(share->last_version != 0); + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */ + tprint(tracef, ", applying record\n"); + return info; +} + + +/** + @brief Parses checkpoint record. + + Builds from it the dirty_pages list (a hash), opens tables and maps them to + their 2-byte IDs, recreates transactions (not real TRNs though). 
+ + @return LSN from where in the log the REDO phase should start + @retval LSN_ERROR error + @retval other ok +*/ + +static LSN parse_checkpoint_record(LSN lsn) +{ + ulong i; + ulonglong nb_dirty_pages; + TRANSLOG_HEADER_BUFFER rec; + TRANSLOG_ADDRESS start_address; + int len; + uint nb_active_transactions, nb_committed_transactions, nb_tables; + uchar *ptr; + LSN minimum_rec_lsn_of_active_transactions, minimum_rec_lsn_of_dirty_pages; + struct st_dirty_page *next_dirty_page_in_pool; + + tprint(tracef, "Loading data from checkpoint record at LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(lsn)); + if ((len= translog_read_record_header(lsn, &rec)) == RECHEADER_READ_ERROR) + { + tprint(tracef, "Cannot find checkpoint record where it should be\n"); + return LSN_ERROR; + } + + enlarge_buffer(&rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec.lsn, 0, rec.record_length, + log_record_buffer.str, NULL) != + rec.record_length) + { + eprint(tracef, "Failed to read record"); + return LSN_ERROR; + } + + ptr= log_record_buffer.str; + start_address= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + tprint(tracef, "Checkpoint record has start_horizon at (%lu,0x%lx)\n", + LSN_IN_PARTS(start_address)); + + /* transactions */ + nb_active_transactions= uint2korr(ptr); + ptr+= 2; + tprint(tracef, "%u active transactions\n", nb_active_transactions); + minimum_rec_lsn_of_active_transactions= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + max_long_trid= transid_korr(ptr); + ptr+= TRANSID_SIZE; + + /* + how much brain juice and discussions there was to come to writing this + line. It may make start_address slightly decrease (only by the time it + takes to write one or a few rows, roughly). 
+ */ + tprint(tracef, "Checkpoint record has min_rec_lsn of active transactions" + " at (%lu,0x%lx)\n", + LSN_IN_PARTS(minimum_rec_lsn_of_active_transactions)); + set_if_smaller(start_address, minimum_rec_lsn_of_active_transactions); + + for (i= 0; i < nb_active_transactions; i++) + { + uint16 sid= uint2korr(ptr); + TrID long_id; + LSN undo_lsn, first_undo_lsn; + ptr+= 2; + long_id= uint6korr(ptr); + ptr+= 6; + DBUG_ASSERT(sid > 0 && long_id > 0); + undo_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + first_undo_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + new_transaction(sid, long_id, undo_lsn, first_undo_lsn); + } + nb_committed_transactions= uint4korr(ptr); + ptr+= 4; + tprint(tracef, "%lu committed transactions\n", + (ulong)nb_committed_transactions); + /* no purging => committed transactions are not important */ + ptr+= (6 + LSN_STORE_SIZE) * nb_committed_transactions; + + /* tables */ + nb_tables= uint4korr(ptr); + ptr+= 4; + tprint(tracef, "%u open tables\n", nb_tables); + for (i= 0; i< nb_tables; i++) + { + char name[FN_REFLEN]; + LSN first_log_write_lsn; + uint name_len; + uint16 sid= uint2korr(ptr); + ptr+= 2; + DBUG_ASSERT(sid > 0); + first_log_write_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + name_len= strlen((char *)ptr) + 1; + strmake(name, (char *)ptr, sizeof(name)-1); + ptr+= name_len; + if (new_table(sid, name, first_log_write_lsn)) + return LSN_ERROR; + } + + /* dirty pages */ + nb_dirty_pages= uint8korr(ptr); + + /* Ensure casts later will not loose significant bits. 
*/ + DBUG_ASSERT((nb_dirty_pages <= SIZE_T_MAX/sizeof(struct st_dirty_page)) && + (nb_dirty_pages <= ULONG_MAX)); + + ptr+= 8; + tprint(tracef, "%lu dirty pages\n", (ulong) nb_dirty_pages); + if (hash_init(&all_dirty_pages, &my_charset_bin, (ulong)nb_dirty_pages, + offsetof(struct st_dirty_page, file_and_page_id), + sizeof(((struct st_dirty_page *)NULL)->file_and_page_id), + NULL, NULL, 0)) + return LSN_ERROR; + dirty_pages_pool= + (struct st_dirty_page *)my_malloc((size_t)nb_dirty_pages * + sizeof(struct st_dirty_page), + MYF(MY_WME)); + if (unlikely(dirty_pages_pool == NULL)) + return LSN_ERROR; + next_dirty_page_in_pool= dirty_pages_pool; + minimum_rec_lsn_of_dirty_pages= LSN_MAX; + if (maria_recovery_verbose) + tprint(tracef, "Table_id Is_index Page_id Rec_lsn\n"); + for (i= 0; i < nb_dirty_pages ; i++) + { + pgcache_page_no_t page_id; + LSN rec_lsn; + uint32 is_index; + uint16 table_id= uint2korr(ptr); + ptr+= 2; + is_index= ptr[0]; + ptr++; + page_id= page_korr(ptr); + ptr+= PAGE_STORE_SIZE; + rec_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + if (new_page((is_index << 16) | table_id, + page_id, rec_lsn, next_dirty_page_in_pool++)) + return LSN_ERROR; + if (maria_recovery_verbose) + tprint(tracef, "%8u %8u %12lu %lu,0x%lx\n", (uint) table_id, + (uint) is_index, (ulong) page_id, LSN_IN_PARTS(rec_lsn)); + set_if_smaller(minimum_rec_lsn_of_dirty_pages, rec_lsn); + } + /* after that, there will be no insert/delete into the hash */ + /* + sanity check on record (did we screw up with all those "ptr+=", did the + checkpoint write code and checkpoint read code go out of sync?). + */ + if (ptr != (log_record_buffer.str + log_record_buffer.length)) + { + eprint(tracef, "checkpoint record corrupted\n"); + return LSN_ERROR; + } + + /* + start_address is now from where the dirty pages list can be ignored. + Find LSN higher or equal to this TRANSLOG_ADDRESS, suitable for + translog_read_record() functions. 
+ */ + start_address= checkpoint_start= + translog_next_LSN(start_address, LSN_IMPOSSIBLE); + tprint(tracef, "Checkpoint record start_horizon now adjusted to" + " LSN (%lu,0x%lx)\n", LSN_IN_PARTS(start_address)); + if (checkpoint_start == LSN_IMPOSSIBLE) + { + /* + There must be a problem, as our checkpoint record exists and is >= the + address which is stored in its first bytes, which is >= start_address. + */ + return LSN_ERROR; + } + /* now, where the REDO phase should start reading log: */ + tprint(tracef, "Checkpoint has min_rec_lsn of dirty pages at" + " LSN (%lu,0x%lx)\n", LSN_IN_PARTS(minimum_rec_lsn_of_dirty_pages)); + set_if_smaller(start_address, minimum_rec_lsn_of_dirty_pages); + DBUG_PRINT("info", + ("checkpoint_start: (%lu,0x%lx) start_address: (%lu,0x%lx)", + LSN_IN_PARTS(checkpoint_start), LSN_IN_PARTS(start_address))); + return start_address; +} + + +static int new_page(uint32 fileid, pgcache_page_no_t pageid, LSN rec_lsn, + struct st_dirty_page *dirty_page) +{ + /* serves as hash key */ + dirty_page->file_and_page_id= (((uint64)fileid) << 40) | pageid; + dirty_page->rec_lsn= rec_lsn; + return my_hash_insert(&all_dirty_pages, (uchar *)dirty_page); +} + + +static int close_all_tables(void) +{ + int error= 0; + uint count= 0; + LIST *list_element, *next_open; + MARIA_HA *info; + TRANSLOG_ADDRESS addr; + DBUG_ENTER("close_all_tables"); + + pthread_mutex_lock(&THR_LOCK_maria); + if (maria_open_list == NULL) + goto end; + tprint(tracef, "Closing all tables\n"); + if (tracef != stdout) + { + if (recovery_message_printed == REC_MSG_NONE) + print_preamble(); + for (count= 0, list_element= maria_open_list ; + list_element ; count++, (list_element= list_element->next)) + ; + fprintf(stderr, "tables to flush:"); + recovery_message_printed= REC_MSG_FLUSH; + } + /* + Since the end of end_of_redo_phase(), we may have written new records + (if UNDO phase ran) and thus the state is newer than at + end_of_redo_phase(), we need to bump is_of_horizon again. 
+ */ + addr= translog_get_horizon(); + for (list_element= maria_open_list ; ; list_element= next_open) + { + if (recovery_message_printed == REC_MSG_FLUSH) + { + fprintf(stderr, " %u", count--); + fflush(stderr); + } + if (list_element == NULL) + break; + next_open= list_element->next; + info= (MARIA_HA*)list_element->data; + pthread_mutex_unlock(&THR_LOCK_maria); /* ok, UNDO phase not online yet */ + /* + Tables which we see here are exactly those which were open at time of + crash. They might have open_count>0 as Checkpoint maybe flushed their + state while they were used. As Recovery corrected them, don't alarm the + user, don't ask for a table check: + */ + if (info->s->state.open_count != 0) + { + /* let ma_close() mark the table properly closed */ + info->s->state.open_count= 1; + info->s->global_changed= 1; + } + prepare_table_for_close(info, addr); + error|= maria_close(info); + pthread_mutex_lock(&THR_LOCK_maria); + } +end: + pthread_mutex_unlock(&THR_LOCK_maria); + DBUG_RETURN(error); +} + + +/** + @brief Close all table instances with a certain name which are present in + all_tables. + + @param name Name of table + @param addr Log address passed to prepare_table_for_close() +*/ + +static my_bool close_one_table(const char *name, TRANSLOG_ADDRESS addr) +{ + my_bool res= 0; + /* There are no other threads using the tables, so we don't need any locks */ + struct st_table_for_recovery *internal_table, *end; + for (internal_table= all_tables, end= internal_table + SHARE_ID_MAX + 1; + internal_table < end ; + internal_table++) + { + MARIA_HA *info= internal_table->info; + if ((info != NULL) && !strcmp(info->s->open_file_name.str, name)) + { + prepare_table_for_close(info, addr); + if (maria_close(info)) + res= 1; + internal_table->info= NULL; + } + } + return res; +} + + +/** + Temporarily disables logging for this table. + + If that makes the log incomplete, writes a LOGREC_INCOMPLETE_LOG to the log + to warn log readers. 
+ + @param info table + @param log_incomplete if that disabling makes the log incomplete + + @note for example in the REDO phase we disable logging but that does not + make the log incomplete. +*/ + +void _ma_tmp_disable_logging_for_table(MARIA_HA *info, + my_bool log_incomplete) +{ + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_tmp_disable_logging_for_table"); + if (log_incomplete) + { + uchar log_data[FILEID_STORE_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + LSN lsn; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + translog_write_record(&lsn, LOGREC_INCOMPLETE_LOG, + &dummy_transaction_object, info, + (translog_size_t) sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data, NULL); + } + + /* if we disabled before writing the record, record wouldn't reach log */ + share->now_transactional= FALSE; + + /* + Reset state pointers. This is needed as in ALTER table we may do + commit fllowed by _ma_renable_logging_for_table and then + info->state may point to a state that was deleted by + _ma_trnman_end_trans_hook() + */ + share->state.common= *info->state; + info->state= &share->state.common; + info->switched_transactional= TRUE; + + /* + Some code in ma_blockrec.c assumes a trn even if !now_transactional but in + this case it only reads trn->rec_lsn, which has to be LSN_IMPOSSIBLE and + should be now. info->trn may be NULL in maria_chk. + */ + if (info->trn == NULL) + info->trn= &dummy_transaction_object; + DBUG_ASSERT(info->trn->rec_lsn == LSN_IMPOSSIBLE); + share->page_type= PAGECACHE_PLAIN_PAGE; + /* Functions below will pick up now_transactional and change callbacks */ + _ma_set_data_pagecache_callbacks(&info->dfile, share); + _ma_set_index_pagecache_callbacks(&share->kfile, share); + _ma_bitmap_set_pagecache_callbacks(&share->bitmap.file, share); + DBUG_VOID_RETURN; +} + + +/** + Re-enables logging for a table which had it temporarily disabled. 
+ + Only the thread which disabled logging is allowed to reenable it. Indeed, + re-enabling logging affects all open instances, one must have exclusive + access to the table to do that. In practice, the one which disables has + such access. + + @param info table + @param flush_pages if function needs to flush pages first +*/ + +my_bool _ma_reenable_logging_for_table(MARIA_HA *info, my_bool flush_pages) +{ + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_reenable_logging_for_table"); + + if (share->now_transactional == share->base.born_transactional || + !info->switched_transactional) + DBUG_RETURN(0); + info->switched_transactional= FALSE; + + if ((share->now_transactional= share->base.born_transactional)) + { + share->page_type= PAGECACHE_LSN_PAGE; + + /* + Copy state information that where updated while the table was used + in not transactional mode + */ + _ma_copy_nontrans_state_information(info); + _ma_reset_history(info->s); + + if (flush_pages) + { + /* + We are going to change callbacks; if a page is flushed at this moment + this can cause race conditions, that's one reason to flush pages + now. Other reasons: a checkpoint could be running and miss pages; the + pages have type PAGECACHE_PLAIN_PAGE which should not remain. As + there are no REDOs for pages, them, bitmaps and the state also have to + be flushed and synced. + */ + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_RELEASE, FLUSH_RELEASE) || + _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_LOCK) || + _ma_sync_table_files(info)) + DBUG_RETURN(1); + } + else if (!maria_in_recovery) + { + /* + Except in Recovery, we mustn't leave dirty pages (see comments above). + Note that this does not verify that the state was flushed, but hey. 
+ */ + pagecache_file_no_dirty_page(share->pagecache, &info->dfile); + pagecache_file_no_dirty_page(share->pagecache, &share->kfile); + } + _ma_set_data_pagecache_callbacks(&info->dfile, share); + _ma_set_index_pagecache_callbacks(&share->kfile, share); + _ma_bitmap_set_pagecache_callbacks(&share->bitmap.file, share); + /* + info->trn was not changed in the disable/enable combo, so that it's + still usable in this kind of combination: + external_lock; + start_bulk_insert; # table is empty, disables logging + end_bulk_insert; # enables logging + start_bulk_insert; # table is not empty, logging stays + # so rows insertion needs the real trn. + as happens during row-based replication on the slave. + */ + } + DBUG_RETURN(0); +} + + +static void print_redo_phase_progress(TRANSLOG_ADDRESS addr) +{ + static uint end_logno= FILENO_IMPOSSIBLE, percentage_printed= 0; + static ulong end_offset; + static ulonglong initial_remainder= ~(ulonglong) 0; + + uint cur_logno; + ulong cur_offset; + ulonglong local_remainder; + uint percentage_done; + + if (tracef == stdout) + return; + if (recovery_message_printed == REC_MSG_NONE) + { + print_preamble(); + fprintf(stderr, "recovered pages: 0%%"); + fflush(stderr); + procent_printed= 1; + recovery_message_printed= REC_MSG_REDO; + } + if (end_logno == FILENO_IMPOSSIBLE) + { + LSN end_addr= translog_get_horizon(); + end_logno= LSN_FILE_NO(end_addr); + end_offset= LSN_OFFSET(end_addr); + } + cur_logno= LSN_FILE_NO(addr); + cur_offset= LSN_OFFSET(addr); + local_remainder= (cur_logno == end_logno) ? 
(end_offset - cur_offset) : + (((longlong)log_file_size) - cur_offset + + max(end_logno - cur_logno - 1, 0) * ((longlong)log_file_size) + + end_offset); + if (initial_remainder == (ulonglong)(-1)) + initial_remainder= local_remainder; + percentage_done= (uint) ((initial_remainder - local_remainder) * ULL(100) / + initial_remainder); + if ((percentage_done - percentage_printed) >= 10) + { + percentage_printed= percentage_done; + fprintf(stderr, " %u%%", percentage_done); + fflush(stderr); + procent_printed= 1; + } +} + + +#ifdef MARIA_EXTERNAL_LOCKING +#error Marias Checkpoint and Recovery are really not ready for it +#endif + +/* +Recovery of the state : how it works +===================================== + +Here we ignore Checkpoints for a start. + +The state (MARIA_HA::MARIA_SHARE::MARIA_STATE_INFO) is updated in +memory frequently (at least at every row write/update/delete) but goes +to disk at few moments: maria_close() when closing the last open +instance, and a few rare places like CHECK/REPAIR/ALTER +(non-transactional tables also do it at maria_lock_database() but we +needn't cover them here). + +In case of crash, state on disk is likely to be older than what it was +in memory, the REDO phase needs to recreate the state as it was in +memory at the time of crash. When we say Recovery here we will always +mean "REDO phase". + +For example MARIA_STATUS_INFO::records (count of records). It is updated at +the end of every row write/update/delete/delete_all. When Recovery sees the +sign of such row operation (UNDO or REDO), it may need to update the records' +count if that count does not reflect that operation (is older). How to know +the age of the state compared to the log record: every time the state +goes to disk at runtime, its member "is_of_horizon" is updated to the +current end-of-log horizon. So Recovery just needs to compare is_of_horizon +and the record's LSN to know if it should modify "records". 

Other operations like ALTER TABLE DISABLE KEYS update the state but
don't write log records, thus the REDO phase cannot repeat their
effect on the state in case of crash. But we make them sync the state
as soon as they have finished. This reduces the window for a problem.

It looks like only one thread at a time updates the state in memory or
on disk. We assume that the upper level (normally MySQL) has protection
against issuing HA_EXTRA_(FORCE_REOPEN|PREPARE_FOR_RENAME) so that these
are not issued while there are any running transactions on the given table.
If this is not done, we may write a corrupted state to disk.

With checkpoints
================

Checkpoint module needs to read the state in memory and write it to
disk. This may happen while some other thread is modifying the state
in memory or on disk. Checkpoint thus may be reading changing data, it
needs a mutex to not have it corrupted, and concurrent modifiers of
the state need that mutex too for the same reason.
"records" is modified for every row write/update/delete, we don't want
to add a mutex lock/unlock there. So we re-use the mutex lock/unlock
which is already present in these moments, namely the log's mutex which is
taken when UNDO_ROW_INSERT|UPDATE|DELETE is written: we update "records" in
under-log-mutex hooks when writing these records (thus "records" is
not updated at the end of maria_write/update/delete() anymore).
Thus Checkpoint takes the log's lock and can read "records" from
memory and write it to disk and release log's lock.
We however want to avoid having the disk write under the log's
lock. So it has to be under another mutex, natural choice is
intern_lock (as Checkpoint needs it anyway to read MARIA_SHARE::kfile,
and as maria_close() takes it too). All state writes to disk are
changed to be protected with intern_lock.
+So Checkpoint takes intern_lock, log's lock, reads "records" from +memory, releases log's lock, updates is_of_horizon and writes "records" to +disk, releases intern_lock. +In practice, not only "records" needs to be written but the full +state. So, Checkpoint reads the full state from memory. Some other +thread may at this moment be modifying in memory some pieces of the +state which are not protected by the log's lock (see ma_extra.c +HA_EXTRA_NO_KEYS), and Checkpoint would be reading a corrupted state +from memory; to guard against that we extend the intern_lock-zone to +changes done to the state in memory by HA_EXTRA_NO_KEYS et al, and +also any change made in memory to create_rename_lsn/state_is_of_horizon. +Last, we don't want in Checkpoint to do + log lock; read state from memory; release log lock; +for each table, it may hold the log's lock too much in total. +So, we instead do + log lock; read N states from memory; release log lock; +Thus, the sequence above happens outside of any intern_lock. +But this re-introduces the problem that some other thread may be changing the +state in memory and on disk under intern_lock, without log's lock, like +HA_EXTRA_NO_KEYS, while we read the N states. However, when Checkpoint later +comes to handling the table under intern_lock, which is serialized with +HA_EXTRA_NO_KEYS, it can see that is_of_horizon is higher than when the state +was read from memory under log's lock, and thus can decide to not flush the +obsolete state it has, knowing that the other thread flushed a more recent +state already. If on the other hand is_of_horizon is not higher, the read +state is current and can be flushed. So we have a per-table sequence: + lock intern_lock; test if is_of_horizon is higher than when we read the state + under log's lock; if no then flush the read state to disk. 
+*/ + +/* some comments and pseudo-code which we keep for later */ +#if 0 + /* + MikaelR suggests: support checkpoints during REDO phase too: do checkpoint + after a certain amount of log records have been executed. This helps + against repeated crashes. Those checkpoints could not be user-requested + (as engine is not communicating during the REDO phase), so they would be + automatic: this changes the original assumption that we don't write to the + log while in the REDO phase, but why not. How often should we checkpoint? + */ + + /* + We want to have two steps: + engine->recover_with_max_memory(); + next_engine->recover_with_max_memory(); + engine->init_with_normal_memory(); + next_engine->init_with_normal_memory(); + So: in recover_with_max_memory() allocate a giant page cache, do REDO + phase, then all page cache is flushed and emptied and freed (only retain + small structures like TM): take full checkpoint, which is useful if + next engine crashes in its recovery the next second. + Destroy all shares (maria_close()), then at init_with_normal_memory() we + do this: + */ + + /**** UNDO PHASE *****/ + + /* + Launch one or more threads to do the background rollback. Don't wait for + them to complete their rollback (background rollback; for debugging, we + can have an option which waits). Set a counter (total_of_rollback_threads) + to the number of threads to launch. + + Note that InnoDB's rollback-in-background works as long as InnoDB is the + last engine to recover, otherwise MySQL will refuse new connections until + the last engine has recovered so it's not "background" from the user's + point of view. InnoDB is near top of sys_table_types so all others + (e.g. BDB) recover after it... So it's really "online rollback" only if + InnoDB is the only engine. + */ + + /* wake up delete/update handler */ + /* tell the TM that it can now accept new transactions */ + + /* + mark that checkpoint requests are now allowed. 
+ */ +#endif diff --git a/storage/maria/ma_recovery.h b/storage/maria/ma_recovery.h new file mode 100644 index 00000000000..0bfcdd17d39 --- /dev/null +++ b/storage/maria/ma_recovery.h @@ -0,0 +1,33 @@ +/* Copyright (C) 2006,2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3072 Maria recovery + First version written by Guilhem Bichot on 2006-04-27. +*/ + +/* This is the interface of this module. */ + +/* Performs recovery of the engine at start */ + +C_MODE_START +enum maria_apply_log_way +{ MARIA_LOG_APPLY, MARIA_LOG_DISPLAY_HEADER, MARIA_LOG_CHECK }; +int maria_recovery_from_log(void); +int maria_apply_log(LSN lsn, LSN lsn_end, enum maria_apply_log_way apply, + FILE *trace_file, + my_bool execute_undo_phase, my_bool skip_DDLs, + my_bool take_checkpoints, uint *warnings_count); +C_MODE_END diff --git a/storage/maria/ma_recovery_util.c b/storage/maria/ma_recovery_util.c new file mode 100644 index 00000000000..19e61daf4ef --- /dev/null +++ b/storage/maria/ma_recovery_util.c @@ -0,0 +1,146 @@ +/* Copyright (C) 2006,2007,2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Q: Why isn't ma_recovery_util.c simply moved to ma_recovery.c ? + + A: ma_recovery.c, because it invokes objects from ma_check.c (like + maria_chk_init()) causes the following problem: + if a source file a.c of a program invokes a function defined in + ma_recovery.c, then a.o depends on ma_recovery.o which depends on + ma_check.o: linker thus brings in ma_check.o. That brings in the + dependencies of ma_check.o which are definitions of _ma_check_print_info() + etc; if a.o does not define them then the ones of ha_maria.o are used + i.e. ha_maria.o is linked into the program, and this brings in dependencies + of ha_maria.o on mysqld.o into the program's linking which thus fails, as + the program is not linked with mysqld.o. + Thus, while several functions defined in ma_recovery.c could be useful to + other files, they cannot be used by them. + So we are going to gradually move a great share of ma_recovery.c's exported + functions into the present file, to isolate the problematic components and + avoid the problem. +*/ + +#include "maria_def.h" + +HASH all_dirty_pages; +struct st_dirty_page /* used only in the REDO phase; NOTE(review): ma_recovery_util.h defines this struct too -- confirm this file never sees both definitions */ +{ + uint64 file_and_page_id; /* presumably packed like the lookup key built in _ma_redo_not_needed_for_page() (index flag, table short id, page number) -- confirm */ + LSN rec_lsn; /* a REDO whose LSN is below this is skipped for the page, see _ma_redo_not_needed_for_page() */ +}; +/* + LSN after which dirty pages list does not apply. Can be slightly before + when ma_checkpoint_execute() started. 
+*/ +LSN checkpoint_start= LSN_IMPOSSIBLE; + +/** @todo looks like duplicate of recovery_message_printed */ +my_bool procent_printed; +FILE *tracef; /**< trace file for debugging */ + + +/** @brief Prints to a trace file if it is not NULL */ +void tprint(FILE *trace_file __attribute__ ((unused)), + const char *format __attribute__ ((unused)), ...) +{ + va_list args; +#ifndef DBUG_OFF + { + char buff[1024]; + va_start(args, format); + vsnprintf(buff, sizeof(buff)-1, format, args); + DBUG_PRINT("info", ("%s", buff)); + va_end(args); + } +#endif + va_start(args, format); + if (trace_file != NULL) + { + if (procent_printed) + { + procent_printed= 0; + fputc('\n', trace_file); + } + vfprintf(trace_file, format, args); + } + va_end(args); +} + + +void eprint(FILE *trace_file __attribute__ ((unused)), + const char *format __attribute__ ((unused)), ...) +{ + va_list args; + va_start(args, format); + DBUG_PRINT("error", ("%s", format)); + if (!trace_file) + trace_file= stderr; + + if (procent_printed) + { + /* In silent mode, print on another line than the 0% 10% 20% line */ + procent_printed= 0; + fputc('\n', trace_file); + } + vfprintf(trace_file , format, args); + fputc('\n', trace_file); + if (trace_file != stderr) + { + va_start(args, format); + my_printv_error(HA_ERR_INITIALIZATION, format, MYF(0), args); + } + va_end(args); + fflush(trace_file); +} + + +/** + Tells if the dirty pages list found in checkpoint record allows to ignore a + REDO for a certain page. 
+ + @param shortid short id of the table + @param lsn REDO record's LSN + @param page page number + @param index TRUE if index page, FALSE if data page + + @return TRUE if the REDO can be ignored: its LSN is before checkpoint_start + and the page is either absent from all_dirty_pages or listed there + with a rec_lsn greater than the REDO's LSN; FALSE otherwise +*/ + +my_bool _ma_redo_not_needed_for_page(uint16 shortid, LSN lsn, + pgcache_page_no_t page, + my_bool index) +{ + if (cmp_translog_addr(lsn, checkpoint_start) < 0) + { + /* + 64-bit key is formed like this: + Most significant byte: 0 if data page, 1 if index page + Next 2 bytes: table's short id + Next 5 bytes: page number + */ + uint64 file_and_page_id= + (((uint64)((index << 16) | shortid)) << 40) | page; + struct st_dirty_page *dirty_page= (struct st_dirty_page *) + hash_search(&all_dirty_pages, + (uchar *)&file_and_page_id, sizeof(file_and_page_id)); + DBUG_PRINT("info", ("in dirty pages list: %d", dirty_page != NULL)); + if ((dirty_page == NULL) || + cmp_translog_addr(lsn, dirty_page->rec_lsn) < 0) + { + tprint(tracef, ", ignoring because of dirty_pages list\n"); + return TRUE; + } + } + return FALSE; +} diff --git a/storage/maria/ma_recovery_util.h b/storage/maria/ma_recovery_util.h new file mode 100644 index 00000000000..a35fea84fe9 --- /dev/null +++ b/storage/maria/ma_recovery_util.h @@ -0,0 +1,37 @@ +/* Copyright (C) 2006,2007,2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +struct st_dirty_page /* used only in the REDO phase; NOTE(review): ma_recovery_util.c defines this struct too -- confirm no translation unit sees both definitions */ +{ + uint64 file_and_page_id; /* presumably packed like the lookup key built in _ma_redo_not_needed_for_page() (index flag, table short id, page number) -- confirm */ + LSN rec_lsn; /* a REDO whose LSN is below this is skipped for the page, see _ma_redo_not_needed_for_page() */ +}; +extern HASH all_dirty_pages; +/* + LSN after which dirty pages list does not apply. Can be slightly before + when ma_checkpoint_execute() started. +*/ +extern LSN checkpoint_start; +extern my_bool procent_printed; +extern FILE *tracef; + + +my_bool _ma_redo_not_needed_for_page(uint16 shortid, LSN lsn, + pgcache_page_no_t page, + my_bool index); +void tprint(FILE *trace_file, const char *format, ...) + ATTRIBUTE_FORMAT(printf, 2, 3); +void eprint(FILE *trace_file, const char *format, ...) + ATTRIBUTE_FORMAT(printf, 2, 3); diff --git a/storage/maria/ma_rename.c b/storage/maria/ma_rename.c new file mode 100644 index 00000000000..380f3da3c46 --- /dev/null +++ b/storage/maria/ma_rename.c @@ -0,0 +1,135 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Rename a table +*/ + +#include "ma_fulltext.h" +#include "trnman_public.h" + +/** + @brief renames a table + + Opens the table under its old name. If the table is transactional, not + temporary and we are not in recovery, a LOGREC_REDO_RENAME_TABLE record + holding both names is written and flushed, and its LSN is stored through + _ma_update_state_lsns(). The table is then closed and its index file and + data file are renamed; if renaming the data file fails, the rename of the + index file is undone (the result of that undo is not checked). + + @param old_name current name of table + @param new_name table should be renamed to this name + + @return Operation status + @retval 0 OK + @retval !=0 Error: my_errno, or 1 if writing/flushing the log record or + updating the state LSNs failed +*/ + +int maria_rename(const char *old_name, const char *new_name) +{ + char from[FN_REFLEN],to[FN_REFLEN]; + int data_file_rename_error; +#ifdef USE_RAID + uint raid_type=0,raid_chunks=0; +#endif + MARIA_HA *info; + MARIA_SHARE *share; + myf sync_dir; + DBUG_ENTER("maria_rename"); + +#ifdef EXTRA_DEBUG + _ma_check_table_is_closed(old_name,"rename old_table"); + _ma_check_table_is_closed(new_name,"rename new table2"); +#endif + /** @todo LOCK take X-lock on table */ + if (!(info= maria_open(old_name, O_RDWR, HA_OPEN_FOR_REPAIR))) + DBUG_RETURN(my_errno); + share= info->s; +#ifdef USE_RAID + raid_type = share->base.raid_type; + raid_chunks = share->base.raid_chunks; +#endif + + /* + the renaming of an internal table to the final table (like in ALTER TABLE) + is the moment when this table receives its correct create_rename_lsn and + this is important; make sure transactionality has been re-enabled. + */ + DBUG_ASSERT(share->now_transactional == share->base.born_transactional); + sync_dir= (share->now_transactional && !share->temporary && + !maria_in_recovery) ? 
MY_SYNC_DIR : 0; + if (sync_dir) + { + LSN lsn; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + uint old_name_len= strlen(old_name)+1, new_name_len= strlen(new_name)+1; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar*)old_name; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= old_name_len; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (uchar*)new_name; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= new_name_len; + /* + For this record to be of any use for Recovery, we need the upper + MySQL layer to be crash-safe, which it is not now (that would require + work using the ddl_log of sql/sql_table.cc); when it is, we should + reconsider the moment of writing this log record (before or after op, + under THR_LOCK_maria or not...), how to use it in Recovery. + For now it can serve to apply logs to a backup so we sync it. + */ + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_RENAME_TABLE, + &dummy_transaction_object, NULL, + old_name_len + new_name_len, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL) || + translog_flush(lsn))) + { + maria_close(info); + DBUG_RETURN(1); + } + /* + store LSN into file, needed for Recovery to not be confused if a + RENAME happened (applying REDOs to the wrong table). 
+ */ + if (_ma_update_state_lsns(share, lsn, share->state.create_trid, TRUE, + TRUE)) + { + maria_close(info); + DBUG_RETURN(1); + } + } + + maria_close(info); + + fn_format(from,old_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + fn_format(to,new_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + if (my_rename_with_symlink(from, to, MYF(MY_WME | sync_dir))) + DBUG_RETURN(my_errno); + fn_format(from,old_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + fn_format(to,new_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); +#ifdef USE_RAID + if (raid_type) + data_file_rename_error= my_raid_rename(from, to, raid_chunks, + MYF(MY_WME | sync_dir)); + else +#endif + data_file_rename_error= + my_rename_with_symlink(from, to, MYF(MY_WME | sync_dir)); + if (data_file_rename_error) + { + /* + now we have a renamed index file and a non-renamed data file, try to + undo the rename of the index file. + */ + data_file_rename_error= my_errno; /* keep the error of the failed data file rename before the undo below can change my_errno */ + fn_format(from, old_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT)); + fn_format(to, new_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT)); + my_rename_with_symlink(to, from, MYF(MY_WME | sync_dir)); + } + DBUG_RETURN(data_file_rename_error); + +} diff --git a/storage/maria/ma_rfirst.c b/storage/maria/ma_rfirst.c new file mode 100644 index 00000000000..226aaa551f0 --- /dev/null +++ b/storage/maria/ma_rfirst.c @@ -0,0 +1,26 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + + /* + Read first row through a specific key. + + Clears the current row position and sets HA_STATE_PREV_FOUND, then calls + maria_rnext() on index inx and returns its result. + */ + +int maria_rfirst(MARIA_HA *info, uchar *buf, int inx) +{ + DBUG_ENTER("maria_rfirst"); + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->update|= HA_STATE_PREV_FOUND; + DBUG_RETURN(maria_rnext(info,buf,inx)); +} /* maria_rfirst */ diff --git a/storage/maria/ma_rkey.c b/storage/maria/ma_rkey.c new file mode 100644 index 00000000000..24b275d0ba6 --- /dev/null +++ b/storage/maria/ma_rkey.c @@ -0,0 +1,215 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Read record based on a key */ + +#include "maria_def.h" +#include "ma_rt_index.h" + +/** + Read a record using key + + @param info table handler (MARIA_HA) + @param buf row buffer; may be NULL, in which case only the key is + searched and no row is read + @param inx index number + @param key_data key to search for + @param keypart_map key parts used in key_data; when USE_PACKED_KEYS is set + in info->once_flags it is the length of the already + packed key instead + @param search_flag search mode + + @return 0 if a usable key was found (and the row was read when buf is given), + otherwise my_errno + + @note + Ordinary search_flag is 0 ; Give error if no record with key +*/ + +int maria_rkey(MARIA_HA *info, uchar *buf, int inx, const uchar *key_data, + key_part_map keypart_map, enum ha_rkey_function search_flag) +{ + uchar *key_buff; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + HA_KEYSEG *last_used_keyseg; + uint32 nextflag; + MARIA_KEY key; + int icp_res= 1; + DBUG_ENTER("maria_rkey"); + DBUG_PRINT("enter", ("base: 0x%lx buf: 0x%lx inx: %d search_flag: %d", + (long) info, (long) buf, inx, search_flag)); + + if ((inx = _ma_check_index(info,inx)) < 0) + DBUG_RETURN(my_errno); + + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + info->last_key_func= search_flag; + keyinfo= share->keyinfo + inx; + + key_buff= info->lastkey_buff+info->s->base.max_key_length; + + if (info->once_flags & USE_PACKED_KEYS) + { + info->once_flags&= ~USE_PACKED_KEYS; /* Reset flag */ + /* + key is already packed!; This happens when we are using a MERGE TABLE + In this key 'key_part_map' is the length of the key ! + */ + bmove(key_buff, key_data, keypart_map); + key.data= key_buff; + key.keyinfo= keyinfo; + key.data_length= keypart_map; + key.ref_length= 0; + key.flag= 0; + + last_used_keyseg= keyinfo->seg + info->last_used_keyseg; + } + else + { + DBUG_ASSERT(keypart_map); + /* Save the packed key for later use in the second buffer of lastkey. */ + _ma_pack_key(info, &key, inx, key_buff, key_data, + keypart_map, &last_used_keyseg); + /* Save packed_key_length for use by the MERGE engine. 
*/ + info->pack_key_length= key.data_length; + info->last_used_keyseg= (uint16) (last_used_keyseg - + keyinfo->seg); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, &key);); + } + + if (fast_ma_readinfo(info)) + goto err; + if (share->lock_key_trees) + rw_rdlock(&keyinfo->root_lock); + + nextflag= maria_read_vec[search_flag] | key.flag; + if (search_flag != HA_READ_KEY_EXACT || + ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME)) + nextflag|= SEARCH_SAVE_BUFF; + + switch (keyinfo->key_alg) { +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + if (maria_rtree_find_first(info, &key, nextflag) < 0) + { + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno= HA_ERR_CRASHED; + info->cur_row.lastpos= HA_OFFSET_ERROR; + } + break; +#endif + case HA_KEY_ALG_BTREE: + default: + if (!_ma_search(info, &key, nextflag, info->s->state.key_root[inx])) + { + MARIA_KEY lastkey; + lastkey.keyinfo= keyinfo; + lastkey.data= info->lastkey_buff; + /* + Found a key, but it might not be usable. We cannot use rows that + are inserted by other threads after we got our table lock + ("concurrent inserts"). The record may not even be present yet. + Keys are inserted into the index(es) before the record is + inserted into the data file. + + If index condition is present, it must be either satisfied or + not satisfied with an out-of-range condition. + */ + if ((*share->row_is_visible)(info) && + ((icp_res= ma_check_index_cond(info, inx, buf)) != 0)) + break; + + /* The key references a concurrently inserted record. */ + if (search_flag == HA_READ_KEY_EXACT && + last_used_keyseg == keyinfo->seg + keyinfo->keysegs) + { + /* Simply ignore the key if it matches exactly. (Bug #29838) */ + my_errno= HA_ERR_KEY_NOT_FOUND; + info->cur_row.lastpos= HA_OFFSET_ERROR; + break; + } + + do + { + uint not_used[2]; + /* + Skip rows that are inserted by other threads since we got + a lock. 
Note that this can only happen if we are not + searching after a full length exact key, because the keys + are sorted according to position. + */ + lastkey.data_length= info->last_key.data_length; + lastkey.ref_length= info->last_key.ref_length; + lastkey.flag= info->last_key.flag; + if (_ma_search_next(info, &lastkey, maria_readnext_vec[search_flag], + info->s->state.key_root[inx])) + break; /* purecov: inspected */ + /* + Check that the found key does still match the search. + _ma_search_next() delivers the next key regardless of its + value. + */ + if (!(nextflag & (SEARCH_BIGGER | SEARCH_SMALLER)) && + ha_key_cmp(keyinfo->seg, info->last_key.data, key.data, + key.data_length, SEARCH_FIND, not_used)) + { + /* purecov: begin inspected */ + my_errno= HA_ERR_KEY_NOT_FOUND; + info->cur_row.lastpos= HA_OFFSET_ERROR; + break; + /* purecov: end */ + } + + } while (!(*share->row_is_visible)(info) || + ((icp_res= ma_check_index_cond(info, inx, buf)) == 0)); + } + } + if (share->lock_key_trees) + rw_unlock(&keyinfo->root_lock); + + if (info->cur_row.lastpos == HA_OFFSET_ERROR || (icp_res != 1)) + { + if (icp_res == 2) + { + info->cur_row.lastpos= HA_OFFSET_ERROR; + my_errno= HA_ERR_KEY_NOT_FOUND; + } + fast_ma_writeinfo(info); + goto err; + } + + /* Calculate length of the found key; Used by maria_rnext_same */ + if ((keyinfo->flag & HA_VAR_LENGTH_KEY)) + info->last_rkey_length= _ma_keylength_part(keyinfo, info->lastkey_buff, + last_used_keyseg); + else + info->last_rkey_length= key.data_length; + + /* Check if we don't want to have record back, only error message */ + if (!buf) + { + fast_ma_writeinfo(info); + DBUG_RETURN(0); + } + if (!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + + info->cur_row.lastpos= HA_OFFSET_ERROR; /* Didn't find row */ + +err: + /* Store last used key as a base for read next */ + memcpy(info->last_key.data, key_buff, key.data_length); + 
info->last_key.data_length= key.data_length; + info->last_key.ref_length= info->s->base.rec_reflength; + info->last_key.flag= 0; + /* Create key with rowid 0 */ + bzero((char*) info->last_key.data + info->last_key.data_length, + info->s->base.rec_reflength); + + if (search_flag == HA_READ_AFTER_KEY) + info->update|=HA_STATE_NEXT_FOUND; /* Previous gives last row */ + DBUG_RETURN(my_errno); +} /* maria_rkey */ diff --git a/storage/maria/ma_rlast.c b/storage/maria/ma_rlast.c new file mode 100644 index 00000000000..a9a470d37d9 --- /dev/null +++ b/storage/maria/ma_rlast.c @@ -0,0 +1,26 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + + /* Read last row with the same key as the previous read. 
NOTE(review): the body clears the row position, sets HA_STATE_NEXT_FOUND and calls maria_rprev(), which with that combination takes its "Read last" path -- so this looks like "read the last row of index inx" rather than "the same key"; confirm. */ + +int maria_rlast(MARIA_HA *info, uchar *buf, int inx) +{ + DBUG_ENTER("maria_rlast"); + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->update|= HA_STATE_NEXT_FOUND; + DBUG_RETURN(maria_rprev(info,buf,inx)); +} /* maria_rlast */ diff --git a/storage/maria/ma_rnext.c b/storage/maria/ma_rnext.c new file mode 100644 index 00000000000..bdba5ff3a17 --- /dev/null +++ b/storage/maria/ma_rnext.c @@ -0,0 +1,130 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + +#include "ma_rt_index.h" + + /* + Read next row with the same key as previous read + One may have done a write, update or delete of the previous row. + NOTE! Even if one changes the previous row, the next read is done + based on the position of the last used key! 
+ + When the row position is unset (HA_OFFSET_ERROR) and HA_STATE_PREV_FOUND + is set, the search starts from the first key of index inx instead. + + Returns 0 if a row was found (and read into buf when buf is not NULL), + otherwise my_errno; HA_ERR_KEY_NOT_FOUND is turned into HA_ERR_END_OF_FILE. + */ + +int maria_rnext(MARIA_HA *info, uchar *buf, int inx) +{ + int error,changed; + uint flag; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + int icp_res= 1; + DBUG_ENTER("maria_rnext"); + + if ((inx = _ma_check_index(info,inx)) < 0) + DBUG_RETURN(my_errno); + flag=SEARCH_BIGGER; /* Read next */ + if (info->cur_row.lastpos == HA_OFFSET_ERROR && + info->update & HA_STATE_PREV_FOUND) + flag=0; /* Read first */ + + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + keyinfo= share->keyinfo + inx; + if (share->lock_key_trees) + rw_rdlock(&keyinfo->root_lock); + changed= _ma_test_if_changed(info); + if (!flag) + { + switch (keyinfo->key_alg){ +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + error=maria_rtree_get_first(info, inx, + info->last_key.data_length + + info->last_key.ref_length); + + break; +#endif + case HA_KEY_ALG_BTREE: + default: + error= _ma_search_first(info, keyinfo, share->state.key_root[inx]); + break; + } + } + else + { + switch (keyinfo->key_alg) { +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + /* + Note that rtree doesn't support that the table + may be changed since last call, so we do need + to skip rows inserted by other threads like in btree + */ + error= maria_rtree_get_next(info, inx, info->last_key.data_length + + info->last_key.ref_length); + break; +#endif + case HA_KEY_ALG_BTREE: + default: + if (!changed) + error= _ma_search_next(info, &info->last_key, + flag | info->last_key.flag, + share->state.key_root[inx]); + else + error= _ma_search(info, &info->last_key, flag | info->last_key.flag, + share->state.key_root[inx]); + } + } + + if (!error) + { + while (!(*share->row_is_visible)(info) || + ((icp_res= ma_check_index_cond(info, inx, buf)) == 0)) + { + /* Skip rows inserted by other threads since we got a lock */ + if ((error= _ma_search_next(info, &info->last_key, + SEARCH_BIGGER, + share->state.key_root[inx]))) + break; + } + } + if (share->lock_key_trees) + rw_unlock(&keyinfo->root_lock); + + /* Don't clear if 
database-changed */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + info->update|= HA_STATE_NEXT_FOUND; + + if (icp_res == 2) + my_errno=HA_ERR_END_OF_FILE; /* got beyond the end of scanned range */ + + if (error || icp_res != 1) + { + if (my_errno == HA_ERR_KEY_NOT_FOUND) + my_errno=HA_ERR_END_OF_FILE; + } + else if (!buf) + { + DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0); + } + else if (!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + DBUG_PRINT("error",("Got error: %d, errno: %d",error, my_errno)); + DBUG_RETURN(my_errno); +} /* maria_rnext */ diff --git a/storage/maria/ma_rnext_same.c b/storage/maria/ma_rnext_same.c new file mode 100644 index 00000000000..f67a76a366f --- /dev/null +++ b/storage/maria/ma_rnext_same.c @@ -0,0 +1,113 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#include "ma_rt_index.h" + +/* + Read next row with the same key as previous read, but abort if + the key changes. + One may have done a write, update or delete of the previous row. + + NOTE! Even if one changes the previous row, the next read is done + based on the position of the last used key! 
+ + Returns HA_ERR_WRONG_INDEX if there is no current index or row position. + Otherwise returns 0 if a row was found (and read into buf when buf is not + NULL), or my_errno, which is set to HA_ERR_END_OF_FILE when the key changes. +*/ + +int maria_rnext_same(MARIA_HA *info, uchar *buf) +{ + int error; + uint inx,not_used[2]; + MARIA_KEYDEF *keyinfo; + int icp_res= 1; + DBUG_ENTER("maria_rnext_same"); + + if ((int) (inx= info->lastinx) < 0 || + info->cur_row.lastpos == HA_OFFSET_ERROR) + DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX); + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + + keyinfo= info->s->keyinfo+inx; + if (info->s->lock_key_trees) + rw_rdlock(&keyinfo->root_lock); + + switch (keyinfo->key_alg) { +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + if ((error=maria_rtree_find_next(info,inx, + maria_read_vec[info->last_key_func]))) + { + error=1; + my_errno=HA_ERR_END_OF_FILE; + info->cur_row.lastpos= HA_OFFSET_ERROR; + break; + } + break; +#endif + case HA_KEY_ALG_BTREE: + default: + if (!(info->update & HA_STATE_RNEXT_SAME)) + { + /* First rnext_same; Store old key */ + memcpy(info->lastkey_buff2, info->last_key.data, + info->last_rkey_length); + } + for (;;) + { + if ((error= _ma_search_next(info, &info->last_key, + SEARCH_BIGGER, + info->s->state.key_root[inx]))) + break; + if (ha_key_cmp(keyinfo->seg, info->last_key.data, + info->lastkey_buff2, + info->last_rkey_length, SEARCH_FIND, + not_used)) + { + error=1; + my_errno=HA_ERR_END_OF_FILE; + info->cur_row.lastpos= HA_OFFSET_ERROR; + break; + } + /* Skip rows that are inserted by other threads since we got a lock */ + if ((info->s->row_is_visible)(info) && + ((icp_res= ma_check_index_cond(info, inx, buf)) != 0)) + break; + } + } + if (info->s->lock_key_trees) + rw_unlock(&keyinfo->root_lock); + /* Don't clear if database-changed */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + info->update|= HA_STATE_NEXT_FOUND | HA_STATE_RNEXT_SAME; + + if (icp_res == 2) + my_errno=HA_ERR_END_OF_FILE; /* got beyond the end of scanned range */ + + if (error || icp_res != 1) + { + if (my_errno == HA_ERR_KEY_NOT_FOUND) + my_errno=HA_ERR_END_OF_FILE; + } + else if (!buf) + { + DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR 
? my_errno : 0); + } + else if (!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + DBUG_RETURN(my_errno); +} /* maria_rnext_same */ diff --git a/storage/maria/ma_rprev.c b/storage/maria/ma_rprev.c new file mode 100644 index 00000000000..b9f46d7c405 --- /dev/null +++ b/storage/maria/ma_rprev.c @@ -0,0 +1,86 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + + /* + Read previous row with the same key as previous read + One may have done a write, update or delete of the previous row. + NOTE! Even if one changes the previous row, the next read is done + based on the position of the last used key! 
+ */ + +int maria_rprev(MARIA_HA *info, uchar *buf, int inx) +{ + int error,changed; + register uint flag; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + DBUG_ENTER("maria_rprev"); + + if ((inx = _ma_check_index(info,inx)) < 0) + DBUG_RETURN(my_errno); + flag=SEARCH_SMALLER; /* Read previous */ + if (info->cur_row.lastpos == HA_OFFSET_ERROR && + info->update & HA_STATE_NEXT_FOUND) + flag=0; /* Read last */ + + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + keyinfo= share->keyinfo + inx; + changed= _ma_test_if_changed(info); + if (share->lock_key_trees) + rw_rdlock(&keyinfo->root_lock); + if (!flag) + error= _ma_search_last(info, keyinfo, share->state.key_root[inx]); + else if (!changed) + error= _ma_search_next(info, &info->last_key, + flag | info->last_key.flag, + share->state.key_root[inx]); + else + error= _ma_search(info, &info->last_key, flag | info->last_key.flag, + share->state.key_root[inx]); + + if (!error) + { + while (!(*share->row_is_visible)(info)) + { + /* Skip rows that are inserted by other threads since we got a lock */ + if ((error= _ma_search_next(info, &info->last_key, + SEARCH_SMALLER, + share->state.key_root[inx]))) + break; + } + } + if (share->lock_key_trees) + rw_unlock(&keyinfo->root_lock); + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + info->update|= HA_STATE_PREV_FOUND; + if (error) + { + if (my_errno == HA_ERR_KEY_NOT_FOUND) + my_errno=HA_ERR_END_OF_FILE; + } + else if (!buf) + { + DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? 
my_errno : 0); + } + else if (!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + DBUG_RETURN(my_errno); +} /* maria_rprev */ diff --git a/storage/maria/ma_rrnd.c b/storage/maria/ma_rrnd.c new file mode 100644 index 00000000000..24c4bfdd467 --- /dev/null +++ b/storage/maria/ma_rrnd.c @@ -0,0 +1,44 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Read a record with random-access. The position to the record must + get by MARIA_HA. The next record can be read with pos= MARIA_POS_ERROR */ + + +#include "maria_def.h" + +/* + Read a row based on position. + + RETURN + 0 Ok. + HA_ERR_RECORD_DELETED Record is deleted. + HA_ERR_END_OF_FILE EOF. 
+*/ + +int maria_rrnd(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos) +{ + DBUG_ENTER("maria_rrnd"); + + DBUG_ASSERT(filepos != HA_OFFSET_ERROR); + + /* Init all but update-flag */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache)) + DBUG_RETURN(my_errno); + + info->cur_row.lastpos= filepos; /* Remember for update */ + DBUG_RETURN((*info->s->read_record)(info, buf, filepos)); +} diff --git a/storage/maria/ma_rsame.c b/storage/maria/ma_rsame.c new file mode 100644 index 00000000000..4bdbfd526ba --- /dev/null +++ b/storage/maria/ma_rsame.c @@ -0,0 +1,78 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + +/** + Find current row with read on position or read on key + + @notes + If inx >= 0 find record using key + + @warning + This function is not row version safe. + This is not critical as this function is not used by MySQL + + @return + @retval 0 Ok + @retval HA_ERR_KEY_NOT_FOUND Row is deleted + @retval HA_ERR_END_OF_FILE End of file +*/ + + +int maria_rsame(MARIA_HA *info, uchar *record, int inx) +{ + DBUG_ENTER("maria_rsame"); + + if (inx != -1 && ! 
maria_is_key_active(info->s->state.key_map, inx)) + { + DBUG_PRINT("error", ("wrong index usage")); + DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX); + } + if (info->cur_row.lastpos == HA_OFFSET_ERROR || + info->update & HA_STATE_DELETED) + { + DBUG_PRINT("error", ("no current record")); + DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); /* No current record */ + } + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + /* Read row from data file */ + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + + if (inx >= 0) + { + MARIA_KEYDEF *keyinfo= info->s->keyinfo + inx; + info->lastinx= inx; + (*keyinfo->make_key)(info, &info->last_key, (uint) inx, + info->lastkey_buff, record, + info->cur_row.lastpos, + info->cur_row.trid); + if (info->s->lock_key_trees) + rw_rdlock(&keyinfo->root_lock); + VOID(_ma_search(info, &info->last_key, SEARCH_SAME, + info->s->state.key_root[inx])); + if (info->s->lock_key_trees) + rw_unlock(&keyinfo->root_lock); + } + + if (!(*info->read_record)(info, record, info->cur_row.lastpos)) + DBUG_RETURN(0); + if (my_errno == HA_ERR_RECORD_DELETED) + my_errno=HA_ERR_KEY_NOT_FOUND; + DBUG_PRINT("error", ("my_errno: %d", my_errno)); + DBUG_RETURN(my_errno); +} /* maria_rsame */ diff --git a/storage/maria/ma_rsamepos.c b/storage/maria/ma_rsamepos.c new file mode 100644 index 00000000000..d2099e7b116 --- /dev/null +++ b/storage/maria/ma_rsamepos.c @@ -0,0 +1,63 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* read record through position and fix key-position */ +/* As maria_rsame but supply a position */ + +#include "maria_def.h" + + +/* + Read row based on position + + @param inx If inx >= 0 position the given index on found row + + @return + @retval 0 Ok + @retval HA_ERR_KEY_NOT_FOUND Row is deleted + @retval HA_ERR_END_OF_FILE End of file +*/ + +int maria_rsame_with_pos(MARIA_HA *info, uchar *record, int inx, + MARIA_RECORD_POS filepos) +{ + DBUG_ENTER("maria_rsame_with_pos"); + DBUG_PRINT("enter",("index: %d filepos: %ld", inx, (long) filepos)); + + if (inx < -1 || + (inx >= 0 && ! maria_is_key_active(info->s->state.key_map, inx))) + { + DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX); + } + + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + if ((*info->s->read_record)(info, record, filepos)) + { + if (my_errno == HA_ERR_RECORD_DELETED) + my_errno=HA_ERR_KEY_NOT_FOUND; + DBUG_RETURN(my_errno); + } + info->cur_row.lastpos= filepos; + info->lastinx= inx; + if (inx >= 0) + { + (*info->s->keyinfo[inx].make_key)(info, &info->last_key, (uint) inx, + info->lastkey_buff, + record, info->cur_row.lastpos, + info->cur_row.trid); + info->update|=HA_STATE_KEY_CHANGED; /* Don't use index position */ + } + DBUG_RETURN(0); +} /* maria_rsame_with_pos */ diff --git a/storage/maria/ma_rt_index.c b/storage/maria/ma_rt_index.c new file mode 100644 index 00000000000..62474dbbad8 --- /dev/null +++ b/storage/maria/ma_rt_index.c @@ -0,0 +1,1343 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#include "trnman.h" +#include "ma_key_recover.h" + +#ifdef HAVE_RTREE_KEYS + +#include "ma_rt_index.h" +#include "ma_rt_key.h" +#include "ma_rt_mbr.h" + +#define REINSERT_BUFFER_INC 10 +#define PICK_BY_AREA +/*#define PICK_BY_PERIMETER*/ + +typedef struct st_page_level +{ + uint level; + my_off_t offs; +} stPageLevel; + +typedef struct st_page_list +{ + uint n_pages; + uint m_pages; + stPageLevel *pages; +} stPageList; + + +/* + Find next key in r-tree according to search_flag recursively + + NOTES + Used in maria_rtree_find_first() and maria_rtree_find_next() + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +static int maria_rtree_find_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uint32 search_flag, + uint nod_cmp_flag, my_off_t page_pos, + int level) +{ + MARIA_SHARE *share= info->s; + uint nod_flag; + int res; + uchar *page_buf, *k, *last; + int key_data_length; + uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level; + MARIA_PAGE page; + + if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length))) + { + my_errno= HA_ERR_OUT_OF_MEM; + return -1; + } + if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, page_buf, 0)) + goto err; + nod_flag= page.node; + + key_data_length= keyinfo->keylength - share->base.rec_reflength; + + if (info->maria_rtree_recursion_depth >= level) + { + k= page_buf + *saved_key; + } + else + { + k= rt_PAGE_FIRST_KEY(share, page_buf, nod_flag); + } + last= rt_PAGE_END(&page); + + for (; k < 
last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag)) + { + if (nod_flag) + { + /* this is an internal node in the tree */ + if (!(res= maria_rtree_key_cmp(keyinfo->seg, + info->first_mbr_key, k, + info->last_rkey_length, nod_cmp_flag))) + { + switch ((res= maria_rtree_find_req(info, keyinfo, search_flag, + nod_cmp_flag, + _ma_kpos(nod_flag, k), + level + 1))) + { + case 0: /* found - exit from recursion */ + *saved_key= k - page_buf; + goto ok; + case 1: /* not found - continue searching */ + info->maria_rtree_recursion_depth= level; + break; + default: /* error */ + case -1: + goto err; + } + } + } + else + { + /* this is a leaf */ + if (!maria_rtree_key_cmp(keyinfo->seg, info->first_mbr_key, + k, info->last_rkey_length, search_flag)) + { + uchar *after_key= rt_PAGE_NEXT_KEY(share, k, key_data_length, 0); + MARIA_KEY tmp_key; + + /* + We don't need to set all MARIA_KEY elements here as + _ma_row_pos_from_key() only uses a few of them. + */ + tmp_key.keyinfo= keyinfo; + tmp_key.data= k; + tmp_key.data_length= key_data_length; + + info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key); + info->last_key.keyinfo= keyinfo; + info->last_key.data_length= key_data_length; + info->last_key.ref_length= share->base.rec_reflength; + info->last_key.flag= 0; + memcpy(info->last_key.data, k, + info->last_key.data_length + info->last_key.ref_length); + info->maria_rtree_recursion_depth= level; + *saved_key= last - page_buf; + + if (after_key < last) + { + uchar *keyread_buff= info->keyread_buff; + info->int_keypos= keyread_buff; + info->int_maxpos= keyread_buff + (last - after_key); + memcpy(keyread_buff, after_key, last - after_key); + info->keyread_buff_used= 0; + } + else + { + info->keyread_buff_used= 1; + } + + res= 0; + goto ok; + } + } + } + info->cur_row.lastpos= HA_OFFSET_ERROR; + my_errno= HA_ERR_KEY_NOT_FOUND; + res= 1; + +ok: + my_afree(page_buf); + return res; + +err: + my_afree(page_buf); + info->cur_row.lastpos= HA_OFFSET_ERROR; + return -1; +} + + +/* + 
Find first key in r-tree according to search_flag condition + + SYNOPSIS + maria_rtree_find_first() + info Handler to MARIA file + key Key to search for + search_flag Bitmap of flags how to do the search + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +int maria_rtree_find_first(MARIA_HA *info, MARIA_KEY *key, uint32 search_flag) +{ + my_off_t root; + uint nod_cmp_flag; + MARIA_KEYDEF *keyinfo= key->keyinfo; + + if ((root= info->s->state.key_root[keyinfo->key_nr]) == HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + return -1; + } + + /* + Save searched key, include data pointer. + The data pointer is required if the search_flag contains MBR_DATA. + (minimum bounding rectangle) + */ + memcpy(info->first_mbr_key, key->data, key->data_length + key->ref_length); + info->last_rkey_length= key->data_length; + + info->maria_rtree_recursion_depth= -1; + info->keyread_buff_used= 1; + + nod_cmp_flag= ((search_flag & (MBR_EQUAL | MBR_WITHIN)) ? + MBR_WITHIN : MBR_INTERSECT); + return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root, + 0); +} + + +/* + Find next key in r-tree according to search_flag condition + + SYNOPSIS + maria_rtree_find_next() + info Handler to MARIA file + uint keynr Key number to use + search_flag Bitmap of flags how to do the search + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint32 search_flag) +{ + my_off_t root; + uint32 nod_cmp_flag; + MARIA_KEYDEF *keyinfo= info->s->keyinfo + keynr; + DBUG_ASSERT(info->last_key.keyinfo == keyinfo); + + if (info->update & HA_STATE_DELETED) + return maria_rtree_find_first(info, &info->last_key, search_flag); + + if (!info->keyread_buff_used) + { + uchar *key= info->int_keypos; + + while (key < info->int_maxpos) + { + if (!maria_rtree_key_cmp(keyinfo->seg, + info->first_mbr_key, key, + info->last_rkey_length, search_flag)) + { + uchar *after_key= key + keyinfo->keylength; + MARIA_KEY tmp_key; + + /* + We don't need to set all 
MARIA_KEY elements here as + _ma_row_pos_from_key only uses a few of them. + */ + tmp_key.keyinfo= keyinfo; + tmp_key.data= key; + tmp_key.data_length= keyinfo->keylength - info->s->base.rec_reflength; + + info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key); + memcpy(info->last_key.data, key, info->last_key.data_length); + + if (after_key < info->int_maxpos) + info->int_keypos= after_key; + else + info->keyread_buff_used= 1; + return 0; + } + key+= keyinfo->keylength; + } + } + if ((root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + return -1; + } + + nod_cmp_flag= (((search_flag & (MBR_EQUAL | MBR_WITHIN)) ? + MBR_WITHIN : MBR_INTERSECT)); + return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root, + 0); +} + + +/* + Get next key in r-tree recursively + + NOTES + Used in maria_rtree_get_first() and maria_rtree_get_next() + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +static int maria_rtree_get_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uint key_length, my_off_t page_pos, int level) +{ + MARIA_SHARE *share= info->s; + uchar *page_buf, *last, *k; + uint nod_flag, key_data_length; + int res; + uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level; + MARIA_PAGE page; + + if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length))) + return -1; + if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, page_buf, 0)) + goto err; + nod_flag= page.node; + + key_data_length= keyinfo->keylength - share->base.rec_reflength; + + if (info->maria_rtree_recursion_depth >= level) + { + k= page.buff + *saved_key; + if (!nod_flag) + { + /* Only leaf pages contain data references. */ + /* Need to check next key with data reference. 
*/ + k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag); + } + } + else + { + k= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag); + } + last= rt_PAGE_END(&page); + + for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag)) + { + if (nod_flag) + { + /* this is an internal node in the tree */ + switch ((res= maria_rtree_get_req(info, keyinfo, key_length, + _ma_kpos(nod_flag, k), level + 1))) + { + case 0: /* found - exit from recursion */ + *saved_key= k - page.buff; + goto ok; + case 1: /* not found - continue searching */ + info->maria_rtree_recursion_depth= level; + break; + default: + case -1: /* error */ + goto err; + } + } + else + { + /* this is a leaf */ + uchar *after_key= rt_PAGE_NEXT_KEY(share, k, key_data_length, 0); + MARIA_KEY tmp_key; + + /* + We don't need to set all MARIA_KEY elements here as + _ma_row_pos_from_key() only uses a few of them. + */ + tmp_key.keyinfo= keyinfo; + tmp_key.data= k; + tmp_key.data_length= key_data_length; + + info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key); + info->last_key.data_length= key_data_length; + info->last_key.ref_length= share->base.rec_reflength; + + memcpy(info->last_key.data, k, + info->last_key.data_length + info->last_key.ref_length); + + info->maria_rtree_recursion_depth= level; + *saved_key= k - page.buff; + + if (after_key < last) + { + uchar *keyread_buff= info->keyread_buff; + info->last_rtree_keypos= saved_key; + memcpy(keyread_buff, page.buff, page.size); + info->int_maxpos= keyread_buff + page.size; + info->keyread_buff_used= 0; + } + else + { + info->keyread_buff_used= 1; + } + + res= 0; + goto ok; + } + } + info->cur_row.lastpos= HA_OFFSET_ERROR; + my_errno= HA_ERR_KEY_NOT_FOUND; + res= 1; + +ok: + my_afree(page_buf); + return res; + +err: + my_afree(page_buf); + info->cur_row.lastpos= HA_OFFSET_ERROR; + return -1; +} + + +/* + Get first key in r-tree + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +int maria_rtree_get_first(MARIA_HA *info, uint keynr, uint 
key_length) +{ + my_off_t root; + MARIA_KEYDEF *keyinfo= info->s->keyinfo + keynr; + + if ((root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + return -1; + } + + info->maria_rtree_recursion_depth= -1; + info->keyread_buff_used= 1; + + return maria_rtree_get_req(info, keyinfo, key_length, root, 0); +} + + +/* + Get next key in r-tree + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length) +{ + my_off_t root; + MARIA_KEYDEF *keyinfo= info->s->keyinfo + keynr; + uchar *keyread_buff= info->keyread_buff; + + if (!info->keyread_buff_used) + { + uint key_data_length= keyinfo->keylength - info->s->base.rec_reflength; + /* rt_PAGE_NEXT_KEY(*info->last_rtree_keypos) */ + uchar *key= keyread_buff + *info->last_rtree_keypos + keyinfo->keylength; + /* rt_PAGE_NEXT_KEY(key) */ + uchar *after_key= key + keyinfo->keylength; + MARIA_KEY tmp_key; + + tmp_key.keyinfo= keyinfo; + tmp_key.data= key; + tmp_key.data_length= key_data_length; + tmp_key.ref_length= info->s->base.rec_reflength; + tmp_key.flag= 0; + + info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key); + _ma_copy_key(&info->last_key, &tmp_key); + + *info->last_rtree_keypos= (uint) (key - keyread_buff); + if (after_key >= info->int_maxpos) + { + info->keyread_buff_used= 1; + } + + return 0; + } + else + { + if ((root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + return -1; + } + + return maria_rtree_get_req(info, &keyinfo[keynr], key_length, root, 0); + } +} + + +/* + Choose non-leaf better key for insertion + + Returns a pointer inside the page_buf buffer. 
+*/ +#ifdef PICK_BY_PERIMETER +static const uchar *maria_rtree_pick_key(const MARIA_KEY *key, + const MARIA_PAGE *page) +{ + double increase; + double best_incr; + double perimeter; + double best_perimeter; + uchar *best_key= NULL; + const MARIA_HA *info= page->info; + + uchar *k= rt_PAGE_FIRST_KEY(info->s, page->buf, page->node); + uchar *last= rt_PAGE_END(info, page); + + LINT_INIT(best_perimeter); + LINT_INIT(best_key); + LINT_INIT(best_incr); + + for (; k < last; k= rt_PAGE_NEXT_KEY(k, key->data_length, nod_flag)) + { + if ((increase= maria_rtree_perimeter_increase(keyinfo->seg, k, key, + &perimeter)) == -1) + return NULL; + if ((increase < best_incr)|| + (increase == best_incr && perimeter < best_perimeter)) + { + best_key= k; + best_perimeter= perimeter; + best_incr= increase; + } + } + return best_key; +} + +#endif /*PICK_BY_PERIMETER*/ + +#ifdef PICK_BY_AREA +static const uchar *maria_rtree_pick_key(const MARIA_KEY *key, + const MARIA_PAGE *page) +{ + const MARIA_HA *info= page->info; + MARIA_SHARE *share= info->s; + double increase; + double best_incr= DBL_MAX; + double area; + double best_area; + const uchar *best_key= NULL; + const uchar *k= rt_PAGE_FIRST_KEY(share, page->buff, page->node); + const uchar *last= rt_PAGE_END(page); + + LINT_INIT(best_area); + + for (; k < last; + k= rt_PAGE_NEXT_KEY(share, k, key->data_length, page->node)) + { + /* The following is safe as -1.0 is an exact number */ + if ((increase= maria_rtree_area_increase(key->keyinfo->seg, k, key->data, + key->data_length + + key->ref_length, + &area)) == -1.0) + return NULL; + /* The following should be safe, even if we compare doubles */ + if (!best_key || increase < best_incr || + ((increase == best_incr) && (area < best_area))) + { + best_key= k; + best_area= area; + best_incr= increase; + } + } + return best_key; +} + +#endif /*PICK_BY_AREA*/ + +/* + Go down and insert key into tree + + RETURN + -1 Error + 0 Child was not split + 1 Child was split +*/ + +static int 
maria_rtree_insert_req(MARIA_HA *info, MARIA_KEY *key, + my_off_t page_pos, my_off_t *new_page, + int ins_level, int level) +{ + uint nod_flag; + uint key_length= key->data_length; + int res; + uchar *page_buf, *k; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + DBUG_ENTER("maria_rtree_insert_req"); + + if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length + + MARIA_MAX_KEY_BUFF))) + { + my_errno= HA_ERR_OUT_OF_MEM; + DBUG_RETURN(-1); /* purecov: inspected */ + } + if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, page_buf, 0)) + goto err; + nod_flag= page.node; + DBUG_PRINT("rtree", ("page: %lu level: %d ins_level: %d nod_flag: %u", + (ulong) page.pos, level, ins_level, nod_flag)); + + if ((ins_level == -1 && nod_flag) || /* key: go down to leaf */ + (ins_level > -1 && ins_level > level)) /* branch: go down to ins_level */ + { + if (!(k= (uchar *)maria_rtree_pick_key(key, &page))) + goto err; + /* k is now a pointer inside the page_buf buffer */ + switch ((res= maria_rtree_insert_req(info, key, + _ma_kpos(nod_flag, k), new_page, + ins_level, level + 1))) + { + case 0: /* child was not split, most common case */ + { + maria_rtree_combine_rect(keyinfo->seg, k, key->data, k, key_length); + if (share->now_transactional && + _ma_log_change(&page, k, key_length, + KEY_OP_DEBUG_RTREE_COMBINE)) + goto err; + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + goto ok; + } + case 1: /* child was split */ + { + /* Set new_key to point to a free buffer area */ + uchar *new_key_buff= page_buf + keyinfo->block_length + nod_flag; + MARIA_KEY new_key; + MARIA_KEY k_key; + + DBUG_ASSERT(nod_flag); + k_key.keyinfo= new_key.keyinfo= keyinfo; + new_key.data= new_key_buff; + k_key.data= k; + k_key.data_length= new_key.data_length= key->data_length; + k_key.ref_length= new_key.ref_length= key->ref_length; + 
k_key.flag= new_key.flag= 0; /* Safety */ + + /* set proper MBR for key */ + if (maria_rtree_set_key_mbr(info, &k_key, _ma_kpos(nod_flag, k))) + goto err; + if (share->now_transactional && + _ma_log_change(&page, k, key_length, + KEY_OP_DEBUG_RTREE_SPLIT)) + goto err; + /* add new key for new page */ + _ma_kpointer(info, new_key_buff - nod_flag, *new_page); + if (maria_rtree_set_key_mbr(info, &new_key, *new_page)) + goto err; + res= maria_rtree_add_key(&new_key, &page, new_page); + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + goto ok; + } + default: + case -1: /* error */ + { + goto err; + } + } + } + else + { + res= maria_rtree_add_key(key, &page, new_page); + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + +ok: + my_afree(page_buf); + DBUG_RETURN(res); + +err: + res= -1; /* purecov: inspected */ + goto ok; /* purecov: inspected */ +} + + +/** + Insert key into the tree + + @param info table + @param key KEY to insert + @param ins_level at which level key insertion should start + @param root put new key_root there + + @return Operation result + @retval -1 Error + @retval 0 Root was not split + @retval 1 Root was split +*/ + +int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key, int ins_level, + my_off_t *root) +{ + my_off_t old_root; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + int res; + my_off_t new_page; + enum pagecache_page_lock write_lock; + DBUG_ENTER("maria_rtree_insert_level"); + + if ((old_root= share->state.key_root[keyinfo->key_nr]) == HA_OFFSET_ERROR) + { + MARIA_PINNED_PAGE tmp_page_link, *page_link; + MARIA_PAGE page; + + page_link= &tmp_page_link; + if ((old_root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == + HA_OFFSET_ERROR) + DBUG_RETURN(-1); + write_lock= page_link->write_lock; + info->keyread_buff_used= 1; + bzero(info->buff, 
share->block_size); + _ma_store_keynr(share, info->buff, keyinfo->key_nr); + _ma_store_page_used(share, info->buff, share->keypage_header); + _ma_page_setup(&page, info, keyinfo, old_root, info->buff); + + /* + Failures below are reported as -1: a return value of 1 means + "root was split" to the callers, which only test for -1 to detect errors. + */ + if (share->now_transactional && _ma_log_new(&page, 1)) + DBUG_RETURN(-1); + + res= maria_rtree_add_key(key, &page, NULL); + if (_ma_write_keypage(&page, write_lock, DFLT_INIT_HITS)) + DBUG_RETURN(-1); + *root= old_root; + DBUG_RETURN(res); + } + + switch ((res= maria_rtree_insert_req(info, key, old_root, &new_page, + ins_level, 0))) + { + case 0: /* root was not split */ + { + break; + } + case 1: /* root was split, grow a new root; very rare */ + { + uchar *new_root_buf, *new_key_buff; + my_off_t new_root; + uint nod_flag= share->base.key_reflength; + MARIA_PINNED_PAGE tmp_page_link, *page_link; + MARIA_KEY new_key; + MARIA_PAGE page; + page_link= &tmp_page_link; + + DBUG_PRINT("rtree", ("root was split, grow a new root")); + if (!(new_root_buf= (uchar*) my_alloca((uint) keyinfo->block_length + + MARIA_MAX_KEY_BUFF))) + { + my_errno= HA_ERR_OUT_OF_MEM; + DBUG_RETURN(-1); /* purecov: inspected */ + } + + bzero(new_root_buf, share->block_size); + _ma_store_keypage_flag(share, new_root_buf, KEYPAGE_FLAG_ISNOD); + _ma_store_keynr(share, new_root_buf, keyinfo->key_nr); + _ma_store_page_used(share, new_root_buf, share->keypage_header); + if ((new_root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == + HA_OFFSET_ERROR) + goto err; + write_lock= page_link->write_lock; + + _ma_page_setup(&page, info, keyinfo, new_root, new_root_buf); + + if (share->now_transactional && _ma_log_new(&page, 1)) + goto err; + + /* Point to some free space */ + new_key_buff= new_root_buf + keyinfo->block_length + nod_flag; + new_key.keyinfo= keyinfo; + new_key.data= new_key_buff; + new_key.data_length= key->data_length; + new_key.ref_length= key->ref_length; + new_key.flag= 0; + + _ma_kpointer(info, new_key_buff - nod_flag, old_root); + if (maria_rtree_set_key_mbr(info, &new_key, old_root)) + 
goto err; + if (maria_rtree_add_key(&new_key, &page, NULL) + == -1) + goto err; + _ma_kpointer(info, new_key_buff - nod_flag, new_page); + if (maria_rtree_set_key_mbr(info, &new_key, new_page)) + goto err; + if (maria_rtree_add_key(&new_key, &page, NULL) + == -1) + goto err; + if (_ma_write_keypage(&page, write_lock, DFLT_INIT_HITS)) + goto err; + *root= new_root; + DBUG_PRINT("rtree", ("new root page: %lu level: %d nod_flag: %u", + (ulong) new_root, 0, page.node)); + + my_afree(new_root_buf); + break; +err: + my_afree(new_root_buf); + DBUG_RETURN(-1); /* purecov: inspected */ + } + default: + case -1: /* error */ + { + DBUG_ASSERT(0); + break; + } + } + DBUG_RETURN(res); +} + + +/* + Insert key into the tree - interface function + + RETURN + 1 Error + 0 OK +*/ + +my_bool maria_rtree_insert(MARIA_HA *info, MARIA_KEY *key) +{ + int res; + MARIA_SHARE *share= info->s; + my_off_t *root, new_root; + LSN lsn= LSN_IMPOSSIBLE; + DBUG_ENTER("maria_rtree_insert"); + + if (!key) + DBUG_RETURN(1); /* _ma_sp_make_key failed */ + + root= &share->state.key_root[key->keyinfo->key_nr]; + new_root= *root; + + if ((res= (maria_rtree_insert_level(info, key, -1, &new_root) == -1))) + goto err; + if (share->now_transactional) + res= _ma_write_undo_key_insert(info, key, root, new_root, &lsn); + else + { + *root= new_root; + _ma_fast_unlock_key_del(info); + } + _ma_unpin_all_pages_and_finalize_row(info, lsn); +err: + DBUG_RETURN(res != 0); +} + + +/* + Fill reinsert page buffer + + RETURN + 1 Error + 0 OK +*/ + +static my_bool maria_rtree_fill_reinsert_list(stPageList *ReinsertList, + my_off_t page, int level) +{ + DBUG_ENTER("maria_rtree_fill_reinsert_list"); + DBUG_PRINT("rtree", ("page: %lu level: %d", (ulong) page, level)); + if (ReinsertList->n_pages == ReinsertList->m_pages) + { + ReinsertList->m_pages += REINSERT_BUFFER_INC; + if (!(ReinsertList->pages= (stPageLevel*)my_realloc((uchar*)ReinsertList->pages, + ReinsertList->m_pages * sizeof(stPageLevel), MYF(MY_ALLOW_ZERO_PTR)))) + 
goto err; + } + /* save page to ReinsertList */ + ReinsertList->pages[ReinsertList->n_pages].offs= page; + ReinsertList->pages[ReinsertList->n_pages].level= level; + ReinsertList->n_pages++; + DBUG_RETURN(0); + +err: + DBUG_RETURN(1); /* purecov: inspected */ +} + + +/* + Go down and delete key from the tree + + RETURN + -1 Error + 0 Deleted + 1 Not found + 2 Empty leaf +*/ + +static int maria_rtree_delete_req(MARIA_HA *info, const MARIA_KEY *key, + my_off_t page_pos, uint *page_size, + stPageList *ReinsertList, int level) +{ + ulong i; + uint nod_flag; + int res; + uchar *page_buf, *last, *k; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + DBUG_ENTER("maria_rtree_delete_req"); + + if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length))) + { + my_errno= HA_ERR_OUT_OF_MEM; + DBUG_RETURN(-1); /* purecov: inspected */ + } + if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, page_buf, 0)) + goto err; + nod_flag= page.node; + DBUG_PRINT("rtree", ("page: %lu level: %d nod_flag: %u", + (ulong) page_pos, level, nod_flag)); + + k= rt_PAGE_FIRST_KEY(share, page_buf, nod_flag); + last= rt_PAGE_END(&page); + + for (i= 0; + k < last; + k= rt_PAGE_NEXT_KEY(share, k, key->data_length, nod_flag), i++) + { + if (nod_flag) + { + /* not leaf */ + if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key->data_length, + MBR_WITHIN)) + { + switch ((res= maria_rtree_delete_req(info, key, + _ma_kpos(nod_flag, k), + page_size, ReinsertList, + level + 1))) + { + case 0: /* deleted */ + { + /* test page filling */ + if (*page_size + key->data_length >= + rt_PAGE_MIN_SIZE(keyinfo->block_length)) + { + /* OK */ + /* Calculate a new key value (MBR) for the shrinked block. 
*/ + MARIA_KEY tmp_key; + tmp_key.keyinfo= keyinfo; + tmp_key.data= k; + tmp_key.data_length= key->data_length; + tmp_key.ref_length= key->ref_length; + tmp_key.flag= 0; /* Safety */ + + if (maria_rtree_set_key_mbr(info, &tmp_key, + _ma_kpos(nod_flag, k))) + goto err; + if (share->now_transactional && + _ma_log_change(&page, k, key->data_length, + KEY_OP_DEBUG_RTREE_SET_KEY)) + goto err; + /* Terminate the statement explicitly instead of relying on the macro body */ + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + else + { + /* + Too small: delete key & add its descendant to reinsert list. + Store position and level of the block so that it can be + accessed later for inserting the remaining keys. + */ + DBUG_PRINT("rtree", ("too small. move block to reinsert list")); + if (maria_rtree_fill_reinsert_list(ReinsertList, + _ma_kpos(nod_flag, k), + level + 1)) + goto err; + /* + Delete the key that references the block. This makes the + block disappear from the index. Hence we need to insert + its remaining keys later. Note: if the block is a branch + block, we do not only remove this block, but the whole + subtree. So we need to re-insert its keys on the same + level later to reintegrate the subtrees. 
+ */ + if (maria_rtree_delete_key(&page, k, key->data_length)) + goto err; + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + *page_size= page.size; + } + + goto ok; + } + case 1: /* not found - continue searching */ + { + break; + } + case 2: /* vacuous case: last key in the leaf */ + { + if (maria_rtree_delete_key(&page, k, key->data_length)) + goto err; + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + *page_size= page.size; + res= 0; + goto ok; + } + default: /* error */ + case -1: + { + goto err; + } + } + } + } + else + { + /* leaf */ + if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key->data_length, + MBR_EQUAL | MBR_DATA)) + { + page_mark_changed(info, &page); + if (maria_rtree_delete_key(&page, k, key->data_length)) + goto err; + *page_size= page.size; + if (*page_size == info->s->keypage_header) + { + /* last key in the leaf */ + res= 2; + if (_ma_dispose(info, page.pos, 0)) + goto err; + } + else + { + res= 0; + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + goto ok; + } + } + } + res= 1; + +ok: + my_afree(page_buf); + DBUG_RETURN(res); + +err: + my_afree(page_buf); + DBUG_RETURN(-1); /* purecov: inspected */ +} + + +/* + Delete key - interface function + + RETURN + 1 Error + 0 Deleted +*/ + +my_bool maria_rtree_delete(MARIA_HA *info, MARIA_KEY *key) +{ + MARIA_SHARE *share= info->s; + my_off_t new_root= share->state.key_root[key->keyinfo->key_nr]; + int res; + LSN lsn= LSN_IMPOSSIBLE; + DBUG_ENTER("maria_rtree_delete"); + + if ((res= maria_rtree_real_delete(info, key, &new_root))) + goto err; + + if (share->now_transactional) + res= _ma_write_undo_key_delete(info, key, new_root, &lsn); + else + share->state.key_root[key->keyinfo->key_nr]= new_root; + +err: + _ma_fast_unlock_key_del(info); + _ma_unpin_all_pages_and_finalize_row(info, lsn); 
+ DBUG_RETURN(res != 0); +} + + +my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key, + my_off_t *root) +{ + uint page_size; + stPageList ReinsertList; + my_off_t old_root; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + uint key_data_length= key->data_length; + DBUG_ENTER("maria_rtree_real_delete"); + + if ((old_root= share->state.key_root[keyinfo->key_nr]) == + HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + DBUG_RETURN(1); /* purecov: inspected */ + } + DBUG_PRINT("rtree", ("starting deletion at root page: %lu", + (ulong) old_root)); + + ReinsertList.pages= NULL; + ReinsertList.n_pages= 0; + ReinsertList.m_pages= 0; + + switch (maria_rtree_delete_req(info, key, old_root, &page_size, + &ReinsertList, 0)) { + case 2: /* empty */ + { + *root= HA_OFFSET_ERROR; + break; + } + case 0: /* deleted */ + { + uint nod_flag; + ulong i; + uchar *page_buf; + MARIA_PAGE page; + MARIA_KEY tmp_key; + tmp_key.keyinfo= key->keyinfo; + tmp_key.data_length= key->data_length; + tmp_key.ref_length= key->ref_length; + tmp_key.flag= 0; /* Safety */ + + if (ReinsertList.n_pages) + { + if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length))) + { + my_errno= HA_ERR_OUT_OF_MEM; + goto err; + } + + for (i= 0; i < ReinsertList.n_pages; ++i) + { + uchar *k, *last; + if (_ma_fetch_keypage(&page, info, keyinfo, ReinsertList.pages[i].offs, + PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, page_buf, 0)) + goto err; + nod_flag= page.node; + DBUG_PRINT("rtree", ("reinserting keys from " + "page: %lu level: %d nod_flag: %u", + (ulong) ReinsertList.pages[i].offs, + ReinsertList.pages[i].level, nod_flag)); + + k= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag); + last= rt_PAGE_END(&page); + for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length, + nod_flag)) + { + int res; + tmp_key.data= k; + if ((res= maria_rtree_insert_level(info, &tmp_key, + ReinsertList.pages[i].level, + root)) == -1) + { + my_afree(page_buf); + goto err; + } + if (res) + { + uint 
j; + DBUG_PRINT("rtree", ("root has been split, adjust levels")); + for (j= i; j < ReinsertList.n_pages; j++) + { + ReinsertList.pages[j].level++; + DBUG_PRINT("rtree", ("keys from page: %lu now level: %d", + (ulong) ReinsertList.pages[i].offs, + ReinsertList.pages[i].level)); + } + } + } + page_mark_changed(info, &page); + if (_ma_dispose(info, page.pos, 0)) + { + my_afree(page_buf); + goto err; + } + } + my_afree(page_buf); + my_free(ReinsertList.pages, MYF(0)); + } + + /* check for redundant root (not leaf, 1 child) and eliminate */ + if ((old_root= *root) == HA_OFFSET_ERROR) + goto err; + if (_ma_fetch_keypage(&page, info, keyinfo, old_root, + PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, info->buff, 0)) + goto err; + nod_flag= page.node; + if (nod_flag && (page.size == share->keypage_header + key_data_length + + nod_flag)) + { + *root= _ma_kpos(nod_flag, + rt_PAGE_FIRST_KEY(share, info->buff, nod_flag)); + page_mark_changed(info, &page); + if (_ma_dispose(info, page.pos, 0)) + goto err; + } + info->update= HA_STATE_DELETED; + break; + } + case 1: /* not found */ + { + my_errno= HA_ERR_KEY_NOT_FOUND; + goto err; + } + case -1: /* error */ + default: + goto err; /* purecov: inspected */ + } + DBUG_RETURN(0); + +err: + DBUG_RETURN(1); +} + + +/* + Estimate number of suitable keys in the tree + + RETURN + estimated value +*/ + +ha_rows maria_rtree_estimate(MARIA_HA *info, MARIA_KEY *key, uint32 flag) +{ + my_off_t root; + uint i= 0; + uint nod_flag, key_data_length; + uchar *page_buf, *k, *last; + double area= 0; + ha_rows res= 0; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + + if (flag & MBR_DISJOINT) + return info->state->records; + + if ((root= share->state.key_root[key->keyinfo->key_nr]) == HA_OFFSET_ERROR) + return HA_POS_ERROR; + if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length))) + return HA_POS_ERROR; + if (_ma_fetch_keypage(&page, info, keyinfo, root, + PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, 
page_buf, + 0)) + goto err; + nod_flag= page.node; + + key_data_length= key->data_length; + + k= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag); + last= rt_PAGE_END(&page); + + for (; k < last; + k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag), i++) + { + if (nod_flag) + { + double k_area= maria_rtree_rect_volume(keyinfo->seg, k, key_data_length); + + /* The following should be safe, even if we compare doubles */ + if (k_area == 0) + { + if (flag & (MBR_CONTAIN | MBR_INTERSECT)) + { + area+= 1; + } + else if (flag & (MBR_WITHIN | MBR_EQUAL)) + { + if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key_data_length, + MBR_WITHIN)) + area+= 1; + } + else + goto err; + } + else + { + if (flag & (MBR_CONTAIN | MBR_INTERSECT)) + { + area+= maria_rtree_overlapping_area(keyinfo->seg, key->data, k, + key_data_length) / k_area; + } + else if (flag & (MBR_WITHIN | MBR_EQUAL)) + { + if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key_data_length, + MBR_WITHIN)) + area+= (maria_rtree_rect_volume(keyinfo->seg, key->data, + key_data_length) / k_area); + } + else + goto err; + } + } + else + { + if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key_data_length, + flag)) + ++res; + } + } + if (nod_flag) + { + if (i) + res= (ha_rows) (area / i * info->state->records); + else + res= HA_POS_ERROR; + } + + my_afree(page_buf); + return res; + +err: + my_afree(page_buf); + return HA_POS_ERROR; +} + +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_rt_index.h b/storage/maria/ma_rt_index.h new file mode 100644 index 00000000000..dacaa4389b7 --- /dev/null +++ b/storage/maria/ma_rt_index.h @@ -0,0 +1,46 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _rt_index_h +#define _rt_index_h + +#ifdef HAVE_RTREE_KEYS + +#define rt_PAGE_FIRST_KEY(share, page, nod_flag) (page + share->keypage_header + nod_flag) +#define rt_PAGE_NEXT_KEY(share, key, key_length, nod_flag) (key + key_length +\ + (nod_flag ? nod_flag : share->base.rec_reflength)) +#define rt_PAGE_END(page) ((page)->buff + (page)->size) + +#define rt_PAGE_MIN_SIZE(block_length) ((uint)(block_length - KEYPAGE_CHECKSUM_SIZE) / 3) + +my_bool maria_rtree_insert(MARIA_HA *info, MARIA_KEY *key); +my_bool maria_rtree_delete(MARIA_HA *info, MARIA_KEY *key); +int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key, + int ins_level, my_off_t *root); +my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key, + my_off_t *root); +int maria_rtree_find_first(MARIA_HA *info, MARIA_KEY *key, uint search_flag); +int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint32 search_flag); + +int maria_rtree_get_first(MARIA_HA *info, uint keynr, uint key_length); +int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length); + +ha_rows maria_rtree_estimate(MARIA_HA *info, MARIA_KEY *key, uint32 flag); + +int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page, + my_off_t *new_page_offs); +#endif /*HAVE_RTREE_KEYS*/ +#endif /* _rt_index_h */ diff --git a/storage/maria/ma_rt_key.c b/storage/maria/ma_rt_key.c new file mode 100644 index 00000000000..fa173605cd3 --- /dev/null +++ b/storage/maria/ma_rt_key.c @@ -0,0 +1,120 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin + + This program is free 
software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#include "trnman.h" +#include "ma_key_recover.h" + +#ifdef HAVE_RTREE_KEYS +#include "ma_rt_index.h" +#include "ma_rt_key.h" +#include "ma_rt_mbr.h" + +/* + Add key to the page + + RESULT VALUES + -1 Error + 0 Not split + 1 Split +*/ + +int maria_rtree_add_key(const MARIA_KEY *key, MARIA_PAGE *page, + my_off_t *new_page) +{ + MARIA_HA *info= page->info; + MARIA_SHARE *share= info->s; + uint page_size= page->size; + uint nod_flag= page->node; + uchar *key_pos= rt_PAGE_END(page); + uint tot_key_length= key->data_length + key->ref_length + nod_flag; + DBUG_ENTER("maria_rtree_add_key"); + + if (page_size + tot_key_length <= + (uint)(key->keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE)) + { + /* split won't be necessary */ + if (nod_flag) + { + DBUG_ASSERT(_ma_kpos(nod_flag, key->data) < + info->state->key_file_length); + /* We don't store reference to row on nod pages for rtree index */ + tot_key_length-= key->ref_length; + } + /* save key */ + memcpy(key_pos, key->data - nod_flag, tot_key_length); + page->size+= tot_key_length; + page_store_size(share, page); + if (share->now_transactional && + _ma_log_add(page, key_pos - page->buff, + key_pos, tot_key_length, tot_key_length, 0, + KEY_OP_DEBUG_LOG_ADD_1)) + DBUG_RETURN(-1); + DBUG_RETURN(0); + } + DBUG_RETURN(maria_rtree_split_page(key, page, new_page) ? 
-1 : 1); +} + + +/* + Delete key from the page + + Notes + key_length is only the data part of the key +*/ + +int maria_rtree_delete_key(MARIA_PAGE *page, uchar *key, uint key_length) +{ + MARIA_HA *info= page->info; + MARIA_SHARE *share= info->s; + uint key_length_with_nod_flag; + uchar *key_start; + + key_start= key - page->node; + if (!page->node) + key_length+= share->base.rec_reflength; + + memmove(key_start, key + key_length, page->size - key_length - + (key - page->buff)); + key_length_with_nod_flag= key_length + page->node; + page->size-= key_length_with_nod_flag; + page_store_size(share, page); + if (share->now_transactional && + _ma_log_delete(page, key_start, 0, key_length_with_nod_flag, + 0, KEY_OP_DEBUG_LOG_DEL_CHANGE_RT)) + return -1; + return 0; +} + + +/* + Calculate and store key MBR into *key. +*/ + +int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEY *key, + my_off_t child_page) +{ + MARIA_PAGE page; + DBUG_ENTER("maria_rtree_set_key_mbr"); + if (_ma_fetch_keypage(&page, info, key->keyinfo, child_page, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->buff, 0)) + DBUG_RETURN(-1); + + DBUG_RETURN(maria_rtree_page_mbr(key->keyinfo->seg, + &page, key->data, key->data_length)); +} + +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_rt_key.h b/storage/maria/ma_rt_key.h new file mode 100644 index 00000000000..948809f3d38 --- /dev/null +++ b/storage/maria/ma_rt_key.h @@ -0,0 +1,31 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Ramil Kalimullin, who has a shared copyright to this code */ + +#ifndef _rt_key_h +#define _rt_key_h + +#ifdef HAVE_RTREE_KEYS + +int maria_rtree_add_key(const MARIA_KEY *key, MARIA_PAGE *page, + my_off_t *new_page); +int maria_rtree_delete_key(MARIA_PAGE *page, uchar *key, uint key_length); +int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEY *key, + my_off_t child_page); + +#endif /*HAVE_RTREE_KEYS*/ +#endif /* _rt_key_h */ diff --git a/storage/maria/ma_rt_mbr.c b/storage/maria/ma_rt_mbr.c new file mode 100644 index 00000000000..b3e2b0ceab8 --- /dev/null +++ b/storage/maria/ma_rt_mbr.c @@ -0,0 +1,818 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + +#ifdef HAVE_RTREE_KEYS + +#include "ma_rt_index.h" +#include "ma_rt_mbr.h" + +#define INTERSECT_CMP(amin, amax, bmin, bmax) ((amin > bmax) || (bmin > amax)) +#define CONTAIN_CMP(amin, amax, bmin, bmax) ((bmin > amin) || (bmax < amax)) +#define WITHIN_CMP(amin, amax, bmin, bmax) ((amin > bmin) || (amax < bmax)) +#define DISJOINT_CMP(amin, amax, bmin, bmax) ((amin <= bmax) && (bmin <= amax)) +#define EQUAL_CMP(amin, amax, bmin, bmax) ((amin != bmin) || (amax != bmax)) + +#define FCMP(A, B) ((int)(A) - (int)(B)) +#define p_inc(A, B, X) {A += X; B += X;} + +#define RT_CMP(nextflag) \ + if (nextflag & MBR_INTERSECT) \ + { \ + if (INTERSECT_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_CONTAIN) \ + { \ + if (CONTAIN_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_WITHIN) \ + { \ + if (WITHIN_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_EQUAL) \ + { \ + if (EQUAL_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_DISJOINT) \ + { \ + if (DISJOINT_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + }\ + else /* if unknown comparison operator */ \ + { \ + DBUG_ASSERT(0); \ + } + +#define RT_CMP_KORR(type, korr_func, len, nextflag) \ +{ \ + type amin, amax, bmin, bmax; \ + amin= korr_func(a); \ + bmin= korr_func(b); \ + amax= korr_func(a+len); \ + bmax= korr_func(b+len); \ + RT_CMP(nextflag); \ +} + +#define RT_CMP_GET(type, get_func, len, nextflag) \ +{ \ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + RT_CMP(nextflag); \ +} + +/* + Compares two keys a and b depending on nextflag + nextflag can contain these flags: + MBR_INTERSECT(a,b) a 
overlaps b + MBR_CONTAIN(a,b) a contains b + MBR_DISJOINT(a,b) a disjoint b + MBR_WITHIN(a,b) a within b + MBR_EQUAL(a,b) All coordinates of MBRs are equal + MBR_DATA(a,b) Data reference is the same + Returns 0 on success. +*/ + +int maria_rtree_key_cmp(HA_KEYSEG *keyseg, const uchar *b, const uchar *a, + uint key_length, uint32 nextflag) +{ + for (; (int) key_length > 0; keyseg += 2 ) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_CMP_KORR(int8, mi_sint1korr, 1, nextflag); + break; + case HA_KEYTYPE_BINARY: + RT_CMP_KORR(uint8, mi_uint1korr, 1, nextflag); + break; + case HA_KEYTYPE_SHORT_INT: + RT_CMP_KORR(int16, mi_sint2korr, 2, nextflag); + break; + case HA_KEYTYPE_USHORT_INT: + RT_CMP_KORR(uint16, mi_uint2korr, 2, nextflag); + break; + case HA_KEYTYPE_INT24: + RT_CMP_KORR(int32, mi_sint3korr, 3, nextflag); + break; + case HA_KEYTYPE_UINT24: + RT_CMP_KORR(uint32, mi_uint3korr, 3, nextflag); + break; + case HA_KEYTYPE_LONG_INT: + RT_CMP_KORR(int32, mi_sint4korr, 4, nextflag); + break; + case HA_KEYTYPE_ULONG_INT: + RT_CMP_KORR(uint32, mi_uint4korr, 4, nextflag); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_CMP_KORR(longlong, mi_sint8korr, 8, nextflag) + break; + case HA_KEYTYPE_ULONGLONG: + RT_CMP_KORR(ulonglong, mi_uint8korr, 8, nextflag) + break; +#endif + case HA_KEYTYPE_FLOAT: + /* The following should be safe, even if we compare doubles */ + RT_CMP_GET(float, mi_float4get, 4, nextflag); + break; + case HA_KEYTYPE_DOUBLE: + RT_CMP_GET(double, mi_float8get, 8, nextflag); + break; + case HA_KEYTYPE_END: + goto end; + default: + return 1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } + +end: + if (nextflag & MBR_DATA) + { + const uchar *end= a + keyseg->length; + do + { + if (*a++ != *b++) + return FCMP(a[-1], b[-1]); + } while (a != end); + } + return 0; +} + +#define RT_VOL_KORR(type, korr_func, len, cast) \ +{ \ 
+ type amin, amax; \ + amin= korr_func(a); \ + amax= korr_func(a+len); \ + res *= (cast(amax) - cast(amin)); \ +} + +#define RT_VOL_GET(type, get_func, len, cast) \ +{ \ + type amin, amax; \ + get_func(amin, a); \ + get_func(amax, a+len); \ + res *= (cast(amax) - cast(amin)); \ +} + +/* + Calculates rectangle volume +*/ +double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar *a, uint key_length) +{ + double res= 1; + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_VOL_KORR(int8, mi_sint1korr, 1, (double)); + break; + case HA_KEYTYPE_BINARY: + RT_VOL_KORR(uint8, mi_uint1korr, 1, (double)); + break; + case HA_KEYTYPE_SHORT_INT: + RT_VOL_KORR(int16, mi_sint2korr, 2, (double)); + break; + case HA_KEYTYPE_USHORT_INT: + RT_VOL_KORR(uint16, mi_uint2korr, 2, (double)); + break; + case HA_KEYTYPE_INT24: + RT_VOL_KORR(int32, mi_sint3korr, 3, (double)); + break; + case HA_KEYTYPE_UINT24: + RT_VOL_KORR(uint32, mi_uint3korr, 3, (double)); + break; + case HA_KEYTYPE_LONG_INT: + RT_VOL_KORR(int32, mi_sint4korr, 4, (double)); + break; + case HA_KEYTYPE_ULONG_INT: + RT_VOL_KORR(uint32, mi_uint4korr, 4, (double)); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_VOL_KORR(longlong, mi_sint8korr, 8, (double)); + break; + case HA_KEYTYPE_ULONGLONG: + RT_VOL_KORR(longlong, mi_sint8korr, 8, ulonglong2double); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_VOL_GET(float, mi_float4get, 4, (double)); + break; + case HA_KEYTYPE_DOUBLE: + RT_VOL_GET(double, mi_float8get, 8, (double)); + break; + case HA_KEYTYPE_END: + key_length= 0; + break; + default: + return -1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + } + return res; +} + +#define RT_D_MBR_KORR(type, korr_func, len, cast) \ +{ \ + type amin, amax; \ + amin= korr_func(a); \ + amax= korr_func(a+len); \ + *res++= cast(amin); \ + *res++= cast(amax); \ +} + +#define 
RT_D_MBR_GET(type, get_func, len, cast) \ +{ \ + type amin, amax; \ + get_func(amin, a); \ + get_func(amax, a+len); \ + *res++= cast(amin); \ + *res++= cast(amax); \ +} + + +/* + Creates an MBR as an array of doubles. + Fills *res. +*/ + +int maria_rtree_d_mbr(const HA_KEYSEG *keyseg, const uchar *a, + uint key_length, double *res) +{ + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_D_MBR_KORR(int8, mi_sint1korr, 1, (double)); + break; + case HA_KEYTYPE_BINARY: + RT_D_MBR_KORR(uint8, mi_uint1korr, 1, (double)); + break; + case HA_KEYTYPE_SHORT_INT: + RT_D_MBR_KORR(int16, mi_sint2korr, 2, (double)); + break; + case HA_KEYTYPE_USHORT_INT: + RT_D_MBR_KORR(uint16, mi_uint2korr, 2, (double)); + break; + case HA_KEYTYPE_INT24: + RT_D_MBR_KORR(int32, mi_sint3korr, 3, (double)); + break; + case HA_KEYTYPE_UINT24: + RT_D_MBR_KORR(uint32, mi_uint3korr, 3, (double)); + break; + case HA_KEYTYPE_LONG_INT: + RT_D_MBR_KORR(int32, mi_sint4korr, 4, (double)); + break; + case HA_KEYTYPE_ULONG_INT: + RT_D_MBR_KORR(uint32, mi_uint4korr, 4, (double)); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_D_MBR_KORR(longlong, mi_sint8korr, 8, (double)); + break; + case HA_KEYTYPE_ULONGLONG: + RT_D_MBR_KORR(longlong, mi_sint8korr, 8, ulonglong2double); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_D_MBR_GET(float, mi_float4get, 4, (double)); + break; + case HA_KEYTYPE_DOUBLE: + RT_D_MBR_GET(double, mi_float8get, 8, (double)); + break; + case HA_KEYTYPE_END: + key_length= 0; + break; + default: + return 1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + } + return 0; +} + +#define RT_COMB_KORR(type, korr_func, store_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin= korr_func(a); \ + bmin= korr_func(b); \ + amax= korr_func(a+len); \ + bmax= korr_func(b+len); \ + amin= min(amin, bmin); \ + amax= max(amax, bmax); \ + 
store_func(c, amin); \ + store_func(c+len, amax); \ +} + +#define RT_COMB_GET(type, get_func, store_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + amin= min(amin, bmin); \ + amax= max(amax, bmax); \ + store_func(c, amin); \ + store_func(c+len, amax); \ +} + +/* + Creates common minimal bounding rectangle + for two input rectangles a and b + Result is written to c + + RETURN + 0 OK + 1 Unknown key segment type +*/ + +int maria_rtree_combine_rect(const HA_KEYSEG *keyseg, const uchar* a, + const uchar* b, uchar* c, + uint key_length) +{ + for ( ; (int) key_length > 0 ; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_COMB_KORR(int8, mi_sint1korr, mi_int1store, 1); + break; + case HA_KEYTYPE_BINARY: + RT_COMB_KORR(uint8, mi_uint1korr, mi_int1store, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_COMB_KORR(int16, mi_sint2korr, mi_int2store, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_COMB_KORR(uint16, mi_uint2korr, mi_int2store, 2); + break; + case HA_KEYTYPE_INT24: + RT_COMB_KORR(int32, mi_sint3korr, mi_int3store, 3); + break; + case HA_KEYTYPE_UINT24: + RT_COMB_KORR(uint32, mi_uint3korr, mi_int3store, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_COMB_KORR(int32, mi_sint4korr, mi_int4store, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_COMB_KORR(uint32, mi_uint4korr, mi_int4store, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_COMB_KORR(longlong, mi_sint8korr, mi_int8store, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_COMB_KORR(ulonglong, mi_uint8korr, mi_int8store, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_COMB_GET(float, mi_float4get, mi_float4store, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_COMB_GET(double, mi_float8get, mi_float8store, 8); + break; + case HA_KEYTYPE_END: + return 0; + default: + return 1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + 
b+= keyseg_length; + c+= keyseg_length; + } + return 0; +} + + +#define RT_OVL_AREA_KORR(type, korr_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin= korr_func(a); \ + bmin= korr_func(b); \ + amax= korr_func(a+len); \ + bmax= korr_func(b+len); \ + amin= max(amin, bmin); \ + amax= min(amax, bmax); \ + if (amin >= amax) \ + return 0; \ + res *= amax - amin; \ +} + +#define RT_OVL_AREA_GET(type, get_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + amin= max(amin, bmin); \ + amax= min(amax, bmax); \ + if (amin >= amax) \ + return 0; \ + res *= amax - amin; \ +} + +/* +Calculates overlapping area of two MBRs a & b +*/ +double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar* a, uchar* b, + uint key_length) +{ + double res= 1; + for (; (int) key_length > 0 ; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_OVL_AREA_KORR(int8, mi_sint1korr, 1); + break; + case HA_KEYTYPE_BINARY: + RT_OVL_AREA_KORR(uint8, mi_uint1korr, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_OVL_AREA_KORR(int16, mi_sint2korr, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_OVL_AREA_KORR(uint16, mi_uint2korr, 2); + break; + case HA_KEYTYPE_INT24: + RT_OVL_AREA_KORR(int32, mi_sint3korr, 3); + break; + case HA_KEYTYPE_UINT24: + RT_OVL_AREA_KORR(uint32, mi_uint3korr, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_OVL_AREA_KORR(int32, mi_sint4korr, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_OVL_AREA_KORR(uint32, mi_uint4korr, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_OVL_AREA_GET(float, mi_float4get, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_OVL_AREA_GET(double, mi_float8get, 8); + break; + case HA_KEYTYPE_END: + 
return res; + default: + return -1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } + return res; +} + +#define RT_AREA_INC_KORR(type, korr_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin= korr_func(a); \ + bmin= korr_func(b); \ + amax= korr_func(a+len); \ + bmax= korr_func(b+len); \ + a_area *= (((double)amax) - ((double)amin)); \ + loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ +} + +#define RT_AREA_INC_GET(type, get_func, len)\ +{\ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + a_area *= (((double)amax) - ((double)amin)); \ + loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ +} + +/* + Calculates MBR_AREA(a+b) - MBR_AREA(a) + Fills *ab_area. + Note: when 'a' and 'b' objects are far from each other, + the area increase can be really big, so this function + can return 'inf' as a result. 
+*/ + +double maria_rtree_area_increase(const HA_KEYSEG *keyseg, const uchar *a, + const uchar *b, + uint key_length, double *ab_area) +{ + double a_area= 1.0; + double loc_ab_area= 1.0; + + *ab_area= 1.0; + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + + if (keyseg->null_bit) /* Handle NULL part */ + return -1; + + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_AREA_INC_KORR(int8, mi_sint1korr, 1); + break; + case HA_KEYTYPE_BINARY: + RT_AREA_INC_KORR(uint8, mi_uint1korr, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_AREA_INC_KORR(int16, mi_sint2korr, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_AREA_INC_KORR(uint16, mi_uint2korr, 2); + break; + case HA_KEYTYPE_INT24: + RT_AREA_INC_KORR(int32, mi_sint3korr, 3); + break; + case HA_KEYTYPE_UINT24: + RT_AREA_INC_KORR(int32, mi_uint3korr, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_AREA_INC_KORR(int32, mi_sint4korr, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_AREA_INC_KORR(uint32, mi_uint4korr, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_AREA_INC_KORR(longlong, mi_sint8korr, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_AREA_INC_KORR(longlong, mi_sint8korr, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_AREA_INC_GET(float, mi_float4get, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_AREA_INC_GET(double, mi_float8get, 8); + break; + case HA_KEYTYPE_END: + goto safe_end; + default: + return -1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } +safe_end: + *ab_area= loc_ab_area; + return loc_ab_area - a_area; +} + +#define RT_PERIM_INC_KORR(type, korr_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin= korr_func(a); \ + bmin= korr_func(b); \ + amax= korr_func(a+len); \ + bmax= korr_func(b+len); \ + a_perim+= (((double)amax) - ((double)amin)); \ + *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ +} + +#define RT_PERIM_INC_GET(type, get_func, 
len)\ +{\ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + a_perim+= (((double)amax) - ((double)amin)); \ + *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ +} + +/* +Calculates MBR_PERIMETER(a+b) - MBR_PERIMETER(a) +*/ +double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b, + uint key_length, double *ab_perim) +{ + double a_perim= 0.0; + + *ab_perim= 0.0; + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + + if (keyseg->null_bit) /* Handle NULL part */ + return -1; + + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_PERIM_INC_KORR(int8, mi_sint1korr, 1); + break; + case HA_KEYTYPE_BINARY: + RT_PERIM_INC_KORR(uint8, mi_uint1korr, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_PERIM_INC_KORR(int16, mi_sint2korr, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_PERIM_INC_KORR(uint16, mi_uint2korr, 2); + break; + case HA_KEYTYPE_INT24: + RT_PERIM_INC_KORR(int32, mi_sint3korr, 3); + break; + case HA_KEYTYPE_UINT24: + RT_PERIM_INC_KORR(int32, mi_uint3korr, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_PERIM_INC_KORR(int32, mi_sint4korr, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_PERIM_INC_KORR(uint32, mi_uint4korr, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_PERIM_INC_KORR(longlong, mi_sint8korr, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_PERIM_INC_KORR(longlong, mi_sint8korr, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_PERIM_INC_GET(float, mi_float4get, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_PERIM_INC_GET(double, mi_float8get, 8); + break; + case HA_KEYTYPE_END: + return *ab_perim - a_perim; + default: + return -1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } + return *ab_perim - a_perim; +} + + +#define RT_PAGE_MBR_KORR(share, type, korr_func, store_func, len, to) \ +{ \ + type 
/*
  Per-dimension step of maria_rtree_page_mbr() for integer key types.

  Expands in the body of that function and relies on its locals k, k_len,
  inc, nod_flag and last in addition to the macro arguments:
   - seeds amin/amax from the key k currently points at, reading the lower
     bound at k + inc and the upper bound at k + inc + len;
   - steps k with rt_PAGE_NEXT_KEY() while k < last, lowering amin and
     raising amax from each key visited;
   - stores amin then amax through store_func, advancing 'to' by len after
     each store;
   - advances inc by 2 * len so the next expansion reads the next dimension.
  k is left at the end of the page; the caller must reset it before the next
  expansion.
*/
#define RT_PAGE_MBR_KORR(share, type, korr_func, store_func, len, to) \
{ \
  type amin, amax, bmin, bmax; \
  amin= korr_func(k + inc); \
  amax= korr_func(k + inc + len); \
  k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag); \
  for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag)) \
{ \
    bmin= korr_func(k + inc); \
    bmax= korr_func(k + inc + len); \
    if (amin > bmin) \
      amin= bmin; \
    if (amax < bmax) \
      amax= bmax; \
} \
  store_func(to, amin); \
  to+= len; \
  store_func(to, amax); \
  to += len; \
  inc += 2 * len; \
}

/*
  Same as RT_PAGE_MBR_KORR for key types whose accessor stores the value
  through its first argument (get_func(dst, src)) instead of returning it.
  Uses the same caller locals (k, k_len, inc, nod_flag, last).
*/
#define RT_PAGE_MBR_GET(share, type, get_func, store_func, len, to) \
{ \
  type amin, amax, bmin, bmax; \
  get_func(amin, k + inc); \
  get_func(amax, k + inc + len); \
  k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag); \
  for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag)) \
{ \
    get_func(bmin, k + inc); \
    get_func(bmax, k + inc + len); \
    if (amin > bmin) \
      amin= bmin; \
    if (amax < bmax) \
      amax= bmax; \
} \
  store_func(to, amin); \
  to+= len; \
  store_func(to, amax); \
  to+= len; \
  inc += 2 * len; \
}
/*
  Calculates key page total MBR= MBR(key1) + MBR(key2) + ...
  Stores into *to.

  The work is done one dimension at a time.  For each dimension k is reset to
  the first key of the page and one of the RT_PAGE_MBR_KORR / RT_PAGE_MBR_GET
  macros scans every key up to rt_PAGE_END(page), writes the smallest lower
  bound and the largest upper bound of that dimension to 'to', and advances
  both 'to' and 'inc'.  The macros reference the locals k, k_len, inc,
  nod_flag and last by name, so those names must not be changed here.

  @param keyseg      key segment descriptors, two per dimension
  @param page        key page whose keys are combined
  @param to          output buffer receiving the packed MBR
  @param key_length  length of the MBR part of a key, in bytes

  @return 0 on success (also when HA_KEYTYPE_END is met), 1 if a segment has
          null_bit set or an unsupported key type
*/
int maria_rtree_page_mbr(const HA_KEYSEG *keyseg,
                         MARIA_PAGE *page,
                         uchar *to, uint key_length)
{
  MARIA_HA *info= page->info;
  MARIA_SHARE *share= info->s;
  uint inc= 0;                       /* byte offset of current dimension */
  uint k_len= key_length;            /* full length, kept for key stepping */
  uint nod_flag= page->node;
  const uchar *k;
  const uchar *last= rt_PAGE_END(page);

  for (; (int)key_length > 0; keyseg += 2)
  {
    /* remaining length is decreased up front; k_len keeps the original */
    key_length -= keyseg->length * 2;

    /* Handle NULL part */
    if (keyseg->null_bit)
    {
      return 1;
    }

    /* every dimension rescans the page from its first key */
    k= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag);

    switch ((enum ha_base_keytype) keyseg->type) {
    case HA_KEYTYPE_INT8:
      RT_PAGE_MBR_KORR(share, int8, mi_sint1korr, mi_int1store, 1, to);
      break;
    case HA_KEYTYPE_BINARY:
      RT_PAGE_MBR_KORR(share, uint8, mi_uint1korr, mi_int1store, 1, to);
      break;
    case HA_KEYTYPE_SHORT_INT:
      RT_PAGE_MBR_KORR(share, int16, mi_sint2korr, mi_int2store, 2, to);
      break;
    case HA_KEYTYPE_USHORT_INT:
      RT_PAGE_MBR_KORR(share, uint16, mi_uint2korr, mi_int2store, 2, to);
      break;
    case HA_KEYTYPE_INT24:
      RT_PAGE_MBR_KORR(share, int32, mi_sint3korr, mi_int3store, 3, to);
      break;
    case HA_KEYTYPE_UINT24:
      RT_PAGE_MBR_KORR(share, uint32, mi_uint3korr, mi_int3store, 3, to);
      break;
    case HA_KEYTYPE_LONG_INT:
      RT_PAGE_MBR_KORR(share, int32, mi_sint4korr, mi_int4store, 4, to);
      break;
    case HA_KEYTYPE_ULONG_INT:
      RT_PAGE_MBR_KORR(share, uint32, mi_uint4korr, mi_int4store, 4, to);
      break;
#ifdef HAVE_LONG_LONG
    case HA_KEYTYPE_LONGLONG:
      RT_PAGE_MBR_KORR(share, longlong, mi_sint8korr, mi_int8store, 8, to);
      break;
    case HA_KEYTYPE_ULONGLONG:
      RT_PAGE_MBR_KORR(share, ulonglong, mi_uint8korr, mi_int8store, 8, to);
      break;
#endif
    case HA_KEYTYPE_FLOAT:
      RT_PAGE_MBR_GET(share, float, mi_float4get, mi_float4store, 4, to);
      break;
    case HA_KEYTYPE_DOUBLE:
      RT_PAGE_MBR_GET(share, double, mi_float8get, mi_float8store, 8, to);
      break;
    case HA_KEYTYPE_END:
      return 0;
    default:
      return 1;
    }
  }
  return 0;
}

#endif /*HAVE_RTREE_KEYS*/
/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
   & TCX DataKonsult AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

/*
  Prototypes of the minimum-bounding-rectangle (MBR) helpers used by the
  R-tree index code.  Everything is compiled only when HAVE_RTREE_KEYS is
  defined.
*/

#ifndef _rt_mbr_h
#define _rt_mbr_h

#ifdef HAVE_RTREE_KEYS

/*
  Compares the packed MBR keys a and b.
  NOTE(review): nextflag presumably selects the spatial relation to test;
  the definition is not part of this header -- confirm in ma_rt_mbr.c.
*/
int maria_rtree_key_cmp(HA_KEYSEG *keyseg, const uchar *a, const uchar *b,
                        uint key_length, uint32 nextflag);
/*
  Combines two packed rectangles into the output buffer.
  NOTE(review): semantics inferred from the name only -- confirm.
*/
int maria_rtree_combine_rect(const HA_KEYSEG *keyseg,
                             const uchar *, const uchar *, uchar*,
                             uint key_length);
/* NOTE(review): presumably returns the volume of a packed MBR -- confirm. */
double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar*, uint key_length);
/*
  Fills 'res' with the coordinates of packed MBR 'a' as doubles; the page
  split code passes an array of 2 doubles per dimension.
*/
int maria_rtree_d_mbr(const HA_KEYSEG *keyseg, const uchar *a,
                      uint key_length, double *res);
/* NOTE(review): presumably the area shared by a and b -- confirm. */
double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar *a, uchar *b,
                                    uint key_length);
/*
  Area growth of a when extended to cover b; the joined value is returned
  through ab_area.  Returns -1 for NULL-able or unsupported key segments.
*/
double maria_rtree_area_increase(const HA_KEYSEG *keyseg, const uchar *a,
                                 const uchar *b,
                                 uint key_length, double *ab_area);
/*
  Calculates MBR_PERIMETER(a+b) - MBR_PERIMETER(a); the joined value is
  returned through ab_perim.  Returns -1 for NULL-able or unsupported key
  segments.
*/
double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b,
                                      uint key_length, double *ab_perim);
/*
  Stores into 'key' the MBR covering all keys of 'page'.  Returns 0 on
  success, 1 for NULL-able or unsupported key segments.
*/
int maria_rtree_page_mbr(const HA_KEYSEG *keyseg, MARIA_PAGE *page,
                         uchar *key, uint key_length);
#endif /*HAVE_RTREE_KEYS*/
#endif /* _rt_mbr_h */
/*
  Hands out the next block of 2 * n_dim doubles from the bump buffer
  *d_buffer and advances the buffer cursor past it.

  @return pointer to the reserved coordinates (the old cursor position)
*/
inline static double *reserve_coords(double **d_buffer, int n_dim)
{
  double *coords= *d_buffer;
  (*d_buffer)+= n_dim * 2;
  return coords;
}

/*
  Enlarges rectangle a in place so that it also covers rectangle b.

  Both rectangles hold n_dim (min, max) pairs.  A non-positive n_dim leaves
  a untouched; the previous do/while form processed one pair unconditionally
  and then never met its end pointer.
*/
static void mbr_join(double *a, const double *b, int n_dim)
{
  int dim;

  for (dim= 0; dim < n_dim; dim++, a+= 2, b+= 2)
  {
    if (a[0] > b[0])
      a[0]= b[0];

    if (a[1] < b[1])
      a[1]= b[1];
  }
}
/*
  Counts the square of mbr which is a join of a and b

  Both rectangles hold n_dim (min, max) pairs.  Returns 1.0 (the empty
  product) for a non-positive n_dim instead of reading past the arrays, which
  the previous do/while form did.
*/
static double mbr_join_square(const double *a, const double *b, int n_dim)
{
  double square= 1.0;
  int dim;

  for (dim= 0; dim < n_dim; dim++, a+= 2, b+= 2)
  {
    square *=
      ((a[1] < b[1]) ? b[1] : a[1]) - ((a[0] > b[0]) ? b[0] : a[0]);
  }

  return square;
}

/*
  Product of the extents (max - min) of rectangle a over n_dim dimensions.
  Returns 1.0 for a non-positive n_dim.
*/
static double count_square(const double *a, int n_dim)
{
  double square= 1.0;
  int dim;

  for (dim= 0; dim < n_dim; dim++, a+= 2)
    square *= a[1] - a[0];
  return square;
}

/*
  Copies the n_dim (min, max) pairs of src to dst.  Nothing is copied for a
  non-positive n_dim, which would otherwise convert to a huge memcpy() size.
*/
inline static void copy_coords(double *dst, const double *src, int n_dim)
{
  if (n_dim > 0)
    memcpy(dst, src, sizeof(double) * ((size_t) n_dim * 2));
}
{ + if (cur->n_node) + { + continue; + } + cur->n_node= n_group; + } +} + +static int split_maria_rtree_node(SplitStruct *node, int n_entries, + int all_size, /* Total key's size */ + int key_size, + int min_size, /* Minimal group size */ + int size1, int size2 /* initial group sizes */, + double **d_buffer, int n_dim) +{ + SplitStruct *cur; + SplitStruct *a; + SplitStruct *b; + double *g1= reserve_coords(d_buffer, n_dim); + double *g2= reserve_coords(d_buffer, n_dim); + SplitStruct *next; + int next_node; + int i; + SplitStruct *end= node + n_entries; + LINT_INIT(a); + LINT_INIT(b); + LINT_INIT(next); + LINT_INIT(next_node); + + if (all_size < min_size * 2) + { + return 1; + } + + cur= node; + for (; cur < end; cur++) + { + cur->square= count_square(cur->coords, n_dim); + cur->n_node= 0; + } + + pick_seeds(node, n_entries, &a, &b, n_dim); + a->n_node= 1; + b->n_node= 2; + + + copy_coords(g1, a->coords, n_dim); + size1+= key_size; + copy_coords(g2, b->coords, n_dim); + size2+= key_size; + + + for (i=n_entries - 2; i>0; --i) + { + if (all_size - (size2 + key_size) < min_size) /* Can't write into group 2 */ + { + mark_all_entries(node, n_entries, 1); + break; + } + + if (all_size - (size1 + key_size) < min_size) /* Can't write into group 1 */ + { + mark_all_entries(node, n_entries, 2); + break; + } + + pick_next(node, n_entries, g1, g2, &next, &next_node, n_dim); + if (next_node == 1) + { + size1+= key_size; + mbr_join(g1, next->coords, n_dim); + } + else + { + size2+= key_size; + mbr_join(g2, next->coords, n_dim); + } + next->n_node= next_node; + } + + return 0; +} + + +/** + Logs key reorganization done in a split page (new page is logged elsewhere). 
+ + The effect of a split on the split page is three changes: + - some piece of the page move to different places inside this page (we are + not interested here in the pieces which move to the new page) + - the key is inserted into the page or not (could be in the new page) + - page is shrunk + All this is uniquely determined by a few parameters: + - the key (starting at 'key-nod_flag', for 'full_length' bytes + (maria_rtree_split_page() seems to depend on its parameters key&key_length + but in fact it reads more (to the left: nod_flag, and to the right: + full_length) + - the binary content of the page + - some variables in the share + - double arithmetic, which is unpredictable from machine to machine and + from build to build (see pick_seeds() above: it has a comparison between + double-s 'if (d > max_d)' so the comparison can go differently from machine + to machine or build to build, it has happened in real life). + If one day we use precision-math instead of double-math, in GIS, then the + last parameter would become constant accross machines and builds and we + could some cheap logging: just log the few parameters above. + Until then, we log the list of memcpy() operations (fortunately, we often do + not have to log the source bytes, as they can be found in the page before + applying the REDO; the only source bytes to log are the key), the key if it + was inserted into this page, and the shrinking. 
+ + @param info table + @param page page's offset in the file + @param buff content of the page (post-split) + @param key_with_nod_flag pointer to key-nod_flag + @param full_length length of (key + (nod_flag (if node) or rowid (if + leaf))) + @param log_internal_copy encoded list of mempcy() operations done on + split page, having their source in the page + @param log_internal_copy_length length of above list, in bytes + @param log_key_copy operation describing the key's copy, or NULL if the + inserted key was not put into the page (was put in + new page, so does not have to be logged here) + @param length_diff by how much the page has shrunk during split +*/ + +static my_bool _ma_log_rt_split(MARIA_PAGE *page, + const uchar *key_with_nod_flag, + uint full_length, + const uchar *log_internal_copy, + uint log_internal_copy_length, + const uchar *log_key_copy, + uint length_diff) +{ + MARIA_HA *info= page->info; + MARIA_SHARE *share= info->s; + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 1 + 2 + 1 + 2 + 2 + 7], + *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6]; + uint translog_parts, extra_length= 0; + my_off_t page_pos; + DBUG_ENTER("_ma_log_rt_split"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page)); + + DBUG_ASSERT(share->now_transactional); + page_pos= page->pos / share->block_size; + page_store(log_data + FILEID_STORE_SIZE, page_pos); + log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE; + log_pos[0]= KEY_OP_DEL_SUFFIX; + log_pos++; + DBUG_ASSERT((int)length_diff > 0); + int2store(log_pos, length_diff); + log_pos+= 2; + log_pos[0]= KEY_OP_MULTI_COPY; + log_pos++; + int2store(log_pos, full_length); + log_pos+= 2; + int2store(log_pos, log_internal_copy_length); + log_pos+= 2; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data) - 7; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= log_internal_copy; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= 
log_internal_copy_length; + translog_parts= 2; + if (log_key_copy != NULL) /* need to store key into record */ + { + log_array[TRANSLOG_INTERNAL_PARTS + 2].str= log_key_copy; + log_array[TRANSLOG_INTERNAL_PARTS + 2].length= 1 + 2 + 1 + 2; + log_array[TRANSLOG_INTERNAL_PARTS + 3].str= key_with_nod_flag; + log_array[TRANSLOG_INTERNAL_PARTS + 3].length= full_length; + extra_length= 1 + 2 + 1 + 2 + full_length; + translog_parts+= 2; + } + + _ma_log_key_changes(page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entires for same page */ + page->org_size= page->size; + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) ((log_pos - log_data) + + log_internal_copy_length + + extra_length), + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + +/** + 0 ok; the created page is put into page cache; the shortened one is not (up + to the caller to do it) + 1 or -1: error. + If new_page_offs==NULL, won't create new page (for redo phase). +*/ + +int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page, + my_off_t *new_page_offs) +{ + MARIA_HA *info= page->info; + MARIA_SHARE *share= info->s; + const my_bool transactional= share->now_transactional; + int n1, n2; /* Number of items in groups */ + SplitStruct *task; + SplitStruct *cur; + SplitStruct *stop; + double *coord_buf; + double *next_coord; + double *old_coord; + int n_dim; + uchar *source_cur, *cur1, *cur2; + uchar *new_page_buff, *log_internal_copy, *log_internal_copy_ptr, + *log_key_copy= NULL; + int err_code= 0; + uint new_page_length; + uint nod_flag= page->node; + uint org_length= page->size; + uint full_length= key->data_length + (nod_flag ? 
nod_flag : + key->ref_length); + uint key_data_length= key->data_length; + int max_keys= ((org_length - share->keypage_header) / (full_length)); + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + MARIA_KEYDEF *keyinfo= key->keyinfo; + DBUG_ENTER("maria_rtree_split_page"); + DBUG_PRINT("rtree", ("splitting block")); + + n_dim= keyinfo->keysegs / 2; + + if (!(coord_buf= (double*) my_alloca(n_dim * 2 * sizeof(double) * + (max_keys + 1 + 4) + + sizeof(SplitStruct) * (max_keys + 1)))) + DBUG_RETURN(-1); /* purecov: inspected */ + + task= (SplitStruct *)(coord_buf + n_dim * 2 * (max_keys + 1 + 4)); + + next_coord= coord_buf; + + stop= task + max_keys; + source_cur= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag); + + for (cur= task; + cur < stop; + cur++, source_cur= rt_PAGE_NEXT_KEY(share, source_cur, key_data_length, + nod_flag)) + { + cur->coords= reserve_coords(&next_coord, n_dim); + cur->key= source_cur; + maria_rtree_d_mbr(keyinfo->seg, source_cur, key_data_length, cur->coords); + } + + cur->coords= reserve_coords(&next_coord, n_dim); + maria_rtree_d_mbr(keyinfo->seg, key->data, key_data_length, cur->coords); + cur->key= key->data; + + old_coord= next_coord; + + if (split_maria_rtree_node(task, max_keys + 1, + page->size + full_length + 2, + full_length, + rt_PAGE_MIN_SIZE(keyinfo->block_length), + 2, 2, &next_coord, n_dim)) + { + err_code= 1; + goto split_err; + } + + /* Allocate buffer for new page and piece of log record */ + if (!(new_page_buff= (uchar*) my_alloca((uint)keyinfo->block_length + + (transactional ? 
+ (max_keys * (2 + 2) + + 1 + 2 + 1 + 2) : 0)))) + { + err_code= -1; + goto split_err; + } + log_internal_copy= log_internal_copy_ptr= new_page_buff + + keyinfo->block_length; + bzero(new_page_buff, share->block_size); + + stop= task + (max_keys + 1); + cur1= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag); + cur2= rt_PAGE_FIRST_KEY(share, new_page_buff, nod_flag); + + n1= n2= 0; + for (cur= task; cur < stop; cur++) + { + uchar *to; + const uchar *cur_key= cur->key; + my_bool log_this_change; + DBUG_ASSERT(log_key_copy == NULL); + if (cur->n_node == 1) + { + to= cur1; + cur1= rt_PAGE_NEXT_KEY(share, cur1, key_data_length, nod_flag); + n1++; + log_this_change= transactional; + } + else + { + to= cur2; + cur2= rt_PAGE_NEXT_KEY(share, cur2, key_data_length, nod_flag); + n2++; + log_this_change= FALSE; + } + if (to != cur_key) + { + uchar *to_with_nod_flag= to - nod_flag; + const uchar *cur_key_with_nod_flag= cur_key - nod_flag; + memcpy(to_with_nod_flag, cur_key_with_nod_flag, full_length); + if (log_this_change) + { + uint to_with_nod_flag_offs= to_with_nod_flag - page->buff; + if (likely(cur_key != key->data)) + { + /* this memcpy() is internal to the page (source in the page) */ + uint cur_key_with_nod_flag_offs= cur_key_with_nod_flag - page->buff; + int2store(log_internal_copy_ptr, to_with_nod_flag_offs); + log_internal_copy_ptr+= 2; + int2store(log_internal_copy_ptr, cur_key_with_nod_flag_offs); + log_internal_copy_ptr+= 2; + } + else + { + /* last iteration, and this involves *key: source is external */ + log_key_copy= log_internal_copy_ptr; + log_key_copy[0]= KEY_OP_OFFSET; + int2store(log_key_copy + 1, to_with_nod_flag_offs); + log_key_copy[3]= KEY_OP_CHANGE; + int2store(log_key_copy + 4, full_length); + /* _ma_log_rt_split() will store *key, right after */ + } + } + } + } + { /* verify that above loop didn't touch header bytes */ + uint i; + for (i= 0; i < share->keypage_header; i++) + DBUG_ASSERT(new_page_buff[i]==0); + } + + if (nod_flag) + 
_ma_store_keypage_flag(share, new_page_buff, KEYPAGE_FLAG_ISNOD); + _ma_store_keynr(share, new_page_buff, keyinfo->key_nr); + new_page_length= share->keypage_header + n2 * full_length; + _ma_store_page_used(share, new_page_buff, new_page_length); + page->size= share->keypage_header + n1 * full_length; + page_store_size(share, page); + + if ((*new_page_offs= _ma_new(info, DFLT_INIT_HITS, &page_link)) == + HA_OFFSET_ERROR) + err_code= -1; + else + { + MARIA_PAGE new_page; + _ma_page_setup(&new_page, info, keyinfo, *new_page_offs, new_page_buff); + + if (transactional && + ( /* log change to split page */ + _ma_log_rt_split(page, key->data - nod_flag, + full_length, log_internal_copy, + log_internal_copy_ptr - log_internal_copy, + log_key_copy, org_length - page->size) || + /* and to new page */ + _ma_log_new(&new_page, 0))) + err_code= -1; + + if (_ma_write_keypage(&new_page, page_link->write_lock, + DFLT_INIT_HITS)) + err_code= -1; + } + DBUG_PRINT("rtree", ("split new block: %lu", (ulong) *new_page_offs)); + + my_afree(new_page); + +split_err: + my_afree(coord_buf); + DBUG_RETURN(err_code); +} + +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_rt_test.c b/storage/maria/ma_rt_test.c new file mode 100644 index 00000000000..af54e6b27be --- /dev/null +++ b/storage/maria/ma_rt_test.c @@ -0,0 +1,692 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Testing of the basic functions of a MARIA rtree table */ +/* Written by Alex Barkov who has a shared copyright to this code */ + + +#include "maria_def.h" +#include "ma_control_file.h" +#include "ma_loghandler.h" +#include "ma_checkpoint.h" +#include "trnman.h" +#include <my_getopt.h> + +#ifdef HAVE_RTREE_KEYS + +#include "ma_rt_index.h" + +#define MAX_REC_LENGTH 1024 +#define ndims 2 +#define KEYALG HA_KEY_ALG_RTREE + +static int read_with_pos(MARIA_HA * file); +static void create_record(uchar *record,uint rownr); +static void create_record1(uchar *record,uint rownr); +static void print_record(uchar * record,my_off_t offs,const char * tail); +static int run_test(const char *filename); +static void get_options(int argc, char *argv[]); +static void usage(); + +static double rt_data[]= +{ + /*1*/ 0,10,0,10, + /*2*/ 5,15,0,10, + /*3*/ 0,10,5,15, + /*4*/ 10,20,10,20, + /*5*/ 0,10,0,10, + /*6*/ 5,15,0,10, + /*7*/ 0,10,5,15, + /*8*/ 10,20,10,20, + /*9*/ 0,10,0,10, + /*10*/ 5,15,0,10, + /*11*/ 0,10,5,15, + /*12*/ 10,20,10,20, + /*13*/ 0,10,0,10, + /*14*/ 5,15,0,10, + /*15*/ 0,10,5,15, + /*16*/ 10,20,10,20, + /*17*/ 5,15,0,10, + /*18*/ 0,10,5,15, + /*19*/ 10,20,10,20, + /*20*/ 0,10,0,10, + + /*1*/ 100,110,0,10, + /*2*/ 105,115,0,10, + /*3*/ 100,110,5,15, + /*4*/ 110,120,10,20, + /*5*/ 100,110,0,10, + /*6*/ 105,115,0,10, + /*7*/ 100,110,5,15, + /*8*/ 110,120,10,20, + /*9*/ 100,110,0,10, + /*10*/ 105,115,0,10, + /*11*/ 100,110,5,15, + /*12*/ 110,120,10,20, + /*13*/ 100,110,0,10, + /*14*/ 105,115,0,10, + /*15*/ 100,110,5,15, + /*16*/ 110,120,10,20, + /*17*/ 105,115,0,10, + /*18*/ 100,110,5,15, + /*19*/ 110,120,10,20, + /*20*/ 100,110,0,10, + -1 +}; + +static int testflag, checkpoint, create_flag; +static my_bool silent, transactional, die_in_middle_of_transaction, 
+ opt_versioning; +static enum data_file_type record_type= DYNAMIC_RECORD; + +int main(int argc, char *argv[]) +{ + MY_INIT(argv[0]); + get_options(argc, argv); + maria_data_root= (char *)"."; + /* Maria requires that we always have a page cache */ + if (maria_init() || + (init_pagecache(maria_pagecache, maria_block_size * 16, 0, 0, + maria_block_size, MY_WME) == 0) || + ma_control_file_open(TRUE, TRUE) || + (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, MY_WME) == 0) || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS, 0) || + (transactional && (trnman_init(0) || ma_checkpoint_init(0)))) + { + fprintf(stderr, "Error in initialization\n"); + exit(1); + } + + exit(run_test("rt_test")); +} + + +static int run_test(const char *filename) +{ + MARIA_HA *file; + MARIA_UNIQUEDEF uniquedef; + MARIA_CREATE_INFO create_info; + MARIA_COLUMNDEF recinfo[20]; + MARIA_KEYDEF keyinfo[20]; + HA_KEYSEG keyseg[20]; + key_range range; + + int opt_unique=0; + int key_type=HA_KEYTYPE_DOUBLE; + int key_length=8; + int null_fields=0; + int nrecords=sizeof(rt_data)/(sizeof(double)*4);/* 40 */ + int rec_length=0; + int uniques=0; + int i, max_i; + int error; + int row_count=0; + uchar record[MAX_REC_LENGTH]; + uchar read_record[MAX_REC_LENGTH]; + int upd= 10; + ha_rows hrows; + + bzero(&uniquedef, sizeof(uniquedef)); + bzero(&create_info, sizeof(create_info)); + bzero(recinfo, sizeof(recinfo)); + bzero(keyinfo, sizeof(keyinfo)); + bzero(keyseg, sizeof(keyseg)); + + /* Define a column for NULLs and DEL markers*/ + + recinfo[0].type=FIELD_NORMAL; + recinfo[0].length=1; /* For NULL bits */ + rec_length=1; + + /* Define 2*ndims columns for coordinates*/ + + for (i=1; i<=2*ndims ;i++) + { + recinfo[i].type=FIELD_NORMAL; + recinfo[i].length=key_length; + rec_length+=key_length; + } + + /* Define a key with 2*ndims segments */ + + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=2*ndims; + 
/*
  Runs the whole R-tree scenario on table 'filename'.

  Phases, in order: create and open a table with one R-tree key over
  2*ndims double columns; write the rows of rt_data; scan them back; look
  every row up by key; delete the first quarter; update half of the rows;
  scan again; exercise maria_rkey + maria_rnext_same, maria_rfirst +
  maria_rnext and maria_records_in_range.  The global 'testflag' stops the
  run after a given phase and 'checkpoint' requests a checkpoint at a given
  stage.

  @return 0 on success, 1 after printing a message on any error
*/
static int run_test(const char *filename)
{
  MARIA_HA        *file;
  MARIA_UNIQUEDEF   uniquedef;
  MARIA_CREATE_INFO create_info;
  MARIA_COLUMNDEF   recinfo[20];
  MARIA_KEYDEF      keyinfo[20];
  HA_KEYSEG      keyseg[20];
  key_range	range;

  int opt_unique=0;
  int key_type=HA_KEYTYPE_DOUBLE;
  int key_length=8;
  int null_fields=0;
  int nrecords=sizeof(rt_data)/(sizeof(double)*4);/* 40 */
  int rec_length=0;
  int uniques=0;
  int i, max_i;
  int error;
  int row_count=0;
  uchar record[MAX_REC_LENGTH];
  uchar read_record[MAX_REC_LENGTH];
  int upd= 10;
  ha_rows hrows;

  bzero(&uniquedef, sizeof(uniquedef));
  bzero(&create_info, sizeof(create_info));
  bzero(recinfo, sizeof(recinfo));
  bzero(keyinfo, sizeof(keyinfo));
  bzero(keyseg, sizeof(keyseg));

  /* Define a column for NULLs and DEL markers*/

  recinfo[0].type=FIELD_NORMAL;
  recinfo[0].length=1; /* For NULL bits */
  rec_length=1;

  /* Define 2*ndims columns for coordinates*/

  for (i=1; i<=2*ndims ;i++)
  {
    recinfo[i].type=FIELD_NORMAL;
    recinfo[i].length=key_length;
    rec_length+=key_length;
  }

  /* Define a key with 2*ndims segments */

  keyinfo[0].seg=keyseg;
  keyinfo[0].keysegs=2*ndims;
  keyinfo[0].flag=0;
  keyinfo[0].key_alg=KEYALG;

  for (i=0; i<2*ndims; i++)
  {
    keyinfo[0].seg[i].type= key_type;
    keyinfo[0].seg[i].flag=0; /* Things like HA_REVERSE_SORT */
    /* column i starts after the 1-byte NULL/DEL marker */
    keyinfo[0].seg[i].start= (key_length*i)+1;
    keyinfo[0].seg[i].length=key_length;
    keyinfo[0].seg[i].null_bit= null_fields ? 2 : 0;
    keyinfo[0].seg[i].null_pos=0;
    keyinfo[0].seg[i].language=default_charset_info->number;
  }

  if (!silent)
    printf("- Creating isam-file\n");

  create_info.max_rows=10000000;
  create_info.transactional= transactional;

  if (maria_create(filename,
                   record_type,
                   1, /* keys */
                   keyinfo,
                   1+2*ndims+opt_unique, /* columns */
                   recinfo,uniques,&uniquedef,&create_info,create_flag))
    goto err;

  if (!silent)
    printf("- Open isam-file\n");

  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
    goto err;
  maria_begin(file);
  if (opt_versioning)
    maria_versioning(file, 1);
  if (testflag == 1)
    goto end;
  if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
    goto err;
  if (!silent)
    printf("- Writing key:s\n");

  /* Phase: insert every rectangle of rt_data */
  for (i=0; i<nrecords; i++ )
  {
    create_record(record,i);
    error=maria_write(file,record);
    print_record(record,maria_position(file),"\n");
    if (!error)
    {
      row_count++;
    }
    else
    {
      fprintf(stderr, "maria_write: %d\n", error);
      goto err;
    }
  }

  /* Phase: sequential scan of what was written */
  if (maria_scan_init(file))
  {
    fprintf(stderr, "maria_scan_init failed\n");
    goto err;
  }
  if ((error=read_with_pos(file)))
    goto err;
  maria_scan_end(file);

  if (!silent)
    printf("- Reading rows with key\n");

  /* Phase: exact-MBR lookup of every inserted rectangle */
  for (i=0 ; i < nrecords ; i++)
  {
    my_errno=0;
    create_record(record,i);

    bzero((char*) read_record,MAX_REC_LENGTH);
    error=maria_rkey(file,read_record,0,record+1,HA_WHOLE_KEY,HA_READ_MBR_EQUAL);

    if (error && error!=HA_ERR_KEY_NOT_FOUND)
    {
      fprintf(stderr," maria_rkey: %3d errno: %3d\n",error,my_errno);
      goto err;
    }
    if (error == HA_ERR_KEY_NOT_FOUND)
    {
      print_record(record,maria_position(file)," NOT FOUND\n");
      continue;
    }
    print_record(read_record,maria_position(file),"\n");
  }

  if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
    goto err;

  if (testflag == 2)
    goto end;

  if (!silent)
    printf("- Deleting rows\n");
  if (maria_scan_init(file))
  {
    fprintf(stderr, "maria_scan_init failed\n");
    goto err;
  }

  /* Phase: delete the first nrecords/4 rows returned by the scan */
  for (i=0; i < nrecords/4; i++)
  {
    my_errno=0;
    bzero((char*) read_record,MAX_REC_LENGTH);
    error=maria_scan(file,read_record);
    if (error)
    {
      fprintf(stderr, "pos: %2d maria_rrnd: %3d errno: %3d\n", i, error,
              my_errno);
      goto err;
    }
    print_record(read_record,maria_position(file),"\n");

    error=maria_delete(file,read_record);
    if (error)
    {
      fprintf(stderr, "pos: %2d maria_delete: %3d errno: %3d\n", i, error,
              my_errno);
      goto err;
    }
  }
  maria_scan_end(file);

  if (testflag == 3)
    goto end;
  if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
    goto err;

  if (!silent)
    printf("- Updating rows with position\n");
  if (maria_scan_init(file))
  {
    fprintf(stderr, "maria_scan_init failed\n");
    goto err;
  }

  /* We are looking for nrecords - nrecords/2 non-deleted records */
  for (i=0, max_i= nrecords - nrecords/2; i < max_i ; i++)
  {
    my_errno=0;
    bzero((char*) read_record,MAX_REC_LENGTH);
    error=maria_scan(file,read_record);
    if (error)
    {
      if (error==HA_ERR_RECORD_DELETED)
      {
        if (!silent)
          printf("found deleted record\n");
        /*
          In BLOCK_RECORD format, maria_scan() never returns deleted records,
          while in DYNAMIC format it can. Don't count such record:
        */
        max_i++;
        continue;
      }
      fprintf(stderr, "pos: %2d maria_rrnd: %3d errno: %3d\n",i , error,
              my_errno);
      goto err;
    }
    print_record(read_record,maria_position(file),"");
    /* replacement row: all coordinates set to i + nrecords*upd + 10 */
    create_record1(record,i+nrecords*upd);
    if (!silent)
      printf("\t-> ");
    print_record(record,maria_position(file),"\n");
    error=maria_update(file,read_record,record);
    if (error)
    {
      fprintf(stderr, "pos: %2d maria_update: %3d errno: %3d\n",i, error,
              my_errno);
      goto err;
    }
  }

  if (testflag == 4)
    goto end;
  if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
    goto err;

  /* NOTE(review): the update scan above is not ended before this init */
  if (maria_scan_init(file))
  {
    fprintf(stderr, "maria_scan_init failed\n");
    goto err;
  }
  if ((error=read_with_pos(file)))
    goto err;
  maria_scan_end(file);

  if (!silent)
    printf("- Test maria_rkey then a sequence of maria_rnext_same\n");

  create_record(record, nrecords*4/5);
  print_record(record,0," search for\n");

  if ((error=maria_rkey(file,read_record,0,record+1,HA_WHOLE_KEY,
                        HA_READ_MBR_INTERSECT)))
  {
    fprintf(stderr, "maria_rkey: %3d errno: %3d\n",error,my_errno);
    goto err;
  }
  print_record(read_record,maria_position(file)," maria_rkey\n");
  row_count=1;

  /* fetch the remaining matches of the same search until end of file */
  for (;;)
  {
    if ((error=maria_rnext_same(file,read_record)))
    {
      if (error==HA_ERR_END_OF_FILE)
        break;
      fprintf(stderr, "maria_next: %3d errno: %3d\n",error,my_errno);
      goto err;
    }
    print_record(read_record,maria_position(file)," maria_rnext_same\n");
    row_count++;
  }
  if (!silent)
    printf(" %d rows\n",row_count);

  if (!silent)
    printf("- Test maria_rfirst then a sequence of maria_rnext\n");

  error=maria_rfirst(file,read_record,0);
  if (error)
  {
    fprintf(stderr, "maria_rfirst: %3d errno: %3d\n",error,my_errno);
    goto err;
  }
  row_count=1;
  print_record(read_record,maria_position(file)," maria_frirst\n");

  /* at most nrecords further rows are read in index order */
  for (i=0;i<nrecords;i++)
  {
    if ((error=maria_rnext(file,read_record,0)))
    {
      if (error==HA_ERR_END_OF_FILE)
        break;
      fprintf(stderr, "maria_next: %3d errno: %3d\n",error,my_errno);
      goto err;
    }
    print_record(read_record,maria_position(file)," maria_rnext\n");
    row_count++;
  }
  if (!silent)
    printf(" %d rows\n",row_count);

  if (!silent)
    printf("- Test maria_records_in_range()\n");

  create_record1(record, nrecords*4/5);
  print_record(record,0,"\n");

  range.key= record+1;
  range.length= 1000; /* Big enough */
  range.flag= HA_READ_MBR_INTERSECT;
  hrows= maria_records_in_range(file,0, &range, (key_range*) 0);
  if (!silent)
    printf(" %ld rows\n", (long) hrows);

end:
  maria_scan_end(file);
  if (die_in_middle_of_transaction)
  {
    /* see similar code in ma_test2.c for comments */
    switch (die_in_middle_of_transaction) {
    case 1:
      _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                            FLUSH_RELEASE, FLUSH_RELEASE);
      break;
    case 2:
      if (translog_flush(file->trn->undo_lsn))
        goto err;
      break;
    case 3:
      break;
    case 4:
      _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE,
                            FLUSH_RELEASE);
      if (translog_flush(file->trn->undo_lsn))
        goto err;
      break;
    }
    if (!silent)
      printf("Dying on request without maria_commit()/maria_close()\n");
    exit(0);
  }
  if (maria_commit(file))
    goto err;
  if (maria_close(file)) goto err;
  maria_end();
  my_end(MY_CHECK_ERROR);

  return 0;

err:
  fprintf(stderr, "got error: %3d when using maria-database\n",my_errno);
  return 1;           /* skip warning */
}
print_record(read_record,maria_position(file),"\n"); + } + return 0; +} + + +#ifdef NOT_USED +static void bprint_record(char * record, + my_off_t offs __attribute__((unused)), + const char * tail) +{ + int i; + char * pos; + if (silent) + return; + i=(unsigned char)record[0]; + printf("%02X ",i); + + for( pos=record+1, i=0; i<32; i++,pos++){ + int b=(unsigned char)*pos; + printf("%02X",b); + } + printf("%s",tail); +} +#endif + + +static void print_record(uchar *record, + my_off_t offs __attribute__((unused)), + const char * tail) +{ + int i; + uchar *pos; + double c; + + if (silent) + return; + printf(" rec=(%d)",(unsigned char)record[0]); + for ( pos=record+1, i=0; i<2*ndims; i++) + { + memcpy(&c,pos,sizeof(c)); + float8get(c,pos); + printf(" %.14g ",c); + pos+=sizeof(c); + } + printf("pos=%ld",(long int)offs); + printf("%s",tail); +} + + + +static void create_record1(uchar *record, uint rownr) +{ + int i; + uchar *pos; + double c=rownr+10; + + bzero((char*) record,MAX_REC_LENGTH); + record[0]=0x01; /* DEL marker */ + + for ( pos=record+1, i=0; i<2*ndims; i++) + { + memcpy(pos,&c,sizeof(c)); + float8store(pos,c); + pos+=sizeof(c); + } +} + +#ifdef NOT_USED + +static void create_record0(char *record,uint rownr) +{ + int i; + char * pos; + double c=rownr+10; + double c0=0; + + bzero((char*) record,MAX_REC_LENGTH); + record[0]=0x01; /* DEL marker */ + + for ( pos=record+1, i=0; i<ndims; i++) + { + memcpy(pos,&c0,sizeof(c0)); + float8store(pos,c0); + pos+=sizeof(c0); + memcpy(pos,&c,sizeof(c)); + float8store(pos,c); + pos+=sizeof(c); + } +} + +#endif + +static void create_record(uchar *record, uint rownr) +{ + int i; + uchar *pos; + double *data= rt_data+rownr*4; + record[0]=0x01; /* DEL marker */ + for ( pos=record+1, i=0; i<ndims*2; i++) + { + float8store(pos,data[i]); + pos+=8; + } +} + + +static struct my_option my_long_options[] = +{ + {"checkpoint", 'H', "Checkpoint at specified stage", (uchar**) &checkpoint, + (uchar**) &checkpoint, 0, GET_INT, REQUIRED_ARG, 0, 
0, 0, 0, 0, 0}, + {"checksum", 'c', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', "Undocumented", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"help", '?', "Display help and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"row-fixed-size", 'S', "Fixed size records", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"rows-in-block", 'M', "Store rows in block format", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"silent", 's', "Undocumented", + (uchar**) &silent, (uchar**) &silent, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, + 0, 0}, + {"testflag", 't', "Stop test at specified stage", (uchar**) &testflag, + (uchar**) &testflag, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"test-undo", 'A', + "Abort hard. Used for testing recovery with undo", + (uchar**) &die_in_middle_of_transaction, + (uchar**) &die_in_middle_of_transaction, + 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"transactional", 'T', + "Test in transactional mode. 
(Only works with block format)", + (uchar**) &transactional, (uchar**) &transactional, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, + {"versioning", 'C', "Use row versioning (only works with block format)", + (uchar**) &opt_versioning, (uchar**) &opt_versioning, 0, GET_BOOL, + NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument __attribute__((unused))) +{ + switch(optid) { + case 'c': + create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM; + break; + case 'M': + record_type= BLOCK_RECORD; + break; + case 'S': + record_type= STATIC_RECORD; + break; + case '#': + DBUG_PUSH(argument); + break; + case '?': + usage(); + exit(1); + } + return 0; +} + + +/* Read options */ + +static void get_options(int argc, char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(ho_error); + + return; +} /* get options */ + + +static void usage() +{ + printf("Usage: %s [options]\n\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} + +#else +int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused))) +{ + exit(0); +} +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_scan.c b/storage/maria/ma_scan.c new file mode 100644 index 00000000000..cbac463a2c8 --- /dev/null +++ b/storage/maria/ma_scan.c @@ -0,0 +1,74 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Read through all rows sequentially */ + +#include "maria_def.h" + +/* Prepare a sequential scan. Sets cur_row.nextpos to the pack header length, sets lastinx to -1, flushes the record write cache when WRITE_CACHE_USED is set and then calls the share's scan_init hook. Returns 0 on success, otherwise my_errno. */ int maria_scan_init(register MARIA_HA *info) +{ + DBUG_ENTER("maria_scan_init"); + + info->cur_row.nextpos= info->s->pack.header_length; /* Read first record */ + info->lastinx= -1; /* Can't forward or backward */ + if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache)) + DBUG_RETURN(my_errno); + + if ((*info->s->scan_init)(info)) + DBUG_RETURN(my_errno); + DBUG_RETURN(0); +} + +/* + Read the next row of a sequential scan. + + SYNOPSIS + maria_scan() + info Maria handler + record Read data here + + NOTES + The row is read through the share's scan hook, starting at + info->cur_row.nextpos. + + RETURN + 0 ok + HA_ERR_END_OF_FILE End of file + HA_ERR_RECORD_DELETED Record was deleted (can only happen for static rec) + # Error code +*/ + +int maria_scan(MARIA_HA *info, uchar *record) +{ + DBUG_ENTER("maria_scan"); + /* Clear all state bits except HA_STATE_CHANGED and HA_STATE_ROW_CHANGED */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + DBUG_RETURN((*info->s->scan)(info, record, info->cur_row.nextpos, 1)); +} + + +/* End a sequential scan by calling the share's scan_end hook */ void maria_scan_end(MARIA_HA *info) +{ + (*info->s->scan_end)(info); +} + + +/* Store the current row position (cur_row.lastpos) in *lastpos; always returns 0 */ int _ma_def_scan_remember_pos(MARIA_HA *info, MARIA_RECORD_POS *lastpos) +{ + *lastpos= info->cur_row.lastpos; + return 0; +} + + +/* Make the given position the next one to be read (cur_row.nextpos) */ void _ma_def_scan_restore_pos(MARIA_HA *info, MARIA_RECORD_POS lastpos) +{ + info->cur_row.nextpos= lastpos; +} diff --git a/storage/maria/ma_search.c b/storage/maria/ma_search.c new file mode 100644 index 00000000000..9f1e8e2554b --- /dev/null +++ b/storage/maria/ma_search.c @@ -0,0 +1,2397 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; 
version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* key handling functions */ + +#include "ma_fulltext.h" +#include "m_ctype.h" + +static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key, + uint32 nextflag, register my_off_t pos, + MARIA_PINNED_PAGE **res_page_link, + uchar **res_page_buff); +static my_bool _ma_get_prev_key(MARIA_KEY *key, MARIA_PAGE *ma_page, + uchar *keypos); + + +/* Check that new index is ok. Returns inx on success. Returns -1 with my_errno set to HA_ERR_WRONG_INDEX if inx is negative or the key is not active, and -1 if flushing the record write cache fails. When inx differs from info->lastinx the page is marked as changed and the NEXT_FOUND / PREV_FOUND state bits are set. */ + +int _ma_check_index(MARIA_HA *info, int inx) +{ + if (inx < 0 || ! maria_is_key_active(info->s->state.key_map, inx)) + { + my_errno=HA_ERR_WRONG_INDEX; + return -1; + } + if (info->lastinx != inx) /* Index changed */ + { + info->lastinx = inx; + info->page_changed=1; + info->update= ((info->update & (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED)) | + HA_STATE_NEXT_FOUND | HA_STATE_PREV_FOUND); + } + if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache)) + return(-1); + return(inx); +} /* _ma_check_index */ + + +/** + @brief Search after row by a key + + @note + Position to row is stored in info->cur_row.lastpos. + If SEARCH_SAVE_BUFF is set in nextflag and the key is found, the key page + is copied to info->keyread_buff for a possible read next / previous. + All pinned pages are unpinned before returning. + + @return + @retval 0 ok (key found) + @retval -1 Not found or error + @retval 1 If one should continue search on higher level +*/ + +int _ma_search(register MARIA_HA *info, MARIA_KEY *key, uint32 nextflag, + my_off_t pos) +{ + int error; + MARIA_PINNED_PAGE *page_link; + uchar *page_buff; + + info->page_changed= 1; /* If page not saved */ + if (!(error= _ma_search_no_save(info, key, nextflag, pos, &page_link, + &page_buff))) + { + if (nextflag & SEARCH_SAVE_BUFF) + { + bmove512(info->keyread_buff, 
page_buff, info->s->block_size); + + /* Save position for a possible read next / previous */ + info->int_keypos= info->keyread_buff + info->keypos_offset; + info->int_maxpos= info->keyread_buff + info->maxpos_offset; + info->int_keytree_version= key->keyinfo->version; + info->last_search_keypage= info->last_keypage; + info->page_changed= 0; + info->keyread_buff_used= 0; + } + } + _ma_unpin_all_pages(info, LSN_IMPOSSIBLE); + return (error); +} + +/** + @brief Search after row by a key, without saving the found page + + @param res_page_link On success, set to the pinned-page link of the page + where the key was found + @param res_page_buff On success, set to the buffer of that page + + @note + Position to row is stored in info->cur_row.lastpos. + The function calls itself to descend to the child page given by + _ma_kpos(). + + @return + @retval 0 ok (key found) + @retval -1 Not found or error + @retval 1 If one should continue search on higher level +*/ + +static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key, + uint32 nextflag, register my_off_t pos, + MARIA_PINNED_PAGE **res_page_link, + uchar **res_page_buff) +{ + my_bool last_key_not_used; + int error,flag; + uint page_flag, nod_flag, used_length; + uchar *keypos,*maxpos; + uchar lastkey[MARIA_MAX_KEY_BUFF]; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + MARIA_PINNED_PAGE *page_link; + DBUG_ENTER("_ma_search"); + DBUG_PRINT("enter",("page: %lu nextflag: %u lastpos: %lu", + (ulong) (pos / info->s->block_size), + nextflag, (ulong) info->cur_row.lastpos)); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key);); + + if (pos == HA_OFFSET_ERROR) + { + my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */ + info->cur_row.lastpos= HA_OFFSET_ERROR; + if (!(nextflag & (SEARCH_SMALLER | SEARCH_BIGGER | SEARCH_LAST))) + DBUG_RETURN(-1); /* Not found ; return error */ + DBUG_RETURN(1); /* Search at upper levels */ + } + + if (_ma_fetch_keypage(&page, info, keyinfo, pos, + PAGECACHE_LOCK_READ, DFLT_INIT_HITS, 0, 0)) + goto err; + /* The page just fetched is the last element of info->pinned_pages */ page_link= dynamic_element(&info->pinned_pages, + info->pinned_pages.elements-1, + MARIA_PINNED_PAGE*); + DBUG_DUMP("page", page.buff, page.size); + + flag= (*keyinfo->bin_search)(key, &page, nextflag, 
&keypos, lastkey, + &last_key_not_used); + if (flag == MARIA_FOUND_WRONG_KEY) + DBUG_RETURN(-1); + page_flag= page.flag; + used_length= page.size; + nod_flag= page.node; + maxpos= page.buff + used_length -1; + + if (flag) + { + if ((error= _ma_search_no_save(info, key, nextflag, + _ma_kpos(nod_flag,keypos), + res_page_link, res_page_buff)) <= 0) + DBUG_RETURN(error); + + if (flag >0) + { + if (nextflag & (SEARCH_SMALLER | SEARCH_LAST) && + keypos == page.buff + info->s->keypage_header + nod_flag) + DBUG_RETURN(1); /* Bigger than key */ + } + else if (nextflag & SEARCH_BIGGER && keypos >= maxpos) + DBUG_RETURN(1); /* Smaller than key */ + } + else + { + /* Found matching key */ + if ((nextflag & SEARCH_FIND) && nod_flag && + ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME || + (key->flag & SEARCH_PART_KEY) || info->s->base.born_transactional)) + { + if ((error= _ma_search_no_save(info, key, (nextflag | SEARCH_FIND) & + ~(SEARCH_BIGGER | SEARCH_SMALLER | + SEARCH_LAST), + _ma_kpos(nod_flag,keypos), + res_page_link, res_page_buff)) >= 0 || + my_errno != HA_ERR_KEY_NOT_FOUND) + DBUG_RETURN(error); + } + } + + info->last_key.keyinfo= keyinfo; + if ((nextflag & (SEARCH_SMALLER | SEARCH_LAST)) && flag != 0) + { + uint not_used[2]; + if (_ma_get_prev_key(&info->last_key, &page, keypos)) + goto err; + /* + We have to use key->flag >> 1 here to transform + SEARCH_PAGE_KEY_HAS_TRANSID to SEARCH_USER_KEY_HAS_TRANSID + */ + if (!(nextflag & SEARCH_SMALLER) && + ha_key_cmp(keyinfo->seg, info->last_key.data, key->data, + key->data_length + key->ref_length, + SEARCH_FIND | (key->flag >> 1) | info->last_key.flag, + not_used)) + { + my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */ + goto err; + } + } + else + { + /* Set info->last_key to temporarily point to last key value */ + info->last_key.data= lastkey; + /* Get key value (if not packed key) and position after key */ + if (!(*keyinfo->get_key)(&info->last_key, page_flag, nod_flag, &keypos)) + goto err; + 
memcpy(info->lastkey_buff, lastkey, + info->last_key.data_length + info->last_key.ref_length); + info->last_key.data= info->lastkey_buff; + } + info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key); + info->cur_row.trid= _ma_trid_from_key(&info->last_key); + + /* Store offset to key */ + info->keypos_offset= (uint) (keypos - page.buff); + info->maxpos_offset= (uint) (maxpos - page.buff); + info->int_nod_flag= nod_flag; + info->last_keypage= pos; + *res_page_link= page_link; + *res_page_buff= page.buff; + + DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); + +err: + DBUG_PRINT("exit",("Error: %d",my_errno)); + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->page_changed=1; + DBUG_RETURN (-1); +} + + +/* + Search after key in page-block + + @fn _ma_bin_search + @param key Search after this key + @param ma_page Key page to search + @param comp_flag How key should be compared + @param ret_pos Out: position of the identical or bigger key + @param buff Buffer for holding a key (not used here, only passed on + to _ma_seq_search()) + @param last_key Out: see retval last_key below + + @note + Pages with KEYPAGE_FLAG_HAS_TRANSID have keys of varying length; for + these the search is delegated to _ma_seq_search(), which stores the + smaller or identical key in buff. + + @return + @retval <0, 0 , >0 depending on whether the key found is smaller, equal or + bigger than 'key' + @retval ret_pos Points to where the identical or bigger key starts + @retval last_key Set to 1 if key is the last key in the page. 
+*/ + +int _ma_bin_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page, + uint32 comp_flag, uchar **ret_pos, + uchar *buff __attribute__((unused)), my_bool *last_key) +{ + int flag; + uint page_flag; + uint start, mid, end, save_end, totlength, nod_flag; + uint not_used[2]; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_SHARE *share= keyinfo->share; + uchar *page; + DBUG_ENTER("_ma_bin_search"); + + LINT_INIT(flag); + + page_flag= ma_page->flag; + if (page_flag & KEYPAGE_FLAG_HAS_TRANSID) + { + /* Keys have varying length, can't use binary search */ + DBUG_RETURN(_ma_seq_search(key, ma_page, comp_flag, ret_pos, buff, + last_key)); + } + + nod_flag= ma_page->node; + /* Every entry on the page is a fixed length key plus the node pointer */ totlength= keyinfo->keylength + nod_flag; + DBUG_ASSERT(ma_page->size >= share->keypage_header + nod_flag + totlength); + + start=0; + mid=1; + /* end is the index of the last entry on the page */ save_end= end= ((ma_page->size - nod_flag - share->keypage_header) / + totlength-1); + DBUG_PRINT("test",("page_length: %u end: %u", ma_page->size, end)); + page= ma_page->buff + share->keypage_header + nod_flag; + + /* Narrow [start, end] to the first entry that is not smaller than key */ while (start != end) + { + mid= (start+end)/2; + if ((flag=ha_key_cmp(keyinfo->seg, page + (uint) mid * totlength, + key->data, key->data_length + key->ref_length, + comp_flag, not_used)) + >= 0) + end=mid; + else + start=mid+1; + } + /* flag was computed for entry 'mid'; recompare if we ended on another entry */ if (mid != start) + flag=ha_key_cmp(keyinfo->seg, page + (uint) start * totlength, + key->data, key->data_length + key->ref_length, comp_flag, + not_used); + if (flag < 0) + start++; /* point at next, bigger key */ + *ret_pos= (page + (uint) start * totlength); + *last_key= end == save_end; + DBUG_PRINT("exit",("flag: %d keypos: %d",flag,start)); + DBUG_RETURN(flag); +} /* _ma_bin_search */ + + +/** + Locate a packed key in a key page. + + @fn _ma_seq_search() + @param key Search key. + @param page Key page (beginning). + @param comp_flag Search flags like SEARCH_SAME etc. 
+ @param ret_pos + @param buff Buffer for holding temp keys + @param last_key + + @description + Used instead of _ma_bin_search() when key is packed. + Puts smaller or identical key in buff. + Key is searched sequentially. + + @todo + Don't copy key to buffer if we are not using key with prefix packing + + @return + @retval > 0 Key in 'buff' is smaller than search key. + @retval 0 Key in 'buff' is identical to search key. + @retval < 0 Not found. + + @retval ret_pos Points to where the identical or bigger key starts + @retval last_key Set to 1 if key is the last key in the page + @retval buff Copy of previous or identical unpacked key +*/ + +int _ma_seq_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page, + uint32 comp_flag, uchar **ret_pos, + uchar *buff, my_bool *last_key) +{ + int flag; + uint page_flag, nod_flag, length, not_used[2]; + uchar t_buff[MARIA_MAX_KEY_BUFF], *end; + uchar *page; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_SHARE *share= keyinfo->share; + MARIA_KEY tmp_key; + DBUG_ENTER("_ma_seq_search"); + + LINT_INIT(flag); + LINT_INIT(length); + + page_flag= ma_page->flag; + nod_flag= ma_page->node; + page= ma_page->buff; + end= page + ma_page->size; + page+= share->keypage_header + nod_flag; + *ret_pos= page; + t_buff[0]=0; /* Avoid bugs */ + + tmp_key.data= t_buff; + tmp_key.keyinfo= keyinfo; + while (page < end) + { + length=(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &page); + if (length == 0 || page > end) + { + maria_print_error(share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_PRINT("error", + ("Found wrong key: length: %u page: 0x%lx end: 0x%lx", + length, (long) page, (long) end)); + DBUG_RETURN(MARIA_FOUND_WRONG_KEY); + } + if ((flag= ha_key_cmp(keyinfo->seg, t_buff, key->data, + key->data_length + key->ref_length, + comp_flag | tmp_key.flag, + not_used)) >= 0) + break; + DBUG_PRINT("loop_extra",("page: 0x%lx key: '%s' flag: %d", + (long) page, t_buff, flag)); + memcpy(buff,t_buff,length); + *ret_pos=page; + } + if 
(flag == 0) + memcpy(buff,t_buff,length); /* Result is first key */ + *last_key= page == end; + DBUG_PRINT("exit",("flag: %d ret_pos: 0x%lx", flag, (long) *ret_pos)); + DBUG_RETURN(flag); +} /* _ma_seq_search */ + + +/** + Search for key on key page with string prefix compression + + @notes + This is an optimized function compared to calling _ma_get_pack_key() + for each key in the buffer + + Same interface as for _ma_seq_search() +*/ + +int _ma_prefix_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page, + uint32 nextflag, uchar **ret_pos, uchar *buff, + my_bool *last_key) +{ + /* + my_flag is raw comparison result to be changed according to + SEARCH_NO_FIND,SEARCH_LAST and HA_REVERSE_SORT flags. + flag is the value returned by ha_key_cmp and as treated as final + */ + int flag=0, my_flag=-1; + uint nod_flag, length, len, matched, cmplen, kseg_len; + uint page_flag, prefix_len,suffix_len; + int key_len_skip, seg_len_pack, key_len_left; + uchar *end, *vseg, *saved_vseg, *saved_from; + uchar *page; + uchar tt_buff[MARIA_MAX_KEY_BUFF+2], *t_buff=tt_buff+2; + uchar *saved_to; + const uchar *kseg; + uint saved_length=0, saved_prefix_len=0; + uint length_pack; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_SHARE *share= keyinfo->share; + const uchar *sort_order= keyinfo->seg->charset->sort_order; + DBUG_ENTER("_ma_prefix_search"); + + LINT_INIT(length); + LINT_INIT(prefix_len); + LINT_INIT(seg_len_pack); + LINT_INIT(saved_from); + LINT_INIT(saved_to); + LINT_INIT(saved_vseg); + + t_buff[0]=0; /* Avoid bugs */ + page_flag= ma_page->flag; + nod_flag= ma_page->node; + page_flag&= KEYPAGE_FLAG_HAS_TRANSID; /* For faster test in loop */ + page= ma_page->buff; + end= page + ma_page->size; + page+= share->keypage_header + nod_flag; + *ret_pos= page; + kseg= key->data; + + get_key_pack_length(kseg_len, length_pack, kseg); + key_len_skip=length_pack+kseg_len; + key_len_left=(int) (key->data_length + key->ref_length) - (int) key_len_skip; + /* If key_len is 0, then length_pack 
is 1, then key_len_left is -1. */ + cmplen= ((key_len_left>=0) ? kseg_len : + (key->data_length + key->ref_length - length_pack)); + DBUG_PRINT("info",("key: '%.*s'",kseg_len,kseg)); + + /* + Keys are compressed the following way: + + If the max length of the first key segment is < 127 bytes the length + field is 1 byte, else it is 2 bytes + + (prefix) length The high bit is set if this is a prefix for the prev key. + [suffix length] Packed length of suffix if the previous was a prefix. + (suffix) data Key data bytes (past the common prefix or whole segment). + [next-key-seg] Next key segments (([packed length], data), ...) + pointer Reference to the data file (last_keyseg->length). + */ + + matched=0; /* how many chars from the prefix were already matched */ + len=0; /* length of previous key unpacked */ + + while (page < end) + { + uint packed= *page & 128; + uint key_flag; + + vseg= page; + if (keyinfo->seg->length >= 127) + { + suffix_len=mi_uint2korr(vseg) & 32767; + vseg+=2; + } + else + suffix_len= *vseg++ & 127; + + if (packed) + { + if (suffix_len == 0) + { + /* == 0x80 or 0x8000, same key, prefix length == old key length. */ + prefix_len=len; + } + else + { + /* > 0x80 or 0x8000, this is the prefix length, packed suffix length follows. */ + prefix_len=suffix_len; + get_key_length(suffix_len,vseg); + } + } + else + { + /* Not packed. No prefix used from last key. 
*/ + prefix_len=0; + } + + len=prefix_len+suffix_len; + seg_len_pack=get_pack_length(len); + t_buff=tt_buff+3-seg_len_pack; + store_key_length(t_buff,len); + + /* Extend the prefix kept in t_buff with bytes from the previous key's data */ if (prefix_len > saved_prefix_len) + memcpy(t_buff+seg_len_pack+saved_prefix_len,saved_vseg, + prefix_len-saved_prefix_len); + saved_vseg=vseg; + saved_prefix_len=prefix_len; + + DBUG_PRINT("loop",("page: '%.*s%.*s'",prefix_len,t_buff+seg_len_pack, + suffix_len,vseg)); + { + /* Calculate length of one key */ + uchar *from= vseg+suffix_len; + HA_KEYSEG *keyseg; + + /* Step over the remaining key segments (the first one is at vseg) */ for (keyseg=keyinfo->seg+1 ; keyseg->type ; keyseg++ ) + { + if (keyseg->flag & HA_NULL_PART) + { + if (!(*from++)) + continue; + } + if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK)) + { + uint key_part_length; + get_key_length(key_part_length,from); + from+= key_part_length; + } + else + from+= keyseg->length; + } + /* keyseg is now the terminating segment; its length is the data file reference */ from+= keyseg->length; + key_flag=0; + + if (page_flag && key_has_transid(from-1)) + { + from+= transid_packed_length(from); + key_flag= SEARCH_PAGE_KEY_HAS_TRANSID; + } + page= from + nod_flag; + length= (uint) (from-vseg); + } + + if (page > end) + { + maria_print_error(share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_PRINT("error", + ("Found wrong key: length: %u page: 0x%lx end: %lx", + length, (long) page, (long) end)); + DBUG_RETURN(MARIA_FOUND_WRONG_KEY); + } + + if (matched >= prefix_len) + { + /* We have to compare. But we can still skip part of the key */ + uint left; + const uchar *k= kseg+prefix_len; + + /* + If prefix_len > cmplen then we are in the end-space comparison + phase. Do not try to access the key any more ==> left= 0. + */ + left= ((len <= cmplen) ? suffix_len : + ((prefix_len < cmplen) ? 
cmplen - prefix_len : 0)); + + matched=prefix_len+left; + + if (sort_order) + { + for (my_flag=0;left;left--) + if ((my_flag= (int) sort_order[*vseg++] - (int) sort_order[*k++])) + break; + } + else + { + for (my_flag=0;left;left--) + if ((my_flag= (int) *vseg++ - (int) *k++)) + break; + } + + if (my_flag>0) /* mismatch */ + break; + if (my_flag==0) /* match */ + { + /* + ** len cmplen seg_left_len more_segs + ** < matched=len; continue search + ** > = prefix ? found : (matched=len; + * continue search) + ** > < - ok, found + ** = < - ok, found + ** = = - ok, found + ** = = + next seg + */ + if (len < cmplen) + { + if ((keyinfo->seg->type != HA_KEYTYPE_TEXT && + keyinfo->seg->type != HA_KEYTYPE_VARTEXT1 && + keyinfo->seg->type != HA_KEYTYPE_VARTEXT2)) + my_flag= -1; + else + { + /* We have to compare k and vseg as if they were space extended */ + const uchar *k_end= k+ (cmplen - len); + for ( ; k < k_end && *k == ' '; k++) ; + if (k == k_end) + goto cmp_rest; /* should never happen */ + if ((uchar) *k < (uchar) ' ') + { + my_flag= 1; /* Compared string is smaller */ + break; + } + my_flag= -1; /* Continue searching */ + } + } + else if (len > cmplen) + { + uchar *vseg_end; + if ((nextflag & SEARCH_PREFIX) && key_len_left == 0) + goto fix_flag; + + /* We have to compare k and vseg as if they were space extended */ + for (vseg_end= vseg + (len-cmplen) ; + vseg < vseg_end && *vseg == (uchar) ' '; + vseg++, matched++) ; + DBUG_ASSERT(vseg < vseg_end); + + if ((uchar) *vseg > (uchar) ' ') + { + my_flag= 1; /* Compared string is smaller */ + break; + } + my_flag= -1; /* Continue searching */ + } + else + { + cmp_rest: + if (key_len_left>0) + { + uint not_used[2]; + if ((flag = ha_key_cmp(keyinfo->seg+1,vseg, + k, key_len_left, nextflag | key_flag, + not_used)) >= 0) + break; + } + else + { + /* + at this line flag==-1 if the following lines were already + visited and 0 otherwise, i.e. flag <=0 here always !!! 
+ */ + fix_flag: + DBUG_ASSERT(flag <= 0); + if (nextflag & (SEARCH_NO_FIND | SEARCH_LAST)) + flag=(nextflag & (SEARCH_BIGGER | SEARCH_LAST)) ? -1 : 1; + if (flag>=0) + break; + } + } + } + matched-=left; + } + /* else (matched < prefix_len) ---> do nothing. */ + + memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len); + saved_to= buff+saved_length; + saved_from= saved_vseg; + saved_length=length; + *ret_pos=page; + } + if (my_flag) + flag=(keyinfo->seg->flag & HA_REVERSE_SORT) ? -my_flag : my_flag; + if (flag == 0) + { + memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len); + saved_to= buff+saved_length; + saved_from= saved_vseg; + saved_length=length; + } + if (saved_length) + memcpy(saved_to, saved_from, saved_length); + + *last_key= page == end; + + DBUG_PRINT("exit",("flag: %d ret_pos: 0x%lx", flag, (long) *ret_pos)); + DBUG_RETURN(flag); +} /* _ma_prefix_search */ + + +/* Get pos to a key_block. Reads the nod_flag bytes stored just before after_key as a page number and returns it multiplied by maria_block_size. Returns HA_OFFSET_ERROR when nod_flag is 0 (leaf page) or not a supported length. */ + +my_off_t _ma_kpos(uint nod_flag, const uchar *after_key) +{ + after_key-=nod_flag; + switch (nod_flag) { +#if SIZEOF_OFF_T > 4 + case 7: + return mi_uint7korr(after_key)*maria_block_size; + case 6: + return mi_uint6korr(after_key)*maria_block_size; + case 5: + return mi_uint5korr(after_key)*maria_block_size; +#else + /* Small off_t: skip the leading bytes and read only the last 4 */ case 7: + after_key++; /* fall through */ + case 6: + after_key++; /* fall through */ + case 5: + after_key++; /* fall through */ +#endif + case 4: + return ((my_off_t) mi_uint4korr(after_key))*maria_block_size; + case 3: + return ((my_off_t) mi_uint3korr(after_key))*maria_block_size; + case 2: + return (my_off_t) (mi_uint2korr(after_key)*maria_block_size); + case 1: + return (uint) (*after_key)*maria_block_size; + case 0: /* At leaf page */ + default: /* Impossible */ + return(HA_OFFSET_ERROR); + } +} /* _ma_kpos */ + + +/* Save pos to a key_block. pos is divided by maria_block_size and stored in buff using info->s->base.key_reflength bytes. */ + +void _ma_kpointer(register MARIA_HA *info, register uchar *buff, my_off_t pos) +{ + pos/=maria_block_size; + switch (info->s->base.key_reflength) { +#if SIZEOF_OFF_T > 4 + case 7: mi_int7store(buff,pos); break; + case 6: mi_int6store(buff,pos); break; 
+ case 5: mi_int5store(buff,pos); break; +#else + case 7: *buff++=0; + /* fall through */ + case 6: *buff++=0; + /* fall through */ + case 5: *buff++=0; + /* fall through */ +#endif + case 4: mi_int4store(buff,pos); break; + case 3: mi_int3store(buff,pos); break; + case 2: mi_int2store(buff,(uint) pos); break; + case 1: buff[0]= (uchar) pos; break; + default: abort(); /* impossible */ + } +} /* _ma_kpointer */ + + +/* Calc pos to a data-record from a key. The reference is read from the share->rec_reflength bytes that follow the key data and is converted with the share's keypos_to_recpos hook. With a 4 byte off_t only the last 4 bytes of a longer reference are read. */ + +MARIA_RECORD_POS _ma_row_pos_from_key(const MARIA_KEY *key) +{ + my_off_t pos; + const uchar *after_key= key->data + key->data_length; + MARIA_SHARE *share= key->keyinfo->share; + switch (share->rec_reflength) { +#if SIZEOF_OFF_T > 4 + case 8: pos= (my_off_t) mi_uint8korr(after_key); break; + case 7: pos= (my_off_t) mi_uint7korr(after_key); break; + case 6: pos= (my_off_t) mi_uint6korr(after_key); break; + case 5: pos= (my_off_t) mi_uint5korr(after_key); break; +#else + case 8: pos= (my_off_t) mi_uint4korr(after_key+4); break; + case 7: pos= (my_off_t) mi_uint4korr(after_key+3); break; + case 6: pos= (my_off_t) mi_uint4korr(after_key+2); break; + case 5: pos= (my_off_t) mi_uint4korr(after_key+1); break; +#endif + case 4: pos= (my_off_t) mi_uint4korr(after_key); break; + case 3: pos= (my_off_t) mi_uint3korr(after_key); break; + case 2: pos= (my_off_t) mi_uint2korr(after_key); break; + default: + pos=0L; /* Shut compiler up */ + } + return (*share->keypos_to_recpos)(share, pos); +} + + +/** + Get trid from a key + + @param key Maria key read from a page + + @note + The packed trid is read from the position after the key data and + the share->rec_reflength bytes of the row reference. + + @retval 0 If key doesn't have a trid + @retval trid +*/ + +TrID _ma_trid_from_key(const MARIA_KEY *key) +{ + if (!(key->flag & (SEARCH_PAGE_KEY_HAS_TRANSID | + SEARCH_USER_KEY_HAS_TRANSID))) + return 0; + return transid_get_packed(key->keyinfo->share, + key->data + key->data_length + + key->keyinfo->share->rec_reflength); +} + + +/* Calc position from a record pointer ( in delete link chain ) */ + +MARIA_RECORD_POS _ma_rec_pos(MARIA_SHARE *share, uchar *ptr) +{ 
+ my_off_t pos; + switch (share->rec_reflength) { +#if SIZEOF_OFF_T > 4 + case 8: + pos= (my_off_t) mi_uint8korr(ptr); + if (pos == HA_OFFSET_ERROR) + return HA_OFFSET_ERROR; /* end of list */ + break; + case 7: + pos= (my_off_t) mi_uint7korr(ptr); + if (pos == (((my_off_t) 1) << 56) -1) + return HA_OFFSET_ERROR; /* end of list */ + break; + case 6: + pos= (my_off_t) mi_uint6korr(ptr); + if (pos == (((my_off_t) 1) << 48) -1) + return HA_OFFSET_ERROR; /* end of list */ + break; + case 5: + pos= (my_off_t) mi_uint5korr(ptr); + if (pos == (((my_off_t) 1) << 40) -1) + return HA_OFFSET_ERROR; /* end of list */ + break; +#else + case 8: + case 7: + case 6: + case 5: + ptr+= (share->rec_reflength-4); + /* fall through */ +#endif + case 4: + pos= (my_off_t) mi_uint4korr(ptr); + if (pos == (my_off_t) (uint32) ~0L) + return HA_OFFSET_ERROR; + break; + case 3: + pos= (my_off_t) mi_uint3korr(ptr); + if (pos == (my_off_t) (1 << 24) -1) + return HA_OFFSET_ERROR; + break; + case 2: + pos= (my_off_t) mi_uint2korr(ptr); + if (pos == (my_off_t) (1 << 16) -1) + return HA_OFFSET_ERROR; + break; + default: abort(); /* Impossible */ + } + return (*share->keypos_to_recpos)(share, pos); +} + + +/* Save position to record. pos is stored in buff using share->rec_reflength bytes; any value other than HA_OFFSET_ERROR is first converted with the share's recpos_to_keypos hook. With a 4 byte off_t the leading bytes of a longer reference are written as zero. */ + +void _ma_dpointer(MARIA_SHARE *share, uchar *buff, my_off_t pos) +{ + if (pos != HA_OFFSET_ERROR) + pos= (*share->recpos_to_keypos)(share, pos); + + switch (share->rec_reflength) { +#if SIZEOF_OFF_T > 4 + case 8: mi_int8store(buff,pos); break; + case 7: mi_int7store(buff,pos); break; + case 6: mi_int6store(buff,pos); break; + case 5: mi_int5store(buff,pos); break; +#else + case 8: *buff++=0; + /* fall through */ + case 7: *buff++=0; + /* fall through */ + case 6: *buff++=0; + /* fall through */ + case 5: *buff++=0; + /* fall through */ +#endif + case 4: mi_int4store(buff,pos); break; + case 3: mi_int3store(buff,pos); break; + case 2: mi_int2store(buff,(uint) pos); break; + default: abort(); /* Impossible */ + } +} /* _ma_dpointer */ + + +my_off_t 
_ma_static_keypos_to_recpos(MARIA_SHARE *share, my_off_t pos) +{ + /* Stored position is a row number; scale it by the packed record length */ return pos * share->base.pack_reclength; +} + + +/* Inverse of _ma_static_keypos_to_recpos(): divide by the packed record length */ my_off_t _ma_static_recpos_to_keypos(MARIA_SHARE *share, my_off_t pos) +{ + return pos / share->base.pack_reclength; +} + +/* Identity conversion: the position is returned unchanged */ my_off_t _ma_transparent_recpos(MARIA_SHARE *share __attribute__((unused)), + my_off_t pos) +{ + return pos; +} + +my_off_t _ma_transaction_keypos_to_recpos(MARIA_SHARE *share + __attribute__((unused)), + my_off_t pos) +{ + /* We need one bit to store if there is transid's after position */ + return pos >> 1; +} + +/* Inverse of _ma_transaction_keypos_to_recpos(): shift left, leaving the lowest bit clear */ my_off_t _ma_transaction_recpos_to_keypos(MARIA_SHARE *share + __attribute__((unused)), + my_off_t pos) +{ + return pos << 1; +} + +/* + @brief Get key from key-block + + @param key Should contain previous key. Will contain new key + @param page_flag Flag on page block + @param nod_flag Length of the node pointer if we are on a node page, else zero + @param page Points at previous key; It is advanced to point at next key + + @notes + Same as _ma_get_key but used with fixed length keys. + If the page has KEYPAGE_FLAG_HAS_TRANSID and the key has a transid, the + packed transid length is added to key->ref_length and to the return value, + and key->flag is set to SEARCH_PAGE_KEY_HAS_TRANSID. + + @return + @retval key_length + length of data pointer (without nod length) + */ + +uint _ma_get_static_key(MARIA_KEY *key, uint page_flag, uint nod_flag, + register uchar **page) +{ + register MARIA_KEYDEF *keyinfo= key->keyinfo; + size_t key_length= keyinfo->keylength; + + key->ref_length= keyinfo->share->rec_reflength; + key->data_length= key_length - key->ref_length; + key->flag= 0; + if (page_flag & KEYPAGE_FLAG_HAS_TRANSID) + { + uchar *end= *page + keyinfo->keylength; + if (key_has_transid(end-1)) + { + uint trans_length= transid_packed_length(end); + key->ref_length+= trans_length; + key_length+= trans_length; + key->flag= SEARCH_PAGE_KEY_HAS_TRANSID; + } + } + /* Copy the key together with the node pointer and step past both */ key_length+= nod_flag; + memcpy(key->data, *page, key_length); + *page+= key_length; + return key_length - nod_flag; +} /* _ma_get_static_key */ + + +/** + Skip over static length key from key-block + + @fn _ma_skip_static_key() + @param key Keyinfo and buffer that can be used + @param nod_flag If nod: 
Length of node pointer, else zero. + @param key Points at key + + @retval pointer to next key +*/ + +uchar *_ma_skip_static_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page) +{ + page+= key->keyinfo->keylength; + if ((page_flag & KEYPAGE_FLAG_HAS_TRANSID) && key_has_transid(page-1)) + page+= transid_packed_length(page); + return page+ nod_flag; +} + + +/* + get key which is packed against previous key or key with a NULL column. + + SYNOPSIS + _ma_get_pack_key() + @param int_key Should contain previous key. Will contain new key + @param page_flag page_flag from page + @param nod_flag If nod: Length of node pointer, else zero. + @param page_pos Points at previous key; Its advanced to point at next key + + @return + @retval key_length + length of data pointer +*/ + +uint _ma_get_pack_key(MARIA_KEY *int_key, uint page_flag, + uint nod_flag, uchar **page_pos) +{ + reg1 HA_KEYSEG *keyseg; + uchar *page= *page_pos; + uint length; + uchar *key= int_key->data; + MARIA_KEYDEF *keyinfo= int_key->keyinfo; + + for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++) + { + if (keyseg->flag & HA_PACK_KEY) + { + /* key with length, packed to previous key */ + uchar *start= key; + uint packed= *page & 128,tot_length,rest_length; + if (keyseg->length >= 127) + { + length=mi_uint2korr(page) & 32767; + page+=2; + } + else + length= *page++ & 127; + + if (packed) + { + if (length > (uint) keyseg->length) + { + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + return 0; /* Error */ + } + if (length == 0) /* Same key */ + { + if (keyseg->flag & HA_NULL_PART) + *key++=1; /* Can't be NULL */ + get_key_length(length,key); + key+= length; /* Same diff_key as prev */ + if (length > keyseg->length) + { + DBUG_PRINT("error", + ("Found too long null packed key: %u of %u at 0x%lx", + length, keyseg->length, (long) *page_pos)); + DBUG_DUMP("key", *page_pos, 16); + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + return 0; + } + 
continue; + } + if (keyseg->flag & HA_NULL_PART) + { + key++; /* Skip null marker*/ + start++; + } + + get_key_length(rest_length,page); + tot_length=rest_length+length; + + /* If the stored length has changed, we must move the key */ + if (tot_length >= 255 && *start != 255) + { + /* length prefix changed from a length of one to a length of 3 */ + bmove_upp(key+length+3, key+length+1, length); + *key=255; + mi_int2store(key+1,tot_length); + key+=3+length; + } + else if (tot_length < 255 && *start == 255) + { + bmove(key+1,key+3,length); + *key=tot_length; + key+=1+length; + } + else + { + store_key_length_inc(key,tot_length); + key+=length; + } + memcpy(key,page,rest_length); + page+=rest_length; + key+=rest_length; + continue; + } + else + { + /* Key that is not packed against previous key */ + if (keyseg->flag & HA_NULL_PART) + { + if (!length--) /* Null part */ + { + *key++=0; + continue; + } + *key++=1; /* Not null */ + } + } + if (length > (uint) keyseg->length) + { + DBUG_PRINT("error",("Found too long packed key: %u of %u at 0x%lx", + length, keyseg->length, (long) *page_pos)); + DBUG_DUMP("key", *page_pos, 16); + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + return 0; /* Error */ + } + store_key_length_inc(key,length); + } + else + { + if (keyseg->flag & HA_NULL_PART) + { + if (!(*key++ = *page++)) + continue; + } + if (keyseg->flag & + (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK)) + { + uchar *tmp=page; + get_key_length(length,tmp); + length+=(uint) (tmp-page); + } + else + length=keyseg->length; + } + memcpy(key, page,(size_t) length); + key+=length; + page+=length; + } + + int_key->data_length= (key - int_key->data); + int_key->flag= 0; + length= keyseg->length; + if (page_flag & KEYPAGE_FLAG_HAS_TRANSID) + { + uchar *end= page + length; + if (key_has_transid(end-1)) + { + length+= transid_packed_length(end); + int_key->flag= SEARCH_PAGE_KEY_HAS_TRANSID; + } + } + int_key->ref_length= length; + length+= nod_flag; 
+ bmove(key, page, length); + *page_pos= page+length; + + return (int_key->data_length + int_key->ref_length); +} /* _ma_get_pack_key */ + + +/** + Skip key which is packed against previous key or key with a NULL column. + + @fn _ma_skip_pack_key() + @param key Key whose keyinfo describes the key segments (only key->keyinfo is read here) + @param page_flag Flag from the page; tested for KEYPAGE_FLAG_HAS_TRANSID + @param nod_flag If nod: Length of node pointer, else zero. + @param page Points at key + + @note + This is in principle a simpler version of _ma_get_pack_key() + + @retval pointer to next key +*/ + +uchar *_ma_skip_pack_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page) +{ + reg1 HA_KEYSEG *keyseg; + for (keyseg= key->keyinfo->seg ; keyseg->type ; keyseg++) + { + if (keyseg->flag & HA_PACK_KEY) + { + /* key with length, packed to previous key */ + uint packed= *page & 128, length; + if (keyseg->length >= 127) + { + length= mi_uint2korr(page) & 32767; + page+= 2; + } + else + length= *page++ & 127; + + if (packed) + { + if (length == 0) /* Same key */ + continue; + get_key_length(length,page); + page+= length; + continue; + } + if ((keyseg->flag & HA_NULL_PART) && length) + { + /* + Keys that can have null use length+1 as the length for data as the + number 0 is reserved for keys that have a NULL value + */ + length--; + } + page+= length; + } + else + { + if (keyseg->flag & HA_NULL_PART) + if (!*page++) + continue; + if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + uint length; + get_key_length(length,page); + page+=length; + } + else + page+= keyseg->length; + } + } + /* Last segment (type == 0) contains length of data pointer */ + page+= keyseg->length; + if ((page_flag & KEYPAGE_FLAG_HAS_TRANSID) && key_has_transid(page-1)) + page+= transid_packed_length(page); + return page + nod_flag; +} + + +/* Read key that is packed relatively to previous */ + +uint _ma_get_binary_pack_key(MARIA_KEY *int_key, uint page_flag, uint nod_flag, + register uchar **page_pos) +{ + reg1 HA_KEYSEG *keyseg; + uchar *page, *page_end, *from, *from_end, *key; + uint length,tmp; + MARIA_KEYDEF *keyinfo= 
int_key->keyinfo; + DBUG_ENTER("_ma_get_binary_pack_key"); + + page= *page_pos; + page_end=page + MARIA_MAX_KEY_BUFF + 1; + key= int_key->data; + + /* + Keys are compressed the following way: + + prefix length Packed length of prefix common with prev key. + (1 or 3 bytes) + for each key segment: + [is null] Null indicator if can be null (1 byte, zero means null) + [length] Packed length if varlength (1 or 3 bytes) + key segment 'length' bytes of key segment value + pointer Reference to the data file (last_keyseg->length). + + get_key_length() is a macro. It gets the prefix length from 'page' + and puts it into 'length'. It increments 'page' by 1 or 3, depending + on the packed length of the prefix length. + */ + get_key_length(length,page); + if (length) + { + if (length > keyinfo->maxlength) + { + DBUG_PRINT("error", + ("Found too long binary packed key: %u of %u at 0x%lx", + length, keyinfo->maxlength, (long) *page_pos)); + DBUG_DUMP("key", *page_pos, 16); + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(0); /* Wrong key */ + } + /* Key is packed against prev key, take prefix from prev key. */ + from= key; + from_end= key + length; + } + else + { + /* Key is not packed against prev key, take all from page buffer. */ + from= page; + from_end= page_end; + } + + /* + The trouble is that key can be split in two parts: + The first part (prefix) is in from .. from_end - 1. + The second part starts at page. + The split can be at every byte position. So we need to check for + the end of the first part before using every byte. + */ + for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++) + { + if (keyseg->flag & HA_NULL_PART) + { + /* If prefix is used up, switch to rest. */ + if (from == from_end) + { + from=page; + from_end=page_end; + } + if (!(*key++ = *from++)) + continue; /* Null part */ + } + if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK)) + { + /* If prefix is used up, switch to rest. 
*/ + if (from == from_end) { from=page; from_end=page_end; } + /* Get length of dynamic length key part */ + if ((length= (uint) (uchar) (*key++ = *from++)) == 255) + { + /* If prefix is used up, switch to rest. */ + if (from == from_end) { from=page; from_end=page_end; } + length= ((uint) (uchar) ((*key++ = *from++))) << 8; + /* If prefix is used up, switch to rest. */ + if (from == from_end) { from=page; from_end=page_end; } + length+= (uint) (uchar) ((*key++ = *from++)); + } + } + else + length=keyseg->length; + + if ((tmp=(uint) (from_end-from)) <= length) + { + key+=tmp; /* Use old key */ + length-=tmp; + from=page; from_end=page_end; + } + DBUG_ASSERT((int) length >= 0); + DBUG_PRINT("info",("key: 0x%lx from: 0x%lx length: %u", + (long) key, (long) from, length)); + memmove(key, from, (size_t) length); + key+=length; + from+=length; + } + /* + Last segment (type == 0) contains length of data pointer. + If we have mixed key blocks with data pointer and key block pointer, + we have to copy both. + */ + int_key->data_length= (key - int_key->data); + int_key->ref_length= length= keyseg->length; + int_key->flag= 0; + if ((tmp=(uint) (from_end-from)) <= length) + { + /* Skip over the last common part of the data */ + key+= tmp; + length-= tmp; + from= page; + } + else + { + /* + Remaining length is greater than max possible length. + This can happen only if we switched to the new key bytes already. + 'page_end' is calculated with MARIA_MAX_KEY_BUFF. So it can be far + behind the real end of the key. 
+ */ + if (from_end != page_end) + { + DBUG_PRINT("error",("Error when unpacking key")); + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(0); /* Error */ + } + } + if (page_flag & KEYPAGE_FLAG_HAS_TRANSID) + { + uchar *end= from + length; + if (key_has_transid(end-1)) + { + uint trans_length= transid_packed_length(end); + length+= trans_length; + int_key->ref_length+= trans_length; + int_key->flag= SEARCH_PAGE_KEY_HAS_TRANSID; + } + } + + /* Copy rest of data ptr and, if appropriate, trans_id and node_ptr */ + memcpy(key, from, length + nod_flag); + *page_pos= from + length + nod_flag; + + DBUG_RETURN(int_key->data_length + int_key->ref_length); +} + +/** + Skip key which is prefix packed against previous key + + @fn _ma_skip_binary_pack_key() + @param key Keyinfo and buffer that can be used; the key is unpacked into it + @param page_flag Flag from the page, passed on to _ma_get_binary_pack_key() + @param nod_flag If nod: Length of node pointer, else zero. + @param page Points at key + + @note + We have to copy the key as otherwise we don't know how much left + data there is of the key. + + @todo + Implement more efficient version of this. We can ignore to copy any rest + key parts that are not null or not packed. We also don't have to copy + rowid or transid. + + @retval pointer to next key + @retval 0 if _ma_get_binary_pack_key() failed +*/ + +uchar *_ma_skip_binary_pack_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page) +{ + if (!_ma_get_binary_pack_key(key, page_flag, nod_flag, &page)) + return 0; + return page; +} + + +/** + @brief Get key at position without knowledge of previous key + + @return pointer to next key +*/ + +uchar *_ma_get_key(MARIA_KEY *key, MARIA_PAGE *ma_page, uchar *keypos) +{ + uint page_flag, nod_flag; + MARIA_KEYDEF *keyinfo= key->keyinfo; + uchar *page; + DBUG_ENTER("_ma_get_key"); + + page= ma_page->buff; + page_flag= ma_page->flag; + nod_flag= ma_page->node; + + if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) && + ! 
(page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { + bmove(key->data, keypos, keyinfo->keylength+nod_flag); + key->ref_length= keyinfo->share->rec_reflength; + key->data_length= keyinfo->keylength - key->ref_length; + key->flag= 0; + DBUG_RETURN(keypos+keyinfo->keylength+nod_flag); + } + else + { + page+= keyinfo->share->keypage_header + nod_flag; + key->data[0]= 0; /* safety */ + while (page <= keypos) + { + if (!(*keyinfo->get_key)(key, page_flag, nod_flag, &page)) + { + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(0); + } + } + } + DBUG_PRINT("exit",("page: 0x%lx length: %u", (long) page, + key->data_length + key->ref_length)); + DBUG_RETURN(page); +} /* _ma_get_key */ + + +/* + @brief Get key at position without knowledge of previous key + + @return + @retval 0 ok + @retval 1 error +*/ + +static my_bool _ma_get_prev_key(MARIA_KEY *key, MARIA_PAGE *ma_page, + uchar *keypos) +{ + uint page_flag, nod_flag; + MARIA_KEYDEF *keyinfo= key->keyinfo; + DBUG_ENTER("_ma_get_prev_key"); + + page_flag= ma_page->flag; + nod_flag= ma_page->node; + + if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) && + ! (page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { + bmove(key->data, keypos - keyinfo->keylength - nod_flag, + keyinfo->keylength); + key->ref_length= keyinfo->share->rec_reflength; + key->data_length= keyinfo->keylength - key->ref_length; + key->flag= 0; + DBUG_RETURN(0); + } + else + { + uchar *page; + + page= ma_page->buff + keyinfo->share->keypage_header + nod_flag; + key->data[0]= 0; /* safety */ + DBUG_ASSERT(page != keypos); + while (page < keypos) + { + if (! 
(*keyinfo->get_key)(key, page_flag, nod_flag, &page)) + { + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(1); + } + } + } + DBUG_RETURN(0); +} /* _ma_get_prev_key */ + + +/* + @brief Get last key from key-page before 'endpos' + + @note + endpos may be either end of buffer or start of a key + + @return + @retval pointer to where key starts +*/ + +uchar *_ma_get_last_key(MARIA_KEY *key, MARIA_PAGE *ma_page, uchar *endpos) +{ + uint page_flag,nod_flag; + uchar *lastpos, *page; + MARIA_KEYDEF *keyinfo= key->keyinfo; + DBUG_ENTER("_ma_get_last_key"); + DBUG_PRINT("enter",("page: 0x%lx endpos: 0x%lx", (long) ma_page->buff, + (long) endpos)); + + page_flag= ma_page->flag; + nod_flag= ma_page->node; + page= ma_page->buff + keyinfo->share->keypage_header + nod_flag; + + if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) && + ! (page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { + lastpos= endpos-keyinfo->keylength-nod_flag; + key->ref_length= keyinfo->share->rec_reflength; + key->data_length= keyinfo->keylength - key->ref_length; + key->flag= 0; + if (lastpos >= page) + bmove(key->data, lastpos, keyinfo->keylength + nod_flag); + } + else + { + lastpos= page; + key->data[0]=0; /* safety */ + while (page < endpos) + { + lastpos= page; + if (!(*keyinfo->get_key)(key, page_flag, nod_flag, &page)) + { + DBUG_PRINT("error",("Couldn't find last key: page: 0x%lx", + (long) page)); + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(0); + } + } + } + DBUG_PRINT("exit",("lastpos: 0x%lx length: %u", (ulong) lastpos, + key->data_length + key->ref_length)); + DBUG_RETURN(lastpos); +} /* _ma_get_last_key */ + + +/** + Calculate length of unpacked key + + @param info Maria handler + @param keyinfo key handler + @param key data for key + + @notes + This function is very seldom used. It's mainly used for debugging + or when calculating a key length from a stored key in batch insert. 
+ + This function does *NOT* calculate length of transid size! + This function can't be used against a prefix packed key on a page + + @return + @retval total length for key +*/ + +uint _ma_keylength(MARIA_KEYDEF *keyinfo, const uchar *key) +{ + reg1 HA_KEYSEG *keyseg; + const uchar *start; + + if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY))) + return (keyinfo->keylength); + + start= key; + for (keyseg=keyinfo->seg ; keyseg->type ; keyseg++) + { + if (keyseg->flag & HA_NULL_PART) + if (!*key++) + continue; + if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + uint length; + get_key_length(length,key); + key+=length; + } + else + key+= keyseg->length; + } + return((uint) (key-start)+keyseg->length); +} /* _ma_keylength */ + + +/* + Calculate length of part key. + + Used in maria_rkey() to find the key found for the key-part that was used. + This is needed in case of multi-byte character sets where we may search + after '0xDF' but find 'ss' +*/ + +uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, register const uchar *key, + HA_KEYSEG *end) +{ + reg1 HA_KEYSEG *keyseg; + const uchar *start= key; + + for (keyseg=keyinfo->seg ; keyseg != end ; keyseg++) + { + if (keyseg->flag & HA_NULL_PART) + if (!*key++) + continue; + if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + uint length; + get_key_length(length,key); + key+=length; + } + else + key+= keyseg->length; + } + return (uint) (key-start); +} + + +/* + Find next/previous record with same key + + WARNING + This can't be used when database is touched after last read +*/ + +int _ma_search_next(register MARIA_HA *info, MARIA_KEY *key, + uint32 nextflag, my_off_t pos) +{ + int error; + uchar lastkey[MARIA_MAX_KEY_BUFF]; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_KEY tmp_key; + MARIA_PAGE page; + DBUG_ENTER("_ma_search_next"); + DBUG_PRINT("enter",("nextflag: %u lastpos: %lu int_keypos: 0x%lx page_changed %d keyread_buff_used: %d", + nextflag, (ulong) 
info->cur_row.lastpos, + (ulong) info->int_keypos, + info->page_changed, info->keyread_buff_used)); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key);); + + /* + Force full read if we are at last key or if we are not on a leaf + and the key tree has changed since we used it last time + Note that even if the key tree has changed since last read, we can use + the last read data from the leaf if we haven't used the buffer for + something else. + */ + + if (((nextflag & SEARCH_BIGGER) && info->int_keypos >= info->int_maxpos) || + info->page_changed || + (info->int_keytree_version != keyinfo->version && + (info->int_nod_flag || info->keyread_buff_used))) + DBUG_RETURN(_ma_search(info, key, nextflag | SEARCH_SAVE_BUFF, + pos)); + + if (info->keyread_buff_used) + { + if (_ma_fetch_keypage(&page, info, keyinfo, info->last_search_keypage, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->keyread_buff, 0)) + DBUG_RETURN(-1); + info->keyread_buff_used=0; + } + else + { + /* Last used buffer is in info->keyread_buff */ + /* Todo: Add info->keyread_page to keep track of this */ + _ma_page_setup(&page, info, keyinfo, 0, info->keyread_buff); + } + + tmp_key.data= lastkey; + info->last_key.keyinfo= tmp_key.keyinfo= keyinfo; + + if (nextflag & SEARCH_BIGGER) /* Next key */ + { + if (page.node) + { + my_off_t tmp_pos= _ma_kpos(page.node, info->int_keypos); + + if ((error= _ma_search(info, key, nextflag | SEARCH_SAVE_BUFF, + tmp_pos)) <=0) + DBUG_RETURN(error); + } + if (keyinfo->flag & (HA_PACK_KEY | HA_BINARY_PACK_KEY) && + info->last_key.data != key->data) + memcpy(info->last_key.data, key->data, + key->data_length + key->ref_length); + if (!(*keyinfo->get_key)(&info->last_key, page.flag, page.node, + &info->int_keypos)) + DBUG_RETURN(-1); + } + else /* Previous key */ + { + /* Find start of previous key */ + info->int_keypos= _ma_get_last_key(&tmp_key, &page, info->int_keypos); + if (!info->int_keypos) + DBUG_RETURN(-1); + if (info->int_keypos == info->keyread_buff + 
info->s->keypage_header) + { + /* Previous key was first key, read key before this one */ + DBUG_RETURN(_ma_search(info, key, nextflag | SEARCH_SAVE_BUFF, + pos)); + } + if (page.node && + (error= _ma_search(info, key, nextflag | SEARCH_SAVE_BUFF, + _ma_kpos(page.node,info->int_keypos))) <= 0) + DBUG_RETURN(error); + + /* QQ: We should be able to optimize away the following call */ + if (! _ma_get_last_key(&info->last_key, &page, info->int_keypos)) + DBUG_RETURN(-1); + } + info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key); + info->cur_row.trid= _ma_trid_from_key(&info->last_key); + DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); +} /* _ma_search_next */ + + +/** + Search after position for the first row in an index + + @return + Found row is stored in info->cur_row.lastpos +*/ + +int _ma_search_first(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t pos) +{ + uchar *first_pos; + MARIA_PAGE page; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_search_first"); + + if (pos == HA_OFFSET_ERROR) + { + my_errno=HA_ERR_KEY_NOT_FOUND; + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + + do + { + if (_ma_fetch_keypage(&page, info, keyinfo, pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->keyread_buff, 0)) + { + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + first_pos= page.buff + share->keypage_header + page.node; + } while ((pos= _ma_kpos(page.node, first_pos)) != HA_OFFSET_ERROR); + + info->last_key.keyinfo= keyinfo; + + if (!(*keyinfo->get_key)(&info->last_key, page.flag, page.node, &first_pos)) + DBUG_RETURN(-1); /* Crashed */ + + info->int_keypos= first_pos; + info->int_maxpos= (page.buff + page.size -1); + info->int_nod_flag= page.node; + info->int_keytree_version= keyinfo->version; + info->last_search_keypage= info->last_keypage; + info->page_changed=info->keyread_buff_used=0; + info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key); + info->cur_row.trid= 
_ma_trid_from_key(&info->last_key); + + DBUG_PRINT("exit",("found key at %lu", (ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); +} /* _ma_search_first */ + + +/** + Search after position for the last row in an index + + @return + Found row is stored in info->cur_row.lastpos +*/ + +int _ma_search_last(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t pos) +{ + uchar *end_of_page; + MARIA_PAGE page; + DBUG_ENTER("_ma_search_last"); + + if (pos == HA_OFFSET_ERROR) + { + my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */ + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + + do + { + if (_ma_fetch_keypage(&page, info, keyinfo, pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->keyread_buff, 0)) + { + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + end_of_page= page.buff + page.size; + } while ((pos= _ma_kpos(page.node, end_of_page)) != HA_OFFSET_ERROR); + + info->last_key.keyinfo= keyinfo; + + if (!_ma_get_last_key(&info->last_key, &page, end_of_page)) + DBUG_RETURN(-1); + info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key); + info->cur_row.trid= _ma_trid_from_key(&info->last_key); + info->int_keypos= info->int_maxpos= end_of_page; + info->int_nod_flag= page.node; + info->int_keytree_version= keyinfo->version; + info->last_search_keypage= info->last_keypage; + info->page_changed=info->keyread_buff_used=0; + + DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); +} /* _ma_search_last */ + + + +/**************************************************************************** +** +** Functions to store and pack a key in a page +** +** maria_calc_xx_key_length takes the following arguments: +** nod_flag If nod: Length of nod-pointer +** next_key Position to pos after the new key in buffer +** org_key Key that was before the next key in buffer +** prev_key Last key before current key +** key Key that will be stored +** s_temp Information how next key will be packed 
+****************************************************************************/ + +/* Static length key */ + +int +_ma_calc_static_key_length(const MARIA_KEY *key, uint nod_flag, + uchar *next_pos __attribute__((unused)), + uchar *org_key __attribute__((unused)), + uchar *prev_key __attribute__((unused)), + MARIA_KEY_PARAM *s_temp) +{ + s_temp->key= key->data; + return (int) (s_temp->move_length= key->data_length + key->ref_length + + nod_flag); +} + +/* Variable length key */ + +int +_ma_calc_var_key_length(const MARIA_KEY *key, uint nod_flag, + uchar *next_pos __attribute__((unused)), + uchar *org_key __attribute__((unused)), + uchar *prev_key __attribute__((unused)), + MARIA_KEY_PARAM *s_temp) +{ + s_temp->key= key->data; + return (int) (s_temp->move_length= key->data_length + key->ref_length + + nod_flag); +} + +/** + @brief Calc length needed to store prefixed compressed keys + + @info + Variable length first segment which is prefix compressed + (maria_chk reports 'packed + stripped') + + Keys are compressed the following way: + + If the max length of first key segment <= 127 bytes the prefix is + 1 uchar else it's 2 byte + + prefix byte(s) The high bit is set if this is a prefix for the prev key + length Packed length if the previous was a prefix byte + [data_length] data bytes ('length' bytes) + next-key-seg Next key segments + + If the first segment can have NULL: + If key was packed + data_length is length of rest of key + If key was not packed + The data_length is 0 for NULLS and 1+data_length for not null columns +*/ + +int +_ma_calc_var_pack_key_length(const MARIA_KEY *int_key, uint nod_flag, + uchar *next_key, uchar *org_key, uchar *prev_key, + MARIA_KEY_PARAM *s_temp) +{ + reg1 HA_KEYSEG *keyseg; + int length; + uint key_length,ref_length,org_key_length=0, + length_pack,new_key_length,diff_flag,pack_marker; + const uchar *key, *start, *end, *key_end; + const uchar *sort_order; + my_bool same_length; + MARIA_KEYDEF *keyinfo= int_key->keyinfo; + + key= 
int_key->data; + length_pack=s_temp->ref_length=s_temp->n_ref_length=s_temp->n_length=0; + same_length=0; keyseg=keyinfo->seg; + key_length= int_key->data_length + int_key->ref_length + nod_flag; + + sort_order=0; + if ((keyinfo->flag & HA_FULLTEXT) && + ((keyseg->type == HA_KEYTYPE_TEXT) || + (keyseg->type == HA_KEYTYPE_VARTEXT1) || + (keyseg->type == HA_KEYTYPE_VARTEXT2)) && + !use_strnxfrm(keyseg->charset)) + sort_order= keyseg->charset->sort_order; + + /* diff flag contains how many bytes is needed to pack key */ + if (keyseg->length >= 127) + { + diff_flag=2; + pack_marker=32768; + } + else + { + diff_flag= 1; + pack_marker=128; + } + s_temp->pack_marker=pack_marker; + + /* Handle the case that the first part have NULL values */ + if (keyseg->flag & HA_NULL_PART) + { + if (!*key++) + { + s_temp->key= key; + s_temp->key_length= 0; + s_temp->totlength= key_length-1+diff_flag; + s_temp->next_key_pos= 0; /* No next key */ + return (s_temp->move_length= s_temp->totlength); + } + s_temp->store_not_null=1; + key_length--; /* We don't store NULL */ + if (prev_key && !*prev_key++) + org_key=prev_key=0; /* Can't pack against prev */ + else if (org_key) + org_key++; /* Skip NULL */ + } + else + s_temp->store_not_null=0; + s_temp->prev_key= org_key; + + /* The key part will start with a packed length */ + + get_key_pack_length(new_key_length,length_pack,key); + end= key_end= key+ new_key_length; + start= key; + + /* Calc how many characters are identical between this and the prev. 
key */ + if (prev_key) + { + get_key_length(org_key_length,prev_key); + s_temp->prev_key=prev_key; /* Pointer at data */ + /* Don't use key-pack if length == 0 */ + if (new_key_length && new_key_length == org_key_length) + same_length=1; + else if (new_key_length > org_key_length) + end= key + org_key_length; + + if (sort_order) /* SerG */ + { + while (key < end && + sort_order[*key] == sort_order[*prev_key]) + { + key++; prev_key++; + } + } + else + { + while (key < end && *key == *prev_key) + { + key++; prev_key++; + } + } + } + + s_temp->key=key; + s_temp->key_length= (uint) (key_end-key); + + if (same_length && key == key_end) + { + /* identical variable length key */ + s_temp->ref_length= pack_marker; + length=(int) key_length-(int) (key_end-start)-length_pack; + length+= diff_flag; + if (next_key) + { /* Can't combine with next */ + s_temp->n_length= *next_key; /* Needed by _ma_store_key */ + next_key=0; + } + } + else + { + if (start != key) + { /* Starts as prev key */ + ref_length= (uint) (key-start); + s_temp->ref_length= ref_length + pack_marker; + length= (int) (key_length - ref_length); + + length-= length_pack; + length+= diff_flag; + length+= ((new_key_length-ref_length) >= 255) ? 
3 : 1;/* Rest_of_key */ + } + else + { + s_temp->key_length+=s_temp->store_not_null; /* If null */ + length= key_length - length_pack+ diff_flag; + } + } + s_temp->totlength=(uint) length; + s_temp->prev_length=0; + DBUG_PRINT("test",("tot_length: %u length: %d uniq_key_length: %u", + key_length, length, s_temp->key_length)); + + /* If something after that hasn't length=0, test if we can combine */ + if ((s_temp->next_key_pos=next_key)) + { + uint packed,n_length; + + packed = *next_key & 128; + if (diff_flag == 2) + { + n_length= mi_uint2korr(next_key) & 32767; /* Length of next key */ + next_key+=2; + } + else + n_length= *next_key++ & 127; + if (!packed) + n_length-= s_temp->store_not_null; + + if (n_length || packed) /* Don't pack 0 length keys */ + { + uint next_length_pack, new_ref_length=s_temp->ref_length; + + if (packed) + { + /* If first key and next key is packed (only on delete) */ + if (!prev_key && org_key) + { + get_key_length(org_key_length,org_key); + key=start; + if (sort_order) /* SerG */ + { + while (key < end && + sort_order[*key] == sort_order[*org_key]) + { + key++; org_key++; + } + } + else + { + while (key < end && *key == *org_key) + { + key++; org_key++; + } + } + if ((new_ref_length= (uint) (key - start))) + new_ref_length+=pack_marker; + } + + if (!n_length) + { + /* + We put a different key between two identical variable length keys + Extend next key to have same prefix as this key + */ + if (new_ref_length) /* prefix of previus key */ + { /* make next key longer */ + s_temp->part_of_prev_key= new_ref_length; + s_temp->prev_length= org_key_length - + (new_ref_length-pack_marker); + s_temp->n_ref_length= s_temp->part_of_prev_key; + s_temp->n_length= s_temp->prev_length; + n_length= get_pack_length(s_temp->prev_length); + s_temp->prev_key+= (new_ref_length - pack_marker); + length+= s_temp->prev_length + n_length; + } + else + { /* Can't use prev key */ + s_temp->part_of_prev_key=0; + s_temp->prev_length= org_key_length; + 
s_temp->n_ref_length=s_temp->n_length= org_key_length; + length+= org_key_length; + } + return (s_temp->move_length= (int) length); + } + + ref_length=n_length; + /* Get information about not packed key suffix */ + get_key_pack_length(n_length,next_length_pack,next_key); + + /* Test if new keys has fewer characters that match the previous key */ + if (!new_ref_length) + { /* Can't use prev key */ + s_temp->part_of_prev_key= 0; + s_temp->prev_length= ref_length; + s_temp->n_ref_length= s_temp->n_length= n_length+ref_length; + return s_temp->move_length= ((int) length+ref_length- + next_length_pack); + } + if (ref_length+pack_marker > new_ref_length) + { + uint new_pack_length=new_ref_length-pack_marker; + /* We must copy characters from the original key to the next key */ + s_temp->part_of_prev_key= new_ref_length; + s_temp->prev_length= ref_length - new_pack_length; + s_temp->n_ref_length=s_temp->n_length=n_length + s_temp->prev_length; + s_temp->prev_key+= new_pack_length; + length-= (next_length_pack - get_pack_length(s_temp->n_length)); + return s_temp->move_length= ((int) length + s_temp->prev_length); + } + } + else + { + /* Next key wasn't a prefix of previous key */ + ref_length=0; + next_length_pack=0; + } + DBUG_PRINT("test",("length: %d next_key: 0x%lx", length, + (long) next_key)); + + { + uint tmp_length; + key=(start+=ref_length); + if (key+n_length < key_end) /* Normalize length based */ + key_end= key+n_length; + if (sort_order) /* SerG */ + { + while (key < key_end && + sort_order[*key] == sort_order[*next_key]) + { + key++; next_key++; + } + } + else + { + while (key < key_end && *key == *next_key) + { + key++; next_key++; + } + } + if (!(tmp_length=(uint) (key-start))) + { /* Key can't be re-packed */ + s_temp->next_key_pos=0; + return (s_temp->move_length= length); + } + ref_length+=tmp_length; + n_length-=tmp_length; + length-=tmp_length+next_length_pack; /* We gained these chars */ + } + if (n_length == 0 && ref_length == new_key_length) + { + 
s_temp->n_ref_length=pack_marker; /* Same as prev key */ + } + else + { + s_temp->n_ref_length=ref_length | pack_marker; + length+= get_pack_length(n_length); + s_temp->n_length=n_length; + } + } + } + return (s_temp->move_length= length); +} + + +/* + Length of key which is prefix compressed + + @param int_key Key that will be stored + @param nod_flag If nod: Length of nod-pointer + @param next_key Position of the key after the new key in buffer (0 if none) + @param org_key Key that was before the next key in buffer + @param prev_key Last key before current key (0 if first key in block) + @param s_temp Filled with information how the keys will be packed + + @return The value stored in s_temp->move_length +*/ + +int _ma_calc_bin_pack_key_length(const MARIA_KEY *int_key, + uint nod_flag, + uchar *next_key, + uchar *org_key, uchar *prev_key, + MARIA_KEY_PARAM *s_temp) +{ + uint length,key_length,ref_length; + const uchar *key= int_key->data; + + s_temp->totlength= key_length= (int_key->data_length + int_key->ref_length+ + nod_flag); +#ifdef HAVE_valgrind + s_temp->n_length= s_temp->n_ref_length=0; /* For valgrind */ +#endif + s_temp->key=key; + s_temp->prev_key=org_key; + if (prev_key) /* If not first key in block */ + { + /* pack key against previous key */ + /* + As keys may be identical when running a sort in maria_chk, we + have to guard against the case where keys may be identical + */ + const uchar *end; + end=key+key_length; + /* Test the bound first so that the byte at 'end' is never read */ + for ( ; key < end && *key == *prev_key; key++,prev_key++) ; + s_temp->ref_length= ref_length=(uint) (key-s_temp->key); + length=key_length - ref_length + get_pack_length(ref_length); + } + else + { + /* No previous key */ + s_temp->ref_length=ref_length=0; + length=key_length+1; + } + if ((s_temp->next_key_pos=next_key)) /* If another key after */ + { + /* pack key against next key */ + uint next_length,next_length_pack; + get_key_pack_length(next_length,next_length_pack,next_key); + + /* If first key and next key is packed (only on delete) */ + if (!prev_key && org_key && next_length) + { + const uchar *end; + /* Test the bound first so that the byte at 'end' is never read */ + for (key= s_temp->key, end=key+next_length ; + key < end && *key == *org_key; + key++,org_key++) ; + ref_length= (uint) (key - s_temp->key); + } + + if (next_length > ref_length) + { + /* + We put a key with different case between two keys with the same prefix + Extend next key to have same prefix as this key + */ + s_temp->n_ref_length= 
ref_length; + s_temp->prev_length= next_length-ref_length; + s_temp->prev_key+= ref_length; + return s_temp->move_length= ((int) (length+ s_temp->prev_length - + next_length_pack + + get_pack_length(ref_length))); + } + /* Check how many characters are identical to next key */ + key= s_temp->key+next_length; + s_temp->prev_length= 0; + while (*key++ == *next_key++) ; + if ((ref_length= (uint) (key - s_temp->key)-1) == next_length) + { + s_temp->next_key_pos=0; + return (s_temp->move_length= length); /* Can't pack next key */ + } + s_temp->n_ref_length=ref_length; + return s_temp->move_length= (int) (length-(ref_length - next_length) - + next_length_pack + + get_pack_length(ref_length)); + } + return (s_temp->move_length= (int) length); +} + + +/* +** store a key packed with _ma_calc_xxx_key_length in page buffer +*/ + +/* store key without compression */ + +void _ma_store_static_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), + register uchar *key_pos, + register MARIA_KEY_PARAM *s_temp) +{ + memcpy(key_pos, s_temp->key,(size_t) s_temp->move_length); + s_temp->changed_length= s_temp->move_length; +} + + +/* store variable length key with prefix compression */ + +#define store_pack_length(test,pos,length) { \ + if (test) { *((pos)++) = (uchar) (length); } else \ + { *((pos)++) = (uchar) ((length) >> 8); *((pos)++) = (uchar) (length); } } + + +void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), + register uchar *key_pos, + register MARIA_KEY_PARAM *s_temp) +{ + uint length; + uchar *org_key_pos= key_pos; + + if (s_temp->ref_length) + { + /* Packed against previous key */ + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->ref_length); + /* If not same key after */ + if (s_temp->ref_length != s_temp->pack_marker) + store_key_length_inc(key_pos,s_temp->key_length); + } + else + { + /* Not packed against previous key */ + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->key_length); + } + bmove(key_pos, s_temp->key, + 
(length= s_temp->totlength - (uint) (key_pos-org_key_pos))); + + key_pos+= length; + + if (!s_temp->next_key_pos) /* No following key */ + goto end; + + if (s_temp->prev_length) + { + /* Extend next key because new key didn't have same prefix as prev key */ + if (s_temp->part_of_prev_key) + { + store_pack_length(s_temp->pack_marker == 128,key_pos, + s_temp->part_of_prev_key); + store_key_length_inc(key_pos,s_temp->n_length); + } + else + { + s_temp->n_length+= s_temp->store_not_null; + store_pack_length(s_temp->pack_marker == 128,key_pos, + s_temp->n_length); + } + memcpy(key_pos, s_temp->prev_key, s_temp->prev_length); + key_pos+= s_temp->prev_length; + } + else if (s_temp->n_ref_length) + { + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_ref_length); + if (s_temp->n_ref_length != s_temp->pack_marker) + { + /* Not identical key */ + store_key_length_inc(key_pos,s_temp->n_length); + } + } + else + { + s_temp->n_length+= s_temp->store_not_null; + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_length); + } + +end: + s_temp->changed_length= (uint) (key_pos - org_key_pos); +} + + +/* variable length key with prefix compression */ + +void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), + register uchar *key_pos, + register MARIA_KEY_PARAM *s_temp) +{ + uchar *org_key_pos= key_pos; + size_t length= s_temp->totlength - s_temp->ref_length; + + store_key_length_inc(key_pos,s_temp->ref_length); + memcpy(key_pos, s_temp->key+s_temp->ref_length, length); + key_pos+= length; + + if (s_temp->next_key_pos) + { + store_key_length_inc(key_pos,s_temp->n_ref_length); + if (s_temp->prev_length) /* If we must extend key */ + { + memcpy(key_pos,s_temp->prev_key,s_temp->prev_length); + key_pos+= s_temp->prev_length; + } + } + s_temp->changed_length= (uint) (key_pos - org_key_pos); +} diff --git a/storage/maria/ma_servicethread.c b/storage/maria/ma_servicethread.c new file mode 100644 index 00000000000..a8099c998e9 --- /dev/null +++ 
b/storage/maria/ma_servicethread.c @@ -0,0 +1,134 @@ +#include "maria_def.h" +#include "ma_servicethread.h" + +/** + Initializes the service thread + + @param control control block + + @return Operation status + @retval 0 OK + @retval 1 error +*/ + +int ma_service_thread_control_init(MA_SERVICE_THREAD_CONTROL *control) +{ + int res= 0; + DBUG_ENTER("ma_service_thread_control_init"); + DBUG_PRINT("init", ("control 0x%lx", (ulong) control)); + control->inited= TRUE; + control->status= THREAD_DEAD; /* not yet born == dead */ + res= (pthread_mutex_init(control->LOCK_control, MY_MUTEX_INIT_SLOW) || + pthread_cond_init(control->COND_control, 0)); + DBUG_PRINT("info", ("init: %s", (res ? "Error" : "OK"))); + DBUG_RETURN(res); +} + + +/** + Kill the service thread + + @param control control block + + @note The service thread should react on condition and status equal + THREAD_DYING, by setting status THREAD_DEAD, and issuing message to + control thread via condition and exiting. The base way to do so is using + my_service_thread_sleep() and my_service_thread_signal_end() +*/ + +void ma_service_thread_control_end(MA_SERVICE_THREAD_CONTROL *control) +{ + DBUG_ENTER("ma_service_thread_control_end"); + DBUG_PRINT("init", ("control 0x%lx", (ulong) control)); + DBUG_ASSERT(control->inited); + pthread_mutex_lock(control->LOCK_control); + if (control->status != THREAD_DEAD) /* thread was started OK */ + { + DBUG_PRINT("info",("killing Maria background thread")); + control->status= THREAD_DYING; /* kill it */ + do /* and wait for it to be dead */ + { + /* wake it up if it was in a sleep */ + pthread_cond_broadcast(control->COND_control); + DBUG_PRINT("info",("waiting for Maria background thread to die")); + pthread_cond_wait(control->COND_control, control->LOCK_control); + } + while (control->status != THREAD_DEAD); + } + pthread_mutex_unlock(control->LOCK_control); + pthread_mutex_destroy(control->LOCK_control); + pthread_cond_destroy(control->COND_control); + control->inited= 
FALSE; + DBUG_VOID_RETURN; +} + + +/** + Sleep for given number of nanoseconds with reaction on thread kill + + @param control control block + @param sleep_time time of sleeping + + @return Operation status + @retval FALSE Time out + @retval TRUE Thread should be killed +*/ + +my_bool my_service_thread_sleep(MA_SERVICE_THREAD_CONTROL *control, + ulonglong sleep_time) +{ + struct timespec abstime; + my_bool res= FALSE; + DBUG_ENTER("my_service_thread_sleep"); + DBUG_PRINT("init", ("control 0x%lx", (ulong) control)); + pthread_mutex_lock(control->LOCK_control); + if (control->status == THREAD_DYING) + { + pthread_mutex_unlock(control->LOCK_control); + DBUG_RETURN(TRUE); + } +#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */ + pthread_mutex_unlock(&control->LOCK_control); + my_sleep(100000); /* a tenth of a second */ + pthread_mutex_lock(&control->LOCK_control); +#else + /* To have a killable sleep, we use timedwait like our SQL GET_LOCK() */ + DBUG_PRINT("info", ("sleeping %llu nano seconds", sleep_time)); + if (sleep_time) + { + set_timespec_nsec(abstime, sleep_time); + pthread_cond_timedwait(control->COND_control, + control->LOCK_control, &abstime); + } +#endif + if (control->status == THREAD_DYING) + res= TRUE; + pthread_mutex_unlock(control->LOCK_control); + DBUG_RETURN(res); +} + + +/** + inform about thread exiting + + @param control control block +*/ + +void my_service_thread_signal_end(MA_SERVICE_THREAD_CONTROL *control) +{ + DBUG_ENTER("my_service_thread_signal_end"); + DBUG_PRINT("init", ("control 0x%lx", (ulong) control)); + pthread_mutex_lock(control->LOCK_control); + control->status = THREAD_DEAD; /* indicate that we are dead */ + /* + wake up ma_service_thread_control_end which may be waiting for + our death + */ + pthread_cond_broadcast(control->COND_control); + /* + broadcast was inside unlock because ma_service_thread_control_end + destroys mutex + */ + pthread_mutex_unlock(control->LOCK_control); + DBUG_VOID_RETURN; +} 
diff --git a/storage/maria/ma_servicethread.h b/storage/maria/ma_servicethread.h new file mode 100644 index 00000000000..153ff9ebd14 --- /dev/null +++ b/storage/maria/ma_servicethread.h @@ -0,0 +1,22 @@ +#include <my_pthread.h> + +enum ma_service_thread_state {THREAD_RUNNING, THREAD_DYING, THREAD_DEAD}; + +typedef struct st_ma_service_thread_control +{ + /** 'kill' flag for the background thread */ + enum ma_service_thread_state status; + /** if thread module was inited or not */ + my_bool inited; + /** for killing the background thread */ + pthread_mutex_t *LOCK_control; + /** for killing the background thread */ + pthread_cond_t *COND_control; +} MA_SERVICE_THREAD_CONTROL; + + +int ma_service_thread_control_init(MA_SERVICE_THREAD_CONTROL *control); +void ma_service_thread_control_end(MA_SERVICE_THREAD_CONTROL *control); +my_bool my_service_thread_sleep(MA_SERVICE_THREAD_CONTROL *control, + ulonglong sleep_time); +void my_service_thread_signal_end(MA_SERVICE_THREAD_CONTROL *control); diff --git a/storage/maria/ma_sort.c b/storage/maria/ma_sort.c new file mode 100644 index 00000000000..f7f79f90cf0 --- /dev/null +++ b/storage/maria/ma_sort.c @@ -0,0 +1,1077 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Creates a index for a database by reading keys, sorting them and outputing + them in sorted order through MARIA_SORT_INFO functions. +*/ + +#include "ma_fulltext.h" +#if defined(MSDOS) || defined(__WIN__) +#include <fcntl.h> +#else +#include <stddef.h> +#endif +#include <queues.h> + +/* static variables */ + +#undef MIN_SORT_MEMORY +#undef MYF_RW +#undef DISK_BUFFER_SIZE + +#define MERGEBUFF 15 +#define MERGEBUFF2 31 +#define MIN_SORT_MEMORY (4096-MALLOC_OVERHEAD) +#define MYF_RW MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL) +#define DISK_BUFFER_SIZE (IO_SIZE*16) + + +/* + Pointers of functions for store and read keys from temp file +*/ + +extern void print_error _VARARGS((const char *fmt,...)); + +/* Functions defined in this file */ + +static ha_rows find_all_keys(MARIA_SORT_PARAM *info,uint keys, + uchar **sort_keys, + DYNAMIC_ARRAY *buffpek,int *maxbuffer, + IO_CACHE *tempfile, + IO_CACHE *tempfile_for_exceptions); +static int write_keys(MARIA_SORT_PARAM *info, uchar **sort_keys, + uint count, BUFFPEK *buffpek,IO_CACHE *tempfile); +static int write_key(MARIA_SORT_PARAM *info, uchar *key, + IO_CACHE *tempfile); +static int write_index(MARIA_SORT_PARAM *info, uchar **sort_keys, + uint count); +static int merge_many_buff(MARIA_SORT_PARAM *info,uint keys, + uchar **sort_keys, + BUFFPEK *buffpek,int *maxbuffer, + IO_CACHE *t_file); +static uint read_to_buffer(IO_CACHE *fromfile,BUFFPEK *buffpek, + uint sort_length); +static int merge_buffers(MARIA_SORT_PARAM *info,uint keys, + IO_CACHE *from_file, IO_CACHE *to_file, + uchar **sort_keys, BUFFPEK *lastbuff, + BUFFPEK *Fb, BUFFPEK *Tb); +static int merge_index(MARIA_SORT_PARAM *,uint, uchar **,BUFFPEK *, int, + IO_CACHE *); +static int flush_maria_ft_buf(MARIA_SORT_PARAM *info); + +static int 
write_keys_varlen(MARIA_SORT_PARAM *info, uchar **sort_keys, + uint count, BUFFPEK *buffpek, + IO_CACHE *tempfile); +static uint read_to_buffer_varlen(IO_CACHE *fromfile,BUFFPEK *buffpek, + uint sort_length); +static int write_merge_key(MARIA_SORT_PARAM *info, IO_CACHE *to_file, + uchar *key, uint sort_length, uint count); +static int write_merge_key_varlen(MARIA_SORT_PARAM *info, + IO_CACHE *to_file, uchar *key, + uint sort_length, uint count); +static inline int +my_var_write(MARIA_SORT_PARAM *info, IO_CACHE *to_file, uchar *bufs); + +/* + Creates a index of sorted keys + + SYNOPSIS + _ma_create_index_by_sort() + info Sort parameters + no_messages Set to 1 if no output + sortbuff_size Size of sortbuffer to allocate + + RESULT + 0 ok + <> 0 Error +*/ + +int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages, + size_t sortbuff_size) +{ + int error,maxbuffer,skr; + size_t memavl,old_memavl; + uint keys,sort_length; + DYNAMIC_ARRAY buffpek; + ha_rows records; + uchar **sort_keys; + IO_CACHE tempfile, tempfile_for_exceptions; + DBUG_ENTER("_ma_create_index_by_sort"); + DBUG_PRINT("enter",("sort_buff_size: %lu sort_length: %d max_records: %lu", + (ulong) sortbuff_size, info->key_length, + (ulong) info->sort_info->max_records)); + + if (info->keyinfo->flag & HA_VAR_LENGTH_KEY) + { + info->write_keys= write_keys_varlen; + info->read_to_buffer=read_to_buffer_varlen; + info->write_key=write_merge_key_varlen; + } + else + { + info->write_keys= write_keys; + info->read_to_buffer=read_to_buffer; + info->write_key=write_merge_key; + } + + my_b_clear(&tempfile); + my_b_clear(&tempfile_for_exceptions); + bzero((char*) &buffpek,sizeof(buffpek)); + sort_keys= (uchar **) NULL; error= 1; + maxbuffer=1; + + memavl=max(sortbuff_size,MIN_SORT_MEMORY); + records= info->sort_info->max_records; + sort_length= info->key_length; + LINT_INIT(keys); + + while (memavl >= MIN_SORT_MEMORY) + { + if ((records < UINT_MAX32) && + ((my_off_t) (records + 1) * + (sort_length + 
sizeof(char*)) <= (my_off_t) memavl)) + keys= (uint)records+1; + else + do + { + skr=maxbuffer; + if (memavl < sizeof(BUFFPEK)*(uint) maxbuffer || + (keys=(memavl-sizeof(BUFFPEK)*(uint) maxbuffer)/ + (sort_length+sizeof(char*))) <= 1 || + keys < (uint) maxbuffer) + { + _ma_check_print_error(info->sort_info->param, + "aria_sort_buffer_size is too small"); + goto err; + } + } + while ((maxbuffer= (int) (records/(keys-1)+1)) != skr); + + if ((sort_keys=(uchar**) my_malloc(keys*(sort_length+sizeof(char*))+ + HA_FT_MAXBYTELEN, MYF(0)))) + { + if (my_init_dynamic_array(&buffpek, sizeof(BUFFPEK), maxbuffer, + maxbuffer/2)) + { + my_free(sort_keys,MYF(0)); + sort_keys= 0; + } + else + break; + } + old_memavl=memavl; + if ((memavl=memavl/4*3) < MIN_SORT_MEMORY && old_memavl > MIN_SORT_MEMORY) + memavl=MIN_SORT_MEMORY; + } + if (memavl < MIN_SORT_MEMORY) + { + _ma_check_print_error(info->sort_info->param, "Aria sort buffer" + " too small"); /* purecov: tested */ + goto err; /* purecov: tested */ + } + (*info->lock_in_memory)(info->sort_info->param);/* Everything is allocated */ + + if (!no_messages) + printf(" - Searching for keys, allocating buffer for %d keys\n",keys); + + if ((records=find_all_keys(info,keys,sort_keys,&buffpek,&maxbuffer, + &tempfile,&tempfile_for_exceptions)) + == HA_POS_ERROR) + goto err; /* purecov: tested */ + if (maxbuffer == 0) + { + if (!no_messages) + printf(" - Dumping %lu keys\n", (ulong) records); + if (write_index(info,sort_keys, (uint) records)) + goto err; /* purecov: inspected */ + } + else + { + keys=(keys*(sort_length+sizeof(char*)))/sort_length; + if (maxbuffer >= MERGEBUFF2) + { + if (!no_messages) + printf(" - Merging %lu keys\n", (ulong) records); /* purecov: tested */ + if (merge_many_buff(info,keys,sort_keys, + dynamic_element(&buffpek,0,BUFFPEK *),&maxbuffer,&tempfile)) + goto err; /* purecov: inspected */ + } + if (flush_io_cache(&tempfile) || + reinit_io_cache(&tempfile,READ_CACHE,0L,0,0)) + goto err; /* purecov: inspected */ + 
if (!no_messages) + printf(" - Last merge and dumping keys\n"); /* purecov: tested */ + if (merge_index(info,keys,sort_keys,dynamic_element(&buffpek,0,BUFFPEK *), + maxbuffer,&tempfile)) + goto err; /* purecov: inspected */ + } + + if (flush_maria_ft_buf(info) || _ma_flush_pending_blocks(info)) + goto err; + + if (my_b_inited(&tempfile_for_exceptions)) + { + MARIA_HA *idx=info->sort_info->info; + uint16 key_length; + MARIA_KEY key; + key.keyinfo= idx->s->keyinfo + info->key; + + if (!no_messages) + printf(" - Adding exceptions\n"); /* purecov: tested */ + if (flush_io_cache(&tempfile_for_exceptions) || + reinit_io_cache(&tempfile_for_exceptions,READ_CACHE,0L,0,0)) + goto err; + + while (!my_b_read(&tempfile_for_exceptions,(uchar*)&key_length, + sizeof(key_length)) + && !my_b_read(&tempfile_for_exceptions,(uchar*)sort_keys, + (uint) key_length)) + { + key.data= (uchar*) sort_keys; + key.ref_length= idx->s->rec_reflength; + key.data_length= key_length - key.ref_length; + key.flag= 0; + if (_ma_ck_write(idx, &key)) + goto err; + } + } + + error =0; + +err: + my_free(sort_keys, MYF(MY_ALLOW_ZERO_PTR)); + delete_dynamic(&buffpek); + close_cached_file(&tempfile); + close_cached_file(&tempfile_for_exceptions); + + DBUG_RETURN(error ? -1 : 0); +} /* _ma_create_index_by_sort */ + + +/* Search after all keys and place them in a temp. 
file */ + +static ha_rows find_all_keys(MARIA_SORT_PARAM *info, uint keys, + uchar **sort_keys, DYNAMIC_ARRAY *buffpek, + int *maxbuffer, IO_CACHE *tempfile, + IO_CACHE *tempfile_for_exceptions) +{ + int error; + uint idx; + DBUG_ENTER("find_all_keys"); + + idx=error=0; + sort_keys[0]= (uchar*) (sort_keys+keys); + + while (!(error=(*info->key_read)(info,sort_keys[idx]))) + { + if (info->real_key_length > info->key_length) + { + if (write_key(info,sort_keys[idx],tempfile_for_exceptions)) + DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */ + continue; + } + + if (++idx == keys) + { + if (info->write_keys(info,sort_keys,idx-1, + (BUFFPEK *)alloc_dynamic(buffpek), + tempfile)) + DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */ + + sort_keys[0]=(uchar*) (sort_keys+keys); + memcpy(sort_keys[0],sort_keys[idx-1],(size_t) info->key_length); + idx=1; + } + sort_keys[idx]=sort_keys[idx-1]+info->key_length; + } + if (error > 0) + DBUG_RETURN(HA_POS_ERROR); /* Aborted by get_key */ /* purecov: inspected */ + if (buffpek->elements) + { + if (info->write_keys(info,sort_keys,idx,(BUFFPEK *)alloc_dynamic(buffpek), + tempfile)) + DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */ + *maxbuffer=buffpek->elements-1; + } + else + *maxbuffer=0; + + DBUG_RETURN((*maxbuffer)*(keys-1)+idx); +} /* find_all_keys */ + + +#ifdef THREAD +/* Search after all keys and place them in a temp. 
file */ + +pthread_handler_t _ma_thr_find_all_keys(void *arg) +{ + MARIA_SORT_PARAM *sort_param= (MARIA_SORT_PARAM*) arg; + int error; + size_t memavl,old_memavl; + uint sort_length; + ulong idx, maxbuffer, keys; + uchar **sort_keys=0; + + LINT_INIT(keys); + + error=1; + + if (my_thread_init()) + goto err; + + { /* Add extra block since DBUG_ENTER declare variables */ + DBUG_ENTER("_ma_thr_find_all_keys"); + DBUG_PRINT("enter", ("master: %d", sort_param->master)); + if (sort_param->sort_info->got_error) + goto err; + + if (sort_param->keyinfo->flag & HA_VAR_LENGTH_KEY) + { + sort_param->write_keys= write_keys_varlen; + sort_param->read_to_buffer= read_to_buffer_varlen; + sort_param->write_key= write_merge_key_varlen; + } + else + { + sort_param->write_keys= write_keys; + sort_param->read_to_buffer= read_to_buffer; + sort_param->write_key= write_merge_key; + } + + my_b_clear(&sort_param->tempfile); + my_b_clear(&sort_param->tempfile_for_exceptions); + bzero((char*) &sort_param->buffpek,sizeof(sort_param->buffpek)); + bzero((char*) &sort_param->unique, sizeof(sort_param->unique)); + + memavl= max(sort_param->sortbuff_size, MIN_SORT_MEMORY); + idx= (uint)sort_param->sort_info->max_records; + sort_length= sort_param->key_length; + maxbuffer= 1; + + while (memavl >= MIN_SORT_MEMORY) + { + if ((my_off_t) (idx+1)*(sort_length+sizeof(char*)) <= (my_off_t) memavl) + keys= idx+1; + else + { + ulong skr; + do + { + skr= maxbuffer; + if (memavl < sizeof(BUFFPEK)*maxbuffer || + (keys=(memavl-sizeof(BUFFPEK)*maxbuffer)/ + (sort_length+sizeof(char*))) <= 1 || + keys < maxbuffer) + { + _ma_check_print_error(sort_param->sort_info->param, + "aria_sort_buffer_size is too small"); + goto err; + } + } + while ((maxbuffer= (int) (idx/(keys-1)+1)) != skr); + } + if ((sort_keys= (uchar **) + my_malloc(keys*(sort_length+sizeof(char*))+ + ((sort_param->keyinfo->flag & HA_FULLTEXT) ? 
+ HA_FT_MAXBYTELEN : 0), MYF(0)))) + { + if (my_init_dynamic_array(&sort_param->buffpek, sizeof(BUFFPEK), + maxbuffer, maxbuffer/2)) + { + my_free(sort_keys, MYF(0)); + sort_keys= (uchar **) NULL; /* for err: label */ + } + else + break; + } + old_memavl= memavl; + if ((memavl= memavl/4*3) < MIN_SORT_MEMORY && + old_memavl > MIN_SORT_MEMORY) + memavl= MIN_SORT_MEMORY; + } + if (memavl < MIN_SORT_MEMORY) + { + _ma_check_print_error(sort_param->sort_info->param, + "Aria sort buffer too small"); + goto err; /* purecov: tested */ + } + + if (sort_param->sort_info->param->testflag & T_VERBOSE) + printf("Key %d - Allocating buffer for %lu keys\n", + sort_param->key+1, (ulong) keys); + sort_param->sort_keys= sort_keys; + + idx= error= 0; + sort_keys[0]= (uchar*) (sort_keys+keys); + + DBUG_PRINT("info", ("reading keys")); + while (!(error= sort_param->sort_info->got_error) && + !(error= (*sort_param->key_read)(sort_param, sort_keys[idx]))) + { + if (sort_param->real_key_length > sort_param->key_length) + { + if (write_key(sort_param,sort_keys[idx], + &sort_param->tempfile_for_exceptions)) + goto err; + continue; + } + + if (++idx == keys) + { + if (sort_param->write_keys(sort_param, sort_keys, idx - 1, + (BUFFPEK *)alloc_dynamic(&sort_param-> + buffpek), + &sort_param->tempfile)) + goto err; + sort_keys[0]= (uchar*) (sort_keys+keys); + memcpy(sort_keys[0], sort_keys[idx - 1], + (size_t) sort_param->key_length); + idx= 1; + } + sort_keys[idx]=sort_keys[idx - 1] + sort_param->key_length; + } + if (error > 0) + goto err; + if (sort_param->buffpek.elements) + { + if (sort_param->write_keys(sort_param,sort_keys, idx, + (BUFFPEK *) alloc_dynamic(&sort_param-> + buffpek), + &sort_param->tempfile)) + goto err; + sort_param->keys= (sort_param->buffpek.elements - 1) * (keys - 1) + idx; + } + else + sort_param->keys= idx; + + sort_param->sort_keys_length= keys; + goto ok; + +err: + DBUG_PRINT("error", ("got some error")); + sort_param->sort_info->got_error= 1; /* no need to protect 
with a mutex */ + my_free(sort_keys,MYF(MY_ALLOW_ZERO_PTR)); + sort_param->sort_keys=0; + delete_dynamic(& sort_param->buffpek); + close_cached_file(&sort_param->tempfile); + close_cached_file(&sort_param->tempfile_for_exceptions); + +ok: + free_root(&sort_param->wordroot, MYF(0)); + /* + Detach from the share if the writer is involved. Avoid others to + be blocked. This includes a flush of the write buffer. This will + also indicate EOF to the readers. + */ + if (sort_param->sort_info->info->rec_cache.share) + remove_io_thread(&sort_param->sort_info->info->rec_cache); + + /* Readers detach from the share if any. Avoid others to be blocked. */ + if (sort_param->read_cache.share) + remove_io_thread(&sort_param->read_cache); + + pthread_mutex_lock(&sort_param->sort_info->mutex); + if (!--sort_param->sort_info->threads_running) + pthread_cond_signal(&sort_param->sort_info->cond); + pthread_mutex_unlock(&sort_param->sort_info->mutex); + DBUG_PRINT("exit", ("======== ending thread ========")); + } + my_thread_end(); + return NULL; +} + + +int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param) +{ + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; + ulong length, keys; + double *rec_per_key_part= param->new_rec_per_key_part; + int got_error=sort_info->got_error; + uint i; + MARIA_HA *info=sort_info->info; + MARIA_SHARE *share= info->s; + MARIA_SORT_PARAM *sinfo; + uchar *mergebuf=0; + DBUG_ENTER("_ma_thr_write_keys"); + LINT_INIT(length); + + for (i= 0, sinfo= sort_param ; + i < sort_info->total_keys ; + i++, rec_per_key_part+=sinfo->keyinfo->keysegs, sinfo++) + { + if (!sinfo->sort_keys) + { + got_error=1; + my_free(sinfo->rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + continue; + } + if (!got_error) + { + maria_set_key_active(share->state.key_map, sinfo->key); + + if (!sinfo->buffpek.elements) + { + if (param->testflag & T_VERBOSE) + { + printf("Key %d - Dumping %u keys\n",sinfo->key+1, sinfo->keys); + fflush(stdout); + } + if 
(write_index(sinfo, sinfo->sort_keys, sinfo->keys) || + flush_maria_ft_buf(sinfo) || _ma_flush_pending_blocks(sinfo)) + got_error=1; + } + if (!got_error && param->testflag & T_STATISTICS) + maria_update_key_parts(sinfo->keyinfo, rec_per_key_part, sinfo->unique, + param->stats_method == + MI_STATS_METHOD_IGNORE_NULLS ? + sinfo->notnull : NULL, + (ulonglong) share->state.state.records); + } + my_free(sinfo->sort_keys,MYF(0)); + my_free(sinfo->rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + sinfo->sort_keys=0; + } + + for (i= 0, sinfo= sort_param ; + i < sort_info->total_keys ; + i++, + delete_dynamic(&sinfo->buffpek), + close_cached_file(&sinfo->tempfile), + close_cached_file(&sinfo->tempfile_for_exceptions), + sinfo++) + { + if (got_error) + continue; + if (sinfo->keyinfo->flag & HA_VAR_LENGTH_KEY) + { + sinfo->write_keys=write_keys_varlen; + sinfo->read_to_buffer=read_to_buffer_varlen; + sinfo->write_key=write_merge_key_varlen; + } + else + { + sinfo->write_keys=write_keys; + sinfo->read_to_buffer=read_to_buffer; + sinfo->write_key=write_merge_key; + } + if (sinfo->buffpek.elements) + { + uint maxbuffer=sinfo->buffpek.elements-1; + if (!mergebuf) + { + length=param->sort_buffer_length; + while (length >= MIN_SORT_MEMORY) + { + if ((mergebuf= my_malloc(length, MYF(0)))) + break; + length=length*3/4; + } + if (!mergebuf) + { + got_error=1; + continue; + } + } + keys=length/sinfo->key_length; + if (maxbuffer >= MERGEBUFF2) + { + if (param->testflag & T_VERBOSE) + printf("Key %d - Merging %u keys\n",sinfo->key+1, sinfo->keys); + if (merge_many_buff(sinfo, keys, (uchar **) mergebuf, + dynamic_element(&sinfo->buffpek, 0, BUFFPEK *), + (int*) &maxbuffer, &sinfo->tempfile)) + { + got_error=1; + continue; + } + } + if (flush_io_cache(&sinfo->tempfile) || + reinit_io_cache(&sinfo->tempfile,READ_CACHE,0L,0,0)) + { + got_error=1; + continue; + } + if (param->testflag & T_VERBOSE) + printf("Key %d - Last merge and dumping keys\n", sinfo->key+1); + if (merge_index(sinfo, keys, (uchar**) 
mergebuf, + dynamic_element(&sinfo->buffpek,0,BUFFPEK *), + maxbuffer,&sinfo->tempfile) || + flush_maria_ft_buf(sinfo) || + _ma_flush_pending_blocks(sinfo)) + { + got_error=1; + continue; + } + } + if (my_b_inited(&sinfo->tempfile_for_exceptions)) + { + uint16 key_length; + + if (param->testflag & T_VERBOSE) + printf("Key %d - Dumping 'long' keys\n", sinfo->key+1); + + if (flush_io_cache(&sinfo->tempfile_for_exceptions) || + reinit_io_cache(&sinfo->tempfile_for_exceptions,READ_CACHE,0L,0,0)) + { + got_error=1; + continue; + } + + while (!got_error && + !my_b_read(&sinfo->tempfile_for_exceptions,(uchar*)&key_length, + sizeof(key_length))) + { + uchar maria_ft_buf[HA_FT_MAXBYTELEN + HA_FT_WLEN + 10]; + if (key_length > sizeof(maria_ft_buf) || + my_b_read(&sinfo->tempfile_for_exceptions, (uchar*)maria_ft_buf, + (uint) key_length)) + got_error= 1; + else + { + MARIA_KEY tmp_key; + tmp_key.keyinfo= info->s->keyinfo + sinfo->key; + tmp_key.data= maria_ft_buf; + tmp_key.ref_length= info->s->rec_reflength; + tmp_key.data_length= key_length - info->s->rec_reflength; + tmp_key.flag= 0; + if (_ma_ck_write(info, &tmp_key)) + got_error=1; + } + } + } + } + my_free(mergebuf,MYF(MY_ALLOW_ZERO_PTR)); + DBUG_RETURN(got_error); +} +#endif /* THREAD */ + + +/* Write all keys in memory to file for later merge */ + +static int write_keys(MARIA_SORT_PARAM *info, register uchar **sort_keys, + uint count, BUFFPEK *buffpek, IO_CACHE *tempfile) +{ + uchar **end; + uint sort_length=info->key_length; + DBUG_ENTER("write_keys"); + + my_qsort2((uchar*) sort_keys,count,sizeof(uchar*),(qsort2_cmp) info->key_cmp, + info); + if (!my_b_inited(tempfile) && + open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST", + DISK_BUFFER_SIZE, info->sort_info->param->myf_rw)) + DBUG_RETURN(1); /* purecov: inspected */ + + buffpek->file_pos=my_b_tell(tempfile); + buffpek->count=count; + + for (end=sort_keys+count ; sort_keys != end ; sort_keys++) + { + if (my_b_write(tempfile, *sort_keys, (uint) sort_length)) 
+ DBUG_RETURN(1); /* purecov: inspected */ + } + DBUG_RETURN(0); +} /* write_keys */ + + +static inline int +my_var_write(MARIA_SORT_PARAM *info, IO_CACHE *to_file, uchar *bufs) +{ + int err; + uint16 len= _ma_keylength(info->keyinfo, bufs); + + /* The following is safe as this is a local file */ + if ((err= my_b_write(to_file, (uchar*)&len, sizeof(len)))) + return (err); + if ((err= my_b_write(to_file,bufs, (uint) len))) + return (err); + return (0); +} + + +static int write_keys_varlen(MARIA_SORT_PARAM *info, + register uchar **sort_keys, + uint count, BUFFPEK *buffpek, + IO_CACHE *tempfile) +{ + uchar **end; + int err; + DBUG_ENTER("write_keys_varlen"); + + my_qsort2((uchar*) sort_keys,count,sizeof(uchar*),(qsort2_cmp) info->key_cmp, + info); + if (!my_b_inited(tempfile) && + open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST", + DISK_BUFFER_SIZE, info->sort_info->param->myf_rw)) + DBUG_RETURN(1); /* purecov: inspected */ + + buffpek->file_pos=my_b_tell(tempfile); + buffpek->count=count; + for (end=sort_keys+count ; sort_keys != end ; sort_keys++) + { + if ((err= my_var_write(info,tempfile, *sort_keys))) + DBUG_RETURN(err); + } + DBUG_RETURN(0); +} /* write_keys_varlen */ + + +static int write_key(MARIA_SORT_PARAM *info, uchar *key, + IO_CACHE *tempfile) +{ + uint16 key_length=info->real_key_length; + DBUG_ENTER("write_key"); + + if (!my_b_inited(tempfile) && + open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST", + DISK_BUFFER_SIZE, info->sort_info->param->myf_rw)) + DBUG_RETURN(1); + + if (my_b_write(tempfile, (uchar*)&key_length,sizeof(key_length)) || + my_b_write(tempfile, key, (uint) key_length)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} /* write_key */ + + +/* Write index */ + +static int write_index(MARIA_SORT_PARAM *info, + register uchar **sort_keys, + register uint count) +{ + DBUG_ENTER("write_index"); + + my_qsort2((uchar*) sort_keys,(size_t) count,sizeof(uchar*), + (qsort2_cmp) info->key_cmp,info); + while (count--) + { + if 
((*info->key_write)(info, *sort_keys++)) + DBUG_RETURN(-1); /* purecov: inspected */ + } + DBUG_RETURN(0); +} /* write_index */ + + + /* Merge buffers to make < MERGEBUFF2 buffers */ + +static int merge_many_buff(MARIA_SORT_PARAM *info, uint keys, + uchar **sort_keys, BUFFPEK *buffpek, + int *maxbuffer, IO_CACHE *t_file) +{ + register int i; + IO_CACHE t_file2, *from_file, *to_file, *temp; + BUFFPEK *lastbuff; + DBUG_ENTER("merge_many_buff"); + + if (*maxbuffer < MERGEBUFF2) + DBUG_RETURN(0); /* purecov: inspected */ + if (flush_io_cache(t_file) || + open_cached_file(&t_file2,my_tmpdir(info->tmpdir),"ST", + DISK_BUFFER_SIZE, info->sort_info->param->myf_rw)) + DBUG_RETURN(1); /* purecov: inspected */ + + from_file= t_file ; to_file= &t_file2; + while (*maxbuffer >= MERGEBUFF2) + { + reinit_io_cache(from_file,READ_CACHE,0L,0,0); + reinit_io_cache(to_file,WRITE_CACHE,0L,0,0); + lastbuff=buffpek; + for (i=0 ; i <= *maxbuffer-MERGEBUFF*3/2 ; i+=MERGEBUFF) + { + if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++, + buffpek+i,buffpek+i+MERGEBUFF-1)) + goto cleanup; + } + if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++, + buffpek+i,buffpek+ *maxbuffer)) + break; /* purecov: inspected */ + if (flush_io_cache(to_file)) + break; /* purecov: inspected */ + temp=from_file; from_file=to_file; to_file=temp; + *maxbuffer= (int) (lastbuff-buffpek)-1; + } +cleanup: + close_cached_file(to_file); /* This holds old result */ + if (to_file == t_file) + *t_file=t_file2; /* Copy result file */ + + DBUG_RETURN(*maxbuffer >= MERGEBUFF2); /* Return 1 if interrupted */ +} /* merge_many_buff */ + + +/* + Read data to buffer + + SYNOPSIS + read_to_buffer() + fromfile File to read from + buffpek Where to read from + sort_length max length to read + RESULT + > 0 Ammount of bytes read + -1 Error +*/ + +static uint read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek, + uint sort_length) +{ + register uint count; + uint length; + + if ((count=(uint) min((ha_rows) 
buffpek->max_keys,buffpek->count))) + { + if (my_pread(fromfile->file, buffpek->base, + (length= sort_length*count),buffpek->file_pos,MYF_RW)) + return((uint) -1); /* purecov: inspected */ + buffpek->key=buffpek->base; + buffpek->file_pos+= length; /* New filepos */ + buffpek->count-= count; + buffpek->mem_count= count; + } + return (count*sort_length); +} /* read_to_buffer */ + +static uint read_to_buffer_varlen(IO_CACHE *fromfile, BUFFPEK *buffpek, + uint sort_length) +{ + register uint count; + uint idx; + uchar *buffp; + + if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count))) + { + buffp= buffpek->base; + + for (idx=1;idx<=count;idx++) + { + uint16 length_of_key; + if (my_pread(fromfile->file,(uchar*)&length_of_key,sizeof(length_of_key), + buffpek->file_pos,MYF_RW)) + return((uint) -1); + buffpek->file_pos+=sizeof(length_of_key); + if (my_pread(fromfile->file, buffp, length_of_key, + buffpek->file_pos,MYF_RW)) + return((uint) -1); + buffpek->file_pos+=length_of_key; + buffp = buffp + sort_length; + } + buffpek->key=buffpek->base; + buffpek->count-= count; + buffpek->mem_count= count; + } + return (count*sort_length); +} /* read_to_buffer_varlen */ + + +static int write_merge_key_varlen(MARIA_SORT_PARAM *info, + IO_CACHE *to_file, uchar* key, + uint sort_length, uint count) +{ + uint idx; + uchar *bufs = key; + + for (idx=1;idx<=count;idx++) + { + int err; + if ((err= my_var_write(info, to_file, bufs))) + return (err); + bufs=bufs+sort_length; + } + return(0); +} + + +static int write_merge_key(MARIA_SORT_PARAM *info __attribute__((unused)), + IO_CACHE *to_file, uchar *key, + uint sort_length, uint count) +{ + return my_b_write(to_file, key, (size_t) sort_length*count); +} + +/* + Merge buffers to one buffer + If to_file == 0 then use info->key_write +*/ + +static int NEAR_F +merge_buffers(MARIA_SORT_PARAM *info, uint keys, IO_CACHE *from_file, + IO_CACHE *to_file, uchar **sort_keys, BUFFPEK *lastbuff, + BUFFPEK *Fb, BUFFPEK *Tb) +{ + int error; + 
uint sort_length,maxcount; + ha_rows count; + my_off_t to_start_filepos; + uchar *strpos; + BUFFPEK *buffpek,**refpek; + QUEUE queue; + DBUG_ENTER("merge_buffers"); + + count=error=0; + maxcount=keys/((uint) (Tb-Fb) +1); + DBUG_ASSERT(maxcount > 0); + LINT_INIT(to_start_filepos); + if (to_file) + to_start_filepos=my_b_tell(to_file); + strpos= (uchar*) sort_keys; + sort_length=info->key_length; + + if (init_queue(&queue,(uint) (Tb-Fb)+1,offsetof(BUFFPEK,key),0, + (int (*)(void*, uchar *,uchar*)) info->key_cmp, + (void*) info, 0, 0)) + DBUG_RETURN(1); /* purecov: inspected */ + + for (buffpek= Fb ; buffpek <= Tb ; buffpek++) + { + count+= buffpek->count; + buffpek->base= strpos; + buffpek->max_keys=maxcount; + strpos+= (uint) (error=(int) info->read_to_buffer(from_file,buffpek, + sort_length)); + if (error == -1) + goto err; /* purecov: inspected */ + queue_insert(&queue,(uchar*) buffpek); + } + + while (queue.elements > 1) + { + for (;;) + { + buffpek=(BUFFPEK*) queue_top(&queue); + if (to_file) + { + if (info->write_key(info,to_file, buffpek->key, + (uint) sort_length,1)) + { + error=1; goto err; /* purecov: inspected */ + } + } + else + { + if ((*info->key_write)(info,(void*) buffpek->key)) + { + error=1; goto err; /* purecov: inspected */ + } + } + buffpek->key+=sort_length; + if (! 
--buffpek->mem_count) + { + /* It's enough to check for killedptr before a slow operation */ + if (_ma_killed_ptr(info->sort_info->param)) + { + error=1; + goto err; + } + if (!(error=(int) info->read_to_buffer(from_file,buffpek,sort_length))) + { + uchar *base= buffpek->base; + uint max_keys=buffpek->max_keys; + + VOID(queue_remove_top(&queue)); + + /* Put room used by buffer to use in other buffer */ + for (refpek= (BUFFPEK**) &queue_top(&queue); + refpek <= (BUFFPEK**) &queue_end(&queue); + refpek++) + { + buffpek= *refpek; + if (buffpek->base+buffpek->max_keys*sort_length == base) + { + buffpek->max_keys+=max_keys; + break; + } + else if (base+max_keys*sort_length == buffpek->base) + { + buffpek->base=base; + buffpek->max_keys+=max_keys; + break; + } + } + break; /* One buffer have been removed */ + } + } + else if (error == -1) + goto err; /* purecov: inspected */ + queue_replace_top(&queue); /* Top element has been replaced */ + } + } + buffpek=(BUFFPEK*) queue_top(&queue); + buffpek->base= (uchar*) sort_keys; + buffpek->max_keys=keys; + do + { + if (to_file) + { + if (info->write_key(info, to_file, buffpek->key, + sort_length,buffpek->mem_count)) + { + error=1; goto err; /* purecov: inspected */ + } + } + else + { + register uchar *end; + strpos= buffpek->key; + for (end= strpos+buffpek->mem_count*sort_length; + strpos != end ; + strpos+=sort_length) + { + if ((*info->key_write)(info, strpos)) + { + error=1; goto err; /* purecov: inspected */ + } + } + } + } + while ((error=(int) info->read_to_buffer(from_file,buffpek,sort_length)) != + -1 && error != 0); + + lastbuff->count=count; + if (to_file) + lastbuff->file_pos=to_start_filepos; +err: + delete_queue(&queue); + DBUG_RETURN(error); +} /* merge_buffers */ + + + /* Do a merge to output-file (save only positions) */ + +static int NEAR_F +merge_index(MARIA_SORT_PARAM *info, uint keys, uchar **sort_keys, + BUFFPEK *buffpek, int maxbuffer, IO_CACHE *tempfile) +{ + DBUG_ENTER("merge_index"); + if 
(merge_buffers(info,keys,tempfile,(IO_CACHE*) 0,sort_keys,buffpek,buffpek, + buffpek+maxbuffer)) + DBUG_RETURN(1); /* purecov: inspected */ + DBUG_RETURN(0); +} /* merge_index */ + + +static int flush_maria_ft_buf(MARIA_SORT_PARAM *info) +{ + int err=0; + if (info->sort_info->ft_buf) + { + err=_ma_sort_ft_buf_flush(info); + my_free(info->sort_info->ft_buf, MYF(0)); + info->sort_info->ft_buf=0; + } + return err; +} diff --git a/storage/maria/ma_sp_defs.h b/storage/maria/ma_sp_defs.h new file mode 100644 index 00000000000..398bf99c52e --- /dev/null +++ b/storage/maria/ma_sp_defs.h @@ -0,0 +1,48 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _SP_DEFS_H +#define _SP_DEFS_H + +#define SPDIMS 2 +#define SPTYPE HA_KEYTYPE_DOUBLE +#define SPLEN 8 + +#ifdef HAVE_SPATIAL + +enum wkbType +{ + wkbPoint = 1, + wkbLineString = 2, + wkbPolygon = 3, + wkbMultiPoint = 4, + wkbMultiLineString = 5, + wkbMultiPolygon = 6, + wkbGeometryCollection = 7 +}; + +enum wkbByteOrder +{ + wkbXDR = 0, /* Big Endian */ + wkbNDR = 1 /* Little Endian */ +}; + +MARIA_KEY *_ma_sp_make_key(MARIA_HA *info, MARIA_KEY *ret_key, uint keynr, + uchar *key, const uchar *record, my_off_t filepos, + ulonglong trid); + +#endif /*HAVE_SPATIAL*/ +#endif /* _SP_DEFS_H */ diff --git a/storage/maria/ma_sp_key.c b/storage/maria/ma_sp_key.c new file mode 100644 index 00000000000..22944a5db0a --- /dev/null +++ b/storage/maria/ma_sp_key.c @@ -0,0 +1,305 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#include "ma_blockrec.h" /* For ROW_FLAG_TRANSID */ +#include "trnman.h" + +#ifdef HAVE_SPATIAL + +#include "ma_sp_defs.h" + +static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims, + double *mbr, int top); +static int sp_mbr_from_wkb(uchar (*wkb), uint size, uint n_dims, double *mbr); + + +/** + Create spactial key +*/ + +MARIA_KEY *_ma_sp_make_key(MARIA_HA *info, MARIA_KEY *ret_key, uint keynr, + uchar *key, const uchar *record, my_off_t filepos, + ulonglong trid) +{ + HA_KEYSEG *keyseg; + MARIA_KEYDEF *keyinfo = &info->s->keyinfo[keynr]; + uint len = 0; + const uchar *pos; + uint dlen; + uchar *dptr; + double mbr[SPDIMS * 2]; + uint i; + DBUG_ENTER("_ma_sp_make_key"); + + keyseg = &keyinfo->seg[-1]; + pos = record + keyseg->start; + ret_key->data= key; + + dlen = _ma_calc_blob_length(keyseg->bit_start, pos); + memcpy_fixed(&dptr, pos + keyseg->bit_start, sizeof(char*)); + if (!dptr) + { + my_errno= HA_ERR_NULL_IN_SPATIAL; + DBUG_RETURN(0); + } + + sp_mbr_from_wkb(dptr + 4, dlen - 4, SPDIMS, mbr); /* SRID */ + + for (i = 0, keyseg = keyinfo->seg; keyseg->type; keyseg++, i++) + { + uint length = keyseg->length, start= keyseg->start; + double val; + + DBUG_ASSERT(length == 8); + DBUG_ASSERT(!(start % 8)); + DBUG_ASSERT(start < sizeof(mbr)); + DBUG_ASSERT(keyseg->type == HA_KEYTYPE_DOUBLE); + + val= 
mbr[start / sizeof (double)]; +#ifdef HAVE_ISNAN + if (isnan(val)) + { + bzero(key, length); + key+= length; + len+= length; + continue; + } +#endif + + if (keyseg->flag & HA_SWAP_KEY) + { + mi_float8store(key, val); + } + else + { + float8store((uchar *)key, val); + } + key += length; + len+= length; + } + _ma_dpointer(info->s, key, filepos); + ret_key->keyinfo= keyinfo; + ret_key->data_length= len; + ret_key->ref_length= info->s->rec_reflength; + ret_key->flag= 0; + if (_ma_have_versioning(info) && trid) + { + ret_key->ref_length+= transid_store_packed(info, + key + ret_key->ref_length, + trid); + } + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, ret_key);); + DBUG_RETURN(ret_key); +} + + +/* + Calculate minimal bounding rectangle (mbr) of the spatial object + stored in "well-known binary representation" (wkb) format. +*/ + +static int sp_mbr_from_wkb(uchar *wkb, uint size, uint n_dims, double *mbr) +{ + uint i; + + for (i=0; i < n_dims; ++i) + { + mbr[i * 2] = DBL_MAX; + mbr[i * 2 + 1] = -DBL_MAX; + } + + return sp_get_geometry_mbr(&wkb, wkb + size, n_dims, mbr, 1); +} + +/* + Add one point stored in wkb to mbr +*/ + +static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order __attribute__((unused)), + double *mbr) +{ + double ord; + double *mbr_end= mbr + n_dims * 2; + + while (mbr < mbr_end) + { + if ((*wkb) > end - 8) + return -1; + float8get(ord, (const uchar*) *wkb); + (*wkb)+= 8; + if (ord < *mbr) + *mbr= ord; + mbr++; + if (ord > *mbr) + *mbr= ord; + mbr++; + } + return 0; +} + + +static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr) +{ + return sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr); +} + + +static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr) +{ + uint n_points; + + n_points = uint4korr(*wkb); + (*wkb) += 4; + for (; n_points > 0; --n_points) + { + /* Add next point to mbr */ + if (sp_add_point_to_mbr(wkb, 
end, n_dims, byte_order, mbr)) + return -1; + } + return 0; +} + + +static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr) +{ + uint n_linear_rings; + uint n_points; + + n_linear_rings = uint4korr((*wkb)); + (*wkb) += 4; + + for (; n_linear_rings > 0; --n_linear_rings) + { + n_points = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_points > 0; --n_points) + { + /* Add next point to mbr */ + if (sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + } + return 0; +} + +static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims, + double *mbr, int top) +{ + int res; + uchar byte_order; + uint wkb_type; + + byte_order = *(*wkb); + ++(*wkb); + + wkb_type = uint4korr((*wkb)); + (*wkb) += 4; + + switch ((enum wkbType) wkb_type) + { + case wkbPoint: + res = sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr); + break; + case wkbLineString: + res = sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr); + break; + case wkbPolygon: + res = sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr); + break; + case wkbMultiPoint: + { + uint n_items; + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + byte_order = *(*wkb); + ++(*wkb); + (*wkb) += 4; + if (sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + res = 0; + break; + } + case wkbMultiLineString: + { + uint n_items; + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + byte_order = *(*wkb); + ++(*wkb); + (*wkb) += 4; + if (sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + res = 0; + break; + } + case wkbMultiPolygon: + { + uint n_items; + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + byte_order = *(*wkb); + ++(*wkb); + (*wkb) += 4; + if (sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + res = 0; + break; + } + case wkbGeometryCollection: + { + uint n_items; + + if (!top) + 
return -1; + + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + if (sp_get_geometry_mbr(wkb, end, n_dims, mbr, 0)) + return -1; + } + res = 0; + break; + } + default: + res = -1; + } + return res; +} + +#endif /*HAVE_SPATIAL*/ diff --git a/storage/maria/ma_sp_test.c b/storage/maria/ma_sp_test.c new file mode 100644 index 00000000000..b8c00753acb --- /dev/null +++ b/storage/maria/ma_sp_test.c @@ -0,0 +1,568 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Testing of the basic functions of a MARIA spatial table */ +/* Written by Alex Barkov, who has a shared copyright to this code */ + +#include "maria.h" + +#ifdef HAVE_SPATIAL +#include "ma_sp_defs.h" + +#define MAX_REC_LENGTH 1024 +#define KEYALG HA_KEY_ALG_RTREE + +static void create_linestring(uchar *record,uint rownr); +static void print_record(uchar * record,my_off_t offs,const char * tail); + +static void create_key(uchar *key,uint rownr); +static void print_key(const uchar *key,const char * tail); + +static int run_test(const char *filename); +static int read_with_pos(MARIA_HA * file, int silent); + +static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points, + uchar *wkb); +static void maria_rtree_PrintWKB(uchar *wkb, uint n_dims); + +static char blob_key[MAX_REC_LENGTH]; + + 
+int main(int argc __attribute__((unused)),char *argv[]) +{ + MY_INIT(argv[0]); + maria_init(); + exit(run_test("sp_test")); +} + + +int run_test(const char *filename) +{ + MARIA_HA *file; + MARIA_UNIQUEDEF uniquedef; + MARIA_CREATE_INFO create_info; + MARIA_COLUMNDEF recinfo[20]; + MARIA_KEYDEF keyinfo[20]; + HA_KEYSEG keyseg[20]; + key_range min_range, max_range; + int silent=0; + int create_flag=0; + int null_fields=0; + int nrecords=30; + int uniques=0; + int i; + int error; + int row_count=0; + uchar record[MAX_REC_LENGTH]; + uchar key[MAX_REC_LENGTH]; + uchar read_record[MAX_REC_LENGTH]; + int upd=10; + ha_rows hrows; + + /* Define a column for NULLs and DEL markers*/ + + recinfo[0].type=FIELD_NORMAL; + recinfo[0].length=1; /* For NULL bits */ + + + /* Define spatial column */ + + recinfo[1].type=FIELD_BLOB; + recinfo[1].length=4 + portable_sizeof_char_ptr; + + + + /* Define a key with 1 spatial segment */ + + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].flag=HA_SPATIAL; + keyinfo[0].key_alg=KEYALG; + + keyinfo[0].seg[0].type= HA_KEYTYPE_BINARY; + keyinfo[0].seg[0].flag=0; + keyinfo[0].seg[0].start= 1; + keyinfo[0].seg[0].length=1; /* Spatial ignores it anyway */ + keyinfo[0].seg[0].null_bit= null_fields ? 
2 : 0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].language=default_charset_info->number; + keyinfo[0].seg[0].bit_start=4; /* Long BLOB */ + + + if (!silent) + printf("- Creating isam-file\n"); + + bzero((char*) &create_info,sizeof(create_info)); + create_info.max_rows=10000000; + + if (maria_create(filename, + DYNAMIC_RECORD, + 1, /* keys */ + keyinfo, + 2, /* columns */ + recinfo,uniques,&uniquedef,&create_info,create_flag)) + goto err; + + if (!silent) + printf("- Open isam-file\n"); + + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED))) + goto err; + + if (!silent) + printf("- Writing key:s\n"); + + for (i=0; i<nrecords; i++ ) + { + create_linestring(record,i); + error=maria_write(file,record); + print_record(record,maria_position(file),"\n"); + if (!error) + { + row_count++; + } + else + { + printf("maria_write: %d\n", error); + goto err; + } + } + + if ((error=read_with_pos(file,silent))) + goto err; + + if (!silent) + printf("- Deleting rows with position\n"); + for (i=0; i < nrecords/4; i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR); + if (error) + { + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + print_record(read_record,maria_position(file),"\n"); + error=maria_delete(file,read_record); + if (error) + { + printf("pos: %2d maria_delete: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + } + + if (!silent) + printf("- Updating rows with position\n"); + for (i=0; i < nrecords/2 ; i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 
0L : HA_OFFSET_ERROR); + if (error) + { + if (error==HA_ERR_RECORD_DELETED) + continue; + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + print_record(read_record,maria_position(file),""); + create_linestring(record,i+nrecords*upd); + printf("\t-> "); + print_record(record,maria_position(file),"\n"); + error=maria_update(file,read_record,record); + if (error) + { + printf("pos: %2d maria_update: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + } + + if ((error=read_with_pos(file,silent))) + goto err; + + if (!silent) + printf("- Test maria_rkey then a sequence of maria_rnext_same\n"); + + create_key(key, nrecords*4/5); + print_key(key," search for INTERSECT\n"); + + if ((error=maria_rkey(file,read_record,0,key,0,HA_READ_MBR_INTERSECT))) + { + printf("maria_rkey: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rkey\n"); + row_count=1; + + for (;;) + { + if ((error=maria_rnext_same(file,read_record))) + { + if (error==HA_ERR_END_OF_FILE) + break; + printf("maria_next: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rnext_same\n"); + row_count++; + } + printf(" %d rows\n",row_count); + + if (!silent) + printf("- Test maria_rfirst then a sequence of maria_rnext\n"); + + error=maria_rfirst(file,read_record,0); + if (error) + { + printf("maria_rfirst: %3d errno: %3d\n",error,my_errno); + goto err; + } + row_count=1; + print_record(read_record,maria_position(file)," maria_frirst\n"); + + for(i=0;i<nrecords;i++) { + if ((error=maria_rnext(file,read_record,0))) + { + if (error==HA_ERR_END_OF_FILE) + break; + printf("maria_next: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rnext\n"); + row_count++; + } + printf(" %d rows\n",row_count); + + if (!silent) + printf("- Test maria_records_in_range()\n"); + + create_key(key, nrecords*upd); + print_key(key," 
INTERSECT\n"); + min_range.key= key; + min_range.length= 1000; /* Big enough */ + min_range.flag= HA_READ_MBR_INTERSECT; + max_range.key= record+1; + max_range.length= 1000; /* Big enough */ + max_range.flag= HA_READ_KEY_EXACT; + hrows= maria_records_in_range(file,0, &min_range, &max_range); + printf(" %ld rows\n", (long) hrows); + + if (maria_close(file)) goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return 0; + +err: + printf("got error: %3d when using maria-database\n",my_errno); + maria_end(); + return 1; /* skip warning */ +} + + +static int read_with_pos (MARIA_HA * file,int silent) +{ + int error; + int i; + uchar read_record[MAX_REC_LENGTH]; + int rows=0; + + if (!silent) + printf("- Reading rows with position\n"); + for (i=0;;i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR); + if (error) + { + if (error==HA_ERR_END_OF_FILE) + break; + if (error==HA_ERR_RECORD_DELETED) + continue; + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + return error; + } + rows++; + print_record(read_record,maria_position(file),"\n"); + } + printf(" %d rows\n",rows); + return 0; +} + + +#ifdef NOT_USED +static void bprint_record(uchar * record, + my_off_t offs __attribute__((unused)), + const char * tail) +{ + int i; + char * pos; + i=(unsigned char)record[0]; + printf("%02X ",i); + + for( pos=record+1, i=0; i<32; i++,pos++) + { + int b=(unsigned char)*pos; + printf("%02X",b); + } + printf("%s",tail); +} +#endif + + +static void print_record(uchar * record, my_off_t offs,const char * tail) +{ + uchar *pos; + char *ptr; + uint len; + + printf(" rec=(%d)",(unsigned char)record[0]); + pos=record+1; + len=sint4korr(pos); + pos+=4; + printf(" len=%d ",len); + memcpy_fixed(&ptr,pos,sizeof(char*)); + if (ptr) + maria_rtree_PrintWKB((uchar*) ptr,SPDIMS); + else + printf("<NULL> "); + printf(" offs=%ld ",(long int)offs); + printf("%s",tail); +} + + +#ifdef NOT_USED +static 
void create_point(uchar *record,uint rownr) +{ + uint tmp; + char *ptr; + char *pos=record; + double x[200]; + int i; + + for(i=0;i<SPDIMS;i++) + x[i]=rownr; + + bzero((char*) record,MAX_REC_LENGTH); + *pos=0x01; /* DEL marker */ + pos++; + + memset(blob_key,0,sizeof(blob_key)); + tmp=maria_rtree_CreatePointWKB(x,SPDIMS,blob_key); + + int4store(pos,tmp); + pos+=4; + + ptr=blob_key; + memcpy_fixed(pos,&ptr,sizeof(char*)); +} +#endif + + +static void create_linestring(uchar *record,uint rownr) +{ + uint tmp; + char *ptr; + uchar *pos= record; + double x[200]; + int i,j; + int npoints=2; + + for(j=0;j<npoints;j++) + for(i=0;i<SPDIMS;i++) + x[i+j*SPDIMS]=rownr*j; + + bzero((char*) record,MAX_REC_LENGTH); + *pos=0x01; /* DEL marker */ + pos++; + + memset(blob_key,0,sizeof(blob_key)); + tmp=maria_rtree_CreateLineStringWKB(x,SPDIMS,npoints, (uchar*) blob_key); + + int4store(pos,tmp); + pos+=4; + + ptr=blob_key; + memcpy_fixed(pos,&ptr,sizeof(char*)); +} + + +static void create_key(uchar *key,uint rownr) +{ + double c=rownr; + uchar *pos; + uint i; + + bzero(key,MAX_REC_LENGTH); + for ( pos=key, i=0; i<2*SPDIMS; i++) + { + float8store(pos,c); + pos+=sizeof(c); + } +} + +static void print_key(const uchar *key,const char * tail) +{ + double c; + uint i; + + printf(" key="); + for (i=0; i<2*SPDIMS; i++) + { + float8get(c,key); + key+=sizeof(c); + printf("%.14g ",c); + } + printf("%s",tail); +} + + +#ifdef NOT_USED + +static int maria_rtree_CreatePointWKB(double *ords, uint n_dims, uchar *wkb) +{ + uint i; + + *wkb = wkbXDR; + ++wkb; + int4store(wkb, wkbPoint); + wkb += 4; + + for (i=0; i < n_dims; ++i) + { + float8store(wkb, ords[i]); + wkb += 8; + } + return 5 + n_dims * 8; +} +#endif + + +static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points, + uchar *wkb) +{ + uint i; + uint n_ords = n_dims * n_points; + + *wkb = wkbXDR; + ++wkb; + int4store(wkb, wkbLineString); + wkb += 4; + int4store(wkb, n_points); + wkb += 4; + for (i=0; i < n_ords; ++i) + 
{ + float8store(wkb, ords[i]); + wkb += 8; + } + return 9 + n_points * n_dims * 8; +} + + +static void maria_rtree_PrintWKB(uchar *wkb, uint n_dims) +{ + uint wkb_type; + + ++wkb; + wkb_type = uint4korr(wkb); + wkb += 4; + + switch ((enum wkbType)wkb_type) + { + case wkbPoint: + { + uint i; + double ord; + + printf("POINT("); + for (i=0; i < n_dims; ++i) + { + float8get(ord, wkb); + wkb += 8; + printf("%.14g", ord); + if (i < n_dims - 1) + printf(" "); + else + printf(")"); + } + break; + } + case wkbLineString: + { + uint p, i; + uint n_points; + double ord; + + printf("LineString("); + n_points = uint4korr(wkb); + wkb += 4; + for (p=0; p < n_points; ++p) + { + for (i=0; i < n_dims; ++i) + { + float8get(ord, wkb); + wkb += 8; + printf("%.14g", ord); + if (i < n_dims - 1) + printf(" "); + } + if (p < n_points - 1) + printf(", "); + else + printf(")"); + } + break; + } + case wkbPolygon: + { + printf("POLYGON(...)"); + break; + } + case wkbMultiPoint: + { + printf("MULTIPOINT(...)"); + break; + } + case wkbMultiLineString: + { + printf("MULTILINESTRING(...)"); + break; + } + case wkbMultiPolygon: + { + printf("MULTIPOLYGON(...)"); + break; + } + case wkbGeometryCollection: + { + printf("GEOMETRYCOLLECTION(...)"); + break; + } + default: + { + printf("UNKNOWN GEOMETRY TYPE"); + break; + } + } +} + +#else +int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused))) +{ + exit(0); +} +#endif /*HAVE_SPATIAL*/ diff --git a/storage/maria/ma_state.c b/storage/maria/ma_state.c new file mode 100644 index 00000000000..ca94d58264b --- /dev/null +++ b/storage/maria/ma_state.c @@ -0,0 +1,795 @@ +/* Copyright (C) 2008 Sun AB and Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Functions to maintain live statistics for Maria transactional tables + and versioning for not transactional tables + + See WL#3138; Maria - fast "SELECT COUNT(*) FROM t;" and "CHECKSUM TABLE t" + for details about live number of rows and live checksums + + TODO + - Allocate MARIA_USED_TABLES and MARIA_STATE_HISTORY from a global pool (to + avoid calls to malloc()) + - In _ma_trnman_end_trans_hook(), don't call _ma_remove_not_visible_states() + every time. One could for example call it if there has been more than + 10 ended transactions since last time it was called. +*/ + +#include "maria_def.h" +#include "trnman.h" +#include "ma_blockrec.h" + +/** + @brief Setup initial start-of-transaction state for a table + + @fn _ma_setup_live_state + @param info Maria handler + + @notes + This function ensures that trn->used_tables contains a list of + start and live states for tables that are part of the transaction + and that info->state points to the current live state for the table. 
+ + @TODO + Change trn->table_list to a hash and share->state_history to a binary tree + + @return + @retval 0 ok + @retval 1 error (out of memory) +*/ + +my_bool _ma_setup_live_state(MARIA_HA *info) +{ + TRN *trn; + MARIA_SHARE *share= info->s; + MARIA_USED_TABLES *tables; + MARIA_STATE_HISTORY *history; + DBUG_ENTER("_ma_setup_live_state"); + + if (maria_create_trn_hook(info)) + DBUG_RETURN(1); + + trn= info->trn; + for (tables= (MARIA_USED_TABLES*) info->trn->used_tables; + tables; + tables= tables->next) + { + if (tables->share == share) + { + /* Table is already used by transaction */ + goto end; + } + } + + /* Table was not used before, create new table state entry */ + if (!(tables= (MARIA_USED_TABLES*) my_malloc(sizeof(*tables), + MYF(MY_WME | MY_ZEROFILL)))) + DBUG_RETURN(1); + tables->next= trn->used_tables; + trn->used_tables= tables; + tables->share= share; + + pthread_mutex_lock(&share->intern_lock); + share->in_trans++; + DBUG_PRINT("info", ("share: 0x%lx in_trans: %d", + (ulong) share, share->in_trans)); + + history= share->state_history; + + /* + We must keep share locked to ensure that we don't access a history + link that is deleted by concurrently running checkpoint. + + It's enough to compare trids here (instead of calling + tranman_can_read_from) as history->trid is a commit_trid + */ + while (trn->trid <= history->trid) + history= history->next; + pthread_mutex_unlock(&share->intern_lock); + /* The current item can't be deleted as it's the first one visible for us */ + tables->state_start= tables->state_current= history->state; + tables->state_current.changed= tables->state_current.no_transid= 0; + + DBUG_PRINT("info", ("records: %ld", (ulong) tables->state_start.records)); + +end: + info->state_start= &tables->state_start; + info->state= &tables->state_current; + + /* + Mark in transaction state if we are not using transid (versioning) + on rows. 
If not, then we will in _ma_trnman_end_trans_hook() + ensure that the state is visible for all at end of transaction + */ + tables->state_current.no_transid|= !(info->row_flag & ROW_FLAG_TRANSID); + + DBUG_RETURN(0); +} + + +/** + @brief Remove states that are not visible by anyone + + @fn _ma_remove_not_visible_states() + @param org_history List to history + @param all 1 if we should delete the first state if it's + visible for all. For the moment this is only used + on close() of table. + @param trnman_is_locked Set to 1 if we have already a lock on trnman. + + @notes + The assumption is that items in the history list is ordered by + commit_trid. + + A state is not visible anymore if there is no new transaction + that has been started between the commit_trid's of two states + + As long as some states exists, we keep the newest = (last commit) + state as first state in the history. This is to allow us to just move + the history from the global list to the share when we open the table. + + Note that if 'all' is set trnman_is_locked must be 0, becasue + trnman_get_min_trid() will take a lock on trnman. 
+ + @return + @retval Pointer to new history list +*/ + +MARIA_STATE_HISTORY +*_ma_remove_not_visible_states(MARIA_STATE_HISTORY *org_history, + my_bool all, + my_bool trnman_is_locked) +{ + TrID last_trid; + MARIA_STATE_HISTORY *history, **parent, *next; + DBUG_ENTER("_ma_remove_not_visible_states"); + + if (!org_history) + DBUG_RETURN(0); /* Not versioned table */ + + last_trid= org_history->trid; + parent= &org_history->next; + for (history= org_history->next; history; history= next) + { + next= history->next; + if (!trnman_exists_active_transactions(history->trid, last_trid, + trnman_is_locked)) + { + DBUG_PRINT("info", ("removing history->trid: %lu next: %lu", + (ulong) history->trid, (ulong) last_trid)); + my_free(history, MYF(0)); + continue; + } + *parent= history; + parent= &history->next; + last_trid= history->trid; + } + *parent= 0; + + if (all && parent == &org_history->next) + { + /* There is only one state left. Delete this if it's visible for all */ + if (last_trid < trnman_get_min_trid()) + { + my_free(org_history, MYF(0)); + org_history= 0; + } + } + DBUG_RETURN(org_history); +} + + +/** + @brief Remove not used state history + + @param share Maria table information + @param all 1 if we should delete the first state if it's + visible for all. For the moment this is only used + on close() of table. + + @notes + share and trnman are not locked. + + We must first lock trnman and then share->intern_lock. This is becasue + _ma_trnman_end_trans_hook() has a lock on trnman and then + takes share->intern_lock. 
+*/ + +void _ma_remove_not_visible_states_with_lock(MARIA_SHARE *share, + my_bool all) +{ + my_bool is_lock_trman; + if ((is_lock_trman= trman_is_inited())) + trnman_lock(); + + pthread_mutex_lock(&share->intern_lock); + share->state_history= _ma_remove_not_visible_states(share->state_history, + all, 1); + pthread_mutex_unlock(&share->intern_lock); + if (is_lock_trman) + trnman_unlock(); +} + + +/* + Free state history information from share->history and reset information + to current state. + + @notes + Used after repair as then all rows are visible for everyone +*/ + +void _ma_reset_state(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + MARIA_STATE_HISTORY *history= share->state_history; + + if (history) + { + MARIA_STATE_HISTORY *next; + + /* Set the current history to current state */ + share->state_history->state= share->state.state; + /* Set current table handler to point to new history state */ + info->state= info->state_start= &share->state_history->state; + for (history= history->next ; history ; history= next) + { + next= history->next; + my_free(history, MYF(0)); + } + share->state_history->next= 0; + share->state_history->trid= 0; /* Visibile for all */ + } +} + + +/**************************************************************************** + The following functions are called by thr_lock() in threaded applications + for not transactional tables +****************************************************************************/ + +/* + Create a copy of the current status for the table + + SYNOPSIS + _ma_get_status() + param Pointer to Myisam handler + concurrent_insert Set to 1 if we are going to do concurrent inserts + (THR_WRITE_CONCURRENT_INSERT was used) +*/ + +void _ma_get_status(void* param, my_bool concurrent_insert) +{ + MARIA_HA *info=(MARIA_HA*) param; + DBUG_ENTER("_ma_get_status"); + DBUG_PRINT("info",("key_file: %ld data_file: %ld concurrent_insert: %d", + (long) info->s->state.state.key_file_length, + (long) 
info->s->state.state.data_file_length, + concurrent_insert)); +#ifndef DBUG_OFF + if (info->state->key_file_length > info->s->state.state.key_file_length || + info->state->data_file_length > info->s->state.state.data_file_length) + DBUG_PRINT("warning",("old info: key_file: %ld data_file: %ld", + (long) info->state->key_file_length, + (long) info->state->data_file_length)); +#endif + info->state_save= info->s->state.state; + info->state= &info->state_save; + info->state->changed= 0; + info->append_insert_at_end= concurrent_insert; + DBUG_VOID_RETURN; +} + + +void _ma_update_status(void* param) +{ + MARIA_HA *info=(MARIA_HA*) param; + /* + Because someone may have closed the table we point at, we only + update the state if its our own state. This isn't a problem as + we are always pointing at our own lock or at a read lock. + (This is enforced by thr_multi_lock.c) + */ + if (info->state == &info->state_save) + { + MARIA_SHARE *share= info->s; +#ifndef DBUG_OFF + DBUG_PRINT("info",("updating status: key_file: %ld data_file: %ld", + (long) info->state->key_file_length, + (long) info->state->data_file_length)); + if (info->state->key_file_length < share->state.state.key_file_length || + info->state->data_file_length < share->state.state.data_file_length) + DBUG_PRINT("warning",("old info: key_file: %ld data_file: %ld", + (long) share->state.state.key_file_length, + (long) share->state.state.data_file_length)); +#endif + /* + we are going to modify the state without lock's log, this would break + recovery if done with a transactional table. + */ + DBUG_ASSERT(!info->s->base.born_transactional); + share->state.state= *info->state; + info->state= &share->state.state; +#ifdef HAVE_QUERY_CACHE + DBUG_PRINT("info", ("invalidator... 
'%s' (status update)", + info->s->data_file_name.str)); + DBUG_ASSERT(info->s->chst_invalidator != NULL); + (*info->s->chst_invalidator)((const char *)info->s->data_file_name.str); +#endif + + } + info->append_insert_at_end= 0; +} + + +/* + Same as ma_update_status() but take a lock in the table lock, to protect + against someone calling ma_get_status() from thr_lock() at the same time. +*/ + +void _ma_update_status_with_lock(MARIA_HA *info) +{ + my_bool locked= 0; + if (info->state == &info->state_save) + { + locked= 1; + pthread_mutex_lock(&info->s->lock.mutex); + } + (*info->s->lock.update_status)(info); + if (locked) + pthread_mutex_unlock(&info->s->lock.mutex); +} + + +void _ma_restore_status(void *param) +{ + MARIA_HA *info= (MARIA_HA*) param; + info->state= &info->s->state.state; + info->append_insert_at_end= 0; +} + + +void _ma_copy_status(void* to, void *from) +{ + ((MARIA_HA*) to)->state= &((MARIA_HA*) from)->state_save; +} + + +void _ma_reset_update_flag(void *param, + my_bool concurrent_insert __attribute__((unused))) +{ + MARIA_HA *info=(MARIA_HA*) param; + info->state->changed= 0; +} + + +/** + @brief Check if should allow concurrent inserts + + @implementation + Allow concurrent inserts if we don't have a hole in the table or + if there is no active write lock and there is active read locks and + maria_concurrent_insert == 2. In this last case the new + row('s) are inserted at end of file instead of filling up the hole. + + The last case is to allow one to inserts into a heavily read-used table + even if there is holes. 
+ + @notes + If there is a an rtree indexes in the table, concurrent inserts are + disabled in maria_open() + + @return + @retval 0 ok to use concurrent inserts + @retval 1 not ok +*/ + +my_bool _ma_check_status(void *param) +{ + MARIA_HA *info=(MARIA_HA*) param; + /* + The test for w_locks == 1 is here because this thread has already done an + external lock (in other words: w_locks == 1 means no other threads has + a write lock) + */ + DBUG_PRINT("info",("dellink: %ld r_locks: %u w_locks: %u", + (long) info->s->state.dellink, (uint) info->s->r_locks, + (uint) info->s->w_locks)); + return (my_bool) !(info->s->state.dellink == HA_OFFSET_ERROR || + (maria_concurrent_insert == 2 && info->s->r_locks && + info->s->w_locks == 1)); +} + + +/** + @brief write hook at end of trans to store status for all used table + + @Notes + This function must be called under trnman_lock in trnman_end_trn() + because of the following reasons: + - After trnman_end_trn() is called, the current transaction will be + regarded as committed and all used tables state_history will be + visible to other transactions. To do this, we loop over all used + tables and create/update a history entries that contains the correct + state_history for them. +*/ + +my_bool _ma_trnman_end_trans_hook(TRN *trn, my_bool commit, + my_bool active_transactions) +{ + my_bool error= 0; + MARIA_USED_TABLES *tables, *next; + DBUG_ENTER("_ma_trnman_end_trans_hook"); + + for (tables= (MARIA_USED_TABLES*) trn->used_tables; + tables; + tables= next) + { + MARIA_SHARE *share= tables->share; + next= tables->next; + if (commit) + { + MARIA_STATE_HISTORY *history; + + pthread_mutex_lock(&share->intern_lock); + + /* We only have to update history state if something changed */ + if (tables->state_current.changed) + { + if (tables->state_current.no_transid) + { + /* + The change was done without using transid on rows (like in + bulk insert). 
In this case this thread is the only one + that is using the table and all rows will be visble + for all transactions. + */ + _ma_reset_history(share); + } + else + { + if (active_transactions && share->now_transactional && + trnman_exists_active_transactions(share->state_history->trid, + trn->commit_trid, 1)) + { + /* + There exist transactions that are still using the current + share->state_history. Create a new history item for this + commit and add it first in the state_history list. This + ensures that all history items are stored in the list in + decresing trid order. + */ + if (!(history= my_malloc(sizeof(*history), MYF(MY_WME)))) + { + /* purecov: begin inspected */ + error= 1; + pthread_mutex_unlock(&share->intern_lock); + my_free(tables, MYF(0)); + continue; + /* purecov: end */ + } + history->state= share->state_history->state; + history->next= share->state_history; + share->state_history= history; + } + else + { + /* Previous history can't be seen by anyone, reuse old memory */ + history= share->state_history; + DBUG_PRINT("info", ("removing history->trid: %lu new: %lu", + (ulong) history->trid, + (ulong) trn->commit_trid)); + } + + history->state.records+= (tables->state_current.records - + tables->state_start.records); + history->state.checksum+= (tables->state_current.checksum - + tables->state_start.checksum); + history->trid= trn->commit_trid; + + share->state.last_change_trn= trn->commit_trid; + + if (history->next) + { + /* Remove not visible states */ + share->state_history= _ma_remove_not_visible_states(history, 0, 1); + } + DBUG_PRINT("info", ("share: 0x%lx in_trans: %d", + (ulong) share, share->in_trans)); + } + } + share->in_trans--; + pthread_mutex_unlock(&share->intern_lock); + } + else + { +#ifndef DBUG_OFF + /* + We need to keep share->in_trans correct in the debug library + because of the assert in maria_close() + */ + pthread_mutex_lock(&share->intern_lock); + share->in_trans--; + pthread_mutex_unlock(&share->intern_lock); +#endif + } 
+ my_free(tables, MYF(0)); + } + trn->used_tables= 0; + DBUG_RETURN(error); +} + + +/** + Remove table from trnman_list + + @notes + This is used when we unlock a table from a group of locked tables + just before doing a rename or drop table. + + share->internal_lock must be locked when function is called +*/ + +void _ma_remove_table_from_trnman(MARIA_SHARE *share, TRN *trn) +{ + MARIA_USED_TABLES *tables, **prev; + DBUG_ENTER("_ma_remove_table_from_trnman"); + DBUG_PRINT("enter", ("share: 0x%lx in_trans: %d", + (ulong) share, share->in_trans)); + + safe_mutex_assert_owner(&share->intern_lock); + + for (prev= (MARIA_USED_TABLES**) (char*) &trn->used_tables, tables= *prev; + tables; + tables= *prev) + { + if (tables->share == share) + { + *prev= tables->next; + share->in_trans--; + DBUG_PRINT("info", ("in_trans: %d", share->in_trans)); + my_free(tables, MYF(0)); + break; + } + prev= &tables->next; + } + DBUG_VOID_RETURN; +} + + + +/**************************************************************************** + The following functions are called by thr_lock() in threaded applications + for transactional tables. 
+****************************************************************************/ + +/* + Create a copy of the current status for the table + + SYNOPSIS + _ma_get_status() + param Pointer to Myisam handler + concurrent_insert Set to 1 if we are going to do concurrent inserts + (THR_WRITE_CONCURRENT_INSERT was used) +*/ + +void _ma_block_get_status(void* param, my_bool concurrent_insert) +{ + MARIA_HA *info=(MARIA_HA*) param; + DBUG_ENTER("_ma_block_get_status"); + DBUG_PRINT("enter", ("concurrent_insert %d", concurrent_insert)); + + info->row_base_length= info->s->base_length; + info->row_flag= info->s->base.default_row_flag; + if (concurrent_insert) + { + DBUG_ASSERT(info->lock.type == TL_WRITE_CONCURRENT_INSERT); + info->row_flag|= ROW_FLAG_TRANSID; + info->row_base_length+= TRANSID_SIZE; + } + else + { + DBUG_ASSERT(info->lock.type != TL_WRITE_CONCURRENT_INSERT); + } + DBUG_VOID_RETURN; +} + + +my_bool _ma_block_start_trans(void* param) +{ + MARIA_HA *info=(MARIA_HA*) param; + if (info->s->lock_key_trees) + { + /* + Assume for now that this doesn't fail (It can only fail in + out of memory conditions) + TODO: Fix this by having one extra state pre-allocated + */ + return _ma_setup_live_state(info); + } + + /* + Info->trn is set if this table is already handled and we are + called from maria_versioning() + */ + if (info->s->base.born_transactional && !info->trn) + { + /* + Assume for now that this doesn't fail (It can only fail in + out of memory conditions) + */ + return maria_create_trn_hook(info) != 0; + } + return 0; +} + + +void _ma_block_update_status(void *param __attribute__((unused))) +{ +} + +void _ma_block_restore_status(void *param __attribute__((unused))) +{ +} + + +/** + Check if should allow concurrent inserts + + @return + @retval 0 ok to use concurrent inserts + @retval 1 not ok +*/ + +my_bool _ma_block_check_status(void *param __attribute__((unused))) +{ + return (my_bool) 0; +} + + +/* Get status when transactional but not versioned */ + +my_bool 
_ma_block_start_trans_no_versioning(void* param) +{ + MARIA_HA *info=(MARIA_HA*) param; + DBUG_ENTER("_ma_block_get_status_no_version"); + DBUG_ASSERT(info->s->base.born_transactional); + + info->state->changed= 0; /* from _ma_reset_update_flag() */ + if (!info->trn) + { + /* + Assume for now that this doesn't fail (It can only fail in + out of memory conditions) + */ + DBUG_RETURN(maria_create_trn_hook(info)); + } + DBUG_RETURN(0); +} + + +/** + Enable/disable versioning +*/ + +void maria_versioning(MARIA_HA *info, my_bool versioning) +{ + /* For now, this is a hack */ + if (info->s->have_versioning) + { + enum thr_lock_type save_lock_type; + /* Assume is a non threaded application (for now) */ + info->s->lock_key_trees= 0; + /* Set up info->lock.type temporary for _ma_block_get_status() */ + save_lock_type= info->lock.type; + info->lock.type= versioning ? TL_WRITE_CONCURRENT_INSERT : TL_WRITE; + _ma_block_get_status((void*) info, versioning); + info->lock.type= save_lock_type; + info->state= info->state_start= &info->s->state.common; + } +} + + +/** + Update data_file_length to new length + + NOTES + Only used by block records +*/ + +void _ma_set_share_data_file_length(MARIA_SHARE *share, ulonglong new_length) +{ + pthread_mutex_lock(&share->intern_lock); + if (share->state.state.data_file_length < new_length) + share->state.state.data_file_length= new_length; + pthread_mutex_unlock(&share->intern_lock); +} + + +/** + Copy state information that where updated while the table was used + in not transactional mode +*/ + +void _ma_copy_nontrans_state_information(MARIA_HA *info) +{ + info->s->state.state.records= info->state->records; + info->s->state.state.checksum= info->state->checksum; +} + + +void _ma_reset_history(MARIA_SHARE *share) +{ + MARIA_STATE_HISTORY *history, *next; + DBUG_ENTER("_ma_reset_history"); + + share->state_history->trid= 0; /* Visibly by all */ + share->state_history->state= share->state.state; + history= share->state_history->next; + 
share->state_history->next= 0; + + for (; history; history= next) + { + next= history->next; + my_free(history, MYF(0)); + } + DBUG_VOID_RETURN; +} + + +/**************************************************************************** + Virtual functions to check if row is visible +****************************************************************************/ + +/** + Row is always visible + This is for tables without concurrent insert +*/ + +my_bool _ma_row_visible_always(MARIA_HA *info __attribute__((unused))) +{ + return 1; +} + + +/** + Row visibility for non transactional tables with concurrent insert + + @implementation + When we got our table lock, we saved the current + data_file_length. Concurrent inserts always go to the end of the + file. So we can test if the found key references a new record. +*/ + +my_bool _ma_row_visible_non_transactional_table(MARIA_HA *info) +{ + return info->cur_row.lastpos < info->state->data_file_length; +} + + +/** + Row visibility for transactional tables with versioning + + + @TODO + Add test if found key was marked deleted and it was deleted by + us. In that case we should return 0 +*/ + +my_bool _ma_row_visible_transactional_table(MARIA_HA *info) +{ + return trnman_can_read_from(info->trn, info->cur_row.trid); +} diff --git a/storage/maria/ma_state.h b/storage/maria/ma_state.h new file mode 100644 index 00000000000..03ce5c2ea8c --- /dev/null +++ b/storage/maria/ma_state.h @@ -0,0 +1,86 @@ +/* Copyright (C) 2008 Sun AB & Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Struct to store tables in use by one transaction */ + +typedef struct st_maria_status_info +{ + ha_rows records; /* Rows in table */ + ha_rows del; /* Removed rows */ + my_off_t empty; /* lost space in datafile */ + my_off_t key_empty; /* lost space in indexfile */ + my_off_t key_file_length; + my_off_t data_file_length; + ha_checksum checksum; + uint32 changed:1, /* Set if table was changed */ + no_transid:1; /* Set if no transid was set on rows */ +} MARIA_STATUS_INFO; + + +typedef struct st_used_tables { + struct st_used_tables *next; + struct st_maria_share *share; + MARIA_STATUS_INFO state_current; + MARIA_STATUS_INFO state_start; +} MARIA_USED_TABLES; + + +/* Struct to store commit state at different times */ + +typedef struct st_state_history { + struct st_state_history *next; + TrID trid; + MARIA_STATUS_INFO state; +} MARIA_STATE_HISTORY; + + +/* struct to remember history for closed tables */ + +typedef struct st_state_history_closed { + LSN create_rename_lsn; + MARIA_STATE_HISTORY *state_history; +} MARIA_STATE_HISTORY_CLOSED; + + +my_bool _ma_setup_live_state(MARIA_HA *info); +MARIA_STATE_HISTORY *_ma_remove_not_visible_states(MARIA_STATE_HISTORY + *org_history, + my_bool all, + my_bool trman_is_locked); +void _ma_reset_state(MARIA_HA *info); +void _ma_get_status(void* param, my_bool concurrent_insert); +void _ma_update_status(void* param); +void _ma_update_status_with_lock(MARIA_HA *info); +void _ma_restore_status(void *param); +void _ma_copy_status(void* to, void *from); +void _ma_reset_update_flag(void *param, my_bool concurrent_insert); +my_bool _ma_check_status(void *param); +void _ma_block_get_status(void* param, my_bool concurrent_insert); +void _ma_block_update_status(void *param); +void _ma_block_restore_status(void *param); +my_bool 
_ma_block_check_status(void *param); +void maria_versioning(MARIA_HA *info, my_bool versioning); +void _ma_set_share_data_file_length(struct st_maria_share *share, + ulonglong new_length); +void _ma_copy_nontrans_state_information(MARIA_HA *info); +my_bool _ma_trnman_end_trans_hook(TRN *trn, my_bool commit, + my_bool active_transactions); +my_bool _ma_row_visible_always(MARIA_HA *info); +my_bool _ma_row_visible_non_transactional_table(MARIA_HA *info); +my_bool _ma_row_visible_transactional_table(MARIA_HA *info); +void _ma_remove_not_visible_states_with_lock(struct st_maria_share *share, + my_bool all); +void _ma_remove_table_from_trnman(struct st_maria_share *share, TRN *trn); +void _ma_reset_history(struct st_maria_share *share); diff --git a/storage/maria/ma_static.c b/storage/maria/ma_static.c new file mode 100644 index 00000000000..917385f9568 --- /dev/null +++ b/storage/maria/ma_static.c @@ -0,0 +1,109 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + +/* + Static variables for MARIA library. 
All definied here for easy making of + a shared library +*/ + +#ifndef _global_h +#include "maria_def.h" +#include "trnman.h" +#endif + +LIST *maria_open_list=0; +uchar maria_file_magic[]= +{ (uchar) 254, (uchar) 254, (uchar) 9, '\003', }; +uchar maria_pack_file_magic[]= +{ (uchar) 254, (uchar) 254, (uchar) 10, '\001', }; +/* Unique number for this maria instance */ +uchar maria_uuid[MY_UUID_SIZE]; +uint maria_quick_table_bits=9; +ulong maria_block_size= MARIA_KEY_BLOCK_LENGTH; +my_bool maria_flush= 0, maria_single_user= 0; +my_bool maria_delay_key_write= 0, maria_page_checksums= 1; +my_bool maria_inited= FALSE; +my_bool maria_in_ha_maria= FALSE; /* If used from ha_maria or not */ +my_bool maria_recovery_changed_data= 0, maria_recovery_verbose= 0; +pthread_mutex_t THR_LOCK_maria; +#if defined(THREAD) && !defined(DONT_USE_RW_LOCKS) +ulong maria_concurrent_insert= 2; +#else +ulong maria_concurrent_insert= 0; +#endif +my_off_t maria_max_temp_length= MAX_FILE_SIZE; +ulong maria_bulk_insert_tree_size=8192*1024; +ulong maria_data_pointer_size= 4; + +PAGECACHE maria_pagecache_var; +PAGECACHE *maria_pagecache= &maria_pagecache_var; + +PAGECACHE maria_log_pagecache_var; +PAGECACHE *maria_log_pagecache= &maria_log_pagecache_var; +MY_TMPDIR *maria_tmpdir; /* Tempdir for redo */ +char *maria_data_root; +HASH maria_stored_state; +int (*maria_create_trn_hook)(MARIA_HA *); + +/** + @brief when transactionality does not matter we can use this transaction + + Used in external programs like ma_test*, and also internally inside + libmaria when there is no transaction around and the operation isn't + transactional (CREATE/DROP/RENAME/OPTIMIZE/REPAIR). 
+*/ +TRN dummy_transaction_object; + +/* a WT_RESOURCE_TYPE for transactions waiting on a unique key conflict */ +WT_RESOURCE_TYPE ma_rc_dup_unique={ wt_resource_id_memcmp, 0}; + +/* Enough for comparing if number is zero */ +uchar maria_zero_string[]= {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + +/* + read_vec[] is used for converting between P_READ_KEY.. and SEARCH_ + Position is , == , >= , <= , > , < +*/ + +uint32 maria_read_vec[]= +{ + SEARCH_FIND, /* HA_READ_KEY_EXACT */ + SEARCH_FIND | SEARCH_BIGGER, /* HA_READ_KEY_OR_NEXT */ + SEARCH_FIND | SEARCH_SMALLER, /* HA_READ_KEY_OR_PREV */ + SEARCH_NO_FIND | SEARCH_BIGGER, /* HA_READ_AFTER_KEY */ + SEARCH_NO_FIND | SEARCH_SMALLER, /* HA_READ_BEFORE_KEY */ + SEARCH_FIND | SEARCH_PART_KEY, /* HA_READ_PREFIX */ + SEARCH_LAST, /* HA_READ_PREFIX_LAST */ + SEARCH_LAST | SEARCH_SMALLER, /* HA_READ_PREFIX_LAST_OR_PREV */ + MBR_CONTAIN, /* HA_READ_MBR_CONTAIN */ + MBR_INTERSECT, /* HA_READ_MBR_INTERSECT */ + MBR_WITHIN, /* HA_READ_MBR_WITHIN */ + MBR_DISJOINT, /* HA_READ_MBR_DISJOINT */ + MBR_EQUAL /* HA_READ_MBR_EQUAL */ +}; + +uint32 maria_readnext_vec[]= +{ + SEARCH_BIGGER, SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_BIGGER, SEARCH_SMALLER, + SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_SMALLER +}; + +static int always_valid(const char *filename __attribute__((unused))) +{ + return 0; +} + +int (*maria_test_invalid_symlink)(const char *filename)= always_valid; diff --git a/storage/maria/ma_statrec.c b/storage/maria/ma_statrec.c new file mode 100644 index 00000000000..0aa3a3acbc1 --- /dev/null +++ b/storage/maria/ma_statrec.c @@ -0,0 +1,302 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + /* Functions to handle fixed-length-records */ + +#include "maria_def.h" + + +my_bool _ma_write_static_record(MARIA_HA *info, const uchar *record) +{ + uchar temp[8]; /* max pointer length */ + if (info->s->state.dellink != HA_OFFSET_ERROR && + !info->append_insert_at_end) + { + my_off_t filepos=info->s->state.dellink; + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_read(info, &temp[0],info->s->base.rec_reflength, + info->s->state.dellink+1, + MYF(MY_NABP))) + goto err; + info->s->state.dellink= _ma_rec_pos(info->s, temp); + info->state->del--; + info->state->empty-=info->s->base.pack_reclength; + if (info->s->file_write(info, record, info->s->base.reclength, + filepos, MYF(MY_NABP))) + goto err; + } + else + { + if (info->state->data_file_length > info->s->base.max_data_file_length- + info->s->base.pack_reclength) + { + my_errno=HA_ERR_RECORD_FILE_FULL; + return(2); + } + if (info->opt_flag & WRITE_CACHE_USED) + { /* Cash in use */ + if (my_b_write(&info->rec_cache, record, + info->s->base.reclength)) + goto err; + if (info->s->base.pack_reclength != info->s->base.reclength) + { + uint length=info->s->base.pack_reclength - info->s->base.reclength; + bzero(temp,length); + if (my_b_write(&info->rec_cache, temp,length)) + goto err; + } + } + else + { + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_write(info, record, info->s->base.reclength, + info->state->data_file_length, + info->s->write_flag)) + goto err; + if (info->s->base.pack_reclength != 
info->s->base.reclength) + { + uint length=info->s->base.pack_reclength - info->s->base.reclength; + bzero(temp,length); + if (info->s->file_write(info, temp,length, + info->state->data_file_length+ + info->s->base.reclength, + info->s->write_flag)) + goto err; + } + } + info->state->data_file_length+=info->s->base.pack_reclength; + info->s->state.split++; + } + return 0; + err: + return 1; +} + +my_bool _ma_update_static_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec __attribute__ ((unused)), + const uchar *record) +{ + info->rec_cache.seek_not_done=1; /* We have done a seek */ + return (info->s->file_write(info, + record, info->s->base.reclength, + pos, + MYF(MY_NABP)) != 0); +} + + +my_bool _ma_delete_static_record(MARIA_HA *info, + const uchar *record __attribute__ ((unused))) +{ + uchar temp[9]; /* 1+sizeof(uint32) */ + info->state->del++; + info->state->empty+=info->s->base.pack_reclength; + temp[0]= '\0'; /* Mark that record is deleted */ + _ma_dpointer(info->s, temp+1, info->s->state.dellink); + info->s->state.dellink= info->cur_row.lastpos; + info->rec_cache.seek_not_done=1; + return (info->s->file_write(info, temp, 1+info->s->rec_reflength, + info->cur_row.lastpos, MYF(MY_NABP)) != 0); +} + + +my_bool _ma_cmp_static_record(register MARIA_HA *info, + register const uchar *old) +{ + DBUG_ENTER("_ma_cmp_static_record"); + + /* We are going to do changes; dont let anybody disturb */ + dont_break(); /* Dont allow SIGHUP or SIGINT */ + + if (info->opt_flag & WRITE_CACHE_USED) + { + if (flush_io_cache(&info->rec_cache)) + { + DBUG_RETURN(1); + } + info->rec_cache.seek_not_done=1; /* We have done a seek */ + } + + if ((info->opt_flag & READ_CHECK_USED)) + { /* If check isn't disabled */ + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_read(info, info->rec_buff, info->s->base.reclength, + info->cur_row.lastpos, MYF(MY_NABP))) + DBUG_RETURN(1); + if (memcmp(info->rec_buff, old, (uint) info->s->base.reclength)) + { 
+ DBUG_DUMP("read",old,info->s->base.reclength); + DBUG_DUMP("disk",info->rec_buff,info->s->base.reclength); + my_errno=HA_ERR_RECORD_CHANGED; /* Record have changed */ + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos) +{ + DBUG_ENTER("_ma_cmp_static_unique"); + + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_read(info, info->rec_buff, info->s->base.reclength, + pos, MYF(MY_NABP))) + DBUG_RETURN(1); + DBUG_RETURN(_ma_unique_comp(def, record, info->rec_buff, + def->null_are_equal)); +} + + +/* + Read a fixed-length-record + + RETURN + 0 Ok + 1 record delete + -1 on read-error or locking-error +*/ + +int _ma_read_static_record(register MARIA_HA *info, register uchar *record, + MARIA_RECORD_POS pos) +{ + int error; + DBUG_ENTER("_ma_read_static_record"); + + if (pos != HA_OFFSET_ERROR) + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file <= pos && + flush_io_cache(&info->rec_cache)) + DBUG_RETURN(my_errno); + info->rec_cache.seek_not_done=1; /* We have done a seek */ + + error= (int) info->s->file_read(info, record,info->s->base.reclength, + pos, MYF(MY_NABP)); + if (! error) + { + fast_ma_writeinfo(info); + if (!*record) + { + /* Record is deleted */ + DBUG_PRINT("warning", ("Record is deleted")); + DBUG_RETURN((my_errno=HA_ERR_RECORD_DELETED)); + } + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + } + fast_ma_writeinfo(info); /* No such record */ + DBUG_RETURN(my_errno); +} + + +/** + @brief Read record from given position or next record + + @note + When scanning, this function will return HA_ERR_RECORD_DELETED + for deleted rows even if skip_deleted_blocks is set. + The reason for this is to allow the caller to calculate the record + position without having to do call maria_position() for each record. 
+*/ + +int _ma_read_rnd_static_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks) +{ + int locked,error,cache_read; + uint cache_length; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_read_rnd_static_record"); + + cache_read=0; + cache_length=0; + if (info->opt_flag & READ_CACHE_USED) + { /* Cache in use */ + if (filepos == my_b_tell(&info->rec_cache) && + (skip_deleted_blocks || !filepos)) + { + cache_read=1; /* Read record using cache */ + cache_length= (uint) (info->rec_cache.read_end - + info->rec_cache.read_pos); + } + else + info->rec_cache.seek_not_done=1; /* Filepos is changed */ + } + locked=0; + if (info->lock_type == F_UNLCK) + { + if (filepos >= info->state->data_file_length) + { /* Test if new records */ + if (_ma_readinfo(info,F_RDLCK,0)) + DBUG_RETURN(my_errno); + locked=1; + } + else + { /* We don't nead new info */ +#ifndef UNSAFE_LOCKING + if ((! cache_read || share->base.reclength > cache_length) && + share->tot_locks == 0) + { /* record not in cache */ + locked=1; + } +#else + info->tmp_lock_type=F_RDLCK; +#endif + } + } + if (filepos >= info->state->data_file_length) + { + DBUG_PRINT("test",("filepos: %ld (%ld) records: %ld del: %ld", + (long) filepos/share->base.reclength, (long) filepos, + (long) info->state->records, (long) info->state->del)); + fast_ma_writeinfo(info); + DBUG_RETURN(my_errno=HA_ERR_END_OF_FILE); + } + info->cur_row.lastpos= filepos; + info->cur_row.nextpos= filepos+share->base.pack_reclength; + + if (! 
cache_read) /* No cacheing */ + { + error= _ma_read_static_record(info, buf, filepos); + DBUG_RETURN(error); + } + + /* Read record with cacheing */ + error=my_b_read(&info->rec_cache, buf, share->base.reclength); + if (info->s->base.pack_reclength != info->s->base.reclength && !error) + { + uchar tmp[8]; /* Skill fill bytes */ + error=my_b_read(&info->rec_cache, tmp, + info->s->base.pack_reclength - info->s->base.reclength); + } + if (locked) + VOID(_ma_writeinfo(info,0)); /* Unlock keyfile */ + if (!error) + { + if (!buf[0]) + { /* Record is removed */ + DBUG_RETURN(my_errno=HA_ERR_RECORD_DELETED); + } + /* Found and may be updated */ + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + DBUG_RETURN(0); + } + /* my_errno should be set if rec_cache.error == -1 */ + if (info->rec_cache.error != -1 || my_errno == 0) + my_errno=HA_ERR_WRONG_IN_RECORD; + DBUG_RETURN(my_errno); /* Something wrong (EOF?) */ +} diff --git a/storage/maria/ma_test1.c b/storage/maria/ma_test1.c new file mode 100644 index 00000000000..affa3a71634 --- /dev/null +++ b/storage/maria/ma_test1.c @@ -0,0 +1,899 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Testing of the basic functions of a MARIA table */ + +#include "maria_def.h" +#include <my_getopt.h> +#include <m_string.h> +#include "ma_control_file.h" +#include "ma_loghandler.h" +#include "ma_checkpoint.h" +#include "trnman.h" + +extern PAGECACHE *maria_log_pagecache; +extern char *maria_data_root; + +#define MAX_REC_LENGTH 1024 + +static void usage(); + +static int rec_pointer_size=0, flags[50], testflag, checkpoint; +static int key_field=FIELD_SKIP_PRESPACE,extra_field=FIELD_SKIP_ENDSPACE; +static int key_type=HA_KEYTYPE_NUM; +static int create_flag=0; +static ulong blob_length; +static enum data_file_type record_type= DYNAMIC_RECORD; + +static uint insert_count, update_count, remove_count; +static uint pack_keys=0, pack_seg=0, key_length; +static uint unique_key=HA_NOSAME; +static uint die_in_middle_of_transaction; +static my_bool pagecacheing, null_fields, silent, skip_update, opt_unique; +static my_bool verbose, skip_delete, transactional; +static my_bool opt_versioning= 0; +static MARIA_COLUMNDEF recinfo[4]; +static MARIA_KEYDEF keyinfo[10]; +static HA_KEYSEG keyseg[10]; +static HA_KEYSEG uniqueseg[10]; + +static int run_test(const char *filename); +static void get_options(int argc, char *argv[]); +static void create_key(uchar *key,uint rownr); +static void create_record(uchar *record,uint rownr); +static void update_record(uchar *record); + + +/* + These are here only for testing of recovery with undo. 
We are not + including maria_def.h here as this test is also to be an example of + how to use maria outside of the maria directory +*/ + +extern int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index, + enum flush_type flush_type_for_data, + enum flush_type flush_type_for_index); +#define MARIA_FLUSH_DATA 1 + + +int main(int argc,char *argv[]) +{ +#if defined(SAFE_MUTEX) && defined(THREAD) + safe_mutex_deadlock_detector= 1; +#endif + MY_INIT(argv[0]); + get_options(argc,argv); + maria_data_root= (char *)"."; + /* Maria requires that we always have a page cache */ + if (maria_init() || + (init_pagecache(maria_pagecache, maria_block_size * 16, 0, 0, + maria_block_size, MY_WME) == 0) || + ma_control_file_open(TRUE, TRUE) || + (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, MY_WME) == 0) || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS, 0) || + (transactional && (trnman_init(0) || ma_checkpoint_init(0)))) + { + fprintf(stderr, "Error in initialization\n"); + exit(1); + } + if (opt_versioning) + init_thr_lock(); + + exit(run_test("test1")); +} + + +static int run_test(const char *filename) +{ + MARIA_HA *file; + int i,j= 0,error,deleted,rec_length,uniques=0; + uint offset_to_key; + ha_rows found,row_count; + uchar record[MAX_REC_LENGTH],key[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH]; + MARIA_UNIQUEDEF uniquedef; + MARIA_CREATE_INFO create_info; + + if (die_in_middle_of_transaction) + null_fields= 1; + + bzero((char*) recinfo,sizeof(recinfo)); + bzero((char*) &create_info,sizeof(create_info)); + + /* First define 2 columns */ + create_info.null_bytes= 1; + recinfo[0].type= key_field; + recinfo[0].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr : + key_length); + if (key_field == FIELD_VARCHAR) + recinfo[0].length+= HA_VARCHAR_PACKLENGTH(key_length); + recinfo[1].type=extra_field; + recinfo[1].length= (extra_field == FIELD_BLOB ? 
4 + portable_sizeof_char_ptr : 24); + if (extra_field == FIELD_VARCHAR) + recinfo[1].length+= HA_VARCHAR_PACKLENGTH(recinfo[1].length); + recinfo[1].null_bit= null_fields ? 2 : 0; + + if (opt_unique) + { + recinfo[2].type=FIELD_CHECK; + recinfo[2].length=MARIA_UNIQUE_HASH_LENGTH; + } + rec_length= recinfo[0].length + recinfo[1].length + recinfo[2].length + + create_info.null_bytes; + + if (key_type == HA_KEYTYPE_VARTEXT1 && + key_length > 255) + key_type= HA_KEYTYPE_VARTEXT2; + + /* Define a key over the first column */ + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[0].key_alg=HA_KEY_ALG_BTREE; + keyinfo[0].seg[0].type= key_type; + keyinfo[0].seg[0].flag= pack_seg; + keyinfo[0].seg[0].start=1; + keyinfo[0].seg[0].length=key_length; + keyinfo[0].seg[0].null_bit= null_fields ? 2 : 0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].language= default_charset_info->number; + if (pack_seg & HA_BLOB_PART) + { + keyinfo[0].seg[0].bit_start=4; /* Length of blob length */ + } + keyinfo[0].flag = (uint8) (pack_keys | unique_key); + + bzero((uchar*) flags,sizeof(flags)); + if (opt_unique) + { + uint start; + uniques=1; + bzero((char*) &uniquedef,sizeof(uniquedef)); + bzero((char*) uniqueseg,sizeof(uniqueseg)); + uniquedef.seg=uniqueseg; + uniquedef.keysegs=2; + + /* Make a unique over all columns (except first NULL fields) */ + for (i=0, start=1 ; i < 2 ; i++) + { + uniqueseg[i].start=start; + start+=recinfo[i].length; + uniqueseg[i].length=recinfo[i].length; + uniqueseg[i].language= default_charset_info->number; + } + uniqueseg[0].type= key_type; + uniqueseg[0].null_bit= null_fields ? 
2 : 0; + uniqueseg[1].type= HA_KEYTYPE_TEXT; + if (extra_field == FIELD_BLOB) + { + uniqueseg[1].length=0; /* The whole blob */ + uniqueseg[1].bit_start=4; /* long blob */ + uniqueseg[1].flag|= HA_BLOB_PART; + } + else if (extra_field == FIELD_VARCHAR) + { + uniqueseg[1].flag|= HA_VAR_LENGTH_PART; + uniqueseg[1].type= (HA_VARCHAR_PACKLENGTH(recinfo[1].length-1) == 1 ? + HA_KEYTYPE_VARTEXT1 : HA_KEYTYPE_VARTEXT2); + } + } + else + uniques=0; + + offset_to_key= test(null_fields); + if (key_field == FIELD_BLOB || key_field == FIELD_VARCHAR) + offset_to_key+= 2; + + if (!silent) + printf("- Creating maria file\n"); + create_info.max_rows=(ulong) (rec_pointer_size ? + (1L << (rec_pointer_size*8))/40 : + 0); + create_info.transactional= transactional; + if (maria_create(filename, record_type, 1, keyinfo,2+opt_unique,recinfo, + uniques, &uniquedef, &create_info, + create_flag)) + goto err; + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED))) + goto err; + if (!silent) + printf("- Writing key:s\n"); + + if (maria_begin(file)) + goto err; + if (opt_versioning) + maria_versioning(file, 1); + my_errno=0; + row_count=deleted=0; + for (i=49 ; i>=1 ; i-=2 ) + { + if (insert_count-- == 0) + { + if (testflag) + break; + VOID(maria_close(file)); + exit(0); + } + j=i%25 +1; + create_record(record,j); + error=maria_write(file,record); + if (!error) + row_count++; + flags[j]=1; + if (verbose || error) + printf("J= %2d maria_write: %d errno: %d\n", j,error,my_errno); + } + + if (maria_commit(file) || maria_begin(file)) + goto err; + + if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (testflag == 1) + goto end; + + /* Insert 2 rows with null values */ + if (null_fields) + { + create_record(record,0); + error=maria_write(file,record); + if (!error) + row_count++; + if (verbose || error) + printf("J= NULL maria_write: %d errno: %d\n", error,my_errno); + error=maria_write(file,record); + if (!error) + row_count++; + if (verbose || error) + 
printf("J= NULL maria_write: %d errno: %d\n", error,my_errno); + flags[0]=2; + } + + if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (testflag == 2) + { + printf("Terminating after inserts\n"); + goto end; + } + + if (maria_commit(file) || maria_begin(file)) + goto err; + + if (!skip_update) + { + if (opt_unique) + { + if (!silent) + printf("- Checking unique constraint\n"); + create_record(record,j); /* Check last created row */ + if (!maria_write(file,record) || my_errno != HA_ERR_FOUND_DUPP_UNIQUE) + { + printf("unique check failed\n"); + } + } + if (!silent) + printf("- Updating rows\n"); + + /* Update first last row to force extend of file */ + if (maria_rsame(file,read_record,-1)) + { + printf("Can't find last row with maria_rsame\n"); + } + else + { + memcpy(record,read_record,rec_length); + update_record(record); + if (maria_update(file,read_record,record)) + { + printf("Can't update last row: %.*s\n", + keyinfo[0].seg[0].length,read_record+1); + } + } + + /* Read through all rows and update them */ + assert(maria_scan_init(file) == 0); + + found=0; + while ((error= maria_scan(file,read_record)) == 0) + { + if (--update_count == 0) { VOID(maria_close(file)) ; exit(0) ; } + memcpy(record,read_record,rec_length); + update_record(record); + if (maria_update(file,read_record,record)) + { + printf("Can't update row: %.*s, error: %d\n", + keyinfo[0].seg[0].length,record+1,my_errno); + } + found++; + } + if (found != row_count) + printf("Found %ld of %ld rows\n", (ulong) found, (ulong) row_count); + maria_scan_end(file); + } + + if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (testflag == 3) + { + printf("Terminating after updates\n"); + goto end; + } + if (!silent) + printf("- Reopening file\n"); + if (maria_commit(file)) + goto err; + if (maria_close(file)) + goto err; + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED))) + goto err; + if (maria_begin(file)) + goto err; 
+ if (opt_versioning) + maria_versioning(file, 1); + if (!skip_delete) + { + if (!silent) + printf("- Removing keys\n"); + + for (i=0 ; i <= 10 ; i++) + { + /* + If you want to debug the problem in ma_test_recovery with BLOBs + (see @todo there), you can break out of the loop after just one + delete, it is enough, like this: + if (i==1) break; + */ + /* testing */ + if (remove_count-- == 0) + { + fprintf(stderr, + "delete-rows number of rows deleted; Going down hard!\n"); + goto end; + } + j=i*2; + if (!flags[j]) + continue; + create_key(key,j); + my_errno=0; + if ((error = maria_rkey(file, read_record, 0, key, + HA_WHOLE_KEY, HA_READ_KEY_EXACT))) + { + if (verbose || (flags[j] >= 1 || + (error && my_errno != HA_ERR_KEY_NOT_FOUND))) + printf("key: '%.*s' maria_rkey: %3d errno: %3d\n", + (int) key_length,key+offset_to_key,error,my_errno); + } + else + { + error=maria_delete(file,read_record); + if (verbose || error) + printf("key: '%.*s' maria_delete: %3d errno: %3d\n", + (int) key_length, key+offset_to_key, error, my_errno); + if (! 
error) + { + deleted++; + flags[j]--; + } + } + } + } + + if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (testflag == 4) + { + printf("Terminating after deletes\n"); + goto end; + } + + if (!silent) + printf("- Reading rows with key\n"); + record[1]= 0; /* For nicer printf */ + for (i=0 ; i <= 25 ; i++) + { + create_key(key,i); + my_errno=0; + error=maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT); + if (verbose || + (error == 0 && flags[i] == 0 && unique_key) || + (error && (flags[i] != 0 || my_errno != HA_ERR_KEY_NOT_FOUND))) + { + printf("key: '%.*s' maria_rkey: %3d errno: %3d record: %s\n", + (int) key_length,key+offset_to_key,error,my_errno,record+1); + } + } + + if (!silent) + printf("- Reading rows with position\n"); + if (maria_scan_init(file)) + { + fprintf(stderr, "maria_scan_init failed\n"); + goto err; + } + + for (i=1,found=0 ; i <= 30 ; i++) + { + my_errno=0; + if ((error= maria_scan(file, read_record)) == HA_ERR_END_OF_FILE) + { + if (found != row_count-deleted) + printf("Found only %ld of %ld rows\n", (ulong) found, + (ulong) (row_count - deleted)); + break; + } + if (!error) + found++; + if (verbose || (error != 0 && error != HA_ERR_RECORD_DELETED && + error != HA_ERR_END_OF_FILE)) + { + printf("pos: %2d maria_rrnd: %3d errno: %3d record: %s\n", + i-1,error,my_errno,read_record+1); + } + } + maria_scan_end(file); + +end: + if (die_in_middle_of_transaction) + { + /* As commit record is not done, UNDO entries needs to be rolled back */ + switch (die_in_middle_of_transaction) { + case 1: + /* + Flush changed pages go to disk. That will also flush log. Recovery + will skip REDOs and apply UNDOs. + */ + _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_RELEASE, FLUSH_RELEASE); + break; + case 2: + /* + Just flush log. Pages are likely to not be on disk. Recovery will + then execute REDOs and UNDOs. 
+ */ + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + case 3: + /* + Flush nothing. Pages and log are likely to not be on disk. Recovery + will then do nothing. + */ + break; + case 4: + /* + Flush changed data pages go to disk. Changed index pages are not + flushed. Recovery will skip some REDOs and apply UNDOs. + */ + _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE, + FLUSH_RELEASE); + /* + We have to flush log separately as the redo for the last key page + may not be flushed + */ + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + } + printf("Dying on request without maria_commit()/maria_close()\n"); + exit(0); + } + + if (maria_commit(file)) + goto err; + if (maria_close(file)) + goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return (0); +err: + printf("got error: %3d when using maria-database\n",my_errno); + return 1; /* skip warning */ +} + + +static void create_key_part(uchar *key,uint rownr) +{ + if (!unique_key) + rownr&=7; /* Some identical keys */ + if (keyinfo[0].seg[0].type == HA_KEYTYPE_NUM) + { + sprintf((char*) key,"%*d",keyinfo[0].seg[0].length,rownr); + } + else if (keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT1 || + keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT2) + { /* Alpha record */ + /* Create a key that may be easily packed */ + bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 'A' : 'B'); + sprintf((char*) key+keyinfo[0].seg[0].length-2,"%-2d",rownr); + if ((rownr & 7) == 0) + { + /* Change the key to force a unpack of the next key */ + bfill(key+3,keyinfo[0].seg[0].length-5,rownr < 10 ? 'a' : 'b'); + } + } + else + { /* Alpha record */ + if (keyinfo[0].seg[0].flag & HA_SPACE_PACK) + sprintf((char*) key,"%-*d",keyinfo[0].seg[0].length,rownr); + else + { + /* Create a key that may be easily packed */ + bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 
'A' : 'B'); + sprintf((char*) key+keyinfo[0].seg[0].length-2,"%-2d",rownr); + if ((rownr & 7) == 0) + { + /* Change the key to force a unpack of the next key */ + key[1]= (rownr < 10 ? 'a' : 'b'); + } + } + } +} + + +static void create_key(uchar *key,uint rownr) +{ + if (keyinfo[0].seg[0].null_bit) + { + if (rownr == 0) + { + key[0]=1; /* null key */ + key[1]=0; /* For easy print of key */ + return; + } + *key++=0; + } + if (keyinfo[0].seg[0].flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + uint tmp; + create_key_part(key+2,rownr); + tmp=strlen((char*) key+2); + int2store(key,tmp); + } + else + create_key_part(key,rownr); +} + + +static uchar blob_key[MAX_REC_LENGTH]; +static uchar blob_record[MAX_REC_LENGTH+20*20]; + + +static void create_record(uchar *record,uint rownr) +{ + uchar *pos; + bzero((char*) record,MAX_REC_LENGTH); + record[0]=1; /* delete marker */ + if (rownr == 0 && keyinfo[0].seg[0].null_bit) + record[0]|=keyinfo[0].seg[0].null_bit; /* Null key */ + + pos=record+1; + if (recinfo[0].type == FIELD_BLOB) + { + uint tmp; + uchar *ptr; + create_key_part(blob_key,rownr); + tmp=strlen((char*) blob_key); + int4store(pos,tmp); + ptr=blob_key; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + pos+=recinfo[0].length; + } + else if (recinfo[0].type == FIELD_VARCHAR) + { + uint tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1); + create_key_part(pos+pack_length,rownr); + tmp= strlen((char*) pos+pack_length); + if (pack_length == 1) + *(uchar*) pos= (uchar) tmp; + else + int2store(pos,tmp); + pos+= recinfo[0].length; + } + else + { + create_key_part(pos,rownr); + pos+=recinfo[0].length; + } + if (recinfo[1].type == FIELD_BLOB) + { + uint tmp; + uchar *ptr;; + sprintf((char*) blob_record,"... 
row: %d", rownr); + strappend((char*) blob_record,max(MAX_REC_LENGTH-rownr,10),' '); + tmp=strlen((char*) blob_record); + int4store(pos,tmp); + ptr=blob_record; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + } + else if (recinfo[1].type == FIELD_VARCHAR) + { + uint tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1); + sprintf((char*) pos+pack_length, "... row: %d", rownr); + tmp= strlen((char*) pos+pack_length); + if (pack_length == 1) + *pos= (uchar) tmp; + else + int2store(pos,tmp); + } + else + { + sprintf((char*) pos,"... row: %d", rownr); + strappend((char*) pos,recinfo[1].length,' '); + } +} + +/* change row to test re-packing of rows and reallocation of keys */ + +static void update_record(uchar *record) +{ + uchar *pos=record+1; + if (recinfo[0].type == FIELD_BLOB) + { + uchar *column,*ptr; + int length; + length=uint4korr(pos); /* Long blob */ + memcpy_fixed(&column,pos+4,sizeof(char*)); + memcpy(blob_key,column,length); /* Move old key */ + ptr=blob_key; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); /* Store pointer to new key */ + if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM) + default_charset_info->cset->casedn(default_charset_info, + (char*) blob_key, length, + (char*) blob_key, length); + pos+=recinfo[0].length; + } + else if (recinfo[0].type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1); + uint length= pack_length == 1 ? 
(uint) *(uchar*) pos : uint2korr(pos); + default_charset_info->cset->casedn(default_charset_info, + (char*) pos + pack_length, length, + (char*) pos + pack_length, length); + pos+=recinfo[0].length; + } + else + { + if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM) + default_charset_info->cset->casedn(default_charset_info, + (char*) pos, keyinfo[0].seg[0].length, + (char*) pos, keyinfo[0].seg[0].length); + pos+=recinfo[0].length; + } + + if (recinfo[1].type == FIELD_BLOB) + { + uchar *column; + int length; + length=uint4korr(pos); + memcpy_fixed(&column,pos+4,sizeof(char*)); + memcpy(blob_record,column,length); + bfill(blob_record+length,20,'.'); /* Make it larger */ + length+=20; + int4store(pos,length); + column=blob_record; + memcpy_fixed(pos+4,&column,sizeof(char*)); + } + else if (recinfo[1].type == FIELD_VARCHAR) + { + /* Second field is longer than 10 characters */ + uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1); + uint length= pack_length == 1 ? (uint) *(uchar*) pos : uint2korr(pos); + pos= record+ recinfo[1].offset; + bfill(pos+pack_length+length,recinfo[1].length-length-pack_length,'.'); + length=recinfo[1].length-pack_length; + if (pack_length == 1) + *(uchar*) pos= (uchar) length; + else + int2store(pos,length); + } + else + { + bfill(pos+recinfo[1].length-10,10,'.'); + } +} + + +static struct my_option my_long_options[] = +{ + {"checkpoint", 'H', "Checkpoint at specified stage", (uchar**) &checkpoint, + (uchar**) &checkpoint, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"checksum", 'c', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', "Undocumented", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"delete-rows", 'd', "Abort after this many rows has been deleted", + (uchar**) &remove_count, (uchar**) &remove_count, 0, GET_UINT, REQUIRED_ARG, + 1000, 0, 0, 0, 0, 0}, + {"help", '?', "Display help and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"insert-rows", 
'i', "Undocumented", (uchar**) &insert_count, + (uchar**) &insert_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0}, + {"key-alpha", 'a', "Use a key of type HA_KEYTYPE_TEXT", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-binary-pack", 'B', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-blob", 'b', "Undocumented", + (uchar**) &blob_length, (uchar**) &blob_length, + 0, GET_ULONG, OPT_ARG, 0, 0, 0, 0, 0, 0}, + {"key-cache", 'K', "Undocumented", (uchar**) &pagecacheing, + (uchar**) &pagecacheing, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-length", 'k', "Undocumented", (uchar**) &key_length, + (uchar**) &key_length, 0, GET_UINT, REQUIRED_ARG, 6, 0, 0, 0, 0, 0}, + {"key-multiple", 'm', "Don't use unique keys", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-prefix_pack", 'P', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-space_pack", 'p', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-varchar", 'w', "Test VARCHAR keys", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"null-fields", 'N', "Define fields with NULL", + (uchar**) &null_fields, (uchar**) &null_fields, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, + {"row-fixed-size", 'S', "Fixed size records", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"rows-in-block", 'M', "Store rows in block format", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"row-pointer-size", 'R', "Undocumented", (uchar**) &rec_pointer_size, + (uchar**) &rec_pointer_size, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"silent", 's', "Undocumented", + (uchar**) &silent, (uchar**) &silent, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, + 0, 0}, + {"skip-delete", 'D', "Don't test deletes", (uchar**) &skip_delete, + (uchar**) &skip_delete, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"skip-update", 'U', "Don't test updates", (uchar**) &skip_update, + (uchar**) &skip_update, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"testflag", 't', "Stop 
test at specified stage", (uchar**) &testflag, + (uchar**) &testflag, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"test-undo", 'A', + "Abort hard. Used for testing recovery with undo", + (uchar**) &die_in_middle_of_transaction, + (uchar**) &die_in_middle_of_transaction, + 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"transactional", 'T', + "Test in transactional mode. (Only works with block format)", + (uchar**) &transactional, (uchar**) &transactional, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, + {"unique", 'E', "Check unique handling", (uchar**) &opt_unique, + (uchar**) &opt_unique, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"update-rows", 'u', "Max number of rows to update", (uchar**) &update_count, + (uchar**) &update_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Be more verbose", (uchar**) &verbose, + (uchar**) &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version number and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"versioning", 'C', "Use row versioning (only works with block format)", + (uchar**) &opt_versioning, (uchar**) &opt_versioning, 0, GET_BOOL, + NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument __attribute__((unused))) +{ + switch(optid) { + case 'a': + key_type= HA_KEYTYPE_TEXT; + break; + case 'c': + create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM; + break; + case 'R': /* Length of record pointer */ + if (rec_pointer_size > 3) + rec_pointer_size=0; + break; + case 'P': + pack_keys= HA_PACK_KEY; /* Use prefix compression */ + break; + case 'B': + pack_keys= HA_BINARY_PACK_KEY; /* Use binary compression */ + break; + case 'M': + record_type= BLOCK_RECORD; + break; + case 'S': + if (key_field == FIELD_VARCHAR) + { + create_flag=0; /* Static sized varchar */ + record_type= STATIC_RECORD; + } + else if 
(key_field != FIELD_BLOB) + { + key_field=FIELD_NORMAL; /* static-size record */ + extra_field=FIELD_NORMAL; + record_type= STATIC_RECORD; + } + break; + case 'p': + pack_keys=HA_PACK_KEY; /* Use prefix + space packing */ + pack_seg=HA_SPACE_PACK; + key_type=HA_KEYTYPE_TEXT; + break; + case 'm': + unique_key=0; + break; + case 'b': + key_field=FIELD_BLOB; /* blob key */ + extra_field= FIELD_BLOB; + pack_seg|= HA_BLOB_PART; + key_type= HA_KEYTYPE_VARTEXT1; + if (record_type == STATIC_RECORD) + record_type= DYNAMIC_RECORD; + break; + case 'k': + if (key_length < 4 || key_length > HA_MAX_KEY_LENGTH) + { + fprintf(stderr,"Wrong key length\n"); + exit(1); + } + break; + case 'w': + key_field=FIELD_VARCHAR; /* varchar keys */ + extra_field= FIELD_VARCHAR; + key_type= HA_KEYTYPE_VARTEXT1; + pack_seg|= HA_VAR_LENGTH_PART; + if (record_type == STATIC_RECORD) + record_type= DYNAMIC_RECORD; + break; + case 'K': /* Use key cacheing */ + pagecacheing=1; + break; + case 'V': + printf("test1 Ver 1.2 \n"); + exit(0); + case '#': + DBUG_PUSH(argument); + break; + case '?': + usage(); + exit(1); + } + return 0; +} + + +/* Read options */ + +static void get_options(int argc, char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(ho_error); + if (transactional) + record_type= BLOCK_RECORD; + return; +} /* get options */ + + +static void usage() +{ + printf("Usage: %s [options]\n\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} diff --git a/storage/maria/ma_test2.c b/storage/maria/ma_test2.c new file mode 100644 index 00000000000..9e2f32f767b --- /dev/null +++ b/storage/maria/ma_test2.c @@ -0,0 +1,1246 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Test av isam-databas: stor test */ + +#ifndef USE_MY_FUNC /* We want to be able to dbug this !! */ +#define USE_MY_FUNC +#endif +#include "maria_def.h" +#include "trnman.h" +#include <m_ctype.h> +#include <my_bit.h> +#include "ma_checkpoint.h" + +#define STANDARD_LENGTH 37 +#define MARIA_KEYS 6 +#define MAX_PARTS 4 +#if !defined(MSDOS) && !defined(labs) +#define labs(a) abs(a) +#endif + +static void get_options(int argc, char *argv[]); +static uint rnd(uint max_value); +static void fix_length(uchar *record,uint length); +static void put_blob_in_record(uchar *blob_pos,char **blob_buffer, + ulong *length); +static void copy_key(MARIA_HA *info, uint inx, uchar *record, uchar *key); + +static int verbose= 0, testflag= 0, first_key= 0, async_io= 0, pagecacheing= 0; +static int write_cacheing= 0, do_locking= 0, rec_pointer_size= 0; +static int silent= 0, opt_quick_mode= 0, transactional= 0, skip_update= 0; +static int die_in_middle_of_transaction= 0, pack_fields= 1; +static int pack_seg= HA_SPACE_PACK, pack_type= HA_PACK_KEY, remove_count= -1; +static int create_flag= 0, srand_arg= 0, checkpoint= 0; +static my_bool opt_versioning= 0; +static uint use_blob= 0, update_count= 0; +static ulong pagecache_size=8192*32; +static enum data_file_type record_type= DYNAMIC_RECORD; + +static uint keys=MARIA_KEYS,recant=1000; +static uint16 key1[1001],key3[5000]; +static uchar record[300],record2[300],key[100],key2[100]; +static uchar read_record[300],read_record2[300],read_record3[300]; +static HA_KEYSEG 
glob_keyseg[MARIA_KEYS][MAX_PARTS]; + + /* Test program */ + +int main(int argc, char *argv[]) +{ + uint i; + int j,n1,n2,n3,error,k; + uint write_count,update,dupp_keys,opt_delete,start,length,blob_pos, + reclength,ant,found_parts; + my_off_t lastpos; + ha_rows range_records,records; + MARIA_HA *file; + MARIA_KEYDEF keyinfo[10]; + MARIA_COLUMNDEF recinfo[10]; + MARIA_INFO info; + const char *filename; + char *blob_buffer; + MARIA_CREATE_INFO create_info; + +#if defined(SAFE_MUTEX) && defined(THREAD) + safe_mutex_deadlock_detector= 1; +#endif + MY_INIT(argv[0]); + + filename= "test2"; + get_options(argc,argv); + if (! async_io) + my_disable_async_io=1; + + /* If we sync or not have no affect on this test */ + my_disable_sync= 1; + + maria_data_root= (char *)"."; + /* Maria requires that we always have a page cache */ + if (maria_init() || + (init_pagecache(maria_pagecache, pagecache_size, 0, 0, + maria_block_size, MY_WME) == 0) || + ma_control_file_open(TRUE, TRUE) || + (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, MY_WME) == 0) || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS, 0) || + (transactional && (trnman_init(0) || ma_checkpoint_init(0)))) + { + fprintf(stderr, "Error in initialization"); + exit(1); + } + if (opt_versioning) + init_thr_lock(); + + reclength=STANDARD_LENGTH+60+(use_blob ? 
8 : 0); + blob_pos=STANDARD_LENGTH+60; + keyinfo[0].seg= &glob_keyseg[0][0]; + keyinfo[0].seg[0].start=0; + keyinfo[0].seg[0].length=6; + keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[0].seg[0].language= default_charset_info->number; + keyinfo[0].seg[0].flag=(uint8) pack_seg; + keyinfo[0].seg[0].null_bit=0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].key_alg=HA_KEY_ALG_BTREE; + keyinfo[0].keysegs=1; + keyinfo[0].flag = pack_type; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[1].seg= &glob_keyseg[1][0]; + keyinfo[1].seg[0].start=7; + keyinfo[1].seg[0].length=6; + keyinfo[1].seg[0].type=HA_KEYTYPE_BINARY; + keyinfo[1].seg[0].flag=0; + keyinfo[1].seg[0].null_bit=0; + keyinfo[1].seg[0].null_pos=0; + keyinfo[1].seg[1].start=0; /* two part key */ + keyinfo[1].seg[1].length=6; + keyinfo[1].seg[1].type=HA_KEYTYPE_NUM; + keyinfo[1].seg[1].flag=HA_REVERSE_SORT; + keyinfo[1].seg[1].null_bit=0; + keyinfo[1].seg[1].null_pos=0; + keyinfo[1].key_alg=HA_KEY_ALG_BTREE; + keyinfo[1].keysegs=2; + keyinfo[1].flag =0; + keyinfo[1].block_length= MARIA_MIN_KEY_BLOCK_LENGTH; /* Diff blocklength */ + keyinfo[2].seg= &glob_keyseg[2][0]; + keyinfo[2].seg[0].start=12; + keyinfo[2].seg[0].length=8; + keyinfo[2].seg[0].type=HA_KEYTYPE_BINARY; + keyinfo[2].seg[0].flag=HA_REVERSE_SORT; + keyinfo[2].seg[0].null_bit=0; + keyinfo[2].seg[0].null_pos=0; + keyinfo[2].key_alg=HA_KEY_ALG_BTREE; + keyinfo[2].keysegs=1; + keyinfo[2].flag =HA_NOSAME; + keyinfo[2].block_length= 0; /* Default block length */ + keyinfo[3].seg= &glob_keyseg[3][0]; + keyinfo[3].seg[0].start=0; + keyinfo[3].seg[0].length=reclength-(use_blob ? 
8 : 0); + keyinfo[3].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[3].seg[0].language=default_charset_info->number; + keyinfo[3].seg[0].flag=(uint8) pack_seg; + keyinfo[3].seg[0].null_bit=0; + keyinfo[3].seg[0].null_pos=0; + keyinfo[3].key_alg=HA_KEY_ALG_BTREE; + keyinfo[3].keysegs=1; + keyinfo[3].flag = pack_type; + keyinfo[3].block_length= 0; /* Default block length */ + keyinfo[4].seg= &glob_keyseg[4][0]; + keyinfo[4].seg[0].start=0; + keyinfo[4].seg[0].length=5; + keyinfo[4].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[4].seg[0].language=default_charset_info->number; + keyinfo[4].seg[0].flag=0; + keyinfo[4].seg[0].null_bit=0; + keyinfo[4].seg[0].null_pos=0; + keyinfo[4].key_alg=HA_KEY_ALG_BTREE; + keyinfo[4].keysegs=1; + keyinfo[4].flag = pack_type; + keyinfo[4].block_length= 0; /* Default block length */ + keyinfo[5].seg= &glob_keyseg[5][0]; + keyinfo[5].seg[0].start=0; + keyinfo[5].seg[0].length=4; + keyinfo[5].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[5].seg[0].language=default_charset_info->number; + keyinfo[5].seg[0].flag=pack_seg; + keyinfo[5].seg[0].null_bit=0; + keyinfo[5].seg[0].null_pos=0; + keyinfo[5].key_alg=HA_KEY_ALG_BTREE; + keyinfo[5].keysegs=1; + keyinfo[5].flag = pack_type; + keyinfo[5].block_length= 0; /* Default block length */ + + recinfo[0].type=pack_fields ? FIELD_SKIP_PRESPACE : 0; + recinfo[0].length=7; + recinfo[0].null_bit=0; + recinfo[0].null_pos=0; + recinfo[1].type=pack_fields ? FIELD_SKIP_PRESPACE : 0; + recinfo[1].length=5; + recinfo[1].null_bit=0; + recinfo[1].null_pos=0; + recinfo[2].type=pack_fields ? FIELD_SKIP_PRESPACE : 0; + recinfo[2].length=9; + recinfo[2].null_bit=0; + recinfo[2].null_pos=0; + recinfo[3].type=FIELD_NORMAL; + recinfo[3].length=STANDARD_LENGTH-7-5-9-4; + recinfo[3].null_bit=0; + recinfo[3].null_pos=0; + recinfo[4].type=pack_fields ? FIELD_SKIP_ZERO : 0; + recinfo[4].length=4; + recinfo[4].null_bit=0; + recinfo[4].null_pos=0; + recinfo[5].type=pack_fields ? 
FIELD_SKIP_ENDSPACE : 0; + recinfo[5].length=60; + recinfo[5].null_bit=0; + recinfo[5].null_pos=0; + if (use_blob) + { + recinfo[6].type=FIELD_BLOB; + recinfo[6].length=4+portable_sizeof_char_ptr; + recinfo[6].null_bit=0; + recinfo[6].null_pos=0; + } + + write_count=update=dupp_keys=opt_delete=0; + blob_buffer=0; + + for (i=1000 ; i>0 ; i--) key1[i]=0; + for (i=4999 ; i>0 ; i--) key3[i]=0; + + if (!silent) + printf("- Creating maria-file\n"); + file= 0; + bzero((char*) &create_info,sizeof(create_info)); + create_info.max_rows=(ha_rows) (rec_pointer_size ? + (1L << (rec_pointer_size*8))/ + reclength : 0); + create_info.reloc_rows=(ha_rows) 100; + create_info.transactional= transactional; + if (maria_create(filename, record_type, keys,&keyinfo[first_key], + use_blob ? 7 : 6, &recinfo[0], + 0,(MARIA_UNIQUEDEF*) 0, + &create_info,create_flag)) + goto err; + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED))) + goto err; + maria_begin(file); + if (opt_versioning) + maria_versioning(file, 1); + if (testflag == 1) + goto end; + if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + if (!silent) + printf("- Writing key:s\n"); + if (do_locking) + maria_lock_database(file,F_WRLCK); + if (write_cacheing) + maria_extra(file,HA_EXTRA_WRITE_CACHE,0); + if (opt_quick_mode) + maria_extra(file,HA_EXTRA_QUICK,0); + + for (i=0 ; i < recant ; i++) + { + ulong blob_length; + n1=rnd(1000); n2=rnd(100); n3=rnd(5000); + sprintf((char*) record,"%6d:%4d:%8d:Pos: %4d ",n1,n2,n3,write_count); + int4store(record+STANDARD_LENGTH-4,(long) i); + fix_length(record,(uint) STANDARD_LENGTH+rnd(60)); + put_blob_in_record(record+blob_pos,&blob_buffer, &blob_length); + DBUG_PRINT("test",("record: %d blob_length: %lu", i, blob_length)); + + if (maria_write(file,record)) + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0) + { + printf("Error: %d in write at record: %d\n",my_errno,i); + goto err; + } + if (verbose) printf(" Double key: %d at record# %d\n", 
n3, i); + } + else + { + if (key3[n3] == 1 && first_key <3 && first_key+keys >= 3) + { + printf("Error: Didn't get error when writing second key: '%8d'\n",n3); + goto err; + } + write_count++; key1[n1]++; key3[n3]=1; + } + + /* Check if we can find key without flushing database */ + if (i % 10 == 0) + { + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + if (!j) + for (j=999 ; j>0 && key1[j] == 0 ; j--) ; + sprintf((char*) key,"%6d",j); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + { + printf("Test in loop: Can't find key: \"%s\"\n",key); + goto err; + } + } + } + if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (write_cacheing) + { + if (maria_extra(file,HA_EXTRA_NO_CACHE,0)) + { + puts("got error from maria_extra(HA_EXTRA_NO_CACHE)"); + goto err; + } + } + + if (testflag == 2) + goto end; + +#ifdef REMOVE_WHEN_WE_HAVE_RESIZE + if (pagecacheing) + resize_pagecache(maria_pagecache, maria_block_size, + pagecache_size * 2, 0, 0); +#endif + if (!silent) + printf("- Delete\n"); + if (srand_arg) + srand(srand_arg); + if (!update_count) + update_count= recant/10; + + for (i=0 ; i < update_count ; i++) + { + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + if (j != 0) + { + sprintf((char*) key,"%6d",j); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + { + printf("can't find key1: \"%s\"\n",key); + goto err; + } + if (bcmp(read_record+keyinfo[0].seg[0].start, + key, keyinfo[0].seg[0].length)) + { + printf("Found wrong record when searching for key: \"%s\"\n",key); + goto err; + } + if (opt_delete == (uint) remove_count) /* While testing */ + goto end; + if (maria_delete(file,read_record)) + { + printf("error: %d; can't delete record: \"%s\"\n", my_errno,read_record); + goto err; + } + opt_delete++; + key1[atoi((char*) read_record+keyinfo[0].seg[0].start)]--; + key3[atoi((char*) read_record+keyinfo[2].seg[0].start)]=0; + } + else + { + puts("Warning: Skipping delete test 
because no dupplicate keys"); + break; + } + } + if (testflag == 3) + goto end; + if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (!silent) + printf("- Update\n"); + if (srand_arg) + srand(srand_arg); + if (!update_count) + update_count= recant/10; + + for (i=0 ; i < update_count ; i++) + { + n1=rnd(1000); n2=rnd(100); n3=rnd(5000); + sprintf((char*) record2,"%6d:%4d:%8d:XXX: %4d ",n1,n2,n3,update); + int4store(record2+STANDARD_LENGTH-4,(long) i); + fix_length(record2,(uint) STANDARD_LENGTH+rnd(60)); + + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + if (j != 0) + { + sprintf((char*) key,"%6d",j); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + { + printf("can't find key1: \"%s\"\n", (char*) key); + goto err; + } + if (bcmp(read_record+keyinfo[0].seg[0].start, + key, keyinfo[0].seg[0].length)) + { + printf("Found wrong record when searching for key: \"%s\"; Found \"%.*s\"\n", + key, keyinfo[0].seg[0].length, + read_record+keyinfo[0].seg[0].start); + goto err; + } + if (use_blob) + { + ulong blob_length; + if (i & 1) + put_blob_in_record(record2+blob_pos,&blob_buffer, &blob_length); + else + bmove(record2+blob_pos, read_record+blob_pos, 4 + sizeof(char*)); + } + if (skip_update) + continue; + if (maria_update(file,read_record,record2)) + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0) + { + printf("error: %d; can't update:\nFrom: \"%s\"\nTo: \"%s\"\n", + my_errno,read_record,record2); + goto err; + } + if (verbose) + printf("Double key when tried to update:\nFrom: \"%s\"\nTo: \"%s\"\n",record,record2); + } + else + { + key1[atoi((char*) read_record+keyinfo[0].seg[0].start)]--; + key3[atoi((char*) read_record+keyinfo[2].seg[0].start)]=0; + key1[n1]++; key3[n3]=1; + update++; + } + } + } + if (testflag == 4) + goto end; + if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + for (i=999, dupp_keys=j=0 ; i>0 ; i--) + { + if (key1[i] > dupp_keys) + { + 
dupp_keys=key1[i]; j=i; + } + } + sprintf((char*) key,"%6d",j); + start=keyinfo[0].seg[0].start; + length=keyinfo[0].seg[0].length; + if (dupp_keys) + { + if (!silent) + printf("- Same key: first - next -> last - prev -> first\n"); + DBUG_PRINT("progpos",("first - next -> last - prev -> first")); + if (verbose) printf(" Using key: \"%s\" Keys: %d\n",key,dupp_keys); + + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + if (maria_rsame(file,read_record2,-1)) + goto err; + if (memcmp(read_record,read_record2,reclength) != 0) + { + printf("maria_rsame didn't find same record\n"); + goto err; + } + info.recpos=maria_position(file); + if (maria_rfirst(file,read_record2,0) || + maria_rsame_with_pos(file,read_record2,0,info.recpos) || + memcmp(read_record,read_record2,reclength) != 0) + { + printf("maria_rsame_with_pos didn't find same record\n"); + goto err; + } + { + int skr; + info.recpos= maria_position(file); + skr= maria_rnext(file,read_record2,0); + if ((skr && my_errno != HA_ERR_END_OF_FILE) || + maria_rprev(file,read_record2,0) || + memcmp(read_record,read_record2,reclength) != 0 || + info.recpos != maria_position(file)) + { + printf("maria_rsame_with_pos lost position\n"); + goto err; + } + } + ant=1; + while (maria_rnext(file,read_record2,0) == 0 && + memcmp(read_record2+start,key,length) == 0) ant++; + if (ant != dupp_keys) + { + printf("next: Found: %d keys of %d\n",ant,dupp_keys); + goto err; + } + ant=0; + while (maria_rprev(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys) + { + printf("prev: Found: %d records of %d\n",ant,dupp_keys); + goto err; + } + + /* Check of maria_rnext_same */ + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + ant=1; + while (!maria_rnext_same(file,read_record3) && ant < dupp_keys+10) + ant++; + if (ant != dupp_keys || my_errno != HA_ERR_END_OF_FILE) + { + printf("maria_rnext_same: Found: %d records of 
%d\n",ant,dupp_keys); + goto err; + } + } + + if (!silent) + printf("- All keys: first - next -> last - prev -> first\n"); + DBUG_PRINT("progpos",("All keys: first - next -> last - prev -> first")); + ant=1; + if (maria_rfirst(file,read_record,0)) + { + printf("Can't find first record\n"); + goto err; + } + while ((error=maria_rnext(file,read_record3,0)) == 0 && ant < write_count+10) + ant++; + if (ant != write_count - opt_delete || error != HA_ERR_END_OF_FILE) + { + printf("next: I found: %d records of %d (error: %d)\n", + ant, write_count - opt_delete, error); + goto err; + } + if (maria_rlast(file,read_record2,0) || + bcmp(read_record2,read_record3,reclength)) + { + printf("Can't find last record\n"); + DBUG_DUMP("record2", read_record2, reclength); + DBUG_DUMP("record3", read_record3, reclength); + goto err; + } + ant=1; + while (maria_rprev(file,read_record3,0) == 0 && ant < write_count+10) + ant++; + if (ant != write_count - opt_delete) + { + printf("prev: I found: %d records of %d\n",ant,write_count); + goto err; + } + if (bcmp(read_record,read_record3,reclength)) + { + printf("Can't find first record\n"); + goto err; + } + + if (!silent) + printf("- Test if: Read first - next - prev - prev - next == first\n"); + DBUG_PRINT("progpos",("- Read first - next - prev - prev - next == first")); + if (maria_rfirst(file,read_record,0) || + maria_rnext(file,read_record3,0) || + maria_rprev(file,read_record3,0) || + maria_rprev(file,read_record3,0) == 0 || + maria_rnext(file,read_record3,0)) + goto err; + if (bcmp(read_record,read_record3,reclength) != 0) + printf("Can't find first record\n"); + + if (!silent) + printf("- Test if: Read last - prev - next - next - prev == last\n"); + DBUG_PRINT("progpos",("Read last - prev - next - next - prev == last")); + if (maria_rlast(file,read_record2,0) || + maria_rprev(file,read_record3,0) || + maria_rnext(file,read_record3,0) || + maria_rnext(file,read_record3,0) == 0 || + maria_rprev(file,read_record3,0)) + goto err; + if 
(bcmp(read_record2,read_record3,reclength)) + printf("Can't find last record\n"); +#ifdef NOT_ANYMORE + if (!silent) + puts("- Test read key-part"); + strmov(key2,key); + for(i=strlen(key2) ; i-- > 1 ;) + { + key2[i]=0; + + /* The following row is just to catch some bugs in the key code */ + bzero((char*) file->lastkey,file->s->base.max_key_length*2); + if (maria_rkey(file,read_record,0,key2,(uint) i,HA_READ_PREFIX)) + goto err; + if (bcmp(read_record+start,key,(uint) i)) + { + puts("Didn't find right record"); + goto err; + } + } +#endif + if (dupp_keys > 2) + { + if (!silent) + printf("- Read key (first) - next - delete - next -> last\n"); + DBUG_PRINT("progpos",("first - next - delete - next -> last")); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + if (maria_rnext(file,read_record3,0)) goto err; + if (maria_delete(file,read_record3)) goto err; + opt_delete++; + ant=1; + while (maria_rnext(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-1) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-1); + goto err; + } + } + if (dupp_keys>4) + { + if (!silent) + printf("- Read last of key - prev - delete - prev -> first\n"); + DBUG_PRINT("progpos",("last - prev - delete - prev -> first")); + if (maria_rprev(file,read_record3,0)) goto err; + if (maria_rprev(file,read_record3,0)) goto err; + if (maria_delete(file,read_record3)) goto err; + opt_delete++; + ant=1; + while (maria_rprev(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-2) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-2); + goto err; + } + } + if (dupp_keys > 6) + { + if (!silent) + printf("- Read first - delete - next -> last\n"); + DBUG_PRINT("progpos",("first - delete - next -> last")); + if (maria_rkey(file,read_record3,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + if (maria_delete(file,read_record3)) goto err; + 
opt_delete++; + ant=1; + if (maria_rnext(file,read_record,0)) + goto err; /* Skall finnas poster */ + while (maria_rnext(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-3) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-3); + goto err; + } + + if (!silent) + printf("- Read last - delete - prev -> first\n"); + DBUG_PRINT("progpos",("last - delete - prev -> first")); + if (maria_rprev(file,read_record3,0)) goto err; + if (maria_delete(file,read_record3)) goto err; + opt_delete++; + ant=0; + while (maria_rprev(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-4) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-4); + goto err; + } + } + + if (!silent) + puts("- Test if: Read rrnd - same"); + DBUG_PRINT("progpos",("Read rrnd - same")); + assert(maria_scan_init(file) == 0); + for (i=0 ; i < write_count ; i++) + { + int tmp; + if ((tmp= maria_scan(file,read_record)) && + tmp != HA_ERR_END_OF_FILE && + tmp != HA_ERR_RECORD_DELETED) + { + printf("Got error %d when scanning table\n", tmp); + break; + } + if (!tmp) + { + /* Remember position to last found row */ + info.recpos= maria_position(file); + bmove(read_record2,read_record,reclength); + } + } + maria_scan_end(file); + if (i != write_count && i != write_count - opt_delete) + { + printf("Found wrong number of rows while scanning table\n"); + goto err; + } + + if (maria_rsame_with_pos(file,read_record,0,info.recpos)) + goto err; + if (bcmp(read_record,read_record2,reclength) != 0) + { + printf("maria_rsame_with_pos didn't find same record\n"); + goto err; + } + + for (i=min(2,keys) ; i-- > 0 ;) + { + if (maria_rsame(file,read_record2,(int) i)) goto err; + if (bcmp(read_record,read_record2,reclength) != 0) + { + printf("maria_rsame didn't find same record\n"); + goto err; + } + } + if (!silent) + puts("- Test maria_records_in_range"); + 
maria_status(file,&info,HA_STATUS_VARIABLE); + for (i=0 ; i < info.keys ; i++) + { + key_range min_key, max_key; + if (maria_rfirst(file,read_record,(int) i) || + maria_rlast(file,read_record2,(int) i)) + goto err; + copy_key(file,(uint) i, read_record, key); + copy_key(file,(uint) i, read_record2, key2); + min_key.key= key; + min_key.keypart_map= HA_WHOLE_KEY; + min_key.flag= HA_READ_KEY_EXACT; + max_key.key= key2; + max_key.keypart_map= HA_WHOLE_KEY; + max_key.flag= HA_READ_AFTER_KEY; + + range_records= maria_records_in_range(file,(int) i, &min_key, &max_key); + if (range_records < info.records*8/10 || + range_records > info.records*12/10) + { + printf("maria_records_range returned %ld; Should be about %ld\n", + (long) range_records,(long) info.records); + goto err; + } + if (verbose) + { + printf("maria_records_range returned %ld; Exact is %ld (diff: %4.2g %%)\n", + (long) range_records, (long) info.records, + labs((long) range_records - (long) info.records)*100.0/ + info.records); + } + } + for (i=0 ; i < 5 ; i++) + { + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + for (k=rnd(1000)+1 ; k>0 && key1[k] == 0 ; k--) ; + if (j != 0 && k != 0) + { + key_range min_key, max_key; + if (j > k) + swap_variables(int, j, k); + sprintf((char*) key,"%6d",j); + sprintf((char*) key2,"%6d",k); + + min_key.key= key; + min_key.keypart_map= HA_WHOLE_KEY; + min_key.flag= HA_READ_AFTER_KEY; + max_key.key= key2; + max_key.keypart_map= HA_WHOLE_KEY; + max_key.flag= HA_READ_BEFORE_KEY; + range_records= maria_records_in_range(file, 0, &min_key, &max_key); + records=0; + for (j++ ; j < k ; j++) + records+=key1[j]; + if ((long) range_records < (long) records*7/10-2 || + (long) range_records > (long) records*14/10+2) + { + printf("maria_records_range for key: %d returned %lu; Should be about %lu\n", + i, (ulong) range_records, (ulong) records); + goto err; + } + if (verbose && records) + { + printf("maria_records_range returned %lu; Exact is %lu (diff: %4.2g %%)\n", + (ulong) 
range_records, (ulong) records, + labs((long) range_records-(long) records)*100.0/records); + + } + } + } + + if (!silent) + printf("- maria_info\n"); + maria_status(file,&info,HA_STATUS_VARIABLE | HA_STATUS_CONST); + if (info.records != write_count-opt_delete || info.deleted > opt_delete + update + || info.keys != keys) + { + puts("Wrong info from maria_info"); + printf("Got: records: %lu delete: %lu i_keys: %d\n", + (ulong) info.records, (ulong) info.deleted, info.keys); + goto err; + } + if (verbose) + { + char buff[80]; + get_date(buff,3,info.create_time); + printf("info: Created %s\n",buff); + get_date(buff,3,info.check_time); + printf("info: checked %s\n",buff); + get_date(buff,3,info.update_time); + printf("info: Modified %s\n",buff); + } + + maria_panic(HA_PANIC_WRITE); + maria_panic(HA_PANIC_READ); + if (maria_is_changed(file)) + puts("Warning: maria_is_changed reported that datafile was changed"); + + if (!silent) + printf("- maria_extra(CACHE) + maria_rrnd.... + maria_extra(NO_CACHE)\n"); + if (maria_reset(file) || maria_extra(file,HA_EXTRA_CACHE,0)) + { + if (do_locking || (!use_blob && !pack_fields)) + { + puts("got error from maria_extra(HA_EXTRA_CACHE)"); + goto err; + } + } + ant=0; + assert(maria_scan_init(file) == 0); + while ((error= maria_scan(file,record)) != HA_ERR_END_OF_FILE && + ant < write_count + 10) + ant+= error ? 0 : 1; + maria_scan_end(file); + if (ant != write_count-opt_delete) + { + printf("scan with cache: I can only find: %d records of %d\n", + ant,write_count-opt_delete); + maria_scan_end(file); + goto err; + } + if (maria_extra(file,HA_EXTRA_NO_CACHE,0)) + { + puts("got error from maria_extra(HA_EXTRA_NO_CACHE)"); + maria_scan_end(file); + goto err; + } + maria_scan_end(file); + + ant=0; + maria_scan_init(file); + while ((error=maria_scan(file,record)) != HA_ERR_END_OF_FILE && + ant < write_count + 10) + ant+= error ? 
0 : 1; + if (ant != write_count-opt_delete) + { + printf("scan with cache: I can only find: %d records of %d\n", + ant,write_count-opt_delete); + maria_scan_end(file); + goto err; + } + maria_scan_end(file); + + if (testflag == 5) + goto end; + if (checkpoint == 5 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (!silent) + printf("- Removing keys\n"); + DBUG_PRINT("progpos",("Removing keys")); + lastpos = HA_OFFSET_ERROR; + /* DBUG_POP(); */ + maria_reset(file); + found_parts=0; + maria_scan_init(file); + while ((error= maria_scan(file,read_record)) != HA_ERR_END_OF_FILE) + { + info.recpos=maria_position(file); + if (lastpos >= info.recpos && lastpos != HA_OFFSET_ERROR) + { + printf("maria_rrnd didn't advance filepointer; old: %ld, new: %ld\n", + (long) lastpos, (long) info.recpos); + goto err; + } + lastpos=info.recpos; + if (error == 0) + { + if (opt_delete == (uint) remove_count) /* While testing */ + goto end; + if (rnd(2) == 1 && maria_rsame(file,read_record,-1)) + { + printf("can't find record %lx\n",(long) info.recpos); + goto err; + } + if (use_blob) + { + ulong blob_length,pos; + uchar *ptr; + memcpy_fixed(&ptr, read_record+blob_pos+4, sizeof(ptr)); + blob_length= uint4korr(read_record+blob_pos); + for (pos=0 ; pos < blob_length ; pos++) + { + if (ptr[pos] != (uchar) (blob_length+pos)) + { + printf("Found blob with wrong info at %ld\n",(long) lastpos); + maria_scan_end(file); + my_errno= 0; + goto err; + } + } + } + if (maria_delete(file,read_record)) + { + printf("can't delete record: %6.6s, delete_count: %d\n", + read_record, opt_delete); + maria_scan_end(file); + goto err; + } + opt_delete++; + } + else + found_parts++; + } + if (my_errno != HA_ERR_END_OF_FILE && my_errno != HA_ERR_RECORD_DELETED) + printf("error: %d from maria_rrnd\n",my_errno); + if (write_count != opt_delete) + { + printf("Deleted only %d of %d records (%d parts)\n",opt_delete,write_count, + found_parts); + maria_scan_end(file); + goto err; + } + if (testflag 
== 6) + goto end; + if (checkpoint == 6 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + +end: + maria_scan_end(file); + if (die_in_middle_of_transaction) + { + /* As commit record is not done, UNDO entries needs to be rolled back */ + switch (die_in_middle_of_transaction) { + case 1: + /* + Flush changed data and index pages go to disk + That will also flush log. Recovery will skip REDOs and apply UNDOs. + */ + _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_RELEASE, FLUSH_RELEASE); + break; + case 2: + /* + Just flush log. Pages are likely to not be on disk. Recovery will + then execute REDOs and UNDOs. + */ + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + case 3: + /* + Flush nothing. Pages and log are likely to not be on disk. Recovery + will then do nothing. + */ + break; + case 4: + /* + Flush changed data pages go to disk. Changed index pages are not + flushed. Recovery will skip some REDOs and apply UNDOs. + */ + _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE, + FLUSH_RELEASE); + /* + We have to flush log separately as the redo for the last key page + may not be flushed + */ + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + } + printf("Dying on request without maria_commit()/maria_close()\n"); + exit(0); + } + if (maria_commit(file)) + goto err; + if (maria_close(file)) + { + file= 0; + goto err; + } + file= 0; + maria_panic(HA_PANIC_CLOSE); /* Should close log */ + if (!silent) + { + printf("\nFollowing test have been made:\n"); + printf("Write records: %d\nUpdate records: %d\nSame-key-read: %d\nDelete records: %d\n", write_count,update,dupp_keys,opt_delete); + if (rec_pointer_size) + printf("Record pointer size: %d\n",rec_pointer_size); + printf("maria_block_size: %lu\n", maria_block_size); + if (write_cacheing) + puts("Key cache resized"); + if (write_cacheing) + puts("Write cacheing used"); + if (write_cacheing) + puts("quick mode"); + if (async_io && do_locking) + 
puts("Asyncron io with locking used"); + else if (do_locking) + puts("Locking used"); + if (use_blob) + puts("blobs used"); + printf("key cache status: \n\ +blocks used:%10lu\n\ +not flushed:%10lu\n\ +w_requests: %10lu\n\ +writes: %10lu\n\ +r_requests: %10lu\n\ +reads: %10lu\n", + maria_pagecache->blocks_used, + maria_pagecache->global_blocks_changed, + (ulong) maria_pagecache->global_cache_w_requests, + (ulong) maria_pagecache->global_cache_write, + (ulong) maria_pagecache->global_cache_r_requests, + (ulong) maria_pagecache->global_cache_read); + } + maria_end(); + my_free(blob_buffer, MYF(MY_ALLOW_ZERO_PTR)); + my_end(silent ? MY_CHECK_ERROR : MY_CHECK_ERROR | MY_GIVE_INFO); + return(0); +err: + printf("got error: %d when using MARIA-database\n",my_errno); + if (file) + { + if (maria_commit(file)) + goto err; + VOID(maria_close(file)); + } + maria_end(); + return(1); +} /* main */ + + +/* Read options */ + +static void get_options(int argc, char **argv) +{ + char *pos,*progname; + + progname= argv[0]; + + while (--argc >0 && *(pos = *(++argv)) == '-' ) { + switch(*++pos) { + case 'B': + pack_type= HA_BINARY_PACK_KEY; + break; + case 'b': + use_blob= 1000; + if (*++pos) + use_blob= atol(pos); + break; + case 'K': /* Use key cacheing */ + pagecacheing=1; + if (*++pos) + pagecache_size=atol(pos); + break; + case 'W': /* Use write cacheing */ + write_cacheing=1; + if (*++pos) + my_default_record_cache_size=atoi(pos); + break; + case 'd': + remove_count= atoi(++pos); + break; + case 'i': + if (*++pos) + srand(srand_arg= atoi(pos)); + break; + case 'L': + do_locking=1; + break; + case 'a': /* use asyncron io */ + async_io=1; + if (*++pos) + my_default_record_cache_size=atoi(pos); + break; + case 'v': /* verbose */ + verbose=1; + break; + case 'm': /* records */ + if ((recant=atoi(++pos)) < 10 && testflag > 2) + { + fprintf(stderr,"record count must be >= 10 (if testflag > 2)\n"); + exit(1); + } + break; + case 'e': /* maria_block_length */ + case 'E': + if 
((maria_block_size= atoi(++pos)) < MARIA_MIN_KEY_BLOCK_LENGTH || + maria_block_size > MARIA_MAX_KEY_BLOCK_LENGTH) + { + fprintf(stderr,"Wrong maria_block_length\n"); + exit(1); + } + maria_block_size= my_round_up_to_next_power(maria_block_size); + break; + case 'f': + if ((first_key=atoi(++pos)) < 0 || first_key >= MARIA_KEYS) + first_key=0; + break; + case 'H': + checkpoint= atoi(++pos); + break; + case 'k': + if ((keys=(uint) atoi(++pos)) < 1 || + keys > (uint) (MARIA_KEYS-first_key)) + keys=MARIA_KEYS-first_key; + break; + case 'M': + record_type= BLOCK_RECORD; + break; + case 'P': + pack_type=0; /* Don't use DIFF_LENGTH */ + pack_seg=0; + break; + case 'R': /* Length of record pointer */ + rec_pointer_size=atoi(++pos); + if (rec_pointer_size > 7) + rec_pointer_size=0; + break; + case 'S': + pack_fields=0; /* Static-length-records */ + record_type= STATIC_RECORD; + break; + case 's': + silent=1; + break; + case 't': + testflag=atoi(++pos); /* testmod */ + break; + case 'T': + transactional= 1; + break; + case 'A': + die_in_middle_of_transaction= atoi(++pos); + break; + case 'u': + update_count=atoi(++pos); + if (!update_count) + skip_update= 1; + break; + case 'q': + opt_quick_mode=1; + break; + case 'c': + create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM; + break; + case 'D': + create_flag|=HA_CREATE_DELAY_KEY_WRITE; + break; + case 'g': + skip_update= TRUE; + break; + case 'C': + opt_versioning= 1; + break; + case '?': + case 'I': + case 'V': + printf("%s Ver 1.2 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE); + puts("By Monty, for testing Maria\n"); + printf("Usage: %s [-?AbBcCDIKLPRqSsTVWltv] [-k#] [-f#] [-m#] [-e#] [-E#] [-t#]\n", + progname); + exit(0); + case '#': + DBUG_PUSH (++pos); + break; + default: + printf("Illegal option: '%c'\n",*pos); + break; + } + } + return; +} /* get options */ + + /* Get a random value 0 <= x <= n */ + +static uint rnd(uint max_value) +{ + return (uint) ((rand() & 32767)/32767.0*max_value); +} /* rnd */ + + + 
/* Create a variable length record */ + +static void fix_length(uchar *rec, uint length) +{ + bmove(rec+STANDARD_LENGTH, + "0123456789012345678901234567890123456789012345678901234567890", + length-STANDARD_LENGTH); + strfill((char*) rec+length,STANDARD_LENGTH+60-length,' '); +} /* fix_length */ + + +/* Put maybe a blob in record */ + +static int first_entry; + +static void put_blob_in_record(uchar *blob_pos, char **blob_buffer, + ulong *blob_length) +{ + ulong i,length; + *blob_length= 0; + if (use_blob) + { + if (! *blob_buffer && + !(*blob_buffer=my_malloc((uint) use_blob,MYF(MY_WME)))) + { + use_blob= 0; + return; + } + if (rnd(10) == 0) + { + if (first_entry++ == 0) + { + /* Ensure we have at least one blob of max length in file */ + length= use_blob; + } + else + length=rnd(use_blob); + for (i=0 ; i < length ; i++) + (*blob_buffer)[i]=(char) (length+i); + int4store(blob_pos,length); + memcpy_fixed(blob_pos+4,(char*) blob_buffer,sizeof(char*)); + *blob_length= length; + } + else + { + int4store(blob_pos,0); + } + } + return; +} + + +static void copy_key(MARIA_HA *info,uint inx,uchar *rec,uchar *key_buff) +{ + HA_KEYSEG *keyseg; + + for (keyseg=info->s->keyinfo[inx].seg ; keyseg->type ; keyseg++) + { + memcpy(key_buff,rec+keyseg->start,(size_t) keyseg->length); + key_buff+=keyseg->length; + } + return; +} diff --git a/storage/maria/ma_test3.c b/storage/maria/ma_test3.c new file mode 100644 index 00000000000..040d6fa78c2 --- /dev/null +++ b/storage/maria/ma_test3.c @@ -0,0 +1,501 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Test av locking */ + +#if !(defined (__NETWARE_) || defined (_WIN32)) /*no fork() in Windows*/ + +#include "maria.h" +#include <sys/types.h> +#ifdef HAVE_SYS_WAIT_H +# include <sys/wait.h> +#endif +#ifndef WEXITSTATUS +# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8) +#endif +#ifndef WIFEXITED +# define WIFEXITED(stat_val) (((stat_val) & 255) == 0) +#endif + + +#if defined(HAVE_LRAND48) +#define rnd(X) (lrand48() % X) +#define rnd_init(X) srand48(X) +#else +#define rnd(X) (random() % X) +#define rnd_init(X) srandom(X) +#endif + + +const char *filename= "test3"; +uint tests=10,forks=10,pagecacheing=0; + +static void get_options(int argc, char *argv[]); +void start_test(int id); +int test_read(MARIA_HA *,int),test_write(MARIA_HA *,int,int), + test_update(MARIA_HA *,int,int),test_rrnd(MARIA_HA *,int); + +struct record { + uchar id[8]; + uchar nr[4]; + uchar text[10]; +} record; + + +int main(int argc,char **argv) +{ + int status,wait_ret; + uint i=0; + MARIA_KEYDEF keyinfo[10]; + MARIA_COLUMNDEF recinfo[10]; + HA_KEYSEG keyseg[10][2]; + MY_INIT(argv[0]); + get_options(argc,argv); + + fprintf(stderr, "WARNING! this program is to test 'external locking'" + " (when several processes share a table through file locking)" + " which is not supported by Maria at all; expect errors." 
+ " We may soon remove this program.\n"); + maria_init(); + bzero((char*) keyinfo,sizeof(keyinfo)); + bzero((char*) recinfo,sizeof(recinfo)); + bzero((char*) keyseg,sizeof(keyseg)); + keyinfo[0].seg= &keyseg[0][0]; + keyinfo[0].seg[0].start=0; + keyinfo[0].seg[0].length=8; + keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[0].seg[0].flag=HA_SPACE_PACK; + keyinfo[0].key_alg=HA_KEY_ALG_BTREE; + keyinfo[0].keysegs=1; + keyinfo[0].flag = (uint8) HA_PACK_KEY; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[1].seg= &keyseg[1][0]; + keyinfo[1].seg[0].start=8; + keyinfo[1].seg[0].length=4; /* Long is always 4 in maria */ + keyinfo[1].seg[0].type=HA_KEYTYPE_LONG_INT; + keyinfo[1].seg[0].flag=0; + keyinfo[1].key_alg=HA_KEY_ALG_BTREE; + keyinfo[1].keysegs=1; + keyinfo[1].flag =HA_NOSAME; + keyinfo[1].block_length= 0; /* Default block length */ + + recinfo[0].type=0; + recinfo[0].length=sizeof(record.id); + recinfo[1].type=0; + recinfo[1].length=sizeof(record.nr); + recinfo[2].type=0; + recinfo[2].length=sizeof(record.text); + + puts("- Creating maria-file"); + my_delete(filename,MYF(0)); /* Remove old locks under gdb */ + if (maria_create(filename,BLOCK_RECORD, 2, &keyinfo[0],2,&recinfo[0],0, + (MARIA_UNIQUEDEF*) 0, (MARIA_CREATE_INFO*) 0,0)) + exit(1); + + rnd_init(0); + printf("- Starting %d processes\n",forks); fflush(stdout); + for (i=0 ; i < forks; i++) + { + if (!fork()) + { + start_test(i+1); + sleep(1); + return 0; + } + VOID(rnd(1)); + } + + for (i=0 ; i < forks ; i++) + while ((wait_ret=wait(&status)) && wait_ret == -1); + maria_end(); + return 0; +} + + +static void get_options(int argc, char **argv) +{ + char *pos,*progname; + + progname= argv[0]; + + while (--argc >0 && *(pos = *(++argv)) == '-' ) { + switch(*++pos) { + case 'f': + forks=atoi(++pos); + break; + case 't': + tests=atoi(++pos); + break; + case 'K': /* Use key cacheing */ + pagecacheing=1; + break; + case 'A': /* All flags */ + pagecacheing=1; + break; + case '?': + case 'I': + 
case 'V': + printf("%s Ver 1.0 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE); + puts("By Monty, for your professional use\n"); + puts("Test av locking with threads\n"); + printf("Usage: %s [-?lKA] [-f#] [-t#]\n",progname); + exit(0); + case '#': + DBUG_PUSH (++pos); + break; + default: + printf("Illegal option: '%c'\n",*pos); + break; + } + } + return; +} + + +void start_test(int id) +{ + uint i; + int error,lock_type; + MARIA_INFO isam_info; + MARIA_HA *file,*file1,*file2=0,*lock; + + if (!(file1=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED)) || + !(file2=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED))) + { + fprintf(stderr,"Can't open isam-file: %s\n",filename); + exit(1); + } + if (pagecacheing && rnd(2) == 0) + init_pagecache(maria_pagecache, 65536L, 0, 0, MARIA_KEY_BLOCK_LENGTH, + MY_WME); + printf("Process %d, pid: %ld\n",id,(long) getpid()); fflush(stdout); + + for (error=i=0 ; i < tests && !error; i++) + { + file= (rnd(2) == 1) ? file1 : file2; + lock=0 ; lock_type=0; + if (rnd(10) == 0) + { + if (maria_lock_database(lock=(rnd(2) ? file1 : file2), + lock_type=(rnd(2) == 0 ? F_RDLCK : F_WRLCK))) + { + fprintf(stderr,"%2d: start: Can't lock table %d\n",id,my_errno); + error=1; + break; + } + } + switch (rnd(4)) { + case 0: error=test_read(file,id); break; + case 1: error=test_rrnd(file,id); break; + case 2: error=test_write(file,id,lock_type); break; + case 3: error=test_update(file,id,lock_type); break; + } + if (lock) + maria_lock_database(lock,F_UNLCK); + } + if (!error) + { + maria_status(file1,&isam_info,HA_STATUS_VARIABLE); + printf("%2d: End of test. 
Records: %ld Deleted: %ld\n", + id,(long) isam_info.records, (long) isam_info.deleted); + fflush(stdout); + } + + maria_close(file1); + maria_close(file2); + if (error) + { + printf("%2d: Aborted\n",id); fflush(stdout); + exit(1); + } +} + + +int test_read(MARIA_HA *file,int id) +{ + uint i,lock,found,next,prev; + ulong find; + + lock=0; + if (rnd(2) == 0) + { + lock=1; + if (maria_lock_database(file,F_RDLCK)) + { + fprintf(stderr,"%2d: Can't lock table %d\n",id,my_errno); + return 1; + } + } + + found=next=prev=0; + for (i=0 ; i < 100 ; i++) + { + find=rnd(100000); + if (!maria_rkey(file,record.id,1,(uchar*) &find, HA_WHOLE_KEY, + HA_READ_KEY_EXACT)) + found++; + else + { + if (my_errno != HA_ERR_KEY_NOT_FOUND) + { + fprintf(stderr,"%2d: Got error %d from read in read\n",id,my_errno); + return 1; + } + else if (!maria_rnext(file,record.id,1)) + next++; + else + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in read\n",id,my_errno); + return 1; + } + else if (!maria_rprev(file,record.id,1)) + prev++; + else + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in read\n", + id,my_errno); + return 1; + } + } + } + } + } + if (lock) + { + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"%2d: Can't unlock table\n",id); + return 1; + } + } + printf("%2d: read: found: %5d next: %5d prev: %5d\n", + id,found,next,prev); + fflush(stdout); + return 0; +} + + +int test_rrnd(MARIA_HA *file,int id) +{ + uint count,lock; + + lock=0; + if (rnd(2) == 0) + { + lock=1; + if (maria_lock_database(file,F_RDLCK)) + { + fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno); + maria_close(file); + return 1; + } + if (rnd(2) == 0) + maria_extra(file,HA_EXTRA_CACHE,0); + } + + count=0; + if (maria_rrnd(file,record.id,0L)) + { + if (my_errno == HA_ERR_END_OF_FILE) + goto end; + fprintf(stderr,"%2d: Can't read first record (%d)\n",id,my_errno); + return 1; + } + for (count=1 ; 
!maria_rrnd(file,record.id,HA_OFFSET_ERROR) ;count++) ; + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rrnd\n",id,my_errno); + return 1; + } + +end: + if (lock) + { + maria_extra(file,HA_EXTRA_NO_CACHE,0); + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"%2d: Can't unlock table\n",id); + exit(0); + } + } + printf("%2d: rrnd: %5d\n",id,count); fflush(stdout); + return 0; +} + + +int test_write(MARIA_HA *file,int id,int lock_type) +{ + uint i,tries,count,lock; + + lock=0; + if (rnd(2) == 0 || lock_type == F_RDLCK) + { + lock=1; + if (maria_lock_database(file,F_WRLCK)) + { + if (lock_type == F_RDLCK && my_errno == EDEADLK) + { + printf("%2d: write: deadlock\n",id); fflush(stdout); + return 0; + } + fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno); + maria_close(file); + return 1; + } + if (rnd(2) == 0) + maria_extra(file,HA_EXTRA_WRITE_CACHE,0); + } + + sprintf((char*) record.id,"%7ld", (long) getpid()); + strnmov((char*) record.text,"Testing...", sizeof(record.text)); + + tries=(uint) rnd(100)+10; + for (i=count=0 ; i < tries ; i++) + { + uint32 tmp=rnd(80000)+20000; + int4store(record.nr,tmp); + if (!maria_write(file,record.id)) + count++; + else + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY) + { + fprintf(stderr,"%2d: Got error %d (errno %d) from write\n",id,my_errno, + errno); + return 1; + } + } + } + if (lock) + { + maria_extra(file,HA_EXTRA_NO_CACHE,0); + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"%2d: Can't unlock table\n",id); + exit(0); + } + } + printf("%2d: write: %5d\n",id,count); fflush(stdout); + return 0; +} + + +int test_update(MARIA_HA *file,int id,int lock_type) +{ + uint i,lock,found,next,prev,update; + uint32 tmp; + char find[4]; + struct record new_record; + + lock=0; + if (rnd(2) == 0 || lock_type == F_RDLCK) + { + lock=1; + if (maria_lock_database(file,F_WRLCK)) + { + if (lock_type == F_RDLCK && my_errno == EDEADLK) + { + printf("%2d: write: deadlock\n",id); 
fflush(stdout); + return 0; + } + fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno); + return 1; + } + } + bzero((char*) &new_record,sizeof(new_record)); + strmov((char*) new_record.text,"Updated"); + + found=next=prev=update=0; + for (i=0 ; i < 100 ; i++) + { + tmp=rnd(100000); + int4store(find,tmp); + if (!maria_rkey(file,record.id,1,(uchar*) find, HA_WHOLE_KEY, + HA_READ_KEY_EXACT)) + found++; + else + { + if (my_errno != HA_ERR_KEY_NOT_FOUND) + { + fprintf(stderr,"%2d: Got error %d from read in update\n",id,my_errno); + return 1; + } + else if (!maria_rnext(file,record.id,1)) + next++; + else + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in update\n", + id,my_errno); + return 1; + } + else if (!maria_rprev(file,record.id,1)) + prev++; + else + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in update\n", + id,my_errno); + return 1; + } + continue; + } + } + } + memcpy_fixed(new_record.id,record.id,sizeof(record.id)); + tmp=rnd(20000)+40000; + int4store(new_record.nr,tmp); + if (!maria_update(file,record.id,new_record.id)) + update++; + else + { + if (my_errno != HA_ERR_RECORD_CHANGED && + my_errno != HA_ERR_RECORD_DELETED && + my_errno != HA_ERR_FOUND_DUPP_KEY) + { + fprintf(stderr,"%2d: Got error %d from update\n",id,my_errno); + return 1; + } + } + } + if (lock) + { + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"Can't unlock table,id, error%d\n",my_errno); + return 1; + } + } + printf("%2d: update: %5d\n",id,update); fflush(stdout); + return 0; +} + +#else /* __NETWARE__ || __WIN__ */ + +#include <stdio.h> + +int main() +{ + fprintf(stderr,"this test has not been ported to Netware or Windows\n"); + return 0; +} + +#endif /* __NETWARE__|| __WIN__ */ diff --git a/storage/maria/ma_test_all.res b/storage/maria/ma_test_all.res new file mode 100644 index 00000000000..586aaf68020 --- /dev/null +++ b/storage/maria/ma_test_all.res @@ -0,0 +1,14 @@ 
+Running tests with dynamic row format +Running tests with static row format +Running tests with block row format +Running tests with block row format and transactions +ma_test2 -s -L -K -R1 -m2000 ; Should give error 135 +Error: 135 in write at record: 1099 +got error: 135 when using MARIA-database +./maria_chk -sm test2 will warn that 'Datafile is almost full' +maria_chk: MARIA file test2 +maria_chk: warning: Datafile is almost full, 65516 of 65534 used +MARIA-table 'test2' is usable but should be fixed +MARIA RECOVERY TESTS +ALL RECOVERY TESTS OK +!!!!!!!! BUT REMEMBER to FIX this BLOB issue !!!!!!! diff --git a/storage/maria/ma_test_all.sh b/storage/maria/ma_test_all.sh new file mode 100755 index 00000000000..041fbf3abe6 --- /dev/null +++ b/storage/maria/ma_test_all.sh @@ -0,0 +1,19 @@ +#!/bin/sh +# +# This file is now deprecated and has been replaced by +# unittest/ma_test_all-t +# +# +# +# + +if test -n "$1"; then + + # unit.pl can't pass options to ma_test_all-t, so if anything + # was passed as an argument, assume the purpose was to pass + # them to ma_test_all-t and call it directly + + unittest/ma_test_all-t $@ +else + perl ../../unittest/unit.pl run unittest/ma_test_all-t +fi diff --git a/storage/maria/ma_test_big.sh b/storage/maria/ma_test_big.sh new file mode 100644 index 00000000000..6419d05e3a4 --- /dev/null +++ b/storage/maria/ma_test_big.sh @@ -0,0 +1,22 @@ +#!/bin/sh +# +# This tests is good to find bugs in the redo/undo handling and in +# finding bugs in blob handling +# + +set -e +a=15 +while test $a -le 5000 +do + echo $a + rm -f maria_log* + ma_test2 -s -L -K -W -P -M -T -c -b32768 -t4 -A1 -m$a > /dev/null + maria_read_log -a -s >& /dev/null + maria_chk -es test2 + maria_read_log -a -s >& /dev/null + maria_chk -es test2 + rm test2.MA? 
+ maria_read_log -a -s >& /dev/null + maria_chk -es test2 + a=$((a+1)) +done diff --git a/storage/maria/ma_test_force_start.pl b/storage/maria/ma_test_force_start.pl new file mode 100755 index 00000000000..8148b2f212b --- /dev/null +++ b/storage/maria/ma_test_force_start.pl @@ -0,0 +1,238 @@ +#!/usr/bin/env perl + + +use strict; +use warnings; + +my $usage= <<EOF; +This program tests that the options +--aria-force-start-after-recovery-failures --aria-recover work as +expected. +It has to be run from directory mysql-test, and works with non-debug +and debug binaries. +Pass it option -d or -i (to test corruption of data or index file). +EOF + +# -d currently exhibits BUG#36578 +# "Maria: maria-recover may fail to autorepair a table" + +die($usage) if (@ARGV == 0); + +my $corrupt_index; + +if ($ARGV[0] eq '-d') + { + $corrupt_index= 0; + } +elsif ($ARGV[0] eq '-i') + { + $corrupt_index= 1; + } +else + { + die($usage); + } + +my $force_after= 3; +my $corrupt_file= $corrupt_index ? "MAI" : "MAD"; +my $corrupt_message= + "\\[ERROR\\] mysqld(.exe)*: Table '..test.t1' is marked as crashed and should be repaired"; + +my $sql_name= "./var/tmp/create_table.sql"; +my $error_log_name= "./var/log/master.err"; +my @cmd_output; +my $whatever; # garbage data +$ENV{MTR_VERSION} = 1; # MTR2 does not have --start-and-exit +my $base_server_cmd= "perl mysql-test-run.pl --mysqld=--aria-force-start-after-recovery-failures=$force_after --suite=maria maria.maria-recover ";
+if ($^O =~ /^mswin/i) + { + print <<EOF; +WARNING: with Activestate Perl, mysql-test-run.pl --start-and-exit has a bug: +it does not exit; cygwin perl recommended +EOF + } +my $iswindows= ( $^O =~ /win/i && $^O !~ /darwin/i ); +$base_server_cmd.= ($iswindows ? "--mysqld=--console" : "--mem"); +my $server_cmd; +my $server_pid_name="./var/run/master.pid"; +my $server_pid; +my $i; # count of server restarts +sub kill_server; + +my $suffix= ($iswindows ? ".exe" : ""); +my $client_exe_path= "../client/release"; +# we use -f, sometimes -x is unexpectedly false in Cygwin +if ( ! -f "$client_exe_path/mysql$suffix" ) + { + $client_exe_path= "../client/relwithdebinfo"; + if ( ! -f "$client_exe_path/mysql$suffix" ) + { + $client_exe_path= "../client/debug"; + if ( ! -f "$client_exe_path/mysql$suffix" ) + { + $client_exe_path= "../client"; + if ( ! -f "$client_exe_path/mysql$suffix" ) + { + die("Cannot find 'mysql' executable\n"); + } + } + } + } + +print "starting mysqld\n"; +$server_cmd= $base_server_cmd . " --start-and-exit 2>&1"; +@cmd_output=`$server_cmd`; +die if $?; +my $master_port= (grep (/Using MASTER_MYPORT .*= (\d+)$/, @cmd_output))[0]; +$master_port =~ s/.*= //; +chomp $master_port; +die unless $master_port > 0; + +my $client_cmd= "$client_exe_path/mysql -u root -h 127.0.0.1 -P $master_port test < $sql_name"; + +open(FILE, ">", $sql_name) or die; + +# To exhibit BUG#36578 with -d, we don't create an index if -d. This is +# because the presence of an index will cause repair-by-sort to be used, +# where sort_get_next_record() is only called inside +#_ma_create_index_by_sort(), so the latter function fails and in this +# case retry_repair is set, so bug does not happen. Whereas without +# an index, repair-with-key-cache is called, which calls +# sort_get_next_record() whose failure itself does not cause a retry. + +print FILE "create table t1 (a varchar(1000)". + ($corrupt_index ? 
", index(a)" : "") .") engine=aria;\n"; +print FILE <<EOF; +insert into t1 values("ThursdayMorningsMarket"); +# If Recovery executes REDO_INDEX_NEW_PAGE it will overwrite our +# intentional corruption; we make Recovery skip this record by bumping +# create_rename_lsn using OPTIMIZE TABLE. This also makes sure to put +# the pages on disk, so that we can corrupt them. +optimize table t1; +# mark table open, so that --aria-recover repairs it +insert into t1 select concat(a,'b') from t1 limit 1; +EOF +close FILE; + +print "creating table\n"; +`$client_cmd`; +die if $?; + +print "killing mysqld hard\n"; +kill_server(9); + +print "ruining " . + ($corrupt_index ? "first page of keys" : "bitmap page") . + " in table to test aria-recover\n"; +open(FILE, "+<", "./var/master-data/test/t1.$corrupt_file") or die; +$whatever= ("\xAB" x 100); +sysseek (FILE, $corrupt_index ? 8192 : (8192-100-100), 0) or die; +syswrite (FILE, $whatever) or die; +close FILE; + +print "ruining log to make recovery fail; mysqld should fail the $force_after first restarts\n"; +open(FILE, "+<", "./var/tmp/aria_log.00000001") or die; +$whatever= ("\xAB" x 8192); +sysseek (FILE, 99, 0) or die; +syswrite (FILE, $whatever) or die; +close FILE; + +$server_cmd= $base_server_cmd . " --start-dirty 2>&1"; +for($i= 1; $i <= $force_after; $i= $i + 1) + { + print "mysqld restart number $i... "; + unlink($error_log_name) or die; + `$server_cmd`; + # mysqld should return 1 when can't read log + die unless (($? >> 8) == 1); + open(FILE, "<", $error_log_name) or die; + @cmd_output= <FILE>; + close FILE; + die unless grep(/\[ERROR\] mysqld(.exe)*: Aria engine: log initialization failed/, @cmd_output); + die unless grep(/\[ERROR\] Plugin 'Aria' init function returned error./, @cmd_output); + print "failed - ok\n"; + } + +print "mysqld restart number $i... 
"; +unlink($error_log_name) or die; +@cmd_output=`$server_cmd`; +die if $?; +open(FILE, "<", $error_log_name) or die; +@cmd_output= <FILE>; +close FILE; +die unless grep(/\[Warning\] mysqld(.exe)*: Aria engine: removed all logs after [\d]+ consecutive failures of recovery from logs/, @cmd_output); +die unless grep(/\[ERROR\] mysqld(.exe)*: File '.*tmp.aria_log.00000001' not found \(Errcode: 2\)/, @cmd_output); +print "success - ok\n"; + +open(FILE, ">", $sql_name) or die; +print FILE <<EOF; +set global aria_recover=normal; +insert into t1 values('aaa'); +EOF +close FILE; + +# verify corruption has not yet been noticed +open(FILE, "<", $error_log_name) or die; +@cmd_output= <FILE>; +close FILE; +die if grep(/$corrupt_message/, @cmd_output); + +print "inserting in table\n"; +`$client_cmd`; +die if $?; +print "table is usable - ok\n"; + +open(FILE, "<", $error_log_name) or die; +@cmd_output= <FILE>; +close FILE; +die unless grep(/$corrupt_message/, @cmd_output); +die unless grep(/\[Warning\] Recovering table: '..test.t1'/, @cmd_output); +print "was corrupted and automatically repaired - ok\n"; + +# remove our traces +kill_server(15); + +print "TEST ALL OK\n"; + +# kills mysqld with signal given in parameter +sub kill_server + { + my ($sig)= @_; + my $wait_count= 0; + my $kill_cmd; + my @kill_output; + open(FILE, "<", $server_pid_name) or die; + @cmd_output= <FILE>; + close FILE; + $server_pid= $cmd_output[0]; + chomp $server_pid; + die unless $server_pid > 0; + if ($iswindows) + { + # On Windows, server_pid_name is not the "main" process id + # so perl's kill() does not see this process id. + # But taskkill works, though only with /F ("-9"-style kill). 
+ $kill_cmd= "taskkill /F /PID $server_pid 2>&1"; + @kill_output= `$kill_cmd`; + die unless grep(/has been terminated/, @kill_output); + } + else + { + kill($sig, $server_pid) or die; + } + while (1) # wait until mysqld process gone + { + if ($iswindows) + { + @kill_output= `$kill_cmd`; + last if grep(/not found/, @kill_output); + } + else + { + kill (0, $server_pid) or last; + } + print "waiting for mysqld to die\n" if ($wait_count > 30); + $wait_count= $wait_count + 1; + select(undef, undef, undef, 0.1); + } + } diff --git a/storage/maria/ma_test_recovery b/storage/maria/ma_test_recovery new file mode 100755 index 00000000000..0b20264c434 --- /dev/null +++ b/storage/maria/ma_test_recovery @@ -0,0 +1,8 @@ +#!/bin/sh + +# Remove comment from next line if this script fails and you need more +# information of what's going on + +# This file is deprecated and has been replaced with ma_test_recovery.pl + +unittest/ma_test_recovery.pl $@ diff --git a/storage/maria/ma_unique.c b/storage/maria/ma_unique.c new file mode 100644 index 00000000000..a90578c2162 --- /dev/null +++ b/storage/maria/ma_unique.c @@ -0,0 +1,244 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Functions to check if a row is unique */ + +#include "maria_def.h" +#include <m_ctype.h> + +/** + Check if there exist a row with the same hash + + @notes + This function is not versioning safe. For the moment this is not a problem + as it's only used for internal temporary tables in MySQL for which there + isn't any versioning information. +*/ + +my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, uchar *record, + ha_checksum unique_hash, my_off_t disk_pos) +{ + my_off_t lastpos=info->cur_row.lastpos; + MARIA_KEYDEF *keyinfo= &info->s->keyinfo[def->key]; + uchar *key_buff= info->lastkey_buff2; + MARIA_KEY key; + DBUG_ENTER("_ma_check_unique"); + DBUG_PRINT("enter",("unique_hash: %lu", (ulong) unique_hash)); + + maria_unique_store(record+keyinfo->seg->start, unique_hash); + /* Can't be spatial so it's ok to call _ma_make_key directly here */ + _ma_make_key(info, &key, def->key, key_buff, record, 0, 0); + + /* The above changed info->lastkey_buff2. Inform maria_rnext_same(). 
*/ + info->update&= ~HA_STATE_RNEXT_SAME; + + DBUG_ASSERT(key.data_length == MARIA_UNIQUE_HASH_LENGTH); + if (_ma_search(info, &key, SEARCH_FIND, info->s->state.key_root[def->key])) + { + info->page_changed=1; /* Can't optimize read next */ + info->cur_row.lastpos= lastpos; + DBUG_RETURN(0); /* No matching rows */ + } + + for (;;) + { + if (info->cur_row.lastpos != disk_pos && + !(*info->s->compare_unique)(info,def,record,info->cur_row.lastpos)) + { + my_errno=HA_ERR_FOUND_DUPP_UNIQUE; + info->errkey= (int) def->key; + info->dup_key_pos= info->cur_row.lastpos; + info->page_changed= 1; /* Can't optimize read next */ + info->cur_row.lastpos= lastpos; + DBUG_PRINT("info",("Found duplicate")); + DBUG_RETURN(1); /* Found identical */ + } + DBUG_ASSERT(info->last_key.data_length == MARIA_UNIQUE_HASH_LENGTH); + if (_ma_search_next(info, &info->last_key, SEARCH_BIGGER, + info->s->state.key_root[def->key]) || + bcmp(info->last_key.data, key_buff, MARIA_UNIQUE_HASH_LENGTH)) + { + info->page_changed= 1; /* Can't optimize read next */ + info->cur_row.lastpos= lastpos; + DBUG_RETURN(0); /* end of tree */ + } + } +} + + +/* + Calculate a hash for a row + + TODO + Add support for bit fields +*/ + +ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *record) +{ + const uchar *pos, *end; + ha_checksum crc= 0; + ulong seed1=0, seed2= 4; + HA_KEYSEG *keyseg; + + for (keyseg=def->seg ; keyseg < def->end ; keyseg++) + { + enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type; + uint length=keyseg->length; + + if (keyseg->null_bit) + { + if (record[keyseg->null_pos] & keyseg->null_bit) + { + /* + Change crc in a way different from an empty string or 0. + (This is an optimisation; The code will work even if this isn't + done) + */ + crc=((crc << 8) + 511+ + (crc >> (8*sizeof(ha_checksum)-8))); + continue; + } + } + pos= record+keyseg->start; + if (keyseg->flag & HA_VAR_LENGTH_PART) + { + uint pack_length= keyseg->bit_start; + uint tmp_length= (pack_length == 1 ? 
(uint) *pos : + uint2korr(pos)); + pos+= pack_length; /* Skip VARCHAR length */ + set_if_smaller(length,tmp_length); + } + else if (keyseg->flag & HA_BLOB_PART) + { + uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos); + memcpy_fixed((uchar*) &pos,pos+keyseg->bit_start,sizeof(char*)); + if (!length || length > tmp_length) + length=tmp_length; /* The whole blob */ + } + end= pos+length; + if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 || + type == HA_KEYTYPE_VARTEXT2) + { + keyseg->charset->coll->hash_sort(keyseg->charset, + (const uchar*) pos, length, &seed1, + &seed2); + crc^= seed1; + } + else + while (pos != end) + crc=((crc << 8) + + (((uchar) *pos++))) + + (crc >> (8*sizeof(ha_checksum)-8)); + } + return crc; +} + + +/* + compare unique key for two rows + + TODO + Add support for bit fields + + RETURN + 0 if both rows have equal unique value + 1 Rows are different +*/ + +my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b, + my_bool null_are_equal) +{ + const uchar *pos_a, *pos_b, *end; + HA_KEYSEG *keyseg; + + for (keyseg=def->seg ; keyseg < def->end ; keyseg++) + { + enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type; + uint a_length, b_length; + a_length= b_length= keyseg->length; + + /* If part is NULL it's regarded as different */ + if (keyseg->null_bit) + { + uint tmp; + if ((tmp=(a[keyseg->null_pos] & keyseg->null_bit)) != + (uint) (b[keyseg->null_pos] & keyseg->null_bit)) + return 1; + if (tmp) + { + if (!null_are_equal) + return 1; + continue; + } + } + pos_a= a+keyseg->start; + pos_b= b+keyseg->start; + if (keyseg->flag & HA_VAR_LENGTH_PART) + { + uint pack_length= keyseg->bit_start; + if (pack_length == 1) + { + a_length= (uint) *pos_a++; + b_length= (uint) *pos_b++; + } + else + { + a_length= uint2korr(pos_a); + b_length= uint2korr(pos_b); + pos_a+= 2; /* Skip VARCHAR length */ + pos_b+= 2; + } + set_if_smaller(a_length, keyseg->length); /* Safety */ + set_if_smaller(b_length, 
keyseg->length); /* safety */ + } + else if (keyseg->flag & HA_BLOB_PART) + { + /* Only compare 'length' characters if length != 0 */ + a_length= _ma_calc_blob_length(keyseg->bit_start,pos_a); + b_length= _ma_calc_blob_length(keyseg->bit_start,pos_b); + /* Check that a and b are of equal length */ + if (keyseg->length) + { + /* + This is used in some cases when we are not interested in comparing + the whole length of the blob. + */ + set_if_smaller(a_length, keyseg->length); + set_if_smaller(b_length, keyseg->length); + } + memcpy_fixed((uchar*) &pos_a,pos_a+keyseg->bit_start,sizeof(char*)); + memcpy_fixed((uchar*) &pos_b,pos_b+keyseg->bit_start,sizeof(char*)); + } + if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 || + type == HA_KEYTYPE_VARTEXT2) + { + if (ha_compare_text(keyseg->charset, pos_a, a_length, + pos_b, b_length, 0, 1)) + return 1; + } + else + { + if (a_length != b_length) + return 1; + end= pos_a+a_length; + while (pos_a != end) + { + if (*pos_a++ != *pos_b++) + return 1; + } + } + } + return 0; +} diff --git a/storage/maria/ma_update.c b/storage/maria/ma_update.c new file mode 100644 index 00000000000..7b9e006ec43 --- /dev/null +++ b/storage/maria/ma_update.c @@ -0,0 +1,253 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "ma_fulltext.h" +#include "ma_rt_index.h" +#include "trnman.h" + +/** + Update an old row in a MARIA table +*/ + +int maria_update(register MARIA_HA *info, const uchar *oldrec, uchar *newrec) +{ + int flag,key_changed,save_errno; + reg3 my_off_t pos; + uint i; + uchar old_key_buff[MARIA_MAX_KEY_BUFF],*new_key_buff; + my_bool auto_key_changed= 0; + ulonglong changed; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + DBUG_ENTER("maria_update"); + LINT_INIT(new_key_buff); + LINT_INIT(changed); + + DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage", + maria_print_error(info->s, HA_ERR_CRASHED); + DBUG_RETURN(my_errno= HA_ERR_CRASHED);); + if (!(info->update & HA_STATE_AKTIV)) + { + DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); + } + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + DBUG_RETURN(my_errno=EACCES); + } + if (share->state.state.key_file_length >= share->base.margin_key_file_length) + { + DBUG_RETURN(my_errno=HA_ERR_INDEX_FILE_FULL); + } + pos= info->cur_row.lastpos; + if (_ma_readinfo(info,F_WRLCK,1)) + DBUG_RETURN(my_errno); + + if ((*share->compare_record)(info,oldrec)) + { + save_errno= my_errno; + DBUG_PRINT("warning", ("Got error from compare record")); + goto err_end; /* Record has changed */ + } + + /* Calculate and check all unique constraints */ + key_changed=0; + for (i=0 ; i < share->state.header.uniques ; i++) + { + MARIA_UNIQUEDEF *def=share->uniqueinfo+i; + if (_ma_unique_comp(def, newrec, oldrec,1) && + _ma_check_unique(info, def, newrec, _ma_unique_hash(def, newrec), + pos)) + { + save_errno=my_errno; + goto err_end; + } + } + if (_ma_mark_file_changed(info)) + { + save_errno=my_errno; + goto err_end; + } + + /* Ensure we don't try to restore auto_increment if it doesn't change */ + info->last_auto_increment= 
~(ulonglong) 0; + + /* Check which keys changed from the original row */ + + new_key_buff= info->lastkey_buff2; + changed=0; + for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++) + { + if (maria_is_key_active(share->state.key_map, i)) + { + if (keyinfo->flag & HA_FULLTEXT ) + { + if (_ma_ft_cmp(info,i,oldrec, newrec)) + { + if ((int) i == info->lastinx) + { + /* + We are changeing the index we are reading on. Mark that + the index data has changed and we need to do a full search + when doing read-next + */ + key_changed|=HA_STATE_WRITTEN; + } + changed|=((ulonglong) 1 << i); + if (_ma_ft_update(info,i,old_key_buff,oldrec,newrec,pos)) + goto err; + } + } + else + { + MARIA_KEY new_key, old_key; + + (*keyinfo->make_key)(info,&new_key, i, new_key_buff, newrec, + pos, info->trn->trid); + (*keyinfo->make_key)(info,&old_key, i, old_key_buff, + oldrec, pos, info->cur_row.trid); + + /* The above changed info->lastkey2. Inform maria_rnext_same(). */ + info->update&= ~HA_STATE_RNEXT_SAME; + + if (new_key.data_length != old_key.data_length || + memcmp(old_key.data, new_key.data, new_key.data_length)) + { + if ((int) i == info->lastinx) + key_changed|=HA_STATE_WRITTEN; /* Mark that keyfile changed */ + changed|=((ulonglong) 1 << i); + keyinfo->version++; + if (keyinfo->ck_delete(info,&old_key)) + goto err; + if (keyinfo->ck_insert(info,&new_key)) + goto err; + if (share->base.auto_key == i+1) + auto_key_changed=1; + } + } + } + } + + if (share->calc_checksum) + { + /* + We can't use the row based checksum as this doesn't have enough + precision (one byte, while the table's is more bytes). + At least _ma_check_unique() modifies the 'newrec' record, so checksum + has to be computed _after_ it. Nobody apparently modifies 'oldrec'. + We need to pass the old row's checksum down to (*update_record)(), we do + this via info->new_row.checksum (not intuitive but existing code + mandated that cur_row is the new row). 
+ If (*update_record)() fails, table will be marked corrupted so no need + to revert the live checksum change. + */ + info->cur_row.checksum= (*share->calc_checksum)(info, newrec); + info->new_row.checksum= (*share->calc_checksum)(info, oldrec); + info->state->checksum+= info->cur_row.checksum - info->new_row.checksum; + } + + if ((*share->update_record)(info, pos, oldrec, newrec)) + goto err; + + if (auto_key_changed & !share->now_transactional) + { + const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg; + const uchar *key= newrec + keyseg->start; + set_if_bigger(share->state.auto_increment, + ma_retrieve_auto_increment(key, keyseg->type)); + } + + /* + We can't yet have HA_STATE_AKTIV here, as block_record dosn't support it + */ + info->update= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED | key_changed); + share->state.changed|= STATE_NOT_MOVABLE | STATE_NOT_ZEROFILLED; + info->state->changed= 1; + + /* + Every Maria function that updates Maria table must end with + call to _ma_writeinfo(). If operation (second param of + _ma_writeinfo()) is not 0 it sets share->changed to 1, that is + flags that data has changed. If operation is 0, this function + equals to no-op in this case. + + ma_update() must always pass !0 value as operation, since even if + there is no index change there could be data change. + */ + VOID(_ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE)); + allow_break(); /* Allow SIGHUP & SIGINT */ + if (info->invalidator != 0) + { + DBUG_PRINT("info", ("invalidator... 
'%s' (update)", + share->open_file_name.str)); + (*info->invalidator)(share->open_file_name.str); + info->invalidator=0; + } + DBUG_RETURN(0); + +err: + DBUG_PRINT("error",("key: %d errno: %d",i,my_errno)); + save_errno= my_errno; + DBUG_ASSERT(save_errno); + if (!save_errno) + save_errno= HA_ERR_INTERNAL_ERROR; /* Should never happen */ + + if (my_errno == HA_ERR_FOUND_DUPP_KEY || my_errno == HA_ERR_OUT_OF_MEM || + my_errno == HA_ERR_RECORD_FILE_FULL) + { + info->errkey= (int) i; + flag=0; + do + { + if (((ulonglong) 1 << i) & changed) + { + if (share->keyinfo[i].flag & HA_FULLTEXT) + { + if ((flag++ && _ma_ft_del(info,i,new_key_buff,newrec,pos)) || + _ma_ft_add(info,i,old_key_buff,oldrec,pos)) + break; + } + else + { + MARIA_KEY new_key, old_key; + (*share->keyinfo[i].make_key)(info, &new_key, i, new_key_buff, + newrec, pos, + info->trn->trid); + (*share->keyinfo[i].make_key)(info, &old_key, i, old_key_buff, + oldrec, pos, info->cur_row.trid); + if ((flag++ && _ma_ck_delete(info, &new_key)) || + _ma_ck_write(info, &old_key)) + break; + } + } + } while (i-- != 0); + } + else + { + maria_print_error(share, HA_ERR_CRASHED); + maria_mark_crashed(info); + } + info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_ROW_CHANGED | + key_changed); + + err_end: + VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); + allow_break(); /* Allow SIGHUP & SIGINT */ + if (save_errno == HA_ERR_KEY_NOT_FOUND) + { + maria_print_error(share, HA_ERR_CRASHED); + save_errno=HA_ERR_CRASHED; + } + DBUG_RETURN(my_errno=save_errno); +} /* maria_update */ diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c new file mode 100644 index 00000000000..02eeec754ee --- /dev/null +++ b/storage/maria/ma_write.c @@ -0,0 +1,2461 @@ +/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (C) 2008-2009 Sun Microsystems, Inc. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Write a row to a MARIA table */ + +#include "ma_fulltext.h" +#include "ma_rt_index.h" +#include "trnman.h" +#include "ma_key_recover.h" +#include "ma_blockrec.h" + +#define MAX_POINTER_LENGTH 8 + + /* Functions declared in this file */ + +static int w_search(MARIA_HA *info, uint32 comp_flag, + MARIA_KEY *key, my_off_t page, + MARIA_PAGE *father_page, uchar *father_keypos, + my_bool insert_last); +static int _ma_balance_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + MARIA_KEY *key, MARIA_PAGE *curr_page, + MARIA_PAGE *father_page, + uchar *father_key_pos, MARIA_KEY_PARAM *s_temp); +static uchar *_ma_find_last_pos(MARIA_KEY *int_key, + MARIA_PAGE *page, uchar **after_key); +static my_bool _ma_ck_write_tree(register MARIA_HA *info, MARIA_KEY *key); +static my_bool _ma_ck_write_btree(register MARIA_HA *info, MARIA_KEY *key); +static my_bool _ma_ck_write_btree_with_log(MARIA_HA *, MARIA_KEY *, my_off_t *, + uint32); +static my_bool _ma_log_split(MARIA_PAGE *page, uint org_length, + uint new_length, + const uchar *key_pos, + uint key_length, int move_length, + enum en_key_op prefix_or_suffix, + const uchar *data, uint data_length, + uint changed_length); +static my_bool _ma_log_del_prefix(MARIA_PAGE *page, + uint org_length, uint new_length, + const uchar *key_pos, uint key_length, + int move_length); +static my_bool 
_ma_log_key_middle(MARIA_PAGE *page, + uint new_length, + uint data_added_first, + uint data_changed_first, + uint data_deleted_last, + const uchar *key_pos, + uint key_length, int move_length); + +/* + @brief Default handler for returning the position of a new row + + @note + This is only called for non transactional tables and not for block format + which is why we use info->state here. +*/ + +MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info, + const uchar *record + __attribute__((unused))) +{ + return ((info->s->state.dellink != HA_OFFSET_ERROR && + !info->append_insert_at_end) ? + info->s->state.dellink : + info->state->data_file_length); +} + +my_bool _ma_write_abort_default(MARIA_HA *info __attribute__((unused))) +{ + return 0; +} + + +/* Write new record to a table */ + +int maria_write(MARIA_HA *info, uchar *record) +{ + MARIA_SHARE *share= info->s; + uint i; + int save_errno; + MARIA_RECORD_POS filepos; + uchar *buff; + my_bool lock_tree= share->lock_key_trees; + my_bool fatal_error; + MARIA_KEYDEF *keyinfo; + DBUG_ENTER("maria_write"); + DBUG_PRINT("enter",("index_file: %d data_file: %d", + share->kfile.file, info->dfile.file)); + + DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage", + maria_print_error(info->s, HA_ERR_CRASHED); + DBUG_RETURN(my_errno= HA_ERR_CRASHED);); + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + DBUG_RETURN(my_errno=EACCES); + } + if (_ma_readinfo(info,F_WRLCK,1)) + DBUG_RETURN(my_errno); + dont_break(); /* Don't allow SIGHUP or SIGINT */ + + if (share->base.reloc == (ha_rows) 1 && + share->base.records == (ha_rows) 1 && + share->state.state.records == (ha_rows) 1) + { /* System file */ + my_errno=HA_ERR_RECORD_FILE_FULL; + goto err2; + } + if (share->state.state.key_file_length >= share->base.margin_key_file_length) + { + my_errno=HA_ERR_INDEX_FILE_FULL; + goto err2; + } + if (_ma_mark_file_changed(info)) + goto err2; + + /* Calculate and check all unique constraints */ + for (i=0 ; i < share->state.header.uniques ; i++) + 
{ + if (_ma_check_unique(info,share->uniqueinfo+i,record, + _ma_unique_hash(share->uniqueinfo+i,record), + HA_OFFSET_ERROR)) + goto err2; + } + + /* Ensure we don't try to restore auto_increment if it doesn't change */ + info->last_auto_increment= ~(ulonglong) 0; + + if ((info->opt_flag & OPT_NO_ROWS)) + filepos= HA_OFFSET_ERROR; + else + { + /* + This may either calculate a record position, or write the record and return + the record id + */ + if ((filepos= (*share->write_record_init)(info, record)) == + HA_OFFSET_ERROR) + goto err2; + } + + /* Write all keys to indextree */ + buff= info->lastkey_buff2; + for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++) + { + MARIA_KEY int_key; + if (maria_is_key_active(share->state.key_map, i)) + { + my_bool local_lock_tree= (lock_tree && + !(info->bulk_insert && + is_tree_inited(&info->bulk_insert[i]))); + if (local_lock_tree) + { + rw_wrlock(&keyinfo->root_lock); + keyinfo->version++; + } + if (keyinfo->flag & HA_FULLTEXT ) + { + if (_ma_ft_add(info,i, buff,record,filepos)) + { + if (local_lock_tree) + rw_unlock(&keyinfo->root_lock); + DBUG_PRINT("error",("Got error: %d on write",my_errno)); + goto err; + } + } + else + { + while (keyinfo->ck_insert(info, + (*keyinfo->make_key)(info, &int_key, i, + buff, record, filepos, + info->trn->trid))) + { + TRN *blocker; + DBUG_PRINT("error",("Got error: %d on write",my_errno)); + /* + explicit check to filter out temp tables, they aren't + transactional and don't have a proper TRN so the code + below doesn't work for them. + Also, filter out non-thread maria use, and table modified in + the same transaction. + At last, filter out non-dup-unique errors. 
+ */ + if (!local_lock_tree) + goto err; + if (info->dup_key_trid == info->trn->trid || + my_errno != HA_ERR_FOUND_DUPP_KEY) + { + rw_unlock(&keyinfo->root_lock); + goto err; + } + /* Different TrIDs: table must be transactional */ + DBUG_ASSERT(share->base.born_transactional); + /* + If transactions are disabled, and dup_key_trid is different from + our TrID, it must be ALTER TABLE with dup_key_trid==0 (no + transaction). ALTER TABLE does have MARIA_HA::TRN not dummy but + puts TrID=0 in rows/keys. + */ + DBUG_ASSERT(share->now_transactional || + (info->dup_key_trid == 0)); + blocker= trnman_trid_to_trn(info->trn, info->dup_key_trid); + /* + if blocker TRN was not found, it means that the conflicting + transaction was committed long time ago. It could not be + aborted, as it would have to wait on the key tree lock + to remove the conflicting key it has inserted. + */ + if (!blocker || blocker->commit_trid != ~(TrID)0) + { /* committed */ + if (blocker) + pthread_mutex_unlock(& blocker->state_lock); + rw_unlock(&keyinfo->root_lock); + goto err; + } + rw_unlock(&keyinfo->root_lock); + { + /* running. now we wait */ + WT_RESOURCE_ID rc; + int res; + const char *old_proc_info; + + rc.type= &ma_rc_dup_unique; + /* TODO savepoint id when we'll have them */ + rc.value= (intptr)blocker; + res= wt_thd_will_wait_for(info->trn->wt, blocker->wt, & rc); + if (res != WT_OK) + { + pthread_mutex_unlock(& blocker->state_lock); + my_errno= HA_ERR_LOCK_DEADLOCK; + goto err; + } + old_proc_info= proc_info_hook(0, + "waiting for a resource", + __func__, __FILE__, __LINE__); + res= wt_thd_cond_timedwait(info->trn->wt, & blocker->state_lock); + proc_info_hook(0, old_proc_info, __func__, __FILE__, __LINE__); + + pthread_mutex_unlock(& blocker->state_lock); + if (res != WT_OK) + { + my_errno= res == WT_TIMEOUT ? 
HA_ERR_LOCK_WAIT_TIMEOUT + : HA_ERR_LOCK_DEADLOCK; + goto err; + } + } + rw_wrlock(&keyinfo->root_lock); +#ifndef MARIA_CANNOT_ROLLBACK + keyinfo->version++; +#endif + } + } + + /* The above changed info->lastkey2. Inform maria_rnext_same(). */ + info->update&= ~HA_STATE_RNEXT_SAME; + + if (local_lock_tree) + rw_unlock(&keyinfo->root_lock); + } + } + if (share->calc_write_checksum) + info->cur_row.checksum= (*share->calc_write_checksum)(info,record); + if (filepos != HA_OFFSET_ERROR) + { + if ((*share->write_record)(info,record)) + goto err; + info->state->checksum+= info->cur_row.checksum; + } + if (!share->now_transactional) + { + if (share->base.auto_key != 0) + { + const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg; + const uchar *key= record + keyseg->start; + set_if_bigger(share->state.auto_increment, + ma_retrieve_auto_increment(key, keyseg->type)); + } + } + info->state->records++; + info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_WRITTEN | + HA_STATE_ROW_CHANGED); + share->state.changed|= STATE_NOT_MOVABLE | STATE_NOT_ZEROFILLED; + info->state->changed= 1; + + info->cur_row.lastpos= filepos; + VOID(_ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE)); + if (info->invalidator != 0) + { + DBUG_PRINT("info", ("invalidator... '%s' (update)", + share->open_file_name.str)); + (*info->invalidator)(share->open_file_name.str); + info->invalidator=0; + } + + /* + Update status of the table. We need to do so after each row write + for the log tables, as we want the new row to become visible to + other threads as soon as possible. We don't lock mutex here + (as it is required by pthread memory visibility rules) as (1) it's + not critical to use outdated share->is_log_table value (2) locking + mutex here for every write is too expensive. 
+ */ + if (share->is_log_table) + _ma_update_status((void*) info); + + allow_break(); /* Allow SIGHUP & SIGINT */ + DBUG_RETURN(0); + +err: + save_errno= my_errno; + fatal_error= 0; + if (my_errno == HA_ERR_FOUND_DUPP_KEY || + my_errno == HA_ERR_RECORD_FILE_FULL || + my_errno == HA_ERR_LOCK_DEADLOCK || + my_errno == HA_ERR_LOCK_WAIT_TIMEOUT || + my_errno == HA_ERR_NULL_IN_SPATIAL || + my_errno == HA_ERR_OUT_OF_MEM) + { + if (info->bulk_insert) + { + uint j; + for (j=0 ; j < share->base.keys ; j++) + maria_flush_bulk_insert(info, j); + } + info->errkey= (int) i; + /* + We delete keys in the reverse order of insertion. This is the order that + a rollback would do and is important for CLR_ENDs generated by + _ma_ft|ck_delete() and write_record_abort() to work (with any other + order they would cause wrong jumps in the chain). + */ + while ( i-- > 0) + { + if (maria_is_key_active(share->state.key_map, i)) + { + my_bool local_lock_tree= (lock_tree && + !(info->bulk_insert && + is_tree_inited(&info->bulk_insert[i]))); + keyinfo= share->keyinfo + i; + if (local_lock_tree) + rw_wrlock(&keyinfo->root_lock); + /** + @todo RECOVERY BUG + The key deletes below should generate CLR_ENDs + */ + if (keyinfo->flag & HA_FULLTEXT) + { + if (_ma_ft_del(info,i,buff,record,filepos)) + { + if (local_lock_tree) + rw_unlock(&keyinfo->root_lock); + break; + } + } + else + { + MARIA_KEY key; + if (_ma_ck_delete(info, + (*keyinfo->make_key)(info, &key, i, buff, record, + filepos, info->trn->trid))) + { + if (local_lock_tree) + rw_unlock(&keyinfo->root_lock); + break; + } + } + if (local_lock_tree) + rw_unlock(&keyinfo->root_lock); + } + } + } + else + fatal_error= 1; + + if ((*share->write_record_abort)(info)) + fatal_error= 1; + if (fatal_error) + { + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); + } + + info->update= (HA_STATE_CHANGED | HA_STATE_WRITTEN | HA_STATE_ROW_CHANGED); + my_errno=save_errno; +err2: + save_errno=my_errno; + DBUG_ASSERT(save_errno); + if 
(!save_errno) + save_errno= HA_ERR_INTERNAL_ERROR; /* Should never happen */ + DBUG_PRINT("error", ("got error: %d", save_errno)); + VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); + allow_break(); /* Allow SIGHUP & SIGINT */ + DBUG_RETURN(my_errno=save_errno); +} /* maria_write */ + + +/* + Write one key to btree + + TODO + Remove this function and have bulk insert change keyinfo->ck_insert + to point to the right function +*/ + +my_bool _ma_ck_write(MARIA_HA *info, MARIA_KEY *key) +{ + DBUG_ENTER("_ma_ck_write"); + + if (info->bulk_insert && + is_tree_inited(&info->bulk_insert[key->keyinfo->key_nr])) + { + DBUG_RETURN(_ma_ck_write_tree(info, key)); + } + DBUG_RETURN(_ma_ck_write_btree(info, key)); +} /* _ma_ck_write */ + + +/********************************************************************** + Insert key into btree (normal case) +**********************************************************************/ + +static my_bool _ma_ck_write_btree(MARIA_HA *info, MARIA_KEY *key) +{ + my_bool error; + MARIA_KEYDEF *keyinfo= key->keyinfo; + my_off_t *root= &info->s->state.key_root[keyinfo->key_nr]; + DBUG_ENTER("_ma_ck_write_btree"); + + error= _ma_ck_write_btree_with_log(info, key, root, + keyinfo->write_comp_flag | key->flag); + if (info->ft1_to_ft2) + { + if (!error) + error= _ma_ft_convert_to_ft2(info, key); + delete_dynamic(info->ft1_to_ft2); + my_free(info->ft1_to_ft2, MYF(0)); + info->ft1_to_ft2=0; + } + DBUG_RETURN(error); +} /* _ma_ck_write_btree */ + + +/** + @brief Write a key to the b-tree + + @retval 1 error + @retval 0 ok +*/ + +static my_bool _ma_ck_write_btree_with_log(MARIA_HA *info, MARIA_KEY *key, + my_off_t *root, uint32 comp_flag) +{ + MARIA_SHARE *share= info->s; + LSN lsn= LSN_IMPOSSIBLE; + int error; + my_off_t new_root= *root; + uchar key_buff[MARIA_MAX_KEY_BUFF]; + MARIA_KEY org_key; + DBUG_ENTER("_ma_ck_write_btree_with_log"); + + LINT_INIT_STRUCT(org_key); + if (share->now_transactional) + { + /* Save original value as the key may change */ 
+ org_key= *key; + memcpy(key_buff, key->data, key->data_length + key->ref_length); + } + + error= _ma_ck_real_write_btree(info, key, &new_root, comp_flag); + if (!error && share->now_transactional) + { + /* Log the original value */ + *key= org_key; + key->data= key_buff; + error= _ma_write_undo_key_insert(info, key, root, new_root, &lsn); + } + else + { + *root= new_root; + _ma_fast_unlock_key_del(info); + } + _ma_unpin_all_pages_and_finalize_row(info, lsn); + + DBUG_RETURN(error != 0); +} /* _ma_ck_write_btree_with_log */ + + +/** + @brief Write a key to the b-tree + + @retval 1 error + @retval 0 ok +*/ + +my_bool _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEY *key, my_off_t *root, + uint32 comp_flag) +{ + int error; + DBUG_ENTER("_ma_ck_real_write_btree"); + + /* key_length parameter is used only if comp_flag is SEARCH_FIND */ + if (*root == HA_OFFSET_ERROR || + (error= w_search(info, comp_flag, key, *root, (MARIA_PAGE *) 0, + (uchar*) 0, 1)) > 0) + error= _ma_enlarge_root(info, key, root); + DBUG_RETURN(error != 0); +} /* _ma_ck_real_write_btree */ + + +/** + @brief Make a new root with key as only pointer + + @retval 1 error + @retval 0 ok +*/ + +my_bool _ma_enlarge_root(MARIA_HA *info, MARIA_KEY *key, my_off_t *root) +{ + uint t_length, nod_flag; + MARIA_KEY_PARAM s_temp; + MARIA_SHARE *share= info->s; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + my_bool res= 0; + DBUG_ENTER("_ma_enlarge_root"); + + page.info= info; + page.keyinfo= keyinfo; + page.buff= info->buff; + page.flag= 0; + + nod_flag= (*root != HA_OFFSET_ERROR) ? 
share->base.key_reflength : 0; + /* Store pointer to prev page if nod */ + _ma_kpointer(info, page.buff + share->keypage_header, *root); + t_length= (*keyinfo->pack_key)(key, nod_flag, (uchar*) 0, + (uchar*) 0, (uchar*) 0, &s_temp); + page.size= share->keypage_header + t_length + nod_flag; + + bzero(page.buff, share->keypage_header); + _ma_store_keynr(share, page.buff, keyinfo->key_nr); + if (nod_flag) + page.flag|= KEYPAGE_FLAG_ISNOD; + if (key->flag & (SEARCH_USER_KEY_HAS_TRANSID | SEARCH_PAGE_KEY_HAS_TRANSID)) + page.flag|= KEYPAGE_FLAG_HAS_TRANSID; + (*keyinfo->store_key)(keyinfo, page.buff + share->keypage_header + + nod_flag, &s_temp); + + /* Mark that info->buff was used */ + info->keyread_buff_used= info->page_changed= 1; + if ((page.pos= _ma_new(info, PAGECACHE_PRIORITY_HIGH, &page_link)) == + HA_OFFSET_ERROR) + DBUG_RETURN(1); + *root= page.pos; + + page_store_info(share, &page); + + /* + Clear uninitialized part of page to avoid valgrind/purify warnings + and to get a clean page that is easier to compress and compare with + pages generated with redo + */ + bzero(page.buff + page.size, share->block_size - page.size); + + if (share->now_transactional && _ma_log_new(&page, 1)) + res= 1; + + if (_ma_write_keypage(&page, page_link->write_lock, + PAGECACHE_PRIORITY_HIGH)) + res= 1; + + DBUG_RETURN(res); +} /* _ma_enlarge_root */ + + +/* + Search after a position for a key and store it there + + TODO: + Change this to use pagecache directly instead of creating a copy + of the page. To do this, we must however change write-key-on-page + algorithm to not overwrite the buffer but instead store any overflow + key in a separate buffer. 
+ + @return + @retval -1 error + @retval 0 ok + @retval > 0 Key should be stored in higher tree +*/ + +static int w_search(register MARIA_HA *info, uint32 comp_flag, MARIA_KEY *key, + my_off_t page_pos, + MARIA_PAGE *father_page, uchar *father_keypos, + my_bool insert_last) +{ + int error,flag; + uchar *temp_buff,*keypos; + uchar keybuff[MARIA_MAX_KEY_BUFF]; + my_bool was_last_key; + my_off_t next_page, dup_key_pos; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + DBUG_ENTER("w_search"); + DBUG_PRINT("enter", ("page: %lu", (ulong) (page_pos/keyinfo->block_length))); + + if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ + MARIA_MAX_KEY_BUFF*2))) + DBUG_RETURN(-1); + if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, temp_buff, 0)) + goto err; + + flag= (*keyinfo->bin_search)(key, &page, comp_flag, &keypos, + keybuff, &was_last_key); + if (flag == 0) + { + MARIA_KEY tmp_key; + /* Exact match found: get position of the row with the duplicated key */ + + tmp_key.keyinfo= keyinfo; + tmp_key.data= keybuff; + + if ((*keyinfo->get_key)(&tmp_key, page.flag, page.node, &keypos)) + dup_key_pos= _ma_row_pos_from_key(&tmp_key); + else + dup_key_pos= HA_OFFSET_ERROR; + + if (keyinfo->flag & HA_FULLTEXT) + { + uint off; + int subkeys; + + get_key_full_length_rdonly(off, keybuff); + subkeys=ft_sintXkorr(keybuff+off); + comp_flag=SEARCH_SAME; + if (subkeys >= 0) + { + /* normal word, one-level tree structure */ + flag=(*keyinfo->bin_search)(key, &page, comp_flag, + &keypos, keybuff, &was_last_key); + } + else + { + /* popular word. two-level tree. 
going down */ + my_off_t root=dup_key_pos; + keyinfo= &share->ft2_keyinfo; + get_key_full_length_rdonly(off, key); + key+=off; + /* we'll modify key entry 'in vivo' */ + keypos-= keyinfo->keylength + page.node; + error= _ma_ck_real_write_btree(info, key, &root, comp_flag); + _ma_dpointer(share, keypos+HA_FT_WLEN, root); + subkeys--; /* should there be underflow protection ? */ + DBUG_ASSERT(subkeys < 0); + ft_intXstore(keypos, subkeys); + if (!error) + { + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + my_afree(temp_buff); + DBUG_RETURN(error); + } + } + else /* not HA_FULLTEXT, normal HA_NOSAME key */ + { + /* + TODO + When the index will support true versioning - with multiple + identical values in the UNIQUE index, invisible to each other - + the following should be changed to "continue inserting keys, at the + end (of the row or statement) wait". We need to wait on *all* + unique conflicts at once, not one-at-a-time, because we need to + know all blockers in advance, otherwise we'll have incomplete wait-for + graph. + */ + /* + transaction that has inserted the conflicting key may be in progress. + the caller will wait for it to be committed or aborted. + */ + info->dup_key_trid= _ma_trid_from_key(&tmp_key); + info->dup_key_pos= dup_key_pos; + my_errno= HA_ERR_FOUND_DUPP_KEY; + DBUG_PRINT("warning", + ("Duplicate key. dup_key_trid: %lu pos %lu visible: %d", + (ulong) info->dup_key_trid, + (ulong) info->dup_key_pos, + info->trn ? 
trnman_can_read_from(info->trn, + info->dup_key_trid) : 2)); + goto err; + } + } + if (flag == MARIA_FOUND_WRONG_KEY) + goto err; + if (!was_last_key) + insert_last=0; + next_page= _ma_kpos(page.node, keypos); + if (next_page == HA_OFFSET_ERROR || + (error= w_search(info, comp_flag, key, next_page, + &page, keypos, insert_last)) > 0) + { + error= _ma_insert(info, key, &page, keypos, keybuff, + father_page, father_keypos, insert_last); + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + my_afree(temp_buff); + DBUG_RETURN(error); +err: + my_afree(temp_buff); + DBUG_PRINT("exit",("Error: %d",my_errno)); + DBUG_RETURN(-1); +} /* w_search */ + + +/* + Insert new key. + + SYNOPSIS + _ma_insert() + info Open table information. + keyinfo Key definition information. + key New key + anc_page Key page (beginning) + key_pos Position in key page where to insert. + key_buff Copy of previous key if keys were packed. + father_page position of parent key page in file. + father_key_pos position in parent key page for balancing. + insert_last If to append at end of page. + + DESCRIPTION + Insert new key at right of key_pos. + Note that caller must save anc_buff + + This function writes log records for all changed pages + (Including anc_buff and father page) + + RETURN + < 0 Error. 
+ 0 OK + 1 If key contains key to upper level (from balance page) + 2 If key contains key to upper level (from split space) +*/ + +int _ma_insert(register MARIA_HA *info, MARIA_KEY *key, + MARIA_PAGE *anc_page, uchar *key_pos, uchar *key_buff, + MARIA_PAGE *father_page, uchar *father_key_pos, + my_bool insert_last) +{ + uint a_length, nod_flag, org_anc_length; + int t_length; + uchar *endpos, *prev_key, *anc_buff; + MARIA_KEY_PARAM s_temp; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + DBUG_ENTER("_ma_insert"); + DBUG_PRINT("enter",("key_pos: 0x%lx", (ulong) key_pos)); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key);); + + /* + Note that anc_page->size can be bigger than block_size in case of + delete key that caused increase of page length + */ + org_anc_length= a_length= anc_page->size; + nod_flag= anc_page->node; + + anc_buff= anc_page->buff; + endpos= anc_buff+ a_length; + prev_key= (key_pos == anc_buff + share->keypage_header + nod_flag ? + (uchar*) 0 : key_buff); + t_length= (*keyinfo->pack_key)(key, nod_flag, + (key_pos == endpos ? 
(uchar*) 0 : key_pos), + prev_key, prev_key, &s_temp); +#ifndef DBUG_OFF + if (prev_key && (keyinfo->flag & (HA_BINARY_PACK_KEY | HA_PACK_KEY))) + { + DBUG_DUMP("prev_key", prev_key, _ma_keylength(keyinfo,prev_key)); + } + if (keyinfo->flag & HA_PACK_KEY) + { + DBUG_PRINT("test",("t_length: %d ref_len: %d", + t_length,s_temp.ref_length)); + DBUG_PRINT("test",("n_ref_len: %d n_length: %d key_pos: 0x%lx", + s_temp.n_ref_length, s_temp.n_length, (long) s_temp.key)); + } +#endif + if (t_length > 0) + { + if (t_length >= keyinfo->maxlength*2+MAX_POINTER_LENGTH) + { + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(-1); + } + bmove_upp(endpos+t_length, endpos, (uint) (endpos-key_pos)); + } + else + { + if (-t_length >= keyinfo->maxlength*2+MAX_POINTER_LENGTH) + { + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(-1); + } + bmove(key_pos,key_pos-t_length,(uint) (endpos-key_pos)+t_length); + } + (*keyinfo->store_key)(keyinfo,key_pos,&s_temp); + a_length+=t_length; + + if (key->flag & (SEARCH_USER_KEY_HAS_TRANSID | SEARCH_PAGE_KEY_HAS_TRANSID)) + { + _ma_mark_page_with_transid(share, anc_page); + } + anc_page->size= a_length; + page_store_size(share, anc_page); + + /* + Check if the new key fits totally into the page + (anc_buff is big enough to contain a full page + one key) + */ + if (a_length <= share->max_index_block_size) + { + if (share->max_index_block_size - a_length < 32 && + (keyinfo->flag & HA_FULLTEXT) && key_pos == endpos && + share->base.key_reflength <= share->base.rec_reflength && + share->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) + { + /* + Normal word. One-level tree. Page is almost full. + Let's consider converting. 
+ We'll compare 'key' and the first key at anc_buff + */ + const uchar *a= key->data; + const uchar *b= anc_buff + share->keypage_header + nod_flag; + uint alen, blen, ft2len= share->ft2_keyinfo.keylength; + /* the very first key on the page is always unpacked */ + DBUG_ASSERT((*b & 128) == 0); +#if HA_FT_MAXLEN >= 127 + blen= mi_uint2korr(b); b+=2; + When you enable this code, as part of the MyISAM->Maria merge of +ChangeSet@1.2562, 2008-04-09 07:41:40+02:00, serg@janus.mylan +9 -0 + restore ft2 functionality, fix bugs. + Then this will enable two-level fulltext index, which is not totally + recoverable yet. + So remove this text and inform Guilhem so that he fixes the issue. +#else + blen= *b++; +#endif + get_key_length(alen,a); + DBUG_ASSERT(info->ft1_to_ft2==0); + if (alen == blen && + ha_compare_text(keyinfo->seg->charset, a, alen, + b, blen, 0, 0) == 0) + { + /* Yup. converting */ + info->ft1_to_ft2=(DYNAMIC_ARRAY *) + my_malloc(sizeof(DYNAMIC_ARRAY), MYF(MY_WME)); + my_init_dynamic_array(info->ft1_to_ft2, ft2len, 300, 50); + + /* + Now, adding all keys from the page to dynarray + if the page is a leaf (if not keys will be deleted later) + */ + if (!nod_flag) + { + /* + Let's leave the first key on the page, though, because + we cannot easily dispatch an empty page here + */ + b+=blen+ft2len+2; + for (a=anc_buff+a_length ; b < a ; b+=ft2len+2) + insert_dynamic(info->ft1_to_ft2, b); + + /* fixing the page's length - it contains only one key now */ + anc_page->size= share->keypage_header + blen + ft2len + 2; + page_store_size(share, anc_page); + } + /* the rest will be done when we're back from recursion */ + } + } + else + { + if (share->now_transactional && + _ma_log_add(anc_page, org_anc_length, + key_pos, s_temp.changed_length, t_length, 1, + KEY_OP_DEBUG_LOG_ADD_1)) + DBUG_RETURN(-1); + } + DBUG_RETURN(0); /* There is room on page */ + } + /* Page is full */ + if (nod_flag) + insert_last=0; + /* + TODO: + Remove 'born_transactional' here. 
+ The only reason for having it here is that the current + _ma_balance_page_ can't handle variable length keys. + */ + if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) && + father_page && !insert_last && !info->quick_mode && + !info->s->base.born_transactional) + { + s_temp.key_pos= key_pos; + page_mark_changed(info, father_page); + DBUG_RETURN(_ma_balance_page(info, keyinfo, key, anc_page, + father_page, father_key_pos, + &s_temp)); + } + DBUG_RETURN(_ma_split_page(info, key, anc_page, + min(org_anc_length, + info->s->max_index_block_size), + key_pos, s_temp.changed_length, t_length, + key_buff, insert_last)); +} /* _ma_insert */ + + +/** + @brief split a full page in two and assign emerging item to key + + @fn _ma_split_page() + info Maria handler + keyinfo Key handler + key Buffer for middle key + split_page Page that should be split + org_split_length Original length of split_page before key was inserted + inserted_key_pos Address in buffer where key was inserted + changed_length Number of bytes changed at 'inserted_key_pos' + move_length Number of bytes buffer was moved when key was inserted + key_buff Key buffer to use for temporary storage of key + insert_last_key If we are inserting the key on the rightmost key page + + @note + split_buff is not stored on disk (caller has to do this) + + @return + @retval 2 ok (Middle key up from _ma_insert()) + @retval -1 error +*/ + +int _ma_split_page(MARIA_HA *info, MARIA_KEY *key, MARIA_PAGE *split_page, + uint org_split_length, + uchar *inserted_key_pos, uint changed_length, + int move_length, + uchar *key_buff, my_bool insert_last_key) +{ + uint length,a_length,key_ref_length,t_length,nod_flag,key_length; + uint page_length, split_length, page_flag; + uchar *key_pos,*pos, *after_key; + MARIA_KEY_PARAM s_temp; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_KEY tmp_key; + MARIA_PAGE new_page; + int res; + 
DBUG_ENTER("_ma_split_page"); + + LINT_INIT(after_key); + DBUG_DUMP("buff", split_page->buff, split_page->size); + + info->page_changed=1; /* Info->buff is used */ + info->keyread_buff_used=1; + page_flag= split_page->flag; + nod_flag= split_page->node; + key_ref_length= share->keypage_header + nod_flag; + + new_page.info= info; + new_page.buff= info->buff; + new_page.keyinfo= keyinfo; + + tmp_key.data= key_buff; + tmp_key.keyinfo= keyinfo; + if (insert_last_key) + key_pos= _ma_find_last_pos(&tmp_key, split_page, &after_key); + else + key_pos= _ma_find_half_pos(&tmp_key, split_page, &after_key); + if (!key_pos) + DBUG_RETURN(-1); + + key_length= tmp_key.data_length + tmp_key.ref_length; + split_length= (uint) (key_pos - split_page->buff); + a_length= split_page->size; + split_page->size= split_length; + page_store_size(share, split_page); + + key_pos=after_key; + if (nod_flag) + { + DBUG_PRINT("test",("Splitting nod")); + pos=key_pos-nod_flag; + memcpy(new_page.buff + share->keypage_header, pos, (size_t) nod_flag); + } + + /* Move middle item to key and pointer to new page */ + if ((new_page.pos= _ma_new(info, PAGECACHE_PRIORITY_HIGH, &page_link)) == + HA_OFFSET_ERROR) + DBUG_RETURN(-1); + + _ma_copy_key(key, &tmp_key); + _ma_kpointer(info, key->data + key_length, new_page.pos); + + /* Store new page */ + if (!(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &key_pos)) + DBUG_RETURN(-1); + + t_length=(*keyinfo->pack_key)(&tmp_key, nod_flag, (uchar *) 0, + (uchar*) 0, (uchar*) 0, &s_temp); + length=(uint) ((split_page->buff + a_length) - key_pos); + memcpy(new_page.buff + key_ref_length + t_length, key_pos, + (size_t) length); + (*keyinfo->store_key)(keyinfo,new_page.buff+key_ref_length,&s_temp); + page_length= length + t_length + key_ref_length; + + bzero(new_page.buff, share->keypage_header); + /* Copy KEYPAGE_FLAG_ISNOD and KEYPAGE_FLAG_HAS_TRANSID from the split page */ + new_page.flag= page_flag; + new_page.size= page_length; + page_store_info(share, 
&new_page); + + /* Copy key number */ + new_page.buff[share->keypage_header - KEYPAGE_USED_SIZE - + KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE]= + split_page->buff[share->keypage_header - KEYPAGE_USED_SIZE - + KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE]; + + res= 2; /* Middle key up */ + if (share->now_transactional && _ma_log_new(&new_page, 0)) + res= -1; + + /* + Clear uninitialized part of page to avoid valgrind/purify warnings + and to get a clean page that is easier to compress and compare with + pages generated with redo + */ + bzero(new_page.buff + page_length, share->block_size - page_length); + + if (_ma_write_keypage(&new_page, page_link->write_lock, + DFLT_INIT_HITS)) + res= -1; + + /* Save changes to split pages */ + if (share->now_transactional && + _ma_log_split(split_page, org_split_length, split_length, + inserted_key_pos, changed_length, move_length, + KEY_OP_NONE, (uchar*) 0, 0, 0)) + res= -1; + + DBUG_DUMP_KEY("middle_key", key); + DBUG_RETURN(res); +} /* _ma_split_page */ + + +/* + Calculate how much to move to split a page in two + + Returns pointer to start of key. + key will contain the key. 
+ return_key_length will contain the length of key + after_key will contain the position to where the next key starts +*/ + +uchar *_ma_find_half_pos(MARIA_KEY *key, MARIA_PAGE *ma_page, + uchar **after_key) +{ + uint keys, length, key_ref_length, page_flag, nod_flag; + uchar *page, *end, *lastpos; + MARIA_HA *info= ma_page->info; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + DBUG_ENTER("_ma_find_half_pos"); + + nod_flag= ma_page->node; + key_ref_length= share->keypage_header + nod_flag; + page_flag= ma_page->flag; + length= ma_page->size - key_ref_length; + page= ma_page->buff+ key_ref_length; /* Point to first key */ + + if (!(keyinfo->flag & + (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY | + HA_BINARY_PACK_KEY)) && !(page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { + key_ref_length= keyinfo->keylength+nod_flag; + key->data_length= keyinfo->keylength - info->s->rec_reflength; + key->ref_length= info->s->rec_reflength; + key->flag= 0; + keys=length/(key_ref_length*2); + end=page+keys*key_ref_length; + *after_key=end+key_ref_length; + memcpy(key->data, end, key_ref_length); + DBUG_RETURN(end); + } + + end=page+length/2-key_ref_length; /* This is approx. half */ + key->data[0]= 0; /* Safety */ + do + { + lastpos=page; + if (!(length= (*keyinfo->get_key)(key, page_flag, nod_flag, &page))) + DBUG_RETURN(0); + } while (page < end); + *after_key= page; + DBUG_PRINT("exit",("returns: 0x%lx page: 0x%lx half: 0x%lx", + (long) lastpos, (long) page, (long) end)); + DBUG_RETURN(lastpos); +} /* _ma_find_half_pos */ + + +/** + Find second to last key on leaf page + + @notes + Used to split buffer at last key. In this case the next to last + key will be moved to parent page and last key will be on its own page. 
+
+  @TODO
+  Add one argument for 'last key value' to get_key so that one can
+  do the loop without having to copy the found key the whole time
+
+  @return
+  @retval Pointer to the start of the key before the last key
+  @retval int_key will contain the last key
+*/
+
+static uchar *_ma_find_last_pos(MARIA_KEY *int_key, MARIA_PAGE *ma_page,
+                                uchar **after_key)
+{
+  uint keys, length, key_ref_length, page_flag;
+  uchar *page, *end, *lastpos, *prevpos;
+  uchar key_buff[MARIA_MAX_KEY_BUFF];
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= int_key->keyinfo;
+  MARIA_KEY tmp_key;
+  DBUG_ENTER("_ma_find_last_pos");
+
+  key_ref_length= share->keypage_header;
+  page_flag= ma_page->flag;
+  length= ma_page->size - key_ref_length;
+  page= ma_page->buff + key_ref_length;
+
+  if (!(keyinfo->flag &
+        (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY |
+         HA_BINARY_PACK_KEY)) && !(page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+  {
+    /* Fixed size keys: the second to last key is found by arithmetic */
+    keys= length / keyinfo->keylength - 2;
+    length= keyinfo->keylength;
+    int_key->data_length= length - info->s->rec_reflength;
+    int_key->ref_length= info->s->rec_reflength;
+    int_key->flag= 0;
+    end=page+keys*length;
+    *after_key=end+length;
+    memcpy(int_key->data, end, length);
+    DBUG_RETURN(end);
+  }
+
+  end=page+length-key_ref_length;
+  lastpos=page;
+  tmp_key.data= key_buff;
+  tmp_key.keyinfo= int_key->keyinfo;
+  key_buff[0]= 0;                               /* Safety */
+
+  /* We know that there are at least 2 keys on the page */
+
+  if (!(length=(*keyinfo->get_key)(&tmp_key, page_flag, 0, &page)))
+  {
+    my_errno=HA_ERR_CRASHED;
+    DBUG_RETURN(0);
+  }
+
+  do
+  {
+    prevpos=lastpos; lastpos=page;
+    int_key->data_length= tmp_key.data_length;
+    int_key->ref_length= tmp_key.ref_length;
+    int_key->flag= tmp_key.flag;
+    memcpy(int_key->data, key_buff, length); /* previous key */
+    if (!(length=(*keyinfo->get_key)(&tmp_key, page_flag, 0, &page)))
+    {
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(0);
+    }
+  } while (page < end);
+
+  *after_key=lastpos;
+  DBUG_PRINT("exit",("returns: 0x%lx page: 0x%lx end: 0x%lx",
+                     (long) prevpos,(long) page,(long) end));
+  DBUG_RETURN(prevpos);
+} /* _ma_find_last_pos */
+
+
+/**
+  @brief Balance page with static size keys with page on right/left
+
+  @param info                Maria handler. info->buff is used to read the
+                             neighbour page; the area starting at
+                             info->buff + share->base.max_key_block_length
+                             is used to build the extra page on a split
+  @param keyinfo             Key definition; keyinfo->keylength is used as
+                             the (static) length of every key
+  @param key                 Middle key will be stored here
+  @param curr_page           Page that is balanced against a neighbour
+  @param father_page         Parent page of curr_page
+  @param father_key_pos      Position in father_page->buff from which the
+                             parting key and the neighbour page are found
+  @param s_temp              Its key_pos, changed_length and move_length
+                             are only passed on to the log functions
+
+  @notes
+    Father_buff will always be changed
+    Caller must handle saving of curr_buff
+
+  @return
+  @retval  0   Balance was done (father buff is saved)
+  @retval  1   Middle key up    (father buff is not saved)
+  @retval  -1  Error
+*/
+
+static int _ma_balance_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+                            MARIA_KEY *key, MARIA_PAGE *curr_page,
+                            MARIA_PAGE *father_page,
+                            uchar *father_key_pos, MARIA_KEY_PARAM *s_temp)
+{
+  MARIA_PINNED_PAGE tmp_page_link, *new_page_link= &tmp_page_link;
+  MARIA_SHARE *share= info->s;
+  my_bool right;
+  uint k_length,father_length,father_keylength,nod_flag,curr_keylength;
+  uint right_length,left_length,new_right_length,new_left_length,extra_length;
+  uint keys, tmp_length, extra_buff_length;
+  uchar *pos, *extra_buff, *parting_key;
+  uchar tmp_part_key[MARIA_MAX_KEY_BUFF];
+  MARIA_PAGE next_page, extra_page, *left_page, *right_page;
+  DBUG_ENTER("_ma_balance_page");
+
+  k_length= keyinfo->keylength;
+  father_length= father_page->size;
+  father_keylength= k_length + share->base.key_reflength;
+  nod_flag= curr_page->node;
+  curr_keylength= k_length+nod_flag;
+  info->page_changed=1;
+
+  if ((father_key_pos != father_page->buff+father_length &&
+       (info->state->records & 1)) ||
+      father_key_pos == father_page->buff+ share->keypage_header +
+      share->base.key_reflength)
+  {
+    right=1;
+    next_page.pos= _ma_kpos(share->base.key_reflength,
+                            father_key_pos+father_keylength);
+    left_page= curr_page;
+    right_page= &next_page;
+    DBUG_PRINT("info", ("use right page: %lu",
+                        (ulong) (next_page.pos / keyinfo->block_length)));
+  }
+  else
+  {
+    right=0;
+    father_key_pos-=father_keylength;
+    next_page.pos= _ma_kpos(share->base.key_reflength,father_key_pos);
+    left_page= &next_page;
+    right_page= curr_page;
+    DBUG_PRINT("info", ("use left page: %lu",
+                        (ulong) (next_page.pos / keyinfo->block_length)));
+  }                                     /* father_key_pos ptr to parting key */
+
+  if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos,
+                        PAGECACHE_LOCK_WRITE,
+                        DFLT_INIT_HITS, info->buff, 0))
+    goto err;
+  page_mark_changed(info, &next_page);
+  DBUG_DUMP("next", next_page.buff, next_page.size);
+
+  /* Test if there is room to share keys */
+  left_length= left_page->size;
+  right_length= right_page->size;
+  keys= ((left_length+right_length-share->keypage_header*2-nod_flag*2)/
+         curr_keylength);
+
+  if ((right ? right_length : left_length) + curr_keylength <=
+      share->max_index_block_size)
+  {
+    /* Enough space to hold all keys in the two buffers; Balance buffers */
+    new_left_length= share->keypage_header+nod_flag+(keys/2)*curr_keylength;
+    new_right_length=share->keypage_header+nod_flag+(((keys+1)/2)*
+                                                       curr_keylength);
+    left_page->size= new_left_length;
+    page_store_size(share, left_page);
+    right_page->size= new_right_length;
+    page_store_size(share, right_page);
+
+    DBUG_PRINT("info", ("left_length: %u -> %u right_length: %u -> %u",
+                        left_length, new_left_length,
+                        right_length, new_right_length));
+    if (left_length < new_left_length)
+    {
+      uint length;
+      DBUG_PRINT("info", ("move keys to end of buff"));
+
+      /*
+        Move keys right_page -> left_page
+
+        NOTE(review): the final bmove() below copies new_right_length bytes
+        although its destination starts share->keypage_header bytes into
+        the page; 'new_right_length - share->keypage_header' looks like the
+        intended count. Confirm before changing.
+      */
+      pos= left_page->buff+left_length;
+      memcpy(pos,father_key_pos, (size_t) k_length);
+      memcpy(pos+k_length, right_page->buff + share->keypage_header,
+             (size_t) (length=new_left_length - left_length - k_length));
+      pos= right_page->buff + share->keypage_header + length;
+      memcpy(father_key_pos, pos, (size_t) k_length);
+      bmove(right_page->buff + share->keypage_header,
+            pos + k_length, new_right_length);
+
+      if (share->now_transactional)
+      {
+        if (right)
+        {
+          /*
+            Log changes to page on left
+            The original page is on the left and stored in left_page->buff
+            We have on the page the newly inserted key and data
+            from buff added last on the page
+          */
+          if (_ma_log_split(curr_page,
+                            left_length - s_temp->move_length,
+                            new_left_length,
+                            s_temp->key_pos, s_temp->changed_length,
+                            s_temp->move_length,
+                            KEY_OP_ADD_SUFFIX,
+                            curr_page->buff + left_length,
+                            new_left_length - left_length,
+                            new_left_length - left_length+ k_length))
+            goto err;
+          /*
+            Log changes to page on right
+            This contains the original data with some keys deleted from
+            start of page
+          */
+          if (_ma_log_prefix(&next_page, 0,
+                             ((int) new_right_length - (int) right_length),
+                             KEY_OP_DEBUG_LOG_PREFIX_3))
+            goto err;
+        }
+        else
+        {
+          /*
+            Log changes to page on right (the original page) which is in buff
+            Data is removed from start of page
+            The inserted key may be in buff or moved to curr_buff
+          */
+          if (_ma_log_del_prefix(curr_page,
+                                 right_length - s_temp->changed_length,
+                                 new_right_length,
+                                 s_temp->key_pos, s_temp->changed_length,
+                                 s_temp->move_length))
+            goto err;
+          /*
+            Log changes to page on left, which has new data added last
+          */
+          if (_ma_log_suffix(&next_page, left_length, new_left_length))
+            goto err;
+        }
+      }
+    }
+    else
+    {
+      uint length;
+      DBUG_PRINT("info", ("move keys to start of right_page"));
+
+      bmove_upp(right_page->buff + new_right_length,
+                right_page->buff + right_length,
+                right_length - share->keypage_header);
+      length= new_right_length -right_length - k_length;
+      memcpy(right_page->buff + share->keypage_header + length, father_key_pos,
+             (size_t) k_length);
+      pos= left_page->buff + new_left_length;
+      memcpy(father_key_pos, pos, (size_t) k_length);
+      memcpy(right_page->buff + share->keypage_header, pos+k_length,
+             (size_t) length);
+
+      if (share->now_transactional)
+      {
+        if (right)
+        {
+          /*
+            Log changes to page on left
+            The original page is on the left and stored in curr_buff
+            The page is shortened from end and the key may be on the page
+          */
+          if (_ma_log_split(curr_page,
+                            left_length - s_temp->move_length,
+                            new_left_length,
+                            s_temp->key_pos, s_temp->changed_length,
+                            s_temp->move_length,
+                            KEY_OP_NONE, (uchar*) 0, 0, 0))
+            goto err;
+          /*
+            Log changes to page on right
+            This contains the original data, with some data from curr_buff
+            added first
+          */
+          if (_ma_log_prefix(&next_page,
+                             (uint) (new_right_length - right_length),
+                             (int) (new_right_length - right_length),
+                             KEY_OP_DEBUG_LOG_PREFIX_4))
+            goto err;
+        }
+        else
+        {
+          /*
+            Log changes to page on right (the original page) which is in buff
+            We have on the page the newly inserted key and data
+            from buff added first on the page
+          */
+          uint diff_length= new_right_length - right_length;
+          if (_ma_log_split(curr_page,
+                            left_length - s_temp->move_length,
+                            new_right_length,
+                            s_temp->key_pos + diff_length,
+                            s_temp->changed_length,
+                            s_temp->move_length,
+                            KEY_OP_ADD_PREFIX,
+                            curr_page->buff + share->keypage_header,
+                            diff_length, diff_length + k_length))
+            goto err;
+          /*
+            Log changes to page on left, which is shortened from end
+          */
+          if (_ma_log_suffix(&next_page, left_length, new_left_length))
+            goto err;
+        }
+      }
+    }
+
+    /* Log changes to father (one level up) page */
+
+    if (share->now_transactional &&
+        _ma_log_change(father_page, father_key_pos, k_length,
+                       KEY_OP_DEBUG_FATHER_CHANGED_1))
+      goto err;
+
+    /*
+      next_page_link->changed is marked as true above and fathers
+      page_link->changed is marked as true in caller
+    */
+    if (_ma_write_keypage(&next_page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                          DFLT_INIT_HITS) ||
+        _ma_write_keypage(father_page,
+                          PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
+      goto err;
+    DBUG_RETURN(0);
+  }
+
+  /* left_page and right_page are full, let's split and make a new node */
+
+  extra_buff= info->buff+share->base.max_key_block_length;
+  new_left_length= new_right_length= (share->keypage_header + nod_flag +
+                                      (keys+1) / 3 * curr_keylength);
+  extra_page.info= info;
+  extra_page.keyinfo= keyinfo;
+  extra_page.buff= extra_buff;
+
+  /*
+    5 is the minimum number of keys we can have here. This comes from
+    the fact that each full page can store at least 2 keys and in this case
+    we have a 'split' key, ie 2+2+1 = 5
+  */
+  if (keys == 5)                                /* Too few keys to balance */
+    new_left_length-=curr_keylength;
+  extra_length= (nod_flag + left_length + right_length -
+                 new_left_length - new_right_length - curr_keylength);
+  extra_buff_length= extra_length + share->keypage_header;
+  DBUG_PRINT("info",("left_length: %d right_length: %d new_left_length: %d new_right_length: %d extra_length: %d",
+                     left_length, right_length,
+                     new_left_length, new_right_length,
+                     extra_length));
+
+  left_page->size= new_left_length;
+  page_store_size(share, left_page);
+  right_page->size= new_right_length;
+  page_store_size(share, right_page);
+
+  bzero(extra_buff, share->keypage_header);
+  extra_page.flag= nod_flag ? KEYPAGE_FLAG_ISNOD : 0;
+  extra_page.size= extra_buff_length;
+  page_store_info(share, &extra_page);
+
+  /* Copy key number */
+  extra_buff[share->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_KEYID_SIZE -
+             KEYPAGE_FLAG_SIZE]= keyinfo->key_nr;
+
+  /* Move the largest keys (end of right page) to the new page */
+  pos= right_page->buff + right_length-extra_length;
+  memcpy(extra_buff + share->keypage_header, pos, extra_length);
+  /* Zero old data from buffer */
+  bzero(extra_buff + extra_buff_length,
+        share->block_size - extra_buff_length);
+
+  /* Save new parting key between buff and extra_buff */
+  memcpy(tmp_part_key, pos-k_length,k_length);
+  /* Make place for new keys */
+  bmove_upp(right_page->buff + new_right_length, pos - k_length,
+            right_length - extra_length - k_length - share->keypage_header);
+  /* Copy keys from left page */
+  pos= left_page->buff + new_left_length;
+  memcpy(right_page->buff + share->keypage_header, pos + k_length,
+         (size_t) (tmp_length= left_length - new_left_length - k_length));
+  /* Copy old parting key */
+  parting_key= right_page->buff + share->keypage_header + tmp_length;
+  memcpy(parting_key, father_key_pos, (size_t) k_length);
+
+  /* Move new parting keys up to caller */
+  memcpy((right ? key->data : father_key_pos),pos,(size_t) k_length);
+  memcpy((right ? father_key_pos : key->data),tmp_part_key, k_length);
+
+  if ((extra_page.pos= _ma_new(info, DFLT_INIT_HITS, &new_page_link))
+      == HA_OFFSET_ERROR)
+    goto err;
+  _ma_kpointer(info,key->data+k_length, extra_page.pos);
+  /* This is safe as long as we are not using keys with transid */
+  key->data_length= k_length - info->s->rec_reflength;
+  key->ref_length= info->s->rec_reflength;
+
+  if (right)
+  {
+    /*
+      Page order according to key values:
+      original_page (curr_page = left_page), next_page (buff), extra_buff
+
+      Move page positions so that we store data in extra_page where
+      next_page was and next_page will be stored at the new position
+    */
+    swap_variables(my_off_t, extra_page.pos, next_page.pos);
+  }
+
+  if (share->now_transactional)
+  {
+    if (right)
+    {
+      /*
+        left_page is shortened,
+        right_page is getting new keys at start and shortened from end.
+        extra_page is new page
+
+        Note that extra_page (largest key parts) will be stored at the
+        place of the original 'right' page (next_page) and right page
+        will be stored at the new page position
+
+        This makes the log entries smaller as right_page contains all
+        data to generate the data extra_buff
+      */
+
+      /*
+        Log changes to page on left (page shortened at end)
+      */
+      if (_ma_log_split(curr_page,
+                        left_length - s_temp->move_length, new_left_length,
+                        s_temp->key_pos, s_temp->changed_length,
+                        s_temp->move_length,
+                        KEY_OP_NONE, (uchar*) 0, 0, 0))
+        goto err;
+      /*
+        Log changes to right page (stored at next page)
+        This contains the last 'extra_buff' from 'buff'
+      */
+      if (_ma_log_prefix(&extra_page,
+                         0, (int) (extra_buff_length - right_length),
+                         KEY_OP_DEBUG_LOG_PREFIX_5))
+        goto err;
+
+      /*
+        Log changes to middle page, which is stored at the new page
+        position
+      */
+      if (_ma_log_new(&next_page, 0))
+        goto err;
+    }
+    else
+    {
+      /*
+        Log changes to page on right (the original page) which is in buff
+        This contains the original data, with some data from curr_buff
+        added first and shortened at end
+      */
+      int data_added_first= left_length - new_left_length;
+      if (_ma_log_key_middle(right_page,
+                             new_right_length,
+                             data_added_first,
+                             data_added_first,
+                             extra_length,
+                             s_temp->key_pos,
+                             s_temp->changed_length,
+                             s_temp->move_length))
+        goto err;
+
+      /* Log changes to page on left, which is shortened from end */
+      if (_ma_log_suffix(left_page, left_length, new_left_length))
+        goto err;
+
+      /* Log change to rightmost (new) page */
+      if (_ma_log_new(&extra_page, 0))
+        goto err;
+    }
+
+    /* Log changes to father (one level up) page */
+    if (share->now_transactional &&
+        _ma_log_change(father_page, father_key_pos, k_length,
+                       KEY_OP_DEBUG_FATHER_CHANGED_2))
+      goto err;
+  }
+
+  if (_ma_write_keypage(&next_page,
+                        (right ? new_page_link->write_lock :
+                         PAGECACHE_LOCK_LEFT_WRITELOCKED),
+                        DFLT_INIT_HITS) ||
+      _ma_write_keypage(&extra_page,
+                        (!right ? new_page_link->write_lock :
+                         PAGECACHE_LOCK_LEFT_WRITELOCKED),
+                        DFLT_INIT_HITS))
+    goto err;
+
+  DBUG_RETURN(1);                               /* Middle key up */
+
+err:
+  DBUG_RETURN(-1);
+} /* _ma_balance_page */
+
+
+/**********************************************************************
+ *                Bulk insert code                                    *
+ **********************************************************************/
+
+typedef struct {
+  MARIA_HA *info;                       /* Handler the tree belongs to */
+  uint keynr;                           /* Index number in share->keyinfo[] */
+} bulk_insert_param;
+
+/*
+  Put a key into the bulk insert tree of the key's index instead of
+  writing it to the index directly.
+
+  Returns non-zero if tree_insert() returned 0, otherwise 0.
+*/
+static my_bool _ma_ck_write_tree(register MARIA_HA *info, MARIA_KEY *key)
+{
+  my_bool error;
+  uint keynr= key->keyinfo->key_nr;
+  DBUG_ENTER("_ma_ck_write_tree");
+
+  /* Store ref_length as this is always constant */
+  info->bulk_insert_ref_length= key->ref_length;
+  error= tree_insert(&info->bulk_insert[keynr], key->data,
+                     key->data_length + key->ref_length,
+                     info->bulk_insert[keynr].custom_arg) == 0;
+  DBUG_RETURN(error);
+} /* _ma_ck_write_tree */
+
+
+/*
+  Compare callback given to init_tree() in maria_init_bulk_insert().
+  Compares two whole keys of index param->keynr with ha_key_cmp().
+  typeof(keys_compare)=qsort_cmp2
+*/
+
+static int keys_compare(bulk_insert_param *param, uchar *key1, uchar *key2)
+{
+  uint not_used[2];
+  return ha_key_cmp(param->info->s->keyinfo[param->keynr].seg,
+                    key1, key2, USE_WHOLE_KEY, SEARCH_SAME,
+                    not_used);
+}
+
+/*
+  Element callback given to init_tree() in maria_init_bulk_insert().
+
+  free_init   Write-locks the key root and increments keyinfo->version
+              (only if share->lock_key_trees)
+  free_free   Copies the key and writes it with _ma_ck_write_btree();
+              returns the result of that call
+  free_end    Releases the lock taken for free_init
+
+  Returns 0 for free_init and free_end, 1 for any other mode.
+*/
+static int keys_free(uchar *key, TREE_FREE mode, bulk_insert_param *param)
+{
+  /*
+    Probably I can use info->lastkey here, but I'm not sure,
+    and to be safe I'd better use local lastkey.
+  */
+  MARIA_SHARE *share= param->info->s;
+  uchar lastkey[MARIA_MAX_KEY_BUFF];
+  uint keylen;
+  MARIA_KEYDEF *keyinfo= share->keyinfo + param->keynr;
+  MARIA_KEY tmp_key;
+
+  switch (mode) {
+  case free_init:
+    if (share->lock_key_trees)
+    {
+      rw_wrlock(&keyinfo->root_lock);
+      keyinfo->version++;
+    }
+    return 0;
+  case free_free:
+    /* Note: keylen doesn't contain transid lengths */
+    keylen= _ma_keylength(keyinfo, key);
+    tmp_key.data= lastkey;
+    tmp_key.keyinfo= keyinfo;
+    tmp_key.data_length= keylen - share->rec_reflength;
+    tmp_key.ref_length= param->info->bulk_insert_ref_length;
+    tmp_key.flag= (param->info->bulk_insert_ref_length ==
+                   share->rec_reflength ? 0 : SEARCH_USER_KEY_HAS_TRANSID);
+    /*
+      We have to copy key as ma_ck_write_btree may need the buffer for
+      copying middle key up if tree is growing
+    */
+    memcpy(lastkey, key, tmp_key.data_length + tmp_key.ref_length);
+    return _ma_ck_write_btree(param->info, &tmp_key);
+  case free_end:
+    if (share->lock_key_trees)
+      rw_unlock(&keyinfo->root_lock);
+    return 0;
+  }
+  return 1;
+}
+
+/*
+  Set up bulk insert trees for the indexes of a table.
+
+  A tree is created for every index that is not HA_NOSAME, is not the
+  auto increment key and is active in share->state.key_map.
+  info->bulk_insert is allocated as share->base.keys TREE objects followed
+  by one bulk_insert_param per tree that is used.
+
+  Returns 0 if trees were set up, and also if bulk insert is not used
+  (no index qualifies, or cache_size is smaller than
+  num_keys * MARIA_MIN_SIZE_BULK_INSERT_TREE).
+  Returns HA_ERR_OUT_OF_MEM if the allocation failed.
+*/
+int maria_init_bulk_insert(MARIA_HA *info, ulong cache_size, ha_rows rows)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *key=share->keyinfo;
+  bulk_insert_param *params;
+  uint i, num_keys, total_keylength;
+  ulonglong key_map;
+  DBUG_ENTER("_ma_init_bulk_insert");
+  DBUG_PRINT("enter",("cache_size: %lu", cache_size));
+
+  DBUG_ASSERT(!info->bulk_insert &&
+              (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT));
+
+  maria_clear_all_keys_active(key_map);
+  for (i=total_keylength=num_keys=0 ; i < share->base.keys ; i++)
+  {
+    if (! (key[i].flag & HA_NOSAME) && (share->base.auto_key != i + 1) &&
+        maria_is_key_active(share->state.key_map, i))
+    {
+      num_keys++;
+      maria_set_key_active(key_map, i);
+      total_keylength+=key[i].maxlength+TREE_ELEMENT_EXTRA_SIZE;
+    }
+  }
+
+  if (num_keys==0 ||
+      num_keys * MARIA_MIN_SIZE_BULK_INSERT_TREE > cache_size)
+    DBUG_RETURN(0);
+
+  if (rows && rows*total_keylength < cache_size)
+    cache_size= (ulong)rows;
+  else
+    cache_size/=total_keylength*16;
+
+  info->bulk_insert=(TREE *)
+    my_malloc((sizeof(TREE)*share->base.keys+
+               sizeof(bulk_insert_param)*num_keys),MYF(0));
+
+  if (!info->bulk_insert)
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+
+  params=(bulk_insert_param *)(info->bulk_insert+share->base.keys);
+  for (i=0 ; i < share->base.keys ; i++)
+  {
+    if (maria_is_key_active(key_map, i))
+    {
+      params->info=info;
+      params->keynr=i;
+      /* Only allocate a 16'th of the buffer at a time */
+      init_tree(&info->bulk_insert[i],
+                cache_size * key[i].maxlength,
+                cache_size * key[i].maxlength, 0,
+                (qsort_cmp2)keys_compare, 0,
+                (tree_element_free) keys_free, (void *)params++);
+    }
+    else
+      info->bulk_insert[i].root=0;      /* No tree for this index */
+  }
+
+  DBUG_RETURN(0);
+}
+
+void maria_flush_bulk_insert(MARIA_HA *info, uint inx)
+{
+  if (info->bulk_insert)                /* Bulk insert trees are allocated */
+  {
+    if (is_tree_inited(&info->bulk_insert[inx]))
+      reset_tree(&info->bulk_insert[inx]);
+  }
+}
+
+void maria_end_bulk_insert(MARIA_HA *info)
+{
+  DBUG_ENTER("maria_end_bulk_insert");
+  if (info->bulk_insert)
+  {
+    uint i;
+    for (i=0 ; i < info->s->base.keys ; i++)
+    {
+      if (is_tree_inited(&info->bulk_insert[i]))
+      {
+        if (info->s->deleting)
+          reset_free_element(&info->bulk_insert[i]);
+        delete_tree(&info->bulk_insert[i]);
+      }
+    }
+    my_free(info->bulk_insert, MYF(0));
+    info->bulk_insert= 0;               /* Bulk insert is no longer active */
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/****************************************************************************
+  Dedicated functions that generate log entries
+****************************************************************************/
+
+/*
+  Write a LOGREC_UNDO_KEY_INSERT record for an inserted key.
+
+  The record holds info->trn->undo_lsn, the key number and the key itself.
+  root and new_root are handed to the log write hook in 'msg'. If the key
+  belongs to the auto increment index, the auto increment value found in
+  the key is handed over in 'msg' too.
+
+  Returns -1 if translog_write_record() reported an error, otherwise 0.
+*/
+int _ma_write_undo_key_insert(MARIA_HA *info, const MARIA_KEY *key,
+                              my_off_t *root, my_off_t new_root, LSN *res_lsn)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                 KEY_NR_STORE_SIZE];
+  const uchar *key_value;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+  struct st_msg_to_write_hook_for_undo_key msg;
+  uint key_length;
+
+  /* Save if we need to write a clr record */
+  lsn_store(log_data, info->trn->undo_lsn);
+  key_nr_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
+               keyinfo->key_nr);
+  key_length= key->data_length + key->ref_length;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key->data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= key_length;
+
+  msg.root= root;
+  msg.value= new_root;
+  msg.auto_increment= 0;
+  key_value= key->data;
+  if (share->base.auto_key == ((uint) keyinfo->key_nr + 1))
+  {
+    const HA_KEYSEG *keyseg= keyinfo->seg;
+    uchar reversed[MARIA_MAX_KEY_BUFF];
+    if (keyseg->flag & HA_SWAP_KEY)
+    {
+      /* We put key from log record to "data record" packing format... */
+      const uchar *key_ptr= key->data, *key_end= key->data + keyseg->length;
+      uchar *to= reversed + keyseg->length;
+      do
+      {
+        *--to= *key_ptr++;
+      } while (key_ptr != key_end);
+      key_value= to;
+    }
+    /* ... so that we can read it with: */
+    msg.auto_increment=
+      ma_retrieve_auto_increment(key_value, keyseg->type);
+    /* and write_hook_for_undo_key_insert() will pick this. */
+  }
+
+  return translog_write_record(res_lsn, LOGREC_UNDO_KEY_INSERT,
+                               info->trn, info,
+                               (translog_size_t)
+                               log_array[TRANSLOG_INTERNAL_PARTS + 0].length +
+                               key_length,
+                               TRANSLOG_INTERNAL_PARTS + 2, log_array,
+                               log_data + LSN_STORE_SIZE, &msg) ? -1 : 0;
+}
+
+
+/**
+  @brief Log creation of new page
+
+  @param ma_page    The new page; everything after the first LSN_STORE_SIZE
+                    bytes of ma_page->buff, up to ma_page->size, is logged
+  @param root_page  Stored as one byte last in the log header
+
+  @note
+    We don't have to store the page_length into the log entry as we can
+    calculate this from the length of the log entry
+
+  @retval 1   error
+  @retval 0   ok
+*/
+
+my_bool _ma_log_new(MARIA_PAGE *ma_page, my_bool root_page)
+{
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE
+                 +1];
+  uint page_length;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  my_off_t page= ma_page->pos / share->block_size;
+  DBUG_ENTER("_ma_log_new");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) page));
+
+  DBUG_ASSERT(share->now_transactional);
+
+  /* Store page number of the new page */
+  page_store(log_data + FILEID_STORE_SIZE, page);
+
+  /* Store link to next unused page */
+  if (info->key_del_used == 2)
+    page= 0;                                    /* key_del not changed */
+  else
+    page= ((share->key_del_current == HA_OFFSET_ERROR) ? IMPOSSIBLE_PAGE_NO :
+           share->key_del_current / share->block_size);
+
+  page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page);
+  key_nr_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE*2,
+               ma_page->keyinfo->key_nr);
+  log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE*2 + KEY_NR_STORE_SIZE]=
+    (uchar) root_page;
+
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+  page_length= ma_page->size - LSN_STORE_SIZE;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str= ma_page->buff + LSN_STORE_SIZE;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= page_length;
+
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  if (translog_write_record(&lsn, LOGREC_REDO_INDEX_NEW_PAGE,
+                            info->trn, info,
+                            (translog_size_t)
+                            (sizeof(log_data) + page_length),
+                            TRANSLOG_INTERNAL_PARTS + 2, log_array,
+                            log_data, NULL))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief
+  Log when some part of the key page
changes
+*/
+
+my_bool _ma_log_change(MARIA_PAGE *ma_page, const uchar *key_pos, uint length,
+                       enum en_key_debug debug_marker __attribute__((unused)))
+{
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 6 + 7], *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+  uint offset= (uint) (key_pos - ma_page->buff), translog_parts;
+  MARIA_HA *info= ma_page->info;
+  my_off_t page= ma_page->pos / info->s->block_size;
+  DBUG_ENTER("_ma_log_change");
+  DBUG_PRINT("enter", ("page: %lu length: %u", (ulong) page, length));
+
+  DBUG_ASSERT(info->s->now_transactional);
+  DBUG_ASSERT(offset + length <= ma_page->size);
+  DBUG_ASSERT(ma_page->org_size == ma_page->size);
+
+  /*
+    Store the number of the page that is changed
+    (same value as 'page' was initialized with above)
+  */
+  page= ma_page->pos / info->s->block_size;
+  page_store(log_data + FILEID_STORE_SIZE, page);
+  log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+  (*log_pos++)= KEY_OP_DEBUG;
+  (*log_pos++)= debug_marker;
+#endif
+
+  log_pos[0]= KEY_OP_OFFSET;            /* Position on page of the change */
+  int2store(log_pos+1, offset);
+  log_pos[3]= KEY_OP_CHANGE;            /* 'length' bytes are replaced there */
+  int2store(log_pos+4, length);
+  log_pos+= 6;
+
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (log_pos - log_data);
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos;  /* The new bytes */
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
+  translog_parts= 2;
+
+  _ma_log_key_changes(ma_page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &length, &translog_parts);
+
+  if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                            info->trn, info,
+                            (translog_size_t) (log_pos - log_data) + length,
+                            TRANSLOG_INTERNAL_PARTS + translog_parts,
+                            log_array, log_data, NULL))
+    DBUG_RETURN(1);                     /* Error */
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Write log entry for page splitting
+
+  @fn     _ma_log_split()
+  @param
+     ma_page           Page that is changed
+     org_length        Original length of page.
Can be bigger than block_size + for block that overflowed + new_length New length of page + key_pos Where key is inserted on page (may be 0 if no key) + key_length Number of bytes changed at key_pos + move_length Number of bytes moved at key_pos to make room for key + prefix_or_suffix KEY_OP_NONE Ignored + KEY_OP_ADD_PREFIX Add data to start of page + KEY_OP_ADD_SUFFIX Add data to end of page + data What data was added + data_length Number of bytes added first or last + changed_length Number of bytes changed first or last. + + @note + Write log entry for page that has got a key added to the page under + one and only one of the following senarios: + - Page is shortened from end + - Data is added to end of page + - Data added at front of page +*/ + +static my_bool _ma_log_split(MARIA_PAGE *ma_page, + uint org_length, uint new_length, + const uchar *key_pos, uint key_length, + int move_length, enum en_key_op prefix_or_suffix, + const uchar *data, uint data_length, + uint changed_length) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 3+3+3+3+3+2 +7]; + uchar *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6]; + uint offset= (uint) (key_pos - ma_page->buff); + uint translog_parts, extra_length; + MARIA_HA *info= ma_page->info; + my_off_t page= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_split"); + DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u", + (ulong) page, org_length, new_length)); + + DBUG_ASSERT(changed_length >= data_length); + DBUG_ASSERT(org_length <= info->s->max_index_block_size); + DBUG_ASSERT(new_length == ma_page->size); + DBUG_ASSERT(org_length == ma_page->org_size); + + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + (*log_pos++)= KEY_OP_DEBUG; + (*log_pos++)= KEY_OP_DEBUG_LOG_SPLIT; +#endif + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= 
ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET]; + + if (new_length <= offset || !key_pos) + { + /* + Page was split before inserted key. Write redo entry where + we just cut current page at page_length + */ + uint length_offset= org_length - new_length; + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, length_offset); + log_pos+= 3; + translog_parts= 1; + extra_length= 0; + DBUG_ASSERT(data_length == 0); + } + else + { + /* Key was added to page which was split after the inserted key */ + uint max_key_length; + + /* + Handle case when split happened directly after the newly inserted key. + */ + max_key_length= new_length - offset; + extra_length= min(key_length, max_key_length); + if (offset + move_length > new_length) + { + /* This is true when move_length includes changes for next packed key */ + move_length= new_length - offset; + } + + if ((int) new_length < (int) (org_length + move_length + data_length)) + { + /* Shorten page */ + uint diff= org_length + move_length + data_length - new_length; + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos + 1, diff); + log_pos+= 3; + DBUG_ASSERT(data_length == 0); /* Page is shortened */ + DBUG_ASSERT(offset <= org_length - diff); + } + else + { + DBUG_ASSERT(new_length == org_length + move_length + data_length); + DBUG_ASSERT(offset <= org_length); + } + + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos+= 3; + + if (move_length) + { + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, move_length); + log_pos+= 3; + } + + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, extra_length); + log_pos+= 3; + + /* Point to original inserted key data */ + if (prefix_or_suffix == KEY_OP_ADD_PREFIX) + key_pos+= data_length; + + translog_parts= 2; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extra_length; + } + + if (data_length) + { + /* Add prefix or suffix */ + log_pos[0]= prefix_or_suffix; + int2store(log_pos+1, data_length); + log_pos+= 3; + if 
(prefix_or_suffix == KEY_OP_ADD_PREFIX) + { + int2store(log_pos+1, changed_length); + log_pos+= 2; + data_length= changed_length; + } + log_array[TRANSLOG_INTERNAL_PARTS + translog_parts].str= data; + log_array[TRANSLOG_INTERNAL_PARTS + translog_parts].length= data_length; + translog_parts++; + extra_length+= data_length; + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + + _ma_log_key_changes(ma_page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entires for same page */ + ma_page->org_size= ma_page->size; + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +/** + @brief + Write log entry for page that has got a key added to the page + and page is shortened from start of page + + @fn _ma_log_del_prefix() + @param info Maria handler + @param page Page number + @param buff Page buffer + @param org_length Length of buffer when read + @param new_length Final length + @param key_pos Where on page buffer key was added. 
This is position
+                      before prefix was removed
+  @param key_length   How many bytes was changed at 'key_pos'
+  @param move_length  How many bytes was moved up when key was added
+
+  @return
+  @retval  0  ok
+  @retval  1  error
+*/
+
+static my_bool _ma_log_del_prefix(MARIA_PAGE *ma_page,
+                                  uint org_length, uint new_length,
+                                  const uchar *key_pos, uint key_length,
+                                  int move_length)
+{
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 12 + 7];
+  uchar *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+  uint offset= (uint) (key_pos - ma_page->buff);
+  uint diff_length= org_length + move_length - new_length;
+  uint translog_parts, extra_length;
+  MARIA_HA *info= ma_page->info;
+  my_off_t page= ma_page->pos / info->s->block_size;
+  DBUG_ENTER("_ma_log_del_prefix");
+  DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u",
+                       (ulong) page, org_length, new_length));
+
+  DBUG_ASSERT((int) diff_length > 0);
+  DBUG_ASSERT(ma_page->org_size == org_length);
+  DBUG_ASSERT(ma_page->size == new_length);
+
+  log_pos= log_data + FILEID_STORE_SIZE;
+  page_store(log_pos, page);
+  log_pos+= PAGE_STORE_SIZE;
+
+  translog_parts= 1;                    /* Only the header so far */
+  extra_length= 0;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+  *log_pos++= KEY_OP_DEBUG;
+  *log_pos++= KEY_OP_DEBUG_LOG_DEL_PREFIX;
+#endif
+
+  /* Store keypage_flag */
+  *log_pos++= KEY_OP_SET_PAGEFLAG;
+  *log_pos++= ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET];
+
+  if (offset < diff_length + info->s->keypage_header)
+  {
+    /*
+      Key is not on the page anymore. Move data down, but take into account
+      that the original page had grown with 'move_length' bytes
+    */
+    DBUG_ASSERT(offset + key_length <= diff_length + info->s->keypage_header);
+
+    log_pos[0]= KEY_OP_DEL_PREFIX;
+    int2store(log_pos+1, diff_length - move_length);
+    log_pos+= 3;
+  }
+  else
+  {
+    /*
+      Correct position to key, as data before key has been deleted and key
+      has thus been moved down
+    */
+    offset-= diff_length;
+    key_pos-= diff_length;
+
+    /* Move data down */
+    log_pos[0]= KEY_OP_DEL_PREFIX;
+    int2store(log_pos+1, diff_length);
+    log_pos+= 3;
+
+    log_pos[0]= KEY_OP_OFFSET;
+    int2store(log_pos+1, offset);
+    log_pos+= 3;
+
+    if (move_length)
+    {
+      log_pos[0]= KEY_OP_SHIFT;
+      int2store(log_pos+1, move_length);
+      log_pos+= 3;
+    }
+    log_pos[0]= KEY_OP_CHANGE;
+    int2store(log_pos+1, key_length);
+    log_pos+= 3;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= key_length;
+    translog_parts= 2;                  /* Header + changed key bytes */
+    extra_length= key_length;
+  }
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                         log_data);
+  _ma_log_key_changes(ma_page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &extra_length, &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                                    info->trn, info,
+                                    (translog_size_t)
+                                    log_array[TRANSLOG_INTERNAL_PARTS +
+                                              0].length + extra_length,
+                                    TRANSLOG_INTERNAL_PARTS + translog_parts,
+                                    log_array, log_data, NULL));
+}
+
+
+/**
+  @brief
+  Write log entry for page that has got data added first and
+  data deleted last.
Old changed key may be part of page +*/ + +static my_bool _ma_log_key_middle(MARIA_PAGE *ma_page, + uint new_length, + uint data_added_first, + uint data_changed_first, + uint data_deleted_last, + const uchar *key_pos, + uint key_length, int move_length) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 3+5+3+3+3 + 7]; + uchar *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6]; + uint key_offset; + uint translog_parts, extra_length; + MARIA_HA *info= ma_page->info; + my_off_t page= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_key_middle"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page)); + + DBUG_ASSERT(ma_page->size == new_length); + + /* new place of key after changes */ + key_pos+= data_added_first; + key_offset= (uint) (key_pos - ma_page->buff); + if (key_offset < new_length) + { + /* key is on page; Calculate how much of the key is there */ + uint max_key_length= new_length - key_offset; + if (max_key_length < key_length) + { + /* Key is last on page */ + key_length= max_key_length; + move_length= 0; + } + /* + Take into account that new data was added as part of original key + that also needs to be removed from page + */ + data_deleted_last+= move_length; + } + + /* First log changes to page */ + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + *log_pos++= KEY_OP_DEBUG; + *log_pos++= KEY_OP_DEBUG_LOG_MIDDLE; +#endif + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET]; + + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, data_deleted_last); + log_pos+= 3; + + log_pos[0]= KEY_OP_ADD_PREFIX; + int2store(log_pos+1, data_added_first); + int2store(log_pos+3, data_changed_first); + log_pos+= 5; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + 
log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (ma_page->buff + + info->s->keypage_header); + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_changed_first; + translog_parts= 2; + extra_length= data_changed_first; + + /* If changed key is on page, log those changes too */ + + if (key_offset < new_length) + { + uchar *start_log_pos= log_pos; + + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, key_offset); + log_pos+= 3; + if (move_length) + { + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, move_length); + log_pos+= 3; + } + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, key_length); + log_pos+= 3; + + log_array[TRANSLOG_INTERNAL_PARTS + 2].str= start_log_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 2].length= (uint) (log_pos - + start_log_pos); + + log_array[TRANSLOG_INTERNAL_PARTS + 3].str= key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 3].length= key_length; + translog_parts+=2; + extra_length+= (uint) (log_array[TRANSLOG_INTERNAL_PARTS + 2].length + + key_length); + } + + _ma_log_key_changes(ma_page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entires for same page */ + ma_page->org_size= ma_page->size; + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + (log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length), + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +#ifdef NOT_NEEDED + +/** + @brief + Write log entry for page that has got data added first and + data deleted last +*/ + +static my_bool _ma_log_middle(MARIA_PAGE *ma_page, + uint data_added_first, uint data_changed_first, + uint data_deleted_last) +{ + LSN lsn; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 3 + 5 + 7], *log_pos; + MARIA_HA *info= ma_page->info; + my_off_t page= ma_page->page / info->s->block_size; + uint translog_parts, 
extra_length; + DBUG_ENTER("_ma_log_middle"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page)); + + DBUG_ASSERT(ma_page->org_size + data_added_first - data_deleted_last == + ma_page->size); + + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + log_pos[0]= KEY_OP_DEL_PREFIX; + int2store(log_pos+1, data_deleted_last); + log_pos+= 3; + + log_pos[0]= KEY_OP_ADD_PREFIX; + int2store(log_pos+1, data_added_first); + int2store(log_pos+3, data_changed_first); + log_pos+= 5; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= ((char*) buff + + info->s->keypage_header); + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_changed_first; + translog_parts= 2; + extra_length= data_changed_first; + + _ma_log_key_changes(ma_page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entires for same page */ + ma_page->org_size= ma_page->size; + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} +#endif diff --git a/storage/maria/maria_chk.c b/storage/maria/maria_chk.c new file mode 100644 index 00000000000..4e19d5878ea --- /dev/null +++ b/storage/maria/maria_chk.c @@ -0,0 +1,2008 @@ +/* Copyright (C) 2006-2003 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Describe, check and repair of MARIA tables */ + +#include "ma_fulltext.h" +#include <myisamchk.h> +#include <my_bit.h> +#include <m_ctype.h> +#include <stdarg.h> +#include <my_getopt.h> +#ifdef HAVE_SYS_VADVICE_H +#include <sys/vadvise.h> +#endif +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif +SET_STACK_SIZE(9000) /* Minimum stack size for program */ + +#ifndef USE_RAID +#define my_raid_create(A,B,C,D,E,F,G) my_create(A,B,C,G) +#define my_raid_delete(A,B,C) my_delete(A,B) +#endif + +static uint decode_bits; +static char **default_argv; +static const char *load_default_groups[]= { "aria_chk", 0 }; +static const char *set_collation_name, *opt_tmpdir, *opt_log_dir; +static CHARSET_INFO *set_collation; +static int stopwords_inited= 0; +static MY_TMPDIR maria_chk_tmpdir; +static my_bool opt_transaction_logging, opt_debug, opt_require_control_file; +static my_bool opt_warning_for_wrong_transid; + +static const char *type_names[]= +{ + "impossible","char","binary", "short", "long", "float", + "double","number","unsigned short", + "unsigned long","longlong","ulonglong","int24", + "uint24","int8","varchar", "varbin", "varchar2", "varbin2", "bit", + "?","?" +}; + +static const char *prefix_packed_txt="packed ", + *bin_packed_txt="prefix ", + *diff_txt="stripped ", + *null_txt="NULL", + *blob_txt="BLOB "; + +static const char *field_pack[]= +{ + "","no endspace", "no prespace", + "no zeros", "blob", "constant", "table-lockup", + "always zero","varchar","unique-hash","?","?" +}; + +static const char *record_formats[]= +{ + "Fixed length", "Packed", "Compressed", "Block", "?" 
+}; + +static const char *bitmap_description[]= +{ + "Empty page", "Part filled head page","Part filled head page", + "Part filled head page", "Full head page", + "Part filled tail page","Part filled tail page", + "Full tail or blob page" +}; + +static const char *maria_stats_method_str="nulls_unequal"; +static char default_open_errmsg[]= "%d when opening Aria table '%s'"; +static char default_close_errmsg[]= "%d when closing Aria table '%s'"; + +static void get_options(int *argc,char * * *argv); +static void print_version(void); +static void usage(void); +static int maria_chk(HA_CHECK *param, char *filename); +static void descript(HA_CHECK *param, register MARIA_HA *info, char *name); +static int maria_sort_records(HA_CHECK *param, register MARIA_HA *info, + char *name, uint sort_key, + my_bool write_info, my_bool update_index); +static int sort_record_index(MARIA_SORT_PARAM *sort_param, MARIA_PAGE *page, + uint sortkey, File new_file, + my_bool update_index); +static my_bool write_log_record(HA_CHECK *param); + +HA_CHECK check_param; + + /* Main program */ + +int main(int argc, char **argv) +{ + int error; + MY_INIT(argv[0]); + + opt_log_dir= maria_data_root= (char *)"."; + maria_chk_init(&check_param); + check_param.opt_lock_memory= 1; /* Lock memory if possible */ + check_param.using_global_keycache = 0; + get_options(&argc,(char***) &argv); + maria_quick_table_bits=decode_bits; + error=0; + maria_init(); + + maria_block_size= 0; /* Use block size from control file */ + if (ma_control_file_open(FALSE, opt_require_control_file || + !(check_param.testflag & T_SILENT)) && + (opt_require_control_file || + (opt_transaction_logging && (check_param.testflag & T_REP_ANY)))) + { + error= 1; + goto end; + } + + /* + If we are doing a repair, user may want to store this repair into the log + so that the log has a complete history and can be used to replay. 
+ */ + if (opt_transaction_logging && (check_param.testflag & T_REP_ANY)) + { + if (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, MY_WME) == 0 || + translog_init(opt_log_dir, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS, 0)) + { + _ma_check_print_error(&check_param, + "Can't initialize transaction logging. Run " + "recovery with switch --skip-transaction-log"); + error= 1; + goto end; + } + } + + while (--argc >= 0) + { + int new_error=maria_chk(&check_param, *(argv++)); + if ((check_param.testflag & T_REP_ANY) != T_REP) + check_param.testflag&= ~T_REP; + VOID(fflush(stdout)); + VOID(fflush(stderr)); + if ((check_param.error_printed | check_param.warning_printed) && + (check_param.testflag & T_FORCE_CREATE) && + (!(check_param.testflag & (T_REP | T_REP_BY_SORT | T_SORT_RECORDS | + T_SORT_INDEX)))) + { + ulonglong old_testflag=check_param.testflag; + if (!(check_param.testflag & T_REP)) + check_param.testflag|= T_REP_BY_SORT; + check_param.testflag&= ~T_EXTEND; /* Not needed */ + error|=maria_chk(&check_param, argv[-1]); + check_param.testflag= old_testflag; + VOID(fflush(stdout)); + VOID(fflush(stderr)); + } + else + error|=new_error; + if (argc && (!(check_param.testflag & T_SILENT) || + check_param.testflag & T_INFO)) + { + puts("\n---------\n"); + VOID(fflush(stdout)); + } + } +end: + if (check_param.total_files > 1) + { /* Only if descript */ + char buff[22],buff2[22]; + if (!(check_param.testflag & T_SILENT) || check_param.testflag & T_INFO) + puts("\n---------"); + printf("\nTotal of all %d Aria-files:\nData records: %9s Deleted blocks: %9s\n",check_param.total_files,llstr(check_param.total_records,buff), + llstr(check_param.total_deleted,buff2)); + } + free_defaults(default_argv); + free_tmpdir(&maria_chk_tmpdir); + maria_end(); + my_end(check_param.testflag & T_INFO ? 
+ MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR); + exit(error); +#ifndef _lint + return 0; /* No compiler warning */ +#endif +} /* main */ + +enum options_mc { + OPT_CHARSETS_DIR=256, OPT_SET_COLLATION,OPT_START_CHECK_POS, + OPT_CORRECT_CHECKSUM, OPT_PAGE_BUFFER_SIZE, + OPT_KEY_CACHE_BLOCK_SIZE, OPT_MARIA_BLOCK_SIZE, + OPT_READ_BUFFER_SIZE, OPT_WRITE_BUFFER_SIZE, OPT_SORT_BUFFER_SIZE, + OPT_SORT_KEY_BLOCKS, OPT_DECODE_BITS, OPT_FT_MIN_WORD_LEN, + OPT_FT_MAX_WORD_LEN, OPT_FT_STOPWORD_FILE, + OPT_MAX_RECORD_LENGTH, OPT_AUTO_CLOSE, OPT_STATS_METHOD, OPT_TRANSACTION_LOG, + OPT_SKIP_SAFEMALLOC, OPT_ZEROFILL_KEEP_LSN, OPT_REQUIRE_CONTROL_FILE, + OPT_LOG_DIR, OPT_DATADIR, OPT_WARNING_FOR_WRONG_TRANSID +}; + +static struct my_option my_long_options[] = +{ + {"analyze", 'a', + "Analyze distribution of keys. Will make some joins in MySQL faster. You can check the calculated distribution.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifdef __NETWARE__ + {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"block-search", 'b', + "No help available.", + 0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"backup", 'B', + "Make a backup of the .MAD file as 'filename-time.BAK'.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"character-sets-dir", OPT_CHARSETS_DIR, + "Directory where character sets are.", + (char**) &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"check", 'c', + "Check table for errors.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"check-only-changed", 'C', + "Check only tables that have changed since last check. It also applies to other requested actions (e.g. 
--analyze will be ignored if the table is already analyzed).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"correct-checksum", OPT_CORRECT_CHECKSUM, + "Correct checksum information for table.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', + "Output debug log. Often this is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"description", 'd', + "Prints some information about table.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"data-file-length", 'D', + "Max length of data file (when recreating data-file when it's full).", + &check_param.max_data_file_length, + &check_param.max_data_file_length, + 0, GET_LL, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"extend-check", 'e', + "If used when checking a table, ensure that the table is 100 percent consistent, which will take a long time. If used when repairing a table, try to recover every possible row from the data file. Normally this will also find a lot of garbage rows; Don't use this option with repair if you are not totally desperate.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"fast", 'F', + "Check only tables that haven't been closed properly. It also applies to other requested actions (e.g. --analyze will be ignored if the table is already analyzed).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"force", 'f', + "Restart with -r if there are any errors in the table. States will be updated as with --update-state.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"HELP", 'H', + "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"help", '?', + "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"information", 'i', + "Print statistics information about table that is checked.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"keys-used", 'k', + "Tell Aria to update only some specific keys. # is a bit mask of which keys to use. 
This can be used to get faster inserts.", + &check_param.keys_in_use, + &check_param.keys_in_use, + 0, GET_ULL, REQUIRED_ARG, -1, 0, 0, 0, 0, 0}, + {"datadir", OPT_DATADIR, + "Path for control file (and logs if --logdir not used).", + &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG, + 0, 0, 0, 0, 0, 0}, + {"logdir", OPT_LOG_DIR, + "Path for log files.", + (char**) &opt_log_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"max-record-length", OPT_MAX_RECORD_LENGTH, + "Skip rows bigger than this if aria_chk can't allocate memory to hold it", + &check_param.max_record_length, + &check_param.max_record_length, + 0, GET_ULL, REQUIRED_ARG, LONGLONG_MAX, 0, LONGLONG_MAX, 0, 0, 0}, + {"medium-check", 'm', + "Faster than extend-check, but only finds 99.99% of all errors. Should be good enough for most cases.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"quick", 'q', "Faster repair by not modifying the data file.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"read-only", 'T', + "Don't mark table as checked.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"recover", 'r', + "Can fix almost anything except unique keys that aren't unique.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"parallel-recover", 'p', + "Same as '-r' but creates all the keys in parallel.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"safe-recover", 'o', + "Uses old recovery method; Slower than '-r' but can handle a couple of cases where '-r' reports that it can't fix the data file.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"sort-recover", 'n', + "Force recovering with sorting even if the temporary file was very big.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { "require-control-file", OPT_REQUIRE_CONTROL_FILE, + "Abort if cannot find control file", + (uchar**)&opt_require_control_file, 0, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, +#ifdef DEBUG + {"start-check-pos", OPT_START_CHECK_POS, + "No help available.", + 0, 0, 0, GET_ULL, 
REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"set-auto-increment", 'A', + "Force auto_increment to start at this or higher value. If no value is given, then sets the next auto_increment value to the highest used value for the auto key + 1.", + &check_param.auto_increment_value, + &check_param.auto_increment_value, + 0, GET_ULL, OPT_ARG, 0, 0, 0, 0, 0, 0}, + {"set-collation", OPT_SET_COLLATION, + "Change the collation used by the index", + (char**) &set_collation_name, 0, 0, GET_STR, REQUIRED_ARG, + 0, 0, 0, 0, 0, 0}, + {"silent", 's', + "Only print errors. One can use two -s to make aria_chk very silent.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF +#ifdef SAFEMALLOC + {"skip-safemalloc", OPT_SKIP_SAFEMALLOC, + "Don't use the memory allocation checking.", 0, 0, 0, GET_NO_ARG, NO_ARG, + 0, 0, 0, 0, 0, 0}, +#endif +#endif + {"sort-index", 'S', + "Sort index blocks. This speeds up 'read-next' in applications.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"sort-records", 'R', + "Sort records according to an index. This makes your data much more localized and may speed up things. (It may be VERY slow to do a sort the first time!)", + &check_param.opt_sort_key, + &check_param.opt_sort_key, + 0, GET_UINT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"tmpdir", 't', "Path for temporary files.", (char**) &opt_tmpdir, + 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"transaction-log", OPT_TRANSACTION_LOG, + "Log repair command to transaction log", + &opt_transaction_logging, &opt_transaction_logging, + 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"update-state", 'U', + "Mark tables as crashed if any errors were found and clean if check didn't " + "find any errors. This allows one to get rid of warnings like 'table not " + "properly closed'", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"unpack", 'u', + "Unpack file packed with aria_pack.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', + "Print more information. 
This can be used with --description and --check. Use many -v for more verbosity!", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"wait", 'w', "Wait if table is locked.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"warning-for-wrong-transaction-id", OPT_WARNING_FOR_WRONG_TRANSID, + "Give a warning if we find a transaction id in the table that is bigger" + "than what exists in the control file. Use --skip-... to disable warning", + &opt_warning_for_wrong_transid, &opt_warning_for_wrong_transid, + 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0}, + { "page_buffer_size", OPT_PAGE_BUFFER_SIZE, + "Size of page buffer. Used by --safe-repair", + &check_param.use_buffers, &check_param.use_buffers, 0, + GET_ULONG, REQUIRED_ARG, (long) USE_BUFFER_INIT, 1024L*1024L, + (long) ~0L, (long) MALLOC_OVERHEAD, (long) IO_SIZE, 0}, + { "read_buffer_size", OPT_READ_BUFFER_SIZE, + "Read buffer size for sequential reads during scanning", + &check_param.read_buffer_length, + &check_param.read_buffer_length, 0, GET_ULONG, REQUIRED_ARG, + (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD, + (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0}, + { "write_buffer_size", OPT_WRITE_BUFFER_SIZE, + "Write buffer size for sequential writes during repair of fixed size or dynamic size rows", + &check_param.write_buffer_length, + &check_param.write_buffer_length, 0, GET_ULONG, REQUIRED_ARG, + (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD, + (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0}, + { "sort_buffer_size", OPT_SORT_BUFFER_SIZE, + "Size of sort buffer. 
Used by --recover", + &check_param.sort_buffer_length, + &check_param.sort_buffer_length, 0, GET_ULONG, REQUIRED_ARG, + (long) SORT_BUFFER_INIT, (long) (MIN_SORT_BUFFER + MALLOC_OVERHEAD), + (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0}, + { "sort_key_blocks", OPT_SORT_KEY_BLOCKS, + "Internal buffer for sorting keys; Don't touch :)", + &check_param.sort_key_blocks, + &check_param.sort_key_blocks, 0, GET_ULONG, REQUIRED_ARG, + BUFFERS_WHEN_SORTING, 4L, 100L, 0L, 1L, 0}, + { "decode_bits", OPT_DECODE_BITS, "", &decode_bits, + &decode_bits, 0, GET_UINT, REQUIRED_ARG, 9L, 4L, 17L, 0L, 1L, 0}, + { "ft_min_word_len", OPT_FT_MIN_WORD_LEN, "", &ft_min_word_len, + &ft_min_word_len, 0, GET_ULONG, REQUIRED_ARG, 4, 1, HA_FT_MAXCHARLEN, + 0, 1, 0}, + { "ft_max_word_len", OPT_FT_MAX_WORD_LEN, "", &ft_max_word_len, + &ft_max_word_len, 0, GET_ULONG, REQUIRED_ARG, HA_FT_MAXCHARLEN, 10, + HA_FT_MAXCHARLEN, 0, 1, 0}, + { "aria_ft_stopword_file", OPT_FT_STOPWORD_FILE, + "Use stopwords from this file instead of built-in list.", + (char**) &ft_stopword_file, (char**) &ft_stopword_file, 0, GET_STR, + REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + { "stats_method", OPT_STATS_METHOD, + "Specifies how index statistics collection code should treat NULLs. 
" + "Possible values of name are \"nulls_unequal\" (default behavior for 4.1/5.0), " + "\"nulls_equal\" (emulate 4.0 behavior), and \"nulls_ignored\".", + (char**) &maria_stats_method_str, (char**) &maria_stats_method_str, 0, + GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + { "zerofill", 'z', + "Fill empty space in data and index files with zeroes,", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { "zerofill-keep-lsn", OPT_ZEROFILL_KEEP_LSN, + "Like --zerofill but does not zero out LSN of data/index pages;" + " used only for testing and debugging", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +#include <help_start.h> + +static void print_version(void) +{ + printf("%s Ver 1.0 for %s at %s\n", my_progname, SYSTEM_TYPE, + MACHINE_TYPE); + NETWARE_SET_SCREEN_MODE(1); +} + + +static void usage(void) +{ + print_version(); + puts("By Monty, for your professional use"); + puts("This software comes with NO WARRANTY: see the PUBLIC for details.\n"); + puts("Description, check and repair of Aria tables."); + puts("Used without options all tables on the command will be checked for errors"); + printf("Usage: %s [OPTIONS] tables[.MAI]\n", my_progname_short); + printf("\nGlobal options:\n"); +#ifndef DBUG_OFF + printf("\ + -#, --debug=... Output debug log. Often this is 'd:t:o,filename'.\n"); +#endif + printf("\ + -H, --HELP Display this help and exit.\n\ + -?, --help Display this help and exit.\n\ + --datadir=path Path for control file (and logs if --logdir not used)\n\ + --logdir=path Path for log files\n\ + --require-control-file Abort if we can't find/read the maria_log_control\n\ + file\n\ + -s, --silent Only print errors. One can use two -s to make\n\ + maria_chk very silent.\n\ + -t, --tmpdir=path Path for temporary files. 
Multiple paths can be\n\ + specified, separated by "); +#if defined( __WIN__) || defined(__NETWARE__) + printf("semicolon (;)"); +#else + printf("colon (:)"); +#endif + printf(", they will be used\n\ + in a round-robin fashion.\n\ + -v, --verbose Print more information. This can be used with\n\ + --description and --check. Use many -v for more verbosity.\n\ + -V, --version Print version and exit.\n\ + -w, --wait Wait if table is locked.\n\n"); +#ifdef DEBUG + puts(" --start-check-pos=# Start reading file at given offset.\n"); +#endif + + puts("Check options (check is the default action for aria_chk):\n\ + -c, --check Check table for errors.\n\ + -e, --extend-check Check the table VERY throughly. Only use this in\n\ + extreme cases as aria_chk should normally be able to\n\ + find out if the table is ok even without this switch.\n\ + -F, --fast Check only tables that haven't been closed properly.\n\ + -C, --check-only-changed\n\ + Check only tables that have changed since last check.\n\ + -f, --force Restart with '-r' if there are any errors in the table.\n\ + States will be updated as with '--update-state'.\n\ + -i, --information Print statistics information about table that is checked.\n\ + -m, --medium-check Faster than extend-check, but only finds 99.99% of\n\ + all errors. 
Should be good enough for most cases.\n\ + -U, --update-state Mark tables as crashed if you find any errors.\n\ + -T, --read-only Don't mark table as checked.\n"); + + puts("\ +Recover (repair)/ options (When using '--recover' or '--safe-recover'):\n\ + -B, --backup Make a backup of the .MAD file as 'filename-time.BAK'.\n\ + --correct-checksum Correct checksum information for table.\n\ + -D, --data-file-length=# Max length of data file (when recreating data\n\ + file when it's full).\n\ + -e, --extend-check Try to recover every possible row from the data file\n\ + Normally this will also find a lot of garbage rows;\n\ + Don't use this option if you are not totally desperate.\n\ + -f, --force Overwrite old temporary files.\n\ + -k, --keys-used=# Tell Aria to update only some specific keys. # is a\n\ + bit mask of which keys to use. This can be used to\n\ + get faster inserts.\n\ + --max-record-length=#\n\ + Skip rows bigger than this if aria_chk can't allocate\n\ + memory to hold it.\n\ + -r, --recover Can fix almost anything except unique keys that aren't\n\ + unique.\n\ + -n, --sort-recover Forces recovering with sorting even if the temporary\n\ + file would be very big.\n\ + -p, --parallel-recover\n\ + Uses the same technique as '-r' and '-n', but creates\n\ + all the keys in parallel, in different threads."); + puts("\ + -o, --safe-recover Uses old recovery method; Slower than '-r' but can\n \ + handle a couple of cases where '-r' reports that it\n\ + can't fix the data file.\n\ + --transaction-log Log repair command to transaction log. 
This is needed\n\ + if one wants to use the aria_read_log to repeat the \n\ + repair\n\ + --character-sets-dir=...\n\ + Directory where character sets are.\n\ + --set-collation=name\n\ + Change the collation used by the index.\n\ + -q, --quick Faster repair by not modifying the data file.\n\ + One can give a second '-q' to force aria_chk to\n\ + modify the original datafile in case of duplicate keys.\n\ + NOTE: Tables where the data file is currupted can't be\n\ + fixed with this option.\n\ + -u, --unpack Unpack file packed with ariapack.\n\ +"); + + puts("Other actions:\n\ + -a, --analyze Analyze distribution of keys. Will make some joins in\n\ + MariaDB faster. You can check the calculated distribution\n\ + by using '--description --verbose table_name'.\n\ + --stats_method=name Specifies how index statistics collection code should\n\ + treat NULLs. Possible values of name are \"nulls_unequal\"\n\ + (default for 4.1/5.0), \"nulls_equal\" (emulate 4.0), and \n\ + \"nulls_ignored\".\n\ + -d, --description Prints some information about table.\n\ + -A, --set-auto-increment[=value]\n\ + Force auto_increment to start at this or higher value\n\ + If no value is given, then sets the next auto_increment\n\ + value to the highest used value for the auto key + 1.\n\ + -S, --sort-index Sort index blocks. This speeds up 'read-next' in\n\ + applications.\n\ + -R, --sort-records=#\n\ + Sort records according to an index. This makes your\n\ + data much more localized and may speed up things\n\ + (It may be VERY slow to do a sort the first time!).\n\ + -b, --block-search=#\n\ + Find a record, a block at given offset belongs to.\n\ + -z, --zerofill Fill empty space in data and index files with zeroes\n\ + --zerofill-keep-lsn Like --zerofill but does not zero out LSN of\n\ + data/index pages."); + + puts("Variables:\n\ +--page_buffer_size=# Size of page buffer. 
Used by --safe-repair\n\ +--read_buffer_size=# Read buffer size for sequential reads during scanning\n\ +--sort_buffer_size=# Size of sort buffer. Used by --recover\n\ +--sort_key_blocks=# Internal buffer for sorting keys; Don't touch :)\n\ +--write_buffer_size=# Write buffer size for sequential writes during repair"); + + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + +#include <help_end.h> + +const char *maria_stats_method_names[] = {"nulls_unequal", "nulls_equal", + "nulls_ignored", NullS}; +TYPELIB maria_stats_method_typelib= { + array_elements(maria_stats_method_names) - 1, "", + maria_stats_method_names, NULL}; + + /* Read options */ + +static my_bool +get_one_option(int optid, + const struct my_option *opt __attribute__((unused)), + char *argument) +{ + switch (optid) { +#ifdef __NETWARE__ + case OPT_AUTO_CLOSE: + setscreenmode(SCR_AUTOCLOSE_ON_EXIT); + break; +#endif + case 'a': + if (argument == disabled_my_option) + check_param.testflag&= ~T_STATISTICS; + else + check_param.testflag|= T_STATISTICS; + break; + case 'A': + if (argument) + check_param.auto_increment_value= strtoull(argument, NULL, 0); + else + check_param.auto_increment_value= 0; /* Set to max used value */ + check_param.testflag|= T_AUTO_INC; + break; + case 'b': + check_param.search_after_block= strtoul(argument, NULL, 10); + break; + case 'B': + if (argument == disabled_my_option) + check_param.testflag&= ~T_BACKUP_DATA; + else + check_param.testflag|= T_BACKUP_DATA; + break; + case 'c': + if (argument == disabled_my_option) + check_param.testflag&= ~T_CHECK; + else + check_param.testflag|= T_CHECK; + break; + case 'C': + if (argument == disabled_my_option) + check_param.testflag&= ~(T_CHECK | T_CHECK_ONLY_CHANGED); + else + check_param.testflag|= T_CHECK | T_CHECK_ONLY_CHANGED; + break; + case 'D': + check_param.max_data_file_length=strtoll(argument, NULL, 10); + break; + case 's': /* silent */ + if (argument == disabled_my_option) + 
check_param.testflag&= ~(T_SILENT | T_VERY_SILENT); + else + { + if (check_param.testflag & T_SILENT) + check_param.testflag|= T_VERY_SILENT; + check_param.testflag|= T_SILENT; + check_param.testflag&= ~T_WRITE_LOOP; + } + break; + case 'w': + if (argument == disabled_my_option) + check_param.testflag&= ~T_WAIT_FOREVER; + else + check_param.testflag|= T_WAIT_FOREVER; + break; + case 'd': /* description if isam-file */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_DESCRIPT; + else + check_param.testflag|= T_DESCRIPT; + break; + case 'e': /* extend check */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_EXTEND; + else + check_param.testflag|= T_EXTEND; + break; + case 'i': + if (argument == disabled_my_option) + check_param.testflag&= ~T_INFO; + else + check_param.testflag|= T_INFO; + break; + case 'f': + if (argument == disabled_my_option) + { + check_param.tmpfile_createflag= O_RDWR | O_TRUNC | O_EXCL; + check_param.testflag&= ~(T_FORCE_CREATE | T_UPDATE_STATE); + } + else + { + check_param.tmpfile_createflag= O_RDWR | O_TRUNC; + check_param.testflag|= T_FORCE_CREATE | T_UPDATE_STATE; + } + break; + case 'F': + if (argument == disabled_my_option) + check_param.testflag&= ~T_FAST; + else + check_param.testflag|= T_FAST; + break; + case 'k': + check_param.keys_in_use= (ulonglong) strtoll(argument, NULL, 10); + break; + case 'm': + if (argument == disabled_my_option) + check_param.testflag&= ~T_MEDIUM; + else + check_param.testflag|= T_MEDIUM; /* Medium check */ + break; + case 'r': /* Repair table */ + check_param.testflag&= ~T_REP_ANY; + if (argument != disabled_my_option) + check_param.testflag|= T_REP_BY_SORT; + break; + case 'p': + check_param.testflag&= ~T_REP_ANY; + if (argument != disabled_my_option) + check_param.testflag|= T_REP_PARALLEL; + break; + case 'o': + check_param.testflag&= ~T_REP_ANY; + check_param.force_sort= 0; + if (argument != disabled_my_option) + { + check_param.testflag|= T_REP; + my_disable_async_io= 
1; /* More safety */ + } + break; + case 'n': + check_param.testflag&= ~T_REP_ANY; + if (argument == disabled_my_option) + check_param.force_sort= 0; + else + { + check_param.testflag|= T_REP_BY_SORT; + check_param.force_sort= 1; + } + break; + case 'q': + if (argument == disabled_my_option) + check_param.testflag&= ~(T_QUICK | T_FORCE_UNIQUENESS); + else + check_param.testflag|= + (check_param.testflag & T_QUICK) ? T_FORCE_UNIQUENESS : T_QUICK; + break; + case 'u': + if (argument == disabled_my_option) + check_param.testflag&= ~T_UNPACK; + else + { + check_param.testflag|= T_UNPACK; + if (!(check_param.testflag & T_REP_ANY)) + check_param.testflag|= T_REP_BY_SORT; + } + break; + case 'v': /* Verbose */ + if (argument == disabled_my_option) + { + check_param.testflag&= ~T_VERBOSE; + check_param.verbose=0; + } + else + { + check_param.testflag|= T_VERBOSE; + check_param.verbose++; + } + break; + case 'R': /* Sort records */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_SORT_RECORDS; + else + { + check_param.testflag|= T_SORT_RECORDS; + check_param.opt_sort_key= (uint) atoi(argument) - 1; + if (check_param.opt_sort_key >= MARIA_MAX_KEY) + { + fprintf(stderr, + "The value of the sort key is bigger than max key: %d.\n", + MARIA_MAX_KEY); + exit(1); + } + } + break; + case 'S': /* Sort index */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_SORT_INDEX; + else + check_param.testflag|= T_SORT_INDEX; + break; + case 'T': + if (argument == disabled_my_option) + check_param.testflag&= ~T_READONLY; + else + check_param.testflag|= T_READONLY; + break; + case 'U': + if (argument == disabled_my_option) + check_param.testflag&= ~T_UPDATE_STATE; + else + check_param.testflag|= T_UPDATE_STATE; + break; + case '#': + DBUG_SET_INITIAL(argument ? 
argument : "d:t:o,/tmp/aria_chk.trace"); + opt_debug= 1; + break; + case OPT_SKIP_SAFEMALLOC: +#ifdef SAFEMALLOC + sf_malloc_quick=1; +#endif + break; + case 'V': + print_version(); + exit(0); + case OPT_CORRECT_CHECKSUM: + if (argument == disabled_my_option) + check_param.testflag&= ~T_CALC_CHECKSUM; + else + check_param.testflag|= T_CALC_CHECKSUM; + break; + case OPT_STATS_METHOD: + { + int method; + enum_handler_stats_method method_conv; + LINT_INIT(method_conv); + maria_stats_method_str= argument; + if ((method=find_type(argument, &maria_stats_method_typelib, 2)) <= 0) + { + fprintf(stderr, "Invalid value of stats_method: %s.\n", argument); + exit(1); + } + switch (method-1) { + case 0: + method_conv= MI_STATS_METHOD_NULLS_EQUAL; + break; + case 1: + method_conv= MI_STATS_METHOD_NULLS_NOT_EQUAL; + break; + case 2: + method_conv= MI_STATS_METHOD_IGNORE_NULLS; + break; + default: assert(0); /* Impossible */ + } + check_param.stats_method= method_conv; + break; + } +#ifdef DEBUG /* Only useful if debugging */ + case OPT_START_CHECK_POS: + check_param.start_check_pos= strtoull(argument, NULL, 0); + break; +#endif + case 'z': + if (argument == disabled_my_option) + check_param.testflag&= ~T_ZEROFILL; + else + check_param.testflag|= T_ZEROFILL; + break; + case OPT_ZEROFILL_KEEP_LSN: + if (argument == disabled_my_option) + check_param.testflag&= ~(T_ZEROFILL_KEEP_LSN | T_ZEROFILL); + else + check_param.testflag|= (T_ZEROFILL_KEEP_LSN | T_ZEROFILL); + break; + case 'H': + my_print_help(my_long_options); + exit(0); + case '?': + usage(); + exit(0); + } + return 0; +} + + /* get_options: parse the command line into check_param and validate the result. Calls load_defaults() with load_default_groups, saves the resulting argv in default_argv, sets T_WRITE_LOOP when stdout is a terminal, then runs handle_options() with get_one_option as the per-option callback; a non-zero result from handle_options() is passed to exit(). Afterwards T_CALC_CHECKSUM is added when T_UPDATE_STATE is combined with any T_REP_ANY flag. The process ends with usage() then exit(-1) when no argument is left after option parsing, and with exit(1) when T_UNPACK is combined with T_QUICK or T_SORT_RECORDS, when T_READONLY is combined with a repair, statistics, auto-increment, sort or force-create flag, when init_tmpdir() fails, or when set_collation_name is set and get_charset_by_name() returns NULL. Returns nothing; every failure terminates the program. */ +static void get_options(register int *argc,register char ***argv) +{ + int ho_error; + + load_defaults("my", load_default_groups, argc, argv); + default_argv= *argv; + if (isatty(fileno(stdout))) + check_param.testflag|=T_WRITE_LOOP; + + if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option))) + exit(ho_error); + + /* If using repair, then update checksum if one uses --update-state
*/ + if ((check_param.testflag & T_UPDATE_STATE) && + (check_param.testflag & T_REP_ANY)) + check_param.testflag|= T_CALC_CHECKSUM; + + if (*argc == 0) + { + usage(); + exit(-1); + } + + if ((check_param.testflag & T_UNPACK) && + (check_param.testflag & (T_QUICK | T_SORT_RECORDS))) + { + VOID(fprintf(stderr, + "%s: --unpack can't be used with --quick or --sort-records\n", + my_progname_short)); + exit(1); + } + if ((check_param.testflag & T_READONLY) && + (check_param.testflag & + (T_REP_ANY | T_STATISTICS | T_AUTO_INC | + T_SORT_RECORDS | T_SORT_INDEX | T_FORCE_CREATE))) + { + VOID(fprintf(stderr, + "%s: Can't use --readonly when repairing or sorting\n", + my_progname_short)); + exit(1); + } + + if (!opt_debug) + { + DEBUGGER_OFF; /* Speed up things a bit */ + } + if (init_tmpdir(&maria_chk_tmpdir, opt_tmpdir)) + exit(1); + + check_param.tmpdir=&maria_chk_tmpdir; + + if (set_collation_name) + if (!(set_collation= get_charset_by_name(set_collation_name, + MYF(MY_WME)))) + exit(1); + + return; +} /* get options */ + + + /* Check table. maria_chk: process one table according to param->testflag. Opens filename with maria_open() (O_RDONLY for T_DESCRIPT or T_READONLY, else O_RDWR) and prints a message selected from my_errno when the open fails. With T_FAST or T_CHECK_ONLY_CHANGED the table can be skipped as already checked. The table is recreated through maria_recreate_table() when a repair, statistics or sort flag is set and one of the listed header, key-map, fullness, version or collation conditions holds. T_DESCRIPT only calls descript() and closes the table. Otherwise the table is locked, the page cache is initialised, and either the repair / sort-records / sort-index / zerofill branch or the check branch (maria_chk_status, maria_chk_size, maria_chk_del, maria_chk_key, maria_chk_data_link) runs before the state info is written back and the table is closed. Returns 0 on success, 1 when the table cannot be opened or closed, -1 when maria_recreate_table() fails, otherwise the accumulated error value. NOTE(review): the maria_recreate_table() failure path and the T_DESCRIPT path leave through plain return() although DBUG_ENTER() was used; share is dereferenced (share->base.born_transactional) after maria_close(info) - confirm the share is still valid at that point. */ + +static int maria_chk(HA_CHECK *param, char *filename) +{ + int error,lock_type,recreate; + my_bool rep_quick= test(param->testflag & (T_QUICK | T_FORCE_UNIQUENESS)); + MARIA_HA *info; + File datafile; + char llbuff[22],llbuff2[22]; + my_bool state_updated=0; + MARIA_SHARE *share; + DBUG_ENTER("maria_chk"); + + param->out_flag=error=param->warning_printed=param->error_printed= + recreate=0; + datafile=0; + param->isam_file_name=filename; /* For error messages */ + if (!(info=maria_open(filename, + (param->testflag & (T_DESCRIPT | T_READONLY)) ? + O_RDONLY : O_RDWR, + HA_OPEN_FOR_REPAIR | + ((param->testflag & T_WAIT_FOREVER) ? + HA_OPEN_WAIT_IF_LOCKED : + (param->testflag & T_DESCRIPT) ?
+ HA_OPEN_IGNORE_IF_LOCKED : HA_OPEN_ABORT_IF_LOCKED)))) + { + /* Avoid twice printing of isam file name */ + param->error_printed=1; + switch (my_errno) { + case HA_ERR_CRASHED: + _ma_check_print_error(param,"'%s' doesn't have a correct index definition. You need to recreate it before you can do a repair",filename); + break; + case HA_ERR_NOT_A_TABLE: + _ma_check_print_error(param,"'%s' is not a Aria table",filename); + break; + case HA_ERR_CRASHED_ON_USAGE: + _ma_check_print_error(param,"'%s' is marked as crashed",filename); + break; + case HA_ERR_CRASHED_ON_REPAIR: + _ma_check_print_error(param,"'%s' is marked as crashed after last repair",filename); + break; + case HA_ERR_OLD_FILE: + _ma_check_print_error(param,"'%s' is a old type of Aria table", filename); + break; + case HA_ERR_NEW_FILE: + _ma_check_print_error(param,"'%s' uses new features not supported by this version of the Aria library", filename); + break; + case HA_ERR_END_OF_FILE: + _ma_check_print_error(param,"Couldn't read complete header from '%s'", filename); + break; + case EAGAIN: + _ma_check_print_error(param,"'%s' is locked. 
Use -w to wait until unlocked",filename); + break; + case ENOENT: + _ma_check_print_error(param,"File '%s' doesn't exist",filename); + break; + case EACCES: + _ma_check_print_error(param,"You don't have permission to use '%s'", + filename); + break; + default: + _ma_check_print_error(param,"%d when opening Aria table '%s'", + my_errno,filename); + break; + } + DBUG_RETURN(1); + } + share= info->s; + share->tot_locks-= share->r_locks; + share->r_locks=0; + maria_block_size= share->base.block_size; + + if (share->data_file_type == BLOCK_RECORD || + ((param->testflag & T_UNPACK) && + share->state.header.org_data_file_type == BLOCK_RECORD)) + { + if (param->testflag & T_SORT_RECORDS) + { + _ma_check_print_error(param, + "Record format used by '%s' is is not yet supported with sort-records", + filename); + param->error_printed= 0; + error= 1; + goto end2; + } + /* We can't do parallell repair with BLOCK_RECORD yet */ + if (param->testflag & T_REP_PARALLEL) + { + param->testflag&= ~T_REP_PARALLEL; + param->testflag|= T_REP_BY_SORT; + } + } + + /* + Skip the checking of the file if: + We are using --fast and the table is closed properly + We are using --check-only-changed-tables and the table hasn't changed + */ + if (param->testflag & (T_FAST | T_CHECK_ONLY_CHANGED)) + { + my_bool need_to_check= (maria_is_crashed(info) || + share->state.open_count != 0); + + if ((param->testflag & (T_REP_ANY | T_SORT_RECORDS)) && + ((share->state.changed & (STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR) || + !(param->testflag & T_CHECK_ONLY_CHANGED)))) + need_to_check=1; + + if (info->s->base.keys && info->state->records) + { + if ((param->testflag & T_STATISTICS) && + (share->state.changed & STATE_NOT_ANALYZED)) + need_to_check=1; + if ((param->testflag & T_SORT_INDEX) && + (share->state.changed & STATE_NOT_SORTED_PAGES)) + need_to_check=1; + if ((param->testflag & T_REP_BY_SORT) && + (share->state.changed & STATE_NOT_OPTIMIZED_KEYS)) + need_to_check=1; + } 
+ if ((param->testflag & T_CHECK_ONLY_CHANGED) && + (share->state.changed & (STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR))) + need_to_check=1; + if (!need_to_check) + { + if (!(param->testflag & T_SILENT) || param->testflag & T_INFO) + printf("Aria file: %s is already checked\n",filename); + if (maria_close(info)) + { + _ma_check_print_error(param,"%d when closing Aria table '%s'", + my_errno,filename); + DBUG_RETURN(1); + } + DBUG_RETURN(0); + } + } + if ((param->testflag & (T_REP_ANY | T_STATISTICS | + T_SORT_RECORDS | T_SORT_INDEX)) && + (((param->testflag & T_UNPACK) && + share->data_file_type == COMPRESSED_RECORD) || + mi_uint2korr(share->state.header.state_info_length) != + MARIA_STATE_INFO_SIZE || + mi_uint2korr(share->state.header.base_info_length) != + MARIA_BASE_INFO_SIZE || + maria_is_any_intersect_keys_active(param->keys_in_use, share->base.keys, + ~share->state.key_map) || + maria_test_if_almost_full(info) || + info->s->state.header.file_version[3] != maria_file_magic[3] || + (set_collation && + set_collation->number != share->state.header.language))) + { + if (set_collation) + param->language= set_collation->number; + if (maria_recreate_table(param, &info,filename)) + { + VOID(fprintf(stderr, + "Aria table '%s' is not fixed because of errors\n", + filename)); + return(-1); + } + recreate=1; + if (!(param->testflag & T_REP_ANY)) + { + param->testflag|=T_REP_BY_SORT; /* if only STATISTICS */ + if (!(param->testflag & T_SILENT)) + printf("- '%s' has old table-format. 
Recreating index\n",filename); + rep_quick= 1; + } + share= info->s; + share->tot_locks-= share->r_locks; + share->r_locks=0; + } + + if (param->testflag & T_DESCRIPT) + { + param->total_files++; + param->total_records+=info->state->records; + param->total_deleted+=info->state->del; + descript(param, info, filename); + maria_close(info); /* Should always succeed */ + return(0); + } + + if (!stopwords_inited++) + ft_init_stopwords(); + + if (!(param->testflag & T_READONLY)) + lock_type = F_WRLCK; /* table is changed */ + else + lock_type= F_RDLCK; + if (info->lock_type == F_RDLCK) + info->lock_type=F_UNLCK; /* Read only table */ + if (_ma_readinfo(info,lock_type,0)) + { + _ma_check_print_error(param,"Can't lock indexfile of '%s', error: %d", + filename,my_errno); + param->error_printed=0; + error= 1; + goto end2; + } + /* + _ma_readinfo() has locked the table. + We mark the table as locked (without doing file locks) to be able to + use functions that only works on locked tables (like row caching). + */ + maria_lock_database(info, F_EXTRA_LCK); + datafile= info->dfile.file; + if (init_pagecache(maria_pagecache, (size_t) param->use_buffers, 0, 0, + maria_block_size, MY_WME) == 0) + { + _ma_check_print_error(param, "Can't initialize page cache with %lu memory", + (ulong) param->use_buffers); + error= 1; + goto end2; + } + + if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX | + T_ZEROFILL)) + { + /* + Mark table as not transactional to avoid logging. Should not be needed, + maria_repair and maria_zerofill do it already. 
+ */ + _ma_tmp_disable_logging_for_table(info, FALSE); + + if (param->testflag & T_REP_ANY) + { + ulonglong tmp=share->state.key_map; + maria_copy_keys_active(share->state.key_map, share->base.keys, + param->keys_in_use); + if (tmp != share->state.key_map) + info->update|=HA_STATE_CHANGED; + + if (rep_quick && + maria_chk_del(param, info, param->testflag & ~T_VERBOSE)) + { + if (param->testflag & T_FORCE_CREATE) + { + rep_quick=0; + _ma_check_print_info(param,"Creating new data file\n"); + } + else + { + error=1; + _ma_check_print_error(param, + "Quick-recover aborted; Run recovery without switch 'q'"); + } + } + } + if (!error) + { + /* + Unless this was only --zerofill-keep-lsn, old REDOs are not + applicable, tell the server's Recovery to ignore them; we don't + know what the log's end LSN is now, so we just let the server know + that it will have to find and store it. + This is the only case where create_rename_lsn can be a horizon and not + a LSN. + If this was only --zerofill-keep-lsn, the table can be used in + Recovery and especially in this scenario: do a dirty-copy-based backup + (snapshot-like), --zerofill-keep-lsn on the copies to achieve better + compression, compress the copies with an external tool, and after a + restore, Recovery still works (because pages and state still have + their correct LSNs). 
+ */ + if (share->base.born_transactional && + ((param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX | + T_ZEROFILL | T_ZEROFILL_KEEP_LSN)) != + (T_ZEROFILL | T_ZEROFILL_KEEP_LSN))) + share->state.create_rename_lsn= share->state.is_of_horizon= + share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS; + } + if (!error && (param->testflag & T_REP_ANY)) + { + if ((param->testflag & (T_REP_BY_SORT | T_REP_PARALLEL)) && + (maria_is_any_key_active(share->state.key_map) || + (rep_quick && !param->keys_in_use && !recreate)) && + maria_test_if_sort_rep(info, info->state->records, + info->s->state.key_map, + param->force_sort)) + { + if (param->testflag & T_REP_BY_SORT) + error=maria_repair_by_sort(param,info,filename,rep_quick); + else + error=maria_repair_parallel(param,info,filename,rep_quick); + state_updated=1; + } + else + error=maria_repair(param, info,filename,rep_quick); + } + if (!error && (param->testflag & T_SORT_RECORDS)) + { + /* + The data file is nowadays reopened in the repair code so we should + soon remove the following reopen-code + */ +#ifndef TO_BE_REMOVED + if (param->out_flag & O_NEW_DATA) + { /* Change temp file to org file */ + VOID(my_close(info->dfile.file, MYF(MY_WME))); /* Close new file */ + error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT, + MYF(0)); + if (_ma_open_datafile(info,info->s, NullS, -1)) + error=1; + param->out_flag&= ~O_NEW_DATA; /* We are using new datafile */ + param->read_cache.file= info->dfile.file; + } +#endif + if (! error) + { + uint key; + /* + We can't update the index in maria_sort_records if we have a + prefix compressed or fulltext index + */ + my_bool update_index=1; + for (key=0 ; key < share->base.keys; key++) + if (share->keyinfo[key].flag & (HA_BINARY_PACK_KEY|HA_FULLTEXT)) + update_index=0; + + error=maria_sort_records(param,info,filename,param->opt_sort_key, + /* what is the following parameter for ? 
*/ + (my_bool) !(param->testflag & T_REP), + update_index); + datafile= info->dfile.file; /* This is now locked */ + if (!error && !update_index) + { + if (param->verbose) + puts("Table had a compressed index; We must now recreate the index"); + error=maria_repair_by_sort(param,info,filename,1); + } + } + } + if (!error && (param->testflag & T_SORT_INDEX)) + error= maria_sort_index(param,info,filename); + if (!error && (param->testflag & T_ZEROFILL)) + error= maria_zerofill(param, info, filename); + if (!error) + { + DBUG_PRINT("info", ("Reseting crashed state")); + share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR); + } + else + maria_mark_crashed(info); + } + else if ((param->testflag & T_CHECK) || !(param->testflag & T_AUTO_INC)) + { + if (!(param->testflag & T_VERY_SILENT) || param->testflag & T_INFO) + printf("Checking Aria file: %s\n",filename); + if (!(param->testflag & T_SILENT)) + printf("Data records: %7s Deleted blocks: %7s\n", + llstr(info->state->records,llbuff), + llstr(info->state->del,llbuff2)); + maria_chk_init_for_check(param, info); + if (opt_warning_for_wrong_transid == 0) + param->max_trid= ~ (ulonglong) 0; + error= maria_chk_status(param,info); + maria_intersect_keys_active(share->state.key_map, param->keys_in_use); + error|= maria_chk_size(param,info); + if (!error || !(param->testflag & (T_FAST | T_FORCE_CREATE))) + error|=maria_chk_del(param, info,param->testflag); + if ((!error || (!(param->testflag & (T_FAST | T_FORCE_CREATE)) && + !param->start_check_pos))) + { + error|=maria_chk_key(param, info); + if (!error && (param->testflag & (T_STATISTICS | T_AUTO_INC))) + error=maria_update_state_info(param, info, + ((param->testflag & T_STATISTICS) ? + UPDATE_STAT : 0) | + ((param->testflag & T_AUTO_INC) ? 
+ UPDATE_AUTO_INC : 0)); + } + if ((!rep_quick && !error) || + !(param->testflag & (T_FAST | T_FORCE_CREATE))) + { + VOID(init_io_cache(&param->read_cache,datafile, + (uint) param->read_buffer_length, + READ_CACHE, + (param->start_check_pos ? + param->start_check_pos : + share->pack.header_length), + 1, + MYF(MY_WME))); + maria_lock_memory(param); + if ((info->s->data_file_type != STATIC_RECORD) || + (param->testflag & (T_EXTEND | T_MEDIUM))) + error|=maria_chk_data_link(param, info, + test(param->testflag & T_EXTEND)); + VOID(end_io_cache(&param->read_cache)); + } + if (!error) + { + if (((share->state.changed & + (STATE_CHANGED | STATE_CRASHED | STATE_CRASHED_ON_REPAIR | + STATE_IN_REPAIR)) || + share->state.open_count != 0) + && (param->testflag & T_UPDATE_STATE)) + info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + DBUG_PRINT("info", ("Reseting crashed state")); + share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR); + } + else if (!maria_is_crashed(info) && + (param->testflag & T_UPDATE_STATE)) + { /* Mark crashed */ + maria_mark_crashed(info); + info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + } + } + + if ((param->testflag & T_AUTO_INC) || + ((param->testflag & T_REP_ANY) && info->s->base.auto_key)) + _ma_update_auto_increment_key(param, info, + (my_bool) !test(param->testflag & T_AUTO_INC)); + + if (info->update & HA_STATE_CHANGED && ! (param->testflag & T_READONLY)) + error|=maria_update_state_info(param, info, + UPDATE_OPEN_COUNT | + (((param->testflag & T_REP_ANY) ? + UPDATE_TIME : 0) | + (state_updated ? UPDATE_STAT : 0) | + ((param->testflag & T_SORT_RECORDS) ?
+ UPDATE_SORT : 0))); + info->update&= ~HA_STATE_CHANGED; + _ma_reenable_logging_for_table(info, FALSE); + maria_lock_database(info, F_UNLCK); + +end2: + end_pagecache(maria_pagecache, 1); + if (maria_close(info)) + { + _ma_check_print_error(param, default_close_errmsg, my_errno, filename); + DBUG_RETURN(1); + } + if (error == 0) + { + if (param->out_flag & O_NEW_DATA) + error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT, + ((param->testflag & T_BACKUP_DATA) ? + MYF(MY_REDEL_MAKE_BACKUP) : MYF(0))); + if (param->out_flag & O_NEW_INDEX) + error|=maria_change_to_newfile(filename,MARIA_NAME_IEXT,INDEX_TMP_EXT, + MYF(0)); + } + if (opt_transaction_logging && + share->base.born_transactional && !error && + (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX | + T_ZEROFILL))) + error= write_log_record(param); + + if (param->not_visible_rows_found && (param->testflag & T_VERBOSE)) + { + char buff[22]; + printf("Max transaction id found: %s\n", + llstr(param->max_found_trid, buff)); + } + + VOID(fflush(stdout)); VOID(fflush(stderr)); + + if (param->error_printed) + { + if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX)) + { + VOID(fprintf(stderr, + "Aria table '%s' is not fixed because of errors\n", + filename)); + if (param->testflag & T_REP_ANY) + VOID(fprintf(stderr, + "Try fixing it by using the --safe-recover (-o), the --force (-f) option or by not using the --quick (-q) flag\n")); + } + else if (!(param->error_printed & 2) && + !(param->testflag & T_FORCE_CREATE)) + VOID(fprintf(stderr, + "Aria table '%s' is corrupted\nFix it using switch \"-r\" or \"-o\"\n", + filename)); + } + else if (param->warning_printed && + ! 
(param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX | + T_FORCE_CREATE))) + VOID(fprintf(stderr, "Aria table '%s' is usable but should be fixed\n", + filename)); + VOID(fflush(stderr)); + DBUG_RETURN(error); +} /* maria_chk */ + + +/* Write info about table. descript: print a description of the open table info (called name) to stdout. With T_VERY_SILENT a single line is printed: name, record count and checksum, where the checksum is shown as 0 unless HA_OPTION_CHECKSUM or HA_OPTION_COMPRESS_RECORD is set. Otherwise the file name, record format, crash-safety and character set are printed, followed under T_VERBOSE by file version, creation and recover times, LSNs (born_transactional tables only), UUID, status flags, checksum and auto-increment details, then the record counts; T_SILENT stops there. The remaining output is the length summary, the key table with one row per key segment, the unique definitions when share->state.header.uniques is non-zero, and, when param->verbose is greater than 1, the column list plus the bitmap sizes for BLOCK_RECORD tables. */ + +static void descript(HA_CHECK *param, register MARIA_HA *info, char *name) +{ + uint key,keyseg_nr,field; + reg3 MARIA_KEYDEF *keyinfo; + reg2 HA_KEYSEG *keyseg; + reg4 const char *text; + char buff[200],length[10],*pos,*end; + enum en_fieldtype type; + MARIA_SHARE *share= info->s; + char llbuff[22],llbuff2[22]; + DBUG_ENTER("descript"); + + if (param->testflag & T_VERY_SILENT) + { + longlong checksum= info->state->checksum; + if (!(share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))) + checksum= 0; + printf("%s %s %s\n", name, llstr(info->state->records,llbuff), + llstr(checksum, llbuff2)); + DBUG_VOID_RETURN; + } + + printf("Aria file: %s\n",name); + printf("Record format: %s\n", record_formats[share->data_file_type]); + printf("Crashsafe: %s\n", + share->base.born_transactional ?
"yes" : "no"); + printf("Character set: %s (%d)\n", + get_charset_name(share->state.header.language), + share->state.header.language); + + if (param->testflag & T_VERBOSE) + { + printf("File-version: %d\n", + (int) share->state.header.file_version[3]); + if (share->state.create_time) + { + get_date(buff,1,share->state.create_time); + printf("Creation time: %s\n",buff); + } + if (share->state.check_time) + { + get_date(buff,1,share->state.check_time); + printf("Recover time: %s\n",buff); + } + if (share->base.born_transactional) + { + printf("LSNs: create_rename (%lu,0x%lx)," + " state_horizon (%lu,0x%lx), skip_redo (%lu,0x%lx)\n", + LSN_IN_PARTS(share->state.create_rename_lsn), + LSN_IN_PARTS(share->state.is_of_horizon), + LSN_IN_PARTS(share->state.skip_redo_lsn)); + } + compile_time_assert((MY_UUID_STRING_LENGTH + 1) <= sizeof(buff)); + buff[MY_UUID_STRING_LENGTH]= 0; + my_uuid2str(share->base.uuid, buff); + printf("UUID: %s\n", buff); + pos=buff; + if (share->state.changed & STATE_CRASHED) + strmov(buff,"crashed"); + else + { + if (share->state.open_count) + pos=strmov(pos,"open,"); + if (share->state.changed & STATE_CHANGED) + pos=strmov(pos,"changed,"); + else + pos=strmov(pos,"checked,"); + if (!(share->state.changed & STATE_NOT_ANALYZED)) + pos=strmov(pos,"analyzed,"); + if (!(share->state.changed & STATE_NOT_OPTIMIZED_KEYS)) + pos=strmov(pos,"optimized keys,"); + if (!(share->state.changed & STATE_NOT_SORTED_PAGES)) + pos=strmov(pos,"sorted index pages,"); + if (!(share->state.changed & STATE_NOT_ZEROFILLED)) + pos=strmov(pos,"zerofilled,"); + if (!(share->state.changed & STATE_NOT_MOVABLE)) + pos=strmov(pos,"movable,"); + pos[-1]=0; /* Remove extra ',' */ + } + printf("Status: %s\n",buff); + if (share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)) + printf("Checksum: %26s\n",llstr(info->state->checksum,llbuff)); +; + if (share->options & HA_OPTION_DELAY_KEY_WRITE) + printf("Keys are only flushed at close\n"); + + if (share->options & 
HA_OPTION_PAGE_CHECKSUM) + printf("Page checksums are used\n"); + if (share->base.auto_key) + { + printf("Auto increment key: %16d Last value: %18s\n", + share->base.auto_key, + llstr(share->state.auto_increment,llbuff)); + } + } + printf("Data records: %16s Deleted blocks: %18s\n", + llstr(info->state->records,llbuff),llstr(info->state->del,llbuff2)); + if (param->testflag & T_SILENT) + DBUG_VOID_RETURN; /* This is enough */ + + if (param->testflag & T_VERBOSE) + { +#ifdef USE_RELOC + printf("Init-relocation: %16s\n",llstr(share->base.reloc,llbuff)); +#endif + printf("Datafile parts: %16s Deleted data: %18s\n", + llstr(share->state.split,llbuff), + llstr(info->state->empty,llbuff2)); + printf("Datafile pointer (bytes): %11d Keyfile pointer (bytes): %13d\n", + share->rec_reflength,share->base.key_reflength); + printf("Datafile length: %16s Keyfile length: %18s\n", + llstr(info->state->data_file_length,llbuff), + llstr(info->state->key_file_length,llbuff2)); + + if (info->s->base.reloc == 1L && info->s->base.records == 1L) + puts("This is a one-record table"); + else + { + if (share->base.max_data_file_length != HA_OFFSET_ERROR || + share->base.max_key_file_length != HA_OFFSET_ERROR) + printf("Max datafile length: %16s Max keyfile length: %18s\n", + llstr(share->base.max_data_file_length-1,llbuff), + llstr(share->base.max_key_file_length-1,llbuff2)); + } + } + printf("Block_size: %16d\n",(int) share->block_size); + printf("Recordlength: %16d\n",(int) share->base.pack_reclength); + if (! 
maria_is_all_keys_active(share->state.key_map, share->base.keys)) + { + longlong2str(share->state.key_map,buff,2,1); + printf("Using only keys '%s' of %d possibly keys\n", + buff, share->base.keys); + } + puts("\nTable description:"); + printf("Key Start Len Index Type"); + if (param->testflag & T_VERBOSE) + printf(" Rec/key Root Blocksize"); + VOID(putchar('\n')); + + for (key=keyseg_nr=0, keyinfo= &share->keyinfo[0] ; + key < share->base.keys; + key++,keyinfo++) + { + keyseg=keyinfo->seg; + if (keyinfo->flag & HA_NOSAME) text="unique "; + else if (keyinfo->flag & HA_FULLTEXT) text="fulltext "; + else text="multip."; + + pos=buff; + if (keyseg->flag & HA_REVERSE_SORT) + *pos++ = '-'; + pos=strmov(pos,type_names[keyseg->type]); + *pos++ = ' '; + *pos=0; + if (keyinfo->flag & HA_PACK_KEY) + pos=strmov(pos,prefix_packed_txt); + if (keyinfo->flag & HA_BINARY_PACK_KEY) + pos=strmov(pos,bin_packed_txt); + if (keyseg->flag & HA_SPACE_PACK) + pos=strmov(pos,diff_txt); + if (keyseg->flag & HA_BLOB_PART) + pos=strmov(pos,blob_txt); + if (keyseg->flag & HA_NULL_PART) + pos=strmov(pos,null_txt); + *pos=0; + + printf("%-4d%-6ld%-3d %-8s%-23s", + key+1,(long) keyseg->start+1,keyseg->length,text,buff); + if (share->state.key_root[key] != HA_OFFSET_ERROR) + llstr(share->state.key_root[key],buff); + else + buff[0]=0; + if (param->testflag & T_VERBOSE) + printf("%9.0f %12s %10d", + share->state.rec_per_key_part[keyseg_nr++], + buff,keyinfo->block_length); + VOID(putchar('\n')); + while ((++keyseg)->type != HA_KEYTYPE_END) + { + pos=buff; + if (keyseg->flag & HA_REVERSE_SORT) + *pos++ = '-'; + pos=strmov(pos,type_names[keyseg->type]); + *pos++= ' '; + if (keyseg->flag & HA_SPACE_PACK) + pos=strmov(pos,diff_txt); + if (keyseg->flag & HA_BLOB_PART) + pos=strmov(pos,blob_txt); + if (keyseg->flag & HA_NULL_PART) + pos=strmov(pos,null_txt); + *pos=0; + printf(" %-6ld%-3d %-21s", + (long) keyseg->start+1,keyseg->length,buff); + if (param->testflag & T_VERBOSE) + printf("%11.0f", 
share->state.rec_per_key_part[keyseg_nr++]); + VOID(putchar('\n')); + } + keyseg++; + } + if (share->state.header.uniques) + { + MARIA_UNIQUEDEF *uniqueinfo; + puts("\nUnique Key Start Len Nullpos Nullbit Type"); + for (key=0,uniqueinfo= &share->uniqueinfo[0] ; + key < share->state.header.uniques; key++, uniqueinfo++) + { + my_bool new_row=0; + char null_bit[8],null_pos[8]; + printf("%-8d%-5d",key+1,uniqueinfo->key+1); + for (keyseg=uniqueinfo->seg ; keyseg->type != HA_KEYTYPE_END ; keyseg++) + { + if (new_row) + fputs(" ",stdout); + null_bit[0]=null_pos[0]=0; + if (keyseg->null_bit) + { + sprintf(null_bit,"%d",keyseg->null_bit); + sprintf(null_pos,"%ld",(long) keyseg->null_pos+1); + } + printf("%-7ld%-5d%-9s%-10s%-30s\n", + (long) keyseg->start+1,keyseg->length, + null_pos,null_bit, + type_names[keyseg->type]); + new_row=1; + } + } + } + if (param->verbose > 1) + { + char null_bit[8],null_pos[8]; + printf("\nField Start Length Nullpos Nullbit Type"); + if (share->options & HA_OPTION_COMPRESS_RECORD) + printf(" Huff tree Bits"); + VOID(putchar('\n')); + + for (field=0 ; field < share->base.fields ; field++) + { + if (share->options & HA_OPTION_COMPRESS_RECORD) + type=share->columndef[field].base_type; + else + type=(enum en_fieldtype) share->columndef[field].type; + end=strmov(buff,field_pack[type]); + if (share->options & HA_OPTION_COMPRESS_RECORD) + { + if (share->columndef[field].pack_type & PACK_TYPE_SELECTED) + end=strmov(end,", not_always"); + if (share->columndef[field].pack_type & PACK_TYPE_SPACE_FIELDS) + end=strmov(end,", no empty"); + if (share->columndef[field].pack_type & PACK_TYPE_ZERO_FILL) + { + sprintf(end,", zerofill(%d)",share->columndef[field].space_length_bits); + end=strend(end); + } + } + if (buff[0] == ',') + strmov(buff,buff+2); + int10_to_str((long) share->columndef[field].length,length,10); + null_bit[0]=null_pos[0]=0; + if (share->columndef[field].null_bit) + { + sprintf(null_bit,"%d",share->columndef[field].null_bit); + 
sprintf(null_pos,"%d",share->columndef[field].null_pos+1); + } + printf("%-6d%-6u%-7s%-8s%-8s%-35s",field+1, + (uint) share->columndef[field].offset+1, + length, null_pos, null_bit, buff); + if (share->options & HA_OPTION_COMPRESS_RECORD) + { + if (share->columndef[field].huff_tree) + printf("%3d %2d", + (uint) (share->columndef[field].huff_tree-share->decode_trees)+1, + share->columndef[field].huff_tree->quick_table_bits); + } + VOID(putchar('\n')); + } + if (share->data_file_type == BLOCK_RECORD) + { + uint i; + puts("\nBitmap Data size Description"); + for (i=0 ; i <= 7 ; i++) + printf("%u %5u %s\n", i, share->bitmap.sizes[i], + bitmap_description[i]); + } + } + DBUG_VOID_RETURN; +} /* describe */ + + + /* Sort records according to one key. maria_sort_records: write the rows of info into a new data file (name with DATA_TMP_EXT), driven by sort_record_index() starting at the root page of key sort_key. Returns 0 without sorting, after a warning, when the key is not active, is FULLTEXT, has HA_BINARY_PACK_KEY, or the table is COMPRESSED_RECORD; also returns 0 when the key has no root page (HA_OFFSET_ERROR). On success the old data file is closed, info->dfile.file becomes the new file, O_NEW_DATA is set in param->out_flag, the del and empty counters are reset and 0 is returned. Returns 1 on any failure, including a record count that differs from the count before the sort; the temp file is then closed and deleted. write_info: also print the record and deleted counts (unless T_SILENT). update_index: passed on to sort_record_index(). share->state.sortkey is set to sort_key on every exit that goes through the err label. NOTE(review): when the error path runs with new_file >= 0, end_io_cache(&info->rec_cache) is called twice; the DBUG_ENTER tag is sort_records, not the function name. */ + +static int maria_sort_records(HA_CHECK *param, + register MARIA_HA *info, char *name, + uint sort_key, + my_bool write_info, + my_bool update_index) +{ + int got_error; + uint key; + MARIA_KEYDEF *keyinfo; + File new_file; + uchar *temp_buff; + ha_rows old_record_count; + MARIA_SHARE *share= info->s; + char llbuff[22],llbuff2[22]; + MARIA_SORT_INFO sort_info; + MARIA_SORT_PARAM sort_param; + MARIA_PAGE page; + DBUG_ENTER("sort_records"); + + bzero((char*)&sort_info,sizeof(sort_info)); + bzero((char*)&sort_param,sizeof(sort_param)); + sort_param.sort_info=&sort_info; + sort_info.param=param; + keyinfo= &share->keyinfo[sort_key]; + got_error=1; + temp_buff=0; + new_file= -1; + + if (!
maria_is_key_active(share->state.key_map, sort_key)) + { + _ma_check_print_warning(param, + "Can't sort table '%s' on key %d; No such key", + name,sort_key+1); + param->error_printed=0; + DBUG_RETURN(0); /* Nothing to do */ + } + if (keyinfo->flag & HA_FULLTEXT) + { + _ma_check_print_warning(param,"Can't sort table '%s' on FULLTEXT key %d", + name,sort_key+1); + param->error_printed=0; + DBUG_RETURN(0); /* Nothing to do */ + } + if (keyinfo->flag & HA_BINARY_PACK_KEY) + { + _ma_check_print_warning(param, + "Can't sort table '%s' on a key with prefix " + "packing %d", + name,sort_key+1); + param->error_printed=0; + DBUG_RETURN(0); + } + + + if (share->data_file_type == COMPRESSED_RECORD) + { + _ma_check_print_warning(param,"Can't sort read-only table '%s'", name); + param->error_printed=0; + DBUG_RETURN(0); /* Nothing to do */ + } + if (!(param->testflag & T_SILENT)) + { + printf("- Sorting records for Aria table '%s'\n",name); + if (write_info) + printf("Data records: %9s Deleted: %9s\n", + llstr(info->state->records,llbuff), + llstr(info->state->del,llbuff2)); + } + if (share->state.key_root[sort_key] == HA_OFFSET_ERROR) + DBUG_RETURN(0); /* Nothing to do */ + + if (init_io_cache(&info->rec_cache,-1,(uint) param->write_buffer_length, + WRITE_CACHE,share->pack.header_length,1, + MYF(MY_WME | MY_WAIT_IF_FULL))) + goto err; + info->opt_flag|=WRITE_CACHE_USED; + + if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length))) + { + _ma_check_print_error(param,"Not enough memory for key block"); + goto err; + } + + if (!(sort_param.record= + (uchar*) my_malloc((uint) share->base.default_rec_buff_size, MYF(0)))) + { + _ma_check_print_error(param,"Not enough memory for record"); + goto err; + } + + fn_format(param->temp_filename,name,"", MARIA_NAME_DEXT,2+4+32); + new_file= my_create(fn_format(param->temp_filename, + param->temp_filename,"", + DATA_TMP_EXT, + MY_REPLACE_EXT | MY_UNPACK_FILENAME), + 0, param->tmpfile_createflag, + MYF(0)); + if (new_file < 0) + { + 
_ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + goto err; + } + if (share->pack.header_length) + if (maria_filecopy(param, new_file, info->dfile.file, 0L, + share->pack.header_length, + "datafile-header")) + goto err; + info->rec_cache.file=new_file; /* Use this file for cacheing*/ + + maria_lock_memory(param); + for (key=0 ; key < share->base.keys ; key++) + share->keyinfo[key].flag|= HA_SORT_ALLOWS_SAME; + + if (my_pread(share->kfile.file, temp_buff, + (uint) keyinfo->block_length, + share->state.key_root[sort_key], + MYF(MY_NABP+MY_WME))) + { + _ma_check_print_error(param, "Can't read indexpage from filepos: %s", + llstr(share->state.key_root[sort_key], llbuff)); + goto err; + } + + /* Setup param for _ma_sort_write_record */ + sort_info.info=info; + sort_info.new_data_file_type=share->data_file_type; + sort_param.fix_datafile=1; + sort_param.master=1; + sort_param.filepos=share->pack.header_length; + old_record_count=info->state->records; + info->state->records=0; + if (sort_info.new_data_file_type != COMPRESSED_RECORD) + info->state->checksum=0; + + _ma_page_setup(&page, info, keyinfo, share->state.key_root[sort_key], + temp_buff); + if (sort_record_index(&sort_param, &page, sort_key,new_file,update_index) || + maria_write_data_suffix(&sort_info,1) || + flush_io_cache(&info->rec_cache)) + goto err; + + if (info->state->records != old_record_count) + { + _ma_check_print_error(param,"found %s of %s records", + llstr(info->state->records,llbuff), + llstr(old_record_count,llbuff2)); + goto err; + } + + VOID(my_close(info->dfile.file, MYF(MY_WME))); + param->out_flag|=O_NEW_DATA; /* Data in new file */ + info->dfile.file= new_file; /* Use new datafile */ + _ma_set_data_pagecache_callbacks(&info->dfile, info->s); + + info->state->del=0; + info->state->empty=0; + share->state.dellink= HA_OFFSET_ERROR; + info->state->data_file_length=sort_param.filepos; + share->state.split=info->state->records; /* Only hole records */ + 
share->state.version=(ulong) time((time_t*) 0); + + info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + if (param->testflag & T_WRITE_LOOP) + { + VOID(fputs(" \r",stdout)); VOID(fflush(stdout)); + } + got_error=0; + +err: + if (got_error && new_file >= 0) + { + VOID(end_io_cache(&info->rec_cache)); + (void) my_close(new_file,MYF(MY_WME)); + (void) my_delete(param->temp_filename, MYF(MY_WME)); + } + if (temp_buff) + { + my_afree(temp_buff); + } + my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR)); + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + VOID(end_io_cache(&info->rec_cache)); + my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR)); + sort_info.buff=0; + share->state.sortkey=sort_key; + DBUG_RETURN(got_error); +} /* sort_records */ + + +/* Sort records recursive using one index */ + +static int sort_record_index(MARIA_SORT_PARAM *sort_param, + MARIA_PAGE *ma_page, uint sort_key, + File new_file,my_bool update_index) +{ + MARIA_HA *info= ma_page->info; + MARIA_SHARE *share= info->s; + uint page_flag, nod_flag,used_length; + uchar *temp_buff,*keypos,*endpos; + my_off_t next_page,rec_pos; + uchar lastkey[MARIA_MAX_KEY_BUFF]; + char llbuff[22]; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + HA_CHECK *param=sort_info->param; + MARIA_KEY tmp_key; + MARIA_PAGE new_page; + const MARIA_KEYDEF *keyinfo= ma_page->keyinfo; + DBUG_ENTER("sort_record_index"); + + page_flag= ma_page->flag; + nod_flag= ma_page->node; + temp_buff=0; + tmp_key.keyinfo= (MARIA_KEYDEF*) keyinfo; + tmp_key.data= lastkey; + + if (nod_flag) + { + if (!(temp_buff= (uchar*) my_alloca(tmp_key.keyinfo->block_length))) + { + _ma_check_print_error(param,"Not Enough memory"); + DBUG_RETURN(-1); + } + } + used_length= ma_page->size; + keypos= ma_page->buff + share->keypage_header + nod_flag; + endpos= ma_page->buff + used_length; + for ( ;; ) + { + _sanity(__FILE__,__LINE__); + if (nod_flag) + { + next_page= _ma_kpos(nod_flag, keypos); + if (my_pread(share->kfile.file, temp_buff, 
+ (uint) tmp_key.keyinfo->block_length, next_page, + MYF(MY_NABP+MY_WME))) + { + _ma_check_print_error(param,"Can't read keys from filepos: %s", + llstr(next_page,llbuff)); + goto err; + } + _ma_page_setup(&new_page, info, ma_page->keyinfo, next_page, temp_buff); + + if (sort_record_index(sort_param, &new_page, sort_key, + new_file, update_index)) + goto err; + } + _sanity(__FILE__,__LINE__); + if (keypos >= endpos || + !(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &keypos)) + break; + rec_pos= _ma_row_pos_from_key(&tmp_key); + + if ((*share->read_record)(info,sort_param->record,rec_pos)) + { + _ma_check_print_error(param,"%d when reading datafile",my_errno); + goto err; + } + if (rec_pos != sort_param->filepos && update_index) + { + _ma_dpointer(share, keypos - nod_flag - tmp_key.ref_length, + sort_param->filepos); + if (maria_movepoint(info,sort_param->record,rec_pos,sort_param->filepos, + sort_key)) + { + _ma_check_print_error(param,"%d when updating key-pointers",my_errno); + goto err; + } + } + if (_ma_sort_write_record(sort_param)) + goto err; + } + /* Clear end of block to get better compression if the table is backuped */ + bzero(ma_page->buff + used_length, keyinfo->block_length - used_length); + if (my_pwrite(share->kfile.file, ma_page->buff, (uint)keyinfo->block_length, + ma_page->pos, param->myf_rw)) + { + _ma_check_print_error(param,"%d when updating keyblock",my_errno); + goto err; + } + if (temp_buff) + my_afree(temp_buff); + DBUG_RETURN(0); +err: + if (temp_buff) + my_afree(temp_buff); + DBUG_RETURN(1); +} /* sort_record_index */ + + +static my_bool write_log_record(HA_CHECK *param) +{ + /* + Now that all operations including O_NEW_DATA|INDEX are successfully + done, we can write a log record. 
+ */ + MARIA_HA *info= maria_open(param->isam_file_name, O_RDWR, 0); + if (info == NULL) + _ma_check_print_error(param, default_open_errmsg, my_errno, + param->isam_file_name); + else + { + if (write_log_record_for_repair(param, info)) + _ma_check_print_error(param, "%d when writing log record for" + " Aria table '%s'", my_errno, + param->isam_file_name); + else if (maria_close(info)) + _ma_check_print_error(param, default_close_errmsg, my_errno, + param->isam_file_name); + else + return FALSE; + } + return TRUE; +} + +#include "ma_check_standalone.h" diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h new file mode 100644 index 00000000000..ba97684b1aa --- /dev/null +++ b/storage/maria/maria_def.h @@ -0,0 +1,1267 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* This file is included by all internal maria files */ + +#include "maria.h" /* Structs & some defines */ +#include <myisampack.h> /* packing of keys */ +#include <my_tree.h> +#include <my_bitmap.h> +#ifdef THREAD +#include <my_pthread.h> +#include <thr_lock.h> +#else +#include <my_no_pthread.h> +#endif +#include <hash.h> +#include "ma_loghandler.h" +#include "ma_control_file.h" +#include "ma_state.h" +#include <waiting_threads.h> + +/* For testing recovery */ +#ifdef TO_BE_REMOVED +#define IDENTICAL_PAGES_AFTER_RECOVERY 1 +#endif +/* Do extra sanity checking */ +#define SANITY_CHECKS 1 +#ifdef EXTRA_DEBUG +#define EXTRA_DEBUG_KEY_CHANGES +#define EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES +#endif + +#define MAX_NONMAPPED_INSERTS 1000 +#define MARIA_MAX_TREE_LEVELS 32 + +/* maria_open() flag, specific for maria_pack */ +#define HA_OPEN_IGNORE_MOVED_STATE (1U << 30) + +struct st_transaction; + +/* undef map from my_nosys; We need test-if-disk full */ +#undef my_write + +#define CRC_SIZE 4 + +typedef struct st_maria_state_info +{ + struct + { /* Fileheader (24 bytes) */ + uchar file_version[4]; + uchar options[2]; + uchar header_length[2]; + uchar state_info_length[2]; + uchar base_info_length[2]; + uchar base_pos[2]; + uchar key_parts[2]; /* Key parts */ + uchar unique_key_parts[2]; /* Key parts + unique parts */ + uchar keys; /* number of keys in file */ + uchar uniques; /* number of UNIQUE definitions */ + uchar language; /* Language for indexes */ + uchar fulltext_keys; + uchar data_file_type; + /* Used by mariapack to store the original data_file_type */ + uchar org_data_file_type; + } header; + + MARIA_STATUS_INFO state; + /* maria_ha->state points here for crash-safe but not versioned tables */ + MARIA_STATUS_INFO common; + ha_rows split; /* number of split 
blocks */ + my_off_t dellink; /* Link to next removed block */ + pgcache_page_no_t first_bitmap_with_space; + ulonglong auto_increment; + TrID create_trid; /* Minimum trid for file */ + TrID last_change_trn; /* self-descriptive */ + ulong update_count; /* Updated for each write lock */ + ulong status; + double *rec_per_key_part; + ulong *nulls_per_key_part; + ha_checksum checksum; /* Table checksum */ + my_off_t *key_root; /* Start of key trees */ + my_off_t key_del; /* delete links for index pages */ + my_off_t records_at_analyze; /* Rows when calculating rec_per_key */ + + ulong sec_index_changed; /* Updated when new sec_index */ + ulong sec_index_used; /* which extra index are in use */ + ulonglong key_map; /* Which keys are in use */ + ulong version; /* timestamp of create */ + time_t create_time; /* Time when created database */ + time_t recover_time; /* Time for last recover */ + time_t check_time; /* Time for last check */ + uint sortkey; /* sorted by this key (not used) */ + uint open_count; + uint changed; /* Changed since maria_chk */ + /** + Birthday of the table: no record in the log before this LSN should ever + be applied to the table. Updated when created, renamed, explicitly + repaired (REPAIR|OPTIMIZE TABLE, ALTER TABLE ENABLE KEYS, maria_chk). + */ + LSN create_rename_lsn; + /** @brief Log horizon when state was last updated on disk */ + TRANSLOG_ADDRESS is_of_horizon; + /** + REDO phase should ignore any record before this LSN. UNDO phase + shouldn't, this is the difference with create_rename_lsn. + skip_redo_lsn >= create_rename_lsn. 
+ The distinction is for these cases: + - after a repair at end of bulk insert (enabling indices), REDO phase + should skip the table but UNDO phase should not, so only skip_redo_lsn is + increased, not create_rename_lsn + - if one table is corrupted and so recovery fails, user may repair the + table with maria_chk and let recovery restart: that recovery should then + skip the repaired table even in the UNDO phase, so create_rename_lsn is + increased. + */ + LSN skip_redo_lsn; + + /* the following isn't saved on disk */ + uint state_diff_length; /* Should be 0 */ + uint state_length; /* Length of state header in file */ + ulong *key_info; +} MARIA_STATE_INFO; + + +#define MARIA_STATE_INFO_SIZE \ + (24 + 2 + LSN_STORE_SIZE*3 + 4 + 11*8 + 4*4 + 8 + 3*4 + 5*8) +#define MARIA_FILE_OPEN_COUNT_OFFSET 0 +#define MARIA_FILE_CHANGED_OFFSET 2 +#define MARIA_FILE_CREATE_RENAME_LSN_OFFSET 4 +#define MARIA_FILE_CREATE_TRID_OFFSET (4 + LSN_STORE_SIZE*3 + 11*8) + +#define MARIA_STATE_KEY_SIZE (8 + 4) +#define MARIA_STATE_KEYBLOCK_SIZE 8 +#define MARIA_STATE_KEYSEG_SIZE 12 +#define MARIA_STATE_EXTRA_SIZE (MARIA_MAX_KEY*MARIA_STATE_KEY_SIZE + MARIA_MAX_KEY*HA_MAX_KEY_SEG*MARIA_STATE_KEYSEG_SIZE) +#define MARIA_KEYDEF_SIZE (2+ 5*2) +#define MARIA_UNIQUEDEF_SIZE (2+1+1) +#define HA_KEYSEG_SIZE (6+ 2*2 + 4*2) +#define MARIA_MAX_KEY_BUFF (HA_MAX_KEY_BUFF + MARIA_MAX_PACK_TRANSID_SIZE) +#define MARIA_COLUMNDEF_SIZE (2*7+1+1+4) +#define MARIA_BASE_INFO_SIZE (MY_UUID_SIZE + 5*8 + 6*4 + 11*2 + 6 + 5*2 + 1 + 16) +#define MARIA_INDEX_BLOCK_MARGIN 16 /* Safety margin for .MYI tables */ +/* Internal management bytes needed to store 2 transid/key on an index page */ +#define MARIA_MAX_PACK_TRANSID_SIZE (TRANSID_SIZE+1) +#define MARIA_TRANSID_PACK_OFFSET (256- TRANSID_SIZE - 1) +#define MARIA_MIN_TRANSID_PACK_OFFSET (MARIA_TRANSID_PACK_OFFSET-TRANSID_SIZE) +#define MARIA_INDEX_OVERHEAD_SIZE (MARIA_MAX_PACK_TRANSID_SIZE * 2) +#define MARIA_DELETE_KEY_NR 255 /* keynr for deleted blocks */ + +/* + 
Basic information of the Maria table. This is stored on disk + and not changed (unless we do DLL changes). +*/ + +typedef struct st_ma_base_info +{ + my_off_t keystart; /* Start of keys */ + my_off_t max_data_file_length; + my_off_t max_key_file_length; + my_off_t margin_key_file_length; + ha_rows records, reloc; /* Create information */ + ulong mean_row_length; /* Create information */ + ulong reclength; /* length of unpacked record */ + ulong pack_reclength; /* Length of full packed rec */ + ulong min_pack_length; + ulong max_pack_length; /* Max possibly length of packed rec */ + ulong min_block_length; + uint fields; /* fields in table */ + uint fixed_not_null_fields; + uint fixed_not_null_fields_length; + uint max_field_lengths; + uint pack_fields; /* packed fields in table */ + uint varlength_fields; /* char/varchar/blobs */ + /* Number of bytes in the index used to refer to a row (2-8) */ + uint rec_reflength; + /* Number of bytes in the index used to refer to another index page (2-8) */ + uint key_reflength; /* = 2-8 */ + uint keys; /* same as in state.header */ + uint auto_key; /* Which key-1 is a auto key */ + uint blobs; /* Number of blobs */ + /* Length of packed bits (when table was created first time) */ + uint pack_bytes; + /* Length of null bits (when table was created first time) */ + uint original_null_bytes; + uint null_bytes; /* Null bytes in record */ + uint field_offsets; /* Number of field offsets */ + uint max_key_block_length; /* Max block length */ + uint max_key_length; /* Max key length */ + /* Extra allocation when using dynamic record format */ + uint extra_alloc_bytes; + uint extra_alloc_procent; + uint is_nulls_extended; /* 1 if new null bytes */ + uint default_row_flag; /* 0 or ROW_FLAG_NULLS_EXTENDED */ + uint block_size; + /* Size of initial record buffer */ + uint default_rec_buff_size; + /* Extra number of bytes the row format require in the record buffer */ + uint extra_rec_buff_size; + /* Tuning flags that can be ignored by 
older Maria versions */ + uint extra_options; + + /* The following are from the header */ + uint key_parts, all_key_parts; + uchar uuid[MY_UUID_SIZE]; + /** + @brief If false, we disable logging, versioning, transaction etc. Observe + difference with MARIA_SHARE::now_transactional + */ + my_bool born_transactional; +} MARIA_BASE_INFO; + + +/* Structs used intern in database */ + +typedef struct st_maria_blob /* Info of record */ +{ + ulong offset; /* Offset to blob in record */ + uint pack_length; /* Type of packed length */ + ulong length; /* Calc:ed for each record */ +} MARIA_BLOB; + + +typedef struct st_maria_pack +{ + ulong header_length; + uint ref_length; + uchar version; +} MARIA_PACK; + +typedef struct st_maria_file_bitmap +{ + uchar *map; + pgcache_page_no_t page; /* Page number for current bitmap */ + uint used_size; /* Size of bitmap head that is not 0 */ + my_bool changed; /* 1 if page needs to be written */ + my_bool changed_not_flushed; /* 1 if some bitmap is not flushed */ + uint flush_all_requested; /**< If _ma_bitmap_flush_all waiting */ + uint non_flushable; /**< 0 if bitmap and log are in sync */ + PAGECACHE_FILE file; /* datafile where bitmap is stored */ + +#ifdef THREAD + pthread_mutex_t bitmap_lock; + pthread_cond_t bitmap_cond; /**< When bitmap becomes flushable */ +#endif + /* Constants, allocated when initiating bitmaps */ + uint sizes[8]; /* Size per bit combination */ + uint total_size; /* Total usable size of bitmap page */ + uint block_size; /* Block size of file */ + ulong pages_covered; /* Pages covered by bitmap + 1 */ + DYNAMIC_ARRAY pinned_pages; /**< not-yet-flushable bitmap pages */ +} MARIA_FILE_BITMAP; + +#define MARIA_CHECKPOINT_LOOKS_AT_ME 1 +#define MARIA_CHECKPOINT_SHOULD_FREE_ME 2 +#define MARIA_CHECKPOINT_SEEN_IN_LOOP 4 + +typedef struct st_maria_share +{ /* Shared between opens */ + MARIA_STATE_INFO state; + MARIA_BASE_INFO base; + MARIA_STATE_HISTORY *state_history; + MARIA_KEYDEF ft2_keyinfo; /* Second-level ft-key 
definition */ + MARIA_KEYDEF *keyinfo; /* Key definitions */ + MARIA_UNIQUEDEF *uniqueinfo; /* unique definitions */ + HA_KEYSEG *keyparts; /* key part info */ + MARIA_COLUMNDEF *columndef; /* Pointer to column information */ + MARIA_PACK pack; /* Data about packed records */ + MARIA_BLOB *blobs; /* Pointer to blobs */ + uint16 *column_nr; /* Original column order */ + LEX_STRING unique_file_name; /* realpath() of index file */ + LEX_STRING data_file_name; /* Resolved path names from symlinks */ + LEX_STRING index_file_name; + LEX_STRING open_file_name; /* parameter to open filename */ + uchar *file_map; /* mem-map of file if possible */ + PAGECACHE *pagecache; /* ref to the current key cache */ + MARIA_DECODE_TREE *decode_trees; + /* + Previous auto-increment value. Used to verify if we can restore the + auto-increment counter if we have to abort an insert (duplicate key). + */ + ulonglong last_auto_increment; + uint16 *decode_tables; + uint16 id; /**< 2-byte id by which log records refer to the table */ + /* Called the first time the table instance is opened */ + my_bool (*once_init)(struct st_maria_share *, File); + /* Called when the last instance of the table is closed */ + my_bool (*once_end)(struct st_maria_share *); + /* Is called for every open of the table */ + my_bool (*init)(MARIA_HA *); + /* Is called for every close of the table */ + void (*end)(MARIA_HA *); + /* Called when we want to read a record from a specific position */ + int (*read_record)(MARIA_HA *, uchar *, MARIA_RECORD_POS); + /* Initialize a scan */ + my_bool (*scan_init)(MARIA_HA *); + /* Read next record while scanning */ + int (*scan)(MARIA_HA *, uchar *, MARIA_RECORD_POS, my_bool); + /* End scan */ + void (*scan_end)(MARIA_HA *); + int (*scan_remember_pos)(MARIA_HA *, MARIA_RECORD_POS*); + void (*scan_restore_pos)(MARIA_HA *, MARIA_RECORD_POS); + /* Pre-write of row (some handlers may do the actual write here) */ + MARIA_RECORD_POS (*write_record_init)(MARIA_HA *, const uchar *); + /* 
Write record (or accept write_record_init) */ + my_bool (*write_record)(MARIA_HA *, const uchar *); + /* Called when write failed */ + my_bool (*write_record_abort)(MARIA_HA *); + my_bool (*update_record)(MARIA_HA *, MARIA_RECORD_POS, + const uchar *, const uchar *); + my_bool (*delete_record)(MARIA_HA *, const uchar *record); + my_bool (*compare_record)(MARIA_HA *, const uchar *); + /* calculate checksum for a row */ + ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *); + /* + Calculate checksum for a row during write. May be 0 if we calculate + the checksum in write_record_init() + */ + ha_checksum(*calc_write_checksum)(MARIA_HA *, const uchar *); + /* calculate checksum for a row during check table */ + ha_checksum(*calc_check_checksum)(MARIA_HA *, const uchar *); + /* Compare a row in memory with a row on disk */ + my_bool (*compare_unique)(MARIA_HA *, MARIA_UNIQUEDEF *, + const uchar *record, MARIA_RECORD_POS pos); + my_off_t (*keypos_to_recpos)(struct st_maria_share *share, my_off_t pos); + my_off_t (*recpos_to_keypos)(struct st_maria_share *share, my_off_t pos); + my_bool (*row_is_visible)(MARIA_HA *); + + /* Mappings to read/write the data file */ + size_t (*file_read)(MARIA_HA *, uchar *, size_t, my_off_t, myf); + size_t (*file_write)(MARIA_HA *, const uchar *, size_t, my_off_t, myf); + /* query cache invalidator for merged tables */ + invalidator_by_filename invalidator; + /* query cache invalidator for changing state */ + invalidator_by_filename chst_invalidator; + my_off_t key_del_current; /* delete links for index pages */ + ulong this_process; /* processid */ + ulong last_process; /* For table-change-check */ + ulong last_version; /* Version on start */ + ulong options; /* Options used */ + ulong min_pack_length; /* These are used by packed data */ + ulong max_pack_length; + ulong state_diff_length; + uint rec_reflength; /* rec_reflength in use now */ + uint keypage_header; + uint32 ftkeys; /* Number of distinct full-text keys + + 1 */ + 
PAGECACHE_FILE kfile; /* Shared keyfile */ + File data_file; /* Shared data file */ + int mode; /* mode of file on open */ + uint reopen; /* How many times opened */ + uint in_trans; /* Number of references by trn */ + uint w_locks, r_locks, tot_locks; /* Number of read/write locks */ + uint block_size; /* block_size of keyfile & data file*/ + uint max_index_block_size; /* block_size - end_of_page_info */ + /* Fixed length part of a packed row in BLOCK_RECORD format */ + uint base_length; + myf write_flag; + enum data_file_type data_file_type; + enum pagecache_page_type page_type; /* value depending transactional */ + /** + if Checkpoint looking at table; protected by close_lock or THR_LOCK_maria + */ + uint8 in_checkpoint; + my_bool temporary; + /* Below flag is needed to make log tables work with concurrent insert */ + my_bool is_log_table; + + my_bool changed, /* If changed since lock */ + global_changed, /* If changed since open */ + not_flushed; + my_bool lock_key_trees; /* If we have to lock trees on read */ + my_bool non_transactional_concurrent_insert; + my_bool delay_key_write; + my_bool have_rtree; + /** + @brief if the table is transactional right now. It may have been created + transactional (base.born_transactional==TRUE) but with transactionality + (logging) temporarily disabled (now_transactional==FALSE). The opposite + (FALSE, TRUE) is impossible. + */ + my_bool now_transactional; + my_bool have_versioning; + my_bool key_del_used; /* != 0 if key_del is locked */ + my_bool deleting; /* we are going to delete this table */ +#ifdef THREAD + THR_LOCK lock; + void (*lock_restore_status)(void *); + /** + Protects kfile, dfile, most members of the state, state disk writes, + versioning information (like in_trans, state_history). + @todo find the exhaustive list. 
+ */ + pthread_mutex_t intern_lock; + pthread_mutex_t key_del_lock; + pthread_cond_t key_del_cond; + /** + _Always_ held while closing table; prevents checkpoint from looking at + structures freed during closure (like bitmap). If you need close_lock and + intern_lock, lock them in this order. + */ + pthread_mutex_t close_lock; +#endif + my_off_t mmaped_length; + uint nonmmaped_inserts; /* counter of writing in + non-mmaped area */ + MARIA_FILE_BITMAP bitmap; + rw_lock_t mmap_lock; + LSN lsn_of_file_id; /**< LSN of its last LOGREC_FILE_ID */ +} MARIA_SHARE; + + +typedef uchar MARIA_BITMAP_BUFFER; + +typedef struct st_maria_bitmap_block +{ + pgcache_page_no_t page; /* Page number */ + /* Number of continuous pages. TAIL_BIT is set if this is a tail page */ + uint page_count; + uint empty_space; /* Set for head and tail pages */ + /* + Number of BLOCKS for block-region (holds all non-blob-fields or one blob) + */ + uint sub_blocks; + /* set to <> 0 in write_record() if this block was actually used */ + uint8 used; + uint8 org_bitmap_value; +} MARIA_BITMAP_BLOCK; + + +typedef struct st_maria_bitmap_blocks +{ + MARIA_BITMAP_BLOCK *block; + uint count; + my_bool tail_page_skipped; /* If some tail pages was not used */ + my_bool page_skipped; /* If some full pages was not used */ +} MARIA_BITMAP_BLOCKS; + + +/* Data about the currently read row */ +typedef struct st_maria_row +{ + MARIA_BITMAP_BLOCKS insert_blocks; + MARIA_BITMAP_BUFFER *extents; + MARIA_RECORD_POS lastpos, nextpos; + MARIA_RECORD_POS *tail_positions; + ha_checksum checksum; + LSN orig_undo_lsn; /* Lsn at start of row insert */ + TrID trid; /* Transaction id for current row */ + uchar *empty_bits, *field_lengths; + uint *null_field_lengths; /* All null field lengths */ + ulong *blob_lengths; /* Length for each blob */ + ulong min_length, normal_length, char_length, varchar_length; + ulong blob_length, total_length; + size_t extents_buffer_length; /* Size of 'extents' buffer */ + uint head_length, 
header_length; + uint field_lengths_length; /* Length of data in field_lengths */ + uint extents_count; /* number of extents in 'extents' */ + uint full_page_count, tail_count; /* For maria_chk */ + uint space_on_head_page; +} MARIA_ROW; + +/* Data to scan row in blocked format */ +typedef struct st_maria_block_scan +{ + uchar *bitmap_buff, *bitmap_pos, *bitmap_end, *page_buff; + uchar *dir, *dir_end; + pgcache_page_no_t bitmap_page, max_page; + ulonglong bits; + uint number_of_rows, bit_pos; + MARIA_RECORD_POS row_base_page; +} MARIA_BLOCK_SCAN; + +typedef ICP_RESULT (*index_cond_func_t)(void *param); + +struct st_maria_handler +{ + MARIA_SHARE *s; /* Shared between open:s */ + struct st_ma_transaction *trn; /* Pointer to active transaction */ + void *external_ptr; /* Pointer to THD in mysql */ + MARIA_STATUS_INFO *state, state_save; + MARIA_STATUS_INFO *state_start; /* State at start of transaction */ + MARIA_ROW cur_row; /* The active row that we just read */ + MARIA_ROW new_row; /* Storage for a row during update */ + MARIA_KEY last_key; /* Last found key */ + MARIA_BLOCK_SCAN scan, *scan_save; + MARIA_BLOB *blobs; /* Pointer to blobs */ + MARIA_BIT_BUFF bit_buff; + DYNAMIC_ARRAY bitmap_blocks; + DYNAMIC_ARRAY pinned_pages; + /* accumulate indexfile changes between write's */ + TREE *bulk_insert; + LEX_CUSTRING *log_row_parts; /* For logging */ + DYNAMIC_ARRAY *ft1_to_ft2; /* used only in ft1->ft2 conversion */ + MEM_ROOT ft_memroot; /* used by the parser */ + MYSQL_FTPARSER_PARAM *ftparser_param; /* share info between init/deinit */ + uchar *buff; /* page buffer */ + uchar *keyread_buff; /* Buffer for last key read */ + uchar *lastkey_buff; /* Last used search key */ + uchar *lastkey_buff2; + uchar *first_mbr_key; /* Searched spatial key */ + uchar *rec_buff; /* Temp buffer for recordpack */ + uchar *blob_buff; /* Temp buffer for blobs */ + uchar *int_keypos; /* Save position for next/previous */ + uchar *int_maxpos; /* -""- */ + uint keypos_offset; /* Tmp 
storage for offset int_keypos */ + uint maxpos_offset; /* Tmp storage for offset int_maxpos */ + uchar *update_field_data; /* Used by update in rows-in-block */ + uint int_nod_flag; /* -""- */ + uint32 int_keytree_version; /* -""- */ + int (*read_record)(MARIA_HA *, uchar*, MARIA_RECORD_POS); + invalidator_by_filename invalidator; /* query cache invalidator */ + ulonglong last_auto_increment; /* auto value at start of statement */ + ulong this_unique; /* uniq filenumber or thread */ + ulong last_unique; /* last unique number */ + ulong this_loop; /* counter for this open */ + ulong last_loop; /* last used counter */ + MARIA_RECORD_POS save_lastpos; + MARIA_RECORD_POS dup_key_pos; + TrID dup_key_trid; + my_off_t pos; /* Intern variable */ + my_off_t last_keypage; /* Last key page read */ + my_off_t last_search_keypage; /* Last keypage when searching */ + + /* + QQ: the following two xxx_length fields should be removed, + as they are not compatible with parallel repair + */ + ulong packed_length, blob_length; /* Length of found, packed record */ + size_t rec_buff_size, blob_buff_size; + PAGECACHE_FILE dfile; /* The datafile */ + IO_CACHE rec_cache; /* When caching records */ + LIST open_list; + MY_BITMAP changed_fields; + ulong row_base_length; /* Length of row header */ + uint row_flag; /* Flag to store in row header */ + uint opt_flag; /* Optim. 
for space/speed */ + uint update; /* If file changed since open */ + int lastinx; /* Last used index */ + uint last_rkey_length; /* Last length in maria_rkey() */ + uint *last_rtree_keypos; /* Last key positions for rtrees */ + uint bulk_insert_ref_length; /* Length of row ref during bulk insert */ + uint non_flushable_state; + enum ha_rkey_function last_key_func; /* CONTAIN, OVERLAP, etc */ + uint save_lastkey_data_length; + uint save_lastkey_ref_length; + uint pack_key_length; /* For MARIA_MRG */ + myf lock_wait; /* is 0 or MY_SHORT_WAIT */ + int errkey; /* Got last error on this key */ + int lock_type; /* How database was locked */ + int tmp_lock_type; /* When locked by readinfo */ + uint data_changed; /* Somebody has changed data */ + uint save_update; /* When using KEY_READ */ + int save_lastinx; + uint preload_buff_size; /* When preloading indexes */ + uint16 last_used_keyseg; /* For MARIAMRG */ + uint8 key_del_used; /* != 0 if key_del is used */ + my_bool was_locked; /* Was locked in panic */ + my_bool append_insert_at_end; /* Set if concurrent insert */ + my_bool quick_mode; + /* Marker if key_del_changed */ + /* If info->keyread_buff can't be used for rnext */ + my_bool page_changed; + /* If info->keyread_buff has to be re-read for rnext */ + my_bool keyread_buff_used; + my_bool once_flags; /* For MARIA_MRG */ + /* For bulk insert enable/disable transactions control */ + my_bool switched_transactional; +#ifdef __WIN__ + my_bool owned_by_merge; /* This Maria table is part of a merge union */ +#endif +#ifdef THREAD + THR_LOCK_DATA lock; +#endif + uchar *maria_rtree_recursion_state; /* For RTREE */ + uchar length_buff[5]; /* temp buff to store blob lengths */ + int maria_rtree_recursion_depth; + + index_cond_func_t index_cond_func; /* Index condition function */ + void *index_cond_func_arg; /* parameter for the func */ +}; + +/* Some defines used by maria-functions */ + +#define USE_WHOLE_KEY 65535 /* Use whole key in _search() */ +#define F_EXTRA_LCK -1 + +/* bits in 
opt_flag */ +#define MEMMAP_USED 32 +#define REMEMBER_OLD_POS 64 + +#define WRITEINFO_UPDATE_KEYFILE 1 +#define WRITEINFO_NO_UNLOCK 2 + +/* once_flags */ +#define USE_PACKED_KEYS 1 +#define RRND_PRESERVE_LASTINX 2 + +/* bits in state.changed */ + +#define STATE_CHANGED 1 +#define STATE_CRASHED 2 +#define STATE_CRASHED_ON_REPAIR 4 +#define STATE_NOT_ANALYZED 8 +#define STATE_NOT_OPTIMIZED_KEYS 16 +#define STATE_NOT_SORTED_PAGES 32 +#define STATE_NOT_OPTIMIZED_ROWS 64 +#define STATE_NOT_ZEROFILLED 128 +#define STATE_NOT_MOVABLE 256 +#define STATE_MOVED 512 /* set if base->uuid != maria_uuid */ +#define STATE_IN_REPAIR 1024 /* We are running repair on table */ + +/* options to maria_read_cache */ + +#define READING_NEXT 1 +#define READING_HEADER 2 + +/* Number of bytes on key pages to indicate used size */ +#define KEYPAGE_USED_SIZE 2 +#define KEYPAGE_KEYID_SIZE 1 +#define KEYPAGE_FLAG_SIZE 1 +#define KEYPAGE_CHECKSUM_SIZE 4 +#define MAX_KEYPAGE_HEADER_SIZE (LSN_STORE_SIZE + KEYPAGE_USED_SIZE + \ + KEYPAGE_KEYID_SIZE + KEYPAGE_FLAG_SIZE + \ + TRANSID_SIZE) +#define KEYPAGE_FLAG_ISNOD 1 +#define KEYPAGE_FLAG_HAS_TRANSID 2 +/* Position to KEYPAGE_FLAG for transactional tables; parenthesized so it expands safely inside larger expressions */ +#define KEYPAGE_TRANSFLAG_OFFSET (LSN_STORE_SIZE + TRANSID_SIZE + KEYPAGE_KEYID_SIZE) + +#define _ma_get_page_used(share,x) \ + ((uint) mi_uint2korr((x) + (share)->keypage_header - KEYPAGE_USED_SIZE)) +#define _ma_store_page_used(share,x,y) \ + mi_int2store((x) + (share)->keypage_header - KEYPAGE_USED_SIZE, (y)) +#define _ma_get_keypage_flag(share,x) (x)[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE] +#define _ma_test_if_nod(share,x) \ + ((_ma_get_keypage_flag(share,x) & KEYPAGE_FLAG_ISNOD) ? 
(share)->base.key_reflength : 0) + +#define _ma_store_keynr(share, x, nr) x[(share)->keypage_header - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE - KEYPAGE_USED_SIZE]= (nr) +#define _ma_get_keynr(share, x) ((uchar) x[(share)->keypage_header - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE - KEYPAGE_USED_SIZE]) +#define _ma_store_transid(buff, transid) \ + transid_store((buff) + LSN_STORE_SIZE, (transid)) +#define _ma_korr_transid(buff) \ + transid_korr((buff) + LSN_STORE_SIZE) +#define _ma_store_keypage_flag(share,x,flag) x[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE]= (flag) +#define _ma_mark_page_with_transid(share, page) \ + (page)->flag|= KEYPAGE_FLAG_HAS_TRANSID; \ + (page)->buff[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE]= (page)->flag; + + +/* + TODO: write int4store_aligned as *((uint32 *) (T))= (uint32) (A) for + architectures where it is possible +*/ +#define int4store_aligned(A,B) int4store((A),(B)) + +#define maria_mark_crashed(x) do{(x)->s->state.changed|= STATE_CRASHED; \ + DBUG_PRINT("error", ("Marked table crashed")); \ + }while(0) +#define maria_mark_crashed_share(x) \ + do{(x)->state.changed|= STATE_CRASHED; \ + DBUG_PRINT("error", ("Marked table crashed")); \ + }while(0) +#define maria_mark_crashed_on_repair(x) do{(x)->s->state.changed|= \ + STATE_CRASHED|STATE_CRASHED_ON_REPAIR; \ + (x)->update|= HA_STATE_CHANGED; \ + DBUG_PRINT("error", ("Marked table crashed on repair")); \ + }while(0) +#define maria_mark_in_repair(x) do{(x)->s->state.changed|= \ + STATE_CRASHED | STATE_IN_REPAIR; \ + (x)->update|= HA_STATE_CHANGED; \ + DBUG_PRINT("error", ("Marked table crashed for repair")); \ + }while(0) +#define maria_is_crashed(x) ((x)->s->state.changed & STATE_CRASHED) +#define maria_is_crashed_on_repair(x) ((x)->s->state.changed & STATE_CRASHED_ON_REPAIR) +#define maria_in_repair(x) ((x)->s->state.changed & STATE_IN_REPAIR) + +#ifdef EXTRA_DEBUG +/** + Brings additional information in certain debug builds and in standalone + 
(non-ha_maria) programs. To help debugging. Not in ha_maria, to not spam the + user (some messages can be produced many times per statement, or even + wrongly during some repair operations). +*/ +#define maria_print_error(SHARE, ERRNO) \ + do{ if (!maria_in_ha_maria) \ + _ma_report_error((ERRNO), &(SHARE)->index_file_name); } \ + while(0) +#else +#define maria_print_error(SHARE, ERRNO) while (0) +#endif +#define DBUG_DUMP_KEY(name, key) DBUG_DUMP(name, (key)->data, (key)->data_length + (key)->ref_length) + + +/* Functions to store length of space packed keys, VARCHAR or BLOB keys */ + +#define store_key_length(key,length) \ +{ if ((length) < 255) \ + { *(key)=(length); } \ + else \ + { *(key)=255; mi_int2store((key)+1,(length)); } \ +} + +#define get_key_full_length(length,key) \ + { if (*(const uchar*) (key) != 255) \ + length= ((uint) *(const uchar*) ((key)++))+1; \ + else \ + { length=mi_uint2korr((key)+1)+3; (key)+=3; } \ +} + +#define get_key_full_length_rdonly(length,key) \ +{ if (*(const uchar*) (key) != 255) \ + length= ((uint) *(const uchar*) ((key)))+1; \ + else \ + { length=mi_uint2korr((key)+1)+3; } \ +} + +#define maria_max_key_length() ((maria_block_size - MAX_KEYPAGE_HEADER_SIZE)/2 - MARIA_INDEX_OVERHEAD_SIZE) +#define get_pack_length(length) ((length) >= 255 ? 
3 : 1) +#define _ma_have_versioning(info) ((info)->row_flag & ROW_FLAG_TRANSID) + +/** + Sets table's trn and prints debug information + @param tbl MARIA_HA of table + @param newtrn what to put into tbl->trn + @note cast of newtrn is because %p of NULL gives warning (NULL is int) +*/ +#define _ma_set_trn_for_table(tbl, newtrn) do { \ + DBUG_PRINT("info",("table: %p trn: %p -> %p", \ + (tbl), (tbl)->trn, (void *)(newtrn))); \ + (tbl)->trn= (newtrn); \ + } while (0) + + +#define MARIA_MIN_BLOCK_LENGTH 20 /* Because of delete-link */ +/* Don't use to small record-blocks */ +#define MARIA_EXTEND_BLOCK_LENGTH 20 +#define MARIA_SPLIT_LENGTH ((MARIA_EXTEND_BLOCK_LENGTH+4)*2) + /* Max prefix of record-block */ +#define MARIA_MAX_DYN_BLOCK_HEADER 20 +#define MARIA_BLOCK_INFO_HEADER_LENGTH 20 +#define MARIA_DYN_DELETE_BLOCK_HEADER 20 /* length of delete-block-header */ +#define MARIA_DYN_MAX_BLOCK_LENGTH ((1L << 24)-4L) +#define MARIA_DYN_MAX_ROW_LENGTH (MARIA_DYN_MAX_BLOCK_LENGTH - MARIA_SPLIT_LENGTH) +#define MARIA_DYN_ALIGN_SIZE 4 /* Align blocks on this */ +#define MARIA_MAX_DYN_HEADER_BYTE 13 /* max header uchar for dynamic rows */ +#define MARIA_MAX_BLOCK_LENGTH ((((ulong) 1 << 24)-1) & (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1))) +#define MARIA_REC_BUFF_OFFSET ALIGN_SIZE(MARIA_DYN_DELETE_BLOCK_HEADER+sizeof(uint32)) + +#define MEMMAP_EXTRA_MARGIN 7 /* Write this as a suffix for file */ + +#define PACK_TYPE_SELECTED 1 /* Bits in field->pack_type */ +#define PACK_TYPE_SPACE_FIELDS 2 +#define PACK_TYPE_ZERO_FILL 4 +#define MARIA_FOUND_WRONG_KEY 32738 /* Impossible value from ha_key_cmp */ + +#define MARIA_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size) (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size)) +#define MARIA_MAX_KEYPTR_SIZE 5 /* For calculating block lengths */ + +/* Marker for impossible delete link */ +#define IMPOSSIBLE_PAGE_NO LL(0xFFFFFFFFFF) + +/* The UNIQUE check is done with a hashed long key */ + +#define 
MARIA_UNIQUE_HASH_TYPE HA_KEYTYPE_ULONG_INT +#define maria_unique_store(A,B) mi_int4store((A),(B)) + +#ifdef THREAD +extern pthread_mutex_t THR_LOCK_maria; +#endif +#if !defined(THREAD) || defined(DONT_USE_RW_LOCKS) +#define rw_wrlock(A) {} +#define rw_rdlock(A) {} +#define rw_unlock(A) {} +#endif + +/* Some tuning parameters */ +#define MARIA_MIN_KEYBLOCK_LENGTH 50 /* When to split delete blocks */ +#define MARIA_MIN_SIZE_BULK_INSERT_TREE 16384 /* this is per key */ +#define MARIA_MIN_ROWS_TO_USE_BULK_INSERT 100 +#define MARIA_MIN_ROWS_TO_DISABLE_INDEXES 100 +#define MARIA_MIN_ROWS_TO_USE_WRITE_CACHE 10 +/* Keep a small buffer for tables only using small blobs */ +#define MARIA_SMALL_BLOB_BUFFER 1024 +#define MARIA_MAX_CONTROL_FILE_LOCK_RETRY 30 /* Retry this many times */ + + +/* Some extern variables */ +extern LIST *maria_open_list; +extern uchar maria_file_magic[], maria_pack_file_magic[]; +extern uchar maria_uuid[MY_UUID_SIZE]; +extern uint32 maria_read_vec[], maria_readnext_vec[]; +extern uint maria_quick_table_bits; +extern char *maria_data_root; +extern uchar maria_zero_string[]; +extern my_bool maria_inited, maria_in_ha_maria, maria_recovery_changed_data; +extern my_bool maria_recovery_verbose; +extern HASH maria_stored_state; +extern int (*maria_create_trn_hook)(MARIA_HA *); + +/* This is used by _ma_calc_xxx_key_length och _ma_store_key */ +typedef struct st_maria_s_param +{ + const uchar *key; + uchar *prev_key, *next_key_pos; + uchar *key_pos; /* For balance page */ + uint ref_length, key_length, n_ref_length; + uint n_length, totlength, part_of_prev_key, prev_length, pack_marker; + uint changed_length; + int move_length; /* For balance_page */ + my_bool store_not_null; +} MARIA_KEY_PARAM; + + +/* Used to store reference to pinned page */ +typedef struct st_pinned_page +{ + PAGECACHE_BLOCK_LINK *link; + enum pagecache_page_lock unlock, write_lock; + my_bool changed; +} MARIA_PINNED_PAGE; + + +/* Keeps all information about a page and related to a page 
*/ +typedef struct st_maria_page +{ + MARIA_HA *info; + const MARIA_KEYDEF *keyinfo; + uchar *buff; /* Data for page */ + my_off_t pos; /* Disk address to page */ + uint size; /* Size of data on page */ + uint org_size; /* Size of page at read or after log */ + uint node; /* 0 or share->base.key_reflength */ + uint flag; /* Page flag */ + uint link_offset; +} MARIA_PAGE; + + +/* Prototypes for intern functions */ +extern int _ma_read_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS); +extern int _ma_read_rnd_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS, + my_bool); +extern my_bool _ma_write_dynamic_record(MARIA_HA *, const uchar *); +extern my_bool _ma_update_dynamic_record(MARIA_HA *, MARIA_RECORD_POS, + const uchar *, const uchar *); +extern my_bool _ma_delete_dynamic_record(MARIA_HA *info, const uchar *record); +extern my_bool _ma_cmp_dynamic_record(MARIA_HA *info, const uchar *record); +extern my_bool _ma_write_blob_record(MARIA_HA *, const uchar *); +extern my_bool _ma_update_blob_record(MARIA_HA *, MARIA_RECORD_POS, + const uchar *, const uchar *); +extern int _ma_read_static_record(MARIA_HA *info, uchar *, MARIA_RECORD_POS); +extern int _ma_read_rnd_static_record(MARIA_HA *, uchar *, MARIA_RECORD_POS, + my_bool); +extern my_bool _ma_write_static_record(MARIA_HA *, const uchar *); +extern my_bool _ma_update_static_record(MARIA_HA *, MARIA_RECORD_POS, + const uchar *, const uchar *); +extern my_bool _ma_delete_static_record(MARIA_HA *info, const uchar *record); +extern my_bool _ma_cmp_static_record(MARIA_HA *info, const uchar *record); +extern my_bool _ma_ck_write(MARIA_HA *info, MARIA_KEY *key); +extern my_bool _ma_enlarge_root(MARIA_HA *info, MARIA_KEY *key, + MARIA_RECORD_POS *root); +int _ma_insert(register MARIA_HA *info, MARIA_KEY *key, + MARIA_PAGE *anc_page, uchar *key_pos, uchar *key_buff, + MARIA_PAGE *father_page, uchar *father_key_pos, + my_bool insert_last); +extern my_bool _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEY *key, + 
MARIA_RECORD_POS *root, uint32 comp_flag); +extern int _ma_split_page(MARIA_HA *info, MARIA_KEY *key, + MARIA_PAGE *split_page, + uint org_split_length, + uchar *inserted_key_pos, uint changed_length, + int move_length, + uchar *key_buff, my_bool insert_last_key); +extern uchar *_ma_find_half_pos(MARIA_KEY *key, MARIA_PAGE *page, + uchar ** after_key); +extern int _ma_calc_static_key_length(const MARIA_KEY *key, uint nod_flag, + uchar *key_pos, uchar *org_key, + uchar *key_buff, + MARIA_KEY_PARAM *s_temp); +extern int _ma_calc_var_key_length(const MARIA_KEY *key, uint nod_flag, + uchar *key_pos, uchar *org_key, + uchar *key_buff, + MARIA_KEY_PARAM *s_temp); +extern int _ma_calc_var_pack_key_length(const MARIA_KEY *key, + uint nod_flag, uchar *next_key, + uchar *org_key, uchar *prev_key, + MARIA_KEY_PARAM *s_temp); +extern int _ma_calc_bin_pack_key_length(const MARIA_KEY *key, + uint nod_flag, uchar *next_key, + uchar *org_key, uchar *prev_key, + MARIA_KEY_PARAM *s_temp); +extern void _ma_store_static_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); +extern void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); +#ifdef NOT_USED +extern void _ma_store_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); +#endif +extern void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); + +extern my_bool _ma_ck_delete(MARIA_HA *info, MARIA_KEY *key); +extern my_bool _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEY *key, + my_off_t *root); +extern int _ma_readinfo(MARIA_HA *info, int lock_flag, int check_keybuffer); +extern int _ma_writeinfo(MARIA_HA *info, uint options); +extern int _ma_test_if_changed(MARIA_HA *info); +extern int _ma_mark_file_changed(MARIA_HA *info); +extern void _ma_mark_file_crashed(MARIA_SHARE *share); +extern my_bool _ma_set_uuid(MARIA_HA *info, my_bool reset_uuid); +extern my_bool _ma_check_if_zero(uchar *pos, size_t size); 
+extern int _ma_decrement_open_count(MARIA_HA *info); +extern int _ma_check_index(MARIA_HA *info, int inx); +extern int _ma_search(MARIA_HA *info, MARIA_KEY *key, uint32 nextflag, + my_off_t pos); +extern int _ma_bin_search(const MARIA_KEY *key, const MARIA_PAGE *page, + uint32 comp_flag, uchar **ret_pos, uchar *buff, + my_bool *was_last_key); +extern int _ma_seq_search(const MARIA_KEY *key, const MARIA_PAGE *page, + uint comp_flag, uchar ** ret_pos, uchar *buff, + my_bool *was_last_key); +extern int _ma_prefix_search(const MARIA_KEY *key, const MARIA_PAGE *page, + uint32 comp_flag, uchar ** ret_pos, uchar *buff, + my_bool *was_last_key); +extern my_off_t _ma_kpos(uint nod_flag, const uchar *after_key); +extern void _ma_kpointer(MARIA_HA *info, uchar *buff, my_off_t pos); +MARIA_RECORD_POS _ma_row_pos_from_key(const MARIA_KEY *key); +TrID _ma_trid_from_key(const MARIA_KEY *key); +extern MARIA_RECORD_POS _ma_rec_pos(MARIA_SHARE *share, uchar *ptr); +extern void _ma_dpointer(MARIA_SHARE *share, uchar *buff, + MARIA_RECORD_POS pos); +extern uint _ma_get_static_key(MARIA_KEY *key, uint page_flag, uint nod_flag, + uchar **page); +extern uchar *_ma_skip_static_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page); +extern uint _ma_get_pack_key(MARIA_KEY *key, uint page_flag, uint nod_flag, + uchar **page); +extern uchar *_ma_skip_pack_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page); +extern uint _ma_get_binary_pack_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar **page_pos); +uchar *_ma_skip_binary_pack_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page); +extern uchar *_ma_get_last_key(MARIA_KEY *key, MARIA_PAGE *page, + uchar *endpos); +extern uchar *_ma_get_key(MARIA_KEY *key, MARIA_PAGE *page, uchar *keypos); +extern uint _ma_keylength(MARIA_KEYDEF *keyinfo, const uchar *key); +extern uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, const uchar *key, + HA_KEYSEG *end); +extern int _ma_search_next(MARIA_HA *info, 
MARIA_KEY *key, + uint32 nextflag, my_off_t pos); +extern int _ma_search_first(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t pos); +extern int _ma_search_last(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t pos); +extern my_off_t _ma_static_keypos_to_recpos(MARIA_SHARE *share, my_off_t pos); +extern my_off_t _ma_static_recpos_to_keypos(MARIA_SHARE *share, my_off_t pos); +extern my_off_t _ma_transparent_recpos(MARIA_SHARE *share, my_off_t pos); +extern my_off_t _ma_transaction_keypos_to_recpos(MARIA_SHARE *, my_off_t pos); +extern my_off_t _ma_transaction_recpos_to_keypos(MARIA_SHARE *, my_off_t pos); + +extern void _ma_page_setup(MARIA_PAGE *page, MARIA_HA *info, + const MARIA_KEYDEF *keyinfo, my_off_t pos, + uchar *buff); +extern my_bool _ma_fetch_keypage(MARIA_PAGE *page, MARIA_HA *info, + const MARIA_KEYDEF *keyinfo, + my_off_t pos, enum pagecache_page_lock lock, + int level, uchar *buff, + my_bool return_buffer); +extern my_bool _ma_write_keypage(MARIA_PAGE *page, + enum pagecache_page_lock lock, int level); +extern int _ma_dispose(MARIA_HA *info, my_off_t pos, my_bool page_not_read); +extern my_off_t _ma_new(register MARIA_HA *info, int level, + MARIA_PINNED_PAGE **page_link); +extern my_bool _ma_compact_keypage(MARIA_PAGE *page, TrID min_read_from); +extern uint transid_store_packed(MARIA_HA *info, uchar *to, ulonglong trid); +extern ulonglong transid_get_packed(MARIA_SHARE *share, const uchar *from); +#define transid_packed_length(data) \ + ((data)[0] < MARIA_MIN_TRANSID_PACK_OFFSET ? 
1 : \ + (uint) ((uchar) (data)[0]) - (MARIA_TRANSID_PACK_OFFSET - 1)) +#define key_has_transid(key) (*(key) & 1) + +#define page_mark_changed(info, page) \ + dynamic_element(&(info)->pinned_pages, (page)->link_offset, \ + MARIA_PINNED_PAGE*)->changed= 1; +#define page_store_size(share, page) \ + _ma_store_page_used((share), (page)->buff, (page)->size); +#define page_store_info(share, page) \ + _ma_store_keypage_flag((share), (page)->buff, (page)->flag); \ + _ma_store_page_used((share), (page)->buff, (page)->size); +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY +void page_cleanup(MARIA_SHARE *share, MARIA_PAGE *page) +#else +#define page_cleanup(A,B) while (0) +#endif + +extern MARIA_KEY *_ma_make_key(MARIA_HA *info, MARIA_KEY *int_key, uint keynr, + uchar *key, const uchar *record, + MARIA_RECORD_POS filepos, ulonglong trid); +extern MARIA_KEY *_ma_pack_key(MARIA_HA *info, MARIA_KEY *int_key, + uint keynr, uchar *key, + const uchar *old, key_part_map keypart_map, + HA_KEYSEG ** last_used_keyseg); +extern void _ma_copy_key(MARIA_KEY *to, const MARIA_KEY *from); +extern int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS); +extern my_bool _ma_read_cache(IO_CACHE *info, uchar *buff, + MARIA_RECORD_POS pos, size_t length, + uint re_read_if_possibly); +extern ulonglong ma_retrieve_auto_increment(const uchar *key, uint8 key_type); +extern my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size, + size_t new_size); +extern ulong _ma_rec_unpack(MARIA_HA *info, uchar *to, uchar *from, + ulong reclength); +extern my_bool _ma_rec_check(MARIA_HA *info, const uchar *record, + uchar *packpos, ulong packed_length, + my_bool with_checkum, ha_checksum checksum); +extern int _ma_write_part_record(MARIA_HA *info, my_off_t filepos, + ulong length, my_off_t next_filepos, + uchar ** record, ulong *reclength, + int *flag); +extern void _ma_print_key(FILE *stream, MARIA_KEY *key); +extern void _ma_print_keydata(FILE *stream, HA_KEYSEG *keyseg, + const uchar *key, uint 
length); +extern my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile); +extern my_bool _ma_once_end_pack_row(MARIA_SHARE *share); +extern int _ma_read_pack_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos); +extern int _ma_read_rnd_pack_record(MARIA_HA *, uchar *, MARIA_RECORD_POS, + my_bool); +extern int _ma_pack_rec_unpack(MARIA_HA *info, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *from, ulong reclength); +extern ulonglong _ma_safe_mul(ulonglong a, ulonglong b); +extern int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf, + const uchar *oldrec, const uchar *newrec, + my_off_t pos); + +/* + Parameter to _ma_get_block_info + The dynamic row header is read into this struct. For an explanation of + the fields, look at the function _ma_get_block_info(). +*/ + +typedef struct st_maria_block_info +{ + uchar header[MARIA_BLOCK_INFO_HEADER_LENGTH]; + ulong rec_len; + ulong data_len; + ulong block_len; + ulong blob_len; + MARIA_RECORD_POS filepos; + MARIA_RECORD_POS next_filepos; + MARIA_RECORD_POS prev_filepos; + uint second_read; + uint offset; +} MARIA_BLOCK_INFO; + + +/* bits in return from _ma_get_block_info */ + +#define BLOCK_FIRST 1 +#define BLOCK_LAST 2 +#define BLOCK_DELETED 4 +#define BLOCK_ERROR 8 /* Wrong data */ +#define BLOCK_SYNC_ERROR 16 /* Right data at wrong place */ +#define BLOCK_FATAL_ERROR 32 /* hardware-error */ + +#define NEED_MEM ((uint) 10*4*(IO_SIZE+32)+32) /* Nead for recursion */ +#define MAXERR 20 +#define BUFFERS_WHEN_SORTING 16 /* Alloc for sort-key-tree */ +#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE +#define INDEX_TMP_EXT ".TMM" +#define DATA_TMP_EXT ".TMD" + +#define UPDATE_TIME 1 +#define UPDATE_STAT 2 +#define UPDATE_SORT 4 +#define UPDATE_AUTO_INC 8 +#define UPDATE_OPEN_COUNT 16 + +#define USE_BUFFER_INIT (((1024L*1024L*128-MALLOC_OVERHEAD)/8192)*8192) +#define READ_BUFFER_INIT (1024L*256L-MALLOC_OVERHEAD) +#define SORT_BUFFER_INIT (1024L*1024L*256-MALLOC_OVERHEAD) +#define MIN_SORT_BUFFER 
(4096-MALLOC_OVERHEAD) + +#define fast_ma_writeinfo(INFO) if (!(INFO)->s->tot_locks) (void) _ma_writeinfo((INFO),0) +#define fast_ma_readinfo(INFO) ((INFO)->lock_type == F_UNLCK) && _ma_readinfo((INFO),F_RDLCK,1) + +extern uint _ma_get_block_info(MARIA_BLOCK_INFO *, File, my_off_t); +extern uint _ma_rec_pack(MARIA_HA *info, uchar *to, const uchar *from); +extern uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO *info, uchar **rec_buff_p, + size_t *rec_buff_size, + File file, my_off_t filepos); +extern void _ma_store_blob_length(uchar *pos, uint pack_length, uint length); +extern void _ma_report_error(int errcode, const LEX_STRING *file_name); +extern my_bool _ma_memmap_file(MARIA_HA *info); +extern void _ma_unmap_file(MARIA_HA *info); +extern uint _ma_save_pack_length(uint version, uchar * block_buff, + ulong length); +extern uint _ma_calc_pack_length(uint version, ulong length); +extern ulong _ma_calc_blob_length(uint length, const uchar *pos); +extern size_t _ma_mmap_pread(MARIA_HA *info, uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags); +extern size_t _ma_mmap_pwrite(MARIA_HA *info, const uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags); +extern size_t _ma_nommap_pread(MARIA_HA *info, uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags); +extern size_t _ma_nommap_pwrite(MARIA_HA *info, const uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags); + +/* my_pwrite instead of my_write used */ +#define MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET 1 +/* info should be written */ +#define MA_STATE_INFO_WRITE_FULL_INFO 2 +/* intern_lock taking is needed */ +#define MA_STATE_INFO_WRITE_LOCK 4 +uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite); +uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite); +uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state); +uint _ma_base_info_write(File file, MARIA_BASE_INFO *base); +my_bool _ma_keyseg_write(File file, const 
HA_KEYSEG *keyseg); +uchar *_ma_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg); +my_bool _ma_keydef_write(File file, MARIA_KEYDEF *keydef); +uchar *_ma_keydef_read(uchar *ptr, MARIA_KEYDEF *keydef); +my_bool _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *keydef); +uchar *_ma_uniquedef_read(uchar *ptr, MARIA_UNIQUEDEF *keydef); +my_bool _ma_columndef_write(File file, MARIA_COLUMNDEF *columndef); +uchar *_ma_columndef_read(uchar *ptr, MARIA_COLUMNDEF *columndef); +my_bool _ma_column_nr_write(File file, uint16 *offsets, uint columns); +uchar *_ma_column_nr_read(uchar *ptr, uint16 *offsets, uint columns); +ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record); +ha_checksum _ma_checksum(MARIA_HA *info, const uchar *buf); +ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *buf); +my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + uchar *record, ha_checksum unique_hash, + MARIA_RECORD_POS pos); +ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *buf); +my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos); +my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos); +my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b, + my_bool null_are_equal); +void _ma_get_status(void *param, my_bool concurrent_insert); +void _ma_update_status(void *param); +void _ma_restore_status(void *param); +void _ma_copy_status(void *to, void *from); +my_bool _ma_check_status(void *param); +void _ma_restore_status(void *param); +void _ma_reset_status(MARIA_HA *maria); +int _ma_def_scan_remember_pos(MARIA_HA *info, MARIA_RECORD_POS *lastpos); +void _ma_def_scan_restore_pos(MARIA_HA *info, MARIA_RECORD_POS lastpos); + +#include "ma_commit.h" + +extern MARIA_HA *_ma_test_if_reopen(const char *filename); +my_bool _ma_check_table_is_closed(const char *name, const char *where); +int _ma_open_datafile(MARIA_HA *info, 
MARIA_SHARE *share, const char *org_name, + File file_to_dup); +int _ma_open_keyfile(MARIA_SHARE *share); +void _ma_setup_functions(register MARIA_SHARE *share); +my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size); +void _ma_remap_file(MARIA_HA *info, my_off_t size); + +MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info, const uchar *record); +my_bool _ma_write_abort_default(MARIA_HA *info); + +C_MODE_START +#define MARIA_FLUSH_DATA 1 +#define MARIA_FLUSH_INDEX 2 +int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index, + enum flush_type flush_type_for_data, + enum flush_type flush_type_for_index); +/* + Functions needed by _ma_check (are overridden in MySQL/ha_maria.cc). + See ma_check_standalone.h . +*/ +int _ma_killed_ptr(HA_CHECK *param); +void _ma_check_print_error _VARARGS((HA_CHECK *param, const char *fmt, ...)) + ATTRIBUTE_FORMAT(printf, 2, 3); +void _ma_check_print_warning _VARARGS((HA_CHECK *param, const char *fmt, ...)) + ATTRIBUTE_FORMAT(printf, 2, 3); +void _ma_check_print_info _VARARGS((HA_CHECK *param, const char *fmt, ...)) + ATTRIBUTE_FORMAT(printf, 2, 3); +my_bool write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info); +C_MODE_END + +int _ma_flush_pending_blocks(MARIA_SORT_PARAM *param); +int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param); +int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param); +#ifdef THREAD +pthread_handler_t _ma_thr_find_all_keys(void *arg); +#endif + +int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param); +int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages, + size_t); +int _ma_sync_table_files(const MARIA_HA *info); +int _ma_initialize_data_file(MARIA_SHARE *share, File dfile); +int _ma_update_state_lsns(MARIA_SHARE *share, + LSN lsn, TrID create_trid, my_bool do_sync, + my_bool update_create_rename_lsn); +int _ma_update_state_lsns_sub(MARIA_SHARE *share, LSN lsn, + TrID create_trid, my_bool do_sync, + my_bool update_create_rename_lsn); +void 
_ma_set_data_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share); +void _ma_set_index_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share); +void _ma_tmp_disable_logging_for_table(MARIA_HA *info, + my_bool log_incomplete); +my_bool _ma_reenable_logging_for_table(MARIA_HA *info, my_bool flush_pages); +my_bool write_log_record_for_bulk_insert(MARIA_HA *info); +void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn); + +#define MARIA_NO_CRC_NORMAL_PAGE 0xffffffff +#define MARIA_NO_CRC_BITMAP_PAGE 0xfffffffe +extern my_bool maria_page_crc_set_index(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr); +extern my_bool maria_page_crc_set_normal(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr); +extern my_bool maria_page_crc_check_bitmap(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr); +extern my_bool maria_page_crc_check_data(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr); +extern my_bool maria_page_crc_check_index(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr); +extern my_bool maria_page_crc_check_none(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr); +extern my_bool maria_page_filler_set_bitmap(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr); +extern my_bool maria_page_filler_set_normal(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr); +extern my_bool maria_page_filler_set_none(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr); +extern void maria_page_write_failure(uchar* data_ptr); +extern my_bool maria_flush_log_for_page(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr); +extern my_bool maria_flush_log_for_page_none(uchar *page, + pgcache_page_no_t page_no, + uchar *data_ptr); +extern PAGECACHE *maria_log_pagecache; +extern void ma_set_index_cond_func(MARIA_HA *info, index_cond_func_t func, + void *func_arg); +int ma_check_index_cond(register MARIA_HA *info, uint keynr, uchar *record); diff --git a/storage/maria/maria_ftdump.c 
b/storage/maria/maria_ftdump.c new file mode 100644 index 00000000000..870d07fa96e --- /dev/null +++ b/storage/maria/maria_ftdump.c @@ -0,0 +1,282 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code + added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */ + +#include "ma_ftdefs.h" +#include <my_getopt.h> + +static void usage(); +static void complain(int val); +static my_bool get_one_option(int, const struct my_option *, char *); + +static int count=0, stats=0, dump=0, lstats=0; +static my_bool verbose; +static char *query=NULL; +static uint lengths[256]; + +#define MAX_LEN (HA_FT_MAXBYTELEN+10) +#define HOW_OFTEN_TO_WRITE 10000 + +static struct my_option my_long_options[] = +{ + {"help", 'h', "Display help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"help", '?', "Synonym for -h.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"count", 'c', "Calculate per-word stats (counts and global weights).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"dump", 'd', "Dump index (incl. 
data offsets and word weights).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"length", 'l', "Report length distribution.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"stats", 's', "Report global stats.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Be verbose.", + &verbose, &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +int main(int argc,char *argv[]) +{ + int error=0; + uint keylen, keylen2=0, inx, doc_cnt=0; + float weight= 1.0; + double gws, min_gws=0, avg_gws=0; + MARIA_HA *info; + char buf[MAX_LEN], buf2[MAX_LEN], buf_maxlen[MAX_LEN], buf_min_gws[MAX_LEN]; + ulong total=0, maxlen=0, uniq=0, max_doc_cnt=0; + struct { MARIA_HA *info; } aio0, *aio=&aio0; /* for GWS_IN_USE */ + + MY_INIT(argv[0]); + if ((error= handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(error); + maria_init(); + if (count || dump) + verbose=0; + if (!count && !dump && !lstats && !query) + stats=1; + + if (verbose) + setbuf(stdout,NULL); + + if (argc < 2) + usage(); + + { + char *end; + inx= (uint) strtoll(argv[1], &end, 10); + if (*end) + usage(); + } + + init_pagecache(maria_pagecache, USE_BUFFER_INIT, 0, 0, + MARIA_KEY_BLOCK_LENGTH, MY_WME); + + if (!(info=maria_open(argv[0], O_RDONLY, + HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER))) + { + error=my_errno; + goto err; + } + + *buf2=0; + aio->info=info; + + if ((inx >= info->s->base.keys) || + !(info->s->keyinfo[inx].flag & HA_FULLTEXT)) + { + printf("Key %d in table %s is not a FULLTEXT key\n", inx, + info->s->open_file_name.str); + goto err; + } + + maria_lock_database(info, F_EXTRA_LCK); + + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->update|= HA_STATE_PREV_FOUND; + + while (!(error=maria_rnext(info,NULL,inx))) + { + FT_WEIGTH subkeys; + keylen=*(info->lastkey_buff); + + subkeys.i= ft_sintXkorr(info->lastkey_buff + keylen + 1); + if (subkeys.i >= 0) + weight= subkeys.f; + +#ifdef HAVE_SNPRINTF + 
snprintf(buf,MAX_LEN,"%.*s",(int) keylen,info->lastkey_buff+1); +#else + sprintf(buf,"%.*s",(int) keylen,info->lastkey_buff+1); +#endif + my_casedn_str(default_charset_info,buf); + total++; + lengths[keylen]++; + + if (count || stats) + { + if (strcmp(buf, buf2)) + { + if (*buf2) + { + uniq++; + avg_gws+=gws=GWS_IN_USE; + if (count) + printf("%9u %20.7f %s\n",doc_cnt,gws,buf2); + if (maxlen<keylen2) + { + maxlen=keylen2; + strmov(buf_maxlen, buf2); + } + if (max_doc_cnt < doc_cnt) + { + max_doc_cnt=doc_cnt; + strmov(buf_min_gws, buf2); + min_gws=gws; + } + } + strmov(buf2, buf); + keylen2=keylen; + doc_cnt=0; + } + doc_cnt+= (subkeys.i >= 0 ? 1 : -subkeys.i); + } + if (dump) + { + if (subkeys.i >= 0) + printf("%9lx %20.7f %s\n", (long) info->cur_row.lastpos,weight,buf); + else + printf("%9lx => %17d %s\n",(long) info->cur_row.lastpos,-subkeys.i, + buf); + } + if (verbose && (total%HOW_OFTEN_TO_WRITE)==0) + printf("%10ld\r",total); + } + maria_lock_database(info, F_UNLCK); + + if (count || stats) + { + if (*buf2) + { + uniq++; + avg_gws+=gws=GWS_IN_USE; + if (count) + printf("%9u %20.7f %s\n",doc_cnt,gws,buf2); + if (maxlen<keylen2) + { + maxlen=keylen2; + strmov(buf_maxlen, buf2); + } + if (max_doc_cnt < doc_cnt) + { + max_doc_cnt=doc_cnt; + strmov(buf_min_gws, buf2); + min_gws=gws; + } + } + } + + if (stats) + { + count=0; + for (inx=0;inx<256;inx++) + { + count+=lengths[inx]; + if ((ulong) count >= total/2) + break; + } + printf("Total rows: %lu\nTotal words: %lu\n" + "Unique words: %lu\nLongest word: %lu chars (%s)\n" + "Median length: %u\n" + "Average global weight: %f\n" + "Most common word: %lu times, weight: %f (%s)\n", + (long) info->state->records, total, uniq, maxlen, buf_maxlen, + inx, avg_gws/uniq, max_doc_cnt, min_gws, buf_min_gws); + } + if (lstats) + { + count=0; + for (inx=0; inx<256; inx++) + { + count+=lengths[inx]; + if (count && lengths[inx]) + printf("%3u: %10lu %5.2f%% %20lu %4.1f%%\n", inx, + (ulong) 
lengths[inx],100.0*lengths[inx]/total,(ulong) count, + 100.0*count/total); + } + } + +err: + if (error && error != HA_ERR_END_OF_FILE) + printf("got error %d\n",my_errno); + if (info) + maria_close(info); + maria_end(); + return 0; +} + + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument __attribute__((unused))) +{ + switch(optid) { + case 'd': + dump=1; + complain(count || query); + break; + case 's': + stats=1; + complain(query!=0); + break; + case 'c': + count= 1; + complain(dump || query); + break; + case 'l': + lstats=1; + complain(query!=0); + break; + case '?': + case 'h': + usage(); + } + return 0; +} + +#include <help_start.h> + +static void usage() +{ + printf("Use: aria_ft_dump <table_name> <index_num>\n"); + my_print_help(my_long_options); + my_print_variables(my_long_options); + NETWARE_SET_SCREEN_MODE(1); + exit(1); +} + +#include <help_end.h> + +static void complain(int val) /* Kinda assert :-) */ +{ + if (val) + { + printf("You cannot use these options together!\n"); + exit(1); + } +} diff --git a/storage/maria/maria_pack.c b/storage/maria/maria_pack.c new file mode 100644 index 00000000000..1d2d3995bd8 --- /dev/null +++ b/storage/maria/maria_pack.c @@ -0,0 +1,3234 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Pack MARIA file */ + +#ifndef USE_MY_FUNC +#define USE_MY_FUNC /* We need at least my_malloc */ +#endif + +#include "maria_def.h" +#include <queues.h> +#include <my_tree.h> +#include "mysys_err.h" +#ifdef MSDOS +#include <io.h> +#endif +#ifndef __GNU_LIBRARY__ +#define __GNU_LIBRARY__ /* Skip warnings in getopt.h */ +#endif +#include <my_getopt.h> +#include <assert.h> + +#if SIZEOF_LONG_LONG > 4 +#define BITS_SAVED 64 +#else +#define BITS_SAVED 32 +#endif + +#define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */ +#define HEAD_LENGTH 32 +#define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */ + +#define DATA_TMP_EXT ".TMD" +#define OLD_EXT ".OLD" +#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE + +struct st_file_buffer { + File file; + uchar *buffer,*pos,*end; + my_off_t pos_in_file; + int bits; + ulonglong bitbucket; +}; + +struct st_huff_tree; +struct st_huff_element; + +typedef struct st_huff_counts { + uint field_length,max_zero_fill; + uint pack_type; + uint max_end_space,max_pre_space,length_bits,min_space; + ulong max_length; + enum en_fieldtype field_type; + struct st_huff_tree *tree; /* Tree for field */ + my_off_t counts[256]; + my_off_t end_space[8]; + my_off_t pre_space[8]; + my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed; + TREE int_tree; /* Tree for detecting distinct column values. */ + uchar *tree_buff; /* Column values, 'field_length' each. */ + uchar *tree_pos; /* Points to end of column values in 'tree_buff'. */ +} HUFF_COUNTS; + +typedef struct st_huff_element HUFF_ELEMENT; + +/* + WARNING: It is crucial for the optimizations in calc_packed_length() + that 'count' is the first element of 'HUFF_ELEMENT'. 
+*/ +struct st_huff_element { + my_off_t count; + union un_element { + struct st_nod { + HUFF_ELEMENT *left,*right; + } nod; + struct st_leaf { + HUFF_ELEMENT *null; + uint element_nr; /* Number of element */ + } leaf; + } a; +}; + + +typedef struct st_huff_tree { + HUFF_ELEMENT *root,*element_buffer; + HUFF_COUNTS *counts; + uint tree_number; + uint elements; + my_off_t bytes_packed; + uint tree_pack_length; + uint min_chr,max_chr,char_bits,offset_bits,max_offset,height; + ulonglong *code; + uchar *code_len; +} HUFF_TREE; + + +typedef struct st_isam_mrg { + MARIA_HA **file,**current,**end; + uint free_file; + uint count; + uint min_pack_length; /* Theese is used by packed data */ + uint max_pack_length; + uint ref_length; + uint max_blob_length; + my_off_t records; + /* true if at least one source file has at least one disabled index */ + my_bool src_file_has_indexes_disabled; +} PACK_MRG_INFO; + + +extern int main(int argc,char * *argv); +static void get_options(int *argc,char ***argv); +static MARIA_HA *open_maria_file(char *name,int mode); +static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count); +static int compress(PACK_MRG_INFO *file,char *join_name); +static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records); +static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, + uint trees, + HUFF_COUNTS *huff_counts, + uint fields); +static int compare_tree(void* cmp_arg __attribute__((unused)), + const uchar *s,const uchar *t); +static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts); +static void check_counts(HUFF_COUNTS *huff_counts,uint trees, + my_off_t records); +static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records, + uint max_space_length,my_off_t *space_counts, + my_off_t tot_space_count, + enum en_fieldtype field_type); +static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees); +static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts); +static int 
compare_huff_elements(void *not_used, uchar *a,uchar *b); +static int save_counts_in_queue(uchar *key,element_count count, + HUFF_TREE *tree); +static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag); +static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees); +static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees); +static void make_traverse_code_tree(HUFF_TREE *huff_tree, + HUFF_ELEMENT *element,uint size, + ulonglong code); +static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees, + my_off_t tot_elements,my_off_t filelength); +static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees); +static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees); +static uint *make_offset_code_tree(HUFF_TREE *huff_tree, + HUFF_ELEMENT *element, + uint *offset); +static uint max_bit(uint value); +static int compress_maria_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts); +static char *make_new_name(char *new_name,char *old_name); +static char *make_old_name(char *new_name,char *old_name); +static void init_file_buffer(File file,pbool read_buffer); +static int flush_buffer(ulong neaded_length); +static void end_file_buffer(void); +static void write_bits(ulonglong value, uint bits); +static void flush_bits(void); +static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg, + my_off_t new_length, ha_checksum crc); +static int save_state_mrg(File file,PACK_MRG_INFO *isam_file, + my_off_t new_length, ha_checksum crc); +static int mrg_close(PACK_MRG_INFO *mrg); +static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf); +static void mrg_reset(PACK_MRG_INFO *mrg); +#if !defined(DBUG_OFF) +static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count); +static int fakecmp(my_off_t **count1, my_off_t **count2); +#endif + + +static int error_on_write=0,test_only=0,verbose=0,silent=0, + write_loop=0,force_pack=0, isamchk_neaded=0; +static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL; +static my_bool 
backup, opt_wait; +/* + tree_buff_length is somewhat arbitrary. The bigger it is the better + the chance to win in terms of compression factor. On the other hand, + this table becomes part of the compressed file header. And its length + is coded with 16 bits in the header. Hence the limit is 2**16 - 1. +*/ +static uint tree_buff_length= 65536 - MALLOC_OVERHEAD; +static char tmp_dir[FN_REFLEN]={0},*join_table; +static my_off_t intervall_length; +static ha_checksum glob_crc; +static struct st_file_buffer file_buffer; +static QUEUE queue; +static HUFF_COUNTS *global_count; +static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; +static const char *load_default_groups[]= { "ariapack",0 }; + + /* + The main program. + + Loads the option defaults, parses the command line and then either + joins all named tables into 'join_table' (when --join was given) or + packs every named table in turn. A table that cannot be opened or + compressed sets 'error' but does not stop the loop over the remaining + tables. The process exits with status 2 if 'error' was set, else 0. + */ + +int main(int argc, char **argv) +{ + int error,ok; + PACK_MRG_INFO merge; + char **default_argv; + MY_INIT(argv[0]); + + load_defaults("my",load_default_groups,&argc,&argv); + default_argv= argv; + get_options(&argc,&argv); + maria_init(); + + error=ok=isamchk_neaded=0; + if (join_table) + { /* Join files into one */ + if (open_maria_files(&merge,argv,(uint) argc) || + compress(&merge,join_table)) + error=1; + } + else while (argc--) + { + MARIA_HA *isam_file; + if (!(isam_file=open_maria_file(*argv++,O_RDWR))) + error=1; + else + { + merge.file= &isam_file; + merge.current=0; + merge.free_file=0; + merge.count=1; + if (compress(&merge,0)) + error=1; + else + ok=1; + } + } + if (ok && isamchk_neaded && !silent) + puts("Remember to run aria_chk -rq on compressed tables"); + VOID(fflush(stdout)); + VOID(fflush(stderr)); + free_defaults(default_argv); + maria_end(); + my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR); + exit(error ? 
2 : 0); +#ifndef _lint + return 0; /* No compiler warning */ +#endif +} + +enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE}; + +static struct my_option my_long_options[] = +{ +#ifdef __NETWARE__ + {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"backup", 'b', "Make a backup of the table as table_name.OLD.", + &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"character-sets-dir", OPT_CHARSETS_DIR_MP, + "Directory where character sets are.", (char**) &charsets_dir, + (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, + {"force", 'f', + "Force packing of table even if it gets bigger or if tempfile exists.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"join", 'j', + "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.", + &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, + 0, 0, 0}, + {"help", '?', "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"silent", 's', "Be more silent.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"tmpdir", 'T', "Use temporary directory to store temporary table.", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"test", 't', "Don't pack table, only test packing it.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Write info about progress and packing result. 
Use many -v for more verbosity!", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Output version information and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"wait", 'w', "Wait and retry if table is in use.", &opt_wait, + &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +#include <help_start.h> + +static void print_version(void) +{ + VOID(printf("%s Ver 1.0 for %s on %s\n", + my_progname, SYSTEM_TYPE, MACHINE_TYPE)); + NETWARE_SET_SCREEN_MODE(1); +} + + +static void usage(void) +{ + print_version(); + puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc."); + puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,"); + puts("and you are welcome to modify and redistribute it under the GPL license\n"); + + puts("Pack a Aria-table to take much less space."); + puts("Keys are not updated, you must run aria_chk -rq on the index (.MAI) file"); + puts("afterwards to update the keys."); + puts("You should give the .MAI file as the filename argument."); + puts("To unpack a packed table, run aria_chk -u on the table"); + + VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname)); + my_print_help(my_long_options); + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + +#include <help_end.h> + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument) +{ + uint length; + + switch(optid) { +#ifdef __NETWARE__ + case OPT_AUTO_CLOSE: + setscreenmode(SCR_AUTOCLOSE_ON_EXIT); + break; +#endif + case 'f': + force_pack= 1; + tmpfile_createflag= O_RDWR | O_TRUNC; + break; + case 's': + write_loop= verbose= 0; + silent= 1; + break; + case 't': + test_only= 1; + /* Avoid to reset 'verbose' if it was already set > 1. */ + if (! 
verbose) + verbose= 1; + break; + case 'T': + length= (uint) (strmov(tmp_dir, argument) - tmp_dir); + if (length != dirname_length(tmp_dir)) + { + tmp_dir[length]=FN_LIBCHAR; + tmp_dir[length+1]=0; + } + break; + case 'v': + verbose++; /* Allow for selecting the level of verbosity. */ + silent= 0; + break; + case '#': + DBUG_PUSH(argument ? argument : "d:t:o,/tmp/aria_pack.trace"); + break; + case 'V': + print_version(); + exit(0); + case 'I': + case '?': + usage(); + exit(0); + } + return 0; +} + + /* reads options */ + /* Initiates DEBUG - but no debugging here ! */ + +static void get_options(int *argc,char ***argv) +{ + int ho_error; + + my_progname= argv[0][0]; + if (isatty(fileno(stdout))) + write_loop=1; + + if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option))) + exit(ho_error); + + if (!*argc) + { + usage(); + exit(1); + } + if (join_table) + { + backup=0; /* Not needed */ + tmp_dir[0]=0; + } + return; +} + + +static MARIA_HA *open_maria_file(char *name,int mode) +{ + MARIA_HA *isam_file; + MARIA_SHARE *share; + DBUG_ENTER("open_maria_file"); + + if (!(isam_file=maria_open(name, mode, HA_OPEN_IGNORE_MOVED_STATE | + (opt_wait ? HA_OPEN_WAIT_IF_LOCKED : + HA_OPEN_ABORT_IF_LOCKED)))) + { + VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno)); + DBUG_RETURN(0); + } + share=isam_file->s; + if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table) + { + if (!force_pack) + { + VOID(fprintf(stderr, "%s is already compressed\n", name)); + VOID(maria_close(isam_file)); + DBUG_RETURN(0); + } + if (verbose) + puts("Recompressing already compressed table"); + share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */ + } + if (! 
force_pack && share->state.state.records != 0 && + (share->state.state.records <= 1 || + share->state.state.data_file_length < 1024)) + { + VOID(fprintf(stderr, "%s is too small to compress\n", name)); + VOID(maria_close(isam_file)); + DBUG_RETURN(0); + } + VOID(maria_lock_database(isam_file,F_WRLCK)); + maria_ignore_trids(isam_file); + DBUG_RETURN(isam_file); +} + + +static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count) +{ + uint i,j; + mrg->count=0; + mrg->current=0; + mrg->file=(MARIA_HA**) my_malloc(sizeof(MARIA_HA*)*count,MYF(MY_FAE)); + mrg->free_file=1; + mrg->src_file_has_indexes_disabled= 0; + for (i=0; i < count ; i++) + { + if (!(mrg->file[i]=open_maria_file(names[i],O_RDONLY))) + goto error; + + mrg->src_file_has_indexes_disabled|= + ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map, + mrg->file[i]->s->base.keys); + } + /* Check that files are identical */ + for (j=0 ; j < count-1 ; j++) + { + MARIA_COLUMNDEF *m1,*m2,*end; + if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength || + mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields) + goto diff_file; + m1=mrg->file[j]->s->columndef; + end=m1+mrg->file[j]->s->base.fields; + m2=mrg->file[j+1]->s->columndef; + for ( ; m1 != end ; m1++,m2++) + { + if (m1->type != m2->type || m1->length != m2->length) + goto diff_file; + } + } + mrg->count=count; + return 0; + + diff_file: + VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n", + my_progname, names[j], names[j+1])); + error: + while (i--) + maria_close(mrg->file[i]); + my_free(mrg->file, MYF(0)); + return 1; +} + + +static int compress(PACK_MRG_INFO *mrg,char *result_table) +{ + int error; + File new_file,join_maria_file; + MARIA_HA *isam_file; + MARIA_SHARE *share; + char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN]; + uint i,header_length,fields,trees,used_trees; + my_off_t old_length,new_length,tot_elements; + HUFF_COUNTS *huff_counts; + HUFF_TREE *huff_trees; + 
DBUG_ENTER("compress"); + + isam_file=mrg->file[0]; /* Take this as an example */ + share=isam_file->s; + new_file=join_maria_file= -1; + trees=fields=0; + huff_trees=0; + huff_counts=0; + maria_block_size= isam_file->s->block_size; + + /* Create temporary or join file */ + if (backup) + VOID(fn_format(org_name,isam_file->s->open_file_name.str, + "",MARIA_NAME_DEXT, 2)); + else + VOID(fn_format(org_name,isam_file->s->open_file_name.str, + "",MARIA_NAME_DEXT, 2+4+16)); + + if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0, + maria_block_size, MY_WME) == 0) + { + fprintf(stderr, "Can't initialize page cache\n"); + goto err; + } + + if (!test_only && result_table) + { + /* Make a new indexfile based on first file in list */ + uint length; + uchar *buff; + strmov(org_name,result_table); /* Fix error messages */ + VOID(fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2)); + if ((join_maria_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) + < 0) + goto err; + length=(uint) share->base.keystart; + if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME)))) + goto err; + if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) || + my_write(join_maria_file,buff,length, + MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL))) + { + my_free(buff,MYF(0)); + goto err; + } + my_free(buff,MYF(0)); + VOID(fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2)); + } + else if (!tmp_dir[0]) + VOID(make_new_name(new_name,org_name)); + else + VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4)); + if (!test_only && + (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0) + goto err; + + /* Start calculating statistics */ + + mrg->records=0; + for (i=0 ; i < mrg->count ; i++) + mrg->records+=mrg->file[i]->s->state.state.records; + + DBUG_PRINT("info", ("Compressing %s: (%lu records)", + result_table ? 
new_name : org_name, + (ulong) mrg->records)); + if (write_loop || verbose) + { + VOID(printf("Compressing %s: (%lu records)\n", + result_table ? new_name : org_name, (ulong) mrg->records)); + } + trees=fields=share->base.fields; + huff_counts=init_huff_count(isam_file,mrg->records); + QUICK_SAFEMALLOC; + + /* + Read the whole data file(s) for statistics. + */ + DBUG_PRINT("info", ("- Calculating statistics")); + if (write_loop || verbose) + VOID(printf("- Calculating statistics\n")); + if (get_statistic(mrg,huff_counts)) + goto err; + NORMAL_SAFEMALLOC; + old_length=0; + for (i=0; i < mrg->count ; i++) + old_length+= (mrg->file[i]->s->state.state.data_file_length - + mrg->file[i]->s->state.state.empty); + + /* + Create a global priority queue in preparation for making + temporary Huffman trees. + */ + if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0)) + goto err; + + /* + Check each column if we should use pre-space-compress, end-space- + compress, empty-field-compress or zero-field-compress. + */ + check_counts(huff_counts,fields,mrg->records); + + /* + Build a Huffman tree for each column. + */ + huff_trees=make_huff_trees(huff_counts,trees); + + /* + If the packed lengths of combined columns is less then the sum of + the non-combined columns, then create common Huffman trees for them. + We do this only for uchar compressed columns, not for distinct values + compressed columns. + */ + if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0) + goto err; + + /* + Assign codes to all uchar or column values. + */ + if (make_huff_decode_table(huff_trees,fields)) + goto err; + + /* Prepare a file buffer. */ + init_file_buffer(new_file,0); + + /* + Reserve space in the target file for the fixed compressed file header. + */ + file_buffer.pos_in_file=HEAD_LENGTH; + if (! test_only) + VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0))); + + /* + Write field infos: field type, pack type, length bits, tree number. 
+ */ + write_field_info(huff_counts,fields,used_trees); + + /* + Write decode trees. + */ + if (!(tot_elements=write_huff_tree(huff_trees,trees))) + goto err; + + /* + Calculate the total length of the compression info header. + This includes the fixed compressed file header, the column compression + type descriptions, and the decode trees. + */ + header_length=(uint) file_buffer.pos_in_file+ + (uint) (file_buffer.pos-file_buffer.buffer); + + /* + Compress the source file into the target file. + */ + DBUG_PRINT("info", ("- Compressing file")); + if (write_loop || verbose) + VOID(printf("- Compressing file\n")); + error=compress_maria_file(mrg,huff_counts); + new_length=file_buffer.pos_in_file; + if (!error && !test_only) + { + uchar buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */ + bzero(buff,sizeof(buff)); + error=my_write(file_buffer.file,buff,sizeof(buff), + MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0; + } + + /* + Write the fixed compressed file header. + */ + if (!error) + error=write_header(mrg,header_length,used_trees,tot_elements, + new_length); + + /* Flush the file buffer. */ + end_file_buffer(); + + /* Display statistics. */ + DBUG_PRINT("info", ("Min record length: %6d Max length: %6d " + "Mean total length: %6ld", + mrg->min_pack_length, mrg->max_pack_length, + (ulong) (mrg->records ? (new_length/mrg->records) : 0))); + if (verbose && mrg->records) + VOID(printf("Min record length: %6d Max length: %6d " + "Mean total length: %6ld\n", mrg->min_pack_length, + mrg->max_pack_length, (ulong) (new_length/mrg->records))); + + /* Close source and target file. */ + if (!test_only) + { + error|=my_close(new_file,MYF(MY_WME)); + if (!result_table) + { + error|=my_close(isam_file->dfile.file, MYF(MY_WME)); + isam_file->dfile.file= -1; /* Tell maria_close file is closed */ + isam_file->s->bitmap.file.file= -1; + } + } + + /* Cleanup. */ + free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields); + if (! test_only && ! 
error) + { + if (result_table) + { + error=save_state_mrg(join_maria_file,mrg,new_length,glob_crc); + } + else + { + if (backup) + { + if (my_rename(org_name,make_old_name(temp_name, + isam_file->s->open_file_name.str), + MYF(MY_WME))) + error=1; + else + { + if (tmp_dir[0]) + error=my_copy(new_name,org_name,MYF(MY_WME)); + else + error=my_rename(new_name,org_name,MYF(MY_WME)); + if (!error) + { + VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME))); + if (tmp_dir[0]) + VOID(my_delete(new_name,MYF(MY_WME))); + } + } + } + else + { + if (tmp_dir[0]) + { + error=my_copy(new_name,org_name, + MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME)); + if (!error) + VOID(my_delete(new_name,MYF(MY_WME))); + } + else + error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME)); + } + if (! error) + error=save_state(isam_file,mrg,new_length,glob_crc); + } + } + error|=mrg_close(mrg); + if (join_maria_file >= 0) + error|=my_close(join_maria_file,MYF(MY_WME)); + if (error) + { + VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name)); + VOID(my_delete(new_name,MYF(MY_WME))); + DBUG_RETURN(-1); + } + if (write_loop || verbose) + { + if (old_length) + VOID(printf("%.4g%% \n", + (((longlong) (old_length - new_length)) * 100.0 / + (longlong) old_length))); + else + puts("Empty file saved in compressed format"); + } + DBUG_RETURN(0); + + err: + end_pagecache(maria_pagecache, 1); + free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields); + if (new_file >= 0) + VOID(my_close(new_file,MYF(0))); + if (join_maria_file >= 0) + VOID(my_close(join_maria_file,MYF(0))); + mrg_close(mrg); + VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name)); + DBUG_RETURN(-1); +} + + /* Init a huff_count-struct for each field and init it */ + +static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records) +{ + reg2 uint i; + reg1 HUFF_COUNTS *count; + if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields* + sizeof(HUFF_COUNTS), + MYF(MY_ZEROFILL | MY_WME)))) 
+ { + for (i=0 ; i < info->s->base.fields ; i++) + { + enum en_fieldtype type; + count[i].field_length=info->s->columndef[i].length; + type= count[i].field_type= (enum en_fieldtype) info->s->columndef[i].type; + if (type == FIELD_INTERVALL || + type == FIELD_CONSTANT || + type == FIELD_ZERO) + type = FIELD_NORMAL; + if (count[i].field_length <= 8 && + (type == FIELD_NORMAL || + type == FIELD_SKIP_ZERO)) + count[i].max_zero_fill= count[i].field_length; + /* + For every column initialize a tree, which is used to detect distinct + column values. 'int_tree' works together with 'tree_buff' and + 'tree_pos'. It's keys are implemented by pointers into 'tree_buff'. + This is accomplished by '-1' as the element size. + */ + init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL, + NULL); + if (records && type != FIELD_BLOB && type != FIELD_VARCHAR) + count[i].tree_pos=count[i].tree_buff = + my_malloc(count[i].field_length > 1 ? tree_buff_length : 2, + MYF(MY_WME)); + } + } + return count; +} + + + /* Free memory used by counts and trees */ + +static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees, + HUFF_COUNTS *huff_counts, + uint fields) +{ + register uint i; + + if (huff_trees) + { + for (i=0 ; i < trees ; i++) + { + if (huff_trees[i].element_buffer) + my_free(huff_trees[i].element_buffer,MYF(0)); + if (huff_trees[i].code) + my_free(huff_trees[i].code,MYF(0)); + } + my_free(huff_trees,MYF(0)); + } + if (huff_counts) + { + for (i=0 ; i < fields ; i++) + { + if (huff_counts[i].tree_buff) + { + my_free(huff_counts[i].tree_buff,MYF(0)); + delete_tree(&huff_counts[i].int_tree); + } + } + my_free(huff_counts, MYF(0)); + } + delete_queue(&queue); /* This is safe to free */ + return; +} + + /* Read through old file and gather some statistics */ + +static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts) +{ + int error; + uint length, null_bytes; + ulong reclength,max_blob_length; + uchar 
*record,*pos,*next_pos,*end_pos,*start_pos; + ha_rows record_count; + HUFF_COUNTS *count,*end_count; + TREE_ELEMENT *element; + ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *); + DBUG_ENTER("get_statistic"); + + reclength= mrg->file[0]->s->base.reclength; + null_bytes= mrg->file[0]->s->base.null_bytes; + record=(uchar*) my_alloca(reclength); + end_count=huff_counts+mrg->file[0]->s->base.fields; + record_count=0; glob_crc=0; + max_blob_length=0; + + /* Check how to calculate checksum */ + if (mrg->file[0]->s->data_file_type == STATIC_RECORD) + calc_checksum= _ma_static_checksum; + else + calc_checksum= _ma_checksum; + + mrg_reset(mrg); + while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE) + { + ulong tot_blob_length=0; + if (! error) + { + /* glob_crc is a checksum over all bytes of all records. */ + glob_crc+= (*calc_checksum)(mrg->file[0],record); + + /* Count the incidence of values separately for every column. */ + for (pos=record + null_bytes, count=huff_counts ; + count < end_count ; + count++, + pos=next_pos) + { + next_pos=end_pos=(start_pos=pos)+count->field_length; + + /* + Put the whole column value in a tree if there is room for it. + 'int_tree' is used to quickly check for duplicate values. + 'tree_buff' collects as many distinct column values as + possible. If the field length is > 1, it is tree_buff_length, + else 2 bytes. Each value is 'field_length' bytes big. If there + are more distinct column values than fit into the buffer, we + give up with this tree. BLOBs and VARCHARs do not have a + tree_buff as it can only be used with fixed length columns. + For the special case of field length == 1, we handle only the + case that there is only one distinct value in the table(s). + Otherwise, we can have a maximum of 256 distinct values. This + is then handled by the normal Huffman tree build. + + Another limit for collecting distinct column values is the + number of values itself. 
Since we would need to build a + Huffman tree for the values, we are limited by the 'IS_OFFSET' + constant. This constant expresses a bit which is used to + determine if a tree element holds a final value or an offset + to a child element. Hence, all values and offsets need to be + smaller than 'IS_OFFSET'. A tree element is implemented with + two integer values, one for the left branch and one for the + right branch. For the extreme case that the first element + points to the last element, the number of integers in the tree + must be less or equal to IS_OFFSET. So the number of elements + must be less or equal to IS_OFFSET / 2. + + WARNING: At first, we insert a pointer into the record buffer + as the key for the tree. If we got a new distinct value, which + is really inserted into the tree, instead of being counted + only, we will copy the column value from the record buffer to + 'tree_buff' and adjust the key pointer of the tree accordingly. + */ + if (count->tree_buff) + { + global_count=count; + if (!(element=tree_insert(&count->int_tree,pos, 0, + count->int_tree.custom_arg)) || + (element->count == 1 && + (count->tree_buff + tree_buff_length < + count->tree_pos + count->field_length)) || + (count->int_tree.elements_in_tree > IS_OFFSET / 2) || + (count->field_length == 1 && + count->int_tree.elements_in_tree > 1)) + { + delete_tree(&count->int_tree); + my_free(count->tree_buff,MYF(0)); + count->tree_buff=0; + } + else + { + /* + If tree_insert() succeeds, it either creates a new element + or increments the counter of an existing element. + */ + if (element->count == 1) + { + /* Copy the new column value into 'tree_buff'. */ + memcpy(count->tree_pos,pos,(size_t) count->field_length); + /* Adjust the key pointer in the tree. */ + tree_set_pointer(element,count->tree_pos); + /* Point behind the last column value so far. 
*/ + count->tree_pos+=count->field_length; + } + } + } + + /* Save character counters and space-counts and zero-field-counts */ + if (count->field_type == FIELD_NORMAL || + count->field_type == FIELD_SKIP_ENDSPACE) + { + /* Ignore trailing space. */ + for ( ; end_pos > pos ; end_pos--) + if (end_pos[-1] != ' ') + break; + /* Empty fields are just counted. Go to the next column. */ + if (end_pos == pos) + { + count->empty_fields++; + count->max_zero_fill=0; + continue; + } + /* + Count the total of all trailing spaces and the number of + short trailing spaces. Remember the longest trailing space. + */ + length= (uint) (next_pos-end_pos); + count->tot_end_space+=length; + if (length < 8) + count->end_space[length]++; + if (count->max_end_space < length) + count->max_end_space = length; + } + + if (count->field_type == FIELD_NORMAL || + count->field_type == FIELD_SKIP_PRESPACE) + { + /* Ignore leading space. */ + for (pos=start_pos; pos < end_pos ; pos++) + if (pos[0] != ' ') + break; + /* Empty fields are just counted. Go to the next column. */ + if (end_pos == pos) + { + count->empty_fields++; + count->max_zero_fill=0; + continue; + } + /* + Count the total of all leading spaces and the number of + short leading spaces. Remember the longest leading space. + */ + length= (uint) (pos-start_pos); + count->tot_pre_space+=length; + if (length < 8) + count->pre_space[length]++; + if (count->max_pre_space < length) + count->max_pre_space = length; + } + + /* Calculate pos, end_pos, and max_length for variable length fields. 
*/ + if (count->field_type == FIELD_BLOB) + { + uint field_length=count->field_length -portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(field_length, start_pos); + memcpy_fixed((char*) &pos, start_pos+field_length,sizeof(char*)); + end_pos=pos+blob_length; + tot_blob_length+=blob_length; + set_if_bigger(count->max_length,blob_length); + } + else if (count->field_type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1); + length= (pack_length == 1 ? (uint) *(uchar*) start_pos : + uint2korr(start_pos)); + pos= start_pos+pack_length; + end_pos= pos+length; + set_if_bigger(count->max_length,length); + } + + /* Evaluate 'max_zero_fill' for short fields. */ + if (count->field_length <= 8 && + (count->field_type == FIELD_NORMAL || + count->field_type == FIELD_SKIP_ZERO)) + { + uint i; + /* Zero fields are just counted. Go to the next column. */ + if (!memcmp(start_pos, zero_string, count->field_length)) + { + count->zero_fields++; + continue; + } + /* + max_zero_fill starts with field_length. It is decreased every + time a shorter "zero trailer" is found. It is set to zero when + an empty field is found (see above). This suggests that the + variable should be called 'min_zero_fill'. + */ + for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ; + i++) ; + if (i < count->max_zero_fill) + count->max_zero_fill=i; + } + + /* Ignore zero fields and check fields. */ + if (count->field_type == FIELD_ZERO || + count->field_type == FIELD_CHECK) + continue; + + /* + Count the incidence of every uchar value in the + significant field value. + */ + for ( ; pos < end_pos ; pos++) + count->counts[(uchar) *pos]++; + + /* Step to next field. 
*/ + } + + if (tot_blob_length > max_blob_length) + max_blob_length=tot_blob_length; + record_count++; + if (write_loop && record_count % WRITE_COUNT == 0) + { + VOID(printf("%lu\r", (ulong) record_count)); + VOID(fflush(stdout)); + } + } + else if (error != HA_ERR_RECORD_DELETED) + { + VOID(fprintf(stderr, "Got error %d while reading rows\n", error)); + break; + } + + /* Step to next record. */ + } + if (write_loop) + { + VOID(printf(" \r")); + VOID(fflush(stdout)); + } + + /* + If --debug=d,fakebigcodes is set, fake the counts to get big Huffman + codes. + */ + DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count);); + + DBUG_PRINT("info", ("Found the following number of incidents " + "of the uchar codes:")); + if (verbose >= 2) + VOID(printf("Found the following number of incidents " + "of the uchar codes:\n")); + for (count= huff_counts ; count < end_count; count++) + { + uint idx; + my_off_t total_count; + char llbuf[32]; + + DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1))); + if (verbose >= 2) + VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1))); + if (count->tree_buff) + { + DBUG_PRINT("info", ("number of distinct values: %u", + (uint) ((count->tree_pos - count->tree_buff) / + count->field_length))); + if (verbose >= 2) + VOID(printf("number of distinct values: %u\n", + (uint) ((count->tree_pos - count->tree_buff) / + count->field_length))); + } + total_count= 0; + for (idx= 0; idx < 256; idx++) + { + if (count->counts[idx]) + { + total_count+= count->counts[idx]; + DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx, + llstr((longlong) count->counts[idx], llbuf))); + if (verbose >= 2) + VOID(printf("counts[0x%02x]: %12s\n", idx, + llstr((longlong) count->counts[idx], llbuf))); + } + } + DBUG_PRINT("info", ("total: %12s", llstr((longlong) total_count, + llbuf))); + if ((verbose >= 2) && total_count) + { + VOID(printf("total: %12s\n", + llstr((longlong) total_count, llbuf))); + } + } + + mrg->records=record_count; 
+ mrg->max_blob_length=max_blob_length; + my_afree(record); + DBUG_RETURN(error != HA_ERR_END_OF_FILE); +} + +static int compare_huff_elements(void *not_used __attribute__((unused)), + uchar *a, uchar *b) +{ + return *((my_off_t*) a) < *((my_off_t*) b) ? -1 : + (*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1); +} + + /* Check each tree if we should use pre-space-compress, end-space- + compress, empty-field-compress or zero-field-compress */ + +static void check_counts(HUFF_COUNTS *huff_counts, uint trees, + my_off_t records) +{ + uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count]; + my_off_t old_length,new_length,length; + DBUG_ENTER("check_counts"); + + bzero((uchar*) field_count,sizeof(field_count)); + space_fields=fill_zero_fields=0; + + for (; trees-- ; huff_counts++) + { + if (huff_counts->field_type == FIELD_BLOB) + { + huff_counts->length_bits=max_bit(huff_counts->max_length); + goto found_pack; + } + else if (huff_counts->field_type == FIELD_VARCHAR) + { + huff_counts->length_bits=max_bit(huff_counts->max_length); + goto found_pack; + } + else if (huff_counts->field_type == FIELD_CHECK) + { + huff_counts->bytes_packed=0; + huff_counts->counts[0]=0; + goto found_pack; + } + + huff_counts->field_type=FIELD_NORMAL; + huff_counts->pack_type=0; + + /* Check for zero-filled records (in this column), or zero records. */ + if (huff_counts->zero_fields || ! records) + { + my_off_t old_space_count; + /* + If there are only zero filled records (in this column), + or no records at all, we are done. + */ + if (huff_counts->zero_fields == records) + { + huff_counts->field_type= FIELD_ZERO; + huff_counts->bytes_packed=0; + huff_counts->counts[0]=0; + goto found_pack; + } + /* Remeber the number of significant spaces. */ + old_space_count=huff_counts->counts[' ']; + /* Add all leading and trailing spaces. 
*/ + huff_counts->counts[' ']+= (huff_counts->tot_end_space + + huff_counts->tot_pre_space + + huff_counts->empty_fields * + huff_counts->field_length); + /* Check, what the compressed length of this would be. */ + old_length=calc_packed_length(huff_counts,0)+records/8; + /* Get the number of zero bytes. */ + length=huff_counts->zero_fields*huff_counts->field_length; + /* Add it to the counts. */ + huff_counts->counts[0]+=length; + /* Check, what the compressed length of this would be. */ + new_length=calc_packed_length(huff_counts,0); + /* If the compression without the zeroes would be shorter, we are done. */ + if (old_length < new_length && huff_counts->field_length > 1) + { + huff_counts->field_type=FIELD_SKIP_ZERO; + huff_counts->counts[0]-=length; + huff_counts->bytes_packed=old_length- records/8; + goto found_pack; + } + /* Remove the insignificant spaces, but keep the zeroes. */ + huff_counts->counts[' ']=old_space_count; + } + /* Check, what the compressed length of this column would be. */ + huff_counts->bytes_packed=calc_packed_length(huff_counts,0); + + /* + If there are enough empty records (in this column), + treating them specially may pay off. + */ + if (huff_counts->empty_fields) + { + if (huff_counts->field_length > 2 && + huff_counts->empty_fields + (records - huff_counts->empty_fields)* + (1+max_bit(max(huff_counts->max_pre_space, + huff_counts->max_end_space))) < + records * max_bit(huff_counts->field_length)) + { + huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS; + } + else + { + length=huff_counts->empty_fields*huff_counts->field_length; + if (huff_counts->tot_end_space || ! 
huff_counts->tot_pre_space) + { + huff_counts->tot_end_space+=length; + huff_counts->max_end_space=huff_counts->field_length; + if (huff_counts->field_length < 8) + huff_counts->end_space[huff_counts->field_length]+= + huff_counts->empty_fields; + } + if (huff_counts->tot_pre_space) + { + huff_counts->tot_pre_space+=length; + huff_counts->max_pre_space=huff_counts->field_length; + if (huff_counts->field_length < 8) + huff_counts->pre_space[huff_counts->field_length]+= + huff_counts->empty_fields; + } + } + } + + /* + If there are enough trailing spaces (in this column), + treating them specially may pay off. + */ + if (huff_counts->tot_end_space) + { + huff_counts->counts[' ']+=huff_counts->tot_pre_space; + if (test_space_compress(huff_counts,records,huff_counts->max_end_space, + huff_counts->end_space, + huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE)) + goto found_pack; + huff_counts->counts[' ']-=huff_counts->tot_pre_space; + } + + /* + If there are enough leading spaces (in this column), + treating them specially may pay off. + */ + if (huff_counts->tot_pre_space) + { + if (test_space_compress(huff_counts,records,huff_counts->max_pre_space, + huff_counts->pre_space, + huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE)) + goto found_pack; + } + + found_pack: /* Found field-packing */ + + /* Test if we can use zero-fill */ + + if (huff_counts->max_zero_fill && + (huff_counts->field_type == FIELD_NORMAL || + huff_counts->field_type == FIELD_SKIP_ZERO)) + { + huff_counts->counts[0]-=huff_counts->max_zero_fill* + (huff_counts->field_type == FIELD_SKIP_ZERO ? 
+ records - huff_counts->zero_fields : records); + huff_counts->pack_type|=PACK_TYPE_ZERO_FILL; + huff_counts->bytes_packed=calc_packed_length(huff_counts,0); + } + + /* Test if intervall-field is better */ + + if (huff_counts->tree_buff) + { + HUFF_TREE tree; + + DBUG_EXECUTE_IF("forceintervall", + huff_counts->bytes_packed= ~ (my_off_t) 0;); + tree.element_buffer=0; + if (!make_huff_tree(&tree,huff_counts) && + tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed) + { + if (tree.elements == 1) + huff_counts->field_type=FIELD_CONSTANT; + else + huff_counts->field_type=FIELD_INTERVALL; + huff_counts->pack_type=0; + } + else + { + my_free(huff_counts->tree_buff,MYF(0)); + delete_tree(&huff_counts->int_tree); + huff_counts->tree_buff=0; + } + if (tree.element_buffer) + my_free(tree.element_buffer,MYF(0)); + } + if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS) + space_fields++; + if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL) + fill_zero_fields++; + field_count[huff_counts->field_type]++; + } + DBUG_PRINT("info", ("normal: %3d empty-space: %3d " + "empty-zero: %3d empty-fill: %3d", + field_count[FIELD_NORMAL],space_fields, + field_count[FIELD_SKIP_ZERO],fill_zero_fields)); + DBUG_PRINT("info", ("pre-space: %3d end-space: %3d " + "intervall-fields: %3d zero: %3d", + field_count[FIELD_SKIP_PRESPACE], + field_count[FIELD_SKIP_ENDSPACE], + field_count[FIELD_INTERVALL], + field_count[FIELD_ZERO])); + if (verbose) + VOID(printf("\nnormal: %3d empty-space: %3d " + "empty-zero: %3d empty-fill: %3d\n" + "pre-space: %3d end-space: %3d " + "intervall-fields: %3d zero: %3d\n", + field_count[FIELD_NORMAL],space_fields, + field_count[FIELD_SKIP_ZERO],fill_zero_fields, + field_count[FIELD_SKIP_PRESPACE], + field_count[FIELD_SKIP_ENDSPACE], + field_count[FIELD_INTERVALL], + field_count[FIELD_ZERO])); + DBUG_VOID_RETURN; +} + + +/* Test if we can use space-compression and empty-field-compression */ + +static int +test_space_compress(HUFF_COUNTS *huff_counts, 
my_off_t records, + uint max_space_length, my_off_t *space_counts, + my_off_t tot_space_count, enum en_fieldtype field_type) +{ + int min_pos; + uint length_bits,i; + my_off_t space_count,min_space_count,min_pack,new_length,skip; + + length_bits=max_bit(max_space_length); + + /* Default no end_space-packing */ + space_count=huff_counts->counts[(uint) ' ']; + min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count); + min_pack=calc_packed_length(huff_counts,0); + min_pos= -2; + huff_counts->counts[(uint) ' ']=space_count; + + /* Test with allways space-count */ + new_length=huff_counts->bytes_packed+length_bits*records/8; + if (new_length+1 < min_pack) + { + min_pos= -1; + min_pack=new_length; + min_space_count=space_count; + } + /* Test with length-flag */ + for (skip=0L, i=0 ; i < 8 ; i++) + { + if (space_counts[i]) + { + if (i) + huff_counts->counts[(uint) ' ']+=space_counts[i]; + skip+=huff_counts->pre_space[i]; + new_length=calc_packed_length(huff_counts,0)+ + (records+(records-skip)*(1+length_bits))/8; + if (new_length < min_pack) + { + min_pos=(int) i; + min_pack=new_length; + min_space_count=huff_counts->counts[(uint) ' ']; + } + } + } + + huff_counts->counts[(uint) ' ']=min_space_count; + huff_counts->bytes_packed=min_pack; + switch (min_pos) { + case -2: + return(0); /* No space-compress */ + case -1: /* Always space-count */ + huff_counts->field_type=field_type; + huff_counts->min_space=0; + huff_counts->length_bits=max_bit(max_space_length); + break; + default: + huff_counts->field_type=field_type; + huff_counts->min_space=(uint) min_pos; + huff_counts->pack_type|=PACK_TYPE_SELECTED; + huff_counts->length_bits=max_bit(max_space_length); + break; + } + return(1); /* Using space-compress */ +} + + + /* Make a huff_tree of each huff_count */ + +static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees) +{ + uint tree; + HUFF_TREE *huff_tree; + DBUG_ENTER("make_huff_trees"); + + if (!(huff_tree=(HUFF_TREE*) 
my_malloc(trees*sizeof(HUFF_TREE), + MYF(MY_WME | MY_ZEROFILL)))) + DBUG_RETURN(0); + + for (tree=0 ; tree < trees ; tree++) + { + if (make_huff_tree(huff_tree+tree,huff_counts+tree)) + { + while (tree--) + my_free(huff_tree[tree].element_buffer,MYF(0)); + my_free(huff_tree,MYF(0)); + DBUG_RETURN(0); + } + } + DBUG_RETURN(huff_tree); +} + +/* + Build a Huffman tree. + + SYNOPSIS + make_huff_tree() + huff_tree The Huffman tree. + huff_counts The counts. + + DESCRIPTION + Build a Huffman tree according to huff_counts->counts or + huff_counts->tree_buff. tree_buff, if non-NULL contains up to + tree_buff_length of distinct column values. In that case, whole + values can be Huffman encoded instead of single bytes. + + RETURN + 0 OK + != 0 Error +*/ + +static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts) +{ + uint i,found,bits_packed,first,last; + my_off_t bytes_packed; + HUFF_ELEMENT *a,*b,*new_huff_el; + + first=last=0; + if (huff_counts->tree_buff) + { + /* Calculate the number of distinct values in tree_buff. */ + found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) / + huff_counts->field_length; + first=0; last=found-1; + } + else + { + /* Count the number of uchar codes found in the column. */ + for (i=found=0 ; i < 256 ; i++) + { + if (huff_counts->counts[i]) + { + if (! found++) + first=i; + last=i; + } + } + if (found < 2) + found=2; + } + + /* When using 'tree_buff' we can have more that 256 values. */ + if (queue.max_elements < found) + { + delete_queue(&queue); + if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0)) + return -1; + } + + /* Allocate or reallocate an element buffer for the Huffman tree. 
*/ + if (!huff_tree->element_buffer) + { + if (!(huff_tree->element_buffer= + (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME)))) + return 1; + } + else + { + HUFF_ELEMENT *temp; + if (!(temp= + (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer, + found*2*sizeof(HUFF_ELEMENT), + MYF(MY_WME)))) + return 1; + huff_tree->element_buffer=temp; + } + + huff_counts->tree=huff_tree; + huff_tree->counts=huff_counts; + huff_tree->min_chr=first; + huff_tree->max_chr=last; + huff_tree->char_bits=max_bit(last-first); + huff_tree->offset_bits=max_bit(found-1)+1; + + if (huff_counts->tree_buff) + { + huff_tree->elements=0; + huff_tree->tree_pack_length=(1+15+16+5+5+ + (huff_tree->char_bits+1)*found+ + (huff_tree->offset_bits+1)* + (found-2)+7)/8 + + (uint) (huff_tree->counts->tree_pos- + huff_tree->counts->tree_buff); + /* + Put a HUFF_ELEMENT into the queue for every distinct column value. + + tree_walk() calls save_counts_in_queue() for every element in + 'int_tree'. This takes elements from the target trees element + buffer and places references to them into the buffer of the + priority queue. We insert in column value order, but the order is + in fact irrelevant here. We will establish the correct order + later. + */ + tree_walk(&huff_counts->int_tree, + (int (*)(void*, element_count,void*)) save_counts_in_queue, + (uchar*) huff_tree, left_root_right); + } + else + { + huff_tree->elements=found; + huff_tree->tree_pack_length=(9+9+5+5+ + (huff_tree->char_bits+1)*found+ + (huff_tree->offset_bits+1)* + (found-2)+7)/8; + /* + Put a HUFF_ELEMENT into the queue for every uchar code found in the column. + + The elements are taken from the target trees element buffer. + Instead of using queue_insert(), we just place references to the + elements into the buffer of the priority queue. We insert in byte + value order, but the order is in fact irrelevant here. We will + establish the correct order later. 
+ */ + for (i=first, found=0 ; i <= last ; i++) + { + if (huff_counts->counts[i]) + { + new_huff_el=huff_tree->element_buffer+(found++); + new_huff_el->count=huff_counts->counts[i]; + new_huff_el->a.leaf.null=0; + new_huff_el->a.leaf.element_nr=i; + queue.root[found]=(uchar*) new_huff_el; + } + } + /* + If there is only a single uchar value in this field in all records, + add a second element with zero incidence. This is required to enter + the loop, which builds the Huffman tree. + */ + while (found < 2) + { + new_huff_el=huff_tree->element_buffer+(found++); + new_huff_el->count=0; + new_huff_el->a.leaf.null=0; + if (last) + new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1; + else + new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1; + queue.root[found]=(uchar*) new_huff_el; + } + } + + /* Make a queue from the queue buffer. */ + queue.elements=found; + + /* + Make a priority queue from the queue. Construct its index so that we + have a partially ordered tree. + */ + queue_fix(&queue); + + /* The Huffman algorithm. */ + bytes_packed=0; bits_packed=0; + for (i=1 ; i < found ; i++) + { + /* + Pop the top element from the queue (the one with the least incidence). + Popping from a priority queue includes a re-ordering of the queue, + to get the next least incidence element to the top. + */ + a=(HUFF_ELEMENT*) queue_remove_top(&queue); + /* Copy the next least incidence element */ + b=(HUFF_ELEMENT*) queue_top(&queue); + /* Get a new element from the element buffer. */ + new_huff_el=huff_tree->element_buffer+found+i; + /* The new element gets the sum of the two least incidence elements. */ + new_huff_el->count=a->count+b->count; + /* + The Huffman algorithm assigns another bit to the code for a byte + every time that bytes incidence is combined (directly or indirectly) + to a new element as one of the two least incidence elements. + This means that one more bit per incidence of that uchar is required + in the resulting file. 
So we add the new combined incidence as the + number of bits by which the result grows. + */ + bits_packed+=(uint) (new_huff_el->count & 7); + bytes_packed+=new_huff_el->count/8; + /* The new element points to its children, lesser in left. */ + new_huff_el->a.nod.left=a; + new_huff_el->a.nod.right=b; + /* + Replace the copied top element by the new element and re-order the + queue. + */ + queue_top(&queue)= (uchar*) new_huff_el; + queue_replace_top(&queue); + } + huff_tree->root=(HUFF_ELEMENT*) queue.root[1]; + huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8; + return 0; +} + +static int compare_tree(void* cmp_arg __attribute__((unused)), + register const uchar *s, register const uchar *t) +{ + uint length; + for (length=global_count->field_length; length-- ;) + if (*s++ != *t++) + return (int) s[-1] - (int) t[-1]; + return 0; +} + +/* + Organize distinct column values and their incidences into a priority queue. + + SYNOPSIS + save_counts_in_queue() + key The column value. + count The incidence of this value. + tree The Huffman tree to be built later. + + DESCRIPTION + We use the element buffer of the targeted tree. The distinct column + values are organized in a priority queue first. The Huffman + algorithm will later organize the elements into a Huffman tree. For + the time being, we just place references to the elements into the + queue buffer. The buffer will later be organized into a priority + queue. + + RETURN + 0 + */ + +static int save_counts_in_queue(uchar *key, element_count count, + HUFF_TREE *tree) +{ + HUFF_ELEMENT *new_huff_el; + + new_huff_el=tree->element_buffer+(tree->elements++); + new_huff_el->count=count; + new_huff_el->a.leaf.null=0; + new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) / + tree->counts->field_length; + queue.root[tree->elements]=(uchar*) new_huff_el; + return 0; +} + + +/* + Calculate length of file if given counts should be used. 
+ + SYNOPSIS + calc_packed_length() + huff_counts The counts for a column of the table(s). + add_tree_lenght If the decode tree length should be added. + + DESCRIPTION + We need to follow the Huffman algorithm until we know, how many bits + are required for each uchar code. But we do not need the resulting + Huffman tree. Hence, we can leave out some steps which are essential + in make_huff_tree(). + + RETURN + Number of bytes required to compress this table column. +*/ + +static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts, + uint add_tree_lenght) +{ + uint i,found,bits_packed,first,last; + my_off_t bytes_packed; + HUFF_ELEMENT element_buffer[256]; + DBUG_ENTER("calc_packed_length"); + + /* + WARNING: We use a small hack for efficiency: Instead of placing + references to HUFF_ELEMENTs into the queue, we just insert + references to the counts of the uchar codes which appeared in this + table column. During the Huffman algorithm they are successively + replaced by references to HUFF_ELEMENTs. This works, because + HUFF_ELEMENTs have the incidence count at their beginning. + Regardless, wether the queue array contains references to counts of + type my_off_t or references to HUFF_ELEMENTs which have the count of + type my_off_t at their beginning, it always points to a count of the + same type. + + Instead of using queue_insert(), we just copy the references into + the buffer of the priority queue. We insert in uchar value order, but + the order is in fact irrelevant here. We will establish the correct + order later. + */ + first=last=0; + for (i=found=0 ; i < 256 ; i++) + { + if (huff_counts->counts[i]) + { + if (! found++) + first=i; + last=i; + /* We start with root[1], which is the queues top element. */ + queue.root[found]=(uchar*) &huff_counts->counts[i]; + } + } + if (!found) + DBUG_RETURN(0); /* Empty tree */ + /* + If there is only a single uchar value in this field in all records, + add a second element with zero incidence. 
This is required to enter + the loop, which follows the Huffman algorithm. + */ + if (found < 2) + queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1]; + + /* Make a queue from the queue buffer. */ + queue.elements=found; + + bytes_packed=0; bits_packed=0; + /* Add the length of the coding table, which would become part of the file. */ + if (add_tree_lenght) + bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+ + (max_bit(found-1)+1+1)*(found-2) +7)/8; + + /* + Make a priority queue from the queue. Construct its index so that we + have a partially ordered tree. + */ + queue_fix(&queue); + + /* The Huffman algorithm. */ + for (i=0 ; i < found-1 ; i++) + { + my_off_t *a; + my_off_t *b; + HUFF_ELEMENT *new_huff_el; + + /* + Pop the top element from the queue (the one with the least + incidence). Popping from a priority queue includes a re-ordering + of the queue, to get the next least incidence element to the top. + */ + a= (my_off_t*) queue_remove_top(&queue); + /* Copy the next least incidence element. */ + b= (my_off_t*) queue_top(&queue); + /* Create a new element in a local (automatic) buffer. */ + new_huff_el= element_buffer + i; + /* The new element gets the sum of the two least incidence elements. */ + new_huff_el->count= *a + *b; + /* + The Huffman algorithm assigns another bit to the code for a byte + every time that bytes incidence is combined (directly or indirectly) + to a new element as one of the two least incidence elements. + This means that one more bit per incidence of that uchar is required + in the resulting file. So we add the new combined incidence as the + number of bits by which the result grows. + */ + bits_packed+=(uint) (new_huff_el->count & 7); + bytes_packed+=new_huff_el->count/8; + /* + Replace the copied top element by the new element and re-order the + queue. This successively replaces the references to counts by + references to HUFF_ELEMENTs. 
+ */ + queue_top(&queue)= (uchar*) new_huff_el; + queue_replace_top(&queue); + } + DBUG_RETURN(bytes_packed+(bits_packed+7)/8); +} + + + /* Remove trees that don't give any compression */ + +static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees) +{ + uint k,tree_number; + HUFF_COUNTS count,*i,*j,*last_count; + + last_count=huff_counts+trees; + for (tree_number=0, i=huff_counts ; i < last_count ; i++) + { + if (!i->tree->tree_number) + { + i->tree->tree_number= ++tree_number; + if (i->tree_buff) + continue; /* Don't join intervall */ + for (j=i+1 ; j < last_count ; j++) + { + if (! j->tree->tree_number && ! j->tree_buff) + { + for (k=0 ; k < 256 ; k++) + count.counts[k]=i->counts[k]+j->counts[k]; + if (calc_packed_length(&count,1) <= + i->tree->bytes_packed + j->tree->bytes_packed+ + i->tree->tree_pack_length+j->tree->tree_pack_length+ + ALLOWED_JOIN_DIFF) + { + memcpy_fixed((uchar*) i->counts,(uchar*) count.counts, + sizeof(count.counts[0])*256); + my_free((uchar*) j->tree->element_buffer,MYF(0)); + j->tree->element_buffer=0; + j->tree=i->tree; + bmove((uchar*) i->counts,(uchar*) count.counts, + sizeof(count.counts[0])*256); + if (make_huff_tree(i->tree,i)) + return (uint) -1; + } + } + } + } + } + DBUG_PRINT("info", ("Original trees: %d After join: %d", + trees, tree_number)); + if (verbose) + VOID(printf("Original trees: %d After join: %d\n", trees, tree_number)); + return tree_number; /* Return trees left */ +} + + +/* + Fill in huff_tree encode tables. + + SYNOPSIS + make_huff_decode_table() + huff_tree An array of HUFF_TREE which are to be encoded. + trees The number of HUFF_TREE in the array. + + RETURN + 0 success + != 0 error +*/ + +static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees) +{ + uint elements; + for ( ; trees-- ; huff_tree++) + { + if (huff_tree->tree_number > 0) + { + elements=huff_tree->counts->tree_buff ? 
huff_tree->elements : 256; + if (!(huff_tree->code = + (ulonglong*) my_malloc(elements* + (sizeof(ulonglong) + sizeof(uchar)), + MYF(MY_WME | MY_ZEROFILL)))) + return 1; + huff_tree->code_len=(uchar*) (huff_tree->code+elements); + make_traverse_code_tree(huff_tree, huff_tree->root, + 8 * sizeof(ulonglong), LL(0)); + } + } + return 0; +} + + +static void make_traverse_code_tree(HUFF_TREE *huff_tree, + HUFF_ELEMENT *element, + uint size, ulonglong code) +{ + uint chr; + if (!element->a.leaf.null) + { + chr=element->a.leaf.element_nr; + huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size); + huff_tree->code[chr]= (code >> size); + if (huff_tree->height < 8 * sizeof(ulonglong) - size) + huff_tree->height= 8 * sizeof(ulonglong) - size; + } + else + { + size--; + make_traverse_code_tree(huff_tree,element->a.nod.left,size,code); + make_traverse_code_tree(huff_tree, element->a.nod.right, size, + code + (((ulonglong) 1) << size)); + } + return; +} + + +/* + Convert a value into binary digits. + + SYNOPSIS + bindigits() + value The value. + length The number of low order bits to convert. + + NOTE + The result string is in static storage. It is reused on every call. + So you cannot use it twice in one expression. + + RETURN + A pointer to a static NUL-terminated string. + */ + +static char *bindigits(ulonglong value, uint bits) +{ + static char digits[72]; + char *ptr= digits; + uint idx= bits; + + DBUG_ASSERT(idx < sizeof(digits)); + while (idx) + *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1); + *ptr= '\0'; + return digits; +} + + +/* + Convert a value into hexadecimal digits. + + SYNOPSIS + hexdigits() + value The value. + + NOTE + The result string is in static storage. It is reused on every call. + So you cannot use it twice in one expression. + + RETURN + A pointer to a static NUL-terminated string. 
+ */ + +static char *hexdigits(ulonglong value) +{ + static char digits[20]; + char *ptr= digits; + uint idx= 2 * sizeof(value); /* Two hex digits per byte. */ + + DBUG_ASSERT(idx < sizeof(digits)); + while (idx) + { + if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9') + *(ptr - 1)+= 'a' - '9' - 1; + } + *ptr= '\0'; + return digits; +} + + + /* Write header to new packed data file */ + +static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees, + my_off_t tot_elements,my_off_t filelength) +{ + uchar *buff= (uchar*) file_buffer.pos; + + bzero(buff,HEAD_LENGTH); + memcpy_fixed(buff,maria_pack_file_magic,4); + int4store(buff+4,head_length); + int4store(buff+8, mrg->min_pack_length); + int4store(buff+12,mrg->max_pack_length); + int4store(buff+16,tot_elements); + int4store(buff+20,intervall_length); + int2store(buff+24,trees); + buff[26]=(char) mrg->ref_length; + /* Save record pointer length */ + buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2); + if (test_only) + return 0; + VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0))); + return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH, + MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0; +} + + /* Write fieldinfo to new packed file */ + +static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees) +{ + reg1 uint i; + uint huff_tree_bits; + huff_tree_bits=max_bit(trees ? 
trees-1 : 0); + + DBUG_PRINT("info", (" ")); + DBUG_PRINT("info", ("column types:")); + DBUG_PRINT("info", ("FIELD_NORMAL 0")); + DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE 1")); + DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE 2")); + DBUG_PRINT("info", ("FIELD_SKIP_ZERO 3")); + DBUG_PRINT("info", ("FIELD_BLOB 4")); + DBUG_PRINT("info", ("FIELD_CONSTANT 5")); + DBUG_PRINT("info", ("FIELD_INTERVALL 6")); + DBUG_PRINT("info", ("FIELD_ZERO 7")); + DBUG_PRINT("info", ("FIELD_VARCHAR 8")); + DBUG_PRINT("info", ("FIELD_CHECK 9")); + DBUG_PRINT("info", (" ")); + DBUG_PRINT("info", ("pack type as a set of flags:")); + DBUG_PRINT("info", ("PACK_TYPE_SELECTED 1")); + DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS 2")); + DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL 4")); + DBUG_PRINT("info", (" ")); + if (verbose >= 2) + { + VOID(printf("\n")); + VOID(printf("column types:\n")); + VOID(printf("FIELD_NORMAL 0\n")); + VOID(printf("FIELD_SKIP_ENDSPACE 1\n")); + VOID(printf("FIELD_SKIP_PRESPACE 2\n")); + VOID(printf("FIELD_SKIP_ZERO 3\n")); + VOID(printf("FIELD_BLOB 4\n")); + VOID(printf("FIELD_CONSTANT 5\n")); + VOID(printf("FIELD_INTERVALL 6\n")); + VOID(printf("FIELD_ZERO 7\n")); + VOID(printf("FIELD_VARCHAR 8\n")); + VOID(printf("FIELD_CHECK 9\n")); + VOID(printf("\n")); + VOID(printf("pack type as a set of flags:\n")); + VOID(printf("PACK_TYPE_SELECTED 1\n")); + VOID(printf("PACK_TYPE_SPACE_FIELDS 2\n")); + VOID(printf("PACK_TYPE_ZERO_FILL 4\n")); + VOID(printf("\n")); + } + for (i=0 ; i++ < fields ; counts++) + { + write_bits((ulonglong) (int) counts->field_type, 5); + write_bits(counts->pack_type,6); + if (counts->pack_type & PACK_TYPE_ZERO_FILL) + write_bits(counts->max_zero_fill,5); + else + write_bits(counts->length_bits,5); + write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits); + DBUG_PRINT("info", ("column: %3u type: %2u pack: %2u zero: %4u " + "lbits: %2u tree: %2u length: %4u", + i , counts->field_type, counts->pack_type, + counts->max_zero_fill, 
counts->length_bits, + counts->tree->tree_number, counts->field_length)); + if (verbose >= 2) + VOID(printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u " + "tree: %2u length: %4u\n", i , counts->field_type, + counts->pack_type, counts->max_zero_fill, counts->length_bits, + counts->tree->tree_number, counts->field_length)); + } + flush_bits(); + return; +} + + /* Write all huff_trees to new datafile. Return tot count of + elements in all trees + Returns 0 on error */ + +static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees) +{ + uint i,int_length; + uint tree_no; + uint codes; + uint errors= 0; + uint *packed_tree,*offset,length; + my_off_t elements; + + /* Find the highest number of elements in the trees. */ + for (i=length=0 ; i < trees ; i++) + if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length) + length=huff_tree[i].elements; + /* + Allocate a buffer for packing a decode tree. Two numbers per element + (left child and right child). + */ + if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2))) + { + my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2); + return 0; + } + + DBUG_PRINT("info", (" ")); + if (verbose >= 2) + VOID(printf("\n")); + tree_no= 0; + intervall_length=0; + for (elements=0; trees-- ; huff_tree++) + { + /* Skip columns that have been joined with other columns. */ + if (huff_tree->tree_number == 0) + continue; /* Deleted tree */ + tree_no++; + DBUG_PRINT("info", (" ")); + if (verbose >= 3) + VOID(printf("\n")); + /* Count the total number of elements (byte codes or column values). */ + elements+=huff_tree->elements; + huff_tree->max_offset=2; + /* Build a tree of offsets and codes for decoding in 'packed_tree'. */ + if (huff_tree->elements <= 1) + offset=packed_tree; + else + offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree); + + /* This should be the same as 'length' above. 
*/ + huff_tree->offset_bits=max_bit(huff_tree->max_offset); + + /* + Since we check this during collecting the distinct column values, + this should never happen. + */ + if (huff_tree->max_offset >= IS_OFFSET) + { /* This should be impossible */ + VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n", + huff_tree->max_offset)); + my_afree(packed_tree); + return 0; + } + + DBUG_PRINT("info", ("pos: %lu elements: %u tree-elements: %lu " + "char_bits: %u\n", + (ulong) (file_buffer.pos - file_buffer.buffer), + huff_tree->elements, (ulong) (offset - packed_tree), + huff_tree->char_bits)); + if (!huff_tree->counts->tree_buff) + { + /* We do a uchar compression on this column. Mark with bit 0. */ + write_bits(0,1); + write_bits(huff_tree->min_chr,8); + write_bits(huff_tree->elements,9); + write_bits(huff_tree->char_bits,5); + write_bits(huff_tree->offset_bits,5); + int_length=0; + } + else + { + int_length=(uint) (huff_tree->counts->tree_pos - + huff_tree->counts->tree_buff); + /* We have distinct column values for this column. Mark with bit 1. */ + write_bits(1,1); + write_bits(huff_tree->elements,15); + write_bits(int_length,16); + write_bits(huff_tree->char_bits,5); + write_bits(huff_tree->offset_bits,5); + intervall_length+=int_length; + } + DBUG_PRINT("info", ("tree: %2u elements: %4u char_bits: %2u " + "offset_bits: %2u %s: %5u codelen: %2u", + tree_no, huff_tree->elements, huff_tree->char_bits, + huff_tree->offset_bits, huff_tree->counts->tree_buff ? + "bufflen" : "min_chr", huff_tree->counts->tree_buff ? + int_length : huff_tree->min_chr, huff_tree->height)); + if (verbose >= 2) + VOID(printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u " + "%s: %5u codelen: %2u\n", tree_no, huff_tree->elements, + huff_tree->char_bits, huff_tree->offset_bits, + huff_tree->counts->tree_buff ? "bufflen" : "min_chr", + huff_tree->counts->tree_buff ? 
int_length : + huff_tree->min_chr, huff_tree->height)); + + /* Check that the code tree length matches the element count. */ + length=(uint) (offset-packed_tree); + if (length != huff_tree->elements*2-2) + { + VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n", + length, huff_tree->elements * 2 - 2)); + errors++; + break; + } + + for (i=0 ; i < length ; i++) + { + if (packed_tree[i] & IS_OFFSET) + write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits), + huff_tree->offset_bits+1); + else + write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1); + DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x", + i, (packed_tree[i] & IS_OFFSET) ? + " -> " : "", (packed_tree[i] & IS_OFFSET) ? + packed_tree[i] - IS_OFFSET + i : packed_tree[i])); + if (verbose >= 3) + VOID(printf("tree[0x%04x]: %s0x%04x\n", + i, (packed_tree[i] & IS_OFFSET) ? " -> " : "", + (packed_tree[i] & IS_OFFSET) ? + packed_tree[i] - IS_OFFSET + i : packed_tree[i])); + } + flush_bits(); + + /* + Display coding tables and check their correctness. + */ + codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256; + for (i= 0; i < codes; i++) + { + ulonglong code; + uint bits; + uint len; + uint idx; + + if (! (len= huff_tree->code_len[i])) + continue; + DBUG_PRINT("info", ("code[0x%04x]: 0x%s bits: %2u bin: %s", i, + hexdigits(huff_tree->code[i]), huff_tree->code_len[i], + bindigits(huff_tree->code[i], + huff_tree->code_len[i]))); + if (verbose >= 3) + VOID(printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i, + hexdigits(huff_tree->code[i]), huff_tree->code_len[i], + bindigits(huff_tree->code[i], huff_tree->code_len[i]))); + + /* Check that the encode table decodes correctly. */ + code= 0; + bits= 0; + idx= 0; + DBUG_EXECUTE_IF("forcechkerr1", len--;); + DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code);); + DBUG_EXECUTE_IF("forcechkerr3", idx= length;); + for (;;) + { + if (! 
len) + { + VOID(fflush(stdout)); + VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n", + hexdigits(huff_tree->code[i]), huff_tree->code_len[i])); + errors++; + break; + } + code<<= 1; + code|= (huff_tree->code[i] >> (--len)) & 1; + bits++; + if (bits > 8 * sizeof(code)) + { + VOID(fflush(stdout)); + VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n", + bits, (uint) (8 * sizeof(code)))); + errors++; + break; + } + idx+= (uint) code & 1; + if (idx >= length) + { + VOID(fflush(stdout)); + VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n", + idx, length)); + errors++; + break; + } + if (packed_tree[idx] & IS_OFFSET) + idx+= packed_tree[idx] & ~IS_OFFSET; + else + break; /* Hit a leaf. This contains the result value. */ + } + if (errors) + break; + + DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;); + if (packed_tree[idx] != i) + { + VOID(fflush(stdout)); + VOID(fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n", + packed_tree[idx], i)); + errors++; + break; + } + } /*end for (codes)*/ + if (errors) + break; + + /* Write column values in case of distinct column value compression. */ + if (huff_tree->counts->tree_buff) + { + for (i=0 ; i < int_length ; i++) + { + write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8); + DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x", + i, (uchar) huff_tree->counts->tree_buff[i])); + if (verbose >= 3) + VOID(printf("column_values[0x%04x]: 0x%02x\n", + i, (uchar) huff_tree->counts->tree_buff[i])); + } + } + flush_bits(); + } + DBUG_PRINT("info", (" ")); + if (verbose >= 2) + VOID(printf("\n")); + my_afree(packed_tree); + if (errors) + { + VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n")); + return 0; + } + return elements; +} + + +static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element, + uint *offset) +{ + uint *prev_offset; + + prev_offset= offset; + /* + 'a.leaf.null' takes the same place as 'a.nod.left'. 
If this is null, + then there is no left child and, hence no right child either. This + is a property of a binary tree. An element is either a node with two + childs, or a leaf without childs. + + The current element is always a node with two childs. Go left first. + */ + if (!element->a.nod.left->a.leaf.null) + { + /* Store the uchar code or the index of the column value. */ + prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr; + offset+=2; + } + else + { + /* + Recursively traverse the tree to the left. Mark it as an offset to + another tree node (in contrast to a uchar code or column value index). + */ + prev_offset[0]= IS_OFFSET+2; + offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2); + } + + /* Now, check the right child. */ + if (!element->a.nod.right->a.leaf.null) + { + /* Store the uchar code or the index of the column value. */ + prev_offset[1]=element->a.nod.right->a.leaf.element_nr; + return offset; + } + else + { + /* + Recursively traverse the tree to the right. Mark it as an offset to + another tree node (in contrast to a uchar code or column value index). 
+ */ + uint temp=(uint) (offset-prev_offset-1); + prev_offset[1]= IS_OFFSET+ temp; + if (huff_tree->max_offset < temp) + huff_tree->max_offset = temp; + return make_offset_code_tree(huff_tree,element->a.nod.right,offset); + } +} + + /* Get number of bits neaded to represent value */ + +static uint max_bit(register uint value) +{ + reg2 uint power=1; + + while ((value>>=1)) + power++; + return (power); +} + + +static int compress_maria_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts) +{ + int error; + uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length; + uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes; + my_off_t record_count; + char llbuf[32]; + ulong length,pack_length; + uchar *record,*pos,*end_pos,*record_pos,*start_pos; + HUFF_COUNTS *count,*end_count; + HUFF_TREE *tree; + MARIA_HA *isam_file=mrg->file[0]; + uint pack_version= (uint) isam_file->s->pack.version; + DBUG_ENTER("compress_maria_file"); + + /* Allocate a buffer for the records (excluding blobs). */ + if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength))) + return -1; + + end_count=huff_counts+isam_file->s->base.fields; + min_record_length= (uint) ~0; + max_record_length=0; + null_bytes= isam_file->s->base.null_bytes; + + /* + Calculate the maximum number of bits required to pack the records. + Remember to understand 'max_zero_fill' as 'min_zero_fill'. + The tree height determines the maximum number of bits per value. + Some fields skip leading or trailing spaces or zeroes. The skipped + number of bytes is encoded by 'length_bits' bits. + Empty blobs and varchar are encoded with a single 1 bit. Other blobs + and varchar get a leading 0 bit. 
+ */ + max_calc_length= null_bytes; + for (i= 0 ; i < isam_file->s->base.fields ; i++) + { + if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL)) + huff_counts[i].max_zero_fill=0; + if (huff_counts[i].field_type == FIELD_CONSTANT || + huff_counts[i].field_type == FIELD_ZERO || + huff_counts[i].field_type == FIELD_CHECK) + continue; + if (huff_counts[i].field_type == FIELD_INTERVALL) + max_calc_length+=huff_counts[i].tree->height; + else if (huff_counts[i].field_type == FIELD_BLOB || + huff_counts[i].field_type == FIELD_VARCHAR) + max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1; + else + max_calc_length+= + (huff_counts[i].field_length - huff_counts[i].max_zero_fill)* + huff_counts[i].tree->height+huff_counts[i].length_bits; + } + max_calc_length= (max_calc_length + 7) / 8; + pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length); + record_count=0; + /* 'max_blob_length' is the max length of all blobs of a record. */ + pack_blob_length= isam_file->s->base.blobs ? + _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0; + max_pack_length=pack_ref_length+pack_blob_length; + + DBUG_PRINT("fields", ("===")); + mrg_reset(mrg); + while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE) + { + ulong tot_blob_length=0; + if (! 
error) + { + if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length + + null_bytes)) + break; + record_pos= file_buffer.pos; + file_buffer.pos+= max_pack_length; + if (null_bytes) + { + /* Copy null bits 'as is' */ + memcpy(file_buffer.pos, record, null_bytes); + file_buffer.pos+= null_bytes; + } + for (start_pos=record+null_bytes, count= huff_counts; + count < end_count ; + count++) + { + end_pos=start_pos+(field_length=count->field_length); + tree=count->tree; + + DBUG_PRINT("fields", ("column: %3lu type: %2u pack: %2u zero: %4u " + "lbits: %2u tree: %2u length: %4u", + (ulong) (count - huff_counts + 1), + count->field_type, + count->pack_type, count->max_zero_fill, + count->length_bits, count->tree->tree_number, + count->field_length)); + + /* Check if the column contains spaces only. */ + if (count->pack_type & PACK_TYPE_SPACE_FIELDS) + { + for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ; + if (pos == end_pos) + { + DBUG_PRINT("fields", + ("PACK_TYPE_SPACE_FIELDS spaces only, bits: 1")); + DBUG_PRINT("fields", ("---")); + write_bits(1,1); + start_pos=end_pos; + continue; + } + DBUG_PRINT("fields", + ("PACK_TYPE_SPACE_FIELDS not only spaces, bits: 1")); + write_bits(0,1); + } + end_pos-=count->max_zero_fill; + field_length-=count->max_zero_fill; + + switch (count->field_type) { + case FIELD_SKIP_ZERO: + if (!memcmp(start_pos, zero_string, field_length)) + { + DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits: 1")); + write_bits(1,1); + start_pos=end_pos; + break; + } + DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits: 1")); + write_bits(0,1); + /* Fall through */ + case FIELD_NORMAL: + DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes", + (ulong) (end_pos - start_pos))); + for ( ; start_pos < end_pos ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + 
bindigits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + break; + case FIELD_SKIP_ENDSPACE: + for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ; + length= (ulong) (end_pos - pos); + if (count->pack_type & PACK_TYPE_SELECTED) + { + if (length > count->min_space) + { + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE more than min_space, bits: 1")); + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(1,1); + write_bits(length,count->length_bits); + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE not more than min_space, " + "bits: 1")); + write_bits(0,1); + pos=end_pos; + } + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(length,count->length_bits); + } + /* Encode all significant bytes. 
*/ + DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes", + (ulong) (pos - start_pos))); + for ( ; start_pos < pos ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + bindigits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + start_pos=end_pos; + break; + case FIELD_SKIP_PRESPACE: + for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ; + length= (ulong) (pos - start_pos); + if (count->pack_type & PACK_TYPE_SELECTED) + { + if (length > count->min_space) + { + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE more than min_space, bits: 1")); + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(1,1); + write_bits(length,count->length_bits); + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE not more than min_space, " + "bits: 1")); + pos=start_pos; + write_bits(0,1); + } + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(length,count->length_bits); + } + /* Encode all significant bytes. 
*/ + DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes", + (ulong) (end_pos - start_pos))); + for (start_pos=pos ; start_pos < end_pos ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + bindigits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + break; + case FIELD_CONSTANT: + case FIELD_ZERO: + case FIELD_CHECK: + DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK")); + start_pos=end_pos; + break; + case FIELD_INTERVALL: + global_count=count; + pos=(uchar*) tree_search(&count->int_tree, start_pos, + count->int_tree.custom_arg); + intervall=(uint) (pos - count->tree_buff)/field_length; + DBUG_PRINT("fields", ("FIELD_INTERVALL")); + DBUG_PRINT("fields", ("index: %4u code: 0x%s bits: %2u", + intervall, hexdigits(tree->code[intervall]), + (uint) tree->code_len[intervall])); + write_bits(tree->code[intervall],(uint) tree->code_len[intervall]); + start_pos=end_pos; + break; + case FIELD_BLOB: + { + ulong blob_length= _ma_calc_blob_length(field_length- + portable_sizeof_char_ptr, + start_pos); + /* Empty blobs are encoded with a single 1 bit. */ + if (!blob_length) + { + DBUG_PRINT("fields", ("FIELD_BLOB empty, bits: 1")); + write_bits(1,1); + } + else + { + uchar *blob,*blob_end; + DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits: 1")); + write_bits(0,1); + /* Write the blob length. */ + DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u", + blob_length, count->length_bits)); + write_bits(blob_length,count->length_bits); + memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr, + sizeof(char*)); + blob_end=blob+blob_length; + /* Encode the blob bytes. 
*/ + for ( ; blob < blob_end ; blob++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *blob, hexdigits(tree->code[(uchar) *blob]), + (uint) tree->code_len[(uchar) *blob], + bindigits(tree->code[(uchar) *start_pos], + (uint)tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *blob], + (uint) tree->code_len[(uchar) *blob]); + } + tot_blob_length+=blob_length; + } + start_pos= end_pos; + break; + } + case FIELD_VARCHAR: + { + uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1); + ulong col_length= (var_pack_length == 1 ? + (uint) *(uchar*) start_pos : + uint2korr(start_pos)); + /* Empty varchar are encoded with a single 1 bit. */ + if (!col_length) + { + DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits: 1")); + write_bits(1,1); /* Empty varchar */ + } + else + { + uchar *end= start_pos + var_pack_length + col_length; + DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits: 1")); + write_bits(0,1); + /* Write the varchar length. */ + DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u", + col_length, count->length_bits)); + write_bits(col_length,count->length_bits); + /* Encode the varchar bytes. 
*/ + for (start_pos+= var_pack_length ; start_pos < end ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + bindigits(tree->code[(uchar) *start_pos], + (uint)tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + } + start_pos= end_pos; + break; + } + case FIELD_LAST: + case FIELD_enum_val_count: + abort(); /* Impossible */ + } + start_pos+=count->max_zero_fill; + DBUG_PRINT("fields", ("---")); + } + flush_bits(); + length=(ulong) (file_buffer.pos - record_pos) - max_pack_length; + pack_length= _ma_save_pack_length(pack_version, record_pos, length); + if (pack_blob_length) + pack_length+= _ma_save_pack_length(pack_version, + record_pos + pack_length, + tot_blob_length); + DBUG_PRINT("fields", ("record: %lu length: %lu blob-length: %lu " + "length-bytes: %lu", (ulong) record_count, length, + tot_blob_length, pack_length)); + DBUG_PRINT("fields", ("===")); + + /* Correct file buffer if the header was smaller */ + if (pack_length != max_pack_length) + { + bmove(record_pos+pack_length,record_pos+max_pack_length,length); + file_buffer.pos-= (max_pack_length-pack_length); + } + if (length < (ulong) min_record_length) + min_record_length=(uint) length; + if (length > (ulong) max_record_length) + max_record_length=(uint) length; + record_count++; + if (write_loop && record_count % WRITE_COUNT == 0) + { + VOID(printf("%lu\r", (ulong) record_count)); + VOID(fflush(stdout)); + } + } + else if (error != HA_ERR_RECORD_DELETED) + break; + } + if (error == HA_ERR_END_OF_FILE) + error=0; + else + { + VOID(fprintf(stderr, "%s: Got error %d reading records\n", + my_progname, error)); + } + if (verbose >= 2) + VOID(printf("wrote %s records.\n", llstr((longlong) record_count, llbuf))); + + my_afree(record); + mrg->ref_length=max_pack_length; + 
mrg->min_pack_length=max_record_length ? min_record_length : 0; + mrg->max_pack_length=max_record_length; + DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0)); +} + + +static char *make_new_name(char *new_name, char *old_name) +{ + return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4); +} + +static char *make_old_name(char *new_name, char *old_name) +{ + return fn_format(new_name,old_name,"",OLD_EXT,2+4); +} + + /* rutines for bit writing buffer */ + +static void init_file_buffer(File file, pbool read_buffer) +{ + file_buffer.file=file; + file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE), + MYF(MY_WME)); + file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8; + file_buffer.pos_in_file=0; + error_on_write=0; + if (read_buffer) + { + + file_buffer.pos=file_buffer.end; + file_buffer.bits=0; + } + else + { + file_buffer.pos=file_buffer.buffer; + file_buffer.bits=BITS_SAVED; + } + file_buffer.bitbucket= 0; +} + + +static int flush_buffer(ulong neaded_length) +{ + ulong length; + + /* + file_buffer.end is 8 bytes lower than the real end of the buffer. + This is done so that the end-of-buffer condition does not need to be + checked for every uchar (see write_bits()). Consequently, + file_buffer.pos can become greater than file_buffer.end. The + algorithms in the other functions ensure that there will never be + more than 8 bytes written to the buffer without an end-of-buffer + check. So the buffer cannot be overrun. But we need to check for the + near-to-buffer-end condition to avoid a negative result, which is + casted to unsigned and thus becomes giant. 
+ */ + if ((file_buffer.pos < file_buffer.end) && + ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length)) + return 0; + length=(ulong) (file_buffer.pos-file_buffer.buffer); + file_buffer.pos=file_buffer.buffer; + file_buffer.pos_in_file+=length; + if (test_only) + return 0; + if (error_on_write|| my_write(file_buffer.file, + (const uchar*) file_buffer.buffer, + length, + MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL))) + { + error_on_write=1; + return 1; + } + + if (neaded_length != ~(ulong) 0 && + (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length) + { + uchar *tmp; + neaded_length+=256; /* some margin */ + tmp= (uchar*) my_realloc(file_buffer.buffer, neaded_length,MYF(MY_WME)); + if (!tmp) + return 1; + file_buffer.pos= (tmp + (ulong) (file_buffer.pos - file_buffer.buffer)); + file_buffer.buffer= tmp; + file_buffer.end= (tmp+neaded_length-8); + } + return 0; +} + + +static void end_file_buffer(void) +{ + my_free(file_buffer.buffer, MYF(0)); +} + + /* output `bits` low bits of `value' */ + +static void write_bits(register ulonglong value, register uint bits) +{ + DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) || + (bits == 8 * sizeof(value))); + + if ((file_buffer.bits-= (int) bits) >= 0) + { + file_buffer.bitbucket|= value << file_buffer.bits; + } + else + { + reg3 ulonglong bit_buffer; + bits= (uint) -file_buffer.bits; + bit_buffer= (file_buffer.bitbucket | + ((bits != 8 * sizeof(value)) ? 
(value >> bits) : 0)); +#if BITS_SAVED == 64 + *file_buffer.pos++= (uchar) (bit_buffer >> 56); + *file_buffer.pos++= (uchar) (bit_buffer >> 48); + *file_buffer.pos++= (uchar) (bit_buffer >> 40); + *file_buffer.pos++= (uchar) (bit_buffer >> 32); +#endif + *file_buffer.pos++= (uchar) (bit_buffer >> 24); + *file_buffer.pos++= (uchar) (bit_buffer >> 16); + *file_buffer.pos++= (uchar) (bit_buffer >> 8); + *file_buffer.pos++= (uchar) (bit_buffer); + + if (bits != 8 * sizeof(value)) + value&= (((ulonglong) 1) << bits) - 1; + if (file_buffer.pos >= file_buffer.end) + VOID(flush_buffer(~ (ulong) 0)); + file_buffer.bits=(int) (BITS_SAVED - bits); + file_buffer.bitbucket= value << (BITS_SAVED - bits); + } + return; +} + + /* Flush bits in bit_buffer to buffer */ + +static void flush_bits(void) +{ + int bits; + ulonglong bit_buffer; + + bits= file_buffer.bits & ~7; + bit_buffer= file_buffer.bitbucket >> bits; + bits= BITS_SAVED - bits; + while (bits > 0) + { + bits-= 8; + *file_buffer.pos++= (uchar) (bit_buffer >> bits); + } + if (file_buffer.pos >= file_buffer.end) + VOID(flush_buffer(~ (ulong) 0)); + file_buffer.bits= BITS_SAVED; + file_buffer.bitbucket= 0; +} + + +/**************************************************************************** +** functions to handle the joined files +****************************************************************************/ + +static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg, + my_off_t new_length, + ha_checksum crc) +{ + MARIA_SHARE *share=isam_file->s; + uint options=mi_uint2korr(share->state.header.options); + uint key; + DBUG_ENTER("save_state"); + + options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA; + mi_int2store(share->state.header.options,options); + /* Save the original file type of we have to undo the packing later */ + share->state.header.org_data_file_type= share->state.header.data_file_type; + share->state.header.data_file_type= COMPRESSED_RECORD; + + share->state.state.data_file_length=new_length; + 
share->state.state.del=0; + share->state.state.empty=0; + share->state.dellink= HA_OFFSET_ERROR; + share->state.split=(ha_rows) mrg->records; + share->state.version=(ulong) time((time_t*) 0); + if (share->base.born_transactional) + share->state.create_rename_lsn= share->state.is_of_horizon= + share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS; + if (! maria_is_all_keys_active(share->state.key_map, share->base.keys)) + { + /* + Some indexes are disabled, cannot use current key_file_length value + as an estimate of upper bound of index file size. Use packed data file + size instead. + */ + share->state.state.key_file_length= new_length; + } + /* + If there are no disabled indexes, keep key_file_length value from + original file so "aria_chk -rq" can use this value (this is necessary + because index size cannot be easily calculated for fulltext keys) + */ + maria_clear_all_keys_active(share->state.key_map); + for (key=0 ; key < share->base.keys ; key++) + share->state.key_root[key]= HA_OFFSET_ERROR; + share->state.key_del= HA_OFFSET_ERROR; + share->state.state.checksum= crc; /* Save crc in file */ + share->changed=1; /* Force write of header */ + share->state.open_count=0; + share->global_changed=0; + VOID(my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0))); + if (share->base.keys) + isamchk_neaded=1; + DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file, + &share->state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO)); +} + + +static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length, + ha_checksum crc) +{ + MARIA_STATE_INFO state; + MARIA_HA *isam_file=mrg->file[0]; + uint options; + DBUG_ENTER("save_state_mrg"); + + state= isam_file->s->state; + options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD | + HA_OPTION_READ_ONLY_DATA); + mi_int2store(state.header.options,options); + /* Save the original file type of we have to undo the packing later */ + state.header.org_data_file_type= 
state.header.data_file_type; + state.header.data_file_type= COMPRESSED_RECORD; + + state.state.data_file_length=new_length; + state.state.del=0; + state.state.empty=0; + state.state.records=state.split=(ha_rows) mrg->records; + state.create_rename_lsn= state.is_of_horizon= state.skip_redo_lsn= + LSN_NEEDS_NEW_STATE_LSNS; + + /* See comment above in save_state about key_file_length handling. */ + if (mrg->src_file_has_indexes_disabled) + { + isam_file->s->state.state.key_file_length= + max(isam_file->s->state.state.key_file_length, new_length); + } + state.dellink= HA_OFFSET_ERROR; + state.version=(ulong) time((time_t*) 0); + maria_clear_all_keys_active(state.key_map); + state.state.checksum=crc; + if (isam_file->s->base.keys) + isamchk_neaded=1; + state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */ + DBUG_RETURN (_ma_state_info_write_sub(file, &state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO)); +} + + +/* reset for mrg_rrnd */ + +static void mrg_reset(PACK_MRG_INFO *mrg) +{ + if (mrg->current) + { + maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0); + mrg->current=0; + } +} + +static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf) +{ + int error; + MARIA_HA *isam_info; + my_off_t filepos; + + if (!info->current) + { + isam_info= *(info->current=info->file); + info->end=info->current+info->count; + maria_reset(isam_info); + maria_extra(isam_info, HA_EXTRA_CACHE, 0); + if ((error= maria_scan_init(isam_info))) + return(error); + } + else + isam_info= *info->current; + + for (;;) + { + if (!(error= maria_scan(isam_info, buf)) || + error != HA_ERR_END_OF_FILE) + return (error); + maria_scan_end(isam_info); + maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0); + if (info->current+1 == info->end) + return(HA_ERR_END_OF_FILE); + info->current++; + isam_info= *info->current; + filepos=isam_info->s->pack.header_length; + maria_reset(isam_info); + maria_extra(isam_info,HA_EXTRA_CACHE, 0); + if ((error= maria_scan_init(isam_info))) 
+ return(error); + } +} + + +static int mrg_close(PACK_MRG_INFO *mrg) +{ + uint i; + int error=0; + DBUG_ENTER("mrg_close"); + + for (i=0 ; i < mrg->count ; i++) + error|=maria_close(mrg->file[i]); + if (mrg->free_file) + my_free(mrg->file, MYF(0)); + DBUG_RETURN(error); +} + + +#if !defined(DBUG_OFF) +/* + Fake the counts to get big Huffman codes. + + SYNOPSIS + fakebigcodes() + huff_counts A pointer to the counts array. + end_count A pointer past the counts array. + + DESCRIPTION + + Huffman coding works by removing the two least frequent values from + the list of values and add a new value with the sum of their + incidences in a loop until only one value is left. Every time a + value is reused for a new value, it gets one more bit for its + encoding. Hence, the least frequent values get the longest codes. + + To get a maximum code length for a value, two of the values must + have an incidence of 1. As their sum is 2, the next infrequent value + must have at least an incidence of 2, then 4, 8, 16 and so on. This + means that one needs 2**n bytes (values) for a code length of n + bits. However, using more distinct values forces the use of longer + codes, or reaching the code length with less total bytes (values). + + To get 64(32)-bit codes, I sort the counts by decreasing incidence. + I assign counts of 1 to the two most frequent values, a count of 2 + for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All + the remaining values get 1. That way every possible uchar has an + assigned code, though not all codes are used if not all uchar values + are present in the column. + + This strategy would work with distinct column values too, but + requires that at least 64(32) values are present. To make things + easier here, I cancel all distinct column values and force byte + compression for all columns. 
+ + RETURN + void +*/ + +static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count) +{ + HUFF_COUNTS *count; + my_off_t *cur_count_p; + my_off_t *end_count_p; + my_off_t **cur_sort_p; + my_off_t **end_sort_p; + my_off_t *sort_counts[256]; + my_off_t total; + DBUG_ENTER("fakebigcodes"); + + for (count= huff_counts; count < end_count; count++) + { + /* + Remove distinct column values. + */ + if (huff_counts->tree_buff) + { + my_free(huff_counts->tree_buff, MYF(0)); + delete_tree(&huff_counts->int_tree); + huff_counts->tree_buff= NULL; + DBUG_PRINT("fakebigcodes", ("freed distinct column values")); + } + + /* + Sort counts by decreasing incidence. + */ + cur_count_p= count->counts; + end_count_p= cur_count_p + 256; + cur_sort_p= sort_counts; + while (cur_count_p < end_count_p) + *(cur_sort_p++)= cur_count_p++; + (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp); + + /* + Assign faked counts. + */ + cur_sort_p= sort_counts; +#if SIZEOF_LONG_LONG > 4 + end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1; +#else + end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2; +#endif + /* Most frequent value gets a faked count of 1. */ + **(cur_sort_p++)= 1; + total= 1; + while (cur_sort_p < end_sort_p) + { + **(cur_sort_p++)= total; + total<<= 1; + } + /* Set the last value. */ + **(cur_sort_p++)= --total; + /* + Set the remaining counts. + */ + end_sort_p= sort_counts + 256; + while (cur_sort_p < end_sort_p) + **(cur_sort_p++)= 1; + } + DBUG_VOID_RETURN; +} + + +/* + Compare two counts for reverse sorting. + + SYNOPSIS + fakecmp() + count1 One count. + count2 Another count. + + RETURN + 1 count1 < count2 + 0 count1 == count2 + -1 count1 > count2 +*/ + +static int fakecmp(my_off_t **count1, my_off_t **count2) +{ + return ((**count1 < **count2) ? 1 : + (**count1 > **count2) ? 
-1 : 0); +} +#endif diff --git a/storage/maria/maria_read_log.c b/storage/maria/maria_read_log.c new file mode 100644 index 00000000000..de45eb0bcb6 --- /dev/null +++ b/storage/maria/maria_read_log.c @@ -0,0 +1,308 @@ +/* Copyright (C) 2007 MySQL AB + Copyright (C) 2010 Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#include "ma_recovery.h" +#include <my_getopt.h> + +#define LOG_FLAGS 0 + +static const char *load_default_groups[]= { "aria_read_log",0 }; +static void get_options(int *argc,char * * *argv); +#ifndef DBUG_OFF +#if defined(__WIN__) +const char *default_dbug_option= "d:t:O,\\aria_read_log.trace"; +#else +const char *default_dbug_option= "d:t:o,/tmp/aria_read_log.trace"; +#endif +#endif /* DBUG_OFF */ +static my_bool opt_display_only, opt_apply, opt_apply_undo, opt_silent; +static my_bool opt_check; +static const char *opt_tmpdir; +static ulong opt_page_buffer_size; +static ulonglong opt_start_from_lsn, opt_end_lsn, opt_start_from_checkpoint; +static MY_TMPDIR maria_chk_tmpdir; + + +int main(int argc, char **argv) +{ + LSN lsn; + char **default_argv; + uint warnings_count; + MY_INIT(argv[0]); + + load_defaults("my", load_default_groups, &argc, &argv); + default_argv= argv; + maria_data_root= (char *)"."; + get_options(&argc, &argv); + + maria_in_recovery= TRUE; + + if (maria_init()) + { + fprintf(stderr, "Can't init 
Aria engine (%d)\n", errno); + goto err; + } + maria_block_size= 0; /* Use block size from file */ + /* we don't want to create a control file, it MUST exist */ + if (ma_control_file_open(FALSE, TRUE)) + { + fprintf(stderr, "Can't open control file (%d)\n", errno); + goto err; + } + if (last_logno == FILENO_IMPOSSIBLE) + { + fprintf(stderr, "Can't find any log\n"); + goto err; + } + if (init_pagecache(maria_pagecache, opt_page_buffer_size, 0, 0, + maria_block_size, MY_WME) == 0) + { + fprintf(stderr, "Got error in init_pagecache() (errno: %d)\n", errno); + goto err; + } + /* + If log handler does not find the "last_logno" log it will return error, + which is good. + But if it finds a log and this log was crashed, it will create a new log, + which is useless. TODO: start log handler in read-only mode. + */ + if (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, MY_WME) == 0 || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, TRANSLOG_DEFAULT_FLAGS, + opt_display_only)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + goto err; + } + + if (opt_display_only) + printf("You are using --display-only, NOTHING will be written to disk\n"); + + lsn= translog_first_lsn_in_log(); + if (lsn == LSN_ERROR) + { + fprintf(stderr, "Opening transaction log failed\n"); + goto end; + } + if (lsn == LSN_IMPOSSIBLE) + { + fprintf(stdout, "The transaction log is empty\n"); + } + if (opt_start_from_checkpoint && !opt_start_from_lsn && + last_checkpoint_lsn != LSN_IMPOSSIBLE) + { + lsn= LSN_IMPOSSIBLE; /* LSN set in maria_apply_log() */ + fprintf(stdout, "Starting from checkpoint (%lu,0x%lx)\n", + LSN_IN_PARTS(last_checkpoint_lsn)); + } + else + fprintf(stdout, "The transaction log starts from lsn (%lu,0x%lx)\n", + LSN_IN_PARTS(lsn)); + + if (opt_start_from_lsn) + { + if (opt_start_from_lsn < (ulonglong) lsn) + { + fprintf(stderr, "start_from_lsn is too small. 
Aborting\n"); + maria_end(); + goto err; + } + lsn= (LSN) opt_start_from_lsn; + fprintf(stdout, "Starting reading log from lsn (%lu,0x%lx)\n", + LSN_IN_PARTS(lsn)); + } + + if (opt_end_lsn != LSN_IMPOSSIBLE) + { + /* We can't apply undo if we use end_lsn */ + opt_apply_undo= 0; + } + + fprintf(stdout, "TRACE of the last aria_read_log\n"); + if (maria_apply_log(lsn, opt_end_lsn, opt_apply ? MARIA_LOG_APPLY : + (opt_check ? MARIA_LOG_CHECK : + MARIA_LOG_DISPLAY_HEADER), opt_silent ? NULL : stdout, + opt_apply_undo, FALSE, FALSE, &warnings_count)) + goto err; + if (warnings_count == 0) + fprintf(stdout, "%s: SUCCESS\n", my_progname_short); + else + fprintf(stdout, "%s: DOUBTFUL (%u warnings, check previous output)\n", + my_progname_short, warnings_count); + +end: + maria_end(); + free_tmpdir(&maria_chk_tmpdir); + free_defaults(default_argv); + my_end(0); + exit(0); + return 0; /* No compiler warning */ + +err: + /* don't touch anything more, in case we hit a bug */ + fprintf(stderr, "%s: FAILED\n", my_progname_short); + free_tmpdir(&maria_chk_tmpdir); + free_defaults(default_argv); + exit(1); +} + + +#include "ma_check_standalone.h" + +enum options_mc { + OPT_CHARSETS_DIR=256 +}; + +static struct my_option my_long_options[] = +{ + {"apply", 'a', + "Apply log to tables: modifies tables! you should make a backup first! " + " Displays a lot of information if not run with --silent", + (uchar **) &opt_apply, (uchar **) &opt_apply, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"character-sets-dir", OPT_CHARSETS_DIR, + "Directory where character sets are.", + (char**) &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"check", 'c', + "if --display-only, check if record is fully readable (for debugging)", + (uchar **) &opt_check, (uchar **) &opt_check, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', "Output debug log. 
Often the argument is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"help", '?', "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"display-only", 'd', "display brief info read from records' header", + &opt_display_only, &opt_display_only, 0, GET_BOOL, + NO_ARG,0, 0, 0, 0, 0, 0}, + {"aria-log-dir-path", 'l', + "Path to the directory where to store transactional log", + (uchar **) &maria_data_root, (uchar **) &maria_data_root, 0, + GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + { "page-buffer-size", 'P', "", + &opt_page_buffer_size, &opt_page_buffer_size, 0, + GET_ULONG, REQUIRED_ARG, (long) USE_BUFFER_INIT, + (long) USE_BUFFER_INIT, (long) ~(ulong) 0, (long) MALLOC_OVERHEAD, + (long) IO_SIZE, 0}, + { "start-from-lsn", 'o', "Start reading log from this lsn", + &opt_start_from_lsn, &opt_start_from_lsn, + 0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 }, + {"start-from-checkpoint", 'C', "Start applying from last checkpoint", + &opt_start_from_checkpoint, &opt_start_from_checkpoint, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + { "end-lsn", 'e', "Stop applying at this lsn. If end-lsn is used, UNDO:s " + "will not be applied", &opt_end_lsn, &opt_end_lsn, + 0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 }, + {"silent", 's', "Print less information during apply/undo phase", + &opt_silent, &opt_silent, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Print more information during apply/undo phase", + &maria_recovery_verbose, &maria_recovery_verbose, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"tmpdir", 't', "Path for temporary files. Multiple paths can be specified, " + "separated by " +#if defined( __WIN__) || defined(__NETWARE__) + "semicolon (;)" +#else + "colon (:)" +#endif + , (char**) &opt_tmpdir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"undo", 'u', "Apply UNDO records to tables. 
(disable with --disable-undo)", + (uchar **) &opt_apply_undo, (uchar **) &opt_apply_undo, 0, + GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +#include <help_start.h> + +static void print_version(void) +{ + VOID(printf("%s Ver 1.3 for %s on %s\n", + my_progname_short, SYSTEM_TYPE, MACHINE_TYPE)); + NETWARE_SET_SCREEN_MODE(1); +} + + +static void usage(void) +{ + print_version(); + puts("Copyright (C) 2007 MySQL AB"); + puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,"); + puts("and you are welcome to modify and redistribute it under the GPL license\n"); + + puts("Display and apply log records from a Aria transaction log"); + puts("found in the current directory (for now)"); +#ifndef IDENTICAL_PAGES_AFTER_RECOVERY + puts("\nNote: Aria is compiled without -DIDENTICAL_PAGES_AFTER_RECOVERY\n" + "which means that the table files are not byte-to-byte identical to\n" + "files created during normal execution. This should be ok, except for\n" + "test scripts that tries to compare files before and after recovery."); +#endif + VOID(printf("\nUsage: %s OPTIONS\n", my_progname_short)); + puts("You need to use one of -d or -a"); + my_print_help(my_long_options); + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + +#include <help_end.h> + +static my_bool +get_one_option(int optid __attribute__((unused)), + const struct my_option *opt __attribute__((unused)), + char *argument __attribute__((unused))) +{ + switch (optid) { + case '?': + usage(); + exit(0); + case 'V': + print_version(); + exit(0); +#ifndef DBUG_OFF + case '#': + DBUG_SET_INITIAL(argument ? 
argument : default_dbug_option); + break; +#endif + } + return 0; +} + +static void get_options(int *argc,char ***argv) +{ + int ho_error; + + if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option))) + exit(ho_error); + + if (!opt_apply) + opt_apply_undo= FALSE; + + if (((opt_display_only + opt_apply) != 1) || (*argc > 0)) + { + usage(); + exit(1); + } + if (init_tmpdir(&maria_chk_tmpdir, opt_tmpdir)) + exit(1); + maria_tmpdir= &maria_chk_tmpdir; +} diff --git a/storage/maria/maria_rename.sh b/storage/maria/maria_rename.sh new file mode 100755 index 00000000000..fb20e47e635 --- /dev/null +++ b/storage/maria/maria_rename.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +replace myisam maria MYISAM MARIA MyISAM MARIA -- mysql-test/t/*maria*test mysql-test/r/*maria*result + +FILES=`echo sql/ha_maria.{cc,h} include/maria*h storage/maria/*.{c,h}` + +replace myisam maria MYISAM MARIA MyISAM MARIA myisam.h maria.h myisamdef.h maria_def.h mi_ maria_ ft_ maria_ft_ "Copyright (C) 2000" "Copyright (C) 2006" MI_ISAMINFO MARIA_INFO MI_CREATE_INFO MARIA_CREATE_INFO maria_isam_ maria_ MI_INFO MARIA_HA MI_ MARIA_ MARIACHK MARIA_CHK rt_index.h ma_rt_index.h rtree_ maria_rtree rt_key.h ma_rt_key.h rt_mbr.h ma_rt_mbr.h -- $FILES + +replace check_table_is_closed _ma_check_table_is_closed test_if_reopen _ma_test_if_reopen my_n_base_info_read maria_n_base_info_read update_auto_increment _ma_update_auto_increment save_pack_length _ma_save_packlength calc_pack_length _ma_calc_pack_length -- $FILES + +replace mi_ ma_ ft_ ma_ft_ rt_ ma_rt_ myisam maria myisamchk maria_chk myisampack maria_pack myisamlog maria_log -- storage/maria/Makefile.am + +# +# Restore wrong replaces +# + +replace maria_sint1korr mi_sint1korr maria_uint1korr mi_uint1korr maria_sint2korr mi_sint2korr maria_sint3korr mi_sint3korr maria_sint4korr mi_sint4korr maria_sint8korr mi_sint8korr maria_uint2korr mi_uint2korr maria_uint3korr mi_uint3korr maria_uint4korr mi_uint4korr maria_uint5korr mi_uint5korr maria_uint6korr 
mi_uint6korr maria_uint7korr mi_uint7korr maria_uint8korr mi_uint8korr maria_int1store mi_int1store maria_int2store mi_int2store maria_int3store mi_int3store maria_int4store mi_int4store maria_int5store mi_int5store maria_int6store mi_int6store maria_int7store mi_int7store maria_int8store mi_int8store maria_float4store mi_float4store maria_float4get mi_float4get maria_float8store mi_float8store maria_float8get mi_float8get maria_rowstore mi_rowstore maria_rowkorr mi_rowkorr maria_sizestore mi_sizestore maria_sizekorr mi_sizekorr _maria_maria_ _maria MARIA_MAX_POSSIBLE_KEY HA_MAX_POSSIBLE_KEY MARIA_MAX_KEY_BUFF HA_MAX_KEY_BUFF MARIA_MAX_KEY_SEG HA_MAX_KEY_SEG maria_ft_sintXkorr ft_sintXkorr maria_ft_intXstore ft_intXstore maria_ft_boolean_syntax ft_boolean_syntax maria_ft_min_word_len ft_min_word_len maria_ft_max_word_len ft_max_word_len -- $FILES diff --git a/storage/maria/plug.in b/storage/maria/plug.in new file mode 100644 index 00000000000..008d82250c8 --- /dev/null +++ b/storage/maria/plug.in @@ -0,0 +1,19 @@ +MYSQL_STORAGE_ENGINE(aria,, [Aria Storage Engine], + [Crash-safe tables with MyISAM heritage], [default,max,max-no-ndb]) +MYSQL_PLUGIN_DIRECTORY(aria, [storage/maria]) +MYSQL_PLUGIN_STATIC(aria, [libaria.a]) +MYSQL_PLUGIN_DEPENDS_ON_MYSQL_INTERNALS(aria, [ha_maria.cc]) + +MYSQL_PLUGIN_ACTIONS(aria, [ +# AC_CONFIG_FILES(storage/maria/unittest/Makefile) +AC_ARG_WITH(aria-tmp-tables, + AC_HELP_STRING([--with-aria-tmp-tables],[Use Aria for internal temporary tables]), + [with_aria_tmp_tables=$withval], + [with_aria_tmp_tables=yes] +) + +if test "$with_aria_tmp_tables" = "yes" +then + AC_DEFINE([USE_MARIA_FOR_TMP_TABLES], [1], [Aria is used for internal temporary tables]) +fi +]) diff --git a/storage/maria/tablockman.c b/storage/maria/tablockman.c new file mode 100644 index 00000000000..1bb8889aaa7 --- /dev/null +++ b/storage/maria/tablockman.c @@ -0,0 +1,674 @@ +/* QQ: TODO - allocate everything from dynarrays !!! 
(benchmark) */ +/* QQ: automatically place S instead of LS if possible */ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include <my_base.h> +#include <hash.h> +#include "tablockman.h" + +/* + Lock Manager for Table Locks + + The code below handles locks on resources - but it is optimized for a + case when a number of resources is not very large, and there are many of + locks per resource - that is a resource is likely to be a table or a + database, but hardly a row in a table. + + Locks belong to "lock owners". A Lock Owner is uniquely identified by a + 16-bit number - loid (lock owner identifier). A function loid_to_tlo must + be provided by the application that takes such a number as an argument + and returns a TABLE_LOCK_OWNER structure. + + Lock levels are completely defined by three tables. Lock compatibility + matrix specifies which locks can be held at the same time on a resource. + Lock combining matrix specifies what lock level has the same behaviour as + a pair of two locks of given levels. getlock_result matrix simplifies + intention locking and lock escalation for an application, basically it + defines which locks are intention locks and which locks are "loose" + locks. 
It is only used to provide better diagnostics for the + application; the lock manager itself does not differentiate between normal, + intention, and loose locks. + + The assumptions are: few distinct resources, many locks are held at the + same time on one resource. Thus: a lock structure _per resource_ can be + rather large; a lock structure _per lock_ does not need to be very small + either; we need to optimize for _speed_. Operations we need are: place a + lock, check if a particular transaction already has a lock on this + resource, check if a conflicting lock exists, if yes - find who owns it. + + Solution: every resource has a structure with + 1. Hash of latest (see the lock upgrade section below) granted locks with + loid as a key. Thus, checking if a given transaction has a lock on + this resource is an O(1) operation. + 2. Doubly-linked lists of all granted locks - one list for every lock + type. Thus, checking if a conflicting lock exists is a check whether + an appropriate list head pointer is not null, also O(1). + 3. Every lock has a loid of the owner, thus checking who owns a + conflicting lock is also O(1). + 4. Deque of waiting locks. It's a deque (double-ended queue) not a fifo, + because for lock upgrades requests are added to the queue head, not + tail. This is the single place where it gets O(N) in the number + of locks - when a transaction wakes up from waiting on a condition, + it may need to scan the queue backward to the beginning to find + a conflicting lock. It is guaranteed though that "all transactions + before it" received the same - or earlier - signal. In other words a + transaction needs to scan all transactions before it that received the + signal but didn't have a chance to resume the execution yet, so + in practice the OS scheduler won't let the scan be O(N). + + Waiting: if there is a conflicting lock or if the wait queue is not empty, a + requested lock cannot be granted at once. It is added to the end of the + wait queue. 
If the queue was empty and there is a conflicting lock - the + "blocker" transaction is the owner of this lock. If the queue is not empty, + the owner of the previous lock in the queue is the "blocker". But if the + previous lock is compatible with the request, then the "blocker" is the + transaction that the owner of the lock at the end of the queue is waiting + for (in other words, our lock is added to the end of the wait queue, and + our blocker is the same as that of the lock right before us). + + Lock upgrades: when a thread that has a lock on a given resource + requests a new lock on the same resource and the old lock is not enough + to satisfy new lock requirements (which is defined by + lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock + (defined by lock_combining_matrix as above) is placed. Depending on + other granted locks it is immediately granted or it has to wait. Here the + lock is added to the start of the waiting queue, not to the end. The old + lock is removed from the hash, but not from the doubly-linked lists. + (indeed, a transaction checks "do I have a lock on this resource ?" by + looking in a hash, and it should find the latest lock, so old locks must be + removed; but a transaction checks "are there conflicting locks ?" by + checking doubly-linked lists; it doesn't matter if it finds an old + lock - if it were removed, the new lock would also be a conflict). + So, a hash contains only "latest" locks - there can be only one latest + lock per resource per transaction. But doubly-linked lists contain all + locks, even "obsolete" ones, because it doesn't hurt. Note that old + locks cannot be freed early; in particular they stay in the + 'active_locks' list of a lock owner, because they may be "re-enabled" + on a savepoint rollback. + + To better support table-row relations where one needs to lock the table + with an intention lock before locking the row, extended diagnostics are + provided. 
When an intention lock (presumably on a table) is granted, + tablockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row, + perhaps the thread already has a normal lock on this table), + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual), + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check + whether it's possible to lock the row, but no need to lock it - perhaps + the thread has a loose lock on this table). This is defined by the + getlock_result[] table. + + Instant duration locks are not supported. Though they're trivial to add, + they are normally only used on rows, not on tables. So, presumably, + they are not needed here. + + Mutexes: there are table mutexes (LOCKED_TABLE::mutex), lock owner mutexes + (TABLE_LOCK_OWNER::mutex), and a pool mutex (TABLOCKMAN::pool_mutex). + The table mutex protects operations on the table lock structures, and lock + owner pointers waiting_for and waiting_for_loid. + The lock owner mutex is only used to wait on the lock owner condition + (TABLE_LOCK_OWNER::cond); there's no need to protect the owner's lock + structures, and only the lock owner itself may access them. + The pool mutex protects a pool of unused locks. Note the locking order: + first the table mutex, then the owner mutex or the pool mutex. + A table mutex lock cannot be attempted while an owner or pool mutex is locked. + No mutex lock can be attempted while an owner or pool mutex is locked. +*/ + +/* + Lock compatibility matrix. + + It's asymmetric. Read it as "Somebody has the lock <value in the row + label>, can I set the lock <value in the column label> ?" + + ') Though you can take LS lock while somebody has S lock, it makes no + sense - it's simpler to take S lock too. + + 1 - compatible + 0 - incompatible + -1 - "impossible", so that we can assert the impossibility. 
+*/ +static const int lock_compatibility_matrix[10][10]= +{ /* N S X IS IX SIX LS LX SLX LSIX */ + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, /* N */ + { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* S */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* X */ + { -1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, /* IS */ + { -1, 0, 0, 1, 1, 0, 1, 1, 0, 1 }, /* IX */ + { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 }, /* SIX */ + { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* LS */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* LX */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* SLX */ + { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 } /* LSIX */ +}; + +/* + Lock combining matrix. + + It's symmetric. Read it as "what lock level L is identical to the + set of two locks A and B" + + One should never get N from it, we assert the impossibility +*/ +static const enum lockman_lock_type lock_combining_matrix[10][10]= +{/* N S X IS IX SIX LS LX SLX LSIX */ + { N, N, N, N, N, N, N, N, N, N}, /* N */ + { N, S, X, S, SIX, SIX, S, SLX, SLX, SIX}, /* S */ + { N, X, X, X, X, X, X, X, X, X}, /* X */ + { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX}, /* IS */ + { N, SIX, X, IX, IX, SIX, LSIX, LX, SLX, LSIX}, /* IX */ + { N, SIX, X, SIX, SIX, SIX, SIX, SLX, SLX, SIX}, /* SIX */ + { N, S, X, LS, LSIX, SIX, LS, LX, SLX, LSIX}, /* LS */ + { N, SLX, X, LX, LX, SLX, LX, LX, SLX, LX}, /* LX */ + { N, SLX, X, SLX, SLX, SLX, SLX, SLX, SLX, SLX}, /* SLX */ + { N, SIX, X, LSIX, LSIX, SIX, LSIX, LX, SLX, LSIX} /* LSIX */ +}; + +/* + the return codes for lockman_getlock + + It's asymmetric. Read it as "I have the lock <value in the row label>, + what value should be returned for <value in the column label> ?" + + 0 means impossible combination (assert!) + + Defines below help to preserve the table structure. 
+ I/L/A values are self explanatory + x means the combination is possible (assert should not crash) + but it cannot happen in row locks, only in table locks (S,X), + or lock escalations (LS,LX) +*/ +#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE +#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE +#define A GOT_THE_LOCK +#define x GOT_THE_LOCK +static const enum lockman_getlock_result getlock_result[10][10]= +{/* N S X IS IX SIX LS LX SLX LSIX */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* N */ + { 0, x, 0, A, 0, 0, x, 0, 0, 0}, /* S */ + { 0, x, x, A, A, 0, x, x, 0, 0}, /* X */ + { 0, 0, 0, I, 0, 0, 0, 0, 0, 0}, /* IS */ + { 0, 0, 0, I, I, 0, 0, 0, 0, 0}, /* IX */ + { 0, x, 0, A, I, 0, x, 0, 0, 0}, /* SIX */ + { 0, 0, 0, L, 0, 0, x, 0, 0, 0}, /* LS */ + { 0, 0, 0, L, L, 0, x, x, 0, 0}, /* LX */ + { 0, x, 0, A, L, 0, x, x, 0, 0}, /* SLX */ + { 0, 0, 0, L, I, 0, x, 0, 0, 0} /* LSIX */ +}; +#undef I +#undef L +#undef A +#undef x + +/* + this structure is optimized for a case when there're many locks + on the same resource - e.g. a table +*/ + +struct st_table_lock { + /* QQ: do we need upgraded_from ? 
*/ + struct st_table_lock *next_in_lo, *upgraded_from, *next, *prev; + struct st_locked_table *table; + uint16 loid; + uchar lock_type; +}; + +#define hash_insert my_hash_insert /* for consistency :) */ + +static inline +TABLE_LOCK *find_by_loid(LOCKED_TABLE *table, uint16 loid) +{ + return (TABLE_LOCK *)hash_search(& table->latest_locks, + (uchar *)& loid, sizeof(loid)); +} + +static inline +void remove_from_wait_queue(TABLE_LOCK *lock, LOCKED_TABLE *table) +{ + DBUG_ASSERT(table == lock->table); + if (lock->prev) + { + DBUG_ASSERT(table->wait_queue_out != lock); + lock->prev->next= lock->next; + } + else + { + DBUG_ASSERT(table->wait_queue_out == lock); + table->wait_queue_out= lock->next; + } + if (lock->next) + { + DBUG_ASSERT(table->wait_queue_in != lock); + lock->next->prev= lock->prev; + } + else + { + DBUG_ASSERT(table->wait_queue_in == lock); + table->wait_queue_in= lock->prev; + } +} + +/* + DESCRIPTION + tries to lock a resource 'table' with a lock level 'lock'. + + RETURN + see enum lockman_getlock_result +*/ +enum lockman_getlock_result +tablockman_getlock(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo, + LOCKED_TABLE *table, enum lockman_lock_type lock) +{ + TABLE_LOCK *old, *new, *blocker, *blocker2; + TABLE_LOCK_OWNER *wait_for; + struct timespec timeout; + enum lockman_lock_type new_lock; + enum lockman_getlock_result res; + int i; + + DBUG_ASSERT(lo->waiting_lock == 0); + DBUG_ASSERT(lo->waiting_for == 0); + DBUG_ASSERT(lo->waiting_for_loid == 0); + + pthread_mutex_lock(& table->mutex); + /* do we already have a lock on this resource ? */ + old= find_by_loid(table, lo->loid); + + /* calculate the level of the upgraded lock, if yes */ + new_lock= old ? lock_combining_matrix[old->lock_type][lock] : lock; + + /* and check if old lock is enough to satisfy the new request */ + if (old && new_lock == old->lock_type) + { + /* yes */ + res= getlock_result[old->lock_type][lock]; + goto ret; + } + + /* no, placing a new lock. 
first - take a free lock structure from the pool */ + pthread_mutex_lock(& lm->pool_mutex); + new= lm->pool; + if (new) + { + lm->pool= new->next; + pthread_mutex_unlock(& lm->pool_mutex); + } + else + { + pthread_mutex_unlock(& lm->pool_mutex); + new= (TABLE_LOCK *)my_malloc(sizeof(*new), MYF(MY_WME)); + if (unlikely(!new)) + { + res= NO_MEMORY_FOR_LOCK; + goto ret; + } + } + + new->loid= lo->loid; + new->lock_type= new_lock; + new->table= table; + + /* and try to place it */ + for (new->prev= table->wait_queue_in;;) + { + wait_for= 0; + if (!old) + { + /* not upgrading - a lock must be added to the _end_ of the wait queue */ + for (blocker= new->prev; blocker && !wait_for; blocker= blocker->prev) + { + TABLE_LOCK_OWNER *tmp= lm->loid_to_tlo(blocker->loid); + + /* find a blocking lock */ + DBUG_ASSERT(table->wait_queue_out); + DBUG_ASSERT(table->wait_queue_in); + if (!lock_compatibility_matrix[blocker->lock_type][lock]) + { + /* found! */ + wait_for= tmp; + break; + } + + /* + hmm, the lock before doesn't block us, let's look one step further. + the condition below means: + + if we never waited on a condition yet + OR + the lock before ours (blocker) waits on a lock (blocker2) that is + present in the hash AND and conflicts with 'blocker' + + the condition after OR may fail if 'blocker2' was removed from + the hash, its signal woke us up, but 'blocker' itself didn't see + the signal yet. + */ + if (!lo->waiting_lock || + ((blocker2= find_by_loid(table, tmp->waiting_for_loid)) && + !lock_compatibility_matrix[blocker2->lock_type] + [blocker->lock_type])) + { + /* but it's waiting for a real lock. we'll wait for the same lock */ + wait_for= tmp->waiting_for; + /* + We don't really need tmp->waiting_for, as tmp->waiting_for_loid + is enough. waiting_for is just a local cache to avoid calling + loid_to_tlo(). 
+ But it's essensial that tmp->waiting_for pointer can ONLY + be dereferenced if find_by_loid() above returns a non-null + pointer, because a TABLE_LOCK_OWNER object that it points to + may've been freed when we come here after a signal. + In particular tmp->waiting_for_loid cannot be replaced + with tmp->waiting_for->loid. + */ + DBUG_ASSERT(wait_for == lm->loid_to_tlo(tmp->waiting_for_loid)); + break; + } + + /* + otherwise - a lock it's waiting for doesn't exist. + We've no choice but to scan the wait queue backwards, looking + for a conflicting lock or a lock waiting for a real lock. + QQ is there a way to avoid this scanning ? + */ + } + } + + if (wait_for == 0) + { + /* checking for compatibility with existing locks */ + for (blocker= 0, i= 0; i < LOCK_TYPES; i++) + { + if (table->active_locks[i] && !lock_compatibility_matrix[i+1][lock]) + { + blocker= table->active_locks[i]; + /* if the first lock in the list is our own - skip it */ + if (blocker->loid == lo->loid) + blocker= blocker->next; + if (blocker) /* found a conflicting lock, need to wait */ + break; + } + } + if (!blocker) /* free to go */ + break; + wait_for= lm->loid_to_tlo(blocker->loid); + } + + /* ok, we're here - the wait is inevitable */ + lo->waiting_for= wait_for; + lo->waiting_for_loid= wait_for->loid; + if (!lo->waiting_lock) /* first iteration of the for() loop */ + { + /* lock upgrade or new lock request ? 
*/ + if (old) + { + /* upgrade - add the lock to the _start_ of the wait queue */ + new->prev= 0; + if ((new->next= table->wait_queue_out)) + new->next->prev= new; + table->wait_queue_out= new; + if (!table->wait_queue_in) + table->wait_queue_in= table->wait_queue_out; + } + else + { + /* new lock - add the lock to the _end_ of the wait queue */ + new->next= 0; + if ((new->prev= table->wait_queue_in)) + new->prev->next= new; + table->wait_queue_in= new; + if (!table->wait_queue_out) + table->wait_queue_out= table->wait_queue_in; + } + lo->waiting_lock= new; + + set_timespec_nsec(timeout,lm->lock_timeout * 1000000); + + } + + /* + prepare to wait. + we must lock blocker's mutex to wait on blocker's cond. + and we must release table's mutex. + note that blocker's mutex is locked _before_ table's mutex is released + */ + pthread_mutex_lock(wait_for->mutex); + pthread_mutex_unlock(& table->mutex); + + /* now really wait */ + i= pthread_cond_timedwait(wait_for->cond, wait_for->mutex, & timeout); + + pthread_mutex_unlock(wait_for->mutex); + + if (i == ETIMEDOUT || i == ETIME) + { + /* we rely on the caller to rollback and release all locks */ + res= LOCK_TIMEOUT; + goto ret2; + } + + pthread_mutex_lock(& table->mutex); + + /* ... and repeat from the beginning */ + } + /* yeah! 
we can place the lock now */ + + /* remove the lock from the wait queue, if it was there */ + if (lo->waiting_lock) + { + remove_from_wait_queue(new, table); + lo->waiting_lock= 0; + lo->waiting_for= 0; + lo->waiting_for_loid= 0; + } + + /* add it to the list of all locks of this lock owner */ + new->next_in_lo= lo->active_locks; + lo->active_locks= new; + + /* and to the list of active locks of this lock type */ + new->prev= 0; + if ((new->next= table->active_locks[new_lock-1])) + new->next->prev= new; + table->active_locks[new_lock-1]= new; + + /* update the latest_locks hash */ + if (old) + hash_delete(& table->latest_locks, (uchar *)old); + hash_insert(& table->latest_locks, (uchar *)new); + + new->upgraded_from= old; + + res= getlock_result[lock][lock]; + +ret: + pthread_mutex_unlock(& table->mutex); +ret2: + DBUG_ASSERT(res); + return res; +} + +/* + DESCRIPTION + release all locks belonging to a transaction. + signal waiters to continue +*/ +void tablockman_release_locks(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo) +{ + TABLE_LOCK *lock, *local_pool= 0, *local_pool_end; + + /* + instead of adding released locks to a pool one by one, we'll link + them in a list and add to a pool in one short action (under a mutex) + */ + local_pool_end= lo->waiting_lock ? lo->waiting_lock : lo->active_locks; + if (!local_pool_end) + return; + + /* release a waiting lock, if any */ + if ((lock= lo->waiting_lock)) + { + DBUG_ASSERT(lock->loid == lo->loid); + pthread_mutex_lock(& lock->table->mutex); + remove_from_wait_queue(lock, lock->table); + + /* + a special case: if this lock was not the last in the wait queue + and it's compatible with the next lock, than the next lock + is waiting for our blocker though really it waits for us, indirectly. + Signal our blocker to release this next lock (after we removed our + lock from the wait queue, of course). + */ + /* + An example to clarify the above: + trn1> S-lock the table. Granted. + trn2> IX-lock the table. Added to the wait queue. 
trn2 waits on trn1 + trn3> IS-lock the table. The queue is not empty, so IS-lock is added + to the queue. It's compatible with the waiting IX-lock, so trn3 + waits for trn2->waiting_for, that is trn1. + if trn1 releases the lock it signals trn1->cond and both waiting + transactions are awaken. But if trn2 times out, trn3 must be notified + too (as IS and S locks are compatible). So trn2 must signal trn1->cond. + */ + if (lock->next && + lock_compatibility_matrix[lock->next->lock_type][lock->lock_type]) + { + pthread_mutex_lock(lo->waiting_for->mutex); + pthread_cond_broadcast(lo->waiting_for->cond); + pthread_mutex_unlock(lo->waiting_for->mutex); + } + lo->waiting_for= 0; + lo->waiting_for_loid= 0; + pthread_mutex_unlock(& lock->table->mutex); + + lock->next= local_pool; + local_pool= lock; + } + + /* now release granted locks */ + lock= lo->active_locks; + while (lock) + { + TABLE_LOCK *cur= lock; + pthread_mutex_t *mutex= & lock->table->mutex; + DBUG_ASSERT(cur->loid == lo->loid); + + DBUG_ASSERT(lock != lock->next_in_lo); + lock= lock->next_in_lo; + + /* TODO ? group locks by table to reduce the number of mutex locks */ + pthread_mutex_lock(mutex); + hash_delete(& cur->table->latest_locks, (uchar *)cur); + + if (cur->prev) + cur->prev->next= cur->next; + if (cur->next) + cur->next->prev= cur->prev; + if (cur->table->active_locks[cur->lock_type-1] == cur) + cur->table->active_locks[cur->lock_type-1]= cur->next; + + cur->next= local_pool; + local_pool= cur; + + pthread_mutex_unlock(mutex); + } + + lo->waiting_lock= lo->active_locks= 0; + + /* + okay, all locks released. 
now signal that we're leaving, + in case somebody's waiting for it + */ + pthread_mutex_lock(lo->mutex); + pthread_cond_broadcast(lo->cond); + pthread_mutex_unlock(lo->mutex); + + /* and push all freed locks to the lockman's pool */ + pthread_mutex_lock(& lm->pool_mutex); + local_pool_end->next= lm->pool; + lm->pool= local_pool; + pthread_mutex_unlock(& lm->pool_mutex); +} + +void tablockman_init(TABLOCKMAN *lm, loid_to_tlo_func *func, uint timeout) +{ + lm->pool= 0; + lm->loid_to_tlo= func; + lm->lock_timeout= timeout; + pthread_mutex_init(& lm->pool_mutex, MY_MUTEX_INIT_FAST); + my_getsystime(); /* ensure that my_getsystime() is initialized */ +} + +void tablockman_destroy(TABLOCKMAN *lm) +{ + while (lm->pool) + { + TABLE_LOCK *tmp= lm->pool; + lm->pool= tmp->next; + my_free((void *)tmp, MYF(0)); + } + pthread_mutex_destroy(& lm->pool_mutex); +} + +/* + initialize a LOCKED_TABLE structure + + SYNOPSYS + lt a LOCKED_TABLE to initialize + initial_hash_size initial size for 'latest_locks' hash +*/ +void tablockman_init_locked_table(LOCKED_TABLE *lt, int initial_hash_size) +{ + bzero(lt, sizeof(*lt)); + pthread_mutex_init(& lt->mutex, MY_MUTEX_INIT_FAST); + hash_init(& lt->latest_locks, & my_charset_bin, initial_hash_size, + offsetof(TABLE_LOCK, loid), + sizeof(((TABLE_LOCK*)0)->loid), 0, 0, 0); +} + +void tablockman_destroy_locked_table(LOCKED_TABLE *lt) +{ + int i; + + DBUG_ASSERT(lt->wait_queue_out == 0); + DBUG_ASSERT(lt->wait_queue_in == 0); + DBUG_ASSERT(lt->latest_locks.records == 0); + for (i= 0; i<LOCK_TYPES; i++) + DBUG_ASSERT(lt->active_locks[i] == 0); + + hash_free(& lt->latest_locks); + pthread_mutex_destroy(& lt->mutex); +} + +#ifdef EXTRA_DEBUG +static const char *lock2str[LOCK_TYPES+1]= {"N", "S", "X", "IS", "IX", "SIX", + "LS", "LX", "SLX", "LSIX"}; + +void tablockman_print_tlo(TABLE_LOCK_OWNER *lo) +{ + TABLE_LOCK *lock; + + printf("lo%d>", lo->loid); + if ((lock= lo->waiting_lock)) + printf(" (%s.0x%lx)", lock2str[lock->lock_type], 
(ulong)lock->table); + for (lock= lo->active_locks; + lock && lock != lock->next_in_lo; + lock= lock->next_in_lo) + printf(" %s.0x%lx", lock2str[lock->lock_type], (ulong)lock->table); + if (lock && lock == lock->next_in_lo) + printf("!"); + printf("\n"); +} +#endif + diff --git a/storage/maria/tablockman.h b/storage/maria/tablockman.h new file mode 100644 index 00000000000..e33d1aa44e8 --- /dev/null +++ b/storage/maria/tablockman.h @@ -0,0 +1,87 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _tablockman_h +#define _tablockman_h + +/* + Lock levels: + ^^^^^^^^^^^ + + N - "no lock", not a lock, used sometimes internally to simplify the code + S - Shared + X - eXclusive + IS - Intention Shared + IX - Intention eXclusive + SIX - Shared + Intention eXclusive + LS - Loose Shared + LX - Loose eXclusive + SLX - Shared + Loose eXclusive + LSIX - Loose Shared + Intention eXclusive +*/ +#ifndef _lockman_h +/* QQ: TODO remove N-locks */ +enum lockman_lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST }; +enum lockman_getlock_result { + NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT, + GOT_THE_LOCK, + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE, + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE +}; +#endif + +#define LOCK_TYPES (LOCK_TYPE_LAST-1) + +typedef struct st_table_lock TABLE_LOCK; + +typedef struct st_table_lock_owner 
{ + TABLE_LOCK *active_locks; /* list of active locks */ + TABLE_LOCK *waiting_lock; /* waiting lock (one lock only) */ + struct st_table_lock_owner *waiting_for; /* transaction we're waiting for */ + pthread_cond_t *cond; /* transactions waiting for us, wait on 'cond' */ + pthread_mutex_t *mutex; /* mutex is required to use 'cond' */ + uint16 loid, waiting_for_loid; /* Lock Owner IDentifier */ +} TABLE_LOCK_OWNER; + +typedef struct st_locked_table { + pthread_mutex_t mutex; /* mutex for everything below */ + HASH latest_locks; /* latest locks in a hash */ + TABLE_LOCK *active_locks[LOCK_TYPES]; /* dl-list of locks per type */ + TABLE_LOCK *wait_queue_in, *wait_queue_out; /* wait deque (double-end queue)*/ +} LOCKED_TABLE; + +typedef TABLE_LOCK_OWNER *loid_to_tlo_func(uint16); + +typedef struct { + pthread_mutex_t pool_mutex; + TABLE_LOCK *pool; /* lifo pool of free locks */ + uint lock_timeout; /* lock timeout in milliseconds */ + loid_to_tlo_func *loid_to_tlo; /* for mapping loid to TABLE_LOCK_OWNER */ +} TABLOCKMAN; + +void tablockman_init(TABLOCKMAN *, loid_to_tlo_func *, uint); +void tablockman_destroy(TABLOCKMAN *); +enum lockman_getlock_result tablockman_getlock(TABLOCKMAN *, TABLE_LOCK_OWNER *, + LOCKED_TABLE *, enum lockman_lock_type); +void tablockman_release_locks(TABLOCKMAN *, TABLE_LOCK_OWNER *); +void tablockman_init_locked_table(LOCKED_TABLE *, int); +void tablockman_destroy_locked_table(LOCKED_TABLE *); + +#ifdef EXTRA_DEBUG +void tablockman_print_tlo(TABLE_LOCK_OWNER *); +#endif + +#endif + diff --git a/storage/maria/test_pack b/storage/maria/test_pack new file mode 100755 index 00000000000..689645b1661 --- /dev/null +++ b/storage/maria/test_pack @@ -0,0 +1,10 @@ +silent="-s" +suffix="" + +ma_test1$suffix -s ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -us test1 ; maria_chk$suffix -es test1 +ma_test1$suffix -s -S ; maria_pack$suffix --force -s test1 ; 
maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ;maria_chk$suffix -us test1 ; maria_chk$suffix -es test1 +ma_test1$suffix -s -b ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 +ma_test1$suffix -s -w ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -ros test1 ; maria_chk$suffix -es test1 + +ma_test2$suffix -s -t4 ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2 +ma_test2$suffix -s -t4 -b ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2 diff --git a/storage/maria/trnman.c b/storage/maria/trnman.c new file mode 100644 index 00000000000..05330baed76 --- /dev/null +++ b/storage/maria/trnman.c @@ -0,0 +1,979 @@ +/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + +#include <my_global.h> +#include <my_sys.h> +#include <m_string.h> +#include "trnman.h" +#include "ma_checkpoint.h" +#include "ma_control_file.h" + +/* + status variables: + how many trns in the active list currently, + in the committed list currently, allocated since startup. +*/ +uint trnman_active_transactions, trnman_committed_transactions, + trnman_allocated_transactions; + +/* list of active transactions in the trid order */ +static TRN active_list_min, active_list_max; +/* list of committed transactions in the trid order */ +static TRN committed_list_min, committed_list_max; + +/* a counter, used to generate transaction ids */ +static TrID global_trid_generator; + +/* + The minimum existing transaction id for trnman_get_min_trid() + The default value is used when the transaction manager is not initialized; + in that case we are probably called from maria_chk +*/ +static TrID trid_min_read_from= MAX_TRID; + +/* the mutex for everything above */ +static pthread_mutex_t LOCK_trn_list; + +/* LIFO pool of unused TRN structures for reuse */ +static TRN *pool; + +/* a hash that maps trid to a TRN structure; a TRN is inserted by trnman_new_trn() (while still active) and deleted by trnman_end_trn() right before the TRN is returned to the pool */ +static LF_HASH trid_to_trn; + +/* an array that maps short_id of an active transaction to a TRN structure */ +static TRN **short_trid_to_active_trn; + +/* locks for short_trid_to_active_trn and pool */ +static my_atomic_rwlock_t LOCK_short_trid_to_trn, LOCK_pool; +static my_bool default_trnman_end_trans_hook(TRN *, my_bool, my_bool); +static void trnman_free_trn(TRN *); + +my_bool (*trnman_end_trans_hook)(TRN *, my_bool, my_bool)= + default_trnman_end_trans_hook; + +/* + Simple interface functions + QQ: if they stay so simple, should we make them inline? 
+*/ + +uint trnman_increment_locked_tables(TRN *trn) +{ + return trn->locked_tables++; +} + +uint trnman_has_locked_tables(TRN *trn) +{ + return trn->locked_tables; +} + +uint trnman_decrement_locked_tables(TRN *trn) +{ + return --trn->locked_tables; +} + +void trnman_reset_locked_tables(TRN *trn, uint locked_tables) +{ + trn->locked_tables= locked_tables; +} + +#ifdef EXTRA_DEBUG +uint16 trnman_get_flags(TRN *trn) +{ + return trn->flags; +} + +void trnman_set_flags(TRN *trn, uint16 flags) +{ + trn->flags= flags; +} +#endif + +/** Wake up threads waiting for this transaction */ +static void wt_thd_release_self(TRN *trn) +{ + if (trn->wt) + { + WT_RESOURCE_ID rc; + rc.type= &ma_rc_dup_unique; + rc.value= (intptr)trn; + wt_thd_release(trn->wt, & rc); + trn->wt= 0; + } +} + +static my_bool +default_trnman_end_trans_hook(TRN *trn __attribute__ ((unused)), + my_bool commit __attribute__ ((unused)), + my_bool active_transactions + __attribute__ ((unused))) +{ + return 0; +} + + +static uchar *trn_get_hash_key(const uchar *trn, size_t *len, + my_bool unused __attribute__ ((unused))) +{ + *len= sizeof(TrID); + return (uchar *) & ((*((TRN **)trn))->trid); +} + + +/** + @brief Initializes transaction manager. + + @param initial_trid Generated TrIDs will start from initial_trid+1. + + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +int trnman_init(TrID initial_trid) +{ + DBUG_ENTER("trnman_init"); + DBUG_PRINT("enter", ("initial_trid: %lu", (ulong) initial_trid)); + + short_trid_to_active_trn= (TRN **)my_malloc(SHORT_TRID_MAX*sizeof(TRN*), + MYF(MY_WME|MY_ZEROFILL)); + if (unlikely(!short_trid_to_active_trn)) + DBUG_RETURN(1); + short_trid_to_active_trn--; /* min short_id is 1 */ + + /* + Initialize lists. + active_list_max.min_read_from must be larger than any trid, + so that when an active list is empty we would could free + all committed list. 
+ And committed_list_max itself can not be freed so + committed_list_max.commit_trid must not be smaller that + active_list_max.min_read_from + */ + + active_list_max.trid= active_list_min.trid= 0; + active_list_max.min_read_from= MAX_TRID; + active_list_max.next= active_list_min.prev= 0; + active_list_max.prev= &active_list_min; + active_list_min.next= &active_list_max; + + committed_list_max.commit_trid= MAX_TRID; + committed_list_max.next= committed_list_min.prev= 0; + committed_list_max.prev= &committed_list_min; + committed_list_min.next= &committed_list_max; + + trnman_active_transactions= 0; + trnman_committed_transactions= 0; + trnman_allocated_transactions= 0; + /* This is needed for recovery and repair */ + dummy_transaction_object.min_read_from= ~(TrID) 0; + + pool= 0; + global_trid_generator= initial_trid; + trid_min_read_from= initial_trid; + lf_hash_init(&trid_to_trn, sizeof(TRN*), LF_HASH_UNIQUE, + 0, 0, trn_get_hash_key, 0); + DBUG_PRINT("info", ("pthread_mutex_init LOCK_trn_list")); + pthread_mutex_init(&LOCK_trn_list, MY_MUTEX_INIT_FAST); + my_atomic_rwlock_init(&LOCK_short_trid_to_trn); + my_atomic_rwlock_init(&LOCK_pool); + + DBUG_RETURN(0); +} + +/* + NOTE + this could only be called in the "idle" state - no transaction can be + running. See asserts below. 
+*/ +void trnman_destroy() +{ + DBUG_ENTER("trnman_destroy"); + + if (short_trid_to_active_trn == NULL) /* trnman already destroyed */ + DBUG_VOID_RETURN; + DBUG_ASSERT(trid_to_trn.count == 0); + DBUG_ASSERT(trnman_active_transactions == 0); + DBUG_ASSERT(trnman_committed_transactions == 0); + DBUG_ASSERT(active_list_max.prev == &active_list_min); + DBUG_ASSERT(active_list_min.next == &active_list_max); + DBUG_ASSERT(committed_list_max.prev == &committed_list_min); + DBUG_ASSERT(committed_list_min.next == &committed_list_max); + while (pool) + { + TRN *trn= pool; + pool= pool->next; + DBUG_ASSERT(trn->wt == NULL); + pthread_mutex_destroy(&trn->state_lock); + my_free((void *)trn, MYF(0)); + } + lf_hash_destroy(&trid_to_trn); + DBUG_PRINT("info", ("pthread_mutex_destroy LOCK_trn_list")); + pthread_mutex_destroy(&LOCK_trn_list); + my_atomic_rwlock_destroy(&LOCK_short_trid_to_trn); + my_atomic_rwlock_destroy(&LOCK_pool); + my_free((void *)(short_trid_to_active_trn+1), MYF(0)); + short_trid_to_active_trn= NULL; + + DBUG_VOID_RETURN; +} + +/* + NOTE + TrID is limited to 6 bytes. Initial value of the generator + is set by the recovery code - being read from the last checkpoint + (or 1 on a first run). 
+*/ +static TrID new_trid() +{ + DBUG_ENTER("new_trid"); + DBUG_ASSERT(global_trid_generator < 0xffffffffffffLL); + DBUG_PRINT("info", ("safe_mutex_assert_owner LOCK_trn_list")); + safe_mutex_assert_owner(&LOCK_trn_list); + DBUG_RETURN(++global_trid_generator); +} + /* reserve a short id for 'trn': scan short_trid_to_active_trn for an empty slot, starting at an index computed from global_trid_generator and the address of trn and wrapping around to 1 at the end of the array; the slot is claimed with a compare-and-swap and its index is returned. Loops until a free slot is found. */ +static uint get_short_trid(TRN *trn) +{ + int i= (int) ((global_trid_generator + (intptr)trn) * 312089 % + SHORT_TRID_MAX) + 1; + uint res=0; + + for ( ; !res ; i= 1) + { + my_atomic_rwlock_wrlock(&LOCK_short_trid_to_trn); + for ( ; i <= SHORT_TRID_MAX; i++) /* the range is [1..SHORT_TRID_MAX] */ + { + void *tmp= NULL; + if (short_trid_to_active_trn[i] == NULL && + my_atomic_casptr((void **)&short_trid_to_active_trn[i], &tmp, trn)) + { + res= i; + break; + } + } + my_atomic_rwlock_wrunlock(&LOCK_short_trid_to_trn); + } + return res; +} + +/** + Allocates and initializes a new TRN object + + @note the 'wt' parameter can only be 0 in a single-threaded code (or, + generally, where threads cannot block each other), otherwise the + first call to the deadlock detector will sigsegv. + + @return the new TRN, or 0 on failure +*/ + +TRN *trnman_new_trn(WT_THD *wt) +{ + int res; + TRN *trn; + union { TRN *trn; void *v; } tmp; + DBUG_ENTER("trnman_new_trn"); + + /* + we have a mutex, to do simple things under it - allocate a TRN, + increment trnman_active_transactions, set trn->min_read_from. + + Note that all the above is fast. generating short_id may be slow, + as it involves scanning a large array - so it's done outside of the + mutex. + */ + + DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list")); + pthread_mutex_lock(&LOCK_trn_list); + + /* Allocating a new TRN structure */ + tmp.trn= pool; + /* + Popping an unused TRN from the pool + (ABA isn't possible, we're behind a mutex) + */ + my_atomic_rwlock_wrlock(&LOCK_pool); + while (tmp.trn && !my_atomic_casptr((void **)(char*) &pool, &tmp.v, + (void *)tmp.trn->next)) + /* no-op */; + my_atomic_rwlock_wrunlock(&LOCK_pool); + + /* Nothing in the pool ? 
Allocate a new one */ + if (!(trn= tmp.trn)) + { + /* + trn should be completely initialized at create time to allow + one to keep a known state on it. + (Like redo_lns, which is assumed to be 0 at start of row handling + and reset to zero before end of row handling) + */ + trn= (TRN *)my_malloc(sizeof(TRN), MYF(MY_WME | MY_ZEROFILL)); + if (unlikely(!trn)) + { + DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list")); + pthread_mutex_unlock(&LOCK_trn_list); + return 0; /* NOTE(review): plain return although trnman_new_trn() did DBUG_ENTER; DBUG_RETURN(0) is probably intended - confirm */ + } + trnman_allocated_transactions++; + pthread_mutex_init(&trn->state_lock, MY_MUTEX_INIT_FAST); + } + trn->wt= wt; + trn->pins= lf_hash_get_pins(&trid_to_trn); + if (!trn->pins) + { + /* no pins available: give the TRN back to the pool and fail */ + trnman_free_trn(trn); + pthread_mutex_unlock(&LOCK_trn_list); + return 0; /* NOTE(review): same as above, DBUG_RETURN(0) is probably intended - confirm */ + } + + trnman_active_transactions++; + + trn->min_read_from= active_list_min.next->trid; + + trn->trid= new_trid(); + + /* append trn at the end of the active list, just before active_list_max */ + trn->next= &active_list_max; + trn->prev= active_list_max.prev; + active_list_max.prev= trn->prev->next= trn; + trid_min_read_from= active_list_min.next->min_read_from; + DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list")); + pthread_mutex_unlock(&LOCK_trn_list); + + if (unlikely(!trn->min_read_from)) + { + /* + We are the only transaction. 
Set min_read_from so that we can read + our own rows + */ + trn->min_read_from= trn->trid + 1; + } + + /* no other transaction can read changes done by this one */ + trn->commit_trid= MAX_TRID; + trn->rec_lsn= trn->undo_lsn= trn->first_undo_lsn= 0; + trn->used_tables= 0; + + trn->locked_tables= 0; + trn->flags= 0; + + /* + only after the following function TRN is considered initialized, + so it must be done the last + */ + pthread_mutex_lock(&trn->state_lock); + trn->short_id= get_short_trid(trn); + pthread_mutex_unlock(&trn->state_lock); + + res= lf_hash_insert(&trid_to_trn, trn->pins, &trn); + DBUG_ASSERT(res <= 0); + if (res) + { + trnman_end_trn(trn, 0); + return 0; + } + + DBUG_PRINT("exit", ("trn: 0x%lx trid: 0x%lu", + (ulong) trn, (ulong) trn->trid)); + + DBUG_RETURN(trn); +} + +/* + remove a trn from the active list. + if necessary - move to committed list and set commit_trid + + NOTE + Locks are released at the end. In particular, after placing the + transaction in commit list, and after setting commit_trid. It's + important, as commit_trid affects visibility. Locks don't affect + anything they simply delay execution of other threads - they could be + released arbitrarily late. In other words, when locks are released it + serves as a start banner for other threads, they start to run. So + everything they may need must be ready at that point. 
+ + RETURN + 0 ok + 1 error +*/ +my_bool trnman_end_trn(TRN *trn, my_bool commit) +{ + int res= 1; + uint16 cached_short_id= trn->short_id; /* we have to cache it, see below */ + TRN *free_me= 0; + LF_PINS *pins= trn->pins; + DBUG_ENTER("trnman_end_trn"); + DBUG_PRINT("enter", ("trn=0x%lx commit=%d", (ulong) trn, commit)); + + /* if a rollback, all UNDO records should have been executed */ + DBUG_ASSERT(commit || trn->undo_lsn == 0); + DBUG_ASSERT(trn != &dummy_transaction_object); + DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list")); + + pthread_mutex_lock(&LOCK_trn_list); + + /* remove from active list */ + trn->next->prev= trn->prev; + trn->prev->next= trn->next; + + /* + if trn was the oldest active transaction, now that it goes away there + may be committed transactions in the list which no active transaction + needs to bother about - clean up the committed list + */ + if (trn->prev == &active_list_min) + { + uint free_me_count; + TRN *t; + for (t= committed_list_min.next, free_me_count= 0; + t->commit_trid < active_list_min.next->min_read_from; + t= t->next, free_me_count++) /* no-op */; + + DBUG_ASSERT((t != committed_list_min.next && free_me_count > 0) || + (t == committed_list_min.next && free_me_count == 0)); + /* found transactions committed before the oldest active one */ + if (t != committed_list_min.next) + { + free_me= committed_list_min.next; + committed_list_min.next= t; + t->prev->next= 0; + t->prev= &committed_list_min; + trnman_committed_transactions-= free_me_count; + } + } + + pthread_mutex_lock(&trn->state_lock); + if (commit) + trn->commit_trid= global_trid_generator; + wt_thd_release_self(trn); + pthread_mutex_unlock(&trn->state_lock); + + /* + if transaction is committed and it was not the only active transaction - + add it to the committed list + */ + if (commit && active_list_min.next != &active_list_max) + { + trn->next= &committed_list_max; + trn->prev= committed_list_max.prev; + trnman_committed_transactions++; + 
committed_list_max.prev= trn->prev->next= trn; + } + else + { + trn->next= free_me; + free_me= trn; + } + trid_min_read_from= active_list_min.next->min_read_from; + + if ((*trnman_end_trans_hook)(trn, commit, + active_list_min.next != &active_list_max)) + res= -1; + trnman_active_transactions--; + + DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list")); + pthread_mutex_unlock(&LOCK_trn_list); + + /* + the rest is done outside of a critical section + + note that we don't own trn anymore, it may be in a shared list now. + Thus, we cannot dereference it, and must use cached_short_id below. + */ + my_atomic_rwlock_rdlock(&LOCK_short_trid_to_trn); + my_atomic_storeptr((void **)&short_trid_to_active_trn[cached_short_id], 0); + my_atomic_rwlock_rdunlock(&LOCK_short_trid_to_trn); + + /* + we, under the mutex, removed going-in-free_me transactions from the + active and committed lists, thus nobody else may see them when it scans + those lists, and thus nobody may want to free them. Now we don't + need a mutex to access free_me list + */ + /* QQ: send them to the purge thread */ + while (free_me) + { + TRN *t= free_me; + free_me= free_me->next; + + /* ignore OOM. it's harmless, and we can do nothing here anyway */ + (void)lf_hash_delete(&trid_to_trn, pins, &t->trid, sizeof(TrID)); + + trnman_free_trn(t); + } + + lf_hash_put_pins(pins); + + DBUG_RETURN(res < 0); +} + +/* + free a trn (add to the pool, that is) + note - we can never really free() a TRN if there's at least one other + running transaction - see, e.g., how lock waits are implemented in + lockman.c + The same is true for other lock-free data structures too. We may need some + kind of FLUSH command to reset them all - ensuring that no transactions are + running. It may even be called automatically on checkpoints if no + transactions are running. +*/ +static void trnman_free_trn(TRN *trn) +{ + /* + union is to solve strict aliasing issue. 
+ without it gcc 3.4.3 doesn't notice that updating *(void **)&tmp + modifies the value of tmp. + */ + union { TRN *trn; void *v; } tmp; + + pthread_mutex_lock(&trn->state_lock); + trn->short_id= 0; + pthread_mutex_unlock(&trn->state_lock); + + tmp.trn= pool; + + my_atomic_rwlock_wrlock(&LOCK_pool); + do + { + /* + without this volatile cast gcc-3.4.4 moves the assignment + down after the loop at -O2 + */ + *(TRN * volatile *)&(trn->next)= tmp.trn; + } while (!my_atomic_casptr((void **)(char*)&pool, &tmp.v, trn)); + my_atomic_rwlock_wrunlock(&LOCK_pool); +} + +/* + NOTE + here we access the hash in a lock-free manner. + It's safe, a 'found' TRN can never be freed/reused before we access it. + In fact, it cannot be freed before 'trn' ends, because a 'found' TRN + can only be removed from the hash when: + found->commit_trid < ALL (trn->min_read_from) + that is, at least + found->commit_trid < trn->min_read_from + but + found->trid >= trn->min_read_from + and + found->commit_trid > found->trid + + RETURN + 1 can + 0 cannot + -1 error (OOM) +*/ +int trnman_can_read_from(TRN *trn, TrID trid) +{ + TRN **found; + my_bool can; + LF_REQUIRE_PINS(3); + + if (trid < trn->min_read_from) + return 1; /* Row is visible by all transactions in the system */ + + if (trid >= trn->trid) + { + /* + We have now two cases + trid > trn->trid, in which case the row is from a new transaction + and not visible, in which case we should return 0. + trid == trn->trid in which case the row is from the current transaction + and we should return 1 + */ + return trid == trn->trid; + } + + found= lf_hash_search(&trid_to_trn, trn->pins, &trid, sizeof(trid)); + if (found == NULL) + return 0; /* not in the hash of transactions = cannot read */ + if (found == MY_ERRPTR) + return -1; + + can= (*found)->commit_trid < trn->trid; + lf_hash_search_unpin(trn->pins); + return can; +} + +/** + Finds a TRN by its TrID + + @param trn current trn. 
Needed for pinning pointers (see lf_pin) + @param trid trid to search for + + @return found trn or 0 + + @note that trn is returned with its state locked! +*/ +TRN *trnman_trid_to_trn(TRN *trn, TrID trid) +{ + TRN **found; + LF_REQUIRE_PINS(3); + + if (trid < trn->min_read_from) + return 0; /* it's committed eons ago */ + + found= lf_hash_search(&trid_to_trn, trn->pins, &trid, sizeof(trid)); + if (found == NULL || found == MY_ERRPTR) + return 0; /* no luck */ + + /* we've found something; take its state_lock before looking at short_id */ + pthread_mutex_lock(&(*found)->state_lock); + + if ((*found)->short_id == 0) + { + pthread_mutex_unlock(&(*found)->state_lock); + lf_hash_search_unpin(trn->pins); + return 0; /* but it was a ghost (short_id is reset to 0 by trnman_free_trn()) */ + } + lf_hash_search_unpin(trn->pins); + + /* Gotcha! The state_lock of the found TRN is still held here; the caller gets it locked. NOTE(review): *found is dereferenced after lf_hash_search_unpin() - confirm that the hash element cannot be reclaimed at this point, or copy *found into a local before unpinning */ + return *found; +} + +/* TODO: the stubs below are waiting for savepoints to be implemented */ + +void trnman_new_statement(TRN *trn __attribute__ ((unused))) +{ +} + +void trnman_rollback_statement(TRN *trn __attribute__ ((unused))) +{ +} + + +/** + @brief Allocates buffers and stores in them some info about transactions + + Does the allocation because the caller cannot know the size itself. + Memory freeing is to be done by the caller (if the "str" member of the + LEX_STRING is not NULL). + The caller has the intention of doing checkpoints. 
+ + @param[out] str_act pointer to where the allocated buffer, + and its size, will be put; buffer will be filled + with info about active transactions + @param[out] str_com pointer to where the allocated buffer, + and its size, will be put; buffer will be filled + with info about committed transactions + @param[out] min_first_undo_lsn pointer to where the minimum + first_undo_lsn of all transactions will be put + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com, + LSN *min_rec_lsn, LSN *min_first_undo_lsn) +{ + my_bool error; + TRN *trn; + char *ptr; + uint stored_transactions= 0; + LSN minimum_rec_lsn= LSN_MAX, minimum_first_undo_lsn= LSN_MAX; + DBUG_ENTER("trnman_collect_transactions"); + + DBUG_ASSERT((NULL == str_act->str) && (NULL == str_com->str)); + + /* validate the use of read_non_atomic() in general: */ + compile_time_assert((sizeof(LSN) == 8) && (sizeof(LSN_WITH_FLAGS) == 8)); + pthread_mutex_lock(&LOCK_trn_list); + str_act->length= 2 + /* number of active transactions */ + LSN_STORE_SIZE + /* minimum of their rec_lsn */ + TRANSID_SIZE + /* current TrID generator value */ + (2 + /* short id */ + 6 + /* long id */ + LSN_STORE_SIZE + /* undo_lsn */ +#ifdef MARIA_VERSIONING /* not enabled yet */ + LSN_STORE_SIZE + /* undo_purge_lsn */ +#endif + LSN_STORE_SIZE /* first_undo_lsn */ + ) * trnman_active_transactions; + str_com->length= 4 + /* number of committed transactions */ + (6 + /* long id */ +#ifdef MARIA_VERSIONING /* not enabled yet */ + LSN_STORE_SIZE + /* undo_purge_lsn */ +#endif + LSN_STORE_SIZE /* first_undo_lsn */ + ) * trnman_committed_transactions; + if ((NULL == (str_act->str= my_malloc(str_act->length, MYF(MY_WME)))) || + (NULL == (str_com->str= my_malloc(str_com->length, MYF(MY_WME))))) + goto err; + /* First, the active transactions */ + ptr= str_act->str + 2 + LSN_STORE_SIZE; + transid_store(ptr, global_trid_generator); + ptr+= TRANSID_SIZE; + 
for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next) + { + uint sid; + LSN rec_lsn, undo_lsn, first_undo_lsn; + pthread_mutex_lock(&trn->state_lock); + sid= trn->short_id; + pthread_mutex_unlock(&trn->state_lock); + if (sid == 0) + { + /* + Not even inited, has done nothing. Or it is the + dummy_transaction_object, which does only non-transactional + immediate-sync operations (CREATE/DROP/RENAME/REPAIR TABLE), and so + can be forgotten for Checkpoint. + */ + continue; + } + /* needed for low-water mark calculation */ + if (((rec_lsn= lsn_read_non_atomic(trn->rec_lsn)) > 0) && + (cmp_translog_addr(rec_lsn, minimum_rec_lsn) < 0)) + minimum_rec_lsn= rec_lsn; + /* + If trn has not logged LOGREC_LONG_TRANSACTION_ID, this trn will be + discovered when seeing that log record which is for sure located after + checkpoint_start_log_horizon. + */ + if ((LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn) & + TRANSACTION_LOGGED_LONG_ID) == 0) + continue; + /* + On the other hand, if undo_lsn is LSN_IMPOSSIBLE, trn may later log + records; so we must include trn in the checkpoint now, because we cannot + count on LOGREC_LONG_TRANSACTION_ID (as we are already past it). 
+ */ + undo_lsn= trn->undo_lsn; + stored_transactions++; + int2store(ptr, sid); + ptr+= 2; + int6store(ptr, trn->trid); + ptr+= 6; + lsn_store(ptr, undo_lsn); /* needed for rollback */ + ptr+= LSN_STORE_SIZE; + /* needed for low-water mark calculation */ + if (((first_undo_lsn= lsn_read_non_atomic(trn->first_undo_lsn)) > 0) && + (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0)) + minimum_first_undo_lsn= first_undo_lsn; + lsn_store(ptr, first_undo_lsn); + ptr+= LSN_STORE_SIZE; +#ifdef MARIA_VERSIONING /* not enabled yet */ + /* to know where purging should start (last delete of this trn) */ + lsn_store(ptr, trn->undo_purge_lsn); + ptr+= LSN_STORE_SIZE; +#endif + /** + @todo RECOVERY: add a comment explaining why we can dirtily read some + vars, inspired by the text of "assumption 8" in WL#3072 + */ + } + str_act->length= ptr - str_act->str; /* as we maybe over-estimated */ + ptr= str_act->str; + DBUG_PRINT("info",("collected %u active transactions", + (uint)stored_transactions)); + int2store(ptr, stored_transactions); + ptr+= 2; + /* this LSN influences how REDOs for any page can be ignored by Recovery */ + lsn_store(ptr, minimum_rec_lsn); + /* one day there will also be a list of prepared transactions */ + /* do the same for committed ones */ + ptr= str_com->str; + int4store(ptr, trnman_committed_transactions); + ptr+= 4; + DBUG_PRINT("info",("collected %u committed transactions", + (uint)trnman_committed_transactions)); + for (trn= committed_list_min.next; trn != &committed_list_max; + trn= trn->next) + { + LSN first_undo_lsn; + int6store(ptr, trn->trid); + ptr+= 6; +#ifdef MARIA_VERSIONING /* not enabled yet */ + lsn_store(ptr, trn->undo_purge_lsn); + ptr+= LSN_STORE_SIZE; +#endif + first_undo_lsn= LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn); + if (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0) + minimum_first_undo_lsn= first_undo_lsn; + lsn_store(ptr, first_undo_lsn); + ptr+= LSN_STORE_SIZE; + } + /* + TODO: if we see there exists 
no transaction (active and committed) we can + tell the lock-free structures to do some freeing (my_free()). + */ + error= 0; + *min_rec_lsn= minimum_rec_lsn; + *min_first_undo_lsn= minimum_first_undo_lsn; + goto end; +err: + error= 1; +end: + pthread_mutex_unlock(&LOCK_trn_list); + DBUG_RETURN(error); +} + + /** Creates a TRN through trnman_new_trn() and forces its trid to 'longid' and its short id to 'shortid'. Asserts that it runs during recovery and not multi-threaded. Returns the new TRN, or NULL if trnman_new_trn() failed. */ +TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid) +{ + TrID old_trid_generator= global_trid_generator; + TRN *trn; + DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded); + global_trid_generator= longid-1; /* force a correct trid in the new trn */ + if (unlikely((trn= trnman_new_trn(NULL)) == NULL)) + return NULL; /* NOTE(review): global_trid_generator is left at longid-1 on this path, old_trid_generator is not restored - confirm this is intended */ + /* undo the side effects of trnman_new_trn(): restore the trid generator (keeping it >= longid) and move trn from the short id it was given to the requested 'shortid' */ + global_trid_generator= old_trid_generator; + set_if_bigger(global_trid_generator, longid); + short_trid_to_active_trn[trn->short_id]= 0; + DBUG_ASSERT(short_trid_to_active_trn[shortid] == NULL); + short_trid_to_active_trn[shortid]= trn; + trn->short_id= shortid; + return trn; +} + + /** Returns the first transaction of the active list, or NULL if the list is empty. LOCK_trn_list is not taken here. */ +TRN *trnman_get_any_trn() +{ + TRN *trn= active_list_min.next; + return (trn != &active_list_max) ? trn : NULL; +} + + +/** + Returns the minimum existing transaction id. May return a too small + number in race conditions, but this is ok as the value is used to + remove not visible transid from index/rows. +*/ + +TrID trnman_get_min_trid() +{ + return trid_min_read_from; +} + + +/** + Returns the minimum possible transaction id + + @notes + If there is no transactions running, returns number for next running + transaction. + If one has an active transaction, the returned number will be less or + equal to this. If one is not running in a transaction one will ge the + number for the next started transaction. This is used in create table + to get a safe minimum trid to use. 
+*/ + +TrID trnman_get_min_safe_trid() +{ + TrID trid; + pthread_mutex_lock(&LOCK_trn_list); + trid= min(active_list_min.next->min_read_from, + global_trid_generator); + pthread_mutex_unlock(&LOCK_trn_list); + return trid; +} + + +/** + Returns maximum transaction id given to a transaction so far. + Returns 0 if short_trid_to_active_trn is NULL (trnman not initialized). +*/ + +TrID trnman_get_max_trid() +{ + TrID id; + if (short_trid_to_active_trn == NULL) + return 0; + pthread_mutex_lock(&LOCK_trn_list); + id= global_trid_generator; + pthread_mutex_unlock(&LOCK_trn_list); + return id; +} + +/** + @brief Check if there exists an active transaction between two commit_id's + + @param min_id lower bound, exclusive (trid > min_id) + @param max_id upper bound, inclusive (trid <= max_id) + @param trnman_is_locked non-zero if the caller already holds LOCK_trn_list; + otherwise the mutex is taken and released here + + @todo + Improve speed of this. + - Store transactions in tree or skip list + - Have a function for copying all active transaction id's to b-tree + and use b-tree for checking states. This could be a big win + for checkpoint that will call this function for a lot of objects. + + @return + 0 No transaction exists + 1 There is at least one active transaction in the given range +*/ + +my_bool trnman_exists_active_transactions(TrID min_id, TrID max_id, + my_bool trnman_is_locked) +{ + TRN *trn; + my_bool ret= 0; + + if (!trnman_is_locked) + pthread_mutex_lock(&LOCK_trn_list); + safe_mutex_assert_owner(&LOCK_trn_list); + for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next) + { + /* + We use <= for max_id as max_id is a commit_trid and trn->trid + is transaction id. When calculating commit_trid we use the + current value of global_trid_generator. global_trid_generator is + incremented for each new transaction. + + For example, assuming we have + min_id = 5 + max_id = 10 + + A trid of value 5 can't see the history event between 5 & 10 + at it vas started before min_id 5 was committed. + A trid of value 10 can't see the next history event (max_id = 10) + as it started before this was committed. In this case it must use + the this event. 
+ */ + if (trn->trid > min_id && trn->trid <= max_id) + { + ret= 1; + break; + } + } + if (!trnman_is_locked) + pthread_mutex_unlock(&LOCK_trn_list); + return ret; +} + + +/** + lock transaction list +*/ + +void trnman_lock() +{ + pthread_mutex_lock(&LOCK_trn_list); +} + + +/** + unlock transaction list +*/ + +void trnman_unlock() +{ + pthread_mutex_unlock(&LOCK_trn_list); +} + + +/** + Is trman initialized +*/ + +my_bool trman_is_inited() +{ + return (short_trid_to_active_trn != NULL); +} diff --git a/storage/maria/trnman.h b/storage/maria/trnman.h new file mode 100644 index 00000000000..afe01d4ad10 --- /dev/null +++ b/storage/maria/trnman.h @@ -0,0 +1,67 @@ +/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _trnman_h +#define _trnman_h + +C_MODE_START + +#include <lf.h> +#include "trnman_public.h" +#include "ma_loghandler_lsn.h" + +/** + trid - 6 uchar transaction identifier. Assigned when a transaction + is created. Transaction can always be identified by its trid, + even after transaction has ended. + + short_id - 2-byte transaction identifier, identifies a running + transaction, is reassigned when transaction ends. + + when short_id is 0, TRN is not initialized, for all practical purposes + it could be considered unused. 
+ + when commit_trid is MAX_TRID the transaction is running, otherwise it's + committed. + + state_lock mutex protects the state of a TRN, that is whether a TRN + is committed/running/unused. Meaning that modifications of short_id and + commit_trid happen under this mutex. +*/ + +struct st_ma_transaction +{ + LF_PINS *pins; + WT_THD *wt; + pthread_mutex_t state_lock; + void *used_tables; /**< Tables used by transaction */ + TRN *next, *prev; + TrID trid, min_read_from, commit_trid; + LSN rec_lsn, undo_lsn; + LSN_WITH_FLAGS first_undo_lsn; + uint locked_tables; + uint16 short_id; + uint16 flags; /**< Various flags */ +}; + +#define TRANSACTION_LOGGED_LONG_ID ULL(0x8000000000000000) +#define MAX_TRID (~(TrID)0) + +extern WT_RESOURCE_TYPE ma_rc_dup_unique; + +C_MODE_END + +#endif + diff --git a/storage/maria/trnman_public.h b/storage/maria/trnman_public.h new file mode 100644 index 00000000000..9523eb5de8f --- /dev/null +++ b/storage/maria/trnman_public.h @@ -0,0 +1,85 @@ +/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + +/* + External definitions for trnman.h + We need to split this into two files as gcc 4.1.2 gives error if it tries + to include my_atomic.h in C++ code. 
+*/ + +#ifndef _trnman_public_h +#define _trnman_public_h + +#include "ma_loghandler_lsn.h" +#include <waiting_threads.h> + +C_MODE_START +typedef uint64 TrID; /* our TrID is 6 bytes */ +typedef struct st_ma_transaction TRN; + +#define SHORT_TRID_MAX 65535 + +extern uint trnman_active_transactions, trnman_allocated_transactions; +extern TRN dummy_transaction_object; +extern my_bool (*trnman_end_trans_hook)(TRN *trn, my_bool commit, + my_bool active_transactions); + +int trnman_init(TrID); +void trnman_destroy(void); +TRN *trnman_new_trn(WT_THD *wt); +my_bool trnman_end_trn(TRN *trn, my_bool commit); +#define trnman_commit_trn(T) trnman_end_trn(T, TRUE) +#define trnman_abort_trn(T) trnman_end_trn(T, FALSE) +#define trnman_rollback_trn(T) trnman_end_trn(T, FALSE) +int trnman_can_read_from(TRN *trn, TrID trid); +TRN *trnman_trid_to_trn(TRN *trn, TrID trid); +void trnman_new_statement(TRN *trn); +void trnman_rollback_statement(TRN *trn); +my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com, + LSN *min_rec_lsn, + LSN *min_first_undo_lsn); + +uint trnman_increment_locked_tables(TRN *trn); +uint trnman_decrement_locked_tables(TRN *trn); +uint trnman_has_locked_tables(TRN *trn); +void trnman_reset_locked_tables(TRN *trn, uint locked_tables); +TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid); +TRN *trnman_get_any_trn(void); +TrID trnman_get_min_trid(void); +TrID trnman_get_max_trid(void); +TrID trnman_get_min_safe_trid(); +my_bool trnman_exists_active_transactions(TrID min_id, TrID max_id, + my_bool trnman_is_locked); +#define TRANSID_SIZE 6 +#define transid_store(dst, id) int6store(dst,id) +#define transid_korr(P) uint6korr(P) +void trnman_lock(); +void trnman_unlock(); +my_bool trman_is_inited(); +#ifdef EXTRA_DEBUG +uint16 trnman_get_flags(TRN *); +void trnman_set_flags(TRN *, uint16 flags); +#else +#define trnman_get_flags(A) 0 +#define trnman_set_flags(A, B) do { } while (0) +#endif + +/* Flag bits */ +#define 
TRN_STATE_INFO_LOGGED 1 /* Query is logged */ +#define TRN_STATE_TABLES_CAN_CHANGE 2 /* Things can change during trans. */ + +C_MODE_END +#endif diff --git a/storage/maria/unittest/CMakeLists.txt b/storage/maria/unittest/CMakeLists.txt new file mode 100644 index 00000000000..fe6327c6ea3 --- /dev/null +++ b/storage/maria/unittest/CMakeLists.txt @@ -0,0 +1,95 @@ +# Copyright (C) 2007 MySQL AB +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/zlib + ${CMAKE_SOURCE_DIR}/unittest/mytap) +LINK_LIBRARIES(aria myisam mytap mysys dbug strings wsock32 zlib) + +ADD_EXECUTABLE(ma_control_file-t ma_control_file-t.c) +ADD_EXECUTABLE(trnman-t trnman-t.c) +ADD_EXECUTABLE(ma_test_loghandler-t + ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +ADD_EXECUTABLE(ma_test_loghandler_multigroup-t + ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c) +ADD_EXECUTABLE(ma_test_loghandler_multithread-t + ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +ADD_EXECUTABLE(ma_test_loghandler_pagecache-t + ma_test_loghandler_pagecache-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +ADD_EXECUTABLE(ma_test_loghandler_long-t + ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) 
+SET_TARGET_PROPERTIES(ma_test_loghandler_long-t PROPERTIES COMPILE_FLAGS "-DLONG_LOG_TEST") + +ADD_EXECUTABLE(ma_test_loghandler_noflush-t + ma_test_loghandler_noflush-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +ADD_EXECUTABLE(ma_test_loghandler_first_lsn-t + ma_test_loghandler_first_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +ADD_EXECUTABLE(ma_test_loghandler_max_lsn-t + ma_test_loghandler_max_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +ADD_EXECUTABLE(ma_test_loghandler_purge-t + ma_test_loghandler_purge-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +ADD_EXECUTABLE(ma_test_loghandler_readonly-t + ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c) +SET_TARGET_PROPERTIES(ma_test_loghandler_readonly-t PROPERTIES COMPILE_FLAGS "-DREADONLY_TEST") +ADD_EXECUTABLE(ma_test_loghandler_nologs-t + ma_test_loghandler_nologs-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) + +SET(ma_pagecache_single_src ma_pagecache_single.c test_file.c test_file.h) +SET(ma_pagecache_consist_src ma_pagecache_consist.c test_file.c test_file.h) +SET(ma_pagecache_common_cppflags "-DEXTRA_DEBUG -DPAGECACHE_DEBUG -DMAIN") + +ADD_EXECUTABLE(ma_pagecache_single_1k-t ${ma_pagecache_single_src}) +SET_TARGET_PROPERTIES(ma_pagecache_single_1k-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024") + +ADD_EXECUTABLE(ma_pagecache_single_8k-t ${ma_pagecache_single_src}) +SET_TARGET_PROPERTIES(ma_pagecache_single_8k-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=8192") + +ADD_EXECUTABLE(ma_pagecache_single_64k-t ${ma_pagecache_single_src}) +SET_TARGET_PROPERTIES(ma_pagecache_single_64k-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536") + +ADD_EXECUTABLE(ma_pagecache_consist_1k-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_1k-t + PROPERTIES COMPILE_FLAGS 
"${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024") + +ADD_EXECUTABLE(ma_pagecache_consist_64k-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_64k-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536") + +ADD_EXECUTABLE(ma_pagecache_consist_1kHC-t + ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_1kHC-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_HIGH_CONCURENCY") +ADD_EXECUTABLE(ma_pagecache_consist_64kHC-t + ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_64kHC-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_HIGH_CONCURENCY") +ADD_EXECUTABLE(ma_pagecache_consist_1kRD-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_1kRD-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_READERS") +ADD_EXECUTABLE(ma_pagecache_consist_64kRD-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_64kRD-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_READERS") +ADD_EXECUTABLE(ma_pagecache_consist_1kWR-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_1kWR-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_WRITERS") +ADD_EXECUTABLE(ma_pagecache_consist_64kWR-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_64kWR-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_WRITERS") +ADD_EXECUTABLE(ma_pagecache_rwconsist_1k-t ma_pagecache_rwconsist.c) +SET_TARGET_PROPERTIES(ma_pagecache_rwconsist_1k-t PROPERTIES COMPILE_FLAGS "-DTEST_PAGE_SIZE=1024") +ADD_EXECUTABLE(ma_pagecache_rwconsist2_1k-t ma_pagecache_rwconsist2.c) +SET_TARGET_PROPERTIES(ma_pagecache_rwconsist2_1k-t PROPERTIES COMPILE_FLAGS "-DTEST_PAGE_SIZE=1024") diff --git 
a/storage/maria/unittest/Makefile.am b/storage/maria/unittest/Makefile.am new file mode 100644 index 00000000000..b5bc8587066 --- /dev/null +++ b/storage/maria/unittest/Makefile.am @@ -0,0 +1,115 @@ +# Copyright (C) 2006-2008 MySQL AB +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +AM_CPPFLAGS = @ZLIB_INCLUDES@ -I$(top_builddir)/include \ + -I$(top_srcdir)/include -I$(top_srcdir)/unittest/mytap +INCLUDES = @ZLIB_INCLUDES@ -I$(top_builddir)/include \ + -I$(top_srcdir)/include -I$(top_srcdir)/unittest/mytap +EXTRA_DIST= ma_test_all-t CMakeLists.txt \ + ma_test_recovery.pl ma_test_recovery.expected +# Only reason to link with libmyisam.a here is that it's where some fulltext +# pieces are (but soon we'll remove fulltext dependencies from Aria). 
+LDADD= $(top_builddir)/unittest/mytap/libmytap.a \ + $(top_builddir)/storage/maria/libaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +noinst_PROGRAMS = ma_control_file-t trnman-t \ + ma_pagecache_single_1k-t ma_pagecache_single_8k-t \ + ma_pagecache_single_64k-t \ + ma_pagecache_consist_1k-t \ + ma_pagecache_consist_64k-t \ + ma_pagecache_consist_1kHC-t \ + ma_pagecache_consist_64kHC-t \ + ma_pagecache_consist_1kRD-t \ + ma_pagecache_consist_64kRD-t \ + ma_pagecache_consist_1kWR-t \ + ma_pagecache_consist_64kWR-t \ + ma_pagecache_rwconsist_1k-t \ + ma_pagecache_rwconsist2_1k-t \ + ma_test_loghandler-t \ + ma_test_loghandler_multigroup-t \ + ma_test_loghandler_multithread-t \ + ma_test_loghandler_multiflush-t \ + ma_test_loghandler_pagecache-t \ + ma_test_loghandler_long-t \ + ma_test_loghandler_noflush-t \ + ma_test_loghandler_first_lsn-t \ + ma_test_loghandler_max_lsn-t \ + ma_test_loghandler_purge-t \ + ma_test_loghandler_readonly-t\ + ma_test_loghandler_nologs-t + +ma_test_loghandler_t_SOURCES = ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c +ma_test_loghandler_multigroup_t_SOURCES = ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c sequence_storage.h +ma_test_loghandler_multithread_t_SOURCES = ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c +ma_test_loghandler_multiflush_t_SOURCES = ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c +ma_test_loghandler_multiflush_t_CPPFLAGS = -DMULTIFLUSH_TEST +ma_test_loghandler_pagecache_t_SOURCES = ma_test_loghandler_pagecache-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c +ma_test_loghandler_long_t_SOURCES = ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c +ma_test_loghandler_long_t_CPPFLAGS = -DLONG_LOG_TEST 
+ma_test_loghandler_noflush_t_SOURCES = ma_test_loghandler_noflush-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c +ma_test_loghandler_first_lsn_t_SOURCES = ma_test_loghandler_first_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c +ma_test_loghandler_max_lsn_t_SOURCES = ma_test_loghandler_max_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c +ma_test_loghandler_purge_t_SOURCES = ma_test_loghandler_purge-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c +ma_test_loghandler_readonly_t_SOURCES = ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c sequence_storage.h +ma_test_loghandler_readonly_t_CPPFLAGS = -DREADONLY_TEST +ma_test_loghandler_nologs_t_SOURCES = ma_test_loghandler_nologs-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c + +ma_pagecache_single_src = ma_pagecache_single.c test_file.c test_file.h +ma_pagecache_consist_src = ma_pagecache_consist.c test_file.c test_file.h +ma_pagecache_common_cppflags = -DEXTRA_DEBUG -DPAGECACHE_DEBUG -DMAIN + +ma_pagecache_single_1k_t_SOURCES = $(ma_pagecache_single_src) +ma_pagecache_single_8k_t_SOURCES = $(ma_pagecache_single_src) +ma_pagecache_single_64k_t_SOURCES = $(ma_pagecache_single_src) +ma_pagecache_single_1k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024 +ma_pagecache_single_8k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=8192 +ma_pagecache_single_64k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DBIG + +ma_pagecache_consist_1k_t_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_1k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024 +ma_pagecache_consist_64k_t_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_64k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 + +ma_pagecache_consist_1kHC_t_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_1kHC_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024 
-DTEST_HIGH_CONCURENCY +ma_pagecache_consist_64kHC_t_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_64kHC_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DTEST_HIGH_CONCURENCY + +ma_pagecache_consist_1kRD_t_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_1kRD_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024 -DTEST_READERS +ma_pagecache_consist_64kRD_t_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_64kRD_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DTEST_READERS + +ma_pagecache_consist_1kWR_t_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_1kWR_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024 -DTEST_WRITERS +ma_pagecache_consist_64kWR_t_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_64kWR_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DTEST_WRITERS + +ma_pagecache_rwconsist_1k_t_SOURCES = ma_pagecache_rwconsist.c +ma_pagecache_rwconsist_1k_t_CPPFLAGS = -DTEST_PAGE_SIZE=1024 +ma_pagecache_rwconsist2_1k_t_SOURCES = ma_pagecache_rwconsist2.c +ma_pagecache_rwconsist2_1k_t_CPPFLAGS = -DTEST_PAGE_SIZE=1024 + +# the generic lock manager may not be used in the end and lockman1-t crashes, +# and lockman2-t takes at least quarter an hour, +# so we don't build lockman-t and lockman1-t and lockman2-t +CLEANFILES = aria_log_control page_cache_test_file_1 \ + aria_log.???????? + +# Don't update the files from bitkeeper +%::SCCS/s.% diff --git a/storage/maria/unittest/lockman-t.c b/storage/maria/unittest/lockman-t.c new file mode 100644 index 00000000000..9b54a3d8ff9 --- /dev/null +++ b/storage/maria/unittest/lockman-t.c @@ -0,0 +1,308 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + lockman for row and table locks +*/ + +/* #define EXTRA_VERBOSE */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <my_atomic.h> +#include <lf.h> +#include "../lockman.h" + +#define Nlos 100 +LOCK_OWNER loarray[Nlos]; +pthread_mutex_t mutexes[Nlos]; +pthread_cond_t conds[Nlos]; +LOCKMAN lockman; + +#ifndef EXTRA_VERBOSE +#define print_lockhash(X) /* no-op */ +#define DIAG(X) /* no-op */ +#else +#define DIAG(X) diag X +#endif + +LOCK_OWNER *loid2lo(uint16 loid) +{ + return loarray+loid-1; +} + +#define unlock_all(O) diag("lo" #O "> release all locks"); \ + lockman_release_locks(&lockman, loid2lo(O));print_lockhash(&lockman) +#define test_lock(O, R, L, S, RES) \ + ok(lockman_getlock(&lockman, loid2lo(O), R, L) == RES, \ + "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \ + print_lockhash(&lockman) +#define lock_ok_a(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK) +#define lock_ok_i(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE) +#define lock_ok_l(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE) +#define lock_conflict(O, R, L) \ + test_lock(O, R, L, "cannot ", DIDNT_GET_THE_LOCK); + +void test_lockman_simple() +{ + /* simple */ + lock_ok_a(1, 1, S); + lock_ok_i(2, 2, IS); + lock_ok_i(1, 2, IX); + /* lock escalation */ + lock_ok_a(1, 1, X); + lock_ok_i(2, 2, IX); + /* failures */ + lock_conflict(2, 1, X); + unlock_all(2); + lock_ok_a(1, 2, S); + lock_ok_a(1, 2, IS); + lock_ok_a(1, 2, LS); + lock_ok_i(1, 3, IX); + 
lock_ok_a(2, 3, LS); + lock_ok_i(1, 3, IX); + lock_ok_l(2, 3, IS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_conflict(2, 1, S); + lock_ok_a(1, 1, LS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_ok_a(2, 1, LS); + lock_ok_a(1, 1, LS); + lock_ok_i(1, 1, IX); + lock_ok_i(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_i(1, 4, IS); + lock_ok_i(2, 4, IS); + lock_ok_i(3, 4, IS); + lock_ok_a(3, 4, LS); + lock_ok_i(4, 4, IS); + lock_conflict(4, 4, IX); + lock_conflict(2, 4, IX); + lock_ok_a(1, 4, LS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + unlock_all(4); + + lock_ok_i(1, 1, IX); + lock_ok_i(2, 1, IX); + lock_conflict(1, 1, S); + lock_conflict(2, 1, X); + unlock_all(1); + unlock_all(2); +} + +int rt_num_threads; +int litmus; +int thread_number= 0, timeouts= 0; +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= my_getsystime(); + int i; + + thread_number= timeouts= 0; + litmus= 0; + + threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0)); + if (!threads) + { + diag("Out of memory"); + abort(); + } + + diag("Running %s with %d threads, %d iterations... 
", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) + if (pthread_create(threads+i, 0, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= my_getsystime()-now; + ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus); + my_free((void*)threads, MYF(0)); +} + +pthread_mutex_t rt_mutex; +int Nrows= 100; +int Ntables= 10; +int table_lock_ratio= 10; +enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX}; +char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"}; +char *res2str[4]= { + "DIDN'T GET THE LOCK", + "GOT THE LOCK", + "GOT THE LOCK NEED TO LOCK A SUBRESOURCE", + "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"}; +pthread_handler_t test_lockman(void *arg) +{ + int m= (*(int *)arg); + uint x, loid, row, table, res, locklevel, timeout= 0; + LOCK_OWNER *lo; + + pthread_mutex_lock(&rt_mutex); + loid= ++thread_number; + pthread_mutex_unlock(&rt_mutex); + lo= loid2lo(loid); + + for (x= ((int)(intptr)(&m)); m > 0; m--) + { + x= (x*3628273133 + 1500450271) % 9576890767; /* three prime numbers */ + row= x % Nrows + Ntables; + table= row % Ntables; + locklevel= (x/Nrows) & 3; + if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0) + { /* table lock */ + res= lockman_getlock(&lockman, lo, table, lock_array[locklevel]); + DIAG(("loid %2d, table %d, lock %s, res %s", loid, table, + lock2str[locklevel], res2str[res])); + if (res == DIDNT_GET_THE_LOCK) + { + lockman_release_locks(&lockman, lo); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + } + else + { /* row lock */ + locklevel&= 1; + res= lockman_getlock(&lockman, lo, table, lock_array[locklevel + 4]); + DIAG(("loid %2d, row %d, lock %s, res %s", loid, row, + lock2str[locklevel+4], res2str[res])); + switch (res) + { + case DIDNT_GET_THE_LOCK: + lockman_release_locks(&lockman, lo); + DIAG(("loid %2d, release all locks", loid)); + 
timeout++; + continue; + case GOT_THE_LOCK: + continue; + case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE: + /* not implemented, so take a regular lock */ + case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE: + res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]); + DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row, + lock2str[locklevel], res2str[res])); + if (res == DIDNT_GET_THE_LOCK) + { + lockman_release_locks(&lockman, lo); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + continue; + default: + DBUG_ASSERT(0); + } + } + } + + lockman_release_locks(&lockman, lo); + + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + timeouts+= timeout; + if (!rt_num_threads) + diag("number of timeouts: %d", timeouts); + pthread_mutex_unlock(&rt_mutex); + + return 0; +} + +int main() +{ + int i; + + my_init(); + pthread_mutex_init(&rt_mutex, 0); + + plan(35); + + if (my_atomic_initialize()) + return exit_status(); + + + lockman_init(&lockman, &loid2lo, 50); + + for (i= 0; i < Nlos; i++) + { + loarray[i].pins= lf_alloc_get_pins(&lockman.alloc); + loarray[i].all_locks= 0; + loarray[i].waiting_for= 0; + pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST); + pthread_cond_init (&conds[i], 0); + loarray[i].mutex= &mutexes[i]; + loarray[i].cond= &conds[i]; + loarray[i].loid= i+1; + } + + test_lockman_simple(); + +#define CYCLES 10000 +#define THREADS Nlos /* don't change this line */ + + /* mixed load, stress-test with random locks */ + Nrows= 100; + Ntables= 10; + table_lock_ratio= 10; + run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES); + + /* "real-life" simulation - many rows, no table locks */ + Nrows= 1000000; + Ntables= 10; + table_lock_ratio= 0; + run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10); + + for (i= 0; i < Nlos; i++) + { + lockman_release_locks(&lockman, &loarray[i]); + pthread_mutex_destroy(loarray[i].mutex); + 
pthread_cond_destroy(loarray[i].cond); + lf_pinbox_put_pins(loarray[i].pins); + } + + { + ulonglong now= my_getsystime(); + lockman_destroy(&lockman); + now= my_getsystime()-now; + diag("lockman_destroy: %g secs", ((double)now)/1e7); + } + + pthread_mutex_destroy(&rt_mutex); + my_end(0); + return exit_status(); +} + diff --git a/storage/maria/unittest/lockman1-t.c b/storage/maria/unittest/lockman1-t.c new file mode 100644 index 00000000000..ca959c6e6e3 --- /dev/null +++ b/storage/maria/unittest/lockman1-t.c @@ -0,0 +1,334 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + lockman for row locks, tablockman for table locks +*/ + +/* #define EXTRA_VERBOSE */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <my_atomic.h> +#include <lf.h> +#include "../lockman.h" +#include "../tablockman.h" + +#define Nlos 100 +#define Ntbls 10 +LOCK_OWNER loarray[Nlos]; +TABLE_LOCK_OWNER loarray1[Nlos]; +pthread_mutex_t mutexes[Nlos]; +pthread_cond_t conds[Nlos]; +LOCKED_TABLE ltarray[Ntbls]; +LOCKMAN lockman; +TABLOCKMAN tablockman; + +#ifndef EXTRA_VERBOSE +#define print_lo1(X) /* no-op */ +#define DIAG(X) /* no-op */ +#else +#define DIAG(X) diag X +#endif + +LOCK_OWNER *loid2lo(uint16 loid) +{ + return loarray+loid-1; +} +TABLE_LOCK_OWNER *loid2lo1(uint16 loid) +{ + return loarray1+loid-1; +} + +#define unlock_all(O) diag("lo" #O "> release all locks"); \ + tablockman_release_locks(&tablockman, loid2lo1(O)); +#define test_lock(O, R, L, S, RES) \ + ok(tablockman_getlock(&tablockman, loid2lo1(O), <array[R], L) == RES, \ + "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \ + print_lo1(loid2lo1(O)); +#define lock_ok_a(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK) +#define lock_ok_i(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE) +#define lock_ok_l(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE) +#define lock_conflict(O, R, L) \ + test_lock(O, R, L, "cannot ", LOCK_TIMEOUT); + +void test_tablockman_simple() +{ + /* simple */ + lock_ok_a(1, 1, S); + lock_ok_i(2, 2, IS); + lock_ok_i(1, 2, IX); + /* lock escalation */ + lock_ok_a(1, 1, X); + lock_ok_i(2, 2, IX); + /* failures */ + lock_conflict(2, 1, X); + unlock_all(2); + lock_ok_a(1, 2, S); + lock_ok_a(1, 2, IS); + lock_ok_a(1, 2, LS); + lock_ok_i(1, 3, IX); + lock_ok_a(2, 3, LS); + 
lock_ok_i(1, 3, IX); + lock_ok_l(2, 3, IS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_conflict(2, 1, S); + lock_ok_a(1, 1, LS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_ok_a(2, 1, LS); + lock_ok_a(1, 1, LS); + lock_ok_i(1, 1, IX); + lock_ok_i(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_i(1, 4, IS); + lock_ok_i(2, 4, IS); + lock_ok_i(3, 4, IS); + lock_ok_a(3, 4, LS); + lock_ok_i(4, 4, IS); + lock_conflict(4, 4, IX); + lock_conflict(2, 4, IX); + lock_ok_a(1, 4, LS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + unlock_all(4); + + lock_ok_i(1, 1, IX); + lock_ok_i(2, 1, IX); + lock_conflict(1, 1, S); + lock_conflict(2, 1, X); + unlock_all(1); + unlock_all(2); +} + +int rt_num_threads; +int litmus; +int thread_number= 0, timeouts= 0; +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= my_getsystime(); + int i; + + thread_number= timeouts= 0; + litmus= 0; + + threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0)); + if (!threads) + { + diag("Out of memory"); + abort(); + } + + diag("Running %s with %d threads, %d iterations... 
", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) + if (pthread_create(threads+i, 0, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= my_getsystime()-now; + ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus); + my_free((void*)threads, MYF(0)); +} + +pthread_mutex_t rt_mutex; +int Nrows= 100; +int Ntables= 10; +int table_lock_ratio= 10; +enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX}; +char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"}; +char *res2str[]= { + "DIDN'T GET THE LOCK", + "OUT OF MEMORY", + "DEADLOCK", + "LOCK TIMEOUT", + "GOT THE LOCK", + "GOT THE LOCK NEED TO LOCK A SUBRESOURCE", + "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"}; +pthread_handler_t test_lockman(void *arg) +{ + int m= (*(int *)arg); + uint x, loid, row, table, res, locklevel, timeout= 0; + LOCK_OWNER *lo; TABLE_LOCK_OWNER *lo1; DBUG_ASSERT(Ntables <= Ntbls); + + pthread_mutex_lock(&rt_mutex); + loid= ++thread_number; + pthread_mutex_unlock(&rt_mutex); + lo= loid2lo(loid); lo1= loid2lo1(loid); + + for (x= ((int)(intptr)(&m)); m > 0; m--) + { + x= (x*3628273133 + 1500450271) % 9576890767; /* three prime numbers */ + row= x % Nrows + Ntables; + table= row % Ntables; + locklevel= (x/Nrows) & 3; + if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0) + { /* table lock */ + res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel]); + DIAG(("loid %2d, table %d, lock %s, res %s", loid, table, + lock2str[locklevel], res2str[res])); + if (res < GOT_THE_LOCK) + { + lockman_release_locks(&lockman, lo); tablockman_release_locks(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + } + else + { /* row lock */ + locklevel&= 1; + res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel + 4]); + DIAG(("loid %2d, row %d, lock %s, res 
%s", loid, row, + lock2str[locklevel+4], res2str[res])); + switch (res) + { + case GOT_THE_LOCK: + continue; + case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE: + /* not implemented, so take a regular lock */ + case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE: + res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]); + DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row, + lock2str[locklevel], res2str[res])); + if (res == DIDNT_GET_THE_LOCK) + { + lockman_release_locks(&lockman, lo); + tablockman_release_locks(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + continue; + default: + lockman_release_locks(&lockman, lo); tablockman_release_locks(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + } + } + + lockman_release_locks(&lockman, lo); + tablockman_release_locks(&tablockman, lo1); + + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + timeouts+= timeout; + if (!rt_num_threads) + diag("number of timeouts: %d", timeouts); + pthread_mutex_unlock(&rt_mutex); + + return 0; +} + +int main() +{ + int i; + + my_init(); + pthread_mutex_init(&rt_mutex, 0); + + plan(35); + + if (my_atomic_initialize()) + return exit_status(); + + + lockman_init(&lockman, &loid2lo, 50); + tablockman_init(&tablockman, &loid2lo1, 50); + + for (i= 0; i < Nlos; i++) + { + pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST); + pthread_cond_init (&conds[i], 0); + + loarray[i].pins= lf_alloc_get_pins(&lockman.alloc); + loarray[i].all_locks= 0; + loarray[i].waiting_for= 0; + loarray[i].mutex= &mutexes[i]; + loarray[i].cond= &conds[i]; + loarray[i].loid= i+1; + + loarray1[i].active_locks= 0; + loarray1[i].waiting_lock= 0; + loarray1[i].waiting_for= 0; + loarray1[i].mutex= &mutexes[i]; + loarray1[i].cond= &conds[i]; + loarray1[i].loid= i+1; + } + + for (i= 0; i < Ntbls; i++) + { + tablockman_init_locked_table(ltarray+i, Nlos); + } + + test_tablockman_simple(); + 
+#define CYCLES 10000 +#define THREADS Nlos /* don't change this line */ + + /* mixed load, stress-test with random locks */ + Nrows= 100; + Ntables= 10; + table_lock_ratio= 10; + run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES); + + /* "real-life" simulation - many rows, no table locks */ + Nrows= 1000000; + Ntables= 10; + table_lock_ratio= 0; + run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10); + + for (i= 0; i < Nlos; i++) + { + lockman_release_locks(&lockman, &loarray[i]); + pthread_mutex_destroy(loarray[i].mutex); + pthread_cond_destroy(loarray[i].cond); + lf_pinbox_put_pins(loarray[i].pins); + } + + { + ulonglong now= my_getsystime(); + lockman_destroy(&lockman); + now= my_getsystime()-now; + diag("lockman_destroy: %g secs", ((double)now)/1e7); + } + + pthread_mutex_destroy(&rt_mutex); + my_end(0); + return exit_status(); +} + diff --git a/storage/maria/unittest/lockman2-t.c b/storage/maria/unittest/lockman2-t.c new file mode 100644 index 00000000000..c1d40159500 --- /dev/null +++ b/storage/maria/unittest/lockman2-t.c @@ -0,0 +1,361 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + tablockman for row and table locks +*/ + +/* #define EXTRA_VERBOSE */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <my_atomic.h> +#include <lf.h> +#include "../tablockman.h" + +#define Nlos 100 +#define Ntbls 110 +TABLE_LOCK_OWNER loarray1[Nlos]; +pthread_mutex_t mutexes[Nlos]; +pthread_cond_t conds[Nlos]; +LOCKED_TABLE ltarray[Ntbls]; +TABLOCKMAN tablockman; + +#ifndef EXTRA_VERBOSE +#define print_lo1(X) /* no-op */ +#define DIAG(X) /* no-op */ +#else +#define DIAG(X) diag X +#endif + +TABLE_LOCK_OWNER *loid2lo1(uint16 loid) +{ + return loarray1+loid-1; +} + +#define unlock_all(O) diag("lo" #O "> release all locks"); \ + tablockman_release_locks(&tablockman, loid2lo1(O)); +#define test_lock(O, R, L, S, RES) \ + ok(tablockman_getlock(&tablockman, loid2lo1(O), &ltarray[R], L) == RES, \ + "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \ + print_lo1(loid2lo1(O)); +#define lock_ok_a(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK) +#define lock_ok_i(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE) +#define lock_ok_l(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE) +#define lock_conflict(O, R, L) \ + test_lock(O, R, L, "cannot ", LOCK_TIMEOUT); + +void test_tablockman_simple() +{ + /* simple */ + lock_ok_a(1, 1, S); + lock_ok_i(2, 2, IS); + lock_ok_i(1, 2, IX); + /* lock escalation */ + lock_ok_a(1, 1, X); + lock_ok_i(2, 2, IX); + /* failures */ + lock_conflict(2, 1, X); + unlock_all(2); + lock_ok_a(1, 2, S); + lock_ok_a(1, 2, IS); + lock_ok_a(1, 2, LS); + lock_ok_i(1, 3, IX); + lock_ok_a(2, 3, LS); + lock_ok_i(1, 3, IX); + lock_ok_l(2, 3, IS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_conflict(2, 1, S); + lock_ok_a(1, 1, LS); + 
unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_ok_a(2, 1, LS); + lock_ok_a(1, 1, LS); + lock_ok_i(1, 1, IX); + lock_ok_i(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_i(1, 4, IS); + lock_ok_i(2, 4, IS); + lock_ok_i(3, 4, IS); + lock_ok_a(3, 4, LS); + lock_ok_i(4, 4, IS); + lock_conflict(4, 4, IX); + lock_conflict(2, 4, IX); + lock_ok_a(1, 4, LS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + unlock_all(4); + + lock_ok_i(1, 1, IX); + lock_ok_i(2, 1, IX); + lock_conflict(1, 1, S); + lock_conflict(2, 1, X); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IS); + lock_conflict(2, 1, X); + lock_conflict(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_a(1, 1, S); + lock_conflict(2, 1, IX); + lock_conflict(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); +} + +int rt_num_threads; +int litmus; +int thread_number= 0, timeouts= 0; +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= my_getsystime(); + int i; + + thread_number= timeouts= 0; + litmus= 0; + + threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0)); + if (!threads) + { + diag("Out of memory"); + abort(); + } + + diag("Running %s with %d threads, %d iterations... 
", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) + if (pthread_create(threads+i, 0, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= my_getsystime()-now; + ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus); + my_free((void*)threads, MYF(0)); +} + +static void reinit_tlo(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo) +{ +#ifdef NOT_USED_YET + TABLE_LOCK_OWNER backup= *lo; +#endif + + tablockman_release_locks(lm, lo); +#ifdef NOT_USED_YET + pthread_mutex_destroy(lo->mutex); + pthread_cond_destroy(lo->cond); + bzero(lo, sizeof(*lo)); + + lo->mutex= backup.mutex; + lo->cond= backup.cond; + lo->loid= backup.loid; + pthread_mutex_init(lo->mutex, MY_MUTEX_INIT_FAST); + pthread_cond_init(lo->cond, 0); +#endif +} + +pthread_mutex_t rt_mutex; +int Nrows= 100; +int Ntables= 10; +int table_lock_ratio= 10; +enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX}; +const char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"}; +const char *res2str[]= { + 0, + "OUT OF MEMORY", + "DEADLOCK", + "LOCK TIMEOUT", + "GOT THE LOCK", + "GOT THE LOCK NEED TO LOCK A SUBRESOURCE", + "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"}; + +pthread_handler_t test_lockman(void *arg) +{ + int m= (*(int *)arg); + uint x, loid, row, table, res, locklevel, timeout= 0; + TABLE_LOCK_OWNER *lo1; + DBUG_ASSERT(Ntables <= Ntbls); + DBUG_ASSERT(Nrows + Ntables <= Ntbls); + + pthread_mutex_lock(&rt_mutex); + loid= ++thread_number; + pthread_mutex_unlock(&rt_mutex); + lo1= loid2lo1(loid); + + for (x= ((int)(intptr)(&m)); m > 0; m--) + { + /* three prime numbers */ + x= (uint) ((x*LL(3628273133) + LL(1500450271)) % LL(9576890767)); + row= x % Nrows + Ntables; + table= row % Ntables; + locklevel= (x/Nrows) & 3; + if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0) + { + /* table lock */ + res= tablockman_getlock(&tablockman, lo1, ltarray+table, + lock_array[locklevel]); + 
DIAG(("loid %2d, table %d, lock %s, res %s", loid, table, + lock2str[locklevel], res2str[res])); + if (res < GOT_THE_LOCK) + { + reinit_tlo(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + } + else + { /* row lock */ + locklevel&= 1; + res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel + 4]); + DIAG(("loid %2d, row %d, lock %s, res %s", loid, row, + lock2str[locklevel+4], res2str[res])); + switch (res) + { + case GOT_THE_LOCK: + continue; + case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE: + /* not implemented, so take a regular lock */ + case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE: + res= tablockman_getlock(&tablockman, lo1, ltarray+row, lock_array[locklevel]); + DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row, + lock2str[locklevel], res2str[res])); + if (res < GOT_THE_LOCK) + { + reinit_tlo(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + continue; + default: + reinit_tlo(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + } + } + + reinit_tlo(&tablockman, lo1); + + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + timeouts+= timeout; + if (!rt_num_threads) + diag("number of timeouts: %d", timeouts); + pthread_mutex_unlock(&rt_mutex); + + return 0; +} + +int main(int argc __attribute__((unused)), char **argv) +{ + int i; + MY_INIT(argv[0]); + + my_init(); + pthread_mutex_init(&rt_mutex, 0); + + plan(40); + + if (my_atomic_initialize()) + return exit_status(); + + + tablockman_init(&tablockman, &loid2lo1, 50); + + for (i= 0; i < Nlos; i++) + { + pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST); + pthread_cond_init (&conds[i], 0); + + loarray1[i].active_locks= 0; + loarray1[i].waiting_lock= 0; + loarray1[i].waiting_for= 0; + loarray1[i].mutex= &mutexes[i]; + loarray1[i].cond= &conds[i]; + loarray1[i].loid= i+1; + 
} + + for (i= 0; i < Ntbls; i++) + { + tablockman_init_locked_table(ltarray+i, Nlos); + } + + test_tablockman_simple(); + +#define CYCLES 10000 +#define THREADS Nlos /* don't change this line */ + + /* mixed load, stress-test with random locks */ + Nrows= 100; + Ntables= 10; + table_lock_ratio= 10; + run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES); +#if 0 + /* "real-life" simulation - many rows, no table locks */ + Nrows= 1000000; + Ntables= 10; + table_lock_ratio= 0; + run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10); +#endif + for (i= 0; i < Nlos; i++) + { + tablockman_release_locks(&tablockman, &loarray1[i]); + pthread_mutex_destroy(loarray1[i].mutex); + pthread_cond_destroy(loarray1[i].cond); + } + + { + ulonglong now= my_getsystime(); + for (i= 0; i < Ntbls; i++) + { + tablockman_destroy_locked_table(ltarray+i); + } + tablockman_destroy(&tablockman); + now= my_getsystime()-now; + diag("lockman_destroy: %g secs", ((double)now)/1e7); + } + + pthread_mutex_destroy(&rt_mutex); + my_end(0); + return exit_status(); +} + diff --git a/storage/maria/unittest/ma_control_file-t.c b/storage/maria/unittest/ma_control_file-t.c new file mode 100644 index 00000000000..164ea284f31 --- /dev/null +++ b/storage/maria/unittest/ma_control_file-t.c @@ -0,0 +1,592 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Unit test of the control file module of the Aria engine WL#3234 */ + +/* + Note that it is not possible to test the durability of the write (can't + pull the plug programmatically :) +*/ + +#include <my_global.h> +#include <my_sys.h> +#include <tap.h> + +#ifndef WITH_ARIA_STORAGE_ENGINE +/* + If Aria is not compiled in, normally we don't come to building this test. +*/ +#error "Aria engine is not compiled in, test cannot be built" +#endif + +#include "maria.h" +#include "../../../storage/maria/maria_def.h" +#include <my_getopt.h> + +#define EXTRACT_DEFINITIONS +#include "../ma_control_file.c" +#undef EXTRACT_DEFINITIONS + +char file_name[FN_REFLEN]; + +/* The values we'll set and expect the control file module to return */ +LSN expect_checkpoint_lsn; +uint32 expect_logno; +TrID expect_max_trid; +uint8 expect_recovery_failures; + +static int delete_file(myf my_flags); +/* + Those are test-specific wrappers around the module's API functions: after + calling the module's API functions they perform checks on the result. 
+*/ +static int close_file(void); /* wraps ma_control_file_end */ +/* wraps ma_control_file_open_or_create */ +static int open_file(void); +/* wraps ma_control_file_write_and_force */ +static int write_file(LSN checkpoint_lsn, uint32 logno, TrID trid, + uint8 rec_failures); + +/* Tests */ +static int test_one_log_and_recovery_failures(void); +static int test_five_logs_and_max_trid(void); +static int test_3_checkpoints_and_2_logs(void); +static int test_binary_content(void); +static int test_start_stop(void); +static int test_2_open_and_2_close(void); +static int test_bad_magic_string(void); +static int test_bad_checksum(void); +static int test_bad_hchecksum(void); +static int test_future_size(void); +static int test_bad_blocksize(void); +static int test_bad_size(void); + +/* Utility */ +static int verify_module_values_match_expected(void); +static int verify_module_values_are_impossible(void); +static void usage(void); +static void get_options(int argc, char *argv[]); + +/* + If "expr" is FALSE, this macro will make the function print a diagnostic + message and immediately return 1. + This is inspired from assert() but does not crash the binary (sometimes we + may want to see how other tests go even if one fails). + RET_ERR means "return error". 
+*/ + +#define RET_ERR_UNLESS(expr) \ + {if (!(expr)) {diag("line %d: failure: '%s'", __LINE__, #expr); assert(0);return 1;}} + + +/* Used to ignore error messages from ma_control_file_open() */ + +static int my_ignore_message(uint error __attribute__((unused)), + const char *str __attribute__((unused)), + myf MyFlags __attribute__((unused))) +{ + DBUG_ENTER("my_message_no_curses"); + DBUG_PRINT("enter",("message: %s",str)); + DBUG_RETURN(0); +} + +int (*default_error_handler_hook)(uint my_err, const char *str, + myf MyFlags) = 0; + + +/* like ma_control_file_open(), but without error messages */ + +static CONTROL_FILE_ERROR local_ma_control_file_open(void) +{ + CONTROL_FILE_ERROR error; + error_handler_hook= my_ignore_message; + error= ma_control_file_open(TRUE, TRUE); + error_handler_hook= default_error_handler_hook; + return error; +} + + + +int main(int argc,char *argv[]) +{ + MY_INIT(argv[0]); + my_init(); + + maria_data_root= (char *)"."; + default_error_handler_hook= error_handler_hook; + + plan(12); + + diag("Unit tests for control file"); + + get_options(argc,argv); + + diag("Deleting control file at startup, if there is an old one"); + RET_ERR_UNLESS(0 == delete_file(0)); /* if fails, can't continue */ + + diag("Tests of normal conditions"); + ok(0 == test_one_log_and_recovery_failures(), + "test of creating one log and recording recovery failures"); + ok(0 == test_five_logs_and_max_trid(), + "test of creating five logs and many transactions"); + ok(0 == test_3_checkpoints_and_2_logs(), + "test of creating three checkpoints and two logs"); + ok(0 == test_binary_content(), "test of the binary content of the file"); + ok(0 == test_start_stop(), "test of multiple starts and stops"); + diag("Tests of abnormal conditions"); + ok(0 == test_2_open_and_2_close(), + "test of two open and two close (strange call sequence)"); + ok(0 == test_bad_magic_string(), "test of bad magic string"); + ok(0 == test_bad_checksum(), "test of bad checksum"); + ok(0 == 
test_bad_hchecksum(), "test of bad hchecksum"); + ok(0 == test_future_size(), "test of ability to handlr future versions"); + ok(0 == test_bad_blocksize(), "test of bad blocksize"); + ok(0 == test_bad_size(), "test of too small/big file"); + + return exit_status(); +} + + +static int delete_file(myf my_flags) +{ + RET_ERR_UNLESS(fn_format(file_name, CONTROL_FILE_BASE_NAME, + maria_data_root, "", MYF(MY_WME)) != NullS); + /* + Maybe file does not exist, ignore error. + The error will however be printed on stderr. + */ + my_delete(file_name, my_flags); + expect_checkpoint_lsn= LSN_IMPOSSIBLE; + expect_logno= FILENO_IMPOSSIBLE; + expect_max_trid= expect_recovery_failures= 0; + + return 0; +} + +/* + Verifies that global values last_checkpoint_lsn, last_logno, + max_trid_in_control_file (belonging to the module) match what we expect. +*/ +static int verify_module_values_match_expected(void) +{ + RET_ERR_UNLESS(last_logno == expect_logno); + RET_ERR_UNLESS(last_checkpoint_lsn == expect_checkpoint_lsn); + RET_ERR_UNLESS(max_trid_in_control_file == expect_max_trid); + RET_ERR_UNLESS(recovery_failures == expect_recovery_failures); + return 0; +} + + +/* + Verifies that global values last_checkpoint_lsn and last_logno (belonging + to the module) are impossible (this is used when the file has been closed). 
+*/ +static int verify_module_values_are_impossible(void) +{ + RET_ERR_UNLESS(last_logno == FILENO_IMPOSSIBLE); + RET_ERR_UNLESS(last_checkpoint_lsn == LSN_IMPOSSIBLE); + RET_ERR_UNLESS(max_trid_in_control_file == 0); + return 0; +} + + +static int close_file(void) +{ + /* Simulate shutdown */ + ma_control_file_end(); + /* Verify amnesia */ + RET_ERR_UNLESS(verify_module_values_are_impossible() == 0); + return 0; +} + +static int open_file(void) +{ + RET_ERR_UNLESS(local_ma_control_file_open() == CONTROL_FILE_OK); + /* Check that the module reports expected information */ + RET_ERR_UNLESS(verify_module_values_match_expected() == 0); + return 0; +} + +static int write_file(LSN checkpoint_lsn, uint32 logno, TrID trid, + uint8 rec_failures) +{ + RET_ERR_UNLESS(ma_control_file_write_and_force(checkpoint_lsn, logno, trid, + rec_failures) + == 0); + /* Check that the module reports expected information */ + RET_ERR_UNLESS(verify_module_values_match_expected() == 0); + return 0; +} + +static int test_one_log_and_recovery_failures(void) +{ + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + expect_logno= 123; + RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + expect_recovery_failures= 158; + RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + expect_recovery_failures) == 0); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_five_logs_and_max_trid(void) +{ + uint i; + + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + expect_logno= 100; + expect_max_trid= ULL(14111978111); + for (i= 0; i<5; i++) + { + expect_logno*= 3; + RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno, + expect_max_trid, + recovery_failures) == 0); + } + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_3_checkpoints_and_2_logs(void) +{ + /* + Simulate one checkpoint, one log creation, two checkpoints, one + log creation. 
+ */ + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + expect_checkpoint_lsn= MAKE_LSN(5, 10000); + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + + expect_logno= 17; + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + + expect_checkpoint_lsn= MAKE_LSN(17, 20000); + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + + expect_checkpoint_lsn= MAKE_LSN(17, 45000); + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + + expect_logno= 19; + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_binary_content(void) +{ + uint i; + int fd; + + /* + TEST4: actually check by ourselves the content of the file. + Note that constants (offsets) are hard-coded here, precisely to prevent + someone from changing them in the control file module and breaking + backward-compatibility. + TODO: when we reach the format-freeze state, we may even just do a + comparison with a raw binary string, to not depend on any uint4korr + future change/breakage. 
+ */ + + uchar buffer[45]; + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_read(fd, buffer, 45, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + i= uint3korr(buffer + 34 ); + RET_ERR_UNLESS(i == LSN_FILE_NO(last_checkpoint_lsn)); + i= uint4korr(buffer + 37); + RET_ERR_UNLESS(i == LSN_OFFSET(last_checkpoint_lsn)); + i= uint4korr(buffer + 41); + RET_ERR_UNLESS(i == last_logno); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_start_stop(void) +{ + /* TEST5: Simulate start/nothing/stop/start/nothing/stop/start */ + + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_2_open_and_2_close(void) +{ + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + + +static int test_bad_magic_string(void) +{ + uchar buffer[4]; + int fd; + + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + /* Corrupt magic string */ + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_pread(fd, buffer, 4, 0, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_pwrite(fd, (const uchar *)"papa", 4, 0, + MYF(MY_FNABP | MY_WME)) == 0); + + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_BAD_MAGIC_STRING); + /* Restore magic string */ + RET_ERR_UNLESS(my_pwrite(fd, buffer, 4, 0, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + 
RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_bad_checksum(void) +{ + uchar buffer[4]; + int fd; + + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + /* Corrupt checksum */ + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_pread(fd, buffer, 1, 30, MYF(MY_FNABP | MY_WME)) == 0); + buffer[0]+= 3; /* mangle checksum */ + RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 30, MYF(MY_FNABP | MY_WME)) == 0); + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_BAD_CHECKSUM); + /* Restore checksum */ + buffer[0]-= 3; + RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 30, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + + return 0; +} + + +static int test_bad_blocksize(void) +{ + maria_block_size<<= 1; + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_WRONG_BLOCKSIZE); + /* Restore blocksize */ + maria_block_size>>= 1; + + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + + +static int test_future_size(void) +{ + /* + Here we check ability to add fields only so we can use + defined constants + */ + uint32 sum; + int fd; + uchar buffer[CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE + 2]; + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_read(fd, buffer, + CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE, + MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + /* "add" new field of 1 byte (value 1) to header and variable part */ + memmove(buffer + CF_CREATE_TIME_TOTAL_SIZE + 1, + buffer + CF_CREATE_TIME_TOTAL_SIZE, + CF_CHANGEABLE_TOTAL_SIZE); + buffer[CF_CREATE_TIME_TOTAL_SIZE - CF_CHECKSUM_SIZE]= '\1'; + buffer[CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE + 
1]= '\1'; + /* fix lengths */ + int2store(buffer + CF_CREATE_TIME_SIZE_OFFSET, CF_CREATE_TIME_TOTAL_SIZE + 1); + int2store(buffer + CF_CHANGEABLE_SIZE_OFFSET, CF_CHANGEABLE_TOTAL_SIZE + 1); + /* recalculate checksums */ + sum= (uint32) my_checksum(0, buffer, CF_CREATE_TIME_TOTAL_SIZE - + CF_CHECKSUM_SIZE + 1); + int4store(buffer + CF_CREATE_TIME_TOTAL_SIZE - CF_CHECKSUM_SIZE + 1, sum); + sum= (uint32) my_checksum(0, buffer + CF_CREATE_TIME_TOTAL_SIZE + 1 + + CF_CHECKSUM_SIZE, + CF_CHANGEABLE_TOTAL_SIZE - CF_CHECKSUM_SIZE + 1); + int4store(buffer + CF_CREATE_TIME_TOTAL_SIZE + 1, sum); + /* write new file and check it */ + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_pwrite(fd, buffer, + CF_CREATE_TIME_TOTAL_SIZE + + CF_CHANGEABLE_TOTAL_SIZE + 2, + 0, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + return(0); +} + +static int test_bad_hchecksum(void) +{ + uchar buffer[4]; + int fd; + + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + /* Corrupt checksum */ + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_pread(fd, buffer, 1, 26, MYF(MY_FNABP | MY_WME)) == 0); + buffer[0]+= 3; /* mangle checksum */ + RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 26, MYF(MY_FNABP | MY_WME)) == 0); + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_BAD_HEAD_CHECKSUM); + /* Restore checksum */ + buffer[0]-= 3; + RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 26, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + + return 0; +} + + +static int test_bad_size(void) +{ + uchar buffer[]= + "123456789012345678901234567890123456789012345678901234567890123456"; + int fd, i; + + /* A too short file */ + 
RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0); + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR | O_CREAT, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_write(fd, buffer, 10, MYF(MY_FNABP | MY_WME)) == 0); + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_TOO_SMALL); + for (i= 0; i < 8; i++) + { + RET_ERR_UNLESS(my_write(fd, buffer, 66, MYF(MY_FNABP | MY_WME)) == 0); + } + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_TOO_BIG); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + + /* Leave a correct control file */ + RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + return 0; +} + + +static struct my_option my_long_options[] = +{ +#ifndef DBUG_OFF + {"debug", '#', "Debug log.", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"help", '?', "Display help and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version number and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +static void version(void) +{ + printf("ma_control_file_test: unit test for the control file " + "module of the Aria storage engine. 
Ver 1.0 \n"); +} + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument __attribute__((unused))) +{ + switch(optid) { + case 'V': + version(); + exit(0); + case '#': + DBUG_PUSH (argument); + break; + case '?': + version(); + usage(); + exit(0); + } + return 0; +} + + +/* Read options */ + +static void get_options(int argc, char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, + get_one_option))) + exit(ho_error); + + return; +} /* get options */ + + +static void usage(void) +{ + printf("Usage: %s [options]\n\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} diff --git a/storage/maria/unittest/ma_loghandler_examples.c b/storage/maria/unittest/ma_loghandler_examples.c new file mode 100644 index 00000000000..0c11a3b9a8e --- /dev/null +++ b/storage/maria/unittest/ma_loghandler_examples.c @@ -0,0 +1,65 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "../maria_def.h" + +static LOG_DESC INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE= +{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0, + "fixed0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, NULL, NULL, 0, +"variable0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 7, 7, NULL, NULL, NULL, 1, +"fixed1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 12, NULL, NULL, NULL, 1, +"variable1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 23, 23, NULL, NULL, NULL, 2, +"fixed2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 19, NULL, NULL, NULL, 2, +"variable2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + + +void translog_example_table_init() +{ + int i; + log_record_type_descriptor[LOGREC_FIXED_RECORD_0LSN_EXAMPLE]= + INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE]= + INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_FIXED_RECORD_1LSN_EXAMPLE]= + INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE]= + INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_FIXED_RECORD_2LSN_EXAMPLE]= + INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE]= + INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE; + for 
(i= LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE + 1; + i < LOGREC_NUMBER_OF_TYPES; + i++) + log_record_type_descriptor[i].rclass= LOGRECTYPE_NOT_ALLOWED; +} + + + diff --git a/storage/maria/unittest/ma_maria_log_cleanup.c b/storage/maria/unittest/ma_maria_log_cleanup.c new file mode 100644 index 00000000000..f85c75b1a88 --- /dev/null +++ b/storage/maria/unittest/ma_maria_log_cleanup.c @@ -0,0 +1,64 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "../maria_def.h" +#include <my_dir.h> + +my_bool maria_log_remove() +{ + MY_DIR *dirp; + uint i; + MY_STAT stat_buff; + char file_name[FN_REFLEN]; + + /* Removes control file */ + if (fn_format(file_name, CONTROL_FILE_BASE_NAME, + maria_data_root, "", MYF(MY_WME)) == NullS) + return 1; + if (my_stat(file_name, &stat_buff, MYF(0)) && + my_delete(file_name, MYF(MY_WME)) != 0) + return 1; + + /* Finds and removes transaction log files */ + if (!(dirp = my_dir(maria_data_root, MYF(MY_DONT_SORT)))) + return 1; + + for (i= 0; i < dirp->number_off_files; i++) + { + char *file= dirp->dir_entry[i].name; + if (strncmp(file, "aria_log.", 9) == 0 && + file[9] >= '0' && file[9] <= '9' && + file[10] >= '0' && file[10] <= '9' && + file[11] >= '0' && file[11] <= '9' && + file[12] >= '0' && file[12] <= '9' && + file[13] >= '0' && file[13] <= '9' && + file[14] >= '0' && file[14] <= '9' && + file[15] >= 
'0' && file[15] <= '9' && + file[16] >= '0' && file[16] <= '9' && + file[17] == '\0') + { + if (fn_format(file_name, file, + maria_data_root, "", MYF(MY_WME)) == NullS || + my_delete(file_name, MYF(MY_WME)) != 0) + { + my_dirend(dirp); + return 1; + } + } + } + my_dirend(dirp); + return 0; +} + diff --git a/storage/maria/unittest/ma_pagecache_consist.c b/storage/maria/unittest/ma_pagecache_consist.c new file mode 100644 index 00000000000..7dbdba433c6 --- /dev/null +++ b/storage/maria/unittest/ma_pagecache_consist.c @@ -0,0 +1,498 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in + my_atomic-t.c (see BUG#22320). 
*/

#include <tap.h>
#include <my_sys.h>
#include <m_string.h>
#include "test_file.h"
#include <tap.h>

/* Size in bytes handed to init_pagecache() by main() */
#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8)

#ifndef DBUG_OFF
/* DBUG control string; set in main() and applied when an argument is given */
static const char* default_dbug_option;
#endif

/* Name of the scratch file main() creates and deletes */
static char *file1_name= (char*)"page_cache_test_file_1";
/* Page cache file descriptor used by every reader and writer */
static PAGECACHE_FILE file1;
/* Signalled by each worker thread when it finishes */
static pthread_cond_t COND_thread_count;
/* Protects thread_count */
static pthread_mutex_t LOCK_thread_count;
/* Number of started worker threads that have not finished yet */
static uint thread_count;
static PAGECACHE pagecache;

/*
  Test dimensions, selected at compile time:
    number_of_readers / number_of_writers  threads of each kind started
    number_of_tests       iterations performed by every thread
    record_length_limit   limit passed to get_len() for a new record length
    number_of_pages       pages initialized by main() and picked at random
    flush_divider         a writer flushes on every flush_divider-th iteration
  The TEST_READERS and TEST_WRITERS variants also turn SKIP_BIG_TESTS into
  a no-op.
*/
#ifdef TEST_HIGH_CONCURENCY
static uint number_of_readers= 10;
static uint number_of_writers= 20;
static uint number_of_tests= 30000;
static uint record_length_limit= TEST_PAGE_SIZE/200;
static uint number_of_pages= 20;
static uint flush_divider= 1000;
#else /*TEST_HIGH_CONCURENCY*/
#ifdef TEST_READERS
static uint number_of_readers= 10;
static uint number_of_writers= 1;
static uint number_of_tests= 30000;
static uint record_length_limit= TEST_PAGE_SIZE/200;
static uint number_of_pages= 20;
static uint flush_divider= 1000;
#undef SKIP_BIG_TESTS
#define SKIP_BIG_TESTS(X) /* no-op */
#else /*TEST_READERS*/
#ifdef TEST_WRITERS
static uint number_of_readers= 0;
static uint number_of_writers= 10;
static uint number_of_tests= 30000;
static uint record_length_limit= TEST_PAGE_SIZE/200;
static uint number_of_pages= 20;
static uint flush_divider= 1000;
#undef SKIP_BIG_TESTS
#define SKIP_BIG_TESTS(X) /* no-op */
#else /*TEST_WRITERS*/
static uint number_of_readers= 10;
static uint number_of_writers= 10;
static uint number_of_tests= 50000;
static uint record_length_limit= TEST_PAGE_SIZE/200;
static uint number_of_pages= 20000;
static uint flush_divider= 1000;
#endif /*TEST_WRITERS*/
#endif /*TEST_READERS*/
#endif /*TEST_HIGH_CONCURENCY*/


/**
  @brief Dummy pagecache callback.
+*/ + +static my_bool +dummy_callback(uchar *page __attribute__((unused)), + pgcache_page_no_t page_no __attribute__((unused)), + uchar* data_ptr __attribute__((unused))) +{ + return 0; +} + + +/** + @brief Dummy pagecache callback. +*/ + +static void +dummy_fail_callback(uchar* data_ptr __attribute__((unused))) +{ + return; +} + + +/* + Get pseudo-random length of the field in (0;limit) + + SYNOPSYS + get_len() + limit limit for generated value + + RETURN + length where length >= 0 & length < limit +*/ + +static uint get_len(uint limit) +{ + return (uint)((ulonglong)rand()*(limit-1)/RAND_MAX); +} + + +/* + Check page's consistency: layout is + 4 bytes: number 'num' of records in this page, then num occurences of + { 4 bytes: record's length 'len'; then 4 bytes unchecked ('tag') then + 'len' bytes each equal to the record's sequential number in this page, + modulo 256 }, then zeroes. + */ +uint check_page(uchar *buff, ulong offset, int page_locked, int page_no, + int tag) +{ + uint end= sizeof(uint); + uint num= uint4korr(buff); + uint i; + DBUG_ENTER("check_page"); + + for (i= 0; i < num; i++) + { + uint len= uint4korr(buff + end); + uint j; + end+= 4 + 4; + if (len + end > TEST_PAGE_SIZE) + { + diag("incorrect field header #%u by offset %lu\n", i, offset + end); + goto err; + } + for(j= 0; j < len; j++) + { + if (buff[end + j] != (uchar)((i+1) % 256)) + { + diag("incorrect %lu byte\n", offset + end + j); + goto err; + } + } + end+= len; + } + for(i= end; i < TEST_PAGE_SIZE; i++) + { + if (buff[i] != 0) + { + int h; + DBUG_PRINT("err", + ("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n", + offset + i, offset, i, page_no, + (page_locked ? "locked" : "unlocked"), + end, num, tag)); + diag("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n", + offset + i, offset, i, page_no, + (page_locked ? 
"locked" : "unlocked"), + end, num, tag); + h= my_open("wrong_page", O_CREAT | O_TRUNC | O_RDWR, MYF(0)); + my_pwrite(h, (uchar*) buff, TEST_PAGE_SIZE, 0, MYF(0)); + my_close(h, MYF(0)); + goto err; + } + } + DBUG_RETURN(end); +err: + DBUG_PRINT("err", ("try to flush")); + if (page_locked) + { + pagecache_delete(&pagecache, &file1, page_no, + PAGECACHE_LOCK_LEFT_WRITELOCKED, 1); + } + else + { + flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE); + } + exit(1); +} + +void put_rec(uchar *buff, uint end, uint len, uint tag) +{ + uint i; + uint num; + num= uint4korr(buff); + if (!len) + len= 1; + if (end + 4*2 + len > TEST_PAGE_SIZE) + return; + int4store(buff + end, len); + end+= 4; + int4store(buff + end, tag); + end+= 4; + num++; + int4store(buff, num); + for (i= end; i < (len + end); i++) + { + buff[i]= (uchar) num % 256; + } +} + +/* + Recreate and reopen a file for test + + SYNOPSIS + reset_file() + file File to reset + file_name Path (and name) of file which should be reset +*/ + +void reset_file(PAGECACHE_FILE file, char *file_name) +{ + flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE); + if (my_close(file1.file, MYF(0)) != 0) + { + diag("Got error during %s closing from close() (errno: %d)\n", + file_name, errno); + exit(1); + } + my_delete(file_name, MYF(0)); + if ((file.file= my_open(file_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag("Got error during %s creation from open() (errno: %d)\n", + file_name, errno); + exit(1); + } +} + + +void reader(int num) +{ + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + uint i; + + for (i= 0; i < number_of_tests; i++) + { + uint page= get_len(number_of_pages); + pagecache_read(&pagecache, &file1, page, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + check_page(buffr, page * TEST_PAGE_SIZE, 0, page, -num); + + } + free(buffr); +} + + +void writer(int num) +{ + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + uint i; + + for (i= 0; i < number_of_tests; i++) + { + 
uint end; + uint page= get_len(number_of_pages); + pagecache_read(&pagecache, &file1, page, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + 0); + end= check_page(buffr, page * TEST_PAGE_SIZE, 1, page, num); + put_rec(buffr, end, get_len(record_length_limit), num); + pagecache_write(&pagecache, &file1, page, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + + if (i % flush_divider == 0) + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + } + free(buffr); +} + + +static void *test_thread_reader(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_reader"); + DBUG_PRINT("enter", ("param: %d", param)); + + reader(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "reader%d: done", param); + thread_count--; + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + } + return 0; +} + + +static void *test_thread_writer(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_writer"); + DBUG_PRINT("enter", ("param: %d", param)); + + writer(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "writer%d: done", param); + thread_count--; + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + } + return 0; +} + + +int main(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error, pagen; + + MY_INIT(argv[0]); + +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace"; +#else + default_dbug_option= 
"d:t:i:o,/tmp/test_pagecache_consist.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + { + DBUG_ENTER("main"); + DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name())); + plan(number_of_writers + number_of_readers); + SKIP_BIG_TESTS(number_of_writers + number_of_readers) + { + + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag( "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + pagecache_file_init(file1, &dummy_callback, &dummy_callback, + &dummy_fail_callback, &dummy_callback, NULL); + DBUG_PRINT("info", ("file1: %d", file1.file)); + if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME))) + exit(1); + my_pwrite(file1.file, (const uchar *)"test file", 9, 0, MYF(0)); + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { + diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + diag( "LOCK_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + + if ((error= pthread_attr_init(&thr_attr))) + { + diag("Got error: %d from pthread_attr_init (errno: %d)\n", + error,errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + diag( + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error,errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + VOID(thr_setconcurrency(2)); +#endif + + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TEST_PAGE_SIZE, 0)) == 0) + { + diag("Got error: init_pagecache() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("Page cache %d pages", pagen)); + { + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + uint i; + memset(buffr, '\0', TEST_PAGE_SIZE); + for (i= 0; i < number_of_pages; i++) + { + 
pagecache_write(&pagecache, &file1, i, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + } + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + free(buffr); + } + pthread_mutex_lock(&LOCK_thread_count); + while (number_of_readers != 0 || number_of_writers != 0) + { + if (number_of_readers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_readers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_reader, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_readers--; + } + if (number_of_writers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_writers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_writer, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_writers--; + } + } + DBUG_PRINT("info", ("Thread started")); + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + /* wait finishing */ + pthread_mutex_lock(&LOCK_thread_count); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count))) + diag("COND_thread_count: %d from pthread_cond_wait\n",error); + } + pthread_mutex_unlock(&LOCK_thread_count); + DBUG_PRINT("info", ("thread ended")); + + end_pagecache(&pagecache, 1); + DBUG_PRINT("info", ("Page cache ended")); + + if (my_close(file1.file, MYF(0)) != 0) + { + diag( "Got error during file1 closing from close() (errno: %d)\n", + errno); + exit(1); + } + my_delete(file1_name, MYF(0)); + + DBUG_PRINT("info", ("file1 (%d) closed", file1.file)); + DBUG_PRINT("info", ("Program end")); + + } /* SKIP_BIG_TESTS */ + my_end(0); + + return exit_status(); + } +} diff --git a/storage/maria/unittest/ma_pagecache_rwconsist.c b/storage/maria/unittest/ma_pagecache_rwconsist.c 
new file mode 100644
index 00000000000..a1a22b5e18d
--- /dev/null
+++ b/storage/maria/unittest/ma_pagecache_rwconsist.c
@@ -0,0 +1,362 @@
/* Copyright (C) 2006-2008 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */

/*
  TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
  my_atomic-t.c (see BUG#22320).
*/

#include <tap.h>
#include <my_sys.h>
#include <m_string.h>
#include "test_file.h"
#include <tap.h>

/* Size in bytes handed to init_pagecache() by main() */
#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8)

#ifndef DBUG_OFF
/* DBUG control string; set in main() and applied when an argument is given */
static const char* default_dbug_option;
#endif


/* Short pause used by readers and writers between steps */
#define SLEEP my_sleep(5)

/* Name of the scratch file main() creates and deletes */
static char *file1_name= (char*)"page_cache_test_file_1";
/* Page cache file descriptor used by every reader and writer */
static PAGECACHE_FILE file1;
/* Signalled by each worker thread when it finishes */
static pthread_cond_t COND_thread_count;
/* Protects thread_count */
static pthread_mutex_t LOCK_thread_count;
/* Number of started worker threads that have not finished yet */
static uint thread_count= 0;
static PAGECACHE pagecache;

/* Threads of each kind started by main() */
static uint number_of_readers= 5;
static uint number_of_writers= 5;
/* Iterations performed by every reader / writer */
static uint number_of_read_tests= 2000;
static uint number_of_write_tests= 1000;
/* A reader sleeps rand() % read_sleep_limit times after each read */
static uint read_sleep_limit= 3;
/* Progress is reported on every report_divisor-th iteration */
static uint report_divisor= 50;

/**
  @brief Dummy pagecache callback.
*/

static my_bool
dummy_callback(uchar *page __attribute__((unused)),
               pgcache_page_no_t page_no __attribute__((unused)),
               uchar* data_ptr __attribute__((unused)))
{
  return 0;
}


/**
  @brief Dummy pagecache callback.
+*/ + +static void +dummy_fail_callback(uchar* data_ptr __attribute__((unused))) +{ + return; +} + + +/** + @brief Checks page consistency + + @param buff pointer to the page content + @param task task ID +*/ +void check_page(uchar *buff, int task) +{ + uint i; + DBUG_ENTER("check_page"); + + for (i= 1; i < TEST_PAGE_SIZE; i++) + { + if (buff[0] != buff[i]) + goto err; + } + DBUG_VOID_RETURN; +err: + diag("Task %d char #%u '%u' != '%u'", task, i, (uint) buff[0], + (uint) buff[i]); + DBUG_PRINT("err", ("try to flush")); + exit(1); +} + + + +void reader(int num) +{ + unsigned char *buff; + uint i; + PAGECACHE_BLOCK_LINK *link; + + for (i= 0; i < number_of_read_tests; i++) + { + if (i % report_divisor == 0) + diag("Reader %d - %u", num, i); + buff= pagecache_read(&pagecache, &file1, 0, 3, NULL, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_READ, + &link); + check_page(buff, num); + pagecache_unlock_by_link(&pagecache, link, + PAGECACHE_LOCK_READ_UNLOCK, + PAGECACHE_UNPIN, 0, 0, 0, FALSE); + { + int lim= rand() % read_sleep_limit; + int j; + for (j= 0; j < lim; j++) + SLEEP; + } + } +} + + +void writer(int num) +{ + uint i; + uchar *buff; + PAGECACHE_BLOCK_LINK *link; + + for (i= 0; i < number_of_write_tests; i++) + { + uchar c= (uchar) rand() % 256; + + if (i % report_divisor == 0) + diag("Writer %d - %u", num, i); + buff= pagecache_read(&pagecache, &file1, 0, 3, NULL, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + &link); + + check_page(buff, num); + bfill(buff, TEST_PAGE_SIZE / 2, c); + SLEEP; + bfill(buff + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE / 2, c); + check_page(buff, num); + pagecache_unlock_by_link(&pagecache, link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, 0, 0, 1, FALSE); + SLEEP; + } +} + + +static void *test_thread_reader(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_reader"); + + DBUG_PRINT("enter", ("param: %d", param)); + + reader(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + 
pthread_mutex_lock(&LOCK_thread_count); + ok(1, "reader%d: done", param); + thread_count--; + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + } + return 0; +} + + +static void *test_thread_writer(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_writer"); + + writer(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "writer%d: done", param); + thread_count--; + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + } + return 0; +} + + +int main(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error, pagen; + + MY_INIT(argv[0]); + +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace"; +#else + default_dbug_option= "d:t:i:O,/tmp/test_pagecache_consist.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + { + DBUG_ENTER("main"); + DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name())); + plan(number_of_writers + number_of_readers); + SKIP_BIG_TESTS(number_of_writers + number_of_readers) + { + + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag( "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + pagecache_file_init(file1, &dummy_callback, &dummy_callback, + &dummy_fail_callback, &dummy_callback, NULL); + DBUG_PRINT("info", ("file1: %d", file1.file)); + if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME))) + exit(1); + my_pwrite(file1.file, (const uchar*) "test file", 9, 0, MYF(0)); + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { 
+ diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + diag( "LOCK_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + + if ((error= pthread_attr_init(&thr_attr))) + { + diag("Got error: %d from pthread_attr_init (errno: %d)\n", + error,errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + diag( + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error,errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + VOID(thr_setconcurrency(2)); +#endif + + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TEST_PAGE_SIZE, 0)) == 0) + { + diag("Got error: init_pagecache() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("Page cache %d pages", pagen)); + { + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + memset(buffr, '\0', TEST_PAGE_SIZE); + pagecache_write(&pagecache, &file1, 0, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + } + pthread_mutex_lock(&LOCK_thread_count); + + while (number_of_readers != 0 || number_of_writers != 0) + { + if (number_of_readers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_readers + number_of_writers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_reader, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_readers--; + } + if (number_of_writers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_writers + number_of_readers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_writer, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_writers--; + } + } + 
DBUG_PRINT("info", ("Thread started")); + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + /* wait finishing */ + pthread_mutex_lock(&LOCK_thread_count); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count))) + diag("COND_thread_count: %d from pthread_cond_wait\n", error); + } + pthread_mutex_unlock(&LOCK_thread_count); + DBUG_PRINT("info", ("thread ended")); + + end_pagecache(&pagecache, 1); + DBUG_PRINT("info", ("Page cache ended")); + + if (my_close(file1.file, MYF(0)) != 0) + { + diag( "Got error during file1 closing from close() (errno: %d)\n", + errno); + exit(1); + } + my_delete(file1_name, MYF(0)); + + DBUG_PRINT("info", ("file1 (%d) closed", file1.file)); + DBUG_PRINT("info", ("Program end")); + } /* SKIP_BIG_TESTS */ + my_end(0); + + return exit_status(); + } +} diff --git a/storage/maria/unittest/ma_pagecache_rwconsist2.c b/storage/maria/unittest/ma_pagecache_rwconsist2.c new file mode 100644 index 00000000000..34183a2d0ab --- /dev/null +++ b/storage/maria/unittest/ma_pagecache_rwconsist2.c @@ -0,0 +1,358 @@ +/* Copyright (C) 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */


/**
  @file this unit tests the consistency of a long block write done under
  a write lock while the same block is read simultaneously by requests
  that do not take a read lock.
*/

/*
  TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
  my_atomic-t.c (see BUG#22320).
*/

#include <tap.h>
#include <my_sys.h>
#include <m_string.h>
#include "test_file.h"
#include <tap.h>

/* Size in bytes handed to init_pagecache() by main() */
#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8)

#ifndef DBUG_OFF
/* DBUG control string; set in main() and applied when an argument is given */
static const char* default_dbug_option;
#endif


/* Short pause used by writers between steps */
#define SLEEP my_sleep(5)

/* Name of the scratch file main() creates and deletes */
static char *file1_name= (char*)"page_cache_test_file_1";
/* Page cache file descriptor used by every reader and writer */
static PAGECACHE_FILE file1;
/* Signalled by each worker thread when it finishes */
static pthread_cond_t COND_thread_count;
/* Protects thread_count */
static pthread_mutex_t LOCK_thread_count;
/* Number of started worker threads that have not finished yet */
static uint thread_count= 0;
static PAGECACHE pagecache;

/* Threads of each kind started by main() */
static uint number_of_readers= 5;
static uint number_of_writers= 5;
/* Iterations performed by every reader / writer */
static uint number_of_read_tests= 20000;
static uint number_of_write_tests= 1000;
/* Progress is reported on every report_divisor-th iteration */
static uint report_divisor= 50;

/**
  @brief Dummy pagecache callback.
*/

static my_bool
dummy_callback(uchar *page __attribute__((unused)),
               pgcache_page_no_t page_no __attribute__((unused)),
               uchar* data_ptr __attribute__((unused)))
{
  return 0;
}


/**
  @brief Dummy pagecache callback.
+*/ + +static void +dummy_fail_callback(uchar* data_ptr __attribute__((unused))) +{ + return; +} + + +/** + @brief Checks page consistency + + @param buff pointer to the page content + @param task task ID +*/ +void check_page(uchar *buff, int task) +{ + uint i; + DBUG_ENTER("check_page"); + + for (i= 1; i < TEST_PAGE_SIZE; i++) + { + if (buff[0] != buff[i]) + goto err; + } + DBUG_VOID_RETURN; +err: + diag("Task %d char #%u '%u' != '%u'", task, i, (uint) buff[0], + (uint) buff[i]); + DBUG_PRINT("err", ("try to flush")); + exit(1); +} + + + +void reader(int num) +{ + unsigned char buff[TEST_PAGE_SIZE]; + uint i; + + for (i= 0; i < number_of_read_tests; i++) + { + if (i % report_divisor == 0) + diag("Reader %d - %u", num, i); + pagecache_read(&pagecache, &file1, 0, 3, buff, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + NULL); + check_page(buff, num); + } +} + + +void writer(int num) +{ + uint i; + uchar *buff; + PAGECACHE_BLOCK_LINK *link; + + for (i= 0; i < number_of_write_tests; i++) + { + uchar c= (uchar) rand() % 256; + + if (i % report_divisor == 0) + diag("Writer %d - %u", num, i); + buff= pagecache_read(&pagecache, &file1, 0, 3, NULL, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + &link); + + check_page(buff, num); + bfill(buff, TEST_PAGE_SIZE / 2, c); + SLEEP; + bfill(buff + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE / 2, c); + check_page(buff, num); + pagecache_unlock_by_link(&pagecache, link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, 0, 0, 1, FALSE); + SLEEP; + } +} + + +static void *test_thread_reader(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_reader"); + + DBUG_PRINT("enter", ("param: %d", param)); + + reader(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "reader%d: done", param); + thread_count--; + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + 
free((uchar*) arg); + my_thread_end(); + } + return 0; +} + + +static void *test_thread_writer(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_writer"); + + writer(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "writer%d: done", param); + thread_count--; + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + } + return 0; +} + + +int main(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error, pagen; + + MY_INIT(argv[0]); + +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace"; +#else + default_dbug_option= "d:t:i:O,/tmp/test_pagecache_consist.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + { + DBUG_ENTER("main"); + DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name())); + plan(number_of_writers + number_of_readers); + SKIP_BIG_TESTS(number_of_writers + number_of_readers) + { + + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag( "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + pagecache_file_init(file1, &dummy_callback, &dummy_callback, + &dummy_fail_callback, &dummy_callback, NULL); + DBUG_PRINT("info", ("file1: %d", file1.file)); + if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME))) + exit(1); + my_pwrite(file1.file, (const uchar*) "test file", 9, 0, MYF(0)); + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { + diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + diag( "LOCK_thread_count: %d 
from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + + if ((error= pthread_attr_init(&thr_attr))) + { + diag("Got error: %d from pthread_attr_init (errno: %d)\n", + error,errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + diag( + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error,errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + VOID(thr_setconcurrency(2)); +#endif + + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TEST_PAGE_SIZE, 0)) == 0) + { + diag("Got error: init_pagecache() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("Page cache %d pages", pagen)); + { + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + memset(buffr, '\0', TEST_PAGE_SIZE); + pagecache_write(&pagecache, &file1, 0, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + } + pthread_mutex_lock(&LOCK_thread_count); + + while (number_of_readers != 0 || number_of_writers != 0) + { + if (number_of_readers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_readers + number_of_writers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_reader, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_readers--; + } + if (number_of_writers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_writers + number_of_readers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_writer, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_writers--; + } + } + DBUG_PRINT("info", ("Thread started")); + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + /* wait finishing */ + pthread_mutex_lock(&LOCK_thread_count); + while (thread_count) + { + if 
((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count))) + diag("COND_thread_count: %d from pthread_cond_wait\n", error); + } + pthread_mutex_unlock(&LOCK_thread_count); + DBUG_PRINT("info", ("thread ended")); + + end_pagecache(&pagecache, 1); + DBUG_PRINT("info", ("Page cache ended")); + + if (my_close(file1.file, MYF(0)) != 0) + { + diag( "Got error during file1 closing from close() (errno: %d)\n", + errno); + exit(1); + } + my_delete(file1_name, MYF(0)); + + DBUG_PRINT("info", ("file1 (%d) closed", file1.file)); + DBUG_PRINT("info", ("Program end")); + } /* SKIP_BIG_TESTS */ + my_end(0); + + return exit_status(); + } +} diff --git a/storage/maria/unittest/ma_pagecache_single.c b/storage/maria/unittest/ma_pagecache_single.c new file mode 100644 index 00000000000..32e588e165a --- /dev/null +++ b/storage/maria/unittest/ma_pagecache_single.c @@ -0,0 +1,853 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in + my_atomic-t.c (see BUG#22320). + Use diag() instead of fprintf(stderr). 
+*/ +#include <tap.h> +#include <my_sys.h> +#include <m_string.h> +#include "test_file.h" +#include <tap.h> + +#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*10) + +#ifndef DBUG_OFF +static const char* default_dbug_option; +#endif + +#ifndef BIG +#undef SKIP_BIG_TESTS +#define SKIP_BIG_TESTS(X) /* no-op */ +#endif + +static char *file1_name= (char*)"page_cache_test_file_1"; +static char *file2_name= (char*)"page_cache_test_file_2"; +static PAGECACHE_FILE file1; +static pthread_cond_t COND_thread_count; +static pthread_mutex_t LOCK_thread_count; +static uint thread_count; +static PAGECACHE pagecache; + +/* + File contance descriptors +*/ +static struct file_desc simple_read_write_test_file[]= +{ + { TEST_PAGE_SIZE, '\1'}, + {0, 0} +}; +static struct file_desc simple_read_change_write_read_test_file[]= +{ + { TEST_PAGE_SIZE/2, '\65'}, + { TEST_PAGE_SIZE/2, '\1'}, + {0, 0} +}; +static struct file_desc simple_pin_test_file1[]= +{ + { TEST_PAGE_SIZE*2, '\1'}, + {0, 0} +}; +static struct file_desc simple_pin_test_file2[]= +{ + { TEST_PAGE_SIZE/2, '\1'}, + { TEST_PAGE_SIZE/2, (unsigned char)129}, + { TEST_PAGE_SIZE, '\1'}, + {0, 0} +}; +static struct file_desc simple_pin_no_lock_test_file1[]= +{ + { TEST_PAGE_SIZE, '\4'}, + {0, 0} +}; +static struct file_desc simple_pin_no_lock_test_file2[]= +{ + { TEST_PAGE_SIZE, '\5'}, + {0, 0} +}; +static struct file_desc simple_pin_no_lock_test_file3[]= +{ + { TEST_PAGE_SIZE, '\6'}, + {0, 0} +}; +static struct file_desc simple_delete_forget_test_file[]= +{ + { TEST_PAGE_SIZE, '\1'}, + {0, 0} +}; +static struct file_desc simple_delete_flush_test_file[]= +{ + { TEST_PAGE_SIZE, '\2'}, + {0, 0} +}; + + +/** + @brief Dummy pagecache callback. +*/ + +static my_bool +dummy_callback(uchar *page __attribute__((unused)), + pgcache_page_no_t page_no __attribute__((unused)), + uchar* data_ptr __attribute__((unused))) +{ + return 0; +} + + +/** + @brief Dummy pagecache callback. 
+*/ + +static void +dummy_fail_callback(uchar* data_ptr __attribute__((unused))) +{ + return; +} + + +/* + Recreate and reopen a file for test + + SYNOPSIS + reset_file() + file File to reset + file_name Path (and name) of file which should be reset +*/ + +void reset_file(PAGECACHE_FILE *file, const char *file_name) +{ + flush_pagecache_blocks(&pagecache, file, FLUSH_RELEASE); + if (my_close(file->file, MYF(MY_WME))) + exit(1); + my_delete(file_name, MYF(MY_WME)); + if ((file->file= my_open(file_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag("Got error during %s creation from open() (errno: %d)\n", + file_name, my_errno); + exit(1); + } +} + +/* + Write then read page, check file on disk +*/ + +int simple_read_write_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + int res; + DBUG_ENTER("simple_read_write_test"); + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + pagecache_read(&pagecache, &file1, 0, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + ok((res= test(memcmp(buffr, buffw, TEST_PAGE_SIZE) == 0)), + "Simple write-read page "); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error during flushing pagecache\n"); + exit(1); + } + ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_read_write_test_file))), + "Simple write-read page file"); + if (res) + reset_file(&file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res); +} + + +/* + Prepare page, then read (and lock), change (write new value and unlock), + then check the page in the cache and on the disk +*/ +int simple_read_change_write_read_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + int res, 
res2; + DBUG_ENTER("simple_read_change_write_read_test"); + + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error during flushing pagecache\n"); + exit(1); + } + /* test */ + pagecache_read(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + 0); + bfill(buffw, TEST_PAGE_SIZE/2, '\65'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + + pagecache_read(&pagecache, &file1, 0, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + ok((res= test(memcmp(buffr, buffw, TEST_PAGE_SIZE) == 0)), + "Simple read-change-write-read page "); + DBUG_ASSERT(pagecache.blocks_changed == 1); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error during flushing pagecache\n"); + exit(1); + } + DBUG_ASSERT(pagecache.blocks_changed == 0); + ok((res2= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_read_change_write_read_test_file))), + "Simple read-change-write-read page file"); + if (res && res2) + reset_file(&file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res && res2); +} + + +/* + Prepare page, read page 0 (and pin) then write page 1 and page 0. + Flush the file (should flush only page 1 and return 1 (page 0 is + still pinned). + Check file on the disk. + Unpin and flush. + Check file on the disk. 
+*/ +int simple_pin_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + int res; + DBUG_ENTER("simple_pin_test"); + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* test */ + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error during flushing pagecache\n"); + exit(1); + } + pagecache_read(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + 0); + pagecache_write(&pagecache, &file1, 1, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + bfill(buffw + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE/2, ((unsigned char) 129)); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* + We have to get error because one page of the file is pinned, + other page should be flushed + */ + if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Did not get error in flush_pagecache_blocks\n"); + res= 0; + goto err; + } + ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE*2, + simple_pin_test_file1))), + "Simple pin page file with pin"); + pagecache_unlock(&pagecache, + &file1, + 0, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, + 0, 0, 0); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error in flush_pagecache_blocks\n"); + res= 0; + goto err; + } + ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE, + simple_pin_test_file2))), + "Simple pin page result file"); + if (res) + reset_file(&file1, file1_name); +err: + free(buffw); + DBUG_RETURN(res); +} + +/* + Prepare page, read page 0 
(and pin) then write page 1 and page 0. + Flush the file (should flush only page 1 and return 1 (page 0 is + still pinned). + Check file on the disk. + Unpin and flush. + Check file on the disk. +*/ +int simple_pin_test2() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + int res; + DBUG_ENTER("simple_pin_test2"); + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* test */ + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error during flushing pagecache\n"); + exit(1); + } + pagecache_read(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + 0); + pagecache_write(&pagecache, &file1, 1, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + bfill(buffw + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE/2, ((unsigned char) 129)); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_TO_READ, + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* + We have to get error because one page of the file is pinned, + other page should be flushed + */ + if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY)) + { + diag("Did not get error in flush_pagecache_blocks 2\n"); + res= 0; + goto err; + } + ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE*2, + simple_pin_test_file1))), + "Simple pin page file with pin 2"); + + /* Test that a normal flush goes through */ + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error in flush_pagecache_blocks 3\n"); + res= 0; + goto err; + } + pagecache_unlock(&pagecache, + &file1, + 0, + PAGECACHE_LOCK_READ_UNLOCK, + PAGECACHE_UNPIN, + 0, 0, 0); + if 
(flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error in flush_pagecache_blocks 4\n"); + res= 0; + goto err; + } + ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE, + simple_pin_test_file2))), + "Simple pin page result file 2"); + if (res) + reset_file(&file1, file1_name); +err: + free(buffw); + DBUG_RETURN(res); +} + +/* + Checks pins without lock. +*/ +int simple_pin_no_lock_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + PAGECACHE_BLOCK_LINK *link; + int res; + DBUG_ENTER("simple_pin_no_lock_test"); + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\4'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* test */ + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error during flushing pagecache 2\n"); + exit(1); + } + bfill(buffw, TEST_PAGE_SIZE, '\5'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* + We have to get error because one page of the file is pinned, + other page should be flushed + */ + if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY)) + { + diag("Did not get error in flush_pagecache_blocks 2\n"); + res= 0; + goto err; + } + ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_pin_no_lock_test_file1))), + "Simple pin (no lock) page file with pin 2"); + pagecache_unlock(&pagecache, + &file1, + 0, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_UNPIN, + 0, 0, 0); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error in flush_pagecache_blocks 2\n"); + res= 0; + goto err; + } + ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_pin_no_lock_test_file2))), + "Simple pin 
(no lock) page result file 2"); + + bfill(buffw, TEST_PAGE_SIZE, '\6'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, + &link, LSN_IMPOSSIBLE); + pagecache_unlock_by_link(&pagecache, link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_PIN_LEFT_PINNED, 0, 0, 1, FALSE); + if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY)) + { + diag("Did not get error in flush_pagecache_blocks 3\n"); + res= 0; + goto err; + } + ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_pin_no_lock_test_file2))), + "Simple pin (no lock) page file with pin 3"); + pagecache_unpin_by_link(&pagecache, link, 0); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error in flush_pagecache_blocks 3\n"); + res= 0; + goto err; + } + ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_pin_no_lock_test_file3))), + "Simple pin (no lock) page result file 3"); + if (res) + reset_file(&file1, file1_name); +err: + free(buffw); + DBUG_RETURN(res); +} +/* + Prepare page, write new value, then delete page from cache without flush, + on the disk should be page with old content written during preparation +*/ + +int simple_delete_forget_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + int res; + DBUG_ENTER("simple_delete_forget_test"); + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + /* test */ + bfill(buffw, TEST_PAGE_SIZE, '\2'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 
0, LSN_IMPOSSIBLE); + pagecache_delete(&pagecache, &file1, 0, + PAGECACHE_LOCK_WRITE, 0); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_delete_forget_test_file))), + "Simple delete-forget page file"); + if (res) + reset_file(&file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res); +} + +/* + Prepare page with locking, write new content to the page, + delete page with flush and on existing lock, + check that page on disk contain new value. +*/ + +int simple_delete_flush_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + PAGECACHE_BLOCK_LINK *link; + int res; + DBUG_ENTER("simple_delete_flush_test"); + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, + &link, LSN_IMPOSSIBLE); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + /* test */ + bfill(buffw, TEST_PAGE_SIZE, '\2'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + if (pagecache_delete_by_link(&pagecache, link, + PAGECACHE_LOCK_LEFT_WRITELOCKED, 1)) + { + diag("simple_delete_flush_test: error during delete"); + exit(1); + } + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_delete_flush_test_file))), + "Simple delete flush (link) page file"); + if (res) + reset_file(&file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res); +} + + +/* + write then read file bigger then cache +*/ + +int simple_big_test() +{ + unsigned char *buffw= (unsigned char *) my_malloc(TEST_PAGE_SIZE, MYF(MY_WME)); + unsigned char *buffr= (unsigned char 
*) my_malloc(TEST_PAGE_SIZE, MYF(MY_WME)); + struct file_desc *desc= ((struct file_desc *) + my_malloc((PCACHE_SIZE/(TEST_PAGE_SIZE/2) + 1) * + sizeof(struct file_desc), MYF(MY_WME))); + int res, i; + DBUG_ENTER("simple_big_test"); + + /* prepare the file twice larger then cache */ + for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE/2); i++) + { + bfill(buffw, TEST_PAGE_SIZE, (unsigned char) (i & 0xff)); + desc[i].length= TEST_PAGE_SIZE; + desc[i].content= (i & 0xff); + pagecache_write(&pagecache, &file1, i, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + } + desc[i].length= 0; + desc[i].content= '\0'; + ok(1, "Simple big file write"); + /* check written pages sequentally read */ + for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE/2); i++) + { + int j; + pagecache_read(&pagecache, &file1, i, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + for(j= 0; j < TEST_PAGE_SIZE; j++) + { + if (buffr[j] != (i & 0xff)) + { + diag("simple_big_test seq: page %u byte %u mismatch\n", i, j); + res= 0; + goto err; + } + } + } + ok(1, "Simple big file sequential read"); + /* chack random reads */ + for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE); i++) + { + int j, page; + page= rand() % (PCACHE_SIZE/(TEST_PAGE_SIZE/2)); + pagecache_read(&pagecache, &file1, page, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + for(j= 0; j < TEST_PAGE_SIZE; j++) + { + if (buffr[j] != (page & 0xff)) + { + diag("simple_big_test rnd: page %u byte %u mismatch\n", page, j); + res= 0; + goto err; + } + } + } + ok(1, "Simple big file random read"); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + + ok((res= test(test_file(file1, file1_name, PCACHE_SIZE*2, TEST_PAGE_SIZE, + desc))), + "Simple big file"); + if (res) + reset_file(&file1, file1_name); + +err: + my_free(buffw, 0); + my_free(buffr, 0); + my_free(desc, 0); + DBUG_RETURN(res); +} + + +/* + Thread 
function +*/ + +static void *test_thread(void *arg) +{ +#ifndef DBUG_OFF + int param= *((int*) arg); +#endif + + my_thread_init(); + { + DBUG_ENTER("test_thread"); + DBUG_PRINT("enter", ("param: %d", param)); + + if (!simple_read_write_test() || + !simple_read_change_write_read_test() || + !simple_pin_test() || + !simple_pin_test2() || + !simple_pin_no_lock_test() || + !simple_delete_forget_test() || + !simple_delete_flush_test()) + exit(1); + + SKIP_BIG_TESTS(4) + { + if (!simple_big_test()) + exit(1); + } + + DBUG_PRINT("info", ("Thread %s ended\n", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + thread_count--; + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + DBUG_RETURN(0); + } +} + + +int main(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error, pagen; + File tmp_file; + MY_INIT(argv[0]); + +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\test_pagecache_single.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/test_pagecache_single.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + { + DBUG_ENTER("main"); + DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name())); + + plan(18); + SKIP_BIG_TESTS(18) + { + + if ((tmp_file= my_open(file2_name, O_CREAT | O_TRUNC | O_RDWR, + MYF(MY_WME))) < 0) + exit(1); + + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + pagecache_file_init(file1, &dummy_callback, &dummy_callback, + &dummy_fail_callback, &dummy_callback, NULL); + my_close(tmp_file, MYF(0)); + my_delete(file2_name, MYF(0)); + + DBUG_PRINT("info", ("file1: %d", file1.file)); + if (my_chmod(file1_name, 
S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME))) + exit(1); + my_pwrite(file1.file, (const uchar*)"test file", 9, 0, MYF(MY_WME)); + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { + fprintf(stderr, "Got error: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + fprintf(stderr, "Got error: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + + if ((error= pthread_attr_init(&thr_attr))) + { + fprintf(stderr,"Got error: %d from pthread_attr_init (errno: %d)\n", + error,errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + fprintf(stderr, + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error,errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + VOID(thr_setconcurrency(2)); +#endif + + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TEST_PAGE_SIZE, MYF(MY_WME))) == 0) + { + fprintf(stderr,"Got error: init_pagecache() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("Page cache %d pages", pagen)); + + pthread_mutex_lock(&LOCK_thread_count); + param=(int*) malloc(sizeof(int)); + *param= 1; + if ((error= pthread_create(&tid, &thr_attr, test_thread, (void*) param))) + { + fprintf(stderr,"Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + DBUG_PRINT("info", ("Thread started")); + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + pthread_mutex_lock(&LOCK_thread_count); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count))) + fprintf(stderr,"Got error: %d from pthread_cond_wait\n",error); + } + pthread_mutex_unlock(&LOCK_thread_count); + DBUG_PRINT("info", ("thread ended")); + + end_pagecache(&pagecache, 1); + DBUG_PRINT("info", ("Page cache ended")); + + if (my_close(file1.file, MYF(MY_WME))) + exit(1); + + 
my_delete(file1_name, MYF(0)); + + } /* SKIP_BIG_TESTS */ + DBUG_PRINT("info", ("file1 (%d) closed", file1.file)); + DBUG_PRINT("info", ("Program end")); + + my_end(0); + + } + return exit_status(); +} diff --git a/storage/maria/unittest/ma_test_all-t b/storage/maria/unittest/ma_test_all-t new file mode 100755 index 00000000000..0b11daf7f98 --- /dev/null +++ b/storage/maria/unittest/ma_test_all-t @@ -0,0 +1,710 @@ +#!/usr/bin/env perl +# +# Run various unit tests. +# + +use Getopt::Long; +use File::Basename; + +$|= 1; +$^W = 1; # warnings, because env cannot parse 'perl -w' +$VER= "1.4"; + +$opt_version= 0; +$opt_help= 0; +$opt_verbose= 0; +$opt_abort_on_error= 0; +$opt_valgrind= "valgrind --alignment=8 --leak-check=yes"; +$opt_silent= "-s"; +$opt_number_of_tests= 0; +$opt_run_tests= undef(); + +my $maria_path; # path to "storage/maria" +my $maria_exe_path; # path to executables (ma_test1, aria_chk etc) +my $my_progname= $0; +$my_progname=~ s/.*[\/]//; +my $runtime_error= 0; # Return 1 if error(s) occur during run +my $NEW_TEST= 0; # Test group separator in an array of tests +my $test_begin= 0; +my $test_end= 0; +my $test_counter= 0; + +run_tests(); + +#### +#### Initialise variables, clean temporary files and run the tests +#### + +sub run_tests +{ + my $nr_tests= 0; + my $flag_exit= 0; + + if (!GetOptions("help" => \$opt_help, + "version" => \$opt_version, + "verbose" => \$opt_verbose, + "abort-on-error" => \$opt_abort_on_error, + "valgrind=s" => \$opt_valgrind, + "silent=s" => \$opt_silent, + "number-of-tests" => \$opt_number_of_tests, + "run-tests=s" => \$opt_run_tests, + "start-from=s" => \$opt_run_tests)) + { + $flag_exit= 1; + } + if ($opt_version) + { + print "$my_progname version $VER\n"; + exit(0); + } + $maria_path= dirname($0) . "/.."; + + my $suffix= ( $^O =~ /win/i && $^O !~ /darwin/i ) ? ".exe" : ""; + $maria_exe_path= "$maria_path/release"; + # we use -f, sometimes -x is unexpectedly false in Cygwin + if ( ! 
-f "$maria_exe_path/ma_test1$suffix" ) + { + $maria_exe_path= "$maria_path/relwithdebinfo"; + if ( ! -f "$maria_exe_path/ma_test1$suffix" ) + { + $maria_exe_path= "$maria_path/debug"; + if ( ! -f "$maria_exe_path/ma_test1$suffix" ) + { + $maria_exe_path= $maria_path; + if ( ! -f "$maria_exe_path/ma_test1$suffix" ) + { + die("Cannot find ma_test1 executable\n"); + } + } + } + } + + usage() if ($opt_help || $flag_exit); + + # + # IMPORTANT: If you modify this file, please read this: + # + # Count total number of tests. Make sure that the functions return + # number of unit tests correctly, e.g. calls to ok(). The last argument + # for each function is a flag counter and will return the number of + # unit tests in each. Please see comments on function ok() at the end. + # + # If you modify any functions or add any new ones, please make sure the + # unit tests are appropriately detected here. A wrong count will + # make the unit test fail during 'make test'. $nr_tests must be right. + # + + $nr_tests+= run_check_tests(0, 0, 0, 0, 1) * 5; # + $nr_tests+= run_repair_tests(0, 0, 0, 0, 1) * 5; # called 4 times + $nr_tests+= run_pack_tests(0, 0, 0, 0, 1) * 5; # + $nr_tests+= run_tests_on_warnings_and_errors(0, 0, 0, 1); + $nr_tests+= run_ma_test_recovery(0, 1); + $nr_tests+= run_tests_on_clrs(0, 0, 1); + + if ($opt_number_of_tests) + { + print "Total number of tests is $nr_tests\n"; + exit(0); + } + + if (defined($opt_run_tests)) + { + if ($opt_run_tests =~ m/^(\d+)$/ || + $opt_run_tests =~ m/^(\d+)\.+$/) + { + $test_begin= $1; + } + elsif ($opt_run_tests =~ m/^(\d+)\.+(\d+)$/) + { + $test_begin= $1; + $test_end= $2; + } + else + { + print "Wrong syntax for option --run-tests=$opt_run_tests\n"; + print "Please use --run-tests=<begin>..<end>\nwhere 'begin' is the "; + print "first test to be run and 'end' is the last.\n"; + exit(1); + } + if ($test_end > $nr_tests) + { + print "Test range ($test_begin..$test_end) out of range. 
"; + print "There are only $nr_tests in the test suite.\n"; + exit(1); + } + $test_begin++ if (!$test_begin); # Handle zero, if user gave that + if ($test_end && $test_begin > $test_end) + { + print "Bad test range ($test_begin..$test_end)\n"; + exit(1); + } + # Now adjust number of tests + $nr_tests= ($test_end ? $test_end : $nr_tests) - $test_begin + 1; + } + + # + # clean-up + # + + unlink <*.TMD aria_log*>; # Delete temporary files + + # + # Run tests + # + + if (!$opt_verbose) + { + print "1..$nr_tests\n"; + } + else + { + print "Total tests: $nr_tests\n"; + } + + if ($opt_verbose) + { + print "Running tests with dynamic row format\n" + } + run_check_tests($suffix, $opt_silent, "", $opt_verbose, 0); + run_repair_tests($suffix, $opt_silent, "", $opt_verbose, 0); + run_pack_tests($suffix, $opt_silent, "", $opt_verbose, 0); + + if ($opt_verbose) + { + print "\nRunning tests with static row format\n"; + } + run_check_tests($suffix, $opt_silent, "-S", $opt_verbose, 0); + run_repair_tests($suffix, $opt_silent, "-S", $opt_verbose, 0); + run_pack_tests($suffix, $opt_silent, "-S", $opt_verbose, 0); + + if ($opt_verbose) + { + print "\nRunning tests with block row format\n"; + } + run_check_tests($suffix, $opt_silent, "-M", $opt_verbose, 0); + run_repair_tests($suffix, $opt_silent, "-M", $opt_verbose, 0); + run_pack_tests($suffix, $opt_silent, "-M", $opt_verbose, 0); + + if ($opt_verbose) + { + print "\nRunning tests with block row format and transactions\n"; + } + run_check_tests($suffix, $opt_silent, "-M -T", $opt_verbose, 0); + run_repair_tests($suffix, $opt_silent, "-M -T", $opt_verbose, 0); + run_pack_tests($suffix, $opt_silent, "-M -T", $opt_verbose, 0); + + if ($opt_verbose) + { + print "\nRunning tests with block row format, transactions and versioning\n"; + } + run_check_tests($suffix, $opt_silent, "-M -T -C", $opt_verbose, 0); + run_repair_tests($suffix, $opt_silent, "-M -T -C", $opt_verbose, 0); + run_pack_tests($suffix, $opt_silent, "-M -T -C", $opt_verbose, 
0);


  if ($opt_verbose)
  {
    print "\nRunning tests with warnings and recovery\n";
  }
  run_tests_on_warnings_and_errors($suffix, $opt_silent, $opt_verbose, 0);
  run_ma_test_recovery($opt_verbose, 0);
  run_tests_on_clrs($suffix, $opt_verbose, 0);

  exit($runtime_error);
}

####
#### regular tests
####
#### Runs ma_test1, ma_test2 and ma_rt_test with each option set below and
#### checks the resulting table with aria_chk.
#### Arguments: $suffix:   appended to every executable name
####            $silent:   silent option passed to the ma_test* programs
####            $row_type: row format option passed to the ma_test* programs
####            $verbose:  passed to ok()
####            $count:    if set, run nothing and return the number of
####                       ok() calls this function would make
####

sub run_check_tests
{
  my ($suffix, $silent, $row_type, $verbose, $count)= @_;
  my ($i);
  # Each entry is [ options for the test program, options for aria_chk ]
  my @ma_test1_opt= ( ["","-se"],
                      ["-N","-se"],
                      ["-P --checksum","-se"],
                      ["-P -N","-se"],
                      ["-B -N -R2","-sm"],
                      ["-a -k 480 --unique","-sm"],
                      ["-a -N -R1 ","-sm"],
                      ["-p","-sm"],
                      ["-p -N --unique","-sm"],
                      ["-p -N --key_length=127 --checksum","-sm"],
                      ["-p -N --key_length=128","-sm"],
                      ["-p --key_length=480","-sm"],
                      ["-a -B","-sm"],
                      ["-a -B --key_length=64 --unique","-sm"],
                      ["-a -B -k 480 --checksum","-sm"],
                      ["-a -B -k 480 -N --unique --checksum","-sm"],
                      ["-a -m","-sm"],
                      ["-a -m -P --unique --checksum","-sm"],
                      ["-a -m -P --key_length=480 --key_cache","-sm"],
                      ["-m -p","-sm"],
                      ["-w --unique","-sm"],
                      ["-a -w --key_length=64 --checksum","-sm"],
                      ["-a -w -N --key_length=480","-sm"],
                      ["-a -w --key_length=480 --checksum","-sm"],
                      ["-a -b -N","-sm"],
                      ["-a -b --key_length=480","-sm"],
                      ["-p -B --key_length=480","-sm"],
                      ["--checksum --unique","-se"],
                      ["--unique","-se"],
                      ["--key_multiple -N -S","-sm"],
                      ["--key_multiple -a -p --key_length=480","-sm"],
                      ["--key_multiple -a -B --key_length=480","-sm"],
                      ["--key_multiple -P -S","-sm"] );
  my @ma_test2_opt= ( ["-L -K -W -P","-sm"],
                      ["-L -K -W -P -A","-sm"],
                      ["-L -K -W -P -b32768", "-sm"],
                      ["-L -K -W -P -M -T -c -b32768 -t4 -m300", "-sm"],
                      ["-L -K -P -R3 -m50 -b1000000", "-sm"],
                      ["-L -B","-sm"],
                      ["-D -B -c","-sm"],
                      ["-m10000 -e4096 -K","-sm"],
                      ["-m10000 -e8192 -K","-sm"],
                      ["-m10000 -e16384 -E16384 -K -L","-sm"],
                      ["-L -K -W -P -b32768", "-se"],
                      ["-c -b65000","-se"] );
  my @ma_rt_test_opt= ( ); # (["--checksum", "-se"] );


  if ($count)
  {
    # Two ok() calls per option set, plus the two calls outside the loops
    return 2 + 2 * (scalar(@ma_test1_opt) + scalar(@ma_test2_opt) +
                    scalar(@ma_rt_test_opt));
  }

  for ($i= 0; $i < scalar(@ma_test1_opt); $i++)
  {
    unlink <aria_log_control aria_log.*>;
    ok("$maria_exe_path/ma_test1$suffix $silent $ma_test1_opt[$i][0] $row_type",
       $verbose, $i + 1);
    ok("$maria_exe_path/aria_chk$suffix $ma_test1_opt[$i][1] test1",
       $verbose, $i + 1);
  }
  #
  # These tests are outside the loops. Make sure to include them in
  # the count returned above manually
  #
  ok("$maria_exe_path/aria_pack$suffix --force -s test1", $verbose, 0);
  ok("$maria_exe_path/aria_chk$suffix -ess test1", $verbose, 0);

  for ($i= 0; $i < scalar(@ma_test2_opt); $i++)
  {
    unlink <aria_log_control aria_log.*>;
    ok("$maria_exe_path/ma_test2$suffix $silent $ma_test2_opt[$i][0] $row_type",
       $verbose, $i + 1);
    ok("$maria_exe_path/aria_chk$suffix $ma_test2_opt[$i][1] test2",
       $verbose, $i + 1);
  }

  for ($i= 0; $i < scalar(@ma_rt_test_opt); $i++)
  {
    unlink <aria_log_control aria_log.*>;
    ok("$maria_exe_path/ma_rt_test$suffix $silent $ma_rt_test_opt[$i][0] $row_type",
       $verbose, $i + 1);
    ok("$maria_exe_path/aria_chk$suffix $ma_rt_test_opt[$i][1] rt_test",
       $verbose, $i + 1);
  }

  unlink <aria_log_control aria_log.*>;

  return 0;
}

####
#### repair tests
####
#### Arguments are the same as for run_check_tests(). With $count set the
#### number of commands is returned instead of running them.
####

sub run_repair_tests
{
  my ($suffix, $silent, $row_type, $verbose, $count)= @_;

  my @t= ($NEW_TEST,
          "$maria_exe_path/ma_test1$suffix $silent --checksum $row_type",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix --silent -re --transaction-log test1",
          "$maria_exe_path/aria_chk$suffix -rs test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -rqs test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -rs --correct-checksum test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -rqs --correct-checksum test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -ros --correct-checksum test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -rqos --correct-checksum test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -sz test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/ma_test2$suffix $silent -c -d1 $row_type",
          "$maria_exe_path/aria_chk$suffix -s --parallel-recover test2",
          "$maria_exe_path/aria_chk$suffix -se test2",
          "$maria_exe_path/aria_chk$suffix -s --parallel-recover --quick test2",
          "$maria_exe_path/aria_chk$suffix -se test2",
          "$maria_exe_path/ma_test2$suffix $silent -c $row_type",
          "$maria_exe_path/aria_chk$suffix -se test2",
          "$maria_exe_path/aria_chk$suffix -sr test2",
          "$maria_exe_path/aria_chk$suffix -se test2",
          "$maria_exe_path/ma_test2$suffix $silent -c -t4 -b32768 $row_type",
          "$maria_exe_path/aria_chk$suffix -s --zerofill test1",
          "$maria_exe_path/aria_chk$suffix -se test1"
         );

  return &count_tests(\@t) if ($count);
  &run_test_bunch(\@t, $verbose, 0);
  return 0;
}

####
#### pack tests
####
#### Arguments are the same as for run_check_tests(). With $count set the
#### number of commands is returned instead of running them.
####

sub run_pack_tests
{
  my ($suffix, $silent, $row_type, $verbose, $count)= @_;

  my @t= ($NEW_TEST,
          "$maria_exe_path/ma_test1$suffix $silent --checksum $row_type",
          "$maria_exe_path/aria_pack$suffix --force -s test1",
          "$maria_exe_path/aria_chk$suffix -ess test1",
          "$maria_exe_path/aria_chk$suffix -rqs test1",
          "$maria_exe_path/aria_chk$suffix -es test1",
          "$maria_exe_path/aria_chk$suffix -rs test1",
          "$maria_exe_path/aria_chk$suffix -es test1",
          "$maria_exe_path/aria_chk$suffix -rus test1",
          "$maria_exe_path/aria_chk$suffix -es test1",
          $NEW_TEST,
          "$maria_exe_path/ma_test1$suffix $silent --checksum $row_type",
          "$maria_exe_path/aria_pack$suffix --force -s test1",
          "$maria_exe_path/aria_chk$suffix -rus --safe-recover test1",
          "$maria_exe_path/aria_chk$suffix -es test1",
          $NEW_TEST,
          "$maria_exe_path/ma_test1$suffix $silent --checksum -S $row_type",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -ros test1",
          "$maria_exe_path/aria_chk$suffix -rqs test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          $NEW_TEST,
          "$maria_exe_path/aria_pack$suffix --force -s test1",
          "$maria_exe_path/aria_chk$suffix -rqs test1",
          "$maria_exe_path/aria_chk$suffix -es test1",
          "$maria_exe_path/aria_chk$suffix -rus test1",
          "$maria_exe_path/aria_chk$suffix -es test1",
          $NEW_TEST,
          "$maria_exe_path/ma_test2$suffix $silent -c -d1 $row_type",
          "$maria_exe_path/aria_chk$suffix -s --parallel-recover test2",
          "$maria_exe_path/aria_chk$suffix -se test2",
          "$maria_exe_path/aria_chk$suffix -s --unpack --parallel-recover test2",
          "$maria_exe_path/aria_chk$suffix -se test2",
          # NOTE(review): packs test1 although the surrounding checks use
          # test2 - confirm this is intended
          "$maria_exe_path/aria_pack$suffix --force -s test1",
          "$maria_exe_path/aria_chk$suffix -s --unpack --parallel-recover test2",
          "$maria_exe_path/aria_chk$suffix -se test2",
          $NEW_TEST,
          "$maria_exe_path/ma_test1$suffix $silent -c $row_type",
          "cp test1.MAD test2.MAD",
          "cp test1.MAI test2.MAI",
          "$maria_exe_path/aria_pack$suffix --force -s --join=test3 test1 test2",
          "$maria_exe_path/aria_chk$suffix -s test3",
          "$maria_exe_path/aria_chk$suffix -s --safe-recover test3",
          "$maria_exe_path/aria_chk$suffix -s test3"
         );

  return &count_tests(\@t) if ($count);
  &run_test_bunch(\@t, $verbose, 0);
  return 0;
}

####
#### Tests that gives warnings or errors
####
#### Arguments: $suffix:  appended to every executable name
####            $silent:  silent option passed to ma_test2
####            $verbose: passed to ok()
####            $count:   if set, run nothing and return the number of tests
####

sub run_tests_on_warnings_and_errors
{
  my ($suffix, $silent, $verbose, $count)= @_;
  my ($com);

  return 9 if ($count);  # Number of tests in this function, e.g. calls to ok()

  ok("$maria_exe_path/ma_test2$suffix $silent -L -K -W -P -S -R1 -m500",
     $verbose, 0);
  ok("$maria_exe_path/aria_chk$suffix -sm test2", $verbose, 0);
  # ma_test2$suffix $silent -L -K -R1 -m2000 ;  Should give error 135\n
  # In the following a failure is a success and success is a failure
  $com=  "$maria_exe_path/ma_test2$suffix $silent -L -K -R1 -m2000 ";
  $com.= ">ma_test2_message.txt 2>&1";
  ok($com, $verbose, 0, 1);
  ok("cat ma_test2_message.txt", $verbose, 0);
  ok("grep \"Error: 135\" ma_test2_message.txt > /dev/null", $verbose, 0);
  # maria_exe_path/aria_chk$suffix -sm test2 will warn that
  # Datafile is almost full
  ok("$maria_exe_path/aria_chk$suffix -sm test2 >ma_test2_message.txt 2>&1",
     $verbose, 0);
  ok("cat ma_test2_message.txt", $verbose, 0);
  ok("grep \"warning: Datafile is almost full\" ma_test2_message.txt>/dev/null",
     $verbose, 0);
  unlink <ma_test2_message.txt>;
  ok("$maria_exe_path/aria_chk$suffix -ssm test2", $verbose, 0);

  return 0;
}

####
#### Test that removing tables and applying the log leads to identical tables
####
#### Arguments: $verbose: passed to ok()
####            $count:   if set, run nothing and return the number of tests
####

sub run_ma_test_recovery
{
  my ($verbose, $count)= @_;

  return 1 if ($count);                # Number of tests in this function
  ok("$maria_path/unittest/ma_test_recovery.pl", $verbose, 0);
  return 0;
}

####
#### Tests on CLR's
####
#### Arguments: $suffix:  appended to every executable name
####            $verbose: passed to ok()
####            $count:   if set, run nothing and return the number of tests
####

sub run_tests_on_clrs
{
  my ($suffix, $verbose, $count)= @_;

  my @t= ($NEW_TEST,
          "$maria_exe_path/ma_test2$suffix -s -L -K -W -P -M -T -c -b -t2 -A1",
          "cp aria_log_control tmp",
          "$maria_exe_path/aria_read_log$suffix -a -s",
          "$maria_exe_path/aria_chk$suffix -s -e test2",
          "cp tmp/aria_log_control .",
          "rm test2.MA?",
          "$maria_exe_path/aria_read_log$suffix -a -s",
          "$maria_exe_path/aria_chk$suffix -s -e test2",
          "rm test2.MA?",
          $NEW_TEST,
          "$maria_exe_path/ma_test2$suffix -s -L -K -W -P -M -T -c -b -t2 -A1",
          "$maria_exe_path/aria_read_log$suffix -a -s",
          "$maria_exe_path/aria_chk$suffix -s -e test2",
          "rm test2.MA?",
          "$maria_exe_path/aria_read_log$suffix -a -s",
          "$maria_exe_path/aria_chk$suffix -e -s test2",
          "rm test2.MA?",
          $NEW_TEST,
          "$maria_exe_path/ma_test2$suffix -s -L -K -W -P -M -T -c -b32768 -t4 -A1",
          "$maria_exe_path/aria_read_log$suffix -a -s",
          "$maria_exe_path/aria_chk$suffix -es test2",
          "$maria_exe_path/aria_read_log$suffix -a -s",
          "$maria_exe_path/aria_chk$suffix -es test2",
          "rm test2.MA?",
          "$maria_exe_path/aria_read_log$suffix -a -s",
          "$maria_exe_path/aria_chk$suffix -es test2",
          "rm test2.MA?"
         );

  return &count_tests(\@t) if ($count);
  # Last argument set: log files are removed at every $NEW_TEST marker
  &run_test_bunch(\@t, $verbose, 1);
  return 0;
}

#
# Print "ok" on success and "not ok" on error
#
# Note: Every time this function is called it will be counted
#       as a unit test.
#
# Args: $com:            The actual command run. Will be printed on a failure
#       $verbose:        Be more verbose.
#       $iteration:      Number of iterations in a loop when the error
#                        occurred. If not in loop, this should be blank
#                        (e.g. send zero).
#       $expected_error: Optional; put here expected error code. Test
#                        will pass with this result only.
#
# Return value: Will return 1 on success and 0 on an error
#               (0 is also returned, without running anything, when the
#               test number is below the requested start of the range)
#

sub ok
{
  my ($com, $verbose, $iteration, $expected_error)= @_;
  my ($msg, $output, $err, $len);

  $test_counter++;
  if ($test_begin > $test_counter)
  {
    return 0;
  }
  if ($test_end && $test_end < $test_counter)
  {
    # Past the requested range: stop here, but keep failures that were
    # already recorded visible in the exit status
    exit($runtime_error ? 1 : 0);
  }

  $msg= "";
  $expected_error= 0 if (!defined($expected_error));

  if ($verbose)
  {
    print "$com ";
  }
  $output= `$com 2>&1`;
  $err= $?;                     # Save the status before anything else runs
  $len= length($com);
  if ($verbose)
  {
    # Pad so that the result column lines up; a negative count prints nothing
    print " " x (62 - $len);
  }
  if ((!$err && !$expected_error) ||
      (($err >> 8) == $expected_error && $expected_error))
  {
    print "[ " if ($verbose);
    print "ok";
    if ($verbose)
    {
      print " ]";
      print " " x (5 - length("$test_counter"));
      print "$test_counter";
    }
    else
    {
      print " $test_counter - $com"
    }
    print "\n";
    return 1;
  }
  print "[ " if ($verbose);
  print "not ok";
  print " ]" if ($verbose);
  print " $test_counter - $com" unless $verbose;
  print "\n";
  if ($verbose && defined($output) && length($output))
  {
    print "$output\n";
  }
  if (!$verbose)
  {
    $msg= "\n"; # Get a nicer output in perl unit test mode
  }
  $msg.= "Failed test '$com' ";
  if ($iteration)
  {
    $msg.= "(loop iteration $iteration.) ";
  }
  $msg.= "at line ";
  $msg.= (caller)[2];
  $msg.= "\n(errcode: $err, test: $test_counter)\n";
  if ($expected_error)
  {
    $msg.= "Was expecting errcode: $expected_error\n";
  }
  warn $msg;
  $runtime_error= 1;
  if ($opt_abort_on_error)
  {
    exit 1;
  }
  return 0;
}

#
# Print "skip" and the reason
#
# Note: Every time this function is called it will be counted
#       as a unit test.
#
# Args: $com:     The actual command run. Will be printed on a failure
#       $reason:  The reason to skip a test
#       $verbose: Be more verbose.
#

sub skip
{
  my ($com, $reason, $verbose)= @_;

  $test_counter++;
  return 0 if $test_begin > $test_counter;
  # Past the requested range: stop, but do not hide recorded failures
  exit($runtime_error ? 1 : 0) if $test_end && $test_end < $test_counter;
  printf '%-64s[ skipped ]%5d', $com, $test_counter if $verbose;
  print "ok $test_counter # skip $reason" unless $verbose;
  print "\n";
  return 1;
}

####
#### Count tests
#### Arguments: $t: an array of the tests
#### Returns the number of entries that are true; scanning stops at the
#### first undefined entry.
####

sub count_tests
{
  my ($t)= @_;
  my ($i, $nr_tests);

  $nr_tests= 0;
  for ($i= 0; defined($t->[$i]); $i++) { $nr_tests++ if ($t->[$i]); }
  return $nr_tests;
}

####
#### Run a bunch of tests
#### Arguments: $t:       an array of the tests
####            $verbose: to be passed to ok()
####            $clear:   clear log files if set
####

sub run_test_bunch
{
  my ($t, $verbose, $clear)= @_;
  my ($i);

  for ($i= 0; defined($t->[$i]); $i++)
  {
    # $NEW_TEST entries only mark the start of a group; they are not run
    if ($clear && $t->[$i] eq $NEW_TEST)
    {
      unlink <aria_log.* aria_log_control>;
    }
    if ($t->[$i] ne $NEW_TEST)
    {
      ok($t->[$i], $verbose, $i + 1);
    }
  }
}

####
#### usage
####

sub usage
{
  print <<EOF;
$my_progname version $VER

Description:

Run various Aria related tests. Typically used via make test as a unittest.

Options
--help             Show this help and exit.
--abort-on-error   Abort at once in case of error.
--number-of-tests  Print the total number of tests and exit.
--run-tests=...    Test number(s) that should be run. You can give just
                   one number or a range. For example 45..89. To run a specific
                   test alone, for example test 215, use --run-tests=215..215
                   Use this option with caution, because some of the tests
                   might depend on previous ones.
--start-from=...   Alias for --run-tests
--silent=...       Silent option passed to ma_test* tests ('$opt_silent')
--valgrind=...     Options for valgrind.
                   ('$opt_valgrind')
--verbose          Be more verbose. Will print each unittest on a line
                   and result after. This mode cannot be used with unit.pl
                   when running in normal unit test mode.
--version          Show version number and exit.
EOF
  exit(0);
}
diff --git a/storage/maria/unittest/ma_test_loghandler-t.c b/storage/maria/unittest/ma_test_loghandler-t.c
new file mode 100644
index 00000000000..ffac9b04839
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler-t.c
@@ -0,0 +1,661 @@
/* Copyright (C) 2006-2008 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

#include "../maria_def.h"
#include <stdio.h>
#include <errno.h>
#include <tap.h>
#include "../trnman.h"

extern my_bool maria_log_remove();
extern void example_loghandler_init();

#ifndef DBUG_OFF
static const char *default_dbug_option;
#endif
static TRN *trn= &dummy_transaction_object;

#define PCACHE_SIZE (1024*1024*10)

#define LONG_BUFFER_SIZE (100 * 1024)

#ifdef LONG_LOG_TEST
#define LOG_FLAGS 0
#define LOG_FILE_SIZE (1024L*1024L*8)
#define ITERATIONS (1600*4)

#else
#undef SKIP_BIG_TESTS
#define SKIP_BIG_TESTS(X) /* no-op */
#define LOG_FLAGS (TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC)
#define LOG_FILE_SIZE (1024L*1024L*8L)
#define ITERATIONS 1600
#endif

/*
#define LOG_FLAGS 0
#define LOG_FILE_SIZE 1024L*1024L*1024L
#define ITERATIONS 181000
*/

/*
#define LOG_FLAGS 0
#define LOG_FILE_SIZE 1024L*1024L*3L
#define ITERATIONS 1600
*/

/*
#define LOG_FLAGS 0
#define LOG_FILE_SIZE 1024L*1024L*100L
#define ITERATIONS 65000
*/

/*
  Generate random value in the range
(0,LONG_BUFFER_SIZE) +*/ +static uint32 rand_buffer_size() +{ + return (uint32)((ulonglong)rand()*(LONG_BUFFER_SIZE + 1)/RAND_MAX); +} + +/* + Check that the buffer filled correctly + + SYNOPSIS + check_content() + ptr Pointer to the buffer + length length of the buffer + + RETURN + 0 - OK + 1 - Error +*/ + + +static my_bool check_content(uchar *ptr, ulong length) +{ + ulong i; + uchar buff[2]; + for (i= 0; i < length; i++) + { + if (i % 2 == 0) + int2store(buff, i >> 1); + if (ptr[i] != buff[i % 2]) + { + fprintf(stderr, "Byte # %lu is %x instead of %x", + i, (uint) ptr[i], (uint) buff[i % 2]); + return 1; + } + } + return 0; +} + + +/* + Report OK for read operation + + SYNOPSIS + read_ok() + rec the record header +*/ + +void read_ok(TRANSLOG_HEADER_BUFFER *rec) +{ + ok(1, "read record type: %u LSN: (%lu,0x%lx)", + rec->type, LSN_IN_PARTS(rec->lsn)); +} + +/* + Read whole record content, and check content (put with offset) + + SYNOPSIS + read_and_check_content() + rec The record header buffer + buffer The buffer to read the record in + skip Skip this number of bytes ot the record content + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec, + uchar *buffer, uint skip) +{ + DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE * 2 + 7 * 2 + 2); + if (translog_read_record(rec->lsn, 0, rec->record_length, buffer, NULL) != + rec->record_length) + return 1; + return check_content(buffer + skip, rec->record_length - skip); +} + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint32 i; + uint32 rec_len; + uint pagen; + uchar long_tr_id[6]; + uchar lsn_buff[23]= + { + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55 + }; + uchar long_buffer[LONG_BUFFER_SIZE * 2 + LSN_STORE_SIZE * 2 + 2]; + PAGECACHE pagecache; + LSN lsn, lsn_base, first_lsn; + TRANSLOG_HEADER_BUFFER rec; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 
3]; + struct st_translog_scanner_data scanner; + int rc; + + MY_INIT(argv[0]); + + if (my_set_max_open_files(100) < 100) + { + fprintf(stderr, "can't allocate 100 file descriptors\n"); + exit(1); + } + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= (char *)"."; + if (maria_log_remove()) + exit(1); + + for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i+= 2) + { + int2store(long_buffer + i, (i >> 1)); + /* long_buffer[i]= (i & 0xFF); */ + } + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + plan(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1 + 1); + + SKIP_BIG_TESTS(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1 + 1) + { + + srand(122334817L); + + long_tr_id[5]= 0xff; + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + trn->short_id= 0; + trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + ok(0, 
"write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + lsn_base= first_lsn= lsn; + + for (i= 1; i < ITERATIONS; i++) + { + trn->short_id= i % 0xFFFF; + if (i % 2) + { + lsn_store(lsn_buff, lsn_base); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + /* check auto-count feature */ + parts[TRANSLOG_INTERNAL_PARTS + 1].str= NULL; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= 0; + if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_1LSN_EXAMPLE, trn, + NULL, LSN_STORE_SIZE, 0, parts, NULL, NULL)) + { + fprintf(stderr, "1 Can't write reference defore record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + if ((rec_len= rand_buffer_size()) < 12) + rec_len= 12; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + /* check record length auto-counting */ + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + trn, NULL, 0, TRANSLOG_INTERNAL_PARTS + 2, + parts, NULL, NULL)) + { + fprintf(stderr, "1 Can't write var reference defore record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + } + else + { + lsn_store(lsn_buff, lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 23; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_2LSN_EXAMPLE, + trn, NULL, 23, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "0 Can't write reference defore record #%lu\n", + (ulong) i); + 
translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + if ((rec_len= rand_buffer_size()) < 19) + rec_len= 19; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 14; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE, + trn, NULL, 14 + rec_len, + TRANSLOG_INTERNAL_PARTS + 2, parts, NULL, + NULL)) + { + fprintf(stderr, "0 Can't write var reference defore record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + } + int4store(long_tr_id, i); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + + lsn_base= lsn; + + if ((rec_len= rand_buffer_size()) < 9) + rec_len= 9; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + trn, NULL, rec_len, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + if (translog_flush(lsn)) + { + fprintf(stderr, "Can't flush #%lu\n", (ulong) i); + 
translog_destroy(); + ok(0, "flush"); + exit(1); + } + ok(1, "flush"); + } + + if (translog_flush(translog_get_horizon())) + { + fprintf(stderr, "Can't flush up to horizon\n"); + translog_destroy(); + ok(0, "flush"); + exit(1); + } + ok(1, "flush"); + + srand(122334817L); + + rc= 1; + + { + int len= translog_read_record_header(first_lsn, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "translog_read_record_header failed (%d)\n", errno); + goto err; + } + if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 || + rec.record_length != 6 || uint4korr(rec.header) != 0 || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF || + first_lsn != rec.lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(0)\n" + "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, " + "lsn(%lu,0x%lx)\n", + (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length, + (uint) uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + goto err; + } + read_ok(&rec); + translog_free_record_header(&rec); + lsn= first_lsn; + if (translog_scanner_init(first_lsn, 1, &scanner, 0)) + { + fprintf(stderr, "scanner init failed\n"); + goto err; + } + for (i= 1;; i++) + { + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + if (i != ITERATIONS) + { + fprintf(stderr, "EOL met at iteration %u instead of %u\n", + i, ITERATIONS); + goto err; + } + break; + } + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 7 || ref != lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE " + "data read(%d) " + "type: %u strid: %u len: %u" + "ref: (%lu,0x%lx) (%lu,0x%lx) " + "lsn(%lu,0x%lx)\n", + i, 
(uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref), LSN_IN_PARTS(lsn), + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + else + { + LSN ref1, ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 23 || + ref1 != lsn || + ref2 != first_lsn || + ((uchar)rec.header[22]) != 0x55 || + ((uchar)rec.header[21]) != 0xAA || + ((uchar)rec.header[20]) != 0x55 || + ((uchar)rec.header[19]) != 0xAA || + ((uchar)rec.header[18]) != 0x55 || + ((uchar)rec.header[17]) != 0xAA || + ((uchar)rec.header[16]) != 0x55 || + ((uchar)rec.header[15]) != 0xAA || + ((uchar)rec.header[14]) != 0x55) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %u, ref1(%lu,0x%lx), " + "ref2(%lu,0x%lx) %x%x%x%x%x%x%x%x%x " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + (uint) rec.header[14], (uint) rec.header[15], + (uint) rec.header[16], (uint) rec.header[17], + (uint) rec.header[18], (uint) rec.header[19], + (uint) rec.header[20], (uint) rec.header[21], + (uint) rec.header[22], + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + read_ok(&rec); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header (var) " + "failed (%d)\n", i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration (first var) %u " + "instead of beginning of %u\n", i, ITERATIONS); + goto err; + } + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + if ((rec_len= rand_buffer_size()) < 12) + rec_len= 12; + if (rec.type != LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 
rec_len + LSN_STORE_SIZE || + len != 12 || ref != lsn || + check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "data read(%d)" + "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), " + "hdr len: %u (%d), " + "ref(%lu,0x%lx), lsn(%lu,0x%lx) (%d), content: %d\n", + i, (uint) rec.type, + rec.type != LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + (uint) rec.short_trid, + rec.short_trid != (i % 0xFFFF), + (ulong) rec.record_length, (ulong) rec_len, + rec.record_length != rec_len + LSN_STORE_SIZE, + (uint) len, + len != 12, + LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn), + (len != 12 || ref != lsn), + check_content(rec.header + LSN_STORE_SIZE, + len - LSN_STORE_SIZE)); + goto err; + } + if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + else + { + LSN ref1, ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + if ((rec_len= rand_buffer_size()) < 19) + rec_len= 19; + if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len + LSN_STORE_SIZE * 2 || + len != 19 || + ref1 != lsn || + ref2 != first_lsn || + check_content(rec.header + LSN_STORE_SIZE * 2, + len - LSN_STORE_SIZE * 2)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, " + "ref1(%lu,0x%lx), ref2(%lu,0x%lx), " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + LSN_IN_PARTS(rec.lsn)); + goto err; + } + if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE * 2)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec read 
lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + read_ok(&rec); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration %u " + "instead of beginning of %u\n", i, ITERATIONS); + goto err; + } + if (rec.type != LOGREC_FIXED_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 6 || uint4korr(rec.header) != i || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(%d)\n" + "type %u, strid %u, len %u, i: %u, 4: %u 5: %u " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + (uint) uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + goto err; + } + lsn= rec.lsn; + read_ok(&rec); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if ((rec_len= rand_buffer_size()) < 9) + rec_len= 9; + if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len || + len != 9 || check_content(rec.header, (uint)len)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %lu != %lu, hdr len: %d, " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, LSN_IN_PARTS(rec.lsn)); + goto err; + } + if (read_and_check_content(&rec, long_buffer, 0)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + goto err; + } + read_ok(&rec); + translog_free_record_header(&rec); + } + } + + rc= 0; +err: 
+ if (rc) + ok(0, "read record"); + } /* SKIP_BIG_TESTS */ + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + + if (maria_log_remove()) + exit(1); + + return(test(exit_status())); +} diff --git a/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c new file mode 100644 index 00000000000..06d9a00c04c --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c @@ -0,0 +1,160 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); +extern void translog_example_table_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define LOG_FLAGS 0 + +static char *first_translog_file= (char*)"maria_log.00000001"; + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint pagen; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn, first_lsn, theor_lsn; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + + MY_INIT(argv[0]); + + plan(2); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= (char *)"."; + if 
(maria_log_remove()) + exit(1); + /* be sure that we have no logs in the directory*/ + my_delete(CONTROL_FILE_BASE_NAME, MYF(0)); + my_delete(first_translog_file, MYF(0)); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + theor_lsn= translog_first_theoretical_lsn(); + if (theor_lsn == 1) + { + fprintf(stderr, "Error reading the first log file."); + translog_destroy(); + exit(1); + } + if (theor_lsn == LSN_IMPOSSIBLE) + { + fprintf(stderr, "There is no first log file."); + translog_destroy(); + exit(1); + } + first_lsn= translog_first_lsn_in_log(); + if (first_lsn != LSN_IMPOSSIBLE) + { + fprintf(stderr, "Incorrect first lsn response (%lu,0x%lx).", + LSN_IN_PARTS(first_lsn)); + translog_destroy(); + exit(1); + } + ok(1, "Empty log response"); + + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + 
translog_destroy(); + exit(1); + } + + theor_lsn= translog_first_theoretical_lsn(); + if (theor_lsn == 1) + { + fprintf(stderr, "Error reading the first log file\n"); + translog_destroy(); + exit(1); + } + if (theor_lsn == LSN_IMPOSSIBLE) + { + fprintf(stderr, "There is no first log file\n"); + translog_destroy(); + exit(1); + } + first_lsn= translog_first_lsn_in_log(); + if (first_lsn != theor_lsn) + { + fprintf(stderr, "Incorrect first lsn: (%lu,0x%lx) " + " theoretical first: (%lu,0x%lx)\n", + LSN_IN_PARTS(first_lsn), LSN_IN_PARTS(theor_lsn)); + translog_destroy(); + exit(1); + } + + ok(1, "Full log response"); + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove()) + exit(1); + exit(0); +} diff --git a/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c new file mode 100644 index 00000000000..64f486b8cf1 --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c @@ -0,0 +1,156 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); +extern void translog_example_table_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (8*1024L*1024L) +#define LOG_FLAGS 0 + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + ulong i; + uint pagen; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn, max_lsn, last_lsn= LSN_IMPOSSIBLE; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + + MY_INIT(argv[0]); + + plan(2); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= (char *)"."; + if (maria_log_remove()) + exit(1); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + max_lsn= translog_get_file_max_lsn_stored(1); + if (max_lsn == 1) + { + 
fprintf(stderr, "Error reading the first log file."); + translog_destroy(); + exit(1); + } + if (max_lsn != LSN_IMPOSSIBLE) + { + fprintf(stderr, "Incorrect first lsn response (%lu,0x%lx).", + LSN_IN_PARTS(max_lsn)); + translog_destroy(); + exit(1); + } + ok(1, "Empty log response"); + + + /* write more then 1 file */ + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + for(i= 0; i < LOG_FILE_SIZE/6; i++) + { + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + if (LSN_FILE_NO(lsn) == 1) + last_lsn= lsn; + } + + + max_lsn= translog_get_file_max_lsn_stored(1); + if (max_lsn == 1) + { + fprintf(stderr, "Error reading the first log file\n"); + translog_destroy(); + exit(1); + } + if (max_lsn == LSN_IMPOSSIBLE) + { + fprintf(stderr, "Isn't first file still finished?!!\n"); + translog_destroy(); + exit(1); + } + if (max_lsn != last_lsn) + { + fprintf(stderr, "Incorrect max lsn: (%lu,0x%lx) " + " last lsn on first file: (%lu,0x%lx)\n", + LSN_IN_PARTS(max_lsn), LSN_IN_PARTS(last_lsn)); + translog_destroy(); + exit(1); + } + + ok(1, "First file max LSN"); + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove()) + exit(1); + exit(0); +} diff --git a/storage/maria/unittest/ma_test_loghandler_multigroup-t.c b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c new file mode 100644 index 00000000000..7ba7ce3176d --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c @@ -0,0 +1,746 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" +#include "sequence_storage.h" +#include <my_getopt.h> + +extern my_bool maria_log_remove(); +extern void translog_example_table_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif +static TRN *trn= &dummy_transaction_object; + + +#ifndef READONLY_TEST + +#define PCACHE_SIZE (1024*1024*10) +#define LONG_BUFFER_SIZE ((1024L*1024L*1024L) + (1024L*1024L*512)) +#define MIN_REC_LENGTH (1024L*1024L + 1024L*512L + 1) +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define ITERATIONS 2 +#define READONLY 0 + +#else + +#define PCACHE_SIZE (1024*1024*10) +#define LONG_BUFFER_SIZE (1024L*1024L) +#define MIN_REC_LENGTH (1024L) +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define ITERATIONS 2 +#define READONLY 1 + +#endif /*READONLY_TEST*/ + + +/* +#define LOG_FILE_SIZE 1024L*1024L*3L +#define ITERATIONS 1600 +*/ +/* +#define LOG_FILE_SIZE 1024L*1024L*100L +#define ITERATIONS 65000 +*/ + + +/* + Check that the buffer filled correctly + + SYNOPSIS + check_content() + ptr Pointer to the buffer + length length of the buffer + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool check_content(uchar *ptr, ulong length) +{ + ulong i; + uchar buff[4]; + DBUG_ENTER("check_content"); + for (i= 0; i < length; i++) + { + if (i % 4 == 0) + int4store(buff, (i >> 2)); + if (ptr[i] != buff[i % 4]) + { + fprintf(stderr, "Byte # %lu is %x instead of %x", + i, (uint) ptr[i], (uint) buff[i % 
4]); + DBUG_DUMP("mem", ptr +(ulong) (i > 16 ? i - 16 : 0), + (i > 16 ? 16 : i) + (i + 16 < length ? 16 : length - i)); + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +/* + Read whole record content, and check content (put with offset) + + SYNOPSIS + read_and_check_content() + rec The record header buffer + buffer The buffer to read the record in + skip Skip this number of bytes ot the record content + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec, + uchar *buffer, uint skip) +{ + int res= 0; + translog_size_t len; + DBUG_ENTER("read_and_check_content"); + DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); + if ((len= translog_read_record(rec->lsn, 0, rec->record_length, + buffer, NULL)) != rec->record_length) + { + fprintf(stderr, "Requested %lu byte, read %lu\n", + (ulong) rec->record_length, (ulong) len); + res= 1; + } + res|= check_content(buffer + skip, rec->record_length - skip); + DBUG_RETURN(res); +} + +static const char *load_default_groups[]= {"ma_unit_loghandler", 0}; +#ifndef DBUG_OFF +static const char *default_dbug_option= + IF_WIN("d:t:i:O,\\ma_test_loghandler.trace", + "d:t:i:o,/tmp/ma_test_loghandler.trace"); +#endif +static const char *opt_wfile= NULL; +static const char *opt_rfile= NULL; +static struct my_option my_long_options[] = +{ +#ifndef DBUG_OFF + {"debug", '#', "Output debug log. 
Often the argument is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"write-seq", 'w', "Path to file in which \"random\" sequence used in the test will be written", + (uchar**) &opt_wfile, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"read-seq", 'r', "Path to file from which \"random\" sequence used in the test will be read", + (uchar**) &opt_rfile, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"help", '?', "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; +static SEQ_STORAGE seq; + +static uint32 get_len() +{ + uint32 res; + DBUG_ENTER("get_len"); + if (opt_rfile) + res= seq_storage_next(&seq); + else + { + res= (uint32) + ((ulonglong) rand() * + (LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1) / RAND_MAX) + MIN_REC_LENGTH; + if (opt_wfile && + seq_storage_write(opt_wfile, res)) + exit(1); + } + DBUG_PRINT("info", ("length value : %lu", (ulong) res)); + DBUG_RETURN(res); +} + +static void usage(void) +{ + puts("Copyright (C) 2008 MySQL AB"); + puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,"); + puts("and you are welcome to modify and redistribute it under the GPL license\n"); + + puts("Unit test of maria engine"); + VOID(printf("\nUsage: %s [OPTIONS]\n", my_progname_short)); + my_print_help(my_long_options); + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + + +static my_bool +get_one_option(int optid __attribute__((unused)), + const struct my_option *opt __attribute__((unused)), + char *argument __attribute__((unused))) +{ + switch (optid) { + case '?': + usage(); + exit(0); +#ifndef DBUG_OFF + case '#': + DBUG_SET_INITIAL(argument ? 
argument : default_dbug_option); + break; +#endif + } + return 0; +} + + +static void get_options(int *argc,char ***argv) +{ + int ho_error; + + if ((ho_error= handle_options(argc, argv, my_long_options, get_one_option))) + exit(ho_error); + + if (opt_rfile && opt_wfile) + { + usage(); + exit(1); + } +} + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint32 i; + uint32 rec_len; + uint pagen; + uchar long_tr_id[6]; + uchar lsn_buff[23]= + { + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55 + }; + uchar *long_buffer= malloc(LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); + char **default_argv; + PAGECACHE pagecache; + LSN lsn, lsn_base, first_lsn; + TRANSLOG_HEADER_BUFFER rec; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 2]; + struct st_translog_scanner_data scanner; + int rc; + + MY_INIT(argv[0]); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= (char *)"."; + load_defaults("my", load_default_groups, &argc, &argv); + default_argv= argv; + get_options(&argc, &argv); + + if (maria_log_remove()) + exit(1); + + { + uchar buff[4]; + for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i++) + { + if (i % 4 == 0) + int4store(buff, (i >> 2)); + long_buffer[i]= buff[i % 4]; + } + } + + bzero(long_tr_id, 6); + + if (ma_control_file_open(TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache, + 0, 0, &translog_example_table_init, 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + plan(((ITERATIONS - 1) * 4 + 1) * 2); + + if 
(opt_rfile && + seq_storage_reader_init(&seq, opt_rfile)) + exit(1); + srand(122334817L); + + long_tr_id[5]= 0xff; + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + trn->short_id= 0; + trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1, parts, + NULL, NULL)) + { + fprintf(stderr, "Can't write record #%u\n", 0); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + lsn_base= first_lsn= lsn; + + for (i= 1; i < ITERATIONS; i++) + { + if (i % 2) + { + lsn_store(lsn_buff, lsn_base); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_1LSN_EXAMPLE, trn, NULL, + LSN_STORE_SIZE, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "1 Can't write reference before record #%u\n", i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + rec_len= get_len(); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + trn, NULL, LSN_STORE_SIZE + rec_len, + TRANSLOG_INTERNAL_PARTS + 2, + parts, NULL, NULL)) + { + fprintf(stderr, "1 Can't write var reference before record #%u\n", i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + } + else + { + lsn_store(lsn_buff, 
lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + parts[TRANSLOG_INTERNAL_PARTS + 1].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= 23; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_2LSN_EXAMPLE, + trn, NULL, 23, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "0 Can't write reference before record #%u\n", i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + rec_len= get_len(); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE * 2; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE, + trn, NULL, LSN_STORE_SIZE * 2 + rec_len, + TRANSLOG_INTERNAL_PARTS + 2, + parts, NULL, NULL)) + { + fprintf(stderr, "0 Can't write var reference before record #%u\n", i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + } + int4store(long_tr_id, i); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%u\n", i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + + lsn_base= lsn; + + rec_len= get_len(); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len; + trn->short_id= i % 0xFFFF; + if 
(translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + trn, NULL, rec_len, + TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL)) + { + fprintf(stderr, "Can't write variable record #%u\n", i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + } + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + + if (ma_control_file_open(TRUE,TRUE)) + { + fprintf(stderr, "pass2: Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0)) == 0) + { + fprintf(stderr, "pass2: Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache, + 0, READONLY, &translog_example_table_init, 0)) + { + fprintf(stderr, "pass2: Can't init loghandler (%d)\n", errno); + exit(1); + } + + + /* If we were writing sequence we need it only once */ + opt_wfile= NULL; + if (opt_rfile) + seq_storage_rewind(&seq); + srand(122334817L); + + rc= 1; + + { + int len= translog_read_record_header(first_lsn, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "translog_read_record_header failed (%d)\n", errno); + translog_free_record_header(&rec); + goto err; + } + if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 || + rec.record_length != 6 || uint4korr(rec.header) != 0 || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF || + first_lsn != rec.lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(0)\n" + "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, " + "lsn(0x%lu,0x%lx)\n", + (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length, + (uint)uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + ok(1, "read record"); + 
translog_free_record_header(&rec); + lsn= first_lsn; + if (translog_scanner_init(first_lsn, 1, &scanner, 0)) + { + fprintf(stderr, "scanner init failed\n"); + goto err; + } + for (i= 1;; i++) + { + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + translog_free_record_header(&rec); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + if (i != ITERATIONS) + { + fprintf(stderr, "EOL met at iteration %u instead of %u\n", + i, ITERATIONS); + translog_free_record_header(&rec); + goto err; + } + break; + } + + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != LSN_STORE_SIZE || ref != lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE " + "data read(%d)" + "type %u, strid %u, len %u, ref(%lu,0x%lx), lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + else + { + LSN ref1, ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 23 || + ref1 != lsn || + ref2 != first_lsn || + ((uchar)rec.header[22]) != 0x55 || + ((uchar)rec.header[21]) != 0xAA || + ((uchar)rec.header[20]) != 0x55 || + ((uchar)rec.header[19]) != 0xAA || + ((uchar)rec.header[18]) != 0x55 || + ((uchar)rec.header[17]) != 0xAA || + ((uchar)rec.header[16]) != 0x55 || + ((uchar)rec.header[15]) != 0xAA || + ((uchar)rec.header[14]) != 0x55) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %u, ref1(%lu,0x%lx), " + "ref2(%lu,0x%lx) %x%x%x%x%x%x%x%x%x " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) 
rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + (uint) rec.header[14], (uint) rec.header[15], + (uint) rec.header[16], (uint) rec.header[17], + (uint) rec.header[18], (uint) rec.header[19], + (uint) rec.header[20], (uint) rec.header[21], + (uint) rec.header[22], + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + DBUG_ASSERT(0); + goto err; + } + } + ok(1, "read record"); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header (var) " + "failed (%d)\n", i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration (first var) %u " + "instead of beginning of %u\n", i, ITERATIONS); + goto err; + } + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + rec_len= get_len(); + if (rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len + LSN_STORE_SIZE || + len != 12 || ref != lsn || + check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "data read(%d)" + "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), " + "hdr len: %d (%d), " + "ref(%lu,0x%lx), lsn(%lu,0x%lx) (%d), content: %d\n", + i, (uint) rec.type, + rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + (uint) rec.short_trid, + rec.short_trid != (i % 0xFFFF), + (ulong) rec.record_length, (ulong) rec_len, + rec.record_length != rec_len + LSN_STORE_SIZE, + len, + len != 12, + LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn), + (ref != lsn), + check_content(rec.header + LSN_STORE_SIZE, + len - LSN_STORE_SIZE)); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", 
+ LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + else + { + LSN ref1, ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + rec_len= get_len(); + if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len + LSN_STORE_SIZE * 2 || + len != 19 || + ref1 != lsn || + ref2 != first_lsn || + check_content(rec.header + LSN_STORE_SIZE * 2, + len - LSN_STORE_SIZE * 2)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + " data read(%d) " + "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, " + "ref1(%lu,0x%lx), ref2(%lu,0x%lx), " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, + LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE * 2)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + ok(1, "read record"); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + translog_free_record_header(&rec); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration %u " + "instead of beginning of %u\n", i, ITERATIONS); + translog_free_record_header(&rec); + goto err; + } + if (rec.type != LOGREC_FIXED_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 6 || uint4korr(rec.header) != i || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(%d)\n" + "type %u, 
strid %u, len %u, i: %u, 4: %u 5: %u " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + (uint)uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + ok(1, "read record"); + translog_free_record_header(&rec); + + lsn= rec.lsn; + + len= translog_read_next_record_header(&scanner, &rec); + rec_len= get_len(); + if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len || + len != 9 || check_content(rec.header, len)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %lu != %lu, hdr len: %d, " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, 0)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + ok(1, "read record"); + translog_free_record_header(&rec); + } + } + + rc= 0; +err: + if (rc) + ok(0, "read record"); + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + free_defaults(default_argv); + seq_storage_destroy(&seq); + if (maria_log_remove()) + exit(1); + + return (test(exit_status())); +} diff --git a/storage/maria/unittest/ma_test_loghandler_multithread-t.c b/storage/maria/unittest/ma_test_loghandler_multithread-t.c new file mode 100644 index 00000000000..354f5d12e08 --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_multithread-t.c @@ -0,0 +1,556 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by 
+ the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); +extern void translog_example_table_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) + +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +/*#define LOG_FLAGS TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC */ +#define LOG_FLAGS 0 +/*#define LONG_BUFFER_SIZE (1024L*1024L*1024L + 1024L*1024L*512)*/ + +#ifdef MULTIFLUSH_TEST + +#define LONG_BUFFER_SIZE (16384L) +#define MIN_REC_LENGTH 10 +#define SHOW_DIVIDER 20 +#define ITERATIONS 10000 +#define FLUSH_ITERATIONS 1000 +#define WRITERS 2 +#define FLUSHERS 10 + +#else + +#define LONG_BUFFER_SIZE (512L*1024L*1024L) +#define MIN_REC_LENGTH 30 +#define SHOW_DIVIDER 10 +#define ITERATIONS 3 +#define FLUSH_ITERATIONS 0 +#define WRITERS 3 +#define FLUSHERS 0 + +#endif + +static uint number_of_writers= WRITERS; +static uint number_of_flushers= FLUSHERS; + +static pthread_cond_t COND_thread_count; +static pthread_mutex_t LOCK_thread_count; +static uint thread_count; + +static ulong lens[WRITERS][ITERATIONS]; +static LSN lsns1[WRITERS][ITERATIONS]; +static LSN lsns2[WRITERS][ITERATIONS]; +static uchar *long_buffer; + + +static LSN last_lsn; /* For test purposes the variable allow dirty read/write */ + +/* + Get pseudo-random length of the field in + limits [MIN_REC_LENGTH..LONG_BUFFER_SIZE] + + SYNOPSIS + 
get_len() + + RETURN + length - length >= 0 length <= LONG_BUFFER_SIZE +*/ + +static uint32 get_len() +{ + return MIN_REC_LENGTH + + (uint32)(((ulonglong)rand())* + (LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1)/RAND_MAX); +} + + +/* + Check that the buffer filled correctly + + SYNOPSIS + check_content() + ptr Pointer to the buffer + length length of the buffer + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool check_content(uchar *ptr, ulong length) +{ + ulong i; + for (i= 0; i < length; i++) + { + if (((uchar)ptr[i]) != (i & 0xFF)) + { + fprintf(stderr, "Byte # %lu is %x instead of %x", + i, (uint) ptr[i], (uint) (i & 0xFF)); + return 1; + } + } + return 0; +} + + +/* + Read whole record content, and check content (put with offset) + + SYNOPSIS + read_and_check_content() + rec The record header buffer + buffer The buffer to read the record in + skip Skip this number of bytes ot the record content + + RETURN + 0 - OK + 1 - Error +*/ + + +static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec, + uchar *buffer, uint skip) +{ + int res= 0; + translog_size_t len; + + if ((len= translog_read_record(rec->lsn, 0, rec->record_length, + buffer, NULL)) != rec->record_length) + { + fprintf(stderr, "Requested %lu byte, read %lu\n", + (ulong) rec->record_length, (ulong) len); + res= 1; + } + res|= check_content(buffer + skip, rec->record_length - skip); + return(res); +} + +void writer(int num) +{ + LSN lsn; + TRN trn; + uchar long_tr_id[6]; + uint i; + + trn.short_id= num; + trn.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + for (i= 0; i < ITERATIONS; i++) + { + uint len= get_len(); + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + lens[num][i]= len; + + int2store(long_tr_id, num); + int4store(long_tr_id + 2, i); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + 
fprintf(stderr, "Can't write LOGREC_FIXED_RECORD_0LSN_EXAMPLE record #%lu " + "thread %i\n", (ulong) i, num); + translog_destroy(); + pthread_mutex_lock(&LOCK_thread_count); + ok(0, "write records"); + pthread_mutex_unlock(&LOCK_thread_count); + return; + } + lsns1[num][i]= lsn; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= len; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + &trn, NULL, + len, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i); + translog_destroy(); + pthread_mutex_lock(&LOCK_thread_count); + ok(0, "write records"); + pthread_mutex_unlock(&LOCK_thread_count); + return; + } + lsns2[num][i]= lsn; + last_lsn= lsn; + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "write records"); + pthread_mutex_unlock(&LOCK_thread_count); + } + return; +} + + +static void *test_thread_writer(void *arg) +{ + int param= *((int*) arg); + + my_thread_init(); + + writer(param); + + pthread_mutex_lock(&LOCK_thread_count); + thread_count--; + ok(1, "writer finished"); /* just to show progress */ + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are + ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + return(0); +} + + +static void *test_thread_flusher(void *arg) +{ + int param= *((int*) arg); + int i; + + my_thread_init(); + + for(i= 0; i < FLUSH_ITERATIONS; i++) + { + translog_flush(last_lsn); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "-- flush %d", param); + pthread_mutex_unlock(&LOCK_thread_count); + } + + pthread_mutex_lock(&LOCK_thread_count); + thread_count--; + ok(1, "flusher finished"); /* just to show progress */ + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are + ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + return(0); +} + + +int main(int argc __attribute__((unused)), + char 
**argv __attribute__ ((unused))) +{ + uint32 i; + uint pagen; + PAGECACHE pagecache; + LSN first_lsn; + TRANSLOG_HEADER_BUFFER rec; + struct st_translog_scanner_data scanner; + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error; + int rc; + + /* Disabled until Sanja tests */ + plan(1); + ok(1, "disabled"); + exit(0); + + plan(WRITERS + FLUSHERS + + ITERATIONS * WRITERS * 3 + FLUSH_ITERATIONS * FLUSHERS ); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= (char *)"."; + long_buffer= malloc(LONG_BUFFER_SIZE + 7 * 2 + 2); + if (long_buffer == 0) + { + fprintf(stderr, "End of memory\n"); + exit(1); + } + for (i= 0; i < (LONG_BUFFER_SIZE + 7 * 2 + 2); i++) + long_buffer[i]= (i & 0xFF); + + MY_INIT(argv[0]); + if (maria_log_remove()) + exit(1); + + +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { + fprintf(stderr, "COND_thread_count: %d from pthread_cond_init " + "(errno: %d)\n", error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + fprintf(stderr, "LOCK_thread_count: %d from pthread_cond_init " + "(errno: %d)\n", error, errno); + exit(1); + } + if ((error= pthread_attr_init(&thr_attr))) + { + fprintf(stderr, "Got error: %d from pthread_attr_init " + "(errno: %d)\n", error, errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + fprintf(stderr, + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error, errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + VOID(thr_setconcurrency(2)); +#endif + + my_thread_global_init(); + + if (ma_control_file_open(TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", 
errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + srand(122334817L); + { + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + uchar long_tr_id[6]= + { + 0x11, 0x22, 0x33, 0x44, 0x55, 0x66 + }; + + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&first_lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write the first record\n"); + translog_destroy(); + exit(1); + } + } + + + pthread_mutex_lock(&LOCK_thread_count); + while (number_of_writers != 0 || number_of_flushers != 0) + { + if (number_of_writers) + { + param= (int*) malloc(sizeof(int)); + *param= number_of_writers - 1; + if ((error= pthread_create(&tid, &thr_attr, test_thread_writer, + (void*) param))) + { + fprintf(stderr, "Got error: %d from pthread_create (errno: %d)\n", + error, errno); + exit(1); + } + thread_count++; + number_of_writers--; + } + if (number_of_flushers) + { + param= (int*) malloc(sizeof(int)); + *param= number_of_flushers - 1; + if ((error= pthread_create(&tid, &thr_attr, test_thread_flusher, + (void*) param))) + { + fprintf(stderr, "Got error: %d from pthread_create (errno: %d)\n", + error, errno); + exit(1); + } + thread_count++; + number_of_flushers--; + } + } + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + /* wait 
finishing */ + pthread_mutex_lock(&LOCK_thread_count); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count))) + fprintf(stderr, "COND_thread_count: %d from pthread_cond_wait\n", error); + } + pthread_mutex_unlock(&LOCK_thread_count); + + /* Find last LSN and flush up to it (all our log) */ + { + LSN max= 0; + for (i= 0; i < WRITERS; i++) + { + if (cmp_translog_addr(lsns2[i][ITERATIONS - 1], max) > 0) + max= lsns2[i][ITERATIONS - 1]; + } + translog_flush(max); + } + + rc= 1; + + { + uint indeces[WRITERS]; + uint index, stage; + int len; + bzero(indeces, sizeof(uint) * WRITERS); + + bzero(indeces, sizeof(indeces)); + + if (translog_scanner_init(first_lsn, 1, &scanner, 0)) + { + fprintf(stderr, "scanner init failed\n"); + goto err; + } + for (i= 0;; i++) + { + len= translog_read_next_record_header(&scanner, &rec); + + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + translog_free_record_header(&rec); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + if (i != WRITERS * ITERATIONS * 2) + { + fprintf(stderr, "EOL met at iteration %u instead of %u\n", + i, ITERATIONS * WRITERS * 2); + translog_free_record_header(&rec); + goto err; + } + break; + } + index= indeces[rec.short_trid] / 2; + stage= indeces[rec.short_trid] % 2; + if (stage == 0) + { + if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || + rec.record_length != 6 || + uint2korr(rec.header) != rec.short_trid || + index != uint4korr(rec.header + 2) || + cmp_translog_addr(lsns1[rec.short_trid][index], rec.lsn) != 0) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(%d)\n" + "type %u, strid %u %u, len %u, i: %u %u, " + "lsn(%lu,0x%lx) (%lu,0x%lx)\n", + i, (uint) rec.type, + (uint) rec.short_trid, (uint) uint2korr(rec.header), + (uint) rec.record_length, + (uint) index, (uint) uint4korr(rec.header + 2), + LSN_IN_PARTS(rec.lsn), + 
LSN_IN_PARTS(lsns1[rec.short_trid][index])); + translog_free_record_header(&rec); + goto err; + } + } + else + { + if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE || + len != 9 || + rec.record_length != lens[rec.short_trid][index] || + cmp_translog_addr(lsns2[rec.short_trid][index], rec.lsn) != 0 || + check_content(rec.header, (uint)len)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "data read(%d) " + "thread: %d, iteration %d, stage %d\n" + "type %u (%d), len %d, length %lu %lu (%d) " + "lsn(%lu,0x%lx) (%lu,0x%lx)\n", + i, (uint) rec.short_trid, index, stage, + (uint) rec.type, (rec.type != + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE), + len, + (ulong) rec.record_length, lens[rec.short_trid][index], + (rec.record_length != lens[rec.short_trid][index]), + LSN_IN_PARTS(rec.lsn), + LSN_IN_PARTS(lsns2[rec.short_trid][index])); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, 0)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + ok(1, "record read"); + translog_free_record_header(&rec); + indeces[rec.short_trid]++; + } + } + + rc= 0; +err: + if (rc) + ok(0, "record read"); + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove()) + exit(1); + + return(exit_status()); +} diff --git a/storage/maria/unittest/ma_test_loghandler_noflush-t.c b/storage/maria/unittest/ma_test_loghandler_noflush-t.c new file mode 100644 index 00000000000..973dfd03bcf --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_noflush-t.c @@ -0,0 +1,146 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); +extern void translog_example_table_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define LOG_FLAGS 0 + +static char *first_translog_file= (char*)"maria_log.00000001"; + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint pagen; + int rc= 1; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN first_lsn; + TRANSLOG_HEADER_BUFFER rec; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + translog_size_t len; + + MY_INIT(argv[0]); + + plan(1); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= (char *)"."; + if (maria_log_remove()) + exit(1); + /* be sure that we have no logs in the directory*/ + my_delete(CONTROL_FILE_BASE_NAME, MYF(0)); + my_delete(first_translog_file, MYF(0)); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0)) == 
0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + int4store(long_tr_id, 0); + long_tr_id[5]= 0xff; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&first_lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + + len= translog_read_record_header(first_lsn, &rec); + if (len == 0) + { + fprintf(stderr, "translog_read_record_header failed (%d)\n", errno); + goto err; + } + if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 || + rec.record_length != 6 || uint4korr(rec.header) != 0 || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF || + first_lsn != rec.lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(0)\n" + "type: %u (%d) strid: %u (%d) len: %u (%d) i: %u (%d), " + "4: %u (%d) 5: %u (%d) " + "lsn(%lu,0x%lx) (%d)\n", + (uint) rec.type, (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE), + (uint) rec.short_trid, (rec.short_trid != 0), + (uint) rec.record_length, (rec.record_length != 6), + (uint) uint4korr(rec.header), (uint4korr(rec.header) != 0), + (uint) rec.header[4], (((uchar)rec.header[4]) != 0), + (uint) rec.header[5], (((uchar)rec.header[5]) != 0xFF), + LSN_IN_PARTS(rec.lsn), (first_lsn != rec.lsn)); + goto err; + } + + ok(1, "read OK"); + rc= 0; + +err: + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove()) + exit(1); + + 
exit(rc); +} diff --git a/storage/maria/unittest/ma_test_loghandler_nologs-t.c b/storage/maria/unittest/ma_test_loghandler_nologs-t.c new file mode 100644 index 00000000000..34508d1d751 --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_nologs-t.c @@ -0,0 +1,195 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); +extern void example_loghandler_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (8*1024L*1024L) +#define LOG_FLAGS 0 +#define LONG_BUFFER_SIZE (LOG_FILE_SIZE + LOG_FILE_SIZE / 2) + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + ulong i; + uint pagen; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + uchar *long_buffer= malloc(LONG_BUFFER_SIZE); + + MY_INIT(argv[0]); + + plan(2); + + bzero(&pagecache, sizeof(pagecache)); + bzero(long_buffer, LONG_BUFFER_SIZE); + maria_data_root= (char *)"."; + if (maria_log_remove()) + exit(1); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + 
default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + /* write more then 1 file */ + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #0\n"); + translog_destroy(); + exit(1); + } + + for(i= 0; i < LOG_FILE_SIZE/6 && LSN_FILE_NO(lsn) == 1; i++) + { + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #0\n"); + translog_destroy(); + exit(1); + } + } + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + + { + char file_name[FN_REFLEN]; + for (i= 1; i <= 2; i++) + { + translog_filename_by_fileno(i, file_name); + if (my_access(file_name, W_OK)) + { + fprintf(stderr, "No file '%s'\n", file_name); + exit(1); + } + if (my_delete(file_name, MYF(MY_WME)) != 0) + { + fprintf(stderr, "Error %d during removing file'%s'\n", + errno, file_name); + exit(1); + } + } + } + + if 
(ma_control_file_open(TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 1)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + ok(1, "Log init OK"); + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #0\n"); + translog_destroy(); + exit(1); + } + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + + if (!translog_is_file(3)) + { + fprintf(stderr, "No file #3\n"); + exit(1); + } + + ok(1, "New log is OK"); + + if (maria_log_remove()) + exit(1); + exit(0); +} diff --git a/storage/maria/unittest/ma_test_loghandler_pagecache-t.c b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c new file mode 100644 index 00000000000..1644aa4885c --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c @@ -0,0 +1,200 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); +extern void translog_example_table_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define LOG_FLAGS 0 + +static char *first_translog_file= (char*)"aria_log.00000001"; +static char *file1_name= (char*)"page_cache_test_file_1"; +static PAGECACHE_FILE file1; + + +/** + @brief Dummy pagecache callback. +*/ + +static my_bool +dummy_callback(uchar *page __attribute__((unused)), + pgcache_page_no_t page_no __attribute__((unused)), + uchar* data_ptr __attribute__((unused))) +{ + return 0; +} + + +/** + @brief Dummy pagecache callback. 
+*/ + +static void +dummy_fail_callback(uchar* data_ptr __attribute__((unused))) +{ + return; +} + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint pagen; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn; + my_off_t file_size; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + + MY_INIT(argv[0]); + + plan(1); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= (char *)"."; + if (maria_log_remove()) + exit(1); + /* be sure that we have no logs in the directory*/ + my_delete(CONTROL_FILE_BASE_NAME, MYF(0)); + my_delete(first_translog_file, MYF(0)); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler_pagecache.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler_pagecache.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + if ((file1.file= my_open(first_translog_file, O_RDONLY, MYF(MY_WME))) < 0) + { + fprintf(stderr, "There is no %s (%d)\n", first_translog_file, errno); + exit(1); + } + file_size= my_seek(file1.file, 0, SEEK_END, MYF(MY_WME)); + if (file_size != TRANSLOG_PAGE_SIZE) + { + fprintf(stderr, + "incorrect initial size of %s: %ld instead of %ld\n", + first_translog_file, (long)file_size, (long)TRANSLOG_PAGE_SIZE); + exit(1); + } + my_close(file1.file, 
MYF(MY_WME)); + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + pagecache_file_init(file1, &dummy_callback, &dummy_callback, + &dummy_fail_callback, maria_flush_log_for_page, NULL); + if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME))) + exit(1); + + { + uchar page[PCACHE_PAGE]; + + bzero(page, PCACHE_PAGE); + lsn_store(page, lsn); + pagecache_write(&pagecache, &file1, 0, 3, page, + PAGECACHE_LSN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + } + my_close(file1.file, MYF(MY_WME)); + if ((file1.file= my_open(first_translog_file, O_RDONLY, MYF(MY_WME))) < 0) + { + fprintf(stderr, "can't open %s (%d)\n", first_translog_file, errno); + exit(1); + } + file_size= my_seek(file1.file, 0, SEEK_END, MYF(MY_WME)); + if (file_size != TRANSLOG_PAGE_SIZE * 2) + { + fprintf(stderr, + "incorrect initial size of %s: %ld instead of %ld\n", + first_translog_file, + (long)file_size, (long)(TRANSLOG_PAGE_SIZE * 2)); + ok(0, "log triggered"); + exit(1); + } + my_close(file1.file, MYF(MY_WME)); + ok(1, "log triggered"); + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + my_delete(CONTROL_FILE_BASE_NAME, MYF(0)); + my_delete(first_translog_file, MYF(0)); + my_delete(file1_name, MYF(0)); + + exit(0); +} diff 
--git a/storage/maria/unittest/ma_test_loghandler_purge-t.c b/storage/maria/unittest/ma_test_loghandler_purge-t.c new file mode 100644 index 00000000000..d37b45bc3ca --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_purge-t.c @@ -0,0 +1,192 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); +extern void translog_example_table_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (8*1024L*1024L) +#define LOG_FLAGS 0 +#define LONG_BUFFER_SIZE (LOG_FILE_SIZE + LOG_FILE_SIZE / 2) + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + ulong i; + uint pagen; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + uchar *long_buffer= malloc(LONG_BUFFER_SIZE); + + MY_INIT(argv[0]); + + plan(4); + + bzero(&pagecache, sizeof(pagecache)); + bzero(long_buffer, LONG_BUFFER_SIZE); + maria_data_root= (char *)"."; + if (maria_log_remove()) + exit(1); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= 
"d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + /* write more then 1 file */ + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + + translog_purge(lsn); + if (!translog_is_file(1)) + { + fprintf(stderr, "First file was removed after first record\n"); + translog_destroy(); + exit(1); + } + ok(1, "First is not removed"); + + for(i= 0; i < LOG_FILE_SIZE/6 && LSN_FILE_NO(lsn) == 1; i++) + { + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + } + + translog_purge(lsn); + if (translog_is_file(1)) + { + fprintf(stderr, "First file was not removed.\n"); + translog_destroy(); + exit(1); + } + + ok(1, "First file is removed"); + + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 
0].length= LONG_BUFFER_SIZE; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, LONG_BUFFER_SIZE, + TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL)) + { + fprintf(stderr, "Can't write variable record\n"); + translog_destroy(); + exit(1); + } + + translog_purge(lsn); + if (!translog_is_file(2) || !translog_is_file(3)) + { + fprintf(stderr, "Second file (%d) or third file (%d) is not present.\n", + translog_is_file(2), translog_is_file(3)); + translog_destroy(); + exit(1); + } + + ok(1, "Second and third files are not removed"); + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write last record\n"); + translog_destroy(); + exit(1); + } + + translog_purge(lsn); + if (translog_is_file(2)) + { + fprintf(stderr, "Second file is not removed\n"); + translog_destroy(); + exit(1); + } + + ok(1, "Second file is removed"); + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove()) + exit(1); + exit(0); +} diff --git a/storage/maria/unittest/ma_test_recovery.expected b/storage/maria/unittest/ma_test_recovery.expected new file mode 100644 index 00000000000..5f7dd54e673 --- /dev/null +++ b/storage/maria/unittest/ma_test_recovery.expected @@ -0,0 +1,1578 @@ +Testing the REDO PHASE ALONE +TEST WITH ma_test1 -s -M -T -c +applying log +testing idempotency +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -d500 +applying log +testing idempotency +applying log +TEST WITH ma_test2 -s -M -T -c -b65000 +applying log +testing idempotency +applying log +TEST WITH ma_test2 -s -M -T -c -b65000 -d800 +applying log +testing idempotency +applying log +TEST WITH ma_test1 -s -M -T -c -C +applying log +testing 
idempotency +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -d500 -C +applying log +testing idempotency +applying log +Testing the REDO AND UNDO PHASE +TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=1 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=1 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=2 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=2 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=3 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=3 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=4 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=4 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=1 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=1 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=2 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=2 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=3 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=3 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=4 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=4 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=1 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=1 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=2 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=2 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=3 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=3 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=4 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=4 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=1 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=1 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, 
recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=2 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=2 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, 
recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=3 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=3 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, 
recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=4 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=4 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, 
recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled,movable +--- +> Status: changed +========DIFF END======= diff --git a/storage/maria/unittest/ma_test_recovery.pl b/storage/maria/unittest/ma_test_recovery.pl new file mode 100755 index 00000000000..d9be82f4e58 --- /dev/null +++ b/storage/maria/unittest/ma_test_recovery.pl @@ -0,0 +1,481 @@ +#!/usr/bin/env perl + +use Getopt::Long; +use File::Copy; +use File::Compare; +use File::Basename; +use Digest::MD5; + +$|= 1; +$^W = 1; # warnings, because env cannot parse 'perl -w' +$VER= "1.2"; + +$opt_version= 0; +$opt_help= 0; +$opt_verbose= 0; +$opt_abort_on_error=0; + +my $silent= "-s"; +my $maria_path; # path to "storage/maria" +my $maria_exe_path; # path to executables (ma_test1, aria_chk etc) +my $tmp= "./tmp"; +my $my_progname= $0; +my $suffix; +my $zerofilled_tables= 0; + +$my_progname=~ s/.*[\/]//; +$maria_path= dirname($0) . "/.."; + +main(); + +#### +#### main function +#### + +sub main +{ + my ($res, $table); + + if (!GetOptions("abort-on-error", "help", "version", "verbose")) + { + $flag_exit= 1; + } + if ($opt_version) + { + print "$my_progname version $VER\n"; + exit(0); + } + usage() if ($opt_help || $flag_exit); + + $suffix= ( $^O =~ /win/i && $^O !~ /darwin/i ) ? ".exe" : ""; + $maria_exe_path= "$maria_path/release"; + # we use -f, sometimes -x is unexpectedly false in Cygwin + if ( ! -f "$maria_exe_path/ma_test1$suffix" ) + { + $maria_exe_path= "$maria_path/relwithdebinfo"; + if ( ! -f "$maria_exe_path/ma_test1$suffix" ) + { + $maria_exe_path= "$maria_path/debug"; + if ( ! -f "$maria_exe_path/ma_test1$suffix" ) + { + $maria_exe_path= $maria_path; + if ( ! -f "$maria_exe_path/ma_test1$suffix" ) + { + die("Cannot find ma_test1 executable\n"); + } + } + } + } + + # test data is always put in the current directory or a tmp subdirectory + # of it + + if (! 
-d "$tmp") + { + mkdir $tmp; + } + print "ARIA RECOVERY TESTS\n"; + + # To not flood the screen, we redirect all the commands below to a text file + # and just give a final error if their output is not as expected + + open (MY_LOG, ">$tmp/ma_test_recovery.output") or die "Can't open log file\n"; + print MY_LOG "Testing the REDO PHASE ALONE\n"; + + # runs a program inserting/deleting rows, then moves the resulting table + # elsewhere; applies the log and checks that the data file is + # identical to the saved original. + + my @t= ("ma_test1$suffix $silent -M -T -c", + "ma_test2$suffix $silent -L -K -W -P -M -T -c -d500", + "ma_test2$suffix $silent -M -T -c -b65000", + "ma_test2$suffix $silent -M -T -c -b65000 -d800", + "ma_test1$suffix $silent -M -T -c -C", + "ma_test2$suffix $silent -L -K -W -P -M -T -c -d500 -C", + #"ma_rt_test$suffix $silent -M -T -c -C", + # @todo: also add to @t2 + ); + + foreach my $prog (@t) + { + unlink <aria_log.* aria_log_control>; + my $prog_no_suffix= $prog; + $prog_no_suffix=~ s/$suffix// if ($suffix); + print MY_LOG "TEST WITH $prog_no_suffix\n"; + $res= my_exec("$maria_exe_path/$prog"); + print MY_LOG $res; + # derive table's name from program's name + if ($prog =~ m/^ma_(\S+)\s.*/) + { + $table= $1; + } + else + { + die("can't guess table name"); + } + $com= "$maria_exe_path/aria_chk$suffix -dvv $table "; + $com.= "| grep -v \"Creation time:\" | grep -v \"file length\" | grep -v \"LSNs:\" | grep -v \"UUID:\""; + $com.= "> $tmp/aria_chk_message.good.txt 2>&1"; + my_exec($com); + my $checksum= my_exec("$maria_exe_path/aria_chk$suffix -dss $table"); + move("$table.MAD", "$tmp/$table-good.MAD") || + die "Can't move $table.MAD to $tmp/$table-good.MAD\n"; + move("$table.MAI", "$tmp/$table-good.MAI") || + die "Can't move $table.MAI to $tmp/$table-good.MAI\n"; + apply_log($table, "shouldnotchangelog"); + check_table_is_same($table, $checksum); + $res= physical_cmp($table, "$tmp/$table-good"); + print MY_LOG $res; + print MY_LOG "testing 
idempotency\n"; + apply_log($table, "shouldnotchangelog"); + check_table_is_same($table, $checksum); + $res= physical_cmp($table, "$tmp/$table-good"); + print MY_LOG $res; + } + + print MY_LOG "Testing the REDO AND UNDO PHASE\n"; + # The test programs look like: + # work; commit (time T1); work; exit-without-commit (time T2) + # We first run the test program and let it exit after T1's commit. + # Then we run it again and let it exit at T2. Then we compare + # and expect identity. + + my @take_checkpoints= ("no", "yes"); + my @blobs= ("", "-b32768"); + my @test_undo= (1, 2, 3, 4); + my @t2= ("ma_test1$suffix $silent -M -T -c -N blob -H1", + "--testflag=1", + "--testflag=2 --test-undo=", + "ma_test1$suffix $silent -M -T -c -N blob -H2", + "--testflag=3", + "--testflag=4 --test-undo=", + "ma_test1$suffix $silent -M -T -c -N blob -H2 --versioning", + "--testflag=3", + "--testflag=4 --test-undo=", + "ma_test1$suffix $silent -M -T -c -N blob -H2", + "--testflag=2", + "--testflag=3 --test-undo=", + "ma_test2$suffix $silent -L -K -W -P -M -T -c blob -H1", + "-t1", + "-t2 -A", + "ma_test2$suffix $silent -L -K -W -P -M -T -c blob -H1", + "-t1", + "-t6 -A"); + + foreach my $take_checkpoint (@take_checkpoints) + { + my ($i, $j, $k, $commit_run_args, $abort_run_args); + # we test table without blobs and then table with blobs + for ($i= 0; defined($blobs[$i]); $i++) + { + for ($j= 0; defined($test_undo[$j]); $j++) + { + # first iteration tests rollback of insert, second tests rollback of delete + # -N (create NULL fields) is needed because --test-undo adds it anyway + for ($k= 0; defined($t2[$k]); $k+= 3) + { + $prog= $t2[$k]; + $prog=~ s/blob/$blobs[$i]/; + if ("$take_checkpoint" eq "no") { + $prog=~ s/\s+\-H[0-9]+//; + } + $commit_run_args= $t2[$k + 1]; + $abort_run_args= $t2[$k + 2]; + unlink <aria_log.* aria_log_control>; + my $prog_no_suffix= $prog; + $prog_no_suffix=~ s/$suffix// if ($suffix); + print MY_LOG "TEST WITH $prog_no_suffix $commit_run_args (commit at end)\n"; + 
$res= my_exec("$maria_exe_path/$prog $commit_run_args"); + print MY_LOG $res; + # derive table's name from program's name + if ($prog =~ m/^ma_(\S+)\s.*/) + { + $table= $1; + } + else + { + die("can't guess table name"); + } + $com= "$maria_exe_path/aria_chk$suffix -dvv $table "; + $com.= "| grep -v \"Creation time:\" | grep -v \"file length\" | grep -v \"LSNs:\" | grep -v \"UUID:\" "; + $com.= "> $tmp/aria_chk_message.good.txt 2>&1"; + $res= my_exec($com); + print MY_LOG $res; + $checksum= my_exec("$maria_exe_path/aria_chk$suffix -dss $table"); + move("$table.MAD", "$tmp/$table-good.MAD") || + die "Can't move $table.MAD to $tmp/$table-good.MAD\n"; + move("$table.MAI", "$tmp/$table-good.MAI") || + die "Can't move $table.MAI to $tmp/$table-good.MAI\n"; + unlink <aria_log.* aria_log_control>; + print MY_LOG "TEST WITH $prog_no_suffix $abort_run_args$test_undo[$j] (additional aborted work)\n"; + $res= my_exec("$maria_exe_path/$prog $abort_run_args$test_undo[$j]"); + print MY_LOG $res; + copy("$table.MAD", "$tmp/$table-before_undo.MAD") || + die "Can't copy $table.MAD to $tmp/$table-before_undo.MAD\n"; + copy("$table.MAI", "$tmp/$table-before_undo.MAI") || + die "Can't copy $table.MAI to $tmp/$table-before_undo.MAI\n"; + + # The lines below seem unneeded, will be removed soon + # We have to copy and restore logs, as running aria_read_log will + # change the aria_control_file + # rm -f $tmp/aria_log.* $tmp/aria_log_control + # cp $maria_path/aria_log* $tmp + + if ($test_undo[$j] != 3) { + apply_log($table, "shouldchangelog"); # should undo aborted work + } else { + # probably nothing to undo went to log or data file + apply_log($table, "dontknow"); + } + copy("$table.MAD", "$tmp/$table-after_undo.MAD") || + die "Can't copy $table.MAD to $tmp/$table-after_undo.MAD\n"; + copy("$table.MAI", "$tmp/$table-after_undo.MAI") || + die "Can't copy $table.MAI to $tmp/$table-after_undo.MAI\n"; + + # It is impossible to do a "cmp" between .good and .after_undo, + # because the UNDO 
phase generated log + # records whose LSN tagged pages. Another reason is that rolling back + # INSERT only marks the rows free, does not empty them (optimization), so + # traces of the INSERT+rollback remain. + + check_table_is_same($table, $checksum); + print MY_LOG "testing idempotency\n"; + apply_log($table, "shouldnotchangelog"); + check_table_is_same($table, $checksum); + $res= physical_cmp($table, "$tmp/$table-after_undo"); + print MY_LOG $res; + print MY_LOG "testing applying of CLRs to recreate table\n"; + unlink <$table.MA?>; + # cp $tmp/aria_log* $maria_path #unneeded + apply_log($table, "shouldnotchangelog"); + check_table_is_same($table, $checksum); + $res= physical_cmp($table, "$tmp/$table-after_undo"); + print MY_LOG $res; + } + unlink <$table.* $tmp/$table* $tmp/aria_chk_*.txt $tmp/aria_read_log_$table.txt>; + } + } + } + + if ($? >> 8) { + print "Some test failed\n"; + exit(1); + } + + close(MY_LOG); + # also note that aria_chk -dvv shows differences for ma_test2 in UNDO phase, + # this is normal: removing records does not shrink the data/key file, + # does not put back the "analyzed,optimized keys"(etc) index state. + `diff -b $maria_path/unittest/ma_test_recovery.expected $tmp/ma_test_recovery.output`; + if ($? >> 8) { + print "UNEXPECTED OUTPUT OF TESTS, FAILED"; + print " (zerofilled $zerofilled_tables tables)\n"; + print "For more info, do diff -b $maria_path/unittest/ma_test_recovery.expected "; + print "$tmp/ma_test_recovery.output\n"; + exit(1); + } + print "ALL RECOVERY TESTS OK (zerofilled $zerofilled_tables tables)\n"; +} + +#### +#### check_table_is_same +#### + +sub check_table_is_same +{ + my ($table, $checksum)= @_; + my ($com, $checksum2, $res); + + # Computes checksum of new table and compares to checksum of old table + # Shows any difference in table's state (info from the index's header) + # Data/key file length is random in ma_test2 (as it uses srand() which + # may differ between machines). 
+ + if ($opt_verbose) + { + print "checking if table $table has changed\n"; + } + + $com= "$maria_exe_path/aria_chk$suffix -dvv $table | grep -v \"Creation time:\" "; + $com.= "| grep -v \"file length\" | grep -v \"LSNs:\" | grep -v \"UUID:\" > $tmp/aria_chk_message.txt 2>&1"; + $res= `$com`; + print MY_LOG $res; + $res= `$maria_exe_path/aria_chk$suffix -ss -e --read-only $table`; + print MY_LOG $res; + $checksum2= `$maria_exe_path/aria_chk$suffix -dss $table`; + if ("$checksum" ne "$checksum2") + { + print MY_LOG "checksum differs for $table before and after recovery\n"; + return 1; + } + + $com= "diff $tmp/aria_chk_message.good.txt $tmp/aria_chk_message.txt "; + $com.= "> $tmp/aria_chk_diff.txt || true"; + $res= `$com`; + print MY_LOG $res; + + if (-s "$tmp/aria_chk_diff.txt") + { + print MY_LOG "Differences in aria_chk -dvv, recovery not yet perfect !\n"; + print MY_LOG "========DIFF START=======\n"; + open(MY_FILE, "<$tmp/aria_chk_diff.txt") || die "Can't open file aria_chk_diff.txt\n"; + while (<MY_FILE>) + { + print MY_LOG $_; + } + close(MY_FILE); + print MY_LOG "========DIFF END=======\n"; + } +} + +#### +#### apply_log +#### + +sub apply_log +{ + my ($table, $shouldchangelog)= @_; + my ($log_md5, $log_md5_2); + + # applies log, can verify if applying did write to log or not + + if ("$shouldchangelog" ne "shouldnotchangelog" && + "$shouldchangelog" ne "shouldchangelog" && + "$shouldchangelog" ne "dontknow" ) + { + print MY_LOG "bad argument '$shouldchangelog'\n"; + return 1; + } + foreach (<aria_log.*>) + { + $log_md5.= md5_conv($_); + } + print MY_LOG "applying log\n"; + my_exec("$maria_exe_path/aria_read_log$suffix -a > $tmp/aria_read_log_$table.txt"); + foreach (<aria_log.*>) + { + $log_md5_2.= md5_conv($_); + } + if ("$log_md5" ne "$log_md5_2" ) + { + if ("$shouldchangelog" eq "shouldnotchangelog") + { + print MY_LOG "aria_read_log should not have modified the log\n"; + return 1; + } + } + elsif ("$shouldchangelog" eq "shouldchangelog") + { + print 
MY_LOG "aria_read_log should have modified the log\n"; + return 1; + } +} + +#### +#### md5_conv +#### + +sub md5_conv +{ + my ($file)= @_; + + open(FILE, $file) or die "Can't open '$file': $!\n"; + binmode(FILE); + my $md5= Digest::MD5->new; + $md5->addfile(FILE); + close (FILE); + return $md5->hexdigest . "\n"; +} + +#### +#### physical_cmp: compares two tables (MAI and MAD) physically; +#### uses zerofill-keep-lsn to reduce irrelevant differences. +#### + +sub physical_cmp +{ + my ($table1, $table2)= @_; + my ($zerofilled, $ret_text)= (0, ""); + #return `cmp $table1.MAD $table2.MAD`.`cmp $table1.MAI $table2.MAI`; + foreach my $file_suffix ("MAD", "MAI") + { + my $file1= "$table1.$file_suffix"; + my $file2= "$table2.$file_suffix"; + my $res= File::Compare::compare($file1, $file2); + die() if ($res == -1); + if ($res == 1 # they differ + and !$zerofilled) + { + # let's try with --zerofill-keep-lsn + $zerofilled= 1; # but no need to do it twice + $zerofilled_tables= $zerofilled_tables + 1; + my $table_no= 1; + foreach my $table ($table1, $table2) + { + # save original tables to restore them later + copy("$table.MAD", "$tmp/before_zerofill$table_no.MAD") || die(); + copy("$table.MAI", "$tmp/before_zerofill$table_no.MAI") || die(); + $com= "$maria_exe_path/aria_chk$suffix -ss --zerofill-keep-lsn $table"; + $res= `$com`; + print MY_LOG $res; + $table_no= $table_no + 1; + } + $res= File::Compare::compare($file1, $file2); + die() if ($res == -1); + } + $ret_text.= "$file1 and $file2 differ\n" if ($res != 0); + } + if ($zerofilled) + { + my $table_no= 1; + foreach my $table ($table1, $table2) + { + move("$tmp/before_zerofill$table_no.MAD", "$table.MAD") || die(); + move("$tmp/before_zerofill$table_no.MAI", "$table.MAI") || die(); + $table_no= $table_no + 1; + } + } + return $ret_text; +} + + +sub my_exec +{ + my($command)= @_; + my $res; + if ($opt_verbose) + { + print "$command\n"; + } + $res= `$command`; + if ($? 
!= 0 && $opt_abort_on_error) + { + exit(1); + } + return $res; +} + + +#### +#### usage +#### + +sub usage +{ + print <<EOF; +$my_progname version $VER + +Description: + +Run various Aria recovery tests and print the results + +Options +--help Show this help and exit. + +--abort-on-error Abort at once in case of error. +--verbose Show commands while there are executing. +--version Show version number and exit. + +EOF + exit(0); +} diff --git a/storage/maria/unittest/sequence_storage.c b/storage/maria/unittest/sequence_storage.c new file mode 100644 index 00000000000..d5db20d31ca --- /dev/null +++ b/storage/maria/unittest/sequence_storage.c @@ -0,0 +1,110 @@ +/* Copyright (C) 2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "../maria_def.h" +#include "sequence_storage.h" + + +/** + @brief Initializes the sequence from the sequence file. + + @param seq Reference on the sequence storage. 
+ @param file Path to the file where to write the sequence + + @retval 0 OK + @retval 1 Error +*/ + +my_bool seq_storage_reader_init(SEQ_STORAGE *seq, const char *file) +{ + FILE *fd; + seq->pos= 0; + if ((fd= my_fopen(file, O_RDONLY, MYF(MY_WME))) == NULL) + return 1; + if (my_init_dynamic_array(&seq->seq, sizeof(ulong), 10, 10)) + return 1; + + for(;;) + { + ulong num; + char line[22]; + if (fgets(line, sizeof(line), fd) == NULL) + break; + num= atol(line); + if (insert_dynamic(&seq->seq, (uchar*) &num)) + return 1; + } + fclose(fd); + return 0; +} + + +/** + @brief Gets next number from the sequence storage + + @param seq Reference on the sequence storage. + + @return Next number from the sequence. +*/ + +ulong seq_storage_next(SEQ_STORAGE *seq) +{ + DBUG_ASSERT(seq->seq.elements > 0); + DBUG_ASSERT(seq->pos < seq->seq.elements); + return (*(dynamic_element(&seq->seq, seq->pos++, ulong *))); +} + + +/** + @brief Frees resources allocated for the storage + + @param seq Reference on the sequence storage. +*/ + +void seq_storage_destroy(SEQ_STORAGE *seq) +{ + delete_dynamic(&seq->seq); +} + + +/** + @brief Starts the sequence from begining + + @param seq Reference on the sequence storage. +*/ + +void seq_storage_rewind(SEQ_STORAGE *seq) +{ + seq->pos= 0; +} + +/** + @brief Writes a number to the sequence file. 
+ + @param file Path to the file where to write the sequence + @pagem num Number to be written + + @retval 0 OK + @retval 1 Error +*/ + +my_bool seq_storage_write(const char *file, ulong num) +{ + FILE *fd; + return ((fd= my_fopen(file, O_CREAT | O_APPEND | O_WRONLY, MYF(MY_WME))) == + NULL || + fprintf(fd, "%lu\n", num) < 0 || + fclose(fd) != 0); +} diff --git a/storage/maria/unittest/sequence_storage.h b/storage/maria/unittest/sequence_storage.h new file mode 100644 index 00000000000..78ce15a6253 --- /dev/null +++ b/storage/maria/unittest/sequence_storage.h @@ -0,0 +1,28 @@ +/* Copyright (C) 2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + +typedef struct st_seq_storage +{ + uint pos; + DYNAMIC_ARRAY seq; +} SEQ_STORAGE; + +extern my_bool seq_storage_reader_init(SEQ_STORAGE *seq, const char *file); +extern ulong seq_storage_next(SEQ_STORAGE *seq); +extern void seq_storage_destroy(SEQ_STORAGE *seq); +extern void seq_storage_rewind(SEQ_STORAGE *seq); +extern my_bool seq_storage_write(const char *file, ulong num); + diff --git a/storage/maria/unittest/test_file.c b/storage/maria/unittest/test_file.c new file mode 100644 index 00000000000..5f7e3939592 --- /dev/null +++ b/storage/maria/unittest/test_file.c @@ -0,0 +1,118 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include <tap.h> +#include <my_sys.h> +#include <my_dir.h> +#include "test_file.h" + + +/* + Check that file contance correspond to descriptor + + SYNOPSIS + test_file() + file File to test + file_name Path (and name) of file which is tested + size size of file + buff_size size of buffer which is enought to check the file + desc file descriptor to check with + + RETURN + 1 file if OK + 0 error +*/ + +int test_file(PAGECACHE_FILE file, char *file_name, + off_t size, size_t buff_size, struct file_desc *desc) +{ + unsigned char *buffr= my_malloc(buff_size, MYF(0)); + off_t pos= 0; + size_t byte; + int step= 0; + int res= 1; /* ok */ + +#ifdef __WIN__ + /* + On Windows, the info returned by stat(), specifically file length + is not necessarily current, because this is the behavior of + underlying FindFirstFile() function. 
+ */ + WIN32_FILE_ATTRIBUTE_DATA file_attr; + LARGE_INTEGER li; + if(GetFileAttributesEx(file_name, GetFileExInfoStandard, &file_attr) == 0) + { + diag("Can't GetFileAttributesEx %s (errno: %d)\n", file_name, + GetLastError()); + res= 0; + goto err; + } + li.HighPart= file_attr.nFileSizeHigh; + li.LowPart= file_attr.nFileSizeLow; + if(li.QuadPart != size) + { + diag("file %s size is %llu (should be %llu)\n", + file_name, (ulonglong)size, (ulonglong)li.QuadPart); + res= 0; /* failed */ + /* continue to get more information */ + } +#else + MY_STAT stat_buff, *stat; + if ((stat= my_stat(file_name, &stat_buff, MYF(0))) == NULL) + { + diag("Can't stat() %s (errno: %d)\n", file_name, errno); + res= 0; + goto err; + } + if (stat->st_size != size) + { + diag("file %s size is %lu (should be %lu)\n", + file_name, (ulong) stat->st_size, (ulong) size); + res= 0; /* failed */ + /* continue to get more information */ + } +#endif + + /* check content */ + my_seek(file.file, 0, SEEK_SET, MYF(MY_WME)); + while (desc[step].length != 0) + { + if (my_read(file.file, buffr, desc[step].length, MYF(0)) != + desc[step].length) + { + diag("Can't read %u bytes from %s (file: %d errno: %d)\n", + (uint)desc[step].length, file_name, file.file, errno); + res= 0; + goto err; + } + for (byte= 0; byte < desc[step].length; byte++) + { + if (buffr[byte] != desc[step].content) + { + diag("content of %s mismatch 0x%x in position %lu instead of 0x%x\n", + file_name, (uint) buffr[byte], (ulong) (pos + byte), + desc[step].content); + res= 0; + goto err; + } + } + pos+= desc[step].length; + step++; + } + +err: + my_free(buffr, 0); + return res; +} diff --git a/storage/maria/unittest/test_file.h b/storage/maria/unittest/test_file.h new file mode 100644 index 00000000000..0a1ccf4ab54 --- /dev/null +++ b/storage/maria/unittest/test_file.h @@ -0,0 +1,29 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General 
Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include <m_string.h> +#include "../ma_pagecache.h" + +/* + File content descriptor +*/ +struct file_desc +{ + unsigned int length; + unsigned char content; +}; + +int test_file(PAGECACHE_FILE file, char *file_name, + off_t size, size_t buff_size, struct file_desc *desc); diff --git a/storage/maria/unittest/trnman-t.c b/storage/maria/unittest/trnman-t.c new file mode 100644 index 00000000000..43cf982a7f2 --- /dev/null +++ b/storage/maria/unittest/trnman-t.c @@ -0,0 +1,175 @@ +/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <my_atomic.h> +#include <lf.h> +#include <m_string.h> +#include "../trnman.h" + +pthread_mutex_t rt_mutex; +pthread_attr_t attr; +size_t stacksize= 0; +#define STACK_SIZE (((int)stacksize-2048)*STACK_DIRECTION) + +int rt_num_threads; +int litmus; + +/* + create and end (commit or rollback) transactions randomly +*/ +#define MAX_ITER 100 +pthread_handler_t test_trnman(void *arg) +{ + uint x, y, i, n; + TRN *trn[MAX_ITER]; + int m= (*(int *)arg); + + if (my_thread_init()) + BAIL_OUT("my_thread_init failed!"); + + for (x= ((int)(intptr)(&m)); m > 0; ) + { + y= x= (x*LL(3628273133) + LL(1500450271)) % LL(9576890767); /* three prime numbers */ + m-= n= x % MAX_ITER; + for (i= 0; i < n; i++) + { + trn[i]= trnman_new_trn(0); + if (!trn[i]) + { + diag("trnman_new_trn() failed"); + litmus++; + } + } + for (i= 0; i < n; i++) + { + y= (y*19 + 7) % 31; + trnman_end_trn(trn[i], y & 1); + } + } + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + pthread_mutex_unlock(&rt_mutex); + + my_thread_end(); + + return 0; +} +#undef MAX_ITER + +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= my_getsystime(); + int i; + + litmus= 0; + + threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0)); + if (!threads) + { + diag("Out of memory"); + abort(); + } + + diag("Testing %s with %d threads, %d iterations... 
", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) + if (pthread_create(threads+i, &attr, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= my_getsystime()-now; + ok(litmus == 0, "Tested %s in %g secs (%d)", test, ((double)now)/1e7, litmus); + my_free((void*)threads, MYF(0)); +} + +#define ok_read_from(T1, T2, RES) \ + i= trnman_can_read_from(trn[T1], trid[T2]); \ + ok(i == RES, "trn" #T1 " %s read from trn" #T2, i ? "can" : "cannot") +#define start_transaction(T) \ + trn[T]= trnman_new_trn(0); \ + trid[T]= trn[T]->trid +#define commit(T) trnman_commit_trn(trn[T]) +#define abort(T) trnman_abort_trn(trn[T]) + +#define Ntrns 4 +void test_trnman_read_from() +{ + TRN *trn[Ntrns]; + TrID trid[Ntrns]; + int i; + + start_transaction(0); /* start trn1 */ + start_transaction(1); /* start trn2 */ + ok_read_from(1, 0, 0); + commit(0); /* commit trn1 */ + start_transaction(2); /* start trn4 */ + abort(2); /* abort trn4 */ + start_transaction(3); /* start trn5 */ + ok_read_from(3, 0, 1); + ok_read_from(3, 1, 0); + ok_read_from(3, 2, 0); + ok_read_from(3, 3, 1); + commit(1); /* commit trn2 */ + ok_read_from(3, 1, 0); + commit(3); /* commit trn5 */ + +} + +int main(int argc __attribute__((unused)), char **argv) +{ + MY_INIT(argv[0]); + + plan(7); + + if (my_atomic_initialize()) + return exit_status(); + + pthread_mutex_init(&rt_mutex, 0); + pthread_attr_init(&attr); +#ifdef HAVE_PTHREAD_ATTR_GETSTACKSIZE + pthread_attr_getstacksize(&attr, &stacksize); + if (stacksize == 0) +#endif + stacksize= PTHREAD_STACK_MIN; + +#define CYCLES 10000 +#define THREADS 10 + + trnman_init(0); + + test_trnman_read_from(); + run_test("trnman", test_trnman, THREADS, CYCLES); + + diag("mallocs: %d", trnman_allocated_transactions); + { + ulonglong now= my_getsystime(); + trnman_destroy(); + now= my_getsystime()-now; + diag("trnman_destroy: %g", ((double)now)/1e7); + } + + pthread_mutex_destroy(&rt_mutex); + 
my_end(0); + return exit_status(); +} + |