diff options
Diffstat (limited to 'storage')
172 files changed, 75489 insertions, 750 deletions
diff --git a/storage/Makefile.am b/storage/Makefile.am index b978453d29d..eec499aabf4 100644 --- a/storage/Makefile.am +++ b/storage/Makefile.am @@ -19,7 +19,12 @@ AUTOMAKE_OPTIONS = foreign # These are built from source in the Docs directory EXTRA_DIST = -SUBDIRS = @mysql_se_dirs@ +# Until we remove fulltext-related references from Maria to MyISAM +# MyISAM must be built before Maria, which is not the case by default +# because of alphabetical order +# So we put myisam first; this is very ugly regarding plugins' logic +# but it works, and we'll remove it soon. +SUBDIRS = myisam @mysql_se_dirs@ # Don't update the files from bitkeeper %::SCCS/s.% diff --git a/storage/csv/ha_tina.cc b/storage/csv/ha_tina.cc index 9a7781e017d..394d00d20b7 100644 --- a/storage/csv/ha_tina.cc +++ b/storage/csv/ha_tina.cc @@ -445,7 +445,7 @@ ha_tina::ha_tina(handlerton *hton, TABLE_SHARE *table_arg) */ current_position(0), next_position(0), local_saved_data_file_length(0), file_buff(0), chain_alloced(0), chain_size(DEFAULT_CHAIN_LENGTH), - local_data_file_version(0), records_is_known(0) + local_data_file_version(0), records_is_known(0), curr_lock_type(F_UNLCK) { /* Set our original buffers from pre-allocated memory */ buffer.set((char*)byte_buffer, IO_SIZE, &my_charset_bin); @@ -1454,6 +1454,14 @@ int ha_tina::delete_all_rows() DBUG_RETURN(rc); } +int ha_tina::external_lock(THD *thd __attribute__((unused)), int lock_type) +{ + if (lock_type==F_UNLCK && curr_lock_type == F_WRLCK) + update_status(); + curr_lock_type= lock_type; + return 0; +} + /* Called by the database to lock the table. Keep in mind that this is an internal lock. @@ -1468,7 +1476,7 @@ THR_LOCK_DATA **ha_tina::store_lock(THD *thd, return to; } -/* +/* Create a table. You do not want to leave the table open after a call to this (the database will call ::open() if it needs to). 
*/ diff --git a/storage/csv/ha_tina.h b/storage/csv/ha_tina.h index 5ce09783b9b..9a9c2399745 100644 --- a/storage/csv/ha_tina.h +++ b/storage/csv/ha_tina.h @@ -84,6 +84,8 @@ class ha_tina: public handler bool records_is_known; private: + int curr_lock_type; + bool get_write_pos(off_t *end_pos, tina_set *closest_hole); int open_update_temp_file_if_needed(); int init_tina_writer(); @@ -154,6 +156,8 @@ public: bool check_if_incompatible_data(HA_CREATE_INFO *info, uint table_changes); + int external_lock(THD *thd, int lock_type); + THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, enum thr_lock_type lock_type); diff --git a/storage/maria/CMakeLists.txt b/storage/maria/CMakeLists.txt new file mode 100644 index 00000000000..cfe23054e2f --- /dev/null +++ b/storage/maria/CMakeLists.txt @@ -0,0 +1 @@ +# empty for the moment; will fill it when we build under Windows diff --git a/storage/maria/Makefile.am b/storage/maria/Makefile.am new file mode 100644 index 00000000000..2bd9b7db922 --- /dev/null +++ b/storage/maria/Makefile.am @@ -0,0 +1,172 @@ +# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +MYSQLDATAdir = $(localstatedir) +MYSQLSHAREdir = $(pkgdatadir) +MYSQLBASEdir= $(prefix) +MYSQLLIBdir= $(pkglibdir) +INCLUDES = -I$(top_srcdir)/include -I$(top_builddir)/include \ + -I$(top_srcdir)/regex \ + -I$(top_srcdir)/sql \ + -I$(srcdir) +WRAPLIBS= + +LDADD = + +DEFS = @DEFS@ + +# "." is needed first because tests in unittest need libmaria +SUBDIRS = . unittest + +EXTRA_DIST = ma_test_all.sh ma_test_all.res ma_ft_stem.c CMakeLists.txt plug.in ma_test_recovery +pkgdata_DATA = ma_test_all ma_test_all.res ma_test_recovery +pkglib_LIBRARIES = libmaria.a +bin_PROGRAMS = maria_chk maria_pack maria_ftdump maria_read_log +maria_chk_DEPENDENCIES= $(LIBRARIES) +# Only reason to link with libmyisam.a here is that it's where some fulltext +# pieces are (but soon we'll remove fulltext dependencies from Maria). +# For now, it imposes that storage/myisam be built before storage/maria. 
+maria_chk_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +maria_pack_DEPENDENCIES=$(LIBRARIES) +maria_pack_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +maria_read_log_DEPENDENCIES=$(LIBRARIES) +maria_read_log_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +noinst_PROGRAMS = ma_test1 ma_test2 ma_test3 ma_rt_test ma_sp_test +noinst_HEADERS = maria_def.h ma_rt_index.h ma_rt_key.h ma_rt_mbr.h \ + ma_sp_defs.h ma_fulltext.h ma_ftdefs.h ma_ft_test1.h \ + ma_ft_eval.h trnman.h lockman.h tablockman.h \ + ma_control_file.h ha_maria.h ma_blockrec.h \ + ma_loghandler.h ma_loghandler_lsn.h ma_pagecache.h \ + ma_checkpoint.h ma_recovery.h ma_commit.h \ + trnman_public.h +ma_test1_DEPENDENCIES= $(LIBRARIES) +ma_test1_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +ma_test2_DEPENDENCIES= $(LIBRARIES) +ma_test2_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +ma_test3_DEPENDENCIES= $(LIBRARIES) +ma_test3_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +#ma_ft_test1_DEPENDENCIES= 
$(LIBRARIES) +#ma_ft_eval_DEPENDENCIES= $(LIBRARIES) +maria_ftdump_DEPENDENCIES= $(LIBRARIES) +maria_ftdump_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +ma_rt_test_DEPENDENCIES= $(LIBRARIES) +ma_rt_test_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +ma_sp_test_DEPENDENCIES= $(LIBRARIES) +ma_sp_test_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +libmaria_a_SOURCES = ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c \ + ma_rnext.c ma_rnext_same.c \ + ma_search.c ma_page.c ma_key.c ma_locking.c \ + ma_rrnd.c ma_scan.c ma_cache.c \ + ma_statrec.c ma_packrec.c ma_dynrec.c \ + ma_blockrec.c ma_bitmap.c \ + ma_update.c ma_write.c ma_unique.c \ + ma_delete.c \ + ma_rprev.c ma_rfirst.c ma_rlast.c ma_rsame.c \ + ma_rsamepos.c ma_panic.c ma_close.c ma_create.c\ + ma_range.c ma_dbug.c ma_checksum.c \ + ma_changed.c ma_static.c ma_delete_all.c \ + ma_delete_table.c ma_rename.c ma_check.c \ + ma_keycache.c ma_preload.c ma_ft_parser.c \ + ma_ft_update.c ma_ft_boolean_search.c \ + ma_ft_nlq_search.c ft_maria.c ma_sort.c \ + ha_maria.cc trnman.c lockman.c tablockman.c \ + ma_rt_index.c ma_rt_key.c ma_rt_mbr.c ma_rt_split.c \ + ma_sp_key.c ma_control_file.c ma_loghandler.c \ + ma_pagecache.c ma_pagecaches.c \ + ma_checkpoint.c ma_recovery.c ma_commit.c +CLEANFILES = test?.MA? FT?.MA? isam.log ma_test_all ma_rt_test.MA? sp_test.MA? 
+ +SUFFIXES = .sh + +.sh: + @RM@ -f $@ $@-t + @SED@ \ + -e 's!@''bindir''@!$(bindir)!g' \ + -e 's!@''scriptdir''@!$(bindir)!g' \ + -e 's!@''prefix''@!$(prefix)!g' \ + -e 's!@''datadir''@!$(datadir)!g' \ + -e 's!@''localstatedir''@!$(localstatedir)!g' \ + -e 's!@''libexecdir''@!$(libexecdir)!g' \ + -e 's!@''CC''@!@CC@!'\ + -e 's!@''CXX''@!@CXX@!'\ + -e 's!@''GXX''@!@GXX@!'\ + -e 's!@''PERL''@!@PERL@!' \ + -e 's!@''CFLAGS''@!@SAVE_CFLAGS@!'\ + -e 's!@''CXXFLAGS''@!@SAVE_CXXFLAGS@!'\ + -e 's!@''LDFLAGS''@!@SAVE_LDFLAGS@!'\ + -e 's!@''VERSION''@!@VERSION@!' \ + -e 's!@''MYSQL_SERVER_SUFFIX''@!@MYSQL_SERVER_SUFFIX@!' \ + -e 's!@''COMPILATION_COMMENT''@!@COMPILATION_COMMENT@!' \ + -e 's!@''MACHINE_TYPE''@!@MACHINE_TYPE@!' \ + -e 's!@''HOSTNAME''@!@HOSTNAME@!' \ + -e 's!@''SYSTEM_TYPE''@!@SYSTEM_TYPE@!' \ + -e 's!@''CHECK_PID''@!@CHECK_PID@!' \ + -e 's!@''FIND_PROC''@!@FIND_PROC@!' \ + -e 's!@''MYSQLD_DEFAULT_SWITCHES''@!@MYSQLD_DEFAULT_SWITCHES@!' \ + -e 's!@''MYSQL_UNIX_ADDR''@!@MYSQL_UNIX_ADDR@!' \ + -e 's!@''TARGET_LINUX''@!@TARGET_LINUX@!' \ + -e "s!@""CONF_COMMAND""@!@CONF_COMMAND@!" \ + -e 's!@''MYSQLD_USER''@!@MYSQLD_USER@!' \ + -e 's!@''sysconfdir''@!@sysconfdir@!' \ + -e 's!@''SHORT_MYSQL_INTRO''@!@SHORT_MYSQL_INTRO@!' \ + -e 's!@''SHARED_LIB_VERSION''@!@SHARED_LIB_VERSION@!' \ + -e 's!@''MYSQL_BASE_VERSION''@!@MYSQL_BASE_VERSION@!' \ + -e 's!@''MYSQL_NO_DASH_VERSION''@!@MYSQL_NO_DASH_VERSION@!' \ + -e 's!@''MYSQL_TCP_PORT''@!@MYSQL_TCP_PORT@!' \ + -e 's!@''PERL_DBI_VERSION''@!@PERL_DBI_VERSION@!' \ + -e 's!@''PERL_DBD_VERSION''@!@PERL_DBD_VERSION@!' \ + -e 's!@''PERL_DATA_DUMPER''@!@PERL_DATA_DUMPER@!' 
\ + $< > $@-t + @CHMOD@ +x $@-t + @MV@ $@-t $@ + +# Don't update the files from bitkeeper +%::SCCS/s.% diff --git a/storage/maria/ft_maria.c b/storage/maria/ft_maria.c new file mode 100644 index 00000000000..1b082f904d0 --- /dev/null +++ b/storage/maria/ft_maria.c @@ -0,0 +1,48 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +/* + This function is for interface functions between fulltext and maria +*/ + +#include "ma_ftdefs.h" + +FT_INFO *maria_ft_init_search(uint flags, void *info, uint keynr, + uchar *query, uint query_len, CHARSET_INFO *cs, + uchar *record) +{ + FT_INFO *res; + if (flags & FT_BOOL) + res= maria_ft_init_boolean_search((MARIA_HA *) info, keynr, query, + query_len, cs); + else + res= maria_ft_init_nlq_search((MARIA_HA *) info, keynr, query, query_len, + flags, record); + return res; +} + +const struct _ft_vft _ma_ft_vft_nlq = { + maria_ft_nlq_read_next, maria_ft_nlq_find_relevance, + maria_ft_nlq_close_search, maria_ft_nlq_get_relevance, + maria_ft_nlq_reinit_search +}; +const struct _ft_vft _ma_ft_vft_boolean = { + maria_ft_boolean_read_next, maria_ft_boolean_find_relevance, + maria_ft_boolean_close_search, maria_ft_boolean_get_relevance, + maria_ft_boolean_reinit_search +}; + diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc new file mode 100644 index 00000000000..678b88063db --- /dev/null +++ b/storage/maria/ha_maria.cc @@ -0,0 +1,2436 @@ +/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + +#ifdef USE_PRAGMA_IMPLEMENTATION +#pragma implementation // gcc: Class implementation +#endif + +#define MYSQL_SERVER 1 +#include "mysql_priv.h" +#include <mysql/plugin.h> +#include <m_ctype.h> +#include <myisampack.h> +#include <my_bit.h> +#include "ha_maria.h" +#include "trnman_public.h" + +C_MODE_START +#include "maria_def.h" +#include "ma_rt_index.h" +#include "ma_blockrec.h" +#include "ma_checkpoint.h" +#include "ma_recovery.h" +C_MODE_END + +/* + Note that in future versions, only *transactional* Maria tables can + rollback, so this flag should be up or down conditionally. +*/ +#define MARIA_CANNOT_ROLLBACK HA_NO_TRANSACTIONS +#ifdef MARIA_CANNOT_ROLLBACK +#define trans_register_ha(A, B, C) do { /* nothing */ } while(0) +#endif + +/** + @todo For now there is no way for a user to set a different value of + maria_recover_options, i.e. auto-check-and-repair is always disabled. + We could enable it. As the auto-repair is initiated when opened from the + SQL layer (open_unireg_entry(), check_and_repair()), it does not happen + when Maria's Recovery internally opens the table to apply log records to + it, which is good. It would happen only after Recovery, if the table is + still corrupted. 
+*/ +ulong maria_recover_options= HA_RECOVER_NONE; +static handlerton *maria_hton; + +/* bits in maria_recover_options */ +const char *maria_recover_names[]= +{ + "DEFAULT", "BACKUP", "FORCE", "QUICK", NullS +}; +TYPELIB maria_recover_typelib= +{ + array_elements(maria_recover_names) - 1, "", + maria_recover_names, NULL +}; + +const char *maria_stats_method_names[]= +{ + "nulls_unequal", "nulls_equal", + "nulls_ignored", NullS +}; +TYPELIB maria_stats_method_typelib= +{ + array_elements(maria_stats_method_names) - 1, "", + maria_stats_method_names, NULL +}; + + +/***************************************************************************** +** MARIA tables +*****************************************************************************/ + +static handler *maria_create_handler(handlerton *hton, + TABLE_SHARE * table, + MEM_ROOT *mem_root) +{ + return new (mem_root) ha_maria(hton, table); +} + + +// Collect messages printed by maria_check routines: passed to sql_print_error() when thd->vio_ok() is false, to my_message() when a T_CREATE_MISSING_KEYS/T_SAFE_REPAIR/T_AUTO_REPAIR flag is set, otherwise written through thd->protocol as a row (table name, operation, message type, text) + +static void _ma_check_print_msg(HA_CHECK *param, const char *msg_type, + const char *fmt, va_list args) +{ + THD *thd= (THD *) param->thd; + Protocol *protocol= thd->protocol; + uint length, msg_length; + char msgbuf[MARIA_MAX_MSG_BUF]; + char name[NAME_LEN * 2 + 2]; + + msg_length= my_vsnprintf(msgbuf, sizeof(msgbuf), fmt, args); + msgbuf[sizeof(msgbuf) - 1]= 0; // healthy paranoia + + DBUG_PRINT(msg_type, ("message: %s", msgbuf)); + + if (!thd->vio_ok()) + { + sql_print_error("%s", msgbuf); // msgbuf is already-formatted data and may contain '%': never use it as the format + return; + } + + if (param->testflag & + (T_CREATE_MISSING_KEYS | T_SAFE_REPAIR | T_AUTO_REPAIR)) + { + my_message(ER_NOT_KEYFILE, msgbuf, MYF(MY_WME)); + return; + } + length= (uint) (strxmov(name, param->db_name, ".", param->table_name, + NullS) - name); + /* + TODO: switch from protocol to push_warning here. The main reason we didn't + do it yet is parallel repair. Due to following trace: + ma_check_print_msg/push_warning/sql_alloc/my_pthread_getspecific_ptr. 
+ + Also we likely need to lock mutex here (in both cases with protocol and + push_warning). + */ + protocol->prepare_for_resend(); + protocol->store(name, length, system_charset_info); + protocol->store(param->op_name, system_charset_info); + protocol->store(msg_type, system_charset_info); + protocol->store(msgbuf, msg_length, system_charset_info); + if (protocol->write()) + sql_print_error("Failed on my_net_write, writing to stderr instead: %s\n", + msgbuf); + return; +} + + +/* + Convert TABLE object to Maria key and column definition + + SYNOPSIS + table2maria() + table_arg in TABLE object. + keydef_out out Maria key definition. + recinfo_out out Maria column definition. + records_out out Number of fields. + + DESCRIPTION + This function will allocate and initialize Maria key and column + definition for further use in ma_create or for a check for underlying + table conformance in merge engine. + + RETURN VALUE + 0 OK + # error code +*/ + +int table2maria(TABLE *table_arg, MARIA_KEYDEF **keydef_out, + MARIA_COLUMNDEF **recinfo_out, uint *records_out) +{ + uint i, j, recpos, minpos, fieldpos, temp_length, length; + enum ha_base_keytype type= HA_KEYTYPE_BINARY; + uchar *record; + KEY *pos; + MARIA_KEYDEF *keydef; + MARIA_COLUMNDEF *recinfo, *recinfo_pos; + HA_KEYSEG *keyseg; + TABLE_SHARE *share= table_arg->s; + uint options= share->db_options_in_use; + DBUG_ENTER("table2maria"); + + if (!(my_multi_malloc(MYF(MY_WME), + recinfo_out, (share->fields * 2 + 2) * sizeof(MARIA_COLUMNDEF), + keydef_out, share->keys * sizeof(MARIA_KEYDEF), + &keyseg, + (share->key_parts + share->keys) * sizeof(HA_KEYSEG), + NullS))) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); /* purecov: inspected */ + keydef= *keydef_out; + recinfo= *recinfo_out; + pos= table_arg->key_info; + for (i= 0; i < share->keys; i++, pos++) + { + keydef[i].flag= (pos->flags & (HA_NOSAME | HA_FULLTEXT | HA_SPATIAL)); + keydef[i].key_alg= pos->algorithm == HA_KEY_ALG_UNDEF ? + (pos->flags & HA_SPATIAL ? 
HA_KEY_ALG_RTREE : HA_KEY_ALG_BTREE) : + pos->algorithm; + keydef[i].block_length= pos->block_size; + keydef[i].seg= keyseg; + keydef[i].keysegs= pos->key_parts; + for (j= 0; j < pos->key_parts; j++) + { + Field *field= pos->key_part[j].field; + type= field->key_type(); + keydef[i].seg[j].flag= pos->key_part[j].key_part_flag; + + if (options & HA_OPTION_PACK_KEYS || + (pos->flags & (HA_PACK_KEY | HA_BINARY_PACK_KEY | + HA_SPACE_PACK_USED))) + { + if (pos->key_part[j].length > 8 && + (type == HA_KEYTYPE_TEXT || + type == HA_KEYTYPE_NUM || + (type == HA_KEYTYPE_BINARY && !field->zero_pack()))) + { + /* No blobs here */ + if (j == 0) + keydef[i].flag|= HA_PACK_KEY; + if (!(field->flags & ZEROFILL_FLAG) && + (field->type() == MYSQL_TYPE_STRING || + field->type() == MYSQL_TYPE_VAR_STRING || + ((int) (pos->key_part[j].length - field->decimals())) >= 4)) + keydef[i].seg[j].flag|= HA_SPACE_PACK; + } + else if (j == 0 && (!(pos->flags & HA_NOSAME) || pos->key_length > 16)) + keydef[i].flag|= HA_BINARY_PACK_KEY; + } + keydef[i].seg[j].type= (int) type; + keydef[i].seg[j].start= pos->key_part[j].offset; + keydef[i].seg[j].length= pos->key_part[j].length; + keydef[i].seg[j].bit_start= keydef[i].seg[j].bit_end= + keydef[i].seg[j].bit_length= 0; + keydef[i].seg[j].bit_pos= 0; + keydef[i].seg[j].language= field->charset()->number; + + if (field->null_ptr) + { + keydef[i].seg[j].null_bit= field->null_bit; + keydef[i].seg[j].null_pos= (uint) (field->null_ptr- + (uchar*) table_arg->record[0]); + } + else + { + keydef[i].seg[j].null_bit= 0; + keydef[i].seg[j].null_pos= 0; + } + if (field->type() == MYSQL_TYPE_BLOB || + field->type() == MYSQL_TYPE_GEOMETRY) + { + keydef[i].seg[j].flag|= HA_BLOB_PART; + /* save number of bytes used to pack length */ + keydef[i].seg[j].bit_start= (uint) (field->pack_length() - + share->blob_ptr_size); + } + else if (field->type() == MYSQL_TYPE_BIT) + { + keydef[i].seg[j].bit_length= ((Field_bit *) field)->bit_len; + keydef[i].seg[j].bit_start= 
((Field_bit *) field)->bit_ofs; + keydef[i].seg[j].bit_pos= (uint) (((Field_bit *) field)->bit_ptr - + (uchar*) table_arg->record[0]); + } + } + keyseg+= pos->key_parts; + } + if (table_arg->found_next_number_field) + keydef[share->next_number_index].flag|= HA_AUTO_KEY; + record= table_arg->record[0]; + recpos= 0; + recinfo_pos= recinfo; + while (recpos < (uint) share->reclength) + { + Field **field, *found= 0; + minpos= share->reclength; + length= 0; + + for (field= table_arg->field; *field; field++) + { + if ((fieldpos= (*field)->offset(record)) >= recpos && + fieldpos <= minpos) + { + /* skip null fields */ + if (!(temp_length= (*field)->pack_length_in_rec())) + continue; /* Skip null-fields */ + if (! found || fieldpos < minpos || + (fieldpos == minpos && temp_length < length)) + { + minpos= fieldpos; + found= *field; + length= temp_length; + } + } + } + DBUG_PRINT("loop", ("found: 0x%lx recpos: %d minpos: %d length: %d", + (long) found, recpos, minpos, length)); + if (recpos != minpos) + { // Reserved space (Null bits?) + bzero((char*) recinfo_pos, sizeof(*recinfo_pos)); + recinfo_pos->type= FIELD_NORMAL; + recinfo_pos++->length= (uint16) (minpos - recpos); + } + if (!found) + break; + + if (found->flags & BLOB_FLAG) + recinfo_pos->type= FIELD_BLOB; + else if (found->type() == MYSQL_TYPE_VARCHAR) + recinfo_pos->type= FIELD_VARCHAR; + else if (!(options & HA_OPTION_PACK_RECORD) || + (found->zero_pack() && (found->flags & PRI_KEY_FLAG))) + recinfo_pos->type= FIELD_NORMAL; + else if (found->zero_pack()) + recinfo_pos->type= FIELD_SKIP_ZERO; + else + recinfo_pos->type= ((length <= 3 || + (found->flags & ZEROFILL_FLAG)) ? + FIELD_NORMAL : + found->type() == MYSQL_TYPE_STRING || + found->type() == MYSQL_TYPE_VAR_STRING ? 
+ FIELD_SKIP_ENDSPACE : + FIELD_SKIP_PRESPACE); + if (found->null_ptr) + { + recinfo_pos->null_bit= found->null_bit; + recinfo_pos->null_pos= (uint) (found->null_ptr - + (uchar*) table_arg->record[0]); + } + else + { + recinfo_pos->null_bit= 0; + recinfo_pos->null_pos= 0; + } + (recinfo_pos++)->length= (uint16) length; + recpos= minpos + length; + DBUG_PRINT("loop", ("length: %d type: %d", + recinfo_pos[-1].length,recinfo_pos[-1].type)); + } + *records_out= (uint) (recinfo_pos - recinfo); + DBUG_RETURN(0); +} + + +/* + Check for underlying table conformance + + SYNOPSIS + maria_check_definition() + t1_keyinfo in First table key definition + t1_recinfo in First table record definition + t1_keys in Number of keys in first table + t1_recs in Number of records in first table + t2_keyinfo in Second table key definition + t2_recinfo in Second table record definition + t2_keys in Number of keys in second table + t2_recs in Number of records in second table + strict in Strict check switch + + DESCRIPTION + This function compares two Maria definitions. By intention it was done + to compare merge table definition against underlying table definition. + It may also be used to compare dot-frm and MAI definitions of Maria + table as well to compare different Maria table definitions. + + For merge table it is not required that number of keys in merge table + must exactly match number of keys in underlying table. When calling this + function for underlying table conformance check, 'strict' flag must be + set to false, and converted merge definition must be passed as t1_*. + + Otherwise 'strict' flag must be set to 1 and it is not required to pass + converted dot-frm definition as t1_*. + + RETURN VALUE + 0 - Equal definitions. + 1 - Different definitions. + + TODO + - compare FULLTEXT keys; + - compare SPATIAL keys; + - compare FIELD_SKIP_ZERO which is converted to FIELD_NORMAL correctly + (should be corretly detected in table2maria). 
+*/ +int maria_check_definition(MARIA_KEYDEF *t1_keyinfo, + MARIA_COLUMNDEF *t1_recinfo, + uint t1_keys, uint t1_recs, + MARIA_KEYDEF *t2_keyinfo, + MARIA_COLUMNDEF *t2_recinfo, + uint t2_keys, uint t2_recs, bool strict) +{ + uint i, j; + DBUG_ENTER("maria_check_definition"); + if ((strict ? t1_keys != t2_keys : t1_keys > t2_keys)) + { + DBUG_PRINT("error", ("Number of keys differs: t1_keys=%u, t2_keys=%u", + t1_keys, t2_keys)); + DBUG_RETURN(1); + } + if (t1_recs != t2_recs) + { + DBUG_PRINT("error", ("Number of recs differs: t1_recs=%u, t2_recs=%u", + t1_recs, t2_recs)); + DBUG_RETURN(1); + } + for (i= 0; i < t1_keys; i++) + { + HA_KEYSEG *t1_keysegs= t1_keyinfo[i].seg; + HA_KEYSEG *t2_keysegs= t2_keyinfo[i].seg; + if (t1_keyinfo[i].flag & HA_FULLTEXT && t2_keyinfo[i].flag & HA_FULLTEXT) + continue; + else if (t1_keyinfo[i].flag & HA_FULLTEXT || + t2_keyinfo[i].flag & HA_FULLTEXT) + { + DBUG_PRINT("error", ("Key %d has different definition", i)); + DBUG_PRINT("error", ("t1_fulltext= %d, t2_fulltext=%d", + test(t1_keyinfo[i].flag & HA_FULLTEXT), + test(t2_keyinfo[i].flag & HA_FULLTEXT))); + DBUG_RETURN(1); + } + if (t1_keyinfo[i].flag & HA_SPATIAL && t2_keyinfo[i].flag & HA_SPATIAL) + continue; + else if (t1_keyinfo[i].flag & HA_SPATIAL || + t2_keyinfo[i].flag & HA_SPATIAL) + { + DBUG_PRINT("error", ("Key %d has different definition", i)); + DBUG_PRINT("error", ("t1_spatial= %d, t2_spatial=%d", + test(t1_keyinfo[i].flag & HA_SPATIAL), + test(t2_keyinfo[i].flag & HA_SPATIAL))); + DBUG_RETURN(1); + } + if (t1_keyinfo[i].keysegs != t2_keyinfo[i].keysegs || + t1_keyinfo[i].key_alg != t2_keyinfo[i].key_alg) + { + DBUG_PRINT("error", ("Key %d has different definition", i)); + DBUG_PRINT("error", ("t1_keysegs=%d, t1_key_alg=%d", + t1_keyinfo[i].keysegs, t1_keyinfo[i].key_alg)); + DBUG_PRINT("error", ("t2_keysegs=%d, t2_key_alg=%d", + t2_keyinfo[i].keysegs, t2_keyinfo[i].key_alg)); + DBUG_RETURN(1); + } + for (j= t1_keyinfo[i].keysegs; j--;) + { + if (t1_keysegs[j].type 
!= t2_keysegs[j].type || + t1_keysegs[j].language != t2_keysegs[j].language || + t1_keysegs[j].null_bit != t2_keysegs[j].null_bit || + t1_keysegs[j].length != t2_keysegs[j].length) + { + DBUG_PRINT("error", ("Key segment %d (key %d) has different " + "definition", j, i)); + DBUG_PRINT("error", ("t1_type=%d, t1_language=%d, t1_null_bit=%d, " + "t1_length=%d", + t1_keysegs[j].type, t1_keysegs[j].language, + t1_keysegs[j].null_bit, t1_keysegs[j].length)); + DBUG_PRINT("error", ("t2_type=%d, t2_language=%d, t2_null_bit=%d, " + "t2_length=%d", + t2_keysegs[j].type, t2_keysegs[j].language, + t2_keysegs[j].null_bit, t2_keysegs[j].length)); + + DBUG_RETURN(1); + } + } + } + for (i= 0; i < t1_recs; i++) + { + MARIA_COLUMNDEF *t1_rec= &t1_recinfo[i]; + MARIA_COLUMNDEF *t2_rec= &t2_recinfo[i]; + /* + FIELD_SKIP_ZERO can be changed to FIELD_NORMAL in maria_create, + see NOTE1 in ma_create.c + */ + if ((t1_rec->type != t2_rec->type && + !(t1_rec->type == (int) FIELD_SKIP_ZERO && + t1_rec->length == 1 && + t2_rec->type == (int) FIELD_NORMAL)) || + t1_rec->length != t2_rec->length || + t1_rec->null_bit != t2_rec->null_bit) + { + DBUG_PRINT("error", ("Field %d has different definition", i)); + DBUG_PRINT("error", ("t1_type=%d, t1_length=%d, t1_null_bit=%d", + t1_rec->type, t1_rec->length, t1_rec->null_bit)); + DBUG_PRINT("error", ("t2_type=%d, t2_length=%d, t2_null_bit=%d", + t2_rec->type, t2_rec->length, t2_rec->null_bit)); + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +extern "C" { + +volatile int *_ma_killed_ptr(HA_CHECK *param) +{ + /* In theory Unsafe conversion, but should be ok for now */ + return (int*) &(((THD *) (param->thd))->killed); +} + + +void _ma_check_print_error(HA_CHECK *param, const char *fmt, ...) 
+{ + va_list args; + DBUG_ENTER("_ma_check_print_error"); + param->error_printed |= 1; + param->out_flag |= O_DATA_LOST; + va_start(args, fmt); + _ma_check_print_msg(param, "error", fmt, args); + va_end(args); + DBUG_VOID_RETURN; +} + + +void _ma_check_print_info(HA_CHECK *param, const char *fmt, ...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_info"); + va_start(args, fmt); + _ma_check_print_msg(param, "info", fmt, args); + va_end(args); + DBUG_VOID_RETURN; +} + + +void _ma_check_print_warning(HA_CHECK *param, const char *fmt, ...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_warning"); + param->warning_printed= 1; + param->out_flag |= O_DATA_LOST; + va_start(args, fmt); + _ma_check_print_msg(param, "warning", fmt, args); + va_end(args); + DBUG_VOID_RETURN; +} + +} + + +ha_maria::ha_maria(handlerton *hton, TABLE_SHARE *table_arg): +handler(hton, table_arg), file(0), +int_table_flags(HA_NULL_IN_KEY | HA_CAN_FULLTEXT | HA_CAN_SQL_HANDLER | + HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE | + HA_DUPLICATE_POS | HA_CAN_INDEX_BLOBS | HA_AUTO_PART_KEY | + HA_FILE_BASED | HA_CAN_GEOMETRY | MARIA_CANNOT_ROLLBACK | + HA_CAN_BIT_FIELD | HA_CAN_RTREEKEYS | + HA_HAS_RECORDS | HA_STATS_RECORDS_IS_EXACT), +can_enable_indexes(1) +{} + + +handler *ha_maria::clone(MEM_ROOT *mem_root) +{ + ha_maria *new_handler= static_cast <ha_maria *>(handler::clone(mem_root)); + if (new_handler) + new_handler->file->state= file->state; + return new_handler; +} + + +static const char *ha_maria_exts[]= +{ + MARIA_NAME_IEXT, + MARIA_NAME_DEXT, + NullS +}; + + +const char **ha_maria::bas_ext() const +{ + return ha_maria_exts; +} + + +const char *ha_maria::index_type(uint key_number) +{ + return ((table->key_info[key_number].flags & HA_FULLTEXT) ? + "FULLTEXT" : + (table->key_info[key_number].flags & HA_SPATIAL) ? + "SPATIAL" : + (table->key_info[key_number].algorithm == HA_KEY_ALG_RTREE) ? 
+ "RTREE" : "BTREE"); +} + + +double ha_maria::scan_time() +{ + if (file->s->data_file_type == BLOCK_RECORD) + return ulonglong2double(stats.data_file_length - file->s->block_size) / max(file->s->block_size / 2, IO_SIZE) + 2; + return handler::scan_time(); +} + +/* + We need to be able to store at least two keys on an index page as the + splitting algorithms depends on this. (With only one key on a page + we also can't use any compression, which may make the index file much + larger) + We use HA_MAX_KEY_BUFF as this is a stack restriction imposed by the + handler interface. + + We also need to reserve place for a record pointer (8) and 3 bytes + per key segment to store the length of the segment + possible null bytes. + These extra bytes are required here so that maria_create() will surely + accept any keys created which the returned key data storage length. +*/ + +uint ha_maria::max_supported_key_length() const +{ + uint tmp= (maria_max_key_length() - 8 - HA_MAX_KEY_SEG*3); + return min(HA_MAX_KEY_BUFF, tmp); +} + + +#ifdef HAVE_REPLICATION +int ha_maria::net_read_dump(NET * net) +{ + int data_fd= file->dfile.file; + int error= 0; + + my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME)); + for (;;) + { + ulong packet_len= my_net_read(net); + if (!packet_len) + break; // end of file + if (packet_len == packet_error) + { + sql_print_error("ha_maria::net_read_dump - read error "); + error= -1; + goto err; + } + if (my_write(data_fd, (uchar *) net->read_pos, (uint) packet_len, + MYF(MY_WME | MY_FNABP))) + { + error= errno; + goto err; + } + } +err: + return error; +} + + +int ha_maria::dump(THD * thd, int fd) +{ + MARIA_SHARE *share= file->s; + NET *net= &thd->net; + uint block_size= share->block_size; + my_off_t bytes_to_read= share->state.state.data_file_length; + int data_fd= file->dfile.file; + uchar *buf= (uchar *) my_malloc(block_size, MYF(MY_WME)); + if (!buf) + return ENOMEM; + + int error= 0; + my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME)); + for (; bytes_to_read 
> 0;) + { + size_t bytes= my_read(data_fd, buf, block_size, MYF(MY_WME)); + if (bytes == MY_FILE_ERROR) + { + error= errno; + goto err; + } + + if (fd >= 0) + { + if (my_write(fd, buf, bytes, MYF(MY_WME | MY_FNABP))) + { + error= errno ? errno : EPIPE; + goto err; + } + } + else + { + if (my_net_write(net, buf, bytes)) + { + error= errno ? errno : EPIPE; + goto err; + } + } + bytes_to_read -= bytes; + } + + if (fd < 0) + { + if (my_net_write(net, (uchar*) "", 0)) + error= errno ? errno : EPIPE; + net_flush(net); + } + +err: + my_free((uchar*) buf, MYF(0)); + return error; +} +#endif /* HAVE_REPLICATION */ + + +bool ha_maria::check_if_locking_is_allowed(uint sql_command, + ulong type, TABLE *table, + uint count, uint current, + uint *system_count, + bool called_by_privileged_thread) +{ + /* + To be able to open and lock for reading system tables like 'mysql.proc', + when we already have some tables opened and locked, and avoid deadlocks + we have to disallow write-locking of these tables with any other tables. + */ + if (table->s->system_table && + table->reginfo.lock_type >= TL_WRITE_ALLOW_WRITE) + (*system_count)++; + + /* 'current' is an index, that's why '<=' below. */ + if (*system_count > 0 && *system_count <= current) + { + my_error(ER_WRONG_LOCK_OF_SYSTEM_TABLE, MYF(0)); + return FALSE; + } + + /* + Deny locking of the log tables, which is incompatible with + concurrent insert. Unless called from a logger THD (general_log_thd + or slow_log_thd) or by a privileged thread. + */ + if (!called_by_privileged_thread) + return check_if_log_table_locking_is_allowed(sql_command, type, table); + + return TRUE; +} + + + /* Name is here without an extension */ + +int ha_maria::open(const char *name, int mode, uint test_if_locked) +{ + MARIA_KEYDEF *keyinfo; + MARIA_COLUMNDEF *recinfo= 0; + uint recs; + uint i; + +#ifdef NOT_USED + /* + If the user wants to have memory mapped data files, add an + open_flag. 
Do not memory map temporary tables because they are + expected to be inserted and thus extended a lot. Memory mapping is + efficient for files that keep their size, but very inefficient for + growing files. Using an open_flag instead of calling ma_extra(... + HA_EXTRA_MMAP ...) after maxs_open() has the advantage that the + mapping is not repeated for every open, but just done on the initial + open, when the MyISAM share is created. Everytime the server + requires to open a new instance of a table it calls this method. We + will always supply HA_OPEN_MMAP for a permanent table. However, the + Maria storage engine will ignore this flag if this is a secondary + open of a table that is in use by other threads already (if the + Maria share exists already). + */ + if (!(test_if_locked & HA_OPEN_TMP_TABLE) && opt_maria_use_mmap) + test_if_locked|= HA_OPEN_MMAP; +#endif + + if (!(file= maria_open(name, mode, test_if_locked | HA_OPEN_FROM_SQL_LAYER))) + return (my_errno ? my_errno : -1); + + /** + @todo ASK_MONTY + This is a protection for the case of a frm and MAI containing incompatible + table definitions (as in BUG#25908). This was merged from MyISAM. + But it breaks maria.test and ps_maria.test ("incorrect key file") if the + table is BLOCK_RECORD (does it have to do with column reordering done in + ma_create.c ?). 
+ */ + if (!table->s->tmp_table) /* No need to perform a check for tmp table */ + { + if ((my_errno= table2maria(table, &keyinfo, &recinfo, &recs))) + { + /* purecov: begin inspected */ + DBUG_PRINT("error", ("Failed to convert TABLE object to Maria " + "key and column definition")); + goto err; + /* purecov: end */ + } +#ifdef ASK_MONTY + if (maria_check_definition(keyinfo, recinfo, table->s->keys, recs, + file->s->keyinfo, file->s->columndef, + file->s->base.keys, file->s->base.fields, true)) +#else + if (0) +#endif + { + /* purecov: begin inspected */ + my_errno= HA_ERR_CRASHED; + goto err; + /* purecov: end */ + } + } + + if (test_if_locked & (HA_OPEN_IGNORE_IF_LOCKED | HA_OPEN_TMP_TABLE)) + VOID(maria_extra(file, HA_EXTRA_NO_WAIT_LOCK, 0)); + + info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); + if (!(test_if_locked & HA_OPEN_WAIT_IF_LOCKED)) + VOID(maria_extra(file, HA_EXTRA_WAIT_LOCK, 0)); + if ((data_file_type= file->s->data_file_type) != STATIC_RECORD) + int_table_flags |= HA_REC_NOT_IN_SEQ; + if (!file->s->base.born_transactional) + { + /* + INSERT DELAYED cannot work with transactional tables (because it cannot + stand up to "when client gets ok the data is safe on disk": the record + may not even be inserted). In the future, we could enable it back (as a + client doing INSERT DELAYED knows the specificities; but we then should + make sure to regularly commit in the delayed_insert thread). 
+ */ + int_table_flags|= HA_CAN_INSERT_DELAYED; + } + if (file->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)) + int_table_flags |= HA_HAS_CHECKSUM; + + for (i= 0; i < table->s->keys; i++) + { + plugin_ref parser= table->key_info[i].parser; + if (table->key_info[i].flags & HA_USES_PARSER) + file->s->keyinfo[i].parser= + (struct st_mysql_ftparser *)plugin_decl(parser)->info; + table->key_info[i].block_size= file->s->keyinfo[i].block_length; + } + my_errno= 0; + goto end; + err: + this->close(); + end: + /* + Both recinfo and keydef are allocated by my_multi_malloc(), thus only + recinfo must be freed. + */ + if (recinfo) + my_free((uchar*) recinfo, MYF(0)); + return my_errno; +} + + +int ha_maria::close(void) +{ + MARIA_HA *tmp= file; + file= 0; + return maria_close(tmp); +} + + +int ha_maria::write_row(uchar * buf) +{ + ha_statistic_increment(&SSV::ha_write_count); + + /* If we have a timestamp column, update it to the current time */ + if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT) + table->timestamp_field->set_time(); + + /* + If we have an auto_increment column and we are writing a changed row + or a new row, then update the auto_increment value in the record. + */ + if (table->next_number_field && buf == table->record[0]) + { + int error; + if ((error= update_auto_increment())) + return error; + } + return maria_write(file, buf); +} + + +int ha_maria::check(THD * thd, HA_CHECK_OPT * check_opt) +{ + if (!file) + return HA_ADMIN_INTERNAL_ERROR; + int error; + HA_CHECK param; + MARIA_SHARE *share= file->s; + const char *old_proc_info= thd->proc_info; + + thd->proc_info= "Checking table"; + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "check"; + param.db_name= table->s->db.str; + param.table_name= table->alias; + param.testflag= check_opt->flags | T_CHECK | T_SILENT; + param.stats_method= (enum_handler_stats_method) thd->variables. 
+ maria_stats_method; + + if (!(table->db_stat & HA_READ_ONLY)) + param.testflag |= T_STATISTICS; + param.using_global_keycache= 1; + + if (!maria_is_crashed(file) && + (((param.testflag & T_CHECK_ONLY_CHANGED) && + !(share->state.changed & (STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR)) && + share->state.open_count == 0) || + ((param.testflag & T_FAST) && (share->state.open_count == + (uint) (share->global_changed ? 1 : + 0))))) + return HA_ADMIN_ALREADY_DONE; + + error= maria_chk_status(¶m, file); // Not fatal + error= maria_chk_size(¶m, file); + if (!error) + error |= maria_chk_del(¶m, file, param.testflag); + if (!error) + error= maria_chk_key(¶m, file); + if (!error) + { + if ((!(param.testflag & T_QUICK) && + ((share->options & + (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) || + (param.testflag & (T_EXTEND | T_MEDIUM)))) || maria_is_crashed(file)) + { + uint old_testflag= param.testflag; + param.testflag |= T_MEDIUM; + if (!(error= init_io_cache(¶m.read_cache, file->dfile.file, + my_default_record_cache_size, READ_CACHE, + share->pack.header_length, 1, MYF(MY_WME)))) + { + error= maria_chk_data_link(¶m, file, param.testflag & T_EXTEND); + end_io_cache(&(param.read_cache)); + } + param.testflag= old_testflag; + } + } + if (!error) + { + if ((share->state.changed & (STATE_CHANGED | + STATE_CRASHED_ON_REPAIR | + STATE_CRASHED | STATE_NOT_ANALYZED)) || + (param.testflag & T_STATISTICS) || maria_is_crashed(file)) + { + file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + pthread_mutex_lock(&share->intern_lock); + share->state.changed &= ~(STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR); + if (!(table->db_stat & HA_READ_ONLY)) + error= maria_update_state_info(¶m, file, UPDATE_TIME | UPDATE_OPEN_COUNT | + UPDATE_STAT); + pthread_mutex_unlock(&share->intern_lock); + info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE | + HA_STATUS_CONST); + } + } + else if (!maria_is_crashed(file) && !thd->killed) + { + 
maria_mark_crashed(file); + file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + } + + thd->proc_info= old_proc_info; + return error ? HA_ADMIN_CORRUPT : HA_ADMIN_OK; +} + + +/* + Analyze the key distribution in the table + As the table may be only locked for read, we have to take into account that + two threads may do an analyze at the same time! +*/ + +int ha_maria::analyze(THD *thd, HA_CHECK_OPT * check_opt) +{ + int error= 0; + HA_CHECK param; + MARIA_SHARE *share= file->s; + + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "analyze"; + param.db_name= table->s->db.str; + param.table_name= table->alias; + param.testflag= (T_FAST | T_CHECK | T_SILENT | T_STATISTICS | + T_DONT_CHECK_CHECKSUM); + param.using_global_keycache= 1; + param.stats_method= (enum_handler_stats_method) thd->variables. + maria_stats_method; + + if (!(share->state.changed & STATE_NOT_ANALYZED)) + return HA_ADMIN_ALREADY_DONE; + + error= maria_chk_key(¶m, file); + if (!error) + { + pthread_mutex_lock(&share->intern_lock); + error= maria_update_state_info(¶m, file, UPDATE_STAT); + pthread_mutex_unlock(&share->intern_lock); + } + else if (!maria_is_crashed(file) && !thd->killed) + maria_mark_crashed(file); + return error ? 
HA_ADMIN_CORRUPT : HA_ADMIN_OK; +} + + +int ha_maria::restore(THD * thd, HA_CHECK_OPT *check_opt) +{ + HA_CHECK_OPT tmp_check_opt; + char *backup_dir= thd->lex->backup_dir; + char src_path[FN_REFLEN], dst_path[FN_REFLEN]; + char table_name[FN_REFLEN]; + int error; + const char *errmsg; + DBUG_ENTER("restore"); + + VOID(tablename_to_filename(table->s->table_name.str, table_name, + sizeof(table_name))); + + if (fn_format_relative_to_data_home(src_path, table_name, backup_dir, + MARIA_NAME_DEXT)) + DBUG_RETURN(HA_ADMIN_INVALID); + + strxmov(dst_path, table->s->normalized_path.str, MARIA_NAME_DEXT, NullS); + if (my_copy(src_path, dst_path, MYF(MY_WME))) + { + error= HA_ADMIN_FAILED; + errmsg= "Failed in my_copy (Error %d)"; + goto err; + } + + tmp_check_opt.init(); + tmp_check_opt.flags |= T_VERY_SILENT | T_CALC_CHECKSUM | T_QUICK; + DBUG_RETURN(repair(thd, &tmp_check_opt)); + +err: + { + HA_CHECK param; + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "restore"; + param.db_name= table->s->db.str; + param.table_name= table->s->table_name.str; + param.testflag= 0; + _ma_check_print_error(¶m, errmsg, my_errno); + DBUG_RETURN(error); + } +} + + +int ha_maria::backup(THD * thd, HA_CHECK_OPT *check_opt) +{ + char *backup_dir= thd->lex->backup_dir; + char src_path[FN_REFLEN], dst_path[FN_REFLEN]; + char table_name[FN_REFLEN]; + int error; + const char *errmsg; + DBUG_ENTER("ha_maria::backup"); + + VOID(tablename_to_filename(table->s->table_name.str, table_name, + sizeof(table_name))); + + if (fn_format_relative_to_data_home(dst_path, table_name, backup_dir, + reg_ext)) + { + errmsg= "Failed in fn_format() for .frm file (errno: %d)"; + error= HA_ADMIN_INVALID; + goto err; + } + + strxmov(src_path, table->s->normalized_path.str, reg_ext, NullS); + if (my_copy(src_path, dst_path, + MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_DONT_OVERWRITE_FILE))) + { + error= HA_ADMIN_FAILED; + errmsg= "Failed copying .frm file (errno: %d)"; + goto err; + } + + /* Change extension */ + if 
(fn_format_relative_to_data_home(dst_path, table_name, backup_dir, + MARIA_NAME_DEXT)) + { + errmsg= "Failed in fn_format() for .MYD file (errno: %d)"; + error= HA_ADMIN_INVALID; + goto err; + } + + strxmov(src_path, table->s->normalized_path.str, MARIA_NAME_DEXT, NullS); + if (_ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_FORCE_WRITE, + FLUSH_KEEP)) + { + error= HA_ADMIN_FAILED; + errmsg= "Failed in flush (Error %d)"; + goto err; + } + if (my_copy(src_path, dst_path, + MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_DONT_OVERWRITE_FILE))) + { + errmsg= "Failed copying .MYD file (errno: %d)"; + error= HA_ADMIN_FAILED; + goto err; + } + DBUG_RETURN(HA_ADMIN_OK); + +err: + { + HA_CHECK param; + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "backup"; + param.db_name= table->s->db.str; + param.table_name= table->s->table_name.str; + param.testflag= 0; + _ma_check_print_error(¶m, errmsg, my_errno); + DBUG_RETURN(error); + } +} + + +int ha_maria::repair(THD * thd, HA_CHECK_OPT *check_opt) +{ + int error; + HA_CHECK param; + ha_rows start_records; + + if (!file) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "repair"; + param.testflag= ((check_opt->flags & ~(T_EXTEND)) | + T_SILENT | T_FORCE_CREATE | T_CALC_CHECKSUM | + (check_opt->flags & T_EXTEND ? 
T_REP : T_REP_BY_SORT)); + param.sort_buffer_length= check_opt->sort_buffer_size; + start_records= file->state->records; + while ((error= repair(thd, param, 0)) && param.retry_repair) + { + param.retry_repair= 0; + if (test_all_bits(param.testflag, + (uint) (T_RETRY_WITHOUT_QUICK | T_QUICK))) + { + param.testflag &= ~T_RETRY_WITHOUT_QUICK; + sql_print_information("Retrying repair of: '%s' without quick", + table->s->path.str); + continue; + } + param.testflag &= ~T_QUICK; + if ((param.testflag & T_REP_BY_SORT)) + { + param.testflag= (param.testflag & ~T_REP_BY_SORT) | T_REP; + sql_print_information("Retrying repair of: '%s' with keycache", + table->s->path.str); + continue; + } + break; + } + if (!error && start_records != file->state->records && + !(check_opt->flags & T_VERY_SILENT)) + { + char llbuff[22], llbuff2[22]; + sql_print_information("Found %s of %s rows when repairing '%s'", + llstr(file->state->records, llbuff), + llstr(start_records, llbuff2), + table->s->path.str); + } + return error; +} + +int ha_maria::optimize(THD * thd, HA_CHECK_OPT *check_opt) +{ + int error; + if (!file) + return HA_ADMIN_INTERNAL_ERROR; + HA_CHECK param; + + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "optimize"; + param.testflag= (check_opt->flags | T_SILENT | T_FORCE_CREATE | + T_REP_BY_SORT | T_STATISTICS | T_SORT_INDEX); + param.sort_buffer_length= check_opt->sort_buffer_size; + if ((error= repair(thd, param, 1)) && param.retry_repair) + { + sql_print_warning("Warning: Optimize table got errno %d on %s.%s, retrying", + my_errno, param.db_name, param.table_name); + param.testflag &= ~T_REP_BY_SORT; + error= repair(thd, param, 1); + } + return error; +} + + +int ha_maria::repair(THD *thd, HA_CHECK ¶m, bool do_optimize) +{ + int error= 0; + uint local_testflag= param.testflag; + bool optimize_done= !do_optimize, statistics_done= 0; + const char *old_proc_info= thd->proc_info; + char fixed_name[FN_REFLEN]; + MARIA_SHARE *share= file->s; + ha_rows rows= 
file->state->records; + DBUG_ENTER("ha_maria::repair"); + + /* + Normally this method is entered with a properly opened table. If the + repair fails, it can be repeated with more elaborate options. Under + special circumstances it can happen that a repair fails so that it + closed the data file and cannot re-open it. In this case file->dfile + is set to -1. We must not try another repair without an open data + file. (Bug #25289) + */ + if (file->dfile.file == -1) + { + sql_print_information("Retrying repair of: '%s' failed. " + "Please try REPAIR EXTENDED or maria_chk", + table->s->path.str); + DBUG_RETURN(HA_ADMIN_FAILED); + } + + param.db_name= table->s->db.str; + param.table_name= table->alias; + param.tmpfile_createflag= O_RDWR | O_TRUNC; + param.using_global_keycache= 1; + param.thd= thd; + param.tmpdir= &mysql_tmpdir_list; + param.out_flag= 0; + strmov(fixed_name, file->s->open_file_name); + + // Don't lock tables if we have used LOCK TABLE + if (!thd->locked_tables && + maria_lock_database(file, table->s->tmp_table ? F_EXTRA_LCK : F_WRLCK)) + { + _ma_check_print_error(¶m, ER(ER_CANT_LOCK), my_errno); + DBUG_RETURN(HA_ADMIN_FAILED); + } + + if (!do_optimize || + ((file->state->del || + ((file->s->data_file_type != BLOCK_RECORD) && + share->state.split != file->state->records)) && + (!(param.testflag & T_QUICK) || + (share->state.changed & (STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_OPTIMIZED_ROWS))))) + { + ulonglong key_map= ((local_testflag & T_CREATE_MISSING_KEYS) ? 
+ maria_get_mask_all_keys_active(share->base.keys) : + share->state.key_map); + uint testflag= param.testflag; + if (maria_test_if_sort_rep(file, file->state->records, key_map, 0) && + (local_testflag & T_REP_BY_SORT)) + { + local_testflag |= T_STATISTICS; + param.testflag |= T_STATISTICS; // We get this for free + statistics_done= 1; + /* TODO: Remove BLOCK_RECORD test when parallel works with blocks */ + if (thd->variables.maria_repair_threads > 1 && + file->s->data_file_type != BLOCK_RECORD) + { + char buf[40]; + /* TODO: respect maria_repair_threads variable */ + my_snprintf(buf, 40, "Repair with %d threads", my_count_bits(key_map)); + thd->proc_info= buf; + error= maria_repair_parallel(¶m, file, fixed_name, + param.testflag & T_QUICK); + thd->proc_info= "Repair done"; // to reset proc_info, as + // it was pointing to local buffer + } + else + { + thd->proc_info= "Repair by sorting"; + error= maria_repair_by_sort(¶m, file, fixed_name, + param.testflag & T_QUICK); + } + } + else + { + thd->proc_info= "Repair with keycache"; + param.testflag &= ~T_REP_BY_SORT; + error= maria_repair(¶m, file, fixed_name, param.testflag & T_QUICK); + } + param.testflag= testflag; + optimize_done= 1; + } + if (!error) + { + if ((local_testflag & T_SORT_INDEX) && + (share->state.changed & STATE_NOT_SORTED_PAGES)) + { + optimize_done= 1; + thd->proc_info= "Sorting index"; + error= maria_sort_index(¶m, file, fixed_name); + } + if (!statistics_done && (local_testflag & T_STATISTICS)) + { + if (share->state.changed & STATE_NOT_ANALYZED) + { + optimize_done= 1; + thd->proc_info= "Analyzing"; + error= maria_chk_key(¶m, file); + } + else + local_testflag &= ~T_STATISTICS; // Don't update statistics + } + } + thd->proc_info= "Saving state"; + pthread_mutex_lock(&share->intern_lock); + if (!error) + { + if ((share->state.changed & STATE_CHANGED) || maria_is_crashed(file)) + { + share->state.changed &= ~(STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR); + file->update |= 
HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + } + /* + the following 'if', thought conceptually wrong, + is a useful optimization nevertheless. + */ + if (file->state != &file->s->state.state) + file->s->state.state= *file->state; + if (file->s->base.auto_key) + _ma_update_auto_increment_key(¶m, file, 1); + if (optimize_done) + error= maria_update_state_info(¶m, file, + UPDATE_TIME | UPDATE_OPEN_COUNT | + (local_testflag & + T_STATISTICS ? UPDATE_STAT : 0)); + info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE | + HA_STATUS_CONST); + if (rows != file->state->records && !(param.testflag & T_VERY_SILENT)) + { + char llbuff[22], llbuff2[22]; + _ma_check_print_warning(¶m, "Number of rows changed from %s to %s", + llstr(rows, llbuff), + llstr(file->state->records, llbuff2)); + } + } + else + { + maria_mark_crashed_on_repair(file); + file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + maria_update_state_info(¶m, file, 0); + } + pthread_mutex_unlock(&share->intern_lock); + thd->proc_info= old_proc_info; + if (!thd->locked_tables) + { + _ma_reenable_logging_for_table(file->s); + maria_lock_database(file, F_UNLCK); + } + DBUG_RETURN(error ? HA_ADMIN_FAILED : + !optimize_done ? HA_ADMIN_ALREADY_DONE : HA_ADMIN_OK); +} + + +/* + Assign table indexes to a specific key cache. 
+
+  SYNOPSIS
+    assign_to_keycache()
+    thd         Thread handler, used when reporting an error
+    check_opt   Supplies the page cache to assign the indexes to
+
+  NOTES
+    The operation is disabled for now: the method returns
+    HA_ADMIN_NOT_IMPLEMENTED before doing any work, so everything after
+    that early return is currently unreachable.
+
+  RETURN
+    HA_ADMIN_NOT_IMPLEMENTED  always, while the operation is disabled
+*/
+
+int ha_maria::assign_to_keycache(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  PAGECACHE *new_pagecache= check_opt->pagecache;
+  const char *errmsg= 0;
+  int error= HA_ADMIN_OK;
+  ulonglong map;
+  TABLE_LIST *table_list= table->pos_in_table_list;
+  /*
+    Declared at function scope because errmsg may point into it and is
+    only read when the error is reported at the end of the function.
+  */
+  char buf[STRING_BUFFER_USUAL_SIZE];
+  DBUG_ENTER("ha_maria::assign_to_keycache");
+
+  /* for now, it is disabled */
+  DBUG_RETURN(HA_ADMIN_NOT_IMPLEMENTED);
+
+  table->keys_in_use_for_query.clear_all();
+
+  if (table_list->process_index_hints(table))
+    DBUG_RETURN(HA_ADMIN_FAILED);
+  map= ~(ulonglong) 0;
+  if (!table->keys_in_use_for_query.is_clear_all())
+    /* use all keys if there's no list specified by the user through hints */
+    map= table->keys_in_use_for_query.to_ulonglong();
+
+  if ((error= maria_assign_to_pagecache(file, map, new_pagecache)))
+  {
+    my_snprintf(buf, sizeof(buf),
+                "Failed to flush to index file (errno: %d)", error);
+    errmsg= buf;
+    error= HA_ADMIN_CORRUPT;
+  }
+
+  if (error != HA_ADMIN_OK)
+  {
+    /* Send error to user */
+    HA_CHECK param;
+    maria_chk_init(&param);
+    param.thd= thd;
+    param.op_name= "assign_to_keycache";
+    param.db_name= table->s->db.str;
+    param.table_name= table->s->table_name.str;
+    param.testflag= 0;
+    _ma_check_print_error(&param, errmsg);
+  }
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Preload pages of the index file for a table into the key cache.
+*/ + +int ha_maria::preload_keys(THD * thd, HA_CHECK_OPT *check_opt) +{ + int error; + const char *errmsg; + ulonglong map; + TABLE_LIST *table_list= table->pos_in_table_list; + my_bool ignore_leaves= table_list->ignore_leaves; + char buf[ERRMSGSIZE+20]; + + DBUG_ENTER("ha_maria::preload_keys"); + + table->keys_in_use_for_query.clear_all(); + + if (table_list->process_index_hints(table)) + DBUG_RETURN(HA_ADMIN_FAILED); + + map= ~(ulonglong) 0; + /* Check validity of the index references */ + if (!table->keys_in_use_for_query.is_clear_all()) + /* use all keys if there's no list specified by the user through hints */ + map= table->keys_in_use_for_query.to_ulonglong(); + + maria_extra(file, HA_EXTRA_PRELOAD_BUFFER_SIZE, + (void*) &thd->variables.preload_buff_size); + + if ((error= maria_preload(file, map, ignore_leaves))) + { + switch (error) { + case HA_ERR_NON_UNIQUE_BLOCK_SIZE: + errmsg= "Indexes use different block sizes"; + break; + case HA_ERR_OUT_OF_MEM: + errmsg= "Failed to allocate buffer"; + break; + default: + my_snprintf(buf, ERRMSGSIZE, + "Failed to read from index file (errno: %d)", my_errno); + errmsg= buf; + } + error= HA_ADMIN_FAILED; + goto err; + } + + DBUG_RETURN(HA_ADMIN_OK); + +err: + { + HA_CHECK param; + maria_chk_init(¶m); + param.thd= thd; + param.op_name= "preload_keys"; + param.db_name= table->s->db.str; + param.table_name= table->s->table_name.str; + param.testflag= 0; + _ma_check_print_error(¶m, errmsg); + DBUG_RETURN(error); + } +} + + +/* + Disable indexes, making it persistent if requested. + + SYNOPSIS + disable_indexes() + mode mode of operation: + HA_KEY_SWITCH_NONUNIQ disable all non-unique keys + HA_KEY_SWITCH_ALL disable all keys + HA_KEY_SWITCH_NONUNIQ_SAVE dis. non-uni. and make persistent + HA_KEY_SWITCH_ALL_SAVE dis. all keys and make persistent + + IMPLEMENTATION + HA_KEY_SWITCH_NONUNIQ is not implemented. + HA_KEY_SWITCH_ALL_SAVE is not implemented. + + RETURN + 0 ok + HA_ERR_WRONG_COMMAND mode not implemented. 
+*/ + +int ha_maria::disable_indexes(uint mode) +{ + int error; + + if (mode == HA_KEY_SWITCH_ALL) + { + /* call a storage engine function to switch the key map */ + error= maria_disable_indexes(file); + } + else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE) + { + maria_extra(file, HA_EXTRA_NO_KEYS, 0); + info(HA_STATUS_CONST); // Read new key info + error= 0; + } + else + { + /* mode not implemented */ + error= HA_ERR_WRONG_COMMAND; + } + return error; +} + + +/* + Enable indexes, making it persistent if requested. + + SYNOPSIS + enable_indexes() + mode mode of operation: + HA_KEY_SWITCH_NONUNIQ enable all non-unique keys + HA_KEY_SWITCH_ALL enable all keys + HA_KEY_SWITCH_NONUNIQ_SAVE en. non-uni. and make persistent + HA_KEY_SWITCH_ALL_SAVE en. all keys and make persistent + + DESCRIPTION + Enable indexes, which might have been disabled by disable_index() before. + The modes without _SAVE work only if both data and indexes are empty, + since the MARIA repair would enable them persistently. + To be sure in these cases, call handler::delete_all_rows() before. + + IMPLEMENTATION + HA_KEY_SWITCH_NONUNIQ is not implemented. + HA_KEY_SWITCH_ALL_SAVE is not implemented. + + RETURN + 0 ok + !=0 Error, among others: + HA_ERR_CRASHED data or index is non-empty. Delete all rows and retry. + HA_ERR_WRONG_COMMAND mode not implemented. +*/ + +int ha_maria::enable_indexes(uint mode) +{ + int error; + + if (maria_is_all_keys_active(file->s->state.key_map, file->s->base.keys)) + { + /* All indexes are enabled already. */ + return 0; + } + + if (mode == HA_KEY_SWITCH_ALL) + { + error= maria_enable_indexes(file); + /* + Do not try to repair on error, + as this could make the enabled state persistent, + but mode==HA_KEY_SWITCH_ALL forbids it. 
+ */ + } + else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE) + { + THD *thd= current_thd; + HA_CHECK param; + const char *save_proc_info= thd->proc_info; + thd->proc_info= "Creating index"; + maria_chk_init(¶m); + param.op_name= "recreating_index"; + param.testflag= (T_SILENT | T_REP_BY_SORT | T_QUICK | + T_CREATE_MISSING_KEYS); + param.myf_rw &= ~MY_WAIT_IF_FULL; + param.sort_buffer_length= thd->variables.maria_sort_buff_size; + param.stats_method= + (enum_handler_stats_method) thd->variables.maria_stats_method; + param.tmpdir= &mysql_tmpdir_list; + if ((error= (repair(thd, param, 0) != HA_ADMIN_OK)) && param.retry_repair) + { + sql_print_warning("Warning: Enabling keys got errno %d on %s.%s, retrying", + my_errno, param.db_name, param.table_name); + /* Repairing by sort failed. Now try standard repair method. */ + param.testflag &= ~(T_REP_BY_SORT | T_QUICK); + error= (repair(thd, param, 0) != HA_ADMIN_OK); + /* + If the standard repair succeeded, clear all error messages which + might have been set by the first repair. They can still be seen + with SHOW WARNINGS then. + */ +#ifndef EMBEDDED_LIBRARY + if (!error) + thd->clear_error(); +#endif /* EMBEDDED_LIBRARY */ + } + info(HA_STATUS_CONST); + thd->proc_info= save_proc_info; + } + else + { + /* mode not implemented */ + error= HA_ERR_WRONG_COMMAND; + } + return error; +} + + +/* + Test if indexes are disabled. + + + SYNOPSIS + indexes_are_disabled() + no parameters + + + RETURN + 0 indexes are not disabled + 1 all indexes are disabled + [2 non-unique indexes are disabled - NOT YET IMPLEMENTED] +*/ + +int ha_maria::indexes_are_disabled(void) +{ + return maria_indexes_are_disabled(file); +} + + +/* + prepare for a many-rows insert operation + e.g. - disable indexes (if they can be recreated fast) or + activate special bulk-insert optimizations + + SYNOPSIS + start_bulk_insert(rows) + rows Rows to be inserted + 0 if we don't know + + NOTICE + Do not forget to call end_bulk_insert() later! 
+*/
+
+void ha_maria::start_bulk_insert(ha_rows rows)
+{
+  DBUG_ENTER("ha_maria::start_bulk_insert");
+  THD *thd= current_thd;
+  ulong size= min(thd->variables.read_buff_size,
+                  table->s->avg_row_length * rows);
+  DBUG_PRINT("info", ("start_bulk_insert: rows %lu size %lu",
+                      (ulong) rows, size));
+
+  /* don't enable row cache if too few rows */
+  if (!rows || (rows > MARIA_MIN_ROWS_TO_USE_WRITE_CACHE))
+    maria_extra(file, HA_EXTRA_WRITE_CACHE, (void*) &size);
+
+  /* Remembered for end_bulk_insert(), which re-enables the indexes */
+  can_enable_indexes= (maria_is_all_keys_active(file->s->state.key_map,
+                                                file->s->base.keys));
+
+  if (!(specialflag & SPECIAL_SAFE_MODE))
+  {
+    /*
+      Only disable old index if the table was empty and we are inserting
+      a lot of rows.
+      We should not do this for only a few rows as this is slower and
+      we don't want to update the key statistics based of only a few rows.
+    */
+    if (file->state->records == 0 && can_enable_indexes &&
+        (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES))
+      maria_disable_non_unique_index(file, rows);
+    else if (!file->bulk_insert &&
+             (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT))
+    {
+      maria_init_bulk_insert(file, thd->variables.bulk_insert_buff_size, rows);
+    }
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  end special bulk-insert optimizations,
+  which have been activated by start_bulk_insert().
+
+  SYNOPSIS
+    end_bulk_insert()
+    no arguments
+
+  RETURN
+    0     OK
+    != 0  Error
+*/
+
+int ha_maria::end_bulk_insert()
+{
+  int err;
+  DBUG_ENTER("ha_maria::end_bulk_insert");
+  maria_end_bulk_insert(file);
+  err= maria_extra(file, HA_EXTRA_NO_CACHE, 0);
+  DBUG_RETURN(err ? err : can_enable_indexes ?
+              enable_indexes(HA_KEY_SWITCH_NONUNIQ_SAVE) : 0);
+}
+
+
+/*
+  Check the table and repair it if it is marked as crashed or the check
+  fails. While this runs, thd->query is set to the table name; it is
+  restored before returning.
+
+  RETURN
+    0  ok
+    1  the repair failed
+*/
+
+bool ha_maria::check_and_repair(THD *thd)
+{
+  int error= 0;
+  int marked_crashed;
+  char *old_query;
+  uint old_query_length;
+  HA_CHECK_OPT check_opt;
+  DBUG_ENTER("ha_maria::check_and_repair");
+
+  check_opt.init();
+  check_opt.flags= T_MEDIUM | T_AUTO_REPAIR;
+  // Don't use quick if deleted rows
+  if (!file->state->del && (maria_recover_options & HA_RECOVER_QUICK))
+    check_opt.flags |= T_QUICK;
+  sql_print_warning("Checking table:   '%s'", table->s->path.str);
+
+  old_query= thd->query;
+  old_query_length= thd->query_length;
+  pthread_mutex_lock(&LOCK_thread_count);
+  thd->query= table->s->table_name.str;
+  thd->query_length= table->s->table_name.length;
+  pthread_mutex_unlock(&LOCK_thread_count);
+
+  if ((marked_crashed= maria_is_crashed(file)) || check(thd, &check_opt))
+  {
+    sql_print_warning("Recovering table: '%s'", table->s->path.str);
+    check_opt.flags=
+      ((maria_recover_options & HA_RECOVER_BACKUP ? T_BACKUP_DATA : 0) |
+       (marked_crashed ? 0 : T_QUICK) |
+       (maria_recover_options & HA_RECOVER_FORCE ? 0 : T_SAFE_REPAIR) |
+       T_AUTO_REPAIR);
+    if (repair(thd, &check_opt))
+      error= 1;
+  }
+  pthread_mutex_lock(&LOCK_thread_count);
+  thd->query= old_query;
+  thd->query_length= old_query_length;
+  pthread_mutex_unlock(&LOCK_thread_count);
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Tell if the table must be treated as crashed: either STATE_CRASHED is
+  set, or locking is disabled and the open count is not zero.
+*/
+
+bool ha_maria::is_crashed() const
+{
+  return (file->s->state.changed & STATE_CRASHED ||
+          (my_disable_locking && file->s->state.open_count));
+}
+
+
+/* Update a row; an auto-set-on-update timestamp column is set first */
+
+int ha_maria::update_row(const uchar * old_data, uchar * new_data)
+{
+  ha_statistic_increment(&SSV::ha_update_count);
+  if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
+    table->timestamp_field->set_time();
+  return maria_update(file, old_data, new_data);
+}
+
+
+/* Delete the row given in 'buf' */
+
+int ha_maria::delete_row(const uchar * buf)
+{
+  ha_statistic_increment(&SSV::ha_delete_count);
+  return maria_delete(file, buf);
+}
+
+
+/*
+  Index read methods.
+
+  Each of them increments its statistics counter, calls the matching
+  maria_r*() function and sets table->status to STATUS_NOT_FOUND if that
+  call returned an error, to 0 otherwise. The error is returned as is.
+*/
+
+int ha_maria::index_read(uchar * buf, const uchar * key,
+                         key_part_map keypart_map,
+                         enum ha_rkey_function find_flag)
+{
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_key_count);
+  int error= maria_rkey(file, buf, active_index, key, keypart_map, find_flag);
+  table->status= error ? STATUS_NOT_FOUND : 0;
+  return error;
+}
+
+
+/* As index_read(), but on the given index instead of active_index */
+
+int ha_maria::index_read_idx(uchar * buf, uint index, const uchar * key,
+                             key_part_map keypart_map,
+                             enum ha_rkey_function find_flag)
+{
+  ha_statistic_increment(&SSV::ha_read_key_count);
+  int error= maria_rkey(file, buf, index, key, keypart_map, find_flag);
+  table->status= error ? STATUS_NOT_FOUND : 0;
+  return error;
+}
+
+
+/* As index_read(), always using HA_READ_PREFIX_LAST as search mode */
+
+int ha_maria::index_read_last(uchar * buf, const uchar * key,
+                              key_part_map keypart_map)
+{
+  DBUG_ENTER("ha_maria::index_read_last");
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_key_count);
+  int error= maria_rkey(file, buf, active_index, key, keypart_map,
+                        HA_READ_PREFIX_LAST);
+  table->status= error ? STATUS_NOT_FOUND : 0;
+  DBUG_RETURN(error);
+}
+
+
+int ha_maria::index_next(uchar * buf)
+{
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_next_count);
+  int error= maria_rnext(file, buf, active_index);
+  table->status= error ? STATUS_NOT_FOUND : 0;
+  return error;
+}
+
+
+int ha_maria::index_prev(uchar * buf)
+{
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_prev_count);
+  int error= maria_rprev(file, buf, active_index);
+  table->status= error ? STATUS_NOT_FOUND : 0;
+  return error;
+}
+
+
+int ha_maria::index_first(uchar * buf)
+{
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_first_count);
+  int error= maria_rfirst(file, buf, active_index);
+  table->status= error ? STATUS_NOT_FOUND : 0;
+  return error;
+}
+
+
+int ha_maria::index_last(uchar * buf)
+{
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_last_count);
+  int error= maria_rlast(file, buf, active_index);
+  table->status= error ? STATUS_NOT_FOUND : 0;
+  return error;
+}
+
+
+/* 'key' and 'length' are not used; maria_rnext_same() is given only 'buf' */
+
+int ha_maria::index_next_same(uchar * buf,
+                              const uchar *key __attribute__ ((unused)),
+                              uint length __attribute__ ((unused)))
+{
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_next_count);
+  int error= maria_rnext_same(file, buf);
+  table->status= error ? STATUS_NOT_FOUND : 0;
+  return error;
+}
+
+
+/* Start a table scan if 'scan' is set; otherwise only reset the handler */
+
+int ha_maria::rnd_init(bool scan)
+{
+  if (scan)
+    return maria_scan_init(file);
+  return maria_reset(file);                     // Free buffers
+}
+
+
+int ha_maria::rnd_end()
+{
+  /* Safe to call even if we don't have started a scan */
+  maria_scan_end(file);
+  return 0;
+}
+
+
+/* Read the next row of the scan; sets table->status like the index reads */
+
+int ha_maria::rnd_next(uchar *buf)
+{
+  ha_statistic_increment(&SSV::ha_read_rnd_next_count);
+  int error= maria_scan(file, buf);
+  table->status= error ? STATUS_NOT_FOUND : 0;
+  return error;
+}
+
+
+int ha_maria::restart_rnd_next(uchar *buf, uchar *pos)
+{
+  return rnd_pos(buf, pos);
+}
+
+
+/* Read the row at the position stored in 'pos' (ref_length bytes) */
+
+int ha_maria::rnd_pos(uchar * buf, uchar *pos)
+{
+  ha_statistic_increment(&SSV::ha_read_rnd_count);
+  int error= maria_rrnd(file, buf, my_get_ptr(pos, ref_length));
+  table->status= error ? STATUS_NOT_FOUND : 0;
+  return error;
+}
+
+
+/* Store the position reported by maria_position() in 'ref' */
+
+void ha_maria::position(const uchar * record)
+{
+  my_off_t row_position= maria_position(file);
+  my_store_ptr(ref, ref_length, row_position);
+}
+
+
+/*
+  Copy table information from the engine to the handler and the table
+  share.
+
+  SYNOPSIS
+    info()
+    flag   HA_STATUS_VARIABLE  update row counts, file lengths, check time
+           HA_STATUS_CONST     update limits, create time, ref_length, key
+                               information and symlinked file names
+           HA_STATUS_ERRKEY    update errkey and dup_ref
+
+  RETURN
+    0  always
+*/
+
+int ha_maria::info(uint flag)
+{
+  MARIA_INFO maria_info;
+  char name_buff[FN_REFLEN];
+
+  (void) maria_status(file, &maria_info, flag);
+  if (flag & HA_STATUS_VARIABLE)
+  {
+    stats.records= maria_info.records;
+    stats.deleted= maria_info.deleted;
+    stats.data_file_length= maria_info.data_file_length;
+    stats.index_file_length= maria_info.index_file_length;
+    stats.delete_length= maria_info.delete_length;
+    stats.check_time= maria_info.check_time;
+    stats.mean_rec_length= maria_info.mean_reclength;
+  }
+  if (flag & HA_STATUS_CONST)
+  {
+    TABLE_SHARE *share= table->s;
+    stats.max_data_file_length= maria_info.max_data_file_length;
+    stats.max_index_file_length= maria_info.max_index_file_length;
+    stats.create_time= maria_info.create_time;
+    ref_length= maria_info.reflength;
+    share->db_options_in_use= maria_info.options;
+    stats.block_size= maria_block_size;
+
+    /* Update share */
+    if (share->tmp_table == NO_TMP_TABLE)
+      pthread_mutex_lock(&share->mutex);
+    share->keys_in_use.set_prefix(share->keys);
+    share->keys_in_use.intersect_extended(maria_info.key_map);
+    share->keys_for_keyread.intersect(share->keys_in_use);
+    share->db_record_offset= maria_info.record_offset;
+    /*
+      One rec_per_key element is copied per key part, so the length has to
+      be computed from the element size, not from the size of the pointer
+      member.
+    */
+    if (share->key_parts)
+      memcpy((char*) table->key_info[0].rec_per_key,
+             (char*) maria_info.rec_per_key,
+             sizeof(table->key_info[0].rec_per_key[0]) * share->key_parts);
+    if (share->tmp_table == NO_TMP_TABLE)
+      pthread_mutex_unlock(&share->mutex);
+
+    /*
+      Set data_file_name and index_file_name to point at the symlink value
+      if table is symlinked (Ie;  Real name is not same as generated name)
+    */
+    data_file_name= index_file_name= 0;
+    fn_format(name_buff, file->s->open_file_name, "", MARIA_NAME_DEXT,
+              MY_APPEND_EXT | MY_UNPACK_FILENAME);
+    if (strcmp(name_buff, maria_info.data_file_name))
+      data_file_name=maria_info.data_file_name;
+    fn_format(name_buff, file->s->open_file_name, "", MARIA_NAME_IEXT,
+              MY_APPEND_EXT | MY_UNPACK_FILENAME);
+    if (strcmp(name_buff, maria_info.index_file_name))
+      index_file_name=maria_info.index_file_name;
+  }
+  if (flag & HA_STATUS_ERRKEY)
+  {
+    errkey= maria_info.errkey;
+    my_store_ptr(dup_ref, ref_length, maria_info.dup_key_pos);
+  }
+  /* Faster to always update, than to do it based on flag */
+  stats.update_time= maria_info.update_time;
+  stats.auto_increment_value= maria_info.auto_increment;
+
+  return 0;
+}
+
+
+/* Pass 'operation' to the engine; HA_EXTRA_KEYREAD is ignored in safe mode */
+
+int ha_maria::extra(enum ha_extra_function operation)
+{
+  if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_KEYREAD)
+    return 0;
+  return maria_extra(file, operation, 0);
+}
+
+int ha_maria::reset(void)
+{
+  return maria_reset(file);
+}
+
+/* To be used with WRITE_CACHE and EXTRA_CACHE */
+
+int ha_maria::extra_opt(enum ha_extra_function operation, ulong cache_size)
+{
+  if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_WRITE_CACHE)
+    return 0;
+  return maria_extra(file, operation, (void*) &cache_size);
+}
+
+
+int ha_maria::delete_all_rows()
+{
+  return maria_delete_all_rows(file);
+}
+
+
+int ha_maria::delete_table(const char *name)
+{
+  return maria_delete_table(name);
+}
+
+#define THD_TRN (*(TRN **)thd_ha_data(thd, maria_hton))
+
+int ha_maria::external_lock(THD *thd, int lock_type)
+{
+  TRN *trn= THD_TRN;
+  DBUG_ENTER("ha_maria::external_lock");
+  /*
+    We don't test now_transactional because it may vary between lock/unlock
+    and thus confuse our reference counting.
+ It is critical to skip non-transactional tables: user-visible temporary + tables get an external_lock() when read/written for the first time, but no + corresponding unlock (they just stay locked and are later dropped while + locked); if a tmp table was transactional, "SELECT FROM non_tmp, tmp" + would never commit as its "locked_tables" count would stay 1. + When Maria has has_transactions()==TRUE, open_temporary_table() + (sql_base.cc) will use TRANSACTIONAL_TMP_TABLE and thus the + external_lock(F_UNLCK) will happen and we can then allow the user to + create transactional temporary tables. + */ + if (!file->s->base.born_transactional) + goto skip_transaction; + if (lock_type != F_UNLCK) + { + if (!thd->transaction.on) + { + /* + No need to log REDOs/UNDOs. If this is an internal temporary table + which will be renamed to a permanent table (like in ALTER TABLE), + the rename happens after unlocking so will be durable (and the table + will get its create_rename_lsn). + Note: if we wanted to enable users to have an old backup and apply + tons of archived logs to roll-forward, we could then not disable + REDOs/UNDOs in this case. 
+ */ + DBUG_PRINT("info", ("Disabling logging for table")); + _ma_tmp_disable_logging_for_table(file->s); + } + if (!trn) /* no transaction yet - open it now */ + { + trn= trnman_new_trn(& thd->mysys_var->mutex, + & thd->mysys_var->suspend, + thd->thread_stack + STACK_DIRECTION * + (my_thread_stack_size - STACK_MIN_SIZE)); + if (unlikely(!trn)) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + + DBUG_PRINT("info", ("THD_TRN set to 0x%lx", (ulong)trn)); + THD_TRN= trn; + if (thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) + trans_register_ha(thd, TRUE, maria_hton); + } + this->file->trn= trn; + if (!trnman_increment_locked_tables(trn)) + { + trans_register_ha(thd, FALSE, maria_hton); + trnman_new_statement(trn); + } + } + else + { + _ma_reenable_logging_for_table(file->s); + this->file->trn= 0; /* TODO: remove it also in commit and rollback */ + if (trn && trnman_has_locked_tables(trn)) + { + if (!trnman_decrement_locked_tables(trn)) + { + /* autocommit ? rollback a transaction */ +#ifdef MARIA_CANNOT_ROLLBACK + if (ma_commit(trn)) + DBUG_RETURN(1); + THD_TRN= 0; +#else + if (!(thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) + { + trnman_rollback_trn(trn); + DBUG_PRINT("info", ("THD_TRN set to 0x0")); + THD_TRN= 0; + } +#endif + } + } + } +skip_transaction: + DBUG_RETURN(maria_lock_database(file, !table->s->tmp_table ? + lock_type : ((lock_type == F_UNLCK) ? + F_UNLCK : F_EXTRA_LCK))); +} + +int ha_maria::start_stmt(THD *thd, thr_lock_type lock_type) +{ + TRN *trn= THD_TRN; + if (file->s->base.born_transactional) + { + DBUG_ASSERT(trn); // this may be called only after external_lock() + DBUG_ASSERT(trnman_has_locked_tables(trn)); + DBUG_ASSERT(lock_type != F_UNLCK); + /* + As external_lock() was already called, don't increment locked_tables. + Note that we call the function below possibly several times when + statement starts (once per table). This is ok as long as that function + does cheap operations. 
Otherwise, we will need to do it only on first + call to start_stmt(). + */ + trnman_new_statement(trn); + } + return 0; +} + +THR_LOCK_DATA **ha_maria::store_lock(THD *thd, + THR_LOCK_DATA **to, + enum thr_lock_type lock_type) +{ + if (lock_type != TL_IGNORE && file->lock.type == TL_UNLOCK) + file->lock.type= lock_type; + *to++= &file->lock; + return to; +} + + +void ha_maria::update_create_info(HA_CREATE_INFO *create_info) +{ + ha_maria::info(HA_STATUS_AUTO | HA_STATUS_CONST); + if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) + { + create_info->auto_increment_value= stats.auto_increment_value; + } + create_info->data_file_name= data_file_name; + create_info->index_file_name= index_file_name; +} + + +enum row_type ha_maria::get_row_type() const +{ + switch (file->s->data_file_type) { + case STATIC_RECORD: return ROW_TYPE_FIXED; + case DYNAMIC_RECORD: return ROW_TYPE_DYNAMIC; + case BLOCK_RECORD: return ROW_TYPE_PAGE; + case COMPRESSED_RECORD: return ROW_TYPE_COMPRESSED; + default: return ROW_TYPE_NOT_USED; + } +} + + +static enum data_file_type maria_row_type(HA_CREATE_INFO *info) +{ + if (info->transactional == HA_CHOICE_YES) + return BLOCK_RECORD; + switch (info->row_type) { + case ROW_TYPE_FIXED: return STATIC_RECORD; + case ROW_TYPE_DYNAMIC: return DYNAMIC_RECORD; + default: return BLOCK_RECORD; + } +} + + +int ha_maria::create(const char *name, register TABLE *table_arg, + HA_CREATE_INFO *ha_create_info) +{ + int error; + uint create_flags= 0, records, i; + char buff[FN_REFLEN]; + MARIA_KEYDEF *keydef; + MARIA_COLUMNDEF *recinfo; + MARIA_CREATE_INFO create_info; + TABLE_SHARE *share= table_arg->s; + uint options= share->db_options_in_use; + enum data_file_type row_type; + DBUG_ENTER("ha_maria::create"); + + for (i= 0; i < share->keys; i++) + { + if (table_arg->key_info[i].flags & HA_USES_PARSER) + { + create_flags|= HA_CREATE_RELIES_ON_SQL_LAYER; + break; + } + } + row_type= maria_row_type(ha_create_info); + if ((error= table2maria(table_arg, &keydef, 
&recinfo, &records))) + DBUG_RETURN(error); /* purecov: inspected */ + bzero((char*) &create_info, sizeof(create_info)); + create_info.max_rows= share->max_rows; + create_info.reloc_rows= share->min_rows; + create_info.with_auto_increment= share->next_number_key_offset == 0; + create_info.auto_increment= (ha_create_info->auto_increment_value ? + ha_create_info->auto_increment_value -1 : + (ulonglong) 0); + create_info.data_file_length= ((ulonglong) share->max_rows * + share->avg_row_length); + create_info.data_file_name= ha_create_info->data_file_name; + create_info.index_file_name= ha_create_info->index_file_name; +#ifdef ASK_MONTY + /** + @todo ASK_MONTY + Where "transactional" in the frm and in the engine can go out of sync. + Don't we want to do, after the setting, this test: + if (!create_info.transactional && + ha_create_info->transactional == HA_CHOICE_YES) + error; + ? + Why fool the user? + */ +#endif + create_info.transactional= (row_type == BLOCK_RECORD && + ha_create_info->transactional != HA_CHOICE_NO); + + if (ha_create_info->options & HA_LEX_CREATE_TMP_TABLE) + create_flags|= HA_CREATE_TMP_TABLE; + if (options & HA_OPTION_PACK_RECORD) + create_flags|= HA_PACK_RECORD; + if (options & HA_OPTION_CHECKSUM) + create_flags|= HA_CREATE_CHECKSUM; + if (options & HA_OPTION_DELAY_KEY_WRITE) + create_flags|= HA_CREATE_DELAY_KEY_WRITE; + + /* TODO: Check that the following fn_format is really needed */ + error= + maria_create(fn_format(buff, name, "", "", + MY_UNPACK_FILENAME | MY_APPEND_EXT), + row_type, share->keys, keydef, + records, recinfo, + 0, (MARIA_UNIQUEDEF *) 0, + &create_info, create_flags); + + my_free((uchar*) recinfo, MYF(0)); + DBUG_RETURN(error); +} + + +int ha_maria::rename_table(const char *from, const char *to) +{ + return maria_rename(from, to); +} + + +void ha_maria::get_auto_increment(ulonglong offset, ulonglong increment, + ulonglong nb_desired_values, + ulonglong *first_value, + ulonglong *nb_reserved_values) +{ + ulonglong nr; + int 
error; + uchar key[HA_MAX_KEY_LENGTH]; + + if (!table->s->next_number_key_offset) + { // Autoincrement at key-start + ha_maria::info(HA_STATUS_AUTO); + *first_value= stats.auto_increment_value; + /* Maria has only table-level lock for now, so reserves to +inf */ + *nb_reserved_values= ULONGLONG_MAX; + return; + } + + /* it's safe to call the following if bulk_insert isn't on */ + maria_flush_bulk_insert(file, table->s->next_number_index); + + (void) extra(HA_EXTRA_KEYREAD); + key_copy(key, table->record[0], + table->key_info + table->s->next_number_index, + table->s->next_number_key_offset); + error= maria_rkey(file, table->record[1], (int) table->s->next_number_index, + key, make_prev_keypart_map(table->s->next_number_keypart), + HA_READ_PREFIX_LAST); + if (error) + nr= 1; + else + { + /* Get data from record[1] */ + nr= ((ulonglong) table->next_number_field-> + val_int_offset(table->s->rec_buff_length) + 1); + } + extra(HA_EXTRA_NO_KEYREAD); + *first_value= nr; + /* + MySQL needs to call us for next row: assume we are inserting ("a",null) + here, we return 3, and next this statement will want to insert ("b",null): + there is no reason why ("b",3+1) would be the good row to insert: maybe it + already exists, maybe 3+1 is too large... + */ + *nb_reserved_values= 1; +} + + +/* + Find out how many rows there is in the given range + + SYNOPSIS + records_in_range() + inx Index to use + min_key Start of range. Null pointer if from first key + max_key End of range. Null pointer if to last key + + NOTES + min_key.flag can have one of the following values: + HA_READ_KEY_EXACT Include the key in the range + HA_READ_AFTER_KEY Don't include key in range + + max_key.flag can have one of the following values: + HA_READ_BEFORE_KEY Don't include key in range + HA_READ_AFTER_KEY Include all 'end_key' values in the range + + RETURN + HA_POS_ERROR Something is wrong with the index tree. 
+ 0 There is no matching keys in the given range + number > 0 There is approximately 'number' matching rows in + the range. +*/ + +ha_rows ha_maria::records_in_range(uint inx, key_range *min_key, + key_range *max_key) +{ + return (ha_rows) maria_records_in_range(file, (int) inx, min_key, max_key); +} + + +int ha_maria::ft_read(uchar * buf) +{ + int error; + + if (!ft_handler) + return -1; + + thread_safe_increment(table->in_use->status_var.ha_read_next_count, + &LOCK_status); // why ? + + error= ft_handler->please->read_next(ft_handler, (char*) buf); + + table->status= error ? STATUS_NOT_FOUND : 0; + return error; +} + + +uint ha_maria::checksum() const +{ + return (uint) file->state->checksum; +} + + +bool ha_maria::check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes) +{ + uint options= table->s->db_options_in_use; + + if (info->auto_increment_value != stats.auto_increment_value || + info->data_file_name != data_file_name || + info->index_file_name != index_file_name || + maria_row_type(info) != data_file_type || + table_changes == IS_EQUAL_NO || + table_changes & IS_EQUAL_PACK_LENGTH) // Not implemented yet + return COMPATIBLE_DATA_NO; + + if ((options & (HA_OPTION_PACK_RECORD | HA_OPTION_CHECKSUM | + HA_OPTION_DELAY_KEY_WRITE)) != + (info->table_options & (HA_OPTION_PACK_RECORD | HA_OPTION_CHECKSUM | + HA_OPTION_DELAY_KEY_WRITE))) + return COMPATIBLE_DATA_NO; + return COMPATIBLE_DATA_YES; +} + + +static int maria_hton_panic(handlerton *hton, ha_panic_function flag) +{ + ma_checkpoint_execute(CHECKPOINT_FULL, FALSE); /* can't catch error */ + return maria_panic(flag); +} + + +static int maria_commit(handlerton *hton __attribute__ ((unused)), + THD *thd, bool all) +{ + TRN *trn= THD_TRN; + DBUG_ENTER("maria_commit"); + trnman_reset_locked_tables(trn); + /* statement or transaction ? 
*/ + if ((thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && !all) + DBUG_RETURN(0); // end of statement + DBUG_PRINT("info", ("THD_TRN set to 0x0")); + THD_TRN= 0; + DBUG_RETURN(ma_commit(trn)); // end of transaction +} + + +static int maria_rollback(handlerton *hton __attribute__ ((unused)), + THD *thd, bool all) +{ + TRN *trn= THD_TRN; + DBUG_ENTER("maria_rollback"); + trnman_reset_locked_tables(trn); + /* statement or transaction ? */ + if ((thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && !all) + { + trnman_rollback_statement(trn); + DBUG_RETURN(0); // end of statement + } + DBUG_PRINT("info", ("THD_TRN set to 0x0")); + THD_TRN= 0; + DBUG_RETURN(trnman_rollback_trn(trn) ? + HA_ERR_OUT_OF_MEM : 0); // end of transaction +} + + +static int ha_maria_init(void *p) +{ + int res; + maria_hton= (handlerton *)p; + maria_hton->state= SHOW_OPTION_YES; + maria_hton->db_type= DB_TYPE_MARIA; + maria_hton->create= maria_create_handler; + maria_hton->panic= maria_hton_panic; + maria_hton->commit= maria_commit; + maria_hton->rollback= maria_rollback; + /* TODO: decide if we support Maria being used for log tables */ + maria_hton->flags= HTON_CAN_RECREATE | HTON_SUPPORT_LOG_TABLES; + bzero(maria_log_pagecache, sizeof(*maria_log_pagecache)); + maria_data_root= mysql_real_data_home; + res= maria_init() || ma_control_file_create_or_open() || + (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE) == 0) || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + MYSQL_VERSION_ID, server_id, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS) || + maria_recover() || + ma_checkpoint_init(FALSE) || + /* One checkpoint after Recovery */ + ma_checkpoint_execute(CHECKPOINT_FULL, FALSE); + maria_multi_threaded= TRUE; + return res; +} + + +struct st_mysql_storage_engine maria_storage_engine= +{ MYSQL_HANDLERTON_INTERFACE_VERSION }; + +mysql_declare_plugin(maria) +{ + MYSQL_STORAGE_ENGINE_PLUGIN, + &maria_storage_engine, + "MARIA", + "MySQL 
AB", + "Traditional transactional MySQL tables", + PLUGIN_LICENSE_GPL, + ha_maria_init, /* Plugin Init */ + NULL, /* Plugin Deinit */ + 0x0100, /* 1.0 */ + NULL, /* status variables */ + NULL, /* system variables */ + NULL /* config options */ +} +mysql_declare_plugin_end; diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h new file mode 100644 index 00000000000..7675778ab5b --- /dev/null +++ b/storage/maria/ha_maria.h @@ -0,0 +1,151 @@ +/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + +#ifdef USE_PRAGMA_INTERFACE +#pragma interface /* gcc class implementation */ +#endif + +/* class for the maria handler */ + +#include <maria.h> + +#define HA_RECOVER_NONE 0 /* No automatic recover */ +#define HA_RECOVER_DEFAULT 1 /* Automatic recover active */ +#define HA_RECOVER_BACKUP 2 /* Make a backupfile on recover */ +#define HA_RECOVER_FORCE 4 /* Recover even if we lose rows */ +#define HA_RECOVER_QUICK 8 /* Don't check rows in data file */ + +extern ulong maria_sort_buffer_size; +extern TYPELIB maria_recover_typelib; +extern ulong maria_recover_options; + +class ha_maria :public handler +{ + MARIA_HA *file; + ulonglong int_table_flags; + char *data_file_name, *index_file_name; + enum data_file_type data_file_type; + bool can_enable_indexes; + int repair(THD * thd, HA_CHECK &param, bool
optimize); + +public: + ha_maria(handlerton *hton, TABLE_SHARE * table_arg); + ~ha_maria() {} + handler *clone(MEM_ROOT *mem_root); + const char *table_type() const + { return "MARIA"; } + const char *index_type(uint key_number); + const char **bas_ext() const; + ulonglong table_flags() const + { return int_table_flags; } + ulong index_flags(uint inx, uint part, bool all_parts) const + { + return ((table_share->key_info[inx].algorithm == HA_KEY_ALG_FULLTEXT) ? + 0 : HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE | + HA_READ_ORDER | HA_KEYREAD_ONLY); + } + uint max_supported_keys() const + { return MARIA_MAX_KEY; } + uint max_supported_key_length() const; + uint max_supported_key_part_length() const + { return max_supported_key_length(); } + enum row_type get_row_type() const; + uint checksum() const; + virtual double scan_time(); + + virtual bool check_if_locking_is_allowed(uint sql_command, + ulong type, TABLE * table, + uint count, uint current, + uint *system_count, + bool called_by_logger_thread); + int open(const char *name, int mode, uint test_if_locked); + int close(void); + int write_row(uchar * buf); + int update_row(const uchar * old_data, uchar * new_data); + int delete_row(const uchar * buf); + int index_read(uchar * buf, const uchar * key, key_part_map keypart_map, + enum ha_rkey_function find_flag); + int index_read_idx(uchar * buf, uint idx, const uchar * key, + key_part_map keypart_map, + enum ha_rkey_function find_flag); + int index_read_last(uchar * buf, const uchar * key, + key_part_map keypart_map); + int index_next(uchar * buf); + int index_prev(uchar * buf); + int index_first(uchar * buf); + int index_last(uchar * buf); + int index_next_same(uchar * buf, const uchar * key, uint keylen); + int ft_init() + { + if (!ft_handler) + return 1; + ft_handler->please->reinit_search(ft_handler); + return 0; + } + FT_INFO *ft_init_ext(uint flags, uint inx, String * key) + { + return maria_ft_init_search(flags, file, inx, + (uchar *) key->ptr(), 
key->length(), + key->charset(), table->record[0]); + } + int ft_read(uchar * buf); + int rnd_init(bool scan); + int rnd_end(void); + int rnd_next(uchar * buf); + int rnd_pos(uchar * buf, uchar * pos); + int restart_rnd_next(uchar * buf, uchar * pos); + void position(const uchar * record); + int info(uint); + int extra(enum ha_extra_function operation); + int extra_opt(enum ha_extra_function operation, ulong cache_size); + int reset(void); + int external_lock(THD * thd, int lock_type); + int start_stmt(THD *thd, thr_lock_type lock_type); + int delete_all_rows(void); + int disable_indexes(uint mode); + int enable_indexes(uint mode); + int indexes_are_disabled(void); + void start_bulk_insert(ha_rows rows); + int end_bulk_insert(); + ha_rows records_in_range(uint inx, key_range * min_key, key_range * max_key); + void update_create_info(HA_CREATE_INFO * create_info); + int create(const char *name, TABLE * form, HA_CREATE_INFO * create_info); + THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to, + enum thr_lock_type lock_type); + virtual void get_auto_increment(ulonglong offset, ulonglong increment, + ulonglong nb_desired_values, + ulonglong *first_value, + ulonglong *nb_reserved_values); + int rename_table(const char *from, const char *to); + int delete_table(const char *name); + int check(THD * thd, HA_CHECK_OPT * check_opt); + int analyze(THD * thd, HA_CHECK_OPT * check_opt); + int repair(THD * thd, HA_CHECK_OPT * check_opt); + bool check_and_repair(THD * thd); + bool is_crashed() const; + bool auto_repair() const + { return maria_recover_options != 0; } + int optimize(THD * thd, HA_CHECK_OPT * check_opt); + int restore(THD * thd, HA_CHECK_OPT * check_opt); + int backup(THD * thd, HA_CHECK_OPT * check_opt); + int assign_to_keycache(THD * thd, HA_CHECK_OPT * check_opt); + int preload_keys(THD * thd, HA_CHECK_OPT * check_opt); + bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes); +#ifdef HAVE_REPLICATION + int dump(THD * thd, int fd); + 
int net_read_dump(NET * net); +#endif +}; diff --git a/storage/maria/lockman.c b/storage/maria/lockman.c new file mode 100644 index 00000000000..8316d70bb29 --- /dev/null +++ b/storage/maria/lockman.c @@ -0,0 +1,786 @@ +/* QQ: TODO - allocate everything from dynarrays !!! (benchmark) */ +/* QQ: TODO instant duration locks */ +/* QQ: #warning automatically place S instead of LS if possible */ + +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Generic Lock Manager + + Lock manager handles locks on "resources", a resource must be uniquely + identified by a 64-bit number. Lock manager itself does not imply + anything about the nature of a resource - it can be a row, a table, a + database, or just anything. + + Locks belong to "lock owners". A Lock owner is uniquely identified by a + 16-bit number. A function loid2lo must be provided by the application + that takes such a number as an argument and returns a LOCK_OWNER + structure. + + Lock levels are completely defined by three tables. Lock compatibility + matrix specifies which locks can be held at the same time on a resource. + Lock combining matrix specifies what lock level has the same behaviour as + a pair of two locks of given levels. 
getlock_result matrix simplifies + intention locking and lock escalation for an application, basically it + defines which locks are intention locks and which locks are "loose" + locks. It is only used to provide better diagnostics for the + application, lock manager itself does not differentiate between normal, + intention, and loose locks. + + Internally lock manager is based on a lock-free hash, see lf_hash.c for + details. All locks are stored in a hash, with a resource id as a search + key, so all locks for the same resource will be considered collisions and + will be put in one (lock-free) linked list. The main lock-handling + logic is in the inner loop that searches for a lock in such a linked + list - lockfind(). + + This works as follows. Locks generally are added to the end of the list + (with one exception, see below). When scanning the list it is always + possible to determine what locks are granted (active) and what locks are + waiting - first lock is obviously active, the second is active if it's + compatible with the first, and so on, a lock is active if it's compatible + with all previous locks and all locks before it are also active. + To calculate the "compatible with all previous locks" all locks are + accumulated in prev_lock variable using lock_combining_matrix. + + Lock upgrades: when a thread that has a lock on a given resource, + requests a new lock on the same resource and the old lock is not enough + to satisfy new lock requirements (which is defined by + lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock is + placed in the list. Depending on other locks it is immediately active or + it will wait for other locks. Here's an exception to "locks are added + to the end" rule - upgraded locks are added after the last active lock + but before all waiting locks. Old lock (the one we upgraded from) is + not removed from the list, indeed it may be needed if the new lock was + in a savepoint that gets rolled back.
So old lock is marked as "ignored" + (IGNORE_ME flag). New lock gets an UPGRADED flag. + + Loose locks add an important exception to the above. Loose locks do not + always commute with other locks. In the list IX-LS both locks are active, + while in the LS-IX list only the first lock is active. This creates a + problem in lock upgrades. If the list was IX-LS and the owner of the + first lock wants to place LS lock (which can be immediately granted), the + IX lock is upgraded to LSIX and the list becomes IX-LS-LSIX, which, + according to the lock compatibility matrix means that the last lock is + waiting - of course it all happened because IX and LS were swapped and + they don't commute. To work around this there's ACTIVE flag which is set + in every lock that never waited (was placed active), and this flag + overrides "compatible with all previous locks" rule. + + When a lock is placed to the end of the list it's either compatible with + all locks and all locks are active - new lock becomes active at once, or + it conflicts with some of the locks, in this case in the 'blocker' + variable a conflicting lock is returned and the calling thread waits on a + pthread condition in the LOCK_OWNER structure of the owner of the + conflicting lock. Or a new lock is compatible with all locks, but some + existing locks are not compatible with each other (example: request IS, + when the list is S-IX) - that is, not all locks are active. In this case a + first waiting lock is returned in the 'blocker' variable, lockman_getlock() + notices that a "blocker" does not conflict with the requested lock, and + "dereferences" it, to find the lock that it's waiting on. The calling + thread then begins to wait on the same lock. + + To better support table-row relations where one needs to lock the table + with an intention lock before locking the row, extended diagnostics is + provided.
When an intention lock (presumably on a table) is granted, + lockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row, + perhaps the thread already has a normal lock on this table), + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual), + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check + whether it's possible to lock the row, but no need to lock it - perhaps + the thread has a loose lock on this table). This is defined by + getlock_result[] table. +*/ + +#include <my_global.h> +#include <my_sys.h> +#include <my_bit.h> +#include <lf.h> +#include "lockman.h" + +/* + Lock compatibility matrix. + + It's asymmetric. Read it as "Somebody has the lock <value in the row + label>, can I set the lock <value in the column label> ?" + + ') Though you can take LS lock while somebody has S lock, it makes no + sense - it's simpler to take S lock too. + + 1 - compatible + 0 - incompatible + -1 - "impossible", so that we can assert the impossibility. +*/ +static int lock_compatibility_matrix[10][10]= +{ /* N S X IS IX SIX LS LX SLX LSIX */ + { -1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */ + { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* S */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* X */ + { -1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, /* IS */ + { -1, 0, 0, 1, 1, 0, 1, 1, 0, 1 }, /* IX */ + { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 }, /* SIX */ + { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* LS */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* LX */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* SLX */ + { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 } /* LSIX */ +}; + +/* + Lock combining matrix. + + It's symmetric. 
Read it as "what lock level L is identical to the + set of two locks A and B" + + One should never get N from it, we assert the impossibility +*/ +static enum lock_type lock_combining_matrix[10][10]= +{/* N S X IS IX SIX LS LX SLX LSIX */ + { N, S, X, IS, IX, SIX, S, SLX, SLX, SIX}, /* N */ + { S, S, X, S, SIX, SIX, S, SLX, SLX, SIX}, /* S */ + { X, X, X, X, X, X, X, X, X, X}, /* X */ + { IS, S, X, IS, IX, SIX, LS, LX, SLX, LSIX}, /* IS */ + { IX, SIX, X, IX, IX, SIX, LSIX, LX, SLX, LSIX}, /* IX */ + { SIX, SIX, X, SIX, SIX, SIX, SIX, SLX, SLX, SIX}, /* SIX */ + { LS, S, X, LS, LSIX, SIX, LS, LX, SLX, LSIX}, /* LS */ + { LX, SLX, X, LX, LX, SLX, LX, LX, SLX, LX}, /* LX */ + { SLX, SLX, X, SLX, SLX, SLX, SLX, SLX, SLX, SLX}, /* SLX */ + { LSIX, SIX, X, LSIX, LSIX, SIX, LSIX, LX, SLX, LSIX} /* LSIX */ +}; + +#define REPEAT_ONCE_MORE 0 +#define OK_TO_PLACE_THE_LOCK 1 +#define OK_TO_PLACE_THE_REQUEST 2 +#define ALREADY_HAVE_THE_LOCK 4 +#define ALREADY_HAVE_THE_REQUEST 8 +#define PLACE_NEW_DISABLE_OLD 16 +#define REQUEST_NEW_DISABLE_OLD 32 +#define RESOURCE_WAS_UNLOCKED 64 + +#define NEED_TO_WAIT (OK_TO_PLACE_THE_REQUEST | ALREADY_HAVE_THE_REQUEST |\ + REQUEST_NEW_DISABLE_OLD) +#define ALREADY_HAVE (ALREADY_HAVE_THE_LOCK | ALREADY_HAVE_THE_REQUEST) +#define LOCK_UPGRADE (PLACE_NEW_DISABLE_OLD | REQUEST_NEW_DISABLE_OLD) + + +/* + the return codes for lockman_getlock + + It's asymmetric. Read it as "I have the lock <value in the row label>, + what value should be returned for <value in the column label> ?" + + 0 means impossible combination (assert!) + + Defines below help to preserve the table structure. 
  I/L/A values are self explanatory
  x means the combination is possible (assert should not crash)
    but it cannot happen in row locks, only in table locks (S,X),
    or lock escalations (LS,LX)
*/
/*
  Short-lived one-letter aliases so that the table below stays readable;
  they are #undef'ed right after it.
  lockman_getlock() indexes the table as
  getlock_result[lock already held][lock requested] and asserts that the
  entry it reads is non-zero.
*/
#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
#define A GOT_THE_LOCK
#define x GOT_THE_LOCK
static enum lockman_getlock_result getlock_result[10][10]=
{/*    N    S   X    IS    IX  SIX LS  LX  SLX LSIX */
  {    0,   0,  0,    0,    0,  0,  0,  0,  0,  0}, /* N    */
  {    0,   x,  0,    A,    0,  0,  x,  0,  0,  0}, /* S    */
  {    0,   x,  x,    A,    A,  0,  x,  x,  0,  0}, /* X    */
  {    0,   0,  0,    I,    0,  0,  0,  0,  0,  0}, /* IS   */
  {    0,   0,  0,    I,    I,  0,  0,  0,  0,  0}, /* IX   */
  {    0,   x,  0,    A,    I,  0,  x,  0,  0,  0}, /* SIX  */
  {    0,   0,  0,    L,    0,  0,  x,  0,  0,  0}, /* LS   */
  {    0,   0,  0,    L,    L,  0,  x,  x,  0,  0}, /* LX   */
  {    0,   x,  0,    A,    L,  0,  x,  x,  0,  0}, /* SLX  */
  {    0,   0,  0,    L,    I,  0,  x,  0,  0,  0}  /* LSIX */
};
#undef I
#undef L
#undef A
#undef x

LF_REQUIRE_PINS(4);

/*
  One element of the lock list.

  'link' holds the address of the next element; its lowest bit is used as
  a "deleted" mark (see the PTR() and DELETED() macros below).
  'lonext' chains all locks of one lock owner (lockman_getlock() pushes
  new nodes onto lo->all_locks through it).
*/
typedef struct lockman_lock {
  uint64 resource;
  struct lockman_lock  *lonext;
  intptr volatile link;
  uint32 hashnr;
  /* QQ: TODO - remove hashnr from LOCK */
  uint16 loid;
  uchar lock;  /* sizeof(uchar) <= sizeof(enum) */
  uchar flags;
} LOCK;

/* Bits of LOCK::flags */
#define IGNORE_ME               1
#define UPGRADED                2
#define ACTIVE                  4

/*
  Position in the lock list as left by lockfind():
    prev          address of the link that points to 'curr'
    curr, next    current element and its successor
    blocker       element found to be in the way of the request (or 0)
    upgrade_from  lock of the same owner that the request would upgrade (or 0)
*/
typedef struct {
  intptr volatile *prev;
  LOCK *curr, *next;
  LOCK *blocker, *upgrade_from;
} CURSOR;

/* Strip the deletion mark from a link value / test the deletion mark */
#define PTR(V)      (LOCK *)((V) & (~(intptr)1))
#define DELETED(V)  ((V) & 1)

/*
  Scan the list starting at 'head' for locks on node->resource and decide
  what can be done with the lock described by 'node'.

  Elements whose link carries the deletion mark are unlinked and freed
  on the way; whenever the list is seen to change under the scan, the
  scan restarts from 'head' (label 'retry').

  RETURN
    One of ALREADY_HAVE_THE_LOCK, ALREADY_HAVE_THE_REQUEST,
    PLACE_NEW_DISABLE_OLD, REQUEST_NEW_DISABLE_OLD, RESOURCE_WAS_UNLOCKED,
    OK_TO_PLACE_THE_LOCK or OK_TO_PLACE_THE_REQUEST

  NOTE
    cursor is positioned in either case
    pins[0..3] are used, they are NOT removed on return
*/
static int lockfind(LOCK * volatile *head, LOCK *node,
                    CURSOR *cursor, LF_PINS *pins)
{
  uint32        hashnr, cur_hashnr;
  uint64        resource, cur_resource;
  intptr        link;
  my_bool       cur_active, compatible, upgrading, prev_active;
  enum lock_type lock, prev_lock, cur_lock;
  uint16        loid, cur_loid;
  int           cur_flags, flags;

  hashnr= node->hashnr;
  resource= node->resource;
  lock= node->lock;
  loid= node->loid;
  flags= node->flags;

retry:
  cursor->prev= (intptr *)head;
  prev_lock= N;
  cur_active= TRUE;
  compatible= TRUE;
  upgrading= FALSE;
  cursor->blocker= cursor->upgrade_from= 0;
  _lf_unpin(pins, 3);
  /* pin 'curr', then re-read *prev to check it still points there */
  do {
    cursor->curr= PTR(*cursor->prev);
    _lf_pin(pins, 1, cursor->curr);
  } while(*cursor->prev != (intptr)cursor->curr && LF_BACKOFF);
  for (;;)
  {
    if (!cursor->curr)
      break;
    /* same pin-and-recheck step for the successor */
    do {
      link= cursor->curr->link;
      cursor->next= PTR(link);
      _lf_pin(pins, 0, cursor->next);
    } while(link != cursor->curr->link && LF_BACKOFF);
    /* copy the fields first, then verify 'curr' is still linked from 'prev' */
    cur_hashnr= cursor->curr->hashnr;
    cur_resource= cursor->curr->resource;
    cur_lock= cursor->curr->lock;
    cur_loid= cursor->curr->loid;
    cur_flags= cursor->curr->flags;
    if (*cursor->prev != (intptr)cursor->curr)
    {
      (void)LF_BACKOFF;
      goto retry;
    }
    if (!DELETED(link))
    {
      if (cur_hashnr > hashnr ||
          (cur_hashnr == hashnr && cur_resource >= resource))
      {
        /* the list is ordered by (hashnr, resource): past our resource, stop */
        if (cur_hashnr > hashnr || cur_resource > resource)
          break;
        /* ok, we have a lock for this resource */
        DBUG_ASSERT(lock_compatibility_matrix[prev_lock][cur_lock] >= 0);
        DBUG_ASSERT(lock_compatibility_matrix[cur_lock][lock] >= 0);
        if ((cur_flags & IGNORE_ME) && ! (flags & IGNORE_ME))
        {
          DBUG_ASSERT(cur_active);
          if (cur_loid == loid)
            cursor->upgrade_from= cursor->curr;
        }
        else
        {
          prev_active= cur_active;
          if (cur_flags & ACTIVE)
            DBUG_ASSERT(prev_active == TRUE);
          else
            cur_active&= lock_compatibility_matrix[prev_lock][cur_lock];
          if (upgrading && !cur_active /*&& !(cur_flags & UPGRADED)*/)
            break;
          /* first element that turns the chain inactive is remembered */
          if (prev_active && !cur_active)
          {
            cursor->blocker= cursor->curr;
            _lf_pin(pins, 3, cursor->curr);
          }
          if (cur_loid == loid)
          {
            /* we already have a lock on this resource */
            DBUG_ASSERT(lock_combining_matrix[cur_lock][lock] != N);
            DBUG_ASSERT(!upgrading || (flags & IGNORE_ME));
            if (lock_combining_matrix[cur_lock][lock] == cur_lock)
            {
              /* new lock is compatible */
              if (cur_active)
              {
                cursor->blocker= cursor->curr;  /* loose-locks! */
                _lf_unpin(pins, 3);             /* loose-locks! */
                return ALREADY_HAVE_THE_LOCK;
              }
              else
                return ALREADY_HAVE_THE_REQUEST;
            }
            /* not compatible, upgrading */
            upgrading= TRUE;
            cursor->upgrade_from= cursor->curr;
          }
          else
          {
            if (!lock_compatibility_matrix[cur_lock][lock])
            {
              compatible= FALSE;
              cursor->blocker= cursor->curr;
              _lf_pin(pins, 3, cursor->curr);
            }
          }
          prev_lock= lock_combining_matrix[prev_lock][cur_lock];
          DBUG_ASSERT(prev_lock != N);
        }
      }
      cursor->prev= &(cursor->curr->link);
      _lf_pin(pins, 2, cursor->curr);
    }
    else
    {
      /* 'curr' is marked deleted: unlink it, or restart if the list changed */
      if (my_atomic_casptr((void **)cursor->prev,
                           (void **)&cursor->curr, cursor->next))
        _lf_alloc_free(pins, cursor->curr);
      else
      {
        (void)LF_BACKOFF;
        goto retry;
      }
    }
    cursor->curr= cursor->next;
    _lf_pin(pins, 1, cursor->curr);
  }
  /*
    either the end of lock list - no more locks for this resource,
    or upgrading and the end of active lock list
  */
  if (upgrading)
  {
    if (compatible /*&& prev_active*/)
      return PLACE_NEW_DISABLE_OLD;
    else
      return REQUEST_NEW_DISABLE_OLD;
  }
  if (cur_active && compatible)
  {
    /*
      either no locks for this resource or all are compatible.
      ok to place the lock in any case.
    */
    return prev_lock == N ? RESOURCE_WAS_UNLOCKED
                          : OK_TO_PLACE_THE_LOCK;
  }
  /* we have a lock conflict. ok to place a lock request. And wait */
  return OK_TO_PLACE_THE_REQUEST;
}

/*
  Insert 'node' at the position found by lockfind(), repeating the search
  whenever the compare-and-swap that links it in fails.

  When lockfind() reports that the lock (or request) is already there,
  nothing is inserted. On a lock upgrade the node is flagged UPGRADED, its
  lock becomes the combination of the old and new lock, and the old lock
  is flagged IGNORE_ME.

  The element that is in the way (if any) is returned in *blocker.

  NOTE
    it uses pins[0..3], on return pins 0..2 are removed, pin 3 (blocker) stays
*/
static int lockinsert(LOCK * volatile *head, LOCK *node, LF_PINS *pins,
                      LOCK **blocker)
{
  CURSOR         cursor;
  int            res;

  do
  {
    res= lockfind(head, node, &cursor, pins);
    DBUG_ASSERT(res != ALREADY_HAVE_THE_REQUEST);
    if (!(res & ALREADY_HAVE))
    {
      if (res & LOCK_UPGRADE)
      {
        node->flags|= UPGRADED;
        node->lock= lock_combining_matrix[cursor.upgrade_from->lock][node->lock];
      }
      if (!(res & NEED_TO_WAIT))
        node->flags|= ACTIVE;
      node->link= (intptr)cursor.curr;
      DBUG_ASSERT(node->link != (intptr)node);
      DBUG_ASSERT(cursor.prev != &node->link);
      if (!my_atomic_casptr((void **)cursor.prev, (void **)&cursor.curr, node))
      {
        /* somebody changed the list under us: undo ACTIVE and search again */
        res= REPEAT_ONCE_MORE;
        node->flags&= ~ACTIVE;
      }
      if (res & LOCK_UPGRADE)
        cursor.upgrade_from->flags|= IGNORE_ME;
      /*
        QQ: is this OK ? if a reader has already read upgrade_from,
        it may find it conflicting with node :(
          - see the last test from test_lockman_simple()
      */
    }

  } while (res == REPEAT_ONCE_MORE);
  _lf_unpin(pins, 0);
  _lf_unpin(pins, 1);
  _lf_unpin(pins, 2);
  /*
    note that blocker is not necessarily pinned here (when it's == curr).
    this is ok as in such a case it's either a dummy node for
    initialize_bucket() and dummy nodes don't need pinning,
    or it's a lock of the same transaction for lockman_getlock,
    and it cannot be removed by another thread
  */
  *blocker= cursor.blocker;
  return res;
}

/*
  Run lockfind() for 'node' without modifying the list and report its
  result; the blocking element is stored in *blocker when 'blocker' is
  not NULL.

  NOTE
    it uses pins[0..3], on return pins 0..2 are removed, pin 3 (blocker) stays
*/
static int lockpeek(LOCK * volatile *head, LOCK *node, LF_PINS *pins,
                    LOCK **blocker)
{
  CURSOR         cursor;
  int            res;

  res= lockfind(head, node, &cursor, pins);

  _lf_unpin(pins, 0);
  _lf_unpin(pins, 1);
  _lf_unpin(pins, 2);
  if (blocker)
    *blocker= cursor.blocker;
  return res;
}

/*
  NOTE
    it uses pins[0..3], on return all pins are removed.
+ + One _must_ have the lock (or request) to call this +*/ +static int lockdelete(LOCK * volatile *head, LOCK *node, LF_PINS *pins) +{ + CURSOR cursor; + int res; + + do + { + res= lockfind(head, node, &cursor, pins); + DBUG_ASSERT(res & ALREADY_HAVE); + + if (cursor.upgrade_from) + cursor.upgrade_from->flags&= ~IGNORE_ME; + + /* + XXX this does not work with savepoints, as old lock is left ignored. + It cannot be unignored, as would basically mean moving the lock back + in the lock chain (from upgraded). And the latter is not allowed - + because it breaks list scanning. So old ignored lock must be deleted, + new - same - lock must be installed right after the lock we're deleting, + then we can delete. Good news is - this is only required when rolling + back a savepoint. + */ + if (my_atomic_casptr((void **)&(cursor.curr->link), + (void **)&cursor.next, 1+(char *)cursor.next)) + { + if (my_atomic_casptr((void **)cursor.prev, + (void **)&cursor.curr, cursor.next)) + _lf_alloc_free(pins, cursor.curr); + else + lockfind(head, node, &cursor, pins); + } + else + { + res= REPEAT_ONCE_MORE; + if (cursor.upgrade_from) + cursor.upgrade_from->flags|= IGNORE_ME; + } + } while (res == REPEAT_ONCE_MORE); + _lf_unpin(pins, 0); + _lf_unpin(pins, 1); + _lf_unpin(pins, 2); + _lf_unpin(pins, 3); + return res; +} + +void lockman_init(LOCKMAN *lm, loid_to_lo_func *func, uint timeout) +{ + lf_alloc_init(&lm->alloc, sizeof(LOCK), offsetof(LOCK, lonext)); + lf_dynarray_init(&lm->array, sizeof(LOCK **)); + lm->size= 1; + lm->count= 0; + lm->loid_to_lo= func; + lm->lock_timeout= timeout; +} + +void lockman_destroy(LOCKMAN *lm) +{ + LOCK *el= *(LOCK **)_lf_dynarray_lvalue(&lm->array, 0); + while (el) + { + intptr next= el->link; + if (el->hashnr & 1) + lf_alloc_direct_free(&lm->alloc, el); + else + my_free((void *)el, MYF(0)); + el= (LOCK *)next; + } + lf_alloc_destroy(&lm->alloc); + lf_dynarray_destroy(&lm->array); +} + +/* TODO: optimize it */ +#define MAX_LOAD 1 + +static void 
initialize_bucket(LOCKMAN *lm, LOCK * volatile *node, + uint bucket, LF_PINS *pins) +{ + int res; + uint parent= my_clear_highest_bit(bucket); + LOCK *dummy= (LOCK *)my_malloc(sizeof(LOCK), MYF(MY_WME)); + LOCK **tmp= 0, *cur; + LOCK * volatile *el= _lf_dynarray_lvalue(&lm->array, parent); + + if (*el == NULL && bucket) + initialize_bucket(lm, el, parent, pins); + dummy->hashnr= my_reverse_bits(bucket); + dummy->loid= 0; + dummy->lock= X; /* doesn't matter, in fact */ + dummy->resource= 0; + dummy->flags= 0; + res= lockinsert(el, dummy, pins, &cur); + DBUG_ASSERT(res & (ALREADY_HAVE_THE_LOCK | RESOURCE_WAS_UNLOCKED)); + if (res & ALREADY_HAVE_THE_LOCK) + { + my_free((void *)dummy, MYF(0)); + dummy= cur; + } + my_atomic_casptr((void **)node, (void **)&tmp, dummy); +} + +static inline uint calc_hash(uint64 resource) +{ + const uchar *pos= (uchar *)&resource; + ulong nr1= 1, nr2= 4, i; + for (i= 0; i < sizeof(resource) ; i++, pos++) + { + nr1^= (ulong) ((((uint) nr1 & 63)+nr2) * ((uint)*pos)) + (nr1 << 8); + nr2+= 3; + } + return nr1 & INT_MAX32; +} + +/* + RETURN + see enum lockman_getlock_result + NOTE + uses pins[0..3], they're removed on return +*/ +enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo, + uint64 resource, + enum lock_type lock) +{ + int res; + uint csize, bucket, hashnr; + LOCK *node, * volatile *el, *blocker; + LF_PINS *pins= lo->pins; + enum lock_type old_lock; + + DBUG_ASSERT(lo->loid); + lf_rwlock_by_pins(pins); + node= (LOCK *)_lf_alloc_new(pins); + node->flags= 0; + node->lock= lock; + node->loid= lo->loid; + node->resource= resource; + hashnr= calc_hash(resource); + bucket= hashnr % lm->size; + el= _lf_dynarray_lvalue(&lm->array, bucket); + if (*el == NULL) + initialize_bucket(lm, el, bucket, pins); + node->hashnr= my_reverse_bits(hashnr) | 1; + res= lockinsert(el, node, pins, &blocker); + if (res & ALREADY_HAVE) + { + int r; + old_lock= blocker->lock; + _lf_alloc_free(pins, node); + lf_rwunlock_by_pins(pins); + r= 
getlock_result[old_lock][lock]; + DBUG_ASSERT(r); + return r; + } + /* a new value was added to the hash */ + csize= lm->size; + if ((my_atomic_add32(&lm->count, 1)+1.0) / csize > MAX_LOAD) + my_atomic_cas32(&lm->size, &csize, csize*2); + node->lonext= lo->all_locks; + lo->all_locks= node; + for ( ; res & NEED_TO_WAIT; res= lockpeek(el, node, pins, &blocker)) + { + LOCK_OWNER *wait_for_lo; + ulonglong deadline; + struct timespec timeout; + + _lf_assert_pin(pins, 3); /* blocker must be pinned here */ + wait_for_lo= lm->loid_to_lo(blocker->loid); + + /* + now, this is tricky. blocker is not necessarily a LOCK + we're waiting for. If it's compatible with what we want, + then we're waiting for a lock that blocker is waiting for + (see two places where blocker is set in lockfind) + In the latter case, let's "dereference" it + */ + if (lock_compatibility_matrix[blocker->lock][lock]) + { + blocker= wait_for_lo->all_locks; + _lf_pin(pins, 3, blocker); + if (blocker != wait_for_lo->all_locks) + continue; + wait_for_lo= wait_for_lo->waiting_for; + } + + /* + note that the blocker transaction may have ended by now, + its LOCK_OWNER and short id were reused, so 'wait_for_lo' may point + to an unrelated - albeit valid - LOCK_OWNER + */ + if (!wait_for_lo) + continue; + + lo->waiting_for= wait_for_lo; + lf_rwunlock_by_pins(pins); + + /* + We lock a mutex - it may belong to a wrong LOCK_OWNER, but it must + belong to _some_ LOCK_OWNER. It means, we can never free() a LOCK_OWNER, + if there're other active LOCK_OWNERs. + */ + /* QQ: race condition here */ + pthread_mutex_lock(wait_for_lo->mutex); + if (DELETED(blocker->link)) + { + /* + blocker transaction was ended, or a savepoint that owned + the lock was rolled back. Either way - the lock was removed + */ + pthread_mutex_unlock(wait_for_lo->mutex); + lf_rwlock_by_pins(pins); + continue; + } + + /* yuck. 
waiting */ + deadline= my_getsystime() + lm->lock_timeout * 10000; + timeout.tv_sec= deadline/10000000; + timeout.tv_nsec= (deadline % 10000000) * 100; + do + { + pthread_cond_timedwait(wait_for_lo->cond, wait_for_lo->mutex, &timeout); + } while (!DELETED(blocker->link) && my_getsystime() < deadline); + pthread_mutex_unlock(wait_for_lo->mutex); + lf_rwlock_by_pins(pins); + if (!DELETED(blocker->link)) + { + /* + timeout. + note that we _don't_ release the lock request here. + Instead we're relying on the caller to abort the transaction, + and release all locks at once - see lockman_release_locks() + */ + _lf_unpin(pins, 3); + lf_rwunlock_by_pins(pins); + return DIDNT_GET_THE_LOCK; + } + } + lo->waiting_for= 0; + _lf_assert_unpin(pins, 3); /* unpin should not be needed */ + lf_rwunlock_by_pins(pins); + return getlock_result[lock][lock]; +} + +/* + RETURN + 0 - deleted + 1 - didn't (not found) + NOTE + see lockdelete() for pin usage notes +*/ +int lockman_release_locks(LOCKMAN *lm, LOCK_OWNER *lo) +{ + LOCK * volatile *el, *node, *next; + uint bucket; + LF_PINS *pins= lo->pins; + + pthread_mutex_lock(lo->mutex); + lf_rwlock_by_pins(pins); + for (node= lo->all_locks; node; node= next) + { + next= node->lonext; + bucket= calc_hash(node->resource) % lm->size; + el= _lf_dynarray_lvalue(&lm->array, bucket); + if (*el == NULL) + initialize_bucket(lm, el, bucket, pins); + lockdelete(el, node, pins); + my_atomic_add32(&lm->count, -1); + } + lf_rwunlock_by_pins(pins); + lo->all_locks= 0; + /* now signal all waiters */ + pthread_cond_broadcast(lo->cond); + pthread_mutex_unlock(lo->mutex); + return 0; +} + +#ifdef MY_LF_EXTRA_DEBUG +static const char *lock2str[]= +{ "N", "S", "X", "IS", "IX", "SIX", "LS", "LX", "SLX", "LSIX" }; +/* + NOTE + the function below is NOT thread-safe !!! 
+*/ +void print_lockhash(LOCKMAN *lm) +{ + LOCK *el= *(LOCK **)_lf_dynarray_lvalue(&lm->array, 0); + printf("hash: size %u count %u\n", lm->size, lm->count); + while (el) + { + intptr next= el->link; + if (el->hashnr & 1) + { + printf("0x%08lx { resource %lu, loid %u, lock %s", + (long) el->hashnr, (ulong) el->resource, el->loid, + lock2str[el->lock]); + if (el->flags & IGNORE_ME) printf(" IGNORE_ME"); + if (el->flags & UPGRADED) printf(" UPGRADED"); + if (el->flags & ACTIVE) printf(" ACTIVE"); + if (DELETED(next)) printf(" ***DELETED***"); + printf("}\n"); + } + else + { + /*printf("0x%08x { dummy }\n", el->hashnr);*/ + DBUG_ASSERT(el->resource == 0 && el->loid == 0 && el->lock == X); + } + el= PTR(next); + } +} +#endif diff --git a/storage/maria/lockman.h b/storage/maria/lockman.h new file mode 100644 index 00000000000..279a5537f76 --- /dev/null +++ b/storage/maria/lockman.h @@ -0,0 +1,76 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _lockman_h +#define _lockman_h + +/* + Lock levels: + ^^^^^^^^^^^ + + N - "no lock", not a lock, used sometimes internally to simplify the code + S - Shared + X - eXclusive + IS - Intention Shared + IX - Intention eXclusive + SIX - Shared + Intention eXclusive + LS - Loose Shared + LX - Loose eXclusive + SLX - Shared + Loose eXclusive + LSIX - Loose Shared + Intention eXclusive +*/ +enum lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST }; + +struct lockman_lock; + +typedef struct st_lock_owner LOCK_OWNER; +struct st_lock_owner { + LF_PINS *pins; /* must be allocated from lockman's pinbox */ + struct lockman_lock *all_locks; /* a LIFO */ + LOCK_OWNER *waiting_for; + pthread_cond_t *cond; /* transactions waiting for this, wait on 'cond' */ + pthread_mutex_t *mutex; /* mutex is required to use 'cond' */ + uint16 loid; +}; + +typedef LOCK_OWNER *loid_to_lo_func(uint16); +typedef struct { + LF_DYNARRAY array; /* hash itself */ + LF_ALLOCATOR alloc; /* allocator for elements */ + int32 volatile size; /* size of array */ + int32 volatile count; /* number of elements in the hash */ + uint lock_timeout; + loid_to_lo_func *loid_to_lo; +} LOCKMAN; +#define DIDNT_GET_THE_LOCK 0 +enum lockman_getlock_result { + NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT, + GOT_THE_LOCK, + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE, + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE +}; + +void lockman_init(LOCKMAN *, loid_to_lo_func *, uint); +void lockman_destroy(LOCKMAN *); +enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo, + uint64 resource, + enum lock_type lock); +int lockman_release_locks(LOCKMAN *, LOCK_OWNER *); + +#ifdef EXTRA_DEBUG +void print_lockhash(LOCKMAN *lm); +#endif + +#endif diff --git a/storage/maria/ma_bitmap.c 
b/storage/maria/ma_bitmap.c new file mode 100644 index 00000000000..684f5e16ffa --- /dev/null +++ b/storage/maria/ma_bitmap.c @@ -0,0 +1,2077 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Bitmap handling (for records in block) + + The data file starts with a bitmap page, followed by as many data + pages as the bitmap can cover. After this there is a new bitmap page + and more data pages etc. + + The bitmap code assumes there is always an active bitmap page and thus + that there is at least one bitmap page in the file + + Structure of bitmap page: + + Fixed size records (to be implemented later): + + 2 bits are used to indicate: + + 0 Empty + 1 0-75 % full (at least room for 2 records) + 2 75-100 % full (at least room for one record) + 3 100 % full (no more room for records) + + Assuming 8K pages, this will allow us to map: + 8192 (bytes per page) * 4 (pages mapped per byte) * 8192 (page size)= 256M + + (For Maria this will be 7*4 * 8192 = 224K smaller because of LSN) + + Note that for fixed size rows, we can't add more columns without doing + a full reorganization of the table. The user can always force a dynamic + size row format by specifying ROW_FORMAT=dynamic. 
+ + + Dynamic size records: + + 3 bits are used to indicate + + 0 Empty page + 1 0-30 % full (at least room for 3 records) + 2 30-60 % full (at least room for 2 records) + 3 60-90 % full (at least room for one record) + 4 100 % full (no more room for records) + 5 Tail page, 0-40 % full + 6 Tail page, 40-80 % full + 7 Full tail page or full blob page + + Assuming 8K pages, this will allow us to map: + 8192 (bytes per page) * 8 bits/byte / 3 bits/page * 8192 (page size)= 170.7M + + Note that values 1-3 may be adjusted for each individual table based on + 'min record length'. Tail pages are for overflow data which can be of + any size and thus don't have to be adjusted for different tables. + If we add more columns to the table, some of the originally calculated + 'cut off' points may not be optimal, but they shouldn't be 'drastically + wrong'. + + When allocating data from the bitmap, we are trying to do it in a + 'best fit' manner. Blobs and varchar blocks are given out in large + continuous extents to allow fast access to these. Before allowing a + row to 'flow over' to other blocks, we will compact the page and use + all space on it. If there are many rows in the page, we will ensure + there are *LEFT_TO_GROW_ON_SPLIT* bytes left on the page to allow other + rows to grow. + + The bitmap format allows us to extend the row file in big chunks, if needed. + + When calculating the size for a packed row, we will calculate the following + things separately: + - Row header + null_bits + empty_bits fixed size segments etc. + - Size of all char/varchar fields + - Size of each blob field + + The bitmap handler will get all the above information and return + either one page or a set of pages to put the different parts. + + Bitmaps are read on demand in response to insert/delete/update operations. + The following bitmap pointers will be cached and stored on disk on close: + - Current insert_bitmap; When inserting new data we will first try to + fill this one.
+ - First bitmap which is not completely full. This is updated when we + free data with an update or delete. + + While flushing out bitmaps, we will cache the status of the bitmap in memory + to avoid having to read a bitmap for insert of new data that will not + be of any use + - Total empty space + - Largest number of continuous pages + + Bitmap ONLY goes to disk in the following scenarios + - The file is closed (and we flush all changes to disk) + - On checkpoint + (i.e. when we do a checkpoint, we have to ensure that all bitmaps are + put on disk even if they are not in the page cache). + - When explicitly requested (for example on backup or after recovery, + to simplify things) + + The flow of writing a row is that: + - Lock the bitmap + - Decide which data pages we will write to + - Mark them full in the bitmap page so that other threads do not try to + use the same data pages as us + - We unlock the bitmap + - Write the data pages + - Lock the bitmap + - Correct the bitmap page with the true final occupation of the data + pages (that is, we marked pages full but when we are done we realize + we didn't fill them) + - Unlock the bitmap.
+*/ + +#include "maria_def.h" +#include "ma_blockrec.h" + +/* Number of pages to store blob parts */ +#define BLOB_SEGMENT_MIN_SIZE 128 + +#define FULL_HEAD_PAGE 4 +#define FULL_TAIL_PAGE 7 + +/** all bitmap pages end with this 2-byte signature */ +uchar maria_bitmap_marker[2]= {(uchar) 'b',(uchar) 'm'}; + +static my_bool _ma_read_bitmap_page(MARIA_SHARE *share, + MARIA_FILE_BITMAP *bitmap, + ulonglong page); + + +/* Write bitmap page to key cache */ + +static inline my_bool write_changed_bitmap(MARIA_SHARE *share, + MARIA_FILE_BITMAP *bitmap) +{ + DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size); + return (pagecache_write(share->pagecache, + &bitmap->file, bitmap->page, 0, + (uchar*) bitmap->map, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, 0)); +} + +/* + Initialize bitmap variables in share + + SYNOPSIS + _ma_bitmap_init() + share Share handler + file data file handler + + NOTES + This is called the first time a file is opened. 
+ + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_bitmap_init(MARIA_SHARE *share, File file) +{ + uint aligned_bit_blocks; + uint max_page_size; + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + uint size= share->block_size; +#ifndef DBUG_OFF + /* We want to have a copy of the bitmap to be able to print differences */ + size*= 2; +#endif + + if (!(bitmap->map= (uchar*) my_malloc(size, MYF(MY_WME)))) + return 1; + + bitmap->file.file= file; + bitmap->changed= 0; + bitmap->block_size= share->block_size; + /* Size needs to be alligned on 6 */ + aligned_bit_blocks= share->block_size / 6; + bitmap->total_size= aligned_bit_blocks * 6; + /* + In each 6 bytes, we have 6*8/3 = 16 pages covered + The +1 is to add the bitmap page, as this doesn't have to be covered + */ + bitmap->pages_covered= aligned_bit_blocks * 16 + 1; + + /* Update size for bits */ + /* TODO; Make this dependent of the row size */ + max_page_size= share->block_size - PAGE_OVERHEAD_SIZE; + bitmap->sizes[0]= max_page_size; /* Empty page */ + bitmap->sizes[1]= max_page_size - max_page_size * 30 / 100; + bitmap->sizes[2]= max_page_size - max_page_size * 60 / 100; + bitmap->sizes[3]= max_page_size - max_page_size * 90 / 100; + bitmap->sizes[4]= 0; /* Full page */ + bitmap->sizes[5]= max_page_size - max_page_size * 40 / 100; + bitmap->sizes[6]= max_page_size - max_page_size * 80 / 100; + bitmap->sizes[7]= 0; + + pthread_mutex_init(&share->bitmap.bitmap_lock, MY_MUTEX_INIT_SLOW); + + /* + We can't read a page yet, as in some case we don't have an active + page cache yet. + Pretend we have a dummy, full and not changed bitmap page in memory. 
+ */ + + bitmap->page= ~(ulonglong) 0; + bitmap->used_size= bitmap->total_size; + bfill(bitmap->map, share->block_size, 255); + if (share->state.first_bitmap_with_space == ~(ulonglong) 0) + { + /* Start scanning for free space from start of file */ + share->state.first_bitmap_with_space = 0; + } + return 0; +} + + +/* + Free data allocated by _ma_bitmap_init + + SYNOPSIS + _ma_bitmap_end() + share Share handler +*/ + +my_bool _ma_bitmap_end(MARIA_SHARE *share) +{ + my_bool res= _ma_flush_bitmap(share); + pthread_mutex_destroy(&share->bitmap.bitmap_lock); + my_free((uchar*) share->bitmap.map, MYF(MY_ALLOW_ZERO_PTR)); + share->bitmap.map= 0; + return res; +} + + +/* + Send updated bitmap to the page cache + + SYNOPSIS + _ma_flush_bitmap() + share Share handler + + NOTES + In the future, _ma_flush_bitmap() will be called to flush changes don't + by this thread (ie, checking the changed flag is ok). The reason we + check it again in the mutex is that if someone else did a flush at the + same time, we don't have to do the write. + + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_flush_bitmap(MARIA_SHARE *share) +{ + my_bool res= 0; + DBUG_ENTER("_ma_flush_bitmap"); + if (share->bitmap.changed) + { + pthread_mutex_lock(&share->bitmap.bitmap_lock); + if (share->bitmap.changed) + { + res= write_changed_bitmap(share, &share->bitmap); + share->bitmap.changed= 0; + } + pthread_mutex_unlock(&share->bitmap.bitmap_lock); + } + DBUG_RETURN(res); +} + + +/* + Intialize bitmap in memory to a zero bitmap + + SYNOPSIS + _ma_bitmap_delete_all() + share Share handler + + NOTES + This is called on maria_delete_all_rows (truncate data file). 
+*/ + +void _ma_bitmap_delete_all(MARIA_SHARE *share) +{ + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + if (bitmap->map) /* Not in create */ + { + bzero(bitmap->map, bitmap->block_size); + memcpy(bitmap->map + bitmap->block_size - sizeof(maria_bitmap_marker), + maria_bitmap_marker, sizeof(maria_bitmap_marker)); + bitmap->changed= 1; + bitmap->page= 0; + bitmap->used_size= bitmap->total_size; + } +} + + +/* + Return bitmap pattern for the smallest head block that can hold 'size' + + SYNOPSIS + size_to_head_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0-3 For a description of the bitmap sizes, see the header +*/ + +static uint size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size <= bitmap->sizes[3]) + return 3; + if (size <= bitmap->sizes[2]) + return 2; + if (size <= bitmap->sizes[1]) + return 1; + DBUG_ASSERT(size <= bitmap->sizes[0]); + return 0; +} + + +/* + Return bitmap pattern for head block where there is size bytes free + + SYNOPSIS + _ma_free_size_to_head_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0-4 (Possible bitmap patterns for head block) +*/ + +uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size < bitmap->sizes[3]) + return 4; + if (size < bitmap->sizes[2]) + return 3; + if (size < bitmap->sizes[1]) + return 2; + return (size < bitmap->sizes[0]) ? 
1 : 0; +} + + +/* + Return bitmap pattern for the smallest tail block that can hold 'size' + + SYNOPSIS + size_to_tail_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0, 5 or 6 For a description of the bitmap sizes, see the header +*/ + +static uint size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size <= bitmap->sizes[6]) + return 6; + if (size <= bitmap->sizes[5]) + return 5; + DBUG_ASSERT(size <= bitmap->sizes[0]); + return 0; +} + + +/* + Return bitmap pattern for tail block where there is size bytes free + + SYNOPSIS + free_size_to_tail_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0, 5, 6, 7 For a description of the bitmap sizes, see the header +*/ + +static uint free_size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size >= bitmap->sizes[0]) + return 0; /* Revert to empty page */ + if (size < bitmap->sizes[6]) + return 7; + if (size < bitmap->sizes[5]) + return 6; + return 5; +} + + +/* + Return size guranteed to be available on a page + + SYNOPSIS + pattern_to_head_size() + bitmap Bitmap + pattern Pattern (0-7) + + RETURN + 0 - block_size +*/ + +static inline uint pattern_to_size(MARIA_FILE_BITMAP *bitmap, uint pattern) +{ + DBUG_ASSERT(pattern <= 7); + return bitmap->sizes[pattern]; +} + + +/* + Print bitmap for debugging + + SYNOPSIS + _ma_print_bitmap() + bitmap Bitmap to print + + IMPLEMENTATION + Prints all changed bits since last call to _ma_print_bitmap(). + This is done by having a copy of the last bitmap in + bitmap->map+bitmap->block_size. 
+*/ + +#ifndef DBUG_OFF + +const char *bits_to_txt[]= +{ + "empty", "00-30% full", "30-60% full", "60-90% full", "full", + "tail 00-40 % full", "tail 40-80 % full", "tail/blob full" +}; + +static void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap) +{ + uchar *pos, *end, *org_pos; + ulong page; + + end= bitmap->map + bitmap->used_size; + DBUG_LOCK_FILE; + fprintf(DBUG_FILE,"\nBitmap page changes at page %lu\n", + (ulong) bitmap->page); + + DBUG_ASSERT(memcmp(bitmap->map + bitmap->block_size - + sizeof(maria_bitmap_marker), + maria_bitmap_marker, sizeof(maria_bitmap_marker)) == 0); + + page= (ulong) bitmap->page+1; + for (pos= bitmap->map, org_pos= bitmap->map + bitmap->block_size ; + pos < end ; + pos+= 6, org_pos+= 6) + { + ulonglong bits= uint6korr(pos); /* 6 bytes = 6*8/3= 16 patterns */ + ulonglong org_bits= uint6korr(org_pos); + uint i; + + /* + Test if there is any changes in the next 16 bitmaps (to not have to + loop through all bits if we know they are the same) + */ + if (bits != org_bits) + { + for (i= 0; i < 16 ; i++, bits>>= 3, org_bits>>= 3) + { + if ((bits & 7) != (org_bits & 7)) + fprintf(DBUG_FILE, "Page: %8lu %s -> %s\n", page+i, + bits_to_txt[org_bits & 7], bits_to_txt[bits & 7]); + } + } + page+= 16; + } + fputc('\n', DBUG_FILE); + DBUG_UNLOCK_FILE; + memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size); +} + +#endif /* DBUG_OFF */ + + +/*************************************************************************** + Reading & writing bitmap pages +***************************************************************************/ + +/* + Read a given bitmap page + + SYNOPSIS + read_bitmap_page() + info Maria handler + bitmap Bitmap handler + page Page to read + + TODO + Update 'bitmap->used_size' to real size of used bitmap + + NOTE + We don't always have share->bitmap.bitmap_lock here + (when called from_ma_check_bitmap_data() for example). 
+ + RETURN + 0 ok + 1 error (Error writing old bitmap or reading bitmap page) +*/ + +static my_bool _ma_read_bitmap_page(MARIA_SHARE *share, + MARIA_FILE_BITMAP *bitmap, + ulonglong page) +{ + my_off_t end_of_page= (page + 1) * bitmap->block_size; + my_bool res; + DBUG_ENTER("_ma_read_bitmap_page"); + DBUG_ASSERT(page % bitmap->pages_covered == 0); + + bitmap->page= page; + if (end_of_page > share->state.state.data_file_length) + { + /* + Inexistent or half-created page (could be crash in the middle of + _ma_bitmap_create_first(), before appending maria_bitmap_marker). + */ + share->state.state.data_file_length= end_of_page; + bzero(bitmap->map, bitmap->block_size); + memcpy(bitmap->map + bitmap->block_size - sizeof(maria_bitmap_marker), + maria_bitmap_marker, sizeof(maria_bitmap_marker)); + bitmap->used_size= 0; +#ifndef DBUG_OFF + memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size); +#endif + DBUG_RETURN(0); + } + bitmap->used_size= bitmap->total_size; + DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size); + res= ((pagecache_read(share->pagecache, + (PAGECACHE_FILE*)&bitmap->file, page, 0, + (uchar*) bitmap->map, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == NULL) || + memcmp(bitmap->map + bitmap->block_size - + sizeof(maria_bitmap_marker), + maria_bitmap_marker, sizeof(maria_bitmap_marker))); +#ifndef DBUG_OFF + if (!res) + memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size); +#endif + DBUG_RETURN(res); +} + + +/* + Change to another bitmap page + + SYNOPSIS + _ma_change_bitmap_page() + info Maria handler + bitmap Bitmap handler + page Bitmap page to read + + NOTES + If old bitmap was changed, write it out before reading new one + We return empty bitmap if page is outside of file size + + RETURN + 0 ok + 1 error (Error writing old bitmap or reading bitmap page) +*/ + +static my_bool _ma_change_bitmap_page(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + ulonglong page) +{ + 
DBUG_ENTER("_ma_change_bitmap_page"); + + if (bitmap->changed) + { + if (write_changed_bitmap(info->s, bitmap)) + DBUG_RETURN(1); + bitmap->changed= 0; + } + DBUG_RETURN(_ma_read_bitmap_page(info->s, bitmap, page)); +} + + +/* + Read next suitable bitmap + + SYNOPSIS + move_to_next_bitmap() + bitmap Bitmap handle + + NOTES + The found bitmap may be full, so calling function may need to call this + repeatedly until it finds enough space. + + TODO + Add cache of bitmaps to not read something that is not usable + + RETURN + 0 ok + 1 error (either couldn't save old bitmap or read new one +*/ + +static my_bool move_to_next_bitmap(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap) +{ + ulonglong page= bitmap->page; + MARIA_STATE_INFO *state= &info->s->state; + DBUG_ENTER("move_to_next_bitmap"); + + if (state->first_bitmap_with_space != ~(ulonglong) 0 && + state->first_bitmap_with_space != page) + { + page= state->first_bitmap_with_space; + state->first_bitmap_with_space= ~(ulonglong) 0; + } + else + page+= bitmap->pages_covered; + DBUG_RETURN(_ma_change_bitmap_page(info, bitmap, page)); +} + + +/**************************************************************************** + Allocate data in bitmaps +****************************************************************************/ + +/* + Store data in 'block' and mark the place used in the bitmap + + SYNOPSIS + fill_block() + bitmap Bitmap handle + block Store data about what we found + best_data Pointer to best 6 uchar aligned area in bitmap->map + best_pos Which bit in *best_data the area starts + 0 = first bit pattern, 1 second bit pattern etc + best_bits The original value of the bits at best_pos + fill_pattern Bitmap pattern to store in best_data[best_pos] + + NOTES + We mark all pages to be 'TAIL's, which means that + block->page_count is really a row position inside the page. 
+*/
+
+static void fill_block(MARIA_FILE_BITMAP *bitmap,
+                       MARIA_BITMAP_BLOCK *block,
+                       uchar *best_data, uint best_pos, uint best_bits,
+                       uint fill_pattern)
+{
+  uint page, offset, tmp;
+  uchar *data;
+
+  /*
+    For each 6 bytes we have 6*8/3= 16 patterns
+    'page' is the index of the chosen pattern counted from the start of
+    bitmap->map; the corresponding data page is bitmap->page + 1 + page.
+  */
+  page= (best_data - bitmap->map) / 6 * 16 + best_pos;
+  block->page= bitmap->page + 1 + page;
+  block->page_count= 1 + TAIL_BIT;
+  block->empty_space= pattern_to_size(bitmap, best_bits);
+  block->sub_blocks= 1;
+  block->org_bitmap_value= best_bits;
+  block->used= BLOCKUSED_TAIL;                  /* See _ma_bitmap_release_unused() */
+
+  /*
+    Mark place used by reading/writing 2 bytes at a time to handle
+    bitmaps in overlapping bytes
+    (a 3 bit pattern may straddle a byte boundary; best_pos is converted
+    from a pattern index to a bit index first)
+  */
+  best_pos*= 3;
+  data= best_data+ best_pos / 8;
+  offset= best_pos & 7;
+  tmp= uint2korr(data);
+
+  /* we turn off the 3 bits and replace them with fill_pattern */
+  tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset);
+  int2store(data, tmp);
+  bitmap->changed= 1;
+  DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap););
+}
+
+
+/*
+  Allocate data for head block
+
+  SYNOPSIS
+    allocate_head()
+    bitmap       bitmap
+    size         Size of data region we need to store
+    block        Store found information here
+
+  IMPLEMENTATION
+    Find the best-fit page to put a region of 'size'
+    This is defined as the first page of the set of pages
+    with the smallest free space that can hold 'size'.
+
+    If no suitable pattern is found among the bitmap->used_size bytes in
+    use, the first page of a new 6 byte group at the end of the used area
+    is taken, unless used_size has already reached total_size.
+
+  RETURN
+    0  ok    (block is updated)
+    1  error (no space in bitmap; block is not touched)
+*/
+
+
+static my_bool allocate_head(MARIA_FILE_BITMAP *bitmap, uint size,
+                             MARIA_BITMAP_BLOCK *block)
+{
+  uint min_bits= size_to_head_pattern(bitmap, size);
+  uchar *data= bitmap->map, *end= data + bitmap->used_size;
+  uchar *best_data= 0;
+  uint best_bits= (uint) -1, best_pos;
+  DBUG_ENTER("allocate_head");
+
+  LINT_INIT(best_pos);
+  DBUG_ASSERT(size <= FULL_PAGE_SIZE(bitmap->block_size));
+
+  for (; data < end; data += 6)
+  {
+    ulonglong bits= uint6korr(data);    /* 6 bytes = 6*8/3= 16 patterns */
+    uint i;
+
+    /*
+      Skip common patterns
+      We can skip empty pages (if we already found a match) or
+      anything matching the following pattern as this will be either
+      a full page or a tail page
+      (the octal mask tests the highest bit of each of the 16 patterns)
+    */
+    if ((!bits && best_data) ||
+        ((bits & LL(04444444444444444)) == LL(04444444444444444)))
+      continue;
+    for (i= 0; i < 16 ; i++, bits >>= 3)
+    {
+      uint pattern= bits & 7;
+      if (pattern <= min_bits)
+      {
+        /* There is enough space here */
+        if (pattern == min_bits)
+        {
+          /* There is exactly enough space here, return this page */
+          best_bits= min_bits;
+          best_data= data;
+          best_pos= i;
+          goto found;
+        }
+        if ((int) pattern > (int) best_bits)
+        {
+          /*
+            There is more than enough space here and it's better than what
+            we have found so far. Remember it, as we will choose it if we
+            don't find anything in this bitmap page.
+            (best_bits starts as (uint) -1, which is -1 in the signed
+            compare above, so the first candidate is always accepted)
+          */
+          best_bits= pattern;
+          best_data= data;
+          best_pos= i;
+        }
+      }
+    }
+  }
+  if (!best_data)                               /* Found no place */
+  {
+    if (bitmap->used_size == bitmap->total_size)
+      DBUG_RETURN(1);                           /* No space in bitmap */
+    /* Allocate data at end of bitmap ('data' == 'end' after the loop) */
+    bitmap->used_size+= 6;
+    best_data= data;
+    best_pos= best_bits= 0;
+  }
+
+found:
+  fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_HEAD_PAGE);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Allocate data for tail block
+
+  SYNOPSIS
+    allocate_tail()
+    bitmap       bitmap
+    size         Size of block we need to find
+    block        Store found information here
+
+  NOTES
+    Same search as allocate_head(), but only patterns 0 (empty page) and
+    5-7 (tail pages) are considered, and the chosen page is marked
+    FULL_TAIL_PAGE.
+
+  RETURN
+   0  ok      (block is updated)
+   1  error   (no space in bitmap; block is not touched)
+*/
+
+
+static my_bool allocate_tail(MARIA_FILE_BITMAP *bitmap, uint size,
+                             MARIA_BITMAP_BLOCK *block)
+{
+  uint min_bits= size_to_tail_pattern(bitmap, size);
+  uchar *data= bitmap->map, *end= data + bitmap->used_size;
+  uchar *best_data= 0;
+  uint best_bits= (uint) -1, best_pos;
+  DBUG_ENTER("allocate_tail");
+  DBUG_PRINT("enter", ("size: %u", size));
+
+  LINT_INIT(best_pos);
+  DBUG_ASSERT(size <= FULL_PAGE_SIZE(bitmap->block_size));
+
+  for (; data < end; data += 6)
+  {
+    ulonglong bits= uint6korr(data);    /* 6 bytes = 6*8/3= 16 patterns */
+    uint i;
+
+    /*
+      Skip common patterns
+      We can skip empty pages (if we already found a match) or
+      the following patterns: 1-4 (head pages, not suitable for tail) or
+      7 (full tail page). See 'Dynamic size records' comment at start of file.
+
+      At the moment we only skip full tail pages (ie, all bits are
+      set) as this is easy to detect with one simple test and is a
+      quite common case if we have blobs.
+    */
+
+    if ((!bits && best_data) || bits == LL(0xffffffffffff))
+      continue;
+    for (i= 0; i < 16; i++, bits >>= 3)
+    {
+      uint pattern= bits & 7;
+      if (pattern <= min_bits && (!pattern || pattern >= 5)) /* empty or tail */
+      {
+        if (pattern == min_bits)                /* Exact fit; take it */
+        {
+          best_bits= min_bits;
+          best_data= data;
+          best_pos= i;
+          goto found;
+        }
+        if ((int) pattern > (int) best_bits)    /* Tighter fit than before */
+        {
+          best_bits= pattern;
+          best_data= data;
+          best_pos= i;
+        }
+      }
+    }
+  }
+  if (!best_data)
+  {
+    if (bitmap->used_size == bitmap->total_size)
+      DBUG_RETURN(1);
+    /* Allocate data at end of bitmap */
+    best_data= end;
+    bitmap->used_size+= 6;
+    best_pos= best_bits= 0;
+  }
+
+found:
+  fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_TAIL_PAGE);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Allocate data for full blocks
+
+  SYNOPSIS
+    allocate_full_pages()
+    bitmap       bitmap
+    pages_needed Total size in pages (bitmap->total_size) we would like to have
+    block        Store found information here
+    full_page    1 if we are not allowed to split extent
+
+  IMPLEMENTATION
+    We will return the smallest area >= size.  If there is no such
+    block, we will return the biggest area that satisfies
+    area_size >= min(BLOB_SEGMENT_MIN_SIZE*full_page_size, size)
+
+    To speed up searches, we will only consider areas that has at least 16 free
+    pages starting on an even boundary.  When finding such an area, we will
+    extend it with all previous and following free pages.
This will ensure + we don't get holes between areas + + RETURN + # Blocks used + 0 error (no space in bitmap; block is not touched) +*/ + +static ulong allocate_full_pages(MARIA_FILE_BITMAP *bitmap, + ulong pages_needed, + MARIA_BITMAP_BLOCK *block, my_bool full_page) +{ + uchar *data= bitmap->map, *data_end= data + bitmap->used_size; + uchar *page_end= data + bitmap->total_size; + uchar *best_data= 0; + uint min_size; + uint best_area_size, best_prefix_area_size, best_suffix_area_size; + uint page, size; + ulonglong best_prefix_bits; + DBUG_ENTER("allocate_full_pages"); + DBUG_PRINT("enter", ("pages_needed: %lu", pages_needed)); + + /* Following variables are only used if best_data is set */ + LINT_INIT(best_prefix_bits); + LINT_INIT(best_prefix_area_size); + LINT_INIT(best_suffix_area_size); + + min_size= pages_needed; + if (!full_page && min_size > BLOB_SEGMENT_MIN_SIZE) + min_size= BLOB_SEGMENT_MIN_SIZE; + best_area_size= ~(uint) 0; + + for (; data < page_end; data+= 6) + { + ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */ + uchar *data_start; + ulonglong prefix_bits= 0; + uint area_size, prefix_area_size, suffix_area_size; + + /* Find area with at least 16 free pages */ + if (bits) + continue; + data_start= data; + /* Find size of area */ + for (data+=6 ; data < data_end ; data+= 6) + { + if ((bits= uint6korr(data))) + break; + } + area_size= (data - data_start) / 6 * 16; + if (area_size >= best_area_size) + continue; + prefix_area_size= suffix_area_size= 0; + if (!bits) + { + /* + End of page; All the rest of the bits on page are part of area + This is needed because bitmap->used_size only covers the set bits + in the bitmap. 
+ */ + area_size+= (page_end - data) / 6 * 16; + if (area_size >= best_area_size) + break; + data= page_end; + } + else + { + /* Add bits at end of page */ + for (; !(bits & 7); bits >>= 3) + suffix_area_size++; + area_size+= suffix_area_size; + } + if (data_start != bitmap->map) + { + /* Add bits before page */ + bits= prefix_bits= uint6korr(data_start - 6); + DBUG_ASSERT(bits != 0); + /* 111 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 */ + if (!(bits & LL(07000000000000000))) + { + data_start-= 6; + do + { + prefix_area_size++; + bits<<= 3; + } while (!(bits & LL(07000000000000000))); + area_size+= prefix_area_size; + /* Calculate offset to page from data_start */ + prefix_area_size= 16 - prefix_area_size; + } + } + if (area_size >= min_size && area_size <= best_area_size) + { + best_data= data_start; + best_area_size= area_size; + best_prefix_bits= prefix_bits; + best_prefix_area_size= prefix_area_size; + best_suffix_area_size= suffix_area_size; + + /* Prefer to put data in biggest possible area */ + if (area_size <= pages_needed) + min_size= area_size; + else + min_size= pages_needed; + } + } + if (!best_data) + DBUG_RETURN(0); /* No room on page */ + + /* + Now allocate min(pages_needed, area_size), starting from + best_start + best_prefix_area_size + */ + if (best_area_size > pages_needed) + best_area_size= pages_needed; + + /* For each 6 bytes we have 6*8/3= 16 patterns */ + page= ((best_data - bitmap->map) * 8) / 3 + best_prefix_area_size; + block->page= bitmap->page + 1 + page; + block->page_count= best_area_size; + block->empty_space= 0; + block->sub_blocks= 1; + block->org_bitmap_value= 0; + block->used= 0; + DBUG_PRINT("info", ("page: %lu page_count: %u", + (ulong) block->page, block->page_count)); + + if (best_prefix_area_size) + { + ulonglong tmp; + /* Convert offset back to bits */ + best_prefix_area_size= 16 - best_prefix_area_size; + if (best_area_size < best_prefix_area_size) + { + tmp= (LL(1) << best_area_size*3) - 1; + 
best_area_size= best_prefix_area_size; /* for easy end test */ + } + else + tmp= (LL(1) << best_prefix_area_size*3) - 1; + tmp<<= (16 - best_prefix_area_size) * 3; + DBUG_ASSERT((best_prefix_bits & tmp) == 0); + best_prefix_bits|= tmp; + int6store(best_data, best_prefix_bits); + if (!(best_area_size-= best_prefix_area_size)) + { + DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap);); + DBUG_RETURN(block->page_count); + } + best_data+= 6; + } + best_area_size*= 3; /* Bits to set */ + size= best_area_size/8; /* Bytes to set */ + bfill(best_data, size, 255); + best_data+= size; + if ((best_area_size-= size * 8)) + { + /* fill last uchar */ + *best_data|= (uchar) ((1 << best_area_size) -1); + best_data++; + } + if (data_end < best_data) + bitmap->used_size= (uint) (best_data - bitmap->map); + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap);); + DBUG_RETURN(block->page_count); +} + + +/**************************************************************************** + Find right bitmaps where to store data +****************************************************************************/ + +/* + Find right bitmap and position for head block + + SYNOPSIS + find_head() + info Maria handler + length Size of data region we need store + position Position in bitmap_blocks where to store the + information for the head block. + + RETURN + 0 ok + 1 error +*/ + +static my_bool find_head(MARIA_HA *info, uint length, uint position) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + MARIA_BITMAP_BLOCK *block; + /* + There is always place for the head block in bitmap_blocks as these are + preallocated at _ma_init_block_record(). 
+ */ + block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *); + + while (allocate_head(bitmap, length, block)) + if (move_to_next_bitmap(info, bitmap)) + return 1; + return 0; +} + + +/* + Find right bitmap and position for tail + + SYNOPSIS + find_tail() + info Maria handler + length Size of data region we need store + position Position in bitmap_blocks where to store the + information for the head block. + + RETURN + 0 ok + 1 error +*/ + +static my_bool find_tail(MARIA_HA *info, uint length, uint position) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + MARIA_BITMAP_BLOCK *block; + DBUG_ENTER("find_tail"); + + /* Needed, as there is no error checking in dynamic_element */ + if (allocate_dynamic(&info->bitmap_blocks, position)) + DBUG_RETURN(1); + block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *); + + while (allocate_tail(bitmap, length, block)) + if (move_to_next_bitmap(info, bitmap)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/* + Find right bitmap and position for full blocks in one extent + + SYNOPSIS + find_mid() + info Maria handler. + pages How many pages to allocate. + position Position in bitmap_blocks where to store the + information for the head block. + NOTES + This is used to allocate the main extent after the 'head' block + (Ie, the middle part of the head-middle-tail entry) + + RETURN + 0 ok + 1 error +*/ + +static my_bool find_mid(MARIA_HA *info, ulong pages, uint position) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + MARIA_BITMAP_BLOCK *block; + block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *); + + while (!allocate_full_pages(bitmap, pages, block, 1)) + { + if (move_to_next_bitmap(info, bitmap)) + return 1; + } + return 0; +} + + +/* + Find right bitmap and position for putting a blob + + SYNOPSIS + find_blob() + info Maria handler. 
+    length              Length of the blob
+
+  NOTES
+    The extents are stored last in info->bitmap_blocks
+
+  IMPLEMENTATION
+    Allocate all full pages for the block + optionally one tail
+
+    A rest of at least MAX_TAIL_SIZE bytes is rounded up to one more full
+    page instead of a tail. Full pages are requested in chunks of at most
+    65535 pages; each successful chunk becomes one element in
+    info->bitmap_blocks. The first element's sub_blocks is set to the
+    number of elements added for this blob.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool find_blob(MARIA_HA *info, ulong length)
+{
+  MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+  uint full_page_size= FULL_PAGE_SIZE(info->s->block_size);
+  ulong pages;
+  uint rest_length, used;
+  uint first_block_pos;                 /* Only set and used when pages != 0 */
+  MARIA_BITMAP_BLOCK *first_block= 0;
+  DBUG_ENTER("find_blob");
+  DBUG_PRINT("enter", ("length: %lu", length));
+
+  pages= length / full_page_size;
+  rest_length= (uint) (length - pages * full_page_size);
+  if (rest_length >= MAX_TAIL_SIZE(info->s->block_size))
+  {
+    pages++;
+    rest_length= 0;
+  }
+
+  if (pages)
+  {
+    MARIA_BITMAP_BLOCK *block;
+    if (allocate_dynamic(&info->bitmap_blocks,
+                         info->bitmap_blocks.elements +
+                         pages / BLOB_SEGMENT_MIN_SIZE + 2))
+      DBUG_RETURN(1);
+    first_block_pos= info->bitmap_blocks.elements;
+    block= dynamic_element(&info->bitmap_blocks, info->bitmap_blocks.elements,
+                           MARIA_BITMAP_BLOCK*);
+    first_block= block;
+    do
+    {
+      used= allocate_full_pages(bitmap,
+                                (pages >= 65535 ? 65535 : (uint) pages), block,
+                                0);
+      if (!used)
+      {
+        if (move_to_next_bitmap(info, bitmap))
+          DBUG_RETURN(1);
+      }
+      else
+      {
+        pages-= used;
+        info->bitmap_blocks.elements++;
+        block++;
+      }
+    } while (pages != 0);
+  }
+  if (rest_length && find_tail(info, rest_length,
+                               info->bitmap_blocks.elements++))
+    DBUG_RETURN(1);
+  /*
+    NOTE(review): first_block was fetched before find_tail(), which calls
+    allocate_dynamic(); this presumably relies on the reservation made
+    above being large enough that the array is never moved - confirm.
+  */
+  if (first_block)
+    first_block->sub_blocks= info->bitmap_blocks.elements - first_block_pos;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Find pages to put ALL blobs
+
+  SYNOPSIS
+  allocate_blobs()
+  info          Maria handler
+  row           Information of what is in the row (from calc_record_size())
+
+  NOTES
+    Blobs of length 0 are skipped. On success row->extents_count is set to
+    the number of elements added to info->bitmap_blocks.
+
+  RETURN
+   0    ok
+   1    error
+*/
+
+static my_bool allocate_blobs(MARIA_HA *info, MARIA_ROW *row)
+{
+  ulong *length, *end;
+  uint elements;
+  /*
+    Reserve size for:
+    head block
+    one extent
+    tail block
+  */
+  elements= info->bitmap_blocks.elements;
+  for (length= row->blob_lengths, end= length + info->s->base.blobs;
+       length < end; length++)
+  {
+    if (*length && find_blob(info, *length))
+      return 1;
+  }
+  row->extents_count= (info->bitmap_blocks.elements - elements);
+  return 0;
+}
+
+
+/*
+  Store in the bitmap the new size for a head page
+
+  SYNOPSIS
+    use_head()
+    info                Maria handler
+    page                Page number to update
+                        (Note that caller guarantees this is in the active
+                        bitmap)
+    size                How much free space is left on the page
+    block_position      In which info->bitmap_block we have the
+                        information about the head block.
+
+  NOTES
+    This is used on update where we are updating an existing head page
+
+    The page is marked FULL_HEAD_PAGE in the bitmap and its previous
+    pattern is saved in block->org_bitmap_value.
+*/
+
+static void use_head(MARIA_HA *info, ulonglong page, uint size,
+                     uint block_position)
+{
+  MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+  MARIA_BITMAP_BLOCK *block;
+  uchar *data;
+  uint offset, tmp, offset_page;
+
+  block= dynamic_element(&info->bitmap_blocks, block_position,
+                         MARIA_BITMAP_BLOCK*);
+  block->page= page;
+  block->page_count= 1 + TAIL_BIT;
+  block->empty_space= size;
+  block->sub_blocks= 1;
+  block->used= BLOCKUSED_TAIL;
+
+  /*
+    Mark place used by reading/writing 2 bytes at a time to handle
+    bitmaps in overlapping bytes
+  */
+  offset_page= (uint) (page - bitmap->page - 1) * 3;
+  offset= offset_page & 7;
+  data= bitmap->map + offset_page / 8;
+  tmp= uint2korr(data);
+  block->org_bitmap_value= (tmp >> offset) & 7;
+  tmp= (tmp & ~(7 << offset)) | (FULL_HEAD_PAGE << offset);
+  int2store(data, tmp);
+  bitmap->changed= 1;
+  DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap););
+}
+
+
+/*
+  Find out where to split the row (ie, what goes in head, middle, tail etc)
+
+  SYNOPSIS
+    find_where_to_split_row()
+    share           Maria share
+    row             Information of what is in the row (from calc_record_size())
+    extents_length  Number of bytes needed to store all extents
+    split_size      Free size on the page (The head length must be less
+                    than this)
+
+  NOTES
+    Starting from row->base_length, part lengths are added in order until
+    the next part would make the sum exceed split_size.
+
+  RETURN
+    row_length for the head block.
+*/
+
+static uint find_where_to_split_row(MARIA_SHARE *share, MARIA_ROW *row,
+                                    uint extents_length, uint split_size)
+{
+  uint row_length= row->base_length;
+  uint *lengths, *lengths_end;
+
+  DBUG_ASSERT(row_length < split_size);
+  /*
+    Store first in all_field_lengths the different parts that are written
+    to the row. This needs to be in same order as in
+    ma_block_rec.c::write_block_record()
+    (the three slots just before null_field_lengths[0] are used for this)
+  */
+  row->null_field_lengths[-3]= extents_length;
+  row->null_field_lengths[-2]= share->base.fixed_not_null_fields_length;
+  row->null_field_lengths[-1]= row->field_lengths_length;
+  for (lengths= row->null_field_lengths - EXTRA_LENGTH_FIELDS,
+       lengths_end= (lengths + share->base.pack_fields - share->base.blobs +
+                     EXTRA_LENGTH_FIELDS); lengths < lengths_end; lengths++)
+  {
+    if (row_length + *lengths > split_size)
+      break;
+    row_length+= *lengths;
+  }
+  return row_length;
+}
+
+
+/*
+  Find where to write the middle parts of the row and the tail
+
+  SYNOPSIS
+    write_rest_of_head()
+    info        Maria handler
+    position    Position in bitmap_blocks. Is 0 for rows that needs
+                full blocks (ie, has a head, middle part and optional tail)
+    rest_length How much left of the head block to write.
+
+  NOTES
+    When position is 0 the full pages are stored in bitmap_blocks element 1
+    and element 2 is made an empty 'filler' block (page_count == 0).
+    The tail, or an empty block if there is no tail, is stored in element
+    ELEMENTS_RESERVED_FOR_MAIN_PART - 1.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool write_rest_of_head(MARIA_HA *info, uint position,
+                                  ulong rest_length)
+{
+  MARIA_SHARE *share= info->s;
+  uint full_page_size= FULL_PAGE_SIZE(share->block_size);
+  MARIA_BITMAP_BLOCK *block;
+  DBUG_ENTER("write_rest_of_head");
+  DBUG_PRINT("enter", ("position: %u rest_length: %lu", position,
+                       rest_length));
+
+  if (position == 0)
+  {
+    /* Write out full pages */
+    uint pages= rest_length / full_page_size;
+
+    rest_length%= full_page_size;
+    if (rest_length >= MAX_TAIL_SIZE(share->block_size))
+    {
+      /* Put tail on a full page */
+      pages++;
+      rest_length= 0;
+    }
+    if (find_mid(info, pages, 1))
+      DBUG_RETURN(1);
+    /*
+      Insert empty block after full pages, to allow write_block_record() to
+      split segment into used + free page
+    */
+    block= dynamic_element(&info->bitmap_blocks, 2, MARIA_BITMAP_BLOCK*);
+    block->page_count= 0;
+    block->used= 0;
+  }
+  if (rest_length)
+  {
+    if (find_tail(info, rest_length, ELEMENTS_RESERVED_FOR_MAIN_PART - 1))
+      DBUG_RETURN(1);
+  }
+  else
+  {
+    /* Empty tail block */
+    block= dynamic_element(&info->bitmap_blocks,
+                           ELEMENTS_RESERVED_FOR_MAIN_PART - 1,
+                           MARIA_BITMAP_BLOCK *);
+    block->page_count= 0;
+    block->used= 0;
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Find where to store one row
+
+  SYNOPSIS
+    _ma_bitmap_find_place()
+    info                  Maria handler
+    row                   Information about row to write
+    blocks                Store data about allocated places here
+
+  NOTES
+    share->bitmap.bitmap_lock is taken for the whole search and released
+    on both the success and the error path.
+
+  RETURN
+    0  ok
+       row->space_on_head_page contains minimum number of bytes we
+       expect to put on the head page.
+    1  error
+*/
+
+my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row,
+                              MARIA_BITMAP_BLOCKS *blocks)
+{
+  MARIA_SHARE *share= info->s;
+  my_bool res= 1;
+  uint full_page_size, position, max_page_size;
+  uint head_length, row_length, rest_length, extents_length;
+  DBUG_ENTER("_ma_bitmap_find_place");
+
+  blocks->count= 0;
+  blocks->tail_page_skipped= blocks->page_skipped= 0;
+  row->extents_count= 0;
+
+  /*
+    Reserve place for the following blocks:
+     - Head block
+     - Full page block
+     - Marker block to allow write_block_record() to split full page blocks
+       into full and free part
+     - Tail block
+  */
+
+  info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART;
+  max_page_size= (share->block_size - PAGE_OVERHEAD_SIZE);
+
+  pthread_mutex_lock(&share->bitmap.bitmap_lock);
+
+  if (row->total_length <= max_page_size)
+  {
+    /* Row fits in one page */
+    position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1;
+    if (find_head(info, (uint) row->total_length, position))
+      goto abort;
+    row->space_on_head_page= row->total_length;
+    goto end;
+  }
+
+  /*
+    First allocate all blobs (so that we can find out the needed size for
+    the main block).
+  */
+  if (row->blob_length && allocate_blobs(info, row))
+    goto abort;
+
+  extents_length= row->extents_count * ROW_EXTENT_SIZE;
+  if ((head_length= (row->head_length + extents_length)) <= max_page_size)
+  {
+    /* Main row part fits into one page */
+    position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1;
+    if (find_head(info, head_length, position))
+      goto abort;
+    row->space_on_head_page= head_length;
+    goto end;
+  }
+
+  /* Allocate enough space */
+  head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE;
+
+  /* The first segment size is stored in 'row_length' */
+  row_length= find_where_to_split_row(share, row, extents_length,
+                                      max_page_size);
+
+  full_page_size= FULL_PAGE_SIZE(share->block_size);
+  position= 0;
+  if (head_length - row_length <= full_page_size)
+    position= ELEMENTS_RESERVED_FOR_MAIN_PART -2;    /* Only head and tail */
+  if (find_head(info, row_length, position))
+    goto abort;
+  row->space_on_head_page= row_length;
+  rest_length= head_length - row_length;
+  if (write_rest_of_head(info, position, rest_length))
+    goto abort;
+
+end:
+  blocks->block= dynamic_element(&info->bitmap_blocks, position,
+                                 MARIA_BITMAP_BLOCK*);
+  blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position;
+  /* First block's page_count is for all blocks */
+  blocks->count= info->bitmap_blocks.elements - position;
+  res= 0;
+
+abort:
+  pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Find where to put row on update (when head page is already defined)
+
+  SYNOPSIS
+    _ma_bitmap_find_new_place()
+    info                  Maria handler
+    row                   Information about row to write
+    page                  On which page original row was stored
+    free_size             Free size on head page
+    blocks                Store data about allocated places here
+
+  NOTES
+   This function is only called when the new row can't fit in the space of
+   the old row in the head page.
+
+   This is essentially same as _ma_bitmap_find_place() except that
+   we don't call find_head() to search in bitmaps where to put the page.
+ + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *row, + ulonglong page, uint free_size, + MARIA_BITMAP_BLOCKS *blocks) +{ + MARIA_SHARE *share= info->s; + my_bool res= 1; + uint full_page_size, position; + uint head_length, row_length, rest_length, extents_length; + DBUG_ENTER("_ma_bitmap_find_new_place"); + + blocks->count= 0; + blocks->tail_page_skipped= blocks->page_skipped= 0; + row->extents_count= 0; + info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART; + + pthread_mutex_lock(&share->bitmap.bitmap_lock); + if (share->bitmap.page != page / share->bitmap.pages_covered && + _ma_change_bitmap_page(info, &share->bitmap, + page / share->bitmap.pages_covered)) + goto abort; + + /* + First allocate all blobs (so that we can find out the needed size for + the main block. + */ + if (row->blob_length && allocate_blobs(info, row)) + goto abort; + + extents_length= row->extents_count * ROW_EXTENT_SIZE; + if ((head_length= (row->head_length + extents_length)) <= free_size) + { + /* Main row part fits into one page */ + position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1; + use_head(info, page, head_length, position); + goto end; + } + + /* Allocate enough space */ + head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE; + + /* The first segment size is stored in 'row_length' */ + row_length= find_where_to_split_row(share, row, extents_length, free_size); + + full_page_size= FULL_PAGE_SIZE(share->block_size); + position= 0; + if (head_length - row_length <= full_page_size) + position= ELEMENTS_RESERVED_FOR_MAIN_PART -2; /* Only head and tail */ + use_head(info, page, row_length, position); + rest_length= head_length - row_length; + + if (write_rest_of_head(info, position, rest_length)) + goto abort; + +end: + blocks->block= dynamic_element(&info->bitmap_blocks, position, + MARIA_BITMAP_BLOCK*); + blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position; + /* First block's page_count is for all blocks 
*/ + blocks->count= info->bitmap_blocks.elements - position; + res= 0; + +abort: + pthread_mutex_unlock(&share->bitmap.bitmap_lock); + DBUG_RETURN(res); +} + + +/**************************************************************************** + Clear and reset bits +****************************************************************************/ + +/* + Set fill pattern for a page + + set_page_bits() + info Maria handler + bitmap Bitmap handler + page Adress to page + fill_pattern Pattern (not size) for page + + NOTES + Page may not be part of active bitmap + + RETURN + 0 ok + 1 error +*/ + +static my_bool set_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap, + ulonglong page, uint fill_pattern) +{ + ulonglong bitmap_page; + uint offset_page, offset, tmp, org_tmp; + uchar *data; + DBUG_ENTER("set_page_bits"); + + bitmap_page= page - page % bitmap->pages_covered; + if (bitmap_page != bitmap->page && + _ma_change_bitmap_page(info, bitmap, bitmap_page)) + DBUG_RETURN(1); + + /* Find page number from start of bitmap */ + offset_page= page - bitmap->page - 1; + /* + Mark place used by reading/writing 2 bytes at a time to handle + bitmaps in overlapping bytes + */ + offset_page*= 3; + offset= offset_page & 7; + data= bitmap->map + offset_page / 8; + org_tmp= tmp= uint2korr(data); + tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset); + if (tmp == org_tmp) + DBUG_RETURN(0); /* No changes */ + int2store(data, tmp); + + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap);); + if (fill_pattern != 3 && fill_pattern != 7) + set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page); + /* + Note that if the condition above is false (page is full), and all pages of + this bitmap are now full, and that bitmap page was + first_bitmap_with_space, we don't modify first_bitmap_with_space, indeed + its value still tells us where to start our search for a bitmap with space + (which is for sure after this full one). 
+ That does mean that first_bitmap_with_space is only a lower bound. + */ + DBUG_RETURN(0); +} + + +/* + Get bitmap pattern for a given page + + SYNOPSIS + get_page_bits() + info Maria handler + bitmap Bitmap handler + page Page number + + RETURN + 0-7 Bitmap pattern + ~0 Error (couldn't read page) +*/ + +static uint get_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap, + ulonglong page) +{ + ulonglong bitmap_page; + uint offset_page, offset, tmp; + uchar *data; + DBUG_ENTER("get_page_bits"); + + bitmap_page= page - page % bitmap->pages_covered; + if (bitmap_page != bitmap->page && + _ma_change_bitmap_page(info, bitmap, bitmap_page)) + DBUG_RETURN(~ (uint) 0); + + /* Find page number from start of bitmap */ + offset_page= page - bitmap->page - 1; + /* + Mark place used by reading/writing 2 bytes at a time to handle + bitmaps in overlapping bytes + */ + offset_page*= 3; + offset= offset_page & 7; + data= bitmap->map + offset_page / 8; + tmp= uint2korr(data); + DBUG_RETURN((tmp >> offset) & 7); +} + + +/* + Mark all pages in a region as free + + SYNOPSIS + _ma_reset_full_page_bits() + info Maria handler + bitmap Bitmap handler + page Start page + page_count Number of pages + + NOTES + We assume that all pages in region is covered by same bitmap + One must have a lock on info->s->bitmap.bitmap_lock + + RETURN + 0 ok + 1 Error (when reading bitmap) +*/ + +my_bool _ma_reset_full_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap, + ulonglong page, uint page_count) +{ + ulonglong bitmap_page; + uint offset, bit_start, bit_count, tmp; + uchar *data; + DBUG_ENTER("_ma_reset_full_page_bits"); + DBUG_PRINT("enter", ("page: %lu page_count: %u", (ulong) page, page_count)); + safe_mutex_assert_owner(&info->s->bitmap.bitmap_lock); + + bitmap_page= page - page % bitmap->pages_covered; + if (bitmap_page != bitmap->page && + _ma_change_bitmap_page(info, bitmap, bitmap_page)) + DBUG_RETURN(1); + + /* Find page number from start of bitmap */ + page= page - bitmap->page - 1; + + 
/* Clear bits from 'page * 3' -> '(page + page_count) * 3' */ + bit_start= page * 3; + bit_count= page_count * 3; + + data= bitmap->map + bit_start / 8; + offset= bit_start & 7; + + tmp= (255 << offset); /* Bits to keep */ + if (bit_count + offset < 8) + { + /* Only clear bits between 'offset' and 'offset+bit_count-1' */ + tmp^= (255 << (offset + bit_count)); + } + *data&= ~tmp; + + if ((int) (bit_count-= (8 - offset)) > 0) + { + uint fill; + data++; + /* + -1 is here to avoid one 'if' statement and to let the following code + handle the last byte + */ + if ((fill= (bit_count - 1) / 8)) + { + bzero(data, fill); + data+= fill; + } + bit_count-= fill * 8; /* Bits left to clear */ + tmp= (1 << bit_count) - 1; + *data&= ~tmp; + } + set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page); + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap(bitmap);); + DBUG_RETURN(0); +} + + +/* + Correct bitmap pages to reflect the true allocation + + SYNOPSIS + _ma_bitmap_release_unused() + info Maria handle + blocks Bitmap blocks + + IMPLEMENTATION + If block->used & BLOCKUSED_TAIL is set: + If block->used & BLOCKUSED_USED is set, then the bits for the + corresponding page is set according to block->empty_space + If block->used & BLOCKUSED_USED is not set, then the bits for + the corresponding page is set to org_bitmap_value; + + If block->used & BLOCKUSED_TAIL is not set: + if block->used is not set, the bits for the corresponding page are + cleared + + For the first block (head block) the logic is same as for a tail block + + Note that we may have 'filler blocks' that are used to split a block + in half; These can be recognized by that they have page_count == 0. 
+ + RETURN + 0 ok + 1 error (Couldn't write or read bitmap page) +*/ + +my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks) +{ + MARIA_BITMAP_BLOCK *block= blocks->block, *end= block + blocks->count; + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + uint bits, current_bitmap_value; + DBUG_ENTER("_ma_bitmap_release_unused"); + + /* + We can skip FULL_HEAD_PAGE (4) as the page was marked as 'full' + when we allocated space in the page + */ + current_bitmap_value= FULL_HEAD_PAGE; + + pthread_mutex_lock(&info->s->bitmap.bitmap_lock); + + /* First handle head block */ + if (block->used & BLOCKUSED_USED) + { + DBUG_PRINT("info", ("head empty_space: %u", block->empty_space)); + bits= _ma_free_size_to_head_pattern(bitmap, block->empty_space); + if (block->used & BLOCKUSED_USE_ORG_BITMAP) + current_bitmap_value= block->org_bitmap_value; + } + else + bits= block->org_bitmap_value; + if (bits != current_bitmap_value && + set_page_bits(info, bitmap, block->page, bits)) + goto err; + + + /* Handle all full pages and tail pages (for head page and blob) */ + for (block++; block < end; block++) + { + uint page_count; + if (!block->page_count) + continue; /* Skip 'filler blocks' */ + + page_count= block->page_count; + if (block->used & BLOCKUSED_TAIL) + { + /* The bitmap page is only one page */ + page_count= 1; + if (block->used & BLOCKUSED_USED) + { + DBUG_PRINT("info", ("tail empty_space: %u", block->empty_space)); + bits= free_size_to_tail_pattern(bitmap, block->empty_space); + } + else + bits= block->org_bitmap_value; + + /* + The page has all bits set; The following test is an optimization + to not set the bits to the same value as before. 
+ */ + if (bits != FULL_TAIL_PAGE && + set_page_bits(info, bitmap, block->page, bits)) + goto err; + } + if (!(block->used & BLOCKUSED_USED) && + _ma_reset_full_page_bits(info, bitmap, + block->page, page_count)) + goto err; + } + pthread_mutex_unlock(&info->s->bitmap.bitmap_lock); + DBUG_RETURN(0); + +err: + pthread_mutex_unlock(&info->s->bitmap.bitmap_lock); + DBUG_RETURN(1); +} + + +/* + Free full pages from bitmap and pagecache + + SYNOPSIS + _ma_bitmap_free_full_pages() + info Maria handle + extents Extents (as stored on disk) + count Number of extents + + IMPLEMENTATION + Mark all full pages (not tails) from extents as free, both in bitmap + and page cache. + + RETURN + 0 ok + 1 error (Couldn't write or read bitmap page) +*/ + +my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents, + uint count) +{ + DBUG_ENTER("_ma_bitmap_free_full_pages"); + + pthread_mutex_lock(&info->s->bitmap.bitmap_lock); + for (; count--; extents += ROW_EXTENT_SIZE) + { + ulonglong page= uint5korr(extents); + uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE); + if (!(page_count & TAIL_BIT)) + { + if (pagecache_delete_pages(info->s->pagecache, &info->dfile, page, + page_count, PAGECACHE_LOCK_WRITE, 1)) + DBUG_RETURN(1); + if (_ma_reset_full_page_bits(info, &info->s->bitmap, page, page_count)) + { + pthread_mutex_unlock(&info->s->bitmap.bitmap_lock); + DBUG_RETURN(1); + } + } + } + pthread_mutex_unlock(&info->s->bitmap.bitmap_lock); + DBUG_RETURN(0); +} + + +/* + Mark in the bitmap how much free space there is on a page + + SYNOPSIS + _ma_bitmap_set() + info Mari handler + page Adress to page + head 1 if page is a head page, 0 if tail page + empty_space How much empty space there is on page + + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_bitmap_set(MARIA_HA *info, ulonglong page, my_bool head, + uint empty_space) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + uint bits; + my_bool res; + DBUG_ENTER("_ma_bitmap_set"); + + 
pthread_mutex_lock(&info->s->bitmap.bitmap_lock); + bits= (head ? + _ma_free_size_to_head_pattern(bitmap, empty_space) : + free_size_to_tail_pattern(bitmap, empty_space)); + res= set_page_bits(info, bitmap, page, bits); + pthread_mutex_unlock(&info->s->bitmap.bitmap_lock); + DBUG_RETURN(res); +} + + +/* + Check that bitmap pattern is correct for a page + + NOTES + Used in maria_chk + + SYNOPSIS + _ma_check_bitmap_data() + info Maria handler + page_type What kind of page this is + page Adress to page + empty_space Empty space on page + bitmap_pattern Store here the pattern that was in the bitmap for the + page. This is always updated. + + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_check_bitmap_data(MARIA_HA *info, + enum en_page_type page_type, ulonglong page, + uint empty_space, uint *bitmap_pattern) +{ + uint bits; + switch (page_type) { + case UNALLOCATED_PAGE: + case MAX_PAGE_TYPE: + bits= 0; + break; + case HEAD_PAGE: + bits= _ma_free_size_to_head_pattern(&info->s->bitmap, empty_space); + break; + case TAIL_PAGE: + bits= free_size_to_tail_pattern(&info->s->bitmap, empty_space); + break; + case BLOB_PAGE: + bits= FULL_TAIL_PAGE; + break; + } + return (*bitmap_pattern= get_page_bits(info, &info->s->bitmap, page)) != + bits; +} + + +/* + Check if the page type matches the one that we have in the bitmap + + SYNOPSIS + _ma_check_if_right_bitmap_type() + info Maria handler + page_type What kind of page this is + page Adress to page + bitmap_pattern Store here the pattern that was in the bitmap for the + page. This is always updated. 
+ + NOTES + Used in maria_chk + + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info, + enum en_page_type page_type, + ulonglong page, + uint *bitmap_pattern) +{ + if ((*bitmap_pattern= get_page_bits(info, &info->s->bitmap, page)) > 7) + return 1; /* Couldn't read page */ + switch (page_type) { + case HEAD_PAGE: + return *bitmap_pattern < 1 || *bitmap_pattern > 4; + case TAIL_PAGE: + return *bitmap_pattern < 5; + case BLOB_PAGE: + return *bitmap_pattern != 7; + default: + break; + } + DBUG_ASSERT(0); + return 1; +} + + +/** + @brief create the first bitmap page of a freshly created data file + + @param share table's share + + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +int _ma_bitmap_create_first(MARIA_SHARE *share) +{ + uint block_size= share->bitmap.block_size; + File file= share->bitmap.file.file; + if (my_chsize(file, block_size - sizeof(maria_bitmap_marker), + 0, MYF(MY_WME)) || + my_pwrite(file, maria_bitmap_marker, sizeof(maria_bitmap_marker), + block_size - sizeof(maria_bitmap_marker), + MYF(MY_NABP | MY_WME))) + return 1; + share->state.state.data_file_length= block_size; + _ma_bitmap_delete_all(share); + return 0; +} diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c new file mode 100644 index 00000000000..b12035c9cfa --- /dev/null +++ b/storage/maria/ma_blockrec.c @@ -0,0 +1,5279 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Storage of records in block + + Some clarifications about the abbreviations used: + + NULL fields -> Fields that may contain a NULL value. + Not null fields -> Fields that may not contain a NULL value. + Critical fields -> Fields that can't be null and can't be dropped without + causing a table reorganization. + + + Maria will have an LSN at start of each page (excluding the bitmap pages) + + The different page types that are in a data file are: + + Bitmap pages Map of free pages in the next extent (8192 page size + gives us 256M of mapped pages / bitmap) + Head page The start of each row is stored on this page. + A rowid always points to a head page + Blob page This page is totally filled with data from one blob or by + a set of long VARCHAR/CHAR fields + Tail page This contains the last part from different rows, blobs + or varchar fields. + + The data file starts with a bitmap page, followed by as many data + pages as the bitmap can cover. After this there is a new bitmap page + and more data pages etc. + + For information about the bitmap page, see ma_bitmap.c + + Structure of data and tail page: + + The page has a row directory at end of page to allow us to do deletes + without having to reorganize the page. It also allows us to later store + some more bytes after each row to allow them to grow without having to move + around other rows. + + Page header: + + LSN 7 bytes Log position for last page change + PAGE_TYPE 1 uchar 1 for head / 2 for tail / 3 for blob + NO 1 uchar Number of row/tail entries on page + empty space 2 bytes Empty space on page + + The most significant bit in PAGE_TYPE is set to 1 if the data on the page + can be compacted to get more space. 
(PAGE_CAN_BE_COMPACTED) + + Row data + + Row directory of NO entries, which consists of the following for each row + (in reverse order; i.e., first record is stored last): + + Position 2 bytes Position of row on page + Length 2 bytes Length of entry + + For Position and Length, the 1 most significant bit of the position and + the 1 most significant bit of the length could be used for some states of + the row (in other words, we should try to keep these reserved) + + eof flag 1 uchar Reserved for full page read testing. (I.e., did the + previous write get the whole block on disk?) + + ---------------- + + Structure of blob pages: + + LSN 7 bytes Log position for last page change + PAGE_TYPE 1 uchar 3 + + data + + ----------------- + + Row data structure: + + Flag 1 uchar Marker of which header fields exist + TRANSID 6 bytes TRANSID of changing transaction + (optional, added on insert and first + update/delete) + VER_PTR 7 bytes Pointer to older version in log + (undo record) + (optional, added after first + update/delete) + DELETE_TRANSID 6 bytes (optional). TRANSID of original row. + Added on delete. + Nulls_extended 1 uchar To allow us to add new DEFAULT NULL + fields (optional, added after first + change of row after alter table) + Number of ROW_EXTENT's 1-3 uchar Length encoded, optional + This is the number of extents the + row is split into + First row_extent 7 uchar Pointer to first row extent (optional) + + Total length of length array 1-3 uchar Only used if we have + char/varchar/blob fields. + Row checksum 1 uchar Only if table created with checksums + Null_bits .. One bit for each NULL field (a field that may + have the value NULL) + Empty_bits .. One bit for each field that may be 'empty'. + (Both for null and not null fields). + This bit is 1 if the value for the field is + 0 or empty string. + + field_offsets 2 byte/offset + For each 32'th field, there is one offset + that points to where the field information + starts in the block. 
This is to provide + fast access to later field in the row + when we only need to return a small + set of fields. + TODO: Implement this. + + Things marked above as 'optional' will only be present if the + corresponding bit is set in 'Flag' field. Flag gives us a way to + get more space on a page when doing page compaction as we don't need + to store TRANSID that have committed before the smallest running + transaction we have in memory. + + Data in the following order: + (Field order is precalculated when table is created) + + Critical fixed length, not null, fields. (Note, these can't be dropped) + Fixed length, null fields + + Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields. + Number of bytes used in length array per entry is depending on max length + for field. + + ROW_EXTENT's + CHAR data (space stripped) + VARCHAR data + BLOB data + + Fields marked in null_bits or empty_bits are not stored in data part or + length array. + + If row doesn't fit into the given block, then the first EXTENT will be + stored last on the row. This is done so that we don't break any field + data in the middle. + + We first try to store the full row into one block. If that's not possible + we move out each big blob into their own extents. If this is not enough we + move out a concatenation of all varchars to their own extent. + + Each blob and the concatenated char/varchar fields are stored the following + way: + - Store the parts in as many full-contiguous pages as possible. + - The last part, that doesn't fill a full page, is stored in tail page. + + When doing an insert of a new row, we don't have to have + VER_PTR in the row. This will make rows that are not changed stored + efficiently. On update and delete we would add TRANSID (if it was an old + committed row) and VER_PTR to + the row. 
On row page compaction we can easily detect rows where + TRANSID was committed before the longest running transaction + started and we can then delete TRANSID and VER_PTR from the row to + gain more space. + + If a row is deleted in Maria, we change TRANSID to the deleting + transaction's id, change VER_PTR to point to the undo record for the delete, + and add DELETE_TRANSID (the id of the transaction which last + inserted/updated the row before its deletion). DELETE_TRANSID allows an old + transaction to avoid reading the log to know if it can see the last version + before delete (in other words it reduces the probability of having to follow + VER_PTR). TODO: depending on a compilation option, evaluate the performance + impact of not storing DELETE_TRANSID (which would make the row smaller). + + Description of the different parts: + + Flag is coded as: + + Description bit + TRANS_ID_exists 0 + VER_PTR_exists 1 + Row is deleted 2 (Means that DELETE_TRANSID exists) + Nulls_extended_exists 3 + Row is split 7 This means that 'Number_of_row_extents' exists + + Nulls_extended is the number of new DEFAULT NULL fields in the row + compared to the number of DEFAULT NULL fields when the first version + of the table was created. If Nulls_extended doesn't exist in the row, + we know it's 0 as this must be one of the original rows from when the + table was created the first time. This coding allows us to add 255*8 = + 2040 new fields without requiring a full alter table. + + Empty_bits is used to allow us to store 0, 0.0, empty string, empty + varstring and empty blob efficiently. (This is very good for data + warehousing where NULL's are often regarded as evil). Having this + bitmap also allows us to drop information of a field during a future + delete if field was deleted with ALTER TABLE DROP COLUMN. To be able + to handle DROP COLUMN, we must store in the index header the fields + that have been dropped. When unpacking a row we will ignore dropped + fields. 
When storing a row, we will mark a dropped field either with a + null in the null bit map or in the empty_bits and not store any data + for it. + TODO: Add code for handling dropped fields. + + + A ROW EXTENT is a range of pages. One ROW_EXTENT is coded as: + + START_PAGE 5 bytes + PAGE_COUNT 2 bytes. High bit is used to indicate tail page/ + end of blob + With 8K pages, we can cover 256M in one extent. This coding gives us a + maximum file size of 2^40*8192 = 8192 tera + + As an example of ROW_EXTENT handling, assume a row with one integer + field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2 + big BLOB fields that we have updated. + + The record format for storing this into an empty file would be: + + Page 1: + + 00 00 00 00 00 00 00 LSN + 01 Only one row in page + xx xx Empty space on page + + 10 Flag: row split, VER_PTR exists + 01 00 00 00 00 00 TRANSID 1 + 00 00 00 00 00 01 00 VER_PTR to first block in LOG file 1 + 5 Number of row extents + 02 00 00 00 00 03 00 VARCHAR's are stored in full pages 2,3,4 + 0 No null fields + 0 No empty fields + 05 00 00 00 00 00 80 Tail page for VARCHAR, rowid 0 + 06 00 00 00 00 80 00 First blob, stored at page 6-133 + 05 00 00 00 00 01 80 Tail of first blob (896 bytes) at page 5 + 86 00 00 00 00 80 00 Second blob, stored at page 134-261 + 05 00 00 00 00 02 80 Tail of second blob (896 bytes) at page 5 + 05 00 5 integer + FA Length of first varchar field (size 250) + 00 60 Length of second varchar field (size 8192*3) + 00 60 10 First medium BLOB, 1M + 01 00 10 00 Second BLOB, 1M + xx xx xx xx xx xx Varchars are stored here until end of page + + ..... until end of page + + 09 00 F4 1F 00 (Start position 9, length 8180, end byte) +*/ + +#define SANITY_CHECKS + +#include "maria_def.h" +#include "ma_blockrec.h" +#include <lf.h> +#include "trnman.h" + +/* + Struct for having a cursor over a set of extents. + This is used to loop over all extents for a row when reading + the row data. 
It's also used to store the tail positions for + a read row to be used by a later update/delete command. +*/ + +typedef struct st_maria_extent_cursor +{ + /* + Pointer to packed uchar array of extents for the row. + Format is described above in the header + */ + uchar *extent; + /* Where data starts on page; Only for debugging */ + uchar *data_start; + /* Position to all tails in the row. Updated when reading a row */ + MARIA_RECORD_POS *tail_positions; + /* Current page */ + ulonglong page; + /* How many pages in the page region */ + uint page_count; + /* What kind of lock to use for tail pages */ + enum pagecache_page_lock lock_for_tail_pages; + /* Total number of extents (i.e., entries in the 'extent' slot) */ + uint extent_count; + /* <> 0 if current extent is a tail page; Set while using cursor */ + uint tail; + /* Position for tail on tail page */ + uint tail_row_nr; + /* + == 1 if we are working on the first extent (i.e., the one that is stored in + the row header, not an extent that is stored as part of the row data). 
+ */ + my_bool first_extent; +} MARIA_EXTENT_CURSOR; + + +static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails); +static my_bool delete_head_or_tail(MARIA_HA *info, + ulonglong page, uint record_number, + my_bool head, my_bool from_update); +#ifndef DBUG_OFF +static void _ma_print_directory(uchar *buff, uint block_size); +#endif +static void compact_page(uchar *buff, uint block_size, uint rownr, + my_bool extend_block); +static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block, + uint block_size, ulong length); +static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record, + LEX_STRING *log_parts, + uint *log_parts_count); +static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec, + const uchar *newrec, + LEX_STRING *log_parts, + uint *log_parts_count); + +/**************************************************************************** + Initialization +****************************************************************************/ + +/* + Initialize data needed for block structures +*/ + + +/* Size of the different header elements for a row */ + +static uchar header_sizes[]= +{ + TRANSID_SIZE, + VERPTR_SIZE, + TRANSID_SIZE, /* Delete transid */ + 1 /* Null extends */ +}; + +/* + Calculate array of all used headers + + Used to speed up: + + size= 1; + if (flag & 1) + size+= TRANSID_SIZE; + if (flag & 2) + size+= VERPTR_SIZE; + if (flag & 4) + size+= TRANSID_SIZE + if (flag & 8) + size+= 1; + + NOTES + This is called only once at startup of Maria +*/ + +static uchar total_header_size[1 << array_elements(header_sizes)]; +#define PRECALC_HEADER_BITMASK (array_elements(total_header_size) -1) + +void _ma_init_block_record_data(void) +{ + uint i; + bzero(total_header_size, sizeof(total_header_size)); + total_header_size[0]= FLAG_SIZE; /* Flag uchar */ + for (i= 1; i < array_elements(total_header_size); i++) + { + uint size= FLAG_SIZE, j, bit; + for (j= 0; (bit= (1 << j)) <= i; j++) + { + if (i & bit) + size+= 
header_sizes[j]; + } + total_header_size[i]= size; + } +} + + +/* + One-time initialization of block record data for a share + + SYNOPSIS + _ma_once_init_block_record() + share Share to initialize + data_file Data file; passed on to _ma_bitmap_init() + + IMPLEMENTATION + Sets share->base.max_data_file_length to + (2^((rec_reflength-1)*8) - 1) * block_size, limits it to INT_MAX32 when + SIZEOF_OFF_T is 4, and then calls _ma_bitmap_init(). + + RETURN + Return value of _ma_bitmap_init() +*/ + +my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file) +{ + + share->base.max_data_file_length= + (((ulonglong) 1 << ((share->base.rec_reflength-1)*8))-1) * + share->block_size; +#if SIZEOF_OFF_T == 4 + set_if_smaller(share->base.max_data_file_length, INT_MAX32); +#endif + return _ma_bitmap_init(share, data_file); +} + + +/* + One-time cleanup of block record data for a share + + SYNOPSIS + _ma_once_end_block_record() + share Share to clean up + + IMPLEMENTATION + Calls _ma_bitmap_end(). If share->bitmap.file.file is >= 0, the page + cache blocks of that file are flushed (with FLUSH_IGNORE_CHANGED for + temporary shares, FLUSH_RELEASE otherwise), the file is synced if + share->now_transactional is set, and the file is closed and its + descriptor set to -1. If share->id is not 0, + translog_deassign_id_from_share() is called. + + RETURN + Result of _ma_bitmap_end(), or 1 if the flush, sync or close of the + bitmap file failed +*/ + +my_bool _ma_once_end_block_record(MARIA_SHARE *share) +{ + int res= _ma_bitmap_end(share); + if (share->bitmap.file.file >= 0) + { + if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file, + share->temporary ? FLUSH_IGNORE_CHANGED : + FLUSH_RELEASE)) + res= 1; + /* + File must be synced as it is going out of the maria_open_list and so + becoming unknown to Checkpoint. + */ + if (share->now_transactional && + my_sync(share->bitmap.file.file, MYF(MY_WME))) + res= 1; + if (my_close(share->bitmap.file.file, MYF(MY_WME))) + res= 1; + /* + Trivial assignment to guard against multiple invocations + (May happen if file are closed but we want to keep the maria object + around a bit longer) + */ + share->bitmap.file.file= -1; + } + if (share->id != 0) + translog_deassign_id_from_share(share); + return res; +} + + +/* Init info->cur_row structure */ + +my_bool _ma_init_block_record(MARIA_HA *info) +{ + MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row; + DBUG_ENTER("_ma_init_block_record"); + + if (!my_multi_malloc(MY_WME, + &row->empty_bits, info->s->base.pack_bytes, + &row->field_lengths, + info->s->base.max_field_lengths + 2, + &row->blob_lengths, sizeof(ulong) * info->s->base.blobs, + &row->null_field_lengths, (sizeof(uint) * + (info->s->base.fields - + info->s->base.blobs + + EXTRA_LENGTH_FIELDS)), + &row->tail_positions, (sizeof(MARIA_RECORD_POS) * + (info->s->base.blobs + 2)), + &new_row->empty_bits, info->s->base.pack_bytes, + &new_row->field_lengths, + info->s->base.max_field_lengths + 2, + &new_row->blob_lengths, + sizeof(ulong) * info->s->base.blobs, + 
&new_row->null_field_lengths, (sizeof(uint) * + (info->s->base.fields - + info->s->base.blobs + + EXTRA_LENGTH_FIELDS)), + &info->log_row_parts, + sizeof(*info->log_row_parts) * + (TRANSLOG_INTERNAL_PARTS + 2 + + info->s->base.fields + 3), + &info->update_field_data, + (info->s->base.fields * 4 + + info->s->base.max_field_lengths + 1 + 4), + NullS, 0)) + DBUG_RETURN(1); + /* Skip over bytes used to store length of field length for logging */ + row->field_lengths+= 2; + new_row->field_lengths+= 2; + if (my_init_dynamic_array(&info->bitmap_blocks, + sizeof(MARIA_BITMAP_BLOCK), + ELEMENTS_RESERVED_FOR_MAIN_PART, 16)) + goto err; + /* The following should be big enough for all purposes */ + if (my_init_dynamic_array(&info->pinned_pages, + sizeof(MARIA_PINNED_PAGE), + max(info->s->base.blobs*2 + 4, + MARIA_MAX_TREE_LEVELS*2), 16)) + goto err; + row->base_length= new_row->base_length= info->s->base_length; + + /* + We need to reserve 'EXTRA_LENGTH_FIELDS' number of parts in + null_field_lengths to allow splitting of rows in 'find_where_to_split_row' + */ + + row->null_field_lengths+= EXTRA_LENGTH_FIELDS; + new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS; + + DBUG_RETURN(0); + +err: + _ma_end_block_record(info); + DBUG_RETURN(1); +} + + +void _ma_end_block_record(MARIA_HA *info) +{ + DBUG_ENTER("_ma_end_block_record"); + my_free((uchar*) info->cur_row.empty_bits, MYF(MY_ALLOW_ZERO_PTR)); + delete_dynamic(&info->bitmap_blocks); + delete_dynamic(&info->pinned_pages); + my_free((uchar*) info->cur_row.extents, MYF(MY_ALLOW_ZERO_PTR)); + /* + The data file is closed, when needed, in ma_once_end_block_record(). 
+ The following protects us from doing an extra, not allowed, close + in maria_close() + */ + info->dfile.file= -1; + DBUG_VOID_RETURN; +} + + +/**************************************************************************** + Helper functions +****************************************************************************/ + +/* + Return the position on the page where the entry that follows a + directory entry starts. + + SYNOPSIS + start_of_next_entry() + dir Directory entry to be used. This can not be the + last entry on the page! + + NOTES + Directory entries at lower addresses than 'dir' are scanned, skipping + those whose two position bytes are both zero. + + RETURN + # Position in page where next entry starts. + Everything between the '*dir' and this are free to be used. +*/ + +static inline uint start_of_next_entry(uchar *dir) +{ + uchar *entry= dir - DIR_ENTRY_SIZE; + /* + Step over entries that hold no position. (There is always a used entry + to stop at as the directory never starts with a deleted entry) + */ + while (entry[0] == 0 && entry[1] == 0) + entry-= DIR_ENTRY_SIZE; + return (uint) uint2korr(entry); +} + + +/* + Return the offset where the previous entry ends (before on page) + + SYNOPSIS + end_of_previous_entry() + dir Address for current directory entry + end Address to last directory entry + + RETURN + # Position where previous entry ends (smallest address on page) + Everything between # and current entry are free to be used. 
+*/ + + +static inline uint end_of_previous_entry(uchar *dir, uchar *end) +{ + uchar *pos; + for (pos= dir + DIR_ENTRY_SIZE ; pos < end ; pos+= DIR_ENTRY_SIZE) + { + uint offset; + if ((offset= uint2korr(pos))) + return offset + uint2korr(pos+2); + } + return PAGE_HEADER_SIZE; +} + + +/** + @brief Extend a record area to fit a given size block + + @fn extend_area_on_page() + @param buff Page buffer + @param dir Pointer to dir entry in buffer + @param rownr Row number we working on + @param block_size Block size of buffer + @param request_length How much data we want to put at [dir] + @param empty_space Total empty space in buffer + + IMPLEMENTATION + The logic is as follows (same as in _ma_update_block_record()) + - If new data fits in old block, use old block. + - Extend block with empty space before block. If enough, use it. + - Extend block with empty space after block. If enough, use it. + - Use compact_page() to get all empty space at dir. + + RETURN + @retval 0 ok + @retval ret_offset Pointer to store offset to found area + @retval ret_length Pointer to store length of found area + @retval [dir] rec_offset is store here too + + @retval 1 error (wrong info in block) +*/ + +static my_bool extend_area_on_page(uchar *buff, uchar *dir, + uint rownr, uint block_size, + uint request_length, + uint *empty_space, uint *ret_offset, + uint *ret_length) +{ + uint rec_offset, length; + DBUG_ENTER("extend_area_on_page"); + + rec_offset= uint2korr(dir); + length= uint2korr(dir + 2); + DBUG_PRINT("enter", ("rec_offset: %u length: %u request_length: %u", + rec_offset, length, request_length)); + + *empty_space+= length; + if (length < request_length) + { + uint max_entry= (uint) ((uchar*) buff)[DIR_COUNT_OFFSET]; + uint old_rec_offset; + /* + New data did not fit in old position. + Find first possible position where to put new data. 
+ */ + old_rec_offset= rec_offset; + rec_offset= end_of_previous_entry(dir, buff + block_size - + PAGE_SUFFIX_SIZE); + length+= (uint) (old_rec_offset - rec_offset); + /* + old_rec_offset is 0 if we are doing an insert into a not allocated block. + This can only happen during REDO of INSERT + */ + if (!old_rec_offset || length < request_length) + { + /* + Did not fit in current block + empty space. Extend with + empty space after block. + */ + if (rownr == max_entry - 1) + { + /* Last entry; Everything is free between this and directory */ + length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) - + rec_offset); + } + else + length= start_of_next_entry(dir) - rec_offset; + DBUG_ASSERT((int) length > 0); + if (length < request_length) + { + /* Not enough continues space, compact page to get more */ + int2store(dir, rec_offset); + compact_page(buff, block_size, rownr, 1); + rec_offset= uint2korr(dir); + length= uint2korr(dir+2); + if (length < request_length) + DBUG_RETURN(1); /* Error in block */ + *empty_space= length; /* All space is here */ + } + } + } + int2store(dir, rec_offset); + *ret_offset= rec_offset; + *ret_length= length; + DBUG_RETURN(0); +} + + +/* + Check that a region is all zero + + SYNOPSIS + check_if_zero() + pos Start of memory to check + length length of memory region + + NOTES + Used mainly to detect rows with wrong extent information +*/ + +static my_bool check_if_zero(uchar *pos, uint length) +{ + uchar *end; + for (end= pos+ length; pos != end ; pos++) + if (pos[0] != 0) + return 1; + return 0; +} + + +/* + @brief Copy not changed fields from 'from' to 'to' + + @notes + Assumption is that most fields are not changed! 
+ (Which is why we don't test if all bits are set for some bytes in bitmap) +*/ + +void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields, + uchar *to, uchar *from) +{ + MARIA_COLUMNDEF *column, *end_column; + uchar *bitmap= (uchar*) changed_fields->bitmap; + MARIA_SHARE *share= info->s; + uint bit= 1; + + for (column= share->columndef, end_column= column+ share->base.fields; + column < end_column; column++) + { + if (!(*bitmap & bit)) + { + uint field_length= column->length; + if (column->type == FIELD_VARCHAR) + { + if (column->fill_length == 1) + field_length= (uint) from[column->offset] + 1; + else + field_length= uint2korr(from + column->offset) + 2; + } + memcpy(to + column->offset, from + column->offset, field_length); + } + if ((bit= (bit << 1)) == 256) + { + bitmap++; + bit= 1; + } + } +} + + +/* + Unpin all pinned pages + + SYNOPSIS + _ma_unpin_all_pages() + info Maria handler + undo_lsn LSN for undo pages. LSN_IMPOSSIBLE if we shouldn't write undo + (error) + + NOTE + We unpin pages in the reverse order as they where pinned; This may not + be strictly necessary but may simplify things in the future. 
+ + RETURN + 0 ok + 1 error (fatal disk error) + +*/ + +void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn) +{ + MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*) + dynamic_array_ptr(&info->pinned_pages, 0)); + MARIA_PINNED_PAGE *pinned_page= page_link + info->pinned_pages.elements; + DBUG_ENTER("_ma_unpin_all_pages"); + DBUG_PRINT("info", ("undo_lsn: %lu", (ulong) undo_lsn)); + + /* True if not disk error */ + DBUG_ASSERT((undo_lsn != LSN_IMPOSSIBLE) || !info->s->now_transactional); + + if (!info->s->now_transactional) + undo_lsn= LSN_IMPOSSIBLE; /* don't try to set a LSN on pages */ + + while (pinned_page-- != page_link) + pagecache_unlock_by_link(info->s->pagecache, pinned_page->link, + pinned_page->unlock, PAGECACHE_UNPIN, + info->trn->rec_lsn, undo_lsn); + + info->pinned_pages.elements= 0; + DBUG_VOID_RETURN; +} + + +#ifdef NOT_YET_NEEDED +/* Calculate empty space on a page */ + +static uint empty_space_on_page(uchar *buff, uint block_size) +{ + enum en_page_type; + page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] & + ~(uchar) PAGE_CAN_BE_COMPACTED); + if (page_type == UNALLOCATED_PAGE) + return block_size; + if ((uint) page_type <= TAIL_PAGE) + return uint2korr(buff+EMPTY_SPACE_OFFSET); + return 0; /* Blob page */ +} +#endif + +/** + When we have finished the write/update/delete of a row, we have cleanups to + do. For now it is signalling to Checkpoint that all dirtied pages have + their rec_lsn set and page LSN set (_ma_unpin_all_pages() has been called), + and that bitmap pages are correct (_ma_bitmap_release_unused() has been + called). 
+*/ +#define _ma_finalize_row(info) \ + do { info->trn->rec_lsn= LSN_IMPOSSIBLE; } while(0) +/** unpinning is often the last operation before finalizing: */ +#define _ma_unpin_all_pages_and_finalize_row(info,undo_lsn) do \ + { \ + _ma_unpin_all_pages(info, undo_lsn); \ + _ma_finalize_row(info); \ + } while(0) + + +/* + Find free position in directory + + SYNOPSIS + find_free_position() + buff Page + block_size Size of page + res_rownr Store index to free position here + res_length Store length of found segment here + empty_space Store length of empty space on disk here. This is + all empty space, including the found block. + + NOTES + If there is a free directory entry (entry with position == 0), + then use it and change it to be the size of the empty block + after the previous entry. This guarantees that all row entries + are stored on disk in inverse directory order, which makes life easier for + 'compact_page()' and to know if there is free space after any block. + + If there is no free entry (entry with position == 0), then we create + a new one. If there is not space for the directory entry (because + the last block overlapps with the directory), we compact the page. + + We will update the offset and the length of the found dir entry to + match the position and empty space found. 
+ + buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller + + RETURN + 0 Error (directory full or last block goes over directory) + # Pointer to directory entry on page +*/ + +static uchar *find_free_position(uchar *buff, uint block_size, uint *res_rownr, + uint *res_length, uint *empty_space) +{ + uint max_entry= (uint) ((uchar*) buff)[DIR_COUNT_OFFSET]; + uint entry, length, first_pos; + uchar *dir, *end; + DBUG_ENTER("find_free_position"); + DBUG_PRINT("info", ("max_entry: %u", max_entry)); + + dir= (buff + block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE); + end= buff + block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE; + + *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + + /* Search after first empty position */ + first_pos= PAGE_HEADER_SIZE; + for (entry= 0 ; dir <= end ; end-= DIR_ENTRY_SIZE, entry++) + { + uint tmp= uint2korr(end); + if (!tmp) /* Found not used entry */ + { + length= start_of_next_entry(end) - first_pos; + int2store(end, first_pos); /* Update dir entry */ + int2store(end + 2, length); + *res_rownr= entry; + *res_length= length; + DBUG_RETURN(end); + } + first_pos= tmp + uint2korr(end + 2); + } + /* No empty places in dir; create a new one */ + dir= end; + /* Check if there is place for the directory entry */ + if (max_entry == MAX_ROWS_PER_PAGE) + DBUG_RETURN(0); + /* Check if there is place for the directory entry */ + if ((uint) (dir - buff) < first_pos) + { + /* Create place for directory */ + compact_page(buff, block_size, max_entry-1, 0); + first_pos= (uint2korr(end + DIR_ENTRY_SIZE) + + uint2korr(end + DIR_ENTRY_SIZE+ 2)); + *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + } + buff[DIR_COUNT_OFFSET]= (uchar) (uchar) max_entry+1; + length= (uint) (dir - buff - first_pos); + DBUG_ASSERT(length <= *empty_space - DIR_ENTRY_SIZE); + int2store(dir, first_pos); + int2store(dir+2, length); /* Current max length */ + *res_rownr= max_entry; + *res_length= length; + + /* Reduce directory entry size from free space 
size */ + (*empty_space)-= DIR_ENTRY_SIZE; + DBUG_RETURN(dir); +} + + +/**************************************************************************** + Updating records +****************************************************************************/ + +/* + Calculate length of all the different field parts + + SYNOPSIS + calc_record_size() + info Maria handler + record Row to store + row Store statistics about row here + + NOTES + The statistics is used to find out how much space a row will need + and also where we can split a row when we need to split it into several + extents. +*/ + +static void calc_record_size(MARIA_HA *info, const uchar *record, + MARIA_ROW *row) +{ + MARIA_SHARE *share= info->s; + uchar *field_length_data; + MARIA_COLUMNDEF *column, *end_column; + uint *null_field_lengths= row->null_field_lengths; + ulong *blob_lengths= row->blob_lengths; + DBUG_ENTER("calc_record_size"); + + row->normal_length= row->char_length= row->varchar_length= + row->blob_length= row->extents_count= 0; + + /* Create empty bitmap and calculate length of each varlength/char field */ + bzero(row->empty_bits, share->base.pack_bytes); + field_length_data= row->field_lengths; + for (column= share->columndef + share->base.fixed_not_null_fields, + end_column= share->columndef + share->base.fields; + column < end_column; column++, null_field_lengths++) + { + if ((record[column->null_pos] & column->null_bit)) + { + if (column->type != FIELD_BLOB) + *null_field_lengths= 0; + else + *blob_lengths++= 0; + continue; + } + switch (column->type) { + case FIELD_CHECK: + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_ZERO: + DBUG_ASSERT(column->empty_bit == 0); + /* fall through */ + case FIELD_SKIP_PRESPACE: /* Not packed */ + row->normal_length+= column->length; + *null_field_lengths= column->length; + break; + case FIELD_SKIP_ZERO: /* Fixed length field */ + if (memcmp(record+ column->offset, maria_zero_string, + column->length) == 0) + { + row->empty_bits[column->empty_pos] 
|= column->empty_bit; + *null_field_lengths= 0; + } + else + { + row->normal_length+= column->length; + *null_field_lengths= column->length; + } + break; + case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + /* uchar pointers match the type of 'record' (no pointer-sign mismatch) */ + const uchar *pos, *end; + for (pos= record + column->offset, end= pos + column->length; + end > pos && end[-1] == ' '; end--) + ; + if (pos == end) /* If empty string */ + { + row->empty_bits[column->empty_pos]|= column->empty_bit; + *null_field_lengths= 0; + } + else + { + uint length= (uint) (end - pos); + if (column->length <= 255) + *field_length_data++= (uchar) length; + else + { + int2store(field_length_data, length); + field_length_data+= 2; + } + row->char_length+= length; + *null_field_lengths= length; + } + break; + } + case FIELD_VARCHAR: + { + uint length, field_length_data_length; + const uchar *field_pos= record + column->offset; + + /* 256 is correct as this includes the length uchar */ + field_length_data[0]= field_pos[0]; + if (column->length <= 256) + { + length= (uint) (uchar) *field_pos; + field_length_data_length= 1; + } + else + { + length= uint2korr(field_pos); + field_length_data[1]= field_pos[1]; + field_length_data_length= 2; + } + *null_field_lengths= length; + if (!length) + { + /* Empty string: only the empty bit is set, the length bytes are not kept */ + row->empty_bits[column->empty_pos]|= column->empty_bit; + break; + } + row->varchar_length+= length; + field_length_data+= field_length_data_length; + break; + } + case FIELD_BLOB: + { + const uchar *field_pos= record + column->offset; + uint size_length= column->length - portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(size_length, field_pos); + + *blob_lengths++= blob_length; + if (!blob_length) + row->empty_bits[column->empty_pos]|= column->empty_bit; + else + { + row->blob_length+= blob_length; + memcpy(field_length_data, field_pos, size_length); + field_length_data+= size_length; + } + break; + } + default: + DBUG_ASSERT(0); + } + } + row->field_lengths_length= (uint) (field_length_data - row->field_lengths); + 
row->head_length= (row->base_length + + share->base.fixed_not_null_fields_length + + row->field_lengths_length + + size_to_store_key_length(row->field_lengths_length) + + row->normal_length + + row->char_length + row->varchar_length); + row->total_length= (row->head_length + row->blob_length); + if (row->total_length < share->base.min_row_length) + row->total_length= share->base.min_row_length; + DBUG_PRINT("exit", ("head_length: %lu total_length: %lu", + (ulong) row->head_length, (ulong) row->total_length)); + DBUG_VOID_RETURN; +} + + +/* + Compact page by removing all space between rows + + IMPLEMENTATION + Move up all rows to start of page. + Move blocks that are directly after each other with one memmove. + + TODO LATER + Remove TRANSID from rows that are visible to all transactions + + SYNOPSIS + compact_page() + buff Page to compact + block_size Size of page + rownr Put empty data after this row + extend_block If 1, extend the block at 'rownr' to cover the + whole block. +*/ + + +static void compact_page(uchar *buff, uint block_size, uint rownr, + my_bool extend_block) +{ + uint max_entry= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET]; + uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block; + uchar *dir, *end; + DBUG_ENTER("compact_page"); + DBUG_PRINT("enter", ("rownr: %u", rownr)); + DBUG_ASSERT(max_entry > 0 && + max_entry < (block_size - PAGE_HEADER_SIZE - + PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE); + + /* Move all entries before and including rownr up to start of page */ + dir= buff + block_size - DIR_ENTRY_SIZE * (rownr+1) - PAGE_SUFFIX_SIZE; + end= buff + block_size - DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE; + page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE; + diff= 0; + for (; dir <= end ; end-= DIR_ENTRY_SIZE) + { + uint offset= uint2korr(end); + + if (offset) + { + uint row_length= uint2korr(end + 2); + DBUG_ASSERT(offset >= page_pos); + DBUG_ASSERT(buff + offset + row_length <= dir); + + if (offset != next_free_pos) + { + 
uint length= (next_free_pos - start_of_found_block); + /* + There was empty space before this and prev block + Check if we have to move previous block up to page start + */ + if (page_pos != start_of_found_block) + { + /* move up previous block */ + memmove(buff + page_pos, buff + start_of_found_block, length); + } + page_pos+= length; + /* next continuous block starts here */ + start_of_found_block= offset; + diff= offset - page_pos; + } + int2store(end, offset - diff); /* correct current pos */ + next_free_pos= offset + row_length; + } + } + if (page_pos != start_of_found_block) + { + uint length= (next_free_pos - start_of_found_block); + memmove(buff + page_pos, buff + start_of_found_block, length); + } + start_of_found_block= uint2korr(dir); + + if (rownr != max_entry - 1) + { + /* Move all entries after rownr to end of page */ + uint rownr_length; + next_free_pos= end_of_found_block= page_pos= + block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE; + diff= 0; + /* End points to entry before 'rownr' */ + for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE) + { + uint offset= uint2korr(dir); + uint row_length= uint2korr(dir + 2); + uint row_end= offset + row_length; + if (!offset) + continue; + DBUG_ASSERT(offset >= start_of_found_block && row_end <= next_free_pos); + + if (row_end != next_free_pos) + { + uint length= (end_of_found_block - next_free_pos); + if (page_pos != end_of_found_block) + { + /* move next block down */ + memmove(buff + page_pos - length, buff + next_free_pos, length); + } + page_pos-= length; + /* next continuous block starts here */ + end_of_found_block= row_end; + diff= page_pos - row_end; + } + int2store(dir, offset + diff); /* correct current pos */ + next_free_pos= offset; + } + if (page_pos != end_of_found_block) + { + uint length= (end_of_found_block - next_free_pos); + memmove(buff + page_pos - length, buff + next_free_pos, length); + next_free_pos= page_pos- length; + } + /* Extend rownr block to cover 
hole */ + rownr_length= next_free_pos - start_of_found_block; + int2store(dir+2, rownr_length); + } + else + { + if (extend_block) + { + /* Extend last block to cover the whole page */ + uint length= (uint) (dir - buff) - start_of_found_block; + int2store(dir+2, length); + } + else + { + /* + TODO: + Update (buff + EMPTY_SPACE_OFFSET) if we remove transid from rows + */ + } + buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED; + } + DBUG_EXECUTE("directory", _ma_print_directory(buff, block_size);); + DBUG_VOID_RETURN; +} + + +/* + Create an empty tail or head page + + SYNOPSIS + make_empty_page() + buff Page buffer + block_size Block size + page_type HEAD_PAGE or TAIL_PAGE + + NOTES + EMPTY_SPACE is not updated. + The whole block is zeroed, the directory count is set to 1 and the + first directory entry is set to point at PAGE_HEADER_SIZE. +*/ + +static void make_empty_page(uchar *buff, uint block_size, uint page_type) +{ + + bzero(buff, PAGE_HEADER_SIZE); + /* + We zero the rest of the block to avoid getting old memory information + to disk and to allow the file to be compressed better if archived. + The rest of the code does not assume the block is zeroed above + PAGE_OVERHEAD_SIZE + */ + bzero(buff+ PAGE_HEADER_SIZE, block_size - PAGE_HEADER_SIZE); + buff[PAGE_TYPE_OFFSET]= (uchar) page_type; + buff[DIR_COUNT_OFFSET]= 1; + /* Store position to the first row */ + int2store(buff + block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE, + PAGE_HEADER_SIZE); +} + + +/* + Read or initialize new head or tail page + + SYNOPSIS + get_head_or_tail_page() + info Maria handler + block Block to read + buff Suggest this buffer to key cache + length Minimum space needed + page_type HEAD_PAGE || TAIL_PAGE + lock Lock type given to pagecache_read() when an existing + page is read + res Store result position here + + NOTES + We don't decrement buff[EMPTY_SPACE_OFFSET] with the allocated data + as we don't know how much data the caller will actually use.
+ + RETURN + 0 ok All slots in 'res' are updated + 1 error my_errno is set +*/ + +struct st_row_pos_info +{ + uchar *buff; /* page buffer */ + uchar *data; /* Place for data */ + uchar *dir; /* Directory */ + uint length; /* Length for data */ + uint rownr; /* Offset in directory */ + uint empty_space; /* Space left on page */ +}; + + +static my_bool get_head_or_tail_page(MARIA_HA *info, + MARIA_BITMAP_BLOCK *block, + uchar *buff, uint length, uint page_type, + enum pagecache_page_lock lock, + struct st_row_pos_info *res) +{ + uint block_size; + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + DBUG_ENTER("get_head_or_tail_page"); + DBUG_PRINT("enter", ("length: %u", length)); + + block_size= share->block_size; + if (block->org_bitmap_value == 0) /* Empty block */ + { + /* New page */ + make_empty_page(buff, block_size, page_type); + res->buff= buff; + res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE); + res->data= (buff + PAGE_HEADER_SIZE); + res->dir= res->data + res->length; + res->rownr= 0; + DBUG_ASSERT(length <= res->length); + } + else + { + uchar *dir; + /* Read old page */ + DBUG_ASSERT(share->pagecache->block_size == block_size); + if (!(res->buff= pagecache_read(share->pagecache, + &info->dfile, + (my_off_t) block->page, 0, + buff, share->page_type, + lock, &page_link.link))) + DBUG_RETURN(1); + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + push_dynamic(&info->pinned_pages, (void*) &page_link); + + DBUG_ASSERT((res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type); + if (!(dir= find_free_position(res->buff, block_size, &res->rownr, + &res->length, &res->empty_space))) + goto crashed; + + if (res->length < length) + { + if (res->empty_space + res->length >= length) + { + compact_page(res->buff, block_size, res->rownr, 1); + /* All empty space are now after current position */ + dir= (res->buff + block_size - DIR_ENTRY_SIZE * res->rownr - + DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE); + res->length= res->empty_space= 
uint2korr(dir+2); + } + if (res->length < length) + { + DBUG_PRINT("error", ("length: %u res->length: %u empty_space: %u", + length, res->length, res->empty_space)); + goto crashed; /* Wrong bitmap information */ + } + } + res->dir= dir; + res->data= res->buff + uint2korr(dir); + } + DBUG_RETURN(0); + +crashed: + my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */ + DBUG_RETURN(1); +} + + +/* + Write tail for head data or blob + + SYNOPSIS + write_tail() + info Maria handler + block Block to tail page + row_part Data to write to page + length Length of data + + NOTES + block->page_count is updated to the directory offset for the tail + so that we can store the position in the row extent information + + RETURN + 0 ok + block->page_count is set to point (dir entry + TAIL_BIT) + + 1 error; In this case my_errno is set to the error +*/ + +static my_bool write_tail(MARIA_HA *info, + MARIA_BITMAP_BLOCK *block, + uchar *row_part, uint length) +{ + MARIA_SHARE *share= info->s; + MARIA_PINNED_PAGE page_link; + uint block_size= share->block_size, empty_space; + struct st_row_pos_info row_pos; + my_off_t position; + my_bool res, block_is_read; + DBUG_ENTER("write_tail"); + DBUG_PRINT("enter", ("page: %lu length: %u", + (ulong) block->page, length)); + + info->keyread_buff_used= 1; + + /* page will be pinned & locked by get_head_or_tail_page */ + if (get_head_or_tail_page(info, block, info->keyread_buff, length, + TAIL_PAGE, PAGECACHE_LOCK_WRITE, + &row_pos)) + DBUG_RETURN(1); + block_is_read= block->org_bitmap_value != 0; + + memcpy(row_pos.data, row_part, length); + + { + /* Log changes in tail block */ + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE]; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + LSN lsn; + + /* Log REDO changes of tail page */ + page_store(log_data + FILEID_STORE_SIZE, block->page); + dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + row_pos.rownr); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) 
log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char*) row_pos.data; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length; + if (translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_TAIL, + info->trn, info, sizeof(log_data) + length, + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data)) + DBUG_RETURN(1); + } + + /* + Don't allocate smaller block than MIN_TAIL_SIZE (we want to give rows + some place to grow in the future) + */ + if (length < MIN_TAIL_SIZE) + length= MIN_TAIL_SIZE; + int2store(row_pos.dir + 2, length); + empty_space= row_pos.empty_space - length; + int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space); + block->page_count= row_pos.rownr + TAIL_BIT; + /* + If there is less directory entries free than number of possible tails + we can write for a row, we mark the page full to ensure that we don't + during _ma_bitmap_find_place() allocate more entries on the tail page + than it can hold + */ + block->empty_space= ((uint) ((uchar*) row_pos.buff)[DIR_COUNT_OFFSET] <= + MAX_ROWS_PER_PAGE - 1 - share->base.blobs ? + empty_space : 0); + block->used= BLOCKUSED_USED | BLOCKUSED_TAIL; + + /* Increase data file size, if extended */ + position= (my_off_t) block->page * block_size; + if (info->state->data_file_length <= position) + info->state->data_file_length= position + block_size; + + DBUG_ASSERT(share->pagecache->block_size == block_size); + if (!(res= pagecache_write(share->pagecache, + &info->dfile, block->page, 0, + row_pos.buff,share->page_type, + block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ : + PAGECACHE_LOCK_READ, + block_is_read ? 
PAGECACHE_PIN_LEFT_PINNED : + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, &page_link.link))) + { + page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK; + if (block_is_read) + { + /* Change the lock used when we read the page */ + set_dynamic(&info->pinned_pages, (void*) &page_link, + info->pinned_pages.elements-1); + } + else + push_dynamic(&info->pinned_pages, (void*) &page_link); + } + DBUG_RETURN(res); +} + + +/* + Write full pages + + SYNOPSIS + write_full_pages() + info Maria handler + lsn LSN for the undo record + block Where to write data + data Data to write + length Length of data + + NOTES + Logging of the changes to the full pages are done in the caller + write_block_record(). + + RETURN + 0 ok + 1 error on write +*/ + +static my_bool write_full_pages(MARIA_HA *info, + LSN lsn, + MARIA_BITMAP_BLOCK *block, + uchar *data, ulong length) +{ + my_off_t page; + MARIA_SHARE *share= info->s; + uint block_size= share->block_size; + uint data_size= FULL_PAGE_SIZE(block_size); + uchar *buff= info->keyread_buff; + uint page_count; + my_off_t position; + DBUG_ENTER("write_full_pages"); + DBUG_PRINT("enter", ("length: %lu page: %lu page_count: %lu", + (ulong) length, (ulong) block->page, + (ulong) block->page_count)); + DBUG_ASSERT((block->page_count & TAIL_BIT) == 0); + + info->keyread_buff_used= 1; + page= block->page; + page_count= block->page_count; + + position= (my_off_t) (page + page_count) * block_size; + if (info->state->data_file_length < position) + info->state->data_file_length= position; + + /* Increase data file size, if extended */ + + for (; length; data+= data_size) + { + uint copy_length; + if (!page_count--) + { + block++; + page= block->page; + page_count= block->page_count - 1; + DBUG_PRINT("info", ("page: %lu page_count: %lu", + (ulong) block->page, (ulong) block->page_count)); + + position= (page + page_count + 1) * block_size; + if (info->state->data_file_length < position) + info->state->data_file_length= position; + } + lsn_store(buff, lsn); + 
buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE; + copy_length= min(data_size, length); + memcpy(buff + LSN_SIZE + PAGE_TYPE_SIZE, data, copy_length); + length-= copy_length; + /* + The last page may be only partly filled. Zero the rest of its data + area, as make_empty_page() does for head/tail pages, so that old + contents of info->keyread_buff are not written to disk. + */ + if (copy_length != data_size) + bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE + copy_length, + data_size - copy_length); + + DBUG_ASSERT(share->pagecache->block_size == block_size); + if (pagecache_write(share->pagecache, + &info->dfile, page, 0, + buff, share->page_type, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0)) + DBUG_RETURN(1); + page++; + block->used= BLOCKUSED_USED; + } + DBUG_RETURN(0); +} + + +/* + Store ranges of full pages in compact format for logging + + SYNOPSIS + store_page_range() + to Store data here + block Where pages are to be written + block_size block size + length Length of data to be written + Normally this is full pages, except for the last + tail block that may only partly fit the last page. + + RETURN + # end position for 'to' +*/ + +static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block, + uint block_size, ulong length) +{ + uint data_size= FULL_PAGE_SIZE(block_size); + ulong pages_left= (length + data_size -1) / data_size; + uint page_count; + DBUG_ENTER("store_page_range"); + + do + { + ulonglong page; + page= block->page; + page_count= block->page_count; + block++; + if (page_count > pages_left) + page_count= pages_left; + + page_store(to, page); + to+= PAGE_STORE_SIZE; + pagerange_store(to, page_count); + to+= PAGERANGE_STORE_SIZE; + } while ((pages_left-= page_count)); + DBUG_RETURN(to); +} + + +/* + Store packed extent data + + SYNOPSIS + store_extent_info() + to Store first packed data here + row_extents_second_part Store rest here + first_block First block to store + count Number of blocks + + NOTES + We don't have to store the position for the head block +*/ + +static void store_extent_info(uchar *to, + uchar *row_extents_second_part, + MARIA_BITMAP_BLOCK *first_block, + uint count) +{ + MARIA_BITMAP_BLOCK *block, *end_block; + uint copy_length; + my_bool first_found= 0; + + for (block= first_block, end_block= 
first_block+count ; + block < end_block; block++) + { + /* The following is only false for marker blocks */ + if (likely(block->used & BLOCKUSED_USED)) + { + DBUG_ASSERT(block->page_count != 0); + page_store(to, block->page); + pagerange_store(to + PAGE_STORE_SIZE, block->page_count); + to+= ROW_EXTENT_SIZE; + if (!first_found) + { + first_found= 1; + to= row_extents_second_part; + } + } + } + copy_length= (count - 1) * ROW_EXTENT_SIZE; + /* + In some unlikely cases we have allocated too many blocks. Clear the + unused part of the extent area. + */ + bzero(to, (size_t) (row_extents_second_part + copy_length - to)); +} + + +/* + Free regions of pages with logging + + SYNOPSIS + free_full_pages() + info Maria handler + row Row whose extents (row->extents, row->extents_count) + are freed + + NOTES + A LOGREC_REDO_PURGE_BLOCKS record holding the extents is written + before the pages are freed with _ma_bitmap_free_full_pages(). + + RETURN + 0 ok + 1 error +*/ + +static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row) +{ + uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE]; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + LSN lsn; + size_t extents_length= row->extents_count * ROW_EXTENT_SIZE; + DBUG_ENTER("free_full_pages"); + + pagerange_store(log_data + FILEID_STORE_SIZE, + row->extents_count); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row->extents; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length; + if (translog_write_record(&lsn, LOGREC_REDO_PURGE_BLOCKS, info->trn, + info, sizeof(log_data) + extents_length, + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data)) + DBUG_RETURN(1); + + DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents, + row->extents_count)); +} + + +/* + Free one page range + + NOTES + This is very similar to free_full_pages() + + A failing step does not stop the following steps; the error is + remembered and returned at the end. + The log record is only written if the table is transactional. + + RETURN + 0 ok + 1 error +*/ + +static my_bool free_full_page_range(MARIA_HA *info, ulonglong page, uint count) +{ + my_bool res= 0; + DBUG_ENTER("free_full_page_range"); + + if (pagecache_delete_pages(info->s->pagecache, &info->dfile, + page, count, PAGECACHE_LOCK_WRITE, 0)) + res= 1; + + if (info->s->now_transactional) + { + LSN lsn; + 
/** @todo unify log_data's shape with delete_head_or_tail() */ + uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE + + ROW_EXTENT_SIZE]; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + DBUG_ASSERT(info->trn->rec_lsn); + pagerange_store(log_data + FILEID_STORE_SIZE, 1); + page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE, + page); + int2store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE + + PAGE_STORE_SIZE, count); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + if (translog_write_record(&lsn, LOGREC_REDO_PURGE_BLOCKS, + info->trn, info, sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data)) + res= 1; + + } + pthread_mutex_lock(&info->s->bitmap.bitmap_lock); + if (_ma_reset_full_page_bits(info, &info->s->bitmap, page, + count)) + res= 1; + pthread_mutex_unlock(&info->s->bitmap.bitmap_lock); + DBUG_RETURN(res); +} + + +/** + @brief Write a record to a (set of) pages + + @fn write_block_record() + @param info Maria handler + @param old_record Original record in case of update; NULL in case of + insert + @param record Record we should write + @param row Statistics about record (calculated by + calc_record_size()) + @param map_blocks On which pages the record should be stored + @param row_pos Position on head page where to put head part of + record + @param undo_lsn <> LSN_ERROR if we are executing an UNDO + + @note + On return all pinned pages are released. 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool write_block_record(MARIA_HA *info, + const uchar *old_record, const uchar *record, + MARIA_ROW *row, + MARIA_BITMAP_BLOCKS *bitmap_blocks, + my_bool head_block_is_read, + struct st_row_pos_info *row_pos, + LSN undo_lsn) +{ + uchar *data, *end_of_data, *tmp_data_used, *tmp_data; + uchar *row_extents_first_part, *row_extents_second_part; + uchar *field_length_data; + uchar *page_buff; + MARIA_BITMAP_BLOCK *block, *head_block; + MARIA_SHARE *share= info->s; + MARIA_COLUMNDEF *column, *end_column; + MARIA_PINNED_PAGE page_link; + uint block_size, flag; + ulong *blob_lengths; + my_bool row_extents_in_use, blob_full_pages_exists; + LSN lsn; + my_off_t position; + DBUG_ENTER("write_block_record"); + + LINT_INIT(row_extents_first_part); + LINT_INIT(row_extents_second_part); + + head_block= bitmap_blocks->block; + block_size= share->block_size; + + page_buff= row_pos->buff; + /* Position on head page where we should store the head part */ + data= row_pos->data; + end_of_data= data + row_pos->length; + + /* Write header */ + flag= share->base.default_row_flag; + row_extents_in_use= 0; + if (unlikely(row->total_length > row_pos->length)) + { + /* Need extent */ + if (bitmap_blocks->count <= 1) + goto crashed; /* Wrong in bitmap */ + flag|= ROW_FLAG_EXTENTS; + row_extents_in_use= 1; + } + /* For now we have only a minimum header */ + *data++= (uchar) flag; + if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED)) + *data++= (uchar) (share->base.null_bytes - + share->base.original_null_bytes); + if (row_extents_in_use) + { + /* Store first extent in header */ + store_key_length_inc(data, bitmap_blocks->count - 1); + row_extents_first_part= data; + data+= ROW_EXTENT_SIZE; + } + if (share->base.pack_fields) + store_key_length_inc(data, row->field_lengths_length); + if (share->calc_checksum) + *(data++)= (uchar) (row->checksum); /* store least significant byte */ + memcpy(data, record, 
share->base.null_bytes); + data+= share->base.null_bytes; + memcpy(data, row->empty_bits, share->base.pack_bytes); + data+= share->base.pack_bytes; + + /* + Allocate a buffer of rest of data (except blobs) + + To avoid double copying of data, we copy as many columns that fits into + the page. The rest goes into info->packed_row. + + Using an extra buffer, instead of doing continuous writes to different + pages, uses less code and we don't need to have to do a complex call + for every data segment we want to store. + */ + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + row->head_length)) + DBUG_RETURN(1); + + tmp_data_used= 0; /* Either 0 or last used uchar in 'data' */ + tmp_data= data; + + if (row_extents_in_use) + { + uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE; + if (!tmp_data_used && tmp_data + copy_length > end_of_data) + { + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + row_extents_second_part= tmp_data; + /* + We will copy the extents here when we have figured out the tail + positions. 
+ */ + tmp_data+= copy_length; + } + + /* Copy fields that has fixed lengths (primary key etc) */ + for (column= share->columndef, + end_column= column + share->base.fixed_not_null_fields; + column < end_column; column++) + { + if (!tmp_data_used && tmp_data + column->length > end_of_data) + { + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + memcpy(tmp_data, record + column->offset, column->length); + tmp_data+= column->length; + } + + /* Copy length of data for variable length fields */ + if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data) + { + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + field_length_data= row->field_lengths; + memcpy(tmp_data, field_length_data, row->field_lengths_length); + tmp_data+= row->field_lengths_length; + + /* Copy variable length fields and fields with null/zero */ + for (end_column= share->columndef + share->base.fields - share->base.blobs; + column < end_column ; + column++) + { + const uchar *field_pos; + ulong length; + if ((record[column->null_pos] & column->null_bit) || + (row->empty_bits[column->empty_pos] & column->empty_bit)) + continue; + + field_pos= record + column->offset; + switch (column->type) { + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_SKIP_PRESPACE: + case FIELD_SKIP_ZERO: /* Fixed length field */ + length= column->length; + break; + case FIELD_SKIP_ENDSPACE: /* CHAR */ + /* Char that is space filled */ + if (column->length <= 255) + length= (uint) (uchar) *field_length_data++; + else + { + length= uint2korr(field_length_data); + field_length_data+= 2; + } + break; + case FIELD_VARCHAR: + if (column->length <= 256) + { + length= (uint) (uchar) *field_length_data++; + field_pos++; /* Skip length uchar */ + } + else + { + length= uint2korr(field_length_data); + field_length_data+= 2; + field_pos+= 2; + } + break; + default: /* Wrong data */ + DBUG_ASSERT(0); + break; + } + if (!tmp_data_used && tmp_data + length > end_of_data) + { + /* Data didn't fit in 
page; Change to use tmp buffer */ + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + memcpy((char*) tmp_data, (char*) field_pos, length); + tmp_data+= length; + } + + block= head_block + head_block->sub_blocks; /* Point to first blob data */ + + end_column= column + share->base.blobs; + blob_lengths= row->blob_lengths; + if (!tmp_data_used) + { + /* Still room on page; Copy as many blobs we can into this page */ + data= tmp_data; + for (; column < end_column && + *blob_lengths <= (ulong)(end_of_data - data); + column++, blob_lengths++) + { + uchar *tmp_pos; + uint length; + if (!*blob_lengths) /* Null or "" */ + continue; + length= column->length - portable_sizeof_char_ptr; + memcpy_fixed((uchar*) &tmp_pos, record + column->offset + length, + sizeof(char*)); + memcpy(data, tmp_pos, *blob_lengths); + data+= *blob_lengths; + /* Skip over tail page that was to be used to store blob */ + block++; + bitmap_blocks->tail_page_skipped= 1; + } + if (head_block->sub_blocks > 1) + { + /* We have allocated pages that where not used */ + bitmap_blocks->page_skipped= 1; + } + } + else + data= tmp_data_used; /* Get last used on page */ + + { + /* Update page directory */ + uint length= (uint) (data - row_pos->data); + DBUG_PRINT("info", ("Used head length on page: %u", length)); + DBUG_ASSERT(data <= end_of_data); + if (length < info->s->base.min_row_length) + { + uint diff_length= info->s->base.min_row_length - length; + bzero(data, diff_length); + data+= diff_length; + length= info->s->base.min_row_length; + } + int2store(row_pos->dir + 2, length); + /* update empty space at start of block */ + row_pos->empty_space-= length; + int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space); + /* Mark in bitmaps how the current page was actually used */ + head_block->empty_space= row_pos->empty_space; + if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE) + head_block->empty_space= 0; /* Page is full */ + head_block->used= BLOCKUSED_USED; + } + + /* + Now we have to 
write tail pages, as we need to store the position + to them in the row extent header. + + We first write out all blob tails, to be able to store them in + the current page or 'tmp_data'. + + Then we write the tail of the non-blob fields (The position to the + tail page is stored either in row header, the extents in the head + page or in the first full page of the non-blob data. It's never in + the tail page of the non-blob data) + */ + + blob_full_pages_exists= 0; + if (row_extents_in_use) + { + if (column != end_column) /* If blob fields */ + { + MARIA_COLUMNDEF *save_column= column; + MARIA_BITMAP_BLOCK *save_block= block; + MARIA_BITMAP_BLOCK *end_block; + ulong *save_blob_lengths= blob_lengths; + + for (; column < end_column; column++, blob_lengths++) + { + uchar *blob_pos; + if (!*blob_lengths) /* Null or "" */ + continue; + if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL) + { + uint length; + length= column->length - portable_sizeof_char_ptr; + memcpy_fixed((uchar *) &blob_pos, record + column->offset + length, + sizeof(char*)); + length= *blob_lengths % FULL_PAGE_SIZE(block_size); /* tail size */ + if (length != *blob_lengths) + blob_full_pages_exists= 1; + if (write_tail(info, block + block->sub_blocks-1, + blob_pos + *blob_lengths - length, + length)) + goto disk_err; + } + else + blob_full_pages_exists= 1; + + for (end_block= block + block->sub_blocks; block < end_block; block++) + { + /* + Set only a bit, to not cause bitmap code to believe a block is full + when there is still a lot of entries in it + */ + block->used|= BLOCKUSED_USED; + } + } + column= save_column; + block= save_block; + blob_lengths= save_blob_lengths; + } + + if (tmp_data_used) /* non blob data overflows */ + { + MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block; + MARIA_BITMAP_BLOCK *head_tail_block= 0; + ulong length; + ulong data_length= (tmp_data - info->rec_buff); + +#ifdef SANITY_CHECKS + if (head_block->sub_blocks == 1) + goto crashed; /* no reserved full or 
tails */ +#endif + /* + Find out where to write tail for non-blob fields. + + Problem here is that the bitmap code may have allocated more + space than we need. We have to handle the following cases: + + - Bitmap code allocated a tail page we don't need. + - The last full page allocated needs to be changed to a tail page + (Because we where able to put more data on the head page than + the bitmap allocation assumed) + + The reserved pages in bitmap_blocks for the main page has one of + the following allocations: + - Full pages, with following blocks: + # * full pages + empty page ; To be used if we change last full to tail page. This + has 'count' = 0. + tail page (optional, if last full page was part full) + - One tail page + */ + + cur_block= head_block + 1; + end_block= head_block + head_block->sub_blocks; + /* + Loop until we have find a block bigger than we need or + we find the empty page block. + */ + while (data_length >= (length= (cur_block->page_count * + FULL_PAGE_SIZE(block_size))) && + cur_block->page_count) + { +#ifdef SANITY_CHECKS + if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED)) + goto crashed; +#endif + data_length-= length; + (cur_block++)->used= BLOCKUSED_USED; + } + last_head_block= cur_block; + if (data_length) + { + if (cur_block->page_count == 0) + { + /* Skip empty filler block */ + cur_block++; + } +#ifdef SANITY_CHECKS + if ((cur_block >= end_block)) + goto crashed; +#endif + if (cur_block->used & BLOCKUSED_TAIL) + { + DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size)); + /* tail written to full tail page */ + cur_block->used= BLOCKUSED_USED; + head_tail_block= cur_block; + } + else if (data_length > length - MAX_TAIL_SIZE(block_size)) + { + /* tail written to full page */ + cur_block->used= BLOCKUSED_USED; + if ((cur_block != end_block - 1) && + (end_block[-1].used & BLOCKUSED_TAIL)) + bitmap_blocks->tail_page_skipped= 1; + } + else + { + /* + cur_block is a full block, followed by an empty and optional + tail 
block. Change cur_block to a tail block or split it + into full blocks and tail blocks. + + TODO: + If there is enough space on the following tail block, use + this instead of creating a new tail block. + */ + DBUG_ASSERT(cur_block[1].page_count == 0); + if (cur_block->page_count == 1) + { + /* convert full block to tail block */ + cur_block->used= BLOCKUSED_USED | BLOCKUSED_TAIL; + head_tail_block= cur_block; + } + else + { + DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(block_size)); + DBUG_PRINT("info", ("Splitting blocks into full and tail")); + cur_block[1].page= (cur_block->page + cur_block->page_count - 1); + cur_block[1].page_count= 1; /* Avoid DBUG_ASSERT */ + cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL; + cur_block->page_count--; + cur_block->used= BLOCKUSED_USED; + last_head_block= head_tail_block= cur_block+1; + } + if (end_block[-1].used & BLOCKUSED_TAIL) + bitmap_blocks->tail_page_skipped= 1; + } + } + else + { + /* Must be an empty or tail page */ + DBUG_ASSERT(cur_block->page_count == 0 || + cur_block->used & BLOCKUSED_TAIL); + if (end_block[-1].used & BLOCKUSED_TAIL) + bitmap_blocks->tail_page_skipped= 1; + } + + /* + Write all extents into page or tmp_data + + Note that we still don't have a correct position for the tail + of the non-blob fields. 
+ */ + store_extent_info(row_extents_first_part, + row_extents_second_part, + head_block+1, bitmap_blocks->count - 1); + if (head_tail_block) + { + ulong data_length= (tmp_data - info->rec_buff); + uint length; + uchar *extent_data; + + length= (uint) (data_length % FULL_PAGE_SIZE(block_size)); + if (write_tail(info, head_tail_block, + info->rec_buff + data_length - length, + length)) + goto disk_err; + tmp_data-= length; /* Remove the tail */ + if (tmp_data == info->rec_buff) + { + /* We have no full blocks to write for the head part */ + tmp_data_used= 0; + } + + /* Store the tail position for the non-blob fields */ + if (head_tail_block == head_block + 1) + { + /* + We had a head block + tail block, which means that the + tail block is the first extent + */ + extent_data= row_extents_first_part; + } + else + { + /* + We have a head block + some full blocks + tail block + last_head_block is pointing after the last used extent + for the head block. + */ + extent_data= row_extents_second_part + + ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE; + } + DBUG_ASSERT(uint2korr(extent_data+5) & TAIL_BIT); + page_store(extent_data, head_tail_block->page); + int2store(extent_data + PAGE_STORE_SIZE, head_tail_block->page_count); + } + } + else + store_extent_info(row_extents_first_part, + row_extents_second_part, + head_block+1, bitmap_blocks->count - 1); + } + + if (share->now_transactional) + { + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE]; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + size_t data_length= (size_t) (data - row_pos->data); + + /* Log REDO changes of head page */ + page_store(log_data + FILEID_STORE_SIZE, head_block->page); + dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + row_pos->rownr); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char*) row_pos->data; + 
log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_length; + if (translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_HEAD, info->trn, + info, sizeof(log_data) + data_length, + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data)) + goto disk_err; + } + + /* Increase data file size, if extended */ + position= (my_off_t) head_block->page * block_size; + if (info->state->data_file_length <= position) + info->state->data_file_length= position + block_size; + + DBUG_ASSERT(share->pagecache->block_size == block_size); + if (pagecache_write(share->pagecache, + &info->dfile, head_block->page, 0, + page_buff, share->page_type, + head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ : + PAGECACHE_LOCK_READ, + head_block_is_read ? PAGECACHE_PIN_LEFT_PINNED : + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, &page_link.link)) + goto disk_err; + page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK; + if (head_block_is_read) + { + /* Head page is always the first pinned page */ + set_dynamic(&info->pinned_pages, (void*) &page_link, 0); + } + else + push_dynamic(&info->pinned_pages, (void*) &page_link); + + if (share->now_transactional && (tmp_data_used || blob_full_pages_exists)) + { + /* + Log REDO writes for all full pages (head part and all blobs) + We write all here to be able to generate the UNDO record early + so that we can write the LSN for the UNDO record to all full pages. 
+ */ + uchar tmp_log_data[FILEID_STORE_SIZE + LSN_STORE_SIZE + PAGE_STORE_SIZE + + ROW_EXTENT_SIZE * ROW_EXTENTS_ON_STACK]; + uchar *log_data, *log_pos; + LEX_STRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 + + ROW_EXTENTS_ON_STACK]; + LEX_STRING *log_array_pos, *log_array; + int error; + ulong log_entry_length= 0; + + /* If few extents, then allocate things on stack to avoid a malloc call */ + if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK) + { + log_array= tmp_log_array; + log_data= tmp_log_data; + } + else + { + if (my_multi_malloc(MY_WME, &log_array, + (uint) ((bitmap_blocks->count + + TRANSLOG_INTERNAL_PARTS + 2) * + sizeof(*log_array)), + &log_data, bitmap_blocks->count * ROW_EXTENT_SIZE, + NullS)) + goto disk_err; + } + log_pos= log_data + FILEID_STORE_SIZE; + log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1; + + if (tmp_data_used) + { + /* Full head pages */ + size_t data_length= (ulong) (tmp_data - info->rec_buff); + log_pos= store_page_range(log_pos, head_block+1, block_size, + data_length); + log_array_pos->str= (char*) info->rec_buff; + log_array_pos->length= data_length; + log_entry_length+= data_length; + log_array_pos++; + } + if (blob_full_pages_exists) + { + MARIA_COLUMNDEF *tmp_column= column; + ulong *tmp_blob_lengths= blob_lengths; + MARIA_BITMAP_BLOCK *tmp_block= block; + + /* Full blob pages */ + for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++) + { + ulong blob_length; + uint length; + + if (!*tmp_blob_lengths) /* Null or "" */ + continue; + length= tmp_column->length - portable_sizeof_char_ptr; + blob_length= *tmp_blob_lengths; + if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL) + blob_length-= (blob_length % FULL_PAGE_SIZE(block_size)); + if (blob_length) + { + memcpy_fixed((uchar*) &log_array_pos->str, + record + column->offset + length, + sizeof(uchar*)); + log_array_pos->length= blob_length; + log_entry_length+= blob_length; + log_array_pos++; + + log_pos= store_page_range(log_pos, tmp_block, 
block_size, + blob_length); + tmp_block+= tmp_block->sub_blocks; + } + } + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (size_t) (log_pos - + log_data); + log_entry_length+= (log_pos - log_data); + + /* trn->rec_lsn is already set earlier in this function */ + error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS, + info->trn, info, log_entry_length, + (uint) (log_array_pos - log_array), + log_array, log_data); + if (log_array != tmp_log_array) + my_free((uchar*) log_array, MYF(0)); + if (error) + goto disk_err; + } + + /* Write UNDO or CLR record */ + lsn= 0; + if (share->now_transactional) + { + LEX_STRING *log_array= info->log_row_parts; + + if (undo_lsn != LSN_ERROR) + { + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + 1]; + /* undo_lsn must be first for compression to work */ + lsn_store(log_data, undo_lsn); + /* + Store if this CLR is about an UNDO_INSERT, UNDO_DELETE or UNDO_UPDATE; + in the first/second case, Recovery, when it sees the CLR_END in the + REDO phase, may decrement/increment the records' count. + */ + log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE]= old_record ? 
+ LOGREC_UNDO_ROW_UPDATE : LOGREC_UNDO_ROW_DELETE; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + if (translog_write_record(&lsn, LOGREC_CLR_END, + info->trn, info, sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data + LSN_STORE_SIZE)) + goto disk_err; + } + else + { + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE]; + + /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_INSERT share same header */ + lsn_store(log_data, info->trn->undo_lsn); + page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, + head_block->page); + dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE, + row_pos->rownr); + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + if (!old_record) + { + /* Write UNDO log record for the INSERT */ + if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT, + info->trn, info, sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data + LSN_STORE_SIZE)) + goto disk_err; + } + else + { + /* Write UNDO log record for the UPDATE */ + size_t row_length; + uint row_parts_count; + row_length= fill_update_undo_parts(info, old_record, record, + log_array + + TRANSLOG_INTERNAL_PARTS + 1, + &row_parts_count); + if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn, + info, sizeof(log_data) + row_length, + TRANSLOG_INTERNAL_PARTS + 1 + + row_parts_count, + log_array, log_data + LSN_STORE_SIZE)) + goto disk_err; + } + } + } + _ma_unpin_all_pages(info, lsn); + + if (tmp_data_used) + { + /* + Write data stored in info->rec_buff to pages + This is the char/varchar data that didn't fit into the head page. 
+ */ + DBUG_ASSERT(bitmap_blocks->count != 0); + if (write_full_pages(info, info->trn->undo_lsn, head_block + 1, + info->rec_buff, (ulong) (tmp_data - info->rec_buff))) + goto disk_err; + } + + /* Write rest of blobs (data, but no tails as they are already written) */ + for (; column < end_column; column++, blob_lengths++) + { + uchar *blob_pos; + uint length; + ulong blob_length; + if (!*blob_lengths) /* Null or "" */ + continue; + length= column->length - portable_sizeof_char_ptr; + memcpy_fixed((uchar*) &blob_pos, record + column->offset + length, + sizeof(char*)); + /* remove tail part */ + blob_length= *blob_lengths; + if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL) + blob_length-= (blob_length % FULL_PAGE_SIZE(block_size)); + + if (blob_length && write_full_pages(info, info->trn->undo_lsn, block, + blob_pos, blob_length)) + goto disk_err; + block+= block->sub_blocks; + } + /* Release not used space in used pages */ + if (_ma_bitmap_release_unused(info, bitmap_blocks)) + goto disk_err; + + _ma_finalize_row(info); + DBUG_RETURN(0); + +crashed: + /* Something was wrong with data on page */ + my_errno= HA_ERR_WRONG_IN_RECORD; + +disk_err: + /** + @todo RECOVERY we are going to let dirty pages go to disk while we have + logged UNDO, this violates WAL. We must mark the table corrupted! + + @todo RECOVERY we have written some REDOs without a closing UNDO, + it's possible that a next operation by this transaction succeeds and then + Recovery would glue the "orphan REDOs" to the succeeded operation and + execute the failed REDOs. We need some mark "abort this group" in the + log, or mark the table corrupted (then user will repair it and thus REDOs + will be skipped). + + @todo RECOVERY to not let write errors go unnoticed, pagecache_write() + should take a MARIA_HA* in argument, and it it + fails when flushing a page to disk it should call + (*the_maria_ha->write_error_func)(the_maria_ha) + and this hook will mark the table corrupted. 
+    Maybe hook should be stored in the pagecache's block structure, or in a
+    hash "file->maria_ha*".
+
+    @todo RECOVERY we should distinguish below between log write error and
+    table write error. The former should stop Maria immediately, the latter
+    should mark the table corrupted.
+  */
+  /*
+    Unpin all pinned pages to not cause problems for disk cache. This is
+    safe to call even if we already called _ma_unpin_all_pages() above.
+  */
+  _ma_unpin_all_pages_and_finalize_row(info, 0);
+
+  DBUG_RETURN(1);
+}
+
+
+/*
+  @brief Allocate space for a new row and write it
+
+  @fn    allocate_and_write_block_record()
+  @param info                Maria handler
+  @param record              Record to write
+  @param row                 Information about fields in 'record'
+  @param undo_lsn            <> LSN_ERROR if we are executing an UNDO
+
+  @note
+    Space is reserved with _ma_bitmap_find_place(), the head page is
+    fetched with get_head_or_tail_page() and the resulting position is
+    stored in row->lastpos before the row is handed to write_block_record().
+    row->checksum is only computed when the share has a calc_checksum
+    function.
+
+  @return
+  @retval 0  ok
+  @retval 1  Error
+*/
+
+static my_bool allocate_and_write_block_record(MARIA_HA *info,
+                                               const uchar *record,
+                                               MARIA_ROW *row,
+                                               LSN undo_lsn)
+{
+  struct st_row_pos_info row_pos;
+  MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks;
+  DBUG_ENTER("allocate_and_write_block_record");
+
+  if (_ma_bitmap_find_place(info, row, blocks))
+    DBUG_RETURN(1);                         /* Error reading bitmap */
+  /* page will be pinned & locked by get_head_or_tail_page */
+  if (get_head_or_tail_page(info, blocks->block, info->buff,
+                            row->space_on_head_page, HEAD_PAGE,
+                            PAGECACHE_LOCK_WRITE, &row_pos))
+    DBUG_RETURN(1);
+  /* Row id = page of the first (head) block + directory entry on it */
+  row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr);
+  if (info->s->calc_checksum)
+    row->checksum= (info->s->calc_checksum)(info, record);
+  /* Second argument is a null pointer: there is no old row for an insert */
+  if (write_block_record(info, (uchar*) 0, record, row,
+                         blocks, blocks->block->org_bitmap_value != 0,
+                         &row_pos, undo_lsn))
+    DBUG_RETURN(1);                         /* Error writing the row */
+  DBUG_PRINT("exit", ("Rowid: %lu (%lu:%u)", (ulong) row->lastpos,
+                      (ulong) ma_recordpos_to_page(row->lastpos),
+                      ma_recordpos_to_dir_entry(row->lastpos)));
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Write a record and return rowid for it
+
+  SYNOPSIS
+    _ma_write_init_block_record()
+    info                Maria handler
+    record              Record to write
+
+  NOTES
+    This is done BEFORE we write the keys to the row!
+
+    The row description is computed into info->cur_row and the row is
+    written with undo_lsn == LSN_ERROR.
+
+  RETURN
+    HA_OFFSET_ERROR     Something went wrong
+    #                   Rowid for row
+*/
+
+MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
+                                             const uchar *record)
+{
+  DBUG_ENTER("_ma_write_init_block_record");
+
+  calc_record_size(info, record, &info->cur_row);
+  if (allocate_and_write_block_record(info, record,
+                                      &info->cur_row, LSN_ERROR))
+    DBUG_RETURN(HA_OFFSET_ERROR);
+  DBUG_RETURN(info->cur_row.lastpos);
+}
+
+
+/*
+  Dummy function for (*info->s->write_record)()
+
+  Nothing to do here, as we already wrote the record in
+  _ma_write_init_block_record()
+
+  RETURN
+    0  Always
+*/
+
+my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)),
+                               const uchar *record __attribute__ ((unused)))
+{
+  return 0;                                     /* Row already written */
+}
+
+
+/**
+  @brief Remove row written by _ma_write_init_block_record() and log undo
+
+  @param  info            Maria handler
+
+  @note
+    This is called in case we got a duplicate unique key while
+    writing keys.
+
+    The row is located through info->cur_row (lastpos and insert_blocks).
+    Errors do not stop the cleanup: every block is still processed and the
+    pinned pages are always released before returning.
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error
+*/
+
+my_bool _ma_write_abort_block_record(MARIA_HA *info)
+{
+  my_bool res= 0;
+  MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
+  MARIA_BITMAP_BLOCK *block, *end;
+  LSN lsn= LSN_IMPOSSIBLE;
+  DBUG_ENTER("_ma_write_abort_block_record");
+
+  /* Remove the head part of the row (head= 1, from_update= 0) */
+  if (delete_head_or_tail(info,
+                          ma_recordpos_to_page(info->cur_row.lastpos),
+                          ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1,
+                          0))
+    res= 1;
+  /* blocks->block[0] is the head block handled above; walk the rest */
+  for (block= blocks->block + 1, end= block + blocks->count - 1; block < end;
+       block++)
+  {
+    if (block->used & BLOCKUSED_TAIL)
+    {
+      /*
+        block->page_count is set to the tail directory entry number in
+        write_block_record()
+      */
+      if (delete_head_or_tail(info, block->page, block->page_count & ~TAIL_BIT,
+                              0, 0))
+        res= 1;
+    }
+    else if (block->used & BLOCKUSED_USED)
+    {
+      if (free_full_page_range(info, block->page, block->page_count))
+        res= 1;
+    }
+  }
+
+  if (info->s->now_transactional)
+  {
+    LSN previous_undo_lsn;
+    TRANSLOG_HEADER_BUFFER rec;
+    LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + 1];
+    int len;
+    /*
+      We do need the code above (delete_head_or_tail() etc) for
+      non-transactional tables.
+      For transactional tables we could skip this code above and just execute
+      the UNDO_INSERT, but we try to have one code path.
+      Write CLR record, because we are somehow undoing UNDO_ROW_INSERT.
+      When we have logging for keys: as maria_write() first writes the row
+      then the keys, and if failure, deletes the keys then the rows,
+      info->trn->undo_lsn below will properly point to the UNDO of the
+      UNDO_ROW_INSERT for this row.
+    */
+    if ((len= translog_read_record_header(info->trn->undo_lsn, &rec)) ==
+        RECHEADER_READ_ERROR)
+    {
+      res= 1;
+      goto end;
+    }
+    DBUG_ASSERT(rec.type == LOGREC_UNDO_ROW_INSERT);
+    previous_undo_lsn= lsn_korr(rec.header);
+    lsn_store(log_data, previous_undo_lsn);
+    log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE]= LOGREC_UNDO_ROW_INSERT;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    (char*) log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    if (translog_write_record(&lsn, LOGREC_CLR_END,
+                              info->trn, info, sizeof(log_data),
+                              TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                              log_data + LSN_STORE_SIZE))
+      res= 1;
+  }
+end:
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Update a record
+
+  SYNOPSIS
+    _ma_update_block_record2()
+    info          Maria handler
+    record_pos    Row position (page + directory entry) of the row to update
+    oldrec        Old row contents; passed on to write_block_record()
+    record        New row contents
+    undo_lsn      Passed on unchanged to write_block_record()
+
+  NOTES
+    For the moment, we assume that info->cur_row.extents is always updated
+    when a row is read. In the future we may decide to read this on demand
+    for rows split into many extents.
+
+    Two cases are handled:
+    - The whole new row fits into the space the old head part occupies plus
+      the free space of that page: the area is extended in place and the row
+      is written as a single block.
+    - Otherwise the old tails and full pages are released, new space is
+      allocated with _ma_bitmap_find_new_place() and the head part is
+      written back to the old position.
+
+    The head page is read with a write lock and registered in
+    info->pinned_pages. Every failure after that point leaves through 'err'
+    so that the page is unpinned again.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool _ma_update_block_record2(MARIA_HA *info,
+                                        MARIA_RECORD_POS record_pos,
+                                        const uchar *oldrec,
+                                        const uchar *record,
+                                        LSN undo_lsn)
+{
+  MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
+  uchar *buff;
+  MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
+  MARIA_PINNED_PAGE page_link;
+  uint rownr, org_empty_size, head_length;
+  uint block_size= info->s->block_size;
+  uchar *dir;
+  ulonglong page;
+  struct st_row_pos_info row_pos;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_update_block_record2");
+  DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos));
+
+#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
+  DBUG_DUMP("oldrec", oldrec, share->base.reclength);
+  DBUG_DUMP("newrec", record, share->base.reclength);
+#endif
+
+  /* checksum was computed by maria_update() already and put into cur_row */
+  new_row->checksum= cur_row->checksum;
+  calc_record_size(info, record, new_row);
+  page= ma_recordpos_to_page(record_pos);
+
+  DBUG_ASSERT(share->pagecache->block_size == block_size);
+  if (!(buff= pagecache_read(share->pagecache,
+                             &info->dfile, (my_off_t) page, 0,
+                             info->buff, share->page_type,
+                             PAGECACHE_LOCK_WRITE, &page_link.link)))
+    DBUG_RETURN(1);
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+  org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
+  rownr= ma_recordpos_to_dir_entry(record_pos);
+  /* Directory entries are stored backwards from the page suffix */
+  dir= (buff + block_size - DIR_ENTRY_SIZE * rownr -
+        DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE);
+
+  if ((org_empty_size + cur_row->head_length) >= new_row->total_length)
+  {
+    uint rec_offset, length;
+    MARIA_BITMAP_BLOCK block;
+
+    /*
+      We can fit the new row in the same page as the original head part
+      of the row
+    */
+    block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
+                                                          org_empty_size);
+
+    /* The head page is already pinned, so failures must unpin it */
+    if (extend_area_on_page(buff, dir, rownr, share->block_size,
+                            new_row->total_length, &org_empty_size,
+                            &rec_offset, &length))
+      goto err;
+
+    row_pos.buff= buff;
+    row_pos.rownr= rownr;
+    row_pos.empty_space= org_empty_size;
+    row_pos.dir= dir;
+    row_pos.data= buff + rec_offset;
+    row_pos.length= length;
+    /* Describe the row as one single block: the head page itself */
+    blocks->block= &block;
+    blocks->count= 1;
+    block.page= page;
+    block.sub_blocks= 1;
+    block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
+    block.empty_space= row_pos.empty_space;
+    /* Update cur_row, if someone calls update at once again */
+    cur_row->head_length= new_row->total_length;
+
+    /*
+      The new row lives entirely on the head page, so everything the old
+      row had outside of it (tails and full pages) has to be released
+    */
+    if (*cur_row->tail_positions &&
+        delete_tails(info, cur_row->tail_positions))
+      goto err;
+    if (cur_row->extents_count && free_full_pages(info, cur_row))
+      goto err;
+    DBUG_RETURN(write_block_record(info, oldrec, record, new_row, blocks,
+                                   1, &row_pos, undo_lsn));
+  }
+  /*
+    Allocate all size in block for record
+    TODO:
+    Need to improve this to do compact if we can fit one more blob into
+    the head page
+  */
+  head_length= uint2korr(dir + 2);
+  if (buff[PAGE_TYPE_OFFSET] & PAGE_CAN_BE_COMPACTED && org_empty_size &&
+      (head_length < new_row->head_length ||
+       (new_row->total_length <= head_length &&
+        org_empty_size + head_length >= new_row->total_length)))
+  {
+    compact_page(buff, share->block_size, rownr, 1);
+    org_empty_size= 0;
+    /* Re-read the entry length, as compact_page() was given this row */
+    head_length= uint2korr(dir + 2);
+  }
+
+  /* Delete old row */
+  if (*cur_row->tail_positions && delete_tails(info, cur_row->tail_positions))
+    goto err;
+  if (cur_row->extents_count && free_full_pages(info, cur_row))
+    goto err;
+  if (_ma_bitmap_find_new_place(info, new_row, page, head_length, blocks))
+    goto err;
+
+  row_pos.buff= buff;
+  row_pos.rownr= rownr;
+  row_pos.empty_space= org_empty_size + head_length;
+  row_pos.dir= dir;
+  row_pos.data= buff + uint2korr(dir);
+  row_pos.length= head_length;
+  DBUG_RETURN(write_block_record(info, oldrec, record, new_row, blocks, 1,
+                                 &row_pos, undo_lsn));
+
+err:
+  _ma_unpin_all_pages_and_finalize_row(info, 0);
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Wrapper for _ma_update_block_record2() used by ma_update()
+
+  Calls it with undo_lsn == LSN_ERROR and returns its result.
+*/
+
+
+my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos,
+                                const uchar *orig_rec, const uchar *new_rec)
+{
+  return _ma_update_block_record2(info, record_pos, orig_rec, new_rec,
+                                  LSN_ERROR);
+}
+
+
+/*
+  Delete a directory entry
+
+  SYNOPSIS
+    delete_dir_entry()
+    buff                Page buffer
+    block_size          Block size
+    record_number       Record number to delete
+    empty_space_res     Out: empty space on page after delete
+
+  RETURN
+    -1    Error on page
+    0     ok
+    1     Page is now empty
+*/
+
+static int delete_dir_entry(uchar *buff, uint block_size, uint record_number,
+                            uint *empty_space_res)
+{
+  uint number_of_records= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET];
+  uint length, empty_space;
+  uchar *dir, *org_dir;
+  DBUG_ENTER("delete_dir_entry");
+
+#ifdef SANITY_CHECKS
+  if (record_number >= number_of_records ||
+      record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 -
+                        PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE))
+  {
+    DBUG_PRINT("error", ("record_number: %u number_of_records: %u",
+                         record_number, number_of_records));
+
+    DBUG_RETURN(-1);
+  }
+#endif
+
+  empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+  org_dir= dir= (buff + block_size - DIR_ENTRY_SIZE * record_number - +
DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE);
+  length= uint2korr(dir + 2);
+
+  if (record_number == number_of_records - 1)
+  {
+    /*
+      Delete this entry and all following empty directory entries.
+      'Following' is in memory order: dir moves towards the page suffix,
+      i.e. towards entries with lower record numbers. An entry counts as
+      empty when its first two bytes are zero.
+    */
+    uchar *end= buff + block_size - PAGE_SUFFIX_SIZE;
+    do
+    {
+      number_of_records--;
+      dir+= DIR_ENTRY_SIZE;
+      empty_space+= DIR_ENTRY_SIZE;
+    } while (dir < end && dir[0] == 0 && dir[1] == 0);
+
+    if (number_of_records == 0)
+    {
+      /* Nothing left on the page: report the full block as free */
+      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
+      *empty_space_res= block_size;
+      DBUG_RETURN(1);
+    }
+    buff[DIR_COUNT_OFFSET]= (uchar) number_of_records;
+  }
+  /* The space used by the row data itself is free now too */
+  empty_space+= length;
+
+  /* Update directory */
+  org_dir[0]= org_dir[1]= 0; org_dir[2]= org_dir[3]= 0; /* Delete entry */
+  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
+  buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED;
+
+  *empty_space_res= empty_space;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Delete a head or a tail part
+
+  SYNOPSIS
+    delete_head_or_tail()
+    info                Maria handler
+    page                Page (not file offset!) on which the row is
+    record_number       Directory entry of the row part on 'page'
+    head                1 if this is a head page
+    from_update         1 if we are called from update. In this case we
+                        leave the page as write locked as we may put
+                        the new row into the old position.
+
+  NOTES
+    Uses info->keyread_buff
+
+    The page is added to info->pinned_pages and is still there when this
+    function returns, also on error. The callers visible in this file
+    release it with _ma_unpin_all_pages_and_finalize_row().
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool delete_head_or_tail(MARIA_HA *info,
+                                   ulonglong page, uint record_number,
+                                   my_bool head, my_bool from_update)
+{
+  MARIA_SHARE *share= info->s;
+  uint empty_space;
+  uint block_size= share->block_size;
+  uchar *buff;
+  LSN lsn;
+  MARIA_PINNED_PAGE page_link;
+  int res;
+  enum pagecache_page_lock lock_at_write, lock_at_unpin;
+  DBUG_ENTER("delete_head_or_tail");
+
+  info->keyread_buff_used= 1;
+  DBUG_ASSERT(info->s->pagecache->block_size == block_size);
+  if (!(buff= pagecache_read(share->pagecache,
+                             &info->dfile, page, 0,
+                             info->keyread_buff,
+                             info->s->page_type,
+                             PAGECACHE_LOCK_WRITE, &page_link.link)))
+    DBUG_RETURN(1);
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+  /*
+    Decide which lock the page keeps after pagecache_write() and which
+    unlock the later unpin has to perform
+  */
+  if (from_update)
+  {
+    lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+    lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK;
+  }
+  else
+  {
+    lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ;
+    lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK;
+  }
+
+  res= delete_dir_entry(buff, block_size, record_number, &empty_space);
+  if (res < 0)
+    DBUG_RETURN(1);
+  if (res == 0) /* after our deletion, page is still not empty */
+  {
+    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+    LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    if (info->s->now_transactional)
+    {
+      /* Log REDO data */
+      page_store(log_data + FILEID_STORE_SIZE, page);
+      dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+                   record_number);
+
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    (char*) log_data;
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+      if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD :
+                                       LOGREC_REDO_PURGE_ROW_TAIL),
+                                info->trn, info, sizeof(log_data),
+                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                                log_data))
+        DBUG_RETURN(1);
+    }
+    if (pagecache_write(share->pagecache,
+                        &info->dfile, page, 0,
+                        buff, share->page_type,
+                        lock_at_write,
+                        PAGECACHE_PIN_LEFT_PINNED,
+                        PAGECACHE_WRITE_DELAY, &page_link.link))
+      DBUG_RETURN(1);
+  }
+  else /* page is now empty */
+  {
+    if (info->s->now_transactional)
+    {
+      /* Log the page as one purged range: 1 range, start page, 1 page */
+      uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+                     PAGE_STORE_SIZE + PAGERANGE_STORE_SIZE];
+      LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+      pagerange_store(log_data + FILEID_STORE_SIZE, 1);
+      page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE, page);
+      pagerange_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+                      PAGE_STORE_SIZE, 1);
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    (char*) log_data;
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+      if (translog_write_record(&lsn, LOGREC_REDO_PURGE_BLOCKS,
+                                info->trn, info, sizeof(log_data),
+                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                                log_data))
+        DBUG_RETURN(1);
+    }
+    /* Write the empty page (needed only for REPAIR to work) */
+    if (pagecache_write(share->pagecache,
+                        &info->dfile, page, 0,
+                        buff, share->page_type,
+                        lock_at_write,
+                        PAGECACHE_PIN_LEFT_PINNED,
+                        PAGECACHE_WRITE_DELAY, &page_link.link))
+      DBUG_RETURN(1);
+
+    DBUG_ASSERT(empty_space >= info->s->bitmap.sizes[0]);
+  }
+  /*
+    The page is still pinned. It is now read locked, or still write locked
+    if from_update is set; record the matching unlock in the entry we
+    pushed above (the last element of pinned_pages).
+  */
+  page_link.unlock= lock_at_unpin;
+  set_dynamic(&info->pinned_pages, (void*) &page_link,
+              info->pinned_pages.elements-1);
+
+  DBUG_PRINT("info", ("empty_space: %u", empty_space));
+  DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space));
+}
+
+
+/*
+  Delete all tails
+
+  SYNOPSIS
+    delete_tails()
+    info                Handler
+    tails               Pointer to vector of tail positions, ending with 0
+
+  NOTES
+    Uses info->keyread_buff
+
+    All tails are processed even if deleting one of them fails.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool delete_tails(MARIA_HA *info,
MARIA_RECORD_POS *tails)
+{
+  my_bool res= 0;
+  DBUG_ENTER("delete_tails");
+  for (; *tails; tails++)
+  {
+    if (delete_head_or_tail(info,
+                            ma_recordpos_to_page(*tails),
+                            ma_recordpos_to_dir_entry(*tails), 0, 1))
+      res= 1;
+  }
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Delete a record
+
+  SYNOPSIS
+    _ma_delete_block_record()
+    info                Maria handler; the row deleted is info->cur_row
+    record              Row contents, used to fill the UNDO record
+
+  NOTES
+    For the moment, we assume that info->cur_row.extents is always updated
+    when a row is read. In the future we may decide to read this on demand
+    for rows with many splits.
+
+    The head part, the tails and the full pages are released first; for
+    transactional tables an UNDO_ROW_DELETE record is written afterwards.
+    All pages pinned on the way are released before returning, also on
+    error.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
+{
+  ulonglong page;
+  uint record_number;
+  DBUG_ENTER("_ma_delete_block_record");
+
+  page=          ma_recordpos_to_page(info->cur_row.lastpos);
+  record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos);
+  DBUG_PRINT("enter", ("Rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
+                       (ulong) page, record_number));
+
+  if (delete_head_or_tail(info, page, record_number, 1, 0) ||
+      delete_tails(info, info->cur_row.tail_positions))
+    goto err;
+
+  /*
+    Test the number of extents, as the update code does, and not the
+    extents buffer pointer
+  */
+  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
+    goto err;
+
+  if (info->s->now_transactional)
+  {
+    LSN lsn;
+    /* Same header layout as the UNDO_ROW_INSERT written for inserts */
+    uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
+                   DIRPOS_STORE_SIZE];
+    size_t row_length;
+    uint row_parts_count;
+
+    /* Write UNDO record */
+    lsn_store(log_data, info->trn->undo_lsn);
+    page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page);
+    dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                 PAGE_STORE_SIZE, record_number);
+
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= (char*) log_data;
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length= sizeof(log_data);
+    row_length= fill_insert_undo_parts(info, record, info->log_row_parts +
+                                       TRANSLOG_INTERNAL_PARTS + 1,
+                                       &row_parts_count);
+
+    if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn,
+                              info, sizeof(log_data) + row_length,
+                              TRANSLOG_INTERNAL_PARTS + 1 + row_parts_count,
+                              info->log_row_parts, log_data + LSN_STORE_SIZE))
+      goto err;
+
+  }
+
+  _ma_unpin_all_pages_and_finalize_row(info, info->trn->undo_lsn);
+  DBUG_RETURN(0);
+
+err:
+  _ma_unpin_all_pages_and_finalize_row(info, 0);
+  DBUG_RETURN(1);
+}
+
+
+/****************************************************************************
+  Reading of records
+****************************************************************************/
+
+/*
+  Read position to record from record directory at end of page
+
+  SYNOPSIS
+   get_record_position()
+     buff                 page buffer
+     block_size           block size for page
+     record_number        Record number in index
+     end_of_data          pointer to end of data for record
+
+  NOTES
+    With SANITY_CHECKS the directory entry is validated first: the record
+    number must be below the directory count, and offset + length must lie
+    between the page header and the directory.
+
+  RETURN
+    0  Error in data
+    #  Pointer to start of record.
+       In this case *end_of_data is set.
+*/
+
+static uchar *get_record_position(uchar *buff, uint block_size,
+                                  uint record_number, uchar **end_of_data)
+{
+  uint number_of_records= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET];
+  uchar *dir;
+  uchar *data;
+  uint offset, length;
+
+#ifdef SANITY_CHECKS
+  if (record_number >= number_of_records ||
+      record_number > ((block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE) /
+                       DIR_ENTRY_SIZE))
+  {
+    DBUG_PRINT("error",
+               ("Wrong row number: record_number: %u number_of_records: %u",
+                record_number, number_of_records));
+    return 0;
+  }
+#endif
+
+  /* Entry 0 is the one closest to the page suffix */
+  dir= (buff + block_size - DIR_ENTRY_SIZE * record_number -
+        DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE);
+  offset= uint2korr(dir);
+  length= uint2korr(dir + 2);
+#ifdef SANITY_CHECKS
+  if (offset < PAGE_HEADER_SIZE ||
+      offset + length > (block_size -
+                         number_of_records * DIR_ENTRY_SIZE -
+                         PAGE_SUFFIX_SIZE))
+  {
+    DBUG_PRINT("error",
+               ("Wrong row position: record_number: %u offset: %u "
+                "length: %u number_of_records: %u",
+                record_number, offset, length, number_of_records));
+    return 0;
+  }
+#endif
+  data= buff + offset;
+  *end_of_data= data + length;
+  return data;
+}
+
+
+/*
+  Init extent
+
+  SYNOPSIS
+    init_extent()
+    extent              Cursor to initialize
+    extent_info         Stored extent list; the first extent is decoded here
+    extents             Number of extents in extent_info
+    tail_positions      Where positions of found tails are to be stored
+
+  NOTES
+    extent is a cursor over which pages to read
+*/
+
+static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info,
+                        uint extents,
MARIA_RECORD_POS *tail_positions)
+{
+  uint page_count;
+  extent->extent=       extent_info;
+  extent->extent_count= extents;
+  extent->page=         page_korr(extent_info);         /* First extent */
+  page_count=           uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE);
+  /* TAIL_BIT in the stored count marks the extent as a tail */
+  extent->tail=         page_count & TAIL_BIT;
+  if (extent->tail)
+  {
+    /* For a tail the remaining bits are the directory entry on the page */
+    extent->page_count=   1;
+    extent->tail_row_nr=  page_count & ~TAIL_BIT;
+  }
+  else
+    extent->page_count=   page_count;
+  extent->tail_positions= tail_positions;
+  extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED;
+}
+
+
+/*
+  Read next extent
+
+  SYNOPSIS
+    read_next_extent()
+    info                Maria handler
+    extent              Pointer to current extent (this is updated to point
+                        to next)
+    end_of_data         Pointer to end of data in read block (out)
+
+  NOTES
+    New block is read into info->buff
+
+    For a tail, the position of the tail is also appended to
+    extent->tail_positions. If extent->lock_for_tail_pages asks for a lock,
+    the tail page is added to info->pinned_pages.
+
+  RETURN
+    0   Error;  my_errno is set
+    #   Pointer to start of data in read block
+        In this case end_of_data is updated to point to end of data.
+*/
+
+static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
+                               uchar **end_of_data)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *buff, *data;
+  MARIA_PINNED_PAGE page_link;
+  enum pagecache_page_lock lock;
+  DBUG_ENTER("read_next_extent");
+
+  if (!extent->page_count)
+  {
+    /* Current extent is used up; decode the next one in the list */
+    uint page_count;
+    if (!--extent->extent_count)
+      goto crashed;
+    extent->extent+=    ROW_EXTENT_SIZE;
+    extent->page=       page_korr(extent->extent);
+    page_count=         uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE);
+    if (!page_count)
+      goto crashed;
+    extent->tail=       page_count & TAIL_BIT;
+    if (extent->tail)
+      extent->tail_row_nr= page_count & ~TAIL_BIT;
+    else
+      extent->page_count= page_count;
+    DBUG_PRINT("info",("New extent. Page: %lu page_count: %u tail_flag: %d",
+                       (ulong) extent->page, extent->page_count,
+                       extent->tail != 0));
+  }
+  extent->first_extent= 0;
+
+  /* Only tail pages may be read with a lock */
+  lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
+  if (extent->tail)
+    lock= extent->lock_for_tail_pages;
+
+  DBUG_ASSERT(share->pagecache->block_size == share->block_size);
+  if (!(buff= pagecache_read(share->pagecache,
+                             &info->dfile, extent->page, 0,
+                             info->buff, share->page_type,
+                             lock, &page_link.link)))
+  {
+    /* check if we tried to read over end of file (ie: bad data in record) */
+    if ((extent->page + 1) * share->block_size > info->state->data_file_length)
+      goto crashed;
+    DBUG_RETURN(0);
+  }
+
+  if (!extent->tail)
+  {
+    /* Full data page */
+    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
+      goto crashed;
+    extent->page++;                             /* point to next page */
+    extent->page_count--;
+    *end_of_data= buff + share->block_size;
+    info->cur_row.full_page_count++;            /* For maria_chk */
+    /* Data starts directly after the LSN and the page type */
+    DBUG_RETURN(extent->data_start= buff + LSN_SIZE + PAGE_TYPE_SIZE);
+  }
+
+  /* Found tail */
+  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
+  {
+    /* Read during redo */
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+  }
+
+  if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
+    goto crashed;
+  /* Remember where the tail is */
+  *(extent->tail_positions++)= ma_recordpos(extent->page,
+                                            extent->tail_row_nr);
+  info->cur_row.tail_count++;                   /* For maria_chk */
+
+  if (!(data= get_record_position(buff, share->block_size,
+                                  extent->tail_row_nr,
+                                  end_of_data)))
+    goto crashed;
+  extent->data_start= data;
+  extent->page_count= 0;                        /* No more data in extent */
+  DBUG_RETURN(data);
+
+
+crashed:
+  my_errno= HA_ERR_WRONG_IN_RECORD;             /* File crashed */
+  DBUG_PRINT("error", ("wrong extent information"));
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Read data that may be split over many blocks
+
+  SYNOPSIS
+    read_long_data()
+    info                Maria handler
+    to                  Store result string here (this is allocated)
+    extent              Pointer to current extent position
+    data                Current
position in buffer
    end_of_data         End of data in buffer
    length              Number of bytes to copy to 'to'

  NOTES
    When we have to read a new buffer, it's read into info->buff

    *data and *end_of_data are updated to reflect what has been consumed,
    also when a new extent had to be read.

    The copy is done in a for(;;) loop that takes what is left in the
    current block and then continues with the next extent until 'length'
    bytes have been copied.

  RETURN
    0   ok
    1   error
*/

static my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length,
                              MARIA_EXTENT_CURSOR *extent,
                              uchar **data, uchar **end_of_data)
{
  DBUG_ENTER("read_long_data");
  DBUG_PRINT("enter", ("length: %lu", length));
  DBUG_ASSERT(*data <= *end_of_data);

  /*
    Fields are never split in middle. This means that if length > rest-of-data
    we should start reading from the next extent.  The reason we may have
    data left on the page is that the fixed part of the row was less than
    min_row_length and in this case the head block was extended to
    min_row_length.

    This may change in the future, which is why we have the loop written
    the way it's written.
  */
  if (extent->first_extent && length > (ulong) (*end_of_data - *data))
    *end_of_data= *data;

  for(;;)
  {
    uint left_length;
    left_length= (uint) (*end_of_data - *data);
    if (likely(left_length >= length))
    {
      /* Everything that is still wanted is in the current block */
      memcpy(to, *data, length);
      (*data)+= length;
      DBUG_RETURN(0);
    }
    /* Take what is left in this block and continue in the next extent */
    memcpy(to, *data, left_length);
    to+= left_length;
    length-= left_length;
    if (!(*data= read_next_extent(info, extent, end_of_data)))
      break;
  }
  DBUG_RETURN(1);
}


/*
  Read a record from page (helper function for _ma_read_block_record())

  SYNOPSIS
    _ma_read_block_record2()
    info                Maria handler
    record              Store record here
    data                Start of head data for row
    end_of_data         End of data for row

  NOTES
    The head page is already read by caller
    Following data is update in info->cur_row:

    cur_row.head_length is set to size of entry in head block
    cur_row.tail_positions is set to point to all tail blocks
    cur_row.extents points to extents data
    cur_row.extents_counts contains number of extents
    cur_row.empty_bits is set to empty bits
    cur_row.field_lengths contains packed length of all fields
    cur_row.blob_length contains total length of all blobs.
+ + RETURN + 0 ok + # Error code +*/ + +int _ma_read_block_record2(MARIA_HA *info, uchar *record, + uchar *data, uchar *end_of_data) +{ + MARIA_SHARE *share= info->s; + uchar *field_length_data, *blob_buffer, *start_of_data; + uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths; + my_bool found_blob= 0; + MARIA_EXTENT_CURSOR extent; + MARIA_COLUMNDEF *column, *end_column; + MARIA_ROW *cur_row= &info->cur_row; + DBUG_ENTER("_ma_read_block_record2"); + + LINT_INIT(field_length_data); + LINT_INIT(blob_buffer); + + start_of_data= data; + flag= (uint) (uchar) data[0]; + cur_null_bytes= share->base.original_null_bytes; + null_bytes= share->base.null_bytes; + cur_row->head_length= (uint) (end_of_data - data); + cur_row->full_page_count= cur_row->tail_count= 0; + cur_row->blob_length= 0; + + /* Skip trans header (for now, until we have MVCC csupport) */ + data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)]; + if (flag & ROW_FLAG_NULLS_EXTENDED) + cur_null_bytes+= data[-1]; + + row_extents= 0; + if (flag & ROW_FLAG_EXTENTS) + { + uint row_extent_size; + /* + Record is split over many data pages. 
+ Get number of extents and first extent + */ + get_key_length(row_extents, data); + cur_row->extents_count= row_extents; + row_extent_size= row_extents * ROW_EXTENT_SIZE; + if (cur_row->extents_buffer_length < row_extent_size && + _ma_alloc_buffer(&cur_row->extents, + &cur_row->extents_buffer_length, + row_extent_size)) + DBUG_RETURN(my_errno); + memcpy(cur_row->extents, data, ROW_EXTENT_SIZE); + data+= ROW_EXTENT_SIZE; + init_extent(&extent, cur_row->extents, row_extents, + cur_row->tail_positions); + } + else + { + cur_row->extents_count= 0; + (*cur_row->tail_positions)= 0; + extent.page_count= 0; + extent.extent_count= 1; + } + extent.first_extent= 1; + + field_lengths= 0; + if (share->base.max_field_lengths) + { + get_key_length(field_lengths, data); + cur_row->field_lengths_length= field_lengths; +#ifdef SANITY_CHECKS + if (field_lengths > share->base.max_field_lengths) + goto err; +#endif + } + + if (share->calc_checksum) + cur_row->checksum= (uint) (uchar) *data++; + /* data now points on null bits */ + memcpy(record, data, cur_null_bytes); + if (unlikely(cur_null_bytes != null_bytes)) + { + /* + This only happens if we have added more NULL columns with + ALTER TABLE and are fetching an old, not yet modified old row + */ + bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes)); + } + data+= null_bytes; + /* We copy the empty bits to be able to use them for delete/update */ + memcpy(cur_row->empty_bits, data, share->base.pack_bytes); + data+= share->base.pack_bytes; + + /* TODO: Use field offsets, instead of just skipping them */ + data+= share->base.field_offsets * FIELD_OFFSET_SIZE; + + /* + Read row extents (note that first extent was already read into + cur_row->extents above) + */ + if (row_extents > 1) + { + if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE, + (row_extents - 1) * ROW_EXTENT_SIZE, + &extent, &data, &end_of_data)) + DBUG_RETURN(my_errno); + } + + /* + Data now points to start of fixed length field data that can't 
be null + or 'empty'. Note that these fields can't be split over blocks. + */ + for (column= share->columndef, + end_column= column + share->base.fixed_not_null_fields; + column < end_column; column++) + { + uint column_length= column->length; + if (data >= end_of_data && + !(data= read_next_extent(info, &extent, &end_of_data))) + goto err; + memcpy(record + column->offset, data, column_length); + data+= column_length; + } + + /* Read array of field lengths. This may be stored in several extents */ + if (field_lengths) + { + field_length_data= cur_row->field_lengths; + if (read_long_data(info, field_length_data, field_lengths, &extent, + &data, &end_of_data)) + DBUG_RETURN(my_errno); + } + + /* Read variable length data. Each of these may be split over many extents */ + for (end_column= share->columndef + share->base.fields; + column < end_column; column++) + { + enum en_fieldtype type= column->type; + uchar *field_pos= record + column->offset; + /* First check if field is present in record */ + if ((record[column->null_pos] & column->null_bit) || + (cur_row->empty_bits[column->empty_pos] & column->empty_bit)) + { + bfill(record + column->offset, column->fill_length, + type == FIELD_SKIP_ENDSPACE ? 
' ' : 0); + continue; + } + switch (type) { + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_SKIP_PRESPACE: + case FIELD_SKIP_ZERO: /* Fixed length field */ + if (data >= end_of_data && + !(data= read_next_extent(info, &extent, &end_of_data))) + goto err; + memcpy(field_pos, data, column->length); + data+= column->length; + break; + case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + /* Char that is space filled */ + uint length; + if (column->length <= 255) + length= (uint) (uchar) *field_length_data++; + else + { + length= uint2korr(field_length_data); + field_length_data+= 2; + } +#ifdef SANITY_CHECKS + if (length > column->length) + goto err; +#endif + if (read_long_data(info, field_pos, length, &extent, &data, + &end_of_data)) + DBUG_RETURN(my_errno); + bfill(field_pos + length, column->length - length, ' '); + break; + } + case FIELD_VARCHAR: + { + ulong length; + if (column->length <= 256) + { + length= (uint) (uchar) (*field_pos++= *field_length_data++); + } + else + { + length= uint2korr(field_length_data); + field_pos[0]= field_length_data[0]; + field_pos[1]= field_length_data[1]; + field_pos+= 2; + field_length_data+= 2; + } + if (read_long_data(info, field_pos, length, &extent, &data, + &end_of_data)) + DBUG_RETURN(my_errno); + break; + } + case FIELD_BLOB: + { + uint size_length= column->length - portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(size_length, field_length_data); + + if (!found_blob) + { + /* Calculate total length for all blobs */ + ulong blob_lengths= 0; + uchar *length_data= field_length_data; + MARIA_COLUMNDEF *blob_field= column; + + found_blob= 1; + for (; blob_field < end_column; blob_field++) + { + uint size_length; + if ((record[blob_field->null_pos] & blob_field->null_bit) || + (cur_row->empty_bits[blob_field->empty_pos] & + blob_field->empty_bit)) + continue; + size_length= blob_field->length - portable_sizeof_char_ptr; + blob_lengths+= _ma_calc_blob_length(size_length, length_data); + length_data+= 
size_length; + } + cur_row->blob_length= blob_lengths; + DBUG_PRINT("info", ("Total blob length: %lu", blob_lengths)); + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + blob_lengths)) + DBUG_RETURN(my_errno); + blob_buffer= info->rec_buff; + } + + memcpy(field_pos, field_length_data, size_length); + memcpy_fixed(field_pos + size_length, (uchar *) & blob_buffer, + sizeof(char*)); + field_length_data+= size_length; + + /* + After we have read one extent, then each blob is in it's own extent + */ + if (extent.first_extent && (ulong) (end_of_data - data) < blob_length) + end_of_data= data; /* Force read of next extent */ + + if (read_long_data(info, blob_buffer, blob_length, &extent, &data, + &end_of_data)) + DBUG_RETURN(my_errno); + blob_buffer+= blob_length; + break; + } + default: +#ifdef EXTRA_DEBUG + DBUG_ASSERT(0); /* purecov: deadcode */ +#endif + goto err; + } + continue; + } + + if (row_extents) + { + DBUG_PRINT("info", ("Row read: page_count: %u extent_count: %u", + extent.page_count, extent.extent_count)); + *extent.tail_positions= 0; /* End marker */ + if (extent.page_count) + goto err; + if (extent.extent_count > 1) + if (check_if_zero(extent.extent + ROW_EXTENT_SIZE, + (extent.extent_count-1) * ROW_EXTENT_SIZE)) + goto err; + } + else + { + DBUG_PRINT("info", ("Row read")); + /* + data should normally point to end_of_date. The only exception is if + the row is very short in which case we allocated 'min_row_length' data + for allowing the row to expand. 
+ */ + if (data != end_of_data && (uint) (end_of_data - start_of_data) > + info->s->base.min_row_length) + goto err; + } + + info->update|= HA_STATE_AKTIV; /* We have an active record */ + DBUG_RETURN(0); + +err: + /* Something was wrong with data on record */ + DBUG_PRINT("error", ("Found record with wrong data")); + DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD)); +} + + +/** @brief Read positions to tail blocks and full blocks + + @fn read_row_extent_info() + @param info Handler + + @notes + This function is a simpler version of _ma_read_block_record2() + The data about the used pages is stored in info->cur_row. + + @return Status + @retval 0 ok + @retval 1 Error. my_errno contains error number +*/ + +static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff, + uint record_number) +{ + MARIA_SHARE *share= info->s; + uchar *data, *end_of_data; + uint flag, row_extents, field_lengths; + MARIA_EXTENT_CURSOR extent; + DBUG_ENTER("read_row_extent_info"); + + if (!(data= get_record_position(buff, share->block_size, + record_number, &end_of_data))) + DBUG_RETURN(1); /* Wrong in record */ + + flag= (uint) (uchar) data[0]; + /* Skip trans header */ + data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)]; + + row_extents= 0; + if (flag & ROW_FLAG_EXTENTS) + { + uint row_extent_size; + /* + Record is split over many data pages. 
+ Get number of extents and first extent + */ + get_key_length(row_extents, data); + row_extent_size= row_extents * ROW_EXTENT_SIZE; + if (info->cur_row.extents_buffer_length < row_extent_size && + _ma_alloc_buffer(&info->cur_row.extents, + &info->cur_row.extents_buffer_length, + row_extent_size)) + DBUG_RETURN(1); + memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE); + data+= ROW_EXTENT_SIZE; + init_extent(&extent, info->cur_row.extents, row_extents, + info->cur_row.tail_positions); + extent.first_extent= 1; + } + else + (*info->cur_row.tail_positions)= 0; + info->cur_row.extents_count= row_extents; + + if (share->base.max_field_lengths) + get_key_length(field_lengths, data); + + if (share->calc_checksum) + info->cur_row.checksum= (uint) (uchar) *data++; + if (row_extents > 1) + { + MARIA_RECORD_POS *tail_pos; + uchar *extents, *end; + + data+= share->base.null_bytes; + data+= share->base.pack_bytes; + data+= share->base.field_offsets * FIELD_OFFSET_SIZE; + + /* + Read row extents (note that first extent was already read into + info->cur_row.extents above) + Lock tails with write lock as we will delete them later. 
+ */ + extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED; + if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE, + (row_extents - 1) * ROW_EXTENT_SIZE, + &extent, &data, &end_of_data)) + DBUG_RETURN(1); + + /* Update tail_positions with pointer to tails */ + tail_pos= info->cur_row.tail_positions; + for (extents= info->cur_row.extents, end= extents+ row_extents; + extents < end; + extents += ROW_EXTENT_SIZE) + { + ulonglong page= uint5korr(extents); + uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE); + if (page_count & TAIL_BIT) + *(tail_pos++)= ma_recordpos(page, (page_count & ~TAIL_BIT)); + } + *tail_pos= 0; /* End marker */ + } + DBUG_RETURN(0); +} + + + +/* + Read a record based on record position + + @fn _ma_read_block_record() + @param info Maria handler + @param record Store record here + @param record_pos Record position + + @return Status + @retval 0 ok + @retval # Error number +*/ + +int _ma_read_block_record(MARIA_HA *info, uchar *record, + MARIA_RECORD_POS record_pos) +{ + uchar *data, *end_of_data, *buff; + uint offset; + uint block_size= info->s->block_size; + DBUG_ENTER("_ma_read_block_record"); + DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos)); + + offset= ma_recordpos_to_dir_entry(record_pos); + + DBUG_ASSERT(info->s->pagecache->block_size == block_size); + if (!(buff= pagecache_read(info->s->pagecache, + &info->dfile, ma_recordpos_to_page(record_pos), 0, + info->buff, info->s->page_type, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + DBUG_RETURN(my_errno); + DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == HEAD_PAGE); + if (!(data= get_record_position(buff, block_size, offset, &end_of_data))) + { + DBUG_PRINT("error", ("Wrong directory entry in data block")); + my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */ + DBUG_RETURN(HA_ERR_WRONG_IN_RECORD); + } + DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data)); +} + + +/* compare unique constraint between stored rows */ + +my_bool 
_ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
                     const uchar *record, MARIA_RECORD_POS pos)
{
  /*
    Reads the row stored at 'pos' into a temporary record and compares it
    with 'record' using _ma_unique_comp().
    Returns 1 if the read failed, the temporary record could not be
    allocated or _ma_unique_comp() returned non zero; otherwise 0.
  */
  uchar *org_rec_buff, *old_record;
  size_t org_rec_buff_size;
  int error;
  DBUG_ENTER("_ma_cmp_block_unique");

  if (!(old_record= my_alloca(info->s->base.reclength)))
    DBUG_RETURN(1);

  /* Don't let the compare destroy blobs that may be in use */
  org_rec_buff=      info->rec_buff;
  org_rec_buff_size= info->rec_buff_size;
  if (info->s->base.blobs)
  {
    /* Force realloc of record buffer*/
    info->rec_buff= 0;
    info->rec_buff_size= 0;
  }
  error= _ma_read_block_record(info, old_record, pos);
  if (!error)
    error= _ma_unique_comp(def, record, old_record, def->null_are_equal);
  if (info->s->base.blobs)
  {
    /* Drop the buffer allocated by the read and restore the original one */
    my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
    info->rec_buff=      org_rec_buff;
    info->rec_buff_size= org_rec_buff_size;
  }
  DBUG_PRINT("exit", ("result: %d", error));
  my_afree(old_record);
  DBUG_RETURN(error != 0);
}


/****************************************************************************
  Table scan
****************************************************************************/

/*
  Allocate buffers for table scan

  SYNOPSIS
   _ma_scan_init_block_record(MARIA_HA *info)

  IMPLEMENTATION
    We allocate one buffer for the current bitmap and one buffer for the
    current page. Both live in a single allocation of 2 * block_size
    bytes: the bitmap first, the page buffer directly after it.

  RETURN
    0  ok
    1  error (couldn't allocate memory or disk error)
*/

my_bool _ma_scan_init_block_record(MARIA_HA *info)
{
  DBUG_ENTER("_ma_scan_init_block_record");
  /*
    bitmap_buff may already be allocated if this is the second call to
    rnd_init() without a rnd_end() in between, see sql/handler.h
  */
  if (!(info->scan.bitmap_buff ||
        ((info->scan.bitmap_buff=
          (uchar *) my_malloc(info->s->block_size * 2, MYF(MY_WME))))))
    DBUG_RETURN(1);
  info->scan.page_buff= info->scan.bitmap_buff + info->s->block_size;
  info->scan.bitmap_end= info->scan.bitmap_buff + info->s->bitmap.total_size;

  /*
    Set scan variables to get _ma_scan_block_record() to start with
    reading bitmap
  */
  info->scan.number_of_rows= 0;
  info->scan.bitmap_pos= info->scan.bitmap_end;
  /*
    Start one bitmap range before page 0; _ma_scan_block_record() adds
    pages_covered before it reads a bitmap.
  */
  info->scan.bitmap_page= (ulong) - (long) info->s->bitmap.pages_covered;
  /*
    We have to flush bitmap as we will read the bitmap from the page cache
    while scanning rows
  */
  DBUG_RETURN(_ma_flush_bitmap(info->s));
}


/* Free buffers allocated by _ma_scan_init_block_record() */

void _ma_scan_end_block_record(MARIA_HA *info)
{
  DBUG_ENTER("_ma_scan_end_block_record");
  my_free(info->scan.bitmap_buff, MYF(MY_ALLOW_ZERO_PTR));
  /* Reset so that a later _ma_scan_init_block_record() allocates again */
  info->scan.bitmap_buff= 0;
  DBUG_VOID_RETURN;
}


/*
  Read next record while scanning table

  SYNOPSIS
    _ma_scan_block_record()
    info                Maria handler
    record              Store found here
    record_pos          Value stored in info->cur_row.next_pos after last call
    skip_deleted

  NOTES
    - One must have called mi_scan() before this
    - In this version, we don't actually need record_pos, we as easily
      use a variable in info->scan

  IMPLEMENTATION
    Current code uses a lot of goto's to separate the different kind of
    states we may be in. This gives us a minimum of executed if's for
    the normal cases.  I tried several different ways to code this, but
    the current one was in the end the most readable and fastest.
+ + RETURN + 0 ok + # Error code +*/ + +int _ma_scan_block_record(MARIA_HA *info, uchar *record, + MARIA_RECORD_POS record_pos, + my_bool skip_deleted __attribute__ ((unused))) +{ + uint block_size; + my_off_t filepos; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_scan_block_record"); + +restart_record_read: + /* Find next row in current page */ + if (likely(record_pos < info->scan.number_of_rows)) + { + uint length, offset; + uchar *data, *end_of_data; + + while (!(offset= uint2korr(info->scan.dir))) + { + info->scan.dir-= DIR_ENTRY_SIZE; + record_pos++; +#ifdef SANITY_CHECKS + if (info->scan.dir < info->scan.dir_end) + goto err; +#endif + } + /* found row */ + info->cur_row.lastpos= info->scan.row_base_page + record_pos; + info->cur_row.nextpos= record_pos + 1; + data= info->scan.page_buff + offset; + length= uint2korr(info->scan.dir + 2); + end_of_data= data + length; + info->scan.dir-= DIR_ENTRY_SIZE; /* Point to previous row */ +#ifdef SANITY_CHECKS + if (end_of_data > info->scan.dir_end || + offset < PAGE_HEADER_SIZE || length < share->base.min_block_length) + goto err; +#endif + DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos)); + DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data)); + } + + /* Find next head page in current bitmap */ +restart_bitmap_scan: + block_size= share->block_size; + if (likely(info->scan.bitmap_pos < info->scan.bitmap_end)) + { + uchar *data= info->scan.bitmap_pos; + longlong bits= info->scan.bits; + uint bit_pos= info->scan.bit_pos; + + do + { + while (likely(bits)) + { + uint pattern= bits & 7; + bits >>= 3; + bit_pos++; + if (pattern > 0 && pattern <= 4) + { + /* Found head page; Read it */ + ulong page; + info->scan.bitmap_pos= data; + info->scan.bits= bits; + info->scan.bit_pos= bit_pos; + page= (info->scan.bitmap_page + 1 + + (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1); + info->scan.row_base_page= ma_recordpos(page, 0); + if (!(pagecache_read(share->pagecache, + &info->dfile, + 
page, 0, info->scan.page_buff, + share->page_type, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + DBUG_RETURN(my_errno); + if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != + HEAD_PAGE) || + (info->scan.number_of_rows= + (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0) + { + DBUG_PRINT("error", ("Wrong page header")); + DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD)); + } + DBUG_PRINT("info", ("Page %lu has %u rows", + (ulong) page, info->scan.number_of_rows)); + info->scan.dir= (info->scan.page_buff + block_size - + PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE); + info->scan.dir_end= (info->scan.dir - + (info->scan.number_of_rows - 1) * + DIR_ENTRY_SIZE); + record_pos= 0; + goto restart_record_read; + } + } + for (data+= 6; data < info->scan.bitmap_end; data+= 6) + { + bits= uint6korr(data); + /* Skip not allocated pages and blob / full tail pages */ + if (bits && bits != LL(07777777777777777)) + break; + } + bit_pos= 0; + } while (data < info->scan.bitmap_end); + } + + /* Read next bitmap */ + info->scan.bitmap_page+= share->bitmap.pages_covered; + filepos= (my_off_t) info->scan.bitmap_page * block_size; + if (unlikely(filepos >= info->state->data_file_length)) + { + DBUG_PRINT("info", ("Found end of file")); + DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE)); + } + DBUG_PRINT("info", ("Reading bitmap at %lu", + (ulong) info->scan.bitmap_page)); + if (!(pagecache_read(share->pagecache, &info->dfile, + info->scan.bitmap_page, + 0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + DBUG_RETURN(my_errno); + /* Skip scanning 'bits' in bitmap scan code */ + info->scan.bitmap_pos= info->scan.bitmap_buff - 6; + info->scan.bits= 0; + goto restart_bitmap_scan; + +err: + DBUG_PRINT("error", ("Wrong data on page")); + DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD)); +} + + +/* + Compare a row against a stored one + + NOTES + Not implemented, as block record is not supposed to be used in a shared + global environment +*/ + +my_bool 
_ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)), + const uchar *record __attribute__ ((unused))) +{ + return 0; +} + + +#ifndef DBUG_OFF + +static void _ma_print_directory(uchar *buff, uint block_size) +{ + uint max_entry= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET], row= 0; + uint end_of_prev_row= PAGE_HEADER_SIZE; + uchar *dir, *end; + + dir= buff + block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE; + end= buff + block_size - DIR_ENTRY_SIZE - PAGE_SUFFIX_SIZE; + + DBUG_LOCK_FILE; + fprintf(DBUG_FILE,"Directory dump (pos:length):\n"); + + for (row= 1; dir <= end ; end-= DIR_ENTRY_SIZE, row++) + { + uint offset= uint2korr(end); + uint length= uint2korr(end+2); + fprintf(DBUG_FILE, " %4u:%4u", offset, offset ? length : 0); + if (!(row % (80/12))) + fputc('\n', DBUG_FILE); + if (offset) + { + DBUG_ASSERT(offset >= end_of_prev_row); + end_of_prev_row= offset + length; + } + } + fputc('\n', DBUG_FILE); + fflush(DBUG_FILE); + DBUG_UNLOCK_FILE; +} +#endif /* DBUG_OFF */ + + +/* + Store an integer with simple packing + + SYNOPSIS + ma_store_integer() + to Store the packed integer here + nr Integer to store + + NOTES + This is mostly used to store field numbers and lengths of strings. + We have to cast the result for the LL() becasue of a bug in Forte CC + compiler. + + Packing used is: + nr < 251 is stored as is (in 1 byte) + Numbers that require 1-4 bytes are stored as char(250+byte_length), data + Bigger numbers are stored as 255, data as ulonglong (not yet done). 
+ + RETURN + Position in 'to' after the packed length +*/ + +uchar *ma_store_length(uchar *to, ulong nr) +{ + if (nr < 251) + { + *to=(uchar) nr; + return to+1; + } + if (nr < 65536) + { + if (nr <= 255) + { + to[0]= (uchar) 251; + to[1]= (uchar) nr; + return to+2; + } + to[0]= (uchar) 252; + int2store(to+1, nr); + return to+3; + } + if (nr < 16777216) + { + *to++= (uchar) 253; + int3store(to, nr); + return to+3; + } + *to++= (uchar) 254; + int4store(to, nr); + return to+4; +} + + +/* Calculate how many bytes needed to store a number */ + +uint ma_calc_length_for_store_length(ulong nr) +{ + if (nr < 251) + return 1; + if (nr < 65536) + { + if (nr <= 255) + return 2; + return 3; + } + if (nr < 16777216) + return 4; + return 5; +} + + +/* Retrive a stored number */ + +static ulong ma_get_length(uchar **packet) +{ + reg1 uchar *pos= *packet; + if (*pos < 251) + { + (*packet)++; + return (ulong) *pos; + } + if (*pos == 251) + { + (*packet)+= 2; + return (ulong) pos[1]; + } + if (*pos == 252) + { + (*packet)+= 3; + return (ulong) uint2korr(pos+1); + } + if (*pos == 253) + { + (*packet)+= 4; + return (ulong) uint3korr(pos+1); + } + DBUG_ASSERT(*pos == 254); + (*packet)+= 5; + return (ulong) uint4korr(pos+1); +} + + +/* + Fill array with pointers to field parts to be stored in log for insert + + SYNOPSIS + fill_insert_undo_parts() + info Maria handler + record Inserted row + log_parts Store pointers to changed memory areas here + log_parts_count See RETURN + + NOTES + We have information in info->cur_row about the read row. + + RETURN + length of data in log_parts. 
+ log_parts_count contains number of used log_parts +*/ + +static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record, + LEX_STRING *log_parts, + uint *log_parts_count) +{ + MARIA_SHARE *share= info->s; + MARIA_COLUMNDEF *column, *end_column; + uchar *field_lengths= info->cur_row.field_lengths; + size_t row_length; + MARIA_ROW *cur_row= &info->cur_row; + LEX_STRING *start_log_parts; + DBUG_ENTER("fill_insert_undo_parts"); + + start_log_parts= log_parts; + + /* Store null bits */ + log_parts->str= (char*) record; + log_parts->length= share->base.null_bytes; + row_length= log_parts->length; + log_parts++; + + /* Stored bitmap over packed (zero length or all-zero fields) */ + log_parts->str= info->cur_row.empty_bits; + log_parts->length= share->base.pack_bytes; + row_length+= log_parts->length; + log_parts++; + + if (share->base.max_field_lengths) + { + /* Store length of all not empty char, varchar and blob fields */ + log_parts->str= field_lengths-2; + log_parts->length= info->cur_row.field_lengths_length+2; + int2store(log_parts->str, info->cur_row.field_lengths_length); + row_length+= log_parts->length; + log_parts++; + } + + if (share->base.blobs) + { + /* Store total blob length to make buffer allocation easier during undo */ + log_parts->str= info->length_buff; + log_parts->length= (uint) (ma_store_length(log_parts->str, + info->cur_row.blob_length) - + (uchar*) log_parts->str); + row_length+= log_parts->length; + log_parts++; + } + + /* Handle constant length fields that are always present */ + for (column= share->columndef, + end_column= column+ share->base.fixed_not_null_fields; + column < end_column; + column++) + { + log_parts->str= (char*) record + column->offset; + log_parts->length= column->length; + row_length+= log_parts->length; + log_parts++; + } + + /* Handle NULL fields and CHAR/VARCHAR fields */ + for (end_column= share->columndef + share->base.fields - share->base.blobs; + column < end_column; + column++) + { + const uchar 
*column_pos; + size_t column_length; + if ((record[column->null_pos] & column->null_bit) || + cur_row->empty_bits[column->empty_pos] & column->empty_bit) + continue; + + column_pos= record+ column->offset; + column_length= column->length; + + switch (column->type) { + case FIELD_CHECK: + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_ZERO: + case FIELD_SKIP_PRESPACE: /* Not packed */ + case FIELD_SKIP_ZERO: /* Fixed length field */ + break; + case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + if (column->length <= 255) + column_length= *field_lengths++; + else + { + column_length= uint2korr(field_lengths); + field_lengths+= 2; + } + break; + } + case FIELD_VARCHAR: + { + if (column->fill_length == 1) + column_length= *field_lengths; + else + column_length= uint2korr(field_lengths); + field_lengths+= column->fill_length; + column_pos+= column->fill_length; + break; + } + default: + DBUG_ASSERT(0); + } + log_parts->str= (char*) column_pos; + log_parts->length= column_length; + row_length+= log_parts->length; + log_parts++; + } + + /* Add blobs */ + for (end_column+= share->base.blobs; column < end_column; column++) + { + const uchar *field_pos= record + column->offset; + uint size_length= column->length - portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(size_length, field_pos); + + /* + We don't have to check for null, as blob_length is guranteed to be 0 + if the blob is null + */ + if (blob_length) + { + char *blob_pos; + memcpy_fixed((uchar*) &blob_pos, record + column->offset + size_length, + sizeof(blob_pos)); + log_parts->str= blob_pos; + log_parts->length= blob_length; + row_length+= log_parts->length; + log_parts++; + } + } + *log_parts_count= (log_parts - start_log_parts); + DBUG_RETURN(row_length); +} + + +/* + Fill array with pointers to field parts to be stored in log for update + + SYNOPSIS + fill_update_undo_parts() + info Maria handler + oldrec Original row + newrec New row + log_parts Store pointers to changed memory areas here + 
log_parts_count     See RETURN
+
+  IMPLEMENTATION
+    Format of undo record:
+
+    Fields are stored in same order as the field array.
+
+    Offset to changed field data (packed)
+
+    For each changed field
+      Fieldnumber (packed)
+      Length, if variable length field (packed)
+
+    For each changed field
+     Data
+
+    Packing is using ma_store_length()
+
+    The reason we store field numbers & length separated from data (ie, not
+    after each other) is to get better cpu caching when we loop over
+    fields (as we probably don't have to access data for each field when we
+    want to read an old row through the undo log record).
+
+    As a special case, we use '255' for the field number of the null bitmap.
+
+  RETURN
+    length of data in log_parts.
+    log_parts_count contains number of used log_parts
+*/
+
+static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
+                                     const uchar *newrec,
+                                     LEX_STRING *log_parts,
+                                     uint *log_parts_count)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_COLUMNDEF *column, *end_column;
+  MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row;
+  uchar *field_data, *start_field_data;
+  uchar *old_field_lengths= old_row->field_lengths;
+  uchar *new_field_lengths= new_row->field_lengths;
+  size_t row_length= 0;
+  uint field_lengths;
+  LEX_STRING *start_log_parts;
+  my_bool new_column_is_empty;
+  DBUG_ENTER("fill_update_undo_parts");
+
+  start_log_parts= log_parts;
+
+  /*
+    First log part is for number of fields, field numbers and lengths
+    The +4 is to reserve place for the number of changed fields.
+    (The reserved bytes are filled in, backwards from start_field_data,
+    at the end of this function.)
+  */
+  start_field_data= field_data= info->update_field_data + 4;
+  log_parts++;
+
+  if (memcmp(oldrec, newrec, share->base.null_bytes))
+  {
+    /* Store changed null bits */
+    *field_data++=       (uchar) 255;           /* Special case */
+    log_parts->str=      (char*) oldrec;
+    log_parts->length=   share->base.null_bytes;
+    row_length=          log_parts->length;
+    log_parts++;
+  }
+
+  /* Handle constant length fields (only field number is stored; no length) */
+  for (column= share->columndef,
+       end_column= column+ share->base.fixed_not_null_fields;
+       column < end_column;
+       column++)
+  {
+    if (memcmp(oldrec + column->offset, newrec + column->offset,
+               column->length))
+    {
+      field_data= ma_store_length(field_data,
+                                  (uint) (column - share->columndef));
+      log_parts->str=    (char*) oldrec + column->offset;
+      log_parts->length= column->length;
+      row_length+=       column->length;
+      log_parts++;
+    }
+  }
+
+  /*
+    Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's
+
+    NOTE(review): the two early 'continue' paths below (old value NULL, or
+    old value empty while the new one is not) do not advance
+    new_field_lengths. If new_row->field_lengths holds an entry for such a
+    CHAR/VARCHAR column, later columns would read the wrong new length.
+    Confirm against how new_row->field_lengths is filled.
+  */
+  for (end_column= share->columndef + share->base.fields;
+       column < end_column;
+       column++)
+  {
+    const uchar *new_column_pos, *old_column_pos;
+    size_t new_column_length, old_column_length;
+
+    /* First check if old column is null or empty */
+    if (oldrec[column->null_pos] & column->null_bit)
+    {
+      /*
+        It's safe to skip this one as either the new column is also null
+        (no change) or the new_column is not null, in which case the null-bit
+        maps differed and we have already stored the null bitmap.
+      */
+      continue;
+    }
+    if (old_row->empty_bits[column->empty_pos] & column->empty_bit)
+    {
+      if (new_row->empty_bits[column->empty_pos] & column->empty_bit)
+        continue;                               /* Both are empty; skip */
+
+      /* Store null length column */
+      field_data= ma_store_length(field_data,
+                                  (uint) (column - share->columndef));
+      field_data= ma_store_length(field_data, 0);
+      continue;
+    }
+    /*
+      Remember if the 'new' value is empty (as in this case we must always
+      log the original value)
+    */
+    new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) ||
+                          (new_row->empty_bits[column->empty_pos] &
+                           column->empty_bit));
+
+    old_column_pos=      oldrec + column->offset;
+    new_column_pos=      newrec + column->offset;
+    old_column_length= new_column_length= column->length;
+
+    switch (column->type) {
+    case FIELD_CHECK:
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_ZERO:
+    case FIELD_SKIP_PRESPACE:                   /* Not packed */
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      break;
+    case FIELD_VARCHAR:
+      new_column_length--;                      /* Skip length prefix */
+      old_column_pos+= column->fill_length;
+      new_column_pos+= column->fill_length;
+      /* Fall through */
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      /* Actual lengths come from the rows' field_lengths arrays */
+      if (new_column_length <= 255)
+      {
+        old_column_length= *old_field_lengths++;
+        if (!new_column_is_empty)
+          new_column_length= *new_field_lengths++;
+      }
+      else
+      {
+        old_column_length= uint2korr(old_field_lengths);
+        old_field_lengths+= 2;
+        if (!new_column_is_empty)
+        {
+          new_column_length= uint2korr(new_field_lengths);
+          new_field_lengths+= 2;
+        }
+      }
+      break;
+    }
+    case FIELD_BLOB:
+    {
+      /* Record holds blob length followed by a pointer to the blob data */
+      uint size_length= column->length - portable_sizeof_char_ptr;
+      old_column_length= _ma_calc_blob_length(size_length, old_column_pos);
+      memcpy_fixed((uchar*) &old_column_pos,
+                   oldrec + column->offset + size_length,
+                   sizeof(old_column_pos));
+      if (!new_column_is_empty)
+      {
+        new_column_length= _ma_calc_blob_length(size_length, new_column_pos);
+        /* old_column_pos and new_column_pos have the same pointer type */
+        memcpy_fixed((uchar*) &new_column_pos,
+                     newrec + column->offset + size_length,
+                     sizeof(old_column_pos));
+      }
+      break;
+    }
+    default:
+      DBUG_ASSERT(0);
+    }
+
+    /* Log the old value if the column changed or the new value is empty */
+    if (new_column_is_empty || new_column_length != old_column_length ||
+        memcmp(old_column_pos, new_column_pos, new_column_length))
+    {
+      field_data= ma_store_length(field_data,
+                                  (uint) (column - share->columndef));
+      field_data= ma_store_length(field_data, old_column_length);
+
+      log_parts->str=     (char*) old_column_pos;
+      log_parts->length=  old_column_length;
+      row_length+=        old_column_length;
+      log_parts++;
+    }
+  }
+
+  *log_parts_count= (log_parts - start_log_parts);
+
+  /*
+    Store length of field length data before the field/field_lengths.
+    The packed length is written just in front of start_field_data, into
+    the bytes reserved at the start of this function.
+  */
+  field_lengths= (field_data - start_field_data);
+  start_log_parts->str=  ((char*)
+                          (start_field_data -
+                           ma_calc_length_for_store_length(field_lengths)));
+  ma_store_length(start_log_parts->str, field_lengths);
+  start_log_parts->length= (size_t) ((char*) field_data -
+                                     start_log_parts->str);
+  row_length+= start_log_parts->length;
+  DBUG_RETURN(row_length);
+}
+
+/***************************************************************************
+  Applying of REDO log records
+***************************************************************************/
+
+/*
+  Apply LOGREC_REDO_INSERT_ROW_HEAD & LOGREC_REDO_INSERT_ROW_TAIL
+
+  SYNOPSIS
+    _ma_apply_redo_insert_row_head_or_tail()
+    info                Maria handler
+    lsn                 LSN to put on page
+    page_type           HEAD_PAGE or TAIL_PAGE
+    header              Header (without FILEID)
+    data                Data to be put on page
+    data_length         Length of data
+
+  RETURN
+    0   ok
+    #   Error number
+*/
+
+uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
+                                            uint page_type,
+                                            const uchar *header,
+                                            const uchar *data,
+                                            size_t data_length)
+{
+  MARIA_SHARE *share= info->s;
+  ulonglong page;
+  uint      rownr, empty_space;
+  uint      block_size= share->block_size;
+  uint      rec_offset;
+  uchar      *buff= info->keyread_buff, *dir;
+  DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail");
+
+  info->keyread_buff_used= 1;
+  page=  page_korr(header);
+  rownr= dirpos_korr(header+PAGE_STORE_SIZE);
+
+  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u data_length: %u",
+                       (ulong) ma_recordpos(page, rownr),
+                       (ulong) page, rownr, (uint) data_length));
+
+  if (((page + 1) * info->s->block_size) > info->state->data_file_length)
+  {
+    /*
+      New page at end of file. Note that the test above is also positive if
+      data_file_length is not a multiple of block_size (system crashed while
+      writing the last page): in this case we just extend the last page and
+      fill it entirely with zeroes, then the REDO will put correct data on
+      it.
+    */
+    DBUG_ASSERT(rownr == 0);
+    if (rownr != 0)
+      goto err;
+    make_empty_page(buff, block_size, page_type);
+    empty_space= (block_size - PAGE_OVERHEAD_SIZE);
+    rec_offset= PAGE_HEADER_SIZE;
+    dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
+  }
+  else
+  {
+    uint max_entry;
+    if (!(buff= pagecache_read(share->pagecache,
+                               &info->dfile,
+                               page, 0,
+                               buff, PAGECACHE_PLAIN_PAGE,
+                               PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+      DBUG_RETURN(my_errno);
+    if (lsn_korr(buff) >= lsn)
+    {
+      /* Already applied */
+
+      /* Fix bitmap, just in case */
+      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+        DBUG_RETURN(my_errno);
+      DBUG_RETURN(0);
+    }
+
+    max_entry= (uint) ((uchar*) buff)[DIR_COUNT_OFFSET];
+    if (((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type))
+    {
+      /*
+        This is a page that has been freed before and now should be
+        changed to new type.
+      */
+      if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE &&
+          (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != UNALLOCATED_PAGE)
+        goto err;
+      make_empty_page(buff, block_size, page_type);
+      empty_space= (block_size - PAGE_OVERHEAD_SIZE);
+      rec_offset= PAGE_HEADER_SIZE;
+      dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
+    }
+    else
+    {
+      /* Directory grows downwards from the page end; find entry 'rownr' */
+      dir= (buff + block_size - DIR_ENTRY_SIZE * (rownr + 1) -
+            PAGE_SUFFIX_SIZE);
+      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+
+      if (max_entry <= rownr)
+      {
+        /* Add directory entry first in directory and data last on page */
+        DBUG_ASSERT(max_entry == rownr);
+        if (max_entry != rownr)
+          goto err;
+        /*
+          dir + DIR_ENTRY_SIZE is the entry for row rownr-1; new data is
+          placed after that row (its offset + its length).
+          NOTE(review): if max_entry == 0 here, this reads past the
+          directory and 'max_entry - 1' below wraps; confirm that a page of
+          the right type can never have an empty directory.
+        */
+        rec_offset= (uint2korr(dir + DIR_ENTRY_SIZE) +
+                     uint2korr(dir + DIR_ENTRY_SIZE +2));
+        if ((uint) (dir - buff) < rec_offset + data_length)
+        {
+          /* Create place for directory & data */
+          compact_page(buff, block_size, max_entry - 1, 0);
+          rec_offset= (uint2korr(dir + DIR_ENTRY_SIZE) +
+                       uint2korr(dir + DIR_ENTRY_SIZE +2));
+          empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+          DBUG_ASSERT(!((uint) (dir - buff) < rec_offset + data_length));
+          if ((uint) (dir - buff) < rec_offset + data_length)
+            goto err;
+        }
+        buff[DIR_COUNT_OFFSET]= (uchar) max_entry+1;
+        int2store(dir, rec_offset);
+        empty_space-= DIR_ENTRY_SIZE;
+      }
+      else
+      {
+        uint length;
+        /*
+          Reuse old entry. This is empty if the command was an insert and
+          possibly used if the command was an update.
+        */
+        if (extend_area_on_page(buff, dir, rownr, block_size,
+                                data_length, &empty_space,
+                                &rec_offset, &length))
+          goto err;
+      }
+    }
+  }
+  /* Copy data; the length is stored in the second half of the dir entry */
+  int2store(dir+2, data_length);
+  memcpy(buff + rec_offset, data, data_length);
+  empty_space-= data_length;
+  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
+
+  /* Write modified page */
+  lsn_store(buff, lsn);
+  if (pagecache_write(share->pagecache,
+                      &info->dfile, page, 0,
+                      buff, PAGECACHE_PLAIN_PAGE,
+                      PAGECACHE_LOCK_LEFT_UNLOCKED,
+                      PAGECACHE_PIN_LEFT_UNPINNED,
+                      PAGECACHE_WRITE_DELAY, 0))
+    DBUG_RETURN(my_errno);
+
+  /* Fix bitmap */
+  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+    DBUG_RETURN(my_errno);
+
+  /*
+    Data page and bitmap page are in place, we can update data_file_length in
+    case we extended the file. We could not do it earlier: bitmap code tests
+    data_file_length to know if it has to create a new page or not.
+  */
+  {
+    my_off_t end_of_page= (page + 1) * info->s->block_size;
+    set_if_bigger(info->state->data_file_length, end_of_page);
+  }
+
+  DBUG_RETURN(0);
+
+err:
+  /* The log record does not match the page contents */
+  DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
+}
+
+
+/*
+  Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL
+
+  SYNOPSIS
+    _ma_apply_redo_purge_row_head_or_tail()
+    info                Maria handler
+    lsn                 LSN to put on page
+    page_type           HEAD_PAGE or TAIL_PAGE
+    header              Header (without FILEID)
+
+  NOTES
+    This function is very similar to delete_head_or_tail()
+
+  RETURN
+    0   ok
+    #   Error number
+*/
+
+uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
+                                           uint page_type,
+                                           const uchar *header)
+{
+  MARIA_SHARE *share= info->s;
+  ulonglong page;
+  uint      rownr, empty_space;
+  uint      block_size= share->block_size;
+  uchar      *buff= info->keyread_buff;
+  DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail");
+
+  page=  page_korr(header);
+  rownr= dirpos_korr(header+PAGE_STORE_SIZE);
+  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
+                       (ulong) ma_recordpos(page, rownr),
+                       (ulong) page, rownr));
+
+
info->keyread_buff_used= 1;
+
+  if (!(buff= pagecache_read(share->pagecache,
+                             &info->dfile,
+                             page, 0,
+                             buff, PAGECACHE_PLAIN_PAGE,
+                             PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+    DBUG_RETURN(my_errno);
+
+  if (lsn_korr(buff) >= lsn)
+  {
+    /*
+      Already applied
+      Note that in case the page is not anymore a head or tail page
+      a future redo will fix the bitmap.
+    */
+    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type)
+    {
+      empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET);
+      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE,
+                         empty_space))
+        DBUG_RETURN(my_errno);
+    }
+    DBUG_RETURN(0);
+  }
+
+  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type);
+
+  /* Remove the row's directory entry; empty_space is updated by the call */
+  if (delete_dir_entry(buff, block_size, rownr, &empty_space) < 0)
+    DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
+
+  lsn_store(buff, lsn);
+  if (pagecache_write(share->pagecache,
+                      &info->dfile, page, 0,
+                      buff, PAGECACHE_PLAIN_PAGE,
+                      PAGECACHE_LOCK_LEFT_UNLOCKED,
+                      PAGECACHE_PIN_LEFT_UNPINNED,
+                      PAGECACHE_WRITE_DELAY, 0))
+    DBUG_RETURN(my_errno);
+
+  /* This will work even if the page was marked as UNALLOCATED_PAGE */
+  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+    DBUG_RETURN(my_errno);
+
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief Apply LOGREC_REDO_PURGE_BLOCKS
+
+   @param  info            Maria handler
+   @param  lsn             LSN to put on the pages
+   @param  header          Header (without FILEID)
+
+   @note It marks the pages free in the bitmap, and sets the type of each
+   page to UNALLOCATED_PAGE.
+
+   @return Operation status
+     @retval 0      OK
+     @retval !=0    Error
+*/
+
+uint _ma_apply_redo_purge_blocks(MARIA_HA *info,
+                                 LSN lsn, const uchar *header)
+{
+  MARIA_SHARE *share= info->s;
+  ulonglong page;
+  uint      page_range, ranges;
+  uint      res= 0;
+  uchar     *buff= info->keyread_buff;
+  DBUG_ENTER("_ma_apply_redo_purge_blocks");
+
+  info->keyread_buff_used= 1;
+  /* Header is: number of ranges, then (start page, page count) per range */
+  ranges= pagerange_korr(header);
+  header+= PAGERANGE_STORE_SIZE;
+
+  while (ranges--)
+  {
+    uint i;
+    page= page_korr(header);
+    header+= PAGE_STORE_SIZE;
+    page_range= pagerange_korr(header);
+    header+= PAGERANGE_STORE_SIZE;
+
+    for (i= 0; i < page_range ; i++)
+    {
+      if (!(buff= pagecache_read(share->pagecache,
+                                 &info->dfile,
+                                 page+i, 0,
+                                 buff, PAGECACHE_PLAIN_PAGE,
+                                 PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+        DBUG_RETURN(my_errno);
+
+      if (lsn_korr(buff) >= lsn)
+      {
+        /* Already applied */
+        continue;
+      }
+      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
+      lsn_store(buff, lsn);
+      if (pagecache_write(share->pagecache,
+                          &info->dfile, page+i, 0,
+                          buff, PAGECACHE_PLAIN_PAGE,
+                          PAGECACHE_LOCK_LEFT_UNLOCKED,
+                          PAGECACHE_PIN_LEFT_UNPINNED,
+                          PAGECACHE_WRITE_DELAY, 0))
+        DBUG_RETURN(my_errno);
+    }
+    /** @todo leave bitmap lock to the bitmap code... */
+    pthread_mutex_lock(&share->bitmap.bitmap_lock);
+    res= _ma_reset_full_page_bits(info, &share->bitmap, page, page_range);
+    pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+    if (res)
+      DBUG_RETURN(res);
+  }
+  DBUG_RETURN(0);
+}
+
+/****************************************************************************
+ Applying of UNDO entries
+****************************************************************************/
+
+/**
+   @brief Execute undo of a row insert (delete the inserted row)
+
+   @param  info            Maria handler
+   @param  undo_lsn        LSN stored first in the CLR_END record
+   @param  header          Page number and directory position of the row
+
+   @note Once the head page has been pinned, every exit goes through 'err:'
+   so that the page is always unpinned.
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header)
+{
+  ulonglong page;
+  uint rownr;
+  LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+  uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + 1], *buff;
+  my_bool res= 1;
+  MARIA_PINNED_PAGE page_link;
+  /*
+    Must have a defined value: 'err:' passes it on even when no log record
+    was written. NOTE(review): assumes 0 means "no LSN" for
+    _ma_unpin_all_pages_and_finalize_row(); confirm.
+  */
+  LSN lsn= (LSN) 0;
+  DBUG_ENTER("_ma_apply_undo_row_insert");
+
+  page=  page_korr(header);
+  rownr= dirpos_korr(header + PAGE_STORE_SIZE);
+  DBUG_PRINT("enter", ("Page: %lu rownr: %u", (ulong) page, rownr));
+
+  /* Nothing is pinned yet, so a plain return is fine here */
+  if (!(buff= pagecache_read(info->s->pagecache,
+                             &info->dfile, page, 0,
+                             info->buff, info->s->page_type,
+                             PAGECACHE_LOCK_WRITE,
+                             &page_link.link)))
+    DBUG_RETURN(1);
+
+
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+  /* Page is now registered as pinned; must not return without unpinning */
+  if (read_row_extent_info(info, buff, rownr))
+    goto err;
+
+  if (delete_head_or_tail(info, page, rownr, 1, 1) ||
+      delete_tails(info, info->cur_row.tail_positions))
+    goto err;
+
+  if (info->cur_row.extents && free_full_pages(info, &info->cur_row))
+    goto err;
+
+  /* undo_lsn must be first for compression to work */
+  lsn_store(log_data, undo_lsn);
+  log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE]= LOGREC_UNDO_ROW_INSERT;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    (char*) log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+  if (translog_write_record(&lsn, LOGREC_CLR_END,
+                            info->trn, info, sizeof(log_data),
+                            TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                            log_data + LSN_STORE_SIZE))
+    goto err;
+
+  res= 0;
+err:
+  _ma_unpin_all_pages_and_finalize_row(info,
lsn);
+  DBUG_RETURN(res);
+}
+
+
+/**
+   @brief Execute undo of a row delete (insert the row back somewhere)
+
+   @param  info            Maria handler
+   @param  undo_lsn        LSN passed on to allocate_and_write_block_record()
+   @param  header          Logged row: null bits, empty bits, field lengths,
+                           total blob length (if blobs), then field data
+   @param  length          Not used as input; reused as scratch variable
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, size_t length)
+{
+  uchar *record;
+  const uchar *null_bits, *field_length_data;
+  MARIA_SHARE *share= info->s;
+  MARIA_ROW row;
+  uint *null_field_lengths;
+  ulong *blob_lengths;
+  MARIA_COLUMNDEF *column, *end_column;
+  my_bool res;
+  DBUG_ENTER("_ma_apply_undo_row_delete");
+
+  /*
+    Use cur row as a base; We need to make a copy as we will change
+    some buffers to point directly to 'header'
+  */
+  memcpy(&row, &info->cur_row, sizeof(row));
+  null_field_lengths= row.null_field_lengths;
+  blob_lengths= row.blob_lengths;
+
+  /*
+    Fill in 'row' with information about the row, like in
+    calc_record_size(), to be used by write_block_record()
+  */
+
+  row.normal_length= row.char_length= row.varchar_length=
+    row.blob_length= row.extents_count= row.field_lengths_length= 0;
+
+  null_bits= header;
+  header+= share->base.null_bytes;
+  row.empty_bits= (uchar*) header;
+  header+= share->base.pack_bytes;
+  if (share->base.max_field_lengths)
+  {
+    row.field_lengths_length= uint2korr(header);
+    row.field_lengths= (uchar*) header + 2 ;
+    header+= 2 + row.field_lengths_length;
+  }
+  /* Total length of all blobs; individual blobs must not be added again */
+  if (share->base.blobs)
+    row.blob_length= ma_get_length((uchar**) &header);
+
+  /* We need to build up a record (without blobs) in rec_buff */
+  if (!(record= my_malloc(share->base.reclength, MYF(MY_WME))))
+    DBUG_RETURN(1);
+
+  memcpy(record, null_bits, share->base.null_bytes);
+
+  /* Copy field information from header to record */
+
+  /* Handle constant length fields that are always present */
+  for (column= share->columndef,
+         end_column= column+ share->base.fixed_not_null_fields;
+       column < end_column;
+       column++)
+  {
+    memcpy(record + column->offset, header, column->length);
+    header+= column->length;
+  }
+
+  /* Handle NULL fields and CHAR/VARCHAR fields */
+  field_length_data= row.field_lengths;
+  for (end_column= share->columndef + share->base.fields;
+       column < end_column;
+       column++, null_field_lengths++)
+  {
+    if ((record[column->null_pos] & column->null_bit) ||
+        row.empty_bits[column->empty_pos] & column->empty_bit)
+    {
+      if (column->type != FIELD_BLOB)
+        *null_field_lengths= 0;
+      else
+        *blob_lengths++= 0;
+      /* Give skipped fields a defined content when checksums are in use */
+      if (share->calc_checksum)
+        bfill(record + column->offset, column->fill_length,
+              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+      continue;
+    }
+    switch (column->type) {
+    case FIELD_CHECK:
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_ZERO:
+    case FIELD_SKIP_PRESPACE:                   /* Not packed */
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      row.normal_length+= column->length;
+      *null_field_lengths= column->length;
+      memcpy(record + column->offset, header, column->length);
+      header+= column->length;
+      break;
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+      if (column->length <= 255)
+        length= (uint) *field_length_data++;
+      else
+      {
+        length= uint2korr(field_length_data);
+        field_length_data+= 2;
+      }
+      row.char_length+= length;
+      *null_field_lengths= length;
+      memcpy(record + column->offset, header, length);
+      if (share->calc_checksum)
+        bfill(record + column->offset + length, (column->length - length),
+              ' ');
+      header+= length;
+      break;
+    case FIELD_VARCHAR:
+    {
+      uint var_length;
+      uchar *field_pos= record + column->offset;
+
+      /* fill_length is the size (1 or 2 bytes) of the stored length prefix */
+      if (column->fill_length == 1)
+      {
+        field_pos[0]= *field_length_data;
+        var_length= (uint) *field_length_data;
+      }
+      else
+      {
+        field_pos[0]= field_length_data[0];
+        field_pos[1]= field_length_data[1];
+        var_length= uint2korr(field_length_data);
+      }
+      field_length_data+= column->fill_length;
+      field_pos+= column->fill_length;
+      row.varchar_length+= var_length;
+      *null_field_lengths= var_length;
+      memcpy(field_pos, header, var_length);
+      header+= var_length;
+      break;
+    }
+    case FIELD_BLOB:
+    {
+      /* Copy length of blob and pointer to blob data to record */
+      uchar *field_pos= record + column->offset;
+      uint size_length= column->length - portable_sizeof_char_ptr;
+      ulong blob_length= _ma_calc_blob_length(size_length, field_length_data);
+
+      memcpy(field_pos, field_length_data, size_length);
+      field_length_data+= size_length;
+      /* The record points straight into the log buffer for the blob data */
+      memcpy(field_pos + size_length, &header, sizeof(header));
+      header+= blob_length;
+      *blob_lengths++= blob_length;
+      /* row.blob_length already holds the total read from the header */
+      break;
+    }
+    default:
+      DBUG_ASSERT(0);
+    }
+  }
+  row.head_length= (row.base_length +
+                    share->base.fixed_not_null_fields_length +
+                    row.field_lengths_length +
+                    size_to_store_key_length(row.field_lengths_length) +
+                    row.normal_length +
+                    row.char_length + row.varchar_length);
+  row.total_length= (row.head_length + row.blob_length);
+  if (row.total_length < share->base.min_row_length)
+    row.total_length= share->base.min_row_length;
+
+  /* Row is now up to date. Time to insert the record */
+
+  res= allocate_and_write_block_record(info, record, &row, undo_lsn);
+  my_free(record, MYF(0));
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Execute undo of a row update
+
+  @fn _ma_apply_undo_row_update()
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error
+*/
+
+my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header,
+                                  size_t header_length __attribute__((unused)))
+{
+  ulonglong page;
+  uint rownr, field_length_header;
+  MARIA_SHARE *share= info->s;
+  const uchar *field_length_data, *field_length_data_end;
+  uchar *current_record, *orig_record;
+  int error= 1;
+  MARIA_RECORD_POS record_pos;
+  DBUG_ENTER("_ma_apply_undo_row_update");
+
+  page=  page_korr(header);
+  rownr= dirpos_korr(header + PAGE_STORE_SIZE);
+  record_pos= ma_recordpos(page, rownr);
+  DBUG_PRINT("enter", ("Page: %lu rownr: %u", (ulong) page, rownr));
+
+  /*
+   Set header to point to old field values, generated by
+   fill_update_undo_parts()
+  */
+  header+= PAGE_STORE_SIZE + DIRPOS_STORE_SIZE;
+  field_length_header= ma_get_length((uchar**) &header);
+  field_length_data= header;
+  header+=
field_length_header;
+  field_length_data_end= header;
+
+  /* Allocate buffer for current row & original row */
+  if (!(current_record= my_malloc(share->base.reclength * 2, MYF(MY_WME))))
+    DBUG_RETURN(1);
+  orig_record= current_record+ share->base.reclength;
+
+  /* Read current record */
+  if (_ma_read_block_record(info, current_record, record_pos))
+    goto err;
+
+  /* Field number 255 is the marker for a logged null bitmap */
+  if (*field_length_data == 255)
+  {
+    /* Bitmap changed */
+    field_length_data++;
+    memcpy(orig_record, header, share->base.null_bytes);
+    header+= share->base.null_bytes;
+  }
+  else
+    memcpy(orig_record, current_record, share->base.null_bytes);
+  bitmap_clear_all(&info->changed_fields);
+
+  /* Restore each logged field into orig_record */
+  while (field_length_data < field_length_data_end)
+  {
+    uint field_nr= ma_get_length((uchar**) &field_length_data), field_length;
+    MARIA_COLUMNDEF *column= share->columndef + field_nr;
+    uchar *orig_field_pos= orig_record + column->offset;
+
+    bitmap_set_bit(&info->changed_fields, field_nr);
+    /* Only fields after the fixed not-null fields have a logged length */
+    if (field_nr >= share->base.fixed_not_null_fields)
+    {
+      if (!(field_length= ma_get_length((uchar**) &field_length_data)))
+      {
+        /* Null field or empty field */
+        bfill(orig_field_pos, column->fill_length,
+              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+        continue;
+      }
+    }
+    else
+      field_length= column->length;
+
+    switch (column->type) {
+    case FIELD_CHECK:
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_ZERO:
+    case FIELD_SKIP_PRESPACE:                   /* Not packed */
+      memcpy(orig_field_pos, header, column->length);
+      header+= column->length;
+      break;
+    case FIELD_SKIP_ZERO:                       /* Number */
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      uint diff;
+      memcpy(orig_field_pos, header, field_length);
+      /* Pad the rest of the field (space for CHAR, zero otherwise) */
+      if ((diff= (column->length - field_length)))
+        bfill(orig_field_pos + column->length - diff, diff,
+              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+      header+= field_length;
+    }
+    break;
+    case FIELD_VARCHAR:
+      /*
+        NOTE(review): other code in this file selects the prefix size with
+        column->fill_length == 1; presumably equivalent to this test on
+        column->length. Confirm.
+      */
+      if (column->length <= 256)
+      {
+        *orig_field_pos++= (uchar) field_length;
+      }
+      else
+      {
+        int2store(orig_field_pos, field_length);
+        orig_field_pos+= 2;
+      }
+      memcpy(orig_field_pos, header, field_length);
+      header+= field_length;
+      break;
+    case FIELD_BLOB:
+    {
+      /* Store blob length, then a pointer to the blob data in 'header' */
+      uint size_length= column->length - portable_sizeof_char_ptr;
+      _ma_store_blob_length(orig_field_pos, size_length, field_length);
+      memcpy_fixed(orig_field_pos + size_length, &header, sizeof(header));
+      header+= field_length;
+      break;
+    }
+    default:
+      DBUG_ASSERT(0);
+    }
+  }
+  copy_not_changed_fields(info, &info->changed_fields,
+                          orig_record, current_record);
+
+  if (share->calc_checksum)
+  {
+    info->cur_row.checksum= (*share->calc_checksum)(info, orig_record);
+    info->state->checksum+= (info->cur_row.checksum -
+                             (*share->calc_checksum)(info, current_record));
+  }
+
+  /*
+    Now records are up to date, execute the update to original values
+  */
+  if (_ma_update_block_record2(info, record_pos, current_record, orig_record,
+                               undo_lsn))
+    goto err;
+
+  error= 0;
+err:
+  /* orig_record lives in the same allocation as current_record */
+  my_free(current_record, MYF(0));
+  DBUG_RETURN(error);
+}
diff --git a/storage/maria/ma_blockrec.h b/storage/maria/ma_blockrec.h
new file mode 100644
index 00000000000..30dffe1c0c0
--- /dev/null
+++ b/storage/maria/ma_blockrec.h
@@ -0,0 +1,195 @@
+/* Copyright (C) 2007 Michael Widenius
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+  Storage of records in block
+*/
+
+#define LSN_SIZE                7
+#define DIR_COUNT_SIZE          1       /* Stores number of rows on page */
+#define EMPTY_SPACE_SIZE        2       /* Stores empty space on page */
+#define PAGE_TYPE_SIZE          1
+#define PAGE_SUFFIX_SIZE        0       /* Bytes for page suffix */
+#define PAGE_HEADER_SIZE        (LSN_SIZE + DIR_COUNT_SIZE + EMPTY_SPACE_SIZE +\
+                                 PAGE_TYPE_SIZE)
+#define PAGE_OVERHEAD_SIZE      (PAGE_HEADER_SIZE + DIR_ENTRY_SIZE + \
+                                 PAGE_SUFFIX_SIZE)
+#define BLOCK_RECORD_POINTER_SIZE       6
+
+#define FULL_PAGE_SIZE(block_size) ((block_size) - LSN_SIZE - PAGE_TYPE_SIZE)
+
+#define ROW_EXTENT_PAGE_SIZE    5
+#define ROW_EXTENT_COUNT_SIZE   2
+#define ROW_EXTENT_SIZE         (ROW_EXTENT_PAGE_SIZE + ROW_EXTENT_COUNT_SIZE)
+#define TAIL_BIT                0x8000  /* Bit in page_count to signify tail */
+/* Number of extents reserved MARIA_BITMAP_BLOCKS to store head part */
+#define ELEMENTS_RESERVED_FOR_MAIN_PART 4
+/* Fields before 'row->null_field_lengths' used by find_where_to_split_row */
+#define EXTRA_LENGTH_FIELDS             3
+
+/* Size for the different parts in the row header (and head page) */
+
+#define FLAG_SIZE               1
+#define TRANSID_SIZE            6
+#define VERPTR_SIZE             7
+#define DIR_ENTRY_SIZE          4
+#define FIELD_OFFSET_SIZE       2      /* size of pointers to field starts */
+
+/* Minimum header size needed for a new row */
+#define BASE_ROW_HEADER_SIZE FLAG_SIZE
+#define TRANS_ROW_EXTRA_HEADER_SIZE TRANSID_SIZE
+
+#define PAGE_TYPE_MASK 127
+enum en_page_type { UNALLOCATED_PAGE, HEAD_PAGE, TAIL_PAGE, BLOB_PAGE, MAX_PAGE_TYPE };
+
+#define PAGE_TYPE_OFFSET        LSN_SIZE
+#define DIR_COUNT_OFFSET        (LSN_SIZE + PAGE_TYPE_SIZE) /* Parenthesized so it is safe in any expression */
+#define EMPTY_SPACE_OFFSET      (DIR_COUNT_OFFSET + DIR_COUNT_SIZE)
+
+#define PAGE_CAN_BE_COMPACTED   128             /* Bit in PAGE_TYPE */
+
+/* Bits used for flag uchar (one byte, first in record) */
+#define ROW_FLAG_TRANSID                1
+#define ROW_FLAG_VER_PTR                2
+#define ROW_FLAG_DELETE_TRANSID         4
+#define ROW_FLAG_NULLS_EXTENDED         8
+#define ROW_FLAG_EXTENTS                128
+#define ROW_FLAG_ALL                    (1+2+4+8+128)
+
+/******** Variables that affects how data pages are utilized ********/
+
+/* Minimum size of tail segment */
+#define MIN_TAIL_SIZE           32
+
+/*
+  Fixed length part of Max possible header size; See row data structure
+  table in ma_blockrec.c.
+*/
+#define MAX_FIXED_HEADER_SIZE (FLAG_SIZE + 3 + ROW_EXTENT_SIZE + 3)
+#define TRANS_MAX_FIXED_HEADER_SIZE (MAX_FIXED_HEADER_SIZE + \
+                                     TRANSID_SIZE + VERPTR_SIZE + \
+                                     TRANSID_SIZE)
+
+/* We use 1 uchar in record header to store number of directory entries */
+#define MAX_ROWS_PER_PAGE       255
+
+/* Bits for MARIA_BITMAP_BLOCKS->used */
+/* We stored data on disk in the block */
+#define BLOCKUSED_USED           1
+/* Bitmap on disk is block->org_bitmap_value ; Happens only on update */
+#define BLOCKUSED_USE_ORG_BITMAP 2
+/* We stored tail data on disk for the block */
+#define BLOCKUSED_TAIL           4
+
+/******* defines that affects allocation (density) of data *******/
+
+/*
+  If the tail part (from the main block or a blob) would use more than 75 % of
+  the size of page, store the tail on a full page instead of a shared
+  tail page.
+*/
+#define MAX_TAIL_SIZE(block_size) ((block_size) *3 / 4)
+
+/* Don't allocate memory for too many row extents on the stack */
+#define ROW_EXTENTS_ON_STACK    32
+
+/* Functions to convert MARIA_RECORD_POS to/from page:offset */
+
+static inline MARIA_RECORD_POS ma_recordpos(ulonglong page, uint dir_entry)
+{
+  DBUG_ASSERT(dir_entry <= 255);
+  return (MARIA_RECORD_POS) ((page << 8) | dir_entry);
+}
+
+static inline my_off_t ma_recordpos_to_page(MARIA_RECORD_POS record_pos)
+{
+  return record_pos >> 8;
+}
+
+static inline uint ma_recordpos_to_dir_entry(MARIA_RECORD_POS record_pos)
+{
+  return (uint) (record_pos & 255);
+}
+
+/* ma_blockrec.c */
+void _ma_init_block_record_data(void);
+my_bool _ma_once_init_block_record(MARIA_SHARE *share, File dfile);
+my_bool _ma_once_end_block_record(MARIA_SHARE *share);
+my_bool _ma_init_block_record(MARIA_HA *info);
+void _ma_end_block_record(MARIA_HA *info);
+
+my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+                                const uchar *oldrec, const uchar *newrec);
+my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record);
+int     _ma_read_block_record(MARIA_HA *info, uchar *record,
+                              MARIA_RECORD_POS record_pos);
+int _ma_read_block_record2(MARIA_HA *info, uchar *record,
+                           uchar *data, uchar *end_of_data);
+int     _ma_scan_block_record(MARIA_HA *info, uchar *record,
+                              MARIA_RECORD_POS, my_bool);
+my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+                             const uchar *record, MARIA_RECORD_POS pos);
+my_bool _ma_scan_init_block_record(MARIA_HA *info);
+void _ma_scan_end_block_record(MARIA_HA *info);
+
+MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
+                                             const uchar *record);
+my_bool _ma_write_block_record(MARIA_HA *info, const uchar *record);
+my_bool _ma_write_abort_block_record(MARIA_HA *info);
+my_bool _ma_compare_block_record(register MARIA_HA *info,
+                                 register const uchar *record);
+
+/* ma_bitmap.c */
+my_bool _ma_bitmap_init(MARIA_SHARE *share, File file);
+my_bool _ma_bitmap_end(MARIA_SHARE *share);
+my_bool _ma_flush_bitmap(MARIA_SHARE *share);
+my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row,
+                              MARIA_BITMAP_BLOCKS *result_blocks);
+my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks);
+my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents,
+                                   uint count);
+my_bool _ma_bitmap_set(MARIA_HA *info, ulonglong pos, my_bool head,
+                       uint empty_space);
+my_bool _ma_reset_full_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap,
+                                 ulonglong page, uint page_count);
+uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size);
+my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *new_row,
+                                  ulonglong page, uint free_size,
+                                  MARIA_BITMAP_BLOCKS *result_blocks);
+my_bool _ma_check_bitmap_data(MARIA_HA *info,
+                              enum en_page_type page_type, ulonglong page,
+                              uint empty_space, uint *bitmap_pattern);
+my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info,
+                                       enum en_page_type page_type,
+                                       ulonglong page,
+                                       uint *bitmap_pattern);
+void _ma_bitmap_delete_all(MARIA_SHARE *share);
+int  _ma_bitmap_create_first(MARIA_SHARE *share);
+uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
+                                            uint page_type,
+                                            const uchar *header,
+                                            const uchar *data,
+                                            size_t data_length);
+uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
+                                           uint page_type,
+                                           const uchar *header);
+uint _ma_apply_redo_purge_blocks(MARIA_HA *info, LSN lsn,
+                                 const uchar *header);
+my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header);
+my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, size_t length);
+my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, size_t length);
diff --git a/storage/maria/ma_cache.c b/storage/maria/ma_cache.c
new file mode 100644
index 00000000000..6b1f9ec3fae
--- /dev/null
+++ b/storage/maria/ma_cache.c
@@ -0,0 +1,107 @@
+/* Copyright (C) 2006 MySQL
AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Functions for read record caching with maria + Used for reading dynamic/compressed records from datafile. + + Can fetch data directly from file (outside cache), + if reading a small chunk straight before the cached part (with possible + overlap). + + Can be explicitly asked not to use cache (by not setting READING_NEXT in + flag) - useful for occasional out-of-cache reads, when the next read is + expected to hit the cache again. 
+ + Allows "partial read" errors in the record header (when READING_HEADER flag + is set) - unread part is bzero'ed + + Note: out-of-cache reads are enabled for shared IO_CACHE's too, + as these reads will be cached by OS cache (and my_pread is always atomic) +*/ + + +#include "maria_def.h" +/* Read length bytes from file position pos into buff, taking the part that overlaps the IO_CACHE buffer from memory. Returns 0 on success and 1 on error (my_errno is set to HA_ERR_WRONG_IN_RECORD if it was 0 or -1). With READING_HEADER a short read is accepted when at least 3 bytes were obtained in total; the unread part is then zero-filled. */ +int _ma_read_cache(IO_CACHE *info, uchar *buff, my_off_t pos, uint length, + int flag) +{ + uint read_length,in_buff_length; + my_off_t offset; + uchar *in_buff_pos; + DBUG_ENTER("_ma_read_cache"); + + if (pos < info->pos_in_file) /* request starts before the cached region: read that part directly */ + { + read_length=length; + if ((my_off_t) read_length > (my_off_t) (info->pos_in_file-pos)) + read_length=(uint) (info->pos_in_file-pos); + info->seek_not_done=1; + if (my_pread(info->file,buff,read_length,pos,MYF(MY_NABP))) + DBUG_RETURN(1); + if (!(length-=read_length)) + DBUG_RETURN(0); + pos+=read_length; + buff+=read_length; + } + if (pos >= info->pos_in_file && + (offset= (my_off_t) (pos - info->pos_in_file)) < + (my_off_t) (info->read_end - info->request_pos)) + { /* request overlaps the cache buffer: copy what is available */ + in_buff_pos=info->request_pos+(uint) offset; + in_buff_length= min(length,(size_t) (info->read_end-in_buff_pos)); + memcpy(buff,info->request_pos+(uint) offset,(size_t) in_buff_length); + if (!(length-=in_buff_length)) + DBUG_RETURN(0); + pos+=in_buff_length; + buff+=in_buff_length; + } + else + in_buff_length=0; + if (flag & READING_NEXT) + { + if (pos != (info->pos_in_file + + (uint) (info->read_end - info->request_pos))) + { + info->pos_in_file=pos; /* Force start here */ + info->read_pos=info->read_end=info->request_pos; /* Everything used */ + info->seek_not_done=1; + } + else + info->read_pos=info->read_end; /* All block used */ + if (!(*info->read_function)(info,buff,length)) + DBUG_RETURN(0); + read_length=info->error; + } + else + { + info->seek_not_done=1; + if ((read_length=my_pread(info->file,buff,length,pos,MYF(0))) == length) + DBUG_RETURN(0); + } + if (!(flag & READING_HEADER) || (int) read_length == -1 || + read_length+in_buff_length < 3) + { + DBUG_PRINT("error", 
+ ("Error %d reading next-multi-part block (Got %d bytes)", + my_errno, (int) read_length)); + if (!my_errno || my_errno == -1) + my_errno=HA_ERR_WRONG_IN_RECORD; + DBUG_RETURN(1); + } + bzero(buff+read_length,MARIA_BLOCK_INFO_HEADER_LENGTH - in_buff_length - + read_length); + DBUG_RETURN(0); +} /* _ma_read_cache */ diff --git a/storage/maria/ma_changed.c b/storage/maria/ma_changed.c new file mode 100644 index 00000000000..4d0964581f6 --- /dev/null +++ b/storage/maria/ma_changed.c @@ -0,0 +1,33 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Check if somebody has changed table since last check. 
*/ + +#include "maria_def.h" + + /* Return 0 if table isn't changed. Returns -1 if fast_ma_readinfo() fails, otherwise the value of info->data_changed, which is reset to 0 before returning. */ + +int maria_is_changed(MARIA_HA *info) +{ + int result; + DBUG_ENTER("maria_is_changed"); + if (fast_ma_readinfo(info)) + DBUG_RETURN(-1); + VOID(_ma_writeinfo(info,0)); + result=(int) info->data_changed; + info->data_changed=0; + DBUG_PRINT("exit",("result: %d",result)); + DBUG_RETURN(result); +} diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c new file mode 100644 index 00000000000..fa1c812daf7 --- /dev/null +++ b/storage/maria/ma_check.c @@ -0,0 +1,5633 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Describe, check and repair of MARIA tables */ + +/* + About checksum calculation. + + There are two types of checksums. Table checksum and row checksum. + + Row checksum is an additional uchar at the end of dynamic length + records. It must be calculated if the table is configured for them. + Otherwise they must not be used. The variable + MYISAM_SHARE::calc_checksum determines if row checksums are used. + MI_INFO::checksum is used as temporary storage during row handling. + (NOTE(review): MYISAM_SHARE and MI_INFO are MyISAM names; presumably + MARIA_SHARE and MARIA_HA are meant here - confirm.) + For parallel repair we must ensure that only one thread can use this + variable. There is no problem on the write side as this is done by one + thread only. But when checking a record after read this could go + wrong. 
But since all threads read through a common read buffer, it is + sufficient if only one thread checks it. + + Table checksum is an eight uchar value in the header of the index file. + It can be calculated even if row checksums are not used. The variable + MI_CHECK::glob_crc is calculated over all records. + MI_SORT_PARAM::calc_checksum determines if this should be done. This + variable is not part of MI_CHECK because it must be set per thread for + parallel repair. The global glob_crc must be changed by one thread + only. And it is sufficient to calculate the checksum once only. +*/ + +#include "ma_ftdefs.h" +#include <myisamchk.h> +#include <stdarg.h> +#include <my_getopt.h> +#ifdef HAVE_SYS_VADVISE_H +#include <sys/vadvise.h> +#endif +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif +#include "ma_rt_index.h" +#include "ma_blockrec.h" +#include "trnman_public.h" + +/* Forward declarations of the static functions defined in this file */ + +static int check_k_link(HA_CHECK *param, MARIA_HA *info, my_off_t next_link); +static int chk_index(HA_CHECK *param, MARIA_HA *info,MARIA_KEYDEF *keyinfo, + my_off_t page, uchar *buff, ha_rows *keys, + ha_checksum *key_checksum, uint level); +static uint isam_key_length(MARIA_HA *info,MARIA_KEYDEF *keyinfo); +static ha_checksum calc_checksum(ha_rows count); +static int writekeys(MARIA_SORT_PARAM *sort_param); +static int sort_one_index(HA_CHECK *param, MARIA_HA *info, + MARIA_KEYDEF *keyinfo, + my_off_t pagepos, File new_file); +static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key); +static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key); +static int sort_get_next_record(MARIA_SORT_PARAM *sort_param); +static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a, + const void *b); +static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param, + const uchar *a); +static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a); +static my_off_t get_record_for_key(MARIA_HA *info,MARIA_KEYDEF *keyinfo, + const 
uchar *key); +static int sort_insert_key(MARIA_SORT_PARAM *sort_param, + reg1 SORT_KEY_BLOCKS *key_block, + const uchar *key, my_off_t prev_block); +static int sort_delete_record(MARIA_SORT_PARAM *sort_param); +/*static int _ma_flush_pending_blocks(HA_CHECK *param);*/ +static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks, + uint buffer_length); +static ha_checksum maria_byte_checksum(const uchar *buf, uint length); +static void set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share); +static void restore_data_file_type(MARIA_SHARE *share); +static void change_data_file_descriptor(MARIA_HA *info, File new_file); +static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info, + MARIA_HA *info, uchar *record); +static void copy_data_file_state(MARIA_STATE_INFO *to, + MARIA_STATE_INFO *from); +static int write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info); + +/* Zero *param and fill in the default check/repair options */ +void maria_chk_init(HA_CHECK *param) +{ + bzero((uchar*) param,sizeof(*param)); + param->opt_follow_links=1; + param->keys_in_use= ~(ulonglong) 0; + param->search_after_block=HA_OFFSET_ERROR; + param->auto_increment_value= 0; + param->use_buffers=USE_BUFFER_INIT; + param->read_buffer_length=READ_BUFFER_INIT; + param->write_buffer_length=READ_BUFFER_INIT; + param->sort_buffer_length=SORT_BUFFER_INIT; + param->sort_key_blocks=BUFFERS_WHEN_SORTING; + param->tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL; + param->myf_rw=MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL); + param->start_check_pos=0; + param->max_record_length= LONGLONG_MAX; + param->pagecache_block_size= KEY_CACHE_BLOCK_SIZE; + param->stats_method= MI_STATS_METHOD_NULLS_NOT_EQUAL; +} + + /* Check the status flags for the table: warn if it is marked as crashed or if the stored open count is unexpected. Always returns 0. */ + +int maria_chk_status(HA_CHECK *param, register MARIA_HA *info) +{ + MARIA_SHARE *share=info->s; + + if (maria_is_crashed_on_repair(info)) + _ma_check_print_warning(param, + "Table is marked as crashed and last repair failed"); + else if (maria_is_crashed(info)) + 
_ma_check_print_warning(param, + "Table is marked as crashed"); + if (share->state.open_count != (uint) (info->s->global_changed ? 1 : 0)) + { + /* Don't count this as a real warning, as check can correct this ! */ + uint save=param->warning_printed; + _ma_check_print_warning(param, + share->state.open_count==1 ? + "%d client is using or hasn't closed the table properly" : + "%d clients are using or haven't closed the table properly", + share->state.open_count); + /* If this will be fixed by the check, forget the warning */ + if (param->testflag & T_UPDATE_STATE) + param->warning_printed=save; + } + return 0; +} + +/* + Check delete links in row data + + Returns 0 if the delete chain is ok (or the table uses BLOCK_RECORD, + which has no delete links) and 1 on error or when the check was killed. + On a corrupted chain T_RETRY_WITHOUT_QUICK is set in param->testflag. +*/ + +int maria_chk_del(HA_CHECK *param, register MARIA_HA *info, uint test_flag) +{ + reg2 ha_rows i; + uint delete_link_length; + my_off_t empty,next_link,old_link; + char buff[22],buff2[22]; + DBUG_ENTER("maria_chk_del"); + + LINT_INIT(old_link); + + if (info->s->data_file_type == BLOCK_RECORD) + DBUG_RETURN(0); /* No delete links here */ + + param->record_checksum=0; + delete_link_length=((info->s->options & HA_OPTION_PACK_RECORD) ? 
20 : + info->s->rec_reflength+1); + + if (!(test_flag & T_SILENT)) + puts("- check record delete-chain"); + + next_link=info->s->state.dellink; + if (info->state->del == 0) + { + if (test_flag & T_VERBOSE) + { + puts("No recordlinks"); + } + } + else + { + if (test_flag & T_VERBOSE) + printf("Recordlinks: "); + empty=0; + for (i= info->state->del ; i > 0L && next_link != HA_OFFSET_ERROR ; i--) + { + if (*_ma_killed_ptr(param)) + DBUG_RETURN(1); + if (test_flag & T_VERBOSE) + printf(" %9s",llstr(next_link,buff)); + if (next_link >= info->state->data_file_length) + goto wrong; + if (my_pread(info->dfile.file, (char*) buff, delete_link_length, + next_link,MYF(MY_NABP))) + { + if (test_flag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"Can't read delete-link at filepos: %s", + llstr(next_link,buff)); + DBUG_RETURN(1); + } + if (*buff != '\0') + { + if (test_flag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"Record at pos: %s is not remove-marked", + llstr(next_link,buff)); + goto wrong; + } + if (info->s->options & HA_OPTION_PACK_RECORD) + { + my_off_t prev_link=mi_sizekorr(buff+12); + if (empty && prev_link != old_link) + { + if (test_flag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"Deleted block at %s doesn't point back at previous delete link",llstr(next_link,buff2)); + goto wrong; + } + old_link=next_link; + next_link=mi_sizekorr(buff+4); + empty+=mi_uint3korr(buff+1); + } + else + { + param->record_checksum+=(ha_checksum) next_link; + next_link= _ma_rec_pos(info->s, buff+1); + empty+=info->s->base.pack_reclength; + } + } + if (test_flag & T_VERBOSE) + puts("\n"); + if (empty != info->state->empty) + { + _ma_check_print_warning(param, + "Found %s deleted space in delete link chain. 
Should be %s", + llstr(empty,buff2), + llstr(info->state->empty,buff)); + } + if (next_link != HA_OFFSET_ERROR) + { + _ma_check_print_error(param, + "Found more than the expected %s deleted rows in delete link chain", + llstr(info->state->del, buff)); + goto wrong; + } + if (i != 0) + { + _ma_check_print_error(param, + "Found %s deleted rows in delete link chain. Should be %s", + llstr(info->state->del - i, buff2), + llstr(info->state->del, buff)); + goto wrong; + } + } + DBUG_RETURN(0); + +wrong: + param->testflag|=T_RETRY_WITHOUT_QUICK; + if (test_flag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"record delete-link-chain corrupted"); + DBUG_RETURN(1); +} /* maria_chk_del */ + + + /* Check delete links in index file, following at most key_file_length / block_size links starting at next_link. Returns 0 if the chain ends with HA_OFFSET_ERROR, non-zero otherwise. */ + +static int check_k_link(HA_CHECK *param, register MARIA_HA *info, + my_off_t next_link) +{ + uint block_size= info->s->block_size; + ha_rows records; + char llbuff[21], llbuff2[21], *buff; + DBUG_ENTER("check_k_link"); + + records= (ha_rows) (info->state->key_file_length / block_size); + while (next_link != HA_OFFSET_ERROR && records > 0) + { + if (*_ma_killed_ptr(param)) + DBUG_RETURN(1); + if (param->testflag & T_VERBOSE) + printf("%16s",llstr(next_link,llbuff)); + + /* Key blocks must lay within the key file length entirely. 
*/ + if (next_link + block_size > info->state->key_file_length) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "Invalid key block position: %s " + "key block size: %u file_length: %s", + llstr(next_link, llbuff), block_size, + llstr(info->state->key_file_length, llbuff2)); + DBUG_RETURN(1); + /* purecov: end */ + } + + /* Key blocks must be aligned at block_size */ + if (next_link & (block_size -1)) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "Mis-aligned key block: %s " + "minimum key block length: %u", + llstr(next_link, llbuff), + block_size); + DBUG_RETURN(1); + /* purecov: end */ + } + + DBUG_ASSERT(info->s->pagecache->block_size == block_size); + if (!(buff= pagecache_read(info->s->pagecache, + &info->s->kfile, next_link/block_size, + DFLT_INIT_HITS, + (uchar*) info->buff, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "key cache read error for block: %s", + llstr(next_link,llbuff)); + DBUG_RETURN(1); + /* purecov: end */ + } + next_link=mi_sizekorr(buff); + records--; + param->key_file_blocks+=block_size; + } + if (param->testflag & T_VERBOSE) + { + if (next_link != HA_OFFSET_ERROR) + printf("%16s\n",llstr(next_link,llbuff)); + else + puts(""); + } + DBUG_RETURN (next_link != HA_OFFSET_ERROR); +} /* check_k_link */ + + + /* Check sizes of files */ + +int maria_chk_size(HA_CHECK *param, register MARIA_HA *info) +{ + int error; + register my_off_t skr,size; + char buff[22],buff2[22]; + DBUG_ENTER("maria_chk_size"); + + if (!(param->testflag & T_SILENT)) + puts("- check file-size"); + + /* + The following is needed if called externally (not from maria_chk). + To get a correct physical size we need to flush them. 
+ */ + if ((error= _ma_flush_table_files(info, + MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE))) + _ma_check_print_error(param, "Failed to flush data or index file"); + + size= my_seek(info->s->kfile.file, 0L, MY_SEEK_END, MYF(MY_THREADSAFE)); + if ((skr=(my_off_t) info->state->key_file_length) != size) + { + /* Don't give error if file generated by mariapack */ + if (skr > size && maria_is_any_key_active(info->s->state.key_map)) + { + error=1; + _ma_check_print_error(param, + "Size of indexfile is: %-8s Should be: %s", + llstr(size,buff), llstr(skr,buff2)); + } + else if (!(param->testflag & T_VERY_SILENT)) + _ma_check_print_warning(param, + "Size of indexfile is: %-8s Should be: %s", + llstr(size,buff), llstr(skr,buff2)); + } + if (!(param->testflag & T_VERY_SILENT) && + ! (info->s->options & HA_OPTION_COMPRESS_RECORD) && + ulonglong2double(info->state->key_file_length) > + ulonglong2double(info->s->base.margin_key_file_length)*0.9) + _ma_check_print_warning(param,"Keyfile is almost full, %10s of %10s used", + llstr(info->state->key_file_length,buff), + llstr(info->s->base.max_key_file_length-1,buff)); + + size= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)); + skr=(my_off_t) info->state->data_file_length; + if (info->s->options & HA_OPTION_COMPRESS_RECORD) + skr+= MEMMAP_EXTRA_MARGIN; +#ifdef USE_RELOC + if (info->data_file_type == STATIC_RECORD && + skr < (my_off_t) info->s->base.reloc*info->s->base.min_pack_length) + skr=(my_off_t) info->s->base.reloc*info->s->base.min_pack_length; +#endif + if (skr != size) + { + info->state->data_file_length=size; /* Skip other errors */ + if (skr > size && skr != size + MEMMAP_EXTRA_MARGIN) + { + error=1; + _ma_check_print_error(param,"Size of datafile is: %-9s Should be: %s", + llstr(size,buff), llstr(skr,buff2)); + param->testflag|=T_RETRY_WITHOUT_QUICK; + } + else + { + _ma_check_print_warning(param, + "Size of datafile is: %-9s Should be: %s", + llstr(size,buff), llstr(skr,buff2)); + 
} + } + if (!(param->testflag & T_VERY_SILENT) && + !(info->s->options & HA_OPTION_COMPRESS_RECORD) && + ulonglong2double(info->state->data_file_length) > + (ulonglong2double(info->s->base.max_data_file_length)*0.9)) + _ma_check_print_warning(param, "Datafile is almost full, %10s of %10s used", + llstr(info->state->data_file_length,buff), + llstr(info->s->base.max_data_file_length-1,buff2)); + DBUG_RETURN(error); +} /* maria_chk_size */ + + +/* Check keys */ + +int maria_chk_key(HA_CHECK *param, register MARIA_HA *info) +{ + uint key,found_keys=0,full_text_keys=0,result=0; + ha_rows keys; + ha_checksum old_record_checksum,init_checksum; + my_off_t all_keydata,all_totaldata,key_totlength,length; + ulong *rec_per_key_part; + MARIA_SHARE *share=info->s; + MARIA_KEYDEF *keyinfo; + char buff[22],buff2[22]; + DBUG_ENTER("maria_chk_key"); + + if (!(param->testflag & T_SILENT)) + puts("- check key delete-chain"); + + param->key_file_blocks=info->s->base.keystart; + if (check_k_link(param, info, info->s->state.key_del)) + { + if (param->testflag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"key delete-link-chain corrupted"); + DBUG_RETURN(-1); + } + + if (!(param->testflag & T_SILENT)) puts("- check index reference"); + + all_keydata=all_totaldata=key_totlength=0; + old_record_checksum=0; + init_checksum=param->record_checksum; + if (share->data_file_type == STATIC_RECORD) + old_record_checksum= (calc_checksum(info->state->records + + info->state->del-1) * + share->base.pack_reclength); + rec_per_key_part= param->rec_per_key_part; + for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ; + rec_per_key_part+=keyinfo->keysegs, key++, keyinfo++) + { + param->key_crc[key]=0; + if (! 
maria_is_key_active(share->state.key_map, key)) + { + /* Remember old statistics for key */ + memcpy((char*) rec_per_key_part, + (char*) (share->state.rec_per_key_part + + (uint) (rec_per_key_part - param->rec_per_key_part)), + keyinfo->keysegs*sizeof(*rec_per_key_part)); + continue; + } + found_keys++; + + param->record_checksum=init_checksum; + /* Reset the statistics counters collected for this key */ + bzero((char*) &param->unique_count,sizeof(param->unique_count)); + bzero((char*) &param->notnull_count,sizeof(param->notnull_count)); + + if ((!(param->testflag & T_SILENT))) + printf ("- check data record references index: %d\n",key+1); + if (keyinfo->flag & HA_FULLTEXT) + full_text_keys++; + if (share->state.key_root[key] == HA_OFFSET_ERROR && + (info->state->records == 0 || keyinfo->flag & HA_FULLTEXT)) + goto do_stat; + if (!_ma_fetch_keypage(info,keyinfo,share->state.key_root[key], + DFLT_INIT_HITS,info->buff,0)) + { + _ma_check_print_error(param,"Can't read indexpage from filepos: %s", + llstr(share->state.key_root[key],buff)); + if (!(param->testflag & T_INFO)) + DBUG_RETURN(-1); + result= -1; + continue; + } + param->key_file_blocks+=keyinfo->block_length; + keys=0; + param->keydata=param->totaldata=0; + param->key_blocks=0; + param->max_level=0; + if (chk_index(param,info,keyinfo,share->state.key_root[key],info->buff, + &keys, param->key_crc+key,1)) + DBUG_RETURN(-1); + if(!(keyinfo->flag & (HA_FULLTEXT | HA_SPATIAL))) + { + if (keys != info->state->records) + { + _ma_check_print_error(param,"Found %s keys of %s",llstr(keys,buff), + llstr(info->state->records,buff2)); + if (!(param->testflag & T_INFO)) + DBUG_RETURN(-1); + result= -1; + continue; + } + if ((found_keys - full_text_keys == 1 && + !(share->data_file_type == STATIC_RECORD)) || + (param->testflag & T_DONT_CHECK_CHECKSUM)) + old_record_checksum= param->record_checksum; + else if (old_record_checksum != param->record_checksum) + { + if (key) + _ma_check_print_error(param,"Key %u doesn't point at same records that key 1", + key+1); + else + 
_ma_check_print_error(param,"Key 1 doesn't point at all records"); + if (!(param->testflag & T_INFO)) + DBUG_RETURN(-1); + result= -1; + continue; + } + } + if ((uint) share->base.auto_key -1 == key) + { + /* Check that auto_increment key is bigger than max key value */ + ulonglong auto_increment; + info->lastinx=key; + _ma_read_key_record(info, info->rec_buff, 0); + auto_increment= ma_retrieve_auto_increment(info, info->rec_buff); + if (auto_increment > info->s->state.auto_increment) + { + _ma_check_print_warning(param, "Auto-increment value: %s is smaller " + "than max used value: %s", + llstr(info->s->state.auto_increment,buff2), + llstr(auto_increment, buff)); + } + if (param->testflag & T_AUTO_INC) + { + set_if_bigger(info->s->state.auto_increment, + auto_increment); + set_if_bigger(info->s->state.auto_increment, + param->auto_increment_value); + } + + /* Check that there isn't a row with auto_increment = 0 in the table */ + maria_extra(info,HA_EXTRA_KEYREAD,0); + bzero(info->lastkey,keyinfo->seg->length); + if (!maria_rkey(info, info->rec_buff, key, (const uchar*) info->lastkey, + (key_part_map)1, HA_READ_KEY_EXACT)) + { + /* Don't count this as a real warning, as maria_chk can't correct it */ + uint save=param->warning_printed; + _ma_check_print_warning(param, "Found row where the auto_increment " + "column has the value 0"); + param->warning_printed=save; + } + maria_extra(info,HA_EXTRA_NO_KEYREAD,0); + } + + length=(my_off_t) isam_key_length(info,keyinfo)*keys + param->key_blocks*2; + if (param->testflag & T_INFO && param->totaldata != 0L && keys != 0L) + printf("Key: %2d: Keyblocks used: %3d%% Packed: %4d%% Max levels: %2d\n", + key+1, + (int) (my_off_t2double(param->keydata)*100.0/my_off_t2double(param->totaldata)), + (int) ((my_off_t2double(length) - my_off_t2double(param->keydata))*100.0/ + my_off_t2double(length)), + param->max_level); + all_keydata+=param->keydata; all_totaldata+=param->totaldata; key_totlength+=length; + +do_stat: + if 
(param->testflag & T_STATISTICS) + maria_update_key_parts(keyinfo, rec_per_key_part, param->unique_count, + param->stats_method == MI_STATS_METHOD_IGNORE_NULLS? + param->notnull_count: NULL, + (ulonglong)info->state->records); + } + if (param->testflag & T_INFO) + { + if (all_totaldata != 0L && found_keys > 0) + printf("Total: Keyblocks used: %3d%% Packed: %4d%%\n\n", + (int) (my_off_t2double(all_keydata)*100.0/ + my_off_t2double(all_totaldata)), + (int) ((my_off_t2double(key_totlength) - + my_off_t2double(all_keydata))*100.0/ + my_off_t2double(key_totlength))); + else if (all_totaldata != 0L && maria_is_any_key_active(share->state.key_map)) + puts(""); + } + if (param->key_file_blocks != info->state->key_file_length && + param->keys_in_use != ~(ulonglong) 0) + _ma_check_print_warning(param, "Some data are unreferenced in keyfile"); + if (found_keys != full_text_keys) + param->record_checksum=old_record_checksum-init_checksum; /* Remove delete links */ + else + param->record_checksum=0; + DBUG_RETURN(result); +} /* maria_chk_key */ + +/* Check that the key page at position page lies inside the key file and is block aligned, read it into buff and verify it with chk_index(). Returns 0 if ok, 1 on error. */ +static int chk_index_down(HA_CHECK *param, MARIA_HA *info, + MARIA_KEYDEF *keyinfo, + my_off_t page, uchar *buff, ha_rows *keys, + ha_checksum *key_checksum, uint level) +{ + char llbuff[22],llbuff2[22]; + DBUG_ENTER("chk_index_down"); + + /* Key blocks must lay within the key file length entirely. */ + if (page + keyinfo->block_length > info->state->key_file_length) + { + /* purecov: begin tested */ + /* Give it a chance to fit in the real file size. */ + my_off_t max_length= my_seek(info->s->kfile.file, 0L, MY_SEEK_END, + MYF(MY_THREADSAFE)); + _ma_check_print_error(param, "Invalid key block position: %s " + "key block size: %u file_length: %s", + llstr(page, llbuff), keyinfo->block_length, + llstr(info->state->key_file_length, llbuff2)); + if (page + keyinfo->block_length > max_length) + goto err; + /* Fix the remembered key file length. 
*/ + info->state->key_file_length= (max_length & + ~ (my_off_t) (keyinfo->block_length - 1)); + /* purecov: end */ + } + + /* Key blocks must be aligned at block length (info->s->block_size) */ + if (page & (info->s->block_size -1)) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "Mis-aligned key block: %s " + "minimum key block length: %u", + llstr(page, llbuff), info->s->block_size); + goto err; + /* purecov: end */ + } + + if (!_ma_fetch_keypage(info,keyinfo,page, DFLT_INIT_HITS,buff,0)) + { + _ma_check_print_error(param,"Can't read key from filepos: %s", + llstr(page,llbuff)); + goto err; + } + param->key_file_blocks+=keyinfo->block_length; + if (chk_index(param,info,keyinfo,page,buff,keys,key_checksum,level)) + goto err; + + DBUG_RETURN(0); + + /* purecov: begin tested */ +err: + DBUG_RETURN(1); + /* purecov: end */ +} + + +/* + "Ignore NULLs" statistics collection method: process first index tuple. + + SYNOPSIS + maria_collect_stats_nonulls_first() + keyseg IN Array of key part descriptions + notnull INOUT Array, notnull[i] = (number of {keypart1...keypart_i} + tuples that don't contain NULLs) + key IN Key values tuple + + DESCRIPTION + Process the first index tuple - find out which prefix tuples don't + contain NULLs, and update the array of notnull counters accordingly. +*/ + +static +void maria_collect_stats_nonulls_first(HA_KEYSEG *keyseg, ulonglong *notnull, + const uchar *key) +{ + uint first_null, kp; + /* Index of the key segment that ha_find_null() returns for this key */ + first_null= ha_find_null(keyseg, (uchar*) key) - keyseg; + /* + All prefix tuples that don't include keypart_{first_null} are not-null + tuples (and all others aren't), increment counters for them. + */ + for (kp= 0; kp < first_null; kp++) + notnull[kp]++; +} + + +/* + "Ignore NULLs" statistics collection method: process next index tuple. 
+ + SYNOPSIS + maria_collect_stats_nonulls_next() + keyseg IN Array of key part descriptions + notnull INOUT Array, notnull[i] = (number of {keypart1...keypart_i} + tuples that don't contain NULLs) + prev_key IN Previous key values tuple + last_key IN Next key values tuple + + DESCRIPTION + Process the next index tuple: + 1. Find out which prefix tuples of last_key don't contain NULLs, and + update the array of notnull counters accordingly. + 2. Find the first keypart number where the prev_key and last_key tuples + are different (A), or last_key has NULL value (B), and return it, so the + caller can count number of unique tuples for each key prefix. We don't + need (B) to be counted, and that is compensated back in + maria_update_key_parts(). + + RETURN + 1 + number of first keypart where values differ or last_key tuple has NULL +*/ + +static +int maria_collect_stats_nonulls_next(HA_KEYSEG *keyseg, ulonglong *notnull, + const uchar *prev_key, + const uchar *last_key) +{ + uint diffs[2]; + uint first_null_seg, kp; + HA_KEYSEG *seg; + + /* + Find the first keypart where values are different or either of them is + NULL. We get results in diffs array: + diffs[0]= 1 + number of first different keypart + diffs[1]=offset: (last_key + diffs[1]) points to first value in + last_key that is NULL or different from corresponding + value in prev_key. + */ + ha_key_cmp(keyseg, (uchar*) prev_key, (uchar*) last_key, USE_WHOLE_KEY, + SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diffs); + seg= keyseg + diffs[0] - 1; + + /* Find first NULL in last_key, searching from the first differing keypart */ + first_null_seg= ha_find_null(seg, (uchar*) last_key + diffs[1]) - keyseg; + for (kp= 0; kp < first_null_seg; kp++) + notnull[kp]++; + + /* + Return 1+ number of first key part where values differ. Don't care if + these were NULLs and not .... We compensate for that in + maria_update_key_parts. 
+ */ + return diffs[0]; +} + + + /* Check if index is ok */ + +static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t page, uchar *buff, ha_rows *keys, + ha_checksum *key_checksum, uint level) +{ + int flag; + uint used_length,comp_flag,nod_flag,key_length=0; + uchar key[HA_MAX_POSSIBLE_KEY_BUFF],*temp_buff,*keypos,*old_keypos,*endpos; + my_off_t next_page,record; + char llbuff[22]; + uint diff_pos[2]; + DBUG_ENTER("chk_index"); + DBUG_DUMP("buff",(uchar*) buff,maria_data_on_page(buff)); + + /* TODO: implement appropriate check for RTree keys */ + if (keyinfo->flag & HA_SPATIAL) + DBUG_RETURN(0); + + if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length))) + { + _ma_check_print_error(param,"Not enough memory for keyblock"); + DBUG_RETURN(-1); + } + + if (keyinfo->flag & HA_NOSAME) + comp_flag=SEARCH_FIND | SEARCH_UPDATE; /* Not real duplicates */ + else + comp_flag=SEARCH_SAME; /* Keys in positionorder */ + nod_flag=_ma_test_if_nod(buff); + used_length= maria_data_on_page(buff); + keypos=buff+2+nod_flag; + endpos=buff+used_length; + + param->keydata+=used_length; param->totaldata+=keyinfo->block_length; /* INFO */ + param->key_blocks++; + if (level > param->max_level) + param->max_level=level; + + if (used_length > keyinfo->block_length) + { + _ma_check_print_error(param,"Wrong pageinfo at page: %s", + llstr(page,llbuff)); + goto err; + } + for ( ;; ) + { + if (*_ma_killed_ptr(param)) + goto err; + memcpy(info->lastkey, key, key_length); + info->lastkey_length= key_length; + if (nod_flag) + { + next_page= _ma_kpos(nod_flag,keypos); + if (chk_index_down(param,info,keyinfo,next_page, + temp_buff,keys,key_checksum,level+1)) + goto err; + } + old_keypos=keypos; + if (keypos >= endpos || + (key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,key)) == 0) + break; + if (keypos > endpos) + { + _ma_check_print_error(param,"Wrong key block length at page: %s", + llstr(page,llbuff)); + goto err; + } + if ((*keys)++ && + 
(flag=ha_key_cmp(keyinfo->seg, (uchar*) info->lastkey, (uchar*) key, + key_length, comp_flag, diff_pos)) >=0) + { + DBUG_DUMP("old", info->lastkey, info->lastkey_length); + DBUG_DUMP("new", key, key_length); + DBUG_DUMP("new_in_page", old_keypos, (uint) (keypos-old_keypos)); + + if (comp_flag & SEARCH_FIND && flag == 0) + _ma_check_print_error(param,"Found duplicated key at page %s", + llstr(page,llbuff)); + else + _ma_check_print_error(param,"Key in wrong position at page %s", + llstr(page,llbuff)); + goto err; + } + if (param->testflag & T_STATISTICS) + { + if (*keys != 1L) /* not first_key */ + { + if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL) + ha_key_cmp(keyinfo->seg, (uchar*) info->lastkey, (uchar*) key, + USE_WHOLE_KEY, SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, + diff_pos); + else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS) + { + diff_pos[0]= maria_collect_stats_nonulls_next(keyinfo->seg, + param->notnull_count, + info->lastkey, key); + } + param->unique_count[diff_pos[0]-1]++; + } + else + { + if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS) + maria_collect_stats_nonulls_first(keyinfo->seg, param->notnull_count, + key); + } + } + (*key_checksum)+= maria_byte_checksum((uchar*) key, + key_length- info->s->rec_reflength); + record= _ma_dpos(info,0,key+key_length); + if (keyinfo->flag & HA_FULLTEXT) /* special handling for ft2 */ + { + uint off; + int subkeys; + get_key_full_length_rdonly(off, key); + subkeys=ft_sintXkorr(key+off); + if (subkeys < 0) + { + ha_rows tmp_keys=0; + if (chk_index_down(param,info,&info->s->ft2_keyinfo,record, + temp_buff,&tmp_keys,key_checksum,1)) + goto err; + if (tmp_keys + subkeys) + { + _ma_check_print_error(param, + "Number of words in the 2nd level tree " + "does not match the number in the header. 
" + "Parent word in on the page %s, offset %u", + llstr(page,llbuff), (uint) (old_keypos-buff)); + goto err; + } + (*keys)+=tmp_keys-1; + continue; + } + /* fall through */ + } + if (record >= info->state->data_file_length) + { +#ifndef DBUG_OFF + char llbuff2[22], llbuff3[22]; +#endif + _ma_check_print_error(param,"Found key at page %s that points to record outside datafile",llstr(page,llbuff)); + DBUG_PRINT("test",("page: %s record: %s filelength: %s", + llstr(page,llbuff),llstr(record,llbuff2), + llstr(info->state->data_file_length,llbuff3))); + DBUG_DUMP("key",(uchar*) key,key_length); + DBUG_DUMP("new_in_page",(char*) old_keypos,(uint) (keypos-old_keypos)); + goto err; + } + param->record_checksum+= (ha_checksum) record; + } + if (keypos != endpos) + { + _ma_check_print_error(param,"Keyblock size at page %s is not correct. Block length: %d key length: %d", + llstr(page,llbuff), used_length, (keypos - buff)); + goto err; + } + my_afree((uchar*) temp_buff); + DBUG_RETURN(0); + err: + my_afree((uchar*) temp_buff); + DBUG_RETURN(1); +} /* chk_index */ + + + /* Calculate a checksum of 1+2+3+4...N = N*(N+1)/2 without overflow */ + +static ha_checksum calc_checksum(ha_rows count) +{ + ulonglong sum,a,b; + DBUG_ENTER("calc_checksum"); + + sum=0; + a=count; b=count+1; + if (a & 1) + b>>=1; + else + a>>=1; + while (b) + { + if (b & 1) + sum+=a; + a<<=1; b>>=1; + } + DBUG_PRINT("exit",("sum: %lx",(ulong) sum)); + DBUG_RETURN((ha_checksum) sum); +} /* calc_checksum */ + + + /* Calc length of key in normal isam */ + +static uint isam_key_length(MARIA_HA *info, register MARIA_KEYDEF *keyinfo) +{ + uint length; + HA_KEYSEG *keyseg; + DBUG_ENTER("isam_key_length"); + + length= info->s->rec_reflength; + for (keyseg=keyinfo->seg ; keyseg->type ; keyseg++) + length+= keyseg->length; + + DBUG_PRINT("exit",("length: %d",length)); + DBUG_RETURN(length); +} /* key_length */ + + + +static void record_pos_to_txt(MARIA_HA *info, my_off_t recpos, + char *buff) +{ + if 
(info->s->data_file_type != BLOCK_RECORD) + llstr(recpos, buff); + else + { + my_off_t page= ma_recordpos_to_page(recpos); + uint row= ma_recordpos_to_dir_entry(recpos); + char *end= longlong10_to_str(page, buff, 10); + *(end++)= ':'; + longlong10_to_str(row, end, 10); + } +} + + +/* + Check that keys in records exist in index tree + + SYNOPSIS + check_keys_in_record() + param Check paramenter + info Maria handler + extend Type of check (extended or normal) + start_recpos Position to row + record Record buffer + + NOTES + This function also calculates record checksum & number of rows +*/ + +static int check_keys_in_record(HA_CHECK *param, MARIA_HA *info, int extend, + my_off_t start_recpos, uchar *record) +{ + MARIA_KEYDEF *keyinfo; + char llbuff[22+4]; + uint key; + + param->tmp_record_checksum+= (ha_checksum) start_recpos; + param->records++; + if (param->testflag & T_WRITE_LOOP && param->records % WRITE_COUNT == 0) + { + printf("%s\r", llstr(param->records, llbuff)); + VOID(fflush(stdout)); + } + + /* Check if keys match the record */ + for (key=0, keyinfo= info->s->keyinfo; key < info->s->base.keys; + key++,keyinfo++) + { + if (maria_is_key_active(info->s->state.key_map, key)) + { + if(!(keyinfo->flag & HA_FULLTEXT)) + { + uint key_length= _ma_make_key(info,key,info->lastkey,record, + start_recpos); + if (extend) + { + /* We don't need to lock the key tree here as we don't allow + concurrent threads when running maria_chk + */ + int search_result= +#ifdef HAVE_RTREE_KEYS + (keyinfo->flag & HA_SPATIAL) ? 
+ maria_rtree_find_first(info, key, info->lastkey, key_length, + MBR_EQUAL | MBR_DATA) : +#endif + _ma_search(info,keyinfo,info->lastkey,key_length, + SEARCH_SAME, info->s->state.key_root[key]); + if (search_result) + { + record_pos_to_txt(info, start_recpos, llbuff); + _ma_check_print_error(param, + "Record at: %14s " + "Can't find key for index: %2d", + llbuff, key+1); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + return -1; + } + } + else + param->tmp_key_crc[key]+= + maria_byte_checksum((uchar*) info->lastkey, key_length); + } + } + } + return 0; +} + + +/* + Functions to loop through all rows and check if they are ok + + NOTES + One function for each record format + + RESULT + 0 ok + -1 Interrupted by user + 1 Error +*/ + +static int check_static_record(HA_CHECK *param, MARIA_HA *info, int extend, + uchar *record) +{ + my_off_t start_recpos, pos; + char llbuff[22]; + + pos= 0; + while (pos < info->state->data_file_length) + { + if (*_ma_killed_ptr(param)) + return -1; + if (my_b_read(&param->read_cache,(uchar*) record, + info->s->base.pack_reclength)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at position: %s", + my_errno, llstr(pos, llbuff)); + return 1; + } + start_recpos= pos; + pos+= info->s->base.pack_reclength; + param->splits++; + if (*record == '\0') + { + param->del_blocks++; + param->del_length+= info->s->base.pack_reclength; + continue; /* Record removed */ + } + param->glob_crc+= _ma_static_checksum(info,record); + param->used+= info->s->base.pack_reclength; + if (check_keys_in_record(param, info, extend, start_recpos, record)) + return 1; + } + return 0; +} + + +static int check_dynamic_record(HA_CHECK *param, MARIA_HA *info, int extend, + uchar *record) +{ + MARIA_BLOCK_INFO block_info; + my_off_t start_recpos, start_block, pos; + uchar *to; + ulong left_length; + uint b_type; + char llbuff[22],llbuff2[22],llbuff3[22]; + DBUG_ENTER("check_dynamic_record"); + + LINT_INIT(left_length); + 
LINT_INIT(start_recpos); + LINT_INIT(to); + + pos= 0; + while (pos < info->state->data_file_length) + { + my_bool got_error= 0; + int flag; + if (*_ma_killed_ptr(param)) + DBUG_RETURN(-1); + + flag= block_info.second_read=0; + block_info.next_filepos=pos; + do + { + if (_ma_read_cache(&param->read_cache,(uchar*) block_info.header, + (start_block=block_info.next_filepos), + sizeof(block_info.header), + (flag ? 0 : READING_NEXT) | READING_HEADER)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at " + "position: %s", + my_errno, llstr(start_block, llbuff)); + DBUG_RETURN(1); + } + + if (start_block & (MARIA_DYN_ALIGN_SIZE-1)) + { + _ma_check_print_error(param,"Wrong aligned block at %s", + llstr(start_block,llbuff)); + DBUG_RETURN(1); + } + b_type= _ma_get_block_info(&block_info,-1,start_block); + if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if (b_type & BLOCK_SYNC_ERROR) + { + if (flag) + { + _ma_check_print_error(param,"Unexpected byte: %d at link: %s", + (int) block_info.header[0], + llstr(start_block,llbuff)); + DBUG_RETURN(1); + } + pos=block_info.filepos+block_info.block_len; + goto next; + } + if (b_type & BLOCK_DELETED) + { + if (block_info.block_len < info->s->base.min_block_length) + { + _ma_check_print_error(param, + "Deleted block with impossible length %lu at %s", + block_info.block_len,llstr(pos,llbuff)); + DBUG_RETURN(1); + } + if ((block_info.next_filepos != HA_OFFSET_ERROR && + block_info.next_filepos >= info->state->data_file_length) || + (block_info.prev_filepos != HA_OFFSET_ERROR && + block_info.prev_filepos >= info->state->data_file_length)) + { + _ma_check_print_error(param,"Delete link points outside datafile at %s", + llstr(pos,llbuff)); + DBUG_RETURN(1); + } + param->del_blocks++; + param->del_length+= block_info.block_len; + param->splits++; + pos= block_info.filepos+block_info.block_len; + goto next; + } + _ma_check_print_error(param,"Wrong bytesec: %d-%d-%d at linkstart: 
%s", + block_info.header[0],block_info.header[1], + block_info.header[2], + llstr(start_block,llbuff)); + DBUG_RETURN(1); + } + if (info->state->data_file_length < block_info.filepos+ + block_info.block_len) + { + _ma_check_print_error(param, + "Recordlink that points outside datafile at %s", + llstr(pos,llbuff)); + got_error=1; + break; + } + param->splits++; + if (!flag++) /* First block */ + { + start_recpos=pos; + pos=block_info.filepos+block_info.block_len; + if (block_info.rec_len > (uint) info->s->base.max_pack_length) + { + _ma_check_print_error(param,"Found too long record (%lu) at %s", + (ulong) block_info.rec_len, + llstr(start_recpos,llbuff)); + got_error=1; + break; + } + if (info->s->base.blobs) + { + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + block_info.rec_len + + info->s->base.extra_rec_buff_size)) + + { + _ma_check_print_error(param, + "Not enough memory (%lu) for blob at %s", + (ulong) block_info.rec_len, + llstr(start_recpos,llbuff)); + got_error=1; + break; + } + } + to= info->rec_buff; + left_length= block_info.rec_len; + } + if (left_length < block_info.data_len) + { + _ma_check_print_error(param,"Found too long record (%lu) at %s", + (ulong) block_info.data_len, + llstr(start_recpos,llbuff)); + got_error=1; + break; + } + if (_ma_read_cache(&param->read_cache,(uchar*) to,block_info.filepos, + (uint) block_info.data_len, + flag == 1 ? 
READING_NEXT : 0)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at position: %s", my_errno, llstr(block_info.filepos, llbuff)); + + DBUG_RETURN(1); + } + to+=block_info.data_len; + param->link_used+= block_info.filepos-start_block; + param->used+= block_info.filepos - start_block + block_info.data_len; + param->empty+= block_info.block_len-block_info.data_len; + left_length-= block_info.data_len; + if (left_length) + { + if (b_type & BLOCK_LAST) + { + _ma_check_print_error(param, + "Wrong record length %s of %s at %s", + llstr(block_info.rec_len-left_length,llbuff), + llstr(block_info.rec_len, llbuff2), + llstr(start_recpos,llbuff3)); + got_error=1; + break; + } + if (info->state->data_file_length < block_info.next_filepos) + { + _ma_check_print_error(param, + "Found next-recordlink that points outside datafile at %s", + llstr(block_info.filepos,llbuff)); + got_error=1; + break; + } + } + } while (left_length); + + if (! got_error) + { + if (_ma_rec_unpack(info,record,info->rec_buff,block_info.rec_len) == + MY_FILE_ERROR) + { + _ma_check_print_error(param,"Found wrong record at %s", + llstr(start_recpos,llbuff)); + got_error=1; + } + else + { + ha_checksum checksum= 0; + if (info->s->calc_checksum) + checksum= (*info->s->calc_checksum)(info, record); + + if (param->testflag & (T_EXTEND | T_MEDIUM | T_VERBOSE)) + { + if (_ma_rec_check(info,record, info->rec_buff,block_info.rec_len, + test(info->s->calc_checksum), checksum)) + { + _ma_check_print_error(param,"Found wrong packed record at %s", + llstr(start_recpos,llbuff)); + got_error= 1; + } + } + param->glob_crc+= checksum; + } + + if (! 
got_error) + { + if (check_keys_in_record(param, info, extend, start_recpos, record)) + DBUG_RETURN(1); + } + else + { + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + } + } + else if (!flag) + pos= block_info.filepos+block_info.block_len; +next:; + } + DBUG_RETURN(0); +} + + +static int check_compressed_record(HA_CHECK *param, MARIA_HA *info, int extend, + uchar *record) +{ + my_off_t start_recpos, pos; + char llbuff[22]; + bool got_error= 0; + MARIA_BLOCK_INFO block_info; + DBUG_ENTER("check_compressed_record"); + + pos= info->s->pack.header_length; /* Skip header */ + while (pos < info->state->data_file_length) + { + if (*_ma_killed_ptr(param)) + DBUG_RETURN(-1); + + if (_ma_read_cache(&param->read_cache,(uchar*) block_info.header, pos, + info->s->pack.ref_length, READING_NEXT)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at position: %s", + my_errno, llstr(pos, llbuff)); + DBUG_RETURN(1); + } + + start_recpos= pos; + param->splits++; + VOID(_ma_pack_get_block_info(info, &info->bit_buff, &block_info, + &info->rec_buff, &info->rec_buff_size, -1, + start_recpos)); + pos=block_info.filepos+block_info.rec_len; + if (block_info.rec_len < (uint) info->s->min_pack_length || + block_info.rec_len > (uint) info->s->max_pack_length) + { + _ma_check_print_error(param, + "Found block with wrong recordlength: %d at %s", + block_info.rec_len, llstr(start_recpos,llbuff)); + got_error=1; + goto end; + } + if (_ma_read_cache(&param->read_cache,(uchar*) info->rec_buff, + block_info.filepos, block_info.rec_len, READING_NEXT)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at position: %s", + my_errno, llstr(block_info.filepos, llbuff)); + DBUG_RETURN(1); + } + if (_ma_pack_rec_unpack(info, &info->bit_buff, record, + info->rec_buff, block_info.rec_len)) + { + _ma_check_print_error(param,"Found wrong record at %s", + llstr(start_recpos,llbuff)); + got_error=1; + goto end; + } + param->glob_crc+= 
(*info->s->calc_checksum)(info,record); + param->link_used+= (block_info.filepos - start_recpos); + param->used+= (pos-start_recpos); + +end: + if (! got_error) + { + if (check_keys_in_record(param, info, extend, start_recpos, record)) + DBUG_RETURN(1); + } + else + { + got_error= 0; /* Reset for next loop */ + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +/* + Check if layout on a page is ok + + NOTES + This is for rows-in-block format. +*/ + +static int check_page_layout(HA_CHECK *param, MARIA_HA *info, + my_off_t page_pos, uchar *page, + uint row_count, uint head_empty, + uint *real_rows_found) +{ + uint empty, last_row_end, row, first_dir_entry; + uchar *dir_entry; + char llbuff[22]; + DBUG_ENTER("check_page_layout"); + + empty= 0; + last_row_end= PAGE_HEADER_SIZE; + *real_rows_found= 0; + + dir_entry= page+ info->s->block_size - PAGE_SUFFIX_SIZE; + first_dir_entry= info->s->block_size - row_count* DIR_ENTRY_SIZE; + for (row= 0 ; row < row_count ; row++) + { + uint pos, length; + dir_entry-= DIR_ENTRY_SIZE; + pos= uint2korr(dir_entry); + if (!pos) + { + if (row == row_count -1) + { + _ma_check_print_error(param, + "Page %9s: First entry in directory is 0", + llstr(page_pos, llbuff)); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + } + continue; /* Deleted row */ + } + (*real_rows_found)++; + length= uint2korr(dir_entry+2); + param->used+= length; + if (pos < last_row_end) + { + _ma_check_print_error(param, + "Page %9s: Row %3u overlapps with previous row", + llstr(page_pos, llbuff), row); + DBUG_RETURN(1); + } + empty+= (pos - last_row_end); + last_row_end= pos + length; + if (last_row_end > first_dir_entry) + { + _ma_check_print_error(param, + "Page %9s: Row %3u overlapps with directory", + llstr(page_pos, llbuff), row); + DBUG_RETURN(1); + } + } + empty+= (first_dir_entry - last_row_end); + + if (empty != head_empty) + { + 
_ma_check_print_error(param, + "Page %9s: Wrong empty size. Stored: %5u Actual: %5u", + llstr(page_pos, llbuff), head_empty, empty); + DBUG_RETURN(param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)); + } + DBUG_RETURN(0); +} + + +/* + Check all rows on head page + + NOTES + This is for rows-in-block format. + + Before this, we have already called check_page_layout(), so + we know the block is logicaly correct (even if the rows may not be that) + + RETURN + 0 ok + 1 error +*/ + + +static my_bool check_head_page(HA_CHECK *param, MARIA_HA *info, uchar *record, + int extend, my_off_t page_pos, uchar *page_buff, + uint row_count) +{ + uchar *dir_entry; + uint row; + char llbuff[22], llbuff2[22]; + DBUG_ENTER("check_head_page"); + + dir_entry= page_buff+ info->s->block_size - PAGE_SUFFIX_SIZE; + for (row= 0 ; row < row_count ; row++) + { + uint pos, length, flag; + dir_entry-= DIR_ENTRY_SIZE; + pos= uint2korr(dir_entry); + if (!pos) + continue; + length= uint2korr(dir_entry+2); + if (length < info->s->base.min_block_length) + { + _ma_check_print_error(param, + "Page %9s: Row %3u is too short (%d bytes)", + llstr(page_pos, llbuff), row, length); + DBUG_RETURN(1); + } + flag= (uint) (uchar) page_buff[pos]; + if (flag & ~(ROW_FLAG_ALL)) + _ma_check_print_error(param, + "Page %9s: Row %3u has wrong flag: %d", + llstr(page_pos, llbuff), row, flag); + + DBUG_PRINT("info", ("rowid: %s page: %lu row: %u", + llstr(ma_recordpos(page_pos/info->s->block_size, row), + llbuff), + (ulong) (page_pos / info->s->block_size), row)); + if (_ma_read_block_record2(info, record, page_buff+pos, + page_buff+pos+length)) + { + _ma_check_print_error(param, + "Page %9s: Row %3d is crashed", + llstr(page_pos, llbuff), row); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + continue; + } + if (info->s->calc_checksum) + { + ha_checksum checksum= (*info->s->calc_checksum)(info, record); + if (info->cur_row.checksum != (checksum & 255)) + 
_ma_check_print_error(param, "Page %9s: Row %3d has wrong checksum", + llstr(page_pos, llbuff), row); + param->glob_crc+= checksum; + } + if (info->cur_row.extents_count) + { + uchar *extents= info->cur_row.extents; + uint i; + /* Check that bitmap has the right marker for the found extents */ + for (i= 0 ; i < info->cur_row.extents_count ; i++) + { + uint page, page_count, page_type; + page= uint5korr(extents); + page_count= uint2korr(extents+5); + extents+= ROW_EXTENT_SIZE; + page_type= BLOB_PAGE; + if (page_count & TAIL_BIT) + { + page_count= 1; + page_type= TAIL_PAGE; + } + for ( ; page_count--; page++) + { + uint bitmap_pattern; + if (_ma_check_if_right_bitmap_type(info, page_type, page, + &bitmap_pattern)) + { + _ma_check_print_error(param, + "Page %9s: Row: %3d has an extent with wrong information in bitmap: Page %9s Page_type: %d Bitmap: %d", + llstr(page_pos, llbuff), row, + llstr(page * info->s->bitmap.block_size, + llbuff2), + page_type, + bitmap_pattern); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + } + } + } + } + param->full_page_count+= info->cur_row.full_page_count; + param->tail_count+= info->cur_row.tail_count; + if (check_keys_in_record(param, info, extend, + ma_recordpos(page_pos/info->s->block_size, row), + record)) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + +/* + Check if rows-in-block data file is consistent +*/ + +static int check_block_record(HA_CHECK *param, MARIA_HA *info, int extend, + uchar *record) +{ + my_off_t pos; + uchar *page_buff, *bitmap_buff, *data; + char llbuff[22], llbuff2[22]; + uint block_size= info->s->block_size; + ha_rows full_page_count, tail_count; + my_bool full_dir; + uint offset_page, offset; + + LINT_INIT(full_dir); + + if (_ma_scan_init_block_record(info)) + { + _ma_check_print_error(param, "got error %d when initializing scan", + my_errno); + return 1; + } + bitmap_buff= info->scan.bitmap_buff; + page_buff= info->scan.page_buff; + full_page_count= tail_count= 0; + 
param->full_page_count= param->tail_count= 0; + param->used= param->link_used= 0; + + for (pos= 0; + pos < info->state->data_file_length; + pos+= block_size) + { + uint row_count, real_row_count, empty_space, page_type, bitmap_pattern; + LINT_INIT(row_count); + LINT_INIT(empty_space); + + if (*_ma_killed_ptr(param)) + { + _ma_scan_end_block_record(info); + return -1; + } + if (((pos / block_size) % info->s->bitmap.pages_covered) == 0) + { + /* Bitmap page */ + if (pagecache_read(info->s->pagecache, + &info->dfile, + (pos / block_size), 1, + bitmap_buff, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0) + { + _ma_check_print_error(param, + "Page %9s: Got error: %d when reading datafile", + llstr(pos, llbuff), my_errno); + goto err; + } + param->used+= block_size; + param->link_used+= block_size; + continue; + } + /* Skip pages marked as empty in bitmap */ + offset_page= (((pos / block_size) % info->s->bitmap.pages_covered) -1) * 3; + offset= offset_page & 7; + data= bitmap_buff + offset_page / 8; + bitmap_pattern= uint2korr(data); + param->splits++; + if (!((bitmap_pattern >> offset) & 7)) + { + param->empty+= block_size; + param->del_blocks++; + continue; + } + + if (pagecache_read(info->s->pagecache, + &info->dfile, + (pos / block_size), 1, + page_buff, + info->s->page_type, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0) + { + _ma_check_print_error(param, + "Page %9s: Got error: %d when reading datafile", + llstr(pos, llbuff), my_errno); + goto err; + } + page_type= page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK; + if (page_type == UNALLOCATED_PAGE || page_type >= MAX_PAGE_TYPE) + { + _ma_check_print_error(param, + "Page %9s: Found wrong page type %d\n", + llstr(pos, llbuff), page_type); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + goto err; + continue; + } + switch ((enum en_page_type) page_type) { + case UNALLOCATED_PAGE: + case MAX_PAGE_TYPE: + DBUG_ASSERT(0); /* Impossible */ + break; + case HEAD_PAGE: + row_count= ((uchar*) 
page_buff)[DIR_COUNT_OFFSET]; + empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET); + param->used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE + + row_count * DIR_ENTRY_SIZE); + param->link_used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE + + row_count * DIR_ENTRY_SIZE); + full_dir= row_count == MAX_ROWS_PER_PAGE; + break; + case TAIL_PAGE: + row_count= ((uchar*) page_buff)[DIR_COUNT_OFFSET]; + empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET); + param->used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE + + row_count * DIR_ENTRY_SIZE); + param->link_used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE + + row_count * DIR_ENTRY_SIZE); + full_dir= row_count == MAX_ROWS_PER_PAGE; + break; + case BLOB_PAGE: + full_page_count++; + full_dir= 0; + empty_space= block_size; /* for error reporting */ + param->link_used+= (LSN_SIZE + PAGE_TYPE_SIZE); + param->used+= block_size; + break; + } + if (_ma_check_bitmap_data(info, page_type, pos / block_size, + full_dir ? 0 : empty_space, + &bitmap_pattern)) + { + if (bitmap_pattern == ~(uint) 0) + _ma_check_print_error(param, + "Page: %9s: Wrong bitmap for data on page", + llstr(pos, llbuff)); + else + _ma_check_print_error(param, + "Page %9s: Wrong data in bitmap. 
Page_type: %d empty_space: %u Bitmap-bits: %d", + llstr(pos, llbuff), page_type, empty_space, + bitmap_pattern); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + goto err; + } + if ((enum en_page_type) page_type == BLOB_PAGE) + continue; + param->empty+= empty_space; + if (check_page_layout(param, info, pos, page_buff, row_count, + empty_space, &real_row_count)) + goto err; + if ((enum en_page_type) page_type == TAIL_PAGE) + { + tail_count+= real_row_count; + continue; + } + if (check_head_page(param, info, record, extend, pos, page_buff, + row_count)) + goto err; + } + + _ma_scan_end_block_record(info); + + if (full_page_count != param->full_page_count) + _ma_check_print_error(param, "Full page count read through records was %s but we found %s pages while scanning table", + llstr(param->full_page_count, llbuff), + llstr(full_page_count, llbuff2)); + if (tail_count != param->tail_count) + _ma_check_print_error(param, "Tail count read through records was %s but we found %s tails while scanning table", + llstr(param->tail_count, llbuff), + llstr(tail_count, llbuff2)); + + /* Update splits to avoid warning */ + info->s->state.split= param->splits; + info->state->del= param->del_blocks; + return param->error_printed != 0; + +err: + _ma_scan_end_block_record(info); + return 1; +} + + +/* Check that record-link is ok */ + +int maria_chk_data_link(HA_CHECK *param, MARIA_HA *info,int extend) +{ + int error; + uchar *record; + char llbuff[22],llbuff2[22],llbuff3[22]; + DBUG_ENTER("maria_chk_data_link"); + + if (!(param->testflag & T_SILENT)) + { + if (extend) + puts("- check records and index references"); + else + puts("- check record links"); + } + + if (!(record= (uchar*) my_malloc(info->s->base.pack_reclength,MYF(0)))) + { + _ma_check_print_error(param,"Not enough memory for record"); + DBUG_RETURN(-1); + } + param->records= param->del_blocks= 0; + param->used= param->link_used= param->splits= param->del_length= 0; + param->tmp_record_checksum= 
param->glob_crc= 0; + param->err_count= 0; + + error= 0; + param->empty= info->s->pack.header_length; + + bzero((char*) param->tmp_key_crc, + info->s->base.keys * sizeof(param->tmp_key_crc[0])); + + switch (info->s->data_file_type) { + case BLOCK_RECORD: + error= check_block_record(param, info, extend, record); + break; + case STATIC_RECORD: + error= check_static_record(param, info, extend, record); + break; + case DYNAMIC_RECORD: + error= check_dynamic_record(param, info, extend, record); + break; + case COMPRESSED_RECORD: + error= check_compressed_record(param, info, extend, record); + break; + } /* switch */ + + if (error) + goto err; + + if (param->testflag & T_WRITE_LOOP) + { + VOID(fputs(" \r",stdout)); VOID(fflush(stdout)); + } + if (param->records != info->state->records) + { + _ma_check_print_error(param, + "Record-count is not ok; found %-10s Should be: %s", + llstr(param->records,llbuff), + llstr(info->state->records,llbuff2)); + error=1; + } + else if (param->record_checksum && + param->record_checksum != param->tmp_record_checksum) + { + _ma_check_print_error(param, + "Key pointers and record positions doesn't match"); + error=1; + } + else if (param->glob_crc != info->state->checksum && + (info->s->options & + (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))) + { + _ma_check_print_warning(param, + "Record checksum is not the same as checksum stored in the index file\n"); + error=1; + } + else if (!extend) + { + uint key; + for (key=0 ; key < info->s->base.keys; key++) + { + if (param->tmp_key_crc[key] != param->key_crc[key] && + !(info->s->keyinfo[key].flag & (HA_FULLTEXT | HA_SPATIAL))) + { + _ma_check_print_error(param,"Checksum for key: %2d doesn't match checksum for records", + key+1); + error=1; + } + } + } + + if (param->del_length != info->state->empty) + { + _ma_check_print_warning(param, + "Found %s deleted space. 
Should be %s", + llstr(param->del_length,llbuff2), + llstr(info->state->empty,llbuff)); + } + if (param->used + param->empty + param->del_length != + info->state->data_file_length) + { + _ma_check_print_warning(param, + "Found %s record data and %s unused data and %s deleted data", + llstr(param->used, llbuff), + llstr(param->empty,llbuff2), + llstr(param->del_length,llbuff3)); + _ma_check_print_warning(param, + "Total %s Should be: %s", + llstr((param->used+param->empty+param->del_length), + llbuff), + llstr(info->state->data_file_length,llbuff2)); + } + if (param->del_blocks != info->state->del) + { + _ma_check_print_warning(param, + "Found %10s deleted blocks Should be: %s", + llstr(param->del_blocks,llbuff), + llstr(info->state->del,llbuff2)); + } + if (param->splits != info->s->state.split) + { + _ma_check_print_warning(param, + "Found %10s parts Should be: %s parts", + llstr(param->splits, llbuff), + llstr(info->s->state.split,llbuff2)); + } + if (param->testflag & T_INFO) + { + if (param->warning_printed || param->error_printed) + puts(""); + if (param->used != 0 && ! param->error_printed) + { + if (param->records) + { + printf("Records:%18s M.recordlength:%9lu Packed:%14.0f%%\n", + llstr(param->records,llbuff), + (long)((param->used - param->link_used)/param->records), + (info->s->base.blobs ? 0.0 : + (ulonglong2double((ulonglong) info->s->base.reclength * + param->records)- + my_off_t2double(param->used))/ + ulonglong2double((ulonglong) info->s->base.reclength * + param->records)*100.0)); + printf("Recordspace used:%9.0f%% Empty space:%12d%% Blocks/Record: %6.2f\n", + (ulonglong2double(param->used - param->link_used)/ + ulonglong2double(param->used-param->link_used+param->empty)*100.0), + (!param->records ? 
100 : + (int) (ulonglong2double(param->del_length+param->empty)/ + my_off_t2double(param->used)*100.0)), + ulonglong2double(param->splits - param->del_blocks) / + param->records); + } + else + printf("Records:%18s\n", "0"); + } + printf("Record blocks:%12s Delete blocks:%10s\n", + llstr(param->splits - param->del_blocks, llbuff), + llstr(param->del_blocks, llbuff2)); + printf("Record data: %12s Deleted data: %10s\n", + llstr(param->used - param->link_used,llbuff), + llstr(param->del_length, llbuff2)); + printf("Lost space: %12s Linkdata: %10s\n", + llstr(param->empty, llbuff),llstr(param->link_used, llbuff2)); + } + my_free((uchar*) record,MYF(0)); + DBUG_RETURN (error); + + err: + my_free((uchar*) record,MYF(0)); + param->testflag|=T_RETRY_WITHOUT_QUICK; + DBUG_RETURN(1); +} /* maria_chk_data_link */ + + +/* + Recover old table by reading each record and writing all keys + + NOTES + Save new datafile-name in temp_filename. + We overwrite the index file as we go (writekeys() for example), so if we + crash during this the table is unusable and user (or Recovery in the + future) must repeat the REPAIR/OPTIMIZE operation. We could use a + temporary index file in the future (drawback: more disk space). + + IMPLEMENTATION (for hard repair with block format) + - Create new, unrelated MARIA_HA of the table + - Create new datafile and associate it with new handler + - Reset all statistic information in new handler + - Copy all data to new handler with normal write operations + - Move state of new handler to old handler + - Close new handler + - Close data file in old handler + - Rename old data file to new data file. 
+ - Reopen data file in old handler +*/ + +int maria_repair(HA_CHECK *param, register MARIA_HA *info, + char *name, int rep_quick) +{ + int error, got_error= 1; + uint i; + ha_rows start_records,new_header_length; + my_off_t del; + File new_file; + MARIA_SHARE *share=info->s; + char llbuff[22],llbuff2[22]; + MARIA_SORT_INFO sort_info; + MARIA_SORT_PARAM sort_param; + my_bool block_record, scan_inited= 0; + enum data_file_type org_data_file_type= info->s->data_file_type; + myf sync_dir= ((share->now_transactional && !share->temporary) ? + MY_SYNC_DIR : 0); + DBUG_ENTER("maria_repair"); + + bzero((char *)&sort_info, sizeof(sort_info)); + bzero((char *)&sort_param, sizeof(sort_param)); + start_records=info->state->records; + new_header_length=(param->testflag & T_UNPACK) ? 0L : + share->pack.header_length; + new_file= -1; + sort_param.sort_info=&sort_info; + block_record= org_data_file_type == BLOCK_RECORD; + sort_info.info= sort_info.new_info= info; + bzero(&info->rec_cache,sizeof(info->rec_cache)); + + if (!(param->testflag & T_SILENT)) + { + printf("- recovering (with keycache) MARIA-table '%s'\n",name); + printf("Data records: %s\n", llstr(info->state->records,llbuff)); + } + param->testflag|=T_REP; /* for easy checking */ + + if (info->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)) + param->testflag|=T_CALC_CHECKSUM; + + /* + The physical size of the data file is sometimes used during repair (see + sort_info.filelength further below); we need to flush to have it exact. + We flush the state because our maria_open(HA_OPEN_COPY) will want to read + it from disk. Index file will be recreated. 
+ */ + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_FORCE_WRITE, FLUSH_IGNORE_CHANGED) || + _ma_state_info_write(share, 1|2|4)) + goto err; + + if (!rep_quick) + { + /* Get real path for data file */ + if ((new_file= my_create(fn_format(param->temp_filename, + share->data_file_name, "", + DATA_TMP_EXT, 2+4), + 0,param->tmpfile_createflag, + MYF(0))) < 0) + { + _ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + goto err; + } + if (new_header_length && + maria_filecopy(param, new_file, info->dfile.file, 0L, + new_header_length, "datafile-header")) + goto err; + info->s->state.dellink= HA_OFFSET_ERROR; + info->rec_cache.file= new_file; + if (share->data_file_type == BLOCK_RECORD || + ((param->testflag & T_UNPACK) && + share->state.header.org_data_file_type == BLOCK_RECORD)) + { + MARIA_HA *new_info; + /* + It's ok for Recovery to have two MARIA_SHARE on the same index file + because the one below is not transactional + */ + if (!(sort_info.new_info= maria_open(info->s->open_file_name, O_RDWR, + HA_OPEN_COPY | HA_OPEN_FOR_REPAIR))) + goto err; + new_info= sort_info.new_info; + change_data_file_descriptor(new_info, new_file); + maria_lock_database(new_info, F_EXTRA_LCK); + if ((param->testflag & T_UNPACK) && + share->data_file_type == COMPRESSED_RECORD) + { + (*new_info->s->once_end)(new_info->s); + (*new_info->s->end)(new_info); + restore_data_file_type(new_info->s); + _ma_setup_functions(new_info->s); + if ((*new_info->s->once_init)(new_info->s, new_file) || + (*new_info->s->init)(new_info)) + goto err; + } + _ma_reset_status(sort_info.new_info); + if (_ma_initialize_data_file(sort_info.new_info->s, new_file)) + goto err; + block_record= 1; + } + } + + if (org_data_file_type != BLOCK_RECORD) + { + /* We need a read buffer to read rows in big blocks */ + if (init_io_cache(&param->read_cache, info->dfile.file, + (uint) param->read_buffer_length, + READ_CACHE, share->pack.header_length, 1, MYF(MY_WME))) 
+ goto err; + } + if (sort_info.new_info->s->data_file_type != BLOCK_RECORD) + { + /* When writing to not block records, we need a write buffer */ + if (!rep_quick) + if (init_io_cache(&info->rec_cache, new_file, + (uint) param->write_buffer_length, + WRITE_CACHE, new_header_length, 1, + MYF(MY_WME | MY_WAIT_IF_FULL))) + goto err; + info->opt_flag|=WRITE_CACHE_USED; + } + else + { + scan_inited= 1; + if (maria_scan_init(sort_info.info)) + goto err; + } + + if (!(sort_param.record=(uchar*) my_malloc((uint) share->base.pack_reclength, + MYF(0))) || + _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size, + info->s->base.default_rec_buff_size)) + { + _ma_check_print_error(param, "Not enough memory for extra record"); + goto err; + } + + sort_info.param = param; + sort_param.read_cache=param->read_cache; + sort_param.pos=sort_param.max_pos=share->pack.header_length; + sort_param.filepos=new_header_length; + param->read_cache.end_of_file=sort_info.filelength= + my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)); + sort_info.dupp=0; + sort_param.fix_datafile= (my_bool) (! rep_quick); + sort_param.master=1; + sort_info.max_records= ~(ha_rows) 0; + + set_data_file_type(&sort_info, share); + del=info->state->del; + info->state->records=info->state->del=share->state.split=0; + info->state->empty=0; + param->glob_crc=0; + if (param->testflag & T_CALC_CHECKSUM) + sort_param.calc_checksum= 1; + + info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + /* + Clear all keys. Note that all key blocks allocated until now remain + "dead" parts of the key file. (Bug #4692) + */ + for (i=0 ; i < info->s->base.keys ; i++) + share->state.key_root[i]= HA_OFFSET_ERROR; + + /* Drop the delete chain. */ + share->state.key_del= HA_OFFSET_ERROR; + + /* + If requested, activate (enable) all keys in key_map. In this case, + all indexes will be (re-)built. 
+ */ + if (param->testflag & T_CREATE_MISSING_KEYS) + maria_set_all_keys_active(share->state.key_map, share->base.keys); + + info->state->key_file_length=share->base.keystart; + + maria_lock_memory(param); /* Everything is alloced */ + + sort_info.org_data_file_type= info->s->data_file_type; + + /* Re-create all keys, which are set in key_map. */ + while (!(error=sort_get_next_record(&sort_param))) + { + if (block_record && _ma_sort_write_record(&sort_param)) + goto err; + + if (writekeys(&sort_param)) + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY) + goto err; + DBUG_DUMP("record",(uchar*) sort_param.record,share->base.pack_reclength); + _ma_check_print_info(param,"Duplicate key %2d for record at %10s against new record at %10s", + info->errkey+1, + llstr(sort_param.start_recpos,llbuff), + llstr(info->dup_key_pos,llbuff2)); + if (param->testflag & T_VERBOSE) + { + VOID(_ma_make_key(info,(uint) info->errkey,info->lastkey, + sort_param.record,0L)); + _ma_print_key(stdout,share->keyinfo[info->errkey].seg,info->lastkey, + USE_WHOLE_KEY); + } + sort_info.dupp++; + if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK) + { + param->testflag|=T_RETRY_WITHOUT_QUICK; + param->error_printed=1; + goto err; + } + /* purecov: begin tested */ + if (block_record) + { + sort_info.new_info->state->records--; + if ((*sort_info.new_info->s->write_record_abort)(sort_info.new_info)) + { + _ma_check_print_error(param,"Couldn't delete duplicate row"); + goto err; + } + continue; + } + /* purecov: end */ + } + if (!block_record && _ma_sort_write_record(&sort_param)) + goto err; + } + if (error > 0 || maria_write_data_suffix(&sort_info, (my_bool)!rep_quick) || + flush_io_cache(&info->rec_cache) || param->read_cache.error < 0) + goto err; + + if (param->testflag & T_WRITE_LOOP) + { + VOID(fputs(" \r",stdout)); VOID(fflush(stdout)); + } + if (my_chsize(share->kfile.file, info->state->key_file_length, 0, MYF(0))) + { + _ma_check_print_warning(param, + "Can't change size of indexfile, 
error: %d", + my_errno); + goto err; + } + + if (rep_quick && del+sort_info.dupp != info->state->del) + { + _ma_check_print_error(param,"Couldn't fix table with quick recovery: Found wrong number of deleted records"); + _ma_check_print_error(param,"Run recovery again without -q"); + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + goto err; + } + + if (param->testflag & T_SAFE_REPAIR) + { + /* Don't repair if we loosed more than one row */ + if (sort_info.new_info->state->records+1 < start_records) + { + info->state->records=start_records; + goto err; + } + } + + if (!rep_quick) + { + if (sort_info.new_info != sort_info.info) + { + MARIA_STATE_INFO save_state= sort_info.new_info->s->state; + if (maria_close(sort_info.new_info)) + { + _ma_check_print_error(param, "Got error %d on close", my_errno); + goto err; + } + copy_data_file_state(&info->s->state, &save_state); + new_file= -1; + } + else + info->state->data_file_length= sort_param.filepos; + share->state.version=(ulong) time((time_t*) 0); /* Force reopen */ + + /* Replace the actual file with the temporary file */ + if (new_file >= 0) + my_close(new_file, MYF(MY_WME)); + new_file= -1; + change_data_file_descriptor(info, -1); + if (maria_change_to_newfile(share->data_file_name,MARIA_NAME_DEXT, + DATA_TMP_EXT, + (param->testflag & T_BACKUP_DATA ? 
+ MYF(MY_REDEL_MAKE_BACKUP): MYF(0)) | + sync_dir) || + _ma_open_datafile(info, share, -1)) + { + goto err; + } + } + else + { + info->state->data_file_length= sort_param.max_pos; + } + if (param->testflag & T_CALC_CHECKSUM) + info->state->checksum= param->glob_crc; + + if (!(param->testflag & T_SILENT)) + { + if (start_records != info->state->records) + printf("Data records: %s\n", llstr(info->state->records,llbuff)); + if (sort_info.dupp) + _ma_check_print_warning(param, + "%s records have been removed", + llstr(sort_info.dupp,llbuff)); + } + + got_error= 0; + /* If invoked by external program that uses thr_lock */ + if (&share->state.state != info->state) + memcpy( &share->state.state, info->state, sizeof(*info->state)); + +err: + if (scan_inited) + maria_scan_end(sort_info.info); + + VOID(end_io_cache(¶m->read_cache)); + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + /* this below could fail, shouldn't we detect error? */ + VOID(end_io_cache(&info->rec_cache)); + got_error|= _ma_flush_table_files_after_repair(param, info); + if (got_error) + { + if (! param->error_printed) + _ma_check_print_error(param,"%d for record at pos %s",my_errno, + llstr(sort_param.start_recpos,llbuff)); + if (sort_info.new_info && sort_info.new_info != sort_info.info) + { + /** + @todo ASK_MONTY + grepping for "dfile.file=" + shows several places (ma_check.c, ma_panic.c, ma_extra.c) where we + modify dfile.file without modifying share->bitmap.file.file; those + sound like bugs because the two variables are normally copies of each + other in BLOCK_RECORD (and in other record formats it does not hurt + to change the unused share->bitmap.file.file). + It does matter, because if we close dfile.file, set dfile.file to -1, + but leave bitmap.file.file to its positive value, maria_close() will + close a file which it is not allowed to (maybe even a file in another + engine or mysqld!). 
+ */ + sort_info.new_info->dfile.file= -1; + maria_close(sort_info.new_info); + } + if (new_file >= 0) + { + VOID(my_close(new_file,MYF(0))); + VOID(my_delete(param->temp_filename, MYF(MY_WME))); + info->rec_cache.file=-1; /* don't flush data to new_file, it's closed */ + } + maria_mark_crashed_on_repair(info); + } + else if (sync_dir) + { + /* + Now that we have flushed and forced everything, we can bump + create_rename_lsn: + */ + write_log_record_for_repair(param, info); + } + my_free(sort_param.rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR)); + my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR)); + if (!got_error && (param->testflag & T_UNPACK)) + restore_data_file_type(share); + share->state.changed|= (STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES | + STATE_NOT_ANALYZED); + share->state.changed&= ~STATE_NOT_OPTIMIZED_ROWS; + DBUG_RETURN(got_error); +} + + +/* Uppdate keyfile when doing repair */ + +static int writekeys(MARIA_SORT_PARAM *sort_param) +{ + register uint i; + uchar *key; + MARIA_HA *info= sort_param->sort_info->info; + uchar *buff= sort_param->record; + my_off_t filepos= sort_param->filepos; + DBUG_ENTER("writekeys"); + + key= info->lastkey+info->s->base.max_key_length; + for (i=0 ; i < info->s->base.keys ; i++) + { + if (maria_is_key_active(info->s->state.key_map, i)) + { + if (info->s->keyinfo[i].flag & HA_FULLTEXT ) + { + if (_ma_ft_add(info,i,(char*) key,buff,filepos)) + goto err; + } +#ifdef HAVE_SPATIAL + else if (info->s->keyinfo[i].flag & HA_SPATIAL) + { + uint key_length= _ma_make_key(info,i,key,buff,filepos); + if (maria_rtree_insert(info, i, key, key_length)) + goto err; + } +#endif /*HAVE_SPATIAL*/ + else + { + uint key_length= _ma_make_key(info,i,key,buff,filepos); + if (_ma_ck_write(info,i,key,key_length)) + goto err; + } + } + } + DBUG_RETURN(0); + + err: + if (my_errno == HA_ERR_FOUND_DUPP_KEY) + { + info->errkey=(int) i; /* This key was found */ + while ( i-- > 0 ) + { + if 
(maria_is_key_active(info->s->state.key_map, i)) + { + if (info->s->keyinfo[i].flag & HA_FULLTEXT) + { + if (_ma_ft_del(info,i,(char*) key,buff,filepos)) + break; + } + else + { + uint key_length= _ma_make_key(info,i,key,buff,filepos); + if (_ma_ck_delete(info,i,key,key_length)) + break; + } + } + } + } + /* Remove checksum that was added to glob_crc in sort_get_next_record */ + if (sort_param->calc_checksum) + sort_param->sort_info->param->glob_crc-= info->cur_row.checksum; + DBUG_PRINT("error",("errno: %d",my_errno)); + DBUG_RETURN(-1); +} /* writekeys */ + + + /* Change all key-pointers that points to a records */ + +int maria_movepoint(register MARIA_HA *info, uchar *record, + MARIA_RECORD_POS oldpos, MARIA_RECORD_POS newpos, + uint prot_key) +{ + register uint i; + uchar *key; + uint key_length; + DBUG_ENTER("maria_movepoint"); + + key= info->lastkey+info->s->base.max_key_length; + for (i=0 ; i < info->s->base.keys; i++) + { + if (i != prot_key && maria_is_key_active(info->s->state.key_map, i)) + { + key_length= _ma_make_key(info,i,key,record,oldpos); + if (info->s->keyinfo[i].flag & HA_NOSAME) + { /* Change pointer direct */ + uint nod_flag; + MARIA_KEYDEF *keyinfo; + keyinfo=info->s->keyinfo+i; + if (_ma_search(info,keyinfo,key,USE_WHOLE_KEY, + (uint) (SEARCH_SAME | SEARCH_SAVE_BUFF), + info->s->state.key_root[i])) + DBUG_RETURN(-1); + nod_flag=_ma_test_if_nod(info->buff); + _ma_dpointer(info,info->int_keypos-nod_flag- + info->s->rec_reflength,newpos); + if (_ma_write_keypage(info,keyinfo,info->last_keypage, + DFLT_INIT_HITS,info->buff)) + DBUG_RETURN(-1); + } + else + { /* Change old key to new */ + if (_ma_ck_delete(info,i,key,key_length)) + DBUG_RETURN(-1); + key_length= _ma_make_key(info,i,key,record,newpos); + if (_ma_ck_write(info,i,key,key_length)) + DBUG_RETURN(-1); + } + } + } + DBUG_RETURN(0); +} /* maria_movepoint */ + + + /* Tell system that we want all memory for our cache */ + +void maria_lock_memory(HA_CHECK *param __attribute__((unused))) +{ 
+#ifdef SUN_OS /* Key-cacheing thrases on sun 4.1 */ + if (param->opt_maria_lock_memory) + { + int success = mlockall(MCL_CURRENT); /* or plock(DATLOCK); */ + if (geteuid() == 0 && success != 0) + _ma_check_print_warning(param, + "Failed to lock memory. errno %d",my_errno); + } +#endif +} /* maria_lock_memory */ + + +/** + Flush all changed blocks to disk so that we can say "at the end of repair, + the table is fully ok on disk". + + It is a requirement for transactional tables. + We release blocks as it's unlikely that they would all be needed soon. + + @param param description of the repair operation + @param info table +*/ + +int _ma_flush_table_files_after_repair(HA_CHECK *param, MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_RELEASE, FLUSH_RELEASE) || + _ma_state_info_write(share, 1|4) || + (share->base.born_transactional && _ma_sync_table_files(info))) + { + _ma_check_print_error(param,"%d when trying to write bufferts",my_errno); + return 1; + } + return 0; +} /* _ma_flush_table_files_after_repair */ + + + /* Sort index for more efficent reads */ + +int maria_sort_index(HA_CHECK *param, register MARIA_HA *info, char *name) +{ + reg2 uint key; + reg1 MARIA_KEYDEF *keyinfo; + File new_file; + my_off_t index_pos[HA_MAX_POSSIBLE_KEY]; + uint r_locks,w_locks; + int old_lock; + MARIA_SHARE *share=info->s; + MARIA_STATE_INFO old_state; + myf sync_dir= (share->now_transactional && !share->temporary) ? 
+ MY_SYNC_DIR : 0; + DBUG_ENTER("maria_sort_index"); + + /* cannot sort index files with R-tree indexes */ + for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ; + key++,keyinfo++) + if (keyinfo->key_alg == HA_KEY_ALG_RTREE) + DBUG_RETURN(0); + + if (!(param->testflag & T_SILENT)) + printf("- Sorting index for MARIA-table '%s'\n",name); + + /* Get real path for index file */ + fn_format(param->temp_filename,name,"", MARIA_NAME_IEXT,2+4+32); + if ((new_file=my_create(fn_format(param->temp_filename,param->temp_filename, + "", INDEX_TMP_EXT,2+4), + 0,param->tmpfile_createflag,MYF(0))) <= 0) + { + _ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + DBUG_RETURN(-1); + } + if (maria_filecopy(param, new_file, share->kfile.file, 0L, + (ulong) share->base.keystart, "headerblock")) + goto err; + + param->new_file_pos=share->base.keystart; + for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ; + key++,keyinfo++) + { + if (! maria_is_key_active(info->s->state.key_map, key)) + continue; + + if (share->state.key_root[key] != HA_OFFSET_ERROR) + { + index_pos[key]=param->new_file_pos; /* Write first block here */ + if (sort_one_index(param,info,keyinfo,share->state.key_root[key], + new_file)) + goto err; + } + else + index_pos[key]= HA_OFFSET_ERROR; /* No blocks */ + } + + /* Flush key cache for this file if we are calling this outside maria_chk */ + flush_pagecache_blocks(share->pagecache, &share->kfile, + FLUSH_IGNORE_CHANGED); + + share->state.version=(ulong) time((time_t*) 0); + old_state= share->state; /* save state if not stored */ + r_locks= share->r_locks; + w_locks= share->w_locks; + old_lock= info->lock_type; + + /* Put same locks as old file */ + share->r_locks= share->w_locks= share->tot_locks= 0; + (void) _ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE); + pthread_mutex_lock(&share->intern_lock); + VOID(my_close(share->kfile.file, MYF(MY_WME))); + share->kfile.file = -1; + 
pthread_mutex_unlock(&share->intern_lock); + VOID(my_close(new_file,MYF(MY_WME))); + if (maria_change_to_newfile(share->index_file_name, MARIA_NAME_IEXT, + INDEX_TMP_EXT, sync_dir) || + _ma_open_keyfile(share)) + goto err2; + info->lock_type= F_UNLCK; /* Force maria_readinfo to lock */ + _ma_readinfo(info,F_WRLCK,0); /* Will lock the table */ + info->lock_type= old_lock; + share->r_locks= r_locks; + share->w_locks= w_locks; + share->tot_locks= r_locks+w_locks; + share->state= old_state; /* Restore old state */ + + info->state->key_file_length=param->new_file_pos; + info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + for (key=0 ; key < info->s->base.keys ; key++) + info->s->state.key_root[key]=index_pos[key]; + info->s->state.key_del= HA_OFFSET_ERROR; + + info->s->state.changed&= ~STATE_NOT_SORTED_PAGES; + DBUG_RETURN(0); + +err: + VOID(my_close(new_file,MYF(MY_WME))); +err2: + VOID(my_delete(param->temp_filename,MYF(MY_WME))); + DBUG_RETURN(-1); +} /* maria_sort_index */ + + + /* Sort records recursive using one index */ + +static int sort_one_index(HA_CHECK *param, MARIA_HA *info, + MARIA_KEYDEF *keyinfo, + my_off_t pagepos, File new_file) +{ + uint length,nod_flag,used_length, key_length; + uchar *buff,*keypos,*endpos; + uchar key[HA_MAX_POSSIBLE_KEY_BUFF]; + my_off_t new_page_pos,next_page; + char llbuff[22]; + DBUG_ENTER("sort_one_index"); + + /* cannot walk over R-tree indices */ + DBUG_ASSERT(keyinfo->key_alg != HA_KEY_ALG_RTREE); + new_page_pos=param->new_file_pos; + param->new_file_pos+=keyinfo->block_length; + + if (!(buff= (uchar*) my_alloca((uint) keyinfo->block_length))) + { + _ma_check_print_error(param,"Not enough memory for key block"); + DBUG_RETURN(-1); + } + if (!_ma_fetch_keypage(info,keyinfo,pagepos,DFLT_INIT_HITS,buff,0)) + { + _ma_check_print_error(param,"Can't read key block from filepos: %s", + llstr(pagepos,llbuff)); + goto err; + } + if ((nod_flag=_ma_test_if_nod(buff)) || keyinfo->flag & HA_FULLTEXT) + { + used_length= 
maria_data_on_page(buff); + keypos=buff+2+nod_flag; + endpos=buff+used_length; + for ( ;; ) + { + if (nod_flag) + { + next_page= _ma_kpos(nod_flag,keypos); + /* Save new pos */ + _ma_kpointer(info,keypos-nod_flag,param->new_file_pos); + if (sort_one_index(param,info,keyinfo,next_page,new_file)) + { + DBUG_PRINT("error", + ("From page: %ld, keyoffset: %lu used_length: %d", + (ulong) pagepos, (ulong) (keypos - buff), + (int) used_length)); + DBUG_DUMP("buff",(uchar*) buff,used_length); + goto err; + } + } + if (keypos >= endpos || + (key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,key)) == 0) + break; + DBUG_ASSERT(keypos <= endpos); + if (keyinfo->flag & HA_FULLTEXT) + { + uint off; + int subkeys; + get_key_full_length_rdonly(off, key); + subkeys=ft_sintXkorr(key+off); + if (subkeys < 0) + { + next_page= _ma_dpos(info,0,key+key_length); + _ma_dpointer(info,keypos-nod_flag-info->s->rec_reflength, + param->new_file_pos); /* Save new pos */ + if (sort_one_index(param,info,&info->s->ft2_keyinfo, + next_page,new_file)) + goto err; + } + } + } + } + + /* Fill block with zero and write it to the new index file */ + length= maria_data_on_page(buff); + bzero((uchar*) buff+length,keyinfo->block_length-length); + if (my_pwrite(new_file,(uchar*) buff,(uint) keyinfo->block_length, + new_page_pos,MYF(MY_NABP | MY_WAIT_IF_FULL))) + { + _ma_check_print_error(param,"Can't write indexblock, error: %d",my_errno); + goto err; + } + my_afree((uchar*) buff); + DBUG_RETURN(0); +err: + my_afree((uchar*) buff); + DBUG_RETURN(1); +} /* sort_one_index */ + + + /* + Let temporary file replace old file. + This assumes that the new file was created in the same + directory as given by realpath(filename). + This will ensure that any symlinks that are used will still work. 
+ Copy stats from old file to new file, deletes orignal and + changes new file name to old file name + */ + +int maria_change_to_newfile(const char * filename, const char * old_ext, + const char * new_ext, myf MyFlags) +{ + char old_filename[FN_REFLEN],new_filename[FN_REFLEN]; +#ifdef USE_RAID + if (raid_chunks) + return my_raid_redel(fn_format(old_filename,filename,"",old_ext,2+4), + fn_format(new_filename,filename,"",new_ext,2+4), + raid_chunks, + MYF(MY_WME | MY_LINK_WARNING | MyFlags)); +#endif + /* Get real path to filename */ + (void) fn_format(old_filename,filename,"",old_ext,2+4+32); + return my_redel(old_filename, + fn_format(new_filename,old_filename,"",new_ext,2+4), + MYF(MY_WME | MY_LINK_WARNING | MyFlags)); +} /* maria_change_to_newfile */ + + +/* Copy a block between two files */ + +int maria_filecopy(HA_CHECK *param, File to,File from,my_off_t start, + my_off_t length, const char *type) +{ + char tmp_buff[IO_SIZE],*buff; + ulong buff_length; + DBUG_ENTER("maria_filecopy"); + + buff_length=(ulong) min(param->write_buffer_length,length); + if (!(buff=my_malloc(buff_length,MYF(0)))) + { + buff=tmp_buff; buff_length=IO_SIZE; + } + + VOID(my_seek(from,start,MY_SEEK_SET,MYF(0))); + while (length > buff_length) + { + if (my_read(from,(uchar*) buff,buff_length,MYF(MY_NABP)) || + my_write(to,(uchar*) buff,buff_length,param->myf_rw)) + goto err; + length-= buff_length; + } + if (my_read(from,(uchar*) buff,(uint) length,MYF(MY_NABP)) || + my_write(to,(uchar*) buff,(uint) length,param->myf_rw)) + goto err; + if (buff != tmp_buff) + my_free(buff,MYF(0)); + DBUG_RETURN(0); +err: + if (buff != tmp_buff) + my_free(buff,MYF(0)); + _ma_check_print_error(param,"Can't copy %s to tempfile, error %d", + type,my_errno); + DBUG_RETURN(1); +} + + +/* + Repair table or given index using sorting + + SYNOPSIS + maria_repair_by_sort() + param Repair parameters + info MARIA handler to repair + name Name of table (for warnings) + rep_quick set to <> 0 if we should not change data 
file + + RESULT + 0 ok + <>0 Error +*/ + +int maria_repair_by_sort(HA_CHECK *param, register MARIA_HA *info, + const char * name, int rep_quick) +{ + int got_error; + uint i; + ulong length; + ha_rows start_records; + my_off_t new_header_length, org_header_length, del; + File new_file; + MARIA_SORT_PARAM sort_param; + MARIA_SHARE *share=info->s; + HA_KEYSEG *keyseg; + ulong *rec_per_key_part; + char llbuff[22]; + MARIA_SORT_INFO sort_info; + ulonglong key_map=share->state.key_map; + myf sync_dir= ((share->now_transactional && !share->temporary) ? + MY_SYNC_DIR : 0); + DBUG_ENTER("maria_repair_by_sort"); + + bzero((char*)&sort_info,sizeof(sort_info)); + bzero((char *)&sort_param, sizeof(sort_param)); + + start_records=info->state->records; + got_error=1; + new_file= -1; + org_header_length= share->pack.header_length; + new_header_length= (param->testflag & T_UNPACK) ? 0 : org_header_length; + + if (!(param->testflag & T_SILENT)) + { + printf("- recovering (with sort) MARIA-table '%s'\n",name); + printf("Data records: %s\n", llstr(start_records,llbuff)); + } + param->testflag|=T_REP; /* for easy checking */ + + if (info->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)) + param->testflag|=T_CALC_CHECKSUM; + + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA, FLUSH_FORCE_WRITE, + FLUSH_KEEP)) + goto err; + + if (!(sort_info.key_block= + alloc_key_blocks(param, + (uint) param->sort_key_blocks, + share->base.max_key_block_length)) || + init_io_cache(¶m->read_cache, info->dfile.file, + (uint) param->read_buffer_length, + READ_CACHE, org_header_length, 1, MYF(MY_WME)) || + (! 
rep_quick && + init_io_cache(&info->rec_cache, info->dfile.file, + (uint) param->write_buffer_length, + WRITE_CACHE,new_header_length,1, + MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw))) + goto err; + sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks; + info->opt_flag|=WRITE_CACHE_USED; + info->rec_cache.file= info->dfile.file; /* for sort_delete_record */ + sort_info.org_data_file_type= info->s->data_file_type; + + if (!(sort_param.record=(uchar*) my_malloc((uint) share->base.pack_reclength, + MYF(0))) || + _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size, + info->s->base.default_rec_buff_size)) + { + _ma_check_print_error(param, "Not enough memory for extra record"); + goto err; + } + if (!rep_quick) + { + /* Get real path for data file */ + if ((new_file=my_create(fn_format(param->temp_filename, + share->data_file_name, "", + DATA_TMP_EXT, 2+4), + 0,param->tmpfile_createflag, + MYF(0))) < 0) + { + _ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + goto err; + } + if (new_header_length && + maria_filecopy(param, new_file, info->dfile.file, 0L, + new_header_length, "datafile-header")) + goto err; + if (param->testflag & T_UNPACK) + restore_data_file_type(share); + share->state.dellink= HA_OFFSET_ERROR; + info->rec_cache.file=new_file; + } + + info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + if (!(param->testflag & T_CREATE_MISSING_KEYS)) + { + /* + Flush key cache for this file if we are calling this outside + maria_chk + */ + flush_pagecache_blocks(share->pagecache, &share->kfile, + FLUSH_IGNORE_CHANGED); + /* Clear the pointers to the given rows */ + for (i=0 ; i < share->base.keys ; i++) + share->state.key_root[i]= HA_OFFSET_ERROR; + share->state.key_del= HA_OFFSET_ERROR; + info->state->key_file_length=share->base.keystart; + } + else + { + if (flush_pagecache_blocks(share->pagecache, &share->kfile, + FLUSH_FORCE_WRITE)) + goto err; + key_map= ~key_map; /* Create the 
missing keys */ + } + + sort_info.info= sort_info.new_info= info; + sort_info.param= param; + + set_data_file_type(&sort_info, share); + sort_param.filepos=new_header_length; + sort_info.dupp=0; + sort_info.buff=0; + param->read_cache.end_of_file=sort_info.filelength= + my_seek(param->read_cache.file,0L,MY_SEEK_END,MYF(0)); + + sort_param.wordlist=NULL; + init_alloc_root(&sort_param.wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0); + + if (sort_info.org_data_file_type == DYNAMIC_RECORD) + length=max(share->base.min_pack_length+1,share->base.min_block_length); + else if (sort_info.org_data_file_type == COMPRESSED_RECORD) + length=share->base.min_block_length; + else + length=share->base.pack_reclength; + sort_info.max_records= + ((param->testflag & T_CREATE_MISSING_KEYS) ? info->state->records : + (ha_rows) (sort_info.filelength/length+1)); + sort_param.key_cmp=sort_key_cmp; + sort_param.lock_in_memory=maria_lock_memory; + sort_param.tmpdir=param->tmpdir; + sort_param.sort_info=&sort_info; + sort_param.fix_datafile= (my_bool) (! rep_quick); + sort_param.master =1; + + del=info->state->del; + param->glob_crc=0; + if (param->testflag & T_CALC_CHECKSUM) + sort_param.calc_checksum= 1; + + rec_per_key_part= param->rec_per_key_part; + for (sort_param.key=0 ; sort_param.key < share->base.keys ; + rec_per_key_part+=sort_param.keyinfo->keysegs, sort_param.key++) + { + sort_param.read_cache=param->read_cache; + sort_param.keyinfo=share->keyinfo+sort_param.key; + sort_param.seg=sort_param.keyinfo->seg; + if (! 
maria_is_key_active(key_map, sort_param.key)) + { + /* Remember old statistics for key */ + memcpy((char*) rec_per_key_part, + (char*) (share->state.rec_per_key_part + + (uint) (rec_per_key_part - param->rec_per_key_part)), + sort_param.keyinfo->keysegs*sizeof(*rec_per_key_part)); + continue; + } + + if ((!(param->testflag & T_SILENT))) + printf ("- Fixing index %d\n",sort_param.key+1); + sort_param.max_pos= sort_param.pos= org_header_length; + keyseg=sort_param.seg; + bzero((char*) sort_param.unique,sizeof(sort_param.unique)); + sort_param.key_length=share->rec_reflength; + for (i=0 ; keyseg[i].type != HA_KEYTYPE_END; i++) + { + sort_param.key_length+=keyseg[i].length; + if (keyseg[i].flag & HA_SPACE_PACK) + sort_param.key_length+=get_pack_length(keyseg[i].length); + if (keyseg[i].flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART)) + sort_param.key_length+=2 + test(keyseg[i].length >= 127); + if (keyseg[i].flag & HA_NULL_PART) + sort_param.key_length++; + } + info->state->records=info->state->del=share->state.split=0; + info->state->empty=0; + + if (sort_param.keyinfo->flag & HA_FULLTEXT) + { + uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT* + sort_param.keyinfo->seg->charset->mbmaxlen; + sort_param.key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN; + /* + fulltext indexes may have much more entries than the + number of rows in the table. We estimate the number here. + + Note, built-in parser is always nr. 0 - see ftparser_call_initializer() + */ + if (sort_param.keyinfo->ftparser_nr == 0) + { + /* + for built-in parser the number of generated index entries + cannot be larger than the size of the data file divided + by the minimal word's length + */ + sort_info.max_records= + (ha_rows) (sort_info.filelength/ft_min_word_len+1); + } + else + { + /* + for external plugin parser we cannot tell anything at all :( + so, we'll use all the sort memory and start from ~10 buffpeks. 
+ (see _create_index_by_sort) + */ + sort_info.max_records= + 10*param->sort_buffer_length/sort_param.key_length; + } + + sort_param.key_read= sort_maria_ft_key_read; + sort_param.key_write= sort_maria_ft_key_write; + } + else + { + sort_param.key_read= sort_key_read; + sort_param.key_write= sort_key_write; + } + + if (_ma_create_index_by_sort(&sort_param, + (my_bool) (!(param->testflag & T_VERBOSE)), + (uint) param->sort_buffer_length)) + { + param->retry_repair=1; + goto err; + } + /* No need to calculate checksum again. */ + sort_param.calc_checksum= 0; + free_root(&sort_param.wordroot, MYF(0)); + + /* Set for next loop */ + sort_info.max_records= (ha_rows) info->state->records; + + if (param->testflag & T_STATISTICS) + maria_update_key_parts(sort_param.keyinfo, rec_per_key_part, sort_param.unique, + param->stats_method == MI_STATS_METHOD_IGNORE_NULLS? + sort_param.notnull: NULL,(ulonglong) info->state->records); + maria_set_key_active(share->state.key_map, sort_param.key); + + if (sort_param.fix_datafile) + { + param->read_cache.end_of_file=sort_param.filepos; + if (maria_write_data_suffix(&sort_info,1) || end_io_cache(&info->rec_cache)) + goto err; + if (param->testflag & T_SAFE_REPAIR) + { + /* Don't repair if we loosed more than one row */ + if (info->state->records+1 < start_records) + { + info->state->records=start_records; + goto err; + } + } + share->state.state.data_file_length = info->state->data_file_length= + sort_param.filepos; + /* Only whole records */ + share->state.version=(ulong) time((time_t*) 0); + my_close(info->dfile.file, MYF(0)); + info->dfile.file= new_file; + share->data_file_type= sort_info.new_data_file_type; + org_header_length= (ulong) new_header_length; + sort_info.org_data_file_type= info->s->data_file_type; + sort_param.fix_datafile=0; + } + else + info->state->data_file_length=sort_param.max_pos; + + param->read_cache.file= info->dfile.file; /* re-init read cache */ + 
reinit_io_cache(¶m->read_cache,READ_CACHE,share->pack.header_length, + 1,1); + } + + if (param->testflag & T_WRITE_LOOP) + { + VOID(fputs(" \r",stdout)); VOID(fflush(stdout)); + } + + if (rep_quick && del+sort_info.dupp != info->state->del) + { + _ma_check_print_error(param,"Couldn't fix table with quick recovery: Found wrong number of deleted records"); + _ma_check_print_error(param,"Run recovery again without -q"); + got_error=1; + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + goto err; + } + + if (rep_quick & T_FORCE_UNIQUENESS) + { + my_off_t skr= (info->state->data_file_length + + (sort_info.org_data_file_type == COMPRESSED_RECORD) ? + MEMMAP_EXTRA_MARGIN : 0); +#ifdef USE_RELOC + if (sort_info.org_data_file_type == STATIC_RECORD && + skr < share->base.reloc*share->base.min_pack_length) + skr=share->base.reloc*share->base.min_pack_length; +#endif + if (skr != sort_info.filelength) + if (my_chsize(info->dfile.file, skr, 0, MYF(0))) + _ma_check_print_warning(param, + "Can't change size of datafile, error: %d", + my_errno); + } + if (param->testflag & T_CALC_CHECKSUM) + info->state->checksum=param->glob_crc; + + if (my_chsize(share->kfile.file, info->state->key_file_length, 0, MYF(0))) + _ma_check_print_warning(param, + "Can't change size of indexfile, error: %d", + my_errno); + + if (!(param->testflag & T_SILENT)) + { + if (start_records != info->state->records) + printf("Data records: %s\n", llstr(info->state->records,llbuff)); + if (sort_info.dupp) + _ma_check_print_warning(param, + "%s records have been removed", + llstr(sort_info.dupp,llbuff)); + } + got_error=0; + + if (&share->state.state != info->state) + memcpy( &share->state.state, info->state, sizeof(*info->state)); + +err: + VOID(end_io_cache(&info->rec_cache)); + VOID(end_io_cache(¶m->read_cache)); + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + got_error|= _ma_flush_table_files_after_repair(param, info); + if (!got_error) + { + /* Replace the actual file with the 
temporary file */ + if (new_file >= 0) + { + my_close(new_file,MYF(0)); + info->dfile.file= new_file= -1; + if (maria_change_to_newfile(share->data_file_name,MARIA_NAME_DEXT, + DATA_TMP_EXT, + MYF((param->testflag & T_BACKUP_DATA ? + MY_REDEL_MAKE_BACKUP : 0) | + sync_dir)) || + _ma_open_datafile(info,share,-1)) + got_error=1; + } + } + if (got_error) + { + if (! param->error_printed) + _ma_check_print_error(param,"%d when fixing table",my_errno); + if (new_file >= 0) + { + VOID(my_close(new_file,MYF(0))); + VOID(my_delete(param->temp_filename, MYF(MY_WME))); + if (info->dfile.file == new_file) + info->dfile.file= -1; + } + maria_mark_crashed_on_repair(info); + } + else if (key_map == share->state.key_map) + share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS; + share->state.changed|= STATE_NOT_SORTED_PAGES; + share->state.changed&= ~STATE_NOT_OPTIMIZED_ROWS; + + my_free(sort_param.rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR)); + my_free((uchar*) sort_info.key_block,MYF(MY_ALLOW_ZERO_PTR)); + my_free((uchar*) sort_info.ft_buf, MYF(MY_ALLOW_ZERO_PTR)); + my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR)); + if (!got_error && (param->testflag & T_UNPACK)) + restore_data_file_type(share); + DBUG_RETURN(got_error); +} + +/* + Threaded repair of table using sorting + + SYNOPSIS + maria_repair_parallel() + param Repair parameters + info MARIA handler to repair + name Name of table (for warnings) + rep_quick set to <> 0 if we should not change data file + + DESCRIPTION + Same as maria_repair_by_sort but do it multithreaded + Each key is handled by a separate thread. + TODO: make a number of threads a parameter + + In parallel repair we use one thread per index. There are two modes: + + Quick + + Only the indexes are rebuilt. All threads share a read buffer. + Every thread that needs fresh data in the buffer enters the shared + cache lock. The last thread joining the lock reads the buffer from + the data file and wakes all other threads.
+ + Non-quick + + The data file is rebuilt and all indexes are rebuilt to point to + the new record positions. One thread is the master thread. It + reads from the old data file and writes to the new data file. It + also creates one of the indexes. The other threads read from a + buffer which is filled by the master. If they need fresh data, + they enter the shared cache lock. If the master's write buffer is + full, it flushes it to the new data file and enters the shared + cache lock too. When all threads joined in the lock, the master + copies its write buffer to the read buffer for the other threads + and wakes them. + + RESULT + 0 ok + <>0 Error +*/ + +int maria_repair_parallel(HA_CHECK *param, register MARIA_HA *info, + const char * name, int rep_quick) +{ +#ifndef THREAD + return maria_repair_by_sort(param, info, name, rep_quick); +#else + int got_error; + uint i,key, total_key_length, istep; + ulong rec_length; + ha_rows start_records; + my_off_t new_header_length,del; + File new_file; + MARIA_SORT_PARAM *sort_param=0; + MARIA_SHARE *share=info->s; + ulong *rec_per_key_part; + HA_KEYSEG *keyseg; + char llbuff[22]; + IO_CACHE new_data_cache; /* For non-quick repair. */ + IO_CACHE_SHARE io_share; + MARIA_SORT_INFO sort_info; + ulonglong key_map=share->state.key_map; + pthread_attr_t thr_attr; + myf sync_dir= (share->now_transactional && !share->temporary) ? + MY_SYNC_DIR : 0; + DBUG_ENTER("maria_repair_parallel"); + + /* + Zero sort_info and new_data_cache and create the mutex/condition + before the first 'goto err': the err: path frees sort_info buffers, + may end new_data_cache and destroys the mutex and the condition, so + none of them may hold stack garbage when we get there. + */ + bzero((char*)&sort_info,sizeof(sort_info)); + bzero((char*)&new_data_cache,sizeof(new_data_cache)); + pthread_mutex_init(&sort_info.mutex, MY_MUTEX_INIT_FAST); + pthread_cond_init(&sort_info.cond, 0); + + start_records=info->state->records; + got_error=1; + new_file= -1; + new_header_length=(param->testflag & T_UNPACK) ?
0 : + share->pack.header_length; + if (!(param->testflag & T_SILENT)) + { + printf("- parallel recovering (with sort) MARIA-table '%s'\n",name); + printf("Data records: %s\n", llstr(start_records,llbuff)); + } + param->testflag|=T_REP; /* for easy checking */ + + if (info->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)) + param->testflag|=T_CALC_CHECKSUM; + + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA, FLUSH_FORCE_WRITE, + FLUSH_KEEP)) + goto err; + + /* + Quick repair (not touching data file, rebuilding indexes): + { + Read cache is (HA_CHECK *param)->read_cache using info->dfile.file. + } + + Non-quick repair (rebuilding data file and indexes): + { + Master thread: + + Read cache is (HA_CHECK *param)->read_cache using info->dfile.file. + Write cache is (MARIA_HA *info)->rec_cache using new_file. + + Slave threads: + + Read cache is new_data_cache synced to master rec_cache. + + The final assignment of the filedescriptor for rec_cache is done + after the cache creation. + + Don't check file size on new_data_cache, as the resulting file size + is not known yet. + + As rec_cache and new_data_cache are synced, write_buffer_length is + used for the read cache 'new_data_cache'. Both start at the same + position 'new_header_length'. + } + */ + DBUG_PRINT("info", ("is quick repair: %d", rep_quick));
+ + sort_info.org_data_file_type= info->s->data_file_type; + + if (!(sort_info.key_block= + alloc_key_blocks(param, (uint) param->sort_key_blocks, + share->base.max_key_block_length)) || + init_io_cache(&param->read_cache, info->dfile.file, + (uint) param->read_buffer_length, + READ_CACHE, share->pack.header_length, 1, MYF(MY_WME)) || + (!rep_quick && + (init_io_cache(&info->rec_cache, info->dfile.file, + (uint) param->write_buffer_length, + WRITE_CACHE, new_header_length, 1, + MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw) || + init_io_cache(&new_data_cache, -1, + (uint) param->write_buffer_length, + READ_CACHE, new_header_length, 1, + MYF(MY_WME | MY_DONT_CHECK_FILESIZE))))) + goto err; + sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks; + info->opt_flag|=WRITE_CACHE_USED; + info->rec_cache.file= info->dfile.file; /* for sort_delete_record */ + + if (!rep_quick) + { + /* Get real path for data file */ + if ((new_file= my_create(fn_format(param->temp_filename, + share->data_file_name, "", + DATA_TMP_EXT, + 2+4), + 0,param->tmpfile_createflag, + MYF(0))) < 0) + { + _ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + goto err; + } + if (new_header_length && + maria_filecopy(param, new_file, info->dfile.file,0L,new_header_length, + "datafile-header")) + goto err; + if (param->testflag & T_UNPACK) + restore_data_file_type(share); + share->state.dellink= HA_OFFSET_ERROR; + info->rec_cache.file=new_file; + } + + info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + if (!(param->testflag & T_CREATE_MISSING_KEYS)) + { + /* + Flush key cache for this file if we are calling this outside + maria_chk + */ + flush_pagecache_blocks(share->pagecache, &share->kfile, + FLUSH_IGNORE_CHANGED); + /* Clear the pointers to the given rows */ + for (i=0 ; i < share->base.keys ; i++) + share->state.key_root[i]=
HA_OFFSET_ERROR; + share->state.key_del= HA_OFFSET_ERROR; + info->state->key_file_length=share->base.keystart; + } + else + { + if (flush_pagecache_blocks(share->pagecache, &share->kfile, + FLUSH_FORCE_WRITE)) + goto err; + key_map= ~key_map; /* Create the missing keys */ + } + + sort_info.info= sort_info.new_info= info; + sort_info.param= param; + + set_data_file_type(&sort_info, share); + sort_info.dupp=0; + sort_info.buff=0; + param->read_cache.end_of_file=sort_info.filelength= + my_seek(param->read_cache.file,0L,MY_SEEK_END,MYF(0)); + + if (sort_info.org_data_file_type == DYNAMIC_RECORD) + rec_length=max(share->base.min_pack_length+1,share->base.min_block_length); + else if (sort_info.org_data_file_type == COMPRESSED_RECORD) + rec_length=share->base.min_block_length; + else + rec_length=share->base.pack_reclength; + /* + +1 below is required hack for parallel repair mode. + The info->state->records value, that is compared later + to sort_info.max_records and cannot exceed it, is + increased in sort_key_write. In maria_repair_by_sort, sort_key_write + is called after sort_key_read, where the comparison is performed, + but in parallel mode master thread can call sort_key_write + before some other repair thread calls sort_key_read. + Furthermore I'm not even sure +1 would be enough. + Maybe sort_info.max_records should always be set to max value in + parallel mode. + */ + sort_info.max_records= + ((param->testflag & T_CREATE_MISSING_KEYS) ?
info->state->records + 1: + (ha_rows) (sort_info.filelength/rec_length+1)); + + del=info->state->del; + param->glob_crc=0; + + if (!(sort_param=(MARIA_SORT_PARAM *) + my_malloc((uint) share->base.keys * + (sizeof(MARIA_SORT_PARAM) + share->base.pack_reclength), + MYF(MY_ZEROFILL)))) + { + _ma_check_print_error(param,"Not enough memory for key!"); + goto err; + } + total_key_length=0; + rec_per_key_part= param->rec_per_key_part; + info->state->records=info->state->del=share->state.split=0; + info->state->empty=0; + + for (i=key=0, istep=1 ; key < share->base.keys ; + rec_per_key_part+=sort_param[i].keyinfo->keysegs, i+=istep, key++) + { + sort_param[i].key=key; + sort_param[i].keyinfo=share->keyinfo+key; + sort_param[i].seg=sort_param[i].keyinfo->seg; + if (! maria_is_key_active(key_map, key)) + { + /* Remember old statistics for key */ + memcpy((char*) rec_per_key_part, + (char*) (share->state.rec_per_key_part+ + (uint) (rec_per_key_part - param->rec_per_key_part)), + sort_param[i].keyinfo->keysegs*sizeof(*rec_per_key_part)); + istep=0; + continue; + } + istep=1; + if ((!(param->testflag & T_SILENT))) + printf ("- Fixing index %d\n",key+1); + if (sort_param[i].keyinfo->flag & HA_FULLTEXT) + { + sort_param[i].key_read=sort_maria_ft_key_read; + sort_param[i].key_write=sort_maria_ft_key_write; + } + else + { + sort_param[i].key_read=sort_key_read; + sort_param[i].key_write=sort_key_write; + } + sort_param[i].key_cmp=sort_key_cmp; + sort_param[i].lock_in_memory=maria_lock_memory; + sort_param[i].tmpdir=param->tmpdir; + sort_param[i].sort_info=&sort_info; + sort_param[i].master=0; + sort_param[i].fix_datafile=0; + sort_param[i].calc_checksum= 0; + + sort_param[i].filepos=new_header_length; + sort_param[i].max_pos=sort_param[i].pos=share->pack.header_length; + + sort_param[i].record= (((char *)(sort_param+share->base.keys))+ + (share->base.pack_reclength * i)); + if (_ma_alloc_buffer(&sort_param[i].rec_buff, &sort_param[i].rec_buff_size, +
share->base.default_rec_buff_size)) + { + _ma_check_print_error(param,"Not enough memory!"); + goto err; + } + sort_param[i].key_length=share->rec_reflength; + for (keyseg=sort_param[i].seg; keyseg->type != HA_KEYTYPE_END; + keyseg++) + { + sort_param[i].key_length+=keyseg->length; + if (keyseg->flag & HA_SPACE_PACK) + sort_param[i].key_length+=get_pack_length(keyseg->length); + if (keyseg->flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART)) + sort_param[i].key_length+=2 + test(keyseg->length >= 127); + if (keyseg->flag & HA_NULL_PART) + sort_param[i].key_length++; + } + total_key_length+=sort_param[i].key_length; + + if (sort_param[i].keyinfo->flag & HA_FULLTEXT) + { + uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT* + sort_param[i].keyinfo->seg->charset->mbmaxlen; + sort_param[i].key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN; + init_alloc_root(&sort_param[i].wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0); + } + } + sort_info.total_keys=i; + sort_param[0].master= 1; + sort_param[0].fix_datafile= (my_bool)(! rep_quick); + sort_param[0].calc_checksum= test(param->testflag & T_CALC_CHECKSUM); + + sort_info.got_error=0; + pthread_mutex_lock(&sort_info.mutex); + + /* + Initialize the I/O cache share for use with the read caches and, in + case of non-quick repair, the write cache. When all threads join on + the cache lock, the writer copies the write cache contents to the + read caches. + */ + if (i > 1) + { + if (rep_quick) + init_io_cache_share(&param->read_cache, &io_share, NULL, i); + else + init_io_cache_share(&new_data_cache, &io_share, &info->rec_cache, i); + } + else + io_share.total_threads= 0; /* share not used */ + + (void) pthread_attr_init(&thr_attr); + (void) pthread_attr_setdetachstate(&thr_attr,PTHREAD_CREATE_DETACHED); + + for (i=0 ; i < sort_info.total_keys ; i++) + { + /* + Copy the properly initialized IO_CACHE structure so that every + thread has its own copy. In quick mode param->read_cache is shared + for use by all threads.
In non-quick mode all threads but the + first copy the shared new_data_cache, which is synchronized to the + write cache of the first thread. The first thread copies + param->read_cache, which is not shared. + */ + sort_param[i].read_cache= ((rep_quick || !i) ? param->read_cache : + new_data_cache); + DBUG_PRINT("io_cache_share", ("thread: %u read_cache: 0x%lx", + i, (long) &sort_param[i].read_cache)); + + /* + two approaches: the same amount of memory for each thread + or the memory for the same number of keys for each thread... + In the second one all the threads will fill their sort_buffers + (and call write_keys) at the same time, putting more stress on i/o. + */ + sort_param[i].sortbuff_size= +#ifndef USING_SECOND_APPROACH + param->sort_buffer_length/sort_info.total_keys; +#else + param->sort_buffer_length*sort_param[i].key_length/total_key_length; +#endif + if (pthread_create(&sort_param[i].thr, &thr_attr, + _ma_thr_find_all_keys, + (void *) (sort_param+i))) + { + _ma_check_print_error(param,"Cannot start a repair thread"); + /* Cleanup: Detach from the share. Avoid others to be blocked. */ + if (io_share.total_threads) + remove_io_thread(&sort_param[i].read_cache); + DBUG_PRINT("error", ("Cannot start a repair thread")); + sort_info.got_error=1; + } + else + sort_info.threads_running++; + } + (void) pthread_attr_destroy(&thr_attr); + + /* waiting for all threads to finish */ + while (sort_info.threads_running) + pthread_cond_wait(&sort_info.cond, &sort_info.mutex); + pthread_mutex_unlock(&sort_info.mutex); + + if ((got_error= _ma_thr_write_keys(sort_param))) + { + param->retry_repair=1; + goto err; + } + got_error=1; /* Assume the following may go wrong */ + + if (sort_param[0].fix_datafile) + { + /* + Append some nuls to the end of a memory mapped file. Destroy the + write cache. The master thread did already detach from the share + by remove_io_thread() in sort.c:thr_find_all_keys().
+ */ + if (maria_write_data_suffix(&sort_info,1) || end_io_cache(&info->rec_cache)) + goto err; + if (param->testflag & T_SAFE_REPAIR) + { + /* Don't repair if we lost more than one row */ + if (info->state->records+1 < start_records) + { + info->state->records=start_records; + goto err; + } + } + share->state.state.data_file_length= info->state->data_file_length= + sort_param->filepos; + /* Only whole records */ + share->state.version=(ulong) time((time_t*) 0); + /* + Exchange the data file descriptor of the table, so that we use the + new file from now on. + */ + my_close(info->dfile.file, MYF(0)); + info->dfile.file= new_file; + share->pack.header_length=(ulong) new_header_length; + } + else + info->state->data_file_length=sort_param->max_pos; + + if (rep_quick && del+sort_info.dupp != info->state->del) + { + _ma_check_print_error(param,"Couldn't fix table with quick recovery: Found wrong number of deleted records"); + _ma_check_print_error(param,"Run recovery again without -q"); + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + goto err; + } + + if (rep_quick & T_FORCE_UNIQUENESS) + { + /* + The conditional must be parenthesized: '+' binds tighter than '?:', + so without the inner parentheses skr would be just + MEMMAP_EXTRA_MARGIN or 0 and the data file would be resized to that. + */ + my_off_t skr= (info->state->data_file_length + + ((sort_info.org_data_file_type == COMPRESSED_RECORD) ?
+ MEMMAP_EXTRA_MARGIN : 0)); +#ifdef USE_RELOC + if (sort_info.org_data_file_type == STATIC_RECORD && + skr < share->base.reloc*share->base.min_pack_length) + skr=share->base.reloc*share->base.min_pack_length; +#endif + if (skr != sort_info.filelength) + if (my_chsize(info->dfile.file, skr, 0, MYF(0))) + _ma_check_print_warning(param, + "Can't change size of datafile, error: %d", + my_errno); + } + if (param->testflag & T_CALC_CHECKSUM) + info->state->checksum=param->glob_crc; + + if (my_chsize(share->kfile.file, info->state->key_file_length, 0, MYF(0))) + _ma_check_print_warning(param, + "Can't change size of indexfile, error: %d", my_errno); + + if (!(param->testflag & T_SILENT)) + { + if (start_records != info->state->records) + printf("Data records: %s\n", llstr(info->state->records,llbuff)); + if (sort_info.dupp) + _ma_check_print_warning(param, + "%s records have been removed", + llstr(sort_info.dupp,llbuff)); + } + got_error=0; + + if (&share->state.state != info->state) + memcpy(&share->state.state, info->state, sizeof(*info->state)); + +err: + /* + Destroy the write cache. The master thread did already detach from + the share by remove_io_thread() or it was not yet started (if the + error happened before creating the thread). + */ + VOID(end_io_cache(&info->rec_cache)); + VOID(end_io_cache(&param->read_cache)); + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + /* + Destroy the new data cache in case of non-quick repair. All slave + threads did either detach from the share by remove_io_thread() + already or they were not yet started (if the error happened before + creating the threads).
+ */ + if (!rep_quick) + VOID(end_io_cache(&new_data_cache)); + got_error|= _ma_flush_table_files_after_repair(param, info); + if (!got_error) + { + /* Replace the actual file with the temporary file */ + if (new_file >= 0) + { + my_close(new_file,MYF(0)); + info->dfile.file= new_file= -1; + if (maria_change_to_newfile(share->data_file_name,MARIA_NAME_DEXT, + DATA_TMP_EXT, + MYF((param->testflag & T_BACKUP_DATA ? + MY_REDEL_MAKE_BACKUP : 0) | + sync_dir)) || + _ma_open_datafile(info,share,-1)) + got_error=1; + } + } + if (got_error) + { + if (! param->error_printed) + _ma_check_print_error(param,"%d when fixing table",my_errno); + if (new_file >= 0) + { + VOID(my_close(new_file,MYF(0))); + VOID(my_delete(param->temp_filename, MYF(MY_WME))); + if (info->dfile.file == new_file) + info->dfile.file= -1; + } + maria_mark_crashed_on_repair(info); + } + else if (key_map == share->state.key_map) + share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS; + share->state.changed|= STATE_NOT_SORTED_PAGES; + share->state.changed&= ~STATE_NOT_OPTIMIZED_ROWS; + + pthread_cond_destroy (&sort_info.cond); + pthread_mutex_destroy(&sort_info.mutex); + + my_free((uchar*) sort_info.ft_buf, MYF(MY_ALLOW_ZERO_PTR)); + my_free((uchar*) sort_info.key_block,MYF(MY_ALLOW_ZERO_PTR)); + my_free((uchar*) sort_param,MYF(MY_ALLOW_ZERO_PTR)); + my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR)); + if (!got_error && (param->testflag & T_UNPACK)) + restore_data_file_type(share); + DBUG_RETURN(got_error); +#endif /* THREAD */ +} + + /* Read next record and return next key */ + +static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key) +{ + int error; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + MARIA_HA *info= sort_info->info; + DBUG_ENTER("sort_key_read"); + + if ((error=sort_get_next_record(sort_param))) + DBUG_RETURN(error); + if (info->state->records == sort_info->max_records) + { + _ma_check_print_error(sort_info->param, + "Key %d - Found too many records; Can't continue", +
sort_param->key+1); + DBUG_RETURN(1); + } + sort_param->real_key_length= + (info->s->rec_reflength+ + _ma_make_key(info, sort_param->key, key, + sort_param->record, sort_param->filepos)); +#ifdef HAVE_purify + bzero(key+sort_param->real_key_length, + (sort_param->key_length-sort_param->real_key_length)); +#endif + DBUG_RETURN(_ma_sort_write_record(sort_param)); +} /* sort_key_read */ + +/* + Read the next full-text key for the sort. + + SYNOPSIS + sort_maria_ft_key_read() + sort_param Sort parameters; wordlist/wordptr keep the position in + the word list of the current record between calls + key Buffer that receives the key + + DESCRIPTION + When no word list is pending, records are fetched with + sort_get_next_record() and parsed with _ma_ft_parserecord() until + the returned list starts with an entry whose pos is set; a record + whose list starts with a zero pos is handed to + _ma_sort_write_record() and the next record is fetched. + One key is built per call from the current word with + _ma_ft_make_key(). When the following entry has a zero pos the word + root is released, wordlist is reset and the record is handed to + _ma_sort_write_record(); otherwise the position is saved in wordptr. + + NOTE(review): inside the fetch loop the result of + _ma_sort_write_record() is overwritten by the next iteration; + confirm that ignoring it is intended. + + RETURN + 0 ok + 1 _ma_ft_parserecord() returned no word list + other value returned by sort_get_next_record() or + _ma_sort_write_record() +*/ +static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key) +{ + int error; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + MARIA_HA *info=sort_info->info; + FT_WORD *wptr=0; + DBUG_ENTER("sort_maria_ft_key_read"); + + if (!sort_param->wordlist) + { + for (;;) + { + free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE)); + if ((error=sort_get_next_record(sort_param))) + DBUG_RETURN(error); + if (!(wptr= _ma_ft_parserecord(info,sort_param->key,sort_param->record, + &sort_param->wordroot))) + + DBUG_RETURN(1); + if (wptr->pos) + break; + error=_ma_sort_write_record(sort_param); + } + sort_param->wordptr=sort_param->wordlist=wptr; + } + else + { + error=0; + wptr=(FT_WORD*)(sort_param->wordptr); + } + + sort_param->real_key_length=(info->s->rec_reflength+ + _ma_ft_make_key(info, sort_param->key, + key, wptr++, + sort_param->filepos)); +#ifdef HAVE_purify + if (sort_param->key_length > sort_param->real_key_length) + bzero(key+sort_param->real_key_length, + (sort_param->key_length-sort_param->real_key_length)); +#endif + if (!wptr->pos) + { + free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE)); + sort_param->wordlist=0; + error=_ma_sort_write_record(sort_param); + } + else + sort_param->wordptr=(void*)wptr; + + DBUG_RETURN(error); +} /* sort_maria_ft_key_read */ + + +/* + Read next record from file using parameters in sort_info. + + SYNOPSIS + sort_get_next_record() + sort_param Information about and for the sort process + + NOTES + Dynamic Records With Non-Quick Parallel Repair + + For non-quick parallel repair we use a synchronized read/write + cache.
This means that one thread is the master who fixes the data + file by reading each record from the old data file and writing it + to the new data file. By doing this the records in the new data + file are written contiguously. Whenever the write buffer is full, + it is copied to the read buffer. The slaves read from the read + buffer, which is not associated with a file. Thus read_cache.file + is -1. When using _mi_read_cache(), the slaves must always set + flag to READING_NEXT so that the function never tries to read from + file. This is safe because the records are contiguous. There is no + need to read outside the cache. This condition is evaluated in the + variable 'parallel_flag' for quick reference. read_cache.file must + be >= 0 in every other case. + + RETURN + -1 end of file + 0 ok + sort_param->filepos points to record position. + sort_param->record contains record + > 0 error +*/ + +static int sort_get_next_record(MARIA_SORT_PARAM *sort_param) +{ + int searching; + int parallel_flag; + uint found_record,b_type,left_length; + my_off_t pos; + MARIA_BLOCK_INFO block_info; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; + MARIA_HA *info=sort_info->info; + MARIA_SHARE *share=info->s; + char llbuff[22],llbuff2[22]; + DBUG_ENTER("sort_get_next_record"); + + if (*_ma_killed_ptr(param)) + DBUG_RETURN(1); + + switch (sort_info->org_data_file_type) { + case BLOCK_RECORD: + { + for (;;) + { + int flag; + + if (info != sort_info->new_info) + { + /* Safe scanning */ + flag= _ma_safe_scan_block_record(sort_info, info, + sort_param->record); + } + else + { + /* + Scan on clean table. + It requires a reliable data_file_length so we set it. 
+ */ + info->state->data_file_length= sort_info->filelength; + flag= _ma_scan_block_record(info, sort_param->record, + info->cur_row.nextpos, 1); + } + if (!flag) + { + if (sort_param->calc_checksum) + { + ha_checksum checksum; + checksum= (*info->s->calc_check_checksum)(info, sort_param->record); + if (info->s->calc_checksum && + info->cur_row.checksum != (checksum & 255)) + { + if (param->testflag & T_VERBOSE) + { + char llbuff[22]; + record_pos_to_txt(info, info->cur_row.lastpos, llbuff); + _ma_check_print_info(param, + "Found record with wrong checksum at %s", + llbuff); + } + continue; + } + info->cur_row.checksum= checksum; + param->glob_crc+= checksum; + } + sort_param->start_recpos= sort_param->filepos= info->cur_row.lastpos; + DBUG_RETURN(0); + } + if (flag == HA_ERR_END_OF_FILE) + { + sort_param->max_pos= sort_info->filelength; + DBUG_RETURN(-1); + } + /* Retry only if wrong record, not if disk error */ + if (flag != HA_ERR_WRONG_IN_RECORD) + DBUG_RETURN(flag); + } + break; + } + case STATIC_RECORD: + for (;;) + { + if (my_b_read(&sort_param->read_cache,sort_param->record, + share->base.pack_reclength)) + { + if (sort_param->read_cache.error) + param->out_flag |= O_DATA_LOST; + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + DBUG_RETURN(-1); + } + sort_param->start_recpos=sort_param->pos; + if (!sort_param->fix_datafile) + { + sort_param->filepos=sort_param->pos; + if (sort_param->master) + share->state.split++; + } + sort_param->max_pos=(sort_param->pos+=share->base.pack_reclength); + if (*sort_param->record) + { + if (sort_param->calc_checksum) + param->glob_crc+= (info->cur_row.checksum= + _ma_static_checksum(info,sort_param->record)); + DBUG_RETURN(0); + } + if (!sort_param->fix_datafile && sort_param->master) + { + info->state->del++; + info->state->empty+=share->base.pack_reclength; + } + } + case DYNAMIC_RECORD: + { + uchar *to; + LINT_INIT(to); + ha_checksum checksum= 0; + + pos=sort_param->pos; + 
searching=(sort_param->fix_datafile && (param->testflag & T_EXTEND)); + parallel_flag= (sort_param->read_cache.file < 0) ? READING_NEXT : 0; + for (;;) + { + found_record=block_info.second_read= 0; + left_length=1; + if (searching) + { + pos=MY_ALIGN(pos,MARIA_DYN_ALIGN_SIZE); + param->testflag|=T_RETRY_WITHOUT_QUICK; + sort_param->start_recpos=pos; + } + do + { + if (pos > sort_param->max_pos) + sort_param->max_pos=pos; + if (pos & (MARIA_DYN_ALIGN_SIZE-1)) + { + if ((param->testflag & T_VERBOSE) || searching == 0) + _ma_check_print_info(param,"Wrong aligned block at %s", + llstr(pos,llbuff)); + if (searching) + goto try_next; + } + if (found_record && pos == param->search_after_block) + _ma_check_print_info(param,"Block: %s used by record at %s", + llstr(param->search_after_block,llbuff), + llstr(sort_param->start_recpos,llbuff2)); + if (_ma_read_cache(&sort_param->read_cache, + (uchar*) block_info.header,pos, + MARIA_BLOCK_INFO_HEADER_LENGTH, + (! found_record ? READING_NEXT : 0) | + parallel_flag | READING_HEADER)) + { + if (found_record) + { + _ma_check_print_info(param, + "Can't read whole record at %s (errno: %d)", + llstr(sort_param->start_recpos,llbuff),errno); + goto try_next; + } + DBUG_RETURN(-1); + } + if (searching && ! 
sort_param->fix_datafile) + { + param->error_printed=1; + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + DBUG_RETURN(1); /* Something wrong with data */ + } + b_type= _ma_get_block_info(&block_info,-1,pos); + if ((b_type & (BLOCK_ERROR | BLOCK_FATAL_ERROR)) || + ((b_type & BLOCK_FIRST) && + (block_info.rec_len < (uint) share->base.min_pack_length || + block_info.rec_len > (uint) share->base.max_pack_length))) + { + uint i; + if (param->testflag & T_VERBOSE || searching == 0) + _ma_check_print_info(param, + "Wrong bytesec: %3d-%3d-%3d at %10s; Skipped", + block_info.header[0],block_info.header[1], + block_info.header[2],llstr(pos,llbuff)); + if (found_record) + goto try_next; + block_info.second_read=0; + searching=1; + /* Search after block in read header string */ + for (i=MARIA_DYN_ALIGN_SIZE ; + i < MARIA_BLOCK_INFO_HEADER_LENGTH ; + i+= MARIA_DYN_ALIGN_SIZE) + if (block_info.header[i] >= 1 && + block_info.header[i] <= MARIA_MAX_DYN_HEADER_BYTE) + break; + pos+=(ulong) i; + sort_param->start_recpos=pos; + continue; + } + if (b_type & BLOCK_DELETED) + { + bool error=0; + if (block_info.block_len+ (uint) (block_info.filepos-pos) < + share->base.min_block_length) + { + if (!searching) + _ma_check_print_info(param, + "Deleted block with impossible length %u at %s", + block_info.block_len,llstr(pos,llbuff)); + error=1; + } + else + { + if ((block_info.next_filepos != HA_OFFSET_ERROR && + block_info.next_filepos >= + info->state->data_file_length) || + (block_info.prev_filepos != HA_OFFSET_ERROR && + block_info.prev_filepos >= info->state->data_file_length)) + { + if (!searching) + _ma_check_print_info(param, + "Delete link points outside datafile at %s", + llstr(pos,llbuff)); + error=1; + } + } + if (error) + { + if (found_record) + goto try_next; + searching=1; + pos+= MARIA_DYN_ALIGN_SIZE; + sort_param->start_recpos=pos; + block_info.second_read=0; + continue; + } + } + else + { + if (block_info.block_len+ (uint) (block_info.filepos-pos) < + 
share->base.min_block_length || + block_info.block_len > (uint) share->base.max_pack_length+ + MARIA_SPLIT_LENGTH) + { + if (!searching) + _ma_check_print_info(param, + "Found block with impossible length %u at %s; Skipped", + block_info.block_len+ (uint) (block_info.filepos-pos), + llstr(pos,llbuff)); + if (found_record) + goto try_next; + searching=1; + pos+= MARIA_DYN_ALIGN_SIZE; + sort_param->start_recpos=pos; + block_info.second_read=0; + continue; + } + } + if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR)) + { + if (!sort_param->fix_datafile && sort_param->master && + (b_type & BLOCK_DELETED)) + { + info->state->empty+=block_info.block_len; + info->state->del++; + share->state.split++; + } + if (found_record) + goto try_next; + if (searching) + { + pos+=MARIA_DYN_ALIGN_SIZE; + sort_param->start_recpos=pos; + } + else + pos=block_info.filepos+block_info.block_len; + block_info.second_read=0; + continue; + } + + if (!sort_param->fix_datafile && sort_param->master) + share->state.split++; + if (! 
found_record++) + { + sort_param->find_length=left_length=block_info.rec_len; + sort_param->start_recpos=pos; + if (!sort_param->fix_datafile) + sort_param->filepos=sort_param->start_recpos; + if (sort_param->fix_datafile && (param->testflag & T_EXTEND)) + sort_param->pos=block_info.filepos+1; + else + sort_param->pos=block_info.filepos+block_info.block_len; + if (share->base.blobs) + { + if (_ma_alloc_buffer(&sort_param->rec_buff, + &sort_param->rec_buff_size, + block_info.rec_len + + info->s->base.extra_rec_buff_size)) + + { + if (param->max_record_length >= block_info.rec_len) + { + _ma_check_print_error(param,"Not enough memory for blob at %s (need %lu)", + llstr(sort_param->start_recpos,llbuff), + (ulong) block_info.rec_len); + DBUG_RETURN(1); + } + else + { + _ma_check_print_info(param,"Not enough memory for blob at %s (need %lu); Row skipped", + llstr(sort_param->start_recpos,llbuff), + (ulong) block_info.rec_len); + goto try_next; + } + } + } + to= sort_param->rec_buff; + } + if (left_length < block_info.data_len || ! block_info.data_len) + { + _ma_check_print_info(param, + "Found block with too small length at %s; " + "Skipped", + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + if (block_info.filepos + block_info.data_len > + sort_param->read_cache.end_of_file) + { + _ma_check_print_info(param, + "Found block that points outside data file " + "at %s", + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + /* + Copy information that is already read. Avoid accessing data + below the cache start. This could happen if the header + streched over the end of the previous buffer contents. 
+ */ + { + uint header_len= (uint) (block_info.filepos - pos); + uint prefetch_len= (MARIA_BLOCK_INFO_HEADER_LENGTH - header_len); + + if (prefetch_len > block_info.data_len) + prefetch_len= block_info.data_len; + if (prefetch_len) + { + memcpy(to, block_info.header + header_len, prefetch_len); + block_info.filepos+= prefetch_len; + block_info.data_len-= prefetch_len; + left_length-= prefetch_len; + to+= prefetch_len; + } + } + if (block_info.data_len && + _ma_read_cache(&sort_param->read_cache,to,block_info.filepos, + block_info.data_len, + (found_record == 1 ? READING_NEXT : 0) | + parallel_flag)) + { + _ma_check_print_info(param, + "Read error for block at: %s (error: %d); Skipped", + llstr(block_info.filepos,llbuff),my_errno); + goto try_next; + } + left_length-=block_info.data_len; + to+=block_info.data_len; + pos=block_info.next_filepos; + if (pos == HA_OFFSET_ERROR && left_length) + { + _ma_check_print_info(param,"Wrong block with wrong total length starting at %s", + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + if (pos + MARIA_BLOCK_INFO_HEADER_LENGTH > sort_param->read_cache.end_of_file) + { + _ma_check_print_info(param,"Found link that points at %s (outside data file) at %s", + llstr(pos,llbuff2), + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + } while (left_length); + + if (_ma_rec_unpack(info,sort_param->record,sort_param->rec_buff, + sort_param->find_length) != MY_FILE_ERROR) + { + if (sort_param->read_cache.error < 0) + DBUG_RETURN(1); + if (sort_param->calc_checksum) + checksum= (info->s->calc_check_checksum)(info, sort_param->record); + if ((param->testflag & (T_EXTEND | T_REP)) || searching) + { + if (_ma_rec_check(info, sort_param->record, sort_param->rec_buff, + sort_param->find_length, + (param->testflag & T_QUICK) && + sort_param->calc_checksum && + test(info->s->calc_checksum), checksum)) + { + _ma_check_print_info(param,"Found wrong packed record at %s", + llstr(sort_param->start_recpos,llbuff)); + goto 
try_next; + } + } + if (sort_param->calc_checksum) + param->glob_crc+= checksum; + DBUG_RETURN(0); + } + if (!searching) + _ma_check_print_info(param,"Key %d - Found wrong stored record at %s", + sort_param->key+1, + llstr(sort_param->start_recpos,llbuff)); + try_next: + pos=(sort_param->start_recpos+=MARIA_DYN_ALIGN_SIZE); + searching=1; + } + } + case COMPRESSED_RECORD: + for (searching=0 ;; searching=1, sort_param->pos++) + { + if (_ma_read_cache(&sort_param->read_cache,(uchar*) block_info.header, + sort_param->pos, + share->pack.ref_length,READING_NEXT)) + DBUG_RETURN(-1); + if (searching && ! sort_param->fix_datafile) + { + param->error_printed=1; + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + DBUG_RETURN(1); /* Something wrong with data */ + } + sort_param->start_recpos=sort_param->pos; + if (_ma_pack_get_block_info(info, &sort_param->bit_buff, &block_info, + &sort_param->rec_buff, + &sort_param->rec_buff_size, -1, + sort_param->pos)) + DBUG_RETURN(-1); + if (!block_info.rec_len && + sort_param->pos + MEMMAP_EXTRA_MARGIN == + sort_param->read_cache.end_of_file) + DBUG_RETURN(-1); + if (block_info.rec_len < (uint) share->min_pack_length || + block_info.rec_len > (uint) share->max_pack_length) + { + if (! searching) + _ma_check_print_info(param,"Found block with wrong recordlength: %d at %s\n", + block_info.rec_len, + llstr(sort_param->pos,llbuff)); + continue; + } + if (_ma_read_cache(&sort_param->read_cache,(uchar*) sort_param->rec_buff, + block_info.filepos, block_info.rec_len, + READING_NEXT)) + { + if (! searching) + _ma_check_print_info(param,"Couldn't read whole record from %s", + llstr(sort_param->pos,llbuff)); + continue; + } + if (_ma_pack_rec_unpack(info, &sort_param->bit_buff, sort_param->record, + sort_param->rec_buff, block_info.rec_len)) + { + if (! 
searching) + _ma_check_print_info(param,"Found wrong record at %s", + llstr(sort_param->pos,llbuff)); + continue; + } + if (!sort_param->fix_datafile) + { + sort_param->filepos=sort_param->pos; + if (sort_param->master) + share->state.split++; + } + sort_param->max_pos=(sort_param->pos=block_info.filepos+ + block_info.rec_len); + info->packed_length=block_info.rec_len; + + if (sort_param->calc_checksum) + { + info->cur_row.checksum= (*info->s->calc_check_checksum)(info, + sort_param-> + record); + param->glob_crc+= info->cur_row.checksum; + } + DBUG_RETURN(0); + } + } + DBUG_RETURN(1); /* Impossible */ +} + + +/* + Write record to new file. + + SYNOPSIS + _ma_sort_write_record() + sort_param Sort parameters. + + NOTE + This is only called by a master thread if parallel repair is used. + + RETURN + 0 OK + 1 Error +*/ + +int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param) +{ + int flag; + uint length; + ulong block_length,reclength; + uchar *from; + uchar block_buff[8]; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param= sort_info->param; + MARIA_HA *info= sort_info->new_info; + MARIA_SHARE *share=info->s; + DBUG_ENTER("_ma_sort_write_record"); + + if (sort_param->fix_datafile) + { + switch (sort_info->new_data_file_type) { + case BLOCK_RECORD: + if ((sort_param->filepos= (*share->write_record_init)(info, + sort_param-> + record)) == + HA_OFFSET_ERROR) + DBUG_RETURN(1); + break; + case STATIC_RECORD: + if (my_b_write(&info->rec_cache,sort_param->record, + share->base.pack_reclength)) + { + _ma_check_print_error(param,"%d when writing to datafile",my_errno); + DBUG_RETURN(1); + } + sort_param->filepos+=share->base.pack_reclength; + info->s->state.split++; + break; + case DYNAMIC_RECORD: + if (! 
info->blobs) + from=sort_param->rec_buff; + else + { + /* must be sure that local buffer is big enough */ + reclength=info->s->base.pack_reclength+ + _ma_calc_total_blob_length(info,sort_param->record)+ + ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+ + MARIA_DYN_DELETE_BLOCK_HEADER; + if (sort_info->buff_length < reclength) + { + if (!(sort_info->buff=my_realloc(sort_info->buff, (uint) reclength, + MYF(MY_FREE_ON_ERROR | + MY_ALLOW_ZERO_PTR)))) + DBUG_RETURN(1); + sort_info->buff_length=reclength; + } + from=sort_info->buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER); + } + /* We can use info->checksum here as only one thread calls this */ + info->cur_row.checksum= (*info->s->calc_check_checksum)(info, + sort_param-> + record); + reclength= _ma_rec_pack(info,from,sort_param->record); + flag=0; + + do + { + block_length=reclength+ 3 + test(reclength >= (65520-3)); + if (block_length < share->base.min_block_length) + block_length=share->base.min_block_length; + info->update|=HA_STATE_WRITE_AT_END; + block_length=MY_ALIGN(block_length,MARIA_DYN_ALIGN_SIZE); + if (block_length > MARIA_MAX_BLOCK_LENGTH) + block_length=MARIA_MAX_BLOCK_LENGTH; + if (_ma_write_part_record(info,0L,block_length, + sort_param->filepos+block_length, + &from,&reclength,&flag)) + { + _ma_check_print_error(param,"%d when writing to datafile",my_errno); + DBUG_RETURN(1); + } + sort_param->filepos+=block_length; + info->s->state.split++; + } while (reclength); + break; + case COMPRESSED_RECORD: + reclength=info->packed_length; + length= _ma_save_pack_length((uint) share->pack.version, block_buff, + reclength); + if (info->s->base.blobs) + length+= _ma_save_pack_length((uint) share->pack.version, + block_buff + length, info->blob_length); + if (my_b_write(&info->rec_cache,block_buff,length) || + my_b_write(&info->rec_cache,(uchar*) sort_param->rec_buff,reclength)) + { + _ma_check_print_error(param,"%d when writing to datafile",my_errno); + DBUG_RETURN(1); + } + 
sort_param->filepos+=reclength+length; + info->s->state.split++; + break; + } + } + if (sort_param->master) + { + info->state->records++; + if ((param->testflag & T_WRITE_LOOP) && + (info->state->records % WRITE_COUNT) == 0) + { + char llbuff[22]; + printf("%s\r", llstr(info->state->records,llbuff)); + VOID(fflush(stdout)); + } + } + DBUG_RETURN(0); +} /* _ma_sort_write_record */ + + +/* Compare two keys from _ma_create_index_by_sort */ + +static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a, + const void *b) +{ + uint not_used[2]; + return (ha_key_cmp(sort_param->seg, *((uchar**) a), *((uchar**) b), + USE_WHOLE_KEY, SEARCH_SAME, not_used)); +} /* sort_key_cmp */ + + +static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a) +{ + uint diff_pos[2]; + char llbuff[22],llbuff2[22]; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param= sort_info->param; + int cmp; + + if (sort_info->key_block->inited) + { + cmp=ha_key_cmp(sort_param->seg, (uchar*) sort_info->key_block->lastkey, + (uchar*) a, USE_WHOLE_KEY,SEARCH_FIND | SEARCH_UPDATE, + diff_pos); + if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL) + ha_key_cmp(sort_param->seg, (uchar*) sort_info->key_block->lastkey, + (uchar*) a, USE_WHOLE_KEY, + SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diff_pos); + else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS) + { + diff_pos[0]= maria_collect_stats_nonulls_next(sort_param->seg, + sort_param->notnull, + sort_info->key_block->lastkey, + a); + } + sort_param->unique[diff_pos[0]-1]++; + } + else + { + cmp= -1; + if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS) + maria_collect_stats_nonulls_first(sort_param->seg, sort_param->notnull, + a); + } + if ((sort_param->keyinfo->flag & HA_NOSAME) && cmp == 0) + { + sort_info->dupp++; + sort_info->info->cur_row.lastpos= get_record_for_key(sort_info->info, + sort_param->keyinfo, + a); + _ma_check_print_warning(param, + "Duplicate key for record at %10s against record at 
%10s", + llstr(sort_info->info->cur_row.lastpos, llbuff), + llstr(get_record_for_key(sort_info->info, + sort_param->keyinfo, + sort_info->key_block-> + lastkey), + llbuff2)); + param->testflag|=T_RETRY_WITHOUT_QUICK; + if (sort_info->param->testflag & T_VERBOSE) + _ma_print_key(stdout,sort_param->seg, a, USE_WHOLE_KEY); + return (sort_delete_record(sort_param)); + } +#ifndef DBUG_OFF + if (cmp > 0) + { + _ma_check_print_error(param, + "Internal error: Keys are not in order from sort"); + return(1); + } +#endif + return (sort_insert_key(sort_param, sort_info->key_block, + a, HA_OFFSET_ERROR)); +} /* sort_key_write */ + + +int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param) +{ + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + SORT_KEY_BLOCKS *key_block=sort_info->key_block; + MARIA_SHARE *share=sort_info->info->s; + uint val_off, val_len; + int error; + SORT_FT_BUF *maria_ft_buf=sort_info->ft_buf; + uchar *from, *to; + + val_len=share->ft2_keyinfo.keylength; + get_key_full_length_rdonly(val_off, maria_ft_buf->lastkey); + to= maria_ft_buf->lastkey+val_off; + + if (maria_ft_buf->buf) + { + /* flushing first-level tree */ + error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey, + HA_OFFSET_ERROR); + for (from=to+val_len; + !error && from < maria_ft_buf->buf; + from+= val_len) + { + memcpy(to, from, val_len); + error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey, + HA_OFFSET_ERROR); + } + return error; + } + /* flushing second-level tree keyblocks */ + error=_ma_flush_pending_blocks(sort_param); + /* updating lastkey with second-level tree info */ + ft_intXstore(maria_ft_buf->lastkey+val_off, -maria_ft_buf->count); + _ma_dpointer(sort_info->info, maria_ft_buf->lastkey+val_off+HA_FT_WLEN, + share->state.key_root[sort_param->key]); + /* restoring first level tree data in sort_info/sort_param */ + sort_info->key_block=sort_info->key_block_end- sort_info->param->sort_key_blocks; + sort_param->keyinfo=share->keyinfo+sort_param->key; + 
share->state.key_root[sort_param->key]=HA_OFFSET_ERROR; + /* writing lastkey in first-level tree */ + return error ? error : + sort_insert_key(sort_param,sort_info->key_block, + maria_ft_buf->lastkey,HA_OFFSET_ERROR); +} + + +static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param, + const uchar *a) +{ + uint a_len, val_off, val_len, error; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + SORT_FT_BUF *ft_buf= sort_info->ft_buf; + SORT_KEY_BLOCKS *key_block= sort_info->key_block; + + val_len=HA_FT_WLEN+sort_info->info->s->base.rec_reflength; + get_key_full_length_rdonly(a_len, (uchar *)a); + + if (!ft_buf) + { + /* + use two-level tree only if key_reflength fits in rec_reflength place + and row format is NOT static - for _ma_dpointer not to garble offsets + */ + if ((sort_info->info->s->base.key_reflength <= + sort_info->info->s->base.rec_reflength) && + (sort_info->info->s->options & + (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD))) + ft_buf= (SORT_FT_BUF *)my_malloc(sort_param->keyinfo->block_length + + sizeof(SORT_FT_BUF), MYF(MY_WME)); + + if (!ft_buf) + { + sort_param->key_write=sort_key_write; + return sort_key_write(sort_param, a); + } + sort_info->ft_buf= ft_buf; + goto word_init_ft_buf; /* no need to duplicate the code */ + } + get_key_full_length_rdonly(val_off, ft_buf->lastkey); + + if (ha_compare_text(sort_param->seg->charset, + ((uchar *)a)+1,a_len-1, + (uchar*) ft_buf->lastkey+1,val_off-1, 0, 0)==0) + { + uchar *p; + if (!ft_buf->buf) /* store in second-level tree */ + { + ft_buf->count++; + return sort_insert_key(sort_param,key_block, + a + a_len, HA_OFFSET_ERROR); + } + + /* storing the key in the buffer. 
*/ + memcpy (ft_buf->buf, (char *)a+a_len, val_len); + ft_buf->buf+=val_len; + if (ft_buf->buf < ft_buf->end) + return 0; + + /* converting to two-level tree */ + p=ft_buf->lastkey+val_off; + + while (key_block->inited) + key_block++; + sort_info->key_block=key_block; + sort_param->keyinfo=& sort_info->info->s->ft2_keyinfo; + ft_buf->count=(ft_buf->buf - p)/val_len; + + /* flushing buffer to second-level tree */ + for (error=0; !error && p < ft_buf->buf; p+= val_len) + error=sort_insert_key(sort_param,key_block,p,HA_OFFSET_ERROR); + ft_buf->buf=0; + return error; + } + + /* flushing buffer */ + if ((error=_ma_sort_ft_buf_flush(sort_param))) + return error; + +word_init_ft_buf: + a_len+=val_len; + memcpy(ft_buf->lastkey, a, a_len); + ft_buf->buf=ft_buf->lastkey+a_len; + /* + 32 is just a safety margin here + (at least max(val_len, sizeof(nod_flag)) should be there). + May be better performance could be achieved if we'd put + (sort_info->keyinfo->block_length-32)/XXX + instead. + TODO: benchmark the best value for XXX. 
+ */ + ft_buf->end= ft_buf->lastkey+ (sort_param->keyinfo->block_length-32); + return 0; +} /* sort_maria_ft_key_write */ + + + /* get pointer to record from a key */ + +static my_off_t get_record_for_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + const uchar *key) +{ + return _ma_dpos(info,0, key + _ma_keylength(keyinfo, key)); +} /* get_record_for_key */ + + + /* Insert a key in sort-key-blocks */ + +static int sort_insert_key(MARIA_SORT_PARAM *sort_param, + register SORT_KEY_BLOCKS *key_block, + const uchar *key, + my_off_t prev_block) +{ + uint a_length,t_length,nod_flag; + my_off_t filepos,key_file_length; + uchar *anc_buff,*lastkey; + MARIA_KEY_PARAM s_temp; + MARIA_HA *info; + MARIA_KEYDEF *keyinfo=sort_param->keyinfo; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + HA_CHECK *param=sort_info->param; + DBUG_ENTER("sort_insert_key"); + + anc_buff= key_block->buff; + info=sort_info->info; + lastkey=key_block->lastkey; + nod_flag= (key_block == sort_info->key_block ? 0 : + info->s->base.key_reflength); + + if (!key_block->inited) + { + key_block->inited=1; + if (key_block == sort_info->key_block_end) + { + _ma_check_print_error(param,"To many key-block-levels; Try increasing sort_key_blocks"); + DBUG_RETURN(1); + } + a_length=2+nod_flag; + key_block->end_pos=anc_buff+2; + lastkey=0; /* No previous key in block */ + } + else + a_length= maria_data_on_page(anc_buff); + + /* Save pointer to previous block */ + if (nod_flag) + _ma_kpointer(info,key_block->end_pos,prev_block); + + t_length=(*keyinfo->pack_key)(keyinfo,nod_flag, + (uchar*) 0,lastkey,lastkey,key, + &s_temp); + (*keyinfo->store_key)(keyinfo, key_block->end_pos+nod_flag,&s_temp); + a_length+=t_length; + maria_putint(anc_buff,a_length,nod_flag); + key_block->end_pos+=t_length; + if (a_length <= keyinfo->block_length) + { + VOID(_ma_move_key(keyinfo, key_block->lastkey, key)); + key_block->last_length=a_length-t_length; + DBUG_RETURN(0); + } + + /* Fill block with end-zero and write filled block */ + 
maria_putint(anc_buff,key_block->last_length,nod_flag); + bzero(anc_buff+key_block->last_length, + keyinfo->block_length- key_block->last_length); + key_file_length=info->state->key_file_length; + if ((filepos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR) + DBUG_RETURN(1); + + /* If we read the page from the key cache, we have to write it back to it */ + if (key_file_length == info->state->key_file_length) + { + if (_ma_write_keypage(info, keyinfo, filepos, DFLT_INIT_HITS, anc_buff)) + DBUG_RETURN(1); + } + else if (my_pwrite(info->s->kfile.file, anc_buff, + (uint) keyinfo->block_length,filepos, param->myf_rw)) + DBUG_RETURN(1); + DBUG_DUMP("buff",anc_buff,maria_data_on_page(anc_buff)); + + /* Write separator-key to block in next level */ + if (sort_insert_key(sort_param,key_block+1,key_block->lastkey,filepos)) + DBUG_RETURN(1); + + /* clear old block and write new key in it */ + key_block->inited=0; + DBUG_RETURN(sort_insert_key(sort_param, key_block,key,prev_block)); +} /* sort_insert_key */ + + + /* Delete record when we found a duplicated key */ + +static int sort_delete_record(MARIA_SORT_PARAM *sort_param) +{ + uint i; + int old_file,error; + uchar *key; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; + MARIA_HA *info=sort_info->info; + DBUG_ENTER("sort_delete_record"); + + if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK) + { + _ma_check_print_error(param, + "Quick-recover aborted; Run recovery without switch -q or with switch -qq"); + DBUG_RETURN(1); + } + if (info->s->options & HA_OPTION_COMPRESS_RECORD) + { + _ma_check_print_error(param, + "Recover aborted; Can't run standard recovery on compressed tables with errors in data-file. 
Use switch 'maria_chk --safe-recover' to fix it\n",stderr);; + DBUG_RETURN(1); + } + + old_file= info->dfile.file; + info->dfile.file= info->rec_cache.file; + if (sort_info->current_key) + { + key= info->lastkey+info->s->base.max_key_length; + if ((error=(*info->s->read_record)(info,sort_param->record, + info->cur_row.lastpos)) && + error != HA_ERR_RECORD_DELETED) + { + _ma_check_print_error(param,"Can't read record to be removed"); + info->dfile.file= old_file; + DBUG_RETURN(1); + } + + for (i=0 ; i < sort_info->current_key ; i++) + { + uint key_length= _ma_make_key(info, i, key, sort_param->record, + info->cur_row.lastpos); + if (_ma_ck_delete(info, i, key, key_length)) + { + _ma_check_print_error(param, + "Can't delete key %d from record to be removed", + i+1); + info->dfile.file= old_file; + DBUG_RETURN(1); + } + } + if (sort_param->calc_checksum) + param->glob_crc-=(*info->s->calc_check_checksum)(info, + sort_param->record); + } + error= (flush_io_cache(&info->rec_cache) || + (*info->s->delete_record)(info, sort_param->record)); + info->dfile.file= old_file; /* restore actual value */ + info->state->records--; + DBUG_RETURN(error); +} /* sort_delete_record */ + + +/* Fix all pending blocks and flush everything to disk */ + +int _ma_flush_pending_blocks(MARIA_SORT_PARAM *sort_param) +{ + uint nod_flag,length; + my_off_t filepos,key_file_length; + SORT_KEY_BLOCKS *key_block; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + myf myf_rw=sort_info->param->myf_rw; + MARIA_HA *info=sort_info->info; + MARIA_KEYDEF *keyinfo=sort_param->keyinfo; + DBUG_ENTER("_ma_flush_pending_blocks"); + + filepos= HA_OFFSET_ERROR; /* if empty file */ + nod_flag=0; + for (key_block=sort_info->key_block ; key_block->inited ; key_block++) + { + key_block->inited=0; + length= maria_data_on_page(key_block->buff); + if (nod_flag) + _ma_kpointer(info,key_block->end_pos,filepos); + key_file_length=info->state->key_file_length; + bzero(key_block->buff+length, 
keyinfo->block_length-length); + if ((filepos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR) + DBUG_RETURN(1); + + /* If we read the page from the key cache, we have to write it back */ + if (key_file_length == info->state->key_file_length) + { + if (_ma_write_keypage(info, keyinfo, filepos, + DFLT_INIT_HITS, key_block->buff)) + DBUG_RETURN(1); + } + else if (my_pwrite(info->s->kfile.file, key_block->buff, + (uint) keyinfo->block_length,filepos, myf_rw)) + DBUG_RETURN(1); + DBUG_DUMP("buff",key_block->buff,length); + nod_flag=1; + } + info->s->state.key_root[sort_param->key]=filepos; /* Last is root for tree */ + DBUG_RETURN(0); +} /* _ma_flush_pending_blocks */ + + /* alloc space and pointers for key_blocks */ + +static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks, + uint buffer_length) +{ + reg1 uint i; + SORT_KEY_BLOCKS *block; + DBUG_ENTER("alloc_key_blocks"); + + if (!(block= (SORT_KEY_BLOCKS*) my_malloc((sizeof(SORT_KEY_BLOCKS)+ + buffer_length+IO_SIZE)*blocks, + MYF(0)))) + { + _ma_check_print_error(param,"Not enough memory for sort-key-blocks"); + return(0); + } + for (i=0 ; i < blocks ; i++) + { + block[i].inited=0; + block[i].buff= (uchar*) (block+blocks)+(buffer_length+IO_SIZE)*i; + } + DBUG_RETURN(block); +} /* alloc_key_blocks */ + + + /* Check if file is almost full */ + +int maria_test_if_almost_full(MARIA_HA *info) +{ + if (info->s->options & HA_OPTION_COMPRESS_RECORD) + return 0; + return my_seek(info->s->kfile.file, 0L, MY_SEEK_END, + MYF(MY_THREADSAFE))/10*9 > + (my_off_t) info->s->base.max_key_file_length || + my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)) / 10 * 9 > + (my_off_t) info->s->base.max_data_file_length; +} + + /* Recreate table with bigger more alloced record-data */ + +int maria_recreate_table(HA_CHECK *param, MARIA_HA **org_info, char *filename) +{ + int error; + MARIA_HA info; + MARIA_SHARE share; + MARIA_KEYDEF *keyinfo,*key,*key_end; + HA_KEYSEG *keysegs,*keyseg; + MARIA_COLUMNDEF 
*columndef,*column,*end; + MARIA_UNIQUEDEF *uniquedef,*u_ptr,*u_end; + MARIA_STATUS_INFO status_info; + uint unpack,key_parts; + ha_rows max_records; + ulonglong file_length,tmp_length; + MARIA_CREATE_INFO create_info; + DBUG_ENTER("maria_recreate_table"); + + error=1; /* Default error */ + info= **org_info; + status_info= (*org_info)->state[0]; + info.state= &status_info; + share= *(*org_info)->s; + unpack= (share.options & HA_OPTION_COMPRESS_RECORD) && + (param->testflag & T_UNPACK); + if (!(keyinfo=(MARIA_KEYDEF*) my_alloca(sizeof(MARIA_KEYDEF) * + share.base.keys))) + DBUG_RETURN(0); + memcpy((uchar*) keyinfo,(uchar*) share.keyinfo, + (size_t) (sizeof(MARIA_KEYDEF)*share.base.keys)); + + key_parts= share.base.all_key_parts; + if (!(keysegs=(HA_KEYSEG*) my_alloca(sizeof(HA_KEYSEG)* + (key_parts+share.base.keys)))) + { + my_afree((uchar*) keyinfo); + DBUG_RETURN(1); + } + if (!(columndef=(MARIA_COLUMNDEF*) + my_alloca(sizeof(MARIA_COLUMNDEF)*(share.base.fields+1)))) + { + my_afree((uchar*) keyinfo); + my_afree((uchar*) keysegs); + DBUG_RETURN(1); + } + if (!(uniquedef=(MARIA_UNIQUEDEF*) + my_alloca(sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques+1)))) + { + my_afree((uchar*) columndef); + my_afree((uchar*) keyinfo); + my_afree((uchar*) keysegs); + DBUG_RETURN(1); + } + + /* Copy the column definitions */ + memcpy((uchar*) columndef,(uchar*) share.columndef, + (size_t) (sizeof(MARIA_COLUMNDEF)*(share.base.fields+1))); + for (column=columndef, end= columndef+share.base.fields; + column != end ; + column++) + { + if (unpack && !(share.options & HA_OPTION_PACK_RECORD) && + column->type != FIELD_BLOB && + column->type != FIELD_VARCHAR && + column->type != FIELD_CHECK) + column->type=(int) FIELD_NORMAL; + } + + /* Change the new key to point at the saved key segments */ + memcpy((uchar*) keysegs,(uchar*) share.keyparts, + (size_t) (sizeof(HA_KEYSEG)*(key_parts+share.base.keys+ + share.state.header.uniques))); + keyseg=keysegs; + for 
(key=keyinfo,key_end=keyinfo+share.base.keys; key != key_end ; key++) + { + key->seg=keyseg; + for (; keyseg->type ; keyseg++) + { + if (param->language) + keyseg->language=param->language; /* change language */ + } + keyseg++; /* Skip end pointer */ + } + + /* + Copy the unique definitions and change them to point at the new key + segments + */ + memcpy((uchar*) uniquedef,(uchar*) share.uniqueinfo, + (size_t) (sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques))); + for (u_ptr=uniquedef,u_end=uniquedef+share.state.header.uniques; + u_ptr != u_end ; u_ptr++) + { + u_ptr->seg=keyseg; + keyseg+=u_ptr->keysegs+1; + } + if (share.options & HA_OPTION_COMPRESS_RECORD) + share.base.records=max_records=info.state->records; + else if (share.base.min_pack_length) + max_records=(ha_rows) (my_seek(info.dfile.file, 0L, MY_SEEK_END, + MYF(0)) / + (ulong) share.base.min_pack_length); + else + max_records=0; + unpack= (share.data_file_type == COMPRESSED_RECORD) && + (param->testflag & T_UNPACK); + share.options&= ~HA_OPTION_TEMP_COMPRESS_RECORD; + + file_length=(ulonglong) my_seek(info.dfile.file, 0L, MY_SEEK_END, MYF(0)); + tmp_length= file_length+file_length/10; + set_if_bigger(file_length,param->max_data_file_length); + set_if_bigger(file_length,tmp_length); + set_if_bigger(file_length,(ulonglong) share.base.max_data_file_length); + + VOID(maria_close(*org_info)); + bzero((char*) &create_info,sizeof(create_info)); + create_info.max_rows=max(max_records,share.base.records); + create_info.reloc_rows=share.base.reloc; + create_info.old_options=(share.options | + (unpack ? HA_OPTION_TEMP_COMPRESS_RECORD : 0)); + + create_info.data_file_length=file_length; + create_info.auto_increment=share.state.auto_increment; + create_info.language = (param->language ? 
param->language : + share.state.header.language); + create_info.key_file_length= status_info.key_file_length; + create_info.org_data_file_type= ((enum data_file_type) + share.state.header.org_data_file_type); + + /* + Allow for creating an auto_increment key. This has an effect only if + an auto_increment key exists in the original table. + */ + create_info.with_auto_increment= TRUE; + create_info.null_bytes= share.base.null_bytes; + /* + We don't have to handle symlinks here because we are using + HA_DONT_TOUCH_DATA + */ + if (maria_create(filename, share.data_file_type, + share.base.keys - share.state.header.uniques, + keyinfo, share.base.fields, columndef, + share.state.header.uniques, uniquedef, + &create_info, + HA_DONT_TOUCH_DATA)) + { + _ma_check_print_error(param, + "Got error %d when trying to recreate indexfile", + my_errno); + goto end; + } + *org_info=maria_open(filename,O_RDWR, + (param->testflag & T_WAIT_FOREVER) ? HA_OPEN_WAIT_IF_LOCKED : + (param->testflag & T_DESCRIPT) ? 
HA_OPEN_IGNORE_IF_LOCKED : + HA_OPEN_ABORT_IF_LOCKED); + if (!*org_info) + { + _ma_check_print_error(param, + "Got error %d when trying to open re-created indexfile", + my_errno); + goto end; + } + /* We are modifing */ + (*org_info)->s->options&= ~HA_OPTION_READ_ONLY_DATA; + VOID(_ma_readinfo(*org_info,F_WRLCK,0)); + (*org_info)->state->records=info.state->records; + if (share.state.create_time) + (*org_info)->s->state.create_time=share.state.create_time; + (*org_info)->s->state.unique=(*org_info)->this_unique= + share.state.unique; + (*org_info)->state->checksum=info.state->checksum; + (*org_info)->state->del=info.state->del; + (*org_info)->s->state.dellink=share.state.dellink; + (*org_info)->state->empty=info.state->empty; + (*org_info)->state->data_file_length=info.state->data_file_length; + if (maria_update_state_info(param,*org_info,UPDATE_TIME | UPDATE_STAT | + UPDATE_OPEN_COUNT)) + goto end; + error=0; +end: + my_afree((uchar*) uniquedef); + my_afree((uchar*) keyinfo); + my_afree((uchar*) columndef); + my_afree((uchar*) keysegs); + DBUG_RETURN(error); +} + + + /* write suffix to data file if neaded */ + +int maria_write_data_suffix(MARIA_SORT_INFO *sort_info, my_bool fix_datafile) +{ + MARIA_HA *info=sort_info->new_info; + + if (info->s->data_file_type == COMPRESSED_RECORD && fix_datafile) + { + char buff[MEMMAP_EXTRA_MARGIN]; + bzero(buff,sizeof(buff)); + if (my_b_write(&info->rec_cache,buff,sizeof(buff))) + { + _ma_check_print_error(sort_info->param, + "%d when writing to datafile",my_errno); + return 1; + } + sort_info->param->read_cache.end_of_file+=sizeof(buff); + } + return 0; +} + + +/* Update state and maria_chk time of indexfile */ + +int maria_update_state_info(HA_CHECK *param, MARIA_HA *info,uint update) +{ + MARIA_SHARE *share=info->s; + + if (update & UPDATE_OPEN_COUNT) + { + share->state.open_count=0; + share->global_changed=0; + } + if (update & UPDATE_STAT) + { + uint i, key_parts= mi_uint2korr(share->state.header.key_parts); + 
share->state.rec_per_key_rows=info->state->records; + share->state.changed&= ~STATE_NOT_ANALYZED; + if (info->state->records) + { + for (i=0; i<key_parts; i++) + { + if (!(share->state.rec_per_key_part[i]=param->rec_per_key_part[i])) + share->state.changed|= STATE_NOT_ANALYZED; + } + } + } + if (update & (UPDATE_STAT | UPDATE_SORT | UPDATE_TIME | UPDATE_AUTO_INC)) + { + if (update & UPDATE_TIME) + { + share->state.check_time= (long) time((time_t*) 0); + if (!share->state.create_time) + share->state.create_time=share->state.check_time; + } + /* + When tables are locked we haven't synched the share state and the + real state for a while so we better do it here before synching + the share state to disk. Only when table is write locked is it + necessary to perform this synch. + */ + if (info->lock_type == F_WRLCK) + share->state.state= *info->state; + if (_ma_state_info_write(share, 1|2)) + goto err; + share->changed=0; + } + { /* Force update of status */ + int error; + uint r_locks=share->r_locks,w_locks=share->w_locks; + share->r_locks= share->w_locks= share->tot_locks= 0; + error= _ma_writeinfo(info,WRITEINFO_NO_UNLOCK); + share->r_locks=r_locks; + share->w_locks=w_locks; + share->tot_locks=r_locks+w_locks; + if (!error) + return 0; + } +err: + _ma_check_print_error(param,"%d when updating keyfile",my_errno); + return 1; +} + + /* + Update auto increment value for a table + When setting the 'repair_only' flag we only want to change the + old auto_increment value if its wrong (smaller than some given key). + The reason is that we shouldn't change the auto_increment value + for a table without good reason when only doing a repair; If the + user have inserted and deleted rows, the auto_increment value + may be bigger than the biggest current row and this is ok. + + If repair_only is not set, we will update the flag to the value in + param->auto_increment is bigger than the biggest key. 
+ */ + +void _ma_update_auto_increment_key(HA_CHECK *param, MARIA_HA *info, + my_bool repair_only) +{ + uchar *record; + DBUG_ENTER("update_auto_increment_key"); + + if (!info->s->base.auto_key || + ! maria_is_key_active(info->s->state.key_map, info->s->base.auto_key - 1)) + { + if (!(param->testflag & T_VERY_SILENT)) + _ma_check_print_info(param, + "Table: %s doesn't have an auto increment key\n", + param->isam_file_name); + DBUG_VOID_RETURN; + } + if (!(param->testflag & T_SILENT) && + !(param->testflag & T_REP)) + printf("Updating MARIA file: %s\n", param->isam_file_name); + /* + We have to use an allocated buffer instead of info->rec_buff as + _ma_put_key_in_record() may use info->rec_buff + */ + if (!(record= (uchar*) my_malloc((uint) info->s->base.pack_reclength, + MYF(0)))) + { + _ma_check_print_error(param,"Not enough memory for extra record"); + DBUG_VOID_RETURN; + } + + maria_extra(info,HA_EXTRA_KEYREAD,0); + if (maria_rlast(info, record, info->s->base.auto_key-1)) + { + if (my_errno != HA_ERR_END_OF_FILE) + { + maria_extra(info,HA_EXTRA_NO_KEYREAD,0); + my_free((char*) record, MYF(0)); + _ma_check_print_error(param,"%d when reading last record",my_errno); + DBUG_VOID_RETURN; + } + if (!repair_only) + info->s->state.auto_increment=param->auto_increment_value; + } + else + { + ulonglong auto_increment= ma_retrieve_auto_increment(info, record); + set_if_bigger(info->s->state.auto_increment,auto_increment); + if (!repair_only) + set_if_bigger(info->s->state.auto_increment, param->auto_increment_value); + } + maria_extra(info,HA_EXTRA_NO_KEYREAD,0); + my_free((char*) record, MYF(0)); + maria_update_state_info(param, info, UPDATE_AUTO_INC); + DBUG_VOID_RETURN; +} + + +/* + Update statistics for each part of an index + + SYNOPSIS + maria_update_key_parts() + keyinfo IN Index information (only key->keysegs used) + rec_per_key_part OUT Store statistics here + unique IN Array of (#distinct tuples) + notnull_tuples IN Array of (#tuples), or NULL + records Number of 
records in the table + + DESCRIPTION + This function is called produce index statistics values from unique and + notnull_tuples arrays after these arrays were produced with sequential + index scan (the scan is done in two places: chk_index() and + sort_key_write()). + + This function handles all 3 index statistics collection methods. + + Unique is an array: + unique[0]= (#different values of {keypart1}) - 1 + unique[1]= (#different values of {keypart1,keypart2} tuple)-unique[0]-1 + ... + + For MI_STATS_METHOD_IGNORE_NULLS method, notnull_tuples is an array too: + notnull_tuples[0]= (#of {keypart1} tuples such that keypart1 is not NULL) + notnull_tuples[1]= (#of {keypart1,keypart2} tuples such that all + keypart{i} are not NULL) + ... + For all other statistics collection methods notnull_tuples==NULL. + + Output is an array: + rec_per_key_part[k] = + = E(#records in the table such that keypart_1=c_1 AND ... AND + keypart_k=c_k for arbitrary constants c_1 ... c_k) + + = {assuming that values have uniform distribution and index contains all + tuples from the domain (or that {c_1, ..., c_k} tuple is choosen from + index tuples} + + = #tuples-in-the-index / #distinct-tuples-in-the-index. + + The #tuples-in-the-index and #distinct-tuples-in-the-index have different + meaning depending on which statistics collection method is used: + + MI_STATS_METHOD_* how are nulls compared? which tuples are counted? 
+ NULLS_EQUAL NULL == NULL all tuples in table + NULLS_NOT_EQUAL NULL != NULL all tuples in table + IGNORE_NULLS n/a tuples that don't have NULLs +*/ + +void maria_update_key_parts(MARIA_KEYDEF *keyinfo, ulong *rec_per_key_part, + ulonglong *unique, ulonglong *notnull, + ulonglong records) +{ + ulonglong count=0,tmp, unique_tuples; + ulonglong tuples= records; + uint parts; + for (parts=0 ; parts < keyinfo->keysegs ; parts++) + { + count+=unique[parts]; + unique_tuples= count + 1; + if (notnull) + { + tuples= notnull[parts]; + /* + #(unique_tuples not counting tuples with NULLs) = + #(unique_tuples counting tuples with NULLs as different) - + #(tuples with NULLs) + */ + unique_tuples -= (records - notnull[parts]); + } + + if (unique_tuples == 0) + tmp= 1; + else if (count == 0) + tmp= tuples; /* 1 unique tuple */ + else + tmp= (tuples + unique_tuples/2) / unique_tuples; + + /* + for some weird keys (e.g. FULLTEXT) tmp can be <1 here. + let's ensure it is not + */ + set_if_bigger(tmp,1); + if (tmp >= (ulonglong) ~(ulong) 0) + tmp=(ulonglong) ~(ulong) 0; + + *rec_per_key_part=(ulong) tmp; + rec_per_key_part++; + } +} + + +static ha_checksum maria_byte_checksum(const uchar *buf, uint length) +{ + ha_checksum crc; + const uchar *end=buf+length; + for (crc=0; buf != end; buf++) + crc=((crc << 1) + *((uchar*) buf)) + + test(crc & (((ha_checksum) 1) << (8*sizeof(ha_checksum)-1))); + return crc; +} + +static my_bool maria_too_big_key_for_sort(MARIA_KEYDEF *key, ha_rows rows) +{ + uint key_maxlength=key->maxlength; + if (key->flag & HA_FULLTEXT) + { + uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT* + key->seg->charset->mbmaxlen; + key_maxlength+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN; + } + return (key->flag & HA_SPATIAL) || + (key->flag & (HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY | HA_FULLTEXT) && + ((ulonglong) rows * key_maxlength > + (ulonglong) maria_max_temp_length)); +} + +/* + Deactivate all not unique index that can be recreated fast + These include packed 
keys on which sorting will use more temporary + space than the max allowed file length or for which the unpacked keys + will take much more space than packed keys. + Note that 'rows' may be zero for the case when we don't know how many + rows we will put into the file. + */ + +void maria_disable_non_unique_index(MARIA_HA *info, ha_rows rows) +{ + MARIA_SHARE *share=info->s; + MARIA_KEYDEF *key=share->keyinfo; + uint i; + + DBUG_ASSERT(info->state->records == 0 && + (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES)); + for (i=0 ; i < share->base.keys ; i++,key++) + { + if (!(key->flag & (HA_NOSAME | HA_SPATIAL | HA_AUTO_KEY)) && + ! maria_too_big_key_for_sort(key,rows) && info->s->base.auto_key != i+1) + { + maria_clear_key_active(share->state.key_map, i); + info->update|= HA_STATE_CHANGED; + } + } +} + + +/* + Return TRUE if we can use repair by sorting + One can set the force argument to force to use sorting + even if the temporary file would be quite big! +*/ + +my_bool maria_test_if_sort_rep(MARIA_HA *info, ha_rows rows, + ulonglong key_map, my_bool force) +{ + MARIA_SHARE *share=info->s; + MARIA_KEYDEF *key=share->keyinfo; + uint i; + + /* + maria_repair_by_sort only works if we have at least one key. If we don't + have any keys, we should use the normal repair. + */ + if (! 
maria_is_any_key_active(key_map)) + return FALSE; /* Can't use sort */ + /* QQ: Remove this when maria_repair_by_sort() works with block format */ + if (info->s->data_file_type == BLOCK_RECORD) + return FALSE; + for (i=0 ; i < share->base.keys ; i++,key++) + { + if (!force && maria_too_big_key_for_sort(key,rows)) + return FALSE; + } + return TRUE; +} + + +static void +set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share) +{ + if ((sort_info->new_data_file_type=share->data_file_type) == + COMPRESSED_RECORD && sort_info->param->testflag & T_UNPACK) + { + MARIA_SHARE tmp; + sort_info->new_data_file_type= share->state.header.org_data_file_type; + /* Set delete_function for sort_delete_record() */ + tmp= *share; + tmp.state.header.data_file_type= tmp.state.header.org_data_file_type; + tmp.options= ~HA_OPTION_COMPRESS_RECORD; + _ma_setup_functions(&tmp); + share->delete_record=tmp.delete_record; + } +} + +static void restore_data_file_type(MARIA_SHARE *share) +{ + share->options&= ~HA_OPTION_COMPRESS_RECORD; + mi_int2store(share->state.header.options,share->options); + share->state.header.data_file_type= + share->state.header.org_data_file_type; + share->data_file_type= share->state.header.data_file_type; + share->pack.header_length= 0; +} + + +static void change_data_file_descriptor(MARIA_HA *info, File new_file) +{ + my_close(info->dfile.file, MYF(MY_WME)); + info->dfile.file= info->s->bitmap.file.file= new_file; +} + + +/* + Copy all states that has to do with the data file + + NOTES + This is done to copy the state from the data file generated from + repair to the original handler +*/ + +static void copy_data_file_state(MARIA_STATE_INFO *to, + MARIA_STATE_INFO *from) +{ + to->state.records= from->state.records; + to->state.del= from->state.del; + to->state.empty= from->state.empty; + to->state.data_file_length= from->state.data_file_length; + to->split= from->split; + to->dellink= from->dellink; + to->first_bitmap_with_space= 
from->first_bitmap_with_space; +} + + +/* + Read 'safely' next record while scanning table. + + SYNOPSIS + _ma_safe_scan_block_record() + info Maria handler + record Store found here + + NOTES + - One must have called mi_scan() before this + + Differences compared to _ma_scan_block_records() are: + - We read all blocks, not only blocks marked by the bitmap to be safe + - In case of errors, next read will read next record. + - More sanity checks + + RETURN + 0 ok + HA_ERR_END_OF_FILE End of file + # error number +*/ + + +static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info, + MARIA_HA *info, uchar *record) +{ + uint record_pos= info->cur_row.nextpos; + ulonglong page= sort_info->page; + DBUG_ENTER("_ma_safe_scan_block_record"); + + for (;;) + { + /* Find next row in current page */ + if (likely(record_pos < info->scan.number_of_rows)) + { + uint length, offset; + uchar *data, *end_of_data; + char llbuff[22]; + + while (!(offset= uint2korr(info->scan.dir))) + { + info->scan.dir-= DIR_ENTRY_SIZE; + record_pos++; + if (info->scan.dir < info->scan.dir_end) + { + _ma_check_print_info(sort_info->param, + "Wrong directory on page: %s", + llstr(page, llbuff)); + goto read_next_page; + } + } + /* found row */ + info->cur_row.lastpos= info->scan.row_base_page + record_pos; + info->cur_row.nextpos= record_pos + 1; + data= info->scan.page_buff + offset; + length= uint2korr(info->scan.dir + 2); + end_of_data= data + length; + info->scan.dir-= DIR_ENTRY_SIZE; /* Point to previous row */ + + if (end_of_data > info->scan.dir_end || + offset < PAGE_HEADER_SIZE || length < info->s->base.min_block_length) + { + _ma_check_print_info(sort_info->param, + "Wrong directory entry %3u at page %s", + record_pos, llstr(page, llbuff)); + record_pos++; + continue; + } + else + { + DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos)); + DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data)); + } + } + +read_next_page: + /* Read until we find next head page 
*/ + for (;;) + { + uint page_type; + char llbuff[22]; + + sort_info->page++; /* In case of errors */ + page++; + if (!(page % info->s->bitmap.pages_covered)) + page++; /* Skip bitmap */ + if ((page + 1) * info->s->block_size > sort_info->filelength) + DBUG_RETURN(HA_ERR_END_OF_FILE); + if (!(pagecache_read(info->s->pagecache, + &info->dfile, + page, 0, info->scan.page_buff, + PAGECACHE_READ_UNKNOWN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + DBUG_RETURN(my_errno); + + page_type= (info->scan.page_buff[PAGE_TYPE_OFFSET] & + PAGE_TYPE_MASK); + if (page_type == HEAD_PAGE) + { + if ((info->scan.number_of_rows= + (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) != 0) + break; + _ma_check_print_info(sort_info->param, + "Wrong head page at %s", + llstr(page * info->s->block_size, llbuff)); + } + else if (page_type >= MAX_PAGE_TYPE) + { + _ma_check_print_info(sort_info->param, + "Found wrong page type: %d at %s", + page_type, llstr(page * info->s->block_size, + llbuff)); + } + } + + /* New head page */ + info->scan.dir= (info->scan.page_buff + info->s->block_size - + PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE); + info->scan.dir_end= (info->scan.dir - + (info->scan.number_of_rows - 1) * + DIR_ENTRY_SIZE); + info->scan.row_base_page= ma_recordpos(page, 0); + record_pos= 0; + } +} + + +/** + @brief Writes a LOGREC_REPAIR_TABLE record and updates create_rename_lsn + and is_of_horizon + + REPAIR/OPTIMIZE have replaced the data/index file with a new file + and so, in this scenario: + @verbatim + CHECKPOINT - REDO_INSERT - COMMIT - ... - REPAIR - ... - crash + @endverbatim + we do not want Recovery to apply the REDO_INSERT to the table, as it would + then possibly wrongly extend the table. By updating create_rename_lsn at + the end of REPAIR, we know that REDO_INSERT will be skipped. 
+ + @param param description of the REPAIR operation + @param info table + + @return Operation status + @retval 0 ok + @retval 1 error (disk problem) +*/ + +static int write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + /* in case this is maria_chk or recovery... */ + if (translog_inited && !maria_in_recovery) + { + /* + For now this record is only informative. It could serve when applying + logs to a backup, but that needs more thought. Assume table became + corrupted. It is repaired, then some writes happen to it. + Later we restore an old backup, and want to apply this REDO_REPAIR_TABLE + record. For it to give the same result as originally, the table should + be corrupted the same way, so applying previous REDOs should produce the + same corruption; that's really not guaranteed (different execution paths + in execution of REDOs vs runtime code so not same bugs hit, temporary + hardware issues not repeatable etc). Corruption may not be repeatable. + A reasonable solution is to execute the REDO_REPAIR_TABLE record and + check if the checksum of the resulting table matches what it was at the + end of the original repair (should be stored in log record); or execute + the REDO_REPAIR_TABLE if the checksum of the table-before-repair matches + what it was at the start of the original repair (should be stored in log + record). + */ + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + uchar log_data[LSN_STORE_SIZE]; + LSN lsn; + compile_time_assert(LSN_STORE_SIZE >= (FILEID_STORE_SIZE + 4)); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= FILEID_STORE_SIZE + 4; + /* + testflag gives an idea of what REPAIR did (in particular T_QUICK + or not: did it touch the data file or not?). 
+ */ + int4store(log_data + FILEID_STORE_SIZE, param->testflag); + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_REPAIR_TABLE, + &dummy_transaction_object, info, + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, log_data) || + translog_flush(lsn))) + return 1; + /* + The table's existence was made durable earlier (MY_SYNC_DIR passed to + maria_change_to_newfile()). _ma_flush_table_files_after_repair() was + called earlier, flushed and forced data+index+state. Old REDOs should + not be applied to the table: + */ + if (_ma_update_create_rename_lsn(share, lsn, TRUE)) + return 1; + } + return 0; +} diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c new file mode 100644 index 00000000000..4446285fce9 --- /dev/null +++ b/storage/maria/ma_checkpoint.c @@ -0,0 +1,1108 @@ +/* Copyright (C) 2006,2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3071 Maria checkpoint + First version written by Guilhem Bichot on 2006-04-27. +*/ + +/* Here is the implementation of this module */ + +/** + @todo RECOVERY BUG this is unreviewed code, but used in safe conditions: + ha_maria takes a checkpoint at end of recovery and one at clean shutdown, + that's all. So there never are open tables, dirty pages, transactions. 
+*/ +/* + Summary: + checkpoints are done either by a background thread (checkpoint every Nth + second) or by a client. + In ha_maria, it's not made available to clients, and will soon be done by a + background thread (periodically taking checkpoints and flushing dirty + pages). +*/ + +#include "maria_def.h" +#include "ma_pagecache.h" +#include "trnman.h" +#include "ma_blockrec.h" +#include "ma_checkpoint.h" +#include "ma_loghandler_lsn.h" + + +/* + Checkpoints currently happen only at ha_maria's startup (after recovery) and + at shutdown, always when there is no open tables. + Background page flushing is not used. + So, needed pagecache functions for doing this flushing are not yet pushed. +*/ +#define flush_pagecache_blocks_with_filter(A,B,C,D,E) (int)(((ulong)D) * 0) +/** + filter has to return 0, 1 or 2: 0 means "don't flush this page", 1 means + "flush it", 2 means "don't flush this page and following pages". + Will move to ma_pagecache.h +*/ +typedef int (*PAGECACHE_FILTER)(enum pagecache_page_type type, + pgcache_page_no_t page, + LSN rec_lsn, void *arg); + + +/** @brief type of checkpoint currently running */ +static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE; +/** @brief protects checkpoint_in_progress */ +static pthread_mutex_t LOCK_checkpoint; +/** @brief for killing the background checkpoint thread */ +static pthread_cond_t COND_checkpoint; +/** @brief if checkpoint module was inited or not */ +static my_bool checkpoint_inited= FALSE; +/** @brief 'kill' flag for the background checkpoint thread */ +static int checkpoint_thread_die; +/* is ulong like pagecache->blocks_changed */ +static ulong pages_to_flush_before_next_checkpoint; +static PAGECACHE_FILE *dfiles, /**< data files to flush in background */ + *dfiles_end; /**< list of data files ends here */ +static PAGECACHE_FILE *kfiles, /**< index files to flush in background */ + *kfiles_end; /**< list of index files ends here */ +/* those two statistics below could serve in SHOW GLOBAL 
STATUS */ +static uint checkpoints_total= 0, /**< all checkpoint requests made */ + checkpoints_ok_total= 0; /**< all checkpoints which succeeded */ + +struct st_filter_param +{ + my_bool is_data_file; /**< is the file about data or index */ + LSN up_to_lsn; /**< only pages with rec_lsn < this LSN */ + ulong pages_covered_by_bitmap; /**< to know which page is a bitmap page */ + uint max_pages; /**< stop after flushing this number pages */ +}; /**< information to determine which dirty pages should be flushed */ + +static int filter_flush_data_file_medium(enum pagecache_page_type type, + pgcache_page_no_t page, + LSN rec_lsn, void *arg); +static int filter_flush_data_file_full(enum pagecache_page_type type, + pgcache_page_no_t page, + LSN rec_lsn, void *arg); +static int filter_flush_data_file_indirect(enum pagecache_page_type type, + pgcache_page_no_t page, + LSN rec_lsn, void *arg); +static int filter_flush_data_file_evenly(enum pagecache_page_type type, + pgcache_page_no_t pageno, + LSN rec_lsn, void *arg); +static int really_execute_checkpoint(void); +pthread_handler_t ma_checkpoint_background(void *arg); +static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon); + +/** + @brief Does a checkpoint + + @param level what level of checkpoint to do + @param no_wait if another checkpoint of same or stronger level + is already running, consider our job done + + @note In ha_maria, there can never be two threads trying a checkpoint at + the same time. + + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait) +{ + int result= 0; + DBUG_ENTER("ma_checkpoint_execute"); + + if (!checkpoint_inited) + { + /* + If ha_maria failed to start, maria_panic_hton is called, we come here. 
+ */ + DBUG_RETURN(0); + } + DBUG_ASSERT(level > CHECKPOINT_NONE); + + /* look for already running checkpoints */ + pthread_mutex_lock(&LOCK_checkpoint); + while (checkpoint_in_progress != CHECKPOINT_NONE) + { + if (no_wait && (checkpoint_in_progress >= level)) + { + /* + If we are the checkpoint background thread, we don't wait (it's + smarter to flush pages instead of waiting here while the other thread + finishes its checkpoint). + */ + pthread_mutex_unlock(&LOCK_checkpoint); + goto end; + } + pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint); + } + + checkpoint_in_progress= level; + pthread_mutex_unlock(&LOCK_checkpoint); + /* from then on, we are sure to be and stay the only checkpointer */ + + result= really_execute_checkpoint(); + pthread_cond_broadcast(&COND_checkpoint); +end: + DBUG_RETURN(result); +} + + +/** + @brief Does a checkpoint, really; expects no other checkpoints + running. + + Checkpoint level requested is read from checkpoint_in_progress. + + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +static int really_execute_checkpoint(void) +{ + uint i, error= 0; + /** @brief checkpoint_start_log_horizon will be stored there */ + char *ptr; + LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */ + LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn; + TRANSLOG_ADDRESS checkpoint_start_log_horizon; + uchar checkpoint_start_log_horizon_char[LSN_STORE_SIZE]; + DBUG_ENTER("really_execute_checkpoint"); + bzero(&record_pieces, sizeof(record_pieces)); + + /* + STEP 1: record current end-of-log position using log's lock. It is + critical for the correctness of Checkpoint (related to memory visibility + rules, the log's lock is a mutex). + "Horizon" is a lower bound of the LSN of the next log record. + */ + /** + @todo RECOVERY BUG + this is an horizon, but it is used as a LSN (REDO phase may start from + there! 
probably log handler would refuse to read then; + Sanja proposed to make a loghandler's function which finds the LSN after + this horizon. + */ + checkpoint_start_log_horizon= translog_get_horizon(); + DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)", + LSN_IN_PARTS(checkpoint_start_log_horizon))); + lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon); + + + /* + STEP 2: fetch information about transactions. + We must fetch transactions before dirty pages. Indeed, a transaction + first sets its rec_lsn then sets the page's rec_lsn then sets its rec_lsn + to 0. If we fetched pages first, we may see no dirty page yet, then we + fetch transactions but the transaction has already reset its rec_lsn to 0 + so we miss rec_lsn again. + For a similar reason (over-allocated bitmap pages) we have to fetch + transactions before flushing bitmap pages. + + min_trn_rec_lsn will serve to lower the starting point of the REDO phase + (down from checkpoint_start_log_horizon). + */ + if (unlikely(trnman_collect_transactions(&record_pieces[0], + &record_pieces[1], + &min_trn_rec_lsn, + &min_first_undo_lsn))) + goto err; + + + /* STEP 3: fetch information about table files */ + if (unlikely(collect_tables(&record_pieces[2], + checkpoint_start_log_horizon))) + goto err; + + + /* STEP 4: fetch information about dirty pages */ + /* + It's better to do it _after_ having flushed some data pages (which + collect_tables() may have done), because those are now non-dirty and so we + have a more up-to-date dirty pages list to put into the checkpoint record, + and thus we will have less work at Recovery. 
+ */ + /* Using default pagecache for now */ + if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache, + &record_pieces[3], + &min_page_rec_lsn))) + goto err; + + + /* LAST STEP: now write the checkpoint log record */ + { + LSN lsn; + uint total_rec_length; + /* + the log handler is allowed to modify "str" and "length" (but not "*str") + of its argument, so we must not pass it record_pieces directly, + otherwise we would later not know what memory pieces to my_free(). + */ + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 5]; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= + checkpoint_start_log_horizon_char; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length= + sizeof(checkpoint_start_log_horizon_char); + for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++) + { + log_array[TRANSLOG_INTERNAL_PARTS + 1 + i]= record_pieces[i]; + total_rec_length+= record_pieces[i].length; + } + + if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT, + &dummy_transaction_object, NULL, + total_rec_length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL) || + translog_flush(lsn))) + goto err; + + translog_lock(); + /* + This cannot be done as a inwrite_rec_hook of LOGREC_CHECKPOINT, because + such hook would be called before translog_flush (and we must be sure + that log was flushed before we write to the control file). + */ + if (unlikely(ma_control_file_write_and_force(lsn, FILENO_IMPOSSIBLE, + CONTROL_FILE_UPDATE_ONLY_LSN))) + { + translog_unlock(); + goto err; + } + translog_unlock(); + } + + /* + Note that we should not alter memory structures until we have successfully + written the checkpoint record and control file. 
+ */ + /* checkpoint succeeded */ + ptr= record_pieces[3].str; + pages_to_flush_before_next_checkpoint= uint4korr(ptr); + DBUG_PRINT("info",("%u pages to flush before next checkpoint", + (uint)pages_to_flush_before_next_checkpoint)); + + /* compute log's low-water mark */ + TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn; + set_if_smaller(log_low_water_mark, min_trn_rec_lsn); + set_if_smaller(log_low_water_mark, min_first_undo_lsn); + set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon); + /** + Now purge unneeded logs. + As some systems have an unreliable fsync (drive lying), we could try to + be robust against that: remember a few previous checkpoints in the + control file, and not purge logs immediately... Think about it. + */ +#if 0 /* purging/keeping will be an option */ + if (translog_purge(log_low_water_mark)) + fprintf(stderr, "Maria engine: log purge failed\n"); /* not deadly */ +#endif + + goto end; + +err: + error= 1; + fprintf(stderr, "Maria engine: checkpoint failed\n"); /* TODO: improve ;) */ + /* we were possibly not able to determine what pages to flush */ + pages_to_flush_before_next_checkpoint= 0; + +end: + for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++) + my_free(record_pieces[i].str, MYF(MY_ALLOW_ZERO_PTR)); + pthread_mutex_lock(&LOCK_checkpoint); + checkpoint_in_progress= CHECKPOINT_NONE; + checkpoints_total++; + checkpoints_ok_total+= !error; + pthread_mutex_unlock(&LOCK_checkpoint); + DBUG_RETURN(error); +} + + +/** + @brief Initializes the checkpoint module + + @param create_background_thread If one wants the module to now create a + thread which will periodically do + checkpoints, and flush dirty pages, in the + background. 
+ + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +int ma_checkpoint_init(my_bool create_background_thread) +{ + pthread_t th; + int res= 0; + DBUG_ENTER("ma_checkpoint_init"); + checkpoint_thread_die= 2; /* not yet born == dead */ + if (pthread_mutex_init(&LOCK_checkpoint, MY_MUTEX_INIT_SLOW)) + DBUG_RETURN(1); + if (pthread_cond_init(&COND_checkpoint, 0)) + { + /* don't leak the mutex which was successfully created just above */ + pthread_mutex_destroy(&LOCK_checkpoint); + DBUG_RETURN(1); + } + /* + Declare the module inited only now that the mutex and the condition + exist: ma_checkpoint_execute() and ma_checkpoint_end() use them as soon + as they see checkpoint_inited == TRUE. + */ + checkpoint_inited= TRUE; + if (create_background_thread) + { + /* pthread_create() returns 0 on success, an error number otherwise */ + if (!(res= pthread_create(&th, NULL, ma_checkpoint_background, NULL))) + checkpoint_thread_die= 0; /* thread lives, will have to be killed */ + } + DBUG_RETURN(res); +} + + +/** + @brief Destroys the checkpoint module +*/ + +void ma_checkpoint_end(void) +{ + DBUG_ENTER("ma_checkpoint_end"); + if (checkpoint_inited) + { + pthread_mutex_lock(&LOCK_checkpoint); + if (checkpoint_thread_die != 2) /* thread was started ok */ + { + DBUG_PRINT("info",("killing Maria background checkpoint thread")); + checkpoint_thread_die= 1; /* kill it */ + do /* and wait for it to be dead */ + { + /* wake it up if it was in a sleep */ + pthread_cond_broadcast(&COND_checkpoint); + DBUG_PRINT("info",("waiting for Maria background checkpoint thread" + " to die")); + pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint); + } + while (checkpoint_thread_die != 2); + } + pthread_mutex_unlock(&LOCK_checkpoint); + my_free((uchar *)dfiles, MYF(MY_ALLOW_ZERO_PTR)); + my_free((uchar *)kfiles, MYF(MY_ALLOW_ZERO_PTR)); + pthread_mutex_destroy(&LOCK_checkpoint); + pthread_cond_destroy(&COND_checkpoint); + checkpoint_inited= FALSE; + } + DBUG_VOID_RETURN; +} + + +/** + @brief dirty-page filtering criteria for MEDIUM checkpoint. + + We flush data/index pages which have been dirty since the previous + checkpoint (this is the two-checkpoint rule: the REDO phase will not have + to start from earlier than the next-to-last checkpoint), and all dirty + bitmap pages. 
+ + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg filter_param + + @return Operation status + @retval 0 don't flush the page + @retval 1 flush the page +*/ + +static int filter_flush_data_file_medium(enum pagecache_page_type type, + pgcache_page_no_t pageno, + LSN rec_lsn, void *arg) +{ + struct st_filter_param *param= (struct st_filter_param *)arg; + return ((type == PAGECACHE_LSN_PAGE) && + (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0)) || + (param->is_data_file && + ((pageno % param->pages_covered_by_bitmap) == 0)); +} + + +/** + @brief dirty-page filtering criteria for FULL checkpoint. + + We flush all dirty data/index pages and all dirty bitmap pages. + + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg filter_param + + @return Operation status + @retval 0 don't flush the page + @retval 1 flush the page +*/ + +static int filter_flush_data_file_full(enum pagecache_page_type type, + pgcache_page_no_t pageno, + LSN rec_lsn + __attribute__ ((unused)), + void *arg) +{ + struct st_filter_param *param= (struct st_filter_param *)arg; + return (type == PAGECACHE_LSN_PAGE) || + (param->is_data_file && + ((pageno % param->pages_covered_by_bitmap) == 0)); +} + + +/** + @brief dirty-page filtering criteria for INDIRECT checkpoint. + + We flush all dirty bitmap pages. 
+ + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg filter_param + + @return Operation status + @retval 0 don't flush the page + @retval 1 flush the page +*/ + +static int filter_flush_data_file_indirect(enum pagecache_page_type type + __attribute__ ((unused)), + pgcache_page_no_t pageno, + LSN rec_lsn + __attribute__ ((unused)), + void *arg) +{ + struct st_filter_param *param= (struct st_filter_param *)arg; + return + (param->is_data_file && + ((pageno % param->pages_covered_by_bitmap) == 0)); +} + + +/** + @brief dirty-page filtering criteria for background flushing thread. + + We flush data pages which have been dirty since the previous checkpoint + (this is the two-checkpoint rule: the REDO phase will not have to start + from earlier than the next-to-last checkpoint), and all dirty bitmap + pages. But we flush no more than a certain number of pages (to have an + even flushing, no write burst). + + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg filter_param + + @return Operation status + @retval 0 don't flush the page + @retval 1 flush the page + @retval 2 don't flush the page and following pages +*/ + +static int filter_flush_data_file_evenly(enum pagecache_page_type type, + pgcache_page_no_t pageno + __attribute__ ((unused)), + LSN rec_lsn, void *arg) +{ + struct st_filter_param *param= (struct st_filter_param *)arg; + if (unlikely(param->max_pages == 0)) /* all flushed already */ + return 2; + if ((type == PAGECACHE_LSN_PAGE) && + (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0)) + { + param->max_pages--; + return 1; + } + return 0; +} + + +/** + @brief Background thread which does checkpoints and flushes periodically. + + Takes a checkpoint every 30th second. After taking a checkpoint, all pages + dirty at the time of that checkpoint are flushed evenly until it is time to + take another checkpoint (30 seconds later). 
This ensures that the REDO + phase starts at earliest (in LSN time) at the next-to-last checkpoint + record ("two-checkpoint rule"). + + @note MikaelR questioned why the same thread does two different jobs, the + risk could be that while a checkpoint happens no LRD flushing happens. + + @note MikaelR noted that he observed that Linux's file cache may never + fsync to disk until this cache is full, at which point it decides to empty + the cache, making the machine very slow. A solution was to fsync after + writing 2 MB. +*/ + +pthread_handler_t ma_checkpoint_background(void *arg __attribute__((unused))) +{ + const uint sleep_unit= 1 /* 1 second */, + time_between_checkpoints= 30, /* 30 sleep units */ + /** @brief At least this of log/page bytes written between checkpoints */ + checkpoint_min_activity= 2*1024*1024; + uint sleeps= 0; + /* + The variables below carry state from one loop iteration to the next + ("case 0" and "case 1" set them, later iterations read them), so they + must live outside of the loop body. + */ + uint pages_bunch_size= 0; + struct st_filter_param filter_param; + PAGECACHE_FILE *dfile= NULL; /**< data file currently being flushed */ + PAGECACHE_FILE *kfile= NULL; /**< index file currently being flushed */ + TRANSLOG_ADDRESS log_horizon_at_last_checkpoint= LSN_IMPOSSIBLE; + ulonglong pagecache_flushes_at_last_checkpoint= 0; + + my_thread_init(); + DBUG_PRINT("info",("Maria background checkpoint thread starts")); + for(;;) + { + struct timespec abstime; +#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */ + sleeps=0; +#endif + switch((sleeps++) % time_between_checkpoints) + { + case 0: + /* + With background flushing evenly distributed over the time + between two checkpoints, we should have only little flushing to do + in the checkpoint. + */ + /* + No checkpoint if little work of interest for recovery was done + since last checkpoint. Such work includes log writing (lengthens + recovery, checkpoint would shorten it), page flushing (checkpoint + would decrease the amount of read pages in recovery). 
+ */ + if (((translog_get_horizon() - log_horizon_at_last_checkpoint) + + (maria_pagecache->global_cache_write - + pagecache_flushes_at_last_checkpoint) * + maria_pagecache->block_size) < checkpoint_min_activity) + { + /* don't take checkpoint, so don't know what to flush */ + pages_to_flush_before_next_checkpoint= 0; + break; + } + ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE); + /* + Snapshot this kind of "state" of the engine. Note that the value below + is possibly greater than last_checkpoint_lsn. + */ + log_horizon_at_last_checkpoint= translog_get_horizon(); + pagecache_flushes_at_last_checkpoint= + maria_pagecache->global_cache_write; + /* + If the checkpoint above succeeded it has set d|kfiles and + d|kfiles_end. If it has failed, it has set + pages_to_flush_before_next_checkpoint to 0 so we will skip flushing + and sleep until the next checkpoint. + */ + break; + case 1: + /* set up parameters for background page flushing */ + filter_param.up_to_lsn= last_checkpoint_lsn; + pages_bunch_size= pages_to_flush_before_next_checkpoint / + time_between_checkpoints; + dfile= dfiles; + kfile= kfiles; + /* fall through */ + default: + if (pages_bunch_size > 0) + { + /* flush a bunch of dirty pages */ + filter_param.max_pages= pages_bunch_size; + filter_param.is_data_file= TRUE; + while (dfile != dfiles_end) + { + int res= + flush_pagecache_blocks_with_filter(maria_pagecache, + dfile, FLUSH_KEEP, + filter_flush_data_file_evenly, + &filter_param); + /* note that it may just be a pinned page */ + if (unlikely(res)) + fprintf(stderr, "Maria engine: warning - background page flush" + " failed\n"); + if (filter_param.max_pages == 0) /* bunch all flushed, sleep */ + break; /* and we will continue with the same file */ + dfile++; /* otherwise all this file is flushed, move to next file */ + } + filter_param.is_data_file= FALSE; + while (kfile != kfiles_end) + { + /* index files: flush the file kfile points to, not the data file */ + int res= + flush_pagecache_blocks_with_filter(maria_pagecache, + kfile, FLUSH_KEEP, + 
filter_flush_data_file_evenly, + &filter_param); + if (unlikely(res)) + fprintf(stderr, "Maria engine: warning - background page flush" + " failed\n"); + if (filter_param.max_pages == 0) /* bunch all flushed, sleep */ + break; /* and we will continue with the same file */ + kfile++; /* otherwise all this file is flushed, move to next file */ + } + } + } + pthread_mutex_lock(&LOCK_checkpoint); + if (checkpoint_thread_die == 1) + break; +#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */ + pthread_mutex_unlock(&LOCK_checkpoint); + my_sleep(100000); // a tenth of a second + pthread_mutex_lock(&LOCK_checkpoint); +#else + /* To have a killable sleep, we use timedwait like our SQL GET_LOCK() */ + set_timespec(abstime, sleep_unit); + pthread_cond_timedwait(&COND_checkpoint, &LOCK_checkpoint, &abstime); +#endif + if (checkpoint_thread_die == 1) + break; + pthread_mutex_unlock(&LOCK_checkpoint); + } + pthread_mutex_unlock(&LOCK_checkpoint); + DBUG_PRINT("info",("Maria background checkpoint thread ends")); + /* + A last checkpoint, now that all tables should be closed; to have instant + recovery later. We always do it, because the test above about number of + log records or flushed pages is only approximate. For example, some log + records may have been written while ma_checkpoint_execute() above was + running, or some pages may have been flushed during this time. Thus it + could be that, while nothing has changed since that checkpoint's *end*, if + we recovered from that checkpoint we would have a non-empty dirty pages + list, REDOs to execute, and we don't want that, we want a clean shutdown + to have an empty recovery (simplifies upgrade/backups: one can just do a + clean shutdown, copy its tables to another system without copying the log + or control file and it will work because recovery will not need those). 
+ Another reason why it's approximate is that a log record may have been + written above between ma_checkpoint_execute() and the + translog_get_horizon() which follows. + So, we have at least two checkpoints per start/stop of the engine, and + only two if the engine stays idle. + */ + ma_checkpoint_execute(CHECKPOINT_FULL, FALSE); + pthread_mutex_lock(&LOCK_checkpoint); + checkpoint_thread_die= 2; /* indicate that we are dead */ + /* wake up ma_checkpoint_end() which may be waiting for our death */ + pthread_cond_broadcast(&COND_checkpoint); + /* broadcast is done before unlock because ma_checkpoint_end() destroys mutex */ + pthread_mutex_unlock(&LOCK_checkpoint); + my_thread_end(); + return 0; +} + + +/** + @brief Allocates buffer and stores in it some info about open tables, + does some flushing on those. + + Does the allocation because the caller cannot know the size itself. + Memory freeing is to be done by the caller (if the "str" member of the + LEX_STRING is not NULL). + The caller is taking a checkpoint. + + @param[out] str pointer to where the allocated buffer, + and its size, will be put; buffer will be filled + with info about open tables + @param checkpoint_start_log_horizon Of the in-progress checkpoint + record. 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon) +{ + MARIA_SHARE **distinct_shares= NULL; + char *ptr; + uint error= 1, sync_error= 0, nb, nb_stored, i; + my_bool unmark_tables= TRUE; + uint total_names_length; + LIST *pos; /**< to iterate over open tables */ + struct st_state_copy { + uint index; + MARIA_STATE_INFO state; + }; + struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */ + *state_copies_end, /**< cache ends here */ + *state_copy; /**< iterator in cache */ + TRANSLOG_ADDRESS state_copies_horizon; /**< horizon of states' _copies_ */ + DBUG_ENTER("collect_tables"); + + /* let's make a list of distinct shares */ + pthread_mutex_lock(&THR_LOCK_maria); + for (nb= 0, pos= maria_open_list; pos; pos= pos->next) + { + MARIA_HA *info= (MARIA_HA*)pos->data; + MARIA_SHARE *share= info->s; + /* the first three variables below can never change */ + if (share->base.born_transactional && !share->temporary && + share->mode != O_RDONLY && + !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)) + { + /* + Why we didn't take intern_lock above: table had in_checkpoint==0 so no + thread could set in_checkpoint. And no thread needs to know that we + are setting in_checkpoint, because only maria_close() needs it and + cannot run now as we hold THR_LOCK_maria. + */ + /* + This table is relevant for checkpoint and not already seen. Mark it, + so that it is not seen again in the loop. + */ + nb++; + DBUG_ASSERT(share->in_checkpoint == 0); + /* This flag ensures that we count only _distinct_ shares. 
*/ + share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP; + } + } + if (unlikely((distinct_shares= + (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *), + MYF(MY_WME))) == NULL)) + goto err; + for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next) + { + MARIA_HA *info= (MARIA_HA*)pos->data; + MARIA_SHARE *share= info->s; + if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP) + { + distinct_shares[i++]= share; + /* + With this we prevent the share from going away while we later flush + and force it without holding THR_LOCK_maria. For example if the share + could be my_free()d by maria_close() we would have a problem when we + access it to flush the table. We "pin" the share pointer. + And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is + not seen again in the loop. + */ + share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME; + /** @todo avoid strlen() */ + total_names_length+= strlen(share->open_file_name); + } + } + + DBUG_ASSERT(i == nb); + pthread_mutex_unlock(&THR_LOCK_maria); + DBUG_PRINT("info",("found %u table shares", nb)); + + str->length= + 4 + /* number of tables */ + (2 + /* short id */ + 4 + /* kfile */ + 4 + /* dfile */ + LSN_STORE_SIZE + /* first_log_write_at_lsn */ + 1 /* end-of-name 0 */ + ) * nb + total_names_length; + if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL)) + goto err; + + ptr= str->str; + ptr+= 4; /* real number of stored tables is not yet know */ + + struct st_filter_param filter_param; + /* only possible checkpointer, so can do the read below without mutex */ + filter_param.up_to_lsn= last_checkpoint_lsn; + PAGECACHE_FILTER filter; + switch(checkpoint_in_progress) + { + case CHECKPOINT_MEDIUM: + filter= &filter_flush_data_file_medium; + break; + case CHECKPOINT_FULL: + filter= &filter_flush_data_file_full; + break; + case CHECKPOINT_INDIRECT: + filter= &filter_flush_data_file_indirect; + break; + default: + DBUG_ASSERT(0); + goto err; + } + + /* + The principle of 
reading/writing the state below is explained in + ma_recovery.c, look for "Recovery of the state". + */ +#define STATE_COPIES 1024 + state_copies= (struct st_state_copy *) + my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME)); + dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles, + /* avoid size of 0 for my_realloc */ + max(1, nb) * sizeof(PAGECACHE_FILE), + MYF(MY_WME | MY_ALLOW_ZERO_PTR)); + kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles, + /* avoid size of 0 for my_realloc */ + max(1, nb) * sizeof(PAGECACHE_FILE), + MYF(MY_WME | MY_ALLOW_ZERO_PTR)); + if (unlikely((state_copies == NULL) || + (dfiles == NULL) || (kfiles == NULL))) + goto err; + state_copy= state_copies_end= NULL; + dfiles_end= dfiles; + kfiles_end= kfiles; + + for (nb_stored= 0, i= 0; i < nb; i++) + { + MARIA_SHARE *share= distinct_shares[i]; + PAGECACHE_FILE kfile, dfile; + if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME)) + { + /* No need for a mutex to read the above, only us can write this flag */ + continue; + } + DBUG_PRINT("info",("looking at table '%s'", share->open_file_name)); + if (state_copy == state_copies_end) /* we have no more cached states */ + { + /* + Collect and cache a bunch of states. We do this for many states at a + time, to not lock/unlock the log's lock too often. 
+ */ + uint j, bound= min(nb, i + STATE_COPIES); + state_copy= state_copies; + /* part of the state is protected by log's lock */ + translog_lock(); + state_copies_horizon= translog_get_horizon_no_lock(); + for (j= i; j < bound; j++) + { + MARIA_SHARE *share2= distinct_shares[j]; + if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME)) + continue; + state_copy->index= j; + state_copy->state= share2->state; /* we copy the state */ + state_copy++; + /* + data_file_length is not updated under log's lock by the bitmap + code, but writing a wrong data_file_length is ok: a next + maria_close() will correct it; if we crash before, Recovery will + set it to the true physical size. + */ + } + translog_unlock(); + state_copies_end= state_copy; + state_copy= state_copies; + /* so now we have cached states */ + } + + /* locate our state among these cached ones */ + for ( ; state_copy->index != i; state_copy++) + DBUG_ASSERT(state_copy < state_copies_end); + + filter_param.pages_covered_by_bitmap= share->bitmap.pages_covered; + /* OS file descriptors are ints which we stored in 4 bytes */ + compile_time_assert(sizeof(int) == 4); + pthread_mutex_lock(&share->intern_lock); + /* + Tables in a normal state have their two file descriptors open. + In some rare cases like REPAIR, some descriptor may be closed or even + -1. If that happened, the _ma_state_info_write() may fail. This is + prevented by enclosing all all places which close/change kfile.file with + intern_lock. + */ + kfile= share->kfile; + dfile= share->bitmap.file; + /* + Ignore table which has no logged writes (all its future log records will + be found naturally by Recovery). Ignore obsolete shares (_before_ + setting themselves to last_version=0 they already did all flush and + sync; if we flush their state now we may be flushing an obsolete state + onto a newer one (assuming the table has been reopened with a different + share but of course same physical index file). 
+ */ + if ((share->id != 0) && (share->last_version != 0)) + { + /** @todo avoid strlen */ + uint open_file_name_len= strlen(share->open_file_name) + 1; + /* remember the descriptors for background flush */ + *(dfiles_end++)= dfile; + *(kfiles_end++)= kfile; + /* we will store this table in the record */ + nb_stored++; + int2store(ptr, share->id); + ptr+= 2; + /* + We must store the OS file descriptors, because the pagecache, which + tells us the list of dirty pages, refers to these pages by OS file + descriptors. An alternative is to make the page cache aware of the + 2-byte id and of the location of a page ("is it a data file page or an + index file page?"). + If one descriptor is -1, normally there should be no dirty pages + collected for this file, it's ok to store -1, it will not be used. + */ + int4store(ptr, kfile.file); + ptr+= 4; + int4store(ptr, dfile.file); + ptr+= 4; + lsn_store(ptr, share->lsn_of_file_id); + ptr+= LSN_STORE_SIZE; + /* + first_bitmap_with_space is not updated under log's lock, and is + important. We would need the bitmap's lock to get it right. Recovery + of this is not clear, so we just play safe: write it out as + unknown: if crash, _ma_bitmap_init() at next open (for example in + Recovery) will convert it to 0 and thus the first insertion will + search for free space from the file's first bitmap (0) - + under-optimal but safe. + If no crash, maria_close() will write the exact value. + */ + state_copy->state.first_bitmap_with_space= ~(ulonglong)0; + memcpy(ptr, share->open_file_name, open_file_name_len); + ptr+= open_file_name_len; + if (cmp_translog_addr(share->state.is_of_horizon, + checkpoint_start_log_horizon) >= 0) + { + /* + State was flushed recently, it does not hold down the log's + low-water mark and will not give avoidable work to Recovery. So we + needn't flush it. 
Also, it is possible that while we copied the + state above (under log's lock, without intern_lock) it was being + modified in memory or flushed to disk (without log's lock, under + intern_lock, like in maria_extra()), so our copy may be incorrect + and we should not flush it. + It may also be a share which got last_version==0 since we checked + last_version; in this case, it flushed its state and the LSN test + above will catch it. + */ + } + else + { + /* + We could do the state flush only if share->changed, but it's + tricky. + Consider a maria_write() which has written REDO,UNDO, and before it + calls _ma_writeinfo() (setting share->changed=1), checkpoint + happens and sees share->changed=0, does not flush state. It is + possible that Recovery does not start from before the REDO and thus + the state is not recovered. A solution may be to set + share->changed=1 under log mutex when writing log records. + But as anyway we have another problem below, this optimization would + be of little use. + */ + /** @todo flush state only if changed since last checkpoint */ + DBUG_ASSERT(share->last_version != 0); + state_copy->state.is_of_horizon= share->state.is_of_horizon= + state_copies_horizon; + if (kfile.file >= 0) + sync_error|= + _ma_state_info_write_sub(kfile.file, &state_copy->state, 1); + /* + We don't set share->changed=0 because it may interfere with a + concurrent _ma_writeinfo() doing share->changed=1 (cancel its + effect). The sad consequence is that we will flush the same state at + each checkpoint if the table was once written and then not anymore. 
+ */ + } + sync_error|= + _ma_flush_bitmap(share); /* after that, all is in page cache */ + DBUG_ASSERT(share->pagecache == maria_pagecache); + } + if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME) + { + /* maria_close() left us to free the share */ + pthread_mutex_unlock(&share->intern_lock); + pthread_mutex_destroy(&share->intern_lock); + my_free((uchar *)share, MYF(0)); + } + else + { + /* share goes back to normal state */ + share->in_checkpoint= 0; + pthread_mutex_unlock(&share->intern_lock); + } + + /* + We do the big disk writes out of intern_lock to not block other + users of this table (intern_lock is taken at the start and end of + every statement). This means that file descriptors may be invalid + (files may have been closed for example by HA_EXTRA_PREPARE_FOR_* + under Windows, or REPAIR). This should not be a problem as we use + MY_IGNORE_BADFD. Descriptors may even point to other files but then + the old blocks (of before the close) must have been flushed for sure, + so our flush will flush new blocks (of after the latest open) and that + should do no harm. + */ + /* + If CHECKPOINT_MEDIUM, this big flush below may result in a + serious write burst. Realize that all pages dirtied between the + last checkpoint and the one we are doing now, will be flushed at + next checkpoint, except those evicted by LRU eviction (depending on + the size of the page cache compared to the size of the working data + set, eviction may be rare or frequent). + We avoid that burst by anticipating: those pages are flushed + in bunches spanned regularly over the time interval between now and + the next checkpoint, by a background thread. Thus the next checkpoint + will have only little flushing to do (CHECKPOINT_MEDIUM should thus be + only a little slower than CHECKPOINT_INDIRECT). + */ + + /** + @todo we ignore the error because it may be just due a pinned page; + we should rather fix the function below to distinguish between + pinned page and write error. 
Then we can turn the warning into an + error. + */ + if (((filter_param.is_data_file= TRUE), + flush_pagecache_blocks_with_filter(maria_pagecache, + &dfile, FLUSH_KEEP, + filter, &filter_param)) || + ((filter_param.is_data_file= FALSE), + flush_pagecache_blocks_with_filter(maria_pagecache, + &kfile, FLUSH_KEEP, + filter, &filter_param))) + fprintf(stderr, "Maria engine: warning - checkpoint page flush" + " failed\n"); /** @todo improve */ + /* + fsyncs the fd, that's the loooong operation (e.g. max 150 fsync + per second, so if you have touched 1000 files it's 7 seconds). + */ + sync_error|= + my_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) | + my_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD)); + /* + in case of error, we continue because writing other tables to disk is + still useful. + */ + } + + if (sync_error) + goto err; + /* We maybe over-estimated (due to share->id==0 or last_version==0) */ + DBUG_ASSERT(str->length >= (uint)(ptr - str->str)); + str->length= (uint)(ptr - str->str); + /* + As we support max 65k tables open at a time (2-byte short id), we + assume uint is enough for the cumulated length of table names; and + LEX_STRING::length is uint. 
+ */ + int4store(str->str, nb_stored); + error= unmark_tables= 0; + +err: + if (unlikely(unmark_tables)) + { + /* maria_close() uses THR_LOCK_maria from start to end */ + pthread_mutex_lock(&THR_LOCK_maria); + for (i= 0; i < nb; i++) + { + MARIA_SHARE *share= distinct_shares[i]; + if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME) + { + /* maria_close() left us to free the share */ + pthread_mutex_destroy(&share->intern_lock); + my_free((uchar *)share, MYF(0)); + } + else + { + /* share goes back to normal state */ + share->in_checkpoint= 0; + } + } + pthread_mutex_unlock(&THR_LOCK_maria); + } + my_free((uchar *)distinct_shares, MYF(MY_ALLOW_ZERO_PTR)); + my_free((uchar *)state_copies, MYF(MY_ALLOW_ZERO_PTR)); + DBUG_RETURN(error); +} diff --git a/storage/maria/ma_checkpoint.h b/storage/maria/ma_checkpoint.h new file mode 100644 index 00000000000..86f3779ca7a --- /dev/null +++ b/storage/maria/ma_checkpoint.h @@ -0,0 +1,81 @@ +/* Copyright (C) 2006,2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3071 Maria checkpoint + First version written by Guilhem Bichot on 2006-04-27. + Does not compile yet. +*/ + +/* This is the interface of this module. 
*/ + +typedef enum enum_ma_checkpoint_level { + CHECKPOINT_NONE= 0, + /* just write dirty_pages, transactions table and sync files */ + CHECKPOINT_INDIRECT, + /* also flush all dirty pages which were already dirty at prev checkpoint */ + CHECKPOINT_MEDIUM, + /* also flush all dirty pages */ + CHECKPOINT_FULL +} CHECKPOINT_LEVEL; + +C_MODE_START +int ma_checkpoint_init(my_bool create_background_thread); +void ma_checkpoint_end(void); +int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait); +C_MODE_END + +/** + @brief reads some LSNs with special trickery + + If a 64-bit variable transitions between both halves being zero to both + halves being non-zero, and back, this function can be used to do a read of + it (without mutex, without atomic load) which always produces a correct + (though maybe slightly old) value (even on 32-bit CPUs). The value is at + least as new as the latest mutex unlock done by the calling thread. + The assumption is that the system sets both 4-byte halves either at the + same time, or one after the other (in any order), but NOT some bytes of the + first half then some bytes of the second half then the rest of bytes of the + first half. With this assumption, the function can detect when it is + seeing an inconsistent value. + + @param LSN pointer to the LSN variable to read + + @return LSN part (most significant byte always 0) +*/ +#if ( SIZEOF_CHARP >= 8 ) +/* 64-bit CPU, 64-bit reads are atomic */ +#define lsn_read_non_atomic LSN_WITH_FLAGS_TO_LSN +#else +static inline LSN lsn_read_non_atomic_32(const volatile LSN *x) +{ + /* + 32-bit CPU, 64-bit reads may give a mixed of old half and new half (old + low bits and new high bits, or the contrary). + */ + for (;;) /* loop until no atomicity problems */ + { + /* + Remove most significant byte in case this is a LSN_WITH_FLAGS object. + Those flags in TRN::first_undo_lsn break the condition on transitions so + they must be removed below. 
+ */ + LSN y= LSN_WITH_FLAGS_TO_LSN(*x); + if (likely((y == LSN_IMPOSSIBLE) || LSN_VALID(y))) + return y; + } +} +#define lsn_read_non_atomic(x) lsn_read_non_atomic_32(&x) +#endif diff --git a/storage/maria/ma_checksum.c b/storage/maria/ma_checksum.c new file mode 100644 index 00000000000..9076b3ebb86 --- /dev/null +++ b/storage/maria/ma_checksum.c @@ -0,0 +1,72 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Calculate a checksum for a row */ + +#include "maria_def.h" + +ha_checksum _ma_checksum(MARIA_HA *info, const uchar *record) +{ + ha_checksum crc=0; + MARIA_COLUMNDEF *column= info->s->columndef; + MARIA_COLUMNDEF *column_end= column+ info->s->base.fields; + + if (info->s->base.null_bytes) + crc= my_checksum(crc, record, info->s->base.null_bytes); + + for ( ; column != column_end ; column++) + { + const uchar *pos= record + column->offset; + ulong length; + + if (record[column->null_pos] & column->null_bit) + continue; /* Null field */ + + switch (column->type) { + case FIELD_BLOB: + { + uint blob_size_length= column->length- portable_sizeof_char_ptr; + length= _ma_calc_blob_length(blob_size_length, pos); + if (length) + { + memcpy((char*) &pos, pos + blob_size_length, sizeof(char*)); + crc= my_checksum(crc, pos, length); + } + continue; + } + case FIELD_VARCHAR: + { + uint pack_length= 
column->fill_length; + if (pack_length == 1) + length= (ulong) *(uchar*) pos; + else + length= uint2korr(pos); + pos+= pack_length; /* Skip length information */ + break; + } + default: + length= column->length; + break; + } + crc= my_checksum(crc, pos, length); + } + return crc; +} + + +ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *pos) +{ + return my_checksum(0, pos, info->s->base.reclength); +} diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c new file mode 100644 index 00000000000..9b654803945 --- /dev/null +++ b/storage/maria/ma_close.c @@ -0,0 +1,156 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* close a isam-database */ +/* + TODO: + We need to have a separate mutex on the closed file to allow other threads + to open other files during the time we flush the cache and close this file +*/ + +#include "maria_def.h" + +int maria_close(register MARIA_HA *info) +{ + int error=0,flag; + my_bool share_can_be_freed= FALSE; + MARIA_SHARE *share=info->s; + DBUG_ENTER("maria_close"); + DBUG_PRINT("enter",("base: 0x%lx reopen: %u locks: %u", + (long) info, (uint) share->reopen, + (uint) share->tot_locks)); + + pthread_mutex_lock(&THR_LOCK_maria); + if (info->lock_type == F_EXTRA_LCK) + info->lock_type=F_UNLCK; /* HA_EXTRA_NO_USER_CHANGE */ + + if (share->reopen == 1 && share->kfile.file >= 0) + _ma_decrement_open_count(info); + + if (info->lock_type != F_UNLCK) + { + if (maria_lock_database(info,F_UNLCK)) + error=my_errno; + } + pthread_mutex_lock(&share->intern_lock); + + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + share->r_locks--; + share->tot_locks--; + } + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + if (end_io_cache(&info->rec_cache)) + error=my_errno; + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + } + flag= !--share->reopen; + maria_open_list=list_delete(maria_open_list,&info->open_list); + + my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + (*share->end)(info); + + if (flag) + { + /* Last close of file; Flush everything */ + if (share->kfile.file >= 0) + { + if ((*share->once_end)(share)) + error= my_errno; + if (flush_pagecache_blocks(share->pagecache, &share->kfile, + (share->temporary ? 
+ FLUSH_IGNORE_CHANGED : + FLUSH_RELEASE))) + error= my_errno; +#ifdef HAVE_MMAP + if (share->file_map) + _ma_unmap_file(info); +#endif + /* + If we are crashed, we can safely flush the current state as it will + not change the crashed state. + We can NOT write the state in other cases as other threads + may be using the file at this point + IF using --external-locking, which does not apply to Maria. + */ + if ((share->changed && share->base.born_transactional) || + (share->mode != O_RDONLY && maria_is_crashed(info))) + { + /* + State must be written to file as it was not done at table's + unlocking. + */ + if (_ma_state_info_write(share, 1)) + error= my_errno; + } + /* + File must be synced as it is going out of the maria_open_list and so + becoming unknown to future Checkpoints. + */ + if (my_sync(share->kfile.file, MYF(MY_WME))) + error= my_errno; + if (my_close(share->kfile.file, MYF(0))) + error= my_errno; + } +#ifdef THREAD + thr_lock_delete(&share->lock); + { + int i,keys; + keys = share->state.header.keys; + VOID(rwlock_destroy(&share->mmap_lock)); + for(i=0; i<keys; i++) { + VOID(rwlock_destroy(&share->key_root_lock[i])); + } + } +#endif + DBUG_ASSERT(share->now_transactional == share->base.born_transactional); + if (share->in_checkpoint == MARIA_CHECKPOINT_LOOKS_AT_ME) + { + share->kfile.file= -1; /* because Checkpoint does not need to flush */ + /* we cannot my_free() the share, Checkpoint would see a bad pointer */ + share->in_checkpoint|= MARIA_CHECKPOINT_SHOULD_FREE_ME; + } + else + share_can_be_freed= TRUE; + } + pthread_mutex_unlock(&THR_LOCK_maria); + pthread_mutex_unlock(&share->intern_lock); + if (share_can_be_freed) + { + VOID(pthread_mutex_destroy(&share->intern_lock)); + my_free((uchar *)share, MYF(0)); + } + if (info->ftparser_param) + { + my_free((uchar*)info->ftparser_param, MYF(0)); + info->ftparser_param= 0; + } + if (info->dfile.file >= 0) + { + /* + This is outside of mutex so would confuse a concurrent + Checkpoint. 
Fortunately in BLOCK_RECORD we close earlier under mutex. + */ + if (my_close(info->dfile.file, MYF(0))) + error = my_errno; + } + + my_free((uchar*) info,MYF(0)); + + if (error) + DBUG_RETURN(my_errno= error); + DBUG_RETURN(0); +} /* maria_close */ diff --git a/storage/maria/ma_commit.c b/storage/maria/ma_commit.c new file mode 100644 index 00000000000..36ea2f6e6e4 --- /dev/null +++ b/storage/maria/ma_commit.c @@ -0,0 +1,124 @@ +/* Copyright (C) 2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#include "trnman.h" + +/** + @brief writes a COMMIT record to log and commits transaction in memory + + @param trn transaction + + @return Operation status + @retval 0 ok + @retval 1 error (disk error or out of memory) +*/ + +int ma_commit(TRN *trn) +{ + int res; + LSN commit_lsn; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS]; + DBUG_ENTER("ma_commit"); + + if (trn->undo_lsn == 0) /* no work done, rollback (cheaper than commit) */ + DBUG_RETURN(trnman_rollback_trn(trn)); + /* + - if COMMIT record is written before trnman_commit_trn(): + if Checkpoint comes in the middle it will see trn is not committed, + then if crash, Recovery might roll back trn (if min(rec_lsn) is after + COMMIT record) and this is not an issue as + * transaction's updates were not made visible to other transactions + * "commit ok" was not sent to client + 
Alternatively, Recovery might commit trn (if min(rec_lsn) is before COMMIT + record), which is ok too. All in all it means that "trn committed" is not + 100% equal to "COMMIT record written". + - if COMMIT record is written after trnman_commit_trn(): + if crash happens between the two, trn will be rolled back which is an + issue (transaction's updates were made visible to other transactions). + So we need to go the first way. + */ + + /** + @todo RECOVERY share's state is written to disk only in + maria_lock_database(), so COMMIT record is not the last record of the + transaction! It is probably an issue. Recovery of the state is a problem + not yet solved. + */ + /* + We do not store "thd->transaction.xid_state.xid" for now, it will be + needed only when we support XA. + */ + res= (translog_write_record(&commit_lsn, LOGREC_COMMIT, + trn, NULL, 0, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL) || + translog_flush(commit_lsn) || + trnman_commit_trn(trn)); + /* + Note: if trnman_commit_trn() fails above, we have already + written the COMMIT record, so Checkpoint and Recovery will see the + transaction as committed. + */ + DBUG_RETURN(res); +} + + +/** + @brief Writes a COMMIT record for a transaciton associated with a file + + @param info Maria handler + + @return Operation status + @retval 0 ok + @retval # error (disk error or out of memory) +*/ + +int maria_commit(MARIA_HA *info) +{ + return info->s->now_transactional ? ma_commit(info->trn) : 0; +} + + +/** + @brief Starts a transaction on a file handle + + @param info Maria handler + + @return Operation status + @retval 0 ok + @retval # Error code. 
+*/ + + +int maria_begin(MARIA_HA *info) +{ + DBUG_ENTER("maria_begin"); + + if (info->s->now_transactional) + { + TRN *trn; + struct st_my_thread_var *mysys_var= my_thread_var; + trn= trnman_new_trn(&mysys_var->mutex, + &mysys_var->suspend, + (char*) &mysys_var + STACK_DIRECTION *1024*128); + if (unlikely(!trn)) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + + DBUG_PRINT("info", ("TRN set to 0x%lx", (ulong) trn)); + info->trn= trn; + } + DBUG_RETURN(0); +} diff --git a/storage/maria/ma_commit.h b/storage/maria/ma_commit.h new file mode 100644 index 00000000000..2c57c73fd7a --- /dev/null +++ b/storage/maria/ma_commit.h @@ -0,0 +1,18 @@ +/* Copyright (C) 2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +C_MODE_START +int ma_commit(TRN *trn); +C_MODE_END diff --git a/storage/maria/ma_control_file.c b/storage/maria/ma_control_file.c new file mode 100644 index 00000000000..3816830d9e1 --- /dev/null +++ b/storage/maria/ma_control_file.c @@ -0,0 +1,325 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3234 Maria control file + First version written by Guilhem Bichot on 2006-04-27. + Does not compile yet. +*/ + +#include "maria_def.h" + +/* Here is the implementation of this module */ + +/* + a control file contains 3 objects: magic string, LSN of last checkpoint, + number of last log. +*/ + +/* total size should be < sector size for atomic write operation */ +#define CONTROL_FILE_MAGIC_STRING "\xfe\xfe\xc\1MACF" +#define CONTROL_FILE_MAGIC_STRING_OFFSET 0 +#define CONTROL_FILE_MAGIC_STRING_SIZE (sizeof(CONTROL_FILE_MAGIC_STRING)-1) +#define CONTROL_FILE_CHECKSUM_OFFSET (CONTROL_FILE_MAGIC_STRING_OFFSET + CONTROL_FILE_MAGIC_STRING_SIZE) +#define CONTROL_FILE_CHECKSUM_SIZE 4 +#define CONTROL_FILE_LSN_OFFSET (CONTROL_FILE_CHECKSUM_OFFSET + CONTROL_FILE_CHECKSUM_SIZE) +#define CONTROL_FILE_LSN_SIZE LSN_STORE_SIZE +#define CONTROL_FILE_FILENO_OFFSET (CONTROL_FILE_LSN_OFFSET + CONTROL_FILE_LSN_SIZE) +#define CONTROL_FILE_FILENO_SIZE 4 +#define CONTROL_FILE_SIZE (CONTROL_FILE_FILENO_OFFSET + CONTROL_FILE_FILENO_SIZE) + +/* This module owns these two vars. */ +/** + This LSN serves for the two-checkpoint rule, and also to find the + checkpoint record when doing a recovery. +*/ +LSN last_checkpoint_lsn= LSN_IMPOSSIBLE; +uint32 last_logno= FILENO_IMPOSSIBLE; + +/** + @brief If log's lock should be asserted when writing to control file. + + Can be re-used by any function which needs to be thread-safe except when + it is called at startup. 
+*/ +my_bool maria_multi_threaded= FALSE; +/** @brief if currently doing a recovery */ +my_bool maria_in_recovery= FALSE; + +/* + Control file is less than 512 bytes (a disk sector), + to be as atomic as possible +*/ +static int control_file_fd= -1; + +/* + @brief Initialize control file subsystem + + Looks for the control file. If none and we are not in recovery, creates file. + If present, reads it to find out last checkpoint's LSN and last log, updates + the last_checkpoint_lsn and last_logno global variables. + Called at engine's start. + + @note + The format of the control file is: + 8 bytes: magic string + 4 bytes: checksum of the following bytes + 3 bytes: number of log where last checkpoint is (NOTE(review): LSN is stored in 3+4 bytes; the 3/4 split shown here is presumed, confirm against lsn_store()) + 4 bytes: offset in log where last checkpoint is + 4 bytes: number of last log + + @note If in recovery, file is not created + + @return Operation status + @retval 0 OK + @retval !=0 Error: a CONTROL_FILE_ERROR value (file is left closed); NOTE(review): when the file is newly created, the 0/1 result of ma_control_file_write_and_force() is returned instead and the file stays open on failure +*/ +CONTROL_FILE_ERROR ma_control_file_create_or_open() +{ + char buffer[CONTROL_FILE_SIZE]; + char name[FN_REFLEN]; + MY_STAT stat_buff; + my_bool create_file; + int open_flags= O_BINARY | /*O_DIRECT |*/ O_RDWR; + int error= CONTROL_FILE_UNKNOWN_ERROR; + DBUG_ENTER("ma_control_file_create_or_open"); + + /* + If you change sizes in the #defines, you at least have to change the + "*store" and "*korr" calls in this file, and can even create backward + compatibility problems. Beware! 
+ */ + DBUG_ASSERT(CONTROL_FILE_LSN_SIZE == (3+4)); + DBUG_ASSERT(CONTROL_FILE_FILENO_SIZE == 4); + + if (control_file_fd >= 0) /* already open */ + DBUG_RETURN(0); + + if (fn_format(name, CONTROL_FILE_BASE_NAME, + maria_data_root, "", MYF(MY_WME)) == NullS) + DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR); + + create_file= test(my_access(name,F_OK)); /* TRUE when my_access() fails, i.e. the file is presumably missing */ + + if (create_file) + { + /* in a recovery, we expect to find a control file */ + if (maria_in_recovery) + DBUG_RETURN(CONTROL_FILE_MISSING); + if ((control_file_fd= my_create(name, 0, + open_flags, MYF(MY_SYNC_DIR))) < 0) + DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR); + + /* + To be safer we should make sure that there are no logs or data/index + files around (indeed it could be that the control file alone was deleted + or not restored, and we should not go on with life at this point). + + TODO: For now we trust (this is alpha version), but for beta it would + be great to verify. + + We could have a tool which can rebuild the control file, by reading the + directory of logs, finding the newest log, reading it to find last + checkpoint... Slow but can save your db. For this to be possible, we + must always write to the control file right after writing the checkpoint + log record, and do nothing in between (i.e. the checkpoint must be + usable as soon as it has been written to the log). + */ + + /* init the file with these "undefined" values; NOTE(review): on failure the descriptor is not closed here, unlike the err: path, and the returned 1 equals CONTROL_FILE_TOO_SMALL */ + DBUG_RETURN(ma_control_file_write_and_force(LSN_IMPOSSIBLE, + FILENO_IMPOSSIBLE, + CONTROL_FILE_UPDATE_ALL)); + } + + /* Otherwise, file exists */ + + if ((control_file_fd= my_open(name, open_flags, MYF(MY_WME))) < 0) + goto err; + + if (my_stat(name, &stat_buff, MYF(MY_WME)) == NULL) + goto err; + + if ((uint)stat_buff.st_size < CONTROL_FILE_SIZE) + { + /* + Given that normally we write only a sector and it's atomic, the only + possibility for a file to be of too short size is if we crashed at the + very first startup, between file creation and file write. 
Quite unlikely + (and can be made even more unlikely by doing this: create a temp file, + write it, and then rename it to be the control file). + What's more likely is if someone forgot to restore the control file, + just did a "touch control" to try to get Maria to start, or if the + disk/filesystem has a problem. + So let's be rigid. + */ + /* + TODO: store a message "too small file" somewhere, so that it goes to + MySQL's error log at startup. + */ + error= CONTROL_FILE_TOO_SMALL; + goto err; + } + + if ((uint)stat_buff.st_size > CONTROL_FILE_SIZE) + { + /* TODO: store "too big file" message; NOTE(review): both size checks compare st_size after a cast to uint, so very large sizes are truncated before comparison */ + error= CONTROL_FILE_TOO_BIG; + goto err; + } + + if (my_read(control_file_fd, buffer, CONTROL_FILE_SIZE, + MYF(MY_FNABP | MY_WME))) + goto err; + if (memcmp(buffer + CONTROL_FILE_MAGIC_STRING_OFFSET, + CONTROL_FILE_MAGIC_STRING, CONTROL_FILE_MAGIC_STRING_SIZE)) + { + /* TODO: store message "bad magic string" somewhere */ + error= CONTROL_FILE_BAD_MAGIC_STRING; + goto err; + } + if (my_checksum(0, buffer + CONTROL_FILE_LSN_OFFSET, + CONTROL_FILE_SIZE - CONTROL_FILE_LSN_OFFSET) != + uint4korr(buffer + CONTROL_FILE_CHECKSUM_OFFSET)) /* checksum covers the LSN and log-number bytes only */ + { + /* TODO: store message "checksum mismatch" somewhere */ + error= CONTROL_FILE_BAD_CHECKSUM; + goto err; + } + last_checkpoint_lsn= lsn_korr(buffer + CONTROL_FILE_LSN_OFFSET); + last_logno= uint4korr(buffer + CONTROL_FILE_FILENO_OFFSET); + + DBUG_RETURN(0); +err: + ma_control_file_end(); /* closes the file (if open) and resets last_checkpoint_lsn and last_logno */ + DBUG_RETURN(error); +} + + +/* + Write information durably to the control file; stores this information into + the last_checkpoint_lsn and last_logno global variables. + Called when we have created a new log (after syncing this log's creation) + and when we have written a checkpoint (after syncing this log record). + Variables last_checkpoint_lsn and last_logno must be protected by caller + using log's lock, unless this function is called at startup. 
+ + SYNOPSIS + ma_control_file_write_and_force() + checkpoint_lsn LSN of last checkpoint + logno last log file number + objs_to_write which of the arguments should be used as new values + (for example, CONTROL_FILE_UPDATE_ONLY_LSN will not + write the logno argument to the control file and will + not update the last_logno global variable); can be: + CONTROL_FILE_UPDATE_ALL + CONTROL_FILE_UPDATE_ONLY_LSN + CONTROL_FILE_UPDATE_ONLY_LOGNO. + + NOTE + We always want to do one single my_pwrite() here to be as atomic as + possible. + + RETURN + 0 - OK + 1 - Error +*/ + +int ma_control_file_write_and_force(const LSN checkpoint_lsn, uint32 logno, + uint objs_to_write) +{ + char buffer[CONTROL_FILE_SIZE]; + my_bool update_checkpoint_lsn= FALSE, update_logno= FALSE; + DBUG_ENTER("ma_control_file_write_and_force"); + + DBUG_ASSERT(control_file_fd >= 0); /* must be open */ +#ifndef DBUG_OFF + if (maria_multi_threaded) + translog_lock_assert_owner(); +#endif + + memcpy(buffer + CONTROL_FILE_MAGIC_STRING_OFFSET, + CONTROL_FILE_MAGIC_STRING, CONTROL_FILE_MAGIC_STRING_SIZE); /* the whole buffer is rebuilt on every call, magic string included */ + + if (objs_to_write == CONTROL_FILE_UPDATE_ONLY_LSN) + update_checkpoint_lsn= TRUE; + else if (objs_to_write == CONTROL_FILE_UPDATE_ONLY_LOGNO) + update_logno= TRUE; + else if (objs_to_write == CONTROL_FILE_UPDATE_ALL) + update_checkpoint_lsn= update_logno= TRUE; + else /* incorrect value of objs_to_write: no update flag is set, so the old values get rewritten if the assert is compiled out */ + DBUG_ASSERT(0); + + if (update_checkpoint_lsn) + lsn_store(buffer + CONTROL_FILE_LSN_OFFSET, checkpoint_lsn); + else /* store old value == change nothing */ + lsn_store(buffer + CONTROL_FILE_LSN_OFFSET, last_checkpoint_lsn); + + if (update_logno) + int4store(buffer + CONTROL_FILE_FILENO_OFFSET, logno); + else + int4store(buffer + CONTROL_FILE_FILENO_OFFSET, last_logno); /* keep the current log number */ + + { + uint32 sum= (uint32) + my_checksum(0, buffer + CONTROL_FILE_LSN_OFFSET, + CONTROL_FILE_SIZE - CONTROL_FILE_LSN_OFFSET); + int4store(buffer + CONTROL_FILE_CHECKSUM_OFFSET, sum); /* checksum covers the LSN and log number, not the magic string */ + } + + if (my_pwrite(control_file_fd, buffer, 
sizeof(buffer), + 0, MYF(MY_FNABP | MY_WME)) || + my_sync(control_file_fd, MYF(MY_WME))) + DBUG_RETURN(1); /* the globals below stay untouched when the write or the sync fails */ + + if (update_checkpoint_lsn) + last_checkpoint_lsn= checkpoint_lsn; + if (update_logno) + last_logno= logno; + + DBUG_RETURN(0); +} + + +/* + Free resources taken by control file subsystem + + SYNOPSIS + ma_control_file_end(); returns the result of my_close(), or 0 if the file was not open +*/ + +int ma_control_file_end() +{ + int close_error; + DBUG_ENTER("ma_control_file_end"); + + if (control_file_fd < 0) /* already closed */ + DBUG_RETURN(0); + + close_error= my_close(control_file_fd, MYF(MY_WME)); + /* + As my_close() frees structures even if close() fails, we do the same, + i.e. we mark the file as closed in all cases. + */ + control_file_fd= -1; + /* + As this module owns these variables, closing the module forbids access to + them (just a safety): + */ + last_checkpoint_lsn= LSN_IMPOSSIBLE; + last_logno= FILENO_IMPOSSIBLE; + + DBUG_RETURN(close_error); +} diff --git a/storage/maria/ma_control_file.h b/storage/maria/ma_control_file.h new file mode 100644 index 00000000000..88a1780543a --- /dev/null +++ b/storage/maria/ma_control_file.h @@ -0,0 +1,80 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3234 Maria control file + First version written by Guilhem Bichot on 2006-04-27. 
+*/ + +#ifndef _ma_control_file_h +#define _ma_control_file_h + +#define CONTROL_FILE_BASE_NAME "maria_log_control" + +/* Here is the interface of this module */ + +/* + LSN of the last checkpoint + (if last_checkpoint_lsn == LSN_IMPOSSIBLE then there was never a checkpoint) +*/ +extern LSN last_checkpoint_lsn; +/* + Last log number (if last_logno == FILENO_IMPOSSIBLE then there is no log + file yet) +*/ +extern uint32 last_logno; + +extern my_bool maria_multi_threaded, maria_in_recovery; + +typedef enum enum_control_file_error { + CONTROL_FILE_OK= 0, + CONTROL_FILE_TOO_SMALL, /* file is shorter than the expected control file size */ + CONTROL_FILE_TOO_BIG, /* file is longer than the expected control file size */ + CONTROL_FILE_BAD_MAGIC_STRING, /* leading magic string does not match */ + CONTROL_FILE_BAD_CHECKSUM, /* stored checksum does not match the computed one */ + CONTROL_FILE_MISSING, /* no control file found while in recovery */ + CONTROL_FILE_UNKNOWN_ERROR /* any other error */ +} CONTROL_FILE_ERROR; + +#define CONTROL_FILE_UPDATE_ALL 0 /* these three are the values accepted for the objs_to_write argument of ma_control_file_write_and_force() */ +#define CONTROL_FILE_UPDATE_ONLY_LSN 1 +#define CONTROL_FILE_UPDATE_ONLY_LOGNO 2 + +#ifdef __cplusplus +extern "C" { +#endif + +/* + Looks for the control file. If none and not in recovery, creates file. + If present, reads it to find out last checkpoint's LSN and last log. + Called at engine's start. +*/ +CONTROL_FILE_ERROR ma_control_file_create_or_open(); +/* + Write information durably to the control file. + Called when we have created a new log (after syncing this log's creation) + and when we have written a checkpoint (after syncing this log record). 
+*/ +int ma_control_file_write_and_force(const LSN checkpoint_lsn, uint32 logno, + uint objs_to_write); + + +/* Free resources taken by control file subsystem */ +int ma_control_file_end(); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/storage/maria/ma_create.c b/storage/maria/ma_create.c new file mode 100644 index 00000000000..ba1d9a13b42 --- /dev/null +++ b/storage/maria/ma_create.c @@ -0,0 +1,1279 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Create a MARIA table */ + +#include "ma_ftdefs.h" +#include "ma_sp_defs.h" +#include <my_bit.h> +#include "ma_blockrec.h" +#include "trnman_public.h" + +#if defined(MSDOS) || defined(__WIN__) +#ifdef __WIN__ +#include <fcntl.h> +#else +#include <process.h> /* Prototype for getpid */ +#endif +#endif +#include <m_ctype.h> + +static int compare_columns(MARIA_COLUMNDEF **a, MARIA_COLUMNDEF **b); + +/* + Old options is used when recreating database, from maria_chk +*/ + +int maria_create(const char *name, enum data_file_type datafile_type, + uint keys,MARIA_KEYDEF *keydefs, + uint columns, MARIA_COLUMNDEF *columndef, + uint uniques, MARIA_UNIQUEDEF *uniquedefs, + MARIA_CREATE_INFO *ci,uint flags) +{ + register uint i,j; + File dfile,file; + int errpos,save_errno, create_mode= O_RDWR | O_TRUNC, res; + myf create_flag; + uint 
length,max_key_length,packed,pack_bytes,pointer,real_length_diff, + key_length,info_length,key_segs,options,min_key_length_skip, + base_pos,long_varchar_count,varchar_length, + unique_key_parts,fulltext_keys,offset, not_block_record_extra_length; + uint max_field_lengths, extra_header_size; + ulong reclength, real_reclength,min_pack_length; + char filename[FN_REFLEN], linkname[FN_REFLEN], *linkname_ptr; + ulong pack_reclength; + ulonglong tot_length,max_rows, tmp; + enum en_fieldtype type; + enum data_file_type org_datafile_type= datafile_type; + MARIA_SHARE share; + MARIA_KEYDEF *keydef,tmp_keydef; + MARIA_UNIQUEDEF *uniquedef; + HA_KEYSEG *keyseg,tmp_keyseg; + MARIA_COLUMNDEF *column, *end_column; + ulong *rec_per_key_part; + my_off_t key_root[HA_MAX_POSSIBLE_KEY], kfile_size_before_extension; + MARIA_CREATE_INFO tmp_create_info; + my_bool tmp_table= FALSE; /* cache for presence of HA_OPTION_TMP_TABLE */ + my_bool forced_packed; + myf sync_dir= 0; + uchar *log_data= NULL; + DBUG_ENTER("maria_create"); + DBUG_PRINT("enter", ("keys: %u columns: %u uniques: %u flags: %u", + keys, columns, uniques, flags)); + + DBUG_ASSERT(maria_block_size && maria_block_size % IO_SIZE == 0); + LINT_INIT(dfile); + LINT_INIT(file); + + if (!ci) + { + bzero((char*) &tmp_create_info,sizeof(tmp_create_info)); + ci=&tmp_create_info; + } + + if (keys + uniques > MARIA_MAX_KEY || columns == 0) + { + DBUG_RETURN(my_errno=HA_WRONG_CREATE_OPTION); + } + errpos=0; + options=0; + bzero((uchar*) &share,sizeof(share)); + + if (flags & HA_DONT_TOUCH_DATA) + { + org_datafile_type= ci->org_data_file_type; + if (!(ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD)) + options=ci->old_options & + (HA_OPTION_COMPRESS_RECORD | HA_OPTION_PACK_RECORD | + HA_OPTION_READ_ONLY_DATA | HA_OPTION_CHECKSUM | + HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE); + else + { + /* Uncompressing rows */ + options=ci->old_options & + (HA_OPTION_CHECKSUM | HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE); + } + } + + if 
(ci->reloc_rows > ci->max_rows) + ci->reloc_rows=ci->max_rows; /* Check if wrong parameter */ + + if (!(rec_per_key_part= + (ulong*) my_malloc((keys + uniques)*HA_MAX_KEY_SEG*sizeof(long), + MYF(MY_WME | MY_ZEROFILL)))) + DBUG_RETURN(my_errno); + + /* Start by checking fields and field-types used */ + + varchar_length=long_varchar_count=packed= not_block_record_extra_length= + pack_reclength= max_field_lengths= 0; + reclength= min_pack_length= ci->null_bytes; + forced_packed= 0; + + for (column= columndef, end_column= column + columns ; + column != end_column ; + column++) + { + /* Fill in not used struct parts */ + column->offset= reclength; + column->empty_pos= 0; + column->empty_bit= 0; + column->fill_length= column->length; + if (column->null_bit) + options|= HA_OPTION_NULL_FIELDS; + + reclength+= column->length; + type= column->type; + if (type == FIELD_SKIP_PRESPACE && datafile_type == BLOCK_RECORD) + type= FIELD_NORMAL; /* SKIP_PRESPACE not supported */ + + if (type != FIELD_NORMAL && type != FIELD_CHECK) + { + column->empty_pos= packed/8; + column->empty_bit= (1 << (packed & 7)); + if (type == FIELD_BLOB) + { + forced_packed= 1; + packed++; + share.base.blobs++; + if (pack_reclength != INT_MAX32) + { + if (column->length == 4+portable_sizeof_char_ptr) + pack_reclength= INT_MAX32; + else + { + /* Add max possible blob length */ + pack_reclength+= (1 << ((column->length- + portable_sizeof_char_ptr)*8)); + } + } + max_field_lengths+= (column->length - portable_sizeof_char_ptr); + } + else if (type == FIELD_SKIP_PRESPACE || + type == FIELD_SKIP_ENDSPACE) + { + forced_packed= 1; + max_field_lengths+= column->length > 255 ? 
2 : 1; + not_block_record_extra_length++; + packed++; + } + else if (type == FIELD_VARCHAR) + { + varchar_length+= column->length-1; /* Used for min_pack_length */ + pack_reclength++; + not_block_record_extra_length++; + max_field_lengths++; + packed++; + column->fill_length= 1; + /* We must test for 257 as length includes pack-length */ + if (test(column->length >= 257)) + { + long_varchar_count++; + max_field_lengths++; + column->fill_length= 2; + } + } + else if (type == FIELD_SKIP_ZERO) + packed++; + else + { + if (!column->null_bit) + min_pack_length+= column->length; + else + not_block_record_extra_length+= column->length; + column->empty_pos= 0; + column->empty_bit= 0; + } + } + else /* FIELD_NORMAL */ + { + if (!column->null_bit) + { + min_pack_length+= column->length; + share.base.fixed_not_null_fields++; + share.base.fixed_not_null_fields_length+= column->length; + } + else + not_block_record_extra_length+= column->length; + } + } + + if (datafile_type == STATIC_RECORD && forced_packed) + { + /* Can't use fixed length records, revert to block records */ + datafile_type= BLOCK_RECORD; + } + + if (datafile_type == DYNAMIC_RECORD) + options|= HA_OPTION_PACK_RECORD; /* Must use packed records */ + + if (datafile_type == STATIC_RECORD) + { + /* We can't use checksum with static length rows */ + flags&= ~HA_CREATE_CHECKSUM; + options&= ~HA_OPTION_CHECKSUM; + min_pack_length+= varchar_length; + packed= 0; + } + if (datafile_type != BLOCK_RECORD) + min_pack_length+= not_block_record_extra_length; + + if ((packed & 7) == 1) + { + /* + Not optimal packing, try to remove a 1 uchar length zero-field as + this will get same record length, but smaller pack overhead + */ + while (column != columndef) + { + column--; + if (column->type == (int) FIELD_SKIP_ZERO && column->length == 1) + { + /* + NOTE1: here we change a field type FIELD_SKIP_ZERO -> + FIELD_NORMAL + */ + column->type=(int) FIELD_NORMAL; + column->empty_pos= 0; + column->empty_bit= 0; + packed--; + 
min_pack_length++; + break; + } + } + } + + if (flags & HA_CREATE_TMP_TABLE) + { + options|= HA_OPTION_TMP_TABLE; + tmp_table= TRUE; + create_mode|= O_EXCL | O_NOFOLLOW; + /* "CREATE TEMPORARY" tables are not crash-safe (dropped at restart) */ + ci->transactional= FALSE; + } + share.base.null_bytes= ci->null_bytes; + share.base.original_null_bytes= ci->null_bytes; + share.base.born_transactional= ci->transactional; + share.base.max_field_lengths= max_field_lengths; + share.base.field_offsets= 0; /* for future */ + + if (pack_reclength != INT_MAX32) + pack_reclength+= max_field_lengths + long_varchar_count; + + if (flags & HA_CREATE_CHECKSUM || (options & HA_OPTION_CHECKSUM)) + { + options|= HA_OPTION_CHECKSUM; + min_pack_length++; + pack_reclength++; + } + if (flags & HA_CREATE_DELAY_KEY_WRITE) + options|= HA_OPTION_DELAY_KEY_WRITE; + if (flags & HA_CREATE_RELIES_ON_SQL_LAYER) + options|= HA_OPTION_RELIES_ON_SQL_LAYER; + + pack_bytes= (packed + 7) / 8; + if (pack_reclength != INT_MAX32) + pack_reclength+= reclength+pack_bytes + + test(test_all_bits(options, HA_OPTION_CHECKSUM | HA_PACK_RECORD)); + min_pack_length+= pack_bytes; + /* Calculate min possible row length for rows-in-block */ + extra_header_size= MAX_FIXED_HEADER_SIZE; + if (ci->transactional) + { + extra_header_size= TRANS_MAX_FIXED_HEADER_SIZE; + DBUG_PRINT("info",("creating a transactional table")); + } + share.base.min_row_length= (extra_header_size + share.base.null_bytes + + pack_bytes); + if (!ci->data_file_length && ci->max_rows) + { + if (pack_reclength == INT_MAX32 || + (~(ulonglong) 0)/ci->max_rows < (ulonglong) pack_reclength) + ci->data_file_length= ~(ulonglong) 0; + else + ci->data_file_length=(ulonglong) ci->max_rows*pack_reclength; + } + else if (!ci->max_rows) + { + if (datafile_type == BLOCK_RECORD) + { + uint rows_per_page= ((maria_block_size - PAGE_OVERHEAD_SIZE) / + (min_pack_length + extra_header_size + + DIR_ENTRY_SIZE)); + ulonglong data_file_length= ci->data_file_length; + if 
(!data_file_length) + data_file_length= ((((ulonglong) 1 << ((BLOCK_RECORD_POINTER_SIZE-1) * + 8)) -1)); + if (rows_per_page > 0) + { + set_if_smaller(rows_per_page, MAX_ROWS_PER_PAGE); + ci->max_rows= data_file_length / maria_block_size * rows_per_page; + } + else + ci->max_rows= data_file_length / (min_pack_length + + extra_header_size + + DIR_ENTRY_SIZE); + } + else + ci->max_rows=(ha_rows) (ci->data_file_length/(min_pack_length + + ((options & + HA_OPTION_PACK_RECORD) ? + 3 : 0))); + } + max_rows= (ulonglong) ci->max_rows; + if (datafile_type == BLOCK_RECORD) + { + /* The + 1 is for record position withing page */ + pointer= maria_get_pointer_length((ci->data_file_length / + maria_block_size), 3) + 1; + set_if_smaller(pointer, BLOCK_RECORD_POINTER_SIZE); + + if (!max_rows) + max_rows= (((((ulonglong) 1 << ((pointer-1)*8)) -1) * maria_block_size) / + min_pack_length); + } + else + { + if (datafile_type != STATIC_RECORD) + pointer= maria_get_pointer_length(ci->data_file_length, + maria_data_pointer_size); + else + pointer= maria_get_pointer_length(ci->max_rows, maria_data_pointer_size); + if (!max_rows) + max_rows= ((((ulonglong) 1 << (pointer*8)) -1) / min_pack_length); + } + + real_reclength=reclength; + if (datafile_type == STATIC_RECORD) + { + if (reclength <= pointer) + reclength=pointer+1; /* reserve place for delete link */ + } + else + reclength+= long_varchar_count; /* We need space for varchar! 
*/ + + max_key_length=0; tot_length=0 ; key_segs=0; + fulltext_keys=0; + share.state.rec_per_key_part=rec_per_key_part; + share.state.key_root=key_root; + share.state.key_del= HA_OFFSET_ERROR; + if (uniques) + max_key_length= MARIA_UNIQUE_HASH_LENGTH + pointer; + + for (i=0, keydef=keydefs ; i < keys ; i++ , keydef++) + { + share.state.key_root[i]= HA_OFFSET_ERROR; + min_key_length_skip=length=real_length_diff=0; + key_length=pointer; + if (keydef->flag & HA_SPATIAL) + { +#ifdef HAVE_SPATIAL + /* BAR TODO to support 3D and more dimensions in the future */ + uint sp_segs=SPDIMS*2; + keydef->flag=HA_SPATIAL; + + if (flags & HA_DONT_TOUCH_DATA) + { + /* + Called by maria_chk - i.e. table structure was taken from + MYI file and SPATIAL key *does have* additional sp_segs keysegs. + keydef->seg here points right at the GEOMETRY segment, + so we only need to decrease keydef->keysegs. + (see maria_recreate_table() in _ma_check.c) + */ + keydef->keysegs-=sp_segs-1; + } + + for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ; + j++, keyseg++) + { + if (keyseg->type != HA_KEYTYPE_BINARY && + keyseg->type != HA_KEYTYPE_VARBINARY1 && + keyseg->type != HA_KEYTYPE_VARBINARY2) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + } + keydef->keysegs+=sp_segs; + key_length+=SPLEN*sp_segs; + length++; /* At least one length uchar */ + min_key_length_skip+=SPLEN*2*SPDIMS; +#else + my_errno= HA_ERR_UNSUPPORTED; + goto err_no_lock; +#endif /*HAVE_SPATIAL*/ + } + else if (keydef->flag & HA_FULLTEXT) + { + keydef->flag=HA_FULLTEXT | HA_PACK_KEY | HA_VAR_LENGTH_KEY; + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + + for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ; + j++, keyseg++) + { + if (keyseg->type != HA_KEYTYPE_TEXT && + keyseg->type != HA_KEYTYPE_VARTEXT1 && + keyseg->type != HA_KEYTYPE_VARTEXT2) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + if (!(keyseg->flag & HA_BLOB_PART) && + (keyseg->type == HA_KEYTYPE_VARTEXT1 || + 
keyseg->type == HA_KEYTYPE_VARTEXT2)) + { + /* Make a flag that this is a VARCHAR */ + keyseg->flag|= HA_VAR_LENGTH_PART; + /* Store in bit_start number of bytes used to pack the length */ + keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1)? + 1 : 2); + } + } + + fulltext_keys++; + key_length+= HA_FT_MAXBYTELEN+HA_FT_WLEN; + length++; /* At least one length uchar */ + min_key_length_skip+=HA_FT_MAXBYTELEN; + real_length_diff=HA_FT_MAXBYTELEN-FT_MAX_WORD_LEN_FOR_SORT; + } + else + { + /* Test if prefix compression */ + if (keydef->flag & HA_PACK_KEY) + { + /* Can't use space_compression on number keys */ + if ((keydef->seg[0].flag & HA_SPACE_PACK) && + keydef->seg[0].type == (int) HA_KEYTYPE_NUM) + keydef->seg[0].flag&= ~HA_SPACE_PACK; + + /* Only use HA_PACK_KEY when first segment is a variable length key */ + if (!(keydef->seg[0].flag & (HA_SPACE_PACK | HA_BLOB_PART | + HA_VAR_LENGTH_PART))) + { + /* pack relative to previous key */ + keydef->flag&= ~HA_PACK_KEY; + keydef->flag|= HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY; + } + else + { + keydef->seg[0].flag|=HA_PACK_KEY; /* for easyer intern test */ + keydef->flag|=HA_VAR_LENGTH_KEY; + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + } + } + if (keydef->flag & HA_BINARY_PACK_KEY) + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + + if (keydef->flag & HA_AUTO_KEY && ci->with_auto_increment) + share.base.auto_key=i+1; + for (j=0, keyseg=keydef->seg ; j < keydef->keysegs ; j++, keyseg++) + { + /* numbers are stored with high by first to make compression easier */ + switch (keyseg->type) { + case HA_KEYTYPE_SHORT_INT: + case HA_KEYTYPE_LONG_INT: + case HA_KEYTYPE_FLOAT: + case HA_KEYTYPE_DOUBLE: + case HA_KEYTYPE_USHORT_INT: + case HA_KEYTYPE_ULONG_INT: + case HA_KEYTYPE_LONGLONG: + case HA_KEYTYPE_ULONGLONG: + case HA_KEYTYPE_INT24: + case HA_KEYTYPE_UINT24: + case HA_KEYTYPE_INT8: + keyseg->flag|= HA_SWAP_KEY; + break; + case HA_KEYTYPE_VARTEXT1: + case HA_KEYTYPE_VARTEXT2: + case 
HA_KEYTYPE_VARBINARY1: + case HA_KEYTYPE_VARBINARY2: + if (!(keyseg->flag & HA_BLOB_PART)) + { + /* Make a flag that this is a VARCHAR */ + keyseg->flag|= HA_VAR_LENGTH_PART; + /* Store in bit_start number of bytes used to pack the length */ + keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1 || + keyseg->type == HA_KEYTYPE_VARBINARY1) ? + 1 : 2); + } + break; + default: + break; + } + if (keyseg->flag & HA_SPACE_PACK) + { + DBUG_ASSERT(!(keyseg->flag & HA_VAR_LENGTH_PART)); + keydef->flag |= HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY; + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + length++; /* At least one length uchar */ + min_key_length_skip+=keyseg->length; + if (keyseg->length >= 255) + { /* prefix may be 3 bytes */ + min_key_length_skip+=2; + length+=2; + } + } + if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART)) + { + DBUG_ASSERT(!test_all_bits(keyseg->flag, + (HA_VAR_LENGTH_PART | HA_BLOB_PART))); + keydef->flag|=HA_VAR_LENGTH_KEY; + length++; /* At least one length uchar */ + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + min_key_length_skip+=keyseg->length; + if (keyseg->length >= 255) + { /* prefix may be 3 bytes */ + min_key_length_skip+=2; + length+=2; + } + } + key_length+= keyseg->length; + if (keyseg->null_bit) + { + key_length++; + options|=HA_OPTION_PACK_KEYS; + keyseg->flag|=HA_NULL_PART; + keydef->flag|=HA_VAR_LENGTH_KEY | HA_NULL_PART_KEY; + } + } + } /* if HA_FULLTEXT */ + key_segs+=keydef->keysegs; + if (keydef->keysegs > HA_MAX_KEY_SEG) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + /* + key_segs may be 0 in the case when we only want to be able to + add on row into the table. 
This can happen with some DISTINCT queries + in MySQL + */ + if ((keydef->flag & (HA_NOSAME | HA_NULL_PART_KEY)) == HA_NOSAME && + key_segs) + share.state.rec_per_key_part[key_segs-1]=1L; + length+=key_length; + /* + A key can't be longer than than half a index block (as we have + to be able to put at least 2 keys on an index block for the key + algorithms to work). + */ + if (length > maria_max_key_length()) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + keydef->block_length= maria_block_size; + keydef->keylength= (uint16) key_length; + keydef->minlength= (uint16) (length-min_key_length_skip); + keydef->maxlength= (uint16) length; + + if (length > max_key_length) + max_key_length= length; + tot_length+= ((max_rows/(ulong) (((uint) maria_block_size-5)/ + (length*2))) * + maria_block_size); + } + + unique_key_parts=0; + offset=reclength-uniques*MARIA_UNIQUE_HASH_LENGTH; + for (i=0, uniquedef=uniquedefs ; i < uniques ; i++ , uniquedef++) + { + uniquedef->key=keys+i; + unique_key_parts+=uniquedef->keysegs; + share.state.key_root[keys+i]= HA_OFFSET_ERROR; + tot_length+= (max_rows/(ulong) (((uint) maria_block_size-5)/ + ((MARIA_UNIQUE_HASH_LENGTH + pointer)*2)))* + (ulong) maria_block_size; + } + keys+=uniques; /* Each unique has 1 key */ + key_segs+=uniques; /* Each unique has 1 key seg */ + + base_pos=(MARIA_STATE_INFO_SIZE + keys * MARIA_STATE_KEY_SIZE + + key_segs * MARIA_STATE_KEYSEG_SIZE); + info_length= base_pos+(uint) (MARIA_BASE_INFO_SIZE+ + keys * MARIA_KEYDEF_SIZE+ + uniques * MARIA_UNIQUEDEF_SIZE + + (key_segs + unique_key_parts)*HA_KEYSEG_SIZE+ + columns*MARIA_COLUMNDEF_SIZE); + + DBUG_PRINT("info", ("info_length: %u", info_length)); + /* There are only 16 bits for the total header length. 
*/ + if (info_length > 65535) + { + my_printf_error(0, "Maria table '%s' has too many columns and/or " + "indexes and/or unique constraints.", + MYF(0), name + dirname_length(name)); + my_errno= HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + + bmove(share.state.header.file_version,(uchar*) maria_file_magic,4); + ci->old_options=options| (ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD ? + HA_OPTION_COMPRESS_RECORD | + HA_OPTION_TEMP_COMPRESS_RECORD: 0); + mi_int2store(share.state.header.options,ci->old_options); + mi_int2store(share.state.header.header_length,info_length); + mi_int2store(share.state.header.state_info_length,MARIA_STATE_INFO_SIZE); + mi_int2store(share.state.header.base_info_length,MARIA_BASE_INFO_SIZE); + mi_int2store(share.state.header.base_pos,base_pos); + share.state.header.data_file_type= share.data_file_type= datafile_type; + share.state.header.org_data_file_type= org_datafile_type; + share.state.header.language= (ci->language ? + ci->language : default_charset_info->number); + + share.state.dellink = HA_OFFSET_ERROR; + share.state.first_bitmap_with_space= 0; + share.state.process= (ulong) getpid(); + share.state.unique= (ulong) 0; + share.state.update_count=(ulong) 0; + share.state.version= (ulong) time((time_t*) 0); + share.state.sortkey= (ushort) ~0; + share.state.auto_increment=ci->auto_increment; + share.options=options; + share.base.rec_reflength=pointer; + share.base.block_size= maria_block_size; + + /* Get estimate for index file length (this may be wrong for FT keys) */ + tmp= (tot_length + maria_block_size * keys * + MARIA_INDEX_BLOCK_MARGIN) / maria_block_size; + /* + use maximum of key_file_length we calculated and key_file_length value we + got from MYI file header (see also mariapack.c:save_state) + */ + share.base.key_reflength= + maria_get_pointer_length(max(ci->key_file_length,tmp),3); + share.base.keys= share.state.header.keys= keys; + share.state.header.uniques= uniques; + share.state.header.fulltext_keys= fulltext_keys; 
+ mi_int2store(share.state.header.key_parts,key_segs); + mi_int2store(share.state.header.unique_key_parts,unique_key_parts); + + maria_set_all_keys_active(share.state.key_map, keys); + + share.base.keystart = share.state.state.key_file_length= + MY_ALIGN(info_length, maria_block_size); + share.base.max_key_block_length= maria_block_size; + share.base.max_key_length=ALIGN_SIZE(max_key_length+4); + share.base.records=ci->max_rows; + share.base.reloc= ci->reloc_rows; + share.base.reclength=real_reclength; + share.base.pack_reclength=reclength+ test(options & HA_OPTION_CHECKSUM); + share.base.max_pack_length=pack_reclength; + share.base.min_pack_length=min_pack_length; + share.base.pack_bytes= pack_bytes; + share.base.fields= columns; + share.base.pack_fields= packed; + + if (share.data_file_type == BLOCK_RECORD) + { + /* + we are going to create a first bitmap page, set data_file_length + to reflect this, before the state goes to disk + */ + share.state.state.data_file_length= maria_block_size; + /* Add length of packed fields + length */ + share.base.pack_reclength+= share.base.max_field_lengths+3; + + } + + /* max_data_file_length and max_key_file_length are recalculated on open */ + if (tmp_table) + share.base.max_data_file_length= (my_off_t) ci->data_file_length; + else if (ci->transactional && translog_inited && !maria_in_recovery) + { + /* + we have checked translog_inited above, because maria_chk may call us + (via maria_recreate_table()) and it does not have a log. + */ + sync_dir= MY_SYNC_DIR; + } + + if (datafile_type == BLOCK_RECORD) + share.base.min_block_length= share.base.min_row_length; + else + { + share.base.min_block_length= + (share.base.pack_reclength+3 < MARIA_EXTEND_BLOCK_LENGTH && + ! share.base.blobs) ? + max(share.base.pack_reclength,MARIA_MIN_BLOCK_LENGTH) : + MARIA_EXTEND_BLOCK_LENGTH; + } + if (! 
(flags & HA_DONT_TOUCH_DATA)) + share.state.create_time= (long) time((time_t*) 0); + + pthread_mutex_lock(&THR_LOCK_maria); + + /* + NOTE: For test_if_reopen() we need a real path name. Hence we need + MY_RETURN_REAL_PATH for every fn_format(filename, ...). + */ + if (ci->index_file_name) + { + char *iext= strrchr(ci->index_file_name, '.'); + int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT); + if (tmp_table) + { + char *path; + /* chop off the table name, tempory tables use generated name */ + if ((path= strrchr(ci->index_file_name, FN_LIBCHAR))) + *path= '\0'; + fn_format(filename, name, ci->index_file_name, MARIA_NAME_IEXT, + MY_REPLACE_DIR | MY_UNPACK_FILENAME | + MY_RETURN_REAL_PATH | MY_APPEND_EXT); + } + else + { + fn_format(filename, ci->index_file_name, "", MARIA_NAME_IEXT, + MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH | + (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT)); + } + fn_format(linkname, name, "", MARIA_NAME_IEXT, + MY_UNPACK_FILENAME|MY_APPEND_EXT); + linkname_ptr= linkname; + /* + Don't create the table if the link or file exists to ensure that one + doesn't accidently destroy another table. + Don't sync dir now if the data file has the same path. + */ + create_flag= + (ci->data_file_name && + !strcmp(ci->index_file_name, ci->data_file_name)) ? 0 : sync_dir; + } + else + { + char *iext= strrchr(name, '.'); + int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT); + fn_format(filename, name, "", MARIA_NAME_IEXT, + MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH | + (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT)); + linkname_ptr= NullS; + /* + Replace the current file. + Don't sync dir now if the data file has the same path. + */ + create_flag= MY_DELETE_OLD | (!ci->data_file_name ? 0 : sync_dir); + } + + /* + If a MRG_MARIA table is in use, the mapped MARIA tables are open, + but no entry is made in the table cache for them. + A TRUNCATE command checks for the table in the cache only and could + be fooled to believe, the table is not open. 
+ Pull the emergency brake in this situation. (Bug #8306) + + + NOTE: The filename is compared against unique_file_name of every + open table. Hence we need a real path here. + */ + if (_ma_test_if_reopen(filename)) + { + my_printf_error(0, "MARIA table '%s' is in use " + "(most likely by a MERGE table). Try FLUSH TABLES.", + MYF(0), name + dirname_length(name)); + goto err; + } + + if ((file= my_create_with_symlink(linkname_ptr, filename, 0, create_mode, + MYF(MY_WME|create_flag))) < 0) + goto err; + errpos=1; + + DBUG_PRINT("info", ("write state info and base info")); + if (_ma_state_info_write_sub(file, &share.state, 2) || + _ma_base_info_write(file, &share.base)) + goto err; + DBUG_PRINT("info", ("base_pos: %d base_info_size: %d", + base_pos, MARIA_BASE_INFO_SIZE)); + DBUG_ASSERT(my_tell(file,MYF(0)) == base_pos+ MARIA_BASE_INFO_SIZE); + + /* Write key and keyseg definitions */ + DBUG_PRINT("info", ("write key and keyseg definitions")); + for (i=0 ; i < share.base.keys - uniques; i++) + { + uint sp_segs=(keydefs[i].flag & HA_SPATIAL) ? 
2*SPDIMS : 0; + + if (_ma_keydef_write(file, &keydefs[i])) + goto err; + for (j=0 ; j < keydefs[i].keysegs-sp_segs ; j++) + if (_ma_keyseg_write(file, &keydefs[i].seg[j])) + goto err; +#ifdef HAVE_SPATIAL + for (j=0 ; j < sp_segs ; j++) + { + HA_KEYSEG sseg; + sseg.type=SPTYPE; + sseg.language= 7; /* Binary */ + sseg.null_bit=0; + sseg.bit_start=0; + sseg.bit_end=0; + sseg.bit_length= 0; + sseg.bit_pos= 0; + sseg.length=SPLEN; + sseg.null_pos=0; + sseg.start=j*SPLEN; + sseg.flag= HA_SWAP_KEY; + if (_ma_keyseg_write(file, &sseg)) + goto err; + } +#endif + } + /* Create extra keys for unique definitions */ + offset=reclength-uniques*MARIA_UNIQUE_HASH_LENGTH; + bzero((char*) &tmp_keydef,sizeof(tmp_keydef)); + bzero((char*) &tmp_keyseg,sizeof(tmp_keyseg)); + for (i=0; i < uniques ; i++) + { + tmp_keydef.keysegs=1; + tmp_keydef.flag= HA_UNIQUE_CHECK; + tmp_keydef.block_length= (uint16) maria_block_size; + tmp_keydef.keylength= MARIA_UNIQUE_HASH_LENGTH + pointer; + tmp_keydef.minlength=tmp_keydef.maxlength=tmp_keydef.keylength; + tmp_keyseg.type= MARIA_UNIQUE_HASH_TYPE; + tmp_keyseg.length= MARIA_UNIQUE_HASH_LENGTH; + tmp_keyseg.start= offset; + offset+= MARIA_UNIQUE_HASH_LENGTH; + if (_ma_keydef_write(file,&tmp_keydef) || + _ma_keyseg_write(file,(&tmp_keyseg))) + goto err; + } + + /* Save unique definition */ + DBUG_PRINT("info", ("write unique definitions")); + for (i=0 ; i < share.state.header.uniques ; i++) + { + HA_KEYSEG *keyseg_end; + keyseg= uniquedefs[i].seg; + if (_ma_uniquedef_write(file, &uniquedefs[i])) + goto err; + for (keyseg= uniquedefs[i].seg, keyseg_end= keyseg+ uniquedefs[i].keysegs; + keyseg < keyseg_end; + keyseg++) + { + switch (keyseg->type) { + case HA_KEYTYPE_VARTEXT1: + case HA_KEYTYPE_VARTEXT2: + case HA_KEYTYPE_VARBINARY1: + case HA_KEYTYPE_VARBINARY2: + if (!(keyseg->flag & HA_BLOB_PART)) + { + keyseg->flag|= HA_VAR_LENGTH_PART; + keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1 || + keyseg->type == HA_KEYTYPE_VARBINARY1) ? 
+ 1 : 2); + } + break; + default: + DBUG_ASSERT((keyseg->flag & HA_VAR_LENGTH_PART) == 0); + break; + } + if (_ma_keyseg_write(file, keyseg)) + goto err; + } + } + DBUG_PRINT("info", ("write field definitions")); + if (datafile_type == BLOCK_RECORD) + { + /* Store columns in a more efficent order */ + MARIA_COLUMNDEF **col_order, **pos; + if (!(col_order= (MARIA_COLUMNDEF**) my_malloc(share.base.fields * + sizeof(MARIA_COLUMNDEF*), + MYF(MY_WME)))) + goto err; + for (column= columndef, pos= col_order ; + column != end_column ; + column++, pos++) + *pos= column; + qsort(col_order, share.base.fields, sizeof(*col_order), + (qsort_cmp) compare_columns); + for (i=0 ; i < share.base.fields ; i++) + { + if (_ma_columndef_write(file, col_order[i])) + { + my_free((uchar*) col_order, MYF(0)); + goto err; + } + } + my_free((uchar*) col_order, MYF(0)); + } + else + { + for (i=0 ; i < share.base.fields ; i++) + if (_ma_columndef_write(file, &columndef[i])) + goto err; + } + + if ((kfile_size_before_extension= my_tell(file,MYF(0))) == MY_FILEPOS_ERROR) + goto err; +#ifndef DBUG_OFF + if (kfile_size_before_extension != info_length) + DBUG_PRINT("warning",("info_length: %u != used_length: %u", + info_length, (uint)kfile_size_before_extension)); +#endif + + if (sync_dir) + { + /* + we log the first bytes and then the size to which we extend; this is + not log 1 KB of mostly zeroes if this is a small table. 
+ */ + char empty_string[]= ""; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + uint total_rec_length= 0; + uint i; + LSN lsn; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= 1 + 2 + 2 + + kfile_size_before_extension; + /* we are needing maybe 64 kB, so don't use the stack */ + log_data= my_malloc(log_array[TRANSLOG_INTERNAL_PARTS + 1].length, MYF(0)); + if ((log_data == NULL) || + my_pread(file, 1 + 2 + 2 + log_data, kfile_size_before_extension, + 0, MYF(MY_NABP))) + goto err; + /* + remember if the data file was created or not, to know if Recovery can + do it or not, in the future + */ + log_data[0]= test(flags & HA_DONT_TOUCH_DATA); + int2store(log_data + 1, kfile_size_before_extension); + int2store(log_data + 1 + 2, share.base.keystart); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char *)name; + /* we store the end-zero, for Recovery to just pass it to my_create() */ + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= + strlen(log_array[TRANSLOG_INTERNAL_PARTS + 0].str) + 1; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= log_data; + /* symlink description is also needed for re-creation by Recovery: */ + log_array[TRANSLOG_INTERNAL_PARTS + 2].str= (char *) + (ci->data_file_name ? ci->data_file_name : empty_string); + log_array[TRANSLOG_INTERNAL_PARTS + 2].length= + strlen(log_array[TRANSLOG_INTERNAL_PARTS + 2].str) + 1; + log_array[TRANSLOG_INTERNAL_PARTS + 3].str= (char *) + (ci->index_file_name ? 
ci->index_file_name : empty_string); + log_array[TRANSLOG_INTERNAL_PARTS + 3].length= + strlen(log_array[TRANSLOG_INTERNAL_PARTS + 3].str) + 1; + for (i= TRANSLOG_INTERNAL_PARTS; + i < (sizeof(log_array)/sizeof(log_array[0])); i++) + total_rec_length+= log_array[i].length; + /** + For this record to be of any use for Recovery, we need the upper + MySQL layer to be crash-safe, which it is not now (that would require + work using the ddl_log of sql/sql_table.cc); when it is, we should + reconsider the moment of writing this log record (before or after op, + under THR_LOCK_maria or not...), how to use it in Recovery. + For now this record can serve when we apply logs to a backup, + so we sync it. This happens before the data file is created. If the + data file was created before, and we crashed before writing the log + record, at restart the table may be used, so we would not have a + trustable history in the log (impossible to apply this log to a + backup). The way we do it, if we crash before writing the log record + then there is no data file and the table cannot be used. + @todo Note that in case of TRUNCATE TABLE we also come here; for + Recovery to be able to finish TRUNCATE TABLE, instead of leaving a + half-truncated table, we should log the record at start of + maria_create(); for that we shouldn't write to the index file but to a + buffer (DYNAMIC_STRING), put the buffer into the record, then put the + buffer into the index file (so, change _ma_keydef_write() etc). That + would also enable Recovery to finish a CREATE TABLE. The final result + would be that we would be able to finish what the SQL layer has asked + for: it would be atomic. + When in CREATE/TRUNCATE (or DROP or RENAME or REPAIR) we have not + called external_lock(), so have no TRN. It does not matter, as all + these operations are non-transactional and sync their files. 
+ */ + if (unlikely(translog_write_record(&lsn, + LOGREC_REDO_CREATE_TABLE, + &dummy_transaction_object, NULL, + total_rec_length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL) || + translog_flush(lsn))) + goto err; + /* + store LSN into file, needed for Recovery to not be confused if a + DROP+CREATE happened (applying REDOs to the wrong table). + */ + share.kfile.file= file; + if (_ma_update_create_rename_lsn_sub(&share, lsn, FALSE)) + goto err; + my_free(log_data, MYF(0)); + } + + if (!(flags & HA_DONT_TOUCH_DATA)) + { + if (ci->data_file_name) + { + char *dext= strrchr(ci->data_file_name, '.'); + int have_dext= dext && !strcmp(dext, MARIA_NAME_DEXT); + + if (tmp_table) + { + char *path; + /* chop off the table name, tempory tables use generated name */ + if ((path= strrchr(ci->data_file_name, FN_LIBCHAR))) + *path= '\0'; + fn_format(filename, name, ci->data_file_name, MARIA_NAME_DEXT, + MY_REPLACE_DIR | MY_UNPACK_FILENAME | MY_APPEND_EXT); + } + else + { + fn_format(filename, ci->data_file_name, "", MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | + (have_dext ? MY_REPLACE_EXT : MY_APPEND_EXT)); + } + fn_format(linkname, name, "",MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT); + linkname_ptr= linkname; + create_flag=0; + } + else + { + fn_format(filename,name,"", MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT); + linkname_ptr= NullS; + create_flag=MY_DELETE_OLD; + } + if ((dfile= + my_create_with_symlink(linkname_ptr, filename, 0, create_mode, + MYF(MY_WME | create_flag | sync_dir))) < 0) + goto err; + errpos=3; + + if (_ma_initialize_data_file(&share, dfile)) + goto err; + } + + /* Enlarge files */ + DBUG_PRINT("info", ("enlarge to keystart: %lu", + (ulong) share.base.keystart)); + if (my_chsize(file,(ulong) share.base.keystart,0,MYF(0))) + goto err; + + if (sync_dir && my_sync(file, MYF(0))) + goto err; + + if (! 
(flags & HA_DONT_TOUCH_DATA)) + { +#ifdef USE_RELOC + if (my_chsize(dfile,share.base.min_pack_length*ci->reloc_rows,0,MYF(0))) + goto err; +#endif + if (sync_dir && my_sync(dfile, MYF(0))) + goto err; + if (my_close(dfile,MYF(0))) + goto err; + } + pthread_mutex_unlock(&THR_LOCK_maria); + res= 0; + my_free((char*) rec_per_key_part,MYF(0)); + errpos=0; + if (my_close(file,MYF(0))) + res= my_errno; + DBUG_RETURN(res); + +err: + pthread_mutex_unlock(&THR_LOCK_maria); + +err_no_lock: + save_errno=my_errno; + switch (errpos) { + case 3: + VOID(my_close(dfile,MYF(0))); + /* fall through */ + case 2: + if (! (flags & HA_DONT_TOUCH_DATA)) + my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT), + sync_dir); + /* fall through */ + case 1: + VOID(my_close(file,MYF(0))); + if (! (flags & HA_DONT_TOUCH_DATA)) + my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_IEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT), + sync_dir); + } + my_free(log_data, MYF(MY_ALLOW_ZERO_PTR)); + my_free((char*) rec_per_key_part, MYF(0)); + DBUG_RETURN(my_errno=save_errno); /* return the fatal errno */ +} + + +uint maria_get_pointer_length(ulonglong file_length, uint def) +{ + DBUG_ASSERT(def >= 2 && def <= 7); + if (file_length) /* If not default */ + { +#ifdef NOT_YET_READY_FOR_8_BYTE_POINTERS + if (file_length >= (ULL(1) << 56)) + def=8; + else +#endif + if (file_length >= (ULL(1) << 48)) + def=7; + else if (file_length >= (ULL(1) << 40)) + def=6; + else if (file_length >= (ULL(1) << 32)) + def=5; + else if (file_length >= (ULL(1) << 24)) + def=4; + else if (file_length >= (ULL(1) << 16)) + def=3; + else + def=2; + } + return def; +} + + +/* + Sort columns for records-in-block + + IMPLEMENTATION + Sort columns in following order: + + Fixed size, not null columns + Fixed length, null fields + Variable length fields (CHAR, VARCHAR) + Blobs + + For same kind of fields, keep fields in original order +*/ + +static inline int sign(longlong a) +{ + 
return a < 0 ? -1 : (a > 0 ? 1 : 0); +} + + +static int compare_columns(MARIA_COLUMNDEF **a_ptr, MARIA_COLUMNDEF **b_ptr) +{ + MARIA_COLUMNDEF *a= *a_ptr, *b= *b_ptr; + enum en_fieldtype a_type, b_type; + + a_type= ((a->type == FIELD_NORMAL || a->type == FIELD_CHECK) ? + FIELD_NORMAL : a->type); + b_type= ((b->type == FIELD_NORMAL || b->type == FIELD_CHECK) ? + FIELD_NORMAL : b->type); + + if (a_type == FIELD_NORMAL && !a->null_bit) + { + if (b_type != FIELD_NORMAL || b->null_bit) + return -1; + return sign((long) (a->offset - b->offset)); + } + if (b_type == FIELD_NORMAL && !b->null_bit) + return 1; + if (a_type == b_type) + return sign((long) (a->offset - b->offset)); + if (a_type == FIELD_NORMAL) + return -1; + if (b_type == FIELD_NORMAL) + return 1; + if (a_type == FIELD_BLOB) + return 1; + if (b_type == FIELD_BLOB) + return -1; + return sign((long) (a->offset - b->offset)); +} + + +/* Initialize data file */ + +int _ma_initialize_data_file(MARIA_SHARE *share, File dfile) +{ + if (share->data_file_type == BLOCK_RECORD) + { + share->bitmap.block_size= share->base.block_size; + share->bitmap.file.file = dfile; + return _ma_bitmap_create_first(share); + } + /* + So, in BLOCK_RECORD, a freshly created datafile is one page long; while in + other formats it is 0-byte long. + */ + return 0; +} + + +/** + @brief Writes create_rename_lsn and is_of_horizon to disk, can force. + + This is for special cases where: + - we don't want to write the full state to disk (so, not call + _ma_state_info_write()) because some parts of the state may be + currently inconsistent, or because it would be overkill + - we must sync these LSNs immediately for correctness. + It acquires intern_lock to protect the two LSNs and state write. 
+
+   @param  share           table's share
+   @param  lsn             LSN to store as both create_rename_lsn and
+                           is_of_horizon
+   @param  do_sync         if the write should be forced to disk
+
+   @return Operation status
+     @retval 0      ok
+     @retval 1      error (disk problem)
+*/
+
+int _ma_update_create_rename_lsn(MARIA_SHARE *share,
+                                 LSN lsn, my_bool do_sync)
+{
+  int res;
+  pthread_mutex_lock(&share->intern_lock);
+  res= _ma_update_create_rename_lsn_sub(share, lsn, do_sync);
+  pthread_mutex_unlock(&share->intern_lock);
+  return res;
+}
+
+
+/**
+   @brief Writes create_rename_lsn and is_of_horizon to disk, can force.
+
+   Shortcut of _ma_update_create_rename_lsn() when we know that
+   intern_lock is not needed (when creating a table or opening it for the
+   first time).
+
+   @param  share           table's share
+   @param  lsn             LSN to store as both create_rename_lsn and
+                           is_of_horizon
+   @param  do_sync         if the write should be forced to disk
+
+   @return Operation status
+     @retval 0      ok
+     @retval 1      error (disk problem)
+*/
+
+int _ma_update_create_rename_lsn_sub(MARIA_SHARE *share,
+                                     LSN lsn, my_bool do_sync)
+{
+  char buf[LSN_STORE_SIZE*2], *ptr;
+  File file= share->kfile.file;
+  DBUG_ASSERT(file >= 0);
+  /* buf holds the same LSN twice, one copy per state field set below */
+  for (ptr= buf; ptr < (buf + sizeof(buf)); ptr+= LSN_STORE_SIZE)
+    lsn_store(ptr, lsn);
+  share->state.is_of_horizon= share->state.create_rename_lsn= lsn;
+  if (share->id != 0)
+  {
+    /*
+      If OP is the operation which is calling us, if table is later written,
+      we could see in the log:
+      FILE_ID ... REDO_OP ... REDO_INSERT.
+      (that can happen in real life at least with OP=REPAIR).
+      As FILE_ID will be ignored by Recovery because it is <
+      create_rename_lsn, REDO_INSERT would be ignored too, wrongly.
+      To avoid that, we force a LOGREC_FILE_ID to be logged at next write:
+    */
+    translog_deassign_id_from_share(share);
+  }
+  /*
+    Both LSNs are written in one go at a fixed offset just after the header.
+    NOTE(review): the "+ 2" presumably matches the layout produced by
+    _ma_state_info_write_sub(); confirm if that layout changes.
+  */
+  return my_pwrite(file, buf, sizeof(buf),
+                   sizeof(share->state.header) + 2, MYF(MY_NABP)) ||
+    (do_sync && my_sync(file, MYF(0)));
+}
diff --git a/storage/maria/ma_dbug.c b/storage/maria/ma_dbug.c
new file mode 100644
index 00000000000..a23e7248029
--- /dev/null
+++ b/storage/maria/ma_dbug.c
@@ -0,0 +1,193 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Support routines which are used with dbug */
+
+#include "maria_def.h"
+
+/*
+  Print a key in user understandable format
+
+  The key is decoded segment by segment (stopping at the first keyseg with
+  type 0 or when 'length' bytes have been consumed) and written to 'stream'
+  as: Key: "seg1-seg2-..." followed by a newline.
+*/
+
+void _ma_print_key(FILE *stream, register HA_KEYSEG *keyseg,
+                   const uchar *key, uint length)
+{
+  int flag;
+  short int s_1;
+  long	int l_1;
+  float f_1;
+  double d_1;
+  const uchar *end;
+  const uchar *key_end= key + length;
+
+  VOID(fputs("Key: \"",stream));
+  flag=0;
+  for (; keyseg->type && key < key_end ;keyseg++)
+  {
+    /* Separate segments with '-' (not before the first one) */
+    if (flag++)
+      VOID(putc('-',stream));
+    end= key+ keyseg->length;
+    if (keyseg->flag & HA_NULL_PART)
+    {
+      /* A NULL value is encoded by a 1-byte flag. Zero means NULL. */
+      if (!
*(key++)) + { + fprintf(stream,"NULL"); + continue; + } + end++; + } + + switch (keyseg->type) { + case HA_KEYTYPE_BINARY: + if (!(keyseg->flag & HA_SPACE_PACK) && keyseg->length == 1) + { /* packed binary digit */ + VOID(fprintf(stream,"%d",(uint) *key++)); + break; + } + /* fall through */ + case HA_KEYTYPE_TEXT: + case HA_KEYTYPE_NUM: + if (keyseg->flag & HA_SPACE_PACK) + { + VOID(fprintf(stream,"%.*s",(int) *key,key+1)); + key+= (int) *key+1; + } + else + { + VOID(fprintf(stream,"%.*s",(int) keyseg->length,key)); + key=end; + } + break; + case HA_KEYTYPE_INT8: + VOID(fprintf(stream,"%d",(int) *((signed char*) key))); + key=end; + break; + case HA_KEYTYPE_SHORT_INT: + s_1= mi_sint2korr(key); + VOID(fprintf(stream,"%d",(int) s_1)); + key=end; + break; + case HA_KEYTYPE_USHORT_INT: + { + ushort u_1; + u_1= mi_uint2korr(key); + VOID(fprintf(stream,"%u",(uint) u_1)); + key=end; + break; + } + case HA_KEYTYPE_LONG_INT: + l_1=mi_sint4korr(key); + VOID(fprintf(stream,"%ld",l_1)); + key=end; + break; + case HA_KEYTYPE_ULONG_INT: + l_1=mi_sint4korr(key); + VOID(fprintf(stream,"%lu",(ulong) l_1)); + key=end; + break; + case HA_KEYTYPE_INT24: + VOID(fprintf(stream,"%ld",(long) mi_sint3korr(key))); + key=end; + break; + case HA_KEYTYPE_UINT24: + VOID(fprintf(stream,"%lu",(ulong) mi_uint3korr(key))); + key=end; + break; + case HA_KEYTYPE_FLOAT: + mi_float4get(f_1,key); + VOID(fprintf(stream,"%g",(double) f_1)); + key=end; + break; + case HA_KEYTYPE_DOUBLE: + mi_float8get(d_1,key); + VOID(fprintf(stream,"%g",d_1)); + key=end; + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + { + char buff[21]; + longlong2str(mi_sint8korr(key),buff,-10); + VOID(fprintf(stream,"%s",buff)); + key=end; + break; + } + case HA_KEYTYPE_ULONGLONG: + { + char buff[21]; + longlong2str(mi_sint8korr(key),buff,10); + VOID(fprintf(stream,"%s",buff)); + key=end; + break; + } + case HA_KEYTYPE_BIT: + { + uint i; + fputs("0x",stream); + for (i=0 ; i < keyseg->length ; i++) + fprintf(stream, 
"%02x", (uint) *key++); + key= end; + break; + } + +#endif + case HA_KEYTYPE_VARTEXT1: /* VARCHAR and TEXT */ + case HA_KEYTYPE_VARTEXT2: /* VARCHAR and TEXT */ + case HA_KEYTYPE_VARBINARY1: /* VARBINARY and BLOB */ + case HA_KEYTYPE_VARBINARY2: /* VARBINARY and BLOB */ + { + uint tmp_length; + get_key_length(tmp_length,key); + /* + The following command sometimes gives a warning from valgrind. + Not yet sure if the bug is in valgrind, glibc or mysqld + */ + VOID(fprintf(stream,"%.*s",(int) tmp_length,key)); + key+=tmp_length; + break; + } + default: break; /* This never happens */ + } + } + VOID(fputs("\"\n",stream)); + return; +} /* print_key */ + + +#ifdef EXTRA_DEBUG + +my_bool _ma_check_table_is_closed(const char *name, const char *where) +{ + char filename[FN_REFLEN]; + LIST *pos; + DBUG_ENTER("_ma_check_table_is_closed"); + + (void) fn_format(filename,name,"",MARIA_NAME_IEXT,4+16+32); + for (pos=maria_open_list ; pos ; pos=pos->next) + { + MARIA_HA *info=(MARIA_HA*) pos->data; + MARIA_SHARE *share=info->s; + if (!strcmp(share->unique_file_name,filename)) + { + if (share->last_version) + { + fprintf(stderr,"Warning: Table: %s is open on %s\n", name,where); + DBUG_PRINT("warning",("Table: %s is open on %s", name,where)); + DBUG_RETURN(1); + } + } + } + DBUG_RETURN(0); +} +#endif /* EXTRA_DEBUG */ diff --git a/storage/maria/ma_delete.c b/storage/maria/ma_delete.c new file mode 100644 index 00000000000..56da6fd3ed3 --- /dev/null +++ b/storage/maria/ma_delete.c @@ -0,0 +1,891 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Remove a row from a MARIA table */
+
+#include "ma_fulltext.h"
+#include "ma_rt_index.h"
+
+static int d_search(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uint comp_flag,
+                    uchar *key,uint key_length,my_off_t page,uchar *anc_buff);
+static int del(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uchar *key,uchar *anc_buff,
+	       my_off_t leaf_page,uchar *leaf_buff,uchar *keypos,
+	       my_off_t next_block,uchar *ret_key);
+static int underflow(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uchar *anc_buff,
+		     my_off_t leaf_page,uchar *leaf_buff,uchar *keypos);
+static uint remove_key(MARIA_KEYDEF *keyinfo,uint nod_flag,uchar *keypos,
+		       uchar *lastkey,uchar *page_end,
+		       my_off_t *next_block);
+static int _ma_ck_real_delete(register MARIA_HA *info,MARIA_KEYDEF *keyinfo,
+			      uchar *key, uint key_length, my_off_t *root);
+
+
+/*
+  Delete the current row of a table
+
+  SYNOPSIS
+    maria_delete()
+    info      Handler; must have a current row (HA_STATE_AKTIV set)
+    record    Row contents; checked against the stored row with
+              share->compare_record and used to build the keys to remove
+
+  NOTES
+    Every active key is removed from the index first, then the row itself
+    is removed through share->delete_record.
+    On failure the table is marked as crashed unless the error is
+    HA_ERR_RECORD_CHANGED; HA_ERR_KEY_NOT_FOUND is reported to the caller
+    as HA_ERR_CRASHED.
+
+  RETURN
+    0       ok
+    other   error; my_errno is set to the same value
+*/
+
+int maria_delete(MARIA_HA *info,const uchar *record)
+{
+  uint i;
+  uchar *old_key;
+  int save_errno;
+  /* NOTE(review): lastpos is filled below but never read in this function */
+  char lastpos[8];
+  MARIA_SHARE *share=info->s;
+  DBUG_ENTER("maria_delete");
+
+  /* Test if record is in datafile */
+
+  DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage",
+                  maria_print_error(info->s, HA_ERR_CRASHED);
+                  DBUG_RETURN(my_errno= HA_ERR_CRASHED););
+  DBUG_EXECUTE_IF("my_error_test_undefined_error",
+                  maria_print_error(info->s, INT_MAX);
+                  DBUG_RETURN(my_errno= INT_MAX););
+  if (!(info->update & HA_STATE_AKTIV))
+  {
+    DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND);	/* No database read */
+  }
+  if (share->options & HA_OPTION_READ_ONLY_DATA)
+  {
+    DBUG_RETURN(my_errno=EACCES);
+  }
+  if (_ma_readinfo(info,F_WRLCK,1))
+    DBUG_RETURN(my_errno);
+  if ((*share->compare_record)(info,record))
+    goto err;				/* Error on read-check */
+
+  if (_ma_mark_file_changed(info))
+    goto err;
+
+  /* Remove all keys from the index file */
+
+  old_key= info->lastkey2;
+  for (i=0 ; i < share->base.keys ; i++ )
+  {
+    if (maria_is_key_active(info->s->state.key_map, i))
+    {
+      info->s->keyinfo[i].version++;
+      if (info->s->keyinfo[i].flag & HA_FULLTEXT )
+      {
+        if (_ma_ft_del(info,i,(char*) old_key,record,info->cur_row.lastpos))
+          goto err;
+      }
+      else
+      {
+        /* Build the key into old_key and hand it to the key's delete hook */
+        if (info->s->keyinfo[i].ck_delete(info,i,old_key,
+                _ma_make_key(info,i,old_key,record,info->cur_row.lastpos)))
+          goto err;
+      }
+      /* The above changed info->lastkey2. Inform maria_rnext_same(). */
+      info->update&= ~HA_STATE_RNEXT_SAME;
+    }
+  }
+
+  if ((*share->delete_record)(info, record))
+    goto err;				/* Remove record from database */
+
+  /*
+    We can't use the row based checksum as this doesn't have enough
+    precision.
+  */
+  if (info->s->calc_checksum)
+  {
+    info->cur_row.checksum= (*info->s->calc_checksum)(info,record);
+    info->state->checksum-= info->cur_row.checksum;
+  }
+
+  info->update= HA_STATE_CHANGED+HA_STATE_DELETED+HA_STATE_ROW_CHANGED;
+  /* The row count is only decremented here when not transactional */
+  info->state->records-= !share->now_transactional;
+  share->state.changed|= STATE_NOT_OPTIMIZED_ROWS;
+
+  mi_sizestore(lastpos, info->cur_row.lastpos);
+  VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+  allow_break();			/* Allow SIGHUP & SIGINT */
+  if (info->invalidator != 0)
+  {
+    DBUG_PRINT("info", ("invalidator... '%s' (delete)", info->s->open_file_name));
+    (*info->invalidator)(info->s->open_file_name);
+    info->invalidator=0;
+  }
+  DBUG_RETURN(0);
+
+err:
+  /* Save my_errno as the calls below may overwrite it */
+  save_errno=my_errno;
+  mi_sizestore(lastpos, info->cur_row.lastpos);
+  if (save_errno != HA_ERR_RECORD_CHANGED)
+  {
+    maria_print_error(info->s, HA_ERR_CRASHED);
+    maria_mark_crashed(info);		/* mark table crashed */
+  }
+  VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+  info->update|=HA_STATE_WRITTEN;	/* Buffer changed */
+  allow_break();			/* Allow SIGHUP & SIGINT */
+  my_errno=save_errno;
+  if (save_errno == HA_ERR_KEY_NOT_FOUND)
+  {
+    /* A key of an existing row was missing from the index */
+    maria_print_error(info->s, HA_ERR_CRASHED);
+    my_errno=HA_ERR_CRASHED;
+  }
+
+  DBUG_RETURN(my_errno);
+} /* maria_delete */
+
+
+/*
+  Remove a key from the btree index
+
+  Thin wrapper that calls _ma_ck_real_delete() with the key definition and
+  the root position of key number 'keynr'.
+*/
+
+int _ma_ck_delete(register MARIA_HA *info, uint keynr, uchar *key,
+		  uint key_length)
+{
+  return _ma_ck_real_delete(info, info->s->keyinfo+keynr, key, key_length,
+                            &info->s->state.key_root[keynr]);
+} /* _ma_ck_delete */
+
+
+/*
+  Remove a key from the tree rooted at *root
+
+  *root is updated if the root page becomes empty (set to its only child,
+  or to HA_OFFSET_ERROR if there is none) and may be changed by
+  _ma_enlarge_root().
+
+  RETURN
+    0   ok
+    -1  error (from the page fetch / dispose calls or from d_search())
+    HA_ERR_CRASHED / ENOMEM (also stored in my_errno) if the tree has no
+    root or the page buffer could not be allocated
+*/
+
+static int _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+			      uchar *key, uint key_length, my_off_t *root)
+{
+  int error;
+  uint nod_flag;
+  my_off_t old_root;
+  uchar *root_buff;
+  DBUG_ENTER("_ma_ck_real_delete");
+
+  if ((old_root=*root) == HA_OFFSET_ERROR)
+  {
+    maria_print_error(info->s, HA_ERR_CRASHED);
+    DBUG_RETURN(my_errno=HA_ERR_CRASHED);
+  }
+  if (!(root_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+				      HA_MAX_KEY_BUFF*2)))
+  {
+    DBUG_PRINT("error",("Couldn't allocate memory"));
+    DBUG_RETURN(my_errno=ENOMEM);
+  }
+  DBUG_PRINT("info",("root_page: %ld", (long) old_root));
+  if (!_ma_fetch_keypage(info,keyinfo,old_root,DFLT_INIT_HITS,root_buff,0))
+  {
+    error= -1;
+    goto err;
+  }
+  if ((error=d_search(info,keyinfo,
+                      (keyinfo->flag & HA_FULLTEXT ?
SEARCH_FIND | SEARCH_UPDATE
+                       : SEARCH_SAME),
+                       key,key_length,old_root,root_buff)) >0)
+  {
+    /* d_search() > 0: the root page itself needs attention */
+    if (error == 2)
+    {
+      DBUG_PRINT("test",("Enlarging of root when deleting"));
+      error= _ma_enlarge_root(info,keyinfo,key,root);
+    }
+    else /* error == 1 */
+    {
+      if (maria_data_on_page(root_buff) <= (nod_flag=_ma_test_if_nod(root_buff))+3)
+      {
+        /* Root holds no keys any more: replace it by its child, if any */
+	error=0;
+	if (nod_flag)
+	  *root= _ma_kpos(nod_flag,root_buff+2+nod_flag);
+	else
+	  *root=HA_OFFSET_ERROR;
+	if (_ma_dispose(info,keyinfo,old_root,DFLT_INIT_HITS))
+	  error= -1;
+      }
+      else
+	error= _ma_write_keypage(info,keyinfo,old_root,
+                                 DFLT_INIT_HITS,root_buff);
+    }
+  }
+err:
+  my_afree((uchar*) root_buff);
+  DBUG_PRINT("exit",("Return: %d",error));
+  DBUG_RETURN(error);
+} /* _ma_ck_real_delete */
+
+
+	/*
+	** Remove key below key root
+	** Return values:
+	** 1 if there are less buffers;  In this case anc_buff is not saved
+	** 2 if there are more buffers
+	** -1 on errors
+	**
+	** 'page' is the file position of the page held in anc_buff. The
+	** function recurses into the child page when the key is not found
+	** on a node page.
+	*/
+
+static int d_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+                    uint comp_flag, uchar *key, uint key_length,
+                    my_off_t page, uchar *anc_buff)
+{
+  int flag,ret_value,save_flag;
+  uint length,nod_flag,search_key_length;
+  my_bool last_key;
+  uchar *leaf_buff,*keypos;
+  my_off_t leaf_page,next_block;
+  uchar lastkey[HA_MAX_KEY_BUFF];
+  DBUG_ENTER("d_search");
+  DBUG_DUMP("page",anc_buff,maria_data_on_page(anc_buff));
+
+  search_key_length= (comp_flag & SEARCH_FIND) ? key_length : USE_WHOLE_KEY;
+  flag=(*keyinfo->bin_search)(info,keyinfo,anc_buff,key, search_key_length,
+                              comp_flag, &keypos, lastkey, &last_key);
+  if (flag == MARIA_FOUND_WRONG_KEY)
+  {
+    DBUG_PRINT("error",("Found wrong key"));
+    DBUG_RETURN(-1);
+  }
+  nod_flag=_ma_test_if_nod(anc_buff);
+
+  if (!flag && keyinfo->flag & HA_FULLTEXT)
+  {
+    uint off;
+    int  subkeys;
+
+    get_key_full_length_rdonly(off, lastkey);
+    subkeys=ft_sintXkorr(lastkey+off);
+    DBUG_ASSERT(info->ft1_to_ft2==0 || subkeys >=0);
+    comp_flag=SEARCH_SAME;
+    if (subkeys >= 0)
+    {
+      /* normal word, one-level tree structure */
+      if (info->ft1_to_ft2)
+      {
+        /* we're in ft1->ft2 conversion mode. Saving key data */
+        insert_dynamic(info->ft1_to_ft2, (char*) (lastkey+off));
+      }
+      else
+      {
+        /* we need exact match only if not in ft1->ft2 conversion mode */
+        flag=(*keyinfo->bin_search)(info,keyinfo,anc_buff,key,USE_WHOLE_KEY,
+                                    comp_flag, &keypos, lastkey, &last_key);
+      }
+      /* fall through to normal delete */
+    }
+    else
+    {
+      /* popular word. two-level tree. going down */
+      uint tmp_key_length;
+      my_off_t root;
+      uchar *kpos=keypos;
+
+      if (!(tmp_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&kpos,lastkey)))
+      {
+        maria_print_error(info->s, HA_ERR_CRASHED);
+        my_errno= HA_ERR_CRASHED;
+        DBUG_RETURN(-1);
+      }
+      root= _ma_dpos(info,nod_flag,kpos);
+      if (subkeys == -1)
+      {
+        /* the last entry in sub-tree */
+        if (_ma_dispose(info, keyinfo, root,DFLT_INIT_HITS))
+          DBUG_RETURN(-1);
+        /* fall through to normal delete */
+      }
+      else
+      {
+        /*
+          Delete from the second-level tree, then store its (possibly new)
+          root and the updated sub-key counter back into this page's entry.
+          Note that keyinfo refers to ft2_keyinfo from here on.
+        */
+        keyinfo=&info->s->ft2_keyinfo;
+        kpos-=keyinfo->keylength+nod_flag; /* we'll modify key entry 'in vivo' */
+        get_key_full_length_rdonly(off, key);
+        key+=off;
+        ret_value= _ma_ck_real_delete(info, &info->s->ft2_keyinfo,
+                                      key, HA_FT_WLEN, &root);
+        _ma_dpointer(info, kpos+HA_FT_WLEN, root);
+        subkeys++;
+        ft_intXstore(kpos, subkeys);
+        if (!ret_value)
+          ret_value= _ma_write_keypage(info,keyinfo,page,
+                                       DFLT_INIT_HITS,anc_buff);
+        DBUG_PRINT("exit",("Return: %d",ret_value));
+        DBUG_RETURN(ret_value);
+      }
+    }
+  }
+  /* leaf_buff stays 0 on leaf pages; the exit paths pass it to my_afree() */
+  leaf_buff=0;
+  LINT_INIT(leaf_page);
+  if (nod_flag)
+  {
+    /* Node page: read the child page that keypos points to */
+    leaf_page= _ma_kpos(nod_flag,keypos);
+    if (!(leaf_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+					HA_MAX_KEY_BUFF*2)))
+    {
+      DBUG_PRINT("error",("Couldn't allocate memory"));
+      my_errno=ENOMEM;
+      DBUG_PRINT("exit",("Return: %d",-1));
+      DBUG_RETURN(-1);
+    }
+    if (!_ma_fetch_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff,0))
+      goto err;
+  }
+
+  if (flag != 0)
+  {
+    /* Key is not on this page: continue in the child, if there is one */
+    if (!nod_flag)
+    {
+      DBUG_PRINT("error",("Didn't find key"));
+      maria_print_error(info->s, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;		/* This should never happen */
+      goto err;
+    }
+    save_flag=0;
+    ret_value=d_search(info,keyinfo,comp_flag,key,key_length,
+                       leaf_page,leaf_buff);
+  }
+  else
+  {						/* Found key */
+    uint tmp;
+    length= maria_data_on_page(anc_buff);
+    if (!(tmp= remove_key(keyinfo,nod_flag,keypos,lastkey,anc_buff+length,
+                          &next_block)))
+      goto err;
+
+    length-= tmp;
+
+    maria_putint(anc_buff,length,nod_flag);
+    if (!nod_flag)
+    {						/* On leaf page */
+      if (_ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,anc_buff))
+      {
+        DBUG_PRINT("exit",("Return: %d",-1));
+	DBUG_RETURN(-1);
+      }
+      /* Page will be update later if we return 1 */
+      DBUG_RETURN(test(length <= (info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
+				  (uint) keyinfo->underflow_block_length)));
+    }
+    /* Key was on a node page: del() fills the hole from the child page */
+    save_flag=1;
+    ret_value=del(info,keyinfo,key,anc_buff,leaf_page,leaf_buff,keypos,
+		  next_block,lastkey);
+  }
+  if (ret_value >0)
+  {
+    save_flag=1;
+    if (ret_value == 1)
+      ret_value= underflow(info,keyinfo,anc_buff,leaf_page,leaf_buff,keypos);
+    else
+    {				/* This happens only with packed keys */
+      DBUG_PRINT("test",("Enlarging of key when deleting"));
+      if (!_ma_get_last_key(info,keyinfo,anc_buff,lastkey,keypos,&length))
+	goto err;
+      ret_value= _ma_insert(info,keyinfo,key,anc_buff,keypos,lastkey,
+                            (uchar*) 0,(uchar*) 0,(my_off_t) 0,(my_bool) 0);
+    }
+  }
+  if (ret_value == 0 && maria_data_on_page(anc_buff) > keyinfo->block_length)
+  {
+    /* The page grew beyond a block: split it and report 2 upwards */
+    save_flag=1;
+    ret_value= _ma_split_page(info,keyinfo,key,anc_buff,lastkey,0) | 2;
+  }
+  /* With ret_value == 1 the caller writes anc_buff (see header comment) */
+  if (save_flag && ret_value != 1)
+    ret_value|= _ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,anc_buff);
+  else
+  {
+    DBUG_DUMP("page",anc_buff,maria_data_on_page(anc_buff));
+  }
+  my_afree(leaf_buff);
+  DBUG_PRINT("exit",("Return: %d",ret_value));
+  DBUG_RETURN(ret_value);
+
+err:
+  my_afree(leaf_buff);
+  DBUG_PRINT("exit",("Error: %d",my_errno));
+  DBUG_RETURN (-1);
+} /* d_search */
+
+
+	/* Remove a key that has a page-reference */
+
+static int del(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
+               uchar *key, uchar *anc_buff, my_off_t leaf_page,
+               uchar *leaf_buff,
+	       uchar *keypos,		/* Pos to where deleted key was */
+	       my_off_t next_block,
+	       uchar *ret_key)		/* key before keypos in anc_buff */
+{
+  int ret_value,length;
+  uint a_length,nod_flag,tmp;
+  my_off_t next_page;
+  uchar keybuff[HA_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key;
+  MARIA_SHARE *share=info->s;
+  MARIA_KEY_PARAM s_temp;
+
DBUG_ENTER("del"); + DBUG_PRINT("enter",("leaf_page: %ld keypos: 0x%lx", (long) leaf_page, + (ulong) keypos)); + DBUG_DUMP("leaf_buff",leaf_buff,maria_data_on_page(leaf_buff)); + + endpos= leaf_buff+ maria_data_on_page(leaf_buff); + if (!(key_start= _ma_get_last_key(info,keyinfo,leaf_buff,keybuff,endpos, + &tmp))) + DBUG_RETURN(-1); + + if ((nod_flag=_ma_test_if_nod(leaf_buff))) + { + next_page= _ma_kpos(nod_flag,endpos); + if (!(next_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ + HA_MAX_KEY_BUFF*2))) + DBUG_RETURN(-1); + if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,next_buff,0)) + ret_value= -1; + else + { + DBUG_DUMP("next_page",next_buff,maria_data_on_page(next_buff)); + if ((ret_value=del(info,keyinfo,key,anc_buff,next_page,next_buff, + keypos,next_block,ret_key)) >0) + { + endpos=leaf_buff+maria_data_on_page(leaf_buff); + if (ret_value == 1) + { + ret_value=underflow(info,keyinfo,leaf_buff,next_page, + next_buff,endpos); + if (ret_value == 0 && maria_data_on_page(leaf_buff) > keyinfo->block_length) + { + ret_value= _ma_split_page(info,keyinfo,key,leaf_buff,ret_key,0) | 2; + } + } + else + { + DBUG_PRINT("test",("Inserting of key when deleting")); + if (!_ma_get_last_key(info,keyinfo,leaf_buff,keybuff,endpos, + &tmp)) + goto err; + ret_value= _ma_insert(info,keyinfo,key,leaf_buff,endpos,keybuff, + (uchar*) 0,(uchar*) 0,(my_off_t) 0,0); + } + } + if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff)) + goto err; + } + my_afree(next_buff); + DBUG_RETURN(ret_value); + } + + /* Remove last key from leaf page */ + + maria_putint(leaf_buff,key_start-leaf_buff,nod_flag); + if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff)) + goto err; + + /* Place last key in ancestor page on deleted key position */ + + a_length= maria_data_on_page(anc_buff); + endpos=anc_buff+a_length; + if (keypos != anc_buff+2+share->base.key_reflength && + !_ma_get_last_key(info,keyinfo,anc_buff,ret_key,keypos,&tmp)) + goto err; + 
prev_key=(keypos == anc_buff+2+share->base.key_reflength ? + 0 : ret_key); + length=(*keyinfo->pack_key)(keyinfo,share->base.key_reflength, + keypos == endpos ? (uchar*) 0 : keypos, + prev_key, prev_key, + keybuff,&s_temp); + if (length > 0) + bmove_upp(endpos+length,endpos,(uint) (endpos-keypos)); + else + bmove(keypos,keypos-length, (int) (endpos-keypos)+length); + (*keyinfo->store_key)(keyinfo,keypos,&s_temp); + /* Save pointer to next leaf */ + if (!(*keyinfo->get_key)(keyinfo,share->base.key_reflength,&keypos,ret_key)) + goto err; + _ma_kpointer(info,keypos - share->base.key_reflength,next_block); + maria_putint(anc_buff,a_length+length,share->base.key_reflength); + + DBUG_RETURN( maria_data_on_page(leaf_buff) <= + (info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length)); +err: + DBUG_RETURN(-1); +} /* del */ + + + /* Balances adjacent pages if underflow occours */ + +static int underflow(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + uchar *anc_buff, + my_off_t leaf_page,/* Ancestor page and underflow page */ + uchar *leaf_buff, + uchar *keypos) /* Position to pos after key */ +{ + int t_length; + uint length,anc_length,buff_length,leaf_length,p_length,s_length,nod_flag, + key_reflength,key_length; + my_off_t next_page; + uchar anc_key[HA_MAX_KEY_BUFF],leaf_key[HA_MAX_KEY_BUFF]; + uchar *buff,*endpos,*next_keypos,*anc_pos,*half_pos,*temp_pos,*prev_key; + uchar *after_key; + MARIA_KEY_PARAM s_temp; + MARIA_SHARE *share=info->s; + DBUG_ENTER("underflow"); + DBUG_PRINT("enter",("leaf_page: %ld keypos: 0x%lx",(long) leaf_page, + (ulong) keypos)); + DBUG_DUMP("anc_buff",anc_buff,maria_data_on_page(anc_buff)); + DBUG_DUMP("leaf_buff",leaf_buff,maria_data_on_page(leaf_buff)); + + buff=info->buff; + info->keyread_buff_used=1; + next_keypos=keypos; + nod_flag=_ma_test_if_nod(leaf_buff); + p_length=nod_flag+2; + anc_length= maria_data_on_page(anc_buff); + leaf_length= maria_data_on_page(leaf_buff); + 
key_reflength=share->base.key_reflength; + if (info->s->keyinfo+info->lastinx == keyinfo) + info->page_changed=1; + + if ((keypos < anc_buff+anc_length && (info->state->records & 1)) || + keypos == anc_buff+2+key_reflength) + { /* Use page right of anc-page */ + DBUG_PRINT("test",("use right page")); + + if (keyinfo->flag & HA_BINARY_PACK_KEY) + { + if (!(next_keypos= _ma_get_key(info, keyinfo, + anc_buff, buff, keypos, &length))) + goto err; + } + else + { + /* Got to end of found key */ + buff[0]=buff[1]=0; /* Avoid length error check if packed key */ + if (!(*keyinfo->get_key)(keyinfo,key_reflength,&next_keypos, + buff)) + goto err; + } + next_page= _ma_kpos(key_reflength,next_keypos); + if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff,0)) + goto err; + buff_length= maria_data_on_page(buff); + DBUG_DUMP("next",buff,buff_length); + + /* find keys to make a big key-page */ + bmove(next_keypos-key_reflength, buff+2, key_reflength); + if (!_ma_get_last_key(info,keyinfo,anc_buff,anc_key,next_keypos,&length) + || !_ma_get_last_key(info,keyinfo,leaf_buff,leaf_key, + leaf_buff+leaf_length,&length)) + goto err; + + /* merge pages and put parting key from anc_buff between */ + prev_key=(leaf_length == p_length ? 
(uchar*) 0 : leaf_key); + t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,buff+p_length, + prev_key, prev_key, + anc_key, &s_temp); + length=buff_length-p_length; + endpos=buff+length+leaf_length+t_length; + /* buff will always be larger than before !*/ + bmove_upp(endpos, buff+buff_length,length); + memcpy(buff, leaf_buff,(size_t) leaf_length); + (*keyinfo->store_key)(keyinfo,buff+leaf_length,&s_temp); + buff_length=(uint) (endpos-buff); + maria_putint(buff,buff_length,nod_flag); + + /* remove key from anc_buff */ + + if (!(s_length=remove_key(keyinfo,key_reflength,keypos,anc_key, + anc_buff+anc_length,(my_off_t *) 0))) + goto err; + + anc_length-=s_length; + maria_putint(anc_buff,anc_length,key_reflength); + + if (buff_length <= keyinfo->block_length) + { /* Keys in one page */ + memcpy(leaf_buff,buff,(size_t) buff_length); + if (_ma_dispose(info,keyinfo,next_page,DFLT_INIT_HITS)) + goto err; + } + else + { /* Page is full */ + endpos=anc_buff+anc_length; + DBUG_PRINT("test",("anc_buff: 0x%lx endpos: 0x%lx", + (long) anc_buff, (long) endpos)); + if (keypos != anc_buff+2+key_reflength && + !_ma_get_last_key(info,keyinfo,anc_buff,anc_key,keypos,&length)) + goto err; + if (!(half_pos= _ma_find_half_pos(nod_flag, keyinfo, buff, leaf_key, + &key_length, &after_key))) + goto err; + length=(uint) (half_pos-buff); + memcpy(leaf_buff,buff,(size_t) length); + maria_putint(leaf_buff,length,nod_flag); + + /* Correct new keypointer to leaf_page */ + half_pos=after_key; + _ma_kpointer(info,leaf_key+key_length,next_page); + /* Save key in anc_buff */ + prev_key=(keypos == anc_buff+2+key_reflength ? (uchar*) 0 : anc_key), + t_length=(*keyinfo->pack_key)(keyinfo,key_reflength, + (keypos == endpos ? 
(uchar*) 0 : + keypos), + prev_key, prev_key, + leaf_key, &s_temp); + if (t_length >= 0) + bmove_upp(endpos+t_length, endpos, (uint) (endpos-keypos)); + else + bmove(keypos,keypos-t_length,(uint) (endpos-keypos)+t_length); + (*keyinfo->store_key)(keyinfo,keypos,&s_temp); + maria_putint(anc_buff,(anc_length+=t_length),key_reflength); + + /* Store key first in new page */ + if (nod_flag) + bmove(buff+2,half_pos-nod_flag,(size_t) nod_flag); + if (!(*keyinfo->get_key)(keyinfo,nod_flag,&half_pos,leaf_key)) + goto err; + t_length=(int) (*keyinfo->pack_key)(keyinfo, nod_flag, (uchar*) 0, + (uchar*) 0, (uchar*) 0, + leaf_key, &s_temp); + /* t_length will always be > 0 for a new page !*/ + length=(uint) ((buff+maria_data_on_page(buff))-half_pos); + bmove(buff+p_length+t_length, half_pos, (size_t) length); + (*keyinfo->store_key)(keyinfo,buff+p_length,&s_temp); + maria_putint(buff,length+t_length+p_length,nod_flag); + + if (_ma_write_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff)) + goto err; + } + if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff)) + goto err; + DBUG_RETURN(anc_length <= ((info->quick_mode ? MARIA_MIN_BLOCK_LENGTH : + (uint) keyinfo->underflow_block_length))); + } + + DBUG_PRINT("test",("use left page")); + + keypos= _ma_get_last_key(info,keyinfo,anc_buff,anc_key,keypos,&length); + if (!keypos) + goto err; + next_page= _ma_kpos(key_reflength,keypos); + if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff,0)) + goto err; + buff_length= maria_data_on_page(buff); + endpos=buff+buff_length; + DBUG_DUMP("prev",buff,buff_length); + + /* find keys to make a big key-page */ + bmove(next_keypos - key_reflength, leaf_buff+2, key_reflength); + next_keypos=keypos; + if (!(*keyinfo->get_key)(keyinfo,key_reflength,&next_keypos, + anc_key)) + goto err; + if (!_ma_get_last_key(info,keyinfo,buff,leaf_key,endpos,&length)) + goto err; + + /* merge pages and put parting key from anc_buff between */ + prev_key=(leaf_length == p_length ? 
(uchar*) 0 : leaf_key); + t_length=(*keyinfo->pack_key)(keyinfo,nod_flag, + (leaf_length == p_length ? + (uchar*) 0 : leaf_buff+p_length), + prev_key, prev_key, + anc_key, &s_temp); + if (t_length >= 0) + bmove(endpos+t_length, leaf_buff+p_length, + (size_t) (leaf_length-p_length)); + else /* We gained space */ + bmove(endpos,leaf_buff+((int) p_length-t_length), + (size_t) (leaf_length-p_length+t_length)); + + (*keyinfo->store_key)(keyinfo,endpos,&s_temp); + buff_length=buff_length+leaf_length-p_length+t_length; + maria_putint(buff,buff_length,nod_flag); + + /* remove key from anc_buff */ + if (!(s_length= remove_key(keyinfo,key_reflength,keypos,anc_key, + anc_buff+anc_length,(my_off_t *) 0))) + goto err; + + anc_length-=s_length; + maria_putint(anc_buff,anc_length,key_reflength); + + if (buff_length <= keyinfo->block_length) + { /* Keys in one page */ + if (_ma_dispose(info,keyinfo,leaf_page,DFLT_INIT_HITS)) + goto err; + } + else + { /* Page is full */ + if (keypos == anc_buff+2+key_reflength) + anc_pos=0; /* First key */ + else if (!_ma_get_last_key(info,keyinfo,anc_buff,anc_pos=anc_key,keypos, + &length)) + goto err; + endpos= _ma_find_half_pos(nod_flag,keyinfo,buff,leaf_key, + &key_length, &half_pos); + if (!endpos) + goto err; + _ma_kpointer(info,leaf_key+key_length,leaf_page); + /* Save key in anc_buff */ + DBUG_DUMP("anc_buff",anc_buff,anc_length); + DBUG_DUMP("key_to_anc",leaf_key,key_length); + + temp_pos=anc_buff+anc_length; + t_length=(*keyinfo->pack_key)(keyinfo,key_reflength, + keypos == temp_pos ? 
(uchar*) 0 + : keypos, + anc_pos, anc_pos, + leaf_key,&s_temp); + if (t_length > 0) + bmove_upp(temp_pos+t_length, temp_pos, (uint) (temp_pos-keypos)); + else + bmove(keypos,keypos-t_length,(uint) (temp_pos-keypos)+t_length); + (*keyinfo->store_key)(keyinfo,keypos,&s_temp); + maria_putint(anc_buff,(anc_length+=t_length),key_reflength); + + /* Store first key on new page */ + if (nod_flag) + bmove(leaf_buff+2,half_pos-nod_flag,(size_t) nod_flag); + if (!(length=(*keyinfo->get_key)(keyinfo,nod_flag,&half_pos,leaf_key))) + goto err; + DBUG_DUMP("key_to_leaf",leaf_key,length); + t_length=(*keyinfo->pack_key)(keyinfo,nod_flag, (uchar*) 0, + (uchar*) 0, (uchar*) 0, leaf_key, &s_temp); + length=(uint) ((buff+buff_length)-half_pos); + DBUG_PRINT("info",("t_length: %d length: %d",t_length,(int) length)); + bmove(leaf_buff+p_length+t_length,half_pos, + (size_t) length); + (*keyinfo->store_key)(keyinfo,leaf_buff+p_length,&s_temp); + maria_putint(leaf_buff,length+t_length+p_length,nod_flag); + if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff)) + goto err; + maria_putint(buff,endpos-buff,nod_flag); + } + if (_ma_write_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff)) + goto err; + DBUG_RETURN(anc_length <= (uint) keyinfo->block_length/2); + +err: + DBUG_RETURN(-1); +} /* underflow */ + + + /* + Remove a key from a packed key buffer. + The current code doesn't handle the case that the next key may be + packed better against the previous key if there is a case difference. + Returns the number of bytes removed from the page, or 0 on error. + */ + +static uint remove_key(MARIA_KEYDEF *keyinfo, uint nod_flag, + uchar *keypos, /* Where key starts */ + uchar *lastkey, /* key to be removed */ + uchar *page_end, /* End of page */ + my_off_t *next_block) /* ptr to next block */ +{ + int s_length; + uchar *start; + DBUG_ENTER("remove_key"); + DBUG_PRINT("enter",("keypos: 0x%lx page_end: 0x%lx",(long) keypos, (long) page_end)); + + start=keypos; + if (!(keyinfo->flag & + (HA_PACK_KEY | 
HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY | + HA_BINARY_PACK_KEY))) + { + s_length=(int) (keyinfo->keylength+nod_flag); + if (next_block && nod_flag) + *next_block= _ma_kpos(nod_flag,keypos+s_length); + } + else + { /* Let keypos point at next key */ + /* Calculate length of key */ + if (!(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,lastkey)) + DBUG_RETURN(0); /* Error */ + + if (next_block && nod_flag) + *next_block= _ma_kpos(nod_flag,keypos); + s_length=(int) (keypos-start); + if (keypos != page_end) + { + if (keyinfo->flag & HA_BINARY_PACK_KEY) + { + uchar *old_key=start; + uint next_length,prev_length,prev_pack_length; + get_key_length(next_length,keypos); + get_key_pack_length(prev_length,prev_pack_length,old_key); + if (next_length > prev_length) + { + /* We have to copy data from the current key to the next key */ + bmove_upp((char*) keypos,(char*) (lastkey+next_length), + (next_length-prev_length)); + keypos-=(next_length-prev_length)+prev_pack_length; + store_key_length(keypos,prev_length); + s_length=(int) (keypos-start); + } + } + else + { + /* Check if a variable length first key part */ + if ((keyinfo->seg->flag & HA_PACK_KEY) && *keypos & 128) + { + /* Next key is packed against the current one */ + uint next_length,prev_length,prev_pack_length,lastkey_length, + rest_length; + if (keyinfo->seg[0].length >= 127) + { + if (!(prev_length=mi_uint2korr(start) & 32767)) + goto end; + next_length=mi_uint2korr(keypos) & 32767; + keypos+=2; + prev_pack_length=2; + } + else + { + if (!(prev_length= *start & 127)) + goto end; /* Same key as previous*/ + next_length= *keypos & 127; + keypos++; + prev_pack_length=1; + } + if (!(*start & 128)) + prev_length=0; /* prev key not packed */ + if (keyinfo->seg[0].flag & HA_NULL_PART) + lastkey++; /* Skip null marker */ + get_key_length(lastkey_length,lastkey); + if (!next_length) /* Same key after */ + { + next_length=lastkey_length; + rest_length=0; + } + else + get_key_length(rest_length,keypos); + + if (next_length >= 
prev_length) + { /* Key after is based on deleted key */ + uint pack_length,tmp; + bmove_upp((char*) keypos,(char*) (lastkey+next_length), + tmp=(next_length-prev_length)); + rest_length+=tmp; + pack_length= prev_length ? get_pack_length(rest_length): 0; + keypos-=tmp+pack_length+prev_pack_length; + s_length=(int) (keypos-start); + if (prev_length) /* Pack against prev key */ + { + *keypos++= start[0]; + if (prev_pack_length == 2) + *keypos++= start[1]; + store_key_length(keypos,rest_length); + } + else + { + /* Next key is not packed anymore */ + if (keyinfo->seg[0].flag & HA_NULL_PART) + { + rest_length++; /* Mark not null */ + } + if (prev_pack_length == 2) + { + mi_int2store(keypos,rest_length); + } + else + *keypos= rest_length; + } + } + } + } + } + } + end: + bmove(start, start+s_length, (uint) (page_end-start-s_length)); + DBUG_RETURN((uint) s_length); +} /* remove_key */ diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c new file mode 100644 index 00000000000..8cb4fdb8a3e --- /dev/null +++ b/storage/maria/ma_delete_all.c @@ -0,0 +1,161 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Remove all rows from a MARIA table */ +/* This clears the status information and truncates files */ + +#include "maria_def.h" +#include "trnman.h" + +/** + @brief deletes all rows from a table + + @param info Maria handler + + @return Operation status + @retval 0 ok + @retval my_errno error (for example EACCES when the table is read-only) +*/ + +int maria_delete_all_rows(MARIA_HA *info) +{ + MARIA_SHARE *share=info->s; + my_bool log_record; + DBUG_ENTER("maria_delete_all_rows"); + + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + DBUG_RETURN(my_errno=EACCES); + } + /** + @todo LOCK take X-lock on table here. + When we have versioning, if some other thread is looking at this table, + we cannot shrink the file like this. + */ + if (_ma_readinfo(info,F_WRLCK,1)) + DBUG_RETURN(my_errno); + log_record= share->now_transactional && !share->temporary; + if (_ma_mark_file_changed(info)) + goto err; + + if (log_record) + { + /* + This record will be used by Recovery to finish the deletion if it + crashed. We force it because it's a non-undoable operation. + */ + LSN lsn; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + uchar log_data[FILEID_STORE_SIZE]; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DELETE_ALL, + info->trn, info, 0, + sizeof(log_array)/sizeof(log_array[0]), + log_array, log_data) || + translog_flush(lsn))) + goto err; + } + + /* + For recovery it matters that this is called after writing the log record, + so that resetting state.records actually happens under log's mutex. + */ + _ma_reset_status(info); + + /* + If we are using delayed keys or if the user has done changes to the tables + since it was locked then there may be key blocks in the page cache. 
Or + there may be data blocks there. We need to throw them away or they may + re-enter the emptied table later. + */ + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX, + FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED) || + my_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) || + my_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME)) ) + goto err; + + if (_ma_initialize_data_file(share, info->dfile.file)) + goto err; + + /* + The operations above on the index/data file will be forced to disk at + Checkpoint or maria_close() time. So we can reset: + */ + info->trn->rec_lsn= LSN_IMPOSSIBLE; + + VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); +#ifdef HAVE_MMAP + /* Resize mmaped area */ + rw_wrlock(&info->s->mmap_lock); + _ma_remap_file(info, (my_off_t)0); + rw_unlock(&info->s->mmap_lock); +#endif + allow_break(); /* Allow SIGHUP & SIGINT */ + DBUG_RETURN(0); + +err: + { + int save_errno=my_errno; + VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); + info->update|=HA_STATE_WRITTEN; /* Buffer changed */ + /** + @todo RECOVERY if we come here, Recovery may later apply the REDO above, + which may be wrong. Not fixing it now, as anyway this way of deleting + rows will have to be re-examined when we have versioning. + */ + allow_break(); /* Allow SIGHUP & SIGINT */ + DBUG_RETURN(my_errno=save_errno); + } +} /* maria_delete_all_rows */ + + +/* + Reset status information + + SYNOPSIS + _ma_reset_status() + info Maria handler + + DESCRIPTION + Resets data and index file information as if the files were empty. + Files are not touched. 
+*/ + +void _ma_reset_status(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + MARIA_STATE_INFO *state= &share->state; + uint i; + + info->state->records= info->state->del= state->split= 0; + state->changed= 0; /* File is optimized */ + state->dellink= HA_OFFSET_ERROR; + state->sortkey= (ushort) ~0; + info->state->key_file_length= share->base.keystart; + info->state->data_file_length= 0; + info->state->empty= info->state->key_empty= 0; + /** + @todo RECOVERY BUG + the line below must happen under log's mutex when writing the REDO + */ + info->state->checksum= 0; + + /* Drop the delete key chain. */ + state->key_del= HA_OFFSET_ERROR; + /* Clear all keys */ + for (i=0 ; i < share->base.keys ; i++) + state->key_root[i]= HA_OFFSET_ERROR; +} diff --git a/storage/maria/ma_delete_table.c b/storage/maria/ma_delete_table.c new file mode 100644 index 00000000000..693c68c7e5f --- /dev/null +++ b/storage/maria/ma_delete_table.c @@ -0,0 +1,111 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "ma_fulltext.h" +#include "trnman_public.h" + +/** + @brief drops (deletes) a table + + @param name table's name + + @return Operation status + @retval 0 ok + @retval 1 error writing or flushing the log record + @retval my_errno error deleting one of the table files +*/ + +int maria_delete_table(const char *name) +{ + char from[FN_REFLEN]; +#ifdef USE_RAID + uint raid_type=0,raid_chunks=0; +#endif + MARIA_HA *info; + myf sync_dir; + DBUG_ENTER("maria_delete_table"); + +#ifdef EXTRA_DEBUG + _ma_check_table_is_closed(name,"delete"); +#endif + /** @todo LOCK take X-lock on table */ + /* + We need to know if this table is transactional. + When built with RAID support, we also need to determine if this table + makes use of the raid feature. If yes, we need to remove all raid + chunks. This is done with my_raid_delete(). Unfortunately it is + necessary to open the table just to check this. We use + 'open_for_repair' to be able to open even a crashed table. If even + this open fails, we assume no raid configuration for this table + and try to remove the normal data file only. This may however + leave the raid chunks behind. + */ + if (!(info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR))) + { +#ifdef USE_RAID + raid_type= 0; +#endif + sync_dir= 0; + } + else + { +#ifdef USE_RAID + raid_type= info->s->base.raid_type; + raid_chunks= info->s->base.raid_chunks; +#endif + sync_dir= (info->s->now_transactional && !info->s->temporary && + !maria_in_recovery) ? + MY_SYNC_DIR : 0; + maria_close(info); + } +#ifdef USE_RAID +#ifdef EXTRA_DEBUG + _ma_check_table_is_closed(name,"delete"); +#endif +#endif /* USE_RAID */ + + if (sync_dir) + { + /* + For this log record to be of any use for Recovery, we need the upper + MySQL layer to be crash-safe in DDLs. + For now this record can serve when we apply logs to a backup, so we sync + it. 
+ */ + LSN lsn; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char *)name; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= strlen(name) + 1; + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DROP_TABLE, + &dummy_transaction_object, NULL, + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL) || + translog_flush(lsn))) + DBUG_RETURN(1); + } + + fn_format(from,name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + if (my_delete_with_symlink(from, MYF(MY_WME | sync_dir))) + DBUG_RETURN(my_errno); + fn_format(from,name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); +#ifdef USE_RAID + if (raid_type) + DBUG_RETURN(my_raid_delete(from, raid_chunks, MYF(MY_WME | sync_dir)) ? + my_errno : 0); +#endif + DBUG_RETURN(my_delete_with_symlink(from, MYF(MY_WME | sync_dir)) ? + my_errno : 0); +} diff --git a/storage/maria/ma_dynrec.c b/storage/maria/ma_dynrec.c new file mode 100644 index 00000000000..6e13fbcecb6 --- /dev/null +++ b/storage/maria/ma_dynrec.c @@ -0,0 +1,1972 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Functions to handle space-packed-records and blobs + + A row may be stored in one or more linked blocks. 
+ The block size is between MARIA_MIN_BLOCK_LENGTH and MARIA_MAX_BLOCK_LENGTH. + Each block is aligned on MARIA_DYN_ALIGN_SIZE. + The reson for the max block size is to not have too many different types + of blocks. For the differnet block types, look at _ma_get_block_info() +*/ + +#include "maria_def.h" + +static my_bool write_dynamic_record(MARIA_HA *info,const uchar *record, + ulong reclength); +static int _ma_find_writepos(MARIA_HA *info,ulong reclength,my_off_t *filepos, + ulong *length); +static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos, + uchar *record, ulong reclength); +static my_bool delete_dynamic_record(MARIA_HA *info,MARIA_RECORD_POS filepos, + uint second_read); +static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos, + uint length); + +#ifdef THREAD +/* Play it safe; We have a small stack when using threads */ +#undef my_alloca +#undef my_afree +#define my_alloca(A) my_malloc((A),MYF(0)) +#define my_afree(A) my_free((A),MYF(0)) +#endif + + /* Interface function from MARIA_HA */ + +#ifdef HAVE_MMAP + +/* + Create mmaped area for MARIA handler + + SYNOPSIS + _ma_dynmap_file() + info MARIA handler + + RETURN + 0 ok + 1 error. +*/ + +my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size) +{ + DBUG_ENTER("_ma_dynmap_file"); + if (size > (my_off_t) (~((size_t) 0)) - MEMMAP_EXTRA_MARGIN) + { + DBUG_PRINT("warning", ("File is too large for mmap")); + DBUG_RETURN(1); + } + /* + Ingo wonders if it is good to use MAP_NORESERVE. From the Linux man page: + MAP_NORESERVE + Do not reserve swap space for this mapping. When swap space is + reserved, one has the guarantee that it is possible to modify the + mapping. When swap space is not reserved one might get SIGSEGV + upon a write if no physical memory is available. + */ + info->s->file_map= (uchar*) + my_mmap(0, (size_t)(size + MEMMAP_EXTRA_MARGIN), + info->s->mode==O_RDONLY ? 
PROT_READ : + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_NORESERVE, + info->dfile.file, 0L); + if (info->s->file_map == (uchar*) MAP_FAILED) + { + info->s->file_map= NULL; + DBUG_RETURN(1); + } +#if defined(HAVE_MADVISE) + madvise(info->s->file_map, size, MADV_RANDOM); +#endif + info->s->mmaped_length= size; + DBUG_RETURN(0); +} + + +/* + Resize mmaped area for MARIA handler + + SYNOPSIS + _ma_remap_file() + info MARIA handler + size New size, passed on to _ma_dynmap_file() + + RETURN + none; nothing is done if the file is not mapped, and the result of + _ma_dynmap_file() is not checked +*/ + +void _ma_remap_file(MARIA_HA *info, my_off_t size) +{ + if (info->s->file_map) + { + VOID(my_munmap(info->s->file_map, + (size_t) info->s->mmaped_length + MEMMAP_EXTRA_MARGIN)); + _ma_dynmap_file(info, size); + } +} +#endif + + +/* + Read bytes from MARIA handler, using mmap or pread + + SYNOPSIS + _ma_mmap_pread() + info MARIA handler + Buffer Buffer the bytes are copied into + Count Count of bytes for read + offset Start position + MyFlags Flags passed to my_pread() when the mmaped area is not used + + RETURN + 0 ok (bytes were copied from the mmaped area) + else the return value of my_pread() +*/ + +uint _ma_mmap_pread(MARIA_HA *info, uchar *Buffer, + uint Count, my_off_t offset, myf MyFlags) +{ + DBUG_PRINT("info", ("maria_read with mmap %d\n", info->dfile.file)); + if (info->s->concurrent_insert) + rw_rdlock(&info->s->mmap_lock); + + /* + The following test may fail in the following cases: + - We failed to remap a memory area (fragmented memory?) + - This thread has done some writes, but not yet extended the + memory mapped area. 
+ */ + + if (info->s->mmaped_length >= offset + Count) + { + memcpy(Buffer, info->s->file_map + offset, Count); + if (info->s->concurrent_insert) + rw_unlock(&info->s->mmap_lock); + return 0; + } + else + { + if (info->s->concurrent_insert) + rw_unlock(&info->s->mmap_lock); + return my_pread(info->dfile.file, Buffer, Count, offset, MyFlags); + } +} + + + /* Wrapper for my_pread(), used when mmap is not used */ + +uint _ma_nommap_pread(MARIA_HA *info, uchar *Buffer, + uint Count, my_off_t offset, myf MyFlags) +{ + return my_pread(info->dfile.file, Buffer, Count, offset, MyFlags); +} + + +/* + Write bytes to MARIA handler, using mmap or pwrite + + SYNOPSIS + _ma_mmap_pwrite() + info MARIA handler + Buffer Buffer holding the bytes to write + Count Count of bytes for write + offset Start position + MyFlags Flags passed to my_pwrite() when the mmaped area is not used + + RETURN + 0 ok + !=0 error. In this case return error from pwrite +*/ + +uint _ma_mmap_pwrite(MARIA_HA *info, uchar *Buffer, + uint Count, my_off_t offset, myf MyFlags) +{ + DBUG_PRINT("info", ("maria_write with mmap %d\n", info->dfile.file)); + if (info->s->concurrent_insert) + rw_rdlock(&info->s->mmap_lock); + + /* + The following test may fail in the following cases: + - We failed to remap a memory area (fragmented memory?) + - This thread has done some writes, but not yet extended the + memory mapped area. 
+ */ + + if (info->s->mmaped_length >= offset + Count) + { + memcpy(info->s->file_map + offset, Buffer, Count); + if (info->s->concurrent_insert) + rw_unlock(&info->s->mmap_lock); + return 0; + } + else + { + info->s->nonmmaped_inserts++; + if (info->s->concurrent_insert) + rw_unlock(&info->s->mmap_lock); + return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags); + } + +} + + + /* wrapper for my_pwrite in case if mmap isn't used */ + +uint _ma_nommap_pwrite(MARIA_HA *info, uchar *Buffer, + uint Count, my_off_t offset, myf MyFlags) +{ + return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags); +} + + +my_bool _ma_write_dynamic_record(MARIA_HA *info, const uchar *record) +{ + ulong reclength= _ma_rec_pack(info,info->rec_buff + MARIA_REC_BUFF_OFFSET, + record); + return (write_dynamic_record(info,info->rec_buff + MARIA_REC_BUFF_OFFSET, + reclength)); +} + +my_bool _ma_update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec __attribute__ ((unused)), + const uchar *record) +{ + uint length= _ma_rec_pack(info, info->rec_buff + MARIA_REC_BUFF_OFFSET, + record); + return (update_dynamic_record(info, pos, + info->rec_buff + MARIA_REC_BUFF_OFFSET, + length)); +} + + +my_bool _ma_write_blob_record(MARIA_HA *info, const uchar *record) +{ + uchar *rec_buff; + int error; + ulong reclength,reclength2,extra; + + extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+ + MARIA_DYN_DELETE_BLOCK_HEADER+1); + reclength= (info->s->base.pack_reclength + + _ma_calc_total_blob_length(info,record)+ extra); + if (!(rec_buff=(uchar*) my_alloca(reclength))) + { + my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */ + return(1); + } + reclength2= _ma_rec_pack(info, + rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + record); + DBUG_PRINT("info",("reclength: %lu reclength2: %lu", + reclength, reclength2)); + DBUG_ASSERT(reclength2 <= reclength); + error= write_dynamic_record(info, + rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + 
reclength2); + my_afree(rec_buff); + return(error != 0); +} + + +my_bool _ma_update_blob_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec __attribute__ ((unused)), + const uchar *record) +{ + uchar *rec_buff; + int error; + ulong reclength,extra; + + extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+ + MARIA_DYN_DELETE_BLOCK_HEADER); + reclength= (info->s->base.pack_reclength+ + _ma_calc_total_blob_length(info,record)+ extra); +#ifdef NOT_USED /* We now support big rows */ + if (reclength > MARIA_DYN_MAX_ROW_LENGTH) + { + my_errno=HA_ERR_TO_BIG_ROW; + return 1; + } +#endif + if (!(rec_buff=(uchar*) my_alloca(reclength))) + { + my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */ + return(1); + } + reclength= _ma_rec_pack(info,rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + record); + error=update_dynamic_record(info,pos, + rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + reclength); + my_afree(rec_buff); + return(error != 0); +} + + +my_bool _ma_delete_dynamic_record(MARIA_HA *info, + const uchar *record __attribute__ ((unused))) +{ + return delete_dynamic_record(info, info->cur_row.lastpos, 0); +} + + + /* Write record to data-file */ + +static my_bool write_dynamic_record(MARIA_HA *info, const uchar *record, + ulong reclength) +{ + int flag; + ulong length; + my_off_t filepos; + DBUG_ENTER("write_dynamic_record"); + + flag=0; + do + { + if (_ma_find_writepos(info,reclength,&filepos,&length)) + goto err; + if (_ma_write_part_record(info,filepos,length, + (info->append_insert_at_end ? + HA_OFFSET_ERROR : info->s->state.dellink), + (uchar**) &record,&reclength,&flag)) + goto err; + } while (reclength); + + DBUG_RETURN(0); +err: + DBUG_RETURN(1); +} + + + /* Get a block for data ; The given data-area must be used !! 
*/
+/*
+  On success *filepos and *length describe the block to write into:
+  - the first block of the delete chain (state.dellink), which is then
+    taken out of the chain, or
+  - when there is no delete chain or info->append_insert_at_end is set,
+    a new block at the current end of the data file.
+
+  RETURN
+    0   ok
+    -1  error; my_errno is HA_ERR_WRONG_IN_RECORD (block at dellink is not
+        a deleted block) or HA_ERR_RECORD_FILE_FULL
+*/
+static int _ma_find_writepos(MARIA_HA *info,
+                             ulong reclength, /* record length */
+                             my_off_t *filepos, /* Return file pos */
+                             ulong *length)   /* length of block at filepos */
+{
+  MARIA_BLOCK_INFO block_info;
+  ulong tmp;
+  DBUG_ENTER("_ma_find_writepos");
+
+  if (info->s->state.dellink != HA_OFFSET_ERROR &&
+      !info->append_insert_at_end)
+  {
+    /* Deleted blocks exist; reuse the one at the head of the delete chain */
+    *filepos=info->s->state.dellink;
+    block_info.second_read=0;
+    info->rec_cache.seek_not_done=1;
+    if (!(_ma_get_block_info(&block_info, info->dfile.file,
+                             info->s->state.dellink) &
+          BLOCK_DELETED))
+    {
+      DBUG_PRINT("error",("Delete link crashed"));
+      my_errno=HA_ERR_WRONG_IN_RECORD;
+      DBUG_RETURN(-1);
+    }
+    /* Take the block out of the chain and out of the deleted statistics */
+    info->s->state.dellink=block_info.next_filepos;
+    info->state->del--;
+    info->state->empty-= block_info.block_len;
+    *length= block_info.block_len;
+  }
+  else
+  {
+    /* No deleted blocks;  Allocate a new block */
+    *filepos=info->state->data_file_length;
+    /*
+      Data length + header (3 bytes, 4 for long blocks); at least
+      min_block_length, otherwise rounded up to MARIA_DYN_ALIGN_SIZE
+    */
+    if ((tmp=reclength+3 + test(reclength >= (65520-3))) <
+        info->s->base.min_block_length)
+      tmp= info->s->base.min_block_length;
+    else
+      tmp= ((tmp+MARIA_DYN_ALIGN_SIZE-1) &
+            (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1)));
+    if (info->state->data_file_length >
+        (info->s->base.max_data_file_length - tmp))
+    {
+      my_errno=HA_ERR_RECORD_FILE_FULL;
+      DBUG_RETURN(-1);
+    }
+    if (tmp > MARIA_MAX_BLOCK_LENGTH)
+      tmp=MARIA_MAX_BLOCK_LENGTH;               /* Rest goes to later blocks */
+    *length= tmp;
+    info->state->data_file_length+= tmp;
+    info->s->state.split++;
+    info->update|=HA_STATE_WRITE_AT_END;
+  }
+  DBUG_RETURN(0);
+} /* _ma_find_writepos */
+
+
+
+/*
+  Unlink a deleted block from the deleted list.
+  This block will be combined with the preceding or next block to form
+  a big block.
+
+  The neighbours in the delete chain are patched on disk through
+  s->file_write(): the 8 bytes at header offset 4 of the previous block
+  and the 8 bytes at header offset 12 of the next block.
+  state->del, state->empty and state.split are decreased.
+
+  RETURN
+    0  ok
+    1  error (a neighbour is not a deleted block, or the write failed)
+*/
+
+static bool unlink_deleted_block(MARIA_HA *info, MARIA_BLOCK_INFO *block_info)
+{
+  DBUG_ENTER("unlink_deleted_block");
+  if (block_info->filepos == info->s->state.dellink)
+  {
+    /* First deleted block;  We can just use this ! */
+    info->s->state.dellink=block_info->next_filepos;
+  }
+  else
+  {
+    MARIA_BLOCK_INFO tmp;
+    tmp.second_read=0;
+    /* Unlink block from the previous block */
+    if (!(_ma_get_block_info(&tmp, info->dfile.file, block_info->prev_filepos)
+          & BLOCK_DELETED))
+      DBUG_RETURN(1);                           /* Something is wrong */
+    mi_sizestore(tmp.header+4,block_info->next_filepos);
+    if (info->s->file_write(info,(char*) tmp.header+4,8,
+                            block_info->prev_filepos+4, MYF(MY_NABP)))
+      DBUG_RETURN(1);
+    /* Unlink block from next block */
+    if (block_info->next_filepos != HA_OFFSET_ERROR)
+    {
+      if (!(_ma_get_block_info(&tmp, info->dfile.file,
+                               block_info->next_filepos)
+            & BLOCK_DELETED))
+        DBUG_RETURN(1);                         /* Something is wrong */
+      mi_sizestore(tmp.header+12,block_info->prev_filepos);
+      if (info->s->file_write(info,(char*) tmp.header+12,8,
+                              block_info->next_filepos+12,
+                              MYF(MY_NABP)))
+        DBUG_RETURN(1);
+    }
+  }
+  /* We now have one less deleted block */
+  info->state->del--;
+  info->state->empty-= block_info->block_len;
+  info->s->state.split--;
+
+  /*
+    If this was a block that we were accessing through a table scan
+    (maria_rrnd() or maria_scan()), then ensure that we skip over this block
+    when doing next maria_rrnd() or maria_scan().
+  */
+  if (info->cur_row.nextpos == block_info->filepos)
+    info->cur_row.nextpos+= block_info->block_len;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Add a backward link to delete block
+
+  SYNOPSIS
+    update_backward_delete_link()
+    info                MARIA handler
+    delete_block        Position to delete block to update.
+                        If this is 'HA_OFFSET_ERROR', nothing will be done
+    filepos             Position to block that 'delete_block' should point to
+
+  RETURN
+    0  ok
+    1  error.  In this case my_error is set. 
+*/
+
+static my_bool update_backward_delete_link(MARIA_HA *info,
+                                           my_off_t delete_block,
+                                           MARIA_RECORD_POS filepos)
+{
+  MARIA_BLOCK_INFO block_info;
+  DBUG_ENTER("update_backward_delete_link");
+
+  if (delete_block != HA_OFFSET_ERROR)
+  {
+    block_info.second_read=0;
+    if (_ma_get_block_info(&block_info, info->dfile.file, delete_block)
+        & BLOCK_DELETED)
+    {
+      /* Store 'filepos' in the 8 bytes at offset 12 of the block header */
+      char buff[8];
+      mi_sizestore(buff,filepos);
+      if (info->s->file_write(info,buff, 8, delete_block+12, MYF(MY_NABP)))
+        DBUG_RETURN(1);                         /* Error on write */
+    }
+    else
+    {
+      my_errno=HA_ERR_WRONG_IN_RECORD;
+      DBUG_RETURN(1);                           /* Wrong delete link */
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+/*
+  Delete datarecord from database
+
+  Every block of the row starting at 'filepos' is rewritten as a deleted
+  block (header[0]= 0) and put first in the delete chain.  A deleted block
+  that follows directly after a block of the row is merged into it as long
+  as the combined length stays below MARIA_DYN_MAX_BLOCK_LENGTH.
+
+  RETURN
+    0  ok
+    1  error (my_errno is HA_ERR_WRONG_IN_RECORD if a block of the row
+       could not be used)
+*/
+/* info->rec_cache.seek_not_done is updated in cmp_record */
+
+static my_bool delete_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos,
+                                     uint second_read)
+{
+  uint length,b_type;
+  MARIA_BLOCK_INFO block_info,del_block;
+  int error;
+  my_bool remove_next_block;
+  DBUG_ENTER("delete_dynamic_record");
+
+  /* First add a link from the last block to the new one */
+  error= update_backward_delete_link(info, info->s->state.dellink, filepos);
+
+  block_info.second_read=second_read;
+  do
+  {
+    /* Remove block at 'filepos' */
+    if ((b_type= _ma_get_block_info(&block_info, info->dfile.file, filepos))
+        & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+           BLOCK_FATAL_ERROR) ||
+        (length=(uint) (block_info.filepos-filepos) +block_info.block_len) <
+        MARIA_MIN_BLOCK_LENGTH)
+    {
+      my_errno=HA_ERR_WRONG_IN_RECORD;
+      DBUG_RETURN(1);
+    }
+    /* Check if next block is a delete block */
+    del_block.second_read=0;
+    remove_next_block=0;
+    if (_ma_get_block_info(&del_block, info->dfile.file, filepos + length) &
+        BLOCK_DELETED && del_block.block_len+length <
+        MARIA_DYN_MAX_BLOCK_LENGTH)
+    {
+      /* We can't remove this yet as this block may be the head block */
+      remove_next_block=1;
+      length+=del_block.block_len;
+    }
+
+    /* Build the 20 byte delete header: type 0, length, next and prev link */
+    block_info.header[0]=0;
+    mi_int3store(block_info.header+1,length);
+    mi_sizestore(block_info.header+4,info->s->state.dellink);
+    if (b_type & BLOCK_LAST)
+      bfill(block_info.header+12,8,255);
+    else
+      mi_sizestore(block_info.header+12,block_info.next_filepos);
+    if (info->s->file_write(info,(uchar*) block_info.header,20,filepos,
+                            MYF(MY_NABP)))
+      DBUG_RETURN(1);
+    info->s->state.dellink = filepos;
+    info->state->del++;
+    info->state->empty+=length;
+    filepos=block_info.next_filepos;
+
+    /* Now it's safe to unlink the deleted block directly after this one */
+    if (remove_next_block && unlink_deleted_block(info,&del_block))
+      error=1;
+  } while (!(b_type & BLOCK_LAST));
+
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Write a block to datafile
+
+  Writes the next part of *record into the block at 'filepos'.  The block
+  header is built in 'temp' and copied in front of the data so that header
+  and data go out in one write.  If the block is more than
+  MARIA_SPLIT_LENGTH longer than the data left, the unused tail is split
+  off and put first in the delete chain.
+
+  Header cases handled below:
+  - block has exactly the right length
+  - block is too short: the header also stores the position of the next
+    block
+  - block is longer than needed: the header stores the unused length
+
+  On return *record and *reclength are advanced past the written data and
+  *flag is set to 6.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+int _ma_write_part_record(MARIA_HA *info,
+                          my_off_t filepos,     /* points at empty block */
+                          ulong length,         /* length of block */
+                          my_off_t next_filepos,/* Next empty block */
+                          uchar **record,       /* pointer to record ptr */
+                          ulong *reclength,     /* length of *record */
+                          int *flag)            /* *flag == 0 if header */
+{
+  ulong head_length,res_length,extra_length,long_block,del_length;
+  uchar *pos,*record_end;
+  my_off_t next_delete_block;
+  uchar temp[MARIA_SPLIT_LENGTH+MARIA_DYN_DELETE_BLOCK_HEADER];
+  DBUG_ENTER("_ma_write_part_record");
+
+  next_delete_block=HA_OFFSET_ERROR;
+
+  res_length=extra_length=0;
+  if (length > *reclength + MARIA_SPLIT_LENGTH)
+  {                                             /* Split big block */
+    res_length=MY_ALIGN(length- *reclength - MARIA_EXTEND_BLOCK_LENGTH,
+                        MARIA_DYN_ALIGN_SIZE);
+    length-= res_length;                        /* Use this for first part */
+  }
+  long_block= (length < 65520L && *reclength < 65520L) ? 0 : 1;
+  if (length == *reclength+ 3 + long_block)
+  {
+    /* Block is exactly of the right length */
+    temp[0]=(uchar) (1+ *flag)+(uchar) long_block;      /* Flag is 0 or 6 */
+    if (long_block)
+    {
+      mi_int3store(temp+1,*reclength);
+      head_length=4;
+    }
+    else
+    {
+      mi_int2store(temp+1,*reclength);
+      head_length=3;
+    }
+  }
+  else if (length-long_block < *reclength+4)
+  {                                             /* Too short block */
+    if (next_filepos == HA_OFFSET_ERROR)
+      next_filepos= (info->s->state.dellink != HA_OFFSET_ERROR &&
+                     !info->append_insert_at_end ?
+                     info->s->state.dellink : info->state->data_file_length);
+    if (*flag == 0)                             /* First block */
+    {
+      if (*reclength > MARIA_MAX_BLOCK_LENGTH)
+      {
+        head_length= 16;
+        temp[0]=13;
+        mi_int4store(temp+1,*reclength);
+        mi_int3store(temp+5,length-head_length);
+        mi_sizestore((uchar*) temp+8,next_filepos);
+      }
+      else
+      {
+        head_length=5+8+long_block*2;
+        temp[0]=5+(uchar) long_block;
+        if (long_block)
+        {
+          mi_int3store(temp+1,*reclength);
+          mi_int3store(temp+4,length-head_length);
+          mi_sizestore((uchar*) temp+7,next_filepos);
+        }
+        else
+        {
+          mi_int2store(temp+1,*reclength);
+          mi_int2store(temp+3,length-head_length);
+          mi_sizestore((uchar*) temp+5,next_filepos);
+        }
+      }
+    }
+    else
+    {
+      head_length=3+8+long_block;
+      temp[0]=11+(uchar) long_block;
+      if (long_block)
+      {
+        mi_int3store(temp+1,length-head_length);
+        mi_sizestore((uchar*) temp+4,next_filepos);
+      }
+      else
+      {
+        mi_int2store(temp+1,length-head_length);
+        mi_sizestore((uchar*) temp+3,next_filepos);
+      }
+    }
+  }
+  else
+  {                                             /* Block with empty info last */
+    head_length=4+long_block;
+    extra_length= length- *reclength-head_length;
+    temp[0]= (uchar) (3+ *flag)+(uchar) long_block; /* 3,4 or 9,10 */
+    if (long_block)
+    {
+      mi_int3store(temp+1,*reclength);
+      temp[4]= (uchar) (extra_length);
+    }
+    else
+    {
+      mi_int2store(temp+1,*reclength);
+      temp[3]= (uchar) (extra_length);
+    }
+    length= *reclength+head_length;             /* Write only what is needed */
+  }
+  DBUG_DUMP("header",(uchar*) temp,head_length);
+
+  /*
+    Make a long block for one write: the header is copied into the bytes in
+    front of *record, and the record bytes that are overwritten after the
+    data (zero fill and delete header) are saved in 'temp' and restored
+    after the write.
+  */
+  record_end= *record+length-head_length;
+  del_length=(res_length ? MARIA_DYN_DELETE_BLOCK_HEADER : 0);
+  bmove((uchar*) (*record-head_length),(uchar*) temp,head_length);
+  memcpy(temp,record_end,(size_t) (extra_length+del_length));
+  bzero((uchar*) record_end,extra_length);
+
+  if (res_length)
+  {
+    /* Check first if we can join this block with the next one */
+    MARIA_BLOCK_INFO del_block;
+    my_off_t next_block=filepos+length+extra_length+res_length;
+
+    del_block.second_read=0;
+    if (next_block < info->state->data_file_length &&
+        info->s->state.dellink != HA_OFFSET_ERROR)
+    {
+      if ((_ma_get_block_info(&del_block, info->dfile.file, next_block)
+           & BLOCK_DELETED) &&
+          res_length + del_block.block_len < MARIA_DYN_MAX_BLOCK_LENGTH)
+      {
+        if (unlink_deleted_block(info,&del_block))
+          goto err;
+        res_length+=del_block.block_len;
+      }
+    }
+
+    /* Create a delete link of the last part of the block */
+    pos=record_end+extra_length;
+    pos[0]= '\0';
+    mi_int3store(pos+1,res_length);
+    mi_sizestore(pos+4,info->s->state.dellink);
+    bfill(pos+12,8,255);                        /* End link */
+    next_delete_block=info->s->state.dellink;
+    info->s->state.dellink= filepos+length+extra_length;
+    info->state->del++;
+    info->state->empty+=res_length;
+    info->s->state.split++;
+  }
+  if (info->opt_flag & WRITE_CACHE_USED &&
+      info->update & HA_STATE_WRITE_AT_END)
+  {
+    if (info->update & HA_STATE_EXTEND_BLOCK)
+    {
+      info->update&= ~HA_STATE_EXTEND_BLOCK;
+      if (my_block_write(&info->rec_cache,(uchar*) *record-head_length,
+                         length+extra_length+del_length,filepos))
+        goto err;
+    }
+    else if (my_b_write(&info->rec_cache,(uchar*) *record-head_length,
+                        length+extra_length+del_length))
+      goto err;
+  }
+  else
+  {
+    info->rec_cache.seek_not_done=1;
+    if (info->s->file_write(info,(uchar*) *record-head_length,
+                            length+extra_length+
+                            del_length,filepos,info->s->write_flag))
+      goto err;
+  }
+  /* Restore the record bytes that were overwritten for the write */
+  memcpy(record_end,temp,(size_t) (extra_length+del_length));
+  *record=record_end;
+  *reclength-=(length-head_length);
+  *flag=6;
+
+  if (del_length)
+  {
+    /* link the next delete block to this */
+    if (update_backward_delete_link(info, next_delete_block,
+                                    info->s->state.dellink))
+      goto err;
+  }
+
+  DBUG_RETURN(0);
+err:
+  DBUG_PRINT("exit",("errno: %d",my_errno));
+  DBUG_RETURN(1);
+} /* _ma_write_part_record */
+
+
+/*
+  Update record in datafile
+
+  The packed row in 'record' is written over the blocks of the old row,
+  starting at 'filepos'.  A block that is too short is first extended,
+  either by growing the file (when the block is last in the file) or by
+  taking over a deleted block that follows directly after it.  When the
+  old blocks are used up, writing continues in blocks from
+  _ma_find_writepos().  Blocks of the old row that are left over are
+  deleted.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos,
+                                     uchar *record, ulong reclength)
+{
+  int flag;
+  uint error;
+  ulong length;
+  MARIA_BLOCK_INFO block_info;
+  DBUG_ENTER("update_dynamic_record");
+
+  flag=block_info.second_read=0;
+  while (reclength > 0)
+  {
+    if (filepos != info->s->state.dellink)
+    {
+      block_info.next_filepos= HA_OFFSET_ERROR;
+      if ((error= _ma_get_block_info(&block_info, info->dfile.file, filepos))
+          & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+             BLOCK_FATAL_ERROR))
+      {
+        DBUG_PRINT("error",("Got wrong block info"));
+        if (!(error & BLOCK_FATAL_ERROR))
+          my_errno=HA_ERR_WRONG_IN_RECORD;
+        goto err;
+      }
+      length=(ulong) (block_info.filepos-filepos) + block_info.block_len;
+      if (length < reclength)
+      {
+        uint tmp=MY_ALIGN(reclength - length + 3 +
+                          test(reclength >= 65520L),MARIA_DYN_ALIGN_SIZE);
+        /* Don't create a block bigger than MARIA_MAX_BLOCK_LENGTH */
+        tmp= min(length+tmp, MARIA_MAX_BLOCK_LENGTH)-length;
+        /* Check if we can extend this block */
+        if (block_info.filepos + block_info.block_len ==
+            info->state->data_file_length &&
+            info->state->data_file_length <
+            info->s->base.max_data_file_length-tmp)
+        {
+          /* extend file */
+          DBUG_PRINT("info",("Extending file with %d bytes",tmp));
+          if (info->cur_row.nextpos == info->state->data_file_length)
+            info->cur_row.nextpos+= tmp;
+          info->state->data_file_length+= tmp;
+          info->update|= HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK;
+          length+=tmp;
+        }
+        else if (length < MARIA_MAX_BLOCK_LENGTH - MARIA_MIN_BLOCK_LENGTH)
+        {
+          /*
+            Check if next block is a deleted block
+            Above we have MARIA_MIN_BLOCK_LENGTH to avoid the problem where
+            the next block is so small it can't be split which could
+            cause problems
+          */
+
+          MARIA_BLOCK_INFO del_block;
+          del_block.second_read=0;
+          if (_ma_get_block_info(&del_block, info->dfile.file,
+                                 block_info.filepos + block_info.block_len) &
+              BLOCK_DELETED)
+          {
+            /* Use; Unlink it and extend the current block */
+            DBUG_PRINT("info",("Extending current block"));
+            if (unlink_deleted_block(info,&del_block))
+              goto err;
+            if ((length+=del_block.block_len) > MARIA_MAX_BLOCK_LENGTH)
+            {
+              /*
+                New block was too big, link overflow part back to
+                delete list
+              */
+              my_off_t next_pos;
+              ulong rest_length= length-MARIA_MAX_BLOCK_LENGTH;
+              set_if_bigger(rest_length, MARIA_MIN_BLOCK_LENGTH);
+              next_pos= del_block.filepos+ del_block.block_len - rest_length;
+
+              if (update_backward_delete_link(info, info->s->state.dellink,
+                                              next_pos))
+                DBUG_RETURN(1);
+
+              /* create delete link for data that didn't fit into the page */
+              del_block.header[0]=0;
+              mi_int3store(del_block.header+1, rest_length);
+              mi_sizestore(del_block.header+4,info->s->state.dellink);
+              bfill(del_block.header+12,8,255);
+              if (info->s->file_write(info,(uchar*) del_block.header, 20,
+                                      next_pos, MYF(MY_NABP)))
+                DBUG_RETURN(1);
+              info->s->state.dellink= next_pos;
+              info->s->state.split++;
+              info->state->del++;
+              info->state->empty+= rest_length;
+              length-= rest_length;
+            }
+          }
+        }
+      }
+    }
+    else
+    {
+      if (_ma_find_writepos(info,reclength,&filepos,&length))
+        goto err;
+    }
+    if (_ma_write_part_record(info,filepos,length,block_info.next_filepos,
+                              &record,&reclength,&flag))
+      goto err;
+    if ((filepos=block_info.next_filepos) == HA_OFFSET_ERROR)
+    {
+      /* Start writing data on deleted blocks */
+      filepos=info->s->state.dellink;
+    }
+  }
+
+  /* The new row was shorter: delete the blocks of the old row not reused */
+  if (block_info.next_filepos != HA_OFFSET_ERROR)
+    if (delete_dynamic_record(info,block_info.next_filepos,1))
+      goto err;
+  DBUG_RETURN(0);
+err:
+  DBUG_RETURN(1);
+}
+
+
+        /* Pack a record. 
Return new reclength */
+/*
+  Packed layout written to 'to', in this order:
+  - base.pack_bytes bytes of pack flags.  Each BLOB, SKIP_ZERO,
+    SKIP_ENDSPACE and SKIP_PRESPACE column owns one bit, assigned from
+    the lowest bit upwards, eight columns per byte.  A set bit means
+    that the column was stored in its short form (empty blob, all-zero
+    field left out, or spaces stripped).
+  - base.null_bytes bytes copied unchanged from the start of 'from'
+  - the columns, in columndef order
+  - one checksum byte (low byte of cur_row.checksum) if s->calc_checksum
+    is set
+
+  info->blobs[].length is read for blob columns, so it has to be up to
+  date for this record before the call.
+*/
+uint _ma_rec_pack(MARIA_HA *info, register uchar *to,
+                  register const uchar *from)
+{
+  uint length,new_length,flag,bit,i;
+  uchar *pos,*end,*startpos,*packpos;
+  enum en_fieldtype type;
+  reg3 MARIA_COLUMNDEF *column;
+  MARIA_BLOB *blob;
+  DBUG_ENTER("_ma_rec_pack");
+
+  flag= 0;
+  bit= 1;
+  startpos= packpos=to;
+  to+= info->s->base.pack_bytes;                /* Flags are filled in later */
+  blob= info->blobs;
+  column= info->s->columndef;
+  if (info->s->base.null_bytes)
+  {
+    memcpy(to, from, info->s->base.null_bytes);
+    from+= info->s->base.null_bytes;
+    to+= info->s->base.null_bytes;
+  }
+
+  for (i=info->s->base.fields ; i-- > 0; from+= length, column++)
+  {
+    length=(uint) column->length;
+    if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL)
+    {
+      if (type == FIELD_BLOB)
+      {
+        if (!blob->length)
+          flag|=bit;                            /* Empty blob: flag only */
+        else
+        {
+          /* Store the length bytes followed by the blob data itself */
+          char *temp_pos;
+          size_t tmp_length=length-portable_sizeof_char_ptr;
+          memcpy((uchar*) to,from,tmp_length);
+          memcpy_fixed(&temp_pos,from+tmp_length,sizeof(char*));
+          memcpy(to+tmp_length,temp_pos,(size_t) blob->length);
+          to+=tmp_length+blob->length;
+        }
+        blob++;
+      }
+      else if (type == FIELD_SKIP_ZERO)
+      {
+        if (memcmp((uchar*) from, maria_zero_string, length) == 0)
+          flag|=bit;
+        else
+        {
+          memcpy((uchar*) to,from,(size_t) length); to+=length;
+        }
+      }
+      else if (type == FIELD_SKIP_ENDSPACE ||
+               type == FIELD_SKIP_PRESPACE)
+      {
+        pos= (uchar*) from; end= (uchar*) from + length;
+        if (type == FIELD_SKIP_ENDSPACE)
+        {                                       /* Pack trailing spaces */
+          while (end > from && *(end-1) == ' ')
+            end--;
+        }
+        else
+        {                                       /* Pack leading spaces */
+          while (pos < end && *pos == ' ')
+            pos++;
+        }
+        new_length=(uint) (end-pos);
+        /* Only strip if data + length byte(s) is shorter than the field */
+        if (new_length +1 + test(column->length > 255 && new_length > 127)
+            < length)
+        {
+          if (column->length > 255 && new_length > 127)
+          {
+            /* Two length bytes: low 7 bits with the high bit set, rest */
+            to[0]=(char) ((new_length & 127)+128);
+            to[1]=(char) (new_length >> 7);
+            to+=2;
+          }
+          else
+            *to++= (char) new_length;
+          memcpy((uchar*) to,pos,(size_t) new_length); to+=new_length;
+          flag|=bit;
+        }
+        else
+        {
+          memcpy(to,from,(size_t) length); to+=length;
+        }
+      }
+      else if (type == FIELD_VARCHAR)
+      {
+        /* Stored as length prefix + used bytes; has no pack flag bit */
+        uint pack_length= HA_VARCHAR_PACKLENGTH(column->length -1);
+        uint tmp_length;
+        if (pack_length == 1)
+        {
+          tmp_length= (uint) *(uchar*) from;
+          *to++= *from;
+        }
+        else
+        {
+          tmp_length= uint2korr(from);
+          store_key_length_inc(to,tmp_length);
+        }
+        memcpy(to, from+pack_length,tmp_length);
+        to+= tmp_length;
+        continue;
+      }
+      else
+      {
+        memcpy(to,from,(size_t) length); to+=length;
+        continue;                               /* Normal field */
+      }
+      /* Move to the next flag bit; store the byte when all 8 are used */
+      if ((bit= bit << 1) >= 256)
+      {
+        *packpos++ = (char) (uchar) flag;
+        bit=1; flag=0;
+      }
+    }
+    else
+    {
+      memcpy(to,from,(size_t) length); to+=length;
+    }
+  }
+  if (bit != 1)
+    *packpos= (char) (uchar) flag;              /* Partly used flag byte */
+  if (info->s->calc_checksum)
+    *to++= (uchar) info->cur_row.checksum;
+  DBUG_PRINT("exit",("packed length: %d",(int) (to-startpos)));
+  DBUG_RETURN((uint) (to-startpos));
+} /* _ma_rec_pack */
+
+
+
+/*
+  Check if a record was correctly packed. Used only by maria_chk
+  Returns 0 if record is ok. 
+*/ + +my_bool _ma_rec_check(MARIA_HA *info,const uchar *record, uchar *rec_buff, + ulong packed_length, my_bool with_checksum, + ha_checksum checksum) +{ + uint length,new_length,flag,bit,i; + uchar *pos,*end,*packpos,*to; + enum en_fieldtype type; + reg3 MARIA_COLUMNDEF *column; + DBUG_ENTER("_ma_rec_check"); + + packpos=rec_buff; to= rec_buff+info->s->base.pack_bytes; + column= info->s->columndef; + flag= *packpos; bit=1; + record+= info->s->base.null_bytes; + to+= info->s->base.null_bytes; + + for (i=info->s->base.fields ; i-- > 0; record+= length, column++) + { + length=(uint) column->length; + if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL) + { + if (type == FIELD_BLOB) + { + uint blob_length= + _ma_calc_blob_length(length-portable_sizeof_char_ptr,record); + if (!blob_length && !(flag & bit)) + goto err; + if (blob_length) + to+=length - portable_sizeof_char_ptr+ blob_length; + } + else if (type == FIELD_SKIP_ZERO) + { + if (memcmp((uchar*) record, maria_zero_string, length) == 0) + { + if (!(flag & bit)) + goto err; + } + else + to+=length; + } + else if (type == FIELD_SKIP_ENDSPACE || + type == FIELD_SKIP_PRESPACE) + { + pos= (uchar*) record; end= (uchar*) record + length; + if (type == FIELD_SKIP_ENDSPACE) + { /* Pack trailing spaces */ + while (end > record && *(end-1) == ' ') + end--; + } + else + { /* Pack pre-spaces */ + while (pos < end && *pos == ' ') + pos++; + } + new_length=(uint) (end-pos); + if (new_length +1 + test(column->length > 255 && new_length > 127) + < length) + { + if (!(flag & bit)) + goto err; + if (column->length > 255 && new_length > 127) + { + if (to[0] != (char) ((new_length & 127)+128) || + to[1] != (char) (new_length >> 7)) + goto err; + to+=2; + } + else if (*to++ != (char) new_length) + goto err; + to+=new_length; + } + else + to+=length; + } + else if (type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(column->length -1); + uint tmp_length; + if (pack_length == 1) + { + tmp_length= (uint) 
*(uchar*) record; + to+= 1+ tmp_length; + continue; + } + else + { + tmp_length= uint2korr(record); + to+= get_pack_length(tmp_length)+tmp_length; + } + continue; + } + else + { + to+=length; + continue; /* Normal field */ + } + if ((bit= bit << 1) >= 256) + { + flag= *++packpos; + bit=1; + } + } + else + to+= length; + } + if (packed_length != (uint) (to - rec_buff) + + test(info->s->calc_checksum) || (bit != 1 && (flag & ~(bit - 1)))) + goto err; + if (with_checksum && ((uchar) checksum != (uchar) *to)) + { + DBUG_PRINT("error",("wrong checksum for row")); + goto err; + } + DBUG_RETURN(0); + +err: + DBUG_RETURN(1); +} + + +/* + @brief Unpacks a record + + @return Recordlength + @retval >0 ok + @retval MY_FILE_ERROR (== -1) Error. + my_errno is set to HA_ERR_WRONG_IN_RECORD +*/ + +ulong _ma_rec_unpack(register MARIA_HA *info, register uchar *to, uchar *from, + ulong found_length) +{ + uint flag,bit,length,min_pack_length, column_length; + enum en_fieldtype type; + uchar *from_end,*to_end,*packpos; + reg3 MARIA_COLUMNDEF *column, *end_column; + DBUG_ENTER("_ma_rec_unpack"); + + to_end=to + info->s->base.reclength; + from_end=from+found_length; + flag= (uchar) *from; bit=1; packpos=from; + if (found_length < info->s->base.min_pack_length) + goto err; + from+= info->s->base.pack_bytes; + min_pack_length= info->s->base.min_pack_length - info->s->base.pack_bytes; + + if ((length= info->s->base.null_bytes)) + { + memcpy(to, from, length); + from+= length; + to+= length; + min_pack_length-= length; + } + + for (column= info->s->columndef, end_column= column + info->s->base.fields; + column < end_column ; to+= column_length, column++) + { + column_length= column->length; + if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL && + (type != FIELD_CHECK)) + { + if (type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(column_length-1); + if (pack_length == 1) + { + length= (uint) *(uchar*) from; + if (length > column_length-1) + goto err; + *to= 
*from++; + } + else + { + get_key_length(length, from); + if (length > column_length-2) + goto err; + int2store(to,length); + } + if (from+length > from_end) + goto err; + memcpy(to+pack_length, from, length); + from+= length; + min_pack_length--; + continue; + } + if (flag & bit) + { + if (type == FIELD_BLOB || type == FIELD_SKIP_ZERO) + bzero((uchar*) to,column_length); + else if (type == FIELD_SKIP_ENDSPACE || + type == FIELD_SKIP_PRESPACE) + { + if (column->length > 255 && *from & 128) + { + if (from + 1 >= from_end) + goto err; + length= (*from & 127)+ ((uint) (uchar) *(from+1) << 7); from+=2; + } + else + { + if (from == from_end) + goto err; + length= (uchar) *from++; + } + min_pack_length--; + if (length >= column_length || + min_pack_length + length > (uint) (from_end - from)) + goto err; + if (type == FIELD_SKIP_ENDSPACE) + { + memcpy(to,(uchar*) from,(size_t) length); + bfill((uchar*) to+length,column_length-length,' '); + } + else + { + bfill((uchar*) to,column_length-length,' '); + memcpy(to+column_length-length,(uchar*) from,(size_t) length); + } + from+=length; + } + } + else if (type == FIELD_BLOB) + { + uint size_length=column_length- portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(size_length,from); + ulong from_left= (ulong) (from_end - from); + if (from_left < size_length || + from_left - size_length < blob_length || + from_left - size_length - blob_length < min_pack_length) + goto err; + memcpy((uchar*) to,(uchar*) from,(size_t) size_length); + from+=size_length; + memcpy_fixed((uchar*) to+size_length,(uchar*) &from,sizeof(char*)); + from+=blob_length; + } + else + { + if (type == FIELD_SKIP_ENDSPACE || type == FIELD_SKIP_PRESPACE) + min_pack_length--; + if (min_pack_length + column_length > (uint) (from_end - from)) + goto err; + memcpy(to,(uchar*) from,(size_t) column_length); from+=column_length; + } + if ((bit= bit << 1) >= 256) + { + flag= (uchar) *++packpos; bit=1; + } + } + else + { + if (min_pack_length > (uint) 
(from_end - from)) + goto err; + min_pack_length-=column_length; + memcpy(to, (uchar*) from, (size_t) column_length); + from+=column_length; + } + } + if (info->s->calc_checksum) + info->cur_row.checksum= (uint) (uchar) *from++; + if (to == to_end && from == from_end && (bit == 1 || !(flag & ~(bit-1)))) + DBUG_RETURN(found_length); + +err: + my_errno= HA_ERR_WRONG_IN_RECORD; + DBUG_PRINT("error",("to_end: 0x%lx -> 0x%lx from_end: 0x%lx -> 0x%lx", + (long) to, (long) to_end, (long) from, (long) from_end)); + DBUG_DUMP("from",(uchar*) info->rec_buff,info->s->base.min_pack_length); + DBUG_RETURN(MY_FILE_ERROR); +} /* _ma_rec_unpack */ + + + /* Calc length of blob. Update info in blobs->length */ + +ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record) +{ + ulong length; + MARIA_BLOB *blob,*end; + + for (length=0, blob= info->blobs, end=blob+info->s->base.blobs ; + blob != end; + blob++) + { + blob->length= _ma_calc_blob_length(blob->pack_length,record + blob->offset); + length+=blob->length; + } + return length; +} + + +ulong _ma_calc_blob_length(uint length, const uchar *pos) +{ + switch (length) { + case 1: + return (uint) (uchar) *pos; + case 2: + return (uint) uint2korr(pos); + case 3: + return uint3korr(pos); + case 4: + return uint4korr(pos); + default: + break; + } + return 0; /* Impossible */ +} + + +void _ma_store_blob_length(uchar *pos,uint pack_length,uint length) +{ + switch (pack_length) { + case 1: + *pos= (uchar) length; + break; + case 2: + int2store(pos,length); + break; + case 3: + int3store(pos,length); + break; + case 4: + int4store(pos,length); + default: + break; + } + return; +} + + +/* + Read record from datafile. + + SYNOPSIS + _ma_read_dynamic_record() + info MARIA_HA pointer to table. + filepos From where to read the record. + buf Destination for record. + + NOTE + If a write buffer is active, it needs to be flushed if its contents + intersects with the record to read. 
We always check if the position + of the first uchar of the write buffer is lower than the position + past the last uchar to read. In theory this is also true if the write + buffer is completely below the read segment. That is, if there is no + intersection. But this case is unusual. We flush anyway. Only if the + first uchar in the write buffer is above the last uchar to read, we do + not flush. + + A dynamic record may need several reads. So this check must be done + before every read. Reading a dynamic record starts with reading the + block header. If the record does not fit into the free space of the + header, the block may be longer than the header. In this case a + second read is necessary. These one or two reads repeat for every + part of the record. + + RETURN + 0 OK + # Error number +*/ + +int _ma_read_dynamic_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos) +{ + int block_of_record; + uint b_type; + MARIA_BLOCK_INFO block_info; + File file; + uchar *to; + uint left_length; + DBUG_ENTER("_ma_read_dynamic_record"); + + if (filepos == HA_OFFSET_ERROR) + goto err; + + LINT_INIT(to); + LINT_INIT(left_length); + file= info->dfile.file; + block_of_record= 0; /* First block of record is numbered as zero. */ + block_info.second_read= 0; + do + { + /* A corrupted table can have wrong pointers. 
(Bug# 19835) */ + if (filepos == HA_OFFSET_ERROR) + goto panic; + if (info->opt_flag & WRITE_CACHE_USED && + (info->rec_cache.pos_in_file < filepos + + MARIA_BLOCK_INFO_HEADER_LENGTH) && + flush_io_cache(&info->rec_cache)) + goto err; + info->rec_cache.seek_not_done=1; + if ((b_type= _ma_get_block_info(&block_info, file, filepos)) & + (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED)) + my_errno=HA_ERR_RECORD_DELETED; + goto err; + } + if (block_of_record++ == 0) /* First block */ + { + if (block_info.rec_len > (uint) info->s->base.max_pack_length) + goto panic; + if (info->s->base.blobs) + { + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + block_info.rec_len + + info->s->base.extra_rec_buff_size)) + goto err; + } + to= info->rec_buff; + left_length=block_info.rec_len; + } + if (left_length < block_info.data_len || ! block_info.data_len) + goto panic; /* Wrong linked record */ + /* copy information that is already read */ + { + uint offset= (uint) (block_info.filepos - filepos); + uint prefetch_len= (sizeof(block_info.header) - offset); + filepos+= sizeof(block_info.header); + + if (prefetch_len > block_info.data_len) + prefetch_len= block_info.data_len; + if (prefetch_len) + { + memcpy((uchar*) to, block_info.header + offset, prefetch_len); + block_info.data_len-= prefetch_len; + left_length-= prefetch_len; + to+= prefetch_len; + } + } + /* read rest of record from file */ + if (block_info.data_len) + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file < filepos + block_info.data_len && + flush_io_cache(&info->rec_cache)) + goto err; + /* + What a pity that this method is not called 'file_pread' and that + there is no equivalent without seeking. We are at the right + position already. 
:( + */ + if (info->s->file_read(info, (uchar*) to, block_info.data_len, + filepos, MYF(MY_NABP))) + goto panic; + left_length-=block_info.data_len; + to+=block_info.data_len; + } + filepos= block_info.next_filepos; + } while (left_length); + + info->update|= HA_STATE_AKTIV; /* We have a aktive record */ + fast_ma_writeinfo(info); + DBUG_RETURN(_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) != + MY_FILE_ERROR ? 0 : my_errno); + +err: + fast_ma_writeinfo(info); + DBUG_RETURN(my_errno); + +panic: + my_errno=HA_ERR_WRONG_IN_RECORD; + goto err; +} + + /* compare unique constraint between stored rows */ + +my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos) +{ + uchar *old_rec_buff,*old_record; + my_off_t old_rec_buff_size; + my_bool error; + DBUG_ENTER("_ma_cmp_dynamic_unique"); + + if (!(old_record=my_alloca(info->s->base.reclength))) + DBUG_RETURN(1); + + /* Don't let the compare destroy blobs that may be in use */ + old_rec_buff= info->rec_buff; + old_rec_buff_size= info->rec_buff_size; + + if (info->s->base.blobs) + info->rec_buff= 0; + error= _ma_read_dynamic_record(info, old_record, pos) != 0; + if (!error) + error=_ma_unique_comp(def, record, old_record, def->null_are_equal) != 0; + if (info->s->base.blobs) + { + my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + info->rec_buff= old_rec_buff; + info->rec_buff_size= old_rec_buff_size; + } + my_afree(old_record); + DBUG_RETURN(error); +} + + + /* Compare of record on disk with packed record in memory */ + +my_bool _ma_cmp_dynamic_record(register MARIA_HA *info, + register const uchar *record) +{ + uint flag, reclength, b_type,cmp_length; + my_off_t filepos; + uchar *buffer; + MARIA_BLOCK_INFO block_info; + my_bool error= 1; + DBUG_ENTER("_ma_cmp_dynamic_record"); + + /* We are going to do changes; dont let anybody disturb */ + dont_break(); /* Dont allow SIGHUP or SIGINT */ + + if (info->opt_flag & WRITE_CACHE_USED) + { + info->update&= 
~(HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK); + if (flush_io_cache(&info->rec_cache)) + DBUG_RETURN(1); + } + info->rec_cache.seek_not_done=1; + + /* If nobody have touched the database we don't have to test rec */ + + buffer=info->rec_buff; + if ((info->opt_flag & READ_CHECK_USED)) + { /* If check isn't disabled */ + if (info->s->base.blobs) + { + if (!(buffer=(uchar*) my_alloca(info->s->base.pack_reclength+ + _ma_calc_total_blob_length(info,record)))) + DBUG_RETURN(1); + } + reclength= _ma_rec_pack(info,buffer,record); + record= buffer; + + filepos= info->cur_row.lastpos; + flag=block_info.second_read=0; + block_info.next_filepos=filepos; + while (reclength > 0) + { + if ((b_type= _ma_get_block_info(&block_info, info->dfile.file, + block_info.next_filepos)) + & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED)) + my_errno=HA_ERR_RECORD_CHANGED; + goto err; + } + if (flag == 0) /* First block */ + { + flag=1; + if (reclength != block_info.rec_len) + { + my_errno=HA_ERR_RECORD_CHANGED; + goto err; + } + } else if (reclength < block_info.data_len) + { + my_errno=HA_ERR_WRONG_IN_RECORD; + goto err; + } + reclength-= block_info.data_len; + cmp_length= block_info.data_len; + if (!reclength && info->s->calc_checksum) + cmp_length--; /* 'record' may not contain checksum */ + + if (_ma_cmp_buffer(info->dfile.file, record, block_info.filepos, + cmp_length)) + { + my_errno=HA_ERR_RECORD_CHANGED; + goto err; + } + flag=1; + record+=block_info.data_len; + } + } + my_errno=0; + error= 0; +err: + if (buffer != info->rec_buff) + my_afree((uchar*) buffer); + DBUG_PRINT("exit", ("result: %d", error)); + DBUG_RETURN(error); +} + + + /* Compare file to buffert */ + +static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos, + uint length) +{ + uint next_length; + char temp_buff[IO_SIZE*2]; + DBUG_ENTER("_ma_cmp_buffer"); + + next_length= IO_SIZE*2 - (uint) (filepos & (IO_SIZE-1)); + + 
while (length > IO_SIZE*2) + { + if (my_pread(file,temp_buff,next_length,filepos, MYF(MY_NABP)) || + memcmp((uchar*) buff,temp_buff,next_length)) + goto err; + filepos+=next_length; + buff+=next_length; + length-= next_length; + next_length=IO_SIZE*2; + } + if (my_pread(file,temp_buff,length,filepos,MYF(MY_NABP))) + goto err; + DBUG_RETURN(memcmp((uchar*) buff,temp_buff,length) != 0); +err: + DBUG_RETURN(1); +} + + +/* + Read next record from datafile during table scan. + + SYNOPSIS + _ma_read_rnd_dynamic_record() + info MARIA_HA pointer to table. + buf Destination for record. + filepos From where to read the record. + skip_deleted_blocks If to repeat reading until a non-deleted + record is found. + + NOTE + This is identical to _ma_read_dynamic_record(), except the following + cases: + + - If there is no active row at 'filepos', continue scanning for + an active row. (This is becasue the previous + _ma_read_rnd_dynamic_record() call stored the next block position + in filepos, but this position may not be a start block for a row + - We may have READ_CACHING enabled, in which case we use the cache + to read rows. + + For other comments, check _ma_read_dynamic_record() + + RETURN + 0 OK + != 0 Error number +*/ + +int _ma_read_rnd_dynamic_record(MARIA_HA *info, + uchar *buf, + MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks) +{ + int block_of_record, info_read; + uint left_len,b_type; + uchar *to; + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share=info->s; + DBUG_ENTER("_ma_read_rnd_dynamic_record"); + + info_read=0; + LINT_INIT(to); + + if (info->lock_type == F_UNLCK) + { +#ifndef UNSAFE_LOCKING +#else + info->tmp_lock_type=F_RDLCK; +#endif + } + else + info_read=1; /* memory-keyinfoblock is ok */ + + block_of_record= 0; /* First block of record is numbered as zero. 
*/ + block_info.second_read= 0; + left_len=1; + do + { + if (filepos >= info->state->data_file_length) + { + if (!info_read) + { /* Check if changed */ + info_read=1; + info->rec_cache.seek_not_done=1; + if (_ma_state_info_read_dsk(share->kfile.file, &share->state)) + goto panic; + } + if (filepos >= info->state->data_file_length) + { + my_errno= HA_ERR_END_OF_FILE; + goto err; + } + } + if (info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(&info->rec_cache,(uchar*) block_info.header,filepos, + sizeof(block_info.header), + (!block_of_record && skip_deleted_blocks ? + READING_NEXT : 0) | READING_HEADER)) + goto panic; + b_type= _ma_get_block_info(&block_info,-1,filepos); + } + else + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file < filepos + MARIA_BLOCK_INFO_HEADER_LENGTH && + flush_io_cache(&info->rec_cache)) + DBUG_RETURN(my_errno); + info->rec_cache.seek_not_done=1; + b_type= _ma_get_block_info(&block_info, info->dfile.file, filepos); + } + + if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if ((b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR)) + && skip_deleted_blocks) + { + filepos=block_info.filepos+block_info.block_len; + block_info.second_read=0; + continue; /* Search after next_record */ + } + if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR)) + { + my_errno= HA_ERR_RECORD_DELETED; + info->cur_row.lastpos= block_info.filepos; + info->cur_row.nextpos= block_info.filepos+block_info.block_len; + } + goto err; + } + if (block_of_record == 0) /* First block */ + { + if (block_info.rec_len > (uint) share->base.max_pack_length) + goto panic; + info->cur_row.lastpos= filepos; + if (share->base.blobs) + { + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + block_info.rec_len + + info->s->base.extra_rec_buff_size)) + goto err; + } + to= info->rec_buff; + left_len=block_info.rec_len; + } + if (left_len < block_info.data_len) + goto panic; /* Wrong linked record */ + + /* copy 
information that is already read */ + { + uint offset=(uint) (block_info.filepos - filepos); + uint tmp_length= (sizeof(block_info.header) - offset); + filepos=block_info.filepos; + + if (tmp_length > block_info.data_len) + tmp_length= block_info.data_len; + if (tmp_length) + { + memcpy((uchar*) to, block_info.header+offset,tmp_length); + block_info.data_len-=tmp_length; + left_len-=tmp_length; + to+=tmp_length; + filepos+=tmp_length; + } + } + /* read rest of record from file */ + if (block_info.data_len) + { + if (info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(&info->rec_cache,(uchar*) to,filepos, + block_info.data_len, + (!block_of_record && skip_deleted_blocks) ? + READING_NEXT : 0)) + goto panic; + } + else + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file < + block_info.filepos + block_info.data_len && + flush_io_cache(&info->rec_cache)) + goto err; + /* VOID(my_seek(info->dfile.file, filepos, MY_SEEK_SET, MYF(0))); */ + if (my_read(info->dfile.file, (uchar*)to, block_info.data_len, + MYF(MY_NABP))) + { + if (my_errno == -1) + my_errno= HA_ERR_WRONG_IN_RECORD; /* Unexpected end of file */ + goto err; + } + } + } + /* + Increment block-of-record counter. If it was the first block, + remember the position behind the block for the next call. 
+ */ + if (block_of_record++ == 0) + { + info->cur_row.nextpos= block_info.filepos+block_info.block_len; + skip_deleted_blocks=0; + } + left_len-=block_info.data_len; + to+=block_info.data_len; + filepos=block_info.next_filepos; + } while (left_len); + + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + fast_ma_writeinfo(info); + if (_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) != + MY_FILE_ERROR) + DBUG_RETURN(0); + DBUG_RETURN(my_errno); /* Wrong record */ + +panic: + my_errno=HA_ERR_WRONG_IN_RECORD; /* Something is fatal wrong */ +err: + fast_ma_writeinfo(info); + DBUG_RETURN(my_errno); +} + + + /* Read and process header from a dynamic-record-file */ + +uint _ma_get_block_info(MARIA_BLOCK_INFO *info, File file, my_off_t filepos) +{ + uint return_val=0; + uchar *header=info->header; + + if (file >= 0) + { + /* + We do not use my_pread() here because we want to have the file + pointer set to the end of the header after this function. + my_pread() may leave the file pointer untouched. 
+ */ + VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0))); + if (my_read(file,(char*) header,sizeof(info->header),MYF(0)) != + sizeof(info->header)) + goto err; + } + DBUG_DUMP("header",(uchar*) header,MARIA_BLOCK_INFO_HEADER_LENGTH); + if (info->second_read) + { + if (info->header[0] <= 6 || info->header[0] == 13) + return_val=BLOCK_SYNC_ERROR; + } + else + { + if (info->header[0] > 6 && info->header[0] != 13) + return_val=BLOCK_SYNC_ERROR; + } + info->next_filepos= HA_OFFSET_ERROR; /* Dummy if no next block */ + + switch (info->header[0]) { + case 0: + if ((info->block_len=(uint) mi_uint3korr(header+1)) < + MARIA_MIN_BLOCK_LENGTH || + (info->block_len & (MARIA_DYN_ALIGN_SIZE -1))) + goto err; + info->filepos=filepos; + info->next_filepos=mi_sizekorr(header+4); + info->prev_filepos=mi_sizekorr(header+12); +#if SIZEOF_OFF_T == 4 + if ((mi_uint4korr(header+4) != 0 && + (mi_uint4korr(header+4) != (ulong) ~0 || + info->next_filepos != (ulong) ~0)) || + (mi_uint4korr(header+12) != 0 && + (mi_uint4korr(header+12) != (ulong) ~0 || + info->prev_filepos != (ulong) ~0))) + goto err; +#endif + return return_val | BLOCK_DELETED; /* Deleted block */ + + case 1: + info->rec_len=info->data_len=info->block_len=mi_uint2korr(header+1); + info->filepos=filepos+3; + return return_val | BLOCK_FIRST | BLOCK_LAST; + case 2: + info->rec_len=info->data_len=info->block_len=mi_uint3korr(header+1); + info->filepos=filepos+4; + return return_val | BLOCK_FIRST | BLOCK_LAST; + + case 13: + info->rec_len=mi_uint4korr(header+1); + info->block_len=info->data_len=mi_uint3korr(header+5); + info->next_filepos=mi_sizekorr(header+8); + info->second_read=1; + info->filepos=filepos+16; + return return_val | BLOCK_FIRST; + + case 3: + info->rec_len=info->data_len=mi_uint2korr(header+1); + info->block_len=info->rec_len+ (uint) header[3]; + info->filepos=filepos+4; + return return_val | BLOCK_FIRST | BLOCK_LAST; + case 4: + info->rec_len=info->data_len=mi_uint3korr(header+1); + info->block_len=info->rec_len+ 
(uint) header[4]; + info->filepos=filepos+5; + return return_val | BLOCK_FIRST | BLOCK_LAST; + + case 5: + info->rec_len=mi_uint2korr(header+1); + info->block_len=info->data_len=mi_uint2korr(header+3); + info->next_filepos=mi_sizekorr(header+5); + info->second_read=1; + info->filepos=filepos+13; + return return_val | BLOCK_FIRST; + case 6: + info->rec_len=mi_uint3korr(header+1); + info->block_len=info->data_len=mi_uint3korr(header+4); + info->next_filepos=mi_sizekorr(header+7); + info->second_read=1; + info->filepos=filepos+15; + return return_val | BLOCK_FIRST; + + /* The following blocks are identical to 1-6 without rec_len */ + case 7: + info->data_len=info->block_len=mi_uint2korr(header+1); + info->filepos=filepos+3; + return return_val | BLOCK_LAST; + case 8: + info->data_len=info->block_len=mi_uint3korr(header+1); + info->filepos=filepos+4; + return return_val | BLOCK_LAST; + + case 9: + info->data_len=mi_uint2korr(header+1); + info->block_len=info->data_len+ (uint) header[3]; + info->filepos=filepos+4; + return return_val | BLOCK_LAST; + case 10: + info->data_len=mi_uint3korr(header+1); + info->block_len=info->data_len+ (uint) header[4]; + info->filepos=filepos+5; + return return_val | BLOCK_LAST; + + case 11: + info->data_len=info->block_len=mi_uint2korr(header+1); + info->next_filepos=mi_sizekorr(header+3); + info->second_read=1; + info->filepos=filepos+11; + return return_val; + case 12: + info->data_len=info->block_len=mi_uint3korr(header+1); + info->next_filepos=mi_sizekorr(header+4); + info->second_read=1; + info->filepos=filepos+12; + return return_val; + } + +err: + my_errno=HA_ERR_WRONG_IN_RECORD; /* Garbage */ + return BLOCK_ERROR; +} diff --git a/storage/maria/ma_extra.c b/storage/maria/ma_extra.c new file mode 100644 index 00000000000..4f1634756ab --- /dev/null +++ b/storage/maria/ma_extra.c @@ -0,0 +1,623 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or 
modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif +#include "ma_blockrec.h" + +static void maria_extra_keyflag(MARIA_HA *info, + enum ha_extra_function function); + +/** + @brief Set options and buffers to optimize table handling + + @param name table's name + @param info open table + @param function operation + @param extra_arg Pointer to extra argument (normally pointer to + ulong); used when function is one of: + HA_EXTRA_WRITE_CACHE + HA_EXTRA_CACHE + + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +int maria_extra(MARIA_HA *info, enum ha_extra_function function, + void *extra_arg) +{ + int error=0; + ulong cache_size; + MARIA_SHARE *share=info->s; + my_bool block_records= share->data_file_type == BLOCK_RECORD; + + DBUG_ENTER("maria_extra"); + DBUG_PRINT("enter",("function: %d",(int) function)); + + switch (function) { + case HA_EXTRA_RESET_STATE: /* Reset state (don't free buffers) */ + info->lastinx= 0; /* Use first index as def */ + info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR; + info->page_changed=1; + /* Next/prev gives first/last */ + if (info->opt_flag & READ_CACHE_USED) + { + reinit_io_cache(&info->rec_cache,READ_CACHE,0, + (pbool) (info->lock_type != F_UNLCK), + (pbool) test(info->update & HA_STATE_ROW_CHANGED) + ); + } + info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND | + 
HA_STATE_PREV_FOUND); + break; + case HA_EXTRA_CACHE: + if (block_records) + break; /* Not supported */ + + if (info->lock_type == F_UNLCK && + (share->options & HA_OPTION_PACK_RECORD)) + { + error=1; /* Not possibly if not locked */ + my_errno=EACCES; + break; + } + if (info->s->file_map) /* Don't use cache if mmap */ + break; +#if defined(HAVE_MMAP) && defined(HAVE_MADVISE) + if ((share->options & HA_OPTION_COMPRESS_RECORD)) + { + pthread_mutex_lock(&share->intern_lock); + if (_ma_memmap_file(info)) + { + /* We don't nead MADV_SEQUENTIAL if small file */ + madvise(share->file_map,share->state.state.data_file_length, + share->state.state.data_file_length <= RECORD_CACHE_SIZE*16 ? + MADV_RANDOM : MADV_SEQUENTIAL); + pthread_mutex_unlock(&share->intern_lock); + break; + } + pthread_mutex_unlock(&share->intern_lock); + } +#endif + if (info->opt_flag & WRITE_CACHE_USED) + { + info->opt_flag&= ~WRITE_CACHE_USED; + if ((error=end_io_cache(&info->rec_cache))) + break; + } + if (!(info->opt_flag & + (READ_CACHE_USED | WRITE_CACHE_USED | MEMMAP_USED))) + { + cache_size= (extra_arg ? 
*(ulong*) extra_arg : + my_default_record_cache_size); + if (!(init_io_cache(&info->rec_cache, info->dfile.file, + (uint) min(info->state->data_file_length+1, + cache_size), + READ_CACHE,0L,(pbool) (info->lock_type != F_UNLCK), + MYF(share->write_flag & MY_WAIT_IF_FULL)))) + { + info->opt_flag|=READ_CACHE_USED; + info->update&= ~HA_STATE_ROW_CHANGED; + } + if (share->concurrent_insert) + info->rec_cache.end_of_file=info->state->data_file_length; + } + break; + case HA_EXTRA_REINIT_CACHE: + if (info->opt_flag & READ_CACHE_USED) + { + reinit_io_cache(&info->rec_cache, READ_CACHE, info->cur_row.nextpos, + (pbool) (info->lock_type != F_UNLCK), + (pbool) test(info->update & HA_STATE_ROW_CHANGED)); + info->update&= ~HA_STATE_ROW_CHANGED; + if (share->concurrent_insert) + info->rec_cache.end_of_file=info->state->data_file_length; + } + break; + case HA_EXTRA_WRITE_CACHE: + if (info->lock_type == F_UNLCK) + { + error=1; /* Not possibly if not locked */ + break; + } + if (block_records) + break; /* Not supported */ + + cache_size= (extra_arg ? 
*(ulong*) extra_arg : + my_default_record_cache_size); + if (!(info->opt_flag & + (READ_CACHE_USED | WRITE_CACHE_USED | OPT_NO_ROWS)) && + !share->state.header.uniques) + if (!(init_io_cache(&info->rec_cache, info->dfile.file, cache_size, + WRITE_CACHE,info->state->data_file_length, + (pbool) (info->lock_type != F_UNLCK), + MYF(share->write_flag & MY_WAIT_IF_FULL)))) + { + info->opt_flag|=WRITE_CACHE_USED; + info->update&= ~(HA_STATE_ROW_CHANGED | + HA_STATE_WRITE_AT_END | + HA_STATE_EXTEND_BLOCK); + } + break; + case HA_EXTRA_PREPARE_FOR_UPDATE: + if (info->s->data_file_type != DYNAMIC_RECORD) + break; + /* Remove read/write cache if dynamic rows */ + case HA_EXTRA_NO_CACHE: + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + error=end_io_cache(&info->rec_cache); + /* Sergei will insert full text index caching here */ + } +#if defined(HAVE_MMAP) && defined(HAVE_MADVISE) + if (info->opt_flag & MEMMAP_USED) + madvise(share->file_map,share->state.state.data_file_length,MADV_RANDOM); +#endif + break; + case HA_EXTRA_FLUSH_CACHE: + if (info->opt_flag & WRITE_CACHE_USED) + { + if ((error=flush_io_cache(&info->rec_cache))) + { + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); /* Fatal error found */ + } + } + break; + case HA_EXTRA_NO_READCHECK: + info->opt_flag&= ~READ_CHECK_USED; /* No readcheck */ + break; + case HA_EXTRA_READCHECK: + info->opt_flag|= READ_CHECK_USED; + break; + case HA_EXTRA_KEYREAD: /* Read only keys to record */ + case HA_EXTRA_REMEMBER_POS: + info->opt_flag |= REMEMBER_OLD_POS; + bmove((uchar*) info->lastkey+share->base.max_key_length*2, + (uchar*) info->lastkey,info->lastkey_length); + info->save_update= info->update; + info->save_lastinx= info->lastinx; + info->save_lastpos= info->cur_row.lastpos; + info->save_lastkey_length=info->lastkey_length; + if (function == HA_EXTRA_REMEMBER_POS) + break; + /* fall through */ + case 
HA_EXTRA_KEYREAD_CHANGE_POS: + info->opt_flag |= KEY_READ_USED; + info->read_record= _ma_read_key_record; + break; + case HA_EXTRA_NO_KEYREAD: + case HA_EXTRA_RESTORE_POS: + if (info->opt_flag & REMEMBER_OLD_POS) + { + bmove((uchar*) info->lastkey, + (uchar*) info->lastkey+share->base.max_key_length*2, + info->save_lastkey_length); + info->update= info->save_update | HA_STATE_WRITTEN; + info->lastinx= info->save_lastinx; + info->cur_row.lastpos= info->save_lastpos; + info->lastkey_length=info->save_lastkey_length; + } + info->read_record= share->read_record; + info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS); + break; + case HA_EXTRA_NO_USER_CHANGE: /* Database is somehow locked agains changes */ + info->lock_type= F_EXTRA_LCK; /* Simulate as locked */ + break; + case HA_EXTRA_WAIT_LOCK: + info->lock_wait=0; + break; + case HA_EXTRA_NO_WAIT_LOCK: + info->lock_wait=MY_DONT_WAIT; + break; + case HA_EXTRA_NO_KEYS: + /* we're going to modify pieces of the state, stall Checkpoint */ + pthread_mutex_lock(&share->intern_lock); + if (info->lock_type == F_UNLCK) + { + pthread_mutex_unlock(&share->intern_lock); + error=1; /* Not possibly if not lock */ + break; + } + if (maria_is_any_key_active(share->state.key_map)) + { + MARIA_KEYDEF *key=share->keyinfo; + uint i; + for (i=0 ; i < share->base.keys ; i++,key++) + { + if (!(key->flag & HA_NOSAME) && info->s->base.auto_key != i+1) + { + maria_clear_key_active(share->state.key_map, i); + info->update|= HA_STATE_CHANGED; + } + } + + if (!share->changed) + { + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; + share->changed=1; /* Update on close */ + if (!share->global_changed) + { + share->global_changed=1; + share->state.open_count++; + } + } + share->state.state= *info->state; + /* + That state write to disk must be done, even for transactional tables; + indeed the table's share is going to be lost (there was a + HA_EXTRA_FORCE_REOPEN before, which set share->last_version to + 0), and so the only way it leaves 
information (share->state.key_map) + for the posterity is by writing it to disk. + */ + DBUG_ASSERT(!maria_in_recovery); + error= _ma_state_info_write(share, 1|2); + } + pthread_mutex_unlock(&share->intern_lock); + break; + case HA_EXTRA_FORCE_REOPEN: + /* + Normally MySQL uses this case when it is going to close all open + instances of the table, thus going to flush all data/index/state. + We however do a flush here for additional safety. + */ + /** @todo consider porting these flush-es to MyISAM */ + error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE) || + _ma_state_info_write(share, 1|2|4); +#ifdef ASK_MONTY + || (share->changed= 0); +#endif + /** + @todo RECOVERY BUG + Though we flushed the state, IF some other thread may have the same + table (same MARIA_SHARE) open at this time then it may have a + more recent state to flush when it closes, thus we don't set + share->changed to 0 here. On the other hand, this means that when our + thread closes its table, it will flush the state again, then it would + overwrite any state written by yet another thread which may have opened + the table (new MARIA_SHARE) and done some updates. + ASK_MONTY about the IF above. See also same tag in + HA_EXTRA_PREPARE_FOR_DROP|RENAME. + */ + pthread_mutex_lock(&THR_LOCK_maria); + pthread_mutex_lock(&share->intern_lock); /* protect against Checkpoint */ + /* this makes the share not be re-used next time the table is opened */ + share->last_version= 0L; /* Impossible version */ + pthread_mutex_unlock(&share->intern_lock); + pthread_mutex_unlock(&THR_LOCK_maria); + break; + case HA_EXTRA_PREPARE_FOR_DROP: + case HA_EXTRA_PREPARE_FOR_RENAME: + { + my_bool do_flush= test(function != HA_EXTRA_PREPARE_FOR_DROP); + pthread_mutex_lock(&THR_LOCK_maria); + /* + This share, to have last_version=0, needs to save all its data/index + blocks to disk if this is not for a DROP TABLE. 
Otherwise they would be + invisible to future openers; and they could even go to disk late and + cancel the work of future openers. + On Windows, which cannot delete an open file (cannot drop an open table) + we have to close the table's files. + */ + if (info->lock_type != F_UNLCK && !info->was_locked) + { + info->was_locked= info->lock_type; + if (maria_lock_database(info, F_UNLCK)) + error= my_errno; + info->lock_type= F_UNLCK; + } + if (share->kfile.file >= 0) + _ma_decrement_open_count(info); + pthread_mutex_lock(&share->intern_lock); + enum flush_type type= do_flush ? FLUSH_RELEASE : FLUSH_IGNORE_CHANGED; + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + type, type)) + { + error=my_errno; + share->changed=1; + } + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + if (end_io_cache(&info->rec_cache)) + error= 1; + } + if (share->kfile.file >= 0) + { + if (do_flush) + { + /* + Save the state so that others can find it from disk. + We have to sync now, as on Windows we are going to close the file + (so cannot sync later). 
+ */ + if (_ma_state_info_write(share, 1 | 2) || + my_sync(share->kfile.file, MYF(0))) + error= my_errno; +#ifdef ASK_MONTY /* see same tag in HA_EXTRA_FORCE_REOPEN */ + else + share->changed= 0; +#endif + } + else + { + /* be sure that state is not tried for write as file may be closed */ + share->changed= 0; + } +#ifdef __WIN__ + if (my_close(share->kfile, MYF(0))) + error=my_errno; + share->kfile.file= -1; +#endif + } + if (share->data_file_type == BLOCK_RECORD && + share->bitmap.file.file >= 0) + { + if (do_flush && my_sync(share->bitmap.file.file, MYF(0))) + error= my_errno; +#ifdef __WIN__ + if (my_close(share->bitmap.file.file, MYF(0))) + error= my_errno; + share->bitmap.file.file= -1; +#endif + } +#ifdef __WIN__ + { + LIST *list_element ; + for (list_element=maria_open_list ; + list_element ; + list_element=list_element->next) + { + MARIA_HA *tmpinfo=(MARIA_HA*) list_element->data; + if (tmpinfo->s == info->s) + { + if (share->data_file_type != BLOCK_RECORD && + tmpinfo->dfile.file >= 0 && + my_close(tmpinfo->dfile.file, MYF(0))) + error = my_errno; + tmpinfo->dfile.file= -1; + } + } + } +#endif + /* For protection against Checkpoint, we set under intern_lock: */ + share->last_version= 0L; /* Impossible version */ + pthread_mutex_unlock(&share->intern_lock); + pthread_mutex_unlock(&THR_LOCK_maria); + break; + } + case HA_EXTRA_FLUSH: + if (!share->temporary) + error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_KEEP, FLUSH_KEEP); +#ifdef HAVE_PWRITE + _ma_decrement_open_count(info); +#endif + if (share->not_flushed) + { + share->not_flushed=0; + if (_ma_sync_table_files(info)) + error= my_errno; + if (error) + { + share->changed=1; + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); /* Fatal error found */ + } + } + if (share->base.blobs && info->rec_buff_size > + share->base.default_rec_buff_size) + { + info->rec_buff_size= 1; /* Force realloc */ + _ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + 
share->base.default_rec_buff_size); + } + break; + case HA_EXTRA_NORMAL: /* Theese isn't in use */ + info->quick_mode=0; + break; + case HA_EXTRA_QUICK: + info->quick_mode=1; + break; + case HA_EXTRA_NO_ROWS: + if (!share->state.header.uniques) + info->opt_flag|= OPT_NO_ROWS; + break; + case HA_EXTRA_PRELOAD_BUFFER_SIZE: + info->preload_buff_size= *((ulong *) extra_arg); + break; + case HA_EXTRA_CHANGE_KEY_TO_UNIQUE: + case HA_EXTRA_CHANGE_KEY_TO_DUP: + maria_extra_keyflag(info, function); + break; + case HA_EXTRA_MMAP: +#ifdef HAVE_MMAP + if (block_records) + break; /* Not supported */ + pthread_mutex_lock(&share->intern_lock); + /* + Memory map the data file if it is not already mapped. It is safe + to memory map a file while other threads are using file I/O on it. + Assigning a new address to a function pointer is an atomic + operation. intern_lock prevents that two or more mappings are done + at the same time. + */ + if (!share->file_map) + { + if (_ma_dynmap_file(info, share->state.state.data_file_length)) + { + DBUG_PRINT("warning",("mmap failed: errno: %d",errno)); + error= my_errno= errno; + } + else + { + share->file_read= _ma_mmap_pread; + share->file_write= _ma_mmap_pwrite; + } + } + pthread_mutex_unlock(&share->intern_lock); +#endif + break; + case HA_EXTRA_MARK_AS_LOG_TABLE: + pthread_mutex_lock(&share->intern_lock); + share->is_log_table= TRUE; + pthread_mutex_unlock(&share->intern_lock); + break; + case HA_EXTRA_KEY_CACHE: + case HA_EXTRA_NO_KEY_CACHE: + default: + break; + } + { + char tmp[1]; + tmp[0]=function; + } + DBUG_RETURN(error); +} /* maria_extra */ + + +/* + Start/Stop Inserting Duplicates Into a Table, WL#1648. 
+*/ + +/* Apply one HA_NOSAME change to every key of the table: HA_EXTRA_CHANGE_KEY_TO_UNIQUE sets the flag, HA_EXTRA_CHANGE_KEY_TO_DUP clears it, any other function value leaves the keys untouched. */ static void maria_extra_keyflag(MARIA_HA *info, + enum ha_extra_function function) +{ + uint idx; + + for (idx= 0; idx< info->s->base.keys; idx++) + { + switch (function) { + case HA_EXTRA_CHANGE_KEY_TO_UNIQUE: + info->s->keyinfo[idx].flag|= HA_NOSAME; + break; + case HA_EXTRA_CHANGE_KEY_TO_DUP: + info->s->keyinfo[idx].flag&= ~(HA_NOSAME); + break; + default: + break; + } + } +} + + +/* Reset a table handle: drop its record cache, shrink an oversized blob row buffer, and restore default key/position state. Returns the result of end_io_cache() when a cache was in use, otherwise 0. */ int maria_reset(MARIA_HA *info) +{ + int error= 0; + MARIA_SHARE *share=info->s; + DBUG_ENTER("maria_reset"); + /* + Free buffers and reset the following flags: + EXTRA_CACHE, EXTRA_WRITE_CACHE, EXTRA_KEYREAD, EXTRA_QUICK + + If the row buffer cache is large (for dynamic tables), reduce it + to save memory. + */ + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + error= end_io_cache(&info->rec_cache); + } + if (share->base.blobs && info->rec_buff_size > + share->base.default_rec_buff_size) + { + info->rec_buff_size= 1; /* Force realloc */ + _ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + share->base.default_rec_buff_size); /* NOTE(review): return value is ignored here */ + } +#if defined(HAVE_MMAP) && defined(HAVE_MADVISE) + if (info->opt_flag & MEMMAP_USED) + madvise(share->file_map,share->state.state.data_file_length,MADV_RANDOM); +#endif + info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS); + info->quick_mode=0; + info->lastinx= 0; /* Use first index as def */ + info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR; + info->page_changed= 1; + info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND | + HA_STATE_PREV_FOUND); /* keep only HA_STATE_CHANGED from the old state */ + DBUG_RETURN(error); +} + + +/* my_sync() the handle's data file and then the share's index file. Returns non-zero if either sync fails; because of the ||, the index file is not synced when the data file sync fails. */ int _ma_sync_table_files(const MARIA_HA *info) +{ + return (my_sync(info->dfile.file, MYF(MY_WME)) || + my_sync(info->s->kfile.file, MYF(MY_WME))); +} + + +/** + @brief flushes the data and/or index file of a table + + This is useful when one wants to read a table using OS syscalls (like + my_copy()) and first wants to be sure that MySQL-level caches go down to + 
the OS so that OS syscalls can see all data. It can flush rec_cache, + bitmap, pagecache of data file, pagecache of index file. + + @param info table + @param flush_data_or_index one or two of these flags: + MARIA_FLUSH_DATA, MARIA_FLUSH_INDEX + @param flush_type_for_data flush type passed to flush_pagecache_blocks() for the data file (used only for BLOCK_RECORD tables) + @param flush_type_for_index flush type passed to flush_pagecache_blocks() for the index file + + @note does not sync files (@see _ma_sync_table_files()). + @note Progressively this function will be used in all places where we flush + the index but not the data file (probable bugs). + + @return Operation status + @retval 0 OK + @retval 1 Error (the table has also been marked as crashed) +*/ + +int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index, + enum flush_type flush_type_for_data, + enum flush_type flush_type_for_index) +{ + MARIA_SHARE *share= info->s; + /* flush data file first because it's more critical */ + if (flush_data_or_index & MARIA_FLUSH_DATA) + { + if (info->opt_flag & WRITE_CACHE_USED) + { + /* normally any code which creates a WRITE_CACHE destroys it later */ + DBUG_ASSERT(0); /* debug builds stop here; otherwise the cache is ended below */ + if (end_io_cache(&info->rec_cache)) + goto err; + info->opt_flag&= ~WRITE_CACHE_USED; + } + if (share->data_file_type == BLOCK_RECORD) + { + if(_ma_flush_bitmap(share) || + flush_pagecache_blocks(share->pagecache, &info->dfile, + flush_type_for_data)) + goto err; + } + } + if ((flush_data_or_index & MARIA_FLUSH_INDEX) && + flush_pagecache_blocks(share->pagecache, &share->kfile, + flush_type_for_index)) + goto err; + return 0; +err: + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); + return 1; +} diff --git a/storage/maria/ma_ft_boolean_search.c b/storage/maria/ma_ft_boolean_search.c new file mode 100644 index 00000000000..e09a076ceaa --- /dev/null +++ b/storage/maria/ma_ft_boolean_search.c @@ -0,0 +1,975 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the 
License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +/* TODO: add caching - pre-read several index entries at once */ + +/* + Added optimization for full-text queries with plus-words. It was + implemented by sharing maximal document id (max_docid) variable + inside plus subtree. max_docid could be used by any word in plus + subtree, but it could be updated by plus-word only. + + The idea is: there is no need to search for docid smaller than + biggest docid inside current plus subtree. + + Examples: + +word1 word2 + share same max_docid + max_docid updated by word1 + +word1 +(word2 word3) + share same max_docid + max_docid updated by word1 + +(word1 -word2) +(+word3 word4) + share same max_docid + max_docid updated by word3 +*/ + +#define FT_CORE +#include "ma_ftdefs.h" + +/* search with boolean queries */ + +/* Weight table: 11 entries holding 1.5**i for i = -5..5; access it through 'wghts', which points at the middle entry (i = 0). */ static double _wghts[11]= +{ + 0.131687242798354, + 0.197530864197531, + 0.296296296296296, + 0.444444444444444, + 0.666666666666667, + 1.000000000000000, + 1.500000000000000, + 2.250000000000000, + 3.375000000000000, + 5.062500000000000, + 7.593750000000000}; +static double *wghts=_wghts+5; /* wghts[i] = 1.5**i, valid for i in -5..5 */ + +/* Negative counterpart of _wghts: -0.5*1.5**i for i = -5..5, accessed through 'nwghts'. */ static double _nwghts[11]= +{ + -0.065843621399177, + -0.098765432098766, + -0.148148148148148, + -0.222222222222222, + -0.333333333333334, + -0.500000000000000, + -0.750000000000000, + -1.125000000000000, + -1.687500000000000, + -2.531250000000000, + -3.796875000000000}; +static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i, valid for i in -5..5 */ + +#define FTB_FLAG_TRUNC 1 
+/* At most one of the following flags can be set */ +#define FTB_FLAG_YES 2 +#define FTB_FLAG_NO 4 +#define FTB_FLAG_WONLY 8 + +typedef struct st_ftb_expr FTB_EXPR; +struct st_ftb_expr +{ + FTB_EXPR *up; + uint flags; +/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */ + my_off_t docid[2]; + my_off_t max_docid; + float weight; + float cur_weight; + LIST *phrase; /* phrase words */ + LIST *document; /* for phrase search */ + uint yesses; /* number of "yes" words matched */ + uint nos; /* number of "no" words matched */ + uint ythresh; /* number of "yes" words in expr */ + uint yweaks; /* number of "yes" words for scan only */ +}; + +typedef struct st_ftb_word +{ + FTB_EXPR *up; + uint flags; +/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */ + my_off_t docid[2]; /* for index search and for scan */ + my_off_t key_root; + my_off_t *max_docid; + MARIA_KEYDEF *keyinfo; + struct st_ftb_word *prev; + float weight; + uint ndepth; + uint len; + uchar off; + uchar word[1]; +} FTB_WORD; + +typedef struct st_ft_info +{ + struct _ft_vft *please; + MARIA_HA *info; + CHARSET_INFO *charset; + FTB_EXPR *root; + FTB_WORD **list; + FTB_WORD *last_word; + MEM_ROOT mem_root; + QUEUE queue; + TREE no_dupes; + my_off_t lastpos; + uint keynr; + uchar with_scan; + enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state; +} FTB; + +static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b) +{ + int i; + + /* if a==curdoc, take it as a < b */ + if (v && a->docid[0] == *v) + return -1; + + /* ORDER BY docid, ndepth DESC */ + i=CMP_NUM(a->docid[0], b->docid[0]); + if (!i) + i=CMP_NUM(b->ndepth,a->ndepth); + return i; +} + +static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b) +{ + /* ORDER BY word DESC, ndepth DESC */ + int i= ha_compare_text(cs, (uchar*) (*b)->word+1,(*b)->len-1, + (uchar*) (*a)->word+1,(*a)->len-1,0,0); + if (!i) + i=CMP_NUM((*b)->ndepth,(*a)->ndepth); + return i; +} + + +typedef struct st_my_ftb_param +{ + FTB *ftb; + FTB_EXPR 
*ftbe; + uchar *up_quot; + uint depth; +} MY_FTB_PARAM; + + +static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param, + char *word, int word_len, + MYSQL_FTPARSER_BOOLEAN_INFO *info) +{ + MY_FTB_PARAM *ftb_param= param->mysql_ftparam; + FTB_WORD *ftbw; + FTB_EXPR *ftbe, *tmp_expr; + FT_WORD *phrase_word; + LIST *tmp_element; + int r= info->weight_adjust; + float weight= (float) + (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)]; + + switch (info->type) { + case FT_TOKEN_WORD: + ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root, + sizeof(FTB_WORD) + + (info->trunc ? HA_MAX_KEY_BUFF : + word_len * ftb_param->ftb->charset->mbmaxlen + + HA_FT_WLEN + + ftb_param->ftb->info->s->rec_reflength)); + ftbw->len= word_len + 1; + ftbw->flags= 0; + ftbw->off= 0; + if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES; + if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO; + if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC; + ftbw->weight= weight; + ftbw->up= ftb_param->ftbe; + ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR; + ftbw->ndepth= (info->yesno < 0) + ftb_param->depth; + ftbw->key_root= HA_OFFSET_ERROR; + memcpy(ftbw->word + 1, word, word_len); + ftbw->word[0]= word_len; + if (info->yesno > 0) ftbw->up->ythresh++; + ftb_param->ftb->queue.max_elements++; + ftbw->prev= ftb_param->ftb->last_word; + ftb_param->ftb->last_word= ftbw; + ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC); + for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up) + if (! (tmp_expr->flags & FTB_FLAG_YES)) + break; + ftbw->max_docid= &tmp_expr->max_docid; + /* fall through */ + case FT_TOKEN_STOPWORD: + if (! 
ftb_param->up_quot) break; + phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD)); + tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST)); + phrase_word->pos= word; + phrase_word->len= word_len; + tmp_element->data= (void *)phrase_word; + ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element); + /* Allocate document list at this point. + It allows to avoid huge amount of allocs/frees for each row.*/ + tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST)); + tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD)); + ftb_param->ftbe->document= + list_add(ftb_param->ftbe->document, tmp_element); + break; + case FT_TOKEN_LEFT_PAREN: + ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR)); + ftbe->flags= 0; + if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES; + if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO; + ftbe->weight= weight; + ftbe->up= ftb_param->ftbe; + ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0; + ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR; + ftbe->phrase= NULL; + ftbe->document= 0; + if (info->quot) ftb_param->ftb->with_scan|= 2; + if (info->yesno > 0) ftbe->up->ythresh++; + ftb_param->ftbe= ftbe; + ftb_param->depth++; + ftb_param->up_quot= info->quot; + break; + case FT_TOKEN_RIGHT_PAREN: + if (ftb_param->ftbe->document) + { + /* Circuit document list */ + for (tmp_element= ftb_param->ftbe->document; + tmp_element->next; tmp_element= tmp_element->next) /* no-op */; + tmp_element->next= ftb_param->ftbe->document; + ftb_param->ftbe->document->prev= tmp_element; + } + info->quot= 0; + if (ftb_param->ftbe->up) + { + DBUG_ASSERT(ftb_param->depth); + ftb_param->ftbe= ftb_param->ftbe->up; + ftb_param->depth--; + ftb_param->up_quot= 0; + } + break; + case FT_TOKEN_EOF: + default: + break; + } + return(0); +} + + +static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param, + char *query, int len) +{ + MY_FTB_PARAM *ftb_param= 
param->mysql_ftparam; + MYSQL_FTPARSER_BOOLEAN_INFO info; + CHARSET_INFO *cs= ftb_param->ftb->charset; + uchar **start= (uchar**) &query; + char *end= query + len; + FT_WORD w; + + info.prev= ' '; + info.quot= 0; + while (maria_ft_get_word(cs, start, end, &w, &info)) + param->mysql_add_word(param, w.pos, w.len, &info); + return(0); +} + + +static int _ftb_parse_query(FTB *ftb, uchar *query, uint len, + struct st_mysql_ftparser *parser) +{ + MYSQL_FTPARSER_PARAM *param; + MY_FTB_PARAM ftb_param; + DBUG_ENTER("_ftb_parse_query"); + DBUG_ASSERT(parser); + + if (ftb->state != UNINITIALIZED) + DBUG_RETURN(0); + if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0))) + DBUG_RETURN(1); + + ftb_param.ftb= ftb; + ftb_param.depth= 0; + ftb_param.ftbe= ftb->root; + ftb_param.up_quot= 0; + + param->mysql_parse= ftb_parse_query_internal; + param->mysql_add_word= ftb_query_add_word; + param->mysql_ftparam= (void *)&ftb_param; + param->cs= ftb->charset; + param->doc= query; + param->length= len; + param->flags= 0; + param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO; + DBUG_RETURN(parser->parse(param)); +} + + +static int _ftb_no_dupes_cmp(void* not_used __attribute__((unused)), + const void *a,const void *b) +{ + return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b))); +} + +/* returns 1 if the search was finished (must-word wasn't found) */ +static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search) +{ + int r; + int subkeys=1; + my_bool can_go_down; + MARIA_HA *info=ftb->info; + uint off= 0, extra=HA_FT_WLEN+info->s->base.rec_reflength; + uchar *lastkey_buf= ftbw->word+ftbw->off; + + if (ftbw->flags & FTB_FLAG_TRUNC) + lastkey_buf+=ftbw->len; + + if (init_search) + { + ftbw->key_root=info->s->state.key_root[ftb->keynr]; + ftbw->keyinfo=info->s->keyinfo+ftb->keynr; + + r= _ma_search(info, ftbw->keyinfo, ftbw->word, ftbw->len, + SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root); + } + else + { + uint sflag= SEARCH_BIGGER; + if (ftbw->docid[0] < *ftbw->max_docid) 
+ { + sflag|= SEARCH_SAME; + _ma_dpointer(info, (ftbw->word + ftbw->len + HA_FT_WLEN), + *ftbw->max_docid); + } + r= _ma_search(info, ftbw->keyinfo, lastkey_buf, + USE_WHOLE_KEY, sflag, ftbw->key_root); + } + + can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC))); + /* Skip rows inserted by concurrent insert */ + while (!r) + { + if (can_go_down) + { + /* going down ? */ + off=info->lastkey_length-extra; + subkeys=ft_sintXkorr(info->lastkey+off); + } + if (subkeys<0 || info->cur_row.lastpos < info->state->data_file_length) + break; + r= _ma_search_next(info, ftbw->keyinfo, info->lastkey, + info->lastkey_length, + SEARCH_BIGGER, ftbw->key_root); + } + + if (!r && !ftbw->off) + { + r= ha_compare_text(ftb->charset, + (uchar*) info->lastkey+1, + info->lastkey_length-extra-1, + (uchar*) ftbw->word+1, + ftbw->len-1, + (my_bool) (ftbw->flags & FTB_FLAG_TRUNC), 0); + } + + if (r) /* not found */ + { + if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC)) + { + ftbw->docid[0]=HA_OFFSET_ERROR; + if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0) + { + /* + This word MUST BE present in every document returned, + so we can stop the search right now + */ + ftb->state=INDEX_DONE; + return 1; /* search is done */ + } + else + return 0; + } + + /* going up to the first-level tree to continue search there */ + _ma_dpointer(info, (lastkey_buf+HA_FT_WLEN), ftbw->key_root); + ftbw->key_root=info->s->state.key_root[ftb->keynr]; + ftbw->keyinfo=info->s->keyinfo+ftb->keynr; + ftbw->off=0; + return _ft2_search(ftb, ftbw, 0); + } + + /* matching key found */ + memcpy(lastkey_buf, info->lastkey, info->lastkey_length); + if (lastkey_buf == ftbw->word) + ftbw->len=info->lastkey_length-extra; + + /* going down ? 
*/ + if (subkeys<0) + { + /* + yep, going down, to the second-level tree + TODO here: subkey-based optimization + */ + ftbw->off=off; + ftbw->key_root= info->cur_row.lastpos; + ftbw->keyinfo=& info->s->ft2_keyinfo; + r= _ma_search_first(info, ftbw->keyinfo, ftbw->key_root); + DBUG_ASSERT(r==0); /* found something */ + memcpy(lastkey_buf+off, info->lastkey, info->lastkey_length); + } + ftbw->docid[0]= info->cur_row.lastpos; + if (ftbw->flags & FTB_FLAG_YES) + *ftbw->max_docid= info->cur_row.lastpos; + return 0; +} + +static void _ftb_init_index_search(FT_INFO *ftb) +{ + int i; + FTB_WORD *ftbw; + + if ((ftb->state != READY && ftb->state !=INDEX_DONE) || + ftb->keynr == NO_SUCH_KEY) + return; + ftb->state=INDEX_SEARCH; + + for (i=ftb->queue.elements; i; i--) + { + ftbw=(FTB_WORD *)(ftb->queue.root[i]); + + if (ftbw->flags & FTB_FLAG_TRUNC) + { + /* + special treatment for truncation operator + 1. there are some (besides this) +words + | no need to search in the index, it can never ADD new rows + | to the result, and to remove half-matched rows we do scan anyway + 2. -trunc* + | same as 1. + 3. in 1 and 2, +/- need not be on the same expr. level, + but can be on any upper level, as in +word +(trunc1* trunc2*) + 4. otherwise + | We have to index-search for this prefix. + | It may cause duplicates, as in the index (sorted by <word,docid>) + | <aaaa,row1> + | <aabb,row2> + | <aacc,row1> + | Searching for "aa*" will find row1 twice... 
+ */ + FTB_EXPR *ftbe; + for (ftbe=(FTB_EXPR*)ftbw; + ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC); + ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up) + { + if (ftbe->flags & FTB_FLAG_NO || /* 2 */ + ftbe->up->ythresh - ftbe->up->yweaks >1) /* 1 */ + { + FTB_EXPR *top_ftbe=ftbe->up; + ftbw->docid[0]=HA_OFFSET_ERROR; + for (ftbe=(FTB_EXPR *)ftbw; + ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO); + ftbe=ftbe->up) + ftbe->up->yweaks++; + ftbe=0; + break; + } + } + if (!ftbe) + continue; + /* 4 */ + if (!is_tree_inited(& ftb->no_dupes)) + init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t), + _ftb_no_dupes_cmp,0,0,0); + else + reset_tree(& ftb->no_dupes); + } + + ftbw->off=0; /* in case of reinit */ + if (_ft2_search(ftb, ftbw, 1)) + return; + } + queue_fix(& ftb->queue); +} + + +FT_INFO * maria_ft_init_boolean_search(MARIA_HA *info, uint keynr, uchar *query, + uint query_len, CHARSET_INFO *cs) +{ + FTB *ftb; + FTB_EXPR *ftbe; + FTB_WORD *ftbw; + + if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME)))) + return 0; + ftb->please= (struct _ft_vft *) & _ma_ft_vft_boolean; + ftb->state=UNINITIALIZED; + ftb->info=info; + ftb->keynr=keynr; + ftb->charset=cs; + DBUG_ASSERT(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset); + ftb->with_scan=0; + ftb->lastpos=HA_OFFSET_ERROR; + bzero(& ftb->no_dupes, sizeof(TREE)); + ftb->last_word= 0; + + init_alloc_root(&ftb->mem_root, 1024, 1024); + ftb->queue.max_elements= 0; + if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR)))) + goto err; + ftbe->weight=1; + ftbe->flags=FTB_FLAG_YES; + ftbe->nos=1; + ftbe->up=0; + ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0; + ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR; + ftbe->phrase= NULL; + ftbe->document= 0; + ftb->root=ftbe; + if (unlikely(_ftb_parse_query(ftb, query, query_len, + keynr == NO_SUCH_KEY ? 
&ft_default_parser : + info->s->keyinfo[keynr].parser))) + goto err; + /* + Hack: instead of init_queue, we'll use reinit queue to be able + to alloc queue with alloc_root() + */ + if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root, + (ftb->queue.max_elements + 1) * + sizeof(void *)))) + goto err; + reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0, + (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0); + for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev) + queue_insert(&ftb->queue, (uchar *)ftbw); + ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root, + sizeof(FTB_WORD *)*ftb->queue.elements); + memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements); + qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *), + (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset); + if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC; + ftb->state=READY; + return ftb; +err: + free_root(& ftb->mem_root, MYF(0)); + my_free((uchar*)ftb,MYF(0)); + return 0; +} + + +typedef struct st_my_ftb_phrase_param +{ + LIST *phrase; + LIST *document; + CHARSET_INFO *cs; + uint phrase_length; + uint document_length; + uint match; +} MY_FTB_PHRASE_PARAM; + + +static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param, + char *word, int word_len, + MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused))) +{ + MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam; + FT_WORD *w= (FT_WORD *)phrase_param->document->data; + LIST *phrase, *document; + w->pos= word; + w->len= word_len; + phrase_param->document= phrase_param->document->prev; + if (phrase_param->phrase_length > phrase_param->document_length) + { + phrase_param->document_length++; + return 0; + } + /* TODO: rewrite phrase search to avoid + comparing the same word twice. 
*/ + for (phrase= phrase_param->phrase, document= phrase_param->document->next; + phrase; phrase= phrase->next, document= document->next) + { + FT_WORD *phrase_word= (FT_WORD *)phrase->data; + FT_WORD *document_word= (FT_WORD *)document->data; + if (my_strnncoll(phrase_param->cs, + (uchar*) phrase_word->pos, phrase_word->len, + (uchar*) document_word->pos, document_word->len)) + return 0; + } + phrase_param->match++; + return 0; +} + + +static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param, + char *document, int len) +{ + FT_WORD word; + MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam; + const char *docend= document + len; + while (maria_ft_simple_get_word(phrase_param->cs, (uchar**) &document, + docend, &word, FALSE)) + { + param->mysql_add_word(param, word.pos, word.len, 0); + if (phrase_param->match) + break; + } + return 0; +} + + +/* + Checks if given buffer matches phrase list. + + SYNOPSIS + _ftb_check_phrase() + s0 start of buffer + e0 end of buffer + phrase broken into list phrase + cs charset info + + RETURN VALUE + 1 is returned if phrase found, 0 else. + -1 is returned if error occurs. +*/ + +static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len, + FTB_EXPR *ftbe, struct st_mysql_ftparser *parser) +{ + MY_FTB_PHRASE_PARAM ftb_param; + MYSQL_FTPARSER_PARAM *param; + DBUG_ENTER("_ftb_check_phrase"); + DBUG_ASSERT(parser); + + if (! 
(param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 1))) + DBUG_RETURN(0); + ftb_param.phrase= ftbe->phrase; + ftb_param.document= ftbe->document; + ftb_param.cs= ftb->charset; + ftb_param.phrase_length= list_length(ftbe->phrase); + ftb_param.document_length= 1; + ftb_param.match= 0; + + param->mysql_parse= ftb_check_phrase_internal; + param->mysql_add_word= ftb_phrase_add_word; + param->mysql_ftparam= (void *)&ftb_param; + param->cs= ftb->charset; + param->doc= (uchar *)document; + param->length= len; + param->flags= 0; + param->mode= MYSQL_FTPARSER_WITH_STOPWORDS; + if (unlikely(parser->parse(param))) + return -1; + DBUG_RETURN(ftb_param.match ? 1 : 0); +} + + +static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig) +{ + FT_SEG_ITERATOR ftsi; + FTB_EXPR *ftbe; + float weight=ftbw->weight; + int yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0); + my_off_t curdoc=ftbw->docid[mode]; + struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ? + &ft_default_parser : + ftb->info->s->keyinfo[ftb->keynr].parser; + + for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up) + { + ythresh = ftbe->ythresh - (mode ? 
0 : ftbe->yweaks); + if (ftbe->docid[mode] != curdoc) + { + ftbe->cur_weight=0; + ftbe->yesses=ftbe->nos=0; + ftbe->docid[mode]=curdoc; + } + if (ftbe->nos) + break; + if (yn_flag & FTB_FLAG_YES) + { + weight /= ftbe->ythresh; + ftbe->cur_weight += weight; + if ((int) ++ftbe->yesses == ythresh) + { + yn_flag=ftbe->flags; + weight=ftbe->cur_weight*ftbe->weight; + if (mode && ftbe->phrase) + { + int found= 0; + + memcpy(&ftsi, ftsi_orig, sizeof(ftsi)); + while (_ma_ft_segiterator(&ftsi) && !found) + { + if (!ftsi.pos) + continue; + found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser); + if (unlikely(found < 0)) + return 1; + } + if (!found) + break; + } /* ftbe->quot */ + } + else + break; + } + else + if (yn_flag & FTB_FLAG_NO) + { + /* + NOTE: special sort function of queue assures that all + (yn_flag & FTB_FLAG_NO) != 0 + events for every particular subexpression will + "auto-magically" happen BEFORE all the + (yn_flag & FTB_FLAG_YES) != 0 events. So no + already matched expression can become not-matched again. + */ + ++ftbe->nos; + break; + } + else + { + if (ftbe->ythresh) + weight/=3; + ftbe->cur_weight += weight; + if ((int) ftbe->yesses < ythresh) + break; + if (!(yn_flag & FTB_FLAG_WONLY)) + yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ; + weight*= ftbe->weight; + } + } + return 0; +} + + +int maria_ft_boolean_read_next(FT_INFO *ftb, char *record) +{ + FTB_EXPR *ftbe; + FTB_WORD *ftbw; + MARIA_HA *info=ftb->info; + my_off_t curdoc; + + if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE) + return -1; + + /* black magic ON */ + if ((int) _ma_check_index(info, ftb->keynr) < 0) + return my_errno; + if (_ma_readinfo(info, F_RDLCK, 1)) + return my_errno; + /* black magic OFF */ + + if (!ftb->queue.elements) + return my_errno=HA_ERR_END_OF_FILE; + + /* Attention!!! Address of a local variable is used here! 
See err: label */ + ftb->queue.first_cmp_arg=(void *)&curdoc; + + while (ftb->state == INDEX_SEARCH && + (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) != + HA_OFFSET_ERROR) + { + while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0]) + { + if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0))) + { + my_errno= HA_ERR_OUT_OF_MEM; + goto err; + } + + /* update queue */ + _ft2_search(ftb, ftbw, 0); + queue_replaced(& ftb->queue); + } + + ftbe=ftb->root; + if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 && + ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos) + { + /* curdoc matched ! */ + if (is_tree_inited(&ftb->no_dupes) && + tree_insert(&ftb->no_dupes, &curdoc, 0, + ftb->no_dupes.custom_arg)->count >1) + /* but it managed already to get past this line once */ + continue; + + info->cur_row.lastpos= curdoc; + /* Clear all states, except that the table was updated */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + if (!(*info->read_record)(info, record, curdoc)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + if (ftb->with_scan && maria_ft_boolean_find_relevance(ftb,record,0)==0) + continue; /* no match */ + my_errno=0; + goto err; + } + goto err; + } + } + ftb->state=INDEX_DONE; + my_errno=HA_ERR_END_OF_FILE; +err: + ftb->queue.first_cmp_arg=(void *)0; + return my_errno; +} + + +typedef struct st_my_ftb_find_param +{ + FT_INFO *ftb; + FT_SEG_ITERATOR *ftsi; +} MY_FTB_FIND_PARAM; + + +static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param, + char *word, int len, + MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused))) +{ + MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam; + FT_INFO *ftb= ftb_param->ftb; + FTB_WORD *ftbw; + int a, b, c; + for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2) + { + ftbw= ftb->list[c]; + if (ha_compare_text(ftb->charset, (uchar*)word, len, + (uchar*)ftbw->word+1, ftbw->len-1, + (my_bool)(ftbw->flags&FTB_FLAG_TRUNC), 0) > 0) + b= c; + else + a= c; 
+ } + for (; c >= 0; c--) + { + ftbw= ftb->list[c]; + if (ha_compare_text(ftb->charset, (uchar*)word, len, + (uchar*)ftbw->word + 1,ftbw->len - 1, + (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0)) + break; + if (ftbw->docid[1] == ftb->info->cur_row.lastpos) + continue; + ftbw->docid[1]= ftb->info->cur_row.lastpos; + if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi))) + return 1; + } + return(0); +} + + +static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param, + char *doc, int len) +{ + MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam; + FT_INFO *ftb= ftb_param->ftb; + char *end= doc + len; + FT_WORD w; + while (maria_ft_simple_get_word(ftb->charset, (uchar**) &doc, end, &w, TRUE)) + param->mysql_add_word(param, w.pos, w.len, 0); + return(0); +} + + +float maria_ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length) +{ + FTB_EXPR *ftbe; + FT_SEG_ITERATOR ftsi, ftsi2; + MARIA_RECORD_POS docid= ftb->info->cur_row.lastpos; + MY_FTB_FIND_PARAM ftb_param; + MYSQL_FTPARSER_PARAM *param; + struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ? + &ft_default_parser : + ftb->info->s->keyinfo[ftb->keynr].parser; + + if (docid == HA_OFFSET_ERROR) + return -2.0; + if (!ftb->queue.elements) + return 0; + if (! 
(param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0))) + return 0; + + if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos) + { + FTB_EXPR *x; + uint i; + + for (i=0; i < ftb->queue.elements; i++) + { + ftb->list[i]->docid[1]=HA_OFFSET_ERROR; + for (x=ftb->list[i]->up; x; x=x->up) + x->docid[1]=HA_OFFSET_ERROR; + } + } + + ftb->lastpos=docid; + + if (ftb->keynr==NO_SUCH_KEY) + _ma_ft_segiterator_dummy_init(record, length, &ftsi); + else + _ma_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi); + memcpy(&ftsi2, &ftsi, sizeof(ftsi)); + + ftb_param.ftb= ftb; + ftb_param.ftsi= &ftsi2; + param->mysql_parse= ftb_find_relevance_parse; + param->mysql_add_word= ftb_find_relevance_add_word; + param->mysql_ftparam= (void *)&ftb_param; + param->flags= 0; + param->cs= ftb->charset; + param->mode= MYSQL_FTPARSER_SIMPLE_MODE; + + while (_ma_ft_segiterator(&ftsi)) + { + if (!ftsi.pos) + continue; + param->doc= (uchar *)ftsi.pos; + param->length= ftsi.len; + if (unlikely(parser->parse(param))) + return 0; + } + ftbe=ftb->root; + if (ftbe->docid[1]==docid && ftbe->cur_weight>0 && + ftbe->yesses>=ftbe->ythresh && !ftbe->nos) + { /* row matched ! */ + return ftbe->cur_weight; + } + else + { /* match failed ! 
*/ + return 0.0; + } +} + + +void maria_ft_boolean_close_search(FT_INFO *ftb) +{ + if (is_tree_inited(& ftb->no_dupes)) + { + delete_tree(& ftb->no_dupes); + } + free_root(& ftb->mem_root, MYF(0)); + my_free((uchar*)ftb,MYF(0)); +} + + +float maria_ft_boolean_get_relevance(FT_INFO *ftb) +{ + return ftb->root->cur_weight; +} + + +void maria_ft_boolean_reinit_search(FT_INFO *ftb) +{ + _ftb_init_index_search(ftb); +} diff --git a/storage/maria/ma_ft_eval.c b/storage/maria/ma_ft_eval.c new file mode 100644 index 00000000000..5fc67c6c664 --- /dev/null +++ b/storage/maria/ma_ft_eval.c @@ -0,0 +1,254 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code + added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */ + +#include "ma_ftdefs.h" +#include "maria_ft_eval.h" +#include <stdarg.h> +#include <my_getopt.h> + +static void print_error(int exit_code, const char *fmt,...); +static void get_options(int argc, char *argv[]); +static int create_record(char *pos, FILE *file); +static void usage(); + +static struct my_option my_long_options[] = +{ + {"", 's', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'q', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '#', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +int main(int argc, char *argv[]) +{ + MARIA_HA *file; + int i,j; + + MY_INIT(argv[0]); + get_options(argc,argv); + bzero((char*)recinfo,sizeof(recinfo)); + + maria_init(); + /* First define 2 columns */ + recinfo[0].type=FIELD_SKIP_ENDSPACE; + recinfo[0].length=docid_length; + recinfo[1].type=FIELD_BLOB; + recinfo[1].length= 4+portable_sizeof_char_ptr; + + /* Define a key over the first column */ + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[0].seg[0].type= HA_KEYTYPE_TEXT; + keyinfo[0].seg[0].flag= HA_BLOB_PART; + keyinfo[0].seg[0].start=recinfo[0].length; + keyinfo[0].seg[0].length=key_length; + keyinfo[0].seg[0].null_bit=0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].bit_start=4; + keyinfo[0].seg[0].language=MY_CHARSET_CURRENT; + keyinfo[0].flag = HA_FULLTEXT; + + if (!silent) + printf("- Creating isam-file\n"); + if (maria_create(filename,1,keyinfo,2,recinfo,0,NULL,(MARIA_CREATE_INFO*) 0,0)) + goto err; + if 
(!(file=maria_open(filename,2,0)))
+    goto err;
+  if (!silent)
+    printf("Initializing stopwords\n");
+  maria_ft_init_stopwords(stopwordlist);
+
+  if (!silent)
+    printf("- Writing key:s\n");
+
+  my_errno=0;
+  i=0;
+  while (create_record(record,df))
+  {
+    error=maria_write(file,record);
+    if (error)
+      printf("I= %2d  maria_write: %d  errno: %d\n",i,error,my_errno);
+    i++;
+  }
+  fclose(df);
+
+  if (maria_close(file)) goto err;
+  if (!silent)
+    printf("- Reopening file\n");
+  if (!(file=maria_open(filename,2,0))) goto err;
+  if (!silent)
+    printf("- Reading rows with key\n");
+  for (i=1;create_record(record,qf);i++)
+  {
+    FT_DOCLIST *result;
+    double w;
+    int t, err;
+
+    result=maria_ft_nlq_init_search(file,0,blob_record,(uint) strlen(blob_record),1);
+    if (!result)
+    {
+      printf("Query %d failed with errno %3d\n",i,my_errno);
+      goto err;
+    }
+    if (!silent)
+      printf("Query %d. Found: %d.\n",i,result->ndocs);
+    for (j=0;(err=maria_ft_nlq_read_next(result, read_record))==0;j++)
+    {
+      t=uint2korr(read_record);
+      w=maria_ft_nlq_get_relevance(result);
+      printf("%d %.*s %f\n",i,t,read_record+2,w);
+    }
+    if (err != HA_ERR_END_OF_FILE)
+    {
+      printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno);
+      goto err;
+    }
+    maria_ft_nlq_close_search(result);
+  }
+
+  if (maria_close(file)) goto err;
+  maria_end();
+  my_end(MY_CHECK_ERROR);
+
+  return (0);
+
+ err:
+  printf("got error: %3d when using maria-database\n",my_errno);
+  return 1;			/* skip warning */
+
+}
+
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+	       char *argument)
+{
+  switch (optid) {
+  case 's':                     /* load a stop-word list from the file named by argument */
+    if (stopwordlist && stopwordlist != maria_ft_precompiled_stopwords)
+      break;                    /* a custom list is already loaded: keep it */
+    {
+      FILE *f; char s[HA_FT_MAXLEN]; int i=0,n=SWL_INIT;
+
+      if (!(stopwordlist=(const char**) malloc(n*sizeof(char *))))
+	print_error(1,"malloc(%d)",(int) (n*sizeof(char *)));  /* %d needs an int, not size_t */
+      if (!(f=fopen(argument,"r")))
+	print_error(1,"fopen(%s)",argument);
+      while (fgets(s,HA_FT_MAXLEN,f))   /* fgets() itself signals EOF; feof() is only set after a failed read */
+      {
+	if (!(stopwordlist[i++]=strdup(s)))
+	  print_error(1,"strdup(%s)",s);
+	if (i >= n)             /* grow early so the terminating NULL below always fits */
+	{
+	  n+=SWL_PLUS;
+	  if (!(stopwordlist=(const char**) realloc((char*) stopwordlist,
+						    n*sizeof(char *))))
+	    print_error(1,"realloc(%d)",(int) (n*sizeof(char *)));
+	}
+      }
+      if (ferror(f))            /* a real read error, as opposed to plain end of file */
+	print_error(1,"fgets(s,%d,%s)",HA_FT_MAXLEN,argument);
+      fclose(f);
+      stopwordlist[i]=NULL;
+      break;
+    }
+  case 'q': silent=1; break;
+  case 'S': if (stopwordlist==maria_ft_precompiled_stopwords) stopwordlist=NULL; break;
+  case '#':
+    DBUG_PUSH (argument);
+    break;
+  case 'V':
+  case '?':
+  case 'h':
+    usage();
+    exit(1);
+  }
+  return 0;
+}
+
+
+static void get_options(int argc, char *argv[])
+{
+  int ho_error;
+
+  if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option)))
+    exit(ho_error);
+
+  if (!(d_file=argv[optind])) print_error(1,"No d_file");
+  if (!(df=fopen(d_file,"r")))
+    print_error(1,"fopen(%s)",d_file);
+  if (!(q_file=argv[optind+1])) print_error(1,"No q_file");
+  if (!(qf=fopen(q_file,"r")))
+    print_error(1,"fopen(%s)",q_file);
+  return;
+} /* get options */
+
+
+static int create_record(char *pos, FILE *file)
+{
+  uint tmp; char *ptr;
+
+  bzero((char *)pos,MAX_REC_LENGTH);
+
+  /* column 1 - VARCHAR */
+  if (!(fgets(pos+2,MAX_REC_LENGTH-32,file)))
+  {
+    if (feof(file))
+      return 0;
+    else
+      print_error(1,"fgets(docid) - 1");
+  }
+  tmp=(uint) strlen(pos+2)-1;
+  int2store(pos,tmp);
+  pos+=recinfo[0].length;
+
+  /* column 2 - BLOB */
+
+  if (!(fgets(blob_record,MAX_BLOB_LENGTH,file)))
+    print_error(1,"fgets(docid) - 2");
+  tmp=(uint) strlen(blob_record);
+  int4store(pos,tmp);
+  ptr=blob_record;
+  memcpy_fixed(pos+4,&ptr,sizeof(char*));
+  return 1;
+}
+
+/* VARARGS */
+
+static void print_error(int exit_code, const char *fmt,...)
+{ + va_list args; + + va_start(args,fmt); + fprintf(stderr,"%s: error: ",my_progname); + VOID(vfprintf(stderr, fmt, args)); + VOID(fputc('\n',stderr)); + fflush(stderr); + va_end(args); + exit(exit_code); +} + + +static void usage() +{ + printf("%s [options]\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} diff --git a/storage/maria/ma_ft_eval.h b/storage/maria/ma_ft_eval.h new file mode 100644 index 00000000000..481943dfb0b --- /dev/null +++ b/storage/maria/ma_ft_eval.h @@ -0,0 +1,41 @@ +/* Copyright (C) 2006 MySQL AB & Sergei A. Golubchik + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +const char **stopwordlist=maria_ft_precompiled_stopwords; + +#define MAX_REC_LENGTH 128 +#define MAX_BLOB_LENGTH 60000 +char record[MAX_REC_LENGTH], read_record[MAX_REC_LENGTH+MAX_BLOB_LENGTH]; +char blob_record[MAX_BLOB_LENGTH+20*20]; + +char *filename= (char*) "EVAL"; + +int silent=0, error=0; + +uint key_length=MAX_BLOB_LENGTH,docid_length=32; +char *d_file, *q_file; +FILE *df,*qf; + +MARIA_COLUMNDEF recinfo[3]; +MARIA_KEYDEF keyinfo[2]; +HA_KEYSEG keyseg[10]; + +#define SWL_INIT 500 +#define SWL_PLUS 50 + +#define MAX_LINE_LENGTH 128 +char line[MAX_LINE_LENGTH]; diff --git a/storage/maria/ma_ft_nlq_search.c b/storage/maria/ma_ft_nlq_search.c new file mode 100644 index 00000000000..18b101f0e05 --- /dev/null +++ b/storage/maria/ma_ft_nlq_search.c @@ -0,0 +1,374 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +#define FT_CORE +#include "ma_ftdefs.h" + +/* search with natural language queries */ + +typedef struct ft_doc_rec +{ + my_off_t dpos; + double weight; +} FT_DOC; + +struct st_ft_info +{ + struct _ft_vft *please; + MARIA_HA *info; + int ndocs; + int curdoc; + FT_DOC doc[1]; +}; + +typedef struct st_all_in_one +{ + MARIA_HA *info; + uint keynr; + CHARSET_INFO *charset; + uchar *keybuff; + TREE dtree; +} ALL_IN_ONE; + +typedef struct st_ft_superdoc +{ + FT_DOC doc; + FT_WORD *word_ptr; + double tmp_weight; +} FT_SUPERDOC; + +static int FT_SUPERDOC_cmp(void* cmp_arg __attribute__((unused)), + FT_SUPERDOC *p1, FT_SUPERDOC *p2) +{ + if (p1->doc.dpos < p2->doc.dpos) + return -1; + if (p1->doc.dpos == p2->doc.dpos) + return 0; + return 1; +} + +static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio) +{ + int subkeys, r; + uint keylen, doc_cnt; + FT_SUPERDOC sdoc, *sptr; + TREE_ELEMENT *selem; + double gweight=1; + MARIA_HA *info= aio->info; + uchar *keybuff= (uchar*) aio->keybuff; + MARIA_KEYDEF *keyinfo=info->s->keyinfo+aio->keynr; + my_off_t key_root=info->s->state.key_root[aio->keynr]; + uint extra=HA_FT_WLEN+info->s->base.rec_reflength; +#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT + float tmp_weight; +#else +#error +#endif + + DBUG_ENTER("walk_and_match"); + + word->weight=LWS_FOR_QUERY; + + keylen= _ma_ft_make_key(info,aio->keynr,(char*) keybuff,word,0); + keylen-=HA_FT_WLEN; + doc_cnt=0; + + /* Skip rows inserted by current inserted */ + for (r= _ma_search(info, keyinfo, keybuff, keylen, SEARCH_FIND, key_root) ; + !r && + (subkeys=ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 && + info->cur_row.lastpos >= info->state->data_file_length ; + r= _ma_search_next(info, keyinfo, info->lastkey, + info->lastkey_length, SEARCH_BIGGER, key_root)) + ; + + info->update|= HA_STATE_AKTIV; /* for _ma_test_if_changed() */ + + /* The following should be safe, even if we compare doubles */ + while (!r && 
gweight) + { + + if (keylen && + ha_compare_text(aio->charset, + (uchar*) info->lastkey+1, info->lastkey_length-extra-1, + (uchar*) keybuff+1, keylen-1, 0, 0)) + break; + + if (subkeys<0) + { + if (doc_cnt) + DBUG_RETURN(1); /* index is corrupted */ + /* + TODO here: unsafe optimization, should this word + be skipped (based on subkeys) ? + */ + keybuff+=keylen; + keyinfo=& info->s->ft2_keyinfo; + key_root= info->cur_row.lastpos; + keylen=0; + r= _ma_search_first(info, keyinfo, key_root); + goto do_skip; + } +#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT + tmp_weight=*(float*)&subkeys; +#else +#error +#endif + /* The following should be safe, even if we compare doubles */ + if (tmp_weight==0) + DBUG_RETURN(doc_cnt); /* stopword, doc_cnt should be 0 */ + + sdoc.doc.dpos= info->cur_row.lastpos; + + /* saving document matched into dtree */ + if (!(selem=tree_insert(&aio->dtree, &sdoc, 0, aio->dtree.custom_arg))) + DBUG_RETURN(1); + + sptr=(FT_SUPERDOC *)ELEMENT_KEY((&aio->dtree), selem); + + if (selem->count==1) /* document's first match */ + sptr->doc.weight=0; + else + sptr->doc.weight+=sptr->tmp_weight*sptr->word_ptr->weight; + + sptr->word_ptr=word; + sptr->tmp_weight=tmp_weight; + + doc_cnt++; + + gweight=word->weight*GWS_IN_USE; + if (gweight < 0 || doc_cnt > 2000000) + gweight=0; + + if (_ma_test_if_changed(info) == 0) + r= _ma_search_next(info, keyinfo, info->lastkey, info->lastkey_length, + SEARCH_BIGGER, key_root); + else + r= _ma_search(info, keyinfo, info->lastkey, info->lastkey_length, + SEARCH_BIGGER, key_root); +do_skip: + while ((subkeys=ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 && + !r && info->cur_row.lastpos >= info->state->data_file_length) + r= _ma_search_next(info, keyinfo, info->lastkey, info->lastkey_length, + SEARCH_BIGGER, key_root); + + } + word->weight=gweight; + + DBUG_RETURN(0); +} + + +static int walk_and_copy(FT_SUPERDOC *from, + uint32 count __attribute__((unused)), FT_DOC **to) +{ + DBUG_ENTER("walk_and_copy"); + 
from->doc.weight+=from->tmp_weight*from->word_ptr->weight; + (*to)->dpos=from->doc.dpos; + (*to)->weight=from->doc.weight; + (*to)++; + DBUG_RETURN(0); +} + +static int walk_and_push(FT_SUPERDOC *from, + uint32 count __attribute__((unused)), QUEUE *best) +{ + DBUG_ENTER("walk_and_copy"); + from->doc.weight+=from->tmp_weight*from->word_ptr->weight; + set_if_smaller(best->elements, ft_query_expansion_limit-1); + queue_insert(best, (uchar *)& from->doc); + DBUG_RETURN(0); +} + + +static int FT_DOC_cmp(void *unused __attribute__((unused)), + FT_DOC *a, FT_DOC *b) +{ + return sgn(b->weight - a->weight); +} + + +FT_INFO *maria_ft_init_nlq_search(MARIA_HA *info, uint keynr, uchar *query, + uint query_len, uint flags, uchar *record) +{ + TREE wtree; + ALL_IN_ONE aio; + FT_DOC *dptr; + FT_INFO *dlist=NULL; + MARIA_RECORD_POS saved_lastpos= info->cur_row.lastpos; + struct st_mysql_ftparser *parser; + MYSQL_FTPARSER_PARAM *ftparser_param; + DBUG_ENTER("maria_ft_init_nlq_search"); + + /* black magic ON */ + if ((int) (keynr = _ma_check_index(info,keynr)) < 0) + DBUG_RETURN(NULL); + if (_ma_readinfo(info,F_RDLCK,1)) + DBUG_RETURN(NULL); + /* black magic OFF */ + + aio.info=info; + aio.keynr=keynr; + aio.charset=info->s->keyinfo[keynr].seg->charset; + aio.keybuff= (uchar*) info->lastkey+info->s->base.max_key_length; + parser= info->s->keyinfo[keynr].parser; + if (! 
(ftparser_param= maria_ftparser_call_initializer(info, keynr, 0))) + goto err; + + bzero(&wtree,sizeof(wtree)); + + init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp,0, + NULL, NULL); + + maria_ft_parse_init(&wtree, aio.charset); + ftparser_param->flags= 0; + if (maria_ft_parse(&wtree, query, query_len, parser, ftparser_param, + &wtree.mem_root)) + goto err; + + if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio, + left_root_right)) + goto err; + + if (flags & FT_EXPAND && ft_query_expansion_limit) + { + QUEUE best; + init_queue(&best,ft_query_expansion_limit,0,0, (queue_compare) &FT_DOC_cmp, + 0); + tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push, + &best, left_root_right); + while (best.elements) + { + my_off_t docid=((FT_DOC *)queue_remove(& best, 0))->dpos; + if (!(*info->read_record)(info, record, docid)) + { + info->update|= HA_STATE_AKTIV; + ftparser_param->flags= MYSQL_FTFLAGS_NEED_COPY; + if (unlikely(_ma_ft_parse(&wtree, info, keynr, record, ftparser_param, + &wtree.mem_root))) + { + delete_queue(&best); + goto err; + } + } + } + delete_queue(&best); + reset_tree(&aio.dtree); + if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio, + left_root_right)) + goto err; + + } + + /* + If ndocs == 0, this will not allocate RAM for FT_INFO.doc[], + so if ndocs == 0, FT_INFO.doc[] must not be accessed. 
+ */ + dlist=(FT_INFO *)my_malloc(sizeof(FT_INFO)+ + sizeof(FT_DOC)* + (int)(aio.dtree.elements_in_tree-1), + MYF(0)); + if (!dlist) + goto err; + + dlist->please= (struct _ft_vft *) & _ma_ft_vft_nlq; + dlist->ndocs=aio.dtree.elements_in_tree; + dlist->curdoc=-1; + dlist->info=aio.info; + dptr=dlist->doc; + + tree_walk(&aio.dtree, (tree_walk_action) &walk_and_copy, + &dptr, left_root_right); + + if (flags & FT_SORTED) + qsort2(dlist->doc, dlist->ndocs, sizeof(FT_DOC), (qsort2_cmp)&FT_DOC_cmp, 0); + +err: + delete_tree(&aio.dtree); + delete_tree(&wtree); + info->cur_row.lastpos= saved_lastpos; + DBUG_RETURN(dlist); +} + + +int maria_ft_nlq_read_next(FT_INFO *handler, char *record) +{ + MARIA_HA *info= (MARIA_HA *) handler->info; + + if (++handler->curdoc >= handler->ndocs) + { + --handler->curdoc; + return HA_ERR_END_OF_FILE; + } + + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + info->cur_row.lastpos= handler->doc[handler->curdoc].dpos; + if (!(*info->read_record)(info, record, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + return 0; + } + return my_errno; +} + + +float maria_ft_nlq_find_relevance(FT_INFO *handler, + uchar *record __attribute__((unused)), + uint length __attribute__((unused))) +{ + int a,b,c; + FT_DOC *docs=handler->doc; + MARIA_RECORD_POS docid= handler->info->cur_row.lastpos; + + if (docid == HA_POS_ERROR) + return -5.0; + + /* Assuming docs[] is sorted by dpos... 
*/ + + for (a=0, b=handler->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2) + { + if (docs[c].dpos > docid) + b=c; + else + a=c; + } + /* bounds check to avoid accessing unallocated handler->doc */ + if (a < handler->ndocs && docs[a].dpos == docid) + return (float) docs[a].weight; + else + return 0.0; +} + + +void maria_ft_nlq_close_search(FT_INFO *handler) +{ + my_free((uchar*)handler,MYF(0)); +} + + +float maria_ft_nlq_get_relevance(FT_INFO *handler) +{ + return (float) handler->doc[handler->curdoc].weight; +} + + +void maria_ft_nlq_reinit_search(FT_INFO *handler) +{ + handler->curdoc=-1; +} + diff --git a/storage/maria/ma_ft_parser.c b/storage/maria/ma_ft_parser.c new file mode 100644 index 00000000000..2cbbb2dc5f7 --- /dev/null +++ b/storage/maria/ma_ft_parser.c @@ -0,0 +1,426 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +#include "ma_ftdefs.h" + +typedef struct st_maria_ft_docstat { + FT_WORD *list; + uint uniq; + double sum; +} FT_DOCSTAT; + + +typedef struct st_my_maria_ft_parser_param +{ + TREE *wtree; + MEM_ROOT *mem_root; +} MY_FT_PARSER_PARAM; + + +static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2) +{ + return ha_compare_text(cs, (uchar*) w1->pos, w1->len, + (uchar*) w2->pos, w2->len, 0, 0); +} + +static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat) +{ + word->weight=LWS_IN_USE; + docstat->sum+=word->weight; + memcpy_fixed((docstat->list)++,word,sizeof(FT_WORD)); + return 0; +} + +/* transforms tree of words into the array, applying normalization */ + +FT_WORD * maria_ft_linearize(TREE *wtree, MEM_ROOT *mem_root) +{ + FT_WORD *wlist,*p; + FT_DOCSTAT docstat; + DBUG_ENTER("maria_ft_linearize"); + + if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)* + (1+wtree->elements_in_tree)))) + { + docstat.list=wlist; + docstat.uniq=wtree->elements_in_tree; + docstat.sum=0; + tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right); + } + delete_tree(wtree); + if (!wlist) + DBUG_RETURN(NULL); + + docstat.list->pos=NULL; + + for (p=wlist;p->pos;p++) + { + p->weight=PRENORM_IN_USE; + } + + for (p=wlist;p->pos;p++) + { + p->weight/=NORM_IN_USE; + } + + DBUG_RETURN(wlist); +} + +my_bool maria_ft_boolean_check_syntax_string(const uchar *str) +{ + uint i, j; + + if (!str || + (strlen(str)+1 != sizeof(ft_boolean_syntax)) || + (str[0] != ' ' && str[1] != ' ')) + return 1; + for (i=0; i<sizeof(ft_boolean_syntax); i++) + { + /* limiting to 7-bit ascii only */ + if ((unsigned char)(str[i]) > 127 || + my_isalnum(default_charset_info, str[i])) + return 1; + for (j=0; j<i; j++) + if (str[i] == str[j] && (i != 11 || j != 10)) + return 1; + } + return 0; +} + +/* + RETURN VALUE + 0 - eof + 1 - word found + 2 - left bracket + 3 - right bracket + 4 - stopword found +*/ +uchar 
maria_ft_get_word(CHARSET_INFO *cs, uchar **start, uchar *end, + FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) +{ + uchar *doc=*start; + int ctype; + uint mwc, length, mbl; + + param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); + param->weight_adjust= param->wasign= 0; + param->type= FT_TOKEN_EOF; + + while (doc<end) + { + for (; doc < end; doc+= (mbl > 0 ? mbl : 1)) + { + mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (true_word_char(ctype, *doc)) + break; + if (*doc == FTB_RQUOT && param->quot) + { + param->quot=doc; + *start=doc+1; + param->type= FT_TOKEN_RIGHT_PAREN; + goto ret; + } + if (!param->quot) + { + if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) + { + /* param->prev=' '; */ + *start=doc+1; + if (*doc == FTB_LQUOT) param->quot=*start; + param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN); + goto ret; + } + if (param->prev == ' ') + { + if (*doc == FTB_YES ) { param->yesno=+1; continue; } else + if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else + if (*doc == FTB_NO ) { param->yesno=-1; continue; } else + if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else + if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else + if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; } + } + } + param->prev=*doc; + param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); + param->weight_adjust= param->wasign= 0; + } + + mwc=length=0; + for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? 
mbl : 1)) + { + mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (true_word_char(ctype, *doc)) + mwc=0; + else if (!misc_word_char(*doc) || mwc) + break; + else + mwc++; + } + param->prev='A'; /* be sure *prev is true_word_char */ + word->len= (uint)(doc-word->pos) - mwc; + if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) + doc++; + + if (((length >= ft_min_word_len && !is_stopword(word->pos, word->len)) + || param->trunc) && length < ft_max_word_len) + { + *start=doc; + param->type= FT_TOKEN_WORD; + goto ret; + } + else if (length) /* make sure length > 0 (if start contains spaces only) */ + { + *start= doc; + param->type= FT_TOKEN_STOPWORD; + goto ret; + } + } + if (param->quot) + { + param->quot=*start=doc; + param->type= 3; /* FT_RBR */ + goto ret; + } +ret: + return param->type; +} + +uchar maria_ft_simple_get_word(CHARSET_INFO *cs, uchar **start, + const uchar *end, FT_WORD *word, + my_bool skip_stopwords) +{ + uchar *doc= *start; + uint mwc, length, mbl; + int ctype; + DBUG_ENTER("maria_ft_simple_get_word"); + + do + { + for (;; doc+= (mbl > 0 ? mbl : 1)) + { + if (doc >= end) + DBUG_RETURN(0); + mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (true_word_char(ctype, *doc)) + break; + } + + mwc= length= 0; + for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? 
mbl : 1)) + { + mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (true_word_char(ctype, *doc)) + mwc= 0; + else if (!misc_word_char(*doc) || mwc) + break; + else + mwc++; + } + + word->len= (uint)(doc-word->pos) - mwc; + + if (skip_stopwords == FALSE || + (length >= ft_min_word_len && length < ft_max_word_len && + !is_stopword(word->pos, word->len))) + { + *start= doc; + DBUG_RETURN(1); + } + } while (doc < end); + DBUG_RETURN(0); +} + +void maria_ft_parse_init(TREE *wtree, CHARSET_INFO *cs) +{ + DBUG_ENTER("maria_ft_parse_init"); + if (!is_tree_inited(wtree)) + init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp,0,NULL, cs); + DBUG_VOID_RETURN; +} + + +static int maria_ft_add_word(MYSQL_FTPARSER_PARAM *param, + char *word, int word_len, + MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused))) +{ + TREE *wtree; + FT_WORD w; + MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; + DBUG_ENTER("maria_ft_add_word"); + wtree= ft_param->wtree; + if (param->flags & MYSQL_FTFLAGS_NEED_COPY) + { + uchar *ptr; + DBUG_ASSERT(wtree->with_delete == 0); + ptr= (uchar *)alloc_root(ft_param->mem_root, word_len); + memcpy(ptr, word, word_len); + w.pos= ptr; + } + else + w.pos= word; + w.len= word_len; + if (!tree_insert(wtree, &w, 0, wtree->custom_arg)) + { + delete_tree(wtree); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + +static int maria_ft_parse_internal(MYSQL_FTPARSER_PARAM *param, + char *doc_arg, int doc_len) +{ + uchar *doc= (uchar*) doc_arg; + uchar *end= doc + doc_len; + MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; + TREE *wtree= ft_param->wtree; + FT_WORD w; + DBUG_ENTER("maria_ft_parse_internal"); + + while (maria_ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE)) + if (param->mysql_add_word(param, w.pos, w.len, 0)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +int maria_ft_parse(TREE *wtree, uchar *doc, int doclen, + struct st_mysql_ftparser *parser, + MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root) +{ + 
MY_FT_PARSER_PARAM my_param; + DBUG_ENTER("maria_ft_parse"); + DBUG_ASSERT(parser); + my_param.wtree= wtree; + my_param.mem_root= mem_root; + + param->mysql_parse= maria_ft_parse_internal; + param->mysql_add_word= maria_ft_add_word; + param->mysql_ftparam= &my_param; + param->cs= wtree->custom_arg; + param->doc= doc; + param->length= doclen; + param->mode= MYSQL_FTPARSER_SIMPLE_MODE; + DBUG_RETURN(parser->parse(param)); +} + + +#define MAX_PARAM_NR 2 +MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info, + uint keynr, uint paramnr) +{ + uint32 ftparser_nr; + struct st_mysql_ftparser *parser; + if (! info->ftparser_param) + { + /* info->ftparser_param can not be zero after the initialization, + because it always includes built-in fulltext parser. And built-in + parser can be called even if the table has no fulltext indexes and + no varchar/text fields. */ + if (! info->s->ftparsers) + { + /* It's ok that modification to shared structure is done w/o mutex + locks, because all threads would set the same variables to the + same values. */ + uint i, j, keys= info->s->state.header.keys, ftparsers= 1; + for (i= 0; i < keys; i++) + { + MARIA_KEYDEF *keyinfo= &info->s->keyinfo[i]; + if (keyinfo->flag & HA_FULLTEXT) + { + for (j= 0;; j++) + { + if (j == i) + { + keyinfo->ftparser_nr= ftparsers++; + break; + } + if (info->s->keyinfo[j].flag & HA_FULLTEXT && + keyinfo->parser == info->s->keyinfo[j].parser) + { + keyinfo->ftparser_nr= info->s->keyinfo[j].ftparser_nr; + break; + } + } + } + } + info->s->ftparsers= ftparsers; + } + /* + We have to allocate two MYSQL_FTPARSER_PARAM structures per plugin + because in a boolean search a parser is called recursively + ftb_find_relevance* calls ftb_check_phrase* + (MAX_PARAM_NR=2) + */ + info->ftparser_param= (MYSQL_FTPARSER_PARAM *) + my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) * + info->s->ftparsers, MYF(MY_WME|MY_ZEROFILL)); + init_alloc_root(&info->ft_memroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0); + if (! 
info->ftparser_param) + return 0; + } + if (keynr == NO_SUCH_KEY) + { + ftparser_nr= 0; + parser= &ft_default_parser; + } + else + { + ftparser_nr= info->s->keyinfo[keynr].ftparser_nr; + parser= info->s->keyinfo[keynr].parser; + } + DBUG_ASSERT(paramnr < MAX_PARAM_NR); + ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr; + if (! info->ftparser_param[ftparser_nr].mysql_add_word) + { + /* Note, that mysql_add_word is used here as a flag: + mysql_add_word == 0 - parser is not initialized + mysql_add_word != 0 - parser is initialized, or no + initialization needed. */ + info->ftparser_param[ftparser_nr].mysql_add_word= (void *)1; + if (parser->init && parser->init(&info->ftparser_param[ftparser_nr])) + return 0; + } + return &info->ftparser_param[ftparser_nr]; +} + + +void maria_ftparser_call_deinitializer(MARIA_HA *info) +{ + uint i, j, keys= info->s->state.header.keys; + free_root(&info->ft_memroot, MYF(0)); + if (! info->ftparser_param) + return; + for (i= 0; i < keys; i++) + { + MARIA_KEYDEF *keyinfo= &info->s->keyinfo[i]; + for (j=0; j < MAX_PARAM_NR; j++) + { + MYSQL_FTPARSER_PARAM *ftparser_param= + &info->ftparser_param[keyinfo->ftparser_nr*MAX_PARAM_NR + j]; + if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word) + { + if (keyinfo->parser->deinit) + keyinfo->parser->deinit(ftparser_param); + ftparser_param->mysql_add_word= 0; + } + else + break; + } + } +} diff --git a/storage/maria/ma_ft_stem.c b/storage/maria/ma_ft_stem.c new file mode 100644 index 00000000000..06fc0b2df6c --- /dev/null +++ b/storage/maria/ma_ft_stem.c @@ -0,0 +1,18 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +/* multilingual stem */ diff --git a/storage/maria/ma_ft_test1.c b/storage/maria/ma_ft_test1.c new file mode 100644 index 00000000000..4c98e766234 --- /dev/null +++ b/storage/maria/ma_ft_test1.c @@ -0,0 +1,317 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A.
Golubchik, who has a shared copyright to this code + added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */ + +#include "ma_ftdefs.h" +#include "maria_ft_test1.h" +#include <my_getopt.h> + +static int key_field=FIELD_VARCHAR,extra_field=FIELD_SKIP_ENDSPACE; +static uint key_length=200,extra_length=50; +static int key_type=HA_KEYTYPE_TEXT; +static int verbose=0,silent=0,skip_update=0, + no_keys=0,no_stopwords=0,no_search=0,no_fulltext=0; +static int create_flag=0,error=0; + +#define MAX_REC_LENGTH 300 +static char record[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH]; + +static int run_test(const char *filename); +static void get_options(int argc, char *argv[]); +static void create_record(char *, int); +static void usage(); + +static struct my_option my_long_options[] = +{ + {"", 'v', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'v', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 's', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'N', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'K', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'F', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'U', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '#', "", 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +int main(int argc, char *argv[]) +{ + MY_INIT(argv[0]); + + get_options(argc,argv); + maria_init(); + + exit(run_test("FT1")); +} + +static MARIA_COLUMNDEF recinfo[3]; +static MARIA_KEYDEF keyinfo[2]; +static HA_KEYSEG keyseg[10]; + +static int run_test(const char *filename) +{ + MARIA_HA *file; + int i,j; + my_off_t pos; + + bzero((char*) recinfo,sizeof(recinfo)); + 
+ /* First define 2 columns */ + recinfo[0].type=extra_field; + recinfo[0].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : + extra_length); + if (extra_field == FIELD_VARCHAR) + recinfo[0].length+= HA_VARCHAR_PACKLENGTH(extra_length); + recinfo[1].type=key_field; + recinfo[1].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr : + key_length); + if (key_field == FIELD_VARCHAR) + recinfo[1].length+= HA_VARCHAR_PACKLENGTH(key_length); + + /* Define a key over the first column */ + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[0].seg[0].type= key_type; + keyinfo[0].seg[0].flag= (key_field == FIELD_BLOB) ? HA_BLOB_PART: + (key_field == FIELD_VARCHAR) ? HA_VAR_LENGTH_PART:0; + keyinfo[0].seg[0].start=recinfo[0].length; + keyinfo[0].seg[0].length=key_length; + keyinfo[0].seg[0].null_bit= 0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].language= default_charset_info->number; + keyinfo[0].flag = (no_fulltext?HA_PACK_KEY:HA_FULLTEXT); + + if (!silent) + printf("- Creating isam-file\n"); + if (maria_create(filename,(no_keys?0:1),keyinfo,2,recinfo,0,NULL, + (MARIA_CREATE_INFO*) 0, create_flag)) + goto err; + if (!(file=maria_open(filename,2,0))) + goto err; + + if (!silent) + printf("- %s stopwords\n",no_stopwords?"Skipping":"Initializing"); + maria_ft_init_stopwords(no_stopwords?NULL:maria_ft_precompiled_stopwords); + + if (!silent) + printf("- Writing key:s\n"); + + my_errno=0; + for (i=NUPD ; i<NDATAS; i++ ) + { + create_record(record,i); + error=maria_write(file,record); + if (verbose || error) + printf("I= %2d maria_write: %d errno: %d, record: %s\n", + i,error,my_errno,data[i].f0); + } + + if (!skip_update) + { + if (!silent) + printf("- Updating rows\n"); + + /* Read through all rows and update them */ + pos=(ha_rows) 0; + i=0; + while ((error=maria_rrnd(file,read_record,pos)) == 0) + { + create_record(record,NUPD-i-1); + if (maria_update(file,read_record,record)) + 
{ + printf("Can't update row: %.*s, error: %d\n", + keyinfo[0].seg[0].length,record,my_errno); + } + if(++i == NUPD) break; + pos=HA_OFFSET_ERROR; + } + if (i != NUPD) + printf("Found %d of %d rows\n", i,NUPD); + } + + if (maria_close(file)) goto err; + if(no_search) return 0; + if (!silent) + printf("- Reopening file\n"); + if (!(file=maria_open(filename,2,0))) goto err; + if (!silent) + printf("- Reading rows with key\n"); + for (i=0 ; i < NQUERIES ; i++) + { + FT_DOCLIST *result; + result=maria_ft_nlq_init_search(file,0,(char*) query[i],strlen(query[i]),1); + if(!result) + { + printf("Query %d: `%s' failed with errno %3d\n",i,query[i],my_errno); + continue; + } + printf("Query %d: `%s'. Found: %d. Top five documents:\n", + i,query[i],result->ndocs); + for (j=0;j<5;j++) + { + double w; int err; + err= maria_ft_nlq_read_next(result, read_record); + if (err==HA_ERR_END_OF_FILE) + { + printf("No more matches!\n"); + break; + } + else if (err) + { + printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno); + break; + } + w=maria_ft_nlq_get_relevance(result); + if (key_field == FIELD_VARCHAR) + { + uint l; + char *p; + p=recinfo[0].length+read_record; + l=uint2korr(p); + printf("%10.7f: %.*s\n",w,(int) l,p+2); + } + else + printf("%10.7f: %.*s\n",w,recinfo[1].length, + recinfo[0].length+read_record); + } + maria_ft_nlq_close_search(result); + } + + if (maria_close(file)) goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return (0); +err: + printf("got error: %3d when using maria-database\n",my_errno); + return 1; /* skip warning */ +} + +static char blob_key[MAX_REC_LENGTH]; +/* static char blob_record[MAX_REC_LENGTH+20*20]; */ + +void create_record(char *pos, int n) +{ + bzero((char*) pos,MAX_REC_LENGTH); + if (recinfo[0].type == FIELD_BLOB) + { + uint tmp; + char *ptr; + strnmov(blob_key,data[n].f0,keyinfo[0].seg[0].length); + tmp=strlen(blob_key); + int4store(pos,tmp); + ptr=blob_key; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + 
pos+=recinfo[0].length; + } + else if (recinfo[0].type == FIELD_VARCHAR) + { + uint tmp; + /* -1 is here because pack_length is stored in seg->length */ + uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1); + strnmov(pos+pack_length,data[n].f0,keyinfo[0].seg[0].length); + tmp=strlen(pos+pack_length); + if (pack_length == 1) + *pos= (char) tmp; + else + int2store(pos,tmp); + pos+=recinfo[0].length; + } + else + { + strnmov(pos,data[n].f0,keyinfo[0].seg[0].length); + pos+=recinfo[0].length; + } + if (recinfo[1].type == FIELD_BLOB) + { + uint tmp; + char *ptr; + strnmov(blob_key,data[n].f2,keyinfo[0].seg[0].length); + tmp=strlen(blob_key); + int4store(pos,tmp); + ptr=blob_key; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + pos+=recinfo[1].length; + } + else if (recinfo[1].type == FIELD_VARCHAR) + { + uint tmp; + /* -1 is here because pack_length is stored in seg->length */ + uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1); + strnmov(pos+pack_length,data[n].f2,keyinfo[0].seg[0].length); + tmp=strlen(pos+1); + if (pack_length == 1) + *pos= (char) tmp; + else + int2store(pos,tmp); + pos+=recinfo[1].length; + } + else + { + strnmov(pos,data[n].f2,keyinfo[0].seg[0].length); + pos+=recinfo[1].length; + } +} + + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument) +{ + switch(optid) { + case 'v': verbose=1; break; + case 's': silent=1; break; + case 'F': no_fulltext=1; no_search=1; + case 'U': skip_update=1; break; + case 'K': no_keys=no_search=1; break; + case 'N': no_search=1; break; + case 'S': no_stopwords=1; break; + case '#': + DBUG_PUSH (argument); + break; + case 'V': + case '?': + case 'h': + usage(); + exit(1); + } + return 0; +} + +/* Read options */ + +static void get_options(int argc,char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(ho_error); + return; +} /* get options */ + + +static void usage() 
+{ + printf("%s [options]\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} diff --git a/storage/maria/ma_ft_test1.h b/storage/maria/ma_ft_test1.h new file mode 100644 index 00000000000..5883c42f5c5 --- /dev/null +++ b/storage/maria/ma_ft_test1.h @@ -0,0 +1,420 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +#define NUPD 20 +#define NDATAS 389 +struct { const char *f0, *f2; } data[NDATAS] = { + {"1", "General Information about MySQL"}, + {"1.1", "What is MySQL?"}, + {"1.2", "About this manual"}, + {"1.3", "History of MySQL"}, + {"1.4", "The main features of MySQL"}, + {"1.5", "General SQL information and tutorials"}, + {"1.6", "Useful MySQL-related links"}, + {"1.7", "What are stored procedures and triggers and so on?"}, + {"2", "MySQL mailing lists and how to ask questions/give error (bug) reports"}, + {"2.1", "Subscribing to/un-subscribing from the MySQL mailing list"}, + {"2.2", "Asking questions or reporting bugs"}, + {"2.3", "I think I have found a bug. 
What information do you need to help me?"}, + {"2.3.1", "MySQL keeps crashing"}, + {"2.4", "Guidelines for answering questions on the mailing list"}, + {"3", "Licensing or When do I have/want to pay for MySQL?"}, + {"3.1", "How much does MySQL cost?"}, + {"3.2", "How do I get commercial support?"}, + {"3.2.1", "Types of commercial support"}, + {"3.2.1.1", "Basic email support"}, + {"3.2.1.2", "Extended email support"}, +/*------------------------------- NUPD=20 -------------------------------*/ + {"3.2.1.3", "Asking: Login support"}, + {"3.2.1.4", "Extended login support"}, + {"3.3", "How do I pay for licenses/support?"}, + {"3.4", "Who do I contact when I want more information about licensing/support?"}, + {"3.5", "What Copyright does MySQL use?"}, + {"3.6", "When may I distribute MySQL commercially without a fee?"}, + {"3.7", "I want to sell a product that can be configured to use MySQL"}, + {"3.8", "I am running a commercial web server using MySQL"}, + {"3.9", "Do I need a license to sell commercial Perl/tcl/PHP/Web+ etc applications?"}, + {"3.10", "Possible future changes in the licensing"}, + {"4", "Compiling and installing MySQL"}, + {"4.1", "How do I get MySQL?"}, + {"4.2", "Which MySQL version should I use?"}, + {"4.3", "How/when will you release updates?"}, + {"4.4", "What operating systems does MySQL support?"}, + {"4.5", "Compiling MySQL from source code"}, + {"4.5.1", "Quick installation overview"}, + {"4.5.2", "Usual configure switches"}, + {"4.5.3", "Applying a patch"}, + {"4.6", "Problems compiling?"}, + {"4.7", "General compilation notes"}, + {"4.8", "MIT-pthreads notes (FreeBSD)"}, + {"4.9", "Perl installation comments"}, + {"4.10", "Special things to consider for some machine/OS combinations"}, + {"4.10.1", "Solaris notes"}, + {"4.10.2", "SunOS 4 notes"}, + {"4.10.3", "Linux notes for all versions"}, + {"4.10.3.1", "Linux-x86 notes"}, + {"4.10.3.2", "RedHat 5.0"}, + {"4.10.3.3", "RedHat 5.1"}, + {"4.10.3.4", "Linux-Sparc notes"}, + {"4.10.3.5", 
"Linux-Alpha notes"}, + {"4.10.3.6", "MkLinux notes"}, + {"4.10.4", "Alpha-DEC-Unix notes"}, + {"4.10.5", "Alpha-DEC-OSF1 notes"}, + {"4.10.6", "SGI-IRIX notes"}, + {"4.10.7", "FreeBSD notes"}, + {"4.10.7.1", "FreeBSD-3.0 notes"}, + {"4.10.8", "BSD/OS 2.# notes"}, + {"4.10.8.1", "BSD/OS 3.# notes"}, + {"4.10.9", "SCO notes"}, + {"4.10.10", "SCO Unixware 7.0 notes"}, + {"4.10.11", "IBM-AIX notes"}, + {"4.10.12", "HP-UX notes"}, + {"4.11", "TcX binaries"}, + {"4.12", "Win32 notes"}, + {"4.13", "Installation instructions for MySQL binary releases"}, + {"4.13.1", "How to get MySQL Perl support working"}, + {"4.13.2", "Linux notes"}, + {"4.13.3", "HP-UX notes"}, + {"4.13.4", "Linking client libraries"}, + {"4.14", "Problems running mysql_install_db"}, + {"4.15", "Problems starting MySQL"}, + {"4.16", "Automatic start/stop of MySQL"}, + {"4.17", "Option files"}, + {"5", "How standards-compatible is MySQL?"}, + {"5.1", "What extensions has MySQL to ANSI SQL92?"}, + {"5.2", "What functionality is missing in MySQL?"}, + {"5.2.1", "Sub-selects"}, + {"5.2.2", "SELECT INTO TABLE"}, + {"5.2.3", "Transactions"}, + {"5.2.4", "Triggers"}, + {"5.2.5", "Foreign Keys"}, + {"5.2.5.1", "Some reasons NOT to use FOREIGN KEYS"}, + {"5.2.6", "Views"}, + {"5.2.7", "-- as start of a comment"}, + {"5.3", "What standards does MySQL follow?"}, + {"5.4", "What functions exist only for compatibility?"}, + {"5.5", "Limitations of BLOB and TEXT types"}, + {"5.6", "How to cope without COMMIT-ROLLBACK"}, + {"6", "The MySQL access privilege system"}, + {"6.1", "What the privilege system does"}, + {"6.2", "Connecting to the MySQL server"}, + {"6.2.1", "Keeping your password secure"}, + {"6.3", "Privileges provided by MySQL"}, + {"6.4", "How the privilege system works"}, + {"6.5", "The privilege tables"}, + {"6.6", "Setting up the initial MySQL privileges"}, + {"6.7", "Adding new user privileges to MySQL"}, + {"6.8", "An example permission setup"}, + {"6.9", "Causes of Access denied errors"}, + {"6.10", 
"How to make MySQL secure against crackers"}, + {"7", "MySQL language reference"}, + {"7.1", "Literals: how to write strings and numbers"}, + {"7.1.1", "Strings"}, + {"7.1.2", "Numbers"}, + {"7.1.3", "NULL values"}, + {"7.1.4", "Database, table, index, column and alias names"}, + {"7.1.4.1", "Case sensitivity in names"}, + {"7.2", "Column types"}, + {"7.2.1", "Column type storage requirements"}, + {"7.2.5", "Numeric types"}, + {"7.2.6", "Date and time types"}, + {"7.2.6.1", "The DATE type"}, + {"7.2.6.2", "The TIME type"}, + {"7.2.6.3", "The DATETIME type"}, + {"7.2.6.4", "The TIMESTAMP type"}, + {"7.2.6.5", "The YEAR type"}, + {"7.2.6.6", "Miscellaneous date and time properties"}, + {"7.2.7", "String types"}, + {"7.2.7.1", "The CHAR and VARCHAR types"}, + {"7.2.7.2", "The BLOB and TEXT types"}, + {"7.2.7.3", "The ENUM type"}, + {"7.2.7.4", "The SET type"}, + {"7.2.8", "Choosing the right type for a column"}, + {"7.2.9", "Column indexes"}, + {"7.2.10", "Multiple-column indexes"}, + {"7.2.11", "Using column types from other database engines"}, + {"7.3", "Functions for use in SELECT and WHERE clauses"}, + {"7.3.1", "Grouping functions"}, + {"7.3.2", "Normal arithmetic operations"}, + {"7.3.3", "Bit functions"}, + {"7.3.4", "Logical operations"}, + {"7.3.5", "Comparison operators"}, + {"7.3.6", "String comparison functions"}, + {"7.3.7", "Control flow functions"}, + {"7.3.8", "Mathematical functions"}, + {"7.3.9", "String functions"}, + {"7.3.10", "Date and time functions"}, + {"7.3.11", "Miscellaneous functions"}, + {"7.3.12", "Functions for use with GROUP BY clauses"}, + {"7.4", "CREATE DATABASE syntax"}, + {"7.5", "DROP DATABASE syntax"}, + {"7.6", "CREATE TABLE syntax"}, + {"7.7", "ALTER TABLE syntax"}, + {"7.8", "OPTIMIZE TABLE syntax"}, + {"7.9", "DROP TABLE syntax"}, + {"7.10", "DELETE syntax"}, + {"7.11", "SELECT syntax"}, + {"7.12", "JOIN syntax"}, + {"7.13", "INSERT syntax"}, + {"7.14", "REPLACE syntax"}, + {"7.15", "LOAD DATA INFILE syntax"}, + {"7.16", 
"UPDATE syntax"}, + {"7.17", "USE syntax"}, + {"7.18", "SHOW syntax (Get information about tables, columns...)"}, + {"7.19", "EXPLAIN syntax (Get information about a SELECT)"}, + {"7.20", "DESCRIBE syntax (Get information about columns)"}, + {"7.21", "LOCK TABLES/UNLOCK TABLES syntax"}, + {"7.22", "SET OPTION syntax"}, + {"7.23", "GRANT syntax (Compatibility function)"}, + {"7.24", "CREATE INDEX syntax (Compatibility function)"}, + {"7.25", "DROP INDEX syntax (Compatibility function)"}, + {"7.26", "Comment syntax"}, + {"7.27", "CREATE FUNCTION/DROP FUNCTION syntax"}, + {"7.28", "Is MySQL picky about reserved words?"}, + {"8", "Example SQL queries"}, + {"8.1", "Queries from twin project"}, + {"8.1.1", "Find all non-distributed twins"}, + {"8.1.2", "Show a table on twin pair status"}, + {"9", "How safe/stable is MySQL?"}, + {"9.1", "How stable is MySQL?"}, + {"9.2", "Why are there is so many releases of MySQL?"}, + {"9.3", "Checking a table for errors"}, + {"9.4", "How to repair tables"}, + {"9.5", "Is there anything special to do when upgrading/downgrading MySQL?"}, + {"9.5.1", "Upgrading from a 3.21 version to 3.22"}, + {"9.5.2", "Upgrading from a 3.20 version to 3.21"}, + {"9.5.3", "Upgrading to another architecture"}, + {"9.6", "Year 2000 compliance"}, + {"10", "MySQL Server functions"}, + {"10.1", "What languages are supported by MySQL?"}, + {"10.1.1", "Character set used for data & sorting"}, + {"10.2", "The update log"}, + {"10.3", "How big can MySQL tables be?"}, + {"11", "Getting maximum performance from MySQL"}, + {"11.1", "How does one change the size of MySQL buffers?"}, + {"11.2", "How compiling and linking affects the speed of MySQL"}, + {"11.3", "How does MySQL use memory?"}, + {"11.4", "How does MySQL use indexes?"}, + {"11.5", "What optimizations are done on WHERE clauses?"}, + {"11.6", "How does MySQL open & close tables?"}, + {"11.6.0.1", "What are the drawbacks of creating possibly thousands of tables in a database?"}, + {"11.7", "How does MySQL 
lock tables?"}, + {"11.8", "How should I arrange my table to be as fast/small as possible?"}, + {"11.9", "What affects the speed of INSERT statements?"}, + {"11.10", "What affects the speed DELETE statements?"}, + {"11.11", "How do I get MySQL to run at full speed?"}, + {"11.12", "What are the different row formats? Or, when should VARCHAR/CHAR be used?"}, + {"11.13", "Why so many open tables?"}, + {"12", "MySQL benchmark suite"}, + {"13", "MySQL Utilites"}, + {"13.1", "Overview of the different MySQL programs"}, + {"13.2", "The MySQL table check, optimize and repair program"}, + {"13.2.1", "isamchk memory use"}, + {"13.2.2", "Getting low-level table information"}, + {"13.3", "The MySQL compressed read-only table generator"}, + {"14", "Adding new functions to MySQL"}, + {"15", "MySQL ODBC Support"}, + {"15.1", "Operating systems supported by MyODBC"}, + {"15.2", "How to report problems with MyODBC"}, + {"15.3", "Programs known to work with MyODBC"}, + {"15.4", "How to fill in the various fields in the ODBC administrator program"}, + {"15.5", "How to get the value of an AUTO_INCREMENT column in ODBC"}, + {"16", "Problems and common errors"}, + {"16.1", "Some common errors when using MySQL"}, + {"16.1.1", "MySQL server has gone away error"}, + {"16.1.2", "Can't connect to local MySQL server error"}, + {"16.1.3", "Out of memory error"}, + {"16.1.4", "Packet too large error"}, + {"16.1.5", "The table is full error"}, + {"16.1.6", "Commands out of sync error in client"}, + {"16.1.7", "Removing user error"}, + {"16.2", "How MySQL handles a full disk"}, + {"16.3", "How to run SQL commands from a text file"}, + {"16.4", "Where MySQL stores temporary files"}, + {"16.5", "Access denied error"}, + {"16.6", "How to run MySQL as a normal user"}, + {"16.7", "Problems with file permissions"}, + {"16.8", "File not found"}, + {"16.9", "Problems using DATE columns"}, + {"16.10", "Case sensitivity in searches"}, + {"16.11", "Problems with NULL values"}, + {"17", "Solving some common 
problems with MySQL"}, + {"17.1", "Database replication"}, + {"17.2", "Database backups"}, + {"18", "MySQL client tools and API's"}, + {"18.1", "MySQL C API"}, + {"18.2", "C API datatypes"}, + {"18.3", "C API function overview"}, + {"18.4", "C API function descriptions"}, + {"18.4.1", "mysql_affected_rows()"}, + {"18.4.2", "mysql_close()"}, + {"18.4.3", "mysql_connect()"}, + {"18.4.4", "mysql_create_db()"}, + {"18.4.5", "mysql_data_seek()"}, + {"18.4.6", "mysql_debug()"}, + {"18.4.7", "mysql_drop_db()"}, + {"18.4.8", "mysql_dump_debug_info()"}, + {"18.4.9", "mysql_eof()"}, + {"18.4.10", "mysql_errno()"}, + {"18.4.11", "mysql_error()"}, + {"18.4.12", "mysql_escape_string()"}, + {"18.4.13", "mysql_fetch_field()"}, + {"18.4.14", "mysql_fetch_fields()"}, + {"18.4.15", "mysql_fetch_field_direct()"}, + {"18.4.16", "mysql_fetch_lengths()"}, + {"18.4.17", "mysql_fetch_row()"}, + {"18.4.18", "mysql_field_seek()"}, + {"18.4.19", "mysql_field_tell()"}, + {"18.4.20", "mysql_free_result()"}, + {"18.4.21", "mysql_get_client_info()"}, + {"18.4.22", "mysql_get_host_info()"}, + {"18.4.23", "mysql_get_proto_info()"}, + {"18.4.24", "mysql_get_server_info()"}, + {"18.4.25", "mysql_info()"}, + {"18.4.26", "mysql_init()"}, + {"18.4.27", "mysql_insert_id()"}, + {"18.4.28", "mysql_kill()"}, + {"18.4.29", "mysql_list_dbs()"}, + {"18.4.30", "mysql_list_fields()"}, + {"18.4.31", "mysql_list_processes()"}, + {"18.4.32", "mysql_list_tables()"}, + {"18.4.33", "mysql_num_fields()"}, + {"18.4.34", "mysql_num_rows()"}, + {"18.4.35", "mysql_query()"}, + {"18.4.36", "mysql_real_connect()"}, + {"18.4.37", "mysql_real_query()"}, + {"18.4.38", "mysql_reload()"}, + {"18.4.39", "mysql_row_tell()"}, + {"18.4.40", "mysql_select_db()"}, + {"18.4.41", "mysql_shutdown()"}, + {"18.4.42", "mysql_stat()"}, + {"18.4.43", "mysql_store_result()"}, + {"18.4.44", "mysql_thread_id()"}, + {"18.4.45", "mysql_use_result()"}, + {"18.4.46", "Why is it that after mysql_query() returns success, mysql_store_result() sometimes 
returns NULL?"}, + {"18.4.47", "What results can I get from a query?"}, + {"18.4.48", "How can I get the unique ID for the last inserted row?"}, + {"18.4.49", "Problems linking with the C API"}, + {"18.4.50", "How to make a thread-safe client"}, + {"18.5", "MySQL Perl API's"}, + {"18.5.1", "DBI with DBD::mysql"}, + {"18.5.1.1", "The DBI interface"}, + {"18.5.1.2", "More DBI/DBD information"}, + {"18.6", "MySQL Java connectivity (JDBC)"}, + {"18.7", "MySQL PHP API's"}, + {"18.8", "MySQL C++ API's"}, + {"18.9", "MySQL Python API's"}, + {"18.10", "MySQL TCL API's"}, + {"19", "How MySQL compares to other databases"}, + {"19.1", "How MySQL compares to mSQL"}, + {"19.1.1", "How to convert mSQL tools for MySQL"}, + {"19.1.2", "How mSQL and MySQL client/server communications protocols differ"}, + {"19.1.3", "How mSQL 2.0 SQL syntax differs from MySQL"}, + {"19.2", "How MySQL compares to PostgreSQL"}, + {"A", "Some users of MySQL"}, + {"B", "Contributed programs"}, + {"C", "Contributors to MySQL"}, + {"D", "MySQL change history"}, + {"19.3", "Changes in release 3.22.x (Alpha version)"}, + {"19.3.1", "Changes in release 3.22.7"}, + {"19.3.2", "Changes in release 3.22.6"}, + {"19.3.3", "Changes in release 3.22.5"}, + {"19.3.4", "Changes in release 3.22.4"}, + {"19.3.5", "Changes in release 3.22.3"}, + {"19.3.6", "Changes in release 3.22.2"}, + {"19.3.7", "Changes in release 3.22.1"}, + {"19.3.8", "Changes in release 3.22.0"}, + {"19.4", "Changes in release 3.21.x"}, + {"19.4.1", "Changes in release 3.21.33"}, + {"19.4.2", "Changes in release 3.21.32"}, + {"19.4.3", "Changes in release 3.21.31"}, + {"19.4.4", "Changes in release 3.21.30"}, + {"19.4.5", "Changes in release 3.21.29"}, + {"19.4.6", "Changes in release 3.21.28"}, + {"19.4.7", "Changes in release 3.21.27"}, + {"19.4.8", "Changes in release 3.21.26"}, + {"19.4.9", "Changes in release 3.21.25"}, + {"19.4.10", "Changes in release 3.21.24"}, + {"19.4.11", "Changes in release 3.21.23"}, + {"19.4.12", "Changes in release 
3.21.22"}, + {"19.4.13", "Changes in release 3.21.21a"}, + {"19.4.14", "Changes in release 3.21.21"}, + {"19.4.15", "Changes in release 3.21.20"}, + {"19.4.16", "Changes in release 3.21.19"}, + {"19.4.17", "Changes in release 3.21.18"}, + {"19.4.18", "Changes in release 3.21.17"}, + {"19.4.19", "Changes in release 3.21.16"}, + {"19.4.20", "Changes in release 3.21.15"}, + {"19.4.21", "Changes in release 3.21.14b"}, + {"19.4.22", "Changes in release 3.21.14a"}, + {"19.4.23", "Changes in release 3.21.13"}, + {"19.4.24", "Changes in release 3.21.12"}, + {"19.4.25", "Changes in release 3.21.11"}, + {"19.4.26", "Changes in release 3.21.10"}, + {"19.4.27", "Changes in release 3.21.9"}, + {"19.4.28", "Changes in release 3.21.8"}, + {"19.4.29", "Changes in release 3.21.7"}, + {"19.4.30", "Changes in release 3.21.6"}, + {"19.4.31", "Changes in release 3.21.5"}, + {"19.4.32", "Changes in release 3.21.4"}, + {"19.4.33", "Changes in release 3.21.3"}, + {"19.4.34", "Changes in release 3.21.2"}, + {"19.4.35", "Changes in release 3.21.0"}, + {"19.5", "Changes in release 3.20.x"}, + {"19.5.1", "Changes in release 3.20.18"}, + {"19.5.2", "Changes in release 3.20.17"}, + {"19.5.3", "Changes in release 3.20.16"}, + {"19.5.4", "Changes in release 3.20.15"}, + {"19.5.5", "Changes in release 3.20.14"}, + {"19.5.6", "Changes in release 3.20.13"}, + {"19.5.7", "Changes in release 3.20.11"}, + {"19.5.8", "Changes in release 3.20.10"}, + {"19.5.9", "Changes in release 3.20.9"}, + {"19.5.10", "Changes in release 3.20.8"}, + {"19.5.11", "Changes in release 3.20.7"}, + {"19.5.12", "Changes in release 3.20.6"}, + {"19.5.13", "Changes in release 3.20.3"}, + {"19.5.14", "Changes in release 3.20.0"}, + {"19.6", "Changes in release 3.19.x"}, + {"19.6.1", "Changes in release 3.19.5"}, + {"19.6.2", "Changes in release 3.19.4"}, + {"19.6.3", "Changes in release 3.19.3"}, + {"E", "Known errors and design deficiencies in MySQL"}, + {"F", "List of things we want to add to MySQL in the future (The TODO)"}, 
+ {"19.7", "Things that must done in the real near future"}, + {"19.8", "Things that have to be done sometime"}, + {"19.9", "Some things we don't have any plans to do"}, + {"G", "Comments on porting to other systems"}, + {"19.10", "Debugging MySQL"}, + {"19.11", "Comments about RTS threads"}, + {"19.12", "What is the difference between different thread packages?"}, + {"H", "Description of MySQL regular expression syntax"}, + {"I", "What is Unireg?"}, + {"J", "The MySQL server license"}, + {"K", "The MySQL license for Microsoft operating systems"}, + {"*", "SQL command, type and function index"}, + {"*", "Concept Index"} +}; + +#define NQUERIES 5 +const char *query[NQUERIES]={ + "mysql information and manual", + "upgrading from previous version", + "column indexes", + "against about after more right the with/without", /* stopwords test */ + "mysql license and copyright" +}; diff --git a/storage/maria/ma_ft_update.c b/storage/maria/ma_ft_update.c new file mode 100644 index 00000000000..f36147ccde2 --- /dev/null +++ b/storage/maria/ma_ft_update.c @@ -0,0 +1,352 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +/* functions to work with full-text indices */ + +#include "ma_ftdefs.h" +#include <math.h> + +void _ma_ft_segiterator_init(MARIA_HA *info, uint keynr, const uchar *record, + FT_SEG_ITERATOR *ftsi) +{ + DBUG_ENTER("_ma_ft_segiterator_init"); + + ftsi->num=info->s->keyinfo[keynr].keysegs; + ftsi->seg=info->s->keyinfo[keynr].seg; + ftsi->rec=record; + DBUG_VOID_RETURN; +} + +void _ma_ft_segiterator_dummy_init(const uchar *record, uint len, + FT_SEG_ITERATOR *ftsi) +{ + DBUG_ENTER("_ma_ft_segiterator_dummy_init"); + + ftsi->num=1; + ftsi->seg=0; + ftsi->pos=record; + ftsi->len=len; + DBUG_VOID_RETURN; +} + +/* + This function breaks convention "return 0 in success" + but it's easier to use like this + + while(_ma_ft_segiterator()) + + so "1" means "OK", "0" means "EOF" +*/ + +uint _ma_ft_segiterator(register FT_SEG_ITERATOR *ftsi) +{ + DBUG_ENTER("_ma_ft_segiterator"); + + if (!ftsi->num) + DBUG_RETURN(0); + + ftsi->num--; + if (!ftsi->seg) + DBUG_RETURN(1); + + ftsi->seg--; + + if (ftsi->seg->null_bit && + (ftsi->rec[ftsi->seg->null_pos] & ftsi->seg->null_bit)) + { + ftsi->pos=0; + DBUG_RETURN(1); + } + ftsi->pos= ftsi->rec+ftsi->seg->start; + if (ftsi->seg->flag & HA_VAR_LENGTH_PART) + { + uint pack_length= (ftsi->seg->bit_start); + ftsi->len= (pack_length == 1 ? (uint) *(uchar*) ftsi->pos : + uint2korr(ftsi->pos)); + ftsi->pos+= pack_length; /* Skip VARCHAR length */ + DBUG_RETURN(1); + } + if (ftsi->seg->flag & HA_BLOB_PART) + { + ftsi->len= _ma_calc_blob_length(ftsi->seg->bit_start,ftsi->pos); + memcpy_fixed((char*) &ftsi->pos, ftsi->pos+ftsi->seg->bit_start, + sizeof(char*)); + DBUG_RETURN(1); + } + ftsi->len=ftsi->seg->length; + DBUG_RETURN(1); +} + + +/* parses a document i.e. 
calls maria_ft_parse for every keyseg */ + +uint _ma_ft_parse(TREE *parsed, MARIA_HA *info, uint keynr, const uchar *record, + MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root) +{ + FT_SEG_ITERATOR ftsi; + struct st_mysql_ftparser *parser; + DBUG_ENTER("_ma_ft_parse"); + + _ma_ft_segiterator_init(info, keynr, record, &ftsi); + + maria_ft_parse_init(parsed, info->s->keyinfo[keynr].seg->charset); + parser= info->s->keyinfo[keynr].parser; + while (_ma_ft_segiterator(&ftsi)) + { + if (ftsi.pos) + if (maria_ft_parse(parsed, (uchar *)ftsi.pos, ftsi.len, parser, param, + mem_root)) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + +FT_WORD * _ma_ft_parserecord(MARIA_HA *info, uint keynr, const uchar *record, + MEM_ROOT *mem_root) +{ + TREE ptree; + MYSQL_FTPARSER_PARAM *param; + DBUG_ENTER("_ma_ft_parserecord"); + if (! (param= maria_ftparser_call_initializer(info, keynr, 0))) + DBUG_RETURN(NULL); + bzero((char*) &ptree, sizeof(ptree)); + param->flags= 0; + if (_ma_ft_parse(&ptree, info, keynr, record, param, mem_root)) + DBUG_RETURN(NULL); + + DBUG_RETURN(maria_ft_linearize(&ptree, mem_root)); +} + +static int _ma_ft_store(MARIA_HA *info, uint keynr, uchar *keybuf, + FT_WORD *wlist, my_off_t filepos) +{ + uint key_length; + DBUG_ENTER("_ma_ft_store"); + + for (; wlist->pos; wlist++) + { + key_length= _ma_ft_make_key(info,keynr,keybuf,wlist,filepos); + if (_ma_ck_write(info, keynr, keybuf, key_length)) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + +static int _ma_ft_erase(MARIA_HA *info, uint keynr, uchar *keybuf, + FT_WORD *wlist, my_off_t filepos) +{ + uint key_length, err=0; + DBUG_ENTER("_ma_ft_erase"); + + for (; wlist->pos; wlist++) + { + key_length= _ma_ft_make_key(info,keynr,keybuf,wlist,filepos); + if (_ma_ck_delete(info, keynr, keybuf, key_length)) + err=1; + } + DBUG_RETURN(err); +} + +/* + Compares an appropriate parts of two WORD_KEY keys directly out of records + returns 1 if they are different +*/ + +#define THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT 1 +#define 
GEE_THEY_ARE_ABSOLUTELY_IDENTICAL 0 + +int _ma_ft_cmp(MARIA_HA *info, uint keynr, const uchar *rec1, const uchar *rec2) +{ + FT_SEG_ITERATOR ftsi1, ftsi2; + CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset; + DBUG_ENTER("_ma_ft_cmp"); + + _ma_ft_segiterator_init(info, keynr, rec1, &ftsi1); + _ma_ft_segiterator_init(info, keynr, rec2, &ftsi2); + + while (_ma_ft_segiterator(&ftsi1) && _ma_ft_segiterator(&ftsi2)) + { + if ((ftsi1.pos != ftsi2.pos) && + (!ftsi1.pos || !ftsi2.pos || + ha_compare_text(cs, (uchar*) ftsi1.pos,ftsi1.len, + (uchar*) ftsi2.pos,ftsi2.len,0,0))) + DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT); + } + DBUG_RETURN(GEE_THEY_ARE_ABSOLUTELY_IDENTICAL); +} + + +/* update a document entry */ + +int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf, + const uchar *oldrec, const uchar *newrec, my_off_t pos) +{ + int error= -1; + FT_WORD *oldlist,*newlist, *old_word, *new_word; + CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset; + uint key_length; + int cmp, cmp2; + DBUG_ENTER("_ma_ft_update"); + + if (!(old_word=oldlist=_ma_ft_parserecord(info, keynr, oldrec, + &info->ft_memroot)) || + !(new_word=newlist=_ma_ft_parserecord(info, keynr, newrec, + &info->ft_memroot))) + goto err; + + error=0; + while(old_word->pos && new_word->pos) + { + cmp= ha_compare_text(cs, (uchar*) old_word->pos,old_word->len, + (uchar*) new_word->pos,new_word->len,0,0); + cmp2= cmp ? 
0 : (fabs(old_word->weight - new_word->weight) > 1.e-5); + + if (cmp < 0 || cmp2) + { + key_length= _ma_ft_make_key(info,keynr,keybuf,old_word,pos); + if ((error= _ma_ck_delete(info,keynr, keybuf,key_length))) + goto err; + } + if (cmp > 0 || cmp2) + { + key_length= _ma_ft_make_key(info, keynr, keybuf, new_word,pos); + if ((error= _ma_ck_write(info, keynr, keybuf,key_length))) + goto err; + } + if (cmp<=0) old_word++; + if (cmp>=0) new_word++; + } + if (old_word->pos) + error= _ma_ft_erase(info,keynr,keybuf,old_word,pos); + else if (new_word->pos) + error= _ma_ft_store(info,keynr,keybuf,new_word,pos); + +err: + free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE)); + DBUG_RETURN(error); +} + + +/* adds a document to the collection */ + +int _ma_ft_add(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record, + my_off_t pos) +{ + int error= -1; + FT_WORD *wlist; + DBUG_ENTER("_ma_ft_add"); + DBUG_PRINT("enter",("keynr: %d",keynr)); + + if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot))) + error= _ma_ft_store(info,keynr,keybuf,wlist,pos); + free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE)); + DBUG_PRINT("exit",("Return: %d",error)); + DBUG_RETURN(error); +} + + +/* removes a document from the collection */ + +int _ma_ft_del(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record, + my_off_t pos) +{ + int error= -1; + FT_WORD *wlist; + DBUG_ENTER("_ma_ft_del"); + DBUG_PRINT("enter",("keynr: %d",keynr)); + + if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot))) + error= _ma_ft_erase(info,keynr,keybuf,wlist,pos); + free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE)); + DBUG_PRINT("exit",("Return: %d",error)); + DBUG_RETURN(error); +} + + +uint _ma_ft_make_key(MARIA_HA *info, uint keynr, uchar *keybuf, FT_WORD *wptr, + my_off_t filepos) +{ + uchar buf[HA_FT_MAXBYTELEN+16]; + DBUG_ENTER("_ma_ft_make_key"); + +#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT + { + float weight=(float) ((filepos==HA_OFFSET_ERROR) ? 
0 : wptr->weight); + mi_float4store(buf,weight); + } +#else +#error +#endif + + int2store(buf+HA_FT_WLEN,wptr->len); + memcpy(buf+HA_FT_WLEN+2,wptr->pos,wptr->len); + DBUG_RETURN(_ma_make_key(info, keynr, keybuf, buf, filepos)); +} + + +/* + convert key value to ft2 +*/ + +uint _ma_ft_convert_to_ft2(MARIA_HA *info, uint keynr, uchar *key) +{ + my_off_t root; + DYNAMIC_ARRAY *da=info->ft1_to_ft2; + MARIA_KEYDEF *keyinfo=&info->s->ft2_keyinfo; + uchar *key_ptr= (uchar*) dynamic_array_ptr(da, 0), *end; + uint length, key_length; + DBUG_ENTER("_ma_ft_convert_to_ft2"); + + /* we'll generate one pageful at once, and insert the rest one-by-one */ + /* calculating the length of this page ...*/ + length=(keyinfo->block_length-2) / keyinfo->keylength; + set_if_smaller(length, da->elements); + length=length * keyinfo->keylength; + + get_key_full_length_rdonly(key_length, key); + while (_ma_ck_delete(info, keynr, key, key_length) == 0) + { + /* + nothing to do here. + _ma_ck_delete() will populate info->ft1_to_ft2 with deleted keys + */ + } + + /* creating pageful of keys */ + maria_putint(info->buff,length+2,0); + memcpy(info->buff+2, key_ptr, length); + info->keyread_buff_used=info->page_changed=1; /* info->buff is used */ + if ((root= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR || + _ma_write_keypage(info,keyinfo,root,DFLT_INIT_HITS,info->buff)) + DBUG_RETURN(-1); + + /* inserting the rest of key values */ + end= (uchar*) dynamic_array_ptr(da, da->elements); + for (key_ptr+=length; key_ptr < end; key_ptr+=keyinfo->keylength) + if(_ma_ck_real_write_btree(info, keyinfo, key_ptr, 0, &root, SEARCH_SAME)) + DBUG_RETURN(-1); + + /* now, writing the word key entry */ + ft_intXstore(key+key_length, - (int) da->elements); + _ma_dpointer(info, key+key_length+HA_FT_WLEN, root); + + DBUG_RETURN(_ma_ck_real_write_btree(info, + info->s->keyinfo+keynr, + key, 0, + &info->s->state.key_root[keynr], + SEARCH_SAME)); +} diff --git a/storage/maria/ma_ftdefs.h 
/*
  Definitions shared by the Maria full-text code: word-character
  macros, term-weighting formulas, boolean-search operator positions,
  the FT_WORD / FT_SEG_ITERATOR types and internal prototypes.
*/

#include "ma_fulltext.h"
#include <m_ctype.h>
#include <my_tree.h>
#include <queues.h>
#include <mysql/plugin.h>

/*
  Non-zero when 'ctype' has any of the _MY_U, _MY_L or _MY_NMR bits set,
  or when 'character' is an underscore.
*/
#define true_word_char(ctype, character) \
                      ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \
                       (character) == '_')
/* Always 0: no additional characters are treated as word characters */
#define misc_word_char(X)       0

#define FT_MAX_WORD_LEN_FOR_SORT 31

#define FTPARSER_MEMROOT_ALLOC_SIZE 65536

#define COMPILE_STOPWORDS_IN

/* Interested readers may consult SMART
   (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z)
   for an excellent implementation of the vector space model we use.
   It also demonstrates the usage of different weighting techniques.
   This code, though, is completely original and is not based on the
   SMART code but was in some cases inspired by it.

   NORM_PIVOT was taken from the article
   A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization",
   ACM SIGIR'96, 21-29, 1996
 */

/*
  The *_IN_USE / LWS_FOR_QUERY names are aliases for one of the
  alternative formulas defined below.

  NOTE: the formulas expand to expressions over identifiers that are not
  defined in this header (count, p, docstat, sum, sum2, suml, doc_cnt,
  aio); those must be in scope wherever a macro is expanded.
*/
#define LWS_FOR_QUERY                                   LWS_TF
#define LWS_IN_USE                                      LWS_LOG
#define PRENORM_IN_USE                                  PRENORM_AVG
#define NORM_IN_USE                                     NORM_PIVOT
#define GWS_IN_USE                                      GWS_PROB
/*==============================================================*/
#define LWS_TF                                          (count)
#define LWS_BINARY                                      (count>0)
#define LWS_SQUARE                                      (count*count)
#define LWS_LOG                                         (count?(log( (double) count)+1):0)
/*--------------------------------------------------------------*/
#define PRENORM_NONE                                    (p->weight)
#define PRENORM_MAX                                     (p->weight/docstat.max)
#define PRENORM_AUG                                     (0.4+0.6*p->weight/docstat.max)
#define PRENORM_AVG                                     (p->weight/docstat.sum*docstat.uniq)
#define PRENORM_AVGLOG                                  ((1+log(p->weight))/(1+log(docstat.sum/docstat.uniq)))
/*--------------------------------------------------------------*/
#define NORM_NONE                                       (1)
#define NORM_SUM                                        (docstat.nsum)
#define NORM_COS                                        (sqrt(docstat.nsum2))

#define PIVOT_VAL                                       (0.0115)
#define NORM_PIVOT                                      (1+PIVOT_VAL*docstat.uniq)
/*---------------------------------------------------------------*/
#define GWS_NORM                                        (1/sqrt(sum2))
#define GWS_GFIDF                                       (sum/doc_cnt)
/*
  Mysterious, but w/o (double) GWS_IDF performs better :-o
  (GWS_IDF1 is the same formula with the cast applied before dividing.)
*/
#define GWS_IDF                                         log(aio->info->state->records/doc_cnt)
#define GWS_IDF1                                        log((double)aio->info->state->records/doc_cnt)
/* Evaluates to 0 instead of taking a logarithm when records <= doc_cnt */
#define GWS_PROB                                        ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 )
#define GWS_FREQ                                        (1.0/doc_cnt)
#define GWS_SQUARED                                     pow(log((double)aio->info->state->records/doc_cnt),2)
#define GWS_CUBIC                                       pow(log((double)aio->info->state->records/doc_cnt),3)
#define GWS_ENTROPY                                     (1-(suml/sum-log(sum))/log(aio->info->state->records))
/*=================================================================*/

/*
  Boolean search operators: each macro reads one position of
  ft_boolean_syntax (declared elsewhere).  Position 9 is not referenced
  by any of these macros.
*/
#define FTB_YES   (ft_boolean_syntax[0])
#define FTB_EGAL  (ft_boolean_syntax[1])
#define FTB_NO    (ft_boolean_syntax[2])
#define FTB_INC   (ft_boolean_syntax[3])
#define FTB_DEC   (ft_boolean_syntax[4])
#define FTB_LBR   (ft_boolean_syntax[5])
#define FTB_RBR   (ft_boolean_syntax[6])
#define FTB_NEG   (ft_boolean_syntax[7])
#define FTB_TRUNC (ft_boolean_syntax[8])
#define FTB_LQUOT (ft_boolean_syntax[10])
#define FTB_RQUOT (ft_boolean_syntax[11])

/*
  One parsed word.  In ma_ft_update.c, word lists are arrays of FT_WORD
  terminated by an element whose pos is 0.
*/
typedef struct st_maria_ft_word {
  uchar * pos;                  /* start of the word's bytes */
  uint   len;                   /* length of the word in bytes */
  double weight;                /* weight stored with the word's key */
} FT_WORD;

int is_stopword(char *word, uint len);

/* Builds the index key for one word; defined in ma_ft_update.c */
uint _ma_ft_make_key(MARIA_HA *, uint , uchar *, FT_WORD *, my_off_t);

uchar maria_ft_get_word(CHARSET_INFO *, uchar **, uchar *, FT_WORD *,
                        MYSQL_FTPARSER_BOOLEAN_INFO *);
uchar maria_ft_simple_get_word(CHARSET_INFO *, uchar **, const uchar *,
                               FT_WORD *, my_bool);

/*
  State for walking the segments of a full-text key over one record
  (see _ma_ft_segiterator() in ma_ft_update.c).
*/
typedef struct _st_maria_ft_seg_iterator {
  uint        num, len;         /* segments left / length of current value */
  HA_KEYSEG  *seg;              /* current segment; 0 for a dummy iterator */
  const uchar *rec, *pos;       /* record image / current value (0 if NULL) */
} FT_SEG_ITERATOR;

void _ma_ft_segiterator_init(MARIA_HA *, uint, const uchar *, FT_SEG_ITERATOR *);
void _ma_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *);
/* Returns 1 while a segment is available and 0 at the end */
uint _ma_ft_segiterator(FT_SEG_ITERATOR *);

void maria_ft_parse_init(TREE *, CHARSET_INFO *);
int maria_ft_parse(TREE *, uchar *, int, struct st_mysql_ftparser *parser,
                   MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
FT_WORD * maria_ft_linearize(TREE *, MEM_ROOT *);
FT_WORD * _ma_ft_parserecord(MARIA_HA *, uint, const uchar *, MEM_ROOT *);
uint _ma_ft_parse(TREE *, MARIA_HA *, uint, const uchar *,
                  MYSQL_FTPARSER_PARAM *, MEM_ROOT *);

FT_INFO *maria_ft_init_nlq_search(MARIA_HA *, uint, uchar *, uint, uint, uchar *);
FT_INFO *maria_ft_init_boolean_search(MARIA_HA *, uint, uchar *, uint, CHARSET_INFO *);

/* Entry points of the "nlq" search variant and its function table */
extern const struct _ft_vft _ma_ft_vft_nlq;
int maria_ft_nlq_read_next(FT_INFO *, char *);
float maria_ft_nlq_find_relevance(FT_INFO *, uchar *, uint);
void maria_ft_nlq_close_search(FT_INFO *);
float maria_ft_nlq_get_relevance(FT_INFO *);
my_off_t maria_ft_nlq_get_docid(FT_INFO *);
void maria_ft_nlq_reinit_search(FT_INFO *);

/* Entry points of the boolean search variant and its function table */
extern const struct _ft_vft _ma_ft_vft_boolean;
int maria_ft_boolean_read_next(FT_INFO *, char *);
float maria_ft_boolean_find_relevance(FT_INFO *, uchar *, uint);
void maria_ft_boolean_close_search(FT_INFO *);
float maria_ft_boolean_get_relevance(FT_INFO *);
my_off_t maria_ft_boolean_get_docid(FT_INFO *);
void maria_ft_boolean_reinit_search(FT_INFO *);
/* Returns NULL on failure (checked that way in _ma_ft_parserecord()) */
extern MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
                                                             uint keynr,
                                                             uint paramnr);
extern void maria_ftparser_call_deinitializer(MARIA_HA *info);
/*
  Full-text index maintenance entry points (defined in ma_ft_update.c).

  In the definitions the parameters are, in order: the table handle,
  the key number, a buffer in which keys are built, the record image(s)
  and - for add/del - the record position stored with each key.
*/

#include "maria_def.h"
#include "ft_global.h"

/* Returns 0 if the full-text segments of the two records are equal, else 1 */
int  _ma_ft_cmp(MARIA_HA *, uint, const uchar *, const uchar *);
/* Adds the words of a record to the index; 0 on success */
int  _ma_ft_add(MARIA_HA *, uint, uchar *, const uchar *, my_off_t);
/* Removes the words of a record from the index; 0 on success */
int  _ma_ft_del(MARIA_HA *, uint, uchar *, const uchar *, my_off_t);

/* Moves the entries of one word into a second-level tree */
uint _ma_ft_convert_to_ft2(MARIA_HA *, uint, uchar *);
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Return useful base information for an open table */
+
+#include "maria_def.h"
+#ifdef __WIN__
+#include <sys/stat.h>
+#endif
+
+/* Return the position of the last record (info->cur_row.lastpos) */
+
+MARIA_RECORD_POS maria_position(MARIA_HA *info)
+{
+  return info->cur_row.lastpos;
+}
+
+
+/* Get information about the table; 'flag' is a bitmap of HA_STATUS_* values selecting what to fill in 'x' */
+/* If HA_STATUS_NO_LOCK is set in flag one gets the current info (no sync from database) */
+
+int maria_status(MARIA_HA *info, register MARIA_INFO *x, uint flag)
+{
+  MY_STAT state;
+  MARIA_SHARE *share=info->s;
+  DBUG_ENTER("maria_status");
+
+  x->recpos= info->cur_row.lastpos;
+  if (flag == HA_STATUS_POS)
+    DBUG_RETURN(0); /* Compatible with ISAM */
+  if (!(flag & HA_STATUS_NO_LOCK)) /* Refresh under share->intern_lock */
+  {
+    pthread_mutex_lock(&share->intern_lock);
+    VOID(_ma_readinfo(info,F_RDLCK,0));
+    fast_ma_writeinfo(info);
+    pthread_mutex_unlock(&share->intern_lock);
+  }
+  if (flag & HA_STATUS_VARIABLE) /* Counters and file lengths */
+  {
+    x->records = info->state->records;
+    x->deleted = info->state->del;
+    x->delete_length = info->state->empty;
+    x->data_file_length =info->state->data_file_length;
+    x->index_file_length=info->state->key_file_length;
+
+    x->keys = share->state.header.keys;
+    x->check_time = share->state.check_time;
+    x->mean_reclength = x->records ?
+      (ulong) ((x->data_file_length - x->delete_length) /x ->records) :
+      (ulong) share->min_pack_length; /* No rows: fall back to min_pack_length */
+  }
+  if (flag & HA_STATUS_ERRKEY) /* Key number and position stored in info for the last key error */
+  {
+    x->errkey= info->errkey;
+    x->dup_key_pos= info->dup_key_pos;
+  }
+  if (flag & HA_STATUS_CONST) /* Values read from the share (base, options, state) */
+  {
+    x->reclength = share->base.reclength;
+    x->max_data_file_length=share->base.max_data_file_length;
+    x->max_index_file_length=info->s->base.max_key_file_length;
+    x->filenr = info->dfile.file;
+    x->options = share->options;
+    x->create_time=share->state.create_time;
+    x->reflength= maria_get_pointer_length(share->base.max_data_file_length,
+                                           maria_data_pointer_size);
+    x->record_offset= (info->s->data_file_type == STATIC_RECORD ?
+                       share->base.pack_reclength: 0);
+    x->sortkey= -1; /* No clustering */
+    x->rec_per_key = share->state.rec_per_key_part;
+    x->key_map = share->state.key_map;
+    x->data_file_name = share->data_file_name;
+    x->index_file_name = share->index_file_name;
+    x->data_file_type = share->data_file_type;
+  }
+  if ((flag & HA_STATUS_TIME) && !my_fstat(info->dfile.file, &state, MYF(0)))
+    x->update_time=state.st_mtime;
+  else
+    x->update_time=0; /* Not requested, or my_fstat() failed */
+  if (flag & HA_STATUS_AUTO)
+  {
+    x->auto_increment= share->state.auto_increment+1;
+    if (!x->auto_increment) /* This shouldn't happen */
+      x->auto_increment= ~(ulonglong) 0; /* +1 wrapped to 0: report the maximum instead */
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Write a message to the error log.
+
+  SYNOPSIS
+    _ma_report_error()
+    file_name Name of table file (e.g. index_file_name).
+    errcode Error number.
+
+  DESCRIPTION
+    This function supplies my_error() with a table name. Most error
+    messages need one. Since string arguments in error messages are limited
+    to 64 characters by convention, we ensure that in case of truncation,
+    that the end of the index file path is in the message. This contains
+    the most valuable information (the table name and the database name).
+
+  RETURN
+    void
+*/
+
+void _ma_report_error(int errcode, const char *file_name)
+{
+  uint length;
+  DBUG_ENTER("_ma_report_error");
+  DBUG_PRINT("enter",("errcode %d, table '%s'", errcode, file_name));
+
+  if ((length= strlen(file_name)) > 64)
+  {
+    /* Too long for the message: first drop the directory part of the path */
+    uint dir_length= dirname_length(file_name);
+    file_name+= dir_length;
+    if ((length-= dir_length) > 64)
+    {
+      /* Still too long: skip the start so that only the last 64 characters remain */
+      file_name+= length - 64;
+    }
+  }
+
+  my_error(errcode, MYF(ME_NOREFRESH), file_name);
+  DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_init.c b/storage/maria/ma_init.c
new file mode 100644
index 00000000000..fb8efddd778
--- /dev/null
+++ b/storage/maria/ma_init.c
@@ -0,0 +1,67 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Initialize a maria-database */
+
+#include "maria_def.h"
+#include <ft_global.h>
+#include "ma_blockrec.h"
+#include "trnman_public.h"
+#include "ma_checkpoint.h"
+
+my_bool maria_inited= FALSE;
+pthread_mutex_t THR_LOCK_maria;
+
+/*
+  Initialize maria
+
+  SYNOPSIS
+    maria_init()
+
+  TODO
+    Open log files and do recovery if needed
+
+  RETURN
+    0 ok (the only value returned at present)
+    # error number
+*/
+
+int maria_init(void)
+{
+  if (!maria_inited) /* Only the first call does any work */
+  {
+    maria_inited= TRUE;
+    pthread_mutex_init(&THR_LOCK_maria,MY_MUTEX_INIT_SLOW);
+    _ma_init_block_record_data();
+    my_handler_error_register();
+  }
+  return 0;
+}
+
+/* Shut maria down; does nothing unless maria_inited is set. Clears maria_inited so a second call is a no-op. */
+
+void maria_end(void)
+{
+  if (maria_inited)
+  {
+    maria_inited= maria_multi_threaded= FALSE;
+    ft_free_stopwords();
+    ma_checkpoint_end();
+    trnman_destroy();
+    translog_destroy();
+    end_pagecache(maria_log_pagecache, TRUE);
+    ma_control_file_end();
+    pthread_mutex_destroy(&THR_LOCK_maria);
+  }
+}
diff --git a/storage/maria/ma_key.c b/storage/maria/ma_key.c
new file mode 100644
index 00000000000..96b8d2af0eb
--- /dev/null
+++ b/storage/maria/ma_key.c
@@ -0,0 +1,569 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Functions to handle keys */
+
+#include "maria_def.h"
+#include "m_ctype.h"
+#include "ma_sp_defs.h"
+#ifdef HAVE_IEEEFP_H
+#include <ieeefp.h>
+#endif
+
+#define CHECK_KEYS /* Enable safety checks */
+
+#define FIX_LENGTH(cs, pos, length, char_length) \
+  do { \
+    if (length > char_length) \
+      char_length= my_charpos(cs, pos, pos+length, char_length); \
+    set_if_smaller(char_length,length); \
+  } while(0)
+
+static int _ma_put_key_in_record(MARIA_HA *info,uint keynr,uchar *record);
+
+/*
+  Make an internal key from a record
+
+  SYNOPSIS
+    _ma_make_key()
+    info MARIA handler
+    keynr key number
+    key Store created key here
+    record Record
+    filepos Position to record in the data file
+
+  RETURN
+    Length of key
+*/
+
+uint _ma_make_key(register MARIA_HA *info, uint keynr, uchar *key,
+                  const uchar *record, MARIA_RECORD_POS filepos)
+{
+  const uchar *pos;
+  uchar *start;
+  reg1 HA_KEYSEG *keyseg;
+  my_bool is_ft= info->s->keyinfo[keynr].flag & HA_FULLTEXT;
+  DBUG_ENTER("_ma_make_key");
+
+  if (info->s->keyinfo[keynr].flag & HA_SPATIAL)
+  {
+    /*
+      TODO: nulls processing
+    */
+#ifdef HAVE_SPATIAL
+    DBUG_RETURN(_ma_sp_make_key(info,keynr, key,record,filepos));
+#else
+    DBUG_ASSERT(0); /* maria_open should check that this never happens */
+#endif
+  }
+
+  start=key;
+  for (keyseg=info->s->keyinfo[keynr].seg ; keyseg->type ;keyseg++)
+  {
+    enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type;
+    uint length=keyseg->length;
+    uint char_length;
+    CHARSET_INFO *cs=keyseg->charset;
+
+    if (keyseg->null_bit)
+    {
+      if (record[keyseg->null_pos] & keyseg->null_bit)
+      {
+        *key++= 0; /* NULL in key */
+        continue;
+      }
+      *key++=1; /* Not NULL */
+    }
+
+    char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen :
+                  length);
+
+    pos= record+keyseg->start;
+    if (type == HA_KEYTYPE_BIT)
+    {
+      if (keyseg->bit_length)
+      {
+        uchar bits= get_rec_bits((uchar*) record + keyseg->bit_pos,
+                                 keyseg->bit_start, keyseg->bit_length);
+        *key++= (char) bits;
+        length--;
+      }
+      memcpy(key, pos, length);
+      key+= length;
+      continue;
+    }
+    if (keyseg->flag & HA_SPACE_PACK)
+    {
+      if (type != HA_KEYTYPE_NUM)
+      {
+        length= cs->cset->lengthsp(cs, pos, length);
+      }
+      else
+      {
+        const uchar *end= pos + length;
+        while (pos < end && pos[0] == ' ') /* Skip leading spaces */
+          pos++;
+        length= (uint) (end-pos);
+      }
+      FIX_LENGTH(cs, pos, length, char_length);
+      store_key_length_inc(key,char_length);
+      memcpy(key, pos, (size_t) char_length);
+      key+=char_length;
+      continue;
+    }
+    if (keyseg->flag & HA_VAR_LENGTH_PART)
+    {
+      uint pack_length= (keyseg->bit_start == 1 ? 1 : 2);
+      uint tmp_length= (pack_length == 1 ? (uint) *(uchar*) pos :
+                        uint2korr(pos));
+      pos+= pack_length; /* Skip VARCHAR length */
+      set_if_smaller(length,tmp_length);
+      FIX_LENGTH(cs, pos, length, char_length);
+      store_key_length_inc(key,char_length);
+      memcpy(key,pos,(size_t) char_length);
+      key+= char_length;
+      continue;
+    }
+    else if (keyseg->flag & HA_BLOB_PART)
+    {
+      uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos);
+      memcpy_fixed(&pos,pos+keyseg->bit_start,sizeof(char*));
+      set_if_smaller(length,tmp_length);
+      FIX_LENGTH(cs, pos, length, char_length);
+      store_key_length_inc(key,char_length);
+      memcpy(key,pos,(size_t) char_length);
+      key+= char_length;
+      continue;
+    }
+    else if (keyseg->flag & HA_SWAP_KEY)
+    { /* Numerical column */
+#ifdef HAVE_ISNAN
+      if (type == HA_KEYTYPE_FLOAT)
+      {
+        float nr;
+        float4get(nr,pos);
+        if (isnan(nr))
+        {
+          /* Replace NAN with zero */
+          bzero(key,length);
+          key+=length;
+          continue;
+        }
+      }
+      else if (type == HA_KEYTYPE_DOUBLE)
+      {
+        double nr;
+        float8get(nr,pos);
+        if (isnan(nr))
+        {
+          bzero(key,length);
+          key+=length;
+          continue;
+        }
+      }
+#endif
+      pos+=length;
+      while (length--) /* Copy the bytes in reverse order */
+      {
+        *key++ = *--pos;
+      }
+      continue;
+    }
+    FIX_LENGTH(cs, pos, length, char_length);
+    memcpy(key, pos, char_length);
+    if (length > char_length)
+      cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+    key+= length;
+  }
+  _ma_dpointer(info,key,filepos);
+  DBUG_PRINT("exit",("keynr: %d",keynr));
+  DBUG_DUMP("key",start,(uint) (key-start)+keyseg->length);
+  DBUG_EXECUTE("key",
+               _ma_print_key(DBUG_FILE,info->s->keyinfo[keynr].seg,start,
+                             (uint) (key-start)););
+  DBUG_RETURN((uint) (key-start)); /* Return keylength */
+} /* _ma_make_key */
+
+
+/*
+  Pack a key to intern format from given format (c_rkey)
+
+  SYNOPSIS
+    _ma_pack_key()
+    info MARIA handler
+    uint keynr key number
+    key Store packed key here
+    old Not packed key
+    keypart_map bitmap of used keyparts
+    last_used_keyseg out parameter. May be NULL
+
+  RETURN
+    length of packed key
+
+    last_used_keyseg Store pointer to the keyseg after the last used one
+*/
+
+uint _ma_pack_key(register MARIA_HA *info, uint keynr, uchar *key,
+                  const uchar *old, key_part_map keypart_map,
+                  HA_KEYSEG **last_used_keyseg)
+{
+  uchar *start_key=key;
+  HA_KEYSEG *keyseg;
+  my_bool is_ft= info->s->keyinfo[keynr].flag & HA_FULLTEXT;
+  DBUG_ENTER("_ma_pack_key");
+
+  /* "one part" rtree key is 2*SPDIMS part key in Maria */
+  if (info->s->keyinfo[keynr].key_alg == HA_KEY_ALG_RTREE)
+    keypart_map= (((key_part_map)1) << (2*SPDIMS)) - 1;
+
+  /* only key prefixes are supported */
+  DBUG_ASSERT(((keypart_map+1) & keypart_map) == 0);
+
+  for (keyseg=info->s->keyinfo[keynr].seg ; keyseg->type && keypart_map;
+       old+= keyseg->length, keyseg++)
+  {
+    enum ha_base_keytype type= (enum ha_base_keytype) keyseg->type;
+    uint length= keyseg->length;
+    uint char_length;
+    const uchar *pos;
+    CHARSET_INFO *cs=keyseg->charset;
+
+    keypart_map>>= 1;
+    if (keyseg->null_bit)
+    {
+      if (!(*key++= (char) 1-*old++)) /* Copy null marker */
+      {
+        if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))
+          old+= 2;
+        continue; /* Found NULL */
+      }
+    }
+    char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen :
+                  length);
+    pos= old;
+    if (keyseg->flag & HA_SPACE_PACK)
+    {
+      const uchar *end= pos + length;
+      if (type == HA_KEYTYPE_NUM)
+      {
+        while (pos < end && pos[0] == ' ') /* Skip leading spaces */
+          pos++;
+      }
+      else if (type != HA_KEYTYPE_BINARY)
+      {
+        while (end > pos && end[-1] == ' ') /* Strip trailing spaces */
+          end--;
+      }
+      length=(uint) (end-pos);
+      FIX_LENGTH(cs, pos, length, char_length);
+      store_key_length_inc(key,char_length);
+      memcpy(key,pos,(size_t) char_length);
+      key+= char_length;
+      continue;
+    }
+    else if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))
+    {
+      /* Length of key-part used with maria_rkey() always 2 */
+      uint tmp_length=uint2korr(pos);
+      pos+=2;
+      set_if_smaller(length,tmp_length); /* Safety */
+      FIX_LENGTH(cs, pos, length, char_length);
+      store_key_length_inc(key,char_length);
+      old+=2; /* Skip length */
+      memcpy(key, pos,(size_t) char_length);
+      key+= char_length;
+      continue;
+    }
+    else if (keyseg->flag & HA_SWAP_KEY)
+    { /* Numerical column */
+      pos+=length;
+      while (length--)
+        *key++ = *--pos;
+      continue;
+    }
+    FIX_LENGTH(cs, pos, length, char_length);
+    memcpy(key, pos, char_length);
+    if (length > char_length)
+      cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+    key+= length;
+  }
+  if (last_used_keyseg)
+    *last_used_keyseg= keyseg;
+
+  DBUG_PRINT("exit", ("length: %u", (uint) (key-start_key)));
+  DBUG_RETURN((uint) (key-start_key));
+} /* _ma_pack_key */
+
+
+
+/*
+  Store found key in record
+
+  SYNOPSIS
+    _ma_put_key_in_record()
+    info MARIA handler
+    keynr Key number that was used
+    record Store key here
+
+    Last read key is in info->lastkey
+
+  NOTES
+    Used when only-keyread is wanted
+
+  RETURN
+    0 ok
+    1 error
+*/
+
+static int _ma_put_key_in_record(register MARIA_HA *info, uint keynr,
+                                 uchar *record)
+{
+  reg2 uchar *key;
+  uchar *pos,*key_end;
+  reg1 HA_KEYSEG *keyseg;
+  uchar *blob_ptr;
+  DBUG_ENTER("_ma_put_key_in_record");
+
+  blob_ptr= info->lastkey2; /* Place to put blob parts */
+  key=info->lastkey; /* Key that was read */
+  key_end=key+info->lastkey_length;
+  for (keyseg=info->s->keyinfo[keynr].seg ; keyseg->type ;keyseg++)
+  {
+    if (keyseg->null_bit)
+    {
+      if (!*key++)
+      {
+        record[keyseg->null_pos]|= keyseg->null_bit;
+        continue;
+      }
+      record[keyseg->null_pos]&= ~keyseg->null_bit;
+    }
+    if (keyseg->type == HA_KEYTYPE_BIT)
+    {
+      uint length= keyseg->length;
+
+      if (keyseg->bit_length)
+      {
+        uchar bits= *key++;
+        set_rec_bits(bits, record + keyseg->bit_pos, keyseg->bit_start,
+                     keyseg->bit_length);
+        length--;
+      }
+      else
+      {
+        clr_rec_bits(record + keyseg->bit_pos, keyseg->bit_start,
+                     keyseg->bit_length);
+      }
+      memcpy(record + keyseg->start, key, length);
+      key+= length;
+      continue;
+    }
+    if (keyseg->flag & HA_SPACE_PACK)
+    {
+      uint length;
+      get_key_length(length,key);
+#ifdef CHECK_KEYS
+      if (length > keyseg->length || key+length > key_end)
+        goto err;
+#endif
+      pos= record+keyseg->start;
+      if (keyseg->type != (int) HA_KEYTYPE_NUM)
+      {
+        memcpy(pos,key,(size_t) length);
+        keyseg->charset->cset->fill(keyseg->charset,
+                                    pos + length, keyseg->length - length,
+                                    ' ');
+      }
+      else
+      {
+        bfill(pos,keyseg->length-length,' ');
+        memcpy(pos+keyseg->length-length,key,(size_t) length);
+      }
+      key+=length;
+      continue;
+    }
+
+    if (keyseg->flag & HA_VAR_LENGTH_PART)
+    {
+      uint length;
+      get_key_length(length,key);
+#ifdef CHECK_KEYS
+      if (length > keyseg->length || key+length > key_end)
+        goto err;
+#endif
+      /* Store key length */
+      if (keyseg->bit_start == 1)
+        *(uchar*) (record+keyseg->start)= (uchar) length;
+      else
+        int2store(record+keyseg->start, length);
+      /* And key data */
+      memcpy(record+keyseg->start + keyseg->bit_start, key, length);
+      key+= length;
+    }
+    else if (keyseg->flag & HA_BLOB_PART)
+    {
+      uint length;
+      get_key_length(length,key);
+#ifdef CHECK_KEYS
+      if (length > keyseg->length || key+length > key_end)
+        goto err;
+#endif
+      memcpy(record+keyseg->start+keyseg->bit_start,
+             (char*) &blob_ptr,sizeof(char*));
+      memcpy(blob_ptr,key,length);
+      blob_ptr+=length;
+
+      /* The above changed info->lastkey2. Inform maria_rnext_same(). */
+      info->update&= ~HA_STATE_RNEXT_SAME;
+
+      _ma_store_blob_length(record+keyseg->start,
+                            (uint) keyseg->bit_start,length);
+      key+=length;
+    }
+    else if (keyseg->flag & HA_SWAP_KEY)
+    {
+      uchar *to= record+keyseg->start+keyseg->length;
+      uchar *end= key+keyseg->length;
+#ifdef CHECK_KEYS
+      if (end > key_end)
+        goto err;
+#endif
+      do /* Copy the bytes back in reverse order */
+      {
+        *--to= *key++;
+      } while (key != end);
+      continue;
+    }
+    else
+    {
+#ifdef CHECK_KEYS
+      if (key+keyseg->length > key_end)
+        goto err;
+#endif
+      memcpy(record+keyseg->start, key, (size_t) keyseg->length);
+      key+= keyseg->length;
+    }
+  }
+  DBUG_RETURN(0);
+
+err:
+  DBUG_RETURN(1); /* Crashed row */
+} /* _ma_put_key_in_record */
+
+
+  /* Here when key reads are used */
+
+int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos)
+{
+  fast_ma_writeinfo(info);
+  if (filepos != HA_OFFSET_ERROR)
+  {
+    if (info->lastinx >= 0)
+    { /* Read only key */
+      if (_ma_put_key_in_record(info,(uint) info->lastinx,buf))
+      {
+        maria_print_error(info->s, HA_ERR_CRASHED);
+        my_errno=HA_ERR_CRASHED;
+        return -1;
+      }
+      info->update|= HA_STATE_AKTIV; /* We should find a record */
+      return 0;
+    }
+    my_errno=HA_ERR_WRONG_INDEX;
+  }
+  return(-1); /* Wrong data to read */
+}
+
+
+/*
+  Retrieve auto_increment info
+
+  SYNOPSIS
+    retrieve_auto_increment()
+    info Maria handler
+    record Row to update
+
+  IMPLEMENTATION
+    For signed columns we don't retrieve the auto increment value if it's
+    less than zero.
+*/
+
+ulonglong ma_retrieve_auto_increment(MARIA_HA *info,const uchar *record)
+{
+  ulonglong value= 0; /* Store unsigned values here */
+  longlong s_value= 0; /* Store signed values here */
+  HA_KEYSEG *keyseg= info->s->keyinfo[info->s->base.auto_key-1].seg; /* First segment of the auto_increment key */
+  const uchar *key= record + keyseg->start; /* Column bytes inside the row */
+
+  switch (keyseg->type) {
+  case HA_KEYTYPE_INT8:
+    s_value= (longlong) *(const signed char*) key; /* Plain char may be unsigned; force a signed read */
+    break;
+  case HA_KEYTYPE_BINARY:
+    value=(ulonglong) *(uchar*) key;
+    break;
+  case HA_KEYTYPE_SHORT_INT:
+    s_value= (longlong) sint2korr(key);
+    break;
+  case HA_KEYTYPE_USHORT_INT:
+    value=(ulonglong) uint2korr(key);
+    break;
+  case HA_KEYTYPE_LONG_INT:
+    s_value= (longlong) sint4korr(key);
+    break;
+  case HA_KEYTYPE_ULONG_INT:
+    value=(ulonglong) uint4korr(key);
+    break;
+  case HA_KEYTYPE_INT24:
+    s_value= (longlong) sint3korr(key);
+    break;
+  case HA_KEYTYPE_UINT24:
+    value=(ulonglong) uint3korr(key);
+    break;
+  case HA_KEYTYPE_FLOAT: /* This shouldn't be used */
+  {
+    float f_1;
+    float4get(f_1,key);
+    /* Ignore negative values */
+    value = (f_1 < (float) 0.0) ? 0 : (ulonglong) f_1;
+    break;
+  }
+  case HA_KEYTYPE_DOUBLE: /* This shouldn't be used */
+  {
+    double f_1;
+    float8get(f_1,key);
+    /* Ignore negative values */
+    value = (f_1 < 0.0) ? 0 : (ulonglong) f_1;
+    break;
+  }
+  case HA_KEYTYPE_LONGLONG:
+    s_value= sint8korr(key);
+    break;
+  case HA_KEYTYPE_ULONGLONG:
+    value= uint8korr(key);
+    break;
+  default:
+    DBUG_ASSERT(0);
+    value=0; /* Error */
+    break;
+  }
+
+  /*
+    Only one of s_value and value is set by the switch above, the other
+    stays 0. A negative s_value therefore yields 0 (value), a positive
+    s_value is returned as is, and for unsigned types value is returned.
+  */
+  return (s_value > 0) ? (ulonglong) s_value : value;
+}
diff --git a/storage/maria/ma_keycache.c b/storage/maria/ma_keycache.c
new file mode 100644
index 00000000000..7a2a56488e6
--- /dev/null
+++ b/storage/maria/ma_keycache.c
@@ -0,0 +1,163 @@
+/* Copyright (C) 2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+  Key cache assignments
+*/
+
+#include "maria_def.h"
+
+/*
+  Assign pages of the index file for a table to a key cache
+
+  SYNOPSIS
+    maria_assign_to_pagecache()
+    info open table
+    key_map map of indexes to assign to the key cache
+    pagecache_ptr pointer to the key cache handle
+    assign_lock Mutex to lock during assignment
+
+  PREREQUISITES
+    One must have a READ lock or a WRITE lock on the table when calling
+    the function to ensure that there is no other writers to it.
+
+    The caller must also ensure that one doesn't call this function from
+    two different threads with the same table.
+
+  NOTES
+    At present pages for all indexes must be assigned to the same key cache.
+    In future only pages for indexes specified in the key_map parameter
+    of the table will be assigned to the specified key cache.
+
+  RETURN VALUE
+    0 If a success
+    # Error code
+*/
+
+int maria_assign_to_pagecache(MARIA_HA *info,
+                              ulonglong key_map __attribute__((unused)),
+                              PAGECACHE *pagecache)
+{
+  int error= 0;
+  MARIA_SHARE* share= info->s;
+  DBUG_ENTER("maria_assign_to_pagecache");
+  DBUG_PRINT("enter",
+             ("old_pagecache_handle: 0x%lx new_pagecache_handle: 0x%lx",
+              (long) share->pagecache, (long) pagecache));
+
+  /*
+    Skip operation if we didn't change key cache. This can happen if we
+    call this for all open instances of the same table
+  */
+  if (share->pagecache == pagecache)
+    DBUG_RETURN(0);
+
+  /*
+    First flush all blocks for the table in the old key cache.
+    This is to ensure that the disk is consistent with the data pages
+    in memory (which may not be the case if the table uses delayed_key_write)
+
+    Note that some other read thread may still fill in the key cache with
+    new blocks during this call and after, but this doesn't matter as
+    all threads will start using the new key cache for their next call to
+    maria library and we know that there will not be any changed blocks
+    in the old key cache.
+  */
+
+  if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE))
+  {
+    error= my_errno; /* Remembered, but the reassignment still goes ahead */
+    maria_print_error(info->s, HA_ERR_CRASHED);
+    maria_mark_crashed(info); /* Mark that table must be checked */
+  }
+
+  /*
+    Flush the new key cache for this file. This is needed to ensure
+    that there are no old blocks (with outdated data) left in the new key
+    cache from an earlier assign_to_keycache operation
+
+    (This can never fail as there is never any not written data in the
+    new key cache)
+  */
+  (void) flush_pagecache_blocks(pagecache, &share->kfile, FLUSH_RELEASE);
+
+  /*
+    ensure that setting the key cache and changing the multi_pagecache
+    is done atomically
+  */
+  pthread_mutex_lock(&share->intern_lock);
+  /*
+    Tell all threads to use the new key cache
+    This should be seen at the latest for the next call to a maria function.
+  */
+  share->pagecache= pagecache;
+
+  /* store the key cache in the global hash structure for future opens */
+  if (multi_pagecache_set(share->unique_file_name, share->unique_name_length,
+                          share->pagecache))
+    error= my_errno;
+  pthread_mutex_unlock(&share->intern_lock);
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Change all MARIA entries that use one key cache to another key cache
+
+  SYNOPSIS
+    maria_change_pagecache()
+    old_pagecache Old key cache
+    new_pagecache New key cache
+
+  NOTES
+    This is used when we delete one key cache.
+
+    To handle the case where some other thread tries to open a MARIA
+    table associated with the to-be-deleted key cache while this operation
+    is running, we have to call 'multi_pagecache_change()' from this
+    function while we have a lock on the MARIA table list structure.
+
+    This is safe as long as it's only MARIA that is using this specific
+    key cache.
+*/
+
+
+void maria_change_pagecache(PAGECACHE *old_pagecache,
+                            PAGECACHE *new_pagecache)
+{
+  LIST *pos;
+  DBUG_ENTER("maria_change_pagecache");
+
+  /*
+    Lock list to ensure that no one can close the table while we manipulate it
+  */
+  pthread_mutex_lock(&THR_LOCK_maria);
+  for (pos=maria_open_list ; pos ; pos=pos->next)
+  {
+    MARIA_HA *info= (MARIA_HA*) pos->data;
+    MARIA_SHARE *share= info->s;
+    if (share->pagecache == old_pagecache)
+      maria_assign_to_pagecache(info, (ulonglong) ~0, new_pagecache); /* Return value is not checked here */
+  }
+
+  /*
+    We have to do the following call while we have the lock on the
+    MARIA list structure to ensure that another thread is not trying to
+    open a new table that will be associated with the old key cache
+  */
+  multi_pagecache_change(old_pagecache, new_pagecache);
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_locking.c b/storage/maria/ma_locking.c
new file mode 100644
index 00000000000..01d59ed56df
--- /dev/null
+++ b/storage/maria/ma_locking.c
@@ -0,0 +1,570 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+  Locking of isam-tables.
+  Reads info from an isam-table. Must be the first request before doing any
+  further calls to any isam function. Is used to allow many processes to use
+  the same isam database.
+*/
+
+#include "ma_ftdefs.h"
+
+  /* lock table by F_UNLCK, F_RDLCK or F_WRLCK */
+
+int maria_lock_database(MARIA_HA *info, int lock_type)
+{
+  int error;
+  uint count;
+  MARIA_SHARE *share=info->s;
+  DBUG_ENTER("maria_lock_database");
+  DBUG_PRINT("enter",("lock_type: %d old lock %d r_locks: %u w_locks: %u "
+                      "global_changed: %d open_count: %u name: '%s'",
+                      lock_type, info->lock_type, share->r_locks,
+                      share->w_locks,
+                      share->global_changed, share->state.open_count,
+                      share->index_file_name));
+  if (share->options & HA_OPTION_READ_ONLY_DATA ||
+      info->lock_type == lock_type)
+    DBUG_RETURN(0); /* Read-only table or lock already held: nothing to do */
+  if (lock_type == F_EXTRA_LCK) /* Used by TMP tables */
+  {
+    ++share->w_locks;
+    ++share->tot_locks;
+    info->lock_type= lock_type;
+    DBUG_RETURN(0);
+  }
+
+  error=0;
+  pthread_mutex_lock(&share->intern_lock);
+  if (share->kfile.file >= 0) /* May only be false on windows */
+  {
+    switch (lock_type) {
+    case F_UNLCK: /* Release our lock; flush when the last write lock goes away */
+      maria_ftparser_call_deinitializer(info);
+      if (info->lock_type == F_RDLCK)
+      {
+        count= --share->r_locks;
+        _ma_restore_status(info);
+      }
+      else
+      {
+        count= --share->w_locks;
+        _ma_update_status(info);
+      }
+      --share->tot_locks;
+      if (info->lock_type == F_WRLCK && !share->w_locks)
+      {
+        if (!share->delay_key_write &&
+            flush_pagecache_blocks(share->pagecache, &share->kfile,
+                                   FLUSH_KEEP))
+        {
+          error= my_errno;
+          maria_print_error(info->s, HA_ERR_CRASHED);
+          /* Mark that table must be checked */
+          maria_mark_crashed(info);
+        }
+        /* pages of transactional tables get flushed at Checkpoint */
+        if (!share->base.born_transactional &&
+            _ma_flush_table_files(info, MARIA_FLUSH_DATA,
+                                  FLUSH_KEEP, FLUSH_KEEP))
+          error= my_errno;
+      }
+      if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+      {
+        if (end_io_cache(&info->rec_cache))
+        {
+          error=my_errno;
+          maria_print_error(info->s, HA_ERR_CRASHED);
+          maria_mark_crashed(info);
+        }
+      }
+      if (!count) /* That was the last lock of the kind we held */
+      {
+        DBUG_PRINT("info",("changed: %u w_locks: %u",
+                           (uint) share->changed, share->w_locks));
+        if (share->changed && !share->w_locks)
+        {
+#ifdef HAVE_MMAP
+          if ((info->s->mmaped_length !=
+               info->s->state.state.data_file_length) &&
+              (info->s->nonmmaped_inserts > MAX_NONMAPPED_INSERTS))
+          {
+            if (info->s->concurrent_insert)
+              rw_wrlock(&info->s->mmap_lock);
+            _ma_remap_file(info, info->s->state.state.data_file_length);
+            info->s->nonmmaped_inserts= 0;
+            if (info->s->concurrent_insert)
+              rw_unlock(&info->s->mmap_lock);
+          }
+#endif
+          share->state.process= share->last_process=share->this_process;
+          share->state.unique= info->last_unique= info->this_unique;
+          share->state.update_count= info->last_loop= ++info->this_loop;
+          /* transactional tables rather flush their state at Checkpoint */
+          if (!share->base.born_transactional)
+          {
+            if (_ma_state_info_write_sub(share->kfile.file, &share->state, 1))
+              error= my_errno;
+            else
+            {
+              /* A value of 0 below means "state flushed" */
+              share->changed= 0;
+            }
+          }
+          if (maria_flush)
+          {
+            if (_ma_sync_table_files(info))
+              error= my_errno;
+          }
+          else
+            share->not_flushed=1;
+          if (error)
+          {
+            maria_print_error(info->s, HA_ERR_CRASHED);
+            maria_mark_crashed(info);
+          }
+        }
+      }
+      info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+      info->lock_type= F_UNLCK;
+      /* verify that user of the table cleaned up after itself */
+      DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+      break;
+    case F_RDLCK:
+      if (info->lock_type == F_WRLCK)
+      {
+        /*
+          Change RW to READONLY
+
+          mysqld does not turn write locks to read locks,
+          so we're never here in mysqld.
+        */
+        share->w_locks--;
+        share->r_locks++;
+        info->lock_type=lock_type;
+        break;
+      }
+#ifdef MARIA_EXTERNAL_LOCKING
+      if (!share->r_locks && !share->w_locks)
+      {
+        /* note that a transactional table should not do this */
+        if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+        {
+          error=my_errno;
+          break;
+        }
+      }
+#endif
+      VOID(_ma_test_if_changed(info));
+      share->r_locks++;
+      share->tot_locks++;
+      info->lock_type=lock_type;
+      break;
+    case F_WRLCK:
+      if (info->lock_type == F_RDLCK)
+      { /* Change READONLY to RW */
+        if (share->r_locks == 1) /* Only when ours is the only read lock */
+        {
+          share->r_locks--;
+          share->w_locks++;
+          info->lock_type=lock_type;
+          break;
+        }
+      }
+#ifdef MARIA_EXTERNAL_LOCKING
+      if (!(share->options & HA_OPTION_READ_ONLY_DATA))
+      {
+        if (!share->w_locks)
+        {
+          if (!share->r_locks)
+          {
+            /*
+              Note that transactional tables should not do this.
+              If we enabled this code, we should make sure to skip it if
+              born_transactional is true. We should not test
+              now_transactional to decide if we can call
+              _ma_state_info_read_dsk(), because it can temporarily be 0
+              (TRUNCATE on a partitioned table) and thus it would make a state
+              modification below without mutex, confusing a concurrent
+              checkpoint running.
+              Even if this code was enabled only for non-transactional tables:
+              in scenario LOCK TABLE t1 WRITE; INSERT INTO t1; DELETE FROM t1;
+              state on disk read by DELETE is obsolete as it was not flushed
+              at the end of INSERT. MyISAM same. It however causes no issue as
+              maria_delete_all_rows() calls _ma_reset_status() thus is not
+              influenced by the obsolete read values.
+            */
+            if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+            {
+              error=my_errno;
+              break;
+            }
+          }
+        }
+      }
+#endif /* defined(MARIA_EXTERNAL_LOCKING) */
+      VOID(_ma_test_if_changed(info));
+
+      info->lock_type=lock_type;
+      info->invalidator=info->s->invalidator;
+      share->w_locks++;
+      share->tot_locks++;
+      break;
+    default:
+      DBUG_ASSERT(0);
+      break; /* Impossible */
+    }
+  }
+#ifdef __WIN__
+  else
+  {
+    /*
+      Check for bad file descriptors if this table is part
+      of a merge union. Failing to capture this may cause
+      a crash on windows if the table is renamed and
+      later on referenced by the merge table.
+    */
+    if( info->owned_by_merge && (info->s)->kfile.file < 0 )
+    {
+      error = HA_ERR_NO_SUCH_TABLE;
+    }
+  }
+#endif
+  pthread_mutex_unlock(&share->intern_lock);
+  DBUG_RETURN(error);
+} /* maria_lock_database */
+
+
+/****************************************************************************
+  The following functions are called by thr_lock() in threaded applications
+****************************************************************************/
+
+/*
+  Create a copy of the current status for the table
+
+  SYNOPSIS
+    _ma_get_status()
+    param Pointer to Maria handler
+    concurrent_insert Set to 1 if we are going to do concurrent inserts
+                      (THR_WRITE_CONCURRENT_INSERT was used)
+*/
+
+void _ma_get_status(void* param, int concurrent_insert)
+{
+  MARIA_HA *info=(MARIA_HA*) param;
+  DBUG_ENTER("_ma_get_status");
+  DBUG_PRINT("info",("key_file: %ld data_file: %ld concurrent_insert: %d",
+                     (long) info->s->state.state.key_file_length,
+                     (long) info->s->state.state.data_file_length,
+                     concurrent_insert));
+#ifndef DBUG_OFF
+  if (info->state->key_file_length > info->s->state.state.key_file_length ||
+      info->state->data_file_length > info->s->state.state.data_file_length)
+    DBUG_PRINT("warning",("old info: key_file: %ld data_file: %ld",
+                          (long) info->state->key_file_length,
+                          (long) info->state->data_file_length));
+#endif
+  info->save_state=info->s->state.state; /* Private snapshot of the shared state */
+  info->state= &info->save_state;
+  info->append_insert_at_end= concurrent_insert;
+  DBUG_VOID_RETURN;
+}
+
+/* Copy our private state back into the share, but only if info->state points at our own save_state */
+
+void _ma_update_status(void* param)
+{
+  MARIA_HA *info=(MARIA_HA*) param;
+  MARIA_SHARE *share= info->s;
+  /*
+    Because someone may have closed the table we point at, we only
+    update the state if it's our own state. This isn't a problem as
+    we are always pointing at our own lock or at a read lock.
+    (This is enforced by thr_multi_lock.c)
+  */
+  if (info->state == &info->save_state)
+  {
+#ifndef DBUG_OFF
+    DBUG_PRINT("info",("updating status: key_file: %ld data_file: %ld",
+                       (long) info->state->key_file_length,
+                       (long) info->state->data_file_length));
+    if (info->state->key_file_length < share->state.state.key_file_length ||
+        info->state->data_file_length < share->state.state.data_file_length)
+      DBUG_PRINT("warning",("old info: key_file: %ld data_file: %ld",
+                            (long) share->state.state.key_file_length,
+                            (long) share->state.state.data_file_length));
+#endif
+    /*
+      we are going to modify the state without lock's log, this would break
+      recovery if done with a transactional table.
+    */
+    DBUG_ASSERT(!info->s->base.born_transactional);
+    share->state.state= *info->state;
+    info->state= &share->state.state;
+  }
+  info->append_insert_at_end= 0;
+}
+
+/* Point info->state back at the shared state; nothing is copied */
+
+void _ma_restore_status(void *param)
+{
+  MARIA_HA *info= (MARIA_HA*) param;
+  info->state= &info->s->state.state;
+  info->append_insert_at_end= 0;
+}
+
+/* Make handler 'to' use the save_state of handler 'from' as its state */
+
+void _ma_copy_status(void* to,void *from)
+{
+  ((MARIA_HA*) to)->state= &((MARIA_HA*) from)->save_state;
+}
+
+
+/*
+  Check if should allow concurrent inserts
+
+  IMPLEMENTATION
+    Allow concurrent inserts if we don't have a hole in the table or
+    if there is no active write lock and there is active read locks and
+    maria_concurrent_insert == 2. In this last case the new
+    row('s) are inserted at end of file instead of filling up the hole.
+
+    The last case is to allow one to inserts into a heavily read-used table
+    even if there is holes.
+ + NOTES + If there are rtree indexes in the table, concurrent inserts are + disabled in maria_open() + + RETURN + 0 ok to use concurrent inserts + 1 not ok +*/ + +my_bool _ma_check_status(void *param) +{ + MARIA_HA *info=(MARIA_HA*) param; + /* + The test for w_locks == 1 is here because this thread has already done an + external lock (in other words: w_locks == 1 means no other thread has + a write lock) + */ + DBUG_PRINT("info",("dellink: %ld r_locks: %u w_locks: %u", + (long) info->s->state.dellink, (uint) info->s->r_locks, + (uint) info->s->w_locks)); + return (my_bool) !(info->s->state.dellink == HA_OFFSET_ERROR || + (maria_concurrent_insert == 2 && info->s->r_locks && + info->s->w_locks == 1)); +} + + +/**************************************************************************** + ** functions to read / write the state +****************************************************************************/ + +int _ma_readinfo(register MARIA_HA *info __attribute__ ((unused)), + int lock_type __attribute__ ((unused)), + int check_keybuffer __attribute__ ((unused))) +{ +#ifdef MARIA_EXTERNAL_LOCKING + DBUG_ENTER("_ma_readinfo"); + + if (info->lock_type == F_UNLCK) + { + MARIA_SHARE *share=info->s; + if (!share->tot_locks) + { + /* should not be done for transactional tables */ + if (_ma_state_info_read_dsk(share->kfile.file, &share->state)) + { + int error=my_errno ? my_errno : -1; + my_errno=error; + DBUG_RETURN(1); + } + } + if (check_keybuffer) + VOID(_ma_test_if_changed(info)); + info->invalidator=info->s->invalidator; + } + else if (lock_type == F_WRLCK && info->lock_type == F_RDLCK) + { + my_errno=EACCES; /* Not allowed to change */ + DBUG_RETURN(-1); /* when have read_lock() */ + } + DBUG_RETURN(0); +#else + return 0; +#endif /* defined(MARIA_EXTERNAL_LOCKING) */ +} /* _ma_readinfo */ + + +/* + Every isam-function that updates the isam-database MUST end with this + request + + NOTES + my_errno is not changed if this succeeds! 
+*/ + +int _ma_writeinfo(register MARIA_HA *info, uint operation) +{ + int error,olderror; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_writeinfo"); + DBUG_PRINT("info",("operation: %u tot_locks: %u", operation, + share->tot_locks)); + + error=0; + if (share->tot_locks == 0 && !share->base.born_transactional) + { + /* transactional tables flush their state at Checkpoint */ + if (operation) + { /* Two threads can't be here */ + olderror= my_errno; /* Remember last error */ + share->state.process= share->last_process= share->this_process; + share->state.unique= info->last_unique= info->this_unique; + share->state.update_count= info->last_loop= ++info->this_loop; + if ((error= _ma_state_info_write_sub(share->kfile.file, + &share->state, 1))) + olderror=my_errno; +#ifdef __WIN__ + if (maria_flush) + { + _commit(share->kfile.file); + _commit(info->dfile.file); + } +#endif + my_errno=olderror; + } + } + else if (operation) + share->changed= 1; /* Mark keyfile changed */ + DBUG_RETURN(error); +} /* _ma_writeinfo */ + + + /* Test if someone has changed the database */ + /* (Should be called after readinfo) */ + +int _ma_test_if_changed(register MARIA_HA *info) +{ + MARIA_SHARE *share=info->s; + if (share->state.process != share->last_process || + share->state.unique != info->last_unique || + share->state.update_count != info->last_loop) + { /* Keyfile has changed */ + DBUG_PRINT("info",("index file changed")); + if (share->state.process != share->this_process) + VOID(flush_pagecache_blocks(share->pagecache, &share->kfile, + FLUSH_RELEASE)); + share->last_process=share->state.process; + info->last_unique= share->state.unique; + info->last_loop= share->state.update_count; + info->update|= HA_STATE_WRITTEN; /* Must use file on next */ + info->data_changed= 1; /* For maria_is_changed */ + return 1; + } + return (!(info->update & HA_STATE_AKTIV) || + (info->update & (HA_STATE_WRITTEN | HA_STATE_DELETED | + HA_STATE_KEY_CHANGED))); +} /* _ma_test_if_changed */ + + +/* + Put 
a mark in the .MYI file that someone is updating the table + + + DOCUMENTATION + + state.open_count in the .MYI file is used the following way: + - For the first change of the .MYI file in this process open_count is + incremented by _ma_mark_file_changed(). (We have a write lock on the file + when this happens) + - In maria_close() it's decremented by _ma_decrement_open_count() if it + was incremented in the same process. + + This mean that if we are the only process using the file, the open_count + tells us if the MARIA file wasn't properly closed. (This is true if + my_disable_locking is set). + + open_count is not maintained on disk for transactional or temporary tables. +*/ + + +int _ma_mark_file_changed(MARIA_HA *info) +{ + char buff[3]; + register MARIA_SHARE *share=info->s; + DBUG_ENTER("_ma_mark_file_changed"); + + if (!(share->state.changed & STATE_CHANGED) || ! share->global_changed) + { + share->state.changed|=(STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS); + if (!share->global_changed) + { + share->global_changed=1; + share->state.open_count++; + } + /* + temp tables don't need an open_count as they are removed on crash; + transactional tables are fixed by log-based recovery, so don't need an + open_count either (and we thus avoid the disk write below). + */ + if (!(share->temporary | share->base.born_transactional)) + { + mi_int2store(buff,share->state.open_count); + buff[2]=1; /* Mark that it's changed */ + DBUG_RETURN(my_pwrite(share->kfile.file, buff, sizeof(buff), + sizeof(share->state.header), + MYF(MY_NABP))); + } + } + DBUG_RETURN(0); +} + + +/* + This is only called by close or by extra(HA_FLUSH) if the OS has the pwrite() + call. In these context the following code should be safe! 
+ */ + +int _ma_decrement_open_count(MARIA_HA *info) +{ + char buff[2]; + register MARIA_SHARE *share=info->s; + int lock_error=0,write_error=0; + if (share->global_changed) + { + uint old_lock=info->lock_type; + share->global_changed=0; + lock_error=maria_lock_database(info,F_WRLCK); + /* Its not fatal even if we couldn't get the lock ! */ + if (share->state.open_count > 0) + { + share->state.open_count--; + if (!(share->temporary | share->base.born_transactional)) + { + mi_int2store(buff,share->state.open_count); + write_error= my_pwrite(share->kfile.file, buff, sizeof(buff), + sizeof(share->state.header), + MYF(MY_NABP)); + } + } + if (!lock_error) + lock_error=maria_lock_database(info,old_lock); + } + return test(lock_error || write_error); +} diff --git a/storage/maria/ma_loghandler.c b/storage/maria/ma_loghandler.c new file mode 100644 index 00000000000..f3c90ceb1f5 --- /dev/null +++ b/storage/maria/ma_loghandler.c @@ -0,0 +1,6778 @@ +/* Copyright (C) 2007 MySQL AB & Sanja Belkin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#include "ma_blockrec.h" +#include "trnman.h" + +/** + @file + @brief Module which writes and reads to a transaction log + + @todo LOG: in functions where the log's lock is required, a + translog_assert_owner() could be added. 
+*/ + +/* number of opened log files in the pagecache (should be at least 2) */ +#define OPENED_FILES_NUM 3 + +/* records buffer size (should be LOG_PAGE_SIZE * n) */ +#define TRANSLOG_WRITE_BUFFER (1024*1024) +/* min chunk length */ +#define TRANSLOG_MIN_CHUNK 3 +/* + Number of buffers used by loghandler + + Should be at least 4, because one thread can block up to 2 buffers in + normal circumstances (less then half of one and full other, or just + switched one and other), But if we met end of the file in the middle and + have to switch buffer it will be 3. + 1 buffer for flushing/writing. + We have a bigger number here for higher concurrency. +*/ +#define TRANSLOG_BUFFERS_NO 5 +/* number of bytes (+ header) which can be unused on first page in sequence */ +#define TRANSLOG_MINCHUNK_CONTENT 1 +/* version of log file */ +#define TRANSLOG_VERSION_ID 10000 /* 1.00.00 */ + +#define TRANSLOG_PAGE_FLAGS 6 /* transaction log page flags offset */ + +/* QQ: For temporary debugging */ +#define UNRECOVERABLE_ERROR(E) \ + do { \ + DBUG_PRINT("error", E); \ + printf E; \ + putchar('\n'); \ + } while(0); + +/* Maximum length of compressed LSNs (the worst case of whole LSN storing) */ +#define COMPRESSED_LSN_MAX_STORE_SIZE (2 + LSN_STORE_SIZE) +#define MAX_NUMBER_OF_LSNS_PER_RECORD 2 + +/* log write buffer descriptor */ +struct st_translog_buffer +{ + LSN last_lsn; + /* This buffer offset in the file */ + TRANSLOG_ADDRESS offset; + /* + Next buffer offset in the file (it is not always offset + size, + in case of flush by LSN it can be offset + size - TRANSLOG_PAGE_SIZE) + */ + TRANSLOG_ADDRESS next_buffer_offset; + /* + How much written (or will be written when copy_to_buffer_in_progress + become 0) to this buffer + */ + translog_size_t size; + /* File handler for this buffer */ + File file; + /* Threads which are waiting for buffer filling/freeing */ + WQUEUE waiting_filling_buffer; + /* Number of record which are in copy progress */ + uint copy_to_buffer_in_progress; + /* list 
of waiting buffer ready threads */ + struct st_my_thread_var *waiting_flush; + struct st_translog_buffer *overlay; +#ifndef DBUG_OFF + uint buffer_no; +#endif + /* lock for the buffer. Current buffer also lock the handler */ + pthread_mutex_t mutex; + /* IO cache for current log */ + uchar buffer[TRANSLOG_WRITE_BUFFER]; +}; + + +struct st_buffer_cursor +{ + /* pointer on the buffer */ + uchar *ptr; + /* current buffer */ + struct st_translog_buffer *buffer; + /* current page fill */ + uint16 current_page_fill; + /* how many times we finish this page to write it */ + uint16 write_counter; + /* previous write offset */ + uint16 previous_offset; + /* Number of current buffer */ + uint8 buffer_no; + my_bool chaser, protected; +}; + + +struct st_translog_descriptor +{ + /* *** Parameters of the log handler *** */ + + /* Page cache for the log reads */ + PAGECACHE *pagecache; + /* Flags */ + uint flags; + /* max size of one log size (for new logs creation) */ + uint32 log_file_max_size; + /* server version */ + uint32 server_version; + /* server ID */ + uint32 server_id; + /* Loghandler's buffer capacity in case of chunk 2 filling */ + uint32 buffer_capacity_chunk_2; + /* Half of the buffer capacity in case of chunk 2 filling */ + uint32 half_buffer_capacity_chunk_2; + /* Page overhead calculated by flags */ + uint16 page_overhead; + /* Page capacity calculated by flags (TRANSLOG_PAGE_SIZE-page_overhead-1) */ + uint16 page_capacity_chunk_2; + /* Directory to store files */ + char directory[FN_REFLEN]; + + /* *** Current state of the log handler *** */ + /* Current and (OPENED_FILES_NUM-1) last logs number in page cache */ + File log_file_num[OPENED_FILES_NUM]; + File directory_fd; + /* buffers for log writing */ + struct st_translog_buffer buffers[TRANSLOG_BUFFERS_NO]; + /* + horizon - visible end of the log (here is absolute end of the log: + position where next chunk can start + */ + TRANSLOG_ADDRESS horizon; + /* horizon buffer cursor */ + struct st_buffer_cursor bc; 
+ /* maximum LSN of the current (not finished) file */ + LSN max_lsn; + + /* Last flushed LSN */ + LSN flushed; + /* Last LSN sent to the disk (but maybe not written yet) */ + LSN sent_to_file; + /* All what is after this address is not sent to disk yet */ + TRANSLOG_ADDRESS in_buffers_only; + pthread_mutex_t sent_to_file_lock; + pthread_mutex_t log_flush_lock; + + /* Protects changing of headers of finished files (max_lsn) */ + pthread_mutex_t file_header_lock; + + /* + Sorted array (with protection) of files where we started writing process + and so we can't give last LSN yet + */ + pthread_mutex_t unfinished_files_lock; + DYNAMIC_ARRAY unfinished_files; + + /* Purger data: minimum file in the log (or 0 if unknown) */ + uint32 min_file_number; + /* Protect purger from many calls and it's data */ + pthread_mutex_t purger_lock; + /* last low water mark checked */ + LSN last_lsn_checked; +}; + +static struct st_translog_descriptor log_descriptor; + +/* Marker for end of log */ +static uchar end_of_log= 0; + +my_bool translog_inited= 0; + +/* chunk types */ +#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */ +#define TRANSLOG_CHUNK_FIXED (1 << 6) /* 1 (pseudo)fixed record (also LSN) */ +#define TRANSLOG_CHUNK_NOHDR (2 << 6) /* 2 no head chunk (till page end) */ +#define TRANSLOG_CHUNK_LNGTH (3 << 6) /* 3 chunk with chunk length */ +#define TRANSLOG_CHUNK_TYPE (3 << 6) /* Mask to get chunk type */ +#define TRANSLOG_REC_TYPE 0x3F /* Mask to get record type */ + +/* compressed (relative) LSN constants */ +#define TRANSLOG_CLSN_LEN_BITS 0xC0 /* Mask to get compressed LSN length */ + + + +#include <my_atomic.h> +/* an array that maps id of a MARIA_SHARE to this MARIA_SHARE */ +static MARIA_SHARE **id_to_share= NULL; +/* lock for id_to_share */ +static my_atomic_rwlock_t LOCK_id_to_share; + +static my_bool write_hook_for_redo(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + struct st_translog_parts *parts); +static my_bool 
write_hook_for_undo(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + struct st_translog_parts *parts); +static my_bool write_hook_for_redo_delete_all(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, + struct st_translog_parts *parts); +static my_bool write_hook_for_undo_row_insert(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, + struct st_translog_parts *parts); +static my_bool write_hook_for_undo_row_delete(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, + struct st_translog_parts *parts); +static my_bool write_hook_for_clr_end(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + struct st_translog_parts *parts); +static my_bool write_hook_for_file_id(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + struct st_translog_parts *parts); + +static my_bool translog_page_validator(uchar *page_addr, uchar* data_ptr); + +/* + Initialize log_record_type_descriptors + + NOTE that after first public Maria release, these can NOT be changed +*/ + +LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES]; + + +#ifndef DBUG_OFF +/** + @brief check the description table validity + + @param num how many records should be filled +*/ + +static void check_translog_description_table(int num) +{ + int i; + DBUG_ENTER("check_translog_description_table"); + DBUG_PRINT("enter", ("last record: %d", num)); + DBUG_ASSERT(num > 0); + /* last is reserved for extending the table */ + DBUG_ASSERT(num < LOGREC_NUMBER_OF_TYPES - 1); + DBUG_PRINT("info", ("records number: OK")); + DBUG_PRINT("info", + ("record type: %d class: %d fixed: %u header: %u LSNs: %u " + "name: %s", + 0, + log_record_type_descriptor[0].class, + (uint)log_record_type_descriptor[0].fixed_length, + (uint)log_record_type_descriptor[0].read_header_len, + (uint)log_record_type_descriptor[0].compressed_LSN, + log_record_type_descriptor[0].name)); + 
DBUG_ASSERT(log_record_type_descriptor[0].class == LOGRECTYPE_NOT_ALLOWED); + DBUG_PRINT("info", ("record type 0: OK")); + for (i= 1; i <= num; i++) + { + DBUG_PRINT("info", + ("record type: %d class: %d fixed: %u header: %u LSNs: %u " + "name: %s", + i, log_record_type_descriptor[i].class, + (uint)log_record_type_descriptor[i].fixed_length, + (uint)log_record_type_descriptor[i].read_header_len, + (uint)log_record_type_descriptor[i].compressed_LSN, + log_record_type_descriptor[i].name)); + switch (log_record_type_descriptor[i].class) { + case LOGRECTYPE_NOT_ALLOWED: + DBUG_ASSERT(0); + break; + case LOGRECTYPE_VARIABLE_LENGTH: + DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == 0); + DBUG_ASSERT((log_record_type_descriptor[i].compressed_LSN == 0) || + ((log_record_type_descriptor[i].compressed_LSN == 1) && + (log_record_type_descriptor[i].read_header_len >= + LSN_STORE_SIZE)) || + ((log_record_type_descriptor[i].compressed_LSN == 2) && + (log_record_type_descriptor[i].read_header_len >= + LSN_STORE_SIZE * 2))); + break; + case LOGRECTYPE_PSEUDOFIXEDLENGTH: + DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == + log_record_type_descriptor[i].read_header_len); + DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN > 0); + DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN <= 2); + break; + case LOGRECTYPE_FIXEDLENGTH: + DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == + log_record_type_descriptor[i].read_header_len); + DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN == 0); + break; + default: + DBUG_ASSERT(0); + } + DBUG_PRINT("info", ("record type %d: OK", i)); + } + DBUG_PRINT("info", ("All filled records are OK")); + for (i= num + 1; i < LOGREC_NUMBER_OF_TYPES; i++) + { + DBUG_ASSERT(log_record_type_descriptor[i].class == LOGRECTYPE_NOT_ALLOWED); + DBUG_PRINT("info", ("record type %d: OK", i)); + } + DBUG_VOID_RETURN; +} +#endif + +static LOG_DESC INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE= +{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, 
NULL, NULL, 0, + "fixed0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, NULL, NULL, 0, +"variable0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 7, 7, NULL, NULL, NULL, 1, +"fixed1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 12, NULL, NULL, NULL, 1, +"variable1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 23, 23, NULL, NULL, NULL, 2, +"fixed2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 19, NULL, NULL, NULL, 2, +"variable2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + + +void example_loghandler_init() +{ + int i; + log_record_type_descriptor[LOGREC_FIXED_RECORD_0LSN_EXAMPLE]= + INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE]= + INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_FIXED_RECORD_1LSN_EXAMPLE]= + INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE]= + INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_FIXED_RECORD_2LSN_EXAMPLE]= + INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE]= + INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE; + for (i= LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE + 1; + i < LOGREC_NUMBER_OF_TYPES; + i++) + log_record_type_descriptor[i].class= LOGRECTYPE_NOT_ALLOWED; + DBUG_EXECUTE("info", + check_translog_description_table(LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE);); +} + + +static LOG_DESC INIT_LOGREC_RESERVED_FOR_CHUNKS23= +{LOGRECTYPE_NOT_ALLOWED, 0, 0, 
NULL, NULL, NULL, 0, + "reserved", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL }; + +static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_HEAD= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_insert_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_TAIL= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_insert_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +/** @todo RECOVERY BUG unused, remove? */ +static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_BLOB= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 8, NULL, write_hook_for_redo, NULL, 0, + "redo_insert_row_blob", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +/** @todo RECOVERY BUG handle it in recovery */ +/*QQQ:TODO:header???*/ +static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_BLOBS= +{LOGRECTYPE_VARIABLE_LENGTH, 0, FILEID_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_insert_row_blobs", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_HEAD= +{LOGRECTYPE_FIXEDLENGTH, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_redo, NULL, 0, + "redo_purge_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_TAIL= +{LOGRECTYPE_FIXEDLENGTH, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_redo, NULL, 0, + "redo_purge_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_PURGE_BLOCKS= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE, + NULL, write_hook_for_redo, NULL, 0, + "redo_purge_blocks", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +/* not yet used; for when we have versioning */ +static LOG_DESC 
INIT_LOGREC_REDO_DELETE_ROW= +{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0, + "redo_delete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +/** @todo RECOVERY BUG unused, remove? */ +static LOG_DESC INIT_LOGREC_REDO_UPDATE_ROW_HEAD= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0, + "redo_update_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INDEX= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0, + "redo_index", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_UNDELETE_ROW= +{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0, + "redo_undelete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_CLR_END= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, LSN_STORE_SIZE + FILEID_STORE_SIZE + 1, + LSN_STORE_SIZE + FILEID_STORE_SIZE + 1, NULL, write_hook_for_clr_end, NULL, 1, + "clr_end", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_PURGE_END= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, NULL, NULL, 1, + "purge_end", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_ROW_INSERT= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_undo_row_insert, NULL, 1, + "undo_row_insert", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_ROW_DELETE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_undo_row_delete, NULL, 1, + "undo_row_delete", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_ROW_UPDATE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_undo, NULL, 1, + "undo_row_update", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static 
LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 10, NULL, write_hook_for_undo, NULL, 1, + "undo_key_insert", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 15, NULL, write_hook_for_undo, NULL, 1, + "undo_key_delete", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_PREPARE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "prepare", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_PREPARE_WITH_UNDO_PURGE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE, NULL, NULL, NULL, 1, + "prepare_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_COMMIT= +{LOGRECTYPE_FIXEDLENGTH, 0, 0, NULL, + NULL, NULL, 0, "commit", LOGREC_IS_GROUP_ITSELF, NULL, + NULL}; + +static LOG_DESC INIT_LOGREC_COMMIT_WITH_UNDO_PURGE= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, NULL, NULL, 1, + "commit_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_CHECKPOINT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "checkpoint", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_CREATE_TABLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 1 + 2, NULL, NULL, NULL, 0, +"redo_create_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_RENAME_TABLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "redo_rename_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_DROP_TABLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "redo_drop_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_DELETE_ALL= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE, + NULL, write_hook_for_redo_delete_all, NULL, 0, + "redo_delete_all", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_REPAIR_TABLE= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + 4, FILEID_STORE_SIZE + 4, 
+ NULL, NULL, NULL, 0, + "redo_repair_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_FILE_ID= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 2, NULL, write_hook_for_file_id, NULL, 0, + "file_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_LONG_TRANSACTION_ID= +{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0, + "long_transaction_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +const myf log_write_flags= MY_WME | MY_NABP | MY_WAIT_IF_FULL; + +static void loghandler_init() +{ + int i; + log_record_type_descriptor[LOGREC_RESERVED_FOR_CHUNKS23]= + INIT_LOGREC_RESERVED_FOR_CHUNKS23; + log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_HEAD]= + INIT_LOGREC_REDO_INSERT_ROW_HEAD; + log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_TAIL]= + INIT_LOGREC_REDO_INSERT_ROW_TAIL; + log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_BLOB]= + INIT_LOGREC_REDO_INSERT_ROW_BLOB; + log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_BLOBS]= + INIT_LOGREC_REDO_INSERT_ROW_BLOBS; + log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_HEAD]= + INIT_LOGREC_REDO_PURGE_ROW_HEAD; + log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_TAIL]= + INIT_LOGREC_REDO_PURGE_ROW_TAIL; + log_record_type_descriptor[LOGREC_REDO_PURGE_BLOCKS]= + INIT_LOGREC_REDO_PURGE_BLOCKS; + log_record_type_descriptor[LOGREC_REDO_DELETE_ROW]= + INIT_LOGREC_REDO_DELETE_ROW; + log_record_type_descriptor[LOGREC_REDO_UPDATE_ROW_HEAD]= + INIT_LOGREC_REDO_UPDATE_ROW_HEAD; + log_record_type_descriptor[LOGREC_REDO_INDEX]= + INIT_LOGREC_REDO_INDEX; + log_record_type_descriptor[LOGREC_REDO_UNDELETE_ROW]= + INIT_LOGREC_REDO_UNDELETE_ROW; + log_record_type_descriptor[LOGREC_CLR_END]= + INIT_LOGREC_CLR_END; + log_record_type_descriptor[LOGREC_PURGE_END]= + INIT_LOGREC_PURGE_END; + log_record_type_descriptor[LOGREC_UNDO_ROW_INSERT]= + INIT_LOGREC_UNDO_ROW_INSERT; + log_record_type_descriptor[LOGREC_UNDO_ROW_DELETE]= + INIT_LOGREC_UNDO_ROW_DELETE; + log_record_type_descriptor[LOGREC_UNDO_ROW_UPDATE]= + 
INIT_LOGREC_UNDO_ROW_UPDATE; + log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT]= + INIT_LOGREC_UNDO_KEY_INSERT; + log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE]= + INIT_LOGREC_UNDO_KEY_DELETE; + log_record_type_descriptor[LOGREC_PREPARE]= + INIT_LOGREC_PREPARE; + log_record_type_descriptor[LOGREC_PREPARE_WITH_UNDO_PURGE]= + INIT_LOGREC_PREPARE_WITH_UNDO_PURGE; + log_record_type_descriptor[LOGREC_COMMIT]= + INIT_LOGREC_COMMIT; + log_record_type_descriptor[LOGREC_COMMIT_WITH_UNDO_PURGE]= + INIT_LOGREC_COMMIT_WITH_UNDO_PURGE; + log_record_type_descriptor[LOGREC_CHECKPOINT]= + INIT_LOGREC_CHECKPOINT; + log_record_type_descriptor[LOGREC_REDO_CREATE_TABLE]= + INIT_LOGREC_REDO_CREATE_TABLE; + log_record_type_descriptor[LOGREC_REDO_RENAME_TABLE]= + INIT_LOGREC_REDO_RENAME_TABLE; + log_record_type_descriptor[LOGREC_REDO_DROP_TABLE]= + INIT_LOGREC_REDO_DROP_TABLE; + log_record_type_descriptor[LOGREC_REDO_DELETE_ALL]= + INIT_LOGREC_REDO_DELETE_ALL; + log_record_type_descriptor[LOGREC_REDO_REPAIR_TABLE]= + INIT_LOGREC_REDO_REPAIR_TABLE; + log_record_type_descriptor[LOGREC_FILE_ID]= + INIT_LOGREC_FILE_ID; + log_record_type_descriptor[LOGREC_LONG_TRANSACTION_ID]= + INIT_LOGREC_LONG_TRANSACTION_ID; + for (i= LOGREC_LONG_TRANSACTION_ID + 1; /* every type slot after the last initialized one is marked as not allowed */ + i < LOGREC_NUMBER_OF_TYPES; + i++) + log_record_type_descriptor[i].class= LOGRECTYPE_NOT_ALLOWED; + DBUG_EXECUTE("info", + check_translog_description_table(LOGREC_LONG_TRANSACTION_ID);); +} /* loghandler_init; no ';' here, a function definition is not followed by one */ + + +/* all possible flags page overheads */ +static uint page_overhead[TRANSLOG_FLAGS_NUM]; + +typedef struct st_translog_validator_data +{ + TRANSLOG_ADDRESS *addr; + my_bool was_recovered; +} TRANSLOG_VALIDATOR_DATA; + + +const char *maria_data_root; + + +/* + Check cursor/buffer consistence + + SYNOPSIS + translog_check_cursor + cursor cursor which will be checked +*/ + +#ifndef DBUG_OFF +static void translog_check_cursor(struct st_buffer_cursor *cursor) +{ + DBUG_ASSERT(cursor->chaser || + ((ulong) (cursor->ptr - cursor->buffer->buffer) == + 
cursor->buffer->size)); + DBUG_ASSERT(cursor->buffer->buffer_no == cursor->buffer_no); + DBUG_ASSERT((cursor->ptr -cursor->buffer->buffer) %TRANSLOG_PAGE_SIZE == + cursor->current_page_fill % TRANSLOG_PAGE_SIZE); + DBUG_ASSERT(cursor->current_page_fill <= TRANSLOG_PAGE_SIZE); +} +#endif + +/* + Get file name of the log by log number + + SYNOPSIS + translog_filename_by_fileno() + file_no Number of the log we want to open + path Pointer to buffer where file name will be + stored (must be FN_REFLEN bytes at least + RETURN + pointer to path +*/ + +static char *translog_filename_by_fileno(uint32 file_no, char *path) +{ + char file_name[10 + 8 + 1]; /* See fallowing my_sprintf() call */ + char *res; + DBUG_ENTER("translog_filename_by_fileno"); + DBUG_ASSERT(file_no <= 0xfffffff); + my_sprintf(file_name, (file_name, "maria_log.%08u", file_no)); + res= fn_format(path, file_name, log_descriptor.directory, "", MYF(MY_WME)); + DBUG_PRINT("info", ("Path: '%s' path: 0x%lx res: 0x%lx", + res, (ulong) path, (ulong) res)); + DBUG_RETURN(res); +} + + +/* + Open log file with given number without cache + + SYNOPSIS + open_logfile_by_number_no_cache() + file_no Number of the log we want to open + + RETURN + -1 error + # file descriptor number +*/ + +static File open_logfile_by_number_no_cache(uint32 file_no) +{ + File file; + char path[FN_REFLEN]; + DBUG_ENTER("open_logfile_by_number_no_cache"); + + /* TODO: add O_DIRECT to open flags (when buffer is aligned) */ + /* TODO: use my_create() */ + if ((file= my_open(translog_filename_by_fileno(file_no, path), + O_CREAT | O_BINARY | O_RDWR, + MYF(MY_WME))) < 0) + { + UNRECOVERABLE_ERROR(("Error %d during opening file '%s'", errno, path)); + DBUG_RETURN(-1); + } + DBUG_PRINT("info", ("File: '%s' handler: %d", path, file)); + DBUG_RETURN(file); +} + + +/* + Write log file page header in the just opened new log file + + SYNOPSIS + translog_write_file_header(); + + NOTES + First page is just a marker page; We don't store any real log data in 
it. + + RETURN + 0 OK + 1 ERROR +*/ + +uchar NEAR maria_trans_file_magic[]= +{ (uchar) 254, (uchar) 254, (uchar) 11, '\001', 'M', 'A', 'R', 'I', 'A', + 'L', 'O', 'G' }; + +static my_bool translog_write_file_header() +{ + ulonglong timestamp; + uchar page_buff[TRANSLOG_PAGE_SIZE], *page= page_buff; + DBUG_ENTER("translog_write_file_header"); + + /* file tag */ + memcpy(page, maria_trans_file_magic, sizeof(maria_trans_file_magic)); + page+= sizeof(maria_trans_file_magic); + /* timestamp */ + timestamp= my_getsystime(); + int8store(page, timestamp); + page+= 8; + /* maria version */ + int4store(page, TRANSLOG_VERSION_ID); + page+= 4; + /* mysql version (MYSQL_VERSION_ID) */ + int4store(page, log_descriptor.server_version); + page+= 4; + /* server ID */ + int4store(page, log_descriptor.server_id); + page+= 4; + /* loghandler page_size/DISK_DRIVE_SECTOR_SIZE */ + int2store(page, TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE); + page+= 2; + /* file number */ + int3store(page, LSN_FILE_NO(log_descriptor.horizon)); + page+= 3; + /* + Here should be max lsn storing for current file (which is LSN_IPOSSIBLE): + lsn_store(page, LSN_IPOSSIBLE); + page+= LSN_STORE_SIZE; + But it is zeros so we can rely on bzero() in this case + */ + bzero(page, sizeof(page_buff) - (page- page_buff)); + + DBUG_RETURN(my_pwrite(log_descriptor.log_file_num[0], page_buff, + sizeof(page_buff), 0, log_write_flags) != 0); +} + +/* + @brief write the new LSN on the given file header + + @param file The file descriptor + @param lsn That LSN which should be written + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_max_lsn_to_header(File file, LSN lsn) +{ + uchar lsn_buff[LSN_STORE_SIZE]; + DBUG_ENTER("translog_max_lsn_to_header"); + DBUG_PRINT("enter", ("File descriptor: %ld " + "lsn: (%lu,0x%lx)", + (long) file, + LSN_IN_PARTS(lsn))); + + lsn_store(lsn_buff, lsn); + + DBUG_RETURN(my_pwrite(file, lsn_buff, + LSN_STORE_SIZE, + (sizeof(maria_trans_file_magic) + + 8 + 4 + 4 + 4 + 2 + 3), + 
log_write_flags) != 0 || + my_sync(file, MYF(MY_WME)) != 0); +} + + +/* + Information from transaction log file header +*/ + +typedef struct st_loghandler_file_info +{ + /* + LSN_IMPOSSIBLE for current file and max LSN which parts stored in the + file for all other (finished) files. + */ + LSN max_lsn; + ulonglong timestamp; /* Time stamp */ + ulong maria_version; /* Version of maria loghandler */ + ulong mysql_versiob; /* Version of mysql server */ + ulong server_id; /* Server ID */ + uint page_size; /* Loghandler page size */ + uint file_number; /* Number of the file (from the file header) */ +} LOGHANDLER_FILE_INFO; + +/* + @brief Read hander file information from loghandler file + + @param desc header information descriptor to be filled with information + @param file file descriptor to read + + @retval 0 OK + @retval 1 Error +*/ + +#define LOG_HEADER_DATA_SIZE (sizeof(maria_trans_file_magic) + \ + 8 + 4 + 4 + 4 + 2 + 3 + \ + LSN_STORE_SIZE) + +my_bool translog_read_file_header(LOGHANDLER_FILE_INFO *desc, File file) +{ + uchar page_buff[LOG_HEADER_DATA_SIZE], *ptr; + DBUG_ENTER("translog_read_file_header"); + + if (my_pread(file, page_buff, + sizeof(page_buff), 0, MYF(MY_FNABP | MY_WME))) + { + DBUG_PRINT("info", ("log read fail error: %d", my_errno)); + DBUG_RETURN(1); + } + ptr= page_buff + sizeof(maria_trans_file_magic); + desc->timestamp= uint8korr(ptr); + ptr+= 8; + desc->maria_version= uint4korr(ptr); + ptr+= 4; + desc->mysql_versiob= uint4korr(ptr); + ptr+= 4; + desc->server_id= uint4korr(ptr); + ptr+= 4; + desc->page_size= uint2korr(ptr); + ptr+= 2; + desc->file_number= uint3korr(ptr); + ptr+=3; + desc->max_lsn= lsn_korr(ptr); + DBUG_RETURN(0); +} + + +/* + @brief set the lsn to the files from_file - to_file if it is greater + then written in the file + + @param from_file first file number (min) + @param to_file last file number (max) + @param lsn the lsn for writing + @param is_locked true if current thread locked the log handler + + @retval 0 OK + 
@retval 1 Error +*/ + +static my_bool translog_set_lsn_for_files(uint32 from_file, uint32 to_file, + LSN lsn, my_bool is_locked) +{ + uint32 file; + DBUG_ENTER("translog_set_lsn_for_files"); + DBUG_PRINT("enter", ("From: %lu to: %lu lsn: (%lu,0x%lx) locked: %d", + (ulong) from_file, (ulong) to_file, + LSN_IN_PARTS(lsn), + is_locked)); + DBUG_ASSERT(from_file <= to_file); + DBUG_ASSERT(from_file > 0); /* we have not file 0 */ + + /* Checks the current file (not finished yet file) */ + if (!is_locked) + translog_lock(); + if (to_file == (uint32) LSN_FILE_NO(log_descriptor.horizon)) + { + if (likely(cmp_translog_addr(lsn, log_descriptor.max_lsn) > 0)) + log_descriptor.max_lsn= lsn; + to_file--; + } + if (!is_locked) + translog_unlock(); + + /* Checks finished files if they are */ + pthread_mutex_lock(&log_descriptor.file_header_lock); + for (file= from_file; file <= to_file; file++) + { + LOGHANDLER_FILE_INFO info; + File fd= open_logfile_by_number_no_cache(file); + if (fd < 0 || + translog_read_file_header(&info, fd) || + (cmp_translog_addr(lsn, info.max_lsn) > 0 && + translog_max_lsn_to_header(fd, lsn))) + DBUG_RETURN(1); + } + pthread_mutex_unlock(&log_descriptor.file_header_lock); + + DBUG_RETURN(0); +} + + +/* descriptor of file in unfinished_files */ +struct st_file_counter +{ + uint32 file; /* file number */ + uint32 counter; /* counter for started writes */ +}; + + +/* + @brief mark file "in progress" (for multi-group records) + + @param file log file number +*/ + +static void translog_mark_file_unfinished(uint32 file) +{ + int place, i; + struct st_file_counter fc, *fc_ptr; + fc.file= file; fc.counter= 1; + + DBUG_ENTER("translog_mark_file_unfinished"); + DBUG_PRINT("enter", ("file: %lu", (ulong) file)); + + pthread_mutex_lock(&log_descriptor.unfinished_files_lock); + + if (log_descriptor.unfinished_files.elements == 0) + { + insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc); + DBUG_PRINT("info", ("The first element inserted")); + goto end; + } + 
+ for (place= log_descriptor.unfinished_files.elements - 1; + place >= 0; + place--) + { + fc_ptr= dynamic_element(&log_descriptor.unfinished_files, + place, struct st_file_counter *); + if (fc_ptr->file <= file) + break; + } + + if (place >= 0 && fc_ptr->file == file) + { + fc_ptr->counter++; + DBUG_PRINT("info", ("counter increased")); + goto end; + } + + if (place == (int)log_descriptor.unfinished_files.elements) + { + insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc); + DBUG_PRINT("info", ("The last element inserted")); + goto end; + } + /* shift and assign new element */ + insert_dynamic(&log_descriptor.unfinished_files, + (uchar*) + dynamic_element(&log_descriptor.unfinished_files, + log_descriptor.unfinished_files.elements- 1, + struct st_file_counter *)); + for(i= log_descriptor.unfinished_files.elements - 1; i > place; i--) + { + /* we do not use set_dynamic() to avoid unneeded checks */ + memcpy(dynamic_element(&log_descriptor.unfinished_files, + i, struct st_file_counter *), + dynamic_element(&log_descriptor.unfinished_files, + i + 1, struct st_file_counter *), + sizeof(struct st_file_counter)); + } + memcpy(dynamic_element(&log_descriptor.unfinished_files, + place + 1, struct st_file_counter *), + &fc, sizeof(struct st_file_counter)); +end: + pthread_mutex_unlock(&log_descriptor.unfinished_files_lock); + DBUG_VOID_RETURN; +} + + + +/* + @brief remove file mark "in progress" (for multi-group records) + + @param file log file number +*/ + +static void translog_mark_file_finished(uint32 file) +{ + int i; + struct st_file_counter *fc_ptr; + + DBUG_ENTER("translog_mark_file_finished"); + DBUG_PRINT("enter", ("file: %lu", (ulong) file)); + + pthread_mutex_lock(&log_descriptor.unfinished_files_lock); + + DBUG_ASSERT(log_descriptor.unfinished_files.elements > 0); + for (i= 0; + i < (int) log_descriptor.unfinished_files.elements; + i++) + { + fc_ptr= dynamic_element(&log_descriptor.unfinished_files, + i, struct st_file_counter *); + if (fc_ptr->file 
== file) + { + break; + } + } + DBUG_ASSERT(i < (int) log_descriptor.unfinished_files.elements); + + if (! --fc_ptr->counter) + delete_dynamic_element(&log_descriptor.unfinished_files, i); + pthread_mutex_unlock(&log_descriptor.unfinished_files_lock); + DBUG_VOID_RETURN; +} + + +/* + @brief get max LSN of the record which parts stored in this file + + @param file file number + + @return requested LSN or LSN_IMPOSSIBLE/LSN_ERROR + @retval LSN_IMPOSSIBLE File is still not finished + @retval LSN_ERROR Error opening file + @retval # LSN of the record which parts stored in this file +*/ + +LSN translog_get_file_max_lsn_stored(uint32 file) +{ + uint32 limit= FILENO_IMPOSSIBLE; + DBUG_ENTER("translog_get_file_max_lsn_stored"); + DBUG_PRINT("enter", ("file: %lu", (ulong)file)); + DBUG_ASSERT(translog_inited == 1); + + pthread_mutex_lock(&log_descriptor.unfinished_files_lock); + + /* find file with minimum file number "in progress" */ + if (log_descriptor.unfinished_files.elements > 0) + { + struct st_file_counter *fc_ptr; + fc_ptr= dynamic_element(&log_descriptor.unfinished_files, + 0, struct st_file_counter *); + limit= fc_ptr->file; /* minimal file number "in progress" */ + } + pthread_mutex_unlock(&log_descriptor.unfinished_files_lock); + + /* + if there is no "in progress file" then unfinished file is in progress + for sure + */ + if (limit == FILENO_IMPOSSIBLE) + { + TRANSLOG_ADDRESS horizon= translog_get_horizon(); + limit= LSN_FILE_NO(horizon); + } + + if (file >= limit) + { + DBUG_PRINT("info", ("The file in in progress")); + DBUG_RETURN(LSN_IMPOSSIBLE); + } + + { + LOGHANDLER_FILE_INFO info; + File fd= open_logfile_by_number_no_cache(file); + if (fd < 0 || + translog_read_file_header(&info, fd)) + { + DBUG_PRINT("error", ("Can't read file header")); + DBUG_RETURN(LSN_ERROR); + } + DBUG_PRINT("error", ("Max lsn: (%lu,0x%lx)", + LSN_IN_PARTS(info.max_lsn))); + DBUG_RETURN(info.max_lsn); + } +} + +/* + Initialize transaction log file buffer + + SYNOPSIS + 
translog_buffer_init() + buffer The buffer to initialize + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_buffer_init(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_init"); + buffer->last_lsn= LSN_IMPOSSIBLE; + /* This Buffer File */ + buffer->file= -1; + buffer->overlay= 0; + /* IO cache for current log */ + bzero(buffer->buffer, TRANSLOG_WRITE_BUFFER); + /* Buffer size */ + buffer->size= 0; + /* cond of thread which is waiting for buffer filling */ + buffer->waiting_filling_buffer.last_thread= 0; + /* Number of record which are in copy progress */ + buffer->copy_to_buffer_in_progress= 0; + /* list of waiting buffer ready threads */ + buffer->waiting_flush= 0; + /* lock for the buffer. Current buffer also lock the handler */ + if (pthread_mutex_init(&buffer->mutex, MY_MUTEX_INIT_FAST)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/* + Close transaction log file by descriptor + + SYNOPSIS + translog_close_log_file() + file file descriptor + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_close_log_file(File file) +{ + int rc; + PAGECACHE_FILE fl; + fl.file= file; + flush_pagecache_blocks(log_descriptor.pagecache, &fl, FLUSH_RELEASE); + /* + Sync file when we close it + TODO: sync only we have changed the log + */ + rc= my_sync(file, MYF(MY_WME)); + rc|= my_close(file, MYF(MY_WME)); + return test(rc); +} + + +/* + Create and fill header of new file + + SYNOPSIS + translog_create_new_file() + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_create_new_file() +{ + int i; + uint32 file_no= LSN_FILE_NO(log_descriptor.horizon); + DBUG_ENTER("translog_create_new_file"); + + /* + Writes max_lsn to the file header before finishing it (it is no need to + lock file header buffer because it is still unfinished file) + */ + translog_max_lsn_to_header(log_descriptor.log_file_num[0], + log_descriptor.max_lsn); + log_descriptor.max_lsn= LSN_IMPOSSIBLE; + + if (log_descriptor.log_file_num[OPENED_FILES_NUM - 1] != -1 && + 
translog_close_log_file(log_descriptor.log_file_num[OPENED_FILES_NUM - + 1])) + DBUG_RETURN(1); + for (i= OPENED_FILES_NUM - 1; i > 0; i--) + log_descriptor.log_file_num[i]= log_descriptor.log_file_num[i - 1]; + + if ((log_descriptor.log_file_num[0]= + open_logfile_by_number_no_cache(file_no)) == -1 || + translog_write_file_header()) + DBUG_RETURN(1); + + if (ma_control_file_write_and_force(LSN_IMPOSSIBLE, file_no, + CONTROL_FILE_UPDATE_ONLY_LOGNO)) + DBUG_RETURN(1); + + DBUG_RETURN(0); +} + + +/* + Lock the loghandler buffer + + SYNOPSIS + translog_buffer_lock() + buffer This buffer which should be locked + + RETURN + 0 OK + 1 Error +*/ + +#ifndef DBUG_OFF +static my_bool translog_buffer_lock(struct st_translog_buffer *buffer) +{ + int res; + DBUG_ENTER("translog_buffer_lock"); + DBUG_PRINT("enter", + ("Lock buffer #%u: (0x%lx) mutex: 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer, + (ulong) &buffer->mutex)); + res= (pthread_mutex_lock(&buffer->mutex) != 0); + DBUG_RETURN(res); +} +#else +#define translog_buffer_lock(B) \ + pthread_mutex_lock(&B->mutex) +#endif + + +/* + Unlock the loghandler buffer + + SYNOPSIS + translog_buffer_unlock() + buffer This buffer which should be unlocked + + RETURN + 0 OK + 1 Error +*/ + +#ifndef DBUG_OFF +static my_bool translog_buffer_unlock(struct st_translog_buffer *buffer) +{ + int res; + DBUG_ENTER("translog_buffer_unlock"); + DBUG_PRINT("enter", ("Unlock buffer... #%u (0x%lx) " + "mutex: 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer, + (ulong) &buffer->mutex)); + + res= (pthread_mutex_unlock(&buffer->mutex) != 0); + DBUG_PRINT("enter", ("Unlocked buffer... 
#%u: 0x%lx mutex: 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer, + (ulong) &buffer->mutex)); + DBUG_RETURN(res); +} +#else +#define translog_buffer_unlock(B) \ + pthread_mutex_unlock(&B->mutex) +#endif + + +/* + Write a header on the page + + SYNOPSIS + translog_new_page_header() + horizon Where to write the page + cursor Where to write the page + + NOTE + - space for page header should be checked before +*/ + +static void translog_new_page_header(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor) +{ + uchar *ptr; + + DBUG_ENTER("translog_new_page_header"); + DBUG_ASSERT(cursor->ptr); + + cursor->protected= 0; + + ptr= cursor->ptr; + /* Page number */ + int3store(ptr, LSN_OFFSET(*horizon) / TRANSLOG_PAGE_SIZE); + ptr+= 3; + /* File number */ + int3store(ptr, LSN_FILE_NO(*horizon)); + ptr+= 3; + *(ptr++)= (uchar) log_descriptor.flags; + if (log_descriptor.flags & TRANSLOG_PAGE_CRC) + { +#ifndef DBUG_OFF + DBUG_PRINT("info", ("write 0x11223344 CRC to (%lu,0x%lx)", + LSN_IN_PARTS(*horizon))); + /* This will be overwritten by real CRC; This is just for debugging */ + int4store(ptr, 0x11223344); +#endif + /* CRC will be put when page is finished */ + ptr+= CRC_LENGTH; + } + if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION) + { + time_t tm; + uint16 tmp_time= time(&tm); + int2store(ptr, tmp_time); + ptr+= (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2; + } + { + uint len= (ptr - cursor->ptr); + (*horizon)+= len; /* it is increasing of offset part of the address */ + cursor->current_page_fill= len; + if (!cursor->chaser) + cursor->buffer->size+= len; + } + cursor->ptr= ptr; + DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx chaser: %d Size: %lu (%lu)", + (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer, + cursor->chaser, (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + DBUG_EXECUTE("info", translog_check_cursor(cursor);); + DBUG_VOID_RETURN; +} + + +/* + Put sector protection on the page image + + SYNOPSIS + 
/*
  Put sector protection on the page image

  SYNOPSIS
    translog_put_sector_protection()
    page                 reference on the page content
    cursor               cursor of the buffer

  NOTES
    We put a sector protection on all following sectors on the page,
    except the first sector that is protected by page header.

    For every sector from the start sector onward, the first two bytes of
    the sector are saved into the table in the page header and replaced in
    the page by a 16-bit value (the value stored at the beginning of the
    table plus cursor->write_counter).
*/

static void translog_put_sector_protection(uchar *page,
                                           struct st_buffer_cursor *cursor)
{
  /* The table occupies the last (sectors * 2) bytes of the page header */
  uchar *table= page + log_descriptor.page_overhead -
    (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2;
  /* Marker written into each sector: table's first entry + write counter */
  uint16 value= uint2korr(table) + cursor->write_counter;
  /* Sector containing the last byte before previous_offset */
  uint16 last_protected_sector= ((cursor->previous_offset - 1) /
                                 DISK_DRIVE_SECTOR_SIZE);
  /* Sector containing previous_offset itself */
  uint16 start_sector= cursor->previous_offset / DISK_DRIVE_SECTOR_SIZE;
  uint i, offset;
  DBUG_ENTER("translog_put_sector_protection");

  if (start_sector == 0)
    start_sector= 1; /* First sector is protected */

  DBUG_PRINT("enter", ("Write counter:%u value:%u offset:%u, "
                       "last protected:%u start sector:%u",
                       (uint) cursor->write_counter,
                       (uint) value,
                       (uint) cursor->previous_offset,
                       (uint) last_protected_sector, (uint) start_sector));
  if (last_protected_sector == start_sector)
  {
    /*
      previous_offset lies inside the start sector (not on its boundary):
      put the saved original bytes back before they are saved again by the
      loop below, but only for positions below previous_offset.
    */
    i= last_protected_sector * 2;
    offset= last_protected_sector * DISK_DRIVE_SECTOR_SIZE;
    /* restore data, because we modified sector which was protected */
    if (offset < cursor->previous_offset)
      page[offset]= table[i];
    offset++;
    if (offset < cursor->previous_offset)
      page[offset]= table[i + 1];
  }
  /* i indexes the table (2 bytes per sector), offset indexes the page */
  for (i= start_sector * 2, offset= start_sector * DISK_DRIVE_SECTOR_SIZE;
       i < (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2;
       (i+= 2), (offset+= DISK_DRIVE_SECTOR_SIZE))
  {
    DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x%x",
                        i / 2, offset, (uint) page[offset],
                        (uint) page[offset + 1]));
    /* Save the sector's first two bytes, then overwrite them with value */
    table[i]= page[offset];
    table[i + 1]= page[offset + 1];
    int2store(page + offset, value);
    DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x%x",
                        i / 2, offset, (uint) page[offset],
                        (uint) page[offset + 1]));
  }
  DBUG_VOID_RETURN;
}
area + + SYNOPSIS + translog_crc() + area Pointer of the area beginning + length The Area length + + RETURN + CRC32 +*/ + +static uint32 translog_crc(uchar *area, uint length) +{ + DBUG_ENTER("translog_crc"); + DBUG_RETURN(crc32(0L, (unsigned char*) area, length)); +} + + +/* + Finish current page with zeros + + SYNOPSIS + translog_finish_page() + horizon \ horizon & buffer pointers + cursor / +*/ + +static void translog_finish_page(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor) +{ + uint16 left= TRANSLOG_PAGE_SIZE - cursor->current_page_fill; + uchar *page= cursor->ptr -cursor->current_page_fill; + DBUG_ENTER("translog_finish_page"); + DBUG_PRINT("enter", ("Buffer: #%u 0x%lx " + "Buffer addr: (%lu,0x%lx) " + "Page addr: (%lu,0x%lx) " + "size:%lu (%lu) Pg:%u left:%u", + (uint) cursor->buffer_no, (ulong) cursor->buffer, + LSN_IN_PARTS(cursor->buffer->offset), + (ulong) LSN_FILE_NO(*horizon), + (ulong) (LSN_OFFSET(*horizon) - + cursor->current_page_fill), + (ulong) cursor->buffer->size, + (ulong) (cursor->ptr -cursor->buffer->buffer), + (uint) cursor->current_page_fill, (uint) left)); + DBUG_ASSERT(LSN_FILE_NO(*horizon) == LSN_FILE_NO(cursor->buffer->offset)); + DBUG_EXECUTE("info", translog_check_cursor(cursor);); + if (cursor->protected) + { + DBUG_PRINT("info", ("Already protected and finished")); + DBUG_VOID_RETURN; + } + cursor->protected= 1; + + DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE); + if (left != 0) + { + DBUG_PRINT("info", ("left: %u", (uint) left)); + bzero(cursor->ptr, left); + cursor->ptr +=left; + (*horizon)+= left; /* offset increasing */ + if (!cursor->chaser) + cursor->buffer->size+= left; + cursor->current_page_fill= 0; + DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx " + "chaser: %d Size: %lu (%lu)", + (uint) cursor->buffer->buffer_no, + (ulong) cursor->buffer, cursor->chaser, + (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + DBUG_EXECUTE("info", translog_check_cursor(cursor);); + } + if 
(page[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) + { + translog_put_sector_protection(page, cursor); + DBUG_PRINT("info", ("drop write_counter")); + cursor->write_counter= 0; + cursor->previous_offset= 0; + } + if (page[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC) + { + uint32 crc= translog_crc(page + log_descriptor.page_overhead, + TRANSLOG_PAGE_SIZE - + log_descriptor.page_overhead); + DBUG_PRINT("info", ("CRC: %lx", (ulong) crc)); + /* We have page number, file number and flag before crc */ + int4store(page + 3 + 3 + 1, crc); + } + DBUG_VOID_RETURN; +} + + +/* + Wait until all thread finish filling this buffer + + SYNOPSIS + translog_wait_for_writers() + buffer This buffer should be check + + NOTE + This buffer should be locked +*/ + +static void translog_wait_for_writers(struct st_translog_buffer *buffer) +{ + struct st_my_thread_var *thread= my_thread_var; + DBUG_ENTER("translog_wait_for_writers"); + DBUG_PRINT("enter", ("Buffer #%u 0x%lx copies in progress: %u", + (uint) buffer->buffer_no, (ulong) buffer, + (int) buffer->copy_to_buffer_in_progress)); + + while (buffer->copy_to_buffer_in_progress) + { + DBUG_PRINT("info", ("wait for writers... 
" + "buffer: #%u 0x%lx " + "mutex: 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer, + (ulong) &buffer->mutex)); + DBUG_ASSERT(buffer->file != -1); + wqueue_add_and_wait(&buffer->waiting_filling_buffer, thread, + &buffer->mutex); + DBUG_PRINT("info", ("wait for writers done " + "buffer: #%u 0x%lx " + "mutex: 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer, + (ulong) &buffer->mutex)); + } + + DBUG_VOID_RETURN; +} + + +/* + + Wait for buffer to become free + + SYNOPSIS + translog_wait_for_buffer_free() + buffer The buffer we are waiting for + + NOTE + - this buffer should be locked +*/ + +static void translog_wait_for_buffer_free(struct st_translog_buffer *buffer) +{ + struct st_my_thread_var *thread= my_thread_var; + DBUG_ENTER("translog_wait_for_buffer_free"); + DBUG_PRINT("enter", ("Buffer: #%u 0x%lx copies in progress: %u " + "File: %d size: 0x%lu", + (uint) buffer->buffer_no, (ulong) buffer, + (int) buffer->copy_to_buffer_in_progress, + buffer->file, (ulong) buffer->size)); + + translog_wait_for_writers(buffer); + + while (buffer->file != -1) + { + DBUG_PRINT("info", ("wait for writers... " + "buffer: #%u 0x%lx " + "mutex: 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer, + (ulong) &buffer->mutex)); + wqueue_add_and_wait(&buffer->waiting_filling_buffer, thread, + &buffer->mutex); + DBUG_PRINT("info", ("wait for writers done. 
" + "buffer: #%u 0x%lx " + "mutex: 0x%lx", + (uint) buffer->buffer_no, (ulong) buffer, + (ulong) &buffer->mutex)); + } + DBUG_ASSERT(buffer->copy_to_buffer_in_progress == 0); + DBUG_VOID_RETURN; +} + + +/* + Initialize the cursor for a buffer + + SYNOPSIS + translog_cursor_init() + buffer The buffer + cursor It's cursor + buffer_no Number of buffer +*/ + +static void translog_cursor_init(struct st_buffer_cursor *cursor, + struct st_translog_buffer *buffer, + uint8 buffer_no) +{ + DBUG_ENTER("translog_cursor_init"); + cursor->ptr= buffer->buffer; + cursor->buffer= buffer; + cursor->buffer_no= buffer_no; + cursor->current_page_fill= 0; + cursor->chaser= (cursor != &log_descriptor.bc); + cursor->write_counter= 0; + cursor->previous_offset= 0; + cursor->protected= 0; + DBUG_VOID_RETURN; +} + + +/* + Initialize buffer for current file + + SYNOPSIS + translog_start_buffer() + buffer The buffer + cursor It's cursor + buffer_no Number of buffer +*/ + +static void translog_start_buffer(struct st_translog_buffer *buffer, + struct st_buffer_cursor *cursor, + uint buffer_no) +{ + DBUG_ENTER("translog_start_buffer"); + DBUG_PRINT("enter", + ("Assign buffer: #%u (0x%lx) to file: %d offset: 0x%lx(%lu)", + (uint) buffer->buffer_no, (ulong) buffer, + log_descriptor.log_file_num[0], + (ulong) LSN_OFFSET(log_descriptor.horizon), + (ulong) LSN_OFFSET(log_descriptor.horizon))); + DBUG_ASSERT(buffer_no == buffer->buffer_no); + buffer->last_lsn= LSN_IMPOSSIBLE; + buffer->offset= log_descriptor.horizon; + buffer->next_buffer_offset= LSN_IMPOSSIBLE; + buffer->file= log_descriptor.log_file_num[0]; + buffer->overlay= 0; + buffer->size= 0; + translog_cursor_init(cursor, buffer, buffer_no); + DBUG_PRINT("info", ("init cursor #%u: 0x%lx chaser: %d Size: %lu (%lu)", + (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer, + cursor->chaser, (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + DBUG_EXECUTE("info", translog_check_cursor(cursor);); + 
/*
  Switch to the next buffer in a chain

  SYNOPSIS
    translog_buffer_next()
    horizon \ Pointers on current position in file and buffer
    cursor  /
    new_file             Also start new file

  NOTE:
   - loghandler should be locked
   - after return new and old buffer still are locked
   - the current page is finished first; buffers are used round-robin
     (next number is (current + 1) % TRANSLOG_BUFFERS_NO)
   - a "chaser" cursor only re-points itself at the next buffer; it does
     not lock, wait for, or (re)start that buffer and never creates a file

  RETURN
    0  OK
    1  Error (creating the new log file failed)
*/

static my_bool translog_buffer_next(TRANSLOG_ADDRESS *horizon,
                                    struct st_buffer_cursor *cursor,
                                    my_bool new_file)
{
  uint old_buffer_no= cursor->buffer_no;
  uint new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
  struct st_translog_buffer *new_buffer= log_descriptor.buffers + new_buffer_no;
  my_bool chasing= cursor->chaser;
  DBUG_ENTER("translog_buffer_next");

  DBUG_PRINT("info", ("horizon: (%lu,0x%lx) chasing: %d",
                      LSN_IN_PARTS(log_descriptor.horizon), chasing));

  DBUG_ASSERT(cmp_translog_addr(log_descriptor.horizon, *horizon) >= 0);

  /* Pad/close the page the cursor is currently on */
  translog_finish_page(horizon, cursor);

  if (!chasing)
  {
    /* The main cursor takes the next buffer: lock it and wait until free */
    translog_buffer_lock(new_buffer);
    translog_wait_for_buffer_free(new_buffer);
  }
#ifndef DBUG_OFF
  else
    /*
      NOTE(review): a free buffer is marked with file == -1 elsewhere in
      this file (translog_buffer_init, translog_wait_for_buffer_free), so
      this check presumably was meant to be "!= -1" -- confirm.
    */
    DBUG_ASSERT(new_buffer->file != 0);
#endif
  if (new_file)
  {

    /* move the horizon to the next file and its header page */
    (*horizon)+= LSN_ONE_FILE;
    (*horizon)= LSN_REPLACE_OFFSET(*horizon, TRANSLOG_PAGE_SIZE);
    /* Only the main cursor actually creates the file on disk */
    if (!chasing && translog_create_new_file())
    {
      DBUG_RETURN(1);
    }
  }

  /* prepare next page */
  if (chasing)
    translog_cursor_init(cursor, new_buffer, new_buffer_no);
  else
    translog_start_buffer(new_buffer, cursor, new_buffer_no);
  /* Link the old buffer to the address where the new one starts */
  log_descriptor.buffers[old_buffer_no].next_buffer_offset= new_buffer->offset;
  translog_new_page_header(horizon, cursor);
  DBUG_RETURN(0);
}
+*/ + +static void translog_set_sent_to_file(LSN lsn, TRANSLOG_ADDRESS in_buffers) +{ + DBUG_ENTER("translog_set_sent_to_file"); + pthread_mutex_lock(&log_descriptor.sent_to_file_lock); + DBUG_PRINT("enter", ("lsn: (%lu,0x%lx) in_buffers: (%lu,0x%lx) " + "in_buffers_only: (%lu,0x%lx)", + LSN_IN_PARTS(lsn), + LSN_IN_PARTS(in_buffers), + LSN_IN_PARTS(log_descriptor.in_buffers_only))); + DBUG_ASSERT(cmp_translog_addr(lsn, log_descriptor.sent_to_file) >= 0); + log_descriptor.sent_to_file= lsn; + /* LSN_IMPOSSIBLE == 0 => it will work for very first time */ + if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0) + { + log_descriptor.in_buffers_only= in_buffers; + DBUG_PRINT("info", ("set new in_buffers_only")); + } + pthread_mutex_unlock(&log_descriptor.sent_to_file_lock); + DBUG_VOID_RETURN; +} + + +/* + Sets address from which data is only in the buffer + + SYNOPSIS + translog_set_only_in_buffers() + lsn LSN to assign + in_buffers to assign to in_buffers_only +*/ + +static void translog_set_only_in_buffers(TRANSLOG_ADDRESS in_buffers) +{ + DBUG_ENTER("translog_set_only_in_buffers"); + pthread_mutex_lock(&log_descriptor.sent_to_file_lock); + DBUG_PRINT("enter", ("in_buffers: (%lu,0x%lx) " + "in_buffers_only: (%lu,0x%lx)", + LSN_IN_PARTS(in_buffers), + LSN_IN_PARTS(log_descriptor.in_buffers_only))); + /* LSN_IMPOSSIBLE == 0 => it will work for very first time */ + if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0) + { + log_descriptor.in_buffers_only= in_buffers; + DBUG_PRINT("info", ("set new in_buffers_only")); + } + pthread_mutex_unlock(&log_descriptor.sent_to_file_lock); + DBUG_VOID_RETURN; +} + + +/* + Gets address from which data is only in the buffer + + SYNOPSIS + translog_only_in_buffers() + + RETURN + address from which data is only in the buffer +*/ + +static TRANSLOG_ADDRESS translog_only_in_buffers() +{ + register TRANSLOG_ADDRESS addr; + DBUG_ENTER("translog_only_in_buffers"); + 
pthread_mutex_lock(&log_descriptor.sent_to_file_lock); + addr= log_descriptor.in_buffers_only; + pthread_mutex_unlock(&log_descriptor.sent_to_file_lock); + DBUG_RETURN(addr); +} + + +/* + Get max LSN sent to file + + SYNOPSIS + translog_get_sent_to_file() + + RETURN + max LSN send to file +*/ + +static LSN translog_get_sent_to_file() +{ + register LSN lsn; + DBUG_ENTER("translog_get_sent_to_file"); + pthread_mutex_lock(&log_descriptor.sent_to_file_lock); + lsn= log_descriptor.sent_to_file; + pthread_mutex_unlock(&log_descriptor.sent_to_file_lock); + DBUG_RETURN(lsn); +} + + +/* + Get first chunk address on the given page + + SYNOPSIS + translog_get_first_chunk_offset() + page The page where to find first chunk + + RETURN + first chunk offset +*/ + +static my_bool translog_get_first_chunk_offset(uchar *page) +{ + uint16 page_header= 7; + DBUG_ENTER("translog_get_first_chunk_offset"); + + if (page[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC) + page_header+= 4; + if (page[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) + page_header+= (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2; + DBUG_RETURN(page_header); +} + + +/* + Write coded length of record + + SYNOPSIS + translog_write_variable_record_1group_code_len + dst Destination buffer pointer + length Length which should be coded + header_len Calculated total header length +*/ + +static void +translog_write_variable_record_1group_code_len(uchar *dst, + translog_size_t length, + uint16 header_len) +{ + switch (header_len) { + case 6: /* (5 + 1) */ + DBUG_ASSERT(length <= 250); + *dst= (uint8) length; + return; + case 8: /* (5 + 3) */ + DBUG_ASSERT(length <= 0xFFFF); + *dst= 251; + int2store(dst + 1, length); + return; + case 9: /* (5 + 4) */ + DBUG_ASSERT(length <= (ulong) 0xFFFFFF); + *dst= 252; + int3store(dst + 1, length); + return; + case 10: /* (5 + 5) */ + *dst= 253; + int4store(dst + 1, length); + return; + default: + DBUG_ASSERT(0); + } + return; +} + + +/* + Decode record data length and advance given 
pointer to the next field + + SYNOPSIS + translog_variable_record_1group_decode_len() + src The pointer to the pointer to the length beginning + + RETURN + decoded length +*/ + +static translog_size_t translog_variable_record_1group_decode_len(uchar **src) +{ + uint8 first= (uint8) (**src); + switch (first) { + case 251: + (*src)+= 3; + return (uint2korr((*src) - 2)); + case 252: + (*src)+= 4; + return (uint3korr((*src) - 3)); + case 253: + (*src)+= 5; + return (uint4korr((*src) - 4)); + case 254: + case 255: + DBUG_ASSERT(0); /* reserved for future use */ + return (0); + default: + (*src)++; + return (first); + } +} + + +/* + Get total length of this chunk (not only body) + + SYNOPSIS + translog_get_total_chunk_length() + page The page where chunk placed + offset Offset of the chunk on this place + + RETURN + total length of the chunk +*/ + +static uint16 translog_get_total_chunk_length(uchar *page, uint16 offset) +{ + DBUG_ENTER("translog_get_total_chunk_length"); + switch (page[offset] & TRANSLOG_CHUNK_TYPE) { + case TRANSLOG_CHUNK_LSN: + { + /* 0 chunk referred as LSN (head or tail) */ + translog_size_t rec_len; + uchar *start= page + offset; + uchar *ptr= start + 1 + 2; + uint16 chunk_len, header_len, page_rest; + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN")); + rec_len= translog_variable_record_1group_decode_len(&ptr); + chunk_len= uint2korr(ptr); + header_len= (ptr -start) + 2; + DBUG_PRINT("info", ("rec len: %lu chunk len: %u header len: %u", + (ulong) rec_len, (uint) chunk_len, (uint) header_len)); + if (chunk_len) + { + DBUG_PRINT("info", ("chunk len: %u + %u = %u", + (uint) header_len, (uint) chunk_len, + (uint) (chunk_len + header_len))); + DBUG_RETURN(chunk_len + header_len); + } + page_rest= TRANSLOG_PAGE_SIZE - offset; + DBUG_PRINT("info", ("page_rest %u", (uint) page_rest)); + if (rec_len + header_len < page_rest) + DBUG_RETURN(rec_len + header_len); + DBUG_RETURN(page_rest); + } + case TRANSLOG_CHUNK_FIXED: + { + uchar *ptr; + uint type= page[offset] 
& TRANSLOG_REC_TYPE; + uint length; + int i; + /* 1 (pseudo)fixed record (also LSN) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED")); + DBUG_ASSERT(log_record_type_descriptor[type].class == + LOGRECTYPE_FIXEDLENGTH || + log_record_type_descriptor[type].class == + LOGRECTYPE_PSEUDOFIXEDLENGTH); + if (log_record_type_descriptor[type].class == LOGRECTYPE_FIXEDLENGTH) + { + DBUG_PRINT("info", + ("Fixed length: %u", + (uint) (log_record_type_descriptor[type].fixed_length + 3))); + DBUG_RETURN(log_record_type_descriptor[type].fixed_length + 3); + } + + ptr= page + offset + 3; /* first compressed LSN */ + length= log_record_type_descriptor[type].fixed_length + 3; + for (i= 0; i < log_record_type_descriptor[type].compressed_LSN; i++) + { + /* first 2 bits is length - 2 */ + uint len= ((((uint8) (*ptr)) & TRANSLOG_CLSN_LEN_BITS) >> 6) + 2; + if (ptr[0] == 0 && ((uint8) ptr[1]) == 1) + len+= LSN_STORE_SIZE; /* case of full LSN storing */ + ptr+= len; + /* subtract economized bytes */ + length-= (LSN_STORE_SIZE - len); + } + DBUG_PRINT("info", ("Pseudo-fixed length: %u", length)); + DBUG_RETURN(length); + } + case TRANSLOG_CHUNK_NOHDR: + /* 2 no header chunk (till page end) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR length: %u", + (uint) (TRANSLOG_PAGE_SIZE - offset))); + DBUG_RETURN(TRANSLOG_PAGE_SIZE - offset); + case TRANSLOG_CHUNK_LNGTH: /* 3 chunk with chunk length */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH")); + DBUG_ASSERT(TRANSLOG_PAGE_SIZE - offset >= 3); + DBUG_PRINT("info", ("length: %u", uint2korr(page + offset + 1) + 3)); + DBUG_RETURN(uint2korr(page + offset + 1) + 3); + default: + DBUG_ASSERT(0); + DBUG_RETURN(0); + } +} + + +/* + Flush given buffer + + SYNOPSIS + translog_buffer_flush() + buffer This buffer should be flushed + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_buffer_flush(struct st_translog_buffer *buffer) +{ + uint32 i; + PAGECACHE_FILE file; + DBUG_ENTER("translog_buffer_flush"); + DBUG_PRINT("enter", + ("Buffer: #%u 0x%lx: " 
+ "file: %d offset: (%lu,0x%lx) size: %lu", + (uint) buffer->buffer_no, (ulong) buffer, + buffer->file, + LSN_IN_PARTS(buffer->offset), + (ulong) buffer->size)); + + DBUG_ASSERT(buffer->file != -1); + + translog_wait_for_writers(buffer); + if (buffer->overlay && buffer->overlay->file != -1) + { + struct st_translog_buffer *overlay= buffer->overlay; + translog_buffer_unlock(buffer); + translog_buffer_lock(overlay); + translog_wait_for_buffer_free(overlay); + translog_buffer_unlock(overlay); + translog_buffer_lock(buffer); + } + + file.file= buffer->file; + for (i= 0; i < buffer->size; i+= TRANSLOG_PAGE_SIZE) + { + TRANSLOG_ADDRESS addr= (buffer->offset + i); + TRANSLOG_VALIDATOR_DATA data; + data.addr= &addr; + DBUG_ASSERT(log_descriptor.pagecache->block_size == TRANSLOG_PAGE_SIZE); + DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size); + if (pagecache_inject(log_descriptor.pagecache, + &file, + (LSN_OFFSET(buffer->offset) + i) / TRANSLOG_PAGE_SIZE, + 3, + buffer->buffer + i, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, 0, + &translog_page_validator, (uchar*) &data)) + { + UNRECOVERABLE_ERROR(("Can't write page (%lu,0x%lx) to pagecache", + (ulong) buffer->file, + (ulong) (LSN_OFFSET(buffer->offset)+ i))); + } + } + if (my_pwrite(buffer->file, (char*) buffer->buffer, + buffer->size, LSN_OFFSET(buffer->offset), + log_write_flags)) + { + UNRECOVERABLE_ERROR(("Can't write buffer (%lu,0x%lx) size %lu " + "to the disk (%d)", + (ulong) buffer->file, + (ulong) LSN_OFFSET(buffer->offset), + (ulong) buffer->size, errno)); + DBUG_RETURN(1); + } + + if (LSN_OFFSET(buffer->last_lsn) != 0) /* if buffer->last_lsn is set */ + translog_set_sent_to_file(buffer->last_lsn, + buffer->next_buffer_offset); + else + translog_set_only_in_buffers(buffer->next_buffer_offset); + /* Free buffer */ + buffer->file= -1; + buffer->overlay= 0; + if (buffer->waiting_filling_buffer.last_thread) + { + wqueue_release_queue(&buffer->waiting_filling_buffer); + } + 
DBUG_RETURN(0); +} + + +/* + Recover page with sector protection (wipe out failed chunks) + + SYNOPSYS + translog_recover_page_up_to_sector() + page reference on the page + offset offset of failed sector + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_recover_page_up_to_sector(uchar *page, uint16 offset) +{ + uint16 chunk_offset= translog_get_first_chunk_offset(page), valid_chunk_end; + DBUG_ENTER("translog_recover_page_up_to_sector"); + DBUG_PRINT("enter", ("offset: %u first chunk: %u", + (uint) offset, (uint) chunk_offset)); + + while (page[chunk_offset] != '\0' && chunk_offset < offset) + { + uint16 chunk_length; + if ((chunk_length= + translog_get_total_chunk_length(page, chunk_offset)) == 0) + { + UNRECOVERABLE_ERROR(("cant get chunk length (offset %u)", + (uint) chunk_offset)); + DBUG_RETURN(1); + } + DBUG_PRINT("info", ("chunk: offset: %u length %u", + (uint) chunk_offset, (uint) chunk_length)); + if (((ulong) chunk_offset) + ((ulong) chunk_length) > TRANSLOG_PAGE_SIZE) + { + UNRECOVERABLE_ERROR(("damaged chunk (offset %u) in trusted area", + (uint) chunk_offset)); + DBUG_RETURN(1); + } + chunk_offset+= chunk_length; + } + + valid_chunk_end= chunk_offset; + /* end of trusted area - sector parsing */ + while (page[chunk_offset] != '\0') + { + uint16 chunk_length; + if ((chunk_length= + translog_get_total_chunk_length(page, chunk_offset)) == 0) + break; + + DBUG_PRINT("info", ("chunk: offset: %u length %u", + (uint) chunk_offset, (uint) chunk_length)); + if (((ulong) chunk_offset) + ((ulong) chunk_length) > + (uint) (offset + DISK_DRIVE_SECTOR_SIZE)) + break; + + chunk_offset+= chunk_length; + valid_chunk_end= chunk_offset; + } + DBUG_PRINT("info", ("valid chunk end offset: %u", (uint) valid_chunk_end)); + + bzero(page + valid_chunk_end, TRANSLOG_PAGE_SIZE - valid_chunk_end); + + DBUG_RETURN(0); +} + + +/* + Log page validator + + SYNOPSIS + translog_page_validator() + page_addr The page to check + data data, need for validation (address in this 
case) + + RETURN + 0 OK + 1 Error +*/ +static my_bool translog_page_validator(uchar *page_addr, uchar* data_ptr) +{ + uint this_page_page_overhead; + uint flags; + uchar *page= (uchar*) page_addr, *page_pos; + TRANSLOG_VALIDATOR_DATA *data= (TRANSLOG_VALIDATOR_DATA *) data_ptr; + TRANSLOG_ADDRESS addr= *(data->addr); + DBUG_ENTER("translog_page_validator"); + + data->was_recovered= 0; + + if (uint3korr(page) != LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE || + uint3korr(page + 3) != LSN_FILE_NO(addr)) + { + UNRECOVERABLE_ERROR(("Page (%lu,0x%lx): " + "page address written in the page is incorrect: " + "File %lu instead of %lu or page %lu instead of %lu", + LSN_IN_PARTS(addr), + (ulong) uint3korr(page + 3), (ulong) LSN_FILE_NO(addr), + (ulong) uint3korr(page), + (ulong) LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE)); + DBUG_RETURN(1); + } + flags= (uint)(page[TRANSLOG_PAGE_FLAGS]); + this_page_page_overhead= page_overhead[flags]; + if (flags & ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | + TRANSLOG_RECORD_CRC)) + { + UNRECOVERABLE_ERROR(("Page (%lu,0x%lx): " + "Garbage in the page flags field detected : %x", + LSN_IN_PARTS(addr), (uint) flags)); + DBUG_RETURN(1); + } + page_pos= page + (3 + 3 + 1); + if (flags & TRANSLOG_PAGE_CRC) + { + uint32 crc= translog_crc(page + this_page_page_overhead, + TRANSLOG_PAGE_SIZE - + this_page_page_overhead); + if (crc != uint4korr(page_pos)) + { + UNRECOVERABLE_ERROR(("Page (%lu,0x%lx): " + "CRC mismatch: calculated: %lx on the page %lx", + LSN_IN_PARTS(addr), + (ulong) crc, (ulong) uint4korr(page_pos))); + DBUG_RETURN(1); + } + page_pos+= CRC_LENGTH; /* Skip crc */ + } + if (flags & TRANSLOG_SECTOR_PROTECTION) + { + uint i, offset; + uchar *table= page_pos; + uint16 current= uint2korr(table); + for (i= 2, offset= DISK_DRIVE_SECTOR_SIZE; + i < (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2; + i+= 2, offset+= DISK_DRIVE_SECTOR_SIZE) + { + /* + TODO: add chunk counting for "suspecting" sectors (difference is + more than 1-2) + */ + uint16 
test= uint2korr(page + offset); + DBUG_PRINT("info", ("sector: #%u offset: %u current: %lx " + "read: 0x%x stored: 0x%x%x", + i / 2, offset, (ulong) current, + (uint) uint2korr(page + offset), (uint) table[i], + (uint) table[i + 1])); + if (((test < current) && + (LL(0xFFFF) - current + test > DISK_DRIVE_SECTOR_SIZE / 3)) || + ((test >= current) && + (test - current > DISK_DRIVE_SECTOR_SIZE / 3))) + { + if (translog_recover_page_up_to_sector(page, offset)) + DBUG_RETURN(1); + data->was_recovered= 1; + DBUG_RETURN(0); + } + + /* Return value on the page */ + page[offset]= table[i]; + page[offset + 1]= table[i + 1]; + current= test; + DBUG_PRINT("info", ("sector: #%u offset: %u current: %lx " + "read: 0x%x stored: 0x%x%x", + i / 2, offset, (ulong) current, + (uint) uint2korr(page + offset), (uint) table[i], + (uint) table[i + 1])); + } + } + DBUG_RETURN(0); +} + + +/* + Lock the loghandler + + SYNOPSIS + translog_lock() + + RETURN + 0 OK + 1 Error +*/ + +my_bool translog_lock() +{ + struct st_translog_buffer *current_buffer; + DBUG_ENTER("translog_lock"); + + /* + Locking the loghandler mean locking current buffer, but it can change + during locking, so we should check it + */ + for (;;) + { + current_buffer= log_descriptor.bc.buffer; + if (translog_buffer_lock(current_buffer)) + DBUG_RETURN(1); + if (log_descriptor.bc.buffer == current_buffer) + break; + translog_buffer_unlock(current_buffer); + } + DBUG_RETURN(0); +} + + +/* + Unlock the loghandler + + SYNOPSIS + translog_unlock() + + RETURN + 0 OK + 1 Error +*/ + +my_bool translog_unlock() +{ + DBUG_ENTER("translog_unlock"); + translog_buffer_unlock(log_descriptor.bc.buffer); + + DBUG_RETURN(0); +} + + +/* + Get log page by file number and offset of the beginning of the page + + SYNOPSIS + translog_get_page() + data validator data, which contains the page address + buffer buffer for page placing + (might not be used in some cache implementations) + + RETURN + NULL - Error + # pointer to the page cache which should 
be used to read this page +*/ + +static uchar *translog_get_page(TRANSLOG_VALIDATOR_DATA *data, uchar *buffer) +{ + TRANSLOG_ADDRESS addr= *(data->addr), in_buffers; + uint cache_index; + uint32 file_no= LSN_FILE_NO(addr); + DBUG_ENTER("translog_get_page"); + DBUG_PRINT("enter", ("File: %lu Offset: %lu(0x%lx)", + (ulong) file_no, + (ulong) LSN_OFFSET(addr), + (ulong) LSN_OFFSET(addr))); + + /* it is really page address */ + DBUG_ASSERT(LSN_OFFSET(addr) % TRANSLOG_PAGE_SIZE == 0); + + in_buffers= translog_only_in_buffers(); + DBUG_PRINT("info", ("in_buffers: (%lu,0x%lx)", + LSN_IN_PARTS(in_buffers))); + if (in_buffers != LSN_IMPOSSIBLE && + cmp_translog_addr(addr, in_buffers) >= 0) + { + translog_lock(); + /* recheck with locked loghandler */ + in_buffers= translog_only_in_buffers(); + if (cmp_translog_addr(addr, in_buffers) >= 0) + { + uint16 buffer_no= log_descriptor.bc.buffer_no; + uint16 buffer_start= buffer_no; + struct st_translog_buffer *buffer_unlock= log_descriptor.bc.buffer; + struct st_translog_buffer *curr_buffer= log_descriptor.bc.buffer; + for (;;) + { + /* + if the page is in the buffer and it is the last version of the + page (in case of devision the page bu buffer flush + */ + if (curr_buffer->file != -1 && + cmp_translog_addr(addr, curr_buffer->offset) >= 0 && + cmp_translog_addr(addr, + (curr_buffer->next_buffer_offset ? 
+ curr_buffer->next_buffer_offset: + curr_buffer->offset + curr_buffer->size)) < 0) + { + int is_last_unfinished_page; + uint last_protected_sector= 0; + uchar *from, *table= NULL; + translog_wait_for_writers(curr_buffer); + DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset)); + from= curr_buffer->buffer + (addr - curr_buffer->offset); + memcpy(buffer, from, TRANSLOG_PAGE_SIZE); + is_last_unfinished_page= ((log_descriptor.bc.buffer == + curr_buffer) && + (log_descriptor.bc.ptr >= from) && + (log_descriptor.bc.ptr < + from + TRANSLOG_PAGE_SIZE)); + if (is_last_unfinished_page && + (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION)) + { + last_protected_sector= ((log_descriptor.bc.previous_offset - 1) / + DISK_DRIVE_SECTOR_SIZE); + table= buffer + log_descriptor.page_overhead - + (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE) * 2; + } + + DBUG_ASSERT(buffer_unlock == curr_buffer); + translog_buffer_unlock(buffer_unlock); + if (is_last_unfinished_page) + { + uint i; + /* + This is last unfinished page => we should not check CRC and + remove only that protection which already installed (no need + to check it) + + We do not check the flag of sector protection, because if + (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) is + not set then last_protected_sector will be 0 so following loop + will be never executed + */ + DBUG_PRINT("info", ("This is last unfinished page, " + "last protected sector %u", + last_protected_sector)); + for (i= 1; i <= last_protected_sector; i++) + { + uint index= i * 2; + uint offset= i * DISK_DRIVE_SECTOR_SIZE; + DBUG_PRINT("info", ("Sector %u: 0x%02x%02x <- 0x%02x%02x", + i, buffer[offset], buffer[offset + 1], + table[index], table[index + 1])); + buffer[offset]= table[index]; + buffer[offset + 1]= table[index + 1]; + } + } + else + { + /* + This IF should be true because we use in-memory data which + supposed to be correct. 
+ */ + if (translog_page_validator((uchar*) buffer, (uchar*) data)) + buffer= NULL; + } + DBUG_RETURN(buffer); + } + buffer_no= (buffer_no + 1) % TRANSLOG_BUFFERS_NO; + curr_buffer= log_descriptor.buffers + buffer_no; + translog_buffer_lock(curr_buffer); + translog_buffer_unlock(buffer_unlock); + buffer_unlock= curr_buffer; + /* we can't make full circle */ + DBUG_ASSERT(buffer_start != buffer_no); + } + } + translog_unlock(); + } + if ((cache_index= LSN_FILE_NO(log_descriptor.horizon) - file_no) < + OPENED_FILES_NUM) + { + PAGECACHE_FILE file; + /* file in the cache */ + if (log_descriptor.log_file_num[cache_index] == -1) + { + if ((log_descriptor.log_file_num[cache_index]= + open_logfile_by_number_no_cache(file_no)) == -1) + DBUG_RETURN(NULL); + } + file.file= log_descriptor.log_file_num[cache_index]; + + buffer= (uchar*) + pagecache_valid_read(log_descriptor.pagecache, &file, + LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE, + 3, (char*) buffer, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0, + &translog_page_validator, (uchar*) data); + } + else + { + /* + TODO: WE KEEP THE LAST OPENED_FILES_NUM FILES IN THE LOG CACHE, NOT + THE LAST USED FILES. THIS WILL BE A NOTABLE PROBLEM IF WE ARE + FOLLOWING AN UNDO CHAIN THAT GOES OVER MANY OLD LOG FILES. WE WILL + PROBABLY NEED SPECIAL HANDLING OF THIS OR HAVE A FILO FOR THE LOG + FILES. 
+ */ + + File file= open_logfile_by_number_no_cache(file_no); + if (file == -1) + DBUG_RETURN(NULL); + if (my_pread(file, (char*) buffer, TRANSLOG_PAGE_SIZE, + LSN_OFFSET(addr), MYF(MY_FNABP | MY_WME))) + buffer= NULL; + else if (translog_page_validator((uchar*) buffer, (uchar*) data)) + buffer= NULL; + my_close(file, MYF(MY_WME)); + } + DBUG_RETURN(buffer); +} + + +/* + Finds last page of the given log file + + SYNOPSIS + translog_get_last_page_addr() + addr address structure to fill with data, which contain + file number of the log file + last_page_ok assigned 1 if last page was OK + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_get_last_page_addr(TRANSLOG_ADDRESS *addr, + my_bool *last_page_ok) +{ + MY_STAT stat_buff, *stat; + char path[FN_REFLEN]; + uint32 rec_offset; + uint32 file_no= LSN_FILE_NO(*addr); + DBUG_ENTER("translog_get_last_page_addr"); + + if (!(stat= my_stat(translog_filename_by_fileno(file_no, path), + &stat_buff, MYF(MY_WME)))) + DBUG_RETURN(1); + DBUG_PRINT("info", ("File size: %lu", (ulong) stat->st_size)); + if (stat->st_size > TRANSLOG_PAGE_SIZE) + { + rec_offset= (((stat->st_size / TRANSLOG_PAGE_SIZE) - 1) * + TRANSLOG_PAGE_SIZE); + *last_page_ok= (stat->st_size == rec_offset + TRANSLOG_PAGE_SIZE); + } + else + { + *last_page_ok= 0; + rec_offset= 0; + } + *addr= MAKE_LSN(file_no, rec_offset); + DBUG_PRINT("info", ("Last page: 0x%lx ok: %d", (ulong) rec_offset, + *last_page_ok)); + DBUG_RETURN(0); +} + + +/* + Get number bytes for record length storing + + SYNOPSIS + translog_variable_record_length_bytes() + length Record length wich will be codded + + RETURN + 1,3,4,5 - number of bytes to store given length +*/ + +static uint translog_variable_record_length_bytes(translog_size_t length) +{ + if (length < 250) + return 1; + if (length < 0xFFFF) + return 3; + if (length < (ulong) 0xFFFFFF) + return 4; + return 5; +} + + +/* + Get header of this chunk + + SYNOPSIS + translog_get_chunk_header_length() + page The page where chunk 
placed + offset Offset of the chunk on this place + + RETURN + # total length of the chunk + 0 Error +*/ + +static uint16 translog_get_chunk_header_length(uchar *page, uint16 offset) +{ + DBUG_ENTER("translog_get_chunk_header_length"); + page+= offset; + switch (*page & TRANSLOG_CHUNK_TYPE) { + case TRANSLOG_CHUNK_LSN: + { + /* 0 chunk referred as LSN (head or tail) */ + translog_size_t rec_len; + uchar *start= page; + uchar *ptr= start + 1 + 2; + uint16 chunk_len, header_len; + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN")); + rec_len= translog_variable_record_1group_decode_len(&ptr); + chunk_len= uint2korr(ptr); + header_len= (ptr - start) +2; + DBUG_PRINT("info", ("rec len: %lu chunk len: %u header len: %u", + (ulong) rec_len, (uint) chunk_len, (uint) header_len)); + if (chunk_len) + { + /* TODO: fine header end */ + DBUG_ASSERT(0); + DBUG_RETURN(0); /* Keep compiler happy */ + } + DBUG_RETURN(header_len); + } + case TRANSLOG_CHUNK_FIXED: + { + /* 1 (pseudo)fixed record (also LSN) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED = 3")); + DBUG_RETURN(3); + } + case TRANSLOG_CHUNK_NOHDR: + /* 2 no header chunk (till page end) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR = 1")); + DBUG_RETURN(1); + break; + case TRANSLOG_CHUNK_LNGTH: + /* 3 chunk with chunk length */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH = 3")); + DBUG_RETURN(3); + break; + default: + DBUG_ASSERT(0); + DBUG_RETURN(0); /* Keep compiler happy */ + } +} + + +/* + Initialize transaction log + + SYNOPSIS + translog_init() + directory Directory where log files are put + log_file_max_size max size of one log size (for new logs creation) + server_version version of MySQL server (MYSQL_VERSION_ID) + server_id server ID (replication & Co) + pagecache Page cache for the log reads + flags flags (TRANSLOG_PAGE_CRC, TRANSLOG_SECTOR_PROTECTION + TRANSLOG_RECORD_CRC) + + TODO + Free used resources in case of error. 
+ + RETURN + 0 OK + 1 Error +*/ + +my_bool translog_init(const char *directory, + uint32 log_file_max_size, + uint32 server_version, + uint32 server_id, PAGECACHE *pagecache, uint flags) +{ + int i; + int old_log_was_recovered= 0, logs_found= 0; + uint old_flags= flags; + TRANSLOG_ADDRESS sure_page, last_page, last_valid_page; + my_bool version_changed= 0; + DBUG_ENTER("translog_init"); + DBUG_ASSERT(translog_inited == 0); + + loghandler_init(); /* Safe to do many times */ + + if (pthread_mutex_init(&log_descriptor.sent_to_file_lock, + MY_MUTEX_INIT_FAST) || + pthread_mutex_init(&log_descriptor.file_header_lock, + MY_MUTEX_INIT_FAST) || + pthread_mutex_init(&log_descriptor.unfinished_files_lock, + MY_MUTEX_INIT_FAST) || + pthread_mutex_init(&log_descriptor.purger_lock, + MY_MUTEX_INIT_FAST) || + pthread_mutex_init(&log_descriptor.log_flush_lock, + MY_MUTEX_INIT_FAST) || + init_dynamic_array(&log_descriptor.unfinished_files, + sizeof(struct st_file_counter), + 10, 10 CALLER_INFO)) + DBUG_RETURN(1); + log_descriptor.min_file_number= 0; + log_descriptor.last_lsn_checked= LSN_IMPOSSIBLE; + + /* Directory to store files */ + unpack_dirname(log_descriptor.directory, directory); + + if ((log_descriptor.directory_fd= my_open(log_descriptor.directory, + O_RDONLY, MYF(MY_WME))) < 0) + { + UNRECOVERABLE_ERROR(("Error %d during opening directory '%s'", + errno, log_descriptor.directory)); + DBUG_RETURN(1); + } + + log_descriptor.in_buffers_only= LSN_IMPOSSIBLE; + /* max size of one log size (for new logs creation) */ + log_descriptor.log_file_max_size= + log_file_max_size - (log_file_max_size % TRANSLOG_PAGE_SIZE); + /* server version */ + log_descriptor.server_version= server_version; + /* server ID */ + log_descriptor.server_id= server_id; + /* Page cache for the log reads */ + log_descriptor.pagecache= pagecache; + /* Flags */ + DBUG_ASSERT((flags & + ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | + TRANSLOG_RECORD_CRC)) == 0); + log_descriptor.flags= flags; + for (i= 
0; i < TRANSLOG_FLAGS_NUM; i++) + { + page_overhead[i]= 7; + if (i & TRANSLOG_PAGE_CRC) + page_overhead[i]+= CRC_LENGTH; + if (i & TRANSLOG_SECTOR_PROTECTION) + page_overhead[i]+= (TRANSLOG_PAGE_SIZE / + DISK_DRIVE_SECTOR_SIZE) * 2; + } + log_descriptor.page_overhead= page_overhead[flags]; + log_descriptor.page_capacity_chunk_2= + TRANSLOG_PAGE_SIZE - log_descriptor.page_overhead - 1; + DBUG_ASSERT(TRANSLOG_WRITE_BUFFER % TRANSLOG_PAGE_SIZE == 0); + log_descriptor.buffer_capacity_chunk_2= + (TRANSLOG_WRITE_BUFFER / TRANSLOG_PAGE_SIZE) * + log_descriptor.page_capacity_chunk_2; + log_descriptor.half_buffer_capacity_chunk_2= + log_descriptor.buffer_capacity_chunk_2 / 2; + DBUG_PRINT("info", + ("Overhead: %u pc2: %u bc2: %u, bc2/2: %u", + log_descriptor.page_overhead, + log_descriptor.page_capacity_chunk_2, + log_descriptor.buffer_capacity_chunk_2, + log_descriptor.half_buffer_capacity_chunk_2)); + + /* *** Current state of the log handler *** */ + + /* Init log handler file handlers cache */ + for (i= 0; i < OPENED_FILES_NUM; i++) + log_descriptor.log_file_num[i]= -1; + + /* just to init it somehow */ + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + + /* Buffers for log writing */ + for (i= 0; i < TRANSLOG_BUFFERS_NO; i++) + { + if (translog_buffer_init(log_descriptor.buffers + i)) + DBUG_RETURN(1); +#ifndef DBUG_OFF + log_descriptor.buffers[i].buffer_no= (uint8) i; +#endif + DBUG_PRINT("info", ("translog_buffer buffer #%u: 0x%lx", + i, (ulong) log_descriptor.buffers + i)); + } + + logs_found= (last_logno != FILENO_IMPOSSIBLE); + + if (logs_found) + { + my_bool pageok; + /* + TODO: scan directory for maria_log.XXXXXXXX files and find + highest XXXXXXXX & set logs_found + TODO: check that last checkpoint within present log addresses space + + find the log end + */ + if (LSN_FILE_NO(last_checkpoint_lsn) == FILENO_IMPOSSIBLE) + { + DBUG_ASSERT(LSN_OFFSET(last_checkpoint_lsn) == 0); + /* there was no checkpoints we will read from the beginning */ 
+ sure_page= (LSN_ONE_FILE | TRANSLOG_PAGE_SIZE); + } + else + { + sure_page= last_checkpoint_lsn; + DBUG_ASSERT(LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE != 0); + sure_page-= LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE; + } + log_descriptor.horizon= last_page= MAKE_LSN(last_logno,0); + if (translog_get_last_page_addr(&last_page, &pageok)) + DBUG_RETURN(1); + if (LSN_OFFSET(last_page) == 0) + { + if (LSN_FILE_NO(last_page) == 1) + { + logs_found= 0; /* file #1 has no pages */ + } + else + { + last_page-= LSN_ONE_FILE; + if (translog_get_last_page_addr(&last_page, &pageok)) + DBUG_RETURN(1); + } + } + } + if (logs_found) + { + TRANSLOG_ADDRESS current_page= sure_page; + my_bool pageok; + + DBUG_ASSERT(sure_page <= last_page); + + /* TODO: check page size */ + + last_valid_page= LSN_IMPOSSIBLE; + /* scan and validate pages */ + do + { + TRANSLOG_ADDRESS current_file_last_page; + current_file_last_page= current_page; + if (translog_get_last_page_addr(¤t_file_last_page, &pageok)) + DBUG_RETURN(1); + if (!pageok) + { + DBUG_PRINT("error", ("File %lu have no complete last page", + (ulong) LSN_FILE_NO(current_file_last_page))); + old_log_was_recovered= 1; + /* This file is not written till the end so it should be last */ + last_page= current_file_last_page; + /* TODO: issue warning */ + } + do + { + TRANSLOG_VALIDATOR_DATA data; + uchar buffer[TRANSLOG_PAGE_SIZE], *page; + data.addr= ¤t_page; + if ((page= translog_get_page(&data, buffer)) == NULL) + DBUG_RETURN(1); + if (data.was_recovered) + { + DBUG_PRINT("error", ("file no: %lu (%d) " + "rec_offset: 0x%lx (%lu) (%d)", + (ulong) LSN_FILE_NO(current_page), + (uint3korr(page + 3) != + LSN_FILE_NO(current_page)), + (ulong) LSN_OFFSET(current_page), + (ulong) (LSN_OFFSET(current_page) / + TRANSLOG_PAGE_SIZE), + (uint3korr(page) != + LSN_OFFSET(current_page) / + TRANSLOG_PAGE_SIZE))); + old_log_was_recovered= 1; + break; + } + old_flags= page[TRANSLOG_PAGE_FLAGS]; + last_valid_page= current_page; + current_page+= 
TRANSLOG_PAGE_SIZE; /* increase offset */ + } while (current_page <= current_file_last_page); + current_page+= LSN_ONE_FILE; + current_page= LSN_REPLACE_OFFSET(current_page, TRANSLOG_PAGE_SIZE); + } while (LSN_FILE_NO(current_page) <= LSN_FILE_NO(last_page) && + !old_log_was_recovered); + if (last_valid_page == LSN_IMPOSSIBLE) + { + /* Panic!!! Even page which should be valid is invalid */ + /* TODO: issue error */ + DBUG_RETURN(1); + } + DBUG_PRINT("info", ("Last valid page is in file: %lu " + "offset: %lu (0x%lx) " + "Logs found: %d was recovered: %d " + "flags match: %d", + (ulong) LSN_FILE_NO(last_valid_page), + (ulong) LSN_OFFSET(last_valid_page), + (ulong) LSN_OFFSET(last_valid_page), + logs_found, old_log_was_recovered, + (old_flags == flags))); + + /* TODO: check server ID */ + if (logs_found && !old_log_was_recovered && old_flags == flags) + { + TRANSLOG_VALIDATOR_DATA data; + uchar buffer[TRANSLOG_PAGE_SIZE], *page; + uint16 chunk_offset; + data.addr= &last_valid_page; + /* continue old log */ + DBUG_ASSERT(LSN_FILE_NO(last_valid_page)== + LSN_FILE_NO(log_descriptor.horizon)); + if ((page= translog_get_page(&data, buffer)) == NULL || + (chunk_offset= translog_get_first_chunk_offset(page)) == 0) + DBUG_RETURN(1); + + /* Puts filled part of old page in the buffer */ + log_descriptor.horizon= last_valid_page; + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + /* + Free space if filled with 0 and first uchar of + real chunk can't be 0 + */ + while (chunk_offset < TRANSLOG_PAGE_SIZE && page[chunk_offset] != '\0') + { + uint16 chunk_length; + if ((chunk_length= + translog_get_total_chunk_length(page, chunk_offset)) == 0) + DBUG_RETURN(1); + DBUG_PRINT("info", ("chunk: offset: %u length: %u", + (uint) chunk_offset, (uint) chunk_length)); + chunk_offset+= chunk_length; + + /* chunk can't cross the page border */ + DBUG_ASSERT(chunk_offset <= TRANSLOG_PAGE_SIZE); + } + memcpy(log_descriptor.buffers->buffer, page, chunk_offset); + 
log_descriptor.bc.buffer->size+= chunk_offset; + log_descriptor.bc.ptr+= chunk_offset; + log_descriptor.bc.current_page_fill= chunk_offset; + log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon, + (chunk_offset + + LSN_OFFSET(last_valid_page))); + DBUG_PRINT("info", ("Move Page #%u: 0x%lx chaser: %d Size: %lu (%lu)", + (uint) log_descriptor.bc.buffer_no, + (ulong) log_descriptor.bc.buffer, + log_descriptor.bc.chaser, + (ulong) log_descriptor.bc.buffer->size, + (ulong) (log_descriptor.bc.ptr - log_descriptor.bc. + buffer->buffer))); + DBUG_EXECUTE("info", translog_check_cursor(&log_descriptor.bc);); + } + if (!old_log_was_recovered && old_flags == flags) + { + LOGHANDLER_FILE_INFO info; + if (translog_read_file_header(&info, log_descriptor.log_file_num[0])) + DBUG_RETURN(1); + version_changed= (info.maria_version != TRANSLOG_VERSION_ID); + } + } + DBUG_PRINT("info", ("Logs found: %d was recovered: %d", + logs_found, old_log_was_recovered)); + if (!logs_found) + { + /* Start new log system from scratch */ + /* Used space */ + log_descriptor.horizon= MAKE_LSN(1, TRANSLOG_PAGE_SIZE); /* header page */ + /* Current logs file number in page cache */ + if ((log_descriptor.log_file_num[0]= + open_logfile_by_number_no_cache(1)) == -1 || + translog_write_file_header()) + DBUG_RETURN(1); + if (ma_control_file_write_and_force(LSN_IMPOSSIBLE, 1, + CONTROL_FILE_UPDATE_ONLY_LOGNO)) + DBUG_RETURN(1); + /* assign buffer 0 */ + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc); + } + else if (old_log_was_recovered || old_flags != flags || version_changed) + { + /* leave the damaged file untouched */ + log_descriptor.horizon+= LSN_ONE_FILE; + /* header page */ + log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon, + TRANSLOG_PAGE_SIZE); + if (translog_create_new_file()) + DBUG_RETURN(1); + /* + Buffer system left untouched after recovery => we should init it + 
(starting from buffer 0) + */ + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc); + } + + /* all LSNs that are on disk are flushed */ + log_descriptor.sent_to_file= + log_descriptor.flushed= log_descriptor.horizon; + log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset; + log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */ + /* + horizon is (potentially) address of the next LSN we need decrease + it to signal that all LSNs before it are flushed + */ + log_descriptor.flushed--; /* offset decreased */ + log_descriptor.sent_to_file--; /* offset decreased */ + /* + Log records will refer to a MARIA_SHARE by a unique 2-byte id; set up + structures for generating 2-byte ids: + */ + my_atomic_rwlock_init(&LOCK_id_to_share); + id_to_share= (MARIA_SHARE **) my_malloc(SHARE_ID_MAX * sizeof(MARIA_SHARE*), + MYF(MY_WME | MY_ZEROFILL)); + if (unlikely(!id_to_share)) + DBUG_RETURN(1); + id_to_share--; /* min id is 1 */ + translog_inited= 1; + DBUG_RETURN(0); +} + + +/* + Free transaction log file buffer + + SYNOPSIS + translog_buffer_destroy() + buffer_no The buffer to free + + NOTE + This buffer should be locked +*/ + +static void translog_buffer_destroy(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_destroy"); + DBUG_PRINT("enter", + ("Buffer #%u: 0x%lx file: %d offset: (%lu,0x%lx) size: %lu", + (uint) buffer->buffer_no, (ulong) buffer, + buffer->file, + LSN_IN_PARTS(buffer->offset), + (ulong) buffer->size)); + DBUG_ASSERT(buffer->waiting_filling_buffer.last_thread == 0); + if (buffer->file != -1) + { + /* + We ignore errors here, because we can't do something about it + (it is shutting down) + */ + translog_buffer_flush(buffer); + } + DBUG_PRINT("info", ("Destroy mutex: 0x%lx", (ulong) &buffer->mutex)); + pthread_mutex_destroy(&buffer->mutex); + DBUG_VOID_RETURN; +} + + +/* + Free log handler resources + + SYNOPSIS + translog_destroy() +*/ + +void 
translog_destroy() +{ + uint i; + DBUG_ENTER("translog_destroy"); + + if (translog_inited) + { + if (log_descriptor.bc.buffer->file != -1) + translog_finish_page(&log_descriptor.horizon, &log_descriptor.bc); + + for (i= 0; i < TRANSLOG_BUFFERS_NO; i++) + { + struct st_translog_buffer *buffer= log_descriptor.buffers + i; + translog_buffer_destroy(buffer); + } + + /* close files */ + for (i= 0; i < OPENED_FILES_NUM; i++) + { + if (log_descriptor.log_file_num[i] != -1) + translog_close_log_file(log_descriptor.log_file_num[i]); + } + pthread_mutex_destroy(&log_descriptor.sent_to_file_lock); + pthread_mutex_destroy(&log_descriptor.file_header_lock); + pthread_mutex_destroy(&log_descriptor.unfinished_files_lock); + pthread_mutex_destroy(&log_descriptor.purger_lock); + pthread_mutex_destroy(&log_descriptor.log_flush_lock); + delete_dynamic(&log_descriptor.unfinished_files); + + my_close(log_descriptor.directory_fd, MYF(MY_WME)); + my_atomic_rwlock_destroy(&LOCK_id_to_share); + my_free((uchar*)(id_to_share + 1), MYF(MY_ALLOW_ZERO_PTR)); + translog_inited= 0; + } + DBUG_VOID_RETURN; +} + + + + +#define translog_buffer_lock_assert_owner(B) \ + safe_mutex_assert_owner(&B->mutex); +void translog_lock_assert_owner() +{ + translog_buffer_lock_assert_owner(log_descriptor.bc.buffer); +} + + +/* + Start new page + + SYNOPSIS + translog_page_next() + horizon \ Position in file and buffer where we are + cursor / + prev_buffer Buffer which should be flushed will be assigned + here if it is need. This is always set. 
+
+  NOTE
+    handler should be locked
+
+  RETURN
+    0  OK
+    1  Error (*prev_buffer is NULL in this case)
+*/
+
+static my_bool translog_page_next(TRANSLOG_ADDRESS *horizon,
+                                  struct st_buffer_cursor *cursor,
+                                  struct st_translog_buffer **prev_buffer)
+{
+  struct st_translog_buffer *buffer= cursor->buffer;
+  DBUG_ENTER("translog_page_next");
+
+  /*
+    Honour the "always set" contract on every path, including the error
+    return below: callers test *prev_buffer right after the call and must
+    never see an uninitialized pointer.
+  */
+  *prev_buffer= NULL;
+
+  /*
+    Switch to the next buffer if one more page does not fit in the write
+    buffer or in the current log file; otherwise just start a new page in
+    the same buffer.
+  */
+  if ((cursor->ptr +TRANSLOG_PAGE_SIZE >
+       cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER) ||
+      (LSN_OFFSET(*horizon) >
+       log_descriptor.log_file_max_size - TRANSLOG_PAGE_SIZE))
+  {
+    DBUG_PRINT("info", ("Switch to next buffer Buffer Size: %lu (%lu) => %d "
+                        "File size: %lu max: %lu => %d",
+                        (ulong) cursor->buffer->size,
+                        (ulong) (cursor->ptr - cursor->buffer->buffer),
+                        (cursor->ptr + TRANSLOG_PAGE_SIZE >
+                         cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER),
+                        (ulong) LSN_OFFSET(*horizon),
+                        (ulong) log_descriptor.log_file_max_size,
+                        (LSN_OFFSET(*horizon) >
+                         (log_descriptor.log_file_max_size -
+                          TRANSLOG_PAGE_SIZE))));
+    if (translog_buffer_next(horizon, cursor,
+                             LSN_OFFSET(*horizon) >
+                             (log_descriptor.log_file_max_size -
+                              TRANSLOG_PAGE_SIZE)))
+      DBUG_RETURN(1);
+    /* the buffer we have just left is the one the caller has to flush */
+    *prev_buffer= buffer;
+    DBUG_PRINT("info", ("Buffer #%u (0x%lx): have to be flushed",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+  }
+  else
+  {
+    DBUG_PRINT("info", ("Use the same buffer #%u (0x%lx): "
+                        "Buffer Size: %lu (%lu)",
+                        (uint) buffer->buffer_no,
+                        (ulong) buffer,
+                        (ulong) cursor->buffer->size,
+                        (ulong) (cursor->ptr - cursor->buffer->buffer)));
+    translog_finish_page(horizon, cursor);
+    translog_new_page_header(horizon, cursor);
+    /* nothing to flush: *prev_buffer stays NULL */
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Write data of given length to the current page
+
+  SYNOPSIS
+    translog_write_data_on_page()
+    horizon              \ Pointers on file and buffer
+    cursor               /
+    length               IN     length of the chunk
+    buffer               buffer with data
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_write_data_on_page(TRANSLOG_ADDRESS *horizon,
+                                           struct st_buffer_cursor *cursor,
+                                           translog_size_t length,
+                                           uchar *buffer)
+{
+
DBUG_ENTER("translog_write_data_on_page");
+  DBUG_PRINT("enter", ("Chunk length: %lu Page size %u",
+                       (ulong) length, (uint) cursor->current_page_fill));
+  DBUG_ASSERT(length > 0);
+  DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE); /* chunk must fit on the current page */
+  DBUG_ASSERT(length + cursor->ptr <=cursor->buffer->buffer +
+              TRANSLOG_WRITE_BUFFER);
+
+  memcpy(cursor->ptr, buffer, length);
+  cursor->ptr+= length;
+  (*horizon)+= length; /* adds offset */
+  cursor->current_page_fill+= length;
+  if (!cursor->chaser) /* the buffer size is bumped only for a non-chaser cursor */
+    cursor->buffer->size+= length;
+  DBUG_PRINT("info", ("Write data buffer #%u: 0x%lx "
+                      "chaser: %d Size: %lu (%lu)",
+                      (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+                      cursor->chaser, (ulong) cursor->buffer->size,
+                      (ulong) (cursor->ptr - cursor->buffer->buffer)));
+  DBUG_EXECUTE("info", translog_check_cursor(cursor););
+
+  DBUG_RETURN(0); /* this is the only return: no error path here */
+}
+
+
+/*
+  Write data from parts of given length to the current page
+
+  SYNOPSIS
+    translog_write_parts_on_page()
+    horizon              \ Pointers on file and buffer
+    cursor               /
+    length               IN     length of the chunk
+    parts                IN/OUT chunk source; parts->current and the
+                                str/length of a partly consumed part are
+                                advanced past the bytes copied
+
+  DESCRIPTION
+    Copies exactly 'length' bytes, taken from consecutive parts starting
+    at parts->current, to cursor->ptr and then advances the horizon, the
+    page fill and (for a non-chaser cursor) the buffer size by 'length'.
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_write_parts_on_page(TRANSLOG_ADDRESS *horizon,
+                                            struct st_buffer_cursor *cursor,
+                                            translog_size_t length,
+                                            struct st_translog_parts *parts)
+{
+  translog_size_t left= length;
+  uint cur= (uint) parts->current;
+  DBUG_ENTER("translog_write_parts_on_page");
+  DBUG_PRINT("enter", ("Chunk length: %lu parts: %u of %u.
Page size: %u "
+                       "Buffer size: %lu (%lu)",
+                       (ulong) length,
+                       (uint) (cur + 1), (uint) parts->elements,
+                       (uint) cursor->current_page_fill,
+                       (ulong) cursor->buffer->size,
+                       (ulong) (cursor->ptr - cursor->buffer->buffer)));
+  DBUG_ASSERT(length > 0);
+  DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE);
+  DBUG_ASSERT(length + cursor->ptr <=cursor->buffer->buffer +
+              TRANSLOG_WRITE_BUFFER);
+
+  do
+  {
+    translog_size_t len;
+    LEX_STRING *part;
+    uchar *buff;
+
+    DBUG_ASSERT(cur < parts->elements);
+    part= parts->parts + cur;
+    buff= (uchar*) part->str; /* remember the start: part->str may be advanced below */
+    DBUG_PRINT("info", ("Part: %u  Length: %lu  left: %lu  buff: 0x%lx",
+                        (uint) (cur + 1), (ulong) part->length, (ulong) left,
+                        (ulong) buff));
+
+    if (part->length > left)
+    {
+      /*
+        Only the first 'left' bytes of the current part are written; the
+        part is shortened so that the next call continues after them
+      */
+      len= left;
+      part->length-= len;
+      part->str+= len;
+      DBUG_PRINT("info", ("Set new part: %u  Length: %lu",
+                          (uint) (cur + 1), (ulong) part->length));
+    }
+    else
+    {
+      len= part->length; /* the whole part is consumed */
+      cur++;
+      DBUG_PRINT("info", ("moved to next part (len: %lu)", (ulong) len));
+    }
+    DBUG_PRINT("info", ("copy: 0x%lx <- 0x%lx  %u",
+                        (ulong) cursor->ptr, (ulong)buff, (uint)len));
+    if (likely(len)) /* a part of zero length is just skipped */
+    {
+      memcpy(cursor->ptr, buff, len);
+      left-= len;
+      cursor->ptr+= len;
+    }
+  } while (left);
+
+  DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)  Length %lu(0x%lx)",
+                      LSN_IN_PARTS(*horizon),
+                      (ulong) length, (ulong) length));
+  parts->current= cur; /* publish how far the parts have been consumed */
+  (*horizon)+= length; /* offset increasing */
+  cursor->current_page_fill+= length;
+  if (!cursor->chaser) /* the buffer size is bumped only for a non-chaser cursor */
+    cursor->buffer->size+= length;
+  DBUG_PRINT("info", ("Write parts buffer #%u: 0x%lx "
+                      "chaser: %d  Size: %lu (%lu)  "
+                      "Horizon: (%lu,0x%lx)  buff offset: 0x%lx",
+                      (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+                      cursor->chaser, (ulong) cursor->buffer->size,
+                      (ulong) (cursor->ptr - cursor->buffer->buffer),
+                      LSN_IN_PARTS(*horizon),
+                      (ulong) (LSN_OFFSET(cursor->buffer->offset) +
+                               cursor->buffer->size)));
+  DBUG_EXECUTE("info",
translog_check_cursor(cursor););
+
+  DBUG_RETURN(0); /* this is the only return: no error path here */
+}
+
+
+/*
+  Put 1 group chunk type 0 header into parts array
+
+  SYNOPSIS
+    translog_write_variable_record_1group_header()
+    parts                Descriptor of record source parts
+    type                 The log record type
+    short_trid           Short transaction ID or 0 if it is not meaningful
+    header_length        Calculated header length of chunk type 0
+    chunk0_header        Buffer for the chunk header writing
+
+  DESCRIPTION
+    Header layout as written below:
+      byte 0                       type | TRANSLOG_CHUNK_LSN
+      bytes 1..2                   short_trid
+      from byte 3                  encoded record length
+      last 2 bytes of the header   0 (marks a 1 group record)
+
+  NOTE
+    The header becomes the part in front of the current one
+    (parts->current is decreased). The part only points at chunk0_header,
+    no copy of the header is made here.
+*/
+
+static void
+translog_write_variable_record_1group_header(struct st_translog_parts *parts,
+                                             enum translog_record_type type,
+                                             SHORT_TRANSACTION_ID short_trid,
+                                             uint16 header_length,
+                                             uchar *chunk0_header)
+{
+  LEX_STRING *part;
+  DBUG_ASSERT(parts->current != 0);     /* first part is left for header */
+  part= parts->parts + (--parts->current);
+  parts->total_record_length+= (part->length= header_length);
+  part->str= (char*)chunk0_header;
+  /* puts chunk type */
+  *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN);
+  int2store(chunk0_header + 1, short_trid);
+  /* puts record length */
+  translog_write_variable_record_1group_code_len(chunk0_header + 3,
+                                                 parts->record_length,
+                                                 header_length);
+  /* puts 0 as chunk length, which indicates a 1 group record */
+  int2store(chunk0_header + header_length - 2, 0);
+}
+
+
+/*
+  Increase number of writers for this buffer
+
+  SYNOPSIS
+    translog_buffer_increase_writers()
+    buffer               target buffer
+*/
+
+static inline void
+translog_buffer_increase_writers(struct st_translog_buffer *buffer)
+{
+  DBUG_ENTER("translog_buffer_increase_writers");
+  buffer->copy_to_buffer_in_progress++;
+  DBUG_PRINT("info", ("copy_to_buffer_in_progress.
Buffer #%u 0x%lx: %d",
+                      (uint) buffer->buffer_no, (ulong) buffer,
+                      buffer->copy_to_buffer_in_progress));
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Decrease number of writers for this buffer
+
+  SYNOPSIS
+    translog_buffer_decrease_writers()
+    buffer               target buffer
+
+  NOTE
+    When the counter drops to 0 and there are threads in
+    buffer->waiting_filling_buffer, that queue is released.
+*/
+
+
+static void translog_buffer_decrease_writers(struct st_translog_buffer *buffer)
+{
+  DBUG_ENTER("translog_buffer_decrease_writers");
+  buffer->copy_to_buffer_in_progress--;
+  DBUG_PRINT("info", ("copy_to_buffer_in_progress. Buffer  #%u 0x%lx: %d",
+                      (uint) buffer->buffer_no, (ulong) buffer,
+                      buffer->copy_to_buffer_in_progress));
+  if (buffer->copy_to_buffer_in_progress == 0 &&
+      buffer->waiting_filling_buffer.last_thread != NULL)
+    wqueue_release_queue(&buffer->waiting_filling_buffer); /* last writer is done: wake up the waiting threads */
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Put chunk 2 from new page beginning
+
+  SYNOPSIS
+    translog_write_variable_record_chunk2_page()
+    parts                Descriptor of record source parts
+    horizon              \ Pointers on file position and buffer
+    cursor               /
+
+  DESCRIPTION
+    Moves to the next page with translog_page_next() (flushing the buffer
+    that was left, if there is one), then writes the 1 byte chunk header
+    TRANSLOG_CHUNK_NOHDR followed by
+    log_descriptor.page_capacity_chunk_2 bytes taken from the parts.
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool
+translog_write_variable_record_chunk2_page(struct st_translog_parts *parts,
+                                           TRANSLOG_ADDRESS *horizon,
+                                           struct st_buffer_cursor *cursor)
+{
+  struct st_translog_buffer *buffer_to_flush;
+  int rc;
+  uchar chunk2_header[1];
+  DBUG_ENTER("translog_write_variable_record_chunk2_page");
+  chunk2_header[0]= TRANSLOG_CHUNK_NOHDR;
+
+  LINT_INIT(buffer_to_flush); /* NOTE(review): buffer_to_flush is otherwise assigned only by translog_page_next() - confirm it is set on its error path too */
+  rc= translog_page_next(horizon, cursor, &buffer_to_flush);
+  if (buffer_to_flush != NULL)
+  {
+    rc|= translog_buffer_lock(buffer_to_flush);
+    translog_buffer_decrease_writers(buffer_to_flush);
+    if (!rc) /* flush only if everything went well so far */
+      rc= translog_buffer_flush(buffer_to_flush);
+    rc|= translog_buffer_unlock(buffer_to_flush);
+  }
+  if (rc)
+    DBUG_RETURN(1);
+
+  /* Puts chunk type */
+  translog_write_data_on_page(horizon, cursor, 1, chunk2_header);
+  /* Puts chunk body */
+  translog_write_parts_on_page(horizon, cursor,
+                               log_descriptor.page_capacity_chunk_2, parts);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Put chunk 3 of requested
length in the buffer from new page beginning
+
+  SYNOPSIS
+    translog_write_variable_record_chunk3_page()
+    parts                Descriptor of record source parts
+    length               Length of this chunk
+    horizon              \ Pointers on file position and buffer
+    cursor               /
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool
+translog_write_variable_record_chunk3_page(struct st_translog_parts *parts,
+                                           uint16 length,
+                                           TRANSLOG_ADDRESS *horizon,
+                                           struct st_buffer_cursor *cursor)
+{
+  struct st_translog_buffer *buffer_to_flush;
+  LEX_STRING *part;
+  int rc;
+  uchar chunk3_header[1 + 2]; /* 1 byte of chunk type + 2 bytes of chunk length */
+  DBUG_ENTER("translog_write_variable_record_chunk3_page");
+
+  LINT_INIT(buffer_to_flush); /* NOTE(review): buffer_to_flush is otherwise assigned only by translog_page_next() - confirm it is set on its error path too */
+  rc= translog_page_next(horizon, cursor, &buffer_to_flush);
+  if (buffer_to_flush != NULL)
+  {
+    rc|= translog_buffer_lock(buffer_to_flush);
+    translog_buffer_decrease_writers(buffer_to_flush);
+    if (!rc) /* flush only if everything went well so far */
+      rc= translog_buffer_flush(buffer_to_flush);
+    rc|= translog_buffer_unlock(buffer_to_flush);
+  }
+  if (rc)
+    DBUG_RETURN(1);
+  if (length == 0)
+  {
+    /* This was a call to write the page header only (no data for chunk 3) */
+    DBUG_PRINT("info", ("It is a call to make page header only"));
+    DBUG_RETURN(0);
+  }
+
+  DBUG_ASSERT(parts->current != 0);     /* first part is left for header */
+  part= parts->parts + (--parts->current);
+  parts->total_record_length+= (part->length= 1 + 2);
+  part->str= (char*)chunk3_header; /* the header part points at the local array; it is consumed by the write below */
+  /* Puts chunk type */
+  *chunk3_header= (uchar) (TRANSLOG_CHUNK_LNGTH);
+  /* Puts chunk length */
+  int2store(chunk3_header + 1, length);
+
+  translog_write_parts_on_page(horizon, cursor, length + 1 + 2, parts); /* header (1 + 2 bytes) + chunk data */
+  DBUG_RETURN(0);
+}
+
+/*
+  Move the log pointer (horizon) by the rest of the current page, the given
+  number of full pages starting from the next page, and the given amount of
+  data (plus the page overhead) on the last page
+
+  SYNOPSIS
+    translog_advance_pointer()
+    pages                Number of full pages starting from the next one
+    last_page_data       Plus this data on the last page
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_advance_pointer(uint pages, uint16 last_page_data)
+{
+  translog_size_t
last_page_offset= (log_descriptor.page_overhead +
+                                     last_page_data);
+  translog_size_t offset= (TRANSLOG_PAGE_SIZE -
+                           log_descriptor.bc.current_page_fill +
+                           pages * TRANSLOG_PAGE_SIZE + last_page_offset); /* total distance to move: rest of this page + full pages + last page */
+  translog_size_t buffer_end_offset, file_end_offset, min_offset;
+  DBUG_ENTER("translog_advance_pointer");
+  DBUG_PRINT("enter", ("Pointer: (%lu, 0x%lx) + %u + %u pages + %u + %u",
+                       LSN_IN_PARTS(log_descriptor.horizon),
+                       (uint) (TRANSLOG_PAGE_SIZE -
+                               log_descriptor.bc.current_page_fill),
+                       pages, (uint) log_descriptor.page_overhead,
+                       (uint) last_page_data));
+
+  for (;;) /* one iteration per switch to the next buffer */
+  {
+    uint8 new_buffer_no;
+    struct st_translog_buffer *new_buffer;
+    struct st_translog_buffer *old_buffer;
+    buffer_end_offset= TRANSLOG_WRITE_BUFFER - log_descriptor.bc.buffer->size; /* room left in the current buffer */
+    file_end_offset= (log_descriptor.log_file_max_size -
+                      LSN_OFFSET(log_descriptor.horizon)); /* room left in the current file */
+    DBUG_PRINT("info", ("offset: %lu  buffer_end_offs: %lu, "
+                        "file_end_offs:  %lu",
+                        (ulong) offset, (ulong) buffer_end_offset,
+                        (ulong) file_end_offset));
+    DBUG_PRINT("info", ("Buff #%u %u (0x%lx) offset 0x%lx + size 0x%lx = "
+                        "0x%lx (0x%lx)",
+                        (uint) log_descriptor.bc.buffer->buffer_no,
+                        (uint) log_descriptor.bc.buffer_no,
+                        (ulong) log_descriptor.bc.buffer,
+                        (ulong) LSN_OFFSET(log_descriptor.bc.buffer->offset),
+                        (ulong) log_descriptor.bc.buffer->size,
+                        (ulong) (LSN_OFFSET(log_descriptor.bc.buffer->offset) +
+                                 log_descriptor.bc.buffer->size),
+                        (ulong) LSN_OFFSET(log_descriptor.horizon)));
+    DBUG_ASSERT(LSN_OFFSET(log_descriptor.bc.buffer->offset) +
+                log_descriptor.bc.buffer->size ==
+                LSN_OFFSET(log_descriptor.horizon));
+
+    if (offset <= buffer_end_offset && offset <= file_end_offset)
+      break; /* the rest fits in the current buffer and file */
+    old_buffer= log_descriptor.bc.buffer;
+    new_buffer_no= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO; /* buffers are used in a ring */
+    new_buffer= log_descriptor.buffers + new_buffer_no;
+
+    translog_buffer_lock(new_buffer); /* NOTE(review): the return value is ignored here */
+    translog_wait_for_buffer_free(new_buffer);
+
+    min_offset= min(buffer_end_offset, file_end_offset); /* what can still be taken from the current buffer */
+    /*
TODO: check is it ptr or size enough */
+    log_descriptor.bc.buffer->size+= min_offset;
+    log_descriptor.bc.ptr+= min_offset;
+    DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx  chaser: %d  Size: %lu (%lu)",
+                        (uint) log_descriptor.bc.buffer->buffer_no,
+                        (ulong) log_descriptor.bc.buffer,
+                        log_descriptor.bc.chaser,
+                        (ulong) log_descriptor.bc.buffer->size,
+                        (ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
+                                 buffer->buffer)));
+    DBUG_ASSERT((ulong) (log_descriptor.bc.ptr -
+                         log_descriptor.bc.buffer->buffer) ==
+                log_descriptor.bc.buffer->size);
+    DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+                log_descriptor.bc.buffer_no);
+    translog_buffer_increase_writers(log_descriptor.bc.buffer); /* the reserved space of the old buffer is still to be filled */
+
+    if (file_end_offset <= buffer_end_offset) /* the file ends first: continue in a new log file */
+    {
+      log_descriptor.horizon+= LSN_ONE_FILE;
+      log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon,
+                                                 TRANSLOG_PAGE_SIZE);
+      DBUG_PRINT("info", ("New file: %lu",
+                          (ulong) LSN_FILE_NO(log_descriptor.horizon)));
+      if (translog_create_new_file())
+      {
+        DBUG_RETURN(1); /* NOTE(review): new_buffer locked above is not unlocked on this path - confirm */
+      }
+    }
+    else
+    {
+      DBUG_PRINT("info", ("The same file"));
+      log_descriptor.horizon+= min_offset; /* offset increasing */
+    }
+    translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no);
+    old_buffer->next_buffer_offset= new_buffer->offset;
+    if (translog_buffer_unlock(old_buffer))
+      DBUG_RETURN(1);
+    offset-= min_offset; /* this much has been taken from the old buffer */
+  }
+  log_descriptor.bc.ptr+= offset;
+  log_descriptor.bc.buffer->size+= offset;
+  translog_buffer_increase_writers(log_descriptor.bc.buffer);
+  log_descriptor.horizon+= offset; /* offset increasing */
+  log_descriptor.bc.current_page_fill= last_page_offset;
+  DBUG_PRINT("info", ("drop write_counter"));
+  log_descriptor.bc.write_counter= 0;
+  log_descriptor.bc.previous_offset= 0;
+  DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx  chaser: %d  Size: %lu (%lu)  "
+                      "offset: %u  last page: %u",
+                      (uint) log_descriptor.bc.buffer->buffer_no,
+                      (ulong) log_descriptor.bc.buffer,
+                      log_descriptor.bc.chaser,
+                      (ulong) log_descriptor.bc.buffer->size,
+
(ulong) (log_descriptor.bc.ptr -
+                               log_descriptor.bc.buffer->
+                               buffer), (uint) offset,
+                      (uint) last_page_offset));
+  DBUG_PRINT("info",
+             ("pointer moved to: (%lu, 0x%lx)",
+              LSN_IN_PARTS(log_descriptor.horizon)));
+  DBUG_EXECUTE("info", translog_check_cursor(&log_descriptor.bc););
+  log_descriptor.bc.protected= 0;
+  DBUG_RETURN(0);
+}
+
+
+
+/*
+  Get page rest
+
+  SYNOPSIS
+    translog_get_current_page_rest()
+
+  NOTE loghandler should be locked
+
+  RETURN
+    number of bytes left on the current page
+    (TRANSLOG_PAGE_SIZE minus the current page fill)
+*/
+
+#define translog_get_current_page_rest() \
+  (TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill)
+
+/*
+  Get buffer rest in full pages
+
+  SYNOPSIS
+     translog_get_current_buffer_rest()
+
+  NOTE loghandler should be locked
+
+  RETURN
+    number of full pages left on the current buffer
+    (counted from the current write position, log_descriptor.bc.ptr)
+*/
+
+#define translog_get_current_buffer_rest() \
+  ((log_descriptor.bc.buffer->buffer + TRANSLOG_WRITE_BUFFER - \
+    log_descriptor.bc.ptr) / \
+   TRANSLOG_PAGE_SIZE)
+
+/*
+  Calculate possible group size without first (current) page
+
+  SYNOPSIS
+    translog_get_current_group_size()
+
+  DESCRIPTION
+    The size is the number of full pages left in the current buffer
+    multiplied by log_descriptor.page_capacity_chunk_2. If that is less
+    than log_descriptor.half_buffer_capacity_chunk_2, the capacity of one
+    more buffer (log_descriptor.buffer_capacity_chunk_2) is added.
+
+  NOTE loghandler should be locked
+
+  RETURN
+    group size without first (current) page
+*/
+
+static translog_size_t translog_get_current_group_size()
+{
+  /* buffer rest in full pages */
+  translog_size_t buffer_rest= translog_get_current_buffer_rest();
+  DBUG_ENTER("translog_get_current_group_size");
+  DBUG_PRINT("info", ("buffer_rest in pages: %u", buffer_rest));
+
+  buffer_rest*= log_descriptor.page_capacity_chunk_2; /* pages -> bytes of chunk 2 payload */
+  /* if less than half of a buffer is free we can write this and the next buffer */
+  if (buffer_rest < log_descriptor.half_buffer_capacity_chunk_2)
+  {
+    DBUG_PRINT("info", ("buffer_rest: %lu -> add %lu",
+                        (ulong) buffer_rest,
+                        (ulong) log_descriptor.buffer_capacity_chunk_2));
+    buffer_rest+= log_descriptor.buffer_capacity_chunk_2;
+  }
+
+  DBUG_PRINT("info", ("buffer_rest: %lu", (ulong) buffer_rest));
+
+  DBUG_RETURN(buffer_rest);
+}
+
+
+/*
+  Write variable record in 1 group
+
+
SYNOPSIS
+    translog_write_variable_record_1group()
+    lsn                  LSN of the record will be written here
+    type                 the log record type
+    short_trid           Short transaction ID or 0 if it has no sense
+    parts                Descriptor of record source parts
+    buffer_to_flush      Buffer which have to be flushed if it is not 0
+    header_length        Calculated header length of chunk type 0
+    trn                  Transaction structure pointer for hooks by
+                         record log type, for short_id
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool
+translog_write_variable_record_1group(LSN *lsn,
+                                      enum translog_record_type type,
+                                      MARIA_HA *tbl_info,
+                                      SHORT_TRANSACTION_ID short_trid,
+                                      struct st_translog_parts *parts,
+                                      struct st_translog_buffer
+                                      *buffer_to_flush, uint16 header_length,
+                                      TRN *trn)
+{
+  TRANSLOG_ADDRESS horizon;
+  struct st_buffer_cursor cursor;
+  int rc= 0;
+  uint i;
+  translog_size_t record_rest, full_pages, first_page;
+  uint additional_chunk3_page= 0;
+  uchar chunk0_header[1 + 2 + 5 + 2];
+  DBUG_ENTER("translog_write_variable_record_1group");
+
+  *lsn= horizon= log_descriptor.horizon; /* the record gets the current horizon as its LSN */
+  if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn),
+                                 *lsn, TRUE) ||
+      (log_record_type_descriptor[type].inwrite_hook &&
+       (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info,
+                                                        lsn, parts)))
+  {
+    translog_unlock();
+    DBUG_RETURN(1); /* NOTE(review): buffer_to_flush, if any, is neither flushed nor unlocked on this path - confirm */
+  }
+  cursor= log_descriptor.bc; /* private copy of the cursor, used after the log handler is unlocked */
+  cursor.chaser= 1;
+
+  /* Advance the pointer to be able to unlock the log handler */
+  first_page= translog_get_current_page_rest();
+  record_rest= parts->record_length - (first_page - header_length); /* what does not fit on the first page */
+  full_pages= record_rest / log_descriptor.page_capacity_chunk_2;
+  record_rest= (record_rest % log_descriptor.page_capacity_chunk_2);
+
+  if (record_rest + 1 == log_descriptor.page_capacity_chunk_2)
+  {
+    DBUG_PRINT("info", ("2 chunks type 3 is needed"));
+    /* We will write 2 chunks of type 3 at the end of this group */
+    additional_chunk3_page= 1;
+    record_rest= 1;
+  }
+
+  DBUG_PRINT("info", ("first_page: %u (%u)  full_pages: %u (%lu)  "
+                      "additional:
%u (%u)  rest %u = %u",
+                      first_page, first_page - header_length,
+                      full_pages,
+                      (ulong) full_pages *
+                      log_descriptor.page_capacity_chunk_2,
+                      additional_chunk3_page,
+                      additional_chunk3_page *
+                      (log_descriptor.page_capacity_chunk_2 - 1),
+                      record_rest, parts->record_length));
+  /* record_rest + 3 is chunk type 3 overhead + record_rest */
+  rc|= translog_advance_pointer(full_pages + additional_chunk3_page,
+                                (record_rest ? record_rest + 3 : 0));
+  log_descriptor.bc.buffer->last_lsn= *lsn;
+
+  rc|= translog_unlock();
+
+  /*
+     Check if we switched buffer and need to process it (the current buffer
+     is unlocked already => we will not delay other threads)
+  */
+  if (buffer_to_flush != NULL)
+  {
+    if (!rc) /* flush only if everything went well so far */
+      rc= translog_buffer_flush(buffer_to_flush);
+    rc|= translog_buffer_unlock(buffer_to_flush);
+  }
+  if (rc)
+    DBUG_RETURN(1);
+
+  translog_write_variable_record_1group_header(parts, type, short_trid,
+                                               header_length, chunk0_header);
+
+  /* fill the rest of the first page (chunk 0 header + start of the record) */
+  translog_write_parts_on_page(&horizon, &cursor, first_page, parts);
+
+
+  DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.horizon),
+                      LSN_IN_PARTS(horizon)));
+
+  for (i= 0; i < full_pages; i++) /* one chunk of type 2 per full page */
+  {
+    if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor))
+      DBUG_RETURN(1);
+
+    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx)",
+                        LSN_IN_PARTS(log_descriptor.horizon),
+                        LSN_IN_PARTS(horizon)));
+  }
+
+  if (additional_chunk3_page)
+  {
+    if (translog_write_variable_record_chunk3_page(parts,
+                                                   log_descriptor.
+                                                   page_capacity_chunk_2 - 2,
+                                                   &horizon, &cursor))
+      DBUG_RETURN(1);
+    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx)",
+                        LSN_IN_PARTS(log_descriptor.horizon),
+                        LSN_IN_PARTS(horizon)));
+    DBUG_ASSERT(cursor.current_page_fill == TRANSLOG_PAGE_SIZE); /* the additional chunk 3 is expected to fill its page completely */
+  }
+
+  if (translog_write_variable_record_chunk3_page(parts,
+                                                 record_rest,
+                                                 &horizon, &cursor))
+    DBUG_RETURN(1);
+    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx)",
+                        (ulong) LSN_FILE_NO(log_descriptor.horizon),
+                        (ulong) LSN_OFFSET(log_descriptor.horizon),
+                        (ulong) LSN_FILE_NO(horizon),
+                        (ulong) LSN_OFFSET(horizon)));
+
+  if (!(rc= translog_buffer_lock(cursor.buffer)))
+  {
+    /*
+       Check if we wrote something on the 1st, not full, page and need to
+       reconstruct CRC and sector protection
+    */
+    translog_buffer_decrease_writers(cursor.buffer);
+  }
+  rc|= translog_buffer_unlock(cursor.buffer); /* NOTE(review): unlock is attempted even when the lock above failed */
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  Write variable record in 1 chunk
+
+  SYNOPSIS
+    translog_write_variable_record_1chunk()
+    lsn                  LSN of the record will be written here
+    type                 the log record type
+    tbl_info             passed through to the inwrite_hook of the record
+                         type, if that hook is set
+    short_trid           Short transaction ID or 0 if it is not meaningful
+    parts                Descriptor of record source parts
+    buffer_to_flush      Buffer that has to be flushed, if it is not NULL
+    header_length        Calculated header length of chunk type 0
+    trn                  Transaction structure pointer for hooks by
+                         record log type, for short_id
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool
+translog_write_variable_record_1chunk(LSN *lsn,
+                                      enum translog_record_type type,
+                                      MARIA_HA *tbl_info,
+                                      SHORT_TRANSACTION_ID short_trid,
+                                      struct st_translog_parts *parts,
+                                      struct st_translog_buffer
+                                      *buffer_to_flush, uint16 header_length,
+                                      TRN *trn)
+{
+  int rc;
+  uchar chunk0_header[1 + 2 + 5 + 2];
+  DBUG_ENTER("translog_write_variable_record_1chunk");
+
+  translog_write_variable_record_1group_header(parts, type, short_trid,
+                                               header_length, chunk0_header);
+
+  *lsn= log_descriptor.horizon; /* the record gets the current horizon as its LSN */
+  if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn),
LSN_FILE_NO(*lsn),
+                                 *lsn, TRUE) ||
+      (log_record_type_descriptor[type].inwrite_hook &&
+       (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info,
+                                                        lsn, parts)))
+  {
+    translog_unlock();
+    DBUG_RETURN(1); /* NOTE(review): buffer_to_flush, if any, is neither flushed nor unlocked on this path - confirm */
+  }
+
+  rc= translog_write_parts_on_page(&log_descriptor.horizon,
+                                   &log_descriptor.bc,
+                                   parts->total_record_length, parts);
+  log_descriptor.bc.buffer->last_lsn= *lsn;
+  rc|= translog_unlock();
+
+  /*
+     Check if we switched buffer and need to process it (the current buffer
+     is unlocked already => we will not delay other threads)
+  */
+  if (buffer_to_flush != NULL)
+  {
+    if (!rc) /* flush only if everything went well so far */
+      rc= translog_buffer_flush(buffer_to_flush);
+    rc|= translog_buffer_unlock(buffer_to_flush);
+  }
+
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  Calculate and write LSN difference (compressed LSN)
+
+  SYNOPSIS
+    translog_put_LSN_diff()
+    base_lsn             LSN from which we calculate difference
+    lsn                  LSN to encode
+    dst                  Result will be written to dst[-pack_length] .. dst[-1]
+
+  NOTE:
+    To store an LSN in a compact way we will use the following compression:
+
+    If a log record has LSN1, and it contains the LSN2 as a back reference,
+    Instead of LSN2 we write LSN1-LSN2, encoded as:
+
+     two bits     the number N (see below)
+     14 bits
+     N bytes
+
+    That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2
+    is stored in the first two bits.
+
+    When the LSNs are in different files and the difference of the file
+    numbers does not fit in 6 bits (> 0x3f), the two bytes 0, 1 are written
+    followed by the full LSN (2 + LSN_STORE_SIZE bytes in total).
+
+  RETURN
+    #     pointer on coded LSN
+    NULL  Error
+*/
+
+static uchar *translog_put_LSN_diff(LSN base_lsn, LSN lsn, uchar *dst)
+{
+  DBUG_ENTER("translog_put_LSN_diff");
+  DBUG_PRINT("enter", ("Base: (0x%lu,0x%lx)  val: (0x%lu,0x%lx)  dst: 0x%lx",
+                       LSN_IN_PARTS(base_lsn), LSN_IN_PARTS(lsn),
+                       (ulong) dst));
+  if (LSN_FILE_NO(base_lsn) == LSN_FILE_NO(lsn))
+  {
+    uint32 diff;
+    DBUG_ASSERT(base_lsn > lsn);
+    diff= base_lsn - lsn;
+    DBUG_PRINT("info", ("File is the same. Diff: 0x%lx", (ulong) diff));
+    if (diff <= 0x3FFF)
+    {
+      dst-= 2;
+      /*
+        Note we store this high uchar first to ensure that first uchar has
+        0 in the 3 upper bits.
+      */
+      dst[0]= diff >> 8; /* diff <= 0x3FFF, so the 2 upper bits (the length code) are 0 */
+      dst[1]= (diff & 0xFF);
+    }
+    else if (diff <= 0x3FFFFF)
+    {
+      dst-= 3;
+      dst[0]= 0x40 | (diff >> 16); /* length code 1: 3 bytes in total */
+      int2store(dst + 1, diff & 0xFFFF);
+    }
+    else if (diff <= 0x3FFFFFFF)
+    {
+      dst-= 4;
+      dst[0]= 0x80 | (diff >> 24); /* length code 2: 4 bytes in total */
+      int3store(dst + 1, diff & 0xFFFFFF);
+    }
+    else
+    {
+      dst-= 5;
+      dst[0]= 0xC0; /* length code 3 with file difference 0: 5 bytes in total */
+      int4store(dst + 1, diff);
+    }
+  }
+  else
+  {
+    uint32 diff;
+    uint32 offset_diff;
+    ulonglong base_offset= LSN_OFFSET(base_lsn);
+    DBUG_ASSERT(base_lsn > lsn);
+    diff= LSN_FILE_NO(base_lsn) - LSN_FILE_NO(lsn);
+    DBUG_PRINT("info", ("File is different. Diff: 0x%lx", (ulong) diff));
+
+    if (base_offset < LSN_OFFSET(lsn))
+    {
+      /* borrow 1 from the file number difference */
+      diff--;
+      base_offset+= LL(0x100000000);
+    }
+    offset_diff= base_offset - LSN_OFFSET(lsn);
+    if (diff > 0x3f)
+    {
+      /*
+        The file difference does not fit in 6 bits: store the full LSN
+        after the special diff 1 (which is impossible in real life)
+      */
+      dst-= 2 + LSN_STORE_SIZE;
+      dst[0]= 0;
+      dst[1]= 1;
+      lsn_store(dst + 2, lsn);
+    }
+    else
+    {
+      dst-= 5;
+      *dst= (0xC0 | diff); /* length code 3 + file number difference in the 6 lower bits */
+      int4store(dst + 1, offset_diff);
+    }
+  }
+  DBUG_PRINT("info", ("new dst: 0x%lx", (ulong) dst));
+  DBUG_RETURN(dst); /* this is the only return: NULL is never returned here */
+}
+
+
+/*
+  Get LSN from LSN-difference (compressed LSN)
+
+  SYNOPSIS
+    translog_get_LSN_from_diff()
+    base_lsn             LSN from which we calculate difference
+    src                  pointer to coded lsn
+    dst                  pointer to buffer where to write 7byte LSN
+
+  NOTE:
+    To store an LSN in a compact way we will use the following compression:
+
+    If a log record has LSN1, and it contains the lSN2 as a back reference,
+    Instead of LSN2 we write LSN1-LSN2, encoded as:
+
+     two bits     the number N (see below)
+     14 bits
+     N bytes
+
+    That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2
+    is stored in the first two bits.
+
+  RETURN
+    pointer to buffer after decoded LSN
+*/
+
+static uchar *translog_get_LSN_from_diff(LSN base_lsn, uchar *src, uchar *dst)
+{
+  LSN lsn;
+  uint32 diff;
+  uint32 first_byte;
+  uint32 file_no, rec_offset;
+  uint8 code;
+  DBUG_ENTER("translog_get_LSN_from_diff");
+  DBUG_PRINT("enter", ("Base: (0x%lx,0x%lx)  src: 0x%lx  dst 0x%lx",
+                       LSN_IN_PARTS(base_lsn), (ulong) src, (ulong) dst));
+  first_byte= *((uint8*) src);
+  code= first_byte >> 6; /* Length is in 2 most significant bits */
+  first_byte&= 0x3F; /* the 6 lower bits carry data */
+  src++;                                        /* Skip length + encode */
+  file_no= LSN_FILE_NO(base_lsn);               /* Assume relative */
+  DBUG_PRINT("info", ("code: %u  first byte: %lu",
+                      (uint) code, (ulong) first_byte));
+  switch (code) {
+  case 0:
+    if (first_byte == 0 && *((uint8*)src) == 1)
+    {
+      /*
+        It is the full LSN stored after the special diff 1 (which is
+        impossible in real life): copy it as is
+      */
+      memcpy(dst, src + 1, LSN_STORE_SIZE);
+      DBUG_PRINT("info", ("Special case of full LSN, new src: 0x%lx",
+                          (ulong) (src + 1 + LSN_STORE_SIZE)));
+      DBUG_RETURN(src + 1 + LSN_STORE_SIZE);
+    }
+    rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 8) + *((uint8*)src));
+    break;
+  case 1:
+    diff= uint2korr(src);
+    rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 16) + diff);
+    break;
+  case 2:
+    diff= uint3korr(src);
+    rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 24) + diff);
+    break;
+  case 3:
+  {
+    ulonglong base_offset= LSN_OFFSET(base_lsn);
+    diff= uint4korr(src);
+    if (diff > LSN_OFFSET(base_lsn))
+    {
+      /* borrow 1 from the file number difference */
+      first_byte++;
+      base_offset+= LL(0x100000000);
+    }
+    file_no= LSN_FILE_NO(base_lsn) - first_byte; /* here first_byte is the file number difference */
+    rec_offset= base_offset - diff;
+    break;
+  }
+  default: /* not reachable: code is a 2 bit value */
+    DBUG_ASSERT(0);
+    DBUG_RETURN(NULL);
+  }
+  lsn= MAKE_LSN(file_no, rec_offset);
+  src+= code + 1; /* code + 1 bytes follow the first byte */
+  lsn_store(dst, lsn);
+  DBUG_PRINT("info", ("new src: 0x%lx", (ulong) src));
+  DBUG_RETURN(src); /* points just past the coded LSN in the source */
+}
+
+
+/*
+  Encode relative LSNs listed in the parameters
+
+  SYNOPSIS
+    translog_relative_LSN_encode()
+    parts                Parts list with encoded
LSN(s)
+    base_lsn             LSN which is base for encoding
+    lsns                 number of LSN(s) to encode
+    compressed_LSNs      buffer which can be used for storing compressed LSN(s)
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_relative_LSN_encode(struct st_translog_parts *parts,
+                                            LSN base_lsn,
+                                            uint lsns, uchar *compressed_LSNs)
+{
+  LEX_STRING *part;
+  uint lsns_len= lsns * LSN_STORE_SIZE;
+  char buffer_src[MAX_NUMBER_OF_LSNS_PER_RECORD * LSN_STORE_SIZE]; /* NOTE(review): assumes lsns <= MAX_NUMBER_OF_LSNS_PER_RECORD; not checked here */
+  char *buffer= buffer_src;
+
+  DBUG_ENTER("translog_relative_LSN_encode");
+
+  DBUG_ASSERT(parts->current != 0);
+  part= parts->parts + parts->current;
+
+  /* gather the LSN(s) into one contiguous buffer if they are split over several parts */
+  if (part->length < lsns_len)
+  {
+    uint copied= part->length;
+    LEX_STRING *next_part;
+    DBUG_PRINT("info", ("Using buffer: 0x%lx", (ulong) compressed_LSNs));
+    memcpy(buffer, (uchar*)part->str, part->length);
+    next_part= parts->parts + parts->current + 1;
+    do
+    {
+      DBUG_ASSERT(next_part < parts->parts + parts->elements);
+      if ((next_part->length + copied) < lsns_len)
+      {
+        memcpy(buffer + copied, (uchar*)next_part->str,
+               next_part->length);
+        copied+= next_part->length;
+        next_part->length= 0; next_part->str= 0; /* this part is consumed completely */
+        /* delete_dynamic_element(&parts->parts, parts->current + 1); */
+        next_part++;
+        parts->current++;
+        part= parts->parts + parts->current;
+      }
+      else
+      {
+        uint len= lsns_len - copied; /* only the first len bytes of this part belong to the LSN(s) */
+        memcpy(buffer + copied, (uchar*)next_part->str, len);
+        copied= lsns_len;
+        next_part->str+= len;
+        next_part->length-= len;
+      }
+    } while (copied < lsns_len);
+  }
+  else
+  {
+    buffer= part->str; /* the LSN(s) are contiguous: use them in place */
+    part->str+= lsns_len;
+    part->length-= lsns_len;
+    parts->current--;
+    part= parts->parts + parts->current;
+  }
+
+  {
+    /*
+      Compress the LSNs from the last one to the first one; the output
+      grows downwards from the end of compressed_LSNs
+    */
+    LSN ref;
+    int economy;
+    uchar *src_ptr;
+    uchar *dst_ptr= compressed_LSNs + (MAX_NUMBER_OF_LSNS_PER_RECORD *
+                                      COMPRESSED_LSN_MAX_STORE_SIZE);
+    for (src_ptr= buffer + lsns_len - LSN_STORE_SIZE;
+         src_ptr >= (uchar*) buffer;
+         src_ptr-= LSN_STORE_SIZE)
+    {
+      ref=
lsn_korr(src_ptr);
+      if ((dst_ptr= translog_put_LSN_diff(base_lsn, ref, dst_ptr)) == NULL)
+        DBUG_RETURN(1);
+    }
+    part->length= (uint)((compressed_LSNs +
+                          (MAX_NUMBER_OF_LSNS_PER_RECORD *
+                           COMPRESSED_LSN_MAX_STORE_SIZE)) -
+                         dst_ptr); /* bytes used at the end of compressed_LSNs */
+    parts->record_length-= (economy= lsns_len - part->length); /* economy = bytes saved by the compression */
+    DBUG_PRINT("info", ("new length of LSNs: %lu  economy: %d",
+                        (ulong)part->length, economy));
+    parts->total_record_length-= economy;
+    part->str= (char*)dst_ptr; /* the part now refers to the compressed LSN(s) */
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Write multi-group variable-size record
+
+  SYNOPSIS
+    translog_write_variable_record_mgroup()
+    lsn                  LSN of the record will be written here
+    type                 the log record type
+    short_trid           Short transaction ID or 0 if it has no sense
+    parts                Descriptor of record source parts
+    buffer_to_flush      Buffer which have to be flushed if it is not 0
+    header_length        Header length calculated for 1 group
+    buffer_rest          Beginning from which we plan to write in full pages
+    trn                  Transaction structure pointer for hooks by
+                         record log type, for short_id
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool
+translog_write_variable_record_mgroup(LSN *lsn,
+                                      enum translog_record_type type,
+                                      MARIA_HA *tbl_info,
+                                      SHORT_TRANSACTION_ID short_trid,
+                                      struct st_translog_parts *parts,
+                                      struct st_translog_buffer
+                                      *buffer_to_flush,
+                                      uint16 header_length,
+                                      translog_size_t buffer_rest,
+                                      TRN *trn)
+{
+  TRANSLOG_ADDRESS horizon;
+  struct st_buffer_cursor cursor;
+  int rc= 0;
+  uint i, chunk2_page, full_pages;
+  uint curr_group= 0;
+  translog_size_t record_rest, first_page, chunk3_pages, chunk0_pages= 1;
+  translog_size_t done= 0;
+  struct st_translog_group_descriptor group;
+  DYNAMIC_ARRAY groups;
+  uint16 chunk3_size;
+  uint16 page_capacity= log_descriptor.page_capacity_chunk_2 + 1;
+  uint16 last_page_capacity;
+  my_bool new_page_before_chunk0= 1, first_chunk0= 1;
+  uchar chunk0_header[1 + 2 + 5 + 2 + 2], group_desc[7 + 1];
+  uchar chunk2_header[1];
+  uint header_fixed_part= header_length + 2;
+  uint
groups_per_page= (page_capacity - header_fixed_part) / (7 + 1); + uint file_of_the_first_group; + DBUG_ENTER("translog_write_variable_record_mgroup"); + + chunk2_header[0]= TRANSLOG_CHUNK_NOHDR; + + if (init_dynamic_array(&groups, sizeof(struct st_translog_group_descriptor), + 10, 10 CALLER_INFO)) + { + translog_unlock(); + UNRECOVERABLE_ERROR(("init array failed")); + DBUG_RETURN(1); + } + + first_page= translog_get_current_page_rest(); + record_rest= parts->record_length - (first_page - 1); + DBUG_PRINT("info", ("Record Rest: %lu", (ulong) record_rest)); + + if (record_rest < buffer_rest) + { + DBUG_PRINT("info", ("too many free space because changing header")); + buffer_rest-= log_descriptor.page_capacity_chunk_2; + DBUG_ASSERT(record_rest >= buffer_rest); + } + + file_of_the_first_group= LSN_FILE_NO(log_descriptor.horizon); + translog_mark_file_unfinished(file_of_the_first_group); + do + { + group.addr= horizon= log_descriptor.horizon; + cursor= log_descriptor.bc; + cursor.chaser= 1; + if ((full_pages= buffer_rest / log_descriptor.page_capacity_chunk_2) > 255) + { + /* sizeof(uint8) == 256 is max number of chunk in multi-chunks group */ + full_pages= 255; + buffer_rest= full_pages * log_descriptor.page_capacity_chunk_2; + } + /* + group chunks = + full pages + first page (which actually can be full, too). 
+ But here we assign number of chunks - 1 + */ + group.num= full_pages; + if (insert_dynamic(&groups, (uchar*) &group)) + { + UNRECOVERABLE_ERROR(("insert into array failed")); + goto err_unlock; + } + + DBUG_PRINT("info", ("chunk: #%u first_page: %u (%u) " + "full_pages: %lu (%lu) " + "Left %lu", + groups.elements, + first_page, first_page - 1, + (ulong) full_pages, + (ulong) (full_pages * + log_descriptor.page_capacity_chunk_2), + (ulong)(parts->record_length - (first_page - 1 + + buffer_rest) - + done))); + rc|= translog_advance_pointer(full_pages, 0); + + rc|= translog_unlock(); + + if (buffer_to_flush != NULL) + { + rc|= translog_buffer_lock(buffer_to_flush); + translog_buffer_decrease_writers(buffer_to_flush); + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + rc|= translog_buffer_unlock(buffer_to_flush); + buffer_to_flush= NULL; + } + if (rc) + { + UNRECOVERABLE_ERROR(("flush of unlock buffer failed")); + goto err; + } + + translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header); + translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts); + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) " + "Left %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) - + done))); + + for (i= 0; i < full_pages; i++) + { + if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor)) + goto err; + + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) " + "local: (%lu,0x%lx) " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) - + i * log_descriptor.page_capacity_chunk_2 - + done))); + } + + done+= (first_page - 1 + buffer_rest); + + /* TODO: make separate function for following */ + rc= translog_page_next(&horizon, &cursor, &buffer_to_flush); + if (buffer_to_flush != NULL) + { + rc|= translog_buffer_lock(buffer_to_flush); + 
translog_buffer_decrease_writers(buffer_to_flush); + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + rc|= translog_buffer_unlock(buffer_to_flush); + buffer_to_flush= NULL; + } + if (rc) + { + UNRECOVERABLE_ERROR(("flush of unlock buffer failed")); + goto err; + } + rc= translog_buffer_lock(cursor.buffer); + if (!rc) + translog_buffer_decrease_writers(cursor.buffer); + rc|= translog_buffer_unlock(cursor.buffer); + if (rc) + goto err; + + translog_lock(); + + first_page= translog_get_current_page_rest(); + buffer_rest= translog_get_current_group_size(); + } while (first_page + buffer_rest < (uint) (parts->record_length - done)); + + group.addr= horizon= log_descriptor.horizon; + cursor= log_descriptor.bc; + cursor.chaser= 1; + group.num= 0; /* 0 because it does not matter */ + if (insert_dynamic(&groups, (uchar*) &group)) + { + UNRECOVERABLE_ERROR(("insert into array failed")); + goto err_unlock; + } + record_rest= parts->record_length - done; + DBUG_PRINT("info", ("Record rest: %lu", (ulong) record_rest)); + if (first_page <= record_rest + 1) + { + chunk2_page= 1; + record_rest-= (first_page - 1); + full_pages= record_rest / log_descriptor.page_capacity_chunk_2; + record_rest= (record_rest % log_descriptor.page_capacity_chunk_2); + last_page_capacity= page_capacity; + } + else + { + chunk2_page= full_pages= 0; + last_page_capacity= first_page; + } + chunk3_size= 0; + chunk3_pages= 0; + if (last_page_capacity > record_rest + 1 && record_rest != 0) + { + if (last_page_capacity > + record_rest + header_fixed_part + groups.elements * (7 + 1)) + { + /* 1 record of type 0 */ + chunk3_pages= 0; + } + else + { + chunk3_pages= 1; + if (record_rest + 2 == last_page_capacity) + { + chunk3_size= record_rest - 1; + record_rest= 1; + } + else + { + chunk3_size= record_rest; + record_rest= 0; + } + } + } + /* + A first non-full page will hold type 0 chunk only if it fit in it with + all its headers + */ + while (page_capacity < + record_rest + header_fixed_part + + 
(groups.elements - groups_per_page * (chunk0_pages - 1)) * (7 + 1)) + chunk0_pages++; + DBUG_PRINT("info", ("chunk0_pages: %u groups %u groups per full page: %u " + "Group on last page: %u", + chunk0_pages, groups.elements, + groups_per_page, + (groups.elements - + ((page_capacity - header_fixed_part) / (7 + 1)) * + (chunk0_pages - 1)))); + DBUG_PRINT("info", ("first_page: %u chunk2: %u full_pages: %u (%lu) " + "chunk3: %u (%u) rest: %u", + first_page, + chunk2_page, full_pages, + (ulong) full_pages * + log_descriptor.page_capacity_chunk_2, + chunk3_pages, (uint) chunk3_size, (uint) record_rest)); + rc= translog_advance_pointer(full_pages + chunk3_pages + + (chunk0_pages - 1), + record_rest + header_fixed_part + + (groups.elements - + ((page_capacity - + header_fixed_part) / (7 + 1)) * + (chunk0_pages - 1)) * (7 + 1)); + rc|= translog_unlock(); + if (rc) + goto err; + + if (chunk2_page) + { + DBUG_PRINT("info", ("chunk 2 to finish first page")); + translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header); + translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts); + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) - + done))); + } + else if (chunk3_pages) + { + DBUG_PRINT("info", ("chunk 3")); + DBUG_ASSERT(full_pages == 0); + uchar chunk3_header[3]; + chunk3_pages= 0; + chunk3_header[0]= TRANSLOG_CHUNK_LNGTH; + int2store(chunk3_header + 1, chunk3_size); + translog_write_data_on_page(&horizon, &cursor, 3, chunk3_header); + translog_write_parts_on_page(&horizon, &cursor, chunk3_size, parts); + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - chunk3_size - done))); + } + else + { + DBUG_PRINT("info", ("no new_page_before_chunk0")); + new_page_before_chunk0= 0; 
+ } + + for (i= 0; i < full_pages; i++) + { + DBUG_ASSERT(chunk2_page != 0); + if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor)) + goto err; + + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx) " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) - + i * log_descriptor.page_capacity_chunk_2 - + done))); + } + + if (chunk3_pages && + translog_write_variable_record_chunk3_page(parts, + chunk3_size, + &horizon, &cursor)) + goto err; + DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx) local: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon))); + + *chunk0_header= (uchar) (type |TRANSLOG_CHUNK_LSN); + int2store(chunk0_header + 1, short_trid); + translog_write_variable_record_1group_code_len(chunk0_header + 3, + parts->record_length, + header_length); + do + { + int limit; + if (new_page_before_chunk0) + { + rc= translog_page_next(&horizon, &cursor, &buffer_to_flush); + if (buffer_to_flush != NULL) + { + rc|= translog_buffer_lock(buffer_to_flush); + translog_buffer_decrease_writers(buffer_to_flush); + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + rc|= translog_buffer_unlock(buffer_to_flush); + buffer_to_flush= NULL; + } + if (rc) + { + UNRECOVERABLE_ERROR(("flush of unlock buffer failed")); + goto err; + } + } + new_page_before_chunk0= 1; + + if (first_chunk0) + { + first_chunk0= 0; + *lsn= horizon; + if (log_record_type_descriptor[type].inwrite_hook && + (*log_record_type_descriptor[type].inwrite_hook) (type, trn, + tbl_info, + lsn, parts)) + goto err; + } + + /* + A first non-full page will hold type 0 chunk only if it fit in it with + all its headers => the fist page is full or number of groups less then + possible number of full page. + */ + limit= (groups_per_page < groups.elements - curr_group ? 
+ groups_per_page : groups.elements - curr_group); + DBUG_PRINT("info", ("Groups: %u curr: %u limit: %u", + (uint) groups.elements, (uint) curr_group, + (uint) limit)); + + if (chunk0_pages == 1) + { + DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) + %u = %u", + (uint) limit, (uint) record_rest, + (uint) (2 + limit * (7 + 1) + record_rest))); + int2store(chunk0_header + header_length - 2, + 2 + limit * (7 + 1) + record_rest); + } + else + { + DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) = %u", + (uint) limit, (uint) (2 + limit * (7 + 1)))); + int2store(chunk0_header + header_length - 2, 2 + limit * (7 + 1)); + } + int2store(chunk0_header + header_length, groups.elements - curr_group); + translog_write_data_on_page(&horizon, &cursor, header_fixed_part, + chunk0_header); + for (i= curr_group; i < limit + curr_group; i++) + { + struct st_translog_group_descriptor *grp_ptr; + grp_ptr= dynamic_element(&groups, i, + struct st_translog_group_descriptor *); + lsn_store(group_desc, grp_ptr->addr); + group_desc[7]= grp_ptr->num; + translog_write_data_on_page(&horizon, &cursor, (7 + 1), group_desc); + } + + if (chunk0_pages == 1 && record_rest != 0) + translog_write_parts_on_page(&horizon, &cursor, record_rest, parts); + + chunk0_pages--; + curr_group+= limit; + + } while (chunk0_pages != 0); + rc= translog_buffer_lock(cursor.buffer); + if (cmp_translog_addr(cursor.buffer->last_lsn, *lsn) < 0) + cursor.buffer->last_lsn= *lsn; + translog_buffer_decrease_writers(cursor.buffer); + rc|= translog_buffer_unlock(cursor.buffer); + + if (translog_set_lsn_for_files(file_of_the_first_group, LSN_FILE_NO(*lsn), + *lsn, FALSE)) + goto err; + translog_mark_file_finished(file_of_the_first_group); + + + delete_dynamic(&groups); + DBUG_RETURN(rc); + +err_unlock: + translog_unlock(); +err: + delete_dynamic(&groups); + DBUG_RETURN(1); +} + + +/* + Write the variable length log record + + SYNOPSIS + translog_write_variable_record() + lsn LSN of the record will be written here + type the log 
record type + short_trid Short transaction ID or 0 if it has no sense + parts Descriptor of record source parts + trn Transaction structure pointer for hooks by + record log type, for short_id + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_write_variable_record(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + TRN *trn) +{ + struct st_translog_buffer *buffer_to_flush= NULL; + uint header_length1= 1 + 2 + 2 + + translog_variable_record_length_bytes(parts->record_length); + ulong buffer_rest; + uint page_rest; + /* Max number of such LSNs per record is 2 */ + uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD * + COMPRESSED_LSN_MAX_STORE_SIZE]; + my_bool res; + DBUG_ENTER("translog_write_variable_record"); + + translog_lock(); + DBUG_PRINT("info", ("horizon: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.horizon))); + page_rest= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill; + DBUG_PRINT("info", ("header length: %u page_rest: %u", + header_length1, page_rest)); + + /* + header and part which we should read have to fit in one chunk + TODO: allow to divide readable header + */ + if (page_rest < + (header_length1 + log_record_type_descriptor[type].read_header_len)) + { + DBUG_PRINT("info", + ("Next page, size: %u header: %u + %u", + log_descriptor.bc.current_page_fill, + header_length1, + log_record_type_descriptor[type].read_header_len)); + translog_page_next(&log_descriptor.horizon, &log_descriptor.bc, + &buffer_to_flush); + /* Chunk 2 header is 1 byte, so full page capacity will be one uchar more */ + page_rest= log_descriptor.page_capacity_chunk_2 + 1; + DBUG_PRINT("info", ("page_rest: %u", page_rest)); + } + + /* + To minimize compressed size we will compress always relative to + very first chunk address (log_descriptor.horizon for now) + */ + if (log_record_type_descriptor[type].compressed_LSN > 0) + { + if (translog_relative_LSN_encode(parts, 
log_descriptor.horizon, + log_record_type_descriptor[type]. + compressed_LSN, compressed_LSNs)) + { + translog_unlock(); + if (buffer_to_flush != NULL) + { + /* + It is just try to finish log in nice way in case of error, so we + do not check result of the following functions, because we are + going return error state in any case + */ + translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + } + DBUG_RETURN(1); + } + /* recalculate header length after compression */ + header_length1= 1 + 2 + 2 + + translog_variable_record_length_bytes(parts->record_length); + DBUG_PRINT("info", ("after compressing LSN(s) header length: %u " + "record length: %lu", + header_length1, (ulong)parts->record_length)); + } + + /* TODO: check space on current page for header + few bytes */ + if (page_rest >= parts->record_length + header_length1) + { + /* following function makes translog_unlock(); */ + res= translog_write_variable_record_1chunk(lsn, type, tbl_info, + short_trid, + parts, buffer_to_flush, + header_length1, trn); + DBUG_RETURN(res); + } + + buffer_rest= translog_get_current_group_size(); + + if (buffer_rest >= parts->record_length + header_length1 - page_rest) + { + /* following function makes translog_unlock(); */ + res= translog_write_variable_record_1group(lsn, type, tbl_info, + short_trid, + parts, buffer_to_flush, + header_length1, trn); + DBUG_RETURN(res); + } + /* following function makes translog_unlock(); */ + res= translog_write_variable_record_mgroup(lsn, type, tbl_info, + short_trid, + parts, buffer_to_flush, + header_length1, + buffer_rest, trn); + DBUG_RETURN(res); +} + + +/* + Write the fixed and pseudo-fixed log record + + SYNOPSIS + translog_write_fixed_record() + lsn LSN of the record will be written here + type the log record type + short_trid Short transaction ID or 0 if it has no sense + parts Descriptor of record source parts + trn Transaction structure pointer for hooks by + record log type, for short_id + + RETURN + 0 OK 
+ 1 Error +*/ + +static my_bool translog_write_fixed_record(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + TRN *trn) +{ + struct st_translog_buffer *buffer_to_flush= NULL; + uchar chunk1_header[1 + 2]; + /* Max number of such LSNs per record is 2 */ + uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD * + COMPRESSED_LSN_MAX_STORE_SIZE]; + LEX_STRING *part; + int rc; + DBUG_ENTER("translog_write_fixed_record"); + DBUG_ASSERT((log_record_type_descriptor[type].class == + LOGRECTYPE_FIXEDLENGTH && + parts->record_length == + log_record_type_descriptor[type].fixed_length) || + (log_record_type_descriptor[type].class == + LOGRECTYPE_PSEUDOFIXEDLENGTH && + parts->record_length == + log_record_type_descriptor[type].fixed_length)); + + translog_lock(); + DBUG_PRINT("info", ("horizon: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.horizon))); + + DBUG_ASSERT(log_descriptor.bc.current_page_fill <= TRANSLOG_PAGE_SIZE); + DBUG_PRINT("info", + ("Page size: %u record: %u next cond: %d", + log_descriptor.bc.current_page_fill, + (parts->record_length + + log_record_type_descriptor[type].compressed_LSN * 2 + 3), + ((((uint) log_descriptor.bc.current_page_fill) + + (parts->record_length + + log_record_type_descriptor[type].compressed_LSN * 2 + 3)) > + TRANSLOG_PAGE_SIZE))); + /* + check that there is enough place on current page. 
+ NOTE: compressing may increase page LSN size on two bytes for every LSN + */ + if ((((uint) log_descriptor.bc.current_page_fill) + + (parts->record_length + + log_record_type_descriptor[type].compressed_LSN * 2 + 3)) > + TRANSLOG_PAGE_SIZE) + { + DBUG_PRINT("info", ("Next page")); + translog_page_next(&log_descriptor.horizon, &log_descriptor.bc, + &buffer_to_flush); + } + + *lsn= log_descriptor.horizon; + if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn), + *lsn, TRUE) || + (log_record_type_descriptor[type].inwrite_hook && + (*log_record_type_descriptor[type].inwrite_hook) (type, trn, tbl_info, + lsn, parts))) + { + rc= 1; + goto err; + } + + /* compress LSNs */ + if (log_record_type_descriptor[type].class == LOGRECTYPE_PSEUDOFIXEDLENGTH) + { + DBUG_ASSERT(log_record_type_descriptor[type].compressed_LSN > 0); + if (translog_relative_LSN_encode(parts, *lsn, + log_record_type_descriptor[type]. + compressed_LSN, compressed_LSNs)) + { + rc= 1; + goto err; + } + } + + /* + Write the whole record at once (we know that there is enough place on + the destination page) + */ + DBUG_ASSERT(parts->current != 0); /* first part is left for header */ + part= parts->parts + (--parts->current); + parts->total_record_length+= (part->length= 1 + 2); + part->str= (char*)chunk1_header; + *chunk1_header= (uchar) (type | TRANSLOG_CHUNK_FIXED); + int2store(chunk1_header + 1, short_trid); + + rc= translog_write_parts_on_page(&log_descriptor.horizon, + &log_descriptor.bc, + parts->total_record_length, parts); + + log_descriptor.bc.buffer->last_lsn= *lsn; + +err: + rc|= translog_unlock(); + + /* + check if we switched buffer and need process it (current buffer is + unlocked already => we will not delay other threads + */ + if (buffer_to_flush != NULL) + { + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + rc|= translog_buffer_unlock(buffer_to_flush); + } + + DBUG_RETURN(rc); +} + + +/** + @brief Writes the log record + + If share has no 2-byte-id yet, gives an id 
to the share and logs + LOGREC_FILE_ID. If transaction has not logged LOGREC_LONG_TRANSACTION_ID + yet, logs it. + + @param lsn LSN of the record will be written here + @param type the log record type + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param tbl_info MARIA_HA of table or NULL + @param rec_len record length or 0 (count it) + @param part_no number of parts or 0 (count it) + @param parts_data zero ended (in case of number of parts is 0) + array of LEX_STRINGs (parts), first + TRANSLOG_INTERNAL_PARTS positions in the log + should be unused (need for loghandler) + @param store_share_id if tbl_info!=NULL then share's id will + automatically be stored in the two first bytes + pointed (so pointer is assumed to be !=NULL) + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_write_record(LSN *lsn, + enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + translog_size_t rec_len, + uint part_no, + LEX_STRING *parts_data, + uchar *store_share_id) +{ + struct st_translog_parts parts; + LEX_STRING *part; + int rc; + uint short_trid= trn->short_id; + DBUG_ENTER("translog_write_record"); + DBUG_PRINT("enter", ("type: %u ShortTrID: %u rec_len: %lu", + (uint) type, (uint) short_trid, (ulong) rec_len)); + DBUG_ASSERT(translog_inited == 1); + + if (tbl_info) + { + MARIA_SHARE *share= tbl_info->s; + if (!share->now_transactional) + { + DBUG_PRINT("info", ("It is not transactional table")); + DBUG_RETURN(0); + } + if (unlikely(share->id == 0)) + { + /* + First log write for this MARIA_SHARE; give it a short id. + When the lock manager is enabled and needs a short id, it should be + assigned in the lock manager (because row locks will be taken before + log records are written; for example SELECT FOR UPDATE takes locks but + writes no log record. 
+ */ + if (unlikely(translog_assign_id_to_share(tbl_info, trn))) + DBUG_RETURN(1); + } + fileid_store(store_share_id, share->id); + } + if (unlikely(!(trn->first_undo_lsn & TRANSACTION_LOGGED_LONG_ID))) + { + LSN dummy_lsn; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + uchar log_data[6]; + int6store(log_data, trn->trid); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; /* no recursion */ + if (unlikely(translog_write_record(&dummy_lsn, LOGREC_LONG_TRANSACTION_ID, + trn, NULL, sizeof(log_data), + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL))) + DBUG_RETURN(1); + } + + parts.parts= parts_data; + + /* count parts if they are not counted by upper level */ + if (part_no == 0) + { + for (part_no= TRANSLOG_INTERNAL_PARTS; + parts_data[part_no].length != 0; + part_no++); + } + parts.elements= part_no; + parts.current= TRANSLOG_INTERNAL_PARTS; + + /* clear TRANSLOG_INTERNAL_PARTS */ + DBUG_ASSERT(TRANSLOG_INTERNAL_PARTS != 0); + parts_data[0].str= 0; + parts_data[0].length= 0; + + /* count length of the record */ + if (rec_len == 0) + { + for(part= parts_data + TRANSLOG_INTERNAL_PARTS;\ + part < parts_data + part_no; + part++) + { + rec_len+= part->length; + } + } + parts.record_length= rec_len; + +#ifndef DBUG_OFF + { + uint i; + uint len= 0; +#ifdef HAVE_purify + ha_checksum checksum= 0; +#endif + for (i= TRANSLOG_INTERNAL_PARTS; i < part_no; i++) + { +#ifdef HAVE_purify + /* Find unitialized bytes early */ + checksum+= my_checksum(checksum, parts_data[i].str, + parts_data[i].length); +#endif + len+= parts_data[i].length; + } + DBUG_ASSERT(len == rec_len); + } +#endif + /* + Start total_record_length from record_length then overhead will + be add + */ + parts.total_record_length= parts.record_length; + DBUG_PRINT("info", ("record length: %lu", (ulong) parts.record_length)); + + /* process this parts */ + if (!(rc= 
(log_record_type_descriptor[type].prewrite_hook && + (*log_record_type_descriptor[type].prewrite_hook) (type, trn, + tbl_info, + &parts)))) + { + switch (log_record_type_descriptor[type].class) { + case LOGRECTYPE_VARIABLE_LENGTH: + rc= translog_write_variable_record(lsn, type, tbl_info, + short_trid, &parts, trn); + break; + case LOGRECTYPE_PSEUDOFIXEDLENGTH: + case LOGRECTYPE_FIXEDLENGTH: + rc= translog_write_fixed_record(lsn, type, tbl_info, + short_trid, &parts, trn); + break; + case LOGRECTYPE_NOT_ALLOWED: + default: + DBUG_ASSERT(0); + rc= 1; + } + } + + DBUG_PRINT("info", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(*lsn))); + DBUG_RETURN(rc); +} + + +/* + Decode compressed (relative) LSN(s) + + SYNOPSIS + translog_relative_lsn_decode() + base_lsn LSN for encoding + src Decode LSN(s) from here + dst Put decoded LSNs here + lsns number of LSN(s) + + RETURN + position in sources after decoded LSN(s) +*/ + +static uchar *translog_relative_LSN_decode(LSN base_lsn, + uchar *src, uchar *dst, uint lsns) +{ + uint i; + for (i= 0; i < lsns; i++, dst+= LSN_STORE_SIZE) + { + src= translog_get_LSN_from_diff(base_lsn, src, dst); + } + return src; +} + +/** + @brief Get header of fixed/pseudo length record and call hook for + it processing + + @param page Pointer to the buffer with page where LSN chunk is + placed + @param page_offset Offset of the first chunk in the page + @param buff Buffer to be filled with header data + + @return Length of header or operation status + @retval # number of bytes in TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +static int translog_fixed_length_header(uchar *page, + translog_size_t page_offset, + TRANSLOG_HEADER_BUFFER *buff) +{ + struct st_log_record_type_descriptor *desc= + log_record_type_descriptor + buff->type; + uchar *src= page + page_offset + 3; + uchar *dst= buff->header; + uchar *start= src; + uint lsns= desc->compressed_LSN; + uint length= desc->fixed_length; + + 
DBUG_ENTER("translog_fixed_length_header"); + + buff->record_length= length; + + if (desc->class == LOGRECTYPE_PSEUDOFIXEDLENGTH) + { + DBUG_ASSERT(lsns > 0); + src= translog_relative_LSN_decode(buff->lsn, src, dst, lsns); + lsns*= LSN_STORE_SIZE; + dst+= lsns; + length-= lsns; + buff->compressed_LSN_economy= (lsns - (src - start)); + } + else + buff->compressed_LSN_economy= 0; + + memcpy(dst, src, length); + buff->non_header_data_start_offset= page_offset + + ((src + length) - (page + page_offset)); + buff->non_header_data_len= 0; + DBUG_RETURN(buff->record_length); +} + + +/* + Free resources used by TRANSLOG_HEADER_BUFFER + + SYNOPSIS + translog_free_record_header(); +*/ + +void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff) +{ + DBUG_ENTER("translog_free_record_header"); + DBUG_ASSERT(translog_inited == 1); + if (buff->groups_no != 0) + { + my_free((uchar*) buff->groups, MYF(0)); + buff->groups_no= 0; + } + DBUG_VOID_RETURN; +} + + +/** + @brief Returns the current horizon at the end of the current log + + @return Horizon +*/ + +TRANSLOG_ADDRESS translog_get_horizon() +{ + TRANSLOG_ADDRESS res; + DBUG_ASSERT(translog_inited == 1); + translog_lock(); + res= log_descriptor.horizon; + translog_unlock(); + return res; +} + + +/** + @brief Returns the current horizon at the end of the current log, caller is + assumed to already hold the lock + + @return Horizon +*/ + +TRANSLOG_ADDRESS translog_get_horizon_no_lock() +{ + DBUG_ASSERT(translog_inited == 1); + translog_lock_assert_owner(); + return log_descriptor.horizon; +} + + +/* + Set last page in the scanner data structure + + SYNOPSIS + translog_scanner_set_last_page() + scanner Information about current chunk during scanning + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_scanner_set_last_page(TRANSLOG_SCANNER_DATA + *scanner) +{ + my_bool page_ok; + scanner->last_file_page= scanner->page_addr; + return (translog_get_last_page_addr(&scanner->last_file_page, &page_ok)); +} + + +/* + 
Initialize reader scanner + + SYNOPSIS + translog_init_scanner() + lsn LSN with which it have to be inited + fixed_horizon true if it is OK do not read records which was written + after scanning beginning + scanner scanner which have to be inited + + RETURN + 0 OK + 1 Error +*/ + +my_bool translog_init_scanner(LSN lsn, + my_bool fixed_horizon, + struct st_translog_scanner_data *scanner) +{ + TRANSLOG_VALIDATOR_DATA data; + DBUG_ENTER("translog_init_scanner"); + DBUG_PRINT("enter", ("LSN: (0x%lu,0x%lx)", LSN_IN_PARTS(lsn))); + DBUG_ASSERT(LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE != 0); + DBUG_ASSERT(translog_inited == 1); + + data.addr= &scanner->page_addr; + data.was_recovered= 0; + + scanner->page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE; + + scanner->fixed_horizon= fixed_horizon; + + scanner->horizon= translog_get_horizon(); + DBUG_PRINT("info", ("horizon: (0x%lu,0x%lx)", + LSN_IN_PARTS(scanner->horizon))); + + /* lsn < horizon */ + DBUG_ASSERT(lsn < scanner->horizon); + + scanner->page_addr= lsn; + scanner->page_addr-= scanner->page_offset; /*decrease offset */ + + if (translog_scanner_set_last_page(scanner)) + DBUG_RETURN(1); + + if ((scanner->page= translog_get_page(&data, scanner->buffer)) == NULL) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/* + Checks End of the Log + + SYNOPSIS + translog_scanner_eol() + scanner Information about current chunk during scanning + + RETURN + 1 End of the Log + 0 OK +*/ + +static my_bool translog_scanner_eol(TRANSLOG_SCANNER_DATA *scanner) +{ + DBUG_ENTER("translog_scanner_eol"); + DBUG_PRINT("enter", + ("Horizon: (%lu, 0x%lx) Current: (%lu, 0x%lx+0x%x=0x%lx)", + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->page_addr), + (uint) scanner->page_offset, + (ulong) (LSN_OFFSET(scanner->page_addr) + scanner->page_offset))); + if (scanner->horizon > (scanner->page_addr + + scanner->page_offset)) + { + DBUG_PRINT("info", ("Horizon is not reached")); + DBUG_RETURN(0); + } + if (scanner->fixed_horizon) + { + DBUG_PRINT("info", 
("Horizon is fixed and reached")); + DBUG_RETURN(1); + } + scanner->horizon= translog_get_horizon(); + DBUG_PRINT("info", + ("Horizon is re-read, EOL: %d", + scanner->horizon <= (scanner->page_addr + + scanner->page_offset))); + DBUG_RETURN(scanner->horizon <= (scanner->page_addr + + scanner->page_offset)); +} + + +/* + Cheks End of the Page + + SYNOPSIS + translog_scanner_eop() + scanner Information about current chunk during scanning + + RETURN + 1 End of the Page + 0 OK +*/ + +static my_bool translog_scanner_eop(TRANSLOG_SCANNER_DATA *scanner) +{ + DBUG_ENTER("translog_scanner_eop"); + DBUG_RETURN(scanner->page_offset >= TRANSLOG_PAGE_SIZE || + scanner->page[scanner->page_offset] == 0); +} + + +/* + Checks End of the File (I.e. we are scanning last page, which do not + mean end of this page) + + SYNOPSIS + translog_scanner_eof() + scanner Information about current chunk during scanning + + RETURN + 1 End of the File + 0 OK +*/ + +static my_bool translog_scanner_eof(TRANSLOG_SCANNER_DATA *scanner) +{ + DBUG_ENTER("translog_scanner_eof"); + DBUG_ASSERT(LSN_FILE_NO(scanner->page_addr) == + LSN_FILE_NO(scanner->last_file_page)); + DBUG_PRINT("enter", ("curr Page: 0x%lx last page: 0x%lx " + "normal EOF: %d", + (ulong) LSN_OFFSET(scanner->page_addr), + (ulong) LSN_OFFSET(scanner->last_file_page), + LSN_OFFSET(scanner->page_addr) == + LSN_OFFSET(scanner->last_file_page))); + /* + TODO: detect damaged file EOF, + TODO: issue warning if damaged file EOF detected + */ + DBUG_RETURN(scanner->page_addr == + scanner->last_file_page); +} + + +/* + Move scanner to the next chunk + + SYNOPSIS + translog_get_next_chunk() + scanner Information about current chunk during scanning + + RETURN + 0 OK + 1 Error +*/ + +static my_bool +translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner) +{ + uint16 len; + TRANSLOG_VALIDATOR_DATA data; + DBUG_ENTER("translog_get_next_chunk"); + + if ((len= translog_get_total_chunk_length(scanner->page, + scanner->page_offset)) == 0) + DBUG_RETURN(1); 
+ scanner->page_offset+= len; + + if (translog_scanner_eol(scanner)) + { + scanner->page= &end_of_log; + scanner->page_offset= 0; + DBUG_RETURN(0); + } + if (translog_scanner_eop(scanner)) + { + if (translog_scanner_eof(scanner)) + { + DBUG_PRINT("info", ("horizon: (%lu,0x%lx) pageaddr: (%lu,0x%lx)", + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->page_addr))); + /* if it is log end it have to be caught before */ + DBUG_ASSERT(LSN_FILE_NO(scanner->horizon) > + LSN_FILE_NO(scanner->page_addr)); + scanner->page_addr+= LSN_ONE_FILE; + scanner->page_addr= LSN_REPLACE_OFFSET(scanner->page_addr, + TRANSLOG_PAGE_SIZE); + if (translog_scanner_set_last_page(scanner)) + DBUG_RETURN(1); + } + else + { + scanner->page_addr+= TRANSLOG_PAGE_SIZE; /* offset increased */ + } + + data.addr= &scanner->page_addr; + data.was_recovered= 0; + if ((scanner->page= translog_get_page(&data, scanner->buffer)) == NULL) + DBUG_RETURN(1); + + scanner->page_offset= translog_get_first_chunk_offset(scanner->page); + if (translog_scanner_eol(scanner)) + { + scanner->page= &end_of_log; + scanner->page_offset= 0; + DBUG_RETURN(0); + } + DBUG_ASSERT(scanner->page[scanner->page_offset]); + } + DBUG_RETURN(0); +} + + +/** + @brief Get header of variable length record and call hook for it processing + + @param page Pointer to the buffer with page where LSN chunk is + placed + @param page_offset Offset of the first chunk in the page + @param buff Buffer to be filled with header data + @param scanner If present should be moved to the header page if + it differ from LSN page + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +static int +translog_variable_length_header(uchar *page, translog_size_t page_offset, + TRANSLOG_HEADER_BUFFER *buff, + TRANSLOG_SCANNER_DATA *scanner) +{ + struct st_log_record_type_descriptor *desc= (log_record_type_descriptor + 
+ buff->type); + uchar *src= page + page_offset + 1 + 2; + uchar *dst= buff->header; + LSN base_lsn; + uint lsns= desc->compressed_LSN; + uint16 chunk_len; + uint16 length= desc->read_header_len; + uint16 buffer_length= length; + uint16 body_len; + TRANSLOG_SCANNER_DATA internal_scanner; + DBUG_ENTER("translog_variable_length_header"); + + buff->record_length= translog_variable_record_1group_decode_len(&src); + chunk_len= uint2korr(src); + DBUG_PRINT("info", ("rec len: %lu chunk len: %u length: %u bufflen: %u", + (ulong) buff->record_length, (uint) chunk_len, + (uint) length, (uint) buffer_length)); + if (chunk_len == 0) + { + uint16 page_rest; + DBUG_PRINT("info", ("1 group")); + src+= 2; + page_rest= TRANSLOG_PAGE_SIZE - (src - page); + + base_lsn= buff->lsn; + body_len= min(page_rest, buff->record_length); + } + else + { + uint grp_no, curr; + uint header_to_skip; + uint16 page_rest; + + DBUG_PRINT("info", ("multi-group")); + grp_no= buff->groups_no= uint2korr(src + 2); + if (!(buff->groups= + (TRANSLOG_GROUP*) my_malloc(sizeof(TRANSLOG_GROUP) * grp_no, + MYF(0)))) + DBUG_RETURN(RECHEADER_READ_ERROR); + DBUG_PRINT("info", ("Groups: %u", (uint) grp_no)); + src+= (2 + 2); + page_rest= TRANSLOG_PAGE_SIZE - (src - page); + curr= 0; + header_to_skip= src - (page + page_offset); + buff->chunk0_pages= 0; + + for (;;) + { + uint i, read= grp_no; + + buff->chunk0_pages++; + if (page_rest < grp_no * (7 + 1)) + read= page_rest / (7 + 1); + DBUG_PRINT("info", ("Read chunk0 page#%u read: %u left: %u " + "start from: %u", + buff->chunk0_pages, read, grp_no, curr)); + for (i= 0; i < read; i++, curr++) + { + DBUG_ASSERT(curr < buff->groups_no); + buff->groups[curr].addr= lsn_korr(src + i * (7 + 1)); + buff->groups[curr].num= src[i * (7 + 1) + 7]; + DBUG_PRINT("info", ("group #%u (%lu,0x%lx) chunks: %u", + curr, + LSN_IN_PARTS(buff->groups[curr].addr), + (uint) buff->groups[curr].num)); + } + grp_no-= read; + if (grp_no == 0) + { + if (scanner) + { + buff->chunk0_data_addr= 
scanner->page_addr; + buff->chunk0_data_addr+= (page_offset + header_to_skip + + read * (7 + 1)); /* offset increased */ + } + else + { + buff->chunk0_data_addr= buff->lsn; + /* offset increased */ + buff->chunk0_data_addr+= (header_to_skip + read * (7 + 1)); + } + buff->chunk0_data_len= chunk_len - 2 - read * (7 + 1); + DBUG_PRINT("info", ("Data address: (%lu,0x%lx) len: %u", + LSN_IN_PARTS(buff->chunk0_data_addr), + buff->chunk0_data_len)); + break; + } + if (scanner == NULL) + { + DBUG_PRINT("info", ("use internal scanner for header reading")); + scanner= &internal_scanner; + if (translog_init_scanner(buff->lsn, 1, scanner)) + DBUG_RETURN(RECHEADER_READ_ERROR); + } + if (translog_get_next_chunk(scanner)) + DBUG_RETURN(RECHEADER_READ_ERROR); + page= scanner->page; + page_offset= scanner->page_offset; + src= page + page_offset + header_to_skip; + chunk_len= uint2korr(src - 2 - 2); + DBUG_PRINT("info", ("Chunk len: %u", (uint) chunk_len)); + page_rest= TRANSLOG_PAGE_SIZE - (src - page); + } + + if (scanner == NULL) + { + DBUG_PRINT("info", ("use internal scanner")); + scanner= &internal_scanner; + } + + base_lsn= buff->groups[0].addr; + translog_init_scanner(base_lsn, 1, scanner); + /* first group chunk is always chunk type 2 */ + page= scanner->page; + page_offset= scanner->page_offset; + src= page + page_offset + 1; + page_rest= TRANSLOG_PAGE_SIZE - (src - page); + body_len= page_rest; + } + if (lsns) + { + uchar *start= src; + src= translog_relative_LSN_decode(base_lsn, src, dst, lsns); + lsns*= LSN_STORE_SIZE; + dst+= lsns; + length-= lsns; + buff->record_length+= (buff->compressed_LSN_economy= + (lsns - (src - start))); + DBUG_PRINT("info", ("lsns: %u length: %u economy: %d new length: %lu", + lsns / LSN_STORE_SIZE, (uint) length, + (int) buff->compressed_LSN_economy, + (ulong) buff->record_length)); + body_len-= (src - start); + } + else + buff->compressed_LSN_economy= 0; + + DBUG_ASSERT(body_len >= length); + body_len-= length; + memcpy(dst, src, length); + 
buff->non_header_data_start_offset= src + length - page; + buff->non_header_data_len= body_len; + DBUG_PRINT("info", ("non_header_data_start_offset: %u len: %u buffer: %u", + buff->non_header_data_start_offset, + buff->non_header_data_len, buffer_length)); + DBUG_RETURN(buffer_length); +} + + +/** + @brief Read record header from the given buffer + + The chunk at page_offset must be an LSN or FIXED chunk (asserted). The + record type and the short transaction id are decoded here; the rest of + the header is read by the function matching the class of the record type. + + @param page page content buffer + @param page_offset offset of the chunk in the page + @param buff destination buffer + @param scanner If this is set the scanner will be moved to the + record header page (it differs from the LSN page in + case of multi-group records) + + @note buff->lsn is only read here (it is not assigned), so the caller has + to set it beforehand. + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + the decoded part of the header is stored +*/ + +int translog_read_record_header_from_buffer(uchar *page, + uint16 page_offset, + TRANSLOG_HEADER_BUFFER *buff, + TRANSLOG_SCANNER_DATA *scanner) +{ + translog_size_t res; + DBUG_ENTER("translog_read_record_header_from_buffer"); + DBUG_ASSERT((page[page_offset] & TRANSLOG_CHUNK_TYPE) == + TRANSLOG_CHUNK_LSN || + (page[page_offset] & TRANSLOG_CHUNK_TYPE) == + TRANSLOG_CHUNK_FIXED); + DBUG_ASSERT(translog_inited == 1); + buff->type= (page[page_offset] & TRANSLOG_REC_TYPE); + buff->short_trid= uint2korr(page + page_offset + 1); + DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)", + (uint) buff->type, (uint)buff->short_trid, + LSN_IN_PARTS(buff->lsn))); + /* Dispatch on the class of the record type to read the rest of the header */ + switch (log_record_type_descriptor[buff->type].class) { + case LOGRECTYPE_VARIABLE_LENGTH: + res= translog_variable_length_header(page, page_offset, buff, + scanner); + break; + case LOGRECTYPE_PSEUDOFIXEDLENGTH: + case LOGRECTYPE_FIXEDLENGTH: + res= translog_fixed_length_header(page, page_offset, buff); + break; + default: + DBUG_ASSERT(0); /* we read some junk (got no LSN) */ + res= RECHEADER_READ_ERROR; + } + DBUG_RETURN(res); +} + + +/** + @brief
Read record header and some fixed part of a record (the part depend + on record type). + + @param lsn log record serial number (address of the record) + @param buff log record header buffer + + @note Some type of record can be read completely by this call + @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative + LSN can be translated to absolute one), some fields can be added (like + actual header length in the record if the header has variable length) + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff) +{ + uchar buffer[TRANSLOG_PAGE_SIZE], *page; + translog_size_t res, page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE; + TRANSLOG_ADDRESS addr; + TRANSLOG_VALIDATOR_DATA data; + DBUG_ENTER("translog_read_record_header"); + DBUG_PRINT("enter", ("LSN: (0x%lu,0x%lx)", LSN_IN_PARTS(lsn))); + DBUG_ASSERT(LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE != 0); + DBUG_ASSERT(translog_inited == 1); + + buff->lsn= lsn; + buff->groups_no= 0; + data.addr= &addr; + data.was_recovered= 0; + addr= lsn; + addr-= page_offset; /* offset decreasing */ + res= (!(page= translog_get_page(&data, buffer))) ? RECHEADER_READ_ERROR : + translog_read_record_header_from_buffer(page, page_offset, buff, 0); + DBUG_RETURN(res); +} + + +/** + @brief Read record header and some fixed part of a record (the part depend + on record type). 
+ + The header is read at the current position of the scanner (page_addr + + page_offset), which also becomes buff->lsn. + + @param scanner scanner position to read + @param buff log record header buffer + @param move_scanner if set, the scanner is passed on so that it can be + moved to the header position + + @note Some type of record can be read completely by this call + @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative + LSN can be translated to absolute one), some fields can be added (like + actual header length in the record if the header has variable length) + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where the + decoded part of the header is stored +*/ + +int translog_read_record_header_scan(TRANSLOG_SCANNER_DATA *scanner, + TRANSLOG_HEADER_BUFFER *buff, + my_bool move_scanner) +{ + translog_size_t res; + DBUG_ENTER("translog_read_record_header_scan"); + DBUG_PRINT("enter", ("Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) " + "Lst: (%lu,0x%lx) Offset: %u(%x) fixed %d", + LSN_IN_PARTS(scanner->page_addr), + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->last_file_page), + (uint) scanner->page_offset, + (uint) scanner->page_offset, scanner->fixed_horizon)); + DBUG_ASSERT(translog_inited == 1); + buff->groups_no= 0; + buff->lsn= scanner->page_addr; + buff->lsn+= scanner->page_offset; /* offset increasing */ + res= translog_read_record_header_from_buffer(scanner->page, + scanner->page_offset, + buff, + (move_scanner ? + scanner : 0)); + DBUG_RETURN(res); +} + + +/** + @brief Read record header and some fixed part of the next record (the part + depends on record type). + + @param scanner scanner to continue scanning with; it is moved + chunk by chunk until the header of a record or + the end of the log is found. + Must not be NULL (it is dereferenced).
+ + @param buff log record header buffer + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval RECHEADER_READ_EOF EOF + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner, + TRANSLOG_HEADER_BUFFER *buff) +{ + uint8 chunk_type; + translog_size_t res; + buff->groups_no= 0; /* to be sure that we will free it right */ + + DBUG_ENTER("translog_read_next_record_header"); + DBUG_PRINT("enter", ("scanner: 0x%lx", (ulong) scanner)); + DBUG_PRINT("info", ("Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) " + "Lst: (%lu,0x%lx) Offset: %u(%x) fixed: %d", + LSN_IN_PARTS(scanner->page_addr), + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->last_file_page), + (uint) scanner->page_offset, + (uint) scanner->page_offset, scanner->fixed_horizon)); + DBUG_ASSERT(translog_inited == 1); + + do + { + if (translog_get_next_chunk(scanner)) + DBUG_RETURN(RECHEADER_READ_ERROR); + chunk_type= scanner->page[scanner->page_offset] & TRANSLOG_CHUNK_TYPE; + DBUG_PRINT("info", ("type: %x byte: %x", (uint) chunk_type, + (uint) scanner->page[scanner->page_offset])); + } while (chunk_type != TRANSLOG_CHUNK_LSN && chunk_type != + TRANSLOG_CHUNK_FIXED && scanner->page[scanner->page_offset] != 0); + + if (scanner->page[scanner->page_offset] == 0) + { + /* Last record was read */ + buff->lsn= LSN_IMPOSSIBLE; + /* Return 'end of log' marker */ + res= RECHEADER_READ_EOF; + } + else + res= translog_read_record_header_scan(scanner, buff, 0); + DBUG_RETURN(res); +} + + +/* + Moves record data reader to the next chunk and fill the data reader + information about that chunk. 
+ + SYNOPSIS + translog_record_read_next_chunk() + data data cursor + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_record_read_next_chunk(struct st_translog_reader_data + *data) +{ + translog_size_t new_current_offset= data->current_offset + data->chunk_size; + uint16 chunk_header_len, chunk_len; + uint8 type; + DBUG_ENTER("translog_record_read_next_chunk"); + + if (data->eor) + { + DBUG_PRINT("info", ("end of the record flag set")); + DBUG_RETURN(1); + } + + if (data->header.groups_no && + data->header.groups_no - 1 != data->current_group && + data->header.groups[data->current_group].num == data->current_chunk) + { + /* Goto next group */ + data->current_group++; + data->current_chunk= 0; + DBUG_PRINT("info", ("skip to group: #%u", data->current_group)); + translog_init_scanner(data->header.groups[data->current_group].addr, + 1, &data->scanner); + } + else + { + data->current_chunk++; + if (translog_get_next_chunk(&data->scanner)) + DBUG_RETURN(1); + } + type= data->scanner.page[data->scanner.page_offset] & TRANSLOG_CHUNK_TYPE; + + if (type == TRANSLOG_CHUNK_LSN && data->header.groups_no) + { + DBUG_PRINT("info", + ("Last chunk: data len: %u offset: %u group: %u of %u", + data->header.chunk0_data_len, data->scanner.page_offset, + data->current_group, data->header.groups_no - 1)); + DBUG_ASSERT(data->header.groups_no - 1 == data->current_group); + DBUG_ASSERT(data->header.lsn == + data->scanner.page_addr + data->scanner.page_offset); + translog_init_scanner(data->header.chunk0_data_addr, 1, &data->scanner); + data->chunk_size= data->header.chunk0_data_len; + data->body_offset= data->scanner.page_offset; + data->current_offset= new_current_offset; + data->eor= 1; + DBUG_RETURN(0); + } + + if (type == TRANSLOG_CHUNK_LSN || type == TRANSLOG_CHUNK_FIXED) + { + data->eor= 1; + DBUG_RETURN(1); /* End of record */ + } + + chunk_header_len= + translog_get_chunk_header_length(data->scanner.page, + data->scanner.page_offset); + chunk_len= 
translog_get_total_chunk_length(data->scanner.page, + data->scanner.page_offset); + data->chunk_size= chunk_len - chunk_header_len; + data->body_offset= data->scanner.page_offset + chunk_header_len; + data->current_offset= new_current_offset; + DBUG_PRINT("info", ("grp: %u chunk: %u body_offset: %u chunk_size: %u " + "current_offset: %lu", + (uint) data->current_group, + (uint) data->current_chunk, + (uint) data->body_offset, + (uint) data->chunk_size, (ulong) data->current_offset)); + DBUG_RETURN(0); +} + + +/* + Initialize record reader data from LSN + + Positions the scanner at lsn, reads the record header (moving the scanner + to the header page) and resets the chunk cursor so that it describes the + data which follows the header in that chunk. + + SYNOPSIS + translog_init_reader_data() + lsn LSN of the record we should start from + data reader data to initialize + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_init_reader_data(LSN lsn, + struct st_translog_reader_data *data) +{ + int read_header; + DBUG_ENTER("translog_init_reader_data"); + if (translog_init_scanner(lsn, 1, &data->scanner) || + ((read_header= + translog_read_record_header_scan(&data->scanner, &data->header, 1)) + == RECHEADER_READ_ERROR)) + DBUG_RETURN(1); + data->read_header= read_header; + data->body_offset= data->header.non_header_data_start_offset; + data->chunk_size= data->header.non_header_data_len; + data->current_offset= data->read_header; + data->current_group= 0; + data->current_chunk= 0; + data->eor= 0; + DBUG_PRINT("info", ("read_header: %u " + "body_offset: %u chunk_size: %u current_offset: %lu", + (uint) data->read_header, + (uint) data->body_offset, + (uint) data->chunk_size, (ulong) data->current_offset)); + DBUG_RETURN(0); +} + + +/* + Read a part of the record. + + SYNOPSIS + translog_read_record() + lsn log record serial number (address of the record) + offset Offset from the beginning of the record; the decoded + header (as read by translog_read_record_header) comes first. + length Length of the record part which has to be read.
+ buffer Buffer where to read the record part (has to be at + least 'length' bytes long) + data Reader state kept between calls, or NULL to use a + temporary one (lsn must be given in that case) + + RETURN + length of data actually read (0 if the reader could not be initialized) +*/ + +translog_size_t translog_read_record(LSN lsn, + translog_size_t offset, + translog_size_t length, + uchar *buffer, + struct st_translog_reader_data *data) +{ + translog_size_t requested_length= length; + translog_size_t end= offset + length; + struct st_translog_reader_data internal_data; + DBUG_ENTER("translog_read_record"); + DBUG_ASSERT(translog_inited == 1); + + if (data == NULL) + { + DBUG_ASSERT(lsn != LSN_IMPOSSIBLE); + data= &internal_data; + } + /* + (Re)initialize the reader if an LSN is given, or if the requested part + lies before the current chunk and is not completely inside the header + */ + if (lsn || + (offset < data->current_offset && + !(offset < data->read_header && offset + length < data->read_header))) + { + if (translog_init_reader_data(lsn, data)) + DBUG_RETURN(0); + } + DBUG_PRINT("info", ("Offset: %lu length: %lu " + "Scanner: Cur: (%lu,0x%lx) Hrz: (%lu,0x%lx) " + "Lst: (%lu,0x%lx) Offset: %u(%x) fixed: %d", + (ulong) offset, (ulong) length, + LSN_IN_PARTS(data->scanner.page_addr), + LSN_IN_PARTS(data->scanner.horizon), + LSN_IN_PARTS(data->scanner.last_file_page), + (uint) data->scanner.page_offset, + (uint) data->scanner.page_offset, + data->scanner.fixed_horizon)); + if (offset < data->read_header) + { + uint16 len= min(data->read_header, end) - offset; + DBUG_PRINT("info", + ("enter header offset: %lu length: %lu", + (ulong) offset, (ulong) length)); + memcpy(buffer, data->header.header + offset, len); + length-= len; + if (length == 0) + DBUG_RETURN(requested_length); + offset+= len; + buffer+= len; + DBUG_PRINT("info", + ("len: %u offset: %lu curr: %lu length: %lu", + len, (ulong) offset, (ulong) data->current_offset, + (ulong) length)); + } + /* TODO: find first page which we should read by offset */ + + /* read the record chunk by chunk */ + for(;;) + { + uint page_end= data->current_offset + data->chunk_size; + DBUG_PRINT("info", + ("enter body offset: %lu curr: %lu " + "length: %lu page_end: %lu", + (ulong) offset, (ulong)
data->current_offset, (ulong) length, + (ulong) page_end)); + if (offset < page_end) + { + uint len= page_end - offset; + /* + The chunk may hold more data than was requested: never copy more than + 'length' bytes, otherwise buffer is overrun and length wraps around + */ + if (len > length) + len= length; + DBUG_ASSERT(offset >= data->current_offset); + memcpy(buffer, + data->scanner.page + data->body_offset + + (offset - data->current_offset), len); + length-= len; + if (length == 0) + DBUG_RETURN(requested_length); + offset+= len; + buffer+= len; + DBUG_PRINT("info", + ("len: %u offset: %lu curr: %lu length: %lu", + len, (ulong) offset, (ulong) data->current_offset, + (ulong) length)); + } + if (translog_record_read_next_chunk(data)) + DBUG_RETURN(requested_length - length); + } +} + + +/* + Force skipping to the next buffer + + SYNOPSIS + translog_force_current_buffer_to_finish() +*/ + +static void translog_force_current_buffer_to_finish() +{ + TRANSLOG_ADDRESS new_buff_beginning; + uint16 old_buffer_no= log_descriptor.bc.buffer_no; + uint16 new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO; + struct st_translog_buffer *new_buffer= (log_descriptor.buffers + + new_buffer_no); + struct st_translog_buffer *old_buffer= log_descriptor.bc.buffer; + uchar *data= log_descriptor.bc.ptr - log_descriptor.bc.current_page_fill; + uint16 left= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill; + uint16 current_page_fill, write_counter, previous_offset; + DBUG_ENTER("translog_force_current_buffer_to_finish"); + DBUG_PRINT("enter", ("Buffer #%u 0x%lx " + "Buffer addr: (%lu,0x%lx) " + "Page addr: (%lu,0x%lx) " + "size: %lu (%lu) Pg: %u left: %u", + (uint) log_descriptor.bc.buffer_no, + (ulong) log_descriptor.bc.buffer, + LSN_IN_PARTS(log_descriptor.bc.buffer->offset), + (ulong) LSN_FILE_NO(log_descriptor.horizon), + (ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
+ buffer->buffer), + (uint) log_descriptor.bc.current_page_fill, + (uint) left)); + + LINT_INIT(current_page_fill); + new_buff_beginning= log_descriptor.bc.buffer->offset; + new_buff_beginning+= log_descriptor.bc.buffer->size; /* increase offset */ + + DBUG_ASSERT(log_descriptor.bc.ptr !=NULL); + DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) == + LSN_FILE_NO(log_descriptor.bc.buffer->offset)); + DBUG_EXECUTE("info", translog_check_cursor(&log_descriptor.bc);); + DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE); + if (left != 0) + { + /* + TODO: if 'left' is so small that can't hold any other record + then do not move the page + */ + DBUG_PRINT("info", ("left: %u", (uint) left)); + + /* decrease offset */ + new_buff_beginning-= log_descriptor.bc.current_page_fill; + current_page_fill= log_descriptor.bc.current_page_fill; + + /* zero-fill the unused rest of the current page and count it in the old buffer */ bzero(log_descriptor.bc.ptr, left); + log_descriptor.bc.buffer->size+= left; + DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx " + "Size: %lu", + (uint) log_descriptor.bc.buffer->buffer_no, + (ulong) log_descriptor.bc.buffer, + (ulong) log_descriptor.bc.buffer->size)); + DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no == + log_descriptor.bc.buffer_no); + } + else + { + log_descriptor.bc.current_page_fill= 0; + } + + translog_buffer_lock(new_buffer); + translog_wait_for_buffer_free(new_buffer); + + /* both counters are saved here and re-assigned after translog_start_buffer() */ write_counter= log_descriptor.bc.write_counter; + previous_offset= log_descriptor.bc.previous_offset; + translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no); + log_descriptor.bc.buffer->offset= new_buff_beginning; + log_descriptor.bc.write_counter= write_counter; + log_descriptor.bc.previous_offset= previous_offset; + + if (data[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) + { + translog_put_sector_protection(data, &log_descriptor.bc); + if (left) + { + log_descriptor.bc.write_counter++; + log_descriptor.bc.previous_offset= current_page_fill; + } + else + { + DBUG_PRINT("info", ("drop write_counter")); +
log_descriptor.bc.write_counter= 0; + log_descriptor.bc.previous_offset= 0; + } + } + + if (data[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC) + { + uint32 crc= translog_crc(data + log_descriptor.page_overhead, + TRANSLOG_PAGE_SIZE - + log_descriptor.page_overhead); + DBUG_PRINT("info", ("CRC: 0x%lx", (ulong) crc)); + int4store(data + 3 + 3 + 1, crc); + } + + if (left) + { + /* + The filled part of the unfinished page is copied to the start of the + new buffer, which is marked as overlaying the old one. + + TODO: do not copy beginning of the page if we have no CRC or sector + checks on + */ + memcpy(new_buffer->buffer, data, current_page_fill); + log_descriptor.bc.ptr+= current_page_fill; + log_descriptor.bc.buffer->size= log_descriptor.bc.current_page_fill= + current_page_fill; + new_buffer->overlay= old_buffer; + } + else + translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc); + old_buffer->next_buffer_offset= new_buffer->offset; + + DBUG_VOID_RETURN; +} + + +/** + @brief Flush the log up to given LSN (included) + + @param lsn log record serial number up to which (inclusive) + the log has to be flushed + + @return Operation status + @retval 0 OK + @retval 1 Error + + @todo LOG: when a log write fails, we should not write to this log anymore + (if we add more log records to this log they will be unreadable: we will hit + the broken log record): all translog_flush() should be made to fail (because + translog_flush() is when a transaction wants something durable and we + cannot make anything durable as log is corrupted). For that, a "my_bool + st_translog_descriptor::write_error" could be set to 1 when a + translog_write_record() or translog_flush() fails, and translog_flush() + would test this var (and translog_write_record() could also test this var if + it wants, though it's not absolutely needed). + Then, either shut Maria down immediately, or switch to a new log (but if we + get write error after write error, that would create too many logs).
+ A popular open-source transactional engine intentionally crashes as soon as + a log flush fails (we however don't want to crash the entire mysqld, but + stopping all engine's operations immediately would make sense). + Same applies to translog_write_record(). + + @todo: remove serialization and make group commit. +*/ + +my_bool translog_flush(LSN lsn) +{ + LSN old_flushed, sent_to_file; + int rc= 0; + uint i; + my_bool full_circle= 0; + DBUG_ENTER("translog_flush"); + DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn))); + DBUG_ASSERT(translog_inited == 1); + + pthread_mutex_lock(&log_descriptor.log_flush_lock); + translog_lock(); + old_flushed= log_descriptor.flushed; + for (;;) + { + uint16 buffer_no= log_descriptor.bc.buffer_no; + uint16 buffer_start= buffer_no; + struct st_translog_buffer *buffer_unlock= log_descriptor.bc.buffer; + struct st_translog_buffer *buffer= log_descriptor.bc.buffer; + /* we can't flush beyond the current horizon */ + DBUG_ASSERT(cmp_translog_addr(log_descriptor.horizon, lsn) >= 0); + if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0) + { + DBUG_PRINT("info", ("already flushed: (%lu,0x%lx)", + LSN_IN_PARTS(log_descriptor.flushed))); + goto out; + } + /* send to the file if it is not sent */ + sent_to_file= translog_get_sent_to_file(); + if (cmp_translog_addr(sent_to_file, lsn) >= 0) + break; + + /* walk the buffers in ring order (each one is locked before the previous one is released) until one with file != -1 is found */ do + { + buffer_no= (buffer_no + 1) % TRANSLOG_BUFFERS_NO; + buffer= log_descriptor.buffers + buffer_no; + translog_buffer_lock(buffer); + translog_buffer_unlock(buffer_unlock); + buffer_unlock= buffer; + if (buffer->file != -1) + { + buffer_unlock= NULL; + if (buffer_start == buffer_no) + { + /* we made a circle */ + full_circle= 1; + translog_force_current_buffer_to_finish(); + } + break; + } + } while ((buffer_start != buffer_no) && + cmp_translog_addr(log_descriptor.flushed, lsn) < 0); + if (buffer_unlock != NULL && buffer_unlock != buffer) + translog_buffer_unlock(buffer_unlock); + rc= translog_buffer_flush(buffer); +
translog_buffer_unlock(buffer); + if (rc) + { + rc= 1; + goto out; + } + if (!full_circle) + translog_lock(); + } + + /* sync every log file from the one holding the previously flushed LSN up to the one holding lsn */ for (i= LSN_FILE_NO(old_flushed); i <= LSN_FILE_NO(lsn); i++) + { + uint cache_index; + File file; + + if ((cache_index= LSN_FILE_NO(log_descriptor.horizon) - i) < + OPENED_FILES_NUM) + { + /* file in the cache */ + if (log_descriptor.log_file_num[cache_index] == -1) + { + if ((log_descriptor.log_file_num[cache_index]= + open_logfile_by_number_no_cache(i)) == -1) + { + rc= 1; + goto out; + } + } + file= log_descriptor.log_file_num[cache_index]; + rc|= my_sync(file, MYF(MY_WME)); + } + /* We sync file when we are closing it => do nothing if file closed */ + } + log_descriptor.flushed= sent_to_file; + /** @todo LOG decide if syncing of directory is needed */ + rc|= my_sync(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD)); +out: + translog_unlock(); + pthread_mutex_unlock(&log_descriptor.log_flush_lock); + DBUG_RETURN(rc); +} + + +/** + @brief Sets transaction's rec_lsn if needed + + A transaction sometimes writes a REDO even before the page is in the + pagecache (example: brand new head or tail pages; full pages). So, if + Checkpoint happens just after the REDO write, it needs to know that the + REDO phase must start before this REDO. Scanning the pagecache cannot + tell that as the page is not in the cache. So, transaction sets its rec_lsn + to the REDO's LSN or somewhere before, and Checkpoint reads the + transaction's rec_lsn. + + @param trn transaction whose rec_lsn is set if it is still 0 + @param lsn value which is stored into trn->rec_lsn + + @todo move it to a separate file + + @return Operation status, always 0 (success) +*/ + +static my_bool write_hook_for_redo(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info + __attribute__ ((unused)), + LSN *lsn, + struct st_translog_parts *parts + __attribute__ ((unused))) +{ + /* + Users of dummy_transaction_object must keep this TRN clean as it + is used by many threads (like those manipulating non-transactional + tables).
It might be dangerous if one user sets rec_lsn or some other + member and it is picked up by another user (like putting this rec_lsn into + a page of a non-transactional table); it's safer if all members stay 0. So + non-transactional log records (REPAIR, CREATE, RENAME, DROP) should not + call this hook; we trust them but verify ;) + */ + DBUG_ASSERT(trn->trid != 0); + /* + If the hook stays so simple, it would be faster to pass + !trn->rec_lsn ? trn->rec_lsn : some_dummy_lsn + to translog_write_record(), like Monty did in his original code, and not + have a hook. For now we keep it like this. + */ + if (trn->rec_lsn == 0) + trn->rec_lsn= *lsn; + return 0; +} + + +/** + @brief Sets transaction's undo_lsn, and first_undo_lsn if it is not set yet + + undo_lsn is always set to *lsn. The LSN part of first_undo_lsn is set only + if it was 0; its flag bits are kept. + + @param trn transaction to update (its trid must not be 0) + @param lsn value which is stored into trn->undo_lsn + + @todo move it to a separate file + + @return Operation status, always 0 (success) +*/ + +static my_bool write_hook_for_undo(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info + __attribute__ ((unused)), + LSN *lsn, + struct st_translog_parts *parts + __attribute__ ((unused))) +{ + DBUG_ASSERT(trn->trid != 0); + trn->undo_lsn= *lsn; + if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0)) + trn->first_undo_lsn= + trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn); + return 0; + /* + when we implement purging, we will specialize this hook: UNDO_PURGE + records will additionally set trn->undo_purge_lsn + */ +} + + +/** + @brief Sets the table's records count to 0, then calls the generic REDO + hook.
+ + @note tbl_info is dereferenced here although it is marked as unused. + + @todo move it to a separate file + + @return Operation status, always 0 (success) +*/ + +static my_bool write_hook_for_redo_delete_all(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info + __attribute__ ((unused)), + LSN *lsn, + struct st_translog_parts *parts + __attribute__ ((unused))) +{ + tbl_info->s->state.state.records= 0; + return write_hook_for_redo(type, trn, tbl_info, lsn, parts); +} + + +/** + @brief Increments the table's "records" count and calls the generic UNDO + hook + + @todo move it to a separate file + + @return Operation status, always 0 (success) +*/ + +static my_bool write_hook_for_undo_row_insert(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, + struct st_translog_parts *parts + __attribute__ ((unused))) +{ + tbl_info->s->state.state.records++; + return write_hook_for_undo(type, trn, tbl_info, lsn, parts); +} + + +/** + @brief Decrements the table's "records" count and calls the generic UNDO + hook + + @todo move it to a separate file + + @return Operation status, always 0 (success) +*/ + +static my_bool write_hook_for_undo_row_delete(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, + struct st_translog_parts *parts + __attribute__ ((unused))) +{ + tbl_info->s->state.state.records--; + return write_hook_for_undo(type, trn, tbl_info, lsn, parts); +} + + +/** + @brief Sets transaction's undo_lsn from the logged record and adjusts the + table's "records" count according to the type of the undone record + + The first log part starts with an LSN, which becomes trn->undo_lsn; the + type of the undone record is the byte found after that LSN and the file id. + + @note tbl_info is dereferenced here although it is marked as unused. + + @todo move it to a separate file + + @return Operation status, always 0 (success) +*/ + +static my_bool write_hook_for_clr_end(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info + __attribute__ ((unused)), + LSN *lsn + __attribute__ ((unused)), + struct st_translog_parts *parts) +{ + char *ptr= parts->parts[TRANSLOG_INTERNAL_PARTS + 0].str; + enum translog_record_type undone_record_type= + ptr[LSN_STORE_SIZE + FILEID_STORE_SIZE]; + + DBUG_ASSERT(trn->trid != 0); + trn->undo_lsn=
lsn_korr(ptr); + /* "records" is moved in the direction opposite to the one used by the write hook of the undone record type */ switch (undone_record_type) { + case LOGREC_UNDO_ROW_DELETE: + tbl_info->s->state.state.records++; + break; + case LOGREC_UNDO_ROW_INSERT: + tbl_info->s->state.state.records--; + break; + case LOGREC_UNDO_ROW_UPDATE: + break; + default: + DBUG_ASSERT(0); + } + if (trn->undo_lsn == LSN_IMPOSSIBLE) /* has fully rolled back */ + trn->first_undo_lsn= LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn); + return 0; +} + + +/** + @brief Updates table's lsn_of_file_id. + + The share's lsn_of_file_id is set to *lsn, which must be greater than the + value stored so far (asserted). + + @note lsn is read here although it is marked as unused. + + @todo move it to a separate file + + @return Operation status, always 0 (success) +*/ + +static my_bool write_hook_for_file_id(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn + __attribute__ ((unused)), + MARIA_HA *tbl_info, + LSN *lsn + __attribute__ ((unused)), + struct st_translog_parts *parts + __attribute__ ((unused))) +{ + DBUG_ASSERT(cmp_translog_addr(tbl_info->s->lsn_of_file_id, *lsn) < 0); + tbl_info->s->lsn_of_file_id= *lsn; + return 0; +} + + +/** + @brief Gives a 2-byte-id to MARIA_SHARE and logs this fact + + If a MARIA_SHARE does not yet have a 2-byte-id (unique over all currently + open MARIA_SHAREs), give it one and record this assignment in the log + (LOGREC_FILE_ID log record). + + @param tbl_info table + @param trn calling transaction + + @return Operation status + @retval 0 OK + @retval 1 Error + + @note Can be called even if share already has an id (then will do nothing) +*/ + +int translog_assign_id_to_share(MARIA_HA *tbl_info, TRN *trn) +{ + MARIA_SHARE *share= tbl_info->s; + /* + If you give an id to a non-BLOCK_RECORD table, you also need to release + this id somewhere. Then you can change the assertion.
+ */ + DBUG_ASSERT(share->data_file_type == BLOCK_RECORD); + /* re-check under mutex to avoid having 2 ids for the same share */ + pthread_mutex_lock(&share->intern_lock); + if (likely(share->id == 0)) + { + /* Inspired by set_short_trid() of trnman.c */ + uint i= share->kfile.file % SHARE_ID_MAX + 1; + do + { + my_atomic_rwlock_wrlock(&LOCK_id_to_share); + for ( ; i <= SHARE_ID_MAX ; i++) /* the range is [1..SHARE_ID_MAX] */ + { + void *tmp= NULL; + if (id_to_share[i] == NULL && + my_atomic_casptr((void **)&id_to_share[i], &tmp, share)) + { + share->id= (uint16)i; + break; + } + } + my_atomic_rwlock_wrunlock(&LOCK_id_to_share); + i= 1; /* scan the whole array */ + } while (share->id == 0); + DBUG_PRINT("info", ("id_to_share: 0x%lx -> %u", (ulong)share, share->id)); + LSN lsn; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + uchar log_data[FILEID_STORE_SIZE]; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + /* + open_file_name is an unresolved name (symlinks are not resolved, datadir + is not realpath-ed, etc) which is good: the log can be moved to another + directory and continue working. + */ + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= share->open_file_name; + /** + @todo if we had the name's length in MARIA_SHARE we could avoid this + strlen() + */ + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= + strlen(share->open_file_name) + 1; + if (unlikely(translog_write_record(&lsn, LOGREC_FILE_ID, trn, tbl_info, + sizeof(log_data) + + log_array[TRANSLOG_INTERNAL_PARTS + + 1].length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, log_data))) + return 1; + } + pthread_mutex_unlock(&share->intern_lock); + return 0; +} + + +/** + @brief Recycles a MARIA_SHARE's short id. + + @param share table + + @note Must be called only if share has an id (i.e. 
id != 0) +*/ + +void translog_deassign_id_from_share(MARIA_SHARE *share) +{ + DBUG_PRINT("info", ("id_to_share: 0x%lx id %u -> 0", + (ulong)share, share->id)); + /* + We don't need any mutex as we are called only when closing the last + instance of the table or at the end of REPAIR: no writes can be + happening. But a Checkpoint may be reading share->id, so we require this + mutex: + */ + safe_mutex_assert_owner(&share->intern_lock); + my_atomic_rwlock_rdlock(&LOCK_id_to_share); + my_atomic_storeptr((void **)&id_to_share[share->id], 0); + my_atomic_rwlock_rdunlock(&LOCK_id_to_share); + share->id= 0; + /* useless but safety: */ + share->lsn_of_file_id= LSN_IMPOSSIBLE; +} + + +void translog_assign_id_to_share_from_recovery(MARIA_SHARE *share, + uint16 id) +{ + DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded); + DBUG_ASSERT(share->data_file_type == BLOCK_RECORD); + DBUG_ASSERT(share->id == 0); + DBUG_ASSERT(id_to_share[id] == NULL); + id_to_share[share->id= id]= share; +} + + +/** + @brief check if such log file exists + + @param file_no number of the file to test + + @retval 0 no such file + @retval 1 there is file with such number +*/ + +my_bool translog_is_file(uint file_no) +{ + MY_STAT stat_buff; + char path[FN_REFLEN]; + return (test(my_stat(translog_filename_by_fileno(file_no, path), + &stat_buff, MYF(0)))); +} + + +/** + @brief returns minimum log file number + + @param horizon the end of the log + @param is_protected true if it is under purge_log protection + + @retval minimum file number + @retval 0 no files found +*/ + +static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected) +{ + uint min_file= 1, max_file; + DBUG_ENTER("translog_first_file"); + if (!is_protected) + pthread_mutex_lock(&log_descriptor.purger_lock); + if (log_descriptor.min_file_number && + translog_is_file(log_descriptor.min_file_number)) + { + DBUG_PRINT("info", ("cached %lu", + (ulong) log_descriptor.min_file_number)); + if (!is_protected) + 
pthread_mutex_unlock(&log_descriptor.purger_lock); + DBUG_RETURN(log_descriptor.min_file_number); + } + + max_file= LSN_FILE_NO(horizon); + + if (MAKE_LSN(1, TRANSLOG_PAGE_SIZE) >= horizon) + { + /* there is no first page yet */ + DBUG_RETURN(0); + } + + /* binary search for last file */ + while (min_file != max_file && min_file != (max_file - 1)) + { + uint test= (min_file + max_file) / 2; + DBUG_PRINT("info", ("min_file: %u test: %u max_file: %u", + min_file, test, max_file)); + if (test == max_file) + test--; + if (translog_is_file(test)) + max_file= test; + else + min_file= test; + } + log_descriptor.min_file_number= max_file; + if (!is_protected) + pthread_mutex_unlock(&log_descriptor.purger_lock); + DBUG_RETURN(max_file); +} + + +/** + @brief returns the most close LSN higher the given chunk address + + @param addr the chunk address to start from + @param horizon the horizon if it is known or LSN_IMPOSSIBLE + + @retval LSN_ERROR Error + @retval LSN_IMPOSSIBLE no LSNs after the address + @retval # LSN of the most close LSN higher the given chunk address +*/ + +LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon) +{ + uint chunk_type; + TRANSLOG_SCANNER_DATA scanner; + DBUG_ENTER("translog_next_LSN"); + + if (horizon == LSN_IMPOSSIBLE) + horizon= translog_get_horizon(); + + if (addr == horizon) + DBUG_RETURN(LSN_IMPOSSIBLE); + + translog_init_scanner(addr, 0, &scanner); + + chunk_type= scanner.page[scanner.page_offset] & TRANSLOG_CHUNK_TYPE; + DBUG_PRINT("info", ("type: %x byte: %x", (uint) chunk_type, + (uint) scanner.page[scanner.page_offset])); + while (chunk_type != TRANSLOG_CHUNK_LSN && + chunk_type != TRANSLOG_CHUNK_FIXED && + scanner.page[scanner.page_offset] != 0) + { + if (translog_get_next_chunk(&scanner)) + DBUG_RETURN(LSN_ERROR); + chunk_type= scanner.page[scanner.page_offset] & TRANSLOG_CHUNK_TYPE; + DBUG_PRINT("info", ("type: %x byte: %x", (uint) chunk_type, + (uint) scanner.page[scanner.page_offset])); + } + if 
(scanner.page[scanner.page_offset] == 0) + DBUG_RETURN(LSN_IMPOSSIBLE); /* reached page filler */ + DBUG_RETURN(scanner.page_addr + scanner.page_offset); +} + +/** + @brief returns the LSN of the first record starting in this log + + @retval LSN_ERROR Error + @retval LSN_IMPOSSIBLE no log or the log is empty + @retval # LSN of the first record +*/ + +LSN translog_first_lsn_in_log() +{ + TRANSLOG_ADDRESS addr, horizon= translog_get_horizon(); + TRANSLOG_VALIDATOR_DATA data; + uint file; + uint16 chunk_offset; + uchar *page; + DBUG_ENTER("translog_first_lsn_in_log"); + DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)", LSN_IN_PARTS(addr))); + DBUG_ASSERT(translog_inited == 1); + + if (!(file= translog_first_file(horizon, 0))) + { + /* log has no records yet */ + DBUG_RETURN(LSN_IMPOSSIBLE); + } + + addr= MAKE_LSN(file, TRANSLOG_PAGE_SIZE); /* the first page of the file */ + data.addr= &addr; + { + uchar buffer[TRANSLOG_PAGE_SIZE]; + if ((page= translog_get_page(&data, buffer)) == NULL || + (chunk_offset= translog_get_first_chunk_offset(page)) == 0) + DBUG_RETURN(LSN_ERROR); + } + addr+= chunk_offset; + + DBUG_RETURN(translog_next_LSN(addr, horizon)); +} + + +/** + @brief returns theoretical first LSN if first log is present + + @retval LSN_ERROR Error + @retval LSN_IMPOSSIBLE no log + @retval # LSN of the first record +*/ + +LSN translog_first_theoretical_lsn() +{ + TRANSLOG_ADDRESS addr= translog_get_horizon(); + uchar buffer[TRANSLOG_PAGE_SIZE], *page; + TRANSLOG_VALIDATOR_DATA data; + DBUG_ENTER("translog_first_theoretical_lsn"); + DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)", LSN_IN_PARTS(addr))); + DBUG_ASSERT(translog_inited == 1); + + if (!translog_is_file(1)) + DBUG_RETURN(LSN_IMPOSSIBLE); + if (addr == MAKE_LSN(1, TRANSLOG_PAGE_SIZE)) + { + /* log has no records yet */ + DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE + + log_descriptor.page_overhead)); + } + + addr= MAKE_LSN(1, TRANSLOG_PAGE_SIZE); /* the first page of the file */ + data.addr= &addr; + if ((page= 
translog_get_page(&data, buffer)) == NULL) + DBUG_RETURN(LSN_ERROR); + + DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE + + page_overhead[page[TRANSLOG_PAGE_FLAGS]])); +} + + +/** + @brief Check given low water mark and purge files if it is need + + @param low the last (minimum) address which is need + + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_purge(TRANSLOG_ADDRESS low) +{ + uint32 last_need_file= LSN_FILE_NO(low); + TRANSLOG_ADDRESS horizon= translog_get_horizon(); + int rc= 0; + DBUG_ENTER("translog_purge"); + DBUG_PRINT("enter", ("low: (%lu,0x%lx)", LSN_IN_PARTS(low))); + DBUG_ASSERT(translog_inited == 1); + + pthread_mutex_lock(&log_descriptor.purger_lock); + if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file) + { + uint32 i; + uint32 min_file= translog_first_file(horizon, 1); + DBUG_ASSERT(min_file != 0); /* log is already started */ + + for(i= min_file; i < last_need_file && rc == 0; i++) + { + LSN lsn= translog_get_file_max_lsn_stored(i); + if (lsn == LSN_IMPOSSIBLE) + break; /* files are still in writing */ + if (lsn == LSN_ERROR) + { + rc= 1; + break; + } + if (cmp_translog_addr(lsn, low) >= 0) + break; + DBUG_PRINT("info", ("purge file %lu", (ulong) i)); + { + char path[FN_REFLEN], *file_name; + file_name= translog_filename_by_fileno(i, path); + rc= test(my_delete(file_name, MYF(MY_WME))); + } + } + } + + pthread_mutex_unlock(&log_descriptor.purger_lock); + DBUG_RETURN(rc); +} diff --git a/storage/maria/ma_loghandler.h b/storage/maria/ma_loghandler.h new file mode 100644 index 00000000000..164ff013b10 --- /dev/null +++ b/storage/maria/ma_loghandler.h @@ -0,0 +1,364 @@ +/* Copyright (C) 2007 MySQL AB & Sanja Belkin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _ma_loghandler_h +#define _ma_loghandler_h + +/* transaction log default cache size (TODO: make it global variable) */ +#define TRANSLOG_PAGECACHE_SIZE 1024*1024*2 +/* transaction log default file size (TODO: make it global variable) */ +#define TRANSLOG_FILE_SIZE 1024*1024*1024 +/* transaction log default flags (TODO: make it global variable) */ +#define TRANSLOG_DEFAULT_FLAGS 0 + +/* Transaction log flags */ +#define TRANSLOG_PAGE_CRC 1 +#define TRANSLOG_SECTOR_PROTECTION (1<<1) +#define TRANSLOG_RECORD_CRC (1<<2) +#define TRANSLOG_FLAGS_NUM ((TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | \ + TRANSLOG_RECORD_CRC) + 1) + +#define RECHEADER_READ_ERROR -1 +#define RECHEADER_READ_EOF -2 + +/* + Page size in transaction log + It should be Power of 2 and multiple of DISK_DRIVE_SECTOR_SIZE + (DISK_DRIVE_SECTOR_SIZE * 2^N) +*/ +#define TRANSLOG_PAGE_SIZE (8*1024) + +#include "ma_loghandler_lsn.h" +#include "trnman_public.h" + +/* short transaction ID type */ +typedef uint16 SHORT_TRANSACTION_ID; + +struct st_maria_info; + +/* Length of CRC at end of pages */ +#define CRC_LENGTH 4 +/* Size of file id in logs */ +#define FILEID_STORE_SIZE 2 +/* Size of page reference in log */ +#define PAGE_STORE_SIZE ROW_EXTENT_PAGE_SIZE +/* Size of page ranges in log */ +#define PAGERANGE_STORE_SIZE ROW_EXTENT_COUNT_SIZE +#define DIRPOS_STORE_SIZE 1 + +/* Store methods to match the above sizes */ +#define fileid_store(T,A) int2store(T,A) +#define page_store(T,A) int5store(T,A) +#define dirpos_store(T,A) ((*(uchar*) (T)) = A) 
+#define pagerange_store(T,A) int2store(T,A) +#define fileid_korr(P) uint2korr(P) +#define page_korr(P) uint5korr(P) +#define dirpos_korr(P) ((P)[0]) +#define pagerange_korr(P) uint2korr(P) + +/* + Length of disk drive sector size (we assume that writing it + to disk is atomic operation) +*/ +#define DISK_DRIVE_SECTOR_SIZE 512 + +/* + Number of empty entries we need to have in LEX_STRING for + translog_write_record() +*/ +#define LOG_INTERNAL_PARTS 1 + +/* position reserved in an array of parts of a log record */ +#define TRANSLOG_INTERNAL_PARTS 2 + +/* types of records in the transaction log */ +/* Todo: Set numbers for these when we have all entries figured out */ + +enum translog_record_type +{ + LOGREC_RESERVED_FOR_CHUNKS23= 0, + LOGREC_REDO_INSERT_ROW_HEAD, + LOGREC_REDO_INSERT_ROW_TAIL, + LOGREC_REDO_INSERT_ROW_BLOB, + LOGREC_REDO_INSERT_ROW_BLOBS, + LOGREC_REDO_PURGE_ROW_HEAD, + LOGREC_REDO_PURGE_ROW_TAIL, + LOGREC_REDO_PURGE_BLOCKS, + LOGREC_REDO_DELETE_ROW, + LOGREC_REDO_UPDATE_ROW_HEAD, + LOGREC_REDO_INDEX, + LOGREC_REDO_UNDELETE_ROW, + LOGREC_CLR_END, + LOGREC_PURGE_END, + LOGREC_UNDO_ROW_INSERT, + LOGREC_UNDO_ROW_DELETE, + LOGREC_UNDO_ROW_UPDATE, + LOGREC_UNDO_KEY_INSERT, + LOGREC_UNDO_KEY_DELETE, + LOGREC_PREPARE, + LOGREC_PREPARE_WITH_UNDO_PURGE, + LOGREC_COMMIT, + LOGREC_COMMIT_WITH_UNDO_PURGE, + LOGREC_CHECKPOINT, + LOGREC_REDO_CREATE_TABLE, + LOGREC_REDO_RENAME_TABLE, + LOGREC_REDO_DROP_TABLE, + LOGREC_REDO_DELETE_ALL, + LOGREC_REDO_REPAIR_TABLE, + LOGREC_FILE_ID, + LOGREC_LONG_TRANSACTION_ID, + LOGREC_RESERVED_FUTURE_EXTENSION= 63 +}; +#define LOGREC_NUMBER_OF_TYPES 64 /* Maximum, can't be extended */ + +/* Size of log file; One log file is restricted to 4G */ +typedef uint32 translog_size_t; + +#define TRANSLOG_RECORD_HEADER_MAX_SIZE 1024 + +typedef struct st_translog_group_descriptor +{ + TRANSLOG_ADDRESS addr; + uint8 num; +} TRANSLOG_GROUP; + + +typedef struct st_translog_header_buffer +{ + /* LSN of the read record */ + LSN lsn; + /* array of 
groups descriptors, can be used only if groups_no > 0 */ + TRANSLOG_GROUP *groups; + /* short transaction ID or 0 if it has no sense for the record */ + SHORT_TRANSACTION_ID short_trid; + /* + The Record length in buffer (including read header, but excluding + hidden part of record (type, short TrID, length) + */ + translog_size_t record_length; + /* + Buffer for write decoded header of the record (depend on the record + type) + */ + uchar header[TRANSLOG_RECORD_HEADER_MAX_SIZE]; + /* number of groups listed in */ + uint groups_no; + /* in multi-group number of chunk0 pages (valid only if groups_no > 0) */ + uint chunk0_pages; + /* type of the read record */ + enum translog_record_type type; + /* chunk 0 data address (valid only if groups_no > 0) */ + TRANSLOG_ADDRESS chunk0_data_addr; + /* + Real compressed LSN(s) size economy (<number of LSN(s)>*7 - <real_size>) + */ + int16 compressed_LSN_economy; + /* presumably the offset at which the non-header data starts; the text previously here was a copy of the short_trid comment -- TODO confirm */ + uint16 non_header_data_start_offset; + /* non read body data length in this first chunk */ + uint16 non_header_data_len; + /* chunk 0 data size (valid only if groups_no > 0) */ + uint16 chunk0_data_len; +} TRANSLOG_HEADER_BUFFER; + + +typedef struct st_translog_scanner_data +{ + uchar buffer[TRANSLOG_PAGE_SIZE]; /* buffer for page content */ + TRANSLOG_ADDRESS page_addr; /* current page address */ + /* end of the log which we saw last time */ + TRANSLOG_ADDRESS horizon; + TRANSLOG_ADDRESS last_file_page; /* Last page on in this file */ + uchar *page; /* page content pointer */ + /* offset of the chunk in the page */ + translog_size_t page_offset; + /* set horizon only once at init */ + my_bool fixed_horizon; +} TRANSLOG_SCANNER_DATA; + + +struct st_translog_reader_data +{ + TRANSLOG_HEADER_BUFFER header; /* Header */ + TRANSLOG_SCANNER_DATA scanner; /* chunks scanner */ + translog_size_t body_offset; /* current chunk body offset */ + /* data offset from the record beginning */ + translog_size_t 
current_offset; + /* number of bytes read in header */ + uint16 read_header; + uint16 chunk_size; /* current chunk size */ + uint current_group; /* current group */ + uint current_chunk; /* current chunk in the group */ + my_bool eor; /* end of the record */ +}; + +struct st_transaction; +C_MODE_START + +/* Records types for unittests */ +#define LOGREC_FIXED_RECORD_0LSN_EXAMPLE 1 +#define LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE 2 +#define LOGREC_FIXED_RECORD_1LSN_EXAMPLE 3 +#define LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE 4 +#define LOGREC_FIXED_RECORD_2LSN_EXAMPLE 5 +#define LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE 6 + +extern void example_loghandler_init(); + +extern my_bool translog_init(const char *directory, uint32 log_file_max_size, + uint32 server_version, uint32 server_id, + PAGECACHE *pagecache, uint flags); + +extern my_bool +translog_write_record(LSN *lsn, enum translog_record_type type, + struct st_transaction *trn, + struct st_maria_info *tbl_info, + translog_size_t rec_len, uint part_no, + LEX_STRING *parts_data, uchar *store_share_id); + +extern void translog_destroy(); + +extern int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff); + +extern void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff); + +extern translog_size_t translog_read_record(LSN lsn, + translog_size_t offset, + translog_size_t length, + uchar *buffer, + struct st_translog_reader_data + *data); + +extern my_bool translog_flush(LSN lsn); + +extern my_bool translog_init_scanner(LSN lsn, + my_bool fixed_horizon, + struct st_translog_scanner_data *scanner); + +extern int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner, + TRANSLOG_HEADER_BUFFER *buff); +extern LSN translog_get_file_max_lsn_stored(uint32 file); +extern my_bool translog_purge(TRANSLOG_ADDRESS low); +extern my_bool translog_is_file(uint file_no); +extern my_bool translog_lock(); +extern my_bool translog_unlock(); +extern void translog_lock_assert_owner(); +extern TRANSLOG_ADDRESS 
translog_get_horizon(); +extern TRANSLOG_ADDRESS translog_get_horizon_no_lock(); +extern int translog_assign_id_to_share(struct st_maria_info *tbl_info, + struct st_transaction *trn); +extern void translog_deassign_id_from_share(struct st_maria_share *share); +extern void +translog_assign_id_to_share_from_recovery(struct st_maria_share *share, + uint16 id); +extern my_bool translog_inited; + +/* + all the rest added because of recovery; should we make + ma_loghandler_for_recovery.h ? +*/ + +#define SHARE_ID_MAX 65535 /* array's size */ + +extern LSN translog_first_lsn_in_log(); +extern LSN translog_first_theoretical_lsn(); +extern LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon); + +/* record parts descriptor */ +struct st_translog_parts +{ + /* full record length */ + translog_size_t record_length; + /* full record length with chunk headers */ + translog_size_t total_record_length; + /* current part index */ + uint current; + /* total number of elements in parts */ + uint elements; + /* array of parts (LEX_STRING) */ + LEX_STRING *parts; +}; + +typedef my_bool(*prewrite_rec_hook) (enum translog_record_type type, + TRN *trn, struct st_maria_info *tbl_info, + struct st_translog_parts *parts); + +typedef my_bool(*inwrite_rec_hook) (enum translog_record_type type, + TRN *trn, struct st_maria_info *tbl_info, + LSN *lsn, + struct st_translog_parts *parts); + +typedef uint16(*read_rec_hook) (enum translog_record_type type, + uint16 read_length, uchar *read_buff, + uchar *decoded_buff); + + +/* record classes */ +enum record_class +{ + LOGRECTYPE_NOT_ALLOWED, + LOGRECTYPE_VARIABLE_LENGTH, + LOGRECTYPE_PSEUDOFIXEDLENGTH, + LOGRECTYPE_FIXEDLENGTH +}; + +/* C++ can't bear that a variable's name is "class" */ +#ifndef __cplusplus + +enum enum_record_in_group { + LOGREC_NOT_LAST_IN_GROUP= 0, LOGREC_LAST_IN_GROUP, LOGREC_IS_GROUP_ITSELF +}; + +/* + Descriptor of log record type + Note: Don't reorder because of constructs later... 
+*/ +typedef struct st_log_record_type_descriptor +{ + /* internal class of the record */ + enum record_class class; + /* + length for fixed-size record, pseudo-fixed record + length with uncompressed LSNs + */ + uint16 fixed_length; + /* how much record body (belonged to headers too) read with headers */ + uint16 read_header_len; + /* HOOK for writing the record called before lock */ + prewrite_rec_hook prewrite_hook; + /* HOOK for writing the record called when LSN is known, inside lock */ + inwrite_rec_hook inwrite_hook; + /* HOOK for reading headers */ + read_rec_hook read_hook; + /* + For pseudo fixed records number of compressed LSNs followed by + system header + */ + int16 compressed_LSN; + /* the rest is for maria_read_log & Recovery */ + /** @brief for debug error messages or "maria_read_log" command-line tool */ + const char *name; + enum enum_record_in_group record_in_group; + /* a function to execute when we see the record during the REDO phase */ + int (*record_execute_in_redo_phase)(const TRANSLOG_HEADER_BUFFER *); + /* a function to execute when we see the record during the UNDO phase */ + int (*record_execute_in_undo_phase)(const TRANSLOG_HEADER_BUFFER *, TRN *); +} LOG_DESC; + +extern LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES]; +#endif + +C_MODE_END +#endif diff --git a/storage/maria/ma_loghandler_lsn.h b/storage/maria/ma_loghandler_lsn.h new file mode 100644 index 00000000000..e019be16fd2 --- /dev/null +++ b/storage/maria/ma_loghandler_lsn.h @@ -0,0 +1,100 @@ +/* Copyright (C) 2007 MySQL AB & Sanja Belkin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _ma_loghandler_lsn_h +#define _ma_loghandler_lsn_h + +/* + Transaction log record address: + file_no << 32 | offset + file_no is only 3 bytes so we can use signed integer to make + comparison more simple. +*/ +typedef int64 TRANSLOG_ADDRESS; + +/* + Compare addresses + A1 > A2 -> result > 0 + A1 == A2 -> 0 + A1 < A2 -> result < 0 +*/ +#define cmp_translog_addr(A1,A2) ((A1) - (A2)) + +/* LSN type (address of a certain log record chunk) */ +typedef TRANSLOG_ADDRESS LSN; + +/* Gets file number part of a LSN/log address */ +#define LSN_FILE_NO(L) ((L) >> 32) + +/* Gets raw file number part of a LSN/log address */ +#define LSN_FILE_NO_PART(L) ((L) & ((int64)0xFFFFFF00000000LL)) + +/* Parts of LSN for printing: expands to two comma-separated (ulong) arguments, the file number then the offset */ +#define LSN_IN_PARTS(L) (ulong)LSN_FILE_NO(L),(ulong)LSN_OFFSET(L) + +/* Gets record offset of a LSN/log address */ +#define LSN_OFFSET(L) ((L) & 0xFFFFFFFFL) + +/* Makes lsn/log address from file number and record offset */ +#define MAKE_LSN(F,S) ((LSN) ((((uint64)(F)) << 32) | (S))) + +/* checks LSN */ +#define LSN_VALID(L) \ + ((LSN_FILE_NO_PART(L) != FILENO_IMPOSSIBLE) && \ + (LSN_OFFSET(L) != LOG_OFFSET_IMPOSSIBLE)) + +/* size of stored LSN on a disk, don't change it! 
*/ +#define LSN_STORE_SIZE 7 + +/* Puts LSN into buffer (dst) */ +#define lsn_store(dst, lsn) \ + do { \ + int3store((dst), LSN_FILE_NO(lsn)); \ + int4store((dst) + 3, LSN_OFFSET(lsn)); \ + } while (0) + +/* Unpacks LSN from the buffer (P) */ +#define lsn_korr(P) MAKE_LSN(uint3korr(P), uint4korr((P) + 3)) + +/* what we need to add to LSN to increase it on one file */ +#define LSN_ONE_FILE ((int64)0x100000000LL) + +#define LSN_REPLACE_OFFSET(L, S) (LSN_FILE_NO_PART(L) | (S)) + +/* + an 8-byte type whose most significant uchar is used for "flags"; 7 + other bytes are a LSN. +*/ +typedef LSN LSN_WITH_FLAGS; +#define LSN_WITH_FLAGS_TO_LSN(x) (x & ULL(0x00FFFFFFFFFFFFFF)) +#define LSN_WITH_FLAGS_TO_FLAGS(x) (x & ULL(0xFF00000000000000)) + +#define FILENO_IMPOSSIBLE 0 /**< log file's numbering starts at 1 */ +#define LOG_OFFSET_IMPOSSIBLE 0 /**< log always has a header */ +#define LSN_IMPOSSIBLE 0 +/* following LSN also is impossible */ +#define LSN_ERROR 1 + +/** @brief some impossible LSN serve as markers */ +#define LSN_REPAIRED_BY_MARIA_CHK ((LSN)2) + +/** + @brief the maximum valid LSN. + Unlike ULONGLONG_MAX, it can be safely used in comparison with valid LSNs + (ULONGLONG_MAX is too big for correctness of cmp_translog_address()). +*/ +#define LSN_MAX (LSN)ULL(0x00FFFFFFFFFFFFFF) + +#endif diff --git a/storage/maria/ma_open.c b/storage/maria/ma_open.c new file mode 100644 index 00000000000..9b665cfb958 --- /dev/null +++ b/storage/maria/ma_open.c @@ -0,0 +1,1577 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* open a isam-database */ + +#include "ma_fulltext.h" +#include "ma_sp_defs.h" +#include "ma_rt_index.h" +#include "ma_blockrec.h" +#include "trnman.h" +#include <m_ctype.h> + +#if defined(MSDOS) || defined(__WIN__) +#ifdef __WIN__ +#include <fcntl.h> +#else +#include <process.h> /* Prototype for getpid */ +#endif +#endif +#ifdef VMS +#include "static.c" +#endif + +static void setup_key_functions(MARIA_KEYDEF *keyinfo); +static my_bool maria_scan_init_dummy(MARIA_HA *info); +static void maria_scan_end_dummy(MARIA_HA *info); +static my_bool maria_once_init_dummy(MARIA_SHARE *, File); +static my_bool maria_once_end_dummy(MARIA_SHARE *); +static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base); +static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state); + +#define get_next_element(to,pos,size) { memcpy((char*) to,pos,(size_t) size); \ + pos+=size;} + + +#define disk_pos_assert(pos, end_pos) \ +if (pos > end_pos) \ +{ \ + my_errno=HA_ERR_CRASHED; \ + goto err; \ +} + + +/****************************************************************************** +** Return the shared struct if the table is already open. +** In MySQL the server will handle version issues. 
+******************************************************************************/ + +MARIA_HA *_ma_test_if_reopen(char *filename) +{ + LIST *pos; + + for (pos=maria_open_list ; pos ; pos=pos->next) + { + MARIA_HA *info=(MARIA_HA*) pos->data; + MARIA_SHARE *share=info->s; + if (!strcmp(share->unique_file_name,filename) && share->last_version) + return info; + } + return 0; +} + + +/* + Open a new instance of an already opened Maria table + + SYNOPSIS + maria_clone_internal() + share Share of already open table + mode Mode of table (O_RDONLY | O_RDWR) + data_file Filedescriptor of data file to use < 0 if one should open + open it. + + RETURN + # Maria handler + 0 Error +*/ + + +static MARIA_HA *maria_clone_internal(MARIA_SHARE *share, int mode, + File data_file) +{ + int save_errno; + uint errpos; + MARIA_HA info,*m_info; + my_bitmap_map *changed_fields_bitmap; + DBUG_ENTER("maria_clone_internal"); + + errpos= 0; + bzero((uchar*) &info,sizeof(info)); + + if (mode == O_RDWR && share->mode == O_RDONLY) + { + my_errno=EACCES; /* Can't open in write mode */ + goto err; + } + if (data_file >= 0) + info.dfile.file= data_file; + else if (_ma_open_datafile(&info, share, -1)) + goto err; + errpos= 5; + + /* alloc and set up private structure parts */ + if (!my_multi_malloc(MY_WME, + &m_info,sizeof(MARIA_HA), + &info.blobs,sizeof(MARIA_BLOB)*share->base.blobs, + &info.buff,(share->base.max_key_block_length*2+ + share->base.max_key_length), + &info.lastkey,share->base.max_key_length*3+1, + &info.first_mbr_key, share->base.max_key_length, + &info.maria_rtree_recursion_state, + share->have_rtree ? 
1024 : 0, + &changed_fields_bitmap, + bitmap_buffer_size(share->base.fields), + NullS)) + goto err; + errpos= 6; + + memcpy(info.blobs,share->blobs,sizeof(MARIA_BLOB)*share->base.blobs); + info.lastkey2=info.lastkey+share->base.max_key_length; + + info.s=share; + info.cur_row.lastpos= HA_OFFSET_ERROR; + info.update= (short) (HA_STATE_NEXT_FOUND+HA_STATE_PREV_FOUND); + info.opt_flag=READ_CHECK_USED; + info.this_unique= (ulong) info.dfile.file; /* Uniq number in process */ + if (share->data_file_type == COMPRESSED_RECORD) + info.this_unique= share->state.unique; + info.this_loop=0; /* Update counter */ + info.last_unique= share->state.unique; + info.last_loop= share->state.update_count; + info.lock_type=F_UNLCK; + info.quick_mode=0; + info.bulk_insert=0; + info.ft1_to_ft2=0; + info.errkey= -1; + info.page_changed=1; + info.keyread_buff= info.buff + share->base.max_key_block_length; + bitmap_init(&info.changed_fields, changed_fields_bitmap, + share->base.fields, 0); + if ((*share->init)(&info)) + goto err; + + pthread_mutex_lock(&share->intern_lock); + info.read_record= share->read_record; + share->reopen++; + share->write_flag=MYF(MY_NABP | MY_WAIT_IF_FULL); + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + info.lock_type=F_RDLCK; + share->r_locks++; + share->tot_locks++; + } + if (share->options & HA_OPTION_TMP_TABLE) + { + share->temporary= share->delay_key_write= 1; + + share->write_flag=MYF(MY_NABP); + share->w_locks++; /* We don't have to update status */ + share->tot_locks++; + info.lock_type=F_WRLCK; + } + if ((share->options & HA_OPTION_DELAY_KEY_WRITE) && + maria_delay_key_write) + share->delay_key_write=1; + + info.state= &share->state.state; /* Change global values by default */ + if (!share->base.born_transactional) /* but for transactional ones ... */ + info.trn= &dummy_transaction_object; /* ... 
force crash if no trn given */ + pthread_mutex_unlock(&share->intern_lock); + + /* Allocate buffer for one record */ + /* prerequisites: info->rec_buffer == 0 && info->rec_buff_size == 0 */ + if (_ma_alloc_buffer(&info.rec_buff, &info.rec_buff_size, + share->base.default_rec_buff_size)) + goto err; + + bzero(info.rec_buff, share->base.default_rec_buff_size); + + *m_info=info; +#ifdef THREAD + thr_lock_data_init(&share->lock,&m_info->lock,(void*) m_info); +#endif + m_info->open_list.data=(void*) m_info; + maria_open_list=list_add(maria_open_list,&m_info->open_list); + + DBUG_RETURN(m_info); + +err: + save_errno=my_errno ? my_errno : HA_ERR_END_OF_FILE; + if ((save_errno == HA_ERR_CRASHED) || + (save_errno == HA_ERR_CRASHED_ON_USAGE) || + (save_errno == HA_ERR_CRASHED_ON_REPAIR)) + _ma_report_error(save_errno, share->open_file_name); + switch (errpos) { + case 6: + (*share->end)(&info); + my_free((uchar*) m_info,MYF(0)); + /* fall through */ + case 5: + if (data_file < 0) + VOID(my_close(info.dfile.file, MYF(0))); + break; + } + my_errno=save_errno; + DBUG_RETURN (NULL); +} /* maria_clone_internal */ + + +/* Make a clone of a maria table */ + +MARIA_HA *maria_clone(MARIA_SHARE *share, int mode) +{ + MARIA_HA *new_info; + pthread_mutex_lock(&THR_LOCK_maria); + new_info= maria_clone_internal(share, mode, + share->data_file_type == BLOCK_RECORD ? + share->bitmap.file.file : -1); + pthread_mutex_unlock(&THR_LOCK_maria); + return new_info; +} + + +/****************************************************************************** + open a MARIA table + + See my_base.h for the handle_locking argument + if handle_locking and HA_OPEN_ABORT_IF_CRASHED then abort if the table + is marked crashed or if we are not using locking and the table doesn't + have an open count of 0. 
+******************************************************************************/ + +MARIA_HA *maria_open(const char *name, int mode, uint open_flags) +{ + int kfile,open_mode,save_errno; + uint i,j,len,errpos,head_length,base_pos,info_length,keys, + key_parts,unique_key_parts,fulltext_keys,uniques; + char name_buff[FN_REFLEN], org_name[FN_REFLEN], index_name[FN_REFLEN], + data_name[FN_REFLEN]; + char *disk_cache, *disk_pos, *end_pos; + MARIA_HA info,*m_info,*old_info; + MARIA_SHARE share_buff,*share; + ulong rec_per_key_part[HA_MAX_POSSIBLE_KEY*HA_MAX_KEY_SEG]; + my_off_t key_root[HA_MAX_POSSIBLE_KEY]; + ulonglong max_key_file_length, max_data_file_length; + File data_file= -1; + DBUG_ENTER("maria_open"); + + LINT_INIT(m_info); + kfile= -1; + errpos= 0; + head_length=sizeof(share_buff.state.header); + bzero((uchar*) &info,sizeof(info)); + + my_realpath(name_buff, fn_format(org_name,name,"",MARIA_NAME_IEXT, + MY_UNPACK_FILENAME),MYF(0)); + pthread_mutex_lock(&THR_LOCK_maria); + old_info= 0; + if ((open_flags & HA_OPEN_COPY) || + !(old_info=_ma_test_if_reopen(name_buff))) + { + share= &share_buff; + bzero((uchar*) &share_buff,sizeof(share_buff)); + share_buff.state.rec_per_key_part=rec_per_key_part; + share_buff.state.key_root=key_root; + share_buff.pagecache= multi_pagecache_search(name_buff, strlen(name_buff), + maria_pagecache); + + DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_open", + if (strstr(name, "/t1")) + { + my_errno= HA_ERR_CRASHED; + goto err; + }); + if ((kfile=my_open(name_buff,(open_mode=O_RDWR) | O_SHARE,MYF(0))) < 0) + { + if ((errno != EROFS && errno != EACCES) || + mode != O_RDONLY || + (kfile=my_open(name_buff,(open_mode=O_RDONLY) | O_SHARE,MYF(0))) < 0) + goto err; + } + share->mode=open_mode; + errpos= 1; + if (my_read(kfile,(char*) share->state.header.file_version,head_length, + MYF(MY_NABP))) + { + my_errno= HA_ERR_NOT_A_TABLE; + goto err; + } + if (memcmp((uchar*) share->state.header.file_version, + (uchar*) maria_file_magic, 4)) + { + 
DBUG_PRINT("error",("Wrong header in %s",name_buff)); + DBUG_DUMP("error_dump",(char*) share->state.header.file_version, + head_length); + my_errno=HA_ERR_NOT_A_TABLE; + goto err; + } + share->options= mi_uint2korr(share->state.header.options); + if (share->options & + ~(HA_OPTION_PACK_RECORD | HA_OPTION_PACK_KEYS | + HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA | + HA_OPTION_TEMP_COMPRESS_RECORD | HA_OPTION_CHECKSUM | + HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE | + HA_OPTION_RELIES_ON_SQL_LAYER | HA_OPTION_NULL_FIELDS)) + { + DBUG_PRINT("error",("wrong options: 0x%lx", share->options)); + my_errno=HA_ERR_OLD_FILE; + goto err; + } + if ((share->options & HA_OPTION_RELIES_ON_SQL_LAYER) && + ! (open_flags & HA_OPEN_FROM_SQL_LAYER)) + { + DBUG_PRINT("error", ("table cannot be openned from non-sql layer")); + my_errno= HA_ERR_UNSUPPORTED; + goto err; + } + /* Don't call realpath() if the name can't be a link */ + if (!strcmp(name_buff, org_name) || + my_readlink(index_name, org_name, MYF(0)) == -1) + (void) strmov(index_name, org_name); + *strrchr(org_name, '.')= '\0'; + (void) fn_format(data_name,org_name,"",MARIA_NAME_DEXT, + MY_APPEND_EXT|MY_UNPACK_FILENAME|MY_RESOLVE_SYMLINKS); + + info_length=mi_uint2korr(share->state.header.header_length); + base_pos= mi_uint2korr(share->state.header.base_pos); + if (!(disk_cache=(char*) my_alloca(info_length+128))) + { + my_errno=ENOMEM; + goto err; + } + end_pos=disk_cache+info_length; + errpos= 2; + + VOID(my_seek(kfile,0L,MY_SEEK_SET,MYF(0))); + errpos= 3; + if (my_read(kfile,disk_cache,info_length,MYF(MY_NABP))) + { + my_errno=HA_ERR_CRASHED; + goto err; + } + len=mi_uint2korr(share->state.header.state_info_length); + keys= (uint) share->state.header.keys; + uniques= (uint) share->state.header.uniques; + fulltext_keys= (uint) share->state.header.fulltext_keys; + key_parts= mi_uint2korr(share->state.header.key_parts); + unique_key_parts= mi_uint2korr(share->state.header.unique_key_parts); + if (len != 
MARIA_STATE_INFO_SIZE) + { + DBUG_PRINT("warning", + ("saved_state_info_length: %d state_info_length: %d", + len,MARIA_STATE_INFO_SIZE)); + } + share->state_diff_length=len-MARIA_STATE_INFO_SIZE; + + _ma_state_info_read(disk_cache, &share->state); + len= mi_uint2korr(share->state.header.base_info_length); + if (len != MARIA_BASE_INFO_SIZE) + { + DBUG_PRINT("warning",("saved_base_info_length: %d base_info_length: %d", + len,MARIA_BASE_INFO_SIZE)); + } + disk_pos= _ma_base_info_read(disk_cache + base_pos, &share->base); + share->state.state_length=base_pos; + + if (!(open_flags & HA_OPEN_FOR_REPAIR) && + ((share->state.changed & STATE_CRASHED) || + ((open_flags & HA_OPEN_ABORT_IF_CRASHED) && + (my_disable_locking && share->state.open_count)))) + { + DBUG_PRINT("error",("Table is marked as crashed. open_flags: %u " + "changed: %u open_count: %u !locking: %d", + open_flags, share->state.changed, + share->state.open_count, my_disable_locking)); + my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ? + HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE); + goto err; + } + + /* sanity check */ + if (share->base.keystart > 65535 || share->base.rec_reflength > 8) + { + my_errno=HA_ERR_CRASHED; + goto err; + } + + key_parts+=fulltext_keys*FT_SEGS; + if (share->base.max_key_length > maria_max_key_length() || + keys > MARIA_MAX_KEY || key_parts > MARIA_MAX_KEY * HA_MAX_KEY_SEG) + { + DBUG_PRINT("error",("Wrong key info: Max_key_length: %d keys: %d key_parts: %d", share->base.max_key_length, keys, key_parts)); + my_errno=HA_ERR_UNSUPPORTED; + goto err; + } + /* + If page cache is not initialized, then assume we will create it + after the table is opened! 
+ */ + if (share->base.block_size != maria_block_size && + share_buff.pagecache->inited != 0) + { + DBUG_PRINT("error", ("Wrong block size %u; Expected %u", + (uint) share->base.block_size, + (uint) maria_block_size)); + my_errno=HA_ERR_UNSUPPORTED; + goto err; + } + + /* Correct max_file_length based on length of sizeof(off_t) */ + max_data_file_length= + (share->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) ? + (((ulonglong) 1 << (share->base.rec_reflength*8))-1) : + (_ma_safe_mul(share->base.pack_reclength, + (ulonglong) 1 << (share->base.rec_reflength*8))-1); + + max_key_file_length= + _ma_safe_mul(MARIA_MIN_KEY_BLOCK_LENGTH, + ((ulonglong) 1 << (share->base.key_reflength*8))-1); +#if SIZEOF_OFF_T == 4 + set_if_smaller(max_data_file_length, INT_MAX32); + set_if_smaller(max_key_file_length, INT_MAX32); +#endif + share->base.max_data_file_length=(my_off_t) max_data_file_length; + share->base.max_key_file_length=(my_off_t) max_key_file_length; + + if (share->options & HA_OPTION_COMPRESS_RECORD) + share->base.max_key_length+=2; /* For safety */ + + if (!my_multi_malloc(MY_WME, + &share,sizeof(*share), + &share->state.rec_per_key_part,sizeof(long)*key_parts, + &share->keyinfo,keys*sizeof(MARIA_KEYDEF), + &share->uniqueinfo,uniques*sizeof(MARIA_UNIQUEDEF), + &share->keyparts, + (key_parts+unique_key_parts+keys+uniques) * + sizeof(HA_KEYSEG), + &share->columndef, + (share->base.fields+1)*sizeof(MARIA_COLUMNDEF), + &share->blobs,sizeof(MARIA_BLOB)*share->base.blobs, + &share->unique_file_name,strlen(name_buff)+1, + &share->index_file_name,strlen(index_name)+1, + &share->data_file_name,strlen(data_name)+1, + &share->open_file_name,strlen(name)+1, + &share->state.key_root,keys*sizeof(my_off_t), +#ifdef THREAD + &share->key_root_lock,sizeof(rw_lock_t)*keys, +#endif + &share->mmap_lock,sizeof(rw_lock_t), + NullS)) + goto err; + errpos= 4; + + *share=share_buff; + memcpy((char*) share->state.rec_per_key_part, + (char*) rec_per_key_part, 
sizeof(long)*key_parts); + memcpy((char*) share->state.key_root, + (char*) key_root, sizeof(my_off_t)*keys); + strmov(share->unique_file_name, name_buff); + share->unique_name_length= strlen(name_buff); + strmov(share->index_file_name, index_name); + strmov(share->data_file_name, data_name); + strmov(share->open_file_name, name); + + share->block_size= share->base.block_size; + { + HA_KEYSEG *pos=share->keyparts; + for (i=0 ; i < keys ; i++) + { + share->keyinfo[i].share= share; + disk_pos=_ma_keydef_read(disk_pos, &share->keyinfo[i]); + disk_pos_assert(disk_pos + share->keyinfo[i].keysegs * HA_KEYSEG_SIZE, + end_pos); + if (share->keyinfo[i].key_alg == HA_KEY_ALG_RTREE) + share->have_rtree= 1; + share->keyinfo[i].seg=pos; + for (j=0 ; j < share->keyinfo[i].keysegs; j++,pos++) + { + disk_pos=_ma_keyseg_read(disk_pos, pos); + if (pos->type == HA_KEYTYPE_TEXT || + pos->type == HA_KEYTYPE_VARTEXT1 || + pos->type == HA_KEYTYPE_VARTEXT2) + { + if (!pos->language) + pos->charset=default_charset_info; + else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME)))) + { + my_errno=HA_ERR_UNKNOWN_CHARSET; + goto err; + } + } + else if (pos->type == HA_KEYTYPE_BINARY) + pos->charset= &my_charset_bin; + } + if (share->keyinfo[i].flag & HA_SPATIAL) + { +#ifdef HAVE_SPATIAL + uint sp_segs=SPDIMS*2; + share->keyinfo[i].seg=pos-sp_segs; + share->keyinfo[i].keysegs--; +#else + my_errno=HA_ERR_UNSUPPORTED; + goto err; +#endif + } + else if (share->keyinfo[i].flag & HA_FULLTEXT) + { + if (!fulltext_keys) + { /* 4.0 compatibility code, to be removed in 5.0 */ + share->keyinfo[i].seg=pos-FT_SEGS; + share->keyinfo[i].keysegs-=FT_SEGS; + } + else + { + uint k; + share->keyinfo[i].seg=pos; + for (k=0; k < FT_SEGS; k++) + { + *pos= ft_keysegs[k]; + pos[0].language= pos[-1].language; + if (!(pos[0].charset= pos[-1].charset)) + { + my_errno=HA_ERR_CRASHED; + goto err; + } + pos++; + } + } + if (!share->ft2_keyinfo.seg) + { + memcpy(& share->ft2_keyinfo, & share->keyinfo[i], 
sizeof(MARIA_KEYDEF)); + share->ft2_keyinfo.keysegs=1; + share->ft2_keyinfo.flag=0; + share->ft2_keyinfo.keylength= + share->ft2_keyinfo.minlength= + share->ft2_keyinfo.maxlength=HA_FT_WLEN+share->base.rec_reflength; + share->ft2_keyinfo.seg=pos-1; + share->ft2_keyinfo.end=pos; + setup_key_functions(& share->ft2_keyinfo); + } + } + setup_key_functions(share->keyinfo+i); + share->keyinfo[i].end=pos; + pos->type=HA_KEYTYPE_END; /* End */ + pos->length=share->base.rec_reflength; + pos->null_bit=0; + pos->flag=0; /* For purify */ + pos++; + } + for (i=0 ; i < uniques ; i++) + { + disk_pos=_ma_uniquedef_read(disk_pos, &share->uniqueinfo[i]); + disk_pos_assert(disk_pos + share->uniqueinfo[i].keysegs * + HA_KEYSEG_SIZE, end_pos); + share->uniqueinfo[i].seg=pos; + for (j=0 ; j < share->uniqueinfo[i].keysegs; j++,pos++) + { + disk_pos=_ma_keyseg_read(disk_pos, pos); + if (pos->type == HA_KEYTYPE_TEXT || + pos->type == HA_KEYTYPE_VARTEXT1 || + pos->type == HA_KEYTYPE_VARTEXT2) + { + if (!pos->language) + pos->charset=default_charset_info; + else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME)))) + { + my_errno=HA_ERR_UNKNOWN_CHARSET; + goto err; + } + } + } + share->uniqueinfo[i].end=pos; + pos->type=HA_KEYTYPE_END; /* End */ + pos->null_bit=0; + pos->flag=0; + pos++; + } + share->ftparsers= 0; + } + share->data_file_type= share->state.header.data_file_type; + share->base_length= (BASE_ROW_HEADER_SIZE + + share->base.is_nulls_extended + + share->base.null_bytes + + share->base.pack_bytes + + test(share->options & HA_OPTION_CHECKSUM)); + if (open_flags & HA_OPEN_COPY) + { + /* + this instance will be a temporary one used just to create a data + file for REPAIR. Don't do logging. This base information will not go + to disk. 
+ */ + share->base.born_transactional= FALSE; + } + if (share->base.born_transactional) + { + share->page_type= PAGECACHE_LSN_PAGE; +#ifdef ENABLE_WHEN_WE_HAVE_TRANS_ROW_ID /* QQ */ + share->base_length+= TRANS_ROW_EXTRA_HEADER_SIZE; +#endif + if (share->state.create_rename_lsn == LSN_REPAIRED_BY_MARIA_CHK) + { + /* + Was repaired with maria_chk, maybe later maria_pack-ed. Some sort of + import into the server. It starts its existence (from the point of + view of the server, including server's recovery) now. + */ + if ((open_flags & HA_OPEN_FROM_SQL_LAYER) || maria_in_recovery) + _ma_update_create_rename_lsn_sub(share, translog_get_horizon(), + TRUE); + } + else if ((!LSN_VALID(share->state.create_rename_lsn) || + !LSN_VALID(share->state.is_of_horizon) || + (cmp_translog_addr(share->state.create_rename_lsn, + share->state.is_of_horizon) > 0)) && + !(open_flags & HA_OPEN_FOR_REPAIR)) + { + /* + If in Recovery, it will not work. If LSN is invalid and not + LSN_REPAIRED_BY_MARIA_CHK, header must be corrupted. + In both cases, must repair. + */ + my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ? 
+ HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE); + goto err; + } + } + else + share->page_type= PAGECACHE_PLAIN_PAGE; + share->now_transactional= share->base.born_transactional; + + share->base.default_rec_buff_size= max(share->base.pack_reclength, + share->base.max_key_length); + if (share->data_file_type == DYNAMIC_RECORD) + { + share->base.extra_rec_buff_size= + (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER) + MARIA_SPLIT_LENGTH + + MARIA_REC_BUFF_OFFSET); + share->base.default_rec_buff_size+= share->base.extra_rec_buff_size; + } + disk_pos_assert(disk_pos + share->base.fields *MARIA_COLUMNDEF_SIZE, + end_pos); + for (i= j= 0 ; i < share->base.fields ; i++) + { + disk_pos=_ma_columndef_read(disk_pos,&share->columndef[i]); + share->columndef[i].pack_type=0; + share->columndef[i].huff_tree=0; + if (share->columndef[i].type == (int) FIELD_BLOB) + { + share->blobs[j].pack_length= + share->columndef[i].length-portable_sizeof_char_ptr;; + share->blobs[j].offset= share->columndef[i].offset; + j++; + } + } + share->columndef[i].type=(int) FIELD_LAST; /* End marker */ + + if ((share->data_file_type == BLOCK_RECORD || + share->data_file_type == COMPRESSED_RECORD)) + { + if (_ma_open_datafile(&info, share, -1)) + goto err; + data_file= info.dfile.file; + } + errpos= 5; + + share->kfile.file= kfile; + share->this_process=(ulong) getpid(); + share->last_process= share->state.process; + share->base.key_parts=key_parts; + share->base.all_key_parts=key_parts+unique_key_parts; + if (!(share->last_version=share->state.version)) + share->last_version=1; /* Safety */ + share->rec_reflength=share->base.rec_reflength; /* May be changed */ + share->base.margin_key_file_length=(share->base.max_key_file_length - + (keys ? 
MARIA_INDEX_BLOCK_MARGIN * + share->block_size * keys : 0)); + share->block_size= share->base.block_size; + my_afree((uchar*) disk_cache); + _ma_setup_functions(share); + if ((*share->once_init)(share, info.dfile.file)) + goto err; + share->is_log_table= FALSE; + if (open_flags & HA_OPEN_TMP_TABLE) + share->options|= HA_OPTION_TMP_TABLE; + if (open_flags & HA_OPEN_DELAY_KEY_WRITE) + share->options|= HA_OPTION_DELAY_KEY_WRITE; + if (mode == O_RDONLY) + share->options|= HA_OPTION_READ_ONLY_DATA; + +#ifdef THREAD + thr_lock_init(&share->lock); + VOID(pthread_mutex_init(&share->intern_lock,MY_MUTEX_INIT_FAST)); + for (i=0; i<keys; i++) + VOID(my_rwlock_init(&share->key_root_lock[i], NULL)); + VOID(my_rwlock_init(&share->mmap_lock, NULL)); + if (!thr_lock_inited) + { + /* Probably a single threaded program; Don't use concurrent inserts */ + maria_concurrent_insert=0; + } + else if (maria_concurrent_insert) + { + share->concurrent_insert= + ((share->options & (HA_OPTION_READ_ONLY_DATA | HA_OPTION_TMP_TABLE | + HA_OPTION_COMPRESS_RECORD | + HA_OPTION_TEMP_COMPRESS_RECORD)) || + (open_flags & HA_OPEN_TMP_TABLE) || + share->data_file_type == BLOCK_RECORD || + share->have_rtree) ? 0 : 1; + if (share->concurrent_insert) + { + share->lock.get_status=_ma_get_status; + share->lock.copy_status=_ma_copy_status; + /** + @todo RECOVERY + INSERT DELAYED and concurrent inserts are currently disabled for + transactional tables; when enabled again, we should re-evaluate + what problems the call to _ma_update_status() by + thr_reschedule_write_lock() can do (it may hurt Checkpoint as it + would be without intern_lock, and it modifies the state). + */ + share->lock.update_status=_ma_update_status; + share->lock.restore_status=_ma_restore_status; + share->lock.check_status=_ma_check_status; + } + } +#endif + /* + Memory mapping can only be requested after initializing intern_lock. 
+ */ + if (open_flags & HA_OPEN_MMAP) + { + info.s= share; + maria_extra(&info, HA_EXTRA_MMAP, 0); + } + } + else + { + share= old_info->s; + if (share->data_file_type == BLOCK_RECORD) + data_file= share->bitmap.file.file; /* Only opened once */ + } + + if (!(m_info= maria_clone_internal(share, mode, data_file))) + goto err; + pthread_mutex_unlock(&THR_LOCK_maria); + DBUG_RETURN(m_info); + +err: + save_errno=my_errno ? my_errno : HA_ERR_END_OF_FILE; + if ((save_errno == HA_ERR_CRASHED) || + (save_errno == HA_ERR_CRASHED_ON_USAGE) || + (save_errno == HA_ERR_CRASHED_ON_REPAIR)) + _ma_report_error(save_errno, name); + switch (errpos) { + case 5: + if (data_file >= 0) + VOID(my_close(data_file, MYF(0))); + if (old_info) + break; /* Don't remove open table */ + (*share->once_end)(share); + /* fall through */ + case 4: + my_free((uchar*) share,MYF(0)); + /* fall through */ + case 3: + /* fall through */ + case 2: + my_afree((uchar*) disk_cache); + /* fall through */ + case 1: + VOID(my_close(kfile,MYF(0))); + /* fall through */ + case 0: + default: + break; + } + pthread_mutex_unlock(&THR_LOCK_maria); + my_errno= save_errno; + DBUG_RETURN (NULL); +} /* maria_open */ + + +/* + Reallocate a buffer, if the current buffer is not large enough +*/ + +my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size, + size_t new_size) +{ + if (*old_size < new_size) + { + uchar *addr; + if (!(addr= (uchar*) my_realloc((uchar*) *old_addr, new_size, + MYF(MY_ALLOW_ZERO_PTR)))) + return 1; + *old_addr= addr; + *old_size= new_size; + } + return 0; +} + + +ulonglong _ma_safe_mul(ulonglong a, ulonglong b) +{ + ulonglong max_val= ~ (ulonglong) 0; /* my_off_t is unsigned */ + + if (!a || max_val / a < b) + return max_val; + return a*b; +} + + /* Set up functions in structs */ + +void _ma_setup_functions(register MARIA_SHARE *share) +{ + share->once_init= maria_once_init_dummy; + share->once_end= maria_once_end_dummy; + share->init= maria_scan_init_dummy; + share->end= maria_scan_end_dummy; + 
share->scan_init= maria_scan_init_dummy;/* Compat. dummy function */ + share->scan_end= maria_scan_end_dummy;/* Compat. dummy function */ + share->write_record_init= _ma_write_init_default; + share->write_record_abort= _ma_write_abort_default; + + switch (share->data_file_type) { + case COMPRESSED_RECORD: + share->read_record= _ma_read_pack_record; + share->scan= _ma_read_rnd_pack_record; + share->once_init= _ma_once_init_pack_row; + share->once_end= _ma_once_end_pack_row; + /* + Calculate checksum according to data in the original, not compressed, + row. + */ + if (share->state.header.org_data_file_type == STATIC_RECORD && + ! (share->options & HA_OPTION_NULL_FIELDS)) + share->calc_checksum= _ma_static_checksum; + else + share->calc_checksum= _ma_checksum; + share->calc_write_checksum= share->calc_checksum; + break; + case DYNAMIC_RECORD: + share->read_record= _ma_read_dynamic_record; + share->scan= _ma_read_rnd_dynamic_record; + share->delete_record= _ma_delete_dynamic_record; + share->compare_record= _ma_cmp_dynamic_record; + share->compare_unique= _ma_cmp_dynamic_unique; + share->calc_checksum= share->calc_write_checksum= _ma_checksum; + /* add bits used to pack data to pack_reclength for faster allocation */ + share->base.pack_reclength+= share->base.pack_bytes; + if (share->base.blobs) + { + share->update_record= _ma_update_blob_record; + share->write_record= _ma_write_blob_record; + } + else + { + share->write_record= _ma_write_dynamic_record; + share->update_record= _ma_update_dynamic_record; + } + break; + case STATIC_RECORD: + share->read_record= _ma_read_static_record; + share->scan= _ma_read_rnd_static_record; + share->delete_record= _ma_delete_static_record; + share->compare_record= _ma_cmp_static_record; + share->update_record= _ma_update_static_record; + share->write_record= _ma_write_static_record; + share->compare_unique= _ma_cmp_static_unique; + if (share->state.header.org_data_file_type == STATIC_RECORD && + ! 
(share->options & HA_OPTION_NULL_FIELDS)) + share->calc_checksum= _ma_static_checksum; + else + share->calc_checksum= _ma_checksum; + break; + case BLOCK_RECORD: + share->once_init= _ma_once_init_block_record; + share->once_end= _ma_once_end_block_record; + share->init= _ma_init_block_record; + share->end= _ma_end_block_record; + share->write_record_init= _ma_write_init_block_record; + share->write_record_abort= _ma_write_abort_block_record; + share->scan_init= _ma_scan_init_block_record; + share->scan_end= _ma_scan_end_block_record; + share->read_record= _ma_read_block_record; + share->scan= _ma_scan_block_record; + share->delete_record= _ma_delete_block_record; + share->compare_record= _ma_compare_block_record; + share->update_record= _ma_update_block_record; + share->write_record= _ma_write_block_record; + share->compare_unique= _ma_cmp_block_unique; + share->calc_checksum= _ma_checksum; + /* + write_block_record() will calculate the checksum; Tell maria_write() + that it doesn't have to do this. 
+ */ + share->calc_write_checksum= 0; + break; + } + share->file_read= _ma_nommap_pread; + share->file_write= _ma_nommap_pwrite; + share->calc_check_checksum= share->calc_checksum; + + if (!(share->options & HA_OPTION_CHECKSUM) && + share->data_file_type != COMPRESSED_RECORD) + share->calc_checksum= share->calc_write_checksum= 0; + return; +} + + +static void setup_key_functions(register MARIA_KEYDEF *keyinfo) +{ + if (keyinfo->key_alg == HA_KEY_ALG_RTREE) + { +#ifdef HAVE_RTREE_KEYS + keyinfo->ck_insert = maria_rtree_insert; + keyinfo->ck_delete = maria_rtree_delete; +#else + DBUG_ASSERT(0); /* maria_open should check it never happens */ +#endif + } + else + { + keyinfo->ck_insert = _ma_ck_write; + keyinfo->ck_delete = _ma_ck_delete; + } + if (keyinfo->flag & HA_BINARY_PACK_KEY) + { /* Simple prefix compression */ + keyinfo->bin_search= _ma_seq_search; + keyinfo->get_key= _ma_get_binary_pack_key; + keyinfo->pack_key= _ma_calc_bin_pack_key_length; + keyinfo->store_key= _ma_store_bin_pack_key; + } + else if (keyinfo->flag & HA_VAR_LENGTH_KEY) + { + keyinfo->get_key= _ma_get_pack_key; + if (keyinfo->seg[0].flag & HA_PACK_KEY) + { /* Prefix compression */ + if (!keyinfo->seg->charset || use_strnxfrm(keyinfo->seg->charset) || + (keyinfo->seg->flag & HA_NULL_PART)) + keyinfo->bin_search= _ma_seq_search; + else + keyinfo->bin_search= _ma_prefix_search; + keyinfo->pack_key= _ma_calc_var_pack_key_length; + keyinfo->store_key= _ma_store_var_pack_key; + } + else + { + keyinfo->bin_search= _ma_seq_search; + keyinfo->pack_key= _ma_calc_var_key_length; /* Variable length key */ + keyinfo->store_key= _ma_store_static_key; + } + } + else + { + keyinfo->bin_search= _ma_bin_search; + keyinfo->get_key= _ma_get_static_key; + keyinfo->pack_key= _ma_calc_static_key_length; + keyinfo->store_key= _ma_store_static_key; + } + return; +} + + +/** + @brief Function to save and store the header in the index file (.MYI) + + Operates under MARIA_SHARE::intern_lock if requested. 
+ Sets MARIA_SHARE::MARIA_STATE_INFO::is_of_horizon if transactional table. + Then calls _ma_state_info_write_sub(). + + @param share table + @param pWrite bitmap: if 1 is set my_pwrite() is used otherwise + my_write(); if 2 is set, info about keys is written + (should only be needed after ALTER TABLE + ENABLE/DISABLE KEYS, and REPAIR/OPTIMIZE); if 4 is + set, MARIA_SHARE::intern_lock is taken. + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite) +{ + uint res; + if (pWrite & 4) + pthread_mutex_lock(&share->intern_lock); + else if (maria_multi_threaded) + safe_mutex_assert_owner(&share->intern_lock); + if (share->base.born_transactional && translog_inited && + !maria_in_recovery) + { + /* + In a recovery, we want to set is_of_horizon to the LSN of the last + record executed by Recovery, not the current EOF of the log (which + is too new). Recovery does it by itself. + */ + share->state.is_of_horizon= translog_get_horizon(); + } + res= _ma_state_info_write_sub(share->kfile.file, &share->state, pWrite); + if (pWrite & 4) + pthread_mutex_unlock(&share->intern_lock); + return res; +} + + +/** + @brief Function to save and store the header in the index file (.MYI). + + Shortcut to use instead of _ma_state_info_write() when appropriate. + + @param file descriptor of the index file to write + @param state state information to write to the file + @param pWrite bitmap: if 1 is set my_pwrite() is used otherwise + my_write(); if 2 is set, info about keys is written + (should only be needed after ALTER TABLE + ENABLE/DISABLE KEYS, and REPAIR/OPTIMIZE). 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite) +{ + /** @todo RECOVERY write it only at checkpoint time */ + uchar buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE]; + uchar *ptr=buff; + uint i, keys= (uint) state->header.keys; + size_t res; + DBUG_ENTER("_ma_state_info_write"); + + memcpy_fixed(ptr,&state->header,sizeof(state->header)); + ptr+=sizeof(state->header); + + /* open_count must be first because of _ma_mark_file_changed ! */ + mi_int2store(ptr,state->open_count); ptr+= 2; + /* + if you change the offset of create_rename_lsn/is_of_horizon inside the + index file's header, fix ma_create + ma_rename + ma_delete_all + + backward-compatibility. + */ + lsn_store(ptr, state->create_rename_lsn); ptr+= LSN_STORE_SIZE; + lsn_store(ptr, state->is_of_horizon); ptr+= LSN_STORE_SIZE; + *ptr++= (uchar)state->changed; + *ptr++= state->sortkey; + mi_rowstore(ptr,state->state.records); ptr+= 8; + mi_rowstore(ptr,state->state.del); ptr+= 8; + mi_rowstore(ptr,state->split); ptr+= 8; + mi_sizestore(ptr,state->dellink); ptr+= 8; + mi_sizestore(ptr,state->first_bitmap_with_space); ptr+= 8; + mi_sizestore(ptr,state->state.key_file_length); ptr+= 8; + mi_sizestore(ptr,state->state.data_file_length); ptr+= 8; + mi_sizestore(ptr,state->state.empty); ptr+= 8; + mi_sizestore(ptr,state->state.key_empty); ptr+= 8; + mi_int8store(ptr,state->auto_increment); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->state.checksum); ptr+= 8; + mi_int4store(ptr,state->process); ptr+= 4; + mi_int4store(ptr,state->unique); ptr+= 4; + mi_int4store(ptr,state->status); ptr+= 4; + mi_int4store(ptr,state->update_count); ptr+= 4; + + ptr+= state->state_diff_length; + + for (i=0; i < keys; i++) + { + mi_sizestore(ptr,state->key_root[i]); ptr+= 8; + } + /** @todo RECOVERY BUG key_del is a problem for recovery */ + mi_sizestore(ptr,state->key_del); ptr+= 8; + if (pWrite & 2) /* From maria_chk */ + { + uint 
key_parts= mi_uint2korr(state->header.key_parts); + mi_int4store(ptr,state->sec_index_changed); ptr+= 4; + mi_int4store(ptr,state->sec_index_used); ptr+= 4; + mi_int4store(ptr,state->version); ptr+= 4; + mi_int8store(ptr,state->key_map); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->create_time); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->recover_time); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->check_time); ptr+= 8; + mi_sizestore(ptr,state->rec_per_key_rows); ptr+= 8; + for (i=0 ; i < key_parts ; i++) + { + mi_int4store(ptr,state->rec_per_key_part[i]); ptr+=4; + } + } + + res= (pWrite & 1) ? + my_pwrite(file, buff, (size_t) (ptr-buff), 0L, + MYF(MY_NABP | MY_THREADSAFE)) : + my_write(file, buff, (size_t) (ptr-buff), + MYF(MY_NABP)); + DBUG_RETURN(res != 0); +} + + +static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state) +{ + uint i,keys,key_parts; + memcpy_fixed(&state->header,ptr, sizeof(state->header)); + ptr+= sizeof(state->header); + keys= (uint) state->header.keys; + key_parts= mi_uint2korr(state->header.key_parts); + + state->open_count = mi_uint2korr(ptr); ptr+= 2; + state->create_rename_lsn= lsn_korr(ptr); ptr+= LSN_STORE_SIZE; + state->is_of_horizon= lsn_korr(ptr); ptr+= LSN_STORE_SIZE; + state->changed= (my_bool) *ptr++; + state->sortkey= (uint) *ptr++; + state->state.records= mi_rowkorr(ptr); ptr+= 8; + state->state.del = mi_rowkorr(ptr); ptr+= 8; + state->split = mi_rowkorr(ptr); ptr+= 8; + state->dellink= mi_sizekorr(ptr); ptr+= 8; + state->first_bitmap_with_space= mi_sizekorr(ptr); ptr+= 8; + state->state.key_file_length = mi_sizekorr(ptr); ptr+= 8; + state->state.data_file_length= mi_sizekorr(ptr); ptr+= 8; + state->state.empty = mi_sizekorr(ptr); ptr+= 8; + state->state.key_empty= mi_sizekorr(ptr); ptr+= 8; + state->auto_increment=mi_uint8korr(ptr); ptr+= 8; + state->state.checksum=(ha_checksum) mi_uint8korr(ptr);ptr+= 8; + state->process= mi_uint4korr(ptr); ptr+= 4; + state->unique = mi_uint4korr(ptr); ptr+= 4; + 
state->status = mi_uint4korr(ptr); ptr+= 4; + state->update_count=mi_uint4korr(ptr); ptr+= 4; + + ptr+= state->state_diff_length; + + for (i=0; i < keys; i++) + { + state->key_root[i]= mi_sizekorr(ptr); ptr+= 8; + } + state->key_del= mi_sizekorr(ptr); ptr+= 8; + state->sec_index_changed = mi_uint4korr(ptr); ptr+= 4; + state->sec_index_used = mi_uint4korr(ptr); ptr+= 4; + state->version = mi_uint4korr(ptr); ptr+= 4; + state->key_map = mi_uint8korr(ptr); ptr+= 8; + state->create_time = (time_t) mi_sizekorr(ptr); ptr+= 8; + state->recover_time =(time_t) mi_sizekorr(ptr); ptr+= 8; + state->check_time = (time_t) mi_sizekorr(ptr); ptr+= 8; + state->rec_per_key_rows=mi_sizekorr(ptr); ptr+= 8; + for (i=0 ; i < key_parts ; i++) + { + state->rec_per_key_part[i]= mi_uint4korr(ptr); ptr+=4; + } + return ptr; +} + + +/** + @brief Fills the state by reading its copy on disk. + + Should not be called for transactional tables, as their state on disk is + rarely current and so is often misleading for a reader. + Does nothing in single user mode. 
+ + @param file file to read from + @param state state which will be filled + + @note The state is always read with my_pread() from offset 0 of the file. + + @return 0 if OK (also when nothing is read in single user mode), 1 if the read failed +*/ + +uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state) +{ + char buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE]; + + /* trick to detect transactional tables */ + DBUG_ASSERT(state->create_rename_lsn == LSN_IMPOSSIBLE); + if (!maria_single_user) + { + if (my_pread(file, buff, state->state_length, 0L, MYF(MY_NABP))) + return 1; + _ma_state_info_read(buff, state); + } + return 0; +} + + +/**************************************************************************** +** store and read of MARIA_BASE_INFO +****************************************************************************/ + +uint _ma_base_info_write(File file, MARIA_BASE_INFO *base) +{ + uchar buff[MARIA_BASE_INFO_SIZE], *ptr=buff; + + mi_sizestore(ptr,base->keystart); ptr+= 8; + mi_sizestore(ptr,base->max_data_file_length); ptr+= 8; + mi_sizestore(ptr,base->max_key_file_length); ptr+= 8; + mi_rowstore(ptr,base->records); ptr+= 8; + mi_rowstore(ptr,base->reloc); ptr+= 8; + mi_int4store(ptr,base->mean_row_length); ptr+= 4; + mi_int4store(ptr,base->reclength); ptr+= 4; + mi_int4store(ptr,base->pack_reclength); ptr+= 4; + mi_int4store(ptr,base->min_pack_length); ptr+= 4; + mi_int4store(ptr,base->max_pack_length); ptr+= 4; + mi_int4store(ptr,base->min_block_length); ptr+= 4; + mi_int2store(ptr,base->fields); ptr+= 2; + mi_int2store(ptr,base->fixed_not_null_fields); ptr+= 2; + mi_int2store(ptr,base->fixed_not_null_fields_length); ptr+= 2; + mi_int2store(ptr,base->max_field_lengths); ptr+= 2; + mi_int2store(ptr,base->pack_fields); ptr+= 2; + mi_int2store(ptr,0); ptr+= 2; + mi_int2store(ptr,base->null_bytes); ptr+= 2; + mi_int2store(ptr,base->original_null_bytes); ptr+= 2; + mi_int2store(ptr,base->field_offsets); ptr+= 2; + mi_int2store(ptr,base->min_row_length); ptr+= 2; + mi_int2store(ptr,base->block_size); ptr+= 2; + *ptr++= base->rec_reflength; + *ptr++= 
base->key_reflength; + *ptr++= base->keys; + *ptr++= base->auto_key; + *ptr++= base->born_transactional; + *ptr++= 0; /* Reserved */ + mi_int2store(ptr,base->pack_bytes); ptr+= 2; + mi_int2store(ptr,base->blobs); ptr+= 2; + mi_int2store(ptr,base->max_key_block_length); ptr+= 2; + mi_int2store(ptr,base->max_key_length); ptr+= 2; + mi_int2store(ptr,base->extra_alloc_bytes); ptr+= 2; + *ptr++= base->extra_alloc_procent; + bzero(ptr,16); ptr+= 16; /* extra */ + DBUG_ASSERT((ptr - buff) == MARIA_BASE_INFO_SIZE); + return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + + +static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base) +{ + base->keystart= mi_sizekorr(ptr); ptr+= 8; + base->max_data_file_length= mi_sizekorr(ptr); ptr+= 8; + base->max_key_file_length= mi_sizekorr(ptr); ptr+= 8; + base->records= (ha_rows) mi_sizekorr(ptr); ptr+= 8; + base->reloc= (ha_rows) mi_sizekorr(ptr); ptr+= 8; + base->mean_row_length= mi_uint4korr(ptr); ptr+= 4; + base->reclength= mi_uint4korr(ptr); ptr+= 4; + base->pack_reclength= mi_uint4korr(ptr); ptr+= 4; + base->min_pack_length= mi_uint4korr(ptr); ptr+= 4; + base->max_pack_length= mi_uint4korr(ptr); ptr+= 4; + base->min_block_length= mi_uint4korr(ptr); ptr+= 4; + base->fields= mi_uint2korr(ptr); ptr+= 2; + base->fixed_not_null_fields= mi_uint2korr(ptr); ptr+= 2; + base->fixed_not_null_fields_length= mi_uint2korr(ptr);ptr+= 2; + base->max_field_lengths= mi_uint2korr(ptr); ptr+= 2; + base->pack_fields= mi_uint2korr(ptr); ptr+= 2; + ptr+= 2; + base->null_bytes= mi_uint2korr(ptr); ptr+= 2; + base->original_null_bytes= mi_uint2korr(ptr); ptr+= 2; + base->field_offsets= mi_uint2korr(ptr); ptr+= 2; + base->min_row_length= mi_uint2korr(ptr); ptr+= 2; + base->block_size= mi_uint2korr(ptr); ptr+= 2; + + base->rec_reflength= *ptr++; + base->key_reflength= *ptr++; + base->keys= *ptr++; + base->auto_key= *ptr++; + base->born_transactional= *ptr++; + ptr++; + base->pack_bytes= mi_uint2korr(ptr); ptr+= 2; + base->blobs= 
mi_uint2korr(ptr); ptr+= 2; + base->max_key_block_length= mi_uint2korr(ptr); ptr+= 2; + base->max_key_length= mi_uint2korr(ptr); ptr+= 2; + base->extra_alloc_bytes= mi_uint2korr(ptr); ptr+= 2; + base->extra_alloc_procent= *ptr++; + ptr+= 16; + return ptr; +} + +/*-------------------------------------------------------------------------- + maria_keydef +---------------------------------------------------------------------------*/ + +uint _ma_keydef_write(File file, MARIA_KEYDEF *keydef) +{ + uchar buff[MARIA_KEYDEF_SIZE]; + uchar *ptr=buff; + + *ptr++= (uchar) keydef->keysegs; + *ptr++= keydef->key_alg; /* Rtree or Btree */ + mi_int2store(ptr,keydef->flag); ptr+= 2; + mi_int2store(ptr,keydef->block_length); ptr+= 2; + mi_int2store(ptr,keydef->keylength); ptr+= 2; + mi_int2store(ptr,keydef->minlength); ptr+= 2; + mi_int2store(ptr,keydef->maxlength); ptr+= 2; + return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + +char *_ma_keydef_read(char *ptr, MARIA_KEYDEF *keydef) +{ + keydef->keysegs = (uint) *ptr++; + keydef->key_alg = *ptr++; /* Rtree or Btree */ + + keydef->flag = mi_uint2korr(ptr); ptr+= 2; + keydef->block_length = mi_uint2korr(ptr); ptr+= 2; + keydef->keylength = mi_uint2korr(ptr); ptr+= 2; + keydef->minlength = mi_uint2korr(ptr); ptr+= 2; + keydef->maxlength = mi_uint2korr(ptr); ptr+= 2; + keydef->underflow_block_length=keydef->block_length/3; + keydef->version = 0; /* Not saved */ + keydef->parser = &ft_default_parser; + keydef->ftparser_nr = 0; + return ptr; +} + +/*************************************************************************** +** maria_keyseg +***************************************************************************/ + +int _ma_keyseg_write(File file, const HA_KEYSEG *keyseg) +{ + uchar buff[HA_KEYSEG_SIZE]; + uchar *ptr=buff; + ulong pos; + + *ptr++= keyseg->type; + *ptr++= keyseg->language; + *ptr++= keyseg->null_bit; + *ptr++= keyseg->bit_start; + *ptr++= keyseg->bit_end; + *ptr++= keyseg->bit_length; + 
mi_int2store(ptr,keyseg->flag); ptr+= 2; + mi_int2store(ptr,keyseg->length); ptr+= 2; + mi_int4store(ptr,keyseg->start); ptr+= 4; + pos= keyseg->null_bit ? keyseg->null_pos : keyseg->bit_pos; + mi_int4store(ptr, pos); + ptr+=4; + + return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + + +char *_ma_keyseg_read(char *ptr, HA_KEYSEG *keyseg) +{ + keyseg->type = *ptr++; + keyseg->language = *ptr++; + keyseg->null_bit = *ptr++; + keyseg->bit_start = *ptr++; + keyseg->bit_end = *ptr++; + keyseg->bit_length = *ptr++; + keyseg->flag = mi_uint2korr(ptr); ptr+= 2; + keyseg->length = mi_uint2korr(ptr); ptr+= 2; + keyseg->start = mi_uint4korr(ptr); ptr+= 4; + keyseg->null_pos = mi_uint4korr(ptr); ptr+= 4; + keyseg->charset=0; /* Will be filled in later */ + if (keyseg->null_bit) + keyseg->bit_pos= (uint16)(keyseg->null_pos + (keyseg->null_bit == 7)); + else + { + keyseg->bit_pos= (uint16)keyseg->null_pos; + keyseg->null_pos= 0; + } + return ptr; +} + +/*-------------------------------------------------------------------------- + maria_uniquedef +---------------------------------------------------------------------------*/ + +uint _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *def) +{ + uchar buff[MARIA_UNIQUEDEF_SIZE]; + uchar *ptr=buff; + + mi_int2store(ptr,def->keysegs); ptr+=2; + *ptr++= (uchar) def->key; + *ptr++ = (uchar) def->null_are_equal; + + return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + +char *_ma_uniquedef_read(char *ptr, MARIA_UNIQUEDEF *def) +{ + def->keysegs = mi_uint2korr(ptr); + def->key = ptr[2]; + def->null_are_equal=ptr[3]; + return ptr+4; /* 1 extra uchar */ +} + +/*************************************************************************** +** MARIA_COLUMNDEF +***************************************************************************/ + +uint _ma_columndef_write(File file, MARIA_COLUMNDEF *columndef) +{ + uchar buff[MARIA_COLUMNDEF_SIZE]; + uchar *ptr=buff; + + mi_int6store(ptr,columndef->offset); ptr+= 6; + 
mi_int2store(ptr,columndef->type); ptr+= 2; + mi_int2store(ptr,columndef->length); ptr+= 2; + mi_int2store(ptr,columndef->fill_length); ptr+= 2; + mi_int2store(ptr,columndef->null_pos); ptr+= 2; + mi_int2store(ptr,columndef->empty_pos); ptr+= 2; + (*ptr++)= columndef->null_bit; + (*ptr++)= columndef->empty_bit; + return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + +char *_ma_columndef_read(char *ptr, MARIA_COLUMNDEF *columndef) +{ + columndef->offset= mi_uint6korr(ptr); ptr+= 6; + columndef->type= mi_sint2korr(ptr); ptr+= 2; + columndef->length= mi_uint2korr(ptr); ptr+= 2; + columndef->fill_length= mi_uint2korr(ptr); ptr+= 2; + columndef->null_pos= mi_uint2korr(ptr); ptr+= 2; + columndef->empty_pos= mi_uint2korr(ptr); ptr+= 2; + columndef->null_bit= (uint8) *ptr++; + columndef->empty_bit= (uint8) *ptr++; + return ptr; +} + +/************************************************************************** + Open data file + We can't use dup() here as the data file descriptors need to have different + active seek-positions. + + The argument file_to_dup is here for the future if there would on some OS + exist a dup()-like call that would give us two different file descriptors. +*************************************************************************/ + +int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share, + File file_to_dup __attribute__((unused))) +{ + info->dfile.file= share->bitmap.file.file= + my_open(share->data_file_name, share->mode | O_SHARE, + MYF(MY_WME)); + return info->dfile.file >= 0 ? 0 : 1; +} + + +int _ma_open_keyfile(MARIA_SHARE *share) +{ + /* + Modifications to share->kfile should be under intern_lock to protect + against a concurrent checkpoint. + */ + pthread_mutex_lock(&share->intern_lock); + share->kfile.file= my_open(share->unique_file_name, + share->mode | O_SHARE, + MYF(MY_WME)); + pthread_mutex_unlock(&share->intern_lock); + return (share->kfile.file < 0); +} + + +/* + Disable all indexes. 
+ + SYNOPSIS + maria_disable_indexes() + info A pointer to the MARIA storage engine MARIA_HA struct. + + DESCRIPTION + Disable all indexes. + + RETURN + 0 ok +*/ + +int maria_disable_indexes(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + + maria_clear_all_keys_active(share->state.key_map); + return 0; +} + + +/* + Enable all indexes + + SYNOPSIS + maria_enable_indexes() + info A pointer to the MARIA storage engine MARIA_HA struct. + + DESCRIPTION + Enable all indexes. The indexes might have been disabled + by maria_disable_index() before. + The function works only if both data and indexes are empty, + otherwise a repair is required. + To be sure, call handler::delete_all_rows() before. + + RETURN + 0 ok + HA_ERR_CRASHED data or index is non-empty. +*/ + +int maria_enable_indexes(MARIA_HA *info) +{ + int error= 0; + MARIA_SHARE *share= info->s; + + if (share->state.state.data_file_length || + (share->state.state.key_file_length != share->base.keystart)) + { + maria_print_error(info->s, HA_ERR_CRASHED); + error= HA_ERR_CRASHED; + } + else + maria_set_all_keys_active(share->state.key_map, share->base.keys); + return error; +} + + +/* + Test if indexes are disabled. + + SYNOPSIS + maria_indexes_are_disabled() + info A pointer to the MARIA storage engine MARIA_HA struct. + + DESCRIPTION + Test if indexes are disabled. + + RETURN + 0 indexes are not disabled + 1 all indexes are disabled + 2 non-unique indexes are disabled +*/ + +int maria_indexes_are_disabled(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + + /* + No keys or all are enabled. keys is the number of keys. Left shifted + gives us only one bit set. When decreased by one, gives us all all bits + up to this one set and it gets unset. + */ + if (!share->base.keys || + (maria_is_all_keys_active(share->state.key_map, share->base.keys))) + return 0; + + /* All are disabled */ + if (maria_is_any_key_active(share->state.key_map)) + return 1; + + /* + We have keys. Some enabled, some disabled. 
+ Don't check for any non-unique disabled but return directly 2 + */ + return 2; +} + + +static my_bool maria_scan_init_dummy(MARIA_HA *info __attribute__((unused))) +{ + return 0; +} + +static void maria_scan_end_dummy(MARIA_HA *info __attribute__((unused))) +{ +} + +static my_bool maria_once_init_dummy(MARIA_SHARE *share + __attribute__((unused)), + File dfile __attribute__((unused))) +{ + return 0; +} + +static my_bool maria_once_end_dummy(MARIA_SHARE *share __attribute__((unused))) +{ + return 0; +} diff --git a/storage/maria/ma_packrec.c b/storage/maria/ma_packrec.c new file mode 100644 index 00000000000..173fafaf73f --- /dev/null +++ b/storage/maria/ma_packrec.c @@ -0,0 +1,1717 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + /* Functions to compressed records */ + +#include "maria_def.h" + +#define IS_CHAR ((uint) 32768) /* Bit if char (not offset) in tree */ + +/* Some definitions to keep in sync with maria_pack.c */ +#define HEAD_LENGTH 32 /* Length of fixed header */ + +#if INT_MAX > 32767 +#define BITS_SAVED 32 +#define MAX_QUICK_TABLE_BITS 9 /* Because we may shift in 24 bits */ +#else +#define BITS_SAVED 16 +#define MAX_QUICK_TABLE_BITS 6 +#endif + +#define get_bit(BU) ((BU)->bits ? 
\ + (BU)->current_byte & ((maria_bit_type) 1 << --(BU)->bits) :\ + (fill_buffer(BU), (BU)->bits= BITS_SAVED-1,\ + (BU)->current_byte & ((maria_bit_type) 1 << (BITS_SAVED-1)))) +#define skip_to_next_byte(BU) ((BU)->bits&=~7) +#define get_bits(BU,count) (((BU)->bits >= count) ? (((BU)->current_byte >> ((BU)->bits-=count)) & mask[count]) : fill_and_get_bits(BU,count)) + +#define decode_bytes_test_bit(bit) \ + if (low_byte & (1 << (7-bit))) \ + pos++; \ + if (*pos & IS_CHAR) \ + { bits-=(bit+1); break; } \ + pos+= *pos + +/* Size in uint16 of a Huffman tree for uchar compression of 256 uchar values. */ +#define OFFSET_TABLE_SIZE 512 + +static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file, + pbool fix_keys); +static uint read_huff_table(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree, + uint16 **decode_table,uchar **intervall_buff, + uint16 *tmp_buff); +static void make_quick_table(uint16 *to_table,uint16 *decode_table, + uint *next_free,uint value,uint bits, + uint max_bits); +static void fill_quick_table(uint16 *table,uint bits, uint max_bits, + uint value); +static uint copy_decode_table(uint16 *to_pos,uint offset, + uint16 *decode_table); +static uint find_longest_bitstream(uint16 *table, uint16 *end); +static void (*get_unpack_function(MARIA_COLUMNDEF *rec))(MARIA_COLUMNDEF *field, + MARIA_BIT_BUFF *buff, + uchar *to, + uchar *end); +static void uf_zerofill_skip_zero(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_skip_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_endspace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar 
*to,uchar *end); +static void uf_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_prespace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_zerofill_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_constant(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_intervall(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static uint decode_pos(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree); +static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff,uchar *buffer, + uint length); +static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff,uint count); +static void fill_buffer(MARIA_BIT_BUFF *bit_buff); +static uint max_bit(uint value); +static uint read_pack_length(uint version, const uchar *buf, ulong *length); +#ifdef HAVE_MMAP +static uchar *_ma_mempack_get_block_info(MARIA_HA *maria, + MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO *info, + uchar **rec_buff_p, + size_t *rec_buff_size_p, + uchar *header); +#endif + +static maria_bit_type mask[]= +{ + 0x00000000, + 0x00000001, 
0x00000003, 0x00000007, 0x0000000f, + 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, + 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, + 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, +#if BITS_SAVED > 16 + 0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff, + 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff, + 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, + 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff, +#endif +}; + + +my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile) +{ + share->options|= HA_OPTION_READ_ONLY_DATA; + return (_ma_read_pack_info(share, dfile, + (pbool) + test(!(share->options & + (HA_OPTION_PACK_RECORD | + HA_OPTION_TEMP_COMPRESS_RECORD))))); +} + + +my_bool _ma_once_end_pack_row(MARIA_SHARE *share) +{ + if (share->decode_trees) + { + my_free((uchar*) share->decode_trees,MYF(0)); + my_free((uchar*) share->decode_tables,MYF(0)); + } + return 0; +} + + +/* Read all packed info, allocate memory and fix field structs */ + +static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file, + pbool fix_keys) +{ + int diff_length; + uint i,trees,huff_tree_bits,rec_reflength,length; + uint16 *decode_table,*tmp_buff; + ulong elements,intervall_length; + uchar *disk_cache; /* Raw header bytes; uchar* to match my_read() */ + uchar *intervall_buff; + uchar header[HEAD_LENGTH]; + MARIA_BIT_BUFF bit_buff; + DBUG_ENTER("_ma_read_pack_info"); + + if (maria_quick_table_bits < 4) + maria_quick_table_bits=4; + else if (maria_quick_table_bits > MAX_QUICK_TABLE_BITS) + maria_quick_table_bits=MAX_QUICK_TABLE_BITS; + + my_errno=0; + if (my_read(file,(uchar*) header,sizeof(header),MYF(MY_NABP))) + { + if (!my_errno) + my_errno=HA_ERR_END_OF_FILE; + goto err0; + } + /* Only the first three bytes of magic number are independent of version. 
*/ + if (memcmp((uchar*) header, (uchar*) maria_pack_file_magic, 3)) + { + my_errno=HA_ERR_WRONG_IN_RECORD; + goto err0; + } + share->pack.version= header[3]; /* fourth uchar of magic number */ + share->pack.header_length= uint4korr(header+4); + share->min_pack_length=(uint) uint4korr(header+8); + share->max_pack_length=(uint) uint4korr(header+12); + set_if_bigger(share->base.pack_reclength,share->max_pack_length); + elements=uint4korr(header+16); + intervall_length=uint4korr(header+20); + trees=uint2korr(header+24); + share->pack.ref_length=header[26]; + rec_reflength=header[27]; + diff_length=(int) rec_reflength - (int) share->base.rec_reflength; + if (fix_keys) + share->rec_reflength=rec_reflength; + share->base.min_block_length=share->min_pack_length+1; + if (share->min_pack_length > 254) + share->base.min_block_length+=2; + DBUG_PRINT("info", ("fixed header length: %u", HEAD_LENGTH)); + DBUG_PRINT("info", ("total header length: %lu", share->pack.header_length)); + DBUG_PRINT("info", ("pack file version: %u", share->pack.version)); + DBUG_PRINT("info", ("min pack length: %lu", share->min_pack_length)); + DBUG_PRINT("info", ("max pack length: %lu", share->max_pack_length)); + DBUG_PRINT("info", ("elements of all trees: %lu", elements)); + DBUG_PRINT("info", ("distinct values bytes: %lu", intervall_length)); + DBUG_PRINT("info", ("number of code trees: %u", trees)); + DBUG_PRINT("info", ("bytes for record lgt: %u", share->pack.ref_length)); + DBUG_PRINT("info", ("record pointer length: %u", rec_reflength)); + + + /* + Memory segment #1: + - Decode tree heads + - Distinct column values + */ + if (!(share->decode_trees=(MARIA_DECODE_TREE*) + my_malloc((uint) (trees*sizeof(MARIA_DECODE_TREE)+ + intervall_length*sizeof(uchar)), + MYF(MY_WME)))) + goto err0; + intervall_buff=(uchar*) (share->decode_trees+trees); + + /* + Memory segment #2: + - Decode tables + - Quick decode tables + - Temporary decode table + - Compressed data file header cache + This segment will be 
reallocated after construction of the tables. + */ + length=(uint) (elements*2+trees*(1 << maria_quick_table_bits)); + if (!(share->decode_tables=(uint16*) + my_malloc((length+OFFSET_TABLE_SIZE)*sizeof(uint16)+ + (uint) (share->pack.header_length - sizeof(header)), + MYF(MY_WME | MY_ZEROFILL)))) + goto err1; + tmp_buff=share->decode_tables+length; + disk_cache=(uchar*) (tmp_buff+OFFSET_TABLE_SIZE); + + if (my_read(file,disk_cache, + (uint) (share->pack.header_length-sizeof(header)), + MYF(MY_NABP))) + goto err2; + + huff_tree_bits=max_bit(trees ? trees-1 : 0); + init_bit_buffer(&bit_buff, (uchar*) disk_cache, + (uint) (share->pack.header_length-sizeof(header))); + /* Read new info for each field */ + for (i=0 ; i < share->base.fields ; i++) + { + share->columndef[i].base_type=(enum en_fieldtype) get_bits(&bit_buff,5); + share->columndef[i].pack_type=(uint) get_bits(&bit_buff,6); + share->columndef[i].space_length_bits=get_bits(&bit_buff,5); + share->columndef[i].huff_tree=share->decode_trees+(uint) get_bits(&bit_buff, + huff_tree_bits); + share->columndef[i].unpack= get_unpack_function(share->columndef + i); + DBUG_PRINT("info", ("col: %2u type: %2u pack: %u slbits: %2u", + i, share->columndef[i].base_type, + share->columndef[i].pack_type, + share->columndef[i].space_length_bits)); + } + skip_to_next_byte(&bit_buff); + /* + Construct the decoding tables from the file header. Keep track of + the used memory. + */ + decode_table=share->decode_tables; + for (i=0 ; i < trees ; i++) + if (read_huff_table(&bit_buff,share->decode_trees+i,&decode_table, + &intervall_buff,tmp_buff)) + goto err3; + /* Reallocate the decoding tables to the used size. */ + decode_table=(uint16*) + my_realloc((uchar*) share->decode_tables, + (uint) ((uchar*) decode_table - (uchar*) share->decode_tables), + MYF(MY_HOLD_ON_ERROR)); + /* Fix the table addresses in the tree heads. 
*/ + { + long diff=PTR_BYTE_DIFF(decode_table,share->decode_tables); + share->decode_tables=decode_table; + for (i=0 ; i < trees ; i++) + share->decode_trees[i].table=ADD_TO_PTR(share->decode_trees[i].table, + diff, uint16*); + } + + /* Fix record-ref-length for keys */ + if (fix_keys) + { + for (i=0 ; i < share->base.keys ; i++) + { + MARIA_KEYDEF *keyinfo= &share->keyinfo[i]; + keyinfo->keylength+= (uint16) diff_length; + keyinfo->minlength+= (uint16) diff_length; + keyinfo->maxlength+= (uint16) diff_length; + keyinfo->seg[keyinfo->flag & HA_FULLTEXT ? + FT_SEGS : keyinfo->keysegs].length= (uint16) rec_reflength; + } + if (share->ft2_keyinfo.seg) + { + MARIA_KEYDEF *ft2_keyinfo= &share->ft2_keyinfo; + ft2_keyinfo->keylength+= (uint16) diff_length; + ft2_keyinfo->minlength+= (uint16) diff_length; + ft2_keyinfo->maxlength+= (uint16) diff_length; + } + } + + if (bit_buff.error || bit_buff.pos < bit_buff.end) + goto err3; + + DBUG_RETURN(0); + +err3: + my_errno=HA_ERR_WRONG_IN_RECORD; +err2: + my_free((uchar*) share->decode_tables,MYF(0)); +err1: + my_free((uchar*) share->decode_trees,MYF(0)); +err0: + DBUG_RETURN(1); +} + + +/* + Read a huff-code-table from datafile. + + SYNOPSIS + read_huff_table() + bit_buff Bit buffer pointing at start of the + decoding table in the file header cache. + decode_tree Pointer to the decode tree head. + decode_table IN/OUT Address of a pointer to the next free space. + intervall_buff IN/OUT Address of a pointer to the next unused values. + tmp_buff Buffer for temporary extraction of a full + decoding table as read from bit_buff. + + RETURN + 0 OK. + 1 Error. +*/ +static uint read_huff_table(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree, + uint16 **decode_table, uchar **intervall_buff, + uint16 *tmp_buff) +{ + uint min_chr,elements,char_bits,offset_bits,size,intervall_length,table_bits, + next_free_offset; + uint16 *ptr,*end; + DBUG_ENTER("read_huff_table"); + + if (!get_bits(bit_buff,1)) + { + /* Byte value compression. 
*/ + min_chr=get_bits(bit_buff,8); + elements=get_bits(bit_buff,9); + char_bits=get_bits(bit_buff,5); + offset_bits=get_bits(bit_buff,5); + intervall_length=0; + ptr=tmp_buff; + ptr=tmp_buff; + DBUG_PRINT("info", ("byte value compression")); + DBUG_PRINT("info", ("minimum uchar value: %u", min_chr)); + DBUG_PRINT("info", ("number of tree nodes: %u", elements)); + DBUG_PRINT("info", ("bits for values: %u", char_bits)); + DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits)); + if (elements > 256) + { + DBUG_PRINT("error", ("ERROR: illegal number of tree elements: %u", + elements)); + DBUG_RETURN(1); + } + } + else + { + /* Distinct column value compression. */ + min_chr=0; + elements=get_bits(bit_buff,15); + intervall_length=get_bits(bit_buff,16); + char_bits=get_bits(bit_buff,5); + offset_bits=get_bits(bit_buff,5); + decode_tree->quick_table_bits=0; + ptr= *decode_table; + DBUG_PRINT("info", ("distinct column value compression")); + DBUG_PRINT("info", ("number of tree nodes: %u", elements)); + DBUG_PRINT("info", ("value buffer length: %u", intervall_length)); + DBUG_PRINT("info", ("bits for value index: %u", char_bits)); + DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits)); + } + size=elements*2-2; + DBUG_PRINT("info", ("tree size in uint16: %u", size)); + DBUG_PRINT("info", ("tree size in bytes: %u", + size * (uint) sizeof(uint16))); + + for (end=ptr+size ; ptr < end ; ptr++) + { + if (get_bit(bit_buff)) + { + *ptr= (uint16) get_bits(bit_buff,offset_bits); + if ((ptr + *ptr >= end) || !*ptr) + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + DBUG_RETURN(1); + } + } + else + *ptr= (uint16) (IS_CHAR + (get_bits(bit_buff,char_bits) + min_chr)); + } + skip_to_next_byte(bit_buff); + + decode_tree->table= *decode_table; + decode_tree->intervalls= *intervall_buff; + if (! intervall_length) + { + /* Byte value compression. ptr started from tmp_buff. */ + /* Find longest Huffman code from begin to end of tree in bits. 
*/ + table_bits= find_longest_bitstream(tmp_buff, ptr); + if (table_bits >= OFFSET_TABLE_SIZE) + DBUG_RETURN(1); + if (table_bits > maria_quick_table_bits) + table_bits=maria_quick_table_bits; + DBUG_PRINT("info", ("table bits: %u", table_bits)); + + next_free_offset= (1 << table_bits); + make_quick_table(*decode_table,tmp_buff,&next_free_offset,0,table_bits, + table_bits); + (*decode_table)+= next_free_offset; + decode_tree->quick_table_bits=table_bits; + } + else + { + /* Distinct column value compression. ptr started from *decode_table */ + (*decode_table)=end; + /* + get_bits() moves some bytes to a cache buffer in advance. May need + to step back. + */ + bit_buff->pos-= bit_buff->bits/8; + /* Copy the distinct column values from the buffer. */ + memcpy(*intervall_buff,bit_buff->pos,(size_t) intervall_length); + (*intervall_buff)+=intervall_length; + bit_buff->pos+=intervall_length; + bit_buff->bits=0; + } + DBUG_RETURN(0); +} + + +/* + Make a quick_table for faster decoding. + + SYNOPSIS + make_quick_table() + to_table Target quick_table and remaining decode table. + decode_table Source Huffman (sub-)tree within tmp_buff. + next_free_offset IN/OUT Next free offset from to_table. + Starts behind quick_table on the top-level. + value Huffman bits found so far. + bits Remaining bits to be collected. + max_bits Total number of bits to collect (table_bits). + + DESCRIPTION + + The quick table is an array of 16-bit values. There exists one value + for each possible code representable by max_bits (table_bits) bits. + In most cases table_bits is 9. So there are 512 16-bit values. + + If the high-order bit (16) is set (IS_CHAR) then the array slot for + this value is a valid Huffman code for a resulting uchar value. + + The low-order 8 bits (1..8) are the resulting uchar value. + + Bits 9..14 are the length of the Huffman code for this uchar value. + This means so many bits from the input stream were needed to + represent this uchar value. 
The remaining bits belong to later + Huffman codes. This also means that for every Huffman code shorter + than table_bits there are multiple entries in the array, which + differ just in the unused bits. + + If the high-order bit (16) is clear (0) then the remaining bits are + the position of the remaining Huffman decode tree segment behind the + quick table. + + RETURN + void +*/ + +static void make_quick_table(uint16 *to_table, uint16 *decode_table, + uint *next_free_offset, uint value, uint bits, + uint max_bits) +{ + DBUG_ENTER("make_quick_table"); + + /* + When down the table to the requested maximum, copy the rest of the + Huffman table. + */ + if (!bits--) + { + /* + Remaining left Huffman tree segment starts behind quick table. + Remaining right Huffman tree segment starts behind left segment. + */ + to_table[value]= (uint16) *next_free_offset; + /* + Re-construct the remaining Huffman tree segment at + next_free_offset in to_table. + */ + *next_free_offset=copy_decode_table(to_table, *next_free_offset, + decode_table); + DBUG_VOID_RETURN; + } + + /* Descend on the left side. Left side bits are clear (0). */ + if (!(*decode_table & IS_CHAR)) + { + /* Not a leaf. Follow the pointer. */ + make_quick_table(to_table,decode_table+ *decode_table, + next_free_offset,value,bits,max_bits); + } + else + { + /* + A leaf. A Huffman code is complete. Fill the quick_table + array for all possible bit strings starting with this Huffman + code. + */ + fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table); + } + + /* Descend on the right side. Right side bits are set (1). */ + decode_table++; + value|= (1 << bits); + if (!(*decode_table & IS_CHAR)) + { + /* Not a leaf. Follow the pointer. */ + make_quick_table(to_table,decode_table+ *decode_table, + next_free_offset,value,bits,max_bits); + } + else + { + /* + A leaf. A Huffman code is complete. Fill the quick_table + array for all possible bit strings starting with this Huffman + code. 
+ */ + fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table); + } + + DBUG_VOID_RETURN; +} + + +/* + Fill quick_table for all possible values starting with this Huffman code. + + SYNOPSIS + fill_quick_table() + table Target quick_table position. + bits Unused bits from max_bits. + max_bits Total number of bits to collect (table_bits). + value The uchar encoded by the found Huffman code. + + DESCRIPTION + + Fill the segment (all slots) of the quick_table array with the + resulting value for the found Huffman code. There are as many slots + as there are combinations representable by the unused bits. + + In most cases we use 9 table bits. Assume a 3-bit Huffman code. Then + there are 6 unused bits. Hence we fill 2**6 = 64 slots with the + value. + + RETURN + void +*/ + +static void fill_quick_table(uint16 *table, uint bits, uint max_bits, + uint value) +{ + uint16 *end; + DBUG_ENTER("fill_quick_table"); + + /* + Bits 1..8 of value represent the decoded uchar value. + Bits 9..14 become the length of the Huffman code for this uchar value. + Bit 16 flags a valid code (IS_CHAR). + */ + value|= (max_bits - bits) << 8 | IS_CHAR; + + for (end= table + (uint) (((uint) 1 << bits)); table < end; table++) + { + *table= (uint16) value; + } + DBUG_VOID_RETURN; +} + + +/* + Reconstruct a decode subtree at the target position. + + SYNOPSIS + copy_decode_table() + to_pos Target quick_table and remaining decode table. + offset Next free offset from to_pos. + decode_table Source Huffman subtree within tmp_buff. + + NOTE + Pointers in the decode tree are relative to the pointers position. + + RETURN + next free offset from to_pos. +*/ + +static uint copy_decode_table(uint16 *to_pos, uint offset, + uint16 *decode_table) +{ + uint prev_offset= offset; + DBUG_ENTER("copy_decode_table"); + + /* Descent on the left side. */ + if (!(*decode_table & IS_CHAR)) + { + /* Set a pointer to the next target node. */ + to_pos[offset]=2; + /* Copy the left hand subtree there. 
*/ + offset=copy_decode_table(to_pos,offset+2,decode_table+ *decode_table); + } + else + { + /* Copy the uchar value. */ + to_pos[offset]= *decode_table; + /* Step behind this node. */ + offset+=2; + } + + /* Descent on the right side. */ + decode_table++; + if (!(*decode_table & IS_CHAR)) + { + /* Set a pointer to the next free target node. */ + to_pos[prev_offset+1]=(uint16) (offset-prev_offset-1); + /* Copy the right hand subtree to the entry of that node. */ + offset=copy_decode_table(to_pos,offset,decode_table+ *decode_table); + } + else + { + /* Copy the uchar value. */ + to_pos[prev_offset+1]= *decode_table; + } + DBUG_RETURN(offset); +} + + +/* + Find the length of the longest Huffman code in this table in bits. + + SYNOPSIS + find_longest_bitstream() + table Code (sub-)table start. + end End of code table. + + IMPLEMENTATION + + Recursively follow the branch(es) of the code pair on every level of + the tree until two uchar values (and no branch) are found. Add one to + each level when returning back from each recursion stage. + + 'end' is used for error checking only. A clean tree terminates + before reaching 'end'. Hence the exact value of 'end' is not too + important. However having it higher than necessary could lead to + misbehaviour should 'next' jump into the dirty area. + + RETURN + length Length of longest Huffman code in bits. + >= OFFSET_TABLE_SIZE Error, broken tree. It does not end before 'end'. 
+*/ + +static uint find_longest_bitstream(uint16 *table, uint16 *end) +{ + uint length=1; + uint length2; + if (!(*table & IS_CHAR)) + { + uint16 *next= table + *table; + if (next > end || next == table) + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + return OFFSET_TABLE_SIZE; + } + length=find_longest_bitstream(next, end)+1; + } + table++; + if (!(*table & IS_CHAR)) + { + uint16 *next= table + *table; + if (next > end || next == table) + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + return OFFSET_TABLE_SIZE; + } + length2= find_longest_bitstream(next, end) + 1; + length=max(length,length2); + } + return length; +} + + +/* + Read record from datafile. + + SYNOPSIS + _ma_read_pack_record() + info A pointer to MARIA_HA. + filepos File offset of the record. + buf RETURN The buffer to receive the record. + + RETURN + 0 On success + # Error number +*/ + +int _ma_read_pack_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos) +{ + MARIA_BLOCK_INFO block_info; + File file; + DBUG_ENTER("maria_read_pack_record"); + + if (filepos == HA_OFFSET_ERROR) + DBUG_RETURN(my_errno); /* _search() didn't find record */ + + file= info->dfile.file; + if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info, + &info->rec_buff, &info->rec_buff_size, file, + filepos)) + goto err; + if (my_read(file,(uchar*) info->rec_buff + block_info.offset , + block_info.rec_len - block_info.offset, MYF(MY_NABP))) + goto panic; + info->update|= HA_STATE_AKTIV; + DBUG_RETURN(_ma_pack_rec_unpack(info,&info->bit_buff, buf, + info->rec_buff, block_info.rec_len)); +panic: + my_errno=HA_ERR_WRONG_IN_RECORD; +err: + DBUG_RETURN(my_errno); +} + + + +int _ma_pack_rec_unpack(register MARIA_HA *info, MARIA_BIT_BUFF *bit_buff, + register uchar *to, uchar *from, ulong reclength) +{ + uchar *end_field; + reg3 MARIA_COLUMNDEF *end; + MARIA_COLUMNDEF *current_field; + MARIA_SHARE *share=info->s; + DBUG_ENTER("_ma_pack_rec_unpack"); + + if 
(info->s->base.null_bytes) + { + memcpy(to, from, info->s->base.null_bytes); + to+= info->s->base.null_bytes; + from+= info->s->base.null_bytes; + reclength-= info->s->base.null_bytes; + } + init_bit_buffer(bit_buff, (uchar*) from, reclength); + for (current_field=share->columndef, end=current_field+share->base.fields ; + current_field < end ; + current_field++,to=end_field) + { + end_field=to+current_field->length; + (*current_field->unpack)(current_field, bit_buff, to, end_field); + } + if (!bit_buff->error && + bit_buff->pos - bit_buff->bits / 8 == bit_buff->end) + DBUG_RETURN(0); + info->update&= ~HA_STATE_AKTIV; + DBUG_RETURN(my_errno=HA_ERR_WRONG_IN_RECORD); +} /* _ma_pack_rec_unpack */ + + + /* Return function to unpack field */ + +static void (*get_unpack_function(MARIA_COLUMNDEF *rec)) + (MARIA_COLUMNDEF *, MARIA_BIT_BUFF *, uchar *, uchar *) +{ + switch (rec->base_type) { + case FIELD_SKIP_ZERO: + if (rec->pack_type & PACK_TYPE_ZERO_FILL) + return &uf_zerofill_skip_zero; + return &uf_skip_zero; + case FIELD_NORMAL: + if (rec->pack_type & PACK_TYPE_SPACE_FIELDS) + return &uf_space_normal; + if (rec->pack_type & PACK_TYPE_ZERO_FILL) + return &uf_zerofill_normal; + return &decode_bytes; + case FIELD_SKIP_ENDSPACE: + if (rec->pack_type & PACK_TYPE_SPACE_FIELDS) + { + if (rec->pack_type & PACK_TYPE_SELECTED) + return &uf_space_endspace_selected; + return &uf_space_endspace; + } + if (rec->pack_type & PACK_TYPE_SELECTED) + return &uf_endspace_selected; + return &uf_endspace; + case FIELD_SKIP_PRESPACE: + if (rec->pack_type & PACK_TYPE_SPACE_FIELDS) + { + if (rec->pack_type & PACK_TYPE_SELECTED) + return &uf_space_prespace_selected; + return &uf_space_prespace; + } + if (rec->pack_type & PACK_TYPE_SELECTED) + return &uf_prespace_selected; + return &uf_prespace; + case FIELD_CONSTANT: + return &uf_constant; + case FIELD_INTERVALL: + return &uf_intervall; + case FIELD_ZERO: + case FIELD_CHECK: + return &uf_zero; + case FIELD_BLOB: + return &uf_blob; + case 
FIELD_VARCHAR: + if (rec->length <= 256) /* 255 + 1 uchar length */ + return &uf_varchar1; + return &uf_varchar2; + case FIELD_LAST: + default: + return 0; /* This should never happen */ + } +} + + /* The different functions to unpack a field */ + +static void uf_zerofill_skip_zero(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + if (get_bit(bit_buff)) + bzero((char*) to,(uint) (end-to)); + else + { + end-=rec->space_length_bits; + decode_bytes(rec,bit_buff,to,end); + bzero((char*) end,rec->space_length_bits); + } +} + +static void uf_skip_zero(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + if (get_bit(bit_buff)) + bzero((char*) to,(uint) (end-to)); + else + decode_bytes(rec,bit_buff,to,end); +} + +static void uf_space_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + if (get_bit(bit_buff)) + bfill((uchar*) to,(end-to),' '); + else + decode_bytes(rec,bit_buff,to,end); +} + +static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill((uchar*) to,(end-to),' '); + else + { + if (get_bit(bit_buff)) + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill((uchar*) end-spaces,spaces,' '); + } + else + decode_bytes(rec,bit_buff,to,end); + } +} + +static void uf_endspace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill((uchar*) end-spaces,spaces,' '); + } + else + decode_bytes(rec,bit_buff,to,end); +} + +static void uf_space_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar 
*to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill((uchar*) to,(end-to),' '); + else + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill((uchar*) end-spaces,spaces,' '); + } +} + +static void uf_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill((uchar*) end-spaces,spaces,' '); +} + +static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill((uchar*) to,(end-to),' '); + else + { + if (get_bit(bit_buff)) + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill((uchar*) to,spaces,' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); + } + else + decode_bytes(rec,bit_buff,to,end); + } +} + + +static void uf_prespace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill((uchar*) to,spaces,' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); + } + else + decode_bytes(rec,bit_buff,to,end); +} + + +static void uf_space_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill((uchar*) to,(end-to),' '); + else + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill((uchar*) to,spaces,' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); + } +} + +static 
void uf_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill((uchar*) to,spaces,' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); +} +/* Decode the leading bytes and zero the tail; here rec->space_length_bits is used as the number of trailing zero bytes. */ +static void uf_zerofill_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + end-=rec->space_length_bits; + decode_bytes(rec,bit_buff, to, end); + bzero((char*) end,rec->space_length_bits); +} +/* Copy the field straight from rec->huff_tree->intervalls; no bits are read from the row. */ +static void uf_constant(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff __attribute__((unused)), + uchar *to, uchar *end) +{ + memcpy(to,rec->huff_tree->intervalls,(size_t) (end-to)); +} +/* Decode an index with decode_pos() and copy the field_length-sized entry at that index from rec->huff_tree->intervalls. */ +static void uf_intervall(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, + uchar *end) +{ + reg1 uint field_length=(uint) (end-to); + memcpy(to,rec->huff_tree->intervalls+field_length*decode_pos(bit_buff, + rec->huff_tree), + (size_t) field_length); +} + +/* Zero the whole field; no bits are read from the row. */ +/*ARGSUSED*/ +static void uf_zero(MARIA_COLUMNDEF *rec __attribute__((unused)), + MARIA_BIT_BUFF *bit_buff __attribute__((unused)), + uchar *to, uchar *end) +{ + bzero(to, (uint) (end-to)); +} +/* First bit set: zero the field. Otherwise read the blob length, decode the data into the area at bit_buff->blob_pos and store the length followed by a pointer to that data in the field. */ +static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + if (get_bit(bit_buff)) + bzero(to, (uint) (end-to)); + else + { + ulong length=get_bits(bit_buff,rec->space_length_bits); + uint pack_length=(uint) (end-to)-portable_sizeof_char_ptr; + if (bit_buff->blob_pos+length > bit_buff->blob_end) + { + bit_buff->error=1; + bzero((uchar*) to,(end-to)); + return; + } + decode_bytes(rec,bit_buff,(uchar*) bit_buff->blob_pos, + (uchar*) bit_buff->blob_pos+length); + _ma_store_blob_length((uchar*) to,pack_length,length); + memcpy_fixed((char*) to+pack_length,(char*) &bit_buff->blob_pos, + sizeof(char*)); + bit_buff->blob_pos+=length; + } +} + + +static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end 
__attribute__((unused))) +{ + if (get_bit(bit_buff)) + to[0]= 0; /* Zero lengths */ + else + { + ulong length=get_bits(bit_buff,rec->space_length_bits); + *to= (char) length; + decode_bytes(rec,bit_buff,to+1,to+1+length); + } +} + + +static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end __attribute__((unused))) +{ + if (get_bit(bit_buff)) + to[0]=to[1]=0; /* Zero lengths */ + else + { + ulong length=get_bits(bit_buff,rec->space_length_bits); + int2store(to,length); + decode_bytes(rec,bit_buff,to+2,to+2+length); + } +} + + /* Functions to decode of buffer of bits */ + +#if BITS_SAVED == 64 + +static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + reg1 uint bits,low_byte; + reg3 uint16 *pos; + reg4 uint table_bits,table_and; + MARIA_DECODE_TREE *decode_tree; + + decode_tree=rec->decode_tree; + bits=bit_buff->bits; /* Save in reg for quicker access */ + table_bits=decode_tree->quick_table_bits; + table_and= (1 << table_bits)-1; + + do + { + if (bits <= 32) + { + if (bit_buff->pos > bit_buff->end+4) + { + bit_buff->error=1; + return; /* Can't be right */ + } + bit_buff->current_byte= (bit_buff->current_byte << 32) + + ((((uint) bit_buff->pos[3])) + + (((uint) bit_buff->pos[2]) << 8) + + (((uint) bit_buff->pos[1]) << 16) + + (((uint) bit_buff->pos[0]) << 24)); + bit_buff->pos+=4; + bits+=32; + } + /* + First use info in quick_table. + + The quick table is an array of 16-bit values. There exists one + value for each possible code representable by table_bits bits. + In most cases table_bits is 9. So there are 512 16-bit values. + + If the high-order bit (16) is set (IS_CHAR) then the array slot + for this value is a valid Huffman code for a resulting uchar value. + + The low-order 8 bits (1..8) are the resulting uchar value. + + Bits 9..14 are the length of the Huffman code for this uchar value. 
+ This means so many bits from the input stream were needed to + represent this uchar value. The remaining bits belong to later + Huffman codes. This also means that for every Huffman code shorter + than table_bits there are multiple entries in the array, which + differ just in the unused bits. + + If the high-order bit (16) is clear (0) then the remaining bits are + the position of the remaining Huffman decode tree segment behind the + quick table. + */ + low_byte=(uint) (bit_buff->current_byte >> (bits - table_bits)) & table_and; + low_byte=decode_tree->table[low_byte]; + if (low_byte & IS_CHAR) + { + /* + All Huffman codes of less or equal table_bits length are in the + quick table. This is one of them. + */ + *to++ = (char) (low_byte & 255); /* Found char in quick table */ + bits-= ((low_byte >> 8) & 31); /* Remove bits used */ + } + else + { /* Map through rest of decode-table */ + /* This means that the Huffman code must be longer than table_bits. */ + pos=decode_tree->table+low_byte; + bits-=table_bits; + /* NOTE: decode_bytes_test_bit() is a macro which contains a break !!! 
*/ + for (;;) + { + low_byte=(uint) (bit_buff->current_byte >> (bits-8)); + decode_bytes_test_bit(0); + decode_bytes_test_bit(1); + decode_bytes_test_bit(2); + decode_bytes_test_bit(3); + decode_bytes_test_bit(4); + decode_bytes_test_bit(5); + decode_bytes_test_bit(6); + decode_bytes_test_bit(7); + bits-=8; + } + *to++ = (char) *pos; + } + } while (to != end); + + bit_buff->bits=bits; + return; +} + +#else +/* Decode Huffman-coded bytes from bit_buff into [to, end) when BITS_SAVED is not 64: quick-table lookup first, then a bit-by-bit walk of the decode table for longer codes. */ +static void decode_bytes(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + reg1 uint bits,low_byte; + reg3 uint16 *pos; + reg4 uint table_bits,table_and; + MARIA_DECODE_TREE *decode_tree; + + decode_tree=rec->huff_tree; + bits=bit_buff->bits; /* Save in reg for quicker access */ + table_bits=decode_tree->quick_table_bits; + table_and= (1 << table_bits)-1; + + do + { + if (bits < table_bits) + { + if (bit_buff->pos > bit_buff->end+1) + { + bit_buff->error=1; + return; /* Can't be right */ + } +#if BITS_SAVED == 32 + bit_buff->current_byte= (bit_buff->current_byte << 24) + + (((uint) ((uchar) bit_buff->pos[2]))) + + (((uint) ((uchar) bit_buff->pos[1])) << 8) + + (((uint) ((uchar) bit_buff->pos[0])) << 16); + bit_buff->pos+=3; + bits+=24; +#else + if (bits) /* We must have at least 9 bits */ + { + bit_buff->current_byte= (bit_buff->current_byte << 8) + + (uint) ((uchar) bit_buff->pos[0]); + bit_buff->pos++; + bits+=8; + } + else + { + bit_buff->current_byte= ((uint) ((uchar) bit_buff->pos[0]) << 8) + + ((uint) ((uchar) bit_buff->pos[1])); + bit_buff->pos+=2; + bits+=16; + } +#endif + } + /* First use info in quick_table */ + low_byte=(bit_buff->current_byte >> (bits - table_bits)) & table_and; + low_byte=decode_tree->table[low_byte]; + if (low_byte & IS_CHAR) + { + *to++ = (low_byte & 255); /* Found char in quick table */ + bits-= ((low_byte >> 8) & 31); /* Remove bits used */ + } + else + { /* Map through rest of decode-table */ + pos=decode_tree->table+low_byte; + bits-=table_bits; + for (;;) + { + if (bits < 8) + { /* We don't need 
to check end */ +#if BITS_SAVED == 32 + bit_buff->current_byte= (bit_buff->current_byte << 24) + + (((uint) ((uchar) bit_buff->pos[2]))) + + (((uint) ((uchar) bit_buff->pos[1])) << 8) + + (((uint) ((uchar) bit_buff->pos[0])) << 16); + bit_buff->pos+=3; + bits+=24; +#else + bit_buff->current_byte= (bit_buff->current_byte << 8) + + (uint) ((uchar) bit_buff->pos[0]); + bit_buff->pos+=1; + bits+=8; +#endif + } + low_byte=(uint) (bit_buff->current_byte >> (bits-8)); + decode_bytes_test_bit(0); + decode_bytes_test_bit(1); + decode_bytes_test_bit(2); + decode_bytes_test_bit(3); + decode_bytes_test_bit(4); + decode_bytes_test_bit(5); + decode_bytes_test_bit(6); + decode_bytes_test_bit(7); + bits-=8; + } + *to++ = (char) *pos; + } + } while (to != end); + + bit_buff->bits=bits; + return; +} +#endif /* BITS_SAVED == 64 */ + +/* Walk decode_tree->table one input bit at a time until an IS_CHAR entry is reached; returns that entry with the IS_CHAR flag cleared. */ +static uint decode_pos(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree) +{ + uint16 *pos=decode_tree->table; + for (;;) + { + if (get_bit(bit_buff)) + pos++; + if (*pos & IS_CHAR) + return (uint) (*pos & ~IS_CHAR); + pos+= *pos; + } +} + +/* Read and unpack the packed record at filepos during a scan; returns 0 or an error code (my_errno), HA_ERR_END_OF_FILE once filepos is past the data. */ +int _ma_read_rnd_pack_record(MARIA_HA *info, + uchar *buf, + register MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks) +{ + File file; + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share=info->s; + DBUG_ENTER("_ma_read_rnd_pack_record"); + + if (filepos >= info->state->data_file_length) + { + my_errno= HA_ERR_END_OF_FILE; + goto err; + } + + file= info->dfile.file; + if (info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(&info->rec_cache, (uchar*) block_info.header, + filepos, share->pack.ref_length, + skip_deleted_blocks ? 
READING_NEXT : 0)) + goto err; + file= -1; /* header came from the cache: a negative file makes _ma_pack_get_block_info skip the file read */ + } + if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info, + &info->rec_buff, &info->rec_buff_size, + file, filepos)) + goto err; /* Error code is already set */ +#ifndef DBUG_OFF + if (block_info.rec_len > share->max_pack_length) + { + my_errno=HA_ERR_WRONG_IN_RECORD; + goto err; + } +#endif + + if (info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(&info->rec_cache, (uchar*) info->rec_buff, + block_info.filepos, block_info.rec_len, + skip_deleted_blocks ? READING_NEXT : 0)) + goto err; + } + else + { + if (my_read(info->dfile.file, (uchar*)info->rec_buff + block_info.offset, + block_info.rec_len-block_info.offset, + MYF(MY_NABP))) + goto err; + } + info->packed_length= block_info.rec_len; + info->cur_row.lastpos= filepos; + info->cur_row.nextpos= block_info.filepos+block_info.rec_len; + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + + DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf, + info->rec_buff, block_info.rec_len)); + err: + DBUG_RETURN(my_errno); +} + + + /* Read and process header from a huff-record-file. If file >= 0 the header is read from file at filepos; otherwise info->header must already hold it. Returns 0 or BLOCK_FATAL_ERROR. */ + +uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO *info, + uchar **rec_buff_p, size_t *rec_buff_size_p, + File file, my_off_t filepos) +{ + uchar *header= info->header; + uint head_length,ref_length; + LINT_INIT(ref_length); + + if (file >= 0) + { + ref_length=maria->s->pack.ref_length; + /* + We can't use my_pread() here because _ma_read_rnd_pack_record assumes + position is ok + */ + VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0))); + if (my_read(file,(char*) header,ref_length,MYF(MY_NABP))) + return BLOCK_FATAL_ERROR; + DBUG_DUMP("header",(uchar*) header,ref_length); + } + head_length= read_pack_length((uint) maria->s->pack.version, header, + &info->rec_len); + if (maria->s->base.blobs) + { + head_length+= read_pack_length((uint) maria->s->pack.version, + header + head_length, &info->blob_len); + /* + Ensure that the record 
buffer is big enough for the compressed + record plus all expanded blobs. [We do not have an extra buffer + for the resulting blobs. Sigh.] + */ + if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p, + info->rec_len + info->blob_len + + maria->s->base.extra_rec_buff_size)) + return BLOCK_FATAL_ERROR; /* not enough memory */ + bit_buff->blob_pos= (uchar*) *rec_buff_p + info->rec_len; + bit_buff->blob_end= bit_buff->blob_pos + info->blob_len; + maria->blob_length=info->blob_len; + } + info->filepos=filepos+head_length; + if (file > 0) + { + info->offset=min(info->rec_len, ref_length - head_length); + memcpy(*rec_buff_p, header + head_length, info->offset); + } + return 0; +} + + + /* rutines for bit buffer */ + /* Note buffer must be 6 uchar bigger than longest row */ + +static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff, uchar *buffer, + uint length) +{ + bit_buff->pos=buffer; + bit_buff->end=buffer+length; + bit_buff->bits=bit_buff->error=0; + bit_buff->current_byte=0; /* Avoid purify errors */ +} + +static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff, uint count) +{ + uint tmp; + count-=bit_buff->bits; + tmp=(bit_buff->current_byte & mask[bit_buff->bits]) << count; + fill_buffer(bit_buff); + bit_buff->bits=BITS_SAVED - count; + return tmp+(bit_buff->current_byte >> (BITS_SAVED - count)); +} + + /* Fill in empty bit_buff->current_byte from buffer */ + /* Sets bit_buff->error if buffer is exhausted */ + +static void fill_buffer(MARIA_BIT_BUFF *bit_buff) +{ + if (bit_buff->pos >= bit_buff->end) + { + bit_buff->error= 1; + bit_buff->current_byte=0; + return; + } +#if BITS_SAVED == 64 + bit_buff->current_byte= ((((uint) ((uchar) bit_buff->pos[7]))) + + (((uint) ((uchar) bit_buff->pos[6])) << 8) + + (((uint) ((uchar) bit_buff->pos[5])) << 16) + + (((uint) ((uchar) bit_buff->pos[4])) << 24) + + ((ulonglong) + ((((uint) ((uchar) bit_buff->pos[3]))) + + (((uint) ((uchar) bit_buff->pos[2])) << 8) + + (((uint) ((uchar) bit_buff->pos[1])) << 16) + + (((uint) ((uchar) 
bit_buff->pos[0])) << 24)) << 32)); + bit_buff->pos+=8; +#else +#if BITS_SAVED == 32 + bit_buff->current_byte= (((uint) ((uchar) bit_buff->pos[3])) + + (((uint) ((uchar) bit_buff->pos[2])) << 8) + + (((uint) ((uchar) bit_buff->pos[1])) << 16) + + (((uint) ((uchar) bit_buff->pos[0])) << 24)); + bit_buff->pos+=4; +#else + bit_buff->current_byte= (uint) (((uint) ((uchar) bit_buff->pos[1]))+ + (((uint) ((uchar) bit_buff->pos[0])) << 8)); + bit_buff->pos+=2; +#endif +#endif +} + + /* Get number of bits needed to represent value (returns 1 for value 0) */ + +static uint max_bit(register uint value) +{ + reg2 uint power=1; + + while ((value>>=1)) + power++; + return (power); +} + + +/***************************************************************************** + Some redefined functions to handle files when we are using memmap +*****************************************************************************/ +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif + +#ifdef HAVE_MMAP + +static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos); +static int _ma_read_rnd_mempack_record(MARIA_HA*, uchar *, MARIA_RECORD_POS, + my_bool); +/* Switch the handler to the memory-mapped read functions. Returns 1 on success, 0 if the file is shorter than data_file_length + MEMMAP_EXTRA_MARGIN or _ma_dynmap_file() fails. */ +my_bool _ma_memmap_file(MARIA_HA *info) +{ + MARIA_SHARE *share=info->s; + DBUG_ENTER("maria_memmap_file"); + + if (!info->s->file_map) + { + if (my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)) < + share->state.state.data_file_length+MEMMAP_EXTRA_MARGIN) + { + DBUG_PRINT("warning",("File isn't extended for memmap")); + DBUG_RETURN(0); + } + if (_ma_dynmap_file(info, share->state.state.data_file_length)) + DBUG_RETURN(0); + } + info->opt_flag|= MEMMAP_USED; + info->read_record= share->read_record= _ma_read_mempack_record; + share->scan= _ma_read_rnd_mempack_record; + DBUG_RETURN(1); +} + +/* Unmap info->s->file_map; the unmapped size is mmaped_length plus MEMMAP_EXTRA_MARGIN. */ +void _ma_unmap_file(MARIA_HA *info) +{ + VOID(my_munmap(info->s->file_map, + (size_t) info->s->mmaped_length + MEMMAP_EXTRA_MARGIN)); +} + + +static uchar * +_ma_mempack_get_block_info(MARIA_HA *maria, + MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO 
*info, + uchar **rec_buff_p, + size_t *rec_buff_size_p, + uchar *header) +{ + header+= read_pack_length((uint) maria->s->pack.version, header, + &info->rec_len); + if (maria->s->base.blobs) + { + header+= read_pack_length((uint) maria->s->pack.version, header, + &info->blob_len); + /* _ma_alloc_buffer is expected to set my_errno on error (callers return my_errno) */ + if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p, + info->blob_len + maria->s->base.extra_rec_buff_size)) + return 0; /* not enough memory */ + bit_buff->blob_pos= (uchar*) *rec_buff_p; + bit_buff->blob_end= (uchar*) *rec_buff_p + info->blob_len; + } + return header; +} + +/* Read and unpack the record at filepos from the memory-mapped file; returns 0 or an error code. */ +static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos) +{ + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share=info->s; + uchar *pos; + DBUG_ENTER("maria_read_mempack_record"); + + if (filepos == HA_OFFSET_ERROR) + DBUG_RETURN(my_errno); /* _search() didn't find record */ + + if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff, + &block_info, &info->rec_buff, + &info->rec_buff_size, + (uchar*) share->file_map+ + filepos))) + DBUG_RETURN(my_errno); + DBUG_RETURN(_ma_pack_rec_unpack(info, &info->bit_buff, buf, + pos, block_info.rec_len)); +} + +/* Scan variant of the memory-mapped read: also records lastpos/nextpos so the scan can continue after this record. */ +/*ARGSUSED*/ +static int _ma_read_rnd_mempack_record(MARIA_HA *info, + uchar *buf, + register MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks + __attribute__((unused))) +{ + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share=info->s; + uchar *pos,*start; + DBUG_ENTER("_ma_read_rnd_mempack_record"); + + if (filepos >= share->state.state.data_file_length) + { + my_errno=HA_ERR_END_OF_FILE; + goto err; + } + if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff, + &block_info, + &info->rec_buff, + &info->rec_buff_size, + (uchar*) + (start= share->file_map + + filepos)))) + goto err; +#ifndef DBUG_OFF + if (block_info.rec_len > info->s->max_pack_length) + { + my_errno=HA_ERR_WRONG_IN_RECORD; + goto err; + } +#endif + info->packed_length=block_info.rec_len; + 
info->cur_row.lastpos= filepos; + info->cur_row.nextpos= filepos+(uint) (pos-start)+block_info.rec_len; + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + + DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf, + pos, block_info.rec_len)); + err: + DBUG_RETURN(my_errno); +} + +#endif /* HAVE_MMAP */ + + /* Save length of row: 1 byte if < 254, else marker 254 + 2 bytes, else marker 255 + 3 bytes (version 1) or 4 bytes. Returns the number of bytes written. */ + +uint _ma_save_pack_length(uint version, uchar *block_buff, ulong length) +{ + if (length < 254) + { + *(uchar*) block_buff= (uchar) length; + return 1; + } + if (length <= 65535) + { + *(uchar*) block_buff=254; + int2store(block_buff+1,(uint) length); + return 3; + } + *(uchar*) block_buff=255; + if (version == 1) /* old format */ + { + DBUG_ASSERT(length <= 0xFFFFFF); + int3store(block_buff + 1, (ulong) length); + return 4; + } + else + { + int4store(block_buff + 1, (ulong) length); + return 5; + } +} + +/* Inverse of _ma_save_pack_length(): decode the length at buf into *length and return the number of bytes consumed. */ +static uint read_pack_length(uint version, const uchar *buf, ulong *length) +{ + if (buf[0] < 254) + { + *length= buf[0]; + return 1; + } + else if (buf[0] == 254) + { + *length= uint2korr(buf + 1); + return 3; + } + if (version == 1) /* old format */ + { + *length= uint3korr(buf + 1); + return 4; + } + else + { + *length= uint4korr(buf + 1); + return 5; + } +} + +/* Number of bytes _ma_save_pack_length() writes for this version and length. */ +uint _ma_calc_pack_length(uint version, ulong length) +{ + return (length < 254) ? 1 : (length < 65536) ? 3 : (version == 1) ? 4 : 5; +} diff --git a/storage/maria/ma_page.c b/storage/maria/ma_page.c new file mode 100644 index 00000000000..f749414474f --- /dev/null +++ b/storage/maria/ma_page.c @@ -0,0 +1,188 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Read and write key blocks */ + +#include "maria_def.h" + + /* Fetch a key-page into memory via the page cache. Returns the page, or 0 with my_errno set to HA_ERR_CRASHED if the read fails or the stored page length is implausible. */ + +uchar *_ma_fetch_keypage(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t page, int level, + uchar *buff, + int return_buffer __attribute__ ((unused))) +{ + uchar *tmp; + uint page_size; + DBUG_ENTER("_ma_fetch_keypage"); + DBUG_PRINT("enter",("page: %ld", (long) page)); + + DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length); + /* + TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when + LSN on the pages will be implemented + */ + tmp= pagecache_read(info->s->pagecache, &info->s->kfile, + page / keyinfo->block_length, level, buff, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_LEFT_UNLOCKED, 0); + if (tmp == info->buff) + info->keyread_buff_used=1; + else if (!tmp) + { + DBUG_PRINT("error",("Got errno: %d from pagecache_read",my_errno)); + info->last_keypage=HA_OFFSET_ERROR; + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(0); + } + info->last_keypage=page; + page_size= maria_data_on_page(tmp); + if (page_size < 4 || page_size > keyinfo->block_length) + { + DBUG_PRINT("error",("page %lu had wrong page length: %u", + (ulong) page, page_size)); + DBUG_DUMP("page", (char*) tmp, keyinfo->block_length); + info->last_keypage = HA_OFFSET_ERROR; + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno= HA_ERR_CRASHED; + tmp= 0; + } + DBUG_RETURN(tmp); +} /* _ma_fetch_keypage */ + + + /* Write a key-page on disk */ + +int _ma_write_keypage(register MARIA_HA *info, 
register MARIA_KEYDEF *keyinfo, + my_off_t page, int level, uchar *buff) +{ + DBUG_ENTER("_ma_write_keypage"); + +#ifdef EXTRA_DEBUG /* Safety check */ + if (page < info->s->base.keystart || + page+keyinfo->block_length > info->state->key_file_length || + (page & (MARIA_MIN_KEY_BLOCK_LENGTH-1))) + { + DBUG_PRINT("error",("Trying to write inside key status region: " + "key_start: %lu length: %lu page: %lu", + (long) info->s->base.keystart, + (long) info->state->key_file_length, + (long) page)); + my_errno=EINVAL; + DBUG_RETURN((-1)); + } + DBUG_PRINT("page",("write page at: %lu",(long) page)); + DBUG_DUMP("buff",(uchar*) buff,maria_data_on_page(buff)); +#endif + +#ifdef HAVE_purify + { + /* Clear uninitialized part of page to avoid valgrind/purify warnings */ + uint length= maria_data_on_page(buff); + bzero((uchar*) buff+length,keyinfo->block_length-length); + length=keyinfo->block_length; + } +#endif + + DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length); + /* + TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when + LSN on the pages will be implemented + */ + DBUG_RETURN(pagecache_write(info->s->pagecache, + &info->s->kfile, page / keyinfo->block_length, + level, buff, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, 0)); +} /* _ma_write_keypage */ + + + /* Remove page from disk: make it the new head of the deleted-page list (state.key_del) and store the previous head in the page */ + +int _ma_dispose(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_off_t pos, + int level) +{ + my_off_t old_link; + char buff[8]; + uint offset; + pgcache_page_no_t page_no; + DBUG_ENTER("_ma_dispose"); + DBUG_PRINT("enter",("pos: %ld", (long) pos)); + + old_link= info->s->state.key_del; + info->s->state.key_del= pos; + page_no= pos / keyinfo->block_length; + offset= pos % keyinfo->block_length; + mi_sizestore(buff,old_link); + info->s->state.changed|= STATE_NOT_SORTED_PAGES; + + DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length && + info->s->pagecache->block_size == 
info->s->block_size); + /* + TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when + LSN on the pages will be implemented + */ + DBUG_RETURN(pagecache_write_part(info->s->pagecache, + &info->s->kfile, page_no, level, buff, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, 0, + offset, sizeof(buff), 0, 0)); +} /* _ma_dispose */ + + + /* Make new page on disk: reuse the head of the deleted-page list (state.key_del) if there is one, else extend the key file. Returns the page position or HA_OFFSET_ERROR. */ + +my_off_t _ma_new(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, int level) +{ + my_off_t pos; + uchar *buff; + DBUG_ENTER("_ma_new"); + + if ((pos= info->s->state.key_del) == HA_OFFSET_ERROR) + { + if (info->state->key_file_length >= + info->s->base.max_key_file_length - keyinfo->block_length) + { + my_errno=HA_ERR_INDEX_FILE_FULL; + DBUG_RETURN(HA_OFFSET_ERROR); + } + pos=info->state->key_file_length; + info->state->key_file_length+= keyinfo->block_length; + } + else + { + buff= alloca(info->s->block_size); + DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length && + info->s->pagecache->block_size == info->s->block_size); + /* + TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when + LSN on the pages will be implemented + */ + DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length); + if (!pagecache_read(info->s->pagecache, + &info->s->kfile, pos / keyinfo->block_length, level, + buff, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0)) + pos= HA_OFFSET_ERROR; + else + info->s->state.key_del= mi_sizekorr(buff); /* next deleted page, as stored by _ma_dispose() */ + } + info->s->state.changed|= STATE_NOT_SORTED_PAGES; + DBUG_PRINT("exit",("Pos: %ld",(long) pos)); + DBUG_RETURN(pos); +} /* _ma_new */ diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c new file mode 100755 index 00000000000..9f450d25c50 --- /dev/null +++ b/storage/maria/ma_pagecache.c @@ -0,0 +1,4197 @@ +/* Copyright (C) 2000-2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public 
License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + These functions handle page caching for Maria tables. + + One cache can handle many files. + It must contain buffers of the same blocksize. + init_pagecache() should be used to init cache handler. + + The free list (free_block_list) is a stack-like structure. + When a block is freed by free_block(), it is pushed onto the stack. + When a new block is required, we first try to pop one from the stack. + If the stack is empty, we try to get a never-used block from the pool. + If this is empty too, then a block is taken from the LRU ring, flushing it + to disk, if necessary. This is handled in find_block(). + With the new free list, the blocks can have three temperatures: + hot, warm and cold (which is free). This is remembered in the block header + by the enum PCBLOCK_TEMPERATURE temperature variable. Remembering the + temperature is necessary to correctly count the number of warm blocks, + which is required to decide when blocks are allowed to become hot. Whenever + a block is inserted to another (sub-)chain, we take the old and new + temperature into account to decide if we got one more or less warm block. + blocks_unused is the sum of never used blocks in the pool and of currently + free blocks. blocks_used is the number of blocks fetched from the pool and + as such gives the maximum number of in-use blocks at any time. 
+*/ + +#include "maria_def.h" +#include <m_string.h> +#include "ma_pagecache.h" +#include <my_bit.h> +#include <errno.h> +#include <stdarg.h> + +/* + Some compilation flags have been added specifically for this module + to control the following: + - not to let a thread yield control when reading directly + from page cache, which might improve performance in many cases; + to enable this add: + #define SERIALIZED_READ_FROM_CACHE + - to set an upper bound for number of threads simultaneously + using the page cache; this setting helps to determine an optimal + size for hash table and improve performance when the number of + blocks in the page cache is much less than the number of threads + accessing it; + to set this number equal to <N> add + #define MAX_THREADS <N> + - to replace calls of pthread_cond_wait with calls of + pthread_cond_timedwait (wait with timeout set up); + this setting should be used only when you want to trap a deadlock + situation, which theoretically should not happen; + to set timeout equal to <T> seconds add + #define PAGECACHE_TIMEOUT <T> + - to enable the module traps and to send debug information from + page cache module to a special debug log add: + #define PAGECACHE_DEBUG + the name of this debug log file <LOG NAME> can be set through: + #define PAGECACHE_DEBUG_LOG <LOG NAME> + if the name is not defined, it's set by default; + if the PAGECACHE_DEBUG flag is not set up and we are in a debug + mode, i.e. when ! defined(DBUG_OFF), the debug information from the + module is sent to the regular debug log. + + Example of the settings: + #define SERIALIZED_READ_FROM_CACHE + #define MAX_THREADS 100 + #define PAGECACHE_TIMEOUT 1 + #define PAGECACHE_DEBUG + #define PAGECACHE_DEBUG_LOG "my_pagecache_debug.log" +*/ + +/* + In key cache we have external raw locking; here we use + SERIALIZED_READ_FROM_CACHE to avoid the problem of reading + inconsistent data from the page. 
+ (keycache functions (key_cache_read(), key_cache_insert() and + key_cache_write()) rely on external MyISAM lock, we don't) +*/ +#define SERIALIZED_READ_FROM_CACHE yes +/* Debug-print the state of page cache block B; every field is taken from the macro argument, so B need not be a variable named "block" */ +#define PCBLOCK_INFO(B) \ + DBUG_PRINT("info", \ + ("block: 0x%lx file: %lu page: %lu s: %0x hshL: 0x%lx req: %u/%u " \ + "wrlocks: %u", \ + (ulong)(B), \ + (ulong)((B)->hash_link ? \ + (B)->hash_link->file.file : \ + 0), \ + (ulong)((B)->hash_link ? \ + (B)->hash_link->pageno : \ + 0), \ + (B)->status, \ + (ulong)(B)->hash_link, \ + (uint) (B)->requests, \ + (uint)((B)->hash_link ? \ + (B)->hash_link->requests : \ + 0), \ + (B)->wlocks)) + +/* TODO: put it to my_static.c */ +my_bool my_disable_flush_pagecache_blocks= 0; +/** + when flushing pages of a file, it can happen that we take some dirty blocks + out of changed_blocks[]; Checkpoint must not run at this moment. +*/ +uint changed_blocks_is_incomplete= 0; +/* Pointer to the TYPE object whose member MEMBER is at address a; fully parenthesized so the result can be used directly with -> */ +#define STRUCT_PTR(TYPE, MEMBER, a) \ + ((TYPE *) ((char *) (a) - offsetof(TYPE, MEMBER))) + +/* types of condition variables */ +#define COND_FOR_REQUESTED 0 /* queue of thread waiting for read operation */ +#define COND_FOR_SAVED 1 /* queue of thread waiting for flush */ +#define COND_FOR_WRLOCK 2 /* queue of write lock */ +#define COND_SIZE 3 /* number of COND_* queues */ + +/* offset of LSN on the page */ +#define PAGE_LSN_OFFSET 0 + +typedef pthread_cond_t KEYCACHE_CONDVAR; + +/* descriptor of the page in the page cache block buffer */ +struct st_pagecache_page +{ + PAGECACHE_FILE file; /* file to which the page belongs to */ + pgcache_page_no_t pageno; /* number of the page in the file */ +}; + +/* element in the chain of a hash table bucket */ +struct st_pagecache_hash_link +{ + struct st_pagecache_hash_link + *next, **prev; /* to connect links in the same bucket */ + struct st_pagecache_block_link + *block; /* reference to the block for the page: */ + PAGECACHE_FILE file; /* from such a file */ + pgcache_page_no_t pageno; /* this page */ + uint requests; /* number of requests for 
the page */ +}; + +/* simple states of a block */ +#define PCBLOCK_ERROR 1 /* an error occurred when performing disk i/o */ +#define PCBLOCK_READ 2 /* the page is in the block buffer */ +#define PCBLOCK_IN_SWITCH 4 /* block is preparing to read new page */ +#define PCBLOCK_REASSIGNED 8 /* block does not accept requests for old page */ +#define PCBLOCK_IN_FLUSH 16 /* block is in flush operation */ +#define PCBLOCK_CHANGED 32 /* block buffer contains a dirty page */ + +/* page status, returned by find_block */ +#define PAGE_READ 0 +#define PAGE_TO_BE_READ 1 +#define PAGE_WAIT_TO_BE_READ 2 + +/* block temperature determines in which (sub-)chain the block currently is */ +enum PCBLOCK_TEMPERATURE { PCBLOCK_COLD /*free*/ , PCBLOCK_WARM , PCBLOCK_HOT }; + +/* debug info */ +#ifndef DBUG_OFF +static const char *page_cache_page_type_str[]= +{ + /* used only for control page type changing during debugging */ + "EMPTY", + "PLAIN", + "LSN", + "READ_UNKNOWN" +}; + +static const char *page_cache_page_write_mode_str[]= +{ + "DELAY", + "NOW", + "DONE" +}; + +static const char *page_cache_page_lock_str[]= +{ + "free -> free", + "read -> read", + "write -> write", + "free -> read", + "free -> write", + "read -> free", + "write -> free", + "write -> read" +}; + +static const char *page_cache_page_pin_str[]= +{ + "pinned -> pinned", + "unpinned -> unpinned", + "unpinned -> pinned", + "pinned -> unpinned" +}; + + +typedef struct st_pagecache_pin_info +{ + struct st_pagecache_pin_info *next, **prev; + struct st_my_thread_var *thread; +} PAGECACHE_PIN_INFO; + +/* + st_pagecache_lock_info must keep its leading next, prev, thread members + laid out like st_pagecache_pin_info so the same list functions can handle both. 
+*/
+
+typedef struct st_pagecache_lock_info
+{
+  struct st_pagecache_lock_info *next, **prev;
+  struct st_my_thread_var *thread;
+  /* tested by info_check_lock() to tell a write lock from a read lock */
+  my_bool write_lock;
+} PAGECACHE_LOCK_INFO;
+
+
+/* service functions maintain debugging info about pin & lock */
+
+
+/*
+  Links information about thread pinned/locked the block to the list
+
+  SYNOPSIS
+    info_link()
+    list                 the list to link in
+    node                 the node which should be linked
+
+  NOTES
+    The node is inserted at the head of the list. node->prev is set to the
+    address of the pointer that refers to the node, which is what allows
+    info_unlink() to remove a node without being given the list head.
+*/
+
+static void info_link(PAGECACHE_PIN_INFO **list, PAGECACHE_PIN_INFO *node)
+{
+  /* the old head (if any) must now point back at our 'next' field */
+  if ((node->next= *list))
+    node->next->prev= &(node->next);
+  *list= node;
+  node->prev= list;
+}
+
+
+/*
+  Unlinks information about thread pinned/locked the block from the list
+
+  SYNOPSIS
+    info_unlink()
+    node                 the node which should be unlinked
+
+  NOTES
+    The node's own next/prev fields are left unchanged.
+*/
+
+static void info_unlink(PAGECACHE_PIN_INFO *node)
+{
+  /* make the referring pointer skip the node, then fix the back pointer */
+  if ((*node->prev= node->next))
+    node->next->prev= node->prev;
+}
+
+
+/*
+  Finds information about given thread in the list of threads which
+  pinned/locked this block.
+
+  SYNOPSIS
+    info_find()
+    list                 the list where to find the thread
+    thread               thread ID (reference to the st_my_thread_var
+                         of the thread)
+
+  RETURN
+    0 - the thread was not found
+    pointer to the information node of the thread in the list
+
+  NOTES
+    Linear scan; the first node whose 'thread' matches is returned.
+*/
+
+static PAGECACHE_PIN_INFO *info_find(PAGECACHE_PIN_INFO *list,
+                                     struct st_my_thread_var *thread)
+{
+  register PAGECACHE_PIN_INFO *i= list;
+  for(; i != 0; i= i->next)
+    if (i->thread == thread)
+      return i;
+  return 0;
+}
+
+#endif /* !DBUG_OFF */
+
+/* page cache block */
+struct st_pagecache_block_link
+{
+  struct st_pagecache_block_link
+    *next_used, **prev_used;   /* to connect links in the LRU chain (ring) */
+  struct st_pagecache_block_link
+    *next_changed, **prev_changed; /* for lists of file dirty/clean blocks */
+  struct st_pagecache_hash_link
+    *hash_link;           /* backward ptr to referring hash_link */
+#ifndef DBUG_OFF
+  /* debug-only lists, searched with info_find() by the info_check_*() code */
+  PAGECACHE_PIN_INFO *pin_list;
+  PAGECACHE_LOCK_INFO *lock_list;
+#endif
+  KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */
+  uchar *buffer;           /* buffer for the block page */
+  /* NOTE(review): presumably the file handle holding the write lock - confirm */
+  PAGECACHE_FILE *write_locker;
+  ulonglong last_hit_time; /* timestamp of the last hit */
+  WQUEUE
+    wqueue[COND_SIZE];    /* queues on waiting requests for new/old pages */
+  uint requests;          /* number of requests for the block */
+  uint status;            /* state of the block (PCBLOCK_* bit flags) */
+  uint pins;              /* pin counter */
+  uint wlocks;            /* write locks counter */
+  enum PCBLOCK_TEMPERATURE temperature; /* block temperature: cold, warm, hot */
+  enum pagecache_page_type type; /* type of the block */
+  uint hits_left;         /* number of hits left until promotion */
+  /** @brief LSN when first became dirty; LSN_MAX means "not yet set" */
+  LSN rec_lsn;
+};
+
+#ifndef DBUG_OFF
+/* debug checks */
+
+#ifdef NOT_USED
+/*
+  Debug function which checks whether the requested pin change is
+  consistent with what is recorded for the calling thread in
+  block->pin_list.
+
+  RETURN
+    0 - OK
+    1 - the thread is recorded as having pinned the block but asks for
+        PAGECACHE_PIN_LEFT_UNPINNED or PAGECACHE_PIN, or it is not
+        recorded but asks for PAGECACHE_PIN_LEFT_PINNED or PAGECACHE_UNPIN
+        (the reason is written to the DBUG trace)
+*/
+static my_bool info_check_pin(PAGECACHE_BLOCK_LINK *block,
+                              enum pagecache_page_pin mode
+                              __attribute__((unused)))
+{
+  struct st_my_thread_var *thread= my_thread_var;
+  PAGECACHE_PIN_INFO *info= info_find(block->pin_list, thread);
+  DBUG_ENTER("info_check_pin");
+  DBUG_PRINT("enter", ("thread: 0x%lx pin: %s",
+                       (ulong) thread, page_cache_page_pin_str[mode]));
+  if (info)
+  {
+    /* this thread is already in the pin list of the block */
+    if (mode == PAGECACHE_PIN_LEFT_UNPINNED)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx block: 0x%lx ; LEFT_UNPINNED!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+    else if (mode == PAGECACHE_PIN)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx block: 0x%lx ; PIN!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+  }
+  else
+  {
+    /* this thread is not in the pin list of the block */
+    if (mode == PAGECACHE_PIN_LEFT_PINNED)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx block: 0x%lx ; LEFT_PINNED!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+    else if (mode == PAGECACHE_UNPIN)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx block: 0x%lx ; UNPIN!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Debug function which checks current lock/pin state and requested changes
+
+  SYNOPSIS
+ info_check_lock() + lock requested lock changes + pin requested pin changes + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool info_check_lock(PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin) +{ + struct st_my_thread_var *thread= my_thread_var; + PAGECACHE_LOCK_INFO *info= + (PAGECACHE_LOCK_INFO *) info_find((PAGECACHE_PIN_INFO *) block->lock_list, + thread); + DBUG_ENTER("info_check_lock"); + switch(lock) + { + case PAGECACHE_LOCK_LEFT_UNLOCKED: + if (pin != PAGECACHE_PIN_LEFT_UNPINNED || + info) + goto error; + break; + case PAGECACHE_LOCK_LEFT_READLOCKED: + if ((pin != PAGECACHE_PIN_LEFT_UNPINNED && + pin != PAGECACHE_PIN_LEFT_PINNED) || + info == 0 || info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_LEFT_WRITELOCKED: + if (pin != PAGECACHE_PIN_LEFT_PINNED || + info == 0 || !info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_READ: + if ((pin != PAGECACHE_PIN_LEFT_UNPINNED && + pin != PAGECACHE_PIN) || + info != 0) + goto error; + break; + case PAGECACHE_LOCK_WRITE: + if (pin != PAGECACHE_PIN || + info != 0) + goto error; + break; + case PAGECACHE_LOCK_READ_UNLOCK: + if ((pin != PAGECACHE_PIN_LEFT_UNPINNED && + pin != PAGECACHE_UNPIN) || + info == 0 || info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_WRITE_UNLOCK: + if (pin != PAGECACHE_UNPIN || + info == 0 || !info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_WRITE_TO_READ: + if ((pin != PAGECACHE_PIN_LEFT_PINNED && + pin != PAGECACHE_UNPIN) || + info == 0 || !info->write_lock) + goto error; + break; + } + DBUG_RETURN(0); +error: + DBUG_PRINT("info", + ("info_check_lock: thread: 0x%lx block 0x%lx: info: %d wrt: %d," + "to lock: %s, to pin: %s", + (ulong)thread, (ulong)block, test(info), + (info ? 
info->write_lock : 0), + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + DBUG_RETURN(1); +} +#endif /* NOT_USED */ +#endif /* !DBUG_OFF */ + +#define FLUSH_CACHE 2000 /* sort this many blocks at once */ + +static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block); +static void test_key_cache(PAGECACHE *pagecache, + const char *where, my_bool lock); + +#define PAGECACHE_HASH(p, f, pos) (((ulong) (pos) + \ + (ulong) (f).file) & (p->hash_entries-1)) +#define FILE_HASH(f) ((uint) (f).file & (PAGECACHE_CHANGED_BLOCKS_HASH - 1)) + +#define DEFAULT_PAGECACHE_DEBUG_LOG "pagecache_debug.log" + +#if defined(PAGECACHE_DEBUG) && ! defined(PAGECACHE_DEBUG_LOG) +#define PAGECACHE_DEBUG_LOG DEFAULT_PAGECACHE_DEBUG_LOG +#endif + +#if defined(PAGECACHE_DEBUG_LOG) +static FILE *pagecache_debug_log= NULL; +static void pagecache_debug_print _VARARGS((const char *fmt, ...)); +#define PAGECACHE_DEBUG_OPEN \ + if (!pagecache_debug_log) \ + { \ + pagecache_debug_log= fopen(PAGECACHE_DEBUG_LOG, "w"); \ + (void) setvbuf(pagecache_debug_log, NULL, _IOLBF, BUFSIZ); \ + } + +#define PAGECACHE_DEBUG_CLOSE \ + if (pagecache_debug_log) \ + { \ + fclose(pagecache_debug_log); \ + pagecache_debug_log= 0; \ + } +#else +#define PAGECACHE_DEBUG_OPEN +#define PAGECACHE_DEBUG_CLOSE +#endif /* defined(PAGECACHE_DEBUG_LOG) */ + +#if defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG) +#define KEYCACHE_DBUG_PRINT(l, m) \ + { if (pagecache_debug_log) \ + fprintf(pagecache_debug_log, "%s: ", l); \ + pagecache_debug_print m; } + +#define KEYCACHE_DBUG_ASSERT(a) \ + { if (! 
(a) && pagecache_debug_log) \ + fclose(pagecache_debug_log); \ + assert(a); } +#else +#define KEYCACHE_DBUG_PRINT(l, m) DBUG_PRINT(l, m) +#define KEYCACHE_DBUG_ASSERT(a) DBUG_ASSERT(a) +#endif /* defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG) */ + +#if defined(PAGECACHE_DEBUG) || !defined(DBUG_OFF) +#ifdef THREAD +static long pagecache_thread_id; +#define KEYCACHE_THREAD_TRACE(l) \ + KEYCACHE_DBUG_PRINT(l,("|thread %ld",pagecache_thread_id)) + +#define KEYCACHE_THREAD_TRACE_BEGIN(l) \ + { struct st_my_thread_var *thread_var= my_thread_var; \ + pagecache_thread_id= thread_var->id; \ + KEYCACHE_DBUG_PRINT(l,("[thread %ld",pagecache_thread_id)) } + +#define KEYCACHE_THREAD_TRACE_END(l) \ + KEYCACHE_DBUG_PRINT(l,("]thread %ld",pagecache_thread_id)) +#else /* THREAD */ +#define KEYCACHE_THREAD_TRACE(l) KEYCACHE_DBUG_PRINT(l,("")) +#define KEYCACHE_THREAD_TRACE_BEGIN(l) KEYCACHE_DBUG_PRINT(l,("")) +#define KEYCACHE_THREAD_TRACE_END(l) KEYCACHE_DBUG_PRINT(l,("")) +#endif /* THREAD */ +#else +#define KEYCACHE_THREAD_TRACE_BEGIN(l) +#define KEYCACHE_THREAD_TRACE_END(l) +#define KEYCACHE_THREAD_TRACE(l) +#endif /* defined(PAGECACHE_DEBUG) || !defined(DBUG_OFF) */ + +#define PCBLOCK_NUMBER(p, b) \ + ((uint) (((char*)(b)-(char *) p->block_root)/sizeof(PAGECACHE_BLOCK_LINK))) +#define PAGECACHE_HASH_LINK_NUMBER(p, h) \ + ((uint) (((char*)(h)-(char *) p->hash_link_root)/ \ + sizeof(PAGECACHE_HASH_LINK))) + +#if (defined(PAGECACHE_TIMEOUT) && !defined(__WIN__)) || defined(PAGECACHE_DEBUG) +static int pagecache_pthread_cond_wait(pthread_cond_t *cond, + pthread_mutex_t *mutex); +#else +#define pagecache_pthread_cond_wait pthread_cond_wait +#endif + +#if defined(PAGECACHE_DEBUG) +static int ___pagecache_pthread_mutex_lock(pthread_mutex_t *mutex); +static void ___pagecache_pthread_mutex_unlock(pthread_mutex_t *mutex); +static int ___pagecache_pthread_cond_signal(pthread_cond_t *cond); +#define pagecache_pthread_mutex_lock(M) \ +{ DBUG_PRINT("lock", ("mutex lock 0x%lx %u", 
(ulong)(M), __LINE__)); \ + ___pagecache_pthread_mutex_lock(M);} +#define pagecache_pthread_mutex_unlock(M) \ +{ DBUG_PRINT("lock", ("mutex unlock 0x%lx %u", (ulong)(M), __LINE__)); \ + ___pagecache_pthread_mutex_unlock(M);} +#define pagecache_pthread_cond_signal(M) \ +{ DBUG_PRINT("lock", ("signal 0x%lx %u", (ulong)(M), __LINE__)); \ + ___pagecache_pthread_cond_signal(M);} +#else +#define pagecache_pthread_mutex_lock pthread_mutex_lock +#define pagecache_pthread_mutex_unlock pthread_mutex_unlock +#define pagecache_pthread_cond_signal pthread_cond_signal +#endif /* defined(PAGECACHE_DEBUG) */ + +extern my_bool translog_flush(LSN lsn); + +/* + Write page to the disk + + SYNOPSIS + pagecache_fwrite() + pagecache - page cache pointer + filedesc - pagecache file descriptor structure + buffer - buffer which we will write + type - page type (plain or with LSN) + flags - MYF() flags + + RETURN + 0 - OK + !=0 - Error +*/ + +static uint pagecache_fwrite(PAGECACHE *pagecache, + PAGECACHE_FILE *filedesc, + uchar *buffer, + pgcache_page_no_t pageno, + enum pagecache_page_type type, + myf flags) +{ + DBUG_ENTER("pagecache_fwrite"); + DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE); + if (type == PAGECACHE_LSN_PAGE) + { + LSN lsn; + DBUG_PRINT("info", ("Log handler call")); + /* TODO: integrate with page format */ + lsn= lsn_korr(buffer + PAGE_LSN_OFFSET); + DBUG_ASSERT(LSN_VALID(lsn)); + translog_flush(lsn); + } + DBUG_RETURN(my_pwrite(filedesc->file, buffer, pagecache->block_size, + (pageno)<<(pagecache->shift), flags)); +} + + +/* + Read page from the disk + + SYNOPSIS + pagecache_fread() + pagecache - page cache pointer + filedesc - pagecache file descriptor structure + buffer - buffer in which we will read + pageno - page number + flags - MYF() flags +*/ +#define pagecache_fread(pagecache, filedesc, buffer, pageno, flags) \ + my_pread((filedesc)->file, buffer, pagecache->block_size, \ + (pageno)<<(pagecache->shift), flags) + + +/* + next_power(value) is 2 at the power of 
(1+floor(log2(value))); + e.g. next_power(2)=4, next_power(3)=4. +*/ +static inline uint next_power(uint value) +{ + return (uint) my_round_up_to_next_power((uint32) value) << 1; +} + + +/* + Initialize a page cache + + SYNOPSIS + init_pagecache() + pagecache pointer to a page cache data structure + key_cache_block_size size of blocks to keep cached data + use_mem total memory to use for the key cache + division_limit division limit (may be zero) + age_threshold age threshold (may be zero) + block_size size of block (should be power of 2) + + RETURN VALUE + number of blocks in the key cache, if successful, + 0 - otherwise. + + NOTES. + if pagecache->inited != 0 we assume that the key cache + is already initialized. This is for now used by myisamchk, but shouldn't + be something that a program should rely on! + + It's assumed that no two threads call this function simultaneously + referring to the same key cache handle. + +*/ + +int init_pagecache(PAGECACHE *pagecache, size_t use_mem, + uint division_limit, uint age_threshold, + uint block_size) +{ + uint blocks, hash_links, length; + int error; + DBUG_ENTER("init_pagecache"); + DBUG_ASSERT(block_size >= 512); + + PAGECACHE_DEBUG_OPEN; + if (pagecache->inited && pagecache->disk_blocks > 0) + { + DBUG_PRINT("warning",("key cache already in use")); + DBUG_RETURN(0); + } + + pagecache->global_cache_w_requests= pagecache->global_cache_r_requests= 0; + pagecache->global_cache_read= pagecache->global_cache_write= 0; + pagecache->disk_blocks= -1; + if (! 
pagecache->inited) + { + pagecache->inited= 1; + pagecache->in_init= 0; + pthread_mutex_init(&pagecache->cache_lock, MY_MUTEX_INIT_FAST); + pagecache->resize_queue.last_thread= NULL; + } + + pagecache->mem_size= use_mem; + pagecache->block_size= block_size; + pagecache->shift= my_bit_log2(block_size); + DBUG_PRINT("info", ("block_size: %u", + block_size)); + DBUG_ASSERT(((uint)(1 << pagecache->shift)) == block_size); + + blocks= (int) (use_mem / (sizeof(PAGECACHE_BLOCK_LINK) + + 2 * sizeof(PAGECACHE_HASH_LINK) + + sizeof(PAGECACHE_HASH_LINK*) * + 5/4 + block_size)); + /* + We need to support page cache with just one block to be able to do + scanning of rows-in-block files + */ + if (blocks >= 1) + { + for ( ; ; ) + { + /* Set my_hash_entries to the next bigger 2 power */ + if ((pagecache->hash_entries= next_power(blocks)) < + (blocks) * 5/4) + pagecache->hash_entries<<= 1; + hash_links= 2 * blocks; +#if defined(MAX_THREADS) + if (hash_links < MAX_THREADS + blocks - 1) + hash_links= MAX_THREADS + blocks - 1; +#endif + while ((length= (ALIGN_SIZE(blocks * sizeof(PAGECACHE_BLOCK_LINK)) + + ALIGN_SIZE(hash_links * sizeof(PAGECACHE_HASH_LINK)) + + ALIGN_SIZE(sizeof(PAGECACHE_HASH_LINK*) * + pagecache->hash_entries))) + + (((ulong) blocks) << pagecache->shift) > use_mem) + blocks--; + /* Allocate memory for cache page buffers */ + if ((pagecache->block_mem= + my_large_malloc((ulong) blocks * pagecache->block_size, + MYF(MY_WME)))) + { + /* + Allocate memory for blocks, hash_links and hash entries; + For each block 2 hash links are allocated + */ + if ((pagecache->block_root= + (PAGECACHE_BLOCK_LINK*) my_malloc((uint) length, + MYF(0)))) + break; + my_large_free(pagecache->block_mem, MYF(0)); + pagecache->block_mem= 0; + } + if (blocks < 8) + { + my_errno= ENOMEM; + goto err; + } + blocks= blocks / 4*3; + } + pagecache->blocks_unused= (ulong) blocks; + pagecache->disk_blocks= (int) blocks; + pagecache->hash_links= hash_links; + pagecache->hash_root= + 
(PAGECACHE_HASH_LINK**) ((char*) pagecache->block_root + + ALIGN_SIZE(blocks*sizeof(PAGECACHE_BLOCK_LINK))); + pagecache->hash_link_root= + (PAGECACHE_HASH_LINK*) ((char*) pagecache->hash_root + + ALIGN_SIZE((sizeof(PAGECACHE_HASH_LINK*) * + pagecache->hash_entries))); + bzero((uchar*) pagecache->block_root, + pagecache->disk_blocks * sizeof(PAGECACHE_BLOCK_LINK)); + bzero((uchar*) pagecache->hash_root, + pagecache->hash_entries * sizeof(PAGECACHE_HASH_LINK*)); + bzero((uchar*) pagecache->hash_link_root, + pagecache->hash_links * sizeof(PAGECACHE_HASH_LINK)); + pagecache->hash_links_used= 0; + pagecache->free_hash_list= NULL; + pagecache->blocks_used= pagecache->blocks_changed= 0; + + pagecache->global_blocks_changed= 0; + pagecache->blocks_available=0; /* For debugging */ + + /* The LRU chain is empty after initialization */ + pagecache->used_last= NULL; + pagecache->used_ins= NULL; + pagecache->free_block_list= NULL; + pagecache->time= 0; + pagecache->warm_blocks= 0; + pagecache->min_warm_blocks= (division_limit ? + blocks * division_limit / 100 + 1 : + blocks); + pagecache->age_threshold= (age_threshold ? + blocks * age_threshold / 100 : + blocks); + + pagecache->cnt_for_resize_op= 0; + pagecache->resize_in_flush= 0; + pagecache->can_be_used= 1; + + pagecache->waiting_for_hash_link.last_thread= NULL; + pagecache->waiting_for_block.last_thread= NULL; + DBUG_PRINT("exit", + ("disk_blocks: %d block_root: 0x%lx hash_entries: %d\ + hash_root: 0x%lx hash_links: %d hash_link_root: 0x%lx", + pagecache->disk_blocks, (long) pagecache->block_root, + pagecache->hash_entries, (long) pagecache->hash_root, + pagecache->hash_links, (long) pagecache->hash_link_root)); + bzero((uchar*) pagecache->changed_blocks, + sizeof(pagecache->changed_blocks[0]) * + PAGECACHE_CHANGED_BLOCKS_HASH); + bzero((uchar*) pagecache->file_blocks, + sizeof(pagecache->file_blocks[0]) * + PAGECACHE_CHANGED_BLOCKS_HASH); + } + + pagecache->blocks= pagecache->disk_blocks > 0 ? 
pagecache->disk_blocks : 0; + DBUG_RETURN((int) pagecache->disk_blocks); + +err: + error= my_errno; + pagecache->disk_blocks= 0; + pagecache->blocks= 0; + if (pagecache->block_mem) + { + my_large_free((uchar*) pagecache->block_mem, MYF(0)); + pagecache->block_mem= NULL; + } + if (pagecache->block_root) + { + my_free((uchar*) pagecache->block_root, MYF(0)); + pagecache->block_root= NULL; + } + my_errno= error; + pagecache->can_be_used= 0; + DBUG_RETURN(0); +} + + +/* + Flush all blocks in the key cache to disk +*/ + +#ifdef NOT_USED +static int flush_all_key_blocks(PAGECACHE *pagecache) +{ +#if defined(PAGECACHE_DEBUG) + uint cnt=0; +#endif + while (pagecache->blocks_changed > 0) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->used_last->next_used ; ; block=block->next_used) + { + if (block->hash_link) + { +#if defined(PAGECACHE_DEBUG) + cnt++; + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used); +#endif + if (flush_pagecache_blocks_int(pagecache, &block->hash_link->file, + FLUSH_RELEASE)) + return 1; + break; + } + if (block == pagecache->used_last) + break; + } + } + return 0; +} +#endif /* NOT_USED */ + +/* + Resize a key cache + + SYNOPSIS + resize_pagecache() + pagecache pointer to a page cache data structure + use_mem total memory to use for the new key cache + division_limit new division limit (if not zero) + age_threshold new age threshold (if not zero) + + RETURN VALUE + number of blocks in the key cache, if successful, + 0 - otherwise. + + NOTES. + The function first compares the memory size parameter + with the key cache value. + + If they differ the function free the the memory allocated for the + old key cache blocks by calling the end_pagecache function and + then rebuilds the key cache with new blocks by calling + init_key_cache. + + The function starts the operation only when all other threads + performing operations with the key cache let her to proceed + (when cnt_for_resize=0). 
+ + Before being usable, this function needs: + - to receive fixes for BUG#17332 "changing key_buffer_size on a running + server can crash under load" similar to those done to the key cache + - to have us (Sanja) look at the additional constraints placed on + resizing, due to the page locking specific to this page cache. + So we disable it for now. +*/ +#if NOT_USED /* keep disabled until code is fixed see above !! */ +int resize_pagecache(PAGECACHE *pagecache, + size_t use_mem, uint division_limit, + uint age_threshold) +{ + int blocks; +#ifdef THREAD + struct st_my_thread_var *thread; + WQUEUE *wqueue; + +#endif + DBUG_ENTER("resize_pagecache"); + + if (!pagecache->inited) + DBUG_RETURN(pagecache->disk_blocks); + + if(use_mem == pagecache->mem_size) + { + change_pagecache_param(pagecache, division_limit, age_threshold); + DBUG_RETURN(pagecache->disk_blocks); + } + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + +#ifdef THREAD + wqueue= &pagecache->resize_queue; + thread= my_thread_var; + wqueue_link_into_queue(wqueue, thread); + + while (wqueue->last_thread->next != thread) + { + pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock); + } +#endif + + pagecache->resize_in_flush= 1; + if (flush_all_key_blocks(pagecache)) + { + /* TODO: if this happens, we should write a warning in the log file ! 
*/ + pagecache->resize_in_flush= 0; + blocks= 0; + pagecache->can_be_used= 0; + goto finish; + } + pagecache->resize_in_flush= 0; + pagecache->can_be_used= 0; +#ifdef THREAD + while (pagecache->cnt_for_resize_op) + { + KEYCACHE_DBUG_PRINT("resize_pagecache: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock); + } +#else + KEYCACHE_DBUG_ASSERT(pagecache->cnt_for_resize_op == 0); +#endif + + end_pagecache(pagecache, 0); /* Don't free mutex */ + /* The following will work even if use_mem is 0 */ + blocks= init_pagecache(pagecache, pagecache->block_size, use_mem, + division_limit, age_threshold); + +finish: +#ifdef THREAD + wqueue_unlink_from_queue(wqueue, thread); + /* Signal for the next resize request to proceeed if any */ + if (wqueue->last_thread) + { + KEYCACHE_DBUG_PRINT("resize_pagecache: signal", + ("thread %ld", wqueue->last_thread->next->id)); + pagecache_pthread_cond_signal(&wqueue->last_thread->next->suspend); + } +#endif + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_RETURN(blocks); +} +#endif /* 0 */ + + +/* + Increment counter blocking resize key cache operation +*/ +static inline void inc_counter_for_resize_op(PAGECACHE *pagecache) +{ + pagecache->cnt_for_resize_op++; +} + + +/* + Decrement counter blocking resize key cache operation; + Signal the operation to proceed when counter becomes equal zero +*/ +static inline void dec_counter_for_resize_op(PAGECACHE *pagecache) +{ +#ifdef THREAD + struct st_my_thread_var *last_thread; + if (!--pagecache->cnt_for_resize_op && + (last_thread= pagecache->resize_queue.last_thread)) + { + KEYCACHE_DBUG_PRINT("dec_counter_for_resize_op: signal", + ("thread %ld", last_thread->next->id)); + pagecache_pthread_cond_signal(&last_thread->next->suspend); + } +#else + pagecache->cnt_for_resize_op--; +#endif +} + +/* + Change the page cache parameters + + SYNOPSIS + change_pagecache_param() + pagecache pointer to a page cache data structure + 
+    division_limit       new division limit (if not zero)
+    age_threshold        new age threshold (if not zero)
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    Presently the function resets the key cache parameters
+    concerning midpoint insertion strategy - division_limit and
+    age_threshold.
+*/
+
+void change_pagecache_param(PAGECACHE *pagecache, uint division_limit,
+                            uint age_threshold)
+{
+  DBUG_ENTER("change_pagecache_param");
+
+  /* Both limits are percentages of disk_blocks; a zero argument keeps the old value */
+  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+  if (division_limit)
+    pagecache->min_warm_blocks= (pagecache->disk_blocks *
+                                 division_limit / 100 + 1);
+  if (age_threshold)
+    pagecache->age_threshold=   (pagecache->disk_blocks *
+                                 age_threshold / 100);
+  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Removes page cache from memory. Does NOT flush pages to disk.
+
+  SYNOPSIS
+    end_pagecache()
+    pagecache            page cache handle
+    cleanup              Complete free (Free also mutex for key cache)
+
+  RETURN VALUE
+    none
+
+  NOTES
+    Does nothing if the handle was never initialized (inited == 0).
+    After the call disk_blocks is -1 if there were blocks allocated.
+*/
+
+void end_pagecache(PAGECACHE *pagecache, my_bool cleanup)
+{
+  DBUG_ENTER("end_pagecache");
+  DBUG_PRINT("enter", ("key_cache: 0x%lx", (long) pagecache));
+
+  if (!pagecache->inited)
+    DBUG_VOID_RETURN;
+
+  if (pagecache->disk_blocks > 0)
+  {
+    if (pagecache->block_mem)
+    {
+      /* block_root is only freed together with the page buffers */
+      my_large_free((uchar*) pagecache->block_mem, MYF(0));
+      pagecache->block_mem= NULL;
+      my_free((uchar*) pagecache->block_root, MYF(0));
+      pagecache->block_root= NULL;
+    }
+    pagecache->disk_blocks= -1;
+    /* Reset blocks_changed to be safe if flush_all_key_blocks is called */
+    pagecache->blocks_changed= 0;
+  }
+
+  DBUG_PRINT("status", ("used: %lu changed: %lu w_requests: %lu "
+                        "writes: %lu r_requests: %lu reads: %lu",
+                        pagecache->blocks_used, pagecache->global_blocks_changed,
+                        (ulong) pagecache->global_cache_w_requests,
+                        (ulong) pagecache->global_cache_write,
+                        (ulong) pagecache->global_cache_r_requests,
+                        (ulong) pagecache->global_cache_read));
+
+  if (cleanup)
+  {
+    /* Full shutdown: the handle must be initialized again before reuse */
+    pthread_mutex_destroy(&pagecache->cache_lock);
+    pagecache->inited= pagecache->can_be_used= 0;
+    PAGECACHE_DEBUG_CLOSE;
+  }
+  DBUG_VOID_RETURN;
+} /* end_pagecache */
+
+
+/*
+  Unlink a block from the chain of dirty/clean blocks
+
+  NOTES
+    prev_changed holds the address of the pointer that refers to the block
+    (see link_changed()), so no list head is needed here.
+*/
+
+static inline void unlink_changed(PAGECACHE_BLOCK_LINK *block)
+{
+  if (block->next_changed)
+    block->next_changed->prev_changed= block->prev_changed;
+  *block->prev_changed= block->next_changed;
+}
+
+
+/*
+  Link a block into the chain of dirty/clean blocks
+
+  NOTES
+    The block is inserted at the head of the chain whose head pointer
+    is *phead.
+*/
+
+static inline void link_changed(PAGECACHE_BLOCK_LINK *block,
+                                PAGECACHE_BLOCK_LINK **phead)
+{
+  block->prev_changed= phead;
+  if ((block->next_changed= *phead))
+    (*phead)->prev_changed= &block->next_changed;
+  *phead= block;
+}
+
+
+/*
+  Unlink a block from the chain of dirty/clean blocks, if it's asked for,
+  and link it to the chain of clean blocks for the specified file
+
+  NOTES
+    If the block was marked PCBLOCK_CHANGED the flag is cleared, rec_lsn
+    is reset to LSN_MAX and both dirty-block counters are decremented.
+*/
+
+static void link_to_file_list(PAGECACHE *pagecache,
+                              PAGECACHE_BLOCK_LINK *block,
+                              PAGECACHE_FILE *file, my_bool unlink)
+{
+  if (unlink)
+    unlink_changed(block);
+  link_changed(block, &pagecache->file_blocks[FILE_HASH(*file)]);
+  if (block->status & PCBLOCK_CHANGED)
+  {
+    block->status&= ~PCBLOCK_CHANGED;
+    block->rec_lsn= LSN_MAX;
+    pagecache->blocks_changed--;
+    pagecache->global_blocks_changed--;
+  }
+}
+
+
+/*
+  Unlink a block from the chain of clean blocks for the specified
+  file and link it to the chain of dirty blocks for this file
+
+  NOTES
+    The file is taken from block->hash_link, which therefore must be set.
+    The block is marked PCBLOCK_CHANGED and both dirty-block counters are
+    incremented unconditionally.
+*/
+
+static inline void link_to_changed_list(PAGECACHE *pagecache,
+                                        PAGECACHE_BLOCK_LINK *block)
+{
+  unlink_changed(block);
+  link_changed(block,
+               &pagecache->changed_blocks[FILE_HASH(block->hash_link->file)]);
+  block->status|=PCBLOCK_CHANGED;
+  pagecache->blocks_changed++;
+  pagecache->global_blocks_changed++;
+}
+
+
+/*
+  Link a block to the LRU chain at the beginning or at the end of
+  one of two parts.
+
+  SYNOPSIS
+    link_block()
+    pagecache            pointer to a page cache data structure
+    block                pointer to the block to link to the LRU chain
+    hot                  <-> to link the block into the hot subchain
+    at_end               <-> to link the block at the end of the subchain
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    The LRU chain is represented by a curcular list of block structures.
+    The list is double-linked of the type (**prev,*next) type.
+    The LRU chain is divided into two parts - hot and warm.
+    There are two pointers to access the last blocks of these two
+    parts. The beginning of the warm part follows right after the
+    end of the hot part.
+    Only blocks of the warm part can be used for replacement.
+    The first block from the beginning of this subchain is always
+    taken for eviction (pagecache->last_used->next)
+
+    LRU chain:       +------+   H O T    +------+
+                +----| end  |----...<----| beg  |----+
+                |    +------+last        +------+    |
+                v<-link in latest hot (new end)      |
+                |     link in latest warm (new end)->^
+                |    +------+  W A R M   +------+    |
+                +----| beg  |---->...----| end  |----+
+                     +------+            +------+ins
+                  first for eviction
+*/
+
+static void link_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block,
+                       my_bool hot, my_bool at_end)
+{
+  PAGECACHE_BLOCK_LINK *ins;
+  PAGECACHE_BLOCK_LINK **ptr_ins;
+
+  PCBLOCK_INFO(block);
+  KEYCACHE_DBUG_ASSERT(! (block->hash_link && block->hash_link->requests));
+#ifdef THREAD
+  if (!hot && pagecache->waiting_for_block.last_thread)
+  {
+    /* Signal that in the LRU warm sub-chain an available block has appeared */
+    struct st_my_thread_var *last_thread=
+                               pagecache->waiting_for_block.last_thread;
+    struct st_my_thread_var *first_thread= last_thread->next;
+    struct st_my_thread_var *next_thread= first_thread;
+    PAGECACHE_HASH_LINK *hash_link=
+      (PAGECACHE_HASH_LINK *) first_thread->opt_info;
+    struct st_my_thread_var *thread;
+    do
+    {
+      /* fetch the successor first: the thread may be unlinked below */
+      thread= next_thread;
+      next_thread= thread->next;
+      /*
+         We notify about the event all threads that ask
+         for the same page as the first thread in the queue
+      */
+      if ((PAGECACHE_HASH_LINK *) thread->opt_info == hash_link)
+      {
+        KEYCACHE_DBUG_PRINT("link_block: signal", ("thread: %ld", thread->id));
+        pagecache_pthread_cond_signal(&thread->suspend);
+        wqueue_unlink_from_queue(&pagecache->waiting_for_block, thread);
+        /* each woken thread counts as one request for the block */
+        block->requests++;
+      }
+    }
+    while (thread != last_thread);
+    hash_link->block= block;
+    KEYCACHE_THREAD_TRACE("link_block: after signaling");
+#if defined(PAGECACHE_DEBUG)
+    KEYCACHE_DBUG_PRINT("link_block",
+        ("linked,unlinked block: %u status: %x #requests: %u #available: %u",
+         PCBLOCK_NUMBER(pagecache, block), block->status,
+         block->requests, pagecache->blocks_available));
+#endif
+    /* the block was handed over to the waiters, it is not put in the LRU */
+    return;
+  }
+#else /* THREAD */
+  KEYCACHE_DBUG_ASSERT(! (!hot && pagecache->waiting_for_block.last_thread));
+      /* Condition not transformed using DeMorgan, to keep the text identical */
+#endif /* THREAD */
+  /* insert after the last block of the chosen sub-chain */
+  ptr_ins= hot ? &pagecache->used_ins : &pagecache->used_last;
+  ins= *ptr_ins;
+  if (ins)
+  {
+    ins->next_used->prev_used= &block->next_used;
+    block->next_used= ins->next_used;
+    block->prev_used= &ins->next_used;
+    ins->next_used= block;
+    /* at_end: the new block becomes the last one of the sub-chain */
+    if (at_end)
+      *ptr_ins= block;
+  }
+  else
+  {
+    /* The LRU chain is empty */
+    pagecache->used_last= pagecache->used_ins= block->next_used= block;
+    block->prev_used= &block->next_used;
+  }
+  KEYCACHE_THREAD_TRACE("link_block");
+#if defined(PAGECACHE_DEBUG)
+  pagecache->blocks_available++;
+  KEYCACHE_DBUG_PRINT("link_block",
+      ("linked block: %u:%1u status: %x #requests: %u #available: %u",
+       PCBLOCK_NUMBER(pagecache, block), at_end, block->status,
+       block->requests, pagecache->blocks_available));
+  KEYCACHE_DBUG_ASSERT((ulong) pagecache->blocks_available <=
+                       pagecache->blocks_used);
+#endif
+}
+
+
+/*
+  Unlink a block from the LRU chain
+
+  SYNOPSIS
+    unlink_block()
+    pagecache            pointer to a page cache data structure
+    block                pointer to the block to unlink from the LRU chain
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    See NOTES for link_block
+*/
+
+static void unlink_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block)
+{
+  DBUG_ENTER("unlink_block");
+  DBUG_PRINT("unlink_block", ("unlink 0x%lx", (ulong)block));
+  if (block->next_used == block)
+    /* The list contains only one member */
+    pagecache->used_last= pagecache->used_ins= NULL;
+  else
+  {
+    block->next_used->prev_used= block->prev_used;
+    *block->prev_used= block->next_used;
+    /*
+      prev_used points at the next_used field of the previous block;
+      STRUCT_PTR turns it back into a pointer to that block, which takes
+      over as the end of the sub-chain the removed block was ending.
+    */
+    if (pagecache->used_last == block)
+      pagecache->used_last= STRUCT_PTR(PAGECACHE_BLOCK_LINK,
+                                       next_used, block->prev_used);
+    if (pagecache->used_ins == block)
+      pagecache->used_ins= STRUCT_PTR(PAGECACHE_BLOCK_LINK,
+                                      next_used, block->prev_used);
+  }
+  block->next_used= NULL;
+
+  KEYCACHE_THREAD_TRACE("unlink_block");
+#if defined(PAGECACHE_DEBUG)
+  KEYCACHE_DBUG_ASSERT(pagecache->blocks_available != 0);
+  pagecache->blocks_available--;
+  KEYCACHE_DBUG_PRINT("unlink_block",
+    ("unlinked block: 0x%lx (%u) status: %x #requests: %u #available: %u",
+     (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+     block->status,
+     block->requests, pagecache->blocks_available));
+  PCBLOCK_INFO(block);
+#endif
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Register requests for a block
+
+  SYNOPSIS
+    reg_requests()
+    pagecache            this page cache reference
+    block                the block we request reference
+    count                how many requests we register (it is 1 everywhere)
+
+  NOTE
+    Registration of request means we are going to use this block so we exclude
+    it from the LRU if it is first request
+*/
+static void reg_requests(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block,
+                         int count)
+{
+  DBUG_ENTER("reg_requests");
+  DBUG_PRINT("enter", ("block: 0x%lx (%u) status: %x reqs: %u",
+                       (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+                       block->status, block->requests));
+  PCBLOCK_INFO(block);
+  if (! block->requests)
+    /* First request for the block unlinks it */
+    unlink_block(pagecache, block);
+  block->requests+= count;
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Unregister request for a block
+  linking it to the LRU chain if it's the last request
+
+  SYNOPSIS
+    unreg_request()
+    pagecache            pointer to a page cache data structure
+    block                pointer to the block to link to the LRU chain
+    at_end               <-> to link the block at the end of the LRU chain
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    Every linking to the LRU chain decrements by one a special block
+    counter (if it's positive). If the at_end parameter is TRUE the block is
+    added either at the end of warm sub-chain or at the end of hot sub-chain.
+    It is added to the hot subchain if its counter is zero and number of
+    blocks in warm sub-chain is not less than some low limit (determined by
+    the division_limit parameter). Otherwise the block is added to the warm
+    sub-chain. If the at_end parameter is FALSE the block is always added
+    at beginning of the warm sub-chain.
+    Thus a warm block can be promoted to the hot sub-chain when its counter
+    becomes zero for the first time.
+    At the same time the block at the very beginning of the hot subchain
+    might be moved to the beginning of the warm subchain if it stays untouched
+    for a too long time (this time is determined by parameter age_threshold).
+*/
+
+static void unreg_request(PAGECACHE *pagecache,
+                          PAGECACHE_BLOCK_LINK *block, int at_end)
+{
+  DBUG_ENTER("unreg_request");
+  DBUG_PRINT("enter", ("block 0x%lx (%u) status: %x reqs: %u",
+                       (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+                       block->status, block->requests));
+  PCBLOCK_INFO(block);
+  DBUG_ASSERT(block->requests > 0);
+  if (! --block->requests)
+  {
+    /* Last request is gone: the block goes back into the LRU chain */
+    my_bool hot;
+    if (block->hits_left)
+      block->hits_left--;
+    hot= !block->hits_left && at_end &&
+      pagecache->warm_blocks > pagecache->min_warm_blocks;
+    if (hot)
+    {
+      if (block->temperature == PCBLOCK_WARM)
+        pagecache->warm_blocks--;
+      block->temperature= PCBLOCK_HOT;
+      KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu",
+                           pagecache->warm_blocks));
+    }
+    link_block(pagecache, block, hot, (my_bool)at_end);
+    /* pagecache->time is a logical clock, advanced once per relinked block */
+    block->last_hit_time= pagecache->time;
+    pagecache->time++;
+
+    /* from here on 'block' is the block pagecache->used_ins points to */
+    block= pagecache->used_ins;
+    /* Check if we should link a hot block to the warm block */
+    if (block && pagecache->time - block->last_hit_time >
+	pagecache->age_threshold)
+    {
+      unlink_block(pagecache, block);
+      link_block(pagecache, block, 0, 0);
+      if (block->temperature != PCBLOCK_WARM)
+      {
+        pagecache->warm_blocks++;
+        block->temperature= PCBLOCK_WARM;
+      }
+      KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu",
+                           pagecache->warm_blocks));
+    }
+  }
+  DBUG_VOID_RETURN;
+}
+
+/*
+  Remove a reader of the page in block
+
+  NOTES
+    Decrements the request counter of the block's hash link. When it drops
+    to zero and block->condvar is set (see wait_for_readers()), that
+    condition variable is signalled.
+*/
+
+static inline void remove_reader(PAGECACHE_BLOCK_LINK *block)
+{
+  DBUG_ENTER("remove_reader");
+  PCBLOCK_INFO(block);
+  DBUG_ASSERT(block->hash_link->requests > 0);
+#ifdef THREAD
+  if (! --block->hash_link->requests && block->condvar)
+    pagecache_pthread_cond_signal(block->condvar);
+#else
+  --block->hash_link->requests;
+#endif
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Wait until the last reader of the page in block
+  signals on its termination
+
+  NOTES
+    The wait is done on the calling thread's 'suspend' condition variable
+    together with pagecache->cache_lock; block->condvar is set only for
+    the duration of the wait.
+*/
+
+static inline void wait_for_readers(PAGECACHE *pagecache
+                                    __attribute__((unused)),
+                                    PAGECACHE_BLOCK_LINK *block)
+{
+#ifdef THREAD
+  struct st_my_thread_var *thread= my_thread_var;
+  while (block->hash_link->requests)
+  {
+    KEYCACHE_DBUG_PRINT("wait_for_readers: wait",
+                        ("suspend thread: %ld block: %u",
+                         thread->id, PCBLOCK_NUMBER(pagecache, block)));
+    /* publish our condvar so that remove_reader() can wake us up */
+    block->condvar= &thread->suspend;
+    pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock);
+    block->condvar= NULL;
+  }
+#else
+  KEYCACHE_DBUG_ASSERT(block->hash_link->requests == 0);
+#endif
+}
+
+
+/*
+  Add a hash link to a bucket in the hash_table
+
+  NOTES
+    The link is inserted at the head of the bucket chain *start.
+*/
+
+static inline void link_hash(PAGECACHE_HASH_LINK **start,
+                             PAGECACHE_HASH_LINK *hash_link)
+{
+  if (*start)
+    (*start)->prev= &hash_link->next;
+  hash_link->next= *start;
+  hash_link->prev= start;
+  *start= hash_link;
+}
+
+
+/*
+  Remove a hash link from the hash table
+*/
+
+static void unlink_hash(PAGECACHE *pagecache, PAGECACHE_HASH_LINK *hash_link)
+{
+  KEYCACHE_DBUG_PRINT("unlink_hash", ("fd: %u pos_ %lu #requests=%u",
+      (uint) hash_link->file.file, (ulong) hash_link->pageno,
+      hash_link->requests));
+  KEYCACHE_DBUG_ASSERT(hash_link->requests == 0);
+  if ((*hash_link->prev= hash_link->next))
+    hash_link->next->prev= hash_link->prev;
+  hash_link->block= NULL;
+#ifdef THREAD
+  if (pagecache->waiting_for_hash_link.last_thread)
+  {
+    /* Signal that a free hash link has appeared */
+    struct st_my_thread_var *last_thread=
+                               pagecache->waiting_for_hash_link.last_thread;
+    struct st_my_thread_var *first_thread= last_thread->next;
+    struct st_my_thread_var *next_thread= first_thread;
+    PAGECACHE_PAGE *first_page= (PAGECACHE_PAGE *) (first_thread->opt_info);
+    struct st_my_thread_var *thread;
+ + hash_link->file= first_page->file; + hash_link->pageno= first_page->pageno; + do + { + PAGECACHE_PAGE *page; + thread= next_thread; + page= (PAGECACHE_PAGE *) thread->opt_info; + next_thread= thread->next; + /* + We notify about the event all threads that ask + for the same page as the first thread in the queue + */ + if (page->file.file == hash_link->file.file && + page->pageno == hash_link->pageno) + { + KEYCACHE_DBUG_PRINT("unlink_hash: signal", ("thread %ld", thread->id)); + pagecache_pthread_cond_signal(&thread->suspend); + wqueue_unlink_from_queue(&pagecache->waiting_for_hash_link, thread); + } + } + while (thread != last_thread); + link_hash(&pagecache->hash_root[PAGECACHE_HASH(pagecache, + hash_link->file, + hash_link->pageno)], + hash_link); + return; + } +#else /* THREAD */ + KEYCACHE_DBUG_ASSERT(! (pagecache->waiting_for_hash_link.last_thread)); +#endif /* THREAD */ + hash_link->next= pagecache->free_hash_list; + pagecache->free_hash_list= hash_link; +} + + +/* + Get the hash link for the page if it is in the cache (do not put the + page in the cache if it is absent there) + + SYNOPSIS + get_present_hash_link() + pagecache Pagecache reference + file file ID + pageno page number in the file + start where to put pointer to found hash bucket (for + direct referring it) + + RETURN + found hashlink pointer +*/ + +static PAGECACHE_HASH_LINK *get_present_hash_link(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + PAGECACHE_HASH_LINK ***start) +{ + reg1 PAGECACHE_HASH_LINK *hash_link; +#if defined(PAGECACHE_DEBUG) + int cnt; +#endif + DBUG_ENTER("get_present_hash_link"); + + KEYCACHE_DBUG_PRINT("get_present_hash_link", ("fd: %u pos: %lu", + (uint) file->file, (ulong) pageno)); + + /* + Find the bucket in the hash table for the pair (file, pageno); + start contains the head of the bucket list, + hash_link points to the first member of the list + */ + hash_link= *(*start= &pagecache->hash_root[PAGECACHE_HASH(pagecache, + *file, 
pageno)]); +#if defined(PAGECACHE_DEBUG) + cnt= 0; +#endif + /* Look for an element for the pair (file, pageno) in the bucket chain */ + while (hash_link && + (hash_link->pageno != pageno || + hash_link->file.file != file->file)) + { + hash_link= hash_link->next; +#if defined(PAGECACHE_DEBUG) + cnt++; + if (! (cnt <= pagecache->hash_links_used)) + { + int i; + for (i=0, hash_link= **start ; + i < cnt ; i++, hash_link= hash_link->next) + { + KEYCACHE_DBUG_PRINT("get_present_hash_link", ("fd: %u pos: %lu", + (uint) hash_link->file.file, (ulong) hash_link->pageno)); + } + } + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->hash_links_used); +#endif + } + if (hash_link) + { + /* Register the request for the page */ + hash_link->requests++; + } + /* + As soon as the caller will release the page cache's lock, "hash_link" + will be potentially obsolete (unusable) information. + */ + DBUG_RETURN(hash_link); +} + + +/* + Get the hash link for a page +*/ + +static PAGECACHE_HASH_LINK *get_hash_link(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno) +{ + reg1 PAGECACHE_HASH_LINK *hash_link; + PAGECACHE_HASH_LINK **start; + + KEYCACHE_DBUG_PRINT("get_hash_link", ("fd: %u pos: %lu", + (uint) file->file, (ulong) pageno)); + +restart: + /* try to find the page in the cache */ + hash_link= get_present_hash_link(pagecache, file, pageno, + &start); + if (!hash_link) + { + /* There is no hash link in the hash table for the pair (file, pageno) */ + if (pagecache->free_hash_list) + { + hash_link= pagecache->free_hash_list; + pagecache->free_hash_list= hash_link->next; + } + else if (pagecache->hash_links_used < pagecache->hash_links) + { + hash_link= &pagecache->hash_link_root[pagecache->hash_links_used++]; + } + else + { +#ifdef THREAD + /* Wait for a free hash link */ + struct st_my_thread_var *thread= my_thread_var; + PAGECACHE_PAGE page; + KEYCACHE_DBUG_PRINT("get_hash_link", ("waiting")); + page.file= *file; + page.pageno= pageno; + thread->opt_info= (void *) &page; 
+ wqueue_link_into_queue(&pagecache->waiting_for_hash_link, thread); + KEYCACHE_DBUG_PRINT("get_hash_link: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + thread->opt_info= NULL; +#else + KEYCACHE_DBUG_ASSERT(0); +#endif + DBUG_PRINT("info", ("restarting...")); + goto restart; + } + hash_link->file= *file; + hash_link->pageno= pageno; + link_hash(start, hash_link); + /* Register the request for the page */ + hash_link->requests++; + } + + return hash_link; +} + + +/* + Get a block for the file page requested by a pagecache read/write operation; + If the page is not in the cache return a free block, if there is none + return the lru block after saving its buffer if the page is dirty. + + SYNOPSIS + + find_block() + pagecache pointer to a page cache data structure + file handler for the file to read page from + pageno number of the page in the file + init_hits_left how initialize the block counter for the page + wrmode <-> get for writing + reg_req Register request to thye page + page_st out {PAGE_READ,PAGE_TO_BE_READ,PAGE_WAIT_TO_BE_READ} + + RETURN VALUE + Pointer to the found block if successful, 0 - otherwise + + NOTES. + For the page from file positioned at pageno the function checks whether + the page is in the key cache specified by the first parameter. + If this is the case it immediately returns the block. + If not, the function first chooses a block for this page. If there is + no not used blocks in the key cache yet, the function takes the block + at the very beginning of the warm sub-chain. It saves the page in that + block if it's dirty before returning the pointer to it. + The function returns in the page_st parameter the following values: + PAGE_READ - if page already in the block, + PAGE_TO_BE_READ - if it is to be read yet by the current thread + WAIT_TO_BE_READ - if it is to be read by another thread + If an error occurs THE PCBLOCK_ERROR bit is set in the block status. 
+ It might happen that there are no blocks in LRU chain (in warm part) - + all blocks are unlinked for some read/write operations. Then the function + waits until first of this operations links any block back. +*/ + +static PAGECACHE_BLOCK_LINK *find_block(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + int init_hits_left, + my_bool wrmode, + my_bool reg_req, + int *page_st) +{ + PAGECACHE_HASH_LINK *hash_link; + PAGECACHE_BLOCK_LINK *block; + int error= 0; + int page_status; + + DBUG_ENTER("find_block"); + KEYCACHE_THREAD_TRACE("find_block:begin"); + DBUG_PRINT("enter", ("fd: %d pos: %lu wrmode: %d", + file->file, (ulong) pageno, wrmode)); + KEYCACHE_DBUG_PRINT("find_block", ("fd: %d pos: %lu wrmode: %d", + file->file, (ulong) pageno, + wrmode)); +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, "start of find_block", 0);); +#endif + +restart: + /* Find the hash link for the requested page (file, pageno) */ + hash_link= get_hash_link(pagecache, file, pageno); + + page_status= -1; + if ((block= hash_link->block) && + block->hash_link == hash_link && (block->status & PCBLOCK_READ)) + page_status= PAGE_READ; + + if (wrmode && pagecache->resize_in_flush) + { + /* This is a write request during the flush phase of a resize operation */ + + if (page_status != PAGE_READ) + { + /* We don't need the page in the cache: we are going to write on disk */ + DBUG_ASSERT(hash_link->requests > 0); + hash_link->requests--; + unlink_hash(pagecache, hash_link); + return 0; + } + if (!(block->status & PCBLOCK_IN_FLUSH)) + { + DBUG_ASSERT(hash_link->requests > 0); + hash_link->requests--; + /* + Remove block to invalidate the page in the block buffer + as we are going to write directly on disk. + Although we have an exclusive lock for the updated key part + the control can be yielded by the current thread as we might + have unfinished readers of other key parts in the block + buffer. 
Still we are guaranteed not to have any readers + of the key part we are writing into until the block is + removed from the cache as we set the PCBLOCK_REASSIGNED + flag (see the code below that handles reading requests). + */ + free_block(pagecache, block); + return 0; + } + /* Wait until the page is flushed on disk */ + DBUG_ASSERT(hash_link->requests > 0); + hash_link->requests--; + { +#ifdef THREAD + struct st_my_thread_var *thread= my_thread_var; + wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread); + do + { + KEYCACHE_DBUG_PRINT("find_block: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while(thread->next); +#else + KEYCACHE_DBUG_ASSERT(0); + /* + Given the use of "resize_in_flush", it seems impossible + that this whole branch is ever entered in single-threaded case + because "(wrmode && pagecache->resize_in_flush)" cannot be true. + TODO: Check this, and then put the whole branch into the + "#ifdef THREAD" guard. 
+ */ +#endif + } + /* Invalidate page in the block if it has not been done yet */ + if (block->status) + free_block(pagecache, block); + return 0; + } + + if (page_status == PAGE_READ && + (block->status & (PCBLOCK_IN_SWITCH | PCBLOCK_REASSIGNED))) + { + /* This is a request for a page to be removed from cache */ + + KEYCACHE_DBUG_PRINT("find_block", + ("request for old page in block: %u " + "wrmode: %d block->status: %d", + PCBLOCK_NUMBER(pagecache, block), wrmode, + block->status)); + /* + Only reading requests can proceed until the old dirty page is flushed, + all others are to be suspended, then resubmitted + */ + if (!wrmode && !(block->status & PCBLOCK_REASSIGNED)) + { + if (reg_req) + reg_requests(pagecache, block, 1); + } + else + { + DBUG_ASSERT(hash_link->requests > 0); + hash_link->requests--; + KEYCACHE_DBUG_PRINT("find_block", + ("request waiting for old page to be saved")); + { +#ifdef THREAD + struct st_my_thread_var *thread= my_thread_var; + /* Put the request into the queue of those waiting for the old page */ + wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread); + /* Wait until the request can be resubmitted */ + do + { + KEYCACHE_DBUG_PRINT("find_block: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while(thread->next); +#else + KEYCACHE_DBUG_ASSERT(0); + /* No parallel requests in single-threaded case */ +#endif + } + KEYCACHE_DBUG_PRINT("find_block", + ("request for old page resubmitted")); + DBUG_PRINT("info", ("restarting...")); + /* Resubmit the request */ + goto restart; + } + block->status&= ~PCBLOCK_IN_SWITCH; + } + else + { + /* This is a request for a new page or for a page not to be removed */ + if (! block) + { + /* No block is assigned for the page yet */ + if (pagecache->blocks_unused) + { + if (pagecache->free_block_list) + { + /* There is a block in the free list. 
*/ + block= pagecache->free_block_list; + pagecache->free_block_list= block->next_used; + block->next_used= NULL; + } + else + { + /* There are some never used blocks, take first of them */ + block= &pagecache->block_root[pagecache->blocks_used]; + block->buffer= ADD_TO_PTR(pagecache->block_mem, + ((ulong) pagecache->blocks_used* + pagecache->block_size), + uchar*); + pagecache->blocks_used++; + } + pagecache->blocks_unused--; + DBUG_ASSERT(block->wlocks == 0); + DBUG_ASSERT(block->pins == 0); + block->status= 0; +#ifndef DBUG_OFF + block->type= PAGECACHE_EMPTY_PAGE; +#endif + block->requests= 1; + block->temperature= PCBLOCK_COLD; + block->hits_left= init_hits_left; + block->last_hit_time= 0; + block->rec_lsn= LSN_MAX; + link_to_file_list(pagecache, block, file, 0); + block->hash_link= hash_link; + hash_link->block= block; + page_status= PAGE_TO_BE_READ; + DBUG_PRINT("info", ("page to be read set for page 0x%lx", + (ulong)block)); + KEYCACHE_DBUG_PRINT("find_block", + ("got free or never used block %u", + PCBLOCK_NUMBER(pagecache, block))); + } + else + { + /* There are no never used blocks, use a block from the LRU chain */ + + /* + Wait until a new block is added to the LRU chain; + several threads might wait here for the same page, + all of them must get the same block + */ + +#ifdef THREAD + if (! pagecache->used_last) + { + struct st_my_thread_var *thread= my_thread_var; + thread->opt_info= (void *) hash_link; + wqueue_link_into_queue(&pagecache->waiting_for_block, thread); + do + { + KEYCACHE_DBUG_PRINT("find_block: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); + thread->opt_info= NULL; + } +#else + KEYCACHE_DBUG_ASSERT(pagecache->used_last); +#endif + block= hash_link->block; + if (! 
block) + { + /* + Take the first block from the LRU chain + unlinking it from the chain + */ + block= pagecache->used_last->next_used; + block->hits_left= init_hits_left; + block->last_hit_time= 0; + if (reg_req) + reg_requests(pagecache, block, 1); + hash_link->block= block; + } + PCBLOCK_INFO(block); + DBUG_ASSERT(block->wlocks == 0); + DBUG_ASSERT(block->pins == 0); + + if (block->hash_link != hash_link && + ! (block->status & PCBLOCK_IN_SWITCH) ) + { + /* this is a primary request for a new page */ + DBUG_ASSERT(block->wlocks == 0); + DBUG_ASSERT(block->pins == 0); + block->status|= PCBLOCK_IN_SWITCH; + + KEYCACHE_DBUG_PRINT("find_block", + ("got block %u for new page", + PCBLOCK_NUMBER(pagecache, block))); + + if (block->status & PCBLOCK_CHANGED) + { + /* The block contains a dirty page - push it out of the cache */ + + KEYCACHE_DBUG_PRINT("find_block", ("block is dirty")); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + /* + The call is thread safe because only the current + thread might change the block->hash_link value + */ + DBUG_ASSERT(block->pins == 0); + error= pagecache_fwrite(pagecache, + &block->hash_link->file, + block->buffer, + block->hash_link->pageno, + block->type, + MYF(MY_NABP | MY_WAIT_IF_FULL)); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + pagecache->global_cache_write++; + } + + block->status|= PCBLOCK_REASSIGNED; + if (block->hash_link) + { + /* + Wait until all pending read requests + for this page are executed + (we could have avoided this waiting, if we had read + a page in the cache in a sweep, without yielding control) + */ + wait_for_readers(pagecache, block); + + /* Remove the hash link for this page from the hash table */ + unlink_hash(pagecache, block->hash_link); + /* All pending requests for this page must be resubmitted */ +#ifdef THREAD + if (block->wqueue[COND_FOR_SAVED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]); +#endif + } + link_to_file_list(pagecache, block, file, + 
(my_bool)(block->hash_link ? 1 : 0)); + PCBLOCK_INFO(block); + block->status= error? PCBLOCK_ERROR : 0; +#ifndef DBUG_OFF + block->type= PAGECACHE_EMPTY_PAGE; +#endif + block->hash_link= hash_link; + page_status= PAGE_TO_BE_READ; + DBUG_PRINT("info", ("page to be read set for page 0x%lx", + (ulong)block)); + + KEYCACHE_DBUG_ASSERT(block->hash_link->block == block); + KEYCACHE_DBUG_ASSERT(hash_link->block->hash_link == hash_link); + } + else + { + /* This is for secondary requests for a new page only */ + KEYCACHE_DBUG_PRINT("find_block", + ("block->hash_link: %p hash_link: %p " + "block->status: %u", block->hash_link, + hash_link, block->status )); + page_status= (((block->hash_link == hash_link) && + (block->status & PCBLOCK_READ)) ? + PAGE_READ : PAGE_WAIT_TO_BE_READ); + } + } + pagecache->global_cache_read++; + } + else + { + if (reg_req) + reg_requests(pagecache, block, 1); + KEYCACHE_DBUG_PRINT("find_block", + ("block->hash_link: %p hash_link: %p " + "block->status: %u", block->hash_link, + hash_link, block->status )); + page_status= (((block->hash_link == hash_link) && + (block->status & PCBLOCK_READ)) ? 
+ PAGE_READ : PAGE_WAIT_TO_BE_READ); + } + } + + KEYCACHE_DBUG_ASSERT(page_status != -1); + *page_st= page_status; + DBUG_PRINT("info", + ("block: 0x%lx fd: %u pos: %lu block->status: %u page_status: %u", + (ulong) block, (uint) file->file, + (ulong) pageno, block->status, (uint) page_status)); + KEYCACHE_DBUG_PRINT("find_block", + ("block: 0x%lx fd: %d pos: %lu block->status: %u page_status: %d", + (ulong) block, + file->file, (ulong) pageno, block->status, + page_status)); + +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, "end of find_block",0);); +#endif + KEYCACHE_THREAD_TRACE("find_block:end"); + DBUG_RETURN(block); +} + + +static void add_pin(PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("add_pin"); + DBUG_PRINT("enter", ("block: 0x%lx pins: %u", + (ulong) block, + block->pins)); + PCBLOCK_INFO(block); + block->pins++; +#ifndef DBUG_OFF + { + PAGECACHE_PIN_INFO *info= + (PAGECACHE_PIN_INFO *)my_malloc(sizeof(PAGECACHE_PIN_INFO), MYF(0)); + info->thread= my_thread_var; + info_link(&block->pin_list, info); + } +#endif + DBUG_VOID_RETURN; +} + +static void remove_pin(PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("remove_pin"); + DBUG_PRINT("enter", ("block: 0x%lx pins: %u", + (ulong) block, + block->pins)); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->pins > 0); + block->pins--; +#ifndef DBUG_OFF + { + PAGECACHE_PIN_INFO *info= info_find(block->pin_list, my_thread_var); + DBUG_ASSERT(info != 0); + info_unlink(info); + my_free((uchar*) info, MYF(0)); + } +#endif + DBUG_VOID_RETURN; +} +#ifndef DBUG_OFF +static void info_add_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl) +{ + PAGECACHE_LOCK_INFO *info= + (PAGECACHE_LOCK_INFO *)my_malloc(sizeof(PAGECACHE_LOCK_INFO), MYF(0)); + info->thread= my_thread_var; + info->write_lock= wl; + info_link((PAGECACHE_PIN_INFO **)&block->lock_list, + (PAGECACHE_PIN_INFO *)info); +} +static void info_remove_lock(PAGECACHE_BLOCK_LINK *block) +{ + PAGECACHE_LOCK_INFO *info= + 
(PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list, + my_thread_var); + DBUG_ASSERT(info != 0); + info_unlink((PAGECACHE_PIN_INFO *)info); + my_free((uchar*)info, MYF(0)); +} +static void info_change_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl) +{ + PAGECACHE_LOCK_INFO *info= + (PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list, + my_thread_var); + DBUG_ASSERT(info != 0); + DBUG_ASSERT(info->write_lock != wl); + info->write_lock= wl; +} +#else +#define info_add_lock(B,W) +#define info_remove_lock(B) +#define info_change_lock(B,W) +#endif + +/* + Put on the block write lock + + SYNOPSIS + get_wrlock() + pagecache pointer to a page cache data structure + block the block to work with + user_file Unique handler per handler file. Used to check if + we request many write locks withing the same + statement + + RETURN + 0 - OK + 1 - Can't lock this block, need retry +*/ + +static my_bool get_wrlock(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + PAGECACHE_FILE *user_file) +{ + PAGECACHE_FILE file= block->hash_link->file; + pgcache_page_no_t pageno= block->hash_link->pageno; + DBUG_ENTER("get_wrlock"); + DBUG_PRINT("info", ("the block 0x%lx " + "files %d(%d) pages %d(%d)", + (ulong)block, + file.file, block->hash_link->file.file, + pageno, block->hash_link->pageno)); + PCBLOCK_INFO(block); + while (block->wlocks && block->write_locker != user_file) + { + /* Lock failed we will wait */ +#ifdef THREAD + struct st_my_thread_var *thread= my_thread_var; + DBUG_PRINT("info", ("fail to lock, waiting... 
0x%lx", (ulong)block)); + wqueue_add_to_queue(&block->wqueue[COND_FOR_WRLOCK], thread); + dec_counter_for_resize_op(pagecache); + do + { + KEYCACHE_DBUG_PRINT("get_wrlock: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while(thread->next); +#else + DBUG_ASSERT(0); +#endif + PCBLOCK_INFO(block); + if ((block->status & (PCBLOCK_REASSIGNED | PCBLOCK_IN_SWITCH)) || + file.file != block->hash_link->file.file || + pageno != block->hash_link->pageno) + { + DBUG_PRINT("info", ("the block 0x%lx changed => need retry" + "status %x files %d != %d or pages %d !=%d", + (ulong)block, block->status, + file.file, block->hash_link->file.file, + pageno, block->hash_link->pageno)); + DBUG_RETURN(1); + } + } + /* we are doing it by global cache mutex protection, so it is OK */ + block->wlocks++; + block->write_locker= user_file; + DBUG_PRINT("info", ("WR lock set, block 0x%lx", (ulong)block)); + DBUG_RETURN(0); +} + + +/* + Remove write lock from the block + + SYNOPSIS + release_wrlock() + pagecache pointer to a page cache data structure + block the block to work with + + RETURN + 0 - OK +*/ + +static void release_wrlock(PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("release_wrlock"); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->wlocks > 0); + DBUG_ASSERT(block->pins > 0); + block->wlocks--; + if (block->wlocks > 0) + DBUG_VOID_RETURN; /* Multiple write locked */ + DBUG_PRINT("info", ("WR lock reset, block 0x%lx", (ulong)block)); +#ifdef THREAD + /* release all threads waiting for write lock */ + if (block->wqueue[COND_FOR_WRLOCK].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_WRLOCK]); +#endif + PCBLOCK_INFO(block); + DBUG_VOID_RETURN; +} + + +/* + Try to lock/unlock and pin/unpin the block + + SYNOPSIS + make_lock_and_pin() + pagecache pointer to a page cache data structure + block the block to work with + lock lock change mode + pin pinchange mode + file File handler requesting pin + + RETURN 
+ 0 - OK + 1 - Try to lock the block failed +*/ + +static my_bool make_lock_and_pin(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + PAGECACHE_FILE *file) +{ + DBUG_ENTER("make_lock_and_pin"); + + DBUG_PRINT("enter", ("block: 0x%lx", (ulong)block)); +#ifndef DBUG_OFF + if (block) + { + DBUG_PRINT("enter", ("block: 0x%lx (%u) wrlocks: %u pins: %u lock: %s pin: %s", + (ulong)block, PCBLOCK_NUMBER(pagecache, block), + block->wlocks, + block->pins, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + PCBLOCK_INFO(block); + } +#endif + + switch (lock) { + case PAGECACHE_LOCK_WRITE: /* free -> write */ + /* Writelock and pin the buffer */ + if (get_wrlock(pagecache, block, file)) + { + /* can't lock => need retry */ + goto retry; + } + + /* The cache is locked so nothing afraid of */ + add_pin(block); + info_add_lock(block, 1); + break; + case PAGECACHE_LOCK_WRITE_TO_READ: /* write -> read */ + case PAGECACHE_LOCK_WRITE_UNLOCK: /* write -> free */ + /* + Removes write lock and puts read lock (which is nothing in our + implementation) + */ + release_wrlock(block); + /* fall through */ + case PAGECACHE_LOCK_READ_UNLOCK: /* read -> free */ + case PAGECACHE_LOCK_LEFT_READLOCKED: /* read -> read */ + if (pin == PAGECACHE_UNPIN) + { + remove_pin(block); + } + if (lock == PAGECACHE_LOCK_WRITE_TO_READ) + { + info_change_lock(block, 0); + } + else if (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_READ_UNLOCK) + { + info_remove_lock(block); + } + break; + case PAGECACHE_LOCK_READ: /* free -> read */ + if (pin == PAGECACHE_PIN) + { + /* The cache is locked so nothing afraid off */ + add_pin(block); + } + info_add_lock(block, 0); + break; + case PAGECACHE_LOCK_LEFT_UNLOCKED: /* free -> free */ + case PAGECACHE_LOCK_LEFT_WRITELOCKED: /* write -> write */ + break; /* do nothing */ + default: + DBUG_ASSERT(0); /* Never should happened */ + } + +#ifndef DBUG_OFF + if (block) + 
PCBLOCK_INFO(block); +#endif + DBUG_RETURN(0); +retry: + DBUG_PRINT("INFO", ("Retry block 0x%lx", (ulong)block)); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->hash_link->requests > 0); + block->hash_link->requests--; + DBUG_ASSERT(block->requests > 0); + unreg_request(pagecache, block, 1); + PCBLOCK_INFO(block); + DBUG_RETURN(1); + +} + + +/* + Read into a key cache block buffer from disk. + + SYNOPSIS + + read_block() + pagecache pointer to a page cache data structure + block block to which buffer the data is to be read + primary <-> the current thread will read the data + validator validator of read from the disk data + validator_data pointer to the data need by the validator + + RETURN VALUE + None + + NOTES. + The function either reads a page data from file to the block buffer, + or waits until another thread reads it. What page to read is determined + by a block parameter - reference to a hash link for this page. + If an error occurs THE PCBLOCK_ERROR bit is set in the block status. +*/ + +static void read_block(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + my_bool primary, + pagecache_disk_read_validator validator, + uchar* validator_data) +{ + uint got_length; + + /* On entry cache_lock is locked */ + + DBUG_ENTER("read_block"); + if (primary) + { + /* + This code is executed only by threads + that submitted primary requests + */ + + DBUG_PRINT("read_block", + ("page to be read by primary request")); + + /* Page is not in buffer yet, is to be read from disk */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + /* + Here other threads may step in and register as secondary readers. + They will register in block->wqueue[COND_FOR_REQUESTED]. 
+ */ + got_length= pagecache_fread(pagecache, &block->hash_link->file, + block->buffer, + block->hash_link->pageno, MYF(0)); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (got_length < pagecache->block_size) + block->status|= PCBLOCK_ERROR; + else + block->status= PCBLOCK_READ; + + if (validator != NULL && + (*validator)(block->buffer, validator_data)) + block->status|= PCBLOCK_ERROR; + + DBUG_PRINT("read_block", + ("primary request: new page in cache")); + /* Signal that all pending requests for this page now can be processed */ +#ifdef THREAD + if (block->wqueue[COND_FOR_REQUESTED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]); +#endif + } + else + { + /* + This code is executed only by threads + that submitted secondary requests + */ + DBUG_PRINT("read_block", + ("secondary request waiting for new page to be read")); + { +#ifdef THREAD + struct st_my_thread_var *thread= my_thread_var; + /* Put the request into a queue and wait until it can be processed */ + wqueue_add_to_queue(&block->wqueue[COND_FOR_REQUESTED], thread); + do + { + DBUG_PRINT("read_block: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); +#else + KEYCACHE_DBUG_ASSERT(0); + /* No parallel requests in single-threaded case */ +#endif + } + DBUG_PRINT("read_block", + ("secondary request: new page in cache")); + } + DBUG_VOID_RETURN; +} + + +/** + @brief Set LSN on the page to the given one if the given LSN is bigger + + @param pagecache pointer to a page cache data structure + @param lsn LSN to set + @param block block to check and set +*/ + +static void check_and_set_lsn(PAGECACHE *pagecache, + LSN lsn, PAGECACHE_BLOCK_LINK *block) +{ + LSN old; + DBUG_ENTER("check_and_set_lsn"); + DBUG_ASSERT(block->type == PAGECACHE_LSN_PAGE); + old= lsn_korr(block->buffer + PAGE_LSN_OFFSET); + DBUG_PRINT("info", ("old lsn: (%lu, 0x%lx) new lsn: (%lu, 0x%lx)", + 
LSN_IN_PARTS(old), LSN_IN_PARTS(lsn))); + if (cmp_translog_addr(lsn, old) > 0) + { + + DBUG_ASSERT(block->type != PAGECACHE_READ_UNKNOWN_PAGE); + lsn_store(block->buffer + PAGE_LSN_OFFSET, lsn); + /* we stored LSN in page so we dirtied it */ + if (!(block->status & PCBLOCK_CHANGED)) + link_to_changed_list(pagecache, block); + } + DBUG_VOID_RETURN; +} + + +/* + Unlock/unpin page and put LSN stamp if it need + + SYNOPSIS + pagecache_unlock() + pagecache pointer to a page cache data structure + file handler for the file for the block of data to be read + pageno number of the block of data in the file + lock lock change + pin pin page + first_REDO_LSN_for_page do not set it if it is zero + lsn if it is not LSN_IMPOSSIBLE (0) and it + is bigger then LSN on the page it will be written on + the page + + NOTE + Pininig uses requests registration mechanism it works following way: + | beginnig | ending | + | of func. | of func. | + ----------------------------+-------------+---------------+ + PAGECACHE_PIN_LEFT_PINNED | - | - | + PAGECACHE_PIN_LEFT_UNPINNED | reg request | unreg request | + PAGECACHE_PIN | reg request | - | + PAGECACHE_UNPIN | - | unreg request | + + +*/ + +void pagecache_unlock(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn) +{ + PAGECACHE_BLOCK_LINK *block; + int page_st; + DBUG_ENTER("pagecache_unlock"); + DBUG_PRINT("enter", ("fd: %u page: %lu %s %s", + (uint) file->file, (ulong) pageno, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + /* we do not allow any lock/pin increasing here */ + DBUG_ASSERT(pin != PAGECACHE_PIN); + DBUG_ASSERT(lock != PAGECACHE_LOCK_READ); + DBUG_ASSERT(lock != PAGECACHE_LOCK_WRITE); + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* + As soon as we keep lock cache can be used, and we have lock because want + to unlock. 
+ */ + DBUG_ASSERT(pagecache->can_be_used); + + inc_counter_for_resize_op(pagecache); + /* See NOTE for pagecache_unlock about registering requests */ + block= find_block(pagecache, file, pageno, 0, 0, + test(pin == PAGECACHE_PIN_LEFT_UNPINNED), &page_st); + PCBLOCK_INFO(block); + DBUG_ASSERT(block != 0 && page_st == PAGE_READ); + if (first_REDO_LSN_for_page) + { + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK); + DBUG_ASSERT(pin == PAGECACHE_UNPIN); + if (block->rec_lsn == LSN_MAX) + block->rec_lsn= first_REDO_LSN_for_page; + else + DBUG_ASSERT(cmp_translog_addr(block->rec_lsn, + first_REDO_LSN_for_page) <= 0); + + } + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + + if (make_lock_and_pin(pagecache, block, lock, pin, file)) + { + DBUG_ASSERT(0); /* should not happend */ + } + + remove_reader(block); + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. + */ + if (pin != PAGECACHE_PIN_LEFT_PINNED) + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + + +/* + Unpin page + + SYNOPSIS + pagecache_unpin() + pagecache pointer to a page cache data structure + file handler for the file for the block of data to be read + pageno number of the block of data in the file + lsn if it is not LSN_IMPOSSIBLE (0) and it + is bigger then LSN on the page it will be written on + the page +*/ + +void pagecache_unpin(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + LSN lsn) +{ + PAGECACHE_BLOCK_LINK *block; + int page_st; + DBUG_ENTER("pagecache_unpin"); + DBUG_PRINT("enter", ("fd: %u page: %lu", + (uint) file->file, (ulong) pageno)); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* + As soon as we keep lock cache can be used, and we have lock bacause want + aunlock. 
+ */ + DBUG_ASSERT(pagecache->can_be_used); + + inc_counter_for_resize_op(pagecache); + /* See NOTE for pagecache_unlock about registering requests */ + block= find_block(pagecache, file, pageno, 0, 0, 0, &page_st); + DBUG_ASSERT(block != 0); + DBUG_ASSERT(page_st == PAGE_READ); + + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + + /* + we can just unpin only with keeping read lock because: + a) we can't pin without any lock + b) we can't unpin keeping write lock + */ + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_LEFT_READLOCKED, + PAGECACHE_UNPIN, file)) + DBUG_ASSERT(0); /* should not happend */ + + remove_reader(block); + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests + */ + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + + +/* + Unlock/unpin page and put LSN stamp if it need + (uses direct block/page pointer) + + SYNOPSIS + pagecache_unlock_by_link() + pagecache pointer to a page cache data structure + link direct link to page (returned by read or write) + lock lock change + pin pin page + first_REDO_LSN_for_page do not set it if it is LSN_IMPOSSIBLE (0) + lsn if it is not LSN_IMPOSSIBLE and it is bigger then + LSN on the page it will be written on the page +*/ + +void pagecache_unlock_by_link(PAGECACHE *pagecache, + PAGECACHE_PAGE_LINK *link, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn) +{ + PAGECACHE_BLOCK_LINK *block= (PAGECACHE_BLOCK_LINK *)link; + DBUG_ENTER("pagecache_unlock_by_link"); + DBUG_PRINT("enter", ("block: 0x%lx fd: %u page: %lu %s %s", + (ulong) block, + (uint) block->hash_link->file.file, + (ulong) block->hash_link->pageno, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + /* + We 
do not allow any lock/pin increasing here and page can't be + unpinned because we use direct link. + */ + DBUG_ASSERT(pin != PAGECACHE_PIN); + DBUG_ASSERT(pin != PAGECACHE_PIN_LEFT_UNPINNED); + DBUG_ASSERT(lock != PAGECACHE_LOCK_READ); + DBUG_ASSERT(lock != PAGECACHE_LOCK_WRITE); + if (pin == PAGECACHE_PIN_LEFT_UNPINNED && + lock == PAGECACHE_LOCK_READ_UNLOCK) + { + /* block do not need here so we do not provide it */ + if (make_lock_and_pin(pagecache, 0, lock, pin, 0)) + DBUG_ASSERT(0); /* should not happend */ + DBUG_VOID_RETURN; + } + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* + As soon as we keep lock cache can be used, and we have lock because want + unlock. + */ + DBUG_ASSERT(pagecache->can_be_used); + + inc_counter_for_resize_op(pagecache); + if (first_REDO_LSN_for_page != LSN_IMPOSSIBLE) + { + /* + LOCK_READ_UNLOCK is ok here as the page may have first locked + with WRITE lock that was temporarly converted to READ lock before + it's unpinned + */ + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_READ_UNLOCK); + DBUG_ASSERT(pin == PAGECACHE_UNPIN); + if (block->rec_lsn == LSN_MAX) + block->rec_lsn= first_REDO_LSN_for_page; + else + DBUG_ASSERT(cmp_translog_addr(block->rec_lsn, + first_REDO_LSN_for_page) <= 0); + } + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + + if (make_lock_and_pin(pagecache, block, lock, pin, 0)) + DBUG_ASSERT(0); /* should not happend */ + + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. 
+ */ + if (pin != PAGECACHE_PIN_LEFT_PINNED) + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + + +/* + Unpin page + (uses direct block/page pointer) + + SYNOPSIS + pagecache_unpin_by_link() + pagecache pointer to a page cache data structure + link direct link to page (returned by read or write) + lsn if it is not LSN_IMPOSSIBLE (0) and it + is bigger then LSN on the page it will be written on + the page +*/ + +void pagecache_unpin_by_link(PAGECACHE *pagecache, + PAGECACHE_PAGE_LINK *link, + LSN lsn) +{ + PAGECACHE_BLOCK_LINK *block= (PAGECACHE_BLOCK_LINK *)link; + DBUG_ENTER("pagecache_unpin_by_link"); + DBUG_PRINT("enter", ("block: 0x%lx fd: %u page: %lu", + (ulong) block, + (uint) block->hash_link->file.file, + (ulong) block->hash_link->pageno)); + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* + As soon as we keep lock cache can be used, and we have lock because want + unlock. + */ + DBUG_ASSERT(pagecache->can_be_used); + + inc_counter_for_resize_op(pagecache); + + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + + /* + We can just unpin only with keeping read lock because: + a) we can't pin without any lock + b) we can't unpin keeping write lock + */ + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_LEFT_READLOCKED, + PAGECACHE_UNPIN, 0)) + DBUG_ASSERT(0); /* should not happend */ + + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. 
+ */ + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + + +/* + Read a block of data from a cached file into a buffer; + + SYNOPSIS + pagecache_valid_read() + pagecache pointer to a page cache data structure + file handler for the file for the block of data to be read + pageno number of the block of data in the file + level determines the weight of the data + buff buffer to where the data must be placed + type type of the page + lock lock change + link link to the page if we pin it + validator validator of read from the disk data + validator_data pointer to the data need by the validator + + RETURN VALUE + Returns address from where the data is placed if successful, 0 - otherwise. + + Pin will be chosen according to lock parameter (see lock_to_pin) +*/ +static enum pagecache_page_pin lock_to_pin[]= +{ + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_UNLOCKED*/, + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_READLOCKED*/, + PAGECACHE_PIN_LEFT_PINNED /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/, + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ*/, + PAGECACHE_PIN /*PAGECACHE_LOCK_WRITE*/, + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ_UNLOCK*/, + PAGECACHE_UNPIN /*PAGECACHE_LOCK_WRITE_UNLOCK*/, + PAGECACHE_UNPIN /*PAGECACHE_LOCK_WRITE_TO_READ*/ +}; + +uchar *pagecache_valid_read(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint level, + uchar *buff, + enum pagecache_page_type type, + enum pagecache_page_lock lock, + PAGECACHE_PAGE_LINK *link, + pagecache_disk_read_validator validator, + uchar* validator_data) +{ + int error= 0; + enum pagecache_page_pin pin= lock_to_pin[lock]; + PAGECACHE_PAGE_LINK fake_link; + DBUG_ENTER("pagecache_valid_read"); + DBUG_PRINT("enter", ("fd: %u page: %lu level: %u t:%s %s %s", + (uint) file->file, (ulong) pageno, level, + page_cache_page_type_str[type], + 
page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + + if (!link) + link= &fake_link; + else + *link= 0; + +restart: + + if (pagecache->can_be_used) + { + /* Key cache is used */ + PAGECACHE_BLOCK_LINK *block; + uint status; + int page_st; + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (!pagecache->can_be_used) + { + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + goto no_key_cache; + } + + inc_counter_for_resize_op(pagecache); + pagecache->global_cache_r_requests++; + /* See NOTE for pagecache_unlock about registering requests. */ + block= find_block(pagecache, file, pageno, level, + test(lock == PAGECACHE_LOCK_WRITE), + test((pin == PAGECACHE_PIN_LEFT_UNPINNED) || + (pin == PAGECACHE_PIN)), + &page_st); + DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE || + block->type == type || + type == PAGECACHE_LSN_PAGE || + type == PAGECACHE_READ_UNKNOWN_PAGE || + block->type == PAGECACHE_READ_UNKNOWN_PAGE); + if (type != PAGECACHE_READ_UNKNOWN_PAGE || + block->type == PAGECACHE_EMPTY_PAGE) + block->type= type; + if (((block->status & PCBLOCK_ERROR) == 0) && (page_st != PAGE_READ)) + { + DBUG_PRINT("info", ("read block 0x%lx", (ulong)block)); + /* The requested page is to be read into the block buffer */ + read_block(pagecache, block, + (my_bool)(page_st == PAGE_TO_BE_READ), + validator, validator_data); + DBUG_PRINT("info", ("read is done")); + } + if (make_lock_and_pin(pagecache, block, lock, pin, file)) + { + /* + We failed to write lock the block, cache is unlocked, + we will try to get the block again. + */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_PRINT("info", ("restarting...")); + goto restart; + } + + if (! 
((status= block->status) & PCBLOCK_ERROR)) + { +#if !defined(SERIALIZED_READ_FROM_CACHE) + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); +#endif + + DBUG_ASSERT((pagecache->block_size & 511) == 0); + /* Copy data from the cache buffer */ + bmove512(buff, block->buffer, pagecache->block_size); + +#if !defined(SERIALIZED_READ_FROM_CACHE) + pagecache_pthread_mutex_lock(&pagecache->cache_lock); +#endif + } + + remove_reader(block); + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. + */ + if (pin == PAGECACHE_PIN_LEFT_UNPINNED || pin == PAGECACHE_UNPIN) + unreg_request(pagecache, block, 1); + else + *link= (PAGECACHE_PAGE_LINK)block; + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + if (status & PCBLOCK_ERROR) + DBUG_RETURN((uchar *) 0); + + DBUG_RETURN(buff); + } + +no_key_cache: /* Key cache is not used */ + + /* We can't use mutex here as the key cache may not be initialized */ + pagecache->global_cache_r_requests++; + pagecache->global_cache_read++; + if (pagecache_fread(pagecache, file, (uchar*) buff, pageno, MYF(MY_NABP))) + error= 1; + DBUG_RETURN(error ? (uchar*) 0 : buff); +} + + +/* + Delete page from the buffer + + SYNOPSIS + pagecache_delete() + pagecache pointer to a page cache data structure + file handler for the file for the block of data to be read + pageno number of the block of data in the file + lock lock change + flush flush page if it is dirty + + RETURN VALUE + 0 - deleted or was not present at all + 1 - error + + NOTES. 
+ lock can be only PAGECACHE_LOCK_LEFT_WRITELOCKED (page was write locked + before) or PAGECACHE_LOCK_WRITE (delete will write lock page before delete) +*/ +my_bool pagecache_delete(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + my_bool flush) +{ + int error= 0; + enum pagecache_page_pin pin= lock_to_pin[lock]; + DBUG_ENTER("pagecache_delete"); + DBUG_PRINT("enter", ("fd: %u page: %lu %s %s", + (uint) file->file, (ulong) pageno, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED); + DBUG_ASSERT(pin == PAGECACHE_PIN || + pin == PAGECACHE_PIN_LEFT_PINNED); + +restart: + + if (pagecache->can_be_used) + { + /* Key cache is used */ + reg1 PAGECACHE_BLOCK_LINK *block; + PAGECACHE_HASH_LINK **unused_start, *link; + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (!pagecache->can_be_used) + goto end; + + inc_counter_for_resize_op(pagecache); + link= get_present_hash_link(pagecache, file, pageno, &unused_start); + if (!link) + { + DBUG_PRINT("info", ("There is no such page in the cache")); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_RETURN(0); + } + block= link->block; + /* See NOTE for pagecache_unlock about registering requests. */ + if (pin == PAGECACHE_PIN) + reg_requests(pagecache, block, 1); + DBUG_ASSERT(block != 0); + if (make_lock_and_pin(pagecache, block, lock, pin, file)) + { + /* + We failed to writelock the block, cache is unlocked, and last write + lock is released, we will try to get the block again. 
+ */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_PRINT("info", ("restarting...")); + goto restart; + } + + if (block->status & PCBLOCK_CHANGED) + { + if (flush) + { + /* The block contains a dirty page - push it out of the cache */ + + KEYCACHE_DBUG_PRINT("find_block", ("block is dirty")); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + /* + The call is thread safe because only the current + thread might change the block->hash_link value + */ + DBUG_ASSERT(block->pins == 1); + error= pagecache_fwrite(pagecache, + &block->hash_link->file, + block->buffer, + block->hash_link->pageno, + block->type, + MYF(MY_NABP | MY_WAIT_IF_FULL)); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + pagecache->global_cache_write++; + + if (error) + { + block->status|= PCBLOCK_ERROR; + goto err; + } + } + pagecache->blocks_changed--; + pagecache->global_blocks_changed--; + /* + free_block() will change the status and rec_lsn of the block so no + need to change them here. + */ + } + /* Cache is locked, so we can relese page before freeing it */ + make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, file); + DBUG_ASSERT(link->requests > 0); + link->requests--; + /* See NOTE for pagecache_unlock about registering requests. */ + free_block(pagecache, block); + +err: + dec_counter_for_resize_op(pagecache); +end: + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + } + + DBUG_RETURN(error); +} + + +my_bool pagecache_delete_pages(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint page_count, + enum pagecache_page_lock lock, + my_bool flush) +{ + ulong page_end; + DBUG_ENTER("pagecache_delete_pages"); + DBUG_ASSERT(page_count > 0); + + page_end= pageno + page_count; + do + { + if (pagecache_delete(pagecache, file, pageno, + lock, flush)) + DBUG_RETURN(1); + } while (++pageno != page_end); + DBUG_RETURN(0); +} + + +/* + Write a buffer into a cached file. 
+ + SYNOPSIS + + pagecache_write_part() + pagecache pointer to a page cache data structure + file handler for the file to write data to + pageno number of the block of data in the file + level determines the weight of the data + buff buffer with the data + type type of the page + lock lock change + pin pin page + write_mode how to write page + link link to the page if we pin it + + RETURN VALUE + 0 if a success, 1 - otherwise. +*/ + +/* description of how to change lock before and after write */ +struct write_lock_change +{ + int need_lock_change; /* need changing of lock at the end of write */ + enum pagecache_page_lock new_lock; /* lock at the beginning */ + enum pagecache_page_lock unlock_lock; /* lock at the end */ +}; + +static struct write_lock_change write_lock_change_table[]= +{ + {1, + PAGECACHE_LOCK_WRITE, + PAGECACHE_LOCK_WRITE_UNLOCK} /*PAGECACHE_LOCK_LEFT_UNLOCKED*/, + {0, /*unsupported (we can't write having the block read locked) */ + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_LEFT_READLOCKED*/, + {0, PAGECACHE_LOCK_LEFT_WRITELOCKED, 0} /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/, + {1, + PAGECACHE_LOCK_WRITE, + PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_READ*/, + {0, PAGECACHE_LOCK_WRITE, 0} /*PAGECACHE_LOCK_WRITE*/, + {0, /*unsupported (we can't write having the block read locked) */ + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_READ_UNLOCK*/, + {1, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + PAGECACHE_LOCK_WRITE_UNLOCK } /*PAGECACHE_LOCK_WRITE_UNLOCK*/, + {1, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_WRITE_TO_READ*/ +}; + +/* description of how to change pin before and after write */ +struct write_pin_change +{ + enum pagecache_page_pin new_pin; /* pin status at the beginning */ + enum pagecache_page_pin unlock_pin; /* pin status at the end */ +}; + +static struct write_pin_change write_pin_change_table[]= +{ + {PAGECACHE_PIN_LEFT_PINNED, + 
PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN_LEFT_PINNED*/, + {PAGECACHE_PIN, + PAGECACHE_UNPIN} /*PAGECACHE_PIN_LEFT_UNPINNED*/, + {PAGECACHE_PIN, + PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN*/, + {PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_UNPIN} /*PAGECACHE_UNPIN*/ +}; + +my_bool pagecache_write_part(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint level, + uchar *buff, + enum pagecache_page_type type, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + enum pagecache_write_mode write_mode, + PAGECACHE_PAGE_LINK *link, + uint offset, uint size, + pagecache_disk_read_validator validator, + uchar* validator_data) +{ + PAGECACHE_BLOCK_LINK *block= NULL; + PAGECACHE_PAGE_LINK fake_link; + int error= 0; + int need_lock_change= write_lock_change_table[lock].need_lock_change; + DBUG_ENTER("pagecache_write_part"); + DBUG_PRINT("enter", ("fd: %u page: %lu level: %u type: %s lock: %s " + "pin: %s mode: %s offset: %u size %u", + (uint) file->file, (ulong) pageno, level, + page_cache_page_type_str[type], + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin], + page_cache_page_write_mode_str[write_mode], + offset, size)); + DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE); + DBUG_ASSERT(lock != PAGECACHE_LOCK_LEFT_READLOCKED); + DBUG_ASSERT(lock != PAGECACHE_LOCK_READ_UNLOCK); + DBUG_ASSERT(offset + size <= pagecache->block_size); + if (!link) + link= &fake_link; + else + *link= 0; + +restart: + +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, "start of key_cache_write", 1);); +#endif + + if (pagecache->can_be_used) + { + /* Key cache is used */ + int page_st; + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (!pagecache->can_be_used) + { + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + goto no_key_cache; + } + + inc_counter_for_resize_op(pagecache); + pagecache->global_cache_w_requests++; + /* See NOTE for pagecache_unlock about registering 
requests. */ + block= find_block(pagecache, file, pageno, level, + test(write_mode != PAGECACHE_WRITE_DONE && + lock != PAGECACHE_LOCK_LEFT_WRITELOCKED && + lock != PAGECACHE_LOCK_WRITE_UNLOCK && + lock != PAGECACHE_LOCK_WRITE_TO_READ), + test((pin == PAGECACHE_PIN_LEFT_UNPINNED) || + (pin == PAGECACHE_PIN)), + &page_st); + if (!block) + { + DBUG_ASSERT(write_mode != PAGECACHE_WRITE_DONE); + /* It happens only for requests submitted during resize operation */ + dec_counter_for_resize_op(pagecache); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + /* Write to the disk key cache is in resize at the moment*/ + goto no_key_cache; + } + + DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE || + block->type == PAGECACHE_READ_UNKNOWN_PAGE || + block->type == type || + (block->type == PAGECACHE_PLAIN_PAGE && + type == PAGECACHE_LSN_PAGE)); + block->type= type; + + if (make_lock_and_pin(pagecache, block, + write_lock_change_table[lock].new_lock, + (need_lock_change ? + write_pin_change_table[pin].new_pin : + pin), file)) + { + /* + We failed to writelock the block, cache is unlocked, and last write + lock is released, we will try to get the block again. + */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_PRINT("info", ("restarting...")); + goto restart; + } + + if (write_mode == PAGECACHE_WRITE_DONE) + { + if (!(block->status & PCBLOCK_ERROR)) + { + /* Copy data from buff */ + if (!(size & 511)) + bmove512(block->buffer + offset, buff, size); + else + memcpy(block->buffer + offset, buff, size); + block->status= PCBLOCK_READ; + /* + The validator can change the page content (removing page + protection) so it have to be called + */ + if (validator != NULL && + (*validator)(block->buffer, validator_data)) + block->status|= PCBLOCK_ERROR; + KEYCACHE_DBUG_PRINT("key_cache_insert", + ("Page injection")); +#ifdef THREAD + /* Signal that all pending requests for this now can be processed. 
*/ + if (block->wqueue[COND_FOR_REQUESTED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]); +#endif + } + } + else + { + DBUG_ASSERT(validator == 0 && validator_data == 0); + if (! (block->status & PCBLOCK_CHANGED)) + link_to_changed_list(pagecache, block); + + if (! (block->status & PCBLOCK_ERROR)) + { + if (!(size & 511)) + bmove512(block->buffer + offset, buff, size); + else + memcpy(block->buffer + offset, buff, size); + block->status|= PCBLOCK_READ; + } + } + + if (need_lock_change) + { + /* + We don't set rec_lsn of the block; this is ok as for the + Maria-block-record's pages, we always keep pages pinned here. + */ + if (make_lock_and_pin(pagecache, block, + write_lock_change_table[lock].unlock_lock, + write_pin_change_table[pin].unlock_pin, file)) + DBUG_ASSERT(0); + } + + /* Unregister the request */ + DBUG_ASSERT(block->hash_link->requests > 0); + block->hash_link->requests--; + /* See NOTE for pagecache_unlock about registering requests. */ + if (pin == PAGECACHE_PIN_LEFT_UNPINNED || pin == PAGECACHE_UNPIN) + unreg_request(pagecache, block, 1); + else + *link= (PAGECACHE_PAGE_LINK)block; + + if (block->status & PCBLOCK_ERROR) + error= 1; + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + goto end; + } + +no_key_cache: + /* Key cache is not used */ + if (write_mode == PAGECACHE_WRITE_DELAY) + { + pagecache->global_cache_w_requests++; + pagecache->global_cache_write++; + if (pagecache_fwrite(pagecache, file, (uchar*) buff, pageno, type, + MYF(MY_NABP | MY_WAIT_IF_FULL))) + error=1; + } + +end: +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + DBUG_EXECUTE("exec", + test_key_cache(pagecache, "end of key_cache_write", 1);); +#endif + if (block) + PCBLOCK_INFO(block); + else + DBUG_PRINT("info", ("No block")); + DBUG_RETURN(error); +} + + +/* + Free block: remove reference to it from hash table, + remove it from the chain file of dirty/clean blocks + and add it to the free list. 
+*/ + +static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block) +{ + KEYCACHE_THREAD_TRACE("free block"); + KEYCACHE_DBUG_PRINT("free_block", + ("block: %u hash_link 0x%lx", + PCBLOCK_NUMBER(pagecache, block), + (long) block->hash_link)); + if (block->hash_link) + { + /* + While waiting for readers to finish, new readers might request the + block. But since we set block->status|= PCBLOCK_REASSIGNED, they + will wait on block->wqueue[COND_FOR_SAVED]. They must be signalled + later. + */ + block->status|= PCBLOCK_REASSIGNED; + wait_for_readers(pagecache, block); + unlink_hash(pagecache, block->hash_link); + } + + unlink_changed(block); + DBUG_ASSERT(block->wlocks == 0); + DBUG_ASSERT(block->pins == 0); + block->status= 0; +#ifndef DBUG_OFF + block->type= PAGECACHE_EMPTY_PAGE; +#endif + block->rec_lsn= LSN_MAX; + KEYCACHE_THREAD_TRACE("free block"); + KEYCACHE_DBUG_PRINT("free_block", + ("block is freed")); + unreg_request(pagecache, block, 0); + block->hash_link= NULL; + + /* Remove the free block from the LRU ring. */ + unlink_block(pagecache, block); + if (block->temperature == PCBLOCK_WARM) + pagecache->warm_blocks--; + block->temperature= PCBLOCK_COLD; + /* Insert the free block in the free list. */ + block->next_used= pagecache->free_block_list; + pagecache->free_block_list= block; + /* Keep track of the number of currently unused blocks. */ + pagecache->blocks_unused++; + +#ifdef THREAD + /* All pending requests for this page must be resubmitted. */ + if (block->wqueue[COND_FOR_SAVED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]); +#endif +} + + +static int cmp_sec_link(PAGECACHE_BLOCK_LINK **a, PAGECACHE_BLOCK_LINK **b) +{ + return (((*a)->hash_link->pageno < (*b)->hash_link->pageno) ? -1 : + ((*a)->hash_link->pageno > (*b)->hash_link->pageno) ? 
1 : 0); +} + + +/* + Flush a portion of changed blocks to disk, + free used blocks if requested +*/ + +static int flush_cached_blocks(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + PAGECACHE_BLOCK_LINK **cache, + PAGECACHE_BLOCK_LINK **end, + enum flush_type type) +{ + int error; + int last_errno= 0; + uint count= (uint) (end-cache); + DBUG_ENTER("flush_cached_blocks"); + + /* Don't lock the cache during the flush */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + /* + As all blocks referred in 'cache' are marked by PCBLOCK_IN_FLUSH + we are guarantied no thread will change them + */ + qsort((uchar*) cache, count, sizeof(*cache), (qsort_cmp) cmp_sec_link); + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + for (; cache != end; cache++) + { + PAGECACHE_BLOCK_LINK *block= *cache; + + if (block->pins) + { + KEYCACHE_DBUG_PRINT("flush_cached_blocks", + ("block: %u (0x%lx) pinned", + PCBLOCK_NUMBER(pagecache, block), (ulong)block)); + DBUG_PRINT("info", ("block: %u (0x%lx) pinned", + PCBLOCK_NUMBER(pagecache, block), (ulong)block)); + PCBLOCK_INFO(block); + last_errno= -1; + unreg_request(pagecache, block, 1); + continue; + } + /* if the block is not pinned then it is not write locked */ + DBUG_ASSERT(block->wlocks == 0); + DBUG_ASSERT(block->pins == 0); + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_WRITE, PAGECACHE_PIN, 0)) + DBUG_ASSERT(0); + + KEYCACHE_DBUG_PRINT("flush_cached_blocks", + ("block: %u (0x%lx) to be flushed", + PCBLOCK_NUMBER(pagecache, block), (ulong)block)); + DBUG_PRINT("info", ("block: %u (0x%lx) to be flushed", + PCBLOCK_NUMBER(pagecache, block), (ulong)block)); + PCBLOCK_INFO(block); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_PRINT("info", ("block: %u (0x%lx) pins: %u", + PCBLOCK_NUMBER(pagecache, block), (ulong)block, + block->pins)); + DBUG_ASSERT(block->pins == 1); + error= pagecache_fwrite(pagecache, file, + block->buffer, + block->hash_link->pageno, + block->type, + MYF(MY_NABP | 
MY_WAIT_IF_FULL)); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + + make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, 0); + + pagecache->global_cache_write++; + if (error) + { + block->status|= PCBLOCK_ERROR; + if (!last_errno) + last_errno= errno ? errno : -1; + } +#ifdef THREAD + /* + Let to proceed for possible waiting requests to write to the block page. + It might happen only during an operation to resize the key cache. + */ + if (block->wqueue[COND_FOR_SAVED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]); +#endif + /* type will never be FLUSH_IGNORE_CHANGED here */ + if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE)) + { + pagecache->blocks_changed--; + pagecache->global_blocks_changed--; + free_block(pagecache, block); + } + else + { + block->status&= ~PCBLOCK_IN_FLUSH; + link_to_file_list(pagecache, block, file, 1); + unreg_request(pagecache, block, 1); + } + } + DBUG_RETURN(last_errno); +} + + +/** + @brief flush all key blocks for a file to disk but don't do any mutex locks + + @param pagecache pointer to a pagecache data structure + @param file handler for the file to flush to + @param flush_type type of the flush + + @note + This function doesn't do any mutex locks because it needs to be called + both from flush_pagecache_blocks and flush_all_key_blocks (the later one + does the mutex lock in the resize_pagecache() function). 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static int flush_pagecache_blocks_int(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + enum flush_type type) +{ + PAGECACHE_BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache; + int last_errno= 0; + DBUG_ENTER("flush_pagecache_blocks_int"); + DBUG_PRINT("enter",("file: %d blocks_used: %lu blocks_changed: %lu", + file->file, pagecache->blocks_used, pagecache->blocks_changed)); + +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, + "start of flush_pagecache_blocks", 0);); +#endif + + cache= cache_buff; + if (pagecache->disk_blocks > 0 && + (!my_disable_flush_pagecache_blocks || type != FLUSH_KEEP)) + { + /* Key cache exists and flush is not disabled */ + int error= 0; + uint count= 0; + PAGECACHE_BLOCK_LINK **pos, **end; + PAGECACHE_BLOCK_LINK *first_in_switch= NULL; + PAGECACHE_BLOCK_LINK *block, *next; +#if defined(PAGECACHE_DEBUG) + uint cnt= 0; +#endif + uint8 changed_blocks_is_incomplete_incremented= 0; + + if (type != FLUSH_IGNORE_CHANGED) + { + /* + Count how many key blocks we have to cache to be able + to flush all dirty pages with minimum seek moves + */ + for (block= pagecache->changed_blocks[FILE_HASH(*file)] ; + block; + block= block->next_changed) + { + if (block->hash_link->file.file == file->file) + { + count++; + KEYCACHE_DBUG_ASSERT(count<= pagecache->blocks_used); + } + } + /* Allocate a new buffer only if its bigger than the one we have */ + if (count > FLUSH_CACHE && + !(cache= + (PAGECACHE_BLOCK_LINK**) + my_malloc(sizeof(PAGECACHE_BLOCK_LINK*)*count, MYF(0)))) + { + cache= cache_buff; + count= FLUSH_CACHE; + } + } + + /* Retrieve the blocks and write them to a buffer to be flushed */ +restart: + end= (pos= cache)+count; + for (block= pagecache->changed_blocks[FILE_HASH(*file)] ; + block; + block= next) + { +#if defined(PAGECACHE_DEBUG) + cnt++; + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used); +#endif + next= 
block->next_changed; + if (block->hash_link->file.file == file->file) + { + /* + Mark the block with BLOCK_IN_FLUSH in order not to let + other threads to use it for new pages and interfere with + our sequence of flushing dirty file pages + */ + block->status|= PCBLOCK_IN_FLUSH; + + if (! (block->status & PCBLOCK_IN_SWITCH)) + { + /* + We care only for the blocks for which flushing was not + initiated by other threads as a result of page swapping + */ + reg_requests(pagecache, block, 1); + if (type != FLUSH_IGNORE_CHANGED) + { + /* It's not a temporary file */ + if (pos == end) + { + /* + This happens only if there is not enough + memory for the big block + */ + if ((error= flush_cached_blocks(pagecache, file, cache, + end,type))) + last_errno=error; + DBUG_PRINT("info", ("restarting...")); + /* + Restart the scan as some other thread might have changed + the changed blocks chain: the blocks that were in switch + state before the flush started have to be excluded + */ + goto restart; + } + *pos++= block; + } + else + { + /* It's a temporary file */ + pagecache->blocks_changed--; + pagecache->global_blocks_changed--; + free_block(pagecache, block); + } + } + else + { + /* Link the block into a list of blocks 'in switch' */ + unlink_changed(block); + link_changed(block, &first_in_switch); + /* + We have just removed a page from the list of dirty pages + ("changed_blocks") though it's still dirty (the flush by another + thread has not yet happened). Checkpoint will miss the page and so + must be blocked until that flush has happened. + Note that if there are two concurrent + flush_pagecache_blocks_int() on this file, then the first one may + move the block into its first_in_switch, and the second one would + just not see the block and wrongly consider its job done. 
+ @todo RECOVERY Maria does protect such flushes with intern_lock, + but Checkpoint does not (Checkpoint makes sure that + changed_blocks_is_incomplete is 0 when it starts, but as + flush_cached_blocks() releases mutex, this may change... + */ + /** + @todo RECOVERY: check all places where we remove a page from the + list of dirty pages + */ + if (unlikely(!changed_blocks_is_incomplete_incremented)) + { + changed_blocks_is_incomplete_incremented= 1; + changed_blocks_is_incomplete++; + } + } + } + } + if (pos != cache) + { + if ((error= flush_cached_blocks(pagecache, file, cache, pos, type))) + last_errno= error; + } + /* Wait until list of blocks in switch is empty */ + while (first_in_switch) + { +#if defined(PAGECACHE_DEBUG) + cnt= 0; +#endif + block= first_in_switch; + { +#ifdef THREAD + struct st_my_thread_var *thread= my_thread_var; + wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread); + do + { + KEYCACHE_DBUG_PRINT("flush_pagecache_blocks_int: wait", + ("suspend thread %ld", thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); +#else + KEYCACHE_DBUG_ASSERT(0); + /* No parallel requests in single-threaded case */ +#endif + } +#if defined(PAGECACHE_DEBUG) + cnt++; + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used); +#endif + } + changed_blocks_is_incomplete-= + changed_blocks_is_incomplete_incremented; + /* The following happens very seldom */ + if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE)) + { +#if defined(PAGECACHE_DEBUG) + cnt=0; +#endif + for (block= pagecache->file_blocks[FILE_HASH(*file)] ; + block; + block= next) + { +#if defined(PAGECACHE_DEBUG) + cnt++; + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used); +#endif + next= block->next_changed; + if (block->hash_link->file.file == file->file && + (! 
(block->status & PCBLOCK_CHANGED) + || type == FLUSH_IGNORE_CHANGED)) + { + reg_requests(pagecache, block, 1); + free_block(pagecache, block); + } + } + } + } + +#ifndef DBUG_OFF + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, "end of flush_pagecache_blocks", 0);); +#endif + if (cache != cache_buff) + my_free((uchar*) cache, MYF(0)); + if (last_errno) + errno=last_errno; /* Return first error */ + DBUG_RETURN(last_errno != 0); +} + + +/* + Flush all blocks for a file to disk + + SYNOPSIS + + flush_pagecache_blocks() + pagecache pointer to a page cache data structure + file handler for the file to flush to + flush_type type of the flush + + RETURN + 0 OK + 1 error +*/ + +int flush_pagecache_blocks(PAGECACHE *pagecache, + PAGECACHE_FILE *file, enum flush_type type) +{ + int res; + DBUG_ENTER("flush_pagecache_blocks"); + DBUG_PRINT("enter", ("pagecache: 0x%lx", (long) pagecache)); + + if (pagecache->disk_blocks <= 0) + DBUG_RETURN(0); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + inc_counter_for_resize_op(pagecache); + res= flush_pagecache_blocks_int(pagecache, file, type); + dec_counter_for_resize_op(pagecache); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_RETURN(res); +} + + +/* + Reset the counters of a key cache. + + SYNOPSIS + reset_pagecache_counters() + name the name of a key cache + pagecache pointer to the pagecache to be reset + + DESCRIPTION + This procedure is used to reset the counters of all currently used key + caches, both the default one and the named ones. 
+ + RETURN + 0 on success (always because it can't fail) +*/ + +int reset_pagecache_counters(const char *name, PAGECACHE *pagecache) +{ + DBUG_ENTER("reset_pagecache_counters"); + if (!pagecache->inited) + { + DBUG_PRINT("info", ("Key cache %s not initialized.", name)); + DBUG_RETURN(0); + } + DBUG_PRINT("info", ("Resetting counters for key cache %s.", name)); + + pagecache->global_blocks_changed= 0; /* Key_blocks_not_flushed */ + pagecache->global_cache_r_requests= 0; /* Key_read_requests */ + pagecache->global_cache_read= 0; /* Key_reads */ + pagecache->global_cache_w_requests= 0; /* Key_write_requests */ + pagecache->global_cache_write= 0; /* Key_writes */ + DBUG_RETURN(0); +} + + +/** + @brief Allocates a buffer and stores in it some info about all dirty pages + + Does the allocation because the caller cannot know the size itself. + Memory freeing is to be done by the caller (if the "str" member of the + LEX_STRING is not NULL). + Ignores all pages of another type than PAGECACHE_LSN_PAGE, because they + are not interesting for a checkpoint record. + The caller has the intention of doing checkpoints. + + @param pagecache pointer to the page cache + @param[out] str pointer to where the allocated buffer, and + its size, will be put + @param[out] min_rec_lsn pointer to where the minimum rec_lsn of all + relevant dirty pages will be put + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache, + LEX_STRING *str, + LSN *min_rec_lsn) +{ + my_bool error= 0; + uint stored_list_size= 0; + uint file_hash; + char *ptr; + LSN minimum_rec_lsn= LSN_MAX; + DBUG_ENTER("pagecache_collect_changed_blocks_with_LSN"); + + DBUG_ASSERT(NULL == str->str); + /* + We lock the entire cache but will be quick, just reading/writing a few MBs + of memory at most. 
+ */ + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + while (changed_blocks_is_incomplete > 0) + { + /* + Some pages are more recent in memory than on disk (=dirty) and are not + in "changed_blocks" so we cannot know them. Wait. + */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + sleep(1); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + } + + /* Count how many dirty pages are interesting */ + for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[file_hash] ; + block; + block= block->next_changed) + { + /* + Q: is there something subtle with block->hash_link: can it be NULL? + does it have to be == hash_link->block... ? + */ + DBUG_ASSERT(block->hash_link != NULL); + DBUG_ASSERT(block->status & PCBLOCK_CHANGED); + if (block->type != PAGECACHE_LSN_PAGE) + continue; /* no need to store it */ + stored_list_size++; + } + } + + /* + The number of dirty pages is stored in 4 bytes below, so the block + counter must be 4 bytes wide. The comparison has to stay outside of + sizeof(), otherwise the assertion can never fail. + */ + compile_time_assert(sizeof(pagecache->blocks) == 4); + str->length= 4 + /* number of dirty pages */ + (4 + /* file */ + 4 + /* pageno */ + LSN_STORE_SIZE /* rec_lsn */ + ) * stored_list_size; + if (NULL == (str->str= my_malloc(str->length, MYF(MY_WME)))) + goto err; + ptr= str->str; + int4store(ptr, stored_list_size); + ptr+= 4; + if (!stored_list_size) + goto end; + for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[file_hash] ; + block; + block= block->next_changed) + { + if (block->type != PAGECACHE_LSN_PAGE) + continue; /* no need to store it in the checkpoint record */ + compile_time_assert((4 == sizeof(block->hash_link->file.file))); + compile_time_assert((4 == sizeof(block->hash_link->pageno))); + int4store(ptr, block->hash_link->file.file); + ptr+= 4; + int4store(ptr, block->hash_link->pageno); + ptr+= 4; + lsn_store(ptr, block->rec_lsn); + ptr+= LSN_STORE_SIZE; + if (block->rec_lsn != LSN_MAX) + { + 
DBUG_ASSERT(LSN_VALID(block->rec_lsn)); + if (cmp_translog_addr(block->rec_lsn, minimum_rec_lsn) < 0) + minimum_rec_lsn= block->rec_lsn; + } /* otherwise, some trn->rec_lsn should hold the correct info */ + } + } +end: + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + *min_rec_lsn= minimum_rec_lsn; + DBUG_RETURN(error); + +err: + error= 1; + goto end; +} + + +#ifndef DBUG_OFF +/* + Test if disk-cache is ok +*/ +static void test_key_cache(PAGECACHE *pagecache __attribute__((unused)), + const char *where __attribute__((unused)), + my_bool lock __attribute__((unused))) +{ + /* TODO */ +} +#endif + +#if defined(PAGECACHE_TIMEOUT) + +#define KEYCACHE_DUMP_FILE "pagecache_dump.txt" +#define MAX_QUEUE_LEN 100 + + +static void pagecache_dump(PAGECACHE *pagecache) +{ + FILE *pagecache_dump_file=fopen(KEYCACHE_DUMP_FILE, "w"); + struct st_my_thread_var *last; + struct st_my_thread_var *thread; + PAGECACHE_BLOCK_LINK *block; + PAGECACHE_HASH_LINK *hash_link; + PAGECACHE_PAGE *page; + uint i; + + fprintf(pagecache_dump_file, "thread:%u\n", thread->id); + + i=0; + thread=last=waiting_for_hash_link.last_thread; + fprintf(pagecache_dump_file, "queue of threads waiting for hash link\n"); + if (thread) + do + { + thread= thread->next; + page= (PAGECACHE_PAGE *) thread->opt_info; + fprintf(pagecache_dump_file, + "thread:%u, (file,pageno)=(%u,%lu)\n", + thread->id,(uint) page->file.file,(ulong) page->pageno); + if (++i == MAX_QUEUE_LEN) + break; + } + while (thread != last); + + i=0; + thread=last=waiting_for_block.last_thread; + fprintf(pagecache_dump_file, "queue of threads waiting for block\n"); + if (thread) + do + { + thread=thread->next; + hash_link= (PAGECACHE_HASH_LINK *) thread->opt_info; + fprintf(pagecache_dump_file, + "thread:%u hash_link:%u (file,pageno)=(%u,%lu)\n", + thread->id, (uint) PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link), + (uint) hash_link->file.file,(ulong) hash_link->pageno); + if (++i == MAX_QUEUE_LEN) + break; + } + while (thread != last); + 
+ for (i=0 ; i < pagecache->blocks_used ; i++) + { + int j; + block= &pagecache->block_root[i]; + hash_link= block->hash_link; + fprintf(pagecache_dump_file, + "block:%u hash_link:%d status:%x #requests=%u waiting_for_readers:%d\n", + i, (int) (hash_link ? + PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link) : + -1), + block->status, block->requests, block->condvar ? 1 : 0); + for (j=0 ; j < COND_SIZE; j++) + { + PAGECACHE_WQUEUE *wqueue=&block->wqueue[j]; + thread= last= wqueue->last_thread; + fprintf(pagecache_dump_file, "queue #%d\n", j); + if (thread) + { + do + { + thread=thread->next; + fprintf(pagecache_dump_file, + "thread:%u\n", thread->id); + if (++i == MAX_QUEUE_LEN) + break; + } + while (thread != last); + } + } + } + fprintf(pagecache_dump_file, "LRU chain:"); + block= pagecache= used_last; + if (block) + { + do + { + block= block->next_used; + fprintf(pagecache_dump_file, + "block:%u, ", PCBLOCK_NUMBER(pagecache, block)); + } + while (block != pagecache->used_last); + } + fprintf(pagecache_dump_file, "\n"); + + fclose(pagecache_dump_file); +} + +#endif /* defined(PAGECACHE_TIMEOUT) */ + +#if defined(PAGECACHE_TIMEOUT) && !defined(__WIN__) + + +static int pagecache_pthread_cond_wait(pthread_cond_t *cond, + pthread_mutex_t *mutex) +{ + int rc; + struct timeval now; /* time when we started waiting */ + struct timespec timeout; /* timeout value for the wait function */ + struct timezone tz; +#if defined(PAGECACHE_DEBUG) + int cnt=0; +#endif + + /* Get current time */ + gettimeofday(&now, &tz); + /* Prepare timeout value */ + timeout.tv_sec= now.tv_sec + PAGECACHE_TIMEOUT; + /* + timeval uses microseconds. + timespec uses nanoseconds. 
+ 1 nanosecond = 1000 micro seconds + */ + timeout.tv_nsec= now.tv_usec * 1000; + KEYCACHE_THREAD_TRACE_END("started waiting"); +#if defined(PAGECACHE_DEBUG) + cnt++; + if (cnt % 100 == 0) + fprintf(pagecache_debug_log, "waiting...\n"); + fflush(pagecache_debug_log); +#endif + rc= pthread_cond_timedwait(cond, mutex, &timeout); + KEYCACHE_THREAD_TRACE_BEGIN("finished waiting"); + if (rc == ETIMEDOUT || rc == ETIME) + { +#if defined(PAGECACHE_DEBUG) + fprintf(pagecache_debug_log,"aborted by pagecache timeout\n"); + fclose(pagecache_debug_log); + abort(); +#endif + pagecache_dump(); + } + +#if defined(PAGECACHE_DEBUG) + KEYCACHE_DBUG_ASSERT(rc != ETIMEDOUT); +#else + assert(rc != ETIMEDOUT); +#endif + return rc; +} +#else +#if defined(PAGECACHE_DEBUG) +static int pagecache_pthread_cond_wait(pthread_cond_t *cond, + pthread_mutex_t *mutex) +{ + int rc; + KEYCACHE_THREAD_TRACE_END("started waiting"); + rc= pthread_cond_wait(cond, mutex); + KEYCACHE_THREAD_TRACE_BEGIN("finished waiting"); + return rc; +} +#endif +#endif /* defined(PAGECACHE_TIMEOUT) && !defined(__WIN__) */ + +#if defined(PAGECACHE_DEBUG) +static int ___pagecache_pthread_mutex_lock(pthread_mutex_t *mutex) +{ + int rc; + rc= pthread_mutex_lock(mutex); + KEYCACHE_THREAD_TRACE_BEGIN(""); + return rc; +} + + +static void ___pagecache_pthread_mutex_unlock(pthread_mutex_t *mutex) +{ + KEYCACHE_THREAD_TRACE_END(""); + pthread_mutex_unlock(mutex); +} + + +static int ___pagecache_pthread_cond_signal(pthread_cond_t *cond) +{ + int rc; + KEYCACHE_THREAD_TRACE("signal"); + rc= pthread_cond_signal(cond); + return rc; +} + + +#if defined(PAGECACHE_DEBUG_LOG) + + +static void pagecache_debug_print(const char * fmt, ...) 
+{ + va_list args; + va_start(args,fmt); + if (pagecache_debug_log) + { + VOID(vfprintf(pagecache_debug_log, fmt, args)); + VOID(fputc('\n',pagecache_debug_log)); + } + va_end(args); +} +#endif /* defined(PAGECACHE_DEBUG_LOG) */ + +#if defined(PAGECACHE_DEBUG_LOG) + + +void pagecache_debug_log_close(void) +{ + if (pagecache_debug_log) + fclose(pagecache_debug_log); +} +#endif /* defined(PAGECACHE_DEBUG_LOG) */ + +#endif /* defined(PAGECACHE_DEBUG) */ diff --git a/storage/maria/ma_pagecache.h b/storage/maria/ma_pagecache.h new file mode 100644 index 00000000000..0e2aff3644d --- /dev/null +++ b/storage/maria/ma_pagecache.h @@ -0,0 +1,267 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Page cache variable structures */ + +#ifndef _ma_pagecache_h +#define _ma_pagecache_h +C_MODE_START + +#include "ma_loghandler_lsn.h" +#include <m_string.h> + +/* Type of the page */ +enum pagecache_page_type +{ + /* + Used only for control page type changing during debugging. This define + should only be using when using DBUG. 
+ */ + PAGECACHE_EMPTY_PAGE, + /* the page does not contain LSN */ + PAGECACHE_PLAIN_PAGE, + /* the page contain LSN (maria tablespace page) */ + PAGECACHE_LSN_PAGE, + /* Page type used when scanning file and we don't care about the type */ + PAGECACHE_READ_UNKNOWN_PAGE +}; + +/* + This enum describe lock status changing. every type of page cache will + interpret WRITE/READ lock as it need. +*/ +enum pagecache_page_lock +{ + PAGECACHE_LOCK_LEFT_UNLOCKED, /* free -> free */ + PAGECACHE_LOCK_LEFT_READLOCKED, /* read -> read */ + PAGECACHE_LOCK_LEFT_WRITELOCKED, /* write -> write */ + PAGECACHE_LOCK_READ, /* free -> read */ + PAGECACHE_LOCK_WRITE, /* free -> write */ + PAGECACHE_LOCK_READ_UNLOCK, /* read -> free */ + PAGECACHE_LOCK_WRITE_UNLOCK, /* write -> free */ + PAGECACHE_LOCK_WRITE_TO_READ /* write -> read */ +}; +/* + This enum describe pin status changing +*/ +enum pagecache_page_pin +{ + PAGECACHE_PIN_LEFT_PINNED, /* pinned -> pinned */ + PAGECACHE_PIN_LEFT_UNPINNED, /* unpinned -> unpinned */ + PAGECACHE_PIN, /* unpinned -> pinned */ + PAGECACHE_UNPIN /* pinned -> unpinned */ +}; +/* How to write the page */ +enum pagecache_write_mode +{ + /* do not write immediately, i.e. it will be dirty page */ + PAGECACHE_WRITE_DELAY, + /* page already is in the file. 
(key cache insert analogue) */ + PAGECACHE_WRITE_DONE +}; + +typedef void *PAGECACHE_PAGE_LINK; + +/* file descriptor for Maria */ +typedef struct st_pagecache_file +{ + File file; +} PAGECACHE_FILE; + +/* page number for maria */ +typedef uint32 pgcache_page_no_t; + +/* declare structures that is used by st_pagecache */ + +struct st_pagecache_block_link; +typedef struct st_pagecache_block_link PAGECACHE_BLOCK_LINK; +struct st_pagecache_page; +typedef struct st_pagecache_page PAGECACHE_PAGE; +struct st_pagecache_hash_link; +typedef struct st_pagecache_hash_link PAGECACHE_HASH_LINK; + +#include <wqueue.h> + +typedef my_bool (*pagecache_disk_read_validator)(uchar *page, uchar *data); + +#define PAGECACHE_CHANGED_BLOCKS_HASH 128 /* must be power of 2 */ + +/* + The page cache structure + It also contains read-only statistics parameters. +*/ + +typedef struct st_pagecache +{ + my_bool inited; + my_bool resize_in_flush; /* true during flush of resize operation */ + my_bool can_be_used; /* usage of cache for read/write is allowed */ + uint shift; /* block size = 2 ^ shift */ + size_t mem_size; /* specified size of the cache memory */ + uint32 block_size; /* size of the page buffer of a cache block */ + ulong min_warm_blocks; /* min number of warm blocks; */ + ulong age_threshold; /* age threshold for hot blocks */ + ulonglong time; /* total number of block link operations */ + uint hash_entries; /* max number of entries in the hash table */ + int hash_links; /* max number of hash links */ + int hash_links_used; /* number of hash links taken from free links pool */ + int disk_blocks; /* max number of blocks in the cache */ + ulong blocks_used; /* maximum number of concurrently used blocks */ + ulong blocks_unused; /* number of currently unused blocks */ + ulong blocks_changed; /* number of currently dirty blocks */ + ulong warm_blocks; /* number of blocks in warm sub-chain */ + ulong cnt_for_resize_op; /* counter to block resize operation */ + ulong blocks_available; /* 
number of blocks available in the LRU chain */ + PAGECACHE_HASH_LINK **hash_root;/* arr. of entries into hash table buckets */ + PAGECACHE_HASH_LINK *hash_link_root;/* memory for hash table links */ + PAGECACHE_HASH_LINK *free_hash_list;/* list of free hash links */ + PAGECACHE_BLOCK_LINK *free_block_list;/* list of free blocks */ + PAGECACHE_BLOCK_LINK *block_root;/* memory for block links */ + uchar HUGE_PTR *block_mem; /* memory for block buffers */ + PAGECACHE_BLOCK_LINK *used_last;/* ptr to the last block of the LRU chain */ + PAGECACHE_BLOCK_LINK *used_ins;/* ptr to the insertion block in LRU chain */ + pthread_mutex_t cache_lock; /* to lock access to the cache structure */ + WQUEUE resize_queue; /* threads waiting during resize operation */ + WQUEUE waiting_for_hash_link;/* waiting for a free hash link */ + WQUEUE waiting_for_block; /* requests waiting for a free block */ + /* hash for dirty file bl.*/ + PAGECACHE_BLOCK_LINK *changed_blocks[PAGECACHE_CHANGED_BLOCKS_HASH]; + /* hash for other file bl.*/ + PAGECACHE_BLOCK_LINK *file_blocks[PAGECACHE_CHANGED_BLOCKS_HASH]; + + /* + The following variables are and variables used to hold parameters for + initializing the key cache. + */ + + ulonglong param_buff_size; /* size the memory allocated for the cache */ + ulong param_block_size; /* size of the blocks in the key cache */ + ulong param_division_limit; /* min. percentage of warm blocks */ + ulong param_age_threshold; /* determines when hot block is downgraded */ + + /* Statistics variables. These are reset in reset_pagecache_counters(). 
*/ + ulong global_blocks_changed; /* number of currently dirty blocks */ + ulonglong global_cache_w_requests;/* number of write requests (write hits) */ + ulonglong global_cache_write; /* number of writes from cache to files */ + ulonglong global_cache_r_requests;/* number of read requests (read hits) */ + ulonglong global_cache_read; /* number of reads from files to cache */ + + int blocks; /* max number of blocks in the cache */ + my_bool in_init; /* Set to 1 in MySQL during init/resize */ +} PAGECACHE; + +/* The default key cache */ +extern PAGECACHE dflt_pagecache_var, *dflt_pagecache; + +extern int init_pagecache(PAGECACHE *pagecache, size_t use_mem, + uint division_limit, uint age_threshold, + uint block_size); +extern int resize_pagecache(PAGECACHE *pagecache, + size_t use_mem, uint division_limit, + uint age_threshold); +extern void change_pagecache_param(PAGECACHE *pagecache, uint division_limit, + uint age_threshold); + +#define pagecache_read(P,F,N,L,B,T,K,I) \ + pagecache_valid_read(P,F,N,L,B,T,K,I,0,0) + +extern uchar *pagecache_valid_read(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint level, + uchar *buff, + enum pagecache_page_type type, + enum pagecache_page_lock lock, + PAGECACHE_PAGE_LINK *link, + pagecache_disk_read_validator validator, + uchar* validator_data); + +#define pagecache_write(P,F,N,L,B,T,O,I,M,K) \ + pagecache_write_part(P,F,N,L,B,T,O,I,M,K,0,(P)->block_size,0,0) + +#define pagecache_inject(P,F,N,L,B,T,O,I,K,V,D) \ + pagecache_write_part(P,F,N,L,B,T,O,I,PAGECACHE_WRITE_DONE, \ + K,0,(P)->block_size,V,D) + +extern my_bool pagecache_write_part(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint level, + uchar *buff, + enum pagecache_page_type type, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + enum pagecache_write_mode write_mode, + PAGECACHE_PAGE_LINK *link, + uint offset, + uint size, + pagecache_disk_read_validator validator, + uchar* validator_data); 
+extern void pagecache_unlock(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn); +extern void pagecache_unlock_by_link(PAGECACHE *pagecache, + PAGECACHE_PAGE_LINK *link, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn); +extern void pagecache_unpin(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + LSN lsn); +extern void pagecache_unpin_by_link(PAGECACHE *pagecache, + PAGECACHE_PAGE_LINK *link, + LSN lsn); +extern int flush_pagecache_blocks(PAGECACHE *keycache, + PAGECACHE_FILE *file, + enum flush_type type); +extern my_bool pagecache_delete(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + my_bool flush); +extern my_bool pagecache_delete_pages(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint page_count, + enum pagecache_page_lock lock, + my_bool flush); +extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup); +extern my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache, + LEX_STRING *str, + LSN *min_lsn); +extern int reset_pagecache_counters(const char *name, PAGECACHE *pagecache); + + +/* Functions to handle multiple key caches */ +extern my_bool multi_pagecache_init(void); +extern void multi_pagecache_free(void); +extern PAGECACHE *multi_pagecache_search(uchar *key, uint length, + PAGECACHE *def); +extern my_bool multi_pagecache_set(const uchar *key, uint length, + PAGECACHE *pagecache); +extern void multi_pagecache_change(PAGECACHE *old_data, + PAGECACHE *new_data); +extern int reset_pagecache_counters(const char *name, + PAGECACHE *pagecache); + +C_MODE_END +#endif /* _ma_pagecache_h */ diff --git a/storage/maria/ma_pagecaches.c b/storage/maria/ma_pagecaches.c new file mode 100644 index 00000000000..a9460be10c5 --- /dev/null +++ 
b/storage/maria/ma_pagecaches.c @@ -0,0 +1,105 @@ +/* Copyright (C) 2003-2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Handling of multiple key caches + + The idea is to have a thread safe hash on the table name, + with a default key cache value that is returned if the table name is not in + the cache. +*/ + +#include "maria_def.h" +#include "ma_pagecache.h" +#include <hash.h> +#include <m_string.h> +#include "../../mysys/my_safehash.h" + +/***************************************************************************** + Functions to handle the pagecache objects +*****************************************************************************/ + +/* Variable to store all key cache objects */ +static SAFE_HASH pagecache_hash; + + +my_bool multi_pagecache_init(void) +{ + return safe_hash_init(&pagecache_hash, 16, (uchar*) maria_pagecache); +} + + +void multi_pagecache_free(void) +{ + safe_hash_free(&pagecache_hash); +} + +/* + Get a key cache to be used for a specific table. + + SYNOPSIS + multi_pagecache_search() + key key to find (usually table path) + uint length Length of key. + def Default value if no key cache + + NOTES + This function is coded in such a way that we will return the + default key cache even if one never called multi_pagecache_init. 
+ This will ensure that it works with old MyISAM clients. + + RETURN + key cache to use +*/ + +PAGECACHE *multi_pagecache_search(uchar *key, uint length, + PAGECACHE *def) +{ + if (!pagecache_hash.hash.records) + return def; + return (PAGECACHE*) safe_hash_search(&pagecache_hash, key, length, + (void*) def); +} + + +/* + Associate a key cache with a key + + + SYNOPSIS + multi_pagecache_set() + key key (path to table etc..) + length Length of key + pagecache cache to associate with the table + + NOTES + This can be used both to insert a new entry and change an existing + entry +*/ + + +my_bool multi_pagecache_set(const uchar *key, uint length, + PAGECACHE *pagecache) +{ + return safe_hash_set(&pagecache_hash, key, length, (uchar*) pagecache); +} + + +void multi_pagecache_change(PAGECACHE *old_data, + PAGECACHE *new_data) +{ + safe_hash_change(&pagecache_hash, (uchar*) old_data, (uchar*) new_data); +} diff --git a/storage/maria/ma_panic.c b/storage/maria/ma_panic.c new file mode 100644 index 00000000000..0394f630343 --- /dev/null +++ b/storage/maria/ma_panic.c @@ -0,0 +1,134 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "ma_fulltext.h" + +/* + Stop usage of Maria + + SYNOPSIS + maria_panic() + flag HA_PANIC_CLOSE: All maria files (tables and log) are closed. + maria_end() is called. 
+ HA_PANIC_WRITE: All misam files are unlocked and + all changed data in single user maria is + written to file + HA_PANIC_READ All maria files that was locked when + maria_panic(HA_PANIC_WRITE) was done is + locked. A maria_readinfo() is done for + all single user files to get changes + in database + + RETURN + 0 ok + # error number in case of error +*/ + +int maria_panic(enum ha_panic_function flag) +{ + int error=0; + LIST *list_element,*next_open; + MARIA_HA *info; + DBUG_ENTER("maria_panic"); + + if (!maria_inited) + DBUG_RETURN(0); + pthread_mutex_lock(&THR_LOCK_maria); + for (list_element=maria_open_list ; list_element ; list_element=next_open) + { + next_open=list_element->next; /* Save if close */ + info=(MARIA_HA*) list_element->data; + switch (flag) { + case HA_PANIC_CLOSE: + /* + If bad luck (if some tables would be used now, which normally does not + happen in MySQL), as we release the mutex, the list may change and so + we may crash. + */ + pthread_mutex_unlock(&THR_LOCK_maria); + if (maria_close(info)) + error=my_errno; + pthread_mutex_lock(&THR_LOCK_maria); + break; + case HA_PANIC_WRITE: /* Do this to free databases */ +#ifdef CANT_OPEN_FILES_TWICE + if (info->s->options & HA_OPTION_READ_ONLY_DATA) + break; +#endif + if (flush_pagecache_blocks(info->s->pagecache, &info->s->kfile, + FLUSH_RELEASE)) + error=my_errno; + if (info->opt_flag & WRITE_CACHE_USED) + if (flush_io_cache(&info->rec_cache)) + error=my_errno; + if (info->opt_flag & READ_CACHE_USED) + { + if (flush_io_cache(&info->rec_cache)) + error=my_errno; + reinit_io_cache(&info->rec_cache,READ_CACHE,0, + (pbool) (info->lock_type != F_UNLCK),1); + } + if (info->lock_type != F_UNLCK && ! 
info->was_locked) + { + info->was_locked=info->lock_type; + if (maria_lock_database(info,F_UNLCK)) + error=my_errno; + } +#ifdef CANT_OPEN_FILES_TWICE + if (info->s->kfile.file >= 0 && my_close(info->s->kfile.file, MYF(0))) + error = my_errno; + if (info->dfile.file >= 0 && my_close(info->dfile.file, MYF(0))) + error = my_errno; + info->s->kfile.file= info->dfile.file= -1;/* Files aren't open anymore */ + break; +#endif + case HA_PANIC_READ: /* Restore to before WRITE */ +#ifdef CANT_OPEN_FILES_TWICE + { /* Open closed files */ + char name_buff[FN_REFLEN]; + if (info->s->kfile.file < 0) + if ((info->s->kfile.file= my_open(fn_format(name_buff, + info->filename, "", + N_NAME_IEXT,4), + info->mode, + MYF(MY_WME))) < 0) + error = my_errno; + if (info->dfile.file < 0) + { + if ((info->dfile.file= my_open(fn_format(name_buff, info->filename, + "", N_NAME_DEXT, 4), + info->mode, + MYF(MY_WME))) < 0) + error = my_errno; + info->rec_cache.file= info->dfile.file; + } + } +#endif + if (info->was_locked) + { + if (maria_lock_database(info, info->was_locked)) + error=my_errno; + info->was_locked=0; + } + break; + } + } + pthread_mutex_unlock(&THR_LOCK_maria); + if (flag == HA_PANIC_CLOSE) + maria_end(); + if (!error) + DBUG_RETURN(0); + DBUG_RETURN(my_errno=error); +} /* maria_panic */ diff --git a/storage/maria/ma_preload.c b/storage/maria/ma_preload.c new file mode 100644 index 00000000000..138bb94f7d0 --- /dev/null +++ b/storage/maria/ma_preload.c @@ -0,0 +1,133 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Preload indexes into key cache +*/ + +#include "maria_def.h" + + +/* + Preload pages of the index file for a table into the key cache + + SYNOPSIS + maria_preload() + info open table + map map of indexes to preload into key cache + ignore_leaves only non-leaves pages are to be preloaded + + RETURN VALUE + 0 if a success. error code - otherwise. + + NOTES. + At present pages for all indexes are preloaded. + In future only pages for indexes specified in the key_map parameter + of the table will be preloaded. +*/ + +int maria_preload(MARIA_HA *info, ulonglong key_map, my_bool ignore_leaves) +{ + uint i; + ulong length, block_length= 0; + uchar *buff= NULL; + MARIA_SHARE* share= info->s; + uint keys= share->state.header.keys; + MARIA_KEYDEF *keyinfo= share->keyinfo; + my_off_t key_file_length= share->state.state.key_file_length; + my_off_t pos= share->base.keystart; + DBUG_ENTER("maria_preload"); + + if (!keys || !maria_is_any_key_active(key_map) || key_file_length == pos) + DBUG_RETURN(0); + + block_length= keyinfo[0].block_length; + + if (ignore_leaves) + { + /* Check whether all indexes use the same block size */ + for (i= 1 ; i < keys ; i++) + { + if (keyinfo[i].block_length != block_length) + DBUG_RETURN(my_errno= HA_ERR_NON_UNIQUE_BLOCK_SIZE); + } + } + else + block_length= share->pagecache->block_size; + + length= info->preload_buff_size/block_length * block_length; + set_if_bigger(length, block_length); + + if (!(buff= (uchar *) my_malloc(length, MYF(MY_WME)))) + DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM); + + if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE)) + goto err; + + do + { + /* Read the next block of index file into the preload buffer */ + if ((my_off_t) length > (key_file_length-pos)) + length= (ulong) 
(key_file_length-pos); + if (my_pread(share->kfile.file, (uchar*) buff, length, pos, + MYF(MY_FAE|MY_FNABP))) + goto err; + + if (ignore_leaves) + { + uchar *end= buff+length; + do + { + if (_ma_test_if_nod(buff)) + { + DBUG_ASSERT(share->pagecache->block_size == block_length); + if (pagecache_write(share->pagecache, + &share->kfile, pos / block_length, + DFLT_INIT_HITS, + (uchar*) buff, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DONE, 0)) + goto err; + } + pos+= block_length; + } + while ((buff+= block_length) != end); + buff= end-length; + } + else + { + if (pagecache_write(share->pagecache, + &share->kfile, pos / block_length, + DFLT_INIT_HITS, + (uchar*) buff, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DONE, 0)) + goto err; + pos+= length; + } + } + while (pos != key_file_length); + + my_free((char*) buff, MYF(0)); + DBUG_RETURN(0); + +err: + my_free((char*) buff, MYF(MY_ALLOW_ZERO_PTR)); + DBUG_RETURN(my_errno= errno); +} diff --git a/storage/maria/ma_range.c b/storage/maria/ma_range.c new file mode 100644 index 00000000000..02616d8ac5c --- /dev/null +++ b/storage/maria/ma_range.c @@ -0,0 +1,295 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
/*
  Gives an approximate number of how many records there are between two keys.
  Used when optimizing queries.
 */

#include "maria_def.h"
#include "ma_rt_index.h"

static ha_rows _ma_record_pos(MARIA_HA *,const uchar *, key_part_map,
                              enum ha_rkey_function);
static double _ma_search_pos(MARIA_HA *, MARIA_KEYDEF *, uchar *,
                             uint, uint, my_off_t);
static uint _ma_keynr(MARIA_HA *, MARIA_KEYDEF *, uchar *, uchar *, uint *);


/**
   @brief Estimate how many records there are in a given range

   @param  info            MARIA handler
   @param  inx             Index to use
   @param  min_key         Min key. Is = 0 if no min range
   @param  max_key         Max key. Is = 0 if no max range

   @note
     We should ONLY return 0 if there are no rows in range

   @return Estimated number of rows or error
     @retval HA_POS_ERROR  error (or we can't estimate number of rows)
     @retval number        Estimated number of rows
*/

ha_rows maria_records_in_range(MARIA_HA *info, int inx, key_range *min_key,
                               key_range *max_key)
{
  ha_rows start_pos,end_pos,res;
  DBUG_ENTER("maria_records_in_range");

  if ((inx = _ma_check_index(info,inx)) < 0)
    DBUG_RETURN(HA_POS_ERROR);

  if (fast_ma_readinfo(info))
    DBUG_RETURN(HA_POS_ERROR);
  info->update&= (HA_STATE_CHANGED+HA_STATE_ROW_CHANGED);
  /* Hold the key root read lock while the index tree is walked */
  if (info->s->concurrent_insert)
    rw_rdlock(&info->s->key_root_lock[inx]);

  switch(info->s->keyinfo[inx].key_alg){
#ifdef HAVE_RTREE_KEYS
  case HA_KEY_ALG_RTREE:
  {
    uchar *key_buff;
    uint start_key_len;

    /*
      The problem is that the optimizer doesn't support
      RTree keys properly at the moment.
      Hope this will be fixed some day.
      But now NULL in the min_key means that we
      didn't make the task for the RTree key
      and expect BTree functionality from it.
      As it's not able to handle such request
      we return the error.
    */
    if (!min_key)
    {
      res= HA_POS_ERROR;
      break;
    }
    /* The packed key is built in the second half of the lastkey buffer */
    key_buff= info->lastkey+info->s->base.max_key_length;
    start_key_len= _ma_pack_key(info,inx, key_buff,
                                min_key->key, min_key->keypart_map,
                                (HA_KEYSEG**) 0);
    res= maria_rtree_estimate(info, inx, key_buff, start_key_len,
                              maria_read_vec[min_key->flag]);
    res= res ? res : 1;                       /* Don't return 0 */
    break;
  }
#endif
  case HA_KEY_ALG_BTREE:
  default:
    /* A missing bound maps to position 0 (min) or records+1 (max) */
    start_pos= (min_key ?
                _ma_record_pos(info, min_key->key, min_key->keypart_map,
                               min_key->flag) :
                (ha_rows) 0);
    end_pos=   (max_key ?
                _ma_record_pos(info, max_key->key, max_key->keypart_map,
                               max_key->flag) :
                info->state->records + (ha_rows) 1);
    res= (end_pos < start_pos ? (ha_rows) 0 :
          (end_pos == start_pos ? (ha_rows) 1 : end_pos-start_pos));
    /* An error in either position estimate overrides the computed range */
    if (start_pos == HA_POS_ERROR || end_pos == HA_POS_ERROR)
      res=HA_POS_ERROR;
  }

  if (info->s->concurrent_insert)
    rw_unlock(&info->s->key_root_lock[inx]);
  fast_ma_writeinfo(info);

  /**
     @todo LOCK
     If res==0 (no rows), if we need to guarantee repeatability of the search,
     we will need to set a next-key lock in this statement.
     Also SELECT COUNT(*)...
  */

  DBUG_PRINT("info",("records: %ld",(ulong) (res)));
  DBUG_RETURN(res);
}


	/* Find relative position (in records) for key in index-tree */
	/*
	  The index used is info->lastinx (presumably set by the
	  _ma_check_index() call in the caller - TODO confirm).
	  Returns HA_POS_ERROR if _ma_search_pos() reports an error.
	*/

static ha_rows _ma_record_pos(MARIA_HA *info, const uchar *key,
                              key_part_map keypart_map,
                              enum ha_rkey_function search_flag)
{
  uint inx=(uint) info->lastinx, nextflag, key_len;
  MARIA_KEYDEF *keyinfo=info->s->keyinfo+inx;
  uchar *key_buff;
  double pos;
  DBUG_ENTER("_ma_record_pos");
  DBUG_PRINT("enter",("search_flag: %d",search_flag));
  DBUG_ASSERT(keypart_map);

  key_buff=info->lastkey+info->s->base.max_key_length;
  key_len= _ma_pack_key(info, inx, key_buff, key, keypart_map,
                        (HA_KEYSEG**) 0);
  DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, keyinfo->seg,
                                    key_buff, key_len););
  nextflag=maria_read_vec[search_flag];
  if (!(nextflag & (SEARCH_FIND | SEARCH_NO_FIND | SEARCH_LAST)))
    key_len=USE_WHOLE_KEY;

  /*
    my_handler.c:mi_compare_text() has a flag 'skip_end_space'.
    This is set in my_handler.c:ha_key_cmp() in dependence on the
    compare flags 'nextflag' and the column type.

    TEXT columns are of type HA_KEYTYPE_VARTEXT. In this case the
    condition is skip_end_space= ((nextflag & (SEARCH_FIND |
    SEARCH_UPDATE)) == SEARCH_FIND).

    SEARCH_FIND is used for an exact key search. The combination
    SEARCH_FIND | SEARCH_UPDATE is used in write/update/delete
    operations with a comment like "Not real duplicates", whatever this
    means. From the condition above we can see that 'skip_end_space' is
    always false for these operations. The result is that trailing space
    counts in key comparison and hence, empty strings ('', string length
    zero, but not NULL) compare less than strings starting with control
    characters and these in turn compare less than strings starting with
    blanks.

    When estimating the number of records in a key range, we request an
    exact search for the minimum key. This translates into a plain
    SEARCH_FIND flag. Using this alone would lead to a 'skip_end_space'
    compare. Empty strings would be expected above control characters.
    Their keys would not be found because they are located below control
    characters.

    This is the reason that we add the SEARCH_UPDATE flag here. It makes
    the key estimation compare in the same way as key write operations
    do. Only so will we find the keys where they have been inserted.

    Adding the flag unconditionally does not hurt as it is used in the
    above mentioned condition only. So it can safely be used together
    with other flags.
  */
  pos= _ma_search_pos(info,keyinfo, key_buff, key_len,
                      nextflag | SEARCH_SAVE_BUFF | SEARCH_UPDATE,
                      info->s->state.key_root[inx]);
  if (pos >= 0.0)
  {
    DBUG_PRINT("exit",("pos: %ld",(ulong) (pos*info->state->records)));
    /* Scale the relative position by the row count, rounding to nearest */
    DBUG_RETURN((ulong) (pos*info->state->records+0.5));
  }
  DBUG_RETURN(HA_POS_ERROR);
}


	/* This is a modified version of _ma_search */
	/* Returns offset for key in indextable (decimal 0.0 <= x <= 1.0) */
	/* A negative return value signals an error */

static double _ma_search_pos(register MARIA_HA *info,
                             register MARIA_KEYDEF *keyinfo,
                             uchar *key, uint key_len, uint nextflag,
                             register my_off_t pos)
{
  int flag;
  uint nod_flag,keynr,max_keynr;
  my_bool after_key;
  uchar *keypos, *buff;
  double offset;
  DBUG_ENTER("_ma_search_pos");
  LINT_INIT(max_keynr);

  /* No page at this position: answer "middle of the range" */
  if (pos == HA_OFFSET_ERROR)
    DBUG_RETURN(0.5);

  if (!(buff= _ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS,info->buff,1)))
    goto err;
  flag=(*keyinfo->bin_search)(info, keyinfo, buff, key, key_len, nextflag,
                              &keypos,info->lastkey, &after_key);
  nod_flag=_ma_test_if_nod(buff);
  /*
    NOTE(review): _ma_keynr() returns 0 without storing to max_keynr when
    get_key fails; max_keynr is then only initialized if LINT_INIT does so.
  */
  keynr= _ma_keynr(info,keyinfo,buff,keypos,&max_keynr);

  if (flag)
  {
    if (flag == MARIA_FOUND_WRONG_KEY)
      DBUG_RETURN(-1);                          /* error */
    /*
      Didn't find a match. keypos points at next (bigger) key
      Try to find a smaller, better matching key.
      Matches keynr + [0-1]
    */
    if (flag > 0 && ! nod_flag)
      offset= 1.0;
    else if ((offset= _ma_search_pos(info,keyinfo,key,key_len,nextflag,
                                     _ma_kpos(nod_flag,keypos))) < 0)
      DBUG_RETURN(offset);
  }
  else
  {
    /*
      Found match. Keypos points at the start of the found key
      Matches keynr+1
    */
    offset=1.0;                                 /* Matches keynr+1 */
    if ((nextflag & SEARCH_FIND) && nod_flag &&
        ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME ||
         key_len != USE_WHOLE_KEY))
    {
      /*
        There may be identical keys in the tree. Try to match one of those.
        Matches keynr + [0-1]
      */
      if ((offset= _ma_search_pos(info,keyinfo,key,key_len,SEARCH_FIND,
                                  _ma_kpos(nod_flag,keypos))) < 0)
        DBUG_RETURN(offset);                    /* Read error */
    }
  }
  DBUG_PRINT("info",("keynr: %d  offset: %g  max_keynr: %d  nod: %d  flag: %d",
                     keynr,offset,max_keynr,nod_flag,flag));
  /* Position of the key within this page, as a fraction of the page */
  DBUG_RETURN((keynr+offset)/(max_keynr+1));
err:
  DBUG_PRINT("exit",("Error: %d",my_errno));
  DBUG_RETURN (-1.0);
}


	/* Get key number of current key and max number of keys in node */
	/*
	  Returns the number of keys on the page that precede keypos and
	  stores the total number of keys on the page in *ret_max_key.
	  On a get_key failure 0 is returned and *ret_max_key is not written.
	*/

static uint _ma_keynr(MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
                      uchar *page, uchar *keypos, uint *ret_max_key)
{
  uint nod_flag,keynr,max_key;
  uchar t_buff[HA_MAX_KEY_BUFF],*end;

  end= page+maria_data_on_page(page);
  nod_flag=_ma_test_if_nod(page);
  page+=2+nod_flag;

  /* Fixed-length keys: both numbers follow from simple division */
  if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)))
  {
    *ret_max_key= (uint) (end-page)/(keyinfo->keylength+nod_flag);
    return (uint) (keypos-page)/(keyinfo->keylength+nod_flag);
  }

  /* Variable-length or packed keys: step through the page key by key */
  max_key=keynr=0;
  t_buff[0]=0;                                  /* Safety */
  while (page < end)
  {
    if (!(*keyinfo->get_key)(keyinfo,nod_flag,&page,t_buff))
      return 0;                                 /* Error */
    max_key++;
    if (page == keypos)
      keynr=max_key;
  }
  *ret_max_key=max_key;
  return(keynr);
}
software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3072 Maria recovery + First version written by Guilhem Bichot on 2006-04-27. +*/ + +/* Here is the implementation of this module */ + +#include "maria_def.h" +#include "ma_recovery.h" +#include "ma_blockrec.h" +#include "trnman.h" + +struct st_trn_for_recovery /* used only in the REDO phase */ +{ + LSN group_start_lsn, undo_lsn, first_undo_lsn; + TrID long_trid; +}; +struct st_dirty_page /* used only in the REDO phase */ +{ + uint64 file_and_page_id; + LSN rec_lsn; +}; +struct st_table_for_recovery /* used in the REDO and UNDO phase */ +{ + MARIA_HA *info; + File org_kfile, org_dfile; /**< OS descriptors when Checkpoint saw table */ +}; +/* Variables used by all functions of this module. 
Ok as single-threaded */ +static struct st_trn_for_recovery *all_active_trans; +static struct st_table_for_recovery *all_tables; +static HASH all_dirty_pages; +static struct st_dirty_page *dirty_pages_pool; +static LSN current_group_end_lsn, + checkpoint_start= LSN_IMPOSSIBLE; +static TrID max_long_trid= 0; /**< max long trid seen by REDO phase */ +static FILE *tracef; /**< trace file for debugging */ +static my_bool skip_DDLs; /**< if REDO phase should skip DDL records */ + +#define prototype_redo_exec_hook(R) \ + static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec) + +#define prototype_redo_exec_hook_dummy(R) \ + static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec \ + __attribute ((unused))) + +#define prototype_undo_exec_hook(R) \ + static int exec_UNDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec, TRN *trn) + +prototype_redo_exec_hook(LONG_TRANSACTION_ID); +prototype_redo_exec_hook_dummy(CHECKPOINT); +prototype_redo_exec_hook(REDO_CREATE_TABLE); +prototype_redo_exec_hook(REDO_RENAME_TABLE); +prototype_redo_exec_hook(REDO_REPAIR_TABLE); +prototype_redo_exec_hook(REDO_DROP_TABLE); +prototype_redo_exec_hook(FILE_ID); +prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD); +prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL); +prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD); +prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL); +prototype_redo_exec_hook(REDO_PURGE_BLOCKS); +prototype_redo_exec_hook(REDO_DELETE_ALL); +prototype_redo_exec_hook(UNDO_ROW_INSERT); +prototype_redo_exec_hook(UNDO_ROW_DELETE); +prototype_redo_exec_hook(UNDO_ROW_UPDATE); +prototype_redo_exec_hook(COMMIT); +prototype_redo_exec_hook(CLR_END); +prototype_undo_exec_hook(UNDO_ROW_INSERT); +prototype_undo_exec_hook(UNDO_ROW_DELETE); +prototype_undo_exec_hook(UNDO_ROW_UPDATE); + +static int run_redo_phase(LSN lsn, my_bool apply); +static uint end_of_redo_phase(my_bool prepare_for_undo_phase); +static int run_undo_phase(uint unfinished); +static void display_record_position(const 
LOG_DESC *log_desc, + const TRANSLOG_HEADER_BUFFER *rec, + uint number); +static int display_and_apply_record(const LOG_DESC *log_desc, + const TRANSLOG_HEADER_BUFFER *rec); +static MARIA_HA *get_MARIA_HA_from_REDO_record(const + TRANSLOG_HEADER_BUFFER *rec); +static MARIA_HA *get_MARIA_HA_from_UNDO_record(const + TRANSLOG_HEADER_BUFFER *rec); +static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon); +static LSN parse_checkpoint_record(LSN lsn); +static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn, + LSN first_undo_lsn); +static int new_table(uint16 sid, const char *name, + File org_kfile, File org_dfile, + LSN lsn_of_file_id); +static int new_page(File fileid, pgcache_page_no_t pageid, LSN rec_lsn, + struct st_dirty_page *dirty_page); +static int close_all_tables(void); +static void print_redo_phase_progress(TRANSLOG_ADDRESS addr); + +/** @brief global [out] buffer for translog_read_record(); never shrinks */ +static LEX_STRING log_record_buffer; +static void enlarge_buffer(const TRANSLOG_HEADER_BUFFER *rec) +{ + if (log_record_buffer.length < rec->record_length) + { + log_record_buffer.length= rec->record_length; + log_record_buffer.str= my_realloc(log_record_buffer.str, + rec->record_length, + MYF(MY_WME | MY_ALLOW_ZERO_PTR)); + } +} +static my_bool redo_phase_message_printed; +/** @brief Prints to a trace file if it is not NULL */ +void tprint(FILE *trace_file, const char *format, ...) + ATTRIBUTE_FORMAT(printf, 2, 3); +void tprint(FILE *trace_file, const char *format, ...) +{ + va_list args; + va_start(args, format); + if (trace_file != NULL) + vfprintf(trace_file, format, args); + va_end(args); +} + +#define ALERT_USER() DBUG_ASSERT(0) + + +/** + @brief Recovers from the last checkpoint. + + Runs the REDO phase using special structures, then sets up the playground + of runtime: recreates transactions inside trnman, open tables with their + two-byte-id mapping; takes a checkpoint and runs the UNDO phase. 
Closes all + tables. + + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +int maria_recover(void) +{ + int res= 1; + FILE *trace_file; + DBUG_ENTER("maria_recover"); + + DBUG_ASSERT(!maria_in_recovery); + maria_in_recovery= TRUE; + +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + trace_file= fopen("maria_recovery.trace", "w"); +#else + trace_file= NULL; /* no trace file for being fast */ +#endif + tprint(trace_file, "TRACE of the last MARIA recovery from mysqld\n"); + DBUG_ASSERT(maria_pagecache->inited); + res= maria_apply_log(LSN_IMPOSSIBLE, TRUE, trace_file, TRUE, TRUE); + if (!res) + tprint(trace_file, "SUCCESS\n"); + if (trace_file) + fclose(trace_file); + maria_in_recovery= FALSE; + DBUG_RETURN(res); +} + + +/** + @brief Displays and/or applies the log + + @param from_lsn LSN from which log reading/applying should start; + LSN_IMPOSSIBLE means "use last checkpoint" + @param apply if log records should be applied or not + @param trace_file trace file where progress/debug messages will go + @param skip_DDLs Should DDL records (CREATE/RENAME/DROP/REPAIR) + be skipped by the REDO phase or not + + @todo This trace_file thing is primitive; soon we will make it similar to + ma_check_print_warning() etc, and a successful recovery does not need to + create a trace file. But for debugging now it is useful. 
+ + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +int maria_apply_log(LSN from_lsn, my_bool apply, FILE *trace_file, + my_bool should_run_undo_phase, my_bool skip_DDLs_arg) +{ + int error= 0; + uint unfinished_trans; + DBUG_ENTER("maria_apply_log"); + + DBUG_ASSERT(apply || !should_run_undo_phase); + DBUG_ASSERT(!maria_multi_threaded); + all_active_trans= (struct st_trn_for_recovery *) + my_malloc((SHORT_TRID_MAX + 1) * sizeof(struct st_trn_for_recovery), + MYF(MY_ZEROFILL)); + all_tables= (struct st_table_for_recovery *) + my_malloc((SHARE_ID_MAX + 1) * sizeof(struct st_table_for_recovery), + MYF(MY_ZEROFILL)); + if (!all_active_trans || !all_tables) + goto err; + + redo_phase_message_printed= FALSE; + tracef= trace_file; + if (!(skip_DDLs= skip_DDLs_arg)) + { + /* + Example of what can go wrong when replaying DDLs: + CREATE TABLE t (logged); INSERT INTO t VALUES(1) (logged); + ALTER TABLE t ... which does + CREATE a temporary table #sql... (logged) + INSERT data from t into #sql... (not logged) + RENAME #sql TO t (logged) + Removing tables by hand and replaying the log will leave in the + end an empty table "t": missing records. If after the RENAME an INSERT + into t was done, that row had number 1 in its page, executing the + REDO_INSERT_ROW_HEAD on the recreated empty t will fail (assertion + failure in _ma_apply_redo_insert_row_head_or_tail(): new data page is + created whereas rownr is not 0). + Another issue is that replaying of DDLs is not correct enough to work if + there was a crash during a DDL (see comment in execution of + REDO_RENAME_TABLE ). + */ + tprint(tracef, "WARNING: MySQL server currently disables log records" + " about insertion of data by ALTER TABLE" + " (copy_data_between_tables()), applying of log records may" + " well not work. 
Additionally, applying of DDL records will" + " cause damage if there are tables left by a crash of a DDL.\n"); + } + + if (from_lsn == LSN_IMPOSSIBLE) + { + if (last_checkpoint_lsn == LSN_IMPOSSIBLE) + { + from_lsn= translog_first_theoretical_lsn(); + /* + as far as we have not yet any checkpoint then the very first + log file should be present. + */ + if (unlikely((from_lsn == LSN_IMPOSSIBLE) || + (from_lsn == LSN_ERROR))) + goto err; + } + else + { + from_lsn= parse_checkpoint_record(last_checkpoint_lsn); + if (from_lsn == LSN_IMPOSSIBLE) + goto err; + from_lsn= translog_next_LSN(from_lsn, LSN_IMPOSSIBLE); + if (from_lsn == LSN_ERROR) + goto err; + /* + from_lsn LSN_IMPOSSIBLE will be correctly processed + by run_redo_phase() + */ + } + } + + if (run_redo_phase(from_lsn, apply)) + goto err; + + unfinished_trans= end_of_redo_phase(should_run_undo_phase); + if (unfinished_trans == (uint)-1) + goto err; + if (should_run_undo_phase) + { + if (run_undo_phase(unfinished_trans)) + return 1; + } + else if (unfinished_trans > 0) + tprint(tracef, "WARNING: %u unfinished transactions; some tables may be" + " left inconsistent!\n", unfinished_trans); + + /* + we don't use maria_panic() because it would maria_end(), and Recovery does + not want that (we want to keep some modules initialized for runtime). 
+ */ + if (close_all_tables()) + goto err; + + /* If inside ha_maria, a checkpoint will soon be taken and save our work */ + goto end; +err: + error= 1; + tprint(tracef, "Recovery of tables with transaction logs FAILED\n"); +end: + hash_free(&all_dirty_pages); + bzero(&all_dirty_pages, sizeof(all_dirty_pages)); + my_free(dirty_pages_pool, MYF(MY_ALLOW_ZERO_PTR)); + dirty_pages_pool= NULL; + my_free(all_tables, MYF(MY_ALLOW_ZERO_PTR)); + all_tables= NULL; + my_free(all_active_trans, MYF(MY_ALLOW_ZERO_PTR)); + all_active_trans= NULL; + my_free(log_record_buffer.str, MYF(MY_ALLOW_ZERO_PTR)); + log_record_buffer.str= NULL; + log_record_buffer.length= 0; + if (tracef != stdout && redo_phase_message_printed) + { + /** @todo RECOVERY BUG all prints to stderr should go to error log */ + fprintf(stderr, "\n"); + } + /* we don't cleanly close tables if we hit some error (may corrupt them) */ + DBUG_RETURN(error); +} + + +/* very basic info about the record's header */ +static void display_record_position(const LOG_DESC *log_desc, + const TRANSLOG_HEADER_BUFFER *rec, + uint number) +{ + /* + if number==0, we're going over records which we had already seen and which + form a group, so we indent below the group's end record + */ + tprint(tracef, "%sRec#%u LSN (%lu,0x%lx) short_trid %u %s(num_type:%u) len %lu\n", + number ? 
"" : " ", number, LSN_IN_PARTS(rec->lsn), + rec->short_trid, log_desc->name, rec->type, + (ulong)rec->record_length); +} + + +static int display_and_apply_record(const LOG_DESC *log_desc, + const TRANSLOG_HEADER_BUFFER *rec) +{ + int error; + if (log_desc->record_execute_in_redo_phase == NULL) + { + /* die on all not-yet-handled records :) */ + DBUG_ASSERT("one more hook" == "to write"); + return 1; + } + if ((error= (*log_desc->record_execute_in_redo_phase)(rec))) + tprint(tracef, "Got error when executing redo on record\n"); + return error; +} + + +prototype_redo_exec_hook(LONG_TRANSACTION_ID) +{ + uint16 sid= rec->short_trid; + TrID long_trid= all_active_trans[sid].long_trid; + /* abort group of this trn (must be of before a crash) */ + LSN gslsn= all_active_trans[sid].group_start_lsn; + if (gslsn != LSN_IMPOSSIBLE) + { + tprint(tracef, "Group at LSN (%lu,0x%lx) short_trid %u aborted\n", + LSN_IN_PARTS(gslsn), sid); + all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE; + } + if (long_trid != 0) + { + LSN ulsn= all_active_trans[sid].undo_lsn; + if (ulsn != LSN_IMPOSSIBLE) + { + char llbuf[22]; + llstr(long_trid, llbuf); + tprint(tracef, "Found an old transaction long_trid %s short_trid %u" + " with same short id as this new transaction, and has neither" + " committed nor rollback (undo_lsn: (%lu,0x%lx))\n", llbuf, + sid, LSN_IN_PARTS(ulsn)); + goto err; + } + } + long_trid= uint6korr(rec->header); + new_transaction(sid, long_trid, LSN_IMPOSSIBLE, LSN_IMPOSSIBLE); + goto end; +err: + ALERT_USER(); + return 1; +end: + return 0; +} + + +static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn, + LSN first_undo_lsn) +{ + char llbuf[22]; + all_active_trans[sid].long_trid= long_id; + llstr(long_id, llbuf); + tprint(tracef, "Transaction long_trid %s short_trid %u starts\n", + llbuf, sid); + all_active_trans[sid].undo_lsn= undo_lsn; + all_active_trans[sid].first_undo_lsn= first_undo_lsn; + set_if_bigger(max_long_trid, long_id); +} + + 
+prototype_redo_exec_hook_dummy(CHECKPOINT) +{ + /* the only checkpoint we care about was found via control file, ignore */ + return 0; +} + + +prototype_redo_exec_hook(REDO_CREATE_TABLE) +{ + File dfile= -1, kfile= -1; + char *linkname_ptr, filename[FN_REFLEN]; + char *name, *ptr; + myf create_flag; + uint flags; + int error= 1, create_mode= O_RDWR | O_TRUNC; + MARIA_HA *info= NULL; + if (skip_DDLs) + { + tprint(tracef, "we skip DDLs\n"); + return 0; + } + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + goto end; + } + name= log_record_buffer.str; + tprint(tracef, "Table '%s'", name); + /* we try hard to get create_rename_lsn, to avoid mistakes if possible */ + info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR); + if (info) + { + MARIA_SHARE *share= info->s; + /* check that we're not already using it */ + if (share->reopen != 1) + { + tprint(tracef, ", is already open (reopen=%u)\n", share->reopen); + ALERT_USER(); + goto end; + } + DBUG_ASSERT(share->now_transactional == share->base.born_transactional); + if (!share->base.born_transactional) + { + /* + could be that transactional table was later dropped, and a non-trans + one was renamed to its name, thus create_rename_lsn is 0 and should + not be trusted. + */ + tprint(tracef, ", is not transactional, ignoring creation\n"); + ALERT_USER(); + error= 0; + goto end; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than" + " record, ignoring creation", + LSN_IN_PARTS(share->state.create_rename_lsn)); + error= 0; + goto end; + } + if (maria_is_crashed(info)) + { + tprint(tracef, ", is crashed, can't recreate it"); + ALERT_USER(); + goto end; + } + maria_close(info); + info= NULL; + } + else /* one or two files absent, or header corrupted... 
*/ + tprint(tracef, "can't be opened, probably does not exist"); + /* if does not exist, or is older, overwrite it */ + /** @todo symlinks */ + ptr= name + strlen(name) + 1; + if ((flags= ptr[0] ? HA_DONT_TOUCH_DATA : 0)) + tprint(tracef, ", we will only touch index file"); + fn_format(filename, name, "", MARIA_NAME_IEXT, + (MY_UNPACK_FILENAME | + (flags & HA_DONT_TOUCH_DATA) ? MY_RETURN_REAL_PATH : 0) | + MY_APPEND_EXT); + linkname_ptr= NULL; + create_flag= MY_DELETE_OLD; + tprint(tracef, ", creating as '%s'", filename); + if ((kfile= my_create_with_symlink(linkname_ptr, filename, 0, create_mode, + MYF(MY_WME|create_flag))) < 0) + { + tprint(tracef, "Failed to create index file\n"); + goto end; + } + ptr++; + uint kfile_size_before_extension= uint2korr(ptr); + ptr+= 2; + uint keystart= uint2korr(ptr); + ptr+= 2; + /* set create_rename_lsn (for maria_read_log to be idempotent) */ + lsn_store(ptr + sizeof(info->s->state.header) + 2, rec->lsn); + /* we also set is_of_horizon, like maria_create() does */ + lsn_store(ptr + sizeof(info->s->state.header) + 2 + LSN_STORE_SIZE, + rec->lsn); + if (my_pwrite(kfile, ptr, + kfile_size_before_extension, 0, MYF(MY_NABP|MY_WME)) || + my_chsize(kfile, keystart, 0, MYF(MY_WME))) + { + tprint(tracef, "Failed to write to index file\n"); + goto end; + } + if (!(flags & HA_DONT_TOUCH_DATA)) + { + fn_format(filename,name,"", MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT); + linkname_ptr= NULL; + create_flag=MY_DELETE_OLD; + if (((dfile= + my_create_with_symlink(linkname_ptr, filename, 0, create_mode, + MYF(MY_WME | create_flag))) < 0) || + my_close(dfile, MYF(MY_WME))) + { + tprint(tracef, "Failed to create data file\n"); + goto end; + } + /* + we now have an empty data file. To be able to + _ma_initialize_data_file() we need some pieces of the share to be + correctly filled. So we just open the table (fortunately, an empty + data file does not preclude this). 
+ */ + if (((info= maria_open(name, O_RDONLY, 0)) == NULL) || + _ma_initialize_data_file(info->s, info->dfile.file)) + { + tprint(tracef, "Failed to open new table or write to data file\n"); + goto end; + } + } + error= 0; +end: + tprint(tracef, "\n"); + if (kfile >= 0) + error|= my_close(kfile, MYF(MY_WME)); + if (info != NULL) + error|= maria_close(info); + return error; +} + + +prototype_redo_exec_hook(REDO_RENAME_TABLE) +{ + char *old_name, *new_name; + int error= 1; + MARIA_HA *info= NULL; + if (skip_DDLs) + { + tprint(tracef, "we skip DDLs\n"); + return 0; + } + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + goto end; + } + old_name= log_record_buffer.str; + new_name= old_name + strlen(old_name) + 1; + tprint(tracef, "Table '%s' to rename to '%s'; old-name table ", old_name, + new_name); + /* + Here is why we skip CREATE/DROP/RENAME when doing a recovery from + ha_maria (whereas we do when called from maria_read_log). Consider: + CREATE TABLE t; + RENAME TABLE t to u; + DROP TABLE u; + RENAME TABLE v to u; # crash between index rename and data rename. + And do a Recovery (not removing tables beforehand). + Recovery replays CREATE, then RENAME: the maria_open("t") works, + maria_open("u") does not (no data file) so table "u" is considered + inexistent and so maria_rename() is done which overwrites u's index file, + which is lost. Ok, the data file (v.MAD) is still available, but only a + REPAIR USE_FRM can rebuild the index, which is unsafe and downtime. + So it is preferrable to not execute RENAME, and leave the "mess" of files, + rather than possibly destroy a file. DBA will manually rename files. 
+ A safe recovery method would probably require checking the existence of + the index file and of the data file separately (not via maria_open()), and + maybe also to store a create_rename_lsn in the data file too + For now, all we risk is to leave the mess (half-renamed files) left by the + crash. We however sync files and directories at each file rename. The SQL + layer is anyway not crash-safe for DDLs (except the repartioning-related + ones). + We replay DDLs in maria_read_log to be able to recreate tables from + scratch. It means that "maria_read_log -a" should not be used on a + database which just crashed during a DDL. And also ALTER TABLE does not + log insertions of records into the temporary table, so replaying may + fail (see comment and warning in maria_apply_log()). + */ + info= maria_open(old_name, O_RDONLY, HA_OPEN_FOR_REPAIR); + if (info) + { + MARIA_SHARE *share= info->s; + /* + We may have open instances on this table. But it does not matter, the + maria_extra() below will take care of them. + */ + if (!share->base.born_transactional) + { + tprint(tracef, ", is not transactional, ignoring renaming\n"); + ALERT_USER(); + error= 0; + goto end; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than" + " record, ignoring renaming", + LSN_IN_PARTS(share->state.create_rename_lsn)); + error= 0; + goto end; + } + if (maria_is_crashed(info)) + { + tprint(tracef, ", is crashed, can't rename it"); + ALERT_USER(); + goto end; + } + /* + This maria_extra() call serves to signal that old open instances of + this table should not be used anymore, and (only on Windows) to close + open files so they can be renamed + */ + if (maria_extra(info, HA_EXTRA_PREPARE_FOR_RENAME, NULL) || + maria_close(info)) + goto end; + info= NULL; + tprint(tracef, ", is ok for renaming; new-name table "); + } + else /* one or two files absent, or header corrupted... 
*/ + { + tprint(tracef, ", can't be opened, probably does not exist"); + error= 0; + goto end; + } + /* + We must also check the create_rename_lsn of the 'new_name' table if it + exists: otherwise we may, with our rename which overwrites, destroy + another table. For example: + CREATE TABLE t; + RENAME t to u; + DROP TABLE u; + RENAME v to u; # v is an old table, its creation/insertions not in log + And start executing the log (without removing tables beforehand): creates + t, renames it to u (if not testing create_rename_lsn) thus overwriting + old-named v, drops u, and we are stuck, we have lost data. + */ + info= maria_open(new_name, O_RDONLY, HA_OPEN_FOR_REPAIR); + if (info) + { + MARIA_SHARE *share= info->s; + /* We should not have open instances on this table. */ + if (share->reopen != 1) + { + tprint(tracef, ", is already open (reopen=%u)\n", share->reopen); + ALERT_USER(); + goto end; + } + if (!share->base.born_transactional) + { + tprint(tracef, ", is not transactional, ignoring renaming\n"); + ALERT_USER(); + goto drop; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than" + " record, ignoring renaming", + LSN_IN_PARTS(share->state.create_rename_lsn)); + /* + We have to drop the old_name table. Consider: + CREATE TABLE t; + CREATE TABLE v; + RENAME TABLE t to u; + DROP TABLE u; + RENAME TABLE v to u; + and apply the log without removing tables beforehand. t will be + created, v too; in REDO_RENAME u will be more recent, but we still + have to drop t otherwise it stays. + */ + goto drop; + } + if (maria_is_crashed(info)) + { + tprint(tracef, ", is crashed, can't rename it"); + ALERT_USER(); + goto end; + } + if (maria_close(info)) + goto end; + info= NULL; + /* abnormal situation */ + tprint(tracef, ", exists but is older than record, can't rename it"); + goto end; + } + else /* one or two files absent, or header corrupted... 
*/ + tprint(tracef, ", can't be opened, probably does not exist"); + tprint(tracef, ", renaming '%s'", old_name); + if (maria_rename(old_name, new_name)) + { + tprint(tracef, "Failed to rename table\n"); + goto end; + } + info= maria_open(new_name, O_RDONLY, 0); + if (info == NULL) + { + tprint(tracef, "Failed to open renamed table\n"); + goto end; + } + if (_ma_update_create_rename_lsn(info->s, rec->lsn, TRUE)) + goto end; + if (maria_close(info)) + goto end; + info= NULL; + error= 0; + goto end; +drop: + tprint(tracef, ", only dropping '%s'", old_name); + if (maria_delete_table(old_name)) + { + tprint(tracef, "Failed to drop table\n"); + goto end; + } + error= 0; + goto end; +end: + tprint(tracef, "\n"); + if (info != NULL) + error|= maria_close(info); + return error; +} + + +/* + The record may come from REPAIR, ALTER TABLE ENABLE KEYS, OPTIMIZE. +*/ +prototype_redo_exec_hook(REDO_REPAIR_TABLE) +{ + int error= 1; + MARIA_HA *info; + if (skip_DDLs) + { + /* + REPAIR is not exactly a DDL, but it manipulates files without logging + insertions into them. + */ + tprint(tracef, "we skip DDLs\n"); + return 0; + } + if ((info= get_MARIA_HA_from_REDO_record(rec)) == NULL) + return 0; + /* + Otherwise, the mapping is newer than the table, and our record is newer + than the mapping, so we can repair. + */ + tprint(tracef, " repairing...\n"); + /** + @todo RECOVERY BUG fix this: + the maria_chk_init() call causes a heap of linker errors in ha_maria.cc! 
+ */ +#if 0 + HA_CHECK param; + maria_chk_init(¶m); + param.isam_file_name= info->s->open_file_name; + param.testflag= uint4korr(rec->header); + if (maria_repair(¶m, info, info->s->open_file_name, + param.testflag & T_QUICK)) + goto end; + if (_ma_update_create_rename_lsn(info->s, rec->lsn, TRUE)) + goto end; + error= 0; +end: + return error; +#else + DBUG_ASSERT("fix this table repairing" == NULL); + return error; +#endif +} + + +prototype_redo_exec_hook(REDO_DROP_TABLE) +{ + char *name; + int error= 1; + MARIA_HA *info= NULL; + if (skip_DDLs) + { + tprint(tracef, "we skip DDLs\n"); + return 0; + } + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + goto end; + } + name= log_record_buffer.str; + tprint(tracef, "Table '%s'", name); + info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR); + if (info) + { + MARIA_SHARE *share= info->s; + /* + We may have open instances on this table. But it does not matter, the + maria_extra() below will take care of them. 
+ */ + if (!share->base.born_transactional) + { + tprint(tracef, ", is not transactional, ignoring removal\n"); + ALERT_USER(); + error= 0; + goto end; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than" + " record, ignoring removal", + LSN_IN_PARTS(share->state.create_rename_lsn)); + error= 0; + goto end; + } + if (maria_is_crashed(info)) + { + tprint(tracef, ", is crashed, can't drop it"); + ALERT_USER(); + goto end; + } + /* + This maria_extra() call serves to signal that old open instances of + this table should not be used anymore, and (only on Windows) to close + open files so they can be deleted + */ + if (maria_extra(info, HA_EXTRA_PREPARE_FOR_DROP, NULL) || + maria_close(info)) + goto end; + info= NULL; + /* if it is older, or its header is corrupted, drop it */ + tprint(tracef, ", dropping '%s'", name); + if (maria_delete_table(name)) + { + tprint(tracef, "Failed to drop table\n"); + goto end; + } + } + else /* one or two files absent, or header corrupted... */ + tprint(tracef,", can't be opened, probably does not exist"); + error= 0; +end: + tprint(tracef, "\n"); + if (info != NULL) + error|= maria_close(info); + return error; +} + + +prototype_redo_exec_hook(FILE_ID) +{ + uint16 sid; + int error= 1; + const char *name; + MARIA_HA *info; + + if (cmp_translog_addr(rec->lsn, checkpoint_start) < 0) + { + /* + If that mapping was still true at checkpoint time, it was found in + checkpoint record, no need to recreate it. If that mapping had ended at + checkpoint time (table was closed or repaired), a flush and force + happened and so mapping is not needed. 
+ */ + tprint(tracef, "ignoring because before checkpoint\n"); + return 0; + } + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + goto end; + } + sid= fileid_korr(log_record_buffer.str); + info= all_tables[sid].info; + if (info != NULL) + { + tprint(tracef, " Closing table '%s'\n", info->s->open_file_name); + prepare_table_for_close(info, rec->lsn); + if (maria_close(info)) + { + tprint(tracef, "Failed to close table\n"); + goto end; + } + all_tables[sid].info= NULL; + } + name= log_record_buffer.str + FILEID_STORE_SIZE; + if (new_table(sid, name, -1, -1, rec->lsn)) + goto end; + error= 0; +end: + return error; +} + + +static int new_table(uint16 sid, const char *name, + File org_kfile, File org_dfile, + LSN lsn_of_file_id) +{ + /* + -1 (skip table): close table and return 0; + 1 (error): close table and return 1; + 0 (success): leave table open and return 0. 
+ */ + int error= 1; + + tprint(tracef, "Table '%s', id %u", name, sid); + MARIA_HA *info= maria_open(name, O_RDWR, HA_OPEN_FOR_REPAIR); + if (info == NULL) + { + tprint(tracef, ", is absent (must have been dropped later?)" + " or its header is so corrupted that we cannot open it;" + " we skip it\n"); + error= 0; + goto end; + } + if (maria_is_crashed(info)) + { + tprint(tracef, "Table is crashed, can't apply log records to it\n"); + goto end; + } + MARIA_SHARE *share= info->s; + /* check that we're not already using it */ + if (share->reopen != 1) + { + tprint(tracef, ", is already open (reopen=%u)\n", share->reopen); + ALERT_USER(); + goto end; + } + DBUG_ASSERT(share->now_transactional == share->base.born_transactional); + if (!share->base.born_transactional) + { + tprint(tracef, ", is not transactional\n"); + ALERT_USER(); + error= -1; + goto end; + } + if (cmp_translog_addr(lsn_of_file_id, share->state.create_rename_lsn) <= 0) + { + tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than" + " LOGREC_FILE_ID's LSN (%lu,0x%lx), ignoring open request", + LSN_IN_PARTS(share->state.create_rename_lsn), + LSN_IN_PARTS(lsn_of_file_id)); + error= -1; + goto end; + } + /* don't log any records for this work */ + _ma_tmp_disable_logging_for_table(share); + /* execution of some REDO records relies on data_file_length */ + my_off_t dfile_len= my_seek(info->dfile.file, 0, SEEK_END, MYF(MY_WME)); + my_off_t kfile_len= my_seek(info->s->kfile.file, 0, SEEK_END, MYF(MY_WME)); + if ((dfile_len == MY_FILEPOS_ERROR) || + (kfile_len == MY_FILEPOS_ERROR)) + { + tprint(tracef, ", length unknown\n"); + goto end; + } + share->state.state.data_file_length= dfile_len; + share->state.state.key_file_length= kfile_len; + if ((dfile_len % share->block_size) > 0) + { + tprint(tracef, ", has too short last page\n"); + /* Recovery will fix this, no error */ + ALERT_USER(); + } + /* + This LSN serves in this situation; assume log is: + FILE_ID(6->"t2") REDO_INSERT(6) 
FILE_ID(6->"t1") CHECKPOINT(6->"t1") + then crash, checkpoint record is parsed and opens "t1" with id 6; assume + REDO phase starts from the REDO_INSERT above: it will wrongly try to + update a page of "t1". With this LSN below, REDO_INSERT can realize the + mapping is newer than itself, and not execute. + Same example is possible with UNDO_INSERT (update of the state). + */ + info->s->lsn_of_file_id= lsn_of_file_id; + all_tables[sid].info= info; + all_tables[sid].org_kfile= org_kfile; + all_tables[sid].org_dfile= org_dfile; + /* + We don't set info->s->id, it would be useless (no logging in REDO phase); + if you change that, know that some records in REDO phase call + _ma_update_create_rename_lsn() which resets info->s->id. + */ + tprint(tracef, ", opened"); + error= 0; +end: + tprint(tracef, "\n"); + if (error) + { + if (info != NULL) + maria_close(info); + if (error == -1) + error= 0; + } + return error; +} + + +prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD) +{ + int error= 1; + uchar *buff= NULL; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL) + { + /* + Table was skipped at open time (because later dropped/renamed, not + transactional, or create_rename_lsn newer than LOGREC_FILE_ID); it is + not an error. + */ + return 0; + } + /* + If REDO's LSN is > page's LSN (read from disk), we are going to modify the + page and change its LSN. The normal runtime code stores the UNDO's LSN + into the page. Here storing the REDO's LSN (rec->lsn) would work + (we are not writing to the log here, so don't have to "flush up to UNDO's + LSN"). But in a test scenario where we do updates at runtime, then remove + tables, apply the log and check that this results in the same table as at + runtime, putting the same LSN as runtime had done will decrease + differences. So we use the UNDO's LSN which is current_group_end_lsn. 
+ */ + enlarge_buffer(rec); + if (log_record_buffer.str == NULL) + { + tprint(tracef, "Failed to read allocate buffer for record\n"); + goto end; + } + if (translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + goto end; + } + buff= log_record_buffer.str; + if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn, + HEAD_PAGE, + buff + FILEID_STORE_SIZE, + buff + + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE, + rec->record_length - + (FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE))) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL) +{ + int error= 1; + uchar *buff; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL) + return 0; + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + goto end; + } + buff= log_record_buffer.str; + if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn, + TAIL_PAGE, + buff + FILEID_STORE_SIZE, + buff + + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE, + rec->record_length - + (FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE))) + goto end; + error= 0; + +end: + return error; +} + + +prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL) + return 0; + if (_ma_apply_redo_purge_row_head_or_tail(info, current_group_end_lsn, + HEAD_PAGE, + rec->header + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL) + return 0; + if (_ma_apply_redo_purge_row_head_or_tail(info, 
current_group_end_lsn, + TAIL_PAGE, + rec->header + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_PURGE_BLOCKS) +{ + int error= 1; + uchar *buff; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL) + return 0; + enlarge_buffer(rec); + + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + goto end; + } + + buff= log_record_buffer.str; + if (_ma_apply_redo_purge_blocks(info, current_group_end_lsn, + buff + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_DELETE_ALL) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL) + return 0; + tprint(tracef, " deleting all %lu rows\n", + (ulong)info->s->state.state.records); + if (maria_delete_all_rows(info)) + goto end; + error= 0; +end: + return error; +} + + +#define set_undo_lsn_for_active_trans(TRID, LSN) do { \ + all_active_trans[TRID].undo_lsn= LSN; \ + if (all_active_trans[TRID].first_undo_lsn == LSN_IMPOSSIBLE) \ + all_active_trans[TRID].first_undo_lsn= LSN; } while (0) + +prototype_redo_exec_hook(UNDO_ROW_INSERT) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + if (info == NULL) + return 0; + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (cmp_translog_addr(rec->lsn, info->s->state.is_of_horizon) >= 0) + { + tprint(tracef, " state older than record, updating rows' count\n"); + info->s->state.state.records++; + /** @todo RECOVERY BUG Also update the table's checksum */ + /** + @todo some bits below will rather be set when executing UNDOs related + to keys + */ + info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + } + tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records); + return 0; +} + + 
+prototype_redo_exec_hook(UNDO_ROW_DELETE) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + if (info == NULL) + return 0; + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (cmp_translog_addr(rec->lsn, info->s->state.is_of_horizon) >= 0) + { + tprint(tracef, " state older than record, updating rows' count\n"); + info->s->state.state.records--; + info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + } + tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records); + return 0; +} + + +prototype_redo_exec_hook(UNDO_ROW_UPDATE) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + if (info == NULL) + return 0; + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (cmp_translog_addr(rec->lsn, info->s->state.is_of_horizon) >= 0) + { + info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + } + return 0; +} + + +prototype_redo_exec_hook(COMMIT) +{ + uint16 sid= rec->short_trid; + TrID long_trid= all_active_trans[sid].long_trid; + LSN gslsn= all_active_trans[sid].group_start_lsn; + char llbuf[22]; + if (long_trid == 0) + { + tprint(tracef, "We don't know about transaction with short_trid %u;" + "it probably committed long ago, forget it\n", sid); + return 0; + } + llstr(long_trid, llbuf); + tprint(tracef, "Transaction long_trid %s short_trid %u committed", llbuf, sid); + if (gslsn != LSN_IMPOSSIBLE) + { + /* + It's not an error, it may be that trn got a disk error when writing to a + table, so an unfinished group staid in the log. 
+ */ + tprint(tracef, ", with group at LSN (%lu,0x%lx) short_trid %u aborted\n", + LSN_IN_PARTS(gslsn), sid); + all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE; + } + else + tprint(tracef, "\n"); + bzero(&all_active_trans[sid], sizeof(all_active_trans[sid])); +#ifdef MARIA_VERSIONING + /* + if real recovery: + transaction was committed, move it to some separate list for later + purging (but don't purge now! purging may have been started before, we + may find REDO_PURGE records soon). + */ +#endif + return 0; +} + + +prototype_redo_exec_hook(CLR_END) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + if (info == NULL) + return 0; + LSN previous_undo_lsn= lsn_korr(rec->header); + enum translog_record_type undone_record_type= + (rec->header)[LSN_STORE_SIZE + FILEID_STORE_SIZE]; + const LOG_DESC *log_desc= &log_record_type_descriptor[undone_record_type]; + + set_undo_lsn_for_active_trans(rec->short_trid, previous_undo_lsn); + tprint(tracef, " CLR_END was about %s, undo_lsn now LSN (%lu,0x%lx)\n", + log_desc->name, LSN_IN_PARTS(previous_undo_lsn)); + if (cmp_translog_addr(rec->lsn, info->s->state.is_of_horizon) >= 0) + { + tprint(tracef, " state older than record, updating rows' count\n"); + switch (undone_record_type) { + case LOGREC_UNDO_ROW_DELETE: + info->s->state.state.records++; + break; + case LOGREC_UNDO_ROW_INSERT: + info->s->state.state.records--; + break; + case LOGREC_UNDO_ROW_UPDATE: + break; + default: + DBUG_ASSERT(0); + } + info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + } + tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records); + return 0; +} + + +prototype_undo_exec_hook(UNDO_ROW_INSERT) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + + if (info == NULL) + { + /* + Unlike for REDOs, if the table was skipped it is abnormal; we have a + transaction to rollback which used this 
table, as it is not rolled back + it was supposed to hold this table and so the table should still be + there. + */ + return 1; + } + info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + + info->trn= trn; + error= _ma_apply_undo_row_insert(info, previous_undo_lsn, + rec->header + LSN_STORE_SIZE + + FILEID_STORE_SIZE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records); + tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(previous_undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_ROW_DELETE) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + + if (info == NULL) + return 1; + + info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + return 1; + } + + info->trn= trn; + /* + For now we skip the page and directory entry. This is to be used + later when we mark rows as deleted. 
+ */ + error= _ma_apply_undo_row_delete(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE, + rec->record_length - + (LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE)); + info->trn= 0; + tprint(tracef, " rows' count %lu\n undo_lsn now LSN (%lu,0x%lx)\n", + (ulong)info->s->state.state.records, + LSN_IN_PARTS(previous_undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_ROW_UPDATE) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + + if (info == NULL) + return 1; + + info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_row_update(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - + (LSN_STORE_SIZE + FILEID_STORE_SIZE)); + info->trn= 0; + tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(previous_undo_lsn)); + return error; +} + + +static int run_redo_phase(LSN lsn, my_bool apply) +{ + /* install hooks for execution */ +#define install_redo_exec_hook(R) \ + log_record_type_descriptor[LOGREC_ ## R].record_execute_in_redo_phase= \ + exec_REDO_LOGREC_ ## R; +#define install_undo_exec_hook(R) \ + log_record_type_descriptor[LOGREC_ ## R].record_execute_in_undo_phase= \ + exec_UNDO_LOGREC_ ## R; + install_redo_exec_hook(LONG_TRANSACTION_ID); + install_redo_exec_hook(CHECKPOINT); + install_redo_exec_hook(REDO_CREATE_TABLE); + install_redo_exec_hook(REDO_RENAME_TABLE); + install_redo_exec_hook(REDO_REPAIR_TABLE); + install_redo_exec_hook(REDO_DROP_TABLE); + 
install_redo_exec_hook(FILE_ID); + install_redo_exec_hook(REDO_INSERT_ROW_HEAD); + install_redo_exec_hook(REDO_INSERT_ROW_TAIL); + install_redo_exec_hook(REDO_PURGE_ROW_HEAD); + install_redo_exec_hook(REDO_PURGE_ROW_TAIL); + install_redo_exec_hook(REDO_PURGE_BLOCKS); + install_redo_exec_hook(REDO_DELETE_ALL); + install_redo_exec_hook(UNDO_ROW_INSERT); + install_redo_exec_hook(UNDO_ROW_DELETE); + install_redo_exec_hook(UNDO_ROW_UPDATE); + install_redo_exec_hook(COMMIT); + install_redo_exec_hook(CLR_END); + install_undo_exec_hook(UNDO_ROW_INSERT); + install_undo_exec_hook(UNDO_ROW_DELETE); + install_undo_exec_hook(UNDO_ROW_UPDATE); + + current_group_end_lsn= LSN_IMPOSSIBLE; + + TRANSLOG_HEADER_BUFFER rec; + + if (unlikely(lsn == LSN_IMPOSSIBLE || lsn == translog_get_horizon())) + { + tprint(tracef, "checkpoint address refers to the log end log or " + "log is empty, nothing to do.\n"); + return 0; + } + + int len= translog_read_record_header(lsn, &rec); + + /** @todo EOF should be detected */ + if (len == RECHEADER_READ_ERROR) + { + tprint(tracef, "Failed to read header of the first record.\n"); + return 1; + } + struct st_translog_scanner_data scanner; + if (translog_init_scanner(lsn, 1, &scanner)) + { + tprint(tracef, "Scanner init failed\n"); + return 1; + } + uint i; + for (i= 1;;i++) + { + uint16 sid= rec.short_trid; + const LOG_DESC *log_desc= &log_record_type_descriptor[rec.type]; + display_record_position(log_desc, &rec, i); + /* + A complete group is a set of log records with an "end mark" record + (e.g. a set of REDOs for an operation, terminated by an UNDO for this + operation); if there is no "end mark" record the group is incomplete + and won't be executed. 
+ */ + if ((log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF) || + (log_desc->record_in_group == LOGREC_LAST_IN_GROUP)) + { + if (all_active_trans[sid].group_start_lsn != LSN_IMPOSSIBLE) + { + if (log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF) + { + /* + can happen if the transaction got a table write error, then + unlocked tables thus wrote a COMMIT record. + */ + tprint(tracef, "\nDiscarding unfinished group before this record\n"); + ALERT_USER(); + all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE; + } + else + { + /* + There is a complete group for this transaction, containing more + than this event. + */ + tprint(tracef, " ends a group:\n"); + struct st_translog_scanner_data scanner2; + TRANSLOG_HEADER_BUFFER rec2; + len= + translog_read_record_header(all_active_trans[sid].group_start_lsn, &rec2); + if (len < 0) /* EOF or error */ + { + tprint(tracef, "Cannot find record where it should be\n"); + return 1; + } + if (translog_init_scanner(rec2.lsn, 1, &scanner2)) + { + tprint(tracef, "Scanner2 init failed\n"); + return 1; + } + current_group_end_lsn= rec.lsn; + do + { + if (rec2.short_trid == sid) /* it's in our group */ + { + const LOG_DESC *log_desc2= &log_record_type_descriptor[rec2.type]; + display_record_position(log_desc2, &rec2, 0); + if (apply && display_and_apply_record(log_desc2, &rec2)) + return 1; + } + len= translog_read_next_record_header(&scanner2, &rec2); + if (len < 0) /* EOF or error */ + { + tprint(tracef, "Cannot find record where it should be\n"); + return 1; + } + } + while (rec2.lsn < rec.lsn); + translog_free_record_header(&rec2); + /* group finished */ + all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE; + current_group_end_lsn= LSN_IMPOSSIBLE; /* for debugging */ + display_record_position(log_desc, &rec, 0); + } + } + if (apply && display_and_apply_record(log_desc, &rec)) + return 1; + } + else /* record does not end group */ + { + /* just record the fact, can't know if can execute yet */ + if 
(all_active_trans[sid].group_start_lsn == LSN_IMPOSSIBLE) + { + /* group not yet started */ + all_active_trans[sid].group_start_lsn= rec.lsn; + } + } + len= translog_read_next_record_header(&scanner, &rec); + if (len < 0) + { + switch (len) + { + case RECHEADER_READ_EOF: + tprint(tracef, "EOF on the log\n"); + break; + case RECHEADER_READ_ERROR: + tprint(tracef, "Error reading log\n"); + return 1; + } + break; + } + } + translog_free_record_header(&rec); + return 0; +} + + +/** + @brief Informs about any aborted groups or unfinished transactions, + prepares for the UNDO phase if needed. + + @param prepare_for_undo_phase + + @note Observe that it may init trnman. +*/ +static uint end_of_redo_phase(my_bool prepare_for_undo_phase) +{ + uint sid, unfinished= 0; + char llbuf[22]; + + hash_free(&all_dirty_pages); + /* + hash_free() can be called multiple times probably, but be safe it that + changes + */ + bzero(&all_dirty_pages, sizeof(all_dirty_pages)); + my_free(dirty_pages_pool, MYF(MY_ALLOW_ZERO_PTR)); + dirty_pages_pool= NULL; + + llstr(max_long_trid, llbuf); + tprint(tracef, "Maximum transaction long id seen: %s\n", llbuf); + if (prepare_for_undo_phase && trnman_init(max_long_trid)) + return -1; + + for (sid= 0; sid <= SHORT_TRID_MAX; sid++) + { + TrID long_trid= all_active_trans[sid].long_trid; + LSN gslsn= all_active_trans[sid].group_start_lsn; + TRN *trn; + if (gslsn != LSN_IMPOSSIBLE) + { + tprint(tracef, "Group at LSN (%lu,0x%lx) short_trid %u aborted\n", + LSN_IN_PARTS(gslsn), sid); + ALERT_USER(); + } + if (all_active_trans[sid].undo_lsn != LSN_IMPOSSIBLE) + { + char llbuf[22]; + llstr(long_trid, llbuf); + tprint(tracef, "Transaction long_trid %s short_trid %u unfinished\n", + llbuf, sid); + /* dummy_transaction_object serves only for DDLs */ + DBUG_ASSERT(long_trid != 0); + if (prepare_for_undo_phase) + { + if ((trn= trnman_recreate_trn_from_recovery(sid, long_trid)) == NULL) + return -1; + trn->undo_lsn= all_active_trans[sid].undo_lsn; + 
trn->first_undo_lsn= all_active_trans[sid].first_undo_lsn | + TRANSACTION_LOGGED_LONG_ID; /* because trn is known in log */ + } + /* otherwise we will just warn about it */ + unfinished++; + } +#ifdef MARIA_VERSIONING + /* + If real recovery: if transaction was committed, move it to some separate + list for soon purging. + */ +#endif + } + + my_free(all_active_trans, MYF(MY_ALLOW_ZERO_PTR)); + all_active_trans= NULL; + + /* + The UNDO phase uses some normal run-time code of ROLLBACK: generates log + records, etc; prepare tables for that + */ + LSN addr= translog_get_horizon(); + for (sid= 0; sid <= SHARE_ID_MAX; sid++) + { + MARIA_HA *info= all_tables[sid].info; + if (info != NULL) + { + prepare_table_for_close(info, addr); + /* + But we don't close it; we leave it available for the UNDO phase; + it's likely that the UNDO phase will need it. + */ + if (prepare_for_undo_phase) + translog_assign_id_to_share_from_recovery(info->s, sid); + } + } + +#if 0 /* will be enabled soon */ + if (prepare_for_undo_phase) + { + /* + We take a checkpoint as it can save future recovery work if we crash + soon. But we don't flush pages, as UNDOs would change them again + probably. 
+ */ + if (ma_checkpoint_init(FALSE)) + return -1; + int res= ma_checkpoint_execute(CHECKPOINT_INDIRECT, FALSE); + ma_checkpoint_end(); + if (res) + unfinished= -1; + } +#endif + + return unfinished; +} + + +static int run_undo_phase(uint unfinished) +{ + if (unfinished > 0) + { + if (tracef != stdout) + { + /** @todo RECOVERY BUG all prints to stderr should go to error log */ + fprintf(stderr, " 100%%; transactions to roll back:"); + } + tprint(tracef, "%u transactions will be rolled back\n", unfinished); + for( ; ; ) + { + if (tracef != stdout) + fprintf(stderr, " %u", unfinished); + if ((unfinished--) == 0) + break; + char llbuf[22]; + TRN *trn= trnman_get_any_trn(); + DBUG_ASSERT(trn != NULL); + llstr(trn->trid, llbuf); + tprint(tracef, "Rolling back transaction of long id %s\n", llbuf); + + /* Execute all undo entries */ + while (trn->undo_lsn) + { + TRANSLOG_HEADER_BUFFER rec; + LOG_DESC *log_desc; + if (translog_read_record_header(trn->undo_lsn, &rec) == + RECHEADER_READ_ERROR) + return 1; + log_desc= &log_record_type_descriptor[rec.type]; + display_record_position(log_desc, &rec, 0); + if (log_desc->record_execute_in_undo_phase(&rec, trn)) + { + tprint(tracef, "Got error when executing undo\n"); + return 1; + } + } + + if (trnman_rollback_trn(trn)) + return 1; + /* We could want to span a few threads (4?) instead of 1 */ + /* In the future, we want to have this phase *online* */ + } + } + return 0; +} + + +/** + @brief re-enables transactionality, updates is_of_horizon + + @param info table + @param horizon address to set is_of_horizon +*/ + +static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon) +{ + MARIA_SHARE *share= info->s; + /* + In a fully-forward REDO phase (no checkpoint record), + state is now at least as new as the LSN of the current record. It may be + newer, in case we are seeing a LOGREC_FILE_ID which tells us to close a + table, but that table was later modified further in the log. 
+ But if we parsed a checkpoint record, it may be this way in the log: + FILE_ID(6->t2)... FILE_ID(6->t1)... CHECKPOINT(6->t1) + Checkpoint parsing opened t1 with id 6; first FILE_ID above is going to + make t1 close; the first condition below is however false (when checkpoint + was taken it increased is_of_horizon) and so it works. For safety we + add the second condition. + */ + if (cmp_translog_addr(share->state.is_of_horizon, horizon) < 0 && + cmp_translog_addr(share->lsn_of_file_id, horizon) < 0) + share->state.is_of_horizon= horizon; + _ma_reenable_logging_for_table(share); +} + + +static MARIA_HA *get_MARIA_HA_from_REDO_record(const + TRANSLOG_HEADER_BUFFER *rec) +{ + uint16 sid; + pgcache_page_no_t page; + MARIA_HA *info; + char llbuf[22]; + + print_redo_phase_progress(rec->lsn); + sid= fileid_korr(rec->header); + page= page_korr(rec->header + FILEID_STORE_SIZE); + /** + @todo RECOVERY BUG + - for REDO_PURGE_BLOCKS, page is not at this pos + - for DELETE_ALL, record ends here! buffer overrun! + Solution: caller should pass a param enum { i_am_about_data_file, + i_am_about_index_file, none }. + */ + llstr(page, llbuf); + tprint(tracef, " For page %s of table of short id %u", llbuf, sid); + info= all_tables[sid].info; + if (info == NULL) + { + tprint(tracef, ", table skipped, so skipping record\n"); + return NULL; + } + tprint(tracef, ", '%s'", info->s->open_file_name); + if (cmp_translog_addr(rec->lsn, info->s->lsn_of_file_id) <= 0) + { + /* + This can happen only if processing a record before the checkpoint + record. + id->name mapping is newer than REDO record: for sure the table subject + of the REDO has been flushed and forced (id re-assignment implies this); + REDO can be ignored (and must be, as we don't know what this subject + table was). 
+ */ + DBUG_ASSERT(cmp_translog_addr(rec->lsn, checkpoint_start) < 0); + tprint(tracef, ", table's LOGREC_FILE_ID has LSN (%lu,0x%lx) more recent" + " than record, skipping record", + LSN_IN_PARTS(info->s->lsn_of_file_id)); + return NULL; + } + /* detect if an open instance of a dropped table (internal bug) */ + DBUG_ASSERT(info->s->last_version != 0); + if (cmp_translog_addr(rec->lsn, checkpoint_start) < 0) + { + /** + @todo RECOVERY BUG always assuming this is REDO for data file, but it + could soon be index file + */ + uint64 file_and_page_id= + (((uint64)all_tables[sid].org_dfile) << 32) | page; + struct st_dirty_page *dirty_page= (struct st_dirty_page *) + hash_search(&all_dirty_pages, + (uchar *)&file_and_page_id, sizeof(file_and_page_id)); + if ((dirty_page == NULL) || + cmp_translog_addr(rec->lsn, dirty_page->rec_lsn) < 0) + { + tprint(tracef, ", ignoring because of dirty_pages list\n"); + return NULL; + } + } + + /* + So we are going to read the page, and if its LSN is older than the + record's we will modify the page + */ + tprint(tracef, ", applying record\n"); + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */ + return info; +} + + +static MARIA_HA *get_MARIA_HA_from_UNDO_record(const + TRANSLOG_HEADER_BUFFER *rec) +{ + uint16 sid; + MARIA_HA *info; + + sid= fileid_korr(rec->header + LSN_STORE_SIZE); + tprint(tracef, " For table of short id %u", sid); + info= all_tables[sid].info; + if (info == NULL) + { + tprint(tracef, ", table skipped, so skipping record\n"); + return NULL; + } + tprint(tracef, ", '%s'", info->s->open_file_name); + if (cmp_translog_addr(rec->lsn, info->s->lsn_of_file_id) <= 0) + { + tprint(tracef, ", table's LOGREC_FILE_ID has LSN (%lu,0x%lx) more recent" + " than record, skipping record", + LSN_IN_PARTS(info->s->lsn_of_file_id)); + return NULL; + } + DBUG_ASSERT(info->s->last_version != 0); + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */ + tprint(tracef, ", applying 
record\n"); + return info; +} + + +/** + @brief Parses checkpoint record. + + Builds from it the dirty_pages list (a hash), opens tables and maps them to + their 2-byte IDs, recreates transactions (not real TRNs though). + + @return From where in the log the REDO phase should start + @retval LSN_IMPOSSIBLE error + @retval other ok +*/ + +static LSN parse_checkpoint_record(LSN lsn) +{ + uint i; + TRANSLOG_HEADER_BUFFER rec; + + tprint(tracef, "Loading data from checkpoint record at LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(lsn)); + int len= translog_read_record_header(lsn, &rec); + + if (len == RECHEADER_READ_ERROR) + { + tprint(tracef, "Cannot find checkpoint record where it should be\n"); + return LSN_IMPOSSIBLE; + } + + enlarge_buffer(&rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec.lsn, 0, rec.record_length, + log_record_buffer.str, NULL) != + rec.record_length) + { + tprint(tracef, "Failed to read record\n"); + return LSN_IMPOSSIBLE; + } + + char *ptr= log_record_buffer.str; + checkpoint_start= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + + /* transactions */ + uint nb_active_transactions= uint2korr(ptr); + ptr+= 2; + tprint(tracef, "%u active transactions\n", nb_active_transactions); + LSN minimum_rec_lsn_of_active_transactions= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + + /* + how much brain juice and discussions there was to come to writing this + line + */ + set_if_smaller(checkpoint_start, minimum_rec_lsn_of_active_transactions); + + for (i= 0; i < nb_active_transactions; i++) + { + uint16 sid= uint2korr(ptr); + ptr+= 2; + TrID long_id= uint6korr(ptr); + ptr+= 6; + DBUG_ASSERT(sid > 0 && long_id > 0); + LSN undo_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + LSN first_undo_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + new_transaction(sid, long_id, undo_lsn, first_undo_lsn); + } + uint nb_committed_transactions= uint4korr(ptr); + ptr+= 4; + tprint(tracef, "%lu committed transactions\n", + (ulong)nb_committed_transactions); + /* no purging => 
committed transactions are not important */ + ptr+= (6 + LSN_STORE_SIZE) * nb_committed_transactions; + + /* tables */ + uint nb_tables= uint4korr(ptr); + ptr+= 4; + tprint(tracef, "%u open tables\n", nb_tables); + for (i= 0; i< nb_tables; i++) + { + char name[FN_REFLEN]; + uint16 sid= uint2korr(ptr); + ptr+= 2; + DBUG_ASSERT(sid > 0); + File kfile= uint4korr(ptr); + ptr+= 4; + File dfile= uint4korr(ptr); + ptr+= 4; + LSN first_log_write_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + uint name_len= strlen(ptr) + 1; + ptr+= name_len; + strnmov(name, ptr, sizeof(name)); + if (new_table(sid, name, kfile, dfile, first_log_write_lsn)) + return LSN_IMPOSSIBLE; + } + + /* dirty pages */ + uint nb_dirty_pages= uint4korr(ptr); + ptr+= 4; + tprint(tracef, "%u dirty pages\n", nb_dirty_pages); + if (hash_init(&all_dirty_pages, &my_charset_bin, nb_dirty_pages, + offsetof(struct st_dirty_page, file_and_page_id), + sizeof(((struct st_dirty_page *)NULL)->file_and_page_id), + NULL, NULL, 0)) + return LSN_IMPOSSIBLE; + dirty_pages_pool= + (struct st_dirty_page *)my_malloc(nb_dirty_pages * + sizeof(struct st_dirty_page), + MYF(MY_WME)); + if (unlikely(dirty_pages_pool == NULL)) + return LSN_IMPOSSIBLE; + struct st_dirty_page *next_dirty_page_in_pool= dirty_pages_pool; + LSN minimum_rec_lsn_of_dirty_pages= LSN_MAX; + for (i= 0; i < nb_dirty_pages ; i++) + { + File fileid= uint4korr(ptr); + ptr+= 4; + pgcache_page_no_t pageid= uint4korr(ptr); + ptr+= 4; + LSN rec_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + if (new_page(fileid, pageid, rec_lsn, next_dirty_page_in_pool++)) + return LSN_IMPOSSIBLE; + set_if_smaller(minimum_rec_lsn_of_dirty_pages, rec_lsn); + } + /* after that, there will be no insert/delete into the hash */ + /* + sanity check on record (did we screw up with all those "ptr+=", did the + checkpoint write code and checkpoint read code go out of sync?). 
+ */ + if (ptr != (log_record_buffer.str + log_record_buffer.length)) + { + tprint(tracef, "checkpoint record corrupted\n"); + return LSN_IMPOSSIBLE; + } + set_if_smaller(checkpoint_start, minimum_rec_lsn_of_dirty_pages); + + return checkpoint_start; +} + +static int new_page(File fileid, pgcache_page_no_t pageid, LSN rec_lsn, + struct st_dirty_page *dirty_page) +{ + /* serves as hash key */ + dirty_page->file_and_page_id= (((uint64)fileid) << 32) | pageid; + dirty_page->rec_lsn= rec_lsn; + return my_hash_insert(&all_dirty_pages, (uchar *)dirty_page); +} + + +static int close_all_tables(void) +{ + int error= 0; + LIST *list_element, *next_open; + MARIA_HA *info; + pthread_mutex_lock(&THR_LOCK_maria); + if (maria_open_list == NULL) + goto end; + tprint(tracef, "Closing all tables\n"); + if (tracef != stdout && redo_phase_message_printed) + { + /** @todo RECOVERY BUG all prints to stderr should go to error log */ + fprintf(stderr, "; flushing tables"); + } + + /* + Since the end of end_of_redo_phase(), we may have written new records + (if UNDO phase ran) and thus the state is newer than at + end_of_redo_phase(), we need to bump is_of_horizon again. 
+ */ + TRANSLOG_ADDRESS addr= translog_get_horizon(); + for (list_element= maria_open_list ; list_element ; list_element= next_open) + { + next_open= list_element->next; + info= (MARIA_HA*)list_element->data; + pthread_mutex_unlock(&THR_LOCK_maria); /* ok, UNDO phase not online yet */ + prepare_table_for_close(info, addr); + error|= maria_close(info); + pthread_mutex_lock(&THR_LOCK_maria); + } +end: + pthread_mutex_unlock(&THR_LOCK_maria); + return error; +} + /* Prints REDO phase progress to stderr, as a percentage of the log bytes lying between the first address this function was called with and the log horizon read on that first call. Prints nothing when the trace goes to stdout. */ +static void print_redo_phase_progress(TRANSLOG_ADDRESS addr) +{ + static int end_logno= FILENO_IMPOSSIBLE, end_offset, percentage_printed= 0; + static ulonglong initial_remainder= -1; /* -1 means: not computed yet */ + if (tracef == stdout) + return; + if (!redo_phase_message_printed) + { + /** @todo RECOVERY BUG all prints to stderr should go to error log */ + fprintf(stderr, "Maria engine: starting recovery; recovered pages: 0%%"); + redo_phase_message_printed= TRUE; + } + if (end_logno == FILENO_IMPOSSIBLE) + { + LSN end_addr= translog_get_horizon(); + end_logno= LSN_FILE_NO(end_addr); + end_offset= LSN_OFFSET(end_addr); + } + int cur_logno= LSN_FILE_NO(addr); + int cur_offset= LSN_OFFSET(addr); + ulonglong remainder; /* log bytes between addr and the remembered end */ + remainder= (cur_logno == end_logno) ? (end_offset - cur_offset) : + (TRANSLOG_FILE_SIZE - cur_offset + + max(end_logno - cur_logno - 1, 0) * TRANSLOG_FILE_SIZE + end_offset); + if (initial_remainder == (ulonglong)(-1)) + initial_remainder= remainder; + int percentage_done= + (initial_remainder - remainder) * ULL(100) / initial_remainder; + if ((percentage_done - percentage_printed) >= 10) /* print in steps of at least 10 points */ + { + percentage_printed= percentage_done; + fprintf(stderr, " %d%%", percentage_done); + } +} + +#ifdef MARIA_EXTERNAL_LOCKING +#error Maria's Checkpoint and Recovery are really not ready for it +#endif + +/* +Recovery of the state : how it works +===================================== + +Here we ignore Checkpoints for a start.
+ +The state (MARIA_HA::MARIA_SHARE::MARIA_STATE_INFO) is updated in +memory frequently (at least at every row write/update/delete) but goes +to disk at few moments: maria_close() when closing the last open +instance, and a few rare places like CHECK/REPAIR/ALTER +(non-transactional tables also do it at maria_lock_database() but we +needn't cover them here). + +In case of crash, state on disk is likely to be older than what it was +in memory, the REDO phase needs to recreate the state as it was in +memory at the time of crash. When we say Recovery here we will always +mean "REDO phase". + +For example MARIA_STATUS_INFO::records (count of records). It is updated at +the end of every row write/update/delete/delete_all. When Recovery sees the +sign of such row operation (UNDO or REDO), it may need to update the records' +count if that count does not reflect that operation (is older). How to know +the age of the state compared to the log record: every time the state +goes to disk at runtime, its member "is_of_horizon" is updated to the +current end-of-log horizon. So Recovery just needs to compare is_of_horizon +and the record's LSN to know if it should modify "records". + +Other operations like ALTER TABLE DISABLE KEYS update the state but +don't write log records, thus the REDO phase cannot repeat their +effect on the state in case of crash. But we make them sync the state +as soon as they have finished. This reduces the window for a problem. + +It looks like only one thread at a time updates the state in memory or +on disk. However there is not 100% certainty when it comes to +HA_EXTRA_(FORCE_REOPEN|PREPARE_FOR_RENAME): can they read the state +from memory while some other thread is updating "records" in memory? +If yes, they may write a corrupted state to disk. +We assume that no for now: ASK_MONTY. + +With checkpoints +================ + +Checkpoint module needs to read the state in memory and write it to +disk. 
This may happen while some other thread is modifying the state +in memory or on disk. Checkpoint thus may be reading changing data, it +needs a mutex to not have it corrupted, and concurrent modifiers of +the state need that mutex too for the same reason. +"records" is modified for every row write/update/delete, we don't want +to add a mutex lock/unlock there. So we re-use the mutex lock/unlock +which is already present in these moments, namely the log's mutex which is +taken when UNDO_ROW_INSERT|UPDATE|DELETE is written: we update "records" in +under-log-mutex hooks when writing these records (thus "records" is +not updated at the end of maria_write/update/delete() anymore). +Thus Checkpoint takes the log's lock and can read "records" from +memory and write it to disk and release log's lock. +We however want to avoid having the disk write under the log's +lock. So it has to be under another mutex, natural choice is +intern_lock (as Checkpoint needs it anyway to read MARIA_SHARE::kfile, +and as maria_close() takes it too). All state writes to disk are +changed to be protected with intern_lock. +So Checkpoint takes intern_lock, log's lock, reads "records" from +memory, releases log's lock, updates is_of_horizon and writes "records" to +disk, releases intern_lock. +In practice, not only "records" needs to be written but the full +state. So, Checkpoint reads the full state from memory. Some other +thread may at this moment be modifying in memory some pieces of the +state which are not protected by the log's lock (see ma_extra.c +HA_EXTRA_NO_KEYS), and Checkpoint would be reading a corrupted state +from memory; to guard against that we extend the intern_lock-zone to +changes done to the state in memory by HA_EXTRA_NO_KEYS et al, and +also any change made in memory to create_rename_lsn/state_is_of_horizon. +Last, we don't want in Checkpoint to do + log lock; read state from memory; release log lock; +for each table, it may hold the log's lock too much in total.
+So, we instead do + log lock; read N states from memory; release log lock; +Thus, the sequence above happens outside of any intern_lock. +But this re-introduces the problem that some other thread may be changing the +state in memory and on disk under intern_lock, without log's lock, like +HA_EXTRA_NO_KEYS, while we read the N states. However, when Checkpoint later +comes to handling the table under intern_lock, which is serialized with +HA_EXTRA_NO_KEYS, it can see that is_of_horizon is higher than when the state +was read from memory under log's lock, and thus can decide to not flush the +obsolete state it has, knowing that the other thread flushed a more recent +state already. If on the other hand is_of_horizon is not higher, the read +state is current and can be flushed. So we have a per-table sequence: + lock intern_lock; test if is_of_horizon is higher than when we read the state + under log's lock; if no then flush the read state to disk. +*/ + +/* some comments and pseudo-code which we keep for later */ +#if 0 + /* + MikaelR suggests: support checkpoints during REDO phase too: do checkpoint + after a certain number of log records have been executed. This helps + against repeated crashes. Those checkpoints could not be user-requested + (as engine is not communicating during the REDO phase), so they would be + automatic: this changes the original assumption that we don't write to the + log while in the REDO phase, but why not. How often should we checkpoint? + */ + + /* + We want to have two steps: + engine->recover_with_max_memory(); + next_engine->recover_with_max_memory(); + engine->init_with_normal_memory(); + next_engine->init_with_normal_memory(); + So: in recover_with_max_memory() allocate a giant page cache, do REDO + phase, then all page cache is flushed and emptied and freed (only retain + small structures like TM): take full checkpoint, which is useful if + next engine crashes in its recovery the next second.
+ Destroy all shares (maria_close()), then at init_with_normal_memory() we + do this: + */ + + /**** UNDO PHASE *****/ + + /* + Launch one or more threads to do the background rollback. Don't wait for + them to complete their rollback (background rollback; for debugging, we + can have an option which waits). Set a counter (total_of_rollback_threads) + to the number of threads to launch. + + Note that InnoDB's rollback-in-background works as long as InnoDB is the + last engine to recover, otherwise MySQL will refuse new connections until + the last engine has recovered so it's not "background" from the user's + point of view. InnoDB is near the top of sys_table_types so all others + (e.g. BDB) recover after it... So it's really "online rollback" only if + InnoDB is the only engine. + */ + + /* wake up delete/update handler */ + /* tell the TM that it can now accept new transactions */ + + /* + mark that checkpoint requests are now allowed. + */ +#endif diff --git a/storage/maria/ma_recovery.h b/storage/maria/ma_recovery.h new file mode 100644 index 00000000000..e3864d6022b --- /dev/null +++ b/storage/maria/ma_recovery.h @@ -0,0 +1,30 @@ +/* Copyright (C) 2006,2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3072 Maria recovery + First version written by Guilhem Bichot on 2006-04-27. + Does not compile yet. +*/ + +/* This is the interface of this module.
+ */ + +/* Performs recovery of the engine at start */ + +C_MODE_START +int maria_recover(void); +int maria_apply_log(LSN lsn, my_bool apply, FILE *trace_file, + my_bool execute_undo_phase, my_bool skip_DDLs); +C_MODE_END diff --git a/storage/maria/ma_rename.c b/storage/maria/ma_rename.c new file mode 100644 index 00000000000..44cd60711da --- /dev/null +++ b/storage/maria/ma_rename.c @@ -0,0 +1,139 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Rename a table +*/ + +#include "ma_fulltext.h" +#include "trnman_public.h" + +/** + @brief renames a table: opens it, logs the rename if sync_dir gets set (transactional, non-temporary table, not in recovery), closes it, then renames the index file and the data file + + @param old_name current name of table + @param new_name table should be renamed to this name + + @return Operation status + @retval 0 OK + @retval !=0 Error (1 if writing or flushing the log record or storing its LSN failed, otherwise my_errno of the failing step) +*/ + +int maria_rename(const char *old_name, const char *new_name) +{ + char from[FN_REFLEN],to[FN_REFLEN]; + int data_file_rename_error; +#ifdef USE_RAID + uint raid_type=0,raid_chunks=0; +#endif + MARIA_HA *info; + MARIA_SHARE *share; + myf sync_dir; + DBUG_ENTER("maria_rename"); + +#ifdef EXTRA_DEBUG + _ma_check_table_is_closed(old_name,"rename old_table"); + _ma_check_table_is_closed(new_name,"rename new table2"); +#endif + /** @todo LOCK take X-lock on table */ + if (!(info= maria_open(old_name, O_RDWR, HA_OPEN_FOR_REPAIR))) + DBUG_RETURN(my_errno); + share= info->s; +#ifdef
USE_RAID + raid_type = share->base.raid_type; + raid_chunks = share->base.raid_chunks; +#endif + + /* + the renaming of an internal table to the final table (like in ALTER TABLE) + is the moment when this table receives its correct create_rename_lsn and + this is important; make sure transactionality has been re-enabled. + */ + DBUG_ASSERT(share->now_transactional == share->base.born_transactional); + sync_dir= (share->now_transactional && !share->temporary && + !maria_in_recovery) ? MY_SYNC_DIR : 0; /* also decides whether the rename is logged below */ + if (sync_dir) + { + LSN lsn; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + uint old_name_len= strlen(old_name)+1, new_name_len= strlen(new_name)+1; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char *)old_name; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= old_name_len; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char *)new_name; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= new_name_len; + /* + For this record to be of any use for Recovery, we need the upper + MySQL layer to be crash-safe, which it is not now (that would require + work using the ddl_log of sql/sql_table.cc); when it is, we should + reconsider the moment of writing this log record (before or after op, + under THR_LOCK_maria or not...), how to use it in Recovery. + For now it can serve to apply logs to a backup so we sync it. + */ + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_RENAME_TABLE, + &dummy_transaction_object, NULL, + old_name_len + new_name_len, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL) || + translog_flush(lsn))) + { + maria_close(info); + DBUG_RETURN(1); + } + /* + store LSN into file, needed for Recovery to not be confused if a + RENAME happened (applying REDOs to the wrong table).
+ */ + if (_ma_update_create_rename_lsn(share, lsn, TRUE)) + { + maria_close(info); + DBUG_RETURN(1); + } + } + + maria_close(info); +#ifdef USE_RAID +#ifdef EXTRA_DEBUG + _ma_check_table_is_closed(old_name,"rename raidcheck"); +#endif +#endif /* USE_RAID */ + + fn_format(from,old_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + fn_format(to,new_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + if (my_rename_with_symlink(from, to, MYF(MY_WME | sync_dir))) + DBUG_RETURN(my_errno); + fn_format(from,old_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + fn_format(to,new_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); +#ifdef USE_RAID + if (raid_type) + data_file_rename_error= my_raid_rename(from, to, raid_chunks, + MYF(MY_WME | sync_dir)); + else +#endif + data_file_rename_error= + my_rename_with_symlink(from, to, MYF(MY_WME | sync_dir)); + if (data_file_rename_error) + { + /* + now we have a renamed index file and a non-renamed data file, try to + undo the rename of the index file. + */ + data_file_rename_error= my_errno; /* saved before the undo rename below runs */ + fn_format(from, old_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT)); + fn_format(to, new_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT)); + my_rename_with_symlink(to, from, MYF(MY_WME | sync_dir)); + } + DBUG_RETURN(data_file_rename_error); + +} diff --git a/storage/maria/ma_rfirst.c b/storage/maria/ma_rfirst.c new file mode 100644 index 00000000000..226aaa551f0 --- /dev/null +++ b/storage/maria/ma_rfirst.c @@ -0,0 +1,26 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + + /* Read first row through a specific key: with no current position and HA_STATE_PREV_FOUND set, maria_rnext() reads the first key of the index; the result of maria_rnext() is returned as is */ + +int maria_rfirst(MARIA_HA *info, uchar *buf, int inx) +{ + DBUG_ENTER("maria_rfirst"); + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->update|= HA_STATE_PREV_FOUND; + DBUG_RETURN(maria_rnext(info,buf,inx)); +} /* maria_rfirst */ diff --git a/storage/maria/ma_rkey.c b/storage/maria/ma_rkey.c new file mode 100644 index 00000000000..c9653d30110 --- /dev/null +++ b/storage/maria/ma_rkey.c @@ -0,0 +1,178 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details.
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Read record based on a key */ + +#include "maria_def.h" +#include "ma_rt_index.h" + + /* Read a record using key; returns 0 if a matching key was found (and the row was read into buf, unless buf is 0), else my_errno */ + /* Ordinary search_flag is 0 ; Give error if no record with key */ + +int maria_rkey(MARIA_HA *info, uchar *buf, int inx, const uchar *key, + key_part_map keypart_map, enum ha_rkey_function search_flag) +{ + uchar *key_buff; + MARIA_SHARE *share=info->s; + MARIA_KEYDEF *keyinfo; + HA_KEYSEG *last_used_keyseg; + uint pack_key_length, use_key_length, nextflag; + DBUG_ENTER("maria_rkey"); + DBUG_PRINT("enter", ("base: 0x%lx buf: 0x%lx inx: %d search_flag: %d", + (long) info, (long) buf, inx, search_flag)); + + if ((inx = _ma_check_index(info,inx)) < 0) + DBUG_RETURN(my_errno); + + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + info->last_key_func= search_flag; + keyinfo= share->keyinfo + inx; + + if (info->once_flags & USE_PACKED_KEYS) + { + info->once_flags&= ~USE_PACKED_KEYS; /* Reset flag */ + /* + key is already packed!; This happens when we are using a MERGE TABLE + */ + key_buff= info->lastkey+info->s->base.max_key_length; + pack_key_length= keypart_map; /* for an already packed key this argument is used as the key length */ + bmove(key_buff, key, pack_key_length); + last_used_keyseg= info->s->keyinfo[inx].seg + info->last_used_keyseg; + } + else + { + DBUG_ASSERT(keypart_map); + /* Save the packed key for later use in the second buffer of lastkey. */ + key_buff=info->lastkey+info->s->base.max_key_length; + pack_key_length= _ma_pack_key(info,(uint) inx, key_buff, key, + keypart_map, &last_used_keyseg); + /* Save packed_key_length for use by the MERGE engine.
*/ + info->pack_key_length= pack_key_length; + info->last_used_keyseg= (uint16) (last_used_keyseg - + info->s->keyinfo[inx].seg); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, keyinfo->seg, + key_buff, pack_key_length);); + } + + if (fast_ma_readinfo(info)) + goto err; + if (share->concurrent_insert) + rw_rdlock(&share->key_root_lock[inx]); + + nextflag=maria_read_vec[search_flag]; + use_key_length=pack_key_length; + if (!(nextflag & (SEARCH_FIND | SEARCH_NO_FIND | SEARCH_LAST))) + use_key_length=USE_WHOLE_KEY; + + switch (info->s->keyinfo[inx].key_alg) { +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + if (maria_rtree_find_first(info,inx,key_buff,use_key_length,nextflag) < 0) + { + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno= HA_ERR_CRASHED; + info->cur_row.lastpos= HA_OFFSET_ERROR; + } + break; +#endif + case HA_KEY_ALG_BTREE: + default: + if (!_ma_search(info, keyinfo, key_buff, use_key_length, + maria_read_vec[search_flag], + info->s->state.key_root[inx]) && + share->concurrent_insert) + { + /* + If we are searching for a partial key (or using >, >=, < or <=) and + the data is outside of the data file, we need to continue searching + for the first key inside the data file + */ + if (info->cur_row.lastpos >= info->state->data_file_length && + (search_flag != HA_READ_KEY_EXACT || + last_used_keyseg != keyinfo->seg + keyinfo->keysegs)) + { + do + { + uint not_used[2]; + /* + Skip rows that are inserted by other threads since we got a lock + Note that this can only happen if we are not searching for a + full length exact key, because the keys are sorted + according to position + */ + if (_ma_search_next(info, keyinfo, info->lastkey, + info->lastkey_length, + maria_readnext_vec[search_flag], + info->s->state.key_root[inx])) + break; + /* + Check that the found key does still match the search. + _ma_search_next() delivers the next key regardless of its + value.
+ */ + if (search_flag == HA_READ_KEY_EXACT && + ha_key_cmp(keyinfo->seg, (uchar*) key_buff, + (uchar*) info->lastkey, use_key_length, + SEARCH_FIND, not_used)) + { + my_errno= HA_ERR_KEY_NOT_FOUND; + info->cur_row.lastpos= HA_OFFSET_ERROR; + break; + } + } while (info->cur_row.lastpos >= info->state->data_file_length); + } + } + } + if (share->concurrent_insert) + rw_unlock(&share->key_root_lock[inx]); + + if (info->cur_row.lastpos == HA_OFFSET_ERROR) + { + fast_ma_writeinfo(info); + goto err; + } + + /* Calculate length of the found key; Used by maria_rnext_same */ + if ((keyinfo->flag & HA_VAR_LENGTH_KEY) && last_used_keyseg) + info->last_rkey_length= _ma_keylength_part(keyinfo, info->lastkey, + last_used_keyseg); + else + info->last_rkey_length= pack_key_length; + + /* Check if we don't want to have record back, only error message */ + if (!buf) + { + fast_ma_writeinfo(info); + DBUG_RETURN(0); + } + if (!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + + info->cur_row.lastpos= HA_OFFSET_ERROR; /* Didn't find row */ + +err: + /* Store last used key as a base for read next */ + memcpy(info->lastkey,key_buff,pack_key_length); + info->last_rkey_length= pack_key_length; + bzero((char*) info->lastkey+pack_key_length,info->s->base.rec_reflength); + info->lastkey_length=pack_key_length+info->s->base.rec_reflength; + + if (search_flag == HA_READ_AFTER_KEY) + info->update|=HA_STATE_NEXT_FOUND; /* Previous gives last row */ + DBUG_RETURN(my_errno); +} /* maria_rkey */ diff --git a/storage/maria/ma_rlast.c b/storage/maria/ma_rlast.c new file mode 100644 index 00000000000..a9a470d37d9 --- /dev/null +++ b/storage/maria/ma_rlast.c @@ -0,0 +1,26 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation;
version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + + /* Read last row through a specific key. NOTE(review): inferred by symmetry with maria_rfirst(); this function only clears the current position, sets HA_STATE_NEXT_FOUND and returns what maria_rprev() returns -- confirm against maria_rprev() */ + +int maria_rlast(MARIA_HA *info, uchar *buf, int inx) +{ + DBUG_ENTER("maria_rlast"); + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->update|= HA_STATE_NEXT_FOUND; + DBUG_RETURN(maria_rprev(info,buf,inx)); +} /* maria_rlast */ diff --git a/storage/maria/ma_rnext.c b/storage/maria/ma_rnext.c new file mode 100644 index 00000000000..fcc0f1f6a90 --- /dev/null +++ b/storage/maria/ma_rnext.c @@ -0,0 +1,122 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + +#include "ma_rt_index.h" + + /* + Read next row with the same key as previous read + One may have done a write, update or delete of the previous row. + NOTE! Even if one changes the previous row, the next read is done + based on the position of the last used key!
+ */ + +int maria_rnext(MARIA_HA *info, uchar *buf, int inx) +{ + int error,changed; + uint flag; + DBUG_ENTER("maria_rnext"); + + if ((inx = _ma_check_index(info,inx)) < 0) + DBUG_RETURN(my_errno); + flag=SEARCH_BIGGER; /* Read next */ + if (info->cur_row.lastpos == HA_OFFSET_ERROR && + info->update & HA_STATE_PREV_FOUND) + flag=0; /* Read first */ + + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + if (info->s->concurrent_insert) + rw_rdlock(&info->s->key_root_lock[inx]); + changed= _ma_test_if_changed(info); + if (!flag) + { + switch(info->s->keyinfo[inx].key_alg){ +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + error=maria_rtree_get_first(info,inx,info->lastkey_length); + break; +#endif + case HA_KEY_ALG_BTREE: + default: + error= _ma_search_first(info,info->s->keyinfo+inx, + info->s->state.key_root[inx]); + break; + } + } + else + { + switch (info->s->keyinfo[inx].key_alg) { +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + /* + Note that rtree doesn't support that the table + may be changed since last call, so we do need + to skip rows inserted by other threads like in btree + */ + error= maria_rtree_get_next(info,inx,info->lastkey_length); + break; +#endif + case HA_KEY_ALG_BTREE: + default: + if (!changed) /* unchanged: continue from the last key; changed: search for the whole last key again */ + error= _ma_search_next(info,info->s->keyinfo+inx,info->lastkey, + info->lastkey_length,flag, + info->s->state.key_root[inx]); + else + error= _ma_search(info,info->s->keyinfo+inx,info->lastkey, + USE_WHOLE_KEY,flag, info->s->state.key_root[inx]); + } + } + + if (info->s->concurrent_insert) + { + if (!error) + { + while (info->cur_row.lastpos >= info->state->data_file_length) + { + /* Skip rows inserted by other threads since we got a lock */ + if ((error= _ma_search_next(info,info->s->keyinfo+inx, + info->lastkey, + info->lastkey_length, + SEARCH_BIGGER, + info->s->state.key_root[inx]))) + break; + } + } + rw_unlock(&info->s->key_root_lock[inx]); + } + /* Don't clear if database-changed */ + info->update&= (HA_STATE_CHANGED |
HA_STATE_ROW_CHANGED); + info->update|= HA_STATE_NEXT_FOUND; + + if (error) + { + if (my_errno == HA_ERR_KEY_NOT_FOUND) /* no further key is reported as end of file */ + my_errno=HA_ERR_END_OF_FILE; + } + else if (!buf) /* no row buffer given: the row itself is not read */ + { + DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0); + } + else if (!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + DBUG_PRINT("error",("Got error: %d, errno: %d",error, my_errno)); + DBUG_RETURN(my_errno); +} /* maria_rnext */ diff --git a/storage/maria/ma_rnext_same.c b/storage/maria/ma_rnext_same.c new file mode 100644 index 00000000000..6782cf5b8cf --- /dev/null +++ b/storage/maria/ma_rnext_same.c @@ -0,0 +1,107 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#include "ma_rt_index.h" + +/* + Read next row with the same key as previous read, but abort if + the key changes. + One may have done a write, update or delete of the previous row. + + NOTE! Even if one changes the previous row, the next read is done + based on the position of the last used key!
+*/ + +int maria_rnext_same(MARIA_HA *info, uchar *buf) +{ + int error; + uint inx,not_used[2]; + MARIA_KEYDEF *keyinfo; + DBUG_ENTER("maria_rnext_same"); + + if ((int) (inx= info->lastinx) < 0 || + info->cur_row.lastpos == HA_OFFSET_ERROR) + DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX); + keyinfo= info->s->keyinfo+inx; + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + + if (info->s->concurrent_insert) + rw_rdlock(&info->s->key_root_lock[inx]); + + switch (keyinfo->key_alg) + { +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + if ((error=maria_rtree_find_next(info,inx, + maria_read_vec[info->last_key_func]))) + { + error=1; + my_errno=HA_ERR_END_OF_FILE; + info->cur_row.lastpos= HA_OFFSET_ERROR; + break; + } + break; +#endif + case HA_KEY_ALG_BTREE: + default: + if (!(info->update & HA_STATE_RNEXT_SAME)) + { + /* First rnext_same; Store old key */ + memcpy(info->lastkey2,info->lastkey,info->last_rkey_length); + } + for (;;) + { + if ((error= _ma_search_next(info,keyinfo,info->lastkey, + info->lastkey_length,SEARCH_BIGGER, + info->s->state.key_root[inx]))) + break; + if (ha_key_cmp(keyinfo->seg, (uchar*) info->lastkey, + (uchar*) info->lastkey2, + info->last_rkey_length, SEARCH_FIND, not_used)) + { + error=1; + my_errno=HA_ERR_END_OF_FILE; + info->cur_row.lastpos= HA_OFFSET_ERROR; + break; + } + /* Skip rows that are inserted by other threads since we got a lock */ + if (info->cur_row.lastpos < info->state->data_file_length) + break; + } + } + if (info->s->concurrent_insert) + rw_unlock(&info->s->key_root_lock[inx]); + /* Don't clear if database-changed */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + info->update|= HA_STATE_NEXT_FOUND | HA_STATE_RNEXT_SAME; + + if (error) + { + if (my_errno == HA_ERR_KEY_NOT_FOUND) + my_errno=HA_ERR_END_OF_FILE; + } + else if (!buf) + { + DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? 
my_errno : 0); + } + else if (!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + DBUG_RETURN(my_errno); +} /* maria_rnext_same */ diff --git a/storage/maria/ma_rprev.c b/storage/maria/ma_rprev.c new file mode 100644 index 00000000000..753ff604975 --- /dev/null +++ b/storage/maria/ma_rprev.c @@ -0,0 +1,88 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + + /* + Read previous row with the same key as previous read + One may have done a write, update or delete of the previous row. + NOTE! Even if one changes the previous row, the next read is done + based on the position of the last used key! 
+ */ + +int maria_rprev(MARIA_HA *info, uchar *buf, int inx) +{ + int error,changed; + register uint flag; + MARIA_SHARE *share=info->s; + DBUG_ENTER("maria_rprev"); + + if ((inx = _ma_check_index(info,inx)) < 0) + DBUG_RETURN(my_errno); + flag=SEARCH_SMALLER; /* Read previous */ + if (info->cur_row.lastpos == HA_OFFSET_ERROR && + info->update & HA_STATE_NEXT_FOUND) + flag=0; /* Read last */ + + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + changed= _ma_test_if_changed(info); + if (share->concurrent_insert) + rw_rdlock(&share->key_root_lock[inx]); + if (!flag) + error= _ma_search_last(info, share->keyinfo+inx, + share->state.key_root[inx]); + else if (!changed) + error= _ma_search_next(info,share->keyinfo+inx,info->lastkey, + info->lastkey_length,flag, + share->state.key_root[inx]); + else + error= _ma_search(info,share->keyinfo+inx,info->lastkey, + USE_WHOLE_KEY, flag, share->state.key_root[inx]); + + if (share->concurrent_insert) + { + if (!error) + { + while (info->cur_row.lastpos >= info->state->data_file_length) + { + /* Skip rows that are inserted by other threads since we got a lock */ + if ((error= _ma_search_next(info,share->keyinfo+inx,info->lastkey, + info->lastkey_length, + SEARCH_SMALLER, + share->state.key_root[inx]))) + break; + } + } + rw_unlock(&share->key_root_lock[inx]); + } + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + info->update|= HA_STATE_PREV_FOUND; + if (error) + { + if (my_errno == HA_ERR_KEY_NOT_FOUND) + my_errno=HA_ERR_END_OF_FILE; + } + else if (!buf) + { + DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? 
my_errno : 0); + } + else if (!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + DBUG_RETURN(my_errno); +} /* maria_rprev */ diff --git a/storage/maria/ma_rrnd.c b/storage/maria/ma_rrnd.c new file mode 100644 index 00000000000..24c4bfdd467 --- /dev/null +++ b/storage/maria/ma_rrnd.c @@ -0,0 +1,44 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Read a record with random-access. The position to the record must + get by MARIA_HA. The next record can be read with pos= MARIA_POS_ERROR */ + + +#include "maria_def.h" + +/* + Read a row based on position. + + RETURN + 0 Ok. + HA_ERR_RECORD_DELETED Record is deleted. + HA_ERR_END_OF_FILE EOF. 
+*/ + +int maria_rrnd(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos) +{ + DBUG_ENTER("maria_rrnd"); + + DBUG_ASSERT(filepos != HA_OFFSET_ERROR); + + /* Init all but update-flag */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache)) + DBUG_RETURN(my_errno); + + info->cur_row.lastpos= filepos; /* Remember for update */ + DBUG_RETURN((*info->s->read_record)(info, buf, filepos)); +} diff --git a/storage/maria/ma_rsame.c b/storage/maria/ma_rsame.c new file mode 100644 index 00000000000..9c9acac013a --- /dev/null +++ b/storage/maria/ma_rsame.c @@ -0,0 +1,69 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + +/* + Find current row with read on position or read on key + + NOTES + If inx >= 0 find record using key + + RETURN + 0 Ok + HA_ERR_KEY_NOT_FOUND Row is deleted + HA_ERR_END_OF_FILE End of file +*/ + + +int maria_rsame(MARIA_HA *info, uchar *record, int inx) +{ + DBUG_ENTER("maria_rsame"); + + if (inx != -1 && ! 
maria_is_key_active(info->s->state.key_map, inx)) + { + DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX); + } + if (info->cur_row.lastpos == HA_OFFSET_ERROR || + info->update & HA_STATE_DELETED) + { + DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); /* No current record */ + } + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + /* Read row from data file */ + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + + if (inx >= 0) + { + info->lastinx=inx; + info->lastkey_length= _ma_make_key(info,(uint) inx,info->lastkey,record, + info->cur_row.lastpos); + if (info->s->concurrent_insert) + rw_rdlock(&info->s->key_root_lock[inx]); + VOID(_ma_search(info,info->s->keyinfo+inx,info->lastkey, USE_WHOLE_KEY, + SEARCH_SAME, + info->s->state.key_root[inx])); + if (info->s->concurrent_insert) + rw_unlock(&info->s->key_root_lock[inx]); + } + + if (!(*info->read_record)(info, record, info->cur_row.lastpos)) + DBUG_RETURN(0); + if (my_errno == HA_ERR_RECORD_DELETED) + my_errno=HA_ERR_KEY_NOT_FOUND; + DBUG_RETURN(my_errno); +} /* maria_rsame */ diff --git a/storage/maria/ma_rsamepos.c b/storage/maria/ma_rsamepos.c new file mode 100644 index 00000000000..186bc80c06d --- /dev/null +++ b/storage/maria/ma_rsamepos.c @@ -0,0 +1,58 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* read record through position and fix key-position */ +/* As maria_rsame but supply a position */ + +#include "maria_def.h" + + + /* + ** If inx >= 0 update index pointer + ** Returns one of the following values: + ** 0 = Ok. + ** HA_ERR_KEY_NOT_FOUND = Row is deleted + ** HA_ERR_END_OF_FILE = End of file + */ + +int maria_rsame_with_pos(MARIA_HA *info, uchar *record, int inx, + MARIA_RECORD_POS filepos) +{ + DBUG_ENTER("maria_rsame_with_pos"); + DBUG_PRINT("enter",("index: %d filepos: %ld", inx, (long) filepos)); + + if (inx < -1 || + (inx >= 0 && ! maria_is_key_active(info->s->state.key_map, inx))) + { + DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX); + } + + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + if ((*info->s->read_record)(info, record, filepos)) + { + if (my_errno == HA_ERR_RECORD_DELETED) + my_errno=HA_ERR_KEY_NOT_FOUND; + DBUG_RETURN(my_errno); + } + info->cur_row.lastpos= filepos; + info->lastinx= inx; + if (inx >= 0) + { + info->lastkey_length= _ma_make_key(info,(uint) inx,info->lastkey,record, + info->cur_row.lastpos); + info->update|=HA_STATE_KEY_CHANGED; /* Don't use indexposition */ + } + DBUG_RETURN(0); +} /* maria_rsame_pos */ diff --git a/storage/maria/ma_rt_index.c b/storage/maria/ma_rt_index.c new file mode 100644 index 00000000000..4980233fc11 --- /dev/null +++ b/storage/maria/ma_rt_index.c @@ -0,0 +1,1140 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + +#ifdef HAVE_RTREE_KEYS + +#include "ma_rt_index.h" +#include "ma_rt_key.h" +#include "ma_rt_mbr.h" + +#define REINSERT_BUFFER_INC 10 +#define PICK_BY_AREA +/*#define PICK_BY_PERIMETER*/ + +typedef struct st_page_level +{ + uint level; + my_off_t offs; +} stPageLevel; + +typedef struct st_page_list +{ + ulong n_pages; + ulong m_pages; + stPageLevel *pages; +} stPageList; + + +/* + Find next key in r-tree according to search_flag recursively + + NOTES + Used in maria_rtree_find_first() and maria_rtree_find_next() + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +static int maria_rtree_find_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uint search_flag, + uint nod_cmp_flag, my_off_t page, int level) +{ + uint nod_flag; + int res; + uchar *page_buf, *k, *last; + int k_len; + uint *saved_key = (uint*) (info->maria_rtree_recursion_state) + level; + + if (!(page_buf = (uchar*) my_alloca((uint)keyinfo->block_length))) + { + my_errno = HA_ERR_OUT_OF_MEM; + return -1; + } + if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0)) + goto err1; + nod_flag = _ma_test_if_nod(page_buf); + + k_len = keyinfo->keylength - info->s->base.rec_reflength; + + if(info->maria_rtree_recursion_depth >= level) + { + k= page_buf + *saved_key; + } + else + { + k = rt_PAGE_FIRST_KEY(page_buf, nod_flag); + } + last= rt_PAGE_END(page_buf); + + for (; k < last; k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag)) + { + if (nod_flag) + { + /* this is an internal node in the tree */ + if (!(res = 
maria_rtree_key_cmp(keyinfo->seg, + info->first_mbr_key, k, + info->last_rkey_length, nod_cmp_flag))) + { + switch ((res = maria_rtree_find_req(info, keyinfo, search_flag, + nod_cmp_flag, + _ma_kpos(nod_flag, k), + level + 1))) + { + case 0: /* found - exit from recursion */ + *saved_key = k - page_buf; + goto ok; + case 1: /* not found - continue searching */ + info->maria_rtree_recursion_depth = level; + break; + default: /* error */ + case -1: + goto err1; + } + } + } + else + { + /* this is a leaf */ + if (!maria_rtree_key_cmp(keyinfo->seg, info->first_mbr_key, + k, info->last_rkey_length, search_flag)) + { + uchar *after_key = (uchar*) rt_PAGE_NEXT_KEY(k, k_len, nod_flag); + info->cur_row.lastpos = _ma_dpos(info, 0, after_key); + info->lastkey_length = k_len + info->s->base.rec_reflength; + memcpy(info->lastkey, k, info->lastkey_length); + info->maria_rtree_recursion_depth = level; + *saved_key = last - page_buf; + + if (after_key < last) + { + info->int_keypos = info->buff; + info->int_maxpos = info->buff + (last - after_key); + memcpy(info->buff, after_key, last - after_key); + info->keyread_buff_used = 0; + } + else + { + info->keyread_buff_used = 1; + } + + res = 0; + goto ok; + } + } + } + info->cur_row.lastpos = HA_OFFSET_ERROR; + my_errno = HA_ERR_KEY_NOT_FOUND; + res = 1; + +ok: + my_afree((uchar*)page_buf); + return res; + +err1: + my_afree((uchar*)page_buf); + info->cur_row.lastpos = HA_OFFSET_ERROR; + return -1; +} + + +/* + Find first key in r-tree according to search_flag condition + + SYNOPSIS + maria_rtree_find_first() + info Handler to MARIA file + uint keynr Key number to use + key Key to search for + key_length Length of 'key' + search_flag Bitmap of flags how to do the search + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +int maria_rtree_find_first(MARIA_HA *info, uint keynr, uchar *key, + uint key_length, uint search_flag) +{ + my_off_t root; + uint nod_cmp_flag; + MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr; + + if ((root = 
info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + return -1; + } + + /* + Save searched key, include data pointer. + The data pointer is required if the search_flag contains MBR_DATA. + (minimum bounding rectangle) + */ + memcpy(info->first_mbr_key, key, keyinfo->keylength); + info->last_rkey_length = key_length; + + info->maria_rtree_recursion_depth = -1; + info->keyread_buff_used = 1; + + nod_cmp_flag= ((search_flag & (MBR_EQUAL | MBR_WITHIN)) ? + MBR_WITHIN : MBR_INTERSECT); + return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root, + 0); +} + + +/* + Find next key in r-tree according to search_flag condition + + SYNOPSIS + maria_rtree_find_next() + info Handler to MARIA file + uint keynr Key number to use + search_flag Bitmap of flags how to do the search + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint search_flag) +{ + my_off_t root; + uint nod_cmp_flag; + MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr; + + if (info->update & HA_STATE_DELETED) + return maria_rtree_find_first(info, keynr, info->lastkey, + info->lastkey_length, + search_flag); + + if (!info->keyread_buff_used) + { + uchar *key= info->int_keypos; + + while (key < info->int_maxpos) + { + if (!maria_rtree_key_cmp(keyinfo->seg, + info->first_mbr_key, key, + info->last_rkey_length, search_flag)) + { + uchar *after_key= key + keyinfo->keylength; + + info->cur_row.lastpos= _ma_dpos(info, 0, after_key); + memcpy(info->lastkey, key, info->lastkey_length); + + if (after_key < info->int_maxpos) + info->int_keypos= after_key; + else + info->keyread_buff_used= 1; + return 0; + } + key+= keyinfo->keylength; + } + } + if ((root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + return -1; + } + + nod_cmp_flag = ((search_flag & (MBR_EQUAL | MBR_WITHIN)) ? 
+                  MBR_WITHIN : MBR_INTERSECT);
+  return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root, 0);
+}
+
+
+/*
+  Get next key in r-tree recursively
+
+  SYNOPSIS
+    maria_rtree_get_req()
+    info        Handler to MARIA file
+    keyinfo     Definition of the key that is scanned
+    key_length  Only handed on to the recursive calls
+    page        Position of the key page to scan
+    level       Depth of 'page' in the recursion (the root is level 0)
+
+  NOTES
+    Used in maria_rtree_get_first() and maria_rtree_get_next()
+
+    The offset of the current key on each level is kept in the uint slot
+    for 'level' of info->maria_rtree_recursion_state. When
+    info->maria_rtree_recursion_depth >= level the scan continues from
+    that saved offset instead of from the first key on the page.
+
+  RETURN
+    -1 Error
+    0 Found
+    1 Not found
+*/
+
+static int maria_rtree_get_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uint key_length,
+                               my_off_t page, int level)
+{
+  uchar *page_buf, *last, *k;
+  uint nod_flag, k_len;
+  int res;
+  uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level;
+
+  if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length)))
+  {
+    /*
+      Report the failure through my_errno, like maria_rtree_find_req()
+      does, so that the caller does not pick up a stale error code.
+    */
+    my_errno = HA_ERR_OUT_OF_MEM;
+    return -1;
+  }
+  if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0))
+    goto err1;
+  nod_flag = _ma_test_if_nod(page_buf);
+
+  /* Length of the key without the trailing record reference */
+  k_len = keyinfo->keylength - info->s->base.rec_reflength;
+
+  if(info->maria_rtree_recursion_depth >= level)
+  {
+    /* Continue where the previous call stopped on this level */
+    k = page_buf + *saved_key;
+    if (!nod_flag)
+    {
+      /* Only leaf pages contain data references. */
+      /* Need to check next key with data reference. 
*/
+      k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag);
+    }
+  }
+  else
+  {
+    k = rt_PAGE_FIRST_KEY(page_buf, nod_flag);
+  }
+  last = rt_PAGE_END(page_buf);
+
+  for (; k < last; k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag))
+  {
+    if (nod_flag)
+    {
+      /* this is an internal node in the tree */
+      switch ((res = maria_rtree_get_req(info, keyinfo, key_length,
+                                         _ma_kpos(nod_flag, k), level + 1)))
+      {
+        case 0: /* found - exit from recursion */
+          *saved_key = k - page_buf;
+          goto ok;
+        case 1: /* not found - continue searching */
+          info->maria_rtree_recursion_depth = level;
+          break;
+        default:
+        case -1: /* error */
+          goto err1;
+      }
+    }
+    else
+    {
+      /* this is a leaf */
+      uchar *after_key = rt_PAGE_NEXT_KEY(k, k_len, nod_flag);
+      info->cur_row.lastpos = _ma_dpos(info, 0, after_key);
+      info->lastkey_length = k_len + info->s->base.rec_reflength;
+      memcpy(info->lastkey, k, info->lastkey_length);
+
+      info->maria_rtree_recursion_depth = level;
+      *saved_key = k - page_buf;
+
+      if (after_key < last)
+      {
+        /*
+          More keys follow on this leaf: keep a copy of the page in
+          info->buff. int_keypos does not point into the page here, it
+          points at the saved offset, which maria_rtree_get_next()
+          reads and updates through an int pointer.
+        */
+        info->int_keypos = (uchar*) saved_key;
+        memcpy(info->buff, page_buf, keyinfo->block_length);
+        info->int_maxpos = rt_PAGE_END(info->buff);
+        info->keyread_buff_used = 0;
+      }
+      else
+      {
+        info->keyread_buff_used = 1;
+      }
+
+      res = 0;
+      goto ok;
+    }
+  }
+  info->cur_row.lastpos = HA_OFFSET_ERROR;
+  my_errno = HA_ERR_KEY_NOT_FOUND;
+  res = 1;
+
+ok:
+  my_afree((uchar*)page_buf);
+  return res;
+
+err1:
+  my_afree((uchar*)page_buf);
+  info->cur_row.lastpos = HA_OFFSET_ERROR;
+  return -1;
+}
+
+
+/*
+  Get first key in r-tree
+
+  SYNOPSIS
+    maria_rtree_get_first()
+    info        Handler to MARIA file
+    keynr       Key number to use
+    key_length  Handed on to maria_rtree_get_req()
+
+  NOTES
+    If the index has no root page, -1 is returned with my_errno set to
+    HA_ERR_END_OF_FILE.
+
+  RETURN
+    -1 Error
+    0 Found
+    1 Not found
+*/
+
+int maria_rtree_get_first(MARIA_HA *info, uint keynr, uint key_length)
+{
+  my_off_t root;
+  MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr;
+
+  if ((root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+  {
+    my_errno= HA_ERR_END_OF_FILE;
+    return -1;
+  }
+
+  /* Forget any saved scan position so the walk starts at the first key */
+  info->maria_rtree_recursion_depth = -1;
+  info->keyread_buff_used = 1;
+
+  /*
+    keyinfo already points at the definition of key 'keynr'; indexing it
+    with keynr once more would select key number 2*keynr.
+  */
+  return maria_rtree_get_req(info, keyinfo, key_length, root, 0);
+}
+
+
+/*
+  
Get next key in r-tree
+
+  SYNOPSIS
+    maria_rtree_get_next()
+    info        Handler to MARIA file
+    keynr       Key number to use
+    key_length  Handed on to maria_rtree_get_req()
+
+  NOTES
+    While info->keyread_buff_used is 0 the next key is taken from the
+    page copy in info->buff; otherwise the tree is walked again from the
+    root page.
+
+  RETURN
+    -1 Error
+    0 Found
+    1 Not found
+*/
+
+int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length)
+{
+  my_off_t root;
+  MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr;
+
+  if (!info->keyread_buff_used)
+  {
+    uint k_len = keyinfo->keylength - info->s->base.rec_reflength;
+    /* rt_PAGE_NEXT_KEY(info->int_keypos) */
+    uchar *key = info->buff + *(int*)info->int_keypos + k_len +
+                 info->s->base.rec_reflength;
+    /* rt_PAGE_NEXT_KEY(key) */
+    uchar *after_key = key + k_len + info->s->base.rec_reflength;
+
+    info->cur_row.lastpos = _ma_dpos(info, 0, after_key);
+    info->lastkey_length = k_len + info->s->base.rec_reflength;
+    memcpy(info->lastkey, key, k_len + info->s->base.rec_reflength);
+
+    /* Remember the offset of the key that was just returned */
+    *(int*)info->int_keypos = key - info->buff;
+    if (after_key >= info->int_maxpos)
+    {
+      info->keyread_buff_used = 1;
+    }
+
+    return 0;
+  }
+  else
+  {
+    if ((root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+    {
+      my_errno= HA_ERR_END_OF_FILE;
+      return -1;
+    }
+
+    /*
+      keyinfo already points at the definition of key 'keynr'; indexing it
+      with keynr once more would select key number 2*keynr.
+    */
+    return maria_rtree_get_req(info, keyinfo, key_length, root, 0);
+  }
+}
+
+
+/*
+  Choose non-leaf better key for insertion
+*/
+
+#ifdef PICK_BY_PERIMETER
+static uchar *maria_rtree_pick_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+                                   uchar *key,
+                                   uint key_length, uchar *page_buf,
+                                   uint nod_flag)
+{
+  double increase;
+  double best_incr = DBL_MAX;
+  double perimeter;
+  double best_perimeter;
+  uchar *best_key;
+  uchar *k = rt_PAGE_FIRST_KEY(page_buf, nod_flag);
+  uchar *last = rt_PAGE_END(page_buf);
+
+  LINT_INIT(best_perimeter);
+  LINT_INIT(best_key);
+
+  for (; k < last; k = rt_PAGE_NEXT_KEY(k, key_length, nod_flag))
+  {
+    if ((increase = maria_rtree_perimeter_increase(keyinfo->seg, k, key, key_length,
+                                                   &perimeter)) == -1)
+      return NULL;
+    if ((increase < best_incr)||
+        (increase == best_incr && perimeter < best_perimeter))
+    {
+      best_key = k;
+      best_perimeter= perimeter;
+      best_incr = increase;
+    }
+  }
+  return best_key;
+}
+
+#endif /*PICK_BY_PERIMETER*/
+
+#ifdef PICK_BY_AREA +static uchar *maria_rtree_pick_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *key, + uint key_length, uchar *page_buf, + uint nod_flag) +{ + double increase; + double best_incr = DBL_MAX; + double area; + double best_area; + uchar *best_key; + uchar *k = rt_PAGE_FIRST_KEY(page_buf, nod_flag); + uchar *last = rt_PAGE_END(page_buf); + + LINT_INIT(best_area); + LINT_INIT(best_key); + + for (; k < last; k = rt_PAGE_NEXT_KEY(k, key_length, nod_flag)) + { + /* The following is safe as -1.0 is an exact number */ + if ((increase = maria_rtree_area_increase(keyinfo->seg, k, key, key_length, + &area)) == -1.0) + return NULL; + /* The following should be safe, even if we compare doubles */ + if (increase < best_incr) + { + best_key = k; + best_area = area; + best_incr = increase; + } + else + { + /* The following should be safe, even if we compare doubles */ + if ((increase == best_incr) && (area < best_area)) + { + best_key = k; + best_area = area; + best_incr = increase; + } + } + } + return best_key; +} + +#endif /*PICK_BY_AREA*/ + +/* + Go down and insert key into tree + + RETURN + -1 Error + 0 Child was not split + 1 Child was split +*/ + +static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *key, + uint key_length, my_off_t page, + my_off_t *new_page, + int ins_level, int level) +{ + uint nod_flag; + int res; + uchar *page_buf, *k; + DBUG_ENTER("maria_rtree_insert_req"); + + if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length + + HA_MAX_KEY_BUFF))) + { + my_errno = HA_ERR_OUT_OF_MEM; + DBUG_RETURN(-1); /* purecov: inspected */ + } + if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0)) + goto err1; + nod_flag = _ma_test_if_nod(page_buf); + DBUG_PRINT("rtree", ("page: %lu level: %d ins_level: %d nod_flag: %u", + (ulong) page, level, ins_level, nod_flag)); + + if ((ins_level == -1 && nod_flag) || /* key: go down to leaf */ + (ins_level > -1 && ins_level > level)) /* branch: go down to 
ins_level */ + { + if ((k = maria_rtree_pick_key(info, keyinfo, key, key_length, page_buf, + nod_flag)) == NULL) + goto err1; + switch ((res = maria_rtree_insert_req(info, keyinfo, key, key_length, + _ma_kpos(nod_flag, k), new_page, + ins_level, level + 1))) + { + case 0: /* child was not split */ + { + maria_rtree_combine_rect(keyinfo->seg, k, key, k, key_length); + if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf)) + goto err1; + goto ok; + } + case 1: /* child was split */ + { + uchar *new_key = page_buf + keyinfo->block_length + nod_flag; + /* set proper MBR for key */ + if (maria_rtree_set_key_mbr(info, keyinfo, k, key_length, + _ma_kpos(nod_flag, k))) + goto err1; + /* add new key for new page */ + _ma_kpointer(info, new_key - nod_flag, *new_page); + if (maria_rtree_set_key_mbr(info, keyinfo, new_key, key_length, + *new_page)) + goto err1; + res = maria_rtree_add_key(info, keyinfo, new_key, key_length, + page_buf, new_page); + if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf)) + goto err1; + goto ok; + } + default: + case -1: /* error */ + { + goto err1; + } + } + } + else + { + res = maria_rtree_add_key(info, keyinfo, key, key_length, page_buf, + new_page); + if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf)) + goto err1; + } + +ok: + my_afree(page_buf); + DBUG_RETURN(res); + +err1: + my_afree(page_buf); + DBUG_RETURN(-1); /* purecov: inspected */ +} + + +/* + Insert key into the tree + + RETURN + -1 Error + 0 Root was not split + 1 Root was split +*/ + +static int maria_rtree_insert_level(MARIA_HA *info, uint keynr, uchar *key, + uint key_length, int ins_level) +{ + my_off_t old_root; + MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr; + int res; + my_off_t new_page; + DBUG_ENTER("maria_rtree_insert_level"); + + if ((old_root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + { + if ((old_root = _ma_new(info, keyinfo, DFLT_INIT_HITS)) == HA_OFFSET_ERROR) + DBUG_RETURN(-1); + 
info->keyread_buff_used = 1;
+    /* Build an empty page in info->buff and put the key on it */
+    maria_putint(info->buff, 2, 0);
+    res = maria_rtree_add_key(info, keyinfo, key, key_length, info->buff, NULL);
+    if (_ma_write_keypage(info, keyinfo, old_root, DFLT_INIT_HITS, info->buff))
+    {
+      /*
+        A failed write of the first root page is an error and has to be
+        reported as -1. Returning 1 would mean "root was split", which
+        maria_rtree_insert() reports to its caller as success.
+      */
+      DBUG_RETURN(-1);
+    }
+    info->s->state.key_root[keynr] = old_root;
+    DBUG_RETURN(res);
+  }
+
+  switch ((res = maria_rtree_insert_req(info, keyinfo, key, key_length,
+                                        old_root, &new_page, ins_level, 0)))
+  {
+    case 0: /* root was not split */
+    {
+      break;
+    }
+    case 1: /* root was split, grow a new root */
+    {
+      uchar *new_root_buf, *new_key;
+      my_off_t new_root;
+      uint nod_flag = info->s->base.key_reflength;
+
+      DBUG_PRINT("rtree", ("root was split, grow a new root"));
+      if (!(new_root_buf= (uchar*) my_alloca((uint)keyinfo->block_length +
+                                             HA_MAX_KEY_BUFF)))
+      {
+        my_errno = HA_ERR_OUT_OF_MEM;
+        DBUG_RETURN(-1); /* purecov: inspected */
+      }
+
+      maria_putint(new_root_buf, 2, nod_flag);
+      if ((new_root = _ma_new(info, keyinfo, DFLT_INIT_HITS)) ==
+          HA_OFFSET_ERROR)
+        goto err1;
+
+      /* The key is built in the scratch area behind the page image */
+      new_key = new_root_buf + keyinfo->block_length + nod_flag;
+
+      /* First entry of the new root: the old root page */
+      _ma_kpointer(info, new_key - nod_flag, old_root);
+      if (maria_rtree_set_key_mbr(info, keyinfo, new_key, key_length,
+                                  old_root))
+        goto err1;
+      if (maria_rtree_add_key(info, keyinfo, new_key, key_length, new_root_buf,
+                              NULL)
+          == -1)
+        goto err1;
+      /* Second entry of the new root: the page created by the split */
+      _ma_kpointer(info, new_key - nod_flag, new_page);
+      if (maria_rtree_set_key_mbr(info, keyinfo, new_key, key_length,
+                                  new_page))
+        goto err1;
+      if (maria_rtree_add_key(info, keyinfo, new_key, key_length, new_root_buf,
+                              NULL)
+          == -1)
+        goto err1;
+      if (_ma_write_keypage(info, keyinfo, new_root,
+                            DFLT_INIT_HITS, new_root_buf))
+        goto err1;
+      info->s->state.key_root[keynr] = new_root;
+      DBUG_PRINT("rtree", ("new root page: %lu level: %d nod_flag: %u",
+                           (ulong) new_root, 0,
+                           _ma_test_if_nod(new_root_buf)));
+
+      my_afree((uchar*)new_root_buf);
+      break;
+err1:
+      my_afree((uchar*)new_root_buf);
+      DBUG_RETURN(-1); /* purecov: inspected */
+    }
+    default:
+    case -1: /* 
error */
+    {
+      break;
+    }
+  }
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Insert key into the tree - interface function
+
+  NOTES
+    A key_length of 0 is rejected as an error. A split of the root is
+    not reported to the caller; it is returned as 0 (OK).
+
+  RETURN
+    -1 Error
+    0 OK
+*/
+
+int maria_rtree_insert(MARIA_HA *info, uint keynr, uchar *key, uint key_length)
+{
+  DBUG_ENTER("maria_rtree_insert");
+  DBUG_RETURN((!key_length ||
+               (maria_rtree_insert_level(info, keynr, key, key_length, -1) == -1)) ?
+              -1 : 0);
+}
+
+
+/*
+  Fill reinsert page buffer
+
+  SYNOPSIS
+    maria_rtree_fill_reinsert_list()
+    ReinsertList  List to append to; it is grown by REINSERT_BUFFER_INC
+                  entries when it is full
+    page          Position of the page to remember
+    level         Level that is stored together with the page
+
+  NOTES
+    If the list cannot be grown it is left unchanged: the old array and
+    its recorded capacity stay valid.
+
+  RETURN
+    -1 Error
+    0 OK
+*/
+
+static int maria_rtree_fill_reinsert_list(stPageList *ReinsertList, my_off_t page,
+                                          int level)
+{
+  DBUG_ENTER("maria_rtree_fill_reinsert_list");
+  DBUG_PRINT("rtree", ("page: %lu level: %d", (ulong) page, level));
+  if (ReinsertList->n_pages == ReinsertList->m_pages)
+  {
+    /*
+      Grow through temporaries and commit only on success. Assigning the
+      result of my_realloc() directly would replace the pointer to the
+      old array with NULL on failure, after the capacity was already
+      increased.
+    */
+    ulong new_m_pages= ReinsertList->m_pages + REINSERT_BUFFER_INC;
+    stPageLevel *new_pages;
+
+    if (!(new_pages = (stPageLevel*)my_realloc((uchar*)ReinsertList->pages,
+      new_m_pages * sizeof(stPageLevel), MYF(MY_ALLOW_ZERO_PTR))))
+      goto err1;
+    ReinsertList->pages = new_pages;
+    ReinsertList->m_pages = new_m_pages;
+  }
+  /* save page to ReinsertList */
+  ReinsertList->pages[ReinsertList->n_pages].offs = page;
+  ReinsertList->pages[ReinsertList->n_pages].level = level;
+  ReinsertList->n_pages++;
+  DBUG_RETURN(0);
+
+err1:
+  DBUG_RETURN(-1); /* purecov: inspected */
+}
+
+
+/*
+  Go down and delete key from the tree
+
+  RETURN
+    -1 Error
+    0 Deleted
+    1 Not found
+    2 Empty leaf
+*/
+
+static int maria_rtree_delete_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+                                  uchar *key,
+                                  uint key_length, my_off_t page,
+                                  uint *page_size,
+                                  stPageList *ReinsertList, int level)
+{
+  ulong i;
+  uint nod_flag;
+  int res;
+  uchar *page_buf, *last, *k;
+  DBUG_ENTER("maria_rtree_delete_req");
+
+  if (!(page_buf = (uchar*) my_alloca((uint)keyinfo->block_length)))
+  {
+    my_errno = HA_ERR_OUT_OF_MEM;
+    DBUG_RETURN(-1); /* purecov: inspected */
+  }
+  if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0))
+    goto err1;
+  nod_flag = _ma_test_if_nod(page_buf);
+  DBUG_PRINT("rtree", ("page: %lu level: %d nod_flag: %u",
+                       (ulong) page, level, nod_flag));
+
+  k = 
rt_PAGE_FIRST_KEY(page_buf, nod_flag); + last = rt_PAGE_END(page_buf); + + for (i = 0; k < last; k = rt_PAGE_NEXT_KEY(k, key_length, nod_flag), i++) + { + if (nod_flag) + { + /* not leaf */ + if (!maria_rtree_key_cmp(keyinfo->seg, key, k, key_length, MBR_WITHIN)) + { + switch ((res = maria_rtree_delete_req(info, keyinfo, key, key_length, + _ma_kpos(nod_flag, k), page_size, ReinsertList, level + 1))) + { + case 0: /* deleted */ + { + /* test page filling */ + if (*page_size + key_length >= + rt_PAGE_MIN_SIZE(keyinfo->block_length)) + { + /* OK */ + /* Calculate a new key value (MBR) for the shrinked block. */ + if (maria_rtree_set_key_mbr(info, keyinfo, k, key_length, + _ma_kpos(nod_flag, k))) + goto err1; + if (_ma_write_keypage(info, keyinfo, page, + DFLT_INIT_HITS, page_buf)) + goto err1; + } + else + { + /* + Too small: delete key & add it descendant to reinsert list. + Store position and level of the block so that it can be + accessed later for inserting the remaining keys. + */ + DBUG_PRINT("rtree", ("too small. move block to reinsert list")); + if (maria_rtree_fill_reinsert_list(ReinsertList, + _ma_kpos(nod_flag, k), + level + 1)) + goto err1; + /* + Delete the key that references the block. This makes the + block disappear from the index. Hence we need to insert + its remaining keys later. Note: if the block is a branch + block, we do not only remove this block, but the whole + subtree. So we need to re-insert its keys on the same + level later to reintegrate the subtrees. 
+ */ + maria_rtree_delete_key(info, page_buf, k, key_length, nod_flag); + if (_ma_write_keypage(info, keyinfo, page, + DFLT_INIT_HITS, page_buf)) + goto err1; + *page_size = maria_data_on_page(page_buf); + } + + goto ok; + } + case 1: /* not found - continue searching */ + { + break; + } + case 2: /* vacuous case: last key in the leaf */ + { + maria_rtree_delete_key(info, page_buf, k, key_length, nod_flag); + if (_ma_write_keypage(info, keyinfo, page, + DFLT_INIT_HITS, page_buf)) + goto err1; + *page_size = maria_data_on_page(page_buf); + res = 0; + goto ok; + } + default: /* error */ + case -1: + { + goto err1; + } + } + } + } + else + { + /* leaf */ + if (!maria_rtree_key_cmp(keyinfo->seg, key, k, key_length, MBR_EQUAL | MBR_DATA)) + { + maria_rtree_delete_key(info, page_buf, k, key_length, nod_flag); + *page_size = maria_data_on_page(page_buf); + if (*page_size == 2) + { + /* last key in the leaf */ + res = 2; + if (_ma_dispose(info, keyinfo, page, DFLT_INIT_HITS)) + goto err1; + } + else + { + res = 0; + if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf)) + goto err1; + } + goto ok; + } + } + } + res = 1; + +ok: + my_afree((uchar*)page_buf); + DBUG_RETURN(res); + +err1: + my_afree((uchar*)page_buf); + DBUG_RETURN(-1); /* purecov: inspected */ +} + + +/* + Delete key - interface function + + RETURN + -1 Error + 0 Deleted +*/ + +int maria_rtree_delete(MARIA_HA *info, uint keynr, uchar *key, uint key_length) +{ + uint page_size; + stPageList ReinsertList; + my_off_t old_root; + MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr; + DBUG_ENTER("maria_rtree_delete"); + + if ((old_root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + DBUG_RETURN(-1); /* purecov: inspected */ + } + DBUG_PRINT("rtree", ("starting deletion at root page: %lu", + (ulong) old_root)); + + ReinsertList.pages = NULL; + ReinsertList.n_pages = 0; + ReinsertList.m_pages = 0; + + switch (maria_rtree_delete_req(info, keyinfo, key, 
key_length, old_root, + &page_size, &ReinsertList, 0)) + { + case 2: /* empty */ + { + info->s->state.key_root[keynr] = HA_OFFSET_ERROR; + DBUG_RETURN(0); + } + case 0: /* deleted */ + { + uint nod_flag; + ulong i; + for (i = 0; i < ReinsertList.n_pages; ++i) + { + uchar *page_buf, *k, *last; + + if (!(page_buf = (uchar*) my_alloca((uint)keyinfo->block_length))) + { + my_errno = HA_ERR_OUT_OF_MEM; + goto err1; + } + if (!_ma_fetch_keypage(info, keyinfo, ReinsertList.pages[i].offs, + DFLT_INIT_HITS, page_buf, 0)) + goto err1; + nod_flag = _ma_test_if_nod(page_buf); + DBUG_PRINT("rtree", ("reinserting keys from " + "page: %lu level: %d nod_flag: %u", + (ulong) ReinsertList.pages[i].offs, + ReinsertList.pages[i].level, nod_flag)); + + k = rt_PAGE_FIRST_KEY(page_buf, nod_flag); + last = rt_PAGE_END(page_buf); + for (; k < last; k = rt_PAGE_NEXT_KEY(k, key_length, nod_flag)) + { + int res; + if ((res= + maria_rtree_insert_level(info, keynr, k, key_length, + ReinsertList.pages[i].level)) == -1) + { + my_afree(page_buf); + goto err1; + } + if (res) + { + ulong j; + DBUG_PRINT("rtree", ("root has been split, adjust levels")); + for (j= i; j < ReinsertList.n_pages; j++) + { + ReinsertList.pages[j].level++; + DBUG_PRINT("rtree", ("keys from page: %lu now level: %d", + (ulong) ReinsertList.pages[i].offs, + ReinsertList.pages[i].level)); + } + } + } + my_afree(page_buf); + if (_ma_dispose(info, keyinfo, ReinsertList.pages[i].offs, + DFLT_INIT_HITS)) + goto err1; + } + if (ReinsertList.pages) + my_free((uchar*) ReinsertList.pages, MYF(0)); + + /* check for redundant root (not leaf, 1 child) and eliminate */ + if ((old_root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + goto err1; + if (!_ma_fetch_keypage(info, keyinfo, old_root, DFLT_INIT_HITS, + info->buff, 0)) + goto err1; + nod_flag = _ma_test_if_nod(info->buff); + page_size = maria_data_on_page(info->buff); + if (nod_flag && (page_size == 2 + key_length + nod_flag)) + { + my_off_t new_root = _ma_kpos(nod_flag, + 
rt_PAGE_FIRST_KEY(info->buff, nod_flag)); + if (_ma_dispose(info, keyinfo, old_root, DFLT_INIT_HITS)) + goto err1; + info->s->state.key_root[keynr] = new_root; + } + info->update= HA_STATE_DELETED; + DBUG_RETURN(0); + +err1: + DBUG_RETURN(-1); /* purecov: inspected */ + } + case 1: /* not found */ + { + my_errno = HA_ERR_KEY_NOT_FOUND; + DBUG_RETURN(-1); /* purecov: inspected */ + } + default: + case -1: /* error */ + DBUG_RETURN(-1); /* purecov: inspected */ + } +} + + +/* + Estimate number of suitable keys in the tree + + RETURN + estimated value +*/ + +ha_rows maria_rtree_estimate(MARIA_HA *info, uint keynr, uchar *key, + uint key_length, uint flag) +{ + MARIA_KEYDEF *keyinfo = info->s->keyinfo + keynr; + my_off_t root; + uint i = 0; + uint nod_flag, k_len; + uchar *page_buf, *k, *last; + double area = 0; + ha_rows res = 0; + + if (flag & MBR_DISJOINT) + return info->state->records; + + if ((root = info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + return HA_POS_ERROR; + if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length))) + return HA_POS_ERROR; + if (!_ma_fetch_keypage(info, keyinfo, root, DFLT_INIT_HITS, page_buf, 0)) + goto err1; + nod_flag = _ma_test_if_nod(page_buf); + + k_len = keyinfo->keylength - info->s->base.rec_reflength; + + k = rt_PAGE_FIRST_KEY(page_buf, nod_flag); + last = rt_PAGE_END(page_buf); + + for (; k < last; k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag), i++) + { + if (nod_flag) + { + double k_area = maria_rtree_rect_volume(keyinfo->seg, k, key_length); + + /* The following should be safe, even if we compare doubles */ + if (k_area == 0) + { + if (flag & (MBR_CONTAIN | MBR_INTERSECT)) + { + area += 1; + } + else if (flag & (MBR_WITHIN | MBR_EQUAL)) + { + if (!maria_rtree_key_cmp(keyinfo->seg, key, k, key_length, + MBR_WITHIN)) + area += 1; + } + else + goto err1; + } + else + { + if (flag & (MBR_CONTAIN | MBR_INTERSECT)) + { + area+= maria_rtree_overlapping_area(keyinfo->seg, key, k, + key_length) / k_area; + } + else if (flag & 
#ifndef _rt_index_h
#define _rt_index_h

#ifdef HAVE_RTREE_KEYS

/*
  Helpers for walking the keys of an R-tree page.

  rt_PAGE_FIRST_KEY  first key on the page: 2 bytes plus the node pointer
                     length past the start of the page
  rt_PAGE_NEXT_KEY   key following 'key'; an entry is key_length bytes plus
                     either the node pointer (node page) or the record
                     reference (leaf page).  NOTE: this macro reads
                     info->s->base.rec_reflength, so a variable named 'info'
                     must be in scope where it is used.
  rt_PAGE_END        first byte after the data used on the page
  rt_PAGE_MIN_SIZE   fill level (a third of the block) used as the
                     "page too small" limit when deleting

  Every argument is parenthesised so that expression arguments expand
  safely.  Arguments may still be evaluated more than once.
*/
#define rt_PAGE_FIRST_KEY(page, nod_flag) ((page) + 2 + (nod_flag))
#define rt_PAGE_NEXT_KEY(key, key_length, nod_flag) \
  ((key) + (key_length) + \
   ((nod_flag) ? (nod_flag) : info->s->base.rec_reflength))
#define rt_PAGE_END(page) ((page) + maria_data_on_page(page))

#define rt_PAGE_MIN_SIZE(block_length) ((uint)(block_length) / 3)

int maria_rtree_insert(MARIA_HA *info, uint keynr, uchar *key,
                       uint key_length);
int maria_rtree_delete(MARIA_HA *info, uint keynr, uchar *key,
                       uint key_length);

int maria_rtree_find_first(MARIA_HA *info, uint keynr, uchar *key,
                           uint key_length, uint search_flag);
int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint search_flag);

int maria_rtree_get_first(MARIA_HA *info, uint keynr, uint key_length);
int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length);

ha_rows maria_rtree_estimate(MARIA_HA *info, uint keynr, uchar *key,
                             uint key_length, uint flag);

int maria_rtree_split_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *page,
                           uchar *key, uint key_length,
                           my_off_t *new_page_offs);

#endif /*HAVE_RTREE_KEYS*/
#endif /* _rt_index_h */
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + +#ifdef HAVE_RTREE_KEYS +#include "ma_rt_index.h" +#include "ma_rt_key.h" +#include "ma_rt_mbr.h" + +/* + Add key to the page + + RESULT VALUES + -1 Error + 0 Not split + 1 Split +*/ + +int maria_rtree_add_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, + uint key_length, uchar *page_buf, my_off_t *new_page) +{ + uint page_size = maria_data_on_page(page_buf); + uint nod_flag = _ma_test_if_nod(page_buf); + DBUG_ENTER("maria_rtree_add_key"); + + if (page_size + key_length + info->s->base.rec_reflength <= + keyinfo->block_length) + { + /* split won't be necessary */ + if (nod_flag) + { + /* save key */ + DBUG_ASSERT(_ma_kpos(nod_flag, key) < info->state->key_file_length); + memcpy(rt_PAGE_END(page_buf), key - nod_flag, key_length + nod_flag); + page_size += key_length + nod_flag; + } + else + { + /* save key */ + DBUG_ASSERT(_ma_dpos(info, nod_flag, key + key_length + + info->s->base.rec_reflength) < + info->state->data_file_length + + info->s->base.pack_reclength); + memcpy(rt_PAGE_END(page_buf), key, key_length + + info->s->base.rec_reflength); + page_size += key_length + info->s->base.rec_reflength; + } + maria_putint(page_buf, page_size, nod_flag); + DBUG_RETURN(0); + } + + DBUG_RETURN(maria_rtree_split_page(info, keyinfo, page_buf, key, key_length, + new_page) ? 
-1 : 1); +} + + +/* + Delete key from the page +*/ + +int maria_rtree_delete_key(MARIA_HA *info, uchar *page_buf, uchar *key, + uint key_length, uint nod_flag) +{ + uint16 page_size = maria_data_on_page(page_buf); + uchar *key_start; + + key_start= key - nod_flag; + if (!nod_flag) + key_length += info->s->base.rec_reflength; + + memmove(key_start, key + key_length, page_size - key_length - + (key - page_buf)); + page_size-= key_length + nod_flag; + + maria_putint(page_buf, page_size, nod_flag); + return 0; +} + + +/* + Calculate and store key MBR +*/ + +int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, + uint key_length, my_off_t child_page) +{ + DBUG_ENTER("maria_rtree_set_key_mbr"); + if (!_ma_fetch_keypage(info, keyinfo, child_page, + DFLT_INIT_HITS, info->buff, 0)) + DBUG_RETURN(-1); + + DBUG_RETURN(maria_rtree_page_mbr(info, keyinfo->seg, + info->buff, key, key_length)); +} + +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_rt_key.h b/storage/maria/ma_rt_key.h new file mode 100644 index 00000000000..3f95d3d3e67 --- /dev/null +++ b/storage/maria/ma_rt_key.h @@ -0,0 +1,32 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Ramil Kalimullin, who has a shared copyright to this code */ + +#ifndef _rt_key_h +#define _rt_key_h + +#ifdef HAVE_RTREE_KEYS + +int maria_rtree_add_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, + uint key_length, uchar *page_buf, my_off_t *new_page); +int maria_rtree_delete_key(MARIA_HA *info, uchar *page, uchar *key, + uint key_length, uint nod_flag); +int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, + uint key_length, my_off_t child_page); + +#endif /*HAVE_RTREE_KEYS*/ +#endif /* _rt_key_h */ diff --git a/storage/maria/ma_rt_mbr.c b/storage/maria/ma_rt_mbr.c new file mode 100644 index 00000000000..a224cefac12 --- /dev/null +++ b/storage/maria/ma_rt_mbr.c @@ -0,0 +1,806 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + +#ifdef HAVE_RTREE_KEYS + +#include "ma_rt_index.h" +#include "ma_rt_mbr.h" + +#define INTERSECT_CMP(amin, amax, bmin, bmax) ((amin > bmax) || (bmin > amax)) +#define CONTAIN_CMP(amin, amax, bmin, bmax) ((bmin > amin) || (bmax < amax)) +#define WITHIN_CMP(amin, amax, bmin, bmax) ((amin > bmin) || (amax < bmax)) +#define DISJOINT_CMP(amin, amax, bmin, bmax) ((amin <= bmax) && (bmin <= amax)) +#define EQUAL_CMP(amin, amax, bmin, bmax) ((amin != bmin) || (amax != bmax)) + +#define FCMP(A, B) ((int)(A) - (int)(B)) +#define p_inc(A, B, X) {A += X; B += X;} + +#define RT_CMP(nextflag) \ + if (nextflag & MBR_INTERSECT) \ + { \ + if (INTERSECT_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_CONTAIN) \ + { \ + if (CONTAIN_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_WITHIN) \ + { \ + if (WITHIN_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_EQUAL) \ + { \ + if (EQUAL_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_DISJOINT) \ + { \ + if (DISJOINT_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + }\ + else /* if unknown comparison operator */ \ + { \ + DBUG_ASSERT(0); \ + } + +#define RT_CMP_KORR(type, korr_func, len, nextflag) \ +{ \ + type amin, amax, bmin, bmax; \ + amin = korr_func(a); \ + bmin = korr_func(b); \ + amax = korr_func(a+len); \ + bmax = korr_func(b+len); \ + RT_CMP(nextflag); \ +} + +#define RT_CMP_GET(type, get_func, len, nextflag) \ +{ \ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + RT_CMP(nextflag); \ +} + +/* + Compares two keys a and b depending on nextflag + nextflag can contain these flags: + MBR_INTERSECT(a,b) a 
overlaps b + MBR_CONTAIN(a,b) a contains b + MBR_DISJOINT(a,b) a disjoint b + MBR_WITHIN(a,b) a within b + MBR_EQUAL(a,b) All coordinates of MBRs are equal + MBR_DATA(a,b) Data reference is the same + Returns 0 on success. +*/ + +int maria_rtree_key_cmp(HA_KEYSEG *keyseg, uchar *b, uchar *a, uint key_length, + uint nextflag) +{ + for (; (int) key_length > 0; keyseg += 2 ) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_CMP_KORR(int8, mi_sint1korr, 1, nextflag); + break; + case HA_KEYTYPE_BINARY: + RT_CMP_KORR(uint8, mi_uint1korr, 1, nextflag); + break; + case HA_KEYTYPE_SHORT_INT: + RT_CMP_KORR(int16, mi_sint2korr, 2, nextflag); + break; + case HA_KEYTYPE_USHORT_INT: + RT_CMP_KORR(uint16, mi_uint2korr, 2, nextflag); + break; + case HA_KEYTYPE_INT24: + RT_CMP_KORR(int32, mi_sint3korr, 3, nextflag); + break; + case HA_KEYTYPE_UINT24: + RT_CMP_KORR(uint32, mi_uint3korr, 3, nextflag); + break; + case HA_KEYTYPE_LONG_INT: + RT_CMP_KORR(int32, mi_sint4korr, 4, nextflag); + break; + case HA_KEYTYPE_ULONG_INT: + RT_CMP_KORR(uint32, mi_uint4korr, 4, nextflag); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_CMP_KORR(longlong, mi_sint8korr, 8, nextflag) + break; + case HA_KEYTYPE_ULONGLONG: + RT_CMP_KORR(ulonglong, mi_uint8korr, 8, nextflag) + break; +#endif + case HA_KEYTYPE_FLOAT: + /* The following should be safe, even if we compare doubles */ + RT_CMP_GET(float, mi_float4get, 4, nextflag); + break; + case HA_KEYTYPE_DOUBLE: + RT_CMP_GET(double, mi_float8get, 8, nextflag); + break; + case HA_KEYTYPE_END: + goto end; + default: + return 1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } + +end: + if (nextflag & MBR_DATA) + { + uchar *end = a + keyseg->length; + do + { + if (*a++ != *b++) + return FCMP(a[-1], b[-1]); + } while (a != end); + } + return 0; +} + +#define RT_VOL_KORR(type, korr_func, len, cast) \ +{ \ + type amin, amax; 
\ + amin = korr_func(a); \ + amax = korr_func(a+len); \ + res *= (cast(amax) - cast(amin)); \ +} + +#define RT_VOL_GET(type, get_func, len, cast) \ +{ \ + type amin, amax; \ + get_func(amin, a); \ + get_func(amax, a+len); \ + res *= (cast(amax) - cast(amin)); \ +} + +/* + Calculates rectangle volume +*/ +double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar *a, uint key_length) +{ + double res = 1; + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_VOL_KORR(int8, mi_sint1korr, 1, (double)); + break; + case HA_KEYTYPE_BINARY: + RT_VOL_KORR(uint8, mi_uint1korr, 1, (double)); + break; + case HA_KEYTYPE_SHORT_INT: + RT_VOL_KORR(int16, mi_sint2korr, 2, (double)); + break; + case HA_KEYTYPE_USHORT_INT: + RT_VOL_KORR(uint16, mi_uint2korr, 2, (double)); + break; + case HA_KEYTYPE_INT24: + RT_VOL_KORR(int32, mi_sint3korr, 3, (double)); + break; + case HA_KEYTYPE_UINT24: + RT_VOL_KORR(uint32, mi_uint3korr, 3, (double)); + break; + case HA_KEYTYPE_LONG_INT: + RT_VOL_KORR(int32, mi_sint4korr, 4, (double)); + break; + case HA_KEYTYPE_ULONG_INT: + RT_VOL_KORR(uint32, mi_uint4korr, 4, (double)); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_VOL_KORR(longlong, mi_sint8korr, 8, (double)); + break; + case HA_KEYTYPE_ULONGLONG: + RT_VOL_KORR(longlong, mi_sint8korr, 8, ulonglong2double); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_VOL_GET(float, mi_float4get, 4, (double)); + break; + case HA_KEYTYPE_DOUBLE: + RT_VOL_GET(double, mi_float8get, 8, (double)); + break; + case HA_KEYTYPE_END: + key_length = 0; + break; + default: + return -1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + } + return res; +} + +#define RT_D_MBR_KORR(type, korr_func, len, cast) \ +{ \ + type amin, amax; \ + amin = korr_func(a); \ + amax = korr_func(a+len); \ + *res++ = cast(amin); \ + *res++ = cast(amax); \ +} + +#define RT_D_MBR_GET(type, 
get_func, len, cast) \ +{ \ + type amin, amax; \ + get_func(amin, a); \ + get_func(amax, a+len); \ + *res++ = cast(amin); \ + *res++ = cast(amax); \ +} + + +/* + Creates an MBR as an array of doubles. +*/ + +int maria_rtree_d_mbr(HA_KEYSEG *keyseg, uchar *a, uint key_length, double *res) +{ + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_D_MBR_KORR(int8, mi_sint1korr, 1, (double)); + break; + case HA_KEYTYPE_BINARY: + RT_D_MBR_KORR(uint8, mi_uint1korr, 1, (double)); + break; + case HA_KEYTYPE_SHORT_INT: + RT_D_MBR_KORR(int16, mi_sint2korr, 2, (double)); + break; + case HA_KEYTYPE_USHORT_INT: + RT_D_MBR_KORR(uint16, mi_uint2korr, 2, (double)); + break; + case HA_KEYTYPE_INT24: + RT_D_MBR_KORR(int32, mi_sint3korr, 3, (double)); + break; + case HA_KEYTYPE_UINT24: + RT_D_MBR_KORR(uint32, mi_uint3korr, 3, (double)); + break; + case HA_KEYTYPE_LONG_INT: + RT_D_MBR_KORR(int32, mi_sint4korr, 4, (double)); + break; + case HA_KEYTYPE_ULONG_INT: + RT_D_MBR_KORR(uint32, mi_uint4korr, 4, (double)); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_D_MBR_KORR(longlong, mi_sint8korr, 8, (double)); + break; + case HA_KEYTYPE_ULONGLONG: + RT_D_MBR_KORR(longlong, mi_sint8korr, 8, ulonglong2double); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_D_MBR_GET(float, mi_float4get, 4, (double)); + break; + case HA_KEYTYPE_DOUBLE: + RT_D_MBR_GET(double, mi_float8get, 8, (double)); + break; + case HA_KEYTYPE_END: + key_length = 0; + break; + default: + return 1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + } + return 0; +} + +#define RT_COMB_KORR(type, korr_func, store_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin = korr_func(a); \ + bmin = korr_func(b); \ + amax = korr_func(a+len); \ + bmax = korr_func(b+len); \ + amin = min(amin, bmin); \ + amax = max(amax, bmax); \ + store_func(c, amin); \ + store_func(c+len, 
amax); \ +} + +#define RT_COMB_GET(type, get_func, store_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + amin = min(amin, bmin); \ + amax = max(amax, bmax); \ + store_func(c, amin); \ + store_func(c+len, amax); \ +} + +/* + Creates common minimal bounding rectungle + for two input rectagnles a and b + Result is written to c +*/ + +int maria_rtree_combine_rect(HA_KEYSEG *keyseg, uchar* a, uchar* b, uchar* c, + uint key_length) +{ + for ( ; (int) key_length > 0 ; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_COMB_KORR(int8, mi_sint1korr, mi_int1store, 1); + break; + case HA_KEYTYPE_BINARY: + RT_COMB_KORR(uint8, mi_uint1korr, mi_int1store, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_COMB_KORR(int16, mi_sint2korr, mi_int2store, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_COMB_KORR(uint16, mi_uint2korr, mi_int2store, 2); + break; + case HA_KEYTYPE_INT24: + RT_COMB_KORR(int32, mi_sint3korr, mi_int3store, 3); + break; + case HA_KEYTYPE_UINT24: + RT_COMB_KORR(uint32, mi_uint3korr, mi_int3store, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_COMB_KORR(int32, mi_sint4korr, mi_int4store, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_COMB_KORR(uint32, mi_uint4korr, mi_int4store, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_COMB_KORR(longlong, mi_sint8korr, mi_int8store, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_COMB_KORR(ulonglong, mi_uint8korr, mi_int8store, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_COMB_GET(float, mi_float4get, mi_float4store, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_COMB_GET(double, mi_float8get, mi_float8store, 8); + break; + case HA_KEYTYPE_END: + return 0; + default: + return 1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + c+= keyseg_length; + } + return 0; +} + + 
+#define RT_OVL_AREA_KORR(type, korr_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin = korr_func(a); \ + bmin = korr_func(b); \ + amax = korr_func(a+len); \ + bmax = korr_func(b+len); \ + amin = max(amin, bmin); \ + amax = min(amax, bmax); \ + if (amin >= amax) \ + return 0; \ + res *= amax - amin; \ +} + +#define RT_OVL_AREA_GET(type, get_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + amin = max(amin, bmin); \ + amax = min(amax, bmax); \ + if (amin >= amax) \ + return 0; \ + res *= amax - amin; \ +} + +/* +Calculates overlapping area of two MBRs a & b +*/ +double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar* a, uchar* b, + uint key_length) +{ + double res = 1; + for (; (int) key_length > 0 ; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_OVL_AREA_KORR(int8, mi_sint1korr, 1); + break; + case HA_KEYTYPE_BINARY: + RT_OVL_AREA_KORR(uint8, mi_uint1korr, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_OVL_AREA_KORR(int16, mi_sint2korr, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_OVL_AREA_KORR(uint16, mi_uint2korr, 2); + break; + case HA_KEYTYPE_INT24: + RT_OVL_AREA_KORR(int32, mi_sint3korr, 3); + break; + case HA_KEYTYPE_UINT24: + RT_OVL_AREA_KORR(uint32, mi_uint3korr, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_OVL_AREA_KORR(int32, mi_sint4korr, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_OVL_AREA_KORR(uint32, mi_uint4korr, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_OVL_AREA_GET(float, mi_float4get, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_OVL_AREA_GET(double, mi_float8get, 8); + break; + case HA_KEYTYPE_END: + return res; + default: + return -1; + } + keyseg_length= 
keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } + return res; +} + +#define RT_AREA_INC_KORR(type, korr_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin = korr_func(a); \ + bmin = korr_func(b); \ + amax = korr_func(a+len); \ + bmax = korr_func(b+len); \ + a_area *= (((double)amax) - ((double)amin)); \ + loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ +} + +#define RT_AREA_INC_GET(type, get_func, len)\ +{\ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + a_area *= (((double)amax) - ((double)amin)); \ + loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ +} + +/* + Calculates MBR_AREA(a+b) - MBR_AREA(a) +*/ + +double maria_rtree_area_increase(HA_KEYSEG *keyseg, uchar *a, uchar *b, + uint key_length, double *ab_area) +{ + double a_area= 1.0; + double loc_ab_area= 1.0; + + *ab_area= 1.0; + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + + if (keyseg->null_bit) /* Handle NULL part */ + return -1; + + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_AREA_INC_KORR(int8, mi_sint1korr, 1); + break; + case HA_KEYTYPE_BINARY: + RT_AREA_INC_KORR(uint8, mi_uint1korr, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_AREA_INC_KORR(int16, mi_sint2korr, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_AREA_INC_KORR(uint16, mi_uint2korr, 2); + break; + case HA_KEYTYPE_INT24: + RT_AREA_INC_KORR(int32, mi_sint3korr, 3); + break; + case HA_KEYTYPE_UINT24: + RT_AREA_INC_KORR(int32, mi_uint3korr, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_AREA_INC_KORR(int32, mi_sint4korr, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_AREA_INC_KORR(uint32, mi_uint4korr, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_AREA_INC_KORR(longlong, mi_sint8korr, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_AREA_INC_KORR(longlong, mi_sint8korr, 8); + break; 
+#endif + case HA_KEYTYPE_FLOAT: + RT_AREA_INC_GET(float, mi_float4get, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_AREA_INC_GET(double, mi_float8get, 8); + break; + case HA_KEYTYPE_END: + goto safe_end; + default: + return -1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } +safe_end: + *ab_area= loc_ab_area; + return loc_ab_area - a_area; +} + +#define RT_PERIM_INC_KORR(type, korr_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin = korr_func(a); \ + bmin = korr_func(b); \ + amax = korr_func(a+len); \ + bmax = korr_func(b+len); \ + a_perim+= (((double)amax) - ((double)amin)); \ + *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ +} + +#define RT_PERIM_INC_GET(type, get_func, len)\ +{\ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + a_perim+= (((double)amax) - ((double)amin)); \ + *ab_perim+= ((double)max(amax, bmax) - (double)min(amin, bmin)); \ +} + +/* +Calculates MBR_PERIMETER(a+b) - MBR_PERIMETER(a) +*/ +double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b, + uint key_length, double *ab_perim) +{ + double a_perim = 0.0; + + *ab_perim= 0.0; + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + + if (keyseg->null_bit) /* Handle NULL part */ + return -1; + + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_PERIM_INC_KORR(int8, mi_sint1korr, 1); + break; + case HA_KEYTYPE_BINARY: + RT_PERIM_INC_KORR(uint8, mi_uint1korr, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_PERIM_INC_KORR(int16, mi_sint2korr, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_PERIM_INC_KORR(uint16, mi_uint2korr, 2); + break; + case HA_KEYTYPE_INT24: + RT_PERIM_INC_KORR(int32, mi_sint3korr, 3); + break; + case HA_KEYTYPE_UINT24: + RT_PERIM_INC_KORR(int32, mi_uint3korr, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_PERIM_INC_KORR(int32, 
mi_sint4korr, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_PERIM_INC_KORR(uint32, mi_uint4korr, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_PERIM_INC_KORR(longlong, mi_sint8korr, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_PERIM_INC_KORR(longlong, mi_sint8korr, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_PERIM_INC_GET(float, mi_float4get, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_PERIM_INC_GET(double, mi_float8get, 8); + break; + case HA_KEYTYPE_END: + return *ab_perim - a_perim; + default: + return -1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } + return *ab_perim - a_perim; +} + + +#define RT_PAGE_MBR_KORR(type, korr_func, store_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin = korr_func(k + inc); \ + amax = korr_func(k + inc + len); \ + k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag); \ + for (; k < last; k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag)) \ +{ \ + bmin = korr_func(k + inc); \ + bmax = korr_func(k + inc + len); \ + if (amin > bmin) \ + amin = bmin; \ + if (amax < bmax) \ + amax = bmax; \ +} \ + store_func(c, amin); \ + c += len; \ + store_func(c, amax); \ + c += len; \ + inc += 2 * len; \ +} + +#define RT_PAGE_MBR_GET(type, get_func, store_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + get_func(amin, k + inc); \ + get_func(amax, k + inc + len); \ + k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag); \ + for (; k < last; k = rt_PAGE_NEXT_KEY(k, k_len, nod_flag)) \ +{ \ + get_func(bmin, k + inc); \ + get_func(bmax, k + inc + len); \ + if (amin > bmin) \ + amin = bmin; \ + if (amax < bmax) \ + amax = bmax; \ +} \ + store_func(c, amin); \ + c += len; \ + store_func(c, amax); \ + c += len; \ + inc += 2 * len; \ +} + +/* + Calculates key page total MBR = MBR(key1) + MBR(key2) + ... 
+*/ +int maria_rtree_page_mbr(MARIA_HA *info, HA_KEYSEG *keyseg, uchar *page_buf, + uchar *c, uint key_length) +{ + uint inc = 0; + uint k_len = key_length; + uint nod_flag = _ma_test_if_nod(page_buf); + uchar *k; + uchar *last = rt_PAGE_END(page_buf); + + for (; (int)key_length > 0; keyseg += 2) + { + key_length -= keyseg->length * 2; + + /* Handle NULL part */ + if (keyseg->null_bit) + { + return 1; + } + + k = rt_PAGE_FIRST_KEY(page_buf, nod_flag); + + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_PAGE_MBR_KORR(int8, mi_sint1korr, mi_int1store, 1); + break; + case HA_KEYTYPE_BINARY: + RT_PAGE_MBR_KORR(uint8, mi_uint1korr, mi_int1store, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_PAGE_MBR_KORR(int16, mi_sint2korr, mi_int2store, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_PAGE_MBR_KORR(uint16, mi_uint2korr, mi_int2store, 2); + break; + case HA_KEYTYPE_INT24: + RT_PAGE_MBR_KORR(int32, mi_sint3korr, mi_int3store, 3); + break; + case HA_KEYTYPE_UINT24: + RT_PAGE_MBR_KORR(uint32, mi_uint3korr, mi_int3store, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_PAGE_MBR_KORR(int32, mi_sint4korr, mi_int4store, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_PAGE_MBR_KORR(uint32, mi_uint4korr, mi_int4store, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_PAGE_MBR_KORR(longlong, mi_sint8korr, mi_int8store, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_PAGE_MBR_KORR(ulonglong, mi_uint8korr, mi_int8store, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_PAGE_MBR_GET(float, mi_float4get, mi_float4store, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_PAGE_MBR_GET(double, mi_float8get, mi_float8store, 8); + break; + case HA_KEYTYPE_END: + return 0; + default: + return 1; + } + } + return 0; +} + +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_rt_mbr.h b/storage/maria/ma_rt_mbr.h new file mode 100644 index 00000000000..ad855518e62 --- /dev/null +++ b/storage/maria/ma_rt_mbr.h @@ -0,0 +1,38 @@ +/* Copyright (C) 2006 
MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _rt_mbr_h +#define _rt_mbr_h + +#ifdef HAVE_RTREE_KEYS + +int maria_rtree_key_cmp(HA_KEYSEG *keyseg, uchar *a, uchar *b, uint key_length, + uint nextflag); +int maria_rtree_combine_rect(HA_KEYSEG *keyseg,uchar *, uchar *, uchar*, + uint key_length); +double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar*, uint key_length); +int maria_rtree_d_mbr(HA_KEYSEG *keyseg, uchar *a, uint key_length, + double *res); +double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar *a, uchar *b, + uint key_length); +double maria_rtree_area_increase(HA_KEYSEG *keyseg, uchar *a, uchar *b, + uint key_length, double *ab_area); +double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b, + uint key_length, double *ab_perim); +int maria_rtree_page_mbr(MARIA_HA *info, HA_KEYSEG *keyseg, uchar *page_buf, + uchar* c, uint key_length); +#endif /*HAVE_RTREE_KEYS*/ +#endif /* _rt_mbr_h */ diff --git a/storage/maria/ma_rt_split.c b/storage/maria/ma_rt_split.c new file mode 100644 index 00000000000..a91eaa47bea --- /dev/null +++ b/storage/maria/ma_rt_split.c @@ -0,0 +1,362 @@ +/* Copyright (C) 2006 MySQL AB & Alexey Botchkov & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the 
terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + +#ifdef HAVE_RTREE_KEYS + +#include "ma_rt_index.h" +#include "ma_rt_key.h" +#include "ma_rt_mbr.h" + +typedef struct +{ + double square; + int n_node; + uchar *key; + double *coords; +} SplitStruct; + +inline static double *reserve_coords(double **d_buffer, int n_dim) +{ + double *coords = *d_buffer; + (*d_buffer) += n_dim * 2; + return coords; +} + +static void mbr_join(double *a, const double *b, int n_dim) +{ + double *end = a + n_dim * 2; + do + { + if (a[0] > b[0]) + a[0] = b[0]; + + if (a[1] < b[1]) + a[1] = b[1]; + + a += 2; + b += 2; + }while (a != end); +} + +/* +Counts the square of mbr which is a join of a and b +*/ +static double mbr_join_square(const double *a, const double *b, int n_dim) +{ + const double *end = a + n_dim * 2; + double square = 1.0; + do + { + square *= + ((a[1] < b[1]) ? b[1] : a[1]) - ((a[0] > b[0]) ? 
b[0] : a[0]); + + a += 2; + b += 2; + }while (a != end); + + return square; +} + +static double count_square(const double *a, int n_dim) +{ + const double *end = a + n_dim * 2; + double square = 1.0; + do + { + square *= a[1] - a[0]; + a += 2; + }while (a != end); + return square; +} + +inline static void copy_coords(double *dst, const double *src, int n_dim) +{ + memcpy(dst, src, sizeof(double) * (n_dim * 2)); +} + +/* +Select two nodes to collect group upon +*/ +static void pick_seeds(SplitStruct *node, int n_entries, + SplitStruct **seed_a, SplitStruct **seed_b, int n_dim) +{ + SplitStruct *cur1; + SplitStruct *lim1 = node + (n_entries - 1); + SplitStruct *cur2; + SplitStruct *lim2 = node + n_entries; + + double max_d = -DBL_MAX; + double d; + + for (cur1 = node; cur1 < lim1; ++cur1) + { + for (cur2=cur1 + 1; cur2 < lim2; ++cur2) + { + + d = mbr_join_square(cur1->coords, cur2->coords, n_dim) - cur1->square - + cur2->square; + if (d > max_d) + { + max_d = d; + *seed_a = cur1; + *seed_b = cur2; + } + } + } +} + +/* +Select next node and group where to add +*/ +static void pick_next(SplitStruct *node, int n_entries, double *g1, double *g2, + SplitStruct **choice, int *n_group, int n_dim) +{ + SplitStruct *cur = node; + SplitStruct *end = node + n_entries; + + double max_diff = -DBL_MAX; + + for (; cur<end; ++cur) + { + double diff; + double abs_diff; + + if (cur->n_node) + { + continue; + } + + diff = mbr_join_square(g1, cur->coords, n_dim) - + mbr_join_square(g2, cur->coords, n_dim); + + abs_diff = fabs(diff); + if (abs_diff > max_diff) + { + max_diff = abs_diff; + *n_group = 1 + (diff > 0); + *choice = cur; + } + } +} + +/* +Mark not-in-group entries as n_group +*/ +static void mark_all_entries(SplitStruct *node, int n_entries, int n_group) +{ + SplitStruct *cur = node; + SplitStruct *end = node + n_entries; + for (; cur<end; ++cur) + { + if (cur->n_node) + { + continue; + } + cur->n_node = n_group; + } +} + +static int split_maria_rtree_node(SplitStruct *node, 
int n_entries, + int all_size, /* Total key's size */ + int key_size, + int min_size, /* Minimal group size */ + int size1, int size2 /* initial group sizes */, + double **d_buffer, int n_dim) +{ + SplitStruct *cur; + SplitStruct *a; + SplitStruct *b; + double *g1 = reserve_coords(d_buffer, n_dim); + double *g2 = reserve_coords(d_buffer, n_dim); + SplitStruct *next; + int next_node; + int i; + SplitStruct *end = node + n_entries; + LINT_INIT(a); + LINT_INIT(b); + LINT_INIT(next); + LINT_INIT(next_node); + + if (all_size < min_size * 2) + { + return 1; + } + + cur = node; + for (; cur<end; ++cur) + { + cur->square = count_square(cur->coords, n_dim); + cur->n_node = 0; + } + + pick_seeds(node, n_entries, &a, &b, n_dim); + a->n_node = 1; + b->n_node = 2; + + + copy_coords(g1, a->coords, n_dim); + size1 += key_size; + copy_coords(g2, b->coords, n_dim); + size2 += key_size; + + + for (i=n_entries - 2; i>0; --i) + { + if (all_size - (size2 + key_size) < min_size) /* Can't write into group 2 */ + { + mark_all_entries(node, n_entries, 1); + break; + } + + if (all_size - (size1 + key_size) < min_size) /* Can't write into group 1 */ + { + mark_all_entries(node, n_entries, 2); + break; + } + + pick_next(node, n_entries, g1, g2, &next, &next_node, n_dim); + if (next_node == 1) + { + size1 += key_size; + mbr_join(g1, next->coords, n_dim); + } + else + { + size2 += key_size; + mbr_join(g2, next->coords, n_dim); + } + next->n_node = next_node; + } + + return 0; +} + +int maria_rtree_split_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *page, uchar *key, + uint key_length, my_off_t *new_page_offs) +{ + int n1, n2; /* Number of items in groups */ + + SplitStruct *task; + SplitStruct *cur; + SplitStruct *stop; + double *coord_buf; + double *next_coord; + double *old_coord; + int n_dim; + uchar *source_cur, *cur1, *cur2; + uchar *new_page; + int err_code= 0; + uint nod_flag= _ma_test_if_nod(page); + uint full_length= key_length + (nod_flag ? 
nod_flag : + info->s->base.rec_reflength); + int max_keys= (maria_data_on_page(page)-2) / (full_length); + DBUG_ENTER("maria_rtree_split_page"); + DBUG_PRINT("rtree", ("splitting block")); + + n_dim = keyinfo->keysegs / 2; + + if (!(coord_buf= (double*) my_alloca(n_dim * 2 * sizeof(double) * + (max_keys + 1 + 4) + + sizeof(SplitStruct) * (max_keys + 1)))) + DBUG_RETURN(-1); /* purecov: inspected */ + + task= (SplitStruct *)(coord_buf + n_dim * 2 * (max_keys + 1 + 4)); + + next_coord = coord_buf; + + stop = task + max_keys; + source_cur = rt_PAGE_FIRST_KEY(page, nod_flag); + + for (cur = task; cur < stop; ++cur, source_cur = rt_PAGE_NEXT_KEY(source_cur, + key_length, nod_flag)) + { + cur->coords = reserve_coords(&next_coord, n_dim); + cur->key = source_cur; + maria_rtree_d_mbr(keyinfo->seg, source_cur, key_length, cur->coords); + } + + cur->coords = reserve_coords(&next_coord, n_dim); + maria_rtree_d_mbr(keyinfo->seg, key, key_length, cur->coords); + cur->key = key; + + old_coord = next_coord; + + if (split_maria_rtree_node(task, max_keys + 1, + maria_data_on_page(page) + full_length + 2, full_length, + rt_PAGE_MIN_SIZE(keyinfo->block_length), + 2, 2, &next_coord, n_dim)) + { + err_code = 1; + goto split_err; + } + + if (!(new_page = (uchar*) my_alloca((uint)keyinfo->block_length))) + { + err_code= -1; + goto split_err; + } + + stop = task + (max_keys + 1); + cur1 = rt_PAGE_FIRST_KEY(page, nod_flag); + cur2 = rt_PAGE_FIRST_KEY(new_page, nod_flag); + + n1= n2 = 0; + for (cur = task; cur < stop; ++cur) + { + uchar *to; + if (cur->n_node == 1) + { + to = cur1; + cur1 = rt_PAGE_NEXT_KEY(cur1, key_length, nod_flag); + ++n1; + } + else + { + to = cur2; + cur2 = rt_PAGE_NEXT_KEY(cur2, key_length, nod_flag); + ++n2; + } + if (to != cur->key) + memcpy(to - nod_flag, cur->key - nod_flag, full_length); + } + + maria_putint(page, 2 + n1 * full_length, nod_flag); + maria_putint(new_page, 2 + n2 * full_length, nod_flag); + + if ((*new_page_offs= _ma_new(info, keyinfo, 
DFLT_INIT_HITS)) == + HA_OFFSET_ERROR) + err_code= -1; + else + err_code= _ma_write_keypage(info, keyinfo, *new_page_offs, + DFLT_INIT_HITS, new_page); + DBUG_PRINT("rtree", ("split new block: %lu", (ulong) *new_page_offs)); + + my_afree((uchar*)new_page); + +split_err: + /** + @todo the cast below is useless (coord_buf is uchar*); at the moment we + changed all "byte" to "uchar", some casts became useless and should be + removed. + */ + my_afree((uchar*) coord_buf); + DBUG_RETURN(err_code); +} + +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_rt_test.c b/storage/maria/ma_rt_test.c new file mode 100644 index 00000000000..4360e81c550 --- /dev/null +++ b/storage/maria/ma_rt_test.c @@ -0,0 +1,473 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Testing of the basic functions of a MARIA rtree table */ +/* Written by Alex Barkov who has a shared copyright to this code */ + + +#include "maria.h" + +#ifdef HAVE_RTREE_KEYS + +#include "ma_rt_index.h" + +#define MAX_REC_LENGTH 1024 +#define ndims 2 +#define KEYALG HA_KEY_ALG_RTREE + +static int read_with_pos(MARIA_HA * file, int silent); +static void create_record(char *record,uint rownr); +static void create_record1(char *record,uint rownr); +static void print_record(char * record,my_off_t offs,const char * tail); +static int run_test(const char *filename); + +static double rt_data[]= +{ + /*1*/ 0,10,0,10, + /*2*/ 5,15,0,10, + /*3*/ 0,10,5,15, + /*4*/ 10,20,10,20, + /*5*/ 0,10,0,10, + /*6*/ 5,15,0,10, + /*7*/ 0,10,5,15, + /*8*/ 10,20,10,20, + /*9*/ 0,10,0,10, + /*10*/ 5,15,0,10, + /*11*/ 0,10,5,15, + /*12*/ 10,20,10,20, + /*13*/ 0,10,0,10, + /*14*/ 5,15,0,10, + /*15*/ 0,10,5,15, + /*16*/ 10,20,10,20, + /*17*/ 5,15,0,10, + /*18*/ 0,10,5,15, + /*19*/ 10,20,10,20, + /*20*/ 0,10,0,10, + + /*1*/ 100,110,0,10, + /*2*/ 105,115,0,10, + /*3*/ 100,110,5,15, + /*4*/ 110,120,10,20, + /*5*/ 100,110,0,10, + /*6*/ 105,115,0,10, + /*7*/ 100,110,5,15, + /*8*/ 110,120,10,20, + /*9*/ 100,110,0,10, + /*10*/ 105,115,0,10, + /*11*/ 100,110,5,15, + /*12*/ 110,120,10,20, + /*13*/ 100,110,0,10, + /*14*/ 105,115,0,10, + /*15*/ 100,110,5,15, + /*16*/ 110,120,10,20, + /*17*/ 105,115,0,10, + /*18*/ 100,110,5,15, + /*19*/ 110,120,10,20, + /*20*/ 100,110,0,10, + -1 +}; + +int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused))) +{ + MY_INIT(argv[0]); + maria_init(); + exit(run_test("rt_test")); +} + + +static int run_test(const char *filename) +{ + MARIA_HA *file; + MARIA_UNIQUEDEF uniquedef; + MARIA_CREATE_INFO create_info; + MARIA_COLUMNDEF recinfo[20]; + 
MARIA_KEYDEF keyinfo[20]; + HA_KEYSEG keyseg[20]; + key_range range; + + int silent=0; + int opt_unique=0; + int create_flag=0; + int key_type=HA_KEYTYPE_DOUBLE; + int key_length=8; + int null_fields=0; + int nrecords=sizeof(rt_data)/(sizeof(double)*4);/* 3000;*/ + int rec_length=0; + int uniques=0; + int i; + int error; + int row_count=0; + char record[MAX_REC_LENGTH]; + char read_record[MAX_REC_LENGTH]; + int upd= 10; + ha_rows hrows; + + /* Define a column for NULLs and DEL markers*/ + + recinfo[0].type=FIELD_NORMAL; + recinfo[0].length=1; /* For NULL bits */ + rec_length=1; + + /* Define 2*ndims columns for coordinates*/ + + for (i=1; i<=2*ndims ;i++){ + recinfo[i].type=FIELD_NORMAL; + recinfo[i].length=key_length; + rec_length+=key_length; + } + + /* Define a key with 2*ndims segments */ + + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=2*ndims; + keyinfo[0].flag=0; + keyinfo[0].key_alg=KEYALG; + + for (i=0; i<2*ndims; i++){ + keyinfo[0].seg[i].type= key_type; + keyinfo[0].seg[i].flag=0; /* Things like HA_REVERSE_SORT */ + keyinfo[0].seg[i].start= (key_length*i)+1; + keyinfo[0].seg[i].length=key_length; + keyinfo[0].seg[i].null_bit= null_fields ? 
2 : 0; + keyinfo[0].seg[i].null_pos=0; + keyinfo[0].seg[i].language=default_charset_info->number; + } + + if (!silent) + printf("- Creating isam-file\n"); + + bzero((char*) &create_info,sizeof(create_info)); + create_info.max_rows=10000000; + + if (maria_create(filename, + DYNAMIC_RECORD, + 1, /* keys */ + keyinfo, + 1+2*ndims+opt_unique, /* columns */ + recinfo,uniques,&uniquedef,&create_info,create_flag)) + goto err; + + if (!silent) + printf("- Open isam-file\n"); + + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED))) + goto err; + + if (!silent) + printf("- Writing key:s\n"); + + for (i=0; i<nrecords; i++ ) + { + create_record(record,i); + error=maria_write(file,record); + print_record(record,maria_position(file),"\n"); + if (!error) + { + row_count++; + } + else + { + printf("maria_write: %d\n", error); + goto err; + } + } + + if ((error=read_with_pos(file,silent))) + goto err; + + if (!silent) + printf("- Reading rows with key\n"); + + for (i=0 ; i < nrecords ; i++) + { + my_errno=0; + create_record(record,i); + + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rkey(file,read_record,0,record+1,0,HA_READ_MBR_EQUAL); + + if (error && error!=HA_ERR_KEY_NOT_FOUND) + { + printf(" maria_rkey: %3d errno: %3d\n",error,my_errno); + goto err; + } + if (error == HA_ERR_KEY_NOT_FOUND) + { + print_record(record,maria_position(file)," NOT FOUND\n"); + continue; + } + print_record(read_record,maria_position(file),"\n"); + } + + if (!silent) + printf("- Deleting rows\n"); + for (i=0; i < nrecords/4; i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 
0L : HA_OFFSET_ERROR); + if (error) + { + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + print_record(read_record,maria_position(file),"\n"); + + error=maria_delete(file,read_record); + if (error) + { + printf("pos: %2d maria_delete: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + } + + if (!silent) + printf("- Updating rows with position\n"); + for (i=0; i < (nrecords - nrecords/4) ; i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR); + if (error) + { + if (error==HA_ERR_RECORD_DELETED) + continue; + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + print_record(read_record,maria_position(file),""); + create_record(record,i+nrecords*upd); + printf("\t-> "); + print_record(record,maria_position(file),"\n"); + error=maria_update(file,read_record,record); + if (error) + { + printf("pos: %2d maria_update: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + } + + if ((error=read_with_pos(file,silent))) + goto err; + + if (!silent) + printf("- Test maria_rkey then a sequence of maria_rnext_same\n"); + + create_record(record, nrecords*4/5); + print_record(record,0," search for\n"); + + if ((error=maria_rkey(file,read_record,0,record+1,0,HA_READ_MBR_INTERSECT))) + { + printf("maria_rkey: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rkey\n"); + row_count=1; + + for (;;) + { + if ((error=maria_rnext_same(file,read_record))) + { + if (error==HA_ERR_END_OF_FILE) + break; + printf("maria_next: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rnext_same\n"); + row_count++; + } + printf(" %d rows\n",row_count); + + if (!silent) + printf("- Test maria_rfirst then a sequence of maria_rnext\n"); + + error=maria_rfirst(file,read_record,0); + if (error) + { + printf("maria_rfirst: %3d errno: 
%3d\n",error,my_errno); + goto err; + } + row_count=1; + print_record(read_record,maria_position(file)," maria_frirst\n"); + + for (i=0;i<nrecords;i++) + { + if ((error=maria_rnext(file,read_record,0))) + { + if (error==HA_ERR_END_OF_FILE) + break; + printf("maria_next: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rnext\n"); + row_count++; + } + printf(" %d rows\n",row_count); + + if (!silent) + printf("- Test maria_records_in_range()\n"); + + create_record1(record, nrecords*4/5); + print_record(record,0,"\n"); + + range.key= record+1; + range.length= 1000; /* Big enough */ + range.flag= HA_READ_MBR_INTERSECT; + hrows= maria_records_in_range(file,0, &range, (key_range*) 0); + printf(" %ld rows\n", (long) hrows); + + if (maria_close(file)) goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return 0; + +err: + printf("got error: %3d when using maria-database\n",my_errno); + return 1; /* skip warning */ +} + + + +static int read_with_pos (MARIA_HA * file,int silent) +{ + int error; + int i; + char read_record[MAX_REC_LENGTH]; + + if (!silent) + printf("- Reading rows with position\n"); + for (i=0;;i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 
0L : HA_OFFSET_ERROR); + if (error) + { + if (error==HA_ERR_END_OF_FILE) + break; + if (error==HA_ERR_RECORD_DELETED) + continue; + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + return error; + } + print_record(read_record,maria_position(file),"\n"); + } + return 0; +} + + +#ifdef NOT_USED +static void bprint_record(char * record, + my_off_t offs __attribute__((unused)), + const char * tail) +{ + int i; + char * pos; + i=(unsigned char)record[0]; + printf("%02X ",i); + + for( pos=record+1, i=0; i<32; i++,pos++){ + int b=(unsigned char)*pos; + printf("%02X",b); + } + printf("%s",tail); +} +#endif + + +static void print_record(char * record, + my_off_t offs __attribute__((unused)), + const char * tail) +{ + int i; + char * pos; + double c; + + printf(" rec=(%d)",(unsigned char)record[0]); + for ( pos=record+1, i=0; i<2*ndims; i++) + { + memcpy(&c,pos,sizeof(c)); + float8get(c,pos); + printf(" %.14g ",c); + pos+=sizeof(c); + } + printf("pos=%ld",(long int)offs); + printf("%s",tail); +} + + + +static void create_record1(char *record,uint rownr) +{ + int i; + char * pos; + double c=rownr+10; + + bzero((char*) record,MAX_REC_LENGTH); + record[0]=0x01; /* DEL marker */ + + for ( pos=record+1, i=0; i<2*ndims; i++) + { + memcpy(pos,&c,sizeof(c)); + float8store(pos,c); + pos+=sizeof(c); + } +} + +#ifdef NOT_USED + +static void create_record0(char *record,uint rownr) +{ + int i; + char * pos; + double c=rownr+10; + double c0=0; + + bzero((char*) record,MAX_REC_LENGTH); + record[0]=0x01; /* DEL marker */ + + for ( pos=record+1, i=0; i<ndims; i++) + { + memcpy(pos,&c0,sizeof(c0)); + float8store(pos,c0); + pos+=sizeof(c0); + memcpy(pos,&c,sizeof(c)); + float8store(pos,c); + pos+=sizeof(c); + } +} + +#endif + +static void create_record(char *record,uint rownr) +{ + int i; + char *pos; + double *data= rt_data+rownr*4; + record[0]=0x01; /* DEL marker */ + for ( pos=record+1, i=0; i<ndims*2; i++) + { + float8store(pos,data[i]); + pos+=8; + } +} + +#else +int 
main(int argc __attribute__((unused)),char *argv[] __attribute__((unused))) +{ + exit(0); +} +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_scan.c b/storage/maria/ma_scan.c new file mode 100644 index 00000000000..f9657833fdd --- /dev/null +++ b/storage/maria/ma_scan.c @@ -0,0 +1,60 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Read through all rows sequntially */ + +#include "maria_def.h" + +int maria_scan_init(register MARIA_HA *info) +{ + DBUG_ENTER("maria_scan_init"); + + info->cur_row.nextpos= info->s->pack.header_length; /* Read first record */ + info->lastinx= -1; /* Can't forward or backward */ + if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache)) + DBUG_RETURN(my_errno); + + if ((*info->s->scan_init)(info)) + DBUG_RETURN(my_errno); + DBUG_RETURN(0); +} + +/* + Read a row based on position. 
+ + SYNOPSIS + maria_scan() + info Maria handler + record Read data here + + RETURN + 0 ok + HA_ERR_END_OF_FILE End of file + # Error code +*/ + +int maria_scan(MARIA_HA *info, uchar *record) +{ + DBUG_ENTER("maria_scan"); + /* Init all but update-flag */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + DBUG_RETURN((*info->s->scan)(info, record, info->cur_row.nextpos, 1)); +} + + +void maria_scan_end(MARIA_HA *info) +{ + (*info->s->scan_end)(info); +} diff --git a/storage/maria/ma_search.c b/storage/maria/ma_search.c new file mode 100644 index 00000000000..8cb3e56e646 --- /dev/null +++ b/storage/maria/ma_search.c @@ -0,0 +1,1934 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* key handling functions */ + +#include "ma_fulltext.h" +#include "m_ctype.h" + +static my_bool _ma_get_prev_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *page, + uchar *key, uchar *keypos, + uint *return_key_length); + + /* Check index */ + +int _ma_check_index(MARIA_HA *info, int inx) +{ + if (inx == -1) /* Use last index */ + inx=info->lastinx; + if (inx < 0 || ! 
maria_is_key_active(info->s->state.key_map, inx)) + { + my_errno=HA_ERR_WRONG_INDEX; + return -1; + } + if (info->lastinx != inx) /* Index changed */ + { + info->lastinx = inx; + info->page_changed=1; + info->update= ((info->update & (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED)) | + HA_STATE_NEXT_FOUND | HA_STATE_PREV_FOUND); + } + if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache)) + return(-1); + return(inx); +} /* _ma_check_index */ + + + /* + ** Search after row by a key + ** Position to row is stored in info->lastpos + ** Return: -1 if not found + ** 1 if one should continue search on higher level + */ + +int _ma_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + uchar *key, uint key_len, uint nextflag, register my_off_t pos) +{ + my_bool last_key; + int error,flag; + uint nod_flag; + uchar *keypos,*maxpos; + uchar lastkey[HA_MAX_KEY_BUFF],*buff; + DBUG_ENTER("_ma_search"); + DBUG_PRINT("enter",("pos: %lu nextflag: %u lastpos: %lu", + (ulong) pos, nextflag, (ulong) info->cur_row.lastpos)); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE,keyinfo->seg,key,key_len);); + + if (pos == HA_OFFSET_ERROR) + { + my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */ + info->cur_row.lastpos= HA_OFFSET_ERROR; + if (!(nextflag & (SEARCH_SMALLER | SEARCH_BIGGER | SEARCH_LAST))) + DBUG_RETURN(-1); /* Not found ; return error */ + DBUG_RETURN(1); /* Search at upper levels */ + } + + if (!(buff= _ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS, + info->keyread_buff, + test(!(nextflag & SEARCH_SAVE_BUFF))))) + goto err; + DBUG_DUMP("page", buff, maria_data_on_page(buff)); + + flag=(*keyinfo->bin_search)(info,keyinfo,buff,key,key_len,nextflag, + &keypos,lastkey, &last_key); + if (flag == MARIA_FOUND_WRONG_KEY) + DBUG_RETURN(-1); + nod_flag=_ma_test_if_nod(buff); + maxpos=buff+maria_data_on_page(buff)-1; + + if (flag) + { + if ((error= _ma_search(info,keyinfo,key,key_len,nextflag, + _ma_kpos(nod_flag,keypos))) <= 0) + DBUG_RETURN(error); + + if 
(flag >0) + { + if (nextflag & (SEARCH_SMALLER | SEARCH_LAST) && + keypos == buff+2+nod_flag) + DBUG_RETURN(1); /* Bigger than key */ + } + else if (nextflag & SEARCH_BIGGER && keypos >= maxpos) + DBUG_RETURN(1); /* Smaller than key */ + } + else + { + if ((nextflag & SEARCH_FIND) && nod_flag && + ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME || + key_len != USE_WHOLE_KEY)) + { + if ((error= _ma_search(info,keyinfo,key,key_len,SEARCH_FIND, + _ma_kpos(nod_flag,keypos))) >= 0 || + my_errno != HA_ERR_KEY_NOT_FOUND) + DBUG_RETURN(error); + info->last_keypage= HA_OFFSET_ERROR; /* Buffer not in mem */ + } + } + if (pos != info->last_keypage) + { + uchar *old_buff=buff; + if (!(buff= _ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS, + info->keyread_buff, + test(!(nextflag & SEARCH_SAVE_BUFF))))) + goto err; + keypos=buff+(keypos-old_buff); + maxpos=buff+(maxpos-old_buff); + } + + if ((nextflag & (SEARCH_SMALLER | SEARCH_LAST)) && flag != 0) + { + uint not_used[2]; + if (_ma_get_prev_key(info,keyinfo, buff, info->lastkey, keypos, + &info->lastkey_length)) + goto err; + if (!(nextflag & SEARCH_SMALLER) && + ha_key_cmp(keyinfo->seg, (uchar*) info->lastkey, (uchar*) key, key_len, + SEARCH_FIND, not_used)) + { + my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */ + goto err; + } + } + else + { + info->lastkey_length=(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,lastkey); + if (!info->lastkey_length) + goto err; + memcpy(info->lastkey,lastkey,info->lastkey_length); + } + info->cur_row.lastpos= _ma_dpos(info,0,info->lastkey+info->lastkey_length); + /* Save position for a possible read next / previous */ + info->int_keypos= info->keyread_buff+ (keypos-buff); + info->int_maxpos= info->keyread_buff+ (maxpos-buff); + info->int_nod_flag=nod_flag; + info->int_keytree_version=keyinfo->version; + info->last_search_keypage=info->last_keypage; + info->page_changed=0; + info->keyread_buff_used= (info->keyread_buff != buff); /* If we have to reread */ + + 
DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); + +err: + DBUG_PRINT("exit",("Error: %d",my_errno)); + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->page_changed=1; + DBUG_RETURN (-1); +} /* _ma_search */ + + + /* Search after key in page-block */ + /* If packed key puts smaller or identical key in buff */ + /* ret_pos point to where find or bigger key starts */ + /* ARGSUSED */ + +int _ma_bin_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *page, + uchar *key, uint key_len, uint comp_flag, uchar **ret_pos, + uchar *buff __attribute__((unused)), my_bool *last_key) +{ + reg4 int start,mid,end,save_end; + int flag; + uint totlength,nod_flag,not_used[2]; + DBUG_ENTER("_ma_bin_search"); + + LINT_INIT(flag); + totlength=keyinfo->keylength+(nod_flag=_ma_test_if_nod(page)); + start=0; mid=1; + save_end=end=(int) ((maria_data_on_page(page)-2-nod_flag)/totlength-1); + DBUG_PRINT("test",("page_length: %d end: %d",maria_data_on_page(page),end)); + page+=2+nod_flag; + + while (start != end) + { + mid= (start+end)/2; + if ((flag=ha_key_cmp(keyinfo->seg,(uchar*) page+(uint) mid*totlength, + (uchar*) key, key_len, comp_flag, not_used)) + >= 0) + end=mid; + else + start=mid+1; + } + if (mid != start) + flag=ha_key_cmp(keyinfo->seg, (uchar*) page+(uint) start*totlength, + (uchar*) key, key_len, comp_flag, not_used); + if (flag < 0) + start++; /* point at next, bigger key */ + *ret_pos=page+(uint) start*totlength; + *last_key= end == save_end; + DBUG_PRINT("exit",("flag: %d keypos: %d",flag,start)); + DBUG_RETURN(flag); +} /* _ma_bin_search */ + + +/* + Locate a packed key in a key page. + + SYNOPSIS + _ma_seq_search() + info Open table information. + keyinfo Key definition information. + page Key page (beginning). + key Search key. + key_len Length to use from search key or USE_WHOLE_KEY + comp_flag Search flags like SEARCH_SAME etc. + ret_pos RETURN Position in key page behind this key. 
+ buff RETURN Copy of previous or identical unpacked key. + last_key RETURN If key is last in page. + + DESCRIPTION + Used instead of _ma_bin_search() when key is packed. + Puts smaller or identical key in buff. + Key is searched sequentially. + + RETURN + > 0 Key in 'buff' is smaller than search key. + 0 Key in 'buff' is identical to search key. + < 0 Not found. +*/ + +int _ma_seq_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *page, + uchar *key, uint key_len, uint comp_flag, uchar **ret_pos, + uchar *buff, my_bool *last_key) +{ + int flag; + uint nod_flag,length,not_used[2]; + uchar t_buff[HA_MAX_KEY_BUFF],*end; + DBUG_ENTER("_ma_seq_search"); + + LINT_INIT(flag); LINT_INIT(length); + end= page+maria_data_on_page(page); + nod_flag=_ma_test_if_nod(page); + page+=2+nod_flag; + *ret_pos=page; + t_buff[0]=0; /* Avoid bugs */ + while (page < end) + { + length=(*keyinfo->get_key)(keyinfo,nod_flag,&page,t_buff); + if (length == 0 || page > end) + { + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_PRINT("error", + ("Found wrong key: length: %u page: 0x%lx end: 0x%lx", + length, (long) page, (long) end)); + DBUG_RETURN(MARIA_FOUND_WRONG_KEY); + } + if ((flag= ha_key_cmp(keyinfo->seg, (uchar*) t_buff,(uchar*) key, + key_len,comp_flag, not_used)) >= 0) + break; +#ifdef EXTRA_DEBUG + DBUG_PRINT("loop",("page: 0x%lx key: '%s' flag: %d", (long) page, t_buff, + flag)); +#endif + memcpy(buff,t_buff,length); + *ret_pos=page; + } + if (flag == 0) + memcpy(buff,t_buff,length); /* Result is first key */ + *last_key= page == end; + DBUG_PRINT("exit",("flag: %d ret_pos: 0x%lx", flag, (long) *ret_pos)); + DBUG_RETURN(flag); +} /* _ma_seq_search */ + + +int _ma_prefix_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + uchar *page, uchar *key, uint key_len, uint nextflag, + uchar **ret_pos, uchar *buff, my_bool *last_key) +{ + /* + my_flag is raw comparison result to be changed according to + SEARCH_NO_FIND,SEARCH_LAST and 
HA_REVERSE_SORT flags. + flag is the value returned by ha_key_cmp and as treated as final + */ + int flag=0, my_flag=-1; + uint nod_flag, length, len, matched, cmplen, kseg_len; + uint prefix_len,suffix_len; + int key_len_skip, seg_len_pack, key_len_left; + uchar *end; + uchar *kseg, *vseg, *saved_vseg, *saved_from; + uchar *sort_order= keyinfo->seg->charset->sort_order; + uchar tt_buff[HA_MAX_KEY_BUFF+2], *t_buff=tt_buff+2; + uchar *saved_to; + uint saved_length=0, saved_prefix_len=0; + uint length_pack; + DBUG_ENTER("_ma_prefix_search"); + + LINT_INIT(length); + LINT_INIT(prefix_len); + LINT_INIT(seg_len_pack); + LINT_INIT(saved_from); + LINT_INIT(saved_to); + LINT_INIT(saved_vseg); + + t_buff[0]=0; /* Avoid bugs */ + end= page+maria_data_on_page(page); + nod_flag=_ma_test_if_nod(page); + page+=2+nod_flag; + *ret_pos=page; + kseg= (uchar*) key; + + get_key_pack_length(kseg_len, length_pack, kseg); + key_len_skip=length_pack+kseg_len; + key_len_left=(int) key_len- (int) key_len_skip; + /* If key_len is 0, then length_pack is 1, then key_len_left is -1. */ + cmplen=(key_len_left>=0) ? kseg_len : key_len-length_pack; + DBUG_PRINT("info",("key: '%.*s'",kseg_len,kseg)); + + /* + Keys are compressed the following way: + + If the max length of the first key segment is < 127 bytes the prefix + length is stored in 1 byte, else in 2 bytes + + (prefix) length The high bit is set if this is a prefix for the prev key. + [suffix length] Packed length of suffix if the previous was a prefix. + (suffix) data Key data bytes (past the common prefix or whole segment). + [next-key-seg] Next key segments (([packed length], data), ...) + pointer Reference to the data file (last_keyseg->length). 
+ */ + + matched=0; /* how many chars of the prefix were already matched */ + len=0; /* unpacked length of the previous key */ + + while (page < end) + { + uint packed= *page & 128; + + vseg= (uchar*) page; + if (keyinfo->seg->length >= 127) + { + suffix_len=mi_uint2korr(vseg) & 32767; + vseg+=2; + } + else + suffix_len= *vseg++ & 127; + + if (packed) + { + if (suffix_len == 0) + { + /* == 0x80 or 0x8000, same key, prefix length == old key length. */ + prefix_len=len; + } + else + { + /* > 0x80 or 0x8000, this is the prefix length; packed suffix length follows. */ + prefix_len=suffix_len; + get_key_length(suffix_len,vseg); + } + } + else + { + /* Not packed. No prefix used from last key. */ + prefix_len=0; + } + + len=prefix_len+suffix_len; + seg_len_pack=get_pack_length(len); + t_buff=tt_buff+3-seg_len_pack; + store_key_length(t_buff,len); + + if (prefix_len > saved_prefix_len) + memcpy(t_buff+seg_len_pack+saved_prefix_len,saved_vseg, + prefix_len-saved_prefix_len); + saved_vseg=vseg; + saved_prefix_len=prefix_len; + + DBUG_PRINT("loop",("page: '%.*s%.*s'",prefix_len,t_buff+seg_len_pack, + suffix_len,vseg)); + { /* Step over the remaining key segments to find where this key ends */ + uchar *from= vseg+suffix_len; + HA_KEYSEG *keyseg; + uint l; + + for (keyseg=keyinfo->seg+1 ; keyseg->type ; keyseg++ ) + { + + if (keyseg->flag & HA_NULL_PART) + { + if (!(*from++)) + continue; + } + if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK)) + { + get_key_length(l,from); + } + else + l=keyseg->length; + + from+=l; + } + from+= keyseg->length; + page= (uchar*) from+nod_flag; + length= (uint) (from-vseg); + } + + if (page > end) + { + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_PRINT("error", + ("Found wrong key: length: %u page: 0x%lx end: %lx", + length, (long) page, (long) end)); + DBUG_RETURN(MARIA_FOUND_WRONG_KEY); + } + + if (matched >= prefix_len) + { + /* We have to compare. 
But we can still skip part of the key */ + uint left; + uchar *k= kseg+prefix_len; + + /* + If prefix_len > cmplen then we are in the end-space comparison + phase. Do not try to access the key any more ==> left= 0. + */ + left= ((len <= cmplen) ? suffix_len : + ((prefix_len < cmplen) ? cmplen - prefix_len : 0)); + + matched=prefix_len+left; + + if (sort_order) + { + for (my_flag=0;left;left--) + if ((my_flag= (int) sort_order[*vseg++] - (int) sort_order[*k++])) + break; + } + else + { + for (my_flag=0;left;left--) + if ((my_flag= (int) *vseg++ - (int) *k++)) + break; + } + + if (my_flag>0) /* mismatch */ + break; + if (my_flag==0) /* match */ + { + /* + ** len cmplen seg_left_len more_segs + ** < matched=len; continue search + ** > = prefix ? found : (matched=len; continue search) + ** > < - ok, found + ** = < - ok, found + ** = = - ok, found + ** = = + next seg + */ + if (len < cmplen) + { + if ((keyinfo->seg->type != HA_KEYTYPE_TEXT && + keyinfo->seg->type != HA_KEYTYPE_VARTEXT1 && + keyinfo->seg->type != HA_KEYTYPE_VARTEXT2)) + my_flag= -1; + else + { + /* We have to compare k and vseg as if they were space extended */ + uchar *k_end= k+ (cmplen - len); + for ( ; k < k_end && *k == ' '; k++) ; + if (k == k_end) + goto cmp_rest; /* should never happen */ + if ((uchar) *k < (uchar) ' ') + { + my_flag= 1; /* Compared string is smaller */ + break; + } + my_flag= -1; /* Continue searching */ + } + } + else if (len > cmplen) + { + uchar *vseg_end; + if ((nextflag & SEARCH_PREFIX) && key_len_left == 0) + goto fix_flag; + + /* We have to compare k and vseg as if they were space extended */ + for (vseg_end= vseg + (len-cmplen) ; + vseg < vseg_end && *vseg == (uchar) ' '; + vseg++, matched++) ; + DBUG_ASSERT(vseg < vseg_end); + + if ((uchar) *vseg > (uchar) ' ') + { + my_flag= 1; /* Compared string is smaller */ + break; + } + my_flag= -1; /* Continue searching */ + } + else + { + cmp_rest: + if (key_len_left>0) + { + uint not_used[2]; + if ((flag = 
ha_key_cmp(keyinfo->seg+1,vseg, + k, key_len_left, nextflag, not_used)) >= 0) + break; + } + else + { + /* + at this line flag==-1 if the following lines were already + visited and 0 otherwise, i.e. flag <=0 here always !!! + */ + fix_flag: + DBUG_ASSERT(flag <= 0); + if (nextflag & (SEARCH_NO_FIND | SEARCH_LAST)) + flag=(nextflag & (SEARCH_BIGGER | SEARCH_LAST)) ? -1 : 1; + if (flag>=0) + break; + } + } + } + matched-=left; + } + /* else (matched < prefix_len) ---> do nothing. */ + + memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len); + saved_to= buff+saved_length; + saved_from= saved_vseg; + saved_length=length; + *ret_pos=page; + } + if (my_flag) + flag=(keyinfo->seg->flag & HA_REVERSE_SORT) ? -my_flag : my_flag; + if (flag == 0) + { + memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len); + saved_to= buff+saved_length; + saved_from= saved_vseg; + saved_length=length; + } + if (saved_length) + memcpy(saved_to, (uchar*) saved_from, saved_length); + + *last_key= page == end; + + DBUG_PRINT("exit",("flag: %d ret_pos: 0x%lx", flag, (long) *ret_pos)); + DBUG_RETURN(flag); +} /* _ma_prefix_search */ + + + /* Get pos to a key_block: read the nod_flag-byte page pointer that ends at after_key and scale it by MARIA_MIN_KEY_BLOCK_LENGTH; returns HA_OFFSET_ERROR when nod_flag is 0 (leaf page) */ + +my_off_t _ma_kpos(uint nod_flag, uchar *after_key) +{ + after_key-=nod_flag; + switch (nod_flag) { +#if SIZEOF_OFF_T > 4 + case 7: + return mi_uint7korr(after_key)*MARIA_MIN_KEY_BLOCK_LENGTH; + case 6: + return mi_uint6korr(after_key)*MARIA_MIN_KEY_BLOCK_LENGTH; + case 5: + return mi_uint5korr(after_key)*MARIA_MIN_KEY_BLOCK_LENGTH; +#else + case 7: + after_key++; /* fall through */ + case 6: + after_key++; /* fall through */ + case 5: + after_key++; /* fall through: only the last 4 bytes are read */ +#endif + case 4: + return ((my_off_t) mi_uint4korr(after_key))*MARIA_MIN_KEY_BLOCK_LENGTH; + case 3: + return ((my_off_t) mi_uint3korr(after_key))*MARIA_MIN_KEY_BLOCK_LENGTH; + case 2: + return (my_off_t) (mi_uint2korr(after_key)*MARIA_MIN_KEY_BLOCK_LENGTH); + case 1: + return (uint) (*after_key)*MARIA_MIN_KEY_BLOCK_LENGTH; + case 0: /* At leaf page */ + default: /* Impossible */ + return(HA_OFFSET_ERROR); + } +} /* 
_ma_kpos */ + + + /* Save pos to a key_block: store key-page position pos, divided by MARIA_MIN_KEY_BLOCK_LENGTH, into buff using info->s->base.key_reflength bytes */ + +void _ma_kpointer(register MARIA_HA *info, register uchar *buff, my_off_t pos) +{ + pos/=MARIA_MIN_KEY_BLOCK_LENGTH; + switch (info->s->base.key_reflength) { +#if SIZEOF_OFF_T > 4 + case 7: mi_int7store(buff,pos); break; + case 6: mi_int6store(buff,pos); break; + case 5: mi_int5store(buff,pos); break; +#else + case 7: *buff++=0; + /* fall through */ + case 6: *buff++=0; + /* fall through */ + case 5: *buff++=0; + /* fall through */ +#endif + case 4: mi_int4store(buff,pos); break; + case 3: mi_int3store(buff,pos); break; + case 2: mi_int2store(buff,(uint) pos); break; + case 1: buff[0]= (uchar) pos; break; + default: abort(); /* impossible */ + } +} /* _ma_kpointer */ + + + /* Calc pos to a data-record from a key: read the rec_reflength-byte record pointer that precedes the nod_flag bytes ending at after_key; for STATIC_RECORD files the stored value is multiplied by pack_reclength */ + + +my_off_t _ma_dpos(MARIA_HA *info, uint nod_flag, const uchar *after_key) +{ + my_off_t pos; + after_key-=(nod_flag + info->s->rec_reflength); + switch (info->s->rec_reflength) { +#if SIZEOF_OFF_T > 4 + case 8: pos= (my_off_t) mi_uint8korr(after_key); break; + case 7: pos= (my_off_t) mi_uint7korr(after_key); break; + case 6: pos= (my_off_t) mi_uint6korr(after_key); break; + case 5: pos= (my_off_t) mi_uint5korr(after_key); break; +#else + case 8: pos= (my_off_t) mi_uint4korr(after_key+4); break; + case 7: pos= (my_off_t) mi_uint4korr(after_key+3); break; + case 6: pos= (my_off_t) mi_uint4korr(after_key+2); break; + case 5: pos= (my_off_t) mi_uint4korr(after_key+1); break; +#endif + case 4: pos= (my_off_t) mi_uint4korr(after_key); break; + case 3: pos= (my_off_t) mi_uint3korr(after_key); break; + case 2: pos= (my_off_t) mi_uint2korr(after_key); break; + default: + pos=0L; /* Shut compiler up */ + } + return ((info->s->data_file_type == STATIC_RECORD) ? 
+ pos * info->s->base.pack_reclength : pos); +} + + +/* Calc position from a record pointer ( in delete link chain ) */ + +my_off_t _ma_rec_pos(MARIA_SHARE *s, uchar *ptr) +{ + my_off_t pos; + switch (s->rec_reflength) { +#if SIZEOF_OFF_T > 4 + case 8: + pos= (my_off_t) mi_uint8korr(ptr); + if (pos == HA_OFFSET_ERROR) + return HA_OFFSET_ERROR; /* end of list */ + break; + case 7: + pos= (my_off_t) mi_uint7korr(ptr); + if (pos == (((my_off_t) 1) << 56) -1) + return HA_OFFSET_ERROR; /* end of list */ + break; + case 6: + pos= (my_off_t) mi_uint6korr(ptr); + if (pos == (((my_off_t) 1) << 48) -1) + return HA_OFFSET_ERROR; /* end of list */ + break; + case 5: + pos= (my_off_t) mi_uint5korr(ptr); + if (pos == (((my_off_t) 1) << 40) -1) + return HA_OFFSET_ERROR; /* end of list */ + break; +#else + case 8: + case 7: + case 6: + case 5: + ptr+= (s->rec_reflength-4); + /* fall through */ +#endif + case 4: + pos= (my_off_t) mi_uint4korr(ptr); + if (pos == (my_off_t) (uint32) ~0L) + return HA_OFFSET_ERROR; + break; + case 3: + pos= (my_off_t) mi_uint3korr(ptr); + if (pos == (my_off_t) (1 << 24) -1) + return HA_OFFSET_ERROR; + break; + case 2: + pos= (my_off_t) mi_uint2korr(ptr); + if (pos == (my_off_t) (1 << 16) -1) + return HA_OFFSET_ERROR; + break; + default: abort(); /* Impossible */ + } + return ((s->data_file_type == STATIC_RECORD) ? 
+ pos * s->base.pack_reclength : pos); +} + + + /* save position to record: store data-record position pos into buff using info->s->rec_reflength bytes; for STATIC_RECORD files pos is first divided by pack_reclength unless it is HA_OFFSET_ERROR */ + +void _ma_dpointer(MARIA_HA *info, uchar *buff, my_off_t pos) +{ + if (info->s->data_file_type == STATIC_RECORD && + pos != HA_OFFSET_ERROR) + pos/= info->s->base.pack_reclength; + + switch (info->s->rec_reflength) { +#if SIZEOF_OFF_T > 4 + case 8: mi_int8store(buff,pos); break; + case 7: mi_int7store(buff,pos); break; + case 6: mi_int6store(buff,pos); break; + case 5: mi_int5store(buff,pos); break; +#else + case 8: *buff++=0; + /* fall through */ + case 7: *buff++=0; + /* fall through */ + case 6: *buff++=0; + /* fall through */ + case 5: *buff++=0; + /* fall through */ +#endif + case 4: mi_int4store(buff,pos); break; + case 3: mi_int3store(buff,pos); break; + case 2: mi_int2store(buff,(uint) pos); break; + default: abort(); /* Impossible */ + } +} /* _ma_dpointer */ + + + /* Get key from key-block */ + /* page points at previous key; it is advanced to point at next key */ + /* key should contain previous key */ + /* Returns length of found key + pointers */ + /* nod_flag is the length of the node pointer if we are on a nod, else zero */ + + /* same as _ma_get_key but used with fixed length keys */ + +uint _ma_get_static_key(register MARIA_KEYDEF *keyinfo, uint nod_flag, + register uchar **page, register uchar *key) +{ + memcpy((uchar*) key,(uchar*) *page, + (size_t) (keyinfo->keylength+nod_flag)); + *page+=keyinfo->keylength+nod_flag; + return(keyinfo->keylength); +} /* _ma_get_static_key */ + + +/* + get key witch is packed against previous key or key with a NULL column. + + SYNOPSIS + _ma_get_pack_key() + keyinfo key definition information. + nod_flag If nod: Length of node pointer, else zero. + page_pos RETURN position in key page behind this key. + key IN/OUT in: prev key, out: unpacked key. 
+ + RETURN + key_length + length of data pointer +*/ + +uint _ma_get_pack_key(register MARIA_KEYDEF *keyinfo, uint nod_flag, + register uchar **page_pos, register uchar *key) +{ + reg1 HA_KEYSEG *keyseg; + uchar *start_key,*page=*page_pos; + uint length; + + start_key=key; + for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++) + { + if (keyseg->flag & HA_PACK_KEY) + { + /* key with length, packed to previous key */ + uchar *start= key; + uint packed= *page & 128,tot_length,rest_length; + if (keyseg->length >= 127) + { + length=mi_uint2korr(page) & 32767; + page+=2; + } + else + length= *page++ & 127; + + if (packed) + { + if (length > (uint) keyseg->length) + { + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + return 0; /* Error */ + } + if (length == 0) /* Same key */ + { + if (keyseg->flag & HA_NULL_PART) + *key++=1; /* Can't be NULL */ + get_key_length(length,key); + key+= length; /* Same diff_key as prev */ + if (length > keyseg->length) + { + DBUG_PRINT("error", + ("Found too long null packed key: %u of %u at 0x%lx", + length, keyseg->length, (long) *page_pos)); + DBUG_DUMP("key",(char*) *page_pos,16); + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + return 0; + } + continue; + } + if (keyseg->flag & HA_NULL_PART) + { + key++; /* Skip null marker*/ + start++; + } + + get_key_length(rest_length,page); + tot_length=rest_length+length; + + /* If the stored length has changed, we must move the key */ + if (tot_length >= 255 && *start != 255) + { + /* length prefix changed from a length of one to a length of 3 */ + bmove_upp((char*) key+length+3,(char*) key+length+1,length); + *key=255; + mi_int2store(key+1,tot_length); + key+=3+length; + } + else if (tot_length < 255 && *start == 255) + { + bmove(key+1,key+3,length); + *key=tot_length; + key+=1+length; + } + else + { + store_key_length_inc(key,tot_length); + key+=length; + } + memcpy(key,page,rest_length); + page+=rest_length; + key+=rest_length; + 
continue; + } + else + { + if (keyseg->flag & HA_NULL_PART) + { + if (!length--) /* Null part */ + { + *key++=0; + continue; + } + *key++=1; /* Not null */ + } + } + if (length > (uint) keyseg->length) + { + DBUG_PRINT("error",("Found too long packed key: %u of %u at 0x%lx", + length, keyseg->length, (long) *page_pos)); + DBUG_DUMP("key",(char*) *page_pos,16); + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + return 0; /* Error */ + } + store_key_length_inc(key,length); + } + else + { + if (keyseg->flag & HA_NULL_PART) + { + if (!(*key++ = *page++)) + continue; + } + if (keyseg->flag & + (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK)) + { + uchar *tmp=page; + get_key_length(length,tmp); + length+=(uint) (tmp-page); + } + else + length=keyseg->length; + } + memcpy((uchar*) key,(uchar*) page,(size_t) length); + key+=length; + page+=length; + } + length=keyseg->length+nod_flag; + bmove((uchar*) key,(uchar*) page,length); + *page_pos= page+length; + return ((uint) (key-start_key)+keyseg->length); +} /* _ma_get_pack_key */ + + + +/* key that is packed relatively to previous */ + +uint _ma_get_binary_pack_key(register MARIA_KEYDEF *keyinfo, uint nod_flag, + register uchar **page_pos, register uchar *key) +{ + reg1 HA_KEYSEG *keyseg; + uchar *start_key,*page,*page_end,*from,*from_end; + uint length,tmp; + DBUG_ENTER("_ma_get_binary_pack_key"); + + page= *page_pos; + page_end=page+HA_MAX_KEY_BUFF+1; + start_key=key; + + /* + Keys are compressed the following way: + + prefix length Packed length of prefix common with prev key. (1 or 3 bytes) + for each key segment: + [is null] Null indicator if can be null (1 byte, zero means null) + [length] Packed length if varlength (1 or 3 bytes) + key segment 'length' bytes of key segment value + pointer Reference to the data file (last_keyseg->length). + + get_key_length() is a macro. It gets the prefix length from 'page' + and puts it into 'length'. 
It increments 'page' by 1 or 3, depending + on the packed length of the prefix length. + */ + get_key_length(length,page); + if (length) + { + if (length > keyinfo->maxlength) + { + DBUG_PRINT("error", + ("Found too long binary packed key: %u of %u at 0x%lx", + length, keyinfo->maxlength, (long) *page_pos)); + DBUG_DUMP("key",(char*) *page_pos,16); + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(0); /* Wrong key */ + } + /* Key is packed against prev key, take prefix from prev key. */ + from= key; + from_end= key + length; + } + else + { + /* Key is not packed against prev key, take all from page buffer. */ + from= page; + from_end= page_end; + } + + /* + The trouble is that key can be split in two parts: + The first part (prefix) is in from .. from_end - 1. + The second part starts at page. + The split can be at every byte position. So we need to check for + the end of the first part before using every byte. + */ + for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++) + { + if (keyseg->flag & HA_NULL_PART) + { + /* If prefix is used up, switch to rest. */ + if (from == from_end) { from=page; from_end=page_end; } + if (!(*key++ = *from++)) + continue; /* Null part */ + } + if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK)) + { + /* If prefix is used up, switch to rest. */ + if (from == from_end) { from=page; from_end=page_end; } + /* Get length of dynamic length key part */ + if ((length= (uint) (uchar) (*key++ = *from++)) == 255) + { + /* If prefix is used up, switch to rest. */ + if (from == from_end) { from=page; from_end=page_end; } + length= ((uint) (uchar) ((*key++ = *from++))) << 8; + /* If prefix is used up, switch to rest. 
*/ + if (from == from_end) { from=page; from_end=page_end; } + length+= (uint) (uchar) ((*key++ = *from++)); + } + } + else + length=keyseg->length; + + if ((tmp=(uint) (from_end-from)) <= length) + { + key+=tmp; /* Use old key */ + length-=tmp; + from=page; from_end=page_end; + } + DBUG_ASSERT((int) length >= 0); + DBUG_PRINT("info",("key: 0x%lx from: 0x%lx length: %u", + (long) key, (long) from, length)); + memmove((uchar*) key, (uchar*) from, (size_t) length); + key+=length; + from+=length; + } + /* + Last segment (type == 0) contains length of data pointer. + If we have mixed key blocks with data pointer and key block pointer, + we have to copy both. + */ + length=keyseg->length+nod_flag; + if ((tmp=(uint) (from_end-from)) <= length) + { + /* Remaining length is less or equal max possible length. */ + memcpy(key+tmp,page,length-tmp); /* Get last part of key */ + *page_pos= page+length-tmp; + } + else + { + /* + Remaining length is greater than max possible length. + This can happen only if we switched to the new key bytes already. + 'page_end' is calculated with HA_MAX_KEY_BUFF. So it can be far + behind the real end of the key. + */ + if (from_end != page_end) + { + DBUG_PRINT("error",("Error when unpacking key")); + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(0); /* Error */ + } + /* Copy data pointer and, if appropriate, key block pointer. */ + memcpy((uchar*) key,(uchar*) from,(size_t) length); + *page_pos= from+length; + } + DBUG_RETURN((uint) (key-start_key)+keyseg->length); +} + + + /* Get key at position without knowledge of previous key */ + /* Returns pointer to next key, or 0 if a packed key could not be unpacked. NOTE(review): the fixed-length branch returns without storing *return_key_length; confirm callers do not rely on it. */ + +uchar *_ma_get_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *page, + uchar *key, uchar *keypos, uint *return_key_length) +{ + uint nod_flag; + DBUG_ENTER("_ma_get_key"); + + nod_flag=_ma_test_if_nod(page); + if (! 
(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY))) + { + bmove((uchar*) key,(uchar*) keypos,keyinfo->keylength+nod_flag); + DBUG_RETURN(keypos+keyinfo->keylength+nod_flag); + } + else + { + page+=2+nod_flag; + key[0]=0; /* safety */ + while (page <= keypos) + { + *return_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&page,key); + if (*return_key_length == 0) + { + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(0); + } + } + } + DBUG_PRINT("exit",("page: 0x%lx length: %u", (long) page, + *return_key_length)); + DBUG_RETURN(page); +} /* _ma_get_key */ + + + /* Get the key stored just before keypos, without knowledge of previous key */ + /* Returns 0 if ok, 1 if a packed key could not be unpacked (my_errno is set to HA_ERR_CRASHED) */ + +static my_bool _ma_get_prev_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *page, uchar *key, uchar *keypos, + uint *return_key_length) +{ + uint nod_flag; + DBUG_ENTER("_ma_get_prev_key"); + + nod_flag=_ma_test_if_nod(page); + if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY))) + { + *return_key_length=keyinfo->keylength; + bmove((uchar*) key,(uchar*) keypos- *return_key_length-nod_flag, + *return_key_length); + DBUG_RETURN(0); + } + else + { + page+=2+nod_flag; + key[0]=0; /* safety */ + while (page < keypos) + { + *return_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&page,key); + if (*return_key_length == 0) + { + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(1); + } + } + } + DBUG_RETURN(0); +} /* _ma_get_prev_key */ + + + + /* Get last key from key-page */ + /* Return pointer to where key starts, or 0 if a packed key could not be unpacked */ + +uchar *_ma_get_last_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *page, + uchar *lastkey, uchar *endpos, uint *return_key_length) +{ + uint nod_flag; + uchar *lastpos; + DBUG_ENTER("_ma_get_last_key"); + DBUG_PRINT("enter",("page: 0x%lx endpos: 0x%lx", (long) page, + (long) endpos)); + + nod_flag=_ma_test_if_nod(page); + if (! 
(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY))) + { + lastpos=endpos-keyinfo->keylength-nod_flag; + *return_key_length=keyinfo->keylength; + if (lastpos > page) + bmove((uchar*) lastkey,(uchar*) lastpos,keyinfo->keylength+nod_flag); + } + else + { + lastpos=(page+=2+nod_flag); + lastkey[0]=0; + while (page < endpos) + { + lastpos=page; + *return_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&page,lastkey); + if (*return_key_length == 0) + { + DBUG_PRINT("error",("Couldn't find last key: page: 0x%lx", + (long) page)); + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(0); + } + } + } + DBUG_PRINT("exit",("lastpos: 0x%lx length: %u", (long) lastpos, + *return_key_length)); + DBUG_RETURN(lastpos); +} /* _ma_get_last_key */ + + + /* Calculate length of key */ + +uint _ma_keylength(MARIA_KEYDEF *keyinfo, register const uchar *key) +{ + reg1 HA_KEYSEG *keyseg; + const uchar *start; + + if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY))) + return (keyinfo->keylength); + + start= key; + for (keyseg=keyinfo->seg ; keyseg->type ; keyseg++) + { + if (keyseg->flag & HA_NULL_PART) + if (!*key++) + continue; + if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + uint length; + get_key_length(length,key); + key+=length; + } + else + key+= keyseg->length; + } + return((uint) (key-start)+keyseg->length); +} /* _ma_keylength */ + + +/* + Calculate length of part key. + + Used in maria_rkey() to find the key found for the key-part that was used. 
+ This is needed in case of multi-byte character sets where we may search + after '0xDF' but find 'ss' +*/ + +uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, register const uchar *key, + HA_KEYSEG *end) +{ + reg1 HA_KEYSEG *keyseg; + const uchar *start= key; + + for (keyseg=keyinfo->seg ; keyseg != end ; keyseg++) + { + if (keyseg->flag & HA_NULL_PART) + if (!*key++) + continue; + if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + uint length; + get_key_length(length,key); + key+=length; + } + else + key+= keyseg->length; + } + return (uint) (key-start); +} + + +/* Move a key */ + +uchar *_ma_move_key(MARIA_KEYDEF *keyinfo, uchar *to, const uchar *from) +{ + reg1 uint length; + memcpy(to, from, (size_t) (length= _ma_keylength(keyinfo, from))); + return to+length; +} + + +/* + Find next/previous record with same key + + WARNING + This can't be used when database is touched after last read +*/ + +int _ma_search_next(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + uchar *key, uint key_length, uint nextflag, my_off_t pos) +{ + int error; + uint nod_flag; + uchar lastkey[HA_MAX_KEY_BUFF]; + DBUG_ENTER("_ma_search_next"); + DBUG_PRINT("enter",("nextflag: %u lastpos: %lu int_keypos: %lu page_changed %d keyread_buff_used: %d", + nextflag, (ulong) info->cur_row.lastpos, + (ulong) info->int_keypos, + info->page_changed, info->keyread_buff_used)); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE,keyinfo->seg,key,key_length);); + + /* Force full read if we are at last key or if we are not on a leaf + and the key tree has changed since we used it last time + Note that even if the key tree has changed since last read, we can use + the last read data from the leaf if we haven't used the buffer for + something else. 
+ */ + + if (((nextflag & SEARCH_BIGGER) && info->int_keypos >= info->int_maxpos) || + info->page_changed || + (info->int_keytree_version != keyinfo->version && + (info->int_nod_flag || info->keyread_buff_used))) + DBUG_RETURN(_ma_search(info,keyinfo,key, USE_WHOLE_KEY, + nextflag | SEARCH_SAVE_BUFF, pos)); + + if (info->keyread_buff_used) + { + if (!_ma_fetch_keypage(info,keyinfo,info->last_search_keypage, + DFLT_INIT_HITS,info->keyread_buff,0)) + DBUG_RETURN(-1); + info->keyread_buff_used=0; + } + + /* Last used buffer is in info->keyread_buff */ + nod_flag=_ma_test_if_nod(info->keyread_buff); + + if (nextflag & SEARCH_BIGGER) /* Next key */ + { + my_off_t tmp_pos= _ma_kpos(nod_flag,info->int_keypos); + if (tmp_pos != HA_OFFSET_ERROR) + { + if ((error= _ma_search(info,keyinfo,key, USE_WHOLE_KEY, + nextflag | SEARCH_SAVE_BUFF, tmp_pos)) <=0) + DBUG_RETURN(error); + } + memcpy(lastkey,key,key_length); + if (!(info->lastkey_length=(*keyinfo->get_key)(keyinfo,nod_flag, + &info->int_keypos,lastkey))) + DBUG_RETURN(-1); + } + else /* Previous key */ + { + uint length; + /* Find start of previous key */ + info->int_keypos= _ma_get_last_key(info,keyinfo,info->keyread_buff,lastkey, + info->int_keypos, &length); + if (!info->int_keypos) + DBUG_RETURN(-1); + if (info->int_keypos == info->keyread_buff+2) + DBUG_RETURN(_ma_search(info,keyinfo,key, USE_WHOLE_KEY, + nextflag | SEARCH_SAVE_BUFF, pos)); + if ((error= _ma_search(info,keyinfo,key, USE_WHOLE_KEY, + nextflag | SEARCH_SAVE_BUFF, + _ma_kpos(nod_flag,info->int_keypos))) <= 0) + DBUG_RETURN(error); + + /* QQ: We should be able to optimize away the following call */ + if (! 
_ma_get_last_key(info,keyinfo,info->keyread_buff,lastkey, + info->int_keypos,&info->lastkey_length)) + DBUG_RETURN(-1); + } + memcpy(info->lastkey,lastkey,info->lastkey_length); + info->cur_row.lastpos= _ma_dpos(info,0,info->lastkey+info->lastkey_length); + DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); +} /* _ma_search_next */ + + + /* Search after position for the first row in an index */ + /* This is stored in info->cur_row.lastpos */ + +int _ma_search_first(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + register my_off_t pos) +{ + uint nod_flag; + uchar *page; + DBUG_ENTER("_ma_search_first"); + + if (pos == HA_OFFSET_ERROR) + { + my_errno=HA_ERR_KEY_NOT_FOUND; + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + + do + { + if (!_ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS,info->keyread_buff,0)) + { + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + nod_flag=_ma_test_if_nod(info->keyread_buff); + page=info->keyread_buff+2+nod_flag; + } while ((pos= _ma_kpos(nod_flag,page)) != HA_OFFSET_ERROR); + + if (!(info->lastkey_length=(*keyinfo->get_key)(keyinfo,nod_flag,&page, + info->lastkey))) + DBUG_RETURN(-1); /* Crashed */ + + info->int_keypos=page; info->int_maxpos=info->keyread_buff+maria_data_on_page(info->keyread_buff)-1; + info->int_nod_flag=nod_flag; + info->int_keytree_version=keyinfo->version; + info->last_search_keypage=info->last_keypage; + info->page_changed=info->keyread_buff_used=0; + info->cur_row.lastpos= _ma_dpos(info,0,info->lastkey+info->lastkey_length); + + DBUG_PRINT("exit",("found key at %lu", (ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); +} /* _ma_search_first */ + + + /* Search after position for the last row in an index */ + /* This is stored in info->cur_row.lastpos */ + +int _ma_search_last(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + register my_off_t pos) +{ + uint nod_flag; + uchar *buff,*page; + DBUG_ENTER("_ma_search_last"); + + 
if (pos == HA_OFFSET_ERROR) + { + my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */ + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + + buff=info->keyread_buff; + do + { + if (!_ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS,buff,0)) + { + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + page= buff+maria_data_on_page(buff); + nod_flag=_ma_test_if_nod(buff); + } while ((pos= _ma_kpos(nod_flag,page)) != HA_OFFSET_ERROR); + + if (!_ma_get_last_key(info,keyinfo,buff,info->lastkey,page, + &info->lastkey_length)) + DBUG_RETURN(-1); + info->cur_row.lastpos= _ma_dpos(info,0,info->lastkey+info->lastkey_length); + info->int_keypos=info->int_maxpos=page; + info->int_nod_flag=nod_flag; + info->int_keytree_version=keyinfo->version; + info->last_search_keypage=info->last_keypage; + info->page_changed=info->keyread_buff_used=0; + + DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); +} /* _ma_search_last */ + + + +/**************************************************************************** +** +** Functions to store and pack a key in a page +** +** maria_calc_xx_key_length takes the following arguments: +** nod_flag If nod: Length of nod-pointer +** next_key Position to pos after the new key in buffer +** org_key Key that was before the next key in buffer +** prev_key Last key before current key +** key Key that will be stored +** s_temp Information how next key will be packed +****************************************************************************/ + +/* Static length key */ + +int +_ma_calc_static_key_length(MARIA_KEYDEF *keyinfo,uint nod_flag, + uchar *next_pos __attribute__((unused)), + uchar *org_key __attribute__((unused)), + uchar *prev_key __attribute__((unused)), + const uchar *key, MARIA_KEY_PARAM *s_temp) +{ + s_temp->key= key; + return (int) (s_temp->totlength=keyinfo->keylength+nod_flag); +} + +/* Variable length key */ + +int +_ma_calc_var_key_length(MARIA_KEYDEF *keyinfo,uint nod_flag, 
+ uchar *next_pos __attribute__((unused)), + uchar *org_key __attribute__((unused)), + uchar *prev_key __attribute__((unused)), + const uchar *key, MARIA_KEY_PARAM *s_temp) +{ + s_temp->key= key; + return (int) (s_temp->totlength= _ma_keylength(keyinfo,key)+nod_flag); +} + +/* + length of key with a variable length first segment which is prefix + compressed (maria_chk reports 'packed + stripped') + + Keys are compressed the following way: + + If the max length of first key segment <= 127 bytes the prefix is + 1 uchar else it's 2 byte + + prefix byte(s) The high bit is set if this is a prefix for the prev key + length Packed length if the previous was a prefix byte + [length] data bytes ('length' bytes) + next-key-seg Next key segments + + If the first segment can have NULL: + The length is 0 for NULLS and 1+length for not null columns. + +*/ + +int +_ma_calc_var_pack_key_length(MARIA_KEYDEF *keyinfo, uint nod_flag, + uchar *next_key, + uchar *org_key, uchar *prev_key, const uchar *key, + MARIA_KEY_PARAM *s_temp) +{ + reg1 HA_KEYSEG *keyseg; + int length; + uint key_length,ref_length,org_key_length=0, + length_pack,new_key_length,diff_flag,pack_marker; + const uchar *start,*end,*key_end; + uchar *sort_order; + bool same_length; + + length_pack=s_temp->ref_length=s_temp->n_ref_length=s_temp->n_length=0; + same_length=0; keyseg=keyinfo->seg; + key_length= _ma_keylength(keyinfo,key)+nod_flag; + + sort_order=0; + if ((keyinfo->flag & HA_FULLTEXT) && + ((keyseg->type == HA_KEYTYPE_TEXT) || + (keyseg->type == HA_KEYTYPE_VARTEXT1) || + (keyseg->type == HA_KEYTYPE_VARTEXT2)) && + !use_strnxfrm(keyseg->charset)) + sort_order= keyseg->charset->sort_order; + + /* diff flag contains how many bytes is needed to pack key */ + if (keyseg->length >= 127) + { + diff_flag=2; + pack_marker=32768; + } + else + { + diff_flag= 1; + pack_marker=128; + } + s_temp->pack_marker=pack_marker; + + /* Handle the case that the first part have NULL values */ + if (keyseg->flag & HA_NULL_PART) + { 
+ if (!*key++) + { + s_temp->key= key; + s_temp->key_length= 0; + s_temp->totlength= key_length-1+diff_flag; + s_temp->next_key_pos= 0; /* No next key */ + return (s_temp->totlength); + } + s_temp->store_not_null=1; + key_length--; /* We don't store NULL */ + if (prev_key && !*prev_key++) + org_key=prev_key=0; /* Can't pack against prev */ + else if (org_key) + org_key++; /* Skip NULL */ + } + else + s_temp->store_not_null=0; + s_temp->prev_key= org_key; + + /* The key part will start with a packed length */ + + get_key_pack_length(new_key_length,length_pack,key); + end= key_end= key+ new_key_length; + start= key; + + /* Calc how many characters are identical between this and the prev. key */ + if (prev_key) + { + get_key_length(org_key_length,prev_key); + s_temp->prev_key=prev_key; /* Pointer at data */ + /* Don't use key-pack if length == 0 */ + if (new_key_length && new_key_length == org_key_length) + same_length=1; + else if (new_key_length > org_key_length) + end= key + org_key_length; + + if (sort_order) /* SerG */ + { + while (key < end && + sort_order[* (uchar*) key] == sort_order[* (uchar*) prev_key]) + { + key++; prev_key++; + } + } + else + { + while (key < end && *key == *prev_key) + { + key++; prev_key++; + } + } + } + + s_temp->key=key; + s_temp->key_length= (uint) (key_end-key); + + if (same_length && key == key_end) + { + /* identical variable length key */ + s_temp->ref_length= pack_marker; + length=(int) key_length-(int) (key_end-start)-length_pack; + length+= diff_flag; + if (next_key) + { /* Can't combine with next */ + s_temp->n_length= *next_key; /* Needed by _ma_store_key */ + next_key=0; + } + } + else + { + if (start != key) + { /* Starts as prev key */ + ref_length= (uint) (key-start); + s_temp->ref_length= ref_length + pack_marker; + length= (int) (key_length - ref_length); + + length-= length_pack; + length+= diff_flag; + length+= ((new_key_length-ref_length) >= 255) ? 
3 : 1;/* Rest_of_key */ + } + else + { + s_temp->key_length+=s_temp->store_not_null; /* If null */ + length= key_length - length_pack+ diff_flag; + } + } + s_temp->totlength=(uint) length; + s_temp->prev_length=0; + DBUG_PRINT("test",("tot_length: %u length: %d uniq_key_length: %u", + key_length, length, s_temp->key_length)); + + /* If something after that hasn't length=0, test if we can combine */ + if ((s_temp->next_key_pos=next_key)) + { + uint packed,n_length; + + packed = *next_key & 128; + if (diff_flag == 2) + { + n_length= mi_uint2korr(next_key) & 32767; /* Length of next key */ + next_key+=2; + } + else + n_length= *next_key++ & 127; + if (!packed) + n_length-= s_temp->store_not_null; + + if (n_length || packed) /* Don't pack 0 length keys */ + { + uint next_length_pack, new_ref_length=s_temp->ref_length; + + if (packed) + { + /* If first key and next key is packed (only on delete) */ + if (!prev_key && org_key) + { + get_key_length(org_key_length,org_key); + key=start; + if (sort_order) /* SerG */ + { + while (key < end && + sort_order[*(uchar*) key] == sort_order[*(uchar*) org_key]) + { + key++; org_key++; + } + } + else + { + while (key < end && *key == *org_key) + { + key++; org_key++; + } + } + if ((new_ref_length= (uint) (key - start))) + new_ref_length+=pack_marker; + } + + if (!n_length) + { + /* + We put a different key between two identical variable length keys + Extend next key to have same prefix as this key + */ + if (new_ref_length) /* prefix of previus key */ + { /* make next key longer */ + s_temp->part_of_prev_key= new_ref_length; + s_temp->prev_length= org_key_length - + (new_ref_length-pack_marker); + s_temp->n_ref_length= s_temp->part_of_prev_key; + s_temp->n_length= s_temp->prev_length; + n_length= get_pack_length(s_temp->prev_length); + s_temp->prev_key+= (new_ref_length - pack_marker); + length+= s_temp->prev_length + n_length; + } + else + { /* Can't use prev key */ + s_temp->part_of_prev_key=0; + s_temp->prev_length= 
org_key_length; + s_temp->n_ref_length=s_temp->n_length= org_key_length; + length+= org_key_length; + } + return (int) length; + } + + ref_length=n_length; + /* Get information about not packed key suffix */ + get_key_pack_length(n_length,next_length_pack,next_key); + + /* Test if new keys has fewer characters that match the previous key */ + if (!new_ref_length) + { /* Can't use prev key */ + s_temp->part_of_prev_key= 0; + s_temp->prev_length= ref_length; + s_temp->n_ref_length= s_temp->n_length= n_length+ref_length; + return (int) length+ref_length-next_length_pack; + } + if (ref_length+pack_marker > new_ref_length) + { + uint new_pack_length=new_ref_length-pack_marker; + /* We must copy characters from the original key to the next key */ + s_temp->part_of_prev_key= new_ref_length; + s_temp->prev_length= ref_length - new_pack_length; + s_temp->n_ref_length=s_temp->n_length=n_length + s_temp->prev_length; + s_temp->prev_key+= new_pack_length; + length-= (next_length_pack - get_pack_length(s_temp->n_length)); + return (int) length + s_temp->prev_length; + } + } + else + { + /* Next key wasn't a prefix of previous key */ + ref_length=0; + next_length_pack=0; + } + DBUG_PRINT("test",("length: %d next_key: 0x%lx", length, + (long) next_key)); + + { + uint tmp_length; + key=(start+=ref_length); + if (key+n_length < key_end) /* Normalize length based */ + key_end= key+n_length; + if (sort_order) /* SerG */ + { + while (key < key_end && + sort_order[*(uchar*) key] == sort_order[*(uchar*) next_key]) + { + key++; next_key++; + } + } + else + { + while (key < key_end && *key == *next_key) + { + key++; next_key++; + } + } + if (!(tmp_length=(uint) (key-start))) + { /* Key can't be re-packed */ + s_temp->next_key_pos=0; + return length; + } + ref_length+=tmp_length; + n_length-=tmp_length; + length-=tmp_length+next_length_pack; /* We gained these chars */ + } + if (n_length == 0 && ref_length == new_key_length) + { + s_temp->n_ref_length=pack_marker; /* Same as prev key */ + 
} + else + { + s_temp->n_ref_length=ref_length | pack_marker; + length+= get_pack_length(n_length); + s_temp->n_length=n_length; + } + } + } + return length; +} + + +/* Length of key which is prefix compressed */ + +int _ma_calc_bin_pack_key_length(MARIA_KEYDEF *keyinfo, uint nod_flag, + uchar *next_key, + uchar *org_key, uchar *prev_key, + const uchar *key, + MARIA_KEY_PARAM *s_temp) +{ + uint length,key_length,ref_length; + + s_temp->totlength=key_length= _ma_keylength(keyinfo,key)+nod_flag; +#ifdef HAVE_purify + s_temp->n_length= s_temp->n_ref_length=0; /* For valgrind */ +#endif + s_temp->key=key; + s_temp->prev_key=org_key; + if (prev_key) /* If not first key in block */ + { + /* pack key against previous key */ + /* + As keys may be identical when running a sort in maria_chk, we + have to guard against the case where keys may be identical + */ + const uchar *end; + end=key+key_length; + for ( ; *key == *prev_key && key < end; key++,prev_key++) ; + s_temp->ref_length= ref_length=(uint) (key-s_temp->key); + length=key_length - ref_length + get_pack_length(ref_length); + } + else + { + /* No previous key */ + s_temp->ref_length=ref_length=0; + length=key_length+1; + } + if ((s_temp->next_key_pos=next_key)) /* If another key after */ + { + /* pack key against next key */ + uint next_length,next_length_pack; + get_key_pack_length(next_length,next_length_pack,next_key); + + /* If first key and next key is packed (only on delete) */ + if (!prev_key && org_key && next_length) + { + const uchar *end; + for (key= s_temp->key, end=key+next_length ; + *key == *org_key && key < end; + key++,org_key++) ; + ref_length= (uint) (key - s_temp->key); + } + + if (next_length > ref_length) + { + /* We put a key with different case between two keys with the same prefix + Extend next key to have same prefix as + this key */ + s_temp->n_ref_length= ref_length; + s_temp->prev_length= next_length-ref_length; + s_temp->prev_key+= ref_length; + return (int) (length+ s_temp->prev_length 
- next_length_pack + + get_pack_length(ref_length)); + } + /* Check how many characters are identical to next key */ + key= s_temp->key+next_length; + while (*key++ == *next_key++) ; + if ((ref_length= (uint) (key - s_temp->key)-1) == next_length) + { + s_temp->next_key_pos=0; + return length; /* can't pack next key */ + } + s_temp->prev_length=0; + s_temp->n_ref_length=ref_length; + return (int) (length-(ref_length - next_length) - next_length_pack + + get_pack_length(ref_length)); + } + return (int) length; +} + + +/* +** store a key packed with _ma_calc_xxx_key_length in page-buffert +*/ + +/* store key without compression */ + +void _ma_store_static_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), + register uchar *key_pos, + register MARIA_KEY_PARAM *s_temp) +{ + memcpy((uchar*) key_pos,(uchar*) s_temp->key,(size_t) s_temp->totlength); +} + + +/* store variable length key with prefix compression */ + +#define store_pack_length(test,pos,length) { \ + if (test) { *((pos)++) = (uchar) (length); } else \ + { *((pos)++) = (uchar) ((length) >> 8); *((pos)++) = (uchar) (length); } } + + +void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), + register uchar *key_pos, + register MARIA_KEY_PARAM *s_temp) +{ + uint length; + uchar *start; + + start=key_pos; + + if (s_temp->ref_length) + { + /* Packed against previous key */ + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->ref_length); + /* If not same key after */ + if (s_temp->ref_length != s_temp->pack_marker) + store_key_length_inc(key_pos,s_temp->key_length); + } + else + { + /* Not packed against previous key */ + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->key_length); + } + bmove((uchar*) key_pos,(uchar*) s_temp->key, + (length=s_temp->totlength-(uint) (key_pos-start))); + + if (!s_temp->next_key_pos) /* No following key */ + return; + key_pos+=length; + + if (s_temp->prev_length) + { + /* Extend next key because new key didn't have same prefix as prev key */ 
+ if (s_temp->part_of_prev_key) + { + store_pack_length(s_temp->pack_marker == 128,key_pos, + s_temp->part_of_prev_key); + store_key_length_inc(key_pos,s_temp->n_length); + } + else + { + s_temp->n_length+= s_temp->store_not_null; + store_pack_length(s_temp->pack_marker == 128,key_pos, + s_temp->n_length); + } + memcpy(key_pos, s_temp->prev_key, s_temp->prev_length); + } + else if (s_temp->n_ref_length) + { + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_ref_length); + if (s_temp->n_ref_length == s_temp->pack_marker) + return; /* Identical key */ + store_key_length(key_pos,s_temp->n_length); + } + else + { + s_temp->n_length+= s_temp->store_not_null; + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_length); + } +} + + +/* variable length key with prefix compression */ + +void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), + register uchar *key_pos, + register MARIA_KEY_PARAM *s_temp) +{ + store_key_length_inc(key_pos,s_temp->ref_length); + memcpy((char*) key_pos,(char*) s_temp->key+s_temp->ref_length, + (size_t) s_temp->totlength-s_temp->ref_length); + + if (s_temp->next_key_pos) + { + key_pos+=(uint) (s_temp->totlength-s_temp->ref_length); + store_key_length_inc(key_pos,s_temp->n_ref_length); + if (s_temp->prev_length) /* If we must extend key */ + { + memcpy(key_pos,s_temp->prev_key,s_temp->prev_length); + } + } +} diff --git a/storage/maria/ma_sort.c b/storage/maria/ma_sort.c new file mode 100644 index 00000000000..2851a3a09dd --- /dev/null +++ b/storage/maria/ma_sort.c @@ -0,0 +1,1058 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Creates a index for a database by reading keys, sorting them and outputing + them in sorted order through MARIA_SORT_INFO functions. +*/ + +#include "ma_fulltext.h" +#if defined(MSDOS) || defined(__WIN__) +#include <fcntl.h> +#else +#include <stddef.h> +#endif +#include <queues.h> + +/* static variables */ + +#undef MIN_SORT_MEMORY +#undef MYF_RW +#undef DISK_BUFFER_SIZE + +#define MERGEBUFF 15 +#define MERGEBUFF2 31 +#define MIN_SORT_MEMORY (4096-MALLOC_OVERHEAD) +#define MYF_RW MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL) +#define DISK_BUFFER_SIZE (IO_SIZE*16) + + +/* + Pointers of functions for store and read keys from temp file +*/ + +extern void print_error _VARARGS((const char *fmt,...)); + +/* Functions defined in this file */ + +static ha_rows NEAR_F find_all_keys(MARIA_SORT_PARAM *info,uint keys, + uchar **sort_keys, + DYNAMIC_ARRAY *buffpek,int *maxbuffer, + IO_CACHE *tempfile, + IO_CACHE *tempfile_for_exceptions); +static int NEAR_F write_keys(MARIA_SORT_PARAM *info, uchar **sort_keys, + uint count, BUFFPEK *buffpek,IO_CACHE *tempfile); +static int NEAR_F write_key(MARIA_SORT_PARAM *info, uchar *key, + IO_CACHE *tempfile); +static int NEAR_F write_index(MARIA_SORT_PARAM *info, uchar **sort_keys, + uint count); +static int NEAR_F merge_many_buff(MARIA_SORT_PARAM *info,uint keys, + uchar **sort_keys, + BUFFPEK *buffpek,int *maxbuffer, + IO_CACHE *t_file); +static uint NEAR_F read_to_buffer(IO_CACHE *fromfile,BUFFPEK *buffpek, + uint sort_length); +static int NEAR_F merge_buffers(MARIA_SORT_PARAM *info,uint 
/*
  Creates a index of sorted keys

  SYNOPSIS
    _ma_create_index_by_sort()
    info		Sort parameters
    no_messages		Set to 1 if no output
    sortbuff_size	Size if sortbuffer to allocate

  NOTES
    Keys are read through find_all_keys(). If they all fitted in one
    buffer (maxbuffer == 0) they are written directly with write_index();
    otherwise the chunks in 'tempfile' are merged with merge_many_buff()
    and merge_index(). Keys that find_all_keys() put in
    'tempfile_for_exceptions' are inserted one by one with _ma_ck_write()
    at the end.

  RESULT
    0	ok
   <> 0 Error (-1 is returned for every failure)
*/

int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages,
                             ulong sortbuff_size)
{
  int error,maxbuffer,skr;
  uint memavl,old_memavl,keys,sort_length;
  DYNAMIC_ARRAY buffpek;
  ha_rows records;
  uchar **sort_keys;
  IO_CACHE tempfile, tempfile_for_exceptions;
  DBUG_ENTER("_ma_create_index_by_sort");
  DBUG_PRINT("enter",("sort_length: %d", info->key_length));

  /* Pick the fixed length or variable length temp file routines */
  if (info->keyinfo->flag & HA_VAR_LENGTH_KEY)
  {
    info->write_keys= write_keys_varlen;
    info->read_to_buffer=read_to_buffer_varlen;
    info->write_key=write_merge_key_varlen;
  }
  else
  {
    info->write_keys= write_keys;
    info->read_to_buffer=read_to_buffer;
    info->write_key=write_merge_key;
  }

  /* Put everything the err: label touches into a known state */
  my_b_clear(&tempfile);
  my_b_clear(&tempfile_for_exceptions);
  bzero((char*) &buffpek,sizeof(buffpek));
  sort_keys= (uchar **) NULL; error= 1;
  maxbuffer=1;

  memavl=max(sortbuff_size,MIN_SORT_MEMORY);
  records= info->sort_info->max_records;
  sort_length= info->key_length;
  LINT_INIT(keys);

  /*
    Try to allocate the sort buffer; on allocation failure retry with 3/4
    of the memory until we get below MIN_SORT_MEMORY.
  */
  while (memavl >= MIN_SORT_MEMORY)
  {
    if ((records < UINT_MAX32) &&
        ((my_off_t) (records + 1) *
         (sort_length + sizeof(char*)) <= (my_off_t) memavl))
      keys= records+1;                          /* All keys fit in memory */
    else
      do
      {
        /*
          Repeat until the number of buffers needed (maxbuffer) and the
          number of keys per buffer (keys) agree with each other.
        */
        skr=maxbuffer;
        if (memavl < sizeof(BUFFPEK)*(uint) maxbuffer ||
            (keys=(memavl-sizeof(BUFFPEK)*(uint) maxbuffer)/
             (sort_length+sizeof(char*))) <= 1 ||
            keys < (uint) maxbuffer)
        {
          _ma_check_print_error(info->sort_info->param,
                                "maria_sort_buffer_size is too small");
          goto err;
        }
      }
      while ((maxbuffer= (int) (records/(keys-1)+1)) != skr);

    /* One pointer plus sort_length bytes per key, plus HA_FT_MAXBYTELEN */
    if ((sort_keys=(uchar**) my_malloc(keys*(sort_length+sizeof(char*))+
                                       HA_FT_MAXBYTELEN, MYF(0))))
    {
      if (my_init_dynamic_array(&buffpek, sizeof(BUFFPEK), maxbuffer,
                                maxbuffer/2))
      {
        my_free((uchar*) sort_keys,MYF(0));
        sort_keys= 0;
      }
      else
        break;
    }
    old_memavl=memavl;
    if ((memavl=memavl/4*3) < MIN_SORT_MEMORY && old_memavl > MIN_SORT_MEMORY)
      memavl=MIN_SORT_MEMORY;
  }
  if (memavl < MIN_SORT_MEMORY)
  {
    _ma_check_print_error(info->sort_info->param, "Maria sort buffer"
                          " too small"); /* purecov: tested */
    goto err; /* purecov: tested */
  }
  (*info->lock_in_memory)(info->sort_info->param);/* Everything is allocated */

  if (!no_messages)
    printf(" - Searching for keys, allocating buffer for %d keys\n",keys);

  if ((records=find_all_keys(info,keys,sort_keys,&buffpek,&maxbuffer,
                             &tempfile,&tempfile_for_exceptions))
      == HA_POS_ERROR)
    goto err; /* purecov: tested */
  if (maxbuffer == 0)
  {
    /* Nothing was written to the temp file; dump straight from memory */
    if (!no_messages)
      printf(" - Dumping %lu keys\n", (ulong) records);
    if (write_index(info,sort_keys, (uint) records))
      goto err; /* purecov: inspected */
  }
  else
  {
    /* The whole buffer, pointer area included, is now counted in keys */
    keys=(keys*(sort_length+sizeof(char*)))/sort_length;
    if (maxbuffer >= MERGEBUFF2)
    {
      if (!no_messages)
        printf(" - Merging %lu keys\n", (ulong) records); /* purecov: tested */
      if (merge_many_buff(info,keys,sort_keys,
                          dynamic_element(&buffpek,0,BUFFPEK *),&maxbuffer,&tempfile))
        goto err; /* purecov: inspected */
    }
    if (flush_io_cache(&tempfile) ||
        reinit_io_cache(&tempfile,READ_CACHE,0L,0,0))
      goto err; /* purecov: inspected */
    if (!no_messages)
      printf(" - Last merge and dumping keys\n"); /* purecov: tested */
    if (merge_index(info,keys,sort_keys,dynamic_element(&buffpek,0,BUFFPEK *),
                    maxbuffer,&tempfile))
      goto err; /* purecov: inspected */
  }

  if (flush_maria_ft_buf(info) || _ma_flush_pending_blocks(info))
    goto err;

  if (my_b_inited(&tempfile_for_exceptions))
  {
    MARIA_HA *idx=info->sort_info->info;
    uint keyno=info->key;
    uint key_length, ref_length=idx->s->rec_reflength;

    if (!no_messages)
      printf(" - Adding exceptions\n"); /* purecov: tested */
    if (flush_io_cache(&tempfile_for_exceptions) ||
        reinit_io_cache(&tempfile_for_exceptions,READ_CACHE,0L,0,0))
      goto err;

    /*
      Each record is a length followed by that many key bytes (see
      write_key()). The sort buffer is reused as read buffer.
      NOTE(review): key_length is not checked against the size of the
      sort buffer here - confirm that it cannot exceed it.
    */
    while (!my_b_read(&tempfile_for_exceptions,(uchar*)&key_length,
                      sizeof(key_length))
           && !my_b_read(&tempfile_for_exceptions,(uchar*)sort_keys,
                         (uint) key_length))
    {
      if (_ma_ck_write(idx,keyno,(uchar*) sort_keys,key_length-ref_length))
        goto err;
    }
  }

  error =0;

err:
  /* Common cleanup for both the success and the error path */
  if (sort_keys)
    my_free((uchar*) sort_keys,MYF(0));
  delete_dynamic(&buffpek);
  close_cached_file(&tempfile);
  close_cached_file(&tempfile_for_exceptions);

  DBUG_RETURN(error ? -1 : 0);
} /* _ma_create_index_by_sort */
file */ + +static ha_rows NEAR_F find_all_keys(MARIA_SORT_PARAM *info, uint keys, + uchar **sort_keys, DYNAMIC_ARRAY *buffpek, + int *maxbuffer, IO_CACHE *tempfile, + IO_CACHE *tempfile_for_exceptions) +{ + int error; + uint idx; + DBUG_ENTER("find_all_keys"); + + idx=error=0; + sort_keys[0]= (uchar*) (sort_keys+keys); + + while (!(error=(*info->key_read)(info,sort_keys[idx]))) + { + if (info->real_key_length > info->key_length) + { + if (write_key(info,sort_keys[idx],tempfile_for_exceptions)) + DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */ + continue; + } + + if (++idx == keys) + { + if (info->write_keys(info,sort_keys,idx-1, + (BUFFPEK *)alloc_dynamic(buffpek), + tempfile)) + DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */ + + sort_keys[0]=(uchar*) (sort_keys+keys); + memcpy(sort_keys[0],sort_keys[idx-1],(size_t) info->key_length); + idx=1; + } + sort_keys[idx]=sort_keys[idx-1]+info->key_length; + } + if (error > 0) + DBUG_RETURN(HA_POS_ERROR); /* Aborted by get_key */ /* purecov: inspected */ + if (buffpek->elements) + { + if (info->write_keys(info,sort_keys,idx,(BUFFPEK *)alloc_dynamic(buffpek), + tempfile)) + DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */ + *maxbuffer=buffpek->elements-1; + } + else + *maxbuffer=0; + + DBUG_RETURN((*maxbuffer)*(keys-1)+idx); +} /* find_all_keys */ + + +#ifdef THREAD +/* Search after all keys and place them in a temp. 
file */ + +pthread_handler_t _ma_thr_find_all_keys(void *arg) +{ + MARIA_SORT_PARAM *sort_param= (MARIA_SORT_PARAM*) arg; + int error; + uint memavl,old_memavl,keys,sort_length; + uint idx, maxbuffer; + uchar **sort_keys=0; + + LINT_INIT(keys); + + error=1; + + if (my_thread_init()) + goto err; + + { /* Add extra block since DBUG_ENTER declare variables */ + DBUG_ENTER("_ma_thr_find_all_keys"); + DBUG_PRINT("enter", ("master: %d", sort_param->master)); + if (sort_param->sort_info->got_error) + goto err; + + if (sort_param->keyinfo->flag & HA_VAR_LENGTH_KEY) + { + sort_param->write_keys= write_keys_varlen; + sort_param->read_to_buffer= read_to_buffer_varlen; + sort_param->write_key= write_merge_key_varlen; + } + else + { + sort_param->write_keys= write_keys; + sort_param->read_to_buffer= read_to_buffer; + sort_param->write_key= write_merge_key; + } + + my_b_clear(&sort_param->tempfile); + my_b_clear(&sort_param->tempfile_for_exceptions); + bzero((char*) &sort_param->buffpek,sizeof(sort_param->buffpek)); + bzero((char*) &sort_param->unique, sizeof(sort_param->unique)); + + memavl= max(sort_param->sortbuff_size, MIN_SORT_MEMORY); + idx= sort_param->sort_info->max_records; + sort_length= sort_param->key_length; + maxbuffer= 1; + + while (memavl >= MIN_SORT_MEMORY) + { + if ((my_off_t) (idx+1)*(sort_length+sizeof(char*)) <= + (my_off_t) memavl) + keys= idx+1; + else + { + uint skr; + do + { + skr= maxbuffer; + if (memavl < sizeof(BUFFPEK)*maxbuffer || + (keys=(memavl-sizeof(BUFFPEK)*maxbuffer)/ + (sort_length+sizeof(char*))) <= 1 || + keys < (uint) maxbuffer) + { + _ma_check_print_error(sort_param->sort_info->param, + "maria_sort_buffer_size is too small"); + goto err; + } + } + while ((maxbuffer= (int) (idx/(keys-1)+1)) != skr); + } + if ((sort_keys= (uchar **) + my_malloc(keys*(sort_length+sizeof(char*))+ + ((sort_param->keyinfo->flag & HA_FULLTEXT) ? 
+ HA_FT_MAXBYTELEN : 0), MYF(0)))) + { + if (my_init_dynamic_array(&sort_param->buffpek, sizeof(BUFFPEK), + maxbuffer, maxbuffer/2)) + { + my_free((uchar*) sort_keys,MYF(0)); + sort_keys= (uchar **) NULL; /* for err: label */ + } + else + break; + } + old_memavl= memavl; + if ((memavl= memavl/4*3) < MIN_SORT_MEMORY && + old_memavl > MIN_SORT_MEMORY) + memavl= MIN_SORT_MEMORY; + } + if (memavl < MIN_SORT_MEMORY) + { + _ma_check_print_error(sort_param->sort_info->param, + "Maria sort buffer too small"); + goto err; /* purecov: tested */ + } + + if (sort_param->sort_info->param->testflag & T_VERBOSE) + printf("Key %d - Allocating buffer for %d keys\n", + sort_param->key+1, keys); + sort_param->sort_keys= sort_keys; + + idx= error= 0; + sort_keys[0]= (uchar*) (sort_keys+keys); + + DBUG_PRINT("info", ("reading keys")); + while (!(error= sort_param->sort_info->got_error) && + !(error= (*sort_param->key_read)(sort_param, sort_keys[idx]))) + { + if (sort_param->real_key_length > sort_param->key_length) + { + if (write_key(sort_param,sort_keys[idx], + &sort_param->tempfile_for_exceptions)) + goto err; + continue; + } + + if (++idx == keys) + { + if (sort_param->write_keys(sort_param, sort_keys, idx - 1, + (BUFFPEK *)alloc_dynamic(&sort_param-> + buffpek), + &sort_param->tempfile)) + goto err; + sort_keys[0]= (uchar*) (sort_keys+keys); + memcpy(sort_keys[0], sort_keys[idx - 1], + (size_t) sort_param->key_length); + idx= 1; + } + sort_keys[idx]=sort_keys[idx - 1] + sort_param->key_length; + } + if (error > 0) + goto err; + if (sort_param->buffpek.elements) + { + if (sort_param->write_keys(sort_param,sort_keys, idx, + (BUFFPEK *) alloc_dynamic(&sort_param-> + buffpek), + &sort_param->tempfile)) + goto err; + sort_param->keys= (sort_param->buffpek.elements - 1) * (keys - 1) + idx; + } + else + sort_param->keys= idx; + + sort_param->sort_keys_length= keys; + goto ok; + +err: + DBUG_PRINT("error", ("got some error")); + sort_param->sort_info->got_error= 1; /* no need to protect 
with a mutex */ + my_free((uchar*) sort_keys,MYF(MY_ALLOW_ZERO_PTR)); + sort_param->sort_keys=0; + delete_dynamic(& sort_param->buffpek); + close_cached_file(&sort_param->tempfile); + close_cached_file(&sort_param->tempfile_for_exceptions); + +ok: + free_root(&sort_param->wordroot, MYF(0)); + /* + Detach from the share if the writer is involved. Avoid others to + be blocked. This includes a flush of the write buffer. This will + also indicate EOF to the readers. + */ + if (sort_param->sort_info->info->rec_cache.share) + remove_io_thread(&sort_param->sort_info->info->rec_cache); + + /* Readers detach from the share if any. Avoid others to be blocked. */ + if (sort_param->read_cache.share) + remove_io_thread(&sort_param->read_cache); + + pthread_mutex_lock(&sort_param->sort_info->mutex); + if (!--sort_param->sort_info->threads_running) + pthread_cond_signal(&sort_param->sort_info->cond); + pthread_mutex_unlock(&sort_param->sort_info->mutex); + DBUG_PRINT("exit", ("======== ending thread ========")); + } + my_thread_end(); + return NULL; +} + + +int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param) +{ + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; + ulong length, keys; + ulong *rec_per_key_part=param->rec_per_key_part; + int got_error=sort_info->got_error; + uint i; + MARIA_HA *info=sort_info->info; + MARIA_SHARE *share=info->s; + MARIA_SORT_PARAM *sinfo; + uchar *mergebuf=0; + DBUG_ENTER("_ma_thr_write_keys"); + LINT_INIT(length); + + for (i= 0, sinfo= sort_param ; + i < sort_info->total_keys ; + i++, rec_per_key_part+=sinfo->keyinfo->keysegs, sinfo++) + { + if (!sinfo->sort_keys) + { + got_error=1; + my_free(sinfo->rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + continue; + } + if (!got_error) + { + maria_set_key_active(share->state.key_map, sinfo->key); + + if (!sinfo->buffpek.elements) + { + if (param->testflag & T_VERBOSE) + { + printf("Key %d - Dumping %u keys\n",sinfo->key+1, sinfo->keys); + fflush(stdout); + } + if 
(write_index(sinfo, sinfo->sort_keys, sinfo->keys) || + flush_maria_ft_buf(sinfo) || _ma_flush_pending_blocks(sinfo)) + got_error=1; + } + if (!got_error && param->testflag & T_STATISTICS) + maria_update_key_parts(sinfo->keyinfo, rec_per_key_part, sinfo->unique, + param->stats_method == MI_STATS_METHOD_IGNORE_NULLS? + sinfo->notnull: NULL, + (ulonglong) info->state->records); + } + my_free((uchar*) sinfo->sort_keys,MYF(0)); + my_free(sinfo->rec_buff, MYF(MY_ALLOW_ZERO_PTR)); + sinfo->sort_keys=0; + } + + for (i= 0, sinfo= sort_param ; + i < sort_info->total_keys ; + i++, + delete_dynamic(&sinfo->buffpek), + close_cached_file(&sinfo->tempfile), + close_cached_file(&sinfo->tempfile_for_exceptions), + sinfo++) + { + if (got_error) + continue; + if (sinfo->keyinfo->flag & HA_VAR_LENGTH_KEY) + { + sinfo->write_keys=write_keys_varlen; + sinfo->read_to_buffer=read_to_buffer_varlen; + sinfo->write_key=write_merge_key_varlen; + } + else + { + sinfo->write_keys=write_keys; + sinfo->read_to_buffer=read_to_buffer; + sinfo->write_key=write_merge_key; + } + if (sinfo->buffpek.elements) + { + uint maxbuffer=sinfo->buffpek.elements-1; + if (!mergebuf) + { + length=param->sort_buffer_length; + while (length >= MIN_SORT_MEMORY && !mergebuf) + { + mergebuf=my_malloc(length, MYF(0)); + length=length*3/4; + } + if (!mergebuf) + { + got_error=1; + continue; + } + } + keys=length/sinfo->key_length; + if (maxbuffer >= MERGEBUFF2) + { + if (param->testflag & T_VERBOSE) + printf("Key %d - Merging %u keys\n",sinfo->key+1, sinfo->keys); + if (merge_many_buff(sinfo, keys, (uchar **) mergebuf, + dynamic_element(&sinfo->buffpek, 0, BUFFPEK *), + (int*) &maxbuffer, &sinfo->tempfile)) + { + got_error=1; + continue; + } + } + if (flush_io_cache(&sinfo->tempfile) || + reinit_io_cache(&sinfo->tempfile,READ_CACHE,0L,0,0)) + { + got_error=1; + continue; + } + if (param->testflag & T_VERBOSE) + printf("Key %d - Last merge and dumping keys\n", sinfo->key+1); + if (merge_index(sinfo, keys, (uchar**) 
mergebuf, + dynamic_element(&sinfo->buffpek,0,BUFFPEK *), + maxbuffer,&sinfo->tempfile) || + flush_maria_ft_buf(sinfo) || + _ma_flush_pending_blocks(sinfo)) + { + got_error=1; + continue; + } + } + if (my_b_inited(&sinfo->tempfile_for_exceptions)) + { + uint key_length; + + if (param->testflag & T_VERBOSE) + printf("Key %d - Dumping 'long' keys\n", sinfo->key+1); + + if (flush_io_cache(&sinfo->tempfile_for_exceptions) || + reinit_io_cache(&sinfo->tempfile_for_exceptions,READ_CACHE,0L,0,0)) + { + got_error=1; + continue; + } + + while (!got_error && + !my_b_read(&sinfo->tempfile_for_exceptions,(uchar*)&key_length, + sizeof(key_length))) + { + uchar maria_ft_buf[HA_FT_MAXBYTELEN + HA_FT_WLEN + 10]; + if (key_length > sizeof(maria_ft_buf) || + my_b_read(&sinfo->tempfile_for_exceptions, (uchar*)maria_ft_buf, + (uint)key_length) || + _ma_ck_write(info, sinfo->key, maria_ft_buf, + key_length - info->s->rec_reflength)) + got_error=1; + } + } + } + my_free((uchar*) mergebuf,MYF(MY_ALLOW_ZERO_PTR)); + DBUG_RETURN(got_error); +} +#endif /* THREAD */ + +/* Write all keys in memory to file for later merge */ + +static int write_keys(MARIA_SORT_PARAM *info, register uchar **sort_keys, + uint count, BUFFPEK *buffpek, IO_CACHE *tempfile) +{ + uchar **end; + uint sort_length=info->key_length; + DBUG_ENTER("write_keys"); + + qsort2((uchar*) sort_keys,count,sizeof(uchar*),(qsort2_cmp) info->key_cmp, + info); + if (!my_b_inited(tempfile) && + open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST", + DISK_BUFFER_SIZE, info->sort_info->param->myf_rw)) + DBUG_RETURN(1); /* purecov: inspected */ + + buffpek->file_pos=my_b_tell(tempfile); + buffpek->count=count; + + for (end=sort_keys+count ; sort_keys != end ; sort_keys++) + { + if (my_b_write(tempfile, *sort_keys, (uint) sort_length)) + DBUG_RETURN(1); /* purecov: inspected */ + } + DBUG_RETURN(0); +} /* write_keys */ + + +static inline int +my_var_write(MARIA_SORT_PARAM *info, IO_CACHE *to_file, uchar *bufs) +{ + int err; + uint16 
len= _ma_keylength(info->keyinfo, bufs); + + /* The following is safe as this is a local file */ + if ((err= my_b_write(to_file, (uchar*)&len, sizeof(len)))) + return (err); + if ((err= my_b_write(to_file,bufs, (uint) len))) + return (err); + return (0); +} + + +static int NEAR_F write_keys_varlen(MARIA_SORT_PARAM *info, + register uchar **sort_keys, + uint count, BUFFPEK *buffpek, + IO_CACHE *tempfile) +{ + uchar **end; + int err; + DBUG_ENTER("write_keys_varlen"); + + qsort2((uchar*) sort_keys,count,sizeof(uchar*),(qsort2_cmp) info->key_cmp, + info); + if (!my_b_inited(tempfile) && + open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST", + DISK_BUFFER_SIZE, info->sort_info->param->myf_rw)) + DBUG_RETURN(1); /* purecov: inspected */ + + buffpek->file_pos=my_b_tell(tempfile); + buffpek->count=count; + for (end=sort_keys+count ; sort_keys != end ; sort_keys++) + { + if ((err= my_var_write(info,tempfile, *sort_keys))) + DBUG_RETURN(err); + } + DBUG_RETURN(0); +} /* write_keys_varlen */ + + +static int NEAR_F write_key(MARIA_SORT_PARAM *info, uchar *key, + IO_CACHE *tempfile) +{ + uint key_length=info->real_key_length; + DBUG_ENTER("write_key"); + + if (!my_b_inited(tempfile) && + open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST", + DISK_BUFFER_SIZE, info->sort_info->param->myf_rw)) + DBUG_RETURN(1); + + if (my_b_write(tempfile, (uchar*)&key_length,sizeof(key_length)) || + my_b_write(tempfile, key, (uint) key_length)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} /* write_key */ + + +/* Write index */ + +static int NEAR_F write_index(MARIA_SORT_PARAM *info, + register uchar **sort_keys, + register uint count) +{ + DBUG_ENTER("write_index"); + + qsort2((uchar*) sort_keys,(size_t) count,sizeof(uchar*), + (qsort2_cmp) info->key_cmp,info); + while (count--) + { + if ((*info->key_write)(info, *sort_keys++)) + DBUG_RETURN(-1); /* purecov: inspected */ + } + DBUG_RETURN(0); +} /* write_index */ + + + /* Merge buffers to make < MERGEBUFF2 buffers */ + +static int NEAR_F 
merge_many_buff(MARIA_SORT_PARAM *info, uint keys, + uchar **sort_keys, BUFFPEK *buffpek, + int *maxbuffer, IO_CACHE *t_file) +{ + register int i; + IO_CACHE t_file2, *from_file, *to_file, *temp; + BUFFPEK *lastbuff; + DBUG_ENTER("merge_many_buff"); + + if (*maxbuffer < MERGEBUFF2) + DBUG_RETURN(0); /* purecov: inspected */ + if (flush_io_cache(t_file) || + open_cached_file(&t_file2,my_tmpdir(info->tmpdir),"ST", + DISK_BUFFER_SIZE, info->sort_info->param->myf_rw)) + DBUG_RETURN(1); /* purecov: inspected */ + + from_file= t_file ; to_file= &t_file2; + while (*maxbuffer >= MERGEBUFF2) + { + reinit_io_cache(from_file,READ_CACHE,0L,0,0); + reinit_io_cache(to_file,WRITE_CACHE,0L,0,0); + lastbuff=buffpek; + for (i=0 ; i <= *maxbuffer-MERGEBUFF*3/2 ; i+=MERGEBUFF) + { + if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++, + buffpek+i,buffpek+i+MERGEBUFF-1)) + goto cleanup; + } + if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++, + buffpek+i,buffpek+ *maxbuffer)) + break; /* purecov: inspected */ + if (flush_io_cache(to_file)) + break; /* purecov: inspected */ + temp=from_file; from_file=to_file; to_file=temp; + *maxbuffer= (int) (lastbuff-buffpek)-1; + } +cleanup: + close_cached_file(to_file); /* This holds old result */ + if (to_file == t_file) + *t_file=t_file2; /* Copy result file */ + + DBUG_RETURN(*maxbuffer >= MERGEBUFF2); /* Return 1 if interrupted */ +} /* merge_many_buff */ + + +/* + Read data to buffer + + SYNOPSIS + read_to_buffer() + fromfile File to read from + buffpek Where to read from + sort_length max length to read + RESULT + > 0 Ammount of bytes read + -1 Error +*/ + +static uint NEAR_F read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek, + uint sort_length) +{ + register uint count; + uint length; + + if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count))) + { + if (my_pread(fromfile->file,(uchar*) buffpek->base, + (length= sort_length*count),buffpek->file_pos,MYF_RW)) + return((uint) -1); /* purecov: 
inspected */ + buffpek->key=buffpek->base; + buffpek->file_pos+= length; /* New filepos */ + buffpek->count-= count; + buffpek->mem_count= count; + } + return (count*sort_length); +} /* read_to_buffer */ + +static uint NEAR_F read_to_buffer_varlen(IO_CACHE *fromfile, BUFFPEK *buffpek, + uint sort_length) +{ + register uint count; + uint16 length_of_key = 0; + uint idx; + uchar *buffp; + + if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count))) + { + buffp= buffpek->base; + + for (idx=1;idx<=count;idx++) + { + if (my_pread(fromfile->file,(uchar*)&length_of_key,sizeof(length_of_key), + buffpek->file_pos,MYF_RW)) + return((uint) -1); + buffpek->file_pos+=sizeof(length_of_key); + if (my_pread(fromfile->file,(uchar*) buffp,length_of_key, + buffpek->file_pos,MYF_RW)) + return((uint) -1); + buffpek->file_pos+=length_of_key; + buffp = buffp + sort_length; + } + buffpek->key=buffpek->base; + buffpek->count-= count; + buffpek->mem_count= count; + } + return (count*sort_length); +} /* read_to_buffer_varlen */ + + +static int NEAR_F write_merge_key_varlen(MARIA_SORT_PARAM *info, + IO_CACHE *to_file,char* key, + uint sort_length, uint count) +{ + uint idx; + + char *bufs = key; + for (idx=1;idx<=count;idx++) + { + int err; + if ((err= my_var_write(info,to_file, (uchar*) bufs))) + return (err); + bufs=bufs+sort_length; + } + return(0); +} + + +static int NEAR_F write_merge_key(MARIA_SORT_PARAM *info __attribute__((unused)), + IO_CACHE *to_file, char* key, + uint sort_length, uint count) +{ + return my_b_write(to_file,(uchar*) key,(uint) sort_length*count); +} + +/* + Merge buffers to one buffer + If to_file == 0 then use info->key_write +*/ + +static int NEAR_F +merge_buffers(MARIA_SORT_PARAM *info, uint keys, IO_CACHE *from_file, + IO_CACHE *to_file, uchar **sort_keys, BUFFPEK *lastbuff, + BUFFPEK *Fb, BUFFPEK *Tb) +{ + int error; + uint sort_length,maxcount; + ha_rows count; + my_off_t to_start_filepos; + uchar *strpos; + BUFFPEK *buffpek,**refpek; + QUEUE queue; + 
volatile int *killed= _ma_killed_ptr(info->sort_info->param); + DBUG_ENTER("merge_buffers"); + + count=error=0; + maxcount=keys/((uint) (Tb-Fb) +1); + LINT_INIT(to_start_filepos); + if (to_file) + to_start_filepos=my_b_tell(to_file); + strpos= (uchar*) sort_keys; + sort_length=info->key_length; + + if (init_queue(&queue,(uint) (Tb-Fb)+1,offsetof(BUFFPEK,key),0, + (int (*)(void*, uchar *,uchar*)) info->key_cmp, + (void*) info)) + DBUG_RETURN(1); /* purecov: inspected */ + + for (buffpek= Fb ; buffpek <= Tb ; buffpek++) + { + count+= buffpek->count; + buffpek->base= strpos; + buffpek->max_keys=maxcount; + strpos+= (uint) (error=(int) info->read_to_buffer(from_file,buffpek, + sort_length)); + if (error == -1) + goto err; /* purecov: inspected */ + queue_insert(&queue,(char*) buffpek); + } + + while (queue.elements > 1) + { + for (;;) + { + if (*killed) + { + error=1; goto err; + } + buffpek=(BUFFPEK*) queue_top(&queue); + if (to_file) + { + if (info->write_key(info,to_file,(uchar*) buffpek->key, + (uint) sort_length,1)) + { + error=1; goto err; /* purecov: inspected */ + } + } + else + { + if ((*info->key_write)(info,(void*) buffpek->key)) + { + error=1; goto err; /* purecov: inspected */ + } + } + buffpek->key+=sort_length; + if (! 
--buffpek->mem_count) + { + if (!(error=(int) info->read_to_buffer(from_file,buffpek,sort_length))) + { + uchar *base= buffpek->base; + uint max_keys=buffpek->max_keys; + + VOID(queue_remove(&queue,0)); + + /* Put room used by buffer to use in other buffer */ + for (refpek= (BUFFPEK**) &queue_top(&queue); + refpek <= (BUFFPEK**) &queue_end(&queue); + refpek++) + { + buffpek= *refpek; + if (buffpek->base+buffpek->max_keys*sort_length == base) + { + buffpek->max_keys+=max_keys; + break; + } + else if (base+max_keys*sort_length == buffpek->base) + { + buffpek->base=base; + buffpek->max_keys+=max_keys; + break; + } + } + break; /* One buffer have been removed */ + } + } + else if (error == -1) + goto err; /* purecov: inspected */ + queue_replaced(&queue); /* Top element has been replaced */ + } + } + buffpek=(BUFFPEK*) queue_top(&queue); + buffpek->base= (uchar*) sort_keys; + buffpek->max_keys=keys; + do + { + if (to_file) + { + if (info->write_key(info,to_file,(uchar*) buffpek->key, + sort_length,buffpek->mem_count)) + { + error=1; goto err; /* purecov: inspected */ + } + } + else + { + register uchar *end; + strpos= buffpek->key; + for (end= strpos+buffpek->mem_count*sort_length; + strpos != end ; + strpos+=sort_length) + { + if ((*info->key_write)(info, (uchar*) strpos)) + { + error=1; goto err; /* purecov: inspected */ + } + } + } + } + while ((error=(int) info->read_to_buffer(from_file,buffpek,sort_length)) != + -1 && error != 0); + + lastbuff->count=count; + if (to_file) + lastbuff->file_pos=to_start_filepos; +err: + delete_queue(&queue); + DBUG_RETURN(error); +} /* merge_buffers */ + + + /* Do a merge to output-file (save only positions) */ + +static int NEAR_F +merge_index(MARIA_SORT_PARAM *info, uint keys, uchar **sort_keys, + BUFFPEK *buffpek, int maxbuffer, IO_CACHE *tempfile) +{ + DBUG_ENTER("merge_index"); + if (merge_buffers(info,keys,tempfile,(IO_CACHE*) 0,sort_keys,buffpek,buffpek, + buffpek+maxbuffer)) + DBUG_RETURN(1); /* purecov: inspected */ + 
DBUG_RETURN(0); +} /* merge_index */ + + +static int flush_maria_ft_buf(MARIA_SORT_PARAM *info) +{ + int err=0; + if (info->sort_info->ft_buf) + { + err=_ma_sort_ft_buf_flush(info); + my_free((uchar*)info->sort_info->ft_buf, MYF(0)); + info->sort_info->ft_buf=0; + } + return err; +} + diff --git a/storage/maria/ma_sp_defs.h b/storage/maria/ma_sp_defs.h new file mode 100644 index 00000000000..a70695bea3a --- /dev/null +++ b/storage/maria/ma_sp_defs.h @@ -0,0 +1,47 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _SP_DEFS_H +#define _SP_DEFS_H + +#define SPDIMS 2 +#define SPTYPE HA_KEYTYPE_DOUBLE +#define SPLEN 8 + +#ifdef HAVE_SPATIAL + +enum wkbType +{ + wkbPoint = 1, + wkbLineString = 2, + wkbPolygon = 3, + wkbMultiPoint = 4, + wkbMultiLineString = 5, + wkbMultiPolygon = 6, + wkbGeometryCollection = 7 +}; + +enum wkbByteOrder +{ + wkbXDR = 0, /* Big Endian */ + wkbNDR = 1 /* Little Endian */ +}; + +uint _ma_sp_make_key(register MARIA_HA *info, uint keynr, uchar *key, + const uchar *record, my_off_t filepos); + +#endif /*HAVE_SPATIAL*/ +#endif /* _SP_DEFS_H */ diff --git a/storage/maria/ma_sp_key.c b/storage/maria/ma_sp_key.c new file mode 100644 index 00000000000..1ea9b410ab6 --- /dev/null +++ b/storage/maria/ma_sp_key.c @@ -0,0 +1,299 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" + +#ifdef HAVE_SPATIAL + +#include "ma_sp_defs.h" + +static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims, + double *mbr, int top); +static int sp_mbr_from_wkb(uchar (*wkb), uint size, uint n_dims, double *mbr); + +static void get_double(double *d, const uchar *pos) +{ + float8get(*d, pos); +} + +uint _ma_sp_make_key(register MARIA_HA *info, uint keynr, uchar *key, + const uchar *record, my_off_t filepos) +{ + HA_KEYSEG *keyseg; + MARIA_KEYDEF *keyinfo = &info->s->keyinfo[keynr]; + uint len = 0; + uchar *pos; + uint dlen; + uchar *dptr; + double mbr[SPDIMS * 2]; + uint i; + + keyseg = &keyinfo->seg[-1]; + pos = (uchar*)record + keyseg->start; + + dlen = _ma_calc_blob_length(keyseg->bit_start, pos); + memcpy_fixed(&dptr, pos + keyseg->bit_start, sizeof(char*)); + if (!dptr) + { + my_errno= HA_ERR_NULL_IN_SPATIAL; + return 0; + } + sp_mbr_from_wkb(dptr + 4, dlen - 4, SPDIMS, mbr); /* SRID */ + + for (i = 0, keyseg = keyinfo->seg; keyseg->type; keyseg++, i++) + { + uint length = keyseg->length; + + pos = ((uchar*)mbr) + keyseg->start; + if (keyseg->flag & HA_SWAP_KEY) + { +#ifdef HAVE_ISNAN + if (keyseg->type == HA_KEYTYPE_FLOAT) + { + float nr; + float4get(nr, pos); + if (isnan(nr)) + { + /* Replace NAN with zero */ + bzero(key, length); + key+= length; + continue; + } + } + else if (keyseg->type == 
HA_KEYTYPE_DOUBLE) + { + double nr; + get_double(&nr, pos); + if (isnan(nr)) + { + bzero(key, length); + key+= length; + continue; + } + } +#endif + pos += length; + while (length--) + { + *key++ = *--pos; + } + } + else + { + memcpy((uchar*)key, pos, length); + key += keyseg->length; + } + len += keyseg->length; + } + _ma_dpointer(info, key, filepos); + return len; +} + +/* +Calculate minimal bounding rectangle (mbr) of the spatial object +stored in "well-known binary representation" (wkb) format. +*/ +static int sp_mbr_from_wkb(uchar *wkb, uint size, uint n_dims, double *mbr) +{ + uint i; + + for (i=0; i < n_dims; ++i) + { + mbr[i * 2] = DBL_MAX; + mbr[i * 2 + 1] = -DBL_MAX; + } + + return sp_get_geometry_mbr(&wkb, wkb + size, n_dims, mbr, 1); +} + +/* + Add one point stored in wkb to mbr +*/ + +static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order __attribute__((unused)), + double *mbr) +{ + double ord; + double *mbr_end= mbr + n_dims * 2; + + while (mbr < mbr_end) + { + if ((*wkb) > end - 8) + return -1; + get_double(&ord, (const uchar*) *wkb); + (*wkb)+= 8; + if (ord < *mbr) + float8store((char*) mbr, ord); + mbr++; + if (ord > *mbr) + float8store((char*) mbr, ord); + mbr++; + } + return 0; +} + + +static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr) +{ + return sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr); +} + + +static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr) +{ + uint n_points; + + n_points = uint4korr(*wkb); + (*wkb) += 4; + for (; n_points > 0; --n_points) + { + /* Add next point to mbr */ + if (sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + return 0; +} + + +static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr) +{ + uint n_linear_rings; + uint n_points; + + n_linear_rings = uint4korr((*wkb)); + (*wkb) += 4; + + for (; 
n_linear_rings > 0; --n_linear_rings) + { + n_points = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_points > 0; --n_points) + { + /* Add next point to mbr */ + if (sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + } + return 0; +} + +static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims, + double *mbr, int top) +{ + int res; + uchar byte_order; + uint wkb_type; + + byte_order = *(*wkb); + ++(*wkb); + + wkb_type = uint4korr((*wkb)); + (*wkb) += 4; + + switch ((enum wkbType) wkb_type) + { + case wkbPoint: + res = sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr); + break; + case wkbLineString: + res = sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr); + break; + case wkbPolygon: + res = sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr); + break; + case wkbMultiPoint: + { + uint n_items; + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + byte_order = *(*wkb); + ++(*wkb); + (*wkb) += 4; + if (sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + res = 0; + break; + } + case wkbMultiLineString: + { + uint n_items; + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + byte_order = *(*wkb); + ++(*wkb); + (*wkb) += 4; + if (sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + res = 0; + break; + } + case wkbMultiPolygon: + { + uint n_items; + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + byte_order = *(*wkb); + ++(*wkb); + (*wkb) += 4; + if (sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + res = 0; + break; + } + case wkbGeometryCollection: + { + uint n_items; + + if (!top) + return -1; + + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + if (sp_get_geometry_mbr(wkb, end, n_dims, mbr, 0)) + return -1; + } + res = 0; + break; + } + default: + res = -1; + } + return res; +} + +#endif /*HAVE_SPATIAL*/ diff --git 
a/storage/maria/ma_sp_test.c b/storage/maria/ma_sp_test.c new file mode 100644 index 00000000000..7a413f68135 --- /dev/null +++ b/storage/maria/ma_sp_test.c @@ -0,0 +1,568 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Testing of the basic functions of a MARIA spatial table */ +/* Written by Alex Barkov, who has a shared copyright to this code */ + +#include "maria.h" + +#ifdef HAVE_SPATIAL +#include "ma_sp_defs.h" + +#define MAX_REC_LENGTH 1024 +#define KEYALG HA_KEY_ALG_RTREE + +static void create_linestring(char *record,uint rownr); +static void print_record(char * record,my_off_t offs,const char * tail); + +static void create_key(char *key,uint rownr); +static void print_key(const char *key,const char * tail); + +static int run_test(const char *filename); +static int read_with_pos(MARIA_HA * file, int silent); + +static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points, + uchar *wkb); +static void maria_rtree_PrintWKB(uchar *wkb, uint n_dims); + +static char blob_key[MAX_REC_LENGTH]; + + +int main(int argc __attribute__((unused)),char *argv[]) +{ + MY_INIT(argv[0]); + maria_init(); + exit(run_test("sp_test")); +} + + +int run_test(const char *filename) +{ + MARIA_HA *file; + MARIA_UNIQUEDEF uniquedef; + MARIA_CREATE_INFO create_info; + MARIA_COLUMNDEF 
recinfo[20]; + MARIA_KEYDEF keyinfo[20]; + HA_KEYSEG keyseg[20]; + key_range min_range, max_range; + int silent=0; + int create_flag=0; + int null_fields=0; + int nrecords=30; + int uniques=0; + int i; + int error; + int row_count=0; + char record[MAX_REC_LENGTH]; + char key[MAX_REC_LENGTH]; + char read_record[MAX_REC_LENGTH]; + int upd=10; + ha_rows hrows; + + /* Define a column for NULLs and DEL markers*/ + + recinfo[0].type=FIELD_NORMAL; + recinfo[0].length=1; /* For NULL bits */ + + + /* Define spatial column */ + + recinfo[1].type=FIELD_BLOB; + recinfo[1].length=4 + portable_sizeof_char_ptr; + + + + /* Define a key with 1 spatial segment */ + + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].flag=HA_SPATIAL; + keyinfo[0].key_alg=KEYALG; + + keyinfo[0].seg[0].type= HA_KEYTYPE_BINARY; + keyinfo[0].seg[0].flag=0; + keyinfo[0].seg[0].start= 1; + keyinfo[0].seg[0].length=1; /* Spatial ignores it anyway */ + keyinfo[0].seg[0].null_bit= null_fields ? 2 : 0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].language=default_charset_info->number; + keyinfo[0].seg[0].bit_start=4; /* Long BLOB */ + + + if (!silent) + printf("- Creating isam-file\n"); + + bzero((char*) &create_info,sizeof(create_info)); + create_info.max_rows=10000000; + + if (maria_create(filename, + DYNAMIC_RECORD, + 1, /* keys */ + keyinfo, + 2, /* columns */ + recinfo,uniques,&uniquedef,&create_info,create_flag)) + goto err; + + if (!silent) + printf("- Open isam-file\n"); + + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED))) + goto err; + + if (!silent) + printf("- Writing key:s\n"); + + for (i=0; i<nrecords; i++ ) + { + create_linestring(record,i); + error=maria_write(file,record); + print_record(record,maria_position(file),"\n"); + if (!error) + { + row_count++; + } + else + { + printf("maria_write: %d\n", error); + goto err; + } + } + + if ((error=read_with_pos(file,silent))) + goto err; + + if (!silent) + printf("- Deleting rows with position\n"); + for (i=0; i < 
nrecords/4; i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR); + if (error) + { + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + print_record(read_record,maria_position(file),"\n"); + error=maria_delete(file,read_record); + if (error) + { + printf("pos: %2d maria_delete: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + } + + if (!silent) + printf("- Updating rows with position\n"); + for (i=0; i < nrecords/2 ; i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR); + if (error) + { + if (error==HA_ERR_RECORD_DELETED) + continue; + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + print_record(read_record,maria_position(file),""); + create_linestring(record,i+nrecords*upd); + printf("\t-> "); + print_record(record,maria_position(file),"\n"); + error=maria_update(file,read_record,record); + if (error) + { + printf("pos: %2d maria_update: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + } + + if ((error=read_with_pos(file,silent))) + goto err; + + if (!silent) + printf("- Test maria_rkey then a sequence of maria_rnext_same\n"); + + create_key(key, nrecords*4/5); + print_key(key," search for INTERSECT\n"); + + if ((error=maria_rkey(file,read_record,0,key,0,HA_READ_MBR_INTERSECT))) + { + printf("maria_rkey: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rkey\n"); + row_count=1; + + for (;;) + { + if ((error=maria_rnext_same(file,read_record))) + { + if (error==HA_ERR_END_OF_FILE) + break; + printf("maria_next: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rnext_same\n"); + row_count++; + } + printf(" %d rows\n",row_count); + + if (!silent) + printf("- Test maria_rfirst then a sequence of maria_rnext\n"); + + 
error=maria_rfirst(file,read_record,0); + if (error) + { + printf("maria_rfirst: %3d errno: %3d\n",error,my_errno); + goto err; + } + row_count=1; + print_record(read_record,maria_position(file)," maria_frirst\n"); + + for(i=0;i<nrecords;i++) { + if ((error=maria_rnext(file,read_record,0))) + { + if (error==HA_ERR_END_OF_FILE) + break; + printf("maria_next: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rnext\n"); + row_count++; + } + printf(" %d rows\n",row_count); + + if (!silent) + printf("- Test maria_records_in_range()\n"); + + create_key(key, nrecords*upd); + print_key(key," INTERSECT\n"); + min_range.key= key; + min_range.length= 1000; /* Big enough */ + min_range.flag= HA_READ_MBR_INTERSECT; + max_range.key= record+1; + max_range.length= 1000; /* Big enough */ + max_range.flag= HA_READ_KEY_EXACT; + hrows= maria_records_in_range(file,0, &min_range, &max_range); + printf(" %ld rows\n", (long) hrows); + + if (maria_close(file)) goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return 0; + +err: + printf("got error: %3d when using maria-database\n",my_errno); + maria_end(); + return 1; /* skip warning */ +} + + +static int read_with_pos (MARIA_HA * file,int silent) +{ + int error; + int i; + char read_record[MAX_REC_LENGTH]; + int rows=0; + + if (!silent) + printf("- Reading rows with position\n"); + for (i=0;;i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 
0L : HA_OFFSET_ERROR); + if (error) + { + if (error==HA_ERR_END_OF_FILE) + break; + if (error==HA_ERR_RECORD_DELETED) + continue; + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + return error; + } + rows++; + print_record(read_record,maria_position(file),"\n"); + } + printf(" %d rows\n",rows); + return 0; +} + + +#ifdef NOT_USED +static void bprint_record(char * record, + my_off_t offs __attribute__((unused)), + const char * tail) +{ + int i; + char * pos; + i=(unsigned char)record[0]; + printf("%02X ",i); + + for( pos=record+1, i=0; i<32; i++,pos++) + { + int b=(unsigned char)*pos; + printf("%02X",b); + } + printf("%s",tail); +} +#endif + + +static void print_record(char * record, my_off_t offs,const char * tail) +{ + char *pos; + char *ptr; + uint len; + + printf(" rec=(%d)",(unsigned char)record[0]); + pos=record+1; + len=sint4korr(pos); + pos+=4; + printf(" len=%d ",len); + memcpy_fixed(&ptr,pos,sizeof(char*)); + if (ptr) + maria_rtree_PrintWKB((uchar*) ptr,SPDIMS); + else + printf("<NULL> "); + printf(" offs=%ld ",(long int)offs); + printf("%s",tail); +} + + +#ifdef NOT_USED +static void create_point(char *record,uint rownr) +{ + uint tmp; + char *ptr; + char *pos=record; + double x[200]; + int i; + + for(i=0;i<SPDIMS;i++) + x[i]=rownr; + + bzero((char*) record,MAX_REC_LENGTH); + *pos=0x01; /* DEL marker */ + pos++; + + memset(blob_key,0,sizeof(blob_key)); + tmp=maria_rtree_CreatePointWKB(x,SPDIMS,blob_key); + + int4store(pos,tmp); + pos+=4; + + ptr=blob_key; + memcpy_fixed(pos,&ptr,sizeof(char*)); +} +#endif + + +static void create_linestring(char *record,uint rownr) +{ + uint tmp; + char *ptr; + char *pos=record; + double x[200]; + int i,j; + int npoints=2; + + for(j=0;j<npoints;j++) + for(i=0;i<SPDIMS;i++) + x[i+j*SPDIMS]=rownr*j; + + bzero((char*) record,MAX_REC_LENGTH); + *pos=0x01; /* DEL marker */ + pos++; + + memset(blob_key,0,sizeof(blob_key)); + tmp=maria_rtree_CreateLineStringWKB(x,SPDIMS,npoints, (uchar*) blob_key); + + 
int4store(pos,tmp); + pos+=4; + + ptr=blob_key; + memcpy_fixed(pos,&ptr,sizeof(char*)); +} + + +static void create_key(char *key,uint rownr) +{ + double c=rownr; + char *pos; + uint i; + + bzero(key,MAX_REC_LENGTH); + for ( pos=key, i=0; i<2*SPDIMS; i++) + { + float8store(pos,c); + pos+=sizeof(c); + } +} + +static void print_key(const char *key,const char * tail) +{ + double c; + uint i; + + printf(" key="); + for (i=0; i<2*SPDIMS; i++) + { + float8get(c,key); + key+=sizeof(c); + printf("%.14g ",c); + } + printf("%s",tail); +} + + +#ifdef NOT_USED + +static int maria_rtree_CreatePointWKB(double *ords, uint n_dims, uchar *wkb) +{ + uint i; + + *wkb = wkbXDR; + ++wkb; + int4store(wkb, wkbPoint); + wkb += 4; + + for (i=0; i < n_dims; ++i) + { + float8store(wkb, ords[i]); + wkb += 8; + } + return 5 + n_dims * 8; +} +#endif + + +static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points, + uchar *wkb) +{ + uint i; + uint n_ords = n_dims * n_points; + + *wkb = wkbXDR; + ++wkb; + int4store(wkb, wkbLineString); + wkb += 4; + int4store(wkb, n_points); + wkb += 4; + for (i=0; i < n_ords; ++i) + { + float8store(wkb, ords[i]); + wkb += 8; + } + return 9 + n_points * n_dims * 8; +} + + +static void maria_rtree_PrintWKB(uchar *wkb, uint n_dims) +{ + uint wkb_type; + + ++wkb; + wkb_type = uint4korr(wkb); + wkb += 4; + + switch ((enum wkbType)wkb_type) + { + case wkbPoint: + { + uint i; + double ord; + + printf("POINT("); + for (i=0; i < n_dims; ++i) + { + float8get(ord, wkb); + wkb += 8; + printf("%.14g", ord); + if (i < n_dims - 1) + printf(" "); + else + printf(")"); + } + break; + } + case wkbLineString: + { + uint p, i; + uint n_points; + double ord; + + printf("LineString("); + n_points = uint4korr(wkb); + wkb += 4; + for (p=0; p < n_points; ++p) + { + for (i=0; i < n_dims; ++i) + { + float8get(ord, wkb); + wkb += 8; + printf("%.14g", ord); + if (i < n_dims - 1) + printf(" "); + } + if (p < n_points - 1) + printf(", "); + else + printf(")"); + } + 
break; + } + case wkbPolygon: + { + printf("POLYGON(...)"); + break; + } + case wkbMultiPoint: + { + printf("MULTIPOINT(...)"); + break; + } + case wkbMultiLineString: + { + printf("MULTILINESTRING(...)"); + break; + } + case wkbMultiPolygon: + { + printf("MULTIPOLYGON(...)"); + break; + } + case wkbGeometryCollection: + { + printf("GEOMETRYCOLLECTION(...)"); + break; + } + default: + { + printf("UNKNOWN GEOMETRY TYPE"); + break; + } + } +} + +#else +int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused))) +{ + exit(0); +} +#endif /*HAVE_SPATIAL*/ diff --git a/storage/maria/ma_static.c b/storage/maria/ma_static.c new file mode 100644 index 00000000000..41b202491a7 --- /dev/null +++ b/storage/maria/ma_static.c @@ -0,0 +1,79 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Static variables for MARIA library. 
All defined here for easy making of + a shared library +*/ + +#ifndef _global_h +#include "maria_def.h" +#include "trnman.h" +#endif + +LIST *maria_open_list=0; +uchar NEAR maria_file_magic[]= +{ (uchar) 254, (uchar) 254, (uchar) 9, '\001', }; +uchar NEAR maria_pack_file_magic[]= +{ (uchar) 254, (uchar) 254, (uchar) 10, '\001', }; +uint maria_quick_table_bits=9; +ulong maria_block_size= MARIA_KEY_BLOCK_LENGTH; +my_bool maria_flush= 0, maria_single_user= 0; +my_bool maria_delay_key_write= 0; +#if defined(THREAD) && !defined(DONT_USE_RW_LOCKS) +ulong maria_concurrent_insert= 2; +#else +ulong maria_concurrent_insert= 0; +#endif +my_off_t maria_max_temp_length= MAX_FILE_SIZE; +ulong maria_bulk_insert_tree_size=8192*1024; +ulong maria_data_pointer_size= 4; + +PAGECACHE maria_pagecache_var; +PAGECACHE *maria_pagecache= &maria_pagecache_var; + +PAGECACHE maria_log_pagecache_var; +PAGECACHE *maria_log_pagecache= &maria_log_pagecache_var; + +/** + @brief when transactionality does not matter we can use this transaction + + Used in external programs like ma_test*, and also internally inside + libmaria when there is no transaction around and the operation isn't + transactional (CREATE/DROP/RENAME/OPTIMIZE/REPAIR). +*/ +TRN dummy_transaction_object; + +/* Enough for comparing if number is zero */ +uchar maria_zero_string[]= {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + +/* + read_vec[] is used for converting between P_READ_KEY.. 
and SEARCH_ + Position is , == , >= , <= , > , < +*/ + +uint NEAR maria_read_vec[]= +{ + SEARCH_FIND, SEARCH_FIND | SEARCH_BIGGER, SEARCH_FIND | SEARCH_SMALLER, + SEARCH_NO_FIND | SEARCH_BIGGER, SEARCH_NO_FIND | SEARCH_SMALLER, + SEARCH_FIND | SEARCH_PREFIX, SEARCH_LAST, SEARCH_LAST | SEARCH_SMALLER, + MBR_CONTAIN, MBR_INTERSECT, MBR_WITHIN, MBR_DISJOINT, MBR_EQUAL +}; + +uint NEAR maria_readnext_vec[]= +{ + SEARCH_BIGGER, SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_BIGGER, SEARCH_SMALLER, + SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_SMALLER +}; diff --git a/storage/maria/ma_statrec.c b/storage/maria/ma_statrec.c new file mode 100644 index 00000000000..ebfab4fad76 --- /dev/null +++ b/storage/maria/ma_statrec.c @@ -0,0 +1,294 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + /* Functions to handle fixed-length-records */ + +#include "maria_def.h" + + +my_bool _ma_write_static_record(MARIA_HA *info, const uchar *record) +{ + uchar temp[8]; /* max pointer length */ + if (info->s->state.dellink != HA_OFFSET_ERROR && + !info->append_insert_at_end) + { + my_off_t filepos=info->s->state.dellink; + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_read(info,(char*) &temp[0],info->s->base.rec_reflength, + info->s->state.dellink+1, + MYF(MY_NABP))) + goto err; + info->s->state.dellink= _ma_rec_pos(info->s,temp); + info->state->del--; + info->state->empty-=info->s->base.pack_reclength; + if (info->s->file_write(info, (char*) record, info->s->base.reclength, + filepos, + MYF(MY_NABP))) + goto err; + } + else + { + if (info->state->data_file_length > info->s->base.max_data_file_length- + info->s->base.pack_reclength) + { + my_errno=HA_ERR_RECORD_FILE_FULL; + return(2); + } + if (info->opt_flag & WRITE_CACHE_USED) + { /* Cache in use */ + if (my_b_write(&info->rec_cache, (uchar*) record, + info->s->base.reclength)) + goto err; + if (info->s->base.pack_reclength != info->s->base.reclength) + { + uint length=info->s->base.pack_reclength - info->s->base.reclength; + bzero((char*) temp,length); + if (my_b_write(&info->rec_cache, (uchar*) temp,length)) + goto err; + } + } + else + { + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_write(info,(char*) record,info->s->base.reclength, + info->state->data_file_length, + info->s->write_flag)) + goto err; + if (info->s->base.pack_reclength != info->s->base.reclength) + { + uint length=info->s->base.pack_reclength - info->s->base.reclength; + bzero((char*) temp,length); + if (info->s->file_write(info, (uchar*) temp,length, + 
info->state->data_file_length+ + info->s->base.reclength, + info->s->write_flag)) + goto err; + } + } + info->state->data_file_length+=info->s->base.pack_reclength; + info->s->state.split++; + } + return 0; + err: + return 1; +} + +my_bool _ma_update_static_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec __attribute__ ((unused)), + const uchar *record) +{ + info->rec_cache.seek_not_done=1; /* We have done a seek */ + return (info->s->file_write(info, + (char*) record,info->s->base.reclength, + pos, + MYF(MY_NABP)) != 0); +} + + +my_bool _ma_delete_static_record(MARIA_HA *info, + const uchar *record __attribute__ ((unused))) +{ + uchar temp[9]; /* 1+max pointer length: delete marker, then link pointer */ + info->state->del++; + info->state->empty+=info->s->base.pack_reclength; + temp[0]= '\0'; /* Mark that record is deleted */ + _ma_dpointer(info,temp+1,info->s->state.dellink); + info->s->state.dellink= info->cur_row.lastpos; + info->rec_cache.seek_not_done=1; + return (info->s->file_write(info, temp, 1+info->s->rec_reflength, + info->cur_row.lastpos, MYF(MY_NABP)) != 0); +} + + +my_bool _ma_cmp_static_record(register MARIA_HA *info, + register const uchar *old) +{ + DBUG_ENTER("_ma_cmp_static_record"); + + /* We are going to do changes; don't let anybody disturb */ + dont_break(); /* Don't allow SIGHUP or SIGINT */ + + if (info->opt_flag & WRITE_CACHE_USED) + { + if (flush_io_cache(&info->rec_cache)) + { + DBUG_RETURN(1); + } + info->rec_cache.seek_not_done=1; /* We have done a seek */ + } + + if ((info->opt_flag & READ_CHECK_USED)) + { /* If check isn't disabled */ + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_read(info, (char*) info->rec_buff, + info->s->base.reclength, + info->cur_row.lastpos, + MYF(MY_NABP))) + DBUG_RETURN(1); + if (memcmp((uchar*) info->rec_buff, (uchar*) old, + (uint) info->s->base.reclength)) + { + DBUG_DUMP("read",old,info->s->base.reclength); + DBUG_DUMP("disk",info->rec_buff,info->s->base.reclength); + 
my_errno=HA_ERR_RECORD_CHANGED; /* Record has changed */ + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos) +{ + DBUG_ENTER("_ma_cmp_static_unique"); + + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_read(info, (char*) info->rec_buff, info->s->base.reclength, + pos, MYF(MY_NABP))) + DBUG_RETURN(1); + DBUG_RETURN(_ma_unique_comp(def, record, (uchar*) info->rec_buff, + def->null_are_equal)); +} + + +/* + Read a fixed-length-record + + RETURN + 0 Ok + HA_ERR_RECORD_DELETED Record is deleted (also stored in my_errno) + my_errno Otherwise (read error, failed cache flush or pos == HA_OFFSET_ERROR) +*/ + +int _ma_read_static_record(register MARIA_HA *info, register uchar *record, + MARIA_RECORD_POS pos) +{ + int error; + + if (pos != HA_OFFSET_ERROR) + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file <= pos && + flush_io_cache(&info->rec_cache)) + return(my_errno); + info->rec_cache.seek_not_done=1; /* We have done a seek */ + + error=info->s->file_read(info,(char*) record,info->s->base.reclength, + pos, MYF(MY_NABP)); + if (! 
error) + { + fast_ma_writeinfo(info); + if (!*record) + { + /* Record is deleted */ + return ((my_errno=HA_ERR_RECORD_DELETED)); + } + info->update|= HA_STATE_AKTIV; /* Record is read */ + return(0); + } + } + fast_ma_writeinfo(info); /* No such record */ + return(my_errno); +} + + + +int _ma_read_rnd_static_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks) +{ + int locked,error,cache_read; + uint cache_length; + MARIA_SHARE *share=info->s; + DBUG_ENTER("_ma_read_rnd_static_record"); + + cache_read=0; + cache_length=0; + if (info->opt_flag & READ_CACHE_USED) + { /* Cache in use */ + if (filepos == my_b_tell(&info->rec_cache) && + (skip_deleted_blocks || !filepos)) + { + cache_read=1; /* Read record using cache */ + cache_length=(uint) (info->rec_cache.read_end - info->rec_cache.read_pos); + } + else + info->rec_cache.seek_not_done=1; /* Filepos is changed */ + } + locked=0; + if (info->lock_type == F_UNLCK) + { + if (filepos >= info->state->data_file_length) + { /* Test if new records */ + if (_ma_readinfo(info,F_RDLCK,0)) + DBUG_RETURN(my_errno); + locked=1; + } + else + { /* We don't need new info */ +#ifndef UNSAFE_LOCKING + if ((! cache_read || share->base.reclength > cache_length) && + share->tot_locks == 0) + { /* record not in cache; NOTE(review): 'locked' is set but no lock call is visible in this branch - confirm */ + locked=1; + } +#else + info->tmp_lock_type=F_RDLCK; +#endif + } + } + if (filepos >= info->state->data_file_length) + { + DBUG_PRINT("test",("filepos: %ld (%ld) records: %ld del: %ld", + (long) filepos/share->base.reclength, (long) filepos, + (long) info->state->records, (long) info->state->del)); + fast_ma_writeinfo(info); + DBUG_RETURN(my_errno=HA_ERR_END_OF_FILE); + } + info->cur_row.lastpos= filepos; + info->cur_row.nextpos= filepos+share->base.pack_reclength; + + if (! 
cache_read) /* No caching */ + { + error= _ma_read_static_record(info, buf, filepos); + DBUG_RETURN(error); + } + + /* Read record with caching */ + error=my_b_read(&info->rec_cache,(uchar*) buf,share->base.reclength); + if (info->s->base.pack_reclength != info->s->base.reclength && !error) + { + char tmp[8]; /* Skip fill bytes */ + error=my_b_read(&info->rec_cache,(uchar*) tmp, + info->s->base.pack_reclength - info->s->base.reclength); + } + if (locked) + VOID(_ma_writeinfo(info,0)); /* Unlock keyfile */ + if (!error) + { + if (!buf[0]) + { /* Record is removed */ + DBUG_RETURN(my_errno=HA_ERR_RECORD_DELETED); + } + /* Found and may be updated */ + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + DBUG_RETURN(0); + } + /* my_errno should be set if rec_cache.error == -1 */ + if (info->rec_cache.error != -1 || my_errno == 0) + my_errno=HA_ERR_WRONG_IN_RECORD; + DBUG_RETURN(my_errno); /* Something wrong (EOF?) */ +} diff --git a/storage/maria/ma_test1.c b/storage/maria/ma_test1.c new file mode 100644 index 00000000000..80bd3c348a7 --- /dev/null +++ b/storage/maria/ma_test1.c @@ -0,0 +1,846 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Testing of the basic functions of a MARIA table */ + +#include "maria_def.h" +#include <my_getopt.h> +#include <m_string.h> +#include "ma_control_file.h" +#include "ma_loghandler.h" +#include "trnman.h" + +extern PAGECACHE *maria_log_pagecache; +extern const char *maria_data_root; + +#define MAX_REC_LENGTH 1024 + +static void usage(); + +static int rec_pointer_size=0, flags[50], testflag; +static int key_field=FIELD_SKIP_PRESPACE,extra_field=FIELD_SKIP_ENDSPACE; +static int key_type=HA_KEYTYPE_NUM; +static int create_flag=0; +static enum data_file_type record_type= DYNAMIC_RECORD; + +static uint insert_count, update_count, remove_count; +static uint pack_keys=0, pack_seg=0, key_length; +static uint unique_key=HA_NOSAME; +static uint die_in_middle_of_transaction; +static my_bool pagecacheing, null_fields, silent, skip_update, opt_unique; +static my_bool verbose, skip_delete, transactional; +static MARIA_COLUMNDEF recinfo[4]; +static MARIA_KEYDEF keyinfo[10]; +static HA_KEYSEG keyseg[10]; +static HA_KEYSEG uniqueseg[10]; + +static int run_test(const char *filename); +static void get_options(int argc, char *argv[]); +static void create_key(char *key,uint rownr); +static void create_record(char *record,uint rownr); +static void update_record(char *record); + + +/* + These are here only for testing of recovery with undo. 
We are not + including maria_def.h here as this test is also to be an example of + how to use maria outside of the maria directory +*/ + +extern int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index, + enum flush_type flush_type_for_data, + enum flush_type flush_type_for_index); +#define MARIA_FLUSH_DATA 1 + + +int main(int argc,char *argv[]) +{ + MY_INIT(argv[0]); + my_init(); + get_options(argc,argv); + maria_data_root= "."; + /* Maria requires that we always have a page cache */ + if (maria_init() || + (init_pagecache(maria_pagecache, IO_SIZE*16, 0, 0, + maria_block_size) == 0) || + ma_control_file_create_or_open() || + (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE) == 0) || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS) || + (transactional && trnman_init(0))) + { + fprintf(stderr, "Error in initialization"); + exit(1); + } + + exit(run_test("test1")); +} + + +static int run_test(const char *filename) +{ + MARIA_HA *file; + int i,j,error,deleted,rec_length,uniques=0; + uint offset_to_key; + ha_rows found,row_count; + char record[MAX_REC_LENGTH],key[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH]; + MARIA_UNIQUEDEF uniquedef; + MARIA_CREATE_INFO create_info; + + if (die_in_middle_of_transaction) + null_fields= 1; + + bzero((char*) recinfo,sizeof(recinfo)); + bzero((char*) &create_info,sizeof(create_info)); + + /* First define 2 columns */ + create_info.null_bytes= 1; + recinfo[0].type= key_field; + recinfo[0].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr : + key_length); + if (key_field == FIELD_VARCHAR) + recinfo[0].length+= HA_VARCHAR_PACKLENGTH(key_length); + recinfo[1].type=extra_field; + recinfo[1].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : 24); + if (extra_field == FIELD_VARCHAR) + recinfo[1].length+= HA_VARCHAR_PACKLENGTH(recinfo[1].length); + recinfo[1].null_bit= null_fields ? 
2 : 0; + + if (opt_unique) + { + recinfo[2].type=FIELD_CHECK; + recinfo[2].length=MARIA_UNIQUE_HASH_LENGTH; + } + rec_length= recinfo[0].length+recinfo[1].length+recinfo[2].length; + + if (key_type == HA_KEYTYPE_VARTEXT1 && + key_length > 255) + key_type= HA_KEYTYPE_VARTEXT2; + + /* Define a key over the first column */ + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[0].key_alg=HA_KEY_ALG_BTREE; + keyinfo[0].seg[0].type= key_type; + keyinfo[0].seg[0].flag= pack_seg; + keyinfo[0].seg[0].start=1; + keyinfo[0].seg[0].length=key_length; + keyinfo[0].seg[0].null_bit= null_fields ? 2 : 0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].language= default_charset_info->number; + if (pack_seg & HA_BLOB_PART) + { + keyinfo[0].seg[0].bit_start=4; /* Length of blob length */ + } + keyinfo[0].flag = (uint8) (pack_keys | unique_key); + + bzero((uchar*) flags,sizeof(flags)); + if (opt_unique) + { + uint start; + uniques=1; + bzero((char*) &uniquedef,sizeof(uniquedef)); + bzero((char*) uniqueseg,sizeof(uniqueseg)); + uniquedef.seg=uniqueseg; + uniquedef.keysegs=2; + + /* Make a unique over all columns (except first NULL fields) */ + for (i=0, start=1 ; i < 2 ; i++) + { + uniqueseg[i].start=start; + start+=recinfo[i].length; + uniqueseg[i].length=recinfo[i].length; + uniqueseg[i].language= default_charset_info->number; + } + uniqueseg[0].type= key_type; + uniqueseg[0].null_bit= null_fields ? 2 : 0; + uniqueseg[1].type= HA_KEYTYPE_TEXT; + if (extra_field == FIELD_BLOB) + { + uniqueseg[1].length=0; /* The whole blob */ + uniqueseg[1].bit_start=4; /* long blob */ + uniqueseg[1].flag|= HA_BLOB_PART; + } + else if (extra_field == FIELD_VARCHAR) + { + uniqueseg[1].flag|= HA_VAR_LENGTH_PART; + uniqueseg[1].type= (HA_VARCHAR_PACKLENGTH(recinfo[1].length-1) == 1 ? 
+ HA_KEYTYPE_VARTEXT1 : HA_KEYTYPE_VARTEXT2); + } + } + else + uniques=0; + + offset_to_key= test(null_fields); + if (key_field == FIELD_BLOB || key_field == FIELD_VARCHAR) + offset_to_key+= 2; + + if (!silent) + printf("- Creating maria file\n"); + create_info.max_rows=(ulong) (rec_pointer_size ? + (1L << (rec_pointer_size*8))/40 : + 0); + create_info.transactional= transactional; + if (maria_create(filename, record_type, 1, keyinfo,2+opt_unique,recinfo, + uniques, &uniquedef, &create_info, + create_flag)) + goto err; + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED))) + goto err; + if (!silent) + printf("- Writing key:s\n"); + + if (maria_begin(file)) + goto err; + my_errno=0; + row_count=deleted=0; + for (i=49 ; i>=1 ; i-=2 ) + { + if (insert_count-- == 0) { VOID(maria_close(file)) ; exit(0) ; } + j=i%25 +1; + create_record(record,j); + error=maria_write(file,record); + if (!error) + row_count++; + flags[j]=1; + if (verbose || error) + printf("J= %2d maria_write: %d errno: %d\n", j,error,my_errno); + } + + if (maria_commit(file) || maria_begin(file)) + goto err; + + if (testflag == 1) + goto end; + + /* Insert 2 rows with null values */ + if (null_fields) + { + create_record(record,0); + error=maria_write(file,record); + if (!error) + row_count++; + if (verbose || error) + printf("J= NULL maria_write: %d errno: %d\n", error,my_errno); + error=maria_write(file,record); + if (!error) + row_count++; + if (verbose || error) + printf("J= NULL maria_write: %d errno: %d\n", error,my_errno); + flags[0]=2; + } + + if (testflag == 2) + { + printf("Terminating after inserts\n"); + goto end; + } + + if (maria_commit(file) || maria_begin(file)) + goto err; + + if (!skip_update) + { + if (opt_unique) + { + if (!silent) + printf("- Checking unique constraint\n"); + create_record(record,j); + if (!maria_write(file,record) || my_errno != HA_ERR_FOUND_DUPP_UNIQUE) + { + printf("unique check failed\n"); + } + } + if (!silent) + printf("- Updating rows\n"); + + /* Update 
first last row to force extend of file */ + if (maria_rsame(file,read_record,-1)) + { + printf("Can't find last row with maria_rsame\n"); + } + else + { + memcpy(record,read_record,rec_length); + update_record(record); + if (maria_update(file,read_record,record)) + { + printf("Can't update last row: %.*s\n", + keyinfo[0].seg[0].length,read_record+1); + } + } + + /* Read through all rows and update them */ + assert(maria_scan_init(file) == 0); + + found=0; + while ((error= maria_scan(file,read_record)) == 0) + { + if (--update_count == 0) { VOID(maria_close(file)) ; exit(0) ; } + memcpy(record,read_record,rec_length); + update_record(record); + if (maria_update(file,read_record,record)) + { + printf("Can't update row: %.*s, error: %d\n", + keyinfo[0].seg[0].length,record+1,my_errno); + } + found++; + } + if (found != row_count) + printf("Found %ld of %ld rows\n", (ulong) found, (ulong) row_count); + maria_scan_end(file); + } + + if (testflag == 3) + { + printf("Terminating after updates\n"); + goto end; + } + if (!silent) + printf("- Reopening file\n"); + if (maria_commit(file)) + goto err; + if (maria_close(file)) + goto err; + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED))) + goto err; + if (maria_begin(file)) + goto err; + if (!skip_delete) + { + if (!silent) + printf("- Removing keys\n"); + + for (i=0 ; i <= 10 ; i++) + { + /* + If you want to debug the problem in ma_test_recovery with BLOBs + (see @todo there), you can break out of the loop after just one + delete, it is enough, like this: + if (i==1) break; + */ + /* testing */ + if (remove_count-- == 0) + { + fprintf(stderr, + "delete-rows number of rows deleted; Going down hard!\n"); + goto end; + } + j=i*2; + if (!flags[j]) + continue; + create_key(key,j); + my_errno=0; + if ((error = maria_rkey(file, read_record, 0, key, + HA_WHOLE_KEY, HA_READ_KEY_EXACT))) + { + if (verbose || (flags[j] >= 1 || + (error && my_errno != HA_ERR_KEY_NOT_FOUND))) + printf("key: '%.*s' maria_rkey: %3d errno: %3d\n", 
+ (int) key_length,key+offset_to_key,error,my_errno); + } + else + { + error=maria_delete(file,read_record); + if (verbose || error) + printf("key: '%.*s' maria_delete: %3d errno: %3d\n", + (int) key_length, key+offset_to_key, error, my_errno); + if (! error) + { + deleted++; + flags[j]--; + } + } + } + } + + if (testflag == 4) + { + printf("Terminating after deletes\n"); + goto end; + } + + if (!silent) + printf("- Reading rows with key\n"); + record[1]= 0; /* For nicer printf */ + for (i=0 ; i <= 25 ; i++) + { + create_key(key,i); + my_errno=0; + error=maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT); + if (verbose || + (error == 0 && flags[i] == 0 && unique_key) || + (error && (flags[i] != 0 || my_errno != HA_ERR_KEY_NOT_FOUND))) + { + printf("key: '%.*s' maria_rkey: %3d errno: %3d record: %s\n", + (int) key_length,key+offset_to_key,error,my_errno,record+1); + } + } + + if (!silent) + printf("- Reading rows with position\n"); + if (maria_scan_init(file)) + { + fprintf(stderr, "maria_scan_init failed\n"); + goto err; + } + + for (i=1,found=0 ; i <= 30 ; i++) + { + my_errno=0; + if ((error= maria_scan(file, read_record)) == HA_ERR_END_OF_FILE) + { + if (found != row_count-deleted) + printf("Found only %ld of %ld rows\n", (ulong) found, + (ulong) (row_count - deleted)); + break; + } + if (!error) + found++; + if (verbose || (error != 0 && error != HA_ERR_RECORD_DELETED && + error != HA_ERR_END_OF_FILE)) + { + printf("pos: %2d maria_rrnd: %3d errno: %3d record: %s\n", + i-1,error,my_errno,read_record+1); + } + } + +end: + if (die_in_middle_of_transaction) + { + /* As commit record is not done, UNDO entries needs to be rolled back */ + switch (die_in_middle_of_transaction) { + case 1: + /* + Flush changed pages go to disk. That will also flush log. Recovery + will skip REDOs and apply UNDOs. + */ + _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE, + FLUSH_RELEASE); + break; + case 2: + /* + Just flush log. 
Pages are likely to not be on disk. Recovery will + then execute REDOs and UNDOs. + */ + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + case 3: + /* + Flush nothing. Pages and log are likely to not be on disk. Recovery + will then do nothing. + */ + break; + } + printf("Dying on request without maria_commit()/maria_close()\n"); + exit(0); + } + + if (maria_commit(file)) + goto err; + if (maria_close(file)) + goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return (0); +err: + printf("got error: %3d when using maria-database\n",my_errno); + return 1; /* skip warning */ +} + + +static void create_key_part(char *key,uint rownr) +{ + if (!unique_key) + rownr&=7; /* Some identical keys */ + if (keyinfo[0].seg[0].type == HA_KEYTYPE_NUM) + { + sprintf(key,"%*d",keyinfo[0].seg[0].length,rownr); + } + else if (keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT1 || + keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT2) + { /* Alpha record */ + /* Create a key that may be easily packed */ + bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 'A' : 'B'); + sprintf(key+keyinfo[0].seg[0].length-2,"%-2d",rownr); + if ((rownr & 7) == 0) + { + /* Change the key to force a unpack of the next key */ + bfill(key+3,keyinfo[0].seg[0].length-5,rownr < 10 ? 'a' : 'b'); + } + } + else + { /* Alpha record */ + if (keyinfo[0].seg[0].flag & HA_SPACE_PACK) + sprintf(key,"%-*d",keyinfo[0].seg[0].length,rownr); + else + { + /* Create a key that may be easily packed */ + bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 'A' : 'B'); + sprintf(key+keyinfo[0].seg[0].length-2,"%-2d",rownr); + if ((rownr & 7) == 0) + { + /* Change the key to force a unpack of the next key */ + key[1]= (rownr < 10 ? 
'a' : 'b'); + } + } + } +} + + +static void create_key(char *key,uint rownr) +{ + if (keyinfo[0].seg[0].null_bit) + { + if (rownr == 0) + { + key[0]=1; /* null key */ + key[1]=0; /* For easy print of key */ + return; + } + *key++=0; + } + if (keyinfo[0].seg[0].flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + uint tmp; + create_key_part(key+2,rownr); + tmp=strlen(key+2); + int2store(key,tmp); + } + else + create_key_part(key,rownr); +} + + +static char blob_key[MAX_REC_LENGTH]; +static char blob_record[MAX_REC_LENGTH+20*20]; + + +static void create_record(char *record,uint rownr) +{ + char *pos; + bzero((char*) record,MAX_REC_LENGTH); + record[0]=1; /* delete marker */ + if (rownr == 0 && keyinfo[0].seg[0].null_bit) + record[0]|=keyinfo[0].seg[0].null_bit; /* Null key */ + + pos=record+1; + if (recinfo[0].type == FIELD_BLOB) + { + uint tmp; + char *ptr; + create_key_part(blob_key,rownr); + tmp=strlen(blob_key); + int4store(pos,tmp); + ptr=blob_key; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + pos+=recinfo[0].length; + } + else if (recinfo[0].type == FIELD_VARCHAR) + { + uint tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1); + create_key_part(pos+pack_length,rownr); + tmp= strlen(pos+pack_length); + if (pack_length == 1) + *(uchar*) pos= (uchar) tmp; + else + int2store(pos,tmp); + pos+= recinfo[0].length; + } + else + { + create_key_part(pos,rownr); + pos+=recinfo[0].length; + } + if (recinfo[1].type == FIELD_BLOB) + { + uint tmp; + char *ptr;; + sprintf(blob_record,"... row: %d", rownr); + strappend(blob_record,max(MAX_REC_LENGTH-rownr,10),' '); + tmp=strlen(blob_record); + int4store(pos,tmp); + ptr=blob_record; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + } + else if (recinfo[1].type == FIELD_VARCHAR) + { + uint tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1); + sprintf(pos+pack_length, "... 
row: %d", rownr); + tmp= strlen(pos+pack_length); + if (pack_length == 1) + *(uchar*) pos= (uchar) tmp; + else + int2store(pos,tmp); + } + else + { + sprintf(pos,"... row: %d", rownr); + strappend(pos,recinfo[1].length,' '); + } +} + +/* change row to test re-packing of rows and reallocation of keys */ + +static void update_record(char *record) +{ + char *pos=record+1; + if (recinfo[0].type == FIELD_BLOB) + { + char *column,*ptr; + int length; + length=uint4korr(pos); /* Long blob */ + memcpy_fixed(&column,pos+4,sizeof(char*)); + memcpy(blob_key,column,length); /* Move old key */ + ptr=blob_key; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); /* Store pointer to new key */ + if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM) + default_charset_info->cset->casedn(default_charset_info, + blob_key, length, blob_key, length); + pos+=recinfo[0].length; + } + else if (recinfo[0].type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1); + uint length= pack_length == 1 ? (uint) *(uchar*) pos : uint2korr(pos); + default_charset_info->cset->casedn(default_charset_info, + pos + pack_length, length, + pos + pack_length, length); + pos+=recinfo[0].length; + } + else + { + if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM) + default_charset_info->cset->casedn(default_charset_info, + pos, keyinfo[0].seg[0].length, + pos, keyinfo[0].seg[0].length); + pos+=recinfo[0].length; + } + + if (recinfo[1].type == FIELD_BLOB) + { + char *column; + int length; + length=uint4korr(pos); + memcpy_fixed(&column,pos+4,sizeof(char*)); + memcpy(blob_record,column,length); + bfill(blob_record+length,20,'.'); /* Make it larger */ + length+=20; + int4store(pos,length); + column=blob_record; + memcpy_fixed(pos+4,&column,sizeof(char*)); + } + else if (recinfo[1].type == FIELD_VARCHAR) + { + /* Second field is longer than 10 characters */ + uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1); + uint length= pack_length == 1 ? 
(uint) *(uchar*) pos : uint2korr(pos); + pos= record+ recinfo[1].offset; + bfill(pos+pack_length+length,recinfo[1].length-length-pack_length,'.'); + length=recinfo[1].length-pack_length; + if (pack_length == 1) + *(uchar*) pos= (uchar) length; + else + int2store(pos,length); + } + else + { + bfill(pos+recinfo[1].length-10,10,'.'); + } +} + + +static struct my_option my_long_options[] = +{ + {"checksum", 'c', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', "Undocumented", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"delete-rows", 'd', "Abort after this many rows has been deleted", + (uchar**) &remove_count, (uchar**) &remove_count, 0, GET_UINT, REQUIRED_ARG, + 1000, 0, 0, 0, 0, 0}, + {"help", '?', "Display help and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"insert-rows", 'i', "Undocumented", (uchar**) &insert_count, + (uchar**) &insert_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0}, + {"key-alpha", 'a', "Use a key of type HA_KEYTYPE_TEXT", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-binary-pack", 'B', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-blob", 'b', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-cache", 'K', "Undocumented", (uchar**) &pagecacheing, + (uchar**) &pagecacheing, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-length", 'k', "Undocumented", (uchar**) &key_length, + (uchar**) &key_length, 0, GET_UINT, REQUIRED_ARG, 6, 0, 0, 0, 0, 0}, + {"key-multiple", 'm', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-prefix_pack", 'P', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-space_pack", 'p', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-varchar", 'w', "Test VARCHAR keys", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"null-fields", 'N', "Define fields with NULL", + (uchar**) &null_fields, (uchar**) 
&null_fields, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, + {"row-fixed-size", 'S', "Fixed size records", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"rows-in-block", 'M', "Store rows in block format", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"row-pointer-size", 'R', "Undocumented", (uchar**) &rec_pointer_size, + (uchar**) &rec_pointer_size, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"silent", 's', "Undocumented", + (uchar**) &silent, (uchar**) &silent, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, + 0, 0}, + {"skip-delete", 'U', "Don't test deletes", (uchar**) &skip_delete, + (uchar**) &skip_delete, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"skip-update", 'D', "Don't test updates", (uchar**) &skip_update, + (uchar**) &skip_update, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"testflag", 't', "Stop test at specified stage", (uchar**) &testflag, + (uchar**) &testflag, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"test-undo", 'A', + "Abort hard. Used for testing recovery with undo", + (uchar**) &die_in_middle_of_transaction, + (uchar**) &die_in_middle_of_transaction, + 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"transactional", 'T', + "Test in transactional mode. 
(Only works with block format)", + (uchar**) &transactional, (uchar**) &transactional, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, + {"unique", 'C', "Undocumented", (uchar**) &opt_unique, + (uchar**) &opt_unique, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"update-rows", 'u', "Max number of rows to update", (uchar**) &update_count, + (uchar**) &update_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Be more verbose", (uchar**) &verbose, + (uchar**) &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version number and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument __attribute__((unused))) +{ + switch(optid) { + case 'a': + key_type= HA_KEYTYPE_TEXT; + break; + case 'c': + create_flag|= HA_CREATE_CHECKSUM; + break; + case 'R': /* Length of record pointer */ + if (rec_pointer_size > 3) + rec_pointer_size=0; + break; + case 'P': + pack_keys= HA_PACK_KEY; /* Use prefix compression */ + break; + case 'B': + pack_keys= HA_BINARY_PACK_KEY; /* Use binary compression */ + break; + case 'M': + record_type= BLOCK_RECORD; + break; + case 'S': + if (key_field == FIELD_VARCHAR) + { + create_flag=0; /* Static sized varchar */ + record_type= STATIC_RECORD; + } + else if (key_field != FIELD_BLOB) + { + key_field=FIELD_NORMAL; /* static-size record */ + extra_field=FIELD_NORMAL; + record_type= STATIC_RECORD; + } + break; + case 'p': + pack_keys=HA_PACK_KEY; /* Use prefix + space packing */ + pack_seg=HA_SPACE_PACK; + key_type=HA_KEYTYPE_TEXT; + break; + case 'm': + unique_key=0; + break; + case 'b': + key_field=FIELD_BLOB; /* blob key */ + extra_field= FIELD_BLOB; + pack_seg|= HA_BLOB_PART; + key_type= HA_KEYTYPE_VARTEXT1; + if (record_type == STATIC_RECORD) + record_type= DYNAMIC_RECORD; + break; + case 'k': + if (key_length < 4 || key_length > 
HA_MAX_KEY_LENGTH) + { + fprintf(stderr,"Wrong key length\n"); + exit(1); + } + break; + case 'w': + key_field=FIELD_VARCHAR; /* varchar keys */ + extra_field= FIELD_VARCHAR; + key_type= HA_KEYTYPE_VARTEXT1; + pack_seg|= HA_VAR_LENGTH_PART; + if (record_type == STATIC_RECORD) + record_type= DYNAMIC_RECORD; + break; + case 'K': /* Use key caching */ + pagecacheing=1; + break; + case 'V': + printf("test1 Ver 1.2 \n"); + exit(0); + case '#': + DBUG_PUSH (argument); + break; + case '?': + usage(); + exit(1); + } + return 0; +} + + +/* Parse command line options; exits with the handle_options() error code on failure */ + +static void get_options(int argc, char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(ho_error); + + return; +} /* get options */ + + +static void usage() +{ + printf("Usage: %s [options]\n\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} diff --git a/storage/maria/ma_test2.c b/storage/maria/ma_test2.c new file mode 100644 index 00000000000..935be09850c --- /dev/null +++ b/storage/maria/ma_test2.c @@ -0,0 +1,1180 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Test of isam database: large test */ + +#ifndef USE_MY_FUNC /* We want to be able to dbug this !! 
*/ +#define USE_MY_FUNC +#endif +#ifdef DBUG_OFF +#undef DBUG_OFF +#endif +#ifndef SAFEMALLOC +#define SAFEMALLOC +#endif +#include "maria_def.h" +#include "trnman.h" +#include <m_ctype.h> +#include <my_bit.h> + + +#define STANDARD_LENGTH 37 +#define MARIA_KEYS 6 +#define MAX_PARTS 4 +#if !defined(MSDOS) && !defined(labs) +#define labs(a) abs(a) +#endif + +static void get_options(int argc, char *argv[]); +static uint rnd(uint max_value); +static void fix_length(uchar *record,uint length); +static void put_blob_in_record(char *blob_pos,char **blob_buffer, + ulong *length); +static void copy_key(struct st_maria_info *info,uint inx, + uchar *record,uchar *key); + +static int verbose=0,testflag=0, + first_key=0,async_io=0,pagecacheing=0,write_cacheing=0,locking=0, + rec_pointer_size=0,pack_fields=1,silent=0, + opt_quick_mode=0, transactional= 0, skip_update= 0, + die_in_middle_of_transaction= 0; +static int pack_seg=HA_SPACE_PACK,pack_type=HA_PACK_KEY,remove_count=-1; +static int create_flag= 0, srand_arg= 0; +static ulong pagecache_size=IO_SIZE*16; +static enum data_file_type record_type= DYNAMIC_RECORD; + +static uint keys=MARIA_KEYS,recant=1000; +static uint use_blob=0; +static uint16 key1[1001],key3[5000]; +static char record[300],record2[300],key[100],key2[100], + read_record[300],read_record2[300],read_record3[300]; +static HA_KEYSEG glob_keyseg[MARIA_KEYS][MAX_PARTS]; + + /* Test program */ + +int main(int argc, char *argv[]) +{ + uint i; + int j,n1,n2,n3,error,k; + uint write_count,update,dupp_keys,opt_delete,start,length,blob_pos, + reclength,ant,found_parts; + my_off_t lastpos; + ha_rows range_records,records; + MARIA_HA *file; + MARIA_KEYDEF keyinfo[10]; + MARIA_COLUMNDEF recinfo[10]; + MARIA_INFO info; + const char *filename; + char *blob_buffer; + MARIA_CREATE_INFO create_info; + MY_INIT(argv[0]); + + filename= "test2"; + get_options(argc,argv); + if (! 
async_io) + my_disable_async_io=1; + + maria_data_root= "."; + /* Maria requires that we always have a page cache */ + if (maria_init() || + (init_pagecache(maria_pagecache, pagecache_size, 0, 0, + maria_block_size) == 0) || + ma_control_file_create_or_open() || + (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE) == 0) || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS) || + (transactional && trnman_init(0))) + { + fprintf(stderr, "Error in initialization"); + exit(1); + } + + reclength=STANDARD_LENGTH+60+(use_blob ? 8 : 0); + blob_pos=STANDARD_LENGTH+60; + keyinfo[0].seg= &glob_keyseg[0][0]; + keyinfo[0].seg[0].start=0; + keyinfo[0].seg[0].length=6; + keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[0].seg[0].language= default_charset_info->number; + keyinfo[0].seg[0].flag=(uint8) pack_seg; + keyinfo[0].seg[0].null_bit=0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].key_alg=HA_KEY_ALG_BTREE; + keyinfo[0].keysegs=1; + keyinfo[0].flag = pack_type; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[1].seg= &glob_keyseg[1][0]; + keyinfo[1].seg[0].start=7; + keyinfo[1].seg[0].length=6; + keyinfo[1].seg[0].type=HA_KEYTYPE_BINARY; + keyinfo[1].seg[0].flag=0; + keyinfo[1].seg[0].null_bit=0; + keyinfo[1].seg[0].null_pos=0; + keyinfo[1].seg[1].start=0; /* two part key */ + keyinfo[1].seg[1].length=6; + keyinfo[1].seg[1].type=HA_KEYTYPE_NUM; + keyinfo[1].seg[1].flag=HA_REVERSE_SORT; + keyinfo[1].seg[1].null_bit=0; + keyinfo[1].seg[1].null_pos=0; + keyinfo[1].key_alg=HA_KEY_ALG_BTREE; + keyinfo[1].keysegs=2; + keyinfo[1].flag =0; + keyinfo[1].block_length= MARIA_MIN_KEY_BLOCK_LENGTH; /* Diff blocklength */ + keyinfo[2].seg= &glob_keyseg[2][0]; + keyinfo[2].seg[0].start=12; + keyinfo[2].seg[0].length=8; + keyinfo[2].seg[0].type=HA_KEYTYPE_BINARY; + keyinfo[2].seg[0].flag=HA_REVERSE_SORT; + keyinfo[2].seg[0].null_bit=0; + keyinfo[2].seg[0].null_pos=0; + 
keyinfo[2].key_alg=HA_KEY_ALG_BTREE; + keyinfo[2].keysegs=1; + keyinfo[2].flag =HA_NOSAME; + keyinfo[2].block_length= 0; /* Default block length */ + keyinfo[3].seg= &glob_keyseg[3][0]; + keyinfo[3].seg[0].start=0; + keyinfo[3].seg[0].length=reclength-(use_blob ? 8 : 0); + keyinfo[3].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[3].seg[0].language=default_charset_info->number; + keyinfo[3].seg[0].flag=(uint8) pack_seg; + keyinfo[3].seg[0].null_bit=0; + keyinfo[3].seg[0].null_pos=0; + keyinfo[3].key_alg=HA_KEY_ALG_BTREE; + keyinfo[3].keysegs=1; + keyinfo[3].flag = pack_type; + keyinfo[3].block_length= 0; /* Default block length */ + keyinfo[4].seg= &glob_keyseg[4][0]; + keyinfo[4].seg[0].start=0; + keyinfo[4].seg[0].length=5; + keyinfo[4].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[4].seg[0].language=default_charset_info->number; + keyinfo[4].seg[0].flag=0; + keyinfo[4].seg[0].null_bit=0; + keyinfo[4].seg[0].null_pos=0; + keyinfo[4].key_alg=HA_KEY_ALG_BTREE; + keyinfo[4].keysegs=1; + keyinfo[4].flag = pack_type; + keyinfo[4].block_length= 0; /* Default block length */ + keyinfo[5].seg= &glob_keyseg[5][0]; + keyinfo[5].seg[0].start=0; + keyinfo[5].seg[0].length=4; + keyinfo[5].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[5].seg[0].language=default_charset_info->number; + keyinfo[5].seg[0].flag=pack_seg; + keyinfo[5].seg[0].null_bit=0; + keyinfo[5].seg[0].null_pos=0; + keyinfo[5].key_alg=HA_KEY_ALG_BTREE; + keyinfo[5].keysegs=1; + keyinfo[5].flag = pack_type; + keyinfo[5].block_length= 0; /* Default block length */ + + recinfo[0].type=pack_fields ? FIELD_SKIP_PRESPACE : 0; + recinfo[0].length=7; + recinfo[0].null_bit=0; + recinfo[0].null_pos=0; + recinfo[1].type=pack_fields ? FIELD_SKIP_PRESPACE : 0; + recinfo[1].length=5; + recinfo[1].null_bit=0; + recinfo[1].null_pos=0; + recinfo[2].type=pack_fields ? 
FIELD_SKIP_PRESPACE : 0; + recinfo[2].length=9; + recinfo[2].null_bit=0; + recinfo[2].null_pos=0; + recinfo[3].type=FIELD_NORMAL; + recinfo[3].length=STANDARD_LENGTH-7-5-9-4; + recinfo[3].null_bit=0; + recinfo[3].null_pos=0; + recinfo[4].type=pack_fields ? FIELD_SKIP_ZERO : 0; + recinfo[4].length=4; + recinfo[4].null_bit=0; + recinfo[4].null_pos=0; + recinfo[5].type=pack_fields ? FIELD_SKIP_ENDSPACE : 0; + recinfo[5].length=60; + recinfo[5].null_bit=0; + recinfo[5].null_pos=0; + if (use_blob) + { + recinfo[6].type=FIELD_BLOB; + recinfo[6].length=4+portable_sizeof_char_ptr; + recinfo[6].null_bit=0; + recinfo[6].null_pos=0; + } + + write_count=update=dupp_keys=opt_delete=0; + blob_buffer=0; + + for (i=1000 ; i>0 ; i--) key1[i]=0; + for (i=4999 ; i>0 ; i--) key3[i]=0; + + if (!silent) + printf("- Creating maria-file\n"); + file= 0; + bzero((char*) &create_info,sizeof(create_info)); + create_info.max_rows=(ha_rows) (rec_pointer_size ? + (1L << (rec_pointer_size*8))/ + reclength : 0); + create_info.reloc_rows=(ha_rows) 100; + create_info.transactional= transactional; + if (maria_create(filename, record_type, keys,&keyinfo[first_key], + use_blob ? 7 : 6, &recinfo[0], + 0,(MARIA_UNIQUEDEF*) 0, + &create_info,create_flag)) + goto err; + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED))) + goto err; + maria_begin(file); + if (testflag == 1) + goto end; + if (!silent) + printf("- Writing key:s\n"); + if (locking) + maria_lock_database(file,F_WRLCK); + if (write_cacheing) + maria_extra(file,HA_EXTRA_WRITE_CACHE,0); + if (opt_quick_mode) + maria_extra(file,HA_EXTRA_QUICK,0); + + for (i=0 ; i < recant ; i++) + { + ulong blob_length; +#if 0 + /* + Starting from i==72, there was a difference between runtime and + log-applying. This is now fixed, by not using non_header_data_len in + log-applying. 
+ */ + if (i == 72) goto end; +#endif + n1=rnd(1000); n2=rnd(100); n3=rnd(5000); + sprintf(record,"%6d:%4d:%8d:Pos: %4d ",n1,n2,n3,write_count); + int4store(record+STANDARD_LENGTH-4,(long) i); + fix_length(record,(uint) STANDARD_LENGTH+rnd(60)); + put_blob_in_record(record+blob_pos,&blob_buffer, &blob_length); + DBUG_PRINT("test",("record: %d blob_length: %lu", i, blob_length)); + + if (maria_write(file,record)) + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0) + { + printf("Error: %d in write at record: %d\n",my_errno,i); + goto err; + } + if (verbose) printf(" Double key: %d at record# %d\n", n3, i); + } + else + { + if (key3[n3] == 1 && first_key <3 && first_key+keys >= 3) + { + printf("Error: Didn't get error when writing second key: '%8d'\n",n3); + goto err; + } + write_count++; key1[n1]++; key3[n3]=1; + } + + /* Check if we can find key without flushing database */ + if (i % 10 == 0) + { + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + if (!j) + for (j=999 ; j>0 && key1[j] == 0 ; j--) ; + sprintf(key,"%6d",j); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + { + printf("Test in loop: Can't find key: \"%s\"\n",key); + goto err; + } + } + } + if (testflag == 2) + goto end; + + if (write_cacheing) + { + if (maria_extra(file,HA_EXTRA_NO_CACHE,0)) + { + puts("got error from maria_extra(HA_EXTRA_NO_CACHE)"); + goto err; + } + } +#ifdef REMOVE_WHEN_WE_HAVE_RESIZE + if (pagecacheing) + resize_pagecache(maria_pagecache, maria_block_size, + pagecache_size * 2, 0, 0); +#endif + if (!silent) + printf("- Delete\n"); + if (srand_arg) + srand(srand_arg); + for (i=0 ; i<recant/10 ; i++) + { + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + if (j != 0) + { + sprintf(key,"%6d",j); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + { + printf("can't find key1: \"%s\"\n",key); + goto err; + } + if (bcmp(read_record+keyinfo[0].seg[0].start, + key, keyinfo[0].seg[0].length)) + { + printf("Found wrong record 
when searching for key: \"%s\"\n",key); + goto err; + } + if (opt_delete == (uint) remove_count) /* While testing */ + goto end; + if (maria_delete(file,read_record)) + { + printf("error: %d; can't delete record: \"%s\"\n", my_errno,read_record); + goto err; + } + opt_delete++; + key1[atoi(read_record+keyinfo[0].seg[0].start)]--; + key3[atoi(read_record+keyinfo[2].seg[0].start)]=0; + } + else + puts("Warning: Skipping delete test because no dupplicate keys"); + } + if (testflag == 3) + goto end; + + if (!silent) + printf("- Update\n"); + if (srand_arg) + srand(srand_arg); + for (i=0 ; i<recant/10 ; i++) + { + n1=rnd(1000); n2=rnd(100); n3=rnd(5000); + sprintf(record2,"%6d:%4d:%8d:XXX: %4d ",n1,n2,n3,update); + int4store(record2+STANDARD_LENGTH-4,(long) i); + fix_length(record2,(uint) STANDARD_LENGTH+rnd(60)); + + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + if (j != 0) + { + sprintf(key,"%6d",j); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + { + printf("can't find key1: \"%s\"\n",key); + goto err; + } + if (bcmp(read_record+keyinfo[0].seg[0].start, + key, keyinfo[0].seg[0].length)) + { + printf("Found wrong record when searching for key: \"%s\"; Found \"%.*s\"\n", + key, keyinfo[0].seg[0].length, + read_record+keyinfo[0].seg[0].start); + goto err; + } + if (use_blob) + { + ulong blob_length; + if (i & 1) + put_blob_in_record(record+blob_pos,&blob_buffer, &blob_length); + else + bmove(record+blob_pos,read_record+blob_pos,8); + } + if (skip_update) + continue; + if (maria_update(file,read_record,record2)) + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0) + { + printf("error: %d; can't update:\nFrom: \"%s\"\nTo: \"%s\"\n", + my_errno,read_record,record2); + goto err; + } + if (verbose) + printf("Double key when tried to update:\nFrom: \"%s\"\nTo: \"%s\"\n",record,record2); + } + else + { + key1[atoi(read_record+keyinfo[0].seg[0].start)]--; + key3[atoi(read_record+keyinfo[2].seg[0].start)]=0; + key1[n1]++; key3[n3]=1; + 
update++; + } + } + } + if (testflag == 4) + goto end; + + for (i=999, dupp_keys=j=0 ; i>0 ; i--) + { + if (key1[i] > dupp_keys) + { + dupp_keys=key1[i]; j=i; + } + } + sprintf(key,"%6d",j); + start=keyinfo[0].seg[0].start; + length=keyinfo[0].seg[0].length; + if (dupp_keys) + { + if (!silent) + printf("- Same key: first - next -> last - prev -> first\n"); + DBUG_PRINT("progpos",("first - next -> last - prev -> first")); + if (verbose) printf(" Using key: \"%s\" Keys: %d\n",key,dupp_keys); + + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + if (maria_rsame(file,read_record2,-1)) + goto err; + if (memcmp(read_record,read_record2,reclength) != 0) + { + printf("maria_rsame didn't find same record\n"); + goto err; + } + info.recpos=maria_position(file); + if (maria_rfirst(file,read_record2,0) || + maria_rsame_with_pos(file,read_record2,0,info.recpos) || + memcmp(read_record,read_record2,reclength) != 0) + { + printf("maria_rsame_with_pos didn't find same record\n"); + goto err; + } + { + info.recpos= maria_position(file); + int skr=maria_rnext(file,read_record2,0); + if ((skr && my_errno != HA_ERR_END_OF_FILE) || + maria_rprev(file,read_record2,-1) || + memcmp(read_record,read_record2,reclength) != 0 || + info.recpos != maria_position(file)) + { + printf("maria_rsame_with_pos lost position\n"); + goto err; + } + } + ant=1; + while (maria_rnext(file,read_record2,0) == 0 && + memcmp(read_record2+start,key,length) == 0) ant++; + if (ant != dupp_keys) + { + printf("next: Found: %d keys of %d\n",ant,dupp_keys); + goto err; + } + ant=0; + while (maria_rprev(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys) + { + printf("prev: Found: %d records of %d\n",ant,dupp_keys); + goto err; + } + + /* Check of maria_rnext_same */ + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + ant=1; + while (!maria_rnext_same(file,read_record3) && ant < dupp_keys+10) + 
ant++; + if (ant != dupp_keys || my_errno != HA_ERR_END_OF_FILE) + { + printf("maria_rnext_same: Found: %d records of %d\n",ant,dupp_keys); + goto err; + } + } + + if (!silent) + printf("- All keys: first - next -> last - prev -> first\n"); + DBUG_PRINT("progpos",("All keys: first - next -> last - prev -> first")); + ant=1; + if (maria_rfirst(file,read_record,0)) + { + printf("Can't find first record\n"); + goto err; + } + while ((error=maria_rnext(file,read_record3,0)) == 0 && ant < write_count+10) + ant++; + if (ant != write_count - opt_delete || error != HA_ERR_END_OF_FILE) + { + printf("next: I found: %d records of %d (error: %d)\n", + ant, write_count - opt_delete, error); + goto err; + } + if (maria_rlast(file,read_record2,0) || + bcmp(read_record2,read_record3,reclength)) + { + printf("Can't find last record\n"); + DBUG_DUMP("record2",(uchar*) read_record2,reclength); + DBUG_DUMP("record3",(uchar*) read_record3,reclength); + goto err; + } + ant=1; + while (maria_rprev(file,read_record3,0) == 0 && ant < write_count+10) + ant++; + if (ant != write_count - opt_delete) + { + printf("prev: I found: %d records of %d\n",ant,write_count); + goto err; + } + if (bcmp(read_record,read_record3,reclength)) + { + printf("Can't find first record\n"); + goto err; + } + + if (!silent) + printf("- Test if: Read first - next - prev - prev - next == first\n"); + DBUG_PRINT("progpos",("- Read first - next - prev - prev - next == first")); + if (maria_rfirst(file,read_record,0) || + maria_rnext(file,read_record3,0) || + maria_rprev(file,read_record3,0) || + maria_rprev(file,read_record3,0) == 0 || + maria_rnext(file,read_record3,0)) + goto err; + if (bcmp(read_record,read_record3,reclength) != 0) + printf("Can't find first record\n"); + + if (!silent) + printf("- Test if: Read last - prev - next - next - prev == last\n"); + DBUG_PRINT("progpos",("Read last - prev - next - next - prev == last")); + if (maria_rlast(file,read_record2,0) || + maria_rprev(file,read_record3,0) || + 
maria_rnext(file,read_record3,0) || + maria_rnext(file,read_record3,0) == 0 || + maria_rprev(file,read_record3,0)) + goto err; + if (bcmp(read_record2,read_record3,reclength)) + printf("Can't find last record\n"); +#ifdef NOT_ANYMORE + if (!silent) + puts("- Test read key-part"); + strmov(key2,key); + for(i=strlen(key2) ; i-- > 1 ;) + { + key2[i]=0; + + /* The following row is just to catch some bugs in the key code */ + bzero((char*) file->lastkey,file->s->base.max_key_length*2); + if (maria_rkey(file,read_record,0,key2,(uint) i,HA_READ_PREFIX)) + goto err; + if (bcmp(read_record+start,key,(uint) i)) + { + puts("Didn't find right record"); + goto err; + } + } +#endif + if (dupp_keys > 2) + { + if (!silent) + printf("- Read key (first) - next - delete - next -> last\n"); + DBUG_PRINT("progpos",("first - next - delete - next -> last")); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + if (maria_rnext(file,read_record3,0)) goto err; + if (maria_delete(file,read_record3)) goto err; + opt_delete++; + ant=1; + while (maria_rnext(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-1) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-1); + goto err; + } + } + if (dupp_keys>4) + { + if (!silent) + printf("- Read last of key - prev - delete - prev -> first\n"); + DBUG_PRINT("progpos",("last - prev - delete - prev -> first")); + if (maria_rprev(file,read_record3,0)) goto err; + if (maria_rprev(file,read_record3,0)) goto err; + if (maria_delete(file,read_record3)) goto err; + opt_delete++; + ant=1; + while (maria_rprev(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-2) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-2); + goto err; + } + } + if (dupp_keys > 6) + { + if (!silent) + printf("- Read first - delete - next -> last\n"); + DBUG_PRINT("progpos",("first - delete - next -> last")); + if 
(maria_rkey(file,read_record3,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + if (maria_delete(file,read_record3)) goto err; + opt_delete++; + ant=1; + if (maria_rnext(file,read_record,0)) + goto err; /* Skall finnas poster */ + while (maria_rnext(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-3) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-3); + goto err; + } + + if (!silent) + printf("- Read last - delete - prev -> first\n"); + DBUG_PRINT("progpos",("last - delete - prev -> first")); + if (maria_rprev(file,read_record3,0)) goto err; + if (maria_delete(file,read_record3)) goto err; + opt_delete++; + ant=0; + while (maria_rprev(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-4) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-4); + goto err; + } + } + + if (!silent) + puts("- Test if: Read rrnd - same"); + DBUG_PRINT("progpos",("Read rrnd - same")); + assert(maria_scan_init(file) == 0); + for (i=0 ; i < write_count ; i++) + { + int tmp; + if ((tmp= maria_scan(file,read_record)) && + tmp != HA_ERR_END_OF_FILE && + tmp != HA_ERR_RECORD_DELETED) + { + printf("Got error %d when scanning table\n", tmp); + break; + } + } + maria_scan_end(file); + if (i != write_count && i != write_count - opt_delete) + { + printf("Found wrong number of rows while scanning table\n"); + goto err; + } + + bmove(read_record2,read_record,reclength); + for (i=min(2,keys) ; i-- > 0 ;) + { + if (maria_rsame(file,read_record2,(int) i)) goto err; + if (bcmp(read_record,read_record2,reclength) != 0) + { + printf("maria_rsame didn't find same record\n"); + goto err; + } + } + if (!silent) + puts("- Test maria_records_in_range"); + maria_status(file,&info,HA_STATUS_VARIABLE); + for (i=0 ; i < info.keys ; i++) + { + key_range min_key, max_key; + if (maria_rfirst(file,read_record,(int) i) || + maria_rlast(file,read_record2,(int) i)) + goto err; + 
copy_key(file,(uint) i,(uchar*) read_record,(uchar*) key); + copy_key(file,(uint) i,(uchar*) read_record2,(uchar*) key2); + min_key.key= key; + min_key.keypart_map= HA_WHOLE_KEY; + min_key.flag= HA_READ_KEY_EXACT; + max_key.key= key2; + max_key.keypart_map= HA_WHOLE_KEY; + max_key.flag= HA_READ_AFTER_KEY; + + range_records= maria_records_in_range(file,(int) i, &min_key, &max_key); + if (range_records < info.records*8/10 || + range_records > info.records*12/10) + { + printf("maria_records_range returned %ld; Should be about %ld\n", + (long) range_records,(long) info.records); + goto err; + } + if (verbose) + { + printf("maria_records_range returned %ld; Exact is %ld (diff: %4.2g %%)\n", + (long) range_records, (long) info.records, + labs((long) range_records - (long) info.records)*100.0/ + info.records); + } + } + for (i=0 ; i < 5 ; i++) + { + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + for (k=rnd(1000)+1 ; k>0 && key1[k] == 0 ; k--) ; + if (j != 0 && k != 0) + { + key_range min_key, max_key; + if (j > k) + swap_variables(int, j, k); + sprintf(key,"%6d",j); + sprintf(key2,"%6d",k); + + min_key.key= key; + min_key.keypart_map= HA_WHOLE_KEY; + min_key.flag= HA_READ_AFTER_KEY; + max_key.key= key2; + max_key.keypart_map= HA_WHOLE_KEY; + max_key.flag= HA_READ_BEFORE_KEY; + range_records= maria_records_in_range(file, 0, &min_key, &max_key); + records=0; + for (j++ ; j < k ; j++) + records+=key1[j]; + if ((long) range_records < (long) records*7/10-2 || + (long) range_records > (long) records*14/10+2) + { + printf("maria_records_range for key: %d returned %lu; Should be about %lu\n", + i, (ulong) range_records, (ulong) records); + goto err; + } + if (verbose && records) + { + printf("maria_records_range returned %lu; Exact is %lu (diff: %4.2g %%)\n", + (ulong) range_records, (ulong) records, + labs((long) range_records-(long) records)*100.0/records); + + } + } + } + + if (!silent) + printf("- maria_info\n"); + maria_status(file,&info,HA_STATUS_VARIABLE | 
HA_STATUS_CONST); + if (info.records != write_count-opt_delete || info.deleted > opt_delete + update + || info.keys != keys) + { + puts("Wrong info from maria_info"); + printf("Got: records: %lu delete: %lu i_keys: %d\n", + (ulong) info.records, (ulong) info.deleted, info.keys); + goto err; + } + if (verbose) + { + char buff[80]; + get_date(buff,3,info.create_time); + printf("info: Created %s\n",buff); + get_date(buff,3,info.check_time); + printf("info: checked %s\n",buff); + get_date(buff,3,info.update_time); + printf("info: Modified %s\n",buff); + } + + maria_panic(HA_PANIC_WRITE); + maria_panic(HA_PANIC_READ); + if (maria_is_changed(file)) + puts("Warning: maria_is_changed reported that datafile was changed"); + + if (!silent) + printf("- maria_extra(CACHE) + maria_rrnd.... + maria_extra(NO_CACHE)\n"); + if (maria_reset(file) || maria_extra(file,HA_EXTRA_CACHE,0)) + { + if (locking || (!use_blob && !pack_fields)) + { + puts("got error from maria_extra(HA_EXTRA_CACHE)"); + goto err; + } + } + ant=0; + assert(maria_scan_init(file) == 0); + while ((error= maria_scan(file,record)) != HA_ERR_END_OF_FILE && + ant < write_count + 10) + ant+= error ? 0 : 1; + maria_scan_end(file); + if (ant != write_count-opt_delete) + { + printf("scan with cache: I can only find: %d records of %d\n", + ant,write_count-opt_delete); + goto err; + } + if (maria_extra(file,HA_EXTRA_NO_CACHE,0)) + { + puts("got error from maria_extra(HA_EXTRA_NO_CACHE)"); + goto err; + } + + ant=0; + maria_scan_init(file); + while ((error=maria_scan(file,record)) != HA_ERR_END_OF_FILE && + ant < write_count + 10) + ant+= error ? 
0 : 1; + if (ant != write_count-opt_delete) + { + printf("scan with cache: I can only find: %d records of %d\n", + ant,write_count-opt_delete); + goto err; + } + + if (testflag == 5) + goto end; + + if (!silent) + printf("- Removing keys\n"); + DBUG_PRINT("progpos",("Removing keys")); + lastpos = HA_OFFSET_ERROR; + /* DBUG_POP(); */ + maria_reset(file); + found_parts=0; + maria_scan_init(file); + while ((error= maria_scan(file,read_record)) != HA_ERR_END_OF_FILE) + { + info.recpos=maria_position(file); + if (lastpos >= info.recpos && lastpos != HA_OFFSET_ERROR) + { + printf("maria_rrnd didn't advance filepointer; old: %ld, new: %ld\n", + (long) lastpos, (long) info.recpos); + goto err; + } + lastpos=info.recpos; + if (error == 0) + { + if (opt_delete == (uint) remove_count) /* While testing */ + goto end; + if (rnd(2) == 1 && maria_rsame(file,read_record,-1)) + { + printf("can't find record %lx\n",(long) info.recpos); + goto err; + } + if (use_blob) + { + ulong blob_length,pos; + uchar *ptr; + memcpy_fixed(&ptr, read_record+blob_pos+4, sizeof(ptr)); + longget(blob_length,read_record+blob_pos); + for (pos=0 ; pos < blob_length ; pos++) + { + if (ptr[pos] != (uchar) (blob_length+pos)) + { + printf("Found blob with wrong info at %ld\n",(long) lastpos); + maria_scan_end(file); + my_errno= 0; + goto err; + } + } + } + if (maria_delete(file,read_record)) + { + printf("can't delete record: %6.6s, delete_count: %d\n", + read_record, opt_delete); + maria_scan_end(file); + goto err; + } + opt_delete++; +#if 0 + / + /* + 179 is ok, 180 causes a difference between runtime and log-applying. + This is now fixed (we zero the last directory entry during + log-applying, just to eliminate this irrelevant difference). 
+ */ + if (opt_delete==180) goto end; +#endif + } + else + found_parts++; + } + maria_scan_end(file); + if (my_errno != HA_ERR_END_OF_FILE && my_errno != HA_ERR_RECORD_DELETED) + printf("error: %d from maria_rrnd\n",my_errno); + if (write_count != opt_delete) + { + printf("Deleted only %d of %d records (%d parts)\n",opt_delete,write_count, + found_parts); + goto err; + } +end: + if (die_in_middle_of_transaction) + { + /* As commit record is not done, UNDO entries needs to be rolled back */ + switch (die_in_middle_of_transaction) { + case 1: + /* + Flush changed pages go to disk. That will also flush log. Recovery + will skip REDOs and apply UNDOs. + */ + _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE, + FLUSH_RELEASE); + break; + case 2: + /* + Just flush log. Pages are likely to not be on disk. Recovery will + then execute REDOs and UNDOs. + */ + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + case 3: + /* + Flush nothing. Pages and log are likely to not be on disk. Recovery + will then do nothing. 
+ */ + break; + } + printf("Dying on request without maria_commit()/maria_close()\n"); + exit(0); + } + if (maria_commit(file)) + goto err; + if (maria_close(file)) + { + file= 0; + goto err; + } + file= 0; + maria_panic(HA_PANIC_CLOSE); /* Should close log */ + if (!silent) + { + printf("\nFollowing test have been made:\n"); + printf("Write records: %d\nUpdate records: %d\nSame-key-read: %d\nDelete records: %d\n", write_count,update,dupp_keys,opt_delete); + if (rec_pointer_size) + printf("Record pointer size: %d\n",rec_pointer_size); + printf("maria_block_size: %lu\n", maria_block_size); + if (write_cacheing) + puts("Key cache resized"); + if (write_cacheing) + puts("Write cacheing used"); + if (write_cacheing) + puts("quick mode"); + if (async_io && locking) + puts("Asyncron io with locking used"); + else if (locking) + puts("Locking used"); + if (use_blob) + puts("blobs used"); + printf("key cache status: \n\ +blocks used:%10lu\n\ +not flushed:%10lu\n\ +w_requests: %10lu\n\ +writes: %10lu\n\ +r_requests: %10lu\n\ +reads: %10lu\n", + maria_pagecache->blocks_used, + maria_pagecache->global_blocks_changed, + (ulong) maria_pagecache->global_cache_w_requests, + (ulong) maria_pagecache->global_cache_write, + (ulong) maria_pagecache->global_cache_r_requests, + (ulong) maria_pagecache->global_cache_read); + } + end_pagecache(maria_pagecache,1); + my_free(blob_buffer, MYF(MY_ALLOW_ZERO_PTR)); + my_end(silent ? 
MY_CHECK_ERROR : MY_CHECK_ERROR | MY_GIVE_INFO); + return(0); +err: + printf("got error: %d when using MARIA-database\n",my_errno); + if (file) + { + if (maria_commit(file)) + goto err; + VOID(maria_close(file)); + } + maria_end(); + return(1); +} /* main */ + + +/* Read options */ + +static void get_options(int argc, char **argv) +{ + char *pos,*progname; + + progname= argv[0]; + + while (--argc >0 && *(pos = *(++argv)) == '-' ) { + switch(*++pos) { + case 'B': + pack_type= HA_BINARY_PACK_KEY; + break; + case 'b': + use_blob= 1; + if (*++pos) + use_blob= atol(pos); + break; + case 'K': /* Use key cacheing */ + pagecacheing=1; + if (*++pos) + pagecache_size=atol(pos); + break; + case 'W': /* Use write cacheing */ + write_cacheing=1; + if (*++pos) + my_default_record_cache_size=atoi(pos); + break; + case 'd': + remove_count= atoi(++pos); + break; + case 'i': + if (*++pos) + srand(srand_arg= atoi(pos)); + break; + case 'L': + locking=1; + break; + case 'A': /* use asyncron io */ + async_io=1; + if (*++pos) + my_default_record_cache_size=atoi(pos); + break; + case 'v': /* verbose */ + verbose=1; + break; + case 'm': /* records */ + if ((recant=atoi(++pos)) < 10 && testflag > 2) + { + fprintf(stderr,"record count must be >= 10 (if testflag > 2)\n"); + exit(1); + } + break; + case 'e': /* maria_block_length */ + case 'E': + if ((maria_block_size= atoi(++pos)) < MARIA_MIN_KEY_BLOCK_LENGTH || + maria_block_size > MARIA_MAX_KEY_BLOCK_LENGTH) + { + fprintf(stderr,"Wrong maria_block_length\n"); + exit(1); + } + maria_block_size= my_round_up_to_next_power(maria_block_size); + break; + case 'f': + if ((first_key=atoi(++pos)) < 0 || first_key >= MARIA_KEYS) + first_key=0; + break; + case 'k': + if ((keys=(uint) atoi(++pos)) < 1 || + keys > (uint) (MARIA_KEYS-first_key)) + keys=MARIA_KEYS-first_key; + break; + case 'M': + record_type= BLOCK_RECORD; + break; + case 'P': + pack_type=0; /* Don't use DIFF_LENGTH */ + pack_seg=0; + break; + case 'R': /* Length of record pointer */ + 
rec_pointer_size=atoi(++pos); + if (rec_pointer_size > 7) + rec_pointer_size=0; + break; + case 'S': + pack_fields=0; /* Static-length-records */ + record_type= STATIC_RECORD; + break; + case 's': + silent=1; + break; + case 't': + testflag=atoi(++pos); /* testmod */ + break; + case 'T': + transactional= 1; + break; + case 'u': + die_in_middle_of_transaction= atoi(++pos); + break; + case 'q': + opt_quick_mode=1; + break; + case 'c': + create_flag|= HA_CREATE_CHECKSUM; + break; + case 'D': + create_flag|=HA_CREATE_DELAY_KEY_WRITE; + break; + case 'g': + skip_update= TRUE; + break; + case '?': + case 'I': + case 'V': + printf("%s Ver 1.0 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE); + puts("By Monty, for your professional use\n"); + printf("Usage: %s [-?AbBcDIKLPRqSsTVWltv] [-k#] [-f#] [-m#] [-e#] [-E#] [-t#]\n", + progname); + exit(0); + case '#': + DBUG_PUSH (++pos); + break; + default: + printf("Illegal option: '%c'\n",*pos); + break; + } + } + return; +} /* get options */ + + /* Get a random value 0 <= x <= n */ + +static uint rnd(uint max_value) +{ + return (uint) ((rand() & 32767)/32767.0*max_value); +} /* rnd */ + + + /* Create a variable length record */ + +static void fix_length(uchar *rec, uint length) +{ + bmove(rec+STANDARD_LENGTH, + "0123456789012345678901234567890123456789012345678901234567890", + length-STANDARD_LENGTH); + strfill(rec+length,STANDARD_LENGTH+60-length,' '); +} /* fix_length */ + + + /* Put maybe a blob in record */ + +static void put_blob_in_record(char *blob_pos, char **blob_buffer, + ulong *blob_length) +{ + ulong i,length; + if (use_blob) + { + if (rnd(10) == 0) + { + if (! 
*blob_buffer && + !(*blob_buffer=my_malloc((uint) use_blob,MYF(MY_WME)))) + { + use_blob=0; + return; + } + length=rnd(use_blob); + for (i=0 ; i < length ; i++) + (*blob_buffer)[i]=(char) (length+i); + int4store(blob_pos,length); + memcpy_fixed(blob_pos+4,(char*) blob_buffer,sizeof(char*)); + *blob_length= length; + } + else + { + int4store(blob_pos,0); + *blob_length= 0; + } + } + return; +} + + +static void copy_key(MARIA_HA *info,uint inx,uchar *rec,uchar *key_buff) +{ + HA_KEYSEG *keyseg; + + for (keyseg=info->s->keyinfo[inx].seg ; keyseg->type ; keyseg++) + { + memcpy(key_buff,rec+keyseg->start,(size_t) keyseg->length); + key_buff+=keyseg->length; + } + return; +} diff --git a/storage/maria/ma_test3.c b/storage/maria/ma_test3.c new file mode 100644 index 00000000000..c25dd5dcdc6 --- /dev/null +++ b/storage/maria/ma_test3.c @@ -0,0 +1,500 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Test av locking */ + +#ifndef __NETWARE__ + +#include "maria.h" +#include <sys/types.h> +#ifdef HAVE_SYS_WAIT_H +# include <sys/wait.h> +#endif +#ifndef WEXITSTATUS +# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8) +#endif +#ifndef WIFEXITED +# define WIFEXITED(stat_val) (((stat_val) & 255) == 0) +#endif + + +#if defined(HAVE_LRAND48) +#define rnd(X) (lrand48() % X) +#define rnd_init(X) srand48(X) +#else +#define rnd(X) (random() % X) +#define rnd_init(X) srandom(X) +#endif + + +const char *filename= "test3"; +uint tests=10,forks=10,pagecacheing=0; + +static void get_options(int argc, char *argv[]); +void start_test(int id); +int test_read(MARIA_HA *,int),test_write(MARIA_HA *,int,int), + test_update(MARIA_HA *,int,int),test_rrnd(MARIA_HA *,int); + +struct record { + char id[8]; + char nr[4]; + char text[10]; +} record; + + +int main(int argc,char **argv) +{ + int status,wait_ret; + uint i=0; + MARIA_KEYDEF keyinfo[10]; + MARIA_COLUMNDEF recinfo[10]; + HA_KEYSEG keyseg[10][2]; + MY_INIT(argv[0]); + get_options(argc,argv); + + fprintf(stderr, "WARNING! this program is to test 'external locking'" + " (when several processes share a table through file locking)" + " which is not supported by Maria at all; expect errors." 
+ " We may soon remove this program.\n"); + maria_init(); + bzero((char*) keyinfo,sizeof(keyinfo)); + bzero((char*) recinfo,sizeof(recinfo)); + bzero((char*) keyseg,sizeof(keyseg)); + keyinfo[0].seg= &keyseg[0][0]; + keyinfo[0].seg[0].start=0; + keyinfo[0].seg[0].length=8; + keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[0].seg[0].flag=HA_SPACE_PACK; + keyinfo[0].key_alg=HA_KEY_ALG_BTREE; + keyinfo[0].keysegs=1; + keyinfo[0].flag = (uint8) HA_PACK_KEY; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[1].seg= &keyseg[1][0]; + keyinfo[1].seg[0].start=8; + keyinfo[1].seg[0].length=4; /* Long is always 4 in maria */ + keyinfo[1].seg[0].type=HA_KEYTYPE_LONG_INT; + keyinfo[1].seg[0].flag=0; + keyinfo[1].key_alg=HA_KEY_ALG_BTREE; + keyinfo[1].keysegs=1; + keyinfo[1].flag =HA_NOSAME; + keyinfo[1].block_length= 0; /* Default block length */ + + recinfo[0].type=0; + recinfo[0].length=sizeof(record.id); + recinfo[1].type=0; + recinfo[1].length=sizeof(record.nr); + recinfo[2].type=0; + recinfo[2].length=sizeof(record.text); + + puts("- Creating maria-file"); + my_delete(filename,MYF(0)); /* Remove old locks under gdb */ + if (maria_create(filename,BLOCK_RECORD, 2, &keyinfo[0],2,&recinfo[0],0, + (MARIA_UNIQUEDEF*) 0, (MARIA_CREATE_INFO*) 0,0)) + exit(1); + + rnd_init(0); + printf("- Starting %d processes\n",forks); fflush(stdout); + for (i=0 ; i < forks; i++) + { + if (!fork()) + { + start_test(i+1); + sleep(1); + return 0; + } + VOID(rnd(1)); + } + + for (i=0 ; i < forks ; i++) + while ((wait_ret=wait(&status)) && wait_ret == -1); + maria_end(); + return 0; +} + + +static void get_options(int argc, char **argv) +{ + char *pos,*progname; + + progname= argv[0]; + + while (--argc >0 && *(pos = *(++argv)) == '-' ) { + switch(*++pos) { + case 'f': + forks=atoi(++pos); + break; + case 't': + tests=atoi(++pos); + break; + case 'K': /* Use key cacheing */ + pagecacheing=1; + break; + case 'A': /* All flags */ + pagecacheing=1; + break; + case '?': + case 'I': + 
case 'V': + printf("%s Ver 1.0 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE); + puts("By Monty, for your professional use\n"); + puts("Test av locking with threads\n"); + printf("Usage: %s [-?lKA] [-f#] [-t#]\n",progname); + exit(0); + case '#': + DBUG_PUSH (++pos); + break; + default: + printf("Illegal option: '%c'\n",*pos); + break; + } + } + return; +} + + +void start_test(int id) +{ + uint i; + int error,lock_type; + MARIA_INFO isam_info; + MARIA_HA *file,*file1,*file2=0,*lock; + + if (!(file1=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED)) || + !(file2=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED))) + { + fprintf(stderr,"Can't open isam-file: %s\n",filename); + exit(1); + } + if (pagecacheing && rnd(2) == 0) + init_pagecache(maria_pagecache, 65536L, 0, 0, MARIA_KEY_BLOCK_LENGTH); + printf("Process %d, pid: %d\n",id,getpid()); fflush(stdout); + + for (error=i=0 ; i < tests && !error; i++) + { + file= (rnd(2) == 1) ? file1 : file2; + lock=0 ; lock_type=0; + if (rnd(10) == 0) + { + if (maria_lock_database(lock=(rnd(2) ? file1 : file2), + lock_type=(rnd(2) == 0 ? F_RDLCK : F_WRLCK))) + { + fprintf(stderr,"%2d: start: Can't lock table %d\n",id,my_errno); + error=1; + break; + } + } + switch (rnd(4)) { + case 0: error=test_read(file,id); break; + case 1: error=test_rrnd(file,id); break; + case 2: error=test_write(file,id,lock_type); break; + case 3: error=test_update(file,id,lock_type); break; + } + if (lock) + maria_lock_database(lock,F_UNLCK); + } + if (!error) + { + maria_status(file1,&isam_info,HA_STATUS_VARIABLE); + printf("%2d: End of test. 
Records: %ld Deleted: %ld\n", + id,(long) isam_info.records, (long) isam_info.deleted); + fflush(stdout); + } + + maria_close(file1); + maria_close(file2); + if (error) + { + printf("%2d: Aborted\n",id); fflush(stdout); + exit(1); + } +} + + +int test_read(MARIA_HA *file,int id) +{ + uint i,lock,found,next,prev; + ulong find; + + lock=0; + if (rnd(2) == 0) + { + lock=1; + if (maria_lock_database(file,F_RDLCK)) + { + fprintf(stderr,"%2d: Can't lock table %d\n",id,my_errno); + return 1; + } + } + + found=next=prev=0; + for (i=0 ; i < 100 ; i++) + { + find=rnd(100000); + if (!maria_rkey(file,record.id,1,(uchar*) &find, + HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + found++; + else + { + if (my_errno != HA_ERR_KEY_NOT_FOUND) + { + fprintf(stderr,"%2d: Got error %d from read in read\n",id,my_errno); + return 1; + } + else if (!maria_rnext(file,record.id,1)) + next++; + else + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in read\n",id,my_errno); + return 1; + } + else if (!maria_rprev(file,record.id,1)) + prev++; + else + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in read\n", + id,my_errno); + return 1; + } + } + } + } + } + if (lock) + { + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"%2d: Can't unlock table\n",id); + return 1; + } + } + printf("%2d: read: found: %5d next: %5d prev: %5d\n", + id,found,next,prev); + fflush(stdout); + return 0; +} + + +int test_rrnd(MARIA_HA *file,int id) +{ + uint count,lock; + + lock=0; + if (rnd(2) == 0) + { + lock=1; + if (maria_lock_database(file,F_RDLCK)) + { + fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno); + maria_close(file); + return 1; + } + if (rnd(2) == 0) + maria_extra(file,HA_EXTRA_CACHE,0); + } + + count=0; + if (maria_rrnd(file,record.id,0L)) + { + if (my_errno == HA_ERR_END_OF_FILE) + goto end; + fprintf(stderr,"%2d: Can't read first record (%d)\n",id,my_errno); + return 1; + } + for (count=1 ; 
!maria_rrnd(file,record.id,HA_OFFSET_ERROR) ;count++) ; + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rrnd\n",id,my_errno); + return 1; + } + +end: + if (lock) + { + maria_extra(file,HA_EXTRA_NO_CACHE,0); + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"%2d: Can't unlock table\n",id); + exit(0); + } + } + printf("%2d: rrnd: %5d\n",id,count); fflush(stdout); + return 0; +} + + +int test_write(MARIA_HA *file,int id,int lock_type) +{ + uint i,tries,count,lock; + + lock=0; + if (rnd(2) == 0 || lock_type == F_RDLCK) + { + lock=1; + if (maria_lock_database(file,F_WRLCK)) + { + if (lock_type == F_RDLCK && my_errno == EDEADLK) + { + printf("%2d: write: deadlock\n",id); fflush(stdout); + return 0; + } + fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno); + maria_close(file); + return 1; + } + if (rnd(2) == 0) + maria_extra(file,HA_EXTRA_WRITE_CACHE,0); + } + + sprintf(record.id,"%7d",getpid()); + strnmov(record.text,"Testing...", sizeof(record.text)); + + tries=(uint) rnd(100)+10; + for (i=count=0 ; i < tries ; i++) + { + uint32 tmp=rnd(80000)+20000; + int4store(record.nr,tmp); + if (!maria_write(file,record.id)) + count++; + else + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY) + { + fprintf(stderr,"%2d: Got error %d (errno %d) from write\n",id,my_errno, + errno); + return 1; + } + } + } + if (lock) + { + maria_extra(file,HA_EXTRA_NO_CACHE,0); + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"%2d: Can't unlock table\n",id); + exit(0); + } + } + printf("%2d: write: %5d\n",id,count); fflush(stdout); + return 0; +} + + +int test_update(MARIA_HA *file,int id,int lock_type) +{ + uint i,lock,found,next,prev,update; + uint32 tmp; + char find[4]; + struct record new_record; + + lock=0; + if (rnd(2) == 0 || lock_type == F_RDLCK) + { + lock=1; + if (maria_lock_database(file,F_WRLCK)) + { + if (lock_type == F_RDLCK && my_errno == EDEADLK) + { + printf("%2d: write: deadlock\n",id); fflush(stdout); + return 0; + } + 
fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno); + return 1; + } + } + bzero((char*) &new_record,sizeof(new_record)); + strmov(new_record.text,"Updated"); + + found=next=prev=update=0; + for (i=0 ; i < 100 ; i++) + { + tmp=rnd(100000); + int4store(find,tmp); + if (!maria_rkey(file,record.id,1,(uchar*) find, + HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + found++; + else + { + if (my_errno != HA_ERR_KEY_NOT_FOUND) + { + fprintf(stderr,"%2d: Got error %d from read in update\n",id,my_errno); + return 1; + } + else if (!maria_rnext(file,record.id,1)) + next++; + else + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in update\n", + id,my_errno); + return 1; + } + else if (!maria_rprev(file,record.id,1)) + prev++; + else + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in update\n", + id,my_errno); + return 1; + } + continue; + } + } + } + memcpy_fixed(new_record.id,record.id,sizeof(record.id)); + tmp=rnd(20000)+40000; + int4store(new_record.nr,tmp); + if (!maria_update(file,record.id,new_record.id)) + update++; + else + { + if (my_errno != HA_ERR_RECORD_CHANGED && + my_errno != HA_ERR_RECORD_DELETED && + my_errno != HA_ERR_FOUND_DUPP_KEY) + { + fprintf(stderr,"%2d: Got error %d from update\n",id,my_errno); + return 1; + } + } + } + if (lock) + { + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"Can't unlock table,id, error%d\n",my_errno); + return 1; + } + } + printf("%2d: update: %5d\n",id,update); fflush(stdout); + return 0; +} + +#else /* __NETWARE__ */ + +#include <stdio.h> + +main() +{ + fprintf(stderr,"this test has not been ported to NetWare\n"); + return 0; +} + +#endif /* __NETWARE__ */ diff --git a/storage/maria/ma_test_all.res b/storage/maria/ma_test_all.res new file mode 100644 index 00000000000..57b0feeeae8 --- /dev/null +++ b/storage/maria/ma_test_all.res @@ -0,0 +1,62 @@ +Running tests with dynamic row format +Running tests with static row format +Running 
tests with block row format +ma_test2 -s -L -K -R1 -m2000 ; Should give error 135 +Error: 135 in write at record: 1099 +got error: 135 when using MARIA-database +./maria_chk -sm test2 will warn that 'Datafile is almost full' +maria_chk: MARIA file test2 +maria_chk: warning: Datafile is almost full, 65516 of 65534 used +MARIA-table 'test2' is usable but should be fixed + +real 0m0.808s +user 0m0.584s +sys 0m0.212s + +real 0m0.780s +user 0m0.584s +sys 0m0.176s + +real 0m0.809s +user 0m0.616s +sys 0m0.180s + +real 0m1.356s +user 0m1.140s +sys 0m0.188s + +real 0m0.783s +user 0m0.600s +sys 0m0.176s + +real 0m1.390s +user 0m1.184s +sys 0m0.152s + +real 0m1.875s +user 0m1.632s +sys 0m0.244s + +real 0m1.313s +user 0m1.148s +sys 0m0.160s + +real 0m1.846s +user 0m1.644s +sys 0m0.188s + +real 0m1.875s +user 0m1.632s +sys 0m0.212s + +real 0m1.819s +user 0m1.672s +sys 0m0.124s + +real 0m2.117s +user 0m1.816s +sys 0m0.292s + +real 0m1.871s +user 0m1.636s +sys 0m0.196s diff --git a/storage/maria/ma_test_all.sh b/storage/maria/ma_test_all.sh new file mode 100755 index 00000000000..108dffd7df7 --- /dev/null +++ b/storage/maria/ma_test_all.sh @@ -0,0 +1,245 @@ +#!/bin/sh +# +# Execute some simple basic test on MyISAM libary to check if things +# works at all. + +# If you want to run this in Valgrind, you should use --trace-children=yes, +# so that it detects problems in ma_test* and not in the shell script + +# Running in a "shared memory" disk is 10 times faster; you can do +# mkdir /dev/shm/test; cd /dev/shm/test; maria_path=<path_to_maria_binaries> + +# Remove # from following line if you need some more information +#set -x -v -e + +set -e # abort at first failure + +valgrind="valgrind --alignment=8 --leak-check=yes" +silent="-s" +suffix="" +if [ -z "$maria_path" ] +then + maria_path="." 
+fi + +# Delete temporary files +rm -f *.TMD +rm -f maria_log* + +run_tests() +{ + row_type=$1 + # + # First some simple tests + # + $maria_path/ma_test1$suffix $silent $row_type + $maria_path/maria_chk$suffix -se test1 + $maria_path/ma_test1$suffix $silent -N $row_type + $maria_path/maria_chk$suffix -se test1 + $maria_path/ma_test1$suffix $silent -P --checksum $row_type + $maria_path/maria_chk$suffix -se test1 + $maria_path/ma_test1$suffix $silent -P -N $row_type + $maria_path/maria_chk$suffix -se test1 + $maria_path/ma_test1$suffix $silent -B -N -R2 $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -k 480 --unique $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -N -R1 $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -p $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -p -N --unique $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -p -N --key_length=127 --checksum $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -p -N --key_length=128 $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -p --key_length=480 $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -B $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -B --key_length=64 --unique $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -B -k 480 --checksum $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -B -k 480 -N --unique --checksum $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -m $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -m -P --unique --checksum $row_type + 
$maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -m -P --key_length=480 --key_cache $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -m -p $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -w --unique $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -w --key_length=64 --checksum $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -w -N --key_length=480 $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -w --key_length=480 --checksum $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -b -N $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -a -b --key_length=480 $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent -p -B --key_length=480 $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent --checksum --unique $row_type + $maria_path/maria_chk$suffix -se test1 + $maria_path/ma_test1$suffix $silent --unique $row_type + $maria_path/maria_chk$suffix -se test1 + + $maria_path/ma_test1$suffix $silent --key_multiple -N -S $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent --key_multiple -a -p --key_length=480 $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent --key_multiple -a -B --key_length=480 $row_type + $maria_path/maria_chk$suffix -sm test1 + $maria_path/ma_test1$suffix $silent --key_multiple -P -S $row_type + $maria_path/maria_chk$suffix -sm test1 + + $maria_path/maria_pack$suffix --force -s test1 + $maria_path/maria_chk$suffix -ess test1 + + $maria_path/ma_test2$suffix $silent -L -K -W -P $row_type + $maria_path/maria_chk$suffix -sm test2 + $maria_path/ma_test2$suffix $silent -L -K -W -P -A $row_type + 
$maria_path/maria_chk$suffix -sm test2 + $maria_path/ma_test2$suffix $silent -L -K -P -R3 -m50 -b1000000 $row_type + $maria_path/maria_chk$suffix -sm test2 + $maria_path/ma_test2$suffix $silent -L -B $row_type + $maria_path/maria_chk$suffix -sm test2 + $maria_path/ma_test2$suffix $silent -D -B -c $row_type + $maria_path/maria_chk$suffix -sm test2 + $maria_path/ma_test2$suffix $silent -m10000 -e4096 -K $row_type + $maria_path/maria_chk$suffix -sm test2 + $maria_path/ma_test2$suffix $silent -m10000 -e8192 -K $row_type + $maria_path/maria_chk$suffix -sm test2 + $maria_path/ma_test2$suffix $silent -m10000 -e16384 -E16384 -K -L $row_type + $maria_path/maria_chk$suffix -sm test2 +} + +run_repair_tests() +{ + row_type=$1 + $maria_path/ma_test1$suffix $silent --checksum $row_type + $maria_path/maria_chk$suffix -se test1 + $maria_path/maria_chk$suffix -rs test1 + $maria_path/maria_chk$suffix -se test1 + $maria_path/maria_chk$suffix -rqs test1 + $maria_path/maria_chk$suffix -se test1 + $maria_path/maria_chk$suffix -rs --correct-checksum test1 + $maria_path/maria_chk$suffix -se test1 + $maria_path/maria_chk$suffix -rqs --correct-checksum test1 + $maria_path/maria_chk$suffix -se test1 + $maria_path/maria_chk$suffix -ros --correct-checksum test1 + $maria_path/maria_chk$suffix -se test1 + $maria_path/maria_chk$suffix -rqos --correct-checksum test1 + $maria_path/maria_chk$suffix -se test1 + $maria_path/ma_test2$suffix $silent -c -d1 $row_type + $maria_path/maria_chk$suffix -s --parallel-recover test2 + $maria_path/maria_chk$suffix -se test2 + $maria_path/maria_chk$suffix -s --parallel-recover --quick test2 + $maria_path/maria_chk$suffix -se test2 + $maria_path/ma_test2$suffix $silent -c $row_type + $maria_path/maria_chk$suffix -se test2 + $maria_path/maria_chk$suffix -sr test2 + $maria_path/maria_chk$suffix -se test2 +} + +run_pack_tests() +{ + row_type=$1 + # check of maria_pack / maria_chk + $maria_path/ma_test1$suffix $silent --checksum $row_type + 
$maria_path/maria_pack$suffix --force -s test1 + $maria_path/maria_chk$suffix -ess test1 + $maria_path/maria_chk$suffix -rqs test1 + $maria_path/maria_chk$suffix -es test1 + $maria_path/maria_chk$suffix -rs test1 + $maria_path/maria_chk$suffix -es test1 + $maria_path/maria_chk$suffix -rus test1 + $maria_path/maria_chk$suffix -es test1 + + $maria_path/ma_test1$suffix $silent --checksum -S $row_type + $maria_path/maria_chk$suffix -se test1 + $maria_path/maria_chk$suffix -ros test1 + $maria_path/maria_chk$suffix -rqs test1 + $maria_path/maria_chk$suffix -se test1 + + $maria_path/maria_pack$suffix --force -s test1 + $maria_path/maria_chk$suffix -rqs test1 + $maria_path/maria_chk$suffix -es test1 + $maria_path/maria_chk$suffix -rus test1 + $maria_path/maria_chk$suffix -es test1 + + $maria_path/ma_test2$suffix $silent -c -d1 $row_type + $maria_path/maria_chk$suffix -s --parallel-recover test2 + $maria_path/maria_chk$suffix -se test2 + $maria_path/maria_chk$suffix -s --parallel-recover --unpack test2 + $maria_path/maria_chk$suffix -se test2 + $maria_path/maria_pack$suffix --force -s test1 + $maria_path/maria_chk$suffix -s --parallel-recover --unpack test2 + $maria_path/maria_chk$suffix -se test2 +} + +echo "Running tests with dynamic row format" +run_tests "" +run_repair_tests "" +run_pack_tests "" + +echo "Running tests with static row format" +run_tests -S +run_repair_tests -S +run_pack_tests -S + +echo "Running tests with block row format" +run_tests -M +run_repair_tests -M +run_pack_tests -M + +echo "Running tests with block row format and transactions" +run_tests "-M -T" +run_repair_tests "-M -T" +run_pack_tests "-M -T" + +# +# Tests that gives warnings or errors +# + +$maria_path/ma_test2$suffix $silent -L -K -W -P -S -R1 -m500 +$maria_path/maria_chk$suffix -sm test2 +echo "ma_test2$suffix $silent -L -K -R1 -m2000 ; Should give error 135" +$maria_path/ma_test2$suffix $silent -L -K -R1 -m2000 >ma_test2_message.txt 2>&1 && false # success is failure +cat 
ma_test2_message.txt +grep "Error: 135" ma_test2_message.txt > /dev/null +echo "$maria_path/maria_chk$suffix -sm test2 will warn that 'Datafile is almost full'" +$maria_path/maria_chk$suffix -sm test2 >ma_test2_message.txt 2>&1 +cat ma_test2_message.txt +grep "warning: Datafile is almost full" ma_test2_message.txt >/dev/null +rm -f ma_test2_message.txt +$maria_path/maria_chk$suffix -ssm test2 + +# +# Test that removing tables and applying the log leads to identical tables +# +/bin/sh $maria_path/ma_test_recovery + +# +# Some timing tests +# +time $maria_path/ma_test2$suffix $silent +time $maria_path/ma_test2$suffix $silent -S +time $maria_path/ma_test2$suffix $silent -M +time $maria_path/ma_test2$suffix $silent -B +time $maria_path/ma_test2$suffix $silent -L +time $maria_path/ma_test2$suffix $silent -K +time $maria_path/ma_test2$suffix $silent -K -B +time $maria_path/ma_test2$suffix $silent -L -B +time $maria_path/ma_test2$suffix $silent -L -K -B +time $maria_path/ma_test2$suffix $silent -L -K -W -B +time $maria_path/ma_test2$suffix $silent -L -K -W -B -S +time $maria_path/ma_test2$suffix $silent -L -K -W -B -M +time $maria_path/ma_test2$suffix $silent -D -K -W -B -S diff --git a/storage/maria/ma_test_recovery b/storage/maria/ma_test_recovery new file mode 100755 index 00000000000..7c45af1e206 --- /dev/null +++ b/storage/maria/ma_test_recovery @@ -0,0 +1,210 @@ +#!/bin/sh + +set -e +silent="-s" +if [ -z "$maria_path" ] +then + maria_path="." +fi + +# test data is always put in the current directory or a tmp subdirectory of it +tmp="./tmp" + +if test '!' 
-d $tmp +then + mkdir $tmp +fi + +echo "MARIA RECOVERY TESTS" + +check_table_is_same() +{ + # Computes checksum of new table and compares to checksum of old table + # Shows any difference in table's state (info from the index's header) + + $maria_path/maria_chk -dvv $table | grep -v "Creation time:" > $tmp/maria_chk_message.txt 2>&1 + + # save the index file (because we want to test idempotency afterwards) + cp $table.MAI tmp/ + # In the repair below it's good to use -q because it will die on any + # incorrectness of the data file if UNDO was badly applied. + # QQ: Remove the following line when we also can recover the index file + $maria_path/maria_chk -s -rq $table + + $maria_path/maria_chk -s -e $table + checksum2=`$maria_path/maria_chk -dss $table` + if test "$checksum" != "$checksum2" + then + echo "checksum differs for $table before and after recovery" + return 1; + fi + + diff $tmp/maria_chk_message.good.txt $tmp/maria_chk_message.txt > $tmp/maria_chk_diff.txt || true + if [ -s $tmp/maria_chk_diff.txt ] + then + echo "Differences in maria_chk -dvv, recovery not yet perfect !" + echo "========DIFF START=======" + cat $tmp/maria_chk_diff.txt + echo "========DIFF END=======" + fi + mv tmp/$table.MAI . 
+} + +apply_log() +{ + # applies log, can verify if applying did write to log or not + + shouldchangelog=$1 + if [ "$shouldchangelog" != "shouldnotchangelog" ] && + [ "$shouldchangelog" != "shouldchangelog" ] && + [ "$shouldchangelog" != "dontknow" ] + then + echo "bad argument '$shouldchangelog'" + return 1 + fi + log_md5=`md5sum maria_log.*` + echo "applying log" + $maria_path/maria_read_log -a > $tmp/maria_read_log_$table.txt + log_md5_2=`md5sum maria_log.*` + if [ "$log_md5" != "$log_md5_2" ] + then + if [ "$shouldchangelog" == "shouldnotchangelog" ] + then + echo "maria_read_log should not have modified the log" + return 1 + fi + else + if [ "$shouldchangelog" == "shouldchangelog" ] + then + echo "maria_read_log should have modified the log" + return 1 + fi + fi +} + +# To not flood the screen, we redirect all the commands below to a text file +# and just give a final error if their output is not as expected + +( + +# this message is to remember about the problem with -b (see @todo below) +echo "!!!!!!!! REMEMBER to FIX this BLOB issue !!!!!!!" + +echo "Testing the REDO PHASE ALONE" +# runs a program inserting/deleting rows, then moves the resulting table +# elsewhere; applies the log and checks that the data file is +# identical to the saved original. +# Does not test the index file as we don't have logging for it yet. 
+ +set -- "ma_test1 $silent -M -T -c" "ma_test2 $silent -L -K -W -P -M -T -c" "ma_test2 $silent -M -T -c -b" +while [ $# != 0 ] +do + prog=$1 + rm -f maria_log.* maria_log_control + echo "TEST WITH $prog" + $maria_path/$prog + # derive table's name from program's name + table=`echo $prog | sed -e 's;.*ma_\(test[0-9]\).*;\1;' ` + $maria_path/maria_chk -dvv $table | grep -v "Creation time:"> $tmp/maria_chk_message.good.txt 2>&1 + checksum=`$maria_path/maria_chk -dss $table` + mv $table.MAD $tmp/$table.MAD.good + rm $table.MAI + apply_log "shouldnotchangelog" + cmp $table.MAD $tmp/$table.MAD.good + check_table_is_same + echo "testing idempotency" + apply_log "shouldnotchangelog" + cmp $table.MAD $tmp/$table.MAD.good + check_table_is_same + shift +done + +echo "Testing the REDO AND UNDO PHASE" +# The test programs look like: +# work; commit (time T1); work; exit-without-commit (time T2) +# We first run the test program and let it exit after T1's commit. +# Then we run it again and let it exit at T2. Then we compare +# and expect identity. 
+ +for blobs in "" "-b" # we test table without blobs and then table with blobs +do + for test_undo in 1 2 3 + do + # first iteration tests rollback of insert, second tests rollback of delete + set -- "ma_test1 $silent -M -T -c -N $blobs" "--testflag=1" "--testflag=2 --test-undo=" "ma_test1 $silent -M -T -c -N $blobs" "--testflag=3" "--testflag=4 --test-undo=" "ma_test1 $silent -M -T -c -N $blobs" "--testflag=2" "--testflag=3 --test-undo=" "ma_test2 $silent -L -K -W -P -M -T -c $blobs" "-t1" "-t2 -u" + # -N (create NULL fields) is needed because --test-undo adds it anyway + while [ $# != 0 ] + do + prog=$1 + commit_run_args=$2 + abort_run_args=$3; + rm -f maria_log.* maria_log_control + echo "TEST WITH $prog $commit_run_args (commit at end)" + $maria_path/$prog $commit_run_args + # derive table's name from program's name + table=`echo $prog | sed -e 's;.*ma_\(test[0-9]\).*;\1;' ` + $maria_path/maria_chk -dvv $table | grep -v "Creation time:"> $tmp/maria_chk_message.good.txt 2>&1 + checksum=`$maria_path/maria_chk -dss $table` + mv $table.MAD $tmp/$table.MAD.good + rm $table.MAI + rm maria_log.* maria_log_control + echo "TEST WITH $prog $abort_run_args$test_undo (additional aborted work)" + $maria_path/$prog $abort_run_args$test_undo + cp $table.MAD $tmp/$table.MAD.before_undo + if [ $test_undo -lt 3 ] + then + apply_log "shouldchangelog" # should undo aborted work + else + # probably nothing to undo went to log or data file + apply_log "dontknow" + fi + cp $table.MAD $tmp/$table.MAD.after_undo + + # It is impossible to do a "cmp" between .good and .after_undo, + # because the UNDO phase generated log + # records whose LSN tagged pages. Another reason is that rolling back + # INSERT only marks the rows free, does not empty them (optimization), so + # traces of the INSERT+rollback remain. 
+ + check_table_is_same + echo "testing idempotency" + apply_log "shouldnotchangelog" + cmp $table.MAD $tmp/$table.MAD.after_undo + check_table_is_same + echo "testing applying of CLRs to recreate table" + rm $table.MA? + apply_log "shouldnotchangelog" + # the cmp below fails with ma_test1+blobs! @todo RECOVERY BUG why? + # It is probably serious; REDOs shouldn't place rows in different + # positions from what the run-time code did. Indeed it may lead to + # more or less free space... + # Execution of UNDO re-inserted rows at different positions than + # originally. This generated REDOs which do not insert at the same + # positions as the execution of UNDOs, but at the same positions + # as before the row was originally deleted. + if [ "$blobs" = "" ] + then + cmp $table.MAD $tmp/$table.MAD.after_undo + fi + check_table_is_same + shift 3 + done + rm -f $table.* $tmp/$table* $tmp/maria_chk_*.txt $tmp/maria_read_log_$table.txt +done +done + +) 2>&1 > $tmp/ma_test_recovery.output + +# also note that maria_chk -dvv shows differences for ma_test2 in UNDO phase, +# this is normal: removing records does not shrink the data/key file, +# does not put back the "analyzed,optimized keys"(etc) index state. +diff $maria_path/ma_test_recovery.expected $tmp/ma_test_recovery.output > /dev/null || diff_failed=1 +if [ "$diff_failed" = "1" ] + then + echo "UNEXPECTED OUTPUT OF TESTS, FAILED" + echo "For more info, do diff $maria_path/ma_test_recovery.expected $tmp/ma_test_recovery.output" + exit 1 + fi +echo "ALL RECOVERY TESTS OK" +# this message is to remember about the problem with -b (see @todo above) +echo "!!!!!!!! BUT REMEMBER to FIX this BLOB issue !!!!!!!" diff --git a/storage/maria/ma_test_recovery.expected b/storage/maria/ma_test_recovery.expected new file mode 100644 index 00000000000..926943b11b3 --- /dev/null +++ b/storage/maria/ma_test_recovery.expected @@ -0,0 +1,1123 @@ +!!!!!!!! REMEMBER to FIX this BLOB issue !!!!!!! 
+Testing the REDO PHASE ALONE +TEST WITH ma_test1 -s -M -T -c +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 3757530372 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number 1 8192 8192 +--- +> 1 2 6 unique number 1 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 3757530372 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number 1 8192 8192 +--- +> 1 2 6 unique number 1 8192 +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +11c11 +< Datafile length: 90112 Keyfile length: 204800 +--- +> Datafile length: 90112 Keyfile length: 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +11c11 +< Datafile length: 90112 Keyfile length: 204800 +--- +> Datafile length: 90112 Keyfile length: 8192 +========DIFF END======= +TEST WITH ma_test2 -s -M -T -c -b +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +11c11 +< Datafile length: 81920 Keyfile length: 172032 +--- +> Datafile length: 81920 Keyfile length: 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +11c11 +< Datafile length: 81920 Keyfile length: 172032 +--- +> Datafile length: 81920 Keyfile length: 8192 +========DIFF END======= +Testing the REDO AND UNDO PHASE +TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=1 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 221293111 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 221293111 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 221293111 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 3697324514 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=1 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 2428948025 +--- +> Checksum: 3026590807 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 2428948025 +--- +> Checksum: 3026590807 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 2428948025 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -u1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 90112 Keyfile length: 204800 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 90112 Keyfile length: 204800 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 90112 Keyfile length: 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=2 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 221293111 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 221293111 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 221293111 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 3697324514 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=2 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 2428948025 +--- +> Checksum: 3026590807 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 2428948025 +--- +> Checksum: 3026590807 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 2428948025 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -u2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 90112 Keyfile length: 204800 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 90112 Keyfile length: 204800 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 90112 Keyfile length: 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=3 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 221293111 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 221293111 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 221293111 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 3697324514 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=3 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 2428948025 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 2428948025 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 2428948025 +--- +> Checksum: 0 +11c11 +< Datafile length: 16384 Keyfile length: 16384 +--- +> Datafile length: 16384 Keyfile length: 8192 +18c18 +< 1 2 6 unique number NULL 0 8192 8192 +--- +> 1 2 6 unique number NULL 0 8192 +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -u3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 90112 Keyfile length: 204800 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 90112 Keyfile length: 204800 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 90112 Keyfile length: 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 --test-undo=1 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 411409161 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 411409161 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 411409161 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 4024695312 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 --test-undo=1 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 529753687 +--- +> Checksum: 800025671 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 529753687 +--- +> Checksum: 800025671 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 529753687 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t2 -u1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 81920 Keyfile length: 212992 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 81920 Keyfile length: 212992 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 81920 Keyfile length: 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 --test-undo=2 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 411409161 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 411409161 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 411409161 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 4024695312 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 --test-undo=2 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 529753687 +--- +> Checksum: 800025671 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 529753687 +--- +> Checksum: 800025671 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 529753687 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t2 -u2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 81920 Keyfile length: 212992 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 81920 Keyfile length: 212992 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 81920 Keyfile length: 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 --test-undo=3 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 411409161 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 411409161 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 411409161 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 4024695312 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b --testflag=3 --test-undo=3 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 529753687 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Checksum: 529753687 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Checksum: 529753687 +--- +> Checksum: 0 +11c11 +< Datafile length: 49152 Keyfile length: 16384 +--- +> Datafile length: 49152 Keyfile length: 8192 +18c18 +< 1 2 6 unique varchar BLOB NULL 0 8192 8192 +--- +> 1 2 6 unique varchar BLOB NULL 0 8192 +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b -t2 -u3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 81920 Keyfile length: 212992 +========DIFF END======= +testing idempotency +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 81920 Keyfile length: 212992 +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in maria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +6c6 +< Status: checked,analyzed,optimized keys,sorted index pages +--- +> Status: changed +11c11 +< Datafile length: 8192 Keyfile length: 8192 +--- +> Datafile length: 81920 Keyfile length: 8192 +========DIFF END======= diff --git a/storage/maria/ma_unique.c b/storage/maria/ma_unique.c new file mode 100644 index 00000000000..3ab717887c7 --- /dev/null +++ b/storage/maria/ma_unique.c @@ -0,0 +1,235 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Functions to check if a row is unique */ + +#include "maria_def.h" +#include <m_ctype.h> +/* + Check if a record would break a unique constraint + + SYNOPSIS + _ma_check_unique() + info Maria handler + def Unique definition to check + record Record to check. maria_unique_store() is applied to it at + the offset of the first key segment (presumably writing + unique_hash there - confirm against the macro definition) + unique_hash Hash value that is searched for in the index def->key + disk_pos Row position that is never reported as a duplicate + + NOTES + The search key is built in info->lastkey2, so HA_STATE_RNEXT_SAME is + cleared from info->update. + On every return path info->cur_row.lastpos is restored to the value it + had on entry and info->page_changed is set to 1. + + RETURN + 0 No other row with an equal unique value was found + 1 Duplicate found; my_errno is set to HA_ERR_FOUND_DUPP_UNIQUE, + info->errkey to def->key and info->dup_key_pos to the position + of the conflicting row +*/ +my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, uchar *record, + ha_checksum unique_hash, my_off_t disk_pos) +{ + my_off_t lastpos=info->cur_row.lastpos; + MARIA_KEYDEF *key= &info->s->keyinfo[def->key]; + uchar *key_buff= info->lastkey2; + DBUG_ENTER("_ma_check_unique"); + DBUG_PRINT("enter",("unique_hash: %lu", (ulong) unique_hash)); + + maria_unique_store(record+key->seg->start, unique_hash); + _ma_make_key(info,def->key,key_buff,record,0); + + /* The above changed info->lastkey2. Inform maria_rnext_same(). 
*/ + info->update&= ~HA_STATE_RNEXT_SAME; + + if (_ma_search(info,info->s->keyinfo+def->key,key_buff, + MARIA_UNIQUE_HASH_LENGTH, + SEARCH_FIND,info->s->state.key_root[def->key])) + { + info->page_changed=1; /* Can't optimize read next */ + info->cur_row.lastpos= lastpos; + DBUG_RETURN(0); /* No matching rows */ + } + + /* Walk all index entries that carry the same hash value */ + for (;;) + { + if (info->cur_row.lastpos != disk_pos && + !(*info->s->compare_unique)(info,def,record,info->cur_row.lastpos)) + { + my_errno=HA_ERR_FOUND_DUPP_UNIQUE; + info->errkey= (int) def->key; + info->dup_key_pos= info->cur_row.lastpos; + info->page_changed= 1; /* Can't optimize read next */ + info->cur_row.lastpos= lastpos; + DBUG_PRINT("info",("Found duplicate")); + DBUG_RETURN(1); /* Found identical */ + } + if (_ma_search_next(info,info->s->keyinfo+def->key, info->lastkey, + MARIA_UNIQUE_HASH_LENGTH, SEARCH_BIGGER, + info->s->state.key_root[def->key]) || + bcmp((char*) info->lastkey, (char*) key_buff, + MARIA_UNIQUE_HASH_LENGTH)) + { + info->page_changed= 1; /* Can't optimize read next */ + info->cur_row.lastpos= lastpos; + DBUG_RETURN(0); /* end of tree */ + } + } +} + + +/* + Calculate a hash for a row + + SYNOPSIS + _ma_unique_hash() + def Unique definition; the key segments def->seg up to (not + including) def->end are hashed in order + record Row image the segment values are read from + + NOTES + A segment whose NULL bit is set in the record only mixes the constant + 511 into the running value; its data is not read. + Segments of type HA_KEYTYPE_TEXT, HA_KEYTYPE_VARTEXT1 and + HA_KEYTYPE_VARTEXT2 are hashed with the hash_sort() function of the + segment's charset; seed1 and seed2 are carried over from one segment + to the next and seed1 is xor-ed into the result. + All other segments are folded into the result byte by byte. + + TODO + Add support for bit fields + + RETURN + The accumulated hash value +*/ + +ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *record) +{ + const uchar *pos, *end; + ha_checksum crc= 0; + ulong seed1=0, seed2= 4; + HA_KEYSEG *keyseg; + + for (keyseg=def->seg ; keyseg < def->end ; keyseg++) + { + enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type; + uint length=keyseg->length; + + if (keyseg->null_bit) + { + if (record[keyseg->null_pos] & keyseg->null_bit) + { + /* + Change crc in a way different from an empty string or 0. + (This is an optimisation; The code will work even if this isn't + done) + */ + crc=((crc << 8) + 511+ + (crc >> (8*sizeof(ha_checksum)-8))); + continue; + } + } + pos= record+keyseg->start; + if (keyseg->flag & HA_VAR_LENGTH_PART) + { + /* keyseg->bit_start is used as the size of the length prefix here */ + uint pack_length= keyseg->bit_start; + uint tmp_length= (pack_length == 1 ? 
(uint) *(uchar*) pos : + uint2korr(pos)); + pos+= pack_length; /* Skip VARCHAR length */ + set_if_smaller(length,tmp_length); + } + else if (keyseg->flag & HA_BLOB_PART) + { + uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos); + /* Replace pos with the data pointer stored after the blob length */ + memcpy_fixed((uchar*) &pos,pos+keyseg->bit_start,sizeof(char*)); + if (!length || length > tmp_length) + length=tmp_length; /* The whole blob */ + } + end= pos+length; + if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 || + type == HA_KEYTYPE_VARTEXT2) + { + keyseg->charset->coll->hash_sort(keyseg->charset, + (const uchar*) pos, length, &seed1, + &seed2); + crc^= seed1; + } + else + while (pos != end) + crc=((crc << 8) + + (((uchar) *(uchar*) pos++))) + + (crc >> (8*sizeof(ha_checksum)-8)); + } + return crc; +} + + +/* + compare unique key for two rows + + SYNOPSIS + _ma_unique_comp() + def Unique definition; the key segments def->seg up to + (not including) def->end are compared in order + a, b The two row images to compare + null_are_equal Decides the result for a segment that is NULL in both + rows: if zero the rows are reported as different, + otherwise the segment is skipped + + NOTES + A segment that is NULL in only one of the rows always makes the rows + different. + Segments of type HA_KEYTYPE_TEXT, HA_KEYTYPE_VARTEXT1 and + HA_KEYTYPE_VARTEXT2 are compared with ha_compare_text(); all other + segments must have equal length and equal bytes. + + TODO + Add support for bit fields + + RETURN + 0 if both rows have equal unique value + 1 Rows are different +*/ + +my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b, + my_bool null_are_equal) +{ + const uchar *pos_a, *pos_b, *end; + HA_KEYSEG *keyseg; + + for (keyseg=def->seg ; keyseg < def->end ; keyseg++) + { + enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type; + uint a_length, b_length; + a_length= b_length= keyseg->length; + + /* NULL in one row only => different; NULL in both => null_are_equal decides */ + if (keyseg->null_bit) + { + uint tmp; + if ((tmp=(a[keyseg->null_pos] & keyseg->null_bit)) != + (uint) (b[keyseg->null_pos] & keyseg->null_bit)) + return 1; + if (tmp) + { + if (!null_are_equal) + return 1; + continue; + } + } + pos_a= a+keyseg->start; + pos_b= b+keyseg->start; + if (keyseg->flag & HA_VAR_LENGTH_PART) + { + uint pack_length= keyseg->bit_start; + if (pack_length == 1) + { + a_length= (uint) *(uchar*) pos_a++; + b_length= (uint) *(uchar*) pos_b++; + } + else + { + a_length= uint2korr(pos_a); + b_length= uint2korr(pos_b); + pos_a+= 2; /* Skip VARCHAR length */ + pos_b+= 2; + } + set_if_smaller(a_length, keyseg->length); /* Safety */ + 
set_if_smaller(b_length, keyseg->length); /* safety */ + } + else if (keyseg->flag & HA_BLOB_PART) + { + /* Only compare 'length' characters if length != 0 */ + a_length= _ma_calc_blob_length(keyseg->bit_start,pos_a); + b_length= _ma_calc_blob_length(keyseg->bit_start,pos_b); + /* A non-zero keyseg->length caps how much of each blob is compared */ + if (keyseg->length) + { + /* + This is used in some cases when we are not interested in comparing + the whole length of the blob. + */ + set_if_smaller(a_length, keyseg->length); + set_if_smaller(b_length, keyseg->length); + } + /* Replace pos_a/pos_b with the data pointers stored after the blob length */ + memcpy_fixed((uchar*) &pos_a,pos_a+keyseg->bit_start,sizeof(char*)); + memcpy_fixed((uchar*) &pos_b,pos_b+keyseg->bit_start,sizeof(char*)); + } + if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 || + type == HA_KEYTYPE_VARTEXT2) + { + if (ha_compare_text(keyseg->charset, (uchar *) pos_a, a_length, + (uchar *) pos_b, b_length, 0, 1)) + return 1; + } + else + { + /* Binary segment: lengths must match, then compare byte by byte */ + if (a_length != b_length) + return 1; + end= pos_a+a_length; + while (pos_a != end) + { + if (*pos_a++ != *pos_b++) + return 1; + } + } + } + return 0; +} diff --git a/storage/maria/ma_update.c b/storage/maria/ma_update.c new file mode 100644 index 00000000000..0cb2e2b648b --- /dev/null +++ b/storage/maria/ma_update.c @@ -0,0 +1,250 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Update an old row in a MARIA table */ + +#include "ma_fulltext.h" +#include "ma_rt_index.h" + +int maria_update(register MARIA_HA *info, const uchar *oldrec, uchar *newrec) +{ + int flag,key_changed,save_errno; + reg3 my_off_t pos; + uint i; + uchar old_key[HA_MAX_KEY_BUFF],*new_key; + bool auto_key_changed=0; + ulonglong changed; + MARIA_SHARE *share=info->s; + ha_checksum old_checksum; + DBUG_ENTER("maria_update"); + LINT_INIT(new_key); + LINT_INIT(changed); + LINT_INIT(old_checksum); + + DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage", + maria_print_error(info->s, HA_ERR_CRASHED); + DBUG_RETURN(my_errno= HA_ERR_CRASHED);); + if (!(info->update & HA_STATE_AKTIV)) + { + DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); + } + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + DBUG_RETURN(my_errno=EACCES); + } + if (info->state->key_file_length >= share->base.margin_key_file_length) + { + DBUG_RETURN(my_errno=HA_ERR_INDEX_FILE_FULL); + } + pos= info->cur_row.lastpos; + if (_ma_readinfo(info,F_WRLCK,1)) + DBUG_RETURN(my_errno); + + if ((*share->compare_record)(info,oldrec)) + { + save_errno= my_errno; + DBUG_PRINT("warning", ("Got error from compare record")); + goto err_end; /* Record has changed */ + } + + if (share->calc_checksum) + { + /* + We can't use the row based checksum as this doesn't have enough + precision. 
+ */ + if (info->s->calc_checksum) + old_checksum= (*info->s->calc_checksum)(info, oldrec); + } + + /* Calculate and check all unique constraints */ + key_changed=0; + for (i=0 ; i < share->state.header.uniques ; i++) + { + MARIA_UNIQUEDEF *def=share->uniqueinfo+i; + if (_ma_unique_comp(def, newrec, oldrec,1) && + _ma_check_unique(info, def, newrec, _ma_unique_hash(def, newrec), + pos)) + { + save_errno=my_errno; + goto err_end; + } + } + if (_ma_mark_file_changed(info)) + { + save_errno=my_errno; + goto err_end; + } + + /* Check which keys changed from the original row */ + + new_key= info->lastkey2; + changed=0; + for (i=0 ; i < share->base.keys ; i++) + { + if (maria_is_key_active(share->state.key_map, i)) + { + if (share->keyinfo[i].flag & HA_FULLTEXT ) + { + if (_ma_ft_cmp(info,i,oldrec, newrec)) + { + if ((int) i == info->lastinx) + { + /* + We are changeing the index we are reading on. Mark that + the index data has changed and we need to do a full search + when doing read-next + */ + key_changed|=HA_STATE_WRITTEN; + } + changed|=((ulonglong) 1 << i); + if (_ma_ft_update(info,i,(char*) old_key,oldrec,newrec,pos)) + goto err; + } + } + else + { + uint new_length= _ma_make_key(info,i,new_key,newrec,pos); + uint old_length= _ma_make_key(info,i,old_key,oldrec,pos); + + /* The above changed info->lastkey2. Inform maria_rnext_same(). */ + info->update&= ~HA_STATE_RNEXT_SAME; + + if (new_length != old_length || + memcmp(old_key, new_key, new_length)) + { + if ((int) i == info->lastinx) + key_changed|=HA_STATE_WRITTEN; /* Mark that keyfile changed */ + changed|=((ulonglong) 1 << i); + share->keyinfo[i].version++; + if (share->keyinfo[i].ck_delete(info,i,old_key,old_length)) goto err; + if (share->keyinfo[i].ck_insert(info,i,new_key,new_length)) goto err; + if (share->base.auto_key == i+1) + auto_key_changed=1; + } + } + } + } + /* + If we are running with external locking, we must update the index file + that something has changed. 
+ */ + if (changed || !my_disable_locking) + key_changed|= HA_STATE_CHANGED; + + if (share->calc_checksum) + { + info->cur_row.checksum= (*share->calc_checksum)(info,newrec); + info->state->checksum+= (info->cur_row.checksum - old_checksum); + /* Store new checksum in index file header */ + key_changed|= HA_STATE_CHANGED; + } + { + /* + Don't update index file if data file is not extended and no status + information changed + */ + MARIA_STATUS_INFO state; + ha_rows org_split; + my_off_t org_delete_link; + + memcpy((char*) &state, (char*) info->state, sizeof(state)); + org_split= share->state.split; + org_delete_link= share->state.dellink; + if ((*share->update_record)(info, pos, oldrec, newrec)) + goto err; + if (!key_changed && + (memcmp((char*) &state, (char*) info->state, sizeof(state)) || + org_split != share->state.split || + org_delete_link != share->state.dellink)) + key_changed|= HA_STATE_CHANGED; /* Must update index file */ + } + if (auto_key_changed) + set_if_bigger(info->s->state.auto_increment, + ma_retrieve_auto_increment(info, newrec)); + + /* + We can't yet have HA_STATE_AKTIV here, as block_record dosn't support + it + */ + info->update= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED | key_changed); + + /* + Every Maria function that updates Maria table must end with + call to _ma_writeinfo(). If operation (second param of + _ma_writeinfo()) is not 0 it sets share->changed to 1, that is + flags that data has changed. If operation is 0, this function + equals to no-op in this case. + + ma_update() must always pass !0 value as operation, since even if + there is no index change there could be data change. + */ + VOID(_ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE)); + allow_break(); /* Allow SIGHUP & SIGINT */ + if (info->invalidator != 0) + { + DBUG_PRINT("info", ("invalidator... 
'%s' (update)", info->s->open_file_name)); + (*info->invalidator)(info->s->open_file_name); + info->invalidator=0; + } + DBUG_RETURN(0); + +err: + DBUG_PRINT("error",("key: %d errno: %d",i,my_errno)); + save_errno=my_errno; + if (my_errno == HA_ERR_FOUND_DUPP_KEY || my_errno == HA_ERR_OUT_OF_MEM || + my_errno == HA_ERR_RECORD_FILE_FULL) + { + info->errkey= (int) i; + flag=0; + do + { + if (((ulonglong) 1 << i) & changed) + { + if (share->keyinfo[i].flag & HA_FULLTEXT) + { + if ((flag++ && _ma_ft_del(info,i,(char*) new_key,newrec,pos)) || + _ma_ft_add(info,i,(char*) old_key,oldrec,pos)) + break; + } + else + { + uint new_length= _ma_make_key(info,i,new_key,newrec,pos); + uint old_length= _ma_make_key(info,i,old_key,oldrec,pos); + if ((flag++ && _ma_ck_delete(info,i,new_key,new_length)) || + _ma_ck_write(info,i,old_key,old_length)) + break; + } + } + } while (i-- != 0); + } + else + { + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); + } + info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_ROW_CHANGED | + key_changed); + + err_end: + VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); + allow_break(); /* Allow SIGHUP & SIGINT */ + if (save_errno == HA_ERR_KEY_NOT_FOUND) + { + maria_print_error(info->s, HA_ERR_CRASHED); + save_errno=HA_ERR_CRASHED; + } + DBUG_RETURN(my_errno=save_errno); +} /* maria_update */ diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c new file mode 100644 index 00000000000..b034d71ef9d --- /dev/null +++ b/storage/maria/ma_write.c @@ -0,0 +1,1102 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Write a row to a MARIA table */ + +#include "ma_fulltext.h" +#include "ma_rt_index.h" + +#define MAX_POINTER_LENGTH 8 + + /* Functions declared in this file */ + +static int w_search(MARIA_HA *info,MARIA_KEYDEF *keyinfo, + uint comp_flag, uchar *key, + uint key_length, my_off_t pos, uchar *father_buff, + uchar *father_keypos, my_off_t father_page, + my_bool insert_last); +static int _ma_balance_page(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uchar *key, + uchar *curr_buff,uchar *father_buff, + uchar *father_keypos,my_off_t father_page); +static uchar *_ma_find_last_pos(MARIA_KEYDEF *keyinfo, uchar *page, + uchar *key, uint *return_key_length, + uchar **after_key); +int _ma_ck_write_tree(register MARIA_HA *info, uint keynr,uchar *key, + uint key_length); +int _ma_ck_write_btree(register MARIA_HA *info, uint keynr,uchar *key, + uint key_length); + + +MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info, + const uchar *record + __attribute__((unused))) +{ + return ((info->s->state.dellink != HA_OFFSET_ERROR && + !info->append_insert_at_end) ? 
+ info->s->state.dellink : + info->state->data_file_length); +} + +my_bool _ma_write_abort_default(MARIA_HA *info __attribute__((unused))) +{ + return 0; +} + + +/* Write new record to a table */ + +int maria_write(MARIA_HA *info, uchar *record) +{ + MARIA_SHARE *share=info->s; + uint i; + int save_errno; + MARIA_RECORD_POS filepos; + uchar *buff; + my_bool lock_tree= share->concurrent_insert; + my_bool fatal_error; + DBUG_ENTER("maria_write"); + DBUG_PRINT("enter",("index_file: %d data_file: %d", + info->s->kfile.file, info->dfile.file)); + + DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage", + maria_print_error(info->s, HA_ERR_CRASHED); + DBUG_RETURN(my_errno= HA_ERR_CRASHED);); + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + DBUG_RETURN(my_errno=EACCES); + } + if (_ma_readinfo(info,F_WRLCK,1)) + DBUG_RETURN(my_errno); + dont_break(); /* Dont allow SIGHUP or SIGINT */ + + if (share->base.reloc == (ha_rows) 1 && + share->base.records == (ha_rows) 1 && + info->state->records == (ha_rows) 1) + { /* System file */ + my_errno=HA_ERR_RECORD_FILE_FULL; + goto err2; + } + if (info->state->key_file_length >= share->base.margin_key_file_length) + { + my_errno=HA_ERR_INDEX_FILE_FULL; + goto err2; + } + if (_ma_mark_file_changed(info)) + goto err2; + + /* Calculate and check all unique constraints */ + for (i=0 ; i < share->state.header.uniques ; i++) + { + if (_ma_check_unique(info,share->uniqueinfo+i,record, + _ma_unique_hash(share->uniqueinfo+i,record), + HA_OFFSET_ERROR)) + goto err2; + } + + if ((info->opt_flag & OPT_NO_ROWS)) + filepos= HA_OFFSET_ERROR; + else + { + /* + This may either calculate a record or, or write the record and return + the record id + */ + if ((filepos= (*share->write_record_init)(info, record)) == + HA_OFFSET_ERROR) + goto err2; + } + + /* Write all keys to indextree */ + buff= info->lastkey2; + for (i=0 ; i < share->base.keys ; i++) + { + if (maria_is_key_active(share->state.key_map, i)) + { + bool local_lock_tree= (lock_tree && + 
!(info->bulk_insert && + is_tree_inited(&info->bulk_insert[i]))); + if (local_lock_tree) + { + rw_wrlock(&share->key_root_lock[i]); + share->keyinfo[i].version++; + } + if (share->keyinfo[i].flag & HA_FULLTEXT ) + { + if (_ma_ft_add(info,i,(char*) buff,record,filepos)) + { + if (local_lock_tree) + rw_unlock(&share->key_root_lock[i]); + DBUG_PRINT("error",("Got error: %d on write",my_errno)); + goto err; + } + } + else + { + if (share->keyinfo[i].ck_insert(info,i,buff, + _ma_make_key(info,i,buff,record, + filepos))) + { + if (local_lock_tree) + rw_unlock(&share->key_root_lock[i]); + DBUG_PRINT("error",("Got error: %d on write",my_errno)); + goto err; + } + } + + /* The above changed info->lastkey2. Inform maria_rnext_same(). */ + info->update&= ~HA_STATE_RNEXT_SAME; + + if (local_lock_tree) + rw_unlock(&share->key_root_lock[i]); + } + } + /** + @todo RECOVERY BUG + this += must happen under log's mutex when writing the UNDO + */ + if (share->calc_write_checksum) + info->cur_row.checksum= (*share->calc_write_checksum)(info,record); + if (filepos != HA_OFFSET_ERROR) + { + if ((*share->write_record)(info,record)) + goto err; + /** + @todo when we enable multiple writers, we will have to protect + 'records' and 'checksum' somehow. + */ + info->state->checksum+= info->cur_row.checksum; + } + if (share->base.auto_key) + set_if_bigger(info->s->state.auto_increment, + ma_retrieve_auto_increment(info, record)); + info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_WRITTEN | + HA_STATE_ROW_CHANGED); + info->state->records+= !share->now_transactional; /*otherwise already done*/ + info->cur_row.lastpos= filepos; + VOID(_ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE)); + if (info->invalidator != 0) + { + DBUG_PRINT("info", ("invalidator... '%s' (update)", info->s->open_file_name)); + (*info->invalidator)(info->s->open_file_name); + info->invalidator=0; + } + + /* + Update status of the table. 
We need to do so after each row write + for the log tables, as we want the new row to become visible to + other threads as soon as possible. We don't lock mutex here + (as it is required by pthread memory visibility rules) as (1) it's + not critical to use outdated share->is_log_table value (2) locking + mutex here for every write is too expensive. + */ + if (share->is_log_table) + _ma_update_status((void*) info); + + allow_break(); /* Allow SIGHUP & SIGINT */ + DBUG_RETURN(0); + +err: + save_errno= my_errno; + fatal_error= 0; + if (my_errno == HA_ERR_FOUND_DUPP_KEY || + my_errno == HA_ERR_RECORD_FILE_FULL || + my_errno == HA_ERR_NULL_IN_SPATIAL || + my_errno == HA_ERR_OUT_OF_MEM) + { + if (info->bulk_insert) + { + uint j; + for (j=0 ; j < share->base.keys ; j++) + maria_flush_bulk_insert(info, j); + } + info->errkey= (int) i; + /* + We delete keys in the reverse order of insertion. This is the order that + a rollback would do and is important for CLR_ENDs generated by + _ma_ft|ck_delete() and write_record_abort() to work (with any other + order they would cause wrong jumps in the chain). 
+ */ + while ( i-- > 0) + { + if (maria_is_key_active(share->state.key_map, i)) + { + bool local_lock_tree= (lock_tree && + !(info->bulk_insert && + is_tree_inited(&info->bulk_insert[i]))); + if (local_lock_tree) + rw_wrlock(&share->key_root_lock[i]); + /** + @todo RECOVERY BUG + The key deletes below should generate CLR_ENDs + */ + if (share->keyinfo[i].flag & HA_FULLTEXT) + { + if (_ma_ft_del(info,i,(char*) buff,record,filepos)) + { + if (local_lock_tree) + rw_unlock(&share->key_root_lock[i]); + break; + } + } + else + { + uint key_length= _ma_make_key(info,i,buff,record,filepos); + if (_ma_ck_delete(info,i,buff,key_length)) + { + if (local_lock_tree) + rw_unlock(&share->key_root_lock[i]); + break; + } + } + if (local_lock_tree) + rw_unlock(&share->key_root_lock[i]); + } + } + } + else + fatal_error= 1; + + if ((*share->write_record_abort)(info)) + fatal_error= 1; + if (fatal_error) + { + maria_print_error(info->s, HA_ERR_CRASHED); + maria_mark_crashed(info); + } + + info->update= (HA_STATE_CHANGED | HA_STATE_WRITTEN | HA_STATE_ROW_CHANGED); + my_errno=save_errno; +err2: + save_errno=my_errno; + DBUG_PRINT("error", ("got error: %d", save_errno)); + VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); + allow_break(); /* Allow SIGHUP & SIGINT */ + DBUG_RETURN(my_errno=save_errno); +} /* maria_write */ + + + /* Write one key to btree */ + +int _ma_ck_write(MARIA_HA *info, uint keynr, uchar *key, uint key_length) +{ + DBUG_ENTER("_ma_ck_write"); + + if (info->bulk_insert && is_tree_inited(&info->bulk_insert[keynr])) + { + DBUG_RETURN(_ma_ck_write_tree(info, keynr, key, key_length)); + } + else + { + DBUG_RETURN(_ma_ck_write_btree(info, keynr, key, key_length)); + } +} /* _ma_ck_write */ + + +/********************************************************************** + * Normal insert code * + **********************************************************************/ + +int _ma_ck_write_btree(register MARIA_HA *info, uint keynr, uchar *key, + uint key_length) +{ + int error; 
+ uint comp_flag; + MARIA_KEYDEF *keyinfo=info->s->keyinfo+keynr; + my_off_t *root=&info->s->state.key_root[keynr]; + DBUG_ENTER("_ma_ck_write_btree"); + /* Pick the comparison mode passed down to w_search() from the key definition flags */ + if (keyinfo->flag & HA_SORT_ALLOWS_SAME) + comp_flag=SEARCH_BIGGER; /* Put after same key */ + else if (keyinfo->flag & (HA_NOSAME|HA_FULLTEXT)) + { + comp_flag=SEARCH_FIND | SEARCH_UPDATE; /* No duplicates */ + if (keyinfo->flag & HA_NULL_ARE_EQUAL) + comp_flag|= SEARCH_NULL_ARE_EQUAL; + } + else + comp_flag=SEARCH_SAME; /* Keys in rec-pos order */ + + error= _ma_ck_real_write_btree(info, keyinfo, key, key_length, + root, comp_flag); + /* A pending ft1_to_ft2 array: run the conversion only if the write succeeded, then always free the array and clear the pointer */ if (info->ft1_to_ft2) + { + if (!error) + error= _ma_ft_convert_to_ft2(info, keynr, key); + delete_dynamic(info->ft1_to_ft2); + my_free((uchar*)info->ft1_to_ft2, MYF(0)); + info->ft1_to_ft2=0; + } + DBUG_RETURN(error); +} /* _ma_ck_write_btree */ + +/* Insert key into the tree rooted at *root using w_search(). When the tree is empty (*root == HA_OFFSET_ERROR) or w_search() returns > 0, a new root page is built with _ma_enlarge_root(). Returns the error code of the last step executed. */ +int _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *key, uint key_length, my_off_t *root, + uint comp_flag) +{ + int error; + DBUG_ENTER("_ma_ck_real_write_btree"); + /* key_length parameter is used only if comp_flag is SEARCH_FIND */ + if (*root == HA_OFFSET_ERROR || + (error=w_search(info, keyinfo, comp_flag, key, key_length, + *root, (uchar*) 0, (uchar*) 0, + (my_off_t) 0, 1)) > 0) + error= _ma_enlarge_root(info,keyinfo,key,root); + DBUG_RETURN(error); +} /* _ma_ck_real_write_btree */ + + + /* Make a new root with key as only pointer */ + +int _ma_enlarge_root(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, + my_off_t *root) +{ + uint t_length,nod_flag; + MARIA_KEY_PARAM s_temp; + MARIA_SHARE *share=info->s; + DBUG_ENTER("_ma_enlarge_root"); + + nod_flag= (*root != HA_OFFSET_ERROR) ?
share->base.key_reflength : 0; + _ma_kpointer(info,info->buff+2,*root); /* if nod */ + t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,(uchar*) 0, + (uchar*) 0, (uchar*) 0, key,&s_temp); + maria_putint(info->buff,t_length+2+nod_flag,nod_flag); + (*keyinfo->store_key)(keyinfo,info->buff+2+nod_flag,&s_temp); + info->keyread_buff_used=info->page_changed=1; /* info->buff is used */ + if ((*root= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR || + _ma_write_keypage(info,keyinfo,*root,DFLT_INIT_HITS,info->buff)) + DBUG_RETURN(-1); + DBUG_RETURN(0); +} /* _ma_enlarge_root */ + + + /* + Search after a position for a key and store it there + Returns -1 = error + 0 = ok + 1 = key should be stored in higher tree + */ + +static int w_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + uint comp_flag, uchar *key, uint key_length, my_off_t page, + uchar *father_buff, uchar *father_keypos, + my_off_t father_page, my_bool insert_last) +{ + int error,flag; + uint nod_flag, search_key_length; + uchar *temp_buff,*keypos; + uchar keybuff[HA_MAX_KEY_BUFF]; + my_bool was_last_key; + my_off_t next_page, dup_key_pos; + DBUG_ENTER("w_search"); + DBUG_PRINT("enter",("page: %ld", (long) page)); + + search_key_length= (comp_flag & SEARCH_FIND) ? 
key_length : USE_WHOLE_KEY; + if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ + HA_MAX_KEY_BUFF*2))) + DBUG_RETURN(-1); + if (!_ma_fetch_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff,0)) + goto err; + + flag=(*keyinfo->bin_search)(info,keyinfo,temp_buff,key,search_key_length, + comp_flag, &keypos, keybuff, &was_last_key); + nod_flag= _ma_test_if_nod(temp_buff); + if (flag == 0) + { + uint tmp_key_length; + /* get position to record with duplicated key */ + tmp_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,keybuff); + if (tmp_key_length) + dup_key_pos= _ma_dpos(info,0,keybuff+tmp_key_length); + else + dup_key_pos= HA_OFFSET_ERROR; + + if (keyinfo->flag & HA_FULLTEXT) + { + uint off; + int subkeys; + + get_key_full_length_rdonly(off, keybuff); + subkeys=ft_sintXkorr(keybuff+off); + comp_flag=SEARCH_SAME; + if (subkeys >= 0) + { + /* normal word, one-level tree structure */ + flag=(*keyinfo->bin_search)(info, keyinfo, temp_buff, key, + USE_WHOLE_KEY, comp_flag, + &keypos, keybuff, &was_last_key); + } + else + { + /* popular word. two-level tree. going down */ + my_off_t root=dup_key_pos; + keyinfo=&info->s->ft2_keyinfo; + get_key_full_length_rdonly(off, key); + key+=off; + keypos-=keyinfo->keylength+nod_flag; /* we'll modify key entry 'in vivo' */ + error= _ma_ck_real_write_btree(info, keyinfo, key, 0, + &root, comp_flag); + _ma_dpointer(info, keypos+HA_FT_WLEN, root); + subkeys--; /* should there be underflow protection ? 
*/ + DBUG_ASSERT(subkeys < 0); + ft_intXstore(keypos, subkeys); + if (!error) + error= _ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff); + my_afree((uchar*) temp_buff); + DBUG_RETURN(error); + } + } + else /* not HA_FULLTEXT, normal HA_NOSAME key */ + { + info->dup_key_pos= dup_key_pos; + my_afree((uchar*) temp_buff); + my_errno=HA_ERR_FOUND_DUPP_KEY; + DBUG_RETURN(-1); + } + } + if (flag == MARIA_FOUND_WRONG_KEY) + DBUG_RETURN(-1); + if (!was_last_key) + insert_last=0; + next_page= _ma_kpos(nod_flag,keypos); + if (next_page == HA_OFFSET_ERROR || + (error=w_search(info, keyinfo, comp_flag, key, key_length, next_page, + temp_buff, keypos, page, insert_last)) >0) + { + error= _ma_insert(info,keyinfo,key,temp_buff,keypos,keybuff,father_buff, + father_keypos,father_page, insert_last); + if (_ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff)) + goto err; + } + my_afree((uchar*) temp_buff); + DBUG_RETURN(error); +err: + my_afree((uchar*) temp_buff); + DBUG_PRINT("exit",("Error: %d",my_errno)); + DBUG_RETURN (-1); +} /* w_search */ + + +/* + Insert new key. + + SYNOPSIS + _ma_insert() + info Open table information. + keyinfo Key definition information. + key New key. + anc_buff Key page (beginning). + key_pos Position in key page where to insert. + key_buff Copy of previous key. + father_buff parent key page for balancing. + father_key_pos position in parent key page for balancing. + father_page position of parent key page in file. + insert_last If to append at end of page. + + DESCRIPTION + Insert new key at right of key_pos. + + RETURN + 2 if key contains key to upper level. + 0 OK. + < 0 Error. 
+*/ + +int _ma_insert(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + uchar *key, uchar *anc_buff, uchar *key_pos, uchar *key_buff, + uchar *father_buff, uchar *father_key_pos, my_off_t father_page, + my_bool insert_last) +{ + uint a_length,nod_flag; + int t_length; + uchar *endpos, *prev_key; + MARIA_KEY_PARAM s_temp; + DBUG_ENTER("_ma_insert"); + DBUG_PRINT("enter",("key_pos: 0x%lx", (ulong) key_pos)); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE,keyinfo->seg,key, + USE_WHOLE_KEY);); + + nod_flag=_ma_test_if_nod(anc_buff); + a_length= maria_data_on_page(anc_buff); + endpos= anc_buff+ a_length; + prev_key=(key_pos == anc_buff+2+nod_flag ? (uchar*) 0 : key_buff); + t_length=(*keyinfo->pack_key)(keyinfo,nod_flag, + (key_pos == endpos ? (uchar*) 0 : key_pos), + prev_key, prev_key, + key,&s_temp); +#ifndef DBUG_OFF + if (key_pos != anc_buff+2+nod_flag && (keyinfo->flag & + (HA_BINARY_PACK_KEY | HA_PACK_KEY))) + { + DBUG_DUMP("prev_key",(uchar*) key_buff, _ma_keylength(keyinfo,key_buff)); + } + if (keyinfo->flag & HA_PACK_KEY) + { + DBUG_PRINT("test",("t_length: %d ref_len: %d", + t_length,s_temp.ref_length)); + DBUG_PRINT("test",("n_ref_len: %d n_length: %d key_pos: 0x%lx", + s_temp.n_ref_length, s_temp.n_length, (long) s_temp.key)); + } +#endif + /* t_length is the signed change in page size; open or close the gap at key_pos */ + if (t_length > 0) + { + if (t_length >= keyinfo->maxlength*2+MAX_POINTER_LENGTH) + { + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(-1); + } + bmove_upp((uchar*) endpos+t_length,(uchar*) endpos,(uint) (endpos-key_pos)); + } + else + { + if (-t_length >= keyinfo->maxlength*2+MAX_POINTER_LENGTH) + { + maria_print_error(info->s, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(-1); + } + bmove(key_pos,key_pos-t_length,(uint) (endpos-key_pos)+t_length); + } + (*keyinfo->store_key)(keyinfo,key_pos,&s_temp); + a_length+=t_length; + maria_putint(anc_buff,a_length,nod_flag); + if (a_length <= keyinfo->block_length) + { + if (keyinfo->block_length - a_length < 32 && +
keyinfo->flag & HA_FULLTEXT && key_pos == endpos && + info->s->base.key_reflength <= info->s->base.rec_reflength && + info->s->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) + { + /* + Normal word. One-level tree. Page is almost full. + Let's consider converting. + We'll compare 'key' and the first key at anc_buff + */ + uchar *a=key, *b=anc_buff+2+nod_flag; + uint alen, blen, ft2len=info->s->ft2_keyinfo.keylength; + /* the very first key on the page is always unpacked */ + DBUG_ASSERT((*b & 128) == 0); +#if HA_FT_MAXLEN >= 127 + blen= mi_uint2korr(b); b+=2; +#else + blen= *(uchar*) b++; +#endif + get_key_length(alen,a); + DBUG_ASSERT(info->ft1_to_ft2==0); + if (alen == blen && + ha_compare_text(keyinfo->seg->charset, (uchar*) a, alen, + (uchar*) b, blen, 0, 0) == 0) + { + /* yup. converting */ + info->ft1_to_ft2=(DYNAMIC_ARRAY *) + my_malloc(sizeof(DYNAMIC_ARRAY), MYF(MY_WME)); + /* + The key is already stored on the page at this point, so if the + allocation failed we simply skip the conversion instead of + dereferencing a NULL array; the page stays a one-level page. + */ + if (info->ft1_to_ft2) + { + my_init_dynamic_array(info->ft1_to_ft2, ft2len, 300, 50); + + /* + now, adding all keys from the page to dynarray + if the page is a leaf (if not keys will be deleted later) + */ + if (!nod_flag) + { + /* let's leave the first key on the page, though, because + we cannot easily dispatch an empty page here */ + b+=blen+ft2len+2; + for (a=anc_buff+a_length ; b < a ; b+=ft2len+2) + insert_dynamic(info->ft1_to_ft2, (char*) b); + + /* fixing the page's length - it contains only one key now */ + maria_putint(anc_buff,2+blen+ft2len+2,0); + } + } + /* the rest will be done when we're back from recursion */ + } + } + DBUG_RETURN(0); /* There is room on page */ + } + /* Page is full */ + if (nod_flag) + insert_last=0; + if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) && + father_buff && !insert_last) + DBUG_RETURN(_ma_balance_page(info,keyinfo,key,anc_buff,father_buff, + father_key_pos,father_page)); + DBUG_RETURN(_ma_split_page(info,keyinfo,key,anc_buff,key_buff, insert_last)); +} /* _ma_insert */ + + + /* split a full page in two and assign emerging item to key */ +
+int _ma_split_page(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + uchar *key, uchar *buff, uchar *key_buff, + my_bool insert_last_key) +{ + uint length,a_length,key_ref_length,t_length,nod_flag,key_length; + uchar *key_pos,*pos, *after_key; + my_off_t new_pos; + MARIA_KEY_PARAM s_temp; + DBUG_ENTER("maria_split_page"); + LINT_INIT(after_key); + DBUG_DUMP("buff",(uchar*) buff,maria_data_on_page(buff)); + + if (info->s->keyinfo+info->lastinx == keyinfo) + info->page_changed=1; /* Info->buff is used */ + info->keyread_buff_used=1; + nod_flag=_ma_test_if_nod(buff); + key_ref_length=2+nod_flag; + if (insert_last_key) + key_pos= _ma_find_last_pos(keyinfo,buff,key_buff, &key_length, &after_key); + else + key_pos= _ma_find_half_pos(nod_flag,keyinfo,buff,key_buff, &key_length, + &after_key); + if (!key_pos) + DBUG_RETURN(-1); + + length=(uint) (key_pos-buff); + a_length= maria_data_on_page(buff); + maria_putint(buff,length,nod_flag); + + key_pos=after_key; + if (nod_flag) + { + DBUG_PRINT("test",("Splitting nod")); + pos=key_pos-nod_flag; + memcpy((uchar*) info->buff+2,(uchar*) pos,(size_t) nod_flag); + } + + /* Move middle item to key and pointer to new page */ + if ((new_pos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR) + DBUG_RETURN(-1); + _ma_kpointer(info, _ma_move_key(keyinfo,key,key_buff),new_pos); + + /* Store new page */ + if (!(*keyinfo->get_key)(keyinfo,nod_flag,&key_pos,key_buff)) + DBUG_RETURN(-1); + + t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,(uchar *) 0, + (uchar*) 0, (uchar*) 0, + key_buff, &s_temp); + length=(uint) ((buff+a_length)-key_pos); + memcpy((uchar*) info->buff+key_ref_length+t_length,(uchar*) key_pos, + (size_t) length); + (*keyinfo->store_key)(keyinfo,info->buff+key_ref_length,&s_temp); + maria_putint(info->buff,length+t_length+key_ref_length,nod_flag); + + if (_ma_write_keypage(info,keyinfo,new_pos,DFLT_INIT_HITS,info->buff)) + DBUG_RETURN(-1); + DBUG_DUMP("key",(uchar*) key, _ma_keylength(keyinfo,key)); + 
DBUG_RETURN(2); /* Middle key up */ +} /* _ma_split_page */ + + + /* + Calculate how to much to move to split a page in two + Returns pointer to start of key. + key will contain the key. + return_key_length will contain the length of key + after_key will contain the position to where the next key starts + */ + +uchar *_ma_find_half_pos(uint nod_flag, MARIA_KEYDEF *keyinfo, uchar *page, + uchar *key, uint *return_key_length, + uchar **after_key) +{ + uint keys,length,key_ref_length; + uchar *end,*lastpos; + DBUG_ENTER("_ma_find_half_pos"); + + key_ref_length=2+nod_flag; + length= maria_data_on_page(page)-key_ref_length; + page+=key_ref_length; + if (!(keyinfo->flag & + (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY | + HA_BINARY_PACK_KEY))) + { + key_ref_length=keyinfo->keylength+nod_flag; + keys=length/(key_ref_length*2); + *return_key_length=keyinfo->keylength; + end=page+keys*key_ref_length; + *after_key=end+key_ref_length; + memcpy(key,end,key_ref_length); + DBUG_RETURN(end); + } + + end=page+length/2-key_ref_length; /* This is aprox. 
half */ + *key='\0'; + do + { + lastpos=page; + if (!(length=(*keyinfo->get_key)(keyinfo,nod_flag,&page,key))) + DBUG_RETURN(0); + } while (page < end); + *return_key_length=length; + *after_key=page; + DBUG_PRINT("exit",("returns: 0x%lx page: 0x%lx half: 0x%lx", + (long) lastpos, (long) page, (long) end)); + DBUG_RETURN(lastpos); +} /* _ma_find_half_pos */ + + +/* + Split buffer at last key + Returns pointer to the start of the key before the last key, + or 0 (with my_errno set to HA_ERR_CRASHED) if a key cannot be read + key will contain the key before the last key (packed-key case; + it is copied from the previously read key on every loop pass) + return_key_length will contain the length of that key + after_key will contain the position where the last key starts + The page is scanned as a leaf page: the header is taken to be 2 bytes + and get_key is called with nod_flag 0. +*/ + +static uchar *_ma_find_last_pos(MARIA_KEYDEF *keyinfo, uchar *page, + uchar *key, uint *return_key_length, + uchar **after_key) +{ + uint keys,length,last_length,key_ref_length; + uchar *end,*lastpos,*prevpos; + uchar key_buff[HA_MAX_KEY_BUFF]; + DBUG_ENTER("_ma_find_last_pos"); + + key_ref_length=2; + length= maria_data_on_page(page)-key_ref_length; + page+=key_ref_length; + /* Fixed-length keys: the position is computed directly, no scan needed */ if (!(keyinfo->flag & + (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY | + HA_BINARY_PACK_KEY))) + { + keys=length/keyinfo->keylength-2; + *return_key_length=length=keyinfo->keylength; + end=page+keys*length; + *after_key=end+length; + memcpy(key,end,length); + DBUG_RETURN(end); + } + + LINT_INIT(prevpos); + LINT_INIT(last_length); + end=page+length-key_ref_length; + *key='\0'; + length=0; + lastpos=page; + /* Walk all keys, remembering the start and length of the previous one */ while (page < end) + { + prevpos=lastpos; lastpos=page; + last_length=length; + memcpy(key, key_buff, length); /* previous key */ + if (!(length=(*keyinfo->get_key)(keyinfo,0,&page,key_buff))) + { + maria_print_error(keyinfo->share, HA_ERR_CRASHED); + my_errno=HA_ERR_CRASHED; + DBUG_RETURN(0); + } + } + *return_key_length=last_length; + *after_key=lastpos; + DBUG_PRINT("exit",("returns: 0x%lx page: 0x%lx end: 0x%lx", + (long) prevpos,(long) page,(long) end)); + DBUG_RETURN(prevpos); +} /* _ma_find_last_pos */ + + + /* Balance page with not packed keys with page on right/left */ + /* returns 0 if balance was done */ + +static int _ma_balance_page(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, +
uchar *key, uchar *curr_buff, uchar *father_buff, + uchar *father_key_pos, my_off_t father_page) +{ + my_bool right; + uint k_length,father_length,father_keylength,nod_flag,curr_keylength, + right_length,left_length,new_right_length,new_left_length,extra_length, + length,keys; + uchar *pos,*buff,*extra_buff; + my_off_t next_page,new_pos; + uchar tmp_part_key[HA_MAX_KEY_BUFF]; + DBUG_ENTER("_ma_balance_page"); + + k_length=keyinfo->keylength; + father_length= maria_data_on_page(father_buff); + father_keylength=k_length+info->s->base.key_reflength; + nod_flag=_ma_test_if_nod(curr_buff); + curr_keylength=k_length+nod_flag; + info->page_changed=1; + + if ((father_key_pos != father_buff+father_length && + (info->state->records & 1)) || + father_key_pos == father_buff+2+info->s->base.key_reflength) + { + right=1; + next_page= _ma_kpos(info->s->base.key_reflength, + father_key_pos+father_keylength); + buff=info->buff; + DBUG_PRINT("test",("use right page: %lu", (ulong) next_page)); + } + else + { + right=0; + father_key_pos-=father_keylength; + next_page= _ma_kpos(info->s->base.key_reflength,father_key_pos); + /* Fix that curr_buff is to left */ + buff=curr_buff; curr_buff=info->buff; + DBUG_PRINT("test",("use left page: %lu", (ulong) next_page)); + } /* father_key_pos ptr to parting key */ + + if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,info->buff,0)) + goto err; + DBUG_DUMP("next",(uchar*) info->buff,maria_data_on_page(info->buff)); + + /* Test if there is room to share keys */ + + left_length= maria_data_on_page(curr_buff); + right_length= maria_data_on_page(buff); + keys=(left_length+right_length-4-nod_flag*2)/curr_keylength; + + if ((right ? 
right_length : left_length) + curr_keylength <= + keyinfo->block_length) + { /* Merge buffs */ + new_left_length=2+nod_flag+(keys/2)*curr_keylength; + new_right_length=2+nod_flag+((keys+1)/2)*curr_keylength; + maria_putint(curr_buff,new_left_length,nod_flag); + maria_putint(buff,new_right_length,nod_flag); + + if (left_length < new_left_length) + { /* Move keys buff -> leaf */ + pos=curr_buff+left_length; + memcpy((uchar*) pos,(uchar*) father_key_pos, (size_t) k_length); + memcpy((uchar*) pos+k_length, (uchar*) buff+2, + (size_t) (length=new_left_length - left_length - k_length)); + pos=buff+2+length; + memcpy((uchar*) father_key_pos,(uchar*) pos,(size_t) k_length); + bmove((uchar*) buff+2,(uchar*) pos+k_length,new_right_length); + } + else + { /* Move keys -> buff */ + + bmove_upp((uchar*) buff+new_right_length,(uchar*) buff+right_length, + right_length-2); + length=new_right_length-right_length-k_length; + memcpy((uchar*) buff+2+length,father_key_pos,(size_t) k_length); + pos=curr_buff+new_left_length; + memcpy((uchar*) father_key_pos,(uchar*) pos,(size_t) k_length); + memcpy((uchar*) buff+2,(uchar*) pos+k_length,(size_t) length); + } + + if (_ma_write_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,info->buff) || + _ma_write_keypage(info,keyinfo,father_page,DFLT_INIT_HITS,father_buff)) + goto err; + DBUG_RETURN(0); + } + + /* curr_buff[] and buff[] are full, lets split and make new nod */ + + extra_buff=info->buff+info->s->base.max_key_block_length; + new_left_length=new_right_length=2+nod_flag+(keys+1)/3*curr_keylength; + if (keys == 5) /* Too few keys to balance */ + new_left_length-=curr_keylength; + extra_length=nod_flag+left_length+right_length- + new_left_length-new_right_length-curr_keylength; + DBUG_PRINT("info",("left_length: %d right_length: %d new_left_length: %d new_right_length: %d extra_length: %d", + left_length, right_length, + new_left_length, new_right_length, + extra_length)); + maria_putint(curr_buff,new_left_length,nod_flag); + 
maria_putint(buff,new_right_length,nod_flag); + maria_putint(extra_buff,extra_length+2,nod_flag); + + /* move first largest keys to new page */ + pos=buff+right_length-extra_length; + memcpy((uchar*) extra_buff+2,pos,(size_t) extra_length); + /* Save new parting key */ + memcpy(tmp_part_key, pos-k_length,k_length); + /* Make place for new keys */ + bmove_upp((uchar*) buff+new_right_length,(uchar*) pos-k_length, + right_length-extra_length-k_length-2); + /* Copy keys from left page */ + pos= curr_buff+new_left_length; + memcpy((uchar*) buff+2,(uchar*) pos+k_length, + (size_t) (length=left_length-new_left_length-k_length)); + /* Copy old parting key */ + memcpy((uchar*) buff+2+length,father_key_pos,(size_t) k_length); + + /* Move new parting keys up to caller */ + memcpy((uchar*) (right ? key : father_key_pos),pos,(size_t) k_length); + memcpy((uchar*) (right ? father_key_pos : key),tmp_part_key, k_length); + + if ((new_pos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR) + goto err; + _ma_kpointer(info,key+k_length,new_pos); + if (_ma_write_keypage(info,keyinfo,(right ? new_pos : next_page), + DFLT_INIT_HITS,info->buff) || + _ma_write_keypage(info,keyinfo,(right ? next_page : new_pos), + DFLT_INIT_HITS,extra_buff)) + goto err; + + DBUG_RETURN(1); /* Middle key up */ + +err: + DBUG_RETURN(-1); +} /* _ma_balance_page */ + +/********************************************************************** + * Bulk insert code * + **********************************************************************/ + +typedef struct { + MARIA_HA *info; + uint keynr; +} bulk_insert_param; + + +int _ma_ck_write_tree(register MARIA_HA *info, uint keynr, uchar *key, + uint key_length) +{ + int error; + DBUG_ENTER("_ma_ck_write_tree"); + + error= tree_insert(&info->bulk_insert[keynr], key, + key_length + info->s->rec_reflength, + info->bulk_insert[keynr].custom_arg) ? 
0 : HA_ERR_OUT_OF_MEM ; + + DBUG_RETURN(error); +} /* _ma_ck_write_tree */ + + +/* typeof(_ma_keys_compare)=qsort_cmp2 */ + +static int keys_compare(bulk_insert_param *param, uchar *key1, uchar *key2) +{ + uint not_used[2]; + return ha_key_cmp(param->info->s->keyinfo[param->keynr].seg, + (uchar*) key1, (uchar*) key2, USE_WHOLE_KEY, SEARCH_SAME, + not_used); +} + + +static int keys_free(uchar *key, TREE_FREE mode, bulk_insert_param *param) +{ + /* + Probably I can use info->lastkey here, but I'm not sure, + and to be safe I'd better use local lastkey. + */ + uchar lastkey[HA_MAX_KEY_BUFF]; + uint keylen; + MARIA_KEYDEF *keyinfo; + + switch (mode) { + case free_init: + if (param->info->s->concurrent_insert) + { + rw_wrlock(¶m->info->s->key_root_lock[param->keynr]); + param->info->s->keyinfo[param->keynr].version++; + } + return 0; + case free_free: + keyinfo=param->info->s->keyinfo+param->keynr; + keylen= _ma_keylength(keyinfo, key); + memcpy(lastkey, key, keylen); + return _ma_ck_write_btree(param->info,param->keynr,lastkey, + keylen - param->info->s->rec_reflength); + case free_end: + if (param->info->s->concurrent_insert) + rw_unlock(¶m->info->s->key_root_lock[param->keynr]); + return 0; + } + return -1; +} + + +int maria_init_bulk_insert(MARIA_HA *info, ulong cache_size, ha_rows rows) +{ + MARIA_SHARE *share=info->s; + MARIA_KEYDEF *key=share->keyinfo; + bulk_insert_param *params; + uint i, num_keys, total_keylength; + ulonglong key_map; + DBUG_ENTER("_ma_init_bulk_insert"); + DBUG_PRINT("enter",("cache_size: %lu", cache_size)); + + DBUG_ASSERT(!info->bulk_insert && + (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT)); + + maria_clear_all_keys_active(key_map); + for (i=total_keylength=num_keys=0 ; i < share->base.keys ; i++) + { + if (! 
(key[i].flag & HA_NOSAME) && (share->base.auto_key != i + 1) && + maria_is_key_active(share->state.key_map, i)) + { + num_keys++; + maria_set_key_active(key_map, i); + total_keylength+=key[i].maxlength+TREE_ELEMENT_EXTRA_SIZE; + } + } + + if (num_keys==0 || + num_keys * MARIA_MIN_SIZE_BULK_INSERT_TREE > cache_size) + DBUG_RETURN(0); + + if (rows && rows*total_keylength < cache_size) + cache_size=rows; + else + cache_size/=total_keylength*16; + + info->bulk_insert=(TREE *) + my_malloc((sizeof(TREE)*share->base.keys+ + sizeof(bulk_insert_param)*num_keys),MYF(0)); + + if (!info->bulk_insert) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + + params=(bulk_insert_param *)(info->bulk_insert+share->base.keys); + for (i=0 ; i < share->base.keys ; i++) + { + if (maria_is_key_active(key_map, i)) + { + params->info=info; + params->keynr=i; + /* Only allocate a 16'th of the buffer at a time */ + init_tree(&info->bulk_insert[i], + cache_size * key[i].maxlength, + cache_size * key[i].maxlength, 0, + (qsort_cmp2)keys_compare, 0, + (tree_element_free) keys_free, (void *)params++); + } + else + info->bulk_insert[i].root=0; + } + + DBUG_RETURN(0); +} + +void maria_flush_bulk_insert(MARIA_HA *info, uint inx) +{ + if (info->bulk_insert) + { + if (is_tree_inited(&info->bulk_insert[inx])) + reset_tree(&info->bulk_insert[inx]); + } +} + +void maria_end_bulk_insert(MARIA_HA *info) +{ + DBUG_ENTER("maria_end_bulk_insert"); + if (info->bulk_insert) + { + uint i; + for (i=0 ; i < info->s->base.keys ; i++) + { + if (is_tree_inited(& info->bulk_insert[i])) + { + delete_tree(& info->bulk_insert[i]); + } + } + my_free((void *)info->bulk_insert, MYF(0)); + info->bulk_insert=0; + } + DBUG_VOID_RETURN; +} diff --git a/storage/maria/maria_chk.c b/storage/maria/maria_chk.c new file mode 100644 index 00000000000..f9ed249817e --- /dev/null +++ b/storage/maria/maria_chk.c @@ -0,0 +1,1841 @@ +/* Copyright (C) 2006-2003 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the 
terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Describe, check and repair of MARIA tables */ + +#include "ma_fulltext.h" +#include <myisamchk.h> +#include <my_bit.h> +#include <m_ctype.h> +#include <stdarg.h> +#include <my_getopt.h> +#ifdef HAVE_SYS_VADVICE_H +#include <sys/vadvise.h> +#endif +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif +SET_STACK_SIZE(9000) /* Minimum stack size for program */ + +#ifndef USE_RAID +#define my_raid_create(A,B,C,D,E,F,G) my_create(A,B,C,G) +#define my_raid_delete(A,B,C) my_delete(A,B) +#endif + +static uint decode_bits; +static char **default_argv; +static const char *load_default_groups[]= { "maria_chk", 0 }; +static const char *set_collation_name, *opt_tmpdir; +static CHARSET_INFO *set_collation; +static const char *my_progname_short; +static int stopwords_inited= 0; +static MY_TMPDIR maria_chk_tmpdir; + +static const char *type_names[]= +{ + "impossible","char","binary", "short", "long", "float", + "double","number","unsigned short", + "unsigned long","longlong","ulonglong","int24", + "uint24","int8","varchar", "varbin", "varchar2", "varbin2", "bit", + "?","?" +}; + +static const char *prefix_packed_txt="packed ", + *bin_packed_txt="prefix ", + *diff_txt="stripped ", + *null_txt="NULL", + *blob_txt="BLOB "; + +static const char *field_pack[]= +{ + "","no endspace", "no prespace", + "no zeros", "blob", "constant", "table-lockup", + "always zero","varchar","unique-hash","?","?" 
+};
+
+/* Display names for the data file record formats */
+static const char *record_formats[]=
+{
+  "Fixed length", "Packed", "Compressed", "Block", "?"
+};
+
+/*
+  Text value of --stats_method.  Bound to that option in my_long_options
+  and overwritten by get_one_option() with the argument given.
+*/
+static const char *maria_stats_method_str="nulls_unequal";
+
+static void get_options(int *argc,char * * *argv);
+static void print_version(void);
+static void usage(void);
+static int maria_chk(HA_CHECK *param, char *filename);
+static void descript(HA_CHECK *param, register MARIA_HA *info, char *name);
+static int maria_sort_records(HA_CHECK *param, register MARIA_HA *info,
+                              char *name, uint sort_key,
+                              my_bool write_info, my_bool update_index);
+static int sort_record_index(MARIA_SORT_PARAM *sort_param, MARIA_HA *info,
+                             MARIA_KEYDEF *keyinfo,
+                             my_off_t page, uchar *buff,uint sortkey,
+                             File new_file, my_bool update_index);
+
+/*
+  Global check/repair parameters: filled in by option parsing and passed
+  to maria_chk() for every table.
+*/
+HA_CHECK check_param;
+
+	/* Main program */
+
+/*
+  Initialise the program and check_param, parse the options and run
+  maria_chk() on every remaining command line argument.  The process exit
+  status is the bitwise OR of the per-table results.
+*/
+
+int main(int argc, char **argv)
+{
+  int error;
+  MY_INIT(argv[0]);
+  my_progname_short= my_progname+dirname_length(my_progname);
+
+  maria_chk_init(&check_param);
+  check_param.opt_lock_memory= 1;		/* Lock memory if possible */
+  check_param.using_global_keycache = 0;
+  get_options(&argc,(char***) &argv);
+  maria_quick_table_bits=decode_bits;
+  error=0;
+  maria_init();
+
+  /* get_options() exits if no arguments are left, so argc >= 1 here */
+  while (--argc >= 0)
+  {
+    int new_error=maria_chk(&check_param, *(argv++));
+    /* Keep T_REP only if it is the only T_REP_ANY bit that is set */
+    if ((check_param.testflag & T_REP_ANY) != T_REP)
+      check_param.testflag&= ~T_REP;
+    VOID(fflush(stdout));
+    VOID(fflush(stderr));
+    /*
+      --force: the table gave errors or warnings and no repair or sort was
+      asked for, so run it once more as a repair and use that result.
+    */
+    if ((check_param.error_printed | check_param.warning_printed) &&
+	(check_param.testflag & T_FORCE_CREATE) &&
+	(!(check_param.testflag & (T_REP | T_REP_BY_SORT | T_SORT_RECORDS |
+				   T_SORT_INDEX))))
+    {
+      ulonglong old_testflag=check_param.testflag;
+      if (!(check_param.testflag & T_REP))
+	check_param.testflag|= T_REP_BY_SORT;
+      check_param.testflag&= ~T_EXTEND;			/* Don't needed  */
+      error|=maria_chk(&check_param, argv[-1]);
+      /* Restore the flags so the next table starts as a plain check */
+      check_param.testflag= old_testflag;
+      VOID(fflush(stdout));
+      VOID(fflush(stderr));
+    }
+    else
+      error|=new_error;
+    /* Separator between tables, but not after the last one */
+    if (argc && (!(check_param.testflag & T_SILENT) ||
+		 check_param.testflag & T_INFO))
+    {
+      puts("\n---------\n");
+      VOID(fflush(stdout));
+    }
+  }
+  if (check_param.total_files > 1)
+  {					/* Only if descript */
+    char buff[22],buff2[22];
+    if (!(check_param.testflag & T_SILENT) || check_param.testflag & T_INFO)
+      puts("\n---------\n");
+    printf("\nTotal of all %d MARIA-files:\nData records: %9s Deleted blocks: %9s\n",check_param.total_files,llstr(check_param.total_records,buff),
+	   llstr(check_param.total_deleted,buff2));
+  }
+  free_defaults(default_argv);
+  free_tmpdir(&maria_chk_tmpdir);
+  maria_end();
+  my_end(check_param.testflag & T_INFO ?
+	 MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
+  exit(error);
+#ifndef _lint
+  return 0;				/* No compiler warning */
+#endif
+} /* main */
+
+/*
+  Ids for the long options that are not identified by a single character
+  in my_long_options.  They start at 256, above every character id.
+*/
+enum options_mc {
+  OPT_CHARSETS_DIR=256, OPT_SET_COLLATION,OPT_START_CHECK_POS,
+  OPT_CORRECT_CHECKSUM, OPT_KEY_BUFFER_SIZE,
+  OPT_KEY_CACHE_BLOCK_SIZE, OPT_MARIA_BLOCK_SIZE,
+  OPT_READ_BUFFER_SIZE, OPT_WRITE_BUFFER_SIZE, OPT_SORT_BUFFER_SIZE,
+  OPT_SORT_KEY_BLOCKS, OPT_DECODE_BITS, OPT_FT_MIN_WORD_LEN,
+  OPT_FT_MAX_WORD_LEN, OPT_FT_STOPWORD_FILE,
+  OPT_MAX_RECORD_LENGTH, OPT_AUTO_CLOSE, OPT_STATS_METHOD
+};
+
+/*
+  Option table given to handle_options() in get_options() and to
+  my_print_help()/my_print_variables() for the help output.
+*/
+static struct my_option my_long_options[] =
+{
+  {"analyze", 'a',
+   "Analyze distribution of keys. Will make some joins in MySQL faster. 
You can check the calculated distribution.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifdef __NETWARE__ + {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"block-search", 'b', + "No help available.", + 0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"backup", 'B', + "Make a backup of the .MYD file as 'filename-time.BAK'.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"character-sets-dir", OPT_CHARSETS_DIR, + "Directory where character sets are.", + (uchar**) &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"check", 'c', + "Check table for errors.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"check-only-changed", 'C', + "Check only tables that have changed since last check. It also applies to other requested actions (e.g. --analyze will be ignored if the table is already analyzed).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"correct-checksum", OPT_CORRECT_CHECKSUM, + "Correct checksum information for table.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', + "Output debug log. Often this is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"description", 'd', + "Prints some information about table.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"data-file-length", 'D', + "Max length of data file (when recreating data-file when it's full).", + (uchar**) &check_param.max_data_file_length, + (uchar**) &check_param.max_data_file_length, + 0, GET_LL, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"extend-check", 'e', + "If used when checking a table, ensure that the table is 100 percent consistent, which will take a long time. If used when repairing a table, try to recover every possible row from the data file. 
Normally this will also find a lot of garbage rows; Don't use this option with repair if you are not totally desperate.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"fast", 'F', + "Check only tables that haven't been closed properly. It also applies to other requested actions (e.g. --analyze will be ignored if the table is already analyzed).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"force", 'f', + "Restart with -r if there are any errors in the table. States will be updated as with --update-state.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"HELP", 'H', + "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"help", '?', + "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"information", 'i', + "Print statistics information about table that is checked.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"keys-used", 'k', + "Tell MARIA to update only some specific keys. # is a bit mask of which keys to use. This can be used to get faster inserts.", + (uchar**) &check_param.keys_in_use, + (uchar**) &check_param.keys_in_use, + 0, GET_ULL, REQUIRED_ARG, -1, 0, 0, 0, 0, 0}, + {"max-record-length", OPT_MAX_RECORD_LENGTH, + "Skip rows bigger than this if maria_chk can't allocate memory to hold it", + (uchar**) &check_param.max_record_length, + (uchar**) &check_param.max_record_length, + 0, GET_ULL, REQUIRED_ARG, LONGLONG_MAX, 0, LONGLONG_MAX, 0, 0, 0}, + {"medium-check", 'm', + "Faster than extend-check, but only finds 99.99% of all errors. 
Should be good enough for most cases.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"quick", 'q', "Faster repair by not modifying the data file.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"read-only", 'T', + "Don't mark table as checked.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"recover", 'r', + "Can fix almost anything except unique keys that aren't unique.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"parallel-recover", 'p', + "Same as '-r' but creates all the keys in parallel.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"safe-recover", 'o', + "Uses old recovery method; Slower than '-r' but can handle a couple of cases where '-r' reports that it can't fix the data file.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"sort-recover", 'n', + "Force recovering with sorting even if the temporary file was very big.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifdef DEBUG + {"start-check-pos", OPT_START_CHECK_POS, + "No help available.", + 0, 0, 0, GET_ULL, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"set-auto-increment", 'A', + "Force auto_increment to start at this or higher value. If no value is given, then sets the next auto_increment value to the highest used value for the auto key + 1.", + (uchar**) &check_param.auto_increment_value, + (uchar**) &check_param.auto_increment_value, + 0, GET_ULL, OPT_ARG, 0, 0, 0, 0, 0, 0}, + {"set-collation", OPT_SET_COLLATION, + "Change the collation used by the index", + (uchar**) &set_collation_name, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"set-variable", 'O', + "Change the value of a variable. Please note that this option is deprecated; you can set variables directly with --variable-name=value.", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"silent", 's', + "Only print errors. One can use two -s to make maria_chk very silent.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"sort-index", 'S', + "Sort index blocks. 
This speeds up 'read-next' in applications.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"sort-records", 'R', + "Sort records according to an index. This makes your data much more localized and may speed up things. (It may be VERY slow to do a sort the first time!)", + (uchar**) &check_param.opt_sort_key, + (uchar**) &check_param.opt_sort_key, + 0, GET_UINT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"tmpdir", 't', + "Path for temporary files.", + (uchar**) &opt_tmpdir, + 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"update-state", 'U', + "Mark tables as crashed if any errors were found.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"unpack", 'u', + "Unpack file packed with mariapack.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', + "Print more information. This can be used with --description and --check. Use many -v for more verbosity!", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', + "Print version and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"wait", 'w', + "Wait if table is locked.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { "key_buffer_size", OPT_KEY_BUFFER_SIZE, "", + (uchar**) &check_param.use_buffers, (uchar**) &check_param.use_buffers, 0, + GET_ULONG, REQUIRED_ARG, (long) USE_BUFFER_INIT, (long) MALLOC_OVERHEAD, + (long) ~0L, (long) MALLOC_OVERHEAD, (long) IO_SIZE, 0}, + { "read_buffer_size", OPT_READ_BUFFER_SIZE, "", + (uchar**) &check_param.read_buffer_length, + (uchar**) &check_param.read_buffer_length, 0, GET_ULONG, REQUIRED_ARG, + (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD, + (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0}, + { "write_buffer_size", OPT_WRITE_BUFFER_SIZE, "", + (uchar**) &check_param.write_buffer_length, + (uchar**) &check_param.write_buffer_length, 0, GET_ULONG, REQUIRED_ARG, + (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD, + (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0}, + { "sort_buffer_size", OPT_SORT_BUFFER_SIZE, "", + 
(uchar**) &check_param.sort_buffer_length, + (uchar**) &check_param.sort_buffer_length, 0, GET_ULONG, REQUIRED_ARG, + (long) SORT_BUFFER_INIT, (long) (MIN_SORT_BUFFER + MALLOC_OVERHEAD), + (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0}, + { "sort_key_blocks", OPT_SORT_KEY_BLOCKS, "", + (uchar**) &check_param.sort_key_blocks, + (uchar**) &check_param.sort_key_blocks, 0, GET_ULONG, REQUIRED_ARG, + BUFFERS_WHEN_SORTING, 4L, 100L, 0L, 1L, 0}, + { "decode_bits", OPT_DECODE_BITS, "", (uchar**) &decode_bits, + (uchar**) &decode_bits, 0, GET_UINT, REQUIRED_ARG, 9L, 4L, 17L, 0L, 1L, 0}, + { "ft_min_word_len", OPT_FT_MIN_WORD_LEN, "", (uchar**) &ft_min_word_len, + (uchar**) &ft_min_word_len, 0, GET_ULONG, REQUIRED_ARG, 4, 1, HA_FT_MAXCHARLEN, + 0, 1, 0}, + { "ft_max_word_len", OPT_FT_MAX_WORD_LEN, "", (uchar**) &ft_max_word_len, + (uchar**) &ft_max_word_len, 0, GET_ULONG, REQUIRED_ARG, HA_FT_MAXCHARLEN, 10, + HA_FT_MAXCHARLEN, 0, 1, 0}, + { "maria_ft_stopword_file", OPT_FT_STOPWORD_FILE, + "Use stopwords from this file instead of built-in list.", + (uchar**) &ft_stopword_file, (uchar**) &ft_stopword_file, 0, GET_STR, + REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"stats_method", OPT_STATS_METHOD, + "Specifies how index statistics collection code should threat NULLs. 
" + "Possible values of name are \"nulls_unequal\" (default behavior for 4.1/5.0), " + "\"nulls_equal\" (emulate 4.0 behavior), and \"nulls_ignored\".", + (uchar**) &maria_stats_method_str, (uchar**) &maria_stats_method_str, 0, + GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +#include <help_start.h> + +static void print_version(void) +{ + printf("%s Ver 1.0 for %s at %s\n", my_progname, SYSTEM_TYPE, + MACHINE_TYPE); + NETWARE_SET_SCREEN_MODE(1); +} + + +static void usage(void) +{ + print_version(); + puts("By Monty, for your professional use"); + puts("This software comes with NO WARRANTY: see the PUBLIC for details.\n"); + puts("Description, check and repair of MARIA tables."); + puts("Used without options all tables on the command will be checked for errors"); + printf("Usage: %s [OPTIONS] tables[.MYI]\n", my_progname_short); + printf("\nGlobal options:\n"); +#ifndef DBUG_OFF + printf("\ + -#, --debug=... Output debug log. Often this is 'd:t:o,filename'.\n"); +#endif + printf("\ + -?, --help Display this help and exit.\n\ + -O, --set-variable var=option.\n\ + Change the value of a variable. Please note that\n\ + this option is deprecated; you can set variables\n\ + directly with '--variable-name=value'.\n\ + -t, --tmpdir=path Path for temporary files. Multiple paths can be\n\ + specified, separated by "); +#if defined( __WIN__) || defined(__NETWARE__) + printf("semicolon (;)"); +#else + printf("colon (:)"); +#endif + printf(", they will be used\n\ + in a round-robin fashion.\n\ + -s, --silent Only print errors. One can use two -s to make\n\ + maria_chk very silent.\n\ + -v, --verbose Print more information. This can be used with\n\ + --description and --check. 
Use many -v for more verbosity.\n\ + -V, --version Print version and exit.\n\ + -w, --wait Wait if table is locked.\n\n"); +#ifdef DEBUG + puts(" --start-check-pos=# Start reading file at given offset.\n"); +#endif + + puts("Check options (check is the default action for maria_chk):\n\ + -c, --check Check table for errors.\n\ + -e, --extend-check Check the table VERY throughly. Only use this in\n\ + extreme cases as maria_chk should normally be able to\n\ + find out if the table is ok even without this switch.\n\ + -F, --fast Check only tables that haven't been closed properly.\n\ + -C, --check-only-changed\n\ + Check only tables that have changed since last check.\n\ + -f, --force Restart with '-r' if there are any errors in the table.\n\ + States will be updated as with '--update-state'.\n\ + -i, --information Print statistics information about table that is checked.\n\ + -m, --medium-check Faster than extend-check, but only finds 99.99% of\n\ + all errors. Should be good enough for most cases.\n\ + -U --update-state Mark tables as crashed if you find any errors.\n\ + -T, --read-only Don't mark table as checked.\n"); + + puts("Repair options (When using '-r' or '-o'):\n\ + -B, --backup Make a backup of the .MYD file as 'filename-time.BAK'.\n\ + --correct-checksum Correct checksum information for table.\n\ + -D, --data-file-length=# Max length of data file (when recreating data\n\ + file when it's full).\n\ + -e, --extend-check Try to recover every possible row from the data file\n\ + Normally this will also find a lot of garbage rows;\n\ + Don't use this option if you are not totally desperate.\n\ + -f, --force Overwrite old temporary files.\n\ + -k, --keys-used=# Tell MARIA to update only some specific keys. # is a\n\ + bit mask of which keys to use. 
This can be used to\n\ + get faster inserts.\n\ + --max-record-length=#\n\ + Skip rows bigger than this if maria_chk can't allocate\n\ + memory to hold it.\n\ + -r, --recover Can fix almost anything except unique keys that aren't\n\ + unique.\n\ + -n, --sort-recover Forces recovering with sorting even if the temporary\n\ + file would be very big.\n\ + -p, --parallel-recover\n\ + Uses the same technique as '-r' and '-n', but creates\n\ + all the keys in parallel, in different threads.\n\ + -o, --safe-recover Uses old recovery method; Slower than '-r' but can\n\ + handle a couple of cases where '-r' reports that it\n\ + can't fix the data file.\n\ + --character-sets-dir=...\n\ + Directory where character sets are.\n\ + --set-collation=name\n\ + Change the collation used by the index.\n\ + -q, --quick Faster repair by not modifying the data file.\n\ + One can give a second '-q' to force maria_chk to\n\ + modify the original datafile in case of duplicate keys.\n\ + NOTE: Tables where the data file is currupted can't be\n\ + fixed with this option.\n\ + -u, --unpack Unpack file packed with mariapack.\n\ +"); + + puts("Other actions:\n\ + -a, --analyze Analyze distribution of keys. Will make some joins in\n\ + MySQL faster. You can check the calculated distribution\n\ + by using '--description --verbose table_name'.\n\ + --stats_method=name Specifies how index statistics collection code should\n\ + threat NULLs. Possible values of name are \"nulls_unequal\"\n\ + (default for 4.1/5.0), \"nulls_equal\" (emulate 4.0), and \n\ + \"nulls_ignored\".\n\ + -d, --description Prints some information about table.\n\ + -A, --set-auto-increment[=value]\n\ + Force auto_increment to start at this or higher value\n\ + If no value is given, then sets the next auto_increment\n\ + value to the highest used value for the auto key + 1.\n\ + -S, --sort-index Sort index blocks. This speeds up 'read-next' in\n\ + applications.\n\ + -R, --sort-records=#\n\ + Sort records according to an index. 
This makes your\n\ + data much more localized and may speed up things\n\ + (It may be VERY slow to do a sort the first time!).\n\ + -b, --block-search=#\n\ + Find a record, a block at given offset belongs to."); + + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + +#include <help_end.h> + +const char *maria_stats_method_names[] = {"nulls_unequal", "nulls_equal", + "nulls_ignored", NullS}; +TYPELIB maria_stats_method_typelib= { + array_elements(maria_stats_method_names) - 1, "", + maria_stats_method_names, NULL}; + + /* Read options */ + +static my_bool +get_one_option(int optid, + const struct my_option *opt __attribute__((unused)), + char *argument) +{ + switch (optid) { +#ifdef __NETWARE__ + case OPT_AUTO_CLOSE: + setscreenmode(SCR_AUTOCLOSE_ON_EXIT); + break; +#endif + case 'a': + if (argument == disabled_my_option) + check_param.testflag&= ~T_STATISTICS; + else + check_param.testflag|= T_STATISTICS; + break; + case 'A': + if (argument) + check_param.auto_increment_value= strtoull(argument, NULL, 0); + else + check_param.auto_increment_value= 0; /* Set to max used value */ + check_param.testflag|= T_AUTO_INC; + break; + case 'b': + check_param.search_after_block= strtoul(argument, NULL, 10); + break; + case 'B': + if (argument == disabled_my_option) + check_param.testflag&= ~T_BACKUP_DATA; + else + check_param.testflag|= T_BACKUP_DATA; + break; + case 'c': + if (argument == disabled_my_option) + check_param.testflag&= ~T_CHECK; + else + check_param.testflag|= T_CHECK; + break; + case 'C': + if (argument == disabled_my_option) + check_param.testflag&= ~(T_CHECK | T_CHECK_ONLY_CHANGED); + else + check_param.testflag|= T_CHECK | T_CHECK_ONLY_CHANGED; + break; + case 'D': + check_param.max_data_file_length=strtoll(argument, NULL, 10); + break; + case 's': /* silent */ + if (argument == disabled_my_option) + check_param.testflag&= ~(T_SILENT | T_VERY_SILENT); + else + { + if (check_param.testflag & T_SILENT) + 
check_param.testflag|= T_VERY_SILENT; + check_param.testflag|= T_SILENT; + check_param.testflag&= ~T_WRITE_LOOP; + } + break; + case 'w': + if (argument == disabled_my_option) + check_param.testflag&= ~T_WAIT_FOREVER; + else + check_param.testflag|= T_WAIT_FOREVER; + break; + case 'd': /* description if isam-file */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_DESCRIPT; + else + check_param.testflag|= T_DESCRIPT; + break; + case 'e': /* extend check */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_EXTEND; + else + check_param.testflag|= T_EXTEND; + break; + case 'i': + if (argument == disabled_my_option) + check_param.testflag&= ~T_INFO; + else + check_param.testflag|= T_INFO; + break; + case 'f': + if (argument == disabled_my_option) + { + check_param.tmpfile_createflag= O_RDWR | O_TRUNC | O_EXCL; + check_param.testflag&= ~(T_FORCE_CREATE | T_UPDATE_STATE); + } + else + { + check_param.tmpfile_createflag= O_RDWR | O_TRUNC; + check_param.testflag|= T_FORCE_CREATE | T_UPDATE_STATE; + } + break; + case 'F': + if (argument == disabled_my_option) + check_param.testflag&= ~T_FAST; + else + check_param.testflag|= T_FAST; + break; + case 'k': + check_param.keys_in_use= (ulonglong) strtoll(argument, NULL, 10); + break; + case 'm': + if (argument == disabled_my_option) + check_param.testflag&= ~T_MEDIUM; + else + check_param.testflag|= T_MEDIUM; /* Medium check */ + break; + case 'r': /* Repair table */ + check_param.testflag&= ~T_REP_ANY; + if (argument != disabled_my_option) + check_param.testflag|= T_REP_BY_SORT; + break; + case 'p': + check_param.testflag&= ~T_REP_ANY; + if (argument != disabled_my_option) + check_param.testflag|= T_REP_PARALLEL; + break; + case 'o': + check_param.testflag&= ~T_REP_ANY; + check_param.force_sort= 0; + if (argument != disabled_my_option) + { + check_param.testflag|= T_REP; + my_disable_async_io= 1; /* More safety */ + } + break; + case 'n': + check_param.testflag&= ~T_REP_ANY; + if (argument == 
disabled_my_option) + check_param.force_sort= 0; + else + { + check_param.testflag|= T_REP_BY_SORT; + check_param.force_sort= 1; + } + break; + case 'q': + if (argument == disabled_my_option) + check_param.testflag&= ~(T_QUICK | T_FORCE_UNIQUENESS); + else + check_param.testflag|= + (check_param.testflag & T_QUICK) ? T_FORCE_UNIQUENESS : T_QUICK; + break; + case 'u': + if (argument == disabled_my_option) + check_param.testflag&= ~(T_UNPACK | T_REP_BY_SORT); + else + check_param.testflag|= T_UNPACK | T_REP_BY_SORT; + break; + case 'v': /* Verbose */ + if (argument == disabled_my_option) + { + check_param.testflag&= ~T_VERBOSE; + check_param.verbose=0; + } + else + { + check_param.testflag|= T_VERBOSE; + check_param.verbose++; + } + break; + case 'R': /* Sort records */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_SORT_RECORDS; + else + { + check_param.testflag|= T_SORT_RECORDS; + check_param.opt_sort_key= (uint) atoi(argument) - 1; + if (check_param.opt_sort_key >= MARIA_MAX_KEY) + { + fprintf(stderr, + "The value of the sort key is bigger than max key: %d.\n", + MARIA_MAX_KEY); + exit(1); + } + } + break; + case 'S': /* Sort index */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_SORT_INDEX; + else + check_param.testflag|= T_SORT_INDEX; + break; + case 'T': + if (argument == disabled_my_option) + check_param.testflag&= ~T_READONLY; + else + check_param.testflag|= T_READONLY; + break; + case 'U': + if (argument == disabled_my_option) + check_param.testflag&= ~T_UPDATE_STATE; + else + check_param.testflag|= T_UPDATE_STATE; + break; + case '#': + DBUG_SET_INITIAL(argument ? 
argument : "d:t:o,/tmp/maria_chk.trace"); + break; + case 'V': + print_version(); + exit(0); + case OPT_CORRECT_CHECKSUM: + if (argument == disabled_my_option) + check_param.testflag&= ~T_CALC_CHECKSUM; + else + check_param.testflag|= T_CALC_CHECKSUM; + break; + case OPT_STATS_METHOD: + { + int method; + enum_handler_stats_method method_conv; + LINT_INIT(method_conv); + maria_stats_method_str= argument; + if ((method=find_type(argument, &maria_stats_method_typelib, 2)) <= 0) + { + fprintf(stderr, "Invalid value of stats_method: %s.\n", argument); + exit(1); + } + switch (method-1) { + case 0: + method_conv= MI_STATS_METHOD_NULLS_EQUAL; + break; + case 1: + method_conv= MI_STATS_METHOD_NULLS_NOT_EQUAL; + break; + case 2: + method_conv= MI_STATS_METHOD_IGNORE_NULLS; + break; + default: assert(0); /* Impossible */ + } + check_param.stats_method= method_conv; + break; + } +#ifdef DEBUG /* Only useful if debugging */ + case OPT_START_CHECK_POS: + check_param.start_check_pos= strtoull(argument, NULL, 0); + break; +#endif + case 'H': + my_print_help(my_long_options); + exit(0); + case '?': + usage(); + exit(0); + } + return 0; +} + + +static void get_options(register int *argc,register char ***argv) +{ + int ho_error; + + load_defaults("my", load_default_groups, argc, argv); + default_argv= *argv; + if (isatty(fileno(stdout))) + check_param.testflag|=T_WRITE_LOOP; + + if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option))) + exit(ho_error); + + /* If using repair, then update checksum if one uses --update-state */ + if ((check_param.testflag & T_UPDATE_STATE) && + (check_param.testflag & T_REP_ANY)) + check_param.testflag|= T_CALC_CHECKSUM; + + if (*argc == 0) + { + usage(); + exit(-1); + } + + if ((check_param.testflag & T_UNPACK) && + (check_param.testflag & (T_QUICK | T_SORT_RECORDS))) + { + VOID(fprintf(stderr, + "%s: --unpack can't be used with --quick or --sort-records\n", + my_progname_short)); + exit(1); + } + if ((check_param.testflag & 
T_READONLY) && + (check_param.testflag & + (T_REP_ANY | T_STATISTICS | T_AUTO_INC | + T_SORT_RECORDS | T_SORT_INDEX | T_FORCE_CREATE))) + { + VOID(fprintf(stderr, + "%s: Can't use --readonly when repairing or sorting\n", + my_progname_short)); + exit(1); + } + + if (init_tmpdir(&maria_chk_tmpdir, opt_tmpdir)) + exit(1); + + check_param.tmpdir=&maria_chk_tmpdir; + + if (set_collation_name) + if (!(set_collation= get_charset_by_name(set_collation_name, + MYF(MY_WME)))) + exit(1); + + return; +} /* get options */ + + + /* Check table */ + +static int maria_chk(HA_CHECK *param, char *filename) +{ + int error,lock_type,recreate; + int rep_quick= param->testflag & (T_QUICK | T_FORCE_UNIQUENESS); + MARIA_HA *info; + File datafile; + char llbuff[22],llbuff2[22]; + my_bool state_updated=0; + MARIA_SHARE *share; + DBUG_ENTER("maria_chk"); + + param->out_flag=error=param->warning_printed=param->error_printed= + recreate=0; + datafile=0; + param->isam_file_name=filename; /* For error messages */ + if (!(info=maria_open(filename, + (param->testflag & (T_DESCRIPT | T_READONLY)) ? + O_RDONLY : O_RDWR, + HA_OPEN_FOR_REPAIR | + ((param->testflag & T_WAIT_FOREVER) ? + HA_OPEN_WAIT_IF_LOCKED : + (param->testflag & T_DESCRIPT) ? + HA_OPEN_IGNORE_IF_LOCKED : HA_OPEN_ABORT_IF_LOCKED)))) + { + /* Avoid twice printing of isam file name */ + param->error_printed=1; + switch (my_errno) { + case HA_ERR_CRASHED: + _ma_check_print_error(param,"'%s' doesn't have a correct index definition. 
You need to recreate it before you can do a repair",filename); + break; + case HA_ERR_NOT_A_TABLE: + _ma_check_print_error(param,"'%s' is not a MARIA-table",filename); + break; + case HA_ERR_CRASHED_ON_USAGE: + _ma_check_print_error(param,"'%s' is marked as crashed",filename); + break; + case HA_ERR_CRASHED_ON_REPAIR: + _ma_check_print_error(param,"'%s' is marked as crashed after last repair",filename); + break; + case HA_ERR_OLD_FILE: + _ma_check_print_error(param,"'%s' is a old type of MARIA-table", filename); + break; + case HA_ERR_END_OF_FILE: + _ma_check_print_error(param,"Couldn't read complete header from '%s'", filename); + break; + case EAGAIN: + _ma_check_print_error(param,"'%s' is locked. Use -w to wait until unlocked",filename); + break; + case ENOENT: + _ma_check_print_error(param,"File '%s' doesn't exist",filename); + break; + case EACCES: + _ma_check_print_error(param,"You don't have permission to use '%s'", + filename); + break; + default: + _ma_check_print_error(param,"%d when opening MARIA-table '%s'", + my_errno,filename); + break; + } + DBUG_RETURN(1); + } + share=info->s; + share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */ + share->tot_locks-= share->r_locks; + share->r_locks=0; + maria_block_size= share->base.block_size; + + if (share->data_file_type == BLOCK_RECORD || + ((param->testflag & T_UNPACK) && + share->state.header.org_data_file_type == BLOCK_RECORD)) + { + if (param->testflag & T_SORT_RECORDS) + { + _ma_check_print_error(param, + "Record format used by '%s' is is not yet supported with repair/check", + filename); + param->error_printed= 0; + error= 1; + goto end2; + } + /* We can't do parallell repair with BLOCK_RECORD yet */ + if (param->testflag & (T_REP_BY_SORT | T_REP_PARALLEL)) + { + param->testflag&= ~(T_REP_BY_SORT | T_REP_PARALLEL); + param->testflag|= T_REP; + } + } + + /* + Skip the checking of the file if: + We are using --fast and the table is closed properly + We are using --check-only-changed-tables 
and the table hasn't changed + */ + if (param->testflag & (T_FAST | T_CHECK_ONLY_CHANGED)) + { + my_bool need_to_check= (maria_is_crashed(info) || + share->state.open_count != 0); + + if ((param->testflag & (T_REP_ANY | T_SORT_RECORDS)) && + ((share->state.changed & (STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR) || + !(param->testflag & T_CHECK_ONLY_CHANGED)))) + need_to_check=1; + + if (info->s->base.keys && info->state->records) + { + if ((param->testflag & T_STATISTICS) && + (share->state.changed & STATE_NOT_ANALYZED)) + need_to_check=1; + if ((param->testflag & T_SORT_INDEX) && + (share->state.changed & STATE_NOT_SORTED_PAGES)) + need_to_check=1; + if ((param->testflag & T_REP_BY_SORT) && + (share->state.changed & STATE_NOT_OPTIMIZED_KEYS)) + need_to_check=1; + } + if ((param->testflag & T_CHECK_ONLY_CHANGED) && + (share->state.changed & (STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR))) + need_to_check=1; + if (!need_to_check) + { + if (!(param->testflag & T_SILENT) || param->testflag & T_INFO) + printf("MARIA file: %s is already checked\n",filename); + if (maria_close(info)) + { + _ma_check_print_error(param,"%d when closing MARIA-table '%s'", + my_errno,filename); + DBUG_RETURN(1); + } + DBUG_RETURN(0); + } + } + if ((param->testflag & (T_REP_ANY | T_STATISTICS | + T_SORT_RECORDS | T_SORT_INDEX)) && + (((param->testflag & T_UNPACK) && + share->data_file_type == COMPRESSED_RECORD) || + mi_uint2korr(share->state.header.state_info_length) != + MARIA_STATE_INFO_SIZE || + mi_uint2korr(share->state.header.base_info_length) != + MARIA_BASE_INFO_SIZE || + maria_is_any_intersect_keys_active(param->keys_in_use, share->base.keys, + ~share->state.key_map) || + maria_test_if_almost_full(info) || + info->s->state.header.file_version[3] != maria_file_magic[3] || + (set_collation && + set_collation->number != share->state.header.language))) + { + if (set_collation) + param->language= set_collation->number; + if (maria_recreate_table(param, 
&info,filename)) + { + VOID(fprintf(stderr, + "MARIA-table '%s' is not fixed because of errors\n", + filename)); + return(-1); + } + recreate=1; + if (!(param->testflag & T_REP_ANY)) + { + param->testflag|=T_REP_BY_SORT; /* if only STATISTICS */ + if (!(param->testflag & T_SILENT)) + printf("- '%s' has old table-format. Recreating index\n",filename); + rep_quick|=T_QUICK; + } + share=info->s; + share->tot_locks-= share->r_locks; + share->r_locks=0; + } + + if (param->testflag & T_DESCRIPT) + { + param->total_files++; + param->total_records+=info->state->records; + param->total_deleted+=info->state->del; + descript(param, info, filename); + maria_close(info); /* Should always succeed */ + return(0); + } + + if (!stopwords_inited++) + ft_init_stopwords(); + + if (!(param->testflag & T_READONLY)) + lock_type = F_WRLCK; /* table is changed */ + else + lock_type= F_RDLCK; + if (info->lock_type == F_RDLCK) + info->lock_type=F_UNLCK; /* Read only table */ + if (_ma_readinfo(info,lock_type,0)) + { + _ma_check_print_error(param,"Can't lock indexfile of '%s', error: %d", + filename,my_errno); + param->error_printed=0; + error= 1; + goto end2; + } + /* + _ma_readinfo() has locked the table. + We mark the table as locked (without doing file locks) to be able to + use functions that only works on locked tables (like row caching). 
+ */ + maria_lock_database(info, F_EXTRA_LCK); + datafile= info->dfile.file; + if (init_pagecache(maria_pagecache, param->use_buffers, 0, 0, + maria_block_size) == 0) + { + _ma_check_print_error(param, "Can't initialize page cache with %lu memory", + (ulong) param->use_buffers); + error= 1; + goto end2; + } + + if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX)) + { + if (param->testflag & T_REP_ANY) + { + ulonglong tmp=share->state.key_map; + maria_copy_keys_active(share->state.key_map, share->base.keys, + param->keys_in_use); + if (tmp != share->state.key_map) + info->update|=HA_STATE_CHANGED; + } + if (rep_quick && + maria_chk_del(param, info, param->testflag & ~T_VERBOSE)) + { + if (param->testflag & T_FORCE_CREATE) + { + rep_quick=0; + _ma_check_print_info(param,"Creating new data file\n"); + } + else + { + error=1; + _ma_check_print_error(param, + "Quick-recover aborted; Run recovery without switch 'q'"); + } + } + if (!error) + { + /* + Tell the server's Recovery to ignore old REDOs on this table; we don't + know what the log's end LSN is now, so we just let the server know + that it will have to find and store it. + This is the only case where create_rename_lsn can be a horizon and not + a LSN. 
+ */ + if (share->base.born_transactional) + share->state.create_rename_lsn= share->state.is_of_horizon= + LSN_REPAIRED_BY_MARIA_CHK; + if ((param->testflag & (T_REP_BY_SORT | T_REP_PARALLEL)) && + (maria_is_any_key_active(share->state.key_map) || + (rep_quick && !param->keys_in_use && !recreate)) && + maria_test_if_sort_rep(info, info->state->records, + info->s->state.key_map, + param->force_sort)) + { + if (param->testflag & T_REP_BY_SORT) + error=maria_repair_by_sort(param,info,filename,rep_quick); + else + error=maria_repair_parallel(param,info,filename,rep_quick); + state_updated=1; + } + else if (param->testflag & T_REP_ANY) + error=maria_repair(param, info,filename,rep_quick); + } + if (!error && param->testflag & T_SORT_RECORDS) + { + /* + The data file is nowadays reopened in the repair code so we should + soon remove the following reopen-code + */ +#ifndef TO_BE_REMOVED + if (param->out_flag & O_NEW_DATA) + { /* Change temp file to org file */ + VOID(my_close(info->dfile.file, MYF(MY_WME))); /* Close new file */ + error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT, + MYF(0)); + if (_ma_open_datafile(info,info->s, -1)) + error=1; + param->out_flag&= ~O_NEW_DATA; /* We are using new datafile */ + param->read_cache.file= info->dfile.file; + } +#endif + if (! error) + { + uint key; + /* + We can't update the index in maria_sort_records if we have a + prefix compressed or fulltext index + */ + my_bool update_index=1; + for (key=0 ; key < share->base.keys; key++) + if (share->keyinfo[key].flag & (HA_BINARY_PACK_KEY|HA_FULLTEXT)) + update_index=0; + + error=maria_sort_records(param,info,filename,param->opt_sort_key, + /* what is the following parameter for ? 
*/ + (my_bool) !(param->testflag & T_REP), + update_index); + datafile= info->dfile.file; /* This is now locked */ + if (!error && !update_index) + { + if (param->verbose) + puts("Table had a compressed index; We must now recreate the index"); + error=maria_repair_by_sort(param,info,filename,1); + } + } + } + if (!error && param->testflag & T_SORT_INDEX) + error=maria_sort_index(param,info,filename); + if (!error) + share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR); + else + maria_mark_crashed(info); + } + else if ((param->testflag & T_CHECK) || !(param->testflag & T_AUTO_INC)) + { + if (!(param->testflag & T_SILENT) || param->testflag & T_INFO) + printf("Checking MARIA file: %s\n",filename); + if (!(param->testflag & T_SILENT)) + printf("Data records: %7s Deleted blocks: %7s\n", + llstr(info->state->records,llbuff), + llstr(info->state->del,llbuff2)); + error =maria_chk_status(param,info); + maria_intersect_keys_active(share->state.key_map, param->keys_in_use); + error =maria_chk_size(param,info); + if (!error || !(param->testflag & (T_FAST | T_FORCE_CREATE))) + error|=maria_chk_del(param, info,param->testflag); + if ((!error || (!(param->testflag & (T_FAST | T_FORCE_CREATE)) && + !param->start_check_pos))) + { + error|=maria_chk_key(param, info); + if (!error && (param->testflag & (T_STATISTICS | T_AUTO_INC))) + error=maria_update_state_info(param, info, + ((param->testflag & T_STATISTICS) ? + UPDATE_STAT : 0) | + ((param->testflag & T_AUTO_INC) ? + UPDATE_AUTO_INC : 0)); + } + if ((!rep_quick && !error) || + !(param->testflag & (T_FAST | T_FORCE_CREATE))) + { + VOID(init_io_cache(¶m->read_cache,datafile, + (uint) param->read_buffer_length, + READ_CACHE, + (param->start_check_pos ? 
+ param->start_check_pos : + share->pack.header_length), + 1, + MYF(MY_WME))); + maria_lock_memory(param); + if ((info->s->data_file_type != STATIC_RECORD) || + (param->testflag & (T_EXTEND | T_MEDIUM))) + error|=maria_chk_data_link(param, info, param->testflag & T_EXTEND); + error|= _ma_flush_table_files_after_repair(param, info); + VOID(end_io_cache(¶m->read_cache)); + } + if (!error) + { + if ((share->state.changed & STATE_CHANGED) && + (param->testflag & T_UPDATE_STATE)) + info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED | + STATE_CRASHED_ON_REPAIR); + } + else if (!maria_is_crashed(info) && + (param->testflag & T_UPDATE_STATE)) + { /* Mark crashed */ + maria_mark_crashed(info); + info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + } + } + + if ((param->testflag & T_AUTO_INC) || + ((param->testflag & T_REP_ANY) && info->s->base.auto_key)) + _ma_update_auto_increment_key(param, info, + (my_bool) !test(param->testflag & T_AUTO_INC)); + + if (info->update & HA_STATE_CHANGED && ! (param->testflag & T_READONLY)) + error|=maria_update_state_info(param, info, + UPDATE_OPEN_COUNT | + (((param->testflag & T_REP_ANY) ? + UPDATE_TIME : 0) | + (state_updated ? UPDATE_STAT : 0) | + ((param->testflag & T_SORT_RECORDS) ? + UPDATE_SORT : 0))); + info->update&= ~HA_STATE_CHANGED; + maria_lock_database(info, F_UNLCK); + +end2: + end_pagecache(maria_pagecache, 1); + if (maria_close(info)) + { + _ma_check_print_error(param,"%d when closing MARIA-table '%s'", + my_errno,filename); + DBUG_RETURN(1); + } + if (error == 0) + { + if (param->out_flag & O_NEW_DATA) + error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT, + ((param->testflag & T_BACKUP_DATA) ? 
+ MYF(MY_REDEL_MAKE_BACKUP) : MYF(0))); + if (param->out_flag & O_NEW_INDEX) + error|=maria_change_to_newfile(filename,MARIA_NAME_IEXT,INDEX_TMP_EXT, + MYF(0)); + } + VOID(fflush(stdout)); VOID(fflush(stderr)); + if (param->error_printed) + { + if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX)) + { + VOID(fprintf(stderr, + "MARIA-table '%s' is not fixed because of errors\n", + filename)); + if (param->testflag & T_REP_ANY) + VOID(fprintf(stderr, + "Try fixing it by using the --safe-recover (-o), the --force (-f) option or by not using the --quick (-q) flag\n")); + } + else if (!(param->error_printed & 2) && + !(param->testflag & T_FORCE_CREATE)) + VOID(fprintf(stderr, + "MARIA-table '%s' is corrupted\nFix it using switch \"-r\" or \"-o\"\n", + filename)); + } + else if (param->warning_printed && + ! (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX | + T_FORCE_CREATE))) + VOID(fprintf(stderr, "MARIA-table '%s' is usable but should be fixed\n", + filename)); + VOID(fflush(stderr)); + DBUG_RETURN(error); +} /* maria_chk */ + + +/* Write info about table */ + +static void descript(HA_CHECK *param, register MARIA_HA *info, char *name) +{ + uint key,keyseg_nr,field; + reg3 MARIA_KEYDEF *keyinfo; + reg2 HA_KEYSEG *keyseg; + reg4 const char *text; + char buff[160],length[10],*pos,*end; + enum en_fieldtype type; + MARIA_SHARE *share=info->s; + char llbuff[22],llbuff2[22]; + DBUG_ENTER("describe"); + + if (param->testflag & T_VERY_SILENT) + { + longlong checksum= info->state->checksum; + if (!(share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))) + checksum= 0; + printf("%s %s %s\n", name, llstr(info->state->records,llbuff), + llstr(checksum, llbuff2)); + DBUG_VOID_RETURN; + } + + printf("\nMARIA file: %s\n",name); + printf("Record format: %s\n", record_formats[share->data_file_type]); + printf("Character set: %s (%d)\n", + get_charset_name(share->state.header.language), + share->state.header.language); + + if (param->testflag & 
T_VERBOSE) + { + printf("File-version: %d\n", + (int) share->state.header.file_version[3]); + if (share->state.create_time) + { + get_date(buff,1,share->state.create_time); + printf("Creation time: %s\n",buff); + } + if (share->state.check_time) + { + get_date(buff,1,share->state.check_time); + printf("Recover time: %s\n",buff); + } + pos=buff; + if (share->state.changed & STATE_CRASHED) + strmov(buff,"crashed"); + else + { + if (share->state.open_count) + pos=strmov(pos,"open,"); + if (share->state.changed & STATE_CHANGED) + pos=strmov(pos,"changed,"); + else + pos=strmov(pos,"checked,"); + if (!(share->state.changed & STATE_NOT_ANALYZED)) + pos=strmov(pos,"analyzed,"); + if (!(share->state.changed & STATE_NOT_OPTIMIZED_KEYS)) + pos=strmov(pos,"optimized keys,"); + if (!(share->state.changed & STATE_NOT_SORTED_PAGES)) + pos=strmov(pos,"sorted index pages,"); + pos[-1]=0; /* Remove extra ',' */ + } + printf("Status: %s\n",buff); + if (share->base.auto_key) + { + printf("Auto increment key: %16d Last value: %18s\n", + share->base.auto_key, + llstr(share->state.auto_increment,llbuff)); + } + if (share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)) + printf("Checksum: %26s\n",llstr(info->state->checksum,llbuff)); +; + if (share->options & HA_OPTION_DELAY_KEY_WRITE) + printf("Keys are only flushed at close\n"); + + } + printf("Data records: %16s Deleted blocks: %18s\n", + llstr(info->state->records,llbuff),llstr(info->state->del,llbuff2)); + if (param->testflag & T_SILENT) + DBUG_VOID_RETURN; /* This is enough */ + + if (param->testflag & T_VERBOSE) + { +#ifdef USE_RELOC + printf("Init-relocation: %16s\n",llstr(share->base.reloc,llbuff)); +#endif + printf("Datafile parts: %16s Deleted data: %18s\n", + llstr(share->state.split,llbuff), + llstr(info->state->empty,llbuff2)); + printf("Datafile pointer (bytes): %11d Keyfile pointer (bytes): %13d\n", + share->rec_reflength,share->base.key_reflength); + printf("Datafile length: %16s Keyfile length: %18s\n", + 
llstr(info->state->data_file_length,llbuff), + llstr(info->state->key_file_length,llbuff2)); + + if (info->s->base.reloc == 1L && info->s->base.records == 1L) + puts("This is a one-record table"); + else + { + if (share->base.max_data_file_length != HA_OFFSET_ERROR || + share->base.max_key_file_length != HA_OFFSET_ERROR) + printf("Max datafile length: %16s Max keyfile length: %18s\n", + llstr(share->base.max_data_file_length-1,llbuff), + llstr(share->base.max_key_file_length-1,llbuff2)); + } + } + printf("Block_size: %16d\n",(int) share->block_size); + printf("Recordlength: %16d\n",(int) share->base.pack_reclength); + if (! maria_is_all_keys_active(share->state.key_map, share->base.keys)) + { + longlong2str(share->state.key_map,buff,2); + printf("Using only keys '%s' of %d possibly keys\n", + buff, share->base.keys); + } + puts("\ntable description:"); + printf("Key Start Len Index Type"); + if (param->testflag & T_VERBOSE) + printf(" Rec/key Root Blocksize"); + VOID(putchar('\n')); + + for (key=keyseg_nr=0, keyinfo= &share->keyinfo[0] ; + key < share->base.keys; + key++,keyinfo++) + { + keyseg=keyinfo->seg; + if (keyinfo->flag & HA_NOSAME) text="unique "; + else if (keyinfo->flag & HA_FULLTEXT) text="fulltext "; + else text="multip."; + + pos=buff; + if (keyseg->flag & HA_REVERSE_SORT) + *pos++ = '-'; + pos=strmov(pos,type_names[keyseg->type]); + *pos++ = ' '; + *pos=0; + if (keyinfo->flag & HA_PACK_KEY) + pos=strmov(pos,prefix_packed_txt); + if (keyinfo->flag & HA_BINARY_PACK_KEY) + pos=strmov(pos,bin_packed_txt); + if (keyseg->flag & HA_SPACE_PACK) + pos=strmov(pos,diff_txt); + if (keyseg->flag & HA_BLOB_PART) + pos=strmov(pos,blob_txt); + if (keyseg->flag & HA_NULL_PART) + pos=strmov(pos,null_txt); + *pos=0; + + printf("%-4d%-6ld%-3d %-8s%-21s", + key+1,(long) keyseg->start+1,keyseg->length,text,buff); + if (share->state.key_root[key] != HA_OFFSET_ERROR) + llstr(share->state.key_root[key],buff); + else + buff[0]=0; + if (param->testflag & T_VERBOSE) + 
printf("%11lu %12s %10d", + share->state.rec_per_key_part[keyseg_nr++], + buff,keyinfo->block_length); + VOID(putchar('\n')); + while ((++keyseg)->type != HA_KEYTYPE_END) + { + pos=buff; + if (keyseg->flag & HA_REVERSE_SORT) + *pos++ = '-'; + pos=strmov(pos,type_names[keyseg->type]); + *pos++= ' '; + if (keyseg->flag & HA_SPACE_PACK) + pos=strmov(pos,diff_txt); + if (keyseg->flag & HA_BLOB_PART) + pos=strmov(pos,blob_txt); + if (keyseg->flag & HA_NULL_PART) + pos=strmov(pos,null_txt); + *pos=0; + printf(" %-6ld%-3d %-21s", + (long) keyseg->start+1,keyseg->length,buff); + if (param->testflag & T_VERBOSE) + printf("%11lu", share->state.rec_per_key_part[keyseg_nr++]); + VOID(putchar('\n')); + } + keyseg++; + } + if (share->state.header.uniques) + { + MARIA_UNIQUEDEF *uniqueinfo; + puts("\nUnique Key Start Len Nullpos Nullbit Type"); + for (key=0,uniqueinfo= &share->uniqueinfo[0] ; + key < share->state.header.uniques; key++, uniqueinfo++) + { + my_bool new_row=0; + char null_bit[8],null_pos[8]; + printf("%-8d%-5d",key+1,uniqueinfo->key+1); + for (keyseg=uniqueinfo->seg ; keyseg->type != HA_KEYTYPE_END ; keyseg++) + { + if (new_row) + fputs(" ",stdout); + null_bit[0]=null_pos[0]=0; + if (keyseg->null_bit) + { + sprintf(null_bit,"%d",keyseg->null_bit); + sprintf(null_pos,"%ld",(long) keyseg->null_pos+1); + } + printf("%-7ld%-5d%-9s%-10s%-30s\n", + (long) keyseg->start+1,keyseg->length, + null_pos,null_bit, + type_names[keyseg->type]); + new_row=1; + } + } + } + if (param->verbose > 1) + { + char null_bit[8],null_pos[8]; + printf("\nField Start Length Nullpos Nullbit Type"); + if (share->options & HA_OPTION_COMPRESS_RECORD) + printf(" Huff tree Bits"); + VOID(putchar('\n')); + + for (field=0 ; field < share->base.fields ; field++) + { + if (share->options & HA_OPTION_COMPRESS_RECORD) + type=share->columndef[field].base_type; + else + type=(enum en_fieldtype) share->columndef[field].type; + end=strmov(buff,field_pack[type]); + if (share->options & 
HA_OPTION_COMPRESS_RECORD) + { + if (share->columndef[field].pack_type & PACK_TYPE_SELECTED) + end=strmov(end,", not_always"); + if (share->columndef[field].pack_type & PACK_TYPE_SPACE_FIELDS) + end=strmov(end,", no empty"); + if (share->columndef[field].pack_type & PACK_TYPE_ZERO_FILL) + { + sprintf(end,", zerofill(%d)",share->columndef[field].space_length_bits); + end=strend(end); + } + } + if (buff[0] == ',') + strmov(buff,buff+2); + int10_to_str((long) share->columndef[field].length,length,10); + null_bit[0]=null_pos[0]=0; + if (share->columndef[field].null_bit) + { + sprintf(null_bit,"%d",share->columndef[field].null_bit); + sprintf(null_pos,"%d",share->columndef[field].null_pos+1); + } + printf("%-6d%-6u%-7s%-8s%-8s%-35s",field+1, + (uint) share->columndef[field].offset+1, + length, null_pos, null_bit, buff); + if (share->options & HA_OPTION_COMPRESS_RECORD) + { + if (share->columndef[field].huff_tree) + printf("%3d %2d", + (uint) (share->columndef[field].huff_tree-share->decode_trees)+1, + share->columndef[field].huff_tree->quick_table_bits); + } + VOID(putchar('\n')); + } + } + DBUG_VOID_RETURN; +} /* describe */ + + + /* Sort records according to one key */ + +static int maria_sort_records(HA_CHECK *param, + register MARIA_HA *info, char *name, + uint sort_key, + my_bool write_info, + my_bool update_index) +{ + int got_error; + uint key; + MARIA_KEYDEF *keyinfo; + File new_file; + uchar *temp_buff; + ha_rows old_record_count; + MARIA_SHARE *share=info->s; + char llbuff[22],llbuff2[22]; + MARIA_SORT_INFO sort_info; + MARIA_SORT_PARAM sort_param; + DBUG_ENTER("sort_records"); + + bzero((char*)&sort_info,sizeof(sort_info)); + bzero((char*)&sort_param,sizeof(sort_param)); + sort_param.sort_info=&sort_info; + sort_info.param=param; + keyinfo= &share->keyinfo[sort_key]; + got_error=1; + temp_buff=0; + new_file= -1; + + if (! 
maria_is_key_active(share->state.key_map, sort_key)) + { + _ma_check_print_warning(param, + "Can't sort table '%s' on key %d; No such key", + name,sort_key+1); + param->error_printed=0; + DBUG_RETURN(0); /* Nothing to do */ + } + if (keyinfo->flag & HA_FULLTEXT) + { + _ma_check_print_warning(param,"Can't sort table '%s' on FULLTEXT key %d", + name,sort_key+1); + param->error_printed=0; + DBUG_RETURN(0); /* Nothing to do */ + } + if (share->data_file_type == COMPRESSED_RECORD) + { + _ma_check_print_warning(param,"Can't sort read-only table '%s'", name); + param->error_printed=0; + DBUG_RETURN(0); /* Nothing to do */ + } + if (!(param->testflag & T_SILENT)) + { + printf("- Sorting records for MARIA-table '%s'\n",name); + if (write_info) + printf("Data records: %9s Deleted: %9s\n", + llstr(info->state->records,llbuff), + llstr(info->state->del,llbuff2)); + } + if (share->state.key_root[sort_key] == HA_OFFSET_ERROR) + DBUG_RETURN(0); /* Nothing to do */ + + if (init_io_cache(&info->rec_cache,-1,(uint) param->write_buffer_length, + WRITE_CACHE,share->pack.header_length,1, + MYF(MY_WME | MY_WAIT_IF_FULL))) + goto err; + info->opt_flag|=WRITE_CACHE_USED; + + if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length))) + { + _ma_check_print_error(param,"Not enough memory for key block"); + goto err; + } + if (!(sort_param.record=(uchar*) my_malloc((uint) share->base.pack_reclength, + MYF(0)))) + { + _ma_check_print_error(param,"Not enough memory for record"); + goto err; + } + fn_format(param->temp_filename,name,"", MARIA_NAME_DEXT,2+4+32); + new_file= my_create(fn_format(param->temp_filename, + param->temp_filename,"", + DATA_TMP_EXT, + MY_REPLACE_EXT | MY_UNPACK_FILENAME), + 0, param->tmpfile_createflag, + MYF(0)); + if (new_file < 0) + { + _ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + goto err; + } + if (share->pack.header_length) + if (maria_filecopy(param, new_file, info->dfile.file, 0L, + share->pack.header_length, + 
"datafile-header")) + goto err; + info->rec_cache.file=new_file; /* Use this file for cacheing*/ + + maria_lock_memory(param); + for (key=0 ; key < share->base.keys ; key++) + share->keyinfo[key].flag|= HA_SORT_ALLOWS_SAME; + + if (my_pread(share->kfile.file, temp_buff, + (uint) keyinfo->block_length, + share->state.key_root[sort_key], + MYF(MY_NABP+MY_WME))) + { + _ma_check_print_error(param,"Can't read indexpage from filepos: %s", + (ulong) share->state.key_root[sort_key]); + goto err; + } + + /* Setup param for _ma_sort_write_record */ + sort_info.info=info; + sort_info.new_data_file_type=share->data_file_type; + sort_param.fix_datafile=1; + sort_param.master=1; + sort_param.filepos=share->pack.header_length; + old_record_count=info->state->records; + info->state->records=0; + if (sort_info.new_data_file_type != COMPRESSED_RECORD) + info->state->checksum=0; + + if (sort_record_index(&sort_param,info,keyinfo, + share->state.key_root[sort_key], + temp_buff, sort_key,new_file,update_index) || + maria_write_data_suffix(&sort_info,1) || + flush_io_cache(&info->rec_cache)) + goto err; + + if (info->state->records != old_record_count) + { + _ma_check_print_error(param,"found %s of %s records", + llstr(info->state->records,llbuff), + llstr(old_record_count,llbuff2)); + goto err; + } + + VOID(my_close(info->dfile.file, MYF(MY_WME))); + param->out_flag|=O_NEW_DATA; /* Data in new file */ + info->dfile.file= new_file; /* Use new datafile */ + info->state->del=0; + info->state->empty=0; + share->state.dellink= HA_OFFSET_ERROR; + info->state->data_file_length=sort_param.filepos; + share->state.split=info->state->records; /* Only hole records */ + share->state.version=(ulong) time((time_t*) 0); + + info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + if (param->testflag & T_WRITE_LOOP) + { + VOID(fputs(" \r",stdout)); VOID(fflush(stdout)); + } + got_error=0; + +err: + if (got_error && new_file >= 0) + { + VOID(end_io_cache(&info->rec_cache)); + (void) 
my_close(new_file,MYF(MY_WME)); + (void) my_delete(param->temp_filename, MYF(MY_WME)); + } + if (temp_buff) + { + my_afree((uchar*) temp_buff); + } + my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR)); + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + VOID(end_io_cache(&info->rec_cache)); + my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR)); + sort_info.buff=0; + share->state.sortkey=sort_key; + DBUG_RETURN(_ma_flush_table_files_after_repair(param, info) | got_error); +} /* sort_records */ + + +/* Sort records recursive using one index */ + +static int sort_record_index(MARIA_SORT_PARAM *sort_param,MARIA_HA *info, + MARIA_KEYDEF *keyinfo, + my_off_t page, uchar *buff, uint sort_key, + File new_file,my_bool update_index) +{ + uint nod_flag,used_length,key_length; + uchar *temp_buff,*keypos,*endpos; + my_off_t next_page,rec_pos; + uchar lastkey[HA_MAX_KEY_BUFF]; + char llbuff[22]; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + HA_CHECK *param=sort_info->param; + DBUG_ENTER("sort_record_index"); + + nod_flag=_ma_test_if_nod(buff); + temp_buff=0; + + if (nod_flag) + { + if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length))) + { + _ma_check_print_error(param,"Not Enough memory"); + DBUG_RETURN(-1); + } + } + used_length= maria_data_on_page(buff); + keypos=buff+2+nod_flag; + endpos=buff+used_length; + for ( ;; ) + { + _sanity(__FILE__,__LINE__); + if (nod_flag) + { + next_page= _ma_kpos(nod_flag, keypos); + if (my_pread(info->s->kfile.file, (uchar*)temp_buff, + (uint) keyinfo->block_length, next_page, + MYF(MY_NABP+MY_WME))) + { + _ma_check_print_error(param,"Can't read keys from filepos: %s", + llstr(next_page,llbuff)); + goto err; + } + if (sort_record_index(sort_param, info,keyinfo,next_page,temp_buff, + sort_key, + new_file, update_index)) + goto err; + } + _sanity(__FILE__,__LINE__); + if (keypos >= endpos || + (key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&keypos,lastkey)) + == 0) + break; + rec_pos= _ma_dpos(info,0,lastkey+key_length); 
+
+    if ((*info->s->read_record)(info,sort_param->record,rec_pos))
+    {
+      _ma_check_print_error(param,"%d when reading datafile",my_errno);
+      goto err;
+    }
+    /* Row will land at a new position: repoint the key if allowed to */
+    if (rec_pos != sort_param->filepos && update_index)
+    {
+      _ma_dpointer(info,keypos-nod_flag-info->s->rec_reflength,
+                   sort_param->filepos);
+      if (maria_movepoint(info,sort_param->record,rec_pos,sort_param->filepos,
+                          sort_key))
+      {
+        _ma_check_print_error(param,"%d when updating key-pointers",my_errno);
+        goto err;
+      }
+    }
+    if (_ma_sort_write_record(sort_param))
+      goto err;
+  }
+  /* Clear end of block to get better compression if the table is backuped */
+  bzero((uchar*) buff+used_length,keyinfo->block_length-used_length);
+  if (my_pwrite(info->s->kfile.file, (uchar*)buff, (uint)keyinfo->block_length,
+                page,param->myf_rw))
+  {
+    _ma_check_print_error(param,"%d when updating keyblock",my_errno);
+    goto err;
+  }
+  if (temp_buff)
+    my_afree((uchar*) temp_buff);
+  DBUG_RETURN(0);
+err:
+  if (temp_buff)
+    my_afree((uchar*) temp_buff);
+  DBUG_RETURN(1);
+} /* sort_record_index */
+
+
+
+/*
+  Check if maria_chk was killed by a signal
+  This is overloaded by other programs that want to be able to abort
+  sorting
+*/
+
+static int not_killed= 0;
+
+volatile int *_ma_killed_ptr(HA_CHECK *param __attribute__((unused)))
+{
+  /* Returns the address of not_killed, which is initialised to 0 above */
+  return &not_killed;
+}
+
+        /* print warnings and errors */
+        /* VARARGS */
+
+/* Print a printf-style informational message plus a newline on stdout */
+void _ma_check_print_info(HA_CHECK *param __attribute__((unused)),
+                          const char *fmt,...)
+{
+  va_list args;
+  DBUG_ENTER("_ma_check_print_info");
+  DBUG_PRINT("enter", ("format: %s", fmt));
+
+  va_start(args,fmt);
+  VOID(vfprintf(stdout, fmt, args));
+  VOID(fputc('\n',stdout));
+  va_end(args);
+  DBUG_VOID_RETURN;
+}
+
+/* VARARGS */
+
+void _ma_check_print_warning(HA_CHECK *param, const char *fmt,...)
+{ + va_list args; + DBUG_ENTER("_ma_check_print_warning"); + DBUG_PRINT("enter", ("format: %s", fmt)); + + fflush(stdout); + if (!param->warning_printed && !param->error_printed) + { + if (param->testflag & T_SILENT) + fprintf(stderr,"%s: MARIA file %s\n",my_progname_short, + param->isam_file_name); + param->out_flag|= O_DATA_LOST; + } + param->warning_printed=1; + va_start(args,fmt); + fprintf(stderr,"%s: warning: ",my_progname_short); + VOID(vfprintf(stderr, fmt, args)); + VOID(fputc('\n',stderr)); + fflush(stderr); + va_end(args); + DBUG_VOID_RETURN; +} + +/* VARARGS */ + +void _ma_check_print_error(HA_CHECK *param, const char *fmt,...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_error"); + DBUG_PRINT("enter", ("format: %s", fmt)); + + fflush(stdout); + if (!param->warning_printed && !param->error_printed) + { + if (param->testflag & T_SILENT) + fprintf(stderr,"%s: MARIA file %s\n",my_progname_short,param->isam_file_name); + param->out_flag|= O_DATA_LOST; + } + param->error_printed|=1; + va_start(args,fmt); + fprintf(stderr,"%s: error: ",my_progname_short); + VOID(vfprintf(stderr, fmt, args)); + VOID(fputc('\n',stderr)); + fflush(stderr); + va_end(args); + DBUG_VOID_RETURN; +} diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h new file mode 100644 index 00000000000..09852f4dc86 --- /dev/null +++ b/storage/maria/maria_def.h @@ -0,0 +1,958 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* This file is included by all internal maria files */ + +#include "maria.h" /* Structs & some defines */ +#include <myisampack.h> /* packing of keys */ +#include <my_tree.h> +#include <my_bitmap.h> +#ifdef THREAD +#include <my_pthread.h> +#include <thr_lock.h> +#else +#include <my_no_pthread.h> +#endif + +#include "ma_loghandler.h" +#include "ma_control_file.h" + +#define MAX_NONMAPPED_INSERTS 1000 +#define MARIA_MAX_TREE_LEVELS 32 +#define SANITY_CHECKS + +struct st_transaction; + +/* undef map from my_nosys; We need test-if-disk full */ +#undef my_write + +typedef struct st_maria_status_info +{ + ha_rows records; /* Rows in table */ + ha_rows del; /* Removed rows */ + my_off_t empty; /* lost space in datafile */ + my_off_t key_empty; /* lost space in indexfile */ + my_off_t key_file_length; + my_off_t data_file_length; + ha_checksum checksum; +} MARIA_STATUS_INFO; + +typedef struct st_maria_state_info +{ + struct + { /* Fileheader */ + uchar file_version[4]; + uchar options[2]; + uchar header_length[2]; + uchar state_info_length[2]; + uchar base_info_length[2]; + uchar base_pos[2]; + uchar key_parts[2]; /* Key parts */ + uchar unique_key_parts[2]; /* Key parts + unique parts */ + uchar keys; /* number of keys in file */ + uchar uniques; /* number of UNIQUE definitions */ + uchar language; /* Language for indexes */ + uchar fulltext_keys; + uchar data_file_type; + /* Used by mariapack to store the original data_file_type */ + uchar org_data_file_type; + } header; + + MARIA_STATUS_INFO state; + ha_rows split; /* number of split blocks */ + my_off_t dellink; /* Link to next removed block */ + ulonglong first_bitmap_with_space; + ulonglong auto_increment; + ulong process; /* process that updated table last */ + ulong unique; /* Unique number for this 
process */ + ulong update_count; /* Updated for each write lock */ + ulong status; + ulong *rec_per_key_part; + ha_checksum checksum; /* Table checksum */ + my_off_t *key_root; /* Start of key trees */ + my_off_t key_del; /* delete links for index pages */ + my_off_t rec_per_key_rows; /* Rows when calculating rec_per_key */ + + ulong sec_index_changed; /* Updated when new sec_index */ + ulong sec_index_used; /* which extra index are in use */ + ulonglong key_map; /* Which keys are in use */ + ulong version; /* timestamp of create */ + time_t create_time; /* Time when created database */ + time_t recover_time; /* Time for last recover */ + time_t check_time; /* Time for last check */ + uint sortkey; /* sorted by this key (not used) */ + uint open_count; + uint8 changed; /* Changed since mariachk */ + LSN create_rename_lsn; /**< LSN when table was last created/renamed */ + /** @brief Log horizon when state was last updated on disk */ + TRANSLOG_ADDRESS is_of_horizon; + + /* the following isn't saved on disk */ + uint state_diff_length; /* Should be 0 */ + uint state_length; /* Length of state header in file */ + ulong *key_info; +} MARIA_STATE_INFO; + + +#define MARIA_STATE_INFO_SIZE \ + (24 + LSN_STORE_SIZE*2 + 4 + 11*8 + 4*4 + 8 + 3*4 + 5*8) +#define MARIA_STATE_KEY_SIZE 8 +#define MARIA_STATE_KEYBLOCK_SIZE 8 +#define MARIA_STATE_KEYSEG_SIZE 4 +#define MARIA_STATE_EXTRA_SIZE (MARIA_MAX_KEY*MARIA_STATE_KEY_SIZE + MARIA_MAX_KEY*HA_MAX_KEY_SEG*MARIA_STATE_KEYSEG_SIZE) +#define MARIA_KEYDEF_SIZE (2+ 5*2) +#define MARIA_UNIQUEDEF_SIZE (2+1+1) +#define HA_KEYSEG_SIZE (6+ 2*2 + 4*2) +#define MARIA_COLUMNDEF_SIZE (6+2+2+2+2+2+1+1) +#define MARIA_BASE_INFO_SIZE (5*8 + 6*4 + 11*2 + 6 + 5*2 + 1 + 16) +#define MARIA_INDEX_BLOCK_MARGIN 16 /* Safety margin for .MYI tables */ +/* Internal management bytes needed to store 2 keys on an index page */ +#define MARIA_INDEX_MIN_OVERHEAD_SIZE (4 + (TRANSID_SIZE+1) * 2) + +/* + Basic information of the Maria table. 
This is stored on disk + and not changed (unless we do DLL changes). +*/ + +typedef struct st_ma_base_info +{ + my_off_t keystart; /* Start of keys */ + my_off_t max_data_file_length; + my_off_t max_key_file_length; + my_off_t margin_key_file_length; + ha_rows records, reloc; /* Create information */ + ulong mean_row_length; /* Create information */ + ulong reclength; /* length of unpacked record */ + ulong pack_reclength; /* Length of full packed rec */ + ulong min_pack_length; + ulong max_pack_length; /* Max possibly length of packed rec */ + ulong min_block_length; + uint fields; /* fields in table */ + uint fixed_not_null_fields; + uint fixed_not_null_fields_length; + uint max_field_lengths; + uint pack_fields; /* packed fields in table */ + uint varlength_fields; /* char/varchar/blobs */ + /* Number of bytes in the index used to refer to a row (2-8) */ + uint rec_reflength; + /* Number of bytes in the index used to refer to another index page (2-8) */ + uint key_reflength; /* = 2-8 */ + uint keys; /* same as in state.header */ + uint auto_key; /* Which key-1 is a auto key */ + uint blobs; /* Number of blobs */ + /* Length of packed bits (when table was created first time) */ + uint pack_bytes; + /* Length of null bits (when table was created first time) */ + uint original_null_bytes; + uint null_bytes; /* Null bytes in record */ + uint field_offsets; /* Number of field offsets */ + uint max_key_block_length; /* Max block length */ + uint max_key_length; /* Max key length */ + /* Extra allocation when using dynamic record format */ + uint extra_alloc_bytes; + uint extra_alloc_procent; + uint is_nulls_extended; /* 1 if new null bytes */ + uint min_row_length; /* Min possible length of a row */ + uint default_row_flag; /* 0 or ROW_FLAG_NULLS_EXTENDED */ + uint block_size; + /* Size of initial record buffer */ + uint default_rec_buff_size; + /* Extra number of bytes the row format require in the record buffer */ + uint extra_rec_buff_size; + + /* The following are 
from the header */ + uint key_parts, all_key_parts; + /** + @brief If false, we disable logging, versioning, transaction etc. Observe + difference with MARIA_SHARE::now_transactional + */ + my_bool born_transactional; +} MARIA_BASE_INFO; + + +/* Structs used intern in database */ + +typedef struct st_maria_blob /* Info of record */ +{ + ulong offset; /* Offset to blob in record */ + uint pack_length; /* Type of packed length */ + ulong length; /* Calc:ed for each record */ +} MARIA_BLOB; + + +typedef struct st_maria_pack +{ + ulong header_length; + uint ref_length; + uchar version; +} MARIA_PACK; + +typedef struct st_maria_file_bitmap +{ + uchar *map; + ulonglong page; /* Page number for current bitmap */ + uint used_size; /* Size of bitmap head that is not 0 */ + my_bool changed; /* 1 if page needs to be flushed */ + PAGECACHE_FILE file; /* datafile where bitmap is stored */ + +#ifdef THREAD + pthread_mutex_t bitmap_lock; +#endif + /* Constants, allocated when initiating bitmaps */ + uint sizes[8]; /* Size per bit combination */ + uint total_size; /* Total usable size of bitmap page */ + uint block_size; /* Block size of file */ + ulong pages_covered; /* Pages covered by bitmap + 1 */ +} MARIA_FILE_BITMAP; + +#define MARIA_CHECKPOINT_LOOKS_AT_ME 1 +#define MARIA_CHECKPOINT_SHOULD_FREE_ME 2 +#define MARIA_CHECKPOINT_SEEN_IN_LOOP 4 + +typedef struct st_maria_share +{ /* Shared between opens */ + MARIA_STATE_INFO state; + MARIA_BASE_INFO base; + MARIA_KEYDEF ft2_keyinfo; /* Second-level ft-key + definition */ + MARIA_KEYDEF *keyinfo; /* Key definitions */ + MARIA_UNIQUEDEF *uniqueinfo; /* unique definitions */ + HA_KEYSEG *keyparts; /* key part info */ + MARIA_COLUMNDEF *columndef; /* Pointer to column information */ + MARIA_PACK pack; /* Data about packed records */ + MARIA_BLOB *blobs; /* Pointer to blobs */ + char *unique_file_name; /* realpath() of index file */ + char *data_file_name; /* Resolved path names from symlinks */ + char *index_file_name; + char 
*open_file_name; /* parameter to open filename */ + uchar *file_map; /* mem-map of file if possible */ + PAGECACHE *pagecache; /* ref to the current key cache */ + MARIA_DECODE_TREE *decode_trees; + uint16 *decode_tables; + uint16 id; /**< 2-byte id by which log records refer to the table */ + /* Called the first time the table instance is opened */ + my_bool (*once_init)(struct st_maria_share *, File); + /* Called when the last instance of the table is closed */ + my_bool (*once_end)(struct st_maria_share *); + /* Is called for every open of the table */ + my_bool (*init)(struct st_maria_info *); + /* Is called for every close of the table */ + void (*end)(struct st_maria_info *); + /* Called when we want to read a record from a specific position */ + int (*read_record)(struct st_maria_info *, uchar *, MARIA_RECORD_POS); + /* Initialize a scan */ + my_bool (*scan_init)(struct st_maria_info *); + /* Read next record while scanning */ + int (*scan)(struct st_maria_info *, uchar *, MARIA_RECORD_POS, my_bool); + /* End scan */ + void (*scan_end)(struct st_maria_info *); + /* Pre-write of row (some handlers may do the actual write here) */ + MARIA_RECORD_POS (*write_record_init)(struct st_maria_info *, const uchar *); + /* Write record (or accept write_record_init) */ + my_bool (*write_record)(struct st_maria_info *, const uchar *); + /* Called when write failed */ + my_bool (*write_record_abort)(struct st_maria_info *); + my_bool (*update_record)(struct st_maria_info *, MARIA_RECORD_POS, + const uchar *, const uchar *); + my_bool (*delete_record)(struct st_maria_info *, const uchar *record); + my_bool (*compare_record)(struct st_maria_info *, const uchar *); + /* calculate checksum for a row */ + ha_checksum(*calc_checksum)(struct st_maria_info *, const uchar *); + /* + Calculate checksum for a row during write. 
May be 0 if we calculate + the checksum in write_record_init() + */ + ha_checksum(*calc_write_checksum)(struct st_maria_info *, const uchar *); + /* calculate checksum for a row during check table */ + ha_checksum(*calc_check_checksum)(struct st_maria_info *, const uchar *); + /* Compare a row in memory with a row on disk */ + my_bool (*compare_unique)(struct st_maria_info *, MARIA_UNIQUEDEF *, + const uchar *record, MARIA_RECORD_POS pos); + /* Mapings to read/write the data file */ + uint (*file_read)(MARIA_HA *, uchar *, uint, my_off_t, myf); + uint (*file_write)(MARIA_HA *, uchar *, uint, my_off_t, myf); + invalidator_by_filename invalidator; /* query cache invalidator */ + ulong this_process; /* processid */ + ulong last_process; /* For table-change-check */ + ulong last_version; /* Version on start */ + ulong options; /* Options used */ + ulong min_pack_length; /* These are used by packed data */ + ulong max_pack_length; + ulong state_diff_length; + uint rec_reflength; /* rec_reflength in use now */ + uint unique_name_length; + uint32 ftparsers; /* Number of distinct ftparsers + + 1 */ + PAGECACHE_FILE kfile; /* Shared keyfile */ + File data_file; /* Shared data file */ + int mode; /* mode of file on open */ + uint reopen; /* How many times reopened */ + uint w_locks, r_locks, tot_locks; /* Number of read/write locks */ + uint block_size; /* block_size of keyfile & data file*/ + /* Fixed length part of a packed row in BLOCK_RECORD format */ + uint base_length; + myf write_flag; + enum data_file_type data_file_type; + enum pagecache_page_type page_type; /* value depending transactional */ + uint8 in_checkpoint; /**< if Checkpoint looking at table */ + my_bool temporary; + /* Below flag is needed to make log tables work with concurrent insert */ + my_bool is_log_table; + + my_bool changed, /* If changed since lock */ + global_changed, /* If changed since open */ + not_flushed, concurrent_insert; + my_bool delay_key_write; + my_bool have_rtree; + /** + @brief if 
the table is transactional right now. It may have been created + transactional (base.born_transactional==TRUE) but with transactionality + (logging) temporarily disabled (now_transactional==FALSE). The opposite + (FALSE, TRUE) is impossible. + */ + my_bool now_transactional; +#ifdef THREAD + THR_LOCK lock; + pthread_mutex_t intern_lock; /* Locking for use with _locking */ + rw_lock_t *key_root_lock; +#endif + my_off_t mmaped_length; + uint nonmmaped_inserts; /* counter of writing in + non-mmaped area */ + MARIA_FILE_BITMAP bitmap; + rw_lock_t mmap_lock; + LSN lsn_of_file_id; /**< LSN of its last LOGREC_FILE_ID */ +} MARIA_SHARE; + + +typedef uchar MARIA_BITMAP_BUFFER; + +typedef struct st_maria_bitmap_block +{ + ulonglong page; /* Page number */ + /* Number of continuous pages. TAIL_BIT is set if this is a tail page */ + uint page_count; + uint empty_space; /* Set for head and tail pages */ + /* + Number of BLOCKS for block-region (holds all non-blob-fields or one blob) + */ + uint sub_blocks; + /* set to <> 0 in write_record() if this block was actually used */ + uint8 used; + uint8 org_bitmap_value; +} MARIA_BITMAP_BLOCK; + + +typedef struct st_maria_bitmap_blocks +{ + MARIA_BITMAP_BLOCK *block; + uint count; + my_bool tail_page_skipped; /* If some tail pages was not used */ + my_bool page_skipped; /* If some full pages was not used */ +} MARIA_BITMAP_BLOCKS; + + +/* Data about the currently read row */ +typedef struct st_maria_row +{ + MARIA_BITMAP_BLOCKS insert_blocks; + MARIA_BITMAP_BUFFER *extents; + MARIA_RECORD_POS lastpos, nextpos; + MARIA_RECORD_POS *tail_positions; + ha_checksum checksum; + uchar *empty_bits, *field_lengths; + uint *null_field_lengths; /* All null field lengths */ + ulong *blob_lengths; /* Length for each blob */ + ulong base_length, normal_length, char_length, varchar_length, blob_length; + ulong head_length, total_length; + size_t extents_buffer_length; /* Size of 'extents' buffer */ + uint field_lengths_length; /* Length of data in 
field_lengths */ + uint extents_count; /* number of extents in 'extents' */ + uint full_page_count, tail_count; /* For maria_chk */ + uint space_on_head_page; +} MARIA_ROW; + +/* Data to scan row in blocked format */ +typedef struct st_maria_block_scan +{ + uchar *bitmap_buff, *bitmap_pos, *bitmap_end, *page_buff; + uchar *dir, *dir_end; + ulong bitmap_page; + ulonglong bits; + uint number_of_rows, bit_pos; + MARIA_RECORD_POS row_base_page; +} MARIA_BLOCK_SCAN; + + +struct st_maria_info +{ + MARIA_SHARE *s; /* Shared between open:s */ + struct st_transaction *trn; /* Pointer to active transaction */ + MARIA_STATUS_INFO *state, save_state; + MARIA_ROW cur_row; /* The active row that we just read */ + MARIA_ROW new_row; /* Storage for a row during update */ + MARIA_BLOCK_SCAN scan; + MARIA_BLOB *blobs; /* Pointer to blobs */ + MARIA_BIT_BUFF bit_buff; + DYNAMIC_ARRAY bitmap_blocks; + DYNAMIC_ARRAY pinned_pages; + /* accumulate indexfile changes between write's */ + TREE *bulk_insert; + LEX_STRING *log_row_parts; /* For logging */ + DYNAMIC_ARRAY *ft1_to_ft2; /* used only in ft1->ft2 conversion */ + MEM_ROOT ft_memroot; /* used by the parser */ + MYSQL_FTPARSER_PARAM *ftparser_param; /* share info between init/deinit */ + uchar *buff; /* page buffer */ + uchar *keyread_buff; /* Buffer for last key read */ + uchar *lastkey, *lastkey2; /* Last used search key */ + uchar *first_mbr_key; /* Searhed spatial key */ + uchar *rec_buff; /* Temp buffer for recordpack */ + uchar *int_keypos, /* Save position for next/previous */ + *int_maxpos; /* -""- */ + uchar *update_field_data; /* Used by update in rows-in-block */ + uint int_nod_flag; /* -""- */ + uint32 int_keytree_version; /* -""- */ + int (*read_record) (struct st_maria_info *, uchar*, MARIA_RECORD_POS); + invalidator_by_filename invalidator; /* query cache invalidator */ + ulong this_unique; /* uniq filenumber or thread */ + ulong last_unique; /* last unique number */ + ulong this_loop; /* counter for this open */ + 
ulong last_loop; /* last used counter */ + MARIA_RECORD_POS save_lastpos; + MARIA_RECORD_POS dup_key_pos; + my_off_t pos; /* Internal variable */ + my_off_t last_keypage; /* Last key page read */ + my_off_t last_search_keypage; /* Last keypage when searching */ + + /* + QQ: the following two xxx_length fields should be removed, + as they are not compatible with parallel repair + */ + ulong packed_length, blob_length; /* Length of found, packed record */ + size_t rec_buff_size; + PAGECACHE_FILE dfile; /* The datafile */ + IO_CACHE rec_cache; /* When caching records */ + LIST open_list; + MY_BITMAP changed_fields; + uint opt_flag; /* Optim. for space/speed */ + uint update; /* If file changed since open */ + int lastinx; /* Last used index */ + uint lastkey_length; /* Length of key in lastkey */ + uint last_rkey_length; /* Last length in maria_rkey() */ + enum ha_rkey_function last_key_func; /* CONTAIN, OVERLAP, etc */ + uint save_lastkey_length; + uint pack_key_length; /* For MARIAMRG */ + uint16 last_used_keyseg; /* For MARIAMRG */ + int errkey; /* Got last error on this key */ + int lock_type; /* How database was locked */ + int tmp_lock_type; /* When locked by readinfo */ + uint data_changed; /* Somebody has changed data */ + uint save_update; /* When using KEY_READ */ + int save_lastinx; + uint preload_buff_size; /* When preloading indexes */ + myf lock_wait; /* is 0 or MY_DONT_WAIT */ + my_bool was_locked; /* Was locked in panic */ + my_bool append_insert_at_end; /* Set if concurrent insert */ + my_bool quick_mode; + /* If info->keyread_buff can't be used for rnext */ + my_bool page_changed; + /* If info->keyread_buff has to be re-read for rnext */ + my_bool keyread_buff_used; + my_bool once_flags; /* For MARIA_MRG */ +#ifdef __WIN__ + my_bool owned_by_merge; /* This Maria table is part of a merge union */ +#endif +#ifdef THREAD + THR_LOCK_DATA lock; +#endif + uchar *maria_rtree_recursion_state; /* For RTREE */ + uchar length_buff[5]; /* temp buff to store blob 
lengths */ + int maria_rtree_recursion_depth; +}; + +/* Some defines used by maria-functions */ + +#define USE_WHOLE_KEY 65535 /* Use whole key in _search() */ +#define F_EXTRA_LCK -1 + +/* bits in opt_flag */ +#define MEMMAP_USED 32 +#define REMEMBER_OLD_POS 64 + +#define WRITEINFO_UPDATE_KEYFILE 1 +#define WRITEINFO_NO_UNLOCK 2 + +/* once_flags */ +#define USE_PACKED_KEYS 1 +#define RRND_PRESERVE_LASTINX 2 + +/* bits in state.changed */ + +#define STATE_CHANGED 1 +#define STATE_CRASHED 2 +#define STATE_CRASHED_ON_REPAIR 4 +#define STATE_NOT_ANALYZED 8 +#define STATE_NOT_OPTIMIZED_KEYS 16 +#define STATE_NOT_SORTED_PAGES 32 +#define STATE_NOT_OPTIMIZED_ROWS 64 + +/* options to maria_read_cache */ + +#define READING_NEXT 1 +#define READING_HEADER 2 + +#define maria_data_on_page(x) ((uint) mi_uint2korr(x) & 32767) +#define maria_putint(x,y,nod) { uint16 boh=(nod ? (uint16) 32768 : 0) + (uint16) (y);\ + mi_int2store(x,boh); } +#define _ma_test_if_nod(x) (x[0] & 128 ? info->s->base.key_reflength : 0) +#define maria_mark_crashed(x) do{(x)->s->state.changed|= STATE_CRASHED; \ + DBUG_PRINT("error", ("Marked table crashed")); \ + }while(0) +#define maria_mark_crashed_on_repair(x) do{(x)->s->state.changed|= \ + STATE_CRASHED|STATE_CRASHED_ON_REPAIR; \ + (x)->update|= HA_STATE_CHANGED; \ + DBUG_PRINT("error", \ + ("Marked table crashed")); \ + }while(0) +#define maria_is_crashed(x) ((x)->s->state.changed & STATE_CRASHED) +#define maria_is_crashed_on_repair(x) ((x)->s->state.changed & STATE_CRASHED_ON_REPAIR) +#ifdef EXTRA_DEBUG +#define maria_print_error(SHARE, ERRNO) \ + _ma_report_error((ERRNO), (SHARE)->index_file_name) +#else +#define maria_print_error(SHARE, ERRNO) while (0) +#endif + + +/* Functions to store length of space packed keys, VARCHAR or BLOB keys */ + +#define store_key_length(key,length) \ +{ if ((length) < 255) \ + { *(key)=(length); } \ + else \ + { *(key)=255; mi_int2store((key)+1,(length)); } \ +} + +#define get_key_full_length(length,key) \ + { if 
(*(uchar*) (key) != 255) \ + length= ((uint) *(uchar*) ((key)++))+1; \ + else \ + { length=mi_uint2korr((key)+1)+3; (key)+=3; } \ +} + +#define get_key_full_length_rdonly(length,key) \ +{ if (*(uchar*) (key) != 255) \ + length= ((uint) *(uchar*) ((key)))+1; \ + else \ + { length=mi_uint2korr((key)+1)+3; } \ +} + +#define maria_max_key_length() ((maria_block_size - MARIA_INDEX_MIN_OVERHEAD_SIZE)/2) +#define get_pack_length(length) ((length) >= 255 ? 3 : 1) + +#define MARIA_MIN_BLOCK_LENGTH 20 /* Because of delete-link */ +/* Don't use too small record-blocks */ +#define MARIA_EXTEND_BLOCK_LENGTH 20 +#define MARIA_SPLIT_LENGTH ((MARIA_EXTEND_BLOCK_LENGTH+4)*2) + /* Max prefix of record-block */ +#define MARIA_MAX_DYN_BLOCK_HEADER 20 +#define MARIA_BLOCK_INFO_HEADER_LENGTH 20 +#define MARIA_DYN_DELETE_BLOCK_HEADER 20 /* length of delete-block-header */ +#define MARIA_DYN_MAX_BLOCK_LENGTH ((1L << 24)-4L) +#define MARIA_DYN_MAX_ROW_LENGTH (MARIA_DYN_MAX_BLOCK_LENGTH - MARIA_SPLIT_LENGTH) +#define MARIA_DYN_ALIGN_SIZE 4 /* Align blocks on this */ +#define MARIA_MAX_DYN_HEADER_BYTE 13 /* max header byte for dynamic rows */ +#define MARIA_MAX_BLOCK_LENGTH ((((ulong) 1 << 24)-1) & (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1))) +#define MARIA_REC_BUFF_OFFSET ALIGN_SIZE(MARIA_DYN_DELETE_BLOCK_HEADER+sizeof(uint32)) + +#define MEMMAP_EXTRA_MARGIN 7 /* Write this as a suffix for file */ + +#define PACK_TYPE_SELECTED 1 /* Bits in field->pack_type */ +#define PACK_TYPE_SPACE_FIELDS 2 +#define PACK_TYPE_ZERO_FILL 4 +#define MARIA_FOUND_WRONG_KEY 32738 /* Impossible value from ha_key_cmp */ + +#define MARIA_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size) (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size)) +#define MARIA_MAX_KEYPTR_SIZE 5 /* For calculating block lengths */ +#define MARIA_MIN_KEYBLOCK_LENGTH 50 /* When to split delete blocks */ + +#define MARIA_MIN_SIZE_BULK_INSERT_TREE 16384 /* this is per key */ +#define 
MARIA_MIN_ROWS_TO_USE_BULK_INSERT 100 +#define MARIA_MIN_ROWS_TO_DISABLE_INDEXES 100 +#define MARIA_MIN_ROWS_TO_USE_WRITE_CACHE 10 + +/* The UNIQUE check is done with a hashed long key */ + +#define MARIA_UNIQUE_HASH_TYPE HA_KEYTYPE_ULONG_INT +#define maria_unique_store(A,B) mi_int4store((A),(B)) + +#ifdef THREAD +extern pthread_mutex_t THR_LOCK_maria; +#endif +#if !defined(THREAD) || defined(DONT_USE_RW_LOCKS) +#define rw_wrlock(A) {} +#define rw_rdlock(A) {} +#define rw_unlock(A) {} +#endif + + +/* Some extern variables */ +extern LIST *maria_open_list; +extern uchar NEAR maria_file_magic[], NEAR maria_pack_file_magic[]; +extern uint NEAR maria_read_vec[], NEAR maria_readnext_vec[]; +extern uint maria_quick_table_bits; +extern const char *maria_data_root; +extern uchar maria_zero_string[]; +extern my_bool maria_inited; + + +/* This is used by _ma_calc_xxx_key_length and _ma_store_key */ +typedef struct st_maria_s_param +{ + uint ref_length, key_length, n_ref_length; + uint n_length, totlength, part_of_prev_key, prev_length, pack_marker; + const uchar *key; + uchar *prev_key, *next_key_pos; + bool store_not_null; +} MARIA_KEY_PARAM; + + +/* Used to store reference to pinned page */ +typedef struct st_pinned_page +{ + PAGECACHE_PAGE_LINK link; + enum pagecache_page_lock unlock; +} MARIA_PINNED_PAGE; + + +/* Prototypes for internal functions */ +extern int _ma_read_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS); +extern int _ma_read_rnd_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS, + my_bool); +extern my_bool _ma_write_dynamic_record(MARIA_HA *, const uchar *); +extern my_bool _ma_update_dynamic_record(MARIA_HA *, MARIA_RECORD_POS, + const uchar *, const uchar *); +extern my_bool _ma_delete_dynamic_record(MARIA_HA *info, const uchar *record); +extern my_bool _ma_cmp_dynamic_record(MARIA_HA *info, const uchar *record); +extern my_bool _ma_write_blob_record(MARIA_HA *, const uchar *); +extern my_bool _ma_update_blob_record(MARIA_HA *, MARIA_RECORD_POS, + 
const uchar *, const uchar *); +extern int _ma_read_static_record(MARIA_HA *info, uchar *, MARIA_RECORD_POS); +extern int _ma_read_rnd_static_record(MARIA_HA *, uchar *, MARIA_RECORD_POS, + my_bool); +extern my_bool _ma_write_static_record(MARIA_HA *, const uchar *); +extern my_bool _ma_update_static_record(MARIA_HA *, MARIA_RECORD_POS, + const uchar *, const uchar *); +extern my_bool _ma_delete_static_record(MARIA_HA *info, const uchar *record); +extern my_bool _ma_cmp_static_record(MARIA_HA *info, const uchar *record); +extern int _ma_ck_write(MARIA_HA *info, uint keynr, uchar *key, + uint length); +extern int _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *key, uint key_length, + MARIA_RECORD_POS *root, uint comp_flag); +extern int _ma_enlarge_root(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *key, MARIA_RECORD_POS *root); +extern int _ma_insert(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, + uchar *anc_buff, uchar *key_pos, uchar *key_buff, + uchar *father_buff, uchar *father_keypos, + my_off_t father_page, my_bool insert_last); +extern int _ma_split_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *key, uchar *buff, uchar *key_buff, + my_bool insert_last); +extern uchar *_ma_find_half_pos(uint nod_flag, MARIA_KEYDEF *keyinfo, + uchar *page, uchar *key, + uint *return_key_length, + uchar ** after_key); +extern int _ma_calc_static_key_length(MARIA_KEYDEF *keyinfo, uint nod_flag, + uchar *key_pos, uchar *org_key, + uchar *key_buff, const uchar *key, + MARIA_KEY_PARAM *s_temp); +extern int _ma_calc_var_key_length(MARIA_KEYDEF *keyinfo, uint nod_flag, + uchar *key_pos, uchar *org_key, + uchar *key_buff, const uchar *key, + MARIA_KEY_PARAM *s_temp); +extern int _ma_calc_var_pack_key_length(MARIA_KEYDEF *keyinfo, + uint nod_flag, uchar *key_pos, + uchar *org_key, uchar *prev_key, + const uchar *key, + MARIA_KEY_PARAM *s_temp); +extern int _ma_calc_bin_pack_key_length(MARIA_KEYDEF *keyinfo, + uint nod_flag, uchar *key_pos, + uchar 
*org_key, uchar *prev_key, + const uchar *key, + MARIA_KEY_PARAM *s_temp); +void _ma_store_static_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); +void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); +#ifdef NOT_USED +void _ma_store_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); +#endif +void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); + +extern int _ma_ck_delete(MARIA_HA *info, uint keynr, uchar *key, + uint key_length); +extern int _ma_readinfo(MARIA_HA *info, int lock_flag, int check_keybuffer); +extern int _ma_writeinfo(MARIA_HA *info, uint options); +extern int _ma_test_if_changed(MARIA_HA *info); +extern int _ma_mark_file_changed(MARIA_HA *info); +extern int _ma_decrement_open_count(MARIA_HA *info); +extern int _ma_check_index(MARIA_HA *info, int inx); +extern int _ma_search(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, + uint key_len, uint nextflag, my_off_t pos); +extern int _ma_bin_search(struct st_maria_info *info, MARIA_KEYDEF *keyinfo, + uchar *page, uchar *key, uint key_len, + uint comp_flag, uchar **ret_pos, uchar *buff, + my_bool *was_last_key); +extern int _ma_seq_search(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *page, uchar *key, uint key_len, + uint comp_flag, uchar ** ret_pos, uchar *buff, + my_bool *was_last_key); +extern int _ma_prefix_search(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *page, uchar *key, uint key_len, + uint comp_flag, uchar ** ret_pos, uchar *buff, + my_bool *was_last_key); +extern my_off_t _ma_kpos(uint nod_flag, uchar *after_key); +extern void _ma_kpointer(MARIA_HA *info, uchar *buff, my_off_t pos); +extern MARIA_RECORD_POS _ma_dpos(MARIA_HA *info, uint nod_flag, + const uchar *after_key); +extern MARIA_RECORD_POS _ma_rec_pos(MARIA_SHARE *info, uchar *ptr); +extern void _ma_dpointer(MARIA_HA *info, uchar *buff, MARIA_RECORD_POS pos); +extern uint 
_ma_get_static_key(MARIA_KEYDEF *keyinfo, uint nod_flag, + uchar **page, uchar *key); +extern uint _ma_get_pack_key(MARIA_KEYDEF *keyinfo, uint nod_flag, + uchar **page, uchar *key); +extern uint _ma_get_binary_pack_key(MARIA_KEYDEF *keyinfo, uint nod_flag, + uchar ** page_pos, uchar *key); +extern uchar *_ma_get_last_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *keypos, uchar *lastkey, + uchar *endpos, uint *return_key_length); +extern uchar *_ma_get_key(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *page, uchar *key, uchar *keypos, + uint *return_key_length); +extern uint _ma_keylength(MARIA_KEYDEF *keyinfo, const uchar *key); +extern uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, register const uchar *key, + HA_KEYSEG *end); +extern uchar *_ma_move_key(MARIA_KEYDEF *keyinfo, uchar *to, const uchar *from); +extern int _ma_search_next(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *key, uint key_length, uint nextflag, + my_off_t pos); +extern int _ma_search_first(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t pos); +extern int _ma_search_last(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t pos); +extern uchar *_ma_fetch_keypage(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t page, int level, uchar *buff, + int return_buffer); +extern int _ma_write_keypage(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t page, int level, uchar *buff); +extern int _ma_dispose(MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_off_t pos, + int level); +extern my_off_t _ma_new(MARIA_HA *info, MARIA_KEYDEF *keyinfo, int level); +extern uint _ma_make_key(MARIA_HA *info, uint keynr, uchar *key, + const uchar *record, MARIA_RECORD_POS filepos); +extern uint _ma_pack_key(MARIA_HA *info, uint keynr, uchar *key, + const uchar *old, key_part_map keypart_map, + HA_KEYSEG ** last_used_keyseg); +extern int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS); +extern int _ma_read_cache(IO_CACHE *info, uchar *buff, MARIA_RECORD_POS pos, + uint length, int re_read_if_possibly); 
+extern ulonglong ma_retrieve_auto_increment(MARIA_HA *info, const uchar *record); + +extern my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size, + size_t new_size); +extern ulong _ma_rec_unpack(MARIA_HA *info, uchar *to, uchar *from, + ulong reclength); +extern my_bool _ma_rec_check(MARIA_HA *info, const uchar *record, + uchar *packpos, ulong packed_length, + my_bool with_checkum, ha_checksum checksum); +extern int _ma_write_part_record(MARIA_HA *info, my_off_t filepos, + ulong length, my_off_t next_filepos, + uchar ** record, ulong *reclength, + int *flag); +extern void _ma_print_key(FILE *stream, HA_KEYSEG *keyseg, + const uchar *key, uint length); +extern my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile); +extern my_bool _ma_once_end_pack_row(MARIA_SHARE *share); +extern int _ma_read_pack_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos); +extern int _ma_read_rnd_pack_record(MARIA_HA *, uchar *, MARIA_RECORD_POS, + my_bool); +extern int _ma_pack_rec_unpack(MARIA_HA *info, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *from, ulong reclength); +extern ulonglong _ma_safe_mul(ulonglong a, ulonglong b); +extern int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf, + const uchar *oldrec, const uchar *newrec, + my_off_t pos); + +/* + Parameter to _ma_get_block_info + The dynamic row header is read into this struct. For an explanation of + the fields, look at the function _ma_get_block_info(). 
+*/ + +typedef struct st_maria_block_info +{ + uchar header[MARIA_BLOCK_INFO_HEADER_LENGTH]; + ulong rec_len; + ulong data_len; + ulong block_len; + ulong blob_len; + MARIA_RECORD_POS filepos; + MARIA_RECORD_POS next_filepos; + MARIA_RECORD_POS prev_filepos; + uint second_read; + uint offset; +} MARIA_BLOCK_INFO; + + +/* bits in return from _ma_get_block_info */ + +#define BLOCK_FIRST 1 +#define BLOCK_LAST 2 +#define BLOCK_DELETED 4 +#define BLOCK_ERROR 8 /* Wrong data */ +#define BLOCK_SYNC_ERROR 16 /* Right data at wrong place */ +#define BLOCK_FATAL_ERROR 32 /* hardware-error */ + +#define NEED_MEM ((uint) 10*4*(IO_SIZE+32)+32) /* Need for recursion */ +#define MAXERR 20 +#define BUFFERS_WHEN_SORTING 16 /* Alloc for sort-key-tree */ +#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE +#define INDEX_TMP_EXT ".TMM" +#define DATA_TMP_EXT ".TMD" + +#define UPDATE_TIME 1 +#define UPDATE_STAT 2 +#define UPDATE_SORT 4 +#define UPDATE_AUTO_INC 8 +#define UPDATE_OPEN_COUNT 16 + +#define USE_BUFFER_INIT (((1024L*512L-MALLOC_OVERHEAD)/IO_SIZE)*IO_SIZE) +#define READ_BUFFER_INIT (1024L*256L-MALLOC_OVERHEAD) +#define SORT_BUFFER_INIT (2048L*1024L-MALLOC_OVERHEAD) +#define MIN_SORT_BUFFER (4096-MALLOC_OVERHEAD) +/* NOTE: fast_ma_writeinfo expands to a bare if statement and fast_ma_readinfo to an unparenthesized && expression; take care when using them next to an else or inside a larger expression */ +#define fast_ma_writeinfo(INFO) if (!(INFO)->s->tot_locks) (void) _ma_writeinfo((INFO),0) +#define fast_ma_readinfo(INFO) ((INFO)->lock_type == F_UNLCK) && _ma_readinfo((INFO),F_RDLCK,1) + +extern uint _ma_get_block_info(MARIA_BLOCK_INFO *, File, my_off_t); +extern uint _ma_rec_pack(MARIA_HA *info, uchar *to, const uchar *from); +extern uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO *info, uchar **rec_buff_p, + size_t *rec_buff_size, + File file, my_off_t filepos); +extern void _ma_store_blob_length(uchar *pos, uint pack_length, uint length); +extern void _ma_report_error(int errcode, const char *file_name); +extern my_bool _ma_memmap_file(MARIA_HA *info); +extern void _ma_unmap_file(MARIA_HA *info); +extern uint 
_ma_save_pack_length(uint version, uchar * block_buff, + ulong length); +extern uint _ma_calc_pack_length(uint version, ulong length); +extern ulong _ma_calc_blob_length(uint length, const uchar *pos); +extern uint _ma_mmap_pread(MARIA_HA *info, uchar *Buffer, + uint Count, my_off_t offset, myf MyFlags); +extern uint _ma_mmap_pwrite(MARIA_HA *info, uchar *Buffer, + uint Count, my_off_t offset, myf MyFlags); +extern uint _ma_nommap_pread(MARIA_HA *info, uchar *Buffer, + uint Count, my_off_t offset, myf MyFlags); +extern uint _ma_nommap_pwrite(MARIA_HA *info, uchar *Buffer, + uint Count, my_off_t offset, myf MyFlags); + +uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite); +uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite); +uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state); +uint _ma_base_info_write(File file, MARIA_BASE_INFO *base); +int _ma_keyseg_write(File file, const HA_KEYSEG *keyseg); +char *_ma_keyseg_read(char *ptr, HA_KEYSEG *keyseg); +uint _ma_keydef_write(File file, MARIA_KEYDEF *keydef); +char *_ma_keydef_read(char *ptr, MARIA_KEYDEF *keydef); +uint _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *keydef); +char *_ma_uniquedef_read(char *ptr, MARIA_UNIQUEDEF *keydef); +uint _ma_columndef_write(File file, MARIA_COLUMNDEF *columndef); +char *_ma_columndef_read(char *ptr, MARIA_COLUMNDEF *columndef); +ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record); +ha_checksum _ma_checksum(MARIA_HA *info, const uchar *buf); +ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *buf); +my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + uchar *record, ha_checksum unique_hash, + MARIA_RECORD_POS pos); +ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *buf); +my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos); +my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, 
MARIA_RECORD_POS pos); +my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b, + my_bool null_are_equal); +void _ma_get_status(void *param, int concurrent_insert); +void _ma_update_status(void *param); +void _ma_restore_status(void *param); +void _ma_copy_status(void *to, void *from); +my_bool _ma_check_status(void *param); +void _ma_reset_status(MARIA_HA *maria); +#include "ma_commit.h" + +extern MARIA_HA *_ma_test_if_reopen(char *filename); +my_bool _ma_check_table_is_closed(const char *name, const char *where); +int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share, File file_to_dup); +int _ma_open_keyfile(MARIA_SHARE *share); +void _ma_setup_functions(register MARIA_SHARE *share); +my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size); +void _ma_remap_file(MARIA_HA *info, my_off_t size); + +MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info, const uchar *record); +my_bool _ma_write_abort_default(MARIA_HA *info); + +C_MODE_START +#define MARIA_FLUSH_DATA 1 +#define MARIA_FLUSH_INDEX 2 +int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index, + enum flush_type flush_type_for_data, + enum flush_type flush_type_for_index); +/* Functions needed by _ma_check (are overrided in MySQL) */ +volatile int *_ma_killed_ptr(HA_CHECK *param); +void _ma_check_print_error _VARARGS((HA_CHECK *param, const char *fmt, ...)); +void _ma_check_print_warning _VARARGS((HA_CHECK *param, const char *fmt, ...)); +void _ma_check_print_info _VARARGS((HA_CHECK *param, const char *fmt, ...)); +C_MODE_END + +int _ma_flush_pending_blocks(MARIA_SORT_PARAM *param); +int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param); +int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param); +#ifdef THREAD +pthread_handler_t _ma_thr_find_all_keys(void *arg); +#endif +int _ma_flush_table_files_after_repair(HA_CHECK *param, MARIA_HA *info); + +int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param); +int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages, 
+                             ulong);
+int _ma_sync_table_files(const MARIA_HA *info);
+int _ma_initialize_data_file(MARIA_SHARE *share, File dfile);
+int _ma_update_create_rename_lsn(MARIA_SHARE *share,
+                                 LSN lsn, my_bool do_sync);
+int _ma_update_create_rename_lsn_sub(MARIA_SHARE *share,
+                                     LSN lsn, my_bool do_sync);
+
+void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn);
+/*
+  Switch logging for a table share off and back on.
+
+  The "disable" form clears now_transactional and selects
+  PAGECACHE_PLAIN_PAGE. The "reenable" form restores now_transactional
+  from base.born_transactional and selects PAGECACHE_LSN_PAGE only when
+  that value is true; otherwise page_type is left untouched.
+
+  NOTE: both macros expand to a plain brace block, not do { } while (0).
+  Writing "if (x) MACRO(s); else ..." therefore does not compile, because
+  the ';' after the block ends the 'if'.
+*/
+#define _ma_tmp_disable_logging_for_table(S) \
+  { (S)->now_transactional= FALSE; (S)->page_type= PAGECACHE_PLAIN_PAGE; }
+#define _ma_reenable_logging_for_table(S) \
+  { if (((S)->now_transactional= (S)->base.born_transactional)) \
+      (S)->page_type= PAGECACHE_LSN_PAGE; }
+
+extern PAGECACHE *maria_log_pagecache;
diff --git a/storage/maria/maria_ftdump.c b/storage/maria/maria_ftdump.c
new file mode 100644
index 00000000000..9df86b50474
--- /dev/null
+++ b/storage/maria/maria_ftdump.c
@@ -0,0 +1,279 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A.
Golubchik, who has a shared copyright to this code + added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */ + +#include "ma_ftdefs.h" +#include <my_getopt.h> + +static void usage(); +static void complain(int val); +static my_bool get_one_option(int, const struct my_option *, char *); + +static int count=0, stats=0, dump=0, lstats=0; +static my_bool verbose; +static char *query=NULL; +static uint lengths[256]; + +#define MAX_LEN (HA_FT_MAXBYTELEN+10) +#define HOW_OFTEN_TO_WRITE 10000 + +static struct my_option my_long_options[] = +{ + {"help", 'h', "Display help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"help", '?', "Synonym for -h.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"count", 'c', "Calculate per-word stats (counts and global weights).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"dump", 'd', "Dump index (incl. data offsets and word weights).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"length", 'l', "Report length distribution.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"stats", 's', "Report global stats.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Be verbose.", + (uchar**) &verbose, (uchar**) &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +int main(int argc,char *argv[]) +{ + int error=0, subkeys; + uint keylen, keylen2=0, inx, doc_cnt=0; + float weight= 1.0; + double gws, min_gws=0, avg_gws=0; + MARIA_HA *info; + char buf[MAX_LEN], buf2[MAX_LEN], buf_maxlen[MAX_LEN], buf_min_gws[MAX_LEN]; + ulong total=0, maxlen=0, uniq=0, max_doc_cnt=0; + struct { MARIA_HA *info; } aio0, *aio=&aio0; /* for GWS_IN_USE */ + + MY_INIT(argv[0]); + if ((error= handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(error); + maria_init(); + if (count || dump) + verbose=0; + if (!count && !dump && !lstats && !query) + stats=1; + + if (verbose) + setbuf(stdout,NULL); + + if (argc < 
2) + usage(); + + { + char *end; + inx= (uint) strtoll(argv[1], &end, 10); + if (*end) + usage(); + } + + init_pagecache(maria_pagecache, USE_BUFFER_INIT, 0, 0, + MARIA_KEY_BLOCK_LENGTH); + + if (!(info=maria_open(argv[0], O_RDONLY, + HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER))) + { + error=my_errno; + goto err; + } + + *buf2=0; + aio->info=info; + + if ((inx >= info->s->base.keys) || + !(info->s->keyinfo[inx].flag & HA_FULLTEXT)) + { + printf("Key %d in table %s is not a FULLTEXT key\n", inx, info->s->open_file_name); + goto err; + } + + maria_lock_database(info, F_EXTRA_LCK); + + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->update|= HA_STATE_PREV_FOUND; + + while (!(error=maria_rnext(info,NULL,inx))) + { + keylen=*(info->lastkey); + + subkeys=ft_sintXkorr(info->lastkey+keylen+1); + if (subkeys >= 0) + weight=*(float*)&subkeys; + +#ifdef HAVE_SNPRINTF + snprintf(buf,MAX_LEN,"%.*s",(int) keylen,info->lastkey+1); +#else + sprintf(buf,"%.*s",(int) keylen,info->lastkey+1); +#endif + my_casedn_str(default_charset_info,buf); + total++; + lengths[keylen]++; + + if (count || stats) + { + if (strcmp(buf, buf2)) + { + if (*buf2) + { + uniq++; + avg_gws+=gws=GWS_IN_USE; + if (count) + printf("%9u %20.7f %s\n",doc_cnt,gws,buf2); + if (maxlen<keylen2) + { + maxlen=keylen2; + strmov(buf_maxlen, buf2); + } + if (max_doc_cnt < doc_cnt) + { + max_doc_cnt=doc_cnt; + strmov(buf_min_gws, buf2); + min_gws=gws; + } + } + strmov(buf2, buf); + keylen2=keylen; + doc_cnt=0; + } + doc_cnt+= (subkeys >= 0 ? 
1 : -subkeys); + } + if (dump) + { + if (subkeys>=0) + printf("%9lx %20.7f %s\n", (long) info->cur_row.lastpos,weight,buf); + else + printf("%9lx => %17d %s\n",(long) info->cur_row.lastpos,-subkeys,buf); + } + if (verbose && (total%HOW_OFTEN_TO_WRITE)==0) + printf("%10ld\r",total); + } + maria_lock_database(info, F_UNLCK); + + if (count || stats) + { + if (*buf2) + { + uniq++; + avg_gws+=gws=GWS_IN_USE; + if (count) + printf("%9u %20.7f %s\n",doc_cnt,gws,buf2); + if (maxlen<keylen2) + { + maxlen=keylen2; + strmov(buf_maxlen, buf2); + } + if (max_doc_cnt < doc_cnt) + { + max_doc_cnt=doc_cnt; + strmov(buf_min_gws, buf2); + min_gws=gws; + } + } + } + + if (stats) + { + count=0; + for (inx=0;inx<256;inx++) + { + count+=lengths[inx]; + if ((ulong) count >= total/2) + break; + } + printf("Total rows: %lu\nTotal words: %lu\n" + "Unique words: %lu\nLongest word: %lu chars (%s)\n" + "Median length: %u\n" + "Average global weight: %f\n" + "Most common word: %lu times, weight: %f (%s)\n", + (long) info->state->records, total, uniq, maxlen, buf_maxlen, + inx, avg_gws/uniq, max_doc_cnt, min_gws, buf_min_gws); + } + if (lstats) + { + count=0; + for (inx=0; inx<256; inx++) + { + count+=lengths[inx]; + if (count && lengths[inx]) + printf("%3u: %10lu %5.2f%% %20lu %4.1f%%\n", inx, + (ulong) lengths[inx],100.0*lengths[inx]/total,(ulong) count, + 100.0*count/total); + } + } + +err: + if (error && error != HA_ERR_END_OF_FILE) + printf("got error %d\n",my_errno); + if (info) + maria_close(info); + maria_end(); + return 0; +} + + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument __attribute__((unused))) +{ + switch(optid) { + case 'd': + dump=1; + complain(count || query); + break; + case 's': + stats=1; + complain(query!=0); + break; + case 'c': + count= 1; + complain(dump || query); + break; + case 'l': + lstats=1; + complain(query!=0); + break; + case '?': + case 'h': + usage(); + } + return 0; +} + +#include 
<help_start.h> + +static void usage() +{ + printf("Use: maria_ft_dump <table_name> <index_num>\n"); + my_print_help(my_long_options); + my_print_variables(my_long_options); + NETWARE_SET_SCREEN_MODE(1); + exit(1); +} + +#include <help_end.h> + +static void complain(int val) /* Kinda assert :-) */ +{ + if (val) + { + printf("You cannot use these options together!\n"); + exit(1); + } +} diff --git a/storage/maria/maria_pack.c b/storage/maria/maria_pack.c new file mode 100644 index 00000000000..83f88fcb0dc --- /dev/null +++ b/storage/maria/maria_pack.c @@ -0,0 +1,3227 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Pack MARIA file */
+
+#ifndef USE_MY_FUNC
+#define USE_MY_FUNC /* We need at least my_malloc */
+#endif
+
+#include "maria_def.h"
+#include <queues.h>
+#include <my_tree.h>
+#include "mysys_err.h"
+#ifdef MSDOS
+#include <io.h>
+#endif
+#ifndef __GNU_LIBRARY__
+#define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
+#endif
+#include <my_getopt.h>
+#include <assert.h>
+
+/* Width in bits of the bit accumulator ('bitbucket' below is a ulonglong). */
+#if SIZEOF_LONG_LONG > 4
+#define BITS_SAVED 64
+#else
+#define BITS_SAVED 32
+#endif
+
+#define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */
+#define HEAD_LENGTH 32
+#define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
+
+#define DATA_TMP_EXT ".TMD"
+#define OLD_EXT ".OLD"
+#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
+
+/*
+  State of the buffered output file: the file handle, a byte buffer with
+  current/end pointers, the file offset, and a bit accumulator.
+  (Presumably 'bits' is the free-bit count of 'bitbucket' -- confirm in
+  write_bits()/flush_bits().)
+*/
+struct st_file_buffer {
+  File file;
+  uchar *buffer,*pos,*end;
+  my_off_t pos_in_file;
+  int bits;
+  ulonglong bitbucket;
+};
+
+struct st_huff_tree;
+struct st_huff_element;
+
+/* Per-column statistics collected before the Huffman trees are built. */
+typedef struct st_huff_counts {
+  uint field_length,max_zero_fill;
+  uint pack_type;
+  uint max_end_space,max_pre_space,length_bits,min_space;
+  ulong max_length;
+  enum en_fieldtype field_type;
+  struct st_huff_tree *tree; /* Tree for field */
+  my_off_t counts[256];
+  my_off_t end_space[8];
+  my_off_t pre_space[8];
+  my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
+  TREE int_tree; /* Tree for detecting distinct column values. */
+  uchar *tree_buff; /* Column values, 'field_length' each. */
+  uchar *tree_pos; /* Points to end of column values in 'tree_buff'. */
+} HUFF_COUNTS;
+
+typedef struct st_huff_element HUFF_ELEMENT;
+
+/*
+  WARNING: It is crucial for the optimizations in calc_packed_length()
+  that 'count' is the first element of 'HUFF_ELEMENT'.
+*/
+struct st_huff_element {
+  my_off_t count;
+  /* A tree element is either an inner node (nod) or a leaf (leaf). */
+  union un_element {
+    struct st_nod {
+      HUFF_ELEMENT *left,*right;
+    } nod;
+    struct st_leaf {
+      HUFF_ELEMENT *null;
+      uint element_nr; /* Number of element */
+    } leaf;
+  } a;
+};
+
+
+/* One Huffman tree together with the code table derived from it. */
+typedef struct st_huff_tree {
+  HUFF_ELEMENT *root,*element_buffer;
+  HUFF_COUNTS *counts;
+  uint tree_number;
+  uint elements;
+  my_off_t bytes_packed;
+  uint tree_pack_length;
+  uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
+  ulonglong *code;
+  uchar *code_len;
+} HUFF_TREE;
+
+
+/*
+  The set of source tables handled by one compress() call: a single table,
+  or several tables that are joined into one packed table.
+*/
+typedef struct st_isam_mrg {
+  MARIA_HA **file,**current,**end;
+  uint free_file;
+  uint count;
+  uint min_pack_length; /* These are used by packed data */
+  uint max_pack_length;
+  uint ref_length;
+  uint max_blob_length;
+  my_off_t records;
+  /* true if at least one source file has at least one disabled index */
+  my_bool src_file_has_indexes_disabled;
+} PACK_MRG_INFO;
+
+
+extern int main(int argc,char * *argv);
+static void get_options(int *argc,char ***argv);
+static MARIA_HA *open_isam_file(char *name,int mode);
+static bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
+static int compress(PACK_MRG_INFO *file,char *join_name);
+static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
+static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
+                                           uint trees,
+                                           HUFF_COUNTS *huff_counts,
+                                           uint fields);
+static int compare_tree(void* cmp_arg __attribute__((unused)),
+                        const uchar *s,const uchar *t);
+static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
+static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
+                         my_off_t records);
+static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
+                               uint max_space_length,my_off_t *space_counts,
+                               my_off_t tot_space_count,
+                               enum en_fieldtype field_type);
+static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
+static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
+static int
compare_huff_elements(void *not_used, uchar *a,uchar *b); +static int save_counts_in_queue(uchar *key,element_count count, + HUFF_TREE *tree); +static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag); +static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees); +static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees); +static void make_traverse_code_tree(HUFF_TREE *huff_tree, + HUFF_ELEMENT *element,uint size, + ulonglong code); +static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees, + my_off_t tot_elements,my_off_t filelength); +static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees); +static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees); +static uint *make_offset_code_tree(HUFF_TREE *huff_tree, + HUFF_ELEMENT *element, + uint *offset); +static uint max_bit(uint value); +static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts); +static char *make_new_name(char *new_name,char *old_name); +static char *make_old_name(char *new_name,char *old_name); +static void init_file_buffer(File file,pbool read_buffer); +static int flush_buffer(ulong neaded_length); +static void end_file_buffer(void); +static void write_bits(ulonglong value, uint bits); +static void flush_bits(void); +static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length, + ha_checksum crc); +static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length, + ha_checksum crc); +static int mrg_close(PACK_MRG_INFO *mrg); +static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf); +static void mrg_reset(PACK_MRG_INFO *mrg); +#if !defined(DBUG_OFF) +static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count); +static int fakecmp(my_off_t **count1, my_off_t **count2); +#endif + + +static int error_on_write=0,test_only=0,verbose=0,silent=0, + write_loop=0,force_pack=0, isamchk_neaded=0; +static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL; +static my_bool 
backup, opt_wait;
+/*
+  tree_buff_length is somewhat arbitrary. The bigger it is the better
+  the chance to win in terms of compression factor. On the other hand,
+  this table becomes part of the compressed file header. And its length
+  is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
+*/
+static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
+static char tmp_dir[FN_REFLEN]={0},*join_table;
+static my_off_t intervall_length;
+static ha_checksum glob_crc;
+static struct st_file_buffer file_buffer;
+static QUEUE queue;
+static HUFF_COUNTS *global_count;
+static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+static const char *load_default_groups[]= { "mariapack",0 };
+
+  /* The main program */
+
+/*
+  Parse options, then either join all named tables into one packed table
+  (when join_table is set) or pack each named table in place, one
+  compress() call per table. Exits with status 2 if any table failed,
+  0 otherwise.
+*/
+
+int main(int argc, char **argv)
+{
+  int error,ok;
+  PACK_MRG_INFO merge;
+  char **default_argv;
+  MY_INIT(argv[0]);
+
+  load_defaults("my",load_default_groups,&argc,&argv);
+  /* Remember the vector from load_defaults(); it is freed before exit. */
+  default_argv= argv;
+  get_options(&argc,&argv);
+  maria_init();
+
+  error=ok=isamchk_neaded=0;
+  if (join_table)
+  { /* Join files into one */
+    if (open_isam_files(&merge,argv,(uint) argc) ||
+        compress(&merge,join_table))
+      error=1;
+  }
+  else while (argc--)
+  {
+    MARIA_HA *isam_file;
+    if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
+      error=1;
+    else
+    {
+      /* Wrap the single table in a one-element merge descriptor. */
+      merge.file= &isam_file;
+      merge.current=0;
+      merge.free_file=0;
+      merge.count=1;
+      if (compress(&merge,0))
+        error=1;
+      else
+        ok=1;
+    }
+  }
+  if (ok && isamchk_neaded && !silent)
+    puts("Remember to run maria_chk -rq on compressed tables");
+  VOID(fflush(stdout));
+  VOID(fflush(stderr));
+  free_defaults(default_argv);
+  maria_end();
+  my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
+  exit(error ? 2 : 0);
+#ifndef _lint
+  return 0; /* No compiler warning */
+#endif
+}
+
+/* Option ids for long options that have no single-character form. */
+enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
+
+static struct my_option my_long_options[] =
+{
+#ifdef __NETWARE__
+  {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"backup", 'b', "Make a backup of the table as table_name.OLD.",
+   (uchar**) &backup, (uchar**) &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"character-sets-dir", OPT_CHARSETS_DIR_MP,
+   "Directory where character sets are.", (uchar**) &charsets_dir,
+   (uchar**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
+   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+  {"force", 'f',
+   "Force packing of table even if it gets bigger or if tempfile exists.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"join", 'j',
+   "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
+   (uchar**) &join_table, (uchar**) &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
+   0, 0, 0},
+  {"help", '?', "Display this help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"silent", 's', "Be more silent.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"tmpdir", 'T', "Use temporary directory to store temporary table.",
+   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"test", 't', "Don't pack table, only test packing it.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"verbose", 'v', "Write info about progress and packing result. 
Use many -v for more verbosity!", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Output version information and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"wait", 'w', "Wait and retry if table is in use.", (uchar**) &opt_wait, + (uchar**) &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +#include <help_start.h> + +static void print_version(void) +{ + VOID(printf("%s Ver 1.0 for %s on %s\n", + my_progname, SYSTEM_TYPE, MACHINE_TYPE)); + NETWARE_SET_SCREEN_MODE(1); +} + + +static void usage(void) +{ + print_version(); + puts("Copyright (C) 2002 MySQL AB"); + puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,"); + puts("and you are welcome to modify and redistribute it under the GPL license\n"); + + puts("Pack a MARIA-table to take much less space."); + puts("Keys are not updated, you must run maria_chk -rq on the index (.MAI) file"); + puts("afterwards to update the keys."); + puts("You should give the .MAI file as the filename argument."); + puts("To unpack a packed table, run maria_chk -u on the table"); + + VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname)); + my_print_help(my_long_options); + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + +#include <help_end.h> + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument) +{ + uint length; + + switch(optid) { +#ifdef __NETWARE__ + case OPT_AUTO_CLOSE: + setscreenmode(SCR_AUTOCLOSE_ON_EXIT); + break; +#endif + case 'f': + force_pack= 1; + tmpfile_createflag= O_RDWR | O_TRUNC; + break; + case 's': + write_loop= verbose= 0; + silent= 1; + break; + case 't': + test_only= 1; + /* Avoid to reset 'verbose' if it was already set > 1. */ + if (! 
verbose) + verbose= 1; + break; + case 'T': + length= (uint) (strmov(tmp_dir, argument) - tmp_dir); + if (length != dirname_length(tmp_dir)) + { + tmp_dir[length]=FN_LIBCHAR; + tmp_dir[length+1]=0; + } + break; + case 'v': + verbose++; /* Allow for selecting the level of verbosity. */ + silent= 0; + break; + case '#': + DBUG_PUSH(argument ? argument : "d:t:o,/tmp/maria_pack.trace"); + break; + case 'V': + print_version(); + exit(0); + case 'I': + case '?': + usage(); + exit(0); + } + return 0; +} + + /* reads options */ + /* Initiates DEBUG - but no debugging here ! */ + +static void get_options(int *argc,char ***argv) +{ + int ho_error; + + my_progname= argv[0][0]; + if (isatty(fileno(stdout))) + write_loop=1; + + if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option))) + exit(ho_error); + + if (!*argc) + { + usage(); + exit(1); + } + if (join_table) + { + backup=0; /* Not needed */ + tmp_dir[0]=0; + } + return; +} + + +static MARIA_HA *open_isam_file(char *name,int mode) +{ + MARIA_HA *isam_file; + MARIA_SHARE *share; + DBUG_ENTER("open_isam_file"); + + if (!(isam_file=maria_open(name,mode, + (opt_wait ? HA_OPEN_WAIT_IF_LOCKED : + HA_OPEN_ABORT_IF_LOCKED)))) + { + VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno)); + DBUG_RETURN(0); + } + share=isam_file->s; + if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table) + { + if (!force_pack) + { + VOID(fprintf(stderr, "%s is already compressed\n", name)); + VOID(maria_close(isam_file)); + DBUG_RETURN(0); + } + if (verbose) + puts("Recompressing already compressed table"); + share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */ + } + if (! 
force_pack && share->state.state.records != 0 &&
+      (share->state.state.records <= 1 ||
+       share->state.state.data_file_length < 1024))
+  {
+    VOID(fprintf(stderr, "%s is too small to compress\n", name));
+    VOID(maria_close(isam_file));
+    DBUG_RETURN(0);
+  }
+  VOID(maria_lock_database(isam_file,F_WRLCK));
+  DBUG_RETURN(isam_file);
+}
+
+
+/*
+  Open all source tables of a join and check that they have the same layout.
+
+  mrg     Filled in with the opened handles; mrg->count is set on success
+  names   Table names
+  count   Number of names
+
+  Returns 0 on success. On failure every table opened so far is closed,
+  the handle array is freed and 1 is returned.
+*/
+
+static bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count)
+{
+  uint i,j;
+  mrg->count=0;
+  mrg->current=0;
+  mrg->file=(MARIA_HA**) my_malloc(sizeof(MARIA_HA*)*count,MYF(MY_FAE));
+  mrg->free_file=1;
+  mrg->src_file_has_indexes_disabled= 0;
+  for (i=0; i < count ; i++)
+  {
+    if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
+      goto error;
+
+    mrg->src_file_has_indexes_disabled|=
+      ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map,
+                                 mrg->file[i]->s->base.keys);
+  }
+  /*
+    Check that files are identical: compare each table with its successor.
+    'j+1 < count' instead of 'j < count-1' so that an unsigned count of 0
+    cannot wrap around.
+  */
+  for (j=0 ; j+1 < count ; j++)
+  {
+    MARIA_COLUMNDEF *m1,*m2,*end;
+    if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
+        mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
+      goto diff_file;
+    m1=mrg->file[j]->s->columndef;
+    end=m1+mrg->file[j]->s->base.fields;
+    m2=mrg->file[j+1]->s->columndef;
+    for ( ; m1 != end ; m1++,m2++)
+    {
+      if (m1->type != m2->type || m1->length != m2->length)
+        goto diff_file;
+    }
+  }
+  mrg->count=count;
+  return 0;
+
+ diff_file:
+  VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
+               my_progname, names[j], names[j+1]));
+ error:
+  /* i is the number of tables opened so far; close them in reverse. */
+  while (i--)
+    maria_close(mrg->file[i]);
+  my_free((uchar*) mrg->file,MYF(0));
+  return 1;
+}
+
+
+/*
+  Pack the table(s) described by 'mrg'.
+
+  mrg            Source table(s)
+  result_table   Name of the joined result table, or 0 to pack the single
+                 source table in place
+
+  Steps: create the output file, gather per-column statistics, build and
+  write the Huffman trees, write the compressed rows and the header, then
+  move the result into place and save the new state.
+
+  Returns 0 on success, -1 on failure (a message is printed to stderr).
+*/
+
+static int compress(PACK_MRG_INFO *mrg,char *result_table)
+{
+  int error;
+  File new_file,join_isam_file;
+  MARIA_HA *isam_file;
+  MARIA_SHARE *share;
+  char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
+  uint i,header_length,fields,trees,used_trees;
+  my_off_t old_length,new_length,tot_elements;
+  HUFF_COUNTS *huff_counts;
+  HUFF_TREE *huff_trees;
+  DBUG_ENTER("compress");
+
+  isam_file=mrg->file[0]; /* Take this as an example */
+  share=isam_file->s;
+  /* Initialise everything the 'err' path looks at before any 'goto err'. */
+  new_file=join_isam_file= -1;
+  trees=fields=0;
+  huff_trees=0;
+  huff_counts=0;
+  maria_block_size= isam_file->s->block_size;
+
+  /* Create temporary or join file */
+  if (backup)
+    VOID(fn_format(org_name,isam_file->s->open_file_name,"",MARIA_NAME_DEXT,
+                   2));
+  else
+    VOID(fn_format(org_name,isam_file->s->open_file_name,"",MARIA_NAME_DEXT,
+                   2+4+16));
+
+  if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0,
+                     maria_block_size) == 0)
+  {
+    fprintf(stderr, "Can't initialize page cache\n");
+    goto err;
+  }
+
+  if (!test_only && result_table)
+  {
+    /* Make a new indexfile based on first file in list */
+    uint length;
+    char *buff;
+    strmov(org_name,result_table); /* Fix error messages */
+    VOID(fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2));
+    if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
+        < 0)
+      goto err;
+    /* Copy the first base.keystart bytes of the first table's index file. */
+    length=(uint) share->base.keystart;
+    if (!(buff=my_malloc(length,MYF(MY_WME))))
+      goto err;
+    if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
+        my_write(join_isam_file,buff,length,
+                 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
+    {
+      my_free(buff,MYF(0));
+      goto err;
+    }
+    my_free(buff,MYF(0));
+    VOID(fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2));
+  }
+  else if (!tmp_dir[0])
+    VOID(make_new_name(new_name,org_name));
+  else
+    VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
+  if (!test_only &&
+      (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
+    goto err;
+
+  /* Start calculating statistics */
+
+  mrg->records=0;
+  for (i=0 ; i < mrg->count ; i++)
+    mrg->records+=mrg->file[i]->s->state.state.records;
+
+  DBUG_PRINT("info", ("Compressing %s: (%lu records)",
+                      result_table ? new_name : org_name,
+                      (ulong) mrg->records));
+  if (write_loop || verbose)
+  {
+    VOID(printf("Compressing %s: (%lu records)\n",
+                result_table ? new_name : org_name, (ulong) mrg->records));
+  }
+  trees=fields=share->base.fields;
+  huff_counts=init_huff_count(isam_file,mrg->records);
+  /*
+    init_huff_count() allocates with MY_WME only (no MY_FAE) and tests the
+    result, so a failed allocation presumably comes back as NULL here.
+    Bail out instead of handing a NULL array to get_statistic().
+  */
+  if (!huff_counts)
+    goto err;
+  QUICK_SAFEMALLOC;
+
+  /*
+    Read the whole data file(s) for statistics.
+  */
+  DBUG_PRINT("info", ("- Calculating statistics"));
+  if (write_loop || verbose)
+    VOID(printf("- Calculating statistics\n"));
+  if (get_statistic(mrg,huff_counts))
+    goto err;
+  NORMAL_SAFEMALLOC;
+  old_length=0;
+  for (i=0; i < mrg->count ; i++)
+    old_length+= (mrg->file[i]->s->state.state.data_file_length -
+                  mrg->file[i]->s->state.state.empty);
+
+  /*
+    Create a global priority queue in preparation for making
+    temporary Huffman trees.
+  */
+  if (init_queue(&queue,256,0,0,compare_huff_elements,0))
+    goto err;
+
+  /*
+    Check each column if we should use pre-space-compress, end-space-
+    compress, empty-field-compress or zero-field-compress.
+  */
+  check_counts(huff_counts,fields,mrg->records);
+
+  /*
+    Build a Huffman tree for each column.
+  */
+  huff_trees=make_huff_trees(huff_counts,trees);
+
+  /*
+    If the packed lengths of combined columns is less than the sum of
+    the non-combined columns, then create common Huffman trees for them.
+    We do this only for uchar compressed columns, not for distinct values
+    compressed columns.
+  */
+  if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
+    goto err;
+
+  /*
+    Assign codes to all uchar or column values.
+  */
+  if (make_huff_decode_table(huff_trees,fields))
+    goto err;
+
+  /* Prepare a file buffer. */
+  init_file_buffer(new_file,0);
+
+  /*
+    Reserve space in the target file for the fixed compressed file header.
+  */
+  file_buffer.pos_in_file=HEAD_LENGTH;
+  if (! test_only)
+    VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
+
+  /*
+    Write field infos: field type, pack type, length bits, tree number.
+  */
+  write_field_info(huff_counts,fields,used_trees);
+
+  /*
+    Write decode trees.
+  */
+  if (!(tot_elements=write_huff_tree(huff_trees,trees)))
+    goto err;
+
+  /*
+    Calculate the total length of the compression info header.
+    This includes the fixed compressed file header, the column compression
+    type descriptions, and the decode trees.
+  */
+  header_length=(uint) file_buffer.pos_in_file+
+    (uint) (file_buffer.pos-file_buffer.buffer);
+
+  /*
+    Compress the source file into the target file.
+  */
+  DBUG_PRINT("info", ("- Compressing file"));
+  if (write_loop || verbose)
+    VOID(printf("- Compressing file\n"));
+  error=compress_isam_file(mrg,huff_counts);
+  new_length=file_buffer.pos_in_file;
+  if (!error && !test_only)
+  {
+    char buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */
+    bzero(buff,sizeof(buff));
+    error=my_write(file_buffer.file,buff,sizeof(buff),
+                   MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
+  }
+
+  /*
+    Write the fixed compressed file header.
+  */
+  if (!error)
+    error=write_header(mrg,header_length,used_trees,tot_elements,
+                       new_length);
+
+  /* Flush the file buffer. */
+  end_file_buffer();
+
+  /* Display statistics. */
+  DBUG_PRINT("info", ("Min record length: %6d Max length: %6d "
+                      "Mean total length: %6ld",
+                      mrg->min_pack_length, mrg->max_pack_length,
+                      (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
+  if (verbose && mrg->records)
+    VOID(printf("Min record length: %6d Max length: %6d "
+                "Mean total length: %6ld\n", mrg->min_pack_length,
+                mrg->max_pack_length, (ulong) (new_length/mrg->records)));
+
+  /* Close source and target file. */
+  if (!test_only)
+  {
+    error|=my_close(new_file,MYF(MY_WME));
+    if (!result_table)
+    {
+      error|=my_close(isam_file->dfile.file, MYF(MY_WME));
+      isam_file->dfile.file= -1; /* Tell maria_close file is closed */
+      isam_file->s->bitmap.file.file= -1;
+    }
+  }
+
+  /* Cleanup. */
+  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
+  if (! test_only && ! error)
+  {
+    if (result_table)
+    {
+      error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
+    }
+    else
+    {
+      if (backup)
+      {
+        /* Keep the original under the name built by make_old_name(). */
+        if (my_rename(org_name,make_old_name(temp_name,
+                                             isam_file->s->open_file_name),
+                      MYF(MY_WME)))
+          error=1;
+        else
+        {
+          if (tmp_dir[0])
+            error=my_copy(new_name,org_name,MYF(MY_WME));
+          else
+            error=my_rename(new_name,org_name,MYF(MY_WME));
+          if (!error)
+          {
+            VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
+            if (tmp_dir[0])
+              VOID(my_delete(new_name,MYF(MY_WME)));
+          }
+        }
+      }
+      else
+      {
+        if (tmp_dir[0])
+        {
+          error=my_copy(new_name,org_name,
+                        MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
+          if (!error)
+            VOID(my_delete(new_name,MYF(MY_WME)));
+        }
+        else
+          error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
+      }
+      if (! error)
+        error=save_state(isam_file,mrg,new_length,glob_crc);
+    }
+  }
+  error|=mrg_close(mrg);
+  if (join_isam_file >= 0)
+    error|=my_close(join_isam_file,MYF(MY_WME));
+  if (error)
+  {
+    VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
+    VOID(my_delete(new_name,MYF(MY_WME)));
+    DBUG_RETURN(-1);
+  }
+  if (write_loop || verbose)
+  {
+    if (old_length)
+      VOID(printf("%.4g%% \n",
+                  (((longlong) (old_length - new_length)) * 100.0 /
+                   (longlong) old_length)));
+    else
+      puts("Empty file saved in compressed format");
+  }
+  DBUG_RETURN(0);
+
+ err:
+  /* Common failure exit: release whatever has been set up so far. */
+  end_pagecache(maria_pagecache, 1);
+  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
+  if (new_file >= 0)
+    VOID(my_close(new_file,MYF(0)));
+  if (join_isam_file >= 0)
+    VOID(my_close(join_isam_file,MYF(0)));
+  mrg_close(mrg);
+  VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
+  DBUG_RETURN(-1);
+}
+
+  /* Init a huff_count-struct for each field and init it */
+
+static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
+{
+  reg2 uint i;
+  reg1 HUFF_COUNTS *count;
+  if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
+                                        sizeof(HUFF_COUNTS),
+                                        MYF(MY_ZEROFILL | MY_WME))))
+  {
+    for
(i=0 ; i < info->s->base.fields ; i++) + { + enum en_fieldtype type; + count[i].field_length=info->s->columndef[i].length; + type= count[i].field_type= (enum en_fieldtype) info->s->columndef[i].type; + if (type == FIELD_INTERVALL || + type == FIELD_CONSTANT || + type == FIELD_ZERO) + type = FIELD_NORMAL; + if (count[i].field_length <= 8 && + (type == FIELD_NORMAL || + type == FIELD_SKIP_ZERO)) + count[i].max_zero_fill= count[i].field_length; + /* + For every column initialize a tree, which is used to detect distinct + column values. 'int_tree' works together with 'tree_buff' and + 'tree_pos'. It's keys are implemented by pointers into 'tree_buff'. + This is accomplished by '-1' as the element size. + */ + init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL, + NULL); + if (records && type != FIELD_BLOB && type != FIELD_VARCHAR) + count[i].tree_pos=count[i].tree_buff = + my_malloc(count[i].field_length > 1 ? tree_buff_length : 2, + MYF(MY_WME)); + } + } + return count; +} + + + /* Free memory used by counts and trees */ + +static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees, + HUFF_COUNTS *huff_counts, + uint fields) +{ + register uint i; + + if (huff_trees) + { + for (i=0 ; i < trees ; i++) + { + if (huff_trees[i].element_buffer) + my_free((uchar*) huff_trees[i].element_buffer,MYF(0)); + if (huff_trees[i].code) + my_free((uchar*) huff_trees[i].code,MYF(0)); + } + my_free((uchar*) huff_trees,MYF(0)); + } + if (huff_counts) + { + for (i=0 ; i < fields ; i++) + { + if (huff_counts[i].tree_buff) + { + my_free((uchar*) huff_counts[i].tree_buff,MYF(0)); + delete_tree(&huff_counts[i].int_tree); + } + } + my_free((uchar*) huff_counts,MYF(0)); + } + delete_queue(&queue); /* This is safe to free */ + return; +} + + /* Read through old file and gather some statistics */ + +static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts) +{ + int error; + uint length, null_bytes; + ulong reclength,max_blob_length; + uchar 
*record,*pos,*next_pos,*end_pos,*start_pos; + ha_rows record_count; + HUFF_COUNTS *count,*end_count; + TREE_ELEMENT *element; + ha_checksum(*calc_checksum) (struct st_maria_info *, const uchar *); + DBUG_ENTER("get_statistic"); + + reclength= mrg->file[0]->s->base.reclength; + null_bytes= mrg->file[0]->s->base.null_bytes; + record=(uchar*) my_alloca(reclength); + end_count=huff_counts+mrg->file[0]->s->base.fields; + record_count=0; glob_crc=0; + max_blob_length=0; + + /* Check how to calculate checksum */ + if (mrg->file[0]->s->data_file_type == STATIC_RECORD) + calc_checksum= _ma_static_checksum; + else + calc_checksum= _ma_checksum; + + mrg_reset(mrg); + while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE) + { + ulong tot_blob_length=0; + if (! error) + { + /* glob_crc is a checksum over all bytes of all records. */ + glob_crc+= (*calc_checksum)(mrg->file[0],record); + + /* Count the incidence of values separately for every column. */ + for (pos=record + null_bytes, count=huff_counts ; + count < end_count ; + count++, + pos=next_pos) + { + next_pos=end_pos=(start_pos=pos)+count->field_length; + + /* + Put the whole column value in a tree if there is room for it. + 'int_tree' is used to quickly check for duplicate values. + 'tree_buff' collects as many distinct column values as + possible. If the field length is > 1, it is tree_buff_length, + else 2 bytes. Each value is 'field_length' bytes big. If there + are more distinct column values than fit into the buffer, we + give up with this tree. BLOBs and VARCHARs do not have a + tree_buff as it can only be used with fixed length columns. + For the special case of field length == 1, we handle only the + case that there is only one distinct value in the table(s). + Otherwise, we can have a maximum of 256 distinct values. This + is then handled by the normal Huffman tree build. + + Another limit for collecting distinct column values is the + number of values itself. 
Since we would need to build a + Huffman tree for the values, we are limited by the 'IS_OFFSET' + constant. This constant expresses a bit which is used to + determine if a tree element holds a final value or an offset + to a child element. Hence, all values and offsets need to be + smaller than 'IS_OFFSET'. A tree element is implemented with + two integer values, one for the left branch and one for the + right branch. For the extreme case that the first element + points to the last element, the number of integers in the tree + must be less or equal to IS_OFFSET. So the number of elements + must be less or equal to IS_OFFSET / 2. + + WARNING: At first, we insert a pointer into the record buffer + as the key for the tree. If we got a new distinct value, which + is really inserted into the tree, instead of being counted + only, we will copy the column value from the record buffer to + 'tree_buff' and adjust the key pointer of the tree accordingly. + */ + if (count->tree_buff) + { + global_count=count; + if (!(element=tree_insert(&count->int_tree,pos, 0, + count->int_tree.custom_arg)) || + (element->count == 1 && + (count->tree_buff + tree_buff_length < + count->tree_pos + count->field_length)) || + (count->int_tree.elements_in_tree > IS_OFFSET / 2) || + (count->field_length == 1 && + count->int_tree.elements_in_tree > 1)) + { + delete_tree(&count->int_tree); + my_free(count->tree_buff,MYF(0)); + count->tree_buff=0; + } + else + { + /* + If tree_insert() succeeds, it either creates a new element + or increments the counter of an existing element. + */ + if (element->count == 1) + { + /* Copy the new column value into 'tree_buff'. */ + memcpy(count->tree_pos,pos,(size_t) count->field_length); + /* Adjust the key pointer in the tree. */ + tree_set_pointer(element,count->tree_pos); + /* Point behind the last column value so far. 
*/ + count->tree_pos+=count->field_length; + } + } + } + + /* Save character counters and space-counts and zero-field-counts */ + if (count->field_type == FIELD_NORMAL || + count->field_type == FIELD_SKIP_ENDSPACE) + { + /* Ignore trailing space. */ + for ( ; end_pos > pos ; end_pos--) + if (end_pos[-1] != ' ') + break; + /* Empty fields are just counted. Go to the next record. */ + if (end_pos == pos) + { + count->empty_fields++; + count->max_zero_fill=0; + continue; + } + /* + Count the total of all trailing spaces and the number of + short trailing spaces. Remember the longest trailing space. + */ + length= (uint) (next_pos-end_pos); + count->tot_end_space+=length; + if (length < 8) + count->end_space[length]++; + if (count->max_end_space < length) + count->max_end_space = length; + } + + if (count->field_type == FIELD_NORMAL || + count->field_type == FIELD_SKIP_PRESPACE) + { + /* Ignore leading space. */ + for (pos=start_pos; pos < end_pos ; pos++) + if (pos[0] != ' ') + break; + /* Empty fields are just counted. Go to the next record. */ + if (end_pos == pos) + { + count->empty_fields++; + count->max_zero_fill=0; + continue; + } + /* + Count the total of all leading spaces and the number of + short leading spaces. Remember the longest leading space. + */ + length= (uint) (pos-start_pos); + count->tot_pre_space+=length; + if (length < 8) + count->pre_space[length]++; + if (count->max_pre_space < length) + count->max_pre_space = length; + } + + /* Calculate pos, end_pos, and max_length for variable length fields. 
*/ + if (count->field_type == FIELD_BLOB) + { + uint field_length=count->field_length -portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(field_length, start_pos); + memcpy_fixed((char*) &pos, start_pos+field_length,sizeof(char*)); + end_pos=pos+blob_length; + tot_blob_length+=blob_length; + set_if_bigger(count->max_length,blob_length); + } + else if (count->field_type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1); + length= (pack_length == 1 ? (uint) *(uchar*) start_pos : + uint2korr(start_pos)); + pos= start_pos+pack_length; + end_pos= pos+length; + set_if_bigger(count->max_length,length); + } + + /* Evaluate 'max_zero_fill' for short fields. */ + if (count->field_length <= 8 && + (count->field_type == FIELD_NORMAL || + count->field_type == FIELD_SKIP_ZERO)) + { + uint i; + /* Zero fields are just counted. Go to the next record. */ + if (!memcmp((uchar*) start_pos,zero_string,count->field_length)) + { + count->zero_fields++; + continue; + } + /* + max_zero_fill starts with field_length. It is decreased every + time a shorter "zero trailer" is found. It is set to zero when + an empty field is found (see above). This suggests that the + variable should be called 'min_zero_fill'. + */ + for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ; + i++) ; + if (i < count->max_zero_fill) + count->max_zero_fill=i; + } + + /* Ignore zero fields and check fields. */ + if (count->field_type == FIELD_ZERO || + count->field_type == FIELD_CHECK) + continue; + + /* + Count the incidence of every uchar value in the + significant field value. + */ + for ( ; pos < end_pos ; pos++) + count->counts[(uchar) *pos]++; + + /* Step to next field. 
*/ + } + + if (tot_blob_length > max_blob_length) + max_blob_length=tot_blob_length; + record_count++; + if (write_loop && record_count % WRITE_COUNT == 0) + { + VOID(printf("%lu\r", (ulong) record_count)); + VOID(fflush(stdout)); + } + } + else if (error != HA_ERR_RECORD_DELETED) + { + VOID(fprintf(stderr, "Got error %d while reading rows", error)); + break; + } + + /* Step to next record. */ + } + if (write_loop) + { + VOID(printf(" \r")); + VOID(fflush(stdout)); + } + + /* + If --debug=d,fakebigcodes is set, fake the counts to get big Huffman + codes. + */ + DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count);); + + DBUG_PRINT("info", ("Found the following number of incidents " + "of the uchar codes:")); + if (verbose >= 2) + VOID(printf("Found the following number of incidents " + "of the uchar codes:\n")); + for (count= huff_counts ; count < end_count; count++) + { + uint idx; + my_off_t total_count; + char llbuf[32]; + + DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1))); + if (verbose >= 2) + VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1))); + if (count->tree_buff) + { + DBUG_PRINT("info", ("number of distinct values: %u", + (uint) ((count->tree_pos - count->tree_buff) / + count->field_length))); + if (verbose >= 2) + VOID(printf("number of distinct values: %u\n", + (uint) ((count->tree_pos - count->tree_buff) / + count->field_length))); + } + total_count= 0; + for (idx= 0; idx < 256; idx++) + { + if (count->counts[idx]) + { + total_count+= count->counts[idx]; + DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx, + llstr((longlong) count->counts[idx], llbuf))); + if (verbose >= 2) + VOID(printf("counts[0x%02x]: %12s\n", idx, + llstr((longlong) count->counts[idx], llbuf))); + } + } + DBUG_PRINT("info", ("total: %12s", llstr((longlong) total_count, + llbuf))); + if ((verbose >= 2) && total_count) + { + VOID(printf("total: %12s\n", + llstr((longlong) total_count, llbuf))); + } + } + + mrg->records=record_count; + 
mrg->max_blob_length=max_blob_length; + my_afree((uchar*) record); + DBUG_RETURN(error != HA_ERR_END_OF_FILE); +} + +static int compare_huff_elements(void *not_used __attribute__((unused)), + uchar *a, uchar *b) +{ + return *((my_off_t*) a) < *((my_off_t*) b) ? -1 : + (*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1); +} + + /* Check each tree if we should use pre-space-compress, end-space- + compress, empty-field-compress or zero-field-compress */ + +static void check_counts(HUFF_COUNTS *huff_counts, uint trees, + my_off_t records) +{ + uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count]; + my_off_t old_length,new_length,length; + DBUG_ENTER("check_counts"); + + bzero((uchar*) field_count,sizeof(field_count)); + space_fields=fill_zero_fields=0; + + for (; trees-- ; huff_counts++) + { + if (huff_counts->field_type == FIELD_BLOB) + { + huff_counts->length_bits=max_bit(huff_counts->max_length); + goto found_pack; + } + else if (huff_counts->field_type == FIELD_VARCHAR) + { + huff_counts->length_bits=max_bit(huff_counts->max_length); + goto found_pack; + } + else if (huff_counts->field_type == FIELD_CHECK) + { + huff_counts->bytes_packed=0; + huff_counts->counts[0]=0; + goto found_pack; + } + + huff_counts->field_type=FIELD_NORMAL; + huff_counts->pack_type=0; + + /* Check for zero-filled records (in this column), or zero records. */ + if (huff_counts->zero_fields || ! records) + { + my_off_t old_space_count; + /* + If there are only zero filled records (in this column), + or no records at all, we are done. + */ + if (huff_counts->zero_fields == records) + { + huff_counts->field_type= FIELD_ZERO; + huff_counts->bytes_packed=0; + huff_counts->counts[0]=0; + goto found_pack; + } + /* Remeber the number of significant spaces. */ + old_space_count=huff_counts->counts[' ']; + /* Add all leading and trailing spaces. 
*/ + huff_counts->counts[' ']+= (huff_counts->tot_end_space + + huff_counts->tot_pre_space + + huff_counts->empty_fields * + huff_counts->field_length); + /* Check, what the compressed length of this would be. */ + old_length=calc_packed_length(huff_counts,0)+records/8; + /* Get the number of zero bytes. */ + length=huff_counts->zero_fields*huff_counts->field_length; + /* Add it to the counts. */ + huff_counts->counts[0]+=length; + /* Check, what the compressed length of this would be. */ + new_length=calc_packed_length(huff_counts,0); + /* If the compression without the zeroes would be shorter, we are done. */ + if (old_length < new_length && huff_counts->field_length > 1) + { + huff_counts->field_type=FIELD_SKIP_ZERO; + huff_counts->counts[0]-=length; + huff_counts->bytes_packed=old_length- records/8; + goto found_pack; + } + /* Remove the insignificant spaces, but keep the zeroes. */ + huff_counts->counts[' ']=old_space_count; + } + /* Check, what the compressed length of this column would be. */ + huff_counts->bytes_packed=calc_packed_length(huff_counts,0); + + /* + If there are enough empty records (in this column), + treating them specially may pay off. + */ + if (huff_counts->empty_fields) + { + if (huff_counts->field_length > 2 && + huff_counts->empty_fields + (records - huff_counts->empty_fields)* + (1+max_bit(max(huff_counts->max_pre_space, + huff_counts->max_end_space))) < + records * max_bit(huff_counts->field_length)) + { + huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS; + } + else + { + length=huff_counts->empty_fields*huff_counts->field_length; + if (huff_counts->tot_end_space || ! 
huff_counts->tot_pre_space) + { + huff_counts->tot_end_space+=length; + huff_counts->max_end_space=huff_counts->field_length; + if (huff_counts->field_length < 8) + huff_counts->end_space[huff_counts->field_length]+= + huff_counts->empty_fields; + } + if (huff_counts->tot_pre_space) + { + huff_counts->tot_pre_space+=length; + huff_counts->max_pre_space=huff_counts->field_length; + if (huff_counts->field_length < 8) + huff_counts->pre_space[huff_counts->field_length]+= + huff_counts->empty_fields; + } + } + } + + /* + If there are enough trailing spaces (in this column), + treating them specially may pay off. + */ + if (huff_counts->tot_end_space) + { + huff_counts->counts[' ']+=huff_counts->tot_pre_space; + if (test_space_compress(huff_counts,records,huff_counts->max_end_space, + huff_counts->end_space, + huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE)) + goto found_pack; + huff_counts->counts[' ']-=huff_counts->tot_pre_space; + } + + /* + If there are enough leading spaces (in this column), + treating them specially may pay off. + */ + if (huff_counts->tot_pre_space) + { + if (test_space_compress(huff_counts,records,huff_counts->max_pre_space, + huff_counts->pre_space, + huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE)) + goto found_pack; + } + + found_pack: /* Found field-packing */ + + /* Test if we can use zero-fill */ + + if (huff_counts->max_zero_fill && + (huff_counts->field_type == FIELD_NORMAL || + huff_counts->field_type == FIELD_SKIP_ZERO)) + { + huff_counts->counts[0]-=huff_counts->max_zero_fill* + (huff_counts->field_type == FIELD_SKIP_ZERO ? 
+ records - huff_counts->zero_fields : records); + huff_counts->pack_type|=PACK_TYPE_ZERO_FILL; + huff_counts->bytes_packed=calc_packed_length(huff_counts,0); + } + + /* Test if intervall-field is better */ + + if (huff_counts->tree_buff) + { + HUFF_TREE tree; + + DBUG_EXECUTE_IF("forceintervall", + huff_counts->bytes_packed= ~ (my_off_t) 0;); + tree.element_buffer=0; + if (!make_huff_tree(&tree,huff_counts) && + tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed) + { + if (tree.elements == 1) + huff_counts->field_type=FIELD_CONSTANT; + else + huff_counts->field_type=FIELD_INTERVALL; + huff_counts->pack_type=0; + } + else + { + my_free((uchar*) huff_counts->tree_buff,MYF(0)); + delete_tree(&huff_counts->int_tree); + huff_counts->tree_buff=0; + } + if (tree.element_buffer) + my_free((uchar*) tree.element_buffer,MYF(0)); + } + if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS) + space_fields++; + if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL) + fill_zero_fields++; + field_count[huff_counts->field_type]++; + } + DBUG_PRINT("info", ("normal: %3d empty-space: %3d " + "empty-zero: %3d empty-fill: %3d", + field_count[FIELD_NORMAL],space_fields, + field_count[FIELD_SKIP_ZERO],fill_zero_fields)); + DBUG_PRINT("info", ("pre-space: %3d end-space: %3d " + "intervall-fields: %3d zero: %3d", + field_count[FIELD_SKIP_PRESPACE], + field_count[FIELD_SKIP_ENDSPACE], + field_count[FIELD_INTERVALL], + field_count[FIELD_ZERO])); + if (verbose) + VOID(printf("\nnormal: %3d empty-space: %3d " + "empty-zero: %3d empty-fill: %3d\n" + "pre-space: %3d end-space: %3d " + "intervall-fields: %3d zero: %3d\n", + field_count[FIELD_NORMAL],space_fields, + field_count[FIELD_SKIP_ZERO],fill_zero_fields, + field_count[FIELD_SKIP_PRESPACE], + field_count[FIELD_SKIP_ENDSPACE], + field_count[FIELD_INTERVALL], + field_count[FIELD_ZERO])); + DBUG_VOID_RETURN; +} + + +/* Test if we can use space-compression and empty-field-compression */ + +static int +test_space_compress(HUFF_COUNTS 
*huff_counts, my_off_t records, + uint max_space_length, my_off_t *space_counts, + my_off_t tot_space_count, enum en_fieldtype field_type) +{ + int min_pos; + uint length_bits,i; + my_off_t space_count,min_space_count,min_pack,new_length,skip; + + length_bits=max_bit(max_space_length); + + /* Default no end_space-packing */ + space_count=huff_counts->counts[(uint) ' ']; + min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count); + min_pack=calc_packed_length(huff_counts,0); + min_pos= -2; + huff_counts->counts[(uint) ' ']=space_count; + + /* Test with allways space-count */ + new_length=huff_counts->bytes_packed+length_bits*records/8; + if (new_length+1 < min_pack) + { + min_pos= -1; + min_pack=new_length; + min_space_count=space_count; + } + /* Test with length-flag */ + for (skip=0L, i=0 ; i < 8 ; i++) + { + if (space_counts[i]) + { + if (i) + huff_counts->counts[(uint) ' ']+=space_counts[i]; + skip+=huff_counts->pre_space[i]; + new_length=calc_packed_length(huff_counts,0)+ + (records+(records-skip)*(1+length_bits))/8; + if (new_length < min_pack) + { + min_pos=(int) i; + min_pack=new_length; + min_space_count=huff_counts->counts[(uint) ' ']; + } + } + } + + huff_counts->counts[(uint) ' ']=min_space_count; + huff_counts->bytes_packed=min_pack; + switch (min_pos) { + case -2: + return(0); /* No space-compress */ + case -1: /* Always space-count */ + huff_counts->field_type=field_type; + huff_counts->min_space=0; + huff_counts->length_bits=max_bit(max_space_length); + break; + default: + huff_counts->field_type=field_type; + huff_counts->min_space=(uint) min_pos; + huff_counts->pack_type|=PACK_TYPE_SELECTED; + huff_counts->length_bits=max_bit(max_space_length); + break; + } + return(1); /* Using space-compress */ +} + + + /* Make a huff_tree of each huff_count */ + +static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees) +{ + uint tree; + HUFF_TREE *huff_tree; + DBUG_ENTER("make_huff_trees"); + + if (!(huff_tree=(HUFF_TREE*) 
my_malloc(trees*sizeof(HUFF_TREE), + MYF(MY_WME | MY_ZEROFILL)))) + DBUG_RETURN(0); + + for (tree=0 ; tree < trees ; tree++) + { + if (make_huff_tree(huff_tree+tree,huff_counts+tree)) + { + while (tree--) + my_free((uchar*) huff_tree[tree].element_buffer,MYF(0)); + my_free((uchar*) huff_tree,MYF(0)); + DBUG_RETURN(0); + } + } + DBUG_RETURN(huff_tree); +} + +/* + Build a Huffman tree. + + SYNOPSIS + make_huff_tree() + huff_tree The Huffman tree. + huff_counts The counts. + + DESCRIPTION + Build a Huffman tree according to huff_counts->counts or + huff_counts->tree_buff. tree_buff, if non-NULL contains up to + tree_buff_length of distinct column values. In that case, whole + values can be Huffman encoded instead of single bytes. + + RETURN + 0 OK + != 0 Error +*/ + +static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts) +{ + uint i,found,bits_packed,first,last; + my_off_t bytes_packed; + HUFF_ELEMENT *a,*b,*new_huff_el; + + first=last=0; + if (huff_counts->tree_buff) + { + /* Calculate the number of distinct values in tree_buff. */ + found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) / + huff_counts->field_length; + first=0; last=found-1; + } + else + { + /* Count the number of uchar codes found in the column. */ + for (i=found=0 ; i < 256 ; i++) + { + if (huff_counts->counts[i]) + { + if (! found++) + first=i; + last=i; + } + } + if (found < 2) + found=2; + } + + /* When using 'tree_buff' we can have more that 256 values. */ + if (queue.max_elements < found) + { + delete_queue(&queue); + if (init_queue(&queue,found,0,0,compare_huff_elements,0)) + return -1; + } + + /* Allocate or reallocate an element buffer for the Huffman tree. 
*/ + if (!huff_tree->element_buffer) + { + if (!(huff_tree->element_buffer= + (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME)))) + return 1; + } + else + { + HUFF_ELEMENT *temp; + if (!(temp= + (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer, + found*2*sizeof(HUFF_ELEMENT), + MYF(MY_WME)))) + return 1; + huff_tree->element_buffer=temp; + } + + huff_counts->tree=huff_tree; + huff_tree->counts=huff_counts; + huff_tree->min_chr=first; + huff_tree->max_chr=last; + huff_tree->char_bits=max_bit(last-first); + huff_tree->offset_bits=max_bit(found-1)+1; + + if (huff_counts->tree_buff) + { + huff_tree->elements=0; + huff_tree->tree_pack_length=(1+15+16+5+5+ + (huff_tree->char_bits+1)*found+ + (huff_tree->offset_bits+1)* + (found-2)+7)/8 + + (uint) (huff_tree->counts->tree_pos- + huff_tree->counts->tree_buff); + /* + Put a HUFF_ELEMENT into the queue for every distinct column value. + + tree_walk() calls save_counts_in_queue() for every element in + 'int_tree'. This takes elements from the target trees element + buffer and places references to them into the buffer of the + priority queue. We insert in column value order, but the order is + in fact irrelevant here. We will establish the correct order + later. + */ + tree_walk(&huff_counts->int_tree, + (int (*)(void*, element_count,void*)) save_counts_in_queue, + (uchar*) huff_tree, left_root_right); + } + else + { + huff_tree->elements=found; + huff_tree->tree_pack_length=(9+9+5+5+ + (huff_tree->char_bits+1)*found+ + (huff_tree->offset_bits+1)* + (found-2)+7)/8; + /* + Put a HUFF_ELEMENT into the queue for every uchar code found in the column. + + The elements are taken from the target trees element buffer. + Instead of using queue_insert(), we just place references to the + elements into the buffer of the priority queue. We insert in byte + value order, but the order is in fact irrelevant here. We will + establish the correct order later. 
+ */ + for (i=first, found=0 ; i <= last ; i++) + { + if (huff_counts->counts[i]) + { + new_huff_el=huff_tree->element_buffer+(found++); + new_huff_el->count=huff_counts->counts[i]; + new_huff_el->a.leaf.null=0; + new_huff_el->a.leaf.element_nr=i; + queue.root[found]=(uchar*) new_huff_el; + } + } + /* + If there is only a single uchar value in this field in all records, + add a second element with zero incidence. This is required to enter + the loop, which builds the Huffman tree. + */ + while (found < 2) + { + new_huff_el=huff_tree->element_buffer+(found++); + new_huff_el->count=0; + new_huff_el->a.leaf.null=0; + if (last) + new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1; + else + new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1; + queue.root[found]=(uchar*) new_huff_el; + } + } + + /* Make a queue from the queue buffer. */ + queue.elements=found; + + /* + Make a priority queue from the queue. Construct its index so that we + have a partially ordered tree. + */ + for (i=found/2 ; i > 0 ; i--) + _downheap(&queue,i); + + /* The Huffman algorithm. */ + bytes_packed=0; bits_packed=0; + for (i=1 ; i < found ; i++) + { + /* + Pop the top element from the queue (the one with the least incidence). + Popping from a priority queue includes a re-ordering of the queue, + to get the next least incidence element to the top. + */ + a=(HUFF_ELEMENT*) queue_remove(&queue,0); + /* + Copy the next least incidence element. The queue implementation + reserves root[0] for temporary purposes. root[1] is the top. + */ + b=(HUFF_ELEMENT*) queue.root[1]; + /* Get a new element from the element buffer. */ + new_huff_el=huff_tree->element_buffer+found+i; + /* The new element gets the sum of the two least incidence elements. */ + new_huff_el->count=a->count+b->count; + /* + The Huffman algorithm assigns another bit to the code for a byte + every time that bytes incidence is combined (directly or indirectly) + to a new element as one of the two least incidence elements. 
+ This means that one more bit per incidence of that uchar is required + in the resulting file. So we add the new combined incidence as the + number of bits by which the result grows. + */ + bits_packed+=(uint) (new_huff_el->count & 7); + bytes_packed+=new_huff_el->count/8; + /* The new element points to its children, lesser in left. */ + new_huff_el->a.nod.left=a; + new_huff_el->a.nod.right=b; + /* + Replace the copied top element by the new element and re-order the + queue. + */ + queue.root[1]=(uchar*) new_huff_el; + queue_replaced(&queue); + } + huff_tree->root=(HUFF_ELEMENT*) queue.root[1]; + huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8; + return 0; +} + +static int compare_tree(void* cmp_arg __attribute__((unused)), + register const uchar *s, register const uchar *t) +{ + uint length; + for (length=global_count->field_length; length-- ;) + if (*s++ != *t++) + return (int) s[-1] - (int) t[-1]; + return 0; +} + +/* + Organize distinct column values and their incidences into a priority queue. + + SYNOPSIS + save_counts_in_queue() + key The column value. + count The incidence of this value. + tree The Huffman tree to be built later. + + DESCRIPTION + We use the element buffer of the targeted tree. The distinct column + values are organized in a priority queue first. The Huffman + algorithm will later organize the elements into a Huffman tree. For + the time being, we just place references to the elements into the + queue buffer. The buffer will later be organized into a priority + queue. 
+ + RETURN + 0 + */ + +static int save_counts_in_queue(uchar *key, element_count count, + HUFF_TREE *tree) +{ + HUFF_ELEMENT *new_huff_el; + + new_huff_el=tree->element_buffer+(tree->elements++); + new_huff_el->count=count; + new_huff_el->a.leaf.null=0; + new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) / + tree->counts->field_length; + queue.root[tree->elements]=(uchar*) new_huff_el; + return 0; +} + + +/* + Calculate length of file if given counts should be used. + + SYNOPSIS + calc_packed_length() + huff_counts The counts for a column of the table(s). + add_tree_lenght If the decode tree length should be added. + + DESCRIPTION + We need to follow the Huffman algorithm until we know, how many bits + are required for each uchar code. But we do not need the resulting + Huffman tree. Hence, we can leave out some steps which are essential + in make_huff_tree(). + + RETURN + Number of bytes required to compress this table column. +*/ + +static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts, + uint add_tree_lenght) +{ + uint i,found,bits_packed,first,last; + my_off_t bytes_packed; + HUFF_ELEMENT element_buffer[256]; + DBUG_ENTER("calc_packed_length"); + + /* + WARNING: We use a small hack for efficiency: Instead of placing + references to HUFF_ELEMENTs into the queue, we just insert + references to the counts of the uchar codes which appeared in this + table column. During the Huffman algorithm they are successively + replaced by references to HUFF_ELEMENTs. This works, because + HUFF_ELEMENTs have the incidence count at their beginning. + Regardless, wether the queue array contains references to counts of + type my_off_t or references to HUFF_ELEMENTs which have the count of + type my_off_t at their beginning, it always points to a count of the + same type. + + Instead of using queue_insert(), we just copy the references into + the buffer of the priority queue. We insert in uchar value order, but + the order is in fact irrelevant here. 
We will establish the correct + order later. + */ + first=last=0; + for (i=found=0 ; i < 256 ; i++) + { + if (huff_counts->counts[i]) + { + if (! found++) + first=i; + last=i; + /* We start with root[1], which is the queues top element. */ + queue.root[found]=(uchar*) &huff_counts->counts[i]; + } + } + if (!found) + DBUG_RETURN(0); /* Empty tree */ + /* + If there is only a single uchar value in this field in all records, + add a second element with zero incidence. This is required to enter + the loop, which follows the Huffman algorithm. + */ + if (found < 2) + queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1]; + + /* Make a queue from the queue buffer. */ + queue.elements=found; + + bytes_packed=0; bits_packed=0; + /* Add the length of the coding table, which would become part of the file. */ + if (add_tree_lenght) + bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+ + (max_bit(found-1)+1+1)*(found-2) +7)/8; + + /* + Make a priority queue from the queue. Construct its index so that we + have a partially ordered tree. + */ + for (i=(found+1)/2 ; i > 0 ; i--) + _downheap(&queue,i); + + /* The Huffman algorithm. */ + for (i=0 ; i < found-1 ; i++) + { + my_off_t *a; + my_off_t *b; + HUFF_ELEMENT *new_huff_el; + + /* + Pop the top element from the queue (the one with the least + incidence). Popping from a priority queue includes a re-ordering + of the queue, to get the next least incidence element to the top. + */ + a= (my_off_t*) queue_remove(&queue, 0); + /* + Copy the next least incidence element. The queue implementation + reserves root[0] for temporary purposes. root[1] is the top. + */ + b= (my_off_t*) queue.root[1]; + /* Create a new element in a local (automatic) buffer. */ + new_huff_el= element_buffer + i; + /* The new element gets the sum of the two least incidence elements. 
*/ + new_huff_el->count= *a + *b; + /* + The Huffman algorithm assigns another bit to the code for a byte + every time that bytes incidence is combined (directly or indirectly) + to a new element as one of the two least incidence elements. + This means that one more bit per incidence of that uchar is required + in the resulting file. So we add the new combined incidence as the + number of bits by which the result grows. + */ + bits_packed+=(uint) (new_huff_el->count & 7); + bytes_packed+=new_huff_el->count/8; + /* + Replace the copied top element by the new element and re-order the + queue. This successively replaces the references to counts by + references to HUFF_ELEMENTs. + */ + queue.root[1]=(uchar*) new_huff_el; + queue_replaced(&queue); + } + DBUG_RETURN(bytes_packed+(bits_packed+7)/8); +} + + + /* Remove trees that don't give any compression */ + +static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees) +{ + uint k,tree_number; + HUFF_COUNTS count,*i,*j,*last_count; + + last_count=huff_counts+trees; + for (tree_number=0, i=huff_counts ; i < last_count ; i++) + { + if (!i->tree->tree_number) + { + i->tree->tree_number= ++tree_number; + if (i->tree_buff) + continue; /* Don't join intervall */ + for (j=i+1 ; j < last_count ; j++) + { + if (! j->tree->tree_number && ! 
j->tree_buff) + { + for (k=0 ; k < 256 ; k++) + count.counts[k]=i->counts[k]+j->counts[k]; + if (calc_packed_length(&count,1) <= + i->tree->bytes_packed + j->tree->bytes_packed+ + i->tree->tree_pack_length+j->tree->tree_pack_length+ + ALLOWED_JOIN_DIFF) + { + memcpy_fixed((uchar*) i->counts,(uchar*) count.counts, + sizeof(count.counts[0])*256); + my_free((uchar*) j->tree->element_buffer,MYF(0)); + j->tree->element_buffer=0; + j->tree=i->tree; + bmove((uchar*) i->counts,(uchar*) count.counts, + sizeof(count.counts[0])*256); + if (make_huff_tree(i->tree,i)) + return (uint) -1; + } + } + } + } + } + DBUG_PRINT("info", ("Original trees: %d After join: %d", + trees, tree_number)); + if (verbose) + VOID(printf("Original trees: %d After join: %d\n", trees, tree_number)); + return tree_number; /* Return trees left */ +} + + +/* + Fill in huff_tree encode tables. + + SYNOPSIS + make_huff_decode_table() + huff_tree An array of HUFF_TREE which are to be encoded. + trees The number of HUFF_TREE in the array. + + RETURN + 0 success + != 0 error +*/ + +static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees) +{ + uint elements; + for ( ; trees-- ; huff_tree++) + { + if (huff_tree->tree_number > 0) + { + elements=huff_tree->counts->tree_buff ? 
huff_tree->elements : 256; + if (!(huff_tree->code = + (ulonglong*) my_malloc(elements* + (sizeof(ulonglong) + sizeof(uchar)), + MYF(MY_WME | MY_ZEROFILL)))) + return 1; + huff_tree->code_len=(uchar*) (huff_tree->code+elements); + make_traverse_code_tree(huff_tree, huff_tree->root, + 8 * sizeof(ulonglong), LL(0)); + } + } + return 0; +} + + +static void make_traverse_code_tree(HUFF_TREE *huff_tree, + HUFF_ELEMENT *element, + uint size, ulonglong code) +{ + uint chr; + if (!element->a.leaf.null) + { + chr=element->a.leaf.element_nr; + huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size); + huff_tree->code[chr]= (code >> size); + if (huff_tree->height < 8 * sizeof(ulonglong) - size) + huff_tree->height= 8 * sizeof(ulonglong) - size; + } + else + { + size--; + make_traverse_code_tree(huff_tree,element->a.nod.left,size,code); + make_traverse_code_tree(huff_tree, element->a.nod.right, size, + code + (((ulonglong) 1) << size)); + } + return; +} + + +/* + Convert a value into binary digits. + + SYNOPSIS + bindigits() + value The value. + length The number of low order bits to convert. + + NOTE + The result string is in static storage. It is reused on every call. + So you cannot use it twice in one expression. + + RETURN + A pointer to a static NUL-terminated string. + */ + +static char *bindigits(ulonglong value, uint bits) +{ + static char digits[72]; + char *ptr= digits; + uint idx= bits; + + DBUG_ASSERT(idx < sizeof(digits)); + while (idx) + *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1); + *ptr= '\0'; + return digits; +} + + +/* + Convert a value into hexadecimal digits. + + SYNOPSIS + hexdigits() + value The value. + + NOTE + The result string is in static storage. It is reused on every call. + So you cannot use it twice in one expression. + + RETURN + A pointer to a static NUL-terminated string. 
+ */ + +static char *hexdigits(ulonglong value) +{ + static char digits[20]; + char *ptr= digits; + uint idx= 2 * sizeof(value); /* Two hex digits per byte. */ + + DBUG_ASSERT(idx < sizeof(digits)); + while (idx) + { + if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9') + *(ptr - 1)+= 'a' - '9' - 1; + } + *ptr= '\0'; + return digits; +} + + + /* Write header to new packed data file */ + +static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees, + my_off_t tot_elements,my_off_t filelength) +{ + uchar *buff= (uchar*) file_buffer.pos; + + bzero(buff,HEAD_LENGTH); + memcpy_fixed(buff,maria_pack_file_magic,4); + int4store(buff+4,head_length); + int4store(buff+8, mrg->min_pack_length); + int4store(buff+12,mrg->max_pack_length); + int4store(buff+16,tot_elements); + int4store(buff+20,intervall_length); + int2store(buff+24,trees); + buff[26]=(char) mrg->ref_length; + /* Save record pointer length */ + buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2); + if (test_only) + return 0; + VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0))); + return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH, + MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0; +} + + /* Write fieldinfo to new packed file */ + +static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees) +{ + reg1 uint i; + uint huff_tree_bits; + huff_tree_bits=max_bit(trees ? 
trees-1 : 0); + + DBUG_PRINT("info", (" ")); + DBUG_PRINT("info", ("column types:")); + DBUG_PRINT("info", ("FIELD_NORMAL 0")); + DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE 1")); + DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE 2")); + DBUG_PRINT("info", ("FIELD_SKIP_ZERO 3")); + DBUG_PRINT("info", ("FIELD_BLOB 4")); + DBUG_PRINT("info", ("FIELD_CONSTANT 5")); + DBUG_PRINT("info", ("FIELD_INTERVALL 6")); + DBUG_PRINT("info", ("FIELD_ZERO 7")); + DBUG_PRINT("info", ("FIELD_VARCHAR 8")); + DBUG_PRINT("info", ("FIELD_CHECK 9")); + DBUG_PRINT("info", (" ")); + DBUG_PRINT("info", ("pack type as a set of flags:")); + DBUG_PRINT("info", ("PACK_TYPE_SELECTED 1")); + DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS 2")); + DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL 4")); + DBUG_PRINT("info", (" ")); + if (verbose >= 2) + { + VOID(printf("\n")); + VOID(printf("column types:\n")); + VOID(printf("FIELD_NORMAL 0\n")); + VOID(printf("FIELD_SKIP_ENDSPACE 1\n")); + VOID(printf("FIELD_SKIP_PRESPACE 2\n")); + VOID(printf("FIELD_SKIP_ZERO 3\n")); + VOID(printf("FIELD_BLOB 4\n")); + VOID(printf("FIELD_CONSTANT 5\n")); + VOID(printf("FIELD_INTERVALL 6\n")); + VOID(printf("FIELD_ZERO 7\n")); + VOID(printf("FIELD_VARCHAR 8\n")); + VOID(printf("FIELD_CHECK 9\n")); + VOID(printf("\n")); + VOID(printf("pack type as a set of flags:\n")); + VOID(printf("PACK_TYPE_SELECTED 1\n")); + VOID(printf("PACK_TYPE_SPACE_FIELDS 2\n")); + VOID(printf("PACK_TYPE_ZERO_FILL 4\n")); + VOID(printf("\n")); + } + for (i=0 ; i++ < fields ; counts++) + { + write_bits((ulonglong) (int) counts->field_type, 5); + write_bits(counts->pack_type,6); + if (counts->pack_type & PACK_TYPE_ZERO_FILL) + write_bits(counts->max_zero_fill,5); + else + write_bits(counts->length_bits,5); + write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits); + DBUG_PRINT("info", ("column: %3u type: %2u pack: %2u zero: %4u " + "lbits: %2u tree: %2u length: %4u", + i , counts->field_type, counts->pack_type, + counts->max_zero_fill, 
counts->length_bits, + counts->tree->tree_number, counts->field_length)); + if (verbose >= 2) + VOID(printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u " + "tree: %2u length: %4u\n", i , counts->field_type, + counts->pack_type, counts->max_zero_fill, counts->length_bits, + counts->tree->tree_number, counts->field_length)); + } + flush_bits(); + return; +} + + /* Write all huff_trees to new datafile. Return tot count of + elements in all trees + Returns 0 on error */ + +static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees) +{ + uint i,int_length; + uint tree_no; + uint codes; + uint errors= 0; + uint *packed_tree,*offset,length; + my_off_t elements; + + /* Find the highest number of elements in the trees. */ + for (i=length=0 ; i < trees ; i++) + if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length) + length=huff_tree[i].elements; + /* + Allocate a buffer for packing a decode tree. Two numbers per element + (left child and right child). + */ + if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2))) + { + my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2); + return 0; + } + + DBUG_PRINT("info", (" ")); + if (verbose >= 2) + VOID(printf("\n")); + tree_no= 0; + intervall_length=0; + for (elements=0; trees-- ; huff_tree++) + { + /* Skip columns that have been joined with other columns. */ + if (huff_tree->tree_number == 0) + continue; /* Deleted tree */ + tree_no++; + DBUG_PRINT("info", (" ")); + if (verbose >= 3) + VOID(printf("\n")); + /* Count the total number of elements (byte codes or column values). */ + elements+=huff_tree->elements; + huff_tree->max_offset=2; + /* Build a tree of offsets and codes for decoding in 'packed_tree'. */ + if (huff_tree->elements <= 1) + offset=packed_tree; + else + offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree); + + /* This should be the same as 'length' above. 
*/ + huff_tree->offset_bits=max_bit(huff_tree->max_offset); + + /* + Since we check this during collecting the distinct column values, + this should never happen. + */ + if (huff_tree->max_offset >= IS_OFFSET) + { /* This should be impossible */ + VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n", + huff_tree->max_offset)); + my_afree((uchar*) packed_tree); + return 0; + } + + DBUG_PRINT("info", ("pos: %lu elements: %u tree-elements: %lu " + "char_bits: %u\n", + (ulong) (file_buffer.pos - file_buffer.buffer), + huff_tree->elements, (ulong) (offset - packed_tree), + huff_tree->char_bits)); + if (!huff_tree->counts->tree_buff) + { + /* We do a uchar compression on this column. Mark with bit 0. */ + write_bits(0,1); + write_bits(huff_tree->min_chr,8); + write_bits(huff_tree->elements,9); + write_bits(huff_tree->char_bits,5); + write_bits(huff_tree->offset_bits,5); + int_length=0; + } + else + { + int_length=(uint) (huff_tree->counts->tree_pos - + huff_tree->counts->tree_buff); + /* We have distinct column values for this column. Mark with bit 1. */ + write_bits(1,1); + write_bits(huff_tree->elements,15); + write_bits(int_length,16); + write_bits(huff_tree->char_bits,5); + write_bits(huff_tree->offset_bits,5); + intervall_length+=int_length; + } + DBUG_PRINT("info", ("tree: %2u elements: %4u char_bits: %2u " + "offset_bits: %2u %s: %5u codelen: %2u", + tree_no, huff_tree->elements, huff_tree->char_bits, + huff_tree->offset_bits, huff_tree->counts->tree_buff ? + "bufflen" : "min_chr", huff_tree->counts->tree_buff ? + int_length : huff_tree->min_chr, huff_tree->height)); + if (verbose >= 2) + VOID(printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u " + "%s: %5u codelen: %2u\n", tree_no, huff_tree->elements, + huff_tree->char_bits, huff_tree->offset_bits, + huff_tree->counts->tree_buff ? "bufflen" : "min_chr", + huff_tree->counts->tree_buff ? 
int_length : + huff_tree->min_chr, huff_tree->height)); + + /* Check that the code tree length matches the element count. */ + length=(uint) (offset-packed_tree); + if (length != huff_tree->elements*2-2) + { + VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n", + length, huff_tree->elements * 2 - 2)); + errors++; + break; + } + + for (i=0 ; i < length ; i++) + { + if (packed_tree[i] & IS_OFFSET) + write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits), + huff_tree->offset_bits+1); + else + write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1); + DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x", + i, (packed_tree[i] & IS_OFFSET) ? + " -> " : "", (packed_tree[i] & IS_OFFSET) ? + packed_tree[i] - IS_OFFSET + i : packed_tree[i])); + if (verbose >= 3) + VOID(printf("tree[0x%04x]: %s0x%04x\n", + i, (packed_tree[i] & IS_OFFSET) ? " -> " : "", + (packed_tree[i] & IS_OFFSET) ? + packed_tree[i] - IS_OFFSET + i : packed_tree[i])); + } + flush_bits(); + + /* + Display coding tables and check their correctness. + */ + codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256; + for (i= 0; i < codes; i++) + { + ulonglong code; + uint bits; + uint len; + uint idx; + + if (! (len= huff_tree->code_len[i])) + continue; + DBUG_PRINT("info", ("code[0x%04x]: 0x%s bits: %2u bin: %s", i, + hexdigits(huff_tree->code[i]), huff_tree->code_len[i], + bindigits(huff_tree->code[i], + huff_tree->code_len[i]))); + if (verbose >= 3) + VOID(printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i, + hexdigits(huff_tree->code[i]), huff_tree->code_len[i], + bindigits(huff_tree->code[i], huff_tree->code_len[i]))); + + /* Check that the encode table decodes correctly. */ + code= 0; + bits= 0; + idx= 0; + DBUG_EXECUTE_IF("forcechkerr1", len--;); + DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code);); + DBUG_EXECUTE_IF("forcechkerr3", idx= length;); + for (;;) + { + if (! 
len) + { + VOID(fflush(stdout)); + VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n", + hexdigits(huff_tree->code[i]), huff_tree->code_len[i])); + errors++; + break; + } + code<<= 1; + code|= (huff_tree->code[i] >> (--len)) & 1; + bits++; + if (bits > 8 * sizeof(code)) + { + VOID(fflush(stdout)); + VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n", + bits, (uint) (8 * sizeof(code)))); + errors++; + break; + } + idx+= (uint) code & 1; + if (idx >= length) + { + VOID(fflush(stdout)); + VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n", + idx, length)); + errors++; + break; + } + if (packed_tree[idx] & IS_OFFSET) + idx+= packed_tree[idx] & ~IS_OFFSET; + else + break; /* Hit a leaf. This contains the result value. */ + } + if (errors) + break; + + DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;); + if (packed_tree[idx] != i) + { + VOID(fflush(stdout)); + VOID(fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n", + packed_tree[idx], i)); + errors++; + break; + } + } /*end for (codes)*/ + if (errors) + break; + + /* Write column values in case of distinct column value compression. */ + if (huff_tree->counts->tree_buff) + { + for (i=0 ; i < int_length ; i++) + { + write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8); + DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x", + i, (uchar) huff_tree->counts->tree_buff[i])); + if (verbose >= 3) + VOID(printf("column_values[0x%04x]: 0x%02x\n", + i, (uchar) huff_tree->counts->tree_buff[i])); + } + } + flush_bits(); + } + DBUG_PRINT("info", (" ")); + if (verbose >= 2) + VOID(printf("\n")); + my_afree((uchar*) packed_tree); + if (errors) + { + VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n")); + return 0; + } + return elements; +} + + +static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element, + uint *offset) +{ + uint *prev_offset; + + prev_offset= offset; + /* + 'a.leaf.null' takes the same place as 'a.nod.left'. 
If this is null, + then there is no left child and, hence no right child either. This + is a property of a binary tree. An element is either a node with two + childs, or a leaf without childs. + + The current element is always a node with two childs. Go left first. + */ + if (!element->a.nod.left->a.leaf.null) + { + /* Store the uchar code or the index of the column value. */ + prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr; + offset+=2; + } + else + { + /* + Recursively traverse the tree to the left. Mark it as an offset to + another tree node (in contrast to a uchar code or column value index). + */ + prev_offset[0]= IS_OFFSET+2; + offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2); + } + + /* Now, check the right child. */ + if (!element->a.nod.right->a.leaf.null) + { + /* Store the uchar code or the index of the column value. */ + prev_offset[1]=element->a.nod.right->a.leaf.element_nr; + return offset; + } + else + { + /* + Recursively traverse the tree to the right. Mark it as an offset to + another tree node (in contrast to a uchar code or column value index). 
+ */ + uint temp=(uint) (offset-prev_offset-1); + prev_offset[1]= IS_OFFSET+ temp; + if (huff_tree->max_offset < temp) + huff_tree->max_offset = temp; + return make_offset_code_tree(huff_tree,element->a.nod.right,offset); + } +} + + /* Get number of bits neaded to represent value */ + +static uint max_bit(register uint value) +{ + reg2 uint power=1; + + while ((value>>=1)) + power++; + return (power); +} + + +static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts) +{ + int error; + uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length; + uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes; + my_off_t record_count; + char llbuf[32]; + ulong length,pack_length; + uchar *record,*pos,*end_pos,*record_pos,*start_pos; + HUFF_COUNTS *count,*end_count; + HUFF_TREE *tree; + MARIA_HA *isam_file=mrg->file[0]; + uint pack_version= (uint) isam_file->s->pack.version; + DBUG_ENTER("compress_isam_file"); + + /* Allocate a buffer for the records (excluding blobs). */ + if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength))) + return -1; + + end_count=huff_counts+isam_file->s->base.fields; + min_record_length= (uint) ~0; + max_record_length=0; + null_bytes= isam_file->s->base.null_bytes; + + /* + Calculate the maximum number of bits required to pack the records. + Remember to understand 'max_zero_fill' as 'min_zero_fill'. + The tree height determines the maximum number of bits per value. + Some fields skip leading or trailing spaces or zeroes. The skipped + number of bytes is encoded by 'length_bits' bits. + Empty blobs and varchar are encoded with a single 1 bit. Other blobs + and varchar get a leading 0 bit. 
+ */ + max_calc_length= null_bytes; + for (i= 0 ; i < isam_file->s->base.fields ; i++) + { + if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL)) + huff_counts[i].max_zero_fill=0; + if (huff_counts[i].field_type == FIELD_CONSTANT || + huff_counts[i].field_type == FIELD_ZERO || + huff_counts[i].field_type == FIELD_CHECK) + continue; + if (huff_counts[i].field_type == FIELD_INTERVALL) + max_calc_length+=huff_counts[i].tree->height; + else if (huff_counts[i].field_type == FIELD_BLOB || + huff_counts[i].field_type == FIELD_VARCHAR) + max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1; + else + max_calc_length+= + (huff_counts[i].field_length - huff_counts[i].max_zero_fill)* + huff_counts[i].tree->height+huff_counts[i].length_bits; + } + max_calc_length= (max_calc_length + 7) / 8; + pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length); + record_count=0; + /* 'max_blob_length' is the max length of all blobs of a record. */ + pack_blob_length= isam_file->s->base.blobs ? + _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0; + max_pack_length=pack_ref_length+pack_blob_length; + + DBUG_PRINT("fields", ("===")); + mrg_reset(mrg); + while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE) + { + ulong tot_blob_length=0; + if (! 
error) + { + if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length)) + break; + record_pos= (uchar*) file_buffer.pos; + file_buffer.pos+= max_pack_length; + if (null_bytes) + { + /* Copy null bits 'as is' */ + memcpy(file_buffer.pos, record, null_bytes); + file_buffer.pos+= null_bytes; + } + for (start_pos=record+null_bytes, count= huff_counts; + count < end_count ; + count++) + { + end_pos=start_pos+(field_length=count->field_length); + tree=count->tree; + + DBUG_PRINT("fields", ("column: %3lu type: %2u pack: %2u zero: %4u " + "lbits: %2u tree: %2u length: %4u", + (ulong) (count - huff_counts + 1), + count->field_type, + count->pack_type, count->max_zero_fill, + count->length_bits, count->tree->tree_number, + count->field_length)); + + /* Check if the column contains spaces only. */ + if (count->pack_type & PACK_TYPE_SPACE_FIELDS) + { + for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ; + if (pos == end_pos) + { + DBUG_PRINT("fields", + ("PACK_TYPE_SPACE_FIELDS spaces only, bits: 1")); + DBUG_PRINT("fields", ("---")); + write_bits(1,1); + start_pos=end_pos; + continue; + } + DBUG_PRINT("fields", + ("PACK_TYPE_SPACE_FIELDS not only spaces, bits: 1")); + write_bits(0,1); + } + end_pos-=count->max_zero_fill; + field_length-=count->max_zero_fill; + + switch (count->field_type) { + case FIELD_SKIP_ZERO: + if (!memcmp((uchar*) start_pos,zero_string,field_length)) + { + DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits: 1")); + write_bits(1,1); + start_pos=end_pos; + break; + } + DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits: 1")); + write_bits(0,1); + /* Fall through */ + case FIELD_NORMAL: + DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes", + (ulong) (end_pos - start_pos))); + for ( ; start_pos < end_pos ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + 
bindigits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + break; + case FIELD_SKIP_ENDSPACE: + for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ; + length= (ulong) (end_pos - pos); + if (count->pack_type & PACK_TYPE_SELECTED) + { + if (length > count->min_space) + { + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE more than min_space, bits: 1")); + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(1,1); + write_bits(length,count->length_bits); + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE not more than min_space, " + "bits: 1")); + write_bits(0,1); + pos=end_pos; + } + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(length,count->length_bits); + } + /* Encode all significant bytes. 
*/ + DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes", + (ulong) (pos - start_pos))); + for ( ; start_pos < pos ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + bindigits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + start_pos=end_pos; + break; + case FIELD_SKIP_PRESPACE: + for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ; + length= (ulong) (pos - start_pos); + if (count->pack_type & PACK_TYPE_SELECTED) + { + if (length > count->min_space) + { + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE more than min_space, bits: 1")); + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(1,1); + write_bits(length,count->length_bits); + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE not more than min_space, " + "bits: 1")); + pos=start_pos; + write_bits(0,1); + } + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(length,count->length_bits); + } + /* Encode all significant bytes. 
*/ + DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes", + (ulong) (end_pos - start_pos))); + for (start_pos=pos ; start_pos < end_pos ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + bindigits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + break; + case FIELD_CONSTANT: + case FIELD_ZERO: + case FIELD_CHECK: + DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK")); + start_pos=end_pos; + break; + case FIELD_INTERVALL: + global_count=count; + pos=(uchar*) tree_search(&count->int_tree, start_pos, + count->int_tree.custom_arg); + intervall=(uint) (pos - count->tree_buff)/field_length; + DBUG_PRINT("fields", ("FIELD_INTERVALL")); + DBUG_PRINT("fields", ("index: %4u code: 0x%s bits: %2u", + intervall, hexdigits(tree->code[intervall]), + (uint) tree->code_len[intervall])); + write_bits(tree->code[intervall],(uint) tree->code_len[intervall]); + start_pos=end_pos; + break; + case FIELD_BLOB: + { + ulong blob_length= _ma_calc_blob_length(field_length- + portable_sizeof_char_ptr, + start_pos); + /* Empty blobs are encoded with a single 1 bit. */ + if (!blob_length) + { + DBUG_PRINT("fields", ("FIELD_BLOB empty, bits: 1")); + write_bits(1,1); + } + else + { + uchar *blob,*blob_end; + DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits: 1")); + write_bits(0,1); + /* Write the blob length. */ + DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u", + blob_length, count->length_bits)); + write_bits(blob_length,count->length_bits); + memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr, + sizeof(char*)); + blob_end=blob+blob_length; + /* Encode the blob bytes. 
*/ + for ( ; blob < blob_end ; blob++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *blob, hexdigits(tree->code[(uchar) *blob]), + (uint) tree->code_len[(uchar) *blob], + bindigits(tree->code[(uchar) *start_pos], + (uint)tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *blob], + (uint) tree->code_len[(uchar) *blob]); + } + tot_blob_length+=blob_length; + } + start_pos= end_pos; + break; + } + case FIELD_VARCHAR: + { + uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1); + ulong col_length= (var_pack_length == 1 ? + (uint) *(uchar*) start_pos : + uint2korr(start_pos)); + /* Empty varchar are encoded with a single 1 bit. */ + if (!col_length) + { + DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits: 1")); + write_bits(1,1); /* Empty varchar */ + } + else + { + uchar *end= start_pos + var_pack_length + col_length; + DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits: 1")); + write_bits(0,1); + /* Write the varchar length. */ + DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u", + col_length, count->length_bits)); + write_bits(col_length,count->length_bits); + /* Encode the varchar bytes. 
*/ + for (start_pos+= var_pack_length ; start_pos < end ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + bindigits(tree->code[(uchar) *start_pos], + (uint)tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + } + start_pos= end_pos; + break; + } + case FIELD_LAST: + case FIELD_enum_val_count: + abort(); /* Impossible */ + } + start_pos+=count->max_zero_fill; + DBUG_PRINT("fields", ("---")); + } + flush_bits(); + length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length; + pack_length= _ma_save_pack_length(pack_version, record_pos, length); + if (pack_blob_length) + pack_length+= _ma_save_pack_length(pack_version, + record_pos + pack_length, + tot_blob_length); + DBUG_PRINT("fields", ("record: %lu length: %lu blob-length: %lu " + "length-bytes: %lu", (ulong) record_count, length, + tot_blob_length, pack_length)); + DBUG_PRINT("fields", ("===")); + + /* Correct file buffer if the header was smaller */ + if (pack_length != max_pack_length) + { + bmove(record_pos+pack_length,record_pos+max_pack_length,length); + file_buffer.pos-= (max_pack_length-pack_length); + } + if (length < (ulong) min_record_length) + min_record_length=(uint) length; + if (length > (ulong) max_record_length) + max_record_length=(uint) length; + record_count++; + if (write_loop && record_count % WRITE_COUNT == 0) + { + VOID(printf("%lu\r", (ulong) record_count)); + VOID(fflush(stdout)); + } + } + else if (error != HA_ERR_RECORD_DELETED) + break; + } + if (error == HA_ERR_END_OF_FILE) + error=0; + else + { + VOID(fprintf(stderr, "%s: Got error %d reading records\n", + my_progname, error)); + } + if (verbose >= 2) + VOID(printf("wrote %s records.\n", llstr((longlong) record_count, llbuf))); + + my_afree((uchar*) record); + 
mrg->ref_length=max_pack_length; + mrg->min_pack_length=max_record_length ? min_record_length : 0; + mrg->max_pack_length=max_record_length; + DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0)); +} + + +static char *make_new_name(char *new_name, char *old_name) +{ + return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4); +} + +static char *make_old_name(char *new_name, char *old_name) +{ + return fn_format(new_name,old_name,"",OLD_EXT,2+4); +} + + /* rutines for bit writing buffer */ + +static void init_file_buffer(File file, pbool read_buffer) +{ + file_buffer.file=file; + file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE), + MYF(MY_WME)); + file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8; + file_buffer.pos_in_file=0; + error_on_write=0; + if (read_buffer) + { + + file_buffer.pos=file_buffer.end; + file_buffer.bits=0; + } + else + { + file_buffer.pos=file_buffer.buffer; + file_buffer.bits=BITS_SAVED; + } + file_buffer.bitbucket= 0; +} + + +static int flush_buffer(ulong neaded_length) +{ + ulong length; + + /* + file_buffer.end is 8 bytes lower than the real end of the buffer. + This is done so that the end-of-buffer condition does not need to be + checked for every uchar (see write_bits()). Consequently, + file_buffer.pos can become greater than file_buffer.end. The + algorithms in the other functions ensure that there will never be + more than 8 bytes written to the buffer without an end-of-buffer + check. So the buffer cannot be overrun. But we need to check for the + near-to-buffer-end condition to avoid a negative result, which is + casted to unsigned and thus becomes giant. 
+ */ + if ((file_buffer.pos < file_buffer.end) && + ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length)) + return 0; + length=(ulong) (file_buffer.pos-file_buffer.buffer); + file_buffer.pos=file_buffer.buffer; + file_buffer.pos_in_file+=length; + if (test_only) + return 0; + if (error_on_write|| my_write(file_buffer.file, + (const uchar*) file_buffer.buffer, + length, + MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL))) + { + error_on_write=1; + return 1; + } + + if (neaded_length != ~(ulong) 0 && + (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length) + { + char *tmp; + neaded_length+=256; /* some margin */ + tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME)); + if (!tmp) + return 1; + file_buffer.pos= ((uchar*) tmp + + (ulong) (file_buffer.pos - file_buffer.buffer)); + file_buffer.buffer= (uchar*) tmp; + file_buffer.end= (uchar*) (tmp+neaded_length-8); + } + return 0; +} + + +static void end_file_buffer(void) +{ + my_free((uchar*) file_buffer.buffer,MYF(0)); +} + + /* output `bits` low bits of `value' */ + +static void write_bits(register ulonglong value, register uint bits) +{ + DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) || + (bits == 8 * sizeof(value))); + + if ((file_buffer.bits-= (int) bits) >= 0) + { + file_buffer.bitbucket|= value << file_buffer.bits; + } + else + { + reg3 ulonglong bit_buffer; + bits= (uint) -file_buffer.bits; + bit_buffer= (file_buffer.bitbucket | + ((bits != 8 * sizeof(value)) ? 
(value >> bits) : 0)); +#if BITS_SAVED == 64 + *file_buffer.pos++= (uchar) (bit_buffer >> 56); + *file_buffer.pos++= (uchar) (bit_buffer >> 48); + *file_buffer.pos++= (uchar) (bit_buffer >> 40); + *file_buffer.pos++= (uchar) (bit_buffer >> 32); +#endif + *file_buffer.pos++= (uchar) (bit_buffer >> 24); + *file_buffer.pos++= (uchar) (bit_buffer >> 16); + *file_buffer.pos++= (uchar) (bit_buffer >> 8); + *file_buffer.pos++= (uchar) (bit_buffer); + + if (bits != 8 * sizeof(value)) + value&= (((ulonglong) 1) << bits) - 1; + if (file_buffer.pos >= file_buffer.end) + VOID(flush_buffer(~ (ulong) 0)); + file_buffer.bits=(int) (BITS_SAVED - bits); + file_buffer.bitbucket= value << (BITS_SAVED - bits); + } + return; +} + + /* Flush bits in bit_buffer to buffer */ + +static void flush_bits(void) +{ + int bits; + ulonglong bit_buffer; + + bits= file_buffer.bits & ~7; + bit_buffer= file_buffer.bitbucket >> bits; + bits= BITS_SAVED - bits; + while (bits > 0) + { + bits-= 8; + *file_buffer.pos++= (uchar) (bit_buffer >> bits); + } + if (file_buffer.pos >= file_buffer.end) + VOID(flush_buffer(~ (ulong) 0)); + file_buffer.bits= BITS_SAVED; + file_buffer.bitbucket= 0; +} + + +/**************************************************************************** +** functions to handle the joined files +****************************************************************************/ + +static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg, + my_off_t new_length, + ha_checksum crc) +{ + MARIA_SHARE *share=isam_file->s; + uint options=mi_uint2korr(share->state.header.options); + uint key; + DBUG_ENTER("save_state"); + + options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA; + mi_int2store(share->state.header.options,options); + /* Save the original file type of we have to undo the packing later */ + share->state.header.org_data_file_type= share->state.header.data_file_type; + share->state.header.data_file_type= COMPRESSED_RECORD; + + share->state.state.data_file_length=new_length; + 
share->state.state.del=0; + share->state.state.empty=0; + share->state.dellink= HA_OFFSET_ERROR; + share->state.split=(ha_rows) mrg->records; + share->state.version=(ulong) time((time_t*) 0); + if (! maria_is_all_keys_active(share->state.key_map, share->base.keys)) + { + /* + Some indexes are disabled, cannot use current key_file_length value + as an estimate of upper bound of index file size. Use packed data file + size instead. + */ + share->state.state.key_file_length= new_length; + } + /* + If there are no disabled indexes, keep key_file_length value from + original file so "maria_chk -rq" can use this value (this is necessary + because index size cannot be easily calculated for fulltext keys) + */ + maria_clear_all_keys_active(share->state.key_map); + for (key=0 ; key < share->base.keys ; key++) + share->state.key_root[key]= HA_OFFSET_ERROR; + share->state.key_del= HA_OFFSET_ERROR; + isam_file->state->checksum=crc; /* Save crc here */ + share->changed=1; /* Force write of header */ + share->state.open_count=0; + share->global_changed=0; + VOID(my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0))); + if (share->base.keys) + isamchk_neaded=1; + DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file, + &share->state, (1 + 2))); +} + + +static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length, + ha_checksum crc) +{ + MARIA_STATE_INFO state; + MARIA_HA *isam_file=mrg->file[0]; + uint options; + DBUG_ENTER("save_state_mrg"); + + state= isam_file->s->state; + options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD | + HA_OPTION_READ_ONLY_DATA); + mi_int2store(state.header.options,options); + state.state.data_file_length=new_length; + state.state.del=0; + state.state.empty=0; + state.state.records=state.split=(ha_rows) mrg->records; + /* See comment above in save_state about key_file_length handling. 
*/ + if (mrg->src_file_has_indexes_disabled) + { + isam_file->s->state.state.key_file_length= + max(isam_file->s->state.state.key_file_length, new_length); + } + state.dellink= HA_OFFSET_ERROR; + state.version=(ulong) time((time_t*) 0); + maria_clear_all_keys_active(state.key_map); + state.state.checksum=crc; + if (isam_file->s->base.keys) + isamchk_neaded=1; + state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */ + DBUG_RETURN (_ma_state_info_write_sub(file,&state,1+2)); +} + + +/* reset for mrg_rrnd */ + +static void mrg_reset(PACK_MRG_INFO *mrg) +{ + if (mrg->current) + { + maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0); + mrg->current=0; + } +} + +static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf) +{ + int error; + MARIA_HA *isam_info; + my_off_t filepos; + + if (!info->current) + { + isam_info= *(info->current=info->file); + info->end=info->current+info->count; + maria_reset(isam_info); + maria_extra(isam_info, HA_EXTRA_CACHE, 0); + if ((error= maria_scan_init(isam_info))) + return(error); + } + else + isam_info= *info->current; + + for (;;) + { + if (!(error= maria_scan(isam_info, buf)) || + error != HA_ERR_END_OF_FILE) + return (error); + maria_scan_end(isam_info); + maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0); + if (info->current+1 == info->end) + return(HA_ERR_END_OF_FILE); + info->current++; + isam_info= *info->current; + filepos=isam_info->s->pack.header_length; + maria_reset(isam_info); + maria_extra(isam_info,HA_EXTRA_CACHE, 0); + if ((error= maria_scan_init(isam_info))) + return(error); + } +} + + +static int mrg_close(PACK_MRG_INFO *mrg) +{ + uint i; + int error=0; + DBUG_ENTER("mrg_close"); + + for (i=0 ; i < mrg->count ; i++) + error|=maria_close(mrg->file[i]); + if (mrg->free_file) + my_free((uchar*) mrg->file,MYF(0)); + DBUG_RETURN(error); +} + + +#if !defined(DBUG_OFF) +/* + Fake the counts to get big Huffman codes. + + SYNOPSIS + fakebigcodes() + huff_counts A pointer to the counts array. 
+ end_count A pointer past the counts array. + + DESCRIPTION + + Huffman coding works by removing the two least frequent values from + the list of values and add a new value with the sum of their + incidences in a loop until only one value is left. Every time a + value is reused for a new value, it gets one more bit for its + encoding. Hence, the least frequent values get the longest codes. + + To get a maximum code length for a value, two of the values must + have an incidence of 1. As their sum is 2, the next infrequent value + must have at least an incidence of 2, then 4, 8, 16 and so on. This + means that one needs 2**n bytes (values) for a code length of n + bits. However, using more distinct values forces the use of longer + codes, or reaching the code length with less total bytes (values). + + To get 64(32)-bit codes, I sort the counts by decreasing incidence. + I assign counts of 1 to the two most frequent values, a count of 2 + for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All + the remaining values get 1. That way every possible uchar has an + assigned code, though not all codes are used if not all uchar values + are present in the column. + + This strategy would work with distinct column values too, but + requires that at least 64(32) values are present. To make things + easier here, I cancel all distinct column values and force byte + compression for all columns. + + RETURN + void +*/ + +static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count) +{ + HUFF_COUNTS *count; + my_off_t *cur_count_p; + my_off_t *end_count_p; + my_off_t **cur_sort_p; + my_off_t **end_sort_p; + my_off_t *sort_counts[256]; + my_off_t total; + DBUG_ENTER("fakebigcodes"); + + for (count= huff_counts; count < end_count; count++) + { + /* + Remove distinct column values. 
+ */ + if (huff_counts->tree_buff) + { + my_free((uchar*) huff_counts->tree_buff, MYF(0)); + delete_tree(&huff_counts->int_tree); + huff_counts->tree_buff= NULL; + DBUG_PRINT("fakebigcodes", ("freed distinct column values")); + } + + /* + Sort counts by decreasing incidence. + */ + cur_count_p= count->counts; + end_count_p= cur_count_p + 256; + cur_sort_p= sort_counts; + while (cur_count_p < end_count_p) + *(cur_sort_p++)= cur_count_p++; + (void) qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp); + + /* + Assign faked counts. + */ + cur_sort_p= sort_counts; +#if SIZEOF_LONG_LONG > 4 + end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1; +#else + end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2; +#endif + /* Most frequent value gets a faked count of 1. */ + **(cur_sort_p++)= 1; + total= 1; + while (cur_sort_p < end_sort_p) + { + **(cur_sort_p++)= total; + total<<= 1; + } + /* Set the last value. */ + **(cur_sort_p++)= --total; + /* + Set the remaining counts. + */ + end_sort_p= sort_counts + 256; + while (cur_sort_p < end_sort_p) + **(cur_sort_p++)= 1; + } + DBUG_VOID_RETURN; +} + + +/* + Compare two counts for reverse sorting. + + SYNOPSIS + fakecmp() + count1 One count. + count2 Another count. + + RETURN + 1 count1 < count2 + 0 count1 == count2 + -1 count1 > count2 +*/ + +static int fakecmp(my_off_t **count1, my_off_t **count2) +{ + return ((**count1 < **count2) ? 1 : + (**count1 > **count2) ? -1 : 0); +} +#endif diff --git a/storage/maria/maria_read_log.c b/storage/maria/maria_read_log.c new file mode 100644 index 00000000000..a7a6370b1c4 --- /dev/null +++ b/storage/maria/maria_read_log.c @@ -0,0 +1,200 @@ +/* Copyright (C) 2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "maria_def.h" +#include "ma_recovery.h" +#include <my_getopt.h> + +#define PCACHE_SIZE (1024*1024*10) +#define LOG_FLAGS 0 +#define LOG_FILE_SIZE (1024L*1024L) + +static const char *load_default_groups[]= { "maria_read_log",0 }; +static void get_options(int *argc,char * * *argv); +#ifndef DBUG_OFF +#if defined(__WIN__) +const char *default_dbug_option= "d:t:i:O,\\maria_read_log.trace"; +#else +const char *default_dbug_option= "d:t:i:o,/tmp/maria_read_log.trace"; +#endif +#endif /* DBUG_OFF */ +static my_bool opt_only_display, opt_display_and_apply; + +int main(int argc, char **argv) +{ + LSN lsn; + char **default_argv; + MY_INIT(argv[0]); + + load_defaults("my", load_default_groups, &argc, &argv); + default_argv= argv; + get_options(&argc, &argv); + + maria_data_root= "."; + maria_in_recovery= TRUE; + + if (maria_init()) + { + fprintf(stderr, "Can't init Maria engine (%d)\n", errno); + goto err; + } + /* we don't want to create a control file, it MUST exist */ + if (ma_control_file_create_or_open()) + { + fprintf(stderr, "Can't open control file (%d)\n", errno); + goto err; + } + if (last_logno == FILENO_IMPOSSIBLE) + { + fprintf(stderr, "Can't find any log\n"); + goto err; + } + /* same page cache for log and data; assumes same page size... 
*/ + DBUG_ASSERT(maria_block_size == TRANSLOG_PAGE_SIZE); + if (init_pagecache(maria_pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE) == 0) + { + fprintf(stderr, "Got error in init_pagecache() (errno: %d)\n", errno); + goto err; + } + /* + If log handler does not find the "last_logno" log it will return error, + which is good. + But if it finds a log and this log was crashed, it will create a new log, + which is useless. TODO: start log handler in read-only mode. + */ + if (translog_init(".", LOG_FILE_SIZE, 50112, 0, maria_pagecache, + TRANSLOG_DEFAULT_FLAGS)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + goto err; + } + + if (opt_only_display) + printf("You are using --only-display, NOTHING will be written to disk\n"); + + /* LSN could be also --start-from-lsn=# */ + lsn= translog_first_lsn_in_log(); + if (lsn == LSN_ERROR) + { + fprintf(stderr, "Opening transaction log failed\n"); + goto end; + } + if (lsn == LSN_IMPOSSIBLE) + { + fprintf(stdout, "The transaction log is empty\n"); + } + fprintf(stdout, "The transaction log starts from lsn (%lu,0x%lx)\n", + LSN_IN_PARTS(lsn)); + + fprintf(stdout, "TRACE of the last maria_read_log\n"); + if (maria_apply_log(lsn, opt_display_and_apply, stdout, + opt_display_and_apply, FALSE)) + goto err; + fprintf(stdout, "%s: SUCCESS\n", my_progname); + + goto end; +err: + /* don't touch anything more, in case we hit a bug */ + fprintf(stderr, "%s: FAILED\n", my_progname); + exit(1); +end: + maria_end(); + free_defaults(default_argv); + my_end(0); + exit(0); + return 0; /* No compiler warning */ +} + + +static struct my_option my_long_options[] = +{ + {"help", '?', "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"only-display", 'o', "display brief info about records's header", + (uchar **) &opt_only_display, (uchar **) &opt_only_display, 0, GET_BOOL, + NO_ARG,0, 0, 0, 0, 0, 0}, + {"display-and-apply", 'a', + "like --only-display but displays more info and modifies tables", + 
(uchar **) &opt_display_and_apply, (uchar **) &opt_display_and_apply, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, +#endif + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +#include <help_start.h> + +static void print_version(void) +{ + VOID(printf("%s Ver 1.0 for %s on %s\n", + my_progname, SYSTEM_TYPE, MACHINE_TYPE)); + NETWARE_SET_SCREEN_MODE(1); +} + + +static void usage(void) +{ + print_version(); + puts("Copyright (C) 2007 MySQL AB"); + puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,"); + puts("and you are welcome to modify and redistribute it under the GPL license\n"); + + puts("Display and apply log records from a MARIA transaction log"); + puts("found in the current directory (for now)"); + VOID(printf("\nUsage: %s OPTIONS\n", my_progname)); + puts("You need to use one of -o or -a"); + my_print_help(my_long_options); + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + +#include <help_end.h> + +static my_bool +get_one_option(int optid __attribute__((unused)), + const struct my_option *opt __attribute__((unused)), + char *argument __attribute__((unused))) +{ + switch (optid) { + case '?': + usage(); + exit(0); +#ifndef DBUG_OFF + case '#': + DBUG_SET_INITIAL(argument ? 
argument : default_dbug_option); + break; +#endif + } + return 0; +} + +static void get_options(int *argc,char ***argv) +{ + int ho_error; + + my_progname= argv[0][0]; + + if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option))) + exit(ho_error); + + if ((opt_only_display + opt_display_and_apply) != 1) + { + usage(); + exit(1); + } +} diff --git a/storage/maria/maria_rename.sh b/storage/maria/maria_rename.sh new file mode 100755 index 00000000000..fb20e47e635 --- /dev/null +++ b/storage/maria/maria_rename.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +replace myisam maria MYISAM MARIA MyISAM MARIA -- mysql-test/t/*maria*test mysql-test/r/*maria*result + +FILES=`echo sql/ha_maria.{cc,h} include/maria*h storage/maria/*.{c,h}` + +replace myisam maria MYISAM MARIA MyISAM MARIA myisam.h maria.h myisamdef.h maria_def.h mi_ maria_ ft_ maria_ft_ "Copyright (C) 2000" "Copyright (C) 2006" MI_ISAMINFO MARIA_INFO MI_CREATE_INFO MARIA_CREATE_INFO maria_isam_ maria_ MI_INFO MARIA_HA MI_ MARIA_ MARIACHK MARIA_CHK rt_index.h ma_rt_index.h rtree_ maria_rtree rt_key.h ma_rt_key.h rt_mbr.h ma_rt_mbr.h -- $FILES + +replace check_table_is_closed _ma_check_table_is_closed test_if_reopen _ma_test_if_reopen my_n_base_info_read maria_n_base_info_read update_auto_increment _ma_update_auto_increment save_pack_length _ma_save_packlength calc_pack_length _ma_calc_pack_length -- $FILES + +replace mi_ ma_ ft_ ma_ft_ rt_ ma_rt_ myisam maria myisamchk maria_chk myisampack maria_pack myisamlog maria_log -- storage/maria/Makefile.am + +# +# Restore wrong replaces +# + +replace maria_sint1korr mi_sint1korr maria_uint1korr mi_uint1korr maria_sint2korr mi_sint2korr maria_sint3korr mi_sint3korr maria_sint4korr mi_sint4korr maria_sint8korr mi_sint8korr maria_uint2korr mi_uint2korr maria_uint3korr mi_uint3korr maria_uint4korr mi_uint4korr maria_uint5korr mi_uint5korr maria_uint6korr mi_uint6korr maria_uint7korr mi_uint7korr maria_uint8korr mi_uint8korr maria_int1store mi_int1store maria_int2store 
mi_int2store maria_int3store mi_int3store maria_int4store mi_int4store maria_int5store mi_int5store maria_int6store mi_int6store maria_int7store mi_int7store maria_int8store mi_int8store maria_float4store mi_float4store maria_float4get mi_float4get maria_float8store mi_float8store maria_float8get mi_float8get maria_rowstore mi_rowstore maria_rowkorr mi_rowkorr maria_sizestore mi_sizestore maria_sizekorr mi_sizekorr _maria_maria_ _maria MARIA_MAX_POSSIBLE_KEY HA_MAX_POSSIBLE_KEY MARIA_MAX_KEY_BUFF HA_MAX_KEY_BUFF MARIA_MAX_KEY_SEG HA_MAX_KEY_SEG maria_ft_sintXkorr ft_sintXkorr maria_ft_intXstore ft_intXstore maria_ft_boolean_syntax ft_boolean_syntax maria_ft_min_word_len ft_min_word_len maria_ft_max_word_len ft_max_word_len -- $FILES diff --git a/storage/maria/plug.in b/storage/maria/plug.in new file mode 100644 index 00000000000..1ce64f6e2bb --- /dev/null +++ b/storage/maria/plug.in @@ -0,0 +1,8 @@ +MYSQL_STORAGE_ENGINE(maria,, [Maria Storage Engine], + [Traditional transactional MySQL tables], [max,max-no-ndb]) +MYSQL_PLUGIN_DIRECTORY(maria, [storage/maria]) +MYSQL_PLUGIN_ACTIONS(maria, [AC_CONFIG_FILES(storage/maria/unittest/Makefile)]) +MYSQL_PLUGIN_STATIC(maria, [libmaria.a]) +# Maria will probably go first into max builds, not all builds, +# so we don't declare it mandatory. +MYSQL_PLUGIN_DEPENDS_ON_MYSQL_INTERNALS(maria, [ha_maria.cc]) diff --git a/storage/maria/tablockman.c b/storage/maria/tablockman.c new file mode 100644 index 00000000000..eb8da1d6865 --- /dev/null +++ b/storage/maria/tablockman.c @@ -0,0 +1,676 @@ +/* QQ: TODO - allocate everything from dynarrays !!! (benchmark) */ +/* QQ: automatically place S instead of LS if possible */ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include <my_base.h> +#include <hash.h> +#include "tablockman.h" + +/* + Lock Manager for Table Locks + + The code below handles locks on resources - but it is optimized for a + case when a number of resources is not very large, and there are many of + locks per resource - that is a resource is likely to be a table or a + database, but hardly a row in a table. + + Locks belong to "lock owners". A Lock Owner is uniquely identified by a + 16-bit number - loid (lock owner identifier). A function loid_to_tlo must + be provided by the application that takes such a number as an argument + and returns a TABLE_LOCK_OWNER structure. + + Lock levels are completely defined by three tables. Lock compatibility + matrix specifies which locks can be held at the same time on a resource. + Lock combining matrix specifies what lock level has the same behaviour as + a pair of two locks of given levels. getlock_result matrix simplifies + intention locking and lock escalation for an application, basically it + defines which locks are intention locks and which locks are "loose" + locks. It is only used to provide better diagnostics for the + application, lock manager itself does not differentiate between normal, + intention, and loose locks. + + The assumptions are: few distinct resources, many locks are held at the + same time on one resource. Thus: a lock structure _per resource_ can be + rather large; a lock structure _per lock_ does not need to be very small + either; we need to optimize for _speed_. 
Operations we need are: place a + lock, check if a particular transaction already has a lock on this + resource, check if a conflicting lock exists, if yes - find who owns it. + + Solution: every resource has a structure with + 1. Hash of latest (see the lock upgrade section below) granted locks with + loid as a key. Thus, checking if a given transaction has a lock on + this resource is O(1) operation. + 2. Doubly-linked lists of all granted locks - one list for every lock + type. Thus, checking if a conflicting lock exists is a check whether + an appropriate list head pointer is not null, also O(1). + 3. Every lock has a loid of the owner, thus checking who owns a + conflicting lock is also O(1). + 4. Deque of waiting locks. It's a deque (double-ended queue) not a fifo, + because for lock upgrades requests are added to the queue head, not + tail. This is the single place where it gets O(N) in the number + of locks - when a transaction wakes up from waiting on a condition, + it may need to scan the queue backward to the beginning to find + a conflicting lock. It is guaranteed though that "all transactions + before it" received the same - or earlier - signal. In other words a + transaction needs to scan all transactions before it that received the + signal but didn't have a chance to resume the execution yet, so + practically OS scheduler won't let the scan be O(N). + + Waiting: if there is a conflicting lock or if wait queue is not empty, a + requested lock cannot be granted at once. It is added to the end of the + wait queue. If a queue was empty and there is a conflicting lock - the + "blocker" transaction is the owner of this lock. If a queue is not empty, + an owner of the previous lock in the queue is the "blocker".
But if the + previous lock is compatible with the request, then the "blocker" is the + transaction that the owner of the lock at the end of the queue is waiting + for (in other words, our lock is added to the end of the wait queue, and + our blocker is the same as of the lock right before us). + + Lock upgrades: when a thread that has a lock on a given resource, + requests a new lock on the same resource and the old lock is not enough + to satisfy new lock requirements (which is defined by + lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock + (defined by lock_combining_matrix as above) is placed. Depending on + other granted locks it is immediately granted or it has to wait. Here the + lock is added to the start of the waiting queue, not to the end. The old + lock is removed from the hash, but not from the doubly-linked lists. + (indeed, a transaction checks "do I have a lock on this resource ?" by + looking in a hash, and it should find a latest lock, so old locks must be + removed; but a transaction checks "are there conflicting locks ?" by + checking doubly-linked lists, it doesn't matter if it will find an old + lock - if it would be removed, a new lock would be also a conflict). + So, a hash contains only "latest" locks - there can be only one latest + lock per resource per transaction. But doubly-linked lists contain all + locks, even "obsolete" ones, because it doesn't hurt. Note that old + locks can not be freed early, in particular they stay in the + 'active_locks' list of a lock owner, because they may be "re-enabled" + on a savepoint rollback. + + To better support table-row relations where one needs to lock the table + with an intention lock before locking the row, extended diagnostics is + provided.
When an intention lock (presumably on a table) is granted, + lockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row, + perhaps the thread already has a normal lock on this table), + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual), + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check + whether it's possible to lock the row, but no need to lock it - perhaps + the thread has a loose lock on this table). This is defined by + getlock_result[] table. + + Instant duration locks are not supported. Though they're trivial to add, + they are normally only used on rows, not on tables. So, presumably, + they are not needed here. + + Mutexes: there're table mutexes (LOCKED_TABLE::mutex), lock owner mutexes + (TABLE_LOCK_OWNER::mutex), and a pool mutex (TABLOCKMAN::pool_mutex). + table mutex protects operations on the table lock structures, and lock + owner pointers waiting_for and waiting_for_loid. + lock owner mutex is only used to wait on lock owner condition + (TABLE_LOCK_OWNER::cond), there's no need to protect owner's lock + structures, and only lock owner itself may access them. + The pool mutex protects a pool of unused locks. Note the locking order: + first the table mutex, then the owner mutex or a pool mutex. + Table mutex lock cannot be attempted when owner or pool mutex are locked. + No mutex lock can be attempted if owner or pool mutex are locked. +*/ + +/* + Lock compatibility matrix. + + It's asymmetric. Read it as "Somebody has the lock <value in the row + label>, can I set the lock <value in the column label> ?" + + ') Though you can take LS lock while somebody has S lock, it makes no + sense - it's simpler to take S lock too. + + 1 - compatible + 0 - incompatible + -1 - "impossible", so that we can assert the impossibility. 
*/
/*
  Indexed as [lock already held][lock being requested]. Row and column 0
  stand for N (no lock) and hold only the "impossible" marker.
*/
static const int lock_compatibility_matrix[10][10]=
{ /* N    S   X  IS  IX  SIX LS  LX  SLX LSIX          */
  {  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, /* N    */
  {  -1,  1,  0,  1,  0,  0,  1,  0,  0,  0 }, /* S    */
  {  -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 }, /* X    */
  {  -1,  1,  0,  1,  1,  1,  1,  1,  1,  1 }, /* IS   */
  {  -1,  0,  0,  1,  1,  0,  1,  1,  0,  1 }, /* IX   */
  {  -1,  0,  0,  1,  0,  0,  1,  0,  0,  0 }, /* SIX  */
  {  -1,  1,  0,  1,  0,  0,  1,  0,  0,  0 }, /* LS   */
  {  -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 }, /* LX   */
  {  -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 }, /* SLX  */
  {  -1,  0,  0,  1,  0,  0,  1,  0,  0,  0 }  /* LSIX */
};

/*
  Lock combining matrix.

  It's symmetric. Read it as "what lock level L is identical to the
  set of two locks A and B"

  One should never get N from it, we assert the impossibility

  Both indexes are lock levels; as the matrix is symmetric their order
  does not matter. An entry equal to the row's own lock level means the
  held lock already covers the request.
*/
static const enum lock_type lock_combining_matrix[10][10]=
{/*    N    S   X    IS    IX  SIX    LS    LX   SLX   LSIX         */
  {    N,   N,  N,    N,    N,   N,    N,    N,    N,    N}, /* N    */
  {    N,   S,  X,    S,  SIX, SIX,    S,  SLX,  SLX,  SIX}, /* S    */
  {    N,   X,  X,    X,    X,   X,    X,    X,    X,    X}, /* X    */
  {    N,   S,  X,   IS,   IX, SIX,   LS,   LX,  SLX, LSIX}, /* IS   */
  {    N, SIX,  X,   IX,   IX, SIX, LSIX,   LX,  SLX, LSIX}, /* IX   */
  {    N, SIX,  X,  SIX,  SIX, SIX,  SIX,  SLX,  SLX,  SIX}, /* SIX  */
  {    N,   S,  X,   LS, LSIX, SIX,   LS,   LX,  SLX, LSIX}, /* LS   */
  {    N, SLX,  X,   LX,   LX, SLX,   LX,   LX,  SLX,   LX}, /* LX   */
  {    N, SLX,  X,  SLX,  SLX, SLX,  SLX,  SLX,  SLX,  SLX}, /* SLX  */
  {    N, SIX,  X, LSIX, LSIX, SIX, LSIX,   LX,  SLX, LSIX}  /* LSIX */
};

/*
  the return codes for lockman_getlock

  It's asymmetric. Read it as "I have the lock <value in the row label>,
  what value should be returned for <value in the column label> ?"

  0 means impossible combination (assert!)

  Defines below help to preserve the table structure.
  I/L/A values are self explanatory
  x means the combination is possible (assert should not crash)
    but it cannot happen in row locks, only in table locks (S,X),
    or lock escalations (LS,LX)
*/
#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
#define A GOT_THE_LOCK
#define x GOT_THE_LOCK
static const enum lockman_getlock_result getlock_result[10][10]=
{/*    N    S   X    IS    IX  SIX    LS    LX   SLX   LSIX         */
  {    0,   0,  0,    0,    0,   0,    0,    0,    0,    0}, /* N    */
  {    0,   x,  0,    A,    0,   0,    x,    0,    0,    0}, /* S    */
  {    0,   x,  x,    A,    A,   0,    x,    x,    0,    0}, /* X    */
  {    0,   0,  0,    I,    0,   0,    0,    0,    0,    0}, /* IS   */
  {    0,   0,  0,    I,    I,   0,    0,    0,    0,    0}, /* IX   */
  {    0,   x,  0,    A,    I,   0,    x,    0,    0,    0}, /* SIX  */
  {    0,   0,  0,    L,    0,   0,    x,    0,    0,    0}, /* LS   */
  {    0,   0,  0,    L,    L,   0,    x,    x,    0,    0}, /* LX   */
  {    0,   x,  0,    A,    L,   0,    x,    x,    0,    0}, /* SLX  */
  {    0,   0,  0,    L,    I,   0,    x,    0,    0,    0}  /* LSIX */
};
#undef I
#undef L
#undef A
#undef x

/*
  this structure is optimized for a case when there're many locks
  on the same resource - e.g. a table
*/

struct st_table_lock {
  /* QQ: do we need upgraded_from ? */
  /*
    next/prev link the lock into a table's wait queue (see
    remove_from_wait_queue()); next is also used to chain free structures
    in the pool. NOTE(review): next_in_lo presumably chains the locks of
    one lock owner - its use is not visible here, confirm.
  */
  struct st_table_lock *next_in_lo, *upgraded_from, *next, *prev;
  struct st_locked_table *table;        /* the table this lock is on */
  uint16 loid;                          /* lock owner identifier */
  uchar  lock_type;                     /* lock level, indexes the matrices */
};

#define hash_insert my_hash_insert /* for consistency :) */

/*
  Look up the lock that owner 'loid' has in table->latest_locks.
  Returns whatever hash_search() returns - presumably NULL when the
  owner has no entry there (callers test the result for NULL).
*/
static inline
TABLE_LOCK *find_by_loid(LOCKED_TABLE *table, uint16 loid)
{
  return (TABLE_LOCK *)hash_search(& table->latest_locks,
                                   (uchar *)& loid, sizeof(loid));
}

/*
  Unlink 'lock' from the wait queue of 'table'.

  wait_queue_out is the head of the queue (the element without prev),
  wait_queue_in is its tail (the element without next); both are updated
  when the removed lock was at the respective end. The asserts check that
  the lock's links and the queue ends agree.
  The lock's own next/prev pointers are left unchanged.
*/
static inline
void remove_from_wait_queue(TABLE_LOCK *lock, LOCKED_TABLE *table)
{
  DBUG_ASSERT(table == lock->table);
  if (lock->prev)
  {
    DBUG_ASSERT(table->wait_queue_out != lock);
    lock->prev->next= lock->next;
  }
  else
  {
    DBUG_ASSERT(table->wait_queue_out == lock);
    table->wait_queue_out= lock->next;
  }
  if (lock->next)
  {
    DBUG_ASSERT(table->wait_queue_in != lock);
    lock->next->prev= lock->prev;
  }
  else
  {
    DBUG_ASSERT(table->wait_queue_in == lock);
    table->wait_queue_in= lock->prev;
  }
}

/*
  DESCRIPTION
    tries to lock a resource 'table' with a lock level 'lock'.

  RETURN
    see enum lockman_getlock_result
*/
enum lockman_getlock_result
tablockman_getlock(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo,
                   LOCKED_TABLE *table, enum lock_type lock)
{
  TABLE_LOCK *old, *new, *blocker, *blocker2;
  TABLE_LOCK_OWNER *wait_for;
  ulonglong deadline;
  struct timespec timeout;
  enum lock_type new_lock;
  enum lockman_getlock_result res;
  int i;

  DBUG_ASSERT(lo->waiting_lock == 0);
  DBUG_ASSERT(lo->waiting_for == 0);
  DBUG_ASSERT(lo->waiting_for_loid == 0);

  pthread_mutex_lock(& table->mutex);
  /* do we already have a lock on this resource ? */
  old= find_by_loid(table, lo->loid);

  /* calculate the level of the upgraded lock, if yes */
  new_lock= old ? lock_combining_matrix[old->lock_type][lock] : lock;

  /* and check if old lock is enough to satisfy the new request */
  if (old && new_lock == old->lock_type)
  {
    /* yes */
    res= getlock_result[old->lock_type][lock];
    goto ret;
  }

  /* no, placing a new lock. first - take a free lock structure from the pool */
  pthread_mutex_lock(& lm->pool_mutex);
  new= lm->pool;
  if (new)
  {
    lm->pool= new->next;
    pthread_mutex_unlock(& lm->pool_mutex);
  }
  else
  {
    pthread_mutex_unlock(& lm->pool_mutex);
    new= (TABLE_LOCK *)my_malloc(sizeof(*new), MYF(MY_WME));
    if (unlikely(!new))
    {
      res= NO_MEMORY_FOR_LOCK;
      goto ret;
    }
  }

  new->loid= lo->loid;
  new->lock_type= new_lock;
  new->table= table;

  /* and try to place it */
  for (new->prev= table->wait_queue_in;;)
  {
    wait_for= 0;
    if (!old)
    {
      /* not upgrading - a lock must be added to the _end_ of the wait queue */
      for (blocker= new->prev; blocker && !wait_for; blocker= blocker->prev)
      {
        TABLE_LOCK_OWNER *tmp= lm->loid_to_tlo(blocker->loid);

        /* find a blocking lock */
        DBUG_ASSERT(table->wait_queue_out);
        DBUG_ASSERT(table->wait_queue_in);
        if (!lock_compatibility_matrix[blocker->lock_type][lock])
        {
          /* found! */
          wait_for= tmp;
          break;
        }

        /*
          hmm, the lock before doesn't block us, let's look one step further.
          the condition below means:

            if we never waited on a condition yet
            OR
            the lock before ours (blocker) waits on a lock (blocker2) that is
                      present in the hash AND conflicts with 'blocker'

          the condition after OR may fail if 'blocker2' was removed from
          the hash, its signal woke us up, but 'blocker' itself didn't see
          the signal yet.
        */
        if (!lo->waiting_lock ||
            ((blocker2= find_by_loid(table, tmp->waiting_for_loid)) &&
              !lock_compatibility_matrix[blocker2->lock_type]
                                        [blocker->lock_type]))
        {
          /* but it's waiting for a real lock. we'll wait for the same lock */
          wait_for= tmp->waiting_for;
          /*
            We don't really need tmp->waiting_for, as tmp->waiting_for_loid
            is enough. waiting_for is just a local cache to avoid calling
            loid_to_tlo().
            But it's essential that tmp->waiting_for pointer can ONLY
            be dereferenced if find_by_loid() above returns a non-null
            pointer, because a TABLE_LOCK_OWNER object that it points to
            may've been freed when we come here after a signal.
            In particular tmp->waiting_for_loid cannot be replaced
            with tmp->waiting_for->loid.
          */
          DBUG_ASSERT(wait_for == lm->loid_to_tlo(tmp->waiting_for_loid));
          break;
        }

        /*
          otherwise - a lock it's waiting for doesn't exist.
          We've no choice but to scan the wait queue backwards, looking
          for a conflicting lock or a lock waiting for a real lock.
          QQ is there a way to avoid this scanning ?
        */
      }
    }

    if (wait_for == 0)
    {
      /* checking for compatibility with existing locks */
      for (blocker= 0, i= 0; i < LOCK_TYPES; i++)
      {
        if (table->active_locks[i] && !lock_compatibility_matrix[i+1][lock])
        {
          blocker= table->active_locks[i];
          /* if the first lock in the list is our own - skip it */
          if (blocker->loid == lo->loid)
            blocker= blocker->next;
          if (blocker) /* found a conflicting lock, need to wait */
            break;
        }
      }
      if (!blocker) /* free to go */
        break;
      wait_for= lm->loid_to_tlo(blocker->loid);
    }

    /* ok, we're here - the wait is inevitable */
    lo->waiting_for= wait_for;
    lo->waiting_for_loid= wait_for->loid;
    if (!lo->waiting_lock) /* first iteration of the for() loop */
    {
      /* lock upgrade or new lock request ? */
      if (old)
      {
        /* upgrade - add the lock to the _start_ of the wait queue */
        new->prev= 0;
        if ((new->next= table->wait_queue_out))
          new->next->prev= new;
        table->wait_queue_out= new;
        if (!table->wait_queue_in)
          table->wait_queue_in= table->wait_queue_out;
      }
      else
      {
        /* new lock - add the lock to the _end_ of the wait queue */
        new->next= 0;
        if ((new->prev= table->wait_queue_in))
          new->prev->next= new;
        table->wait_queue_in= new;
        if (!table->wait_queue_out)
          table->wait_queue_out= table->wait_queue_in;
      }
      lo->waiting_lock= new;

      deadline= my_getsystime() + lm->lock_timeout * 10000;
      timeout.tv_sec= deadline/10000000;
      timeout.tv_nsec= (deadline % 10000000) * 100;
    }

    /*
      prepare to wait.
      we must lock blocker's mutex to wait on blocker's cond.
      and we must release table's mutex.
      note that blocker's mutex is locked _before_ table's mutex is released
    */
    pthread_mutex_lock(wait_for->mutex);
    pthread_mutex_unlock(& table->mutex);

    /* now really wait */
    i= pthread_cond_timedwait(wait_for->cond, wait_for->mutex, & timeout);

    pthread_mutex_unlock(wait_for->mutex);

    if (i == ETIMEDOUT || i == ETIME)
    {
      /* we rely on the caller to rollback and release all locks */
      res= LOCK_TIMEOUT;
      goto ret2;
    }

    pthread_mutex_lock(& table->mutex);

    /* ... and repeat from the beginning */
  }
  /* yeah!
we can place the lock now */ + + /* remove the lock from the wait queue, if it was there */ + if (lo->waiting_lock) + { + remove_from_wait_queue(new, table); + lo->waiting_lock= 0; + lo->waiting_for= 0; + lo->waiting_for_loid= 0; + } + + /* add it to the list of all locks of this lock owner */ + new->next_in_lo= lo->active_locks; + lo->active_locks= new; + + /* and to the list of active locks of this lock type */ + new->prev= 0; + if ((new->next= table->active_locks[new_lock-1])) + new->next->prev= new; + table->active_locks[new_lock-1]= new; + + /* update the latest_locks hash */ + if (old) + hash_delete(& table->latest_locks, (uchar *)old); + hash_insert(& table->latest_locks, (uchar *)new); + + new->upgraded_from= old; + + res= getlock_result[lock][lock]; + +ret: + pthread_mutex_unlock(& table->mutex); +ret2: + DBUG_ASSERT(res); + return res; +} + +/* + DESCRIPTION + release all locks belonging to a transaction. + signal waiters to continue +*/ +void tablockman_release_locks(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo) +{ + TABLE_LOCK *lock, *local_pool= 0, *local_pool_end; + + /* + instead of adding released locks to a pool one by one, we'll link + them in a list and add to a pool in one short action (under a mutex) + */ + local_pool_end= lo->waiting_lock ? lo->waiting_lock : lo->active_locks; + if (!local_pool_end) + return; + + /* release a waiting lock, if any */ + if ((lock= lo->waiting_lock)) + { + DBUG_ASSERT(lock->loid == lo->loid); + pthread_mutex_lock(& lock->table->mutex); + remove_from_wait_queue(lock, lock->table); + + /* + a special case: if this lock was not the last in the wait queue + and it's compatible with the next lock, than the next lock + is waiting for our blocker though really it waits for us, indirectly. + Signal our blocker to release this next lock (after we removed our + lock from the wait queue, of course). + */ + /* + An example to clarify the above: + trn1> S-lock the table. Granted. + trn2> IX-lock the table. Added to the wait queue. 
trn2 waits on trn1
+        trn3> IS-lock the table. The queue is not empty, so IS-lock is added
+              to the queue. It's compatible with the waiting IX-lock, so trn3
+              waits for trn2->waiting_for, that is trn1.
+      if trn1 releases the lock it signals trn1->cond and both waiting
+      transactions are awaken. But if trn2 times out, trn3 must be notified
+      too (as IS and S locks are compatible). So trn2 must signal trn1->cond.
+    */
+    if (lock->next &&
+        lock_compatibility_matrix[lock->next->lock_type][lock->lock_type])
+    {
+      pthread_mutex_lock(lo->waiting_for->mutex);
+      pthread_cond_broadcast(lo->waiting_for->cond);
+      pthread_mutex_unlock(lo->waiting_for->mutex);
+    }
+    lo->waiting_for= 0;
+    lo->waiting_for_loid= 0;
+    pthread_mutex_unlock(& lock->table->mutex);
+
+    /* the waiting lock becomes the first (and so the last) pool element */
+    lock->next= local_pool;
+    local_pool= lock;
+  }
+
+  /* now release granted locks */
+  lock= lo->active_locks;
+  while (lock)
+  {
+    TABLE_LOCK *cur= lock;
+    pthread_mutex_t *mutex= & lock->table->mutex;
+    DBUG_ASSERT(cur->loid == lo->loid);
+
+    /* advance first: cur->next is reused below to link cur into local_pool */
+    DBUG_ASSERT(lock != lock->next_in_lo);
+    lock= lock->next_in_lo;
+
+    /* TODO ? group locks by table to reduce the number of mutex locks */
+    pthread_mutex_lock(mutex);
+    hash_delete(& cur->table->latest_locks, (uchar *)cur);
+
+    /* unlink cur from the per-type list of active locks of its table */
+    if (cur->prev)
+      cur->prev->next= cur->next;
+    if (cur->next)
+      cur->next->prev= cur->prev;
+    if (cur->table->active_locks[cur->lock_type-1] == cur)
+      cur->table->active_locks[cur->lock_type-1]= cur->next;
+
+    cur->next= local_pool;
+    local_pool= cur;
+
+    pthread_mutex_unlock(mutex);
+  }
+
+  lo->waiting_lock= lo->active_locks= 0;
+
+  /*
+    okay, all locks released. now signal that we're leaving,
+    in case somebody's waiting for it
+  */
+  pthread_mutex_lock(lo->mutex);
+  pthread_cond_broadcast(lo->cond);
+  pthread_mutex_unlock(lo->mutex);
+
+  /*
+    and push all freed locks to the lockman's pool.
+    local_pool_end is the lock that was linked into local_pool first,
+    so it is the tail of the local list.
+  */
+  pthread_mutex_lock(& lm->pool_mutex);
+  local_pool_end->next= lm->pool;
+  lm->pool= local_pool;
+  pthread_mutex_unlock(& lm->pool_mutex);
+}
+
+/*
+  initialize a lock manager
+
+  SYNOPSIS
+    lm        a TABLOCKMAN to initialize (its pool of free locks starts empty)
+    func      callback, stored in lm->loid_to_tlo, that maps a loid to its
+              TABLE_LOCK_OWNER
+    timeout   lock wait timeout, stored in lm->lock_timeout
+*/
+void tablockman_init(TABLOCKMAN *lm, loid_to_tlo_func *func, uint timeout)
+{
+  lm->pool= 0;
+  lm->loid_to_tlo= func;
+  lm->lock_timeout= timeout;
+  pthread_mutex_init(& lm->pool_mutex, MY_MUTEX_INIT_FAST);
+  my_getsystime();    /* ensure that my_getsystime() is initialized */
+}
+
+/*
+  free every TABLE_LOCK left in the pool of 'lm' and destroy the pool mutex
+*/
+void tablockman_destroy(TABLOCKMAN *lm)
+{
+  while (lm->pool)
+  {
+    TABLE_LOCK *tmp= lm->pool;
+    lm->pool= tmp->next;
+    my_free((void *)tmp, MYF(0));
+  }
+  pthread_mutex_destroy(& lm->pool_mutex);
+}
+
+/*
+  initialize a LOCKED_TABLE structure
+
+  SYNOPSIS
+    lt                  a LOCKED_TABLE to initialize
+    initial_hash_size   initial size for 'latest_locks' hash
+
+  NOTE
+    the hash is keyed on the 'loid' member of TABLE_LOCK
+*/
+void tablockman_init_locked_table(LOCKED_TABLE *lt, int initial_hash_size)
+{
+  bzero(lt, sizeof(*lt));
+  pthread_mutex_init(& lt->mutex, MY_MUTEX_INIT_FAST);
+  hash_init(& lt->latest_locks, & my_charset_bin, initial_hash_size,
+            offsetof(TABLE_LOCK, loid),
+            sizeof(((TABLE_LOCK*)0)->loid), 0, 0, 0);
+}
+
+/*
+  destroy a LOCKED_TABLE structure.
+  The asserts below require that no lock is waiting, registered in the hash,
+  or active on the table at this point.
+*/
+void tablockman_destroy_locked_table(LOCKED_TABLE *lt)
+{
+  int i;
+
+  DBUG_ASSERT(lt->wait_queue_out == 0);
+  DBUG_ASSERT(lt->wait_queue_in == 0);
+  DBUG_ASSERT(lt->latest_locks.records == 0);
+  for (i= 0; i<LOCK_TYPES; i++)
+    DBUG_ASSERT(lt->active_locks[i] == 0);
+
+  hash_free(& lt->latest_locks);
+  pthread_mutex_destroy(& lt->mutex);
+}
+
+#ifdef EXTRA_DEBUG
+/* printable lock level names, indexed by the lock_type value */
+static const char *lock2str[LOCK_TYPES+1]= {"N", "S", "X", "IS", "IX", "SIX",
+  "LS", "LX", "SLX", "LSIX"};
+
+/*
+  debugging aid: print to stdout the waiting lock of 'lo' (in parentheses)
+  followed by its active locks
+*/
+void tablockman_print_tlo(TABLE_LOCK_OWNER *lo)
+{
+  TABLE_LOCK *lock;
+
+  printf("lo%d>", lo->loid);
+  if ((lock= lo->waiting_lock))
+    printf(" (%s.0x%lx)", lock2str[lock->lock_type], 
(ulong)lock->table); + for (lock= lo->active_locks; + lock && lock != lock->next_in_lo; + lock= lock->next_in_lo) + printf(" %s.0x%lx", lock2str[lock->lock_type], (ulong)lock->table); + if (lock && lock == lock->next_in_lo) + printf("!"); + printf("\n"); +} +#endif + diff --git a/storage/maria/tablockman.h b/storage/maria/tablockman.h new file mode 100644 index 00000000000..58c852b5a21 --- /dev/null +++ b/storage/maria/tablockman.h @@ -0,0 +1,87 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _tablockman_h +#define _tablockman_h + +/* + Lock levels: + ^^^^^^^^^^^ + + N - "no lock", not a lock, used sometimes internally to simplify the code + S - Shared + X - eXclusive + IS - Intention Shared + IX - Intention eXclusive + SIX - Shared + Intention eXclusive + LS - Loose Shared + LX - Loose eXclusive + SLX - Shared + Loose eXclusive + LSIX - Loose Shared + Intention eXclusive +*/ +#ifndef _lockman_h +/* QQ: TODO remove N-locks */ +enum lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST }; +enum lockman_getlock_result { + NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT, + GOT_THE_LOCK, + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE, + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE +}; +#endif + +#define LOCK_TYPES (LOCK_TYPE_LAST-1) + +typedef struct st_table_lock TABLE_LOCK; + +typedef struct st_table_lock_owner { + 
TABLE_LOCK *active_locks; /* list of active locks */ + TABLE_LOCK *waiting_lock; /* waiting lock (one lock only) */ + struct st_table_lock_owner *waiting_for; /* transaction we're waiting for */ + pthread_cond_t *cond; /* transactions waiting for us, wait on 'cond' */ + pthread_mutex_t *mutex; /* mutex is required to use 'cond' */ + uint16 loid, waiting_for_loid; /* Lock Owner IDentifier */ +} TABLE_LOCK_OWNER; + +typedef struct st_locked_table { + pthread_mutex_t mutex; /* mutex for everything below */ + HASH latest_locks; /* latest locks in a hash */ + TABLE_LOCK *active_locks[LOCK_TYPES]; /* dl-list of locks per type */ + TABLE_LOCK *wait_queue_in, *wait_queue_out; /* wait deque (double-end queue)*/ +} LOCKED_TABLE; + +typedef TABLE_LOCK_OWNER *loid_to_tlo_func(uint16); + +typedef struct { + pthread_mutex_t pool_mutex; + TABLE_LOCK *pool; /* lifo pool of free locks */ + uint lock_timeout; /* lock timeout in milliseconds */ + loid_to_tlo_func *loid_to_tlo; /* for mapping loid to TABLE_LOCK_OWNER */ +} TABLOCKMAN; + +void tablockman_init(TABLOCKMAN *, loid_to_tlo_func *, uint); +void tablockman_destroy(TABLOCKMAN *); +enum lockman_getlock_result tablockman_getlock(TABLOCKMAN *, TABLE_LOCK_OWNER *, + LOCKED_TABLE *, enum lock_type); +void tablockman_release_locks(TABLOCKMAN *, TABLE_LOCK_OWNER *); +void tablockman_init_locked_table(LOCKED_TABLE *, int); +void tablockman_destroy_locked_table(LOCKED_TABLE *); + +#ifdef EXTRA_DEBUG +void tablockman_print_tlo(TABLE_LOCK_OWNER *); +#endif + +#endif + diff --git a/storage/maria/test_pack b/storage/maria/test_pack new file mode 100755 index 00000000000..689645b1661 --- /dev/null +++ b/storage/maria/test_pack @@ -0,0 +1,10 @@ +silent="-s" +suffix="" + +ma_test1$suffix -s ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -us test1 ; maria_chk$suffix -es test1 +ma_test1$suffix -s -S ; maria_pack$suffix --force -s test1 ; 
maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ;maria_chk$suffix -us test1 ; maria_chk$suffix -es test1 +ma_test1$suffix -s -b ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 +ma_test1$suffix -s -w ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -ros test1 ; maria_chk$suffix -es test1 + +ma_test2$suffix -s -t4 ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2 +ma_test2$suffix -s -t4 -b ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2 diff --git a/storage/maria/trnman.c b/storage/maria/trnman.c new file mode 100644 index 00000000000..03d11db3b5b --- /dev/null +++ b/storage/maria/trnman.c @@ -0,0 +1,743 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "trnman.h"
+#include "ma_checkpoint.h"
+#include "ma_control_file.h"
+
+/*
+  status variables:
+  how many trns in the active list currently,
+  in the committed list currently, allocated since startup.
+*/
+uint trnman_active_transactions, trnman_committed_transactions,
+  trnman_allocated_transactions;
+
+/*
+  list of active transactions in the trid order.
+  The _min and _max objects are sentinels: they are the fixed head and tail
+  of the list, real transactions are linked between them.
+*/
+static TRN active_list_min, active_list_max;
+/* list of committed transactions in the trid order (same sentinel scheme) */
+static TRN committed_list_min, committed_list_max;
+
+/* a counter, used to generate transaction ids */
+static TrID global_trid_generator;
+
+/* the mutex for everything above */
+static pthread_mutex_t LOCK_trn_list;
+
+/* LIFO pool of unused TRN structures for reuse */
+static TRN *pool;
+
+/* a hash for committed transactions that maps trid to a TRN structure */
+static LF_HASH trid_to_committed_trn;
+
+/* an array that maps short_trid of an active transaction to a TRN structure */
+static TRN **short_trid_to_active_trn;
+
+/* locks for short_trid_to_active_trn and pool */
+static my_atomic_rwlock_t LOCK_short_trid_to_trn, LOCK_pool;
+
+/*
+  Simple interface functions
+  QQ: if they stay so simple, should we make them inline?
+*/
+
+/*
+  Increment the locked-tables counter of 'trn'.
+  Note the post-increment: the value returned is the counter _before_ the
+  increment.
+*/
+uint trnman_increment_locked_tables(TRN *trn)
+{
+  return trn->locked_tables++;
+}
+
+/* Return non-zero if the locked-tables counter of 'trn' is not 0 */
+my_bool trnman_has_locked_tables(TRN *trn)
+{
+  return trn->locked_tables != 0;
+}
+
+/*
+  Decrement the locked-tables counter of 'trn'.
+  Note the pre-decrement: the value returned is the counter _after_ the
+  decrement (unlike trnman_increment_locked_tables()).
+*/
+uint trnman_decrement_locked_tables(TRN *trn)
+{
+  return --trn->locked_tables;
+}
+
+/* Set the locked-tables counter of 'trn' back to 0 */
+void trnman_reset_locked_tables(TRN *trn)
+{
+  trn->locked_tables= 0;
+}
+
+
+/*
+  NOTE
+    Just as short_id doubles as loid, this function doubles as
+    short_trid_to_LOCK_OWNER. See the compile-time assert below. 
+*/ + +#ifdef NOT_USED +static TRN *short_trid_to_TRN(uint16 short_trid) +{ + TRN *trn; + compile_time_assert(offsetof(TRN, locks) == 0); + my_atomic_rwlock_rdlock(&LOCK_short_trid_to_trn); + trn= my_atomic_loadptr((void **)&short_trid_to_active_trn[short_trid]); + my_atomic_rwlock_rdunlock(&LOCK_short_trid_to_trn); + return (TRN *)trn; +} +#endif + +static uchar *trn_get_hash_key(const uchar *trn, size_t *len, + my_bool unused __attribute__ ((unused))) +{ + *len= sizeof(TrID); + return (uchar *) & ((*((TRN **)trn))->trid); +} + + +/** + @brief Initializes transaction manager. + + @param initial_trid Generated TrIDs will start from initial_trid+1. + + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +int trnman_init(TrID initial_trid) +{ + DBUG_ENTER("trnman_init"); + + short_trid_to_active_trn= (TRN **)my_malloc(SHORT_TRID_MAX*sizeof(TRN*), + MYF(MY_WME|MY_ZEROFILL)); + if (unlikely(!short_trid_to_active_trn)) + DBUG_RETURN(1); + short_trid_to_active_trn--; /* min short_trid is 1 */ + + /* + Initialize lists. + active_list_max.min_read_from must be larger than any trid, + so that when an active list is empty we would could free + all committed list. 
+ And committed_list_max itself can not be freed so + committed_list_max.commit_trid must not be smaller that + active_list_max.min_read_from + */ + + active_list_max.trid= active_list_min.trid= 0; + active_list_max.min_read_from= ~(ulong) 0; + active_list_max.next= active_list_min.prev= 0; + active_list_max.prev= &active_list_min; + active_list_min.next= &active_list_max; + + committed_list_max.commit_trid= ~(ulong) 0; + committed_list_max.next= committed_list_min.prev= 0; + committed_list_max.prev= &committed_list_min; + committed_list_min.next= &committed_list_max; + + trnman_active_transactions= 0; + trnman_committed_transactions= 0; + trnman_allocated_transactions= 0; + + pool= 0; + global_trid_generator= initial_trid; + lf_hash_init(&trid_to_committed_trn, sizeof(TRN*), LF_HASH_UNIQUE, + 0, 0, trn_get_hash_key, 0); + DBUG_PRINT("info", ("pthread_mutex_init LOCK_trn_list")); + pthread_mutex_init(&LOCK_trn_list, MY_MUTEX_INIT_FAST); + my_atomic_rwlock_init(&LOCK_short_trid_to_trn); + my_atomic_rwlock_init(&LOCK_pool); + +#ifdef NOT_USED + lockman_init(&maria_lockman, (loid_to_lo_func *)&short_trid_to_TRN, 10000); +#endif + + DBUG_RETURN(0); +} + +/* + NOTE + this could only be called in the "idle" state - no transaction can be + running. See asserts below. 
+*/ +void trnman_destroy() +{ + DBUG_ENTER("trnman_destroy"); + + if (short_trid_to_active_trn == NULL) /* trnman already destroyed */ + DBUG_VOID_RETURN; + DBUG_ASSERT(trid_to_committed_trn.count == 0); + DBUG_ASSERT(trnman_active_transactions == 0); + DBUG_ASSERT(trnman_committed_transactions == 0); + DBUG_ASSERT(active_list_max.prev == &active_list_min); + DBUG_ASSERT(active_list_min.next == &active_list_max); + DBUG_ASSERT(committed_list_max.prev == &committed_list_min); + DBUG_ASSERT(committed_list_min.next == &committed_list_max); + while (pool) + { + TRN *trn= pool; + pool= pool->next; + DBUG_ASSERT(trn->locks.mutex == 0); + DBUG_ASSERT(trn->locks.cond == 0); + my_free((void *)trn, MYF(0)); + } + lf_hash_destroy(&trid_to_committed_trn); + DBUG_PRINT("info", ("pthread_mutex_destroy LOCK_trn_list")); + pthread_mutex_destroy(&LOCK_trn_list); + my_atomic_rwlock_destroy(&LOCK_short_trid_to_trn); + my_atomic_rwlock_destroy(&LOCK_pool); + my_free((void *)(short_trid_to_active_trn+1), MYF(0)); + short_trid_to_active_trn= NULL; +#ifdef NOT_USED + lockman_destroy(&maria_lockman); +#endif + DBUG_VOID_RETURN; +} + +/* + NOTE + TrID is limited to 6 bytes. Initial value of the generator + is set by the recovery code - being read from the last checkpoint + (or 1 on a first run). 
+*/ +static TrID new_trid() +{ + DBUG_ENTER("new_trid"); + DBUG_ASSERT(global_trid_generator < 0xffffffffffffLL); + DBUG_PRINT("info", ("safe_mutex_assert_owner LOCK_trn_list")); + safe_mutex_assert_owner(&LOCK_trn_list); + DBUG_RETURN(++global_trid_generator); +} + +static void set_short_trid(TRN *trn) +{ + int i= (global_trid_generator + (intptr)trn) * 312089 % SHORT_TRID_MAX + 1; + for ( ; !trn->short_id ; i= 1) + { + my_atomic_rwlock_wrlock(&LOCK_short_trid_to_trn); + for ( ; i <= SHORT_TRID_MAX; i++) /* the range is [1..SHORT_TRID_MAX] */ + { + void *tmp= NULL; + if (short_trid_to_active_trn[i] == NULL && + my_atomic_casptr((void **)&short_trid_to_active_trn[i], &tmp, trn)) + { + trn->short_id= i; + break; + } + } + my_atomic_rwlock_wrunlock(&LOCK_short_trid_to_trn); + } +} + +/* + DESCRIPTION + start a new transaction, allocate and initialize transaction object + mutex and cond will be used for lock waits +*/ + +TRN *trnman_new_trn(pthread_mutex_t *mutex, pthread_cond_t *cond, + void *stack_end) +{ + TRN *trn; + DBUG_ENTER("trnman_new_trn"); + + /* + we have a mutex, to do simple things under it - allocate a TRN, + increment trnman_active_transactions, set trn->min_read_from. + + Note that all the above is fast. generating short_trid may be slow, + as it involves scanning a large array - so it's done outside of the + mutex. + */ + + DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list")); + pthread_mutex_lock(&LOCK_trn_list); + + /* Allocating a new TRN structure */ + trn= pool; + /* + Popping an unused TRN from the pool + (ABA isn't possible, we're behind a mutex + */ + my_atomic_rwlock_wrlock(&LOCK_pool); + while (trn && !my_atomic_casptr((void **)&pool, (void **)&trn, + (void *)trn->next)) + /* no-op */; + my_atomic_rwlock_wrunlock(&LOCK_pool); + + /* Nothing in the pool ? Allocate a new one */ + if (!trn) + { + /* + trn should be completely initalized at create time to allow + one to keep a known state on it. 
+ (Like redo_lns, which is assumed to be 0 at start of row handling + and reset to zero before end of row handling) + */ + trn= (TRN *)my_malloc(sizeof(TRN), MYF(MY_WME | MY_ZEROFILL)); + if (unlikely(!trn)) + { + DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list")); + pthread_mutex_unlock(&LOCK_trn_list); + return 0; + } + trnman_allocated_transactions++; + } + trn->pins= lf_hash_get_pins(&trid_to_committed_trn, stack_end); + if (!trn->pins) + { + trnman_free_trn(trn); + return 0; + } + + trnman_active_transactions++; + + trn->min_read_from= active_list_min.next->trid; + + trn->trid= new_trid(); + trn->short_id= 0; + + trn->next= &active_list_max; + trn->prev= active_list_max.prev; + active_list_max.prev= trn->prev->next= trn; + DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list")); + pthread_mutex_unlock(&LOCK_trn_list); + + if (unlikely(!trn->min_read_from)) + trn->min_read_from= trn->trid; + + trn->commit_trid= 0; + trn->rec_lsn= trn->undo_lsn= trn->first_undo_lsn= 0; + + trn->locks.mutex= mutex; + trn->locks.cond= cond; + trn->locks.waiting_for= 0; + trn->locks.all_locks= 0; +#ifdef NOT_USED + trn->locks.pins= lf_alloc_get_pins(&maria_lockman.alloc); +#endif + + trn->locked_tables= 0; + + /* + only after the following function TRN is considered initialized, + so it must be done the last + */ + set_short_trid(trn); + + DBUG_RETURN(trn); +} + +/* + remove a trn from the active list. + if necessary - move to committed list and set commit_trid + + NOTE + Locks are released at the end. In particular, after placing the + transaction in commit list, and after setting commit_trid. It's + important, as commit_trid affects visibility. Locks don't affect + anything they simply delay execution of other threads - they could be + released arbitrarily late. In other words, when locks are released it + serves as a start banner for other threads, they start to run. So + everything they may need must be ready at that point. 
+
+  RETURN
+    0  ok
+    1  error
+*/
+int trnman_end_trn(TRN *trn, my_bool commit)
+{
+  /* res stays 1 unless lf_hash_insert() below is reached and overwrites it */
+  int res= 1;
+  TRN *free_me= 0;
+  LF_PINS *pins= trn->pins;
+  DBUG_ENTER("trnman_end_trn");
+
+  DBUG_ASSERT(trn->rec_lsn == 0);
+  /* if a rollback, all UNDO records should have been executed */
+  DBUG_ASSERT(commit || trn->undo_lsn == 0);
+  DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list"));
+  pthread_mutex_lock(&LOCK_trn_list);
+
+  /* remove from active list */
+  trn->next->prev= trn->prev;
+  trn->prev->next= trn->next;
+
+  /*
+    if trn was the oldest active transaction, now that it goes away there
+    may be committed transactions in the list which no active transaction
+    needs to bother about - clean up the committed list
+  */
+  if (trn->prev == &active_list_min)
+  {
+    uint free_me_count;
+    TRN *t;
+    /*
+      the scan always stops: at the latest on committed_list_max, whose
+      commit_trid is set to the largest value by trnman_init()
+    */
+    for (t= committed_list_min.next, free_me_count= 0;
+         t->commit_trid < active_list_min.next->min_read_from;
+         t= t->next, free_me_count++) /* no-op */;
+
+    DBUG_ASSERT((t != committed_list_min.next && free_me_count > 0) ||
+                (t == committed_list_min.next && free_me_count == 0));
+    /* found transactions committed before the oldest active one */
+    if (t != committed_list_min.next)
+    {
+      /* cut [first committed .. t->prev] out as a 0-terminated free_me list */
+      free_me= committed_list_min.next;
+      committed_list_min.next= t;
+      t->prev->next= 0;
+      t->prev= &committed_list_min;
+      trnman_committed_transactions-= free_me_count;
+    }
+  }
+
+  /*
+    if transaction is committed and it was not the only active transaction -
+    add it to the committed list (which is used for read-from relation)
+  */
+  if (commit && active_list_min.next != &active_list_max)
+  {
+    trn->commit_trid= global_trid_generator;
+    trn->next= &committed_list_max;
+    trn->prev= committed_list_max.prev;
+    /*
+      NOTE(review): the counter is incremented before the result of
+      lf_hash_insert() is known; if the insert fails trn goes to free_me
+      below and is never linked into the committed list - confirm whether
+      the counter should be left incremented in that case.
+    */
+    trnman_committed_transactions++;
+
+    res= lf_hash_insert(&trid_to_committed_trn, pins, &trn);
+    /*
+      By going on with life if res<0, we let other threads block on
+      our rows (because they will never see us committed in
+      trid_to_committed_trn) until they timeout. Though correct, this is not a
+      good situation:
+      - if connection reconnects and wants to check if its rows have been
+      committed, it will not be able to do that (it will just lock on them) so
+      connection stays permanently in doubt
+      - internal structures trid_to_committed_trn and committed_list are
+      desynchronized.
+      So we should take Maria down immediately, the two problems being
+      automatically solved at restart.
+    */
+    DBUG_ASSERT(res <= 0);
+  }
+  if (res)
+  {
+    /*
+      res == 1 means the condition in the if() above
+      was false.
+      res == -1 means lf_hash_insert failed
+    */
+    trn->next= free_me;
+    free_me= trn;
+  }
+  else
+  {
+    /* complete the linking into the committed list started above */
+    committed_list_max.prev= trn->prev->next= trn;
+  }
+  trnman_active_transactions--;
+  DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list"));
+  pthread_mutex_unlock(&LOCK_trn_list);
+
+  /* the rest is done outside of a critical section */
+#ifdef NOT_USED
+  lockman_release_locks(&maria_lockman, &trn->locks);
+#endif
+  trn->locks.mutex= 0;
+  trn->locks.cond= 0;
+  /* release the short id: clear our slot in the short_trid -> TRN array */
+  my_atomic_rwlock_rdlock(&LOCK_short_trid_to_trn);
+  my_atomic_storeptr((void **)&short_trid_to_active_trn[trn->short_id], 0);
+  my_atomic_rwlock_rdunlock(&LOCK_short_trid_to_trn);
+
+  /*
+    we, under the mutex, removed going-in-free_me transactions from the
+    active and committed lists, thus nobody else may see them when it scans
+    those lists, and thus nobody may want to free them. Now we don't
+    need a mutex to access free_me list
+  */
+  /* QQ: send them to the purge thread */
+  while (free_me)
+  {
+    TRN *t= free_me;
+    free_me= free_me->next;
+
+    /*
+      ignore OOM here. it's harmless, and there's nothing we could do, anyway
+    */
+    (void)lf_hash_delete(&trid_to_committed_trn, pins, &t->trid, sizeof(TrID));
+
+    trnman_free_trn(t);
+  }
+
+  /* 'pins' was saved on entry: 'trn' itself may be in the pool by now */
+  lf_hash_put_pins(pins);
+#ifdef NOT_USED
+  lf_pinbox_put_pins(trn->locks.pins);
+#endif
+
+  /* 1 only when lf_hash_insert() failed; res == 1 is not an error */
+  DBUG_RETURN(res < 0);
+}
+
+/*
+  free a trn (add to the pool, that is)
+  note - we can never really free() a TRN if there's at least one other
+  running transaction - see, e.g., how lock waits are implemented in
+  lockman.c
+  The same is true for other lock-free data structures too. We may need some
+  kind of FLUSH command to reset them all - ensuring that no transactions are
+  running. It may even be called automatically on checkpoints if no
+  transactions are running.
+*/
+void trnman_free_trn(TRN *trn)
+{
+  /* expected current head of the pool; used as the "old value" of the CAS */
+  TRN *tmp= pool;
+
+  my_atomic_rwlock_wrlock(&LOCK_pool);
+  do
+  {
+    /*
+      without this volatile cast gcc-3.4.4 moved the assignment
+      down after the loop at -O2
+    */
+    *(TRN * volatile *)&(trn->next)= tmp;
+    /*
+      retried until 'pool' is switched from tmp to trn; presumably a failed
+      my_atomic_casptr() refreshes tmp with the current head - TODO confirm
+    */
+  } while (!my_atomic_casptr((void **)&pool, (void **)&tmp, trn));
+  my_atomic_rwlock_wrunlock(&LOCK_pool);
+}
+
+/*
+  NOTE
+    here we access the hash in a lock-free manner.
+    It's safe, a 'found' TRN can never be freed/reused before we access it. 
+    In fact, it cannot be freed before 'trn' ends, because a 'found' TRN
+    can only be removed from the hash when:
+                found->commit_trid < ALL (trn->min_read_from)
+    that is, at least
+                found->commit_trid < trn->min_read_from
+    but
+                found->trid >= trn->min_read_from
+    and
+                found->commit_trid > found->trid
+
+  RETURN
+    1   can
+    0   cannot
+   -1   error (OOM)
+*/
+int trnman_can_read_from(TRN *trn, TrID trid)
+{
+  TRN **found;
+  my_bool can;
+  LF_REQUIRE_PINS(3);
+
+  if (trid < trn->min_read_from)
+    return 1; /* can read */
+  if (trid > trn->trid)
+    return 0; /* cannot read */
+
+  /* in between: the answer depends on when (and if) 'trid' committed */
+  found= lf_hash_search(&trid_to_committed_trn, trn->pins, &trid, sizeof(trid));
+  if (found == NULL)
+    return 0; /* not in the hash of committed transactions = cannot read */
+  if (found == MY_ERRPTR)
+    return -1;
+
+  /* readable only if it committed before 'trn' got its own trid */
+  can= (*found)->commit_trid < trn->trid;
+  lf_hash_search_unpin(trn->pins);
+  return can;
+}
+
+/* TODO: the stubs below are waiting for savepoints to be implemented */
+
+void trnman_new_statement(TRN *trn __attribute__ ((unused)))
+{
+}
+
+void trnman_rollback_statement(TRN *trn __attribute__ ((unused)))
+{
+}
+
+
+/**
+   @brief Allocates buffers and stores in them some info about transactions
+
+   Does the allocation because the caller cannot know the size itself.
+   Memory freeing is to be done by the caller (if the "str" member of the
+   LEX_STRING is not NULL).
+   The caller has the intention of doing checkpoints.
+
+   @param[out]  str_act    pointer to where the allocated buffer,
+                           and its size, will be put; buffer will be filled
+                           with info about active transactions
+   @param[out]  str_com    pointer to where the allocated buffer,
+                           and its size, will be put; buffer will be filled
+                           with info about committed transactions
+   @param[out]  min_rec_lsn pointer to where the minimum rec_lsn of the
+                           active transactions will be put (LSN_MAX if no
+                           active transaction has a non-zero rec_lsn)
+   @param[out]  min_first_undo_lsn pointer to where the minimum
+                           first_undo_lsn of all transactions will be put
+
+   @note On error the two LSN output parameters are not written; one of the
+   buffers may have been allocated already.
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com,
+                                    LSN *min_rec_lsn, LSN *min_first_undo_lsn)
+{
+  my_bool error;
+  TRN *trn;
+  char *ptr;
+  uint stored_transactions= 0;
+  LSN minimum_rec_lsn= LSN_MAX, minimum_first_undo_lsn= LSN_MAX;
+  DBUG_ENTER("trnman_collect_transactions");
+
+  DBUG_ASSERT((NULL == str_act->str) && (NULL == str_com->str));
+
+  /* validate the use of read_non_atomic() in general: */
+  compile_time_assert((sizeof(LSN) == 8) && (sizeof(LSN_WITH_FLAGS) == 8));
+  pthread_mutex_lock(&LOCK_trn_list);
+  /* upper bounds: sized for every transaction currently in each list */
+  str_act->length= 2 + /* number of active transactions */
+    LSN_STORE_SIZE + /* minimum of their rec_lsn */
+    (2 + /* short id */
+     6 + /* long id */
+     LSN_STORE_SIZE + /* undo_lsn */
+#ifdef MARIA_VERSIONING /* not enabled yet */
+     LSN_STORE_SIZE + /* undo_purge_lsn */
+#endif
+     LSN_STORE_SIZE /* first_undo_lsn */
+     ) * trnman_active_transactions;
+  str_com->length= 4 + /* number of committed transactions */
+    (6 + /* long id */
+#ifdef MARIA_VERSIONING /* not enabled yet */
+     LSN_STORE_SIZE + /* undo_purge_lsn */
+#endif
+     LSN_STORE_SIZE /* first_undo_lsn */
+     ) * trnman_committed_transactions;
+  if ((NULL == (str_act->str= my_malloc(str_act->length, MYF(MY_WME)))) ||
+      (NULL == (str_com->str= my_malloc(str_com->length, MYF(MY_WME)))))
+    goto err;
+  /* First, the active transactions */
+  /* skip the header (count + minimum rec_lsn): it is filled after the loop */
+  ptr= str_act->str + 2 + LSN_STORE_SIZE;
+  for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next)
+  {
+    /*
+      trns with a short trid of 0 are not even initialized, we can ignore
+      them. trns with undo_lsn==0 have done no writes, we can ignore them
+      too. XID not needed now.
+    */
+    uint sid;
+    LSN rec_lsn, undo_lsn, first_undo_lsn;
+    if ((sid= trn->short_id) == 0)
+    {
+      /*
+        Not even inited, has done nothing. Or it is the
+        dummy_transaction_object, which does only non-transactional
+        immediate-sync operations (CREATE/DROP/RENAME/REPAIR TABLE), and so
+        can be forgotten for Checkpoint.
+      */
+      continue;
+    }
+    /* needed for low-water mark calculation */
+    if (((rec_lsn= lsn_read_non_atomic(trn->rec_lsn)) > 0) &&
+        (cmp_translog_addr(rec_lsn, minimum_rec_lsn) < 0))
+      minimum_rec_lsn= rec_lsn;
+    /*
+      trn may have logged REDOs but not yet UNDO, that's why we read rec_lsn
+      before deciding to ignore if undo_lsn==0.
+    */
+    if ((undo_lsn= trn->undo_lsn) == 0) /* trn can be forgotten */
+      continue;
+    stored_transactions++;
+    int2store(ptr, sid);
+    ptr+= 2;
+    int6store(ptr, trn->trid);
+    ptr+= 6;
+    lsn_store(ptr, undo_lsn); /* needed for rollback */
+    ptr+= LSN_STORE_SIZE;
+    /* needed for low-water mark calculation */
+    /*
+      NOTE(review): first_undo_lsn is a LSN_WITH_FLAGS; the committed loop
+      below strips the flags with LSN_WITH_FLAGS_TO_LSN() but this one does
+      not - confirm that this is intended.
+    */
+    if (((first_undo_lsn= lsn_read_non_atomic(trn->first_undo_lsn)) > 0) &&
+        (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0))
+      minimum_first_undo_lsn= first_undo_lsn;
+    lsn_store(ptr, first_undo_lsn);
+    ptr+= LSN_STORE_SIZE;
+#ifdef MARIA_VERSIONING /* not enabled yet */
+    /* to know where purging should start (last delete of this trn) */
+    lsn_store(ptr, trn->undo_purge_lsn);
+    ptr+= LSN_STORE_SIZE;
+#endif
+    /**
+      @todo RECOVERY: add a comment explaining why we can dirtily read some
+      vars, inspired by the text of "assumption 8" in WL#3072
+    */
+  }
+  str_act->length= ptr - str_act->str; /* as we maybe over-estimated */
+  ptr= str_act->str;
+  DBUG_PRINT("info",("collected %u active transactions",
+                     (uint)stored_transactions));
+  int2store(ptr, stored_transactions);
+  ptr+= 2;
+  /* this LSN influences how REDOs for any page can be ignored by Recovery */
+  lsn_store(ptr, minimum_rec_lsn);
+  /* one day there will also be a list of prepared transactions */
+  /* do the same for committed ones */
+  ptr= str_com->str;
+  int4store(ptr, trnman_committed_transactions);
+  ptr+= 4;
+  DBUG_PRINT("info",("collected %u committed transactions",
+                     (uint)trnman_committed_transactions));
+  for (trn= committed_list_min.next; trn != &committed_list_max;
+       trn= trn->next)
+  {
+    LSN first_undo_lsn;
+    int6store(ptr, trn->trid);
+    ptr+= 6;
+#ifdef MARIA_VERSIONING /* not enabled yet */
+    lsn_store(ptr, trn->undo_purge_lsn);
+    ptr+= LSN_STORE_SIZE;
+#endif
+    first_undo_lsn= LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn);
+    if (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0)
+      minimum_first_undo_lsn= first_undo_lsn;
+    lsn_store(ptr, first_undo_lsn);
+    ptr+= LSN_STORE_SIZE;
+  }
+  /*
+    TODO: if we see there exists no transaction (active and committed) we can
+    tell the lock-free structures to do some freeing (my_free()).
+  */
+  error= 0;
+  *min_rec_lsn= minimum_rec_lsn;
+  *min_first_undo_lsn= minimum_first_undo_lsn;
+  goto end;
+err:
+  error= 1;
+end:
+  pthread_mutex_unlock(&LOCK_trn_list);
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Create, during recovery, a transaction that has the given short and long
+  ids instead of freshly generated ones.
+  The TRN is obtained from trnman_new_trn(); the trid and the short id it
+  consumed there are given back, the generator is raised to 'longid' if it
+  was smaller, and the TRN is registered in slot 'shortid' (which must be
+  free). Returns NULL if trnman_new_trn() failed.
+*/
+TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid)
+{
+  TrID old_trid_generator= global_trid_generator;
+  TRN *trn;
+  DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded);
+  if (unlikely((trn= trnman_new_trn(NULL, NULL, NULL)) == NULL))
+    return NULL;
+  /* deallocate excessive allocations of trnman_new_trn() */
+  global_trid_generator= old_trid_generator;
+  set_if_bigger(global_trid_generator, longid);
+  short_trid_to_active_trn[trn->short_id]= 0;
+  DBUG_ASSERT(short_trid_to_active_trn[shortid] == NULL);
+  short_trid_to_active_trn[shortid]= trn;
+  trn->trid= longid;
+  trn->short_id= shortid;
+  return trn;
+}
+
+
+/*
+  Return the first transaction of the active list, or NULL if the list holds
+  only its sentinels. LOCK_trn_list is not taken here.
+*/
+TRN *trnman_get_any_trn()
+{
+  TRN *trn= active_list_min.next;
+  return (trn != &active_list_max) ? 
trn : NULL; +} diff --git a/storage/maria/trnman.h b/storage/maria/trnman.h new file mode 100644 index 00000000000..fce02d9ab89 --- /dev/null +++ b/storage/maria/trnman.h @@ -0,0 +1,59 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _trnman_h +#define _trnman_h + +C_MODE_START + +#include <lf.h> +#include "lockman.h" +#include "trnman_public.h" +#include "ma_loghandler_lsn.h" + +/* + trid - 6 uchar transaction identifier. Assigned when a transaction + is created. Transaction can always be identified by its trid, + even after transaction has ended. + + short_trid - 2-byte transaction identifier, identifies a running + transaction, is reassigned when transaction ends. +*/ + +/* + short transaction id is at the same time its identifier + for a lock manager - its lock owner identifier (loid) +*/ + +#define short_id locks.loid + +struct st_transaction +{ + LOCK_OWNER locks; /* must be the first! see short_trid_to_TRN() */ + LF_PINS *pins; + TrID trid, min_read_from, commit_trid; + TRN *next, *prev; + LSN rec_lsn, undo_lsn; + LSN_WITH_FLAGS first_undo_lsn; + uint locked_tables; + /* Note! 
if locks.loid is 0, trn is NOT initialized */ +}; + +#define TRANSACTION_LOGGED_LONG_ID ULL(0x8000000000000000) + +C_MODE_END + +#endif + diff --git a/storage/maria/trnman_public.h b/storage/maria/trnman_public.h new file mode 100644 index 00000000000..97b492c3a57 --- /dev/null +++ b/storage/maria/trnman_public.h @@ -0,0 +1,60 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + + +/* + External definitions for trnman.h + We need to split this into two files as gcc 4.1.2 gives error if it tries + to include my_atomic.h in C++ code. 
+*/ + +#ifndef _trnman_public_h +#define _trnman_public_h + +#include "ma_loghandler_lsn.h" + +C_MODE_START +typedef uint64 TrID; /* our TrID is 6 bytes */ +typedef struct st_transaction TRN; + +#define SHORT_TRID_MAX 65535 + +extern uint trnman_active_transactions, trnman_allocated_transactions; +extern TRN dummy_transaction_object; + +int trnman_init(TrID); +void trnman_destroy(void); +TRN *trnman_new_trn(pthread_mutex_t *, pthread_cond_t *, void *); +int trnman_end_trn(TRN *trn, my_bool commit); +#define trnman_commit_trn(T) trnman_end_trn(T, TRUE) +#define trnman_abort_trn(T) trnman_end_trn(T, FALSE) +#define trnman_rollback_trn(T) trnman_end_trn(T, FALSE) +void trnman_free_trn(TRN *trn); +int trnman_can_read_from(TRN *trn, TrID trid); +void trnman_new_statement(TRN *trn); +void trnman_rollback_statement(TRN *trn); +my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com, + LSN *min_rec_lsn, + LSN *min_first_undo_lsn); + +uint trnman_increment_locked_tables(TRN *trn); +uint trnman_decrement_locked_tables(TRN *trn); +my_bool trnman_has_locked_tables(TRN *trn); +void trnman_reset_locked_tables(TRN *trn); +TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid); +TRN *trnman_get_any_trn(); + +C_MODE_END +#endif diff --git a/storage/maria/unittest/Makefile.am b/storage/maria/unittest/Makefile.am new file mode 100644 index 00000000000..4631b436b0b --- /dev/null +++ b/storage/maria/unittest/Makefile.am @@ -0,0 +1,97 @@ +# Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +AM_CPPFLAGS = @ZLIB_INCLUDES@ -I$(top_builddir)/include \ + -I$(top_srcdir)/include -I$(top_srcdir)/unittest/mytap +INCLUDES = @ZLIB_INCLUDES@ -I$(top_builddir)/include \ + -I$(top_srcdir)/include -I$(top_srcdir)/unittest/mytap + +# Only reason to link with libmyisam.a here is that it's where some fulltext +# pieces are (but soon we'll remove fulltext dependencies from Maria). +LDADD= $(top_builddir)/unittest/mytap/libmytap.a \ + $(top_builddir)/storage/maria/libmaria.a \ + $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/mysys/libmysys.a \ + $(top_builddir)/dbug/libdbug.a \ + $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ +noinst_PROGRAMS = ma_control_file-t trnman-t lockman2-t \ + ma_pagecache_single_1k-t ma_pagecache_single_8k-t \ + ma_pagecache_single_64k-t-big \ + ma_pagecache_consist_1k-t-big \ + ma_pagecache_consist_64k-t-big \ + ma_pagecache_consist_1kHC-t-big \ + ma_pagecache_consist_64kHC-t-big \ + ma_pagecache_consist_1kRD-t-big \ + ma_pagecache_consist_64kRD-t-big \ + ma_pagecache_consist_1kWR-t-big \ + ma_pagecache_consist_64kWR-t-big \ + ma_test_loghandler-t \ + ma_test_loghandler_multigroup-t \ + ma_test_loghandler_multithread-t \ + ma_test_loghandler_pagecache-t \ + ma_test_loghandler_long-t-big \ + ma_test_loghandler_noflush-t \ + ma_test_loghandler_first_lsn-t \ + ma_test_loghandler_max_lsn-t \ + ma_test_loghandler_purge-t + +ma_test_loghandler_t_SOURCES = ma_test_loghandler-t.c ma_maria_log_cleanup.c +ma_test_loghandler_multigroup_t_SOURCES = ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c +ma_test_loghandler_multithread_t_SOURCES = ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c +ma_test_loghandler_pagecache_t_SOURCES = 
ma_test_loghandler_pagecache-t.c ma_maria_log_cleanup.c +ma_test_loghandler_long_t_big_SOURCES = ma_test_loghandler-t.c ma_maria_log_cleanup.c +ma_test_loghandler_long_t_big_CPPFLAGS = -DLONG_LOG_TEST +ma_test_loghandler_noflush_t_SOURCES = ma_test_loghandler_noflush-t.c ma_maria_log_cleanup.c +ma_test_loghandler_first_lsn_t_SOURCES = ma_test_loghandler_first_lsn-t.c ma_maria_log_cleanup.c +ma_test_loghandler_max_lsn_t_SOURCES = ma_test_loghandler_max_lsn-t.c ma_maria_log_cleanup.c +ma_test_loghandler_purge_t_SOURCES = ma_test_loghandler_purge-t.c ma_maria_log_cleanup.c + +ma_pagecache_single_src = ma_pagecache_single.c test_file.c test_file.h +ma_pagecache_consist_src = ma_pagecache_consist.c test_file.c test_file.h +ma_pagecache_common_cppflags = -DEXTRA_DEBUG -DPAGECACHE_DEBUG -DMAIN + +ma_pagecache_single_1k_t_SOURCES = $(ma_pagecache_single_src) +ma_pagecache_single_8k_t_SOURCES = $(ma_pagecache_single_src) +ma_pagecache_single_64k_t_big_SOURCES = $(ma_pagecache_single_src) +ma_pagecache_single_1k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=1024 +ma_pagecache_single_8k_t_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=8192 +ma_pagecache_single_64k_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=65536 + +ma_pagecache_consist_1k_t_big_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_1k_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=1024 +ma_pagecache_consist_64k_t_big_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_64k_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=65536 + +ma_pagecache_consist_1kHC_t_big_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_1kHC_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=1024 -DTEST_HIGH_CONCURENCY +ma_pagecache_consist_64kHC_t_big_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_64kHC_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=65536 -DTEST_HIGH_CONCURENCY + 
+ma_pagecache_consist_1kRD_t_big_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_1kRD_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=1024 -DTEST_READERS +ma_pagecache_consist_64kRD_t_big_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_64kRD_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=65536 -DTEST_READERS + +ma_pagecache_consist_1kWR_t_big_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_1kWR_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=1024 -DTEST_WRITERS +ma_pagecache_consist_64kWR_t_big_SOURCES = $(ma_pagecache_consist_src) +ma_pagecache_consist_64kWR_t_big_CPPFLAGS = $(ma_pagecache_common_cppflags) -DPAGE_SIZE=65536 -DTEST_WRITERS + +# the generic lock manager may not be used in the end and lockman1-t crashes, +# so we don't build lockman-t and lockman1-t +CLEANFILES = maria_log_control page_cache_test_file_1 \ + maria_log.???????? + diff --git a/storage/maria/unittest/lockman-t.c b/storage/maria/unittest/lockman-t.c new file mode 100644 index 00000000000..8c0f71175e7 --- /dev/null +++ b/storage/maria/unittest/lockman-t.c @@ -0,0 +1,309 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + lockman for row and table locks +*/ + +/* #define EXTRA_VERBOSE */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <my_atomic.h> +#include <lf.h> +#include "../lockman.h" + +#define Nlos 100 +LOCK_OWNER loarray[Nlos]; +pthread_mutex_t mutexes[Nlos]; +pthread_cond_t conds[Nlos]; +LOCKMAN lockman; + +#ifndef EXTRA_VERBOSE +#define print_lockhash(X) /* no-op */ +#define DIAG(X) /* no-op */ +#else +#define DIAG(X) diag X +#endif + +LOCK_OWNER *loid2lo(uint16 loid) +{ + return loarray+loid-1; +} + +#define unlock_all(O) diag("lo" #O "> release all locks"); \ + lockman_release_locks(&lockman, loid2lo(O));print_lockhash(&lockman) +#define test_lock(O, R, L, S, RES) \ + ok(lockman_getlock(&lockman, loid2lo(O), R, L) == RES, \ + "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \ + print_lockhash(&lockman) +#define lock_ok_a(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK) +#define lock_ok_i(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE) +#define lock_ok_l(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE) +#define lock_conflict(O, R, L) \ + test_lock(O, R, L, "cannot ", DIDNT_GET_THE_LOCK); + +void test_lockman_simple() +{ + /* simple */ + lock_ok_a(1, 1, S); + lock_ok_i(2, 2, IS); + lock_ok_i(1, 2, IX); + /* lock escalation */ + lock_ok_a(1, 1, X); + lock_ok_i(2, 2, IX); + /* failures */ + lock_conflict(2, 1, X); + unlock_all(2); + lock_ok_a(1, 2, S); + lock_ok_a(1, 2, IS); + lock_ok_a(1, 2, LS); + lock_ok_i(1, 3, IX); + lock_ok_a(2, 3, LS); + lock_ok_i(1, 3, IX); + lock_ok_l(2, 3, IS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_conflict(2, 1, S); + lock_ok_a(1, 1, LS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + 
lock_ok_a(2, 1, LS); + lock_ok_a(1, 1, LS); + lock_ok_i(1, 1, IX); + lock_ok_i(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_i(1, 4, IS); + lock_ok_i(2, 4, IS); + lock_ok_i(3, 4, IS); + lock_ok_a(3, 4, LS); + lock_ok_i(4, 4, IS); + lock_conflict(4, 4, IX); + lock_conflict(2, 4, IX); + lock_ok_a(1, 4, LS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + unlock_all(4); + + lock_ok_i(1, 1, IX); + lock_ok_i(2, 1, IX); + lock_conflict(1, 1, S); + lock_conflict(2, 1, X); + unlock_all(1); + unlock_all(2); +} + +int rt_num_threads; +int litmus; +int thread_number= 0, timeouts= 0; +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= my_getsystime(); + int i; + + thread_number= timeouts= 0; + litmus= 0; + + threads= (pthread_t *)my_malloc(sizeof(pthread_t)*n, MYF(0)); /* one pthread_t slot per thread; pthread_t need not be pointer-sized */ + if (!threads) + { + diag("Out of memory"); + abort(); + } + + diag("Running %s with %d threads, %d iterations... ", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) /* every thread receives the same iteration count through &m */ + if (pthread_create(threads+i, 0, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= my_getsystime()-now; + ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus); /* one TAP result per run */ + my_free((void*)threads, MYF(0)); +} + +pthread_mutex_t rt_mutex; +int Nrows= 100; +int Ntables= 10; +int table_lock_ratio= 10; +enum lock_type lock_array[6]= {S, X, LS, LX, IS, IX}; +char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"}; +char *res2str[4]= { + "DIDN'T GET THE LOCK", + "GOT THE LOCK", + "GOT THE LOCK NEED TO LOCK A SUBRESOURCE", + "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"}; +pthread_handler_t test_lockman(void *arg) +{ + int m= (*(int *)arg); + uint x, loid, row, table, res, locklevel, timeout= 0; + LOCK_OWNER *lo; + + pthread_mutex_lock(&rt_mutex); + loid= ++thread_number; /* 1-based owner id, handed out under rt_mutex */ + pthread_mutex_unlock(&rt_mutex); + lo= loid2lo(loid); + + for (x= 
((int)(intptr)(&m)); m > 0; m--) + { + x= (uint) ((x*ULL(3628273133) + ULL(1500450271)) % ULL(9576890767)); /* three prime numbers; unsigned 64-bit math, as the constants do not all fit in 32 bits */ + row= x % Nrows + Ntables; + table= row % Ntables; + locklevel= (x/Nrows) & 3; + if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0) + { /* table lock */ + res= lockman_getlock(&lockman, lo, table, lock_array[locklevel]); + DIAG(("loid %2d, table %d, lock %s, res %s", loid, table, + lock2str[locklevel], res2str[res])); + if (res == DIDNT_GET_THE_LOCK) + { + lockman_release_locks(&lockman, lo); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + } + else + { /* row lock */ + locklevel&= 1; + res= lockman_getlock(&lockman, lo, table, lock_array[locklevel + 4]); + DIAG(("loid %2d, row %d, lock %s, res %s", loid, row, + lock2str[locklevel+4], res2str[res])); + switch (res) + { + case DIDNT_GET_THE_LOCK: + lockman_release_locks(&lockman, lo); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + case GOT_THE_LOCK: + continue; + case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE: + /* not implemented, so take a regular lock */ + case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE: + res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]); + DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row, + lock2str[locklevel], res2str[res])); + if (res == DIDNT_GET_THE_LOCK) + { + lockman_release_locks(&lockman, lo); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + continue; + default: + DBUG_ASSERT(0); + } + } + } + + lockman_release_locks(&lockman, lo); + + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + timeouts+= timeout; + if (!rt_num_threads) + diag("number of timeouts: %d", timeouts); + pthread_mutex_unlock(&rt_mutex); + + return 0; +} + +int main() +{ + int i; + + my_init(); + pthread_mutex_init(&rt_mutex, 0); + + plan(35); + + if (my_atomic_initialize()) + return exit_status(); + + + 
lockman_init(&lockman, &loid2lo, 50); + + for (i= 0; i < Nlos; i++) + { + loarray[i].pins= lf_alloc_get_pins(&lockman.alloc); + loarray[i].all_locks= 0; + loarray[i].waiting_for= 0; + pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST); + pthread_cond_init (&conds[i], 0); + loarray[i].mutex= &mutexes[i]; + loarray[i].cond= &conds[i]; + loarray[i].loid= i+1; + } + + test_lockman_simple(); + +#define CYCLES 10000 +#define THREADS Nlos /* don't change this line */ + + /* mixed load, stress-test with random locks */ + Nrows= 100; + Ntables= 10; + table_lock_ratio= 10; + run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES); + + /* "real-life" simulation - many rows, no table locks */ + Nrows= 1000000; + Ntables= 10; + table_lock_ratio= 0; + run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10); + + for (i= 0; i < Nlos; i++) + { + lockman_release_locks(&lockman, &loarray[i]); + pthread_mutex_destroy(loarray[i].mutex); + pthread_cond_destroy(loarray[i].cond); + lf_pinbox_put_pins(loarray[i].pins); + } + + { + ulonglong now= my_getsystime(); + lockman_destroy(&lockman); + now= my_getsystime()-now; + diag("lockman_destroy: %g secs", ((double)now)/1e7); + } + + pthread_mutex_destroy(&rt_mutex); + my_end(0); + return exit_status(); +} + diff --git a/storage/maria/unittest/lockman1-t.c b/storage/maria/unittest/lockman1-t.c new file mode 100644 index 00000000000..41a1f0fd2f4 --- /dev/null +++ b/storage/maria/unittest/lockman1-t.c @@ -0,0 +1,335 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + lockman for row locks, tablockman for table locks +*/ + +/* #define EXTRA_VERBOSE */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <my_atomic.h> +#include <lf.h> +#include "../lockman.h" +#include "../tablockman.h" + +#define Nlos 100 +#define Ntbls 10 +LOCK_OWNER loarray[Nlos]; +TABLE_LOCK_OWNER loarray1[Nlos]; +pthread_mutex_t mutexes[Nlos]; +pthread_cond_t conds[Nlos]; +LOCKED_TABLE ltarray[Ntbls]; +LOCKMAN lockman; +TABLOCKMAN tablockman; + +#ifndef EXTRA_VERBOSE +#define print_lo1(X) /* no-op */ +#define DIAG(X) /* no-op */ +#else +#define DIAG(X) diag X +#endif + +LOCK_OWNER *loid2lo(uint16 loid) +{ + return loarray+loid-1; +} +TABLE_LOCK_OWNER *loid2lo1(uint16 loid) +{ + return loarray1+loid-1; +} + +#define unlock_all(O) diag("lo" #O "> release all locks"); \ + tablockman_release_locks(&tablockman, loid2lo1(O)); +#define test_lock(O, R, L, S, RES) \ + ok(tablockman_getlock(&tablockman, loid2lo1(O), &ltarray[R], L) == RES, \ + "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \ + print_lo1(loid2lo1(O)); +#define lock_ok_a(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK) +#define lock_ok_i(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE) +#define lock_ok_l(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE) +#define lock_conflict(O, R, L) \ + test_lock(O, R, L, "cannot ", LOCK_TIMEOUT); + +void test_tablockman_simple() +{ + /* simple */ + lock_ok_a(1, 1, S); + lock_ok_i(2, 2, IS); + lock_ok_i(1, 2, IX); + /* lock escalation */ + lock_ok_a(1, 1, X); + lock_ok_i(2, 2, IX); + /* failures */ + lock_conflict(2, 1, X); + unlock_all(2); + lock_ok_a(1, 2, S); + lock_ok_a(1, 2, IS); + lock_ok_a(1, 2, LS); 
+ lock_ok_i(1, 3, IX); + lock_ok_a(2, 3, LS); + lock_ok_i(1, 3, IX); + lock_ok_l(2, 3, IS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_conflict(2, 1, S); + lock_ok_a(1, 1, LS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_ok_a(2, 1, LS); + lock_ok_a(1, 1, LS); + lock_ok_i(1, 1, IX); + lock_ok_i(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_i(1, 4, IS); + lock_ok_i(2, 4, IS); + lock_ok_i(3, 4, IS); + lock_ok_a(3, 4, LS); + lock_ok_i(4, 4, IS); + lock_conflict(4, 4, IX); + lock_conflict(2, 4, IX); + lock_ok_a(1, 4, LS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + unlock_all(4); + + lock_ok_i(1, 1, IX); + lock_ok_i(2, 1, IX); + lock_conflict(1, 1, S); + lock_conflict(2, 1, X); + unlock_all(1); + unlock_all(2); +} + +int rt_num_threads; +int litmus; +int thread_number= 0, timeouts= 0; +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= my_getsystime(); + int i; + + thread_number= timeouts= 0; + litmus= 0; + + threads= (pthread_t *)my_malloc(sizeof(pthread_t)*n, MYF(0)); /* one pthread_t slot per thread; pthread_t need not be pointer-sized */ + if (!threads) + { + diag("Out of memory"); + abort(); + } + + diag("Running %s with %d threads, %d iterations... 
", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) /* every thread receives the same iteration count through &m */ + if (pthread_create(threads+i, 0, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= my_getsystime()-now; + ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus); /* one TAP result per run */ + my_free((void*)threads, MYF(0)); +} + +pthread_mutex_t rt_mutex; +int Nrows= 100; +int Ntables= 10; +int table_lock_ratio= 10; +enum lock_type lock_array[6]= {S, X, LS, LX, IS, IX}; +char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"}; +char *res2str[]= { + "DIDN'T GET THE LOCK", + "OUT OF MEMORY", + "DEADLOCK", + "LOCK TIMEOUT", + "GOT THE LOCK", + "GOT THE LOCK NEED TO LOCK A SUBRESOURCE", + "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"}; +pthread_handler_t test_lockman(void *arg) +{ + int m= (*(int *)arg); + uint x, loid, row, table, res, locklevel, timeout= 0; + LOCK_OWNER *lo; TABLE_LOCK_OWNER *lo1; DBUG_ASSERT(Ntables <= Ntbls); + + pthread_mutex_lock(&rt_mutex); + loid= ++thread_number; /* 1-based owner id, handed out under rt_mutex */ + pthread_mutex_unlock(&rt_mutex); + lo= loid2lo(loid); lo1= loid2lo1(loid); + + for (x= ((int)(intptr)(&m)); m > 0; m--) + { + x= (uint) ((x*ULL(3628273133) + ULL(1500450271)) % ULL(9576890767)); /* three prime numbers; unsigned 64-bit math, as the constants do not all fit in 32 bits */ + row= x % Nrows + Ntables; + table= row % Ntables; + locklevel= (x/Nrows) & 3; + if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0) + { /* table lock */ + res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel]); + DIAG(("loid %2d, table %d, lock %s, res %s", loid, table, + lock2str[locklevel], res2str[res])); + if (res < GOT_THE_LOCK) + { + lockman_release_locks(&lockman, lo); tablockman_release_locks(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + } + else + { /* row lock */ + locklevel&= 1; + res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel + 4]); + DIAG(("loid %2d, row %d, lock %s, res %s", 
loid, row, + lock2str[locklevel+4], res2str[res])); + switch (res) + { + case GOT_THE_LOCK: + continue; + case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE: + /* not implemented, so take a regular lock */ + case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE: + res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]); + DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row, + lock2str[locklevel], res2str[res])); + if (res == DIDNT_GET_THE_LOCK) + { + lockman_release_locks(&lockman, lo); + tablockman_release_locks(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + continue; + default: + lockman_release_locks(&lockman, lo); tablockman_release_locks(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + } + } + + lockman_release_locks(&lockman, lo); + tablockman_release_locks(&tablockman, lo1); + + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + timeouts+= timeout; + if (!rt_num_threads) + diag("number of timeouts: %d", timeouts); + pthread_mutex_unlock(&rt_mutex); + + return 0; +} + +int main() +{ + int i; + + my_init(); + pthread_mutex_init(&rt_mutex, 0); + + plan(35); + + if (my_atomic_initialize()) + return exit_status(); + + + lockman_init(&lockman, &loid2lo, 50); + tablockman_init(&tablockman, &loid2lo1, 50); + + for (i= 0; i < Nlos; i++) + { + pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST); + pthread_cond_init (&conds[i], 0); + + loarray[i].pins= lf_alloc_get_pins(&lockman.alloc); + loarray[i].all_locks= 0; + loarray[i].waiting_for= 0; + loarray[i].mutex= &mutexes[i]; + loarray[i].cond= &conds[i]; + loarray[i].loid= i+1; + + loarray1[i].active_locks= 0; + loarray1[i].waiting_lock= 0; + loarray1[i].waiting_for= 0; + loarray1[i].mutex= &mutexes[i]; + loarray1[i].cond= &conds[i]; + loarray1[i].loid= i+1; + } + + for (i= 0; i < Ntbls; i++) + { + tablockman_init_locked_table(ltarray+i, Nlos); + } + + test_tablockman_simple(); + 
+#define CYCLES 10000 +#define THREADS Nlos /* don't change this line */ + + /* mixed load, stress-test with random locks */ + Nrows= 100; + Ntables= 10; + table_lock_ratio= 10; + run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES); + + /* "real-life" simulation - many rows, no table locks */ + Nrows= 1000000; + Ntables= 10; + table_lock_ratio= 0; + run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10); + + for (i= 0; i < Nlos; i++) + { + lockman_release_locks(&lockman, &loarray[i]); + pthread_mutex_destroy(loarray[i].mutex); + pthread_cond_destroy(loarray[i].cond); + lf_pinbox_put_pins(loarray[i].pins); + } + + { + ulonglong now= my_getsystime(); + lockman_destroy(&lockman); + now= my_getsystime()-now; + diag("lockman_destroy: %g secs", ((double)now)/1e7); + } + + pthread_mutex_destroy(&rt_mutex); + my_end(0); + return exit_status(); +} + diff --git a/storage/maria/unittest/lockman2-t.c b/storage/maria/unittest/lockman2-t.c new file mode 100644 index 00000000000..01af1a03d22 --- /dev/null +++ b/storage/maria/unittest/lockman2-t.c @@ -0,0 +1,361 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + tablockman for row and table locks +*/ + +/* #define EXTRA_VERBOSE */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <my_atomic.h> +#include <lf.h> +#include "../tablockman.h" + +#define Nlos 100 +#define Ntbls 110 +TABLE_LOCK_OWNER loarray1[Nlos]; +pthread_mutex_t mutexes[Nlos]; +pthread_cond_t conds[Nlos]; +LOCKED_TABLE ltarray[Ntbls]; +TABLOCKMAN tablockman; + +#ifndef EXTRA_VERBOSE +#define print_lo1(X) /* no-op */ +#define DIAG(X) /* no-op */ +#else +#define DIAG(X) diag X +#endif + +TABLE_LOCK_OWNER *loid2lo1(uint16 loid) +{ + return loarray1+loid-1; +} + +#define unlock_all(O) diag("lo" #O "> release all locks"); \ + tablockman_release_locks(&tablockman, loid2lo1(O)); +#define test_lock(O, R, L, S, RES) \ + ok(tablockman_getlock(&tablockman, loid2lo1(O), &ltarray[R], L) == RES, \ + "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \ + print_lo1(loid2lo1(O)); +#define lock_ok_a(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK) +#define lock_ok_i(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE) +#define lock_ok_l(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE) +#define lock_conflict(O, R, L) \ + test_lock(O, R, L, "cannot ", LOCK_TIMEOUT); + +void test_tablockman_simple() +{ + /* simple */ + lock_ok_a(1, 1, S); + lock_ok_i(2, 2, IS); + lock_ok_i(1, 2, IX); + /* lock escalation */ + lock_ok_a(1, 1, X); + lock_ok_i(2, 2, IX); + /* failures */ + lock_conflict(2, 1, X); + unlock_all(2); + lock_ok_a(1, 2, S); + lock_ok_a(1, 2, IS); + lock_ok_a(1, 2, LS); + lock_ok_i(1, 3, IX); + lock_ok_a(2, 3, LS); + lock_ok_i(1, 3, IX); + lock_ok_l(2, 3, IS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_conflict(2, 1, S); + lock_ok_a(1, 1, LS); + 
unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_ok_a(2, 1, LS); + lock_ok_a(1, 1, LS); + lock_ok_i(1, 1, IX); + lock_ok_i(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_i(1, 4, IS); + lock_ok_i(2, 4, IS); + lock_ok_i(3, 4, IS); + lock_ok_a(3, 4, LS); + lock_ok_i(4, 4, IS); + lock_conflict(4, 4, IX); + lock_conflict(2, 4, IX); + lock_ok_a(1, 4, LS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + unlock_all(4); + + lock_ok_i(1, 1, IX); + lock_ok_i(2, 1, IX); + lock_conflict(1, 1, S); + lock_conflict(2, 1, X); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IS); + lock_conflict(2, 1, X); + lock_conflict(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_a(1, 1, S); + lock_conflict(2, 1, IX); + lock_conflict(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); +} + +int rt_num_threads; +int litmus; +int thread_number= 0, timeouts= 0; +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= my_getsystime(); + int i; + + thread_number= timeouts= 0; + litmus= 0; + + threads= (pthread_t *)my_malloc(sizeof(pthread_t)*n, MYF(0)); /* one pthread_t slot per thread; pthread_t need not be pointer-sized */ + if (!threads) + { + diag("Out of memory"); + abort(); + } + + diag("Running %s with %d threads, %d iterations... 
", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) + if (pthread_create(threads+i, 0, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= my_getsystime()-now; + ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus); + my_free((void*)threads, MYF(0)); +} + +static void reinit_tlo(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo) +{ +#ifdef NOT_USED_YET + TABLE_LOCK_OWNER backup= *lo; +#endif + + tablockman_release_locks(lm, lo); +#ifdef NOT_USED_YET + pthread_mutex_destroy(lo->mutex); + pthread_cond_destroy(lo->cond); + bzero(lo, sizeof(*lo)); + + lo->mutex= backup.mutex; + lo->cond= backup.cond; + lo->loid= backup.loid; + pthread_mutex_init(lo->mutex, MY_MUTEX_INIT_FAST); + pthread_cond_init(lo->cond, 0); +#endif +} + +pthread_mutex_t rt_mutex; +int Nrows= 100; +int Ntables= 10; +int table_lock_ratio= 10; +enum lock_type lock_array[6]= {S, X, LS, LX, IS, IX}; +const char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"}; +const char *res2str[]= { + 0, + "OUT OF MEMORY", + "DEADLOCK", + "LOCK TIMEOUT", + "GOT THE LOCK", + "GOT THE LOCK NEED TO LOCK A SUBRESOURCE", + "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"}; + +pthread_handler_t test_lockman(void *arg) +{ + int m= (*(int *)arg); + uint x, loid, row, table, res, locklevel, timeout= 0; + TABLE_LOCK_OWNER *lo1; + DBUG_ASSERT(Ntables <= Ntbls); + DBUG_ASSERT(Nrows + Ntables <= Ntbls); + + pthread_mutex_lock(&rt_mutex); + loid= ++thread_number; + pthread_mutex_unlock(&rt_mutex); + lo1= loid2lo1(loid); + + for (x= ((int)(intptr)(&m)); m > 0; m--) + { + /* three prime numbers */ + x= (uint) ((x*LL(3628273133) + LL(1500450271)) % LL(9576890767)); + row= x % Nrows + Ntables; + table= row % Ntables; + locklevel= (x/Nrows) & 3; + if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0) + { + /* table lock */ + res= tablockman_getlock(&tablockman, lo1, ltarray+table, + lock_array[locklevel]); + DIAG(("loid %2d, 
table %d, lock %s, res %s", loid, table, + lock2str[locklevel], res2str[res])); + if (res < GOT_THE_LOCK) + { + reinit_tlo(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + } + else + { /* row lock */ + locklevel&= 1; + res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel + 4]); + DIAG(("loid %2d, row %d, lock %s, res %s", loid, row, + lock2str[locklevel+4], res2str[res])); + switch (res) + { + case GOT_THE_LOCK: + continue; + case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE: + /* not implemented, so take a regular lock */ + case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE: + res= tablockman_getlock(&tablockman, lo1, ltarray+row, lock_array[locklevel]); + DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row, + lock2str[locklevel], res2str[res])); + if (res < GOT_THE_LOCK) + { + reinit_tlo(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + continue; + default: + reinit_tlo(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + } + } + + reinit_tlo(&tablockman, lo1); + + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + timeouts+= timeout; + if (!rt_num_threads) + diag("number of timeouts: %d", timeouts); + pthread_mutex_unlock(&rt_mutex); + + return 0; +} + +int main() +{ + int i; + + my_init(); + pthread_mutex_init(&rt_mutex, 0); + + plan(40); + + if (my_atomic_initialize()) + return exit_status(); + + + tablockman_init(&tablockman, &loid2lo1, 50); + + for (i= 0; i < Nlos; i++) + { + pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST); + pthread_cond_init (&conds[i], 0); + + loarray1[i].active_locks= 0; + loarray1[i].waiting_lock= 0; + loarray1[i].waiting_for= 0; + loarray1[i].mutex= &mutexes[i]; + loarray1[i].cond= &conds[i]; + loarray1[i].loid= i+1; + } + + for (i= 0; i < Ntbls; i++) + { + tablockman_init_locked_table(ltarray+i, 
Nlos); + } + + test_tablockman_simple(); + +#define CYCLES 10000 +#define THREADS Nlos /* don't change this line */ + + /* mixed load, stress-test with random locks */ + Nrows= 100; + Ntables= 10; + table_lock_ratio= 10; + run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES); +#if 0 + /* "real-life" simulation - many rows, no table locks */ + Nrows= 1000000; + Ntables= 10; + table_lock_ratio= 0; + run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10); +#endif + for (i= 0; i < Nlos; i++) + { + tablockman_release_locks(&tablockman, &loarray1[i]); + pthread_mutex_destroy(loarray1[i].mutex); + pthread_cond_destroy(loarray1[i].cond); + } + + { + ulonglong now= my_getsystime(); + for (i= 0; i < Ntbls; i++) + { + tablockman_destroy_locked_table(ltarray+i); + } + tablockman_destroy(&tablockman); + now= my_getsystime()-now; + diag("lockman_destroy: %g secs", ((double)now)/1e7); + } + + pthread_mutex_destroy(&rt_mutex); + my_end(0); + return exit_status(); +} + diff --git a/storage/maria/unittest/ma_control_file-t.c b/storage/maria/unittest/ma_control_file-t.c new file mode 100644 index 00000000000..a7472361dad --- /dev/null +++ b/storage/maria/unittest/ma_control_file-t.c @@ -0,0 +1,447 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Unit test of the control file module of the Maria engine WL#3234 */ + +/* + Note that it is not possible to test the durability of the write (can't + pull the plug programmatically :) +*/ + +#include <my_global.h> +#include <my_sys.h> +#include <tap.h> + +#ifndef WITH_MARIA_STORAGE_ENGINE +/* + If Maria is not compiled in, normally we don't come to building this test. +*/ +#error "Maria engine is not compiled in, test cannot be built" +#endif + +#include "maria.h" +#include "../../../storage/maria/maria_def.h" +#include <my_getopt.h> + +char file_name[FN_REFLEN]; + +/* The values we'll set and expect the control file module to return */ +LSN expect_checkpoint_lsn; +uint32 expect_logno; + +static int delete_file(myf my_flags); +/* + Those are test-specific wrappers around the module's API functions: after + calling the module's API functions they perform checks on the result. +*/ +static int close_file(); /* wraps ma_control_file_end */ +static int create_or_open_file(); /* wraps ma_control_file_open_or_create */ +static int write_file(); /* wraps ma_control_file_write_and_force */ + +/* Tests */ +static int test_one_log(); +static int test_five_logs(); +static int test_3_checkpoints_and_2_logs(); +static int test_binary_content(); +static int test_start_stop(); +static int test_2_open_and_2_close(); +static int test_bad_magic_string(); +static int test_bad_checksum(); +static int test_bad_size(); + +/* Utility */ +static int verify_module_values_match_expected(); +static int verify_module_values_are_impossible(); +static void usage(); +static void get_options(int argc, char *argv[]); + +/* + If "expr" is FALSE, this macro will make the function print a diagnostic + message and immediately return 1. 
+ This is inspired from assert() but does not crash the binary (sometimes we + may want to see how other tests go even if one fails). + RET_ERR means "return error". +*/ + +#define RET_ERR_UNLESS(expr) \ + {if (!(expr)) {diag("line %d: failure: '%s'", __LINE__, #expr); return 1;}} + + +int main(int argc,char *argv[]) +{ + MY_INIT(argv[0]); + maria_data_root= "."; + + plan(9); + + diag("Unit tests for control file"); + + get_options(argc,argv); + + diag("Deleting control file at startup, if there is an old one"); + RET_ERR_UNLESS(0 == delete_file(0)); /* if fails, can't continue */ + + diag("Tests of normal conditions"); + ok(0 == test_one_log(), "test of creating one log"); + ok(0 == test_five_logs(), "test of creating five logs"); + ok(0 == test_3_checkpoints_and_2_logs(), + "test of creating three checkpoints and two logs"); + ok(0 == test_binary_content(), "test of the binary content of the file"); + ok(0 == test_start_stop(), "test of multiple starts and stops"); + diag("Tests of abnormal conditions"); + ok(0 == test_2_open_and_2_close(), + "test of two open and two close (strange call sequence)"); + ok(0 == test_bad_magic_string(), "test of bad magic string"); + ok(0 == test_bad_checksum(), "test of bad checksum"); + ok(0 == test_bad_size(), "test of too small/big file"); + + return exit_status(); +} + + +static int delete_file(myf my_flags) +{ + RET_ERR_UNLESS(fn_format(file_name, CONTROL_FILE_BASE_NAME, + maria_data_root, "", MYF(MY_WME)) != NullS); + /* + Maybe file does not exist, ignore error. + The error will however be printed on stderr. + */ + my_delete(file_name, my_flags); + expect_checkpoint_lsn= LSN_IMPOSSIBLE; + expect_logno= FILENO_IMPOSSIBLE; + + return 0; +} + +/* + Verifies that global values last_checkpoint_lsn and last_logno (belonging + to the module) match what we expect. 
+*/ +static int verify_module_values_match_expected() +{ + RET_ERR_UNLESS(last_logno == expect_logno); + RET_ERR_UNLESS(last_checkpoint_lsn == + expect_checkpoint_lsn); + return 0; +} + + +/* + Verifies that global values last_checkpoint_lsn and last_logno (belonging + to the module) are impossible (this is used when the file has been closed). +*/ +static int verify_module_values_are_impossible() +{ + RET_ERR_UNLESS(last_logno == FILENO_IMPOSSIBLE); + RET_ERR_UNLESS(last_checkpoint_lsn == + LSN_IMPOSSIBLE); + return 0; +} + + +static int close_file() +{ + /* Simulate shutdown */ + ma_control_file_end(); + /* Verify amnesia */ + RET_ERR_UNLESS(verify_module_values_are_impossible() == 0); + return 0; +} + +static int create_or_open_file() +{ + RET_ERR_UNLESS(ma_control_file_create_or_open(TRUE) == CONTROL_FILE_OK); + /* Check that the module reports expected information */ + RET_ERR_UNLESS(verify_module_values_match_expected() == 0); + return 0; +} + +static int write_file(const LSN checkpoint_lsn, + uint32 logno, + uint objs_to_write) +{ + RET_ERR_UNLESS(ma_control_file_write_and_force(checkpoint_lsn, logno, + objs_to_write) == 0); + /* Check that the module reports expected information */ + RET_ERR_UNLESS(verify_module_values_match_expected() == 0); + return 0; +} + +static int test_one_log() +{ + uint objs_to_write; + + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + objs_to_write= CONTROL_FILE_UPDATE_ONLY_LOGNO; + expect_logno= 123; + RET_ERR_UNLESS(write_file(LSN_IMPOSSIBLE, + expect_logno, + objs_to_write) == 0); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_five_logs() +{ + uint objs_to_write; + uint i; + + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + objs_to_write= CONTROL_FILE_UPDATE_ONLY_LOGNO; + expect_logno= 100; + for (i= 0; i<5; i++) + { + expect_logno*= 3; + RET_ERR_UNLESS(write_file(LSN_IMPOSSIBLE, expect_logno, + objs_to_write) == 0); + } + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static 
int test_3_checkpoints_and_2_logs() +{ + uint objs_to_write; + /* + Simulate one checkpoint, one log creation, two checkpoints, one + log creation. + */ + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + objs_to_write= CONTROL_FILE_UPDATE_ONLY_LSN; + expect_checkpoint_lsn= MAKE_LSN(5, 10000); + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, + expect_logno, objs_to_write) == 0); + + objs_to_write= CONTROL_FILE_UPDATE_ONLY_LOGNO; + expect_logno= 17; + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, + expect_logno, objs_to_write) == 0); + + objs_to_write= CONTROL_FILE_UPDATE_ONLY_LSN; + expect_checkpoint_lsn= MAKE_LSN(17, 20000); + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, + expect_logno, objs_to_write) == 0); + + objs_to_write= CONTROL_FILE_UPDATE_ONLY_LSN; + expect_checkpoint_lsn= MAKE_LSN(17, 45000); + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, + expect_logno, objs_to_write) == 0); + + objs_to_write= CONTROL_FILE_UPDATE_ONLY_LOGNO; + expect_logno= 19; + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, + expect_logno, objs_to_write) == 0); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_binary_content() +{ + uint i; + int fd; + + /* + TEST4: actually check by ourselves the content of the file. + Note that constants (offsets) are hard-coded here, precisely to prevent + someone from changing them in the control file module and breaking + backward-compatibility. + TODO: when we reach the format-freeze state, we may even just do a + comparison with a raw binary string, to not depend on any uint4korr + future change/breakage. 
+ */ + + char buffer[23]; + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_read(fd, buffer, 23, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + i= uint3korr(buffer+12); + RET_ERR_UNLESS(i == LSN_FILE_NO(last_checkpoint_lsn)); + i= uint4korr(buffer+15); + RET_ERR_UNLESS(i == LSN_OFFSET(last_checkpoint_lsn)); + i= uint4korr(buffer+19); + RET_ERR_UNLESS(i == last_logno); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_start_stop() +{ + /* TEST5: Simulate start/nothing/stop/start/nothing/stop/start */ + + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_2_open_and_2_close() +{ + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + + +static int test_bad_magic_string() +{ + char buffer[4]; + int fd; + + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + /* Corrupt magic string */ + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_pread(fd, buffer, 4, 0, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_pwrite(fd, "papa", 4, 0, MYF(MY_FNABP | MY_WME)) == 0); + + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(ma_control_file_create_or_open(TRUE) == + CONTROL_FILE_BAD_MAGIC_STRING); + /* Restore magic string */ + RET_ERR_UNLESS(my_pwrite(fd, buffer, 4, 0, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + 
RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_bad_checksum() /* corrupts one checksum byte, expects CONTROL_FILE_BAD_CHECKSUM, then restores the byte; returns 0 on success, 1 on the first failed check */ +{ + char buffer[4]; + int fd; + + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + /* Corrupt checksum */ + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_pread(fd, buffer, 1, 8, MYF(MY_FNABP | MY_WME)) == 0); + buffer[0]+= 3; /* mangle checksum */ + RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 8, MYF(MY_FNABP | MY_WME)) == 0); + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(ma_control_file_create_or_open(TRUE) == + CONTROL_FILE_BAD_CHECKSUM); + /* Restore checksum: write the original byte back at the same offset (8) it was read from */ + buffer[0]-= 3; + RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 8, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + + return 0; +} + + +static int test_bad_size() +{ + char buffer[]="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + int fd; + + /* A too short file */ + RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0); + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR | O_CREAT, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_write(fd, buffer, 10, MYF(MY_FNABP | MY_WME)) == 0); + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(ma_control_file_create_or_open(TRUE) == + CONTROL_FILE_TOO_SMALL); + RET_ERR_UNLESS(my_write(fd, buffer, 30, MYF(MY_FNABP | MY_WME)) == 0); + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(ma_control_file_create_or_open(TRUE) == CONTROL_FILE_TOO_BIG); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + + /* Leave a correct control file */ + RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0); + RET_ERR_UNLESS(create_or_open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + return 0; +} + + +static struct my_option my_long_options[] = +{ +#ifndef DBUG_OFF + {"debug", '#', "Debug log.", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 
0, 0}, +#endif + {"help", '?', "Display help and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version number and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +static void version() +{ + printf("ma_control_file_test: unit test for the control file " + "module of the Maria storage engine. Ver 1.0 \n"); +} + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument) +{ + switch(optid) { + case 'V': + version(); + exit(0); + case '#': + DBUG_PUSH (argument); + break; + case '?': + version(); + usage(); + exit(0); + } + return 0; +} + + +/* Read options */ + +static void get_options(int argc, char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, + get_one_option))) + exit(ho_error); + + return; +} /* get options */ + + +static void usage() +{ + printf("Usage: %s [options]\n\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} diff --git a/storage/maria/unittest/ma_maria_log_cleanup.c b/storage/maria/unittest/ma_maria_log_cleanup.c new file mode 100644 index 00000000000..c5917764b9b --- /dev/null +++ b/storage/maria/unittest/ma_maria_log_cleanup.c @@ -0,0 +1,45 @@ +#include "../maria_def.h" +#include <my_dir.h> + +my_bool maria_log_remove() +{ + MY_DIR *dirp; + uint i; + MY_STAT stat_buff; + char file_name[FN_REFLEN]; + + /* Removes control file */ + if (fn_format(file_name, CONTROL_FILE_BASE_NAME, + maria_data_root, "", MYF(MY_WME)) == NullS) + return 1; + if (my_stat(file_name, &stat_buff, MYF(0)) && + my_delete(file_name, MYF(MY_WME)) != 0) + return 1; + + /* Finds and removes transaction log files */ + if (!(dirp = my_dir(maria_data_root, MYF(MY_DONT_SORT)))) + return 1; + + for (i= 0; i < dirp->number_off_files; i++) + { + char *file= dirp->dir_entry[i].name; + if (strncmp(file, "maria_log.", 10) == 0 && + 
file[10] >= '0' && file[10] <= '9' && + file[11] >= '0' && file[11] <= '9' && + file[12] >= '0' && file[12] <= '9' && + file[13] >= '0' && file[13] <= '9' && + file[14] >= '0' && file[14] <= '9' && + file[15] >= '0' && file[15] <= '9' && + file[16] >= '0' && file[16] <= '9' && + file[17] >= '0' && file[17] <= '9' && + file[18] == '\0') + { + if (fn_format(file_name, file, + maria_data_root, "", MYF(MY_WME)) == NullS || + my_delete(file_name, MYF(MY_WME)) != 0) + return 1; + } + } + return 0; +} + diff --git a/storage/maria/unittest/ma_pagecache_consist.c b/storage/maria/unittest/ma_pagecache_consist.c new file mode 100644 index 00000000000..54491a09c3b --- /dev/null +++ b/storage/maria/unittest/ma_pagecache_consist.c @@ -0,0 +1,459 @@ +/* + TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in + my_atomic-t.c (see BUG#22320). + Use diag() instead of fprintf(stderr). Use ok() and plan(). +*/ + +#include <tap.h> +#include <my_sys.h> +#include <m_string.h> +#include "test_file.h" +#include <tap.h> + +#define PCACHE_SIZE (PAGE_SIZE*1024*8) + +#ifndef DBUG_OFF +static const char* default_dbug_option; +#endif + +static char *file1_name= (char*)"page_cache_test_file_1"; +static PAGECACHE_FILE file1; +static pthread_cond_t COND_thread_count; +static pthread_mutex_t LOCK_thread_count; +static uint thread_count; +static PAGECACHE pagecache; + +#ifdef TEST_HIGH_CONCURENCY +static uint number_of_readers= 10; +static uint number_of_writers= 20; +static uint number_of_tests= 30000; +static uint record_length_limit= PAGE_SIZE/200; +static uint number_of_pages= 20; +static uint flush_divider= 1000; +#else /*TEST_HIGH_CONCURENCY*/ +#ifdef TEST_READERS +static uint number_of_readers= 10; +static uint number_of_writers= 1; +static uint number_of_tests= 30000; +static uint record_length_limit= PAGE_SIZE/200; +static uint number_of_pages= 20; +static uint flush_divider= 1000; +#else /*TEST_READERS*/ +#ifdef TEST_WRITERS +static uint number_of_readers= 0; +static 
uint number_of_writers= 10; +static uint number_of_tests= 30000; +static uint record_length_limit= PAGE_SIZE/200; +static uint number_of_pages= 20; +static uint flush_divider= 1000; +#else /*TEST_WRITERS*/ +static uint number_of_readers= 10; +static uint number_of_writers= 10; +static uint number_of_tests= 50000; +static uint record_length_limit= PAGE_SIZE/200; +static uint number_of_pages= 20000; +static uint flush_divider= 1000; +#endif /*TEST_WRITERS*/ +#endif /*TEST_READERS*/ +#endif /*TEST_HIGH_CONCURENCY*/ + + +/* + Get pseudo-random length of the field in (0;limit) + + SYNOPSIS + get_len() + limit limit for generated value + + RETURN + length where length > 0 and length < limit (0 is rejected by the retry loop) +*/ + +static uint get_len(uint limit) +{ + uint32 rec_len; + do + { + rec_len= random() / + (RAND_MAX / limit); + } while (rec_len >= limit || rec_len == 0); + return rec_len; +} + + +/* check page consistency: every record must hold bytes equal to its 1-based index and the rest of the page must be zero; returns the offset just past the last record, exits the process on a mismatch */ +uint check_page(uchar *buff, ulong offset, int page_locked, int page_no, + int tag) +{ + uint end= sizeof(uint); + uint num= *((uint *)buff); + uint i; + DBUG_ENTER("check_page"); + + for (i= 0; i < num; i++) + { + uint len= *((uint *)(buff + end)); + uint j; + end+= sizeof(uint) + sizeof(uint); + if (len + end > PAGE_SIZE) + { + diag("incorrect field header #%u by offset %lu\n", i, offset + end); + goto err; + } + for(j= 0; j < len; j++) + { + if (buff[end + j] != (uchar)((i+1) % 256)) + { + diag("incorrect %lu byte\n", offset + end + j); + goto err; + } + } + end+= len; + } + for(i= end; i < PAGE_SIZE; i++) + { + if (buff[i] != 0) + { + int h; + DBUG_PRINT("err", + ("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n", + offset + i, offset, i, page_no, + (page_locked ? "locked" : "unlocked"), + end, num, tag)); + diag("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n", + offset + i, offset, i, page_no, + (page_locked ? 
"locked" : "unlocked"), + end, num, tag); + h= my_open("wrong_page", O_CREAT | O_TRUNC | O_RDWR, MYF(0)); + my_pwrite(h, (uchar*) buff, PAGE_SIZE, 0, MYF(0)); + my_close(h, MYF(0)); + goto err; + } + } + DBUG_RETURN(end); +err: + DBUG_PRINT("err", ("try to flush")); + if (page_locked) + { + pagecache_delete(&pagecache, &file1, page_no, + PAGECACHE_LOCK_LEFT_WRITELOCKED, 1); + } + else + { + flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE); + } + exit(1); +} + +void put_rec(uchar *buff, uint end, uint len, uint tag) /* appends one record at offset end: len, tag, then len bytes holding the new record count; bumps the count stored at the start of buff; len 0 is treated as 1; does nothing if the record would not fit in PAGE_SIZE */ +{ + uint i; + uint num= *((uint *)buff); + if (!len) + len= 1; + if (end + sizeof(uint)*2 + len > PAGE_SIZE) + return; + *((uint *)(buff + end))= len; + end+= sizeof(uint); + *((uint *)(buff + end))= tag; + end+= sizeof(uint); + num++; + *((uint *)buff)= num; + /* payload starts here; no uint store: only len bytes are guaranteed to fit, the loop below fills them */ + for (i= end; i < (len + end); i++) + { + buff[i]= (uchar) num % 256; + } +} + +/* + Recreate and reopen a file for test + + SYNOPSIS + reset_file() + file File to reset (passed by value; the reopened descriptor is also stored in file1) + file_name Path (and name) of file which should be reset +*/ + +void reset_file(PAGECACHE_FILE file, char *file_name) +{ + flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE); + if (my_close(file1.file, MYF(0)) != 0) + { + diag("Got error during %s closing from close() (errno: %d)\n", + file_name, errno); + exit(1); + } + my_delete(file_name, MYF(0)); + if ((file1.file= file.file= my_open(file_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag("Got error during %s creation from open() (errno: %d)\n", + file_name, errno); + exit(1); + } +} + + +void reader(int num) +{ + unsigned char *buffr= malloc(PAGE_SIZE); + uint i; + + for (i= 0; i < number_of_tests; i++) + { + uint page= get_len(number_of_pages); + pagecache_read(&pagecache, &file1, page, 3, (char*)buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + check_page(buffr, page * PAGE_SIZE, 0, page, -num); + if (i % 500 == 0) + printf("reader%d: %d\n", num, i); + + } + printf("reader%d: done\n", num); + free(buffr); 
+} + + +void writer(int num) +{ + unsigned char *buffr= malloc(PAGE_SIZE); + uint i; + + for (i= 0; i < number_of_tests; i++) + { + uint end; + uint page= get_len(number_of_pages); + pagecache_read(&pagecache, &file1, page, 3, (char*)buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + 0); + end= check_page(buffr, page * PAGE_SIZE, 1, page, num); + put_rec(buffr, end, get_len(record_length_limit), num); + pagecache_write(&pagecache, &file1, page, 3, (char*)buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, + PAGECACHE_WRITE_DELAY, + 0); + + if (i % flush_divider == 0) + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + if (i % 500 == 0) + printf("writer%d: %d\n", num, i); + } + printf("writer%d: done\n", num); + free(buffr); +} + + +static void *test_thread_reader(void *arg) +{ + int param=*((int*) arg); + + my_thread_init(); + DBUG_ENTER("test_reader"); + DBUG_PRINT("enter", ("param: %d", param)); + + reader(param); + + DBUG_PRINT("info", ("Thread %s ended\n", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + thread_count--; + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + DBUG_RETURN(0); +} + +static void *test_thread_writer(void *arg) +{ + int param=*((int*) arg); + + my_thread_init(); + DBUG_ENTER("test_writer"); + DBUG_PRINT("enter", ("param: %d", param)); + + writer(param); + + DBUG_PRINT("info", ("Thread %s ended\n", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + thread_count--; + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + DBUG_RETURN(0); +} + +int main(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error, pagen; + + MY_INIT(argv[0]); + +#ifndef DBUG_OFF +#if 
defined(__WIN__) + default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/test_pagecache_consist.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + + DBUG_ENTER("main"); + DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name())); + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("file1: %d", file1.file)); + if (chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO) != 0) + { + fprintf(stderr, "Got error during file1 chmod() (errno: %d)\n", + errno); + exit(1); + } + my_pwrite(file1.file, "test file", 9, 0, MYF(0)); + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { + fprintf(stderr, "COND_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + fprintf(stderr, "LOCK_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + + if ((error= pthread_attr_init(&thr_attr))) + { + fprintf(stderr,"Got error: %d from pthread_attr_init (errno: %d)\n", + error,errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + fprintf(stderr, + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error,errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + VOID(thr_setconcurrency(2)); +#endif + + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PAGE_SIZE)) == 0) + { + fprintf(stderr,"Got error: init_pagecache() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("Page cache %d pages", pagen)); + { + unsigned char *buffr= malloc(PAGE_SIZE); + uint i; + memset(buffr, '\0', PAGE_SIZE); + for (i= 0; i < number_of_pages; i++) + { + pagecache_write(&pagecache, 
&file1, i, 3, (char*)buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0); + } + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + free(buffr); + } + if ((error= pthread_mutex_lock(&LOCK_thread_count))) + { + fprintf(stderr,"LOCK_thread_count: %d from pthread_mutex_lock (errno: %d)\n", + error,errno); + exit(1); + } + while (number_of_readers != 0 || number_of_writers != 0) + { + if (number_of_readers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_readers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_reader, + (void*) param))) + { + fprintf(stderr,"Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_readers--; + } + if (number_of_writers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_writers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_writer, + (void*) param))) + { + fprintf(stderr,"Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_writers--; + } + } + DBUG_PRINT("info", ("Thread started")); + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + /* wait finishing */ + if ((error= pthread_mutex_lock(&LOCK_thread_count))) + fprintf(stderr,"LOCK_thread_count: %d from pthread_mutex_lock\n",error); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count))) + fprintf(stderr,"COND_thread_count: %d from pthread_cond_wait\n",error); + } + if ((error= pthread_mutex_unlock(&LOCK_thread_count))) + fprintf(stderr,"LOCK_thread_count: %d from pthread_mutex_unlock\n",error); + DBUG_PRINT("info", ("thread ended")); + + end_pagecache(&pagecache, 1); + DBUG_PRINT("info", ("Page cache ended")); + + if (my_close(file1.file, MYF(0)) != 0) + { + fprintf(stderr, "Got error during file1 closing from close() (errno: %d)\n", + errno); + exit(1); + } + 
/*my_delete(file1_name, MYF(0));*/ + my_end(0); + + DBUG_PRINT("info", ("file1 (%d) closed", file1.file)); + + DBUG_PRINT("info", ("Program end")); + + DBUG_RETURN(exit_status()); +} diff --git a/storage/maria/unittest/ma_pagecache_single.c b/storage/maria/unittest/ma_pagecache_single.c new file mode 100644 index 00000000000..8add95e8a36 --- /dev/null +++ b/storage/maria/unittest/ma_pagecache_single.c @@ -0,0 +1,588 @@ +/* + TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in + my_atomic-t.c (see BUG#22320). + Use diag() instead of fprintf(stderr). +*/ +#include <tap.h> +#include <my_sys.h> +#include <m_string.h> +#include "test_file.h" +#include <tap.h> + +#define PCACHE_SIZE (PAGE_SIZE*1024*10) + +#ifndef DBUG_OFF +static const char* default_dbug_option; +#endif + +static char *file1_name= (char*)"page_cache_test_file_1"; +static PAGECACHE_FILE file1; +static pthread_cond_t COND_thread_count; +static pthread_mutex_t LOCK_thread_count; +static uint thread_count; +static PAGECACHE pagecache; + +/* + File content descriptors +*/ +static struct file_desc simple_read_write_test_file[]= +{ + {PAGE_SIZE, '\1'}, + { 0, 0} +}; +static struct file_desc simple_read_change_write_read_test_file[]= +{ + {PAGE_SIZE/2, '\65'}, + {PAGE_SIZE/2, '\1'}, + { 0, 0} +}; +static struct file_desc simple_pin_test_file1[]= +{ + {PAGE_SIZE*2, '\1'}, + { 0, 0} +}; +static struct file_desc simple_pin_test_file2[]= +{ + {PAGE_SIZE/2, '\1'}, + {PAGE_SIZE/2, (unsigned char)129}, + {PAGE_SIZE, '\1'}, + { 0, 0} +}; +static struct file_desc simple_delete_forget_test_file[]= +{ + {PAGE_SIZE, '\1'}, + { 0, 0} +}; +static struct file_desc simple_delete_flush_test_file[]= +{ + {PAGE_SIZE, '\2'}, + { 0, 0} +}; + + +/* + Recreate and reopen a file for test + + SYNOPSIS + reset_file() + file File to reset (passed by value; the reopened descriptor is also stored in file1) + file_name Path (and name) of file which should be reset +*/ + +void reset_file(PAGECACHE_FILE file, char *file_name) +{ + flush_pagecache_blocks(&pagecache, &file1, 
FLUSH_RELEASE); + if (my_close(file1.file, MYF(0)) != 0) + { + diag("Got error during %s closing from close() (errno: %d)\n", + file_name, errno); + exit(1); + } + my_delete(file_name, MYF(0)); + if ((file1.file= file.file= my_open(file_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag("Got error during %s creation from open() (errno: %d)\n", + file_name, errno); + exit(1); + } +} + +/* + Write then read page, check file on disk +*/ + +int simple_read_write_test() +{ + unsigned char *buffw= malloc(PAGE_SIZE); + unsigned char *buffr= malloc(PAGE_SIZE); + int res; + DBUG_ENTER("simple_read_write_test"); + bfill(buffw, PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0); + pagecache_read(&pagecache, &file1, 0, 3, (char*)buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + ok((res= test(memcmp(buffr, buffw, PAGE_SIZE) == 0)), + "Simple write-read page "); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + ok((res&= test(test_file(file1, file1_name, PAGE_SIZE, PAGE_SIZE, + simple_read_write_test_file))), + "Simple write-read page file"); + if (res) + reset_file(file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res); +} + + +/* + Prepare page, then read (and lock), change (write new value and unlock), + then check the page in the cache and on the disk +*/ +int simple_read_change_write_read_test() +{ + unsigned char *buffw= malloc(PAGE_SIZE); + unsigned char *buffr= malloc(PAGE_SIZE); + int res; + DBUG_ENTER("simple_read_change_write_read_test"); + /* prepare the file */ + bfill(buffw, PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + /* test */ + pagecache_read(&pagecache, 
&file1, 0, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + 0); + bfill(buffw, PAGE_SIZE/2, '\65'); /* change the first half of the write-locked page */ + pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, + PAGECACHE_WRITE_DELAY, + 0); + + pagecache_read(&pagecache, &file1, 0, 3, (char*)buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + ok((res= test(memcmp(buffr, buffw, PAGE_SIZE) == 0)), + "Simple read-change-write-read page "); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + ok((res&= test(test_file(file1, file1_name, PAGE_SIZE, PAGE_SIZE, + simple_read_change_write_read_test_file))), + "Simple read-change-write-read page file"); + if (res) + reset_file(file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res); +} + + +/* + Prepare page, read page 0 (and pin) then write page 1 and page 0. + Flush the file (should flush only page 1 and return 1, because page 0 is + still pinned). + Check file on the disk. + Unpin and flush. + Check file on the disk. 
+*/ +int simple_pin_test() +{ + unsigned char *buffw= malloc(PAGE_SIZE); + unsigned char *buffr= malloc(PAGE_SIZE); + int res; + DBUG_ENTER("simple_pin_test"); + /* prepare the file */ + bfill(buffw, PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0); + /* test */ + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("error in flush_pagecache_blocks\n"); + exit(1); + } + pagecache_read(&pagecache, &file1, 0, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + 0); + pagecache_write(&pagecache, &file1, 1, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0); + bfill(buffw + PAGE_SIZE/2, PAGE_SIZE/2, ((unsigned char) 129)); + pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_TO_READ, + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_WRITE_DELAY, + 0); + /* + We have to get error because one page of the file is pinned, + other page should be flushed + */ + if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Did not get error in flush_pagecache_blocks\n"); + res= 0; + goto err; + } + ok((res= test(test_file(file1, file1_name, PAGE_SIZE*2, PAGE_SIZE*2, + simple_pin_test_file1))), + "Simple pin page file with pin"); + pagecache_unlock(&pagecache, + &file1, + 0, + PAGECACHE_LOCK_READ_UNLOCK, + PAGECACHE_UNPIN, + 0, 0); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error in flush_pagecache_blocks\n"); + res= 0; + goto err; + } + ok((res&= test(test_file(file1, file1_name, PAGE_SIZE*2, PAGE_SIZE, + simple_pin_test_file2))), + "Simple pin page result file"); + if (res) + reset_file(file1, file1_name); +err: + free(buffw); + free(buffr); + DBUG_RETURN(res); +} + +/* + Prepare page, write new value, then 
delete page from cache without flush, + on the disk should be page with old content written during preparation +*/ + +int simple_delete_forget_test() +{ + unsigned char *buffw= malloc(PAGE_SIZE); + unsigned char *buffr= malloc(PAGE_SIZE); + int res; + DBUG_ENTER("simple_delete_forget_test"); + /* prepare the file */ + bfill(buffw, PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + /* test */ + bfill(buffw, PAGE_SIZE, '\2'); + pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0); + pagecache_delete(&pagecache, &file1, 0, + PAGECACHE_LOCK_WRITE, 0); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + ok((res= test(test_file(file1, file1_name, PAGE_SIZE, PAGE_SIZE, + simple_delete_forget_test_file))), + "Simple delete-forget page file"); + if (res) + reset_file(file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res); +} + +/* + Prepare page with locking, write new content to the page, + delete page with flush and on existing lock, + check that page on disk contain new value. 
+*/ + +int simple_delete_flush_test() +{ + unsigned char *buffw= malloc(PAGE_SIZE); + unsigned char *buffr= malloc(PAGE_SIZE); + int res; + DBUG_ENTER("simple_delete_flush_test"); + /* prepare the file */ + bfill(buffw, PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, + 0); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + /* test */ + bfill(buffw, PAGE_SIZE, '\2'); + pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_WRITE_DELAY, + 0); + pagecache_delete(&pagecache, &file1, 0, + PAGECACHE_LOCK_LEFT_WRITELOCKED, 1); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + ok((res= test(test_file(file1, file1_name, PAGE_SIZE, PAGE_SIZE, + simple_delete_flush_test_file))), + "Simple delete-forget page file"); + if (res) + reset_file(file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res); +} + + +/* + write then read file bigger then cache +*/ + +int simple_big_test() +{ + unsigned char *buffw= (unsigned char *)malloc(PAGE_SIZE); + unsigned char *buffr= (unsigned char *)malloc(PAGE_SIZE); + struct file_desc *desc= + (struct file_desc *)malloc((PCACHE_SIZE/(PAGE_SIZE/2) + 1) * + sizeof(struct file_desc)); + int res, i; + DBUG_ENTER("simple_big_test"); + /* prepare the file twice larger then cache */ + for (i= 0; i < PCACHE_SIZE/(PAGE_SIZE/2); i++) + { + bfill(buffw, PAGE_SIZE, (unsigned char) (i & 0xff)); + desc[i].length= PAGE_SIZE; + desc[i].content= (i & 0xff); + pagecache_write(&pagecache, &file1, i, 3, (char*)buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0); + } + desc[i].length= 0; + desc[i].content= '\0'; + ok(1, "Simple big file write"); + /* check written pages sequentally read */ + for (i= 0; i < PCACHE_SIZE/(PAGE_SIZE/2); i++) 
+ { + int j; + pagecache_read(&pagecache, &file1, i, 3, (char*)buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + for(j= 0; j < PAGE_SIZE; j++) + { + if (buffr[j] != (i & 0xff)) + { + diag("simple_big_test seq: page %u byte %u mismatch\n", i, j); + return 0; + } + } + } + ok(1, "Simple big file sequential read"); + /* chack random reads */ + for (i= 0; i < PCACHE_SIZE/(PAGE_SIZE); i++) + { + int j, page; + page= rand() % (PCACHE_SIZE/(PAGE_SIZE/2)); + pagecache_read(&pagecache, &file1, page, 3, (char*)buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + for(j= 0; j < PAGE_SIZE; j++) + { + if (buffr[j] != (page & 0xff)) + { + diag("simple_big_test rnd: page %u byte %u mismatch\n", page, j); + return 0; + } + } + } + ok(1, "Simple big file random read"); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + + ok((res= test(test_file(file1, file1_name, PCACHE_SIZE*2, PAGE_SIZE, + desc))), + "Simple big file"); + if (res) + reset_file(file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res); +} +/* + Thread function +*/ + +static void *test_thread(void *arg) +{ +#ifndef DBUG_OFF + int param= *((int*) arg); +#endif + + my_thread_init(); + DBUG_ENTER("test_thread"); + + DBUG_PRINT("enter", ("param: %d", param)); + + if (!simple_read_write_test() || + !simple_read_change_write_read_test() || + !simple_pin_test() || + !simple_delete_forget_test() || + !simple_delete_flush_test()) + exit(1); + + SKIP_BIG_TESTS(4) + { + if (!simple_big_test()) + exit(1); + } + + DBUG_PRINT("info", ("Thread %s ended\n", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + thread_count--; + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + DBUG_RETURN(0); +} + + +int main(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error, 
pagen; + + MY_INIT(argv[0]); + +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\test_pagecache_single.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/test_pagecache_single.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + + DBUG_ENTER("main"); + DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name())); + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("file1: %d", file1.file)); + if (chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO) != 0) + { + fprintf(stderr, "Got error during file1 chmod() (errno: %d)\n", + errno); + exit(1); + } + my_pwrite(file1.file, "test file", 9, 0, MYF(0)); + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { + fprintf(stderr, "Got error: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + fprintf(stderr, "Got error: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + + if ((error= pthread_attr_init(&thr_attr))) + { + fprintf(stderr,"Got error: %d from pthread_attr_init (errno: %d)\n", + error,errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + fprintf(stderr, + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error,errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + VOID(thr_setconcurrency(2)); +#endif + + plan(12); + + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PAGE_SIZE)) == 0) + { + fprintf(stderr,"Got error: init_pagecache() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("Page cache %d pages", pagen)); + + if ((error=pthread_mutex_lock(&LOCK_thread_count))) + { + fprintf(stderr,"Got error: %d from pthread_mutex_lock 
(errno: %d)\n", + error,errno); + exit(1); + } + param=(int*) malloc(sizeof(int)); + *param= 1; + if ((error= pthread_create(&tid, &thr_attr, test_thread, (void*) param))) + { + fprintf(stderr,"Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + DBUG_PRINT("info", ("Thread started")); + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + if ((error= pthread_mutex_lock(&LOCK_thread_count))) + fprintf(stderr,"Got error: %d from pthread_mutex_lock\n",error); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count))) + fprintf(stderr,"Got error: %d from pthread_cond_wait\n",error); + } + if ((error= pthread_mutex_unlock(&LOCK_thread_count))) + fprintf(stderr,"Got error: %d from pthread_mutex_unlock\n",error); + DBUG_PRINT("info", ("thread ended")); + + end_pagecache(&pagecache, 1); + DBUG_PRINT("info", ("Page cache ended")); + + if (my_close(file1.file, MYF(0)) != 0) + { + fprintf(stderr, "Got error during file1 closing from close() (errno: %d)\n", + errno); + exit(1); + } + /*my_delete(file1_name, MYF(0));*/ + my_end(0); + + DBUG_PRINT("info", ("file1 (%d) closed", file1.file)); + + DBUG_PRINT("info", ("Program end")); + + DBUG_RETURN(exit_status()); +} diff --git a/storage/maria/unittest/ma_test_loghandler-t.c b/storage/maria/unittest/ma_test_loghandler-t.c new file mode 100644 index 00000000000..6ea45f80433 --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler-t.c @@ -0,0 +1,617 @@ +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif +static TRN *trn= &dummy_transaction_object; + +#define PCACHE_SIZE (1024*1024*10) + +#define LONG_BUFFER_SIZE (100 * 1024) + +#ifdef LONG_LOG_TEST +#define LOG_FLAGS 0 +#define LOG_FILE_SIZE (1024L*1024L) +#define ITERATIONS (1600*4) + +#else 
+#define LOG_FLAGS (TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC) +#define LOG_FILE_SIZE (1024L*1024L*3L) +#define ITERATIONS 1600 +#endif + +/* +#define LOG_FLAGS 0 +#define LOG_FILE_SIZE 1024L*1024L*1024L +#define ITERATIONS 181000 +*/ + +/* +#define LOG_FLAGS 0 +#define LOG_FILE_SIZE 1024L*1024L*3L +#define ITERATIONS 1600 +*/ + +/* +#define LOG_FLAGS 0 +#define LOG_FILE_SIZE 1024L*1024L*100L +#define ITERATIONS 65000 +*/ + +/* + Check that the buffer filled correctly + + SYNOPSIS + check_content() + ptr Pointer to the buffer + length length of the buffer + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool check_content(uchar *ptr, ulong length) +{ + ulong i; + uchar buff[2]; + for (i= 0; i < length; i++) + { + if (i % 2 == 0) + int2store(buff, i >> 1); + if (ptr[i] != buff[i % 2]) + { + fprintf(stderr, "Byte # %lu is %x instead of %x", + i, (uint) ptr[i], (uint) buff[i % 2]); + return 1; + } + } + return 0; +} + + +/* + Report OK for read operation + + SYNOPSIS + read_ok() + rec the record header +*/ + +void read_ok(TRANSLOG_HEADER_BUFFER *rec) +{ + char buff[80]; + snprintf(buff, sizeof(buff), "read record type: %u LSN: (%lu,0x%lx)", + rec->type, LSN_IN_PARTS(rec->lsn)); + ok(1, buff); +} + +/* + Read whole record content, and check content (put with offset) + + SYNOPSIS + read_and_check_content() + rec The record header buffer + buffer The buffer to read the record in + skip Skip this number of bytes ot the record content + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec, + uchar *buffer, uint skip) +{ + DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE * 2 + 7 * 2 + 2); + if (translog_read_record(rec->lsn, 0, rec->record_length, buffer, NULL) != + rec->record_length) + return 1; + return check_content(buffer + skip, rec->record_length - skip); +} + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint32 i; + uint32 rec_len; + uint pagen; + uchar long_tr_id[6]; + uchar lsn_buff[23]= + 
{ + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55 + }; + uchar long_buffer[LONG_BUFFER_SIZE * 2 + LSN_STORE_SIZE * 2 + 2]; + PAGECACHE pagecache; + LSN lsn, lsn_base, first_lsn; + TRANSLOG_HEADER_BUFFER rec; + LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 3]; + struct st_translog_scanner_data scanner; + int rc; + + MY_INIT(argv[0]); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= "."; + if (maria_log_remove()) + exit(1); + + for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i+= 2) + { + int2store(long_buffer + i, (i >> 1)); + /* long_buffer[i]= (i & 0xFF); */ + } + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_create_or_open(TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + translog_destroy(); + exit(1); + } + example_loghandler_init(); + /* Suppressing of automatic record writing */ + trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + plan(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1); + + srandom(122334817L); + + long_tr_id[5]= 0xff; + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + trn->short_id= 0; + trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, 
NULL, + 6, TRANSLOG_INTERNAL_PARTS + 1, parts, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + lsn_base= first_lsn= lsn; + + for (i= 1; i < ITERATIONS; i++) + { + trn->short_id= i % 0xFFFF; + if (i % 2) + { + lsn_store(lsn_buff, lsn_base); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + /* check auto-count feature */ + parts[TRANSLOG_INTERNAL_PARTS + 1].str= NULL; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= 0; + if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_1LSN_EXAMPLE, + trn, NULL, LSN_STORE_SIZE, 0, parts, NULL)) + { + fprintf(stderr, "1 Can't write reference defore record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 12) + rec_len= 12; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= (char*)long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + /* check record length auto-counting */ + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + trn, NULL, 0, TRANSLOG_INTERNAL_PARTS + 2, + parts, NULL)) + { + fprintf(stderr, "1 Can't write var reference defore record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + } + else + { + lsn_store(lsn_buff, lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 23; + if (translog_write_record(&lsn, + 
LOGREC_FIXED_RECORD_2LSN_EXAMPLE, + trn, NULL, + 23, TRANSLOG_INTERNAL_PARTS + 1, parts, NULL)) + { + fprintf(stderr, "0 Can't write reference defore record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 19) + rec_len= 19; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 14; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= (char*)long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE, + trn, NULL, 14 + rec_len, + TRANSLOG_INTERNAL_PARTS + 2, parts, NULL)) + { + fprintf(stderr, "0 Can't write var reference defore record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + } + int4store(long_tr_id, i); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + + lsn_base= lsn; + + if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 9) + rec_len= 9; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + trn, NULL, rec_len, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i); 
+ translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + if (translog_flush(lsn)) + { + fprintf(stderr, "Can't flush #%lu\n", (ulong) i); + translog_destroy(); + ok(0, "flush"); + exit(1); + } + ok(1, "flush"); + } + + srandom(122334817L); + + rc= 1; + + { + int len= translog_read_record_header(first_lsn, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "translog_read_record_header failed (%d)\n", errno); + goto err; + } + if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 || + rec.record_length != 6 || uint4korr(rec.header) != 0 || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF || + first_lsn != rec.lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(0)\n" + "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, " + "lsn(%lu,0x%lx)\n", + (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length, + (uint) uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + goto err; + } + read_ok(&rec); + translog_free_record_header(&rec); + lsn= first_lsn; + if (translog_init_scanner(first_lsn, 1, &scanner)) + { + fprintf(stderr, "scanner init failed\n"); + goto err; + } + for (i= 1;; i++) + { + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + if (i != ITERATIONS) + { + fprintf(stderr, "EOL met at iteration %u instead of %u\n", + i, ITERATIONS); + goto err; + } + break; + } + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 7 || ref != lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE " + "data read(%d) " + "type: %u strid: %u len: %u" + "ref: 
(%lu,0x%lx) (%lu,0x%lx) " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref), LSN_IN_PARTS(lsn), + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + else + { + LSN ref1, ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 23 || + ref1 != lsn || + ref2 != first_lsn || + ((uchar)rec.header[22]) != 0x55 || + ((uchar)rec.header[21]) != 0xAA || + ((uchar)rec.header[20]) != 0x55 || + ((uchar)rec.header[19]) != 0xAA || + ((uchar)rec.header[18]) != 0x55 || + ((uchar)rec.header[17]) != 0xAA || + ((uchar)rec.header[16]) != 0x55 || + ((uchar)rec.header[15]) != 0xAA || + ((uchar)rec.header[14]) != 0x55) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %u, ref1(%lu,0x%lx), " + "ref2(%lu,0x%lx) %x%x%x%x%x%x%x%x%x " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + (uint) rec.header[14], (uint) rec.header[15], + (uint) rec.header[16], (uint) rec.header[17], + (uint) rec.header[18], (uint) rec.header[19], + (uint) rec.header[20], (uint) rec.header[21], + (uint) rec.header[22], + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + read_ok(&rec); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header (var) " + "failed (%d)\n", i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration (first var) %u " + "instead of beginning of %u\n", i, ITERATIONS); + goto err; + } + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 12) + rec_len= 12; + if (rec.type != 
LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len + LSN_STORE_SIZE || + len != 12 || ref != lsn || + check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "data read(%d)" + "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), " + "hdr len: %u (%d), " + "ref(%lu,0x%lx), lsn(%lu,0x%lx) (%d), content: %d\n", + i, (uint) rec.type, + rec.type != LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + (uint) rec.short_trid, + rec.short_trid != (i % 0xFFFF), + (ulong) rec.record_length, (ulong) rec_len, + rec.record_length != rec_len + LSN_STORE_SIZE, + (uint) len, + len != 12, + LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn), + (len != 12 || ref != lsn), + check_content(rec.header + LSN_STORE_SIZE, + len - LSN_STORE_SIZE)); + goto err; + } + if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + else + { + LSN ref1, ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 19) + rec_len= 19; + if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len + LSN_STORE_SIZE * 2 || + len != 19 || + ref1 != lsn || + ref2 != first_lsn || + check_content(rec.header + LSN_STORE_SIZE * 2, + len - LSN_STORE_SIZE * 2)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, " + "ref1(%lu,0x%lx), ref2(%lu,0x%lx), " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + LSN_IN_PARTS(rec.lsn)); + goto err; + } + if (read_and_check_content(&rec, long_buffer, 
LSN_STORE_SIZE * 2)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + read_ok(&rec); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration %u " + "instead of beginning of %u\n", i, ITERATIONS); + goto err; + } + if (rec.type != LOGREC_FIXED_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 6 || uint4korr(rec.header) != i || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(%d)\n" + "type %u, strid %u, len %u, i: %u, 4: %u 5: %u " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + (uint) uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + goto err; + } + lsn= rec.lsn; + read_ok(&rec); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if ((rec_len= random() / (RAND_MAX / (LONG_BUFFER_SIZE + 1))) < 9) + rec_len= 9; + if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len || + len != 9 || check_content(rec.header, (uint)len)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %lu != %lu, hdr len: %d, " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, LSN_IN_PARTS(rec.lsn)); + goto err; + } + if (read_and_check_content(&rec, long_buffer, 0)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec 
read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + goto err; + } + read_ok(&rec); + translog_free_record_header(&rec); + } + } + + rc= 0; +err: + if (rc) + ok(0, "read record"); + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + + if (maria_log_remove()) + exit(1); + + return(test(exit_status())); +} diff --git a/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c new file mode 100644 index 00000000000..28233ae04cb --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c @@ -0,0 +1,147 @@ +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define LOG_FLAGS 0 + +static char *first_translog_file= (char*)"maria_log.00000001"; + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint pagen; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn, first_lsn, theor_lsn; + MY_STAT st; + LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + + MY_INIT(argv[0]); + + plan(2); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= "."; + if (maria_log_remove()) + exit(1); + /* be sure that we have no logs in the directory*/ + if (my_stat(CONTROL_FILE_BASE_NAME, &st, MYF(0))) + my_delete(CONTROL_FILE_BASE_NAME, MYF(0)); + if (my_stat(first_translog_file, &st, MYF(0))) + my_delete(first_translog_file, MYF(0)); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if 
(ma_control_file_create_or_open(TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + translog_destroy(); + exit(1); + } + example_loghandler_init(); + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + theor_lsn= translog_first_theoretical_lsn(); + if (theor_lsn == 1) + { + fprintf(stderr, "Error reading the first log file."); + translog_destroy(); + exit(1); + } + if (theor_lsn == LSN_IMPOSSIBLE) + { + fprintf(stderr, "There is no first log file."); + translog_destroy(); + exit(1); + } + first_lsn= translog_first_lsn_in_log(); + if (first_lsn != LSN_IMPOSSIBLE) + { + fprintf(stderr, "Incorrect first lsn response (%lu,0x%lx).", + LSN_IN_PARTS(first_lsn)); + translog_destroy(); + exit(1); + } + ok(1, "Empty log response"); + + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + + theor_lsn= translog_first_theoretical_lsn(); + if (theor_lsn == 1) + { + fprintf(stderr, "Error reading the first log file\n"); + translog_destroy(); + exit(1); + } + if (theor_lsn == LSN_IMPOSSIBLE) + { + fprintf(stderr, "There is no first log file\n"); + translog_destroy(); + exit(1); + } + first_lsn= translog_first_lsn_in_log(); + if (first_lsn != theor_lsn) + { + fprintf(stderr, "Incorrect first lsn: (%lu,0x%lx) " + " theoretical first: 
(%lu,0x%lx)\n", + LSN_IN_PARTS(first_lsn), LSN_IN_PARTS(theor_lsn)); + translog_destroy(); + exit(1); + } + + ok(1, "Full log response"); + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove()) + exit(1); + exit(0); +} diff --git a/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c new file mode 100644 index 00000000000..d6f0bde7a8e --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c @@ -0,0 +1,140 @@ +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (4*1024L*1024L) +#define LOG_FLAGS 0 + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + ulong i; + uint pagen; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn, max_lsn, last_lsn= LSN_IMPOSSIBLE; + LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + + MY_INIT(argv[0]); + + plan(2); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= "."; + if (maria_log_remove()) + exit(1); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_create_or_open(TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", 
errno); + translog_destroy(); + exit(1); + } + example_loghandler_init(); + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + max_lsn= translog_get_file_max_lsn_stored(1); + if (max_lsn == 1) + { + fprintf(stderr, "Error reading the first log file."); + translog_destroy(); + exit(1); + } + if (max_lsn != LSN_IMPOSSIBLE) + { + fprintf(stderr, "Incorrect first lsn response (%lu,0x%lx).", + LSN_IN_PARTS(max_lsn)); + translog_destroy(); + exit(1); + } + ok(1, "Empty log response"); + + + /* write more then 1 file */ + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + for(i= 0; i < LOG_FILE_SIZE/6; i++) + { + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + if (LSN_FILE_NO(lsn) == 1) + last_lsn= lsn; + } + + + max_lsn= translog_get_file_max_lsn_stored(1); + if (max_lsn == 1) + { + fprintf(stderr, "Error reading the first log file\n"); + translog_destroy(); + exit(1); + } + if (max_lsn == LSN_IMPOSSIBLE) + { + fprintf(stderr, "Isn't first file still finished?!!\n"); + translog_destroy(); + exit(1); + } + if (max_lsn != last_lsn) + { + fprintf(stderr, "Incorrect max lsn: (%lu,0x%lx) " + " last lsn on first file: (%lu,0x%lx)\n", + LSN_IN_PARTS(max_lsn), LSN_IN_PARTS(last_lsn)); + translog_destroy(); + exit(1); + } + + ok(1, "First file max LSN"); + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove()) + exit(1); + exit(0); +} diff --git a/storage/maria/unittest/ma_test_loghandler_multigroup-t.c b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c new file mode 100644 index 00000000000..d5f00bdb6fd --- /dev/null +++ 
b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c @@ -0,0 +1,641 @@ +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif +static TRN *trn= &dummy_transaction_object; + +#define PCACHE_SIZE (1024*1024*10) + +#define LONG_BUFFER_SIZE ((1024L*1024L*1024L) + (1024L*1024L*512)) + +#define MIN_REC_LENGTH (1024L*1024L + 1024L*512L + 1) + +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define ITERATIONS 2 +/*#define ITERATIONS 63 */ + +/* +#define LOG_FILE_SIZE 1024L*1024L*3L +#define ITERATIONS 1600 +*/ +/* +#define LOG_FILE_SIZE 1024L*1024L*100L +#define ITERATIONS 65000 +*/ + + +/* + Check that the buffer filled correctly + + SYNOPSIS + check_content() + ptr Pointer to the buffer + length length of the buffer + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool check_content(uchar *ptr, ulong length) +{ + ulong i; + uchar buff[4]; + DBUG_ENTER("check_content"); + for (i= 0; i < length; i++) + { + if (i % 4 == 0) + int4store(buff, (i >> 2)); + if (ptr[i] != buff[i % 4]) + { + fprintf(stderr, "Byte # %lu is %x instead of %x", + i, (uint) ptr[i], (uint) buff[i % 4]); + DBUG_DUMP("mem", ptr +(ulong) (i > 16 ? i - 16 : 0), + (i > 16 ? 16 : i) + (i + 16 < length ? 
16 : length - i)); + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +/* + Read whole record content, and check content (put with offset) + + SYNOPSIS + read_and_check_content() + rec The record header buffer + buffer The buffer to read the record in + skip Skip this number of bytes ot the record content + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec, + uchar *buffer, uint skip) +{ + int res= 0; + translog_size_t len; + DBUG_ENTER("read_and_check_content"); + DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); + if ((len= translog_read_record(rec->lsn, 0, rec->record_length, + buffer, NULL)) != rec->record_length) + { + fprintf(stderr, "Requested %lu byte, read %lu\n", + (ulong) rec->record_length, (ulong) len); + res= 1; + } + res|= check_content(buffer + skip, rec->record_length - skip); + DBUG_RETURN(res); +} + + +static uint32 get_len() +{ + uint32 rec_len; + do + { + rec_len= random() / + (RAND_MAX / (LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1)) + MIN_REC_LENGTH; + } while (rec_len >= LONG_BUFFER_SIZE); + return rec_len; +} + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint32 i; + uint32 rec_len; + uint pagen; + uchar long_tr_id[6]; + uchar lsn_buff[23]= + { + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55 + }; + uchar *long_buffer= malloc(LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); + PAGECACHE pagecache; + LSN lsn, lsn_base, first_lsn; + TRANSLOG_HEADER_BUFFER rec; + LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 2]; + struct st_translog_scanner_data scanner; + int rc; + + MY_INIT(argv[0]); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= "."; + if (maria_log_remove()) + exit(1); + + { + uchar buff[4]; + for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i++) + { + if (i % 4 == 0) + int4store(buff, (i >> 2)); + long_buffer[i]= buff[i % 4]; + } + } + + 
bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_create_or_open(TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + translog_destroy(); + exit(1); + } + example_loghandler_init(); + /* Suppressing of automatic record writing */ + trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + plan(((ITERATIONS - 1) * 4 + 1) * 2); + + srandom(122334817L); + + long_tr_id[5]= 0xff; + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + trn->short_id= 0; + trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, NULL, + 6, TRANSLOG_INTERNAL_PARTS + 1, parts, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + lsn_base= first_lsn= lsn; + + for (i= 1; i < ITERATIONS; i++) + { + if (i % 2) + { + lsn_store(lsn_buff, lsn_base); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_1LSN_EXAMPLE, + trn, NULL, + LSN_STORE_SIZE, + TRANSLOG_INTERNAL_PARTS + 1, parts, NULL)) + { + fprintf(stderr, "1 Can't write reference 
before record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + rec_len= get_len(); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= (char*)long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + trn, NULL, LSN_STORE_SIZE + rec_len, + TRANSLOG_INTERNAL_PARTS + 2, + parts, NULL)) + { + fprintf(stderr, "1 Can't write var reference before record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + } + else + { + lsn_store(lsn_buff, lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + parts[TRANSLOG_INTERNAL_PARTS + 1].str= (char*)lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= 23; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_2LSN_EXAMPLE, + trn, NULL, 23, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "0 Can't write reference before record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + rec_len= get_len(); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE * 2; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= (char*)long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE, + trn, NULL, LSN_STORE_SIZE * 2 + rec_len, + TRANSLOG_INTERNAL_PARTS + 
2, + parts, NULL)) + { + fprintf(stderr, "0 Can't write var reference before record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + } + int4store(long_tr_id, i); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, parts, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + + lsn_base= lsn; + + rec_len= get_len(); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + trn, NULL, rec_len, + TRANSLOG_INTERNAL_PARTS + 1, parts, NULL)) + { + fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + } + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + + if (ma_control_file_create_or_open(TRUE)) + { + fprintf(stderr, "pass2: Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE)) == 0) + { + fprintf(stderr, "pass2: Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, 0)) + { + fprintf(stderr, "pass2: Can't init loghandler (%d)\n", errno); + translog_destroy(); + exit(1); + } + example_loghandler_init(); + + srandom(122334817L); + + rc= 1; + + { + int len= translog_read_record_header(first_lsn, 
&rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "translog_read_record_header failed (%d)\n", errno); + translog_free_record_header(&rec); + goto err; + } + if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 || + rec.record_length != 6 || uint4korr(rec.header) != 0 || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF || + first_lsn != rec.lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(0)\n" + "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, " + "lsn(0x%lu,0x%lx)\n", + (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length, + (uint)uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + ok(1, "read record"); + translog_free_record_header(&rec); + lsn= first_lsn; + if (translog_init_scanner(first_lsn, 1, &scanner)) + { + fprintf(stderr, "scanner init failed\n"); + goto err; + } + for (i= 1;; i++) + { + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + translog_free_record_header(&rec); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + if (i != ITERATIONS) + { + fprintf(stderr, "EOL met at iteration %u instead of %u\n", + i, ITERATIONS); + translog_free_record_header(&rec); + goto err; + } + break; + } + + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != LSN_STORE_SIZE || ref != lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE " + "data read(%d)" + "type %u, strid %u, len %u, ref(%lu,0x%lx), lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + else + { + LSN ref1, 
ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 23 || + ref1 != lsn || + ref2 != first_lsn || + ((uchar)rec.header[22]) != 0x55 || + ((uchar)rec.header[21]) != 0xAA || + ((uchar)rec.header[20]) != 0x55 || + ((uchar)rec.header[19]) != 0xAA || + ((uchar)rec.header[18]) != 0x55 || + ((uchar)rec.header[17]) != 0xAA || + ((uchar)rec.header[16]) != 0x55 || + ((uchar)rec.header[15]) != 0xAA || + ((uchar)rec.header[14]) != 0x55) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %u, ref1(%lu,0x%lx), " + "ref2(%lu,0x%lx) %x%x%x%x%x%x%x%x%x " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + (uint) rec.header[14], (uint) rec.header[15], + (uint) rec.header[16], (uint) rec.header[17], + (uint) rec.header[18], (uint) rec.header[19], + (uint) rec.header[20], (uint) rec.header[21], + (uint) rec.header[22], + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + ok(1, "read record"); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header (var) " + "failed (%d)\n", i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration (first var) %u " + "instead of beginning of %u\n", i, ITERATIONS); + goto err; + } + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + rec_len= get_len(); + if (rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len + LSN_STORE_SIZE || + len != 12 || ref != lsn || + check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE)) + { + fprintf(stderr, "Incorrect 
LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "data read(%d)" + "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), " + "hdr len: %d (%d), " + "ref(%lu,0x%lx), lsn(%lu,0x%lx) (%d), content: %d\n", + i, (uint) rec.type, + rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + (uint) rec.short_trid, + rec.short_trid != (i % 0xFFFF), + (ulong) rec.record_length, (ulong) rec_len, + rec.record_length != rec_len + LSN_STORE_SIZE, + len, + len != 12, + LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn), + (ref != lsn), + check_content(rec.header + LSN_STORE_SIZE, + len - LSN_STORE_SIZE)); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + else + { + LSN ref1, ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + rec_len= get_len(); + if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len + LSN_STORE_SIZE * 2 || + len != 19 || + ref1 != lsn || + ref2 != first_lsn || + check_content(rec.header + LSN_STORE_SIZE * 2, + len - LSN_STORE_SIZE * 2)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + " data read(%d) " + "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, " + "ref1(%lu,0x%lx), ref2(%lu,0x%lx), " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, + LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE * 2)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + ok(1, "read 
record"); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + translog_free_record_header(&rec); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration %u " + "instead of beginning of %u\n", i, ITERATIONS); + translog_free_record_header(&rec); + goto err; + } + if (rec.type != LOGREC_FIXED_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 6 || uint4korr(rec.header) != i || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(%d)\n" + "type %u, strid %u, len %u, i: %u, 4: %u 5: %u " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + (uint)uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + ok(1, "read record"); + translog_free_record_header(&rec); + + lsn= rec.lsn; + + len= translog_read_next_record_header(&scanner, &rec); + rec_len= get_len(); + if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len || + len != 9 || check_content(rec.header, len)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %lu != %lu, hdr len: %d, " + "lsn(%lu,0x%lx)\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, 0)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; 
+ } + ok(1, "read record"); + translog_free_record_header(&rec); + } + } + + rc= 0; +err: + if (rc) + ok(0, "read record"); + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove()) + exit(1); + + return (test(exit_status())); +} diff --git a/storage/maria/unittest/ma_test_loghandler_multithread-t.c b/storage/maria/unittest/ma_test_loghandler_multithread-t.c new file mode 100644 index 00000000000..6255c11db89 --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_multithread-t.c @@ -0,0 +1,479 @@ +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) + +/*#define LOG_FLAGS TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC */ +#define LOG_FLAGS 0 +/*#define LONG_BUFFER_SIZE (1024L*1024L*1024L + 1024L*1024L*512)*/ +#define LONG_BUFFER_SIZE (1024L*1024L*1024L) +#define MIN_REC_LENGTH 30 +#define SHOW_DIVIDER 10 +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define ITERATIONS 3 +#define WRITERS 3 +static uint number_of_writers= WRITERS; + +static pthread_cond_t COND_thread_count; +static pthread_mutex_t LOCK_thread_count; +static uint thread_count; + +static ulong lens[WRITERS][ITERATIONS]; +static LSN lsns1[WRITERS][ITERATIONS]; +static LSN lsns2[WRITERS][ITERATIONS]; +static uchar *long_buffer; + +/* + Get pseudo-random length of the field in + limits [MIN_REC_LENGTH..LONG_BUFFER_SIZE] + + SYNOPSIS + get_len() + + RETURN + length - length >= 0 length <= LONG_BUFFER_SIZE +*/ + +static uint32 get_len() +{ + uint32 rec_len; + do + { + rec_len= random() / + (RAND_MAX / (LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1)) + MIN_REC_LENGTH; + } while (rec_len >= LONG_BUFFER_SIZE); + return rec_len; +} + + +/* + Check that the buffer filled correctly + + SYNOPSIS + check_content() + ptr Pointer to the buffer + length length 
of the buffer + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool check_content(uchar *ptr, ulong length) +{ + ulong i; + for (i= 0; i < length; i++) + { + if (((uchar)ptr[i]) != (i & 0xFF)) + { + fprintf(stderr, "Byte # %lu is %x instead of %x", + i, (uint) ptr[i], (uint) (i & 0xFF)); + return 1; + } + } + return 0; +} + + +/* + Read whole record content, and check content (put with offset) + + SYNOPSIS + read_and_check_content() + rec The record header buffer + buffer The buffer to read the record in + skip Skip this number of bytes ot the record content + + RETURN + 0 - OK + 1 - Error +*/ + + +static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec, + uchar *buffer, uint skip) +{ + int res= 0; + translog_size_t len; + + if ((len= translog_read_record(rec->lsn, 0, rec->record_length, + buffer, NULL)) != rec->record_length) + { + fprintf(stderr, "Requested %lu byte, read %lu\n", + (ulong) rec->record_length, (ulong) len); + res= 1; + } + res|= check_content(buffer + skip, rec->record_length - skip); + return(res); +} + +void writer(int num) +{ + LSN lsn; + TRN trn; + uchar long_tr_id[6]; + uint i; + + trn.short_id= num; + trn.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + for (i= 0; i < ITERATIONS; i++) + { + uint len= get_len(); + lens[num][i]= len; + LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + + int2store(long_tr_id, num); + int4store(long_tr_id + 2, i); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write LOGREC_FIXED_RECORD_0LSN_EXAMPLE record #%lu " + "thread %i\n", (ulong) i, num); + translog_destroy(); + pthread_mutex_lock(&LOCK_thread_count); + ok(0, "write records"); + pthread_mutex_unlock(&LOCK_thread_count); + return; + } + lsns1[num][i]= lsn; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_buffer; + 
parts[TRANSLOG_INTERNAL_PARTS + 0].length= len; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + &trn, NULL, + len, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i); + translog_destroy(); + pthread_mutex_lock(&LOCK_thread_count); + ok(0, "write records"); + pthread_mutex_unlock(&LOCK_thread_count); + return; + } + lsns2[num][i]= lsn; + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "write records"); + pthread_mutex_unlock(&LOCK_thread_count); + } + return; +} + + +static void *test_thread_writer(void *arg) +{ + int param= *((int*) arg); + + my_thread_init(); + + writer(param); + + pthread_mutex_lock(&LOCK_thread_count); + thread_count--; + ok(1, "writer finished"); /* just to show progress */ + VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are + ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + return(0); +} + + +int main(int argc __attribute__((unused)), + char **argv __attribute__ ((unused))) +{ + uint32 i; + uint pagen; + PAGECACHE pagecache; + LSN first_lsn; + TRANSLOG_HEADER_BUFFER rec; + struct st_translog_scanner_data scanner; + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error; + int rc; + + plan(WRITERS + ITERATIONS * WRITERS * 3); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= "."; + long_buffer= malloc(LONG_BUFFER_SIZE + 7 * 2 + 2); + if (long_buffer == 0) + { + fprintf(stderr, "End of memory\n"); + exit(1); + } + for (i= 0; i < (LONG_BUFFER_SIZE + 7 * 2 + 2); i++) + long_buffer[i]= (i & 0xFF); + + MY_INIT(argv[0]); + if (maria_log_remove()) + exit(1); + + +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + + if ((error= 
pthread_cond_init(&COND_thread_count, NULL))) + { + fprintf(stderr, "COND_thread_count: %d from pthread_cond_init " + "(errno: %d)\n", error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + fprintf(stderr, "LOCK_thread_count: %d from pthread_cond_init " + "(errno: %d)\n", error, errno); + exit(1); + } + if ((error= pthread_attr_init(&thr_attr))) + { + fprintf(stderr, "Got error: %d from pthread_attr_init " + "(errno: %d)\n", error, errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + fprintf(stderr, + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error, errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + VOID(thr_setconcurrency(2)); +#endif + + my_thread_global_init(); + + if (ma_control_file_create_or_open(TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + translog_destroy(); + exit(1); + } + example_loghandler_init(); + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + srandom(122334817L); + { + LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + uchar long_tr_id[6]= + { + 0x11, 0x22, 0x33, 0x44, 0x55, 0x66 + }; + + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&first_lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write the first record\n"); + 
translog_destroy(); + exit(1); + } + } + + + if ((error= pthread_mutex_lock(&LOCK_thread_count))) + { + fprintf(stderr, "LOCK_thread_count: %d from pthread_mutex_lock " + "(errno: %d)\n", error, errno); + exit(1); + } + + while (number_of_writers != 0) + { + param= (int*) malloc(sizeof(int)); + *param= number_of_writers - 1; + if ((error= pthread_create(&tid, &thr_attr, test_thread_writer, + (void*) param))) + { + fprintf(stderr, "Got error: %d from pthread_create (errno: %d)\n", + error, errno); + exit(1); + } + thread_count++; + number_of_writers--; + } + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + /* wait finishing */ + if ((error= pthread_mutex_lock(&LOCK_thread_count))) + fprintf(stderr, "LOCK_thread_count: %d from pthread_mutex_lock\n", error); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count))) + fprintf(stderr, "COND_thread_count: %d from pthread_cond_wait\n", error); + } + if ((error= pthread_mutex_unlock(&LOCK_thread_count))) + fprintf(stderr, "LOCK_thread_count: %d from pthread_mutex_unlock\n", error); + + /* Find last LSN and flush up to it (all our log) */ + { + LSN max= 0; + for (i= 0; i < WRITERS; i++) + { + if (cmp_translog_addr(lsns2[i][ITERATIONS - 1], max) > 0) + max= lsns2[i][ITERATIONS - 1]; + } + translog_flush(max); + } + + rc= 1; + + { + uint indeces[WRITERS]; + uint index, stage; + int len; + bzero(indeces, sizeof(uint) * WRITERS); + + bzero(indeces, sizeof(indeces)); + + if (translog_init_scanner(first_lsn, 1, &scanner)) + { + fprintf(stderr, "scanner init failed\n"); + goto err; + } + for (i= 0;; i++) + { + len= translog_read_next_record_header(&scanner, &rec); + + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + translog_free_record_header(&rec); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + if (i != WRITERS * ITERATIONS * 2) + { + fprintf(stderr, "EOL met at 
iteration %u instead of %u\n", + i, ITERATIONS * WRITERS * 2); + translog_free_record_header(&rec); + goto err; + } + break; + } + index= indeces[rec.short_trid] / 2; + stage= indeces[rec.short_trid] % 2; + if (stage == 0) + { + if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || + rec.record_length != 6 || + uint2korr(rec.header) != rec.short_trid || + index != uint4korr(rec.header + 2) || + cmp_translog_addr(lsns1[rec.short_trid][index], rec.lsn) != 0) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(%d)\n" + "type %u, strid %u %u, len %u, i: %u %u, " + "lsn(%lu,0x%lx) (%lu,0x%lx)\n", + i, (uint) rec.type, + (uint) rec.short_trid, (uint) uint2korr(rec.header), + (uint) rec.record_length, + (uint) index, (uint) uint4korr(rec.header + 2), + LSN_IN_PARTS(rec.lsn), + LSN_IN_PARTS(lsns1[rec.short_trid][index])); + translog_free_record_header(&rec); + goto err; + } + } + else + { + if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE || + len != 9 || + rec.record_length != lens[rec.short_trid][index] || + cmp_translog_addr(lsns2[rec.short_trid][index], rec.lsn) != 0 || + check_content(rec.header, (uint)len)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "data read(%d) " + "thread: %d, iteration %d, stage %d\n" + "type %u (%d), len %d, length %lu %lu (%d) " + "lsn(%lu,0x%lx) (%lu,0x%lx)\n", + i, (uint) rec.short_trid, index, stage, + (uint) rec.type, (rec.type != + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE), + len, + (ulong) rec.record_length, lens[rec.short_trid][index], + (rec.record_length != lens[rec.short_trid][index]), + LSN_IN_PARTS(rec.lsn), + LSN_IN_PARTS(lsns2[rec.short_trid][index])); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, 0)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "in whole rec read lsn(%lu,0x%lx)\n", + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + ok(1, "record read"); + 
translog_free_record_header(&rec); + indeces[rec.short_trid]++; + } + } + + rc= 0; +err: + if (rc) + ok(0, "record read"); + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove()) + exit(1); + + return(exit_status()); +} diff --git a/storage/maria/unittest/ma_test_loghandler_noflush-t.c b/storage/maria/unittest/ma_test_loghandler_noflush-t.c new file mode 100644 index 00000000000..2c3afb9a76b --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_noflush-t.c @@ -0,0 +1,132 @@ +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define LOG_FLAGS 0 + +static char *first_translog_file= (char*)"maria_log.00000001"; + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint pagen; + int rc= 1; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN first_lsn; + MY_STAT st; + TRANSLOG_HEADER_BUFFER rec; + LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + + MY_INIT(argv[0]); + + plan(1); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= "."; + if (maria_log_remove()) + exit(1); + /* be sure that we have no logs in the directory*/ + if (my_stat(CONTROL_FILE_BASE_NAME, &st, MYF(0))) + my_delete(CONTROL_FILE_BASE_NAME, MYF(0)); + if (my_stat(first_translog_file, &st, MYF(0))) + my_delete(first_translog_file, MYF(0)); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_create_or_open(TRUE)) + { + fprintf(stderr, "Can't init 
control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + translog_destroy(); + exit(1); + } + example_loghandler_init(); + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + int4store(long_tr_id, 0); + long_tr_id[5]= 0xff; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&first_lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + + translog_size_t len= translog_read_record_header(first_lsn, &rec); + if (len == 0) + { + fprintf(stderr, "translog_read_record_header failed (%d)\n", errno); + goto err; + } + if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 || + rec.record_length != 6 || uint4korr(rec.header) != 0 || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF || + first_lsn != rec.lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(0)\n" + "type: %u (%d) strid: %u (%d) len: %u (%d) i: %u (%d), " + "4: %u (%d) 5: %u (%d) " + "lsn(%lu,0x%lx) (%d)\n", + (uint) rec.type, (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE), + (uint) rec.short_trid, (rec.short_trid != 0), + (uint) rec.record_length, (rec.record_length != 6), + (uint) uint4korr(rec.header), (uint4korr(rec.header) != 0), + (uint) rec.header[4], (((uchar)rec.header[4]) != 0), + (uint) rec.header[5], (((uchar)rec.header[5]) != 0xFF), + LSN_IN_PARTS(rec.lsn), (first_lsn != rec.lsn)); + goto err; + } + + ok(1, "read OK"); + 
rc= 0; + +err: + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove()) + exit(1); + + exit(rc); +} diff --git a/storage/maria/unittest/ma_test_loghandler_pagecache-t.c b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c new file mode 100644 index 00000000000..276640dfd17 --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c @@ -0,0 +1,159 @@ +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define LOG_FLAGS 0 + +static char *first_translog_file= (char*)"maria_log.00000001"; +static char *file1_name= (char*)"page_cache_test_file_1"; +static PAGECACHE_FILE file1; + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint pagen; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn; + MY_STAT st, *stat; + LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + + MY_INIT(argv[0]); + + plan(1); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= "."; + if (maria_log_remove()) + exit(1); + /* be sure that we have no logs in the directory*/ + if (my_stat(CONTROL_FILE_BASE_NAME, &st, MYF(0))) + my_delete(CONTROL_FILE_BASE_NAME, MYF(0)); + if (my_stat(first_translog_file, &st, MYF(0))) + my_delete(first_translog_file, MYF(0)); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler_pagecache.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler_pagecache.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_create_or_open(TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if 
((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + translog_destroy(); + exit(1); + } + example_loghandler_init(); + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + if ((stat= my_stat(first_translog_file, &st, MYF(0))) == 0) + { + fprintf(stderr, "There is no %s (%d)\n", first_translog_file, errno); + exit(1); + } + if (st.st_size != TRANSLOG_PAGE_SIZE) + { + fprintf(stderr, + "incorrect initial size of %s: %ld instead of %ld\n", + first_translog_file, (long)st.st_size, (long)TRANSLOG_PAGE_SIZE); + exit(1); + } + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + if (chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO) != 0) + { + fprintf(stderr, "Got error during file1 chmod() (errno: %d)\n", + errno); + exit(1); + } + + { + uchar page[PCACHE_PAGE]; + + bzero(page, PCACHE_PAGE); +#define PAGE_LSN_OFFSET 0 + lsn_store(page + PAGE_LSN_OFFSET, lsn); + pagecache_write(&pagecache, &file1, 0, 3, (char*)page, + PAGECACHE_LSN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0); + flush_pagecache_blocks(&pagecache, 
&file1, FLUSH_FORCE_WRITE); + } + if ((stat= my_stat(first_translog_file, &st, MYF(0))) == 0) + { + fprintf(stderr, "can't stat %s (%d)\n", first_translog_file, errno); + exit(1); + } + if (st.st_size != TRANSLOG_PAGE_SIZE * 2) + { + fprintf(stderr, + "incorrect initial size of %s: %ld instead of %ld\n", + first_translog_file, + (long)st.st_size, (long)(TRANSLOG_PAGE_SIZE * 2)); + ok(0, "log triggered"); + exit(1); + } + ok(1, "log triggered"); + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + my_delete(CONTROL_FILE_BASE_NAME, MYF(0)); + my_delete(first_translog_file, MYF(0)); + my_delete(file1_name, MYF(0)); + + exit(0); +} diff --git a/storage/maria/unittest/ma_test_loghandler_purge-t.c b/storage/maria/unittest/ma_test_loghandler_purge-t.c new file mode 100644 index 00000000000..c638aa85ac6 --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_purge-t.c @@ -0,0 +1,176 @@ +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (4*1024L*1024L) +#define LOG_FLAGS 0 +#define LONG_BUFFER_SIZE (LOG_FILE_SIZE + LOG_FILE_SIZE / 2) + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + ulong i; + uint pagen; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn; + LEX_STRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + uchar *long_buffer= malloc(LONG_BUFFER_SIZE); + + MY_INIT(argv[0]); + + plan(4); + + bzero(&pagecache, sizeof(pagecache)); + bzero(long_buffer, LONG_BUFFER_SIZE); + maria_data_root= "."; + if (maria_log_remove()) + exit(1); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(__WIN__) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + 
DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_create_or_open(TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init(".", LOG_FILE_SIZE, 50112, 0, &pagecache, LOG_FLAGS)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + translog_destroy(); + exit(1); + } + example_loghandler_init(); + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + /* write more then 1 file */ + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + + translog_purge(lsn); + if (!translog_is_file(1)) + { + fprintf(stderr, "First file was removed after first record\n"); + translog_destroy(); + exit(1); + } + ok(1, "First is not removed"); + + for(i= 0; i < LOG_FILE_SIZE/6 && LSN_FILE_NO(lsn) == 1; i++) + { + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + } + + translog_purge(lsn); + if (translog_is_file(1)) + { + fprintf(stderr, "First file was not removed.\n"); + translog_destroy(); + exit(1); + } + + ok(1, "First file is removed"); + + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LONG_BUFFER_SIZE; + if (translog_write_record(&lsn, + 
LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, LONG_BUFFER_SIZE, + TRANSLOG_INTERNAL_PARTS + 1, parts, NULL)) + { + fprintf(stderr, "Can't write variable record\n"); + translog_destroy(); + exit(1); + } + + translog_purge(lsn); + if (!translog_is_file(2) || !translog_is_file(3)) + { + fprintf(stderr, "Second file (%d) or third file (%d) is not present.\n", + translog_is_file(2), translog_is_file(3)); + translog_destroy(); + exit(1); + } + + ok(1, "Second and third files are not removed"); + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= (char*)long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL)) + { + fprintf(stderr, "Can't write last record\n"); + translog_destroy(); + exit(1); + } + + translog_purge(lsn); + if (translog_is_file(2)) + { + fprintf(stderr, "Second file is not removed\n"); + translog_destroy(); + exit(1); + } + + ok(1, "Second file is removed"); + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove()) + exit(1); + exit(0); +} diff --git a/storage/maria/unittest/test_file.c b/storage/maria/unittest/test_file.c new file mode 100644 index 00000000000..758d0bfa81b --- /dev/null +++ b/storage/maria/unittest/test_file.c @@ -0,0 +1,68 @@ +#include <tap.h> +#include <my_sys.h> +#include <my_dir.h> +#include "test_file.h" + + +/* + Check that file contance correspond to descriptor + + SYNOPSIS + test_file() + file File to test + file_name Path (and name) of file which is tested + size size of file + buff_size size of buffer which is enought to check the file + desc file descriptor to check with + + RETURN + 1 file if OK + 0 error +*/ + +int test_file(PAGECACHE_FILE file, char *file_name, + off_t size, size_t buff_size, struct file_desc *desc) +{ + MY_STAT stat_buff, *stat; + unsigned char *buffr= 
malloc(buff_size); + off_t pos= 0; + size_t byte; + int step= 0; + + if ((stat= my_stat(file_name, &stat_buff, MYF(0))) == NULL) + { + diag("Can't stat() %s (errno: %d)\n", file_name, errno); + return 0; + } + if (stat->st_size != size) + { + diag("file %s size is %lu (should be %lu)\n", + file_name, (ulong) stat->st_size, (ulong) size); + return 0; + } + /* check content */ + my_seek(file.file, 0, SEEK_SET, MYF(0)); + while (desc[step].length != 0) + { + if (my_read(file.file, (char*)buffr, desc[step].length, MYF(0)) != + desc[step].length) + { + diag("Can't read %u bytes from %s (errno: %d)\n", + (uint)desc[step].length, file_name, errno); + return 0; + } + for (byte= 0; byte < desc[step].length; byte++) + { + if (buffr[byte] != desc[step].content) + { + diag("content of %s mismatch 0x%x in position %lu instead of 0x%x\n", + file_name, (uint) buffr[byte], (ulong) (pos + byte), + desc[step].content); + return 0; + } + } + pos+= desc[step].length; + step++; + } + return 1; +} diff --git a/storage/maria/unittest/test_file.h b/storage/maria/unittest/test_file.h new file mode 100644 index 00000000000..293c692717e --- /dev/null +++ b/storage/maria/unittest/test_file.h @@ -0,0 +1,14 @@ +#include <m_string.h> +#include "../ma_pagecache.h" + +/* + File content descriptor +*/ +struct file_desc +{ + unsigned int length; + unsigned char content; +}; + +int test_file(PAGECACHE_FILE file, char *file_name, + off_t size, size_t buff_size, struct file_desc *desc); diff --git a/storage/maria/unittest/trnman-t.c b/storage/maria/unittest/trnman-t.c new file mode 100644 index 00000000000..db137cf088c --- /dev/null +++ b/storage/maria/unittest/trnman-t.c @@ -0,0 +1,194 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <my_atomic.h> +#include <lf.h> +#include <m_string.h> +#include "../trnman.h" + +pthread_mutex_t rt_mutex; +pthread_attr_t attr; +size_t stacksize= 0; +#define STACK_SIZE (((int)stacksize-2048)*STACK_DIRECTION) + +int rt_num_threads; +int litmus; + +/* + create and end (commit or rollback) transactions randomly +*/ +#define MAX_ITER 100 +pthread_handler_t test_trnman(void *arg) +{ + uint x, y, i, n; + TRN *trn[MAX_ITER]; + pthread_mutex_t mutexes[MAX_ITER]; + pthread_cond_t conds[MAX_ITER]; + int m= (*(int *)arg); + + for (i= 0; i < MAX_ITER; i++) + { + pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST); + pthread_cond_init(&conds[i], 0); + } + + for (x= ((int)(intptr)(&m)); m > 0; ) + { + y= x= (x*LL(3628273133) + LL(1500450271)) % LL(9576890767); /* three prime numbers */ + m-= n= x % MAX_ITER; + for (i= 0; i < n; i++) + { + trn[i]= trnman_new_trn(&mutexes[i], &conds[i], &m + STACK_SIZE); + if (!trn[i]) + { + diag("trnman_new_trn() failed"); + litmus++; + } + } + for (i= 0; i < n; i++) + { + y= (y*19 + 7) % 31; + trnman_end_trn(trn[i], y & 1); + } + } + for (i= 0; i < MAX_ITER; i++) + { + pthread_mutex_destroy(&mutexes[i]); + pthread_cond_destroy(&conds[i]); + } + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + pthread_mutex_unlock(&rt_mutex); + + return 0; +} +#undef MAX_ITER + +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= my_getsystime(); + int i; + + litmus= 0; + + 
threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0)); + if (!threads) + { + diag("Out of memory"); + abort(); + } + + diag("Testing %s with %d threads, %d iterations... ", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) + if (pthread_create(threads+i, &attr, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= my_getsystime()-now; + ok(litmus == 0, "Tested %s in %g secs (%d)", test, ((double)now)/1e7, litmus); + my_free((void*)threads, MYF(0)); +} + +#define ok_read_from(T1, T2, RES) \ + i= trnman_can_read_from(trn[T1], trn[T2]->trid); \ + ok(i == RES, "trn" #T1 " %s read from trn" #T2, i ? "can" : "cannot") +#define start_transaction(T) \ + trn[T]= trnman_new_trn(&mutexes[T], &conds[T], &i + STACK_SIZE) +#define commit(T) trnman_commit_trn(trn[T]) +#define abort(T) trnman_abort_trn(trn[T]) + +#define Ntrns 4 +void test_trnman_read_from() +{ + TRN *trn[Ntrns]; + pthread_mutex_t mutexes[Ntrns]; + pthread_cond_t conds[Ntrns]; + int i; + + for (i= 0; i < Ntrns; i++) + { + pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST); + pthread_cond_init(&conds[i], 0); + } + + start_transaction(0); /* start trn1 */ + start_transaction(1); /* start trn2 */ + ok_read_from(1, 0, 0); + commit(0); /* commit trn1 */ + start_transaction(2); /* start trn4 */ + abort(2); /* abort trn4 */ + start_transaction(3); /* start trn5 */ + ok_read_from(3, 0, 1); + ok_read_from(3, 1, 0); + ok_read_from(3, 2, 0); + commit(1); /* commit trn2 */ + ok_read_from(3, 1, 0); + commit(3); /* commit trn5 */ + + for (i= 0; i < Ntrns; i++) + { + pthread_mutex_destroy(&mutexes[i]); + pthread_cond_destroy(&conds[i]); + } +} + +int main() +{ + my_init(); + + plan(6); + + if (my_atomic_initialize()) + return exit_status(); + + pthread_mutex_init(&rt_mutex, 0); + pthread_attr_init(&attr); +#ifdef HAVE_PTHREAD_ATTR_GETSTACKSIZE + pthread_attr_getstacksize(&attr, &stacksize); + if (stacksize == 0) +#endif + stacksize= 
PTHREAD_STACK_MIN; + +#define CYCLES 10000 +#define THREADS 10 + + trnman_init(0); + + test_trnman_read_from(); + run_test("trnman", test_trnman, THREADS, CYCLES); + + diag("mallocs: %d", trnman_allocated_transactions); + { + ulonglong now= my_getsystime(); + trnman_destroy(); + now= my_getsystime()-now; + diag("trnman_destroy: %g", ((double)now)/1e7); + } + + pthread_mutex_destroy(&rt_mutex); + my_end(0); + return exit_status(); +} + diff --git a/storage/myisam/Makefile.am b/storage/myisam/Makefile.am index f50c312b8e4..4bd0b177daa 100644 --- a/storage/myisam/Makefile.am +++ b/storage/myisam/Makefile.am @@ -97,8 +97,8 @@ libmyisam_a_SOURCES = mi_open.c mi_extra.c mi_info.c mi_rkey.c \ mi_delete_table.c mi_rename.c mi_check.c \ mi_keycache.c mi_preload.c \ ft_parser.c ft_stopwords.c ft_static.c \ - ft_update.c ft_boolean_search.c ft_nlq_search.c sort.c \ - ha_myisam.cc \ + ft_update.c ft_boolean_search.c ft_nlq_search.c \ + sort.c ha_myisam.cc ft_myisam.c \ rt_index.c rt_key.c rt_mbr.c rt_split.c sp_key.c CLEANFILES = test?.MY? FT?.MY? isam.log mi_test_all rt_test.MY? sp_test.MY? diff --git a/storage/myisam/ft_boolean_search.c b/storage/myisam/ft_boolean_search.c index 15f4e1e1d34..85342c6e0ca 100644 --- a/storage/myisam/ft_boolean_search.c +++ b/storage/myisam/ft_boolean_search.c @@ -162,7 +162,7 @@ static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b) static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b) { /* ORDER BY word DESC, ndepth DESC */ - int i= mi_compare_text(cs, (uchar*) (*b)->word+1,(*b)->len-1, + int i= ha_compare_text(cs, (uchar*) (*b)->word+1,(*b)->len-1, (uchar*) (*a)->word+1,(*a)->len-1,0,0); if (!i) i=CMP_NUM((*b)->ndepth,(*a)->ndepth); @@ -196,7 +196,7 @@ static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param, case FT_TOKEN_WORD: ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_WORD) + - (info->trunc ? MI_MAX_KEY_BUFF : + (info->trunc ? 
HA_MAX_KEY_BUFF : word_len * ftb_param->ftb->charset->mbmaxlen + HA_FT_WLEN + ftb_param->ftb->info->s->rec_reflength)); @@ -345,7 +345,6 @@ static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search) uint off, extra=HA_FT_WLEN+info->s->base.rec_reflength; uchar *lastkey_buf=ftbw->word+ftbw->off; - LINT_INIT(off); if (ftbw->flags & FTB_FLAG_TRUNC) lastkey_buf+=ftbw->len; @@ -395,7 +394,7 @@ static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search) if (!r && !ftbw->off) { - r= mi_compare_text(ftb->charset, + r= ha_compare_text(ftb->charset, info->lastkey+1, info->lastkey_length-extra-1, (uchar*) ftbw->word+1, @@ -868,7 +867,7 @@ static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param, for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2) { ftbw= ftb->list[c]; - if (mi_compare_text(ftb->charset, (uchar*)word, len, + if (ha_compare_text(ftb->charset, (uchar*)word, len, (uchar*)ftbw->word+1, ftbw->len-1, (my_bool)(ftbw->flags&FTB_FLAG_TRUNC), 0) > 0) b= c; @@ -878,7 +877,7 @@ static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param, for (; c >= 0; c--) { ftbw= ftb->list[c]; - if (mi_compare_text(ftb->charset, (uchar*)word, len, + if (ha_compare_text(ftb->charset, (uchar*)word, len, (uchar*)ftbw->word + 1,ftbw->len - 1, (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0)) break; diff --git a/storage/myisam/ft_eval.c b/storage/myisam/ft_eval.c index 7eb78861e5e..de01510fdd7 100644 --- a/storage/myisam/ft_eval.c +++ b/storage/myisam/ft_eval.c @@ -48,7 +48,7 @@ int main(int argc, char *argv[]) recinfo[0].type=FIELD_SKIP_ENDSPACE; recinfo[0].length=docid_length; recinfo[1].type=FIELD_BLOB; - recinfo[1].length= 4+mi_portable_sizeof_char_ptr; + recinfo[1].length= 4+portable_sizeof_char_ptr; /* Define a key over the first column */ keyinfo[0].seg=keyseg; diff --git a/storage/myisam/ft_myisam.c b/storage/myisam/ft_myisam.c new file mode 100644 index 00000000000..bef3fbfd5f5 --- /dev/null +++ b/storage/myisam/ft_myisam.c @@ -0,0 
+1,36 @@ +/* Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +/* + This function is for interface functions between fulltext and myisam +*/ + +#include "ftdefs.h" + +FT_INFO *ft_init_search(uint flags, void *info, uint keynr, + uchar *query, uint query_len, CHARSET_INFO *cs, + uchar *record) +{ + FT_INFO *res; + if (flags & FT_BOOL) + res= ft_init_boolean_search((MI_INFO *)info, keynr, query, query_len,cs); + else + res= ft_init_nlq_search((MI_INFO *)info, keynr, query, query_len, flags, + record); + return res; +} diff --git a/storage/myisam/ft_nlq_search.c b/storage/myisam/ft_nlq_search.c index 282fa6751d8..b3a2e47a382 100644 --- a/storage/myisam/ft_nlq_search.c +++ b/storage/myisam/ft_nlq_search.c @@ -103,7 +103,7 @@ static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio) { if (keylen && - mi_compare_text(aio->charset,info->lastkey+1, + ha_compare_text(aio->charset,info->lastkey+1, info->lastkey_length-extra-1, keybuff+1,keylen-1,0,0)) break; diff --git a/storage/myisam/ft_parser.c b/storage/myisam/ft_parser.c index df2423aa50f..042a999fffa 100644 --- a/storage/myisam/ft_parser.c +++ b/storage/myisam/ft_parser.c @@ -31,7 +31,7 @@ typedef struct 
st_my_ft_parser_param static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2) { - return mi_compare_text(cs, (uchar*) w1->pos, w1->len, + return ha_compare_text(cs, (uchar*) w1->pos, w1->len, (uchar*) w2->pos, w2->len, 0, 0); } diff --git a/storage/myisam/ft_static.c b/storage/myisam/ft_static.c index 610c20eede6..d48bedc9e3b 100644 --- a/storage/myisam/ft_static.c +++ b/storage/myisam/ft_static.c @@ -54,20 +54,6 @@ const struct _ft_vft _ft_vft_boolean = { ft_boolean_get_relevance, ft_boolean_reinit_search }; - -FT_INFO *ft_init_search(uint flags, void *info, uint keynr, - uchar *query, uint query_len, CHARSET_INFO *cs, - uchar *record) -{ - FT_INFO *res; - if (flags & FT_BOOL) - res= ft_init_boolean_search((MI_INFO *)info, keynr, query, query_len,cs); - else - res= ft_init_nlq_search((MI_INFO *)info, keynr, query, query_len, flags, - record); - return res; -} - const char *ft_stopword_file = 0; const char *ft_precompiled_stopwords[] = { diff --git a/storage/myisam/ft_stopwords.c b/storage/myisam/ft_stopwords.c index 59866d9a351..8aefffbee1d 100644 --- a/storage/myisam/ft_stopwords.c +++ b/storage/myisam/ft_stopwords.c @@ -29,7 +29,7 @@ static TREE *stopwords3=NULL; static int FT_STOPWORD_cmp(void* cmp_arg __attribute__((unused)), FT_STOPWORD *w1, FT_STOPWORD *w2) { - return mi_compare_text(default_charset_info, + return ha_compare_text(default_charset_info, (uchar *)w1->pos,w1->len, (uchar *)w2->pos,w2->len,0,0); } @@ -51,10 +51,11 @@ static int ft_add_stopword(const char *w) int ft_init_stopwords() { + DBUG_ENTER("ft_init_stopwords"); if (!stopwords3) { if (!(stopwords3=(TREE *)my_malloc(sizeof(TREE),MYF(0)))) - return -1; + DBUG_RETURN(-1); init_tree(stopwords3,0,0,sizeof(FT_STOPWORD),(qsort_cmp2)&FT_STOPWORD_cmp, 0, (ft_stopword_file ? 
(tree_element_free)&FT_STOPWORD_free : 0), @@ -70,10 +71,10 @@ int ft_init_stopwords() int error=-1; if (!*ft_stopword_file) - return 0; + DBUG_RETURN(0); if ((fd=my_open(ft_stopword_file, O_RDONLY, MYF(MY_WME))) == -1) - return -1; + DBUG_RETURN(-1); len=(uint)my_seek(fd, 0L, MY_SEEK_END, MYF(0)); my_seek(fd, 0L, MY_SEEK_SET, MYF(0)); if (!(start=buffer=my_malloc(len+1, MYF(MY_WME)))) @@ -90,7 +91,7 @@ err1: my_free(buffer, MYF(0)); err0: my_close(fd, MYF(MY_WME)); - return error; + DBUG_RETURN(error); } else { @@ -100,13 +101,14 @@ err0: for (;*sws;sws++) { if (ft_add_stopword(*sws)) - return -1; + DBUG_RETURN(-1); } ft_stopword_file="(built-in)"; /* for SHOW VARIABLES */ } - return 0; + DBUG_RETURN(0); } + int is_stopword(char *word, uint len) { FT_STOPWORD sw; @@ -118,6 +120,8 @@ int is_stopword(char *word, uint len) void ft_free_stopwords() { + DBUG_ENTER("ft_free_stopwords"); + if (stopwords3) { delete_tree(stopwords3); /* purecov: inspected */ @@ -125,4 +129,5 @@ void ft_free_stopwords() stopwords3=0; } ft_stopword_file= 0; + DBUG_VOID_RETURN; } diff --git a/storage/myisam/ft_test1.c b/storage/myisam/ft_test1.c index e49c47bb268..b37935a0d7a 100644 --- a/storage/myisam/ft_test1.c +++ b/storage/myisam/ft_test1.c @@ -75,12 +75,12 @@ static int run_test(const char *filename) /* First define 2 columns */ recinfo[0].type=extra_field; - recinfo[0].length= (extra_field == FIELD_BLOB ? 4 + mi_portable_sizeof_char_ptr : + recinfo[0].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : extra_length); if (extra_field == FIELD_VARCHAR) recinfo[0].length+= HA_VARCHAR_PACKLENGTH(extra_length); recinfo[1].type=key_field; - recinfo[1].length= (key_field == FIELD_BLOB ? 4+mi_portable_sizeof_char_ptr : + recinfo[1].length= (key_field == FIELD_BLOB ? 
4+portable_sizeof_char_ptr : key_length); if (key_field == FIELD_VARCHAR) recinfo[1].length+= HA_VARCHAR_PACKLENGTH(key_length); diff --git a/storage/myisam/ft_update.c b/storage/myisam/ft_update.c index e3e4c62158f..d1548e32870 100644 --- a/storage/myisam/ft_update.c +++ b/storage/myisam/ft_update.c @@ -180,7 +180,7 @@ int _mi_ft_cmp(MI_INFO *info, uint keynr, const uchar *rec1, const uchar *rec2) { if ((ftsi1.pos != ftsi2.pos) && (!ftsi1.pos || !ftsi2.pos || - mi_compare_text(cs, (uchar*) ftsi1.pos,ftsi1.len, + ha_compare_text(cs, (uchar*) ftsi1.pos,ftsi1.len, (uchar*) ftsi2.pos,ftsi2.len,0,0))) DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT); } @@ -209,7 +209,7 @@ int _mi_ft_update(MI_INFO *info, uint keynr, uchar *keybuf, error=0; while(old_word->pos && new_word->pos) { - cmp= mi_compare_text(cs, (uchar*) old_word->pos,old_word->len, + cmp= ha_compare_text(cs, (uchar*) old_word->pos,old_word->len, (uchar*) new_word->pos,new_word->len,0,0); cmp2= cmp ? 0 : (fabs(old_word->weight - new_word->weight) > 1.e-5); diff --git a/storage/myisam/fulltext.h b/storage/myisam/fulltext.h index 856e93e034d..9aef2d0d002 100644 --- a/storage/myisam/fulltext.h +++ b/storage/myisam/fulltext.h @@ -20,18 +20,8 @@ #include "myisamdef.h" #include "ft_global.h" -#define HA_FT_WTYPE HA_KEYTYPE_FLOAT -#define HA_FT_WLEN 4 -#define FT_SEGS 2 - -#define ft_sintXkorr(A) mi_sint4korr(A) -#define ft_intXstore(T,A) mi_int4store(T,A) - -extern const HA_KEYSEG ft_keysegs[FT_SEGS]; - int _mi_ft_cmp(MI_INFO *, uint, const uchar *, const uchar *); int _mi_ft_add(MI_INFO *, uint, uchar *, const uchar *, my_off_t); int _mi_ft_del(MI_INFO *, uint, uchar *, const uchar *, my_off_t); uint _mi_ft_convert_to_ft2(MI_INFO *, uint, uchar *); - diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc index ca4c40547ee..5e58565364c 100644 --- a/storage/myisam/ha_myisam.cc +++ b/storage/myisam/ha_myisam.cc @@ -22,6 +22,7 @@ #include "mysql_priv.h" #include <mysql/plugin.h> #include 
<m_ctype.h> +#include <my_bit.h> #include <myisampack.h> #include "ha_myisam.h" #include <stdarg.h> @@ -56,7 +57,7 @@ static handler *myisam_create_handler(handlerton *hton, // collect errors printed by mi_check routines -static void mi_check_print_msg(MI_CHECK *param, const char* msg_type, +static void mi_check_print_msg(HA_CHECK *param, const char* msg_type, const char *fmt, va_list args) { THD* thd = (THD*)param->thd; @@ -251,7 +252,8 @@ int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out, DBUG_PRINT("loop", ("found: 0x%lx recpos: %d minpos: %d length: %d", (long) found, recpos, minpos, length)); if (recpos != minpos) - { // Reserved space (Null bits?) + { + /* reserve space for null bits */ bzero((char*) recinfo_pos, sizeof(*recinfo_pos)); recinfo_pos->type= (int) FIELD_NORMAL; recinfo_pos++->length= (uint16) (minpos - recpos); @@ -300,7 +302,7 @@ int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out, Check for underlying table conformance SYNOPSIS - check_definition() + myisam_check_definition() t1_keyinfo in First table key definition t1_recinfo in First table record definition t1_keys in Number of keys in first table @@ -442,13 +444,13 @@ int check_definition(MI_KEYDEF *t1_keyinfo, MI_COLUMNDEF *t1_recinfo, extern "C" { -volatile int *killed_ptr(MI_CHECK *param) +volatile int *killed_ptr(HA_CHECK *param) { /* In theory Unsafe conversion, but should be ok for now */ return (int*) &(((THD *)(param->thd))->killed); } -void mi_check_print_error(MI_CHECK *param, const char *fmt,...) +void mi_check_print_error(HA_CHECK *param, const char *fmt,...) { param->error_printed|=1; param->out_flag|= O_DATA_LOST; @@ -458,7 +460,7 @@ void mi_check_print_error(MI_CHECK *param, const char *fmt,...) va_end(args); } -void mi_check_print_info(MI_CHECK *param, const char *fmt,...) +void mi_check_print_info(HA_CHECK *param, const char *fmt,...) { va_list args; va_start(args, fmt); @@ -466,7 +468,7 @@ void mi_check_print_info(MI_CHECK *param, const char *fmt,...) 
va_end(args); } -void mi_check_print_warning(MI_CHECK *param, const char *fmt,...) +void mi_check_print_warning(HA_CHECK *param, const char *fmt,...) { param->warning_printed=1; param->out_flag|= O_DATA_LOST; @@ -721,7 +723,7 @@ int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt) { if (!file) return HA_ADMIN_INTERNAL_ERROR; int error; - MI_CHECK param; + HA_CHECK param; MYISAM_SHARE* share = file->s; const char *old_proc_info=thd->proc_info; @@ -732,7 +734,7 @@ int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt) param.db_name= table->s->db.str; param.table_name= table->alias; param.testflag = check_opt->flags | T_CHECK | T_SILENT; - param.stats_method= (enum_mi_stats_method)thd->variables.myisam_stats_method; + param.stats_method= (enum_handler_stats_method)thd->variables.myisam_stats_method; if (!(table->db_stat & HA_READ_ONLY)) param.testflag|= T_STATISTICS; @@ -813,7 +815,7 @@ int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt) int ha_myisam::analyze(THD *thd, HA_CHECK_OPT* check_opt) { int error=0; - MI_CHECK param; + HA_CHECK param; MYISAM_SHARE* share = file->s; myisamchk_init(¶m); @@ -824,7 +826,7 @@ int ha_myisam::analyze(THD *thd, HA_CHECK_OPT* check_opt) param.testflag= (T_FAST | T_CHECK | T_SILENT | T_STATISTICS | T_DONT_CHECK_CHECKSUM); param.using_global_keycache = 1; - param.stats_method= (enum_mi_stats_method)thd->variables.myisam_stats_method; + param.stats_method= (enum_handler_stats_method)thd->variables.myisam_stats_method; if (!(share->state.changed & STATE_NOT_ANALYZED)) return HA_ADMIN_ALREADY_DONE; @@ -873,7 +875,7 @@ int ha_myisam::restore(THD* thd, HA_CHECK_OPT *check_opt) err: { - MI_CHECK param; + HA_CHECK param; myisamchk_init(¶m); param.thd= thd; param.op_name= "restore"; @@ -936,7 +938,7 @@ int ha_myisam::backup(THD* thd, HA_CHECK_OPT *check_opt) err: { - MI_CHECK param; + HA_CHECK param; myisamchk_init(¶m); param.thd= thd; param.op_name= "backup"; @@ -952,7 +954,7 @@ int ha_myisam::backup(THD* thd, HA_CHECK_OPT *check_opt) 
int ha_myisam::repair(THD* thd, HA_CHECK_OPT *check_opt) { int error; - MI_CHECK param; + HA_CHECK param; ha_rows start_records; if (!file) return HA_ADMIN_INTERNAL_ERROR; @@ -1002,7 +1004,7 @@ int ha_myisam::optimize(THD* thd, HA_CHECK_OPT *check_opt) { int error; if (!file) return HA_ADMIN_INTERNAL_ERROR; - MI_CHECK param; + HA_CHECK param; myisamchk_init(¶m); param.thd = thd; @@ -1021,7 +1023,7 @@ int ha_myisam::optimize(THD* thd, HA_CHECK_OPT *check_opt) } -int ha_myisam::repair(THD *thd, MI_CHECK ¶m, bool do_optimize) +int ha_myisam::repair(THD *thd, HA_CHECK ¶m, bool do_optimize) { int error=0; uint local_testflag=param.testflag; @@ -1209,7 +1211,7 @@ int ha_myisam::assign_to_keycache(THD* thd, HA_CHECK_OPT *check_opt) if (error != HA_ADMIN_OK) { /* Send error to user */ - MI_CHECK param; + HA_CHECK param; myisamchk_init(¶m); param.thd= thd; param.op_name= "assign_to_keycache"; @@ -1273,7 +1275,7 @@ int ha_myisam::preload_keys(THD* thd, HA_CHECK_OPT *check_opt) err: { - MI_CHECK param; + HA_CHECK param; myisamchk_init(¶m); param.thd= thd; param.op_name= "preload_keys"; @@ -1380,7 +1382,7 @@ int ha_myisam::enable_indexes(uint mode) else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE) { THD *thd=current_thd; - MI_CHECK param; + HA_CHECK param; const char *save_proc_info=thd->proc_info; thd->proc_info="Creating index"; myisamchk_init(¶m); @@ -1389,7 +1391,8 @@ int ha_myisam::enable_indexes(uint mode) T_CREATE_MISSING_KEYS); param.myf_rw&= ~MY_WAIT_IF_FULL; param.sort_buffer_length= thd->variables.myisam_sort_buff_size; - param.stats_method= (enum_mi_stats_method)thd->variables.myisam_stats_method; + param.stats_method= + (enum_handler_stats_method)thd->variables.myisam_stats_method; param.tmpdir=&mysql_tmpdir_list; if ((error= (repair(thd,param,0) != HA_ADMIN_OK)) && param.retry_repair) { @@ -1890,7 +1893,7 @@ void ha_myisam::get_auto_increment(ulonglong offset, ulonglong increment, { ulonglong nr; int error; - uchar key[MI_MAX_KEY_LENGTH]; + uchar 
key[HA_MAX_KEY_LENGTH]; if (!table->s->next_number_key_offset) { // Autoincrement at key-start diff --git a/storage/myisam/ha_myisam.h b/storage/myisam/ha_myisam.h index e8594fc9039..96440b74c9d 100644 --- a/storage/myisam/ha_myisam.h +++ b/storage/myisam/ha_myisam.h @@ -21,6 +21,7 @@ /* class for the the myisam handler */ #include <myisam.h> +#include <myisamchk.h> #include <ft_global.h> #define HA_RECOVER_NONE 0 /* No automatic recover */ @@ -39,7 +40,7 @@ class ha_myisam: public handler ulonglong int_table_flags; char *data_file_name, *index_file_name; bool can_enable_indexes; - int repair(THD *thd, MI_CHECK ¶m, bool optimize); + int repair(THD *thd, HA_CHECK ¶m, bool optimize); public: ha_myisam(handlerton *hton, TABLE_SHARE *table_arg); @@ -56,8 +57,8 @@ class ha_myisam: public handler HA_READ_ORDER | HA_KEYREAD_ONLY); } uint max_supported_keys() const { return MI_MAX_KEY; } - uint max_supported_key_length() const { return MI_MAX_KEY_LENGTH; } - uint max_supported_key_part_length() const { return MI_MAX_KEY_LENGTH; } + uint max_supported_key_length() const { return HA_MAX_KEY_LENGTH; } + uint max_supported_key_part_length() const { return HA_MAX_KEY_LENGTH; } uint checksum() const; int open(const char *name, int mode, uint test_if_locked); diff --git a/storage/myisam/mi_check.c b/storage/myisam/mi_check.c index fe6b716877c..b41f06a5fb8 100644 --- a/storage/myisam/mi_check.c +++ b/storage/myisam/mi_check.c @@ -59,14 +59,14 @@ /* Functions defined in this file */ -static int check_k_link(MI_CHECK *param, MI_INFO *info,uint nr); -static int chk_index(MI_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo, +static int check_k_link(HA_CHECK *param, MI_INFO *info,uint nr); +static int chk_index(HA_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo, my_off_t page, uchar *buff, ha_rows *keys, ha_checksum *key_checksum, uint level); static uint isam_key_length(MI_INFO *info,MI_KEYDEF *keyinfo); static ha_checksum calc_checksum(ha_rows count); static int writekeys(MI_SORT_PARAM 
*sort_param); -static int sort_one_index(MI_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo, +static int sort_one_index(HA_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo, my_off_t pagepos, File new_file); static int sort_key_read(MI_SORT_PARAM *sort_param,void *key); static int sort_ft_key_read(MI_SORT_PARAM *sort_param,void *key); @@ -80,13 +80,13 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param, reg1 SORT_KEY_BLOCKS *key_block, uchar *key, my_off_t prev_block); static int sort_delete_record(MI_SORT_PARAM *sort_param); -/*static int flush_pending_blocks(MI_CHECK *param);*/ -static SORT_KEY_BLOCKS *alloc_key_blocks(MI_CHECK *param, uint blocks, +/*static int flush_pending_blocks(HA_CHECK *param);*/ +static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks, uint buffer_length); static ha_checksum mi_byte_checksum(const uchar *buf, uint length); -static void set_data_file_type(SORT_INFO *sort_info, MYISAM_SHARE *share); +static void set_data_file_type(MI_SORT_INFO *sort_info, MYISAM_SHARE *share); -void myisamchk_init(MI_CHECK *param) +void myisamchk_init(HA_CHECK *param) { bzero((uchar*) param,sizeof(*param)); param->opt_follow_links=1; @@ -108,7 +108,7 @@ void myisamchk_init(MI_CHECK *param) /* Check the status flags for the table */ -int chk_status(MI_CHECK *param, register MI_INFO *info) +int chk_status(HA_CHECK *param, register MI_INFO *info) { MYISAM_SHARE *share=info->s; @@ -136,7 +136,7 @@ int chk_status(MI_CHECK *param, register MI_INFO *info) /* Check delete links */ -int chk_del(MI_CHECK *param, register MI_INFO *info, uint test_flag) +int chk_del(HA_CHECK *param, register MI_INFO *info, uint test_flag) { reg2 ha_rows i; uint delete_link_length; @@ -245,7 +245,7 @@ wrong: /* Check delete links in index file */ -static int check_k_link(MI_CHECK *param, register MI_INFO *info, uint nr) +static int check_k_link(HA_CHECK *param, register MI_INFO *info, uint nr) { my_off_t next_link; uint block_size=(nr+1)*MI_MIN_KEY_BLOCK_LENGTH; @@ -323,7 
+323,7 @@ static int check_k_link(MI_CHECK *param, register MI_INFO *info, uint nr) /* Check sizes of files */ -int chk_size(MI_CHECK *param, register MI_INFO *info) +int chk_size(HA_CHECK *param, register MI_INFO *info) { int error=0; register my_off_t skr,size; @@ -399,7 +399,7 @@ int chk_size(MI_CHECK *param, register MI_INFO *info) /* Check keys */ -int chk_key(MI_CHECK *param, register MI_INFO *info) +int chk_key(HA_CHECK *param, register MI_INFO *info) { uint key,found_keys=0,full_text_keys=0,result=0; ha_rows keys; @@ -584,7 +584,7 @@ do_stat: } /* chk_key */ -static int chk_index_down(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo, +static int chk_index_down(HA_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t page, uchar *buff, ha_rows *keys, ha_checksum *key_checksum, uint level) { @@ -731,13 +731,13 @@ int mi_collect_stats_nonulls_next(HA_KEYSEG *keyseg, ulonglong *notnull, /* Check if index is ok */ -static int chk_index(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo, +static int chk_index(HA_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t page, uchar *buff, ha_rows *keys, ha_checksum *key_checksum, uint level) { int flag; uint used_length,comp_flag,nod_flag,key_length=0; - uchar key[MI_MAX_POSSIBLE_KEY_BUFF],*temp_buff,*keypos,*old_keypos,*endpos; + uchar key[HA_MAX_POSSIBLE_KEY_BUFF],*temp_buff,*keypos,*old_keypos,*endpos; my_off_t next_page,record; char llbuff[22]; uint diff_pos[2]; @@ -934,7 +934,7 @@ static uint isam_key_length(MI_INFO *info, register MI_KEYDEF *keyinfo) /* Check that record-link is ok */ -int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend) +int chk_data_link(HA_CHECK *param, MI_INFO *info,int extend) { int error,got_error,flag; uint key,left_length,b_type,field; @@ -944,7 +944,7 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend) uchar *record,*to; char llbuff[22],llbuff2[22],llbuff3[22]; ha_checksum intern_record_checksum; - ha_checksum key_checksum[MI_MAX_POSSIBLE_KEY]; + 
ha_checksum key_checksum[HA_MAX_POSSIBLE_KEY]; my_bool static_row_size; MI_KEYDEF *keyinfo; MI_BLOCK_INFO block_info; @@ -992,6 +992,9 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend) if (*killed_ptr(param)) goto err2; switch (info->s->data_file_type) { + case BLOCK_RECORD: + DBUG_ASSERT(0); /* Impossible */ + break; case STATIC_RECORD: if (my_b_read(¶m->read_cache,(uchar*) record, info->s->base.pack_reclength)) @@ -1379,7 +1382,7 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend) /* Recover old table by reading each record and writing all keys */ /* Save new datafile-name in temp_filename */ -int mi_repair(MI_CHECK *param, register MI_INFO *info, +int mi_repair(HA_CHECK *param, register MI_INFO *info, char * name, int rep_quick) { int error,got_error; @@ -1389,7 +1392,7 @@ int mi_repair(MI_CHECK *param, register MI_INFO *info, File new_file; MYISAM_SHARE *share=info->s; char llbuff[22],llbuff2[22]; - SORT_INFO sort_info; + MI_SORT_INFO sort_info; MI_SORT_PARAM sort_param; DBUG_ENTER("mi_repair"); @@ -1772,7 +1775,7 @@ int movepoint(register MI_INFO *info, uchar *record, my_off_t oldpos, /* Tell system that we want all memory for our cache */ -void lock_memory(MI_CHECK *param __attribute__((unused))) +void lock_memory(HA_CHECK *param __attribute__((unused))) { #ifdef SUN_OS /* Key-cacheing thrases on sun 4.1 */ if (param->opt_lock_memory) @@ -1788,7 +1791,7 @@ void lock_memory(MI_CHECK *param __attribute__((unused))) /* Flush all changed blocks to disk */ -int flush_blocks(MI_CHECK *param, KEY_CACHE *key_cache, File file) +int flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file) { if (flush_key_blocks(key_cache, file, FLUSH_RELEASE)) { @@ -1803,12 +1806,12 @@ int flush_blocks(MI_CHECK *param, KEY_CACHE *key_cache, File file) /* Sort index for more efficent reads */ -int mi_sort_index(MI_CHECK *param, register MI_INFO *info, char * name) +int mi_sort_index(HA_CHECK *param, register MI_INFO *info, char * name) { reg2 uint key; 
reg1 MI_KEYDEF *keyinfo; File new_file; - my_off_t index_pos[MI_MAX_POSSIBLE_KEY]; + my_off_t index_pos[HA_MAX_POSSIBLE_KEY]; uint r_locks,w_locks; int old_lock; MYISAM_SHARE *share=info->s; @@ -1903,12 +1906,12 @@ err2: /* Sort records recursive using one index */ -static int sort_one_index(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo, +static int sort_one_index(HA_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t pagepos, File new_file) { uint length,nod_flag,used_length, key_length; uchar *buff,*keypos,*endpos; - uchar key[MI_MAX_POSSIBLE_KEY_BUFF]; + uchar key[HA_MAX_POSSIBLE_KEY_BUFF]; my_off_t new_page_pos,next_page; char llbuff[22]; DBUG_ENTER("sort_one_index"); @@ -2023,7 +2026,7 @@ int change_to_newfile(const char * filename, const char * old_ext, /* Locks a whole file */ /* Gives an error-message if file can't be locked */ -int lock_file(MI_CHECK *param, File file, my_off_t start, int lock_type, +int lock_file(HA_CHECK *param, File file, my_off_t start, int lock_type, const char *filetype, const char *filename) { if (my_lock(file,lock_type,start,F_TO_EOF, @@ -2040,7 +2043,7 @@ int lock_file(MI_CHECK *param, File file, my_off_t start, int lock_type, /* Copy a block between two files */ -int filecopy(MI_CHECK *param, File to,File from,my_off_t start, +int filecopy(HA_CHECK *param, File to,File from,my_off_t start, my_off_t length, const char *type) { char tmp_buff[IO_SIZE],*buff; @@ -2091,7 +2094,7 @@ err: <>0 Error */ -int mi_repair_by_sort(MI_CHECK *param, register MI_INFO *info, +int mi_repair_by_sort(HA_CHECK *param, register MI_INFO *info, const char * name, int rep_quick) { int got_error; @@ -2105,7 +2108,7 @@ int mi_repair_by_sort(MI_CHECK *param, register MI_INFO *info, HA_KEYSEG *keyseg; ulong *rec_per_key_part; char llbuff[22]; - SORT_INFO sort_info; + MI_SORT_INFO sort_info; ulonglong key_map=share->state.key_map; DBUG_ENTER("mi_repair_by_sort"); @@ -2511,7 +2514,7 @@ err: <>0 Error */ -int mi_repair_parallel(MI_CHECK *param, 
register MI_INFO *info, +int mi_repair_parallel(HA_CHECK *param, register MI_INFO *info, const char * name, int rep_quick) { #ifndef THREAD @@ -2530,7 +2533,7 @@ int mi_repair_parallel(MI_CHECK *param, register MI_INFO *info, char llbuff[22]; IO_CACHE new_data_cache; /* For non-quick repair. */ IO_CACHE_SHARE io_share; - SORT_INFO sort_info; + MI_SORT_INFO sort_info; ulonglong key_map=share->state.key_map; pthread_attr_t thr_attr; DBUG_ENTER("mi_repair_parallel"); @@ -3009,7 +3012,7 @@ err: static int sort_key_read(MI_SORT_PARAM *sort_param, void *key) { int error; - SORT_INFO *sort_info=sort_param->sort_info; + MI_SORT_INFO *sort_info=sort_param->sort_info; MI_INFO *info=sort_info->info; DBUG_ENTER("sort_key_read"); @@ -3036,7 +3039,7 @@ static int sort_key_read(MI_SORT_PARAM *sort_param, void *key) static int sort_ft_key_read(MI_SORT_PARAM *sort_param, void *key) { int error; - SORT_INFO *sort_info=sort_param->sort_info; + MI_SORT_INFO *sort_info=sort_param->sort_info; MI_INFO *info=sort_info->info; FT_WORD *wptr=0; DBUG_ENTER("sort_ft_key_read"); @@ -3123,8 +3126,8 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param) my_off_t pos; uchar *to; MI_BLOCK_INFO block_info; - SORT_INFO *sort_info=sort_param->sort_info; - MI_CHECK *param=sort_info->param; + MI_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; MI_INFO *info=sort_info->info; MYISAM_SHARE *share=info->s; char llbuff[22],llbuff2[22]; @@ -3134,6 +3137,9 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param) DBUG_RETURN(1); switch (share->data_file_type) { + case BLOCK_RECORD: + DBUG_ASSERT(0); /* Impossible */ + break; case STATIC_RECORD: for (;;) { @@ -3549,8 +3555,8 @@ int sort_write_record(MI_SORT_PARAM *sort_param) ulong block_length,reclength; uchar *from; uchar block_buff[8]; - SORT_INFO *sort_info=sort_param->sort_info; - MI_CHECK *param=sort_info->param; + MI_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; MI_INFO 
*info=sort_info->info; MYISAM_SHARE *share=info->s; DBUG_ENTER("sort_write_record"); @@ -3558,6 +3564,9 @@ int sort_write_record(MI_SORT_PARAM *sort_param) if (sort_param->fix_datafile) { switch (sort_info->new_data_file_type) { + case BLOCK_RECORD: + DBUG_ASSERT(0); /* Impossible */ + break; case STATIC_RECORD: if (my_b_write(&info->rec_cache,sort_param->record, share->base.pack_reclength)) @@ -3576,7 +3585,7 @@ int sort_write_record(MI_SORT_PARAM *sort_param) { /* must be sure that local buffer is big enough */ reclength=info->s->base.pack_reclength+ - _my_calc_total_blob_length(info,sort_param->record)+ + _mi_calc_total_blob_length(info,sort_param->record)+ ALIGN_SIZE(MI_MAX_DYN_BLOCK_HEADER)+MI_SPLIT_LENGTH+ MI_DYN_DELETE_BLOCK_HEADER; if (sort_info->buff_length < reclength) @@ -3665,24 +3674,25 @@ static int sort_key_write(MI_SORT_PARAM *sort_param, const void *a) { uint diff_pos[2]; char llbuff[22],llbuff2[22]; - SORT_INFO *sort_info=sort_param->sort_info; - MI_CHECK *param= sort_info->param; + MI_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param= sort_info->param; int cmp; if (sort_info->key_block->inited) { - cmp=ha_key_cmp(sort_param->seg,sort_info->key_block->lastkey, + cmp=ha_key_cmp(sort_param->seg, (uchar*) sort_info->key_block->lastkey, (uchar*) a, USE_WHOLE_KEY,SEARCH_FIND | SEARCH_UPDATE, diff_pos); if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL) - ha_key_cmp(sort_param->seg,sort_info->key_block->lastkey, + ha_key_cmp(sort_param->seg, (uchar*) sort_info->key_block->lastkey, (uchar*) a, USE_WHOLE_KEY, SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diff_pos); else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS) { diff_pos[0]= mi_collect_stats_nonulls_next(sort_param->seg, sort_param->notnull, - sort_info->key_block->lastkey, + (uchar*) sort_info-> + key_block->lastkey, (uchar*)a); } sort_param->unique[diff_pos[0]-1]++; @@ -3705,8 +3715,8 @@ static int sort_key_write(MI_SORT_PARAM *sort_param, const void *a) 
llstr(sort_info->info->lastpos,llbuff), llstr(get_record_for_key(sort_info->info, sort_param->keyinfo, - sort_info->key_block-> - lastkey), + (uchar*) sort_info-> + key_block->lastkey), llbuff2)); param->testflag|=T_RETRY_WITHOUT_QUICK; if (sort_info->param->testflag & T_VERBOSE) @@ -3727,7 +3737,7 @@ static int sort_key_write(MI_SORT_PARAM *sort_param, const void *a) int sort_ft_buf_flush(MI_SORT_PARAM *sort_param) { - SORT_INFO *sort_info=sort_param->sort_info; + MI_SORT_INFO *sort_info=sort_param->sort_info; SORT_KEY_BLOCKS *key_block=sort_info->key_block; MYISAM_SHARE *share=sort_info->info->s; uint val_off, val_len; @@ -3737,19 +3747,19 @@ int sort_ft_buf_flush(MI_SORT_PARAM *sort_param) val_len=share->ft2_keyinfo.keylength; get_key_full_length_rdonly(val_off, ft_buf->lastkey); - to=ft_buf->lastkey+val_off; + to= (uchar*) ft_buf->lastkey+val_off; if (ft_buf->buf) { /* flushing first-level tree */ - error=sort_insert_key(sort_param,key_block,ft_buf->lastkey, + error=sort_insert_key(sort_param,key_block, (uchar*) ft_buf->lastkey, HA_OFFSET_ERROR); for (from=to+val_len; - !error && from < ft_buf->buf; + !error && from < (uchar*) ft_buf->buf; from+= val_len) { memcpy(to, from, val_len); - error=sort_insert_key(sort_param,key_block,ft_buf->lastkey, + error=sort_insert_key(sort_param,key_block, (uchar*) ft_buf->lastkey, HA_OFFSET_ERROR); } return error; @@ -3758,8 +3768,8 @@ int sort_ft_buf_flush(MI_SORT_PARAM *sort_param) error=flush_pending_blocks(sort_param); /* updating lastkey with second-level tree info */ ft_intXstore(ft_buf->lastkey+val_off, -ft_buf->count); - _mi_dpointer(sort_info->info, ft_buf->lastkey+val_off+HA_FT_WLEN, - share->state.key_root[sort_param->key]); + _mi_dpointer(sort_info->info, (uchar*) ft_buf->lastkey+val_off+HA_FT_WLEN, + share->state.key_root[sort_param->key]); /* restoring first level tree data in sort_info/sort_param */ sort_info->key_block=sort_info->key_block_end- sort_info->param->sort_key_blocks; 
sort_param->keyinfo=share->keyinfo+sort_param->key; @@ -3767,14 +3777,14 @@ int sort_ft_buf_flush(MI_SORT_PARAM *sort_param) /* writing lastkey in first-level tree */ return error ? error : sort_insert_key(sort_param,sort_info->key_block, - ft_buf->lastkey,HA_OFFSET_ERROR); + (uchar*) ft_buf->lastkey,HA_OFFSET_ERROR); } static int sort_ft_key_write(MI_SORT_PARAM *sort_param, const void *a) { uint a_len, val_off, val_len, error; uchar *p; - SORT_INFO *sort_info=sort_param->sort_info; + MI_SORT_INFO *sort_info=sort_param->sort_info; SORT_FT_BUF *ft_buf=sort_info->ft_buf; SORT_KEY_BLOCKS *key_block=sort_info->key_block; @@ -3804,9 +3814,9 @@ static int sort_ft_key_write(MI_SORT_PARAM *sort_param, const void *a) } get_key_full_length_rdonly(val_off, ft_buf->lastkey); - if (mi_compare_text(sort_param->seg->charset, + if (ha_compare_text(sort_param->seg->charset, ((uchar *)a)+1,a_len-1, - ft_buf->lastkey+1,val_off-1, 0, 0)==0) + (uchar*) ft_buf->lastkey+1,val_off-1, 0, 0)==0) { if (!ft_buf->buf) /* store in second-level tree */ { @@ -3822,16 +3832,16 @@ static int sort_ft_key_write(MI_SORT_PARAM *sort_param, const void *a) return 0; /* converting to two-level tree */ - p=ft_buf->lastkey+val_off; + p= (uchar*) ft_buf->lastkey+val_off; while (key_block->inited) key_block++; sort_info->key_block=key_block; sort_param->keyinfo=& sort_info->info->s->ft2_keyinfo; - ft_buf->count=(ft_buf->buf - p)/val_len; + ft_buf->count=((uchar*) ft_buf->buf - p)/val_len; /* flushing buffer to second-level tree */ - for (error=0; !error && p < ft_buf->buf; p+= val_len) + for (error=0; !error && p < (uchar*) ft_buf->buf; p+= val_len) error=sort_insert_key(sort_param,key_block,p,HA_OFFSET_ERROR); ft_buf->buf=0; return error; @@ -3879,13 +3889,13 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param, MI_KEY_PARAM s_temp; MI_INFO *info; MI_KEYDEF *keyinfo=sort_param->keyinfo; - SORT_INFO *sort_info= sort_param->sort_info; - MI_CHECK *param=sort_info->param; + MI_SORT_INFO *sort_info= 
sort_param->sort_info; + HA_CHECK *param=sort_info->param; DBUG_ENTER("sort_insert_key"); - anc_buff=key_block->buff; + anc_buff= (uchar*) key_block->buff; info=sort_info->info; - lastkey=key_block->lastkey; + lastkey= (uchar*) key_block->lastkey; nod_flag= (key_block == sort_info->key_block ? 0 : info->s->base.key_reflength); @@ -3898,7 +3908,7 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param, DBUG_RETURN(1); } a_length=2+nod_flag; - key_block->end_pos=anc_buff+2; + key_block->end_pos= (char*) anc_buff+2; lastkey=0; /* No previous key in block */ } else @@ -3906,18 +3916,18 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param, /* Save pointer to previous block */ if (nod_flag) - _mi_kpointer(info,key_block->end_pos,prev_block); + _mi_kpointer(info,(uchar*) key_block->end_pos,prev_block); t_length=(*keyinfo->pack_key)(keyinfo,nod_flag, (uchar*) 0,lastkey,lastkey,key, &s_temp); - (*keyinfo->store_key)(keyinfo, key_block->end_pos+nod_flag,&s_temp); + (*keyinfo->store_key)(keyinfo, (uchar*) key_block->end_pos+nod_flag,&s_temp); a_length+=t_length; mi_putint(anc_buff,a_length,nod_flag); key_block->end_pos+=t_length; if (a_length <= keyinfo->block_length) { - VOID(_mi_move_key(keyinfo,key_block->lastkey,key)); + VOID(_mi_move_key(keyinfo,(uchar*) key_block->lastkey,key)); key_block->last_length=a_length-t_length; DBUG_RETURN(0); } @@ -3942,7 +3952,8 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param, DBUG_DUMP("buff",(uchar*) anc_buff,mi_getint(anc_buff)); /* Write separator-key to block in next level */ - if (sort_insert_key(sort_param,key_block+1,key_block->lastkey,filepos)) + if (sort_insert_key(sort_param,key_block+1,(uchar*) key_block->lastkey, + filepos)) DBUG_RETURN(1); /* clear old block and write new key in it */ @@ -3958,8 +3969,8 @@ static int sort_delete_record(MI_SORT_PARAM *sort_param) uint i; int old_file,error; uchar *key; - SORT_INFO *sort_info=sort_param->sort_info; - MI_CHECK *param=sort_info->param; + MI_SORT_INFO 
*sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; MI_INFO *info=sort_info->info; DBUG_ENTER("sort_delete_record"); @@ -4015,7 +4026,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param) uint nod_flag,length; my_off_t filepos,key_file_length; SORT_KEY_BLOCKS *key_block; - SORT_INFO *sort_info= sort_param->sort_info; + MI_SORT_INFO *sort_info= sort_param->sort_info; myf myf_rw=sort_info->param->myf_rw; MI_INFO *info=sort_info->info; MI_KEYDEF *keyinfo=sort_param->keyinfo; @@ -4028,7 +4039,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param) key_block->inited=0; length=mi_getint(key_block->buff); if (nod_flag) - _mi_kpointer(info,key_block->end_pos,filepos); + _mi_kpointer(info,(uchar*) key_block->end_pos,filepos); key_file_length=info->state->key_file_length; bzero((uchar*) key_block->buff+length, keyinfo->block_length-length); if ((filepos=_mi_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR) @@ -4038,7 +4049,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param) if (key_file_length == info->state->key_file_length) { if (_mi_write_keypage(info, keyinfo, filepos, - DFLT_INIT_HITS, key_block->buff)) + DFLT_INIT_HITS, (uchar*) key_block->buff)) DBUG_RETURN(1); } else if (my_pwrite(info->s->kfile,(uchar*) key_block->buff, @@ -4053,7 +4064,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param) /* alloc space and pointers for key_blocks */ -static SORT_KEY_BLOCKS *alloc_key_blocks(MI_CHECK *param, uint blocks, +static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks, uint buffer_length) { reg1 uint i; @@ -4090,7 +4101,7 @@ int test_if_almost_full(MI_INFO *info) /* Recreate table with bigger more alloced record-data */ -int recreate_table(MI_CHECK *param, MI_INFO **org_info, char *filename) +int recreate_table(HA_CHECK *param, MI_INFO **org_info, char *filename) { int error; MI_INFO info; @@ -4263,7 +4274,7 @@ end: /* write suffix to data file if neaded */ -int write_data_suffix(SORT_INFO *sort_info, my_bool fix_datafile) 
+int write_data_suffix(MI_SORT_INFO *sort_info, my_bool fix_datafile) { MI_INFO *info=sort_info->info; @@ -4284,7 +4295,7 @@ int write_data_suffix(SORT_INFO *sort_info, my_bool fix_datafile) /* Update state and myisamchk_time of indexfile */ -int update_state_info(MI_CHECK *param, MI_INFO *info,uint update) +int update_state_info(HA_CHECK *param, MI_INFO *info,uint update) { MYISAM_SHARE *share=info->s; @@ -4356,7 +4367,7 @@ err: param->auto_increment is bigger than the biggest key. */ -void update_auto_increment_key(MI_CHECK *param, MI_INFO *info, +void update_auto_increment_key(HA_CHECK *param, MI_INFO *info, my_bool repair_only) { uchar *record; @@ -4589,7 +4600,7 @@ my_bool mi_test_if_sort_rep(MI_INFO *info, ha_rows rows, static void -set_data_file_type(SORT_INFO *sort_info, MYISAM_SHARE *share) +set_data_file_type(MI_SORT_INFO *sort_info, MYISAM_SHARE *share) { if ((sort_info->new_data_file_type=share->data_file_type) == COMPRESSED_RECORD && sort_info->param->testflag & T_UNPACK) diff --git a/storage/myisam/mi_checksum.c b/storage/myisam/mi_checksum.c index 4e87de373bd..1aa56e571e3 100644 --- a/storage/myisam/mi_checksum.c +++ b/storage/myisam/mi_checksum.c @@ -31,9 +31,9 @@ ha_checksum mi_checksum(MI_INFO *info, const uchar *buf) case FIELD_BLOB: { length=_mi_calc_blob_length(rec->length- - mi_portable_sizeof_char_ptr, + portable_sizeof_char_ptr, buf); - memcpy((char*) &pos, buf+rec->length- mi_portable_sizeof_char_ptr, + memcpy((char*) &pos, buf+rec->length- portable_sizeof_char_ptr, sizeof(char*)); break; } diff --git a/storage/myisam/mi_close.c b/storage/myisam/mi_close.c index 07105aea88d..747555dbdfb 100644 --- a/storage/myisam/mi_close.c +++ b/storage/myisam/mi_close.c @@ -75,6 +75,7 @@ int mi_close(register MI_INFO *info) not change the crashed state. We can NOT write the state in other cases as other threads may be using the file at this point + IF using --external-locking. 
*/ if (share->mode != O_RDONLY && mi_is_crashed(info)) mi_state_info_write(share->kfile, &share->state, 1); diff --git a/storage/myisam/mi_create.c b/storage/myisam/mi_create.c index 0cac5f08b3b..fc5b31e7689 100644 --- a/storage/myisam/mi_create.c +++ b/storage/myisam/mi_create.c @@ -17,6 +17,7 @@ #include "ftdefs.h" #include "sp_defs.h" +#include <my_bit.h> #if defined(MSDOS) || defined(__WIN__) #ifdef __WIN__ @@ -40,11 +41,11 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, File dfile,file; int errpos,save_errno, create_mode= O_RDWR | O_TRUNC; myf create_flag; - uint fields,length,max_key_length,packed,pointer,real_length_diff, + uint fields,length,max_key_length,packed,pack_bytes,pointer,real_length_diff, key_length,info_length,key_segs,options,min_key_length_skip, base_pos,long_varchar_count,varchar_length, max_key_block_length,unique_key_parts,fulltext_keys,offset; - uint aligned_key_start, block_length; + uint aligned_key_start, block_length, res; ulong reclength, real_reclength,min_pack_length; char filename[FN_REFLEN],linkname[FN_REFLEN], *linkname_ptr; ulong pack_reclength; @@ -56,7 +57,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, HA_KEYSEG *keyseg,tmp_keyseg; MI_COLUMNDEF *rec; ulong *rec_per_key_part; - my_off_t key_root[MI_MAX_POSSIBLE_KEY],key_del[MI_MAX_KEY_BLOCK_SIZE]; + my_off_t key_root[HA_MAX_POSSIBLE_KEY],key_del[MI_MAX_KEY_BLOCK_SIZE]; MI_CREATE_INFO tmp_create_info; DBUG_ENTER("mi_create"); DBUG_PRINT("enter", ("keys: %u columns: %u uniques: %u flags: %u", @@ -94,7 +95,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, ci->reloc_rows=ci->max_rows; /* Check if wrong parameter */ if (!(rec_per_key_part= - (ulong*) my_malloc((keys + uniques)*MI_MAX_KEY_SEG*sizeof(long), + (ulong*) my_malloc((keys + uniques)*HA_MAX_KEY_SEG*sizeof(long), MYF(MY_WME | MY_ZEROFILL)))) DBUG_RETURN(my_errno); @@ -116,10 +117,10 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, share.base.blobs++; if 
(pack_reclength != INT_MAX32) { - if (rec->length == 4+mi_portable_sizeof_char_ptr) + if (rec->length == 4+portable_sizeof_char_ptr) pack_reclength= INT_MAX32; else - pack_reclength+=(1 << ((rec->length-mi_portable_sizeof_char_ptr)*8)); /* Max blob length */ + pack_reclength+=(1 << ((rec->length-portable_sizeof_char_ptr)*8)); /* Max blob length */ } } else if (type == FIELD_SKIP_PRESPACE || @@ -192,11 +193,11 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, if (flags & HA_CREATE_RELIES_ON_SQL_LAYER) options|= HA_OPTION_RELIES_ON_SQL_LAYER; - packed=(packed+7)/8; + pack_bytes= (packed+7)/8; if (pack_reclength != INT_MAX32) pack_reclength+= reclength+packed + test(test_all_bits(options, HA_OPTION_CHECKSUM | HA_PACK_RECORD)); - min_pack_length+=packed; + min_pack_length+= pack_bytes; if (!ci->data_file_length && ci->max_rows) { @@ -273,7 +274,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, keyseg->type != HA_KEYTYPE_VARBINARY2) { my_errno=HA_WRONG_CREATE_OPTION; - goto err; + goto err_no_lock; } } keydef->keysegs+=sp_segs; @@ -282,7 +283,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, min_key_length_skip+=SPLEN*2*SPDIMS; #else my_errno= HA_ERR_UNSUPPORTED; - goto err; + goto err_no_lock; #endif /*HAVE_SPATIAL*/ } else if (keydef->flag & HA_FULLTEXT) @@ -298,7 +299,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, keyseg->type != HA_KEYTYPE_VARTEXT2) { my_errno=HA_WRONG_CREATE_OPTION; - goto err; + goto err_no_lock; } if (!(keyseg->flag & HA_BLOB_PART) && (keyseg->type == HA_KEYTYPE_VARTEXT1 || @@ -420,10 +421,10 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, } } /* if HA_FULLTEXT */ key_segs+=keydef->keysegs; - if (keydef->keysegs > MI_MAX_KEY_SEG) + if (keydef->keysegs > HA_MAX_KEY_SEG) { my_errno=HA_WRONG_CREATE_OPTION; - goto err; + goto err_no_lock; } /* key_segs may be 0 in the case when we only want to be able to @@ -435,7 +436,7 @@ int mi_create(const char *name,uint 
keys,MI_KEYDEF *keydefs, share.state.rec_per_key_part[key_segs-1]=1L; length+=key_length; /* Get block length for key, if defined by user */ - block_length= (keydef->block_length ? + block_length= (keydef->block_length ? my_round_up_to_next_power(keydef->block_length) : myisam_block_size); block_length= max(block_length, MI_MIN_KEY_BLOCK_LENGTH); @@ -445,10 +446,10 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, pointer,MI_MAX_KEYPTR_SIZE, block_length); if (keydef->block_length > MI_MAX_KEY_BLOCK_LENGTH || - length >= MI_MAX_KEY_BUFF) + length >= HA_MAX_KEY_BUFF) { my_errno=HA_WRONG_CREATE_OPTION; - goto err; + goto err_no_lock; } set_if_bigger(max_key_block_length,keydef->block_length); keydef->keylength= (uint16) key_length; @@ -495,7 +496,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, "indexes and/or unique constraints.", MYF(0), name + dirname_length(name)); my_errno= HA_WRONG_CREATE_OPTION; - goto err; + goto err_no_lock; } bmove(share.state.header.file_version,(uchar*) myisam_file_magic,4); @@ -550,9 +551,9 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, share.base.pack_reclength=reclength+ test(options & HA_OPTION_CHECKSUM); share.base.max_pack_length=pack_reclength; share.base.min_pack_length=min_pack_length; - share.base.pack_bits=packed; + share.base.pack_bits= pack_bytes; share.base.fields=fields; - share.base.pack_fields=packed; + share.base.pack_fields= packed; #ifdef USE_RAID share.base.raid_type=ci->raid_type; share.base.raid_chunks=ci->raid_chunks; @@ -826,13 +827,16 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, } errpos=0; pthread_mutex_unlock(&THR_LOCK_myisam); + res= 0; if (my_close(file,MYF(0))) - goto err; + res= my_errno; my_free((char*) rec_per_key_part,MYF(0)); - DBUG_RETURN(0); + DBUG_RETURN(res); err: pthread_mutex_unlock(&THR_LOCK_myisam); +err_no_lock: + save_errno=my_errno; switch (errpos) { case 3: diff --git a/storage/myisam/mi_dbug.c b/storage/myisam/mi_dbug.c 
index 07c314c43e6..0808a7e85dd 100644 --- a/storage/myisam/mi_dbug.c +++ b/storage/myisam/mi_dbug.c @@ -45,6 +45,7 @@ void _mi_print_key(FILE *stream, register HA_KEYSEG *keyseg, fprintf(stream,"NULL"); continue; } + end++; } switch (keyseg->type) { diff --git a/storage/myisam/mi_delete.c b/storage/myisam/mi_delete.c index 6fe31f30c19..88b31d616de 100644 --- a/storage/myisam/mi_delete.c +++ b/storage/myisam/mi_delete.c @@ -159,7 +159,7 @@ static int _mi_ck_real_delete(register MI_INFO *info, MI_KEYDEF *keyinfo, DBUG_RETURN(my_errno=HA_ERR_CRASHED); } if (!(root_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ - MI_MAX_KEY_BUFF*2))) + HA_MAX_KEY_BUFF*2))) { DBUG_PRINT("error",("Couldn't allocate memory")); DBUG_RETURN(my_errno=ENOMEM); @@ -221,7 +221,7 @@ static int d_search(register MI_INFO *info, register MI_KEYDEF *keyinfo, my_bool last_key; uchar *leaf_buff,*keypos; my_off_t leaf_page,next_block; - uchar lastkey[MI_MAX_KEY_BUFF]; + uchar lastkey[HA_MAX_KEY_BUFF]; DBUG_ENTER("d_search"); DBUG_DUMP("page",(uchar*) anc_buff,mi_getint(anc_buff)); @@ -306,7 +306,7 @@ static int d_search(register MI_INFO *info, register MI_KEYDEF *keyinfo, { leaf_page=_mi_kpos(nod_flag,keypos); if (!(leaf_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ - MI_MAX_KEY_BUFF*2))) + HA_MAX_KEY_BUFF*2))) { DBUG_PRINT("error",("Couldn't allocate memory")); my_errno=ENOMEM; @@ -365,9 +365,7 @@ static int d_search(register MI_INFO *info, register MI_KEYDEF *keyinfo, { /* This happens only with packed keys */ DBUG_PRINT("test",("Enlarging of key when deleting")); if (!_mi_get_last_key(info,keyinfo,anc_buff,lastkey,keypos,&length)) - { goto err; - } ret_value=_mi_insert(info,keyinfo,key,anc_buff,keypos,lastkey, (uchar*) 0,(uchar*) 0,(my_off_t) 0,(my_bool) 0); } @@ -405,7 +403,7 @@ static int del(register MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *key, int ret_value,length; uint a_length,nod_flag,tmp; my_off_t next_page; - uchar 
keybuff[MI_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key; + uchar keybuff[HA_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key; MYISAM_SHARE *share=info->s; MI_KEY_PARAM s_temp; DBUG_ENTER("del"); @@ -422,7 +420,7 @@ static int del(register MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *key, { next_page= _mi_kpos(nod_flag,endpos); if (!(next_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ - MI_MAX_KEY_BUFF*2))) + HA_MAX_KEY_BUFF*2))) DBUG_RETURN(-1); if (!_mi_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,next_buff,0)) ret_value= -1; @@ -509,7 +507,7 @@ static int underflow(register MI_INFO *info, register MI_KEYDEF *keyinfo, uint length,anc_length,buff_length,leaf_length,p_length,s_length,nod_flag, key_reflength,key_length; my_off_t next_page; - uchar anc_key[MI_MAX_KEY_BUFF],leaf_key[MI_MAX_KEY_BUFF], + uchar anc_key[HA_MAX_KEY_BUFF],leaf_key[HA_MAX_KEY_BUFF], *buff,*endpos,*next_keypos,*anc_pos,*half_pos,*temp_pos,*prev_key, *after_key; MI_KEY_PARAM s_temp; diff --git a/storage/myisam/mi_dynrec.c b/storage/myisam/mi_dynrec.c index cdd70abe9ad..2a12fd04641 100644 --- a/storage/myisam/mi_dynrec.c +++ b/storage/myisam/mi_dynrec.c @@ -252,7 +252,7 @@ int _mi_write_blob_record(MI_INFO *info, const uchar *record) extra= (ALIGN_SIZE(MI_MAX_DYN_BLOCK_HEADER)+MI_SPLIT_LENGTH+ MI_DYN_DELETE_BLOCK_HEADER+1); reclength= (info->s->base.pack_reclength + - _my_calc_total_blob_length(info,record)+ extra); + _mi_calc_total_blob_length(info,record)+ extra); #ifdef NOT_USED /* We now support big rows */ if (reclength > MI_DYN_MAX_ROW_LENGTH) { @@ -286,7 +286,7 @@ int _mi_update_blob_record(MI_INFO *info, my_off_t pos, const uchar *record) extra= (ALIGN_SIZE(MI_MAX_DYN_BLOCK_HEADER)+MI_SPLIT_LENGTH+ MI_DYN_DELETE_BLOCK_HEADER); reclength= (info->s->base.pack_reclength+ - _my_calc_total_blob_length(info,record)+ extra); + _mi_calc_total_blob_length(info,record)+ extra); #ifdef NOT_USED /* We now support big rows */ if (reclength > MI_DYN_MAX_ROW_LENGTH) 
{ @@ -901,7 +901,7 @@ uint _mi_rec_pack(MI_INFO *info, register uchar *to, else { char *temp_pos; - size_t tmp_length=length-mi_portable_sizeof_char_ptr; + size_t tmp_length=length-portable_sizeof_char_ptr; memcpy((uchar*) to,from,tmp_length); memcpy_fixed(&temp_pos,from+tmp_length,sizeof(char*)); memcpy(to+tmp_length,temp_pos,(size_t) blob->length); @@ -1022,11 +1022,11 @@ my_bool _mi_rec_check(MI_INFO *info,const uchar *record, uchar *rec_buff, if (type == FIELD_BLOB) { uint blob_length= - _mi_calc_blob_length(length-mi_portable_sizeof_char_ptr,record); + _mi_calc_blob_length(length-portable_sizeof_char_ptr,record); if (!blob_length && !(flag & bit)) goto err; if (blob_length) - to+=length - mi_portable_sizeof_char_ptr+ blob_length; + to+=length - portable_sizeof_char_ptr+ blob_length; } else if (type == FIELD_SKIP_ZERO) { @@ -1209,7 +1209,7 @@ ulong _mi_rec_unpack(register MI_INFO *info, register uchar *to, uchar *from, } else if (type == FIELD_BLOB) { - uint size_length=rec_length- mi_portable_sizeof_char_ptr; + uint size_length=rec_length- portable_sizeof_char_ptr; ulong blob_length=_mi_calc_blob_length(size_length,from); ulong from_left= (ulong) (from_end - from); if (from_left < size_length || @@ -1259,7 +1259,7 @@ err: /* Calc length of blob. 
Update info in blobs->length */ -ulong _my_calc_total_blob_length(MI_INFO *info, const uchar *record) +ulong _mi_calc_total_blob_length(MI_INFO *info, const uchar *record) { ulong length; MI_BLOB *blob,*end; @@ -1293,7 +1293,7 @@ ulong _mi_calc_blob_length(uint length, const uchar *pos) } -void _my_store_blob_length(uchar *pos,uint pack_length,uint length) +void _mi_store_blob_length(uchar *pos,uint pack_length,uint length) { switch (pack_length) { case 1: @@ -1506,7 +1506,7 @@ int _mi_cmp_dynamic_record(register MI_INFO *info, register const uchar *record) if (info->s->base.blobs) { if (!(buffer=(uchar*) my_alloca(info->s->base.pack_reclength+ - _my_calc_total_blob_length(info,record)))) + _mi_calc_total_blob_length(info,record)))) DBUG_RETURN(-1); } reclength=_mi_rec_pack(info,buffer,record); diff --git a/storage/myisam/mi_extra.c b/storage/myisam/mi_extra.c index 1b4c79d13de..33c9d1210ca 100644 --- a/storage/myisam/mi_extra.c +++ b/storage/myisam/mi_extra.c @@ -256,15 +256,16 @@ int mi_extra(MI_INFO *info, enum ha_extra_function function, void *extra_arg) share->last_version= 0L; /* Impossible version */ pthread_mutex_unlock(&THR_LOCK_myisam); break; - case HA_EXTRA_PREPARE_FOR_DELETE: + case HA_EXTRA_PREPARE_FOR_RENAME: + case HA_EXTRA_PREPARE_FOR_DROP: pthread_mutex_lock(&THR_LOCK_myisam); share->last_version= 0L; /* Impossible version */ #ifdef __WIN__REMOVE_OBSOLETE_WORKAROUND /* Close the isam and data files as Win32 can't drop an open table */ pthread_mutex_lock(&share->intern_lock); if (flush_key_blocks(share->key_cache, share->kfile, - (function == HA_EXTRA_FORCE_REOPEN ? - FLUSH_RELEASE : FLUSH_IGNORE_CHANGED))) + (function == HA_EXTRA_PREPARE_FOR_DROP ? 
+ FLUSH_IGNORE_CHANGED : FLUSH_RELEASE))) { error=my_errno; share->changed=1; diff --git a/storage/myisam/mi_key.c b/storage/myisam/mi_key.c index 3f445ebf44d..94f3f34ec58 100644 --- a/storage/myisam/mi_key.c +++ b/storage/myisam/mi_key.c @@ -426,7 +426,7 @@ static int _mi_put_key_in_record(register MI_INFO *info, uint keynr, /* The above changed info->lastkey2. Inform mi_rnext_same(). */ info->update&= ~HA_STATE_RNEXT_SAME; - _my_store_blob_length(record+keyseg->start, + _mi_store_blob_length(record+keyseg->start, (uint) keyseg->bit_start,length); key+=length; } diff --git a/storage/myisam/mi_locking.c b/storage/myisam/mi_locking.c index ec359d13a14..2d89fce2a81 100644 --- a/storage/myisam/mi_locking.c +++ b/storage/myisam/mi_locking.c @@ -56,9 +56,15 @@ int mi_lock_database(MI_INFO *info, int lock_type) case F_UNLCK: ftparser_call_deinitializer(info); if (info->lock_type == F_RDLCK) + { count= --share->r_locks; + mi_restore_status(info); + } else + { count= --share->w_locks; + mi_update_status(info); + } --share->tot_locks; if (info->lock_type == F_WRLCK && !share->w_locks && !share->delay_key_write && flush_key_blocks(share->key_cache, @@ -84,16 +90,16 @@ int mi_lock_database(MI_INFO *info, int lock_type) if (share->changed && !share->w_locks) { #ifdef HAVE_MMAP - if ((info->s->mmaped_length != info->s->state.state.data_file_length) && - (info->s->nonmmaped_inserts > MAX_NONMAPPED_INSERTS)) - { - if (info->s->concurrent_insert) - rw_wrlock(&info->s->mmap_lock); - mi_remap_file(info, info->s->state.state.data_file_length); - info->s->nonmmaped_inserts= 0; - if (info->s->concurrent_insert) - rw_unlock(&info->s->mmap_lock); - } + if ((info->s->mmaped_length != info->s->state.state.data_file_length) && + (info->s->nonmmaped_inserts > MAX_NONMAPPED_INSERTS)) + { + if (info->s->concurrent_insert) + rw_wrlock(&info->s->mmap_lock); + mi_remap_file(info, info->s->state.state.data_file_length); + info->s->nonmmaped_inserts= 0; + if (info->s->concurrent_insert) + 
rw_unlock(&info->s->mmap_lock); + } #endif share->state.process= share->last_process=share->this_process; share->state.unique= info->last_unique= info->this_unique; @@ -300,6 +306,7 @@ void mi_get_status(void* param, int concurrent_insert) void mi_update_status(void* param) { MI_INFO *info=(MI_INFO*) param; + DBUG_ENTER("mi_update_status"); /* Because someone may have closed the table we point at, we only update the state if its our own state. This isn't a problem as @@ -336,20 +343,32 @@ void mi_update_status(void* param) } info->opt_flag&= ~WRITE_CACHE_USED; } + DBUG_VOID_RETURN; } void mi_restore_status(void *param) { MI_INFO *info= (MI_INFO*) param; + DBUG_ENTER("mi_restore_status"); + DBUG_PRINT("info",("key_file: %ld data_file: %ld", + (long) info->s->state.state.key_file_length, + (long) info->s->state.state.data_file_length)); info->state= &info->s->state.state; info->append_insert_at_end= 0; + DBUG_VOID_RETURN; } void mi_copy_status(void* to,void *from) { - ((MI_INFO*) to)->state= &((MI_INFO*) from)->save_state; + MI_INFO *info= (MI_INFO*) to; + DBUG_ENTER("mi_copy_status"); + info->state= &((MI_INFO*) from)->save_state; + DBUG_PRINT("info",("key_file: %ld data_file: %ld", + (long) info->state->key_file_length, + (long) info->state->data_file_length)); + DBUG_VOID_RETURN; } @@ -377,17 +396,18 @@ void mi_copy_status(void* to,void *from) my_bool mi_check_status(void *param) { MI_INFO *info=(MI_INFO*) param; + DBUG_ENTER("mi_check_status"); + DBUG_PRINT("info",("dellink: %ld r_locks: %u w_locks: %u", + (long) info->s->state.dellink, (uint) info->s->r_locks, + (uint) info->s->w_locks)); /* The test for w_locks == 1 is here because this thread has already done an external lock (in other words: w_locks == 1 means no other threads has a write lock) */ - DBUG_PRINT("info",("dellink: %ld r_locks: %u w_locks: %u", - (long) info->s->state.dellink, (uint) info->s->r_locks, - (uint) info->s->w_locks)); - return (my_bool) !(info->s->state.dellink == HA_OFFSET_ERROR || + 
DBUG_RETURN((my_bool) !(info->s->state.dellink == HA_OFFSET_ERROR || (myisam_concurrent_insert == 2 && info->s->r_locks && - info->s->w_locks == 1)); + info->s->w_locks == 1))); } diff --git a/storage/myisam/mi_log.c b/storage/myisam/mi_log.c index 8b9ca038fec..982ba8b4367 100644 --- a/storage/myisam/mi_log.c +++ b/storage/myisam/mi_log.c @@ -133,7 +133,7 @@ void _myisam_log_record(enum myisam_log_commands command, MI_INFO *info, if (!info->s->base.blobs) length=info->s->base.reclength; else - length=info->s->base.reclength+ _my_calc_total_blob_length(info,record); + length=info->s->base.reclength+ _mi_calc_total_blob_length(info,record); buff[0]=(uchar) command; mi_int2store(buff+1,info->dfile); mi_int4store(buff+3,pid); diff --git a/storage/myisam/mi_open.c b/storage/myisam/mi_open.c index b848c822f75..b0cc2e54ca7 100644 --- a/storage/myisam/mi_open.c +++ b/storage/myisam/mi_open.c @@ -82,8 +82,8 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) uchar *disk_cache, *disk_pos, *end_pos; MI_INFO info,*m_info,*old_info; MYISAM_SHARE share_buff,*share; - ulong rec_per_key_part[MI_MAX_POSSIBLE_KEY*MI_MAX_KEY_SEG]; - my_off_t key_root[MI_MAX_POSSIBLE_KEY],key_del[MI_MAX_KEY_BLOCK_SIZE]; + ulong rec_per_key_part[HA_MAX_POSSIBLE_KEY*HA_MAX_KEY_SEG]; + my_off_t key_root[HA_MAX_POSSIBLE_KEY],key_del[MI_MAX_KEY_BLOCK_SIZE]; ulonglong max_key_file_length, max_data_file_length; DBUG_ENTER("mi_open"); @@ -105,7 +105,8 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) share_buff.state.key_root=key_root; share_buff.state.key_del=key_del; share_buff.key_cache= multi_key_cache_search((uchar*) name_buff, - strlen(name_buff)); + strlen(name_buff), + dflt_key_cache); DBUG_EXECUTE_IF("myisam_pretend_crashed_table_on_open", if (strstr(name, "/t1")) @@ -210,7 +211,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) DBUG_PRINT("warning",("saved_base_info_length: %d base_info_length: %d", len,MI_BASE_INFO_SIZE)); } - disk_pos= 
my_n_base_info_read(disk_cache + base_pos, &share->base); + disk_pos= mi_n_base_info_read(disk_cache + base_pos, &share->base); share->state.state_length=base_pos; if (!(open_flags & HA_OPEN_FOR_REPAIR) && @@ -235,8 +236,8 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) } key_parts+=fulltext_keys*FT_SEGS; - if (share->base.max_key_length > MI_MAX_KEY_BUFF || keys > MI_MAX_KEY || - key_parts > MI_MAX_KEY * MI_MAX_KEY_SEG) + if (share->base.max_key_length > HA_MAX_KEY_BUFF || keys > MI_MAX_KEY || + key_parts > MI_MAX_KEY * HA_MAX_KEY_SEG) { DBUG_PRINT("error",("Wrong key info: Max_key_length: %d keys: %d key_parts: %d", share->base.max_key_length, keys, key_parts)); my_errno=HA_ERR_UNSUPPORTED; @@ -452,7 +453,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) if (share->rec[i].type == (int) FIELD_BLOB) { share->blobs[j].pack_length= - share->rec[i].length-mi_portable_sizeof_char_ptr;; + share->rec[i].length-portable_sizeof_char_ptr;; share->blobs[j].offset=offset; j++; } @@ -1017,7 +1018,7 @@ uint mi_base_info_write(File file, MI_BASE_INFO *base) } -uchar *my_n_base_info_read(uchar *ptr, MI_BASE_INFO *base) +uchar *mi_n_base_info_read(uchar *ptr, MI_BASE_INFO *base) { base->keystart = mi_sizekorr(ptr); ptr +=8; base->max_data_file_length = mi_sizekorr(ptr); ptr +=8; diff --git a/storage/myisam/mi_packrec.c b/storage/myisam/mi_packrec.c index 305b7e5532c..a1bfb9e2c9b 100644 --- a/storage/myisam/mi_packrec.c +++ b/storage/myisam/mi_packrec.c @@ -105,6 +105,7 @@ static void init_bit_buffer(MI_BIT_BUFF *bit_buff,uchar *buffer,uint length); static uint fill_and_get_bits(MI_BIT_BUFF *bit_buff,uint count); static void fill_buffer(MI_BIT_BUFF *bit_buff); static uint max_bit(uint value); +static uint read_pack_length(uint version, const uchar *buf, ulong *length); #ifdef HAVE_MMAP static uchar *_mi_mempack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff, MI_BLOCK_INFO *info, uchar **rec_buff_p, @@ -1036,7 +1037,7 @@ static void 
uf_blob(MI_COLUMNDEF *rec, MI_BIT_BUFF *bit_buff, else { ulong length=get_bits(bit_buff,rec->space_length_bits); - uint pack_length=(uint) (end-to)-mi_portable_sizeof_char_ptr; + uint pack_length=(uint) (end-to)-portable_sizeof_char_ptr; if (bit_buff->blob_pos+length > bit_buff->blob_end) { bit_buff->error=1; @@ -1044,7 +1045,7 @@ static void uf_blob(MI_COLUMNDEF *rec, MI_BIT_BUFF *bit_buff, return; } decode_bytes(rec,bit_buff,bit_buff->blob_pos,bit_buff->blob_pos+length); - _my_store_blob_length((uchar*) to,pack_length,length); + _mi_store_blob_length((uchar*) to,pack_length,length); memcpy_fixed((char*) to+pack_length,(char*) &bit_buff->blob_pos, sizeof(char*)); bit_buff->blob_pos+=length; @@ -1625,7 +1626,7 @@ uint save_pack_length(uint version, uchar *block_buff, ulong length) } -uint read_pack_length(uint version, const uchar *buf, ulong *length) +static uint read_pack_length(uint version, const uchar *buf, ulong *length) { if (buf[0] < 254) { diff --git a/storage/myisam/mi_range.c b/storage/myisam/mi_range.c index 932a4abd1b3..8bd122c828a 100644 --- a/storage/myisam/mi_range.c +++ b/storage/myisam/mi_range.c @@ -260,7 +260,7 @@ static uint _mi_keynr(MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *page, uchar *keypos, uint *ret_max_key) { uint nod_flag,keynr,max_key; - uchar t_buff[MI_MAX_KEY_BUFF],*end; + uchar t_buff[HA_MAX_KEY_BUFF],*end; end= page+mi_getint(page); nod_flag=mi_test_if_nod(page); diff --git a/storage/myisam/mi_rkey.c b/storage/myisam/mi_rkey.c index f1d35810d36..f20b0366683 100644 --- a/storage/myisam/mi_rkey.c +++ b/storage/myisam/mi_rkey.c @@ -85,6 +85,8 @@ int mi_rkey(MI_INFO *info, uchar *buf, int inx, const uchar *key, { mi_print_error(info->s, HA_ERR_CRASHED); my_errno=HA_ERR_CRASHED; + if (share->concurrent_insert) + rw_unlock(&share->key_root_lock[inx]); goto err; } break; diff --git a/storage/myisam/mi_search.c b/storage/myisam/mi_search.c index 2195ac178dd..f4cac27a43f 100644 --- a/storage/myisam/mi_search.c +++ 
b/storage/myisam/mi_search.c @@ -60,7 +60,7 @@ int _mi_search(register MI_INFO *info, register MI_KEYDEF *keyinfo, int error,flag; uint nod_flag; uchar *keypos,*maxpos; - uchar lastkey[MI_MAX_KEY_BUFF],*buff; + uchar lastkey[HA_MAX_KEY_BUFF],*buff; DBUG_ENTER("_mi_search"); DBUG_PRINT("enter",("pos: %lu nextflag: %u lastpos: %lu", (ulong) pos, nextflag, (ulong) info->lastpos)); @@ -242,7 +242,7 @@ int _mi_seq_search(MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *page, { int flag; uint nod_flag,length,not_used[2]; - uchar t_buff[MI_MAX_KEY_BUFF],*end; + uchar t_buff[HA_MAX_KEY_BUFF],*end; DBUG_ENTER("_mi_seq_search"); LINT_INIT(flag); LINT_INIT(length); @@ -296,7 +296,7 @@ int _mi_prefix_search(MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *page, int key_len_skip, seg_len_pack, key_len_left; uchar *end, *kseg, *vseg; uchar *sort_order=keyinfo->seg->charset->sort_order; - uchar tt_buff[MI_MAX_KEY_BUFF+2], *t_buff=tt_buff+2; + uchar tt_buff[HA_MAX_KEY_BUFF+2], *t_buff=tt_buff+2; uchar *saved_from, *saved_to, *saved_vseg; uint saved_length=0, saved_prefix_len=0; uint length_pack; @@ -920,7 +920,7 @@ uint _mi_get_binary_pack_key(register MI_KEYDEF *keyinfo, uint nod_flag, DBUG_ENTER("_mi_get_binary_pack_key"); page= *page_pos; - page_end=page+MI_MAX_KEY_BUFF+1; + page_end=page+HA_MAX_KEY_BUFF+1; start_key=key; /* @@ -1238,7 +1238,7 @@ int _mi_search_next(register MI_INFO *info, register MI_KEYDEF *keyinfo, { int error; uint nod_flag; - uchar lastkey[MI_MAX_KEY_BUFF]; + uchar lastkey[HA_MAX_KEY_BUFF]; DBUG_ENTER("_mi_search_next"); DBUG_PRINT("enter",("nextflag: %u lastpos: %lu int_keypos: %lu", nextflag, (ulong) info->lastpos, diff --git a/storage/myisam/mi_test1.c b/storage/myisam/mi_test1.c index a68bcbed56c..8e491823939 100644 --- a/storage/myisam/mi_test1.c +++ b/storage/myisam/mi_test1.c @@ -71,14 +71,16 @@ static int run_test(const char *filename) /* First define 2 columns */ recinfo[0].type=FIELD_NORMAL; recinfo[0].length=1; /* For NULL bits */ 
recinfo[1].type=key_field; - recinfo[1].length= (key_field == FIELD_BLOB ? 4+mi_portable_sizeof_char_ptr : + recinfo[1].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr : key_length); if (key_field == FIELD_VARCHAR) recinfo[1].length+= HA_VARCHAR_PACKLENGTH(key_length);; recinfo[2].type=extra_field; - recinfo[2].length= (extra_field == FIELD_BLOB ? 4 + mi_portable_sizeof_char_ptr : 24); + recinfo[2].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : 24); if (extra_field == FIELD_VARCHAR) recinfo[2].length+= HA_VARCHAR_PACKLENGTH(recinfo[2].length); + recinfo[1].null_bit= null_fields ? 2 : 0; + if (opt_unique) { recinfo[3].type=FIELD_CHECK; @@ -630,7 +632,7 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)), key_type= HA_KEYTYPE_VARTEXT1; break; case 'k': - if (key_length < 4 || key_length > MI_MAX_KEY_LENGTH) + if (key_length < 4 || key_length > HA_MAX_KEY_LENGTH) { fprintf(stderr,"Wrong key length\n"); exit(1); diff --git a/storage/myisam/mi_test2.c b/storage/myisam/mi_test2.c index 902801b5e6e..fd8adeed1c5 100644 --- a/storage/myisam/mi_test2.c +++ b/storage/myisam/mi_test2.c @@ -26,6 +26,7 @@ #endif #include "myisamdef.h" #include <m_ctype.h> +#include <my_bit.h> #define STANDARD_LENGTH 37 #define MYISAM_KEYS 6 @@ -187,7 +188,7 @@ int main(int argc, char *argv[]) if (use_blob) { recinfo[6].type=FIELD_BLOB; - recinfo[6].length=4+mi_portable_sizeof_char_ptr; + recinfo[6].length=4+portable_sizeof_char_ptr; recinfo[6].null_bit=0; recinfo[6].null_pos=0; } @@ -605,7 +606,7 @@ int main(int argc, char *argv[]) if (mi_rsame(file,read_record2,(int) i)) goto err; if (bcmp(read_record,read_record2,reclength) != 0) { - printf("is_rsame didn't find same record\n"); + printf("mi_rsame didn't find same record\n"); goto end; } } @@ -779,8 +780,7 @@ int main(int argc, char *argv[]) { ulong blob_length,pos; uchar *ptr; - longget(blob_length,read_record+blob_pos+4); - ptr=(uchar*) blob_length; + memcpy_fixed(&ptr, 
read_record+blob_pos+4, sizeof(ptr)); longget(blob_length,read_record+blob_pos); for (pos=0 ; pos < blob_length ; pos++) { diff --git a/storage/myisam/mi_unique.c b/storage/myisam/mi_unique.c index e490fb683e4..02fcd9289dd 100644 --- a/storage/myisam/mi_unique.c +++ b/storage/myisam/mi_unique.c @@ -212,7 +212,7 @@ int mi_unique_comp(MI_UNIQUEDEF *def, const uchar *a, const uchar *b, if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 || type == HA_KEYTYPE_VARTEXT2) { - if (mi_compare_text(keyseg->charset, (uchar *) pos_a, a_length, + if (ha_compare_text(keyseg->charset, (uchar *) pos_a, a_length, (uchar *) pos_b, b_length, 0, 1)) return 1; } diff --git a/storage/myisam/mi_update.c b/storage/myisam/mi_update.c index 956334b7806..dc6a1659931 100644 --- a/storage/myisam/mi_update.c +++ b/storage/myisam/mi_update.c @@ -23,7 +23,7 @@ int mi_update(register MI_INFO *info, const uchar *oldrec, uchar *newrec) int flag,key_changed,save_errno; reg3 my_off_t pos; uint i; - uchar old_key[MI_MAX_KEY_BUFF],*new_key; + uchar old_key[HA_MAX_KEY_BUFF],*new_key; bool auto_key_changed=0; ulonglong changed; MYISAM_SHARE *share=info->s; diff --git a/storage/myisam/mi_write.c b/storage/myisam/mi_write.c index 70ba7a4588a..05372ad12e8 100644 --- a/storage/myisam/mi_write.c +++ b/storage/myisam/mi_write.c @@ -346,7 +346,7 @@ static int w_search(register MI_INFO *info, register MI_KEYDEF *keyinfo, int error,flag; uint nod_flag, search_key_length; uchar *temp_buff,*keypos; - uchar keybuff[MI_MAX_KEY_BUFF]; + uchar keybuff[HA_MAX_KEY_BUFF]; my_bool was_last_key; my_off_t next_page, dupp_key_pos; DBUG_ENTER("w_search"); @@ -354,7 +354,7 @@ static int w_search(register MI_INFO *info, register MI_KEYDEF *keyinfo, search_key_length= (comp_flag & SEARCH_FIND) ? 
key_length : USE_WHOLE_KEY; if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ - MI_MAX_KEY_BUFF*2))) + HA_MAX_KEY_BUFF*2))) DBUG_RETURN(-1); if (!_mi_fetch_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff,0)) goto err; @@ -545,7 +545,7 @@ int _mi_insert(register MI_INFO *info, register MI_KEYDEF *keyinfo, get_key_length(alen,a); DBUG_ASSERT(info->ft1_to_ft2==0); if (alen == blen && - mi_compare_text(keyinfo->seg->charset, a, alen, b, blen, 0, 0)==0) + ha_compare_text(keyinfo->seg->charset, a, alen, b, blen, 0, 0)==0) { /* yup. converting */ info->ft1_to_ft2=(DYNAMIC_ARRAY *) @@ -707,7 +707,7 @@ static uchar *_mi_find_last_pos(MI_KEYDEF *keyinfo, uchar *page, { uint keys,length,last_length,key_ref_length; uchar *end,*lastpos,*prevpos; - uchar key_buff[MI_MAX_KEY_BUFF]; + uchar key_buff[HA_MAX_KEY_BUFF]; DBUG_ENTER("_mi_find_last_pos"); key_ref_length=2; @@ -764,7 +764,7 @@ static int _mi_balance_page(register MI_INFO *info, MI_KEYDEF *keyinfo, length,keys; uchar *pos,*buff,*extra_buff; my_off_t next_page,new_pos; - uchar tmp_part_key[MI_MAX_KEY_BUFF]; + uchar tmp_part_key[HA_MAX_KEY_BUFF]; DBUG_ENTER("_mi_balance_page"); k_length=keyinfo->keylength; @@ -930,7 +930,7 @@ static int keys_free(uchar *key, TREE_FREE mode, bulk_insert_param *param) Probably I can use info->lastkey here, but I'm not sure, and to be safe I'd better use local lastkey. 
*/ - uchar lastkey[MI_MAX_KEY_BUFF]; + uchar lastkey[HA_MAX_KEY_BUFF]; uint keylen; MI_KEYDEF *keyinfo; diff --git a/storage/myisam/myisamchk.c b/storage/myisam/myisamchk.c index 567e1057e5d..80159518cd0 100644 --- a/storage/myisam/myisamchk.c +++ b/storage/myisam/myisamchk.c @@ -16,10 +16,10 @@ /* Describe, check and repair of MyISAM tables */ #include "fulltext.h" - #include <m_ctype.h> #include <stdarg.h> #include <my_getopt.h> +#include <my_bit.h> #ifdef HAVE_SYS_VADVICE_H #include <sys/vadvise.h> #endif @@ -67,9 +67,9 @@ static const char *myisam_stats_method_str="nulls_unequal"; static void get_options(int *argc,char * * *argv); static void print_version(void); static void usage(void); -static int myisamchk(MI_CHECK *param, char *filename); -static void descript(MI_CHECK *param, register MI_INFO *info, char * name); -static int mi_sort_records(MI_CHECK *param, register MI_INFO *info, +static int myisamchk(HA_CHECK *param, char *filename); +static void descript(HA_CHECK *param, register MI_INFO *info, char * name); +static int mi_sort_records(HA_CHECK *param, register MI_INFO *info, char * name, uint sort_key, my_bool write_info, my_bool update_index); static int sort_record_index(MI_SORT_PARAM *sort_param, MI_INFO *info, @@ -77,7 +77,7 @@ static int sort_record_index(MI_SORT_PARAM *sort_param, MI_INFO *info, my_off_t page,uchar *buff,uint sortkey, File new_file, my_bool update_index); -MI_CHECK check_param; +HA_CHECK check_param; /* Main program */ @@ -695,7 +695,7 @@ get_one_option(int optid, case OPT_STATS_METHOD: { int method; - enum_mi_stats_method method_conv; + enum_handler_stats_method method_conv; LINT_INIT(method_conv); myisam_stats_method_str= argument; if ((method=find_type(argument, &myisam_stats_method_typelib, 2)) <= 0) @@ -794,7 +794,7 @@ static void get_options(register int *argc,register char ***argv) /* Check table */ -static int myisamchk(MI_CHECK *param, char * filename) +static int myisamchk(HA_CHECK *param, char * filename) { int 
error,lock_type,recreate; int rep_quick= param->testflag & (T_QUICK | T_FORCE_UNIQUENESS); @@ -1199,7 +1199,7 @@ end2: /* Write info about table */ -static void descript(MI_CHECK *param, register MI_INFO *info, char * name) +static void descript(HA_CHECK *param, register MI_INFO *info, char * name) { uint key,keyseg_nr,field,start; reg3 MI_KEYDEF *keyinfo; @@ -1464,7 +1464,7 @@ static void descript(MI_CHECK *param, register MI_INFO *info, char * name) /* Sort records according to one key */ -static int mi_sort_records(MI_CHECK *param, +static int mi_sort_records(HA_CHECK *param, register MI_INFO *info, char * name, uint sort_key, my_bool write_info, @@ -1478,7 +1478,7 @@ static int mi_sort_records(MI_CHECK *param, ha_rows old_record_count; MYISAM_SHARE *share=info->s; char llbuff[22],llbuff2[22]; - SORT_INFO sort_info; + MI_SORT_INFO sort_info; MI_SORT_PARAM sort_param; DBUG_ENTER("sort_records"); @@ -1653,10 +1653,10 @@ static int sort_record_index(MI_SORT_PARAM *sort_param,MI_INFO *info, uint nod_flag,used_length,key_length; uchar *temp_buff,*keypos,*endpos; my_off_t next_page,rec_pos; - uchar lastkey[MI_MAX_KEY_BUFF]; + uchar lastkey[HA_MAX_KEY_BUFF]; char llbuff[22]; - SORT_INFO *sort_info= sort_param->sort_info; - MI_CHECK *param=sort_info->param; + MI_SORT_INFO *sort_info= sort_param->sort_info; + HA_CHECK *param=sort_info->param; DBUG_ENTER("sort_record_index"); nod_flag=mi_test_if_nod(buff); @@ -1744,7 +1744,7 @@ err: static int not_killed= 0; -volatile int *killed_ptr(MI_CHECK *param __attribute__((unused))) +volatile int *killed_ptr(HA_CHECK *param __attribute__((unused))) { return &not_killed; /* always NULL */ } @@ -1752,7 +1752,7 @@ volatile int *killed_ptr(HA_CHECK *param __attribute__((unused))) /* print warnings and errors */ /* VARARGS */ -void mi_check_print_info(MI_CHECK *param __attribute__((unused)), +void mi_check_print_info(HA_CHECK *param __attribute__((unused)), const char *fmt,...) 
{ va_list args; @@ -1765,7 +1765,7 @@ void mi_check_print_info(MI_CHECK *param __attribute__((unused)), /* VARARGS */ -void mi_check_print_warning(MI_CHECK *param, const char *fmt,...) +void mi_check_print_warning(HA_CHECK *param, const char *fmt,...) { va_list args; DBUG_ENTER("mi_check_print_warning"); @@ -1790,7 +1790,7 @@ void mi_check_print_warning(MI_CHECK *param, const char *fmt,...) /* VARARGS */ -void mi_check_print_error(MI_CHECK *param, const char *fmt,...) +void mi_check_print_error(HA_CHECK *param, const char *fmt,...) { va_list args; DBUG_ENTER("mi_check_print_error"); diff --git a/storage/myisam/myisamdef.h b/storage/myisam/myisamdef.h index 721d6b9f271..b0b6da03015 100644 --- a/storage/myisam/myisamdef.h +++ b/storage/myisam/myisamdef.h @@ -15,8 +15,8 @@ /* This file is included by all internal myisam files */ -#include "myisam.h" /* Structs & some defines */ -#include "myisampack.h" /* packing of keys */ +#include "myisam.h" /* Structs & some defines */ +#include "myisampack.h" /* packing of keys */ #include <my_tree.h> #ifdef THREAD #include <my_pthread.h> @@ -26,15 +26,16 @@ #endif #if defined(my_write) && !defined(MAP_TO_USE_RAID) -#undef my_write /* undef map from my_nosys; We need test-if-disk full */ +/* undef map from my_nosys; We need test-if-disk full */ +#undef my_write #endif typedef struct st_mi_status_info { - ha_rows records; /* Rows in table */ - ha_rows del; /* Removed rows */ - my_off_t empty; /* lost space in datafile */ - my_off_t key_empty; /* lost space in indexfile */ + ha_rows records; /* Rows in table */ + ha_rows del; /* Removed rows */ + my_off_t empty; /* lost space in datafile */ + my_off_t key_empty; /* lost space in indexfile */ my_off_t key_file_length; my_off_t data_file_length; ha_checksum checksum; @@ -42,347 +43,292 @@ typedef struct st_mi_status_info typedef struct st_mi_state_info { - struct { /* Fileheader */ + struct + { /* Fileheader */ uchar file_version[4]; uchar options[2]; uchar header_length[2]; uchar 
state_info_length[2]; uchar base_info_length[2]; uchar base_pos[2]; - uchar key_parts[2]; /* Key parts */ - uchar unique_key_parts[2]; /* Key parts + unique parts */ - uchar keys; /* number of keys in file */ - uchar uniques; /* number of UNIQUE definitions */ - uchar language; /* Language for indexes */ - uchar max_block_size_index; /* max keyblock size */ + uchar key_parts[2]; /* Key parts */ + uchar unique_key_parts[2]; /* Key parts + unique parts */ + uchar keys; /* number of keys in file */ + uchar uniques; /* number of UNIQUE definitions */ + uchar language; /* Language for indexes */ + uchar max_block_size_index; /* max keyblock size */ uchar fulltext_keys; uchar not_used; /* To align to 8 */ } header; MI_STATUS_INFO state; - ha_rows split; /* number of split blocks */ - my_off_t dellink; /* Link to next removed block */ + ha_rows split; /* number of split blocks */ + my_off_t dellink; /* Link to next removed block */ ulonglong auto_increment; - ulong process; /* process that updated table last */ - ulong unique; /* Unique number for this process */ - ulong update_count; /* Updated for each write lock */ + ulong process; /* process that updated table last */ + ulong unique; /* Unique number for this process */ + ulong update_count; /* Updated for each write lock */ ulong status; ulong *rec_per_key_part; - my_off_t *key_root; /* Start of key trees */ - my_off_t *key_del; /* delete links for trees */ - my_off_t rec_per_key_rows; /* Rows when calculating rec_per_key */ - - ulong sec_index_changed; /* Updated when new sec_index */ - ulong sec_index_used; /* which extra index are in use */ - ulonglong key_map; /* Which keys are in use */ ha_checksum checksum; /* Table checksum */ - ulong version; /* timestamp of create */ - time_t create_time; /* Time when created database */ - time_t recover_time; /* Time for last recover */ - time_t check_time; /* Time for last check */ - uint sortkey; /* sorted by this key (not used) */ + my_off_t *key_root; /* Start of key 
trees */ + my_off_t *key_del; /* delete links for trees */ + my_off_t rec_per_key_rows; /* Rows when calculating rec_per_key */ + + ulong sec_index_changed; /* Updated when new sec_index */ + ulong sec_index_used; /* which extra index are in use */ + ulonglong key_map; /* Which keys are in use */ + ulong version; /* timestamp of create */ + time_t create_time; /* Time when created database */ + time_t recover_time; /* Time for last recover */ + time_t check_time; /* Time for last check */ + uint sortkey; /* sorted by this key (not used) */ uint open_count; - uint8 changed; /* Changed since myisamchk */ + uint8 changed; /* Changed since myisamchk */ /* the following isn't saved on disk */ - uint state_diff_length; /* Should be 0 */ - uint state_length; /* Length of state header in file */ + uint state_diff_length; /* Should be 0 */ + uint state_length; /* Length of state header in file */ ulong *key_info; } MI_STATE_INFO; -#define MI_STATE_INFO_SIZE (24+14*8+7*4+2*2+8) -#define MI_STATE_KEY_SIZE 8 +#define MI_STATE_INFO_SIZE (24+14*8+7*4+2*2+8) +#define MI_STATE_KEY_SIZE 8 #define MI_STATE_KEYBLOCK_SIZE 8 -#define MI_STATE_KEYSEG_SIZE 4 -#define MI_STATE_EXTRA_SIZE ((MI_MAX_KEY+MI_MAX_KEY_BLOCK_SIZE)*MI_STATE_KEY_SIZE + MI_MAX_KEY*MI_MAX_KEY_SEG*MI_STATE_KEYSEG_SIZE) -#define MI_KEYDEF_SIZE (2+ 5*2) -#define MI_UNIQUEDEF_SIZE (2+1+1) -#define HA_KEYSEG_SIZE (6+ 2*2 + 4*2) -#define MI_COLUMNDEF_SIZE (2*3+1) -#define MI_BASE_INFO_SIZE (5*8 + 8*4 + 4 + 4*2 + 16) -#define MI_INDEX_BLOCK_MARGIN 16 /* Safety margin for .MYI tables */ +#define MI_STATE_KEYSEG_SIZE 4 +#define MI_STATE_EXTRA_SIZE ((MI_MAX_KEY+MI_MAX_KEY_BLOCK_SIZE)*MI_STATE_KEY_SIZE + MI_MAX_KEY*HA_MAX_KEY_SEG*MI_STATE_KEYSEG_SIZE) +#define MI_KEYDEF_SIZE (2+ 5*2) +#define MI_UNIQUEDEF_SIZE (2+1+1) +#define HA_KEYSEG_SIZE (6+ 2*2 + 4*2) +#define MI_COLUMNDEF_SIZE (2*3+1) +#define MI_BASE_INFO_SIZE (5*8 + 8*4 + 4 + 4*2 + 16) +#define MI_INDEX_BLOCK_MARGIN 16 /* Safety margin for .MYI tables */ typedef struct 
st_mi_base_info { - my_off_t keystart; /* Start of keys */ + my_off_t keystart; /* Start of keys */ my_off_t max_data_file_length; my_off_t max_key_file_length; my_off_t margin_key_file_length; - ha_rows records,reloc; /* Create information */ - ulong mean_row_length; /* Create information */ - ulong reclength; /* length of unpacked record */ - ulong pack_reclength; /* Length of full packed rec. */ + ha_rows records, reloc; /* Create information */ + ulong mean_row_length; /* Create information */ + ulong reclength; /* length of unpacked record */ + ulong pack_reclength; /* Length of full packed rec. */ ulong min_pack_length; - ulong max_pack_length; /* Max possibly length of packed rec.*/ + ulong max_pack_length; /* Max possibly length of packed rec.*/ ulong min_block_length; - ulong fields, /* fields in table */ - pack_fields; /* packed fields in table */ - uint rec_reflength; /* = 2-8 */ - uint key_reflength; /* = 2-8 */ - uint keys; /* same as in state.header */ - uint auto_key; /* Which key-1 is a auto key */ - uint blobs; /* Number of blobs */ - uint pack_bits; /* Length of packed bits */ - uint max_key_block_length; /* Max block length */ - uint max_key_length; /* Max key length */ + ulong fields, /* fields in table */ + pack_fields; /* packed fields in table */ + uint rec_reflength; /* = 2-8 */ + uint key_reflength; /* = 2-8 */ + uint keys; /* same as in state.header */ + uint auto_key; /* Which key-1 is a auto key */ + uint blobs; /* Number of blobs */ + uint pack_bits; /* Length of packed bits */ + uint max_key_block_length; /* Max block length */ + uint max_key_length; /* Max key length */ /* Extra allocation when using dynamic record format */ uint extra_alloc_bytes; uint extra_alloc_procent; /* Info about raid */ - uint raid_type,raid_chunks; + uint raid_type, raid_chunks; ulong raid_chunksize; /* The following are from the header */ - uint key_parts,all_key_parts; + uint key_parts, all_key_parts; } MI_BASE_INFO; - /* Structs used intern in database */ 
+ /* Structs used intern in database */ -typedef struct st_mi_blob /* Info of record */ +typedef struct st_mi_blob /* Info of record */ { - ulong offset; /* Offset to blob in record */ - uint pack_length; /* Type of packed length */ - ulong length; /* Calc:ed for each record */ + ulong offset; /* Offset to blob in record */ + uint pack_length; /* Type of packed length */ + ulong length; /* Calc:ed for each record */ } MI_BLOB; -typedef struct st_mi_isam_pack { +typedef struct st_mi_isam_pack +{ ulong header_length; uint ref_length; uchar version; } MI_PACK; -#define MAX_NONMAPPED_INSERTS 1000 +#define MAX_NONMAPPED_INSERTS 1000 -typedef struct st_mi_isam_share { /* Shared between opens */ +typedef struct st_mi_isam_share +{ /* Shared between opens */ MI_STATE_INFO state; MI_BASE_INFO base; - MI_KEYDEF ft2_keyinfo; /* Second-level ft-key definition */ - MI_KEYDEF *keyinfo; /* Key definitions */ - MI_UNIQUEDEF *uniqueinfo; /* unique definitions */ - HA_KEYSEG *keyparts; /* key part info */ - MI_COLUMNDEF *rec; /* Pointer to field information */ - MI_PACK pack; /* Data about packed records */ - MI_BLOB *blobs; /* Pointer to blobs */ - char *unique_file_name; /* realpath() of index file */ - char *data_file_name, /* Resolved path names from symlinks */ - *index_file_name; - uchar *file_map; /* mem-map of file if possible */ - KEY_CACHE *key_cache; /* ref to the current key cache */ + MI_KEYDEF ft2_keyinfo; /* Second-level ft-key definition */ + MI_KEYDEF *keyinfo; /* Key definitions */ + MI_UNIQUEDEF *uniqueinfo; /* unique definitions */ + HA_KEYSEG *keyparts; /* key part info */ + MI_COLUMNDEF *rec; /* Pointer to field information */ + MI_PACK pack; /* Data about packed records */ + MI_BLOB *blobs; /* Pointer to blobs */ + char *unique_file_name; /* realpath() of index file */ + char *data_file_name, /* Resolved path names from symlinks */ + *index_file_name; + uchar *file_map; /* mem-map of file if possible */ + KEY_CACHE *key_cache; /* ref to the current key cache 
*/ MI_DECODE_TREE *decode_trees; uint16 *decode_tables; - int (*read_record)(struct st_myisam_info*, my_off_t, uchar*); - int (*write_record)(struct st_myisam_info*, const uchar*); - int (*update_record)(struct st_myisam_info*, my_off_t, const uchar*); - int (*delete_record)(struct st_myisam_info*); - int (*read_rnd)(struct st_myisam_info*, uchar*, my_off_t, my_bool); - int (*compare_record)(struct st_myisam_info*, const uchar *); /* Function to use for a row checksum. */ - ha_checksum (*calc_checksum)(struct st_myisam_info*, const uchar *); - int (*compare_unique)(struct st_myisam_info*, MI_UNIQUEDEF *, - const uchar *record, my_off_t pos); - size_t (*file_read)(MI_INFO *, uchar *, size_t, my_off_t, myf); - size_t (*file_write)(MI_INFO *, const uchar *, size_t, my_off_t, myf); + int(*read_record) (struct st_myisam_info *, my_off_t, uchar*); + int(*write_record) (struct st_myisam_info *, const uchar*); + int(*update_record) (struct st_myisam_info *, my_off_t, const uchar*); + int(*delete_record) (struct st_myisam_info *); + int(*read_rnd) (struct st_myisam_info *, uchar*, my_off_t, my_bool); + int(*compare_record) (struct st_myisam_info *, const uchar*); + ha_checksum(*calc_checksum) (struct st_myisam_info *, const uchar*); + int(*compare_unique) (struct st_myisam_info *, MI_UNIQUEDEF *, + const uchar *record, my_off_t pos); + uint(*file_read) (MI_INFO *, uchar *, size_t, my_off_t, myf); + uint(*file_write) (MI_INFO *, const uchar *, size_t, my_off_t, myf); invalidator_by_filename invalidator; /* query cache invalidator */ - ulong this_process; /* processid */ - ulong last_process; /* For table-change-check */ - ulong last_version; /* Version on start */ - ulong options; /* Options used */ - ulong min_pack_length; /* Theese are used by packed data */ + ulong this_process; /* processid */ + ulong last_process; /* For table-change-check */ + ulong last_version; /* Version on start */ + ulong options; /* Options used */ + ulong min_pack_length; /* Theese are used by 
packed data */ ulong max_pack_length; ulong state_diff_length; - uint rec_reflength; /* rec_reflength in use now */ - uint unique_name_length; + uint rec_reflength; /* rec_reflength in use now */ + uint unique_name_length; uint32 ftparsers; /* Number of distinct ftparsers + 1 */ - File kfile; /* Shared keyfile */ - File data_file; /* Shared data file */ - int mode; /* mode of file on open */ - uint reopen; /* How many times reopened */ - uint w_locks,r_locks,tot_locks; /* Number of read/write locks */ - uint blocksize; /* blocksize of keyfile */ + File kfile; /* Shared keyfile */ + File data_file; /* Shared data file */ + int mode; /* mode of file on open */ + uint reopen; /* How many times reopened */ + uint w_locks, r_locks, tot_locks; /* Number of read/write locks */ + uint blocksize; /* blocksize of keyfile */ myf write_flag; enum data_file_type data_file_type; /* Below flag is needed to make log tables work with concurrent insert */ my_bool is_log_table; - my_bool changed, /* If changed since lock */ - global_changed, /* If changed since open */ - not_flushed, - temporary,delay_key_write, - concurrent_insert; + my_bool changed, /* If changed since lock */ + global_changed, /* If changed since open */ + not_flushed, temporary, delay_key_write, concurrent_insert; #ifdef THREAD THR_LOCK lock; - pthread_mutex_t intern_lock; /* Locking for use with _locking */ + pthread_mutex_t intern_lock; /* Locking for use with _locking */ rw_lock_t *key_root_lock; #endif my_off_t mmaped_length; - uint nonmmaped_inserts; /* counter of writing in non-mmaped - area */ + /* counter of writing in non-mmaped area */ + uint nonmmaped_inserts; rw_lock_t mmap_lock; } MYISAM_SHARE; -typedef uint mi_bit_type; - -typedef struct st_mi_bit_buff { /* Used for packing of record */ - mi_bit_type current_byte; - uint bits; - uchar *pos,*end,*blob_pos,*blob_end; - uint error; -} MI_BIT_BUFF; - -struct st_myisam_info { - MYISAM_SHARE *s; /* Shared between open:s */ - MI_STATUS_INFO 
*state,save_state; - MI_BLOB *blobs; /* Pointer to blobs */ - MI_BIT_BUFF bit_buff; +struct st_myisam_info +{ + MYISAM_SHARE *s; /* Shared between open:s */ + MI_STATUS_INFO *state, save_state; + MI_BLOB *blobs; /* Pointer to blobs */ + MI_BIT_BUFF bit_buff; /* accumulate indexfile changes between write's */ - TREE *bulk_insert; + TREE *bulk_insert; DYNAMIC_ARRAY *ft1_to_ft2; /* used only in ft1->ft2 conversion */ MEM_ROOT ft_memroot; /* used by the parser */ - MYSQL_FTPARSER_PARAM *ftparser_param; /* share info between init/deinit */ - char *filename; /* parameter to open filename */ - uchar *buff, /* Temp area for key */ - *lastkey,*lastkey2; /* Last used search key */ - uchar *first_mbr_key; /* Searhed spatial key */ - uchar *rec_buff; /* Tempbuff for recordpack */ - uchar *int_keypos, /* Save position for next/previous */ - *int_maxpos; /* -""- */ - uint int_nod_flag; /* -""- */ - uint32 int_keytree_version; /* -""- */ - int (*read_record)(struct st_myisam_info*, my_off_t, uchar*); + MYSQL_FTPARSER_PARAM *ftparser_param; /* share info between init/deinit */ + char *filename; /* parameter to open filename */ + uchar *buff, /* Temp area for key */ + *lastkey, *lastkey2; /* Last used search key */ + uchar *first_mbr_key; /* Searhed spatial key */ + uchar *rec_buff; /* Tempbuff for recordpack */ + uchar *int_keypos, /* Save position for next/previous */ + *int_maxpos; /* -""- */ + uint int_nod_flag; /* -""- */ + uint32 int_keytree_version; /* -""- */ + int(*read_record) (struct st_myisam_info *, my_off_t, uchar *); invalidator_by_filename invalidator; /* query cache invalidator */ - ulong this_unique; /* uniq filenumber or thread */ - ulong last_unique; /* last unique number */ - ulong this_loop; /* counter for this open */ - ulong last_loop; /* last used counter */ - my_off_t lastpos, /* Last record position */ - nextpos; /* Position to next record */ + ulong this_unique; /* uniq filenumber or thread */ + ulong last_unique; /* last unique number */ + ulong 
this_loop; /* counter for this open */ + ulong last_loop; /* last used counter */ + my_off_t lastpos, /* Last record position */ + nextpos; /* Position to next record */ my_off_t save_lastpos; - my_off_t pos; /* Intern variable */ - my_off_t last_keypage; /* Last key page read */ - my_off_t last_search_keypage; /* Last keypage when searching */ + my_off_t pos; /* Intern variable */ + my_off_t last_keypage; /* Last key page read */ + my_off_t last_search_keypage; /* Last keypage when searching */ my_off_t dupp_key_pos; ha_checksum checksum; /* Temp storage for row checksum */ - /* QQ: the folloing two xxx_length fields should be removed, - as they are not compatible with parallel repair */ - ulong packed_length,blob_length; /* Length of found, packed record */ - int dfile; /* The datafile */ - uint opt_flag; /* Optim. for space/speed */ - uint update; /* If file changed since open */ - int lastinx; /* Last used index */ - uint lastkey_length; /* Length of key in lastkey */ - uint last_rkey_length; /* Last length in mi_rkey() */ + /* + QQ: the folloing two xxx_length fields should be removed, + as they are not compatible with parallel repair + */ + ulong packed_length, blob_length; /* Length of found, packed record */ + int dfile; /* The datafile */ + uint opt_flag; /* Optim. 
for space/speed */ + uint update; /* If file changed since open */ + int lastinx; /* Last used index */ + uint lastkey_length; /* Length of key in lastkey */ + uint last_rkey_length; /* Last length in mi_rkey() */ enum ha_rkey_function last_key_func; /* CONTAIN, OVERLAP, etc */ - uint save_lastkey_length; - uint pack_key_length; /* For MYISAMMRG */ + uint save_lastkey_length; + uint pack_key_length; /* For MYISAMMRG */ uint16 last_used_keyseg; /* For MyISAMMRG */ - int errkey; /* Got last error on this key */ - int lock_type; /* How database was locked */ - int tmp_lock_type; /* When locked by readinfo */ - uint data_changed; /* Somebody has changed data */ - uint save_update; /* When using KEY_READ */ - int save_lastinx; - LIST open_list; - IO_CACHE rec_cache; /* When cacheing records */ - uint preload_buff_size; /* When preloading indexes */ - myf lock_wait; /* is 0 or MY_DONT_WAIT */ - my_bool was_locked; /* Was locked in panic */ - my_bool append_insert_at_end; /* Set if concurrent insert */ + int errkey; /* Got last error on this key */ + int lock_type; /* How database was locked */ + int tmp_lock_type; /* When locked by readinfo */ + uint data_changed; /* Somebody has changed data */ + uint save_update; /* When using KEY_READ */ + int save_lastinx; + LIST open_list; + IO_CACHE rec_cache; /* When cacheing records */ + uint preload_buff_size; /* When preloading indexes */ + myf lock_wait; /* is 0 or MY_DONT_WAIT */ + my_bool was_locked; /* Was locked in panic */ + my_bool append_insert_at_end; /* Set if concurrent insert */ my_bool quick_mode; - my_bool page_changed; /* If info->buff can't be used for rnext */ - my_bool buff_used; /* If info->buff has to be reread for rnext */ - my_bool once_flags; /* For MYISAMMRG */ + /* If info->buff can't be used for rnext */ + my_bool page_changed; + /* If info->buff has to be reread for rnext */ + my_bool buff_used; + my_bool once_flags; /* For MYISAMMRG */ #ifdef __WIN__ my_bool owned_by_merge; /* This MyISAM table is 
part of a merge union */ #endif #ifdef THREAD THR_LOCK_DATA lock; #endif - uchar *rtree_recursion_state; /* For RTREE */ - int rtree_recursion_depth; + uchar *rtree_recursion_state; /* For RTREE */ + int rtree_recursion_depth; }; -typedef struct st_buffpek { - my_off_t file_pos; /* Where we are in the sort file */ - uchar *base,*key; /* Key pointers */ - ha_rows count; /* Number of rows in table */ - ulong mem_count; /* numbers of keys in memory */ - ulong max_keys; /* Max keys in buffert */ -} BUFFPEK; - -typedef struct st_mi_sort_param -{ - pthread_t thr; - IO_CACHE read_cache, tempfile, tempfile_for_exceptions; - DYNAMIC_ARRAY buffpek; - MI_BIT_BUFF bit_buff; /* For parallel repair of packrec. */ - - /* - The next two are used to collect statistics, see update_key_parts for - description. - */ - ulonglong unique[MI_MAX_KEY_SEG+1]; - ulonglong notnull[MI_MAX_KEY_SEG+1]; - - my_off_t pos,max_pos,filepos,start_recpos; - uint key, key_length,real_key_length,sortbuff_size; - uint maxbuffers, keys, find_length, sort_keys_length; - my_bool fix_datafile, master; - my_bool calc_checksum; /* calculate table checksum */ - MI_KEYDEF *keyinfo; - HA_KEYSEG *seg; - SORT_INFO *sort_info; - uchar **sort_keys; - uchar *rec_buff; - void *wordlist, *wordptr; - MEM_ROOT wordroot; - uchar *record; - MY_TMPDIR *tmpdir; - int (*key_cmp)(struct st_mi_sort_param *, const void *, const void *); - int (*key_read)(struct st_mi_sort_param *,void *); - int (*key_write)(struct st_mi_sort_param *, const void *); - void (*lock_in_memory)(MI_CHECK *); - NEAR int (*write_keys)(struct st_mi_sort_param *, register uchar **, - uint , struct st_buffpek *, IO_CACHE *); - NEAR uint (*read_to_buffer)(IO_CACHE *,struct st_buffpek *, uint); - NEAR int (*write_key)(struct st_mi_sort_param *, IO_CACHE *,uchar *, - uint, uint); -} MI_SORT_PARAM; - - /* Some defines used by isam-funktions */ - -#define USE_WHOLE_KEY MI_MAX_KEY_BUFF*2 /* Use whole key in _mi_search() */ -#define F_EXTRA_LCK -1 - - /* bits in 
opt_flag */ -#define MEMMAP_USED 32 +#define USE_WHOLE_KEY HA_MAX_KEY_BUFF*2 /* Use whole key in _mi_search() */ +#define F_EXTRA_LCK -1 +/* bits in opt_flag */ +#define MEMMAP_USED 32 #define REMEMBER_OLD_POS 64 -#define WRITEINFO_UPDATE_KEYFILE 1 -#define WRITEINFO_NO_UNLOCK 2 +#define WRITEINFO_UPDATE_KEYFILE 1 +#define WRITEINFO_NO_UNLOCK 2 - /* once_flags */ +/* once_flags */ #define USE_PACKED_KEYS 1 #define RRND_PRESERVE_LASTINX 2 - /* bits in state.changed */ - -#define STATE_CHANGED 1 -#define STATE_CRASHED 2 +/* bits in state.changed */ +#define STATE_CHANGED 1 +#define STATE_CRASHED 2 #define STATE_CRASHED_ON_REPAIR 4 -#define STATE_NOT_ANALYZED 8 +#define STATE_NOT_ANALYZED 8 #define STATE_NOT_OPTIMIZED_KEYS 16 -#define STATE_NOT_SORTED_PAGES 32 - - /* options to mi_read_cache */ +#define STATE_NOT_SORTED_PAGES 32 -#define READING_NEXT 1 -#define READING_HEADER 2 +/* options to mi_read_cache */ +#define READING_NEXT 1 +#define READING_HEADER 2 -#define mi_getint(x) ((uint) mi_uint2korr(x) & 32767) +#define mi_getint(x) ((uint) mi_uint2korr(x) & 32767) #define mi_putint(x,y,nod) { uint16 boh=(nod ? (uint16) 32768 : 0) + (uint16) (y);\ - mi_int2store(x,boh); } + mi_int2store(x,boh); } #define mi_test_if_nod(x) (x[0] & 128 ? info->s->base.key_reflength : 0) #define mi_mark_crashed(x) do{(x)->s->state.changed|= STATE_CRASHED; \ DBUG_PRINT("error", ("Marked table crashed")); \ @@ -400,13 +346,6 @@ typedef struct st_mi_sort_param /* Functions to store length of space packed keys, VARCHAR or BLOB keys */ -#define store_key_length_inc(key,length) \ -{ if ((length) < 255) \ - { *(key)++=(length); } \ - else \ - { *(key)=255; mi_int2store((key)+1,(length)); (key)+=3; } \ -} - #define store_key_length(key,length) \ { if ((length) < 255) \ { *(key)=(length); } \ @@ -430,39 +369,39 @@ typedef struct st_mi_sort_param #define get_pack_length(length) ((length) >= 255 ? 
3 : 1) -#define MI_MIN_BLOCK_LENGTH 20 /* Because of delete-link */ -#define MI_EXTEND_BLOCK_LENGTH 20 /* Don't use to small record-blocks */ -#define MI_SPLIT_LENGTH ((MI_EXTEND_BLOCK_LENGTH+4)*2) -#define MI_MAX_DYN_BLOCK_HEADER 20 /* Max prefix of record-block */ +#define MI_MIN_BLOCK_LENGTH 20 /* Because of delete-link */ +#define MI_EXTEND_BLOCK_LENGTH 20 /* Don't use to small record-blocks */ +#define MI_SPLIT_LENGTH ((MI_EXTEND_BLOCK_LENGTH+4)*2) +#define MI_MAX_DYN_BLOCK_HEADER 20 /* Max prefix of record-block */ #define MI_BLOCK_INFO_HEADER_LENGTH 20 -#define MI_DYN_DELETE_BLOCK_HEADER 20 /* length of delete-block-header */ -#define MI_DYN_MAX_BLOCK_LENGTH ((1L << 24)-4L) -#define MI_DYN_MAX_ROW_LENGTH (MI_DYN_MAX_BLOCK_LENGTH - MI_SPLIT_LENGTH) -#define MI_DYN_ALIGN_SIZE 4 /* Align blocks on this */ -#define MI_MAX_DYN_HEADER_BYTE 13 /* max header byte for dynamic rows */ -#define MI_MAX_BLOCK_LENGTH ((((ulong) 1 << 24)-1) & (~ (ulong) (MI_DYN_ALIGN_SIZE-1))) +#define MI_DYN_DELETE_BLOCK_HEADER 20 /* length of delete-block-header */ +#define MI_DYN_MAX_BLOCK_LENGTH ((1L << 24)-4L) +#define MI_DYN_MAX_ROW_LENGTH (MI_DYN_MAX_BLOCK_LENGTH - MI_SPLIT_LENGTH) +#define MI_DYN_ALIGN_SIZE 4 /* Align blocks on this */ +#define MI_MAX_DYN_HEADER_BYTE 13 /* max header byte for dynamic rows */ +#define MI_MAX_BLOCK_LENGTH ((((ulong) 1 << 24)-1) & (~ (ulong) (MI_DYN_ALIGN_SIZE-1))) #define MI_REC_BUFF_OFFSET ALIGN_SIZE(MI_DYN_DELETE_BLOCK_HEADER+sizeof(uint32)) -#define MEMMAP_EXTRA_MARGIN 7 /* Write this as a suffix for file */ +#define MEMMAP_EXTRA_MARGIN 7 /* Write this as a suffix for file */ -#define PACK_TYPE_SELECTED 1 /* Bits in field->pack_type */ -#define PACK_TYPE_SPACE_FIELDS 2 -#define PACK_TYPE_ZERO_FILL 4 -#define MI_FOUND_WRONG_KEY 32738 /* Impossible value from ha_key_cmp */ +#define PACK_TYPE_SELECTED 1 /* Bits in field->pack_type */ +#define PACK_TYPE_SPACE_FIELDS 2 +#define PACK_TYPE_ZERO_FILL 4 +#define MI_FOUND_WRONG_KEY 32738 /* Impossible value 
from ha_key_cmp */ -#define MI_MAX_KEY_BLOCK_SIZE (MI_MAX_KEY_BLOCK_LENGTH/MI_MIN_KEY_BLOCK_LENGTH) +#define MI_MAX_KEY_BLOCK_SIZE (MI_MAX_KEY_BLOCK_LENGTH/MI_MIN_KEY_BLOCK_LENGTH) #define MI_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size) (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size)) -#define MI_MAX_KEYPTR_SIZE 5 /* For calculating block lengths */ -#define MI_MIN_KEYBLOCK_LENGTH 50 /* When to split delete blocks */ +#define MI_MAX_KEYPTR_SIZE 5 /* For calculating block lengths */ +#define MI_MIN_KEYBLOCK_LENGTH 50 /* When to split delete blocks */ -#define MI_MIN_SIZE_BULK_INSERT_TREE 16384 /* this is per key */ +#define MI_MIN_SIZE_BULK_INSERT_TREE 16384 /* this is per key */ #define MI_MIN_ROWS_TO_USE_BULK_INSERT 100 #define MI_MIN_ROWS_TO_DISABLE_INDEXES 100 #define MI_MIN_ROWS_TO_USE_WRITE_CACHE 10 /* The UNIQUE check is done with a hashed long key */ -#define MI_UNIQUE_HASH_TYPE HA_KEYTYPE_ULONG_INT +#define MI_UNIQUE_HASH_TYPE HA_KEYTYPE_ULONG_INT #define mi_unique_store(A,B) mi_int4store((A),(B)) #ifdef THREAD @@ -474,175 +413,182 @@ extern pthread_mutex_t THR_LOCK_myisam; #define rw_unlock(A) {} #endif - /* Some extern variables */ +/* Some extern variables */ extern LIST *myisam_open_list; -extern uchar NEAR myisam_file_magic[],NEAR myisam_pack_file_magic[]; -extern uint NEAR myisam_read_vec[],NEAR myisam_readnext_vec[]; +extern uchar NEAR myisam_file_magic[], NEAR myisam_pack_file_magic[]; +extern uint NEAR myisam_read_vec[], NEAR myisam_readnext_vec[]; extern uint myisam_quick_table_bits; extern File myisam_log_file; extern ulong myisam_pid; - /* This is used by _mi_calc_xxx_key_length och _mi_store_key */ +/* This is used by _mi_calc_xxx_key_length och _mi_store_key */ typedef struct st_mi_s_param { - uint ref_length,key_length, - n_ref_length, - n_length, - totlength, - part_of_prev_key,prev_length,pack_marker; - uchar *key, *prev_key,*next_key_pos; - bool store_not_null; + uint ref_length, 
key_length, + n_ref_length, + n_length, totlength, part_of_prev_key, prev_length, pack_marker; + uchar *key, *prev_key, *next_key_pos; + bool store_not_null; } MI_KEY_PARAM; - /* Prototypes for intern functions */ +/* Prototypes for intern functions */ -extern int _mi_read_dynamic_record(MI_INFO *info,my_off_t filepos,uchar *buf); -extern int _mi_write_dynamic_record(MI_INFO*, const uchar*); -extern int _mi_update_dynamic_record(MI_INFO*, my_off_t, const uchar*); +extern int _mi_read_dynamic_record(MI_INFO *info, my_off_t filepos, uchar *buf); +extern int _mi_write_dynamic_record(MI_INFO *, const uchar *); +extern int _mi_update_dynamic_record(MI_INFO *, my_off_t, const uchar *); extern int _mi_delete_dynamic_record(MI_INFO *info); -extern int _mi_cmp_dynamic_record(MI_INFO *info,const uchar *record); -extern int _mi_read_rnd_dynamic_record(MI_INFO *, uchar *,my_off_t, my_bool); -extern int _mi_write_blob_record(MI_INFO*, const uchar*); -extern int _mi_update_blob_record(MI_INFO*, my_off_t, const uchar*); -extern int _mi_read_static_record(MI_INFO *info, my_off_t filepos,uchar *buf); -extern int _mi_write_static_record(MI_INFO*, const uchar*); -extern int _mi_update_static_record(MI_INFO*, my_off_t, const uchar*); +extern int _mi_cmp_dynamic_record(MI_INFO *info, const uchar *record); +extern int _mi_read_rnd_dynamic_record(MI_INFO *, uchar *, my_off_t, my_bool); +extern int _mi_write_blob_record(MI_INFO *, const uchar *); +extern int _mi_update_blob_record(MI_INFO *, my_off_t, const uchar *); +extern int _mi_read_static_record(MI_INFO *info, my_off_t filepos, uchar *buf); +extern int _mi_write_static_record(MI_INFO *, const uchar *); +extern int _mi_update_static_record(MI_INFO *, my_off_t, const uchar *); extern int _mi_delete_static_record(MI_INFO *info); -extern int _mi_cmp_static_record(MI_INFO *info,const uchar *record); -extern int _mi_read_rnd_static_record(MI_INFO*, uchar *,my_off_t, my_bool); -extern int _mi_ck_write(MI_INFO *info,uint keynr,uchar 
*key,uint length); +extern int _mi_cmp_static_record(MI_INFO *info, const uchar *record); +extern int _mi_read_rnd_static_record(MI_INFO *, uchar *, my_off_t, my_bool); +extern int _mi_ck_write(MI_INFO *info, uint keynr, uchar *key, uint length); extern int _mi_ck_real_write_btree(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key, uint key_length, my_off_t *root, uint comp_flag); -extern int _mi_enlarge_root(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key, my_off_t *root); -extern int _mi_insert(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key, - uchar *anc_buff,uchar *key_pos,uchar *key_buff, - uchar *father_buff, uchar *father_keypos, - my_off_t father_page, my_bool insert_last); -extern int _mi_split_page(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key, - uchar *buff,uchar *key_buff, my_bool insert_last); -extern uchar *_mi_find_half_pos(uint nod_flag,MI_KEYDEF *keyinfo,uchar *page, - uchar *key,uint *return_key_length, - uchar **after_key); -extern int _mi_calc_static_key_length(MI_KEYDEF *keyinfo,uint nod_flag, - uchar *key_pos, uchar *org_key, - uchar *key_buff, - uchar *key, MI_KEY_PARAM *s_temp); -extern int _mi_calc_var_key_length(MI_KEYDEF *keyinfo,uint nod_flag, - uchar *key_pos, uchar *org_key, - uchar *key_buff, - uchar *key, MI_KEY_PARAM *s_temp); -extern int _mi_calc_var_pack_key_length(MI_KEYDEF *keyinfo,uint nod_flag, - uchar *key_pos, uchar *org_key, - uchar *prev_key, - uchar *key, MI_KEY_PARAM *s_temp); -extern int _mi_calc_bin_pack_key_length(MI_KEYDEF *keyinfo,uint nod_flag, - uchar *key_pos,uchar *org_key, - uchar *prev_key, - uchar *key, MI_KEY_PARAM *s_temp); -void _mi_store_static_key(MI_KEYDEF *keyinfo, uchar *key_pos, - MI_KEY_PARAM *s_temp); -void _mi_store_var_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos, - MI_KEY_PARAM *s_temp); +extern int _mi_enlarge_root(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key, + my_off_t *root); +extern int _mi_insert(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key, + uchar *anc_buff, uchar *key_pos, uchar *key_buff, + uchar 
*father_buff, uchar *father_keypos, + my_off_t father_page, my_bool insert_last); +extern int _mi_split_page(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key, + uchar *buff, uchar *key_buff, my_bool insert_last); +extern uchar *_mi_find_half_pos(uint nod_flag, MI_KEYDEF *keyinfo, + uchar *page, uchar *key, + uint *return_key_length, uchar ** after_key); +extern int _mi_calc_static_key_length(MI_KEYDEF *keyinfo, uint nod_flag, + uchar *key_pos, uchar *org_key, + uchar *key_buff, uchar *key, + MI_KEY_PARAM *s_temp); +extern int _mi_calc_var_key_length(MI_KEYDEF *keyinfo, uint nod_flag, + uchar *key_pos, uchar *org_key, + uchar *key_buff, uchar *key, + MI_KEY_PARAM *s_temp); +extern int _mi_calc_var_pack_key_length(MI_KEYDEF *keyinfo, uint nod_flag, + uchar *key_pos, uchar *org_key, + uchar *prev_key, uchar *key, + MI_KEY_PARAM *s_temp); +extern int _mi_calc_bin_pack_key_length(MI_KEYDEF *keyinfo, uint nod_flag, + uchar *key_pos, uchar *org_key, + uchar *prev_key, uchar *key, + MI_KEY_PARAM *s_temp); +void _mi_store_static_key(MI_KEYDEF *keyinfo, uchar *key_pos, + MI_KEY_PARAM *s_temp); +void _mi_store_var_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos, + MI_KEY_PARAM *s_temp); #ifdef NOT_USED -void _mi_store_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos, - MI_KEY_PARAM *s_temp); +void _mi_store_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos, + MI_KEY_PARAM *s_temp); #endif -void _mi_store_bin_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos, - MI_KEY_PARAM *s_temp); +void _mi_store_bin_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos, + MI_KEY_PARAM *s_temp); -extern int _mi_ck_delete(MI_INFO *info,uint keynr,uchar *key,uint key_length); -extern int _mi_readinfo(MI_INFO *info,int lock_flag,int check_keybuffer); -extern int _mi_writeinfo(MI_INFO *info,uint options); +extern int _mi_ck_delete(MI_INFO *info, uint keynr, uchar *key, + uint key_length); +extern int _mi_readinfo(MI_INFO *info, int lock_flag, int check_keybuffer); +extern int _mi_writeinfo(MI_INFO *info, uint options); extern 
int _mi_test_if_changed(MI_INFO *info); extern int _mi_mark_file_changed(MI_INFO *info); extern int _mi_decrement_open_count(MI_INFO *info); -extern int _mi_check_index(MI_INFO *info,int inx); -extern int _mi_search(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key,uint key_len, - uint nextflag,my_off_t pos); -extern int _mi_bin_search(struct st_myisam_info *info,MI_KEYDEF *keyinfo, - uchar *page,uchar *key,uint key_len,uint comp_flag, - uchar * *ret_pos,uchar *buff, my_bool *was_last_key); -extern int _mi_seq_search(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *page, - uchar *key,uint key_len,uint comp_flag, - uchar **ret_pos,uchar *buff, my_bool *was_last_key); -extern int _mi_prefix_search(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *page, - uchar *key,uint key_len,uint comp_flag, - uchar **ret_pos,uchar *buff, my_bool *was_last_key); -extern my_off_t _mi_kpos(uint nod_flag,uchar *after_key); -extern void _mi_kpointer(MI_INFO *info,uchar *buff,my_off_t pos); -extern my_off_t _mi_dpos(MI_INFO *info, uint nod_flag,uchar *after_key); +extern int _mi_check_index(MI_INFO *info, int inx); +extern int _mi_search(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key, + uint key_len, uint nextflag, my_off_t pos); +extern int _mi_bin_search(struct st_myisam_info *info, MI_KEYDEF *keyinfo, + uchar *page, uchar *key, uint key_len, + uint comp_flag, uchar **ret_pos, uchar *buff, + my_bool *was_last_key); +extern int _mi_seq_search(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *page, + uchar *key, uint key_len, uint comp_flag, + uchar ** ret_pos, uchar *buff, + my_bool *was_last_key); +extern int _mi_prefix_search(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *page, + uchar *key, uint key_len, uint comp_flag, + uchar ** ret_pos, uchar *buff, + my_bool *was_last_key); +extern my_off_t _mi_kpos(uint nod_flag, uchar *after_key); +extern void _mi_kpointer(MI_INFO *info, uchar *buff, my_off_t pos); +extern my_off_t _mi_dpos(MI_INFO *info, uint nod_flag, uchar *after_key); extern my_off_t _mi_rec_pos(MYISAM_SHARE 
*info, uchar *ptr); -extern void _mi_dpointer(MI_INFO *info, uchar *buff,my_off_t pos); -extern int ha_key_cmp(HA_KEYSEG *keyseg, uchar *a,uchar *b, - uint key_length,uint nextflag,uint *diff_length); -extern uint _mi_get_static_key(MI_KEYDEF *keyinfo,uint nod_flag,uchar * *page, - uchar *key); -extern uint _mi_get_pack_key(MI_KEYDEF *keyinfo,uint nod_flag,uchar * *page, - uchar *key); +extern void _mi_dpointer(MI_INFO *info, uchar *buff, my_off_t pos); +extern int ha_key_cmp(HA_KEYSEG *keyseg, uchar *a, uchar *b, + uint key_length, uint nextflag, uint *diff_length); +extern uint _mi_get_static_key(MI_KEYDEF *keyinfo, uint nod_flag, + uchar **page, uchar *key); +extern uint _mi_get_pack_key(MI_KEYDEF *keyinfo, uint nod_flag, uchar **page, + uchar *key); extern uint _mi_get_binary_pack_key(MI_KEYDEF *keyinfo, uint nod_flag, - uchar **page_pos, uchar *key); -extern uchar *_mi_get_last_key(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *keypos, - uchar *lastkey,uchar *endpos, - uint *return_key_length); + uchar ** page_pos, uchar *key); +extern uchar *_mi_get_last_key(MI_INFO *info, MI_KEYDEF *keyinfo, + uchar *keypos, uchar *lastkey, uchar *endpos, + uint *return_key_length); extern uchar *_mi_get_key(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *page, - uchar *key, uchar *keypos, uint *return_key_length); -extern uint _mi_keylength(MI_KEYDEF *keyinfo,uchar *key); + uchar *key, uchar *keypos, + uint *return_key_length); +extern uint _mi_keylength(MI_KEYDEF *keyinfo, uchar *key); extern uint _mi_keylength_part(MI_KEYDEF *keyinfo, register uchar *key, - HA_KEYSEG *end); -extern uchar *_mi_move_key(MI_KEYDEF *keyinfo,uchar *to,uchar *from); -extern int _mi_search_next(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key, - uint key_length,uint nextflag,my_off_t pos); -extern int _mi_search_first(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t pos); -extern int _mi_search_last(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t pos); -extern uchar *_mi_fetch_keypage(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t 
page, - int level,uchar *buff,int return_buffer); -extern int _mi_write_keypage(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t page, - int level, uchar *buff); -extern int _mi_dispose(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t pos, - int level); -extern my_off_t _mi_new(MI_INFO *info,MI_KEYDEF *keyinfo,int level); -extern uint _mi_make_key(MI_INFO *info,uint keynr,uchar *key, - const uchar *record,my_off_t filepos); -extern uint _mi_pack_key(register MI_INFO *info, uint keynr, uchar *key, + HA_KEYSEG *end); +extern uchar *_mi_move_key(MI_KEYDEF *keyinfo, uchar *to, uchar *from); +extern int _mi_search_next(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key, + uint key_length, uint nextflag, my_off_t pos); +extern int _mi_search_first(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t pos); +extern int _mi_search_last(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t pos); +extern uchar *_mi_fetch_keypage(MI_INFO *info, MI_KEYDEF *keyinfo, + my_off_t page, int level, uchar *buff, + int return_buffer); +extern int _mi_write_keypage(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t page, + int level, uchar *buff); +extern int _mi_dispose(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t pos, + int level); +extern my_off_t _mi_new(MI_INFO *info, MI_KEYDEF *keyinfo, int level); +extern uint _mi_make_key(MI_INFO *info, uint keynr, uchar *key, + const uchar *record, my_off_t filepos); +extern uint _mi_pack_key(MI_INFO *info, uint keynr, uchar *key, uchar *old, key_part_map keypart_map, - HA_KEYSEG **last_used_keyseg); -extern int _mi_read_key_record(MI_INFO *info,my_off_t filepos,uchar *buf); -extern int _mi_read_cache(IO_CACHE *info,uchar *buff,my_off_t pos, - uint length,int re_read_if_possibly); -extern ulonglong retrieve_auto_increment(MI_INFO *info,const uchar *record); + HA_KEYSEG ** last_used_keyseg); +extern int _mi_read_key_record(MI_INFO *info, my_off_t filepos, uchar *buf); +extern int _mi_read_cache(IO_CACHE *info, uchar *buff, my_off_t pos, + uint length, int re_read_if_possibly); +extern 
ulonglong retrieve_auto_increment(MI_INFO *info, const uchar *record); -extern uchar *mi_alloc_rec_buff(MI_INFO *,ulong, uchar**); +extern uchar *mi_alloc_rec_buff(MI_INFO *, ulong, uchar **); #define mi_get_rec_buff_ptr(info,buf) \ ((((info)->s->options & HA_OPTION_PACK_RECORD) && (buf)) ? \ (buf) - MI_REC_BUFF_OFFSET : (buf)) #define mi_get_rec_buff_len(info,buf) \ (*((uint32 *)(mi_get_rec_buff_ptr(info,buf)))) -extern ulong _mi_rec_unpack(MI_INFO *info,uchar *to,uchar *from, - ulong reclength); +extern ulong _mi_rec_unpack(MI_INFO *info, uchar *to, uchar *from, + ulong reclength); extern my_bool _mi_rec_check(MI_INFO *info,const uchar *record, uchar *packpos, ulong packed_length, my_bool with_checkum); -extern int _mi_write_part_record(MI_INFO *info,my_off_t filepos,ulong length, - my_off_t next_filepos,uchar **record, - ulong *reclength,int *flag); -extern void _mi_print_key(FILE *stream,HA_KEYSEG *keyseg,const uchar *key, - uint length); -extern my_bool _mi_read_pack_info(MI_INFO *info,pbool fix_keys); -extern int _mi_read_pack_record(MI_INFO *info,my_off_t filepos,uchar *buf); -extern int _mi_read_rnd_pack_record(MI_INFO*, uchar *,my_off_t, my_bool); +extern int _mi_write_part_record(MI_INFO *info, my_off_t filepos, ulong length, + my_off_t next_filepos, uchar ** record, + ulong *reclength, int *flag); +extern void _mi_print_key(FILE *stream, HA_KEYSEG *keyseg, const uchar *key, + uint length); +extern my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys); +extern int _mi_read_pack_record(MI_INFO *info, my_off_t filepos, uchar *buf); +extern int _mi_read_rnd_pack_record(MI_INFO *, uchar *, my_off_t, my_bool); extern int _mi_pack_rec_unpack(MI_INFO *info, MI_BIT_BUFF *bit_buff, uchar *to, uchar *from, ulong reclength); -extern ulonglong mi_safe_mul(ulonglong a,ulonglong b); +extern ulonglong mi_safe_mul(ulonglong a, ulonglong b); extern int _mi_ft_update(MI_INFO *info, uint keynr, uchar *keybuf, - const uchar *oldrec, const uchar *newrec, my_off_t pos); + 
const uchar *oldrec, const uchar *newrec, + my_off_t pos); struct st_sort_info; -typedef struct st_mi_block_info { /* Parameter to _mi_get_block_info */ +typedef struct st_mi_block_info /* Parameter to _mi_get_block_info */ +{ uchar header[MI_BLOCK_INFO_HEADER_LENGTH]; ulong rec_len; ulong data_len; @@ -655,35 +601,37 @@ typedef struct st_mi_block_info { /* Parameter to _mi_get_block_info */ uint offset; } MI_BLOCK_INFO; - /* bits in return from _mi_get_block_info */ - -#define BLOCK_FIRST 1 -#define BLOCK_LAST 2 -#define BLOCK_DELETED 4 -#define BLOCK_ERROR 8 /* Wrong data */ -#define BLOCK_SYNC_ERROR 16 /* Right data at wrong place */ -#define BLOCK_FATAL_ERROR 32 /* hardware-error */ - -#define NEED_MEM ((uint) 10*4*(IO_SIZE+32)+32) /* Nead for recursion */ -#define MAXERR 20 -#define BUFFERS_WHEN_SORTING 16 /* Alloc for sort-key-tree */ -#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE -#define INDEX_TMP_EXT ".TMM" -#define DATA_TMP_EXT ".TMD" - -#define UPDATE_TIME 1 -#define UPDATE_STAT 2 -#define UPDATE_SORT 4 -#define UPDATE_AUTO_INC 8 -#define UPDATE_OPEN_COUNT 16 - -#define USE_BUFFER_INIT (((1024L*512L-MALLOC_OVERHEAD)/IO_SIZE)*IO_SIZE) -#define READ_BUFFER_INIT (1024L*256L-MALLOC_OVERHEAD) -#define SORT_BUFFER_INIT (2048L*1024L-MALLOC_OVERHEAD) -#define MIN_SORT_BUFFER (4096-MALLOC_OVERHEAD) - -enum myisam_log_commands { - MI_LOG_OPEN,MI_LOG_WRITE,MI_LOG_UPDATE,MI_LOG_DELETE,MI_LOG_CLOSE,MI_LOG_EXTRA,MI_LOG_LOCK,MI_LOG_DELETE_ALL + /* bits in return from _mi_get_block_info */ + +#define BLOCK_FIRST 1 +#define BLOCK_LAST 2 +#define BLOCK_DELETED 4 +#define BLOCK_ERROR 8 /* Wrong data */ +#define BLOCK_SYNC_ERROR 16 /* Right data at wrong place */ +#define BLOCK_FATAL_ERROR 32 /* hardware-error */ + +#define NEED_MEM ((uint) 10*4*(IO_SIZE+32)+32) /* Nead for recursion */ +#define MAXERR 20 +#define BUFFERS_WHEN_SORTING 16 /* Alloc for sort-key-tree */ +#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE +#define INDEX_TMP_EXT ".TMM" +#define DATA_TMP_EXT ".TMD" + +#define 
UPDATE_TIME 1 +#define UPDATE_STAT 2 +#define UPDATE_SORT 4 +#define UPDATE_AUTO_INC 8 +#define UPDATE_OPEN_COUNT 16 + +#define USE_BUFFER_INIT (((1024L*512L-MALLOC_OVERHEAD)/IO_SIZE)*IO_SIZE) +#define READ_BUFFER_INIT (1024L*256L-MALLOC_OVERHEAD) +#define SORT_BUFFER_INIT (2048L*1024L-MALLOC_OVERHEAD) +#define MIN_SORT_BUFFER (4096-MALLOC_OVERHEAD) + +enum myisam_log_commands +{ + MI_LOG_OPEN, MI_LOG_WRITE, MI_LOG_UPDATE, MI_LOG_DELETE, MI_LOG_CLOSE, + MI_LOG_EXTRA, MI_LOG_LOCK, MI_LOG_DELETE_ALL }; #define myisam_log(a,b,c,d) if (myisam_log_file >= 0) _myisam_log(a,b,c,d) @@ -693,44 +641,42 @@ enum myisam_log_commands { #define fast_mi_writeinfo(INFO) if (!(INFO)->s->tot_locks) (void) _mi_writeinfo((INFO),0) #define fast_mi_readinfo(INFO) ((INFO)->lock_type == F_UNLCK) && _mi_readinfo((INFO),F_RDLCK,1) -#ifdef __cplusplus +#ifdef __cplusplus extern "C" { #endif - -extern uint _mi_get_block_info(MI_BLOCK_INFO *,File, my_off_t); -extern uint _mi_rec_pack(MI_INFO *info,uchar *to,const uchar *from); + extern uint _mi_get_block_info(MI_BLOCK_INFO *, File, my_off_t); +extern uint _mi_rec_pack(MI_INFO *info, uchar *to, const uchar *from); extern uint _mi_pack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff, MI_BLOCK_INFO *info, uchar **rec_buff_p, File file, my_off_t filepos); -extern void _my_store_blob_length(uchar *pos,uint pack_length,uint length); -extern void _myisam_log(enum myisam_log_commands command,MI_INFO *info, - const uchar *buffert,uint length); +extern void _mi_store_blob_length(uchar *pos, uint pack_length, uint length); +extern void _myisam_log(enum myisam_log_commands command, MI_INFO *info, + const uchar *buffert, uint length); extern void _myisam_log_command(enum myisam_log_commands command, - MI_INFO *info, const uchar *buffert, - uint length, int result); -extern void _myisam_log_record(enum myisam_log_commands command,MI_INFO *info, - const uchar *record,my_off_t filepos, - int result); + MI_INFO *info, const uchar *buffert, + uint length, 
int result); +extern void _myisam_log_record(enum myisam_log_commands command, MI_INFO *info, + const uchar *record, my_off_t filepos, + int result); extern void mi_report_error(int errcode, const char *file_name); extern my_bool _mi_memmap_file(MI_INFO *info); extern void _mi_unmap_file(MI_INFO *info); extern uint save_pack_length(uint version, uchar *block_buff, ulong length); -extern uint read_pack_length(uint version, const uchar *buf, ulong *length); extern uint calc_pack_length(uint version, ulong length); extern size_t mi_mmap_pread(MI_INFO *info, uchar *Buffer, - size_t Count, my_off_t offset, myf MyFlags); + uint Count, my_off_t offset, myf MyFlags); extern size_t mi_mmap_pwrite(MI_INFO *info, const uchar *Buffer, - size_t Count, my_off_t offset, myf MyFlags); + uint Count, my_off_t offset, myf MyFlags); extern size_t mi_nommap_pread(MI_INFO *info, uchar *Buffer, - size_t Count, my_off_t offset, myf MyFlags); + uint Count, my_off_t offset, myf MyFlags); extern size_t mi_nommap_pwrite(MI_INFO *info, const uchar *Buffer, - size_t Count, my_off_t offset, myf MyFlags); + uint Count, my_off_t offset, myf MyFlags); uint mi_state_info_write(File file, MI_STATE_INFO *state, uint pWrite); uchar *mi_state_info_read(uchar *ptr, MI_STATE_INFO *state); uint mi_state_info_read_dsk(File file, MI_STATE_INFO *state, my_bool pRead); uint mi_base_info_write(File file, MI_BASE_INFO *base); -uchar *my_n_base_info_read(uchar *ptr, MI_BASE_INFO *base); +uchar *mi_n_base_info_read(uchar *ptr, MI_BASE_INFO *base); int mi_keyseg_write(File file, const HA_KEYSEG *keyseg); uchar *mi_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg); uint mi_keydef_write(File file, MI_KEYDEF *keydef); @@ -742,23 +688,23 @@ uchar *mi_recinfo_read(uchar *ptr, MI_COLUMNDEF *recinfo); extern int mi_disable_indexes(MI_INFO *info); extern int mi_enable_indexes(MI_INFO *info); extern int mi_indexes_are_disabled(MI_INFO *info); -ulong _my_calc_total_blob_length(MI_INFO *info, const uchar *record); +ulong 
_mi_calc_total_blob_length(MI_INFO *info, const uchar *record); ha_checksum mi_checksum(MI_INFO *info, const uchar *buf); ha_checksum mi_static_checksum(MI_INFO *info, const uchar *buf); my_bool mi_check_unique(MI_INFO *info, MI_UNIQUEDEF *def, uchar *record, - ha_checksum unique_hash, my_off_t pos); + ha_checksum unique_hash, my_off_t pos); ha_checksum mi_unique_hash(MI_UNIQUEDEF *def, const uchar *buf); int _mi_cmp_static_unique(MI_INFO *info, MI_UNIQUEDEF *def, - const uchar *record, my_off_t pos); + const uchar *record, my_off_t pos); int _mi_cmp_dynamic_unique(MI_INFO *info, MI_UNIQUEDEF *def, - const uchar *record, my_off_t pos); + const uchar *record, my_off_t pos); int mi_unique_comp(MI_UNIQUEDEF *def, const uchar *a, const uchar *b, - my_bool null_are_equal); -void mi_get_status(void* param, int concurrent_insert); -void mi_update_status(void* param); -void mi_restore_status(void* param); -void mi_copy_status(void* to,void *from); -my_bool mi_check_status(void* param); + my_bool null_are_equal); +void mi_get_status(void *param, int concurrent_insert); +void mi_update_status(void *param); +void mi_restore_status(void *param); +void mi_copy_status(void *to, void *from); +my_bool mi_check_status(void *param); void mi_disable_non_unique_index(MI_INFO *info, ha_rows rows); extern MI_INFO *test_if_reopen(char *filename); @@ -770,22 +716,14 @@ my_bool mi_dynmap_file(MI_INFO *info, my_off_t size); void mi_remap_file(MI_INFO *info, my_off_t size); /* Functions needed by mi_check */ -volatile int *killed_ptr(MI_CHECK *param); -void mi_check_print_error _VARARGS((MI_CHECK *param, const char *fmt,...)); -void mi_check_print_warning _VARARGS((MI_CHECK *param, const char *fmt,...)); -void mi_check_print_info _VARARGS((MI_CHECK *param, const char *fmt,...)); -int flush_pending_blocks(MI_SORT_PARAM *param); -int sort_ft_buf_flush(MI_SORT_PARAM *sort_param); -int thr_write_keys(MI_SORT_PARAM *sort_param); +volatile int *killed_ptr(HA_CHECK *param); +void 
mi_check_print_error _VARARGS((HA_CHECK *param, const char *fmt, ...)); +void mi_check_print_warning _VARARGS((HA_CHECK *param, const char *fmt, ...)); +void mi_check_print_info _VARARGS((HA_CHECK *param, const char *fmt, ...)); #ifdef THREAD pthread_handler_t thr_find_all_keys(void *arg); #endif -int flush_blocks(MI_CHECK *param, KEY_CACHE *key_cache, File file); - -int sort_write_record(MI_SORT_PARAM *sort_param); -int _create_index_by_sort(MI_SORT_PARAM *info,my_bool no_messages, ulong); - +int flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file); #ifdef __cplusplus } #endif - diff --git a/storage/myisam/myisamlog.c b/storage/myisam/myisamlog.c index 6566a7a7a02..04c0d9543d7 100644 --- a/storage/myisam/myisamlog.c +++ b/storage/myisam/myisamlog.c @@ -808,7 +808,7 @@ static int find_record_with_key(struct file_info *file_info, uchar *record) { uint key; MI_INFO *info=file_info->isam; - uchar tmp_key[MI_MAX_KEY_BUFF]; + uchar tmp_key[HA_MAX_KEY_BUFF]; for (key=0 ; key < info->s->base.keys ; key++) { diff --git a/storage/myisam/myisampack.c b/storage/myisam/myisampack.c index 37428ddd279..841fb45c184 100644 --- a/storage/myisam/myisampack.c +++ b/storage/myisam/myisampack.c @@ -305,7 +305,7 @@ static void usage(void) puts("and you are welcome to modify and redistribute it under the GPL license\n"); puts("Pack a MyISAM-table to take much less space."); - puts("Keys are not updated, you must run myisamchk -rq on the datafile"); + puts("Keys are not updated, you must run myisamchk -rq on the index (.MYI) file"); puts("afterwards to update the keys."); puts("You should give the .MYI file as the filename argument."); @@ -1008,7 +1008,7 @@ static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts) /* Calculate pos, end_pos, and max_length for variable length fields. 
*/ if (count->field_type == FIELD_BLOB) { - uint field_length=count->field_length -mi_portable_sizeof_char_ptr; + uint field_length=count->field_length -portable_sizeof_char_ptr; ulong blob_length= _mi_calc_blob_length(field_length, start_pos); memcpy_fixed((char*) &pos, start_pos+field_length,sizeof(char*)); end_pos=pos+blob_length; @@ -2650,7 +2650,7 @@ static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts) case FIELD_BLOB: { ulong blob_length=_mi_calc_blob_length(field_length- - mi_portable_sizeof_char_ptr, + portable_sizeof_char_ptr, start_pos); /* Empty blobs are encoded with a single 1 bit. */ if (!blob_length) @@ -2667,7 +2667,7 @@ static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts) DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u", blob_length, count->length_bits)); write_bits(blob_length,count->length_bits); - memcpy_fixed(&blob,end_pos-mi_portable_sizeof_char_ptr, + memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr, sizeof(char*)); blob_end=blob+blob_length; /* Encode the blob bytes. 
*/ diff --git a/storage/myisam/rt_index.c b/storage/myisam/rt_index.c index 63ed60586d6..35a70b8c2bf 100644 --- a/storage/myisam/rt_index.c +++ b/storage/myisam/rt_index.c @@ -542,7 +542,7 @@ static int rtree_insert_req(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key, DBUG_ENTER("rtree_insert_req"); if (!(page_buf = (uchar*)my_alloca((uint)keyinfo->block_length + - MI_MAX_KEY_BUFF))) + HA_MAX_KEY_BUFF))) { my_errno = HA_ERR_OUT_OF_MEM; DBUG_RETURN(-1); /* purecov: inspected */ @@ -658,7 +658,7 @@ static int rtree_insert_level(MI_INFO *info, uint keynr, uchar *key, DBUG_PRINT("rtree", ("root was split, grow a new root")); if (!(new_root_buf = (uchar*)my_alloca((uint)keyinfo->block_length + - MI_MAX_KEY_BUFF))) + HA_MAX_KEY_BUFF))) { my_errno = HA_ERR_OUT_OF_MEM; DBUG_RETURN(-1); /* purecov: inspected */ diff --git a/storage/myisam/sort.c b/storage/myisam/sort.c index 2146a8d16cb..f11014570e6 100644 --- a/storage/myisam/sort.c +++ b/storage/myisam/sort.c @@ -15,7 +15,7 @@ /* Creates a index for a database by reading keys, sorting them and outputing - them in sorted order through SORT_INFO functions. + them in sorted order through MI_SORT_INFO functions. 
*/ #include "fulltext.h" @@ -487,8 +487,8 @@ ok: int thr_write_keys(MI_SORT_PARAM *sort_param) { - SORT_INFO *sort_info=sort_param->sort_info; - MI_CHECK *param=sort_info->param; + MI_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; ulong length, keys; ulong *rec_per_key_part=param->rec_per_key_part; int got_error=sort_info->got_error; @@ -918,7 +918,7 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file, for (buffpek= Fb ; buffpek <= Tb ; buffpek++) { count+= buffpek->count; - buffpek->base= strpos; + buffpek->base= (uchar*) strpos; buffpek->max_keys=maxcount; strpos+= (uint) (error=(int) info->read_to_buffer(from_file,buffpek, sort_length)); @@ -956,7 +956,7 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file, { if (!(error=(int) info->read_to_buffer(from_file,buffpek,sort_length))) { - uchar *base=buffpek->base; + uchar *base= buffpek->base; uint max_keys=buffpek->max_keys; VOID(queue_remove(&queue,0)); @@ -988,7 +988,7 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file, } } buffpek=(BUFFPEK*) queue_top(&queue); - buffpek->base=(uchar *) sort_keys; + buffpek->base= (uchar*) sort_keys; buffpek->max_keys=keys; do { @@ -1003,7 +1003,7 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file, else { register uchar *end; - strpos= buffpek->key; + strpos= (uchar*) buffpek->key; for (end=strpos+buffpek->mem_count*sort_length; strpos != end ; strpos+=sort_length) diff --git a/storage/myisam/sp_test.c b/storage/myisam/sp_test.c index dee32ba423e..f572c7ab19b 100644 --- a/storage/myisam/sp_test.c +++ b/storage/myisam/sp_test.c @@ -79,7 +79,7 @@ int run_test(const char *filename) /* Define spatial column */ recinfo[1].type=FIELD_BLOB; - recinfo[1].length=4 + mi_portable_sizeof_char_ptr; + recinfo[1].length=4 + portable_sizeof_char_ptr; diff --git a/storage/myisammrg/ha_myisammrg.cc b/storage/myisammrg/ha_myisammrg.cc index 8a914e8a2de..b8119466d11 100644 --- 
a/storage/myisammrg/ha_myisammrg.cc +++ b/storage/myisammrg/ha_myisammrg.cc @@ -48,9 +48,11 @@ static const char *ha_myisammrg_exts[] = { }; extern int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out, MI_COLUMNDEF **recinfo_out, uint *records_out); -extern int check_definition(MI_KEYDEF *t1_keyinfo, MI_COLUMNDEF *t1_recinfo, +extern int check_definition(MI_KEYDEF *t1_keyinfo, + MI_COLUMNDEF *t1_recinfo, uint t1_keys, uint t1_recs, - MI_KEYDEF *t2_keyinfo, MI_COLUMNDEF *t2_recinfo, + MI_KEYDEF *t2_keyinfo, + MI_COLUMNDEF *t2_recinfo, uint t2_keys, uint t2_recs, bool strict); static void split_file_name(const char *file_name, LEX_STRING *db, LEX_STRING *name); @@ -390,7 +392,8 @@ int ha_myisammrg::extra(enum ha_extra_function operation) /* As this is just a mapping, we don't have to force the underlying tables to be closed */ if (operation == HA_EXTRA_FORCE_REOPEN || - operation == HA_EXTRA_PREPARE_FOR_DELETE) + operation == HA_EXTRA_PREPARE_FOR_DROP || + operation == HA_EXTRA_PREPARE_FOR_RENAME) return 0; return myrg_extra(file,operation,0); } diff --git a/storage/myisammrg/ha_myisammrg.h b/storage/myisammrg/ha_myisammrg.h index 91aabe277f7..1207ca96851 100644 --- a/storage/myisammrg/ha_myisammrg.h +++ b/storage/myisammrg/ha_myisammrg.h @@ -47,8 +47,8 @@ class ha_myisammrg: public handler HA_READ_ORDER | HA_KEYREAD_ONLY); } uint max_supported_keys() const { return MI_MAX_KEY; } - uint max_supported_key_length() const { return MI_MAX_KEY_LENGTH; } - uint max_supported_key_part_length() const { return MI_MAX_KEY_LENGTH; } + uint max_supported_key_length() const { return HA_MAX_KEY_LENGTH; } + uint max_supported_key_part_length() const { return HA_MAX_KEY_LENGTH; } double scan_time() { return ulonglong2double(stats.data_file_length) / IO_SIZE + file->tables; } |