102 files changed, 3487 insertions, 623 deletions
diff --git a/.bzrignore b/.bzrignore index b988e51232f..c2d82b8a418 100644 --- a/.bzrignore +++ b/.bzrignore @@ -138,13 +138,19 @@ bdb/test/logtrack.list bdb/txn/txn_auto.c binary/* client/insert_test +client/log_event.cc +client/log_event.h +client/mf_iocache.c +client/mf_iocache.cc client/mysql client/mysqladmin +client/mysqlbinlog client/mysqlcheck client/mysqldump client/mysqlimport client/mysqlshow client/mysqltest +client/mysys_priv.h client/select_test client/thimble client/thread_test @@ -254,6 +260,7 @@ libmysqld/sql_string.cc libmysqld/sql_table.cc libmysqld/sql_test.cc libmysqld/sql_udf.cc +libmysqld/sql_union.cc libmysqld/sql_unions.cc libmysqld/sql_update.cc libmysqld/sql_yacc.cc @@ -290,6 +297,7 @@ mysql-test/r/*.reject mysql-test/r/rpl_log.eval mysql-test/share/mysql mysql-test/var/* +mysql.kdevprj mysql.proj mysqld.S mysqld.sym @@ -376,4 +384,3 @@ support-files/mysql.spec tags tmp/* vio/viotest-ssl -libmysqld/sql_union.cc diff --git a/Docs/Flags/indonesia.eps b/Docs/Flags/indonesia.eps new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/Docs/Flags/indonesia.eps diff --git a/Docs/Flags/indonesia.gif b/Docs/Flags/indonesia.gif Binary files differnew file mode 100644 index 00000000000..1c421df50ba --- /dev/null +++ b/Docs/Flags/indonesia.gif diff --git a/Docs/Flags/indonesia.txt b/Docs/Flags/indonesia.txt new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/Docs/Flags/indonesia.txt diff --git a/Docs/Flags/yugoslavia.eps b/Docs/Flags/yugoslavia.eps new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/Docs/Flags/yugoslavia.eps diff --git a/Docs/Flags/yugoslavia.gif b/Docs/Flags/yugoslavia.gif Binary files differnew file mode 100644 index 00000000000..650eac242d6 --- /dev/null +++ b/Docs/Flags/yugoslavia.gif diff --git a/Docs/Flags/yugoslavia.txt b/Docs/Flags/yugoslavia.txt new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/Docs/Flags/yugoslavia.txt diff --git a/Makefile.am b/Makefile.am index 7343f617449..70a8140eff5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -40,6 +40,7 @@ linked_include_sources: echo timestamp > linked_include_sources linked_client_sources: @linked_client_targets@ + cd client; $(MAKE) link_sources echo timestamp > linked_client_sources linked_libmysql_sources: diff --git a/client/Makefile.am b/client/Makefile.am index 52260780248..91e64123ed4 100644 --- a/client/Makefile.am +++ b/client/Makefile.am @@ -20,8 +20,9 @@ INCLUDES = -I$(srcdir)/../include $(openssl_includes) \ -I../include -I$(srcdir)/.. -I$(top_srcdir) \ -I.. 
LIBS = @CLIENT_LIBS@ -LDADD = @CLIENT_EXTRA_LDFLAGS@ ../libmysql/libmysqlclient.la -bin_PROGRAMS = mysql mysqladmin mysqlcheck mysqlshow mysqldump mysqlimport mysqltest +LDADD = @CLIENT_EXTRA_LDFLAGS@ ../libmysql/libmysqlclient.la +bin_PROGRAMS = mysql mysqladmin mysqlcheck mysqlshow \ + mysqldump mysqlimport mysqltest mysqlbinlog noinst_PROGRAMS = insert_test select_test thread_test noinst_HEADERS = sql_string.h completion_hash.h my_readline.h mysql_SOURCES = mysql.cc readline.cc sql_string.cc completion_hash.cc @@ -36,10 +37,24 @@ insert_test_DEPENDENCIES= $(LIBRARIES) $(pkglib_LTLIBRARIES) select_test_DEPENDENCIES= $(LIBRARIES) $(pkglib_LTLIBRARIES) mysqltest_SOURCES= mysqltest.c mysqltest_DEPENDENCIES= $(LIBRARIES) $(pkglib_LTLIBRARIES) +mysqlbinlog_SOURCES = mysqlbinlog.cc +mysqlbinlog_DEPENDENCIES= $(LIBRARIES) $(pkglib_LTLIBRARIES) +sql_src=log_event.h log_event.cc +mysys_src=mysys_priv.h # Fix for mit-threads DEFS = -DUNDEF_THREADS_HACK +link_sources: + for f in $(sql_src) ; do \ + rm -f $$f; \ + @LN_CP_F@ ../sql/$$f $$f; \ + done; \ + for f in $(mysys_src); do \ + rm -f $$f; \ + @LN_CP_F@ ../mysys/$$f $$f; \ + done; + thread_test.o: thread_test.c $(COMPILE) -c @MT_INCLUDES@ $(INCLUDES) $< diff --git a/sql/mysqlbinlog.cc b/client/mysqlbinlog.cc index 5edfe6e0591..a89b41fdfd3 100644 --- a/sql/mysqlbinlog.cc +++ b/client/mysqlbinlog.cc @@ -22,13 +22,19 @@ #include <my_sys.h> #include <getopt.h> #include <thr_alarm.h> -#define MYSQL_SERVER // We want the C++ version of net #include <mysql.h> #include "log_event.h" -#include "mini_client.h" #define CLIENT_CAPABILITIES (CLIENT_LONG_PASSWORD | CLIENT_LONG_FLAG | CLIENT_LOCAL_FILES) +extern "C" +{ + int simple_command(MYSQL *mysql,enum enum_server_command command, + const char *arg, + uint length, my_bool skipp_check); + int net_safe_read(MYSQL* mysql); +} + char server_version[SERVER_VERSION_LENGTH]; uint32 server_id = 0; @@ -108,7 +114,7 @@ static void die(const char* fmt, ...) 
static void print_version() { - printf("%s Ver 1.4 for %s at %s\n",my_progname,SYSTEM_TYPE, MACHINE_TYPE); + printf("%s Ver 1.5 for %s at %s\n",my_progname,SYSTEM_TYPE, MACHINE_TYPE); } @@ -248,12 +254,12 @@ static int parse_args(int *argc, char*** argv) static MYSQL* safe_connect() { - MYSQL *local_mysql = mc_mysql_init(NULL); + MYSQL *local_mysql = mysql_init(NULL); if(!local_mysql) - die("Failed on mc_mysql_init"); + die("Failed on mysql_init"); - if(!mc_mysql_connect(local_mysql, host, user, pass, 0, port, 0, 0)) - die("failed on connect: %s", mc_mysql_error(local_mysql)); + if(!mysql_real_connect(local_mysql, host, user, pass, 0, port, 0, 0)) + die("failed on connect: %s", mysql_error(local_mysql)); return local_mysql; } @@ -281,7 +287,7 @@ static void dump_remote_table(NET* net, const char* db, const char* table) *p++ = table_len; memcpy(p, table, table_len); - if(mc_simple_command(mysql, COM_TABLE_DUMP, buf, p - buf + table_len, 1)) + if(simple_command(mysql, COM_TABLE_DUMP, buf, p - buf + table_len, 1)) die("Error sending the table dump command"); for(;;) @@ -314,14 +320,14 @@ static void dump_remote_log_entries(const char* logname) len = (uint) strlen(logname); int4store(buf + 6, 0); memcpy(buf + 10, logname,len); - if(mc_simple_command(mysql, COM_BINLOG_DUMP, buf, len + 10, 1)) + if(simple_command(mysql, COM_BINLOG_DUMP, buf, len + 10, 1)) die("Error sending the log dump command"); for(;;) { - len = mc_net_safe_read(mysql); + len = net_safe_read(mysql); if (len == packet_error) - die("Error reading packet from server: %s", mc_mysql_error(mysql)); + die("Error reading packet from server: %s", mysql_error(mysql)); if(len == 1 && net->read_pos[0] == 254) break; // end of data DBUG_PRINT("info",( "len= %u, net->read_pos[5] = %d\n", @@ -391,7 +397,7 @@ static void dump_local_log_entries(const char* logname) char llbuff[21]; my_off_t old_off = my_b_tell(file); - Log_event* ev = Log_event::read_log_event(file, 0); + Log_event* ev = Log_event::read_log_event(file); if (!ev) { if (file->error) @@ -430,9 +436,6 @@ int main(int argc, char** argv) if(use_remote) { -#ifndef __WIN__ - init_thr_alarm(10); // need to do this manually -#endif mysql = safe_connect(); } @@ -457,7 +460,7 @@ int main(int argc, char** argv) if (result_file != stdout) my_fclose(result_file, MYF(0)); if (use_remote) - mc_mysql_close(mysql); + mysql_close(mysql); return 0; } diff --git a/client/mysqlimport.c b/client/mysqlimport.c index 3672edd62e5..79f0a8d584e 100644 --- a/client/mysqlimport.c +++ b/client/mysqlimport.c @@ -48,17 +48,19 @@ static MYSQL mysql_connection; static char *opt_password=0, *current_user=0, *current_host=0, *current_db=0, *fields_terminated=0, *lines_terminated=0, *enclosed=0, *opt_enclosed=0, - *escaped=0, opt_low_priority=0, *opt_columns=0; + *escaped=0, opt_low_priority=0, *opt_columns=0, + *default_charset; static uint opt_mysql_port=0; static my_string opt_mysql_unix_port=0; #include "sslopt-vars.h" enum options {OPT_FTB=256, OPT_LTB, OPT_ENC, OPT_O_ENC, OPT_ESC, - OPT_LOW_PRIORITY, OPT_CHARSETS_DIR}; + OPT_LOW_PRIORITY, OPT_CHARSETS_DIR, OPT_DEFAULT_CHARSET}; static struct option long_options[] = { {"character-sets-dir", required_argument, 0, OPT_CHARSETS_DIR}, + {"default-character-set", required_argument, 0, OPT_DEFAULT_CHARSET}, {"columns", required_argument, 0, 'c'}, {"compress", no_argument, 0, 'C'}, {"debug", optional_argument, 0, '#'}, @@ -119,6 +121,8 @@ file. The SQL command 'LOAD DATA INFILE' is used to import the rows.\n"); printf("\n\ -#, --debug[=...] Output debug log. 
Often this is 'd:t:o,filename`\n\ -?, --help Displays this help and exits.\n\ + --default-character-set=...\n\ + Set the default character set.\n\ --character-sets-dir=...\n\ Directory where character sets are\n\ -c, --columns=... Use only these columns to import the data to.\n\ @@ -179,6 +183,9 @@ static int get_options(int *argc, char ***argv) case 'C': opt_compress=1; break; + case OPT_DEFAULT_CHARSET: + default_charset= optarg; + break; case OPT_CHARSETS_DIR: charsets_dir= optarg; break; @@ -269,6 +276,11 @@ static int get_options(int *argc, char ***argv) fprintf(stderr, "You can't use --ignore (-i) and --replace (-r) at the same time.\n"); return(1); } + if (default_charset) + { + if (set_default_charset_by_name(default_charset, MYF(MY_WME))) + exit(1); + } (*argc)-=optind; (*argv)+=optind; if (*argc < 2) diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c index 2507f805cd6..af2029bf1e8 100644 --- a/innobase/btr/btr0btr.c +++ b/innobase/btr/btr0btr.c @@ -71,30 +71,6 @@ btr_page_create( dict_tree_t* tree, /* in: index tree */ mtr_t* mtr); /* in: mtr */ /****************************************************************** -Allocates a new file page to be used in an index tree. */ -static -page_t* -btr_page_alloc( -/*===========*/ - /* out: new allocated page, - x-latched */ - dict_tree_t* tree, /* in: index tree */ - ulint hint_page_no, /* in: hint of a good page */ - byte file_direction, /* in: direction where a possible - page split is made */ - ulint level, /* in: level where the page is placed - in the tree */ - mtr_t* mtr); /* in: mtr */ -/****************************************************************** -Frees a file page used in an index tree. */ -static -void -btr_page_free( -/*==========*/ - dict_tree_t* tree, /* in: index tree */ - page_t* page, /* in, own: page to be freed */ - mtr_t* mtr); /* in: mtr */ -/****************************************************************** Sets the child node file address in a node pointer. */ UNIV_INLINE void @@ -319,11 +295,12 @@ btr_page_alloc_for_ibuf( /****************************************************************** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! */ -static + page_t* btr_page_alloc( /*===========*/ - /* out: new allocated page, x-latched */ + /* out: new allocated page, x-latched; + NULL if out of space */ dict_tree_t* tree, /* in: index tree */ ulint hint_page_no, /* in: hint of a good page */ byte file_direction, /* in: direction where a possible @@ -358,7 +335,10 @@ btr_page_alloc( new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no, file_direction, TRUE, mtr); - ut_a(new_page_no != FIL_NULL); + if (new_page_no == FIL_NULL) { + + return(NULL); + } new_page = buf_page_get(dict_tree_get_space(tree), new_page_no, RW_X_LATCH, mtr); @@ -435,20 +415,22 @@ btr_page_free_for_ibuf( } /****************************************************************** -Frees a file page used in an index tree. */ -static +Frees a file page used in an index tree. Can be used also to (BLOB) +external storage pages, because the page level 0 can be given as an +argument. 
*/ + void -btr_page_free( -/*==========*/ +btr_page_free_low( +/*==============*/ dict_tree_t* tree, /* in: index tree */ page_t* page, /* in: page to be freed, x-latched */ + ulint level, /* in: page level */ mtr_t* mtr) /* in: mtr */ { fseg_header_t* seg_header; page_t* root; ulint space; ulint page_no; - ulint level; ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); @@ -465,8 +447,6 @@ btr_page_free( } root = btr_root_get(tree, mtr); - - level = btr_page_get_level(page, mtr); if (level == 0) { seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; @@ -481,6 +461,26 @@ btr_page_free( } /****************************************************************** +Frees a file page used in an index tree. NOTE: cannot free field external +storage pages because the page must contain info on its level. */ + +void +btr_page_free( +/*==========*/ + dict_tree_t* tree, /* in: index tree */ + page_t* page, /* in: page to be freed, x-latched */ + mtr_t* mtr) /* in: mtr */ +{ + ulint level; + + ut_ad(mtr_memo_contains(mtr, buf_block_align(page), + MTR_MEMO_PAGE_X_FIX)); + level = btr_page_get_level(page, mtr); + + btr_page_free_low(tree, page, level, mtr); +} + +/****************************************************************** Sets the child node file address in a node pointer. */ UNIV_INLINE void @@ -1276,6 +1276,7 @@ btr_insert_on_non_leaf_level( dtuple_t* tuple, /* in: the record to be inserted */ mtr_t* mtr) /* in: mtr */ { + big_rec_t* dummy_big_rec; btr_cur_t cursor; ulint err; rec_t* rec; @@ -1294,7 +1295,7 @@ btr_insert_on_non_leaf_level( | BTR_KEEP_SYS_FLAG | BTR_NO_UNDO_LOG_FLAG, &cursor, tuple, - &rec, NULL, mtr); + &rec, &dummy_big_rec, NULL, mtr); ut_a(err == DB_SUCCESS); } diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c index a8680c6b380..47a67d425cd 100644 --- a/innobase/btr/btr0cur.c +++ b/innobase/btr/btr0cur.c @@ -12,7 +12,7 @@ many pages in the tablespace before we start the operation, because if leaf splitting has been started, it is difficult to undo, except by crashing the database and doing a roll-forward. -(c) 1994-1996 Innobase Oy +(c) 1994-2001 Innobase Oy Created 10/16/1994 Heikki Tuuri *******************************************************/ @@ -49,6 +49,15 @@ can be released by page reorganize, then it is reorganized */ this many index pages */ #define BTR_KEY_VAL_ESTIMATE_N_PAGES 8 +/* The structure of a BLOB part header */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_PART_LEN 0 /* BLOB part len on this + page */ +#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /* next BLOB part page no, + FIL_NULL if none */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_SIZE 8 + /*********************************************************************** Adds path information to the cursor for the current page, for which the binary search has been performed. */ @@ -60,6 +69,19 @@ btr_cur_add_path_info( ulint height, /* in: height of the page in tree; 0 means leaf node */ ulint root_height); /* in: root node height in tree */ +/*************************************************************** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. 
*/ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /* in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /* in: record */ + upd_t* update, /* in: update vector */ + mtr_t* mtr); /* in: mini-transaction handle which contains + an X-latch to record page and to the tree */ /*==================== B-TREE SEARCH =========================*/ @@ -745,9 +767,13 @@ btr_cur_optimistic_insert( dtuple_t* entry, /* in: entry to insert */ rec_t** rec, /* out: pointer to inserted record if succeed */ + big_rec_t** big_rec,/* out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ que_thr_t* thr, /* in: query thread or NULL */ mtr_t* mtr) /* in: mtr */ { + big_rec_t* big_rec_vec = NULL; dict_index_t* index; page_cur_t* page_cursor; page_t* page; @@ -764,6 +790,8 @@ btr_cur_optimistic_insert( ut_ad(dtuple_check_typed(entry)); + *big_rec = NULL; + page = btr_cur_get_page(cursor); index = cursor->index; @@ -772,15 +800,27 @@ btr_cur_optimistic_insert( max_size = page_get_max_insert_size_after_reorganize(page, 1); level = btr_page_get_level(page, mtr); +calculate_sizes_again: /* Calculate the record size when entry is converted to a record */ data_size = dtuple_get_data_size(entry); extra_size = rec_get_converted_extra_size(data_size, dtuple_get_n_fields(entry)); rec_size = data_size + extra_size; - if (rec_size >= page_get_free_space_of_empty() / 2) { + if ((rec_size >= page_get_free_space_of_empty() / 2) + || (rec_size >= REC_MAX_DATA_SIZE)) { - return(DB_TOO_BIG_RECORD); + /* The record is so big that we have to store some fields + externally on separate database pages */ + + big_rec_vec = dtuple_convert_big_rec(index, entry); + + if (big_rec_vec == NULL) { + + return(DB_TOO_BIG_RECORD); + } + + goto calculate_sizes_again; } /* If there have been many consecutive inserts, and we are on the leaf @@ -795,7 +835,11 @@ btr_cur_optimistic_insert( && (0 == level) && (btr_page_get_split_rec_to_right(cursor, &dummy_rec) || btr_page_get_split_rec_to_left(cursor, &dummy_rec))) { - + + if (big_rec_vec) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + return(DB_FAIL); } @@ -804,6 +848,9 @@ btr_cur_optimistic_insert( || (page_get_max_insert_size(page, 1) >= rec_size) || (page_get_n_recs(page) <= 1))) { + if (big_rec_vec) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } return(DB_FAIL); } @@ -812,6 +859,9 @@ btr_cur_optimistic_insert( if (err != DB_SUCCESS) { + if (big_rec_vec) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } return(err); } @@ -835,6 +885,19 @@ btr_cur_optimistic_insert( *rec = page_cur_tuple_insert(page_cursor, entry, mtr); + if (!(*rec)) { + char* err_buf = mem_alloc(1000); + + dtuple_sprintf(err_buf, 900, entry); + + fprintf(stderr, + "InnoDB: Error: cannot insert tuple %s to index %s of table %s\n" + "InnoDB: max insert size %lu\n", + err_buf, index->name, index->table->name, max_size); + + mem_free(err_buf); + } + ut_a(*rec); /* <- We calculated above the record would fit */ } @@ -845,6 +908,7 @@ btr_cur_optimistic_insert( btr_search_update_hash_on_insert(cursor); } #endif + if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) { lock_update_insert(*rec); @@ -860,6 +924,8 @@ btr_cur_optimistic_insert( rec_size + PAGE_DIR_SLOT_SIZE); } + *big_rec = big_rec_vec; + return(DB_SUCCESS); } @@ -884,17 +950,24 @@ btr_cur_pessimistic_insert( dtuple_t* entry, /* in: entry to insert */ rec_t** rec, /* out: pointer to inserted record if succeed */ + 
big_rec_t** big_rec,/* out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ que_thr_t* thr, /* in: query thread or NULL */ mtr_t* mtr) /* in: mtr */ { - page_t* page; - ulint err; - ibool dummy_inh; - ibool success; - ulint n_extents = 0; + dict_index_t* index = cursor->index; + big_rec_t* big_rec_vec = NULL; + page_t* page; + ulint err; + ibool dummy_inh; + ibool success; + ulint n_extents = 0; ut_ad(dtuple_check_typed(entry)); + *big_rec = NULL; + page = btr_cur_get_page(cursor); ut_ad(mtr_memo_contains(mtr, @@ -908,8 +981,8 @@ btr_cur_pessimistic_insert( cursor->flag = BTR_CUR_BINARY; - err = btr_cur_optimistic_insert(flags, cursor, entry, rec, thr, mtr); - + err = btr_cur_optimistic_insert(flags, cursor, entry, rec, big_rec, + thr, mtr); if (err != DB_FAIL) { return(err); @@ -932,7 +1005,7 @@ btr_cur_pessimistic_insert( n_extents = cursor->tree_height / 16 + 3; - success = fsp_reserve_free_extents(cursor->index->space, + success = fsp_reserve_free_extents(index->space, n_extents, FSP_NORMAL, mtr); if (!success) { err = DB_OUT_OF_FILE_SPACE; @@ -941,7 +1014,22 @@ btr_cur_pessimistic_insert( } } - if (dict_tree_get_page(cursor->index->tree) + if ((rec_get_converted_size(entry) + >= page_get_free_space_of_empty() / 2) + || (rec_get_converted_size(entry) >= REC_MAX_DATA_SIZE)) { + + /* The record is so big that we have to store some fields + externally on separate database pages */ + + big_rec_vec = dtuple_convert_big_rec(index, entry); + + if (big_rec_vec == NULL) { + + return(DB_TOO_BIG_RECORD); + } + } + + if (dict_tree_get_page(index->tree) == buf_frame_get_page_no(page)) { /* The page is the root page */ @@ -950,7 +1038,7 @@ btr_cur_pessimistic_insert( *rec = btr_page_split_and_insert(cursor, entry, mtr); } - btr_cur_position(cursor->index, page_rec_get_prev(*rec), cursor); + btr_cur_position(index, page_rec_get_prev(*rec), cursor); #ifdef BTR_CUR_ADAPT btr_search_update_hash_on_insert(cursor); @@ -963,9 +1051,11 @@ btr_cur_pessimistic_insert( err = DB_SUCCESS; if (n_extents > 0) { - fil_space_release_free_extents(cursor->index->space, n_extents); + fil_space_release_free_extents(index->space, n_extents); } - + + *big_rec = big_rec_vec; + return(err); } @@ -1227,7 +1317,8 @@ btr_cur_optimistic_update( dulint roll_ptr; trx_t* trx; mem_heap_t* heap; - ibool reorganized = FALSE; + ibool reorganized = FALSE; + ulint i; /* Only clustered index records are updated using this function */ ut_ad((cursor->index)->type & DICT_CLUSTERED); @@ -1247,6 +1338,23 @@ btr_cur_optimistic_update( cmpl_info, thr, mtr)); } + for (i = 0; i < upd_get_n_fields(update); i++) { + if (upd_get_nth_field(update, i)->extern_storage) { + + /* Externally stored fields are treated in pessimistic + update */ + + return(DB_OVERFLOW); + } + } + + if (rec_contains_externally_stored_field(btr_cur_get_rec(cursor))) { + /* Externally stored fields are treated in pessimistic + update */ + + return(DB_OVERFLOW); + } + page_cursor = btr_cur_get_page_cur(cursor); heap = mem_heap_create(1024); @@ -1260,9 +1368,9 @@ btr_cur_optimistic_update( if (new_rec_size >= page_get_free_space_of_empty() / 2) { - mem_heap_free(heap); + mem_heap_free(heap); - return(DB_TOO_BIG_RECORD); + return(DB_OVERFLOW); } max_size = old_rec_size @@ -1377,6 +1485,48 @@ btr_cur_pess_upd_restore_supremum( rec); } +/*************************************************************** +Replaces and copies the data in the new column values stored in the +update vector to the clustered index entry given. 
*/ +static +void +btr_cur_copy_new_col_vals( +/*======================*/ + dtuple_t* entry, /* in/out: index entry where replaced */ + upd_t* update, /* in: update vector */ + mem_heap_t* heap) /* in: heap where data is copied */ +{ + upd_field_t* upd_field; + dfield_t* dfield; + dfield_t* new_val; + ulint field_no; + byte* data; + ulint i; + + dtuple_set_info_bits(entry, update->info_bits); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + + field_no = upd_field->field_no; + + dfield = dtuple_get_nth_field(entry, field_no); + + new_val = &(upd_field->new_val); + + if (new_val->len == UNIV_SQL_NULL) { + data = NULL; + } else { + data = mem_heap_alloc(heap, new_val->len); + + ut_memcpy(data, new_val->data, new_val->len); + } + + dfield_set_data(dfield, data, new_val->len); + } +} + /***************************************************************** Performs an update of a record on a page of a tree. It is assumed that mtr holds an x-latch on the tree and on the cursor page. If the @@ -1389,8 +1539,9 @@ btr_cur_pessimistic_update( /* out: DB_SUCCESS or error code */ ulint flags, /* in: undo logging, locking, and rollback flags */ - btr_cur_t* cursor, /* in: cursor on the record to update; - cursor does not stay valid */ + btr_cur_t* cursor, /* in: cursor on the record to update */ + big_rec_t** big_rec,/* out: big rec vector whose fields have to + be stored externally by the caller, or NULL */ upd_t* update, /* in: update vector; this is allowed also contain trx id and roll ptr fields, but the values in update vector have no effect */ @@ -1399,6 +1550,8 @@ btr_cur_pessimistic_update( que_thr_t* thr, /* in: query thread */ mtr_t* mtr) /* in: mtr */ { + big_rec_t* big_rec_vec = NULL; + big_rec_t* dummy_big_rec; dict_index_t* index; page_t* page; dict_tree_t* tree; @@ -1414,6 +1567,11 @@ btr_cur_pessimistic_update( ibool was_first; ibool success; ulint n_extents = 0; + ulint* ext_vect; + ulint n_ext_vect; + ulint reserve_flag; + + *big_rec = NULL; page = btr_cur_get_page(cursor); rec = btr_cur_get_rec(cursor); @@ -1449,8 +1607,14 @@ btr_cur_pessimistic_update( n_extents = cursor->tree_height / 16 + 3; + if (flags & BTR_NO_UNDO_LOG_FLAG) { + reserve_flag = FSP_CLEANING; + } else { + reserve_flag = FSP_NORMAL; + } + success = fsp_reserve_free_extents(cursor->index->space, - n_extents, FSP_NORMAL, mtr); + n_extents, reserve_flag, mtr); if (!success) { err = DB_OUT_OF_FILE_SPACE; @@ -1464,7 +1628,7 @@ btr_cur_pessimistic_update( new_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); - row_upd_clust_index_replace_new_col_vals(new_entry, update); + btr_cur_copy_new_col_vals(new_entry, update, heap); if (!(flags & BTR_KEEP_SYS_FLAG)) { row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, @@ -1487,17 +1651,49 @@ btr_cur_pessimistic_update( lock_rec_store_on_page_infimum(rec); btr_search_update_hash_on_delete(cursor); + + if (flags & BTR_NO_UNDO_LOG_FLAG) { + /* We are in a transaction rollback undoing a row + update: we must free possible externally stored fields + which got new values in the update */ + + ut_a(big_rec_vec == NULL); + + btr_rec_free_updated_extern_fields(index, rec, update, mtr); + } + + /* We have to set appropriate extern storage bits in the new + record to be inserted: we have to remember which fields were such */ + + ext_vect = mem_heap_alloc(heap, sizeof(ulint) * rec_get_n_fields(rec)); + n_ext_vect = btr_push_update_extern_fields(ext_vect, rec, update); + page_cur_delete_rec(page_cursor, mtr); 
page_cur_move_to_prev(page_cursor); - if (optim_err == DB_UNDERFLOW) { - rec = btr_cur_insert_if_possible(cursor, new_entry, + if ((rec_get_converted_size(new_entry) >= + page_get_free_space_of_empty() / 2) + || (rec_get_converted_size(new_entry) >= REC_MAX_DATA_SIZE)) { + + big_rec_vec = dtuple_convert_big_rec(index, new_entry); + + if (big_rec_vec == NULL) { + + mem_heap_free(heap); + + goto return_after_reservations; + } + } + + rec = btr_cur_insert_if_possible(cursor, new_entry, &dummy_reorganized, mtr); - ut_a(rec); /* <- We knew the insert would fit */ + ut_a(rec || optim_err != DB_UNDERFLOW); + if (rec) { lock_rec_restore_from_page_infimum(rec, page); - + rec_set_field_extern_bits(rec, ext_vect, n_ext_vect, mtr); + btr_cur_compress_if_useful(cursor, mtr); err = DB_SUCCESS; @@ -1521,9 +1717,13 @@ btr_cur_pessimistic_update( err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG, - cursor, new_entry, &rec, NULL, mtr); + cursor, new_entry, &rec, + &dummy_big_rec, NULL, mtr); ut_a(rec); ut_a(err == DB_SUCCESS); + ut_a(dummy_big_rec == NULL); + + rec_set_field_extern_bits(rec, ext_vect, n_ext_vect, mtr); lock_rec_restore_from_page_infimum(rec, page); @@ -1541,9 +1741,12 @@ btr_cur_pessimistic_update( return_after_reservations: if (n_extents > 0) { - fil_space_release_free_extents(cursor->index->space, n_extents); + fil_space_release_free_extents(cursor->index->space, + n_extents); } + *big_rec = big_rec_vec; + return(err); } @@ -1932,6 +2135,11 @@ btr_cur_optimistic_delete( ut_ad(btr_page_get_level(page, mtr) == 0); + if (rec_contains_externally_stored_field(btr_cur_get_rec(cursor))) { + + return(FALSE); + } + if (btr_cur_can_delete_without_compress(cursor, mtr)) { lock_update_delete(btr_cur_get_rec(cursor)); @@ -2009,6 +2217,8 @@ btr_cur_pessimistic_delete( } } + btr_rec_free_externally_stored_fields(cursor->index, + btr_cur_get_rec(cursor), mtr); if ((page_get_n_recs(page) < 2) && (dict_tree_get_page(btr_cur_get_tree(cursor)) != buf_frame_get_page_no(page))) { @@ -2079,7 +2289,7 @@ return_after_reservations: fil_space_release_free_extents(cursor->index->space, n_extents); } - return(ret); + return(ret); } /*********************************************************************** @@ -2141,6 +2351,7 @@ btr_estimate_n_rows_in_range( btr_path_t* slot1; btr_path_t* slot2; ibool diverged; + ulint divergence_level; ulint n_rows; ulint i; mtr_t mtr; @@ -2183,6 +2394,7 @@ btr_estimate_n_rows_in_range( n_rows = 1; diverged = FALSE; + divergence_level = 1000000; for (i = 0; ; i++) { ut_ad(i < BTR_PATH_ARRAY_N_SLOTS); @@ -2193,6 +2405,13 @@ btr_estimate_n_rows_in_range( if (slot1->nth_rec == ULINT_UNDEFINED || slot2->nth_rec == ULINT_UNDEFINED) { + if (i > divergence_level + 1) { + /* In trees whose height is > 1 our algorithm + tends to underestimate: multiply the estimate + by 2: */ + + n_rows = n_rows * 2; + } return(n_rows); } @@ -2207,6 +2426,8 @@ btr_estimate_n_rows_in_range( return(10); } + divergence_level = i; + diverged = TRUE; } else if (diverged) { n_rows = (n_rows * (slot1->n_recs + slot2->n_recs)) @@ -2292,3 +2513,553 @@ btr_estimate_number_of_different_key_vals( return(index->table->stat_n_rows / (total_n_recs / n_diff)); } + +/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ + +/*********************************************************************** +Stores the positions of the fields marked as extern storage in the update +vector, and also those fields who are marked as extern storage in rec +and not mentioned in 
updated fields. We use this function to remember +which fields we must mark as extern storage in a record inserted for an +update. */ + +ulint +btr_push_update_extern_fields( +/*==========================*/ + /* out: number of values stored in ext_vect */ + ulint* ext_vect, /* in: array of ulints, must be preallocated + to have space for all fields in rec */ + rec_t* rec, /* in: record */ + upd_t* update) /* in: update vector or NULL */ +{ + ulint n_pushed = 0; + ibool is_updated; + ulint n; + ulint j; + ulint i; + + if (update) { + n = upd_get_n_fields(update); + + for (i = 0; i < n; i++) { + + if (upd_get_nth_field(update, i)->extern_storage) { + + ext_vect[n_pushed] = + upd_get_nth_field(update, i)->field_no; + + n_pushed++; + } + } + } + + n = rec_get_n_fields(rec); + + for (i = 0; i < n; i++) { + if (rec_get_nth_field_extern_bit(rec, i)) { + + /* Check it is not in updated fields */ + is_updated = FALSE; + + if (update) { + for (j = 0; j < upd_get_n_fields(update); + j++) { + if (upd_get_nth_field(update, j) + ->field_no == i) { + is_updated = TRUE; + } + } + } + + if (!is_updated) { + ext_vect[n_pushed] = i; + n_pushed++; + } + } + } + + return(n_pushed); +} + +/*********************************************************************** +Returns the length of a BLOB part stored on the header page. */ +static +ulint +btr_blob_get_part_len( +/*==================*/ + /* out: part length */ + byte* blob_header) /* in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN)); +} + +/*********************************************************************** +Returns the page number where the next BLOB part is stored. */ +static +ulint +btr_blob_get_next_page_no( +/*======================*/ + /* out: page number or FIL_NULL if + no more pages */ + byte* blob_header) /* in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO)); +} + +/*********************************************************************** +Stores the fields in big_rec_vec to the tablespace and puts pointers to +them in rec. The fields are stored on pages allocated from leaf node +file segment of the index tree. 
*/ + +ulint +btr_store_big_rec_extern_fields( +/*============================*/ + /* out: DB_SUCCESS or error */ + dict_index_t* index, /* in: index of rec; the index tree + MUST be X-latched */ + rec_t* rec, /* in: record */ + big_rec_t* big_rec_vec, /* in: vector containing fields + to be stored externally */ + mtr_t* local_mtr) /* in: mtr containing the latch to + rec and to the tree */ +{ + byte* data; + ulint local_len; + ulint extern_len; + ulint store_len; + ulint page_no; + page_t* page; + ulint space_id; + page_t* prev_page; + page_t* rec_page; + ulint prev_page_no; + ulint hint_page_no; + ulint i; + mtr_t mtr; + + ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(local_mtr, buf_block_align(data), + MTR_MEMO_PAGE_X_FIX)); + ut_a(index->type & DICT_CLUSTERED); + + space_id = buf_frame_get_space_id(rec); + + /* We have to create a file segment to the tablespace + for each field and put the pointer to the field in rec */ + + for (i = 0; i < big_rec_vec->n_fields; i++) { + + data = rec_get_nth_field(rec, big_rec_vec->fields[i].field_no, + &local_len); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + extern_len = big_rec_vec->fields[i].len; + + ut_a(extern_len > 0); + + prev_page_no = FIL_NULL; + + while (extern_len > 0) { + mtr_start(&mtr); + + if (prev_page_no == FIL_NULL) { + hint_page_no = buf_frame_get_page_no(rec) + 1; + } else { + hint_page_no = prev_page_no + 1; + } + + page = btr_page_alloc(index->tree, hint_page_no, + FSP_NO_DIR, 0, &mtr); + if (page == NULL) { + + mtr_commit(&mtr); + + return(DB_OUT_OF_FILE_SPACE); + } + + page_no = buf_frame_get_page_no(page); + + if (prev_page_no != FIL_NULL) { + prev_page = buf_page_get(space_id, + prev_page_no, + RW_X_LATCH, &mtr); + + buf_page_dbg_add_level(prev_page, + SYNC_EXTERN_STORAGE); + + mlog_write_ulint(prev_page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO, + page_no, MLOG_4BYTES, &mtr); + } + + if (extern_len > (UNIV_PAGE_SIZE - FIL_PAGE_DATA + - BTR_BLOB_HDR_SIZE + - FIL_PAGE_DATA_END)) { + store_len = UNIV_PAGE_SIZE - FIL_PAGE_DATA + - BTR_BLOB_HDR_SIZE + - FIL_PAGE_DATA_END; + } else { + store_len = extern_len; + } + + mlog_write_string(page + FIL_PAGE_DATA + + BTR_BLOB_HDR_SIZE, + big_rec_vec->fields[i].data + + big_rec_vec->fields[i].len + - extern_len, + store_len, &mtr); + mlog_write_ulint(page + FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN, + store_len, MLOG_4BYTES, &mtr); + mlog_write_ulint(page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO, + FIL_NULL, MLOG_4BYTES, &mtr); + + extern_len -= store_len; + + rec_page = buf_page_get(space_id, + buf_frame_get_page_no(data), + RW_X_LATCH, &mtr); + + buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK); + + mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, 0, + MLOG_4BYTES, &mtr); + mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4, + big_rec_vec->fields[i].len + - extern_len, + MLOG_4BYTES, &mtr); + + if (prev_page_no == FIL_NULL) { + mlog_write_ulint(data + local_len + + BTR_EXTERN_SPACE_ID, + space_id, + MLOG_4BYTES, &mtr); + + mlog_write_ulint(data + local_len + + BTR_EXTERN_PAGE_NO, + page_no, + MLOG_4BYTES, &mtr); + + mlog_write_ulint(data + local_len + + BTR_EXTERN_OFFSET, + FIL_PAGE_DATA, + MLOG_4BYTES, &mtr); + + /* Set the bit denoting that this field + in rec is stored externally */ + + rec_set_nth_field_extern_bit(rec, + big_rec_vec->fields[i].field_no, + TRUE, &mtr); + } + + prev_page_no = page_no; + + mtr_commit(&mtr); + } + } + + return(DB_SUCCESS); 
+} + +/*********************************************************************** +Frees the space in an externally stored field to the file space +management. */ + +void +btr_free_externally_stored_field( +/*=============================*/ + dict_index_t* index, /* in: index of the data, the index + tree MUST be X-latched */ + byte* data, /* in: internally stored data + + reference to the externally + stored part */ + ulint local_len, /* in: length of data */ + mtr_t* local_mtr) /* in: mtr containing the latch to + data an an X-latch to the index + tree */ +{ + page_t* page; + page_t* rec_page; + ulint space_id; + ulint page_no; + ulint offset; + ulint extern_len; + ulint next_page_no; + ulint part_len; + mtr_t mtr; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(local_mtr, buf_block_align(data), + MTR_MEMO_PAGE_X_FIX)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + for (;;) { + mtr_start(&mtr); + + rec_page = buf_page_get(buf_frame_get_space_id(data), + buf_frame_get_page_no(data), RW_X_LATCH, &mtr); + + buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK); + + space_id = mach_read_from_4(data + local_len + + BTR_EXTERN_SPACE_ID); + + page_no = mach_read_from_4(data + local_len + + BTR_EXTERN_PAGE_NO); + + offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET); + + extern_len = mach_read_from_4(data + local_len + + BTR_EXTERN_LEN + 4); + + /* If extern len is 0, then there is no external storage data + at all */ + + if (extern_len == 0) { + + mtr_commit(&mtr); + + return; + } + + page = buf_page_get(space_id, page_no, RW_X_LATCH, &mtr); + + buf_page_dbg_add_level(page, SYNC_EXTERN_STORAGE); + + next_page_no = mach_read_from_4(page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO); + + part_len = btr_blob_get_part_len(page + FIL_PAGE_DATA); + + ut_a(extern_len >= part_len); + + /* We must supply the page level (= 0) as an argument + because we did not store it on the page (we save the space + overhead from an index page header. */ + + btr_page_free_low(index->tree, page, 0, &mtr); + + mlog_write_ulint(data + local_len + BTR_EXTERN_PAGE_NO, + next_page_no, + MLOG_4BYTES, &mtr); + mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4, + extern_len - part_len, + MLOG_4BYTES, &mtr); + if (next_page_no == FIL_NULL) { + ut_a(extern_len - part_len == 0); + } + + if (extern_len - part_len == 0) { + ut_a(next_page_no == FIL_NULL); + } + + mtr_commit(&mtr); + } +} + +/*************************************************************** +Frees the externally stored fields for a record. 
*/ + +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /* in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /* in: record */ + mtr_t* mtr) /* in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +{ + ulint n_fields; + byte* data; + ulint len; + ulint i; + + ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), + MTR_MEMO_PAGE_X_FIX)); + if (rec_get_data_size(rec) <= REC_1BYTE_OFFS_LIMIT) { + + return; + } + + /* Free possible externally stored fields in the record */ + + n_fields = rec_get_n_fields(rec); + + for (i = 0; i < n_fields; i++) { + if (rec_get_nth_field_extern_bit(rec, i)) { + + data = rec_get_nth_field(rec, i, &len); + btr_free_externally_stored_field(index, data, len, mtr); + } + } +} + +/*************************************************************** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /* in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /* in: record */ + upd_t* update, /* in: update vector */ + mtr_t* mtr) /* in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +{ + upd_field_t* ufield; + ulint n_fields; + byte* data; + ulint len; + ulint i; + + ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), + MTR_MEMO_PAGE_X_FIX)); + if (rec_get_data_size(rec) <= REC_1BYTE_OFFS_LIMIT) { + + return; + } + + /* Free possible externally stored fields in the record */ + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + ufield = upd_get_nth_field(update, i); + + if (rec_get_nth_field_extern_bit(rec, ufield->field_no)) { + + data = rec_get_nth_field(rec, ufield->field_no, &len); + btr_free_externally_stored_field(index, data, len, mtr); + } + } +} + +/*********************************************************************** +Copies an externally stored field of a record to mem heap. Parameter +data contains a pointer to 'internally' stored part of the field: +possibly some data, and the reference to the externally stored part in +the last 20 bytes of data. 
*/ + +byte* +btr_copy_externally_stored_field( +/*=============================*/ + /* out: the whole field copied to heap */ + ulint* len, /* out: length of the whole field */ + byte* data, /* in: 'internally' stored part of the + field containing also the reference to + the external part */ + ulint local_len,/* in: length of data */ + mem_heap_t* heap) /* in: mem heap */ +{ + page_t* page; + ulint space_id; + ulint page_no; + ulint offset; + ulint extern_len; + byte* blob_header; + ulint part_len; + byte* buf; + ulint copied_len; + mtr_t mtr; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID); + + page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO); + + offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET); + + /* Currently a BLOB cannot be bigger that 4 GB; we + leave the 4 upper bytes in the length field unused */ + + extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4); + + buf = mem_heap_alloc(heap, local_len + extern_len); + + ut_memcpy(buf, data, local_len); + copied_len = local_len; + + if (extern_len == 0) { + *len = copied_len; + + return(buf); + } + + for (;;) { + mtr_start(&mtr); + + page = buf_page_get(space_id, page_no, RW_S_LATCH, &mtr); + + buf_page_dbg_add_level(page, SYNC_EXTERN_STORAGE); + + blob_header = page + offset; + + part_len = btr_blob_get_part_len(blob_header); + + ut_memcpy(buf + copied_len, blob_header + BTR_BLOB_HDR_SIZE, + part_len); + copied_len += part_len; + + page_no = btr_blob_get_next_page_no(blob_header); + + /* On other BLOB pages except the first the BLOB header + always is at the page data start: */ + + offset = FIL_PAGE_DATA; + + mtr_commit(&mtr); + + if (page_no == FIL_NULL) { + ut_a(copied_len == local_len + extern_len); + + *len = copied_len; + + return(buf); + } + + ut_a(copied_len < local_len + extern_len); + } +} + +/*********************************************************************** +Copies an externally stored field of a record to mem heap. */ + +byte* +btr_rec_copy_externally_stored_field( +/*=================================*/ + /* out: the field copied to heap */ + rec_t* rec, /* in: record */ + ulint no, /* in: field number */ + ulint* len, /* out: length of the field */ + mem_heap_t* heap) /* in: mem heap */ +{ + ulint local_len; + byte* data; + + ut_a(rec_get_nth_field_extern_bit(rec, no)); + + /* An externally stored field can contain some initial + data from the field, and in the last 20 bytes it has the + space id, page number, and offset where the rest of the + field data is stored, and the data length in addition to + the data stored locally. We may need to store some data + locally to get the local record length above the 128 byte + limit so that field offsets are stored in two bytes, and + the extern bit is available in those two bytes. 
*/ + + data = rec_get_nth_field(rec, no, &local_len); + + return(btr_copy_externally_stored_field(len, data, local_len, heap)); +} diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index ede9e621462..3fabe6c6d0e 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -216,14 +216,44 @@ buf_calc_page_checksum( /* out: checksum */ byte* page) /* in: buffer page */ { - ulint checksum; + ulint checksum; - checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); - + ut_fold_binary(page + FIL_PAGE_DATA, UNIV_PAGE_SIZE - FIL_PAGE_DATA - - FIL_PAGE_END_LSN); - checksum = checksum & 0xFFFFFFFF; + checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); + + ut_fold_binary(page + FIL_PAGE_DATA, + UNIV_PAGE_SIZE - FIL_PAGE_DATA + - FIL_PAGE_END_LSN); + checksum = checksum & 0xFFFFFFFF; - return(checksum); + return(checksum); +} + +/************************************************************************ +Checks if a page is corrupt. */ + +ibool +buf_page_is_corrupted( +/*==================*/ + /* out: TRUE if corrupted */ + byte* read_buf) /* in: a database page */ +{ + ulint checksum; + + checksum = buf_calc_page_checksum(read_buf); + + if ((mach_read_from_4(read_buf + FIL_PAGE_LSN + 4) + != mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN + 4)) + || (checksum != mach_read_from_4(read_buf + + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN) + && mach_read_from_4(read_buf + FIL_PAGE_LSN) + != mach_read_from_4(read_buf + + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN))) { + return(TRUE); + } + + return(FALSE); } /************************************************************************ @@ -1265,34 +1295,22 @@ buf_page_io_complete( dulint id; dict_index_t* index; ulint io_type; - ulint checksum; ut_ad(block); io_type = block->io_fix; if (io_type == BUF_IO_READ) { - checksum = buf_calc_page_checksum(block->frame); - /* From version 3.23.38 up we store the page checksum to the 4 upper bytes of the page end lsn field */ - if ((mach_read_from_4(block->frame + FIL_PAGE_LSN + 4) - != mach_read_from_4(block->frame + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN + 4)) - || (checksum != mach_read_from_4(block->frame - + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN) - && mach_read_from_4(block->frame + FIL_PAGE_LSN) - != mach_read_from_4(block->frame - + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN))) { - fprintf(stderr, + if (buf_page_is_corrupted(block->frame)) { + fprintf(stderr, "InnoDB: Database page corruption or a failed\n" "InnoDB: file read of page %lu.\n", block->offset); - fprintf(stderr, + fprintf(stderr, "InnoDB: You may have to recover from a backup.\n"); - exit(1); + exit(1); } if (recv_recovery_is_on()) { @@ -1601,11 +1619,28 @@ void buf_print_io(void) /*==============*/ { + ulint size; + ut_ad(buf_pool); + size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE; + mutex_enter(&(buf_pool->mutex)); + + printf("LRU list length %lu \n", UT_LIST_GET_LEN(buf_pool->LRU)); + printf("Free list length %lu \n", UT_LIST_GET_LEN(buf_pool->free)); + printf("Flush list length %lu \n", + UT_LIST_GET_LEN(buf_pool->flush_list)); + printf("Buffer pool size in pages %lu\n", size); - printf("pages read %lu, created %lu, written %lu\n", + printf("Pending reads %lu \n", buf_pool->n_pend_reads); + + printf("Pending writes: LRU %lu, flush list %lu, single page %lu\n", + buf_pool->n_flush[BUF_FLUSH_LRU], + buf_pool->n_flush[BUF_FLUSH_LIST], + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); + + printf("Pages read %lu, created %lu, written %lu\n", buf_pool->n_pages_read, buf_pool->n_pages_created, buf_pool->n_pages_written); 
mutex_exit(&(buf_pool->mutex)); diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 7129b8d20a9..82b12103c4c 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -1,7 +1,7 @@ /****************************************************** The database buffer buf_pool flush algorithm -(c) 1995 Innobase Oy +(c) 1995-2001 Innobase Oy Created 11/11/1995 Heikki Tuuri *******************************************************/ @@ -15,13 +15,13 @@ Created 11/11/1995 Heikki Tuuri #include "ut0byte.h" #include "ut0lst.h" #include "fil0fil.h" - #include "buf0buf.h" #include "buf0lru.h" #include "buf0rea.h" #include "ibuf0ibuf.h" #include "log0log.h" #include "os0file.h" +#include "trx0sys.h" /* When flushed, dirty blocks are searched in neigborhoods of this size, and flushed along with the original page. */ @@ -195,9 +195,145 @@ buf_flush_write_complete( } /************************************************************************ -Does an asynchronous write of a buffer page. NOTE: in simulated aio we must -call os_aio_simulated_wake_handler_threads after we have posted a batch -of writes! */ +Flushes possible buffered writes from the doublewrite memory buffer to disk, +and also wakes up the aio thread if simulated aio is used. It is very +important to call this function after a batch of writes has been posted, +and also when we may have to wait for a page latch! Otherwise a deadlock +of threads can occur. */ +static +void +buf_flush_buffered_writes(void) +/*===========================*/ +{ + buf_block_t* block; + ulint len; + ulint i; + + if (trx_doublewrite == NULL) { + os_aio_simulated_wake_handler_threads(); + + return; + } + + mutex_enter(&(trx_doublewrite->mutex)); + + /* Write first to doublewrite buffer blocks. We use synchronous + aio and thus know that file write has been completed when the + control returns. */ + + if (trx_doublewrite->first_free == 0) { + + mutex_exit(&(trx_doublewrite->mutex)); + + return; + } + + if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; + } else { + len = trx_doublewrite->first_free * UNIV_PAGE_SIZE; + } + + fil_io(OS_FILE_WRITE, + TRUE, TRX_SYS_SPACE, + trx_doublewrite->block1, 0, len, + (void*)trx_doublewrite->write_buf, NULL); + + if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + len = (trx_doublewrite->first_free + - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE; + + fil_io(OS_FILE_WRITE, + TRUE, TRX_SYS_SPACE, + trx_doublewrite->block2, 0, len, + (void*)(trx_doublewrite->write_buf + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE), + NULL); + } + + /* Now flush the doublewrite buffer data to disk */ + + fil_flush(TRX_SYS_SPACE); + + /* We know that the writes have been flushed to disk now + and in recovery we will find them in the doublewrite buffer + blocks. Next do the writes to the intended positions. 
*/ + + for (i = 0; i < trx_doublewrite->first_free; i++) { + block = trx_doublewrite->buf_block_arr[i]; + + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE, + (void*)block->frame, (void*)block); + } + + /* Wake possible simulated aio thread to actually post the + writes to the operating system */ + + os_aio_simulated_wake_handler_threads(); + + /* Wait that all async writes to tablespaces have been posted to + the OS */ + + os_aio_wait_until_no_pending_writes(); + + /* Now we flush the data to disk (for example, with fsync) */ + + fil_flush_file_spaces(FIL_TABLESPACE); + + /* We can now reuse the doublewrite memory buffer: */ + + trx_doublewrite->first_free = 0; + + mutex_exit(&(trx_doublewrite->mutex)); +} + +/************************************************************************ +Posts a buffer page for writing. If the doublewrite memory buffer is +full, calls buf_flush_buffered_writes and waits for for free space to +appear. */ +static +void +buf_flush_post_to_doublewrite_buf( +/*==============================*/ + buf_block_t* block) /* in: buffer block to write */ +{ +try_again: + mutex_enter(&(trx_doublewrite->mutex)); + + if (trx_doublewrite->first_free + >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + mutex_exit(&(trx_doublewrite->mutex)); + + buf_flush_buffered_writes(); + + goto try_again; + } + + ut_memcpy(trx_doublewrite->write_buf + + UNIV_PAGE_SIZE * trx_doublewrite->first_free, + block->frame, UNIV_PAGE_SIZE); + + trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block; + + trx_doublewrite->first_free++; + + if (trx_doublewrite->first_free + >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + mutex_exit(&(trx_doublewrite->mutex)); + + buf_flush_buffered_writes(); + + return; + } + + mutex_exit(&(trx_doublewrite->mutex)); +} + +/************************************************************************ +Does an asynchronous write of a buffer page. NOTE: in simulated aio and +also when the doublewrite buffer is used, we must call +buf_flush_buffered_writes after we have posted a batch of writes! 
*/ static void buf_flush_write_block_low( @@ -222,15 +358,24 @@ buf_flush_write_block_low( mach_write_to_8(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, block->newest_modification); + /* Write to the page the space id and page number */ + + mach_write_to_4(block->frame + FIL_PAGE_SPACE, block->space); + mach_write_to_4(block->frame + FIL_PAGE_OFFSET, block->offset); + /* We overwrite the first 4 bytes of the end lsn field to store a page checksum */ mach_write_to_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, buf_calc_page_checksum(block->frame)); - fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, - FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE, + if (!trx_doublewrite) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE, (void*)block->frame, (void*)block); + } else { + buf_flush_post_to_doublewrite_buf(block); + } } /************************************************************************ @@ -251,14 +396,14 @@ buf_flush_try_page( buf_block_t* block; ibool locked; - ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST) - || (flush_type == BUF_FLUSH_SINGLE_PAGE)); + ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST + || flush_type == BUF_FLUSH_SINGLE_PAGE); mutex_enter(&(buf_pool->mutex)); block = buf_page_hash_get(space, offset); - if ((flush_type == BUF_FLUSH_LIST) + if (flush_type == BUF_FLUSH_LIST && block && buf_flush_ready_for_flush(block, flush_type)) { block->io_fix = BUF_IO_WRITE; @@ -286,7 +431,7 @@ buf_flush_try_page( mutex_exit(&(buf_pool->mutex)); if (!locked) { - os_aio_simulated_wake_handler_threads(); + buf_flush_buffered_writes(); rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE); } @@ -300,7 +445,7 @@ buf_flush_try_page( return(1); - } else if ((flush_type == BUF_FLUSH_LRU) && block + } else if (flush_type == BUF_FLUSH_LRU && block && buf_flush_ready_for_flush(block, flush_type)) { /* VERY IMPORTANT: @@ -328,7 +473,7 @@ buf_flush_try_page( return(1); - } else if ((flush_type == BUF_FLUSH_SINGLE_PAGE) && block + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block && buf_flush_ready_for_flush(block, flush_type)) { block->io_fix = BUF_IO_WRITE; @@ -387,6 +532,14 @@ buf_flush_try_neighbors( low = offset; high = offset + 1; + } else if (flush_type == BUF_FLUSH_LIST) { + /* Since semaphore waits require us to flush the + doublewrite buffer to disk, it is best that the + search area is just the page itself, to minimize + chances for semaphore waits */ + + low = offset; + high = offset + 1; } /* printf("Flush area: low %lu high %lu\n", low, high); */ @@ -418,13 +571,6 @@ buf_flush_try_neighbors( mutex_exit(&(buf_pool->mutex)); - /* In simulated aio we wake up the i/o-handler threads now that - we have posted a batch of writes: */ - - /* printf("Flush count %lu ; Waking i/o handlers\n", count); */ - - os_aio_simulated_wake_handler_threads(); - return(count); } @@ -565,13 +711,15 @@ buf_flush_batch( mutex_exit(&(buf_pool->mutex)); - if (buf_debug_prints && (page_count > 0)) { + buf_flush_buffered_writes(); + + if (buf_debug_prints && page_count > 0) { if (flush_type == BUF_FLUSH_LRU) { - printf("To flush %lu pages in LRU flush\n", + printf("Flushed %lu pages in LRU flush\n", page_count); } else if (flush_type == BUF_FLUSH_LIST) { - printf("To flush %lu pages in flush list flush\n", - page_count, flush_type); + printf("Flushed %lu pages in flush list flush\n", + page_count); } else { ut_error; } diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c index 
728bf4404b8..db187cdd896 100644 --- a/innobase/buf/buf0rea.c +++ b/innobase/buf/buf0rea.c @@ -49,7 +49,9 @@ ulint buf_read_page_low( /*==============*/ /* out: 1 if a read request was queued, 0 if the page - already resided in buf_pool */ + already resided in buf_pool or if the page is in + the doublewrite buffer blocks in which case it is never + read into the pool */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ..., ORed to OS_AIO_SIMULATED_WAKE_LATER (see below @@ -63,6 +65,16 @@ buf_read_page_low( wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER; + if (trx_doublewrite && space == TRX_SYS_SPACE + && ( (offset >= trx_doublewrite->block1 + && offset < trx_doublewrite->block1 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) + || (offset >= trx_doublewrite->block2 + && offset < trx_doublewrite->block2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) { + return(0); + } + #ifdef UNIV_LOG_DEBUG if (space % 2 == 1) { /* We are updating a replicate space while holding the diff --git a/innobase/data/data0data.c b/innobase/data/data0data.c index fe5611bc312..4172fb9c8ce 100644 --- a/innobase/data/data0data.c +++ b/innobase/data/data0data.c @@ -13,7 +13,10 @@ Created 5/30/1994 Heikki Tuuri #endif #include "ut0rnd.h" - +#include "rem0rec.h" +#include "page0page.h" +#include "dict0dict.h" +#include "btr0cur.h" byte data_error; /* data pointers of tuple fields are initialized to point here for error checking */ @@ -378,6 +381,172 @@ dtuple_sprintf( return(len); } +/****************************************************************** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. 
*/ + +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + /* out, own: created big record vector, + NULL if we are not able to shorten + the entry enough, i.e., if there are + too many short fields in entry */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry */ +{ + mem_heap_t* heap; + big_rec_t* vector; + dfield_t* dfield; + ulint size; + ulint n_fields; + ulint longest; + ulint longest_i; + ulint i; + + size = rec_get_converted_size(entry); + + heap = mem_heap_create(size + dtuple_get_n_fields(entry) + * sizeof(big_rec_field_t) + 1000); + + vector = mem_heap_alloc(heap, sizeof(big_rec_t)); + + vector->heap = heap; + vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry) + * sizeof(big_rec_field_t)); + + /* Decide which fields to shorten: the algorithm is to look for + the longest field which does not occur in the ordering part + of any index on the table */ + + n_fields = 0; + + while ((rec_get_converted_size(entry) + >= page_get_free_space_of_empty() / 2) + || rec_get_converted_size(entry) >= REC_MAX_DATA_SIZE) { + + longest = 0; + for (i = dict_index_get_n_unique_in_tree(index); + i < dtuple_get_n_fields(entry); i++) { + + /* Skip over fields which are ordering in some index */ + + if (dict_field_get_col( + dict_index_get_nth_field(index, i)) + ->ord_part == 0) { + + dfield = dtuple_get_nth_field(entry, i); + + if (dfield->len != UNIV_SQL_NULL && + dfield->len > longest) { + + longest = dfield->len; + + longest_i = i; + } + } + } + + if (longest < BTR_EXTERN_FIELD_REF_SIZE + 10) { + + /* Cannot shorten more */ + + mem_heap_free(heap); + + return(NULL); + } + + /* Move data from field longest_i to big rec vector, + but do not let data size of the remaining entry + drop below 128 which is the limit for the 2-byte + offset storage format in a physical record */ + + dfield = dtuple_get_nth_field(entry, longest_i); + vector->fields[n_fields].field_no = longest_i; + + if (dtuple_get_data_size(entry) - dfield->len + <= REC_1BYTE_OFFS_LIMIT) { + vector->fields[n_fields].len = + dtuple_get_data_size(entry) + - REC_1BYTE_OFFS_LIMIT; + /* Since dfield will contain at least + a 20-byte reference to the extern storage, + we know that the data size of entry will be + > REC_1BYTE_OFFS_LIMIT */ + } else { + vector->fields[n_fields].len = dfield->len; + } + + vector->fields[n_fields].data = mem_heap_alloc(heap, + vector->fields[n_fields].len); + + /* Copy data (from the end of field) to big rec vector */ + + ut_memcpy(vector->fields[n_fields].data, + ((byte*)dfield->data) + dfield->len + - vector->fields[n_fields].len, + vector->fields[n_fields].len); + dfield->len = dfield->len - vector->fields[n_fields].len + + BTR_EXTERN_FIELD_REF_SIZE; + + /* Set the extern field reference in dfield to zero */ + memset(((byte*)dfield->data) + + dfield->len - BTR_EXTERN_FIELD_REF_SIZE, + 0, BTR_EXTERN_FIELD_REF_SIZE); + n_fields++; + } + + vector->n_fields = n_fields; + return(vector); +} + +/****************************************************************** +Puts back to entry the data stored in vector. Note that to ensure the +fields in entry can accommodate the data, vector must have been created +from entry with dtuple_convert_big_rec. 
*/ + +void +dtuple_convert_back_big_rec( +/*========================*/ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: entry whose data was put to vector */ + big_rec_t* vector) /* in, own: big rec vector; it is + freed in this function */ +{ + dfield_t* dfield; + ulint i; + + for (i = 0; i < vector->n_fields; i++) { + + dfield = dtuple_get_nth_field(entry, + vector->fields[i].field_no); + /* Copy data from big rec vector */ + + ut_memcpy(((byte*)dfield->data) + + dfield->len - BTR_EXTERN_FIELD_REF_SIZE, + vector->fields[i].data, + vector->fields[i].len); + dfield->len = dfield->len + vector->fields[i].len + - BTR_EXTERN_FIELD_REF_SIZE; + } + + mem_heap_free(vector->heap); +} + +/****************************************************************** +Frees the memory in a big rec vector. */ + +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector) /* in, own: big rec vector; it is + freed in this function */ +{ + mem_heap_free(vector->heap); +} + #ifdef notdefined /****************************************************************** diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c index 6f201c7bce4..5c783627721 100644 --- a/innobase/fil/fil0fil.c +++ b/innobase/fil/fil0fil.c @@ -90,6 +90,9 @@ struct fil_node_struct { is ignored) */ ulint n_pending; /* count of pending i/o-ops on this file */ + ibool is_modified; /* this is set to TRUE when we write + to the file and FALSE when we call fil_flush + for this file space */ UT_LIST_NODE_T(fil_node_t) chain; /* link field for the file chain */ UT_LIST_NODE_T(fil_node_t) LRU; @@ -301,6 +304,8 @@ fil_node_create( node->size = size; node->magic_n = FIL_NODE_MAGIC_N; node->n_pending = 0; + + node->is_modified = FALSE; HASH_SEARCH(hash, system->spaces, id, space, space->id == id); @@ -721,6 +726,47 @@ fil_space_get_size( } /*********************************************************************** +Checks if the pair space, page_no refers to an existing page in a +tablespace file space. */ + +ibool +fil_check_adress_in_tablespace( +/*===========================*/ + /* out: TRUE if the address is meaningful */ + ulint id, /* in: space id */ + ulint page_no)/* in: page number */ +{ + fil_space_t* space; + fil_system_t* system = fil_system; + ulint size; + ibool ret; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space == NULL) { + ret = FALSE; + } else { + size = space->size; + + if (page_no > size) { + ret = FALSE; + } else if (space->purpose != FIL_TABLESPACE) { + ret = FALSE; + } else { + ret = TRUE; + } + } + + mutex_exit(&(system->mutex)); + + return(ret); +} + +/*********************************************************************** Tries to reserve free extents in a file space. 
*/ ibool @@ -812,8 +858,14 @@ fil_node_prepare_for_io( fil_node_close(last_node, system); } - node->handle = os_file_create(node->name, OS_FILE_OPEN, - OS_FILE_AIO, &ret); + if (space->purpose == FIL_LOG) { + node->handle = os_file_create(node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_LOG_FILE, &ret); + } else { + node->handle = os_file_create(node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_DATA_FILE, &ret); + } + ut_a(ret); node->open = TRUE; @@ -851,7 +903,8 @@ void fil_node_complete_io( /*=================*/ fil_node_t* node, /* in: file node */ - fil_system_t* system) /* in: file system */ + fil_system_t* system, /* in: file system */ + ulint type) /* in: OS_FILE_WRITE or ..._READ */ { ut_ad(node); ut_ad(system); @@ -860,6 +913,10 @@ fil_node_complete_io( node->n_pending--; + if (type != OS_FILE_READ) { + node->is_modified = TRUE; + } + if (node->n_pending == 0) { /* The node must be put back to the LRU list */ UT_LIST_ADD_FIRST(LRU, system->LRU, node); @@ -1016,7 +1073,7 @@ loop: mutex_enter(&(system->mutex)); - fil_node_complete_io(node, system); + fil_node_complete_io(node, system, type); mutex_exit(&(system->mutex)); @@ -1090,12 +1147,14 @@ fil_aio_wait( fil_node_t* fil_node; fil_system_t* system = fil_system; void* message; + ulint type; ut_ad(fil_validate()); if (os_aio_use_native_aio) { #ifdef WIN_ASYNC_IO - ret = os_aio_windows_handle(segment, 0, &fil_node, &message); + ret = os_aio_windows_handle(segment, 0, &fil_node, &message, + &type); #elif defined(POSIX_ASYNC_IO) ret = os_aio_posix_handle(segment, &fil_node, &message); #else @@ -1103,14 +1162,14 @@ fil_aio_wait( #endif } else { ret = os_aio_simulated_handle(segment, (void**) &fil_node, - &message); + &message, &type); } ut_a(ret); mutex_enter(&(system->mutex)); - fil_node_complete_io(fil_node, fil_system); + fil_node_complete_io(fil_node, fil_system, type); mutex_exit(&(system->mutex)); @@ -1149,8 +1208,10 @@ fil_flush( node = UT_LIST_GET_FIRST(space->chain); while (node) { - if (node->open) { + if (node->open && node->is_modified) { file = node->handle; + + node->is_modified = FALSE; mutex_exit(&(system->mutex)); @@ -1159,9 +1220,11 @@ fil_flush( handle is still open: we assume that the OS will not crash or trap even if we pass a handle to a closed file below in os_file_flush! 
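The new is_modified flag turns fil_flush() from an unconditional fsync of every open file into a flush of only the files written since the last flush. Condensed to its essentials (an illustrative restatement, not the patched function itself), the loop now behaves like this:

	/* Sketch of the fil_flush() logic after this patch; locking and
	list traversal details are simplified */

	while (node) {
		if (node->open && node->is_modified) {
			file = node->handle;

			/* clear the flag while still holding the mutex:
			a write that completes during the flush below will
			set it again and be covered by a later flush */
			node->is_modified = FALSE;

			mutex_exit(&(system->mutex));
			os_file_flush(file);
			mutex_enter(&(system->mutex));
		}

		node = UT_LIST_GET_NEXT(chain, node);
	}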
*/ + + /* printf("Flushing to file %s\n", node->name); */ os_file_flush(file); - + mutex_enter(&(system->mutex)); } diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c index 101fb5f3ba0..ccc13f15fde 100644 --- a/innobase/fsp/fsp0fsp.c +++ b/innobase/fsp/fsp0fsp.c @@ -3239,8 +3239,8 @@ fsp_validate( ut_a(descr_count * FSP_EXTENT_SIZE == free_limit); ut_a(n_used + n_full_frag_pages - == n_used2 + (free_limit + XDES_DESCRIBED_PER_PAGE - 1) - / XDES_DESCRIBED_PER_PAGE + == n_used2 + 2* ((free_limit + XDES_DESCRIBED_PER_PAGE - 1) + / XDES_DESCRIBED_PER_PAGE) + seg_inode_len_full + seg_inode_len_free); ut_a(frag_n_used == n_used); diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c index 171c6169927..fd7b415551f 100644 --- a/innobase/ibuf/ibuf0ibuf.c +++ b/innobase/ibuf/ibuf0ibuf.c @@ -1698,8 +1698,7 @@ loop: btr_pcur_open_at_rnd_pos(data->index, BTR_SEARCH_LEAF, &pcur, &mtr); - if (data->size == 1 - && 0 == page_get_n_recs(btr_pcur_get_page(&pcur))) { + if (0 == page_get_n_recs(btr_pcur_get_page(&pcur))) { /* This tree is empty */ @@ -1946,6 +1945,7 @@ ibuf_insert_low( ulint page_no,/* in: page number where to insert */ que_thr_t* thr) /* in: query thread */ { + big_rec_t* dummy_big_rec; ulint entry_size; btr_pcur_t pcur; btr_cur_t* cursor; @@ -2101,7 +2101,8 @@ ibuf_insert_low( if (mode == BTR_MODIFY_PREV) { err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor, - ibuf_entry, &ins_rec, thr, + ibuf_entry, &ins_rec, + &dummy_big_rec, thr, &mtr); if (err == DB_SUCCESS) { /* Update the page max trx id field */ @@ -2121,7 +2122,8 @@ ibuf_insert_low( err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, cursor, - ibuf_entry, &ins_rec, thr, + ibuf_entry, &ins_rec, + &dummy_big_rec, thr, &mtr); if (err == DB_SUCCESS) { /* Update the page max trx id field */ diff --git a/innobase/include/btr0btr.h b/innobase/include/btr0btr.h index f8a3000ca8a..bea85565125 100644 --- a/innobase/include/btr0btr.h +++ b/innobase/include/btr0btr.h @@ -357,6 +357,44 @@ btr_get_size( /* out: number of pages */ dict_index_t* index, /* in: index */ ulint flag); /* in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ +/****************************************************************** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! */ + +page_t* +btr_page_alloc( +/*===========*/ + /* out: new allocated page, x-latched; + NULL if out of space */ + dict_tree_t* tree, /* in: index tree */ + ulint hint_page_no, /* in: hint of a good page */ + byte file_direction, /* in: direction where a possible + page split is made */ + ulint level, /* in: level where the page is placed + in the tree */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Frees a file page used in an index tree. NOTE: cannot free field external +storage pages because the page must contain info on its level. */ + +void +btr_page_free( +/*==========*/ + dict_tree_t* tree, /* in: index tree */ + page_t* page, /* in: page to be freed, x-latched */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Frees a file page used in an index tree. Can be used also to BLOB +external storage pages, because the page level 0 can be given as an +argument. 
*/ + +void +btr_page_free_low( +/*==============*/ + dict_tree_t* tree, /* in: index tree */ + page_t* page, /* in: page to be freed, x-latched */ + ulint level, /* in: page level */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Prints size info of a B-tree. */ diff --git a/innobase/include/btr0cur.h b/innobase/include/btr0cur.h index 4ce2177bfe8..ffae434a5d9 100644 --- a/innobase/include/btr0cur.h +++ b/innobase/include/btr0cur.h @@ -151,11 +151,14 @@ btr_cur_optimistic_insert( ulint flags, /* in: undo logging and locking flags: if not zero, the parameters index and thr should be specified */ - btr_cur_t* cursor, /* in: cursor on page after which - to insert; cursor stays valid */ + btr_cur_t* cursor, /* in: cursor on page after which to insert; + cursor stays valid */ dtuple_t* entry, /* in: entry to insert */ rec_t** rec, /* out: pointer to inserted record if succeed */ + big_rec_t** big_rec,/* out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ que_thr_t* thr, /* in: query thread or NULL */ mtr_t* mtr); /* in: mtr */ /***************************************************************** @@ -169,13 +172,19 @@ btr_cur_pessimistic_insert( /*=======================*/ /* out: DB_SUCCESS or error number */ ulint flags, /* in: undo logging and locking flags: if not - zero, the parameters index and thr should be - specified */ + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ btr_cur_t* cursor, /* in: cursor after which to insert; - cursor does not stay valid */ + cursor stays valid */ dtuple_t* entry, /* in: entry to insert */ rec_t** rec, /* out: pointer to inserted record if succeed */ + big_rec_t** big_rec,/* out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ que_thr_t* thr, /* in: query thread or NULL */ mtr_t* mtr); /* in: mtr */ /***************************************************************** @@ -228,8 +237,9 @@ btr_cur_pessimistic_update( /* out: DB_SUCCESS or error code */ ulint flags, /* in: undo logging, locking, and rollback flags */ - btr_cur_t* cursor, /* in: cursor on the record to update; - cursor does not stay valid */ + btr_cur_t* cursor, /* in: cursor on the record to update */ + big_rec_t** big_rec,/* out: big rec vector whose fields have to + be stored externally by the caller, or NULL */ upd_t* update, /* in: update vector; this is allowed also contain trx id and roll ptr fields, but the values in update vector have no effect */ @@ -407,6 +417,92 @@ btr_estimate_number_of_different_key_vals( /*======================================*/ /* out: estimated number of key values */ dict_index_t* index); /* in: index */ +/*********************************************************************** +Stores the fields in big_rec_vec to the tablespace and puts pointers to +them in rec. The fields are stored on pages allocated from leaf node +file segment of the index tree. 
*/ + +ulint +btr_store_big_rec_extern_fields( +/*============================*/ + /* out: DB_SUCCESS or error */ + dict_index_t* index, /* in: index of rec; the index tree + MUST be X-latched */ + rec_t* rec, /* in: record */ + big_rec_t* big_rec_vec, /* in: vector containing fields + to be stored externally */ + mtr_t* local_mtr); /* in: mtr containing the latch to + rec and to the tree */ +/*********************************************************************** +Frees the space in an externally stored field to the file space +management. */ + +void +btr_free_externally_stored_field( +/*=============================*/ + dict_index_t* index, /* in: index of the data, the index + tree MUST be X-latched */ + byte* data, /* in: internally stored data + + reference to the externally + stored part */ + ulint local_len, /* in: length of data */ + mtr_t* local_mtr); /* in: mtr containing the latch to + data an an X-latch to the index + tree */ +/*************************************************************** +Frees the externally stored fields for a record. */ + +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /* in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /* in: record */ + mtr_t* mtr); /* in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +/*********************************************************************** +Copies an externally stored field of a record to mem heap. */ + +byte* +btr_rec_copy_externally_stored_field( +/*=================================*/ + /* out: the field copied to heap */ + rec_t* rec, /* in: record */ + ulint no, /* in: field number */ + ulint* len, /* out: length of the field */ + mem_heap_t* heap); /* in: mem heap */ +/*********************************************************************** +Copies an externally stored field of a record to mem heap. Parameter +data contains a pointer to 'internally' stored part of the field: +possibly some data, and the reference to the externally stored part in +the last 20 bytes of data. */ + +byte* +btr_copy_externally_stored_field( +/*=============================*/ + /* out: the whole field copied to heap */ + ulint* len, /* out: length of the whole field */ + byte* data, /* in: 'internally' stored part of the + field containing also the reference to + the external part */ + ulint local_len,/* in: length of data */ + mem_heap_t* heap); /* in: mem heap */ +/*********************************************************************** +Stores the positions of the fields marked as extern storage in the update +vector, and also those fields who are marked as extern storage in rec +and not mentioned in updated fields. We use this function to remember +which fields we must mark as extern storage in a record inserted for an +update. 
*/ + +ulint +btr_push_update_extern_fields( +/*==========================*/ + /* out: number of values stored in ext_vect */ + ulint* ext_vect, /* in: array of ulints, must be preallocated + to have place for all fields in rec */ + rec_t* rec, /* in: record */ + upd_t* update); /* in: update vector */ /*######################################################################*/ @@ -516,6 +612,19 @@ and sleep this many microseconds in between */ #define BTR_CUR_RETRY_DELETE_N_TIMES 100 #define BTR_CUR_RETRY_SLEEP_TIME 50000 +/* The reference in a field of which data is stored on a different page */ +/*--------------------------------------*/ +#define BTR_EXTERN_SPACE_ID 0 /* space id where stored */ +#define BTR_EXTERN_PAGE_NO 4 /* page no where stored */ +#define BTR_EXTERN_OFFSET 8 /* offset of BLOB header + on that page */ +#define BTR_EXTERN_LEN 12 /* 8 bytes containing the + length of the externally + stored part of the BLOB */ +/*--------------------------------------*/ +#define BTR_EXTERN_FIELD_REF_SIZE 20 + + extern ulint btr_cur_n_non_sea; #ifndef UNIV_NONINL diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h index 7f3e20a4505..8b22561adf8 100644 --- a/innobase/include/buf0buf.h +++ b/innobase/include/buf0buf.h @@ -378,6 +378,14 @@ buf_calc_page_checksum( /*===================*/ /* out: checksum */ byte* page); /* in: buffer page */ +/************************************************************************ +Checks if a page is corrupt. */ + +ibool +buf_page_is_corrupted( +/*==================*/ + /* out: TRUE if corrupted */ + byte* read_buf); /* in: a database page */ /************************************************************************** Gets the page number of a pointer pointing within a buffer frame containing a file page. */ diff --git a/innobase/include/buf0flu.h b/innobase/include/buf0flu.h index 9317950904f..cb1c0965a65 100644 --- a/innobase/include/buf0flu.h +++ b/innobase/include/buf0flu.h @@ -101,7 +101,7 @@ make sure that a read-ahead batch can be read efficiently in a single sweep). */ #define BUF_FLUSH_FREE_BLOCK_MARGIN (5 + BUF_READ_AHEAD_AREA) -#define BUF_FLUSH_EXTRA_MARGIN (BUF_FLUSH_FREE_BLOCK_MARGIN / 4) +#define BUF_FLUSH_EXTRA_MARGIN (BUF_FLUSH_FREE_BLOCK_MARGIN / 4 + 100) #ifndef UNIV_NONINL #include "buf0flu.ic" diff --git a/innobase/include/data0data.h b/innobase/include/data0data.h index d7f0986b0b6..f695e0989a5 100644 --- a/innobase/include/data0data.h +++ b/innobase/include/data0data.h @@ -14,6 +14,9 @@ Created 5/30/1994 Heikki Tuuri #include "data0types.h" #include "data0type.h" #include "mem0mem.h" +#include "dict0types.h" + +typedef struct big_rec_struct big_rec_t; /* Some non-inlined functions used in the MySQL interface: */ void @@ -312,6 +315,41 @@ dtuple_sprintf( char* buf, /* in: print buffer */ ulint buf_len,/* in: buf length in bytes */ dtuple_t* tuple); /* in: tuple */ +/****************************************************************** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. 
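The BTR_EXTERN_* offsets above describe the 20-byte reference appended to the locally stored part of a column. A sketch of decoding such a reference is shown below; data and local_len stand for the values returned by rec_get_nth_field() for the column, and reading only the low 4 bytes of the 8-byte length field is a simplification made for illustration.

	/* Sketch only: decode the external field reference kept in the
	last BTR_EXTERN_FIELD_REF_SIZE bytes of the local data */

	byte*	ref;
	ulint	space_id;
	ulint	page_no;
	ulint	offset;
	ulint	ext_len;	/* low word of the 8-byte length */

	ref = data + local_len - BTR_EXTERN_FIELD_REF_SIZE;

	space_id = mach_read_from_4(ref + BTR_EXTERN_SPACE_ID);
	page_no  = mach_read_from_4(ref + BTR_EXTERN_PAGE_NO);
	offset   = mach_read_from_4(ref + BTR_EXTERN_OFFSET);
	ext_len  = mach_read_from_4(ref + BTR_EXTERN_LEN + 4);

	/* fil_check_adress_in_tablespace(space_id, page_no) can then be
	used to sanity check the reference before following it */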
*/ + +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + /* out, own: created big record vector, + NULL if we are not able to shorten + the entry enough, i.e., if there are + too many short fields in entry */ + dict_index_t* index, /* in: index */ + dtuple_t* entry); /* in: index entry */ +/****************************************************************** +Puts back to entry the data stored in vector. Note that to ensure the +fields in entry can accommodate the data, vector must have been created +from entry with dtuple_convert_big_rec. */ + +void +dtuple_convert_back_big_rec( +/*========================*/ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: entry whose data was put to vector */ + big_rec_t* vector);/* in, own: big rec vector; it is + freed in this function */ +/****************************************************************** +Frees the memory in a big rec vector. */ + +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector); /* in, own: big rec vector; it is + freed in this function */ /*************************************************************** Generates a random tuple. */ @@ -396,7 +434,7 @@ dtuple_gen_search_tuple_TPC_C( /* Structure for an SQL data field */ struct dfield_struct{ void* data; /* pointer to data */ - ulint len; /* data length; UNIV_SQL_NULL if SQL null */ + ulint len; /* data length; UNIV_SQL_NULL if SQL null; */ dtype_t type; /* type of data */ ulint col_no; /* when building index entries, the column number can be stored here */ @@ -423,6 +461,24 @@ struct dtuple_struct { }; #define DATA_TUPLE_MAGIC_N 65478679 +/* A slot for a field in a big rec vector */ + +typedef struct big_rec_field_struct big_rec_field_t; +struct big_rec_field_struct { + ulint field_no; /* field number in record */ + ulint len; /* stored data len */ + byte* data; /* stored data */ +}; + +/* Storage format for overflow data in a big record, that is, a record +which needs external storage of data fields */ + +struct big_rec_struct { + mem_heap_t* heap; /* memory heap from which allocated */ + ulint n_fields; /* number of stored fields */ + big_rec_field_t* fields; /* stored fields */ +}; + #ifndef UNIV_NONINL #include "data0data.ic" #endif diff --git a/innobase/include/data0data.ic b/innobase/include/data0data.ic index 27b5552d338..b886ad6c69c 100644 --- a/innobase/include/data0data.ic +++ b/innobase/include/data0data.ic @@ -307,12 +307,13 @@ dtuple_create( /************************************************************** The following function returns the sum of data lengths of a tuple. The space -occupied by the field structs or the tuple struct is not counted. */ +occupied by the field structs or the tuple struct is not counted. Neither +is possible space in externally stored parts of the field. 
*/ UNIV_INLINE ulint dtuple_get_data_size( /*=================*/ - /* out: sum of data lens */ + /* out: sum of data lengths */ dtuple_t* tuple) /* in: typed data tuple */ { dfield_t* field; @@ -382,7 +383,7 @@ dtuple_datas_are_equal( field2 = dtuple_get_nth_field(tuple2, i); data2 = (byte*) dfield_get_data(field2); - len2 = dfield_get_len(field2); + len2 = dfield_get_len(field2); if (len1 != len2) { diff --git a/innobase/include/dict0dict.ic b/innobase/include/dict0dict.ic index 549a5763b44..9089ebe8edd 100644 --- a/innobase/include/dict0dict.ic +++ b/innobase/include/dict0dict.ic @@ -651,8 +651,6 @@ dict_table_get_index( char* name) /* in: index name */ { dict_index_t* index = NULL; - - mutex_enter(&(dict_sys->mutex)); index = dict_table_get_first_index(table); @@ -665,8 +663,6 @@ dict_table_get_index( index = dict_table_get_next_index(index); } - mutex_exit(&(dict_sys->mutex)); - return(index); } diff --git a/innobase/include/dict0mem.h b/innobase/include/dict0mem.h index be9cd42b7be..74ecbc8bba2 100644 --- a/innobase/include/dict0mem.h +++ b/innobase/include/dict0mem.h @@ -143,7 +143,7 @@ struct dict_col_struct{ ulint clust_pos;/* position of the column in the clustered index */ ulint ord_part;/* count of how many times this column - appears in an ordering fields of an index */ + appears in ordering fields of an index */ char* name; /* name */ dtype_t type; /* data type */ dict_table_t* table; /* back pointer to table of this column */ diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h index 9905b5a2c3c..bfc322270fc 100644 --- a/innobase/include/fil0fil.h +++ b/innobase/include/fil0fil.h @@ -196,6 +196,16 @@ fil_space_get_size( /* out: space size */ ulint id); /* in: space id */ /*********************************************************************** +Checks if the pair space, page_no refers to an existing page in a +tablespace file space. */ + +ibool +fil_check_adress_in_tablespace( +/*===========================*/ + /* out: TRUE if the address is meaningful */ + ulint id, /* in: space id */ + ulint page_no);/* in: page number */ +/*********************************************************************** Appends a new file to the chain of files of a space. File must be closed. 
*/ diff --git a/innobase/include/fsp0fsp.h b/innobase/include/fsp0fsp.h index f1be4de4d40..e7f9eab330b 100644 --- a/innobase/include/fsp0fsp.h +++ b/innobase/include/fsp0fsp.h @@ -70,7 +70,7 @@ page_t* fseg_create( /*========*/ /* out: the page where the segment header is placed, - x-latched, FIL_NULL if could not create segment + x-latched, NULL if could not create segment because of lack of space */ ulint space, /* in: space id */ ulint page, /* in: page where the segment header is placed: if diff --git a/innobase/include/mach0data.ic b/innobase/include/mach0data.ic index 176f3415281..1d6badd035b 100644 --- a/innobase/include/mach0data.ic +++ b/innobase/include/mach0data.ic @@ -115,7 +115,7 @@ mach_write_to_4( { ut_ad(b); -#if notdefined && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC) +#if (0 == 1) && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC) /* We do not use this even on Intel, because unaligned accesses may be slow */ @@ -143,7 +143,7 @@ mach_read_from_4( /* out: ulint integer */ byte* b) /* in: pointer to four bytes */ { -#if notdefined && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC) +#if (0 == 1) && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC) /* We do not use this even on Intel, because unaligned accesses may be slow */ diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h index c093cb92ca9..75bbbba549f 100644 --- a/innobase/include/os0file.h +++ b/innobase/include/os0file.h @@ -59,6 +59,10 @@ log. */ #define OS_FILE_AIO 61 #define OS_FILE_NORMAL 62 +/* Types for file create */ +#define OS_DATA_FILE 100 +#define OS_LOG_FILE 101 + /* Error codes from os_file_get_last_error */ #define OS_FILE_NOT_FOUND 71 #define OS_FILE_DISK_FULL 72 @@ -125,6 +129,7 @@ os_file_create( if a new file is created or an old overwritten */ ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o is desired, OS_FILE_NORMAL, if any normal file */ + ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ ibool* success);/* out: TRUE if succeed, FALSE if error */ /*************************************************************************** Closes a file handle. In case of error, error number can be retrieved with @@ -263,6 +268,13 @@ os_aio( operation); if mode is OS_AIO_SYNC, these are ignored */ void* message2); +/**************************************************************************** +Waits until there are no pending writes in os_aio_write_array. There can +be other, synchronous, pending writes. */ + +void +os_aio_wait_until_no_pending_writes(void); +/*=====================================*/ /************************************************************************** Wakes up simulated aio i/o-handler threads if they have something to do. 
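Every call site of os_file_create() gains the new type argument so that the flush behaviour can differ between data and log files (see the O_SYNC handling in os0file.c further down). A hedged example of the updated call follows; the file name is a placeholder and the handle type is assumed to be os_file_t as elsewhere in os0file.h.

	ibool		success;
	os_file_t	handle;

	/* open an existing tablespace file for asynchronous i/o;
	OS_DATA_FILE (rather than OS_LOG_FILE) selects the data file
	flush policy */

	handle = os_file_create("ibdata1", OS_FILE_OPEN, OS_FILE_AIO,
				OS_DATA_FILE, &success);

	if (!success) {
		/* placeholder: report os_file_get_last_error() and bail out */
	}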
*/ @@ -298,7 +310,8 @@ os_aio_windows_handle( the aio operation failed, these output parameters are valid and can be used to restart the operation, for example */ - void** message2); + void** message2, + ulint* type); /* out: OS_FILE_WRITE or ..._READ */ #endif #ifdef POSIX_ASYNC_IO /************************************************************************** @@ -335,7 +348,8 @@ os_aio_simulated_handle( the aio operation failed, these output parameters are valid and can be used to restart the operation, for example */ - void** message2); + void** message2, + ulint* type); /* out: OS_FILE_WRITE or ..._READ */ /************************************************************************** Validates the consistency of the aio system. */ diff --git a/innobase/include/rem0cmp.h b/innobase/include/rem0cmp.h index 77b9ef9edc8..10c428cb9ca 100644 --- a/innobase/include/rem0cmp.h +++ b/innobase/include/rem0cmp.h @@ -1,7 +1,7 @@ /*********************************************************************** Comparison services for records -(c) 1994-1996 Innobase Oy +(c) 1994-2001 Innobase Oy Created 7/1/1994 Heikki Tuuri ************************************************************************/ @@ -31,14 +31,18 @@ This function is used to compare a data tuple to a physical record. Only dtuple->n_fields_cmp first fields are taken into account for the the data tuple! If we denote by n = n_fields_cmp, then rec must have either m >= n fields, or it must differ from dtuple in some of -the m fields rec has. */ +the m fields rec has. If rec has an externally stored field we do not +compare it but return with value 0 if such a comparison should be +made. */ int cmp_dtuple_rec_with_match( /*======================*/ /* out: 1, 0, -1, if dtuple is greater, equal, less than rec, respectively, when only the - common first fields are compared */ + common first fields are compared, or + until the first externally stored field in + rec */ dtuple_t* dtuple, /* in: data tuple */ rec_t* rec, /* in: physical record which differs from dtuple in some of the common fields, or which @@ -89,7 +93,8 @@ cmp_dtuple_rec_prefix_equal( fields in dtuple */ /***************************************************************** This function is used to compare two physical records. Only the common -first fields are compared. */ +first fields are compared, and if an externally stored field is +encountered, then 0 is returned. */ int cmp_rec_rec_with_match( diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h index 62c0aa14519..12e3a8b39d6 100644 --- a/innobase/include/rem0rec.h +++ b/innobase/include/rem0rec.h @@ -12,6 +12,7 @@ Created 5/30/1994 Heikki Tuuri #include "univ.i" #include "data0data.h" #include "rem0types.h" +#include "mtr0types.h" /* Maximum values for various fields (for non-blob tuples) */ #define REC_MAX_N_FIELDS (1024 - 1) @@ -162,6 +163,49 @@ rec_get_nth_field_size( /* out: field size in bytes */ rec_t* rec, /* in: record */ ulint n); /* in: index of the field */ +/*************************************************************** +Gets the value of the ith field extern storage bit. If it is TRUE +it means that the field is stored on another page. */ +UNIV_INLINE +ibool +rec_get_nth_field_extern_bit( +/*=========================*/ + /* in: TRUE or FALSE */ + rec_t* rec, /* in: record */ + ulint i); /* in: ith field */ +/********************************************************** +Returns TRUE if the extern bit is set in any of the fields +of rec. 
*/ +UNIV_INLINE +ibool +rec_contains_externally_stored_field( +/*=================================*/ + /* out: TRUE if a field is stored externally */ + rec_t* rec); /* in: record */ +/*************************************************************** +Sets the value of the ith field extern storage bit. */ + +void +rec_set_nth_field_extern_bit( +/*=========================*/ + rec_t* rec, /* in: record */ + ulint i, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page where + rec is, or NULL; in the NULL case we do not + write to log about the change */ +/*************************************************************** +Sets TRUE the extern storage bits of fields mentioned in an array. */ + +void +rec_set_field_extern_bits( +/*======================*/ + rec_t* rec, /* in: record */ + ulint* vec, /* in: array of field numbers */ + ulint n_fields, /* in: number of fields numbers */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case we + do not write to log about the change */ /**************************************************************** The following function is used to get a copy of the nth data field in the record to a buffer. */ @@ -350,6 +394,15 @@ rec_sprintf( #define REC_INFO_BITS 6 /* This is single byte bit-field */ +/* Maximum lengths for the data in a physical record if the offsets +are given in one byte (resp. two byte) format. */ +#define REC_1BYTE_OFFS_LIMIT 0x7F +#define REC_2BYTE_OFFS_LIMIT 0x7FFF + +/* The data size of record must be smaller than this because we reserve +two upmost bits in a two byte offset for special purposes */ +#define REC_MAX_DATA_SIZE (16 * 1024) + #ifndef UNIV_NONINL #include "rem0rec.ic" #endif diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic index c63b25374dd..1e9ecb47e2e 100644 --- a/innobase/include/rem0rec.ic +++ b/innobase/include/rem0rec.ic @@ -25,12 +25,6 @@ significant bytes and bits are written below less significant. 4 bits info bits */ - -/* Maximum lengths for the data in a physical record if the offsets -are given as one byte (resp. two byte) format. */ -#define REC_1BYTE_OFFS_LIMIT 0x7F -#define REC_2BYTE_OFFS_LIMIT 0x7FFF - /* We list the byte offsets from the origin of the record, the mask, and the shift needed to obtain each bit-field of the record. */ @@ -66,6 +60,11 @@ one-byte and two-byte offsets */ #define REC_1BYTE_SQL_NULL_MASK 0x80 #define REC_2BYTE_SQL_NULL_MASK 0x8000 +/* In a 2-byte offset the second most significant bit denotes +a field stored to another page: */ + +#define REC_2BYTE_EXTERN_MASK 0x4000 + /*************************************************************** Sets the value of the ith field SQL null bit. */ @@ -489,7 +488,7 @@ ulint rec_2_get_field_end_info( /*=====================*/ /* out: offset of the start of the field, SQL null - flag ORed */ + flag and extern storage flag ORed */ rec_t* rec, /* in: record */ ulint n) /* in: field index */ { @@ -499,6 +498,63 @@ rec_2_get_field_end_info( return(mach_read_from_2(rec - (REC_N_EXTRA_BYTES + 2 * n + 2))); } +/*************************************************************** +Gets the value of the ith field extern storage bit. If it is TRUE +it means that the field is stored on another page. 
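With the extern bit sharing the 2-byte offset word, each field end info now carries three things: the end offset, the SQL null flag, and the extern storage flag. The fragment below is an illustrative decoding using the masks defined above; rec and i are assumed to identify a record stored in the 2-byte offsets form.

	ulint	info;
	ulint	end_offs;
	ibool	is_null;
	ibool	is_extern;

	info      = rec_2_get_field_end_info(rec, i);

	is_null   = (info & REC_2BYTE_SQL_NULL_MASK) ? TRUE : FALSE;
	is_extern = (info & REC_2BYTE_EXTERN_MASK) ? TRUE : FALSE;

	/* both flag bits must be masked away to get the data offset,
	exactly as rec_2_get_field_start_offs() now does */
	end_offs  = info & ~(REC_2BYTE_SQL_NULL_MASK
					| REC_2BYTE_EXTERN_MASK);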
*/ +UNIV_INLINE +ibool +rec_get_nth_field_extern_bit( +/*=========================*/ + /* in: TRUE or FALSE */ + rec_t* rec, /* in: record */ + ulint i) /* in: ith field */ +{ + ulint info; + + if (rec_get_1byte_offs_flag(rec)) { + + return(FALSE); + } + + info = rec_2_get_field_end_info(rec, i); + + if (info & REC_2BYTE_EXTERN_MASK) { + return(TRUE); + } + + return(FALSE); +} + +/********************************************************** +Returns TRUE if the extern bit is set in any of the fields +of rec. */ +UNIV_INLINE +ibool +rec_contains_externally_stored_field( +/*=================================*/ + /* out: TRUE if a field is stored externally */ + rec_t* rec) /* in: record */ +{ + ulint n; + ulint i; + + if (rec_get_1byte_offs_flag(rec)) { + + return(FALSE); + } + + n = rec_get_n_fields(rec); + + for (i = 0; i < n; i++) { + if (rec_get_nth_field_extern_bit(rec, i)) { + + return(TRUE); + } + } + + return(FALSE); +} + /********************************************************** Returns the offset of n - 1th field end if the record is stored in the 1-byte offsets form. If the field is SQL null, the flag is ORed in the returned @@ -616,7 +672,7 @@ rec_2_get_field_start_offs( } return(rec_2_get_prev_field_end_info(rec, n) - & ~REC_2BYTE_SQL_NULL_MASK); + & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK)); } /********************************************************** diff --git a/innobase/include/row0ins.h b/innobase/include/row0ins.h index 94b0e8dec37..612b9e8d73a 100644 --- a/innobase/include/row0ins.h +++ b/innobase/include/row0ins.h @@ -56,6 +56,9 @@ row_ins_index_entry_low( pessimistic descent down the index tree */ dict_index_t* index, /* in: index */ dtuple_t* entry, /* in: index entry to insert */ + ulint* ext_vec,/* in: array containing field numbers of + externally stored fields in entry, or NULL */ + ulint n_ext_vec,/* in: number of fields in ext_vec */ que_thr_t* thr); /* in: query thread */ /******************************************************************* Inserts an index entry to index. Tries first optimistic, then pessimistic @@ -70,6 +73,9 @@ row_ins_index_entry( DB_DUPLICATE_KEY, or some other error code */ dict_index_t* index, /* in: index */ dtuple_t* entry, /* in: index entry to insert */ + ulint* ext_vec,/* in: array containing field numbers of + externally stored fields in entry, or NULL */ + ulint n_ext_vec,/* in: number of fields in ext_vec */ que_thr_t* thr); /* in: query thread */ /*************************************************************** Inserts a row to a table. */ diff --git a/innobase/include/row0mysql.h b/innobase/include/row0mysql.h index 554da2c035c..31f9e15cddc 100644 --- a/innobase/include/row0mysql.h +++ b/innobase/include/row0mysql.h @@ -189,7 +189,9 @@ row_update_for_mysql( row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL handle */ /************************************************************************* -Does a table creation operation for MySQL. */ +Does a table creation operation for MySQL. If the name of the created +table ends to characters INNODB_MONITOR, then this also starts +printing of monitor output by the master thread. */ int row_create_table_for_mysql( @@ -209,7 +211,9 @@ row_create_index_for_mysql( dict_index_t* index, /* in: index defintion */ trx_t* trx); /* in: transaction handle */ /************************************************************************* -Drops a table for MySQL. */ +Drops a table for MySQL. 
If the name of the dropped table ends to +characters INNODB_MONITOR, then this also stops printing of monitor +output by the master thread. */ int row_drop_table_for_mysql( diff --git a/innobase/include/row0row.h b/innobase/include/row0row.h index fb1e1b01ee3..09a79e19fd7 100644 --- a/innobase/include/row0row.h +++ b/innobase/include/row0row.h @@ -250,6 +250,7 @@ row_search_index_entry( #define ROW_COPY_DATA 1 #define ROW_COPY_POINTERS 2 +#define ROW_COPY_ALSO_EXTERNALS 3 /* The allowed latching order of index records is the following: (1) a secondary index record -> diff --git a/innobase/include/row0upd.h b/innobase/include/row0upd.h index 3046345f446..9bb73726b29 100644 --- a/innobase/include/row0upd.h +++ b/innobase/include/row0upd.h @@ -147,6 +147,9 @@ row_upd_build_difference( fields, excluding roll ptr and trx id */ dict_index_t* index, /* in: clustered index */ dtuple_t* entry, /* in: entry to insert */ + ulint* ext_vec,/* in: array containing field numbers of + externally stored fields in entry, or NULL */ + ulint n_ext_vec,/* in: number of fields in ext_vec */ rec_t* rec, /* in: clustered index record */ mem_heap_t* heap); /* in: memory heap from which allocated */ /*************************************************************** @@ -262,6 +265,9 @@ struct upd_field_struct{ constants in the symbol table of the query graph */ dfield_t new_val; /* new value for the column */ + ibool extern_storage; /* this is set to TRUE if dfield + actually contains a reference to + an externally stored field */ }; /* Update vector structure */ @@ -318,6 +324,10 @@ struct upd_node_struct{ dtuple_t* row; /* NULL, or a copy (also fields copied to heap) of the row to update; this must be reset to NULL after a successful update */ + ulint* ext_vec;/* array describing which fields are stored + externally in the clustered index record of + row */ + ulint n_ext_vec;/* number of fields in ext_vec */ mem_heap_t* heap; /* memory heap used as auxiliary storage for row; this must be emptied after a successful update if node->row != NULL */ @@ -349,7 +359,7 @@ struct upd_node_struct{ looked at and updated if an ordering field changed */ -/* Compilation info flags: these must fit within one byte */ +/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */ #define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be changed in the update and no ordering field of the clustered index */ diff --git a/innobase/include/row0upd.ic b/innobase/include/row0upd.ic index b1b10bef0e8..b785e52caa0 100644 --- a/innobase/include/row0upd.ic +++ b/innobase/include/row0upd.ic @@ -23,6 +23,7 @@ upd_create( mem_heap_t* heap) /* in: heap from which memory allocated */ { upd_t* update; + ulint i; update = mem_heap_alloc(heap, sizeof(upd_t)); @@ -30,6 +31,10 @@ upd_create( update->n_fields = n; update->fields = mem_heap_alloc(heap, sizeof(upd_field_t) * n); + for (i = 0; i < n; i++) { + update->fields[i].extern_storage = 0; + } + return(update); } diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index f80abda19c6..e635964e5ec 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -27,6 +27,9 @@ extern char** srv_data_file_names; extern ulint* srv_data_file_sizes; extern ulint* srv_data_file_is_raw_partition; +#define SRV_NEW_RAW 1 +#define SRV_OLD_RAW 2 + extern char** srv_log_group_home_dirs; extern ulint srv_n_log_groups; @@ -52,10 +55,14 @@ extern ulint srv_lock_wait_timeout; extern char* srv_unix_file_flush_method_str; extern ulint srv_unix_file_flush_method; +extern 
ibool srv_use_doublewrite_buf; + extern ibool srv_set_thread_priorities; extern int srv_query_thread_priority; /*-------------------------------------------*/ + +extern ibool srv_print_innodb_monitor; extern ulint srv_n_spin_wait_rounds; extern ulint srv_spin_wait_delay; extern ibool srv_priority_boost; @@ -104,26 +111,13 @@ typedef struct srv_sys_struct srv_sys_t; /* The server system */ extern srv_sys_t* srv_sys; -/* Alternatives for file flush option in Unix; see the InnoDB manual about +/* Alternatives for fiel flush option in Unix; see the InnoDB manual about what these mean */ #define SRV_UNIX_FDATASYNC 1 #define SRV_UNIX_O_DSYNC 2 #define SRV_UNIX_LITTLESYNC 3 #define SRV_UNIX_NOSYNC 4 -/* Raw partition flags */ -#define SRV_OLD_RAW 1 -#define SRV_NEW_RAW 2 - -void -srv_mysql_thread_release(void); -/*==========================*/ -os_event_t -srv_mysql_thread_event_get(void); -void -srv_mysql_thread_slot_free( -/*==========================*/ - os_event_t event); /************************************************************************* Boots Innobase server. */ diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h index 4b12dd3c86d..cb86b2b815c 100644 --- a/innobase/include/sync0sync.h +++ b/innobase/include/sync0sync.h @@ -393,6 +393,7 @@ Memory pool mutex */ #define SYNC_RSEG_HEADER_NEW 591 #define SYNC_RSEG_HEADER 590 #define SYNC_TRX_UNDO_PAGE 570 +#define SYNC_EXTERN_STORAGE 500 #define SYNC_FSP 400 #define SYNC_FSP_PAGE 395 /*------------------------------------- Insert buffer headers */ @@ -415,6 +416,7 @@ Memory pool mutex */ the level is SYNC_MEM_HASH. */ #define SYNC_BUF_POOL 150 #define SYNC_BUF_BLOCK 149 +#define SYNC_DOUBLEWRITE 140 #define SYNC_ANY_LATCH 135 #define SYNC_MEM_HASH 131 #define SYNC_MEM_POOL 130 diff --git a/innobase/include/trx0rec.h b/innobase/include/trx0rec.h index ea9e9f3fce5..edfc283d1b2 100644 --- a/innobase/include/trx0rec.h +++ b/innobase/include/trx0rec.h @@ -45,6 +45,14 @@ trx_undo_rec_get_cmpl_info( /* out: compiler info */ trx_undo_rec_t* undo_rec); /* in: undo log record */ /************************************************************************** +Returns TRUE if an undo log record contains an extern storage field. */ +UNIV_INLINE +ibool +trx_undo_rec_get_extern_storage( +/*============================*/ + /* out: TRUE if extern */ + trx_undo_rec_t* undo_rec); /* in: undo log record */ +/************************************************************************** Reads the undo log record number. */ UNIV_INLINE dulint @@ -65,6 +73,8 @@ trx_undo_rec_get_pars( TRX_UNDO_INSERT_REC, ... 
*/ ulint* cmpl_info, /* out: compiler info, relevant only for update type records */ + ibool* updated_extern, /* out: TRUE if we updated an + externally stored fild */ dulint* undo_no, /* out: undo log record number */ dulint* table_id); /* out: table id */ /*********************************************************************** @@ -272,7 +282,11 @@ record */ do not change */ #define TRX_UNDO_CMPL_INFO_MULT 16 /* compilation info is multiplied by this and ORed to the type above */ - +#define TRX_UNDO_UPD_EXTERN 128 /* This bit can be ORed to type_cmpl + to denote that we updated external + storage fields: used by purge to + free the external storage */ + /* Operation type flags used in trx_undo_report_row_operation */ #define TRX_UNDO_INSERT_OP 1 #define TRX_UNDO_MODIFY_OP 2 diff --git a/innobase/include/trx0rec.ic b/innobase/include/trx0rec.ic index f813a52ff9c..cd02ed9e04c 100644 --- a/innobase/include/trx0rec.ic +++ b/innobase/include/trx0rec.ic @@ -31,6 +31,23 @@ trx_undo_rec_get_cmpl_info( } /************************************************************************** +Returns TRUE if an undo log record contains an extern storage field. */ +UNIV_INLINE +ibool +trx_undo_rec_get_extern_storage( +/*============================*/ + /* out: TRUE if extern */ + trx_undo_rec_t* undo_rec) /* in: undo log record */ +{ + if (mach_read_from_1(undo_rec + 2) & TRX_UNDO_UPD_EXTERN) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************** Reads the undo log record number. */ UNIV_INLINE dulint diff --git a/innobase/include/trx0sys.h b/innobase/include/trx0sys.h index d0506dd65b7..e26f7e19850 100644 --- a/innobase/include/trx0sys.h +++ b/innobase/include/trx0sys.h @@ -27,6 +27,23 @@ Created 3/26/1996 Heikki Tuuri /* The transaction system */ extern trx_sys_t* trx_sys; +/* Doublewrite system */ +extern trx_doublewrite_t* trx_doublewrite; + +/******************************************************************** +Creates the doublewrite buffer at a database start. The header of the +doublewrite buffer is placed on the trx system header page. */ + +void +trx_sys_create_doublewrite_buf(void); +/*================================*/ +/******************************************************************** +At a database startup uses a possible doublewrite buffer to restore +half-written pages in the data files. */ + +void +trx_sys_doublewrite_restore_corrupt_pages(void); +/*===========================================*/ /******************************************************************* Checks if a page address is the trx sys header page. 
*/ UNIV_INLINE @@ -235,6 +252,59 @@ therefore 256 */ segment specification slots */ /*-------------------------------------------------------------*/ +/* The offset of the doublewrite buffer header on the trx system header page */ +#define TRX_SYS_DOUBLEWRITE (UNIV_PAGE_SIZE - 200) +/*-------------------------------------------------------------*/ +#define TRX_SYS_DOUBLEWRITE_FSEG 0 /* fseg header of the fseg + containing the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE + /* 4-byte magic number which + shows if we already have + created the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE) + /* page number of the + first page in the first + sequence of 64 + (= FSP_EXTENT_SIZE) consecutive + pages in the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE) + /* page number of the + first page in the second + sequence of 64 consecutive + pages in the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /* we repeat the above 3 + numbers so that if the trx + sys header is half-written + to disk, we still may be able + to recover the information */ +/*-------------------------------------------------------------*/ +#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855 + +#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE + +/* Doublewrite control struct */ +struct trx_doublewrite_struct{ + mutex_t mutex; /* mutex protecting the first_free field and + write_buf */ + ulint block1; /* the page number of the first + doublewrite block (64 pages) */ + ulint block2; /* page number of the second block */ + ulint first_free; /* first free position in write_buf measured + in units of UNIV_PAGE_SIZE */ + byte* write_buf; /* write buffer used in writing to the + doublewrite buffer, aligned to an + address divisible by UNIV_PAGE_SIZE + (which is required by Windows aio) */ + byte* write_buf_unaligned; /* pointer to write_buf, but unaligned */ + buf_block_t** + buf_block_arr; /* array to store pointers to the buffer + blocks which have been cached to write_buf */ +}; + /* The transaction system central memory data structure; protected by the kernel mutex */ struct trx_sys_struct{ diff --git a/innobase/include/trx0types.h b/innobase/include/trx0types.h index 02da1605077..b8befe7172f 100644 --- a/innobase/include/trx0types.h +++ b/innobase/include/trx0types.h @@ -15,6 +15,7 @@ Created 3/26/1996 Heikki Tuuri /* Memory objects */ typedef struct trx_struct trx_t; typedef struct trx_sys_struct trx_sys_t; +typedef struct trx_doublewrite_struct trx_doublewrite_t; typedef struct trx_sig_struct trx_sig_t; typedef struct trx_rseg_struct trx_rseg_t; typedef struct trx_undo_struct trx_undo_t; diff --git a/innobase/include/trx0undo.h b/innobase/include/trx0undo.h index 82c21f756e6..7f0378c68d3 100644 --- a/innobase/include/trx0undo.h +++ b/innobase/include/trx0undo.h @@ -341,7 +341,9 @@ struct trx_undo_struct{ have delete marked records, because of a delete of a row or an update of an indexed field; purge is then - necessary. 
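The block1/block2 page numbers and TRX_SYS_DOUBLEWRITE_BLOCK_SIZE defined here are what the read path in buf0rea.c (earlier in this patch) checks against, so that doublewrite pages are never read into the buffer pool as ordinary pages. Written out as a stand-alone helper, purely for illustration:

ibool
page_is_in_doublewrite_buffer(	/* illustrative helper, not in the patch */
	ulint	space,	/* in: space id */
	ulint	offset)	/* in: page number */
{
	if (trx_doublewrite == NULL || space != TRX_SYS_SPACE) {

		return(FALSE);
	}

	if (offset >= trx_doublewrite->block1
	    && offset < trx_doublewrite->block1
				+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		return(TRUE);
	}

	if (offset >= trx_doublewrite->block2
	    && offset < trx_doublewrite->block2
				+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		return(TRUE);
	}

	return(FALSE);
}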
*/ + necessary; also TRUE if the transaction + has updated an externally stored + field */ dulint trx_id; /* id of the trx assigned to the undo log */ ibool dict_operation; /* TRUE if a dict operation trx */ diff --git a/innobase/include/univ.i b/innobase/include/univ.i index 73bf48b1bc0..6ffbb1b8fef 100644 --- a/innobase/include/univ.i +++ b/innobase/include/univ.i @@ -9,11 +9,12 @@ Created 1/20/1994 Heikki Tuuri #ifndef univ_i #define univ_i -#undef UNIV_INTEL_X86 - -#if (defined(_WIN32) || defined(_WIN64)) && !defined(MYSQL_SERVER) +#if (defined(_WIN32) || defined(_WIN64)) #define __WIN__ + +#ifndef MYSQL_SERVER #include <windows.h> +#endif /* If you want to check for errors with compiler level -W4, comment out the above include of windows.h and let the following defines @@ -40,10 +41,8 @@ subdirectory of 'mysql'. */ #include <global.h> #include <my_pthread.h> -#ifndef __WIN__ /* Include <sys/stat.h> to get S_I... macros defined for os0file.c */ #include <sys/stat.h> -#endif #undef PACKAGE #undef VERSION @@ -63,19 +62,21 @@ subdirectory of 'mysql'. */ /* DEBUG VERSION CONTROL ===================== */ + +/* +#define UNIV_SYNC_DEBUG +*/ + /* Make a non-inline debug version */ /* #define UNIV_DEBUG #define UNIV_MEM_DEBUG -#define UNIV_SYNC_DEBUG #define UNIV_SEARCH_DEBUG #define UNIV_IBUF_DEBUG #define UNIV_SYNC_PERF_STAT #define UNIV_SEARCH_PERF_STAT - -#define UNIV_DEBUG_FILE_ACCESSES */ #define UNIV_LIGHT_MEM_DEBUG @@ -192,6 +193,13 @@ headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */ has the SQL NULL as its value. */ #define UNIV_SQL_NULL ULINT_UNDEFINED +/* Lengths which are not UNIV_SQL_NULL, but bigger than the following +number indicate that a field contains a reference to an externally +stored part of the field in the tablespace. The length field then +contains the sum of the following flag and the locally stored len. 
*/ + +#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE) + /* The following definition of __FILE__ removes compiler warnings associated with const char* / char* mismatches with __FILE__ */ diff --git a/innobase/include/ut0dbg.h b/innobase/include/ut0dbg.h index 657d1bf95b2..fc5d493ca5e 100644 --- a/innobase/include/ut0dbg.h +++ b/innobase/include/ut0dbg.h @@ -41,7 +41,7 @@ extern ulint* ut_dbg_null_ptr; }\ if (ut_dbg_stop_threads) {\ fprintf(stderr,\ - "Innobase: Thread %lu stopped in file %s line %lu\n",\ + "InnoDB: Thread %lu stopped in file %s line %lu\n",\ os_thread_get_curr_id(), IB__FILE__, (ulint)__LINE__);\ os_thread_sleep(1000000000);\ }\ @@ -50,19 +50,17 @@ extern ulint* ut_dbg_null_ptr; #define ut_error {\ ulint dbg_i;\ fprintf(stderr,\ - "Innobase: Assertion failure in thread %lu in file %s line %lu\n",\ + "InnoDB: Assertion failure in thread %lu in file %s line %lu\n",\ os_thread_get_curr_id(), IB__FILE__, (ulint)__LINE__);\ fprintf(stderr,\ - "Innobase: we intentionally generate a memory trap.\n");\ + "InnoDB: We intentionally generate a memory trap.\n");\ fprintf(stderr,\ - "Innobase: Send a bug report to mysql@lists.mysql.com\n");\ + "InnoDB: Send a detailed bug report to mysql@lists.mysql.com\n");\ ut_dbg_stop_threads = TRUE;\ dbg_i = *(ut_dbg_null_ptr);\ printf("%lu", dbg_i);\ } - - #ifdef UNIV_DEBUG #define ut_ad(EXPR) ut_a(EXPR) #define ut_d(EXPR) {EXPR;} diff --git a/innobase/include/ut0ut.h b/innobase/include/ut0ut.h index 1e93a2b8a36..e1813e763bd 100644 --- a/innobase/include/ut0ut.h +++ b/innobase/include/ut0ut.h @@ -11,8 +11,7 @@ Created 1/20/1994 Heikki Tuuri #include "univ.i" #include <time.h> -#include <m_ctype.h> - +#include <ctype.h> typedef time_t ib_time_t; diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c index 79fb66459b2..5f8f538f392 100644 --- a/innobase/lock/lock0lock.c +++ b/innobase/lock/lock0lock.c @@ -3219,6 +3219,7 @@ lock_rec_print( ulint space; ulint page_no; ulint i; + ulint count = 0; mtr_t mtr; ut_ad(mutex_own(&kernel_mutex)); @@ -3230,7 +3231,8 @@ lock_rec_print( printf("\nRECORD LOCKS space id %lu page no %lu n bits %lu", space, page_no, lock_rec_get_n_bits(lock)); - printf(" index %s trx id %lu %lu", (lock->index)->name, + printf(" table %s index %s trx id %lu %lu", + lock->index->table->name, lock->index->name, (lock->trx)->id.high, (lock->trx)->id.low); if (lock_get_mode(lock) == LOCK_S) { @@ -3281,10 +3283,18 @@ lock_rec_print( rec_print(page_find_rec_with_heap_no(page, i)); } + count++; + printf("\n"); } - } + if (count >= 3) { + printf( + "3 LOCKS PRINTED FOR THIS TRX AND PAGE: SUPPRESSING FURTHER PRINTS\n"); + goto end_prints; + } + } +end_prints: mtr_commit(&mtr); } @@ -3335,7 +3345,6 @@ lock_print_info(void) lock_mutex_enter_kernel(); - printf("------------------------------------\n"); printf("LOCK INFO:\n"); printf("Number of locks in the record hash table %lu\n", lock_get_n_rec_locks()); @@ -3352,7 +3361,7 @@ loop: if (trx == NULL) { lock_mutex_exit_kernel(); - lock_validate(); + /* lock_validate(); */ return; } @@ -3360,6 +3369,19 @@ loop: if (nth_lock == 0) { printf("\nLOCKS FOR TRANSACTION ID %lu %lu\n", trx->id.high, trx->id.low); + if (trx->que_state == TRX_QUE_LOCK_WAIT) { + printf( + "################# TRX IS WAITING FOR THE LOCK: ###\n"); + + if (lock_get_type(trx->wait_lock) == LOCK_REC) { + lock_rec_print(trx->wait_lock); + } else { + lock_table_print(trx->wait_lock); + } + + printf( + "##################################################\n"); + } } i = 0; @@ -3409,6 +3431,16 @@ loop: 
nth_lock++; + if (nth_lock >= 25) { + printf( + "25 LOCKS PRINTED FOR THIS TRX: SUPPRESSING FURTHER PRINTS\n"); + + nth_trx++; + nth_lock = 0; + + goto loop; + } + goto loop; } diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c index 31cf595e59e..351ea7f2fd5 100644 --- a/innobase/log/log0log.c +++ b/innobase/log/log0log.c @@ -838,7 +838,9 @@ log_io_complete( /* It was a checkpoint write */ group = (log_group_t*)((ulint)group - 1); - if (srv_unix_file_flush_method == SRV_UNIX_LITTLESYNC) { + if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { + fil_flush(group->space_id); } @@ -847,7 +849,9 @@ log_io_complete( return; } - if (srv_unix_file_flush_method == SRV_UNIX_LITTLESYNC) { + if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { + fil_flush(group->space_id); } @@ -1478,7 +1482,7 @@ log_checkpoint( recv_apply_hashed_log_recs(TRUE); } - if (srv_unix_file_flush_method == SRV_UNIX_LITTLESYNC) { + if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { fil_flush_file_spaces(FIL_TABLESPACE); } @@ -1885,10 +1889,11 @@ loop: fil_reserve_right_to_open(); file_handle = os_file_create(name, open_mode, OS_FILE_AIO, - &ret); + OS_DATA_FILE, &ret); + if (!ret && (open_mode == OS_FILE_CREATE)) { file_handle = os_file_create(name, OS_FILE_OPEN, - OS_FILE_AIO, &ret); + OS_FILE_AIO, OS_DATA_FILE, &ret); } if (!ret) { diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c index d16085a2d6f..edab98fa39c 100644 --- a/innobase/log/log0recv.c +++ b/innobase/log/log0recv.c @@ -2234,7 +2234,8 @@ try_open_again: fil_reserve_right_to_open(); - file_handle = os_file_create(name, OS_FILE_OPEN, OS_FILE_AIO, &ret); + file_handle = os_file_create(name, OS_FILE_OPEN, + OS_FILE_LOG, OS_FILE_AIO, &ret); if (ret == FALSE) { fil_release_right_to_open(); diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index 668d74d75b5..0525fd7b59a 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -10,6 +10,7 @@ Created 10/21/1995 Heikki Tuuri #include "os0sync.h" #include "ut0mem.h" #include "srv0srv.h" +#include "trx0sys.h" #undef HAVE_FDATASYNC @@ -74,9 +75,12 @@ typedef struct os_aio_array_struct os_aio_array_t; struct os_aio_array_struct{ os_mutex_t mutex; /* the mutex protecting the aio array */ - os_event_t not_full; /* The event which is set to signaled + os_event_t not_full; /* The event which is set to the signaled state when there is space in the aio outside the ibuf segment */ + os_event_t is_empty; /* The event which is set to the signaled + state when there are no pending i/os + in this array */ ulint n_slots; /* Total number of slots in the aio array. This must be divisible by n_threads. 
*/ ulint n_segments;/* Number of segments in the aio array of @@ -254,6 +258,7 @@ os_file_create( if a new is created or an old overwritten */ ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o is desired, OS_FILE_NORMAL, if any normal file */ + ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ ibool* success)/* out: TRUE if succeed, FALSE if error */ { #ifdef __WIN__ @@ -347,9 +352,11 @@ try_again: UT_NOT_USED(purpose); -#ifdef O_DSYNC - if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { - create_flag = create_flag | O_DSYNC; +#ifdef O_SYNC + if ((!srv_use_doublewrite_buf || type != OS_DATA_FILE) + && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + + create_flag = create_flag | O_SYNC; } #endif if (create_mode == OS_FILE_CREATE) { @@ -548,12 +555,6 @@ os_file_flush( #else int ret; -#ifdef O_DSYNC - if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { - return(TRUE); - } -#endif - #ifdef HAVE_FDATASYNC ret = fdatasync(file); #else @@ -634,7 +635,8 @@ os_file_pwrite( ret = pwrite(file, buf, n, offs); if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC - && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC + && !trx_doublewrite) { /* Always do fsync to reduce the probability that when the OS crashes, a database page is only partially @@ -663,7 +665,8 @@ os_file_pwrite( ret = write(file, buf, n); if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC - && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC + && !trx_doublewrite) { /* Always do fsync to reduce the probability that when the OS crashes, a database page is only partially @@ -822,7 +825,9 @@ try_again: /* Always do fsync to reduce the probability that when the OS crashes, a database page is only partially physically written to disk. */ - ut_a(TRUE == os_file_flush(file)); + if (!trx_doublewrite) { + ut_a(TRUE == os_file_flush(file)); + } os_mutex_exit(os_file_seek_mutexes[i]); @@ -897,6 +902,10 @@ os_aio_array_create( array->mutex = os_mutex_create(NULL); array->not_full = os_event_create(NULL); + array->is_empty = os_event_create(NULL); + + os_event_set(array->is_empty); + array->n_slots = n; array->n_segments = n_segments; array->n_reserved = 0; @@ -996,6 +1005,17 @@ os_aio_init( #endif } +/**************************************************************************** +Waits until there are no pending writes in os_aio_write_array. There can +be other, synchronous, pending writes. */ + +void +os_aio_wait_until_no_pending_writes(void) +/*=====================================*/ +{ + os_event_wait(os_aio_write_array->is_empty); +} + /************************************************************************** Calculates segment number for a slot. 
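The is_empty event created above is what makes os_aio_wait_until_no_pending_writes() possible: the event is reset when the first slot in the write array is reserved and set again when the last one is freed (see the reserve and free paths just below). How a caller might use it is sketched here; the pairing with fil_flush_file_spaces() is an assumption about intent, not code from the patch.

	/* Sketch: drain all asynchronous data file writes before
	flushing, so that the flush covers every write already posted */

	os_aio_wait_until_no_pending_writes();

	fil_flush_file_spaces(FIL_TABLESPACE);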
*/ static @@ -1188,6 +1208,10 @@ loop: array->n_reserved++; + if (array->n_reserved == 1) { + os_event_reset(array->is_empty); + } + if (array->n_reserved == array->n_slots) { os_event_reset(array->not_full); } @@ -1261,6 +1285,10 @@ os_aio_array_free_slot( os_event_set(array->not_full); } + if (array->n_reserved == 0) { + os_event_set(array->is_empty); + } + #ifdef WIN_ASYNC_IO os_event_reset(slot->control.hEvent); #endif @@ -1374,6 +1402,7 @@ os_aio( DWORD len = n; void* dummy_mess1; void* dummy_mess2; + ulint dummy_type; #endif ulint err = 0; ibool retry; @@ -1486,8 +1515,9 @@ try_again: use the same wait mechanism as for async i/o */ return(os_aio_windows_handle(ULINT_UNDEFINED, - slot->pos, - &dummy_mess1, &dummy_mess2)); + slot->pos, + &dummy_mess1, &dummy_mess2, + &dummy_type)); } return(TRUE); @@ -1544,7 +1574,8 @@ os_aio_windows_handle( the aio operation failed, these output parameters are valid and can be used to restart the operation, for example */ - void** message2) + void** message2, + ulint* type) /* out: OS_FILE_WRITE or ..._READ */ { os_aio_array_t* array; os_aio_slot_t* slot; @@ -1589,10 +1620,12 @@ os_aio_windows_handle( *message1 = slot->message1; *message2 = slot->message2; + *type = slot->type; + if (ret && len == slot->len) { ret_val = TRUE; - if (slot->type == OS_FILE_WRITE) { + if (slot->type == OS_FILE_WRITE && !trx_doublewrite) { ut_a(TRUE == os_file_flush(slot->file)); } } else { @@ -1676,7 +1709,7 @@ os_aio_posix_handle( *message1 = slot->message1; *message2 = slot->message2; - if (slot->type == OS_FILE_WRITE) { + if (slot->type == OS_FILE_WRITE && !trx_doublewrite) { ut_a(TRUE == os_file_flush(slot->file)); } @@ -1706,7 +1739,8 @@ os_aio_simulated_handle( the aio operation failed, these output parameters are valid and can be used to restart the operation, for example */ - void** message2) + void** message2, + ulint* type) /* out: OS_FILE_WRITE or ..._READ */ { os_aio_array_t* array; ulint segment; @@ -1903,6 +1937,8 @@ slot_io_done: *message1 = slot->message1; *message2 = slot->message2; + *type = slot->type; + os_mutex_exit(array->mutex); os_aio_array_free_slot(array, slot); @@ -1986,13 +2022,13 @@ os_aio_print(void) os_aio_slot_t* slot; ulint n_reserved; ulint i; - + + printf("Pending normal aio reads:\n"); + array = os_aio_read_array; loop: ut_a(array); - printf("INFO OF AN AIO ARRAY\n"); - os_mutex_enter(array->mutex); ut_a(array->n_slots > 0); @@ -2019,24 +2055,29 @@ loop: os_mutex_exit(array->mutex); if (array == os_aio_read_array) { + printf("Pending aio writes:\n"); + array = os_aio_write_array; goto loop; } if (array == os_aio_write_array) { + printf("Pending insert buffer aio reads:\n"); array = os_aio_ibuf_array; goto loop; } if (array == os_aio_ibuf_array) { + printf("Pending log writes or reads:\n"); array = os_aio_log_array; goto loop; } if (array == os_aio_log_array) { + printf("Pending synchronous reads or writes:\n"); array = os_aio_sync_array; goto loop; diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c index e329b916b1b..0b233b4dd72 100644 --- a/innobase/page/page0cur.c +++ b/innobase/page/page0cur.c @@ -1019,16 +1019,16 @@ page_cur_delete_rec( page_cur_t* cursor, /* in: a page cursor */ mtr_t* mtr) /* in: mini-transaction handle */ { + page_dir_slot_t* cur_dir_slot; + page_dir_slot_t* prev_slot; page_t* page; rec_t* current_rec; rec_t* prev_rec = NULL; rec_t* next_rec; ulint cur_slot_no; - page_dir_slot_t* cur_dir_slot; - page_dir_slot_t* prev_slot; ulint cur_n_owned; rec_t* rec; - + ut_ad(cursor && mtr); page = 
page_cur_get_page(cursor); @@ -1037,7 +1037,7 @@ page_cur_delete_rec( /* The record must not be the supremum or infimum record. */ ut_ad(current_rec != page_get_supremum_rec(page)); ut_ad(current_rec != page_get_infimum_rec(page)); - + /* Save to local variables some data associated with current_rec */ cur_slot_no = page_dir_find_owner_slot(current_rec); cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no); diff --git a/innobase/pars/pars0pars.c b/innobase/pars/pars0pars.c index f6c51f3905a..4a298426476 100644 --- a/innobase/pars/pars0pars.c +++ b/innobase/pars/pars0pars.c @@ -2028,11 +2028,7 @@ pars_complete_graph_for_exec( que_node_set_parent(node, thr); - mutex_enter(&kernel_mutex); - trx->graph = NULL; - mutex_exit(&kernel_mutex); - return(thr); } diff --git a/innobase/rem/rem0cmp.c b/innobase/rem/rem0cmp.c index 78f4e450269..cdf1f363946 100644 --- a/innobase/rem/rem0cmp.c +++ b/innobase/rem/rem0cmp.c @@ -295,14 +295,18 @@ This function is used to compare a data tuple to a physical record. Only dtuple->n_fields_cmp first fields are taken into account for the the data tuple! If we denote by n = n_fields_cmp, then rec must have either m >= n fields, or it must differ from dtuple in some of -the m fields rec has. */ +the m fields rec has. If rec has an externally stored field we do not +compare it but return with value 0 if such a comparison should be +made. */ int cmp_dtuple_rec_with_match( /*======================*/ /* out: 1, 0, -1, if dtuple is greater, equal, less than rec, respectively, when only the - common first fields are compared */ + common first fields are compared, or + until the first externally stored field in + rec */ dtuple_t* dtuple, /* in: data tuple */ rec_t* rec, /* in: physical record which differs from dtuple in some of the common fields, or which @@ -344,7 +348,8 @@ cmp_dtuple_rec_with_match( ut_ad(cur_field <= dtuple_get_n_fields_cmp(dtuple)); ut_ad(cur_field <= rec_get_n_fields(rec)); - /* Match fields in a loop; stop if we run out of fields in dtuple */ + /* Match fields in a loop; stop if we run out of fields in dtuple + or find an externally stored field */ while (cur_field < dtuple_get_n_fields_cmp(dtuple)) { @@ -357,7 +362,8 @@ cmp_dtuple_rec_with_match( /* If we have matched yet 0 bytes, it may be that one or both the fields are SQL null, or the record or dtuple may be - the predefined minimum record */ + the predefined minimum record, or the field is externally + stored */ if (cur_bytes == 0) { if (cur_field == 0) { @@ -384,6 +390,15 @@ cmp_dtuple_rec_with_match( } } + if (rec_get_nth_field_extern_bit(rec, cur_field)) { + /* We do not compare to an externally + stored field */ + + ret = 0; + + goto order_resolved; + } + if (dtuple_f_len == UNIV_SQL_NULL || rec_f_len == UNIV_SQL_NULL) { @@ -604,7 +619,8 @@ cmp_dtuple_rec_prefix_equal( /***************************************************************** This function is used to compare two physical records. Only the common -first fields are compared. */ +first fields are compared, and if an externally stored field is +encountered, then 0 is returned. */ int cmp_rec_rec_with_match( @@ -688,8 +704,18 @@ cmp_rec_rec_with_match( goto order_resolved; } - } + } + + if (rec_get_nth_field_extern_bit(rec1, cur_field) + || rec_get_nth_field_extern_bit(rec2, cur_field)) { + /* We do not compare to an externally + stored field */ + ret = 0; + + goto order_resolved; + } + if (rec1_f_len == UNIV_SQL_NULL || rec2_f_len == UNIV_SQL_NULL) { @@ -812,7 +838,8 @@ order_resolved: Used in debug checking of cmp_dtuple_... . 
This function is used to compare a data tuple to a physical record. If dtuple has n fields then rec must have either m >= n fields, or it must -differ from dtuple in some of the m fields rec has. */ +differ from dtuple in some of the m fields rec has. If encounters an +externally stored field, returns 0. */ static int cmp_debug_dtuple_rec_with_match( @@ -882,6 +909,14 @@ cmp_debug_dtuple_rec_with_match( rec_f_data = rec_get_nth_field(rec, cur_field, &rec_f_len); + if (rec_get_nth_field_extern_bit(rec, cur_field)) { + /* We do not compare to an externally stored field */ + + ret = 0; + + goto order_resolved; + } + ret = cmp_data_data(cur_type, dtuple_f_data, dtuple_f_len, rec_f_data, rec_f_len); if (ret != 0) { diff --git a/innobase/rem/rem0rec.c b/innobase/rem/rem0rec.c index 9ddfe7a4b9a..88009f2f5c9 100644 --- a/innobase/rem/rem0rec.c +++ b/innobase/rem/rem0rec.c @@ -1,7 +1,7 @@ /************************************************************************ Record manager -(c) 1994-1996 Innobase Oy +(c) 1994-2001 Innobase Oy Created 5/30/1994 Heikki Tuuri *************************************************************************/ @@ -12,6 +12,9 @@ Created 5/30/1994 Heikki Tuuri #include "rem0rec.ic" #endif +#include "mtr0mtr.h" +#include "mtr0log.h" + /* PHYSICAL RECORD =============== @@ -21,7 +24,10 @@ found in index pages of the database, has the following format represented on a higher text line): | offset of the end of the last field of data, the most significant - bit is set to 1 if and only if the field is SQL-null | + bit is set to 1 if and only if the field is SQL-null, + if the offset is 2-byte, then the second most significant + bit is set to 1 if the field is stored on another page: + mostly this will occur in the case of big BLOB fields | ... | offset of the end of the first field of data + the SQL-null bit | | 4 bits used to delete mark a record, and mark a predefined @@ -122,7 +128,8 @@ rec_get_nth_field( return(rec + os); } - next_os = next_os & ~REC_2BYTE_SQL_NULL_MASK; + next_os = next_os & ~(REC_2BYTE_SQL_NULL_MASK + | REC_2BYTE_EXTERN_MASK); } *len = next_os - os; @@ -170,6 +177,60 @@ rec_set_nth_field_null_bit( rec_2_set_field_end_info(rec, i, info); } +/*************************************************************** +Sets the value of the ith field extern storage bit. */ + +void +rec_set_nth_field_extern_bit( +/*=========================*/ + rec_t* rec, /* in: record */ + ulint i, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr holding an X-latch to the page where + rec is, or NULL; in the NULL case we do not + write to log about the change */ +{ + ulint info; + + ut_a(!rec_get_1byte_offs_flag(rec)); + ut_a(i < rec_get_n_fields(rec)); + + info = rec_2_get_field_end_info(rec, i); + + if (val) { + info = info | REC_2BYTE_EXTERN_MASK; + } else { + info = info & ~REC_2BYTE_EXTERN_MASK; + } + + if (mtr) { + mlog_write_ulint(rec - REC_N_EXTRA_BYTES - 2 * (i + 1), info, + MLOG_2BYTES, mtr); + } else { + rec_2_set_field_end_info(rec, i, info); + } +} + +/*************************************************************** +Sets TRUE the extern storage bits of fields mentioned in an array. 
*/ + +void +rec_set_field_extern_bits( +/*======================*/ + rec_t* rec, /* in: record */ + ulint* vec, /* in: array of field numbers */ + ulint n_fields, /* in: number of fields numbers */ + mtr_t* mtr) /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case we + do not write to log about the change */ +{ + ulint i; + + for (i = 0; i < n_fields; i++) { + rec_set_nth_field_extern_bit(rec, vec[i], TRUE, mtr); + } +} + /*************************************************************** Sets a record field to SQL null. The physical size of the field is not changed. */ diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c index e57622fd1c5..8542dcae326 100644 --- a/innobase/row/row0ins.c +++ b/innobase/row/row0ins.c @@ -234,7 +234,13 @@ row_ins_clust_index_entry_by_modify( depending on whether mtr holds just a leaf latch or also a tree latch */ btr_cur_t* cursor, /* in: B-tree cursor */ + big_rec_t** big_rec,/* out: possible big rec vector of fields + which have to be stored externally by the + caller */ dtuple_t* entry, /* in: index entry to insert */ + ulint* ext_vec,/* in: array containing field numbers of + externally stored fields in entry, or NULL */ + ulint n_ext_vec,/* in: number of fields in ext_vec */ que_thr_t* thr, /* in: query thread */ mtr_t* mtr) /* in: mtr */ { @@ -243,8 +249,10 @@ row_ins_clust_index_entry_by_modify( upd_t* update; ulint err; - ut_ad((cursor->index)->type & DICT_CLUSTERED); + ut_ad(cursor->index->type & DICT_CLUSTERED); + *big_rec = NULL; + rec = btr_cur_get_rec(cursor); ut_ad(rec_get_deleted_flag(rec)); @@ -254,21 +262,21 @@ row_ins_clust_index_entry_by_modify( /* Build an update vector containing all the fields to be modified; NOTE that this vector may contain also system columns! 
*/ - update = row_upd_build_difference(cursor->index, entry, rec, heap); - + update = row_upd_build_difference(cursor->index, entry, ext_vec, + n_ext_vec, rec, heap); if (mode == BTR_MODIFY_LEAF) { /* Try optimistic updating of the record, keeping changes within the page */ - err = btr_cur_optimistic_update(0, cursor, update, 0, thr, - mtr); - if ((err == DB_OVERFLOW) || (err == DB_UNDERFLOW)) { + err = btr_cur_optimistic_update(0, cursor, update, 0, thr, mtr); + + if (err == DB_OVERFLOW || err == DB_UNDERFLOW) { err = DB_FAIL; } } else { - ut_ad(mode == BTR_MODIFY_TREE); - err = btr_cur_pessimistic_update(0, cursor, update, 0, thr, - mtr); + ut_a(mode == BTR_MODIFY_TREE); + err = btr_cur_pessimistic_update(0, cursor, big_rec, update, + 0, thr, mtr); } mem_heap_free(heap); @@ -597,14 +605,18 @@ row_ins_index_entry_low( pessimistic descent down the index tree */ dict_index_t* index, /* in: index */ dtuple_t* entry, /* in: index entry to insert */ + ulint* ext_vec,/* in: array containing field numbers of + externally stored fields in entry, or NULL */ + ulint n_ext_vec,/* in: number of fields in ext_vec */ que_thr_t* thr) /* in: query thread */ { btr_cur_t cursor; ulint modify; - rec_t* dummy_rec; + rec_t* insert_rec; rec_t* rec; ulint err; ulint n_unique; + big_rec_t* big_rec = NULL; mtr_t mtr; log_free_check(); @@ -682,24 +694,54 @@ row_ins_index_entry_low( if (index->type & DICT_CLUSTERED) { err = row_ins_clust_index_entry_by_modify(mode, - &cursor, entry, - thr, &mtr); + &cursor, &big_rec, + entry, + ext_vec, n_ext_vec, + thr, &mtr); } else { err = row_ins_sec_index_entry_by_modify(&cursor, thr, &mtr); } - } else if (mode == BTR_MODIFY_LEAF) { - err = btr_cur_optimistic_insert(0, &cursor, entry, - &dummy_rec, thr, &mtr); } else { - ut_ad(mode == BTR_MODIFY_TREE); - err = btr_cur_pessimistic_insert(0, &cursor, entry, - &dummy_rec, thr, &mtr); + if (mode == BTR_MODIFY_LEAF) { + err = btr_cur_optimistic_insert(0, &cursor, entry, + &insert_rec, &big_rec, thr, &mtr); + } else { + ut_a(mode == BTR_MODIFY_TREE); + err = btr_cur_pessimistic_insert(0, &cursor, entry, + &insert_rec, &big_rec, thr, &mtr); + } + + if (err == DB_SUCCESS) { + if (ext_vec) { + rec_set_field_extern_bits(insert_rec, + ext_vec, n_ext_vec, &mtr); + } + } } + function_exit: mtr_commit(&mtr); + if (big_rec) { + mtr_start(&mtr); + + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, &mtr); + + err = btr_store_big_rec_extern_fields(index, + btr_cur_get_rec(&cursor), + big_rec, &mtr); + if (modify) { + dtuple_big_rec_free(big_rec); + } else { + dtuple_convert_back_big_rec(index, entry, big_rec); + } + + mtr_commit(&mtr); + } + return(err); } @@ -716,14 +758,17 @@ row_ins_index_entry( DB_DUPLICATE_KEY, or some other error code */ dict_index_t* index, /* in: index */ dtuple_t* entry, /* in: index entry to insert */ + ulint* ext_vec,/* in: array containing field numbers of + externally stored fields in entry, or NULL */ + ulint n_ext_vec,/* in: number of fields in ext_vec */ que_thr_t* thr) /* in: query thread */ { ulint err; /* Try first optimistic descent to the B-tree */ - err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, thr); - + err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, + ext_vec, n_ext_vec, thr); if (err != DB_FAIL) { return(err); @@ -731,8 +776,8 @@ row_ins_index_entry( /* Try then pessimistic descent to the B-tree */ - err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, thr); - + err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, + 
ext_vec, n_ext_vec, thr); return(err); } @@ -784,7 +829,7 @@ row_ins_index_entry_step( ut_ad(dtuple_check_typed(node->entry)); - err = row_ins_index_entry(node->index, node->entry, thr); + err = row_ins_index_entry(node->index, node->entry, NULL, 0, thr); return(err); } diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index 8e1a584f667..9bbc45a5c9a 100644 --- a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -625,7 +625,8 @@ row_update_for_mysql( ut_ad(prebuilt && trx); ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); - + UT_NOT_USED(mysql_rec); + node = prebuilt->upd_node; clust_index = dict_table_get_first_index(table); @@ -777,7 +778,9 @@ row_get_mysql_key_number_for_index( } /************************************************************************* -Does a table creation operation for MySQL. */ +Does a table creation operation for MySQL. If the name of the created +table ends to characters INNODB_MONITOR, then this also starts +printing of monitor output by the master thread. */ int row_create_table_for_mysql( @@ -789,6 +792,8 @@ row_create_table_for_mysql( tab_node_t* node; mem_heap_t* heap; que_thr_t* thr; + ulint namelen; + ulint keywordlen; ulint err; ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); @@ -833,6 +838,20 @@ row_create_table_for_mysql( } trx->error_state = DB_SUCCESS; + } else { + namelen = ut_strlen(table->name); + + keywordlen = ut_strlen("innodb_monitor"); + + if (namelen >= keywordlen + && 0 == ut_memcmp(table->name + namelen - keywordlen, + "innodb_monitor", keywordlen)) { + + /* Table name ends to characters innodb_monitor: + start monitor prints */ + + srv_print_innodb_monitor = TRUE; + } } mutex_exit(&(dict_sys->mutex)); @@ -900,7 +919,9 @@ row_create_index_for_mysql( } /************************************************************************* -Drops a table for MySQL. */ +Drops a table for MySQL. If the name of the dropped table ends to +characters INNODB_MONITOR, then this also stops printing of monitor +output by the master thread. */ int row_drop_table_for_mysql( @@ -918,11 +939,26 @@ row_drop_table_for_mysql( char* str1; char* str2; ulint len; + ulint namelen; + ulint keywordlen; char buf[10000]; ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); ut_a(name != NULL); + namelen = ut_strlen(name); + keywordlen = ut_strlen("innodb_monitor"); + + if (namelen >= keywordlen + && 0 == ut_memcmp(name + namelen - keywordlen, + "innodb_monitor", keywordlen)) { + + /* Table name ends to characters innodb_monitor: + stop monitor prints */ + + srv_print_innodb_monitor = FALSE; + } + /* We use the private SQL parser of Innobase to generate the query graphs needed in deleting the dictionary data from system tables in Innobase. Deleting a row from SYS_INDEXES table also diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c index 0a6fabe584c..ec880d3fe04 100644 --- a/innobase/row/row0purge.c +++ b/innobase/row/row0purge.c @@ -347,20 +347,36 @@ row_purge_del_mark( } /*************************************************************** -Purges an update of an existing record. */ +Purges an update of an existing record. Also purges an update of a delete +marked record if that record contained an externally stored field. 
*/ static void -row_purge_upd_exist( -/*================*/ +row_purge_upd_exist_or_extern( +/*==========================*/ purge_node_t* node, /* in: row purge node */ que_thr_t* thr) /* in: query thread */ { mem_heap_t* heap; dtuple_t* entry; dict_index_t* index; + upd_field_t* ufield; + ibool is_insert; + ulint rseg_id; + ulint page_no; + ulint offset; + ulint internal_offset; + byte* data_field; + ulint data_field_len; + ulint i; + mtr_t mtr; ut_ad(node && thr); + if (node->rec_type == TRX_UNDO_UPD_DEL_REC) { + + goto skip_secondaries; + } + heap = mem_heap_create(1024); while (node->index != NULL) { @@ -378,6 +394,53 @@ row_purge_upd_exist( } mem_heap_free(heap); + +skip_secondaries: + /* Free possible externally stored fields */ + for (i = 0; i < upd_get_n_fields(node->update); i++) { + + ufield = upd_get_nth_field(node->update, i); + + if (ufield->extern_storage) { + /* We use the fact that new_val points to + node->undo_rec and get thus the offset of + dfield data inside the unod record. Then we + can calculate from node->roll_ptr the file + address of the new_val data */ + + internal_offset = ((byte*)ufield->new_val.data) + - node->undo_rec; + + ut_a(internal_offset < UNIV_PAGE_SIZE); + + trx_undo_decode_roll_ptr(node->roll_ptr, + &is_insert, &rseg_id, + &page_no, &offset); + mtr_start(&mtr); + + /* We have to acquire an X-latch to the clustered + index tree */ + + index = dict_table_get_first_index(node->table); + + mtr_x_lock(dict_tree_get_lock(index->tree), &mtr); + + /* We assume in purge of externally stored fields + that the space id of the undo log record is 0! */ + + data_field = buf_page_get(0, page_no, RW_X_LATCH, &mtr) + + offset + internal_offset; + + buf_page_dbg_add_level(buf_frame_align(data_field), + SYNC_TRX_UNDO_PAGE); + + data_field_len = ufield->new_val.len; + + btr_free_externally_stored_field(index, data_field, + data_field_len, &mtr); + mtr_commit(&mtr); + } + } } /*************************************************************** @@ -388,6 +451,9 @@ row_purge_parse_undo_rec( /*=====================*/ /* out: TRUE if purge operation required */ purge_node_t* node, /* in: row undo node */ + ibool* updated_extern, + /* out: TRUE if an externally stored field + was updated */ que_thr_t* thr) /* in: query thread */ { dict_index_t* clust_index; @@ -403,10 +469,10 @@ row_purge_parse_undo_rec( ut_ad(node && thr); ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, - &undo_no, &table_id); + updated_extern, &undo_no, &table_id); node->rec_type = type; - if (type == TRX_UNDO_UPD_DEL_REC) { + if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) { return(FALSE); } @@ -416,7 +482,7 @@ row_purge_parse_undo_rec( node->table = NULL; if (type == TRX_UNDO_UPD_EXIST_REC - && cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + && cmpl_info & UPD_NODE_NO_ORD_CHANGE && !(*updated_extern)) { /* Purge requires no changes to indexes: we may return */ @@ -455,8 +521,11 @@ row_purge_parse_undo_rec( /* Read to the partial row the fields that occur in indexes */ - ptr = trx_undo_rec_get_partial_row(ptr, clust_index, &(node->row), - node->heap); + if (!cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + ptr = trx_undo_rec_get_partial_row(ptr, clust_index, + &(node->row), node->heap); + } + return(TRUE); } @@ -475,6 +544,7 @@ row_purge( { dulint roll_ptr; ibool purge_needed; + ibool updated_extern; ut_ad(node && thr); @@ -494,7 +564,8 @@ row_purge( if (node->undo_rec == &trx_purge_dummy_rec) { purge_needed = FALSE; } else { - purge_needed = row_purge_parse_undo_rec(node, thr); + purge_needed = 
row_purge_parse_undo_rec(node, &updated_extern, + thr); } if (purge_needed) { @@ -503,11 +574,13 @@ row_purge( node->index = dict_table_get_next_index( dict_table_get_first_index(node->table)); - if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { - row_purge_upd_exist(node, thr); - } else { - ut_ad(node->rec_type == TRX_UNDO_DEL_MARK_REC); + if (node->rec_type == TRX_UNDO_DEL_MARK_REC) { row_purge_del_mark(node, thr); + + } else if (updated_extern + || node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + + row_purge_upd_exist_or_extern(node, thr); } if (node->found_clust) { diff --git a/innobase/row/row0row.c b/innobase/row/row0row.c index f85789fa0d6..59169ef2a98 100644 --- a/innobase/row/row0row.c +++ b/innobase/row/row0row.c @@ -146,15 +146,17 @@ row_build_index_entry( /*********************************************************************** An inverse function to dict_row_build_index_entry. Builds a row from a -record in a clustered index. */ +record in a clustered index. NOTE that externally stored (often big) +fields are always copied to heap. */ dtuple_t* row_build( /*======*/ /* out, own: row built; see the NOTE below! */ - ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS: - the former copies also the data fields to - heap as the latter only places pointers to + ulint type, /* in: ROW_COPY_POINTERS, ROW_COPY_DATA, or + ROW_COPY_ALSO_EXTERNALS, + the two last copy also the data fields to + heap as the first only places pointers to data fields on the index page, and thus is more efficient */ dict_index_t* index, /* in: clustered index */ @@ -170,19 +172,19 @@ row_build( { dtuple_t* row; dict_table_t* table; - ulint n_fields; - ulint i; + dict_col_t* col; dfield_t* dfield; + ulint n_fields; byte* field; ulint len; ulint row_len; - dict_col_t* col; byte* buf; + ulint i; ut_ad(index && rec && heap); ut_ad(index->type & DICT_CLUSTERED); - if (type == ROW_COPY_DATA) { + if (type != ROW_COPY_POINTERS) { /* Take a copy of rec to heap */ buf = mem_heap_alloc(heap, rec_get_size(rec)); rec = rec_copy(buf, rec); @@ -207,6 +209,13 @@ row_build( dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); field = rec_get_nth_field(rec, i, &len); + if (type == ROW_COPY_ALSO_EXTERNALS + && rec_get_nth_field_extern_bit(rec, i)) { + + field = btr_rec_copy_externally_stored_field(rec, + i, &len, heap); + } + dfield_set_data(dfield, field, len); } @@ -215,6 +224,7 @@ row_build( return(row); } +#ifdef notdefined /*********************************************************************** An inverse function to dict_row_build_index_entry. Builds a row from a record in a clustered index. */ @@ -229,7 +239,9 @@ row_build_to_tuple( directly into this record, therefore, the buffer page of this record must be at least s-latched and the latch held - as long as the row dtuple is used! */ + as long as the row dtuple is used! + NOTE 2: does not work with externally + stored fields! */ { dict_table_t* table; ulint n_fields; @@ -265,9 +277,11 @@ row_build_to_tuple( ut_ad(dtuple_check_typed(row)); } +#endif /*********************************************************************** -Converts an index record to a typed data tuple. */ +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. 
*/ dtuple_t* row_rec_to_index_entry( diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c index eef60c07af3..b74bd29a89e 100644 --- a/innobase/row/row0sel.c +++ b/innobase/row/row0sel.c @@ -2036,7 +2036,8 @@ row_sel_store_mysql_rec( which was described in prebuilt's template */ { - mysql_row_templ_t* templ; + mysql_row_templ_t* templ; + mem_heap_t* extern_field_heap = NULL; byte* data; ulint len; byte* blob_buf; @@ -2059,6 +2060,24 @@ row_sel_store_mysql_rec( data = rec_get_nth_field(rec, templ->rec_field_no, &len); + if (rec_get_nth_field_extern_bit(rec, templ->rec_field_no)) { + /* Copy an externally stored field to the temporary + heap */ + + if (prebuilt->trx->has_search_latch) { + rw_lock_s_unlock(&btr_search_latch); + prebuilt->trx->has_search_latch = FALSE; + } + + extern_field_heap = mem_heap_create(UNIV_PAGE_SIZE); + + data = btr_rec_copy_externally_stored_field(rec, + templ->rec_field_no, &len, + extern_field_heap); + + ut_a(len != UNIV_SQL_NULL); + } + if (len != UNIV_SQL_NULL) { if (templ->type == DATA_BLOB) { @@ -2081,6 +2100,10 @@ row_sel_store_mysql_rec( mysql_rec + templ->mysql_col_offset, templ->mysql_col_len, data, len, templ->type, templ->is_unsigned); + + if (extern_field_heap) { + mem_heap_free(extern_field_heap); + } } else { mysql_rec[templ->mysql_null_byte_offset] |= (byte) (templ->mysql_null_bit_mask); @@ -2450,6 +2473,7 @@ row_search_for_mysql( ibool unique_search_from_clust_index = FALSE; ibool mtr_has_extra_clust_latch = FALSE; ibool moves_up = FALSE; + ulint cnt = 0; mtr_t mtr; ut_ad(index && pcur && search_tuple); @@ -2457,6 +2481,11 @@ row_search_for_mysql( ut_ad(sync_thread_levels_empty_gen(FALSE)); +/* printf("Match mode %lu\n search tuple ", match_mode); + dtuple_print(search_tuple); + + printf("N tables locked %lu\n", trx->mysql_n_tables_locked); +*/ if (direction == 0) { prebuilt->n_rows_fetched = 0; prebuilt->n_fetch_cached = 0; @@ -2528,6 +2557,8 @@ row_search_for_mysql( mtr_commit(&mtr); + /* printf("%s record not found 1\n", index->name); */ + return(DB_RECORD_NOT_FOUND); } @@ -2537,10 +2568,7 @@ row_search_for_mysql( unique_search_from_clust_index = TRUE; - /* Disable this optimization (hence FALSE below) until - the hang of Peter Zaitsev has been tracked down */ - - if (FALSE && trx->mysql_n_tables_locked == 0 + if (trx->mysql_n_tables_locked == 0 && !prebuilt->sql_stat_start) { /* This is a SELECT query done as a consistent read, @@ -2568,14 +2596,21 @@ row_search_for_mysql( mtr_commit(&mtr); + /* printf("%s shortcut\n", index->name); */ + return(DB_SUCCESS); } else if (shortcut == SEL_EXHAUSTED) { mtr_commit(&mtr); + /* printf("%s record not found 2\n", + index->name); */ return(DB_RECORD_NOT_FOUND); } + + mtr_commit(&mtr); + mtr_start(&mtr); } } @@ -2656,7 +2691,12 @@ rec_loop: cons_read_requires_clust_rec = FALSE; rec = btr_pcur_get_rec(pcur); - +/* + printf("Using index %s cnt %lu ", index->name, cnt); + printf("; Page no %lu\n", + buf_frame_get_page_no(buf_frame_align(rec))); + rec_print(rec); +*/ if (rec == page_get_infimum_rec(buf_frame_align(rec))) { /* The infimum record on a page cannot be in the result set, @@ -2697,12 +2737,15 @@ rec_loop: /* Test if the index record matches completely to search_tuple in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */ + /* printf("Comparing rec and search tuple\n"); */ + if (0 != cmp_dtuple_rec(search_tuple, rec)) { btr_pcur_store_position(pcur, &mtr); ret = DB_RECORD_NOT_FOUND; - + /* printf("%s record not found 3\n", index->name); */ + goto normal_return; } @@ -2713,6 +2756,7 @@ 
rec_loop: btr_pcur_store_position(pcur, &mtr); ret = DB_RECORD_NOT_FOUND; + /* printf("%s record not found 4\n", index->name); */ goto normal_return; } @@ -2881,6 +2925,8 @@ next_rec: moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, moves_up, &mtr); if (moved) { + cnt++; + goto rec_loop; } } @@ -2903,6 +2949,8 @@ next_rec: goto normal_return; } + cnt++; + goto rec_loop; /*-------------------------------------------------------------*/ lock_wait_or_error: @@ -2928,7 +2976,9 @@ lock_wait_or_error: goto rec_loop; } - + + /* printf("Using index %s cnt %lu ret value %lu err\n", index->name, + cnt, err); */ return(err); normal_return: @@ -2942,5 +2992,7 @@ normal_return: ret = DB_SUCCESS; } + /* printf("Using index %s cnt %lu ret value %lu\n", index->name, + cnt, err); */ return(ret); } diff --git a/innobase/row/row0uins.c b/innobase/row/row0uins.c index c9330318ac0..47807877779 100644 --- a/innobase/row/row0uins.c +++ b/innobase/row/row0uins.c @@ -242,11 +242,12 @@ row_undo_ins_parse_undo_rec( dulint table_id; ulint type; ulint dummy; + ibool dummy_extern; ut_ad(node && thr); - ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy, &undo_no, - &table_id); + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy, + &dummy_extern, &undo_no, &table_id); ut_ad(type == TRX_UNDO_INSERT_REC); node->rec_type = type; @@ -284,9 +285,9 @@ row_undo_ins( row_undo_ins_parse_undo_rec(node, thr); if (node->table == NULL) { - found = FALSE; + found = FALSE; } else { - found = row_undo_search_clust_to_pcur(node, thr); + found = row_undo_search_clust_to_pcur(node, thr); } if (!found) { diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c index c8db428bade..0221c51b985 100644 --- a/innobase/row/row0umod.c +++ b/innobase/row/row0umod.c @@ -94,12 +94,12 @@ row_undo_mod_clust_low( mtr_t* mtr, /* in: mtr */ ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ { + big_rec_t* dummy_big_rec; dict_index_t* index; btr_pcur_t* pcur; btr_cur_t* btr_cur; ulint err; ibool success; - ibool do_remove; index = dict_table_get_first_index(node->table); @@ -110,49 +110,80 @@ row_undo_mod_clust_low( ut_ad(success); + if (mode == BTR_MODIFY_LEAF) { + + err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, &dummy_big_rec, node->update, + node->cmpl_info, thr, mtr); + } + + return(err); +} + +/*************************************************************** +Removes a clustered index record after undo if possible. 
*/ +static +ulint +row_undo_mod_remove_clust_low( +/*==========================*/ + /* out: DB_SUCCESS, DB_FAIL, or error code: + we may run out of file space */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr, /* in: mtr */ + ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + ibool success; + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + success = btr_pcur_restore_position(mode, pcur, mtr); + + if (!success) { + + return(DB_SUCCESS); + } + /* Find out if we can remove the whole clustered index record */ if (node->rec_type == TRX_UNDO_UPD_DEL_REC && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) { - do_remove = TRUE; + /* Ok, we can remove */ } else { - do_remove = FALSE; + return(DB_SUCCESS); } if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, mtr); - if (do_remove) { - success = btr_cur_optimistic_delete(btr_cur, mtr); - - if (success) { - err = DB_SUCCESS; - } else { - err = DB_FAIL; - } + if (success) { + err = DB_SUCCESS; } else { - err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG - | BTR_NO_UNDO_LOG_FLAG - | BTR_KEEP_SYS_FLAG, - btr_cur, node->update, - node->cmpl_info, thr, mtr); + err = DB_FAIL; } } else { ut_ad(mode == BTR_MODIFY_TREE); - if (do_remove) { - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, mtr); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, mtr); - /* The delete operation may fail if we have little - file space left: TODO: easiest to crash the database - and restart with more file space */ - } else { - err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG - | BTR_NO_UNDO_LOG_FLAG - | BTR_KEEP_SYS_FLAG, - btr_cur, node->update, - node->cmpl_info, thr, mtr); - } + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ } return(err); @@ -204,10 +235,31 @@ row_undo_mod_clust( err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE); } - node->state = UNDO_NODE_FETCH_NEXT; - btr_pcur_commit_specify_mtr(pcur, &mtr); + if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) { + + mtr_start(&mtr); + + err = row_undo_mod_remove_clust_low(node, thr, &mtr, + BTR_MODIFY_LEAF); + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a + pessimistic descent down the index tree */ + + mtr_start(&mtr); + + err = row_undo_mod_remove_clust_low(node, thr, &mtr, + BTR_MODIFY_TREE); + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + } + + node->state = UNDO_NODE_FETCH_NEXT; + trx_undo_rec_release(node->trx, node->undo_no); if (more_vers && err == DB_SUCCESS) { @@ -388,7 +440,6 @@ row_undo_mod_del_unmark_sec( mem_free(err_buf); } else { - btr_cur = btr_pcur_get_btr_cur(&pcur); err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, @@ -546,11 +597,12 @@ row_undo_mod_parse_undo_rec( ulint info_bits; ulint type; ulint cmpl_info; + ibool dummy_extern; ut_ad(node && thr); ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, - &undo_no, &table_id); + &dummy_extern, &undo_no, &table_id); node->rec_type = type; node->table = dict_table_get_on_id(table_id, thr_get_trx(thr)); @@ -598,10 +650,9 @@ row_undo_mod( row_undo_mod_parse_undo_rec(node, thr); if (node->table == NULL) { - found = FALSE; + found = FALSE; } else { - - found = row_undo_search_clust_to_pcur(node, thr); + found = row_undo_search_clust_to_pcur(node, thr); } if (!found) { 
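
Note on the row0umod.c changes above: they follow the same two-pass shape that row0ins.c uses for inserts — attempt the operation with BTR_MODIFY_LEAF, which latches only the leaf page, and repeat it with BTR_MODIFY_TREE, which may restructure the tree, only when the optimistic pass fails. The sketch below is a minimal, self-contained illustration of that retry shape, not InnoDB code: do_operation() and do_with_retry() are invented for this example, and the DB_* and BTR_MODIFY_* constants are redefined locally only to mirror the names used in the patch.

/*
 * Minimal sketch (assumed stand-ins, not InnoDB code) of the
 * optimistic/pessimistic retry pattern used by row_ins_index_entry()
 * and row_undo_mod_clust(): try the cheap leaf-only pass first and
 * fall back to the tree-restructuring pass only on DB_FAIL.
 */
#include <stdio.h>

enum { DB_SUCCESS = 0, DB_FAIL = 1 };            /* local mirrors of the patch's codes */
enum { BTR_MODIFY_LEAF = 0, BTR_MODIFY_TREE = 1 };

/* Hypothetical operation: here it succeeds only in pessimistic mode,
standing in for an update or delete that overflows the leaf page. */
static int do_operation(int mode)
{
	return((mode == BTR_MODIFY_TREE) ? DB_SUCCESS : DB_FAIL);
}

/* Generic driver: optimistic attempt, pessimistic retry on DB_FAIL. */
static int do_with_retry(void)
{
	int	err;

	err = do_operation(BTR_MODIFY_LEAF);

	if (err != DB_FAIL) {

		return(err);
	}

	/* We may have to modify the tree structure: descend again
	in pessimistic mode */

	return(do_operation(BTR_MODIFY_TREE));
}

int main(void)
{
	printf("result: %d\n", do_with_retry());

	return(0);
}

In the patch this shape appears twice in row_undo_mod_clust(): once when rolling back the update itself via row_undo_mod_clust_low(), and once in the new row_undo_mod_remove_clust_low() pass, which removes a TRX_UNDO_UPD_DEL_REC record only when row_vers_must_preserve_del_marked() reports that its old version no longer needs to be kept.
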
diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c index 10ac3af6de9..5119254f405 100644 --- a/innobase/row/row0undo.c +++ b/innobase/row/row0undo.c @@ -124,6 +124,8 @@ row_undo_node_create( undo->state = UNDO_NODE_FETCH_NEXT; undo->trx = trx; + btr_pcur_init(&(undo->pcur)); + undo->heap = mem_heap_create(256); return(undo); @@ -303,6 +305,16 @@ row_undo_step( if (err != DB_SUCCESS) { /* SQL error detected */ + fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", err); + + if (err == DB_OUT_OF_FILE_SPACE) { + fprintf(stderr, + "InnoDB: Error 13 means out of tablespace.\n" + "InnoDB: Consider increasing your tablespace.\n"); + + exit(1); + } + ut_a(0); return(NULL); diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c index 5bca2a24c01..67a5925a3f5 100644 --- a/innobase/row/row0upd.c +++ b/innobase/row/row0upd.c @@ -90,8 +90,10 @@ upd_node_create( node->in_mysql_interface = FALSE; node->row = NULL; + node->ext_vec = NULL; node->index = NULL; - + node->update = NULL; + node->select = NULL; node->heap = mem_heap_create(128); @@ -160,7 +162,8 @@ row_upd_index_entry_sys_field( } /*************************************************************** -Returns TRUE if row update changes size of some field in index. */ +Returns TRUE if row update changes size of some field in index +or if some field to be updated is stored externally in rec or update. */ ibool row_upd_changes_field_size( @@ -199,6 +202,16 @@ row_upd_changes_field_size( return(TRUE); } + + if (rec_get_nth_field_extern_bit(rec, upd_field->field_no)) { + + return(TRUE); + } + + if (upd_field->extern_storage) { + + return(TRUE); + } } return(FALSE); @@ -441,6 +454,34 @@ row_upd_index_parse( return(ptr); } + +/******************************************************************* +Returns TRUE if ext_vec contains i. 
*/ +UNIV_INLINE +ibool +upd_ext_vec_contains( +/*=================*/ + /* out: TRUE if i is in ext_vec */ + ulint* ext_vec, /* in: array of indexes or NULL */ + ulint n_ext_vec, /* in: number of numbers in ext_vec */ + ulint i) /* in: a number */ +{ + ulint j; + + if (ext_vec == NULL) { + + return(FALSE); + } + + for (j = 0; j < n_ext_vec; j++) { + if (ext_vec[j] == i) { + + return(TRUE); + } + } + + return(FALSE); +} /******************************************************************* Builds an update vector from those fields, excluding the roll ptr and @@ -454,6 +495,9 @@ row_upd_build_difference( fields, excluding roll ptr and trx id */ dict_index_t* index, /* in: clustered index */ dtuple_t* entry, /* in: entry to insert */ + ulint* ext_vec,/* in: array containing field numbers of + externally stored fields in entry, or NULL */ + ulint n_ext_vec,/* in: number of fields in ext_vec */ rec_t* rec, /* in: clustered index record */ mem_heap_t* heap) /* in: memory heap from which allocated */ { @@ -480,16 +524,25 @@ row_upd_build_difference( for (i = 0; i < dtuple_get_n_fields(entry); i++) { data = rec_get_nth_field(rec, i, &len); + dfield = dtuple_get_nth_field(entry, i); - if ((i != trx_id_pos) && (i != roll_ptr_pos) - && !dfield_data_is_equal(dfield, len, data)) { + if ((rec_get_nth_field_extern_bit(rec, i) + != upd_ext_vec_contains(ext_vec, n_ext_vec, i)) + || ((i != trx_id_pos) && (i != roll_ptr_pos) + && !dfield_data_is_equal(dfield, len, data))) { upd_field = upd_get_nth_field(update, n_diff); dfield_copy(&(upd_field->new_val), dfield); upd_field_set_field_no(upd_field, i, index); + + if (upd_ext_vec_contains(ext_vec, n_ext_vec, i)) { + upd_field->extern_storage = TRUE; + } else { + upd_field->extern_storage = FALSE; + } n_diff++; } @@ -630,9 +683,7 @@ row_upd_changes_ord_field( } /*************************************************************** -Checks if an update vector changes an ordering field of an index record. -This function is fast if the update vector is short or the number of ordering -fields in the index is small. Otherwise, this can be quadratic. */ +Checks if an update vector changes an ordering field of an index record. */ ibool row_upd_changes_some_index_ord_field( @@ -642,19 +693,24 @@ row_upd_changes_some_index_ord_field( dict_table_t* table, /* in: table */ upd_t* update) /* in: update vector for the row */ { + upd_field_t* upd_field; dict_index_t* index; - + ulint i; + index = dict_table_get_first_index(table); - while (index) { - if (row_upd_changes_ord_field(NULL, index, update)) { + for (i = 0; i < upd_get_n_fields(update); i++) { - return(TRUE); - } + upd_field = upd_get_nth_field(update, i); - index = dict_table_get_next_index(index); - } + if (dict_field_get_col(dict_index_get_nth_field(index, + upd_field->field_no)) + ->ord_part) { + return(TRUE); + } + } + return(FALSE); } @@ -710,15 +766,17 @@ row_upd_eval_new_vals( /*************************************************************** Stores to the heap the row on which the node->pcur is positioned. 
*/ -UNIV_INLINE +static void row_upd_store_row( /*==============*/ upd_node_t* node) /* in: row update node */ { dict_index_t* clust_index; + upd_t* update; + rec_t* rec; - ut_ad((node->pcur)->latch_mode != BTR_NO_LATCHES); + ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES); if (node->row != NULL) { mem_heap_empty(node->heap); @@ -727,8 +785,20 @@ row_upd_store_row( clust_index = dict_table_get_first_index(node->table); - node->row = row_build(ROW_COPY_DATA, clust_index, - btr_pcur_get_rec(node->pcur), node->heap); + rec = btr_pcur_get_rec(node->pcur); + + node->row = row_build(ROW_COPY_DATA, clust_index, rec, node->heap); + + node->ext_vec = mem_heap_alloc(node->heap, sizeof(ulint) + * rec_get_n_fields(rec)); + if (node->is_delete) { + update = NULL; + } else { + update = node->update; + } + + node->n_ext_vec = btr_push_update_extern_fields(node->ext_vec, + rec, update); } /*************************************************************** @@ -812,7 +882,7 @@ row_upd_sec_index_entry( row_upd_index_replace_new_col_vals(entry, index, node->update); /* Insert new index entry */ - err = row_ins_index_entry(index, entry, thr); + err = row_ins_index_entry(index, entry, NULL, 0, thr); mem_heap_free(heap); @@ -870,6 +940,8 @@ row_upd_clust_rec_by_insert( dict_table_t* table; mem_heap_t* heap; dtuple_t* entry; + ulint* ext_vec; + ulint n_ext_vec; ulint err; ut_ad(node); @@ -897,14 +969,18 @@ row_upd_clust_rec_by_insert( heap = mem_heap_create(1024); + ext_vec = mem_heap_alloc(heap, + sizeof(ulint) * dtuple_get_n_fields(node->row)); + n_ext_vec = 0; + entry = row_build_index_entry(node->row, index, heap); row_upd_clust_index_replace_new_col_vals(entry, node->update); - + row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id); - err = row_ins_index_entry(index, entry, thr); - + err = row_ins_index_entry(index, entry, node->ext_vec, + node->n_ext_vec, thr); mem_heap_free(heap); return(err); @@ -924,6 +1000,7 @@ row_upd_clust_rec( que_thr_t* thr, /* in: query thread */ mtr_t* mtr) /* in: mtr; gets committed here */ { + big_rec_t* big_rec = NULL; btr_pcur_t* pcur; btr_cur_t* btr_cur; ulint err; @@ -973,9 +1050,24 @@ row_upd_clust_rec( ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, - node->update, node->cmpl_info, thr, mtr); + &big_rec, node->update, + node->cmpl_info, thr, mtr); mtr_commit(mtr); + if (err == DB_SUCCESS && big_rec) { + mtr_start(mtr); + ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); + + err = btr_store_big_rec_extern_fields(index, + btr_cur_get_rec(btr_cur), + big_rec, mtr); + mtr_commit(mtr); + } + + if (big_rec) { + dtuple_big_rec_free(big_rec); + } + return(err); } @@ -1194,10 +1286,12 @@ row_upd( ut_ad(node && thr); if (node->in_mysql_interface) { + /* We do not get the cmpl_info value from the MySQL interpreter: we must calculate it on the fly: */ - if (row_upd_changes_some_index_ord_field(node->table, + if (node->is_delete || + row_upd_changes_some_index_ord_field(node->table, node->update)) { node->cmpl_info = 0; } else { @@ -1239,6 +1333,7 @@ function_exit: if (node->row != NULL) { mem_heap_empty(node->heap); node->row = NULL; + node->n_ext_vec = 0; } node->state = UPD_NODE_UPDATE_CLUSTERED; diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index 028fae010d5..8dd9c9f3feb 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -93,6 +93,8 @@ ulint srv_lock_wait_timeout = 1024 * 1024 * 1024; char* srv_unix_file_flush_method_str = NULL; ulint 
srv_unix_file_flush_method = 0; +ibool srv_use_doublewrite_buf = TRUE; + ibool srv_set_thread_priorities = TRUE; int srv_query_thread_priority = 0; /*-------------------------------------------*/ @@ -109,6 +111,8 @@ ibool srv_print_buf_io = FALSE; ibool srv_print_log_io = FALSE; ibool srv_print_latch_waits = FALSE; +ibool srv_print_innodb_monitor = FALSE; + /* The parameters below are obsolete: */ ibool srv_print_parsed_sql = FALSE; @@ -1492,7 +1496,6 @@ srv_init(void) slot = srv_mysql_table + i; slot->in_use = FALSE; slot->event = os_event_create(NULL); - slot->suspended = FALSE; ut_a(slot->event); } @@ -1661,7 +1664,6 @@ srv_suspend_mysql_thread( slot->thr = thr; os_event_reset(event); - slot->suspended = TRUE; slot->suspend_time = ut_time(); @@ -1693,27 +1695,6 @@ srv_suspend_mysql_thread( return(FALSE); } -os_event_t -srv_mysql_thread_event_get(void) -{ - srv_slot_t* slot; - os_event_t event; - - mutex_enter(&kernel_mutex); - - slot = srv_table_reserve_slot_for_mysql(); - - event = slot->event; - - os_event_reset(event); - - slot->suspended = TRUE; - - mutex_exit(&kernel_mutex); - - return(event); -} - /************************************************************************ Releases a MySQL OS thread waiting for a lock to be released, if the thread is already suspended. */ @@ -1737,7 +1718,6 @@ srv_release_mysql_thread_if_suspended( /* Found */ os_event_set(slot->event); - slot->suspended = FALSE; return; } @@ -1746,59 +1726,6 @@ srv_release_mysql_thread_if_suspended( /* not found */ } -void -srv_mysql_thread_release(void) -/*==========================*/ -{ - srv_slot_t* slot; - ulint i; - - mutex_enter(&kernel_mutex); - - for (i = 0; i < OS_THREAD_MAX_N; i++) { - - slot = srv_mysql_table + i; - - if (slot->in_use && slot->suspended) { - /* Found */ - slot->suspended = FALSE; - mutex_exit(&kernel_mutex); - - os_event_set(slot->event); - - return; - } - } - - ut_a(0); -} - -void -srv_mysql_thread_slot_free( -/*==========================*/ - os_event_t event) -{ - srv_slot_t* slot; - ulint i; - - mutex_enter(&kernel_mutex); - - for (i = 0; i < OS_THREAD_MAX_N; i++) { - - slot = srv_mysql_table + i; - - if (slot->in_use && slot->event == event) { - /* Found */ - slot->in_use = FALSE; - mutex_exit(&kernel_mutex); - - return; - } - } - - ut_a(0); -} - /************************************************************************* A thread which wakes up threads whose lock wait may have lasted too long. */ @@ -1924,6 +1851,7 @@ srv_master_thread( ulint i; time_t last_flush_time; time_t current_time; + time_t last_monitor_time; UT_NOT_USED(arg); @@ -1936,6 +1864,8 @@ srv_master_thread( mutex_exit(&kernel_mutex); os_event_set(srv_sys->operational); + + last_monitor_time = time(NULL); loop: mutex_enter(&kernel_mutex); @@ -1975,8 +1905,18 @@ loop: while (n_pages_purged) { /* TODO: replace this by a check if we are running out of file space! 
*/ + if (srv_print_innodb_monitor) { + ut_print_timestamp(stdout); + printf(" InnoDB starts purge\n"); + } + n_pages_purged = trx_purge(); + if (srv_print_innodb_monitor) { + ut_print_timestamp(stdout); + printf(" InnoDB purged %lu pages\n", n_pages_purged); + } + current_time = time(NULL); if (difftime(current_time, last_flush_time) > 1) { @@ -1986,14 +1926,40 @@ loop: } background_loop: - /* - sync_array_print_info(sync_primary_wait_array); - os_aio_print(); - buf_print_io(); - */ /* In this loop we run background operations while the server is quiet */ + current_time = time(NULL); + + if (srv_print_innodb_monitor + && difftime(current_time, last_monitor_time) > 8) { + + printf("================================\n"); + last_monitor_time = time(NULL); + ut_print_timestamp(stdout); + + printf(" INNODB MONITOR OUTPUT\n" + "================================\n"); + printf("--------------------------\n" + "LOCKS HELD BY TRANSACTIONS\n" + "--------------------------\n"); + lock_print_info(); + printf("-----------------------------------------------\n" + "CURRENT SEMAPHORES RESERVED AND SEMAPHORE WAITS\n" + "-----------------------------------------------\n"); + sync_print(); + printf("CURRENT PENDING FILE I/O'S\n" + "--------------------------\n"); + os_aio_print(); + printf("-----------\n" + "BUFFER POOL\n" + "-----------\n"); + buf_print_io(); + printf("----------------------------\n" + "END OF INNODB MONITOR OUTPUT\n" + "============================\n"); + } + mutex_enter(&kernel_mutex); if (srv_activity_count != old_activity_count) { mutex_exit(&kernel_mutex); @@ -2005,8 +1971,18 @@ background_loop: /* The server has been quiet for a while: start running background operations */ + if (srv_print_innodb_monitor) { + ut_print_timestamp(stdout); + printf(" InnoDB starts purge\n"); + } + n_pages_purged = trx_purge(); + if (srv_print_innodb_monitor) { + ut_print_timestamp(stdout); + printf(" InnoDB purged %lu pages\n", n_pages_purged); + } + mutex_enter(&kernel_mutex); if (srv_activity_count != old_activity_count) { mutex_exit(&kernel_mutex); @@ -2014,8 +1990,18 @@ background_loop: } mutex_exit(&kernel_mutex); + if (srv_print_innodb_monitor) { + ut_print_timestamp(stdout); + printf(" InnoDB starts insert buffer merge\n"); + } + n_bytes_merged = ibuf_contract(TRUE); + if (srv_print_innodb_monitor) { + ut_print_timestamp(stdout); + printf(" InnoDB merged %lu bytes\n", n_bytes_merged); + } + mutex_enter(&kernel_mutex); if (srv_activity_count != old_activity_count) { mutex_exit(&kernel_mutex); @@ -2023,7 +2009,7 @@ background_loop: } mutex_exit(&kernel_mutex); - n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 20, ut_dulint_max); + n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); mutex_enter(&kernel_mutex); if (srv_activity_count != old_activity_count) { @@ -2052,14 +2038,12 @@ background_loop: /* mem_print_new_info(); */ - -/* fsp_print(0); */ - -/* fprintf(stderr, "Validating tablespace\n"); +/* + fsp_print(0); + fprintf(stderr, "Validating tablespace\n"); fsp_validate(0); fprintf(stderr, "Validation ok\n"); */ - #ifdef UNIV_SEARCH_PERF_STAT /* btr_search_print_info(); */ #endif diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index a343f2115e7..a79a808ba2e 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -1,7 +1,7 @@ /************************************************************************ Starts the InnoDB database server -(c) 1996-2000 InnoDB Oy +(c) 1996-2000 Innobase Oy Created 2/16/1996 Heikki Tuuri 
*************************************************************************/ @@ -203,8 +203,8 @@ open_or_create_log_file( sprintf(name, "%s%s%lu", srv_log_group_home_dirs[k], "ib_logfile", i); - files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL, &ret); - + files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL, + OS_LOG_FILE, &ret); if (ret == FALSE) { if (os_file_get_last_error() != OS_FILE_ALREADY_EXISTS) { fprintf(stderr, @@ -214,7 +214,8 @@ open_or_create_log_file( } files[i] = os_file_create( - name, OS_FILE_OPEN, OS_FILE_AIO, &ret); + name, OS_FILE_OPEN, OS_FILE_AIO, + OS_LOG_FILE, &ret); if (!ret) { fprintf(stderr, "InnoDB: Error in opening %s\n", name); @@ -239,7 +240,7 @@ open_or_create_log_file( fprintf(stderr, "InnoDB: Log file %s did not exist: new to be created\n", name); - printf("InnoDB: Setting log file %s size to %lu\n", + fprintf(stderr, "InnoDB: Setting log file %s size to %lu\n", name, UNIV_PAGE_SIZE * srv_log_file_size); ret = os_file_set_size(name, files[i], @@ -330,27 +331,28 @@ open_or_create_data_files( sprintf(name, "%s%s", srv_data_home, srv_data_file_names[i]); - if (srv_data_file_is_raw_partition[i] == 0) { - - files[i] = os_file_create(name, OS_FILE_CREATE, - OS_FILE_NORMAL, &ret); - } else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { - ret = FALSE; - } else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) { + files[i] = os_file_create(name, OS_FILE_CREATE, + OS_FILE_NORMAL, OS_DATA_FILE, &ret); - files[i] = os_file_create( - name, OS_FILE_OPEN, OS_FILE_NORMAL, &ret); + if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) { + /* The partition is opened, not created; then it is + written over */ - if (!ret) { + files[i] = os_file_create( + name, OS_FILE_OPEN, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + if (!ret) { fprintf(stderr, "InnoDB: Error in opening %s\n", name); return(DB_ERROR); - } + } + } else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + ret = FALSE; } if (ret == FALSE) { - if (srv_data_file_is_raw_partition[i] == 0 + if (srv_data_file_is_raw_partition[i] != SRV_OLD_RAW && os_file_get_last_error() != OS_FILE_ALREADY_EXISTS) { fprintf(stderr, @@ -370,8 +372,8 @@ open_or_create_data_files( } files[i] = os_file_create( - name, OS_FILE_OPEN, OS_FILE_NORMAL, &ret); - + name, OS_FILE_OPEN, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); if (!ret) { fprintf(stderr, "InnoDB: Error in opening %s\n", name); @@ -379,18 +381,21 @@ open_or_create_data_files( return(DB_ERROR); } - ret = os_file_get_size(files[i], &size, &size_high); - ut_a(ret); + if (srv_data_file_is_raw_partition[i] != SRV_OLD_RAW) { + + ret = os_file_get_size(files[i], &size, + &size_high); + ut_a(ret); - if (srv_data_file_is_raw_partition[i] == 0 - && (size != UNIV_PAGE_SIZE * srv_data_file_sizes[i] - || size_high != 0)) { - - fprintf(stderr, + if (size != + UNIV_PAGE_SIZE * srv_data_file_sizes[i] + || size_high != 0) { + fprintf(stderr, "InnoDB: Error: data file %s is of different size\n" "InnoDB: than specified in the .cnf file!\n", name); - return(DB_ERROR); + return(DB_ERROR); + } } fil_read_flushed_lsn_and_arch_log_no(files[i], @@ -403,7 +408,8 @@ open_or_create_data_files( if (i > 0) { fprintf(stderr, - "InnoDB: Data file %s did not exist: new to be created\n", name); + "InnoDB: Data file %s did not exist: new to be created\n", + name); } else { fprintf(stderr, "InnoDB: The first specified data file %s did not exist:\n" @@ -411,10 +417,10 @@ open_or_create_data_files( *create_new_db = TRUE; } - printf("InnoDB: Setting file %s size to %lu\n", + 
fprintf(stderr, "InnoDB: Setting file %s size to %lu\n", name, UNIV_PAGE_SIZE * srv_data_file_sizes[i]); - printf( + fprintf(stderr, "InnoDB: Database physically writes the file full: wait...\n"); ret = os_file_set_size(name, files[i], @@ -555,19 +561,22 @@ innobase_start_or_create_for_mysql(void) srv_startup_is_before_trx_rollback_phase = TRUE; if (0 == ut_strcmp(srv_unix_file_flush_method_str, "fdatasync")) { - srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; + srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; + } else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "O_DSYNC")) { - srv_unix_file_flush_method = SRV_UNIX_O_DSYNC; + srv_unix_file_flush_method = SRV_UNIX_O_DSYNC; + } else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "littlesync")) { - srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC; + srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC; + } else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "nosync")) { - srv_unix_file_flush_method = SRV_UNIX_NOSYNC; + srv_unix_file_flush_method = SRV_UNIX_NOSYNC; } else { - fprintf(stderr, - "InnoDB: Unrecognized value for innodb_unix_file_flush_method\n"); - - return(DB_ERROR); + fprintf(stderr, + "InnoDB: Unrecognized value %s for innodb_flush_method\n", + srv_unix_file_flush_method_str); + return(DB_ERROR); } /* @@ -593,14 +602,15 @@ innobase_start_or_create_for_mysql(void) #ifdef __WIN__ if (os_get_os_version() == OS_WIN95 || os_get_os_version() == OS_WIN31) { - /* On Win 95, 98, ME, and Win32 subsystem for Windows 3.1 use - simulated aio */ - os_aio_use_native_aio = FALSE; - srv_n_file_io_threads = 4; + /* On Win 95, 98, ME, and Win32 subsystem for Windows 3.1 use + simulated aio */ + + os_aio_use_native_aio = FALSE; + srv_n_file_io_threads = 4; } else { - /* On NT and Win 2000 always use aio */ - os_aio_use_native_aio = TRUE; + /* On NT and Win 2000 always use aio */ + os_aio_use_native_aio = TRUE; } #endif if (!os_aio_use_native_aio) { @@ -652,14 +662,21 @@ innobase_start_or_create_for_mysql(void) sum_of_new_sizes = 0; for (i = 0; i < srv_n_data_files; i++) { - sum_of_new_sizes += srv_data_file_sizes[i]; + if (srv_data_file_sizes[i] >= 262144) { + fprintf(stderr, + "InnoDB: Error: file size must be < 4 GB, or on some OS's < 2 GB\n"); + + return(DB_ERROR); + } + + sum_of_new_sizes += srv_data_file_sizes[i]; } if (sum_of_new_sizes < 640) { - fprintf(stderr, + fprintf(stderr, "InnoDB: Error: tablespace size must be at least 10 MB\n"); - return(DB_ERROR); + return(DB_ERROR); } err = open_or_create_data_files(&create_new_db, @@ -673,6 +690,15 @@ innobase_start_or_create_for_mysql(void) return((int) err); } + if (!create_new_db) { + /* If we are using the doublewrite method, we will + check if there are half-written pages in data files, + and restore them from the doublewrite buffer if + possible */ + + trx_sys_doublewrite_restore_corrupt_pages(); + } + srv_normalize_path_for_win(srv_arch_dir); srv_arch_dir = srv_add_path_separator_if_needed(srv_arch_dir); @@ -742,7 +768,6 @@ innobase_start_or_create_for_mysql(void) mutex_exit(&(log_sys->mutex)); } - /* mutex_create(&row_mysql_thread_mutex); */ sess_sys_init_at_db_start(); if (create_new_db) { @@ -834,7 +859,7 @@ innobase_start_or_create_for_mysql(void) } if (srv_measure_contention) { - /* os_thread_create(&test_measure_cont, NULL, thread_ids + + /* os_thread_create(&test_measure_cont, NULL, thread_ids + SRV_MAX_N_IO_THREADS); */ } @@ -849,16 +874,20 @@ innobase_start_or_create_for_mysql(void) /* Create the thread which watches the timeouts for lock waits */ 
os_thread_create(&srv_lock_timeout_monitor_thread, NULL, thread_ids + 2 + SRV_MAX_N_IO_THREADS); - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Started\n"); - srv_was_started = TRUE; srv_is_being_started = FALSE; sync_order_checks_on = TRUE; + if (srv_use_doublewrite_buf && trx_doublewrite == NULL) { + trx_sys_create_doublewrite_buf(); + } + /* buf_debug_prints = TRUE; */ + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Started\n"); + return((int) DB_SUCCESS); } diff --git a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c index a77cc76ed37..dc49ce2197e 100644 --- a/innobase/sync/sync0rw.c +++ b/innobase/sync/sync0rw.c @@ -810,11 +810,10 @@ rw_lock_print( ulint count = 0; rw_lock_debug_t* info; - printf("----------------------------------------------\n"); + printf("-------------------------------------------------\n"); printf("RW-LOCK INFO\n"); printf("RW-LOCK: %lx ", (ulint)lock); - mutex_enter(&(lock->mutex)); if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) || (rw_lock_get_reader_count(lock) != 0) || (rw_lock_get_waiters(lock) != 0)) { @@ -831,8 +830,6 @@ rw_lock_print( info = UT_LIST_GET_NEXT(list, info); } } - - mutex_exit(&(lock->mutex)); #endif } diff --git a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c index c3a1ac3b47f..a125f65be41 100644 --- a/innobase/sync/sync0sync.c +++ b/innobase/sync/sync0sync.c @@ -158,7 +158,7 @@ struct sync_thread_struct{ }; /* Number of slots reserved for each OS thread in the sync level array */ -#define SYNC_THREAD_N_LEVELS 256 +#define SYNC_THREAD_N_LEVELS 10000 struct sync_level_struct{ void* latch; /* pointer to a mutex or an rw-lock; NULL means that @@ -768,6 +768,9 @@ sync_thread_levels_g( thread */ ulint limit) /* in: level limit */ { + char* file_name; + ulint line; + ulint thread_id; sync_level_t* slot; rw_lock_t* lock; mutex_t* mutex; @@ -783,8 +786,29 @@ sync_thread_levels_g( lock = slot->latch; mutex = slot->latch; - ut_error; - + printf( + "InnoDB error: sync levels should be > %lu but a level is %lu\n", + limit, slot->level); + + if (mutex->magic_n == MUTEX_MAGIC_N) { + printf("Mutex created at %s %lu\n", &(mutex->cfile_name), + mutex->cline); + + if (mutex_get_lock_word(mutex) != 0) { + + mutex_get_debug_info(mutex, + &file_name, &line, &thread_id); + + printf("InnoDB: Locked mutex: addr %lx thread %ld file %s line %ld\n", + (ulint)mutex, thread_id, + file_name, line); + } else { + printf("Not locked\n"); + } + } else { + rw_lock_print(lock); + } + return(FALSE); } } @@ -973,6 +997,8 @@ sync_thread_add_level( ut_a(sync_thread_levels_g(array, SYNC_ANY_LATCH)); } else if (level == SYNC_TRX_SYS_HEADER) { ut_a(sync_thread_levels_contain(array, SYNC_KERNEL)); + } else if (level == SYNC_DOUBLEWRITE) { + ut_a(sync_thread_levels_g(array, SYNC_DOUBLEWRITE)); } else if (level == SYNC_BUF_BLOCK) { ut_a((sync_thread_levels_contain(array, SYNC_BUF_POOL) && sync_thread_levels_g(array, SYNC_BUF_BLOCK - 1)) @@ -1000,6 +1026,8 @@ sync_thread_add_level( } else if (level == SYNC_FSP) { ut_a(sync_thread_levels_contain(array, SYNC_FSP) || sync_thread_levels_g(array, SYNC_FSP)); + } else if (level == SYNC_EXTERN_STORAGE) { + ut_a(TRUE); } else if (level == SYNC_TRX_UNDO_PAGE) { ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO) || sync_thread_levels_contain(array, SYNC_RSEG) @@ -1221,10 +1249,10 @@ void sync_print(void) /*============*/ { - printf("SYNC INFO:------------------------------------------\n"); + printf("SYNC INFO:\n"); mutex_list_print_info(); rw_lock_list_print_info(); 
sync_array_print_info(sync_primary_wait_array); sync_print_wait_info(); - printf("----------------------------------------------------\n"); + printf("-----------------------------------------------------\n"); } diff --git a/innobase/trx/trx0purge.c b/innobase/trx/trx0purge.c index f65943f27e3..afb83926fa3 100644 --- a/innobase/trx/trx0purge.c +++ b/innobase/trx/trx0purge.c @@ -678,6 +678,8 @@ trx_purge_choose_next_log(void) rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + min_trx_no = ut_dulint_max; + min_rseg = NULL; while (rseg) { @@ -692,6 +694,9 @@ trx_purge_choose_next_log(void) min_rseg = rseg; min_trx_no = rseg->last_trx_no; space = rseg->space; + ut_a(space == 0); /* We assume in purge of + externally stored fields + that space id == 0 */ page_no = rseg->last_page_no; offset = rseg->last_offset; } @@ -820,6 +825,10 @@ trx_purge_get_next_rec( } cmpl_info = trx_undo_rec_get_cmpl_info(rec2); + + if (trx_undo_rec_get_extern_storage(rec2)) { + break; + } if ((type == TRX_UNDO_UPD_EXIST_REC) && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { diff --git a/innobase/trx/trx0rec.c b/innobase/trx/trx0rec.c index c31d786011d..64febb8f523 100644 --- a/innobase/trx/trx0rec.c +++ b/innobase/trx/trx0rec.c @@ -292,6 +292,8 @@ trx_undo_rec_get_pars( TRX_UNDO_INSERT_REC, ... */ ulint* cmpl_info, /* out: compiler info, relevant only for update type records */ + ibool* updated_extern, /* out: TRUE if we updated an + externally stored fild */ dulint* undo_no, /* out: undo log record number */ dulint* table_id) /* out: table id */ { @@ -303,7 +305,14 @@ trx_undo_rec_get_pars( type_cmpl = mach_read_from_1(ptr); ptr++; - + + if (type_cmpl & TRX_UNDO_UPD_EXTERN) { + *updated_extern = TRUE; + type_cmpl -= TRX_UNDO_UPD_EXTERN; + } else { + *updated_extern = FALSE; + } + *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1); *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT; @@ -336,7 +345,11 @@ trx_undo_rec_get_col_val( *field = ptr; if (*len != UNIV_SQL_NULL) { - ptr += *len; + if (*len >= UNIV_EXTERN_STORAGE_FIELD) { + ptr += (*len - UNIV_EXTERN_STORAGE_FIELD); + } else { + ptr += *len; + } } return(ptr); @@ -452,6 +465,7 @@ trx_undo_page_report_modify( ulint col_no; byte* old_ptr; ulint type_cmpl; + byte* type_cmpl_ptr; ulint i; ut_ad(index->type & DICT_CLUSTERED); @@ -491,6 +505,8 @@ trx_undo_page_report_modify( mach_write_to_1(ptr, type_cmpl); + type_cmpl_ptr = ptr; + ptr++; len = mach_dulint_write_much_compressed(ptr, trx->undo_no); ptr += len; @@ -577,7 +593,23 @@ trx_undo_page_report_modify( return(0); } - len = mach_write_compressed(ptr, flen); + if (rec_get_nth_field_extern_bit(rec, pos)) { + /* If a field has external storage, we add to + flen the flag */ + + len = mach_write_compressed(ptr, + UNIV_EXTERN_STORAGE_FIELD + flen); + + /* Notify purge that it eventually has to free the old + externally stored field */ + + (trx->update_undo)->del_marks = TRUE; + + *type_cmpl_ptr = *type_cmpl_ptr | TRX_UNDO_UPD_EXTERN; + } else { + len = mach_write_compressed(ptr, flen); + } + ptr += len; if (flen != UNIV_SQL_NULL) { @@ -825,6 +857,13 @@ trx_undo_update_rec_get_update( upd_field_set_field_no(upd_field, field_no, index); + if (len != UNIV_SQL_NULL && len >= UNIV_EXTERN_STORAGE_FIELD) { + + upd_field->extern_storage = TRUE; + + len -= UNIV_EXTERN_STORAGE_FIELD; + } + dfield_set_data(&(upd_field->new_val), field, len); } @@ -1222,8 +1261,10 @@ trx_undo_prev_version_build( byte* ptr; ulint info_bits; ulint cmpl_info; + ibool dummy_extern; byte* buf; ulint err; + ulint i; ut_ad(rw_lock_own(&(purge_sys->latch), 
RW_LOCK_SHARED)); ut_ad(mtr_memo_contains(index_mtr, buf_block_align(index_rec), @@ -1252,8 +1293,9 @@ trx_undo_prev_version_build( return(err); } - ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, &undo_no, - &table_id); + ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, &info_bits); ptr = trx_undo_rec_skip_row_ref(ptr, index); @@ -1278,5 +1320,15 @@ trx_undo_prev_version_build( row_upd_rec_in_place(*old_vers, update); } + for (i = 0; i < upd_get_n_fields(update); i++) { + + if (upd_get_nth_field(update, i)->extern_storage) { + + rec_set_nth_field_extern_bit(*old_vers, + upd_get_nth_field(update, i)->field_no, + TRUE, NULL); + } + } + return(DB_SUCCESS); } diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c index 99ec5b50237..b056975d28a 100644 --- a/innobase/trx/trx0sys.c +++ b/innobase/trx/trx0sys.c @@ -19,9 +19,326 @@ Created 3/26/1996 Heikki Tuuri #include "trx0undo.h" #include "srv0srv.h" #include "trx0purge.h" +#include "log0log.h" /* The transaction system */ -trx_sys_t* trx_sys = NULL; +trx_sys_t* trx_sys = NULL; +trx_doublewrite_t* trx_doublewrite = NULL; + +/******************************************************************** +Creates or initialializes the doublewrite buffer at a database start. */ +static +void +trx_doublewrite_init( +/*=================*/ + byte* doublewrite) /* in: pointer to the doublewrite buf + header on trx sys page */ +{ + trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t)); + + mutex_create(&(trx_doublewrite->mutex)); + mutex_set_level(&(trx_doublewrite->mutex), SYNC_DOUBLEWRITE); + + trx_doublewrite->first_free = 0; + + trx_doublewrite->block1 = mach_read_from_4( + doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK1); + trx_doublewrite->block2 = mach_read_from_4( + doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK2); + trx_doublewrite->write_buf_unaligned = + ut_malloc( + (1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) + * UNIV_PAGE_SIZE); + + trx_doublewrite->write_buf = ut_align( + trx_doublewrite->write_buf_unaligned, + UNIV_PAGE_SIZE); + trx_doublewrite->buf_block_arr = mem_alloc( + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + * sizeof(void*)); +} + +/******************************************************************** +Creates the doublewrite buffer at a database start. The header of the +doublewrite buffer is placed on the trx system header page. 
*/ + +void +trx_sys_create_doublewrite_buf(void) +/*================================*/ +{ + page_t* page; + page_t* page2; + page_t* new_page; + byte* doublewrite; + byte* fseg_header; + ulint page_no; + ulint prev_page_no; + ulint i; + mtr_t mtr; + + if (trx_doublewrite) { + /* Already inited */ + + return; + } + +start_again: + mtr_start(&mtr); + + page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); + buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK); + + doublewrite = page + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + + /* The doublewrite buffer has already been created: + just read in some numbers */ + + trx_doublewrite_init(doublewrite); + + mtr_commit(&mtr); + } else { + fprintf(stderr, + "InnoDB: Doublewrite buffer not found: creating new\n"); + + if (buf_pool_get_curr_size() < + (2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2 + 100) + * UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite buffer: you must\n" + "InnoDB: increase your buffer pool size.\n" + "InnoDB: Cannot continue operation.\n"); + + exit(1); + } + + page2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG, &mtr); + + /* fseg_create acquires a second latch on the page, + therefore we must declare it: */ + + buf_page_dbg_add_level(page2, SYNC_NO_ORDER_CHECK); + + if (page2 == NULL) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite buffer: you must\n" + "InnoDB: increase your tablespace size.\n" + "InnoDB: Cannot continue operation.\n"); + + /* We exit without committing the mtr to prevent + its modifications to the database getting to disk */ + + exit(1); + } + + fseg_header = page + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_FSEG; + prev_page_no = 0; + + for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2; i++) { + page_no = fseg_alloc_free_page(fseg_header, + prev_page_no + 1, + FSP_UP, &mtr); + if (page_no == FIL_NULL) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite buffer: you must\n" + "InnoDB: increase your tablespace size.\n" + "InnoDB: Cannot continue operation.\n"); + + exit(1); + } + + /* We read the allocated pages to the buffer pool; + when they are written to disk in a flush, the space + id and page number fields are also written to the + pages. When we at database startup read pages + from the doublewrite buffer, we know that if the + space id and page number in them are the same as + the page position in the tablespace, then the page + has not been written to in doublewrite. 
*/ + + new_page = buf_page_get(TRX_SYS_SPACE, page_no, + RW_X_LATCH, &mtr); + buf_page_dbg_add_level(new_page, SYNC_NO_ORDER_CHECK); + + /* Make a dummy change to the page to ensure it will + be written to disk in a flush */ + + mlog_write_ulint(new_page + FIL_PAGE_DATA, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + + if (i == FSP_EXTENT_SIZE / 2) { + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + } else if (i == FSP_EXTENT_SIZE / 2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + } else if (i > FSP_EXTENT_SIZE / 2) { + ut_a(page_no == prev_page_no + 1); + } + + prev_page_no = page_no; + } + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC, + TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC + + TRX_SYS_DOUBLEWRITE_REPEAT, + TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(ut_dulint_max, TRUE); + + fprintf(stderr, "InnoDB: Doublewrite buffer created\n"); + + goto start_again; + } +} + +/******************************************************************** +At a database startup uses a possible doublewrite buffer to restore +half-written pages in the data files. */ + +void +trx_sys_doublewrite_restore_corrupt_pages(void) +/*===========================================*/ +{ + byte* buf; + byte* read_buf; + byte* unaligned_read_buf; + ulint block1; + ulint block2; + byte* page; + byte* doublewrite; + ulint space_id; + ulint page_no; + ulint i; + + /* We do the file i/o past the buffer pool */ + + unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE); + read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE); + + /* Read the trx sys header to check if we are using the + doublewrite buffer */ + + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, TRX_SYS_PAGE_NO, 0, + UNIV_PAGE_SIZE, read_buf, NULL); + + doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has been created */ + + trx_doublewrite_init(doublewrite); + + block1 = trx_doublewrite->block1; + block2 = trx_doublewrite->block2; + + buf = trx_doublewrite->write_buf; + } else { + goto leave_func; + } + + /* Read the pages from the doublewrite buffer to memory */ + + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block1, 0, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf, NULL); + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block2, 0, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + NULL); + /* Check if any of these pages is half-written in data files, in the + intended position */ + + page = buf; + + for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { + + space_id = mach_read_from_4(page + FIL_PAGE_SPACE); + page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); + + if (!fil_check_adress_in_tablespace(space_id, page_no)) { + fprintf(stderr, + "InnoDB: Warning: an inconsistent page in the doublewrite buffer\n" + "InnoDB: space id %lu page number %lu, %lu'th page in dblwr buf.\n", + space_id, page_no, i); + + 
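
Both the doublewrite creation and restore code above rely on every page image carrying its own space id and page number in the page header, stored most-significant-byte first (which is what mach_read_from_4() decodes). A tiny sketch of that decoding; the header offsets 0 and 4 and the read_be32 helper are assumptions made for this illustration only.

#include <stdio.h>

#define UNIV_PAGE_SIZE	16384
#define FIL_PAGE_SPACE	0	/* assumed header offset of the space id */
#define FIL_PAGE_OFFSET	4	/* assumed header offset of the page number */

static unsigned long
read_be32(const unsigned char *b)
{
	return(((unsigned long) b[0] << 24) | ((unsigned long) b[1] << 16)
	       | ((unsigned long) b[2] << 8) | (unsigned long) b[3]);
}

int
main(void)
{
	unsigned char	page[UNIV_PAGE_SIZE] = {0};

	/* Stamp space id 0, page number 300 the way a flush would. */
	page[FIL_PAGE_OFFSET + 2] = 0x01;
	page[FIL_PAGE_OFFSET + 3] = 0x2c;	/* 300 == 0x012c */

	printf("space id %lu, page number %lu\n",
	       read_be32(page + FIL_PAGE_SPACE),
	       read_be32(page + FIL_PAGE_OFFSET));
	return(0);
}
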
} else if (space_id == TRX_SYS_SPACE + && ( (page_no >= block1 + && page_no + < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) + || (page_no >= block2 + && page_no + < block2 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) { + + /* It is an unwritten doublewrite buffer page: + do nothing */ + + } else { + /* Read in the actual page from the data files */ + + fil_io(OS_FILE_READ, TRUE, space_id, page_no, 0, + UNIV_PAGE_SIZE, read_buf, NULL); + /* Check if the page is corrupt */ + + if (buf_page_is_corrupted(read_buf)) { + + fprintf(stderr, + "InnoDB: Warning: database page corruption or a failed\n" + "InnoDB: file read of page %lu.\n", page_no); + fprintf(stderr, + "InnoDB: Trying to recover it from the doublewrite buffer.\n"); + + if (buf_page_is_corrupted(page)) { + fprintf(stderr, + "InnoDB: Also the page in the doublewrite buffer is corrupt.\n" + "InnoDB: Cannot continue operation.\n"); + exit(1); + } + + /* Write the good page from the + doublewrite buffer to the intended + position */ + + fil_io(OS_FILE_WRITE, TRUE, space_id, + page_no, 0, + UNIV_PAGE_SIZE, page, NULL); + fprintf(stderr, + "InnoDB: Recovered the page from the doublewrite buffer.\n"); + } + } + + page += UNIV_PAGE_SIZE; + } + + fil_flush_file_spaces(FIL_TABLESPACE); + +leave_func: + ut_free(unaligned_read_buf); +} /******************************************************************** Checks that trx is in the trx list. */ diff --git a/libmysql/Makefile.shared b/libmysql/Makefile.shared index 28248a0b982..ba97c7cf04d 100644 --- a/libmysql/Makefile.shared +++ b/libmysql/Makefile.shared @@ -55,7 +55,8 @@ mysysobjects1 = my_init.lo my_static.lo my_malloc.lo my_realloc.lo \ mf_loadpath.lo my_pthread.lo my_thr_init.lo \ thr_mutex.lo mulalloc.lo string.lo default.lo \ my_compress.lo array.lo my_once.lo list.lo my_net.lo \ - charset.lo hash.lo + charset.lo hash.lo mf_iocache.lo my_seek.lo \ + my_pread.lo mf_cache.lo # Not needed in the minimum library mysysobjects2 = getopt.lo getopt1.lo getvar.lo my_lib.lo mysysobjects = $(mysysobjects1) $(mysysobjects2) diff --git a/libmysql/libmysql.c b/libmysql/libmysql.c index 22b3c75fcb9..378744b9f1b 100644 --- a/libmysql/libmysql.c +++ b/libmysql/libmysql.c @@ -293,7 +293,7 @@ HANDLE create_named_pipe(NET *net, uint connect_timeout, char **arg_host, ** or packet is an error message *****************************************************************************/ -static uint +uint net_safe_read(MYSQL *mysql) { NET *net= &mysql->net; @@ -415,7 +415,7 @@ static void free_rows(MYSQL_DATA *cur) } -static int +int simple_command(MYSQL *mysql,enum enum_server_command command, const char *arg, uint length, my_bool skipp_check) { diff --git a/ltmain.sh b/ltmain.sh index eba25223e45..cebed74c167 100644 --- a/ltmain.sh +++ b/ltmain.sh @@ -1798,6 +1798,9 @@ compiler." *-*-cygwin* | *-*-mingw* | *-*-os2* | *-*-beos*) # these systems don't actually have a c library (as such)! ;; + *-*-freebsd*) + #FreeBSD needs to handle -lc and -lc_r itself + ;; *-*-rhapsody*) # rhapsody is a little odd... 
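
The per-page decision in trx_sys_doublewrite_restore_corrupt_pages() above boils down to: leave intact data-file pages alone, skip pages that belong to the doublewrite area itself, restore a corrupt data-file page from its doublewrite copy, and give up only when both copies are corrupt. A sketch of just that decision, with the real i/o and checksum calls replaced by stand-ins; page_is_corrupted, write_page and restore_one_page are hypothetical names.

#include <stdio.h>
#include <stdlib.h>

typedef enum { PAGE_OK, PAGE_CORRUPT } page_state_t;

/* Stand-in for buf_page_is_corrupted(), which checks the page checksums. */
static int
page_is_corrupted(page_state_t state)
{
	return(state == PAGE_CORRUPT);
}

/* Stand-in for the fil_io() write that puts the good copy back in place. */
static void
write_page(unsigned long page_no)
{
	fprintf(stderr,
	"InnoDB: Recovered page %lu from the doublewrite buffer.\n", page_no);
}

static void
restore_one_page(unsigned long page_no, page_state_t data_file_copy,
		 page_state_t doublewrite_copy)
{
	if (!page_is_corrupted(data_file_copy)) {
		return;		/* data file copy is fine: nothing to do */
	}

	if (page_is_corrupted(doublewrite_copy)) {
		fprintf(stderr,
	"InnoDB: Also the page in the doublewrite buffer is corrupt.\n"
	"InnoDB: Cannot continue operation.\n");
		exit(1);
	}

	write_page(page_no);	/* half-written page: overwrite with good copy */
}

int
main(void)
{
	restore_one_page(300, PAGE_CORRUPT, PAGE_OK);	/* gets recovered */
	restore_one_page(301, PAGE_OK, PAGE_OK);	/* left untouched */
	return(0);
}
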
deplibs="$deplibs -framework System" diff --git a/myisam/myisamchk.c b/myisam/myisamchk.c index bda1637314f..f650e4312f7 100644 --- a/myisam/myisamchk.c +++ b/myisam/myisamchk.c @@ -206,7 +206,7 @@ static struct option long_options[] = static void print_version(void) { - printf("%s Ver 1.48 for %s at %s\n",my_progname,SYSTEM_TYPE, + printf("%s Ver 1.49 for %s at %s\n",my_progname,SYSTEM_TYPE, MACHINE_TYPE); } @@ -468,7 +468,7 @@ static void get_options(register int *argc,register char ***argv) if ((check_param.testflag & T_READONLY) && (check_param.testflag & (T_REP_BY_SORT | T_REP | T_STATISTICS | T_AUTO_INC | - T_SORT_RECORDS | T_SORT_INDEX))) + T_SORT_RECORDS | T_SORT_INDEX | T_FORCE_CREATE))) { VOID(fprintf(stderr, "%s: Can't use --readonly when repairing or sorting\n", diff --git a/mysql-test/r/bdb.result b/mysql-test/r/bdb.result index 89d9c56b3b0..39b4962ef58 100644 --- a/mysql-test/r/bdb.result +++ b/mysql-test/r/bdb.result @@ -139,6 +139,8 @@ t1 1 level 1 level A 3 NULL NULL gesuchnr benutzer_id 1 1 2 1 +id x +1 2 Table Op Msg_type Msg_text test.t1 optimize status OK a diff --git a/mysql-test/r/order_by.result b/mysql-test/r/order_by.result index a47fc950f0e..79b70ac2cc9 100644 --- a/mysql-test/r/order_by.result +++ b/mysql-test/r/order_by.result @@ -111,6 +111,31 @@ DateOfAction TransactionID member_id nickname voornaam 1 2 +gid sid uid +104620 5 15 +103867 5 27 +103962 5 27 +104619 5 75 +104505 5 117 +103853 5 250 +gid sid uid +104620 5 15 +103867 5 27 +103962 5 27 +104619 5 75 +104505 5 117 +103853 5 250 +table type possible_keys key key_len ref rows Extra +t1 index PRIMARY PRIMARY 4 NULL 6 Using index +t2 eq_ref PRIMARY,uid PRIMARY 4 t1.gid 1 +t3 eq_ref PRIMARY PRIMARY 2 t2.uid 1 where used; Using index +table type possible_keys key key_len ref rows Extra +t1 index PRIMARY PRIMARY 4 NULL 6 Using index +t3 eq_ref PRIMARY PRIMARY 2 t1.gid 1 where used +table type possible_keys key key_len ref rows Extra +t1 index PRIMARY PRIMARY 4 NULL 6 Using index; Using temporary; Using filesort +t2 eq_ref PRIMARY,uid PRIMARY 4 t1.gid 1 +t3 eq_ref PRIMARY PRIMARY 2 t2.uid 1 where used; Using index table type possible_keys key key_len ref rows Extra t1 range a a 20 NULL 2 where used; Using index a b c diff --git a/mysql-test/t/bdb.test b/mysql-test/t/bdb.test index f0c24722e90..5e28c31e051 100644 --- a/mysql-test/t/bdb.test +++ b/mysql-test/t/bdb.test @@ -65,6 +65,13 @@ replace into t1 (gesuchnr,benutzer_id) values (1,1); select * from t1; drop table t1; +# test for bug in replace with secondary key +create table t1 (id int not null primary key, x int not null, key (x)) type=bdb; +insert into t1 (id, x) values (1, 1); +replace into t1 (id, x) values (1, 2); +select * from t1; +drop table t1; + # # test delete using hidden_primary_key # diff --git a/mysql-test/t/fulltext.test b/mysql-test/t/fulltext.test index 064219c6ad3..153fdefd960 100644 --- a/mysql-test/t/fulltext.test +++ b/mysql-test/t/fulltext.test @@ -2,7 +2,7 @@ # Test of fulltext index # -drop table if exists t1,t2; +drop table if exists t1,t2,t3; CREATE TABLE t1 (a VARCHAR(200), b TEXT, FULLTEXT (a,b)); INSERT INTO t1 VALUES('MySQL has now support', 'for full-text search'),('Full-text indexes', 'are called collections'),('Only MyISAM tables','support collections'),('Function MATCH ... 
AGAINST()','is used to do a search'),('Full-text search in MySQL', 'implements vector space model'); @@ -61,4 +61,23 @@ select * from t2 where MATCH inhalt AGAINST (NULL); select * from t2 where MATCH inhalt AGAINST ('foobar'); select * from t2 having MATCH inhalt AGAINST ('foobar'); -drop table t1,t2; +# +# check of fulltext errors +# + +CREATE TABLE t3 ( + ticket int(11), + inhalt text, + KEY tig (ticket), + fulltext index tix (inhalt) +); + +--error 1210 +select * from t2 having MATCH inhalt AGAINST (t1.id); +--error 1210 +select * from t2 having MATCH ticket AGAINST ('foobar'); +--error 1210 +select * from t2,t3 having MATCH (t2.inhalt,t3.inhalt) AGAINST ('foobar'); + +drop table t1,t2,t3; + diff --git a/mysql-test/t/order_by.test b/mysql-test/t/order_by.test index 08d26413761..baa3fe67f0b 100644 --- a/mysql-test/t/order_by.test +++ b/mysql-test/t/order_by.test @@ -254,3 +254,41 @@ select * from t1 where a between 0 and 1 order by a desc, b desc; drop table t1; +CREATE TABLE t1 ( + gid int(10) unsigned NOT NULL auto_increment, + cid smallint(5) unsigned NOT NULL default '0', + PRIMARY KEY (gid), + KEY component_id (cid) +) TYPE=MyISAM; +INSERT INTO t1 VALUES (103853,108),(103867,108),(103962,108),(104505,108),(104619,108),(104620,108); +ALTER TABLE t1 add skr int(10) not null; + +CREATE TABLE t2 ( + gid int(10) unsigned NOT NULL default '0', + uid smallint(5) unsigned NOT NULL default '1', + sid tinyint(3) unsigned NOT NULL default '1', + PRIMARY KEY (gid), + KEY uid (uid), + KEY status_id (sid) +) TYPE=MyISAM; +INSERT INTO t2 VALUES (103853,250,5),(103867,27,5),(103962,27,5),(104505,117,5),(104619,75,5),(104620,15,5); + +CREATE TABLE t3 ( + uid smallint(6) NOT NULL auto_increment, + PRIMARY KEY (uid) +) TYPE=MyISAM; +INSERT INTO t3 VALUES (1),(15),(27),(75),(117),(250); +ALTER TABLE t3 add skr int(10) not null; + +select t1.gid, t2.sid, t3.uid from t2, t1, t3 where t2.gid = t1.gid and t2.uid = t3.uid order by t3.uid, t1.gid; +select t1.gid, t2.sid, t3.uid from t3, t2, t1 where t2.gid = t1.gid and t2.uid = t3.uid order by t3.uid, t1.gid; + +# The following ORDER BY can be optimimized +EXPLAIN select t1.gid, t2.sid, t3.uid from t3, t2, t1 where t2.gid = t1.gid and t2.uid = t3.uid order by t1.gid, t3.uid; +EXPLAIN SELECT t1.gid, t3.uid from t1, t3 where t1.gid = t3.uid order by t1.gid,t3.skr; + +# The following ORDER BY can't be optimimized +EXPLAIN SELECT t1.gid, t2.sid, t3.uid from t2, t1, t3 where t2.gid = t1.gid and t2.uid = t3.uid order by t3.uid, t1.gid; +EXPLAIN SELECT t1.gid, t3.uid from t1, t3 where t1.gid = t3.uid order by t3.skr,t1.gid; +EXPLAIN SELECT t1.gid, t3.uid from t1, t3 where t1.skr = t3.uid order by t1.gid,t3.skr; +drop table t1,t2,t3; diff --git a/mysys/default.c b/mysys/default.c index cb842da0f02..126c0270a17 100644 --- a/mysys/default.c +++ b/mysys/default.c @@ -222,7 +222,7 @@ static my_bool search_default_file(DYNAMIC_ARRAY *args, MEM_ROOT *alloc, const char *dir, const char *config_file, const char *ext, TYPELIB *group) { - char name[FN_REFLEN+10],buff[FN_REFLEN+1],*ptr,*end,*value,*tmp; + char name[FN_REFLEN+10],buff[4096],*ptr,*end,*value,*tmp; FILE *fp; uint line=0; my_bool read_values=0,found_group=0; diff --git a/scripts/mysqlhotcopy.sh b/scripts/mysqlhotcopy.sh index 1c26bf8e2d6..71359fa5612 100644 --- a/scripts/mysqlhotcopy.sh +++ b/scripts/mysqlhotcopy.sh @@ -223,18 +223,27 @@ foreach my $rdb ( @db_desc ) { my $db = $rdb->{src}; eval { $dbh->do( "use $db" ); }; die "Database '$db' not accessible: $@" if ( $@ ); - my @dbh_tables = $dbh->func( 
'_ListTables' ); + my @dbh_tables = $dbh->tables(); ## generate regex for tables/files - my $t_regex = $rdb->{t_regex}; ## assign temporary regex - my $negated = $t_regex =~ tr/~//d; ## remove and count negation operator: we don't allow ~ in table names - $t_regex = qr/$t_regex/; ## make regex string from user regex - - ## filter (out) tables specified in t_regex - print "Filtering tables with '$t_regex'\n" if $opt{debug}; - @dbh_tables = ( $negated - ? grep { $_ !~ $t_regex } @dbh_tables - : grep { $_ =~ $t_regex } @dbh_tables ); + my $t_regex; + my $negated; + if ($rdb->{t_regex}) { + $t_regex = $rdb->{t_regex}; ## assign temporary regex + $negated = $t_regex =~ tr/~//d; ## remove and count + ## negation operator: we + ## don't allow ~ in table + ## names + + $t_regex = qr/$t_regex/; ## make regex string from + ## user regex + + ## filter (out) tables specified in t_regex + print "Filtering tables with '$t_regex'\n" if $opt{debug}; + @dbh_tables = ( $negated + ? grep { $_ !~ $t_regex } @dbh_tables + : grep { $_ =~ $t_regex } @dbh_tables ); + } ## get list of files to copy my $db_dir = "$datadir/$db"; @@ -249,10 +258,18 @@ foreach my $rdb ( @db_desc ) { closedir( DBDIR ); ## filter (out) files specified in t_regex - my @db_files = ( $negated - ? grep { $db_files{$_} !~ $t_regex } keys %db_files - : grep { $db_files{$_} =~ $t_regex } keys %db_files ); + my @db_files; + if ($rdb->{t_regex}) { + @db_files = ($negated + ? grep { $db_files{$_} !~ $t_regex } keys %db_files + : grep { $db_files{$_} =~ $t_regex } keys %db_files ); + } + else { + @db_files = keys %db_files; + } + @db_files = sort @db_files; + my @index_files=(); ## remove indices unless we're told to keep them @@ -809,3 +826,7 @@ Ask Bjoern Hansen - Cleanup code to fix a few bugs and enable -w again. Emil S. Hansen - Added resetslave and resetmaster. +Jeremy D. Zawodny - Removed depricated DBI calls. Fixed bug which +resulted in nothing being copied when a regexp was specified but no +database name(s). + diff --git a/sql/Makefile.am b/sql/Makefile.am index 02ac8aa6cc7..d3c3ff541fd 100644 --- a/sql/Makefile.am +++ b/sql/Makefile.am @@ -27,7 +27,6 @@ INCLUDES = @MT_INCLUDES@ \ -I$(srcdir) -I../include -I.. -I. $(openssl_includes) WRAPLIBS= @WRAPLIBS@ SUBDIRS = share -bin_PROGRAMS = mysqlbinlog libexec_PROGRAMS = mysqld noinst_PROGRAMS = gen_lex_hash gen_lex_hash_LDFLAGS = @NOINST_LDFLAGS@ @@ -83,12 +82,9 @@ mysqld_SOURCES = sql_lex.cc sql_handler.cc \ sql_udf.cc sql_analyse.cc sql_analyse.h sql_cache.cc \ slave.cc sql_repl.cc \ mini_client.cc mini_client_errors.c \ - md5.c stacktrace.c sql_union.cc + md5.c stacktrace.c gen_lex_hash_SOURCES = gen_lex_hash.cc gen_lex_hash_LDADD = $(LDADD) $(CXXLDFLAGS) -mysqlbinlog_SOURCES = mysqlbinlog.cc mini_client.cc net_serv.cc \ - mini_client_errors.c password.c -mysqlbinlog_LDADD = $(LDADD) $(CXXLDFLAGS) $(mysqld_LDADD) DEFS = -DMYSQL_SERVER \ -DDEFAULT_MYSQL_HOME="\"$(MYSQLBASEdir)\"" \ diff --git a/sql/ha_berkeley.cc b/sql/ha_berkeley.cc index 6907da855b9..25f8148e52f 100644 --- a/sql/ha_berkeley.cc +++ b/sql/ha_berkeley.cc @@ -888,7 +888,7 @@ int ha_berkeley::write_row(byte * record) if (changed_keys & 1) { if ((new_error = remove_key(sub_trans, keynr, record, - (DBT*) 0, &prim_key))) + &prim_key))) break; /* purecov: inspected */ } } @@ -970,7 +970,7 @@ int ha_berkeley::update_primary_key(DB_TXN *trans, bool primary_key_changed, { // Primary key changed or we are updating a key that can have duplicates. 
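
The ha_berkeley.cc changes above drop the packed_record argument from remove_key(): a secondary-index entry is now located purely by its (secondary key, primary key) pair and deleted through a cursor, which is what the replace-with-secondary-key test case added to bdb.test exercises. A toy in-memory model of that lookup-and-delete, not the Berkeley DB API; sec_index_entry_t and remove_sec_entry are invented for the sketch.

#include <stdio.h>
#include <string.h>

typedef struct {
	char	sec_key[16];	/* secondary key column value */
	char	prim_key[16];	/* primary key of the row it points to */
	int	in_use;
} sec_index_entry_t;

static int
remove_sec_entry(sec_index_entry_t *index, int n,
		 const char *sec_key, const char *prim_key)
{
	int	i;

	for (i = 0; i < n; i++) {
		if (index[i].in_use
		    && strcmp(index[i].sec_key, sec_key) == 0
		    && strcmp(index[i].prim_key, prim_key) == 0) {
			index[i].in_use = 0;	/* the c_del() step */
			return(0);
		}
	}

	return(-1);	/* entry not found */
}

int
main(void)
{
	sec_index_entry_t	idx[2] = {
		{"x=1", "id=1", 1},
		{"x=2", "id=1", 1}
	};

	/* REPLACE of (id=1, x=1) by (id=1, x=2): the old secondary entry
	for the same primary key must be removed. */
	if (remove_sec_entry(idx, 2, "x=1", "id=1") == 0) {
		printf("removed old secondary index entry\n");
	}
	return(0);
}
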
// Delete the old row and add a new one - if (!(error=remove_key(trans, primary_key, old_row, (DBT *) 0, old_key))) + if (!(error=remove_key(trans, primary_key, old_row, old_key))) { if (!(error=pack_row(&row, new_row, 0))) { @@ -1034,7 +1034,7 @@ int ha_berkeley::restore_keys(DB_TXN *trans, key_map changed_keys, if (changed_keys & 1) { if (changed_keys != 1 && - (error = remove_key(trans, keynr, new_row, (DBT*) 0, new_key))) + (error = remove_key(trans, keynr, new_row, new_key))) break; /* purecov: inspected */ if ((error = key_file[keynr]->put(key_file[keynr], trans, create_key(&tmp_key, keynr, key_buff2, @@ -1105,8 +1105,7 @@ int ha_berkeley::update_row(const byte * old_row, byte * new_row) continue; if (key_cmp(keynr, old_row, new_row) || primary_key_changed) { - if ((error=remove_key(sub_trans, keynr, old_row, (DBT*) 0, - &old_prim_key))) + if ((error=remove_key(sub_trans, keynr, old_row, &old_prim_key))) { if (using_ignore && /* purecov: inspected */ (thd_options & OPTION_INTERNAL_SUBTRANSACTIONS)) @@ -1172,11 +1171,9 @@ int ha_berkeley::update_row(const byte * old_row, byte * new_row) Delete one key This uses key_buff2, when keynr != primary key, so it's important that a function that calls this doesn't use this buffer for anything else. - packed_record may be NULL if the key is unique */ int ha_berkeley::remove_key(DB_TXN *trans, uint keynr, const byte *record, - DBT *packed_record, DBT *prim_key) { int error; @@ -1207,13 +1204,9 @@ int ha_berkeley::remove_key(DB_TXN *trans, uint keynr, const byte *record, if (!(error=key_file[keynr]->cursor(key_file[keynr], trans, &tmp_cursor, 0))) { - if (!(error=cursor->c_get(tmp_cursor, - (keynr == primary_key ? - prim_key : - create_key(&key, keynr, key_buff2, record)), - (keynr == primary_key ? - packed_record : prim_key), - DB_GET_BOTH | DB_RMW))) + if (!(error=tmp_cursor->c_get(tmp_cursor, + create_key(&key, keynr, key_buff2, record), + prim_key, DB_GET_BOTH | DB_RMW))) { // This shouldn't happen error=tmp_cursor->c_del(tmp_cursor,0); } @@ -1236,7 +1229,7 @@ int ha_berkeley::remove_keys(DB_TXN *trans, const byte *record, { if (keys & 1) { - int new_error=remove_key(trans, keynr, record, new_record, prim_key); + int new_error=remove_key(trans, keynr, record, prim_key); if (new_error) { result=new_error; // Return last error /* purecov: inspected */ diff --git a/sql/ha_berkeley.h b/sql/ha_berkeley.h index 9e657d72da1..3eb793937ae 100644 --- a/sql/ha_berkeley.h +++ b/sql/ha_berkeley.h @@ -69,8 +69,7 @@ class ha_berkeley: public handler int key_length = MAX_KEY_LENGTH); DBT *pack_key(DBT *key, uint keynr, char *buff, const byte *key_ptr, uint key_length); - int remove_key(DB_TXN *trans, uint keynr, const byte *record, - DBT *packed_record, DBT *prim_key); + int remove_key(DB_TXN *trans, uint keynr, const byte *record, DBT *prim_key); int remove_keys(DB_TXN *trans,const byte *record, DBT *new_record, DBT *prim_key, key_map keys); int restore_keys(DB_TXN *trans, key_map changed_keys, uint primary_key, diff --git a/sql/ha_innobase.cc b/sql/ha_innobase.cc index 8ea700de789..7bd71363915 100644 --- a/sql/ha_innobase.cc +++ b/sql/ha_innobase.cc @@ -822,11 +822,11 @@ ha_innobase::open( if (NULL == (ib_table = dict_table_get(norm_name, NULL))) { - fprintf(stderr, "\ -Cannot find table %s from the internal data dictionary\n\ -of InnoDB though the .frm file for the table exists. 
Maybe you have deleted\n\ -and created again an InnoDB database but forgotten to delete the\n\ -corresponding .frm files of old InnoDB tables?\n", + fprintf(stderr, +"Cannot find table %s from the internal data dictionary\n" +"of InnoDB though the .frm file for the table exists. Maybe you have deleted\n" +"and created again an InnoDB database but forgotten to delete the\n" +"corresponding .frm files of old InnoDB tables?\n", norm_name); free_share(share); @@ -2660,6 +2660,37 @@ ha_innobase::records_in_range( } /************************************************************************* +Gives an UPPER BOUND to the number of rows in a table. This is used in +filesort.cc and the upper bound must hold. TODO: Since the number of +rows in a table may change after this function is called, we still may +get a 'Sort aborted' error in filesort.cc of MySQL. The ultimate fix is to +improve the algorithm of filesort.cc. */ + +ha_rows +ha_innobase::estimate_number_of_rows(void) +/*======================================*/ + /* out: upper bound of rows, currently 32-bit int + or uint */ +{ + row_prebuilt_t* prebuilt = (row_prebuilt_t*) innobase_prebuilt; + dict_table_t* ib_table; + + DBUG_ENTER("info"); + + ib_table = prebuilt->table; + + dict_update_statistics(ib_table); + + data_file_length = ((ulonglong) + ib_table->stat_clustered_index_size) + * UNIV_PAGE_SIZE; + + /* The minimum clustered index record size is 20 bytes */ + + return((ha_rows) (1000 + data_file_length / 20)); +} + +/************************************************************************* How many seeks it will take to read through the table. This is to be comparable to the number returned by records_in_range so that we can decide if we should scan the table or use keys. */ diff --git a/sql/ha_innobase.h b/sql/ha_innobase.h index 4dbff654337..d129e00ba6e 100644 --- a/sql/ha_innobase.h +++ b/sql/ha_innobase.h @@ -137,6 +137,7 @@ class ha_innobase: public handler enum ha_rkey_function start_search_flag, const byte *end_key,uint end_key_len, enum ha_rkey_function end_search_flag); + ha_rows estimate_number_of_rows(); int create(const char *name, register TABLE *form, HA_CREATE_INFO *create_info); diff --git a/sql/ha_myisam.cc b/sql/ha_myisam.cc index 63e2cf7c201..0a86e833134 100644 --- a/sql/ha_myisam.cc +++ b/sql/ha_myisam.cc @@ -35,7 +35,7 @@ ulong myisam_recover_options= HA_RECOVER_NONE; /* bits in myisam_recover_options */ const char *myisam_recover_names[] = -{ "DEFAULT", "BACKUP", "FORCE", "QUICK"}; +{ "DEFAULT", "BACKUP", "FORCE", "QUICK", NullS}; TYPELIB myisam_recover_typelib= {array_elements(myisam_recover_names),"", myisam_recover_names}; diff --git a/sql/item_func.cc b/sql/item_func.cc index b76bee78b2e..66a50eb0ec0 100644 --- a/sql/item_func.cc +++ b/sql/item_func.cc @@ -1954,13 +1954,17 @@ bool Item_func_match::fix_fields(THD *thd,struct st_table_list *tlist) maybe_null=1; join_key=0; - /* Why testing for const_item ? Monty */ - /* I'll remove it later, but this should include modifications to - find_best and auto_close as complement to auto_init code above. SerG */ - /* I'd rather say now that const_item is assumed in quite a bit of - places, so it would be difficult to remove. SerG */ + /* Serg: + I'd rather say now that const_item is assumed in quite a bit of + places, so it would be difficult to remove; If it would ever to be + removed, this should include modifications to find_best and auto_close + as complement to auto_init code above. 
+ */ if (Item_func::fix_fields(thd,tlist) || !const_item()) + { + my_error(ER_WRONG_ARGUMENTS,MYF(0),"AGAINST"); return 1; + } while ((item=li++)) { @@ -1969,12 +1973,18 @@ bool Item_func_match::fix_fields(THD *thd,struct st_table_list *tlist) if (item->type() == Item::REF_ITEM) li.replace(item= *((Item_ref *)item)->ref); if (item->type() != Item::FIELD_ITEM || !item->used_tables()) + { + my_error(ER_WRONG_ARGUMENTS,MYF(0),"MATCH"); return 1; + } used_tables_cache|=item->used_tables(); } /* check that all columns come from the same table */ if (count_bits(used_tables_cache) != 1) + { + my_error(ER_WRONG_ARGUMENTS,MYF(0),"MATCH"); return 1; + } const_item_cache=0; table=((Item_field *)fields.head())->field->table; return 0; diff --git a/sql/log_event.cc b/sql/log_event.cc index 14524216076..ca01e418992 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -404,15 +404,26 @@ int Log_event::read_log_event(IO_CACHE* file, String* packet, #endif // MYSQL_CLIENT -// allocates memory - the caller is responsible for clean-up +#ifndef MYSQL_CLIENT +#define UNLOCK_MUTEX if(log_lock) pthread_mutex_unlock(log_lock); +#else +#define UNLOCK_MUTEX +#endif +// allocates memory - the caller is responsible for clean-up +#ifndef MYSQL_CLIENT Log_event* Log_event::read_log_event(IO_CACHE* file, pthread_mutex_t* log_lock) +#else +Log_event* Log_event::read_log_event(IO_CACHE* file) +#endif { char head[LOG_EVENT_HEADER_LEN]; - if(log_lock) pthread_mutex_lock(log_lock); +#ifndef MYSQL_CLIENT + if(log_lock) pthread_mutex_lock(log_lock); +#endif if (my_b_read(file, (byte *) head, sizeof(head))) { - if (log_lock) pthread_mutex_unlock(log_lock); + UNLOCK_MUTEX; return 0; } @@ -449,7 +460,7 @@ Log_event* Log_event::read_log_event(IO_CACHE* file, pthread_mutex_t* log_lock) if((res = read_log_event(buf, data_len))) res->register_temp_buf(buf); err: - if (log_lock) pthread_mutex_unlock(log_lock); + UNLOCK_MUTEX; if(error) { sql_print_error(error); diff --git a/sql/log_event.h b/sql/log_event.h index d16d76bed37..4c24f640465 100644 --- a/sql/log_event.h +++ b/sql/log_event.h @@ -233,10 +233,13 @@ public: virtual void print(FILE* file, bool short_form = 0, char* last_db = 0) = 0; void print_timestamp(FILE* file, time_t *ts = 0); void print_header(FILE* file); -#endif +#ifndef MYSQL_CLIENT // if mutex is 0, the read will proceed without mutex static Log_event* read_log_event(IO_CACHE* file, pthread_mutex_t* log_lock); +#else // avoid having to link mysqlbinlog against libpthread + static Log_event* read_log_event(IO_CACHE* file); +#endif static Log_event* read_log_event(const char* buf, int event_len); const char* get_type_str(); diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h index d02a2eb729e..6326e3d07c2 100644 --- a/sql/mysql_priv.h +++ b/sql/mysql_priv.h @@ -177,6 +177,13 @@ char* query_table_status(THD *thd,const char *db,const char *table_name); #define SELECT_NO_UNLOCK (QUERY_NO_GOOD_INDEX_USED*2) #define TMP_TABLE_ALL_COLUMNS (SELECT_NO_UNLOCK*2) +#define MODE_REAL_AS_FLOAT 1 +#define MODE_PIPES_AS_CONCAT 2 +#define MODE_ANSI_QUOTES 4 +#define MODE_IGNORE_SPACE 8 +#define MODE_SERIALIZABLE 16 +#define MODE_ONLY_FULL_GROUP_BY 32 + #define RAID_BLOCK_SIZE 1024 /* BINLOG_DUMP options */ diff --git a/sql/mysqld.cc b/sql/mysqld.cc index dad4a1c2427..09c464dd15b 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -220,7 +220,7 @@ static char mysql_home[FN_REFLEN],pidfile_name[FN_REFLEN]; static pthread_t select_thread; static bool opt_log,opt_update_log,opt_bin_log,opt_slow_log,opt_noacl, opt_disable_networking=0, 
opt_bootstrap=0,opt_skip_show_db=0, - opt_ansi_mode=0,opt_myisam_log=0, + opt_myisam_log=0, opt_large_files=sizeof(my_off_t) > 4; bool opt_sql_bin_update = 0, opt_log_slave_updates = 0, opt_safe_show_db=0, opt_show_slave_auth_info = 0, opt_old_rpl_compat = 0; @@ -320,6 +320,7 @@ char server_version[SERVER_VERSION_LENGTH]=MYSQL_SERVER_VERSION; const char *first_keyword="first"; const char **errmesg; /* Error messages */ const char *myisam_recover_options_str="OFF"; +const char *sql_mode_str="OFF"; const char *default_tx_isolation_name; enum_tx_isolation default_tx_isolation=ISO_READ_COMMITTED; @@ -333,6 +334,12 @@ double log_10[32]; /* 10 potences */ I_List<THD> threads,thread_cache; time_t start_time; +ulong opt_sql_mode = 0L; +const char *sql_mode_names[] = +{ "REAL_AS_FLOAT", "PIPES_AS_CONCAT", "ANSI_QUOTES", "IGNORE_SPACE", + "SERIALIZE","ONLY_FULL_GROUP_BY", NullS }; +TYPELIB sql_mode_typelib= {array_elements(sql_mode_names),"", + sql_mode_names}; MY_BITMAP temp_pool; bool use_temp_pool=0; @@ -1525,7 +1532,7 @@ static void open_log(MYSQL_LOG *log, const char *hostname, // get rid of extention if the log is binary to avoid problems if (type == LOG_BIN) { - char* p = strrchr(opt_name, FN_EXTCHAR); + char* p = strrchr((char*) opt_name, FN_EXTCHAR); if (p) *p = 0; } @@ -2520,6 +2527,7 @@ enum options { OPT_REPORT_USER, OPT_REPORT_PASSWORD, OPT_REPORT_PORT, OPT_MAX_BINLOG_DUMP_EVENTS, OPT_SPORADIC_BINLOG_DUMP_FAIL, OPT_SHOW_SLAVE_AUTH_INFO, OPT_OLD_RPL_COMPAT, + OPT_SQL_MODE, OPT_SLAVE_LOAD_TMPDIR}; static struct option long_options[] = { @@ -2664,6 +2672,7 @@ static struct option long_options[] = { {"skip-thread-priority", no_argument, 0, (int) OPT_SKIP_PRIOR}, {"slave-load-tmpdir", required_argument, 0, (int) OPT_SLAVE_LOAD_TMPDIR}, {"sql-bin-update-same", no_argument, 0, (int) OPT_SQL_BIN_UPDATE_SAME}, + {"sql-mode", required_argument, 0, (int) OPT_SQL_MODE}, #include "sslopt-longopts.h" #ifdef __WIN__ {"standalone", no_argument, 0, (int) OPT_STANDALONE}, @@ -2832,7 +2841,6 @@ CHANGEABLE_VAR changeable_vars[] = { struct show_var_st init_vars[]= { - {"ansi_mode", (char*) &opt_ansi_mode, SHOW_BOOL}, {"back_log", (char*) &back_log, SHOW_LONG}, {"basedir", mysql_home, SHOW_CHAR}, #ifdef HAVE_BERKELEY_DB @@ -2939,6 +2947,7 @@ struct show_var_st init_vars[]= { {"slow_launch_time", (char*) &slow_launch_time, SHOW_LONG}, {"socket", (char*) &mysql_unix_port, SHOW_CHAR_PTR}, {"sort_buffer", (char*) &sortbuff_size, SHOW_LONG}, + {"sql_mode", (char*) &sql_mode_str, SHOW_CHAR_PTR}, {"table_cache", (char*) &table_cache_size, SHOW_LONG}, {"table_type", (char*) &default_table_type_name, SHOW_CHAR_PTR}, {"thread_cache_size", (char*) &thread_cache_size, SHOW_LONG}, @@ -3122,6 +3131,9 @@ static void usage(void) Don't give threads different priorities.\n\ --socket=... 
Socket file to use for connection\n\ -t, --tmpdir=path Path for temporary files\n\ + --sql-mode=option[,option[,option...]] where option can be one of:\n\ + REAL_AS_FLOAT, PIPES_AS_CONCAT, ANSI_QUOTES,\n\ + IGNORE_SPACE, SERIALIZE, ONLY_FULL_GROUP_BY.\n\ --transaction-isolation\n\ Default transaction isolation level\n\ --temp-pool Use a pool of temporary files\n\ @@ -3277,8 +3289,9 @@ static void get_options(int argc,char **argv) opt_warnings=1; break; case 'a': - opt_ansi_mode=1; - thd_startup_options|=OPTION_ANSI_MODE; + opt_sql_mode = (MODE_REAL_AS_FLOAT | MODE_PIPES_AS_CONCAT | + MODE_ANSI_QUOTES | MODE_IGNORE_SPACE | MODE_SERIALIZABLE + | MODE_ONLY_FULL_GROUP_BY); default_tx_isolation= ISO_SERIALIZABLE; break; case 'b': diff --git a/sql/sql_class.cc b/sql/sql_class.cc index d0ff5d962db..13c673fdb33 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -125,6 +125,7 @@ THD::THD():user_time(0),fatal_error(0),last_insert_id_used(0), server_status=SERVER_STATUS_AUTOCOMMIT; update_lock_default= low_priority_updates ? TL_WRITE_LOW_PRIORITY : TL_WRITE; options=thd_startup_options; + sql_mode=(uint) opt_sql_mode; inactive_timeout=net_wait_timeout; open_options=ha_open_options; tx_isolation=session_tx_isolation=default_tx_isolation; diff --git a/sql/sql_class.h b/sql/sql_class.h index 3f212a8f8c6..d149f087c1f 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -240,7 +240,7 @@ public: char *query,*thread_stack; char *host,*user,*priv_user,*db,*ip; const char *proc_info; - uint client_capabilities,max_packet_length; + uint client_capabilities,sql_mode,max_packet_length; uint master_access,db_access; TABLE *open_tables,*temporary_tables, *handler_tables; MYSQL_LOCK *lock,*locked_tables; diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc index 8f77931a05f..0e77757b13b 100644 --- a/sql/sql_lex.cc +++ b/sql/sql_lex.cc @@ -121,7 +121,7 @@ void lex_init(void) state_map[(uchar)'*']= (uchar) STATE_END_LONG_COMMENT; state_map[(uchar)'@']= (uchar) STATE_USER_END; state_map[(uchar) '`']= (uchar) STATE_USER_VARIABLE_DELIMITER; - if (thd_startup_options & OPTION_ANSI_MODE) + if (opt_sql_mode & MODE_ANSI_QUOTES) { state_map[(uchar) '"'] = STATE_USER_VARIABLE_DELIMITER; } @@ -149,7 +149,7 @@ LEX *lex_start(THD *thd, uchar *buf,uint length) lex->select->ftfunc_list.empty(); lex->convert_set=(lex->thd=thd)->convert_set; lex->yacc_yyss=lex->yacc_yyvs=0; - lex->ignore_space=test(thd->client_capabilities & CLIENT_IGNORE_SPACE); + lex->ignore_space=test(thd->sql_mode & MODE_IGNORE_SPACE); return lex; } diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index 6c3205c2feb..1655d54e9fc 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -417,6 +417,8 @@ check_connections(THD *thd) return(ER_OUT_OF_RESOURCES); thd->client_capabilities=uint2korr(net->read_pos); + if (thd->client_capabilities & CLIENT_IGNORE_SPACE) + thd->sql_mode|= MODE_IGNORE_SPACE; #ifdef HAVE_OPENSSL DBUG_PRINT("info", ("pkt_len:%d, client capabilities: %d", @@ -541,8 +543,6 @@ pthread_handler_decl(handle_one_connection,arg) thd->options |= OPTION_BIG_SELECTS; if (thd->client_capabilities & CLIENT_COMPRESS) net->compress=1; // Use compression - if (thd->options & OPTION_ANSI_MODE) - thd->client_capabilities|=CLIENT_IGNORE_SPACE; thd->proc_info=0; // Remove 'login' thd->command=COM_SLEEP; diff --git a/sql/sql_select.cc b/sql/sql_select.cc index cd6caf213f8..3a1d36796b2 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -2701,12 +2701,12 @@ static void update_depend_map(JOIN *join) for (i=0 ; i < ref->key_parts ; i++,item++) 
depend_map|=(*item)->used_tables(); ref->depend_map=depend_map; - for (JOIN_TAB *join_tab2=join->join_tab; + for (JOIN_TAB **tab=join->map2table; depend_map ; - join_tab2++,depend_map>>=1 ) + tab++,depend_map>>=1 ) { if (depend_map & 1) - ref->depend_map|=join_tab2->ref.depend_map; + ref->depend_map|=(*tab)->ref.depend_map; } } } @@ -2723,12 +2723,12 @@ static void update_depend_map(JOIN *join, ORDER *order) order->depend_map=depend_map=order->item[0]->used_tables(); if (!(order->depend_map & RAND_TABLE_BIT)) // Not item_sum() or RAND() { - for (JOIN_TAB *join_tab=join->join_tab; + for (JOIN_TAB **tab=join->map2table; depend_map ; - join_tab++, depend_map>>=1) + tab++, depend_map>>=1) { if (depend_map & 1) - order->depend_map|=join_tab->ref.depend_map; + order->depend_map|=(*tab)->ref.depend_map; } } } @@ -5552,6 +5552,7 @@ static int remove_dup_with_compare(THD *thd, TABLE *table, Field **first_field, { if ((error=file->delete_row(record))) goto err; + error=file->rnd_next(record); continue; } if (copy_blobs(first_field)) @@ -6063,7 +6064,7 @@ setup_group(THD *thd,TABLE_LIST *tables,List<Item> &fields, if (!order) return 0; /* Everything is ok */ - if (thd->options & OPTION_ANSI_MODE) + if (thd->sql_mode & MODE_ONLY_FULL_GROUP_BY) { Item *item; List_iterator<Item> li(fields); @@ -6085,7 +6086,7 @@ setup_group(THD *thd,TABLE_LIST *tables,List<Item> &fields, return 1; } } - if (thd->options & OPTION_ANSI_MODE) + if (thd->sql_mode & MODE_ONLY_FULL_GROUP_BY) { /* Don't allow one to use fields that is not used in GROUP BY */ Item *item; diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy index c6ae2241603..5537206089b 100644 --- a/sql/sql_yacc.yy +++ b/sql/sql_yacc.yy @@ -35,7 +35,7 @@ int yylex(void *yylval); inline Item *or_or_concat(Item* A, Item* B) { - return (current_thd->options & OPTION_ANSI_MODE ? + return (current_thd->sql_mode & MODE_PIPES_AS_CONCAT ? (Item*) new Item_func_concat(A,B) : (Item*) new Item_cond_or(A,B)); } @@ -949,7 +949,7 @@ int_type: | BIGINT { $$=FIELD_TYPE_LONGLONG; } real_type: - REAL { $$= current_thd->options & OPTION_ANSI_MODE ? + REAL { $$= current_thd->sql_mode & MODE_REAL_AS_FLOAT ? FIELD_TYPE_FLOAT : FIELD_TYPE_DOUBLE; } | DOUBLE_SYM { $$=FIELD_TYPE_DOUBLE; } | DOUBLE_SYM PRECISION { $$=FIELD_TYPE_DOUBLE; } |