Diffstat (limited to 'innobase')
131 files changed, 11537 insertions, 4670 deletions
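Most hunks in this change-set thread a rec_get_offsets() result through functions that previously parsed records directly (rec_get_nth_field(), btr_node_ptr_get_child_page_no(), cmp_dtuple_rec(), page_rec_print(), ...), so that both the old and the new compact record format can be handled. Below is a minimal sketch of the recurring caller pattern. The stack buffer, its first-element initialization and the conditional mem_heap_free() are copied from the hunks that follow; the function name, the loop body and the include list are illustrative assumptions, not part of the patch.

/* Sketch only: the rec_get_offsets() caller pattern introduced by this
change-set.  Assumes the usual InnoDB headers of this source tree. */

#include "univ.i"
#include "rem0rec.h"
#include "mem0mem.h"
#include "dict0dict.h"

static
void
btr_example_print_field_lens(
/*=========================*/
	rec_t*		rec,	/* in: physical record */
	dict_index_t*	index)	/* in: record descriptor */
{
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	ulint		i;

	/* Tell rec_get_offsets() how many slots the stack buffer has;
	it allocates from heap only when the record needs more. */
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;

	offsets = rec_get_offsets(rec, index, offsets,
					ULINT_UNDEFINED, &heap);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		ulint	len;
		byte*	field = rec_get_nth_field(rec, offsets, i, &len);

		fprintf(stderr, "field %lu at %p, len %lu\n",
			(ulong) i, (void*) field, (ulong) len);
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}

When the surrounding function already owns a heap (for example btr_page_get_father_for_rec() or btr_validate_level() below), the hunks reuse that heap and pass the previous offsets array back in as the buffer instead of freeing and reallocating it.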
diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c index ae967e0525e..c27fb73ff8d 100644 --- a/innobase/btr/btr0btr.c +++ b/innobase/btr/btr0btr.c @@ -20,6 +20,7 @@ Created 6/2/1994 Heikki Tuuri #include "rem0cmp.h" #include "lock0lock.h" #include "ibuf0ibuf.h" +#include "trx0trx.h" /* Latching strategy of the InnoDB B-tree @@ -86,15 +87,6 @@ btr_page_create( page_t* page, /* in: page to be created */ dict_tree_t* tree, /* in: index tree */ mtr_t* mtr); /* in: mtr */ -/****************************************************************** -Sets the child node file address in a node pointer. */ -UNIV_INLINE -void -btr_node_ptr_set_child_page_no( -/*===========================*/ - rec_t* rec, /* in: node pointer record */ - ulint page_no, /* in: child node address */ - mtr_t* mtr); /* in: mtr */ /**************************************************************** Returns the upper level node pointer to a page. It is assumed that mtr holds an x-latch on the tree. */ @@ -128,7 +120,10 @@ btr_page_insert_fits( rec_t* split_rec, /* in: suggestion for first record on upper half-page, or NULL if tuple should be first */ - dtuple_t* tuple); /* in: tuple to insert */ + const ulint* offsets, /* in: rec_get_offsets( + split_rec, cursor->index) */ + dtuple_t* tuple, /* in: tuple to insert */ + mem_heap_t* heap); /* in: temporary memory heap */ /****************************************************************** Gets the root node of a tree and x-latches it. */ @@ -148,6 +143,8 @@ btr_root_get( root_page_no = dict_tree_get_page(tree); root = btr_page_get(space, root_page_no, RW_X_LATCH, mtr); + ut_a((ibool)!!page_is_comp(root) == + UT_LIST_GET_FIRST(tree->tree_indexes)->table->comp); return(root); } @@ -167,21 +164,19 @@ btr_get_prev_user_rec( page_t* page; page_t* prev_page; ulint prev_page_no; - rec_t* prev_rec; ulint space; - page = buf_frame_align(rec); - - if (page_get_infimum_rec(page) != rec) { + if (!page_rec_is_infimum(rec)) { - prev_rec = page_rec_get_prev(rec); + rec_t* prev_rec = page_rec_get_prev(rec); - if (page_get_infimum_rec(page) != prev_rec) { + if (!page_rec_is_infimum(prev_rec)) { return(prev_rec); } } + page = buf_frame_align(rec); prev_page_no = btr_page_get_prev(page, mtr); space = buf_frame_get_space_id(page); @@ -194,10 +189,9 @@ btr_get_prev_user_rec( MTR_MEMO_PAGE_S_FIX)) || (mtr_memo_contains(mtr, buf_block_align(prev_page), MTR_MEMO_PAGE_X_FIX))); + ut_a(page_is_comp(prev_page) == page_is_comp(page)); - prev_rec = page_rec_get_prev(page_get_supremum_rec(prev_page)); - - return(prev_rec); + return(page_rec_get_prev(page_get_supremum_rec(prev_page))); } return(NULL); @@ -218,21 +212,19 @@ btr_get_next_user_rec( page_t* page; page_t* next_page; ulint next_page_no; - rec_t* next_rec; ulint space; - page = buf_frame_align(rec); - - if (page_get_supremum_rec(page) != rec) { + if (!page_rec_is_supremum(rec)) { - next_rec = page_rec_get_next(rec); + rec_t* next_rec = page_rec_get_next(rec); - if (page_get_supremum_rec(page) != next_rec) { + if (!page_rec_is_supremum(next_rec)) { return(next_rec); } } + page = buf_frame_align(rec); next_page_no = btr_page_get_next(page, mtr); space = buf_frame_get_space_id(page); @@ -246,9 +238,8 @@ btr_get_next_user_rec( || (mtr_memo_contains(mtr, buf_block_align(next_page), MTR_MEMO_PAGE_X_FIX))); - next_rec = page_rec_get_next(page_get_infimum_rec(next_page)); - - return(next_rec); + ut_a(page_is_comp(next_page) == page_is_comp(page)); + return(page_rec_get_next(page_get_infimum_rec(next_page))); } return(NULL); @@ -267,7 +258,8 @@ 
btr_page_create( { ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - page_create(page, mtr); + page_create(page, mtr, + UT_LIST_GET_FIRST(tree->tree_indexes)->table->comp); buf_block_align(page)->check_index_page_at_flush = TRUE; btr_page_set_index_id(page, tree->id, mtr); @@ -503,20 +495,21 @@ UNIV_INLINE void btr_node_ptr_set_child_page_no( /*===========================*/ - rec_t* rec, /* in: node pointer record */ - ulint page_no, /* in: child node address */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: node pointer record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint page_no,/* in: child node address */ + mtr_t* mtr) /* in: mtr */ { - ulint n_fields; byte* field; ulint len; + ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_ad(0 < btr_page_get_level(buf_frame_align(rec), mtr)); - - n_fields = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); /* The child address is in the last field */ - field = rec_get_nth_field(rec, n_fields - 1, &len); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); ut_ad(len == 4); @@ -529,16 +522,18 @@ static page_t* btr_node_ptr_get_child( /*===================*/ - /* out: child page, x-latched */ - rec_t* node_ptr, /* in: node pointer */ - mtr_t* mtr) /* in: mtr */ + /* out: child page, x-latched */ + rec_t* node_ptr,/* in: node pointer */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + mtr_t* mtr) /* in: mtr */ { ulint page_no; ulint space; page_t* page; - + + ut_ad(rec_offs_validate(node_ptr, NULL, offsets)); space = buf_frame_get_space_id(node_ptr); - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); page = btr_page_get(space, page_no, RW_X_LATCH, mtr); @@ -564,11 +559,14 @@ btr_page_get_father_for_rec( dtuple_t* tuple; btr_cur_t cursor; rec_t* node_ptr; + dict_index_t* index; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK)); - ut_a(user_rec != page_get_supremum_rec(page)); - ut_a(user_rec != page_get_infimum_rec(page)); + ut_a(page_rec_is_user_rec(user_rec)); ut_ad(dict_tree_get_page(tree) != buf_frame_get_page_no(page)); @@ -576,36 +574,44 @@ btr_page_get_father_for_rec( tuple = dict_tree_build_node_ptr(tree, user_rec, 0, heap, btr_page_get_level(page, mtr)); + index = UT_LIST_GET_FIRST(tree->tree_indexes); /* In the following, we choose just any index from the tree as the first parameter for btr_cur_search_to_nth_level. 
*/ - - btr_cur_search_to_nth_level(UT_LIST_GET_FIRST(tree->tree_indexes), + + btr_cur_search_to_nth_level(index, btr_page_get_level(page, mtr) + 1, tuple, PAGE_CUR_LE, BTR_CONT_MODIFY_TREE, &cursor, 0, mtr); node_ptr = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); - if (btr_node_ptr_get_child_page_no(node_ptr) != + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != buf_frame_get_page_no(page)) { + rec_t* print_rec; fputs("InnoDB: Dump of the child page:\n", stderr); buf_page_print(buf_frame_align(page)); fputs("InnoDB: Dump of the parent page:\n", stderr); buf_page_print(buf_frame_align(node_ptr)); fputs("InnoDB: Corruption of an index tree: table ", stderr); - ut_print_name(stderr, NULL, - UT_LIST_GET_FIRST(tree->tree_indexes)->table_name); + ut_print_name(stderr, NULL, index->table_name); fputs(", index ", stderr); - ut_print_name(stderr, NULL, - UT_LIST_GET_FIRST(tree->tree_indexes)->name); + ut_print_name(stderr, NULL, index->name); fprintf(stderr, ",\n" "InnoDB: father ptr page no %lu, child page no %lu\n", - (ulong) btr_node_ptr_get_child_page_no(node_ptr), + (ulong) + btr_node_ptr_get_child_page_no(node_ptr, offsets), (ulong) buf_frame_get_page_no(page)); - page_rec_print(page_rec_get_next(page_get_infimum_rec(page))); - page_rec_print(node_ptr); + print_rec = page_rec_get_next(page_get_infimum_rec(page)); + offsets = rec_get_offsets(print_rec, index, + offsets, ULINT_UNDEFINED, &heap); + page_rec_print(print_rec, offsets); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(node_ptr, offsets); fputs( "InnoDB: You should dump + drop + reimport the table to fix the\n" @@ -614,7 +620,7 @@ btr_page_get_father_for_rec( "InnoDB: forcing recovery. 
Then dump + drop + reimport.\n", stderr); } - ut_a(btr_node_ptr_get_child_page_no(node_ptr) == + ut_a(btr_node_ptr_get_child_page_no(node_ptr, offsets) == buf_frame_get_page_no(page)); mem_heap_free(heap); @@ -649,6 +655,7 @@ btr_create( ulint type, /* in: type of the index */ ulint space, /* in: space where created */ dulint index_id,/* in: index id */ + ulint comp, /* in: nonzero=compact page format */ mtr_t* mtr) /* in: mini-transaction handle */ { ulint page_no; @@ -716,7 +723,7 @@ btr_create( } /* Create a new index page on the the allocated segment page */ - page = page_create(frame, mtr); + page = page_create(frame, mtr, comp); buf_block_align(page)->check_index_page_at_flush = TRUE; /* Set the index id of the page */ @@ -821,12 +828,14 @@ static void btr_page_reorganize_low( /*====================*/ - ibool recovery,/* in: TRUE if called in recovery: locks should not - be updated, i.e., there cannot exist locks on the - page, and a hash index should not be dropped: it - cannot exist */ - page_t* page, /* in: page to be reorganized */ - mtr_t* mtr) /* in: mtr */ + ibool recovery,/* in: TRUE if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + page_t* page, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_t* new_page; ulint log_mode; @@ -837,11 +846,14 @@ btr_page_reorganize_low( ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); + ut_ad(!!page_is_comp(page) == index->table->comp); data_size1 = page_get_data_size(page); max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1); /* Write the log record */ - mlog_write_initial_log_record(page, MLOG_PAGE_REORGANIZE, mtr); + mlog_open_and_write_index(mtr, page, index, page_is_comp(page) + ? MLOG_COMP_PAGE_REORGANIZE + : MLOG_PAGE_REORGANIZE, 0); /* Turn logging off */ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); @@ -858,14 +870,14 @@ btr_page_reorganize_low( /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ - page_create(page, mtr); + page_create(page, mtr, page_is_comp(page)); buf_block_align(page)->check_index_page_at_flush = TRUE; /* Copy the records from the temporary space to the recreated page; do not copy the lock bits yet */ page_copy_rec_list_end_no_locks(page, new_page, - page_get_infimum_rec(new_page), mtr); + page_get_infimum_rec(new_page), index, mtr); /* Copy max trx id to recreated page */ page_set_max_trx_id(page, page_get_max_trx_id(new_page)); @@ -901,10 +913,11 @@ Reorganizes an index page. */ void btr_page_reorganize( /*================*/ - page_t* page, /* in: page to be reorganized */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { - btr_page_reorganize_low(FALSE, page, mtr); + btr_page_reorganize_low(FALSE, page, index, mtr); } /*************************************************************** @@ -913,18 +926,20 @@ Parses a redo log record of reorganizing a page. 
*/ byte* btr_parse_page_reorganize( /*======================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr __attribute__((unused)), /* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), + /* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ut_ad(ptr && end_ptr); /* The record is empty, except for the record initial part */ if (page) { - btr_page_reorganize_low(TRUE, page, mtr); + btr_page_reorganize_low(TRUE, page, index, mtr); } return(ptr); @@ -946,7 +961,7 @@ btr_page_empty( /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ - page_create(page, mtr); + page_create(page, mtr, page_is_comp(page)); buf_block_align(page)->check_index_page_at_flush = TRUE; } @@ -1011,7 +1026,7 @@ btr_root_raise_and_insert( /* Move the records from root to the new page */ page_move_rec_list_end(new_page, root, page_get_infimum_rec(root), - mtr); + cursor->index, mtr); /* If this is a pessimistic insert which is actually done to perform a pessimistic update then we have stored the lock information of the record to be inserted on the infimum of the @@ -1031,7 +1046,7 @@ btr_root_raise_and_insert( node_ptr = dict_tree_build_node_ptr(tree, rec, new_page_no, heap, level); /* Reorganize the root to get free space */ - btr_page_reorganize(root, mtr); + btr_page_reorganize(root, cursor->index, mtr); page_cursor = btr_cur_get_page_cur(cursor); @@ -1039,7 +1054,8 @@ btr_root_raise_and_insert( page_cur_set_before_first(root, page_cursor); - node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, mtr); + node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, + cursor->index, mtr); ut_ad(node_ptr_rec); @@ -1047,7 +1063,7 @@ btr_root_raise_and_insert( as there is no lower alphabetical limit to records in the leftmost node of a level: */ - btr_set_min_rec_mark(node_ptr_rec, mtr); + btr_set_min_rec_mark(node_ptr_rec, page_is_comp(root), mtr); /* Free the memory heap */ mem_heap_free(heap); @@ -1060,7 +1076,8 @@ btr_root_raise_and_insert( ibuf_reset_free_bits(UT_LIST_GET_FIRST(tree->tree_indexes), new_page); /* Reposition the cursor to the child node */ - page_cur_search(new_page, tuple, PAGE_CUR_LE, page_cursor); + page_cur_search(new_page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); /* Split the child and insert tuple */ return(btr_page_split_and_insert(cursor, tuple, mtr)); @@ -1127,7 +1144,6 @@ btr_page_get_split_rec_to_right( { page_t* page; rec_t* insert_point; - rec_t* supremum; page = btr_cur_get_page(cursor); insert_point = btr_cur_get_rec(cursor); @@ -1136,13 +1152,23 @@ btr_page_get_split_rec_to_right( the previous insert on the same page, we assume that there is a pattern of sequential inserts here. 
*/ - if (page_header_get_ptr(page, PAGE_LAST_INSERT) == insert_point) { + if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT) + == insert_point)) { + + rec_t* next_rec; - supremum = page_get_supremum_rec(page); - - if (page_rec_get_next(insert_point) != supremum - && page_rec_get_next(page_rec_get_next(insert_point)) - != supremum) { + next_rec = page_rec_get_next(insert_point); + + if (page_rec_is_supremum(next_rec)) { +split_at_new: + /* Split at the new record to insert */ + *split_rec = NULL; + } else { + rec_t* next_next_rec = page_rec_get_next(next_rec); + if (page_rec_is_supremum(next_next_rec)) { + + goto split_at_new; + } /* If there are >= 2 user records up from the insert point, split all but 1 off. We want to keep one because @@ -1151,12 +1177,8 @@ btr_page_get_split_rec_to_right( search position just by looking at the records on this page. */ - *split_rec = page_rec_get_next( - page_rec_get_next(insert_point)); - } else { - /* Else split at the new record to insert */ - *split_rec = NULL; - } + *split_rec = next_next_rec; + } return(TRUE); } @@ -1190,11 +1212,13 @@ btr_page_get_sure_split_rec( rec_t* rec; rec_t* next_rec; ulint n; - + mem_heap_t* heap; + ulint* offsets; + page = btr_cur_get_page(cursor); - insert_size = rec_get_converted_size(tuple); - free_space = page_get_free_space_of_empty(); + insert_size = rec_get_converted_size(cursor->index, tuple); + free_space = page_get_free_space_of_empty(page_is_comp(page)); /* free_space is now the free space of a created new page */ @@ -1208,6 +1232,9 @@ btr_page_get_sure_split_rec( ins_rec = btr_cur_get_rec(cursor); rec = page_get_infimum_rec(page); + heap = NULL; + offsets = NULL; + /* We start to include records to the left half, and when the space reserved by them exceeds half of total_space, then if the included records fit on the left page, they will be put there @@ -1230,7 +1257,9 @@ btr_page_get_sure_split_rec( /* Include tuple */ incl_data += insert_size; } else { - incl_data += rec_get_size(rec); + offsets = rec_get_offsets(rec, cursor->index, + offsets, ULINT_UNDEFINED, &heap); + incl_data += rec_offs_size(offsets); } n++; @@ -1245,18 +1274,24 @@ btr_page_get_sure_split_rec( supremum record of page */ if (rec == ins_rec) { - next_rec = NULL; + rec = NULL; + + goto func_exit; } else if (rec == NULL) { next_rec = page_rec_get_next(ins_rec); } else { next_rec = page_rec_get_next(rec); } - if (next_rec != page_get_supremum_rec(page)) { - - return(next_rec); + ut_ad(next_rec); + if (!page_rec_is_supremum(next_rec)) { + rec = next_rec; } } +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(rec); } } @@ -1275,7 +1310,10 @@ btr_page_insert_fits( rec_t* split_rec, /* in: suggestion for first record on upper half-page, or NULL if tuple to be inserted should be first */ - dtuple_t* tuple) /* in: tuple to insert */ + const ulint* offsets, /* in: rec_get_offsets( + split_rec, cursor->index) */ + dtuple_t* tuple, /* in: tuple to insert */ + mem_heap_t* heap) /* in: temporary memory heap */ { page_t* page; ulint insert_size; @@ -1284,11 +1322,18 @@ btr_page_insert_fits( ulint total_n_recs; rec_t* rec; rec_t* end_rec; + ulint* offs; page = btr_cur_get_page(cursor); - - insert_size = rec_get_converted_size(tuple); - free_space = page_get_free_space_of_empty(); + + ut_ad(!split_rec == !offsets); + ut_ad(!offsets + || !page_is_comp(page) == !rec_offs_comp(offsets)); + ut_ad(!offsets + || rec_offs_validate(split_rec, cursor->index, offsets)); + + insert_size = rec_get_converted_size(cursor->index, 
tuple); + free_space = page_get_free_space_of_empty(page_is_comp(page)); /* free_space is now the free space of a created new page */ @@ -1303,7 +1348,7 @@ btr_page_insert_fits( rec = page_rec_get_next(page_get_infimum_rec(page)); end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); - } else if (cmp_dtuple_rec(tuple, split_rec) >= 0) { + } else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) { rec = page_rec_get_next(page_get_infimum_rec(page)); end_rec = split_rec; @@ -1321,11 +1366,16 @@ btr_page_insert_fits( return(TRUE); } + offs = NULL; + while (rec != end_rec) { /* In this loop we calculate the amount of reserved space after rec is removed from page. */ - total_data -= rec_get_size(rec); + offs = rec_get_offsets(rec, cursor->index, offs, + ULINT_UNDEFINED, &heap); + + total_data -= rec_offs_size(offs); total_n_recs--; if (total_data + page_dir_calc_reserved_space(total_n_recs) @@ -1411,6 +1461,10 @@ btr_attach_half_pages( MTR_MEMO_PAGE_X_FIX)); ut_ad(mtr_memo_contains(mtr, buf_block_align(new_page), MTR_MEMO_PAGE_X_FIX)); + ut_a(page_is_comp(page) == page_is_comp(new_page)); + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); /* Based on split direction, decide upper and lower pages */ if (direction == FSP_DOWN) { @@ -1426,7 +1480,12 @@ btr_attach_half_pages( /* Replace the address of the old child node (= page) with the address of the new lower half */ - btr_node_ptr_set_child_page_no(node_ptr, lower_page_no, mtr); + btr_node_ptr_set_child_page_no(node_ptr, + rec_get_offsets(node_ptr, + UT_LIST_GET_FIRST(tree->tree_indexes), + NULL, ULINT_UNDEFINED, &heap), + lower_page_no, mtr); + mem_heap_empty(heap); } else { lower_page_no = buf_frame_get_page_no(page); upper_page_no = buf_frame_get_page_no(new_page); @@ -1434,9 +1493,6 @@ btr_attach_half_pages( upper_page = new_page; } - /* Create a memory heap where the data tuple is stored */ - heap = mem_heap_create(100); - /* Get the level of the split pages */ level = btr_page_get_level(page, mtr); @@ -1465,6 +1521,7 @@ btr_attach_half_pages( if (prev_page_no != FIL_NULL) { prev_page = btr_page_get(space, prev_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(prev_page) == page_is_comp(page)); btr_page_set_next(prev_page, lower_page_no, mtr); } @@ -1472,6 +1529,7 @@ btr_attach_half_pages( if (next_page_no != FIL_NULL) { next_page = btr_page_get(space, next_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(next_page) == page_is_comp(page)); btr_page_set_prev(next_page, upper_page_no, mtr); } @@ -1522,7 +1580,15 @@ btr_page_split_and_insert( ibool insert_will_fit; ulint n_iterations = 0; rec_t* rec; + mem_heap_t* heap; + ulint n_uniq; + ulint* offsets; + + heap = mem_heap_create(1024); + n_uniq = dict_index_get_n_unique_in_tree(cursor->index); func_start: + mem_heap_empty(heap); + offsets = NULL; tree = btr_cur_get_tree(cursor); ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), @@ -1574,9 +1640,10 @@ func_start: first_rec = split_rec; move_limit = split_rec; } else { - buf = mem_alloc(rec_get_converted_size(tuple)); + buf = mem_alloc(rec_get_converted_size(cursor->index, tuple)); - first_rec = rec_convert_dtuple_to_rec(buf, tuple); + first_rec = rec_convert_dtuple_to_rec(buf, + cursor->index, tuple); move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); } @@ -1593,7 +1660,16 @@ func_start: We can then move the records after releasing the tree latch, thus reducing the tree latch contention. 
*/ - insert_will_fit = btr_page_insert_fits(cursor, split_rec, tuple); + if (split_rec) { + offsets = rec_get_offsets(split_rec, cursor->index, offsets, + n_uniq, &heap); + + insert_will_fit = btr_page_insert_fits(cursor, + split_rec, offsets, tuple, heap); + } else { + insert_will_fit = btr_page_insert_fits(cursor, + NULL, NULL, tuple, heap); + } if (insert_will_fit && (btr_page_get_level(page, mtr) == 0)) { @@ -1605,7 +1681,8 @@ func_start: if (direction == FSP_DOWN) { /* fputs("Split left\n", stderr); */ - page_move_rec_list_start(new_page, page, move_limit, mtr); + page_move_rec_list_start(new_page, page, move_limit, + cursor->index, mtr); left_page = new_page; right_page = page; @@ -1613,7 +1690,8 @@ func_start: } else { /* fputs("Split right\n", stderr); */ - page_move_rec_list_end(new_page, page, move_limit, mtr); + page_move_rec_list_end(new_page, page, move_limit, + cursor->index, mtr); left_page = page; right_page = new_page; @@ -1626,19 +1704,25 @@ func_start: if (split_rec == NULL) { insert_page = right_page; - } else if (cmp_dtuple_rec(tuple, first_rec) >= 0) { - - insert_page = right_page; } else { - insert_page = left_page; + offsets = rec_get_offsets(first_rec, cursor->index, + offsets, n_uniq, &heap); + + if (cmp_dtuple_rec(tuple, first_rec, offsets) >= 0) { + + insert_page = right_page; + } else { + insert_page = left_page; + } } /* 7. Reposition the cursor for insert and try insertion */ page_cursor = btr_cur_get_page_cur(cursor); - page_cur_search(insert_page, tuple, PAGE_CUR_LE, page_cursor); + page_cur_search(insert_page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); if (rec != NULL) { /* Insert fit on the page: update the free bits for the @@ -1650,15 +1734,17 @@ func_start: /* fprintf(stderr, "Split and insert done %lu %lu\n", buf_frame_get_page_no(left_page), buf_frame_get_page_no(right_page)); */ + mem_heap_free(heap); return(rec); } /* 8. If insert did not fit, try page reorganization */ - btr_page_reorganize(insert_page, mtr); + btr_page_reorganize(insert_page, cursor->index, mtr); - page_cur_search(insert_page, tuple, PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + page_cur_search(insert_page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); if (rec == NULL) { /* The insert did not fit on the page: loop back to the @@ -1688,6 +1774,7 @@ func_start: ut_ad(page_validate(left_page, UT_LIST_GET_FIRST(tree->tree_indexes))); ut_ad(page_validate(right_page, UT_LIST_GET_FIRST(tree->tree_indexes))); + mem_heap_free(heap); return(rec); } @@ -1721,6 +1808,7 @@ btr_level_list_remove( if (prev_page_no != FIL_NULL) { prev_page = btr_page_get(space, prev_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(prev_page) == page_is_comp(page)); btr_page_set_next(prev_page, next_page_no, mtr); } @@ -1728,6 +1816,7 @@ btr_level_list_remove( if (next_page_no != FIL_NULL) { next_page = btr_page_get(space, next_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(next_page) == page_is_comp(page)); btr_page_set_prev(next_page, prev_page_no, mtr); } @@ -1741,12 +1830,15 @@ void btr_set_min_rec_mark_log( /*=====================*/ rec_t* rec, /* in: record */ + ulint comp, /* nonzero=compact record format */ mtr_t* mtr) /* in: mtr */ { - mlog_write_initial_log_record(rec, MLOG_REC_MIN_MARK, mtr); + mlog_write_initial_log_record(rec, + comp ? 
MLOG_COMP_REC_MIN_MARK : MLOG_REC_MIN_MARK, mtr); /* Write rec offset as a 2-byte ulint */ - mlog_catenate_ulint(mtr, rec - buf_frame_align(rec), MLOG_2BYTES); + mlog_catenate_ulint(mtr, ut_align_offset(rec, UNIV_PAGE_SIZE), + MLOG_2BYTES); } /******************************************************************** @@ -1759,6 +1851,7 @@ btr_parse_set_min_rec_mark( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ + ulint comp, /* in: nonzero=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { @@ -1770,9 +1863,11 @@ btr_parse_set_min_rec_mark( } if (page) { + ut_a(!page_is_comp(page) == !comp); + rec = page + mach_read_from_2(ptr); - btr_set_min_rec_mark(rec, mtr); + btr_set_min_rec_mark(rec, comp, mtr); } return(ptr + 2); @@ -1785,15 +1880,16 @@ void btr_set_min_rec_mark( /*=================*/ rec_t* rec, /* in: record */ + ulint comp, /* in: nonzero=compact page format */ mtr_t* mtr) /* in: mtr */ { ulint info_bits; - info_bits = rec_get_info_bits(rec); + info_bits = rec_get_info_bits(rec, comp); - rec_set_info_bits(rec, info_bits | REC_INFO_MIN_REC_FLAG); + rec_set_info_bits(rec, comp, info_bits | REC_INFO_MIN_REC_FLAG); - btr_set_min_rec_mark_log(rec, mtr); + btr_set_min_rec_mark_log(rec, comp, mtr); } /***************************************************************** @@ -1842,18 +1938,19 @@ btr_lift_page_up( record from the page should be removed */ mtr_t* mtr) /* in: mtr */ { - rec_t* node_ptr; - page_t* father_page; - ulint page_level; - + page_t* father_page; + ulint page_level; + dict_index_t* index; + ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL); ut_ad(btr_page_get_next(page, mtr) == FIL_NULL); ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - node_ptr = btr_page_get_father_node_ptr(tree, page, mtr); - father_page = buf_frame_align(node_ptr); + father_page = buf_frame_align( + btr_page_get_father_node_ptr(tree, page, mtr)); page_level = btr_page_get_level(page, mtr); + index = UT_LIST_GET_FIRST(tree->tree_indexes); btr_search_drop_page_hash_index(page); @@ -1862,7 +1959,7 @@ btr_lift_page_up( /* Move records to the father */ page_copy_rec_list_end(father_page, page, page_get_infimum_rec(page), - mtr); + index, mtr); lock_update_copy_and_discard(father_page, page); btr_page_set_level(father_page, page_level, mtr); @@ -1871,10 +1968,8 @@ btr_lift_page_up( btr_page_free(tree, page, mtr); /* We play safe and reset the free bits for the father */ - ibuf_reset_free_bits(UT_LIST_GET_FIRST(tree->tree_indexes), - father_page); - ut_ad(page_validate(father_page, - UT_LIST_GET_FIRST(tree->tree_indexes))); + ibuf_reset_free_bits(index, father_page); + ut_ad(page_validate(father_page, index)); ut_ad(btr_check_node_ptr(tree, father_page, mtr)); } @@ -1914,9 +2009,12 @@ btr_compress( ulint max_ins_size; ulint max_ins_size_reorg; ulint level; - + ulint comp; + page = btr_cur_get_page(cursor); tree = btr_cur_get_tree(cursor); + comp = page_is_comp(page); + ut_a((ibool)!!comp == cursor->index->table->comp); ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK)); @@ -1932,7 +2030,9 @@ btr_compress( right_page_no); */ node_ptr = btr_page_get_father_node_ptr(tree, page, mtr); + ut_ad(!comp || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR); father_page = buf_frame_align(node_ptr); + ut_a(comp == page_is_comp(father_page)); /* Decide the page to which we try to merge and which will inherit the locks */ @@ -1957,6 +2057,7 @@ btr_compress( n_recs = 
page_get_n_recs(page); data_size = page_get_data_size(page); + ut_a(page_is_comp(merge_page) == comp); max_ins_size_reorg = page_get_max_insert_size_after_reorganize( merge_page, n_recs); @@ -1975,7 +2076,7 @@ btr_compress( /* We have to reorganize merge_page */ - btr_page_reorganize(merge_page, mtr); + btr_page_reorganize(merge_page, cursor->index, mtr); max_ins_size = page_get_max_insert_size(merge_page, n_recs); @@ -1999,11 +2100,19 @@ btr_compress( if (is_left) { btr_node_ptr_delete(tree, page, mtr); } else { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; /* Replace the address of the old child node (= page) with the address of the merge page to the right */ - btr_node_ptr_set_child_page_no(node_ptr, right_page_no, mtr); - + btr_node_ptr_set_child_page_no(node_ptr, + rec_get_offsets(node_ptr, cursor->index, + offsets_, ULINT_UNDEFINED, &heap), + right_page_no, mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } btr_node_ptr_delete(tree, merge_page, mtr); } @@ -2012,14 +2121,14 @@ btr_compress( orig_pred = page_rec_get_prev( page_get_supremum_rec(merge_page)); page_copy_rec_list_start(merge_page, page, - page_get_supremum_rec(page), mtr); + page_get_supremum_rec(page), cursor->index, mtr); lock_update_merge_left(merge_page, orig_pred, page); } else { orig_succ = page_rec_get_next( page_get_infimum_rec(merge_page)); page_copy_rec_list_end(merge_page, page, - page_get_infimum_rec(page), mtr); + page_get_infimum_rec(page), cursor->index, mtr); lock_update_merge_right(orig_succ, page); } @@ -2133,6 +2242,7 @@ btr_discard_page( return; } + ut_a(page_is_comp(merge_page) == page_is_comp(page)); btr_search_drop_page_hash_index(page); if (left_page_no == FIL_NULL && btr_page_get_level(page, mtr) > 0) { @@ -2142,9 +2252,9 @@ btr_discard_page( node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page)); - ut_ad(node_ptr != page_get_supremum_rec(merge_page)); + ut_ad(page_rec_is_user_rec(node_ptr)); - btr_set_min_rec_mark(node_ptr, mtr); + btr_set_min_rec_mark(node_ptr, page_is_comp(merge_page), mtr); } btr_node_ptr_delete(tree, page, mtr); @@ -2165,6 +2275,7 @@ btr_discard_page( ut_ad(btr_check_node_ptr(tree, merge_page, mtr)); } +#ifdef UNIV_BTR_PRINT /***************************************************************** Prints size info of a B-tree. 
*/ @@ -2215,6 +2326,8 @@ btr_print_recursive( page_t* page, /* in: index page */ ulint width, /* in: print this many entries from start and end */ + mem_heap_t** heap, /* in/out: heap for rec_get_offsets() */ + ulint** offsets,/* in/out: buffer for rec_get_offsets() */ mtr_t* mtr) /* in: mtr */ { page_cur_t cursor; @@ -2223,14 +2336,16 @@ btr_print_recursive( mtr_t mtr2; rec_t* node_ptr; page_t* child; - + dict_index_t* index; + ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n", (ulong) btr_page_get_level(page, mtr), (ulong) buf_frame_get_page_no(page)); - page_print(page, width, width); + index = UT_LIST_GET_FIRST(tree->tree_indexes); + page_print(page, index, width, width); n_recs = page_get_n_recs(page); @@ -2249,9 +2364,12 @@ btr_print_recursive( node_ptr = page_cur_get_rec(&cursor); - child = btr_node_ptr_get_child(node_ptr, &mtr2); - - btr_print_recursive(tree, child, width, &mtr2); + *offsets = rec_get_offsets(node_ptr, index, *offsets, + ULINT_UNDEFINED, heap); + child = btr_node_ptr_get_child(node_ptr, + *offsets, &mtr2); + btr_print_recursive(tree, child, width, + heap, offsets, &mtr2); mtr_commit(&mtr2); } @@ -2270,8 +2388,12 @@ btr_print_tree( ulint width) /* in: print this many entries from start and end */ { - mtr_t mtr; - page_t* root; + mtr_t mtr; + page_t* root; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; fputs("--------------------------\n" "INDEX TREE PRINT\n", stderr); @@ -2280,12 +2402,16 @@ btr_print_tree( root = btr_root_get(tree, &mtr); - btr_print_recursive(tree, root, width, &mtr); + btr_print_recursive(tree, root, width, &heap, &offsets, &mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } mtr_commit(&mtr); - btr_validate_tree(tree); + btr_validate_tree(tree, NULL); } +#endif /* UNIV_BTR_PRINT */ /**************************************************************** Checks that the node pointer to a page is appropriate. 
*/ @@ -2323,7 +2449,10 @@ btr_check_node_ptr( page_rec_get_next(page_get_infimum_rec(page)), 0, heap, btr_page_get_level(page, mtr)); - ut_a(cmp_dtuple_rec(node_ptr_tuple, node_ptr) == 0); + ut_a(cmp_dtuple_rec(node_ptr_tuple, node_ptr, + rec_get_offsets(node_ptr, + dict_tree_find_index(tree, node_ptr), + NULL, ULINT_UNDEFINED, &heap)) == 0); mem_heap_free(heap); @@ -2360,14 +2489,18 @@ btr_index_rec_validate( should print hex dump of record and page on error */ { - ulint len; - ulint n; - ulint i; - page_t* page; + ulint len; + ulint n; + ulint i; + page_t* page; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; page = buf_frame_align(rec); - - if (index->type & DICT_UNIVERSAL) { + + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { /* The insert buffer index tree can contain records from any other index: we cannot check the number of fields or their length */ @@ -2375,38 +2508,46 @@ btr_index_rec_validate( return(TRUE); } + if (UNIV_UNLIKELY((ibool)!!page_is_comp(page) != index->table->comp)) { + btr_index_rec_validate_report(page, rec, index); + fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n", + (ulong) !!page_is_comp(page), + (ulong) index->table->comp); + return(FALSE); + } + n = dict_index_get_n_fields(index); - if (rec_get_n_fields(rec) != n) { + if (!page_is_comp(page) + && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) { btr_index_rec_validate_report(page, rec, index); fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n", - (ulong) rec_get_n_fields(rec), (ulong) n); + (ulong) rec_get_n_fields_old(rec), (ulong) n); - if (!dump_on_error) { + if (dump_on_error) { + buf_page_print(page); - return(FALSE); + fputs("InnoDB: corrupt record ", stderr); + rec_print_old(stderr, rec); + putc('\n', stderr); } - - buf_page_print(page); - - fputs("InnoDB: corrupt record ", stderr); - rec_print(stderr, rec); - putc('\n', stderr); - return(FALSE); } + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + for (i = 0; i < n; i++) { dtype_t* type = dict_index_get_nth_type(index, i); + ulint fixed_size = dtype_get_fixed_size(type); - rec_get_nth_field(rec, i, &len); + rec_get_nth_field(rec, offsets, i, &len); /* Note that prefix indexes are not fixed size even when their type is CHAR. 
*/ if ((dict_index_get_nth_field(index, i)->prefix_len == 0 - && len != UNIV_SQL_NULL && dtype_is_fixed_size(type) - && len != dtype_get_fixed_size(type)) + && len != UNIV_SQL_NULL && fixed_size + && len != fixed_size) || (dict_index_get_nth_field(index, i)->prefix_len > 0 && len != UNIV_SQL_NULL @@ -2418,21 +2559,23 @@ btr_index_rec_validate( "InnoDB: field %lu len is %lu, should be %lu\n", (ulong) i, (ulong) len, (ulong) dtype_get_fixed_size(type)); - if (!dump_on_error) { - - return(FALSE); - } - - buf_page_print(page); - - fputs("InnoDB: corrupt record ", stderr); - rec_print(stderr, rec); - putc('\n', stderr); + if (dump_on_error) { + buf_page_print(page); + fputs("InnoDB: corrupt record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(FALSE); } } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(TRUE); } @@ -2518,6 +2661,7 @@ btr_validate_level( /*===============*/ /* out: TRUE if ok */ dict_tree_t* tree, /* in: index tree */ + trx_t* trx, /* in: transaction or NULL */ ulint level) /* in: level number */ { ulint space; @@ -2527,15 +2671,18 @@ btr_validate_level( page_t* right_father_page; rec_t* node_ptr; rec_t* right_node_ptr; + rec_t* rec; ulint right_page_no; ulint left_page_no; page_cur_t cursor; - mem_heap_t* heap; dtuple_t* node_ptr_tuple; ibool ret = TRUE; dict_index_t* index; mtr_t mtr; - + mem_heap_t* heap = mem_heap_create(256); + ulint* offsets = NULL; + ulint* offsets2= NULL; + mtr_start(&mtr); mtr_x_lock(dict_tree_get_lock(tree), &mtr); @@ -2544,6 +2691,8 @@ btr_validate_level( space = buf_frame_get_space_id(page); + index = UT_LIST_GET_FIRST(tree->tree_indexes); + while (level != btr_page_get_level(page, &mtr)) { ut_a(btr_page_get_level(page, &mtr) > 0); @@ -2552,14 +2701,21 @@ btr_validate_level( page_cur_move_to_next(&cursor); node_ptr = page_cur_get_rec(&cursor); - page = btr_node_ptr_get_child(node_ptr, &mtr); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + page = btr_node_ptr_get_child(node_ptr, offsets, &mtr); } - index = UT_LIST_GET_FIRST(tree->tree_indexes); - /* Now we are on the desired level. Loop through the pages on that level. */ loop: + if (trx_is_interrupted(trx)) { + mtr_commit(&mtr); + mem_heap_free(heap); + return(ret); + } + mem_heap_empty(heap); + offsets = offsets2 = NULL; mtr_x_lock(dict_tree_get_lock(tree), &mtr); /* Check ordering etc. 
of records */ @@ -2588,12 +2744,19 @@ loop: (buf_frame_get_page_no(page) == dict_tree_get_page(tree)))); if (right_page_no != FIL_NULL) { - + rec_t* right_rec; right_page = btr_page_get(space, right_page_no, RW_X_LATCH, &mtr); - if (cmp_rec_rec(page_rec_get_prev(page_get_supremum_rec(page)), - page_rec_get_next(page_get_infimum_rec(right_page)), - UT_LIST_GET_FIRST(tree->tree_indexes)) >= 0) { + ut_a(page_is_comp(right_page) == page_is_comp(page)); + rec = page_rec_get_prev(page_get_supremum_rec(page)); + right_rec = page_rec_get_next( + page_get_infimum_rec(right_page)); + offsets = rec_get_offsets(rec, index, + offsets, ULINT_UNDEFINED, &heap); + offsets2 = rec_get_offsets(right_rec, index, + offsets2, ULINT_UNDEFINED, &heap); + if (cmp_rec_rec(rec, right_rec, offsets, offsets2, index) + >= 0) { btr_validate_report2(index, level, page, right_page); @@ -2604,12 +2767,13 @@ loop: buf_page_print(right_page); fputs("InnoDB: record ", stderr); - rec_print(stderr, page_rec_get_prev( - page_get_supremum_rec(page))); + rec = page_rec_get_prev(page_get_supremum_rec(page)); + rec_print(stderr, rec, index); putc('\n', stderr); fputs("InnoDB: record ", stderr); - rec_print(stderr, page_rec_get_next( - page_get_infimum_rec(right_page))); + rec = page_rec_get_next(page_get_infimum_rec( + right_page)); + rec_print(stderr, rec, index); putc('\n', stderr); ret = FALSE; @@ -2618,7 +2782,8 @@ loop: if (level > 0 && left_page_no == FIL_NULL) { ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( - page_rec_get_next(page_get_infimum_rec(page)))); + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page))); } if (buf_frame_get_page_no(page) != dict_tree_get_page(tree)) { @@ -2627,12 +2792,14 @@ loop: node_ptr = btr_page_get_father_node_ptr(tree, page, &mtr); father_page = buf_frame_align(node_ptr); + offsets = rec_get_offsets(node_ptr, index, + offsets, ULINT_UNDEFINED, &heap); - if (btr_node_ptr_get_child_page_no(node_ptr) != + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != buf_frame_get_page_no(page) || node_ptr != btr_page_get_father_for_rec(tree, page, - page_rec_get_prev(page_get_supremum_rec(page)), - &mtr)) { + page_rec_get_prev(page_get_supremum_rec(page)), + &mtr)) { btr_validate_report1(index, level, page); fputs("InnoDB: node pointer to the page is wrong\n", @@ -2642,17 +2809,18 @@ loop: buf_page_print(page); fputs("InnoDB: node ptr ", stderr); - rec_print(stderr, node_ptr); + rec_print_new(stderr, node_ptr, offsets); fprintf(stderr, "\n" "InnoDB: node ptr child page n:o %lu\n", - (unsigned long) btr_node_ptr_get_child_page_no(node_ptr)); + (unsigned long) btr_node_ptr_get_child_page_no( + node_ptr, offsets)); fputs("InnoDB: record on page ", stderr); - rec_print(stderr, - btr_page_get_father_for_rec(tree, page, - page_rec_get_prev(page_get_supremum_rec(page)), - &mtr)); + rec = btr_page_get_father_for_rec(tree, page, + page_rec_get_prev(page_get_supremum_rec(page)), + &mtr); + rec_print(stderr, rec, index); putc('\n', stderr); ret = FALSE; @@ -2660,7 +2828,8 @@ loop: } if (btr_page_get_level(page, &mtr) > 0) { - heap = mem_heap_create(256); + offsets = rec_get_offsets(node_ptr, index, + offsets, ULINT_UNDEFINED, &heap); node_ptr_tuple = dict_tree_build_node_ptr( tree, @@ -2669,7 +2838,10 @@ loop: 0, heap, btr_page_get_level(page, &mtr)); - if (cmp_dtuple_rec(node_ptr_tuple, node_ptr) != 0) { + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, + offsets)) { + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); btr_validate_report1(index, level, page); @@ -2679,18 
+2851,14 @@ loop: fputs("InnoDB: Error: node ptrs differ" " on levels > 0\n" "InnoDB: node ptr ", stderr); - rec_print(stderr, node_ptr); + rec_print_new(stderr, node_ptr, offsets); fputs("InnoDB: first rec ", stderr); - rec_print(stderr, page_rec_get_next( - page_get_infimum_rec(page))); + rec_print(stderr, first_rec, index); putc('\n', stderr); ret = FALSE; - mem_heap_free(heap); goto node_ptr_fails; } - - mem_heap_free(heap); } if (left_page_no == FIL_NULL) { @@ -2701,7 +2869,7 @@ loop: if (right_page_no == FIL_NULL) { ut_a(node_ptr == page_rec_get_prev( - page_get_supremum_rec(father_page))); + page_get_supremum_rec(father_page))); ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL); } @@ -2771,13 +2939,16 @@ node_ptr_fails: mtr_commit(&mtr); if (right_page_no != FIL_NULL) { + ulint comp = page_is_comp(page); mtr_start(&mtr); page = btr_page_get(space, right_page_no, RW_X_LATCH, &mtr); + ut_a(page_is_comp(page) == comp); goto loop; } + mem_heap_free(heap); return(ret); } @@ -2788,7 +2959,8 @@ ibool btr_validate_tree( /*==============*/ /* out: TRUE if ok */ - dict_tree_t* tree) /* in: tree */ + dict_tree_t* tree, /* in: tree */ + trx_t* trx) /* in: transaction or NULL */ { mtr_t mtr; page_t* root; @@ -2801,9 +2973,8 @@ btr_validate_tree( root = btr_root_get(tree, &mtr); n = btr_page_get_level(root, &mtr); - for (i = 0; i <= n; i++) { - - if (!btr_validate_level(tree, n - i)) { + for (i = 0; i <= n && !trx_is_interrupted(trx); i++) { + if (!btr_validate_level(tree, trx, n - i)) { mtr_commit(&mtr); diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c index 48de5644908..f81cce5b8e9 100644 --- a/innobase/btr/btr0cur.c +++ b/innobase/btr/btr0cur.c @@ -36,11 +36,11 @@ Created 10/16/1994 Heikki Tuuri #include "ibuf0ibuf.h" #include "lock0lock.h" +#ifdef UNIV_DEBUG /* If the following is set to TRUE, this module prints a lot of trace information of individual record operations */ ibool btr_cur_print_record_ops = FALSE; - -ulint btr_cur_rnd = 0; +#endif /* UNIV_DEBUG */ ulint btr_cur_n_non_sea = 0; ulint btr_cur_n_sea = 0; @@ -73,8 +73,9 @@ static void btr_cur_unmark_extern_fields( /*=========================*/ - rec_t* rec, /* in: record in a clustered index */ - mtr_t* mtr); /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + mtr_t* mtr, /* in: mtr */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /*********************************************************************** Adds path information to the cursor for the current page, for which the binary search has been performed. 
*/ @@ -96,6 +97,7 @@ btr_rec_free_updated_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free @@ -108,9 +110,10 @@ static ulint btr_rec_get_externally_stored_len( /*==============================*/ - /* out: externally stored part, in units of a - database page */ - rec_t* rec); /* in: record */ + /* out: externally stored part, + in units of a database page */ + rec_t* rec, /* in: record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /*==================== B-TREE SEARCH =========================*/ @@ -137,11 +140,13 @@ btr_cur_latch_leaves( if (latch_mode == BTR_SEARCH_LEAF) { get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_LEAF) { get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_TREE) { @@ -152,11 +157,13 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { get_page = btr_page_get(space, left_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; right_page_no = btr_page_get_next(page, mtr); @@ -176,11 +183,14 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { cursor->left_page = btr_page_get(space, left_page_no, RW_S_LATCH, mtr); + ut_a(page_is_comp(cursor->left_page) == + page_is_comp(page)); buf_block_align( cursor->left_page)->check_index_page_at_flush = TRUE; } get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_PREV) { @@ -191,11 +201,14 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { cursor->left_page = btr_page_get(space, left_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(cursor->left_page) == + page_is_comp(page)); buf_block_align( cursor->left_page)->check_index_page_at_flush = TRUE; } get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else { ut_error; @@ -261,6 +274,10 @@ btr_cur_search_to_nth_level( #ifdef BTR_CUR_ADAPT btr_search_t* info; #endif + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; /* Currently, PAGE_CUR_LE is the only search mode used for searches ending to upper levels */ @@ -299,7 +316,9 @@ btr_cur_search_to_nth_level( if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ && !estimate +#ifdef PAGE_CUR_LE_OR_EXTENDS && mode != PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ && srv_use_adaptive_hash_indexes && btr_search_guess_on_hash(index, info, tuple, mode, latch_mode, cursor, @@ -373,13 +392,16 @@ btr_cur_search_to_nth_level( page_mode = 
PAGE_CUR_LE; break; default: - ut_ad(mode == PAGE_CUR_L - || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE || mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ page_mode = mode; break; } - + /* Loop and search until we arrive at the desired level */ for (;;) { @@ -414,7 +436,9 @@ retry_page_get: cursor->thr)) { /* Insertion to the insert buffer succeeded */ cursor->flag = BTR_CUR_INSERT_TO_IBUF; - + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return; } @@ -470,9 +494,9 @@ retry_page_get: page_mode = mode; } - page_cur_search_with_match(page, tuple, page_mode, &up_match, - &up_bytes, &low_match, &low_bytes, - page_cursor); + page_cur_search_with_match(page, index, tuple, page_mode, + &up_match, &up_bytes, + &low_match, &low_bytes, page_cursor); if (estimate) { btr_cur_add_path_info(cursor, height, root_height); } @@ -486,7 +510,10 @@ retry_page_get: if (level > 0) { /* x-latch the page */ - btr_page_get(space, page_no, RW_X_LATCH, mtr); + page = btr_page_get(space, + page_no, RW_X_LATCH, mtr); + ut_a((ibool)!!page_is_comp(page) + == index->table->comp); } break; @@ -498,9 +525,14 @@ retry_page_get: guess = NULL; node_ptr = page_cur_get_rec(page_cursor); - + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); /* Go to the child node */ - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); } if (level == 0) { @@ -552,6 +584,10 @@ btr_cur_open_at_index_side( rec_t* node_ptr; ulint estimate; ulint savepoint; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; estimate = latch_mode & BTR_ESTIMATE; latch_mode = latch_mode & ~BTR_ESTIMATE; @@ -576,7 +612,7 @@ btr_cur_open_at_index_side( page_no = dict_tree_get_page(tree); height = ULINT_UNDEFINED; - + for (;;) { page = buf_page_get_gen(space, page_no, RW_NO_LATCH, NULL, BUF_GET, @@ -645,9 +681,14 @@ btr_cur_open_at_index_side( height--; node_ptr = page_cur_get_rec(page_cursor); - + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); /* Go to the child node */ - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); } } @@ -669,6 +710,10 @@ btr_cur_open_at_rnd_pos( ulint space; ulint height; rec_t* node_ptr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; tree = index->tree; @@ -717,9 +762,14 @@ btr_cur_open_at_rnd_pos( height--; node_ptr = page_cur_get_rec(page_cursor); - + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); /* Go to the child node */ - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); } } @@ -758,18 +808,20 @@ btr_cur_insert_if_possible( page_cursor = btr_cur_get_page_cur(cursor); /* Now, try the insert */ - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); if (!rec) { /* If record did not fit, reorganize */ - 
btr_page_reorganize(page, mtr); + btr_page_reorganize(page, cursor->index, mtr); *reorg = TRUE; - page_cur_search(page, tuple, PAGE_CUR_LE, page_cursor); + page_cur_search(page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + rec = page_cur_tuple_insert(page_cursor, tuple, + cursor->index, mtr); } return(rec); @@ -833,6 +885,7 @@ btr_cur_ins_lock_and_undo( return(DB_SUCCESS); } +#ifdef UNIV_DEBUG /***************************************************************** Report information about a transaction. */ static @@ -850,6 +903,7 @@ btr_cur_trx_report( dict_index_name_print(stderr, trx, index); putc('\n', stderr); } +#endif /* UNIV_DEBUG */ /***************************************************************** Tries to perform an insert to a page in an index tree, next to cursor. @@ -887,8 +941,6 @@ btr_cur_optimistic_insert( ibool reorg; ibool inherit; ulint rec_size; - ulint data_size; - ulint extra_size; ulint type; ulint err; @@ -901,12 +953,13 @@ btr_cur_optimistic_insert( fputs("InnoDB: Error in a tuple to insert into ", stderr); dict_index_name_print(stderr, thr_get_trx(thr), index); } - +#ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(thr_get_trx(thr), index, "insert into "); dtuple_print(stderr, entry); } - +#endif /* UNIV_DEBUG */ + ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); max_size = page_get_max_insert_size_after_reorganize(page, 1); @@ -914,13 +967,11 @@ btr_cur_optimistic_insert( calculate_sizes_again: /* Calculate the record size when entry is converted to a record */ - data_size = dtuple_get_data_size(entry); - extra_size = rec_get_converted_extra_size(data_size, - dtuple_get_n_fields(entry)); - rec_size = data_size + extra_size; + rec_size = rec_get_converted_size(index, entry); - if ((rec_size >= page_get_free_space_of_empty() / 2) - || (rec_size >= REC_MAX_DATA_SIZE)) { + if (rec_size >= + ut_min(page_get_free_space_of_empty(page_is_comp(page)) / 2, + REC_MAX_DATA_SIZE)) { /* The record is so big that we have to store some fields externally on separate database pages */ @@ -983,21 +1034,21 @@ calculate_sizes_again: /* Now, try the insert */ - *rec = page_cur_insert_rec_low(page_cursor, entry, data_size, - NULL, mtr); - if (!(*rec)) { + *rec = page_cur_insert_rec_low(page_cursor, entry, index, + NULL, NULL, mtr); + if (UNIV_UNLIKELY(!(*rec))) { /* If the record did not fit, reorganize */ - btr_page_reorganize(page, mtr); + btr_page_reorganize(page, index, mtr); ut_ad(page_get_max_insert_size(page, 1) == max_size); reorg = TRUE; - page_cur_search(page, entry, PAGE_CUR_LE, page_cursor); + page_cur_search(page, index, entry, PAGE_CUR_LE, page_cursor); - *rec = page_cur_tuple_insert(page_cursor, entry, mtr); + *rec = page_cur_tuple_insert(page_cursor, entry, index, mtr); - if (!*rec) { + if (UNIV_UNLIKELY(!*rec)) { fputs("InnoDB: Error: cannot insert tuple ", stderr); dtuple_print(stderr, entry); fputs(" into ", stderr); @@ -1123,9 +1174,9 @@ btr_cur_pessimistic_insert( } } - if ((rec_get_converted_size(entry) - >= page_get_free_space_of_empty() / 2) - || (rec_get_converted_size(entry) >= REC_MAX_DATA_SIZE)) { + if (rec_get_converted_size(index, entry) >= + ut_min(page_get_free_space_of_empty(page_is_comp(page)) / 2, + REC_MAX_DATA_SIZE)) { /* The record is so big that we have to store some fields externally on separate database pages */ @@ -1212,8 +1263,16 @@ btr_cur_upd_lock_and_undo( err = DB_SUCCESS; if (!(flags & BTR_NO_LOCKING_FLAG)) { + mem_heap_t* heap = 
NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + err = lock_clust_rec_modify_check_and_lock(flags, rec, index, - thr); + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), thr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } if (err != DB_SUCCESS) { return(err); @@ -1243,14 +1302,19 @@ btr_cur_update_in_place_log( mtr_t* mtr) /* in: mtr */ { byte* log_ptr; + page_t* page = ut_align_down(rec, UNIV_PAGE_SIZE); + ut_ad(flags < 256); + ut_ad(!!page_is_comp(page) == index->table->comp); - log_ptr = mlog_open(mtr, 30 + MLOG_BUF_MARGIN); + log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page) + ? MLOG_COMP_REC_UPDATE_IN_PLACE + : MLOG_REC_UPDATE_IN_PLACE, + 1 + DATA_ROLL_PTR_LEN + 14 + 2 + MLOG_BUF_MARGIN); - log_ptr = mlog_write_initial_log_record_fast(rec, - MLOG_REC_UPDATE_IN_PLACE, log_ptr, mtr); - - mach_write_to_1(log_ptr, flags); - log_ptr++; + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } /* The code below assumes index is a clustered index: change index to the clustered index if we are updating a secondary index record (or we @@ -1259,9 +1323,12 @@ btr_cur_update_in_place_log( index = dict_table_get_first_index(index->table); + mach_write_to_1(log_ptr, flags); + log_ptr++; + log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr, mtr); - mach_write_to_2(log_ptr, rec - buf_frame_align(rec)); + mach_write_to_2(log_ptr, ut_align_offset(rec, UNIV_PAGE_SIZE)); log_ptr += 2; row_upd_index_write_log(update, log_ptr, mtr); @@ -1273,10 +1340,11 @@ Parses a redo log record of updating a record in-place. */ byte* btr_cur_parse_update_in_place( /*==========================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page) /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in: page or NULL */ + dict_index_t* index) /* in: index corresponding to page */ { ulint flags; rec_t* rec; @@ -1286,6 +1354,7 @@ btr_cur_parse_update_in_place( dulint roll_ptr; ulint rec_offset; mem_heap_t* heap; + ulint* offsets; if (end_ptr < ptr + 1) { @@ -1316,29 +1385,27 @@ btr_cur_parse_update_in_place( ptr = row_upd_index_parse(ptr, end_ptr, heap, &update); - if (ptr == NULL) { - mem_heap_free(heap); - - return(NULL); - } - - if (!page) { - mem_heap_free(heap); + if (!ptr || !page) { - return(ptr); + goto func_exit; } - + + ut_a((ibool)!!page_is_comp(page) == index->table->comp); rec = page + rec_offset; /* We do not need to reserve btr_search_latch, as the page is only being recovered, and there cannot be a hash index to it. 
*/ + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields_in_recovery(rec, pos, trx_id, roll_ptr); + row_upd_rec_sys_fields_in_recovery(rec, offsets, + pos, trx_id, roll_ptr); } - row_upd_rec_in_place(rec, update); + row_upd_rec_in_place(rec, offsets, update); +func_exit: mem_heap_free(heap); return(ptr); @@ -1368,26 +1435,38 @@ btr_cur_update_in_place( rec_t* rec; dulint roll_ptr = ut_dulint_zero; trx_t* trx; - ibool was_delete_marked; + ulint was_delete_marked; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; rec = btr_cur_get_rec(cursor); index = cursor->index; + ut_ad(!!page_rec_is_comp(rec) == index->table->comp); trx = thr_get_trx(thr); - + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); +#ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(trx, index, "update "); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); } +#endif /* UNIV_DEBUG */ /* Do lock checking and undo logging */ err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, thr, &roll_ptr); - if (err != DB_SUCCESS) { + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(err); } block = buf_block_align(rec); + ut_ad(!!page_is_comp(buf_block_get_frame(block)) + == index->table->comp); if (block->is_hashed) { /* The function row_upd_changes_ord_field_binary works only @@ -1405,15 +1484,16 @@ btr_cur_update_in_place( } if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields(rec, index, trx, roll_ptr); + row_upd_rec_sys_fields(rec, index, offsets, trx, roll_ptr); } /* FIXME: in a mixed tree, all records may not have enough ordering fields for btr search: */ - was_delete_marked = rec_get_deleted_flag(rec); - - row_upd_rec_in_place(rec, update); + was_delete_marked = rec_get_deleted_flag(rec, + page_is_comp(buf_block_get_frame(block))); + + row_upd_rec_in_place(rec, offsets, update); if (block->is_hashed) { rw_lock_x_unlock(&btr_search_latch); @@ -1421,13 +1501,17 @@ btr_cur_update_in_place( btr_cur_update_in_place_log(flags, rec, index, update, trx, roll_ptr, mtr); - if (was_delete_marked && !rec_get_deleted_flag(rec)) { + if (was_delete_marked && !rec_get_deleted_flag(rec, + page_is_comp(buf_block_get_frame(block)))) { /* The new updated record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(rec, mtr); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(DB_SUCCESS); } @@ -1469,24 +1553,31 @@ btr_cur_optimistic_update( mem_heap_t* heap; ibool reorganized = FALSE; ulint i; - + ulint* offsets; + page = btr_cur_get_page(cursor); rec = btr_cur_get_rec(cursor); index = cursor->index; + ut_ad(!!page_rec_is_comp(rec) == index->table->comp); + heap = mem_heap_create(1024); + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + +#ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(thr_get_trx(thr), index, "update "); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); } +#endif /* UNIV_DEBUG */ ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - if (!row_upd_changes_field_size_or_external(rec, index, update)) { + if (!row_upd_changes_field_size_or_external(index, offsets, update)) { /* The simplest and the most common case: the update does not change the size of 
any field and none of the updated fields is externally stored in rec or update */ - + mem_heap_free(heap); return(btr_cur_update_in_place(flags, cursor, update, cmpl_info, thr, mtr)); } @@ -1497,29 +1588,30 @@ btr_cur_optimistic_update( /* Externally stored fields are treated in pessimistic update */ + mem_heap_free(heap); return(DB_OVERFLOW); } } - if (rec_contains_externally_stored_field(btr_cur_get_rec(cursor))) { + if (rec_offs_any_extern(offsets)) { /* Externally stored fields are treated in pessimistic update */ + mem_heap_free(heap); return(DB_OVERFLOW); } page_cursor = btr_cur_get_page_cur(cursor); - heap = mem_heap_create(1024); - new_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, NULL); - old_rec_size = rec_get_size(rec); - new_rec_size = rec_get_converted_size(new_entry); + old_rec_size = rec_offs_size(offsets); + new_rec_size = rec_get_converted_size(index, new_entry); - if (new_rec_size >= page_get_free_space_of_empty() / 2) { + if (UNIV_UNLIKELY(new_rec_size >= page_get_free_space_of_empty( + page_is_comp(page)) / 2)) { mem_heap_free(heap); @@ -1529,8 +1621,9 @@ btr_cur_optimistic_update( max_size = old_rec_size + page_get_max_insert_size_after_reorganize(page, 1); - if (page_get_data_size(page) - old_rec_size + new_rec_size - < BTR_CUR_PAGE_COMPRESS_LIMIT) { + if (UNIV_UNLIKELY(page_get_data_size(page) + - old_rec_size + new_rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT)) { /* The page would become too empty */ @@ -1566,11 +1659,11 @@ btr_cur_optimistic_update( explicit locks on rec, before deleting rec (see the comment in .._pessimistic_update). */ - lock_rec_store_on_page_infimum(rec); + lock_rec_store_on_page_infimum(page, rec); btr_search_update_hash_on_delete(cursor); - page_cur_delete_rec(page_cursor, mtr); + page_cur_delete_rec(page_cursor, index, offsets, mtr); page_cur_move_to_prev(page_cursor); @@ -1587,11 +1680,13 @@ btr_cur_optimistic_update( ut_a(rec); /* <- We calculated above the insert would fit */ - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, page_is_comp(page))) { /* The new inserted record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(rec, mtr); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } /* Restore the old explicit lock state on the record */ @@ -1690,6 +1785,7 @@ btr_cur_pessimistic_update( ulint* ext_vect; ulint n_ext_vect; ulint reserve_flag; + ulint* offsets = NULL; *big_rec = NULL; @@ -1733,7 +1829,7 @@ btr_cur_pessimistic_update( } success = fsp_reserve_free_extents(&n_reserved, - cursor->index->space, + index->space, n_extents, reserve_flag, mtr); if (!success) { err = DB_OUT_OF_FILE_SPACE; @@ -1743,6 +1839,7 @@ btr_cur_pessimistic_update( } heap = mem_heap_create(1024); + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); trx = thr_get_trx(thr); @@ -1767,28 +1864,29 @@ btr_cur_pessimistic_update( ut_a(big_rec_vec == NULL); - btr_rec_free_updated_extern_fields(index, rec, update, - TRUE, mtr); + btr_rec_free_updated_extern_fields(index, rec, offsets, + update, TRUE, mtr); } /* We have to set appropriate extern storage bits in the new record to be inserted: we have to remember which fields were such */ - ext_vect = mem_heap_alloc(heap, sizeof(ulint) * rec_get_n_fields(rec)); - n_ext_vect = btr_push_update_extern_fields(ext_vect, rec, update); - - if ((rec_get_converted_size(new_entry) >= - 
page_get_free_space_of_empty() / 2) - || (rec_get_converted_size(new_entry) >= REC_MAX_DATA_SIZE)) { + ext_vect = mem_heap_alloc(heap, sizeof(ulint) + * dict_index_get_n_fields(index)); + ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec)); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + n_ext_vect = btr_push_update_extern_fields(ext_vect, offsets, update); + + if (UNIV_UNLIKELY(rec_get_converted_size(index, new_entry) >= + ut_min(page_get_free_space_of_empty(page_is_comp(page)) / 2, + REC_MAX_DATA_SIZE))) { big_rec_vec = dtuple_convert_big_rec(index, new_entry, ext_vect, n_ext_vect); if (big_rec_vec == NULL) { - mem_heap_free(heap); - err = DB_TOO_BIG_RECORD; - goto return_after_reservations; } } @@ -1804,11 +1902,11 @@ btr_cur_pessimistic_update( delete the lock structs set on the root page even if the root page carries just node pointers. */ - lock_rec_store_on_page_infimum(rec); + lock_rec_store_on_page_infimum(buf_frame_align(rec), rec); btr_search_update_hash_on_delete(cursor); - page_cur_delete_rec(page_cursor, mtr); + page_cur_delete_rec(page_cursor, index, offsets, mtr); page_cur_move_to_prev(page_cursor); @@ -1817,21 +1915,22 @@ btr_cur_pessimistic_update( ut_a(rec || optim_err != DB_UNDERFLOW); if (rec) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + lock_rec_restore_from_page_infimum(rec, page); - rec_set_field_extern_bits(rec, ext_vect, n_ext_vect, mtr); + rec_set_field_extern_bits(rec, index, + ext_vect, n_ext_vect, mtr); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { /* The new inserted record owns its possible externally stored fields */ - - btr_cur_unmark_extern_fields(rec, mtr); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } btr_cur_compress_if_useful(cursor, mtr); err = DB_SUCCESS; - mem_heap_free(heap); - goto return_after_reservations; } @@ -1856,13 +1955,14 @@ btr_cur_pessimistic_update( ut_a(err == DB_SUCCESS); ut_a(dummy_big_rec == NULL); - rec_set_field_extern_bits(rec, ext_vect, n_ext_vect, mtr); + rec_set_field_extern_bits(rec, index, ext_vect, n_ext_vect, mtr); + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { /* The new inserted record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(rec, mtr); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } lock_rec_restore_from_page_infimum(rec, page); @@ -1876,13 +1976,11 @@ btr_cur_pessimistic_update( btr_cur_pess_upd_restore_supremum(rec, mtr); } - mem_heap_free(heap); - return_after_reservations: + mem_heap_free(heap); if (n_extents > 0) { - fil_space_release_free_extents(cursor->index->space, - n_reserved); + fil_space_release_free_extents(index->space, n_reserved); } *big_rec = big_rec_vec; @@ -1908,11 +2006,21 @@ btr_cur_del_mark_set_clust_rec_log( mtr_t* mtr) /* in: mtr */ { byte* log_ptr; + ut_ad(flags < 256); + ut_ad(val <= 1); - log_ptr = mlog_open(mtr, 30); + ut_ad(!!page_rec_is_comp(rec) == index->table->comp); - log_ptr = mlog_write_initial_log_record_fast(rec, - MLOG_REC_CLUST_DELETE_MARK, log_ptr, mtr); + log_ptr = mlog_open_and_write_index(mtr, rec, index, + page_rec_is_comp(rec) + ? 
MLOG_COMP_REC_CLUST_DELETE_MARK + : MLOG_REC_CLUST_DELETE_MARK, + 1 + 1 + DATA_ROLL_PTR_LEN + 14 + 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } mach_write_to_1(log_ptr, flags); log_ptr++; @@ -1921,7 +2029,7 @@ btr_cur_del_mark_set_clust_rec_log( log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr, mtr); - mach_write_to_2(log_ptr, rec - buf_frame_align(rec)); + mach_write_to_2(log_ptr, ut_align_offset(rec, UNIV_PAGE_SIZE)); log_ptr += 2; mlog_close(mtr, log_ptr); @@ -1934,19 +2042,22 @@ index record. */ byte* btr_cur_parse_del_mark_set_clust_rec( /*=================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page) /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: index corresponding to page */ + page_t* page) /* in: page or NULL */ { ulint flags; - ibool val; + ulint val; ulint pos; dulint trx_id; dulint roll_ptr; ulint offset; rec_t* rec; + ut_ad(!page || !!page_is_comp(page) == index->table->comp); + if (end_ptr < ptr + 2) { return(NULL); @@ -1978,15 +2089,24 @@ btr_cur_parse_del_mark_set_clust_rec( rec = page + offset; if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields_in_recovery(rec, pos, trx_id, - roll_ptr); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + row_upd_rec_sys_fields_in_recovery(rec, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + pos, trx_id, roll_ptr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } } /* We do not need to reserve btr_search_latch, as the page is only being recovered, and there cannot be a hash index to it. 
*/ - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, page_is_comp(page), val); } return(ptr); @@ -2015,22 +2135,34 @@ btr_cur_del_mark_set_clust_rec( ulint err; rec_t* rec; trx_t* trx; - + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + rec = btr_cur_get_rec(cursor); index = cursor->index; - + ut_ad(!!page_rec_is_comp(rec) == index->table->comp); + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + +#ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(thr_get_trx(thr), index, "del mark "); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); } +#endif /* UNIV_DEBUG */ ut_ad(index->type & DICT_CLUSTERED); - ut_ad(rec_get_deleted_flag(rec) == FALSE); + ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); - err = lock_clust_rec_modify_check_and_lock(flags, rec, index, thr); + err = lock_clust_rec_modify_check_and_lock(flags, + rec, index, offsets, thr); if (err != DB_SUCCESS) { + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(err); } @@ -2039,6 +2171,9 @@ btr_cur_del_mark_set_clust_rec( &roll_ptr); if (err != DB_SUCCESS) { + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(err); } @@ -2048,13 +2183,12 @@ btr_cur_del_mark_set_clust_rec( rw_lock_x_lock(&btr_search_latch); } - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, rec_offs_comp(offsets), val); trx = thr_get_trx(thr); if (!(flags & BTR_KEEP_SYS_FLAG)) { - - row_upd_rec_sys_fields(rec, index, trx, roll_ptr); + row_upd_rec_sys_fields(rec, index, offsets, trx, roll_ptr); } if (block->is_hashed) { @@ -2063,6 +2197,9 @@ btr_cur_del_mark_set_clust_rec( btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx, roll_ptr, mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(DB_SUCCESS); } @@ -2073,21 +2210,27 @@ UNIV_INLINE void btr_cur_del_mark_set_sec_rec_log( /*=============================*/ - rec_t* rec, /* in: record */ - ibool val, /* in: value to set */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr */ { byte* log_ptr; + ut_ad(val <= 1); - log_ptr = mlog_open(mtr, 30); + log_ptr = mlog_open(mtr, 11 + 1 + 2); - log_ptr = mlog_write_initial_log_record_fast(rec, - MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr); + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } + log_ptr = mlog_write_initial_log_record_fast( + rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr); mach_write_to_1(log_ptr, val); log_ptr++; - mach_write_to_2(log_ptr, rec - buf_frame_align(rec)); + mach_write_to_2(log_ptr, ut_align_offset(rec, UNIV_PAGE_SIZE)); log_ptr += 2; mlog_close(mtr, log_ptr); @@ -2100,12 +2243,12 @@ index record. */ byte* btr_cur_parse_del_mark_set_sec_rec( /*===============================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page) /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page) /* in: page or NULL */ { - ibool val; + ulint val; ulint offset; rec_t* rec; @@ -2129,7 +2272,7 @@ btr_cur_parse_del_mark_set_sec_rec( is only being recovered, and there cannot be a hash index to it. 
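In both del-mark parse functions here, rec_set_deleted_flag() now receives page_is_comp(page): the old-style and ROW_FORMAT=COMPACT record headers are laid out differently, so the delete-mark bit has to be located according to the page format. The sketch below shows that kind of format-dependent flag update in isolation; the byte positions and the mask are invented for illustration and do not correspond to InnoDB's real record header layout.

#include <assert.h>

#define FLAG_DELETED	0x20	/* illustrative mask, not InnoDB's */

/* Set or clear a "deleted" flag in a record header whose layout depends on
the page format: assume the flag byte sits 3 bytes before the record origin
in the old format and 5 bytes before it in the compact format. */
static void
set_deleted_flag(unsigned char* rec, int comp, int val)
{
	unsigned char*	b = rec - (comp ? 5 : 3);

	assert(val == 0 || val == 1);

	if (val) {
		*b |= FLAG_DELETED;
	} else {
		*b &= (unsigned char) ~FLAG_DELETED;
	}
}

int
main(void)
{
	unsigned char	page[16] = {0};
	unsigned char*	rec = page + 8;	/* record origin inside a dummy page */

	set_deleted_flag(rec, 1, 1);	/* compact format: mark deleted */
	return(!(page[3] & FLAG_DELETED));
}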
*/ - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, page_is_comp(page), val); } return(ptr); @@ -2155,11 +2298,13 @@ btr_cur_del_mark_set_sec_rec( rec = btr_cur_get_rec(cursor); +#ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(thr_get_trx(thr), cursor->index, "del mark "); - rec_print(stderr, rec); + rec_print(stderr, rec, cursor->index); } +#endif /* UNIV_DEBUG */ err = lock_sec_rec_modify_check_and_lock(flags, rec, cursor->index, thr); @@ -2169,12 +2314,15 @@ btr_cur_del_mark_set_sec_rec( } block = buf_block_align(rec); + ut_ad(!!page_is_comp(buf_block_get_frame(block)) + == cursor->index->table->comp); if (block->is_hashed) { rw_lock_x_lock(&btr_search_latch); } - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, page_is_comp(buf_block_get_frame(block)), + val); if (block->is_hashed) { rw_lock_x_unlock(&btr_search_latch); @@ -2192,13 +2340,13 @@ used by the insert buffer insert merge mechanism. */ void btr_cur_del_unmark_for_ibuf( /*========================*/ - rec_t* rec, /* in: record to delete unmark */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record to delete unmark */ + mtr_t* mtr) /* in: mtr */ { /* We do not need to reserve btr_search_latch, as the page has just been read to the buffer pool and there cannot be a hash index to it. */ - rec_set_deleted_flag(rec, FALSE); + rec_set_deleted_flag(rec, page_is_comp(buf_frame_align(rec)), FALSE); btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr); } @@ -2279,8 +2427,14 @@ btr_cur_optimistic_delete( successor of the deleted record */ mtr_t* mtr) /* in: mtr */ { - page_t* page; - ulint max_ins_size; + page_t* page; + ulint max_ins_size; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ibool no_compress_needed; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; ut_ad(mtr_memo_contains(mtr, buf_block_align(btr_cur_get_page(cursor)), MTR_MEMO_PAGE_X_FIX)); @@ -2290,27 +2444,34 @@ btr_cur_optimistic_delete( ut_ad(btr_page_get_level(page, mtr) == 0); - if (rec_contains_externally_stored_field(btr_cur_get_rec(cursor))) { + rec = btr_cur_get_rec(cursor); + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); - return(FALSE); - } + no_compress_needed = !rec_offs_any_extern(offsets) + && btr_cur_can_delete_without_compress( + cursor, rec_offs_size(offsets), mtr); - if (btr_cur_can_delete_without_compress(cursor, mtr)) { + if (no_compress_needed) { - lock_update_delete(btr_cur_get_rec(cursor)); + lock_update_delete(rec); btr_search_update_hash_on_delete(cursor); max_ins_size = page_get_max_insert_size_after_reorganize(page, 1); - page_cur_delete_rec(btr_cur_get_page_cur(cursor), mtr); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, offsets, mtr); ibuf_update_free_bits_low(cursor->index, page, max_ins_size, mtr); - return(TRUE); } - return(FALSE); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(no_compress_needed); } /***************************************************************** @@ -2349,7 +2510,9 @@ btr_cur_pessimistic_delete( ulint n_reserved; ibool success; ibool ret = FALSE; + ulint level; mem_heap_t* heap; + ulint* offsets; page = btr_cur_get_page(cursor); tree = btr_cur_get_tree(cursor); @@ -2375,11 +2538,24 @@ btr_cur_pessimistic_delete( } } - btr_rec_free_externally_stored_fields(cursor->index, - btr_cur_get_rec(cursor), in_rollback, mtr); + heap = mem_heap_create(1024); + rec = btr_cur_get_rec(cursor); + + offsets = rec_get_offsets(rec, 
cursor->index, + NULL, ULINT_UNDEFINED, &heap); + + /* Free externally stored fields if the record is neither + a node pointer nor in two-byte format. + This avoids an unnecessary loop. */ + if (page_is_comp(page) + ? !rec_get_node_ptr_flag(rec) + : !rec_get_1byte_offs_flag(rec)) { + btr_rec_free_externally_stored_fields(cursor->index, + rec, offsets, in_rollback, mtr); + } - if ((page_get_n_recs(page) < 2) - && (dict_tree_get_page(btr_cur_get_tree(cursor)) + if (UNIV_UNLIKELY(page_get_n_recs(page) < 2) + && UNIV_UNLIKELY(dict_tree_get_page(btr_cur_get_tree(cursor)) != buf_frame_get_page_no(page))) { /* If there is only one record, drop the whole page in @@ -2393,12 +2569,14 @@ btr_cur_pessimistic_delete( goto return_after_reservations; } - rec = btr_cur_get_rec(cursor); - lock_update_delete(rec); + level = btr_page_get_level(page, mtr); - if ((btr_page_get_level(page, mtr) > 0) - && (page_rec_get_next(page_get_infimum_rec(page)) == rec)) { + if (level > 0 + && UNIV_UNLIKELY(rec == page_rec_get_next( + page_get_infimum_rec(page)))) { + + rec_t* next_rec = page_rec_get_next(rec); if (btr_page_get_prev(page, mtr) == FIL_NULL) { @@ -2406,7 +2584,8 @@ btr_cur_pessimistic_delete( non-leaf level, we must mark the new leftmost node pointer as the predefined minimum record */ - btr_set_min_rec_mark(page_rec_get_next(rec), mtr); + btr_set_min_rec_mark(next_rec, page_is_comp(page), + mtr); } else { /* Otherwise, if we delete the leftmost node pointer on a page, we have to change the father node pointer @@ -2415,30 +2594,27 @@ btr_cur_pessimistic_delete( btr_node_ptr_delete(tree, page, mtr); - heap = mem_heap_create(256); - node_ptr = dict_tree_build_node_ptr( - tree, page_rec_get_next(rec), + tree, next_rec, buf_frame_get_page_no(page), - heap, btr_page_get_level(page, mtr)); + heap, level); btr_insert_on_non_leaf_level(tree, - btr_page_get_level(page, mtr) + 1, - node_ptr, mtr); - - mem_heap_free(heap); + level + 1, node_ptr, mtr); } } btr_search_update_hash_on_delete(cursor); - page_cur_delete_rec(btr_cur_get_page_cur(cursor), mtr); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), cursor->index, + offsets, mtr); ut_ad(btr_check_node_ptr(tree, page, mtr)); *err = DB_SUCCESS; return_after_reservations: + mem_heap_free(heap); if (ret == FALSE) { ret = btr_cur_compress_if_useful(cursor, mtr); @@ -2663,18 +2839,25 @@ btr_estimate_number_of_different_key_vals( ulint j; ulint add_on; mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_rec_[REC_OFFS_NORMAL_SIZE]; + ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets_rec = offsets_rec_; + ulint* offsets_next_rec= offsets_next_rec_; + *offsets_rec_ = (sizeof offsets_rec_) / sizeof *offsets_rec_; + *offsets_next_rec_ = + (sizeof offsets_next_rec_) / sizeof *offsets_next_rec_; n_cols = dict_index_get_n_unique(index); n_diff = mem_alloc((n_cols + 1) * sizeof(ib_longlong)); - for (j = 0; j <= n_cols; j++) { - n_diff[j] = 0; - } + memset(n_diff, 0, (n_cols + 1) * sizeof(ib_longlong)); /* We sample some pages in the index to get an estimate */ for (i = 0; i < BTR_KEY_VAL_ESTIMATE_N_PAGES; i++) { + rec_t* supremum; mtr_start(&mtr); btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr); @@ -2687,20 +2870,29 @@ btr_estimate_number_of_different_key_vals( page = btr_cur_get_page(&cursor); - rec = page_get_infimum_rec(page); - rec = page_rec_get_next(rec); + supremum = page_get_supremum_rec(page); + rec = page_rec_get_next(page_get_infimum_rec(page)); - if (rec != page_get_supremum_rec(page)) { + if (rec != supremum) { not_empty_flag = 1; + 
offsets_rec = rec_get_offsets(rec, index, offsets_rec, + ULINT_UNDEFINED, &heap); } - - while (rec != page_get_supremum_rec(page) - && page_rec_get_next(rec) - != page_get_supremum_rec(page)) { + + while (rec != supremum) { + rec_t* next_rec = page_rec_get_next(rec); + if (next_rec == supremum) { + break; + } + matched_fields = 0; matched_bytes = 0; + offsets_next_rec = rec_get_offsets(next_rec, index, + offsets_next_rec, + n_cols, &heap); - cmp_rec_rec_with_match(rec, page_rec_get_next(rec), + cmp_rec_rec_with_match(rec, next_rec, + offsets_rec, offsets_next_rec, index, &matched_fields, &matched_bytes); @@ -2712,9 +2904,18 @@ btr_estimate_number_of_different_key_vals( } total_external_size += - btr_rec_get_externally_stored_len(rec); + btr_rec_get_externally_stored_len( + rec, offsets_rec); - rec = page_rec_get_next(rec); + rec = next_rec; + /* Initialize offsets_rec for the next round + and assign the old offsets_rec buffer to + offsets_next_rec. */ + { + ulint* offsets_tmp = offsets_rec; + offsets_rec = offsets_next_rec; + offsets_next_rec = offsets_tmp; + } } @@ -2736,8 +2937,11 @@ btr_estimate_number_of_different_key_vals( } } + offsets_rec = rec_get_offsets(rec, index, offsets_rec, + ULINT_UNDEFINED, &heap); total_external_size += - btr_rec_get_externally_stored_len(rec); + btr_rec_get_externally_stored_len(rec, + offsets_rec); mtr_commit(&mtr); } @@ -2778,6 +2982,9 @@ btr_estimate_number_of_different_key_vals( } mem_free(n_diff); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } } /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ @@ -2788,9 +2995,10 @@ static ulint btr_rec_get_externally_stored_len( /*==============================*/ - /* out: externally stored part, in units of a - database page */ - rec_t* rec) /* in: record */ + /* out: externally stored part, + in units of a database page */ + rec_t* rec, /* in: record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n_fields; byte* data; @@ -2799,17 +3007,13 @@ btr_rec_get_externally_stored_len( ulint total_extern_len = 0; ulint i; - if (rec_get_data_size(rec) <= REC_1BYTE_OFFS_LIMIT) { - - return(0); - } - - n_fields = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n_fields = rec_offs_n_fields(offsets); for (i = 0; i < n_fields; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { - data = rec_get_nth_field(rec, i, &local_len); + data = rec_get_nth_field(rec, offsets, i, &local_len); local_len -= BTR_EXTERN_FIELD_REF_SIZE; @@ -2830,16 +3034,17 @@ static void btr_cur_set_ownership_of_extern_field( /*==================================*/ - rec_t* rec, /* in: clustered index record */ - ulint i, /* in: field number */ - ibool val, /* in: value to set */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: clustered index record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint i, /* in: field number */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr */ { byte* data; ulint local_len; ulint byte_val; - data = rec_get_nth_field(rec, i, &local_len); + data = rec_get_nth_field(rec, offsets, i, &local_len); ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); @@ -2866,19 +3071,22 @@ to free the field. 
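The rewritten sampling loop in btr_estimate_number_of_different_key_vals() keeps two reusable offset arrays (offsets_rec and offsets_next_rec) and swaps them after comparing each adjacent pair, so each record's offsets are computed only once. A minimal stand-alone version of that sliding-pair idiom, using plain integers in place of the offsets arrays (the key values and the "prefix" computation are made up):

#include <stdio.h>

#define N_KEYS	6

/* Pretend "offsets" computation: derive a per-record scratch value.
Records sharing a prefix get equal values. */
static void
compute(int key, int* scratch)
{
	*scratch = key / 10;
}

int
main(void)
{
	const int	keys[N_KEYS] = {11, 12, 20, 21, 21, 35};
	int		buf1, buf2;
	int*		cur = &buf1;
	int*		next = &buf2;
	int		n_diff = 0;
	int		i;

	compute(keys[0], cur);

	for (i = 0; i + 1 < N_KEYS; i++) {
		compute(keys[i + 1], next);

		if (*cur != *next) {
			n_diff++;
		}

		/* Swap the scratch buffers: what was "next" becomes "cur"
		for the following iteration, so nothing is recomputed. */
		{
			int*	tmp = cur;
			cur = next;
			next = tmp;
		}
	}

	printf("adjacent pairs with different prefixes: %d\n", n_diff);
	return(0);
}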
*/ void btr_cur_mark_extern_inherited_fields( /*=================================*/ - rec_t* rec, /* in: record in a clustered index */ - upd_t* update, /* in: update vector */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update, /* in: update vector */ + mtr_t* mtr) /* in: mtr */ { ibool is_updated; ulint n; ulint j; ulint i; - - n = rec_get_n_fields(rec); + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n = rec_offs_n_fields(offsets); for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { /* Check it is not in updated fields */ is_updated = FALSE; @@ -2894,8 +3102,8 @@ btr_cur_mark_extern_inherited_fields( } if (!is_updated) { - btr_cur_set_ownership_of_extern_field(rec, i, - FALSE, mtr); + btr_cur_set_ownership_of_extern_field(rec, + offsets, i, FALSE, mtr); } } } @@ -2967,18 +3175,20 @@ static void btr_cur_unmark_extern_fields( /*=========================*/ - rec_t* rec, /* in: record in a clustered index */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + mtr_t* mtr, /* in: mtr */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n; ulint i; - n = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n = rec_offs_n_fields(offsets); for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { - - btr_cur_set_ownership_of_extern_field(rec, i, + if (rec_offs_nth_extern(offsets, i)) { + + btr_cur_set_ownership_of_extern_field(rec, offsets, i, TRUE, mtr); } } @@ -3028,10 +3238,10 @@ ulint btr_push_update_extern_fields( /*==========================*/ /* out: number of values stored in ext_vect */ - ulint* ext_vect, /* in: array of ulints, must be preallocated + ulint* ext_vect,/* in: array of ulints, must be preallocated to have space for all fields in rec */ - rec_t* rec, /* in: record */ - upd_t* update) /* in: update vector or NULL */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update) /* in: update vector or NULL */ { ulint n_pushed = 0; ibool is_updated; @@ -3054,10 +3264,10 @@ btr_push_update_extern_fields( } } - n = rec_get_n_fields(rec); + n = rec_offs_n_fields(offsets); for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { /* Check it is not in updated fields */ is_updated = FALSE; @@ -3119,6 +3329,7 @@ btr_store_big_rec_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ mtr_t* local_mtr __attribute__((unused))) /* in: mtr @@ -3139,6 +3350,7 @@ btr_store_big_rec_extern_fields( ulint i; mtr_t mtr; + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, buf_block_align(rec), @@ -3152,8 +3364,8 @@ btr_store_big_rec_extern_fields( for (i = 0; i < big_rec_vec->n_fields; i++) { - data = rec_get_nth_field(rec, big_rec_vec->fields[i].field_no, - &local_len); + data = rec_get_nth_field(rec, offsets, + big_rec_vec->fields[i].field_no, &local_len); ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); local_len -= BTR_EXTERN_FIELD_REF_SIZE; 
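In btr_store_big_rec_extern_fields() below, local_len is reduced by BTR_EXTERN_FIELD_REF_SIZE because an externally stored column keeps a locally stored prefix followed by a fixed-size reference to the overflow pages at its end (the same layout is described in the comment of btr_rec_copy_externally_stored_field() further down). The sketch below only illustrates that prefix/reference split; the reference contents are left opaque rather than guessing at their real layout.

#include <stdio.h>
#include <string.h>

#define FIELD_REF_SIZE	20	/* same width as BTR_EXTERN_FIELD_REF_SIZE;
				the internal layout is not modelled here */

/* Split a stored column into its locally stored prefix and the trailing
reference to the externally stored part. */
static void
split_extern_field(const unsigned char* data, size_t stored_len,
		   size_t* local_len, const unsigned char** ref)
{
	if (stored_len < FIELD_REF_SIZE) {
		*local_len = stored_len;	/* nothing stored externally */
		*ref = NULL;
		return;
	}

	*local_len = stored_len - FIELD_REF_SIZE;
	*ref = data + *local_len;
}

int
main(void)
{
	unsigned char		col[128];
	size_t			local_len;
	const unsigned char*	ref;

	memset(col, 0xAB, sizeof col);
	split_extern_field(col, sizeof col, &local_len, &ref);

	printf("local prefix: %zu bytes, reference at offset %zu\n",
	       local_len, (size_t) (ref - col));
	return(0);
}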
extern_len = big_rec_vec->fields[i].len; @@ -3254,7 +3466,7 @@ btr_store_big_rec_extern_fields( /* Set the bit denoting that this field in rec is stored externally */ - rec_set_nth_field_extern_bit(rec, + rec_set_nth_field_extern_bit(rec, index, big_rec_vec->fields[i].field_no, TRUE, &mtr); } @@ -3407,6 +3619,7 @@ btr_rec_free_externally_stored_fields( dict_index_t* index, /* in: index of the data, the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free inherited fields */ @@ -3419,21 +3632,18 @@ btr_rec_free_externally_stored_fields( ulint len; ulint i; + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)); - if (rec_get_data_size(rec) <= REC_1BYTE_OFFS_LIMIT) { - - return; - } - /* Free possible externally stored fields in the record */ - n_fields = rec_get_n_fields(rec); + ut_ad(index->table->comp == !!rec_offs_comp(offsets)); + n_fields = rec_offs_n_fields(offsets); for (i = 0; i < n_fields; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); btr_free_externally_stored_field(index, data, len, do_not_free_inherited, mtr); } @@ -3450,6 +3660,7 @@ btr_rec_free_updated_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free @@ -3463,13 +3674,10 @@ btr_rec_free_updated_extern_fields( ulint len; ulint i; + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)); - if (rec_get_data_size(rec) <= REC_1BYTE_OFFS_LIMIT) { - return; - } - /* Free possible externally stored fields in the record */ n_fields = upd_get_n_fields(update); @@ -3477,9 +3685,10 @@ btr_rec_free_updated_extern_fields( for (i = 0; i < n_fields; i++) { ufield = upd_get_nth_field(update, i); - if (rec_get_nth_field_extern_bit(rec, ufield->field_no)) { + if (rec_offs_nth_extern(offsets, ufield->field_no)) { - data = rec_get_nth_field(rec, ufield->field_no, &len); + data = rec_get_nth_field(rec, offsets, + ufield->field_no, &len); btr_free_externally_stored_field(index, data, len, do_not_free_inherited, mtr); } @@ -3583,7 +3792,8 @@ byte* btr_rec_copy_externally_stored_field( /*=================================*/ /* out: the field copied to heap */ - rec_t* rec, /* in: record */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint no, /* in: field number */ ulint* len, /* out: length of the field */ mem_heap_t* heap) /* in: mem heap */ @@ -3591,7 +3801,8 @@ btr_rec_copy_externally_stored_field( ulint local_len; byte* data; - ut_a(rec_get_nth_field_extern_bit(rec, no)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_a(rec_offs_nth_extern(offsets, no)); /* An externally stored field can contain some initial data from the field, and in the last 20 bytes it has the @@ -3602,7 +3813,7 @@ btr_rec_copy_externally_stored_field( limit so that field offsets are stored in two bytes, and the extern bit is available in those two bytes. 
*/ - data = rec_get_nth_field(rec, no, &local_len); + data = rec_get_nth_field(rec, offsets, no, &local_len); return(btr_copy_externally_stored_field(len, data, local_len, heap)); } diff --git a/innobase/btr/btr0pcur.c b/innobase/btr/btr0pcur.c index 0dcf6c2f3fc..cb398b4afab 100644 --- a/innobase/btr/btr0pcur.c +++ b/innobase/btr/btr0pcur.c @@ -46,12 +46,12 @@ btr_pcur_free_for_mysql( mem_free(cursor->old_rec_buf); - cursor->old_rec = NULL; cursor->old_rec_buf = NULL; } cursor->btr_cur.page_cur.rec = NULL; cursor->old_rec = NULL; + cursor->old_n_fields = 0; cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; cursor->latch_mode = BTR_NO_LATCHES; @@ -78,6 +78,7 @@ btr_pcur_store_position( rec_t* rec; dict_tree_t* tree; page_t* page; + ulint offs; ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED); ut_ad(cursor->latch_mode != BTR_NO_LATCHES); @@ -87,7 +88,8 @@ btr_pcur_store_position( page_cursor = btr_pcur_get_page_cur(cursor); rec = page_cur_get_rec(page_cursor); - page = buf_frame_align(rec); + page = ut_align_down(rec, UNIV_PAGE_SIZE); + offs = ut_align_offset(rec, UNIV_PAGE_SIZE); ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_S_FIX) @@ -95,35 +97,33 @@ btr_pcur_store_position( MTR_MEMO_PAGE_X_FIX)); ut_a(cursor->latch_mode != BTR_NO_LATCHES); - if (page_get_n_recs(page) == 0) { + if (UNIV_UNLIKELY(page_get_n_recs(page) == 0)) { /* It must be an empty index tree; NOTE that in this case we do not store the modify_clock, but always do a search if we restore the cursor position */ - ut_a(btr_page_get_next(page, mtr) == FIL_NULL - && btr_page_get_prev(page, mtr) == FIL_NULL); + ut_a(btr_page_get_next(page, mtr) == FIL_NULL); + ut_a(btr_page_get_prev(page, mtr) == FIL_NULL); - if (rec == page_get_supremum_rec(page)) { + cursor->old_stored = BTR_PCUR_OLD_STORED; - cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE; - cursor->old_stored = BTR_PCUR_OLD_STORED; + if (page_rec_is_supremum_low(offs)) { - return; + cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE; + } else { + cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE; } - cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE; - cursor->old_stored = BTR_PCUR_OLD_STORED; - return; } - if (rec == page_get_supremum_rec(page)) { + if (page_rec_is_supremum_low(offs)) { rec = page_rec_get_prev(rec); cursor->rel_pos = BTR_PCUR_AFTER; - } else if (rec == page_get_infimum_rec(page)) { + } else if (page_rec_is_infimum_low(offs)) { rec = page_rec_get_next(rec); @@ -134,11 +134,13 @@ btr_pcur_store_position( cursor->old_stored = BTR_PCUR_OLD_STORED; cursor->old_rec = dict_tree_copy_rec_order_prefix(tree, rec, - &(cursor->old_rec_buf), - &(cursor->buf_size)); - + &cursor->old_n_fields, + &cursor->old_rec_buf, + &cursor->buf_size); + cursor->block_when_stored = buf_block_align(page); - cursor->modify_clock = buf_frame_get_modify_clock(page); + cursor->modify_clock = buf_block_get_modify_clock( + cursor->block_when_stored); } /****************************************************************** @@ -167,6 +169,8 @@ btr_pcur_copy_stored_position( pcur_receive->old_rec = pcur_receive->old_rec_buf + (pcur_donate->old_rec - pcur_donate->old_rec_buf); } + + pcur_receive->old_n_fields = pcur_donate->old_n_fields; } /****************************************************************** @@ -199,33 +203,27 @@ btr_pcur_restore_position( dtuple_t* tuple; ulint mode; ulint old_mode; - ibool from_left; mem_heap_t* heap; - ut_a(cursor->pos_state == BTR_PCUR_WAS_POSITIONED - || cursor->pos_state == BTR_PCUR_IS_POSITIONED); - if (cursor->old_stored != BTR_PCUR_OLD_STORED) { + if 
(UNIV_UNLIKELY(cursor->old_stored != BTR_PCUR_OLD_STORED) + || UNIV_UNLIKELY(cursor->pos_state != BTR_PCUR_WAS_POSITIONED + && cursor->pos_state != BTR_PCUR_IS_POSITIONED)) { ut_print_buf(stderr, (const byte*)cursor, sizeof(btr_pcur_t)); if (cursor->trx_if_known) { trx_print(stderr, cursor->trx_if_known); } - ut_a(0); + ut_error; } - if (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE - || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) { + if (UNIV_UNLIKELY(cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE + || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) { /* In these cases we do not try an optimistic restoration, but always do a search */ - if (cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) { - from_left = TRUE; - } else { - from_left = FALSE; - } - - btr_cur_open_at_index_side(from_left, + btr_cur_open_at_index_side( + cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE, btr_pcur_get_btr_cur(cursor)->index, latch_mode, btr_pcur_get_btr_cur(cursor), mtr); @@ -236,31 +234,47 @@ btr_pcur_restore_position( } ut_a(cursor->old_rec); + ut_a(cursor->old_n_fields); page = btr_cur_get_page(btr_pcur_get_btr_cur(cursor)); - if (latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF) { + if (UNIV_LIKELY(latch_mode == BTR_SEARCH_LEAF) + || UNIV_LIKELY(latch_mode == BTR_MODIFY_LEAF)) { /* Try optimistic restoration */ - if (buf_page_optimistic_get(latch_mode, + if (UNIV_LIKELY(buf_page_optimistic_get(latch_mode, cursor->block_when_stored, page, - cursor->modify_clock, mtr)) { + cursor->modify_clock, mtr))) { cursor->pos_state = BTR_PCUR_IS_POSITIONED; #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(page, SYNC_TREE_NODE); #endif /* UNIV_SYNC_DEBUG */ if (cursor->rel_pos == BTR_PCUR_ON) { - +#ifdef UNIV_DEBUG + rec_t* rec; + ulint* offsets1; + ulint* offsets2; + dict_index_t* index; +#endif /* UNIV_DEBUG */ cursor->latch_mode = latch_mode; - - ut_ad(cmp_rec_rec(cursor->old_rec, - btr_pcur_get_rec(cursor), - dict_tree_find_index( - btr_cur_get_tree( +#ifdef UNIV_DEBUG + rec = btr_pcur_get_rec(cursor); + index = dict_tree_find_index( + btr_cur_get_tree( btr_pcur_get_btr_cur(cursor)), - btr_pcur_get_rec(cursor))) - == 0); + rec); + heap = mem_heap_create(256); + offsets1 = rec_get_offsets(cursor->old_rec, + index, NULL, + cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets(rec, index, NULL, + cursor->old_n_fields, &heap); + + ut_ad(cmp_rec_rec(cursor->old_rec, + rec, offsets1, offsets2, index) == 0); + mem_heap_free(heap); +#endif /* UNIV_DEBUG */ return(TRUE); } @@ -273,12 +287,13 @@ btr_pcur_restore_position( heap = mem_heap_create(256); tree = btr_cur_get_tree(btr_pcur_get_btr_cur(cursor)); - tuple = dict_tree_build_data_tuple(tree, cursor->old_rec, heap); + tuple = dict_tree_build_data_tuple(tree, cursor->old_rec, + cursor->old_n_fields, heap); /* Save the old search mode of the cursor */ old_mode = cursor->search_mode; - if (cursor->rel_pos == BTR_PCUR_ON) { + if (UNIV_LIKELY(cursor->rel_pos == BTR_PCUR_ON)) { mode = PAGE_CUR_LE; } else if (cursor->rel_pos == BTR_PCUR_AFTER) { mode = PAGE_CUR_G; @@ -295,18 +310,19 @@ btr_pcur_restore_position( if (cursor->rel_pos == BTR_PCUR_ON && btr_pcur_is_on_user_rec(cursor, mtr) - && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor))) { + && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor), + rec_get_offsets(btr_pcur_get_rec(cursor), + btr_pcur_get_btr_cur(cursor)->index, + NULL, ULINT_UNDEFINED, &heap))) { /* We have to store the NEW value for the modify clock, since the cursor can now be on a different page! 
But we can retain the value of old_rec */ - cursor->modify_clock = - buf_frame_get_modify_clock(btr_pcur_get_page(cursor)); - cursor->block_when_stored = buf_block_align(btr_pcur_get_page(cursor)); - + cursor->modify_clock = + buf_block_get_modify_clock(cursor->block_when_stored); cursor->old_stored = BTR_PCUR_OLD_STORED; mem_heap_free(heap); @@ -384,6 +400,7 @@ btr_pcur_move_to_next_page( ut_ad(next_page_no != FIL_NULL); next_page = btr_page_get(space, next_page_no, cursor->latch_mode, mtr); + ut_a(page_is_comp(next_page) == page_is_comp(page)); buf_block_align(next_page)->check_index_page_at_flush = TRUE; btr_leaf_page_release(page, cursor->latch_mode, mtr); diff --git a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c index 9384168df88..f705fee4275 100644 --- a/innobase/btr/btr0sea.c +++ b/innobase/btr/btr0sea.c @@ -411,11 +411,17 @@ btr_search_update_hash_ref( ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) || rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ + ut_ad(buf_block_align(btr_cur_get_rec(cursor)) == block); + ut_a(!block->is_hashed || block->index == cursor->index); + if (block->is_hashed && (info->n_hash_potential > 0) && (block->curr_n_fields == info->n_fields) && (block->curr_n_bytes == info->n_bytes) && (block->curr_side == info->side)) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; rec = btr_cur_get_rec(cursor); @@ -425,10 +431,13 @@ btr_search_update_hash_ref( } tree_id = ((cursor->index)->tree)->id; - - fold = rec_fold(rec, block->curr_n_fields, - block->curr_n_bytes, tree_id); - + fold = rec_fold(rec, rec_get_offsets(rec, cursor->index, + offsets_, ULINT_UNDEFINED, &heap), + block->curr_n_fields, + block->curr_n_bytes, tree_id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ @@ -535,135 +544,127 @@ btr_search_check_guess( or PAGE_CUR_GE */ mtr_t* mtr) /* in: mtr */ { - page_t* page; - rec_t* rec; - rec_t* prev_rec; - rec_t* next_rec; - ulint n_unique; - ulint match; - ulint bytes; - int cmp; - + rec_t* rec; + ulint n_unique; + ulint match; + ulint bytes; + int cmp; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ibool success = FALSE; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + n_unique = dict_index_get_n_unique_in_tree(cursor->index); rec = btr_cur_get_rec(cursor); - page = buf_frame_align(rec); ut_ad(page_rec_is_user_rec(rec)); match = 0; bytes = 0; - cmp = page_cmp_dtuple_rec_with_match(tuple, rec, &match, &bytes); + offsets = rec_get_offsets(rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, rec, + offsets, &match, &bytes); if (mode == PAGE_CUR_GE) { if (cmp == 1) { - - return(FALSE); + goto exit_func; } cursor->up_match = match; if (match >= n_unique) { - - return(TRUE); + success = TRUE; + goto exit_func; } } else if (mode == PAGE_CUR_LE) { if (cmp == -1) { - - return(FALSE); + goto exit_func; } cursor->low_match = match; } else if (mode == PAGE_CUR_G) { if (cmp != -1) { - - return(FALSE); + goto exit_func; } } else if (mode == PAGE_CUR_L) { if (cmp != 1) { - - return(FALSE); + goto exit_func; } } if (can_only_compare_to_cursor_rec) { /* Since we could not determine if our guess is right just by looking at the record under the cursor, return FALSE */ - - return(FALSE); + goto exit_func; } match = 0; bytes = 0; if ((mode == PAGE_CUR_G) || (mode == 
PAGE_CUR_GE)) { + rec_t* prev_rec; - ut_ad(rec != page_get_infimum_rec(page)); + ut_ad(!page_rec_is_infimum(rec)); prev_rec = page_rec_get_prev(rec); - if (prev_rec == page_get_infimum_rec(page)) { - - if (btr_page_get_prev(page, mtr) != FIL_NULL) { + if (page_rec_is_infimum(prev_rec)) { + success = btr_page_get_prev( + buf_frame_align(prev_rec), mtr) == FIL_NULL; - return(FALSE); - } - - return(TRUE); + goto exit_func; } + offsets = rec_get_offsets(prev_rec, cursor->index, offsets, + n_unique, &heap); cmp = page_cmp_dtuple_rec_with_match(tuple, prev_rec, - &match, &bytes); + offsets, &match, &bytes); if (mode == PAGE_CUR_GE) { - if (cmp != 1) { - - return(FALSE); - } + success = cmp == 1; } else { - if (cmp == -1) { - - return(FALSE); - } + success = cmp != -1; } - return(TRUE); - } - - ut_ad(rec != page_get_supremum_rec(page)); - - next_rec = page_rec_get_next(rec); - - if (next_rec == page_get_supremum_rec(page)) { - - if (btr_page_get_next(page, mtr) == FIL_NULL) { - - cursor->up_match = 0; + goto exit_func; + } else { + rec_t* next_rec; - return(TRUE); - } + ut_ad(!page_rec_is_supremum(rec)); + + next_rec = page_rec_get_next(rec); - return(FALSE); - } + if (page_rec_is_supremum(next_rec)) { + if (btr_page_get_next( + buf_frame_align(next_rec), mtr) == FIL_NULL) { - cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, &match, &bytes); - - if (mode == PAGE_CUR_LE) { - if (cmp != -1) { + cursor->up_match = 0; + success = TRUE; + } - return(FALSE); + goto exit_func; } - cursor->up_match = match; - } else { - if (cmp == 1) { - - return(FALSE); + offsets = rec_get_offsets(next_rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, + offsets, &match, &bytes); + if (mode == PAGE_CUR_LE) { + success = cmp == -1; + cursor->up_match = match; + } else { + success = cmp != 1; } } - - return(TRUE); +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); } /********************************************************************** @@ -695,7 +696,6 @@ btr_search_guess_on_hash( buf_block_t* block; rec_t* rec; page_t* page; - ibool success; ulint fold; ulint tuple_n_fields; dulint tree_id; @@ -711,7 +711,7 @@ btr_search_guess_on_hash( /* Note that, for efficiency, the struct info may not be protected by any latch here! 
*/ - if (info->n_hash_potential == 0) { + if (UNIV_UNLIKELY(info->n_hash_potential == 0)) { return(FALSE); } @@ -721,12 +721,13 @@ btr_search_guess_on_hash( tuple_n_fields = dtuple_get_n_fields(tuple); - if (tuple_n_fields < cursor->n_fields) { + if (UNIV_UNLIKELY(tuple_n_fields < cursor->n_fields)) { return(FALSE); } - if ((cursor->n_bytes > 0) && (tuple_n_fields <= cursor->n_fields)) { + if (UNIV_UNLIKELY(tuple_n_fields == cursor->n_fields) + && (cursor->n_bytes > 0)) { return(FALSE); } @@ -741,39 +742,31 @@ btr_search_guess_on_hash( cursor->fold = fold; cursor->flag = BTR_CUR_HASH; - if (!has_search_latch) { + if (UNIV_LIKELY(!has_search_latch)) { rw_lock_s_lock(&btr_search_latch); } - ut_a(btr_search_latch.writer != RW_LOCK_EX); - ut_a(btr_search_latch.reader_count > 0); + ut_ad(btr_search_latch.writer != RW_LOCK_EX); + ut_ad(btr_search_latch.reader_count > 0); rec = ha_search_and_get_data(btr_search_sys->hash_index, fold); - if (!rec) { - if (!has_search_latch) { - rw_lock_s_unlock(&btr_search_latch); - } - - goto failure; + if (UNIV_UNLIKELY(!rec)) { + goto failure_unlock; } page = buf_frame_align(rec); - if (!has_search_latch) { + if (UNIV_LIKELY(!has_search_latch)) { - success = buf_page_get_known_nowait(latch_mode, page, + if (UNIV_UNLIKELY(!buf_page_get_known_nowait(latch_mode, page, BUF_MAKE_YOUNG, __FILE__, __LINE__, - mtr); - - rw_lock_s_unlock(&btr_search_latch); - - if (!success) { - - goto failure; + mtr))) { + goto failure_unlock; } + rw_lock_s_unlock(&btr_search_latch); can_only_compare_to_cursor_rec = FALSE; #ifdef UNIV_SYNC_DEBUG @@ -783,8 +776,8 @@ btr_search_guess_on_hash( block = buf_block_align(page); - if (block->state == BUF_BLOCK_REMOVE_HASH) { - if (!has_search_latch) { + if (UNIV_UNLIKELY(block->state == BUF_BLOCK_REMOVE_HASH)) { + if (UNIV_LIKELY(!has_search_latch)) { btr_leaf_page_release(page, latch_mode, mtr); } @@ -792,51 +785,33 @@ btr_search_guess_on_hash( goto failure; } - ut_a(block->state == BUF_BLOCK_FILE_PAGE); - ut_a(page_rec_is_user_rec(rec)); + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + ut_ad(page_rec_is_user_rec(rec)); btr_cur_position(index, rec, cursor); /* Check the validity of the guess within the page */ - if (0 != ut_dulint_cmp(tree_id, btr_page_get_index_id(page))) { - - success = FALSE; -/* - fprintf(stderr, "Tree id %lu, page index id %lu fold %lu\n", - ut_dulint_get_low(tree_id), - ut_dulint_get_low(btr_page_get_index_id(page)), - fold); -*/ - } else { - /* If we only have the latch on btr_search_latch, not on the - page, it only protects the columns of the record the cursor - is positioned on. We cannot look at the next of the previous - record to determine if our guess for the cursor position is - right. */ - - success = btr_search_check_guess(cursor, - can_only_compare_to_cursor_rec, - tuple, mode, mtr); - } - - if (!success) { - if (!has_search_latch) { + /* If we only have the latch on btr_search_latch, not on the + page, it only protects the columns of the record the cursor + is positioned on. We cannot look at the next of the previous + record to determine if our guess for the cursor position is + right. 
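Every rec_fold() call in this file now also takes the offsets array, since the fold is computed over a prefix of n_fields complete fields plus n_bytes of the following field, and locating those fields inside a record requires the offsets. The stand-alone sketch below folds a comparable prefix of a string array; the mixing function is arbitrary and merely stands in for the ut_fold_* helpers, and the example data is invented.

#include <stdio.h>

/* Fold n_fields whole fields plus the first n_bytes of the following field
into a single hash value, the way the adaptive hash index keys its entries
on a tuple prefix.  The caller must supply at least n_fields + 1 fields
whenever n_bytes > 0. */
static unsigned long
fold_prefix(const char* const* fields, size_t n_fields, size_t n_bytes,
	    unsigned long tree_id)
{
	unsigned long	fold = tree_id;
	size_t		i;
	size_t		j;

	for (i = 0; i < n_fields; i++) {
		for (j = 0; fields[i][j] != '\0'; j++) {
			fold = fold * 31 + (unsigned char) fields[i][j];
		}
	}

	/* Hash only the first n_bytes of the next field. */
	for (j = 0; n_bytes > 0 && j < n_bytes && fields[n_fields][j] != '\0';
	     j++) {
		fold = fold * 31 + (unsigned char) fields[n_fields][j];
	}

	return(fold);
}

int
main(void)
{
	const char*	rec[] = {"2004", "apple", "green"};

	printf("fold = %lu\n", fold_prefix(rec, 2, 3, 42UL));
	return(0);
}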
*/ + if (UNIV_EXPECT(ut_dulint_cmp(tree_id, btr_page_get_index_id(page)), 0) + || !btr_search_check_guess(cursor, can_only_compare_to_cursor_rec, + tuple, mode, mtr)) { + if (UNIV_LIKELY(!has_search_latch)) { btr_leaf_page_release(page, latch_mode, mtr); } goto failure; } - if (info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5) { + if (UNIV_LIKELY(info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5)) { info->n_hash_potential++; } - if (info->last_hash_succ != TRUE) { - info->last_hash_succ = TRUE; - } - #ifdef notdefined /* These lines of code can be used in a debug version to check the correctness of the searched cursor position: */ @@ -844,15 +819,14 @@ btr_search_guess_on_hash( info->last_hash_succ = FALSE; /* Currently, does not work if the following fails: */ - ut_a(!has_search_latch); + ut_ad(!has_search_latch); btr_leaf_page_release(page, latch_mode, mtr); btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode, &cursor2, 0, mtr); if (mode == PAGE_CUR_GE - && btr_cur_get_rec(&cursor2) == page_get_supremum_rec( - buf_frame_align(btr_cur_get_rec(&cursor2)))) { + && page_rec_is_supremum(btr_cur_get_rec(&cursor2))) { /* If mode is PAGE_CUR_GE, then the binary search in the index tree may actually take us to the supremum @@ -862,22 +836,22 @@ btr_search_guess_on_hash( btr_pcur_open_on_user_rec(index, tuple, mode, latch_mode, &pcur, mtr); - ut_a(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor)); + ut_ad(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor)); } else { - ut_a(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor)); + ut_ad(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor)); } /* NOTE that it is theoretically possible that the above assertions fail if the page of the cursor gets removed from the buffer pool meanwhile! Thus it might not be a bug. 
*/ - - info->last_hash_succ = TRUE; #endif + info->last_hash_succ = TRUE; #ifdef UNIV_SEARCH_PERF_STAT btr_search_n_succ++; #endif - if (!has_search_latch && buf_block_peek_if_too_old(block)) { + if (UNIV_LIKELY(!has_search_latch) + && buf_block_peek_if_too_old(block)) { buf_page_make_young(page); } @@ -890,6 +864,10 @@ btr_search_guess_on_hash( return(TRUE); /*-------------------------------------------*/ +failure_unlock: + if (UNIV_LIKELY(!has_search_latch)) { + rw_lock_s_unlock(&btr_search_latch); + } failure: info->n_hash_fail++; @@ -918,7 +896,6 @@ btr_search_drop_page_hash_index( ulint n_fields; ulint n_bytes; rec_t* rec; - rec_t* sup; ulint fold; ulint prev_fold; dulint tree_id; @@ -926,6 +903,8 @@ btr_search_drop_page_hash_index( ulint n_recs; ulint* folds; ulint i; + mem_heap_t* heap; + ulint* offsets; #ifdef UNIV_SYNC_DEBUG ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); @@ -967,16 +946,14 @@ btr_search_drop_page_hash_index( n_cached = 0; - sup = page_get_supremum_rec(page); - rec = page_get_infimum_rec(page); rec = page_rec_get_next(rec); - if (rec != sup) { - ut_a(n_fields <= rec_get_n_fields(rec)); + if (!page_rec_is_supremum(rec)) { + ut_a(n_fields <= rec_get_n_fields(rec, block->index)); if (n_bytes > 0) { - ut_a(n_fields < rec_get_n_fields(rec)); + ut_a(n_fields < rec_get_n_fields(rec, block->index)); } } @@ -984,11 +961,15 @@ btr_search_drop_page_hash_index( prev_fold = 0; - while (rec != sup) { + heap = NULL; + offsets = NULL; + + while (!page_rec_is_supremum(rec)) { /* FIXME: in a mixed tree, not all records may have enough ordering fields: */ - - fold = rec_fold(rec, n_fields, n_bytes, tree_id); + offsets = rec_get_offsets(rec, block->index, + offsets, n_fields + (n_bytes > 0), &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, tree_id); if (fold == prev_fold && prev_fold != 0) { @@ -1005,6 +986,10 @@ next_rec: prev_fold = fold; } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + rw_lock_x_lock(&btr_search_latch); for (i = 0; i < n_cached; i++) { @@ -1013,6 +998,7 @@ next_rec: } block->is_hashed = FALSE; + block->index = NULL; rw_lock_x_unlock(&btr_search_latch); @@ -1069,8 +1055,7 @@ static void btr_search_build_page_hash_index( /*=============================*/ - dict_index_t* index, /* in: index for which to build, or NULL if - not known */ + dict_index_t* index, /* in: index for which to build */ page_t* page, /* in: index page, s- or x-latched */ ulint n_fields,/* in: hash this many full fields */ ulint n_bytes,/* in: hash this many bytes from the next @@ -1081,7 +1066,6 @@ btr_search_build_page_hash_index( buf_block_t* block; rec_t* rec; rec_t* next_rec; - rec_t* sup; ulint fold; ulint next_fold; dulint tree_id; @@ -1090,7 +1074,13 @@ btr_search_build_page_hash_index( ulint* folds; rec_t** recs; ulint i; - + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_ad(index); + block = buf_block_align(page); table = btr_search_sys->hash_index; @@ -1127,9 +1117,9 @@ btr_search_build_page_hash_index( return; } - if (index && (dict_index_get_n_unique_in_tree(index) < n_fields + if (dict_index_get_n_unique_in_tree(index) < n_fields || (dict_index_get_n_unique_in_tree(index) == n_fields - && n_bytes > 0))) { + && n_bytes > 0)) { return; } @@ -1143,23 +1133,23 @@ btr_search_build_page_hash_index( tree_id = btr_page_get_index_id(page); - sup = page_get_supremum_rec(page); - rec = page_get_infimum_rec(page); rec = page_rec_get_next(rec); - if (rec != 
sup) { - ut_a(n_fields <= rec_get_n_fields(rec)); + offsets = rec_get_offsets(rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + + if (!page_rec_is_supremum(rec)) { + ut_a(n_fields <= rec_offs_n_fields(offsets)); if (n_bytes > 0) { - ut_a(n_fields < rec_get_n_fields(rec)); + ut_a(n_fields < rec_offs_n_fields(offsets)); } } /* FIXME: in a mixed tree, all records may not have enough ordering fields: */ - - fold = rec_fold(rec, n_fields, n_bytes, tree_id); + fold = rec_fold(rec, offsets, n_fields, n_bytes, tree_id); if (side == BTR_SEARCH_LEFT_SIDE) { @@ -1171,7 +1161,7 @@ btr_search_build_page_hash_index( for (;;) { next_rec = page_rec_get_next(rec); - if (next_rec == sup) { + if (page_rec_is_supremum(next_rec)) { if (side == BTR_SEARCH_RIGHT_SIDE) { @@ -1183,7 +1173,10 @@ btr_search_build_page_hash_index( break; } - next_fold = rec_fold(next_rec, n_fields, n_bytes, tree_id); + offsets = rec_get_offsets(next_rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, tree_id); if (fold != next_fold) { /* Insert an entry into the hash index */ @@ -1211,13 +1204,7 @@ btr_search_build_page_hash_index( if (block->is_hashed && ((block->curr_n_fields != n_fields) || (block->curr_n_bytes != n_bytes) || (block->curr_side != side))) { - - rw_lock_x_unlock(&btr_search_latch); - - mem_free(folds); - mem_free(recs); - - return; + goto exit_func; } block->is_hashed = TRUE; @@ -1226,16 +1213,21 @@ btr_search_build_page_hash_index( block->curr_n_fields = n_fields; block->curr_n_bytes = n_bytes; block->curr_side = side; + block->index = index; for (i = 0; i < n_cached; i++) { ha_insert_for_fold(table, folds[i], recs[i]); } +exit_func: rw_lock_x_unlock(&btr_search_latch); mem_free(folds); mem_free(recs); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } } /************************************************************************ @@ -1247,10 +1239,13 @@ parameters as page (this often happens when a page is split). 
*/ void btr_search_move_or_delete_hash_entries( /*===================================*/ - page_t* new_page, /* in: records are copied to this page */ - page_t* page) /* in: index page from which records were - copied, and the copied records will be deleted - from this page */ + page_t* new_page, /* in: records are copied + to this page */ + page_t* page, /* in: index page from which + records were copied, and the + copied records will be deleted + from this page */ + dict_index_t* index) /* in: record descriptor */ { buf_block_t* block; buf_block_t* new_block; @@ -1260,11 +1255,14 @@ btr_search_move_or_delete_hash_entries( block = buf_block_align(page); new_block = buf_block_align(new_page); + ut_a(page_is_comp(page) == page_is_comp(new_page)); #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ + ut_a(!new_block->is_hashed || new_block->index == index); + ut_a(!block->is_hashed || block->index == index); rw_lock_s_lock(&btr_search_latch); @@ -1290,8 +1288,8 @@ btr_search_move_or_delete_hash_entries( rw_lock_s_unlock(&btr_search_latch); ut_a(n_fields + n_bytes > 0); - - btr_search_build_page_hash_index(NULL, new_page, n_fields, + + btr_search_build_page_hash_index(index, new_page, n_fields, n_bytes, side); ut_a(n_fields == block->curr_n_fields); ut_a(n_bytes == block->curr_n_bytes); @@ -1319,6 +1317,9 @@ btr_search_update_hash_on_delete( ulint fold; dulint tree_id; ibool found; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + mem_heap_t* heap = NULL; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; rec = btr_cur_get_rec(cursor); @@ -1333,14 +1334,18 @@ btr_search_update_hash_on_delete( return; } + ut_a(block->index == cursor->index); ut_a(block->curr_n_fields + block->curr_n_bytes > 0); table = btr_search_sys->hash_index; tree_id = cursor->index->tree->id; - - fold = rec_fold(rec, block->curr_n_fields, block->curr_n_bytes, - tree_id); + fold = rec_fold(rec, rec_get_offsets(rec, cursor->index, offsets_, + ULINT_UNDEFINED, &heap), block->curr_n_fields, + block->curr_n_bytes, tree_id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } rw_lock_x_lock(&btr_search_latch); found = ha_search_and_delete_if_found(table, fold, rec); @@ -1376,6 +1381,8 @@ btr_search_update_hash_node_on_insert( return; } + ut_a(block->index == cursor->index); + rw_lock_x_lock(&btr_search_latch); if ((cursor->flag == BTR_CUR_HASH) @@ -1409,7 +1416,6 @@ btr_search_update_hash_on_insert( { hash_table_t* table; buf_block_t* block; - page_t* page; rec_t* rec; rec_t* ins_rec; rec_t* next_rec; @@ -1420,7 +1426,11 @@ btr_search_update_hash_on_insert( ulint n_fields; ulint n_bytes; ulint side; - ibool locked = FALSE; + ibool locked = FALSE; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; table = btr_search_sys->hash_index; @@ -1439,6 +1449,8 @@ btr_search_update_hash_on_insert( return; } + ut_a(block->index == cursor->index); + tree_id = ((cursor->index)->tree)->id; n_fields = block->curr_n_fields; @@ -1448,16 +1460,21 @@ btr_search_update_hash_on_insert( ins_rec = page_rec_get_next(rec); next_rec = page_rec_get_next(ins_rec); - page = buf_frame_align(rec); - - ins_fold = rec_fold(ins_rec, n_fields, n_bytes, tree_id); + offsets = rec_get_offsets(ins_rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, tree_id); - if (next_rec != page_get_supremum_rec(page)) { - 
next_fold = rec_fold(next_rec, n_fields, n_bytes, tree_id); + if (!page_rec_is_supremum(next_rec)) { + offsets = rec_get_offsets(next_rec, cursor->index, offsets, + n_fields + (n_bytes > 0), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, tree_id); } - if (rec != page_get_infimum_rec(page)) { - fold = rec_fold(rec, n_fields, n_bytes, tree_id); + if (!page_rec_is_infimum(rec)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + n_fields + (n_bytes > 0), &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, tree_id); } else { if (side == BTR_SEARCH_LEFT_SIDE) { @@ -1488,7 +1505,7 @@ btr_search_update_hash_on_insert( } check_next_rec: - if (next_rec == page_get_supremum_rec(page)) { + if (page_rec_is_supremum(next_rec)) { if (side == BTR_SEARCH_RIGHT_SIDE) { @@ -1527,6 +1544,9 @@ check_next_rec: } function_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } if (locked) { rw_lock_x_unlock(&btr_search_latch); } @@ -1546,6 +1566,10 @@ btr_search_validate(void) ulint n_page_dumps = 0; ibool ok = TRUE; ulint i; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; rw_lock_x_lock(&btr_search_latch); @@ -1555,9 +1579,14 @@ btr_search_validate(void) while (node != NULL) { block = buf_block_align(node->data); page = buf_frame_align(node->data); + offsets = rec_get_offsets((rec_t*) node->data, + block->index, offsets, + block->curr_n_fields + + (block->curr_n_bytes > 0), &heap); if (!block->is_hashed || node->fold != rec_fold((rec_t*)(node->data), + offsets, block->curr_n_fields, block->curr_n_bytes, btr_page_get_index_id(page))) { @@ -1573,12 +1602,14 @@ btr_search_validate(void) (ulong) ut_dulint_get_low(btr_page_get_index_id(page)), (ulong) node->fold, (ulong) rec_fold((rec_t*)(node->data), + offsets, block->curr_n_fields, block->curr_n_bytes, btr_page_get_index_id(page))); fputs("InnoDB: Record ", stderr); - rec_print(stderr, (rec_t*)(node->data)); + rec_print_new(stderr, (rec_t*)node->data, + offsets); fprintf(stderr, "\nInnoDB: on that page." "Page mem address %p, is hashed %lu, n fields %lu, n bytes %lu\n" "side %lu\n", @@ -1602,6 +1633,9 @@ btr_search_validate(void) } rw_lock_x_unlock(&btr_search_latch); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(ok); } diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index 699ad5fb42e..fe4498e6f10 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -223,13 +223,14 @@ in the free list to the frames. buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */ +#ifdef UNIV_DEBUG ulint buf_dbg_counter = 0; /* This is used to insert validation operations in excution in the debug version */ ibool buf_debug_prints = FALSE; /* If this is set TRUE, the program prints info whenever read-ahead or flush occurs */ - +#endif /* UNIV_DEBUG */ /************************************************************************ Calculates a page checksum which is stored to the page when it is written to a file. Note that we must be careful to calculate the same value on @@ -331,33 +332,43 @@ buf_page_is_corrupted( } } #endif - old_checksum = buf_calc_page_old_checksum(read_buf); - - old_checksum_field = mach_read_from_4(read_buf + UNIV_PAGE_SIZE + + /* If we use checksums validation, make additional check before returning + TRUE to ensure that the checksum is not equal to BUF_NO_CHECKSUM_MAGIC which + might be stored by InnoDB with checksums disabled. 
+ Otherwise, skip checksum calculation and return FALSE */ + + if (srv_use_checksums) { + old_checksum = buf_calc_page_old_checksum(read_buf); + + old_checksum_field = mach_read_from_4(read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM); - /* There are 2 valid formulas for old_checksum_field: - 1. Very old versions of InnoDB only stored 8 byte lsn to the start - and the end of the page. - 2. Newer InnoDB versions store the old formula checksum there. */ + /* There are 2 valid formulas for old_checksum_field: + 1. Very old versions of InnoDB only stored 8 byte lsn to the start + and the end of the page. + 2. Newer InnoDB versions store the old formula checksum there. */ - if (old_checksum_field != mach_read_from_4(read_buf + FIL_PAGE_LSN) - && old_checksum_field != old_checksum) { - - return(TRUE); - } + if (old_checksum_field != mach_read_from_4(read_buf + FIL_PAGE_LSN) + && old_checksum_field != old_checksum + && old_checksum_field != BUF_NO_CHECKSUM_MAGIC) { - checksum = buf_calc_page_new_checksum(read_buf); - checksum_field = mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM); + return(TRUE); + } - /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id - (always equal to 0), to FIL_PAGE_SPACE_SPACE_OR_CHKSUM */ + checksum = buf_calc_page_new_checksum(read_buf); + checksum_field = mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM); - if (checksum_field != 0 && checksum_field != checksum) { + /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id + (always equal to 0), to FIL_PAGE_SPACE_SPACE_OR_CHKSUM */ - return(TRUE); - } + if (checksum_field != 0 && checksum_field != checksum + && checksum_field != BUF_NO_CHECKSUM_MAGIC) { + return(TRUE); + } + } + return(FALSE); } @@ -379,8 +390,10 @@ buf_page_print( ut_print_buf(stderr, read_buf, UNIV_PAGE_SIZE); fputs("InnoDB: End of page dump\n", stderr); - checksum = buf_calc_page_new_checksum(read_buf); - old_checksum = buf_calc_page_old_checksum(read_buf); + checksum = srv_use_checksums ? + buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC; + old_checksum = srv_use_checksums ? 
+ buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC; ut_print_timestamp(stderr); fprintf(stderr, @@ -460,6 +473,7 @@ buf_block_init( block->file_page_was_freed = FALSE; block->check_index_page_at_flush = FALSE; + block->index = NULL; block->in_free_list = FALSE; block->in_LRU_list = FALSE; @@ -547,7 +561,7 @@ buf_pool_init( } /*----------------------------------------*/ } else { - buf_pool->frame_mem = ut_malloc_low( + buf_pool->frame_mem = os_mem_alloc_large( UNIV_PAGE_SIZE * (n_frames + 1), TRUE, FALSE); } @@ -1273,8 +1287,9 @@ buf_page_optimistic_get_func( /* If AWE is used, block may have a different frame now, e.g., NULL */ - if (block->state != BUF_BLOCK_FILE_PAGE || block->frame != guess) { - + if (UNIV_UNLIKELY(block->state != BUF_BLOCK_FILE_PAGE) + || UNIV_UNLIKELY(block->frame != guess)) { + exit_func: mutex_exit(&(buf_pool->mutex)); return(FALSE); @@ -1307,19 +1322,17 @@ buf_page_optimistic_get_func( fix_type = MTR_MEMO_PAGE_X_FIX; } - if (!success) { + if (UNIV_UNLIKELY(!success)) { mutex_enter(&(buf_pool->mutex)); block->buf_fix_count--; #ifdef UNIV_SYNC_DEBUG rw_lock_s_unlock(&(block->debug_latch)); -#endif - mutex_exit(&(buf_pool->mutex)); - - return(FALSE); +#endif + goto exit_func; } - if (!UT_DULINT_EQ(modify_clock, block->modify_clock)) { + if (UNIV_UNLIKELY(!UT_DULINT_EQ(modify_clock, block->modify_clock))) { #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(block->frame, SYNC_NO_ORDER_CHECK); #endif /* UNIV_SYNC_DEBUG */ @@ -1334,10 +1347,8 @@ buf_page_optimistic_get_func( block->buf_fix_count--; #ifdef UNIV_SYNC_DEBUG rw_lock_s_unlock(&(block->debug_latch)); -#endif - mutex_exit(&(buf_pool->mutex)); - - return(FALSE); +#endif + goto exit_func; } mtr_memo_push(mtr, block, fix_type); @@ -1355,7 +1366,7 @@ buf_page_optimistic_get_func( #ifdef UNIV_DEBUG_FILE_ACCESSES ut_a(block->file_page_was_freed == FALSE); #endif - if (!accessed) { + if (UNIV_UNLIKELY(!accessed)) { /* In the case of a first access, try to apply linear read-ahead */ @@ -1535,6 +1546,7 @@ buf_page_init( block->offset = offset; block->check_index_page_at_flush = FALSE; + block->index = NULL; block->lock_hash_val = lock_rec_hash(space, offset); block->lock_mutex = NULL; @@ -1728,10 +1740,12 @@ buf_page_create( /* If we get here, the page was not in buf_pool: init it there */ +#ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Creating space %lu page %lu to buffer\n", (ulong) space, (ulong) offset); } +#endif /* UNIV_DEBUG */ block = free_block; @@ -1882,9 +1896,11 @@ buf_page_io_complete( rw_lock_x_unlock_gen(&(block->lock), BUF_IO_READ); +#ifdef UNIV_DEBUG if (buf_debug_prints) { fputs("Has read ", stderr); } +#endif /* UNIV_DEBUG */ } else { ut_ad(io_type == BUF_IO_WRITE); @@ -1897,17 +1913,21 @@ buf_page_io_complete( buf_pool->n_pages_written++; +#ifdef UNIV_DEBUG if (buf_debug_prints) { fputs("Has written ", stderr); } +#endif /* UNIV_DEBUG */ } mutex_exit(&(buf_pool->mutex)); +#ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "page space %lu page no %lu\n", (ulong) block->space, (ulong) block->offset); } +#endif /* UNIV_DEBUG */ } /************************************************************************* @@ -1936,6 +1956,7 @@ buf_pool_invalidate(void) mutex_exit(&(buf_pool->mutex)); } +#ifdef UNIV_DEBUG /************************************************************************* Validates the buffer buf_pool data structure. 
*/ @@ -2080,10 +2101,6 @@ buf_print(void) n_found = 0; - for (i = 0 ; i < size; i++) { - counts[i] = 0; - } - for (i = 0; i < size; i++) { frame = buf_pool_get_nth_block(buf_pool, i)->frame; @@ -2135,6 +2152,32 @@ buf_print(void) ut_a(buf_validate()); } +#endif /* UNIV_DEBUG */ + +/************************************************************************* +Returns the number of latched pages in the buffer pool. */ + +ulint +buf_get_latched_pages_number(void) +{ + buf_block_t* block; + ulint i; + ulint fixed_pages_number = 0; + + mutex_enter(&(buf_pool->mutex)); + + for (i = 0; i < buf_pool->curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + + if (((block->buf_fix_count != 0) || (block->io_fix != 0)) && + block->magic_n == BUF_BLOCK_MAGIC_N ) + fixed_pages_number++; + } + + mutex_exit(&(buf_pool->mutex)); + return fixed_pages_number; +} /************************************************************************* Returns the number of pending buf pool ios. */ diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 4df0e9962fb..ffb16790b2d 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -281,6 +281,10 @@ buf_flush_buffered_writes(void) } } + /* increment the doublewrite flushed pages counter */ + srv_dblwr_pages_written+= trx_doublewrite->first_free; + srv_dblwr_writes++; + if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; } else { @@ -452,7 +456,8 @@ buf_flush_init_for_writing( /* Store the new formula checksum */ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, - buf_calc_page_new_checksum(page)); + srv_use_checksums ? + buf_calc_page_new_checksum(page) : BUF_NO_CHECKSUM_MAGIC); /* We overwrite the first 4 bytes of the end lsn field to store the old formula checksum. Since it depends also on the field @@ -460,7 +465,8 @@ buf_flush_init_for_writing( new formula checksum. */ mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, - buf_calc_page_old_checksum(page)); + srv_use_checksums ? 
+ buf_calc_page_old_checksum(page) : BUF_NO_CHECKSUM_MAGIC); } /************************************************************************ @@ -580,11 +586,13 @@ buf_flush_try_page( rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE); } +#ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Flushing page space %lu, page no %lu \n", (ulong) block->space, (ulong) block->offset); } +#endif /* UNIV_DEBUG */ buf_flush_write_block_low(block); @@ -668,12 +676,14 @@ buf_flush_try_page( rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE); +#ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Flushing single page space %lu, page no %lu \n", (ulong) block->space, (ulong) block->offset); } +#endif /* UNIV_DEBUG */ buf_flush_write_block_low(block); @@ -900,6 +910,7 @@ buf_flush_batch( buf_flush_buffered_writes(); +#ifdef UNIV_DEBUG if (buf_debug_prints && page_count > 0) { ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); @@ -908,7 +919,11 @@ buf_flush_batch( : "Flushed %lu pages in flush list flush\n", (ulong) page_count); } +#endif /* UNIV_DEBUG */ + if (page_count != ULINT_UNDEFINED) + srv_buf_pool_flushed+= page_count; + return(page_count); } diff --git a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c index 05e92933edf..a0157da2d42 100644 --- a/innobase/buf/buf0lru.c +++ b/innobase/buf/buf0lru.c @@ -213,12 +213,14 @@ buf_LRU_search_and_free_block( ut_a(block->in_LRU_list); if (buf_flush_ready_for_replace(block)) { +#ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Putting space %lu page %lu to free list\n", (ulong) block->space, (ulong) block->offset); } +#endif /* UNIV_DEBUG */ buf_LRU_block_remove_hashed_page(block); @@ -465,6 +467,7 @@ loop: /* No free block was found: try to flush the LRU list */ buf_flush_free_margin(); + ++srv_buf_pool_wait_free; os_aio_simulated_wake_handler_threads(); @@ -918,7 +921,8 @@ buf_LRU_block_free_hashed_page( buf_LRU_block_free_non_file_page(block); } - + +#ifdef UNIV_DEBUG /************************************************************************** Validates the LRU list. 
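The checksum hunks in buf0buf.c and buf0flu.c above are two halves of one change: with srv_use_checksums off, the write path stamps BUF_NO_CHECKSUM_MAGIC instead of a computed checksum and the read path skips validation; if checksums are later re-enabled, buf_page_is_corrupted() still accepts the magic, so pages written while checksums were off are not flagged as corrupt. A condensed restatement, using only names that appear in those hunks:

	/* write path (buf_flush_init_for_writing) */
	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
		srv_use_checksums
		? buf_calc_page_new_checksum(page) : BUF_NO_CHECKSUM_MAGIC);

	/* read path (buf_page_is_corrupted): 0, the recomputed checksum
	and the magic value are all accepted as "not corrupted" */
	if (checksum_field != 0 && checksum_field != checksum
		&& checksum_field != BUF_NO_CHECKSUM_MAGIC) {

		return(TRUE);
	}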
*/ @@ -1049,3 +1053,4 @@ buf_LRU_print(void) mutex_exit(&(buf_pool->mutex)); } +#endif /* UNIV_DEBUG */ diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c index 055eede5c1a..813ca589907 100644 --- a/innobase/buf/buf0rea.c +++ b/innobase/buf/buf0rea.c @@ -20,6 +20,10 @@ Created 11/5/1995 Heikki Tuuri #include "os0file.h" #include "srv0start.h" +extern ulint srv_read_ahead_rnd; +extern ulint srv_read_ahead_seq; +extern ulint srv_buf_pool_reads; + /* The size in blocks of the area where the random read-ahead algorithm counts the accessed pages when deciding whether to read-ahead */ #define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA @@ -284,13 +288,16 @@ buf_read_ahead_random( os_aio_simulated_wake_handler_threads(); +#ifdef UNIV_DEBUG if (buf_debug_prints && (count > 0)) { fprintf(stderr, "Random read-ahead space %lu offset %lu pages %lu\n", (ulong) space, (ulong) offset, (ulong) count); } +#endif /* UNIV_DEBUG */ + ++srv_read_ahead_rnd; return(count); } @@ -323,6 +330,7 @@ buf_read_page( count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, tablespace_version, offset); + srv_buf_pool_reads+= count2; if (err == DB_TABLESPACE_DELETED) { ut_print_timestamp(stderr); fprintf(stderr, @@ -569,12 +577,15 @@ buf_read_ahead_linear( /* Flush pages from the end of the LRU list if necessary */ buf_flush_free_margin(); +#ifdef UNIV_DEBUG if (buf_debug_prints && (count > 0)) { fprintf(stderr, "LINEAR read-ahead space %lu offset %lu pages %lu\n", (ulong) space, (ulong) offset, (ulong) count); } +#endif /* UNIV_DEBUG */ + ++srv_read_ahead_seq; return(count); } @@ -634,11 +645,13 @@ buf_read_ibuf_merge_pages( /* Flush pages from the end of the LRU list if necessary */ buf_flush_free_margin(); +#ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Ibuf merge read-ahead space %lu pages %lu\n", (ulong) space_ids[0], (ulong) n_stored); } +#endif /* UNIV_DEBUG */ } /************************************************************************ @@ -704,8 +717,10 @@ buf_read_recv_pages( /* Flush pages from the end of the LRU list if necessary */ buf_flush_free_margin(); +#ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Recovery applies read-ahead pages %lu\n", (ulong) n_stored); } +#endif /* UNIV_DEBUG */ } diff --git a/innobase/configure.in b/innobase/configure.in index baf11272ab9..c56bd8274c4 100644 --- a/innobase/configure.in +++ b/innobase/configure.in @@ -117,6 +117,13 @@ case "$target" in CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE";; esac +# must go in pair with AR as set by MYSQL_CHECK_AR +if test -z "$ARFLAGS" +then + ARFLAGS="cru" +fi +AC_SUBST(ARFLAGS) + AC_OUTPUT(Makefile os/Makefile ut/Makefile btr/Makefile dnl buf/Makefile data/Makefile dnl dict/Makefile dyn/Makefile dnl diff --git a/innobase/data/data0data.c b/innobase/data/data0data.c index 97ec1a1acd9..194213a04e1 100644 --- a/innobase/data/data0data.c +++ b/innobase/data/data0data.c @@ -500,9 +500,9 @@ dtuple_convert_big_rec( ut_a(dtuple_check_typed_no_assert(entry)); - size = rec_get_converted_size(entry); + size = rec_get_converted_size(index, entry); - if (size > 1000000000) { + if (UNIV_UNLIKELY(size > 1000000000)) { fprintf(stderr, "InnoDB: Warning: tuple size very big: %lu\n", (ulong) size); fputs("InnoDB: Tuple contents: ", stderr); @@ -524,9 +524,10 @@ dtuple_convert_big_rec( n_fields = 0; - while ((rec_get_converted_size(entry) - >= page_get_free_space_of_empty() / 2) - || rec_get_converted_size(entry) >= REC_MAX_DATA_SIZE) { + while (rec_get_converted_size(index, entry) + >= 
ut_min(page_get_free_space_of_empty( + index->table->comp) / 2, + REC_MAX_DATA_SIZE)) { longest = 0; for (i = dict_index_get_n_unique_in_tree(index); @@ -545,9 +546,7 @@ dtuple_convert_big_rec( } } - if (!is_externally_stored - && dict_index_get_nth_type(index, i)->mtype - == DATA_BLOB) { + if (!is_externally_stored) { dfield = dtuple_get_nth_field(entry, i); diff --git a/innobase/data/data0type.c b/innobase/data/data0type.c index dab14df4240..d4264ad2926 100644 --- a/innobase/data/data0type.c +++ b/innobase/data/data0type.c @@ -39,59 +39,39 @@ column definitions, or records in the insert buffer, we use this charset-collation code for them. */ ulint data_mysql_default_charset_coll = 99999999; -ulint data_mysql_latin1_swedish_charset_coll = 99999999; -dtype_t dtype_binary_val = {DATA_BINARY, 0, 0, 0}; +dtype_t dtype_binary_val = {DATA_BINARY, 0, 0, 0, 0, 0}; dtype_t* dtype_binary = &dtype_binary_val; /************************************************************************* -Checks if a string type has to be compared by the MySQL comparison functions. -InnoDB internally only handles binary byte string comparisons, as well as -latin1_swedish_ci strings. For example, UTF-8 strings have to be compared -by MySQL. */ - -ibool -dtype_str_needs_mysql_cmp( -/*======================*/ - /* out: TRUE if a string type that requires - comparison with MySQL functions */ - dtype_t* dtype) /* in: type struct */ -{ - if (dtype->mtype == DATA_MYSQL - || dtype->mtype == DATA_VARMYSQL - || (dtype->mtype == DATA_BLOB - && 0 == (dtype->prtype & DATA_BINARY_TYPE) - && dtype_get_charset_coll(dtype->prtype) != - data_mysql_latin1_swedish_charset_coll)) { - return(TRUE); - } - - return(FALSE); -} - -/************************************************************************* -For the documentation of this function, see innobase_get_at_most_n_mbchars() -in ha_innodb.cc. */ +Determine how many bytes the first n characters of the given string occupy. +If the string is shorter than n characters, returns the number of bytes +the characters in the string occupy. */ ulint dtype_get_at_most_n_mbchars( /*========================*/ - dtype_t* dtype, - ulint prefix_len, - ulint data_len, - const char* str) + /* out: length of the prefix, + in bytes */ + const dtype_t* dtype, /* in: data type */ + ulint prefix_len, /* in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /* in: length of str (in bytes) */ + const char* str) /* in: the string whose prefix + length is being determined */ { +#ifndef UNIV_HOTBACKUP ut_a(data_len != UNIV_SQL_NULL); + ut_ad(!dtype->mbmaxlen || !(prefix_len % dtype->mbmaxlen)); - if (dtype_str_needs_mysql_cmp(dtype)) { + if (dtype->mbminlen != dtype->mbmaxlen) { + ut_a(!(prefix_len % dtype->mbmaxlen)); return(innobase_get_at_most_n_mbchars( dtype_get_charset_coll(dtype->prtype), prefix_len, data_len, str)); } - /* We assume here that the string types that InnoDB itself can compare - are single-byte charsets! */ - if (prefix_len < data_len) { return(prefix_len); @@ -99,6 +79,12 @@ dtype_get_at_most_n_mbchars( } return(data_len); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. 
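A worked example of the rewritten dtype_get_at_most_n_mbchars(), assuming a UTF-8 column with mbminlen = 1 and mbmaxlen = 3 (the call and the values are illustrative, not taken from the patch):

	/* a 3-character prefix arrives as prefix_len = 3 * 3 = 9 bytes;
	the string below is 3 characters but 4 bytes in UTF-8 */
	len = dtype_get_at_most_n_mbchars(dtype, 9, 4, "a\xC3\xA9z");
	/* per the contract documented above, len == 4: the bytes
	occupied by the first three characters */

For a fixed-width charset (mbminlen == mbmaxlen) the function simply returns the smaller of prefix_len and data_len.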
*/ + ut_error; +#endif /* UNIV_HOTBACKUP */ } /************************************************************************* @@ -206,9 +192,11 @@ dtype_validate( ut_a((type->mtype >= DATA_VARCHAR) && (type->mtype <= DATA_MYSQL)); if (type->mtype == DATA_SYS) { - ut_a(type->prtype <= DATA_MIX_ID); + ut_a((type->prtype & DATA_MYSQL_TYPE_MASK) < DATA_N_SYS_COLS); } + ut_a(type->mbminlen <= type->mbmaxlen); + return(TRUE); } diff --git a/innobase/dict/dict0boot.c b/innobase/dict/dict0boot.c index f156cf67a18..18a707a1b93 100644 --- a/innobase/dict/dict0boot.c +++ b/innobase/dict/dict0boot.c @@ -66,15 +66,6 @@ dict_hdr_get_new_id( dict_hdr = dict_hdr_get(&mtr); id = mtr_read_dulint(dict_hdr + type, &mtr); - - /* Add some dummy code here because otherwise pgcc seems to - compile wrong */ - - if (0 == ut_dulint_cmp(id, ut_dulint_max)) { - /* TO DO: remove this code, or make it conditional */ - ut_dbg_null_ptr = 0; - } - id = ut_dulint_add(id, 1); mlog_write_dulint(dict_hdr + type, id, &mtr); @@ -158,7 +149,7 @@ dict_hdr_create( /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_TABLES_ID, mtr); + DICT_HDR_SPACE, DICT_TABLES_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -168,7 +159,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, - DICT_TABLE_IDS_ID, mtr); + DICT_TABLE_IDS_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -178,7 +169,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_COLUMNS_ID, mtr); + DICT_HDR_SPACE, DICT_COLUMNS_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -188,7 +179,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_INDEXES_ID, mtr); + DICT_HDR_SPACE, DICT_INDEXES_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -198,7 +189,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_FIELDS_ID, mtr); + DICT_HDR_SPACE, DICT_FIELDS_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -223,6 +214,7 @@ dict_boot(void) dict_index_t* index; dict_hdr_t* dict_hdr; mtr_t mtr; + ibool success; mtr_start(&mtr); @@ -254,7 +246,7 @@ dict_boot(void) /* Insert into the dictionary cache the descriptions of the basic system tables */ /*-------------------------*/ - table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE,8); + table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, FALSE); dict_mem_table_add_col(table, "NAME", DATA_BINARY, 0, 0, 0); dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0); @@ -275,22 +267,22 @@ dict_boot(void) dict_mem_index_add_field(index, "NAME", 0, 0); - index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_TABLES, - MLOG_4BYTES, &mtr); index->id = DICT_TABLES_ID; - ut_a(dict_index_add_to_cache(table, index)); + success = dict_index_add_to_cache(table, index, mtr_read_ulint( + dict_hdr + DICT_HDR_TABLES, MLOG_4BYTES, &mtr)); + ut_a(success); /*-------------------------*/ index = dict_mem_index_create("SYS_TABLES", "ID_IND", DICT_HDR_SPACE, DICT_UNIQUE, 1); dict_mem_index_add_field(index, "ID", 0, 0); - index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_TABLE_IDS, - MLOG_4BYTES, &mtr); index->id = DICT_TABLE_IDS_ID; - 
ut_a(dict_index_add_to_cache(table, index)); + success = dict_index_add_to_cache(table, index, mtr_read_ulint( + dict_hdr + DICT_HDR_TABLE_IDS, MLOG_4BYTES, &mtr)); + ut_a(success); /*-------------------------*/ - table = dict_mem_table_create("SYS_COLUMNS",DICT_HDR_SPACE,7); + table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, FALSE); dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY,0,0,0); dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 0); @@ -311,12 +303,12 @@ dict_boot(void) dict_mem_index_add_field(index, "TABLE_ID", 0, 0); dict_mem_index_add_field(index, "POS", 0, 0); - index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_COLUMNS, - MLOG_4BYTES, &mtr); index->id = DICT_COLUMNS_ID; - ut_a(dict_index_add_to_cache(table, index)); + success = dict_index_add_to_cache(table, index, mtr_read_ulint( + dict_hdr + DICT_HDR_COLUMNS, MLOG_4BYTES, &mtr)); + ut_a(success); /*-------------------------*/ - table = dict_mem_table_create("SYS_INDEXES",DICT_HDR_SPACE,7); + table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, FALSE); dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY, 0,0,0); dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0); @@ -333,6 +325,9 @@ dict_boot(void) #if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2 #error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2" #endif +#if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2 +#error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2" +#endif table->id = DICT_INDEXES_ID; dict_table_add_to_cache(table); @@ -344,12 +339,12 @@ dict_boot(void) dict_mem_index_add_field(index, "TABLE_ID", 0, 0); dict_mem_index_add_field(index, "ID", 0, 0); - index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_INDEXES, - MLOG_4BYTES, &mtr); index->id = DICT_INDEXES_ID; - ut_a(dict_index_add_to_cache(table, index)); + success = dict_index_add_to_cache(table, index, mtr_read_ulint( + dict_hdr + DICT_HDR_INDEXES, MLOG_4BYTES, &mtr)); + ut_a(success); /*-------------------------*/ - table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE,3); + table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, FALSE); dict_mem_table_add_col(table, "INDEX_ID", DATA_BINARY, 0,0,0); dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 0); @@ -365,10 +360,10 @@ dict_boot(void) dict_mem_index_add_field(index, "INDEX_ID", 0, 0); dict_mem_index_add_field(index, "POS", 0, 0); - index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_FIELDS, - MLOG_4BYTES, &mtr); index->id = DICT_FIELDS_ID; - ut_a(dict_index_add_to_cache(table, index)); + success = dict_index_add_to_cache(table, index, mtr_read_ulint( + dict_hdr + DICT_HDR_FIELDS, MLOG_4BYTES, &mtr)); + ut_a(success); mtr_commit(&mtr); /*-------------------------*/ diff --git a/innobase/dict/dict0crea.c b/innobase/dict/dict0crea.c index d9e89316613..c7d6ffd2c22 100644 --- a/innobase/dict/dict0crea.c +++ b/innobase/dict/dict0crea.c @@ -63,8 +63,8 @@ dict_create_sys_tables_tuple( dfield = dtuple_get_nth_field(entry, 2); ptr = mem_heap_alloc(heap, 4); - mach_write_to_4(ptr, table->n_def); - + mach_write_to_4(ptr, table->n_def + | ((ulint) table->comp << 31)); dfield_set_data(dfield, ptr, 4); /* 5: TYPE -----------------------------*/ dfield = dtuple_get_nth_field(entry, 3); @@ -82,16 +82,6 @@ dict_create_sys_tables_tuple( dfield_set_data(dfield, ptr, 8); /* 7: MIX_LEN --------------------------*/ - /* Track corruption reported on mailing list Jan 14, 2005 */ - if (table->mix_len != 0 && table->mix_len != 0x80000000) { - fprintf(stderr, -"InnoDB: Error: mix_len is %lu in table %s\n", (ulong)table->mix_len, - table->name); - 
mem_analyze_corruption((byte*)&(table->mix_len)); - - ut_error; - } - dfield = dtuple_get_nth_field(entry, 5); ptr = mem_heap_alloc(heap, 4); @@ -219,6 +209,8 @@ dict_build_table_def_step( const char* path_or_name; ibool is_path; mtr_t mtr; + ulint i; + ulint row_len; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(dict_sys->mutex))); @@ -230,6 +222,15 @@ dict_build_table_def_step( thr_get_trx(thr)->table_id = table->id; + row_len = 0; + for (i = 0; i < table->n_def; i++) { + row_len += dtype_get_min_size(dict_col_get_type( + &table->cols[i])); + } + if (row_len > BTR_PAGE_MAX_REC_SIZE) { + return(DB_TOO_BIG_RECORD); + } + if (table->type == DICT_TABLE_CLUSTER_MEMBER) { cluster_table = dict_table_get_low(table->cluster_name); @@ -554,9 +555,7 @@ dict_build_index_def_step( table in the same tablespace */ index->space = table->space; - - index->page_no = FIL_NULL; - + node->page_no = FIL_NULL; row = dict_create_sys_indexes_tuple(index, node->heap); node->ind_row = row; @@ -634,18 +633,18 @@ dict_create_index_tree_step( btr_pcur_move_to_next_user_rec(&pcur, &mtr); - index->page_no = btr_create(index->type, index->space, index->id, - &mtr); + node->page_no = btr_create(index->type, index->space, index->id, + table->comp, &mtr); /* printf("Created a new index tree in space %lu root page %lu\n", index->space, index->page_no); */ page_rec_write_index_page_no(btr_pcur_get_rec(&pcur), DICT_SYS_INDEXES_PAGE_NO_FIELD, - index->page_no, &mtr); + node->page_no, &mtr); btr_pcur_close(&pcur); mtr_commit(&mtr); - if (index->page_no == FIL_NULL) { + if (node->page_no == FIL_NULL) { return(DB_OUT_OF_FILE_SPACE); } @@ -671,8 +670,9 @@ dict_drop_index_tree( #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(dict_sys->mutex))); #endif /* UNIV_SYNC_DEBUG */ - - ptr = rec_get_nth_field(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); + + ut_a(!dict_sys->sys_indexes->comp); + ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); ut_ad(len == 4); @@ -684,8 +684,9 @@ dict_drop_index_tree( return; } - ptr = rec_get_nth_field(rec, DICT_SYS_INDEXES_SPACE_NO_FIELD, &len); - + ptr = rec_get_nth_field_old(rec, + DICT_SYS_INDEXES_SPACE_NO_FIELD, &len); + ut_ad(len == 4); space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); @@ -710,8 +711,132 @@ dict_drop_index_tree( root_page_no); */ btr_free_root(space, root_page_no, mtr); + page_rec_write_index_page_no(rec, + DICT_SYS_INDEXES_PAGE_NO_FIELD, FIL_NULL, mtr); +} + +/*********************************************************************** +Truncates the index tree associated with a row in SYS_INDEXES table. */ + +ulint +dict_truncate_index_tree( +/*=====================*/ + /* out: new root page number, or + FIL_NULL on failure */ + dict_table_t* table, /* in: the table the index belongs to */ + rec_t* rec, /* in: record in the clustered index of + SYS_INDEXES table */ + mtr_t* mtr) /* in: mtr having the latch + on the record page. The mtr may be + committed and restarted in this call. */ +{ + ulint root_page_no; + ulint space; + ulint type; + dulint index_id; + byte* ptr; + ulint len; + ulint comp; + dict_index_t* index; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + ut_a(!dict_sys->sys_indexes->comp); + ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); + + ut_ad(len == 4); + + root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + + if (root_page_no == FIL_NULL) { + /* The tree has been freed. 
*/ + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Trying to TRUNCATE" + " a missing index of table %s!\n", table->name); + return(FIL_NULL); + } + + ptr = rec_get_nth_field_old(rec, + DICT_SYS_INDEXES_SPACE_NO_FIELD, &len); + + ut_ad(len == 4); + + space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + + if (!fil_tablespace_exists_in_mem(space)) { + /* It is a single table tablespace and the .ibd file is + missing: do nothing */ + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Trying to TRUNCATE" + " a missing .ibd file of table %s!\n", table->name); + return(FIL_NULL); + } + + ptr = rec_get_nth_field_old(rec, + DICT_SYS_INDEXES_TYPE_FIELD, &len); + ut_ad(len == 4); + type = mach_read_from_4(ptr); + + ptr = rec_get_nth_field_old(rec, 1, &len); + ut_ad(len == 8); + index_id = mach_read_from_8(ptr); + + /* We free all the pages but the root page first; this operation + may span several mini-transactions */ + + btr_free_but_not_root(space, root_page_no); + + /* Then we free the root page in the same mini-transaction where + we create the b-tree and write its new root page number to the + appropriate field in the SYS_INDEXES record: this mini-transaction + marks the B-tree totally truncated */ + + comp = page_is_comp(btr_page_get( + space, root_page_no, RW_X_LATCH, mtr)); + + btr_free_root(space, root_page_no, mtr); + /* We will temporarily write FIL_NULL to the PAGE_NO field + in SYS_INDEXES, so that the database will not get into an + inconsistent state in case it crashes between the mtr_commit() + below and the following mtr_commit() call. */ page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, FIL_NULL, mtr); + + /* We will need to commit the mini-transaction in order to avoid + deadlocks in the btr_create() call, because otherwise we would + be freeing and allocating pages in the same mini-transaction. */ + mtr_commit(mtr); + /* mtr_commit() will invalidate rec. */ + rec = NULL; + mtr_start(mtr); + + /* Find the index corresponding to this SYS_INDEXES record. */ + for (index = UT_LIST_GET_FIRST(table->indexes); + index; + index = UT_LIST_GET_NEXT(indexes, index)) { + if (!ut_dulint_cmp(index->id, index_id)) { + break; + } + } + + root_page_no = btr_create(type, space, index_id, comp, mtr); + if (index) { + index->tree->page = root_page_no; + } else { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Index %lu %lu of table %s is missing\n" + "InnoDB: from the data dictionary during TRUNCATE!\n", + ut_dulint_get_high(index_id), + ut_dulint_get_low(index_id), + table->name); + } + + return(root_page_no); } /************************************************************************* @@ -770,6 +895,7 @@ ind_create_graph_create( node->index = index; node->state = INDEX_BUILD_INDEX_DEF; + node->page_no = FIL_NULL; node->heap = mem_heap_create(256); node->ind_def = ins_node_create(INS_DIRECT, @@ -989,7 +1115,8 @@ dict_create_index_step( if (node->state == INDEX_ADD_TO_CACHE) { - success = dict_index_add_to_cache(node->table, node->index); + success = dict_index_add_to_cache(node->table, node->index, + node->page_no); ut_a(success); diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c index b0327f77fd3..9580a80e7e7 100644 --- a/innobase/dict/dict0dict.c +++ b/innobase/dict/dict0dict.c @@ -53,6 +53,7 @@ rw_lock_t dict_operation_lock; /* table create, drop, etc. 
reserve /* Identifies generated InnoDB foreign key names */ static char dict_ibfk[] = "_ibfk_"; +#ifndef UNIV_HOTBACKUP /********************************************************************** Compares NUL-terminated UTF-8 strings case insensitively. @@ -76,6 +77,7 @@ void innobase_casedn_str( /*================*/ char* a); /* in/out: string to put in lower case */ +#endif /* !UNIV_HOTBACKUP */ /************************************************************************** Adds a column to the data dictionary hash table. */ @@ -824,23 +826,22 @@ dict_table_add_to_cache( system columns. */ dict_mem_table_add_col(table, "DB_ROW_ID", DATA_SYS, - DATA_ROW_ID, 0, 0); + DATA_ROW_ID | DATA_NOT_NULL, DATA_ROW_ID_LEN, 0); #if DATA_ROW_ID != 0 #error "DATA_ROW_ID != 0" #endif dict_mem_table_add_col(table, "DB_TRX_ID", DATA_SYS, - DATA_TRX_ID, 0, 0); + DATA_TRX_ID | DATA_NOT_NULL, DATA_TRX_ID_LEN, 0); #if DATA_TRX_ID != 1 #error "DATA_TRX_ID != 1" #endif dict_mem_table_add_col(table, "DB_ROLL_PTR", DATA_SYS, - DATA_ROLL_PTR, 0, 0); + DATA_ROLL_PTR | DATA_NOT_NULL, DATA_ROLL_PTR_LEN, 0); #if DATA_ROLL_PTR != 2 #error "DATA_ROLL_PTR != 2" #endif - dict_mem_table_add_col(table, "DB_MIX_ID", DATA_SYS, - DATA_MIX_ID, 0, 0); + DATA_MIX_ID | DATA_NOT_NULL, DATA_MIX_ID_LEN, 0); #if DATA_MIX_ID != 3 #error "DATA_MIX_ID != 3" #endif @@ -1385,8 +1386,9 @@ dict_index_add_to_cache( /*====================*/ /* out: TRUE if success */ dict_table_t* table, /* in: table on which the index is */ - dict_index_t* index) /* in, own: index; NOTE! The index memory + dict_index_t* index, /* in, own: index; NOTE! The index memory object is freed in this function! */ + ulint page_no)/* in: root page number of the index */ { dict_index_t* new_index; dict_tree_t* tree; @@ -1451,7 +1453,7 @@ dict_index_add_to_cache( /* Increment the ord_part counts in columns which are ordering */ - if (index->type & DICT_UNIVERSAL) { + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { n_ord = new_index->n_fields; } else { n_ord = dict_index_get_n_unique(new_index); @@ -1472,16 +1474,15 @@ dict_index_add_to_cache( tree = dict_index_get_tree( UT_LIST_GET_FIRST(cluster->indexes)); new_index->tree = tree; - new_index->page_no = tree->page; } else { /* Create an index tree memory object for the index */ - tree = dict_tree_create(new_index); + tree = dict_tree_create(new_index, page_no); ut_ad(tree); new_index->tree = tree; } - if (!(new_index->type & DICT_UNIVERSAL)) { + if (!UNIV_UNLIKELY(new_index->type & DICT_UNIVERSAL)) { new_index->stat_n_diff_key_vals = mem_heap_alloc(new_index->heap, @@ -1598,7 +1599,7 @@ dict_index_find_cols( /*********************************************************************** Adds a column to index. */ -UNIV_INLINE + void dict_index_add_col( /*===============*/ @@ -1614,6 +1615,34 @@ dict_index_add_col( field = dict_index_get_nth_field(index, index->n_def - 1); field->col = col; + field->fixed_len = dtype_get_fixed_size(&col->type); + + if (prefix_len && field->fixed_len > prefix_len) { + field->fixed_len = prefix_len; + } + + /* Long fixed-length fields that need external storage are treated as + variable-length fields, so that the extern flag can be embedded in + the length word. 
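The fixed_len / fixed_offs bookkeeping added to dict_index_add_col() above amounts to a running byte offset that survives only while every preceding field has a fixed length. For example, assuming dtype_get_fixed_size() yields 4, 10 and 0 for an INT NOT NULL, a CHAR(10) in a single-byte charset and a VARCHAR(20), the three index fields get fixed_len 4, 10, 0 and fixed_offs 0, 4, 14; any field added after the VARCHAR gets fixed_offs = ULINT_UNDEFINED, because the chain is broken by the variable-length column.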
*/ + + if (field->fixed_len > DICT_MAX_COL_PREFIX_LEN) { + field->fixed_len = 0; + } + + if (!(dtype_get_prtype(&col->type) & DATA_NOT_NULL)) { + index->n_nullable++; + } + + if (index->n_def > 1) { + const dict_field_t* field2 = + dict_index_get_nth_field(index, index->n_def - 2); + field->fixed_offs = (!field2->fixed_len || + field2->fixed_offs == ULINT_UNDEFINED) + ? ULINT_UNDEFINED + : field2->fixed_len + field2->fixed_offs; + } else { + field->fixed_offs = 0; + } } /*********************************************************************** @@ -1654,7 +1683,7 @@ dict_index_copy_types( dtype_t* type; ulint i; - if (index->type & DICT_UNIVERSAL) { + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { dtuple_set_types_binary(tuple, n_fields); return; @@ -1732,7 +1761,6 @@ dict_index_build_internal_clust( new_index->n_user_defined_cols = index->n_fields; new_index->id = index->id; - new_index->page_no = index->page_no; if (table->type != DICT_TABLE_ORDINARY) { /* The index is mixed: copy common key prefix fields */ @@ -1751,7 +1779,7 @@ dict_index_build_internal_clust( dict_index_copy(new_index, index, 0, index->n_fields); } - if (index->type & DICT_UNIVERSAL) { + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { /* No fixed number of fields determines an entry uniquely */ new_index->n_uniq = ULINT_MAX; @@ -1911,7 +1939,6 @@ dict_index_build_internal_non_clust( new_index->n_user_defined_cols = index->n_fields; new_index->id = index->id; - new_index->page_no = index->page_no; /* Copy fields from index to new_index */ dict_index_copy(new_index, index, 0, index->n_fields); @@ -2080,6 +2107,7 @@ dict_foreign_find_index( dict_index_t* types_idx)/* in: NULL or an index to whose types the column types must match */ { +#ifndef UNIV_HOTBACKUP dict_index_t* index; const char* col_name; ulint i; @@ -2124,6 +2152,12 @@ dict_foreign_find_index( } return(NULL); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ } /************************************************************************** @@ -2462,7 +2496,7 @@ dict_scan_id( my_isspace(). Only after that, convert id names to UTF-8. */ b = (byte*)(*id); - id_len = strlen(b); + id_len = strlen((char*) b); if (id_len >= 3 && b[id_len - 1] == 0xA0 && b[id_len - 2] == 0xC2) { @@ -2491,6 +2525,7 @@ dict_scan_col( const char** name) /* out,own: the column name; NULL if no name was scannable */ { +#ifndef UNIV_HOTBACKUP dict_col_t* col; ulint i; @@ -2524,6 +2559,12 @@ dict_scan_col( } return(ptr); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ } /************************************************************************* @@ -2541,6 +2582,7 @@ dict_scan_table_name( const char** ref_name)/* out,own: the table name; NULL if no name was scannable */ { +#ifndef UNIV_HOTBACKUP const char* database_name = NULL; ulint database_name_len = 0; const char* table_name = NULL; @@ -2622,6 +2664,12 @@ dict_scan_table_name( *table = dict_table_get_low(ref); return(ptr); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. 
*/ + ut_error; +#endif /* UNIV_HOTBACKUP */ } /************************************************************************* @@ -3570,9 +3618,10 @@ dict_tree_t* dict_tree_create( /*=============*/ /* out, own: created tree */ - dict_index_t* index) /* in: the index for which to create: in the + dict_index_t* index, /* in: the index for which to create: in the case of a mixed tree, this should be the index of the cluster object */ + ulint page_no)/* in: root page number of the index */ { dict_tree_t* tree; @@ -3582,7 +3631,7 @@ dict_tree_create( tree->type = index->type; tree->space = index->space; - tree->page = index->page_no; + tree->page = page_no; tree->id = index->id; @@ -3633,12 +3682,13 @@ dict_tree_find_index_low( table = index->table; if ((index->type & DICT_CLUSTERED) - && (table->type != DICT_TABLE_ORDINARY)) { + && UNIV_UNLIKELY(table->type != DICT_TABLE_ORDINARY)) { /* Get the mix id of the record */ + ut_a(!table->comp); mix_id = mach_dulint_read_compressed( - rec_get_nth_field(rec, table->mix_len, &len)); + rec_get_nth_field_old(rec, table->mix_len, &len)); while (ut_dulint_cmp(table->mix_id, mix_id) != 0) { @@ -3717,6 +3767,29 @@ dict_tree_find_index_for_tuple( return(index); } +/*********************************************************************** +Checks if a table which is a mixed cluster member owns a record. */ + +ibool +dict_is_mixed_table_rec( +/*====================*/ + /* out: TRUE if the record belongs to this + table */ + dict_table_t* table, /* in: table in a mixed cluster */ + rec_t* rec) /* in: user record in the clustered index */ +{ + byte* mix_id_field; + ulint len; + + ut_ad(!table->comp); + + mix_id_field = rec_get_nth_field_old(rec, + table->mix_len, &len); + + return(len == table->mix_id_len + && !ut_memcmp(table->mix_id_buf, mix_id_field, len)); +} + /************************************************************************** Checks that a tuple has n_fields_cmp value in a sensible range, so that no comparison can occur with the page number field in a node pointer. 
*/ @@ -3765,13 +3838,14 @@ dict_tree_build_node_ptr( ind = dict_tree_find_index_low(tree, rec); - if (tree->type & DICT_UNIVERSAL) { + if (UNIV_UNLIKELY(tree->type & DICT_UNIVERSAL)) { /* In a universal index tree, we take the whole record as the node pointer if the reord is on the leaf level, on non-leaf levels we remove the last field, which contains the page number of the child page */ - n_unique = rec_get_n_fields(rec); + ut_a(!ind->table->comp); + n_unique = rec_get_n_fields_old(rec); if (level > 0) { ut_a(n_unique > 1); @@ -3800,9 +3874,11 @@ dict_tree_build_node_ptr( field = dtuple_get_nth_field(tuple, n_unique); dfield_set_data(field, buf, 4); - dtype_set(dfield_get_type(field), DATA_SYS_CHILD, 0, 0, 0); + dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4, 0); - rec_copy_prefix_to_dtuple(tuple, rec, n_unique, heap); + rec_copy_prefix_to_dtuple(tuple, rec, ind, n_unique, heap); + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) | + REC_STATUS_NODE_PTR); ut_ad(dtuple_check_typed(tuple)); @@ -3819,27 +3895,27 @@ dict_tree_copy_rec_order_prefix( /* out: pointer to the prefix record */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to copy prefix */ + ulint* n_fields,/* out: number of fields copied */ byte** buf, /* in/out: memory buffer for the copied prefix, or NULL */ ulint* buf_size)/* in/out: buffer size */ { - dict_index_t* ind; - rec_t* order_rec; - ulint n_fields; - - ind = dict_tree_find_index_low(tree, rec); + dict_index_t* index; + ulint n; - n_fields = dict_index_get_n_unique_in_tree(ind); - - if (tree->type & DICT_UNIVERSAL) { + UNIV_PREFETCH_R(rec); + index = dict_tree_find_index_low(tree, rec); - n_fields = rec_get_n_fields(rec); + if (UNIV_UNLIKELY(tree->type & DICT_UNIVERSAL)) { + ut_a(!index->table->comp); + n = rec_get_n_fields_old(rec); + } else { + n = dict_index_get_n_unique_in_tree(index); } - order_rec = rec_copy_prefix_to_buf(rec, n_fields, buf, buf_size); - - return(order_rec); -} + *n_fields = n; + return(rec_copy_prefix_to_buf(rec, index, n, buf, buf_size)); +} /************************************************************************** Builds a typed data tuple out of a physical record. */ @@ -3850,21 +3926,21 @@ dict_tree_build_data_tuple( /* out, own: data tuple */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to build data tuple */ + ulint n_fields,/* in: number of data fields */ mem_heap_t* heap) /* in: memory heap where tuple created */ { dtuple_t* tuple; dict_index_t* ind; - ulint n_fields; ind = dict_tree_find_index_low(tree, rec); - n_fields = rec_get_n_fields(rec); + ut_ad(ind->table->comp || n_fields <= rec_get_n_fields_old(rec)); tuple = dtuple_create(heap, n_fields); dict_index_copy_types(tuple, ind, n_fields); - rec_copy_prefix_to_dtuple(tuple, rec, n_fields, heap); + rec_copy_prefix_to_dtuple(tuple, rec, ind, n_fields, heap); ut_ad(dtuple_check_typed(tuple)); @@ -3882,6 +3958,27 @@ dict_index_calc_min_rec_len( ulint sum = 0; ulint i; + if (UNIV_LIKELY(index->table->comp)) { + ulint nullable = 0; + sum = REC_N_NEW_EXTRA_BYTES; + for (i = 0; i < dict_index_get_n_fields(index); i++) { + dtype_t*t = dict_index_get_nth_type(index, i); + ulint size = dtype_get_fixed_size(t); + sum += size; + if (!size) { + size = dtype_get_len(t); + sum += size < 128 ? 
1 : 2; + } + if (!(dtype_get_prtype(t) & DATA_NOT_NULL)) + nullable++; + } + + /* round the NULL flags up to full bytes */ + sum += (nullable + 7) / 8; + + return(sum); + } + for (i = 0; i < dict_index_get_n_fields(index); i++) { sum += dtype_get_fixed_size(dict_index_get_nth_type(index, i)); } @@ -3892,7 +3989,7 @@ dict_index_calc_min_rec_len( sum += dict_index_get_n_fields(index); } - sum += REC_N_EXTRA_BYTES; + sum += REC_N_OLD_EXTRA_BYTES; return(sum); } @@ -4181,9 +4278,11 @@ dict_index_print_low( putc('\n', stderr); -/* btr_print_size(tree); */ +#ifdef UNIV_BTR_PRINT + btr_print_size(tree); -/* btr_print_tree(tree, 7); */ + btr_print_tree(tree, 7); +#endif /* UNIV_BTR_PRINT */ } /************************************************************************** diff --git a/innobase/dict/dict0load.c b/innobase/dict/dict0load.c index 0d58823a2ea..9bafcf33553 100644 --- a/innobase/dict/dict0load.c +++ b/innobase/dict/dict0load.c @@ -8,6 +8,9 @@ Created 4/24/1996 Heikki Tuuri *******************************************************/ #include "dict0load.h" +#ifndef UNIV_HOTBACKUP +#include "mysql_version.h" +#endif /* !UNIV_HOTBACKUP */ #ifdef UNIV_NONINL #include "dict0load.ic" @@ -55,6 +58,7 @@ dict_get_first_table_name_in_db( sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!sys_tables->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -77,7 +81,7 @@ loop: return(NULL); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); if (len < strlen(name) || ut_memcmp(name, field, strlen(name)) != 0) { @@ -90,7 +94,7 @@ loop: return(NULL); } - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, sys_tables->comp)) { /* We found one */ @@ -163,9 +167,9 @@ loop: return; } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, sys_tables->comp)) { /* We found one */ @@ -180,7 +184,7 @@ loop: if (table == NULL) { fputs("InnoDB: Failed to load table ", stderr); - ut_print_namel(stderr, NULL, field, len); + ut_print_namel(stderr, NULL, (char*) field, len); putc('\n', stderr); } else { /* The table definition was corrupt if there @@ -231,6 +235,7 @@ dict_check_tablespaces_and_store_max_id( sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!sys_tables->comp); btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); @@ -257,15 +262,15 @@ loop: return; } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, sys_tables->comp)) { /* We found one */ char* name = mem_strdupl((char*) field, len); - field = rec_get_nth_field(rec, 9, &len); + field = rec_get_nth_field_old(rec, 9, &len); ut_a(len == 4); space_id = mach_read_from_4(field); @@ -338,6 +343,7 @@ dict_load_columns( sys_columns = dict_table_get_low("SYS_COLUMNS"); sys_index = UT_LIST_GET_FIRST(sys_columns->indexes); + ut_a(!sys_columns->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -356,28 +362,27 @@ dict_load_columns( ut_a(btr_pcur_is_on_user_rec(&pcur, &mtr)); - ut_a(!rec_get_deleted_flag(rec)); - - field = rec_get_nth_field(rec, 0, &len); + ut_a(!rec_get_deleted_flag(rec, sys_columns->comp)); + + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); ut_a(ut_dulint_cmp(table->id, 
mach_read_from_8(field)) == 0); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_ad(len == 4); ut_a(i == mach_read_from_4(field)); ut_a(0 == ut_strcmp("NAME", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_columns), 4))->name)); + dict_index_get_nth_field(sys_index, 4))->name)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); name = mem_heap_strdupl(heap, (char*) field, len); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); mtype = mach_read_from_4(field); - field = rec_get_nth_field(rec, 6, &len); + field = rec_get_nth_field_old(rec, 6, &len); prtype = mach_read_from_4(field); if (dtype_is_non_binary_string_type(mtype, prtype) @@ -389,15 +394,14 @@ dict_load_columns( data_mysql_default_charset_coll); } - field = rec_get_nth_field(rec, 7, &len); + field = rec_get_nth_field_old(rec, 7, &len); col_len = mach_read_from_4(field); ut_a(0 == ut_strcmp("PREC", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_columns), 8))->name)); + dict_index_get_nth_field(sys_index, 8))->name)); - field = rec_get_nth_field(rec, 8, &len); + field = rec_get_nth_field_old(rec, 8, &len); prec = mach_read_from_4(field); dict_mem_table_add_col(table, name, mtype, prtype, col_len, @@ -462,6 +466,7 @@ dict_load_fields( sys_fields = dict_table_get_low("SYS_FIELDS"); sys_index = UT_LIST_GET_FIRST(sys_fields->indexes); + ut_a(!sys_fields->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -479,15 +484,15 @@ dict_load_fields( rec = btr_pcur_get_rec(&pcur); ut_a(btr_pcur_is_on_user_rec(&pcur, &mtr)); - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, sys_fields->comp)) { dict_load_report_deleted_index(table->name, i); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); ut_a(ut_memcmp(buf, field, len) == 0); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_a(len == 4); /* The next field stores the field position in the index @@ -513,10 +518,9 @@ dict_load_fields( ut_a(0 == ut_strcmp("COL_NAME", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_fields), 4))->name)); + dict_index_get_nth_field(sys_index, 4))->name)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); dict_mem_index_add_field(index, mem_heap_strdupl(heap, (char*) field, len), 0, prefix_len); @@ -575,6 +579,7 @@ dict_load_indexes( sys_indexes = dict_table_get_low("SYS_INDEXES"); sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes); + ut_a(!sys_indexes->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -595,14 +600,14 @@ dict_load_indexes( rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); if (ut_memcmp(buf, field, len) != 0) { break; } - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, table->comp)) { dict_load_report_deleted_index(table->name, ULINT_UNDEFINED); @@ -612,33 +617,31 @@ dict_load_indexes( return(FALSE); } - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_ad(len == 8); id = mach_read_from_8(field); ut_a(0 == ut_strcmp("NAME", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_indexes), 4))->name)); - - field = rec_get_nth_field(rec, 4, 
&name_len); + dict_index_get_nth_field(sys_index, 4))->name)); + + field = rec_get_nth_field_old(rec, 4, &name_len); name_buf = mem_heap_strdupl(heap, (char*) field, name_len); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); n_fields = mach_read_from_4(field); - field = rec_get_nth_field(rec, 6, &len); + field = rec_get_nth_field_old(rec, 6, &len); type = mach_read_from_4(field); - field = rec_get_nth_field(rec, 7, &len); + field = rec_get_nth_field_old(rec, 7, &len); space = mach_read_from_4(field); ut_a(0 == ut_strcmp("PAGE_NO", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_indexes), 8))->name)); + dict_index_get_nth_field(sys_index, 8))->name)); - field = rec_get_nth_field(rec, 8, &len); + field = rec_get_nth_field_old(rec, 8, &len); page_no = mach_read_from_4(field); if (page_no == FIL_NULL) { @@ -680,12 +683,10 @@ dict_load_indexes( } else { index = dict_mem_index_create(table->name, name_buf, space, type, n_fields); - index->page_no = page_no; index->id = id; dict_load_fields(table, index, heap); - - dict_index_add_to_cache(table, index); + dict_index_add_to_cache(table, index, page_no); } btr_pcur_move_to_next_user_rec(&pcur, &mtr); @@ -741,6 +742,7 @@ dict_load_table( sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!sys_tables->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -753,7 +755,7 @@ dict_load_table( rec = btr_pcur_get_rec(&pcur); if (!btr_pcur_is_on_user_rec(&pcur, &mtr) - || rec_get_deleted_flag(rec)) { + || rec_get_deleted_flag(rec, sys_tables->comp)) { /* Not found */ btr_pcur_close(&pcur); @@ -763,11 +765,10 @@ dict_load_table( return(NULL); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); /* Check if the table name in record is the searched one */ if (len != ut_strlen(name) || ut_memcmp(name, field, len) != 0) { - err_exit: btr_pcur_close(&pcur); mtr_commit(&mtr); mem_heap_free(heap); @@ -777,10 +778,9 @@ dict_load_table( ut_a(0 == ut_strcmp("SPACE", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_tables), 9))->name)); + dict_index_get_nth_field(sys_index, 9))->name)); - field = rec_get_nth_field(rec, 9, &len); + field = rec_get_nth_field_old(rec, 9, &len); space = mach_read_from_4(field); /* Check if the tablespace exists and has the right name */ @@ -812,49 +812,44 @@ dict_load_table( ut_a(0 == ut_strcmp("N_COLS", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_tables), 4))->name)); + dict_index_get_nth_field(sys_index, 4))->name)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); n_cols = mach_read_from_4(field); - if (n_cols & 0x80000000UL) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: table %s is in the new compact format\n" - "InnoDB: of MySQL 5.0.3 or later\n", name); - goto err_exit; - } - table = dict_mem_table_create(name, space, n_cols); + /* The high-order bit of N_COLS is the "compact format" flag. 
*/ + table = dict_mem_table_create(name, space, + n_cols & ~0x80000000UL, + !!(n_cols & 0x80000000UL)); table->ibd_file_missing = ibd_file_missing; ut_a(0 == ut_strcmp("ID", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_tables), 3))->name)); + dict_index_get_nth_field(sys_index, 3))->name)); - field = rec_get_nth_field(rec, 3, &len); + field = rec_get_nth_field_old(rec, 3, &len); table->id = mach_read_from_8(field); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); table->type = mach_read_from_4(field); if (table->type == DICT_TABLE_CLUSTER_MEMBER) { ut_error; #if 0 /* clustered tables have not been implemented yet */ - field = rec_get_nth_field(rec, 6, &len); + field = rec_get_nth_field_old(rec, 6, &len); table->mix_id = mach_read_from_8(field); - field = rec_get_nth_field(rec, 8, &len); + field = rec_get_nth_field_old(rec, 8, &len); table->cluster_name = mem_heap_strdupl(heap, (char*) field, len); #endif } if ((table->type == DICT_TABLE_CLUSTER) || (table->type == DICT_TABLE_CLUSTER_MEMBER)) { - - field = rec_get_nth_field(rec, 7, &len); + + field = rec_get_nth_field_old(rec, 7, &len); + ut_a(len == 4); table->mix_len = mach_read_from_4(field); } @@ -933,6 +928,7 @@ dict_load_table_on_id( sys_tables = dict_sys->sys_tables; sys_table_ids = dict_table_get_next_index( dict_table_get_first_index(sys_tables)); + ut_a(!sys_tables->comp); heap = mem_heap_create(256); tuple = dtuple_create(heap, 1); @@ -949,7 +945,7 @@ dict_load_table_on_id( rec = btr_pcur_get_rec(&pcur); if (!btr_pcur_is_on_user_rec(&pcur, &mtr) - || rec_get_deleted_flag(rec)) { + || rec_get_deleted_flag(rec, sys_tables->comp)) { /* Not found */ btr_pcur_close(&pcur); @@ -964,7 +960,7 @@ dict_load_table_on_id( table ID and NAME */ rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); /* Check if the table id in record is the one searched for */ @@ -978,7 +974,7 @@ dict_load_table_on_id( } /* Now we get the table name from the record */ - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); /* Load the table definition to memory */ table = dict_load_table(mem_heap_strdupl(heap, (char*) field, len)); @@ -1046,6 +1042,7 @@ dict_load_foreign_cols( sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS"); sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes); + ut_a(!sys_foreign_cols->comp); tuple = dtuple_create(foreign->heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -1060,21 +1057,21 @@ dict_load_foreign_cols( rec = btr_pcur_get_rec(&pcur); ut_a(btr_pcur_is_on_user_rec(&pcur, &mtr)); - ut_a(!rec_get_deleted_flag(rec)); - - field = rec_get_nth_field(rec, 0, &len); + ut_a(!rec_get_deleted_flag(rec, sys_foreign_cols->comp)); + + field = rec_get_nth_field_old(rec, 0, &len); ut_a(len == ut_strlen(id)); ut_a(ut_memcmp(id, field, len) == 0); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_a(len == 4); ut_a(i == mach_read_from_4(field)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); foreign->foreign_col_names[i] = mem_heap_strdupl(foreign->heap, (char*) field, len); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); foreign->referenced_col_names[i] = mem_heap_strdupl(foreign->heap, (char*) field, len); @@ -1118,6 +1115,7 @@ dict_load_foreign( sys_foreign = dict_table_get_low("SYS_FOREIGN"); sys_index = 
UT_LIST_GET_FIRST(sys_foreign->indexes); + ut_a(!sys_foreign->comp); tuple = dtuple_create(heap2, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -1130,7 +1128,7 @@ dict_load_foreign( rec = btr_pcur_get_rec(&pcur); if (!btr_pcur_is_on_user_rec(&pcur, &mtr) - || rec_get_deleted_flag(rec)) { + || rec_get_deleted_flag(rec, sys_foreign->comp)) { /* Not found */ fprintf(stderr, @@ -1144,7 +1142,7 @@ dict_load_foreign( return(DB_ERROR); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); /* Check if the id in record is the searched one */ if (len != ut_strlen(id) || ut_memcmp(id, field, len) != 0) { @@ -1167,7 +1165,8 @@ dict_load_foreign( foreign = dict_mem_foreign_create(); - foreign->n_fields = mach_read_from_4(rec_get_nth_field(rec, 5, &len)); + foreign->n_fields = + mach_read_from_4(rec_get_nth_field_old(rec, 5, &len)); ut_a(len == 4); @@ -1178,11 +1177,11 @@ dict_load_foreign( foreign->id = mem_heap_strdup(foreign->heap, id); - field = rec_get_nth_field(rec, 3, &len); + field = rec_get_nth_field_old(rec, 3, &len); foreign->foreign_table_name = mem_heap_strdupl(foreign->heap, (char*) field, len); - - field = rec_get_nth_field(rec, 4, &len); + + field = rec_get_nth_field_old(rec, 4, &len); foreign->referenced_table_name = mem_heap_strdupl(foreign->heap, (char*) field, len); @@ -1250,6 +1249,7 @@ dict_load_foreigns( return(DB_ERROR); } + ut_a(!sys_foreign->comp); mtr_start(&mtr); /* Get the secondary index based on FOR_NAME from table @@ -1281,7 +1281,7 @@ loop: name and a foreign constraint ID */ rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); /* Check if the table name in the record is the one searched for; the following call does the comparison in the latin1_swedish_ci @@ -1304,13 +1304,13 @@ loop: goto next_rec; } - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, sys_foreign->comp)) { goto next_rec; } /* Now we get a foreign key constraint id */ - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); id = mem_heap_strdupl(heap, (char*) field, len); btr_pcur_store_position(&pcur, &mtr); diff --git a/innobase/dict/dict0mem.c b/innobase/dict/dict0mem.c index 1d45585aac1..eec35310039 100644 --- a/innobase/dict/dict0mem.c +++ b/innobase/dict/dict0mem.c @@ -35,12 +35,14 @@ dict_mem_table_create( the table is placed; this parameter is ignored if the table is made a member of a cluster */ - ulint n_cols) /* in: number of columns */ + ulint n_cols, /* in: number of columns */ + ibool comp) /* in: TRUE=compact page format */ { dict_table_t* table; mem_heap_t* heap; ut_ad(name); + ut_ad(comp == FALSE || comp == TRUE); heap = mem_heap_create(DICT_HEAP_SIZE); @@ -54,6 +56,7 @@ dict_mem_table_create( table->space = space; table->ibd_file_missing = FALSE; table->tablespace_discarded = FALSE; + table->comp = comp; table->n_def = 0; table->n_cols = n_cols + DATA_N_SYS_COLS; table->mem_fix = 0; @@ -110,7 +113,8 @@ dict_mem_cluster_create( { dict_table_t* cluster; - cluster = dict_mem_table_create(name, space, n_cols); + /* Clustered tables cannot work with the compact record format. 
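For contrast, a hedged example of the extended dict_mem_table_create() signature for an ordinary table (the table name and column count here are hypothetical):

	dict_table_t*	table;

	/* hypothetical table in space 0 with 3 user columns,
	using the compact record format */
	table = dict_mem_table_create("test/t1", 0, 3, TRUE);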
*/ + cluster = dict_mem_table_create(name, space, n_cols, FALSE); cluster->type = DICT_TABLE_CLUSTER; cluster->mix_len = mix_len; @@ -197,7 +201,7 @@ dict_mem_index_create( index->name = mem_heap_strdup(heap, index_name); index->table_name = table_name; index->table = NULL; - index->n_def = 0; + index->n_def = index->n_nullable = 0; index->n_fields = n_fields; index->fields = mem_heap_alloc(heap, 1 + n_fields * sizeof(dict_field_t)); diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c index e83d2fcde32..20f522c1a60 100644 --- a/innobase/fil/fil0fil.c +++ b/innobase/fil/fil0fil.c @@ -89,6 +89,9 @@ but in the MySQL Embedded Server Library and ibbackup it is not the default directory, and we must set the base file path explicitly */ const char* fil_path_to_mysql_datadir = "."; +/* The number of fsyncs done to the log */ +ulint fil_n_log_flushes = 0; + ulint fil_n_pending_log_flushes = 0; ulint fil_n_pending_tablespace_flushes = 0; @@ -96,7 +99,6 @@ ulint fil_n_pending_tablespace_flushes = 0; fil_addr_t fil_addr_null = {FIL_NULL, 0}; /* File node of a tablespace or the log data space */ -typedef struct fil_node_struct fil_node_t; struct fil_node_struct { fil_space_t* space; /* backpointer to the space where this node belongs */ @@ -517,7 +519,7 @@ fil_node_open_file( if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Error: the size of single-table tablespace file %s\n" -"InnoDB: is only %lu %lu, should be at least %lu!", node->name, +"InnoDB: is only %lu %lu, should be at least %lu!\n", node->name, (ulong) size_high, (ulong) size_low, (ulong) (4 * UNIV_PAGE_SIZE)); @@ -687,8 +689,8 @@ fil_try_to_close_file_in_LRU( ut_print_filename(stderr, node->name); fprintf(stderr, ", because mod_count %ld != fl_count %ld\n", - (ulong) node->modification_counter, - (ulong) node->flush_counter); + (long) node->modification_counter, + (long) node->flush_counter); } node = UT_LIST_GET_PREV(LRU, node); @@ -1652,30 +1654,38 @@ fil_op_write_log( mtr_t* mtr) /* in: mini-transaction handle */ { byte* log_ptr; + ulint len; + + log_ptr = mlog_open(mtr, 11 + 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } - log_ptr = mlog_open(mtr, 30); - log_ptr = mlog_write_initial_log_record_for_file_op(type, space_id, 0, log_ptr, mtr); /* Let us store the strings as null-terminated for easier readability and handling */ - mach_write_to_2(log_ptr, ut_strlen(name) + 1); + len = strlen(name) + 1; + + mach_write_to_2(log_ptr, len); log_ptr += 2; - mlog_close(mtr, log_ptr); - mlog_catenate_string(mtr, (byte*) name, ut_strlen(name) + 1); + mlog_catenate_string(mtr, (byte*) name, len); if (type == MLOG_FILE_RENAME) { - log_ptr = mlog_open(mtr, 30); - mach_write_to_2(log_ptr, ut_strlen(new_name) + 1); + ulint len = strlen(new_name) + 1; + log_ptr = mlog_open(mtr, 2 + len); + ut_a(log_ptr); + mach_write_to_2(log_ptr, len); log_ptr += 2; - mlog_close(mtr, log_ptr); - mlog_catenate_string(mtr, (byte*) new_name, - ut_strlen(new_name) + 1); + mlog_catenate_string(mtr, (byte*) new_name, len); } } #endif @@ -2612,12 +2622,12 @@ fil_open_single_table_tablespace( fputs("!\n" "InnoDB: Have you moved InnoDB .ibd files around without using the\n" "InnoDB: commands DISCARD TABLESPACE and IMPORT TABLESPACE?\n" -"InnoDB: It is also possible that this is a table created with\n" -"InnoDB: CREATE TEMPORARY TABLE, and MySQL removed the .ibd file for this.\n" +"InnoDB: It is also possible that this is a temporary table 
#sql...,\n" +"InnoDB: and MySQL removed the .ibd file for this.\n" "InnoDB: Please refer to\n" "InnoDB:" " http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n" -"InnoDB: how to resolve the issue.\n", stderr); +"InnoDB: for how to resolve the issue.\n", stderr); mem_free(filepath); @@ -2657,7 +2667,7 @@ fil_open_single_table_tablespace( "InnoDB: Please refer to\n" "InnoDB:" " http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n" -"InnoDB: how to resolve the issue.\n", (ulong) space_id, (ulong) id); +"InnoDB: for how to resolve the issue.\n", (ulong) space_id, (ulong) id); ret = FALSE; @@ -3292,7 +3302,7 @@ fil_space_for_table_exists_in_mem( ut_print_filename(stderr, name); fprintf(stderr, "\n" "InnoDB: in InnoDB data dictionary has tablespace id %lu,\n" -"InnoDB: but tablespace with that id does not exist. There is\n" +"InnoDB: but a tablespace with that id does not exist. There is\n" "InnoDB: a tablespace of name %s and id %lu, though. Have\n" "InnoDB: you deleted or moved .ibd files?\n", (ulong) id, namespace->name, @@ -3303,7 +3313,7 @@ fil_space_for_table_exists_in_mem( "InnoDB: Please refer to\n" "InnoDB:" " http://dev.mysql.com/doc/mysql/en/InnoDB_troubleshooting_datadict.html\n" -"InnoDB: how to resolve the issue.\n", stderr); +"InnoDB: for how to resolve the issue.\n", stderr); mem_free(path); mutex_exit(&(system->mutex)); @@ -3317,7 +3327,7 @@ fil_space_for_table_exists_in_mem( ut_print_filename(stderr, name); fprintf(stderr, "\n" "InnoDB: in InnoDB data dictionary has tablespace id %lu,\n" -"InnoDB: but tablespace with that id has name %s.\n" +"InnoDB: but the tablespace with that id has name %s.\n" "InnoDB: Have you deleted or moved .ibd files?\n", (ulong) id, space->name); if (namespace != NULL) { @@ -3815,6 +3825,12 @@ fil_io( mode = OS_AIO_NORMAL; } + if (type == OS_FILE_READ) { + srv_data_read+= len; + } else if (type == OS_FILE_WRITE) { + srv_data_written+= len; + } + /* Reserve the fil_system mutex and make sure that we can open at least one file while holding it, if the file is not already open */ @@ -4016,7 +4032,7 @@ fil_aio_wait( if (os_aio_use_native_aio) { srv_set_io_thread_op_info(segment, "native aio handle"); #ifdef WIN_ASYNC_IO - ret = os_aio_windows_handle(segment, 0, (void**) &fil_node, + ret = os_aio_windows_handle(segment, 0, &fil_node, &message, &type); #elif defined(POSIX_ASYNC_IO) ret = os_aio_posix_handle(segment, &fil_node, &message); @@ -4027,7 +4043,7 @@ fil_aio_wait( } else { srv_set_io_thread_op_info(segment, "simulated aio handle"); - ret = os_aio_simulated_handle(segment, (void**) &fil_node, + ret = os_aio_simulated_handle(segment, &fil_node, &message, &type); } @@ -4100,6 +4116,7 @@ fil_flush( fil_n_pending_tablespace_flushes++; } else { fil_n_pending_log_flushes++; + fil_n_log_flushes++; } #ifdef __WIN__ if (node->is_raw_disk) { diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c index e1621cc2765..ad4228f6797 100644 --- a/innobase/fsp/fsp0fsp.c +++ b/innobase/fsp/fsp0fsp.c @@ -910,7 +910,7 @@ fsp_header_init( if (space == 0) { fsp_fill_free_list(FALSE, space, header, mtr); btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, space, - ut_dulint_add(DICT_IBUF_ID_MIN, space), mtr); + ut_dulint_add(DICT_IBUF_ID_MIN, space), FALSE, mtr); } else { fsp_fill_free_list(TRUE, space, header, mtr); } @@ -2325,7 +2325,6 @@ fseg_alloc_free_page_low( dulint seg_id; ulint used; ulint reserved; - fil_addr_t first; xdes_t* descr; /* extent of the hinted page */ ulint ret_page; /* the allocated page offset, 
FIL_NULL if could not be allocated */ @@ -2428,6 +2427,8 @@ fseg_alloc_free_page_low( } else if (reserved - used > 0) { /* 5. We take any unused page from the segment ==============================================*/ + fil_addr_t first; + if (flst_get_len(seg_inode + FSEG_NOT_FULL, mtr) > 0) { first = flst_get_first(seg_inode + FSEG_NOT_FULL, mtr); @@ -2435,6 +2436,7 @@ fseg_alloc_free_page_low( first = flst_get_first(seg_inode + FSEG_FREE, mtr); } else { ut_error; + return(FIL_NULL); } ret_descr = xdes_lst_get_descriptor(space, first, mtr); diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c index 2191cdc0ee6..d7fa48b6e66 100644 --- a/innobase/ibuf/ibuf0ibuf.c +++ b/innobase/ibuf/ibuf0ibuf.c @@ -46,7 +46,7 @@ Note that contary to what we planned in the 1990's, there will only be one insert buffer tree, and that is in the system tablespace of InnoDB. 1. The first field is the space id. -2. The second field is a one-byte marker which differentiates records from +2. The second field is a one-byte marker (0) which differentiates records from the < 4.1.x storage format. 3. The third field is the page number. 4. The fourth field contains the type info, where we have also added 2 bytes to @@ -55,7 +55,14 @@ insert buffer tree, and that is in the system tablespace of InnoDB. can use in the binary search on the index page in the ibuf merge phase. 5. The rest of the fields contain the fields of the actual index record. -*/ +In versions >= 5.0.3: + +The first byte of the fourth field is an additional marker (0) if the record +is in the compact format. The presence of this marker can be detected by +looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE. + +The high-order bit of the character set field in the type info is the +"nullable" flag for the field. 
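A sketch of how a reader distinguishes the two cases (it mirrors the logic in ibuf_build_entry_from_ibuf_rec() below; ibuf_rec is assumed to be a >= 4.1.x format insert buffer record):

	byte*	types;
	ulint	len;

	types = rec_get_nth_field_old(ibuf_rec, 3, &len);
	ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1);

	if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) {
		/* compact format: skip the one-byte marker (0) */
		ut_a(*types == 0);
		types++;
		len--;
	}

	/* now len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE */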
*/ /* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM @@ -525,8 +532,8 @@ ibuf_data_init_for_space( ibuf_exit(); sprintf(buf, "SYS_IBUF_TABLE_%lu", (ulong) space); - - table = dict_mem_table_create(buf, space, 2); + /* use old-style record format for the insert buffer */ + table = dict_mem_table_create(buf, space, 2, FALSE); dict_mem_table_add_col(table, "PAGE_NO", DATA_BINARY, 0, 0, 0); dict_mem_table_add_col(table, "TYPES", DATA_BINARY, 0, 0, 0); @@ -541,11 +548,9 @@ ibuf_data_init_for_space( dict_mem_index_add_field(index, "PAGE_NO", 0, 0); dict_mem_index_add_field(index, "TYPES", 0, 0); - index->page_no = FSP_IBUF_TREE_ROOT_PAGE_NO; - index->id = ut_dulint_add(DICT_IBUF_ID_MIN, space); - dict_index_add_to_cache(table, index); + dict_index_add_to_cache(table, index, FSP_IBUF_TREE_ROOT_PAGE_NO); data->index = dict_table_get_first_index(table); @@ -1049,20 +1054,20 @@ ibuf_rec_get_page_no( ulint len; ut_ad(ibuf_inside()); - ut_ad(rec_get_n_fields(rec) > 2); + ut_ad(rec_get_n_fields_old(rec) > 2); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); if (len == 1) { /* This is of the >= 4.1.x record format */ ut_a(trx_sys_multiple_tablespace_format); - field = rec_get_nth_field(rec, 2, &len); + field = rec_get_nth_field_old(rec, 2, &len); } else { ut_a(trx_doublewrite_must_reset_space_ids); ut_a(!trx_sys_multiple_tablespace_format); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); } ut_a(len == 4); @@ -1084,15 +1089,15 @@ ibuf_rec_get_space( ulint len; ut_ad(ibuf_inside()); - ut_ad(rec_get_n_fields(rec) > 2); + ut_ad(rec_get_n_fields_old(rec) > 2); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); if (len == 1) { /* This is of the >= 4.1.x record format */ ut_a(trx_sys_multiple_tablespace_format); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_a(len == 4); return(mach_read_from_4(field)); @@ -1105,6 +1110,162 @@ ibuf_rec_get_space( } /************************************************************************ +Creates a dummy index for inserting a record to a non-clustered index. +*/ +static +dict_index_t* +ibuf_dummy_index_create( +/*====================*/ + /* out: dummy index */ + ulint n, /* in: number of fields */ + ibool comp) /* in: TRUE=use compact record format */ +{ + dict_table_t* table; + dict_index_t* index; + table = dict_mem_table_create("IBUF_DUMMY", + DICT_HDR_SPACE, n, comp); + index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY", + DICT_HDR_SPACE, 0, n); + index->table = table; + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + return(index); +} +/************************************************************************ +Add a column to the dummy index */ +static +void +ibuf_dummy_index_add_col( +/*====================*/ + dict_index_t* index, /* in: dummy index */ + dtype_t* type, /* in: the data type of the column */ + ulint len) /* in: length of the column */ +{ + ulint i = index->table->n_def; + dict_mem_table_add_col(index->table, "DUMMY", + dtype_get_mtype(type), + dtype_get_prtype(type), + dtype_get_len(type), + dtype_get_prec(type)); + dict_index_add_col(index, + dict_table_get_nth_col(index->table, i), 0, len); +} +/************************************************************************ +Deallocates a dummy index for inserting a record to a non-clustered index. 
+*/ +static +void +ibuf_dummy_index_free( +/*====================*/ + dict_index_t* index) /* in: dummy index */ +{ + dict_table_t* table = index->table; + mem_heap_free(index->heap); + mutex_free(&(table->autoinc_mutex)); + mem_heap_free(table->heap); +} + +/************************************************************************* +Builds the entry to insert into a non-clustered index when we have the +corresponding record in an ibuf index. */ +static +dtuple_t* +ibuf_build_entry_from_ibuf_rec( +/*===========================*/ + /* out, own: entry to insert to + a non-clustered index; NOTE that + as we copy pointers to fields in + ibuf_rec, the caller must hold a + latch to the ibuf_rec page as long + as the entry is used! */ + rec_t* ibuf_rec, /* in: record in an insert buffer */ + mem_heap_t* heap, /* in: heap where built */ + dict_index_t** pindex) /* out, own: dummy index that + describes the entry */ +{ + dtuple_t* tuple; + dfield_t* field; + ulint n_fields; + byte* types; + const byte* data; + ulint len; + ulint i; + dict_index_t* index; + + data = rec_get_nth_field_old(ibuf_rec, 1, &len); + + if (len > 1) { + /* This a < 4.1.x format record */ + + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + n_fields = rec_get_n_fields_old(ibuf_rec) - 2; + tuple = dtuple_create(heap, n_fields); + types = rec_get_nth_field_old(ibuf_rec, 1, &len); + + ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field_old(ibuf_rec, i + 2, &len); + + dfield_set_data(field, data, len); + + dtype_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } + + *pindex = ibuf_dummy_index_create(n_fields, FALSE); + return(tuple); + } + + /* This a >= 4.1.x format record */ + + ut_a(trx_sys_multiple_tablespace_format); + ut_a(*data == 0); + ut_a(rec_get_n_fields_old(ibuf_rec) > 4); + + n_fields = rec_get_n_fields_old(ibuf_rec) - 4; + + tuple = dtuple_create(heap, n_fields); + + types = rec_get_nth_field_old(ibuf_rec, 3, &len); + + ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1); + index = ibuf_dummy_index_create(n_fields, + len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + /* compact record format */ + len--; + ut_a(*types == 0); + types++; + } + + ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field_old(ibuf_rec, i + 4, &len); + + dfield_set_data(field, data, len); + + dtype_new_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + ibuf_dummy_index_add_col(index, dfield_get_type(field), len); + } + + *pindex = index; + return(tuple); +} + +/************************************************************************ Returns the space taken by a stored non-clustered index entry if converted to an index record. 
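Usage sketch for the two helpers above (it mirrors the merge code later in this file; the latch on the ibuf_rec page must be held while entry is used):

	mem_heap_t*	heap = mem_heap_create(512);
	dict_index_t*	dummy_index;
	dtuple_t*	entry;

	entry = ibuf_build_entry_from_ibuf_rec(ibuf_rec, heap, &dummy_index);

	/* ... use entry together with dummy_index, e.g. in
	rec_get_converted_size(dummy_index, entry) ... */

	ibuf_dummy_index_free(dummy_index);
	mem_heap_free(heap);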
*/ static @@ -1125,43 +1286,60 @@ ibuf_rec_get_volume( ulint i; ut_ad(ibuf_inside()); - ut_ad(rec_get_n_fields(ibuf_rec) > 2); - - data = rec_get_nth_field(ibuf_rec, 1, &len); + ut_ad(rec_get_n_fields_old(ibuf_rec) > 2); + + data = rec_get_nth_field_old(ibuf_rec, 1, &len); if (len > 1) { - /* < 4.1.x format record */ + /* < 4.1.x format record */ ut_a(trx_doublewrite_must_reset_space_ids); ut_a(!trx_sys_multiple_tablespace_format); - n_fields = rec_get_n_fields(ibuf_rec) - 2; + n_fields = rec_get_n_fields_old(ibuf_rec) - 2; - types = rec_get_nth_field(ibuf_rec, 1, &len); + types = rec_get_nth_field_old(ibuf_rec, 1, &len); ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); } else { - /* >= 4.1.x format record */ + /* >= 4.1.x format record */ ut_a(trx_sys_multiple_tablespace_format); - new_format = TRUE; + ut_a(*data == 0); + + types = rec_get_nth_field_old(ibuf_rec, 3, &len); + + ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1); + if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + /* compact record format */ + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + dtuple_t* entry = + ibuf_build_entry_from_ibuf_rec( + ibuf_rec, heap, &dummy_index); + volume = rec_get_converted_size(dummy_index, entry); + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + return(volume + page_dir_calc_reserved_space(1)); + } - n_fields = rec_get_n_fields(ibuf_rec) - 4; + n_fields = rec_get_n_fields_old(ibuf_rec) - 4; - types = rec_get_nth_field(ibuf_rec, 3, &len); + new_format = TRUE; } for (i = 0; i < n_fields; i++) { if (new_format) { - data = rec_get_nth_field(ibuf_rec, i + 4, &len); + data = rec_get_nth_field_old(ibuf_rec, i + 4, &len); dtype_new_read_for_order_and_null_size(&dtype, - types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); } else { - data = rec_get_nth_field(ibuf_rec, i + 2, &len); + data = rec_get_nth_field_old(ibuf_rec, i + 2, &len); dtype_read_for_order_and_null_size(&dtype, - types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); } if (len == UNIV_SQL_NULL) { @@ -1187,6 +1365,7 @@ ibuf_entry_build( must be kept because we copy pointers to its fields */ dtuple_t* entry, /* in: entry for a non-clustered index */ + ibool comp, /* in: flag: TRUE=compact record format */ ulint space, /* in: space id */ ulint page_no,/* in: index page number where entry should be inserted */ @@ -1202,11 +1381,14 @@ ibuf_entry_build( /* Starting from 4.1.x, we have to build a tuple whose (1) first field is the space id, - (2) the second field a single marker byte to tell that this + (2) the second field a single marker byte (0) to tell that this is a new format record, (3) the third contains the page number, and (4) the fourth contains the relevent type information of each data - field, + field; the length of this field % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE is + (a) 0 for b-trees in the old format, and + (b) 1 for b-trees in the compact format, the first byte of the field + being the marker (0); (5) and the rest of the fields are copied from entry. All fields in the tuple are ordered like the type binary in our insert buffer tree. 
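In other words, the byte length of that fourth field works out as below (sketch; comp is 0 or 1, as asserted in the function that follows):

	ulint	type_field_len
		= n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + comp;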
*/ @@ -1247,10 +1429,15 @@ ibuf_entry_build( dfield_set_data(field, buf, 4); + ut_ad(comp == 0 || comp == 1); /* Store the type info in buf2, and add the fields from entry to tuple */ buf2 = mem_heap_alloc(heap, n_fields - * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + + comp); + if (comp) { + *buf2++ = 0; /* write the compact format indicator */ + } for (i = 0; i < n_fields; i++) { /* We add 4 below because we have the 4 extra fields at the start of an ibuf record */ @@ -1268,8 +1455,13 @@ ibuf_entry_build( field = dtuple_get_nth_field(tuple, 3); + if (comp) { + buf2--; + } + dfield_set_data(field, buf2, n_fields - * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + + comp); /* Set all the types in the new tuple binary */ dtuple_set_types_binary(tuple, n_fields + 4); @@ -1278,88 +1470,6 @@ ibuf_entry_build( } /************************************************************************* -Builds the entry to insert into a non-clustered index when we have the -corresponding record in an ibuf index. */ -static -dtuple_t* -ibuf_build_entry_from_ibuf_rec( -/*===========================*/ - /* out, own: entry to insert to - a non-clustered index; NOTE that - as we copy pointers to fields in - ibuf_rec, the caller must hold a - latch to the ibuf_rec page as long - as the entry is used! */ - rec_t* ibuf_rec, /* in: record in an insert buffer */ - mem_heap_t* heap) /* in: heap where built */ -{ - dtuple_t* tuple; - dfield_t* field; - ulint n_fields; - byte* types; - byte* data; - ulint len; - ulint i; - - data = rec_get_nth_field(ibuf_rec, 1, &len); - - if (len > 1) { - /* This a < 4.1.x format record */ - - ut_a(trx_doublewrite_must_reset_space_ids); - ut_a(!trx_sys_multiple_tablespace_format); - - n_fields = rec_get_n_fields(ibuf_rec) - 2; - tuple = dtuple_create(heap, n_fields); - types = rec_get_nth_field(ibuf_rec, 1, &len); - - ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); - - for (i = 0; i < n_fields; i++) { - field = dtuple_get_nth_field(tuple, i); - - data = rec_get_nth_field(ibuf_rec, i + 2, &len); - - dfield_set_data(field, data, len); - - dtype_read_for_order_and_null_size( - dfield_get_type(field), - types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); - } - - return(tuple); - } - - /* This a >= 4.1.x format record */ - - ut_a(trx_sys_multiple_tablespace_format); - - ut_a(rec_get_n_fields(ibuf_rec) > 4); - - n_fields = rec_get_n_fields(ibuf_rec) - 4; - - tuple = dtuple_create(heap, n_fields); - - types = rec_get_nth_field(ibuf_rec, 3, &len); - - ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - - for (i = 0; i < n_fields; i++) { - field = dtuple_get_nth_field(tuple, i); - - data = rec_get_nth_field(ibuf_rec, i + 4, &len); - - dfield_set_data(field, data, len); - - dtype_new_read_for_order_and_null_size( - dfield_get_type(field), - types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - } - - return(tuple); -} - -/************************************************************************* Builds a search tuple used to search buffered inserts for an index page. 
This is for < 4.1.x format records */ static @@ -1779,7 +1889,7 @@ ibuf_get_merge_page_nos( contract the tree, FALSE if this is called when a single page becomes full and we look if it pays to read also nearby pages */ - rec_t* first_rec,/* in: record from which we read up and down + rec_t* rec, /* in: record from which we read up and down in the chain of records */ ulint* space_ids,/* in/out: space id's of the pages */ ib_longlong* space_versions,/* in/out: tablespace version @@ -1797,47 +1907,42 @@ ibuf_get_merge_page_nos( ulint first_space_id; ulint rec_page_no; ulint rec_space_id; - rec_t* rec; ulint sum_volumes; ulint volume_for_page; ulint rec_volume; ulint limit; - page_t* page; ulint n_pages; *n_stored = 0; limit = ut_min(IBUF_MAX_N_PAGES_MERGED, buf_pool->curr_size / 4); - page = buf_frame_align(first_rec); - - if (first_rec == page_get_supremum_rec(page)) { + if (page_rec_is_supremum(rec)) { - first_rec = page_rec_get_prev(first_rec); + rec = page_rec_get_prev(rec); } - if (first_rec == page_get_infimum_rec(page)) { + if (page_rec_is_infimum(rec)) { - first_rec = page_rec_get_next(first_rec); + rec = page_rec_get_next(rec); } - if (first_rec == page_get_supremum_rec(page)) { + if (page_rec_is_supremum(rec)) { return(0); } - rec = first_rec; - first_page_no = ibuf_rec_get_page_no(first_rec); - first_space_id = ibuf_rec_get_space(first_rec); + first_page_no = ibuf_rec_get_page_no(rec); + first_space_id = ibuf_rec_get_space(rec); n_pages = 0; prev_page_no = 0; prev_space_id = 0; - /* Go backwards from the first_rec until we reach the border of the + /* Go backwards from the first rec until we reach the border of the 'merge area', or the page start or the limit of storeable pages is reached */ - while ((rec != page_get_infimum_rec(page)) && (n_pages < limit)) { + while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) { rec_page_no = ibuf_rec_get_page_no(rec); rec_space_id = ibuf_rec_get_space(rec); @@ -1872,7 +1977,7 @@ ibuf_get_merge_page_nos( volume_for_page = 0; while (*n_stored < limit) { - if (rec == page_get_supremum_rec(page)) { + if (page_rec_is_supremum(rec)) { /* When no more records available, mark this with another 'impossible' pair of space id, page no */ rec_page_no = 1; @@ -2047,8 +2152,7 @@ loop: mutex_exit(&ibuf_mutex); sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur), - space_ids, space_versions, page_nos, - &n_stored); + space_ids, space_versions, page_nos, &n_stored); #ifdef UNIV_IBUF_DEBUG /* fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n", sync, n_stored, sum_sizes); */ @@ -2202,12 +2306,12 @@ ibuf_get_volume_buffered( page = buf_frame_align(rec); - if (rec == page_get_supremum_rec(page)) { + if (page_rec_is_supremum(rec)) { rec = page_rec_get_prev(rec); } for (;;) { - if (rec == page_get_infimum_rec(page)) { + if (page_rec_is_infimum(rec)) { break; } @@ -2242,7 +2346,7 @@ ibuf_get_volume_buffered( rec = page_rec_get_prev(rec); for (;;) { - if (rec == page_get_infimum_rec(prev_page)) { + if (page_rec_is_infimum(rec)) { /* We cannot go to yet a previous page, because we do not have the x-latch on it, and cannot acquire one @@ -2265,12 +2369,12 @@ ibuf_get_volume_buffered( count_later: rec = btr_pcur_get_rec(pcur); - if (rec != page_get_supremum_rec(page)) { + if (!page_rec_is_supremum(rec)) { rec = page_rec_get_next(rec); } for (;;) { - if (rec == page_get_supremum_rec(page)) { + if (page_rec_is_supremum(rec)) { break; } @@ -2305,7 +2409,7 @@ count_later: rec = page_rec_get_next(rec); for (;;) { - if (rec == 
page_get_supremum_rec(next_page)) { + if (page_rec_is_supremum(rec)) { /* We give up */ @@ -2344,6 +2448,7 @@ ibuf_update_max_tablespace_id(void) ibuf_data = fil_space_get_ibuf_data(0); ibuf_index = ibuf_data->index; + ut_a(!ibuf_index->table->comp); ibuf_enter(); @@ -2360,7 +2465,7 @@ ibuf_update_max_tablespace_id(void) } else { rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_a(len == 4); @@ -2479,7 +2584,7 @@ ibuf_insert_low( ibuf_enter(); } - entry_size = rec_get_converted_size(entry); + entry_size = rec_get_converted_size(index, entry); heap = mem_heap_create(512); @@ -2487,7 +2592,8 @@ ibuf_insert_low( the first fields and the type information for other fields, and which will be inserted to the insert buffer. */ - ibuf_entry = ibuf_entry_build(entry, space, page_no, heap); + ibuf_entry = ibuf_entry_build(entry, index->table->comp, + space, page_no, heap); /* Open a cursor to the insert buffer tree to calculate if we can add the new entry to it without exceeding the free space limit for the @@ -2532,8 +2638,8 @@ ibuf_insert_low( do_merge = TRUE; ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur), - space_ids, space_versions, page_nos, - &n_stored); + space_ids, space_versions, + page_nos, &n_stored); goto function_exit; } @@ -2656,8 +2762,8 @@ ibuf_insert( ut_a(!(index->type & DICT_CLUSTERED)); - if (rec_get_converted_size(entry) - >= page_get_free_space_of_empty() / 2) { + if (rec_get_converted_size(index, entry) + >= page_get_free_space_of_empty(index->table->comp) / 2) { return(FALSE); } @@ -2692,6 +2798,7 @@ ibuf_insert_to_index_page( dtuple_t* entry, /* in: buffered entry to insert */ page_t* page, /* in: index page where the buffered entry should be placed */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ { page_cur_t page_cur; @@ -2703,13 +2810,21 @@ ibuf_insert_to_index_page( ut_ad(ibuf_inside()); ut_ad(dtuple_check_typed(entry)); - if (rec_get_n_fields(page_rec_get_next(page_get_infimum_rec(page))) - != dtuple_get_n_fields(entry)) { - - fprintf(stderr, + if (UNIV_UNLIKELY(index->table->comp != (ibool)!!page_is_comp(page))) { + fputs( "InnoDB: Trying to insert a record from the insert buffer to an index page\n" -"InnoDB: but the number of fields does not match!\n"); +"InnoDB: but the 'compact' flag does not match!\n", stderr); + goto dump; + } + rec = page_rec_get_next(page_get_infimum_rec(page)); + + if (UNIV_UNLIKELY(rec_get_n_fields(rec, index) + != dtuple_get_n_fields(entry))) { + fputs( +"InnoDB: Trying to insert a record from the insert buffer to an index page\n" +"InnoDB: but the number of fields does not match!\n", stderr); + dump: buf_page_print(page); dtuple_print(stderr, entry); @@ -2723,31 +2838,34 @@ ibuf_insert_to_index_page( return; } - low_match = page_cur_search(page, entry, PAGE_CUR_LE, &page_cur); + low_match = page_cur_search(page, index, entry, + PAGE_CUR_LE, &page_cur); if (low_match == dtuple_get_n_fields(entry)) { rec = page_cur_get_rec(&page_cur); btr_cur_del_unmark_for_ibuf(rec, mtr); } else { - rec = page_cur_tuple_insert(&page_cur, entry, mtr); + rec = page_cur_tuple_insert(&page_cur, entry, index, mtr); if (rec == NULL) { /* If the record did not fit, reorganize */ - btr_page_reorganize(page, mtr); + btr_page_reorganize(page, index, mtr); - page_cur_search(page, entry, PAGE_CUR_LE, &page_cur); + page_cur_search(page, index, entry, + PAGE_CUR_LE, &page_cur); /* This time the record must fit */ - if (!page_cur_tuple_insert(&page_cur, entry, 
mtr)) { + if (UNIV_UNLIKELY(!page_cur_tuple_insert( + &page_cur, entry, index, mtr))) { ut_print_timestamp(stderr); fprintf(stderr, "InnoDB: Error: Insert buffer insert fails; page free %lu, dtuple size %lu\n", (ulong) page_get_max_insert_size(page, 1), - (ulong) rec_get_converted_size(entry)); + (ulong) rec_get_converted_size(index, entry)); fputs("InnoDB: Cannot insert index record ", stderr); dtuple_print(stderr, entry); @@ -2836,17 +2954,20 @@ ibuf_delete_rec( "InnoDB: ibuf record inserted to page %lu\n", (ulong) page_no); fflush(stderr); - rec_print(stderr, btr_pcur_get_rec(pcur)); - rec_print(stderr, pcur->old_rec); + rec_print_old(stderr, btr_pcur_get_rec(pcur)); + rec_print_old(stderr, pcur->old_rec); dtuple_print(stderr, search_tuple); - rec_print(stderr, page_rec_get_next(btr_pcur_get_rec(pcur))); + rec_print_old(stderr, + page_rec_get_next(btr_pcur_get_rec(pcur))); fflush(stderr); btr_pcur_commit_specify_mtr(pcur, mtr); fputs("InnoDB: Validating insert buffer tree:\n", stderr); - ut_a(btr_validate_tree(ibuf_data->index->tree)); + if (!btr_validate_tree(ibuf_data->index->tree, NULL)) { + ut_error; + } fprintf(stderr, "InnoDB: ibuf tree ok\n"); fflush(stderr); @@ -3075,7 +3196,7 @@ loop: if (corruption_noticed) { fputs("InnoDB: Discarding record\n ", stderr); - rec_print(stderr, ibuf_rec); + rec_print_old(stderr, ibuf_rec); fputs("\n from the insert buffer!\n\n", stderr); } else if (page) { /* Now we have at pcur a record which should be @@ -3083,19 +3204,22 @@ loop: copies pointers to fields in ibuf_rec, and we must keep the latch to the ibuf_rec page until the insertion is finished! */ - - dulint max_trx_id = page_get_max_trx_id( + dict_index_t* dummy_index; + dulint max_trx_id = page_get_max_trx_id( buf_frame_align(ibuf_rec)); page_update_max_trx_id(page, max_trx_id); - entry = ibuf_build_entry_from_ibuf_rec(ibuf_rec, heap); + entry = ibuf_build_entry_from_ibuf_rec(ibuf_rec, + heap, &dummy_index); #ifdef UNIV_IBUF_DEBUG - volume += rec_get_converted_size(entry) + volume += rec_get_converted_size(dummy_index, entry) + page_dir_calc_reserved_space(1); ut_a(volume <= 4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE); #endif - ibuf_insert_to_index_page(entry, page, &mtr); + ibuf_insert_to_index_page(entry, page, + dummy_index, &mtr); + ibuf_dummy_index_free(dummy_index); } n_inserts++; diff --git a/innobase/include/Makefile.am b/innobase/include/Makefile.am index 102d25566da..eb1e3b72877 100644 --- a/innobase/include/Makefile.am +++ b/innobase/include/Makefile.am @@ -49,7 +49,7 @@ noinst_HEADERS = btr0btr.h btr0btr.ic btr0cur.h btr0cur.ic \ thr0loc.h thr0loc.ic trx0purge.h trx0purge.ic trx0rec.h \ trx0rec.ic trx0roll.h trx0roll.ic trx0rseg.h trx0rseg.ic \ trx0sys.h trx0sys.ic trx0trx.h trx0trx.ic trx0types.h \ - trx0undo.h trx0undo.ic univ.i \ + trx0undo.h trx0undo.ic trx0xa.h univ.i \ usr0sess.h usr0sess.ic usr0types.h ut0byte.h ut0byte.ic \ ut0dbg.h ut0lst.h ut0mem.h ut0mem.ic ut0rnd.h ut0rnd.ic \ ut0sort.h ut0ut.h ut0ut.ic diff --git a/innobase/include/btr0btr.h b/innobase/include/btr0btr.h index 8606fcd2a5c..1f3a32fa70c 100644 --- a/innobase/include/btr0btr.h +++ b/innobase/include/btr0btr.h @@ -155,7 +155,8 @@ ulint btr_node_ptr_get_child_page_no( /*===========================*/ /* out: child node address */ - rec_t* rec); /* in: node pointer record */ + rec_t* rec, /* in: node pointer record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /**************************************************************** Creates the root node for a new index tree. 
*/ @@ -167,6 +168,7 @@ btr_create( ulint type, /* in: type of the index */ ulint space, /* in: space where created */ dulint index_id,/* in: index id */ + ulint comp, /* in: nonzero=compact page format */ mtr_t* mtr); /* in: mini-transaction handle */ /**************************************************************** Frees a B-tree except the root page, which MUST be freed after this @@ -210,8 +212,9 @@ Reorganizes an index page. */ void btr_page_reorganize( /*================*/ - page_t* page, /* in: page to be reorganized */ - mtr_t* mtr); /* in: mtr */ + page_t* page, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Decides if the page should be split at the convergence point of inserts converging to left. */ @@ -273,6 +276,7 @@ void btr_set_min_rec_mark( /*=================*/ rec_t* rec, /* in: record */ + ulint comp, /* in: nonzero=compact page format */ mtr_t* mtr); /* in: mtr */ /***************************************************************** Deletes on the upper level the node pointer to a page. */ @@ -332,6 +336,7 @@ btr_parse_set_min_rec_mark( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ + ulint comp, /* in: nonzero=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr); /* in: mtr or NULL */ /*************************************************************** @@ -340,11 +345,12 @@ Parses a redo log record of reorganizing a page. */ byte* btr_parse_page_reorganize( /*======================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /****************************************************************** Gets the number of pages in a B-tree. */ @@ -392,6 +398,7 @@ btr_page_free_low( page_t* page, /* in: page to be freed, x-latched */ ulint level, /* in: page level */ mtr_t* mtr); /* in: mtr */ +#ifdef UNIV_BTR_PRINT /***************************************************************** Prints size info of a B-tree. */ @@ -408,6 +415,7 @@ btr_print_tree( dict_tree_t* tree, /* in: tree */ ulint width); /* in: print this many entries from start and end */ +#endif /* UNIV_BTR_PRINT */ /**************************************************************** Checks the size and number of fields in a record based on the definition of the index. 
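Call-site sketch for the changed btr_node_ptr_get_child_page_no() above (node_ptr and offsets are assumed to exist; offsets must be the array returned by rec_get_offsets() for this node pointer record and its index, as the parameter comment states):

	ulint	child_page_no;

	child_page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);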
*/ @@ -428,7 +436,8 @@ ibool btr_validate_tree( /*==============*/ /* out: TRUE if ok */ - dict_tree_t* tree); /* in: tree */ + dict_tree_t* tree, /* in: tree */ + trx_t* trx); /* in: transaction or NULL */ #define BTR_N_LEAF_PAGES 1 #define BTR_TOTAL_SIZE 2 diff --git a/innobase/include/btr0btr.ic b/innobase/include/btr0btr.ic index b0aa0756307..a0860b1c3a7 100644 --- a/innobase/include/btr0btr.ic +++ b/innobase/include/btr0btr.ic @@ -183,26 +183,27 @@ ulint btr_node_ptr_get_child_page_no( /*===========================*/ /* out: child node address */ - rec_t* rec) /* in: node pointer record */ + rec_t* rec, /* in: node pointer record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint n_fields; byte* field; ulint len; ulint page_no; - n_fields = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); /* The child address is in the last field */ - field = rec_get_nth_field(rec, n_fields - 1, &len); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); ut_ad(len == 4); page_no = mach_read_from_4(field); - if (page_no == 0) { + if (UNIV_UNLIKELY(page_no == 0)) { fprintf(stderr, "InnoDB: a nonsensical page number 0 in a node ptr record at offset %lu\n", - (unsigned long)(rec - buf_frame_align(rec))); + (ulong) ut_align_offset(rec, UNIV_PAGE_SIZE)); buf_page_print(buf_frame_align(rec)); } diff --git a/innobase/include/btr0cur.h b/innobase/include/btr0cur.h index f1334656d53..352d1739b6a 100644 --- a/innobase/include/btr0cur.h +++ b/innobase/include/btr0cur.h @@ -34,7 +34,7 @@ page_cur_t* btr_cur_get_page_cur( /*=================*/ /* out: pointer to page cursor component */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Returns the record pointer of a tree cursor. */ UNIV_INLINE @@ -42,14 +42,14 @@ rec_t* btr_cur_get_rec( /*============*/ /* out: pointer to record */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Invalidates a tree cursor by setting record pointer to NULL. */ UNIV_INLINE void btr_cur_invalidate( /*===============*/ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Returns the page of a tree cursor. */ UNIV_INLINE @@ -57,7 +57,7 @@ page_t* btr_cur_get_page( /*=============*/ /* out: pointer to page */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Returns the tree of a cursor. */ UNIV_INLINE @@ -65,7 +65,7 @@ dict_tree_t* btr_cur_get_tree( /*=============*/ /* out: tree */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Positions a tree cursor at a given record. */ UNIV_INLINE @@ -283,8 +283,8 @@ only used by the insert buffer insert merge mechanism. */ void btr_cur_del_unmark_for_ibuf( /*========================*/ - rec_t* rec, /* in: record to delete unmark */ - mtr_t* mtr); /* in: mtr */ + rec_t* rec, /* in: record to delete unmark */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Tries to compress a page of the tree on the leaf level. It is assumed that mtr holds an x-latch on the tree and on the cursor page. 
To avoid @@ -361,10 +361,11 @@ Parses a redo log record of updating a record in-place. */ byte* btr_cur_parse_update_in_place( /*==========================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page); /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in: page or NULL */ + dict_index_t* index); /* in: index corresponding to page */ /******************************************************************** Parses the redo log record for delete marking or unmarking of a clustered index record. */ @@ -372,10 +373,11 @@ index record. */ byte* btr_cur_parse_del_mark_set_clust_rec( /*=================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page); /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: index corresponding to page */ + page_t* page); /* in: page or NULL */ /******************************************************************** Parses the redo log record for delete marking or unmarking of a secondary index record. */ @@ -383,10 +385,10 @@ index record. */ byte* btr_cur_parse_del_mark_set_sec_rec( /*===============================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page); /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page); /* in: page or NULL */ /*********************************************************************** Estimates the number of rows in a given index range. */ @@ -417,9 +419,10 @@ to free the field. */ void btr_cur_mark_extern_inherited_fields( /*=================================*/ - rec_t* rec, /* in: record in a clustered index */ - upd_t* update, /* in: update vector */ - mtr_t* mtr); /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update, /* in: update vector */ + mtr_t* mtr); /* in: mtr */ /*********************************************************************** The complement of the previous function: in an update entry may inherit some externally stored fields from a record. 
We must mark them as inherited @@ -456,6 +459,7 @@ btr_store_big_rec_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ mtr_t* local_mtr); /* in: mtr containing the latch to @@ -496,6 +500,7 @@ btr_rec_free_externally_stored_fields( dict_index_t* index, /* in: index of the data, the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free inherited fields */ @@ -510,6 +515,7 @@ btr_rec_copy_externally_stored_field( /*=================================*/ /* out: the field copied to heap */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint no, /* in: field number */ ulint* len, /* out: length of the field */ mem_heap_t* heap); /* in: mem heap */ @@ -540,10 +546,10 @@ ulint btr_push_update_extern_fields( /*==========================*/ /* out: number of values stored in ext_vect */ - ulint* ext_vect, /* in: array of ulints, must be preallocated - to have place for all fields in rec */ - rec_t* rec, /* in: record */ - upd_t* update); /* in: update vector */ + ulint* ext_vect,/* in: array of ulints, must be preallocated + to have space for all fields in rec */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update);/* in: update vector or NULL */ /*######################################################################*/ diff --git a/innobase/include/btr0cur.ic b/innobase/include/btr0cur.ic index a3a04b60c45..bf8a6efb68d 100644 --- a/innobase/include/btr0cur.ic +++ b/innobase/include/btr0cur.ic @@ -52,7 +52,9 @@ btr_cur_get_page( /* out: pointer to page */ btr_cur_t* cursor) /* in: tree cursor */ { - return(buf_frame_align(page_cur_get_rec(&(cursor->page_cur)))); + page_t* page = buf_frame_align(page_cur_get_rec(&(cursor->page_cur))); + ut_ad(!!page_is_comp(page) == cursor->index->table->comp); + return(page); } /************************************************************* @@ -134,17 +136,15 @@ btr_cur_can_delete_without_compress( /* out: TRUE if can be deleted without recommended compression */ btr_cur_t* cursor, /* in: btr cursor */ + ulint rec_size,/* in: rec_get_size(btr_cur_get_rec(cursor))*/ mtr_t* mtr) /* in: mtr */ { - ulint rec_size; page_t* page; ut_ad(mtr_memo_contains(mtr, buf_block_align( btr_cur_get_page(cursor)), MTR_MEMO_PAGE_X_FIX)); - rec_size = rec_get_size(btr_cur_get_rec(cursor)); - page = btr_cur_get_page(cursor); if ((page_get_data_size(page) - rec_size < BTR_CUR_PAGE_COMPRESS_LIMIT) diff --git a/innobase/include/btr0pcur.h b/innobase/include/btr0pcur.h index 9339eb5d0ee..eb3822aab7a 100644 --- a/innobase/include/btr0pcur.h +++ b/innobase/include/btr0pcur.h @@ -462,6 +462,7 @@ struct btr_pcur_struct{ contains an initial segment of the latest record cursor was positioned either on, before, or after */ + ulint old_n_fields; /* number of fields in old_rec */ ulint rel_pos; /* BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on whether cursor was on, before, or after the diff --git a/innobase/include/btr0sea.h b/innobase/include/btr0sea.h index ce4140ecf92..78e88a24083 100644 --- a/innobase/include/btr0sea.h +++ b/innobase/include/btr0sea.h @@ -77,8 +77,10 @@ parameters as page (this often happens when a page is split). 
*/ void btr_search_move_or_delete_hash_entries( /*===================================*/ - page_t* new_page, /* in: records are copied to this page */ - page_t* page); /* in: index page */ + page_t* new_page, /* in: records are copied + to this page */ + page_t* page, /* in: index page */ + dict_index_t* index); /* in: record descriptor */ /************************************************************************ Drops a page hash index. */ @@ -129,8 +131,8 @@ Validates the search system. */ ibool btr_search_validate(void); -/*=====================*/ - +/*======================*/ + /* out: TRUE if ok */ /* Search info directions */ #define BTR_SEA_NO_DIRECTION 1 diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h index 53599d03c73..ae8d0411c12 100644 --- a/innobase/include/buf0buf.h +++ b/innobase/include/buf0buf.h @@ -52,11 +52,17 @@ Created 11/5/1995 Heikki Tuuri /* Modes for buf_page_get_known_nowait */ #define BUF_MAKE_YOUNG 51 #define BUF_KEEP_OLD 52 +/* Magic value to use instead of checksums when they are disabled */ +#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL extern buf_pool_t* buf_pool; /* The buffer pool of the database */ +#ifdef UNIV_DEBUG extern ibool buf_debug_prints;/* If this is set TRUE, the program prints info whenever read or flush occurs */ +#endif /* UNIV_DEBUG */ +extern ulint srv_buf_pool_write_requests; /* variable to count write request + issued */ /************************************************************************ Creates the buffer pool. */ @@ -378,10 +384,10 @@ Returns the value of the modify clock. The caller must have an s-lock or x-lock on the block. */ UNIV_INLINE dulint -buf_frame_get_modify_clock( +buf_block_get_modify_clock( /*=======================*/ /* out: value */ - buf_frame_t* frame); /* in: pointer to a frame */ + buf_block_t* block); /* in: block */ /************************************************************************ Calculates a page checksum which is stored to the page when it is written to a file. Note that we must be careful to calculate the same value @@ -476,12 +482,20 @@ buf_pool_is_block( /*==============*/ /* out: TRUE if pointer to block */ void* ptr); /* in: pointer to memory */ +#ifdef UNIV_DEBUG /************************************************************************* Validates the buffer pool data structure. */ ibool buf_validate(void); /*==============*/ +/************************************************************************* +Prints info of the buffer pool data structure. */ + +void +buf_print(void); +/*============*/ +#endif /* UNIV_DEBUG */ /************************************************************************ Prints a page to stderr. */ @@ -490,11 +504,11 @@ buf_page_print( /*===========*/ byte* read_buf); /* in: a database page */ /************************************************************************* -Prints info of the buffer pool data structure. */ +Returns the number of latched pages in the buffer pool. */ -void -buf_print(void); -/*============*/ +ulint +buf_get_latched_pages_number(void); +/*==============================*/ /************************************************************************* Returns the number of pending buf pool ios. */ @@ -731,6 +745,8 @@ struct buf_block_struct{ buffer pool which are index pages, but this flag is not set because we do not keep track of all pages */ + dict_index_t* index; /* index for which the adaptive + hash index has been created */ /* 2. 
Page flushing fields */ UT_LIST_NODE_T(buf_block_t) flush_list; diff --git a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic index 681a0ef000a..d949254d47d 100644 --- a/innobase/include/buf0buf.ic +++ b/innobase/include/buf0buf.ic @@ -11,10 +11,11 @@ Created 11/5/1995 Heikki Tuuri #include "buf0rea.h" #include "mtr0mtr.h" +#ifdef UNIV_DEBUG extern ulint buf_dbg_counter; /* This is used to insert validation operations in execution in the debug version */ - +#endif /* UNIV_DEBUG */ /************************************************************************ Recommends a move of a block to the start of the LRU list if there is danger of dropping from the buffer pool. NOTE: does not reserve the buffer pool @@ -26,12 +27,8 @@ buf_block_peek_if_too_old( /* out: TRUE if should be made younger */ buf_block_t* block) /* in: block to make younger */ { - if (buf_pool->freed_page_clock >= block->freed_page_clock - + 1 + (buf_pool->curr_size / 1024)) { - return(TRUE); - } - - return(FALSE); + return(buf_pool->freed_page_clock >= block->freed_page_clock + + 1 + (buf_pool->curr_size / 1024)); } /************************************************************************* @@ -210,8 +207,8 @@ buf_block_align( frame_zero = buf_pool->frame_zero; - if ((ulint)ptr < (ulint)frame_zero - || (ulint)ptr > (ulint)(buf_pool->high_end)) { + if (UNIV_UNLIKELY((ulint)ptr < (ulint)frame_zero) + || UNIV_UNLIKELY((ulint)ptr > (ulint)(buf_pool->high_end))) { ut_print_timestamp(stderr); fprintf(stderr, @@ -246,8 +243,8 @@ buf_frame_align( frame = ut_align_down(ptr, UNIV_PAGE_SIZE); - if (((ulint)frame < (ulint)(buf_pool->frame_zero)) - || (ulint)frame >= (ulint)(buf_pool->high_end)) { + if (UNIV_UNLIKELY((ulint)frame < (ulint)(buf_pool->frame_zero)) + || UNIV_UNLIKELY((ulint)frame >= (ulint)(buf_pool->high_end))) { ut_print_timestamp(stderr); fprintf(stderr, @@ -485,17 +482,11 @@ Returns the value of the modify clock. The caller must have an s-lock or x-lock on the block. */ UNIV_INLINE dulint -buf_frame_get_modify_clock( +buf_block_get_modify_clock( /*=======================*/ /* out: value */ - buf_frame_t* frame) /* in: pointer to a frame */ + buf_block_t* block) /* in: block */ { - buf_block_t* block; - - ut_ad(frame); - - block = buf_block_align(frame); - #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); diff --git a/innobase/include/buf0flu.ic b/innobase/include/buf0flu.ic index d6dbdcc0865..9a8a021e029 100644 --- a/innobase/include/buf0flu.ic +++ b/innobase/include/buf0flu.ic @@ -61,6 +61,8 @@ buf_flush_note_modification( ut_ad(ut_dulint_cmp(block->oldest_modification, mtr->start_lsn) <= 0); } + + ++srv_buf_pool_write_requests; } /************************************************************************ diff --git a/innobase/include/buf0lru.h b/innobase/include/buf0lru.h index 45164dd561e..fb29b44ba98 100644 --- a/innobase/include/buf0lru.h +++ b/innobase/include/buf0lru.h @@ -122,6 +122,7 @@ void buf_LRU_make_block_old( /*===================*/ buf_block_t* block); /* in: control block */ +#ifdef UNIV_DEBUG /************************************************************************** Validates the LRU list. */ @@ -134,6 +135,7 @@ Prints the LRU list. 
*/ void buf_LRU_print(void); /*===============*/ +#endif /* UNIV_DEBUG */ #ifndef UNIV_NONINL #include "buf0lru.ic" diff --git a/innobase/include/data0type.h b/innobase/include/data0type.h index 02c874836fd..7e9692eca5a 100644 --- a/innobase/include/data0type.h +++ b/innobase/include/data0type.h @@ -12,7 +12,7 @@ Created 1/16/1996 Heikki Tuuri #include "univ.i" extern ulint data_mysql_default_charset_coll; -extern ulint data_mysql_latin1_swedish_charset_coll; +#define DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL 8 /* SQL data type struct */ typedef struct dtype_struct dtype_t; @@ -24,7 +24,11 @@ extern dtype_t* dtype_binary; /*-------------------------------------------*/ /* The 'MAIN TYPE' of a column */ #define DATA_VARCHAR 1 /* character varying of the - latin1_swedish_ci charset-collation */ + latin1_swedish_ci charset-collation; note + that the MySQL format for this, DATA_BINARY, + DATA_VARMYSQL, is also affected by whether the + 'precise type' contains + DATA_MYSQL_TRUE_VARCHAR */ #define DATA_CHAR 2 /* fixed length character of the latin1_swedish_ci charset-collation */ #define DATA_FIXBINARY 3 /* binary string of fixed length */ @@ -32,7 +36,9 @@ extern dtype_t* dtype_binary; #define DATA_BLOB 5 /* binary large object, or a TEXT type; if prtype & DATA_BINARY_TYPE == 0, then this is actually a TEXT column (or a BLOB created - with < 4.0.14) */ + with < 4.0.14; since column prefix indexes + came only in 4.0.14, the missing flag in BLOBs + created before that does not cause any harm) */ #define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */ #define DATA_SYS_CHILD 7 /* address of the child page in node pointer */ #define DATA_SYS 8 /* system column */ @@ -102,6 +108,8 @@ columns, and for them the precise type is usually not used at all. #define DATA_MYSQL_TYPE_MASK 255 /* AND with this mask to extract the MySQL type from the precise type */ +#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3 + format true VARCHAR */ /* Precise data types for system columns and the length of those columns; NOTE: the values must run from 0 up in the order given! All codes must @@ -134,6 +142,10 @@ be less than 256 */ In earlier versions this was set for some BLOB columns. */ +#define DATA_LONG_TRUE_VARCHAR 4096 /* this is ORed to the precise data + type when the column is true VARCHAR where + MySQL uses 2 bytes to store the data len; + for shorter VARCHARs MySQL uses only 1 byte */ /*-------------------------------------------*/ /* This many bytes we need to store the type information affecting the @@ -145,28 +157,31 @@ store the charset-collation number; one byte is left unused, though */ #define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6 /************************************************************************* -Checks if a string type has to be compared by the MySQL comparison functions. -InnoDB internally only handles binary byte string comparisons, as well as -latin1_swedish_ci strings. For example, UTF-8 strings have to be compared -by MySQL. */ - -ibool -dtype_str_needs_mysql_cmp( -/*======================*/ - /* out: TRUE if a string type that requires - comparison with MySQL functions */ - dtype_t* dtype); /* in: type struct */ +Gets the MySQL type code from a dtype. */ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + /* out: MySQL type code; this is NOT an InnoDB + type code! 
*/ + dtype_t* type); /* in: type struct */ /************************************************************************* -For the documentation of this function, see innobase_get_at_most_n_mbchars() -in ha_innodb.cc. */ +Determine how many bytes the first n characters of the given string occupy. +If the string is shorter than n characters, returns the number of bytes +the characters in the string occupy. */ ulint dtype_get_at_most_n_mbchars( /*========================*/ - dtype_t* dtype, - ulint prefix_len, - ulint data_len, - const char* str); + /* out: length of the prefix, + in bytes */ + const dtype_t* dtype, /* in: data type */ + ulint prefix_len, /* in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /* in: length of str (in bytes) */ + const char* str); /* in: the string whose prefix + length is being determined */ /************************************************************************* Checks if a data main type is a string type. Also a BLOB is considered a string type. */ @@ -271,6 +286,24 @@ dtype_get_prec( /*===========*/ dtype_t* type); /************************************************************************* +Gets the minimum length of a character, in bytes. */ +UNIV_INLINE +ulint +dtype_get_mbminlen( +/*===============*/ + /* out: minimum length of a char, in bytes, + or 0 if this is not a character type */ + const dtype_t* type); /* in: type */ +/************************************************************************* +Gets the maximum length of a character, in bytes. */ +UNIV_INLINE +ulint +dtype_get_mbmaxlen( +/*===============*/ + /* out: maximum length of a char, in bytes, + or 0 if this is not a character type */ + const dtype_t* type); /* in: type */ +/************************************************************************* Gets the padding character code for the type. */ UNIV_INLINE ulint @@ -288,6 +321,14 @@ dtype_get_fixed_size( /* out: fixed size, or 0 */ dtype_t* type); /* in: type */ /*************************************************************************** +Returns the minimum size of a data type. */ +UNIV_INLINE +ulint +dtype_get_min_size( +/*===============*/ + /* out: minimum size */ + const dtype_t* type); /* in: type */ +/*************************************************************************** Returns a stored SQL NULL size for a type. For fixed length types it is the fixed length of the type, otherwise 0. */ UNIV_INLINE @@ -352,16 +393,34 @@ dtype_print( /*========*/ dtype_t* type); /* in: type */ -/* Structure for an SQL data type */ +/* Structure for an SQL data type. +If you add fields to this structure, be sure to initialize them everywhere. 
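/* Illustrative sketch, not part of the patch: the contract of
dtype_get_at_most_n_mbchars() declared above, spelled out for the single-byte
case only.  For multi-byte charsets the real function delegates to MySQL's
charset code (innobase_get_at_most_n_mbchars() in ha_innodb.cc); e.g. a
10-character prefix on a utf8 column arrives as prefix_len = 10 * 3 = 30,
yet a purely ASCII value still yields 10 bytes. */
static ulint
dtype_prefix_bytes_1byte_sketch(
/*============================*/
				/* out: bytes occupied by the prefix */
	ulint	prefix_len,	/* in: requested prefix in characters,
				multiplied by mbmaxlen (= 1 here) */
	ulint	data_len)	/* in: length of the value, in bytes */
{
	/* one byte per character: a 10-character prefix of a 16-byte
	latin1 value occupies 10 bytes, of a 6-byte value all 6 */
	return(prefix_len < data_len ? prefix_len : data_len);
}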
+This structure is initialized in the following functions: +dtype_set() +dtype_read_for_order_and_null_size() +dtype_new_read_for_order_and_null_size() +sym_tab_add_null_lit() */ struct dtype_struct{ ulint mtype; /* main data type */ - ulint prtype; /* precise type; MySQL data type */ - - /* the remaining two fields do not affect alphabetical ordering: */ - - ulint len; /* length */ + ulint prtype; /* precise type; MySQL data type, charset code, + flags to indicate nullability, signedness, + whether this is a binary string, whether this + is a true VARCHAR where MySQL uses 2 bytes to + store the length */ + + /* the remaining fields do not affect alphabetical ordering: */ + + ulint len; /* length; for MySQL data this is + field->pack_length(), except that for a + >= 5.0.3 type true VARCHAR this is the + maximum byte length of the string data + (in addition to the string, MySQL uses 1 or + 2 bytes to store the string length) */ ulint prec; /* precision */ + + ulint mbminlen; /* minimum length of a character, in bytes */ + ulint mbmaxlen; /* maximum length of a character, in bytes */ }; #ifndef UNIV_NONINL diff --git a/innobase/include/data0type.ic b/innobase/include/data0type.ic index 946b646ffbf..06d45dd5501 100644 --- a/innobase/include/data0type.ic +++ b/innobase/include/data0type.ic @@ -8,6 +8,61 @@ Created 1/16/1996 Heikki Tuuri #include "mach0data.h" +/********************************************************************** +Get the variable length bounds of the given character set. + +NOTE: the prototype of this function is copied from ha_innodb.cc! If you change +this function, you MUST change also the prototype here! */ +extern +void +innobase_get_cset_width( +/*====================*/ + ulint cset, /* in: MySQL charset-collation code */ + ulint* mbminlen, /* out: minimum length of a char (in bytes) */ + ulint* mbmaxlen); /* out: maximum length of a char (in bytes) */ + +/************************************************************************* +Gets the MySQL charset-collation code for MySQL string types. */ +UNIV_INLINE +ulint +dtype_get_charset_coll( +/*===================*/ + ulint prtype) /* in: precise data type */ +{ + return((prtype >> 16) & 0xFFUL); +} + +/************************************************************************* +Gets the MySQL type code from a dtype. */ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + /* out: MySQL type code; this is NOT an InnoDB + type code! */ + dtype_t* type) /* in: type struct */ +{ + return(type->prtype & 0xFFUL); +} + +/************************************************************************* +Sets the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_set_mblen( +/*============*/ + dtype_t* type) /* in/out: type struct */ +{ + ut_ad(type); + if (dtype_is_string_type(type->mtype)) { + innobase_get_cset_width(dtype_get_charset_coll(type->prtype), + &type->mbminlen, &type->mbmaxlen); + ut_ad(type->mbminlen <= type->mbmaxlen); + } else { + type->mbminlen = type->mbmaxlen = 0; + } +} + /************************************************************************* Sets a data type structure. */ UNIV_INLINE @@ -28,6 +83,7 @@ dtype_set( type->len = len; type->prec = prec; + dtype_set_mblen(type); ut_ad(dtype_validate(type)); } @@ -72,17 +128,6 @@ dtype_get_prtype( } /************************************************************************* -Gets the MySQL charset-collation code for MySQL string types. 
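/* Illustrative sketch, not part of the patch: how the pieces of a 'precise
type' are pulled apart.  This merely restates dtype_get_mysql_type() and
dtype_get_charset_coll() above; DATA_MYSQL_TRUE_VARCHAR,
DATA_LONG_TRUE_VARCHAR, DATA_NOT_NULL and DATA_BINARY_TYPE are the code and
flag values declared in data0type.h. */
static ibool
dtype_is_2byte_true_varchar_sketch(
/*===============================*/
			/* out: TRUE if a >= 5.0.3 true VARCHAR whose data
			length MySQL stores in 2 bytes */
	ulint	prtype)	/* in: precise data type */
{
	ulint	mysql_type   = prtype & 0xFFUL;		/* low 8 bits, as in
						dtype_get_mysql_type() */
	ulint	charset_coll = (prtype >> 16) & 0xFFUL;	/* bits 16..23, as in
						dtype_get_charset_coll();
						8 = latin1_swedish_ci */
	(void) charset_coll;
	/* DATA_NOT_NULL and DATA_BINARY_TYPE are further flag bits ORed
	into prtype */

	return(mysql_type == DATA_MYSQL_TRUE_VARCHAR
	       && (prtype & DATA_LONG_TRUE_VARCHAR) != 0);
}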
*/ -UNIV_INLINE -ulint -dtype_get_charset_coll( -/*===================*/ - ulint prtype) /* in: precise data type */ -{ - return((prtype >> 16) & 0xFFUL); -} - -/************************************************************************* Gets the type length. */ UNIV_INLINE ulint @@ -109,6 +154,33 @@ dtype_get_prec( } /************************************************************************* +Gets the minimum length of a character, in bytes. */ +UNIV_INLINE +ulint +dtype_get_mbminlen( +/*===============*/ + /* out: minimum length of a char, in bytes, + or 0 if this is not a character type */ + const dtype_t* type) /* in: type */ +{ + ut_ad(type); + return(type->mbminlen); +} +/************************************************************************* +Gets the maximum length of a character, in bytes. */ +UNIV_INLINE +ulint +dtype_get_mbmaxlen( +/*===============*/ + /* out: maximum length of a char, in bytes, + or 0 if this is not a character type */ + const dtype_t* type) /* in: type */ +{ + ut_ad(type); + return(type->mbmaxlen); +} + +/************************************************************************* Gets the padding character code for the type. */ UNIV_INLINE ulint @@ -123,10 +195,12 @@ dtype_get_pad_char( || type->mtype == DATA_BINARY || type->mtype == DATA_FIXBINARY || type->mtype == DATA_MYSQL - || type->mtype == DATA_VARMYSQL) { + || type->mtype == DATA_VARMYSQL + || (type->mtype == DATA_BLOB + && (type->prtype & DATA_BINARY_TYPE) == 0)) { /* Space is the padding character for all char and binary - strings */ + strings, and starting from 5.0.3, also for TEXT strings. */ return((ulint)' '); } @@ -149,8 +223,10 @@ dtype_new_store_for_order_and_null_size( bytes where we store the info */ dtype_t* type) /* in: type struct */ { - ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif + buf[0] = (byte)(type->mtype & 0xFFUL); if (type->prtype & DATA_BINARY_TYPE) { @@ -166,10 +242,12 @@ dtype_new_store_for_order_and_null_size( mach_write_to_2(buf + 2, type->len & 0xFFFFUL); + ut_ad(dtype_get_charset_coll(type->prtype) < 256); mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype)); - /* Note that the second last byte is left unused, because the - charset-collation code is always < 256 */ + if (type->prtype & DATA_NOT_NULL) { + buf[4] |= 128; + } } /************************************************************************** @@ -196,6 +274,7 @@ dtype_read_for_order_and_null_size( type->prtype = dtype_form_prtype(type->prtype, data_mysql_default_charset_coll); + dtype_set_mblen(type); } /************************************************************************** @@ -211,20 +290,26 @@ dtype_new_read_for_order_and_null_size( { ulint charset_coll; - ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif type->mtype = buf[0] & 63; type->prtype = buf[1]; if (buf[0] & 128) { - type->prtype = type->prtype | DATA_BINARY_TYPE; + type->prtype |= DATA_BINARY_TYPE; + } + + if (buf[4] & 128) { + type->prtype |= DATA_NOT_NULL; } type->len = mach_read_from_2(buf + 2); mach_read_from_2(buf + 4); - charset_coll = mach_read_from_2(buf + 4); + charset_coll = mach_read_from_2(buf + 4) & 0x7fff; if (dtype_is_string_type(type->mtype)) { ut_a(charset_coll < 256); @@ -241,8 +326,10 @@ dtype_new_read_for_order_and_null_size( type->prtype = dtype_form_prtype(type->prtype, charset_coll); } + dtype_set_mblen(type); } +#ifndef 
UNIV_HOTBACKUP /*************************************************************************** Returns the size of a fixed size data type, 0 if not a fixed size type. */ UNIV_INLINE @@ -257,23 +344,127 @@ dtype_get_fixed_size( mtype = dtype_get_mtype(type); switch (mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (type->prtype & DATA_MYSQL_TYPE_MASK) { + default: + ut_ad(0); + return(0); + case DATA_ROW_ID: + ut_ad(type->len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(type->len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(type->len == DATA_ROLL_PTR_LEN); + break; + case DATA_MIX_ID: + ut_ad(type->len == DATA_MIX_ID_LEN); + break; + } +#endif /* UNIV_DEBUG */ case DATA_CHAR: case DATA_FIXBINARY: case DATA_INT: case DATA_FLOAT: case DATA_DOUBLE: - case DATA_MYSQL: return(dtype_get_len(type)); - - case DATA_SYS: if (type->prtype == DATA_ROW_ID) { - return(DATA_ROW_ID_LEN); - } else if (type->prtype == DATA_TRX_ID) { - return(DATA_TRX_ID_LEN); - } else if (type->prtype == DATA_ROLL_PTR) { - return(DATA_ROLL_PTR_LEN); + case DATA_MYSQL: + if (type->prtype & DATA_BINARY_TYPE) { + return(dtype_get_len(type)); } else { + /* We play it safe here and ask MySQL for + mbminlen and mbmaxlen. Although + type->mbminlen and type->mbmaxlen are + initialized if and only if type->prtype + is (in one of the 3 functions in this file), + it could be that none of these functions + has been called. */ + + ulint mbminlen, mbmaxlen; + + innobase_get_cset_width( + dtype_get_charset_coll(type->prtype), + &mbminlen, &mbmaxlen); + + if (UNIV_UNLIKELY(type->mbminlen != mbminlen) + || UNIV_UNLIKELY(type->mbmaxlen != mbmaxlen)) { + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: " + "mbminlen=%lu, " + "mbmaxlen=%lu, " + "type->mbminlen=%lu, " + "type->mbmaxlen=%lu\n", + (ulong) mbminlen, + (ulong) mbmaxlen, + (ulong) type->mbminlen, + (ulong) type->mbmaxlen); + } + if (mbminlen == mbmaxlen) { + return(dtype_get_len(type)); + } + } + /* fall through for variable-length charsets */ + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + case DATA_BLOB: + return(0); + default: ut_error; + } + + return(0); +} + +/*************************************************************************** +Returns the size of a fixed size data type, 0 if not a fixed size type. 
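/* Worked example, not part of the patch: the minimum-size arithmetic used by
dtype_get_min_size() below.  Assumed figures: a CHAR(10) column in the 3-byte
utf8 charset has len = field->pack_length() = 30, mbminlen = 1 and
mbmaxlen = 3, so the minimum size is 30 * 1 / 3 = 10 bytes, while
dtype_get_fixed_size() above reports 0 (not a fixed-size type). */
static ulint
dtype_min_size_sketch(
/*==================*/
				/* out: smallest possible stored length,
				in bytes */
	ulint	len,		/* in: maximum byte length of the column */
	ulint	mbminlen,	/* in: minimum bytes per character */
	ulint	mbmaxlen)	/* in: maximum bytes per character */
{
	if (mbminlen == mbmaxlen) {

		return(len);	/* fixed-width charset: fixed-size column */
	}

	/* variable-width charset: at most len / mbmaxlen characters fit,
	each needing at least mbminlen bytes */
	return(len * mbminlen / mbmaxlen);
}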
*/ +UNIV_INLINE +ulint +dtype_get_min_size( +/*===============*/ + /* out: minimum size */ + const dtype_t* type) /* in: type */ +{ + switch (type->mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (type->prtype & DATA_MYSQL_TYPE_MASK) { + default: + ut_ad(0); return(0); + case DATA_ROW_ID: + ut_ad(type->len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(type->len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(type->len == DATA_ROLL_PTR_LEN); + break; + case DATA_MIX_ID: + ut_ad(type->len == DATA_MIX_ID_LEN); + break; + } +#endif /* UNIV_DEBUG */ + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + return(type->len); + case DATA_MYSQL: + if ((type->prtype & DATA_BINARY_TYPE) + || type->mbminlen == type->mbmaxlen) { + return(type->len); } + /* this is a variable-length character set */ + ut_a(type->mbminlen > 0); + ut_a(type->mbmaxlen > type->mbminlen); + ut_a(type->len % type->mbmaxlen == 0); + return(type->len * type->mbminlen / type->mbmaxlen); case DATA_VARCHAR: case DATA_BINARY: case DATA_DECIMAL: @@ -285,6 +476,7 @@ dtype_get_fixed_size( return(0); } +#endif /* !UNIV_HOTBACKUP */ /*************************************************************************** Returns a stored SQL NULL size for a type. For fixed length types it is diff --git a/innobase/include/dict0boot.h b/innobase/include/dict0boot.h index 35eff5af29a..86702cbca05 100644 --- a/innobase/include/dict0boot.h +++ b/innobase/include/dict0boot.h @@ -119,6 +119,7 @@ dict_create(void); clustered index */ #define DICT_SYS_INDEXES_PAGE_NO_FIELD 8 #define DICT_SYS_INDEXES_SPACE_NO_FIELD 7 +#define DICT_SYS_INDEXES_TYPE_FIELD 6 /* When a row id which is zero modulo this number (which must be a power of two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is diff --git a/innobase/include/dict0crea.h b/innobase/include/dict0crea.h index 8b6944fc605..5dd571be59c 100644 --- a/innobase/include/dict0crea.h +++ b/innobase/include/dict0crea.h @@ -54,6 +54,20 @@ dict_create_index_step( /* out: query thread to run next or NULL */ que_thr_t* thr); /* in: query thread */ /*********************************************************************** +Truncates the index tree associated with a row in SYS_INDEXES table. */ + +ulint +dict_truncate_index_tree( +/*=====================*/ + /* out: new root page number, or + FIL_NULL on failure */ + dict_table_t* table, /* in: the table the index belongs to */ + rec_t* rec, /* in: record in the clustered index of + SYS_INDEXES table */ + mtr_t* mtr); /* in: mtr having the latch + on the record page. The mtr may be + committed and restarted in this call. */ +/*********************************************************************** Drops the index tree associated with a row in SYS_INDEXES table. 
*/ void @@ -142,6 +156,7 @@ struct ind_node_struct{ /*----------------------*/ /* Local storage for this graph node */ ulint state; /* node execution state */ + ulint page_no;/* root page number of the index */ dict_table_t* table; /* table which owns the index */ dtuple_t* ind_row;/* index definition row built */ ulint field_no;/* next field definition to insert */ diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h index 3333385ec56..d9cda402bac 100644 --- a/innobase/include/dict0dict.h +++ b/innobase/include/dict0dict.h @@ -516,8 +516,9 @@ dict_index_add_to_cache( /*====================*/ /* out: TRUE if success */ dict_table_t* table, /* in: table on which the index is */ - dict_index_t* index); /* in, own: index; NOTE! The index memory + dict_index_t* index, /* in, own: index; NOTE! The index memory object is freed in this function! */ + ulint page_no);/* in: root page number of the index */ /************************************************************************ Gets the number of fields in the internal representation of an index, including fields added by the dictionary system. */ @@ -647,6 +648,16 @@ dict_index_get_sys_col_pos( dict_index_t* index, /* in: index */ ulint type); /* in: DATA_ROW_ID, ... */ /*********************************************************************** +Adds a column to index. */ + +void +dict_index_add_col( +/*===============*/ + dict_index_t* index, /* in: index */ + dict_col_t* col, /* in: column */ + ulint order, /* in: order criterion */ + ulint prefix_len); /* in: column prefix length */ +/*********************************************************************** Copies types of fields contained in index to tuple. */ void @@ -655,18 +666,6 @@ dict_index_copy_types( dtuple_t* tuple, /* in: data tuple */ dict_index_t* index, /* in: index */ ulint n_fields); /* in: number of field types to copy */ -/************************************************************************ -Gets the value of a system column in a clustered index record. The clustered -index must contain the system column: if the index is unique, row id is -not contained there! */ -UNIV_INLINE -dulint -dict_index_rec_get_sys_col( -/*=======================*/ - /* out: system column value */ - dict_index_t* index, /* in: clustered index describing the record */ - ulint type, /* in: column type: DATA_ROLL_PTR, ... */ - rec_t* rec); /* in: record */ /************************************************************************* Gets the index tree where the index is stored. */ UNIV_INLINE @@ -696,9 +695,10 @@ dict_tree_t* dict_tree_create( /*=============*/ /* out, own: created tree */ - dict_index_t* index); /* in: the index for which to create: in the + dict_index_t* index, /* in: the index for which to create: in the case of a mixed tree, this should be the index of the cluster object */ + ulint page_no);/* in: root page number of the index */ /************************************************************************** Frees an index tree struct. */ @@ -728,7 +728,7 @@ dict_tree_find_index_for_tuple( dtuple_t* tuple); /* in: tuple for which to find index */ /*********************************************************************** Checks if a table which is a mixed cluster member owns a record. 
*/ -UNIV_INLINE + ibool dict_is_mixed_table_rec( /*====================*/ @@ -778,6 +778,7 @@ dict_tree_copy_rec_order_prefix( /* out: pointer to the prefix record */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to copy prefix */ + ulint* n_fields,/* out: number of fields copied */ byte** buf, /* in/out: memory buffer for the copied prefix, or NULL */ ulint* buf_size);/* in/out: buffer size */ @@ -790,6 +791,7 @@ dict_tree_build_data_tuple( /* out, own: data tuple */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to build data tuple */ + ulint n_fields,/* in: number of data fields */ mem_heap_t* heap); /* in: memory heap where tuple created */ /************************************************************************* Gets the space id of the root of the index tree. */ diff --git a/innobase/include/dict0dict.ic b/innobase/include/dict0dict.ic index 85e4aaf1a05..928a693f860 100644 --- a/innobase/include/dict0dict.ic +++ b/innobase/include/dict0dict.ic @@ -9,7 +9,6 @@ Created 1/8/1996 Heikki Tuuri #include "dict0load.h" #include "trx0undo.h" #include "trx0sys.h" -#include "rem0rec.h" /************************************************************************* Gets the column data type. */ @@ -168,7 +167,7 @@ dict_table_get_sys_col( col = dict_table_get_nth_col(table, table->n_cols - DATA_N_SYS_COLS + sys); ut_ad(col->type.mtype == DATA_SYS); - ut_ad(col->type.prtype == sys); + ut_ad(col->type.prtype == (sys | DATA_NOT_NULL)); return(col); } @@ -312,49 +311,6 @@ dict_index_get_sys_col_pos( dict_table_get_sys_col_no(index->table, type))); } -/************************************************************************ -Gets the value of a system column in a clustered index record. The clustered -index must contain the system column: if the index is unique, row id is -not contained there! */ -UNIV_INLINE -dulint -dict_index_rec_get_sys_col( -/*=======================*/ - /* out: system column value */ - dict_index_t* index, /* in: clustered index describing the record */ - ulint type, /* in: column type: DATA_ROLL_PTR, ... */ - rec_t* rec) /* in: record */ -{ - ulint pos; - byte* field; - ulint len; - - ut_ad(index); - ut_ad(index->type & DICT_CLUSTERED); - - pos = dict_index_get_sys_col_pos(index, type); - - ut_ad(pos != ULINT_UNDEFINED); - - field = rec_get_nth_field(rec, pos, &len); - - if (type == DATA_ROLL_PTR) { - ut_ad(len == 7); - - return(trx_read_roll_ptr(field)); - } else if (type == DATA_TRX_ID) { - - return(trx_read_trx_id(field)); - } else if (type == DATA_MIX_ID) { - - return(mach_dulint_read_compressed(field)); - } else { - ut_a(type == DATA_ROW_ID); - - return(mach_read_from_6(field)); - } -} - /************************************************************************* Gets the index tree where the index is stored. */ UNIV_INLINE @@ -662,28 +618,3 @@ dict_table_get_index( return(index); } - -/*********************************************************************** -Checks if a table which is a mixed cluster member owns a record. 
*/ -UNIV_INLINE -ibool -dict_is_mixed_table_rec( -/*====================*/ - /* out: TRUE if the record belongs to this - table */ - dict_table_t* table, /* in: table in a mixed cluster */ - rec_t* rec) /* in: user record in the clustered index */ -{ - byte* mix_id_field; - ulint len; - - mix_id_field = rec_get_nth_field(rec, table->mix_len, &len); - - if ((len != table->mix_id_len) - || (0 != ut_memcmp(table->mix_id_buf, mix_id_field, len))) { - - return(FALSE); - } - - return(TRUE); -} diff --git a/innobase/include/dict0mem.h b/innobase/include/dict0mem.h index 1e496a25477..ff6c4ec9b28 100644 --- a/innobase/include/dict0mem.h +++ b/innobase/include/dict0mem.h @@ -54,7 +54,8 @@ dict_mem_table_create( of the table is placed; this parameter is ignored if the table is made a member of a cluster */ - ulint n_cols); /* in: number of columns */ + ulint n_cols, /* in: number of columns */ + ibool comp); /* in: TRUE=compact page format */ /************************************************************************** Creates a cluster memory object. */ @@ -171,6 +172,13 @@ struct dict_field_struct{ DICT_MAX_COL_PREFIX_LEN; NOTE that in the UTF-8 charset, MySQL sets this to 3 * the prefix len in UTF-8 chars */ + ulint fixed_len; /* 0 or the fixed length of the + column if smaller than + DICT_MAX_COL_PREFIX_LEN */ + ulint fixed_offs; /* offset to the field, or + ULINT_UNDEFINED if it is not fixed + within the record (due to preceding + variable-length fields) */ }; /* Data structure for an index tree */ @@ -210,7 +218,6 @@ struct dict_index_struct{ const char* table_name; /* table name */ dict_table_t* table; /* back pointer to table */ ulint space; /* space where the index tree is placed */ - ulint page_no;/* page number of the index tree root */ ulint trx_id_offset;/* position of the the trx id column in a clustered index record, if the fields before it are known to be of a fixed size, @@ -225,6 +232,7 @@ struct dict_index_struct{ ulint n_def; /* number of fields defined so far */ ulint n_fields;/* number of fields in the index */ dict_field_t* fields; /* array of field descriptions */ + ulint n_nullable;/* number of nullable fields */ UT_LIST_NODE_T(dict_index_t) indexes;/* list of indexes of the table */ dict_tree_t* tree; /* index tree struct */ @@ -320,6 +328,7 @@ struct dict_table_struct{ ibool tablespace_discarded;/* this flag is set TRUE when the user calls DISCARD TABLESPACE on this table, and reset to FALSE in IMPORT TABLESPACE */ + ibool comp; /* flag: TRUE=compact page format */ hash_node_t name_hash; /* hash chain node */ hash_node_t id_hash; /* hash chain node */ ulint n_def; /* number of columns defined so far */ diff --git a/innobase/include/dyn0dyn.h b/innobase/include/dyn0dyn.h index abee62300e3..1df976a5301 100644 --- a/innobase/include/dyn0dyn.h +++ b/innobase/include/dyn0dyn.h @@ -132,7 +132,7 @@ void dyn_push_string( /*============*/ dyn_array_t* arr, /* in: dyn array */ - byte* str, /* in: string to write */ + const byte* str, /* in: string to write */ ulint len); /* in: string length */ /*#################################################################*/ diff --git a/innobase/include/dyn0dyn.ic b/innobase/include/dyn0dyn.ic index b6c4808398b..c1b8f2cb8ce 100644 --- a/innobase/include/dyn0dyn.ic +++ b/innobase/include/dyn0dyn.ic @@ -324,10 +324,9 @@ void dyn_push_string( /*============*/ dyn_array_t* arr, /* in: dyn array */ - byte* str, /* in: string to write */ + const byte* str, /* in: string to write */ ulint len) /* in: string length */ { - byte* ptr; ulint n_copied; while 
(len > 0) { @@ -337,9 +336,7 @@ dyn_push_string( n_copied = len; } - ptr = (byte*) dyn_array_push(arr, n_copied); - - ut_memcpy(ptr, str, n_copied); + memcpy(dyn_array_push(arr, n_copied), str, n_copied); str += n_copied; len -= n_copied; diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h index c1a127aadca..aa1ec5c25a5 100644 --- a/innobase/include/fil0fil.h +++ b/innobase/include/fil0fil.h @@ -89,6 +89,8 @@ extern fil_addr_t fil_addr_null; #define FIL_TABLESPACE 501 #define FIL_LOG 502 +extern ulint fil_n_log_flushes; + extern ulint fil_n_pending_log_flushes; extern ulint fil_n_pending_tablespace_flushes; diff --git a/innobase/include/lock0lock.h b/innobase/include/lock0lock.h index 74288d57285..20b1f1d7145 100644 --- a/innobase/include/lock0lock.h +++ b/innobase/include/lock0lock.h @@ -19,7 +19,9 @@ Created 5/7/1996 Heikki Tuuri #include "read0types.h" #include "hash0hash.h" +#ifdef UNIV_DEBUG extern ibool lock_print_waits; +#endif /* UNIV_DEBUG */ /* Buffer for storing information about the most recent deadlock error */ extern FILE* lock_latest_err_file; @@ -47,7 +49,8 @@ lock_sec_rec_some_has_impl_off_kernel( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index); /* in: secondary index */ + dict_index_t* index, /* in: secondary index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Checks if some transaction has an implicit x-lock on a record in a clustered index. */ @@ -58,7 +61,8 @@ lock_clust_rec_some_has_impl( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /***************************************************************** Resets the lock bits for a single record. Releases transactions waiting for lock requests here. */ @@ -214,6 +218,7 @@ actual record is being moved. 
*/ void lock_rec_store_on_page_infimum( /*===========================*/ + page_t* page, /* in: page containing the record */ rec_t* rec); /* in: record whose lock state is stored on the infimum record of the same page; lock bits are reset on the record */ @@ -275,6 +280,7 @@ lock_clust_rec_modify_check_and_lock( does nothing */ rec_t* rec, /* in: record which should be modified */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr); /* in: query thread */ /************************************************************************* Checks if locks of other transactions prevent an immediate modify @@ -308,6 +314,7 @@ lock_sec_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: secondary index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -333,6 +340,34 @@ lock_clust_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint mode, /* in: mode of the lock which the read cursor + should set on records: LOCK_S or LOCK_X; the + latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************* +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". */ + +ulint +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, + does nothing */ + rec_t* rec, /* in: user record or page supremum record + which should be read or passed over by a read + cursor */ + dict_index_t* index, /* in: clustered index */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -350,6 +385,7 @@ lock_clust_rec_cons_read_sees( rec_t* rec, /* in: user record which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ read_view_t* view); /* in: consistent read view */ /************************************************************************* Checks that a non-clustered index record is seen in a consistent read. 
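/* Illustrative sketch, not part of the patch: the calling convention behind
the new 'offsets' parameters.  The caller computes the record's field offsets
once with rec_get_offsets() and passes them to every lock routine instead of
having each routine re-parse the record.  The exact rec_get_offsets()
signature is assumed here as (rec, index, offsets, n_fields, &heap), matching
the shorthand "rec_get_offsets(rec, index)" in the comments above. */
static ulint
lock_read_lock_clust_rec_sketch(
/*============================*/
				/* out: DB_SUCCESS, DB_LOCK_WAIT, ... */
	rec_t*		rec,	/* in: user record in a clustered index */
	dict_index_t*	index,	/* in: clustered index */
	que_thr_t*	thr)	/* in: query thread */
{
	mem_heap_t*	heap	= NULL;
	ulint*		offsets;
	ulint		err;

	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

	err = lock_clust_rec_read_check_and_lock(0, rec, index, offsets,
					LOCK_S, LOCK_ORDINARY, thr);
	if (heap) {
		mem_heap_free(heap);
	}

	/* lock_clust_rec_read_check_and_lock_alt() performs the same check
	without requiring the caller to supply 'offsets' */
	return(err);
}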
*/ @@ -379,9 +415,7 @@ lock_table( /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, - does nothing; - if LOCK_TABLE_EXP bits are set, - creates an explicit table lock */ + does nothing */ dict_table_t* table, /* in: database table in dictionary cache */ ulint mode, /* in: lock mode */ que_thr_t* thr); /* in: query thread */ @@ -418,15 +452,6 @@ lock_release_off_kernel( /*====================*/ trx_t* trx); /* in: transaction */ /************************************************************************* -Releases table locks explicitly requested with LOCK TABLES (indicated by -lock type LOCK_TABLE_EXP), and releases possible other transactions waiting -because of these locks. */ - -void -lock_release_tables_off_kernel( -/*===========================*/ - trx_t* trx); /* in: transaction */ -/************************************************************************* Cancels a waiting lock request and releases possible other transactions waiting behind it. */ @@ -499,6 +524,7 @@ lock_check_trx_id_sanity( dulint trx_id, /* in: trx id */ rec_t* rec, /* in: user record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ ibool has_kernel_mutex);/* in: TRUE if the caller owns the kernel mutex */ /************************************************************************* @@ -509,7 +535,8 @@ lock_rec_queue_validate( /*====================*/ /* out: TRUE if ok */ rec_t* rec, /* in: record to look at */ - dict_index_t* index); /* in: index, or NULL if not known */ + dict_index_t* index, /* in: index, or NULL if not known */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Prints info of a table lock. 
*/ @@ -583,7 +610,6 @@ extern lock_sys_t* lock_sys; /* Lock types */ #define LOCK_TABLE 16 /* these type values should be so high that */ #define LOCK_REC 32 /* they can be ORed to the lock mode */ -#define LOCK_TABLE_EXP 80 /* explicit table lock (80 = 16 + 64) */ #define LOCK_TYPE_MASK 0xF0UL /* mask used to extract lock type from the type_mode field in a lock */ /* Waiting lock flag */ diff --git a/innobase/include/lock0lock.ic b/innobase/include/lock0lock.ic index fabc9256401..c7a71bb45d8 100644 --- a/innobase/include/lock0lock.ic +++ b/innobase/include/lock0lock.ic @@ -60,7 +60,8 @@ lock_clust_rec_some_has_impl( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { dulint trx_id; @@ -70,7 +71,7 @@ lock_clust_rec_some_has_impl( ut_ad(index->type & DICT_CLUSTERED); ut_ad(page_rec_is_user_rec(rec)); - trx_id = row_get_rec_trx_id(rec, index); + trx_id = row_get_rec_trx_id(rec, index, offsets); if (trx_is_active(trx_id)) { /* The modifying or inserting transaction is active */ diff --git a/innobase/include/log0log.h b/innobase/include/log0log.h index d14a116072d..7f3f10438b4 100644 --- a/innobase/include/log0log.h +++ b/innobase/include/log0log.h @@ -17,8 +17,12 @@ Created 12/9/1995 Heikki Tuuri typedef struct log_struct log_t; typedef struct log_group_struct log_group_t; +#ifdef UNIV_DEBUG extern ibool log_do_write; extern ibool log_debug_writes; +#else /* UNIV_DEBUG */ +# define log_do_write TRUE +#endif /* UNIV_DEBUG */ /* Wait modes for log_write_up_to */ #define LOG_NO_WAIT 91 diff --git a/innobase/include/mach0data.h b/innobase/include/mach0data.h index 7ad760cd60f..f9a3ff521d5 100644 --- a/innobase/include/mach0data.h +++ b/innobase/include/mach0data.h @@ -52,6 +52,27 @@ mach_read_from_2( /*=============*/ /* out: ulint integer, >= 0, < 64k */ byte* b); /* in: pointer to two bytes */ + +/************************************************************ +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + /* out: 16-bit integer in canonical format */ + ulint n); /* in: integer in machine-dependent format */ +/************************************************************ +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + /* out: integer in machine-dependent format */ + uint16 n); /* in: 16-bit integer in canonical format */ /*********************************************************** The following function is used to store data in 3 consecutive bytes. We store the most significant byte to the lowest address. */ diff --git a/innobase/include/mach0data.ic b/innobase/include/mach0data.ic index 3ffb9baa344..888f3f743e4 100644 --- a/innobase/include/mach0data.ic +++ b/innobase/include/mach0data.ic @@ -68,6 +68,37 @@ mach_read_from_2( ); } +/************************************************************ +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. 
*/ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + /* out: 16-bit integer in canonical format */ + ulint n) /* in: integer in machine-dependent format */ +{ + uint16 ret; + ut_ad(2 == sizeof ret); + mach_write_to_2((byte*) &ret, n); + return(ret); +} +/************************************************************ +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + /* out: integer in machine-dependent format */ + uint16 n) /* in: 16-bit integer in canonical format */ +{ + ut_ad(2 == sizeof n); + return(mach_read_from_2((byte*) &n)); +} + /*********************************************************** The following function is used to store data in 3 consecutive bytes. We store the most significant byte to the lowest address. */ diff --git a/innobase/include/mem0mem.ic b/innobase/include/mem0mem.ic index 82d88099c3f..8c87c884d78 100644 --- a/innobase/include/mem0mem.ic +++ b/innobase/include/mem0mem.ic @@ -623,7 +623,7 @@ mem_strdupq( } *d++ = q; *d++ = '\0'; - ut_ad(len == d - dst); + ut_ad((ssize_t) len == d - dst); return(dst); } diff --git a/innobase/include/mtr0log.h b/innobase/include/mtr0log.h index 9c9c6f696e8..6a3920aa8a1 100644 --- a/innobase/include/mtr0log.h +++ b/innobase/include/mtr0log.h @@ -11,6 +11,7 @@ Created 12/7/1995 Heikki Tuuri #include "univ.i" #include "mtr0mtr.h" +#include "dict0types.h" /************************************************************ Writes 1 - 4 bytes to a file page buffered in the buffer pool. @@ -40,10 +41,10 @@ corresponding log record to the mini-transaction log. */ void mlog_write_string( /*==============*/ - byte* ptr, /* in: pointer where to write */ - byte* str, /* in: string to write */ - ulint len, /* in: string length */ - mtr_t* mtr); /* in: mini-transaction handle */ + byte* ptr, /* in: pointer where to write */ + const byte* str, /* in: string to write */ + ulint len, /* in: string length */ + mtr_t* mtr); /* in: mini-transaction handle */ /************************************************************ Writes initial part of a log record consisting of one-byte item type and four-byte space and page numbers. */ @@ -84,9 +85,9 @@ Catenates n bytes to the mtr log. */ void mlog_catenate_string( /*=================*/ - mtr_t* mtr, /* in: mtr */ - byte* str, /* in: string to write */ - ulint len); /* in: string length */ + mtr_t* mtr, /* in: mtr */ + const byte* str, /* in: string to write */ + ulint len); /* in: string length */ /************************************************************ Catenates a compressed ulint to mlog. */ UNIV_INLINE @@ -173,6 +174,38 @@ mlog_parse_string( byte* page); /* in: page where to apply the log record, or NULL */ +/************************************************************ +Opens a buffer for mlog, writes the initial log record and, +if needed, the field lengths of an index. Reserves space +for further log entries. The log entry must be closed with +mtr_close(). */ + +byte* +mlog_open_and_write_index( +/*======================*/ + /* out: buffer, NULL if log mode + MTR_LOG_NONE */ + mtr_t* mtr, /* in: mtr */ + byte* rec, /* in: index record or page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: log item type */ + ulint size); /* in: requested buffer size in bytes + (if 0, calls mlog_close() and returns NULL) */ + +/************************************************************ +Parses a log record written by mlog_open_and_write_index. 
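/* Illustrative sketch, not part of the patch: the intended use of
mach_encode_2()/mach_decode_2() defined above.  A 16-bit value is converted
to the canonical (most-significant-byte-first) format once, so that repeated
comparisons against buffered data become plain memcmp() calls instead of
mach_read_from_2() decodes. */
static ibool
mach_field_equals_sketch(
/*=====================*/
				/* out: TRUE if the 2 bytes at 'ptr' store
				the value 'n' */
	const byte*	ptr,	/* in: pointer to 2 bytes in canonical
				format */
	ulint		n)	/* in: value to compare against, in
				machine-dependent format */
{
	uint16	canon = mach_encode_2(n);

	return(0 == memcmp(ptr, &canon, 2));
}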
*/ + +byte* +mlog_parse_index( +/*=============*/ + /* out: parsed record end, + NULL if not a complete record */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + /* out: new value of log_ptr */ + ibool comp, /* in: TRUE=compact record format */ + dict_index_t** index); /* out, own: dummy index */ + /* Insert, update, and maybe other functions may use this value to define an extra mlog buffer size for variable size data */ #define MLOG_BUF_MARGIN 256 diff --git a/innobase/include/mtr0mtr.h b/innobase/include/mtr0mtr.h index e8c68a91dad..f44e813cf6b 100644 --- a/innobase/include/mtr0mtr.h +++ b/innobase/include/mtr0mtr.h @@ -102,7 +102,35 @@ flag value must give the length also! */ file rename */ #define MLOG_FILE_DELETE ((byte)35) /* log record about an .ibd file deletion */ -#define MLOG_BIGGEST_TYPE ((byte)35) /* biggest value (used in +#define MLOG_COMP_REC_MIN_MARK ((byte)36) /* mark a compact index record + as the predefined minimum + record */ +#define MLOG_COMP_PAGE_CREATE ((byte)37) /* create a compact + index page */ +#define MLOG_COMP_REC_INSERT ((byte)38) /* compact record insert */ +#define MLOG_COMP_REC_CLUST_DELETE_MARK ((byte)39) + /* mark compact clustered index + record deleted */ +#define MLOG_COMP_REC_SEC_DELETE_MARK ((byte)40)/* mark compact secondary index + record deleted; this log + record type is redundant, as + MLOG_REC_SEC_DELETE_MARK is + independent of the record + format. */ +#define MLOG_COMP_REC_UPDATE_IN_PLACE ((byte)41)/* update of a compact record, + preserves record field sizes */ +#define MLOG_COMP_REC_DELETE ((byte)42) /* delete a compact record + from a page */ +#define MLOG_COMP_LIST_END_DELETE ((byte)43) /* delete compact record list + end on index page */ +#define MLOG_COMP_LIST_START_DELETE ((byte)44) /* delete compact record list + start on index page */ +#define MLOG_COMP_LIST_END_COPY_CREATED ((byte)45) + /* copy compact record list end + to a new created index page */ +#define MLOG_COMP_PAGE_REORGANIZE ((byte)46) /* reorganize an index page */ + +#define MLOG_BIGGEST_TYPE ((byte)46) /* biggest value (used in asserts) */ /******************************************************************* diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h index 280a949c1c5..adbc4afafd2 100644 --- a/innobase/include/os0file.h +++ b/innobase/include/os0file.h @@ -17,6 +17,8 @@ Created 10/21/1995 Heikki Tuuri #include <time.h> #endif +typedef struct fil_node_struct fil_node_t; + #ifdef UNIV_DO_FLUSH extern ibool os_do_not_call_flush_at_each_write; #endif /* UNIV_DO_FLUSH */ @@ -26,6 +28,9 @@ extern ibool os_aio_print_debug; extern ulint os_file_n_pending_preads; extern ulint os_file_n_pending_pwrites; +extern ulint os_n_pending_reads; +extern ulint os_n_pending_writes; + #ifdef __WIN__ /* We define always WIN_ASYNC_IO, and check at run-time whether @@ -562,7 +567,7 @@ os_aio( ulint offset_high, /* in: most significant 32 bits of offset */ ulint n, /* in: number of bytes to read or write */ - void* message1,/* in: messages for the aio handler (these + fil_node_t* message1,/* in: messages for the aio handler (these can be used to identify a completed aio operation); if mode is OS_AIO_SYNC, these are ignored */ @@ -620,7 +625,7 @@ os_aio_windows_handle( ignored */ ulint pos, /* this parameter is used only in sync aio: wait for the aio slot at this position */ - void** message1, /* out: the messages passed with the aio + fil_node_t**message1, /* out: the messages passed with the aio request; note that also in the case where the aio 
operation failed, these output parameters are valid and can be used to @@ -640,7 +645,7 @@ os_aio_posix_handle( /*================*/ /* out: TRUE if the aio operation succeeded */ ulint array_no, /* in: array number 0 - 3 */ - void** message1, /* out: the messages passed with the aio + fil_node_t**message1, /* out: the messages passed with the aio request; note that also in the case where the aio operation failed, these output parameters are valid and can be used to @@ -660,7 +665,7 @@ os_aio_simulated_handle( i/o thread, segment 1 the log i/o thread, then follow the non-ibuf read threads, and as the last are the non-ibuf write threads */ - void** message1, /* out: the messages passed with the aio + fil_node_t**message1, /* out: the messages passed with the aio request; note that also in the case where the aio operation failed, these output parameters are valid and can be used to @@ -687,6 +692,8 @@ Refreshes the statistics used to print per-second averages. */ void os_aio_refresh_stats(void); /*======================*/ + +#ifdef UNIV_DEBUG /************************************************************************** Checks that all slots in the system have been freed, that is, there are no pending io operations. */ @@ -694,6 +701,7 @@ no pending io operations. */ ibool os_aio_all_slots_free(void); /*=======================*/ +#endif /* UNIV_DEBUG */ /*********************************************************************** This function returns information about the specified file */ diff --git a/innobase/include/os0proc.h b/innobase/include/os0proc.h index d0d3cf82e38..b0b72e18675 100644 --- a/innobase/include/os0proc.h +++ b/innobase/include/os0proc.h @@ -12,6 +12,11 @@ Created 9/30/1995 Heikki Tuuri #include "univ.i" +#ifdef UNIV_LINUX +#include <sys/ipc.h> +#include <sys/shm.h> +#endif + typedef void* os_process_t; typedef unsigned long int os_process_id_t; @@ -27,6 +32,10 @@ page size of an Intel x86 processor. We cannot use AWE with 2 MB or 4 MB pages. */ #define OS_AWE_X86_PAGE_SIZE 4096 +extern ibool os_use_large_pages; +/* Large page size. This may be a boot-time option on some platforms */ +extern ulint os_large_page_size; + /******************************************************************** Windows AWE support. Tries to enable the "lock pages in memory" privilege for the current process so that the current process can allocate memory-locked @@ -103,6 +112,25 @@ os_mem_alloc_nocache( /* out: allocated memory */ ulint n); /* in: number of bytes */ /******************************************************************** +Allocates large pages memory. */ + +void* +os_mem_alloc_large( +/*=================*/ + /* out: allocated memory */ + ulint n, /* in: number of bytes */ + ibool set_to_zero, /* in: TRUE if allocated memory should be set + to zero if UNIV_SET_MEM_TO_ZERO is defined */ + ibool assert_on_error); /* in: if TRUE, we crash mysqld if the memory + cannot be allocated */ +/******************************************************************** +Frees large pages memory. */ + +void +os_mem_free_large( +/*=================*/ +void *ptr); /* in: number of bytes */ +/******************************************************************** Sets the priority boost for threads released from waiting within the current process. 
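/* Illustrative sketch, not part of the patch: one way the large-page
allocation declared above can be implemented on Linux with the System V
shared memory calls pulled in by the <sys/ipc.h>/<sys/shm.h> includes.  This
is a standalone example, not the body of os_mem_alloc_large(). */
#ifdef UNIV_LINUX
static void*
os_mem_alloc_large_sketch(
/*======================*/
				/* out: allocated memory backed by large
				pages, or NULL on failure */
	ulint	n,		/* in: number of bytes */
	ulint	large_page_size)/* in: large page size, in bytes
				(cf. os_large_page_size) */
{
	int	shmid;
	void*	ptr;

	/* round the request up to a whole number of large pages */
	n = ((n + large_page_size - 1) / large_page_size) * large_page_size;

	shmid = shmget(IPC_PRIVATE, (size_t) n,
		       SHM_HUGETLB | SHM_R | SHM_W);
	if (shmid < 0) {

		return(NULL);	/* no large pages available */
	}

	ptr = shmat(shmid, NULL, 0);

	/* mark the segment for removal; it is freed when detached */
	shmctl(shmid, IPC_RMID, NULL);

	return(ptr == (void*) -1 ? NULL : ptr);
}
#endif /* UNIV_LINUX */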
*/ diff --git a/innobase/include/page0cur.h b/innobase/include/page0cur.h index c85669ed4df..b03302b0e77 100644 --- a/innobase/include/page0cur.h +++ b/innobase/include/page0cur.h @@ -26,11 +26,13 @@ Created 10/4/1994 Heikki Tuuri #define PAGE_CUR_GE 2 #define PAGE_CUR_L 3 #define PAGE_CUR_LE 4 -#define PAGE_CUR_LE_OR_EXTENDS 5 /* This is a search mode used in +/*#define PAGE_CUR_LE_OR_EXTENDS 5*/ /* This is a search mode used in "column LIKE 'abc%' ORDER BY column DESC"; we have to find strings which are <= 'abc' or which extend it */ -#define PAGE_CUR_DBG 6 +#ifdef UNIV_SEARCH_DEBUG +# define PAGE_CUR_DBG 6 /* As PAGE_CUR_LE, but skips search shortcut */ +#endif /* UNIV_SEARCH_DEBUG */ #ifdef PAGE_CUR_ADAPT # ifdef UNIV_SEARCH_PERF_STAT @@ -78,16 +80,16 @@ UNIV_INLINE ibool page_cur_is_before_first( /*=====================*/ - /* out: TRUE if at start */ - page_cur_t* cur); /* in: cursor */ + /* out: TRUE if at start */ + const page_cur_t* cur); /* in: cursor */ /************************************************************* Returns TRUE if the cursor is after last user record. */ UNIV_INLINE ibool page_cur_is_after_last( /*===================*/ - /* out: TRUE if at end */ - page_cur_t* cur); /* in: cursor */ + /* out: TRUE if at end */ + const page_cur_t* cur); /* in: cursor */ /************************************************************** Positions the cursor on the given record. */ UNIV_INLINE @@ -128,7 +130,8 @@ page_cur_tuple_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - dtuple_t* tuple, /* in: pointer to a data tuple */ + dtuple_t* tuple, /* in: pointer to a data tuple */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr); /* in: mini-transaction handle */ /*************************************************************** Inserts a record next to page cursor. Returns pointer to inserted record if @@ -142,6 +145,8 @@ page_cur_rec_insert( otherwise */ page_cur_t* cursor, /* in: a page cursor */ rec_t* rec, /* in: record to insert */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in: rec_get_offsets(rec, index) */ mtr_t* mtr); /* in: mini-transaction handle */ /*************************************************************** Inserts a record next to page cursor. Returns pointer to inserted record if @@ -155,9 +160,10 @@ page_cur_insert_rec_low( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ - ulint data_size,/* in: data size of tuple */ - rec_t* rec, /* in: pointer to a physical record or NULL */ + dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ + dict_index_t* index, /* in: record descriptor */ + rec_t* rec, /* in: pointer to a physical record or NULL */ + ulint* offsets,/* in: rec_get_offsets(rec, index) or NULL */ mtr_t* mtr); /* in: mini-transaction handle */ /***************************************************************** Copies records from page to a newly created page, from a given record onward, @@ -166,10 +172,11 @@ including that record. Infimum and supremum records are not copied. 
*/ void page_copy_rec_list_end_to_created_page( /*===================================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: first record to copy */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: first record to copy */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /*************************************************************** Deletes a record at the page cursor. The cursor is moved to the next record after the deleted one. */ @@ -177,8 +184,10 @@ next record after the deleted one. */ void page_cur_delete_rec( /*================*/ - page_cur_t* cursor, /* in: a page cursor */ - mtr_t* mtr); /* in: mini-transaction handle */ + page_cur_t* cursor, /* in: a page cursor */ + dict_index_t* index, /* in: record descriptor */ + const ulint* offsets,/* in: rec_get_offsets(cursor->rec, index) */ + mtr_t* mtr); /* in: mini-transaction handle */ /******************************************************************** Searches the right position for a page cursor. */ UNIV_INLINE @@ -187,6 +196,7 @@ page_cur_search( /*============*/ /* out: number of matched fields on the left */ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -198,6 +208,7 @@ void page_cur_search_with_match( /*=======================*/ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -229,34 +240,37 @@ Parses a log record of a record insert on a page. */ byte* page_cur_parse_insert_rec( /*======================*/ - /* out: end of log record or NULL */ - ibool is_short,/* in: TRUE if short inserts */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + ibool is_short,/* in: TRUE if short inserts */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /************************************************************** Parses a log record of copying a record list end to a new created page. */ byte* page_parse_copy_rec_list_to_created_page( /*=====================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /*************************************************************** Parses log record of a record delete on a page. 
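/* Illustrative sketch, not part of the patch: with the compact record format
a record can no longer be parsed from the page alone, so the index (the
"record descriptor") is now threaded through the page-cursor API.  The
trailing page_cur_t* parameter of page_cur_search() is assumed from the
pre-existing interface. */
static rec_t*
page_insert_tuple_sketch(
/*=====================*/
				/* out: pointer to the inserted record,
				or NULL if the insert fails */
	page_t*		page,	/* in: index page */
	dict_index_t*	index,	/* in: record descriptor */
	dtuple_t*	tuple,	/* in: entry to insert */
	mtr_t*		mtr)	/* in: mini-transaction handle */
{
	page_cur_t	cursor;

	/* position the cursor on the last record <= tuple */
	page_cur_search(page, index, tuple, PAGE_CUR_LE, &cursor);

	/* the descriptor is needed again to build the physical record */
	return(page_cur_tuple_insert(&cursor, tuple, index, mtr));
}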
*/ byte* page_cur_parse_delete_rec( /*======================*/ - /* out: pointer to record end or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: pointer to record end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /* Index page cursor */ diff --git a/innobase/include/page0cur.ic b/innobase/include/page0cur.ic index 39f8ab11513..f8346819e84 100644 --- a/innobase/include/page0cur.ic +++ b/innobase/include/page0cur.ic @@ -69,15 +69,10 @@ UNIV_INLINE ibool page_cur_is_before_first( /*=====================*/ - /* out: TRUE if at start */ - page_cur_t* cur) /* in: cursor */ + /* out: TRUE if at start */ + const page_cur_t* cur) /* in: cursor */ { - if (page_get_infimum_rec(page_cur_get_page(cur)) == cur->rec) { - - return(TRUE); - } - - return(FALSE); + return(page_rec_is_infimum(cur->rec)); } /************************************************************* @@ -86,15 +81,10 @@ UNIV_INLINE ibool page_cur_is_after_last( /*===================*/ - /* out: TRUE if at end */ - page_cur_t* cur) /* in: cursor */ + /* out: TRUE if at end */ + const page_cur_t* cur) /* in: cursor */ { - if (page_get_supremum_rec(page_cur_get_page(cur)) == cur->rec) { - - return(TRUE); - } - - return(FALSE); + return(page_rec_is_supremum(cur->rec)); } /************************************************************** @@ -143,7 +133,7 @@ UNIV_INLINE void page_cur_move_to_prev( /*==================*/ - page_cur_t* cur) /* in: cursor; must not before first */ + page_cur_t* cur) /* in: page cursor, not before first */ { ut_ad(!page_cur_is_before_first(cur)); @@ -158,6 +148,7 @@ page_cur_search( /*============*/ /* out: number of matched fields on the left */ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -170,7 +161,7 @@ page_cur_search( ut_ad(dtuple_check_typed(tuple)); - page_cur_search_with_match(page, tuple, mode, + page_cur_search_with_match(page, index, tuple, mode, &up_matched_fields, &up_matched_bytes, &low_matched_fields, @@ -190,16 +181,11 @@ page_cur_tuple_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - dtuple_t* tuple, /* in: pointer to a data tuple */ + dtuple_t* tuple, /* in: pointer to a data tuple */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mini-transaction handle */ { - ulint data_size; - - ut_ad(dtuple_check_typed(tuple)); - - data_size = dtuple_get_data_size(tuple); - - return(page_cur_insert_rec_low(cursor, tuple, data_size, NULL, mtr)); + return(page_cur_insert_rec_low(cursor, tuple, index, NULL, NULL, mtr)); } /*************************************************************** @@ -214,8 +200,11 @@ page_cur_rec_insert( otherwise */ page_cur_t* cursor, /* in: a page cursor */ rec_t* rec, /* in: record to insert */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in: rec_get_offsets(rec, index) */ mtr_t* mtr) /* in: mini-transaction handle */ { - return(page_cur_insert_rec_low(cursor, NULL, 0, rec, mtr)); + return(page_cur_insert_rec_low(cursor, NULL, index, rec, + offsets, mtr)); } diff --git a/innobase/include/page0page.h b/innobase/include/page0page.h index 969313614e3..c4ffa39d3ac 
100644 --- a/innobase/include/page0page.h +++ b/innobase/include/page0page.h @@ -37,7 +37,8 @@ typedef byte page_header_t; /*-----------------------------*/ #define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */ #define PAGE_HEAP_TOP 2 /* pointer to record heap top */ -#define PAGE_N_HEAP 4 /* number of records in the heap */ +#define PAGE_N_HEAP 4 /* number of records in the heap, + bit 15=flag: new-style compact page format */ #define PAGE_FREE 6 /* pointer to start of page free record list */ #define PAGE_GARBAGE 8 /* number of bytes in deleted records */ #define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or @@ -79,15 +80,24 @@ typedef byte page_header_t; #define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE) /* start of data on the page */ -#define PAGE_INFIMUM (PAGE_DATA + 1 + REC_N_EXTRA_BYTES) - /* offset of the page infimum record on the - page */ -#define PAGE_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_EXTRA_BYTES + 8) - /* offset of the page supremum record on the - page */ -#define PAGE_SUPREMUM_END (PAGE_SUPREMUM + 9) +#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES) + /* offset of the page infimum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8) + /* offset of the page supremum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9) /* offset of the page supremum record end on - the page */ + an old-style page */ +#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES) + /* offset of the page infimum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8) + /* offset of the page supremum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8) + /* offset of the page supremum record end on + a new-style compact page */ /*-----------------------------*/ /* Directions of cursor movement */ @@ -233,6 +243,7 @@ page_cmp_dtuple_rec_with_match( be page infimum or supremum, in which case matched-parameter values below are not affected */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns contains the value for current comparison */ @@ -259,6 +270,22 @@ page_rec_get_n_recs_before( /* out: number of records */ rec_t* rec); /* in: the physical record */ /***************************************************************** +Gets the number of records in the heap. */ +UNIV_INLINE +ulint +page_dir_get_n_heap( +/*================*/ + /* out: number of user records */ + page_t* page); /* in: index page */ +/***************************************************************** +Sets the number of records in the heap. */ +UNIV_INLINE +void +page_dir_set_n_heap( +/*================*/ + page_t* page, /* in: index page */ + ulint n_heap);/* in: number of records */ +/***************************************************************** Gets the number of dir slots in directory. */ UNIV_INLINE ulint @@ -267,6 +294,15 @@ page_dir_get_n_slots( /* out: number of slots */ page_t* page); /* in: index page */ /***************************************************************** +Sets the number of dir slots in directory. 
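/* Worked arithmetic, not part of the patch: what the offsets above evaluate
to with the usual constants PAGE_HEADER (= FIL_PAGE_DATA) = 38 and
FSEG_HEADER_SIZE = 10, plus REC_N_OLD_EXTRA_BYTES = 6 and
REC_N_NEW_EXTRA_BYTES = 5 assumed from rec0rec.h.
PAGE_DATA = 38 + 36 + 2 * 10 = 94, so an old-style page keeps its infimum at
byte 101 and its supremum at 116, while a compact page keeps them at 99 and
112.  A compile-time check in the style already used in this patch: */
#if PAGE_OLD_INFIMUM != 101 || PAGE_OLD_SUPREMUM != 116
#error "unexpected old-style infimum/supremum offsets"
#endif
#if PAGE_NEW_INFIMUM != 99 || PAGE_NEW_SUPREMUM != 112
#error "unexpected new-style infimum/supremum offsets"
#endif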
*/ +UNIV_INLINE +void +page_dir_set_n_slots( +/*=================*/ + /* out: number of slots */ + page_t* page, /* in: index page */ + ulint n_slots);/* in: number of slots */ +/***************************************************************** Gets pointer to nth directory slot. */ UNIV_INLINE page_dir_slot_t* @@ -333,7 +369,24 @@ ulint page_dir_find_owner_slot( /*=====================*/ /* out: the directory slot number */ - rec_t* rec); /* in: the physical record */ + rec_t* rec); /* in: the physical record */ +/**************************************************************** +Determine whether the page is in new-style compact format. */ +UNIV_INLINE +ulint +page_is_comp( +/*=========*/ + /* out: nonzero if the page is in compact + format, zero if it is in old-style format */ + page_t* page); /* in: index page */ +/**************************************************************** +TRUE if the record is on a page in compact format. */ +UNIV_INLINE +ulint +page_rec_is_comp( +/*=============*/ + /* out: nonzero if in compact format */ + const rec_t* rec); /* in: record */ /**************************************************************** Gets the pointer to the next record on the page. */ UNIV_INLINE @@ -359,49 +412,58 @@ UNIV_INLINE rec_t* page_rec_get_prev( /*==============*/ - /* out: pointer to previous record */ - rec_t* rec); /* in: pointer to record, must not be page - infimum */ + /* out: pointer to previous record */ + rec_t* rec); /* in: pointer to record, + must not be page infimum */ /**************************************************************** TRUE if the record is a user record on the page. */ UNIV_INLINE ibool -page_rec_is_user_rec( -/*=================*/ +page_rec_is_user_rec_low( +/*=====================*/ /* out: TRUE if a user record */ - rec_t* rec); /* in: record */ + ulint offset);/* in: record offset on page */ /**************************************************************** TRUE if the record is the supremum record on a page. */ UNIV_INLINE ibool -page_rec_is_supremum( -/*=================*/ +page_rec_is_supremum_low( +/*=====================*/ /* out: TRUE if the supremum record */ - rec_t* rec); /* in: record */ + ulint offset);/* in: record offset on page */ /**************************************************************** TRUE if the record is the infimum record on a page. */ UNIV_INLINE ibool -page_rec_is_infimum( -/*================*/ +page_rec_is_infimum_low( +/*=====================*/ /* out: TRUE if the infimum record */ - rec_t* rec); /* in: record */ + ulint offset);/* in: record offset on page */ + /**************************************************************** -TRUE if the record is the first user record on the page. */ +TRUE if the record is a user record on the page. */ UNIV_INLINE ibool -page_rec_is_first_user_rec( -/*=======================*/ - /* out: TRUE if first user record */ - rec_t* rec); /* in: record */ +page_rec_is_user_rec( +/*=================*/ + /* out: TRUE if a user record */ + const rec_t* rec); /* in: record */ /**************************************************************** -TRUE if the record is the last user record on the page. */ +TRUE if the record is the supremum record on a page. 
*/ UNIV_INLINE ibool -page_rec_is_last_user_rec( -/*======================*/ - /* out: TRUE if last user record */ - rec_t* rec); /* in: record */ +page_rec_is_supremum( +/*=================*/ + /* out: TRUE if the supremum record */ + const rec_t* rec); /* in: record */ +/**************************************************************** +TRUE if the record is the infimum record on a page. */ +UNIV_INLINE +ibool +page_rec_is_infimum( +/*================*/ + /* out: TRUE if the infimum record */ + const rec_t* rec); /* in: record */ /******************************************************************* Looks for the record which owns the given record. */ UNIV_INLINE @@ -446,9 +508,11 @@ page_get_max_insert_size_after_reorganize( Calculates free space if a page is emptied. */ UNIV_INLINE ulint -page_get_free_space_of_empty(void); -/*==============================*/ - /* out: free space */ +page_get_free_space_of_empty( +/*=========================*/ + /* out: free space */ + ulint comp) /* in: nonzero=compact page format */ + __attribute__((const)); /**************************************************************** Returns the sum of the sizes of the records in the record list excluding the infimum and supremum records. */ @@ -464,20 +528,23 @@ Allocates a block of memory from an index page. */ byte* page_mem_alloc( /*===========*/ - /* out: pointer to start of allocated - buffer, or NULL if allocation fails */ - page_t* page, /* in: index page */ - ulint need, /* in: number of bytes needed */ - ulint* heap_no);/* out: this contains the heap number - of the allocated record if allocation succeeds */ + /* out: pointer to start of allocated + buffer, or NULL if allocation fails */ + page_t* page, /* in: index page */ + ulint need, /* in: number of bytes needed */ + dict_index_t* index, /* in: record descriptor */ + ulint* heap_no);/* out: this contains the heap number + of the allocated record + if allocation succeeds */ /**************************************************************** Puts a record to free list. */ UNIV_INLINE void page_mem_free( /*==========*/ - page_t* page, /* in: index page */ - rec_t* rec); /* in: pointer to the (origin of) record */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: pointer to the (origin of) record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /************************************************************** The index page creation function. */ @@ -487,7 +554,8 @@ page_create( /* out: pointer to the page */ buf_frame_t* frame, /* in: a buffer frame where the page is created */ - mtr_t* mtr); /* in: mini-transaction handle */ + mtr_t* mtr, /* in: mini-transaction handle */ + ulint comp); /* in: nonzero=compact page format */ /***************************************************************** Differs from page_copy_rec_list_end, because this function does not touch the lock table and max trx id on page. */ @@ -495,10 +563,11 @@ touch the lock table and max trx id on page. 
*/ void page_copy_rec_list_end_no_locks( /*============================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Copies records from page to new_page, from the given record onward, including that record. Infimum and supremum records are not copied. @@ -507,10 +576,11 @@ The records are copied to the start of the record list on new_page. */ void page_copy_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Copies records from page to new_page, up to the given record, NOT including that record. Infimum and supremum records are not copied. @@ -519,10 +589,11 @@ The records are copied to the end of the record list on new_page. */ void page_copy_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Deletes records from a page from a given record onward, including that record. The infimum and supremum records are not deleted. */ @@ -530,14 +601,15 @@ The infimum and supremum records are not deleted. */ void page_delete_rec_list_end( /*=====================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - ulint n_recs, /* in: number of records to delete, or ULINT_UNDEFINED - if not known */ - ulint size, /* in: the sum of the sizes of the records in the end - of the chain to delete, or ULINT_UNDEFINED if not - known */ - mtr_t* mtr); /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + ulint n_recs, /* in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /* in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Deletes records from page, up to the given record, NOT including that record. Infimum and supremum records are not deleted. */ @@ -545,9 +617,10 @@ that record. Infimum and supremum records are not deleted. 
*/ void page_delete_rec_list_start( /*=======================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Moves record list end to another page. Moved records include split_rec. */ @@ -555,10 +628,11 @@ split_rec. */ void page_move_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record to move */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Moves record list start to another page. Moved records do not include split_rec. */ @@ -566,10 +640,11 @@ split_rec. */ void page_move_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record not to move */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record not to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /******************************************************************** Splits a directory slot which owns too many records. */ @@ -595,13 +670,16 @@ Parses a log record of a record list end or start deletion. */ byte* page_parse_delete_rec_list( /*=======================*/ - /* out: end of log record or NULL */ - byte type, /* in: MLOG_LIST_END_DELETE or - MLOG_LIST_START_DELETE */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte type, /* in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /*************************************************************** Parses a redo log record of creating a page. */ @@ -611,6 +689,7 @@ page_parse_create( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ + ulint comp, /* in: nonzero=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr); /* in: mtr or NULL */ /**************************************************************** @@ -620,7 +699,8 @@ the index page context. */ void page_rec_print( /*===========*/ - rec_t* rec); + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: record descriptor */ /******************************************************************* This is used to print the contents of the directory for debugging purposes. */ @@ -637,8 +717,9 @@ debugging purposes. 
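
During crash recovery the page itself may not yet reveal its format, so list deletion gets separate compact-format log record types. A hedged sketch of the deduction a parser can make from the type alone (just the idea, not the actual recovery code; the MLOG_COMP_* names are the ones quoted in the prototype above):

/* Nonzero if a list-deletion log record refers to a compact page. */
static ibool
list_delete_rec_is_comp(byte type)
{
	return(type == MLOG_COMP_LIST_END_DELETE
	       || type == MLOG_COMP_LIST_START_DELETE);
}
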
*/ void page_print_list( /*============*/ - page_t* page, /* in: index page */ - ulint pr_n); /* in: print n first and n last entries */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint pr_n); /* in: print n first and n last entries */ /******************************************************************* Prints the info in a page header. */ @@ -653,9 +734,12 @@ debugging purposes. */ void page_print( /*======*/ - page_t* page, /* in: index page */ - ulint dn, /* in: print dn first and last entries in directory */ - ulint rn); /* in: print rn first and last records on page */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint dn, /* in: print dn first and last entries + in directory */ + ulint rn); /* in: print rn first and last records + in directory */ /******************************************************************* The following is used to validate a record on a page. This function differs from rec_validate as it can also check the n_owned field and @@ -664,8 +748,9 @@ the heap_no field. */ ibool page_rec_validate( /*==============*/ - /* out: TRUE if ok */ - rec_t* rec); /* in: record on the page */ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /******************************************************************* Checks that the first directory slot points to the infimum record and the last to the supremum. This function is intended to track if the diff --git a/innobase/include/page0page.ic b/innobase/include/page0page.ic index c7bf78040e9..fd5281fdbec 100644 --- a/innobase/include/page0page.ic +++ b/innobase/include/page0page.ic @@ -73,7 +73,8 @@ page_header_set_field( { ut_ad(page); ut_ad(field <= PAGE_N_RECS); - ut_ad(val < UNIV_PAGE_SIZE); + ut_ad(field == PAGE_N_HEAP || val < UNIV_PAGE_SIZE); + ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE); mach_write_to_2(page + PAGE_HEADER + field, val); } @@ -152,6 +153,32 @@ page_header_reset_last_insert( } /**************************************************************** +Determine whether the page is in new-style compact format. */ +UNIV_INLINE +ulint +page_is_comp( +/*=========*/ + /* out: nonzero if the page is in compact + format, zero if it is in old-style format */ + page_t* page) /* in: index page */ +{ + return(UNIV_EXPECT(page_header_get_field(page, PAGE_N_HEAP) & 0x8000, + 0x8000)); +} + +/**************************************************************** +TRUE if the record is on a page in compact format. */ +UNIV_INLINE +ulint +page_rec_is_comp( +/*=============*/ + /* out: nonzero if in compact format */ + const rec_t* rec) /* in: record */ +{ + return(page_is_comp(ut_align_down((rec_t*) rec, UNIV_PAGE_SIZE))); +} + +/**************************************************************** Gets the first record on the page. 
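
page_rec_is_comp() relies on buffer frames being aligned to UNIV_PAGE_SIZE: the page that holds a record, and therefore its format, follows from plain pointer arithmetic instead of buf_frame_align(). A self-contained sketch of the alignment trick (helper names invented here; InnoDB uses ut_align_down() and ut_align_offset(), and the 16 KiB page size is an assumption of the example):

#include <stdint.h>

#define PAGE_SIZE	16384U

/* Start of the page frame that contains ptr. */
static void*
frame_of(const void* ptr)
{
	return((void*) ((uintptr_t) ptr & ~(uintptr_t) (PAGE_SIZE - 1)));
}

/* Byte offset of ptr within its page frame. */
static unsigned
offset_in_frame(const void* ptr)
{
	return((unsigned) ((uintptr_t) ptr & (PAGE_SIZE - 1)));
}
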
*/ UNIV_INLINE rec_t* @@ -162,7 +189,11 @@ page_get_infimum_rec( { ut_ad(page); - return(page + PAGE_INFIMUM); + if (page_is_comp(page)) { + return(page + PAGE_NEW_INFIMUM); + } else { + return(page + PAGE_OLD_INFIMUM); + } } /**************************************************************** @@ -176,119 +207,118 @@ page_get_supremum_rec( { ut_ad(page); - return(page + PAGE_SUPREMUM); + if (page_is_comp(page)) { + return(page + PAGE_NEW_SUPREMUM); + } else { + return(page + PAGE_OLD_SUPREMUM); + } } /**************************************************************** TRUE if the record is a user record on the page. */ UNIV_INLINE ibool -page_rec_is_user_rec( -/*=================*/ +page_rec_is_user_rec_low( +/*=====================*/ /* out: TRUE if a user record */ - rec_t* rec) /* in: record */ + ulint offset) /* in: record offset on page */ { - ut_ad(rec); - - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { - - return(FALSE); - } - - if (rec == page_get_infimum_rec(buf_frame_align(rec))) { - - return(FALSE); - } + ut_ad(offset >= PAGE_NEW_INFIMUM); +#if PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM +# error "PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM" +#endif +#if PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM +# error "PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM" +#endif +#if PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM +# error "PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM" +#endif +#if PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM +# error "PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM" +#endif +#if PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END +# error "PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END" +#endif +#if PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END +# error "PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END" +#endif + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); - return(TRUE); + return(UNIV_LIKELY(offset != PAGE_NEW_SUPREMUM) + && UNIV_LIKELY(offset != PAGE_NEW_INFIMUM) + && UNIV_LIKELY(offset != PAGE_OLD_INFIMUM) + && UNIV_LIKELY(offset != PAGE_OLD_SUPREMUM)); } /**************************************************************** TRUE if the record is the supremum record on a page. */ UNIV_INLINE ibool -page_rec_is_supremum( -/*=================*/ +page_rec_is_supremum_low( +/*=====================*/ /* out: TRUE if the supremum record */ - rec_t* rec) /* in: record */ + ulint offset) /* in: record offset on page */ { - ut_ad(rec); - - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + ut_ad(offset >= PAGE_NEW_INFIMUM); + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); - return(TRUE); - } - - return(FALSE); + return(UNIV_UNLIKELY(offset == PAGE_NEW_SUPREMUM) + || UNIV_UNLIKELY(offset == PAGE_OLD_SUPREMUM)); } /**************************************************************** TRUE if the record is the infimum record on a page. */ UNIV_INLINE ibool -page_rec_is_infimum( -/*================*/ +page_rec_is_infimum_low( +/*=====================*/ /* out: TRUE if the infimum record */ - rec_t* rec) /* in: record */ + ulint offset) /* in: record offset on page */ { - ut_ad(rec); - - if (rec == page_get_infimum_rec(buf_frame_align(rec))) { - - return(TRUE); - } + ut_ad(offset >= PAGE_NEW_INFIMUM); + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); - return(FALSE); + return(UNIV_UNLIKELY(offset == PAGE_NEW_INFIMUM) + || UNIV_UNLIKELY(offset == PAGE_OLD_INFIMUM)); } /**************************************************************** -TRUE if the record is the first user record on the page. */ +TRUE if the record is a user record on the page. 
*/ UNIV_INLINE ibool -page_rec_is_first_user_rec( -/*=======================*/ - /* out: TRUE if first user record */ - rec_t* rec) /* in: record */ +page_rec_is_user_rec( +/*=================*/ + /* out: TRUE if a user record */ + const rec_t* rec) /* in: record */ { - ut_ad(rec); - - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { - - return(FALSE); - } - - if (rec == page_rec_get_next( - page_get_infimum_rec(buf_frame_align(rec)))) { - - return(TRUE); - } - - return(FALSE); + return(page_rec_is_user_rec_low( + ut_align_offset(rec, UNIV_PAGE_SIZE))); } /**************************************************************** -TRUE if the record is the last user record on the page. */ +TRUE if the record is the supremum record on a page. */ UNIV_INLINE ibool -page_rec_is_last_user_rec( -/*======================*/ - /* out: TRUE if last user record */ - rec_t* rec) /* in: record */ +page_rec_is_supremum( +/*=================*/ + /* out: TRUE if the supremum record */ + const rec_t* rec) /* in: record */ { - ut_ad(rec); - - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { - - return(FALSE); - } - - if (page_rec_get_next(rec) - == page_get_supremum_rec(buf_frame_align(rec))) { - - return(TRUE); - } + return(page_rec_is_supremum_low( + ut_align_offset(rec, UNIV_PAGE_SIZE))); +} - return(FALSE); +/**************************************************************** +TRUE if the record is the infimum record on a page. */ +UNIV_INLINE +ibool +page_rec_is_infimum( +/*================*/ + /* out: TRUE if the infimum record */ + const rec_t* rec) /* in: record */ +{ + return(page_rec_is_infimum_low( + ut_align_offset(rec, UNIV_PAGE_SIZE))); } /***************************************************************** @@ -309,6 +339,7 @@ page_cmp_dtuple_rec_with_match( be page infimum or supremum, in which case matched-parameter values below are not affected */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns contains the value for current comparison */ @@ -317,21 +348,26 @@ page_cmp_dtuple_rec_with_match( matched; when function returns contains the value for current comparison */ { - page_t* page; + ulint rec_offset; ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_comp(offsets) == !page_rec_is_comp(rec)); - page = buf_frame_align(rec); + rec_offset = ut_align_offset(rec, UNIV_PAGE_SIZE); - if (rec == page_get_infimum_rec(page)) { + if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_INFIMUM) + || UNIV_UNLIKELY(rec_offset == PAGE_OLD_INFIMUM)) { return(1); - } else if (rec == page_get_supremum_rec(page)) { + } + if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_SUPREMUM) + || UNIV_UNLIKELY(rec_offset == PAGE_OLD_SUPREMUM)) { return(-1); - } else { - return(cmp_dtuple_rec_with_match(dtuple, rec, + } + + return(cmp_dtuple_rec_with_match(dtuple, rec, offsets, matched_fields, matched_bytes)); - } } /***************************************************************** @@ -358,6 +394,45 @@ page_dir_get_n_slots( { return(page_header_get_field(page, PAGE_N_DIR_SLOTS)); } +/***************************************************************** +Sets the number of dir slots in directory. 
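
The directory slot accessors further on (page_dir_slot_set_rec() and friends) now compute the stored value with ut_align_offset(); each slot is simply the 2-byte page offset of its owning record, written most significant byte first. A standalone sketch of that store/load (stand-ins for mach_write_to_2()/mach_read_from_2(), assuming their usual big-endian behaviour):

#include <stdint.h>

static void
write_2_be(unsigned char* b, uint16_t val)	/* store big-endian */
{
	b[0] = (unsigned char) (val >> 8);
	b[1] = (unsigned char) (val & 0xff);
}

static uint16_t
read_2_be(const unsigned char* b)		/* load big-endian */
{
	return((uint16_t) (((unsigned) b[0] << 8) | b[1]));
}
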
*/ +UNIV_INLINE +void +page_dir_set_n_slots( +/*=================*/ + /* out: number of slots */ + page_t* page, /* in: index page */ + ulint n_slots)/* in: number of slots */ +{ + page_header_set_field(page, PAGE_N_DIR_SLOTS, n_slots); +} + +/***************************************************************** +Gets the number of records in the heap. */ +UNIV_INLINE +ulint +page_dir_get_n_heap( +/*================*/ + /* out: number of user records */ + page_t* page) /* in: index page */ +{ + return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff); +} + +/***************************************************************** +Sets the number of records in the heap. */ +UNIV_INLINE +void +page_dir_set_n_heap( +/*================*/ + page_t* page, /* in: index page */ + ulint n_heap) /* in: number of records */ +{ + ut_ad(n_heap < 0x8000); + + page_header_set_field(page, PAGE_N_HEAP, n_heap | (0x8000 & + page_header_get_field(page, PAGE_N_HEAP))); +} /***************************************************************** Gets pointer to nth directory slot. */ @@ -369,7 +444,7 @@ page_dir_get_nth_slot( page_t* page, /* in: index page */ ulint n) /* in: position */ { - ut_ad(page_header_get_field(page, PAGE_N_DIR_SLOTS) > n); + ut_ad(page_dir_get_n_slots(page) > n); return(page + UNIV_PAGE_SIZE - PAGE_DIR - (n + 1) * PAGE_DIR_SLOT_SIZE); @@ -419,7 +494,7 @@ page_dir_slot_set_rec( { ut_ad(page_rec_check(rec)); - mach_write_to_2(slot, rec - buf_frame_align(rec)); + mach_write_to_2(slot, ut_align_offset(rec, UNIV_PAGE_SIZE)); } /******************************************************************* @@ -431,7 +506,8 @@ page_dir_slot_get_n_owned( /* out: number of records */ page_dir_slot_t* slot) /* in: page directory slot */ { - return(rec_get_n_owned(page_dir_slot_get_rec(slot))); + rec_t* rec = page_dir_slot_get_rec(slot); + return(rec_get_n_owned(rec, page_rec_is_comp(rec))); } /******************************************************************* @@ -444,7 +520,8 @@ page_dir_slot_set_n_owned( ulint n) /* in: number of records owned by the slot */ { - rec_set_n_owned(page_dir_slot_get_rec(slot), n); + rec_t* rec = page_dir_slot_get_rec(slot); + rec_set_n_owned(rec, page_rec_is_comp(rec), n); } /**************************************************************** @@ -475,26 +552,25 @@ page_rec_get_next( ut_ad(page_rec_check(rec)); - page = buf_frame_align(rec); + page = ut_align_down(rec, UNIV_PAGE_SIZE); - offs = rec_get_next_offs(rec); + offs = rec_get_next_offs(rec, page_is_comp(page)); - if (offs >= UNIV_PAGE_SIZE) { + if (UNIV_UNLIKELY(offs >= UNIV_PAGE_SIZE)) { fprintf(stderr, -"InnoDB: Next record offset is nonsensical %lu in record at offset %lu\n", - (ulong)offs, (ulong)(rec - page)); - fprintf(stderr, -"\nInnoDB: rec address %p, first buffer frame %p\n" +"InnoDB: Next record offset is nonsensical %lu in record at offset %lu\n" +"InnoDB: rec address %p, first buffer frame %p\n" "InnoDB: buffer pool high end %p, buf fix count %lu\n", + (ulong)offs, (ulong)(rec - page), rec, buf_pool->frame_zero, buf_pool->high_end, (ulong)buf_block_align(rec)->buf_fix_count); buf_page_print(page); - ut_a(0); + ut_error; } - if (offs == 0) { + if (UNIV_UNLIKELY(offs == 0)) { return(NULL); } @@ -513,21 +589,21 @@ page_rec_set_next( infimum */ { page_t* page; + ulint offs; ut_ad(page_rec_check(rec)); - ut_a((next == NULL) - || (buf_frame_align(rec) == buf_frame_align(next))); - - page = buf_frame_align(rec); - - ut_ad(rec != page_get_supremum_rec(page)); - ut_ad(next != page_get_infimum_rec(page)); + 
ut_ad(!page_rec_is_supremum(rec)); + page = ut_align_down(rec, UNIV_PAGE_SIZE); - if (next == NULL) { - rec_set_next_offs(rec, 0); + if (next) { + ut_ad(!page_rec_is_infimum(next)); + ut_ad(page == ut_align_down(next, UNIV_PAGE_SIZE)); + offs = (ulint) (next - page); } else { - rec_set_next_offs(rec, (ulint)(next - page)); + offs = 0; } + + rec_set_next_offs(rec, page_is_comp(page), offs); } /**************************************************************** @@ -548,9 +624,9 @@ page_rec_get_prev( ut_ad(page_rec_check(rec)); - page = buf_frame_align(rec); + page = ut_align_down(rec, UNIV_PAGE_SIZE); - ut_ad(rec != page_get_infimum_rec(page)); + ut_ad(!page_rec_is_infimum(rec)); slot_no = page_dir_find_owner_slot(rec); @@ -581,8 +657,14 @@ page_rec_find_owner_rec( { ut_ad(page_rec_check(rec)); - while (rec_get_n_owned(rec) == 0) { - rec = page_rec_get_next(rec); + if (page_rec_is_comp(rec)) { + while (rec_get_n_owned(rec, TRUE) == 0) { + rec = page_rec_get_next(rec); + } + } else { + while (rec_get_n_owned(rec, FALSE) == 0) { + rec = page_rec_get_next(rec); + } } return(rec); @@ -601,7 +683,9 @@ page_get_data_size( ulint ret; ret = (ulint)(page_header_get_field(page, PAGE_HEAP_TOP) - - PAGE_SUPREMUM_END + - (page_is_comp(page) + ? PAGE_NEW_SUPREMUM_END + : PAGE_OLD_SUPREMUM_END) - page_header_get_field(page, PAGE_GARBAGE)); ut_ad(ret < UNIV_PAGE_SIZE); @@ -613,12 +697,20 @@ page_get_data_size( Calculates free space if a page is emptied. */ UNIV_INLINE ulint -page_get_free_space_of_empty(void) -/*==============================*/ +page_get_free_space_of_empty( +/*=========================*/ /* out: free space */ + ulint comp) /* in: nonzero=compact page layout */ { + if (UNIV_LIKELY(comp)) { + return((ulint)(UNIV_PAGE_SIZE + - PAGE_NEW_SUPREMUM_END + - PAGE_DIR + - 2 * PAGE_DIR_SLOT_SIZE)); + } + return((ulint)(UNIV_PAGE_SIZE - - PAGE_SUPREMUM_END + - PAGE_OLD_SUPREMUM_END - PAGE_DIR - 2 * PAGE_DIR_SLOT_SIZE)); } @@ -641,13 +733,20 @@ page_get_max_insert_size( ulint occupied; ulint free_space; - occupied = page_header_get_field(page, PAGE_HEAP_TOP) - - PAGE_SUPREMUM_END - + page_dir_calc_reserved_space( - n_recs + (page_header_get_field(page, PAGE_N_HEAP) - 2)); + if (page_is_comp(page)) { + occupied = page_header_get_field(page, PAGE_HEAP_TOP) + - PAGE_NEW_SUPREMUM_END + page_dir_calc_reserved_space( + n_recs + page_dir_get_n_heap(page) - 2); + + free_space = page_get_free_space_of_empty(TRUE); + } else { + occupied = page_header_get_field(page, PAGE_HEAP_TOP) + - PAGE_OLD_SUPREMUM_END + page_dir_calc_reserved_space( + n_recs + page_dir_get_n_heap(page) - 2); + + free_space = page_get_free_space_of_empty(FALSE); + } - free_space = page_get_free_space_of_empty(); - /* Above the 'n_recs +' part reserves directory space for the new inserted records; the '- 2' excludes page infimum and supremum records */ @@ -673,11 +772,11 @@ page_get_max_insert_size_after_reorganize( { ulint occupied; ulint free_space; - + occupied = page_get_data_size(page) + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page)); - free_space = page_get_free_space_of_empty(); + free_space = page_get_free_space_of_empty(page_is_comp(page)); if (occupied > free_space) { @@ -693,21 +792,34 @@ UNIV_INLINE void page_mem_free( /*==========*/ - page_t* page, /* in: index page */ - rec_t* rec) /* in: pointer to the (origin of) record */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: pointer to the (origin of) record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - rec_t* free; - ulint 
garbage; + rec_t* free; + ulint garbage; + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_comp(offsets) == !page_rec_is_comp(rec)); free = page_header_get_ptr(page, PAGE_FREE); page_rec_set_next(rec, free); page_header_set_ptr(page, PAGE_FREE, rec); +#if 0 /* It's better not to destroy the user's data. */ + + /* Clear the data bytes of the deleted record in order to improve + the compression ratio of the page and to make it easier to read + page dumps in corruption reports. The extra bytes of the record + cannot be cleared, because page_mem_alloc() needs them in order + to determine the size of the deleted record. */ + memset(rec, 0, rec_offs_data_size(offsets)); +#endif + garbage = page_header_get_field(page, PAGE_GARBAGE); page_header_set_field(page, PAGE_GARBAGE, - garbage + rec_get_size(rec)); + garbage + rec_offs_size(offsets)); } #ifdef UNIV_MATERIALIZE diff --git a/innobase/include/que0que.h b/innobase/include/que0que.h index e1874edcaf2..4113e52d425 100644 --- a/innobase/include/que0que.h +++ b/innobase/include/que0que.h @@ -359,6 +359,8 @@ struct que_thr_struct{ the control came */ ulint resource; /* resource usage of the query thread thus far */ + ulint lock_state; /* lock state of thread (table or + row) */ }; #define QUE_THR_MAGIC_N 8476583 @@ -482,6 +484,11 @@ struct que_fork_struct{ #define QUE_THR_SUSPENDED 7 #define QUE_THR_ERROR 8 +/* Query thread lock states */ +#define QUE_THR_LOCK_NOLOCK 0 +#define QUE_THR_LOCK_ROW 1 +#define QUE_THR_LOCK_TABLE 2 + /* From where the cursor position is counted */ #define QUE_CUR_NOT_DEFINED 1 #define QUE_CUR_START 2 diff --git a/innobase/include/read0read.h b/innobase/include/read0read.h index db6bf888095..1a7a86470a8 100644 --- a/innobase/include/read0read.h +++ b/innobase/include/read0read.h @@ -69,6 +69,35 @@ read_view_print( /*============*/ read_view_t* view); /* in: read view */ +/************************************************************************* +Create a consistent cursor view for mysql to be used in cursors. In this +consistent read view modifications done by the creating transaction or future +transactions are not visible. */ + +cursor_view_t* +read_cursor_view_create_for_mysql( +/*==============================*/ + trx_t* cr_trx);/* in: trx where cursor view is created */ + +/************************************************************************* +Close a given consistent cursor view for and restore global read view +back to a transaction. */ + +void +read_cursor_view_close_for_mysql( +/*=============================*/ + trx_t* trx, /* in: trx */ + cursor_view_t* curview); /* in: cursor view to be closed */ +/************************************************************************* +This function sets a given consistent cursor view to a transaction +read view if given consistent cursor view is not null. Otherwice, function +restores a global read view to a transaction read view. */ + +void +read_cursor_set_for_mysql( +/*======================*/ + trx_t* trx, /* in: transaction where cursor is set */ + cursor_view_t* curview);/* in: consistent cursor view to be set */ /* Read view lists the trx ids of those transactions for which a consistent read should not see the modifications to the database. */ @@ -100,6 +129,17 @@ struct read_view_struct{ /* List of read views in trx_sys */ }; +/* Implement InnoDB framework to support consistent read views in +cursors. This struct holds both heap where consistent read view +is allocated and pointer to a read view. 
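
The cursor view is meant to be driven from the MySQL layer: created when a server-side cursor is opened with read_cursor_view_create_for_mysql(), installed around each fetch, and released with read_cursor_view_close_for_mysql(). A hedged sketch of the per-fetch part (not code from this changeset; the trx life cycle and error handling are assumed to be handled elsewhere):

static void
cursor_fetch_example(trx_t* trx, cursor_view_t* curview)
{
	/* Make the reads see the data as of cursor-open time; changes
	made since then by this or any other transaction stay invisible. */
	read_cursor_set_for_mysql(trx, curview);

	/* ... fetch rows through the normal consistent-read path ... */

	/* Passing NULL restores the transaction's own read view. */
	read_cursor_set_for_mysql(trx, NULL);
}
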
*/ + +struct cursor_view_struct{ + mem_heap_t* heap; + /* Memory heap for the cursor view */ + read_view_t* read_view; + /* Consistent read view of the cursor*/ +}; + #ifndef UNIV_NONINL #include "read0read.ic" #endif diff --git a/innobase/include/read0read.ic b/innobase/include/read0read.ic index 03d84ee0c51..ec9ef5814bb 100644 --- a/innobase/include/read0read.ic +++ b/innobase/include/read0read.ic @@ -71,13 +71,8 @@ read_view_sees_trx_id( cmp = ut_dulint_cmp(trx_id, read_view_get_nth_trx_id(view, n_ids - i - 1)); - if (0 == cmp) { - - return(FALSE); - - } else if (cmp < 0) { - - return(TRUE); + if (cmp <= 0) { + return(cmp < 0); } } diff --git a/innobase/include/read0types.h b/innobase/include/read0types.h index 5eb3e533f89..7d42728523e 100644 --- a/innobase/include/read0types.h +++ b/innobase/include/read0types.h @@ -10,5 +10,6 @@ Created 2/16/1997 Heikki Tuuri #define read0types_h typedef struct read_view_struct read_view_t; +typedef struct cursor_view_struct cursor_view_t; #endif diff --git a/innobase/include/rem0cmp.h b/innobase/include/rem0cmp.h index 712e263350e..1b1ee26b809 100644 --- a/innobase/include/rem0cmp.h +++ b/innobase/include/rem0cmp.h @@ -90,6 +90,7 @@ cmp_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns, contains the value for current comparison */ @@ -107,7 +108,8 @@ cmp_dtuple_rec( less than rec, respectively; see the comments for cmp_dtuple_rec_with_match */ dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /****************************************************************** Checks if a dtuple is a prefix of a record. The last field in dtuple is allowed to be a prefix of the corresponding field in the record. */ @@ -116,23 +118,9 @@ ibool cmp_dtuple_is_prefix_of_rec( /*========================*/ /* out: TRUE if prefix */ - dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec); /* in: physical record */ -/****************************************************************** -Compares a prefix of a data tuple to a prefix of a physical record for -equality. If there are less fields in rec than parameter n_fields, FALSE -is returned. NOTE that n_fields_cmp of dtuple does not affect this -comparison. */ - -ibool -cmp_dtuple_rec_prefix_equal( -/*========================*/ - /* out: TRUE if equal */ dtuple_t* dtuple, /* in: data tuple */ rec_t* rec, /* in: physical record */ - ulint n_fields); /* in: number of fields which should be - compared; must not exceed the number of - fields in dtuple */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /***************************************************************** This function is used to compare two physical records. 
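
The rewritten branch in read_view_sees_trx_id() folds the separate equal and less-than tests into one comparison. The equivalence can be checked exhaustively over the three possible signs of the comparator result; a throwaway test, not part of the patch:

#include <assert.h>

/* Old form of the loop body: 0 = FALSE, 1 = TRUE, *done tells
whether the function would have returned at this iteration. */
static int
sees_old(int cmp, int* done)
{
	*done = 1;
	if (0 == cmp) {
		return(0);
	} else if (cmp < 0) {
		return(1);
	}
	*done = 0;
	return(0);
}

/* New form of the loop body. */
static int
sees_new(int cmp, int* done)
{
	*done = 1;
	if (cmp <= 0) {
		return(cmp < 0);
	}
	*done = 0;
	return(0);
}

int
main(void)
{
	int	cmp;
	int	d1;
	int	d2;

	for (cmp = -1; cmp <= 1; cmp++) {
		assert(sees_old(cmp, &d1) == sees_new(cmp, &d2));
		assert(d1 == d2);
	}
	return(0);
}
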
Only the common first fields are compared, and if an externally stored field is @@ -146,6 +134,8 @@ cmp_rec_rec_with_match( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index, /* in: data dictionary index */ ulint* matched_fields, /* in/out: number of already completely matched fields; when the function returns, @@ -167,6 +157,8 @@ cmp_rec_rec( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index); /* in: data dictionary index */ diff --git a/innobase/include/rem0cmp.ic b/innobase/include/rem0cmp.ic index 75cb3ef04e8..b86534e0a6a 100644 --- a/innobase/include/rem0cmp.ic +++ b/innobase/include/rem0cmp.ic @@ -57,10 +57,13 @@ cmp_rec_rec( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index) /* in: data dictionary index */ { ulint match_f = 0; ulint match_b = 0; - return(cmp_rec_rec_with_match(rec1, rec2, index, &match_f, &match_b)); + return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index, + &match_f, &match_b)); } diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h index 86bf263170f..1d15b8d1c77 100644 --- a/innobase/include/rem0rec.h +++ b/innobase/include/rem0rec.h @@ -23,9 +23,23 @@ Created 5/30/1994 Heikki Tuuri info bits of a record */ #define REC_INFO_MIN_REC_FLAG 0x10UL -/* Number of extra bytes in a record, in addition to the data and the -offsets */ -#define REC_N_EXTRA_BYTES 6 +/* Number of extra bytes in an old-style record, +in addition to the data and the offsets */ +#define REC_N_OLD_EXTRA_BYTES 6 +/* Number of extra bytes in a new-style record, +in addition to the data and the offsets */ +#define REC_N_NEW_EXTRA_BYTES 5 + +/* Record status values */ +#define REC_STATUS_ORDINARY 0 +#define REC_STATUS_NODE_PTR 1 +#define REC_STATUS_INFIMUM 2 +#define REC_STATUS_SUPREMUM 3 + +/* Number of elements that should be initially allocated for the +offsets[] array, first passed to rec_get_offsets() */ +#define REC_OFFS_NORMAL_SIZE 100 +#define REC_OFFS_SMALL_SIZE 10 /********************************************************** The following function is used to get the offset of the @@ -36,7 +50,8 @@ rec_get_next_offs( /*==============*/ /* out: the page offset of the next chained record */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ /********************************************************** The following function is used to set the next record offset field of the record. */ @@ -45,17 +60,28 @@ void rec_set_next_offs( /*==============*/ rec_t* rec, /* in: physical record */ + ulint comp, /* in: nonzero=compact page format */ ulint next); /* in: offset of the next record */ /********************************************************** The following function is used to get the number of fields -in the record. */ +in an old-style record. 
*/ UNIV_INLINE ulint -rec_get_n_fields( -/*=============*/ +rec_get_n_fields_old( +/*=================*/ /* out: number of data fields */ rec_t* rec); /* in: physical record */ /********************************************************** +The following function is used to get the number of fields +in a record. */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + /* out: number of data fields */ + rec_t* rec, /* in: physical record */ + dict_index_t* index); /* in: record descriptor */ +/********************************************************** The following function is used to get the number of records owned by the previous directory record. */ UNIV_INLINE @@ -63,7 +89,8 @@ ulint rec_get_n_owned( /*============*/ /* out: number of owned records */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ /********************************************************** The following function is used to set the number of owned records. */ @@ -72,6 +99,7 @@ void rec_set_n_owned( /*============*/ rec_t* rec, /* in: physical record */ + ulint comp, /* in: nonzero=compact page format */ ulint n_owned); /* in: the number of owned */ /********************************************************** The following function is used to retrieve the info bits of @@ -81,7 +109,8 @@ ulint rec_get_info_bits( /*==============*/ /* out: info bits */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ /********************************************************** The following function is used to set the info bits of a record. */ UNIV_INLINE @@ -89,23 +118,56 @@ void rec_set_info_bits( /*==============*/ rec_t* rec, /* in: physical record */ + ulint comp, /* in: nonzero=compact page format */ ulint bits); /* in: info bits */ /********************************************************** -Gets the value of the deleted falg in info bits. */ +The following function retrieves the status bits of a new-style record. */ UNIV_INLINE -ibool -rec_info_bits_get_deleted_flag( -/*===========================*/ - /* out: TRUE if deleted flag set */ - ulint info_bits); /* in: info bits from a record */ +ulint +rec_get_status( +/*===========*/ + /* out: status bits */ + rec_t* rec); /* in: physical record */ + +/********************************************************** +The following function is used to set the status bits of a new-style record. */ +UNIV_INLINE +void +rec_set_status( +/*===========*/ + rec_t* rec, /* in: physical record */ + ulint bits); /* in: info bits */ + +/********************************************************** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +ulint +rec_get_info_and_status_bits( +/*=========================*/ + /* out: info bits */ + rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ +/********************************************************** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /* in: physical record */ + ulint comp, /* in: nonzero=compact page format */ + ulint bits); /* in: info bits */ + /********************************************************** The following function tells if record is delete marked. 
*/ UNIV_INLINE -ibool +ulint rec_get_deleted_flag( /*=================*/ - /* out: TRUE if delete marked */ - rec_t* rec); /* in: physical record */ + /* out: nonzero if delete marked */ + rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ /********************************************************** The following function is used to set the deleted bit. */ UNIV_INLINE @@ -113,7 +175,16 @@ void rec_set_deleted_flag( /*=================*/ rec_t* rec, /* in: physical record */ - ibool flag); /* in: TRUE if delete marked */ + ulint comp, /* in: nonzero=compact page format */ + ulint flag); /* in: nonzero if delete marked */ +/********************************************************** +The following function tells if a new-style record is a node pointer. */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*=================*/ + /* out: TRUE if node pointer */ + rec_t* rec); /* in: physical record */ /********************************************************** The following function is used to get the order number of the record in the heap of the index page. */ @@ -122,7 +193,8 @@ ulint rec_get_heap_no( /*=============*/ /* out: heap order number */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ /********************************************************** The following function is used to set the heap number field in the record. */ @@ -131,6 +203,7 @@ void rec_set_heap_no( /*=============*/ rec_t* rec, /* in: physical record */ + ulint comp, /* in: nonzero=compact page format */ ulint heap_no);/* in: the heap number */ /********************************************************** The following function is used to test whether the data offsets @@ -141,31 +214,65 @@ rec_get_1byte_offs_flag( /*====================*/ /* out: TRUE if 1-byte form */ rec_t* rec); /* in: physical record */ +/********************************************************** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. */ + +ulint* +rec_get_offsets_func( +/*=================*/ + /* out: the new offsets */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in: array consisting of offsets[0] + allocated elements, or an array from + rec_get_offsets(), or NULL */ + ulint n_fields,/* in: maximum number of initialized fields + (ULINT_UNDEFINED if all fields) */ + mem_heap_t** heap, /* in/out: memory heap */ + const char* file, /* in: file name where called */ + ulint line); /* in: line number where called */ + +#define rec_get_offsets(rec,index,offsets,n,heap) \ + rec_get_offsets_func(rec,index,offsets,n,heap,__FILE__,__LINE__) + +/**************************************************************** +Validates offsets returned by rec_get_offsets(). */ +UNIV_INLINE +ibool +rec_offs_validate( +/*==============*/ + /* out: TRUE if valid */ + rec_t* rec, /* in: record or NULL */ + dict_index_t* index, /* in: record descriptor or NULL */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/**************************************************************** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. 
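
The parameter comments above imply the following calling convention for rec_get_offsets(): the caller hands in a small on-stack array whose element 0 holds the number of allocated elements, and a heap pointer that stays NULL unless the record needs a larger array. A sketch of that pattern, assuming this convention (rec and index come from the caller):

static void
offsets_usage_example(rec_t* rec, dict_index_t* index)
{
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;

	/* Element 0 tells rec_get_offsets() how many elements
	the caller allocated. */
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;

	offsets = rec_get_offsets(rec, index, offsets,
				  ULINT_UNDEFINED, &heap);

	/* ... use the offsets-based accessors on rec ... */

	/* A heap was allocated only if the stack array was too small. */
	if (heap) {
		mem_heap_free(heap);
	}
}
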
*/ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + rec_t* rec, /* in: record */ + dict_index_t* index,/* in: record descriptor */ + ulint* offsets);/* in: array returned by rec_get_offsets() */ + /**************************************************************** The following function is used to get a pointer to the nth -data field in the record. */ +data field in an old-style record. */ byte* -rec_get_nth_field( -/*==============*/ +rec_get_nth_field_old( +/*==================*/ /* out: pointer to the field */ rec_t* rec, /* in: record */ ulint n, /* in: index of the field */ ulint* len); /* out: length of the field; UNIV_SQL_NULL if SQL null */ /**************************************************************** -Return field length or UNIV_SQL_NULL. */ -UNIV_INLINE -ulint -rec_get_nth_field_len( -/*==================*/ - /* out: length of the field; UNIV_SQL_NULL if SQL - null */ - rec_t* rec, /* in: record */ - ulint n); /* in: index of the field */ -/**************************************************************** -Gets the physical size of a field. Also an SQL null may have a field of -size > 0, if the data type is of a fixed size. */ +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. */ UNIV_INLINE ulint rec_get_nth_field_size( @@ -173,131 +280,176 @@ rec_get_nth_field_size( /* out: field size in bytes */ rec_t* rec, /* in: record */ ulint n); /* in: index of the field */ -/*************************************************************** -Gets the value of the ith field extern storage bit. If it is TRUE -it means that the field is stored on another page. */ +/**************************************************************** +The following function is used to get a pointer to the nth +data field in a record. */ UNIV_INLINE -ibool -rec_get_nth_field_extern_bit( -/*=========================*/ - /* in: TRUE or FALSE */ - rec_t* rec, /* in: record */ - ulint i); /* in: ith field */ +byte* +rec_get_nth_field( +/*==============*/ + /* out: pointer to the field */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index of the field */ + ulint* len); /* out: length of the field; UNIV_SQL_NULL + if SQL null */ +/********************************************************** +Determine if the offsets are for a record in the new +compact format. */ +UNIV_INLINE +ulint +rec_offs_comp( +/*==========*/ + /* out: nonzero if compact format */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/********************************************************** +Returns nonzero if the extern bit is set in nth field of rec. */ +UNIV_INLINE +ulint +rec_offs_nth_extern( +/*================*/ + /* out: nonzero if externally stored */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n); /* in: nth field */ +/********************************************************** +Gets the physical size of a field. */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + /* out: length of field */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n); /* in: nth field */ + /********************************************************** Returns TRUE if the extern bit is set in any of the fields of rec. 
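
With the descriptor-based accessors the record itself no longer answers the NULL or externally-stored questions; the offsets array does. A sketch of reading one field (rec and offsets are assumed to come from rec_get_offsets(rec, index, ...)):

static void
read_field_example(rec_t* rec, const ulint* offsets, ulint n)
{
	ulint	len;
	byte*	data = rec_get_nth_field(rec, offsets, n, &len);

	if (len == UNIV_SQL_NULL) {
		/* SQL NULL: there is no data to read. */
	} else if (rec_offs_nth_extern(offsets, n)) {
		/* Only a locally stored prefix of this externally
		stored (BLOB) column lives on the page. */
	} else {
		/* len bytes of inline column data start at 'data'. */
		(void) data;
	}
}
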
*/ UNIV_INLINE ibool -rec_contains_externally_stored_field( -/*=================================*/ - /* out: TRUE if a field is stored externally */ - rec_t* rec); /* in: record */ +rec_offs_any_extern( +/*================*/ + /* out: TRUE if a field is stored externally */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /*************************************************************** Sets the value of the ith field extern storage bit. */ - +UNIV_INLINE void rec_set_nth_field_extern_bit( /*=========================*/ - rec_t* rec, /* in: record */ - ulint i, /* in: ith field */ - ibool val, /* in: value to set */ - mtr_t* mtr); /* in: mtr holding an X-latch to the page where - rec is, or NULL; in the NULL case we do not - write to log about the change */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint i, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ /*************************************************************** Sets TRUE the extern storage bits of fields mentioned in an array. */ void rec_set_field_extern_bits( /*======================*/ - rec_t* rec, /* in: record */ - ulint* vec, /* in: array of field numbers */ - ulint n_fields, /* in: number of fields numbers */ - mtr_t* mtr); /* in: mtr holding an X-latch to the page - where rec is, or NULL; in the NULL case we - do not write to log about the change */ -/**************************************************************** -The following function is used to get a copy of the nth -data field in the record to a buffer. */ -UNIV_INLINE -void -rec_copy_nth_field( -/*===============*/ - void* buf, /* in: pointer to the buffer */ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - ulint* len); /* out: length of the field; UNIV_SQL_NULL if SQL - null */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + const ulint* vec, /* in: array of field numbers */ + ulint n_fields,/* in: number of fields numbers */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ /*************************************************************** -This is used to modify the value of an already existing field in -a physical record. The previous value must have exactly the same -size as the new value. If len is UNIV_SQL_NULL then the field is -treated as SQL null. */ +This is used to modify the value of an already existing field in a record. +The previous value must have exactly the same size as the new value. If len +is UNIV_SQL_NULL then the field is treated as an SQL null for old-style +records. For new-style records, len must not be UNIV_SQL_NULL. */ UNIV_INLINE void rec_set_nth_field( /*==============*/ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - void* data, /* in: pointer to the data if not SQL null */ - ulint len); /* in: length of the data or UNIV_SQL_NULL. - If not SQL null, must have the same length as the - previous value. If SQL null, previous value must be - SQL null. */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index number of the field */ + const void* data, /* in: pointer to the data if not SQL null */ + ulint len); /* in: length of the data or UNIV_SQL_NULL. 
+ If not SQL null, must have the same + length as the previous value. + If SQL null, previous value must be + SQL null. */ /************************************************************** -The following function returns the data size of a physical +The following function returns the data size of an old-style physical record, that is the sum of field lengths. SQL null fields are counted as length 0 fields. The value returned by the function is the distance from record origin to record end in bytes. */ UNIV_INLINE ulint -rec_get_data_size( -/*==============*/ - /* out: size */ +rec_get_data_size_old( +/*==================*/ + /* out: size */ rec_t* rec); /* in: physical record */ /************************************************************** +The following function returns the number of fields in a record. */ +UNIV_INLINE +ulint +rec_offs_n_fields( +/*===============*/ + /* out: number of fields */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** Returns the total size of record minus data size of record. The value returned by the function is the distance from record start to record origin in bytes. */ UNIV_INLINE ulint -rec_get_extra_size( -/*===============*/ - /* out: size */ - rec_t* rec); /* in: physical record */ -/************************************************************** +rec_offs_extra_size( +/*================*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** Returns the total size of a physical record. */ UNIV_INLINE ulint -rec_get_size( -/*=========*/ - /* out: size */ - rec_t* rec); /* in: physical record */ +rec_offs_size( +/*==========*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /************************************************************** Returns a pointer to the start of the record. */ UNIV_INLINE byte* rec_get_start( /*==========*/ - /* out: pointer to start */ - rec_t* rec); /* in: pointer to record */ + /* out: pointer to start */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /************************************************************** Returns a pointer to the end of the record. */ UNIV_INLINE byte* rec_get_end( /*========*/ - /* out: pointer to end */ - rec_t* rec); /* in: pointer to record */ + /* out: pointer to end */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /******************************************************************* Copies a physical record to a buffer. 
*/ UNIV_INLINE rec_t* rec_copy( /*=====*/ - /* out: pointer to the origin of the copied record */ - void* buf, /* in: buffer */ - rec_t* rec); /* in: physical record */ + /* out: pointer to the origin of the copy */ + void* buf, /* in: buffer */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /****************************************************************** Copies the first n fields of a physical record to a new physical record in a buffer. */ @@ -305,49 +457,43 @@ a buffer. */ rec_t* rec_copy_prefix_to_buf( /*===================*/ - /* out, own: copied record */ - rec_t* rec, /* in: physical record */ - ulint n_fields, /* in: number of fields to copy */ - byte** buf, /* in/out: memory buffer for the copied prefix, - or NULL */ - ulint* buf_size); /* in/out: buffer size */ + /* out, own: copied record */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n_fields, /* in: number of fields to copy */ + byte** buf, /* in/out: memory buffer + for the copied prefix, or NULL */ + ulint* buf_size); /* in/out: buffer size */ /**************************************************************** Folds a prefix of a physical record to a ulint. */ UNIV_INLINE ulint rec_fold( /*=====*/ - /* out: the folded value */ - rec_t* rec, /* in: the physical record */ - ulint n_fields, /* in: number of complete fields to fold */ - ulint n_bytes, /* in: number of bytes to fold in an - incomplete last field */ - dulint tree_id); /* in: index tree id */ + /* out: the folded value */ + rec_t* rec, /* in: the physical record */ + const ulint* offsets, /* in: array returned by + rec_get_offsets() */ + ulint n_fields, /* in: number of complete + fields to fold */ + ulint n_bytes, /* in: number of bytes to fold + in an incomplete last field */ + dulint tree_id); /* in: index tree id */ /************************************************************* Builds a physical record out of a data tuple and stores it beginning from address destination. */ -UNIV_INLINE + rec_t* rec_convert_dtuple_to_rec( /*======================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple); /* in: data tuple */ -/************************************************************* -Builds a physical record out of a data tuple and stores it beginning from -address destination. */ - -rec_t* -rec_convert_dtuple_to_rec_low( -/*==========================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple, /* in: data tuple */ - ulint data_size); /* in: data size of dtuple */ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of the + physical record */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple);/* in: data tuple */ /************************************************************** -Returns the extra size of a physical record if we know its +Returns the extra size of an old-style physical record if we know its data size and number of fields. 
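
rec_offs_size() is rec_offs_extra_size() plus rec_offs_data_size(), so a record can be copied into a flat buffer without re-parsing it, and rec_copy() hands back the origin of the copy, which sits rec_offs_extra_size() bytes into that buffer. A sketch of the pairing (mem_alloc() stands in for whatever allocation the caller prefers):

static rec_t*
copy_rec_example(const rec_t* rec, const ulint* offsets)
{
	byte*	buf = mem_alloc(rec_offs_size(offsets));

	/* The returned pointer equals buf + rec_offs_extra_size(offsets). */
	return(rec_copy(buf, rec, offsets));
}
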
*/ UNIV_INLINE ulint @@ -355,7 +501,8 @@ rec_get_converted_extra_size( /*=========================*/ /* out: extra size */ ulint data_size, /* in: data size */ - ulint n_fields); /* in: number of fields */ + ulint n_fields) /* in: number of fields */ + __attribute__((const)); /************************************************************** The following function returns the size of a data tuple when converted to a physical record. */ @@ -364,6 +511,7 @@ ulint rec_get_converted_size( /*===================*/ /* out: size */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* dtuple);/* in: data tuple */ /****************************************************************** Copies the first n fields of a physical record to a data tuple. @@ -374,6 +522,7 @@ rec_copy_prefix_to_dtuple( /*======================*/ dtuple_t* tuple, /* in: data tuple */ rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ ulint n_fields, /* in: number of fields to copy */ mem_heap_t* heap); /* in: memory heap */ /******************************************************************* @@ -382,16 +531,35 @@ Validates the consistency of a physical record. */ ibool rec_validate( /*=========*/ - /* out: TRUE if ok */ - rec_t* rec); /* in: physical record */ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/******************************************************************* +Prints an old-style physical record. */ + +void +rec_print_old( +/*==========*/ + FILE* file, /* in: file where to print */ + rec_t* rec); /* in: physical record */ +/******************************************************************* +Prints a physical record. */ + +void +rec_print_new( +/*==========*/ + FILE* file, /* in: file where to print */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /******************************************************************* Prints a physical record. */ void rec_print( /*======*/ - FILE* file, /* in: file where to print */ - rec_t* rec); /* in: physical record */ + FILE* file, /* in: file where to print */ + rec_t* rec, /* in: physical record */ + dict_index_t* index); /* in: record descriptor */ #define REC_INFO_BITS 6 /* This is single byte bit-field */ diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic index c36bf8f6d6e..e2dceb6bae5 100644 --- a/innobase/include/rem0rec.ic +++ b/innobase/include/rem0rec.ic @@ -8,9 +8,19 @@ Created 5/30/1994 Heikki Tuuri #include "mach0data.h" #include "ut0byte.h" +#include "dict0dict.h" -/* Offsets of the bit-fields in the record. NOTE! In the table the most -significant bytes and bits are written below less significant. +/* Compact flag ORed to the extra size returned by rec_get_offsets() */ +#define REC_OFFS_COMPACT ((ulint) 1 << 31) +/* SQL NULL flag in offsets returned by rec_get_offsets() */ +#define REC_OFFS_SQL_NULL ((ulint) 1 << 31) +/* External flag in offsets returned by rec_get_offsets() */ +#define REC_OFFS_EXTERNAL ((ulint) 1 << 30) +/* Mask for offsets returned by rec_get_offsets() */ +#define REC_OFFS_MASK (REC_OFFS_EXTERNAL - 1) + +/* Offsets of the bit-fields in an old-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. (1) byte offset (2) bit usage within byte downward from @@ -25,6 +35,35 @@ significant bytes and bits are written below less significant. 
4 bits info bits */ +/* Offsets of the bit-fields in a new-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. + + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits relative offset of next record + 2 8 bits relative offset of next record + the relative offset is an unsigned 16-bit + integer: + (offset_of_next_record + - offset_of_this_record) mod 64Ki, + where mod is the modulo as a non-negative + number; + we can calculate the offset of the next + record with the formula: + relative_offset + offset_of_this_record + mod UNIV_PAGE_SIZE + 3 3 bits status: + 000=conventional record + 001=node pointer record (inside B-tree) + 010=infimum record + 011=supremum record + 1xx=reserved + 5 bits heap number + 4 8 bits heap number + 5 4 bits n_owned + 4 bits info bits +*/ + /* We list the byte offsets from the origin of the record, the mask, and the shift needed to obtain each bit-field of the record. */ @@ -32,22 +71,30 @@ and the shift needed to obtain each bit-field of the record. */ #define REC_NEXT_MASK 0xFFFFUL #define REC_NEXT_SHIFT 0 -#define REC_SHORT 3 /* This is single byte bit-field */ -#define REC_SHORT_MASK 0x1UL -#define REC_SHORT_SHIFT 0 +#define REC_OLD_SHORT 3 /* This is single byte bit-field */ +#define REC_OLD_SHORT_MASK 0x1UL +#define REC_OLD_SHORT_SHIFT 0 + +#define REC_OLD_N_FIELDS 4 +#define REC_OLD_N_FIELDS_MASK 0x7FEUL +#define REC_OLD_N_FIELDS_SHIFT 1 -#define REC_N_FIELDS 4 -#define REC_N_FIELDS_MASK 0x7FEUL -#define REC_N_FIELDS_SHIFT 1 +#define REC_NEW_STATUS 3 /* This is single byte bit-field */ +#define REC_NEW_STATUS_MASK 0x7UL +#define REC_NEW_STATUS_SHIFT 0 -#define REC_HEAP_NO 5 +#define REC_OLD_HEAP_NO 5 +#define REC_NEW_HEAP_NO 4 #define REC_HEAP_NO_MASK 0xFFF8UL #define REC_HEAP_NO_SHIFT 3 -#define REC_N_OWNED 6 /* This is single byte bit-field */ +#define REC_OLD_N_OWNED 6 /* This is single byte bit-field */ +#define REC_NEW_N_OWNED 5 /* This is single byte bit-field */ #define REC_N_OWNED_MASK 0xFUL #define REC_N_OWNED_SHIFT 0 +#define REC_OLD_INFO_BITS 6 /* This is single byte bit-field */ +#define REC_NEW_INFO_BITS 5 /* This is single byte bit-field */ #define REC_INFO_BITS_MASK 0xF0UL #define REC_INFO_BITS_SHIFT 0 @@ -65,26 +112,24 @@ a field stored to another page: */ #define REC_2BYTE_EXTERN_MASK 0x4000UL -/**************************************************************** -Return field length or UNIV_SQL_NULL. */ -UNIV_INLINE -ulint -rec_get_nth_field_len( -/*==================*/ - /* out: length of the field; UNIV_SQL_NULL if SQL - null */ - rec_t* rec, /* in: record */ - ulint n) /* in: index of the field */ -{ - ulint len; - - rec_get_nth_field(rec, n, &len); - - return(len); -} +#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \ + ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \ + ^ 0xFFFFFFFFUL +# error "sum of old-style masks != 0xFFFFFFFFUL" +#endif +#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \ + ^ 0xFFFFFFUL +# error "sum of new-style masks != 0xFFFFFFUL" +#endif /*************************************************************** -Sets the value of the ith field SQL null bit.
*/ +Sets the value of the ith field SQL null bit of an old-style record. */ void rec_set_nth_field_null_bit( @@ -93,8 +138,8 @@ rec_set_nth_field_null_bit( ulint i, /* in: ith field */ ibool val); /* in: value to set */ /*************************************************************** -Sets a record field to SQL null. The physical size of the field is not -changed. */ +Sets an old-style record field to SQL null. +The physical size of the field is not changed. */ void rec_set_nth_field_sql_null( @@ -102,6 +147,32 @@ rec_set_nth_field_sql_null( rec_t* rec, /* in: record */ ulint n); /* in: index of the field */ +/*************************************************************** +Sets the value of the ith field extern storage bit of an old-style record. */ + +void +rec_set_nth_field_extern_bit_old( +/*=============================*/ + rec_t* rec, /* in: old-style record */ + ulint i, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page where + rec is, or NULL; in the NULL case we do not + write to log about the change */ +/*************************************************************** +Sets the value of the ith field extern storage bit of a new-style record. */ + +void +rec_set_nth_field_extern_bit_new( +/*=============================*/ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint ith, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ + /********************************************************** Gets a bit field from within 1 byte. */ UNIV_INLINE @@ -131,7 +202,7 @@ rec_set_bit_field_1( ulint shift) /* in: shift right applied after masking */ { ut_ad(rec); - ut_ad(offs <= REC_N_EXTRA_BYTES); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); ut_ad(mask); ut_ad(mask <= 0xFFUL); ut_ad(((mask >> shift) << shift) == mask); @@ -171,30 +242,14 @@ rec_set_bit_field_2( ulint shift) /* in: shift right applied after masking */ { ut_ad(rec); - ut_ad(offs <= REC_N_EXTRA_BYTES); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); ut_ad(mask > 0xFFUL); ut_ad(mask <= 0xFFFFUL); ut_ad((mask >> shift) & 1); ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1))); ut_ad(((mask >> shift) << shift) == mask); ut_ad(((val << shift) & mask) == (val << shift)); -#ifdef UNIV_DEBUG - { - ulint m; - - /* The following assertion checks that the masks of currently - defined bit-fields in bytes 3-6 do not overlap. 
*/ - m = (ulint)((REC_SHORT_MASK << (8 * (REC_SHORT - 3))) - + (REC_N_FIELDS_MASK << (8 * (REC_N_FIELDS - 4))) - + (REC_HEAP_NO_MASK << (8 * (REC_HEAP_NO - 4))) - + (REC_N_OWNED_MASK << (8 * (REC_N_OWNED - 3))) - + (REC_INFO_BITS_MASK << (8 * (REC_INFO_BITS - 3)))); - if (m != ut_dbg_zero + 0xFFFFFFFFUL) { - fprintf(stderr, "Sum of masks %lx\n", m); - ut_error; - } - } -#endif + mach_write_to_2(rec - offs, (mach_read_from_2(rec - offs) & ~mask) | (val << shift)); @@ -207,18 +262,46 @@ UNIV_INLINE ulint rec_get_next_offs( /*==============*/ - /* out: the page offset of the next chained record */ - rec_t* rec) /* in: physical record */ + /* out: the page offset of the next chained record, or + 0 if none */ + rec_t* rec, /* in: physical record */ + ulint comp) /* in: nonzero=compact page format */ { - ulint ret; - - ut_ad(rec); - - ret = rec_get_bit_field_2(rec, REC_NEXT, REC_NEXT_MASK, - REC_NEXT_SHIFT); - ut_ad(ret < UNIV_PAGE_SIZE); + ulint field_value; + + ut_ad(REC_NEXT_MASK == 0xFFFFUL); + ut_ad(REC_NEXT_SHIFT == 0); + + field_value = mach_read_from_2(rec - REC_NEXT); + + if (comp) { +#if UNIV_PAGE_SIZE <= 32768 + /* Note that for 64 KiB pages, field_value can 'wrap around' + and the debug assertion is not valid */ + + /* In the following assertion, field_value is interpreted + as signed 16-bit integer in 2's complement arithmetics. + If all platforms defined int16_t in the standard headers, + the expression could be written simpler as + (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE + */ + ut_ad((field_value >= 32768 + ? field_value - 65536 + : field_value) + + ut_align_offset(rec, UNIV_PAGE_SIZE) + < UNIV_PAGE_SIZE); +#endif + if (field_value == 0) { + + return(0); + } + + return(ut_align_offset(rec + field_value, UNIV_PAGE_SIZE)); + } else { + ut_ad(field_value < UNIV_PAGE_SIZE); - return(ret); + return(field_value); + } } /********************************************************** @@ -229,21 +312,42 @@ void rec_set_next_offs( /*==============*/ rec_t* rec, /* in: physical record */ - ulint next) /* in: offset of the next record */ + ulint comp, /* in: nonzero=compact page format */ + ulint next) /* in: offset of the next record, or 0 if none */ { ut_ad(rec); ut_ad(UNIV_PAGE_SIZE > next); + ut_ad(REC_NEXT_MASK == 0xFFFFUL); + ut_ad(REC_NEXT_SHIFT == 0); + + if (comp) { + ulint field_value; + + if (next) { + /* The following two statements calculate + next - offset_of_rec mod 64Ki, where mod is the modulo + as a non-negative number */ + + field_value = (ulint)((lint)next + - (lint)ut_align_offset(rec, UNIV_PAGE_SIZE)); + field_value &= REC_NEXT_MASK; + } else { + field_value = 0; + } - rec_set_bit_field_2(rec, next, REC_NEXT, REC_NEXT_MASK, - REC_NEXT_SHIFT); + mach_write_to_2(rec - REC_NEXT, field_value); + } else { + mach_write_to_2(rec - REC_NEXT, next); + } } /********************************************************** -The following function is used to get the number of fields in the record. */ +The following function is used to get the number of fields +in an old-style record. 
*/ UNIV_INLINE ulint -rec_get_n_fields( -/*=============*/ +rec_get_n_fields_old( +/*=================*/ /* out: number of data fields */ rec_t* rec) /* in: physical record */ { @@ -251,8 +355,8 @@ rec_get_n_fields( ut_ad(rec); - ret = rec_get_bit_field_2(rec, REC_N_FIELDS, REC_N_FIELDS_MASK, - REC_N_FIELDS_SHIFT); + ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); ut_ad(ret <= REC_MAX_N_FIELDS); ut_ad(ret > 0); @@ -260,12 +364,12 @@ rec_get_n_fields( } /********************************************************** -The following function is used to set the number of fields field in the -record. */ +The following function is used to set the number of fields +in an old-style record. */ UNIV_INLINE void -rec_set_n_fields( -/*=============*/ +rec_set_n_fields_old( +/*=================*/ rec_t* rec, /* in: physical record */ ulint n_fields) /* in: the number of fields */ { @@ -273,8 +377,58 @@ rec_set_n_fields( ut_ad(n_fields <= REC_MAX_N_FIELDS); ut_ad(n_fields > 0); - rec_set_bit_field_2(rec, n_fields, REC_N_FIELDS, REC_N_FIELDS_MASK, - REC_N_FIELDS_SHIFT); + rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); +} + +/********************************************************** +The following function retrieves the status bits of a new-style record. */ +UNIV_INLINE +ulint +rec_get_status( +/*===========*/ + /* out: status bits */ + rec_t* rec) /* in: physical record */ +{ + ulint ret; + + ut_ad(rec); + + ret = rec_get_bit_field_1(rec, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); + ut_ad((ret & ~REC_NEW_STATUS_MASK) == 0); + + return(ret); +} + +/********************************************************** +The following function is used to get the number of fields +in a record. */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + /* out: number of data fields */ + rec_t* rec, /* in: physical record */ + dict_index_t* index) /* in: record descriptor */ +{ + ut_ad(rec); + ut_ad(index); + if (UNIV_UNLIKELY(!index->table->comp)) { + return(rec_get_n_fields_old(rec)); + } + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + return(dict_index_get_n_fields(index)); + case REC_STATUS_NODE_PTR: + return(dict_index_get_n_unique_in_tree(index) + 1); + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + return(1); + default: + ut_error; + return(ULINT_UNDEFINED); + } } /********************************************************** @@ -285,14 +439,16 @@ ulint rec_get_n_owned( /*============*/ /* out: number of owned records */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ulint comp) /* in: nonzero=compact page format */ { ulint ret; ut_ad(rec); - ret = rec_get_bit_field_1(rec, REC_N_OWNED, REC_N_OWNED_MASK, - REC_N_OWNED_SHIFT); + ret = rec_get_bit_field_1(rec, + comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); ut_ad(ret <= REC_MAX_N_OWNED); return(ret); @@ -305,13 +461,15 @@ void rec_set_n_owned( /*============*/ rec_t* rec, /* in: physical record */ + ulint comp, /* in: nonzero=compact page format */ ulint n_owned) /* in: the number of owned */ { ut_ad(rec); ut_ad(n_owned <= REC_MAX_N_OWNED); - rec_set_bit_field_1(rec, n_owned, REC_N_OWNED, REC_N_OWNED_MASK, - REC_N_OWNED_SHIFT); + rec_set_bit_field_1(rec, n_owned, + comp ? 
REC_NEW_N_OWNED : REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); } /********************************************************** @@ -321,14 +479,16 @@ ulint rec_get_info_bits( /*==============*/ /* out: info bits */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ulint comp) /* in: nonzero=compact page format */ { ulint ret; ut_ad(rec); - ret = rec_get_bit_field_1(rec, REC_INFO_BITS, REC_INFO_BITS_MASK, - REC_INFO_BITS_SHIFT); + ret = rec_get_bit_field_1(rec, + comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); ut_ad((ret & ~REC_INFO_BITS_MASK) == 0); return(ret); @@ -341,47 +501,99 @@ void rec_set_info_bits( /*==============*/ rec_t* rec, /* in: physical record */ + ulint comp, /* in: nonzero=compact page format */ ulint bits) /* in: info bits */ { ut_ad(rec); ut_ad((bits & ~REC_INFO_BITS_MASK) == 0); - rec_set_bit_field_1(rec, bits, REC_INFO_BITS, REC_INFO_BITS_MASK, - REC_INFO_BITS_SHIFT); + rec_set_bit_field_1(rec, bits, + comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); } /********************************************************** -Gets the value of the deleted flag in info bits. */ +The following function is used to set the status bits of a new-style record. */ UNIV_INLINE -ibool -rec_info_bits_get_deleted_flag( -/*===========================*/ - /* out: TRUE if deleted flag set */ - ulint info_bits) /* in: info bits from a record */ +void +rec_set_status( +/*===========*/ + rec_t* rec, /* in: physical record */ + ulint bits) /* in: info bits */ { - if (info_bits & REC_INFO_DELETED_FLAG) { + ut_ad(rec); + ut_ad((bits & ~REC_NEW_STATUS_MASK) == 0); - return(TRUE); - } + rec_set_bit_field_1(rec, bits, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); +} - return(FALSE); +/********************************************************** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +ulint +rec_get_info_and_status_bits( +/*=========================*/ + /* out: info bits */ + rec_t* rec, /* in: physical record */ + ulint comp) /* in: nonzero=compact page format */ +{ + ulint bits; +#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ +& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) +# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" +#endif + if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) { + bits = rec_get_info_bits(rec, TRUE) | rec_get_status(rec); + } else { + bits = rec_get_info_bits(rec, FALSE); + ut_ad(!(bits & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT))); + } + return(bits); +} +/********************************************************** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /* in: physical record */ + ulint comp, /* in: nonzero=compact page format */ + ulint bits) /* in: info bits */ +{ +#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ +& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) +# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" +#endif + if (comp) { + rec_set_status(rec, bits & REC_NEW_STATUS_MASK); + } else { + ut_ad(!(bits & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT))); + } + rec_set_info_bits(rec, comp, bits & ~REC_NEW_STATUS_MASK); } /********************************************************** The following function tells if record is delete marked. 
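(Editorial note, not part of the original source: a minimal usage sketch. The comp argument is normally derived from the page or from the table; page_is_comp(page) and index->table->comp are the two forms used elsewhere in this patch, and page is assumed to be the frame that contains rec:

	if (rec_get_deleted_flag(rec, page_is_comp(page))) {
		... rec carries the delete mark ...
	}
)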
*/ UNIV_INLINE -ibool +ulint rec_get_deleted_flag( /*=================*/ - /* out: TRUE if delete marked */ - rec_t* rec) /* in: physical record */ + /* out: nonzero if delete marked */ + rec_t* rec, /* in: physical record */ + ulint comp) /* in: nonzero=compact page format */ { - if (REC_INFO_DELETED_FLAG & rec_get_info_bits(rec)) { - - return(TRUE); + if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) { + return(UNIV_UNLIKELY(rec_get_bit_field_1(rec, + REC_NEW_INFO_BITS, REC_INFO_DELETED_FLAG, + REC_INFO_BITS_SHIFT))); + } else { + return(UNIV_UNLIKELY(rec_get_bit_field_1(rec, + REC_OLD_INFO_BITS, REC_INFO_DELETED_FLAG, + REC_INFO_BITS_SHIFT))); } - - return(FALSE); } /********************************************************** @@ -391,23 +603,32 @@ void rec_set_deleted_flag( /*=================*/ rec_t* rec, /* in: physical record */ - ibool flag) /* in: TRUE if delete marked */ + ulint comp, /* in: nonzero=compact page format */ + ulint flag) /* in: nonzero if delete marked */ { - ulint old_val; - ulint new_val; - - ut_ad(TRUE == 1); - ut_ad(flag <= TRUE); + ulint val; - old_val = rec_get_info_bits(rec); + val = rec_get_info_bits(rec, comp); if (flag) { - new_val = REC_INFO_DELETED_FLAG | old_val; + val |= REC_INFO_DELETED_FLAG; } else { - new_val = ~REC_INFO_DELETED_FLAG & old_val; + val &= ~REC_INFO_DELETED_FLAG; } - rec_set_info_bits(rec, new_val); + rec_set_info_bits(rec, comp, val); +} + +/********************************************************** +The following function tells if a new-style record is a node pointer. */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*=================*/ + /* out: TRUE if node pointer */ + rec_t* rec) /* in: physical record */ +{ + return(REC_STATUS_NODE_PTR == rec_get_status(rec)); } /********************************************************** @@ -418,14 +639,16 @@ ulint rec_get_heap_no( /*=============*/ /* out: heap order number */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ulint comp) /* in: nonzero=compact page format */ { ulint ret; ut_ad(rec); - ret = rec_get_bit_field_2(rec, REC_HEAP_NO, REC_HEAP_NO_MASK, - REC_HEAP_NO_SHIFT); + ret = rec_get_bit_field_2(rec, + comp ? REC_NEW_HEAP_NO : REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); ut_ad(ret <= REC_MAX_HEAP_NO); return(ret); @@ -438,12 +661,14 @@ void rec_set_heap_no( /*=============*/ rec_t* rec, /* in: physical record */ + ulint comp, /* in: nonzero=compact page format */ ulint heap_no)/* in: the heap number */ { ut_ad(heap_no <= REC_MAX_HEAP_NO); - rec_set_bit_field_2(rec, heap_no, REC_HEAP_NO, REC_HEAP_NO_MASK, - REC_HEAP_NO_SHIFT); + rec_set_bit_field_2(rec, heap_no, + comp ? 
REC_NEW_HEAP_NO : REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); } /********************************************************** @@ -456,10 +681,12 @@ rec_get_1byte_offs_flag( /* out: TRUE if 1-byte form */ rec_t* rec) /* in: physical record */ { - ut_ad(TRUE == 1); +#if TRUE != 1 +#error "TRUE != 1" +#endif - return(rec_get_bit_field_1(rec, REC_SHORT, REC_SHORT_MASK, - REC_SHORT_SHIFT)); + return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT)); } /********************************************************** @@ -471,11 +698,13 @@ rec_set_1byte_offs_flag( rec_t* rec, /* in: physical record */ ibool flag) /* in: TRUE if 1byte form */ { - ut_ad(TRUE == 1); +#if TRUE != 1 +#error "TRUE != 1" +#endif ut_ad(flag <= TRUE); - rec_set_bit_field_1(rec, flag, REC_SHORT, REC_SHORT_MASK, - REC_SHORT_SHIFT); + rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT); } /********************************************************** @@ -492,9 +721,9 @@ rec_1_get_field_end_info( ulint n) /* in: field index */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - return(mach_read_from_1(rec - (REC_N_EXTRA_BYTES + n + 1))); + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1))); } /********************************************************** @@ -511,73 +740,281 @@ rec_2_get_field_end_info( ulint n) /* in: field index */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - return(mach_read_from_2(rec - (REC_N_EXTRA_BYTES + 2 * n + 2))); + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2))); } -/*************************************************************** -Gets the value of the ith field extern storage bit. If it is TRUE -it means that the field is stored on another page. */ +#ifdef UNIV_DEBUG +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 4 +#else /* UNIV_DEBUG */ +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 2 +#endif /* UNIV_DEBUG */ + +/* Get the base address of offsets. The extra_size is stored at +this position, and following positions hold the end offsets of +the fields. */ +#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE) + +/************************************************************** +The following function returns the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +ulint +rec_offs_get_n_alloc( +/*=================*/ + /* out: number of elements */ + const ulint* offsets)/* in: array for rec_get_offsets() */ +{ + ulint n_alloc; + ut_ad(offsets); + n_alloc = offsets[0]; + ut_ad(n_alloc > 0); + return(n_alloc); +} + +/************************************************************** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + ulint* offsets, /* in: array for rec_get_offsets() */ + ulint n_alloc) /* in: number of elements */ +{ + ut_ad(offsets); + ut_ad(n_alloc > 0); + offsets[0] = n_alloc; +} + +/************************************************************** +The following function returns the number of fields in a record. 
*/ +UNIV_INLINE +ulint +rec_offs_n_fields( +/*===============*/ + /* out: number of fields */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint n_fields; + ut_ad(offsets); + n_fields = offsets[1]; + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + return(n_fields); +} + +/**************************************************************** +Validates offsets returned by rec_get_offsets(). */ UNIV_INLINE ibool -rec_get_nth_field_extern_bit( -/*=========================*/ - /* in: TRUE or FALSE */ - rec_t* rec, /* in: record */ - ulint i) /* in: ith field */ +rec_offs_validate( +/*==============*/ + /* out: TRUE if valid */ + rec_t* rec, /* in: record or NULL */ + dict_index_t* index, /* in: record descriptor or NULL */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint info; + ulint i = rec_offs_n_fields(offsets); + ulint last = ULINT_MAX; + ulint comp = *rec_offs_base(offsets) & REC_OFFS_COMPACT; + + if (rec) { + ut_ad((ulint) rec == offsets[2]); + if (!comp) { + ut_a(rec_get_n_fields_old(rec) >= i); + } + } + if (index) { + ulint max_n_fields; + ut_ad((ulint) index == offsets[3]); + max_n_fields = ut_max( + dict_index_get_n_fields(index), + dict_index_get_n_unique_in_tree(index) + 1); + if (comp && rec) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_NODE_PTR: + max_n_fields = + dict_index_get_n_unique_in_tree(index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + max_n_fields = 1; + break; + default: + ut_error; + } + } + /* index->n_def == 0 for dummy indexes if !comp */ + ut_a(!comp || index->n_def); + ut_a(!index->n_def || i <= max_n_fields); + } + while (i--) { + ulint curr = rec_offs_base(offsets)[1 + i] & REC_OFFS_MASK; + ut_a(curr <= last); + last = curr; + } + return(TRUE); +} +/**************************************************************** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. */ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + rec_t* rec __attribute__((unused)), + /* in: record */ + dict_index_t* index __attribute__((unused)), + /* in: record descriptor */ + ulint* offsets __attribute__((unused))) + /* in: array returned by rec_get_offsets() */ +{ +#ifdef UNIV_DEBUG + ut_ad(rec_get_n_fields(rec, index) >= rec_offs_n_fields(offsets)); + offsets[2] = (ulint) rec; + offsets[3] = (ulint) index; +#endif /* UNIV_DEBUG */ +} - if (rec_get_1byte_offs_flag(rec)) { +/**************************************************************** +The following function is used to get a pointer to the nth +data field in a record. 
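(Editorial note, not part of the original source: a sketch of the calling pattern this interface assumes. The offsets array is computed once with rec_get_offsets() and then reused for every field access; REC_OFFS_NORMAL_SIZE is assumed to be the usual size of a stack-allocated offsets array, and the first array element must be set to the number of allocated elements before the call:

	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	ulint		i;
	ulint		len;
	byte*		data;

	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
	offsets = rec_get_offsets(rec, index, offsets,
					ULINT_UNDEFINED, &heap);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		data = rec_get_nth_field(rec, offsets, i, &len);
		... len == UNIV_SQL_NULL for SQL NULL fields ...
	}

	if (heap) {
		mem_heap_free(heap);
	}
)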
*/ +UNIV_INLINE +byte* +rec_get_nth_field( +/*==============*/ + /* out: pointer to the field */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index of the field */ + ulint* len) /* out: length of the field; UNIV_SQL_NULL + if SQL null */ +{ + byte* field; + ulint length; + ut_ad(rec); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + ut_ad(len); - return(FALSE); + if (UNIV_UNLIKELY(n == 0)) { + field = rec; + } else { + field = rec + (rec_offs_base(offsets)[n] & REC_OFFS_MASK); } - info = rec_2_get_field_end_info(rec, i); + length = rec_offs_base(offsets)[1 + n]; - if (info & REC_2BYTE_EXTERN_MASK) { - return(TRUE); + if (length & REC_OFFS_SQL_NULL) { + length = UNIV_SQL_NULL; + } else { + length &= REC_OFFS_MASK; + length -= field - rec; } - return(FALSE); + *len = length; + return(field); } /********************************************************** -Returns TRUE if the extern bit is set in any of the fields -of rec. */ +Determine if the offsets are for a record in the new +compact format. */ UNIV_INLINE -ibool -rec_contains_externally_stored_field( -/*=================================*/ - /* out: TRUE if a field is stored externally */ - rec_t* rec) /* in: record */ +ulint +rec_offs_comp( +/*==========*/ + /* out: nonzero if compact format */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint n; - ulint i; - - if (rec_get_1byte_offs_flag(rec)) { - - return(FALSE); - } + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return(*rec_offs_base(offsets) & REC_OFFS_COMPACT); +} - n = rec_get_n_fields(rec); +/********************************************************** +Returns nonzero if the extern bit is set in nth field of rec. */ +UNIV_INLINE +ulint +rec_offs_nth_extern( +/*================*/ + /* out: nonzero if externally stored */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return(UNIV_UNLIKELY(rec_offs_base(offsets)[1 + n] + & REC_OFFS_EXTERNAL)); +} - for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { +/********************************************************** +Gets the physical size of a field. */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + /* out: length of field */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return((rec_offs_base(offsets)[1 + n] - rec_offs_base(offsets)[n]) + & REC_OFFS_MASK); +} +/********************************************************** +Returns TRUE if the extern bit is set in any of the fields +of an old-style record. */ +UNIV_INLINE +ibool +rec_offs_any_extern( +/*================*/ + /* out: TRUE if a field is stored externally */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint i; + for (i = rec_offs_n_fields(offsets); i--; ) { + if (rec_offs_nth_extern(offsets, i)) { return(TRUE); } } - return(FALSE); } +/*************************************************************** +Sets the value of the ith field extern storage bit. 
*/ +UNIV_INLINE +void +rec_set_nth_field_extern_bit( +/*=========================*/ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint i, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ +{ + if (UNIV_LIKELY(index->table->comp)) { + rec_set_nth_field_extern_bit_new(rec, index, i, val, mtr); + } else { + rec_set_nth_field_extern_bit_old(rec, i, val, mtr); + } +} + /********************************************************** Returns the offset of n - 1th field end if the record is stored in the 1-byte offsets form. If the field is SQL null, the flag is ORed in the returned value. This function and the 2-byte counterpart are defined here because the -C-compilerwas not able to sum negative and positive constant offsets, and +C-compiler was not able to sum negative and positive constant offsets, and warned of constant arithmetic overflow within the compiler. */ UNIV_INLINE ulint @@ -589,9 +1026,9 @@ rec_1_get_prev_field_end_info( ulint n) /* in: field index */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); - return(mach_read_from_1(rec - (REC_N_EXTRA_BYTES + n))); + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n))); } /********************************************************** @@ -608,9 +1045,9 @@ rec_2_get_prev_field_end_info( ulint n) /* in: field index */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); - return(mach_read_from_2(rec - (REC_N_EXTRA_BYTES + 2 * n))); + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n))); } /********************************************************** @@ -625,9 +1062,9 @@ rec_1_set_field_end_info( ulint info) /* in: value to set */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - mach_write_to_1(rec - (REC_N_EXTRA_BYTES + n + 1), info); + mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info); } /********************************************************** @@ -642,9 +1079,9 @@ rec_2_set_field_end_info( ulint info) /* in: value to set */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - mach_write_to_2(rec - (REC_N_EXTRA_BYTES + 2 * n + 2), info); + mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info); } /********************************************************** @@ -659,7 +1096,7 @@ rec_1_get_field_start_offs( ulint n) /* in: field index */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); if (n == 0) { @@ -682,7 +1119,7 @@ rec_2_get_field_start_offs( ulint n) /* in: field index */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); if (n == 0) { @@ -707,7 +1144,7 @@ rec_get_field_start_offs( ulint n) /* in: field index */ { ut_ad(rec); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); if (n == 0) { @@ -723,8 +1160,9 @@ rec_get_field_start_offs( } /**************************************************************** -Gets the physical size of a field. Also an SQL null may have a field of -size > 0, if the data type is of a fixed size. */ +Gets the physical size of an old-style field. 
+Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. */ UNIV_INLINE ulint rec_get_nth_field_size( @@ -744,133 +1182,134 @@ rec_get_nth_field_size( return(next_os - os); } -/**************************************************************** -The following function is used to get a copy of the nth data field in a -record to a buffer. */ -UNIV_INLINE -void -rec_copy_nth_field( -/*===============*/ - void* buf, /* in: pointer to the buffer */ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - ulint* len) /* out: length of the field; UNIV_SQL_NULL if SQL - null */ -{ - byte* ptr; - - ut_ad(buf && rec && len); - - ptr = rec_get_nth_field(rec, n, len); - - if (*len == UNIV_SQL_NULL) { - - return; - } - - ut_memcpy(buf, ptr, *len); -} - /*************************************************************** This is used to modify the value of an already existing field in a record. The previous value must have exactly the same size as the new value. If len -is UNIV_SQL_NULL then the field is treated as an SQL null. */ +is UNIV_SQL_NULL then the field is treated as an SQL null for old-style +records. For new-style records, len must not be UNIV_SQL_NULL. */ UNIV_INLINE void rec_set_nth_field( /*==============*/ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - void* data, /* in: pointer to the data if not SQL null */ - ulint len) /* in: length of the data or UNIV_SQL_NULL */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index number of the field */ + const void* data, /* in: pointer to the data + if not SQL null */ + ulint len) /* in: length of the data or UNIV_SQL_NULL. + If not SQL null, must have the same + length as the previous value. + If SQL null, previous value must be + SQL null. */ { byte* data2; ulint len2; - ut_ad((len == UNIV_SQL_NULL) - || (rec_get_nth_field_size(rec, n) == len)); - + ut_ad(rec); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + if (len == UNIV_SQL_NULL) { + ut_ad(!rec_offs_comp(offsets)); rec_set_nth_field_sql_null(rec, n); return; } - data2 = rec_get_nth_field(rec, n, &len2); - - ut_memcpy(data2, data, len); - + data2 = rec_get_nth_field(rec, offsets, n, &len2); if (len2 == UNIV_SQL_NULL) { - + ut_ad(!rec_offs_comp(offsets)); rec_set_nth_field_null_bit(rec, n, FALSE); + ut_ad(len == rec_get_nth_field_size(rec, n)); + } else { + ut_ad(len2 == len); } + + ut_memcpy(data2, data, len); } /************************************************************** -The following function returns the data size of a physical +The following function returns the data size of an old-style physical record, that is the sum of field lengths. SQL null fields are counted as length 0 fields. The value returned by the function is the distance from record origin to record end in bytes. */ UNIV_INLINE ulint -rec_get_data_size( -/*==============*/ - /* out: size */ +rec_get_data_size_old( +/*==================*/ + /* out: size */ rec_t* rec) /* in: physical record */ { ut_ad(rec); - return(rec_get_field_start_offs(rec, rec_get_n_fields(rec))); + return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec))); } /************************************************************** -Returns the total size of record minus data size of record. The value -returned by the function is the distance from record start to record origin -in bytes. */ +The following function sets the number of fields in offsets. 
*/ +UNIV_INLINE +void +rec_offs_set_n_fields( +/*==================*/ + ulint* offsets, /* in: array returned by rec_get_offsets() */ + ulint n_fields) /* in: number of fields */ +{ + ut_ad(offsets); + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + offsets[1] = n_fields; +} + +/************************************************************** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. */ UNIV_INLINE ulint -rec_get_extra_size( +rec_offs_data_size( /*===============*/ - /* out: size */ - rec_t* rec) /* in: physical record */ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint n_fields; - - ut_ad(rec); - - n_fields = rec_get_n_fields(rec); + ulint size; - if (rec_get_1byte_offs_flag(rec)) { - - return(REC_N_EXTRA_BYTES + n_fields); - } + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = rec_offs_base(offsets)[rec_offs_n_fields(offsets)] + & REC_OFFS_MASK; + ut_ad(size < UNIV_PAGE_SIZE); + return(size); +} - return(REC_N_EXTRA_BYTES + 2 * n_fields); +/************************************************************** +Returns the total size of record minus data size of record. The value +returned by the function is the distance from record start to record origin +in bytes. */ +UNIV_INLINE +ulint +rec_offs_extra_size( +/*================*/ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint size; + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = *rec_offs_base(offsets) & ~REC_OFFS_COMPACT; + ut_ad(size < UNIV_PAGE_SIZE); + return(size); } -/************************************************************** +/************************************************************** Returns the total size of a physical record. 
*/ UNIV_INLINE ulint -rec_get_size( -/*=========*/ - /* out: size */ - rec_t* rec) /* in: physical record */ +rec_offs_size( +/*==========*/ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint n_fields; - - ut_ad(rec); - - n_fields = rec_get_n_fields(rec); - - if (rec_get_1byte_offs_flag(rec)) { - - return(REC_N_EXTRA_BYTES + n_fields - + rec_1_get_field_start_offs(rec, n_fields)); - } - - return(REC_N_EXTRA_BYTES + 2 * n_fields - + rec_2_get_field_start_offs(rec, n_fields)); + return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets)); } /************************************************************** @@ -879,10 +1318,11 @@ UNIV_INLINE byte* rec_get_end( /*========*/ - /* out: pointer to end */ - rec_t* rec) /* in: pointer to record */ + /* out: pointer to end */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - return(rec + rec_get_data_size(rec)); + return(rec + rec_offs_data_size(offsets)); } /************************************************************** @@ -891,10 +1331,11 @@ UNIV_INLINE byte* rec_get_start( /*==========*/ - /* out: pointer to start */ - rec_t* rec) /* in: pointer to record */ + /* out: pointer to start */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - return(rec - rec_get_extra_size(rec)); + return(rec - rec_offs_extra_size(offsets)); } /******************************************************************* @@ -903,18 +1344,20 @@ UNIV_INLINE rec_t* rec_copy( /*=====*/ - /* out: pointer to the origin of the copied record */ - void* buf, /* in: buffer */ - rec_t* rec) /* in: physical record */ + /* out: pointer to the origin of the copy */ + void* buf, /* in: buffer */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint extra_len; ulint data_len; ut_ad(rec && buf); - ut_ad(rec_validate(rec)); + ut_ad(rec_offs_validate((rec_t*) rec, NULL, offsets)); + ut_ad(rec_validate((rec_t*) rec, offsets)); - extra_len = rec_get_extra_size(rec); - data_len = rec_get_data_size(rec); + extra_len = rec_offs_extra_size(offsets); + data_len = rec_offs_data_size(offsets); ut_memcpy(buf, rec - extra_len, extra_len + data_len); @@ -922,8 +1365,8 @@ rec_copy( } /************************************************************** -Returns the extra size of a physical record if we know its data size and -the number of fields. */ +Returns the extra size of an old-style physical record if we know its +data size and number of fields. */ UNIV_INLINE ulint rec_get_converted_extra_size( @@ -934,28 +1377,51 @@ rec_get_converted_extra_size( { if (data_size <= REC_1BYTE_OFFS_LIMIT) { - return(REC_N_EXTRA_BYTES + n_fields); + return(REC_N_OLD_EXTRA_BYTES + n_fields); } - return(REC_N_EXTRA_BYTES + 2 * n_fields); + return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields); } /************************************************************** The following function returns the size of a data tuple when converted to +a new-style physical record. */ + +ulint +rec_get_converted_size_new( +/*=======================*/ + /* out: size */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple);/* in: data tuple */ +/************************************************************** +The following function returns the size of a data tuple when converted to a physical record. 
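(Editorial note, not part of the original source: a minimal sketch of how this size is typically consumed, assuming a memory heap is available; the buffer obtained from mem_heap_alloc() is then filled in by rec_convert_dtuple_to_rec(), whose new signature is declared in rem0rec.h:

	byte*	buf	= mem_heap_alloc(heap,
				rec_get_converted_size(index, dtuple));
	rec_t*	rec	= rec_convert_dtuple_to_rec(buf, index, dtuple);
)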
*/ UNIV_INLINE ulint rec_get_converted_size( /*===================*/ /* out: size */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* dtuple) /* in: data tuple */ { ulint data_size; ulint extra_size; - + + ut_ad(index); ut_ad(dtuple); ut_ad(dtuple_check_typed(dtuple)); + ut_ad(index->type & DICT_UNIVERSAL + || dtuple_get_n_fields(dtuple) == + (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) + == REC_STATUS_NODE_PTR) + ? dict_index_get_n_unique_in_tree(index) + 1 + : dict_index_get_n_fields(index))); + + if (UNIV_LIKELY(index->table->comp)) { + return(rec_get_converted_size_new(index, dtuple)); + } + data_size = dtuple_get_data_size(dtuple); extra_size = rec_get_converted_extra_size( @@ -971,12 +1437,15 @@ UNIV_INLINE ulint rec_fold( /*=====*/ - /* out: the folded value */ - rec_t* rec, /* in: the physical record */ - ulint n_fields, /* in: number of complete fields to fold */ - ulint n_bytes, /* in: number of bytes to fold in an - incomplete last field */ - dulint tree_id) /* in: index tree id */ + /* out: the folded value */ + rec_t* rec, /* in: the physical record */ + const ulint* offsets, /* in: array returned by + rec_get_offsets() */ + ulint n_fields, /* in: number of complete + fields to fold */ + ulint n_bytes, /* in: number of bytes to fold + in an incomplete last field */ + dulint tree_id) /* in: index tree id */ { ulint i; byte* data; @@ -984,12 +1453,13 @@ rec_fold( ulint fold; ulint n_fields_rec; - ut_ad(rec_validate(rec)); - ut_ad(n_fields <= rec_get_n_fields(rec)); - ut_ad((n_fields < rec_get_n_fields(rec)) || (n_bytes == 0)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate((rec_t*) rec, offsets)); ut_ad(n_fields + n_bytes > 0); - - n_fields_rec = rec_get_n_fields(rec); + + n_fields_rec = rec_offs_n_fields(offsets); + ut_ad(n_fields <= n_fields_rec); + ut_ad(n_fields < n_fields_rec || n_bytes == 0); if (n_fields > n_fields_rec) { n_fields = n_fields_rec; @@ -1002,7 +1472,7 @@ rec_fold( fold = ut_fold_dulint(tree_id); for (i = 0; i < n_fields; i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); if (len != UNIV_SQL_NULL) { fold = ut_fold_ulint_pair(fold, @@ -1011,7 +1481,7 @@ rec_fold( } if (n_bytes > 0) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); if (len != UNIV_SQL_NULL) { if (len > n_bytes) { @@ -1025,19 +1495,3 @@ rec_fold( return(fold); } - -/************************************************************* -Builds a physical record out of a data tuple and stores it beginning from -the address destination. */ -UNIV_INLINE -rec_t* -rec_convert_dtuple_to_rec( -/*======================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple) /* in: data tuple */ -{ - return(rec_convert_dtuple_to_rec_low(destination, dtuple, - dtuple_get_data_size(dtuple))); -} diff --git a/innobase/include/row0mysql.h b/innobase/include/row0mysql.h index 13773ed380d..4e6ff73b0f8 100644 --- a/innobase/include/row0mysql.h +++ b/innobase/include/row0mysql.h @@ -22,36 +22,6 @@ Created 9/17/2000 Heikki Tuuri typedef struct row_prebuilt_struct row_prebuilt_t; /*********************************************************************** -Stores a variable-length field (like VARCHAR) length to dest, in the -MySQL format. 
*/ -UNIV_INLINE -byte* -row_mysql_store_var_len( -/*====================*/ - /* out: dest + 2 */ - byte* dest, /* in: where to store */ - ulint len); /* in: length, must fit in two bytes */ -/*********************************************************************** -Reads a MySQL format variable-length field (like VARCHAR) length and -returns pointer to the field data. */ -UNIV_INLINE -byte* -row_mysql_read_var_ref( -/*===================*/ - /* out: field + 2 */ - ulint* len, /* out: variable-length field length */ - byte* field); /* in: field */ -/*********************************************************************** -Reads a MySQL format variable-length field (like VARCHAR) length and -returns pointer to the field data. */ - -byte* -row_mysql_read_var_ref_noninline( -/*=============================*/ - /* out: field + 2 */ - ulint* len, /* out: variable-length field length */ - byte* field); /* in: field */ -/*********************************************************************** Frees the blob heap in prebuilt when no longer needed. */ void @@ -60,6 +30,30 @@ row_mysql_prebuilt_free_blob_heap( row_prebuilt_t* prebuilt); /* in: prebuilt struct of a ha_innobase:: table handle */ /*********************************************************************** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. */ + +byte* +row_mysql_store_true_var_len( +/*=========================*/ + /* out: pointer to the data, we skip the 1 or 2 bytes + at the start that are used to store the len */ + byte* dest, /* in: where to store */ + ulint len, /* in: length, must fit in two bytes */ + ulint lenlen);/* in: storage length of len: either 1 or 2 bytes */ +/*********************************************************************** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. */ + +byte* +row_mysql_read_true_varchar( +/*========================*/ + /* out: pointer to the data, we skip the 1 or 2 bytes + at the start that are used to store the len */ + ulint* len, /* out: variable-length field length */ + byte* field, /* in: field in the MySQL format */ + ulint lenlen);/* in: storage length of len: either 1 or 2 bytes */ +/*********************************************************************** Stores a reference to a BLOB in the MySQL format. */ void @@ -83,23 +77,40 @@ row_mysql_read_blob_ref( ulint col_len); /* in: BLOB reference length (not BLOB length) */ /****************************************************************** -Stores a non-SQL-NULL field given in the MySQL format in the Innobase -format. */ -UNIV_INLINE -void +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.c. */ + +byte* row_mysql_store_col_in_innobase_format( /*===================================*/ - dfield_t* dfield, /* in/out: dfield */ - byte* buf, /* in/out: buffer for the converted - value */ + /* out: up to which byte we used + buf in the conversion */ + dfield_t* dfield, /* in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /* in/out: buffer for a converted + integer value; this must be at least + col_len long then! */ + ibool row_format_col, /* TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! 
*/ byte* mysql_data, /* in: MySQL column value, not SQL NULL; NOTE that dfield may also get a pointer to mysql_data, therefore do not discard this as long as dfield is used! */ - ulint col_len, /* in: MySQL column length */ - ulint type, /* in: data type */ - ulint is_unsigned); /* in: != 0 if unsigned integer type */ + ulint col_len, /* in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp); /* in: nonzero=compact format */ /******************************************************************** Handles user errors and lock waits detected by the database engine. */ @@ -161,14 +172,6 @@ row_lock_table_autoinc_for_mysql( row_prebuilt_t* prebuilt); /* in: prebuilt struct in the MySQL table handle */ /************************************************************************* -Unlocks all table locks explicitly requested by trx (with LOCK TABLES, -lock type LOCK_TABLE_EXP). */ - -void -row_unlock_tables_for_mysql( -/*========================*/ - trx_t* trx); /* in: transaction */ -/************************************************************************* Sets a table lock on the table mentioned in prebuilt. */ int @@ -179,9 +182,10 @@ row_lock_table_for_mysql( table handle */ dict_table_t* table, /* in: table to lock, or NULL if prebuilt->table should be - locked as LOCK_TABLE_EXP | + locked as prebuilt->select_lock_type */ - ulint mode); /* in: lock mode of table */ + ulint mode); /* in: lock mode of table + (ignored if table==NULL) */ /************************************************************************* Does an insert for MySQL. */ @@ -240,6 +244,27 @@ row_update_for_mysql( row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL handle */ /************************************************************************* +This can only be used when srv_locks_unsafe_for_binlog is TRUE. Before +calling this function we must use trx_reset_new_rec_lock_info() and +trx_register_new_rec_lock() to store the information which new record locks +really were set. This function removes a newly set lock under prebuilt->pcur, +and also under prebuilt->clust_pcur. Currently, this is only used and tested +in the case of an UPDATE or a DELETE statement, where the row lock is of the +LOCK_X type. +Thus, this implements a 'mini-rollback' that releases the latest record +locks we set. */ + +int +row_unlock_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL + handle */ + ibool has_latches_on_recs);/* TRUE if called so that we have + the latches on the records under pcur + and clust_pcur, and we do not need to + reposition the cursors. */ +/************************************************************************* Creates an query graph node of 'update' type to be used in the MySQL interface. */ @@ -352,6 +377,15 @@ row_get_background_drop_list_len_low(void); /*======================================*/ /* out: how many tables in list */ /************************************************************************* +Truncates a table for MySQL. */ + +int +row_truncate_table_for_mysql( +/*=========================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in: table handle */ + trx_t* trx); /* in: transaction handle */ +/************************************************************************* Drops a table for MySQL. 
If the name of the dropped table ends to characters INNODB_MONITOR, then this also stops printing of monitor output by the master thread. */ @@ -436,8 +470,22 @@ struct mysql_row_templ_struct { zero if column cannot be NULL */ ulint type; /* column type in Innobase mtype numbers DATA_CHAR... */ + ulint mysql_type; /* MySQL type code; this is always + < 256 */ + ulint mysql_length_bytes; /* if mysql_type + == DATA_MYSQL_TRUE_VARCHAR, this tells + whether we should use 1 or 2 bytes to + store the MySQL true VARCHAR data + length at the start of row in the MySQL + format (NOTE that the MySQL key value + format always uses 2 bytes for the data + len) */ ulint charset; /* MySQL charset-collation code of the column, or zero */ + ulint mbminlen; /* minimum length of a char, in bytes, + or zero if not a char type */ + ulint mbmaxlen; /* maximum length of a char, in bytes, + or zero if not a char type */ ulint is_unsigned; /* if a column type is an integer type and this field is != 0, then it is an unsigned integer type */ @@ -554,6 +602,8 @@ struct row_prebuilt_struct { that was decided in ha_innodb.cc, ::store_lock(), ::external_lock(), etc. */ + ulint mysql_prefix_len;/* byte offset of the end of + the last requested column */ ulint mysql_row_len; /* length in bytes of a row in the MySQL format */ ulint n_rows_fetched; /* number of rows fetched after @@ -569,6 +619,10 @@ struct row_prebuilt_struct { allocated mem buf start, because there is a 4 byte magic number at the start and at the end */ + ibool keep_other_fields_on_keyread; /* when using fetch + cache with HA_EXTRA_KEYREAD, don't + overwrite other fields in mysql row + row buffer.*/ ulint fetch_cache_first;/* position of the first not yet fetched row in fetch_cache */ ulint n_fetch_cached; /* number of not yet fetched rows diff --git a/innobase/include/row0mysql.ic b/innobase/include/row0mysql.ic index fc922b52d0a..aa8a70d8761 100644 --- a/innobase/include/row0mysql.ic +++ b/innobase/include/row0mysql.ic @@ -5,122 +5,3 @@ MySQL interface for Innobase Created 1/23/2001 Heikki Tuuri *******************************************************/ - -/*********************************************************************** -Stores a variable-length field (like VARCHAR) length to dest, in the -MySQL format. No real var implemented in MySQL yet! */ -UNIV_INLINE -byte* -row_mysql_store_var_len( -/*====================*/ - /* out: dest + 2 */ - byte* dest, /* in: where to store */ - ulint len __attribute__((unused))) /* in: length, must fit in two - bytes */ -{ - ut_ad(len < 256 * 256); -/* - mach_write_to_2_little_endian(dest, len); - - return(dest + 2); -*/ - return(dest); /* No real var implemented in MySQL yet! */ -} - -/*********************************************************************** -Reads a MySQL format variable-length field (like VARCHAR) length and -returns pointer to the field data. No real var implemented in MySQL yet! */ -UNIV_INLINE -byte* -row_mysql_read_var_ref( -/*===================*/ - /* out: field + 2 */ - ulint* len, /* out: variable-length field length; does not work - yet! */ - byte* field) /* in: field */ -{ -/* - *len = mach_read_from_2_little_endian(field); - - return(field + 2); -*/ - UT_NOT_USED(len); - - return(field); /* No real var implemented in MySQL yet! */ -} - -/****************************************************************** -Stores a non-SQL-NULL field given in the MySQL format in the Innobase -format. 
*/ -UNIV_INLINE -void -row_mysql_store_col_in_innobase_format( -/*===================================*/ - dfield_t* dfield, /* in/out: dfield */ - byte* buf, /* in/out: buffer for the converted - value; this must be at least col_len - long! */ - byte* mysql_data, /* in: MySQL column value, not - SQL NULL; NOTE that dfield may also - get a pointer to mysql_data, - therefore do not discard this as long - as dfield is used! */ - ulint col_len, /* in: MySQL column length */ - ulint type, /* in: data type */ - ulint is_unsigned) /* in: != 0 if unsigned integer type */ -{ - byte* ptr = mysql_data; - - if (type == DATA_INT) { - /* Store integer data in Innobase in a big-endian format, - sign bit negated */ - - ptr = buf + col_len; - - for (;;) { - ptr--; - *ptr = *mysql_data; - if (ptr == buf) { - break; - } - mysql_data++; - } - - if (!is_unsigned) { - *ptr = (byte) (*ptr ^ 128); - } - } else if (type == DATA_VARCHAR || type == DATA_VARMYSQL - || type == DATA_BINARY) { - /* Remove trailing spaces. */ - - /* Handle UCS2 strings differently. As no new - collations will be introduced in 4.1, we hardcode the - charset-collation codes here. In 5.0, the logic will - be based on mbminlen. */ - ulint cset = dtype_get_charset_coll( - dtype_get_prtype(dfield_get_type(dfield))); - ptr = row_mysql_read_var_ref(&col_len, mysql_data); - if (cset == 35/*ucs2_general_ci*/ - || cset == 90/*ucs2_bin*/ - || (cset >= 128/*ucs2_unicode_ci*/ - && cset <= 144/*ucs2_persian_ci*/)) { - /* space=0x0020 */ - /* Trim "half-chars", just in case. */ - col_len &= ~1; - - while (col_len >= 2 && ptr[col_len - 2] == 0x00 - && ptr[col_len - 1] == 0x20) { - col_len -= 2; - } - } else { - /* space=0x20 */ - while (col_len > 0 && ptr[col_len - 1] == 0x20) { - col_len--; - } - } - } else if (type == DATA_BLOB) { - ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len); - } - - dfield_set_data(dfield, ptr, col_len); -} diff --git a/innobase/include/row0row.h b/innobase/include/row0row.h index 951e211fb37..782973d8f5d 100644 --- a/innobase/include/row0row.h +++ b/innobase/include/row0row.h @@ -27,7 +27,8 @@ row_get_rec_trx_id( /*===============*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Reads the roll pointer field from a clustered index record. */ UNIV_INLINE @@ -36,7 +37,8 @@ row_get_rec_roll_ptr( /*=================*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Writes the trx id field to a clustered index record. */ UNIV_INLINE @@ -45,7 +47,8 @@ row_set_rec_trx_id( /*===============*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ - dulint trx_id); /* in: value of the field */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + dulint trx_id);/* in: value of the field */ /************************************************************************* Sets the roll pointer field in a clustered index record. 
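/* Editor's sketch, not part of the patch: the DATA_INT branch of the
function removed above, written as a standalone helper for clarity.
The removed loop copies the integer into big-endian order, assuming
MySQL hands it over lowest byte first, and then negates the sign bit
of signed types so that a plain memcmp() gives the numeric ordering.
The helper name is invented for illustration. */
static void
row_sketch_store_int_in_innobase_format(
/*====================================*/
	byte*		buf,		/* out: col_len bytes in the
					InnoDB format */
	const byte*	mysql_data,	/* in: col_len bytes in the
					MySQL format */
	ulint		col_len,	/* in: storage length of the
					column */
	ulint		is_unsigned)	/* in: != 0 if unsigned type */
{
	ulint	i;

	for (i = 0; i < col_len; i++) {
		buf[col_len - 1 - i] = mysql_data[i];
	}

	if (!is_unsigned) {
		buf[0] = (byte) (buf[0] ^ 128);	/* negate the sign bit */
	}
}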
*/ UNIV_INLINE @@ -54,6 +57,7 @@ row_set_rec_roll_ptr( /*=================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint roll_ptr);/* in: value of the field */ /********************************************************************* When an insert to a table is performed, this function builds the entry which @@ -90,6 +94,9 @@ row_build( the buffer page of this record must be at least s-latched and the latch held as long as the row dtuple is used! */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) + or NULL, in which case this function + will invoke rec_get_offsets() */ mem_heap_t* heap); /* in: memory heap from which the memory needed is allocated */ /*********************************************************************** @@ -175,14 +182,15 @@ UNIV_INLINE void row_build_row_ref_fast( /*===================*/ - dtuple_t* ref, /* in: typed data tuple where the reference - is built */ - ulint* map, /* in: array of field numbers in rec telling - how ref should be built from the fields of - rec */ - rec_t* rec); /* in: record in the index; must be preserved - while ref is used, as we do not copy field - values to heap */ + dtuple_t* ref, /* in: typed data tuple where the + reference is built */ + const ulint* map, /* in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + rec_t* rec, /* in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /******************************************************************* Searches the clustered index record for a row, if we have the row reference. */ diff --git a/innobase/include/row0row.ic b/innobase/include/row0row.ic index 8e5121f5a96..85410beacf0 100644 --- a/innobase/include/row0row.ic +++ b/innobase/include/row0row.ic @@ -20,7 +20,8 @@ row_get_rec_sys_field( /* out: value of the field */ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Sets the trx id or roll ptr field in a clustered index record: this function is slower than the specialized inline functions. 
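/* Editor's sketch, not part of the patch: the calling convention that
the new "offsets" parameters above imply. The caller computes the
offsets array once with rec_get_offsets() and hands it to the
row_get_rec_* / row_set_rec_* functions. The stack-array idiom,
REC_OFFS_NORMAL_SIZE and the exact rec_get_offsets() signature are
recalled from rem0rec.h of this code line and are not shown in this
hunk; rec and index stand for values from the caller's context. */
{
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	dulint		trx_id;

	*offsets_ = (sizeof offsets_) / sizeof *offsets_;

	offsets = rec_get_offsets(rec, index, offsets,
				ULINT_UNDEFINED, &heap);

	trx_id = row_get_rec_trx_id(rec, index, offsets);

	if (heap) {
		mem_heap_free(heap);
	}
}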
*/ @@ -32,6 +33,7 @@ row_set_rec_sys_field( ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint val); /* in: value to set */ /************************************************************************* @@ -42,18 +44,21 @@ row_get_rec_trx_id( /*===============*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { return(trx_read_trx_id(rec + offset)); } else { - return(row_get_rec_sys_field(DATA_TRX_ID, rec, index)); + return(row_get_rec_sys_field(DATA_TRX_ID, + rec, index, offsets)); } } @@ -65,18 +70,21 @@ row_get_rec_roll_ptr( /*=================*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); } else { - return(row_get_rec_sys_field(DATA_ROLL_PTR, rec, index)); + return(row_get_rec_sys_field(DATA_ROLL_PTR, + rec, index, offsets)); } } @@ -88,18 +96,21 @@ row_set_rec_trx_id( /*===============*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint trx_id) /* in: value of the field */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { trx_write_trx_id(rec + offset, trx_id); } else { - row_set_rec_sys_field(DATA_TRX_ID, rec, index, trx_id); + row_set_rec_sys_field(DATA_TRX_ID, + rec, index, offsets, trx_id); } } @@ -111,18 +122,21 @@ row_set_rec_roll_ptr( /*=================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint roll_ptr)/* in: value of the field */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr); } else { - row_set_rec_sys_field(DATA_ROLL_PTR, rec, index, roll_ptr); + row_set_rec_sys_field(DATA_ROLL_PTR, + rec, index, offsets, roll_ptr); } } @@ -133,14 +147,15 @@ UNIV_INLINE void row_build_row_ref_fast( /*===================*/ - dtuple_t* ref, /* in: typed data tuple where the reference - is built */ - ulint* map, /* in: array of field numbers in rec telling - how ref should be built from the fields of - rec */ - rec_t* rec) /* in: record in the index; must be preserved - while ref is used, as we do not copy field - values to heap */ + dtuple_t* ref, /* in: typed data tuple where the + reference is built */ + const ulint* map, /* in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + rec_t* rec, /* in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { dfield_t* 
dfield; byte* field; @@ -149,6 +164,7 @@ row_build_row_ref_fast( ulint field_no; ulint i; + ut_ad(rec_offs_validate(rec, NULL, offsets)); ref_len = dtuple_get_n_fields(ref); for (i = 0; i < ref_len; i++) { @@ -158,7 +174,8 @@ row_build_row_ref_fast( if (field_no != ULINT_UNDEFINED) { - field = rec_get_nth_field(rec, field_no, &len); + field = rec_get_nth_field(rec, offsets, + field_no, &len); dfield_set_data(dfield, field, len); } } diff --git a/innobase/include/row0sel.ic b/innobase/include/row0sel.ic index 595cea1138b..600c6204571 100644 --- a/innobase/include/row0sel.ic +++ b/innobase/include/row0sel.ic @@ -75,7 +75,7 @@ open_step( } } - if (err != DB_SUCCESS) { + if (UNIV_EXPECT(err, DB_SUCCESS) != DB_SUCCESS) { /* SQL error detected */ fprintf(stderr, "SQL error %lu\n", (ulong) err); diff --git a/innobase/include/row0upd.h b/innobase/include/row0upd.h index 28210364833..673e0511153 100644 --- a/innobase/include/row0upd.h +++ b/innobase/include/row0upd.h @@ -80,6 +80,7 @@ row_upd_rec_sys_fields( /*===================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ trx_t* trx, /* in: transaction */ dulint roll_ptr);/* in: roll ptr of the undo log record */ /************************************************************************* @@ -124,8 +125,8 @@ row_upd_changes_field_size_or_external( /* out: TRUE if the update changes the size of some field in index or the field is external in rec or update */ - rec_t* rec, /* in: record in index */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update);/* in: update vector */ /*************************************************************** Replaces the new column values stored in the update vector to the record @@ -135,8 +136,9 @@ a clustered index */ void row_upd_rec_in_place( /*=================*/ - rec_t* rec, /* in/out: record where replaced */ - upd_t* update);/* in: update vector */ + rec_t* rec, /* in/out: record where replaced */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update);/* in: update vector */ /******************************************************************* Builds an update vector from those fields which in a secondary index entry differ from a record that has the equal ordering fields. NOTE: we compare @@ -274,10 +276,11 @@ recovery. */ void row_upd_rec_sys_fields_in_recovery( /*===============================*/ - rec_t* rec, /* in: record */ - ulint pos, /* in: TRX_ID position in rec */ - dulint trx_id, /* in: transaction id */ - dulint roll_ptr);/* in: roll ptr of the undo log record */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint pos, /* in: TRX_ID position in rec */ + dulint trx_id, /* in: transaction id */ + dulint roll_ptr);/* in: roll ptr of the undo log record */ /************************************************************************* Parses the log data written by row_upd_index_write_log. 
*/ diff --git a/innobase/include/row0upd.ic b/innobase/include/row0upd.ic index a124228a0de..acbb11aa1c7 100644 --- a/innobase/include/row0upd.ic +++ b/innobase/include/row0upd.ic @@ -83,7 +83,7 @@ upd_field_set_field_no( { upd_field->field_no = field_no; - if (field_no >= dict_index_get_n_fields(index)) { + if (UNIV_UNLIKELY(field_no >= dict_index_get_n_fields(index))) { fprintf(stderr, "InnoDB: Error: trying to access field %lu in ", (ulong) field_no); @@ -106,15 +106,17 @@ row_upd_rec_sys_fields( /*===================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ trx_t* trx, /* in: transaction */ dulint roll_ptr)/* in: roll ptr of the undo log record */ { ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); #ifdef UNIV_SYNC_DEBUG ut_ad(!buf_block_align(rec)->is_hashed || rw_lock_own(&btr_search_latch, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - row_set_rec_trx_id(rec, index, trx->id); - row_set_rec_roll_ptr(rec, index, roll_ptr); + row_set_rec_trx_id(rec, index, offsets, trx->id); + row_set_rec_roll_ptr(rec, index, offsets, roll_ptr); } diff --git a/innobase/include/row0vers.h b/innobase/include/row0vers.h index 30cf82144e9..079d841f7f3 100644 --- a/innobase/include/row0vers.h +++ b/innobase/include/row0vers.h @@ -30,7 +30,8 @@ row_vers_impl_x_locked_off_kernel( transaction; NOTE that the kernel mutex is temporarily released! */ rec_t* rec, /* in: record in a secondary index */ - dict_index_t* index); /* in: the secondary index */ + dict_index_t* index, /* in: the secondary index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /********************************************************************* Finds out if we must preserve a delete marked earlier version of a clustered index record, because it is >= the purge view. */ @@ -78,7 +79,11 @@ row_vers_build_for_consistent_read( mtr_t* mtr, /* in: mtr holding the latch on rec; it will also hold the latch on purge_view */ dict_index_t* index, /* in: the clustered index */ + ulint** offsets,/* in/out: offsets returned by + rec_get_offsets(rec, index) */ read_view_t* view, /* in: the consistent read view */ + mem_heap_t** offset_heap,/* in/out: memory heap from which + the offsets are allocated */ mem_heap_t* in_heap,/* in: memory heap from which the memory for old_vers is allocated; memory for possible intermediate versions is allocated and freed diff --git a/innobase/include/row0vers.ic b/innobase/include/row0vers.ic index 5ece47c35d1..ab1e264635b 100644 --- a/innobase/include/row0vers.ic +++ b/innobase/include/row0vers.ic @@ -11,73 +11,3 @@ Created 2/6/1997 Heikki Tuuri #include "read0read.h" #include "page0page.h" #include "log0recv.h" - -/************************************************************************* -Fetches the trx id of a clustered index record or version. */ -UNIV_INLINE -dulint -row_vers_get_trx_id( -/*================*/ - /* out: trx id or ut_dulint_zero if the - clustered index record not found */ - rec_t* rec, /* in: clustered index record, or an old - version of it */ - dict_table_t* table) /* in: table */ -{ - return(row_get_rec_trx_id(rec, dict_table_get_first_index(table))); -} - -/************************************************************************* -Checks if a consistent read can be performed immediately on the index -record, or if an older version is needed. 
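/* Editor's sketch, not part of the patch: the check described in the
comment above (the body of the removed helper follows below),
re-expressed against the new offsets-taking row_get_rec_trx_id().
Callers of the removed helper now perform this test themselves: an
older version of a clustered index record is needed exactly when the
read view does not see the trx id stored in the record. */
UNIV_INLINE
ibool
row_sketch_clust_rec_needs_old_vers(
/*================================*/
				/* out: TRUE if an older version
				is needed */
	rec_t*		rec,	/* in: clustered index record */
	dict_index_t*	index,	/* in: clustered index */
	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
	read_view_t*	view)	/* in: consistent read view */
{
	ut_ad(index->type & DICT_CLUSTERED);

	return(!read_view_sees_trx_id(view,
			row_get_rec_trx_id(rec, index, offsets)));
}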
*/ -UNIV_INLINE -ibool -row_vers_clust_rec_sees_older( -/*==========================*/ - /* out: FALSE if can read immediately */ - rec_t* rec, /* in: record which should be read or passed - over by a read cursor */ - dict_index_t* index, /* in: clustered index */ - read_view_t* view) /* in: read view */ -{ - ut_ad(index->type & DICT_CLUSTERED); - - if (read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index))) { - - return(FALSE); - } - - return(TRUE); -} - -/************************************************************************* -Checks if a secondary index record can be read immediately by a consistent -read, or if an older version may be needed. To be sure, we will have to -look in the clustered index. */ -UNIV_INLINE -ibool -row_vers_sec_rec_may_see_older( -/*===========================*/ - /* out: FALSE if can be read immediately */ - rec_t* rec, /* in: record which should be read or passed */ - dict_index_t* index __attribute__((unused)),/* in: secondary index */ - read_view_t* view) /* in: read view */ -{ - page_t* page; - - ut_ad(!(index->type & DICT_CLUSTERED)); - - page = buf_frame_align(rec); - - if ((ut_dulint_cmp(page_get_max_trx_id(page), view->up_limit_id) >= 0) - || recv_recovery_is_on()) { - - /* It may be that the record was inserted or modified by a - transaction the view should not see: we have to look in the - clustered index */ - - return(TRUE); - } - - return(FALSE); -} diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index 4352083b21f..116ae7b6438 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -93,20 +93,23 @@ extern ulint srv_max_n_open_files; extern ulint srv_max_dirty_pages_pct; extern ulint srv_force_recovery; -extern ulint srv_thread_concurrency; +extern ulong srv_thread_concurrency; extern ulint srv_max_n_threads; extern lint srv_conc_n_threads; -extern ibool srv_fast_shutdown; -extern ibool srv_very_fast_shutdown; /* if this TRUE, do not flush the +extern ulint srv_fast_shutdown; /* If this is 1, do not do a + purge and index buffer merge. + If this 2, do not even flush the buffer pool to data files at the - shutdown; we effectively 'crash' - InnoDB */ + shutdown: we effectively 'crash' + InnoDB (but lose no committed + transactions). 
*/ extern ibool srv_innodb_status; extern ibool srv_use_doublewrite_buf; +extern ibool srv_use_checksums; extern ibool srv_set_thread_priorities; extern int srv_query_thread_priority; @@ -131,7 +134,9 @@ extern ibool srv_print_innodb_table_monitor; extern ibool srv_lock_timeout_and_monitor_active; extern ibool srv_error_monitor_active; -extern ulint srv_n_spin_wait_rounds; +extern ulong srv_n_spin_wait_rounds; +extern ulong srv_n_free_tickets_to_enter; +extern ulong srv_thread_sleep_delay; extern ulint srv_spin_wait_delay; extern ibool srv_priority_boost; @@ -177,12 +182,70 @@ extern mutex_t* kernel_mutex_temp;/* mutex protecting the server, trx structs, #define kernel_mutex (*kernel_mutex_temp) #define SRV_MAX_N_IO_THREADS 100 +#define SRV_CONCURRENCY_THRESHOLD 20 /* Array of English strings describing the current state of an i/o handler thread */ extern const char* srv_io_thread_op_info[]; extern const char* srv_io_thread_function[]; +/* the number of the log write requests done */ +extern ulint srv_log_write_requests; + +/* the number of physical writes to the log performed */ +extern ulint srv_log_writes; + +/* amount of data written to the log files in bytes */ +extern ulint srv_os_log_written; + +/* amount of writes being done to the log files */ +extern ulint srv_os_log_pending_writes; + +/* we increase this counter, when there we don't have enough space in the +log buffer and have to flush it */ +extern ulint srv_log_waits; + +/* variable that counts amount of data read in total (in bytes) */ +extern ulint srv_data_read; + +/* here we count the amount of data written in total (in bytes) */ +extern ulint srv_data_written; + +/* this variable counts the amount of times, when the doublewrite buffer +was flushed */ +extern ulint srv_dblwr_writes; + +/* here we store the number of pages that have been flushed to the +doublewrite buffer */ +extern ulint srv_dblwr_pages_written; + +/* in this variable we store the number of write requests issued */ +extern ulint srv_buf_pool_write_requests; + +/* here we store the number of times when we had to wait for a free page +in the buffer pool. It happens when the buffer pool is full and we need +to make a flush, in order to be able to read or create a page. */ +extern ulint srv_buf_pool_wait_free; + +/* variable to count the number of pages that were written from the +buffer pool to disk */ +extern ulint srv_buf_pool_flushed; + +/* variable to count the number of buffer pool reads that led to the +reading of a disk page */ +extern ulint srv_buf_pool_reads; + +/* variable to count the number of sequential read-aheads were done */ +extern ulint srv_read_ahead_seq; + +/* variable to count the number of random read-aheads were done */ +extern ulint srv_read_ahead_rnd; + +/* In this structure we store status variables to be passed to MySQL */ +typedef struct export_var_struct export_struc; + +extern export_struc export_vars; + typedef struct srv_sys_struct srv_sys_t; /* The server system */ @@ -233,6 +296,12 @@ srv_boot(void); /*==========*/ /* out: DB_SUCCESS or error code */ /************************************************************************* +Initializes the server. */ + +void +srv_init(void); +/*==========*/ +/************************************************************************* Frees the OS fast mutex created in srv_boot(). 
*/ void @@ -404,6 +473,12 @@ srv_printf_innodb_monitor( ulint* trx_end); /* out: file position of the end of the list of active transactions */ +/********************************************************************** +Function to pass InnoDB status variables to MySQL */ + +void +srv_export_innodb_status(void); +/*=====================*/ /* Types for the threads existing in the system. Threads of types 4 - 9 are called utility threads. Note that utility threads are mainly disk @@ -429,6 +504,53 @@ typedef struct srv_slot_struct srv_slot_t; /* Thread table is an array of slots */ typedef srv_slot_t srv_table_t; +/* In this structure we store status variables to be passed to MySQL */ +struct export_var_struct{ + ulint innodb_data_pending_reads; + ulint innodb_data_pending_writes; + ulint innodb_data_pending_fsyncs; + ulint innodb_data_fsyncs; + ulint innodb_data_read; + ulint innodb_data_writes; + ulint innodb_data_written; + ulint innodb_data_reads; + ulint innodb_buffer_pool_pages_total; + ulint innodb_buffer_pool_pages_data; + ulint innodb_buffer_pool_pages_dirty; + ulint innodb_buffer_pool_pages_misc; + ulint innodb_buffer_pool_pages_free; + ulint innodb_buffer_pool_pages_latched; + ulint innodb_buffer_pool_read_requests; + ulint innodb_buffer_pool_reads; + ulint innodb_buffer_pool_wait_free; + ulint innodb_buffer_pool_pages_flushed; + ulint innodb_buffer_pool_write_requests; + ulint innodb_buffer_pool_read_ahead_seq; + ulint innodb_buffer_pool_read_ahead_rnd; + ulint innodb_dblwr_pages_written; + ulint innodb_dblwr_writes; + ulint innodb_log_waits; + ulint innodb_log_write_requests; + ulint innodb_log_writes; + ulint innodb_os_log_written; + ulint innodb_os_log_fsyncs; + ulint innodb_os_log_pending_writes; + ulint innodb_os_log_pending_fsyncs; + ulint innodb_page_size; + ulint innodb_pages_created; + ulint innodb_pages_read; + ulint innodb_pages_written; + ulint innodb_row_lock_waits; + ulint innodb_row_lock_current_waits; + ib_longlong innodb_row_lock_time; + ulint innodb_row_lock_time_avg; + ulint innodb_row_lock_time_max; + ulint innodb_rows_read; + ulint innodb_rows_inserted; + ulint innodb_rows_updated; + ulint innodb_rows_deleted; +}; + /* The server system struct */ struct srv_sys_struct{ os_event_t operational; /* created threads must wait for the @@ -437,6 +559,10 @@ struct srv_sys_struct{ srv_table_t* threads; /* server thread table */ UT_LIST_BASE_NODE_T(que_thr_t) tasks; /* task queue */ + dict_index_t* dummy_ind1; /* dummy index for old-style + supremum and infimum records */ + dict_index_t* dummy_ind2; /* dummy index for new-style + supremum and infimum records */ }; extern ulint srv_n_threads_active[]; diff --git a/innobase/include/srv0start.h b/innobase/include/srv0start.h index 8df0f97c4ff..d24f119c0b0 100644 --- a/innobase/include/srv0start.h +++ b/innobase/include/srv0start.h @@ -53,6 +53,16 @@ srv_parse_log_group_home_dirs( error */ char* str, /* in: character string */ char*** log_group_home_dirs); /* out, own: log group home dirs */ +/************************************************************************* +Adds a slash or a backslash to the end of a string if it is missing +and the string is not empty. 
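/* Editor's sketch, not part of the patch: the kind of copying that
srv_export_innodb_status() is declared to perform -- snapshotting the
counters declared above into the export_vars struct that MySQL reads
as status variables. Only a few of the fields are shown here; this is
not the actual implementation. */
void
srv_sketch_export_innodb_status(void)
/*=================================*/
{
	export_vars.innodb_data_read		= srv_data_read;
	export_vars.innodb_data_written		= srv_data_written;
	export_vars.innodb_log_waits		= srv_log_waits;
	export_vars.innodb_log_write_requests	= srv_log_write_requests;
	export_vars.innodb_log_writes		= srv_log_writes;
	export_vars.innodb_dblwr_writes		= srv_dblwr_writes;
	export_vars.innodb_dblwr_pages_written	= srv_dblwr_pages_written;
	export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free;
	export_vars.innodb_buffer_pool_reads	= srv_buf_pool_reads;
	export_vars.innodb_page_size		= UNIV_PAGE_SIZE;
}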
*/ + +char* +srv_add_path_separator_if_needed( +/*=============================*/ + /* out: string which has the separator if the + string is not empty */ + char* str); /* in: null-terminated character string */ /******************************************************************** Starts Innobase and creates a new database if database files are not found and the user wants. Server parameters are diff --git a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h index 9a988a03e92..911c8ac3f4a 100644 --- a/innobase/include/sync0rw.h +++ b/innobase/include/sync0rw.h @@ -61,8 +61,8 @@ Creates, or rather, initializes an rw-lock object in a specified memory location (which must be appropriately aligned). The rw-lock is initialized to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free is necessary only if the memory block containing it is freed. */ - -#define rw_lock_create(L) rw_lock_create_func((L), __FILE__, __LINE__) +#define rw_lock_create(L) rw_lock_create_func((L), __FILE__, __LINE__, #L) + /*=====================*/ /********************************************************************** Creates, or rather, initializes an rw-lock object in a specified memory @@ -75,7 +75,8 @@ rw_lock_create_func( /*================*/ rw_lock_t* lock, /* in: pointer to memory */ const char* cfile_name, /* in: file name where created */ - ulint cline); /* in: file line where created */ + ulint cline, /* in: file line where created */ + const char* cmutex_name); /* in: mutex name */ /********************************************************************** Calling this function is obligatory only if the memory buffer containing the rw-lock is freed. Removes an rw-lock object from the global list. The diff --git a/innobase/include/sync0rw.ic b/innobase/include/sync0rw.ic index 3a92100ba01..b1ae636010a 100644 --- a/innobase/include/sync0rw.ic +++ b/innobase/include/sync0rw.ic @@ -138,7 +138,7 @@ rw_lock_s_lock_low( #endif /* UNIV_SYNC_DEBUG */ /* Check if the writer field is free */ - if (lock->writer == RW_LOCK_NOT_LOCKED) { + if (UNIV_LIKELY(lock->writer == RW_LOCK_NOT_LOCKED)) { /* Set the shared lock by incrementing the reader count */ lock->reader_count++; @@ -243,7 +243,7 @@ rw_lock_s_lock_func( mutex_enter(rw_lock_get_mutex(lock)); - if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { + if (UNIV_LIKELY(rw_lock_s_lock_low(lock, pass, file_name, line))) { mutex_exit(rw_lock_get_mutex(lock)); return; /* Success */ @@ -307,21 +307,18 @@ rw_lock_x_lock_func_nowait( const char* file_name,/* in: file name where lock requested */ ulint line) /* in: line where requested */ { - ibool success = FALSE; - + ibool success = FALSE; + os_thread_id_t curr_thread = os_thread_get_curr_id(); mutex_enter(rw_lock_get_mutex(lock)); - if ((rw_lock_get_reader_count(lock) == 0) - && ((rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) - || ((rw_lock_get_writer(lock) == RW_LOCK_EX) - && (lock->pass == 0) - && os_thread_eq(lock->writer_thread, - os_thread_get_curr_id())))) { - + if (UNIV_UNLIKELY(rw_lock_get_reader_count(lock) != 0)) { + } else if (UNIV_LIKELY(rw_lock_get_writer(lock) + == RW_LOCK_NOT_LOCKED)) { rw_lock_set_writer(lock, RW_LOCK_EX); - lock->writer_thread = os_thread_get_curr_id(); - lock->writer_count++; + lock->writer_thread = curr_thread; lock->pass = 0; + relock: + lock->writer_count++; #ifdef UNIV_SYNC_DEBUG rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); @@ -331,6 +328,10 @@ rw_lock_x_lock_func_nowait( lock->last_x_line = line; success = TRUE; + } else if 
(rw_lock_get_writer(lock) == RW_LOCK_EX + && lock->pass == 0 + && os_thread_eq(lock->writer_thread, curr_thread)) { + goto relock; } mutex_exit(rw_lock_get_mutex(lock)); @@ -361,7 +362,7 @@ rw_lock_s_unlock_func( /* Reset the shared lock by decrementing the reader count */ - ut_a(lock->reader_count > 0); + ut_ad(lock->reader_count > 0); lock->reader_count--; #ifdef UNIV_SYNC_DEBUG @@ -371,7 +372,8 @@ rw_lock_s_unlock_func( /* If there may be waiters and this was the last s-lock, signal the object */ - if (lock->waiters && (lock->reader_count == 0)) { + if (UNIV_UNLIKELY(lock->waiters) + && lock->reader_count == 0) { sg = TRUE; rw_lock_set_waiters(lock, 0); @@ -379,7 +381,7 @@ rw_lock_s_unlock_func( mutex_exit(mutex); - if (sg == TRUE) { + if (UNIV_UNLIKELY(sg)) { sync_array_signal_object(sync_primary_wait_array, lock); } @@ -450,7 +452,8 @@ rw_lock_x_unlock_func( #endif /* If there may be waiters, signal the lock */ - if (lock->waiters && (lock->writer_count == 0)) { + if (UNIV_UNLIKELY(lock->waiters) + && lock->writer_count == 0) { sg = TRUE; rw_lock_set_waiters(lock, 0); @@ -458,7 +461,7 @@ rw_lock_x_unlock_func( mutex_exit(&(lock->mutex)); - if (sg == TRUE) { + if (UNIV_UNLIKELY(sg)) { sync_array_signal_object(sync_primary_wait_array, lock); } diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h index 8e0ec715b12..5955ab9a06a 100644 --- a/innobase/include/sync0sync.h +++ b/innobase/include/sync0sync.h @@ -17,6 +17,10 @@ Created 9/5/1995 Heikki Tuuri #include "os0sync.h" #include "sync0arr.h" +#ifndef UNIV_HOTBACKUP +extern my_bool timed_mutexes; +#endif /* UNIV_HOTBACKUP */ + /********************************************************************** Initializes the synchronization data structures. */ @@ -35,8 +39,7 @@ location (which must be appropriately aligned). The mutex is initialized in the reset state. Explicit freeing of the mutex with mutex_free is necessary only if the memory block containing it is freed. */ - -#define mutex_create(M) mutex_create_func((M), __FILE__, __LINE__) +#define mutex_create(M) mutex_create_func((M), __FILE__, __LINE__, #M) /*===================*/ /********************************************************************** Creates, or rather, initializes a mutex object in a specified memory @@ -49,7 +52,8 @@ mutex_create_func( /*==============*/ mutex_t* mutex, /* in: pointer to memory */ const char* cfile_name, /* in: file name where created */ - ulint cline); /* in: file line where created */ + ulint cline, /* in: file line where created */ + const char* cmutex_name); /* in: mutex name */ /********************************************************************** Calling this function is obligatory only if the memory buffer containing the mutex is freed. Removes a mutex object from the mutex list. The mutex @@ -413,6 +417,8 @@ or row lock! 
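/* Editor's note, not part of the patch: the effect of the #M / #L
stringification added to the creation macros above. For a hypothetical
mutex_t foo_mutex, */

	mutex_create(&foo_mutex);

/* now expands to */

	mutex_create_func((&foo_mutex), __FILE__, __LINE__, "&foo_mutex");

/* so cmutex_name carries a human-readable name under which the new
statistics fields of mutex_struct can be reported; rw_lock_create()
gains the same extra argument. */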
*/ /*------------------------------------- Insert buffer tree */ #define SYNC_IBUF_BITMAP_MUTEX 351 #define SYNC_IBUF_BITMAP 350 +/*------------------------------------- MySQL query cache mutex */ +/*------------------------------------- MySQL binlog mutex */ /*-------------------------------*/ #define SYNC_KERNEL 300 #define SYNC_REC_LOCK 299 @@ -471,6 +477,17 @@ struct mutex_struct { const char* cfile_name;/* File name where mutex created */ ulint cline; /* Line where created */ ulint magic_n; +#ifndef UNIV_HOTBACKUP + ulong count_using; /* count of times mutex used */ + ulong count_spin_loop; /* count of spin loops */ + ulong count_spin_rounds; /* count of spin rounds */ + ulong count_os_wait; /* count of os_wait */ + ulong count_os_yield; /* count of os_wait */ + ulonglong lspent_time; /* mutex os_wait timer msec */ + ulonglong lmax_spent_time; /* mutex os_wait timer msec */ + const char* cmutex_name;/* mutex name */ + ulint mutex_type;/* 0 - usual mutex 1 - rw_lock mutex */ +#endif /* !UNIV_HOTBACKUP */ }; #define MUTEX_MAGIC_N (ulint)979585 @@ -504,6 +521,14 @@ extern ibool sync_order_checks_on; /* This variable is set to TRUE when sync_init is called */ extern ibool sync_initialized; +/* Global list of database mutexes (not OS mutexes) created. */ +typedef UT_LIST_BASE_NODE_T(mutex_t) ut_list_base_node_t; +extern ut_list_base_node_t mutex_list; + +/* Mutex protecting the mutex_list variable */ +extern mutex_t mutex_list_mutex; + + #ifndef UNIV_NONINL #include "sync0sync.ic" #endif diff --git a/innobase/include/sync0sync.ic b/innobase/include/sync0sync.ic index aaf5e1fd9e9..b3fde61db5e 100644 --- a/innobase/include/sync0sync.ic +++ b/innobase/include/sync0sync.ic @@ -249,8 +249,13 @@ mutex_enter_func( /* Note that we do not peek at the value of lock_word before trying the atomic test_and_set; we could peek, and possibly save time. */ + +#ifndef UNIV_HOTBACKUP + mutex->count_using++; +#endif /* UNIV_HOTBACKUP */ - if (!mutex_test_and_set(mutex)) { + if (!mutex_test_and_set(mutex)) + { #ifdef UNIV_SYNC_DEBUG mutex_set_debug_info(mutex, file_name, line); #endif @@ -258,4 +263,5 @@ mutex_enter_func( } mutex_spin_wait(mutex, file_name, line); + } diff --git a/innobase/include/trx0rec.h b/innobase/include/trx0rec.h index 9d7f41cd94e..4387ce1a61e 100644 --- a/innobase/include/trx0rec.h +++ b/innobase/include/trx0rec.h @@ -246,6 +246,7 @@ trx_undo_prev_version_build( index_rec page and purge_view */ rec_t* rec, /* in: version of a clustered index record */ dict_index_t* index, /* in: clustered index */ + ulint* offsets,/* in: rec_get_offsets(rec, index) */ mem_heap_t* heap, /* in: memory heap from which the memory needed is allocated */ rec_t** old_vers);/* out, own: previous version, or NULL if diff --git a/innobase/include/trx0roll.h b/innobase/include/trx0roll.h index 6004551f456..944142a299d 100644 --- a/innobase/include/trx0roll.h +++ b/innobase/include/trx0roll.h @@ -104,11 +104,20 @@ trx_rollback( /*********************************************************************** Rollback or clean up transactions which have no user session. If the transaction already was committed, then we clean up a possible insert -undo log. If the transaction was not yet committed, then we roll it back. */ +undo log. If the transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. 
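/* Editor's sketch, not part of the patch: what the new mutex_list /
mutex_list_mutex declarations above make possible -- walking every
created mutex and reporting the new statistics fields. The name of the
list node field inside mutex_t ("list") is an assumption; it is not
shown in this hunk. */
{
	mutex_t*	mutex;

	mutex_enter(&mutex_list_mutex);

	for (mutex = UT_LIST_GET_FIRST(mutex_list);
	     mutex != NULL;
	     mutex = UT_LIST_GET_NEXT(list, mutex)) {

		fprintf(stderr,
			"Mutex %s: used %lu times, %lu OS waits\n",
			mutex->cmutex_name,
			mutex->count_using,
			mutex->count_os_wait);
	}

	mutex_exit(&mutex_list_mutex);
}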
*/ -void -trx_rollback_or_clean_all_without_sess(void); -/*========================================*/ +#ifndef __WIN__ +void* +#else +ulint +#endif +trx_rollback_or_clean_all_without_sess( +/*===================================*/ + /* out: a dummy parameter */ + void* arg __attribute__((unused))); + /* in: a dummy parameter required by + os_thread_create */ /******************************************************************** Finishes a transaction rollback. */ @@ -216,6 +225,21 @@ trx_savepoint_for_mysql( position corresponding to this connection at the time of the savepoint */ + +/*********************************************************************** +Releases a named savepoint. Savepoints which +were set after this savepoint are deleted. */ + +ulint +trx_release_savepoint_for_mysql( +/*================================*/ + /* out: if no savepoint + of the name found then + DB_NO_SAVEPOINT, + otherwise DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + const char* savepoint_name); /* in: savepoint name */ + /*********************************************************************** Frees savepoint structs. */ diff --git a/innobase/include/trx0rseg.ic b/innobase/include/trx0rseg.ic index 35e927f5e79..c9ac50ebf16 100644 --- a/innobase/include/trx0rseg.ic +++ b/innobase/include/trx0rseg.ic @@ -65,7 +65,7 @@ trx_rsegf_get_nth_undo( ulint n, /* in: index of slot */ mtr_t* mtr) /* in: mtr */ { - if (n >= TRX_RSEG_N_SLOTS) { + if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) { fprintf(stderr, "InnoDB: Error: trying to get slot %lu of rseg\n", (unsigned long) n); ut_error; @@ -86,7 +86,7 @@ trx_rsegf_set_nth_undo( ulint page_no,/* in: page number of the undo log segment */ mtr_t* mtr) /* in: mtr */ { - if (n >= TRX_RSEG_N_SLOTS) { + if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) { fprintf(stderr, "InnoDB: Error: trying to set slot %lu of rseg\n", (unsigned long) n); ut_error; diff --git a/innobase/include/trx0sys.ic b/innobase/include/trx0sys.ic index 8f455e554ea..24610bef827 100644 --- a/innobase/include/trx0sys.ic +++ b/innobase/include/trx0sys.ic @@ -315,7 +315,7 @@ trx_is_active( } if (ut_dulint_cmp(trx_id, trx_sys->max_trx_id) >= 0) { - + /* There must be corruption: we return TRUE because this function is only called by lock_clust_rec_some_has_impl() and row_vers_impl_x_locked_off_kernel() and they have @@ -325,8 +325,9 @@ trx_is_active( } trx = trx_get_on_id(trx_id); - if (trx && (trx->conc_state == TRX_ACTIVE)) { - + if (trx && (trx->conc_state == TRX_ACTIVE || + trx->conc_state == TRX_PREPARED)) { + return(TRUE); } diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h index 905b25447a8..2fc4d5a289f 100644 --- a/innobase/include/trx0trx.h +++ b/innobase/include/trx0trx.h @@ -16,9 +16,39 @@ Created 3/26/1996 Heikki Tuuri #include "que0types.h" #include "mem0mem.h" #include "read0types.h" +#include "dict0types.h" +#include "trx0xa.h" extern ulint trx_n_mysql_transactions; +/***************************************************************** +Resets the new record lock info in a transaction struct. */ +UNIV_INLINE +void +trx_reset_new_rec_lock_info( +/*========================*/ + trx_t* trx); /* in: transaction struct */ +/***************************************************************** +Registers that we have set a new record lock on an index. We only have space +to store 2 indexes! If this is called to store more than 2 indexes after +trx_reset_new_rec_lock_info(), then this function does nothing. 
*/ +UNIV_INLINE +void +trx_register_new_rec_lock( +/*======================*/ + trx_t* trx, /* in: transaction struct */ + dict_index_t* index); /* in: trx sets a new record lock on this + index */ +/***************************************************************** +Checks if trx has set a new record lock on an index. */ +UNIV_INLINE +ibool +trx_new_rec_locks_contain( +/*======================*/ + /* out: TRUE if trx has set a new record lock + on index */ + trx_t* trx, /* in: transaction struct */ + dict_index_t* index); /* in: index */ /************************************************************************ Releases the search latch if trx has reserved it. */ @@ -157,6 +187,32 @@ trx_commit_for_mysql( /* out: 0 or error number */ trx_t* trx); /* in: trx handle */ /************************************************************************** +Does the transaction prepare for MySQL. */ + +ulint +trx_prepare_for_mysql( +/*=================*/ + /* out: 0 or error number */ + trx_t* trx); /* in: trx handle */ +/************************************************************************** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. */ + +int +trx_recover_for_mysql( +/*==================*/ + /* out: number of prepared transactions */ + XID* xid_list, /* in/out: prepared transactions */ + ulint len); /* in: number of slots in xid_list */ +/*********************************************************************** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state */ +trx_t * +trx_get_trx_by_xid( +/*===============*/ + /* out: trx or NULL */ + XID* xid); /* in: X/Open XA transaction identification */ +/************************************************************************** If required, flushes the log to disk if we called trx_commit_for_mysql() with trx->flush_log_later == TRUE. */ @@ -285,6 +341,19 @@ trx_print( FILE* f, /* in: output stream */ trx_t* trx); /* in: transaction */ +#ifndef UNIV_HOTBACKUP +/************************************************************************** +Determines if the currently running transaction has been interrupted. */ + +ibool +trx_is_interrupted( +/*===============*/ + /* out: TRUE if interrupted */ + trx_t* trx); /* in: transaction */ +#else /* !UNIV_HOTBACKUP */ +#define trx_is_interrupted(trx) FALSE +#endif /* !UNIV_HOTBACKUP */ + /* Signal to a transaction */ struct trx_sig_struct{ @@ -339,6 +408,14 @@ struct trx_struct{ if we can use the insert buffer for them, we set this FALSE */ dulint id; /* transaction id */ + XID xid; /* X/Open XA transaction + identification to identify a + transaction branch */ + ibool support_xa; /* normally we do the XA two-phase + commit steps, but by setting this to + FALSE, one can save CPU time and about + 150 bytes in the undo log size as then + we skip XA steps */ dulint no; /* transaction serialization number == max trx id when the transaction is moved to COMMITTED_IN_MEMORY state */ @@ -355,12 +432,17 @@ struct trx_struct{ dulint commit_lsn; /* lsn at the time of the commit */ ibool dict_operation; /* TRUE if the trx is used to create a table, create an index, or drop a - table */ + table. This is a hint that the table + may need to be dropped in crash + recovery. */ dulint table_id; /* table id if the preceding field is TRUE */ /*------------------------------*/ - void* mysql_thd; /* MySQL thread handle corresponding - to this trx, or NULL */ + int active_trans; /* 1 - if a transaction in MySQL + is active. 
2 - if prepare_commit_mutex + was taken */ + void* mysql_thd; /* MySQL thread handle corresponding + to this trx, or NULL */ char** mysql_query_str;/* pointer to the field in mysqld_thd which contains the pointer to the current SQL query string */ @@ -442,9 +524,18 @@ struct trx_struct{ lock_t* auto_inc_lock; /* possible auto-inc lock reserved by the transaction; note that it is also in the lock list trx_locks */ - ulint n_lock_table_exp;/* number of explicit table locks - (LOCK TABLES) reserved by the - transaction, stored in trx_locks */ + dict_index_t* new_rec_locks[2];/* these are normally NULL; if + srv_locks_unsafe_for_binlog is TRUE, + in a cursor search, if we set a new + record lock on an index, this is set + to point to the index; this is + used in releasing the locks under the + cursors if we are performing an UPDATE + and we determine after retrieving + the row that it does not need to be + locked; thus, these can be used to + implement a 'mini-rollback' that + releases the latest record locks */ UT_LIST_NODE_T(trx_t) trx_list; /* list of transactions */ UT_LIST_NODE_T(trx_t) @@ -511,8 +602,19 @@ struct trx_struct{ UT_LIST_BASE_NODE_T(lock_t) trx_locks; /* locks reserved by the transaction */ /*------------------------------*/ - mem_heap_t* read_view_heap; /* memory heap for the read view */ - read_view_t* read_view; /* consistent read view or NULL */ + mem_heap_t* global_read_view_heap; + /* memory heap for the global read + view */ + read_view_t* global_read_view; + /* consistent read view used in the + transaction is stored here if + transaction is using a consistent + read view associated to a cursor */ + read_view_t* read_view; /* consistent read view used in the + transaction or NULL, this read view + can be normal read view associated + to a transaction or read view + associated to a cursor */ /*------------------------------*/ UT_LIST_BASE_NODE_T(trx_named_savept_t) trx_savepoints; /* savepoints set with SAVEPOINT ..., @@ -560,6 +662,7 @@ struct trx_struct{ #define TRX_NOT_STARTED 1 #define TRX_ACTIVE 2 #define TRX_COMMITTED_IN_MEMORY 3 +#define TRX_PREPARED 4 /* Support for 2PC/XA */ /* Transaction execution states when trx state is TRX_ACTIVE */ #define TRX_QUE_RUNNING 1 /* transaction is running */ diff --git a/innobase/include/trx0trx.ic b/innobase/include/trx0trx.ic index 78e5acda148..54cf2ff331f 100644 --- a/innobase/include/trx0trx.ic +++ b/innobase/include/trx0trx.ic @@ -39,4 +39,60 @@ trx_start_if_not_started_low( } } +/***************************************************************** +Resets the new record lock info in a transaction struct. */ +UNIV_INLINE +void +trx_reset_new_rec_lock_info( +/*========================*/ + trx_t* trx) /* in: transaction struct */ +{ + trx->new_rec_locks[0] = NULL; + trx->new_rec_locks[1] = NULL; +} + +/***************************************************************** +Registers that we have set a new record lock on an index. We only have space +to store 2 indexes! If this is called to store more than 2 indexes after +trx_reset_new_rec_lock_info(), then this function does nothing. 
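/* Editor's sketch, not part of the patch: how the XA recovery hooks
declared in trx0trx.h above fit together -- list the transactions left
in the prepared state, look each one up by XID, and resolve it.
trx_rollback_for_mysql() is part of the existing API but is not shown
in this hunk, and the unconditional rollback is purely illustrative:
in the server the decision comes from the binary log. */
{
	XID	xid_list[10];
	int	n_prepared;
	int	i;

	n_prepared = trx_recover_for_mysql(xid_list, 10);

	for (i = 0; i < n_prepared; i++) {
		trx_t*	trx = trx_get_trx_by_xid(&xid_list[i]);

		if (trx != NULL) {
			trx_rollback_for_mysql(trx);
		}
	}
}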
*/ +UNIV_INLINE +void +trx_register_new_rec_lock( +/*======================*/ + trx_t* trx, /* in: transaction struct */ + dict_index_t* index) /* in: trx sets a new record lock on this + index */ +{ + if (trx->new_rec_locks[0] == NULL) { + trx->new_rec_locks[0] = index; + + return; + } + + if (trx->new_rec_locks[0] == index) { + return; + } + + if (trx->new_rec_locks[1] != NULL) { + + return; + } + + trx->new_rec_locks[1] = index; +} + +/***************************************************************** +Checks if trx has set a new record lock on an index. */ +UNIV_INLINE +ibool +trx_new_rec_locks_contain( +/*======================*/ + /* out: TRUE if trx has set a new record lock + on index */ + trx_t* trx, /* in: transaction struct */ + dict_index_t* index) /* in: index */ +{ + return(trx->new_rec_locks[0] == index + || trx->new_rec_locks[1] == index); +} diff --git a/innobase/include/trx0undo.h b/innobase/include/trx0undo.h index 20002076cc3..bd7337e4f90 100644 --- a/innobase/include/trx0undo.h +++ b/innobase/include/trx0undo.h @@ -14,6 +14,7 @@ Created 3/26/1996 Heikki Tuuri #include "mtr0mtr.h" #include "trx0sys.h" #include "page0types.h" +#include "trx0xa.h" /*************************************************************************** Builds a roll pointer dulint. */ @@ -36,7 +37,7 @@ trx_undo_decode_roll_ptr( ibool* is_insert, /* out: TRUE if insert undo log */ ulint* rseg_id, /* out: rollback segment id */ ulint* page_no, /* out: page number */ - ulint* offset); /* out: offset of the undo entry within page */ + ulint* offset); /* out: offset of the undo entry within page */ /*************************************************************************** Returns TRUE if the roll pointer is of the insert type. */ UNIV_INLINE @@ -239,6 +240,18 @@ trx_undo_set_state_at_finish( trx_t* trx, /* in: transaction */ trx_undo_t* undo, /* in: undo log memory copy */ mtr_t* mtr); /* in: mtr */ +/********************************************************************** +Sets the state of the undo log segment at a transaction prepare. 
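/* Editor's sketch, not part of the patch: the 'mini-rollback' calling
pattern that the comments on row_unlock_for_mysql() and on the
trx_register_new_rec_lock() family above describe, as seen from the
MySQL layer during an UPDATE or DELETE when srv_locks_unsafe_for_binlog
is TRUE. row_search_for_mysql() is the existing fetch function (its
lock-setting path is what calls trx_register_new_rec_lock());
row_matches_where() is a hypothetical stand-in for work done by the
SQL layer, and buf, mode, match_mode, direction and prebuilt come from
the caller's context. */
{
	ulint	err;

	if (srv_locks_unsafe_for_binlog) {
		trx_reset_new_rec_lock_info(prebuilt->trx);
	}

	err = row_search_for_mysql(buf, mode, prebuilt,
				match_mode, direction);

	if (err == DB_SUCCESS && !row_matches_where(buf)) {
		/* the row does not need to stay locked: release the
		record locks that were just registered */

		row_unlock_for_mysql(prebuilt, FALSE);
	}
}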
*/ + +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + /* out: undo log segment header page, + x-latched */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + mtr_t* mtr); /* in: mtr */ + /************************************************************************** Adds the update undo log header as the first in the history list, and frees the memory object, or puts it to the list of cached update undo log @@ -295,7 +308,6 @@ trx_undo_parse_discard_latest( page_t* page, /* in: page or NULL */ mtr_t* mtr); /* in: mtr or NULL */ - /* Types of an undo log segment */ #define TRX_UNDO_INSERT 1 /* contains undo entries for inserts */ #define TRX_UNDO_UPDATE 2 /* contains undo entries for updates @@ -310,6 +322,8 @@ trx_undo_parse_discard_latest( #define TRX_UNDO_TO_PURGE 4 /* update undo segment will not be reused: it can be freed in purge when all undo data in it is removed */ +#define TRX_UNDO_PREPARED 5 /* contains an undo log of an + prepared transaction */ /* Transaction undo log memory object; this is protected by the undo_mutex in the corresponding transaction object */ @@ -332,6 +346,8 @@ struct trx_undo_struct{ field */ dulint trx_id; /* id of the trx assigned to the undo log */ + XID xid; /* X/Open XA transaction + identification */ ibool dict_operation; /* TRUE if a dict operation trx */ dulint table_id; /* if a dict operation, then the table id */ @@ -386,7 +402,8 @@ struct trx_undo_struct{ #define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE) /* An update undo segment with just one page can be reused if it has -< this number bytes used */ +< this number bytes used; we must leave space at least for one new undo +log header on the page */ #define TRX_UNDO_PAGE_REUSE_LIMIT (3 * UNIV_PAGE_SIZE / 4) @@ -436,7 +453,10 @@ page of an update undo log segment. */ log start, and therefore this is not necessarily the same as this log header end offset */ -#define TRX_UNDO_DICT_OPERATION 20 /* TRUE if the transaction is a table +#define TRX_UNDO_XID_EXISTS 20 /* TRUE if undo log header includes + X/Open XA transaction identification + XID */ +#define TRX_UNDO_DICT_TRANS 21 /* TRUE if the transaction is a table create, index create, or drop transaction: in recovery the transaction cannot be rolled back @@ -452,7 +472,25 @@ page of an update undo log segment. */ #define TRX_UNDO_HISTORY_NODE 34 /* If the log is put to the history list, the file list node is here */ /*-------------------------------------------------------------*/ -#define TRX_UNDO_LOG_HDR_SIZE (34 + FLST_NODE_SIZE) +#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE) + +/* Note: the writing of the undo log old header is coded by a log record +MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE. The appending of an XID to the +header is logged separately. In this sense, the XID is not really a member +of the undo log header. TODO: do not append the XID to the log header if XA +is not needed by the user. The XID wastes about 150 bytes of space in every +undo log. In the history list we may have millions of undo logs, which means +quite a large overhead. 
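/* Editor's note, not part of the patch: rough size arithmetic for the
XA additions defined just below, assuming FLST_NODE_SIZE == 12 (two
6-byte file addresses) as elsewhere in InnoDB:

	TRX_UNDO_LOG_OLD_HDR_SIZE = 34 + 12                 =  46
	TRX_UNDO_XA_XID           = 46 + 4 + 4 + 4          =  58
	TRX_UNDO_LOG_XA_HDR_SIZE  = 58 + XIDDATASIZE (128)  = 186

so storing the XID costs about 140 bytes per undo log header, which is
the "about 150 bytes" overhead mentioned in the trx_struct::support_xa
comment and in the TODO note above. */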
*/ + +/* X/Open XA Transaction Identification (XID) */ + +#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE) +#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4) +#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4) +#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4) +/*--------------------------------------------------------------*/ +#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE) + /* Total size of the header with the XA XID */ #ifndef UNIV_NONINL #include "trx0undo.ic" diff --git a/innobase/include/trx0xa.h b/innobase/include/trx0xa.h new file mode 100644 index 00000000000..34b7a2f95a8 --- /dev/null +++ b/innobase/include/trx0xa.h @@ -0,0 +1,182 @@ +/* + * Start of xa.h header + * + * Define a symbol to prevent multiple inclusions of this header file + */ +#ifndef XA_H +#define XA_H + +/* + * Transaction branch identification: XID and NULLXID: + */ +#ifndef XIDDATASIZE + +#define XIDDATASIZE 128 /* size in bytes */ +#define MAXGTRIDSIZE 64 /* maximum size in bytes of gtrid */ +#define MAXBQUALSIZE 64 /* maximum size in bytes of bqual */ + +struct xid_t { + long formatID; /* format identifier */ + long gtrid_length; /* value from 1 through 64 */ + long bqual_length; /* value from 1 through 64 */ + char data[XIDDATASIZE]; +}; +typedef struct xid_t XID; +#endif +/* + * A value of -1 in formatID means that the XID is null. + */ + + +#ifdef NOTDEFINED +/* Let us comment this out to remove compiler errors!!!!!!!!!!!! */ + +/* + * Declarations of routines by which RMs call TMs: + */ +extern int ax_reg __P((int, XID *, long)); +extern int ax_unreg __P((int, long)); + +/* + * XA Switch Data Structure + */ +#define RMNAMESZ 32 /* length of resource manager name, */ + /* including the null terminator */ +#define MAXINFOSIZE 256 /* maximum size in bytes of xa_info */ + /* strings, including the null + terminator */ + + +struct xa_switch_t { + char name[RMNAMESZ]; /* name of resource manager */ + long flags; /* resource manager specific options */ + long version; /* must be 0 */ + int (*xa_open_entry) /* xa_open function pointer */ + __P((char *, int, long)); + int (*xa_close_entry) /* xa_close function pointer */ + __P((char *, int, long)); + int (*xa_start_entry) /* xa_start function pointer */ + __P((XID *, int, long)); + int (*xa_end_entry) /* xa_end function pointer */ + __P((XID *, int, long)); + int (*xa_rollback_entry) /* xa_rollback function pointer */ + __P((XID *, int, long)); + int (*xa_prepare_entry) /* xa_prepare function pointer */ + __P((XID *, int, long)); + int (*xa_commit_entry) /* xa_commit function pointer */ + __P((XID *, int, long)); + int (*xa_recover_entry) /* xa_recover function pointer */ + __P((XID *, long, int, long)); + int (*xa_forget_entry) /* xa_forget function pointer */ + __P((XID *, int, long)); + int (*xa_complete_entry) /* xa_complete function pointer */ + __P((int *, int *, int, long)); +}; +#endif /* NOTDEFINED */ + + +/* + * Flag definitions for the RM switch + */ +#define TMNOFLAGS 0x00000000L /* no resource manager features + selected */ +#define TMREGISTER 0x00000001L /* resource manager dynamically + registers */ +#define TMNOMIGRATE 0x00000002L /* resource manager does not support + association migration */ +#define TMUSEASYNC 0x00000004L /* resource manager supports + asynchronous operations */ +/* + * Flag definitions for xa_ and ax_ routines + */ +/* use TMNOFLAGGS, defined above, when not specifying other flags */ +#define TMASYNC 0x80000000L /* perform routine asynchronously */ +#define TMONEPHASE 0x40000000L /* caller is 
using one-phase commit + optimisation */ +#define TMFAIL 0x20000000L /* dissociates caller and marks + transaction branch rollback-only */ +#define TMNOWAIT 0x10000000L /* return if blocking condition + exists */ +#define TMRESUME 0x08000000L /* caller is resuming association with + suspended transaction branch */ +#define TMSUCCESS 0x04000000L /* dissociate caller from transaction + branch */ +#define TMSUSPEND 0x02000000L /* caller is suspending, not ending, + association */ +#define TMSTARTRSCAN 0x01000000L /* start a recovery scan */ +#define TMENDRSCAN 0x00800000L /* end a recovery scan */ +#define TMMULTIPLE 0x00400000L /* wait for any asynchronous + operation */ +#define TMJOIN 0x00200000L /* caller is joining existing + transaction branch */ +#define TMMIGRATE 0x00100000L /* caller intends to perform + migration */ + +/* + * ax_() return codes (transaction manager reports to resource manager) + */ +#define TM_JOIN 2 /* caller is joining existing + transaction branch */ +#define TM_RESUME 1 /* caller is resuming association with + suspended transaction branch */ +#define TM_OK 0 /* normal execution */ +#define TMER_TMERR -1 /* an error occurred in the transaction + manager */ +#define TMER_INVAL -2 /* invalid arguments were given */ +#define TMER_PROTO -3 /* routine invoked in an improper + context */ + +/* + * xa_() return codes (resource manager reports to transaction manager) + */ +#define XA_RBBASE 100 /* The inclusive lower bound of the + rollback codes */ +#define XA_RBROLLBACK XA_RBBASE /* The rollback was caused by an + unspecified reason */ +#define XA_RBCOMMFAIL XA_RBBASE+1 /* The rollback was caused by a + communication failure */ +#define XA_RBDEADLOCK XA_RBBASE+2 /* A deadlock was detected */ +#define XA_RBINTEGRITY XA_RBBASE+3 /* A condition that violates the + integrity of the resources was + detected */ +#define XA_RBOTHER XA_RBBASE+4 /* The resource manager rolled back the + transaction branch for a reason not + on this list */ +#define XA_RBPROTO XA_RBBASE+5 /* A protocol error occurred in the + resource manager */ +#define XA_RBTIMEOUT XA_RBBASE+6 /* A transaction branch took too long */ +#define XA_RBTRANSIENT XA_RBBASE+7 /* May retry the transaction branch */ +#define XA_RBEND XA_RBTRANSIENT /* The inclusive upper bound of the + rollback codes */ +#define XA_NOMIGRATE 9 /* resumption must occur where + suspension occurred */ +#define XA_HEURHAZ 8 /* the transaction branch may have + been heuristically completed */ +#define XA_HEURCOM 7 /* the transaction branch has been + heuristically committed */ +#define XA_HEURRB 6 /* the transaction branch has been + heuristically rolled back */ +#define XA_HEURMIX 5 /* the transaction branch has been + heuristically committed and rolled + back */ +#define XA_RETRY 4 /* routine returned with no effect and + may be re-issued */ +#define XA_RDONLY 3 /* the transaction branch was read-only + and has been committed */ +#define XA_OK 0 /* normal execution */ +#define XAER_ASYNC -2 /* asynchronous operation already + outstanding */ +#define XAER_RMERR -3 /* a resource manager error occurred in + the transaction branch */ +#define XAER_NOTA -4 /* the XID is not valid */ +#define XAER_INVAL -5 /* invalid arguments were given */ +#define XAER_PROTO -6 /* routine invoked in an improper + context */ +#define XAER_RMFAIL -7 /* resource manager unavailable */ +#define XAER_DUPID -8 /* the XID already exists */ +#define XAER_OUTSIDE -9 /* resource manager doing work outside + transaction */ +#endif /* ifndef XA_H */ +/* + * End of xa.h 
header + */ diff --git a/innobase/include/univ.i b/innobase/include/univ.i index 625978ffc38..132ac9e18c5 100644 --- a/innobase/include/univ.i +++ b/innobase/include/univ.i @@ -88,6 +88,7 @@ memory is read outside the allocated blocks. */ #define UNIV_SEARCH_DEBUG #define UNIV_SYNC_PERF_STAT #define UNIV_SEARCH_PERF_STAT +#define UNIV_SRV_PRINT_LATCH_WAITS; */ #define UNIV_LIGHT_MEM_DEBUG @@ -180,7 +181,7 @@ management to ensure correct alignment for doubles etc. */ /* Another basic type we use is unsigned long integer which should be equal to the word size of the machine, that is on a 32-bit platform 32 bits, and on a 64-bit platform 64 bits. We also give the printf format for the type as a -macro PRULINT. */ +macro ULINTPF. */ #ifdef _WIN64 typedef unsigned __int64 ulint; @@ -242,6 +243,30 @@ contains the sum of the following flag and the locally stored len. */ #define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE) +/* Some macros to improve branch prediction and reduce cache misses */ +#if defined(__GNUC__) && (__GNUC__ > 2) +/* Tell the compiler that 'expr' probably evaluates to 'constant'. */ +# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant) +/* Tell the compiler that a pointer is likely to be NULL */ +# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ulint) ptr, 0) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read. */ +# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read or written. */ +# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3) +#else +/* Dummy versions of the macros */ +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +#endif +/* Tell the compiler that cond is likely to hold */ +#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE) +/* Tell the compiler that cond is unlikely to hold */ +#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE) + #include <stdio.h> #include "ut0dbg.h" #include "ut0ut.h" diff --git a/innobase/include/ut0byte.h b/innobase/include/ut0byte.h index a62c2e2e318..22d488abeaf 100644 --- a/innobase/include/ut0byte.h +++ b/innobase/include/ut0byte.h @@ -208,7 +208,20 @@ ut_align_down( /*==========*/ /* out: aligned pointer */ void* ptr, /* in: pointer */ - ulint align_no); /* in: align by this number */ + ulint align_no) /* in: align by this number */ + __attribute__((const)); +/************************************************************* +The following function computes the offset of a pointer from the nearest +aligned address. */ +UNIV_INLINE +ulint +ut_align_offset( +/*==========*/ + /* out: distance from aligned + pointer */ + const void* ptr, /* in: pointer */ + ulint align_no) /* in: align by this number */ + __attribute__((const)); /********************************************************************* Gets the nth bit of a ulint. */ UNIV_INLINE diff --git a/innobase/include/ut0byte.ic b/innobase/include/ut0byte.ic index 5a70dcf12a8..e141de3aa3f 100644 --- a/innobase/include/ut0byte.ic +++ b/innobase/include/ut0byte.ic @@ -335,6 +335,27 @@ ut_align_down( return((void*)((((ulint)ptr)) & ~(align_no - 1))); } +/************************************************************* +The following function computes the offset of a pointer from the nearest +aligned address. 
*/ +UNIV_INLINE +ulint +ut_align_offset( +/*============*/ + /* out: distance from + aligned pointer */ + const void* ptr, /* in: pointer */ + ulint align_no) /* in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return(((ulint)ptr) & (align_no - 1)); +} + /********************************************************************* Gets the nth bit of a ulint. */ UNIV_INLINE diff --git a/innobase/include/ut0dbg.h b/innobase/include/ut0dbg.h index 5f30a894874..bc3f852626a 100644 --- a/innobase/include/ut0dbg.h +++ b/innobase/include/ut0dbg.h @@ -13,74 +13,75 @@ Created 1/30/1994 Heikki Tuuri #include <stdlib.h> #include "os0thread.h" +#if defined(__GNUC__) && (__GNUC__ > 2) +# define UT_DBG_FAIL(EXPR) UNIV_UNLIKELY(!((ulint)(EXPR))) +#else extern ulint ut_dbg_zero; /* This is used to eliminate compiler warnings */ +# define UT_DBG_FAIL(EXPR) !((ulint)(EXPR) + ut_dbg_zero) +#endif + +/***************************************************************** +Report a failed assertion. */ + +void +ut_dbg_assertion_failed( +/*====================*/ + const char* expr, /* in: the failed assertion */ + const char* file, /* in: source file containing the assertion */ + ulint line); /* in: line number of the assertion */ + +#ifdef __NETWARE__ +/* Flag for ignoring further assertion failures. +On NetWare, have a graceful exit rather than a segfault to avoid abends. */ +extern ibool panic_shutdown; +/* Abort the execution. */ +void ut_dbg_panic(void); +# define UT_DBG_PANIC ut_dbg_panic() +/* Stop threads in ut_a(). */ +# define UT_DBG_STOP while (0) /* We do not do this on NetWare */ +#else /* __NETWARE__ */ +/* Flag for indicating that all threads should stop. This will be set +by ut_dbg_assertion_failed(). 
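
Referring back to ut_align_offset() in ut0byte.ic above: together with the existing ut_align_down() it splits a pointer into an aligned base and an in-block offset. A sketch, not part of the patch; 'some_pointer' is hypothetical and align_no must be a power of two.

/* Sketch: ut_align_down(p, n) + ut_align_offset(p, n) == p
for any pointer p and power-of-two n. */
{
	byte*	p	= some_pointer;		/* hypothetical pointer */
	byte*	frame	= ut_align_down(p, UNIV_PAGE_SIZE);
	ulint	offs	= ut_align_offset(p, UNIV_PAGE_SIZE);

	ut_ad(frame + offs == p);	/* the two halves reconstruct p */
}
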
*/ extern ibool ut_dbg_stop_threads; +/* A null pointer that will be dereferenced to trigger a memory trap */ extern ulint* ut_dbg_null_ptr; -extern const char* ut_dbg_msg_assert_fail; -extern const char* ut_dbg_msg_trap; -extern const char* ut_dbg_msg_stop; -/* Have a graceful exit on NetWare rather than a segfault to avoid abends */ -#ifdef __NETWARE__ -extern ibool panic_shutdown; -#define ut_a(EXPR) do {\ - if (!((ulint)(EXPR) + ut_dbg_zero)) {\ - ut_print_timestamp(stderr);\ - fprintf(stderr, ut_dbg_msg_assert_fail,\ - os_thread_pf(os_thread_get_curr_id()), __FILE__,\ - (ulint)__LINE__);\ - fputs("InnoDB: Failing assertion: " #EXPR "\n", stderr);\ - fputs(ut_dbg_msg_trap, stderr);\ - ut_dbg_stop_threads = TRUE;\ - if (ut_dbg_stop_threads) {\ - fprintf(stderr, ut_dbg_msg_stop,\ - os_thread_pf(os_thread_get_curr_id()), __FILE__, (ulint)__LINE__);\ - }\ - if(!panic_shutdown){\ - panic_shutdown = TRUE;\ - innobase_shutdown_for_mysql();}\ - exit(1);\ - }\ -} while (0) -#define ut_error do {\ - ut_print_timestamp(stderr);\ - fprintf(stderr, ut_dbg_msg_assert_fail,\ - os_thread_pf(os_thread_get_curr_id()), __FILE__, (ulint)__LINE__);\ - fprintf(stderr, ut_dbg_msg_trap);\ - ut_dbg_stop_threads = TRUE;\ - if(!panic_shutdown){panic_shutdown = TRUE;\ - innobase_shutdown_for_mysql();}\ -} while (0) -#else -#define ut_a(EXPR) do {\ - if (!((ulint)(EXPR) + ut_dbg_zero)) {\ - ut_print_timestamp(stderr);\ - fprintf(stderr, ut_dbg_msg_assert_fail,\ - os_thread_pf(os_thread_get_curr_id()), __FILE__,\ - (ulint)__LINE__);\ - fputs("InnoDB: Failing assertion: " #EXPR "\n", stderr);\ - fputs(ut_dbg_msg_trap, stderr);\ - ut_dbg_stop_threads = TRUE;\ - if (*(ut_dbg_null_ptr)) ut_dbg_null_ptr = NULL;\ - }\ - if (ut_dbg_stop_threads) {\ - fprintf(stderr, ut_dbg_msg_stop,\ - os_thread_pf(os_thread_get_curr_id()), __FILE__, (ulint)__LINE__);\ - os_thread_sleep(1000000000);\ - }\ +/***************************************************************** +Stop a thread after assertion failure. */ + +void +ut_dbg_stop_thread( +/*===============*/ + const char* file, + ulint line); + +/* Abort the execution. */ +# define UT_DBG_PANIC \ + if (*(ut_dbg_null_ptr)) ut_dbg_null_ptr = NULL +/* Stop threads in ut_a(). */ +# define UT_DBG_STOP do \ + if (UNIV_UNLIKELY(ut_dbg_stop_threads)) { \ + ut_dbg_stop_thread(__FILE__, (ulint) __LINE__); \ + } while (0) +#endif /* __NETWARE__ */ + +/* Abort execution if EXPR does not evaluate to nonzero. */ +#define ut_a(EXPR) do { \ + if (UT_DBG_FAIL(EXPR)) { \ + ut_dbg_assertion_failed(#EXPR, \ + __FILE__, (ulint) __LINE__); \ + UT_DBG_PANIC; \ + } \ + UT_DBG_STOP; \ } while (0) -#define ut_error do {\ - ut_print_timestamp(stderr);\ - fprintf(stderr, ut_dbg_msg_assert_fail,\ - os_thread_pf(os_thread_get_curr_id()), __FILE__, (ulint)__LINE__);\ - fprintf(stderr, ut_dbg_msg_trap);\ - ut_dbg_stop_threads = TRUE;\ - if (*(ut_dbg_null_ptr)) ut_dbg_null_ptr = NULL;\ +/* Abort execution. 
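
A rough reconstruction, not part of the patch: the hunk above replaces the large inline ut_a() bodies with a call to ut_dbg_assertion_failed(). Based on the removed macro text, that helper (defined in ut0dbg.c, not shown here) plausibly looks like the sketch below; the real definition may differ.

/* Sketch only: behaviour inferred from the removed ut_a() macro body
(non-NetWare build). */
void
ut_dbg_assertion_failed(
/*====================*/
	const char*	expr,	/* in: the failed assertion, or NULL */
	const char*	file,	/* in: source file of the assertion */
	ulint		line)	/* in: line number of the assertion */
{
	ut_print_timestamp(stderr);
	fprintf(stderr,
		"  InnoDB: Assertion failure in thread %lu"
		" in file %s line %lu\n",
		(ulong) os_thread_pf(os_thread_get_curr_id()),
		file, (ulong) line);
	if (expr) {
		fprintf(stderr,
			"InnoDB: Failing assertion: %s\n", expr);
	}

	ut_dbg_stop_threads = TRUE;	/* make other threads stop in ut_a() */
}
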
*/ +#define ut_error do { \ + ut_dbg_assertion_failed(0, __FILE__, (ulint) __LINE__); \ + UT_DBG_PANIC; \ } while (0) -#endif #ifdef UNIV_DEBUG #define ut_ad(EXPR) ut_a(EXPR) diff --git a/innobase/include/ut0rnd.ic b/innobase/include/ut0rnd.ic index 06d7012f60b..d2ab087d491 100644 --- a/innobase/include/ut0rnd.ic +++ b/innobase/include/ut0rnd.ic @@ -207,12 +207,12 @@ ut_fold_binary( const byte* str, /* in: string of bytes */ ulint len) /* in: length */ { - ulint i; - ulint fold = 0; + const byte* str_end = str + len; + ulint fold = 0; ut_ad(str); - for (i = 0; i < len; i++) { + while (str < str_end) { fold = ut_fold_ulint_pair(fold, (ulint)(*str)); str++; diff --git a/innobase/include/ut0ut.h b/innobase/include/ut0ut.h index dee8785c9e7..8938957cd12 100644 --- a/innobase/include/ut0ut.h +++ b/innobase/include/ut0ut.h @@ -139,6 +139,14 @@ ib_time_t ut_time(void); /*=========*/ /************************************************************** +Returns system time. */ + +void +ut_usectime( +/*========*/ + ulint* sec, /* out: seconds since the Epoch */ + ulint* ms); /* out: microseconds since the Epoch+*sec */ +/************************************************************** Returns the difference of two times in seconds. */ double diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c index 1c08d3defaa..280c4871ee9 100644 --- a/innobase/lock/lock0lock.c +++ b/innobase/lock/lock0lock.c @@ -292,7 +292,25 @@ waiting, in its lock queue. Solution: We can copy the locks as gap type locks, so that also the waiting locks are transformed to granted gap type locks on the inserted record. */ +/* LOCK COMPATIBILITY MATRIX + * IS IX S X AI + * IS + + + - + + * IX + + - - + + * S + - + - - + * X - - - - - + * AI + + - - - + * + * Note that for rows, InnoDB only acquires S or X locks. + * For tables, InnoDB normally acquires IS or IX locks. + * S or X table locks are only acquired for LOCK TABLES. + * Auto-increment (AI) locks are needed because of + * statement-level MySQL binlog. + * See also lock_mode_compatible(). + */ + +#ifdef UNIV_DEBUG ibool lock_print_waits = FALSE; +#endif /* UNIV_DEBUG */ /* The lock system */ lock_sys_t* lock_sys = NULL; @@ -348,17 +366,26 @@ static ibool lock_deadlock_occurs( /*=================*/ - /* out: TRUE if a deadlock was detected */ + /* out: TRUE if a deadlock was detected and we + chose trx as a victim; FALSE if no deadlock, or + there was a deadlock, but we chose other + transaction(s) as victim(s) */ lock_t* lock, /* in: lock the transaction is requesting */ trx_t* trx); /* in: transaction */ /************************************************************************ Looks recursively for a deadlock. */ static -ibool +ulint lock_deadlock_recursive( /*====================*/ - /* out: TRUE if a deadlock was detected - or the calculation took too long */ + /* out: 0 if no deadlock found, + LOCK_VICTIM_IS_START if there was a deadlock + and we chose 'start' as the victim, + LOCK_VICTIM_IS_OTHER if a deadlock + was found and we chose some other trx as a + victim: we must do the search again in this + last case because there may be another + deadlock! 
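
The lock compatibility matrix documented in the new lock0lock.c comment above can be read as a lookup table. A self-contained sketch, not part of the patch; it uses its own local mode indices rather than the real LOCK_* constants.

/* Sketch: the IS/IX/S/X/AI matrix from the comment above as a table.
1 = compatible ('+'), 0 = conflict ('-'). */
enum { M_IS = 0, M_IX, M_S, M_X, M_AI, M_N_MODES };

static const int lock_compat[M_N_MODES][M_N_MODES] = {
	/*		 IS IX  S  X AI */
	/* IS */	{ 1, 1, 1, 0, 1 },
	/* IX */	{ 1, 1, 0, 0, 1 },
	/* S  */	{ 1, 0, 1, 0, 0 },
	/* X  */	{ 0, 0, 0, 0, 0 },
	/* AI */	{ 1, 1, 0, 0, 0 }
};

/* Example: lock_compat[M_IX][M_IX] == 1 (two IX table locks coexist),
while lock_compat[M_X][M_IS] == 0 (X conflicts with everything). */
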
*/ trx_t* start, /* in: recursion starting point */ trx_t* trx, /* in: a transaction waiting for a lock */ lock_t* wait_lock, /* in: the lock trx is waiting to be granted */ @@ -424,12 +451,15 @@ lock_check_trx_id_sanity( /* out: TRUE if ok */ dulint trx_id, /* in: trx id */ rec_t* rec, /* in: user record */ - dict_index_t* index, /* in: clustered index */ + dict_index_t* index, /* in: index */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ ibool has_kernel_mutex)/* in: TRUE if the caller owns the kernel mutex */ { ibool is_ok = TRUE; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (!has_kernel_mutex) { mutex_enter(&kernel_mutex); } @@ -442,7 +472,7 @@ lock_check_trx_id_sanity( fputs(" InnoDB: Error: transaction id associated" " with record\n", stderr); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); fputs("InnoDB: in ", stderr); dict_index_name_print(stderr, NULL, index); fprintf(stderr, "\n" @@ -474,25 +504,22 @@ lock_clust_rec_cons_read_sees( rec_t* rec, /* in: user record which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ read_view_t* view) /* in: consistent read view */ { dulint trx_id; ut_ad(index->type & DICT_CLUSTERED); ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); /* NOTE that we call this function while holding the search system latch. To obey the latching order we must NOT reserve the kernel mutex here! */ - trx_id = row_get_rec_trx_id(rec, index); + trx_id = row_get_rec_trx_id(rec, index, offsets); - if (read_view_sees_trx_id(view, trx_id)) { - - return(TRUE); - } - - return(FALSE); + return(read_view_sees_trx_id(view, trx_id)); } /************************************************************************* @@ -689,7 +716,7 @@ lock_is_table_exclusive( trx_t* trx) /* in: transaction */ { lock_t* lock; - bool ok = FALSE; + ibool ok = FALSE; ut_ad(table && trx); @@ -929,7 +956,7 @@ lock_rec_has_to_wait( cause waits */ if ((lock_is_on_supremum || (type_mode & LOCK_GAP)) - && !(type_mode & LOCK_INSERT_INTENTION)) { + && !(type_mode & LOCK_INSERT_INTENTION)) { /* Gap type locks without LOCK_INSERT_INTENTION flag do not need to wait for anything. 
This is because @@ -1263,19 +1290,19 @@ lock_rec_get_next( #endif /* UNIV_SYNC_DEBUG */ ut_ad(lock_get_type(lock) == LOCK_REC); - for (;;) { - lock = lock_rec_get_next_on_page(lock); - - if (lock == NULL) { - - return(NULL); - } - - if (lock_rec_get_nth_bit(lock, rec_get_heap_no(rec))) { - - return(lock); - } + if (page_rec_is_comp(rec)) { + do { + lock = lock_rec_get_next_on_page(lock); + } while (lock && !lock_rec_get_nth_bit(lock, + rec_get_heap_no(rec, TRUE))); + } else { + do { + lock = lock_rec_get_next_on_page(lock); + } while (lock && !lock_rec_get_nth_bit(lock, + rec_get_heap_no(rec, FALSE))); } + + return(lock); } /************************************************************************* @@ -1294,14 +1321,12 @@ lock_rec_get_first( #endif /* UNIV_SYNC_DEBUG */ lock = lock_rec_get_first_on_page(rec); + if (UNIV_LIKELY_NULL(lock)) { + ulint heap_no = rec_get_heap_no(rec, page_rec_is_comp(rec)); - while (lock) { - if (lock_rec_get_nth_bit(lock, rec_get_heap_no(rec))) { - - break; + while (lock && !lock_rec_get_nth_bit(lock, heap_no)) { + lock = lock_rec_get_next_on_page(lock); } - - lock = lock_rec_get_next_on_page(lock); } return(lock); @@ -1556,7 +1581,6 @@ lock_rec_other_has_conflicting( trx_t* trx) /* in: our transaction */ { lock_t* lock; - #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ @@ -1596,8 +1620,7 @@ lock_rec_find_similar_on_page( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(rec); - + heap_no = rec_get_heap_no(rec, page_rec_is_comp(rec)); lock = lock_rec_get_first_on_page(rec); while (lock != NULL) { @@ -1624,7 +1647,8 @@ lock_sec_rec_some_has_impl_off_kernel( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index) /* in: secondary index */ + dict_index_t* index, /* in: secondary index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { page_t* page; @@ -1633,6 +1657,7 @@ lock_sec_rec_some_has_impl_off_kernel( #endif /* UNIV_SYNC_DEBUG */ ut_ad(!(index->type & DICT_CLUSTERED)); ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); page = buf_frame_align(rec); @@ -1652,8 +1677,8 @@ lock_sec_rec_some_has_impl_off_kernel( /* Ok, in this case it is possible that some transaction has an implicit x-lock. We have to look in the clustered index. 
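
Many hunks in this file (and the new lock_clust_rec_read_check_and_lock_alt() further below) add a 'const ulint* offsets' parameter that callers fill in with rec_get_offsets(). The calling convention, copied from how it appears later in this patch (e.g. in lock_rec_print()), is sketched here for reference; 'rec' and 'index' are assumed to be a record and its index.

/* Sketch of the rec_get_offsets() calling pattern used throughout this
patch: a small stack array, with a heap allocated only if the record has
more fields than REC_OFFS_NORMAL_SIZE can describe. */
mem_heap_t*	heap	= NULL;
ulint		offsets_[REC_OFFS_NORMAL_SIZE];
ulint*		offsets	= offsets_;
*offsets_ = (sizeof offsets_) / sizeof *offsets_;

offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

/* ... pass 'offsets' on, e.g. rec_print_new(stderr, rec, offsets) ... */

if (UNIV_LIKELY_NULL(heap)) {
	mem_heap_free(heap);	/* allocated only if the stack array was too small */
}
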
*/ - if (!lock_check_trx_id_sanity(page_get_max_trx_id(page), rec, index, - TRUE)) { + if (!lock_check_trx_id_sanity(page_get_max_trx_id(page), + rec, index, offsets, TRUE)) { buf_page_print(page); /* The page is corrupt: try to avoid a crash by returning @@ -1661,7 +1686,7 @@ lock_sec_rec_some_has_impl_off_kernel( return(NULL); } - return(row_vers_impl_x_locked_off_kernel(rec, index)); + return(row_vers_impl_x_locked_off_kernel(rec, index, offsets)); } /*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/ @@ -1695,7 +1720,9 @@ lock_rec_create( page = buf_frame_align(rec); space = buf_frame_get_space_id(page); page_no = buf_frame_get_page_no(page); - heap_no = rec_get_heap_no(rec); + heap_no = rec_get_heap_no(rec, page_is_comp(page)); + + ut_ad(!!page_is_comp(page) == index->table->comp); /* If rec is the supremum record, then we reset the gap and LOCK_REC_NOT_GAP bits, as all locks on the supremum are @@ -1708,13 +1735,12 @@ lock_rec_create( } /* Make lock bitmap bigger by a safety margin */ - n_bits = page_header_get_field(page, PAGE_N_HEAP) - + LOCK_PAGE_BITMAP_MARGIN; + n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN; n_bytes = 1 + n_bits / 8; lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t) + n_bytes); - if (lock == NULL) { + if (UNIV_UNLIKELY(lock == NULL)) { return(NULL); } @@ -1739,7 +1765,7 @@ lock_rec_create( lock_rec_set_nth_bit(lock, heap_no); HASH_INSERT(lock_t, hash, lock_sys->rec_hash, - lock_rec_fold(space, page_no), lock); + lock_rec_fold(space, page_no), lock); if (type_mode & LOCK_WAIT) { lock_set_lock_and_trx_wait(lock, trx); @@ -1811,7 +1837,8 @@ lock_rec_enqueue_waiting( if (lock_deadlock_occurs(lock, trx)) { lock_reset_lock_and_trx_wait(lock); - lock_rec_reset_nth_bit(lock, rec_get_heap_no(rec)); + lock_rec_reset_nth_bit(lock, rec_get_heap_no(rec, + page_rec_is_comp(rec))); return(DB_DEADLOCK); } @@ -1830,11 +1857,13 @@ lock_rec_enqueue_waiting( ut_a(que_thr_stop(thr)); +#ifdef UNIV_DEBUG if (lock_print_waits) { fprintf(stderr, "Lock wait for trx %lu in index ", (ulong) ut_dulint_get_low(trx->id)); ut_print_name(stderr, trx, index->name); } +#endif /* UNIV_DEBUG */ return(DB_LOCK_WAIT); } @@ -1861,7 +1890,6 @@ lock_rec_add_to_queue( lock_t* lock; lock_t* similar_lock = NULL; ulint heap_no; - page_t* page; ibool somebody_waits = FALSE; #ifdef UNIV_SYNC_DEBUG @@ -1869,21 +1897,21 @@ lock_rec_add_to_queue( #endif /* UNIV_SYNC_DEBUG */ ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & LOCK_MODE_MASK) != LOCK_S) - || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, rec, trx)); + || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, + rec, trx)); ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & LOCK_MODE_MASK) != LOCK_X) - || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, trx)); + || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, + rec, trx)); type_mode = type_mode | LOCK_REC; - page = buf_frame_align(rec); - /* If rec is the supremum record, then we can reset the gap bit, as all locks on the supremum are automatically of the gap type, and we try to avoid unnecessary memory consumption of a new record lock struct for a gap type lock */ - if (rec == page_get_supremum_rec(page)) { + if (page_rec_is_supremum(rec)) { ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); /* There should never be LOCK_REC_NOT_GAP on a supremum @@ -1894,7 +1922,7 @@ lock_rec_add_to_queue( /* Look for a waiting lock request on the same record or on a gap */ - heap_no = rec_get_heap_no(rec); + heap_no = rec_get_heap_no(rec, 
page_rec_is_comp(rec)); lock = lock_rec_get_first_on_page(rec); while (lock != NULL) { @@ -1945,6 +1973,7 @@ lock_rec_lock_fast( { lock_t* lock; ulint heap_no; + trx_t* trx; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); @@ -1959,13 +1988,19 @@ lock_rec_lock_fast( || mode - (LOCK_MODE_MASK & mode) == 0 || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); - heap_no = rec_get_heap_no(rec); + heap_no = rec_get_heap_no(rec, page_rec_is_comp(rec)); lock = lock_rec_get_first_on_page(rec); + trx = thr_get_trx(thr); + if (lock == NULL) { if (!impl) { - lock_rec_create(mode, rec, index, thr_get_trx(thr)); + lock_rec_create(mode, rec, index, trx); + + if (srv_locks_unsafe_for_binlog) { + trx_register_new_rec_lock(trx, index); + } } return(TRUE); @@ -1976,14 +2011,23 @@ lock_rec_lock_fast( return(FALSE); } - if (lock->trx != thr_get_trx(thr) - || lock->type_mode != (mode | LOCK_REC) - || lock_rec_get_n_bits(lock) <= heap_no) { + if (lock->trx != trx + || lock->type_mode != (mode | LOCK_REC) + || lock_rec_get_n_bits(lock) <= heap_no) { + return(FALSE); } if (!impl) { - lock_rec_set_nth_bit(lock, heap_no); + /* If the nth bit of the record lock is already set then we + do not set a new lock bit, otherwise we do set */ + + if (!lock_rec_get_nth_bit(lock, heap_no)) { + lock_rec_set_nth_bit(lock, heap_no); + if (srv_locks_unsafe_for_binlog) { + trx_register_new_rec_lock(trx, index); + } + } } return(TRUE); @@ -2039,12 +2083,19 @@ lock_rec_lock_slow( enough already granted on the record, we have to wait. */ err = lock_rec_enqueue_waiting(mode, rec, index, thr); + + if (srv_locks_unsafe_for_binlog) { + trx_register_new_rec_lock(trx, index); + } } else { if (!impl) { /* Set the requested lock on the record */ lock_rec_add_to_queue(LOCK_REC | mode, rec, index, trx); + if (srv_locks_unsafe_for_binlog) { + trx_register_new_rec_lock(trx, index); + } } err = DB_SUCCESS; @@ -2168,15 +2219,14 @@ lock_grant( release it at the end of the SQL statement */ lock->trx->auto_inc_lock = lock; - } else if (lock_get_type(lock) == LOCK_TABLE_EXP) { - ut_a(lock_get_mode(lock) == LOCK_S - || lock_get_mode(lock) == LOCK_X); - } + } +#ifdef UNIV_DEBUG if (lock_print_waits) { fprintf(stderr, "Lock wait for trx %lu ends\n", (ulong) ut_dulint_get_low(lock->trx->id)); } +#endif /* UNIV_DEBUG */ /* If we are resolving a deadlock by choosing another transaction as a victim, then our original transaction may not be in the @@ -2343,12 +2393,12 @@ lock_rec_reset_and_release_wait( { lock_t* lock; ulint heap_no; - + #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(rec); + heap_no = rec_get_heap_no(rec, page_rec_is_comp(rec)); lock = lock_rec_get_first(rec); @@ -2377,15 +2427,21 @@ lock_rec_inherit_to_gap( the locks on this record */ { lock_t* lock; - #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ lock = lock_rec_get_first(rec); + /* If srv_locks_unsafe_for_binlog is TRUE, we do not want locks set + by an UPDATE or a DELETE to be inherited as gap type locks. But we + DO want S-locks set by a consistency constraint to be inherited also + then. 
*/ + while (lock != NULL) { - if (!lock_rec_get_insert_intention(lock)) { + if (!lock_rec_get_insert_intention(lock) + && !(srv_locks_unsafe_for_binlog + && lock_get_mode(lock) == LOCK_X)) { lock_rec_add_to_queue(LOCK_REC | lock_get_mode(lock) | LOCK_GAP, @@ -2409,7 +2465,6 @@ lock_rec_inherit_to_gap_if_gap_lock( the locks on this record */ { lock_t* lock; - #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ @@ -2439,7 +2494,8 @@ lock_rec_move( /*==========*/ rec_t* receiver, /* in: record which gets locks; this record must have no lock requests on it! */ - rec_t* donator) /* in: record which gives locks */ + rec_t* donator, /* in: record which gives locks */ + ulint comp) /* in: nonzero=compact page format */ { lock_t* lock; ulint heap_no; @@ -2449,7 +2505,7 @@ lock_rec_move( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(donator); + heap_no = rec_get_heap_no(donator, comp); lock = lock_rec_get_first(donator); @@ -2495,6 +2551,7 @@ lock_move_reorganize_page( UT_LIST_BASE_NODE_T(lock_t) old_locks; mem_heap_t* heap = NULL; rec_t* sup; + ulint comp; lock_mutex_enter_kernel(); @@ -2535,6 +2592,9 @@ lock_move_reorganize_page( lock = UT_LIST_GET_FIRST(old_locks); + comp = page_is_comp(page); + ut_ad(comp == page_is_comp(old_page)); + while (lock) { /* NOTE: we copy also the locks set on the infimum and supremum of the page; the infimum may carry locks if an @@ -2546,12 +2606,12 @@ lock_move_reorganize_page( /* Set locks according to old locks */ for (;;) { - ut_ad(0 == ut_memcmp(page_cur_get_rec(&cur1), + ut_ad(comp || 0 == ut_memcmp(page_cur_get_rec(&cur1), page_cur_get_rec(&cur2), - rec_get_data_size( + rec_get_data_size_old( page_cur_get_rec(&cur2)))); - - old_heap_no = rec_get_heap_no(page_cur_get_rec(&cur2)); + old_heap_no = rec_get_heap_no(page_cur_get_rec(&cur2), + comp); if (lock_rec_get_nth_bit(lock, old_heap_no)) { @@ -2610,7 +2670,9 @@ lock_move_rec_list_end( ulint heap_no; rec_t* sup; ulint type_mode; - + ulint comp; + ut_ad(page == buf_frame_align(rec)); + lock_mutex_enter_kernel(); /* Note: when we move locks from record to record, waiting locks @@ -2623,6 +2685,8 @@ lock_move_rec_list_end( lock = lock_rec_get_first_on_page(page); + comp = page_is_comp(page); + while (lock != NULL) { page_cur_position(rec, &cur1); @@ -2638,13 +2702,12 @@ lock_move_rec_list_end( reset the lock bits on the old */ while (page_cur_get_rec(&cur1) != sup) { - - ut_ad(0 == ut_memcmp(page_cur_get_rec(&cur1), + ut_ad(comp || 0 == ut_memcmp(page_cur_get_rec(&cur1), page_cur_get_rec(&cur2), - rec_get_data_size( + rec_get_data_size_old( page_cur_get_rec(&cur2)))); - - heap_no = rec_get_heap_no(page_cur_get_rec(&cur1)); + heap_no = rec_get_heap_no(page_cur_get_rec(&cur1), + comp); if (lock_rec_get_nth_bit(lock, heap_no)) { type_mode = lock->type_mode; @@ -2694,12 +2757,16 @@ lock_move_rec_list_start( page_cur_t cur2; ulint heap_no; ulint type_mode; + ulint comp; ut_a(new_page); lock_mutex_enter_kernel(); lock = lock_rec_get_first_on_page(page); + comp = page_is_comp(page); + ut_ad(comp == page_is_comp(new_page)); + ut_ad(page == buf_frame_align(rec)); while (lock != NULL) { @@ -2713,13 +2780,12 @@ lock_move_rec_list_start( reset the lock bits on the old */ while (page_cur_get_rec(&cur1) != rec) { - - ut_ad(0 == ut_memcmp(page_cur_get_rec(&cur1), + ut_ad(comp || 0 == ut_memcmp(page_cur_get_rec(&cur1), page_cur_get_rec(&cur2), - rec_get_data_size( + rec_get_data_size_old( page_cur_get_rec(&cur2)))); - - heap_no = 
rec_get_heap_no(page_cur_get_rec(&cur1)); + heap_no = rec_get_heap_no(page_cur_get_rec(&cur1), + comp); if (lock_rec_get_nth_bit(lock, heap_no)) { type_mode = lock->type_mode; @@ -2759,13 +2825,16 @@ lock_update_split_right( page_t* right_page, /* in: right page */ page_t* left_page) /* in: left page */ { + ulint comp; lock_mutex_enter_kernel(); - + comp = page_is_comp(left_page); + ut_ad(comp == page_is_comp(right_page)); + /* Move the locks on the supremum of the left page to the supremum of the right page */ lock_rec_move(page_get_supremum_rec(right_page), - page_get_supremum_rec(left_page)); + page_get_supremum_rec(left_page), comp); /* Inherit the locks to the supremum of left page from the successor of the infimum on right page */ @@ -2819,13 +2888,16 @@ lock_update_root_raise( page_t* new_page, /* in: index page to which copied */ page_t* root) /* in: root page */ { + ulint comp; lock_mutex_enter_kernel(); - + comp = page_is_comp(root); + ut_ad(comp == page_is_comp(new_page)); + /* Move the locks on the supremum of the root to the supremum of new_page */ lock_rec_move(page_get_supremum_rec(new_page), - page_get_supremum_rec(root)); + page_get_supremum_rec(root), comp); lock_mutex_exit_kernel(); } @@ -2839,13 +2911,16 @@ lock_update_copy_and_discard( page_t* new_page, /* in: index page to which copied */ page_t* page) /* in: index page; NOT the root! */ { + ulint comp; lock_mutex_enter_kernel(); - + comp = page_is_comp(page); + ut_ad(comp == page_is_comp(new_page)); + /* Move the locks on the supremum of the old page to the supremum of new_page */ lock_rec_move(page_get_supremum_rec(new_page), - page_get_supremum_rec(page)); + page_get_supremum_rec(page), comp); lock_rec_free_all_from_discard_page(page); lock_mutex_exit_kernel(); @@ -2883,28 +2958,34 @@ lock_update_merge_left( page_t* right_page) /* in: merged index page which will be discarded */ { + rec_t* left_next_rec; + rec_t* left_supremum; + ulint comp; lock_mutex_enter_kernel(); - - if (page_rec_get_next(orig_pred) != page_get_supremum_rec(left_page)) { + comp = page_is_comp(left_page); + ut_ad(comp == page_is_comp(right_page)); + ut_ad(left_page == buf_frame_align(orig_pred)); + + left_next_rec = page_rec_get_next(orig_pred); + left_supremum = page_get_supremum_rec(left_page); + + if (UNIV_LIKELY(left_next_rec != left_supremum)) { /* Inherit the locks on the supremum of the left page to the first record which was moved from the right page */ - lock_rec_inherit_to_gap(page_rec_get_next(orig_pred), - page_get_supremum_rec(left_page)); + lock_rec_inherit_to_gap(left_next_rec, left_supremum); /* Reset the locks on the supremum of the left page, releasing waiting transactions */ - lock_rec_reset_and_release_wait(page_get_supremum_rec( - left_page)); + lock_rec_reset_and_release_wait(left_supremum); } /* Move the locks from the supremum of right page to the supremum of the left page */ - lock_rec_move(page_get_supremum_rec(left_page), - page_get_supremum_rec(right_page)); + lock_rec_move(left_supremum, page_get_supremum_rec(right_page), comp); lock_rec_free_all_from_discard_page(right_page); @@ -2963,7 +3044,7 @@ lock_update_discard( lock_rec_reset_and_release_wait(rec); - if (rec == page_get_supremum_rec(page)) { + if (page_rec_is_supremum(rec)) { break; } @@ -2992,7 +3073,7 @@ lock_update_insert( lock_rec_inherit_to_gap_if_gap_lock(rec, page_rec_get_next(rec)); lock_mutex_exit_kernel(); -} +} /***************************************************************** Updates the lock table when a record is removed. 
*/ @@ -3026,17 +3107,16 @@ actual record is being moved. */ void lock_rec_store_on_page_infimum( /*===========================*/ + page_t* page, /* in: page containing the record */ rec_t* rec) /* in: record whose lock state is stored on the infimum record of the same page; lock bits are reset on the record */ { - page_t* page; - - page = buf_frame_align(rec); + ut_ad(page == buf_frame_align(rec)); lock_mutex_enter_kernel(); - lock_rec_move(page_get_infimum_rec(page), rec); + lock_rec_move(page_get_infimum_rec(page), rec, page_is_comp(page)); lock_mutex_exit_kernel(); } @@ -3053,9 +3133,12 @@ lock_rec_restore_from_page_infimum( whose infimum stored the lock state; lock bits are reset on the infimum */ { + ulint comp; lock_mutex_enter_kernel(); - - lock_rec_move(rec, page_get_infimum_rec(page)); + comp = page_is_comp(page); + ut_ad(!comp == !page_rec_is_comp(rec)); + + lock_rec_move(rec, page_get_infimum_rec(page), comp); lock_mutex_exit_kernel(); } @@ -3246,11 +3329,11 @@ lock_deadlock_recursive( } else { lock_table_print(ef, start->wait_lock); } - +#ifdef UNIV_DEBUG if (lock_print_waits) { fputs("Deadlock detected\n", stderr); } - +#endif /* UNIV_DEBUG */ if (ut_dulint_cmp(wait_lock->trx->undo_no, start->undo_no) >= 0) { /* Our recursion starting point @@ -3348,10 +3431,6 @@ lock_table_create( lock->type_mode = type_mode | LOCK_TABLE; lock->trx = trx; - if (lock_get_type(lock) == LOCK_TABLE_EXP) { - lock->trx->n_lock_table_exp++; - } - lock->un_member.tab_lock.table = table; UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock); @@ -3388,10 +3467,6 @@ lock_table_remove_low( trx->auto_inc_lock = NULL; } - if (lock_get_type(lock) == LOCK_TABLE_EXP) { - lock->trx->n_lock_table_exp--; - } - UT_LIST_REMOVE(trx_locks, trx->trx_locks, lock); UT_LIST_REMOVE(un_member.tab_lock.locks, table->locks, lock); } @@ -3522,9 +3597,7 @@ lock_table( /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, - does nothing; - if LOCK_TABLE_EXP bits are set, - creates an explicit table lock */ + does nothing */ dict_table_t* table, /* in: database table in dictionary cache */ ulint mode, /* in: lock mode */ que_thr_t* thr) /* in: query thread */ @@ -3539,7 +3612,7 @@ lock_table( return(DB_SUCCESS); } - ut_a(flags == 0 || flags == LOCK_TABLE_EXP); + ut_a(flags == 0); trx = thr_get_trx(thr); @@ -3562,7 +3635,7 @@ lock_table( /* Another trx has a request on the table in an incompatible mode: this trx may have to wait */ - err = lock_table_enqueue_waiting(mode, table, thr); + err = lock_table_enqueue_waiting(mode | flags, table, thr); lock_mutex_exit_kernel(); @@ -3652,8 +3725,7 @@ lock_table_dequeue( #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - ut_a(lock_get_type(in_lock) == LOCK_TABLE || - lock_get_type(in_lock) == LOCK_TABLE_EXP); + ut_a(lock_get_type(in_lock) == LOCK_TABLE); lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock); @@ -3757,10 +3829,6 @@ lock_release_off_kernel( } lock_table_dequeue(lock); - if (lock_get_type(lock) == LOCK_TABLE_EXP) { - ut_a(lock_get_mode(lock) == LOCK_S - || lock_get_mode(lock) == LOCK_X); - } } if (count == LOCK_RELEASE_KERNEL_INTERVAL) { @@ -3780,72 +3848,6 @@ lock_release_off_kernel( mem_heap_empty(trx->lock_heap); ut_a(trx->auto_inc_lock == NULL); - ut_a(trx->n_lock_table_exp == 0); -} - -/************************************************************************* -Releases table locks explicitly requested with LOCK TABLES (indicated by -lock type 
LOCK_TABLE_EXP), and releases possible other transactions waiting -because of these locks. */ - -void -lock_release_tables_off_kernel( -/*===========================*/ - trx_t* trx) /* in: transaction */ -{ - dict_table_t* table; - ulint count; - lock_t* lock; - -#ifdef UNIV_SYNC_DEBUG - ut_ad(mutex_own(&kernel_mutex)); -#endif /* UNIV_SYNC_DEBUG */ - - lock = UT_LIST_GET_LAST(trx->trx_locks); - - count = 0; - - while (lock != NULL) { - - count++; - - if (lock_get_type(lock) == LOCK_TABLE_EXP) { - ut_a(lock_get_mode(lock) == LOCK_S - || lock_get_mode(lock) == LOCK_X); - if (trx->insert_undo || trx->update_undo) { - - /* The trx may have modified the table. - We block the use of the MySQL query - cache for all currently active - transactions. */ - - table = lock->un_member.tab_lock.table; - - table->query_cache_inv_trx_id = - trx_sys->max_trx_id; - } - - lock_table_dequeue(lock); - - lock = UT_LIST_GET_LAST(trx->trx_locks); - continue; - } - - if (count == LOCK_RELEASE_KERNEL_INTERVAL) { - /* Release the kernel mutex for a while, so that we - do not monopolize it */ - - lock_mutex_exit_kernel(); - - lock_mutex_enter_kernel(); - - count = 0; - } - - lock = UT_LIST_GET_PREV(trx_locks, lock); - } - - ut_a(trx->n_lock_table_exp == 0); } /************************************************************************* @@ -3958,12 +3960,8 @@ lock_table_print( #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - ut_a(lock_get_type(lock) == LOCK_TABLE || - lock_get_type(lock) == LOCK_TABLE_EXP); + ut_a(lock_get_type(lock) == LOCK_TABLE); - if (lock_get_type(lock) == LOCK_TABLE_EXP) { - fputs("EXPLICIT ", file); - } fputs("TABLE LOCK table ", file); ut_print_name(file, lock->trx, lock->un_member.tab_lock.table->name); fprintf(file, " trx id %lu %lu", @@ -3999,11 +3997,15 @@ lock_rec_print( FILE* file, /* in: file where to print */ lock_t* lock) /* in: record type lock */ { - page_t* page; - ulint space; - ulint page_no; - ulint i; - mtr_t mtr; + page_t* page; + ulint space; + ulint page_no; + ulint i; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); @@ -4082,8 +4084,11 @@ lock_rec_print( fprintf(file, "Record lock, heap no %lu ", (ulong) i); if (page) { - rec_print(file, - page_find_rec_with_heap_no(page, i)); + rec_t* rec + = page_find_rec_with_heap_no(page, i); + offsets = rec_get_offsets(rec, lock->index, + offsets, ULINT_UNDEFINED, &heap); + rec_print_new(file, rec, offsets); } putc('\n', file); @@ -4091,8 +4096,11 @@ lock_rec_print( } mtr_commit(&mtr); -} - + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + /************************************************************************* Calculates the number of record lock structs in the record lock hash table. */ static @@ -4121,7 +4129,8 @@ lock_get_n_rec_locks(void) return(n_locks); } - + +#ifndef UNIV_HOTBACKUP /************************************************************************* Prints info of locks for all transactions. 
*/ @@ -4349,6 +4358,7 @@ lock_table_queue_validate( while (lock) { ut_a(((lock->trx)->conc_state == TRX_ACTIVE) + || ((lock->trx)->conc_state == TRX_PREPARED) || ((lock->trx)->conc_state == TRX_COMMITTED_IN_MEMORY)); if (!lock_get_wait(lock)) { @@ -4377,21 +4387,25 @@ lock_rec_queue_validate( /*====================*/ /* out: TRUE if ok */ rec_t* rec, /* in: record to look at */ - dict_index_t* index) /* in: index, or NULL if not known */ + dict_index_t* index, /* in: index, or NULL if not known */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { trx_t* impl_trx; lock_t* lock; - + ut_a(rec); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); lock_mutex_enter_kernel(); - if (page_rec_is_supremum(rec) || page_rec_is_infimum(rec)) { + if (!page_rec_is_user_rec(rec)) { lock = lock_rec_get_first(rec); while (lock) { ut_a(lock->trx->conc_state == TRX_ACTIVE + || lock->trx->conc_state == TRX_PREPARED || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY); @@ -4415,13 +4429,13 @@ lock_rec_queue_validate( if (index && (index->type & DICT_CLUSTERED)) { - impl_trx = lock_clust_rec_some_has_impl(rec, index); + impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets); if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, impl_trx)) { ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, - impl_trx)); + impl_trx)); } } @@ -4431,13 +4445,14 @@ lock_rec_queue_validate( next function call: we have to release lock table mutex to obey the latching order */ - impl_trx = lock_sec_rec_some_has_impl_off_kernel(rec, index); + impl_trx = lock_sec_rec_some_has_impl_off_kernel( + rec, index, offsets); if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, impl_trx)) { - ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, - impl_trx)); + ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + rec, impl_trx)); } } @@ -4445,6 +4460,7 @@ lock_rec_queue_validate( while (lock) { ut_a(lock->trx->conc_state == TRX_ACTIVE + || lock->trx->conc_state == TRX_PREPARED || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY); ut_a(trx_in_trx_list(lock->trx)); @@ -4453,14 +4469,16 @@ lock_rec_queue_validate( } if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) { + + ulint mode; if (lock_get_mode(lock) == LOCK_S) { - ut_a(!lock_rec_other_has_expl_req(LOCK_X, - 0, 0, rec, lock->trx)); + mode = LOCK_X; } else { - ut_a(!lock_rec_other_has_expl_req(LOCK_S, - 0, 0, rec, lock->trx)); + mode = LOCK_S; } + ut_a(!lock_rec_other_has_expl_req(mode, + 0, 0, rec, lock->trx)); } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { @@ -4489,10 +4507,14 @@ lock_rec_validate_page( page_t* page; lock_t* lock; rec_t* rec; - ulint nth_lock = 0; - ulint nth_bit = 0; + ulint nth_lock = 0; + ulint nth_bit = 0; ulint i; mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; #ifdef UNIV_SYNC_DEBUG ut_ad(!mutex_own(&kernel_mutex)); @@ -4524,6 +4546,7 @@ loop: ut_a(trx_in_trx_list(lock->trx)); ut_a(lock->trx->conc_state == TRX_ACTIVE + || lock->trx->conc_state == TRX_PREPARED || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY); for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) { @@ -4532,13 +4555,15 @@ loop: index = lock->index; rec = page_find_rec_with_heap_no(page, i); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); fprintf(stderr, "Validating %lu %lu\n", (ulong) space, (ulong) page_no); lock_mutex_exit_kernel(); 
- lock_rec_queue_validate(rec, index); + lock_rec_queue_validate(rec, index, offsets); lock_mutex_enter_kernel(); @@ -4558,6 +4583,9 @@ function_exit: mtr_commit(&mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(TRUE); } @@ -4637,7 +4665,7 @@ lock_validate(void) return(TRUE); } - +#endif /* !UNIV_HOTBACKUP */ /*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/ /************************************************************************* @@ -4730,8 +4758,22 @@ lock_rec_insert_check_and_lock( page_update_max_trx_id(buf_frame_align(rec), thr_get_trx(thr)->id); } - - ut_ad(lock_rec_queue_validate(next_rec, index)); + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + offsets = rec_get_offsets(next_rec, index, offsets_, + ULINT_UNDEFINED, &heap); + ut_ad(lock_rec_queue_validate(next_rec, index, offsets)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ return(err); } @@ -4745,7 +4787,8 @@ void lock_rec_convert_impl_to_expl( /*==========================*/ rec_t* rec, /* in: user record on page */ - dict_index_t* index) /* in: index of record */ + dict_index_t* index, /* in: index of record */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { trx_t* impl_trx; @@ -4753,11 +4796,14 @@ lock_rec_convert_impl_to_expl( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); if (index->type & DICT_CLUSTERED) { - impl_trx = lock_clust_rec_some_has_impl(rec, index); + impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets); } else { - impl_trx = lock_sec_rec_some_has_impl_off_kernel(rec, index); + impl_trx = lock_sec_rec_some_has_impl_off_kernel( + rec, index, offsets); } if (impl_trx) { @@ -4765,7 +4811,7 @@ lock_rec_convert_impl_to_expl( record, set one for it */ if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, - impl_trx)) { + impl_trx)) { lock_rec_add_to_queue(LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP, rec, index, @@ -4791,17 +4837,19 @@ lock_clust_rec_modify_check_and_lock( does nothing */ rec_t* rec, /* in: record which should be modified */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr) /* in: query thread */ { ulint err; - + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(index->type & DICT_CLUSTERED); + if (flags & BTR_NO_LOCKING_FLAG) { return(DB_SUCCESS); } - ut_ad(index->type & DICT_CLUSTERED); - lock_mutex_enter_kernel(); ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); @@ -4809,13 +4857,13 @@ lock_clust_rec_modify_check_and_lock( /* If a transaction has no explicit x-lock set on the record, set one for it */ - lock_rec_convert_impl_to_expl(rec, index); + lock_rec_convert_impl_to_expl(rec, index, offsets); err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr); lock_mutex_exit_kernel(); - ut_ad(lock_rec_queue_validate(rec, index)); + ut_ad(lock_rec_queue_validate(rec, index, offsets)); return(err); } @@ -4859,8 +4907,22 @@ lock_sec_rec_modify_check_and_lock( err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr); lock_mutex_exit_kernel(); - - ut_ad(lock_rec_queue_validate(rec, index)); + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; 
+ *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + ut_ad(lock_rec_queue_validate(rec, index, offsets)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ if (err == DB_SUCCESS) { /* Update the page max trx id field */ @@ -4887,6 +4949,7 @@ lock_sec_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: secondary index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -4898,6 +4961,7 @@ lock_sec_rec_read_check_and_lock( ut_ad(!(index->type & DICT_CLUSTERED)); ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); if (flags & BTR_NO_LOCKING_FLAG) { @@ -4920,14 +4984,14 @@ lock_sec_rec_read_check_and_lock( || recv_recovery_is_on()) && !page_rec_is_supremum(rec)) { - lock_rec_convert_impl_to_expl(rec, index); + lock_rec_convert_impl_to_expl(rec, index, offsets); } err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr); lock_mutex_exit_kernel(); - ut_ad(lock_rec_queue_validate(rec, index)); + ut_ad(lock_rec_queue_validate(rec, index, offsets)); return(err); } @@ -4951,6 +5015,7 @@ lock_clust_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -4964,6 +5029,8 @@ lock_clust_rec_read_check_and_lock( ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP || gap_mode == LOCK_REC_NOT_GAP); + ut_ad(rec_offs_validate(rec, index, offsets)); + if (flags & BTR_NO_LOCKING_FLAG) { return(DB_SUCCESS); @@ -4978,14 +5045,57 @@ lock_clust_rec_read_check_and_lock( if (!page_rec_is_supremum(rec)) { - lock_rec_convert_impl_to_expl(rec, index); + lock_rec_convert_impl_to_expl(rec, index, offsets); } err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr); lock_mutex_exit_kernel(); - ut_ad(lock_rec_queue_validate(rec, index)); - + ut_ad(lock_rec_queue_validate(rec, index, offsets)); + return(err); } +/************************************************************************* +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". 
*/ + +ulint +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, + does nothing */ + rec_t* rec, /* in: user record or page supremum record + which should be read or passed over by a read + cursor */ + dict_index_t* index, /* in: clustered index */ + ulint mode, /* in: mode of the lock which the read cursor + should set on records: LOCK_S or LOCK_X; the + latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ulint ret; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + ret = lock_clust_rec_read_check_and_lock(flags, rec, index, + offsets, mode, gap_mode, thr); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + return(ret); +} diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c index e08adb013b5..2f76bf450db 100644 --- a/innobase/log/log0log.c +++ b/innobase/log/log0log.c @@ -24,6 +24,32 @@ Created 12/9/1995 Heikki Tuuri #include "trx0sys.h" #include "trx0trx.h" +/* +General philosophy of InnoDB redo-logs: + +1) Every change to a contents of a data page must be done +through mtr, which in mtr_commit() writes log records +to the InnoDB redo log. + +2) Normally these changes are performed using a mlog_write_ulint() +or similar function. + +3) In some page level operations only a code number of a +c-function and its parameters are written to the log to +reduce the size of the log. + + 3a) You should not add parameters to these kind of functions + (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse()) + + 3b) You should not add such functionality which either change + working when compared with the old or are dependent on data + outside of the page. These kind of functions should implement + self-contained page transformation and it should be unchanged + if you don't have very essential reasons to change log + semantics or format. 
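
To make point 2) of the redo-log comment above concrete, a minimal sketch (not part of the patch) of logging a page change through a mini-transaction. The page pointer and field offset are hypothetical, and the page is assumed to have been fetched and X-latched within the mtr.

/* Sketch: change a 4-byte field on a page so that the change is
written to the redo log when the mtr commits. */
mtr_t	mtr;

mtr_start(&mtr);

/* ... fetch the page and X-latch it inside the mtr (not shown) ... */

/* 'page' and FIELD_OFFSET are hypothetical */
mlog_write_ulint(page + FIELD_OFFSET, new_value, MLOG_4BYTES, &mtr);

mtr_commit(&mtr);	/* writes the accumulated log records and releases the latches */
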
+ +*/ + /* Current free limit of space 0; protected by the log sys mutex; 0 means uninitialized */ ulint log_fsp_current_free_limit = 0; @@ -31,10 +57,11 @@ ulint log_fsp_current_free_limit = 0; /* Global log system variable */ log_t* log_sys = NULL; +#ifdef UNIV_DEBUG ibool log_do_write = TRUE; ibool log_debug_writes = FALSE; - +#endif /* UNIV_DEBUG */ /* These control how often we print warnings if the last checkpoint is too old */ @@ -190,6 +217,8 @@ loop: log_buffer_flush_to_disk(); + srv_log_waits++; + ut_ad(++count < 50); goto loop; @@ -292,6 +321,8 @@ part_loop: if (str_len > 0) { goto part_loop; } + + srv_log_write_requests++; } /**************************************************************** @@ -944,22 +975,24 @@ log_group_check_flush_completion( #endif /* UNIV_SYNC_DEBUG */ if (!log_sys->one_flushed && group->n_pending_writes == 0) { +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "Log flushed first to group %lu\n", (ulong) group->id); } - +#endif /* UNIV_DEBUG */ log_sys->written_to_some_lsn = log_sys->write_lsn; log_sys->one_flushed = TRUE; return(LOG_UNLOCK_NONE_FLUSHED_LOCK); } +#ifdef UNIV_DEBUG if (log_debug_writes && (group->n_pending_writes == 0)) { fprintf(stderr, "Log flushed to group %lu\n", (ulong) group->id); } - +#endif /* UNIV_DEBUG */ return(0); } @@ -1036,12 +1069,13 @@ log_io_complete( fil_flush(group->space_id); } +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "Checkpoint info written to group %lu\n", group->id); } - +#endif /* UNIV_DEBUG */ log_io_complete_checkpoint(); return; @@ -1103,20 +1137,25 @@ log_group_file_header_flush( dest_offset = nth_file * group->file_size; +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "Writing log file header to group %lu file %lu\n", (ulong) group->id, (ulong) nth_file); } - +#endif /* UNIV_DEBUG */ if (log_do_write) { log_sys->n_log_ios++; + srv_os_log_pending_writes++; + fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, dest_offset / UNIV_PAGE_SIZE, dest_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, buf, group); + + srv_os_log_pending_writes--; } } @@ -1181,6 +1220,8 @@ loop: log_group_file_header_flush(group, next_offset / group->file_size, start_lsn); + srv_os_log_written+= OS_FILE_LOG_BLOCK_SIZE; + srv_log_writes++; } if ((next_offset % group->file_size) + len > group->file_size) { @@ -1190,7 +1231,8 @@ loop: } else { write_len = len; } - + +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, @@ -1214,7 +1256,7 @@ loop: + i * OS_FILE_LOG_BLOCK_SIZE)); } } - +#endif /* UNIV_DEBUG */ /* Calculate the checksums for each log block and write them to the trailer fields of the log blocks */ @@ -1225,9 +1267,16 @@ loop: if (log_do_write) { log_sys->n_log_ios++; + srv_os_log_pending_writes++; + fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, next_offset / UNIV_PAGE_SIZE, next_offset % UNIV_PAGE_SIZE, write_len, buf, group); + + srv_os_log_pending_writes--; + + srv_os_log_written+= write_len; + srv_log_writes++; } if (write_len < len) { @@ -1341,6 +1390,7 @@ loop: return; } +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "Writing log from %lu %lu up to lsn %lu %lu\n", @@ -1349,7 +1399,7 @@ loop: (ulong) ut_dulint_get_high(log_sys->lsn), (ulong) ut_dulint_get_low(log_sys->lsn)); } - +#endif /* UNIV_DEBUG */ log_sys->n_pending_writes++; group = UT_LIST_GET_FIRST(log_sys->log_groups); @@ -1918,12 +1968,14 @@ log_checkpoint( log_sys->next_checkpoint_lsn = oldest_lsn; +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "Making checkpoint no %lu at lsn 
%lu %lu\n", (ulong) ut_dulint_get_low(log_sys->next_checkpoint_no), (ulong) ut_dulint_get_high(oldest_lsn), (ulong) ut_dulint_get_low(oldest_lsn)); } +#endif /* UNIV_DEBUG */ log_groups_write_checkpoint_info(); @@ -1986,8 +2038,6 @@ log_checkpoint_margin(void) ulint checkpoint_age; ulint advance; dulint oldest_lsn; - dulint new_oldest; - ibool do_preflush; ibool sync; ibool checkpoint_sync; ibool do_checkpoint; @@ -1995,7 +2045,6 @@ log_checkpoint_margin(void) loop: sync = FALSE; checkpoint_sync = FALSE; - do_preflush = FALSE; do_checkpoint = FALSE; mutex_enter(&(log->mutex)); @@ -2015,21 +2064,13 @@ loop: /* A flush is urgent: we have to do a synchronous preflush */ sync = TRUE; - advance = 2 * (age - log->max_modified_age_sync); - - new_oldest = ut_dulint_add(oldest_lsn, advance); - - do_preflush = TRUE; - } else if (age > log->max_modified_age_async) { /* A flush is not urgent: we do an asynchronous preflush */ advance = age - log->max_modified_age_async; - - new_oldest = ut_dulint_add(oldest_lsn, advance); - - do_preflush = TRUE; + } else { + advance = 0; } checkpoint_age = ut_dulint_minus(log->lsn, log->last_checkpoint_lsn); @@ -2053,7 +2094,9 @@ loop: mutex_exit(&(log->mutex)); - if (do_preflush) { + if (advance) { + dulint new_oldest = ut_dulint_add(oldest_lsn, advance); + success = log_preflush_pool_modified_pages(new_oldest, sync); /* If the flush succeeded, this thread has done its part @@ -2304,9 +2347,11 @@ loop: exit(1); } +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "Created archive file %s\n", name); } +#endif /* UNIV_DEBUG */ ret = os_file_close(file_handle); @@ -2332,7 +2377,8 @@ loop: len = group->file_size - (next_offset % group->file_size); } - + +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "Archiving starting at lsn %lu %lu, len %lu to group %lu\n", @@ -2340,6 +2386,7 @@ loop: (ulong) ut_dulint_get_low(start_lsn), (ulong) len, (ulong) group->id); } +#endif /* UNIV_DEBUG */ log_sys->n_pending_archive_ios++; @@ -2430,11 +2477,13 @@ log_archive_write_complete_groups(void) trunc_files = n_files - 1; } +#ifdef UNIV_DEBUG if (log_debug_writes && trunc_files) { fprintf(stderr, "Complete file(s) archived to group %lu\n", (ulong) group->id); } +#endif /* UNIV_DEBUG */ /* Calculate the archive file space start lsn */ start_lsn = ut_dulint_subtract(log_sys->next_archived_lsn, @@ -2457,9 +2506,11 @@ log_archive_write_complete_groups(void) fil_space_truncate_start(group->archive_space_id, trunc_files * group->file_size); +#ifdef UNIV_DEBUG if (log_debug_writes) { fputs("Archiving writes completed\n", stderr); } +#endif /* UNIV_DEBUG */ } /********************************************************** @@ -2476,9 +2527,11 @@ log_archive_check_completion_low(void) if (log_sys->n_pending_archive_ios == 0 && log_sys->archiving_phase == LOG_ARCHIVE_READ) { +#ifdef UNIV_DEBUG if (log_debug_writes) { fputs("Archiving read completed\n", stderr); } +#endif /* UNIV_DEBUG */ /* Archive buffer has now been read in: start archive writes */ @@ -2622,6 +2675,7 @@ loop: log_sys->next_archived_lsn = limit_lsn; +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "Archiving from lsn %lu %lu to lsn %lu %lu\n", @@ -2630,6 +2684,7 @@ loop: (ulong) ut_dulint_get_high(limit_lsn), (ulong) ut_dulint_get_low(limit_lsn)); } +#endif /* UNIV_DEBUG */ /* Read the log segment to the archive buffer */ @@ -2732,12 +2787,14 @@ log_archive_close_groups( group->archived_file_no += 2; } +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "Incrementing arch file no to %lu in log 
group %lu\n", (ulong) group->archived_file_no + 2, (ulong) group->id); } +#endif /* UNIV_DEBUG */ } } @@ -3004,7 +3061,10 @@ loop: mutex_enter(&kernel_mutex); - /* Check that there are no longer transactions */ + /* Check that there are no longer transactions. We need this wait even + for the 'very fast' shutdown, because the InnoDB layer may have + committed or prepared transactions and we don't want to lose them. */ + if (trx_n_mysql_transactions > 0 || UT_LIST_GET_LEN(trx_sys->trx_list) > 0) { @@ -3013,6 +3073,21 @@ loop: goto loop; } + if (srv_fast_shutdown == 2) { + /* In this fastest shutdown we do not flush the buffer pool: + it is essentially a 'crash' of the InnoDB server. + Make sure that the log is all flushed to disk, so that + we can recover all committed transactions in a crash + recovery. + We must not write the lsn stamps to the data files, since at a + startup InnoDB deduces from the stamps if the previous + shutdown was clean. */ + + log_buffer_flush_to_disk(); + return; /* We SKIP ALL THE REST !! */ + } + + /* Check that the master thread is suspended */ if (srv_n_threads_active[SRV_MASTER] != 0) { @@ -3049,24 +3124,13 @@ loop: log_archive_all(); #endif /* UNIV_LOG_ARCHIVE */ - if (!srv_very_fast_shutdown) { - /* In a 'very fast' shutdown we do not flush the buffer pool: - it is essentially a 'crash' of the InnoDB server. */ - log_make_checkpoint_at(ut_dulint_max, TRUE); - } else { - /* Make sure that the log is all flushed to disk, so that - we can recover all committed transactions in a crash - recovery */ - log_buffer_flush_to_disk(); - } mutex_enter(&(log_sys->mutex)); lsn = log_sys->lsn; - if ((ut_dulint_cmp(lsn, log_sys->last_checkpoint_lsn) != 0 - && !srv_very_fast_shutdown) + if ((ut_dulint_cmp(lsn, log_sys->last_checkpoint_lsn) != 0) #ifdef UNIV_LOG_ARCHIVE || (srv_log_archive_on && ut_dulint_cmp(lsn, @@ -3115,7 +3179,7 @@ loop: completely flushed to disk! (We do not call fil_write... if the 'very fast' shutdown is enabled.) */ - if (!srv_very_fast_shutdown && !buf_all_freed()) { + if (!buf_all_freed()) { goto loop; } @@ -3138,7 +3202,7 @@ loop: /* Make some checks that the server really is quiet */ ut_a(srv_n_threads_active[SRV_MASTER] == 0); - ut_a(srv_very_fast_shutdown || buf_all_freed()); + ut_a(buf_all_freed()); ut_a(0 == ut_dulint_cmp(lsn, log_sys->lsn)); if (ut_dulint_cmp(lsn, srv_start_lsn) < 0) { @@ -3153,15 +3217,7 @@ loop: srv_shutdown_lsn = lsn; - if (!srv_very_fast_shutdown) { - /* In a 'very fast' shutdown we do not flush the buffer pool: - it is essentially a 'crash' of the InnoDB server. Then we must - not write the lsn stamps to the data files, since at a - startup InnoDB deduces from the stamps if the previous - shutdown was clean. 
*/ - fil_write_flushed_lsn_to_data_files(lsn, arch_log_no); - } fil_flush_file_spaces(FIL_TABLESPACE); @@ -3169,7 +3225,7 @@ loop: /* Make some checks that the server really is quiet */ ut_a(srv_n_threads_active[SRV_MASTER] == 0); - ut_a(srv_very_fast_shutdown || buf_all_freed()); + ut_a(buf_all_freed()); ut_a(0 == ut_dulint_cmp(lsn, log_sys->lsn)); } diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c index ddb33de6fa6..42e854398ba 100644 --- a/innobase/log/log0recv.c +++ b/innobase/log/log0recv.c @@ -477,6 +477,7 @@ recv_find_max_checkpoint( max_no = ut_dulint_zero; *max_group = NULL; + *max_field = 0; buf = log_sys->checkpoint_buf; @@ -489,6 +490,7 @@ recv_find_max_checkpoint( log_group_read_checkpoint_info(group, field); if (!recv_check_cp_is_consistent(buf)) { +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "InnoDB: Checkpoint in group %lu at %lu invalid, %lu\n", @@ -498,7 +500,7 @@ recv_find_max_checkpoint( + LOG_CHECKPOINT_CHECKSUM_1)); } - +#endif /* UNIV_DEBUG */ goto not_consistent; } @@ -511,13 +513,15 @@ recv_find_max_checkpoint( checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO); +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "InnoDB: Checkpoint number %lu found in group %lu\n", (ulong) ut_dulint_get_low(checkpoint_no), (ulong) group->id); } - +#endif /* UNIV_DEBUG */ + if (ut_dulint_cmp(checkpoint_no, max_no) >= 0) { *max_group = group; *max_field = field; @@ -540,7 +544,6 @@ recv_find_max_checkpoint( "InnoDB: to create the InnoDB data files, but log file creation failed.\n" "InnoDB: If that is the case, please refer to\n" "InnoDB: http://dev.mysql.com/doc/mysql/en/Error_creating_InnoDB.html\n"); - return(DB_ERROR); } @@ -756,81 +759,143 @@ recv_parse_or_apply_log_rec_body( mtr_t* mtr) /* in: mtr or NULL; should be non-NULL if and only if page is non-NULL */ { - byte* new_ptr; - - if (type <= MLOG_8BYTES) { - new_ptr = mlog_parse_nbytes(type, ptr, end_ptr, page); - - } else if (type == MLOG_REC_INSERT) { - new_ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr, page, - mtr); - } else if (type == MLOG_REC_CLUST_DELETE_MARK) { - new_ptr = btr_cur_parse_del_mark_set_clust_rec(ptr, end_ptr, - page); - } else if (type == MLOG_REC_SEC_DELETE_MARK) { - new_ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr, - page); - } else if (type == MLOG_REC_UPDATE_IN_PLACE) { - new_ptr = btr_cur_parse_update_in_place(ptr, end_ptr, page); - - } else if ((type == MLOG_LIST_END_DELETE) - || (type == MLOG_LIST_START_DELETE)) { - new_ptr = page_parse_delete_rec_list(type, ptr, end_ptr, page, - mtr); - } else if (type == MLOG_LIST_END_COPY_CREATED) { - new_ptr = page_parse_copy_rec_list_to_created_page(ptr, - end_ptr, page, mtr); - } else if (type == MLOG_PAGE_REORGANIZE) { - new_ptr = btr_parse_page_reorganize(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_PAGE_CREATE) { - new_ptr = page_parse_create(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_UNDO_INSERT) { - new_ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page); - - } else if (type == MLOG_UNDO_ERASE_END) { - new_ptr = trx_undo_parse_erase_page_end(ptr, end_ptr, page, - mtr); - } else if (type == MLOG_UNDO_INIT) { - new_ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_UNDO_HDR_DISCARD) { - new_ptr = trx_undo_parse_discard_latest(ptr, end_ptr, page, - mtr); - } else if ((type == MLOG_UNDO_HDR_CREATE) - || (type == MLOG_UNDO_HDR_REUSE)) { - new_ptr = trx_undo_parse_page_header(type, ptr, end_ptr, page, - mtr); - } else if (type == MLOG_REC_MIN_MARK) 
{ - new_ptr = btr_parse_set_min_rec_mark(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_REC_DELETE) { - new_ptr = page_cur_parse_delete_rec(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_IBUF_BITMAP_INIT) { - new_ptr = ibuf_parse_bitmap_init(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_INIT_FILE_PAGE) { - new_ptr = fsp_parse_init_file_page(ptr, end_ptr, page); - - } else if (type == MLOG_WRITE_STRING) { - new_ptr = mlog_parse_string(ptr, end_ptr, page); - - } else if (type == MLOG_FILE_CREATE - || type == MLOG_FILE_RENAME - || type == MLOG_FILE_DELETE) { - new_ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, FALSE, + dict_index_t* index = NULL; + + switch (type) { + case MLOG_1BYTE: case MLOG_2BYTES: case MLOG_4BYTES: case MLOG_8BYTES: + ptr = mlog_parse_nbytes(type, ptr, end_ptr, page); + break; + case MLOG_REC_INSERT: case MLOG_COMP_REC_INSERT: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_INSERT, &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page)==index->table->comp); + ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr, + index, page, mtr); + } + break; + case MLOG_REC_CLUST_DELETE_MARK: case MLOG_COMP_REC_CLUST_DELETE_MARK: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_CLUST_DELETE_MARK, &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page)==index->table->comp); + ptr = btr_cur_parse_del_mark_set_clust_rec(ptr, + end_ptr, index, page); + } + break; + case MLOG_COMP_REC_SEC_DELETE_MARK: + /* This log record type is obsolete, but we process it for + backward compatibility with MySQL 5.0.3 and 5.0.4. */ + ut_a(!page || page_is_comp(page)); + ptr = mlog_parse_index(ptr, end_ptr, TRUE, &index); + if (!ptr) { + break; + } + /* Fall through */ + case MLOG_REC_SEC_DELETE_MARK: + ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr, page); + break; + case MLOG_REC_UPDATE_IN_PLACE: case MLOG_COMP_REC_UPDATE_IN_PLACE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_UPDATE_IN_PLACE, &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page)==index->table->comp); + ptr = btr_cur_parse_update_in_place(ptr, end_ptr, + page, index); + } + break; + case MLOG_LIST_END_DELETE: case MLOG_COMP_LIST_END_DELETE: + case MLOG_LIST_START_DELETE: case MLOG_COMP_LIST_START_DELETE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE, &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page)==index->table->comp); + ptr = page_parse_delete_rec_list(type, ptr, end_ptr, + index, page, mtr); + } + break; + case MLOG_LIST_END_COPY_CREATED: case MLOG_COMP_LIST_END_COPY_CREATED: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_LIST_END_COPY_CREATED, &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page)==index->table->comp); + ptr = page_parse_copy_rec_list_to_created_page(ptr, + end_ptr, index, page, mtr); + } + break; + case MLOG_PAGE_REORGANIZE: case MLOG_COMP_PAGE_REORGANIZE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_PAGE_REORGANIZE, &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page)==index->table->comp); + ptr = btr_parse_page_reorganize(ptr, end_ptr, index, + page, mtr); + } + break; + case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE: + ptr = page_parse_create(ptr, end_ptr, + type == MLOG_COMP_PAGE_CREATE, page, mtr); + break; + case MLOG_UNDO_INSERT: + ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page); + break; + case MLOG_UNDO_ERASE_END: + ptr = 
trx_undo_parse_erase_page_end(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_INIT: + ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_HDR_DISCARD: + ptr = trx_undo_parse_discard_latest(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_HDR_CREATE: + case MLOG_UNDO_HDR_REUSE: + ptr = trx_undo_parse_page_header(type, ptr, end_ptr, + page, mtr); + break; + case MLOG_REC_MIN_MARK: case MLOG_COMP_REC_MIN_MARK: + ptr = btr_parse_set_min_rec_mark(ptr, end_ptr, + type == MLOG_COMP_REC_MIN_MARK, page, mtr); + break; + case MLOG_REC_DELETE: case MLOG_COMP_REC_DELETE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_DELETE, &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page)==index->table->comp); + ptr = page_cur_parse_delete_rec(ptr, end_ptr, + index, page, mtr); + } + break; + case MLOG_IBUF_BITMAP_INIT: + ptr = ibuf_parse_bitmap_init(ptr, end_ptr, page, mtr); + break; + case MLOG_INIT_FILE_PAGE: + ptr = fsp_parse_init_file_page(ptr, end_ptr, page); + break; + case MLOG_WRITE_STRING: + ptr = mlog_parse_string(ptr, end_ptr, page); + break; + case MLOG_FILE_CREATE: + case MLOG_FILE_RENAME: + case MLOG_FILE_DELETE: + ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, FALSE, ULINT_UNDEFINED); - } else { - new_ptr = NULL; - + break; + default: + ptr = NULL; recv_sys->found_corrupt_log = TRUE; } - ut_ad(!page || new_ptr); + ut_ad(!page || ptr); + if (index) { + dict_table_t* table = index->table; + mem_heap_free(index->heap); + mutex_free(&(table->autoinc_mutex)); + mem_heap_free(table->heap); + } - return(new_ptr); + return(ptr); } /************************************************************************* @@ -1110,6 +1175,7 @@ recv_recover_page( } modification_to_page = FALSE; + start_lsn = end_lsn = ut_dulint_zero; recv = UT_LIST_GET_FIRST(recv_addr->rec_list); @@ -1143,6 +1209,7 @@ recv_recover_page( start_lsn = recv->start_lsn; } +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "InnoDB: Applying log rec type %lu len %lu to space %lu page no %lu\n", @@ -1150,6 +1217,7 @@ recv_recover_page( (ulong) recv_addr->space, (ulong) recv_addr->page_no); } +#endif /* UNIV_DEBUG */ recv_parse_or_apply_log_rec_body(recv->type, buf, buf + recv->len, page, &mtr); @@ -1394,7 +1462,7 @@ loop: /* This page is allocated from the buffer pool and used in the function below */ -page_t* recv_backup_application_page = NULL; +static page_t* recv_backup_application_page = NULL; /*********************************************************************** Applies log records in the hash table to a backup. 
*/ @@ -1736,6 +1804,8 @@ recv_parse_log_rec( { byte* new_ptr; + *body = NULL; + if (ptr == end_ptr) { return(0); @@ -1758,25 +1828,25 @@ recv_parse_log_rec( new_ptr = mlog_parse_initial_log_record(ptr, end_ptr, type, space, page_no); - if (!new_ptr) { + *body = new_ptr; + + if (UNIV_UNLIKELY(!new_ptr)) { return(0); } /* Check that page_no is sensible */ - if (*page_no > 0x8FFFFFFFUL) { + if (UNIV_UNLIKELY(*page_no > 0x8FFFFFFFUL)) { recv_sys->found_corrupt_log = TRUE; return(0); } - *body = new_ptr; - new_ptr = recv_parse_or_apply_log_rec_body(*type, new_ptr, end_ptr, NULL, NULL); - if (new_ptr == NULL) { + if (UNIV_UNLIKELY(new_ptr == NULL)) { return(0); } @@ -1970,12 +2040,14 @@ loop: recv_sys->recovered_offset += len; recv_sys->recovered_lsn = new_recovered_lsn; +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "InnoDB: Parsed a single log rec type %lu len %lu space %lu page no %lu\n", (ulong) type, (ulong) len, (ulong) space, (ulong) page_no); } +#endif /* UNIV_DEBUG */ if (type == MLOG_DUMMY_RECORD) { /* Do nothing */ @@ -2058,13 +2130,15 @@ loop: body, ptr + len); #endif /* UNIV_LOG_REPLICATE */ } - + +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "InnoDB: Parsed a multi log rec type %lu len %lu space %lu page no %lu\n", (ulong) type, (ulong) len, (ulong) space, (ulong) page_no); } +#endif /* UNIV_DEBUG */ total_len += len; n_recs++; @@ -2471,6 +2545,7 @@ recv_group_scan_log_recs( start_lsn = end_lsn; } +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "InnoDB: Scanned group %lu up to log sequence number %lu %lu\n", @@ -2478,6 +2553,7 @@ recv_group_scan_log_recs( (ulong) ut_dulint_get_high(*group_scanned_lsn), (ulong) ut_dulint_get_low(*group_scanned_lsn)); } +#endif /* UNIV_DEBUG */ } /************************************************************ @@ -2853,11 +2929,8 @@ void recv_recovery_from_checkpoint_finish(void) /*======================================*/ { - /* Rollback the uncommitted transactions which have no user session */ - - if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) { - trx_rollback_or_clean_all_without_sess(); - } + int i; + os_thread_id_t recovery_thread_id; /* Apply the hashed log records to the respective file pages */ @@ -2866,10 +2939,12 @@ recv_recovery_from_checkpoint_finish(void) recv_apply_hashed_log_recs(TRUE); } +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "InnoDB: Log records applied to the database\n"); } +#endif /* UNIV_DEBUG */ if (recv_needed_recovery) { trx_sys_print_mysql_master_log_pos(); @@ -2890,9 +2965,17 @@ recv_recovery_from_checkpoint_finish(void) /* Free the resources of the recovery system */ recv_recovery_on = FALSE; + #ifndef UNIV_LOG_DEBUG recv_sys_free(); #endif + if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) { + /* Rollback the uncommitted transactions which have no user + session */ + + os_thread_create(trx_rollback_or_clean_all_without_sess, + (void *)&i, &recovery_thread_id); + } } /********************************************************** @@ -3198,6 +3281,7 @@ ask_again: break; } +#ifdef UNIV_DEBUG if (log_debug_writes) { fprintf(stderr, "InnoDB: Archive read starting at lsn %lu %lu, len %lu from file %s\n", @@ -3205,6 +3289,7 @@ ask_again: (ulong) ut_dulint_get_low(start_lsn), (ulong) len, name); } +#endif /* UNIV_DEBUG */ fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->archive_space_id, read_offset / UNIV_PAGE_SIZE, diff --git a/innobase/mtr/mtr0log.c b/innobase/mtr/mtr0log.c index 82baa8905ba..0308619073a 100644 --- a/innobase/mtr/mtr0log.c +++ b/innobase/mtr/mtr0log.c @@ -15,6 
+15,7 @@ Created 12/7/1995 Heikki Tuuri #include "buf0buf.h" #include "dict0boot.h" #include "log0recv.h" +#include "page0page.h" /************************************************************ Catenates n bytes to the mtr log. */ @@ -22,9 +23,9 @@ Catenates n bytes to the mtr log. */ void mlog_catenate_string( /*=================*/ - mtr_t* mtr, /* in: mtr */ - byte* str, /* in: string to write */ - ulint len) /* in: string length */ + mtr_t* mtr, /* in: mtr */ + const byte* str, /* in: string to write */ + ulint len) /* in: string length */ { dyn_array_t* mlog; @@ -301,14 +302,15 @@ corresponding log record to the mini-transaction log. */ void mlog_write_string( /*==============*/ - byte* ptr, /* in: pointer where to write */ - byte* str, /* in: string to write */ - ulint len, /* in: string length */ - mtr_t* mtr) /* in: mini-transaction handle */ + byte* ptr, /* in: pointer where to write */ + const byte* str, /* in: string to write */ + ulint len, /* in: string length */ + mtr_t* mtr) /* in: mini-transaction handle */ { byte* log_ptr; - if (ptr < buf_pool->frame_zero || ptr >= buf_pool->high_end) { + if (UNIV_UNLIKELY(ptr < buf_pool->frame_zero) + || UNIV_UNLIKELY(ptr >= buf_pool->high_end)) { fprintf(stderr, "InnoDB: Error: trying to write to a stray memory location %p\n", ptr); ut_error; @@ -384,3 +386,165 @@ mlog_parse_string( return(ptr + len); } + +/************************************************************ +Opens a buffer for mlog, writes the initial log record and, +if needed, the field lengths of an index. */ + +byte* +mlog_open_and_write_index( +/*======================*/ + /* out: buffer, NULL if log mode + MTR_LOG_NONE */ + mtr_t* mtr, /* in: mtr */ + byte* rec, /* in: index record or page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: log item type */ + ulint size) /* in: requested buffer size in bytes + (if 0, calls mlog_close() and returns NULL) */ +{ + byte* log_ptr; + const byte* log_start; + const byte* log_end; + + ut_ad(!!page_rec_is_comp(rec) == index->table->comp); + + if (!page_rec_is_comp(rec)) { + log_start = log_ptr = mlog_open(mtr, 11 + size); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_ptr = mlog_write_initial_log_record_fast(rec, type, + log_ptr, mtr); + log_end = log_ptr + 11 + size; + } else { + ulint i; + ulint n = dict_index_get_n_fields(index); + /* total size needed */ + ulint total = 11 + size + (n + 2) * 2; + ulint alloc = total; + /* allocate at most DYN_ARRAY_DATA_SIZE at a time */ + if (alloc > DYN_ARRAY_DATA_SIZE) { + alloc = DYN_ARRAY_DATA_SIZE; + } + log_start = log_ptr = mlog_open(mtr, alloc); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_end = log_ptr + alloc; + log_ptr = mlog_write_initial_log_record_fast(rec, type, + log_ptr, mtr); + mach_write_to_2(log_ptr, n); + log_ptr += 2; + mach_write_to_2(log_ptr, + dict_index_get_n_unique_in_tree(index)); + log_ptr += 2; + for (i = 0; i < n; i++) { + dict_field_t* field; + dtype_t* type; + ulint len; + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + len = field->fixed_len; + ut_ad(len < 0x7fff); + if (len == 0 && (dtype_get_len(type) > 255 + || dtype_get_mtype(type) == DATA_BLOB)) { + /* variable-length field + with maximum length > 255 */ + len = 0x7fff; + } + if (dtype_get_prtype(type) & DATA_NOT_NULL) { + len |= 0x8000; + } + if (log_ptr + 2 > log_end) { + mlog_close(mtr, log_ptr); + ut_a(total > (ulint) (log_ptr - log_start)); + total -= log_ptr - log_start; + alloc = 
total; + if (alloc > DYN_ARRAY_DATA_SIZE) { + alloc = DYN_ARRAY_DATA_SIZE; + } + log_start = log_ptr = mlog_open(mtr, alloc); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_end = log_ptr + alloc; + } + mach_write_to_2(log_ptr, len); + log_ptr += 2; + } + } + if (size == 0) { + mlog_close(mtr, log_ptr); + log_ptr = NULL; + } else if (log_ptr + size > log_end) { + mlog_close(mtr, log_ptr); + log_ptr = mlog_open(mtr, size); + } + return(log_ptr); +} + +/************************************************************ +Parses a log record written by mlog_open_and_write_index. */ + +byte* +mlog_parse_index( +/*=============*/ + /* out: parsed record end, + NULL if not a complete record */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + /* out: new value of log_ptr */ + ibool comp, /* in: TRUE=compact record format */ + dict_index_t** index) /* out, own: dummy index */ +{ + ulint i, n, n_uniq; + dict_table_t* table; + dict_index_t* ind; + + ut_ad(comp == FALSE || comp == TRUE); + + if (comp) { + if (end_ptr < ptr + 4) { + return(NULL); + } + n = mach_read_from_2(ptr); + ptr += 2; + n_uniq = mach_read_from_2(ptr); + ut_ad(n_uniq <= n); + if (end_ptr < ptr + (n + 1) * 2) { + return(NULL); + } + } else { + n = n_uniq = 1; + } + table = dict_mem_table_create("LOG_DUMMY", DICT_HDR_SPACE, n, comp); + ind = dict_mem_index_create("LOG_DUMMY", "LOG_DUMMY", + DICT_HDR_SPACE, 0, n); + ind->table = table; + ind->n_uniq = n_uniq; + if (n_uniq != n) { + ind->type = DICT_CLUSTERED; + } + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + ind->cached = TRUE; + if (comp) { + for (i = 0; i < n; i++) { + ulint len = mach_read_from_2(ptr += 2); + /* The high-order bit of len is the NOT NULL flag; + the rest is 0 or 0x7fff for variable-length fields, + and 1..0x7ffe for fixed-length fields. */ + dict_mem_table_add_col(table, "DUMMY", + ((len + 1) & 0x7fff) <= 1 + ? DATA_BINARY + : DATA_FIXBINARY, + len & 0x8000 ? 
DATA_NOT_NULL : 0, + len & 0x7fff, 0); + dict_index_add_col(ind, + dict_table_get_nth_col(table, i), 0, 0); + } + ptr += 2; + } + *index = ind; + return(ptr); +} diff --git a/innobase/mtr/mtr0mtr.c b/innobase/mtr/mtr0mtr.c index 6e918806eb1..da045be1f62 100644 --- a/innobase/mtr/mtr0mtr.c +++ b/innobase/mtr/mtr0mtr.c @@ -48,16 +48,11 @@ mtr_memo_slot_release( object = slot->object; type = slot->type; - if (object != NULL) { + if (UNIV_LIKELY(object != NULL)) { if (type <= MTR_MEMO_BUF_FIX) { buf_page_release((buf_block_t*)object, type, mtr); } else if (type == MTR_MEMO_S_LOCK) { rw_lock_s_unlock((rw_lock_t*)object); -#ifndef UNIV_DEBUG - } else { - rw_lock_x_unlock((rw_lock_t*)object); - } -#endif #ifdef UNIV_DEBUG } else if (type == MTR_MEMO_X_LOCK) { rw_lock_x_unlock((rw_lock_t*)object); @@ -65,8 +60,11 @@ mtr_memo_slot_release( ut_ad(type == MTR_MEMO_MODIFY); ut_ad(mtr_memo_contains(mtr, object, MTR_MEMO_PAGE_X_FIX)); - } +#else + } else { + rw_lock_x_unlock((rw_lock_t*)object); #endif + } } slot->object = NULL; diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index 49f88c0d62a..9c87b59f018 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -15,6 +15,13 @@ Created 10/21/1995 Heikki Tuuri #include "fil0fil.h" #include "buf0buf.h" +#if defined(UNIV_HOTBACKUP) && defined(__WIN__) +/* Add includes for the _stat() call to compile on Windows */ +#include <sys/types.h> +#include <sys/stat.h> +#include <errno.h> +#endif /* UNIV_HOTBACKUP */ + #undef HAVE_FDATASYNC #ifdef POSIX_ASYNC_IO @@ -80,7 +87,7 @@ struct os_aio_slot_struct{ made and only the slot message needs to be passed to the caller of os_aio_simulated_handle */ - void* message1; /* message which is given by the */ + fil_node_t* message1; /* message which is given by the */ void* message2; /* the requester of an aio operation and which can be used to identify which pending aio operation was @@ -130,17 +137,17 @@ os_event_t* os_aio_segment_wait_events = NULL; /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These are NULL when the module has not yet been initialized. */ -os_aio_array_t* os_aio_read_array = NULL; -os_aio_array_t* os_aio_write_array = NULL; -os_aio_array_t* os_aio_ibuf_array = NULL; -os_aio_array_t* os_aio_log_array = NULL; -os_aio_array_t* os_aio_sync_array = NULL; +static os_aio_array_t* os_aio_read_array = NULL; +static os_aio_array_t* os_aio_write_array = NULL; +static os_aio_array_t* os_aio_ibuf_array = NULL; +static os_aio_array_t* os_aio_log_array = NULL; +static os_aio_array_t* os_aio_sync_array = NULL; -ulint os_aio_n_segments = ULINT_UNDEFINED; +static ulint os_aio_n_segments = ULINT_UNDEFINED; /* If the following is TRUE, read i/o handler threads try to wait until a batch of new read requests have been posted */ -ibool os_aio_recommend_sleep_for_read_threads = FALSE; +static ibool os_aio_recommend_sleep_for_read_threads = FALSE; ulint os_n_file_reads = 0; ulint os_bytes_read_since_printout = 0; @@ -155,10 +162,14 @@ ibool os_has_said_disk_full = FALSE; /* The mutex protecting the following counts of pending pread and pwrite operations */ -os_mutex_t os_file_count_mutex; +static os_mutex_t os_file_count_mutex; ulint os_file_n_pending_preads = 0; ulint os_file_n_pending_pwrites = 0; +/* These are not protected by any mutex */ +ulint os_n_pending_writes = 0; +ulint os_n_pending_reads = 0; + /*************************************************************************** Gets the operating system version. Currently works only on Windows. 
*/ @@ -598,7 +609,7 @@ os_file_opendir( lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA)); - dir = FindFirstFile(path, lpFindFileData); + dir = FindFirstFile((LPCTSTR) path, lpFindFileData); ut_free(lpFindFileData); @@ -679,15 +690,15 @@ next_file: ret = FindNextFile(dir, lpFindFileData); if (ret) { - ut_a(strlen(lpFindFileData->cFileName) < OS_FILE_MAX_PATH); + ut_a(strlen((char *) lpFindFileData->cFileName) < OS_FILE_MAX_PATH); - if (strcmp(lpFindFileData->cFileName, ".") == 0 - || strcmp(lpFindFileData->cFileName, "..") == 0) { + if (strcmp((char *) lpFindFileData->cFileName, ".") == 0 + || strcmp((char *) lpFindFileData->cFileName, "..") == 0) { goto next_file; } - strcpy(info->name, lpFindFileData->cFileName); + strcpy(info->name, (char *) lpFindFileData->cFileName); info->size = (ib_longlong)(lpFindFileData->nFileSizeLow) + (((ib_longlong)(lpFindFileData->nFileSizeHigh)) << 32); @@ -823,7 +834,7 @@ os_file_create_directory( #ifdef __WIN__ BOOL rcode; - rcode = CreateDirectory(pathname, NULL); + rcode = CreateDirectory((LPCTSTR) pathname, NULL); if (!(rcode != 0 || (GetLastError() == ERROR_ALREADY_EXISTS && !fail_if_exists))) { /* failure */ @@ -907,7 +918,7 @@ try_again: ut_error; } - file = CreateFile(name, + file = CreateFile((LPCTSTR) name, access, FILE_SHARE_READ | FILE_SHARE_WRITE, /* file can be read ansd written also @@ -1046,7 +1057,7 @@ os_file_create_simple_no_error_handling( ut_error; } - file = CreateFile(name, + file = CreateFile((LPCTSTR) name, access, share_mode, NULL, /* default security attributes */ @@ -1193,7 +1204,7 @@ try_again: ut_error; } - file = CreateFile(name, + file = CreateFile((LPCTSTR) name, GENERIC_READ | GENERIC_WRITE, /* read and write access */ share_mode, /* File can be read also by other @@ -2085,8 +2096,12 @@ try_again: goto error_handling; } + os_n_pending_reads++; + ret = ReadFile(file, buf, (DWORD) n, &len, NULL); + os_n_pending_reads--; + os_mutex_exit(os_file_seek_mutexes[i]); if (ret && len == n) { @@ -2099,8 +2114,12 @@ try_again: os_bytes_read_since_printout += n; try_again: + os_n_pending_reads++; + ret = os_file_pread(file, buf, n, offset, offset_high); + os_n_pending_reads--; + if ((ulint)ret == n) { return(TRUE); @@ -2188,8 +2207,12 @@ try_again: goto error_handling; } + os_n_pending_reads++; + ret = ReadFile(file, buf, (DWORD) n, &len, NULL); + os_n_pending_reads--; + os_mutex_exit(os_file_seek_mutexes[i]); if (ret && len == n) { @@ -2202,8 +2225,12 @@ try_again: os_bytes_read_since_printout += n; try_again: + os_n_pending_reads++; + ret = os_file_pread(file, buf, n, offset, offset_high); + os_n_pending_reads--; + if ((ulint)ret == n) { return(TRUE); @@ -2285,7 +2312,11 @@ retry: return(FALSE); } + os_n_pending_writes++; + ret = WriteFile(file, buf, (DWORD) n, &len, NULL); + + os_n_pending_writes--; /* Always do fsync to reduce the probability that when the OS crashes, a database page is only partially physically written to disk. 
*/ @@ -2348,8 +2379,12 @@ retry: #else ssize_t ret; + os_n_pending_writes++; + ret = os_file_pwrite(file, buf, n, offset, offset_high); + os_n_pending_writes--; + if ((ulint)ret == n) { return(TRUE); @@ -2999,7 +3034,7 @@ os_aio_array_reserve_slot( /* out: pointer to slot */ ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */ os_aio_array_t* array, /* in: aio array */ - void* message1,/* in: message to be passed along with + fil_node_t* message1,/* in: message to be passed along with the aio operation */ void* message2,/* in: message to be passed along with the aio operation */ @@ -3261,7 +3296,7 @@ os_aio( ulint offset_high, /* in: most significant 32 bits of offset */ ulint n, /* in: number of bytes to read or write */ - void* message1,/* in: messages for the aio handler (these + fil_node_t* message1,/* in: messages for the aio handler (these can be used to identify a completed aio operation); if mode is OS_AIO_SYNC, these are ignored */ @@ -3273,7 +3308,7 @@ os_aio( ibool retval; BOOL ret = TRUE; DWORD len = (DWORD) n; - void* dummy_mess1; + struct fil_node_struct * dummy_mess1; void* dummy_mess2; ulint dummy_type; #endif @@ -3446,7 +3481,7 @@ os_aio_windows_handle( ignored */ ulint pos, /* this parameter is used only in sync aio: wait for the aio slot at this position */ - void** message1, /* out: the messages passed with the aio + fil_node_t**message1, /* out: the messages passed with the aio request; note that also in the case where the aio operation failed, these output parameters are valid and can be used to @@ -3539,7 +3574,7 @@ os_aio_posix_handle( /*================*/ /* out: TRUE if the aio operation succeeded */ ulint array_no, /* in: array number 0 - 3 */ - void** message1, /* out: the messages passed with the aio + fil_node_t**message1, /* out: the messages passed with the aio request; note that also in the case where the aio operation failed, these output parameters are valid and can be used to @@ -3622,7 +3657,7 @@ os_aio_simulated_handle( i/o thread, segment 1 the log i/o thread, then follow the non-ibuf read threads, and as the last are the non-ibuf write threads */ - void** message1, /* out: the messages passed with the aio + fil_node_t**message1, /* out: the messages passed with the aio request; note that also in the case where the aio operation failed, these output parameters are valid and can be used to @@ -4160,6 +4195,7 @@ os_aio_refresh_stats(void) os_last_printout = time(NULL); } +#ifdef UNIV_DEBUG /************************************************************************** Checks that all slots in the system have been freed, that is, there are no pending io operations. */ @@ -4219,3 +4255,4 @@ os_aio_all_slots_free(void) return(FALSE); } +#endif /* UNIV_DEBUG */ diff --git a/innobase/os/os0proc.c b/innobase/os/os0proc.c index 2f155788420..167aed93de7 100644 --- a/innobase/os/os0proc.c +++ b/innobase/os/os0proc.c @@ -69,6 +69,10 @@ byte* os_awe_window; ulint os_awe_window_size; #endif +ibool os_use_large_pages; +/* Large page size. This may be a boot-time option on some platforms */ +ulint os_large_page_size; + /******************************************************************** Windows AWE support. Tries to enable the "lock pages in memory" privilege for the current process so that the current process can allocate memory-locked @@ -516,6 +520,89 @@ os_mem_alloc_nocache( } /******************************************************************** +Allocates large pages memory. 
*/ + +void* +os_mem_alloc_large( +/*=================*/ + /* out: allocated memory */ + ulint n, /* in: number of bytes */ + ibool set_to_zero, /* in: TRUE if allocated memory should be set + to zero if UNIV_SET_MEM_TO_ZERO is defined */ + ibool assert_on_error) /* in: if TRUE, we crash mysqld if the memory + cannot be allocated */ +{ +#ifdef HAVE_LARGE_PAGES + ulint size; + int shmid; + void *ptr = NULL; + struct shmid_ds buf; + + if (!os_use_large_pages || !os_large_page_size) { + goto skip; + } + +#ifdef UNIV_LINUX + /* Align block size to os_large_page_size */ + size = ((n - 1) & ~(os_large_page_size - 1)) + os_large_page_size; + + shmid = shmget(IPC_PRIVATE, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W); + if (shmid < 0) { + fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. " + "errno %d\n", n, errno); + } else { + ptr = shmat(shmid, NULL, 0); + if (ptr == (void *)-1) { + fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to attach shared memory " + "segment, errno %d\n", errno); + } + /* + Remove the shared memory segment so that it will be automatically freed + after memory is detached or process exits + */ + shmctl(shmid, IPC_RMID, &buf); + } +#endif + + if (ptr) { + if (set_to_zero) { +#ifdef UNIV_SET_MEM_TO_ZERO + memset(ptr, '\0', size); +#endif + } + + return(ptr); + } + + fprintf(stderr, "InnoDB HugeTLB: Warning: Using conventional memory pool\n"); +skip: +#endif /* HAVE_LARGE_PAGES */ + + return(ut_malloc_low(n, set_to_zero, assert_on_error)); +} + +/******************************************************************** +Frees large pages memory. */ + +void +os_mem_free_large( +/*=================*/ + void *ptr) /* in: number of bytes */ +{ +#ifdef HAVE_LARGE_PAGES + if (os_use_large_pages && os_large_page_size +#ifdef UNIV_LINUX + && !shmdt(ptr) +#endif + ) { + return; + } +#endif + + ut_free(ptr); +} + +/******************************************************************** Sets the priority boost for threads released from waiting within the current process. 
*/ diff --git a/innobase/os/os0sync.c b/innobase/os/os0sync.c index 18d92af5054..356d7c8c163 100644 --- a/innobase/os/os0sync.c +++ b/innobase/os/os0sync.c @@ -121,7 +121,7 @@ os_event_create( event->handle = CreateEvent(NULL,/* No security attributes */ TRUE, /* Manual reset */ FALSE, /* Initial state nonsignaled */ - name); + (LPCTSTR) name); if (!event->handle) { fprintf(stderr, "InnoDB: Could not create a Windows event semaphore; Windows error %lu\n", @@ -177,7 +177,7 @@ os_event_create_auto( event->handle = CreateEvent(NULL,/* No security attributes */ FALSE, /* Auto-reset */ FALSE, /* Initial state nonsignaled */ - name); + (LPCTSTR) name); if (!event->handle) { fprintf(stderr, @@ -440,7 +440,7 @@ os_mutex_create( mutex = CreateMutex(NULL, /* No security attributes */ FALSE, /* Initial state: no owner */ - name); + (LPCTSTR) name); ut_a(mutex); #else os_fast_mutex_t* mutex; diff --git a/innobase/os/os0thread.c b/innobase/os/os0thread.c index 0278e3b2b66..847d0ee1cc7 100644 --- a/innobase/os/os0thread.c +++ b/innobase/os/os0thread.c @@ -88,7 +88,7 @@ os_thread_create( /*=============*/ /* out: handle to the thread */ #ifndef __WIN__ - os_posix_f_t start_f, + os_posix_f_t start_f, #else ulint (*start_f)(void*), /* in: pointer to function from which to start */ diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c index 459ab986610..d0b89e81787 100644 --- a/innobase/page/page0cur.c +++ b/innobase/page/page0cur.c @@ -29,7 +29,9 @@ UNIV_INLINE ibool page_cur_try_search_shortcut( /*=========================*/ + /* out: TRUE on success */ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint* iup_matched_fields, /* in/out: already matched fields in upper @@ -45,7 +47,6 @@ page_cur_try_search_shortcut( not yet completely matched */ page_cur_t* cursor) /* out: page cursor */ { - int cmp; rec_t* rec; rec_t* next_rec; ulint low_match; @@ -55,9 +56,17 @@ page_cur_try_search_shortcut( #ifdef UNIV_SEARCH_DEBUG page_cur_t cursor2; #endif + ibool success = FALSE; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + ut_ad(dtuple_check_typed(tuple)); rec = page_header_get_ptr(page, PAGE_LAST_INSERT); + offsets = rec_get_offsets(rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); ut_ad(rec); ut_ad(page_rec_is_user_rec(rec)); @@ -69,26 +78,24 @@ page_cur_try_search_shortcut( up_match = low_match; up_bytes = low_bytes; - cmp = page_cmp_dtuple_rec_with_match(tuple, rec, &low_match, - &low_bytes); - if (cmp == -1) { - - return(FALSE); + if (page_cmp_dtuple_rec_with_match(tuple, rec, offsets, + &low_match, &low_bytes) < 0) { + goto exit_func; } next_rec = page_rec_get_next(rec); + offsets = rec_get_offsets(next_rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); - cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, &up_match, - &up_bytes); - if (cmp != -1) { - - return(FALSE); + if (page_cmp_dtuple_rec_with_match(tuple, next_rec, offsets, + &up_match, &up_bytes) >= 0) { + goto exit_func; } cursor->rec = rec; #ifdef UNIV_SEARCH_DEBUG - page_cur_search_with_match(page, tuple, PAGE_CUR_DBG, + page_cur_search_with_match(page, index, tuple, PAGE_CUR_DBG, iup_matched_fields, iup_matched_bytes, ilow_matched_fields, @@ -105,7 +112,7 @@ page_cur_try_search_shortcut( ut_a(*ilow_matched_fields == low_match); ut_a(*ilow_matched_bytes == low_bytes); #endif - if (next_rec != page_get_supremum_rec(page)) { + if 
(!page_rec_is_supremum(next_rec)) { *iup_matched_fields = up_match; *iup_matched_bytes = up_bytes; @@ -117,11 +124,17 @@ page_cur_try_search_shortcut( #ifdef UNIV_SEARCH_PERF_STAT page_cur_short_succ++; #endif - return(TRUE); + success = TRUE; +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); } #endif +#ifdef PAGE_CUR_LE_OR_EXTENDS /******************************************************************** Checks if the nth field in a record is a character type field which extends the nth field in tuple, i.e., the field is longer or equal in length and has @@ -130,22 +143,24 @@ static ibool page_cur_rec_field_extends( /*=======================*/ - /* out: TRUE if rec field extends tuple - field */ - dtuple_t* tuple, /* in: data tuple */ - rec_t* rec, /* in: record */ - ulint n) /* in: compare nth field */ + /* out: TRUE if rec field + extends tuple field */ + dtuple_t* tuple, /* in: data tuple */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: compare nth field */ { dtype_t* type; dfield_t* dfield; byte* rec_f; ulint rec_f_len; + ut_ad(rec_offs_validate(rec, NULL, offsets)); dfield = dtuple_get_nth_field(tuple, n); type = dfield_get_type(dfield); - rec_f = rec_get_nth_field(rec, n, &rec_f_len); + rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len); if (type->mtype == DATA_VARCHAR || type->mtype == DATA_CHAR @@ -168,6 +183,7 @@ page_cur_rec_field_extends( return(FALSE); } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ /******************************************************************** Searches the right position for a page cursor. */ @@ -176,6 +192,7 @@ void page_cur_search_with_match( /*=======================*/ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -212,14 +229,26 @@ page_cur_search_with_match( ulint dbg_matched_fields; ulint dbg_matched_bytes; #endif + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + ut_ad(page && tuple && iup_matched_fields && iup_matched_bytes && ilow_matched_fields && ilow_matched_bytes && cursor); ut_ad(dtuple_validate(tuple)); ut_ad(dtuple_check_typed(tuple)); +#ifdef UNIV_DEBUG +# ifdef PAGE_CUR_DBG + if (mode != PAGE_CUR_DBG) +# endif /* PAGE_CUR_DBG */ +# ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode != PAGE_CUR_LE_OR_EXTENDS) +# endif /* PAGE_CUR_LE_OR_EXTENDS */ ut_ad((mode == PAGE_CUR_L) || (mode == PAGE_CUR_LE) - || (mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE) - || (mode == PAGE_CUR_LE_OR_EXTENDS) || (mode == PAGE_CUR_DBG)); - + || (mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)); +#endif /* UNIV_DEBUG */ + page_check_dir(page); #ifdef PAGE_CUR_ADAPT @@ -229,7 +258,7 @@ page_cur_search_with_match( && (page_header_get_ptr(page, PAGE_LAST_INSERT)) && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) { - if (page_cur_try_search_shortcut(page, tuple, + if (page_cur_try_search_shortcut(page, index, tuple, iup_matched_fields, iup_matched_bytes, ilow_matched_fields, @@ -238,16 +267,18 @@ page_cur_search_with_match( return; } } -/*#ifdef UNIV_SEARCH_DEBUG */ +# ifdef PAGE_CUR_DBG if (mode == PAGE_CUR_DBG) { mode = PAGE_CUR_LE; } -/*#endif */ +# endif #endif /* The following flag does not work for non-latin1 char sets because cmp_full_field does not tell how many bytes matched */ +#ifdef PAGE_CUR_LE_OR_EXTENDS 
ut_a(mode != PAGE_CUR_LE_OR_EXTENDS); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ /* If mode PAGE_CUR_G is specified, we are trying to position the cursor to answer a query of the form "tuple < X", where tuple is @@ -279,37 +310,42 @@ page_cur_search_with_match( low_matched_fields, low_matched_bytes, up_matched_fields, up_matched_bytes); - cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, &cur_matched_fields, &cur_matched_bytes); - if (cmp == 1) { + if (UNIV_LIKELY(cmp > 0)) { +low_slot_match: low = mid; low_matched_fields = cur_matched_fields; low_matched_bytes = cur_matched_bytes; - } else if (cmp == -1) { - + } else if (UNIV_LIKELY(cmp /* == -1 */)) { +#ifdef PAGE_CUR_LE_OR_EXTENDS if (mode == PAGE_CUR_LE_OR_EXTENDS && page_cur_rec_field_extends(tuple, mid_rec, - cur_matched_fields)) { - low = mid; - low_matched_fields = cur_matched_fields; - low_matched_bytes = cur_matched_bytes; - } else { - up = mid; - up_matched_fields = cur_matched_fields; - up_matched_bytes = cur_matched_bytes; - } + offsets, cur_matched_fields)) { - } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE - || mode == PAGE_CUR_LE_OR_EXTENDS) { - low = mid; - low_matched_fields = cur_matched_fields; - low_matched_bytes = cur_matched_bytes; - } else { + goto low_slot_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_slot_match: up = mid; up_matched_fields = cur_matched_fields; up_matched_bytes = cur_matched_bytes; + + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + + goto low_slot_match; + } else { + + goto up_slot_match; } } @@ -329,35 +365,41 @@ page_cur_search_with_match( low_matched_fields, low_matched_bytes, up_matched_fields, up_matched_bytes); - cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, &cur_matched_fields, &cur_matched_bytes); - if (cmp == 1) { + if (UNIV_LIKELY(cmp > 0)) { +low_rec_match: low_rec = mid_rec; low_matched_fields = cur_matched_fields; low_matched_bytes = cur_matched_bytes; - } else if (cmp == -1) { + } else if (UNIV_LIKELY(cmp /* == -1 */)) { +#ifdef PAGE_CUR_LE_OR_EXTENDS if (mode == PAGE_CUR_LE_OR_EXTENDS && page_cur_rec_field_extends(tuple, mid_rec, - cur_matched_fields)) { - low_rec = mid_rec; - low_matched_fields = cur_matched_fields; - low_matched_bytes = cur_matched_bytes; - } else { - up_rec = mid_rec; - up_matched_fields = cur_matched_fields; - up_matched_bytes = cur_matched_bytes; + offsets, cur_matched_fields)) { + + goto low_rec_match; } - } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE - || mode == PAGE_CUR_LE_OR_EXTENDS) { - low_rec = mid_rec; - low_matched_fields = cur_matched_fields; - low_matched_bytes = cur_matched_bytes; - } else { +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_rec_match: up_rec = mid_rec; up_matched_fields = cur_matched_fields; up_matched_bytes = cur_matched_bytes; + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + + goto low_rec_match; + } else { + + goto up_rec_match; } } @@ -368,7 +410,9 @@ page_cur_search_with_match( dbg_matched_fields = 0; dbg_matched_bytes = 0; - dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, + 
offsets = rec_get_offsets(low_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, offsets, &dbg_matched_fields, &dbg_matched_bytes); if (mode == PAGE_CUR_G) { @@ -390,7 +434,9 @@ page_cur_search_with_match( dbg_matched_fields = 0; dbg_matched_bytes = 0; - dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, + offsets = rec_get_offsets(up_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, offsets, &dbg_matched_fields, &dbg_matched_bytes); if (mode == PAGE_CUR_G) { @@ -419,6 +465,9 @@ page_cur_search_with_match( *iup_matched_bytes = up_matched_bytes; *ilow_matched_fields = low_matched_fields; *ilow_matched_bytes = low_matched_bytes; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } } /*************************************************************** @@ -463,10 +512,12 @@ static void page_cur_insert_rec_write_log( /*==========================*/ - rec_t* insert_rec, /* in: inserted physical record */ - ulint rec_size, /* in: insert_rec size */ - rec_t* cursor_rec, /* in: record the cursor is pointing to */ - mtr_t* mtr) /* in: mini-transaction handle */ + rec_t* insert_rec, /* in: inserted physical record */ + ulint rec_size, /* in: insert_rec size */ + rec_t* cursor_rec, /* in: record the + cursor is pointing to */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mini-transaction handle */ { ulint cur_rec_size; ulint extra_size; @@ -476,22 +527,40 @@ page_cur_insert_rec_write_log( byte* cur_ptr; ulint extra_info_yes; byte* log_ptr; + byte* log_end; ulint i; + ulint comp; ut_a(rec_size < UNIV_PAGE_SIZE); - ut_ad(rec_size == rec_get_size(insert_rec)); + ut_ad(buf_frame_align(insert_rec) == buf_frame_align(cursor_rec)); + ut_ad(!page_rec_is_comp(insert_rec) == !index->table->comp); + comp = page_rec_is_comp(insert_rec); - log_ptr = mlog_open(mtr, 30 + MLOG_BUF_MARGIN); + { + mem_heap_t* heap = NULL; + ulint cur_offs_[REC_OFFS_NORMAL_SIZE]; + ulint ins_offs_[REC_OFFS_NORMAL_SIZE]; - if (log_ptr == NULL) { + ulint* cur_offs; + ulint* ins_offs; - return; - } + *cur_offs_ = (sizeof cur_offs_) / sizeof *cur_offs_; + *ins_offs_ = (sizeof ins_offs_) / sizeof *ins_offs_; + + cur_offs = rec_get_offsets(cursor_rec, index, cur_offs_, + ULINT_UNDEFINED, &heap); + ins_offs = rec_get_offsets(insert_rec, index, ins_offs_, + ULINT_UNDEFINED, &heap); - extra_size = rec_get_extra_size(insert_rec); + extra_size = rec_offs_extra_size(ins_offs); + cur_extra_size = rec_offs_extra_size(cur_offs); + ut_ad(rec_size == rec_offs_size(ins_offs)); + cur_rec_size = rec_offs_size(cur_offs); - cur_extra_size = rec_get_extra_size(cursor_rec); - cur_rec_size = rec_get_size(cursor_rec); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } ins_ptr = insert_rec - extra_size; @@ -514,7 +583,9 @@ page_cur_insert_rec_write_log( ins_ptr++; cur_ptr++; } else if ((i < extra_size) - && (i >= extra_size - REC_N_EXTRA_BYTES)) { + && (i >= extra_size - (comp + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES))) { i = extra_size; ins_ptr = insert_rec; cur_ptr = cursor_rec; @@ -525,16 +596,35 @@ page_cur_insert_rec_write_log( } if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) { - - log_ptr = mlog_write_initial_log_record_fast(insert_rec, - MLOG_REC_INSERT, log_ptr, mtr); + + log_ptr = mlog_open_and_write_index(mtr, insert_rec, index, + comp + ? 
MLOG_COMP_REC_INSERT : MLOG_REC_INSERT, + 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash + recovery: in that case mlog_open returns NULL */ + return; + } + + log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; /* Write the cursor rec offset as a 2-byte ulint */ mach_write_to_2(log_ptr, cursor_rec - buf_frame_align(cursor_rec)); log_ptr += 2; + } else { + log_ptr = mlog_open(mtr, 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (!log_ptr) { + /* Logging in mtr is switched off during crash + recovery: in that case mlog_open returns NULL */ + return; + } + log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; } - if ((rec_get_info_bits(insert_rec) != rec_get_info_bits(cursor_rec)) + if ((rec_get_info_and_status_bits(insert_rec, comp) != + rec_get_info_and_status_bits(cursor_rec, comp)) || (extra_size != cur_extra_size) || (rec_size != cur_rec_size)) { @@ -549,7 +639,8 @@ page_cur_insert_rec_write_log( + extra_info_yes); if (extra_info_yes) { /* Write the info bits */ - mach_write_to_1(log_ptr, rec_get_info_bits(insert_rec)); + mach_write_to_1(log_ptr, + rec_get_info_and_status_bits(insert_rec, comp)); log_ptr++; /* Write the record origin offset */ @@ -565,17 +656,15 @@ page_cur_insert_rec_write_log( /* Write to the log the inserted index record end segment which differs from the cursor record */ - if (rec_size - i < MLOG_BUF_MARGIN) { - ut_memcpy(log_ptr, ins_ptr, rec_size - i); - log_ptr += rec_size - i; - } - - mlog_close(mtr, log_ptr); + rec_size -= i; - ut_a(rec_size - i < UNIV_PAGE_SIZE); - - if (rec_size - i >= MLOG_BUF_MARGIN) { - mlog_catenate_string(mtr, ins_ptr, rec_size - i); + if (log_ptr + rec_size <= log_end) { + memcpy(log_ptr, ins_ptr, rec_size); + mlog_close(mtr, log_ptr + rec_size); + } else { + mlog_close(mtr, log_ptr); + ut_a(rec_size < UNIV_PAGE_SIZE); + mlog_catenate_string(mtr, ins_ptr, rec_size); } } @@ -585,12 +674,13 @@ Parses a log record of a record insert on a page. 
*/ byte* page_cur_parse_insert_rec( /*======================*/ - /* out: end of log record or NULL */ - ibool is_short,/* in: TRUE if short inserts */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + ibool is_short,/* in: TRUE if short inserts */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ulint extra_info_yes; ulint offset = 0; /* remove warning */ @@ -601,8 +691,12 @@ page_cur_parse_insert_rec( byte buf1[1024]; byte* buf; byte* ptr2 = ptr; - ulint info_bits = 0; /* remove warning */ + ulint info_and_status_bits = 0; /* remove warning */ page_cur_t cursor; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; if (!is_short) { /* Read the cursor rec offset as a 2-byte ulint */ @@ -648,7 +742,7 @@ page_cur_parse_insert_rec( return(NULL); } - info_bits = mach_read_from_1(ptr); + info_and_status_bits = mach_read_from_1(ptr); ptr++; ptr = mach_parse_compressed(ptr, end_ptr, &origin_offset); @@ -680,6 +774,8 @@ page_cur_parse_insert_rec( return(ptr + end_seg_len); } + ut_ad(!!page_is_comp(page) == index->table->comp); + /* Read from the log the inserted index record end segment which differs from the cursor record */ @@ -689,11 +785,15 @@ page_cur_parse_insert_rec( cursor_rec = page + offset; } + offsets = rec_get_offsets(cursor_rec, index, offsets, + ULINT_UNDEFINED, &heap); + if (extra_info_yes == 0) { - info_bits = rec_get_info_bits(cursor_rec); - origin_offset = rec_get_extra_size(cursor_rec); - mismatch_index = rec_get_size(cursor_rec) - end_seg_len; - } + info_and_status_bits = rec_get_info_and_status_bits( + cursor_rec, page_is_comp(page)); + origin_offset = rec_offs_extra_size(offsets); + mismatch_index = rec_offs_size(offsets) - end_seg_len; + } if (mismatch_index + end_seg_len < sizeof buf1) { buf = buf1; @@ -705,11 +805,12 @@ page_cur_parse_insert_rec( if (mismatch_index >= UNIV_PAGE_SIZE) { fprintf(stderr, - "Is short %lu, info_bits %lu, offset %lu, " + "Is short %lu, info_and_status_bits %lu, offset %lu, " "o_offset %lu\n" "mismatch index %lu, end_seg_len %lu\n" "parsed len %lu\n", - (ulong) is_short, (ulong) info_bits, (ulong) offset, + (ulong) is_short, (ulong) info_and_status_bits, + (ulong) offset, (ulong) origin_offset, (ulong) mismatch_index, (ulong) end_seg_len, (ulong) (ptr - ptr2)); @@ -722,20 +823,27 @@ page_cur_parse_insert_rec( ut_error; } - ut_memcpy(buf, rec_get_start(cursor_rec), mismatch_index); + ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index); ut_memcpy(buf + mismatch_index, ptr, end_seg_len); - rec_set_info_bits(buf + origin_offset, info_bits); + rec_set_info_and_status_bits(buf + origin_offset, page_is_comp(page), + info_and_status_bits); page_cur_position(cursor_rec, &cursor); - page_cur_rec_insert(&cursor, buf + origin_offset, mtr); + offsets = rec_get_offsets(buf + origin_offset, index, offsets, + ULINT_UNDEFINED, &heap); + page_cur_rec_insert(&cursor, buf + origin_offset, index, offsets, mtr); if (buf != buf1) { mem_free(buf); } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(ptr + end_seg_len); } @@ -751,68 +859,85 @@ page_cur_insert_rec_low( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - 
dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ - ulint data_size,/* in: data size of tuple */ - rec_t* rec, /* in: pointer to a physical record or NULL */ + dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ + dict_index_t* index, /* in: record descriptor */ + rec_t* rec, /* in: pointer to a physical record or NULL */ + ulint* offsets,/* in: rec_get_offsets(rec, index) or NULL */ mtr_t* mtr) /* in: mini-transaction handle */ { - byte* insert_buf = NULL; - ulint rec_size; - byte* page; /* the relevant page */ - rec_t* last_insert; /* cursor position at previous insert */ - rec_t* insert_rec; /* inserted record */ - ulint heap_no; /* heap number of the inserted record */ - rec_t* current_rec; /* current record after which the - new record is inserted */ - rec_t* next_rec; /* next record after current before - the insertion */ - ulint owner_slot; /* the slot which owns the inserted record */ - rec_t* owner_rec; - ulint n_owned; - + byte* insert_buf = NULL; + ulint rec_size; + byte* page; /* the relevant page */ + rec_t* last_insert; /* cursor position at previous insert */ + rec_t* insert_rec; /* inserted record */ + ulint heap_no; /* heap number of the inserted record */ + rec_t* current_rec; /* current record after which the + new record is inserted */ + rec_t* next_rec; /* next record after current before + the insertion */ + ulint owner_slot; /* the slot which owns the + inserted record */ + rec_t* owner_rec; + ulint n_owned; + mem_heap_t* heap = NULL; + ulint comp; + ut_ad(cursor && mtr); ut_ad(tuple || rec); ut_ad(!(tuple && rec)); ut_ad(rec || dtuple_check_typed(tuple)); - ut_ad(rec || (dtuple_get_data_size(tuple) == data_size)); page = page_cur_get_page(cursor); + comp = page_is_comp(page); + ut_ad(index->table->comp == !!comp); ut_ad(cursor->rec != page_get_supremum_rec(page)); /* 1. Get the size of the physical record in the page */ if (tuple != NULL) { - rec_size = data_size + rec_get_converted_extra_size( - data_size, - dtuple_get_n_fields(tuple)); + rec_size = rec_get_converted_size(index, tuple); } else { - rec_size = rec_get_size(rec); + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + } + ut_ad(rec_offs_validate(rec, index, offsets)); + rec_size = rec_offs_size(offsets); } /* 2. Try to find suitable space from page memory management */ - insert_buf = page_mem_alloc(page, rec_size, &heap_no); + insert_buf = page_mem_alloc(page, rec_size, index, &heap_no); if (insert_buf == NULL) { - + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(NULL); } /* 3. Create the record */ if (tuple != NULL) { - insert_rec = rec_convert_dtuple_to_rec_low(insert_buf, tuple, - data_size); + insert_rec = rec_convert_dtuple_to_rec(insert_buf, + index, tuple); + offsets = rec_get_offsets(insert_rec, index, offsets, + ULINT_UNDEFINED, &heap); } else { - insert_rec = rec_copy(insert_buf, rec); + insert_rec = rec_copy(insert_buf, rec, offsets); + ut_ad(rec_offs_validate(rec, index, offsets)); + rec_offs_make_valid(insert_rec, index, offsets); } ut_ad(insert_rec); - ut_ad(rec_size == rec_get_size(insert_rec)); + ut_ad(rec_size == rec_offs_size(offsets)); /* 4. 
Insert the record in the linked list of records */ - current_rec = cursor->rec; + ut_ad(!comp || rec_get_status(current_rec) <= REC_STATUS_INFIMUM); + ut_ad(!comp || rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + next_rec = page_rec_get_next(current_rec); + ut_ad(!comp || rec_get_status(next_rec) != REC_STATUS_INFIMUM); page_rec_set_next(insert_rec, next_rec); page_rec_set_next(current_rec, insert_rec); @@ -821,12 +946,15 @@ page_cur_insert_rec_low( /* 5. Set the n_owned field in the inserted record to zero, and set the heap_no field */ - rec_set_n_owned(insert_rec, 0); - rec_set_heap_no(insert_rec, heap_no); + rec_set_n_owned(insert_rec, comp, 0); + rec_set_heap_no(insert_rec, comp, heap_no); /* 6. Update the last insertion info in page header */ last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); + ut_ad(!last_insert || !comp + || rec_get_node_ptr_flag(last_insert) + == rec_get_node_ptr_flag(insert_rec)); if (last_insert == NULL) { page_header_set_field(page, PAGE_DIRECTION, PAGE_NO_DIRECTION); @@ -855,8 +983,8 @@ page_cur_insert_rec_low( /* 7. It remains to update the owner record. */ owner_rec = page_rec_find_owner_rec(insert_rec); - n_owned = rec_get_n_owned(owner_rec); - rec_set_n_owned(owner_rec, n_owned + 1); + n_owned = rec_get_n_owned(owner_rec, comp); + rec_set_n_owned(owner_rec, comp, n_owned + 1); /* 8. Now we have incremented the n_owned field of the owner record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, @@ -868,8 +996,12 @@ page_cur_insert_rec_low( } /* 9. Write log record of the insert */ - page_cur_insert_rec_write_log(insert_rec, rec_size, current_rec, mtr); + page_cur_insert_rec_write_log(insert_rec, rec_size, current_rec, + index, mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(insert_rec); } @@ -879,17 +1011,21 @@ UNIV_INLINE byte* page_copy_rec_list_to_created_page_write_log( /*=========================================*/ - /* out: 4-byte field where to write the log data - length */ - page_t* page, /* in: index page */ - mtr_t* mtr) /* in: mtr */ + /* out: 4-byte field where to + write the log data length */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { byte* log_ptr; - - mlog_write_initial_log_record(page, MLOG_LIST_END_COPY_CREATED, mtr); - log_ptr = mlog_open(mtr, 4); + ut_ad(!!page_is_comp(page) == index->table->comp); + log_ptr = mlog_open_and_write_index(mtr, page, index, + page_is_comp(page) + ? MLOG_COMP_LIST_END_COPY_CREATED + : MLOG_LIST_END_COPY_CREATED, 4); + ut_a(log_ptr); mlog_close(mtr, log_ptr + 4); return(log_ptr); @@ -901,11 +1037,12 @@ Parses a log record of copying a record list end to a new created page. */ byte* page_parse_copy_rec_list_to_created_page( /*=====================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { byte* rec_end; ulint log_data_len; @@ -931,7 +1068,8 @@ page_parse_copy_rec_list_to_created_page( } while (ptr < rec_end) { - ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr, page, mtr); + ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr, + index, page, mtr); } ut_a(ptr == rec_end); @@ -950,10 +1088,11 @@ including that record. 
Infimum and supremum records are not copied. */ void page_copy_rec_list_end_to_created_page( /*===================================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: first record to copy */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: first record to copy */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_dir_slot_t* slot = 0; /* remove warning */ byte* heap_top; @@ -966,9 +1105,15 @@ page_copy_rec_list_end_to_created_page( ulint log_mode; byte* log_ptr; ulint log_data_len; + ulint comp = page_is_comp(page); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; - ut_ad(page_header_get_field(new_page, PAGE_N_HEAP) == 2); + ut_ad(page_dir_get_n_heap(new_page) == 2); ut_ad(page != new_page); + ut_ad(comp == page_is_comp(new_page)); if (rec == page_get_infimum_rec(page)) { @@ -983,12 +1128,13 @@ page_copy_rec_list_end_to_created_page( #ifdef UNIV_DEBUG /* To pass the debug tests we have to set these dummy values in the debug version */ - page_header_set_field(new_page, PAGE_N_DIR_SLOTS, UNIV_PAGE_SIZE / 2); + page_dir_set_n_slots(new_page, UNIV_PAGE_SIZE / 2); page_header_set_ptr(new_page, PAGE_HEAP_TOP, new_page + UNIV_PAGE_SIZE - 1); #endif - log_ptr = page_copy_rec_list_to_created_page_write_log(new_page, mtr); + log_ptr = page_copy_rec_list_to_created_page_write_log(new_page, + index, mtr); log_data_len = dyn_array_get_data_size(&(mtr->log)); @@ -997,22 +1143,27 @@ page_copy_rec_list_end_to_created_page( log_mode = mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS); prev_rec = page_get_infimum_rec(new_page); - heap_top = new_page + PAGE_SUPREMUM_END; + if (comp) { + heap_top = new_page + PAGE_NEW_SUPREMUM_END; + } else { + heap_top = new_page + PAGE_OLD_SUPREMUM_END; + } count = 0; slot_index = 0; n_recs = 0; /* should be do ... until, comment by Jani */ while (rec != page_get_supremum_rec(page)) { - - insert_rec = rec_copy(heap_top, rec); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + insert_rec = rec_copy(heap_top, rec, offsets); - rec_set_next_offs(prev_rec, insert_rec - new_page); + rec_set_next_offs(prev_rec, comp, insert_rec - new_page); - rec_set_n_owned(insert_rec, 0); - rec_set_heap_no(insert_rec, 2 + n_recs); + rec_set_n_owned(insert_rec, comp, 0); + rec_set_heap_no(insert_rec, comp, 2 + n_recs); - rec_size = rec_get_size(insert_rec); + rec_size = rec_offs_size(offsets); heap_top = heap_top + rec_size; @@ -1034,7 +1185,7 @@ page_copy_rec_list_end_to_created_page( } page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, - mtr); + index, mtr); prev_rec = insert_rec; rec = page_rec_get_next(rec); } @@ -1056,22 +1207,27 @@ page_copy_rec_list_end_to_created_page( slot_index--; } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len; ut_a(log_data_len < 100 * UNIV_PAGE_SIZE); mach_write_to_4(log_ptr, log_data_len); - rec_set_next_offs(insert_rec, PAGE_SUPREMUM); + rec_set_next_offs(insert_rec, comp, + comp ? 
PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM); slot = page_dir_get_nth_slot(new_page, 1 + slot_index); page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page)); page_dir_slot_set_n_owned(slot, count + 1); - page_header_set_field(new_page, PAGE_N_DIR_SLOTS, 2 + slot_index); + page_dir_set_n_slots(new_page, 2 + slot_index); page_header_set_ptr(new_page, PAGE_HEAP_TOP, heap_top); - page_header_set_field(new_page, PAGE_N_HEAP, 2 + n_recs); + page_dir_set_n_heap(new_page, 2 + n_recs); page_header_set_field(new_page, PAGE_N_RECS, n_recs); page_header_set_ptr(new_page, PAGE_LAST_INSERT, NULL); @@ -1089,14 +1245,29 @@ UNIV_INLINE void page_cur_delete_rec_write_log( /*==========================*/ - rec_t* cursor_rec, /* in: record to be deleted */ - mtr_t* mtr) /* in: mini-transaction handle */ + rec_t* rec, /* in: record to be deleted */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mini-transaction handle */ { - mlog_write_initial_log_record(cursor_rec, MLOG_REC_DELETE, mtr); + byte* log_ptr; + + ut_ad(!!page_rec_is_comp(rec) == index->table->comp); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, + page_rec_is_comp(rec) + ? MLOG_COMP_REC_DELETE + : MLOG_REC_DELETE, 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } /* Write the cursor rec offset as a 2-byte ulint */ - mlog_catenate_ulint(mtr, cursor_rec - buf_frame_align(cursor_rec), - MLOG_2BYTES); + mach_write_to_2(log_ptr, ut_align_offset(rec, UNIV_PAGE_SIZE)); + + mlog_close(mtr, log_ptr + 2); } /*************************************************************** @@ -1105,11 +1276,12 @@ Parses log record of a record delete on a page. */ byte* page_cur_parse_delete_rec( /*======================*/ - /* out: pointer to record end or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: pointer to record end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ulint offset; page_cur_t cursor; @@ -1126,9 +1298,19 @@ page_cur_parse_delete_rec( ut_a(offset <= UNIV_PAGE_SIZE); if (page) { - page_cur_position(page + offset, &cursor); - - page_cur_delete_rec(&cursor, mtr); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_t* rec = page + offset; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + page_cur_position(rec, &cursor); + + page_cur_delete_rec(&cursor, index, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } } return(ptr); @@ -1142,6 +1324,8 @@ void page_cur_delete_rec( /*================*/ page_cur_t* cursor, /* in: a page cursor */ + dict_index_t* index, /* in: record descriptor */ + const ulint* offsets,/* in: rec_get_offsets(cursor->rec, index) */ mtr_t* mtr) /* in: mini-transaction handle */ { page_dir_slot_t* cur_dir_slot; @@ -1158,6 +1342,8 @@ page_cur_delete_rec( page = page_cur_get_page(cursor); current_rec = cursor->rec; + ut_ad(rec_offs_validate(current_rec, index, offsets)); + ut_ad(!!page_is_comp(page) == index->table->comp); /* The record must not be the supremum or infimum record. */ ut_ad(current_rec != page_get_supremum_rec(page)); @@ -1169,7 +1355,7 @@ page_cur_delete_rec( cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot); /* 0. 
Write the log record */ - page_cur_delete_rec_write_log(current_rec, mtr); + page_cur_delete_rec_write_log(current_rec, index, mtr); /* 1. Reset the last insert info in the page header and increment the modify clock for the frame */ @@ -1223,7 +1409,7 @@ page_cur_delete_rec( page_dir_slot_set_n_owned(cur_dir_slot, cur_n_owned - 1); /* 6. Free the memory occupied by the record */ - page_mem_free(page, current_rec); + page_mem_free(page, current_rec, offsets); /* 7. Now we have decremented the number of owned records of the slot. If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the diff --git a/innobase/page/page0page.c b/innobase/page/page0page.c index 343f300fc77..7e09cdf073e 100644 --- a/innobase/page/page0page.c +++ b/innobase/page/page0page.c @@ -18,6 +18,8 @@ Created 2/2/1994 Heikki Tuuri #include "fut0lst.h" #include "btr0sea.h" #include "buf0buf.h" +#include "srv0srv.h" +#include "btr0btr.h" /* THE INDEX PAGE ============== @@ -70,53 +72,70 @@ page_dir_find_owner_slot( /* out: the directory slot number */ rec_t* rec) /* in: the physical record */ { - ulint i; - ulint steps = 0; - page_t* page; - page_dir_slot_t* slot; - rec_t* original_rec = rec; - + page_t* page; + register uint16 rec_offs_bytes; + register page_dir_slot_t* slot; + register const page_dir_slot_t* first_slot; + register rec_t* r = rec; + ut_ad(page_rec_check(rec)); - while (rec_get_n_owned(rec) == 0) { - steps++; - rec = page_rec_get_next(rec); - } - page = buf_frame_align(rec); + first_slot = page_dir_get_nth_slot(page, 0); + slot = page_dir_get_nth_slot(page, page_dir_get_n_slots(page) - 1); + + if (page_is_comp(page)) { + while (rec_get_n_owned(r, TRUE) == 0) { + r = page + rec_get_next_offs(r, TRUE); + ut_ad(r >= page + PAGE_NEW_SUPREMUM); + ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); + } + } else { + while (rec_get_n_owned(r, FALSE) == 0) { + r = page + rec_get_next_offs(r, FALSE); + ut_ad(r >= page + PAGE_OLD_SUPREMUM); + ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); + } + } - i = page_dir_get_n_slots(page) - 1; - slot = page_dir_get_nth_slot(page, i); + rec_offs_bytes = mach_encode_2(r - page); - while (page_dir_slot_get_rec(slot) != rec) { + while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) { - if (i == 0) { + if (UNIV_UNLIKELY(slot == first_slot)) { fprintf(stderr, "InnoDB: Probable data corruption on page %lu\n" "InnoDB: Original record ", (ulong) buf_frame_get_page_no(page)); - rec_print(stderr, original_rec); + if (page_is_comp(page)) { + fputs("(compact record)", stderr); + } else { + rec_print_old(stderr, rec); + } - fprintf(stderr, "\n" - "InnoDB: on that page. 
Steps %lu.\n", (ulong) steps); - fputs( + fputs("\n" + "InnoDB: on that page.\n" "InnoDB: Cannot find the dir slot for record ", stderr); - rec_print(stderr, rec); + if (page_is_comp(page)) { + fputs("(compact record)", stderr); + } else { + rec_print_old(stderr, page + + mach_decode_2(rec_offs_bytes)); + } fputs("\n" "InnoDB: on that page!\n", stderr); buf_page_print(page); - ut_error; - } + ut_error; + } - i--; - slot = page_dir_get_nth_slot(page, i); + slot += PAGE_DIR_SLOT_SIZE; } - return(i); + return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE); } /****************************************************************** @@ -136,14 +155,15 @@ page_dir_slot_check( page = buf_frame_align(slot); - n_slots = page_header_get_field(page, PAGE_N_DIR_SLOTS); + n_slots = page_dir_get_n_slots(page); ut_a(slot <= page_dir_get_nth_slot(page, 0)); ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1)); - ut_a(page_rec_check(page + mach_read_from_2(slot))); + ut_a(page_rec_check(page_dir_slot_get_rec(slot))); - n_owned = rec_get_n_owned(page + mach_read_from_2(slot)); + n_owned = rec_get_n_owned(page_dir_slot_get_rec(slot), + page_is_comp(page)); if (slot == page_dir_get_nth_slot(page, 0)) { ut_a(n_owned == 1); @@ -194,12 +214,14 @@ Allocates a block of memory from an index page. */ byte* page_mem_alloc( /*===========*/ - /* out: pointer to start of allocated - buffer, or NULL if allocation fails */ - page_t* page, /* in: index page */ - ulint need, /* in: number of bytes needed */ - ulint* heap_no)/* out: this contains the heap number - of the allocated record if allocation succeeds */ + /* out: pointer to start of allocated + buffer, or NULL if allocation fails */ + page_t* page, /* in: index page */ + ulint need, /* in: number of bytes needed */ + dict_index_t* index, /* in: record descriptor */ + ulint* heap_no)/* out: this contains the heap number + of the allocated record + if allocation succeeds */ { rec_t* rec; byte* block; @@ -213,18 +235,37 @@ page_mem_alloc( rec = page_header_get_ptr(page, PAGE_FREE); - if (rec && (rec_get_size(rec) >= need)) { + if (rec) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; - page_header_set_ptr(page, PAGE_FREE, page_rec_get_next(rec)); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); - garbage = page_header_get_field(page, PAGE_GARBAGE); - ut_ad(garbage >= need); + if (rec_offs_size(offsets) >= need) { + page_header_set_ptr(page, PAGE_FREE, + page_rec_get_next(rec)); - page_header_set_field(page, PAGE_GARBAGE, garbage - need); + garbage = page_header_get_field(page, PAGE_GARBAGE); + ut_ad(garbage >= need); - *heap_no = rec_get_heap_no(rec); + page_header_set_field(page, PAGE_GARBAGE, + garbage - need); - return(rec_get_start(rec)); + *heap_no = rec_get_heap_no(rec, page_is_comp(page)); + + block = rec_get_start(rec, offsets); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(block); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } } /* Could not find space from the free list, try top of heap */ @@ -235,9 +276,9 @@ page_mem_alloc( block = page_header_get_ptr(page, PAGE_HEAP_TOP); page_header_set_ptr(page, PAGE_HEAP_TOP, block + need); - *heap_no = page_header_get_field(page, PAGE_N_HEAP); + *heap_no = page_dir_get_n_heap(page); - page_header_set_field(page, PAGE_N_HEAP, 1 + *heap_no); + page_dir_set_n_heap(page, 1 + *heap_no); return(block); } @@ -253,9 +294,11 @@ page_create_write_log( 
/*==================*/ buf_frame_t* frame, /* in: a buffer frame where the page is created */ - mtr_t* mtr) /* in: mini-transaction handle */ + mtr_t* mtr, /* in: mini-transaction handle */ + ulint comp) /* in: nonzero=compact page format */ { - mlog_write_initial_log_record(frame, MLOG_PAGE_CREATE, mtr); + mlog_write_initial_log_record(frame, + comp ? MLOG_COMP_PAGE_CREATE : MLOG_PAGE_CREATE, mtr); } /*************************************************************** @@ -267,6 +310,7 @@ page_parse_create( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr __attribute__((unused)), /* in: buffer end */ + ulint comp, /* in: nonzero=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { @@ -275,7 +319,7 @@ page_parse_create( /* The record is empty, except for the record initial part */ if (page) { - page_create(page, mtr); + page_create(page, mtr, comp); } return(ptr); @@ -290,7 +334,8 @@ page_create( /* out: pointer to the page */ buf_frame_t* frame, /* in: a buffer frame where the page is created */ - mtr_t* mtr) /* in: mini-transaction handle */ + mtr_t* mtr, /* in: mini-transaction handle */ + ulint comp) /* in: nonzero=compact page format */ { page_dir_slot_t* slot; mem_heap_t* heap; @@ -300,6 +345,10 @@ page_create( rec_t* infimum_rec; rec_t* supremum_rec; page_t* page; + dict_index_t* index; + ulint* offsets; + + index = comp ? srv_sys->dummy_ind2 : srv_sys->dummy_ind1; ut_ad(frame && mtr); ut_ad(PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE @@ -311,7 +360,7 @@ page_create( buf_frame_modify_clock_inc(frame); /* 2. WRITE LOG INFORMATION */ - page_create_write_log(frame, mtr); + page_create_write_log(frame, mtr, comp); page = frame; @@ -323,51 +372,61 @@ page_create( /* Create first a data tuple for infimum record */ tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_INFIMUM); field = dtuple_get_nth_field(tuple, 0); - dfield_set_data(field, "infimum", sizeof "infimum"); - dtype_set(dfield_get_type(field), DATA_VARCHAR, DATA_ENGLISH, 20, 0); - + dfield_set_data(field, "infimum", 8); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 8, 0); /* Set the corresponding physical record to its place in the page record heap */ heap_top = page + PAGE_DATA; - infimum_rec = rec_convert_dtuple_to_rec(heap_top, tuple); + infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple); + + ut_a(infimum_rec == + page + (comp ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); + + rec_set_n_owned(infimum_rec, comp, 1); + rec_set_heap_no(infimum_rec, comp, 0); + offsets = rec_get_offsets(infimum_rec, index, NULL, + ULINT_UNDEFINED, &heap); + + heap_top = rec_get_end(infimum_rec, offsets); - ut_a(infimum_rec == page + PAGE_INFIMUM); - - rec_set_n_owned(infimum_rec, 1); - rec_set_heap_no(infimum_rec, 0); - - heap_top = rec_get_end(infimum_rec); - /* Create then a tuple for supremum */ tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_SUPREMUM); field = dtuple_get_nth_field(tuple, 0); - dfield_set_data(field, "supremum", sizeof "supremum"); - dtype_set(dfield_get_type(field), DATA_VARCHAR, DATA_ENGLISH, 20, 0); + dfield_set_data(field, "supremum", comp ? 8 : 9); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, comp ? 8 : 9, 0); - supremum_rec = rec_convert_dtuple_to_rec(heap_top, tuple); + supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple); - ut_a(supremum_rec == page + PAGE_SUPREMUM); + ut_a(supremum_rec == + page + (comp ? 
PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)); - rec_set_n_owned(supremum_rec, 1); - rec_set_heap_no(supremum_rec, 1); - - heap_top = rec_get_end(supremum_rec); + rec_set_n_owned(supremum_rec, comp, 1); + rec_set_heap_no(supremum_rec, comp, 1); + + offsets = rec_get_offsets(supremum_rec, index, offsets, + ULINT_UNDEFINED, &heap); + heap_top = rec_get_end(supremum_rec, offsets); - ut_ad(heap_top == page + PAGE_SUPREMUM_END); + ut_ad(heap_top == + page + (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END)); mem_heap_free(heap); - /* 4. INITIALIZE THE PAGE HEADER */ + /* 4. INITIALIZE THE PAGE */ page_header_set_field(page, PAGE_N_DIR_SLOTS, 2); page_header_set_ptr(page, PAGE_HEAP_TOP, heap_top); - page_header_set_field(page, PAGE_N_HEAP, 2); + page_header_set_field(page, PAGE_N_HEAP, comp ? 0x8002 : 2); page_header_set_ptr(page, PAGE_FREE, NULL); page_header_set_field(page, PAGE_GARBAGE, 0); page_header_set_ptr(page, PAGE_LAST_INSERT, NULL); @@ -375,7 +434,9 @@ page_create( page_header_set_field(page, PAGE_N_DIRECTION, 0); page_header_set_field(page, PAGE_N_RECS, 0); page_set_max_trx_id(page, ut_dulint_zero); - + memset(heap_top, 0, UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START + - (heap_top - page)); + /* 5. SET POINTERS IN RECORDS AND DIR SLOTS */ /* Set the slots to point to infimum and supremum. */ @@ -388,8 +449,8 @@ page_create( /* Set the next pointers in infimum and supremum */ - rec_set_next_offs(infimum_rec, (ulint)(supremum_rec - page)); - rec_set_next_offs(supremum_rec, 0); + rec_set_next_offs(infimum_rec, comp, (ulint)(supremum_rec - page)); + rec_set_next_offs(supremum_rec, comp, 0); return(page); } @@ -401,14 +462,19 @@ touch the lock table and max trx id on page. */ void page_copy_rec_list_end_no_locks( /*============================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_cur_t cur1; page_cur_t cur2; rec_t* sup; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; page_cur_position(rec, &cur1); @@ -416,8 +482,12 @@ page_copy_rec_list_end_no_locks( page_cur_move_to_next(&cur1); } - - ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == PAGE_INFIMUM); + + ut_a((ibool)!!page_is_comp(new_page) == index->table->comp); + ut_a(page_is_comp(new_page) == page_is_comp(page)); + ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint) + (page_is_comp(new_page) + ? 
PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); page_cur_set_before_first(new_page, &cur2); @@ -425,9 +495,15 @@ page_copy_rec_list_end_no_locks( sup = page_get_supremum_rec(page); - while (sup != page_cur_get_rec(&cur1)) { - if (!page_cur_rec_insert(&cur2, - page_cur_get_rec(&cur1), mtr)) { + for (;;) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + if (cur1_rec == sup) { + break; + } + offsets = rec_get_offsets(cur1_rec, index, offsets, + ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(!page_cur_rec_insert(&cur2, cur1_rec, index, + offsets, mtr))) { /* Track an assertion failure reported on the mailing list on June 18th, 2003 */ @@ -446,7 +522,11 @@ page_copy_rec_list_end_no_locks( page_cur_move_to_next(&cur1); page_cur_move_to_next(&cur2); } -} + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} /***************************************************************** Copies records from page to new_page, from a given record onward, @@ -456,16 +536,18 @@ The records are copied to the start of the record list on new_page. */ void page_copy_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { - if (page_header_get_field(new_page, PAGE_N_HEAP) == 2) { + if (page_dir_get_n_heap(new_page) == 2) { page_copy_rec_list_end_to_created_page(new_page, page, rec, - mtr); + index, mtr); } else { - page_copy_rec_list_end_no_locks(new_page, page, rec, mtr); + page_copy_rec_list_end_no_locks(new_page, page, rec, + index, mtr); } /* Update the lock table, MAX_TRX_ID, and possible hash index */ @@ -474,7 +556,7 @@ page_copy_rec_list_end( page_update_max_trx_id(new_page, page_get_max_trx_id(page)); - btr_search_move_or_delete_hash_entries(new_page, page); + btr_search_move_or_delete_hash_entries(new_page, page, index); } /***************************************************************** @@ -485,14 +567,19 @@ The records are copied to the end of the record list on new_page. 
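
page_create() above stores 0x8002 in PAGE_N_HEAP for a compact page, and the patch replaces direct PAGE_N_HEAP reads with page_dir_get_n_heap() next to page_is_comp(). A plausible way for the format flag and the heap count to share that 2-byte field, assuming the high bit carries the flag and the low 15 bits the count (the masks below are illustrative, not copied from page0page.ic):

ulint
example_page_is_comp(
/*=================*/
			/* out, sketch: nonzero if compact format */
	page_t*	page)	/* in: index page */
{
	return(page_header_get_field(page, PAGE_N_HEAP) & 0x8000UL);
}

ulint
example_page_dir_get_n_heap(
/*========================*/
			/* out, sketch: number of heap records */
	page_t*	page)	/* in: index page */
{
	return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fffUL);
}
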
*/ void page_copy_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_cur_t cur1; page_cur_t cur2; rec_t* old_end; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; page_cur_set_before_first(page, &cur1); @@ -510,8 +597,13 @@ page_copy_rec_list_start( /* Copy records from the original page to the new page */ while (page_cur_get_rec(&cur1) != rec) { - ut_a( - page_cur_rec_insert(&cur2, page_cur_get_rec(&cur1), mtr)); + rec_t* ins_rec; + rec_t* cur1_rec = page_cur_get_rec(&cur1); + offsets = rec_get_offsets(cur1_rec, index, offsets, + ULINT_UNDEFINED, &heap); + ins_rec = page_cur_rec_insert(&cur2, cur1_rec, index, + offsets, mtr); + ut_a(ins_rec); page_cur_move_to_next(&cur1); page_cur_move_to_next(&cur2); @@ -523,8 +615,12 @@ page_copy_rec_list_start( page_update_max_trx_id(new_page, page_get_max_trx_id(page)); - btr_search_move_or_delete_hash_entries(new_page, page); -} + btr_search_move_or_delete_hash_entries(new_page, page, index); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} /************************************************************** Writes a log record of a record list end or start deletion. */ @@ -532,18 +628,24 @@ UNIV_INLINE void page_delete_rec_list_write_log( /*===========================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - byte type, /* in: operation type: MLOG_LIST_END_DELETE, ... */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: operation type: + MLOG_LIST_END_DELETE, ... */ + mtr_t* mtr) /* in: mtr */ { - ut_ad((type == MLOG_LIST_END_DELETE) - || (type == MLOG_LIST_START_DELETE)); - - mlog_write_initial_log_record(page, type, mtr); - - /* Write the parameter as a 2-byte ulint */ - mlog_catenate_ulint(mtr, rec - page, MLOG_2BYTES); + byte* log_ptr; + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, type, 2); + if (log_ptr) { + /* Write the parameter as a 2-byte ulint */ + mach_write_to_2(log_ptr, ut_align_offset(rec, UNIV_PAGE_SIZE)); + mlog_close(mtr, log_ptr + 2); + } } /************************************************************** @@ -552,18 +654,23 @@ Parses a log record of a record list end or start deletion. 
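
Nearly every function touched by this patch gains the same pattern for obtaining the column offsets of a record: a small on-stack array whose first element is set to its own capacity, a rec_get_offsets() call that spills to a memory heap only when the record has more fields than fit, and a conditional mem_heap_free() at the end. A stand-alone sketch of that lifecycle, using only the calls shown in the hunks:

void
example_use_offsets(
/*================*/
	rec_t*		rec,	/* in: physical record */
	dict_index_t*	index)	/* in: record descriptor */
{
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;

	offsets = rec_get_offsets(rec, index, offsets,
					ULINT_UNDEFINED, &heap);

	/* offsets[] now describes every field of rec: the total record
	size is rec_offs_size(offsets), field i is read with
	rec_get_nth_field(rec, offsets, i, &len), and so on */

	if (UNIV_LIKELY_NULL(heap)) {
		/* rec_get_offsets() had to allocate a larger array */
		mem_heap_free(heap);
	}
}
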
*/ byte* page_parse_delete_rec_list( /*=======================*/ - /* out: end of log record or NULL */ - byte type, /* in: MLOG_LIST_END_DELETE or - MLOG_LIST_START_DELETE */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte type, /* in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ulint offset; - ut_ad((type == MLOG_LIST_END_DELETE) - || (type == MLOG_LIST_START_DELETE)); + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); /* Read the record offset as a 2-byte ulint */ @@ -580,11 +687,14 @@ page_parse_delete_rec_list( return(ptr); } - if (type == MLOG_LIST_END_DELETE) { - page_delete_rec_list_end(page, page + offset, ULINT_UNDEFINED, - ULINT_UNDEFINED, mtr); + ut_ad(!!page_is_comp(page) == index->table->comp); + + if (type == MLOG_LIST_END_DELETE + || type == MLOG_COMP_LIST_END_DELETE) { + page_delete_rec_list_end(page, page + offset, index, + ULINT_UNDEFINED, ULINT_UNDEFINED, mtr); } else { - page_delete_rec_list_start(page, page + offset, mtr); + page_delete_rec_list_start(page, page + offset, index, mtr); } return(ptr); @@ -597,14 +707,15 @@ The infimum and supremum records are not deleted. */ void page_delete_rec_list_end( /*=====================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - ulint n_recs, /* in: number of records to delete, or ULINT_UNDEFINED - if not known */ - ulint size, /* in: the sum of the sizes of the records in the end - of the chain to delete, or ULINT_UNDEFINED if not - known */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + ulint n_recs, /* in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /* in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /* in: mtr */ { page_dir_slot_t* slot; ulint slot_index; @@ -615,10 +726,12 @@ page_delete_rec_list_end( ulint count; ulint n_owned; rec_t* sup; + ulint comp; /* Reset the last insert info in the page header and increment the modify clock for the frame */ + ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE); page_header_set_ptr(page, PAGE_LAST_INSERT, NULL); /* The page gets invalid for optimistic searches: increment the @@ -628,11 +741,13 @@ page_delete_rec_list_end( sup = page_get_supremum_rec(page); - if (rec == page_get_infimum_rec(page)) { + comp = page_is_comp(page); + if (page_rec_is_infimum_low(rec - page)) { rec = page_rec_get_next(rec); } - page_delete_rec_list_write_log(page, rec, MLOG_LIST_END_DELETE, mtr); + page_delete_rec_list_write_log(rec, index, + comp ? 
MLOG_COMP_LIST_END_DELETE : MLOG_LIST_END_DELETE, mtr); if (rec == sup) { @@ -644,19 +759,36 @@ page_delete_rec_list_end( last_rec = page_rec_get_prev(sup); if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; /* Calculate the sum of sizes and the number of records */ size = 0; n_recs = 0; rec2 = rec; while (rec2 != sup) { - size += rec_get_size(rec2); + ulint s; + offsets = rec_get_offsets(rec2, index, offsets, + ULINT_UNDEFINED, &heap); + s = rec_offs_size(offsets); + ut_ad(rec2 - page + s - rec_offs_extra_size(offsets) + < UNIV_PAGE_SIZE); + ut_ad(size + s < UNIV_PAGE_SIZE); + size += s; n_recs++; rec2 = page_rec_get_next(rec2); } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } } + ut_ad(size < UNIV_PAGE_SIZE); + /* Update the page directory; there is no need to balance the number of the records owned by the supremum record, as it is allowed to be less than PAGE_DIR_SLOT_MIN_N_OWNED */ @@ -664,15 +796,15 @@ page_delete_rec_list_end( rec2 = rec; count = 0; - while (rec_get_n_owned(rec2) == 0) { + while (rec_get_n_owned(rec2, comp) == 0) { count++; rec2 = page_rec_get_next(rec2); } - ut_ad(rec_get_n_owned(rec2) - count > 0); + ut_ad(rec_get_n_owned(rec2, comp) - count > 0); - n_owned = rec_get_n_owned(rec2) - count; + n_owned = rec_get_n_owned(rec2, comp) - count; slot_index = page_dir_find_owner_slot(rec2); slot = page_dir_get_nth_slot(page, slot_index); @@ -680,7 +812,7 @@ page_delete_rec_list_end( page_dir_slot_set_rec(slot, sup); page_dir_slot_set_n_owned(slot, n_owned); - page_header_set_field(page, PAGE_N_DIR_SLOTS, slot_index + 1); + page_dir_set_n_slots(page, slot_index + 1); /* Remove the record chain segment from the record chain */ page_rec_set_next(prev_rec, page_get_supremum_rec(page)); @@ -706,14 +838,28 @@ that record. Infimum and supremum records are not deleted. */ void page_delete_rec_list_start( /*=======================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_cur_t cur1; ulint log_mode; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + mem_heap_t* heap = NULL; + byte type; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_ad(!!page_is_comp(page) == index->table->comp); + + if (page_is_comp(page)) { + type = MLOG_COMP_LIST_START_DELETE; + } else { + type = MLOG_LIST_START_DELETE; + } - page_delete_rec_list_write_log(page, rec, MLOG_LIST_START_DELETE, mtr); + page_delete_rec_list_write_log(rec, index, type, mtr); page_cur_set_before_first(page, &cur1); @@ -729,8 +875,13 @@ page_delete_rec_list_start( log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); while (page_cur_get_rec(&cur1) != rec) { + offsets = rec_get_offsets(page_cur_get_rec(&cur1), index, + offsets, ULINT_UNDEFINED, &heap); + page_cur_delete_rec(&cur1, index, offsets, mtr); + } - page_cur_delete_rec(&cur1, mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); } /* Restore log mode */ @@ -745,10 +896,11 @@ split_rec. 
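
The list-delete operations above are logged very compactly: after mlog_open_and_write_index() has written the index description, only the page offset of the boundary record is appended as a 2-byte integer, and the parse function reads that offset back before redoing the whole deletion. A sketch of the write/parse pairing, restated with the helpers used elsewhere in this patch (not an additional hunk):

void
example_log_boundary(
/*=================*/
	rec_t*		rec,	/* in: boundary record */
	dict_index_t*	index,	/* in: record descriptor */
	byte		type,	/* in: MLOG_..._DELETE type */
	mtr_t*		mtr)	/* in: mini-transaction handle */
{
	byte*	log_ptr = mlog_open_and_write_index(mtr, rec, index, type, 2);

	if (log_ptr) {	/* NULL when logging is off during crash recovery */
		mach_write_to_2(log_ptr,
				ut_align_offset(rec, UNIV_PAGE_SIZE));
		mlog_close(mtr, log_ptr + 2);
	}
}

rec_t*
example_parse_boundary(
/*===================*/
			/* out, sketch: boundary record on the page */
	byte*	ptr,	/* in: log record body */
	page_t*	page)	/* in: page being recovered */
{
	ulint	offset = mach_read_from_2(ptr);

	ut_a(offset <= UNIV_PAGE_SIZE);

	return(page + offset);
}
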
*/ void page_move_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record to move */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { ulint old_data_size; ulint new_data_size; @@ -758,15 +910,15 @@ page_move_rec_list_end( old_data_size = page_get_data_size(new_page); old_n_recs = page_get_n_recs(new_page); - page_copy_rec_list_end(new_page, page, split_rec, mtr); + page_copy_rec_list_end(new_page, page, split_rec, index, mtr); new_data_size = page_get_data_size(new_page); new_n_recs = page_get_n_recs(new_page); ut_ad(new_data_size >= old_data_size); - page_delete_rec_list_end(page, split_rec, new_n_recs - old_n_recs, - new_data_size - old_data_size, mtr); + page_delete_rec_list_end(page, split_rec, index, + new_n_recs - old_n_recs, new_data_size - old_data_size, mtr); } /***************************************************************** @@ -776,14 +928,15 @@ split_rec. */ void page_move_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record not to move */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record not to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { - page_copy_rec_list_start(new_page, page, split_rec, mtr); + page_copy_rec_list_start(new_page, page, split_rec, index, mtr); - page_delete_rec_list_start(page, split_rec, mtr); + page_delete_rec_list_start(page, split_rec, index, mtr); } /*************************************************************************** @@ -801,7 +954,7 @@ page_rec_write_index_page_no( byte* data; ulint len; - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field_old(rec, i, &len); ut_ad(len == 4); @@ -885,7 +1038,7 @@ page_dir_add_slots( ut_ad(start < n_slots - 1); /* Update the page header */ - page_header_set_field(page, PAGE_N_DIR_SLOTS, n_slots + n); + page_dir_set_n_slots(page, n_slots + n); /* Move slots up */ @@ -1006,8 +1159,8 @@ page_dir_balance_slot( old_rec = page_dir_slot_get_rec(slot); new_rec = page_rec_get_next(old_rec); - rec_set_n_owned(old_rec, 0); - rec_set_n_owned(new_rec, n_owned + 1); + rec_set_n_owned(old_rec, page_is_comp(page), 0); + rec_set_n_owned(new_rec, page_is_comp(page), n_owned + 1); page_dir_slot_set_rec(slot, new_rec); @@ -1080,13 +1233,15 @@ page_rec_get_n_recs_before( rec_t* slot_rec; page_t* page; ulint i; + ulint comp; lint n = 0; ut_ad(page_rec_check(rec)); page = buf_frame_align(rec); - - while (rec_get_n_owned(rec) == 0) { + comp = page_is_comp(page); + + while (rec_get_n_owned(rec, comp) == 0) { rec = page_rec_get_next(rec); n--; @@ -1096,7 +1251,7 @@ page_rec_get_n_recs_before( slot = page_dir_get_nth_slot(page, i); slot_rec = page_dir_slot_get_rec(slot); - n += rec_get_n_owned(slot_rec); + n += rec_get_n_owned(slot_rec, comp); if (rec == slot_rec) { @@ -1118,17 +1273,21 @@ the index page context. 
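
page_rec_write_index_page_no() above switches to rec_get_nth_field_old(), the single-record accessor that reads the offsets stored in an old-style record itself, while the rest of the patch moves to the offsets-based form that works for both formats. A side-by-side sketch of the two call shapes (illustrative only; i and len are placeholders):

/* old-style records only: offsets are read from the record header */
byte*
example_read_field_old(rec_t* rec, ulint i, ulint* len)
{
	return(rec_get_nth_field_old(rec, i, len));
}

/* either format: offsets come from a prior rec_get_offsets() call */
byte*
example_read_field(rec_t* rec, const ulint* offsets, ulint i, ulint* len)
{
	ut_ad(rec_offs_validate(rec, NULL, offsets));

	return(rec_get_nth_field(rec, offsets, i, len));
}
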
*/ void page_rec_print( /*===========*/ - rec_t* rec) + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: record descriptor */ { - rec_print(stderr, rec); + ulint comp = page_is_comp(buf_frame_align(rec)); + + ut_a(!comp == !rec_offs_comp(offsets)); + rec_print_new(stderr, rec, offsets); fprintf(stderr, " n_owned: %lu; heap_no: %lu; next rec: %lu\n", - (ulong) rec_get_n_owned(rec), - (ulong) rec_get_heap_no(rec), - (ulong) rec_get_next_offs(rec)); + (ulong) rec_get_n_owned(rec, comp), + (ulong) rec_get_heap_no(rec, comp), + (ulong) rec_get_next_offs(rec, comp)); page_rec_check(rec); - rec_validate(rec); + rec_validate(rec, offsets); } /******************************************************************* @@ -1176,12 +1335,19 @@ debugging purposes. */ void page_print_list( /*============*/ - page_t* page, /* in: index page */ - ulint pr_n) /* in: print n first and n last entries */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint pr_n) /* in: print n first and n last entries */ { page_cur_t cur; ulint count; ulint n_recs; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + ut_a((ibool)!!page_is_comp(page) == index->table->comp); fprintf(stderr, "--------------------------------\n" @@ -1193,7 +1359,9 @@ page_print_list( page_cur_set_before_first(page, &cur); count = 0; for (;;) { - page_rec_print(cur.rec); + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); if (count == pr_n) { break; @@ -1213,7 +1381,9 @@ page_print_list( page_cur_move_to_next(&cur); if (count + pr_n >= n_recs) { - page_rec_print(cur.rec); + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); } count++; } @@ -1222,6 +1392,10 @@ page_print_list( "Total of %lu records \n" "--------------------------------\n", (ulong) (count + 1)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } } /******************************************************************* @@ -1235,14 +1409,15 @@ page_header_print( fprintf(stderr, "--------------------------------\n" "PAGE HEADER INFO\n" - "Page address %p, n records %lu\n" + "Page address %p, n records %lu (%s)\n" "n dir slots %lu, heap top %lu\n" "Page n heap %lu, free %lu, garbage %lu\n" "Page last insert %lu, direction %lu, n direction %lu\n", page, (ulong) page_header_get_field(page, PAGE_N_RECS), + page_is_comp(page) ? "compact format" : "original format", (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS), (ulong) page_header_get_field(page, PAGE_HEAP_TOP), - (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) page_dir_get_n_heap(page), (ulong) page_header_get_field(page, PAGE_FREE), (ulong) page_header_get_field(page, PAGE_GARBAGE), (ulong) page_header_get_field(page, PAGE_LAST_INSERT), @@ -1257,13 +1432,16 @@ debugging purposes. 
*/ void page_print( /*======*/ - page_t* page, /* in: index page */ - ulint dn, /* in: print dn first and last entries in directory */ - ulint rn) /* in: print rn first and last records on page */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint dn, /* in: print dn first and last entries + in directory */ + ulint rn) /* in: print rn first and last records + in directory */ { page_header_print(page); page_dir_print(page, dn); - page_print_list(page, rn); + page_print_list(page, index, rn); } /******************************************************************* @@ -1274,20 +1452,24 @@ the heap_no field. */ ibool page_rec_validate( /*==============*/ - /* out: TRUE if ok */ - rec_t* rec) /* in: record on the page */ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n_owned; ulint heap_no; - page_t* page; + page_t* page; + ulint comp; page = buf_frame_align(rec); + comp = page_is_comp(page); + ut_a(!comp == !rec_offs_comp(offsets)); page_rec_check(rec); - rec_validate(rec); + rec_validate(rec, offsets); - n_owned = rec_get_n_owned(rec); - heap_no = rec_get_heap_no(rec); + n_owned = rec_get_n_owned(rec, comp); + heap_no = rec_get_heap_no(rec, comp); if (!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED)) { fprintf(stderr, @@ -1296,11 +1478,11 @@ page_rec_validate( return(FALSE); } - if (!(heap_no < page_header_get_field(page, PAGE_N_HEAP))) { + if (!(heap_no < page_dir_get_n_heap(page))) { fprintf(stderr, "InnoDB: Heap no of rec %lu too big %lu %lu\n", (ulong)(rec - page), (ulong) heap_no, - (ulong) page_header_get_field(page, PAGE_N_HEAP)); + (ulong) page_dir_get_n_heap(page)); return(FALSE); } @@ -1358,6 +1540,7 @@ page_simple_validate( ulint count; ulint own_count; ibool ret = FALSE; + ulint comp = page_is_comp(page); /* Check first that the record heap and the directory do not overlap. 
*/ @@ -1404,13 +1587,13 @@ page_simple_validate( goto func_exit; } - if (rec_get_n_owned(rec) != 0) { + if (rec_get_n_owned(rec, comp) != 0) { /* This is a record pointed to by a dir slot */ - if (rec_get_n_owned(rec) != own_count) { + if (rec_get_n_owned(rec, comp) != own_count) { fprintf(stderr, "InnoDB: Wrong owned count %lu, %lu, rec %lu\n", - (ulong) rec_get_n_owned(rec), + (ulong) rec_get_n_owned(rec, comp), (ulong) own_count, (ulong)(rec - page)); @@ -1438,11 +1621,11 @@ page_simple_validate( break; } - if (rec_get_next_offs(rec) < FIL_PAGE_DATA - || rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) { + if (rec_get_next_offs(rec, comp) < FIL_PAGE_DATA + || rec_get_next_offs(rec, comp) >= UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Next record offset nonsensical %lu for rec %lu\n", - (ulong) rec_get_next_offs(rec), + (ulong) rec_get_next_offs(rec, comp), (ulong)(rec - page)); goto func_exit; @@ -1461,7 +1644,7 @@ page_simple_validate( own_count++; } - if (rec_get_n_owned(rec) == 0) { + if (rec_get_n_owned(rec, comp) == 0) { fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n"); goto func_exit; @@ -1514,10 +1697,10 @@ page_simple_validate( rec = page_rec_get_next(rec); } - if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) { + if (page_dir_get_n_heap(page) != count + 1) { fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", - (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) page_dir_get_n_heap(page), (ulong) (count + 1)); goto func_exit; @@ -1549,25 +1732,30 @@ page_validate( ulint slot_no; ulint data_size; rec_t* rec; - rec_t* old_rec = NULL; + rec_t* old_rec = NULL; ulint offs; ulint n_slots; - ibool ret = FALSE; + ibool ret = FALSE; ulint i; - + ulint comp = page_is_comp(page); + ulint* offsets = NULL; + ulint* old_offsets = NULL; + + if ((ibool)!!comp != index->table->comp) { + fputs("InnoDB: 'compact format' flag mismatch\n", stderr); + goto func_exit2; + } if (!page_simple_validate(page)) { goto func_exit2; } - heap = mem_heap_create(UNIV_PAGE_SIZE); + heap = mem_heap_create(UNIV_PAGE_SIZE + 200); /* The following buffer is used to check that the records in the page record heap do not overlap */ buf = mem_heap_alloc(heap, UNIV_PAGE_SIZE); - for (i = 0; i < UNIV_PAGE_SIZE; i++) { - buf[i] = 0; - } + memset(buf, 0, UNIV_PAGE_SIZE); /* Check first that the record heap and the directory do not overlap. 
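
Both validators start from the invariant named in the comment above: the record heap grows upward from the low end of the page while the slot directory grows downward from the high end, so the heap top must not reach the lowest-addressed (last) directory slot. The check itself sits in unchanged context and is not visible in these hunks; a sketch of what it amounts to, assuming the accessors used throughout this file:

ibool
example_heap_and_dir_disjoint(
/*==========================*/
			/* out, sketch: TRUE if heap and directory
			do not overlap */
	page_t*	page)	/* in: index page */
{
	ulint	n_slots	 = page_dir_get_n_slots(page);
	byte*	heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);

	/* the slot with the highest index has the lowest address */
	return(heap_top
	       <= (byte*) page_dir_get_nth_slot(page, n_slots - 1));
}
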
*/ @@ -1599,37 +1787,47 @@ page_validate( for (;;) { rec = cur.rec; + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (comp && page_rec_is_user_rec(rec) + && rec_get_node_ptr_flag(rec) + != (ibool) + (btr_page_get_level_low(page) != 0)) { + fputs("InnoDB: node_ptr flag mismatch\n", stderr); + goto func_exit; + } - if (!page_rec_validate(rec)) { + if (!page_rec_validate(rec, offsets)) { goto func_exit; } /* Check that the records are in the ascending order */ if ((count >= 2) && (!page_cur_is_after_last(&cur))) { - if (!(1 == cmp_rec_rec(rec, old_rec, index))) { + if (!(1 == cmp_rec_rec(rec, old_rec, + offsets, old_offsets, index))) { fprintf(stderr, "InnoDB: Records in wrong order on page %lu", (ulong) buf_frame_get_page_no(page)); dict_index_name_print(stderr, NULL, index); fputs("\nInnoDB: previous record ", stderr); - rec_print(stderr, old_rec); + rec_print_new(stderr, old_rec, old_offsets); fputs("\nInnoDB: record ", stderr); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); putc('\n', stderr); goto func_exit; } } - if ((rec != page_get_supremum_rec(page)) - && (rec != page_get_infimum_rec(page))) { + if (page_rec_is_user_rec(rec)) { - data_size += rec_get_size(rec); + data_size += rec_offs_size(offsets); } - offs = rec_get_start(rec) - page; + offs = rec_get_start(rec, offsets) - page; - for (i = 0; i < rec_get_size(rec); i++) { + for (i = 0; i < rec_offs_size(offsets); i++) { if (!buf[offs + i] == 0) { /* No other record may overlap this */ @@ -1641,12 +1839,12 @@ page_validate( buf[offs + i] = 1; } - if (rec_get_n_owned(rec) != 0) { + if (rec_get_n_owned(rec, comp) != 0) { /* This is a record pointed to by a dir slot */ - if (rec_get_n_owned(rec) != own_count) { + if (rec_get_n_owned(rec, comp) != own_count) { fprintf(stderr, "InnoDB: Wrong owned count %lu, %lu\n", - (ulong) rec_get_n_owned(rec), + (ulong) rec_get_n_owned(rec, comp), (ulong) own_count); goto func_exit; } @@ -1671,11 +1869,11 @@ page_validate( break; } - if (rec_get_next_offs(rec) < FIL_PAGE_DATA - || rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) { + if (rec_get_next_offs(rec, comp) < FIL_PAGE_DATA + || rec_get_next_offs(rec, comp) >= UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Next record offset wrong %lu\n", - (ulong) rec_get_next_offs(rec)); + (ulong) rec_get_next_offs(rec, comp)); goto func_exit; } @@ -1683,9 +1881,15 @@ page_validate( page_cur_move_to_next(&cur); own_count++; old_rec = rec; + /* set old_offsets to offsets; recycle offsets */ + { + ulint* offs = old_offsets; + old_offsets = offsets; + offsets = offs; + } } - if (rec_get_n_owned(rec) == 0) { + if (rec_get_n_owned(rec, comp) == 0) { fputs("InnoDB: n owned is zero\n", stderr); goto func_exit; } @@ -1714,15 +1918,17 @@ page_validate( rec = page_header_get_ptr(page, PAGE_FREE); while (rec != NULL) { - if (!page_rec_validate(rec)) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + if (!page_rec_validate(rec, offsets)) { goto func_exit; } count++; - offs = rec_get_start(rec) - page; + offs = rec_get_start(rec, offsets) - page; - for (i = 0; i < rec_get_size(rec); i++) { + for (i = 0; i < rec_offs_size(offsets); i++) { if (buf[offs + i] != 0) { fputs( @@ -1736,9 +1942,9 @@ page_validate( rec = page_rec_get_next(rec); } - if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) { + if (page_dir_get_n_heap(page) != count + 1) { fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n", - (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) page_dir_get_n_heap(page), (ulong) count + 
1); goto func_exit; } @@ -1775,7 +1981,7 @@ page_find_rec_with_heap_no( page_cur_set_before_first(page, &cur); for (;;) { - if (rec_get_heap_no(cur.rec) == heap_no) { + if (rec_get_heap_no(cur.rec, page_is_comp(page)) == heap_no) { return(cur.rec); } diff --git a/innobase/pars/lexyy.c b/innobase/pars/lexyy.c index 0112f618533..1145ca295e7 100644 --- a/innobase/pars/lexyy.c +++ b/innobase/pars/lexyy.c @@ -616,6 +616,9 @@ How to make the InnoDB parser and lexer C files: 7. Add '#include "univ.i"' before #include <stdio.h> in lexyy.c (Needed for AIX) +8. Add a type cast to int to the assignment below the comment + 'need more input.' (Removes a warning on Win64) + These instructions seem to work at least with bison-1.28 and flex-2.5.4 on Linux. *******************************************************/ @@ -2114,7 +2117,7 @@ static int input() else { /* need more input */ - int offset = yy_c_buf_p - yytext_ptr; + int offset = (int) (yy_c_buf_p - yytext_ptr); ++yy_c_buf_p; switch ( yy_get_next_buffer() ) diff --git a/innobase/pars/pars0lex.l b/innobase/pars/pars0lex.l index 811057d48a1..e481634f77e 100644 --- a/innobase/pars/pars0lex.l +++ b/innobase/pars/pars0lex.l @@ -31,6 +31,9 @@ How to make the InnoDB parser and lexer C files: 7. Add '#include "univ.i"' before #include <stdio.h> in lexyy.c (Needed for AIX) +8. Add a type cast to int to the assignment below the comment + 'need more input.' (Removes a warning on Win64) + These instructions seem to work at least with bison-1.28 and flex-2.5.4 on Linux. *******************************************************/ diff --git a/innobase/pars/pars0pars.c b/innobase/pars/pars0pars.c index 16d630dd318..c62184abd85 100644 --- a/innobase/pars/pars0pars.c +++ b/innobase/pars/pars0pars.c @@ -1514,8 +1514,11 @@ pars_create_table( n_cols = que_node_list_get_len(column_defs); - table = dict_mem_table_create(table_sym->name, 0, n_cols); - + /* As the InnoDB SQL parser is for internal use only, + for creating some system tables, this function will only + create tables in the old (not compact) record format. 
*/ + table = dict_mem_table_create(table_sym->name, 0, n_cols, FALSE); + if (not_fit_in_memory != NULL) { table->does_not_fit_in_memory = TRUE; } diff --git a/innobase/pars/pars0sym.c b/innobase/pars/pars0sym.c index 194e6677183..8ade5579e47 100644 --- a/innobase/pars/pars0sym.c +++ b/innobase/pars/pars0sym.c @@ -220,7 +220,7 @@ sym_tab_add_id( node->resolved = FALSE; node->indirection = NULL; - node->name = mem_heap_strdupl(sym_tab->heap, name, len + 1); + node->name = mem_heap_strdupl(sym_tab->heap, (char*) name, len + 1); node->name_len = len; UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); diff --git a/innobase/que/que0que.c b/innobase/que/que0que.c index 22878dec27f..a0a6adf9b83 100644 --- a/innobase/que/que0que.c +++ b/innobase/que/que0que.c @@ -163,6 +163,7 @@ que_thr_create( thr->run_node = NULL; thr->resource = 0; + thr->lock_state = QUE_THR_LOCK_NOLOCK; UT_LIST_ADD_LAST(thrs, parent->thrs, thr); diff --git a/innobase/read/read0read.c b/innobase/read/read0read.c index 889612deef4..76df7cdbee0 100644 --- a/innobase/read/read0read.c +++ b/innobase/read/read0read.c @@ -153,10 +153,11 @@ read_view_open_now( /* No active transaction should be visible, except cr_trx */ while (trx) { - if (trx != cr_trx && trx->conc_state == TRX_ACTIVE) { + if (trx != cr_trx && (trx->conc_state == TRX_ACTIVE || + trx->conc_state == TRX_PREPARED)) { read_view_set_nth_trx_id(view, n, trx->id); - + n++; /* NOTE that a transaction whose trx number is < @@ -164,7 +165,7 @@ read_view_open_now( in the middle of its commit! Note that when a transaction starts, we initialize trx->no to ut_dulint_max. */ - + if (ut_dulint_cmp(view->low_limit_no, trx->no) > 0) { view->low_limit_no = trx->no; @@ -211,15 +212,16 @@ read_view_close_for_mysql( /*======================*/ trx_t* trx) /* in: trx which has a read view */ { - ut_a(trx->read_view); + ut_a(trx->global_read_view); mutex_enter(&kernel_mutex); - read_view_close(trx->read_view); + read_view_close(trx->global_read_view); - mem_heap_empty(trx->read_view_heap); + mem_heap_empty(trx->global_read_view_heap); trx->read_view = NULL; + trx->global_read_view = NULL; mutex_exit(&kernel_mutex); } @@ -257,3 +259,137 @@ read_view_print( (ulong) ut_dulint_get_low(read_view_get_nth_trx_id(view, i))); } } + +/************************************************************************* +Create a consistent cursor view for mysql to be used in cursors. In this +consistent read view modifications done by the creating transaction or future +transactions are not visible. */ + +cursor_view_t* +read_cursor_view_create_for_mysql( +/*==============================*/ + trx_t* cr_trx) /* in: trx where cursor view is created */ +{ + cursor_view_t* curview; + read_view_t* view; + mem_heap_t* heap; + trx_t* trx; + ulint n; + + ut_a(cr_trx); + + /* Use larger heap than in trx_create when creating a read_view + because cursors are quite long. 
*/ + + heap = mem_heap_create(512); + + curview = (cursor_view_t*) mem_heap_alloc(heap, sizeof(cursor_view_t)); + curview->heap = heap; + + mutex_enter(&kernel_mutex); + + curview->read_view = read_view_create_low( + UT_LIST_GET_LEN(trx_sys->trx_list), + curview->heap); + + view = curview->read_view; + view->creator = cr_trx; + + /* No future transactions should be visible in the view */ + + view->low_limit_no = trx_sys->max_trx_id; + view->low_limit_id = view->low_limit_no; + + view->can_be_too_old = FALSE; + + n = 0; + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + /* No active transaction should be visible, not even cr_trx !*/ + + while (trx) { + if (trx->conc_state == TRX_ACTIVE || + trx->conc_state == TRX_PREPARED) { + + read_view_set_nth_trx_id(view, n, trx->id); + + n++; + + /* NOTE that a transaction whose trx number is < + trx_sys->max_trx_id can still be active, if it is + in the middle of its commit! Note that when a + transaction starts, we initialize trx->no to + ut_dulint_max. */ + + if (ut_dulint_cmp(view->low_limit_no, trx->no) > 0) { + + view->low_limit_no = trx->no; + } + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + view->n_trx_ids = n; + + if (n > 0) { + /* The last active transaction has the smallest id: */ + view->up_limit_id = read_view_get_nth_trx_id(view, n - 1); + } else { + view->up_limit_id = view->low_limit_id; + } + + UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view); + + mutex_exit(&kernel_mutex); + + return(curview); +} + +/************************************************************************* +Close a given consistent cursor view for and restore global read view +back to a transaction. */ + +void +read_cursor_view_close_for_mysql( +/*=============================*/ + trx_t* trx, /* in: trx */ + cursor_view_t* curview)/* in: cursor view to be closed */ +{ + ut_a(curview); + ut_a(curview->read_view); + ut_a(curview->heap); + + mutex_enter(&kernel_mutex); + + read_view_close(curview->read_view); + trx->read_view = trx->global_read_view; + + mutex_exit(&kernel_mutex); + + mem_heap_free(curview->heap); +} + +/************************************************************************* +This function sets a given consistent cursor view to a transaction +read view if given consistent cursor view is not null. Otherwice, function +restores a global read view to a transaction read view. 
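
For context, a view built as above supports the usual consistent-read test: transaction ids below up_limit_id had already committed when the view was created, ids at or above low_limit_id did not yet exist, and ids in between are visible only if they are not among the recorded active or prepared transactions. A sketch of that rule (an illustration of the semantics, not a hunk from read0read):

ibool
example_view_sees_trx_id(
/*=====================*/
				/* out, sketch: TRUE if changes of trx_id
				are visible in the view */
	read_view_t*	view,	/* in: consistent read view */
	dulint		trx_id)	/* in: transaction id to test */
{
	ulint	i;

	if (ut_dulint_cmp(trx_id, view->up_limit_id) < 0) {

		return(TRUE);	/* committed before the oldest listed trx */
	}

	if (ut_dulint_cmp(trx_id, view->low_limit_id) >= 0) {

		return(FALSE);	/* started after the view was created */
	}

	for (i = 0; i < view->n_trx_ids; i++) {
		if (0 == ut_dulint_cmp(trx_id,
				read_view_get_nth_trx_id(view, i))) {

			return(FALSE);	/* still active or prepared */
		}
	}

	return(TRUE);
}
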
*/ + +void +read_cursor_set_for_mysql( +/*======================*/ + trx_t* trx, /* in: transaction where cursor is set */ + cursor_view_t* curview)/* in: consistent cursor view to be set */ +{ + ut_a(trx); + + mutex_enter(&kernel_mutex); + + if (UNIV_LIKELY(curview != NULL)) { + trx->read_view = curview->read_view; + } else { + trx->read_view = trx->global_read_view; + } + + mutex_exit(&kernel_mutex); +} diff --git a/innobase/rem/rem0cmp.c b/innobase/rem/rem0cmp.c index f2dc8a7021a..7c33476fb9e 100644 --- a/innobase/rem/rem0cmp.c +++ b/innobase/rem/rem0cmp.c @@ -51,6 +51,7 @@ cmp_debug_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields);/* in/out: number of already completely matched fields; when function returns, contains the value for current @@ -319,7 +320,7 @@ cmp_data_data_slow( || (cur_type->mtype == DATA_BLOB && 0 == (cur_type->prtype & DATA_BINARY_TYPE) && dtype_get_charset_coll(cur_type->prtype) != - data_mysql_latin1_swedish_charset_coll)) { + DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { return(cmp_whole_field(cur_type, data1, (unsigned) len1, @@ -412,6 +413,7 @@ cmp_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns, contains the value for current comparison */ @@ -441,12 +443,27 @@ cmp_dtuple_rec_with_match( ut_ad(dtuple && rec && matched_fields && matched_bytes); ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); cur_field = *matched_fields; cur_bytes = *matched_bytes; ut_ad(cur_field <= dtuple_get_n_fields_cmp(dtuple)); - ut_ad(cur_field <= rec_get_n_fields(rec)); + ut_ad(cur_field <= rec_offs_n_fields(offsets)); + + if (cur_bytes == 0 && cur_field == 0) { + ulint rec_info = rec_get_info_bits(rec, + rec_offs_comp(offsets)); + ulint tup_info = dtuple_get_info_bits(dtuple); + + if (rec_info & REC_INFO_MIN_REC_FLAG) { + ret = !(tup_info & REC_INFO_MIN_REC_FLAG); + goto order_resolved; + } else if (tup_info & REC_INFO_MIN_REC_FLAG) { + ret = -1; + goto order_resolved; + } + } /* Match fields in a loop; stop if we run out of fields in dtuple or find an externally stored field */ @@ -458,39 +475,16 @@ cmp_dtuple_rec_with_match( dtuple_f_len = dfield_get_len(dtuple_field); - rec_b_ptr = rec_get_nth_field(rec, cur_field, &rec_f_len); + rec_b_ptr = rec_get_nth_field(rec, offsets, + cur_field, &rec_f_len); /* If we have matched yet 0 bytes, it may be that one or both the fields are SQL null, or the record or dtuple may be the predefined minimum record, or the field is externally stored */ - if (cur_bytes == 0) { - if (cur_field == 0) { - - if (rec_get_info_bits(rec) - & REC_INFO_MIN_REC_FLAG) { - - if (dtuple_get_info_bits(dtuple) - & REC_INFO_MIN_REC_FLAG) { - - ret = 0; - } else { - ret = 1; - } - - goto order_resolved; - } - - if (dtuple_get_info_bits(dtuple) - & REC_INFO_MIN_REC_FLAG) { - ret = -1; - - goto order_resolved; - } - } - - if (rec_get_nth_field_extern_bit(rec, cur_field)) { + if (UNIV_LIKELY(cur_bytes == 0)) { + if (rec_offs_nth_extern(offsets, cur_field)) { /* We do not compare to an externally stored field */ @@ -499,24 +493,20 @@ cmp_dtuple_rec_with_match( goto order_resolved; } - if (dtuple_f_len == UNIV_SQL_NULL - || rec_f_len == UNIV_SQL_NULL) 
{ - - if (dtuple_f_len == rec_f_len) { + if (dtuple_f_len == UNIV_SQL_NULL) { + if (rec_f_len == UNIV_SQL_NULL) { goto next_field; } - if (rec_f_len == UNIV_SQL_NULL) { - /* We define the SQL null to be the - smallest possible value of a field - in the alphabetical order */ - - ret = 1; - } else { - ret = -1; - } + ret = -1; + goto order_resolved; + } else if (rec_f_len == UNIV_SQL_NULL) { + /* We define the SQL null to be the + smallest possible value of a field + in the alphabetical order */ + ret = 1; goto order_resolved; } } @@ -525,7 +515,7 @@ cmp_dtuple_rec_with_match( || (cur_type->mtype == DATA_BLOB && 0 == (cur_type->prtype & DATA_BINARY_TYPE) && dtype_get_charset_coll(cur_type->prtype) != - data_mysql_latin1_swedish_charset_coll)) { + DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { ret = cmp_whole_field( cur_type, @@ -550,7 +540,7 @@ cmp_dtuple_rec_with_match( /* Compare then the fields */ for (;;) { - if (rec_f_len <= cur_bytes) { + if (UNIV_UNLIKELY(rec_f_len <= cur_bytes)) { if (dtuple_f_len <= cur_bytes) { goto next_field; @@ -567,7 +557,7 @@ cmp_dtuple_rec_with_match( rec_byte = *rec_b_ptr; } - if (dtuple_f_len <= cur_bytes) { + if (UNIV_UNLIKELY(dtuple_f_len <= cur_bytes)) { dtuple_byte = dtype_get_pad_char(cur_type); if (dtuple_byte == ULINT_UNDEFINED) { @@ -595,14 +585,16 @@ cmp_dtuple_rec_with_match( rec_byte = cmp_collate(rec_byte); dtuple_byte = cmp_collate(dtuple_byte); } - - if (dtuple_byte > rec_byte) { - ret = 1; - goto order_resolved; - } else if (dtuple_byte < rec_byte) { - ret = -1; - goto order_resolved; + ret = dtuple_byte - rec_byte; + if (UNIV_UNLIKELY(ret)) { + if (ret < 0) { + ret = -1; + goto order_resolved; + } else { + ret = 1; + goto order_resolved; + } } next_byte: /* Next byte */ @@ -622,7 +614,7 @@ cmp_dtuple_rec_with_match( up to the common fields */ order_resolved: ut_ad((ret >= - 1) && (ret <= 1)); - ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, + ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, offsets, matched_fields)); ut_ad(*matched_fields == cur_field); /* In the debug version, the above cmp_debug_... 
sets @@ -643,13 +635,15 @@ cmp_dtuple_rec( less than rec, respectively; see the comments for cmp_dtuple_rec_with_match */ dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint matched_fields = 0; ulint matched_bytes = 0; - return(cmp_dtuple_rec_with_match(dtuple, rec, &matched_fields, - &matched_bytes)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(cmp_dtuple_rec_with_match(dtuple, rec, offsets, + &matched_fields, &matched_bytes)); } /****************************************************************** @@ -660,22 +654,24 @@ ibool cmp_dtuple_is_prefix_of_rec( /*========================*/ /* out: TRUE if prefix */ - dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec) /* in: physical record */ + dtuple_t* dtuple, /* in: data tuple */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n_fields; ulint matched_fields = 0; ulint matched_bytes = 0; + ut_ad(rec_offs_validate(rec, NULL, offsets)); n_fields = dtuple_get_n_fields(dtuple); - if (n_fields > rec_get_n_fields(rec)) { + if (n_fields > rec_offs_n_fields(offsets)) { return(FALSE); } - cmp_dtuple_rec_with_match(dtuple, rec, &matched_fields, - &matched_bytes); + cmp_dtuple_rec_with_match(dtuple, rec, offsets, + &matched_fields, &matched_bytes); if (matched_fields == n_fields) { return(TRUE); @@ -690,42 +686,6 @@ cmp_dtuple_is_prefix_of_rec( return(FALSE); } -/****************************************************************** -Compares a prefix of a data tuple to a prefix of a physical record for -equality. If there are less fields in rec than parameter n_fields, FALSE -is returned. NOTE that n_fields_cmp of dtuple does not affect this -comparison. */ - -ibool -cmp_dtuple_rec_prefix_equal( -/*========================*/ - /* out: TRUE if equal */ - dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec, /* in: physical record */ - ulint n_fields) /* in: number of fields which should be - compared; must not exceed the number of - fields in dtuple */ -{ - ulint matched_fields = 0; - ulint matched_bytes = 0; - - ut_ad(n_fields <= dtuple_get_n_fields(dtuple)); - - if (rec_get_n_fields(rec) < n_fields) { - - return(FALSE); - } - - cmp_dtuple_rec_with_match(dtuple, rec, &matched_fields, - &matched_bytes); - if (matched_fields >= n_fields) { - - return(TRUE); - } - - return(FALSE); -} - /***************************************************************** This function is used to compare two physical records. 
Only the common first fields are compared, and if an externally stored field is @@ -739,6 +699,8 @@ cmp_rec_rec_with_match( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index, /* in: data dictionary index */ ulint* matched_fields, /* in/out: number of already completely matched fields; when the function returns, @@ -765,17 +727,21 @@ cmp_rec_rec_with_match( ulint cur_bytes; /* number of already matched bytes in current field */ int ret = 3333; /* return value */ + ulint comp; ut_ad(rec1 && rec2 && index); + ut_ad(rec_offs_validate(rec1, index, offsets1)); + ut_ad(rec_offs_validate(rec2, index, offsets2)); + ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2)); - rec1_n_fields = rec_get_n_fields(rec1); - rec2_n_fields = rec_get_n_fields(rec2); + comp = rec_offs_comp(offsets1); + rec1_n_fields = rec_offs_n_fields(offsets1); + rec2_n_fields = rec_offs_n_fields(offsets2); cur_field = *matched_fields; cur_bytes = *matched_bytes; - /* Match fields in a loop; stop if we run out of fields in either - record */ + /* Match fields in a loop */ while ((cur_field < rec1_n_fields) && (cur_field < rec2_n_fields)) { @@ -787,17 +753,19 @@ cmp_rec_rec_with_match( dict_index_get_nth_field(index, cur_field))); } - rec1_b_ptr = rec_get_nth_field(rec1, cur_field, &rec1_f_len); - rec2_b_ptr = rec_get_nth_field(rec2, cur_field, &rec2_f_len); - + rec1_b_ptr = rec_get_nth_field(rec1, offsets1, + cur_field, &rec1_f_len); + rec2_b_ptr = rec_get_nth_field(rec2, offsets2, + cur_field, &rec2_f_len); + if (cur_bytes == 0) { if (cur_field == 0) { /* Test if rec is the predefined minimum record */ - if (rec_get_info_bits(rec1) + if (rec_get_info_bits(rec1, comp) & REC_INFO_MIN_REC_FLAG) { - if (rec_get_info_bits(rec2) + if (rec_get_info_bits(rec2, comp) & REC_INFO_MIN_REC_FLAG) { ret = 0; } else { @@ -806,7 +774,7 @@ cmp_rec_rec_with_match( goto order_resolved; - } else if (rec_get_info_bits(rec2) + } else if (rec_get_info_bits(rec2, comp) & REC_INFO_MIN_REC_FLAG) { ret = 1; @@ -815,8 +783,8 @@ cmp_rec_rec_with_match( } } - if (rec_get_nth_field_extern_bit(rec1, cur_field) - || rec_get_nth_field_extern_bit(rec2, cur_field)) { + if (rec_offs_nth_extern(offsets1, cur_field) + || rec_offs_nth_extern(offsets2, cur_field)) { /* We do not compare to an externally stored field */ @@ -851,7 +819,7 @@ cmp_rec_rec_with_match( || (cur_type->mtype == DATA_BLOB && 0 == (cur_type->prtype & DATA_BINARY_TYPE) && dtype_get_charset_coll(cur_type->prtype) != - data_mysql_latin1_swedish_charset_coll)) { + DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { ret = cmp_whole_field(cur_type, rec1_b_ptr, (unsigned) rec1_f_len, @@ -971,6 +939,7 @@ cmp_debug_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields) /* in/out: number of already completely matched fields; when function returns, contains the value for current @@ -990,21 +959,19 @@ cmp_debug_dtuple_rec_with_match( ut_ad(dtuple && rec && matched_fields); ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_ad(*matched_fields <= dtuple_get_n_fields_cmp(dtuple)); - ut_ad(*matched_fields <= rec_get_n_fields(rec)); + ut_ad(*matched_fields <= rec_offs_n_fields(offsets)); cur_field = *matched_fields; if (cur_field == 
0) { - if (rec_get_info_bits(rec) & REC_INFO_MIN_REC_FLAG) { + if (rec_get_info_bits(rec, rec_offs_comp(offsets)) + & REC_INFO_MIN_REC_FLAG) { - if (dtuple_get_info_bits(dtuple) - & REC_INFO_MIN_REC_FLAG) { - ret = 0; - } else { - ret = 1; - } + ret = !(dtuple_get_info_bits(dtuple) + & REC_INFO_MIN_REC_FLAG); goto order_resolved; } @@ -1027,9 +994,10 @@ cmp_debug_dtuple_rec_with_match( dtuple_f_data = dfield_get_data(dtuple_field); dtuple_f_len = dfield_get_len(dtuple_field); - rec_f_data = rec_get_nth_field(rec, cur_field, &rec_f_len); + rec_f_data = rec_get_nth_field(rec, offsets, + cur_field, &rec_f_len); - if (rec_get_nth_field_extern_bit(rec, cur_field)) { + if (rec_offs_nth_extern(offsets, cur_field)) { /* We do not compare to an externally stored field */ ret = 0; diff --git a/innobase/rem/rem0rec.c b/innobase/rem/rem0rec.c index 1db89241dff..fbc33aea669 100644 --- a/innobase/rem/rem0rec.c +++ b/innobase/rem/rem0rec.c @@ -15,8 +15,8 @@ Created 5/30/1994 Heikki Tuuri #include "mtr0mtr.h" #include "mtr0log.h" -/* PHYSICAL RECORD - =============== +/* PHYSICAL RECORD (OLD STYLE) + =========================== The physical record, which is the data type of all the records found in index pages of the database, has the following format @@ -39,7 +39,7 @@ represented on a higher text line): | 10 bits giving the number of fields in this record | | 1 bit which is set to 1 if the offsets above are given in one byte format, 0 if in two byte format | -| two bytes giving the pointer to the next record in the page | +| two bytes giving an absolute pointer to the next record in the page | ORIGIN of the record | first field of data | ... @@ -55,9 +55,50 @@ The offsets of the data fields are given as one-byte (if there are less than 127 bytes of data in the record) or two-byte unsigned integers. The most significant bit is not part of the offset, instead it indicates the SQL-null -if the bit is set to 1. +if the bit is set to 1. */ -CANONICAL COORDINATES. A record can be seen as a single +/* PHYSICAL RECORD (NEW STYLE) + =========================== + +The physical record, which is the data type of all the records +found in index pages of the database, has the following format +(lower addresses and more significant bits inside a byte are below +represented on a higher text line): + +| length of the last non-null variable-length field of data: + if the maximum length is 255, one byte; otherwise, + 0xxxxxxx (one byte, length=0..127), or 1exxxxxxxxxxxxxx (two bytes, + length=128..16383, extern storage flag) | +... +| length of first variable-length field of data | +| SQL-null flags (1 bit per nullable field), padded to full bytes | +| 4 bits used to delete mark a record, and mark a predefined + minimum record in alphabetical order | +| 4 bits giving the number of records owned by this record + (this term is explained in page0page.h) | +| 13 bits giving the order number of this record in the + heap of the index page | +| 3 bits record type: 000=conventional, 001=node pointer (inside B-tree), + 010=infimum, 011=supremum, 1xx=reserved | +| two bytes giving a relative pointer to the next record in the page | +ORIGIN of the record +| first field of data | +... +| last field of data | + +The origin of the record is the start address of the first field +of data. The offsets are given relative to the origin. +The offsets of the data fields are stored in an inverted +order because then the offset of the first fields are near the +origin, giving maybe a better processor cache hit rate in searches. 
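As a reading aid for the new-style length bytes described above (the helper and its name are illustrative, not part of the patch): a minimal sketch of how one stored length is decoded while walking the header towards lower addresses, mirroring what rec_init_offsets() does further below. The long_or_blob flag stands for "declared maximum length above 255 bytes, or a BLOB column".

static ulint
rec_length_decode_sketch(
	const byte**	lens,		/* in/out: cursor walking the
					length bytes towards lower
					addresses */
	ibool		long_or_blob,	/* in: column may need a
					two-byte length */
	ibool*		is_extern)	/* out: extern storage flag */
{
	ulint	len = *(*lens)--;	/* one-byte case: 0..127, or
					0..255 for short columns */
	*is_extern = FALSE;

	if (long_or_blob && (len & 0x80)) {
		/* 1exxxxxx xxxxxxxx: two bytes, e = extern flag */
		len <<= 8;
		len |= *(*lens)--;

		*is_extern = (len & 0x4000) != 0;
		len &= 0x3fff;		/* 14-bit length, 128..16383 */
	}

	return(len);
}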
+ +The offsets of the data fields are given as one-byte +(if there are less than 127 bytes of data in the record) +or two-byte unsigned integers. The most significant bit +is not part of the offset, instead it indicates the SQL-null +if the bit is set to 1. */ + +/* CANONICAL COORDINATES. A record can be seen as a single string of 'characters' in the following way: catenate the bytes in each field, in the order of fields. An SQL-null field is taken to be an empty sequence of bytes. Then after @@ -86,13 +127,250 @@ the corresponding canonical strings have the same property. */ ulint rec_dummy; /* this is used to fool compiler in rec_validate */ +/******************************************************************* +Validates the consistency of an old-style physical record. */ +static +ibool +rec_validate_old( +/*=============*/ + /* out: TRUE if ok */ + rec_t* rec); /* in: physical record */ + +/********************************************************** +The following function determines the offsets to each field in the +record. The offsets are written to a previously allocated array of +ulint, where rec_offs_n_fields(offsets) has been initialized to the +number of fields in the record. The rest of the array will be +initialized by this function. rec_offs_base(offsets)[0] will be set +to the extra size (if REC_OFFS_COMPACT is set, the record is in the +new format), and rec_offs_base(offsets)[1..n_fields] will be set to +offsets past the end of fields 0..n_fields, or to the beginning of +fields 1..n_fields+1. When the high-order bit of the offset at [i+1] +is set (REC_OFFS_SQL_NULL), the field i is NULL. When the second +high-order bit of the offset at [i+1] is set (REC_OFFS_EXTERNAL), the +field i is being stored externally. */ +static +void +rec_init_offsets( +/*=============*/ + /* out: the offsets */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets)/* in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ +{ + ulint i = 0; + ulint offs; + + rec_offs_make_valid(rec, index, offsets); + + if (UNIV_LIKELY(index->table->comp)) { + const byte* nulls; + const byte* lens; + dict_field_t* field; + ulint null_mask; + ulint status = rec_get_status(rec); + ulint n_node_ptr_field = ULINT_UNDEFINED; + + switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* the field is 8 bytes long */ + rec_offs_base(offsets)[0] = + REC_N_NEW_EXTRA_BYTES | REC_OFFS_COMPACT; + rec_offs_base(offsets)[1] = 8; + return; + case REC_STATUS_NODE_PTR: + n_node_ptr_field = + dict_index_get_n_unique_in_tree(index); + break; + case REC_STATUS_ORDINARY: + break; + } + + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - (index->n_nullable + 7) / 8; + offs = 0; + null_mask = 1; + + /* read the lengths of fields 0..n */ + do { + ulint len; + if (UNIV_UNLIKELY(i == n_node_ptr_field)) { + len = offs += 4; + goto resolved; + } + + field = dict_index_get_nth_field(index, i); + if (!(dtype_get_prtype(dict_col_get_type( + dict_field_get_col(field))) + & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* No length is stored for NULL fields. + We do not advance offs, and we set + the length to zero and enable the + SQL NULL flag in offsets[]. 
*/ + len = offs | REC_OFFS_SQL_NULL; + goto resolved; + } + null_mask <<= 1; + } + + if (UNIV_UNLIKELY(!field->fixed_len)) { + /* Variable-length field: read the length */ + dtype_t* type = dict_col_get_type( + dict_field_get_col(field)); + len = *lens--; + if (UNIV_UNLIKELY(dtype_get_len(type) > 255) + || UNIV_UNLIKELY(dtype_get_mtype(type) + == DATA_BLOB)) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + len <<= 8; + len |= *lens--; + + offs += len & 0x3fff; + if (UNIV_UNLIKELY(len + & 0x4000)) { + len = offs + | REC_OFFS_EXTERNAL; + } else { + len = offs; + } + + goto resolved; + } + } + + len = offs += len; + } else { + len = offs += field->fixed_len; + } + resolved: + rec_offs_base(offsets)[i + 1] = len; + } while (++i < rec_offs_n_fields(offsets)); + + *rec_offs_base(offsets) = + (rec - (lens + 1)) | REC_OFFS_COMPACT; + } else { + /* Old-style record: determine extra size and end offsets */ + offs = REC_N_OLD_EXTRA_BYTES; + if (rec_get_1byte_offs_flag(rec)) { + offs += rec_offs_n_fields(offsets); + *rec_offs_base(offsets) = offs; + /* Determine offsets to fields */ + do { + offs = rec_1_get_field_end_info(rec, i); + if (offs & REC_1BYTE_SQL_NULL_MASK) { + offs &= ~REC_1BYTE_SQL_NULL_MASK; + offs |= REC_OFFS_SQL_NULL; + } + rec_offs_base(offsets)[1 + i] = offs; + } while (++i < rec_offs_n_fields(offsets)); + } else { + offs += 2 * rec_offs_n_fields(offsets); + *rec_offs_base(offsets) = offs; + /* Determine offsets to fields */ + do { + offs = rec_2_get_field_end_info(rec, i); + if (offs & REC_2BYTE_SQL_NULL_MASK) { + offs &= ~REC_2BYTE_SQL_NULL_MASK; + offs |= REC_OFFS_SQL_NULL; + } + if (offs & REC_2BYTE_EXTERN_MASK) { + offs &= ~REC_2BYTE_EXTERN_MASK; + offs |= REC_OFFS_EXTERNAL; + } + rec_offs_base(offsets)[1 + i] = offs; + } while (++i < rec_offs_n_fields(offsets)); + } + } +} + +/********************************************************** +The following function determines the offsets to each field +in the record. It can reuse a previously returned array. 
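A caller-side sketch of this offsets interface (the helper below is illustrative and not part of the patch): a small stack array is tried first, rec_get_offsets() falls back to a heap when the record needs more room, and all field accesses then go through the offsets. The same pattern recurs throughout the patch, for example in rec_convert_dtuple_to_rec() and in row0ins.c.

static const byte*
rec_first_field_sketch(
	rec_t*		rec,	/* in: physical record */
	dict_index_t*	index,	/* in: record descriptor */
	ulint*		len)	/* out: field length, or UNIV_SQL_NULL */
{
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	const byte*	data;

	*offsets_ = (sizeof offsets_) / sizeof *offsets_;

	offsets = rec_get_offsets(rec, index, offsets,
					ULINT_UNDEFINED, &heap);

	/* fields are addressed only through the offsets array */
	data = rec_get_nth_field(rec, offsets, 0, len);

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	return(data);
}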
*/ + +ulint* +rec_get_offsets_func( +/*=================*/ + /* out: the new offsets */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in: array consisting of offsets[0] + allocated elements, or an array from + rec_get_offsets(), or NULL */ + ulint n_fields,/* in: maximum number of initialized fields + (ULINT_UNDEFINED if all fields) */ + mem_heap_t** heap, /* in/out: memory heap */ + const char* file, /* in: file name where called */ + ulint line) /* in: line number where called */ +{ + ulint n; + ulint size; + + ut_ad(rec); + ut_ad(index); + ut_ad(heap); + + if (UNIV_LIKELY(index->table->comp)) { + switch (UNIV_EXPECT(rec_get_status(rec), + REC_STATUS_ORDINARY)) { + case REC_STATUS_ORDINARY: + n = dict_index_get_n_fields(index); + break; + case REC_STATUS_NODE_PTR: + n = dict_index_get_n_unique_in_tree(index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record */ + n = 1; + break; + default: + ut_error; + return(NULL); + } + } else { + n = rec_get_n_fields_old(rec); + } + + if (UNIV_UNLIKELY(n_fields < n)) { + n = n_fields; + } + + size = n + (1 + REC_OFFS_HEADER_SIZE); + + if (UNIV_UNLIKELY(!offsets) || + UNIV_UNLIKELY(rec_offs_get_n_alloc(offsets) < size)) { + if (!*heap) { + *heap = mem_heap_create_func(size * sizeof(ulint), + NULL, MEM_HEAP_DYNAMIC, file, line); + } + offsets = mem_heap_alloc(*heap, size * sizeof(ulint)); + rec_offs_set_n_alloc(offsets, size); + } + + rec_offs_set_n_fields(offsets, n); + rec_init_offsets(rec, index, offsets); + return(offsets); +} + /**************************************************************** -The following function is used to get a pointer to the nth data field in a -record. */ +The following function is used to get a pointer to the nth +data field in an old-style record. */ byte* -rec_get_nth_field( -/*==============*/ +rec_get_nth_field_old( +/*==================*/ /* out: pointer to the field */ rec_t* rec, /* in: record */ ulint n, /* in: index of the field */ @@ -103,9 +381,9 @@ rec_get_nth_field( ulint next_os; ut_ad(rec && len); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - if (n > 1024) { + if (n > REC_MAX_N_FIELDS) { fprintf(stderr, "Error: trying to access field %lu in rec\n", (ulong) n); ut_error; @@ -150,8 +428,78 @@ rec_get_nth_field( return(rec + os); } +/************************************************************** +The following function returns the size of a data tuple when converted to +a new-style physical record. 
*/ + +ulint +rec_get_converted_size_new( +/*=======================*/ + /* out: size */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple) /* in: data tuple */ +{ + ulint size = REC_N_NEW_EXTRA_BYTES + + (index->n_nullable + 7) / 8; + dict_field_t* field; + dtype_t* type; + ulint i; + ulint n_fields; + ut_ad(index && dtuple); + ut_ad(index->table->comp); + + switch (dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) { + case REC_STATUS_ORDINARY: + n_fields = dict_index_get_n_fields(index); + ut_ad(n_fields == dtuple_get_n_fields(dtuple)); + break; + case REC_STATUS_NODE_PTR: + n_fields = dict_index_get_n_unique_in_tree(index); + ut_ad(n_fields + 1 == dtuple_get_n_fields(dtuple)); + ut_ad(dtuple_get_nth_field(dtuple, n_fields)->len == 4); + size += 4; /* child page number */ + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record, 8 bytes */ + return(size + 8); /* no extra data needed */ + default: + ut_a(0); + return(ULINT_UNDEFINED); + } + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + ulint len = dtuple_get_nth_field(dtuple, i)->len; + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + ut_ad(len != UNIV_SQL_NULL || + !(dtype_get_prtype(type) & DATA_NOT_NULL)); + + if (len == UNIV_SQL_NULL) { + /* No length is stored for NULL fields. */ + continue; + } + + ut_ad(len <= dtype_get_len(type) + || dtype_get_mtype(type) == DATA_BLOB); + ut_ad(!field->fixed_len || len == field->fixed_len); + + if (field->fixed_len) { + } else if (len < 128 || (dtype_get_len(type) < 256 + && dtype_get_mtype(type) != DATA_BLOB)) { + size++; + } else { + size += 2; + } + size += len; + } + + return(size); +} + /*************************************************************** -Sets the value of the ith field SQL null bit. */ +Sets the value of the ith field SQL null bit of an old-style record. */ void rec_set_nth_field_null_bit( @@ -189,12 +537,12 @@ rec_set_nth_field_null_bit( } /*************************************************************** -Sets the value of the ith field extern storage bit. */ +Sets the value of the ith field extern storage bit of an old-style record. */ void -rec_set_nth_field_extern_bit( -/*=========================*/ - rec_t* rec, /* in: record */ +rec_set_nth_field_extern_bit_old( +/*=============================*/ + rec_t* rec, /* in: old-style record */ ulint i, /* in: ith field */ ibool val, /* in: value to set */ mtr_t* mtr) /* in: mtr holding an X-latch to the page where @@ -204,7 +552,7 @@ rec_set_nth_field_extern_bit( ulint info; ut_a(!rec_get_1byte_offs_flag(rec)); - ut_a(i < rec_get_n_fields(rec)); + ut_a(i < rec_get_n_fields_old(rec)); info = rec_2_get_field_end_info(rec, i); @@ -215,36 +563,133 @@ rec_set_nth_field_extern_bit( } if (mtr) { - mlog_write_ulint(rec - REC_N_EXTRA_BYTES - 2 * (i + 1), info, - MLOG_2BYTES, mtr); + mlog_write_ulint(rec - REC_N_OLD_EXTRA_BYTES - 2 * (i + 1), + info, MLOG_2BYTES, mtr); } else { rec_2_set_field_end_info(rec, i, info); } } /*************************************************************** +Sets the value of the ith field extern storage bit of a new-style record. 
*/ + +void +rec_set_nth_field_extern_bit_new( +/*=============================*/ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint ith, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ +{ + byte* nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + byte* lens = nulls - (index->n_nullable + 7) / 8; + dict_field_t* field; + dtype_t* type; + ulint i; + ulint n_fields; + ulint null_mask = 1; + ut_ad(rec && index); + ut_ad(index->table->comp); + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + + n_fields = dict_index_get_n_fields(index); + + ut_ad(ith < n_fields); + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* NULL fields cannot be external. */ + ut_ad(i != ith); + continue; + } + + null_mask <<= 1; + } + if (field->fixed_len) { + /* fixed-length fields cannot be external + (Fixed-length fields longer than + DICT_MAX_COL_PREFIX_LEN will be treated as + variable-length ones in dict_index_add_col().) */ + ut_ad(i != ith); + continue; + } + lens--; + if (dtype_get_len(type) > 255 + || dtype_get_mtype(type) == DATA_BLOB) { + ulint len = lens[1]; + if (len & 0x80) { /* 1exxxxxx: 2-byte length */ + if (i == ith) { + if (!val == !(len & 0x40)) { + return; /* no change */ + } + /* toggle the extern bit */ + len ^= 0x40; + if (mtr) { + mlog_write_ulint(lens + 1, len, + MLOG_1BYTE, mtr); + } else { + lens[1] = (byte) len; + } + return; + } + lens--; + } else { + /* short fields cannot be external */ + ut_ad(i != ith); + } + } else { + /* short fields cannot be external */ + ut_ad(i != ith); + } + } +} + +/*************************************************************** Sets TRUE the extern storage bits of fields mentioned in an array. */ void rec_set_field_extern_bits( /*======================*/ - rec_t* rec, /* in: record */ - ulint* vec, /* in: array of field numbers */ - ulint n_fields, /* in: number of fields numbers */ - mtr_t* mtr) /* in: mtr holding an X-latch to the page - where rec is, or NULL; in the NULL case we - do not write to log about the change */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + const ulint* vec, /* in: array of field numbers */ + ulint n_fields,/* in: number of fields numbers */ + mtr_t* mtr) /* in: mtr holding an X-latch to the + page where rec is, or NULL; + in the NULL case we do not write + to log about the change */ { ulint i; - - for (i = 0; i < n_fields; i++) { - rec_set_nth_field_extern_bit(rec, vec[i], TRUE, mtr); + + if (UNIV_LIKELY(index->table->comp)) { + for (i = 0; i < n_fields; i++) { + rec_set_nth_field_extern_bit_new(rec, index, vec[i], + TRUE, mtr); + } + } else { + for (i = 0; i < n_fields; i++) { + rec_set_nth_field_extern_bit_old(rec, vec[i], + TRUE, mtr); + } } } /*************************************************************** -Sets a record field to SQL null. The physical size of the field is not -changed. */ +Sets an old-style record field to SQL null. +The physical size of the field is not changed. 
*/ void rec_set_nth_field_sql_null( @@ -262,20 +707,20 @@ rec_set_nth_field_sql_null( } /************************************************************* -Builds a physical record out of a data tuple and stores it beginning from -address destination. */ - -rec_t* -rec_convert_dtuple_to_rec_low( +Builds an old-style physical record out of a data tuple and +stores it beginning from the start of the given buffer. */ +static +rec_t* +rec_convert_dtuple_to_rec_old( /*==========================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple, /* in: data tuple */ - ulint data_size) /* in: data size of dtuple */ + /* out: pointer to the origin of + physical record */ + byte* buf, /* in: start address of the physical record */ + dtuple_t* dtuple)/* in: data tuple */ { dfield_t* field; ulint n_fields; + ulint data_size; rec_t* rec; ulint end_offset; ulint ored_offset; @@ -283,24 +728,25 @@ rec_convert_dtuple_to_rec_low( ulint len; ulint i; - ut_ad(destination && dtuple); + ut_ad(buf && dtuple); ut_ad(dtuple_validate(dtuple)); ut_ad(dtuple_check_typed(dtuple)); - ut_ad(dtuple_get_data_size(dtuple) == data_size); n_fields = dtuple_get_n_fields(dtuple); + data_size = dtuple_get_data_size(dtuple); ut_ad(n_fields > 0); /* Calculate the offset of the origin in the physical record */ - rec = destination + rec_get_converted_extra_size(data_size, n_fields); + rec = buf + rec_get_converted_extra_size(data_size, n_fields); /* Store the number of fields */ - rec_set_n_fields(rec, n_fields); + rec_set_n_fields_old(rec, n_fields); /* Set the info bits of the record */ - rec_set_info_bits(rec, dtuple_get_info_bits(dtuple)); + rec_set_info_bits(rec, FALSE, + dtuple_get_info_bits(dtuple) & REC_INFO_BITS_MASK); /* Store the data and the offsets */ @@ -361,8 +807,218 @@ rec_convert_dtuple_to_rec_low( } } - ut_ad(rec_validate(rec)); + return(rec); +} + +/************************************************************* +Builds a new-style physical record out of a data tuple and +stores it beginning from the start of the given buffer. */ +static +rec_t* +rec_convert_dtuple_to_rec_new( +/*==========================*/ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of the physical record */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple) /* in: data tuple */ +{ + dfield_t* field; + dtype_t* type; + rec_t* rec = buf + REC_N_NEW_EXTRA_BYTES; + byte* end; + byte* nulls; + byte* lens; + ulint len; + ulint i; + ulint n_node_ptr_field; + ulint fixed_len; + ulint null_mask = 1; + const ulint n_fields = dtuple_get_n_fields(dtuple); + const ulint status = dtuple_get_info_bits(dtuple) + & REC_NEW_STATUS_MASK; + ut_ad(index->table->comp); + + ut_ad(n_fields > 0); + + /* Try to ensure that the memset() between the for() loops + completes fast. The address is not exact, but UNIV_PREFETCH + should never generate a memory fault. 
*/ + UNIV_PREFETCH_RW(rec - REC_N_NEW_EXTRA_BYTES - n_fields); + UNIV_PREFETCH_RW(rec); + + switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields <= dict_index_get_n_fields(index)); + n_node_ptr_field = ULINT_UNDEFINED; + break; + case REC_STATUS_NODE_PTR: + ut_ad(n_fields == dict_index_get_n_unique_in_tree(index) + 1); + n_node_ptr_field = n_fields - 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad(n_fields == 1); + n_node_ptr_field = ULINT_UNDEFINED; + goto init; + default: + ut_a(0); + return(0); + } + + /* Calculate the offset of the origin in the physical record. + We must loop over all fields to do this. */ + rec += (index->n_nullable + 7) / 8; + + for (i = 0; i < n_fields; i++) { + if (UNIV_UNLIKELY(i == n_node_ptr_field)) { +#ifdef UNIV_DEBUG + field = dtuple_get_nth_field(dtuple, i); + type = dfield_get_type(field); + ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); + ut_ad(dfield_get_len(field) == 4); +#endif /* UNIV_DEBUG */ + goto init; + } + field = dtuple_get_nth_field(dtuple, i); + type = dfield_get_type(field); + len = dfield_get_len(field); + fixed_len = dict_index_get_nth_field(index, i)->fixed_len; + + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + if (len == UNIV_SQL_NULL) + continue; + } + /* only nullable fields can be null */ + ut_ad(len != UNIV_SQL_NULL); + if (fixed_len) { + ut_ad(len == fixed_len); + } else { + ut_ad(len <= dtype_get_len(type) + || dtype_get_mtype(type) == DATA_BLOB); + rec++; + if (len >= 128 && (dtype_get_len(type) >= 256 + || dtype_get_mtype(type) == DATA_BLOB)) { + rec++; + } + } + } + +init: + end = rec; + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - (index->n_nullable + 7) / 8; + /* clear the SQL-null flags */ + memset (lens + 1, 0, nulls - lens); + + /* Set the info bits of the record */ + rec_set_status(rec, status); + + rec_set_info_bits(rec, TRUE, + dtuple_get_info_bits(dtuple) & REC_INFO_BITS_MASK); + + /* Store the data and the offsets */ + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(dtuple, i); + type = dfield_get_type(field); + len = dfield_get_len(field); + + if (UNIV_UNLIKELY(i == n_node_ptr_field)) { + ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); + ut_ad(len == 4); + memcpy(end, dfield_get_data(field), len); + break; + } + fixed_len = dict_index_get_nth_field(index, i)->fixed_len; + + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + /* nullable field */ + ut_ad(index->n_nullable > 0); + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + ut_ad(*nulls < null_mask); + + /* set the null flag if necessary */ + if (len == UNIV_SQL_NULL) { + *nulls |= null_mask; + null_mask <<= 1; + continue; + } + + null_mask <<= 1; + } + /* only nullable fields can be null */ + ut_ad(len != UNIV_SQL_NULL); + if (fixed_len) { + ut_ad(len == fixed_len); + } else { + ut_ad(len <= dtype_get_len(type) + || dtype_get_mtype(type) == DATA_BLOB); + if (len < 128 || (dtype_get_len(type) < 256 + && dtype_get_mtype(type) != DATA_BLOB)) { + *lens-- = (byte) len; + } + else { + /* the extern bits will be set later */ + ut_ad(len < 16384); + *lens-- = (byte) (len >> 8) | 0x80; + *lens-- = (byte) len; + } + } + + memcpy(end, dfield_get_data(field), len); + end += len; + } + + return(rec); +} + +/************************************************************* +Builds a physical record out of a data tuple and +stores it beginning from the start of the given buffer. 
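A usage sketch (helper name and buffer handling are illustrative) tying the new-format helpers together: the caller sizes a buffer with rec_get_converted_size_new() and then builds the record with the format-dispatching rec_convert_dtuple_to_rec() defined below; the returned pointer is the record origin inside the buffer.

static rec_t*
rec_build_compact_sketch(
	dict_index_t*	index,	/* in: descriptor of a compact-format
				(index->table->comp) index */
	dtuple_t*	dtuple,	/* in: data tuple to convert */
	mem_heap_t*	heap)	/* in: heap for the record buffer */
{
	ulint	size	= rec_get_converted_size_new(index, dtuple);
	byte*	buf	= mem_heap_alloc(heap, size);

	return(rec_convert_dtuple_to_rec(buf, index, dtuple));
}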
*/ + +rec_t* +rec_convert_dtuple_to_rec( +/*======================*/ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of the + physical record */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple) /* in: data tuple */ +{ + rec_t* rec; + + ut_ad(buf && index && dtuple); + ut_ad(dtuple_validate(dtuple)); + ut_ad(dtuple_check_typed(dtuple)); + if (UNIV_LIKELY(index->table->comp)) { + rec = rec_convert_dtuple_to_rec_new(buf, index, dtuple); + } else { + rec = rec_convert_dtuple_to_rec_old(buf, dtuple); + } + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + offsets = rec_get_offsets(rec, index, + offsets_, ULINT_UNDEFINED, &heap); + ut_ad(rec_validate(rec, offsets)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ return(rec); } @@ -375,6 +1031,7 @@ rec_copy_prefix_to_dtuple( /*======================*/ dtuple_t* tuple, /* in: data tuple */ rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ ulint n_fields, /* in: number of fields to copy */ mem_heap_t* heap) /* in: memory heap */ { @@ -383,16 +1040,22 @@ rec_copy_prefix_to_dtuple( ulint len; byte* buf = NULL; ulint i; - - ut_ad(rec_validate(rec)); + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + offsets = rec_get_offsets(rec, index, offsets, n_fields, &heap); + + ut_ad(rec_validate(rec, offsets)); ut_ad(dtuple_check_typed(tuple)); - dtuple_set_info_bits(tuple, rec_get_info_bits(rec)); + dtuple_set_info_bits(tuple, + rec_get_info_bits(rec, index->table->comp)); for (i = 0; i < n_fields; i++) { field = dtuple_get_nth_field(tuple, i); - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); if (len != UNIV_SQL_NULL) { buf = mem_heap_alloc(heap, len); @@ -405,32 +1068,28 @@ rec_copy_prefix_to_dtuple( } /****************************************************************** -Copies the first n fields of a physical record to a new physical record in -a buffer. */ - +Copies the first n fields of an old-style physical record +to a new physical record in a buffer. */ +static rec_t* -rec_copy_prefix_to_buf( -/*===================*/ +rec_copy_prefix_to_buf_old( +/*=======================*/ /* out, own: copied record */ rec_t* rec, /* in: physical record */ ulint n_fields, /* in: number of fields to copy */ + ulint area_end, /* in: end of the prefix data */ byte** buf, /* in/out: memory buffer for the copied prefix, or NULL */ ulint* buf_size) /* in/out: buffer size */ { rec_t* copy_rec; ulint area_start; - ulint area_end; ulint prefix_len; - ut_ad(rec_validate(rec)); - - area_end = rec_get_field_start_offs(rec, n_fields); - if (rec_get_1byte_offs_flag(rec)) { - area_start = REC_N_EXTRA_BYTES + n_fields; + area_start = REC_N_OLD_EXTRA_BYTES + n_fields; } else { - area_start = REC_N_EXTRA_BYTES + 2 * n_fields; + area_start = REC_N_OLD_EXTRA_BYTES + 2 * n_fields; } prefix_len = area_start + area_end; @@ -448,17 +1107,129 @@ rec_copy_prefix_to_buf( copy_rec = *buf + area_start; - rec_set_n_fields(copy_rec, n_fields); + rec_set_n_fields_old(copy_rec, n_fields); return(copy_rec); } -/******************************************************************* -Validates the consistency of a physical record. 
*/ +/****************************************************************** +Copies the first n fields of a physical record to a new physical record in +a buffer. */ + +rec_t* +rec_copy_prefix_to_buf( +/*===================*/ + /* out, own: copied record */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n_fields, /* in: number of fields to copy */ + byte** buf, /* in/out: memory buffer + for the copied prefix, or NULL */ + ulint* buf_size) /* in/out: buffer size */ +{ + byte* nulls; + byte* lens; + dict_field_t* field; + dtype_t* type; + ulint i; + ulint prefix_len; + ulint null_mask; + ulint status; + UNIV_PREFETCH_RW(*buf); + + if (UNIV_UNLIKELY(!index->table->comp)) { + ut_ad(rec_validate_old(rec)); + return(rec_copy_prefix_to_buf_old(rec, n_fields, + rec_get_field_start_offs(rec, n_fields), + buf, buf_size)); + } + + status = rec_get_status(rec); + + switch (status) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields <= dict_index_get_n_fields(index)); + break; + case REC_STATUS_NODE_PTR: + /* it doesn't make sense to copy the child page number field */ + ut_ad(n_fields <= dict_index_get_n_unique_in_tree(index)); + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record: no sense to copy anything */ + default: + ut_error; + return(NULL); + } + + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - (index->n_nullable + 7) / 8; + UNIV_PREFETCH_R(lens); + prefix_len = 0; + null_mask = 1; + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + continue; + } + + null_mask <<= 1; + } + + if (field->fixed_len) { + prefix_len += field->fixed_len; + } else { + ulint len = *lens--; + if (dtype_get_len(type) > 255 + || dtype_get_mtype(type) == DATA_BLOB) { + if (len & 0x80) { + /* 1exxxxxx */ + len &= 0x3f; + len <<= 8; + len |= *lens--; + UNIV_PREFETCH_R(lens); + } + } + prefix_len += len; + } + } + + UNIV_PREFETCH_R(rec + prefix_len); + + prefix_len += rec - (lens + 1); + + if ((*buf == NULL) || (*buf_size < prefix_len)) { + if (*buf != NULL) { + mem_free(*buf); + } + + *buf = mem_alloc(prefix_len); + *buf_size = prefix_len; + } + + memcpy(*buf, lens + 1, prefix_len); + + return(*buf + (rec - (lens + 1))); +} + +/******************************************************************* +Validates the consistency of an old-style physical record. 
*/ +static ibool -rec_validate( -/*=========*/ +rec_validate_old( +/*=============*/ /* out: TRUE if ok */ rec_t* rec) /* in: physical record */ { @@ -470,7 +1241,7 @@ rec_validate( ulint i; ut_a(rec); - n_fields = rec_get_n_fields(rec); + n_fields = rec_get_n_fields_old(rec); if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { fprintf(stderr, "InnoDB: Error: record has %lu fields\n", @@ -479,7 +1250,7 @@ rec_validate( } for (i = 0; i < n_fields; i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field_old(rec, i, &len); if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { fprintf(stderr, @@ -499,45 +1270,167 @@ rec_validate( } } - if (len_sum != (ulint)(rec_get_end(rec) - rec)) { + if (len_sum != rec_get_data_size_old(rec)) { fprintf(stderr, "InnoDB: Error: record len should be %lu, len %lu\n", (ulong) len_sum, - (ulong) (rec_get_end(rec) - rec)); + rec_get_data_size_old(rec)); + return(FALSE); + } + + rec_dummy = sum; /* This is here only to fool the compiler */ + + return(TRUE); +} + +/******************************************************************* +Validates the consistency of a physical record. */ + +ibool +rec_validate( +/*=========*/ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + const byte* data; + ulint len; + ulint n_fields; + ulint len_sum = 0; + ulint sum = 0; + ulint i; + + ut_a(rec); + n_fields = rec_offs_n_fields(offsets); + + if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { + fprintf(stderr, "InnoDB: Error: record has %lu fields\n", + (ulong) n_fields); + return(FALSE); + } + + ut_a(rec_offs_comp(offsets) || n_fields <= rec_get_n_fields_old(rec)); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { + fprintf(stderr, + "InnoDB: Error: record field %lu len %lu\n", (ulong) i, + (ulong) len); + return(FALSE); + } + + if (len != UNIV_SQL_NULL) { + len_sum += len; + sum += *(data + len -1); /* dereference the + end of the field to + cause a memory trap + if possible */ + } else if (!rec_offs_comp(offsets)) { + len_sum += rec_get_nth_field_size(rec, i); + } + } + + if (len_sum != (ulint)(rec_get_end(rec, offsets) - rec)) { + fprintf(stderr, + "InnoDB: Error: record len should be %lu, len %lu\n", + (ulong) len_sum, + (ulong) (rec_get_end(rec, offsets) - rec)); return(FALSE); } rec_dummy = sum; /* This is here only to fool the compiler */ + if (!rec_offs_comp(offsets)) { + ut_a(rec_validate_old(rec)); + } + return(TRUE); } /******************************************************************* +Prints an old-style physical record. */ + +void +rec_print_old( +/*==========*/ + FILE* file, /* in: file where to print */ + rec_t* rec) /* in: physical record */ +{ + const byte* data; + ulint len; + ulint n; + ulint i; + + ut_ad(rec); + + n = rec_get_n_fields_old(rec); + + fprintf(file, "PHYSICAL RECORD: n_fields %lu;" + " %u-byte offsets; info bits %lu\n", + (ulong) n, + rec_get_1byte_offs_flag(rec) ? 
1 : 2, + (ulong) rec_get_info_bits(rec, FALSE)); + + for (i = 0; i < n; i++) { + + data = rec_get_nth_field_old(rec, i, &len); + + fprintf(file, " %lu:", (ulong) i); + + if (len != UNIV_SQL_NULL) { + if (len <= 30) { + + ut_print_buf(file, data, len); + } else { + ut_print_buf(file, data, 30); + + fputs("...(truncated)", file); + } + } else { + fprintf(file, " SQL NULL, size %lu ", + rec_get_nth_field_size(rec, i)); + } + putc(';', file); + } + + putc('\n', file); + + rec_validate_old(rec); +} + +/******************************************************************* Prints a physical record. */ void -rec_print( -/*======*/ - FILE* file, /* in: file where to print */ - rec_t* rec) /* in: physical record */ +rec_print_new( +/*==========*/ + FILE* file, /* in: file where to print */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - byte* data; - ulint len; - ulint n; - ulint i; + const byte* data; + ulint len; + ulint i; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (!rec_offs_comp(offsets)) { + rec_print_old(file, rec); + return; + } ut_ad(rec); - - n = rec_get_n_fields(rec); fprintf(file, "PHYSICAL RECORD: n_fields %lu;" - " 1-byte offs %s; info bits %lu\n", - (ulong) n, rec_get_1byte_offs_flag(rec) ? "TRUE" : "FALSE", - (ulong) rec_get_info_bits(rec)); + " compact format; info bits %lu\n", + (ulong) rec_offs_n_fields(offsets), + (ulong) rec_get_info_bits(rec, TRUE)); - for (i = 0; i < n; i++) { + for (i = 0; i < rec_offs_n_fields(offsets); i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); fprintf(file, " %lu:", (ulong) i); @@ -551,14 +1444,40 @@ rec_print( fputs("...(truncated)", file); } } else { - fprintf(file, " SQL NULL, size %lu ", - (ulong) rec_get_nth_field_size(rec, i)); - + fputs(" SQL NULL", file); } putc(';', file); } putc('\n', file); - rec_validate(rec); + rec_validate(rec, offsets); +} + +/******************************************************************* +Prints a physical record. */ + +void +rec_print( +/*======*/ + FILE* file, /* in: file where to print */ + rec_t* rec, /* in: physical record */ + dict_index_t* index) /* in: record descriptor */ +{ + ut_ad(index); + + if (!index->table->comp) { + rec_print_old(file, rec); + return; + } else { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + rec_print_new(file, rec, rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } } diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c index 5ca1ee51cbd..bce775c25d6 100644 --- a/innobase/row/row0ins.c +++ b/innobase/row/row0ins.c @@ -256,7 +256,7 @@ row_ins_sec_index_entry_by_modify( rec = btr_cur_get_rec(cursor); ut_ad((cursor->index->type & DICT_CLUSTERED) == 0); - ut_ad(rec_get_deleted_flag(rec)); + ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp)); /* We know that in the alphabetical ordering, entry and rec are identified. But in their binary form there may be differences if @@ -321,7 +321,7 @@ row_ins_clust_index_entry_by_modify( rec = btr_cur_get_rec(cursor); - ut_ad(rec_get_deleted_flag(rec)); + ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp)); heap = mem_heap_create(1024); @@ -478,6 +478,8 @@ row_ins_cascade_calc_update_vec( if (parent_ufield->field_no == parent_field_no) { + ulint min_size; + /* A field in the parent index record is updated. 
Let us make the update vector field for the child table. */ @@ -506,10 +508,13 @@ row_ins_cascade_calc_update_vec( column, do not allow the update */ if (ufield->new_val.len != UNIV_SQL_NULL - && ufield->new_val.len - > dtype_get_len(type)) { + && dtype_get_at_most_n_mbchars( + type, dtype_get_len(type), + ufield->new_val.len, + ufield->new_val.data) + < ufield->new_val.len) { - return(ULINT_UNDEFINED); + return(ULINT_UNDEFINED); } /* If the parent column type has a different @@ -517,25 +522,48 @@ row_ins_cascade_calc_update_vec( need to pad with spaces the new value of the child column */ - if (dtype_is_fixed_size(type) + min_size = dtype_get_min_size(type); + + if (min_size && ufield->new_val.len != UNIV_SQL_NULL - && ufield->new_val.len - < dtype_get_fixed_size(type)) { + && ufield->new_val.len < min_size) { + char* pad_start; + const char* pad_end; ufield->new_val.data = mem_heap_alloc(heap, - dtype_get_fixed_size(type)); - ufield->new_val.len = - dtype_get_fixed_size(type); - ut_a(dtype_get_pad_char(type) - != ULINT_UNDEFINED); - - memset(ufield->new_val.data, - (byte)dtype_get_pad_char(type), - dtype_get_fixed_size(type)); + min_size); + pad_start = + ((char*) ufield->new_val.data) + + ufield->new_val.len; + pad_end = + ((char*) ufield->new_val.data) + + min_size; + ufield->new_val.len = min_size; ut_memcpy(ufield->new_val.data, parent_ufield->new_val.data, parent_ufield->new_val.len); + + switch (UNIV_EXPECT( + dtype_get_mbminlen(type), 1)) { + default: + ut_error; + case 1: + /* space=0x20 */ + memset(pad_start, 0x20, + pad_end - pad_start); + break; + case 2: + /* space=0x0020 */ + ut_a(!(ufield->new_val.len + % 2)); + ut_a(!(min_size % 2)); + do { + *pad_start++ = 0x00; + *pad_start++ = 0x20; + } while (pad_start < pad_end); + break; + } } ufield->extern_storage = FALSE; @@ -594,7 +622,7 @@ row_ins_foreign_report_err( ut_print_name(ef, trx, foreign->foreign_index->name); if (rec) { fputs(", there is a record:\n", ef); - rec_print(ef, rec); + rec_print(ef, rec, foreign->foreign_index); } else { fputs(", the record is not available\n", ef); } @@ -649,7 +677,7 @@ row_ins_foreign_report_add_err( } if (rec) { - rec_print(ef, rec); + rec_print(ef, rec, foreign->foreign_index); } putc('\n', ef); @@ -711,7 +739,6 @@ row_ins_foreign_check_on_constraint( dict_index_t* index; dict_index_t* clust_index; dtuple_t* ref; - mem_heap_t* tmp_heap; mem_heap_t* upd_vec_heap = NULL; rec_t* rec; rec_t* clust_rec; @@ -720,14 +747,17 @@ row_ins_foreign_check_on_constraint( ulint err; ulint i; trx_t* trx; + mem_heap_t* tmp_heap = NULL; - ut_a(thr && foreign && pcur && mtr); trx = thr_get_trx(thr); /* Since we are going to delete or update a row, we have to invalidate - the MySQL query cache for table */ + the MySQL query cache for table. A deadlock of threads is not possible + here because the caller of this function does not hold any latches with + the sync0sync.h rank above the kernel mutex. The query cache mutex has + a rank just above the kernel mutex. 
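A condensed sketch of the space padding above (the helper name is illustrative): the child column value is padded to min_size bytes with the charset's space, a single 0x20 byte for single-byte charsets and the 0x0020 pair for UCS-2.

static void
row_pad_with_spaces_sketch(
	byte*	data,		/* in/out: column value, in a buffer of
				at least min_size bytes */
	ulint	len,		/* in: current value length */
	ulint	min_size,	/* in: minimum (padded) length */
	ulint	mbminlen)	/* in: minimum character size, 1 or 2 */
{
	byte*	pad_start	= data + len;
	byte*	pad_end		= data + min_size;

	if (mbminlen == 1) {
		/* space = 0x20 */
		memset(pad_start, 0x20, pad_end - pad_start);
	} else {
		ut_a(mbminlen == 2);
		ut_a(!(len % 2) && !(min_size % 2));

		while (pad_start < pad_end) {
			/* space = 0x0020 */
			*pad_start++ = 0x00;
			*pad_start++ = 0x20;
		}
	}
}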
*/ row_ins_invalidate_query_cache(thr, table->name); @@ -821,7 +851,7 @@ row_ins_foreign_check_on_constraint( err = DB_ROW_IS_REFERENCED; row_ins_foreign_report_err( -(char*)"Trying a too deep cascaded delete or update\n", +"Trying a too deep cascaded delete or update\n", thr, foreign, btr_pcur_get_rec(pcur), entry); goto nonstandard_exit_func; @@ -853,8 +883,6 @@ row_ins_foreign_check_on_constraint( PAGE_CUR_LE, BTR_SEARCH_LEAF, cascade->pcur, 0, mtr); - mem_heap_free(tmp_heap); - clust_rec = btr_pcur_get_rec(cascade->pcur); if (!page_rec_is_user_rec(clust_rec) @@ -868,10 +896,10 @@ row_ins_foreign_check_on_constraint( fputs("\n" "InnoDB: record ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, index); fputs("\n" "InnoDB: clustered record ", stderr); - rec_print(stderr, clust_rec); + rec_print(stderr, clust_rec, clust_index); fputs("\n" "InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr); @@ -889,9 +917,9 @@ row_ins_foreign_check_on_constraint( /* Here it suffices to use a LOCK_REC_NOT_GAP type lock; we already have a normal shared lock on the appropriate gap if the search criterion was not unique */ - - err = lock_clust_rec_read_check_and_lock(0, clust_rec, - clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr); + + err = lock_clust_rec_read_check_and_lock_alt(0, clust_rec, + clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr); } if (err != DB_SUCCESS) { @@ -899,7 +927,7 @@ row_ins_foreign_check_on_constraint( goto nonstandard_exit_func; } - if (rec_get_deleted_flag(clust_rec)) { + if (rec_get_deleted_flag(clust_rec, table->comp)) { /* This can happen if there is a circular reference of rows such that cascading delete comes to delete a row already in the process of being delete marked */ @@ -1008,6 +1036,10 @@ row_ins_foreign_check_on_constraint( btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + if (upd_vec_heap) { mem_heap_free(upd_vec_heap); } @@ -1015,6 +1047,9 @@ row_ins_foreign_check_on_constraint( return(err); nonstandard_exit_func: + if (tmp_heap) { + mem_heap_free(tmp_heap); + } if (upd_vec_heap) { mem_heap_free(upd_vec_heap); @@ -1042,16 +1077,19 @@ row_ins_set_shared_rec_lock( LOCK_REC_NOT_GAP type lock */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr) /* in: query thread */ { ulint err; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (index->type & DICT_CLUSTERED) { - err = lock_clust_rec_read_check_and_lock(0, rec, index, LOCK_S, - type, thr); + err = lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_S, type, thr); } else { - err = lock_sec_rec_read_check_and_lock(0, rec, index, LOCK_S, - type, thr); + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_S, type, thr); } return(err); @@ -1069,16 +1107,19 @@ row_ins_set_exclusive_rec_lock( LOCK_REC_NOT_GAP type lock */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr) /* in: query thread */ { ulint err; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (index->type & DICT_CLUSTERED) { - err = lock_clust_rec_read_check_and_lock(0, rec, index, LOCK_X, - type, thr); + err = lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_X, type, thr); } else { - err = lock_sec_rec_read_check_and_lock(0, rec, index, LOCK_X, - type, thr); + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_X, type, thr); } 
return(err); @@ -1117,7 +1158,11 @@ row_ins_check_foreign_constraint( ulint err; ulint i; mtr_t mtr; - trx_t* trx = thr_get_trx(thr); + trx_t* trx = thr_get_trx(thr); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; run_again: #ifdef UNIV_SYNC_DEBUG @@ -1129,8 +1174,7 @@ run_again: if (trx->check_foreigns == FALSE) { /* The user has suppressed foreign key checks currently for this session */ - - return(DB_SUCCESS); + goto exit_func; } /* If any of the foreign key fields in entry is SQL NULL, we @@ -1141,7 +1185,7 @@ run_again: if (UNIV_SQL_NULL == dfield_get_len( dtuple_get_nth_field(entry, i))) { - return(DB_SUCCESS); + goto exit_func; } } @@ -1164,8 +1208,8 @@ run_again: with each foreign key constraint, one after another, and the user has problems predicting in which order they are performed. */ - - return(DB_SUCCESS); + + goto exit_func; } } @@ -1199,10 +1243,10 @@ run_again: fputs("\nor its .ibd file does not currently exist!\n", ef); mutex_exit(&dict_foreign_err_mutex); - return(DB_NO_REFERENCED_ROW); + err = DB_NO_REFERENCED_ROW; } - return(DB_SUCCESS); + goto exit_func; } ut_a(check_table && check_index); @@ -1233,17 +1277,22 @@ run_again: /* Scan index records and check if there is a matching record */ for (;;) { + page_t* page; rec = btr_pcur_get_rec(&pcur); + page = buf_frame_align(rec); - if (rec == page_get_infimum_rec(buf_frame_align(rec))) { + if (rec == page_get_infimum_rec(page)) { goto next_rec; } - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { - + offsets = rec_get_offsets(rec, check_index, + offsets, ULINT_UNDEFINED, &heap); + + if (rec == page_get_supremum_rec(page)) { + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec, - check_index, thr); + check_index, offsets, thr); if (err != DB_SUCCESS) { break; @@ -1252,13 +1301,14 @@ run_again: goto next_rec; } - cmp = cmp_dtuple_rec(entry, rec); + cmp = cmp_dtuple_rec(entry, rec, offsets); if (cmp == 0) { - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, + rec_offs_comp(offsets))) { err = row_ins_set_shared_rec_lock( - LOCK_ORDINARY, - rec, check_index, thr); + LOCK_ORDINARY, rec, + check_index, offsets, thr); if (err != DB_SUCCESS) { break; @@ -1269,9 +1319,9 @@ run_again: into gaps */ err = row_ins_set_shared_rec_lock( - LOCK_REC_NOT_GAP, - rec, check_index, thr); - + LOCK_REC_NOT_GAP, rec, + check_index, offsets, thr); + if (err != DB_SUCCESS) { break; @@ -1307,7 +1357,7 @@ run_again: if (cmp < 0) { err = row_ins_set_shared_rec_lock(LOCK_GAP, - rec, check_index, thr); + rec, check_index, offsets, thr); if (err != DB_SUCCESS) { break; @@ -1365,6 +1415,10 @@ do_possible_lock_wait: err = trx->error_state; } +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(err); } @@ -1462,19 +1516,23 @@ row_ins_dupl_error_with_rec( that the caller already has a record lock on the record! 
*/ dtuple_t* entry, /* in: entry to insert */ - dict_index_t* index) /* in: index */ + dict_index_t* index, /* in: index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { ulint matched_fields; ulint matched_bytes; ulint n_unique; ulint i; - + + ut_ad(rec_offs_validate(rec, index, offsets)); + n_unique = dict_index_get_n_unique(index); matched_fields = 0; matched_bytes = 0; - cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes); + cmp_dtuple_rec_with_match(entry, rec, offsets, + &matched_fields, &matched_bytes); if (matched_fields < n_unique) { @@ -1495,12 +1553,7 @@ row_ins_dupl_error_with_rec( } } - if (!rec_get_deleted_flag(rec)) { - - return(TRUE); - } - - return(FALSE); + return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); } /******************************************************************* @@ -1517,6 +1570,7 @@ row_ins_scan_sec_index_for_duplicate( dtuple_t* entry, /* in: index entry */ que_thr_t* thr) /* in: query thread */ { +#ifndef UNIV_HOTBACKUP ulint n_unique; ulint i; int cmp; @@ -1526,8 +1580,11 @@ row_ins_scan_sec_index_for_duplicate( ulint err = DB_SUCCESS; ibool moved; mtr_t mtr; - trx_t* trx; - + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + n_unique = dict_index_get_n_unique(index); /* If the secondary index is unique, but one of the fields in the @@ -1562,10 +1619,8 @@ row_ins_scan_sec_index_for_duplicate( goto next_rec; } - /* Try to place a lock on the index record */ - - trx = thr_get_trx(thr); - ut_ad(trx); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); if (innobase_query_is_update()) { @@ -1574,12 +1629,12 @@ row_ins_scan_sec_index_for_duplicate( duplicates ( REPLACE, LOAD DATAFILE REPLACE, INSERT ON DUPLICATE KEY UPDATE). */ - err = row_ins_set_exclusive_rec_lock( - LOCK_ORDINARY,rec,index,thr); + err = row_ins_set_exclusive_rec_lock(LOCK_ORDINARY, + rec, index, offsets, thr); } else { - err = row_ins_set_shared_rec_lock( - LOCK_ORDINARY, rec, index,thr); + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, + rec, index, offsets, thr); } if (err != DB_SUCCESS) { @@ -1587,15 +1642,16 @@ row_ins_scan_sec_index_for_duplicate( break; } - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + if (page_rec_is_supremum(rec)) { goto next_rec; } - cmp = cmp_dtuple_rec(entry, rec); + cmp = cmp_dtuple_rec(entry, rec, offsets); if (cmp == 0) { - if (row_ins_dupl_error_with_rec(rec, entry, index)) { + if (row_ins_dupl_error_with_rec(rec, entry, + index, offsets)) { err = DB_DUPLICATE_KEY; thr_get_trx(thr)->error_info = index; @@ -1617,12 +1673,21 @@ next_rec: } } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } mtr_commit(&mtr); /* Restore old value */ dtuple_set_n_fields_cmp(entry, n_fields_cmp); return(err); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. 
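A condensed sketch of the duplicate test used above (names are illustrative; the special case that a unique secondary index never reports a duplicate when one of its first n_unique fields is SQL NULL is omitted here): the entry collides with rec only if all n_unique fields match and rec is not delete-marked.

static ibool
row_ins_is_duplicate_sketch(
	rec_t*		rec,	/* in: user record the cursor is on */
	dtuple_t*	entry,	/* in: entry being inserted */
	dict_index_t*	index,	/* in: index */
	const ulint*	offsets)/* in: rec_get_offsets(rec, index) */
{
	ulint	matched_fields	= 0;
	ulint	matched_bytes	= 0;

	cmp_dtuple_rec_with_match(entry, rec, offsets,
				&matched_fields, &matched_bytes);

	if (matched_fields < dict_index_get_n_unique(index)) {

		return(FALSE);	/* some unique field differs */
	}

	/* a delete-marked record does not count as a duplicate */
	return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
}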
*/ + ut_error; +#endif /* UNIV_HOTBACKUP */ } /******************************************************************* @@ -1642,11 +1707,15 @@ row_ins_duplicate_error_in_clust( que_thr_t* thr, /* in: query thread */ mtr_t* mtr) /* in: mtr */ { +#ifndef UNIV_HOTBACKUP ulint err; rec_t* rec; - page_t* page; ulint n_unique; - trx_t* trx = thr_get_trx(thr); + trx_t* trx = thr_get_trx(thr); + mem_heap_t*heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; UT_NOT_USED(mtr); @@ -1671,9 +1740,10 @@ row_ins_duplicate_error_in_clust( if (cursor->low_match >= n_unique) { rec = btr_cur_get_rec(cursor); - page = buf_frame_align(rec); - if (rec != page_get_infimum_rec(page)) { + if (!page_rec_is_infimum(rec)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); /* We set a lock on the possible duplicate: this is needed in logical logging of MySQL to make @@ -1689,23 +1759,23 @@ row_ins_duplicate_error_in_clust( err = row_ins_set_exclusive_rec_lock( LOCK_REC_NOT_GAP,rec,cursor->index, - thr); + offsets, thr); } else { err = row_ins_set_shared_rec_lock( LOCK_REC_NOT_GAP,rec, cursor->index, - thr); + offsets, thr); } if (err != DB_SUCCESS) { - - return(err); + goto func_exit; } if (row_ins_dupl_error_with_rec(rec, entry, - cursor->index)) { + cursor->index, offsets)) { trx->error_info = cursor->index; - return(DB_DUPLICATE_KEY); + err = DB_DUPLICATE_KEY; + goto func_exit; } } } @@ -1713,10 +1783,10 @@ row_ins_duplicate_error_in_clust( if (cursor->up_match >= n_unique) { rec = page_rec_get_next(btr_cur_get_rec(cursor)); - page = buf_frame_align(rec); - - if (rec != page_get_supremum_rec(page)) { + if (!page_rec_is_supremum(rec)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); if (innobase_query_is_update()) { @@ -1726,32 +1796,41 @@ row_ins_duplicate_error_in_clust( INSERT ON DUPLICATE KEY UPDATE). */ err = row_ins_set_exclusive_rec_lock( - LOCK_REC_NOT_GAP, - rec,cursor->index,thr); + LOCK_REC_NOT_GAP, rec, + cursor->index, offsets, thr); } else { err = row_ins_set_shared_rec_lock( - LOCK_REC_NOT_GAP,rec, - cursor->index, thr); + LOCK_REC_NOT_GAP, rec, + cursor->index, offsets, thr); } if (err != DB_SUCCESS) { - - return(err); + goto func_exit; } if (row_ins_dupl_error_with_rec(rec, entry, - cursor->index)) { + cursor->index, offsets)) { trx->error_info = cursor->index; - return(DB_DUPLICATE_KEY); + err = DB_DUPLICATE_KEY; + goto func_exit; } + mem_heap_free(heap); } ut_a(!(cursor->index->type & DICT_CLUSTERED)); /* This should never happen */ } - return(DB_SUCCESS); + err = DB_SUCCESS; +func_exit: + return(err); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. 
*/ + ut_error; +#endif /* UNIV_HOTBACKUP */ } /******************************************************************* @@ -1773,7 +1852,6 @@ row_ins_must_modify( { ulint enough_match; rec_t* rec; - page_t* page; /* NOTE: (compare to the note in row_ins_duplicate_error) Because node pointers on upper levels of the B-tree may match more to entry than @@ -1787,9 +1865,8 @@ row_ins_must_modify( if (cursor->low_match >= enough_match) { rec = btr_cur_get_rec(cursor); - page = buf_frame_align(rec); - if (rec != page_get_infimum_rec(page)) { + if (!page_rec_is_infimum(rec)) { return(ROW_INS_PREV); } @@ -1828,12 +1905,15 @@ row_ins_index_entry_low( ulint modify = 0; /* remove warning */ rec_t* insert_rec; rec_t* rec; - rec_t* first_rec; ulint err; ulint n_unique; big_rec_t* big_rec = NULL; mtr_t mtr; - + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + log_free_check(); mtr_start(&mtr); @@ -1859,15 +1939,20 @@ row_ins_index_entry_low( err = DB_SUCCESS; goto function_exit; - } - - first_rec = page_rec_get_next(page_get_infimum_rec( - buf_frame_align(btr_cur_get_rec(&cursor)))); + } + +#ifdef UNIV_DEBUG + { + page_t* page = btr_cur_get_page(&cursor); + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); - if (!page_rec_is_supremum(first_rec)) { - ut_a((rec_get_n_fields(first_rec)) - == dtuple_get_n_fields(entry)); + if (UNIV_LIKELY(first_rec != page_get_supremum_rec(page))) { + ut_a(rec_get_n_fields(first_rec, index) + == dtuple_get_n_fields(entry)); + } } +#endif n_unique = dict_index_get_n_unique(index); @@ -1944,7 +2029,7 @@ row_ins_index_entry_low( if (err == DB_SUCCESS) { if (ext_vec) { - rec_set_field_extern_bits(insert_rec, + rec_set_field_extern_bits(insert_rec, index, ext_vec, n_ext_vec, &mtr); } } @@ -1954,14 +2039,18 @@ function_exit: mtr_commit(&mtr); if (big_rec) { + rec_t* rec; mtr_start(&mtr); btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, BTR_MODIFY_TREE, &cursor, 0, &mtr); + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + err = btr_store_big_rec_extern_fields(index, rec, + offsets, big_rec, &mtr); - err = btr_store_big_rec_extern_fields(index, - btr_cur_get_rec(&cursor), - big_rec, &mtr); if (modify) { dtuple_big_rec_free(big_rec); } else { @@ -1971,6 +2060,9 @@ function_exit: mtr_commit(&mtr); } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(err); } diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index 7ce5c766e06..2ac0824b331 100644 --- a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -106,20 +106,6 @@ row_mysql_delay_if_needed(void) } /*********************************************************************** -Reads a MySQL format variable-length field (like VARCHAR) length and -returns pointer to the field data. */ - -byte* -row_mysql_read_var_ref_noninline( -/*=============================*/ - /* out: field + 2 */ - ulint* len, /* out: variable-length field length */ - byte* field) /* in: field */ -{ - return(row_mysql_read_var_ref(len, field)); -} - -/*********************************************************************** Frees the blob heap in prebuilt when no longer needed. */ void @@ -133,6 +119,61 @@ row_mysql_prebuilt_free_blob_heap( } /*********************************************************************** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. 
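A round-trip sketch (illustrative only) of the two helpers defined just below: the length prefix is written in the MySQL little-endian row format as 1 or 2 bytes, and reading it back yields the same length and the start of the payload.

static void
row_true_varchar_roundtrip_sketch(
	byte*		dest,	/* in/out: buffer of at least
				lenlen + len bytes */
	const byte*	str,	/* in: payload to store */
	ulint		len,	/* in: payload length */
	ulint		lenlen)	/* in: length bytes, 1 or 2 */
{
	byte*	data;
	ulint	read_len;

	data = row_mysql_store_true_var_len(dest, len, lenlen);
	memcpy(data, str, len);

	data = row_mysql_read_true_varchar(&read_len, dest, lenlen);
	ut_a(read_len == len);
	ut_a(data == dest + lenlen);
}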
*/ + +byte* +row_mysql_store_true_var_len( +/*=========================*/ + /* out: pointer to the data, we skip the 1 or 2 bytes + at the start that are used to store the len */ + byte* dest, /* in: where to store */ + ulint len, /* in: length, must fit in two bytes */ + ulint lenlen) /* in: storage length of len: either 1 or 2 bytes */ +{ + if (lenlen == 2) { + ut_a(len < 256 * 256); + + mach_write_to_2_little_endian(dest, len); + + return(dest + 2); + } + + ut_a(lenlen == 1); + ut_a(len < 256); + + mach_write_to_1(dest, len); + + return(dest + 1); +} + +/*********************************************************************** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. */ + +byte* +row_mysql_read_true_varchar( +/*========================*/ + /* out: pointer to the data, we skip the 1 or 2 bytes + at the start that are used to store the len */ + ulint* len, /* out: variable-length field length */ + byte* field, /* in: field in the MySQL format */ + ulint lenlen) /* in: storage length of len: either 1 or 2 bytes */ +{ + if (lenlen == 2) { + *len = mach_read_from_2_little_endian(field); + + return(field + 2); + } + + ut_a(lenlen == 1); + + *len = mach_read_from_1(field); + + return(field + 1); +} + +/*********************************************************************** Stores a reference to a BLOB in the MySQL format. */ void @@ -191,15 +232,177 @@ row_mysql_read_blob_ref( } /****************************************************************** -Convert a row in the MySQL format to a row in the Innobase format. */ +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.c. */ + +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + /* out: up to which byte we used + buf in the conversion */ + dfield_t* dfield, /* in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /* in/out: buffer for a converted + integer value; this must be at least + col_len long then! */ + ibool row_format_col, /* TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + byte* mysql_data, /* in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /* in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp) /* in: nonzero=compact format */ +{ + byte* ptr = mysql_data; + dtype_t* dtype; + ulint type; + ulint lenlen; + + dtype = dfield_get_type(dfield); + + type = dtype->mtype; + + if (type == DATA_INT) { + /* Store integer data in Innobase in a big-endian format, + sign bit negated if the data is a signed integer. In MySQL, + integers are stored in a little-endian format. 
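A condensed sketch of the integer conversion described above (helper name illustrative): the little-endian MySQL bytes are reversed into big-endian order and, for signed columns, the sign bit of the most significant byte is flipped so that values order correctly as unsigned byte strings.

static void
row_int_to_innobase_sketch(
	byte*		buf,		/* out: col_len bytes in the
					InnoDB storage format */
	const byte*	mysql_data,	/* in: little-endian MySQL value */
	ulint		col_len,	/* in: integer storage length */
	ibool		is_unsigned)	/* in: TRUE for unsigned types */
{
	ulint	i;

	for (i = 0; i < col_len; i++) {
		buf[i] = mysql_data[col_len - 1 - i];
	}

	if (!is_unsigned) {
		buf[0] ^= 128;	/* flip the sign bit */
	}
}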
*/ + + ptr = buf + col_len; + + for (;;) { + ptr--; + *ptr = *mysql_data; + if (ptr == buf) { + break; + } + mysql_data++; + } + + if (!(dtype->prtype & DATA_UNSIGNED)) { + + *ptr = (byte) (*ptr ^ 128); + } + + buf += col_len; + } else if ((type == DATA_VARCHAR + || type == DATA_VARMYSQL + || type == DATA_BINARY)) { + + if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) { + /* The length of the actual data is stored to 1 or 2 + bytes at the start of the field */ + + if (row_format_col) { + if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) { + lenlen = 2; + } else { + lenlen = 1; + } + } else { + /* In a MySQL key value, lenlen is always 2 */ + lenlen = 2; + } + + ptr = row_mysql_read_true_varchar(&col_len, mysql_data, + lenlen); + } else { + /* Remove trailing spaces from old style VARCHAR + columns. */ + + /* Handle UCS2 strings differently. */ + ulint mbminlen = dtype_get_mbminlen(dtype); + + ptr = mysql_data; + + if (mbminlen == 2) { + /* space=0x0020 */ + /* Trim "half-chars", just in case. */ + col_len &= ~1; + + while (col_len >= 2 && ptr[col_len - 2] == 0x00 + && ptr[col_len - 1] == 0x20) { + col_len -= 2; + } + } else { + ut_a(mbminlen == 1); + /* space=0x20 */ + while (col_len > 0 + && ptr[col_len - 1] == 0x20) { + col_len--; + } + } + } + } else if (comp && type == DATA_MYSQL + && dtype_get_mbminlen(dtype) == 1 + && dtype_get_mbmaxlen(dtype) > 1) { + /* In some cases we strip trailing spaces from UTF-8 and other + multibyte charsets, from FIXED-length CHAR columns, to save + space. UTF-8 would otherwise normally use 3 * the string length + bytes to store a latin1 string! */ + + /* We assume that this CHAR field is encoded in a + variable-length character set where spaces have + 1:1 correspondence to 0x20 bytes, such as UTF-8. + + Consider a CHAR(n) field, a field of n characters. + It will contain between n * mbminlen and n * mbmaxlen bytes. + We will try to truncate it to n bytes by stripping + space padding. If the field contains single-byte + characters only, it will be truncated to n characters. + Consider a CHAR(5) field containing the string ".a " + where "." denotes a 3-byte character represented by + the bytes "$%&". After our stripping, the string will + be stored as "$%&a " (5 bytes). The string ".abc " + will be stored as "$%&abc" (6 bytes). + + The space padding will be restored in row0sel.c, function + row_sel_field_store_in_mysql_format(). */ + + ulint n_chars; + + ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype))); + + n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype); + + /* Strip space padding. */ + while (col_len > n_chars && ptr[col_len - 1] == 0x20) { + col_len--; + } + } else if (type == DATA_BLOB && row_format_col) { + + ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len); + } + + dfield_set_data(dfield, ptr, col_len); + + return(buf); +} + +/****************************************************************** +Convert a row in the MySQL format to a row in the Innobase format. Note that +the function to convert a MySQL format key value to an InnoDB dtuple is +row_sel_convert_mysql_key_to_innobase() in row0sel.c. */ static void row_mysql_convert_row_to_innobase( /*==============================*/ dtuple_t* row, /* in/out: Innobase row where the field type information is already - copied there, or will be copied - later */ + copied there! 
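The trailing-space trimming rules spelled out a few lines above can be exercised on their own: a UCS2 space is the byte pair 0x00 0x20, while single-byte and UTF-8 data use the single byte 0x20. The sketch below is plain C with illustrative names; it is not InnoDB code.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Strip trailing space padding the way the conversion above does:
for a UCS2 column a space is the two-byte pair 0x00 0x20, otherwise
it is the single byte 0x20. Returns the remaining length. */
static size_t
strip_trailing_spaces(const unsigned char* ptr, size_t col_len,
		      size_t mbminlen)
{
	if (mbminlen == 2) {
		col_len &= ~(size_t) 1;	/* drop a trailing half-char */

		while (col_len >= 2 && ptr[col_len - 2] == 0x00
		       && ptr[col_len - 1] == 0x20) {
			col_len -= 2;
		}
	} else {
		assert(mbminlen == 1);

		while (col_len > 0 && ptr[col_len - 1] == 0x20) {
			col_len--;
		}
	}

	return(col_len);
}

int main(void)
{
	/* "ab" padded with spaces to 8 bytes, single-byte charset */
	const unsigned char	latin1[8] = {'a', 'b', ' ', ' ', ' ', ' ', ' ', ' '};
	/* "ab" in UCS2, padded with 0x0020 to 8 bytes */
	const unsigned char	ucs2[8] = {0x00, 'a', 0x00, 'b', 0x00, 0x20, 0x00, 0x20};

	printf("%lu %lu\n",
	       (unsigned long) strip_trailing_spaces(latin1, sizeof latin1, 1),
	       (unsigned long) strip_trailing_spaces(ucs2, sizeof ucs2, 2));
	/* prints "2 4" */

	return(0);
}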
*/ row_prebuilt_t* prebuilt, /* in: prebuilt struct where template must be of type ROW_MYSQL_WHOLE_ROW */ byte* mysql_rec) /* in: row in the MySQL format; @@ -236,9 +439,10 @@ row_mysql_convert_row_to_innobase( row_mysql_store_col_in_innobase_format(dfield, prebuilt->ins_upd_rec_buff + templ->mysql_col_offset, + TRUE, /* MySQL row format data */ mysql_rec + templ->mysql_col_offset, templ->mysql_col_len, - templ->type, templ->is_unsigned); + prebuilt->table->comp); next_column: ; } @@ -260,6 +464,7 @@ row_mysql_handle_errors( que_thr_t* thr, /* in: query thread */ trx_savept_t* savept) /* in: savepoint or NULL */ { +#ifndef UNIV_HOTBACKUP ulint err; handle_new_error: @@ -359,6 +564,12 @@ handle_new_error: trx->error_state = DB_SUCCESS; return(FALSE); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ } /************************************************************************ @@ -421,6 +632,9 @@ row_create_prebuilt( clust_index = dict_table_get_first_index(table); + /* Make sure that search_tuple is long enough for clustered index */ + ut_a(2 * dict_table_get_n_cols(table) >= clust_index->n_fields); + ref_len = dict_index_get_n_unique(clust_index); ref = dtuple_create(heap, ref_len); @@ -583,7 +797,8 @@ static dtuple_t* row_get_prebuilt_insert_row( /*========================*/ - /* out: prebuilt dtuple */ + /* out: prebuilt dtuple; the column + type information is also set in it */ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL handle */ { @@ -756,24 +971,6 @@ run_again: } /************************************************************************* -Unlocks all table locks explicitly requested by trx (with LOCK TABLES, -lock type LOCK_TABLE_EXP). */ - -void -row_unlock_tables_for_mysql( -/*========================*/ - trx_t* trx) /* in: transaction */ -{ - if (!trx->n_lock_table_exp) { - - return; - } - - mutex_enter(&kernel_mutex); - lock_release_tables_off_kernel(trx); - mutex_exit(&kernel_mutex); -} -/************************************************************************* Sets a table lock on the table mentioned in prebuilt. */ int @@ -784,9 +981,10 @@ row_lock_table_for_mysql( table handle */ dict_table_t* table, /* in: table to lock, or NULL if prebuilt->table should be - locked as LOCK_TABLE_EXP | + locked as prebuilt->select_lock_type */ - ulint mode) /* in: lock mode of table */ + ulint mode) /* in: lock mode of table + (ignored if table==NULL) */ { trx_t* trx = prebuilt->trx; que_thr_t* thr; @@ -822,8 +1020,8 @@ run_again: if (table) { err = lock_table(0, table, mode, thr); } else { - err = lock_table(LOCK_TABLE_EXP, prebuilt->table, - prebuilt->select_lock_type, thr); + err = lock_table(0, prebuilt->table, + prebuilt->select_lock_type, thr); } trx->error_state = err; @@ -946,8 +1144,12 @@ run_again: if (err != DB_SUCCESS) { que_thr_stop_for_mysql(thr); +/* TODO: what is this? 
*/ thr->lock_state= QUE_THR_LOCK_ROW; + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK; + if (was_lock_wait) { goto run_again; } @@ -1193,9 +1395,11 @@ run_again: return((int) err); } - + + thr->lock_state= QUE_THR_LOCK_ROW; was_lock_wait = row_mysql_handle_errors(&err, trx, thr, &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK;; if (was_lock_wait) { goto run_again; } @@ -1224,6 +1428,112 @@ run_again: return((int) err); } +/************************************************************************* +This can only be used when srv_locks_unsafe_for_binlog is TRUE. Before +calling this function we must use trx_reset_new_rec_lock_info() and +trx_register_new_rec_lock() to store the information which new record locks +really were set. This function removes a newly set lock under prebuilt->pcur, +and also under prebuilt->clust_pcur. Currently, this is only used and tested +in the case of an UPDATE or a DELETE statement, where the row lock is of the +LOCK_X type. +Thus, this implements a 'mini-rollback' that releases the latest record +locks we set. */ + +int +row_unlock_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL + handle */ + ibool has_latches_on_recs)/* TRUE if called so that we have + the latches on the records under pcur + and clust_pcur, and we do not need to + reposition the cursors. */ +{ + dict_index_t* index; + btr_pcur_t* pcur = prebuilt->pcur; + btr_pcur_t* clust_pcur = prebuilt->clust_pcur; + trx_t* trx = prebuilt->trx; + rec_t* rec; + mtr_t mtr; + + ut_ad(prebuilt && trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + if (!srv_locks_unsafe_for_binlog) { + + fprintf(stderr, +"InnoDB: Error: calling row_unlock_for_mysql though\n" +"InnoDB: srv_locks_unsafe_for_binlog is FALSE.\n"); + + return(DB_SUCCESS); + } + + trx->op_info = "unlock_row"; + + index = btr_pcur_get_btr_cur(pcur)->index; + + if (index != NULL && trx_new_rec_locks_contain(trx, index)) { + + mtr_start(&mtr); + + /* Restore the cursor position and find the record */ + + if (!has_latches_on_recs) { + btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr); + } + + rec = btr_pcur_get_rec(pcur); + + mutex_enter(&kernel_mutex); + + lock_rec_reset_and_release_wait(rec); + + mutex_exit(&kernel_mutex); + + mtr_commit(&mtr); + + /* If the search was done through the clustered index, then + we have not used clust_pcur at all, and we must NOT try to + reset locks on clust_pcur. The values in clust_pcur may be + garbage! */ + + if (index->type & DICT_CLUSTERED) { + + goto func_exit; + } + } + + index = btr_pcur_get_btr_cur(clust_pcur)->index; + + if (index != NULL && trx_new_rec_locks_contain(trx, index)) { + + mtr_start(&mtr); + + /* Restore the cursor position and find the record */ + + if (!has_latches_on_recs) { + btr_pcur_restore_position(BTR_SEARCH_LEAF, clust_pcur, + &mtr); + } + + rec = btr_pcur_get_rec(clust_pcur); + + mutex_enter(&kernel_mutex); + + lock_rec_reset_and_release_wait(rec); + + mutex_exit(&kernel_mutex); + + mtr_commit(&mtr); + } + +func_exit: + trx->op_info = ""; + + return(DB_SUCCESS); +} + /************************************************************************** Does a cascaded delete or set null in a foreign key operation. 
*/ @@ -2020,6 +2330,7 @@ row_add_table_to_background_drop_list( return(TRUE); } +#ifndef UNIV_HOTBACKUP /************************************************************************* Discards the tablespace of a table which is stored in an .ibd file. Discarding means that this function deletes the .ibd file and assigns a new table id for @@ -2370,6 +2681,302 @@ funct_exit: } /************************************************************************* +Truncates a table for MySQL. */ + +int +row_truncate_table_for_mysql( +/*=========================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in: table handle */ + trx_t* trx) /* in: transaction handle */ +{ + dict_foreign_t* foreign; + ulint err; + mem_heap_t* heap; + byte* buf; + dtuple_t* tuple; + dfield_t* dfield; + dict_index_t* sys_index; + btr_pcur_t pcur; + mtr_t mtr; + dulint new_id; + char* sql; + que_thr_t* thr; + que_t* graph = NULL; + +/* How do we prevent crashes caused by ongoing operations on the table? Old +operations could try to access non-existent pages. + +1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock +on the table before we can do TRUNCATE TABLE. Then there are no running +queries on the table. This is guaranteed, because in +ha_innobase::store_lock(), we do not weaken the TL_WRITE lock requested +by MySQL when executing SQLCOM_TRUNCATE. +2) Purge and rollback: we assign a new table id for the table. Since purge and +rollback look for the table based on the table id, they see the table as +'dropped' and discard their operations. +3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE, so we do not +have to remove insert buffer records, as the insert buffer works at a low +level. If a freed page is later reallocated, the allocator will remove +the ibuf entries for it. + +TODO: when we truncate *.ibd files (analogous to DISCARD TABLESPACE), we +will have to remove all entries for the table in the insert +buffer tree! + +4) Linear readahead and random readahead: we use the same method as in 3) to +discard ongoing operations. (This will only be relevant for TRUNCATE TABLE +by DISCARD TABLESPACE.) +5) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0, we +do not allow the TRUNCATE. We also reserve the data dictionary latch. */ + static const char renumber_tablespace_proc[] = + "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n" + "old_id CHAR;\n" + "new_id CHAR;\n" + "old_id_low INT;\n" + "old_id_high INT;\n" + "new_id_low INT;\n" + "new_id_high INT;\n" + "BEGIN\n" + "old_id_high := %lu;\n" + "old_id_low := %lu;\n" + "new_id_high := %lu;\n" + "new_id_low := %lu;\n" + "old_id := CONCAT(TO_BINARY(old_id_high, 4), TO_BINARY(old_id_low, 4));\n" + "new_id := CONCAT(TO_BINARY(new_id_high, 4), TO_BINARY(new_id_low, 4));\n" + "UPDATE SYS_TABLES SET ID = new_id\n" + "WHERE ID = old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "COMMIT WORK;\n" + "END;\n"; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_ad(table); + + if (srv_created_new_raw) { + fputs( + "InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... 
is removed.\n", + stderr); + + return(DB_ERROR); + } + + trx->op_info = "truncating table"; + + trx_start_if_not_started(trx); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + ut_a(trx->dict_operation_lock_mode == 0); + /* Prevent foreign key checks etc. while we are truncating the + table */ + + row_mysql_lock_data_dictionary(trx); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign && foreign->foreign_table == table) { + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (foreign && trx->check_foreigns) { + FILE* ef = dict_foreign_err_file; + + /* We only allow truncating a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + + fputs(" Cannot truncate table ", ef); + ut_print_name(ef, trx, table->name); + fputs(" by DROP+CREATE\n" + "InnoDB: because it is referenced by ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); + + err = DB_ERROR; + goto funct_exit; + } + + /* TODO: could we replace the counter n_foreign_key_checks_running + with lock checks on the table? Acquire here an exclusive lock on the + table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that + they can cope with the table having been truncated here? Foreign key + checks take an IS or IX lock on the table. */ + + if (table->n_foreign_key_checks_running > 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Cannot truncate table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs(" by DROP+CREATE\n" +"InnoDB: because there is a foreign key check running on it.\n", + stderr); + err = DB_ERROR; + + goto funct_exit; + } + + /* Remove any locks there are on the table or its records */ + + lock_reset_all_on_table(table); + + trx->table_id = table->id; + + /* scan SYS_INDEXES for all indexes of the table */ + heap = mem_heap_create(800); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 8); + mach_write_to_8(buf, table->id); + + dfield_set_data(dfield, buf, 8); + sys_index = dict_table_get_first_index(dict_sys->sys_indexes); + dict_index_copy_types(tuple, sys_index, 1); + + mtr_start(&mtr); + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_MODIFY_LEAF, &pcur, &mtr); + for (;;) { + rec_t* rec; + const byte* field; + ulint len; + ulint root_page_no; + + if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) { + /* The end of SYS_INDEXES has been reached. */ + break; + } + + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field_old(rec, 0, &len); + ut_ad(len == 8); + + if (memcmp(buf, field, len) != 0) { + /* End of indexes for the table (TABLE_ID mismatch). */ + break; + } + + if (rec_get_deleted_flag(rec, FALSE)) { + /* The index has been dropped. */ + goto next_rec; + } + + btr_pcur_store_position(&pcur, &mtr); + + /* This call may commit and restart mtr. 
*/ + root_page_no = dict_truncate_index_tree(table, rec, &mtr); + + btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr); + rec = btr_pcur_get_rec(&pcur); + + if (root_page_no != FIL_NULL) { + page_rec_write_index_page_no(rec, + DICT_SYS_INDEXES_PAGE_NO_FIELD, + root_page_no, &mtr); + /* We will need to commit and restart the + mini-transaction in order to avoid deadlocks. + The dict_truncate_index_tree() call has allocated + a page in this mini-transaction, and the rest of + this loop could latch another index page. */ + mtr_commit(&mtr); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_MODIFY_LEAF, + &pcur, &mtr); + } + + next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID); + + mem_heap_empty(heap); + sql = mem_heap_alloc(heap, (sizeof renumber_tablespace_proc) + 40); + sprintf(sql, renumber_tablespace_proc, + (ulong) ut_dulint_get_high(table->id), + (ulong) ut_dulint_get_low(table->id), + (ulong) ut_dulint_get_high(new_id), + (ulong) ut_dulint_get_low(new_id)); + + graph = pars_sql(sql); + + ut_a(graph); + + mem_heap_free(heap); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + thr = que_fork_start_command(graph); + ut_a(thr); + + que_run_threads(thr); + + que_graph_free(graph); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + ut_print_timestamp(stderr); +fputs(" InnoDB: Unable to assign a new identifier to table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs("\n" +"InnoDB: after truncating it. Background processes may corrupt the table!\n", + stderr); + err = DB_ERROR; + } else { + dict_table_change_id_in_cache(table, new_id); + } + + dict_table_autoinc_initialize(table, 0); + dict_update_statistics(table); + + trx_commit_for_mysql(trx); + +funct_exit: + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + srv_wake_master_thread(); + + return((int) err); +} +#endif /* !UNIV_HOTBACKUP */ + +/************************************************************************* Drops a table for MySQL. If the name of the table to be dropped is equal with one of the predefined magic table names, then this also stops printing the corresponding monitor output by the master thread. 
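A detail of the renumbering procedure above that is easy to miss is how the 8-byte table id is passed in: the caller splits the dulint into 32-bit high and low words and the procedure reassembles them with CONCAT(TO_BINARY(high, 4), TO_BINARY(low, 4)). The standalone sketch below only illustrates that split and the resulting byte order; the assumption that the ID column holds the high word first, each word big-endian, is based on the mach_write_to_8() call used for the SYS_INDEXES search tuple, not on anything stated in this patch.

#include <stdio.h>

int main(void)
{
	/* an example 64-bit table id */
	unsigned long long	id = 0x0000000100000007ULL;
	unsigned long		id_high = (unsigned long) (id >> 32);
	unsigned long		id_low = (unsigned long) (id & 0xFFFFFFFFUL);
	unsigned char		key[8];
	int			i;

	/* CONCAT(TO_BINARY(high, 4), TO_BINARY(low, 4)): high word
	first, then low word, each assumed big-endian */
	for (i = 0; i < 4; i++) {
		key[i] = (unsigned char) (id_high >> (8 * (3 - i)));
		key[4 + i] = (unsigned char) (id_low >> (8 * (3 - i)));
	}

	printf("high=%lu low=%lu id bytes=", id_high, id_low);
	for (i = 0; i < 8; i++) {
		printf("%02X", key[i]);
	}
	putchar('\n');	/* high=1 low=7 id bytes=0000000100000007 */

	return(0);
}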
*/ @@ -2769,7 +3376,9 @@ funct_exit: trx->op_info = ""; +#ifndef UNIV_HOTBACKUP srv_wake_master_thread(); +#endif /* !UNIV_HOTBACKUP */ return((int) err); } @@ -3025,8 +3634,9 @@ row_rename_table_for_mysql( "InnoDB: data dictionary though MySQL is trying to rename the table.\n" "InnoDB: Have you copied the .frm file of the table to the\n" "InnoDB: MySQL database directory from another database?\n" - "InnoDB: You can look for further help from section 15.1 of\n" - "InnoDB: http://www.innodb.com/ibman.php\n", stderr); + "InnoDB: You can look for further help from\n" + "InnoDB: http://dev.mysql.com/doc/mysql/en/" + "InnoDB_troubleshooting_datadict.html\n", stderr); goto funct_exit; } @@ -3038,8 +3648,9 @@ row_rename_table_for_mysql( ut_print_name(stderr, trx, old_name); fputs( " does not have an .ibd file in the database directory.\n" - "InnoDB: You can look for further help from section 15.1 of\n" - "InnoDB: http://www.innodb.com/ibman.php\n", stderr); + "InnoDB: You can look for further help from\n" + "InnoDB: http://dev.mysql.com/doc/mysql/en/" + "InnoDB_troubleshooting_datadict.html\n", stderr); goto funct_exit; } @@ -3278,7 +3889,7 @@ funct_exit: que_graph_free(graph); } - if (heap) { + if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -3303,18 +3914,22 @@ row_scan_and_check_index( ulint* n_rows) /* out: number of entries seen in the current consistent read */ { - mem_heap_t* heap; - dtuple_t* prev_entry = NULL; + dtuple_t* prev_entry = NULL; ulint matched_fields; ulint matched_bytes; byte* buf; ulint ret; rec_t* rec; - ibool is_ok = TRUE; + ibool is_ok = TRUE; int cmp; ibool contains_null; ulint i; - + ulint cnt; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + *n_rows = 0; buf = mem_alloc(UNIV_PAGE_SIZE); @@ -3332,11 +3947,19 @@ row_scan_and_check_index( dtuple_set_n_fields(prebuilt->search_tuple, 0); prebuilt->select_lock_type = LOCK_NONE; + cnt = 1000; ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0); loop: + /* Check thd->killed every 1,000 scanned rows */ + if (--cnt == 0) { + if (trx_is_interrupted(prebuilt->trx)) { + goto func_exit; + } + cnt = 1000; + } if (ret != DB_SUCCESS) { - + func_exit: mem_free(buf); mem_heap_free(heap); @@ -3354,8 +3977,10 @@ loop: if (prev_entry != NULL) { matched_fields = 0; matched_bytes = 0; - - cmp = cmp_dtuple_rec_with_match(prev_entry, rec, + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets, &matched_fields, &matched_bytes); contains_null = FALSE; @@ -3384,7 +4009,7 @@ loop: dtuple_print(stderr, prev_entry); fputs("\n" "InnoDB: record ", stderr); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); putc('\n', stderr); is_ok = FALSE; } else if ((index->type & DICT_UNIQUE) @@ -3398,6 +4023,7 @@ loop: } mem_heap_empty(heap); + offsets = offsets_; prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); @@ -3460,7 +4086,7 @@ row_check_table_for_mysql( ut_print_name(stderr, index->name); putc('\n', stderr); */ - if (!btr_validate_tree(index->tree)) { + if (!btr_validate_tree(index->tree, prebuilt->trx)) { ret = DB_ERROR; } else { if (!row_scan_and_check_index(prebuilt, @@ -3468,6 +4094,10 @@ row_check_table_for_mysql( ret = DB_ERROR; } + if (trx_is_interrupted(prebuilt->trx)) { + break; + } + /* fprintf(stderr, "%lu entries in index %s\n", n_rows, index->name); */ diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c 
index f7e01169b9d..abcf97110d9 100644 --- a/innobase/row/row0purge.c +++ b/innobase/row/row0purge.c @@ -99,6 +99,10 @@ row_purge_remove_clust_if_poss_low( ibool success; ulint err; mtr_t mtr; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; index = dict_table_get_first_index(node->table); @@ -117,15 +121,24 @@ row_purge_remove_clust_if_poss_low( return(TRUE); } + rec = btr_pcur_get_rec(pcur); + if (0 != ut_dulint_cmp(node->roll_ptr, - row_get_rec_roll_ptr(btr_pcur_get_rec(pcur), index))) { - + row_get_rec_roll_ptr(rec, index, rec_get_offsets( + rec, index, offsets_, ULINT_UNDEFINED, &heap)))) { + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } /* Someone else has modified the record later: do not remove */ btr_pcur_commit_specify_mtr(pcur, &mtr); return(TRUE); } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + if (mode == BTR_MODIFY_LEAF) { success = btr_cur_optimistic_delete(btr_cur, &mtr); } else { diff --git a/innobase/row/row0row.c b/innobase/row/row0row.c index 38714b0c49b..9a74397dc08 100644 --- a/innobase/row/row0row.c +++ b/innobase/row/row0row.c @@ -37,17 +37,18 @@ row_get_rec_sys_field( /* out: value of the field */ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { - ulint pos; - byte* field; - ulint len; + ulint pos; + byte* field; + ulint len; ut_ad(index->type & DICT_CLUSTERED); pos = dict_index_get_sys_col_pos(index, type); - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); if (type == DATA_TRX_ID) { @@ -70,6 +71,7 @@ row_set_rec_sys_field( ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint val) /* in: value to set */ { ulint pos; @@ -77,10 +79,11 @@ row_set_rec_sys_field( ulint len; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); pos = dict_index_get_sys_col_pos(index, type); - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); if (type == DATA_TRX_ID) { @@ -182,6 +185,9 @@ row_build( the buffer page of this record must be at least s-latched and the latch held as long as the row dtuple is used! */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) + or NULL, in which case this function + will invoke rec_get_offsets() */ mem_heap_t* heap) /* in: memory heap from which the memory needed is allocated */ { @@ -196,14 +202,26 @@ row_build( ulint row_len; byte* buf; ulint i; - + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + ut_ad(index && rec && heap); ut_ad(index->type & DICT_CLUSTERED); + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &tmp_heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + if (type != ROW_COPY_POINTERS) { /* Take a copy of rec to heap */ - buf = mem_heap_alloc(heap, rec_get_size(rec)); - rec = rec_copy(buf, rec); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). 
*/ + rec_offs_make_valid(rec, index, (ulint*) offsets); } table = index->table; @@ -211,11 +229,9 @@ row_build( row = dtuple_create(heap, row_len); - dtuple_set_info_bits(row, rec_get_info_bits(rec)); - - n_fields = dict_index_get_n_fields(index); + dtuple_set_info_bits(row, rec_get_info_bits(rec, table->comp)); - ut_ad(n_fields == rec_get_n_fields(rec)); + n_fields = rec_offs_n_fields(offsets); dict_table_copy_types(row, table); @@ -227,13 +243,13 @@ row_build( col = dict_field_get_col(ind_field); dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); - field = rec_get_nth_field(rec, i, &len); + field = rec_get_nth_field(rec, offsets, i, &len); if (type == ROW_COPY_ALSO_EXTERNALS - && rec_get_nth_field_extern_bit(rec, i)) { + && rec_offs_nth_extern(offsets, i)) { field = btr_rec_copy_externally_stored_field( - rec, i, &len, heap); + rec, offsets, i, &len, heap); } dfield_set_data(dfield, field, len); @@ -242,6 +258,10 @@ row_build( ut_ad(dtuple_check_typed(row)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + return(row); } @@ -276,16 +296,25 @@ row_rec_to_index_entry( ulint len; ulint rec_len; byte* buf; - + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + ut_ad(rec && heap && index); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + if (type == ROW_COPY_DATA) { /* Take a copy of rec to heap */ - buf = mem_heap_alloc(heap, rec_get_size(rec)); - rec = rec_copy(buf, rec); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, offsets); } - rec_len = rec_get_n_fields(rec); + rec_len = rec_offs_n_fields(offsets); entry = dtuple_create(heap, rec_len); @@ -295,17 +324,21 @@ row_rec_to_index_entry( dict_index_copy_types(entry, index, rec_len); - dtuple_set_info_bits(entry, rec_get_info_bits(rec)); + dtuple_set_info_bits(entry, + rec_get_info_bits(rec, rec_offs_comp(offsets))); for (i = 0; i < rec_len; i++) { dfield = dtuple_get_nth_field(entry, i); - field = rec_get_nth_field(rec, i, &len); + field = rec_get_nth_field(rec, offsets, i, &len); dfield_set_data(dfield, field, len); } ut_ad(dtuple_check_typed(entry)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } return(entry); } @@ -345,15 +378,24 @@ row_build_row_ref( byte* buf; ulint clust_col_prefix_len; ulint i; - + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + ut_ad(index && rec && heap); - + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + if (type == ROW_COPY_DATA) { /* Take a copy of rec to heap */ - buf = mem_heap_alloc(heap, rec_get_size(rec)); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); - rec = rec_copy(buf, rec); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). 
*/ + rec_offs_make_valid(rec, index, offsets); } table = index->table; @@ -373,7 +415,7 @@ row_build_row_ref( ut_a(pos != ULINT_UNDEFINED); - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); dfield_set_data(dfield, field, len); @@ -391,12 +433,15 @@ row_build_row_ref( dfield_set_len(dfield, dtype_get_at_most_n_mbchars( dfield_get_type(dfield), - clust_col_prefix_len, len, field)); + clust_col_prefix_len, len, (char*) field)); } } } ut_ad(dtuple_check_typed(ref)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } return(ref); } @@ -427,7 +472,11 @@ row_build_row_ref_in_tuple( ulint pos; ulint clust_col_prefix_len; ulint i; - + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + ut_a(ref && index && rec); if (!index->table) { @@ -446,7 +495,9 @@ row_build_row_ref_in_tuple( fputs("InnoDB: clust index for table ", stderr); goto notfound; } - + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + ref_len = dict_index_get_n_unique(clust_index); ut_ad(ref_len == dtuple_get_n_fields(ref)); @@ -459,8 +510,8 @@ row_build_row_ref_in_tuple( pos = dict_index_get_nth_field_pos(index, clust_index, i); ut_a(pos != ULINT_UNDEFINED); - - field = rec_get_nth_field(rec, pos, &len); + + field = rec_get_nth_field(rec, offsets, pos, &len); dfield_set_data(dfield, field, len); @@ -478,12 +529,15 @@ row_build_row_ref_in_tuple( dfield_set_len(dfield, dtype_get_at_most_n_mbchars( dfield_get_type(dfield), - clust_col_prefix_len, len, field)); + clust_col_prefix_len, len, (char*) field)); } } } ut_ad(dtuple_check_typed(ref)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } } /*********************************************************************** @@ -562,7 +616,6 @@ row_search_on_row_ref( ulint low_match; rec_t* rec; dict_index_t* index; - page_t* page; ut_ad(dtuple_check_typed(ref)); @@ -575,9 +628,8 @@ row_search_on_row_ref( low_match = btr_pcur_get_low_match(pcur); rec = btr_pcur_get_rec(pcur); - page = buf_frame_align(rec); - if (rec == page_get_infimum_rec(page)) { + if (page_rec_is_infimum(rec)) { return(FALSE); } @@ -648,7 +700,6 @@ row_search_index_entry( { ulint n_fields; ulint low_match; - page_t* page; rec_t* rec; ut_ad(dtuple_check_typed(entry)); @@ -657,11 +708,10 @@ row_search_index_entry( low_match = btr_pcur_get_low_match(pcur); rec = btr_pcur_get_rec(pcur); - page = buf_frame_align(rec); n_fields = dtuple_get_n_fields(entry); - if (rec == page_get_infimum_rec(page)) { + if (page_rec_is_infimum(rec)) { return(FALSE); } diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c index f8218e08297..602f5855171 100644 --- a/innobase/row/row0sel.c +++ b/innobase/row/row0sel.c @@ -78,8 +78,20 @@ row_sel_sec_rec_is_for_clust_rec( ulint n; ulint i; dtype_t* cur_type; + mem_heap_t* heap = NULL; + ulint clust_offsets_[REC_OFFS_NORMAL_SIZE]; + ulint sec_offsets_[REC_OFFS_SMALL_SIZE]; + ulint* clust_offs = clust_offsets_; + ulint* sec_offs = sec_offsets_; + ibool is_equal = TRUE; - UT_NOT_USED(clust_index); + *clust_offsets_ = (sizeof clust_offsets_) / sizeof *clust_offsets_; + *sec_offsets_ = (sizeof sec_offsets_) / sizeof *sec_offsets_; + + clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs, + ULINT_UNDEFINED, &heap); + sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs, + ULINT_UNDEFINED, &heap); n = dict_index_get_n_ordering_defined_by_user(sec_index); @@ -87,10 +99,10 @@ row_sel_sec_rec_is_for_clust_rec( ifield = 
dict_index_get_nth_field(sec_index, i); col = dict_field_get_col(ifield); - clust_field = rec_get_nth_field(clust_rec, + clust_field = rec_get_nth_field(clust_rec, clust_offs, dict_col_get_clust_pos(col), &clust_len); - sec_field = rec_get_nth_field(sec_rec, i, &sec_len); + sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len); if (ifield->prefix_len > 0 && clust_len != UNIV_SQL_NULL) { @@ -101,17 +113,22 @@ row_sel_sec_rec_is_for_clust_rec( clust_len = dtype_get_at_most_n_mbchars( cur_type, ifield->prefix_len, - clust_len, clust_field); + clust_len, (char*) clust_field); } if (0 != cmp_data_data(dict_col_get_type(col), clust_field, clust_len, sec_field, sec_len)) { - return(FALSE); + is_equal = FALSE; + goto func_exit; } } - return(TRUE); +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(is_equal); } /************************************************************************* @@ -266,6 +283,7 @@ row_sel_fetch_columns( dict_index_t* index, /* in: record index */ rec_t* rec, /* in: record in a clustered or non-clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ sym_node_t* column) /* in: first column in a column list, or NULL */ { @@ -275,6 +293,8 @@ row_sel_fetch_columns( byte* data; ulint len; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (index->type & DICT_CLUSTERED) { index_type = SYM_CLUST_FIELD_NO; } else { @@ -286,7 +306,7 @@ row_sel_fetch_columns( if (field_no != ULINT_UNDEFINED) { - data = rec_get_nth_field(rec, field_no, &len); + data = rec_get_nth_field(rec, offsets, field_no, &len); if (column->copy_val) { eval_node_copy_and_alloc_val(column, data, @@ -491,6 +511,10 @@ row_sel_build_prev_vers( read_view_t* read_view, /* in: read view */ plan_t* plan, /* in: plan node for table */ rec_t* rec, /* in: record in a clustered index */ + ulint** offsets, /* in/out: offsets returned by + rec_get_offsets(rec, plan->index) */ + mem_heap_t** offset_heap, /* in/out: memory heap from which + the offsets are allocated */ rec_t** old_vers, /* out: old version, or NULL if the record does not exist in the view: i.e., it was freshly inserted @@ -506,8 +530,8 @@ row_sel_build_prev_vers( } err = row_vers_build_for_consistent_read(rec, mtr, plan->index, - read_view, plan->old_vers_heap, - old_vers); + offsets, read_view, offset_heap, + plan->old_vers_heap, old_vers); return(err); } @@ -601,8 +625,18 @@ row_sel_get_clust_rec( rec_t* clust_rec; rec_t* old_vers; ulint err; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + + *out_rec = NULL; + + offsets = rec_get_offsets(rec, + btr_pcur_get_btr_cur(&plan->pcur)->index, + offsets, ULINT_UNDEFINED, &heap); - row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec); + row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets); index = dict_table_get_first_index(plan->table); @@ -619,7 +653,7 @@ row_sel_get_clust_rec( || btr_pcur_get_low_match(&(plan->clust_pcur)) < dict_index_get_n_unique(index)) { - ut_a(rec_get_deleted_flag(rec)); + ut_a(rec_get_deleted_flag(rec, plan->table->comp)); ut_a(node->read_view); /* In a rare case it is possible that no clust rec is found @@ -631,34 +665,33 @@ row_sel_get_clust_rec( clustered index record did not exist in the read view of trx. 
*/ - clust_rec = NULL; - goto func_exit; } + offsets = rec_get_offsets(clust_rec, index, offsets, + ULINT_UNDEFINED, &heap); + if (!node->read_view) { /* Try to place a lock on the index record */ /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is - not used. - */ + we lock only the record, i.e., next-key locking is + not used. */ + ulint lock_type; if (srv_locks_unsafe_for_binlog) { - err = lock_clust_rec_read_check_and_lock(0, - clust_rec, - index, node->row_lock_mode, - LOCK_REC_NOT_GAP, thr); + lock_type = LOCK_REC_NOT_GAP; } else { - err = lock_clust_rec_read_check_and_lock(0, - clust_rec, - index, node->row_lock_mode, - LOCK_ORDINARY, thr); + lock_type = LOCK_ORDINARY; } + err = lock_clust_rec_read_check_and_lock(0, + clust_rec, index, offsets, + node->row_lock_mode, lock_type, thr); + if (err != DB_SUCCESS) { - return(err); + goto err_exit; } } else { /* This is a non-locking consistent read: if necessary, fetch @@ -666,22 +699,21 @@ row_sel_get_clust_rec( old_vers = NULL; - if (!lock_clust_rec_cons_read_sees(clust_rec, index, + if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets, node->read_view)) { err = row_sel_build_prev_vers(node->read_view, plan, - clust_rec, &old_vers, mtr); + clust_rec, &offsets, &heap, + &old_vers, mtr); if (err != DB_SUCCESS) { - return(err); + goto err_exit; } clust_rec = old_vers; if (clust_rec == NULL) { - *out_rec = clust_rec; - - return(DB_SUCCESS); + goto func_exit; } } @@ -698,24 +730,25 @@ row_sel_get_clust_rec( visit through secondary index records that would not really exist in our snapshot. */ - if ((old_vers || rec_get_deleted_flag(rec)) + if ((old_vers || rec_get_deleted_flag(rec, plan->table->comp)) && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index, clust_rec, index)) { - clust_rec = NULL; - *out_rec = clust_rec; - - return(DB_SUCCESS); + goto func_exit; } } /* Fetch the columns needed in test conditions */ - - row_sel_fetch_columns(index, clust_rec, + + row_sel_fetch_columns(index, clust_rec, offsets, UT_LIST_GET_FIRST(plan->columns)); -func_exit: *out_rec = clust_rec; - - return(DB_SUCCESS); +func_exit: + err = DB_SUCCESS; +err_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); } /************************************************************************* @@ -727,6 +760,7 @@ sel_set_rec_lock( /* out: DB_SUCCESS or error code */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: lock mode */ ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or LOC_REC_NOT_GAP */ que_thr_t* thr) /* in: query thread */ @@ -744,11 +778,11 @@ sel_set_rec_lock( } if (index->type & DICT_CLUSTERED) { - err = lock_clust_rec_read_check_and_lock(0, rec, index, mode, - type, thr); + err = lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, mode, type, thr); } else { - err = lock_sec_rec_read_check_and_lock(0, rec, index, mode, - type, thr); + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, mode, type, thr); } return(err); @@ -956,6 +990,11 @@ row_sel_try_search_shortcut( { dict_index_t* index; rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ulint ret; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; index = plan->index; @@ -989,36 +1028,46 @@ row_sel_try_search_shortcut( /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ + offsets = rec_get_offsets(rec, 
index, offsets, ULINT_UNDEFINED, &heap); + if (index->type & DICT_CLUSTERED) { - if (!lock_clust_rec_cons_read_sees(rec, index, + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, node->read_view)) { - return(SEL_RETRY); + ret = SEL_RETRY; + goto func_exit; } } else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) { - return(SEL_RETRY); + ret = SEL_RETRY; + goto func_exit; } /* Test deleted flag. Fetch the columns needed in test conditions. */ - - row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns)); - if (rec_get_deleted_flag(rec)) { + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); - return(SEL_EXHAUSTED); + if (rec_get_deleted_flag(rec, plan->table->comp)) { + + ret = SEL_EXHAUSTED; + goto func_exit; } /* Test the rest of search conditions */ if (!row_sel_test_other_conds(plan)) { - return(SEL_EXHAUSTED); + ret = SEL_EXHAUSTED; + goto func_exit; } ut_ad(plan->pcur.latch_mode == node->latch_mode); plan->n_rows_fetched++; - +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(SEL_FOUND); } @@ -1067,7 +1116,11 @@ row_sel( to the next non-clustered record */ ulint found_flag; ulint err; - + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + ut_ad(thr->run_node == node); search_latch_locked = FALSE; @@ -1207,7 +1260,7 @@ rec_loop: /* PHASE 1: Set a lock if specified */ if (!node->asc && cursor_just_opened - && (rec != page_get_supremum_rec(buf_frame_align(rec)))) { + && !page_rec_is_supremum(rec)) { /* When we open a cursor for a descending search, we must set a next-key lock on the successor record: otherwise it would @@ -1218,22 +1271,23 @@ rec_loop: if (!consistent_read) { /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is - not used. - */ + we lock only the record, i.e., next-key locking is + not used. */ + + rec_t* next_rec = page_rec_get_next(rec); + ulint lock_type; + offsets = rec_get_offsets(next_rec, index, offsets, + ULINT_UNDEFINED, &heap); if (srv_locks_unsafe_for_binlog) { - err = sel_set_rec_lock(page_rec_get_next(rec), - index, - node->row_lock_mode, - LOCK_REC_NOT_GAP, thr); + lock_type = LOCK_REC_NOT_GAP; } else { - err = sel_set_rec_lock(page_rec_get_next(rec), - index, - node->row_lock_mode, - LOCK_ORDINARY, thr); + lock_type = LOCK_ORDINARY; } + err = sel_set_rec_lock(next_rec, index, offsets, + node->row_lock_mode, lock_type, thr); + if (err != DB_SUCCESS) { /* Note that in this case we will store in pcur the PREDECESSOR of the record we are waiting @@ -1244,7 +1298,7 @@ rec_loop: } } - if (rec == page_get_infimum_rec(buf_frame_align(rec))) { + if (page_rec_is_infimum(rec)) { /* The infimum record on a page cannot be in the result set, and neither can a record lock be placed on it: we skip such @@ -1260,25 +1314,29 @@ rec_loop: /* Try to place a lock on the index record */ /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is - not used. - */ + we lock only the record, i.e., next-key locking is + not used. 
*/ + + ulint lock_type; + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); if (srv_locks_unsafe_for_binlog) { - err = sel_set_rec_lock(rec, index, node->row_lock_mode, - LOCK_REC_NOT_GAP, thr); + lock_type = LOCK_REC_NOT_GAP; } else { - err = sel_set_rec_lock(rec, index, node->row_lock_mode, - LOCK_ORDINARY, thr); + lock_type = LOCK_ORDINARY; } + err = sel_set_rec_lock(rec, index, offsets, + node->row_lock_mode, lock_type, thr); + if (err != DB_SUCCESS) { goto lock_wait_or_error; } } - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + if (page_rec_is_supremum(rec)) { /* A page supremum record cannot be in the result set: skip it now when we have placed a possible lock on it */ @@ -1334,6 +1392,7 @@ rec_loop: /* PHASE 3: Get previous version in a consistent read */ cons_read_requires_clust_rec = FALSE; + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); if (consistent_read) { /* This is a non-locking consistent read: if necessary, fetch @@ -1341,19 +1400,24 @@ rec_loop: if (index->type & DICT_CLUSTERED) { - if (!lock_clust_rec_cons_read_sees(rec, index, + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, node->read_view)) { err = row_sel_build_prev_vers(node->read_view, - plan, rec, &old_vers, - &mtr); + plan, rec, + &offsets, &heap, + &old_vers, &mtr); if (err != DB_SUCCESS) { goto lock_wait_or_error; } if (old_vers == NULL) { + offsets = rec_get_offsets( + rec, index, offsets, + ULINT_UNDEFINED, &heap); row_sel_fetch_columns(index, rec, + offsets, UT_LIST_GET_FIRST(plan->columns)); if (!row_sel_test_end_conds(plan)) { @@ -1376,7 +1440,8 @@ rec_loop: /* Fetch the columns needed in test conditions */ - row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns)); + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); /* Test the selection end conditions: these can only contain columns which already are found in the index, even though the index might be @@ -1391,7 +1456,8 @@ rec_loop: goto table_exhausted; } - if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) { + if (rec_get_deleted_flag(rec, plan->table->comp) + && !cons_read_requires_clust_rec) { /* The record is delete marked: we can skip it if this is not a consistent read which might see an earlier version @@ -1434,7 +1500,7 @@ rec_loop: goto next_rec; } - if (rec_get_deleted_flag(clust_rec)) { + if (rec_get_deleted_flag(clust_rec, plan->table->comp)) { /* The record is delete marked: we can skip it */ @@ -1592,8 +1658,9 @@ next_table_no_mtr: if (search_latch_locked) { rw_lock_s_unlock(&btr_search_latch); } - - return(DB_SUCCESS); + + err = DB_SUCCESS; + goto func_exit; } node->fetch_table++; @@ -1626,6 +1693,7 @@ table_exhausted: table_exhausted_no_mtr: if (node->fetch_table == 0) { + err = DB_SUCCESS; if (node->is_aggregate && !node->aggregate_already_fetched) { @@ -1639,7 +1707,7 @@ table_exhausted_no_mtr: rw_lock_s_unlock(&btr_search_latch); } - return(DB_SUCCESS); + goto func_exit; } node->state = SEL_NODE_NO_MORE_ROWS; @@ -1650,7 +1718,7 @@ table_exhausted_no_mtr: rw_lock_s_unlock(&btr_search_latch); } - return(DB_SUCCESS); + goto func_exit; } node->fetch_table--; @@ -1674,8 +1742,8 @@ stop_for_a_while: mtr_commit(&mtr); ut_ad(sync_thread_levels_empty_gen(TRUE)); - - return(DB_SUCCESS); + err = DB_SUCCESS; + goto func_exit; commit_mtr_for_a_while: /* Stores the cursor position and commits &mtr; this is used if @@ -1710,6 +1778,10 @@ lock_wait_or_error: ut_ad(sync_thread_levels_empty_gen(TRUE)); +func_exit: + if 
(UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(err); } @@ -1945,7 +2017,8 @@ Converts a key value stored in MySQL format to an Innobase dtuple. The last field of the key value may be just a prefix of a fixed length field: hence the parameter key_len. But currently we do not allow search keys where the last field is only a prefix of the full key field len and print a warning if -such appears. */ +such appears. A counterpart of this function is +ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ void row_sel_convert_mysql_key_to_innobase( @@ -2026,10 +2099,10 @@ row_sel_convert_mysql_key_to_innobase( type = dfield_get_type(dfield)->mtype; /* Calculate data length and data field total length */ - + if (type == DATA_BLOB) { /* The key field is a column prefix of a BLOB or - TEXT type column */ + TEXT */ ut_a(field->prefix_len > 0); @@ -2045,11 +2118,12 @@ row_sel_convert_mysql_key_to_innobase( data_len = key_ptr[data_offset] + 256 * key_ptr[data_offset + 1]; data_field_len = data_offset + 2 + field->prefix_len; + data_offset += 2; - - type = DATA_CHAR; /* now that we know the length, we - store the column value like it would - be a fixed char field */ + + /* Now that we know the length, we store the column + value like it would be a fixed char field */ + } else if (field->prefix_len > 0) { /* Looks like MySQL pads unused end bytes in the prefix with space. Therefore, also in UTF-8, it is ok @@ -2069,14 +2143,32 @@ row_sel_convert_mysql_key_to_innobase( data_field_len = data_offset + data_len; } + if (dtype_get_mysql_type(dfield_get_type(dfield)) + == DATA_MYSQL_TRUE_VARCHAR + && dfield_get_type(dfield)->mtype != DATA_INT) { + /* In a MySQL key value format, a true VARCHAR is + always preceded by 2 bytes of a length field. + dfield_get_type(dfield)->len returns the maximum + 'payload' len in bytes. That does not include the + 2 bytes that tell the actual data length. + + We added the check != DATA_INT to make sure we do + not treat MySQL ENUM or SET as a true VARCHAR! 
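As the comment above says, a true VARCHAR in a MySQL key value is always preceded by a 2-byte length, independently of the 1- or 2-byte choice made in the row format. A minimal sketch of decoding one such key part follows; the layout shown (length, then payload, inside a fixed-size slot) is an illustration, not code taken from the server sources.

#include <stdio.h>

int main(void)
{
	/* key part for a VARCHAR(10) column holding "abc":
	2-byte little-endian length, then the payload, inside a
	fixed-size slot of 2 + 10 bytes */
	unsigned char	key_part[2 + 10] = {0x03, 0x00, 'a', 'b', 'c'};
	unsigned int	len = key_part[0] + 256 * key_part[1];

	printf("payload length %u: %.*s\n",
	       len, (int) len, (const char*) (key_part + 2));

	return(0);
}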
*/ + + data_len += 2; + data_field_len += 2; + } + /* Storing may use at most data_len bytes of buf */ if (!is_null) { row_mysql_store_col_in_innobase_format( - dfield, buf, key_ptr + data_offset, - data_len, type, - dfield_get_type(dfield)->prtype - & DATA_UNSIGNED); + dfield, + buf, + FALSE, /* MySQL key value format col */ + key_ptr + data_offset, + data_len, + index->table->comp); buf += data_len; } @@ -2133,11 +2225,16 @@ row_sel_store_row_id_to_prebuilt( /*=============================*/ row_prebuilt_t* prebuilt, /* in: prebuilt */ rec_t* index_rec, /* in: record */ - dict_index_t* index) /* in: index of the record */ + dict_index_t* index, /* in: index of the record */ + const ulint* offsets) /* in: rec_get_offsets + (index_rec, index) */ { byte* data; ulint len; - data = rec_get_nth_field(index_rec, + + ut_ad(rec_offs_validate(index_rec, index, offsets)); + + data = rec_get_nth_field(index_rec, offsets, dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len); if (len != DATA_ROW_ID_LEN) { @@ -2146,8 +2243,8 @@ row_sel_store_row_id_to_prebuilt( dict_index_name_print(stderr, prebuilt->trx, index); fprintf(stderr, "\n" "InnoDB: Field number %lu, record:\n", - (ulong) dict_index_get_sys_col_pos(index, DATA_ROW_ID)); - rec_print(stderr, index_rec); + (ulong) dict_index_get_sys_col_pos(index, DATA_ROW_ID)); + rec_print_new(stderr, index_rec, offsets); putc('\n', stderr); ut_error; } @@ -2156,8 +2253,9 @@ row_sel_store_row_id_to_prebuilt( } /****************************************************************** -Stores a non-SQL-NULL field in the MySQL format. */ -UNIV_INLINE +Stores a non-SQL-NULL field in the MySQL format. The counterpart of this +function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */ +static void row_sel_field_store_in_mysql_format( /*================================*/ @@ -2165,17 +2263,19 @@ row_sel_field_store_in_mysql_format( are not in themselves stored here: the caller must allocate and copy the BLOB into buffer before, and pass the pointer to the BLOB in 'data' */ - ulint col_len,/* in: MySQL column length */ + const mysql_row_templ_t* templ, /* in: MySQL column template. + Its following fields are referenced: + type, is_unsigned, mysql_col_len, mbminlen, mbmaxlen */ byte* data, /* in: data to store */ - ulint len, /* in: length of the data */ - ulint type, /* in: data type */ - ulint is_unsigned)/* in: != 0 if an unsigned integer type */ + ulint len) /* in: length of the data */ { byte* ptr; + byte* field_end; + byte* pad_ptr; ut_ad(len != UNIV_SQL_NULL); - if (type == DATA_INT) { + if (templ->type == DATA_INT) { /* Convert integer data from Innobase to a little-endian format, sign bit restored to normal */ @@ -2190,31 +2290,103 @@ row_sel_field_store_in_mysql_format( data++; } - if (!is_unsigned) { + if (!templ->is_unsigned) { dest[len - 1] = (byte) (dest[len - 1] ^ 128); } - ut_ad(col_len == len); - } else if (type == DATA_VARCHAR || type == DATA_VARMYSQL - || type == DATA_BINARY) { - /* Store the length of the data to the first two bytes of - dest; does not do anything yet because MySQL has - no real vars! */ + ut_ad(templ->mysql_col_len == len); + } else if (templ->type == DATA_VARCHAR + || templ->type == DATA_VARMYSQL + || templ->type == DATA_BINARY) { + + field_end = dest + templ->mysql_col_len; + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR. Store the + length of the data to the first byte or the first + two bytes of dest. 
*/ - dest = row_mysql_store_var_len(dest, len); - ut_memcpy(dest, data, len); + dest = row_mysql_store_true_var_len(dest, len, + templ->mysql_length_bytes); + } - /* ut_ad(col_len >= len + 2); No real var implemented in - MySQL yet! */ + /* Copy the actual data */ + ut_memcpy(dest, data, len); - } else if (type == DATA_BLOB) { + /* Pad with trailing spaces. We pad with spaces also the + unused end of a >= 5.0.3 true VARCHAR column, just in case + MySQL expects its contents to be deterministic. */ + + pad_ptr = dest + len; + + ut_ad(templ->mbminlen <= templ->mbmaxlen); + + /* We handle UCS2 charset strings differently. */ + if (templ->mbminlen == 2) { + /* A space char is two bytes, 0x0020 in UCS2 */ + + if (len & 1) { + /* A 0x20 has been stripped from the column. + Pad it back. */ + + if (pad_ptr < field_end) { + *pad_ptr = 0x20; + pad_ptr++; + } + } + + /* Pad the rest of the string with 0x0020 */ + + while (pad_ptr < field_end) { + *pad_ptr = 0x00; + pad_ptr++; + *pad_ptr = 0x20; + pad_ptr++; + } + } else { + ut_ad(templ->mbminlen == 1); + /* space=0x20 */ + + memset(pad_ptr, 0x20, field_end - pad_ptr); + } + } else if (templ->type == DATA_BLOB) { /* Store a pointer to the BLOB buffer to dest: the BLOB was already copied to the buffer in row_sel_store_mysql_rec */ - row_mysql_store_blob_ref(dest, col_len, data, len); + row_mysql_store_blob_ref(dest, templ->mysql_col_len, data, + len); + } else if (templ->type == DATA_MYSQL) { + memcpy(dest, data, len); + + ut_a(templ->mysql_col_len >= len); + ut_a(templ->mbmaxlen >= templ->mbminlen); + + ut_a(templ->mbmaxlen > templ->mbminlen + || templ->mysql_col_len == len); + /* The following assertion would fail for old tables + containing UTF-8 ENUM columns due to Bug #9526. */ + ut_ad(!templ->mbmaxlen + || !(templ->mysql_col_len % templ->mbmaxlen)); + ut_a(len * templ->mbmaxlen >= templ->mysql_col_len); + + if (templ->mbminlen != templ->mbmaxlen) { + /* Pad with spaces. This undoes the stripping + done in row0mysql.ic, function + row_mysql_store_col_in_innobase_format(). 
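The stripping done in row0mysql.c and the padding done here are inverses of each other. The standalone sketch below (plain C, illustrative only) walks a utf8 CHAR(5) column, with mysql_col_len 15 and mbmaxlen 3, through that round trip.

#include <assert.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* utf8 CHAR(5): mbminlen 1, mbmaxlen 3, mysql_col_len 15 */
	enum { MYSQL_COL_LEN = 15, MBMAXLEN = 3 };
	unsigned char	mysql_val[MYSQL_COL_LEN];
	unsigned char	fetched[MYSQL_COL_LEN];
	size_t		n_chars = MYSQL_COL_LEN / MBMAXLEN;	/* 5 */
	size_t		len = MYSQL_COL_LEN;

	/* the row-format value "a" followed by 14 space bytes */
	memset(mysql_val, 0x20, sizeof mysql_val);
	mysql_val[0] = 'a';

	/* store: strip the space padding down to n_chars bytes */
	while (len > n_chars && mysql_val[len - 1] == 0x20) {
		len--;
	}
	assert(len == 5);

	/* fetch: copy the stored bytes back and pad with 0x20 up to
	the fixed MySQL column length */
	memcpy(fetched, mysql_val, len);
	memset(fetched + len, 0x20, MYSQL_COL_LEN - len);

	assert(!memcmp(fetched, mysql_val, MYSQL_COL_LEN));
	printf("stored %lu bytes, restored %d bytes\n",
	       (unsigned long) len, MYSQL_COL_LEN);

	return(0);
}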
*/ + + memset(dest + len, 0x20, templ->mysql_col_len - len); + } } else { - ut_memcpy(dest, data, len); - ut_ad(col_len == len); + ut_a(templ->type == DATA_CHAR + || templ->type == DATA_FIXBINARY + /*|| templ->type == DATA_SYS_CHILD + || templ->type == DATA_SYS*/ + || templ->type == DATA_FLOAT + || templ->type == DATA_DOUBLE + || templ->type == DATA_DECIMAL); + ut_ad(templ->mysql_col_len == len); + + memcpy(dest, data, len); } } @@ -2233,37 +2405,35 @@ row_sel_store_mysql_rec( case) */ byte* mysql_rec, /* out: row in the MySQL format */ row_prebuilt_t* prebuilt, /* in: prebuilt struct */ - rec_t* rec) /* in: Innobase record in the index + rec_t* rec, /* in: Innobase record in the index which was described in prebuilt's template */ + const ulint* offsets) /* in: array returned by + rec_get_offsets() */ { mysql_row_templ_t* templ; mem_heap_t* extern_field_heap = NULL; byte* data; ulint len; - byte* blob_buf; - int pad_char; ulint i; ut_ad(prebuilt->mysql_template); + ut_ad(rec_offs_validate(rec, NULL, offsets)); - if (prebuilt->blob_heap != NULL) { + if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) { mem_heap_free(prebuilt->blob_heap); prebuilt->blob_heap = NULL; } - /* MySQL assumes that all columns have the SQL NULL bit set unless it - is a nullable column with a non-NULL value */ - - memset(mysql_rec, 0xFF, prebuilt->null_bitmap_len); - for (i = 0; i < prebuilt->n_template; i++) { templ = prebuilt->mysql_template + i; - data = rec_get_nth_field(rec, templ->rec_field_no, &len); + data = rec_get_nth_field(rec, offsets, + templ->rec_field_no, &len); - if (rec_get_nth_field_extern_bit(rec, templ->rec_field_no)) { + if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, + templ->rec_field_no))) { /* Copy an externally stored field to the temporary heap */ @@ -2277,14 +2447,14 @@ row_sel_store_mysql_rec( causes an assert */ data = btr_rec_copy_externally_stored_field(rec, - templ->rec_field_no, &len, + offsets, templ->rec_field_no, &len, extern_field_heap); ut_a(len != UNIV_SQL_NULL); } if (len != UNIV_SQL_NULL) { - if (templ->type == DATA_BLOB) { + if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) { ut_a(prebuilt->templ_contains_blob); @@ -2293,8 +2463,9 @@ row_sel_store_mysql_rec( of 1000000 bytes. Since the test takes some CPU time, we do not use it for small BLOBs. */ - if (len > 2000000 - && !ut_test_malloc(len + 1000000)) { + if (UNIV_UNLIKELY(len > 2000000) + && UNIV_UNLIKELY(!ut_test_malloc( + len + 1000000))) { ut_print_timestamp(stderr); fprintf(stderr, @@ -2320,55 +2491,14 @@ row_sel_store_mysql_rec( mem_heap_create(len); } - blob_buf = mem_heap_alloc(prebuilt->blob_heap, - len); - ut_memcpy(blob_buf, data, len); - - data = blob_buf; + data = memcpy(mem_heap_alloc( + prebuilt->blob_heap, len), + data, len); } row_sel_field_store_in_mysql_format( mysql_rec + templ->mysql_col_offset, - templ->mysql_col_len, data, len, - templ->type, templ->is_unsigned); - - if (templ->type == DATA_VARCHAR - || templ->type == DATA_VARMYSQL - || templ->type == DATA_BINARY) { - /* Pad with trailing spaces */ - data = mysql_rec + templ->mysql_col_offset; - - /* Handle UCS2 strings differently. As no new - collations will be introduced in 4.1, we - hardcode the charset-collation codes here. - 5.0 will use a different approach. */ - if (templ->charset == 35 - || templ->charset == 90 - || (templ->charset >= 128 - && templ->charset <= 144)) { - /* space=0x0020 */ - ulint col_len = templ->mysql_col_len; - - ut_a(!(col_len & 1)); - if (len & 1) { - /* A 0x20 has been stripped - from the column. - Pad it back. 
*/ - goto pad_0x20; - } - /* Pad the rest of the string - with 0x0020 */ - while (len < col_len) { - data[len++] = 0x00; - pad_0x20: - data[len++] = 0x20; - } - } else { - /* space=0x20 */ - memset(data + len, 0x20, - templ->mysql_col_len - len); - } - } + templ, data, len); /* Cleanup */ if (extern_field_heap) { @@ -2388,45 +2518,45 @@ row_sel_store_mysql_rec( account caused seg faults with NULL BLOB fields, and bug number 154 in the MySQL bug database: GROUP BY and DISTINCT could treat NULL values inequal. */ - - if (templ->type == DATA_VARCHAR - || templ->type == DATA_CHAR - || templ->type == DATA_BINARY - || templ->type == DATA_FIXBINARY - || templ->type == DATA_MYSQL - || templ->type == DATA_VARMYSQL) { + int pad_char; + + mysql_rec[templ->mysql_null_byte_offset] |= + (byte) (templ->mysql_null_bit_mask); + switch (templ->type) { + case DATA_VARCHAR: + case DATA_CHAR: + case DATA_BINARY: + case DATA_FIXBINARY: + case DATA_MYSQL: + case DATA_VARMYSQL: /* MySQL pads all non-BLOB and non-TEXT string types with space ' ' */ - - pad_char = ' '; - } else { - pad_char = '\0'; + if (UNIV_UNLIKELY(templ->mbminlen == 2)) { + /* Treat UCS2 as a special case. */ + data = mysql_rec + + templ->mysql_col_offset; + len = templ->mysql_col_len; + /* There are two UCS2 bytes per char, + so the length has to be even. */ + ut_a(!(len & 1)); + /* Pad with 0x0020. */ + while (len) { + *data++ = 0x00; + *data++ = 0x20; + len -= 2; + } + continue; + } + pad_char = 0x20; + break; + default: + pad_char = 0x00; + break; } - /* Handle UCS2 strings differently. As no new - collations will be introduced in 4.1, - we hardcode the charset-collation codes here. - 5.0 will use a different approach. */ - if (pad_char != '\0' - && (templ->charset == 35 - || templ->charset == 90 - || (templ->charset >= 128 - && templ->charset <= 144))) { - /* There are two bytes per char, so the length - has to be an even number. */ - ut_a(!(templ->mysql_col_len & 1)); - data = mysql_rec + templ->mysql_col_offset; - len = templ->mysql_col_len; - /* Pad with 0x0020. 
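For SQL NULL columns the rewritten code first raises the column's NULL bit through its byte offset and bit mask, then still fills the column body with a type-dependent pad byte (space for non-BLOB string types, zero otherwise, with a separate UCS2 path) so that whole-row comparisons stay deterministic. A condensed standalone sketch of that step; the template structure and its field names are hypothetical stand-ins for the MySQL row template:

#include <string.h>

struct col_templ {
        unsigned long   null_byte_offset;  /* byte holding the NULL bit */
        unsigned char   null_bit_mask;     /* mask of that bit */
        unsigned long   col_offset;        /* column start within the row */
        unsigned long   col_len;           /* column width in bytes */
        int             is_string;         /* pad with ' ' if nonzero */
};

static void
store_sql_null(unsigned char* mysql_rec, const struct col_templ* t)
{
        /* Mark the column as NULL ... */
        mysql_rec[t->null_byte_offset] |= t->null_bit_mask;

        /* ... and still give the column body a well-defined value, so
        that memcmp()-style comparisons of complete rows behave. */
        memset(mysql_rec + t->col_offset,
               t->is_string ? 0x20 : 0x00, t->col_len);
}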
*/ - while (len >= 2) { - *data++ = 0x00; - *data++ = 0x20; - len -= 2; - } - } else { - memset(mysql_rec + templ->mysql_col_offset, + ut_ad(!pad_char || templ->mbminlen == 1); + memset(mysql_rec + templ->mysql_col_offset, pad_char, templ->mysql_col_len); - } } } @@ -2444,6 +2574,10 @@ row_sel_build_prev_vers_for_mysql( dict_index_t* clust_index, /* in: clustered index */ row_prebuilt_t* prebuilt, /* in: prebuilt struct */ rec_t* rec, /* in: record in a clustered index */ + ulint** offsets, /* in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /* in/out: memory heap from which + the offsets are allocated */ rec_t** old_vers, /* out: old version, or NULL if the record does not exist in the view: i.e., it was freshly inserted @@ -2459,8 +2593,8 @@ row_sel_build_prev_vers_for_mysql( } err = row_vers_build_for_consistent_read(rec, mtr, clust_index, - read_view, prebuilt->old_vers_heap, - old_vers); + offsets, read_view, offset_heap, + prebuilt->old_vers_heap, old_vers); return(err); } @@ -2484,6 +2618,10 @@ row_sel_get_clust_rec_for_mysql( it, NULL if the old version did not exist in the read view, i.e., it was a fresh inserted version */ + ulint** offsets,/* out: offsets returned by + rec_get_offsets(out_rec, clust_index) */ + mem_heap_t** offset_heap,/* in/out: memory heap from which + the offsets are allocated */ mtr_t* mtr) /* in: mtr used to get access to the non-clustered record; the same mtr is used to access the clustered index */ @@ -2525,9 +2663,8 @@ row_sel_get_clust_rec_for_mysql( clustered index record did not exist in the read view of trx. */ - if (!rec_get_deleted_flag(rec) + if (!rec_get_deleted_flag(rec, sec_index->table->comp) || prebuilt->select_lock_type != LOCK_NONE) { - ut_print_timestamp(stderr); fputs(" InnoDB: error clustered record" " for sec rec not found\n" @@ -2535,10 +2672,10 @@ row_sel_get_clust_rec_for_mysql( dict_index_name_print(stderr, trx, sec_index); fputs("\n" "InnoDB: sec index record ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, sec_index); fputs("\n" "InnoDB: clust index record ", stderr); - rec_print(stderr, clust_rec); + rec_print(stderr, clust_rec, clust_index); putc('\n', stderr); trx_print(stderr, trx); @@ -2551,18 +2688,21 @@ row_sel_get_clust_rec_for_mysql( goto func_exit; } + *offsets = rec_get_offsets(clust_rec, clust_index, *offsets, + ULINT_UNDEFINED, offset_heap); + if (prebuilt->select_lock_type != LOCK_NONE) { /* Try to place a lock on the index record; we are searching the clust rec with a unique condition, hence we set a LOCK_REC_NOT_GAP type lock */ err = lock_clust_rec_read_check_and_lock(0, clust_rec, - clust_index, + clust_index, *offsets, prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr); if (err != DB_SUCCESS) { - return(err); + goto err_exit; } } else { /* This is a non-locking consistent read: if necessary, fetch @@ -2575,16 +2715,17 @@ row_sel_get_clust_rec_for_mysql( if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED && !lock_clust_rec_cons_read_sees(clust_rec, clust_index, - trx->read_view)) { + *offsets, trx->read_view)) { err = row_sel_build_prev_vers_for_mysql( trx->read_view, clust_index, prebuilt, clust_rec, + offsets, offset_heap, &old_vers, mtr); if (err != DB_SUCCESS) { - return(err); + goto err_exit; } clust_rec = old_vers; @@ -2603,7 +2744,8 @@ row_sel_get_clust_rec_for_mysql( visit through secondary index records that would not really exist in our snapshot. 
*/ - if (clust_rec && (old_vers || rec_get_deleted_flag(rec)) + if (clust_rec && (old_vers + || rec_get_deleted_flag(rec, sec_index->table->comp)) && !row_sel_sec_rec_is_for_clust_rec(rec, sec_index, clust_rec, clust_index)) { clust_rec = NULL; @@ -2625,7 +2767,9 @@ func_exit: btr_pcur_store_position(prebuilt->clust_pcur, mtr); } - return(DB_SUCCESS); + err = DB_SUCCESS; +err_exit: + return(err); } /************************************************************************ @@ -2640,6 +2784,10 @@ sel_restore_position_for_mysql( process the record the cursor is now positioned on (i.e. we should not go to the next record yet) */ + ibool* same_user_rec, /* out: TRUE if we were able to restore + the cursor on a user record with the + same ordering prefix in in the + B-tree index */ ulint latch_mode, /* in: latch mode wished in restoration */ btr_pcur_t* pcur, /* in: cursor whose position @@ -2656,6 +2804,8 @@ sel_restore_position_for_mysql( success = btr_pcur_restore_position(latch_mode, pcur, mtr); + *same_user_rec = success; + if (relative_position == BTR_PCUR_ON) { if (success) { return(FALSE); @@ -2702,10 +2852,41 @@ row_sel_pop_cached_row_for_mysql( row */ row_prebuilt_t* prebuilt) /* in: prebuilt struct */ { - ut_ad(prebuilt->n_fetch_cached > 0); - - ut_memcpy(buf, prebuilt->fetch_cache[prebuilt->fetch_cache_first], - prebuilt->mysql_row_len); + ulint i; + mysql_row_templ_t* templ; + byte* cached_rec; + ut_ad(prebuilt->n_fetch_cached > 0); + ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len); + + if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) + { + /* Copy cache record field by field, don't touch fields that + are not covered by current key */ + cached_rec = + prebuilt->fetch_cache[prebuilt->fetch_cache_first]; + + for (i = 0; i < prebuilt->n_template; i++) { + templ = prebuilt->mysql_template + i; + ut_memcpy( + buf + templ->mysql_col_offset, + cached_rec + templ->mysql_col_offset, + templ->mysql_col_len); + /* Copy NULL bit of the current field from cached_rec + to buf */ + if (templ->mysql_null_bit_mask) + { + buf[templ->mysql_null_byte_offset] ^= + (buf[templ->mysql_null_byte_offset] ^ + cached_rec[templ->mysql_null_byte_offset]) & + (byte)templ->mysql_null_bit_mask; + } + } + } + else + { + ut_memcpy(buf, prebuilt->fetch_cache[prebuilt->fetch_cache_first], + prebuilt->mysql_prefix_len); + } prebuilt->n_fetch_cached--; prebuilt->fetch_cache_first++; @@ -2721,12 +2902,14 @@ void row_sel_push_cache_row_for_mysql( /*=============================*/ row_prebuilt_t* prebuilt, /* in: prebuilt struct */ - rec_t* rec) /* in: record to push */ + rec_t* rec, /* in: record to push */ + const ulint* offsets) /* in: rec_get_offsets() */ { byte* buf; ulint i; ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_a(!prebuilt->templ_contains_blob); if (prebuilt->fetch_cache[0] == NULL) { @@ -2750,9 +2933,11 @@ row_sel_push_cache_row_for_mysql( ut_ad(prebuilt->fetch_cache_first == 0); - ut_a(row_sel_store_mysql_rec( + if (UNIV_UNLIKELY(!row_sel_store_mysql_rec( prebuilt->fetch_cache[prebuilt->n_fetch_cached], - prebuilt, rec)); + prebuilt, rec, offsets))) { + ut_error; + } prebuilt->n_fetch_cached++; } @@ -2769,6 +2954,8 @@ row_sel_try_search_shortcut_for_mysql( /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ rec_t** out_rec,/* out: record if found */ row_prebuilt_t* prebuilt,/* in: prebuilt struct */ + ulint** offsets,/* in/out: for rec_get_offsets(*out_rec) */ + mem_heap_t** heap, /* in/out: heap for rec_get_offsets() */ mtr_t* 
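The keep_other_fields_on_keyread path above copies each templated column out of the cached row and then merges just that column's NULL bit into the output buffer with buf ^= (buf ^ cached) & mask. That is the classic branch-free "copy only the masked bits of src into dst" idiom, shown here in isolation:

#include <assert.h>

/* Copy exactly the bits selected by 'mask' from 'src' into 'dst',
leaving the other bits of 'dst' untouched. */
static unsigned char
merge_bits(unsigned char dst, unsigned char src, unsigned char mask)
{
        dst ^= (dst ^ src) & mask;
        /* equivalent to: dst = (dst & ~mask) | (src & mask); */
        return(dst);
}

int
main(void)
{
        assert(merge_bits(0xF0, 0x0F, 0x01) == 0xF1);
        assert(merge_bits(0xF0, 0x0F, 0x80) == 0x70);
        return(0);
}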
mtr) /* in: started mtr */ { dict_index_t* index = prebuilt->index; @@ -2806,13 +2993,17 @@ row_sel_try_search_shortcut_for_mysql( /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ - - if (!lock_clust_rec_cons_read_sees(rec, index, trx->read_view)) { + + *offsets = rec_get_offsets(rec, index, *offsets, + ULINT_UNDEFINED, heap); + + if (!lock_clust_rec_cons_read_sees(rec, index, + *offsets, trx->read_view)) { return(SEL_RETRY); } - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, index->table->comp)) { return(SEL_EXHAUSTED); } @@ -2865,12 +3056,7 @@ row_search_for_mysql( rec_t* index_rec; rec_t* clust_rec; rec_t* old_vers; - ulint err = DB_SUCCESS; - ibool moved; - ibool cons_read_requires_clust_rec; - ibool was_lock_wait; - ulint ret; - ulint shortcut; + ulint err = DB_SUCCESS; ibool unique_search = FALSE; ibool unique_search_from_clust_index = FALSE; ibool mtr_has_extra_clust_latch = FALSE; @@ -2880,15 +3066,22 @@ row_search_for_mysql( locking SELECT, and the isolation level is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */ - ibool success; +#ifdef UNIV_SEARCH_DEBUG ulint cnt = 0; +#endif /* UNIV_SEARCH_DEBUG */ ulint next_offs; + ibool same_user_rec; mtr_t mtr; - + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + ut_ad(index && pcur && search_tuple); ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); - if (prebuilt->table->ibd_file_missing) { + if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error:\n" "InnoDB: MySQL is trying to use a table handle but the .ibd file for\n" @@ -2902,7 +3095,7 @@ row_search_for_mysql( return(DB_ERROR); } - if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) { fprintf(stderr, "InnoDB: Error: trying to free a corrupt\n" "InnoDB: table handle. Magic n %lu, table name ", @@ -2916,7 +3109,7 @@ row_search_for_mysql( } if (trx->n_mysql_tables_in_use == 0 - && prebuilt->select_lock_type == LOCK_NONE) { + && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) { /* Note that if MySQL uses an InnoDB temp table that it created inside LOCK TABLES, then n_mysql_tables_in_use can be zero; in that case select_lock_type is set to LOCK_X in @@ -2939,8 +3132,8 @@ row_search_for_mysql( /* PHASE 0: Release a possible s-latch we are holding on the adaptive hash index latch if there is someone waiting behind */ - if (trx->has_search_latch - && btr_search_latch.writer != RW_LOCK_NOT_LOCKED) { + if (UNIV_UNLIKELY(btr_search_latch.writer != RW_LOCK_NOT_LOCKED) + && trx->has_search_latch) { /* There is an x-latch request on the adaptive hash index: release the s-latch to reduce starvation and wait for @@ -2953,10 +3146,20 @@ row_search_for_mysql( trx->search_latch_timeout = BTR_SEA_TIMEOUT; } + /* Reset the new record lock info if we srv_locks_unsafe_for_binlog + is set. Then we are able to remove the record locks set here on an + individual row. 
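row_search_for_mysql now declares a small on-stack offsets array, records its capacity in the first slot (*offsets_ = (sizeof offsets_) / sizeof *offsets_), and lets rec_get_offsets() switch to a memory heap only when a record has more fields than the stack array can describe. A generic standalone sketch of that small-buffer-with-heap-fallback idiom, using malloc in place of a mem_heap and invented names:

#include <stdlib.h>
#include <string.h>

/* Produce n_needed values; reuse the caller's buffer (capacity stored
in buf[0]) when it is big enough, otherwise allocate a larger one.
The caller frees the result only if it differs from its own buffer. */
static size_t*
get_offsets(size_t* buf, size_t n_needed)
{
        if (buf == NULL || buf[0] < n_needed + 1) {
                buf = malloc((n_needed + 1) * sizeof *buf);
                if (buf == NULL) {
                        return(NULL);
                }
                buf[0] = n_needed + 1;  /* remember the new capacity */
        }

        /* ... fill buf[1..n_needed] with the field offsets ... */
        memset(buf + 1, 0, n_needed * sizeof *buf);

        return(buf);
}

/* A call site mirroring the offsets_[] idiom used in the patch: */
static void
caller(void)
{
        size_t  offsets_[16];
        size_t* offsets = offsets_;

        offsets_[0] = sizeof offsets_ / sizeof *offsets_;

        offsets = get_offsets(offsets, 100);    /* too wide: heap used */

        if (offsets != offsets_) {
                free(offsets);
        }
}

The payoff is that the common case (records with few columns) never touches the allocator, while wide records still work.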
*/ + + if (srv_locks_unsafe_for_binlog + && prebuilt->select_lock_type != LOCK_NONE) { + + trx_reset_new_rec_lock_info(trx); + } + /*-------------------------------------------------------------*/ /* PHASE 1: Try to pop the row from the prefetch cache */ - if (direction == 0) { + if (UNIV_UNLIKELY(direction == 0)) { trx->op_info = "starting index read"; prebuilt->n_rows_fetched = 0; @@ -2974,8 +3177,8 @@ row_search_for_mysql( prebuilt->fetch_direction = direction; } - if (direction != prebuilt->fetch_direction) { - if (prebuilt->n_fetch_cached > 0) { + if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) { + if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) { ut_error; /* TODO: scrollable cursor: restore cursor to the place of the latest returned row, @@ -2987,15 +3190,14 @@ row_search_for_mysql( prebuilt->n_fetch_cached = 0; prebuilt->fetch_cache_first = 0; - } else if (prebuilt->n_fetch_cached > 0) { + } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) { row_sel_pop_cached_row_for_mysql(buf, prebuilt); prebuilt->n_rows_fetched++; srv_n_rows_read++; - trx->op_info = ""; - - return(DB_SUCCESS); + err = DB_SUCCESS; + goto func_exit; } if (prebuilt->fetch_cache_first > 0 @@ -3004,9 +3206,9 @@ row_search_for_mysql( /* The previous returned row was popped from the fetch cache, but the cache was not full at the time of the popping: no more rows can exist in the result set */ - - trx->op_info = ""; - return(DB_RECORD_NOT_FOUND); + + err = DB_RECORD_NOT_FOUND; + goto func_exit; } prebuilt->n_rows_fetched++; @@ -3048,10 +3250,11 @@ row_search_for_mysql( 1 column. Return immediately if this is not a HANDLER command. */ - if (direction != 0 && !prebuilt->used_in_HANDLER) { + if (UNIV_UNLIKELY(direction != 0 && + !prebuilt->used_in_HANDLER)) { - trx->op_info = ""; - return(DB_RECORD_NOT_FOUND); + err = DB_RECORD_NOT_FOUND; + goto func_exit; } } @@ -3066,9 +3269,9 @@ row_search_for_mysql( cannot use the adaptive hash index in a search in the case the row may be long and there may be externally stored fields */ - if (unique_search + if (UNIV_UNLIKELY(direction == 0) + && unique_search && index->type & DICT_CLUSTERED - && direction == 0 && !prebuilt->templ_contains_blob && !prebuilt->used_in_HANDLER && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) { @@ -3100,14 +3303,15 @@ row_search_for_mysql( trx->has_search_latch = TRUE; } #endif - shortcut = row_sel_try_search_shortcut_for_mysql(&rec, - prebuilt, &mtr); - if (shortcut == SEL_FOUND) { + switch (row_sel_try_search_shortcut_for_mysql(&rec, + prebuilt, &offsets, &heap, &mtr)) { + case SEL_FOUND: #ifdef UNIV_SEARCH_DEBUG - ut_a(0 == cmp_dtuple_rec(search_tuple, rec)); + ut_a(0 == cmp_dtuple_rec(search_tuple, + rec, offsets)); #endif if (!row_sel_store_mysql_rec(buf, prebuilt, - rec)) { + rec, offsets)) { err = DB_TOO_BIG_RECORD; /* We let the main loop to do the @@ -3131,15 +3335,12 @@ row_search_for_mysql( trx->has_search_latch = FALSE; } - trx->op_info = ""; - /* NOTE that we do NOT store the cursor position */ + err = DB_SUCCESS; + goto func_exit; - return(DB_SUCCESS); - - } else if (shortcut == SEL_EXHAUSTED) { - + case SEL_EXHAUSTED: mtr_commit(&mtr); /* ut_print_name(stderr, index->name); @@ -3154,12 +3355,11 @@ row_search_for_mysql( trx->has_search_latch = FALSE; } - trx->op_info = ""; - /* NOTE that we do NOT store the cursor position */ - return(DB_RECORD_NOT_FOUND); + err = DB_RECORD_NOT_FOUND; + goto func_exit; } shortcut_fails_too_big_rec: mtr_commit(&mtr); @@ -3183,6 +3383,7 @@ shortcut_fails_too_big_rec: /* Scan the MySQL query 
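Phase 1 above serves rows straight out of the prefetch cache, tracked by n_fetch_cached and fetch_cache_first, before any B-tree work is done. A much-reduced sketch of that bookkeeping with a fixed-size cache; the structure and constant names are invented for illustration:

#include <string.h>

#define ROW_LEN         128
#define CACHE_SIZE      16

struct fetch_cache {
        unsigned char   rows[CACHE_SIZE][ROW_LEN];
        unsigned        first;          /* next row to hand out */
        unsigned        n_cached;       /* rows still waiting */
};

/* Returns 1 and copies a row into 'buf' if the cache is non-empty;
returns 0 when the caller must go to the index instead. */
static int
fetch_cache_pop(struct fetch_cache* c, unsigned char* buf)
{
        if (c->n_cached == 0) {
                return(0);
        }

        memcpy(buf, c->rows[c->first], ROW_LEN);
        c->first++;
        c->n_cached--;

        if (c->n_cached == 0) {
                c->first = 0;   /* drained: refill from slot 0 */
        }

        return(1);
}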
string; check if SELECT is the first word there */ + ibool success; dict_accept(*trx->mysql_query_str, "SELECT", &success); @@ -3198,7 +3399,7 @@ shortcut_fails_too_big_rec: naturally moves upward (in fetch next) in alphabetical order, otherwise downward */ - if (direction == 0) { + if (UNIV_UNLIKELY(direction == 0)) { if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) { moves_up = TRUE; } @@ -3212,10 +3413,10 @@ shortcut_fails_too_big_rec: clust_index = dict_table_get_first_index(index->table); - if (direction != 0) { - moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, - moves_up, &mtr); - if (!moved) { + if (UNIV_LIKELY(direction != 0)) { + if (!sel_restore_position_for_mysql(&same_user_rec, + BTR_SEARCH_LEAF, + pcur, moves_up, &mtr)) { goto next_rec; } @@ -3256,11 +3457,13 @@ shortcut_fails_too_big_rec: trx_assign_read_view(trx); prebuilt->sql_stat_start = FALSE; } else { + ulint lock_mode; if (prebuilt->select_lock_type == LOCK_S) { - err = lock_table(0, index->table, LOCK_IS, thr); + lock_mode = LOCK_IS; } else { - err = lock_table(0, index->table, LOCK_IX, thr); + lock_mode = LOCK_IX; } + err = lock_table(0, index->table, lock_mode, thr); if (err != DB_SUCCESS) { @@ -3274,6 +3477,8 @@ rec_loop: /* PHASE 4: Look for matching records in a loop */ rec = btr_pcur_get_rec(pcur); + ut_ad(!!page_rec_is_comp(rec) == index->table->comp); +#ifdef UNIV_SEARCH_DEBUG /* fputs("Using ", stderr); dict_index_name_print(stderr, index); @@ -3281,7 +3486,9 @@ rec_loop: buf_frame_get_page_no(buf_frame_align(rec))); rec_print(rec); */ - if (rec == page_get_infimum_rec(buf_frame_align(rec))) { +#endif /* UNIV_SEARCH_DEBUG */ + + if (page_rec_is_infimum(rec)) { /* The infimum record on a page cannot be in the result set, and neither can a record lock be placed on it: we skip such @@ -3290,10 +3497,11 @@ rec_loop: goto next_rec; } - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + if (page_rec_is_supremum(rec)) { - if (prebuilt->select_lock_type != LOCK_NONE - && set_also_gap_locks) { + if (set_also_gap_locks + && !srv_locks_unsafe_for_binlog + && prebuilt->select_lock_type != LOCK_NONE) { /* Try to place a lock on the index record */ @@ -3301,16 +3509,16 @@ rec_loop: we do not lock gaps. Supremum record is really a gap and therefore we do not set locks there. 
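The corruption check above refuses to follow a stored next-record offset that points below the supremum system record or into the page directory at the end of the page. A standalone sketch of that bounds test; the constants are stand-ins for UNIV_PAGE_SIZE, PAGE_DIR and the supremum offsets, not their real values:

#include <stddef.h>

#define PAGE_SIZE       16384UL
#define DIR_RESERVED    36UL    /* stand-in for the directory/trailer */
#define MIN_USER_OFFS   128UL   /* stand-in for the supremum offset */

/* Returns nonzero if a next-record offset read from the page looks
sane enough to follow; otherwise the caller reports corruption. */
static int
next_offs_is_sane(unsigned long next_offs)
{
        if (next_offs < MIN_USER_OFFS) {
                return(0);      /* points before the system records */
        }

        if (next_offs >= PAGE_SIZE - DIR_RESERVED) {
                return(0);      /* points into the directory/trailer */
        }

        return(1);
}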
*/ - if (srv_locks_unsafe_for_binlog == FALSE) { - err = sel_set_rec_lock(rec, index, - prebuilt->select_lock_type, - LOCK_ORDINARY, thr); - if (err != DB_SUCCESS) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(rec, index, offsets, + prebuilt->select_lock_type, + LOCK_ORDINARY, thr); - goto lock_wait_or_error; - } - } + if (err != DB_SUCCESS) { + goto lock_wait_or_error; + } } /* A page supremum record cannot be in the result set: skip it now that we have placed a possible lock on it */ @@ -3322,10 +3530,19 @@ rec_loop: /* Do sanity checks in case our cursor has bumped into page corruption */ - next_offs = rec_get_next_offs(rec); - - if (next_offs >= UNIV_PAGE_SIZE || next_offs < PAGE_SUPREMUM) { - + if (page_rec_is_comp(rec)) { + next_offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) { + goto wrong_offs; + } + } else { + next_offs = rec_get_next_offs(rec, FALSE); + if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) { + goto wrong_offs; + } + } + if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) { + wrong_offs: if (srv_force_recovery == 0 || moves_up == FALSE) { ut_print_timestamp(stderr); buf_page_print(buf_frame_align(rec)); @@ -3338,7 +3555,7 @@ rec_loop: fprintf(stderr, "InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n" "InnoDB: ", - (ulong) (rec - buf_frame_align(rec)), + (ulong) ut_align_offset(rec, UNIV_PAGE_SIZE), (ulong) next_offs, (ulong) buf_frame_get_page_no(rec)); dict_index_name_print(stderr, trx, index); @@ -3356,7 +3573,7 @@ rec_loop: fprintf(stderr, "InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n" "InnoDB: ", - (ulong) (rec - buf_frame_align(rec)), + (ulong) ut_align_offset(rec, UNIV_PAGE_SIZE), (ulong) next_offs, (ulong) buf_frame_get_page_no(rec)); dict_index_name_print(stderr, trx, index); @@ -3369,13 +3586,15 @@ rec_loop: } } - if (srv_force_recovery > 0) { - if (!rec_validate(rec) || !btr_index_rec_validate(rec, index, - FALSE)) { + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (UNIV_UNLIKELY(srv_force_recovery > 0)) { + if (!rec_validate(rec, offsets) + || !btr_index_rec_validate(rec, index, FALSE)) { fprintf(stderr, "InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n" "InnoDB: ", - (ulong) (rec - buf_frame_align(rec)), + (ulong) ut_align_offset(rec, UNIV_PAGE_SIZE), (ulong) next_offs, (ulong) buf_frame_get_page_no(rec)); dict_index_name_print(stderr, trx, index); @@ -3399,31 +3618,29 @@ rec_loop: /* fputs("Comparing rec and search tuple\n", stderr); */ - if (0 != cmp_dtuple_rec(search_tuple, rec)) { + if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) { - if (prebuilt->select_lock_type != LOCK_NONE - && set_also_gap_locks) { + if (set_also_gap_locks + && !srv_locks_unsafe_for_binlog + && prebuilt->select_lock_type != LOCK_NONE) { /* Try to place a gap lock on the index record only if innodb_locks_unsafe_for_binlog option is not set */ - if (srv_locks_unsafe_for_binlog == FALSE) { - - err = sel_set_rec_lock(rec, index, + err = sel_set_rec_lock(rec, index, offsets, prebuilt->select_lock_type, LOCK_GAP, thr); - if (err != DB_SUCCESS) { - goto lock_wait_or_error; - } - } + if (err != DB_SUCCESS) { + goto lock_wait_or_error; + } } btr_pcur_store_position(pcur, &mtr); - ret = DB_RECORD_NOT_FOUND; + err = DB_RECORD_NOT_FOUND; /* ut_print_name(stderr, index->name); fputs(" record not found 3\n", stderr); */ @@ -3432,70 +3649,86 @@ rec_loop: } else if (match_mode == 
ROW_SEL_EXACT_PREFIX) { - if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) { + if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) { - if (prebuilt->select_lock_type != LOCK_NONE - && set_also_gap_locks) { + if (set_also_gap_locks + && !srv_locks_unsafe_for_binlog + && prebuilt->select_lock_type != LOCK_NONE) { /* Try to place a gap lock on the index record only if innodb_locks_unsafe_for_binlog option is not set */ - if (srv_locks_unsafe_for_binlog == FALSE) { - - err = sel_set_rec_lock(rec, index, + err = sel_set_rec_lock(rec, index, offsets, prebuilt->select_lock_type, LOCK_GAP, thr); - if (err != DB_SUCCESS) { - goto lock_wait_or_error; - } - } + if (err != DB_SUCCESS) { + goto lock_wait_or_error; + } } btr_pcur_store_position(pcur, &mtr); - ret = DB_RECORD_NOT_FOUND; + err = DB_RECORD_NOT_FOUND; /* ut_print_name(stderr, index->name); fputs(" record not found 4\n", stderr); */ goto normal_return; } } - + /* We are ready to look at a possible new index entry in the result set: the cursor is now placed on a user record */ - cons_read_requires_clust_rec = FALSE; - if (prebuilt->select_lock_type != LOCK_NONE) { /* Try to place a lock on the index record; note that delete marked records are a special case in a unique search. If there is a non-delete marked record, then it is enough to lock its existence with LOCK_REC_NOT_GAP. */ + /* If innodb_locks_unsafe_for_binlog option is used, + we lock only the record, i.e., next-key locking is + not used. */ + + ulint lock_type; + if (!set_also_gap_locks - || (unique_search && !rec_get_deleted_flag(rec))) { - err = sel_set_rec_lock(rec, index, - prebuilt->select_lock_type, - LOCK_REC_NOT_GAP, thr); + || srv_locks_unsafe_for_binlog + || (unique_search && !UNIV_UNLIKELY(rec_get_deleted_flag( + rec, page_rec_is_comp(rec))))) { + + goto no_gap_lock; } else { - /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is - not used. */ + lock_type = LOCK_ORDINARY; + } - if (srv_locks_unsafe_for_binlog) { - err = sel_set_rec_lock(rec, index, - prebuilt->select_lock_type, - LOCK_REC_NOT_GAP, thr); - } else { - err = sel_set_rec_lock(rec, index, - prebuilt->select_lock_type, - LOCK_ORDINARY, thr); - } + /* If we are doing a 'greater or equal than a primary key + value' search from a clustered index, and we find a record + that has that exact primary key value, then there is no need + to lock the gap before the record, because no insert in the + gap can be in our search range. That is, no phantom row can + appear that way. + + An example: if col1 is the primary key, the search is WHERE + col1 >= 100, and we find a record where col1 = 100, then no + need to lock the gap before that record. 
*/ + + if (index == clust_index + && mode == PAGE_CUR_GE + && direction == 0 + && dtuple_get_n_fields_cmp(search_tuple) + == dict_index_get_n_unique(index) + && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) { +no_gap_lock: + lock_type = LOCK_REC_NOT_GAP; } - + + err = sel_set_rec_lock(rec, index, offsets, + prebuilt->select_lock_type, + lock_type, thr); + if (err != DB_SUCCESS) { goto lock_wait_or_error; @@ -3516,13 +3749,14 @@ rec_loop: high force recovery level set, we try to avoid crashes by skipping this lookup */ - if (srv_force_recovery < 5 + if (UNIV_LIKELY(srv_force_recovery < 5) && !lock_clust_rec_cons_read_sees(rec, index, - trx->read_view)) { + offsets, trx->read_view)) { err = row_sel_build_prev_vers_for_mysql( trx->read_view, clust_index, prebuilt, rec, + &offsets, &heap, &old_vers, &mtr); if (err != DB_SUCCESS) { @@ -3546,16 +3780,31 @@ rec_loop: have to look also into the clustered index: this is necessary, because we can only get the undo information via the clustered index record. */ - - cons_read_requires_clust_rec = TRUE; + + /* Get the clustered index record if needed */ + index_rec = rec; + ut_ad(index != clust_index); + + goto requires_clust_rec; } } - if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) { + if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, page_rec_is_comp(rec)))) { /* The record is delete-marked: we can skip it if this is not a consistent read which might see an earlier version of a non-clustered index record */ + + if (srv_locks_unsafe_for_binlog + && prebuilt->select_lock_type != LOCK_NONE) { + + /* No need to keep a lock on a delete-marked record + if we do not want to use next-key locking. */ + + row_unlock_for_mysql(prebuilt, TRUE); + + trx_reset_new_rec_lock_info(trx); + } goto next_rec; } @@ -3565,8 +3814,15 @@ rec_loop: index_rec = rec; - if (index != clust_index && (cons_read_requires_clust_rec - || prebuilt->need_to_access_clustered)) { + if (index != clust_index && prebuilt->need_to_access_clustered) { + +requires_clust_rec: + /* Before and after this "if" block, "offsets" will be + related to "rec", which may be in a secondary index "index" or + the clustered index ("clust_index"). However, after this + "if" block, "rec" may be pointing to + "clust_rec" of "clust_index". */ + ut_ad(rec_offs_validate(rec, index, offsets)); /* It was a non-clustered index and we must fetch also the clustered index record */ @@ -3574,7 +3830,8 @@ rec_loop: mtr_has_extra_clust_latch = TRUE; err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec, - thr, &clust_rec, &mtr); + thr, &clust_rec, + &offsets, &heap, &mtr); if (err != DB_SUCCESS) { goto lock_wait_or_error; @@ -3587,21 +3844,42 @@ rec_loop: goto next_rec; } - if (rec_get_deleted_flag(clust_rec)) { + if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, + page_rec_is_comp(clust_rec)))) { /* The record is delete marked: we can skip it */ + if (srv_locks_unsafe_for_binlog + && prebuilt->select_lock_type != LOCK_NONE) { + + /* No need to keep a lock on a delete-marked + record if we do not want to use next-key + locking. */ + + row_unlock_for_mysql(prebuilt, TRUE); + + trx_reset_new_rec_lock_info(trx); + } + goto next_rec; } if (prebuilt->need_to_access_clustered) { rec = clust_rec; + ut_ad(rec_offs_validate(rec, clust_index, offsets)); + } else { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); } } /* We found a qualifying row */ - - if (prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD + ut_ad(rec_offs_validate(rec, + rec == clust_rec ? 
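The locking branch above chooses between a next-key lock (LOCK_ORDINARY) and a plain record lock (LOCK_REC_NOT_GAP): gap protection is skipped when gap locking is off, when innodb_locks_unsafe_for_binlog is set, when a unique search found a non-delete-marked match, or when a >= search on the clustered index hit the exact unique key value, since no phantom can then appear in the gap. A compact standalone sketch of that decision; the context structure and enum names are simplified stand-ins:

enum row_lock_type {
        LOCK_NEXT_KEY,          /* record plus the gap before it */
        LOCK_REC_ONLY           /* record only */
};

struct search_ctx {
        int     set_also_gap_locks;        /* gap locking enabled at all */
        int     locks_unsafe_for_binlog;   /* record-only locks requested */
        int     unique_search;             /* complete unique key given */
        int     rec_deleted;               /* candidate is delete-marked */
        int     exact_match_on_unique_pk;  /* GE search hit the exact PK */
};

static enum row_lock_type
choose_lock_type(const struct search_ctx* c)
{
        if (!c->set_also_gap_locks
            || c->locks_unsafe_for_binlog
            || (c->unique_search && !c->rec_deleted)
            || c->exact_match_on_unique_pk) {

                /* No insert into the gap can fall inside the search
                range, or gap locking is disabled: lock the record only. */
                return(LOCK_REC_ONLY);
        }

        return(LOCK_NEXT_KEY);  /* protect the preceding gap as well */
}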
clust_index : index, + offsets)); + + if ((match_mode == ROW_SEL_EXACT + || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD) && prebuilt->select_lock_type == LOCK_NONE && !prebuilt->templ_contains_blob && !prebuilt->clust_index_was_generated @@ -3618,7 +3896,7 @@ rec_loop: not cache rows because there the cursor is a scrollable cursor. */ - row_sel_push_cache_row_for_mysql(prebuilt, rec); + row_sel_push_cache_row_for_mysql(prebuilt, rec, offsets); if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) { @@ -3628,11 +3906,13 @@ rec_loop: goto next_rec; } else { if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) { - ut_memcpy(buf + 4, rec - rec_get_extra_size(rec), - rec_get_size(rec)); - mach_write_to_4(buf, rec_get_extra_size(rec) + 4); + memcpy(buf + 4, rec - rec_offs_extra_size(offsets), + rec_offs_size(offsets)); + mach_write_to_4(buf, + rec_offs_extra_size(offsets) + 4); } else { - if (!row_sel_store_mysql_rec(buf, prebuilt, rec)) { + if (!row_sel_store_mysql_rec(buf, prebuilt, + rec, offsets)) { err = DB_TOO_BIG_RECORD; goto lock_wait_or_error; @@ -3640,8 +3920,13 @@ rec_loop: } if (prebuilt->clust_index_was_generated) { + if (rec != index_rec) { + offsets = rec_get_offsets( + index_rec, index, offsets, + ULINT_UNDEFINED, &heap); + } row_sel_store_row_id_to_prebuilt(prebuilt, index_rec, - index); + index, offsets); } } got_row: @@ -3661,15 +3946,15 @@ got_row: btr_pcur_store_position(pcur, &mtr); } - ret = DB_SUCCESS; + err = DB_SUCCESS; goto normal_return; next_rec: /*-------------------------------------------------------------*/ /* PHASE 5: Move the cursor to the next index record */ - - if (mtr_has_extra_clust_latch) { + + if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) { /* We must commit mtr if we are moving to the next non-clustered index record, because we could break the latching order if we would access a different clustered @@ -3681,34 +3966,39 @@ next_rec: mtr_has_extra_clust_latch = FALSE; mtr_start(&mtr); - moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, - moves_up, &mtr); - if (moved) { + if (sel_restore_position_for_mysql(&same_user_rec, + BTR_SEARCH_LEAF, + pcur, moves_up, &mtr)) { +#ifdef UNIV_SEARCH_DEBUG cnt++; +#endif /* UNIV_SEARCH_DEBUG */ goto rec_loop; } } if (moves_up) { - moved = btr_pcur_move_to_next(pcur, &mtr); - } else { - moved = btr_pcur_move_to_prev(pcur, &mtr); - } + if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) { + not_moved: + btr_pcur_store_position(pcur, &mtr); - if (!moved) { - btr_pcur_store_position(pcur, &mtr); + if (match_mode != 0) { + err = DB_RECORD_NOT_FOUND; + } else { + err = DB_END_OF_INDEX; + } - if (match_mode != 0) { - ret = DB_RECORD_NOT_FOUND; - } else { - ret = DB_END_OF_INDEX; + goto normal_return; + } + } else { + if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) { + goto not_moved; } - - goto normal_return; } +#ifdef UNIV_SEARCH_DEBUG cnt++; +#endif /* UNIV_SEARCH_DEBUG */ goto rec_loop; @@ -3726,24 +4016,50 @@ lock_wait_or_error: que_thr_stop_for_mysql(thr); - was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL); - - if (was_lock_wait) { + thr->lock_state = QUE_THR_LOCK_ROW; + + if (row_mysql_handle_errors(&err, trx, thr, NULL)) { + /* It was a lock wait, and it ended */ + + thr->lock_state = QUE_THR_LOCK_NOLOCK; mtr_start(&mtr); - sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, - moves_up, &mtr); + sel_restore_position_for_mysql(&same_user_rec, + BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + if (srv_locks_unsafe_for_binlog && !same_user_rec) { + /* Since we were not able 
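The ROW_MYSQL_DUMMY_TEMPLATE branch above copies the raw record, extra (header) bytes first, into buf + 4 and stores the header length plus four in the leading four bytes. A sketch of that layout, assuming a record whose origin pointer sits between its header bytes and its data bytes; the byte-store helper below stands in for mach_write_to_4():

#include <string.h>

/* Serialize a record whose 'origin' points at its first data byte and
whose 'extra_size' header bytes lie immediately before it. Resulting
layout of 'buf': [4-byte header length][extra bytes][data bytes]. */
static void
copy_physical_rec(unsigned char* buf,
                  const unsigned char* origin,
                  unsigned long extra_size,
                  unsigned long data_size)
{
        unsigned long   hdr = extra_size + 4;

        /* Store a 32-bit value most-significant byte first, the byte
        order InnoDB uses for such counters. */
        buf[0] = (unsigned char) (hdr >> 24);
        buf[1] = (unsigned char) (hdr >> 16);
        buf[2] = (unsigned char) (hdr >> 8);
        buf[3] = (unsigned char) (hdr);

        memcpy(buf + 4, origin - extra_size, extra_size + data_size);
}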
to restore the cursor + on the same user record, we cannot use + row_unlock_for_mysql() to unlock any records, and + we must thus reset the new rec lock info. Since + in lock0lock.c we have blocked the inheriting of gap + X-locks, we actually do not have any new record locks + set in this case. + + Note that if we were able to restore on the 'same' + user record, it is still possible that we were actually + waiting on a delete-marked record, and meanwhile + it was removed by purge and inserted again by some + other user. But that is no problem, because in + rec_loop we will again try to set a lock, and + new_rec_lock_info in trx will be right at the end. */ + + trx_reset_new_rec_lock_info(trx); + } + mode = pcur->search_mode; goto rec_loop; } + thr->lock_state = QUE_THR_LOCK_NOLOCK; + +#ifdef UNIV_SEARCH_DEBUG /* fputs("Using ", stderr); dict_index_name_print(stderr, index); fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ - trx->op_info = ""; - - return(err); +#endif /* UNIV_SEARCH_DEBUG */ + goto func_exit; normal_return: /*-------------------------------------------------------------*/ @@ -3754,19 +4070,29 @@ normal_return: if (prebuilt->n_fetch_cached > 0) { row_sel_pop_cached_row_for_mysql(buf, prebuilt); - ret = DB_SUCCESS; + err = DB_SUCCESS; } +#ifdef UNIV_SEARCH_DEBUG /* fputs("Using ", stderr); dict_index_name_print(stderr, index); fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ - if (ret == DB_SUCCESS) { +#endif /* UNIV_SEARCH_DEBUG */ + if (err == DB_SUCCESS) { srv_n_rows_read++; } - trx->op_info = ""; +func_exit: + /* Restore a global read view back to transaction. This forces + MySQL always to set cursor view before fetch if it is used. */ - return(ret); + trx->read_view = trx->global_read_view; + + trx->op_info = ""; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); } /*********************************************************************** @@ -3815,7 +4141,8 @@ row_search_check_if_query_cache_permitted( && !trx->read_view) { trx->read_view = read_view_open_now(trx, - trx->read_view_heap); + trx->global_read_view_heap); + trx->global_read_view = trx->read_view; } } diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c index e16d696314b..0225a9faec5 100644 --- a/innobase/row/row0umod.c +++ b/innobase/row/row0umod.c @@ -52,19 +52,16 @@ row_undo_mod_undo_also_prev_vers( /* out: TRUE if also previous modify or insert of this row should be undone */ undo_node_t* node, /* in: row undo node */ - que_thr_t* thr, /* in: query thread */ dulint* undo_no)/* out: the undo number */ { trx_undo_rec_t* undo_rec; - ibool ret; trx_t* trx; - UT_NOT_USED(thr); - trx = node->trx; if (0 != ut_dulint_cmp(node->new_trx_id, trx->id)) { + *undo_no = ut_dulint_zero; return(FALSE); } @@ -72,13 +69,7 @@ row_undo_mod_undo_also_prev_vers( *undo_no = trx_undo_rec_get_undo_no(undo_rec); - if (ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0) { - ret = TRUE; - } else { - ret = FALSE; - } - - return(ret); + return(ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0); } /*************************************************************** @@ -214,7 +205,7 @@ row_undo_mod_clust( /* Check if also the previous version of the clustered index record should be undone in this same rollback operation */ - more_vers = row_undo_mod_undo_also_prev_vers(node, thr, &new_undo_no); + more_vers = row_undo_mod_undo_also_prev_vers(node, &new_undo_no); pcur = &(node->pcur); @@ -438,7 +429,7 @@ row_undo_mod_del_unmark_sec_and_undo_update( dtuple_print(stderr, entry); fputs("\n" 
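When a row lock cannot be granted, the code above parks the query thread, lets row_mysql_handle_errors() wait out the conflict, restores the persistent cursor and jumps back to rec_loop to retry the same row. A schematic sketch of that wait-and-retry control flow; the stub functions are placeholders, not the InnoDB calls:

enum { OK = 0, LOCK_WAIT = 1, FATAL = 2 };

/* Stubs standing in for the lock request, the wait handling and the
cursor restore done by the real code. */
static int
try_lock_row(void)
{
        static int attempts = 0;
        return(attempts++ == 0 ? LOCK_WAIT : OK);
}

static int wait_for_lock_or_fail(void)  { return OK; }
static int restore_cursor(void)         { return 1; }

static int
fetch_row_with_retry(void)
{
        for (;;) {
                int err = try_lock_row();

                if (err == OK) {
                        return(OK);     /* row locked: use it */
                }

                if (err != LOCK_WAIT
                    || wait_for_lock_or_fail() != OK) {
                        return(FATAL);  /* timeout, deadlock, ... */
                }

                /* The wait ended: reposition the cursor and retry the
                same row, as the goto rec_loop above does. */
                if (!restore_cursor()) {
                        return(FATAL);
                }
        }
}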
"InnoDB: record ", stderr); - rec_print(stderr, btr_pcur_get_rec(&pcur)); + rec_print(stderr, btr_pcur_get_rec(&pcur), index); putc('\n', stderr); trx_print(stderr, trx); fputs("\n" diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c index bc3cc8ea9f3..435c0279dbb 100644 --- a/innobase/row/row0undo.c +++ b/innobase/row/row0undo.c @@ -151,6 +151,10 @@ row_undo_search_clust_to_pcur( mtr_t mtr; ibool ret; rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; mtr_start(&mtr); @@ -161,8 +165,11 @@ row_undo_search_clust_to_pcur( rec = btr_pcur_get_rec(&(node->pcur)); + offsets = rec_get_offsets(rec, clust_index, offsets, + ULINT_UNDEFINED, &heap); + if (!found || 0 != ut_dulint_cmp(node->roll_ptr, - row_get_rec_roll_ptr(rec, clust_index))) { + row_get_rec_roll_ptr(rec, clust_index, offsets))) { /* We must remove the reservation on the undo log record BEFORE releasing the latch on the clustered index page: this @@ -175,7 +182,7 @@ row_undo_search_clust_to_pcur( ret = FALSE; } else { node->row = row_build(ROW_COPY_DATA, clust_index, rec, - node->heap); + offsets, node->heap); btr_pcur_store_position(&(node->pcur), &mtr); ret = TRUE; @@ -183,6 +190,9 @@ row_undo_search_clust_to_pcur( btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(ret); } diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c index 9192f6dc692..514fb6bd577 100644 --- a/innobase/row/row0upd.c +++ b/innobase/row/row0upd.c @@ -301,19 +301,20 @@ recovery. */ void row_upd_rec_sys_fields_in_recovery( /*===============================*/ - rec_t* rec, /* in: record */ - ulint pos, /* in: TRX_ID position in rec */ - dulint trx_id, /* in: transaction id */ - dulint roll_ptr)/* in: roll ptr of the undo log record */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint pos, /* in: TRX_ID position in rec */ + dulint trx_id, /* in: transaction id */ + dulint roll_ptr)/* in: roll ptr of the undo log record */ { byte* field; ulint len; - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); ut_ad(len == DATA_TRX_ID_LEN); trx_write_trx_id(field, trx_id); - field = rec_get_nth_field(rec, pos + 1, &len); + field = rec_get_nth_field(rec, offsets, pos + 1, &len); ut_ad(len == DATA_ROLL_PTR_LEN); trx_write_roll_ptr(field, roll_ptr); } @@ -361,8 +362,8 @@ row_upd_changes_field_size_or_external( /* out: TRUE if the update changes the size of some field in index or the field is external in rec or update */ - rec_t* rec, /* in: record in index */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update) /* in: update vector */ { upd_field_t* upd_field; @@ -372,6 +373,7 @@ row_upd_changes_field_size_or_external( ulint n_fields; ulint i; + ut_ad(rec_offs_validate(NULL, index, offsets)); n_fields = upd_get_n_fields(update); for (i = 0; i < n_fields; i++) { @@ -380,7 +382,7 @@ row_upd_changes_field_size_or_external( new_val = &(upd_field->new_val); new_len = new_val->len; - if (new_len == UNIV_SQL_NULL) { + if (new_len == UNIV_SQL_NULL && !rec_offs_comp(offsets)) { /* A bug fixed on Dec 31st, 2004: we looked at the SQL NULL size from the wrong field! We may backport this fix also to 4.0. 
The merge to 5.0 will be made @@ -391,14 +393,14 @@ row_upd_changes_field_size_or_external( upd_field->field_no)); } - old_len = rec_get_nth_field_size(rec, upd_field->field_no); - + old_len = rec_offs_nth_size(offsets, upd_field->field_no); + if (old_len != new_len) { return(TRUE); } - if (rec_get_nth_field_extern_bit(rec, upd_field->field_no)) { + if (rec_offs_nth_extern(offsets, upd_field->field_no)) { return(TRUE); } @@ -420,15 +422,18 @@ a clustered index */ void row_upd_rec_in_place( /*=================*/ - rec_t* rec, /* in/out: record where replaced */ - upd_t* update) /* in: update vector */ + rec_t* rec, /* in/out: record where replaced */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update) /* in: update vector */ { upd_field_t* upd_field; dfield_t* new_val; ulint n_fields; ulint i; - rec_set_info_bits(rec, update->info_bits); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + rec_set_info_bits(rec, rec_offs_comp(offsets), update->info_bits); n_fields = upd_get_n_fields(update); @@ -436,7 +441,7 @@ row_upd_rec_in_place( upd_field = upd_get_nth_field(update, i); new_val = &(upd_field->new_val); - rec_set_nth_field(rec, upd_field->field_no, + rec_set_nth_field(rec, offsets, upd_field->field_no, dfield_get_data(new_val), dfield_get_len(new_val)); } @@ -701,6 +706,9 @@ row_upd_build_sec_rec_difference_binary( upd_t* update; ulint n_diff; ulint i; + ulint offsets_[REC_OFFS_SMALL_SIZE]; + const ulint* offsets; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; /* This function is used only for a secondary index */ ut_a(0 == (index->type & DICT_CLUSTERED)); @@ -708,10 +716,12 @@ row_upd_build_sec_rec_difference_binary( update = upd_create(dtuple_get_n_fields(entry), heap); n_diff = 0; + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); for (i = 0; i < dtuple_get_n_fields(entry); i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); dfield = dtuple_get_nth_field(entry, i); @@ -774,6 +784,9 @@ row_upd_build_difference_binary( ulint trx_id_pos; ibool extern_bit; ulint i; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; /* This function is used only for a clustered index */ ut_a(index->type & DICT_CLUSTERED); @@ -785,9 +798,12 @@ row_upd_build_difference_binary( roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR); trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + for (i = 0; i < dtuple_get_n_fields(entry); i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); dfield = dtuple_get_nth_field(entry, i); @@ -799,9 +815,10 @@ row_upd_build_difference_binary( goto skip_compare; } - extern_bit = rec_get_nth_field_extern_bit(rec, i); + extern_bit = upd_ext_vec_contains(ext_vec, n_ext_vec, i); - if (extern_bit != upd_ext_vec_contains(ext_vec, n_ext_vec, i) + if (UNIV_UNLIKELY(extern_bit == + (ibool)!rec_offs_nth_extern(offsets, i)) || !dfield_data_is_binary_equal(dfield, len, data)) { upd_field = upd_get_nth_field(update, n_diff); @@ -810,12 +827,8 @@ row_upd_build_difference_binary( upd_field_set_field_no(upd_field, i, index, trx); - if (upd_ext_vec_contains(ext_vec, n_ext_vec, i)) { - upd_field->extern_storage = TRUE; - } else { - upd_field->extern_storage = FALSE; - } - + upd_field->extern_storage = extern_bit; + n_diff++; } skip_compare: @@ -1123,6 +1136,7 @@ void 
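row_upd_build_sec_rec_difference_binary() and row_upd_build_difference_binary() in the hunks above walk the fields of the new index entry, compare each one byte-for-byte against the stored record field, and put only the changed fields into the update vector. A standalone sketch of that "store only the diff" construction with plain arrays; the structures and names are invented for illustration:

#include <stddef.h>
#include <string.h>

struct field {
        const void*     data;
        size_t          len;
};

struct upd_field {
        size_t          field_no;
        struct field    new_val;
};

/* Fill 'upd' with the fields of 'entry' that differ byte-for-byte from
'rec'; returns the number of differing fields. */
static size_t
build_difference(struct upd_field* upd,
                 const struct field* rec,
                 const struct field* entry,
                 size_t n_fields)
{
        size_t  i;
        size_t  n_diff = 0;

        for (i = 0; i < n_fields; i++) {
                if (rec[i].len != entry[i].len
                    || (rec[i].len != 0
                        && memcmp(rec[i].data, entry[i].data,
                                  rec[i].len) != 0)) {

                        upd[n_diff].field_no = i;
                        upd[n_diff].new_val = entry[i];
                        n_diff++;
                }
        }

        return(n_diff);
}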
row_upd_copy_columns( /*=================*/ rec_t* rec, /* in: record in a clustered index */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ sym_node_t* column) /* in: first column in a column list, or NULL */ { @@ -1130,7 +1144,7 @@ row_upd_copy_columns( ulint len; while (column) { - data = rec_get_nth_field(rec, + data = rec_get_nth_field(rec, offsets, column->field_nos[SYM_CLUST_FIELD_NO], &len); eval_node_copy_and_alloc_val(column, data, len); @@ -1177,7 +1191,11 @@ row_upd_store_row( dict_index_t* clust_index; upd_t* update; rec_t* rec; - + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES); if (node->row != NULL) { @@ -1189,10 +1207,12 @@ row_upd_store_row( rec = btr_pcur_get_rec(node->pcur); - node->row = row_build(ROW_COPY_DATA, clust_index, rec, node->heap); - + offsets = rec_get_offsets(rec, clust_index, offsets_, + ULINT_UNDEFINED, &heap); + node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets, + node->heap); node->ext_vec = mem_heap_alloc(node->heap, sizeof(ulint) - * rec_get_n_fields(rec)); + * rec_offs_n_fields(offsets)); if (node->is_delete) { update = NULL; } else { @@ -1200,7 +1220,10 @@ row_upd_store_row( } node->n_ext_vec = btr_push_update_extern_fields(node->ext_vec, - rec, update); + offsets, update); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } } /*************************************************************** @@ -1244,7 +1267,7 @@ row_upd_sec_index_entry( rec = btr_cur_get_rec(btr_cur); - if (!found) { + if (UNIV_UNLIKELY(!found)) { fputs("InnoDB: error in sec index entry update in\n" "InnoDB: ", stderr); dict_index_name_print(stderr, trx, index); @@ -1253,7 +1276,7 @@ row_upd_sec_index_entry( dtuple_print(stderr, entry); fputs("\n" "InnoDB: record ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, index); putc('\n', stderr); trx_print(stderr, trx); @@ -1265,7 +1288,7 @@ row_upd_sec_index_entry( delete marked if we return after a lock wait in row_ins_index_entry below */ - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, index->table->comp)) { err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE, thr, &mtr); if (err == DB_SUCCESS && check_ref) { @@ -1353,7 +1376,7 @@ row_upd_clust_rec_by_insert( a foreign key constraint */ mtr_t* mtr) /* in: mtr; gets committed here */ { - mem_heap_t* heap; + mem_heap_t* heap = NULL; btr_pcur_t* pcur; btr_cur_t* btr_cur; trx_t* trx; @@ -1370,12 +1393,13 @@ row_upd_clust_rec_by_insert( btr_cur = btr_pcur_get_btr_cur(pcur); if (node->state != UPD_NODE_INSERT_CLUSTERED) { + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, btr_cur, TRUE, thr, mtr); if (err != DB_SUCCESS) { mtr_commit(mtr); - return(err); } @@ -1385,7 +1409,9 @@ row_upd_clust_rec_by_insert( record is removed from the index tree, or updated. */ btr_cur_mark_extern_inherited_fields(btr_cur_get_rec(btr_cur), - node->update, mtr); + rec_get_offsets(btr_cur_get_rec(btr_cur), + dict_table_get_first_index(table), offsets_, + ULINT_UNDEFINED, &heap), node->update, mtr); if (check_ref) { /* NOTE that the following call loses the position of pcur ! 
*/ @@ -1394,7 +1420,9 @@ row_upd_clust_rec_by_insert( index, thr, mtr); if (err != DB_SUCCESS) { mtr_commit(mtr); - + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(err); } } @@ -1403,10 +1431,11 @@ row_upd_clust_rec_by_insert( mtr_commit(mtr); + if (!heap) { + heap = mem_heap_create(500); + } node->state = UPD_NODE_INSERT_CLUSTERED; - heap = mem_heap_create(500); - entry = row_build_index_entry(node->row, index, heap); row_upd_index_replace_new_col_vals(entry, index, node->update, NULL); @@ -1458,7 +1487,8 @@ row_upd_clust_rec( pcur = node->pcur; btr_cur = btr_pcur_get_btr_cur(pcur); - ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + index->table->comp)); /* Try optimistic updating of the record, keeping changes within the page; we do not check locks because we assume the x-lock on the @@ -1494,7 +1524,8 @@ row_upd_clust_rec( ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); - ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + index->table->comp)); err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, &big_rec, node->update, @@ -1502,12 +1533,22 @@ row_upd_clust_rec( mtr_commit(mtr); if (err == DB_SUCCESS && big_rec) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_t* rec; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + mtr_start(mtr); + ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); - - err = btr_store_big_rec_extern_fields(index, - btr_cur_get_rec(btr_cur), - big_rec, mtr); + rec = btr_cur_get_rec(btr_cur); + err = btr_store_big_rec_extern_fields(index, rec, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + big_rec, mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } mtr_commit(mtr); } @@ -1591,7 +1632,12 @@ row_upd_clust_step( ulint err; mtr_t* mtr; mtr_t mtr_buf; - + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + index = dict_table_get_first_index(node->table); check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr)); @@ -1647,14 +1693,16 @@ row_upd_clust_step( } } + rec = btr_pcur_get_rec(pcur); + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + if (!node->has_clust_rec_x_lock) { err = lock_clust_rec_modify_check_and_lock(0, - btr_pcur_get_rec(pcur), - index, thr); + rec, index, offsets, thr); if (err != DB_SUCCESS) { mtr_commit(mtr); - - return(err); + goto exit_func; } } @@ -1663,14 +1711,14 @@ row_upd_clust_step( if (node->is_delete) { err = row_upd_del_mark_clust_rec(node, index, thr, check_ref, mtr); - if (err != DB_SUCCESS) { - - return(err); + if (err == DB_SUCCESS) { + node->state = UPD_NODE_UPDATE_ALL_SEC; + node->index = dict_table_get_next_index(index); + } + exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); } - - node->state = UPD_NODE_UPDATE_ALL_SEC; - node->index = dict_table_get_next_index(index); - return(err); } @@ -1680,16 +1728,18 @@ row_upd_clust_step( if (!node->in_mysql_interface) { /* Copy the necessary columns from clust_rec and calculate the new values to set */ - - row_upd_copy_columns(btr_pcur_get_rec(pcur), + row_upd_copy_columns(rec, offsets, UT_LIST_GET_FIRST(node->columns)); row_upd_eval_new_vals(node->update); } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { err = 
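row_upd_clust_rec() above first attempts an optimistic update within the page and, only if that fails, restores the cursor with BTR_MODIFY_TREE for a pessimistic update, afterwards writing any returned big_rec columns to external storage in a fresh mini-transaction. A schematic sketch of that escalation; the stub functions are placeholders, not the btr_cur calls:

enum { DB_SUCCESS_ = 0, DB_FAIL_ = 1 };

/* Stubs standing in for the optimistic/pessimistic update calls. */
static int try_update_in_place_on_page(void)   { return DB_FAIL_; }
static int update_with_tree_reorganization(int* has_big_rec)
                                { *has_big_rec = 1; return DB_SUCCESS_; }
static int store_overflow_columns_externally(void) { return DB_SUCCESS_; }

static int
update_clustered_record(void)
{
        int     err;
        int     has_big_rec = 0;

        /* Phase 1: try to fit the new version into the same page. */
        err = try_update_in_place_on_page();
        if (err == DB_SUCCESS_) {
                return(err);
        }

        /* Phase 2: latch the whole subtree and update pessimistically;
        columns that no longer fit come back as a "big record". */
        err = update_with_tree_reorganization(&has_big_rec);

        if (err == DB_SUCCESS_ && has_big_rec) {
                /* Phase 3: write the overflow columns to external pages
                in a separate mini-transaction, as the patch does after
                restoring the persistent cursor. */
                err = store_overflow_columns_externally();
        }

        return(err);
}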
row_upd_clust_rec(node, index, thr, mtr); - return(err); } @@ -1941,6 +1991,9 @@ row_upd_in_place_in_select( btr_pcur_t* pcur; btr_cur_t* btr_cur; ulint err; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; ut_ad(sel_node->select_will_do_update); ut_ad(sel_node->latch_mode == BTR_MODIFY_LEAF); @@ -1956,11 +2009,17 @@ row_upd_in_place_in_select( /* Copy the necessary columns from clust_rec and calculate the new values to set */ - row_upd_copy_columns(btr_pcur_get_rec(pcur), - UT_LIST_GET_FIRST(node->columns)); + row_upd_copy_columns(btr_pcur_get_rec(pcur), rec_get_offsets( + btr_pcur_get_rec(pcur), btr_cur->index, offsets_, + ULINT_UNDEFINED, &heap), + UT_LIST_GET_FIRST(node->columns)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } row_upd_eval_new_vals(node->update); - ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + btr_cur->index->table->comp)); ut_ad(node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE); ut_ad(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE); diff --git a/innobase/row/row0vers.c b/innobase/row/row0vers.c index bc17ede89e3..8e747423047 100644 --- a/innobase/row/row0vers.c +++ b/innobase/row/row0vers.c @@ -41,10 +41,12 @@ row_vers_impl_x_locked_off_kernel( transaction; NOTE that the kernel mutex is temporarily released! */ rec_t* rec, /* in: record in a secondary index */ - dict_index_t* index) /* in: the secondary index */ + dict_index_t* index, /* in: the secondary index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { dict_index_t* clust_index; rec_t* clust_rec; + ulint* clust_offsets; rec_t* version; rec_t* prev_version; dulint trx_id; @@ -55,10 +57,11 @@ row_vers_impl_x_locked_off_kernel( dtuple_t* entry = NULL; /* assignment to eliminate compiler warning */ trx_t* trx; - ibool vers_del; - ibool rec_del; + ulint vers_del; + ulint rec_del; ulint err; mtr_t mtr; + ulint comp; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); @@ -96,29 +99,33 @@ row_vers_impl_x_locked_off_kernel( return(NULL); } - trx_id = row_get_rec_trx_id(clust_rec, clust_index); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL, + ULINT_UNDEFINED, &heap); + trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets); mtr_s_lock(&(purge_sys->latch), &mtr); mutex_enter(&kernel_mutex); + trx = NULL; if (!trx_is_active(trx_id)) { /* The transaction that modified or inserted clust_rec is no longer active: no implicit lock on rec */ - - mtr_commit(&mtr); - - return(NULL); + goto exit_func; } - if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index, TRUE)) { + if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index, + clust_offsets, TRUE)) { /* Corruption noticed: try to avoid a crash by returning */ - - mtr_commit(&mtr); - - return(NULL); + goto exit_func; } + comp = page_rec_is_comp(rec); + ut_ad(index->table == clust_index->table); + ut_ad(!!comp == index->table->comp); + ut_ad(!comp == !page_rec_is_comp(clust_rec)); + /* We look up if some earlier version, which was modified by the trx_id transaction, of the clustered index record would require rec to be in a different state (delete marked or unmarked, or have different field @@ -128,11 +135,10 @@ row_vers_impl_x_locked_off_kernel( different state, then the trx_id transaction has not yet had time to modify rec, and does not necessarily have an implicit x-lock on rec. 
*/ - rec_del = rec_get_deleted_flag(rec); + rec_del = rec_get_deleted_flag(rec, comp); trx = NULL; version = clust_rec; - heap = NULL; for (;;) { mutex_exit(&kernel_mutex); @@ -146,18 +152,17 @@ row_vers_impl_x_locked_off_kernel( heap2 = heap; heap = mem_heap_create(1024); - err = trx_undo_prev_version_build(clust_rec, &mtr, version, - clust_index, heap, - &prev_version); - if (heap2) { - mem_heap_free(heap2); /* version was stored in heap2, - if heap2 != NULL */ - } + clust_index, clust_offsets, heap, + &prev_version); + mem_heap_free(heap2); /* free version and clust_offsets */ if (prev_version) { + clust_offsets = rec_get_offsets(prev_version, + clust_index, NULL, + ULINT_UNDEFINED, &heap); row = row_build(ROW_COPY_POINTERS, clust_index, - prev_version, heap); + prev_version, clust_offsets, heap); entry = row_build_index_entry(row, index, heap); } @@ -189,11 +194,11 @@ row_vers_impl_x_locked_off_kernel( if prev_version would require rec to be in a different state. */ - vers_del = rec_get_deleted_flag(prev_version); + vers_del = rec_get_deleted_flag(prev_version, comp); /* We check if entry and rec are identified in the alphabetical ordering */ - if (0 == cmp_dtuple_rec(entry, rec)) { + if (0 == cmp_dtuple_rec(entry, rec, offsets)) { /* The delete marks of rec and prev_version should be equal for rec to be in the state required by prev_version */ @@ -211,7 +216,7 @@ row_vers_impl_x_locked_off_kernel( dtuple_set_types_binary(entry, dtuple_get_n_fields(entry)); - if (0 != cmp_dtuple_rec(entry, rec)) { + if (0 != cmp_dtuple_rec(entry, rec, offsets)) { trx = trx_get_on_id(trx_id); @@ -226,7 +231,8 @@ row_vers_impl_x_locked_off_kernel( break; } - prev_trx_id = row_get_rec_trx_id(prev_version, clust_index); + prev_trx_id = row_get_rec_trx_id(prev_version, clust_index, + clust_offsets); if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) { /* The versions modified by the trx_id transaction end @@ -238,6 +244,7 @@ row_vers_impl_x_locked_off_kernel( version = prev_version; }/* for (;;) */ +exit_func: mtr_commit(&mtr); mem_heap_free(heap); @@ -297,12 +304,14 @@ row_vers_old_has_index_entry( rec_t* version; rec_t* prev_version; dict_index_t* clust_index; + ulint* clust_offsets; mem_heap_t* heap; mem_heap_t* heap2; dtuple_t* row; dtuple_t* entry; ulint err; - + ulint comp; + ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX) || mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_S_FIX)); @@ -313,10 +322,15 @@ row_vers_old_has_index_entry( clust_index = dict_table_get_first_index(index->table); - if (also_curr && !rec_get_deleted_flag(rec)) { + comp = page_rec_is_comp(rec); + ut_ad(!index->table->comp == !comp); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + ULINT_UNDEFINED, &heap); - heap = mem_heap_create(1024); - row = row_build(ROW_COPY_POINTERS, clust_index, rec, heap); + if (also_curr && !rec_get_deleted_flag(rec, comp)) { + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, clust_offsets, heap); entry = row_build_index_entry(row, index, heap); /* NOTE that we cannot do the comparison as binary @@ -331,24 +345,17 @@ row_vers_old_has_index_entry( return(TRUE); } - - mem_heap_free(heap); } version = rec; - heap = NULL; for (;;) { heap2 = heap; heap = mem_heap_create(1024); - err = trx_undo_prev_version_build(rec, mtr, version, - clust_index, heap, - &prev_version); - if (heap2) { - mem_heap_free(heap2); /* version was stored in heap2, - if heap2 != NULL */ - } + clust_index, clust_offsets, heap, + &prev_version); + 
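Both version-walking loops in row0vers.c keep two heaps in flight: the heap holding the version just read (heap2) is freed only after the next older version has been rebuilt into a fresh heap, so the loop never reads from released memory. A standalone sketch of that hand-over-hand ownership pattern using malloc/free in place of mem_heap_create/mem_heap_free:

#include <stdlib.h>

/* Build the previous version of '*cur' into '*buf'; returns 0 when no
older version exists. Stand-in for trx_undo_prev_version_build(). */
static int
build_prev_version(const int* cur, int* buf)
{
        if (*cur == 0) {
                return(0);              /* no older version */
        }
        *buf = *cur - 1;                /* "older" payload */
        return(1);
}

/* Walk the version chain, keeping at most two buffers alive at once. */
static void
walk_versions(const int* newest)
{
        const int*      version = newest;
        int*            old_buf = NULL; /* buffer holding 'version' */

        for (;;) {
                int*    new_buf = malloc(sizeof *new_buf);
                int     found;

                if (new_buf == NULL) {
                        break;
                }

                found = build_prev_version(version, new_buf);

                /* The older version has been copied out of old_buf, so
                the buffer we just read from can be released now. */
                free(old_buf);
                old_buf = NULL;

                if (!found) {
                        free(new_buf);
                        break;          /* versions end here */
                }

                /* ... examine the older version in new_buf ... */

                version = new_buf;
                old_buf = new_buf;
        }

        free(old_buf);
}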
mem_heap_free(heap2); /* free version and clust_offsets */ if (err != DB_SUCCESS || !prev_version) { /* Versions end here */ @@ -358,9 +365,12 @@ row_vers_old_has_index_entry( return(FALSE); } - if (!rec_get_deleted_flag(prev_version)) { + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, ULINT_UNDEFINED, &heap); + + if (!rec_get_deleted_flag(prev_version, comp)) { row = row_build(ROW_COPY_POINTERS, clust_index, - prev_version, heap); + prev_version, clust_offsets, heap); entry = row_build_index_entry(row, index, heap); /* NOTE that we cannot do the comparison as binary @@ -396,7 +406,11 @@ row_vers_build_for_consistent_read( of this records */ mtr_t* mtr, /* in: mtr holding the latch on rec */ dict_index_t* index, /* in: the clustered index */ + ulint** offsets,/* in/out: offsets returned by + rec_get_offsets(rec, index) */ read_view_t* view, /* in: the consistent read view */ + mem_heap_t** offset_heap,/* in/out: memory heap from which + the offsets are allocated */ mem_heap_t* in_heap,/* in: memory heap from which the memory for old_vers is allocated; memory for possible intermediate versions is allocated and freed @@ -408,8 +422,7 @@ row_vers_build_for_consistent_read( rec_t* version; rec_t* prev_version; dulint prev_trx_id; - mem_heap_t* heap; - mem_heap_t* heap2; + mem_heap_t* heap = NULL; byte* buf; ulint err; @@ -420,21 +433,23 @@ row_vers_build_for_consistent_read( #ifdef UNIV_SYNC_DEBUG ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(!read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index))); + + ut_ad(rec_offs_validate(rec, index, *offsets)); + + ut_ad(!read_view_sees_trx_id(view, + row_get_rec_trx_id(rec, index, *offsets))); rw_lock_s_lock(&(purge_sys->latch)); version = rec; - heap = NULL; for (;;) { - heap2 = heap; + mem_heap_t* heap2 = heap; heap = mem_heap_create(1024); err = trx_undo_prev_version_build(rec, mtr, version, index, - heap, &prev_version); + *offsets, heap, &prev_version); if (heap2) { - mem_heap_free(heap2); /* version was stored in heap2, - if heap2 != NULL */ + mem_heap_free(heap2); /* free version */ } if (err != DB_SUCCESS) { @@ -449,16 +464,19 @@ row_vers_build_for_consistent_read( break; } - prev_trx_id = row_get_rec_trx_id(prev_version, index); + *offsets = rec_get_offsets(prev_version, index, *offsets, + ULINT_UNDEFINED, offset_heap); + prev_trx_id = row_get_rec_trx_id(prev_version, index, + *offsets); if (read_view_sees_trx_id(view, prev_trx_id)) { /* The view already sees this version: we can copy it to in_heap and return */ - buf = mem_heap_alloc(in_heap, rec_get_size( - prev_version)); - *old_vers = rec_copy(buf, prev_version); + buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets)); + *old_vers = rec_copy(buf, prev_version, *offsets); + rec_offs_make_valid(*old_vers, index, *offsets); err = DB_SUCCESS; break; diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index e56389a8541..837c5be2bb6 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -44,6 +44,7 @@ Created 10/8/1995 Heikki Tuuri #include "buf0flu.h" #include "btr0sea.h" #include "dict0load.h" +#include "dict0boot.h" #include "srv0start.h" #include "row0mysql.h" @@ -186,6 +187,61 @@ that during a time of heavy update/insert activity. 
*/ ulong srv_max_buf_pool_modified_pct = 90; +/* variable counts amount of data read in total (in bytes) */ +ulint srv_data_read = 0; + +/* here we count the amount of data written in total (in bytes) */ +ulint srv_data_written = 0; + +/* the number of the log write requests done */ +ulint srv_log_write_requests = 0; + +/* the number of physical writes to the log performed */ +ulint srv_log_writes = 0; + +/* amount of data written to the log files in bytes */ +ulint srv_os_log_written = 0; + +/* amount of writes being done to the log files */ +ulint srv_os_log_pending_writes = 0; + +/* we increase this counter, when there we don't have enough space in the +log buffer and have to flush it */ +ulint srv_log_waits = 0; + +/* this variable counts the amount of times, when the doublewrite buffer +was flushed */ +ulint srv_dblwr_writes = 0; + +/* here we store the number of pages that have been flushed to the +doublewrite buffer */ +ulint srv_dblwr_pages_written = 0; + +/* in this variable we store the number of write requests issued */ +ulint srv_buf_pool_write_requests = 0; + +/* here we store the number of times when we had to wait for a free page +in the buffer pool. It happens when the buffer pool is full and we need +to make a flush, in order to be able to read or create a page. */ +ulint srv_buf_pool_wait_free = 0; + +/* variable to count the number of pages that were written from buffer +pool to the disk */ +ulint srv_buf_pool_flushed = 0; + +/* variable to count the number of buffer pool reads that led to the +reading of a disk page */ +ulint srv_buf_pool_reads = 0; + +/* variable to count the number of sequential read-aheads */ +ulint srv_read_ahead_seq = 0; + +/* variable to count the number of random read-aheads */ +ulint srv_read_ahead_rnd = 0; + +/* structure to pass status variables to MySQL */ +export_struc export_vars; + /* If the following is != 0 we do not allow inserts etc. This protects the user from forgetting the innodb_force_recovery keyword to my.cnf */ @@ -204,7 +260,7 @@ semaphore contention and convoy problems can occur withput this restriction. Value 10 should be good if there are less than 4 processors + 4 disks in the computer. Bigger computers need bigger values. */ -ulint srv_thread_concurrency = 8; +ulong srv_thread_concurrency = SRV_CONCURRENCY_THRESHOLD; os_fast_mutex_t srv_conc_mutex; /* this mutex protects srv_conc data structures */ @@ -241,22 +297,20 @@ srv_conc_slot_t* srv_conc_slots; /* array of wait /* Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket at srv_conc_enter_innodb */ -#define SRV_FREE_TICKETS_TO_ENTER 500 - +#define SRV_FREE_TICKETS_TO_ENTER srv_n_free_tickets_to_enter +#define SRV_THREAD_SLEEP_DELAY srv_thread_sleep_delay /*-----------------------*/ -/* If the following is set TRUE then we do not run purge and insert buffer -merge to completion before shutdown */ - -ibool srv_fast_shutdown = FALSE; +/* If the following is set to 1 then we do not run purge and insert buffer +merge to completion before shutdown. If it is set to 2, do not even flush the +buffer pool to data files at the shutdown: we effectively 'crash' +InnoDB (but lose no committed transactions). 
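The block of counters introduced above (srv_data_read, srv_log_waits, srv_buf_pool_reads and friends) is copied into export_vars by srv_export_innodb_status() later in this same file, under srv_innodb_monitor_mutex, so that MySQL reads one consistent snapshot. The following is only an editor's standalone sketch of that snapshot pattern, not InnoDB code: every name is invented, and a plain pthread mutex stands in for the InnoDB mutex.

#include <pthread.h>
#include <stdio.h>

/* Illustrative only: a couple of server-side counters and an export
   structure that a monitoring thread reads as one consistent snapshot.
   Names are invented; InnoDB uses srv_* counters and export_vars. */

struct export_snapshot {
        unsigned long data_read;
        unsigned long log_waits;
};

static unsigned long counter_data_read = 0;
static unsigned long counter_log_waits = 0;
static struct export_snapshot exported;
static pthread_mutex_t monitor_mutex = PTHREAD_MUTEX_INITIALIZER;

static void export_status(void)
{
        pthread_mutex_lock(&monitor_mutex);
        exported.data_read = counter_data_read;
        exported.log_waits = counter_log_waits;
        pthread_mutex_unlock(&monitor_mutex);
}

int main(void)
{
        counter_data_read += 16384;     /* e.g. one 16 KiB page read */
        counter_log_waits += 1;

        export_status();
        printf("data_read=%lu log_waits=%lu\n",
               exported.data_read, exported.log_waits);
        return 0;
}
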
*/ +ulint srv_fast_shutdown = 0; -ibool srv_very_fast_shutdown = FALSE; /* if this TRUE, do not flush the - buffer pool to data files at the - shutdown; we effectively 'crash' - InnoDB */ /* Generate a innodb_status.<pid> file */ ibool srv_innodb_status = FALSE; ibool srv_use_doublewrite_buf = TRUE; +ibool srv_use_checksums = TRUE; ibool srv_set_thread_priorities = TRUE; int srv_query_thread_priority = 0; @@ -267,7 +321,9 @@ ibool srv_use_awe = FALSE; ibool srv_use_adaptive_hash_indexes = TRUE; /*-------------------------------------------*/ -ulint srv_n_spin_wait_rounds = 20; +ulong srv_n_spin_wait_rounds = 20; +ulong srv_n_free_tickets_to_enter = 500; +ulong srv_thread_sleep_delay = 10000; ulint srv_spin_wait_delay = 5; ibool srv_priority_boost = TRUE; @@ -286,6 +342,12 @@ static ulint srv_n_rows_updated_old = 0; static ulint srv_n_rows_deleted_old = 0; static ulint srv_n_rows_read_old = 0; +ulint srv_n_lock_wait_count = 0; +ulint srv_n_lock_wait_current_count = 0; +ib_longlong srv_n_lock_wait_time = 0; +ulint srv_n_lock_max_wait_time = 0; + + /* Set the following to 0 if you want InnoDB to write messages on stderr on startup/shutdown @@ -780,13 +842,14 @@ srv_get_thread_type(void) /************************************************************************* Initializes the server. */ -static + void srv_init(void) /*==========*/ { srv_conc_slot_t* conc_slot; srv_slot_t* slot; + dict_table_t* table; ulint i; srv_sys = mem_alloc(sizeof(srv_sys_t)); @@ -836,6 +899,31 @@ srv_init(void) UT_LIST_INIT(srv_sys->tasks); + /* create dummy table and index for old-style infimum and supremum */ + table = dict_mem_table_create("SYS_DUMMY1", + DICT_HDR_SPACE, 1, FALSE); + dict_mem_table_add_col(table, "DUMMY", DATA_CHAR, + DATA_ENGLISH | DATA_NOT_NULL, 8, 0); + + srv_sys->dummy_ind1 = dict_mem_index_create("SYS_DUMMY1", + "SYS_DUMMY1", DICT_HDR_SPACE, 0, 1); + dict_index_add_col(srv_sys->dummy_ind1, + dict_table_get_nth_col(table, 0), 0, 0); + srv_sys->dummy_ind1->table = table; + /* create dummy table and index for new-style infimum and supremum */ + table = dict_mem_table_create("SYS_DUMMY2", + DICT_HDR_SPACE, 1, TRUE); + dict_mem_table_add_col(table, "DUMMY", DATA_CHAR, + DATA_ENGLISH | DATA_NOT_NULL, 8, 0); + srv_sys->dummy_ind2 = dict_mem_index_create("SYS_DUMMY2", + "SYS_DUMMY2", DICT_HDR_SPACE, 0, 1); + dict_index_add_col(srv_sys->dummy_ind2, + dict_table_get_nth_col(table, 0), 0, 0); + srv_sys->dummy_ind2->table = table; + + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + srv_sys->dummy_ind1->cached = srv_sys->dummy_ind2->cached = TRUE; + /* Init the server concurrency restriction data structures */ os_fast_mutex_init(&srv_conc_mutex); @@ -895,12 +983,6 @@ srv_conc_enter_innodb( srv_conc_slot_t* slot = NULL; ulint i; - if (srv_thread_concurrency >= 500) { - /* Disable the concurrency check */ - - return; - } - /* If trx has 'free tickets' to enter the engine left, then use one such ticket */ @@ -935,8 +1017,8 @@ retry: return; } - /* If the transaction is not holding resources, let it sleep for 50 - milliseconds, and try again then */ + /* If the transaction is not holding resources, + let it sleep for SRV_THREAD_SLEEP_DELAY microseconds, and try again then */ if (!has_slept && !trx->has_search_latch && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) { @@ -955,8 +1037,10 @@ retry: situations of lots of thread switches. Simply put some threads aside for a while to reduce the number of thread switches. 
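The hunks around this point replace the fixed 500-ticket constant with the tunable srv_n_free_tickets_to_enter, and (as the continuation of this hunk just below shows) the hard-coded 10 ms sleep becomes srv_thread_sleep_delay. A rough standalone caricature of that ticket-based admission idea follows; it is a sketch with invented names, not the InnoDB implementation.

#include <stdio.h>

/* Illustrative only: simplified ticket-based admission, loosely modeled on
   srv_conc_enter_innodb()/srv_conc_exit_innodb(). All names are invented. */

#define CONCURRENCY_LIMIT     8     /* cf. innodb_thread_concurrency   */
#define FREE_TICKETS_TO_ENTER 500   /* cf. srv_n_free_tickets_to_enter */

struct fake_trx {
        int n_tickets;              /* cf. trx->n_tickets_to_enter_innodb */
};

static int n_threads_inside = 0;

/* Returns 1 if the caller may proceed without queueing. */
static int conc_enter(struct fake_trx *trx)
{
        if (trx->n_tickets > 0) {
                /* Re-entry within the same SQL statement: consume a ticket
                   instead of going through the admission check again. */
                trx->n_tickets--;
                return 1;
        }
        if (n_threads_inside < CONCURRENCY_LIMIT) {
                n_threads_inside++;
                trx->n_tickets = FREE_TICKETS_TO_ENTER;
                return 1;
        }
        return 0;   /* caller would sleep srv_thread_sleep_delay us and retry */
}

static void conc_exit(struct fake_trx *trx)
{
        if (trx->n_tickets > 0) {
                /* Pretend we are still inside; saves enter/exit overhead. */
                return;
        }
        n_threads_inside--;
}

int main(void)
{
        struct fake_trx trx = {0};
        int i;

        for (i = 0; i < 3; i++) {
                if (conc_enter(&trx)) {
                        printf("enter %d: inside=%d tickets=%d\n",
                               i, n_threads_inside, trx.n_tickets);
                        conc_exit(&trx);
                }
        }
        return 0;
}
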
*/ - - os_thread_sleep(10000); + if (SRV_THREAD_SLEEP_DELAY > 0) + { + os_thread_sleep(SRV_THREAD_SLEEP_DELAY); + } trx->op_info = ""; @@ -1044,7 +1128,7 @@ srv_conc_force_enter_innodb( trx_t* trx) /* in: transaction object associated with the thread */ { - if (srv_thread_concurrency >= 500) { + if (srv_thread_concurrency >= SRV_CONCURRENCY_THRESHOLD) { return; } @@ -1070,7 +1154,7 @@ srv_conc_force_exit_innodb( { srv_conc_slot_t* slot = NULL; - if (srv_thread_concurrency >= 500) { + if (srv_thread_concurrency >= SRV_CONCURRENCY_THRESHOLD) { return; } @@ -1122,11 +1206,6 @@ srv_conc_exit_innodb( trx_t* trx) /* in: transaction object associated with the thread */ { - if (srv_thread_concurrency >= 500) { - - return; - } - if (trx->n_tickets_to_enter_innodb > 0) { /* We will pretend the thread is still inside InnoDB though it now leaves the InnoDB engine. In this way we save @@ -1216,6 +1295,7 @@ srv_boot(void) return(DB_SUCCESS); } +#ifndef UNIV_HOTBACKUP /************************************************************************* Reserves a slot in the thread table for the current MySQL OS thread. NOTE! The kernel mutex has to be reserved by the caller! */ @@ -1274,6 +1354,7 @@ srv_table_reserve_slot_for_mysql(void) return(slot); } +#endif /* !UNIV_HOTBACKUP */ /******************************************************************* Puts a MySQL OS thread to wait for a lock to be released. If an error @@ -1288,13 +1369,19 @@ srv_suspend_mysql_thread( que_thr_t* thr) /* in: query thread associated with the MySQL OS thread */ { +#ifndef UNIV_HOTBACKUP srv_slot_t* slot; os_event_t event; double wait_time; trx_t* trx; ibool had_dict_lock = FALSE; ibool was_declared_inside_innodb = FALSE; - + ib_longlong start_time = 0; + ib_longlong finish_time; + ulint diff_time; + ulint sec; + ulint ms; + #ifdef UNIV_SYNC_DEBUG ut_ad(!mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ @@ -1337,6 +1424,13 @@ srv_suspend_mysql_thread( slot->suspend_time = ut_time(); + if (thr->lock_state == QUE_THR_LOCK_ROW) { + srv_n_lock_wait_count++; + srv_n_lock_wait_current_count++; + + ut_usectime(&sec, &ms); + start_time = (ib_longlong)sec * 1000000 + ms; + } /* Wake the lock timeout monitor thread, if it is suspended */ os_event_set(srv_lock_timeout_thread_event); @@ -1387,7 +1481,20 @@ srv_suspend_mysql_thread( slot->in_use = FALSE; wait_time = ut_difftime(ut_time(), slot->suspend_time); - + + if (thr->lock_state == QUE_THR_LOCK_ROW) { + ut_usectime(&sec, &ms); + finish_time = (ib_longlong)sec * 1000000 + ms; + + diff_time = (ulint) (finish_time - start_time); + + srv_n_lock_wait_current_count--; + srv_n_lock_wait_time = srv_n_lock_wait_time + diff_time; + if (diff_time > srv_n_lock_max_wait_time) { + srv_n_lock_max_wait_time = diff_time; + } + } + if (trx->was_chosen_as_deadlock_victim) { trx->error_state = DB_DEADLOCK; @@ -1401,6 +1508,12 @@ srv_suspend_mysql_thread( trx->error_state = DB_LOCK_WAIT_TIMEOUT; } +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. 
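The srv_suspend_mysql_thread() changes above sample a microsecond clock (ut_usectime) before and after a row-lock wait and fold the difference into srv_n_lock_wait_count, srv_n_lock_wait_current_count, srv_n_lock_wait_time and srv_n_lock_max_wait_time. Below is a minimal standalone sketch of the same bookkeeping, with gettimeofday() standing in for ut_usectime() and every other name invented for the sketch.

#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

/* Illustrative lock-wait accounting, loosely following the hunk above. */

static unsigned long long n_lock_waits    = 0;
static unsigned long long total_wait_usec = 0;
static unsigned long long max_wait_usec   = 0;

static unsigned long long now_usec(void)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);
        return (unsigned long long) tv.tv_sec * 1000000ULL + tv.tv_usec;
}

static void wait_for_lock(void)
{
        unsigned long long start = now_usec();
        unsigned long long diff;

        n_lock_waits++;
        usleep(20000);                  /* stand-in for the actual wait */

        diff = now_usec() - start;
        total_wait_usec += diff;
        if (diff > max_wait_usec) {
                max_wait_usec = diff;
        }
}

int main(void)
{
        wait_for_lock();
        wait_for_lock();
        printf("waits=%llu total=%lluus max=%lluus avg=%lluus\n",
               n_lock_waits, total_wait_usec, max_wait_usec,
               total_wait_usec / n_lock_waits);
        return 0;
}
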
*/ + ut_error; +#endif /* UNIV_HOTBACKUP */ } /************************************************************************ @@ -1413,6 +1526,7 @@ srv_release_mysql_thread_if_suspended( que_thr_t* thr) /* in: query thread associated with the MySQL OS thread */ { +#ifndef UNIV_HOTBACKUP srv_slot_t* slot; ulint i; @@ -1434,8 +1548,15 @@ srv_release_mysql_thread_if_suspended( } /* not found */ +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ } +#ifndef UNIV_HOTBACKUP /********************************************************************** Refreshes the values used to calculate per-second averages. */ static @@ -1625,19 +1746,80 @@ srv_printf_innodb_monitor( (srv_n_rows_read - srv_n_rows_read_old) / time_elapsed); - srv_n_rows_inserted_old = srv_n_rows_inserted; + srv_n_rows_inserted_old = srv_n_rows_inserted; srv_n_rows_updated_old = srv_n_rows_updated; srv_n_rows_deleted_old = srv_n_rows_deleted; srv_n_rows_read_old = srv_n_rows_read; - fputs("----------------------------\n" + fputs("----------------------------\n" "END OF INNODB MONITOR OUTPUT\n" "============================\n", file); - mutex_exit(&srv_innodb_monitor_mutex); fflush(file); } +/********************************************************************** +Function to pass InnoDB status variables to MySQL */ + +void +srv_export_innodb_status(void) +{ + + mutex_enter(&srv_innodb_monitor_mutex); + export_vars.innodb_data_pending_reads= os_n_pending_reads; + export_vars.innodb_data_pending_writes= os_n_pending_writes; + export_vars.innodb_data_pending_fsyncs= + fil_n_pending_log_flushes + fil_n_pending_tablespace_flushes; + export_vars.innodb_data_fsyncs= os_n_fsyncs; + export_vars.innodb_data_read= srv_data_read; + export_vars.innodb_data_reads= os_n_file_reads; + export_vars.innodb_data_writes= os_n_file_writes; + export_vars.innodb_data_written= srv_data_written; + export_vars.innodb_buffer_pool_read_requests= buf_pool->n_page_gets; + export_vars.innodb_buffer_pool_write_requests= srv_buf_pool_write_requests; + export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free; + export_vars.innodb_buffer_pool_pages_flushed= srv_buf_pool_flushed; + export_vars.innodb_buffer_pool_reads= srv_buf_pool_reads; + export_vars.innodb_buffer_pool_read_ahead_rnd= srv_read_ahead_rnd; + export_vars.innodb_buffer_pool_read_ahead_seq= srv_read_ahead_seq; + export_vars.innodb_buffer_pool_pages_data= UT_LIST_GET_LEN(buf_pool->LRU); + export_vars.innodb_buffer_pool_pages_dirty= UT_LIST_GET_LEN(buf_pool->flush_list); + export_vars.innodb_buffer_pool_pages_free= UT_LIST_GET_LEN(buf_pool->free); + export_vars.innodb_buffer_pool_pages_latched= buf_get_latched_pages_number(); + export_vars.innodb_buffer_pool_pages_total= buf_pool->curr_size; + export_vars.innodb_buffer_pool_pages_misc= buf_pool->max_size - + UT_LIST_GET_LEN(buf_pool->LRU) - UT_LIST_GET_LEN(buf_pool->free); + export_vars.innodb_page_size= UNIV_PAGE_SIZE; + export_vars.innodb_log_waits= srv_log_waits; + export_vars.innodb_os_log_written= srv_os_log_written; + export_vars.innodb_os_log_fsyncs= fil_n_log_flushes; + export_vars.innodb_os_log_pending_fsyncs= fil_n_pending_log_flushes; + export_vars.innodb_os_log_pending_writes= srv_os_log_pending_writes; + export_vars.innodb_log_write_requests= srv_log_write_requests; + export_vars.innodb_log_writes= srv_log_writes; + export_vars.innodb_dblwr_pages_written= 
srv_dblwr_pages_written; + export_vars.innodb_dblwr_writes= srv_dblwr_writes; + export_vars.innodb_pages_created= buf_pool->n_pages_created; + export_vars.innodb_pages_read= buf_pool->n_pages_read; + export_vars.innodb_pages_written= buf_pool->n_pages_written; + export_vars.innodb_row_lock_waits= srv_n_lock_wait_count; + export_vars.innodb_row_lock_current_waits= srv_n_lock_wait_current_count; + export_vars.innodb_row_lock_time= srv_n_lock_wait_time / 10000; + if (srv_n_lock_wait_count > 0) { + export_vars.innodb_row_lock_time_avg = (ulint) + (srv_n_lock_wait_time / 10000 / srv_n_lock_wait_count); + } else { + export_vars.innodb_row_lock_time_avg = 0; + } + export_vars.innodb_row_lock_time_max= srv_n_lock_max_wait_time / 10000; + export_vars.innodb_rows_read= srv_n_rows_read; + export_vars.innodb_rows_inserted= srv_n_rows_inserted; + export_vars.innodb_rows_updated= srv_n_rows_updated; + export_vars.innodb_rows_deleted= srv_n_rows_deleted; + mutex_exit(&srv_innodb_monitor_mutex); + +} + /************************************************************************* A thread which wakes up threads whose lock wait may have lasted too long. This also prints the info output by various InnoDB monitors. */ @@ -2296,11 +2478,11 @@ background_loop: flush_loop: srv_main_thread_op_info = "flushing buffer pool pages"; - if (!srv_very_fast_shutdown) { + if (srv_fast_shutdown < 2) { n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); } else { - /* In a 'very fast' shutdown we do not flush the buffer pool + /* In the fastest shutdown we do not flush the buffer pool to data files: we set n_pages_flushed to 0 artificially. */ n_pages_flushed = 0; @@ -2420,3 +2602,4 @@ suspend_thread: return(0); #endif } +#endif /* !UNIV_HOTBACKUP */ diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index 62df7301cc9..7798e0c8e32 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -479,7 +479,6 @@ srv_normalize_path_for_win( Adds a slash or a backslash to the end of a string if it is missing and the string is not empty. */ -static char* srv_add_path_separator_if_needed( /*=============================*/ @@ -531,6 +530,7 @@ srv_calc_high32( return(file_size >> (32 - UNIV_PAGE_SIZE_SHIFT)); } +#ifndef UNIV_HOTBACKUP /************************************************************************* Creates or opens the log files and closes them. */ static @@ -1040,7 +1040,9 @@ innobase_start_or_create_for_mysql(void) srv_start_has_been_called = TRUE; +#ifdef UNIV_DEBUG log_do_write = TRUE; +#endif /* UNIV_DEBUG */ /* yydebug = TRUE; */ srv_is_being_started = TRUE; @@ -1477,15 +1479,13 @@ NetWare. */ fsp_header_inc_size(0, sum_of_new_sizes, &mtr); mtr_commit(&mtr); - } - if (recv_needed_recovery) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Flushing modified pages from the buffer pool...\n"); - } + /* Immediately write the log record about increased tablespace + size to disk, so that it is durable even if mysqld would crash + quickly */ - log_make_checkpoint_at(ut_dulint_max, TRUE); + log_buffer_flush_to_disk(); + } #ifdef UNIV_LOG_ARCHIVE /* Archiving is always off under MySQL */ @@ -1556,8 +1556,9 @@ NetWare. */ os_thread_create(&srv_master_thread, NULL, thread_ids + 1 + SRV_MAX_N_IO_THREADS); +#ifdef UNIV_DEBUG /* buf_debug_prints = TRUE; */ - +#endif /* UNIV_DEBUG */ sum_of_data_file_sizes = 0; for (i = 0; i < srv_n_data_files; i++) { @@ -1731,6 +1732,15 @@ innobase_shutdown_for_mysql(void) The step 1 is the real InnoDB shutdown. The remaining steps 2 - ... 
just free data structures after the shutdown. */ + + if (srv_fast_shutdown == 2) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: MySQL has requested a very fast shutdown without flushing " +"the InnoDB buffer pool to data files. At the next mysqld startup " +"InnoDB will do a crash recovery!\n"); + } + #ifdef __NETWARE__ if(!panic_shutdown) #endif @@ -1747,6 +1757,14 @@ innobase_shutdown_for_mysql(void) srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS; + /* In a 'very fast' shutdown, we do not need to wait for these threads + to die; all which counts is that we flushed the log; a 'very fast' + shutdown is essentially a crash. */ + + if (srv_fast_shutdown == 2) { + return(DB_SUCCESS); + } + /* All threads end up waiting for certain events. Put those events to the signaled state. Then the threads will exit themselves in os_thread_event_wait(). */ @@ -1855,4 +1873,5 @@ void set_panic_flag_for_netware() extern ibool panic_shutdown; panic_shutdown = TRUE; } -#endif +#endif /* __NETWARE__ */ +#endif /* !UNIV_HOTBACKUP */ diff --git a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c index 77757685208..973b46fdd50 100644 --- a/innobase/sync/sync0rw.c +++ b/innobase/sync/sync0rw.c @@ -90,7 +90,8 @@ rw_lock_create_func( /*================*/ rw_lock_t* lock, /* in: pointer to memory */ const char* cfile_name, /* in: file name where created */ - ulint cline) /* in: file line where created */ + ulint cline, /* in: file line where created */ + const char* cmutex_name) /* in: mutex name */ { /* If this is the very first time a synchronization object is created, then the following call initializes @@ -101,7 +102,11 @@ rw_lock_create_func( lock->mutex.cfile_name = cfile_name; lock->mutex.cline = cline; - +#ifndef UNIV_HOTBACKUP + lock->mutex.cmutex_name = cmutex_name; + lock->mutex.mutex_type = 1; +#endif /* !UNIV_HOTBACKUP */ + rw_lock_set_waiters(lock, 0); rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); lock->writer_count = 0; diff --git a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c index 86306e49cac..43249f4b96f 100644 --- a/innobase/sync/sync0sync.c +++ b/innobase/sync/sync0sync.c @@ -129,11 +129,6 @@ sync_array_t* sync_primary_wait_array; /* This variable is set to TRUE when sync_init is called */ ibool sync_initialized = FALSE; -/* Global list of database mutexes (not OS mutexes) created. */ -UT_LIST_BASE_NODE_T(mutex_t) mutex_list; - -/* Mutex protecting the mutex_list variable */ -mutex_t mutex_list_mutex; typedef struct sync_level_struct sync_level_t; typedef struct sync_thread_struct sync_thread_t; @@ -146,6 +141,12 @@ sync_thread_t* sync_thread_level_arrays; /* Mutex protecting sync_thread_level_arrays */ mutex_t sync_thread_mutex; +/* Global list of database mutexes (not OS mutexes) created. 
*/ +ut_list_base_node_t mutex_list; + +/* Mutex protecting the mutex_list variable */ +mutex_t mutex_list_mutex; + /* Latching order checks start when this is set TRUE */ ibool sync_order_checks_on = FALSE; @@ -202,7 +203,8 @@ mutex_create_func( /*==============*/ mutex_t* mutex, /* in: pointer to memory */ const char* cfile_name, /* in: file name where created */ - ulint cline) /* in: file line where created */ + ulint cline, /* in: file line where created */ + const char* cmutex_name) /* in: mutex name */ { #if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) mutex_reset_lock_word(mutex); @@ -219,6 +221,17 @@ mutex_create_func( mutex->level = SYNC_LEVEL_NONE; mutex->cfile_name = cfile_name; mutex->cline = cline; +#ifndef UNIV_HOTBACKUP + mutex->cmutex_name= cmutex_name; + mutex->count_using= 0; + mutex->mutex_type= 0; + mutex->lspent_time= 0; + mutex->lmax_spent_time= 0; + mutex->count_spin_loop= 0; + mutex->count_spin_rounds= 0; + mutex->count_os_wait= 0; + mutex->count_os_yield= 0; +#endif /* !UNIV_HOTBACKUP */ /* Check that lock_word is aligned; this is important on Intel */ ut_ad(((ulint)(&(mutex->lock_word))) % 4 == 0); @@ -355,135 +368,188 @@ for the mutex before suspending the thread. */ void mutex_spin_wait( /*============*/ - mutex_t* mutex, /* in: pointer to mutex */ - const char* file_name, /* in: file name where - mutex requested */ - ulint line) /* in: line where requested */ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name, /* in: file name where + mutex requested */ + ulint line) /* in: line where requested */ { - ulint index; /* index of the reserved wait cell */ - ulint i; /* spin round count */ - - ut_ad(mutex); + ulint index; /* index of the reserved wait cell */ + ulint i; /* spin round count */ +#ifndef UNIV_HOTBACKUP + ib_longlong lstart_time = 0, lfinish_time; /* for timing os_wait */ + ulint ltime_diff; + ulint sec; + ulint ms; + uint timer_started = 0; +#endif /* !UNIV_HOTBACKUP */ + ut_ad(mutex); mutex_loop: - i = 0; - - /* Spin waiting for the lock word to become zero. Note that we do not - have to assume that the read access to the lock word is atomic, as the - actual locking is always committed with atomic test-and-set. In - reality, however, all processors probably have an atomic read of a - memory word. */ - -spin_loop: - mutex_spin_wait_count++; - - while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) { + i = 0; - if (srv_spin_wait_delay) { - ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); - } - - i++; - } +/* Spin waiting for the lock word to become zero. Note that we do not + have to assume that the read access to the lock word is atomic, as the + actual locking is always committed with atomic test-and-set. In + reality, however, all processors probably have an atomic read of a + memory word. 
*/ - if (i == SYNC_SPIN_ROUNDS) { - os_thread_yield(); - } +spin_loop: +#ifndef UNIV_HOTBACKUP + mutex_spin_wait_count++; + mutex->count_spin_loop++; +#endif /* !UNIV_HOTBACKUP */ + + while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) + { + if (srv_spin_wait_delay) + { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } + + i++; + } + + if (i == SYNC_SPIN_ROUNDS) + { +#ifndef UNIV_HOTBACKUP + mutex->count_os_yield++; + if (timed_mutexes == 1 && timer_started==0) + { + ut_usectime(&sec, &ms); + lstart_time= (ib_longlong)sec * 1000000 + ms; + timer_started = 1; + } +#endif /* !UNIV_HOTBACKUP */ + os_thread_yield(); + } + +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, + "Thread %lu spin wait mutex at %p cfile %s cline %lu rnds %lu\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), mutex, + mutex->cfile_name, (ulong) mutex->cline, (ulong) i); +#endif - if (srv_print_latch_waits) { - fprintf(stderr, - "Thread %lu spin wait mutex at %p cfile %s cline %lu rnds %lu\n", - (ulong) os_thread_pf(os_thread_get_curr_id()), mutex, - mutex->cfile_name, (ulong) mutex->cline, (ulong) i); - } + mutex_spin_round_count += i; - mutex_spin_round_count += i; +#ifndef UNIV_HOTBACKUP + mutex->count_spin_rounds += i; +#endif /* !UNIV_HOTBACKUP */ - if (mutex_test_and_set(mutex) == 0) { - /* Succeeded! */ + if (mutex_test_and_set(mutex) == 0) + { + /* Succeeded! */ #ifdef UNIV_SYNC_DEBUG - mutex_set_debug_info(mutex, file_name, line); + mutex_set_debug_info(mutex, file_name, line); #endif - return; - } + goto finish_timing; + } - /* We may end up with a situation where lock_word is - 0 but the OS fast mutex is still reserved. On FreeBSD - the OS does not seem to schedule a thread which is constantly - calling pthread_mutex_trylock (in mutex_test_and_set - implementation). Then we could end up spinning here indefinitely. - The following 'i++' stops this infinite spin. */ + /* We may end up with a situation where lock_word is + 0 but the OS fast mutex is still reserved. On FreeBSD + the OS does not seem to schedule a thread which is constantly + calling pthread_mutex_trylock (in mutex_test_and_set + implementation). Then we could end up spinning here indefinitely. + The following 'i++' stops this infinite spin. */ - i++; - - if (i < SYNC_SPIN_ROUNDS) { + i++; - goto spin_loop; - } + if (i < SYNC_SPIN_ROUNDS) + { + goto spin_loop; + } - sync_array_reserve_cell(sync_primary_wait_array, mutex, - SYNC_MUTEX, - file_name, line, - &index); + sync_array_reserve_cell(sync_primary_wait_array, mutex, + SYNC_MUTEX, file_name, line, &index); - mutex_system_call_count++; + mutex_system_call_count++; - /* The memory order of the array reservation and the change in the - waiters field is important: when we suspend a thread, we first - reserve the cell and then set waiters field to 1. When threads are - released in mutex_exit, the waiters field is first set to zero and - then the event is set to the signaled state. */ - - mutex_set_waiters(mutex, 1); + /* The memory order of the array reservation and the change in the + waiters field is important: when we suspend a thread, we first + reserve the cell and then set waiters field to 1. When threads are + released in mutex_exit, the waiters field is first set to zero and + then the event is set to the signaled state. 
*/ + + mutex_set_waiters(mutex, 1); - /* Try to reserve still a few times */ - for (i = 0; i < 4; i++) { - if (mutex_test_and_set(mutex) == 0) { + /* Try to reserve still a few times */ + for (i = 0; i < 4; i++) + { + if (mutex_test_and_set(mutex) == 0) + { + /* Succeeded! Free the reserved wait cell */ - /* Succeeded! Free the reserved wait cell */ + sync_array_free_cell(sync_primary_wait_array, index); - sync_array_free_cell(sync_primary_wait_array, index); - #ifdef UNIV_SYNC_DEBUG - mutex_set_debug_info(mutex, file_name, line); + mutex_set_debug_info(mutex, file_name, line); #endif - if (srv_print_latch_waits) { - fprintf(stderr, - "Thread %lu spin wait succeeds at 2:" - " mutex at %p\n", - (ulong) os_thread_pf(os_thread_get_curr_id()), - mutex); - } - - return; +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, "Thread %lu spin wait succeeds at 2:" + " mutex at %p\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), + mutex); +#endif - /* Note that in this case we leave the waiters field - set to 1. We cannot reset it to zero, as we do not know - if there are other waiters. */ - } - } + goto finish_timing; - /* Now we know that there has been some thread holding the mutex - after the change in the wait array and the waiters field was made. - Now there is no risk of infinite wait on the event. */ + /* Note that in this case we leave the waiters field + set to 1. We cannot reset it to zero, as we do not know + if there are other waiters. */ + } + } - if (srv_print_latch_waits) { - fprintf(stderr, - "Thread %lu OS wait mutex at %p cfile %s cline %lu rnds %lu\n", - (ulong) os_thread_pf(os_thread_get_curr_id()), mutex, - mutex->cfile_name, (ulong) mutex->cline, (ulong) i); - } - - mutex_system_call_count++; - mutex_os_wait_count++; + /* Now we know that there has been some thread holding the mutex + after the change in the wait array and the waiters field was made. +Now there is no risk of infinite wait on the event. */ - sync_array_wait_event(sync_primary_wait_array, index); +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, + "Thread %lu OS wait mutex at %p cfile %s cline %lu rnds %lu\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), mutex, + mutex->cfile_name, (ulong) mutex->cline, (ulong) i); +#endif - goto mutex_loop; + mutex_system_call_count++; + mutex_os_wait_count++; + +#ifndef UNIV_HOTBACKUP + mutex->count_os_wait++; + /* + !!!!! Sometimes os_wait can be called without os_thread_yield + */ + + if (timed_mutexes == 1 && timer_started==0) + { + ut_usectime(&sec, &ms); + lstart_time= (ib_longlong)sec * 1000000 + ms; + timer_started = 1; + } +#endif /* !UNIV_HOTBACKUP */ + + sync_array_wait_event(sync_primary_wait_array, index); + goto mutex_loop; + +finish_timing: +#ifndef UNIV_HOTBACKUP + if (timed_mutexes == 1 && timer_started==1) + { + ut_usectime(&sec, &ms); + lfinish_time= (ib_longlong)sec * 1000000 + ms; + + ltime_diff= (ulint) (lfinish_time - lstart_time); + mutex->lspent_time += ltime_diff; + if (mutex->lmax_spent_time < ltime_diff) + { + mutex->lmax_spent_time= ltime_diff; + } + } +#endif /* !UNIV_HOTBACKUP */ + return; } /********************************************************************** @@ -555,6 +621,7 @@ mutex_set_level( mutex->level = level; } + #ifdef UNIV_SYNC_DEBUG /********************************************************************** Checks that the current thread owns the mutex. 
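The reindented mutex_spin_wait() above keeps its three-stage strategy, spin up to SYNC_SPIN_ROUNDS, then yield, then park on the sync wait array, while adding per-mutex counters (count_spin_rounds, count_os_yield, count_os_wait) and optional timing under timed_mutexes. The following is only a compressed caricature of that ladder built on a plain pthread mutex; none of these names are the real InnoDB ones, and the wait array is approximated by a blocking pthread_mutex_lock().

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define SPIN_ROUNDS 20                  /* cf. SYNC_SPIN_ROUNDS */

struct demo_mutex {
        pthread_mutex_t m;
        unsigned long   spin_rounds;    /* cf. mutex->count_spin_rounds */
        unsigned long   os_yields;      /* cf. mutex->count_os_yield    */
        unsigned long   os_waits;       /* cf. mutex->count_os_wait     */
};

static void demo_mutex_enter(struct demo_mutex *mx)
{
        int i;

        for (i = 0; i < SPIN_ROUNDS; i++) {
                if (pthread_mutex_trylock(&mx->m) == 0) {
                        mx->spin_rounds += i;
                        return;                 /* got it while spinning */
                }
        }
        mx->spin_rounds += i;
        mx->os_yields++;
        sched_yield();                          /* give up the time slice */

        if (pthread_mutex_trylock(&mx->m) == 0) {
                return;                         /* succeeded after the yield */
        }
        mx->os_waits++;
        pthread_mutex_lock(&mx->m);             /* block until available */
}

int main(void)
{
        struct demo_mutex mx = { PTHREAD_MUTEX_INITIALIZER, 0, 0, 0 };

        demo_mutex_enter(&mx);
        pthread_mutex_unlock(&mx.m);
        printf("spins=%lu yields=%lu waits=%lu\n",
               mx.spin_rounds, mx.os_yields, mx.os_waits);
        return 0;
}
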
Works only in the debug @@ -1075,8 +1142,12 @@ sync_thread_add_level( } else if (level == SYNC_DICT_HEADER) { ut_a(sync_thread_levels_g(array, SYNC_DICT_HEADER)); } else if (level == SYNC_DICT) { +#ifdef UNIV_DEBUG ut_a(buf_debug_prints || sync_thread_levels_g(array, SYNC_DICT)); +#else /* UNIV_DEBUG */ + ut_a(sync_thread_levels_g(array, SYNC_DICT)); +#endif /* UNIV_DEBUG */ } else { ut_error; } diff --git a/innobase/trx/trx0rec.c b/innobase/trx/trx0rec.c index fe429d1cc62..3b7171e6038 100644 --- a/innobase/trx/trx0rec.c +++ b/innobase/trx/trx0rec.c @@ -38,16 +38,18 @@ trx_undof_page_add_undo_rec_log( ulint new_free, /* in: end offset of the entry */ mtr_t* mtr) /* in: mtr */ { - byte* log_ptr; - ulint len; + byte* log_ptr; + const byte* log_end; + ulint len; - log_ptr = mlog_open(mtr, 30 + MLOG_BUF_MARGIN); + log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN); if (log_ptr == NULL) { return; } + log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN]; log_ptr = mlog_write_initial_log_record_fast(undo_page, MLOG_UNDO_INSERT, log_ptr, mtr); len = new_free - old_free - 4; @@ -55,14 +57,11 @@ trx_undof_page_add_undo_rec_log( mach_write_to_2(log_ptr, len); log_ptr += 2; - if (len < 256) { - ut_memcpy(log_ptr, undo_page + old_free + 2, len); - log_ptr += len; - } - - mlog_close(mtr, log_ptr); - - if (len >= MLOG_BUF_MARGIN) { + if (log_ptr + len <= log_end) { + memcpy(log_ptr, undo_page + old_free + 2, len); + mlog_close(mtr, log_ptr + len); + } else { + mlog_close(mtr, log_ptr); mlog_catenate_string(mtr, undo_page + old_free + 2, len); } } @@ -404,6 +403,7 @@ trx_undo_page_report_modify( delete marking is done */ rec_t* rec, /* in: clustered index record which has NOT yet been modified */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector which tells the columns to be updated; in the case of a delete, this should be set to NULL */ @@ -430,6 +430,7 @@ trx_undo_page_report_modify( ulint i; ut_a(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); table = index->table; @@ -454,7 +455,7 @@ trx_undo_page_report_modify( /* Store first some general parameters to the undo log */ if (update) { - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, table->comp)) { type_cmpl = TRX_UNDO_UPD_DEL_REC; } else { type_cmpl = TRX_UNDO_UPD_EXIST_REC; @@ -479,14 +480,20 @@ trx_undo_page_report_modify( /*----------------------------------------*/ /* Store the state of the info bits */ - bits = rec_get_info_bits(rec); + bits = rec_get_info_bits(rec, table->comp); mach_write_to_1(ptr, bits); ptr += 1; /* Store the values of the system columns */ - trx_id = dict_index_rec_get_sys_col(index, DATA_TRX_ID, rec); + field = rec_get_nth_field(rec, offsets, + dict_index_get_sys_col_pos(index, DATA_TRX_ID), &len); + ut_ad(len == DATA_TRX_ID_LEN); + trx_id = trx_read_trx_id(field); + field = rec_get_nth_field(rec, offsets, + dict_index_get_sys_col_pos(index, DATA_ROLL_PTR), &len); + ut_ad(len == DATA_ROLL_PTR_LEN); + roll_ptr = trx_read_roll_ptr(field); - roll_ptr = dict_index_rec_get_sys_col(index, DATA_ROLL_PTR, rec); len = mach_dulint_write_compressed(ptr, trx_id); ptr += len; @@ -499,7 +506,7 @@ trx_undo_page_report_modify( for (i = 0; i < dict_index_get_n_unique(index); i++) { - field = rec_get_nth_field(rec, i, &flen); + field = rec_get_nth_field(rec, offsets, i, &flen); if (trx_undo_left(undo_page, ptr) < 4) { @@ -547,14 +554,14 @@ 
trx_undo_page_report_modify( ptr += len; /* Save the old value of field */ - field = rec_get_nth_field(rec, pos, &flen); + field = rec_get_nth_field(rec, offsets, pos, &flen); if (trx_undo_left(undo_page, ptr) < 5) { return(0); } - if (rec_get_nth_field_extern_bit(rec, pos)) { + if (rec_offs_nth_extern(offsets, pos)) { /* If a field has external storage, we add to flen the flag */ @@ -631,7 +638,7 @@ trx_undo_page_report_modify( ptr += len; /* Save the old value of field */ - field = rec_get_nth_field(rec, pos, &flen); + field = rec_get_nth_field(rec, offsets, pos, &flen); if (trx_undo_left(undo_page, ptr) < 5) { @@ -934,13 +941,11 @@ trx_undo_erase_page_end( mtr_t* mtr) /* in: mtr */ { ulint first_free; - ulint i; - + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE); - for (i = first_free; i < UNIV_PAGE_SIZE - FIL_PAGE_DATA_END; i++) { - undo_page[i] = 0xFF; - } + memset(undo_page + first_free, 0xff, + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free); mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr); } @@ -1008,7 +1013,11 @@ trx_undo_report_row_operation( ibool is_insert; trx_rseg_t* rseg; mtr_t mtr; - + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; + ut_a(index->type & DICT_CLUSTERED); if (flags & BTR_NO_UNDO_LOG_FLAG) { @@ -1019,7 +1028,6 @@ trx_undo_report_row_operation( } ut_ad(thr); - ut_a(index->type & DICT_CLUSTERED); ut_ad((op_type != TRX_UNDO_INSERT_OP) || (clust_entry && !update && !rec)); @@ -1079,9 +1087,10 @@ trx_undo_report_row_operation( index, clust_entry, &mtr); } else { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); offset = trx_undo_page_report_modify(undo_page, trx, - index, rec, update, - cmpl_info, &mtr); + index, rec, offsets, update, cmpl_info, &mtr); } if (offset == 0) { @@ -1123,7 +1132,9 @@ trx_undo_report_row_operation( mutex_exit(&(trx->undo_mutex)); mtr_commit(&mtr); - + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(DB_OUT_OF_FILE_SPACE); } } @@ -1140,6 +1151,9 @@ trx_undo_report_row_operation( *roll_ptr = trx_undo_build_roll_ptr(is_insert, rseg->id, page_no, offset); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } return(DB_SUCCESS); } @@ -1236,6 +1250,7 @@ trx_undo_prev_version_build( index_rec page and purge_view */ rec_t* rec, /* in: version of a clustered index record */ dict_index_t* index, /* in: clustered index */ + ulint* offsets,/* in: rec_get_offsets(rec, index) */ mem_heap_t* heap, /* in: memory heap from which the memory needed is allocated */ rec_t** old_vers)/* out, own: previous version, or NULL if @@ -1258,7 +1273,6 @@ trx_undo_prev_version_build( ibool dummy_extern; byte* buf; ulint err; - #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); #endif /* UNIV_SYNC_DEBUG */ @@ -1266,21 +1280,23 @@ trx_undo_prev_version_build( MTR_MEMO_PAGE_S_FIX) || mtr_memo_contains(index_mtr, buf_block_align(index_rec), MTR_MEMO_PAGE_X_FIX)); + ut_ad(rec_offs_validate(rec, index, offsets)); + if (!(index->type & DICT_CLUSTERED)) { fprintf(stderr, "InnoDB: Error: trying to access" " update undo rec for non-clustered index %s\n" "InnoDB: Submit a detailed bug report to" " http://bugs.mysql.com\n" "InnoDB: index record ", index->name); - rec_print(stderr, index_rec); + rec_print(stderr, index_rec, index); fputs("\n" "InnoDB: record version ", stderr); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); putc('\n', 
stderr); return(DB_ERROR); } - roll_ptr = row_get_rec_roll_ptr(rec, index); + roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); old_roll_ptr = roll_ptr; *old_vers = NULL; @@ -1292,7 +1308,7 @@ trx_undo_prev_version_build( return(DB_SUCCESS); } - rec_trx_id = row_get_rec_trx_id(rec, index); + rec_trx_id = row_get_rec_trx_id(rec, index, offsets); err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap); @@ -1341,10 +1357,10 @@ trx_undo_prev_version_build( ut_print_buf(stderr, undo_rec, 150); fputs("\n" "InnoDB: index record ", stderr); - rec_print(stderr, index_rec); + rec_print(stderr, index_rec, index); fputs("\n" "InnoDB: record version ", stderr); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); fprintf(stderr, "\n" "InnoDB: Record trx id %lu %lu, update rec trx id %lu %lu\n" "InnoDB: Roll ptr in rec %lu %lu, in update rec %lu %lu\n", @@ -1358,11 +1374,10 @@ trx_undo_prev_version_build( (ulong) ut_dulint_get_low(roll_ptr)); trx_purge_sys_print(); - return(DB_ERROR); } - if (row_upd_changes_field_size_or_external(rec, index, update)) { + if (row_upd_changes_field_size_or_external(index, offsets, update)) { ulint* ext_vect; ulint n_ext_vect; @@ -1372,27 +1387,28 @@ trx_undo_prev_version_build( those fields that update updates to become externally stored fields. Store the info to ext_vect: */ - ext_vect = mem_alloc(sizeof(ulint) * rec_get_n_fields(rec)); - n_ext_vect = btr_push_update_extern_fields(ext_vect, rec, + ext_vect = mem_alloc(sizeof(ulint) + * rec_offs_n_fields(offsets)); + n_ext_vect = btr_push_update_extern_fields(ext_vect, offsets, update); entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); row_upd_index_replace_new_col_vals(entry, index, update, heap); - buf = mem_heap_alloc(heap, rec_get_converted_size(entry)); + buf = mem_heap_alloc(heap, + rec_get_converted_size(index, entry)); - *old_vers = rec_convert_dtuple_to_rec(buf, entry); + *old_vers = rec_convert_dtuple_to_rec(buf, index, entry); /* Now set the extern bits in the old version of the record */ - rec_set_field_extern_bits(*old_vers, ext_vect, n_ext_vect, - NULL); + rec_set_field_extern_bits(*old_vers, index, + ext_vect, n_ext_vect, NULL); mem_free(ext_vect); } else { - buf = mem_heap_alloc(heap, rec_get_size(rec)); - - *old_vers = rec_copy(buf, rec); - - row_upd_rec_in_place(*old_vers, update); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + *old_vers = rec_copy(buf, rec, offsets); + rec_offs_make_valid(*old_vers, index, offsets); + row_upd_rec_in_place(*old_vers, offsets, update); } return(DB_SUCCESS); diff --git a/innobase/trx/trx0roll.c b/innobase/trx/trx0roll.c index eb7c7f43f03..fdfb7428129 100644 --- a/innobase/trx/trx0roll.c +++ b/innobase/trx/trx0roll.c @@ -30,9 +30,13 @@ Created 3/26/1996 Heikki Tuuri /* This many pages must be undone before a truncate is tried within rollback */ #define TRX_ROLL_TRUNC_THRESHOLD 1 +/* In crash recovery, the current trx to be rolled back */ +trx_t* trx_roll_crash_recv_trx = NULL; + /* In crash recovery we set this to the undo n:o of the current trx to be rolled back. Then we can print how many % the rollback has progressed. 
*/ ib_longlong trx_roll_max_undo_no; + /* Auxiliary variable which tells the previous progress % we printed */ ulint trx_roll_progress_printed_pct; @@ -48,6 +52,7 @@ trx_general_rollback_for_mysql( trx_savept_t* savept) /* in: pointer to savepoint undo number, if partial rollback requested */ { +#ifndef UNIV_HOTBACKUP mem_heap_t* heap; que_thr_t* thr; roll_node_t* roll_node; @@ -99,6 +104,12 @@ trx_general_rollback_for_mysql( srv_active_wake_master_thread(); return((int) trx->error_state); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ } /*********************************************************************** @@ -313,6 +324,51 @@ trx_savepoint_for_mysql( } /*********************************************************************** +Releases a named savepoint. Savepoints which +were set after this savepoint are deleted. */ + +ulint +trx_release_savepoint_for_mysql( +/*============================*/ + /* out: if no savepoint + of the name found then + DB_NO_SAVEPOINT, + otherwise DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + const char* savepoint_name) /* in: savepoint name */ +{ + trx_named_savept_t* savep; + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep == NULL) { + + return(DB_NO_SAVEPOINT); + } + + /* We can now free all savepoints strictly later than this one */ + + trx_roll_savepoints_free(trx, savep); + + /* Now we can free this savepoint too */ + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + + mem_free(savep->name); + mem_free(savep); + + return(DB_SUCCESS); +} + +/*********************************************************************** Returns a transaction savepoint taken at this point in time. */ trx_savept_t @@ -331,11 +387,20 @@ trx_savept_take( /*********************************************************************** Rollback or clean up transactions which have no user session. If the transaction already was committed, then we clean up a possible insert -undo log. If the transaction was not yet committed, then we roll it back. */ +undo log. If the transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. 
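As the "background thread" note above signals, the hunk that follows turns this rollback-or-clean pass into a thread entry point that terminates through os_thread_exit() rather than a plain return (with a dummy return value only to satisfy the compiler). A pthread-based sketch of that shape is shown below; it is purely illustrative and does not use any InnoDB API.

#include <pthread.h>
#include <stdio.h>

/* Illustrative only: cleanup work run on a background thread that exits
   through pthread_exit(), analogous to exiting via os_thread_exit(). */

static void *rollback_worker(void *arg)
{
        (void) arg;

        printf("background rollback: scanning leftover transactions...\n");
        /* ... roll back or clean up each one here ... */
        printf("background rollback: done\n");

        pthread_exit(NULL);

        return NULL;    /* dummy, only to keep some compilers happy */
}

int main(void)
{
        pthread_t tid;

        if (pthread_create(&tid, NULL, rollback_worker, NULL) != 0) {
                return 1;
        }
        pthread_join(tid, NULL);        /* joined here only for the demo */
        return 0;
}
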
*/ -void -trx_rollback_or_clean_all_without_sess(void) -/*========================================*/ +#ifndef __WIN__ +void* +#else +ulint +#endif +trx_rollback_or_clean_all_without_sess( +/*===================================*/ + /* out: a dummy parameter */ + void* arg __attribute__((unused))) + /* in: a dummy parameter required by + os_thread_create */ { mem_heap_t* heap; que_fork_t* fork; @@ -360,9 +425,9 @@ trx_rollback_or_clean_all_without_sess(void) if (UT_LIST_GET_FIRST(trx_sys->trx_list)) { fprintf(stderr, - "InnoDB: Starting rollback of uncommitted transactions\n"); +"InnoDB: Starting in background the rollback of uncommitted transactions\n"); } else { - return; + goto leave_function; } loop: heap = mem_heap_create(512); @@ -371,24 +436,32 @@ loop: trx = UT_LIST_GET_FIRST(trx_sys->trx_list); - while (trx && (trx->sess || (trx->conc_state == TRX_NOT_STARTED))) { + while (trx) { + if ((trx->sess || (trx->conc_state == TRX_NOT_STARTED))) { + trx = UT_LIST_GET_NEXT(trx_list, trx); + } else if (trx->conc_state == TRX_PREPARED) { - trx = UT_LIST_GET_NEXT(trx_list, trx); + trx->sess = trx_dummy_sess; + trx = UT_LIST_GET_NEXT(trx_list, trx); + } else { + break; + } } mutex_exit(&kernel_mutex); if (trx == NULL) { + ut_print_timestamp(stderr); fprintf(stderr, - "InnoDB: Rollback of uncommitted transactions completed\n"); + " InnoDB: Rollback of non-prepared transactions completed\n"); mem_heap_free(heap); - - return; + + goto leave_function; } trx->sess = trx_dummy_sess; - + if (trx->conc_state == TRX_COMMITTED_IN_MEMORY) { fprintf(stderr, "InnoDB: Cleaning up trx with id %lu %lu\n", (ulong) ut_dulint_get_high(trx->id), @@ -417,21 +490,28 @@ loop: ut_a(thr == que_fork_start_command(fork)); + trx_roll_crash_recv_trx = trx; trx_roll_max_undo_no = ut_conv_dulint_to_longlong(trx->undo_no); trx_roll_progress_printed_pct = 0; rows_to_undo = trx_roll_max_undo_no; + if (rows_to_undo > 1000000000) { rows_to_undo = rows_to_undo / 1000000; unit = "M"; } + ut_print_timestamp(stderr); fprintf(stderr, -"InnoDB: Rolling back trx with id %lu %lu, %lu%s rows to undo", +" InnoDB: Rolling back trx with id %lu %lu, %lu%s rows to undo\n", (ulong) ut_dulint_get_high(trx->id), (ulong) ut_dulint_get_low(trx->id), (ulong) rows_to_undo, unit); mutex_exit(&kernel_mutex); + trx->mysql_thread_id = os_thread_get_curr_id(); + + trx->mysql_process_no = os_proc_get_number(); + if (trx->dict_operation) { row_mysql_lock_data_dictionary(trx); } @@ -446,7 +526,7 @@ loop: fprintf(stderr, "InnoDB: Waiting for rollback of trx id %lu to end\n", - (ulong) ut_dulint_get_low(trx->id)); + (ulong) ut_dulint_get_low(trx->id)); os_thread_sleep(100000); mutex_enter(&kernel_mutex); @@ -485,7 +565,23 @@ loop: (ulong) ut_dulint_get_low(trx->id)); mem_heap_free(heap); + trx_roll_crash_recv_trx = NULL; + goto loop; + +leave_function: + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + + os_thread_exit(NULL); + + /* The following is dummy code to keep the compiler happy: */ + +#ifndef __WIN__ + return(NULL); +#else + return(0); +#endif } /*********************************************************************** @@ -846,16 +942,17 @@ try_again: ut_ad(ut_dulint_cmp(ut_dulint_add(undo_no, 1), trx->undo_no) == 0); /* We print rollback progress info if we are in a crash recovery - and the transaction has at least 1000 row operations to undo */ + and the transaction has at least 1000 row operations to undo. 
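trx_release_savepoint_for_mysql(), added a little earlier in this file's hunks, finds the named savepoint on the transaction's savepoint list, frees every savepoint set after it, and then frees the named one itself. A toy standalone version of that list surgery follows; the structures and names are invented for the sketch, not the UT_LIST machinery.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative only: release a named savepoint and everything newer. */

struct savept {
        const char     *name;
        struct savept  *next;           /* later savepoints are newer */
};

static struct savept *head = NULL;

static void savept_set(const char *name)
{
        struct savept  *s = malloc(sizeof *s);
        struct savept **tail = &head;

        s->name = name;
        s->next = NULL;
        while (*tail) {
                tail = &(*tail)->next;
        }
        *tail = s;
}

/* Returns 0 on success, -1 if no savepoint of that name exists
   (cf. DB_NO_SAVEPOINT). */
static int savept_release(const char *name)
{
        struct savept **pp = &head;
        struct savept  *s;

        while (*pp && strcmp((*pp)->name, name) != 0) {
                pp = &(*pp)->next;
        }
        if (*pp == NULL) {
                return -1;
        }
        /* Free the named savepoint and everything set after it. */
        while ((s = *pp) != NULL) {
                *pp = s->next;
                free(s);
        }
        return 0;
}

int main(void)
{
        savept_set("a");
        savept_set("b");
        savept_set("c");

        printf("release b: %d\n", savept_release("b"));
        printf("remaining: %s\n", head ? head->name : "(none)");
        return 0;
}
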
*/ + + if (trx == trx_roll_crash_recv_trx && trx_roll_max_undo_no > 1000) { - if (srv_is_being_started && trx_roll_max_undo_no > 1000) { - progress_pct = 100 - (ulint) + progress_pct = 100 - (ulint) ((ut_conv_dulint_to_longlong(undo_no) * 100) / trx_roll_max_undo_no); if (progress_pct != trx_roll_progress_printed_pct) { if (trx_roll_progress_printed_pct == 0) { fprintf(stderr, - "\nInnoDB: Progress in percents: %lu", (ulong) progress_pct); +"\nInnoDB: Progress in percents: %lu", (ulong) progress_pct); } else { fprintf(stderr, " %lu", (ulong) progress_pct); @@ -1140,10 +1237,12 @@ trx_finish_rollback_off_kernel( return; } +#ifdef UNIV_DEBUG if (lock_print_waits) { fprintf(stderr, "Trx %lu rollback finished\n", (ulong) ut_dulint_get_low(trx->id)); } +#endif /* UNIV_DEBUG */ trx_commit_off_kernel(trx); diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c index c7292fb7650..bf48c30e942 100644 --- a/innobase/trx/trx0sys.c +++ b/innobase/trx/trx0sys.c @@ -126,6 +126,22 @@ trx_doublewrite_init( } /******************************************************************** +Frees the doublewrite buffer. */ +static +void +trx_doublewrite_free(void) +/*======================*/ +{ + mutex_free(&(trx_doublewrite->mutex)); + + mem_free(trx_doublewrite->buf_block_arr); + ut_free(trx_doublewrite->write_buf_unaligned); + + mem_free(trx_doublewrite); + trx_doublewrite = NULL; +} + +/******************************************************************** Marks the trx sys header when we have successfully upgraded to the >= 4.1.x multiple tablespace format. */ @@ -513,6 +529,9 @@ trx_sys_doublewrite_init_or_restore_pages( fil_flush_file_spaces(FIL_TABLESPACE); + if (!srv_use_doublewrite_buf) + trx_doublewrite_free(); + leave_func: ut_free(unaligned_read_buf); } @@ -889,8 +908,12 @@ trx_sys_init_at_db_start(void) trx = UT_LIST_GET_FIRST(trx_sys->trx_list); for (;;) { - rows_to_undo += + + if ( trx->conc_state != TRX_PREPARED) { + rows_to_undo += ut_conv_dulint_to_longlong(trx->undo_no); + } + trx = UT_LIST_GET_NEXT(trx_list, trx); if (!trx) { diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index 13575a3cedd..f95491443ee 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -24,6 +24,7 @@ Created 3/26/1996 Heikki Tuuri #include "thr0loc.h" #include "btr0sea.h" #include "os0proc.h" +#include "trx0xa.h" /* Copy of the prototype for innobase_mysql_print_thd: this copy MUST be equal to the one in mysql/sql/ha_innodb.cc ! 
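The crash-recovery progress report above derives its percentage from the remaining undo number: 100 - (undo_no * 100) / trx_roll_max_undo_no, so the figure climbs toward 100 as the undo number counts down. A tiny standalone check of that arithmetic, with invented names:

#include <stdio.h>

/* Illustrative only: the rollback progress percentage used above. */

static unsigned long progress_pct(long long undo_no, long long max_undo_no)
{
        return 100 - (unsigned long) ((undo_no * 100) / max_undo_no);
}

int main(void)
{
        long long max_undo_no = 4000;   /* rows the transaction had modified */
        long long undo_no;

        for (undo_no = 4000; undo_no >= 0; undo_no -= 1000) {
                printf("undo_no=%lld -> %lu%%\n",
                       undo_no, progress_pct(undo_no, max_undo_no));
        }
        return 0;
}
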
*/ @@ -92,6 +93,8 @@ trx_create( trx->id = ut_dulint_zero; trx->no = ut_dulint_max; + trx->support_xa = TRUE; + trx->check_foreigns = TRUE; trx->check_unique_secondary = TRUE; @@ -155,11 +158,17 @@ trx_create( trx->n_tickets_to_enter_innodb = 0; trx->auto_inc_lock = NULL; - trx->n_lock_table_exp = 0; - trx->read_view_heap = mem_heap_create(256); + trx->global_read_view_heap = mem_heap_create(256); + trx->global_read_view = NULL; trx->read_view = NULL; + /* Set X/Open XA transaction identification to NULL */ + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; + + trx_reset_new_rec_lock_info(trx); + return(trx); } @@ -301,7 +310,6 @@ trx_free( ut_a(!trx->has_search_latch); ut_a(!trx->auto_inc_lock); - ut_a(!trx->n_lock_table_exp); ut_a(trx->dict_operation_lock_mode == 0); @@ -311,10 +319,12 @@ trx_free( ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0); - if (trx->read_view_heap) { - mem_heap_free(trx->read_view_heap); + if (trx->global_read_view_heap) { + mem_heap_free(trx->global_read_view_heap); } + trx->global_read_view = NULL; + ut_a(trx->read_view == NULL); mem_free(trx); @@ -430,13 +440,36 @@ trx_lists_init_at_db_start(void) trx = trx_create(NULL); trx->id = undo->trx_id; - + trx->xid = undo->xid; trx->insert_undo = undo; trx->rseg = rseg; if (undo->state != TRX_UNDO_ACTIVE) { - trx->conc_state = TRX_COMMITTED_IN_MEMORY; + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + + fprintf(stderr, +"InnoDB: Transaction %lu %lu was in the XA prepared state.\n", + ut_dulint_get_high(trx->id), + ut_dulint_get_low(trx->id)); + + if (srv_force_recovery == 0) { + + trx->conc_state = TRX_PREPARED; + } else { + fprintf(stderr, +"InnoDB: Since innodb_force_recovery > 0, we will rollback it anyway.\n"); + + trx->conc_state = TRX_ACTIVE; + } + } else { + trx->conc_state = + TRX_COMMITTED_IN_MEMORY; + } /* We give a dummy value for the trx no; this should have no relevance since purge @@ -479,10 +512,34 @@ trx_lists_init_at_db_start(void) trx = trx_create(NULL); trx->id = undo->trx_id; + trx->xid = undo->xid; if (undo->state != TRX_UNDO_ACTIVE) { - trx->conc_state = - TRX_COMMITTED_IN_MEMORY; + + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + fprintf(stderr, +"InnoDB: Transaction %lu %lu was in the XA prepared state.\n", + ut_dulint_get_high(trx->id), + ut_dulint_get_low(trx->id)); + + if (srv_force_recovery == 0) { + + trx->conc_state = TRX_PREPARED; + } else { + fprintf(stderr, +"InnoDB: Since innodb_force_recovery > 0, we will rollback it anyway.\n"); + + trx->conc_state = TRX_ACTIVE; + } + } else { + trx->conc_state = + TRX_COMMITTED_IN_MEMORY; + } + /* We give a dummy value for the trx number */ @@ -750,7 +807,8 @@ trx_commit_off_kernel( mutex_enter(&kernel_mutex); } - ut_ad(trx->conc_state == TRX_ACTIVE); + ut_ad(trx->conc_state == TRX_ACTIVE + || trx->conc_state == TRX_PREPARED); #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ @@ -776,15 +834,25 @@ trx_commit_off_kernel( lock_release_off_kernel(trx); if (trx->read_view) { + /* If transaction has a global read view this case + means that transaction has been using a consistent + read view associated to a cursor. Only the global + read view associated to a transaction is closed + and read view is then removed from the transaction. 
+ If read view associated to a cursor is still used + it must be re-registered to another transaction. */ + + if (UNIV_LIKELY_NULL(trx->global_read_view)) { + trx->read_view = trx->global_read_view; + } + read_view_close(trx->read_view); - mem_heap_empty(trx->read_view_heap); + mem_heap_empty(trx->global_read_view_heap); trx->read_view = NULL; + trx->global_read_view = NULL; } -/* fprintf(stderr, "Trx %lu commit finished\n", - ut_dulint_get_low(trx->id)); */ - if (must_flush_log) { mutex_exit(&kernel_mutex); @@ -829,14 +897,15 @@ trx_commit_off_kernel( /* Do nothing */ } else if (srv_flush_log_at_trx_commit == 1) { if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { - /* Write the log but do not flush it to disk */ + /* Write the log but do not flush it to disk */ - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, + FALSE); } else { - /* Write the log to the log files AND flush - them to disk */ + /* Write the log to the log files AND flush + them to disk */ - log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); } } else if (srv_flush_log_at_trx_commit == 2) { @@ -911,7 +980,9 @@ trx_assign_read_view( mutex_enter(&kernel_mutex); if (!trx->read_view) { - trx->read_view = read_view_open_now(trx, trx->read_view_heap); + trx->read_view = read_view_open_now(trx, + trx->global_read_view_heap); + trx->global_read_view = trx->read_view; } mutex_exit(&kernel_mutex); @@ -1609,20 +1680,24 @@ trx_print( (ulong) ut_dulint_get_high(trx->id), (ulong) ut_dulint_get_low(trx->id)); - switch (trx->conc_state) { + switch (trx->conc_state) { case TRX_NOT_STARTED: fputs(", not started", f); break; case TRX_ACTIVE: fprintf(f, ", ACTIVE %lu sec", (ulong)difftime(time(NULL), trx->start_time)); - break; + break; + case TRX_PREPARED: + fprintf(f, ", ACTIVE (PREPARED) %lu sec", + (ulong)difftime(time(NULL), trx->start_time)); + break; case TRX_COMMITTED_IN_MEMORY: fputs(", COMMITTED IN MEMORY", f); break; default: fprintf(f, " state %lu", (ulong) trx->conc_state); - } + } #ifdef UNIV_LINUX fprintf(f, ", process no %lu", trx->mysql_process_no); @@ -1647,11 +1722,10 @@ trx_print( putc('\n', f); if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { - - fprintf(f, "mysql tables in use %lu, locked %lu\n", - (ulong) trx->n_mysql_tables_in_use, - (ulong) trx->mysql_n_tables_locked); - } + fprintf(f, "mysql tables in use %lu, locked %lu\n", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + } newline = TRUE; @@ -1696,3 +1770,270 @@ trx_print( innobase_mysql_print_thd(f, trx->mysql_thd); } } + +/******************************************************************** +Prepares a transaction. */ + +void +trx_prepare_off_kernel( +/*===================*/ + trx_t* trx) /* in: transaction */ +{ + page_t* update_hdr_page; + trx_rseg_t* rseg; + ibool must_flush_log = FALSE; + dulint lsn; + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + rseg = trx->rseg; + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + must_flush_log = TRUE; + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to TRX_UNDO_PREPARED: these modifications to the file data + structure define the transaction as prepared in the + file-based world, at the serialization point of lsn. 
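The commit path above, and the prepare-side copy of the same logic in trx_prepare_off_kernel() just below, gate the log write on srv_flush_log_at_trx_commit: 0 does nothing at this point, 1 writes the log and also flushes it to disk unless the flush method is SRV_UNIX_NOSYNC, and 2 writes it without a flush. A condensed standalone sketch of that decision follows; the enum names and write_log_up_to() are invented stand-ins, not the InnoDB log API.

#include <stdio.h>

/* Illustrative only: the per-commit log flush decision. */

enum { FLUSH_NONE = 0, FLUSH_EACH_COMMIT = 1, FLUSH_WRITE_ONLY = 2 };

static void write_log_up_to(unsigned long long lsn, int flush_to_disk)
{
        printf("write log up to %llu, fsync=%d\n", lsn, flush_to_disk);
}

static void flush_log_at_commit(int setting, int nosync_method,
                                unsigned long long lsn)
{
        if (setting == FLUSH_NONE) {
                /* do nothing here; the log is flushed in the background */
        } else if (setting == FLUSH_EACH_COMMIT) {
                /* write, and flush unless the file flush method is NOSYNC */
                write_log_up_to(lsn, !nosync_method);
        } else if (setting == FLUSH_WRITE_ONLY) {
                /* write now, leave the disk flush for later */
                write_log_up_to(lsn, 0);
        }
}

int main(void)
{
        flush_log_at_commit(FLUSH_EACH_COMMIT, 0, 123456ULL);
        flush_log_at_commit(FLUSH_WRITE_ONLY,  0, 123457ULL);
        return 0;
}
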
*/ + + mutex_enter(&(rseg->mutex)); + + if (trx->insert_undo != NULL) { + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction prepare for this transaction. */ + + trx_undo_set_state_at_prepare(trx, trx->insert_undo, + &mtr); + } + + if (trx->update_undo) { + update_hdr_page = trx_undo_set_state_at_prepare(trx, + trx->update_undo, &mtr); + } + + mutex_exit(&(rseg->mutex)); + + /*--------------*/ + mtr_commit(&mtr); /* This mtr commit makes the + transaction prepared in the file-based + world */ + /*--------------*/ + lsn = mtr.end_lsn; + + mutex_enter(&kernel_mutex); + } + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + /*--------------------------------------*/ + trx->conc_state = TRX_PREPARED; + /*--------------------------------------*/ + + if (must_flush_log) { + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the prepared state of the + transaction durable if the OS does not crash. We may also + flush the log files to disk, making the prepared state of the + transaction durable also at an OS crash or a power outage. + + The idea in InnoDB's group prepare is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which prepares the whole + group. Note that this group prepare will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + TODO: find out if MySQL holds some mutex when calling this. + That would spoil our group prepare algorithm. */ + + mutex_exit(&kernel_mutex); + + if (srv_flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, + FALSE); + } else { + /* Write the log to the log files AND flush + them to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_error; + } + + mutex_enter(&kernel_mutex); + } +} + +/************************************************************************** +Does the transaction prepare for MySQL. */ + +ulint +trx_prepare_for_mysql( +/*====-=============*/ + /* out: 0 or error number */ + trx_t* trx) /* in: trx handle */ +{ + /* Because we do not do the prepare by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + ut_a(trx); + + trx->op_info = "preparing"; + + trx_start_if_not_started(trx); + + mutex_enter(&kernel_mutex); + + trx_prepare_off_kernel(trx); + + mutex_exit(&kernel_mutex); + + trx->op_info = ""; + + return(0); +} + +/************************************************************************** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. 
*/ + +int +trx_recover_for_mysql( +/*==================*/ + /* out: number of prepared transactions + stored in xid_list */ + XID* xid_list, /* in/out: prepared transactions */ + ulint len) /* in: number of slots in xid_list */ +{ + trx_t* trx; + int count = 0; + + ut_ad(xid_list); + ut_ad(len); + + /* We should add those transactions which are in the prepared state + to the xid_list */ + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + if (trx->conc_state == TRX_PREPARED) { + xid_list[count] = trx->xid; + + if (count == 0) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Starting recovery for XA transactions...\n"); + } + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Transaction %lu %lu in prepared state after recovery\n", + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id)); + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Transaction contains changes to %lu rows\n", + (ulong)ut_conv_dulint_to_longlong(trx->undo_no)); + + count++; + + if ((uint)count == len ) { + break; + } + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + if (count > 0){ + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: %d transactions in prepared state after recovery\n", + count); + } + + return (count); +} + +/*********************************************************************** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state. */ + +trx_t* +trx_get_trx_by_xid( +/*===============*/ + /* out: trx or NULL */ + XID* xid) /* in: X/Open XA transaction identification */ +{ + trx_t* trx; + + if (xid == NULL) { + + return (NULL); + } + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + /* Compare two X/Open XA transaction IDs: their + lengths should be the same and the binary comparison + of gtrid_length+bqual_length bytes of data should be + the same */ + + if (xid->gtrid_length == trx->xid.gtrid_length && + xid->bqual_length == trx->xid.bqual_length && + memcmp(xid->data, trx->xid.data, + xid->gtrid_length + + xid->bqual_length) == 0) { + break; + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + if (trx) { + if (trx->conc_state != TRX_PREPARED) { + + return(NULL); + } + + return(trx); + } else { + return(NULL); + } +} diff --git a/innobase/trx/trx0undo.c b/innobase/trx/trx0undo.c index 8d1518753dd..7441dd3f152 100644 --- a/innobase/trx/trx0undo.c +++ b/innobase/trx/trx0undo.c @@ -19,6 +19,7 @@ Created 3/26/1996 Heikki Tuuri #include "srv0srv.h" #include "trx0rec.h" #include "trx0purge.h" +#include "trx0xa.h" /* How should the old versions in the history list be managed? ---------------------------------------------------------- @@ -97,10 +98,13 @@ trx_undo_mem_create( TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log is created */ + XID* xid, /* in: X/Open XA transaction identification*/ ulint page_no,/* in: undo log header page number */ - ulint offset); /* in: undo log header byte offset on page */ + ulint offset);/* in: undo log header byte offset on page */ /******************************************************************* -Initializes a cached insert undo log header page for new use. */ +Initializes a cached insert undo log header page for new use. NOTE that this +function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change +the operation of this function! 
*/ static ulint trx_undo_insert_header_reuse( @@ -181,7 +185,8 @@ trx_undo_get_prev_rec( /* We have to go to the previous undo log page to look for the previous record */ - return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset, mtr)); + return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset, + mtr)); } /*************************************************************************** @@ -473,15 +478,17 @@ trx_undo_header_create_log( } /******************************************************************* -Creates a new undo log header in file. */ +Creates a new undo log header in file. NOTE that this function has its own +log record type MLOG_UNDO_HDR_CREATE. You must NOT change the operation of +this function! */ static ulint trx_undo_header_create( /*===================*/ /* out: header byte offset on page */ page_t* undo_page, /* in: undo log segment header page, - x-latched; it is assumed that there is - TRX_UNDO_LOG_HDR_SIZE bytes free space + x-latched; it is assumed that there are + TRX_UNDO_LOG_XA_HDR_SIZE bytes free space on it */ dulint trx_id, /* in: transaction id */ mtr_t* mtr) /* in: mtr */ @@ -503,9 +510,9 @@ trx_undo_header_create( log_hdr = undo_page + free; - new_free = free + TRX_UNDO_LOG_HDR_SIZE; + new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; - ut_ad(new_free <= UNIV_PAGE_SIZE); + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); @@ -530,16 +537,98 @@ trx_undo_header_create( mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); - mach_write_to_2(log_hdr + TRX_UNDO_DICT_OPERATION, FALSE); - + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); + mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0); mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log); - + + /* Write the log record about the header creation */ trx_undo_header_create_log(undo_page, trx_id, mtr); return(free); } +/************************************************************************ +Write X/Open XA Transaction Identification (XID) to undo log header */ +static +void +trx_undo_write_xid( +/*===============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + const XID* xid, /* in: X/Open XA Transaction Identification */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT, + (ulint)xid->formatID, MLOG_4BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN, + (ulint)xid->gtrid_length, MLOG_4BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN, + (ulint)xid->bqual_length, MLOG_4BYTES, mtr); + + mlog_write_string(log_hdr + TRX_UNDO_XA_XID, (const byte*) xid->data, + XIDDATASIZE, mtr); +} + +/************************************************************************ +Read X/Open XA Transaction Identification (XID) from undo log header */ +static +void +trx_undo_read_xid( +/*==============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + XID* xid) /* out: X/Open XA Transaction Identification */ +{ + xid->formatID = (long)mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT); + + xid->gtrid_length = + (long)mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN); + xid->bqual_length = + (long)mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN); + + memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE); +} + +/******************************************************************* +Adds space for the XA XID after an undo log old-style header. 
*/ +static +void +trx_undo_header_add_space_for_xid( +/*==============================*/ + page_t* undo_page,/* in: undo log segment header page */ + trx_ulogf_t* log_hdr,/* in: undo log header */ + mtr_t* mtr) /* in: mtr */ +{ + trx_upagef_t* page_hdr; + ulint free; + ulint new_free; + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE); + + /* free is now the end offset of the old style undo log header */ + + ut_a(free == (ulint)(log_hdr - undo_page) + TRX_UNDO_LOG_OLD_HDR_SIZE); + + new_free = free + (TRX_UNDO_LOG_XA_HDR_SIZE + - TRX_UNDO_LOG_OLD_HDR_SIZE); + + /* Add space for a XID after the header, update the free offset + fields on the undo log page and in the undo log header */ + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_START, new_free, + MLOG_2BYTES, mtr); + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, new_free, + MLOG_2BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, new_free, + MLOG_2BYTES, mtr); +} + /************************************************************************** Writes the mtr log entry of an undo log header reuse. */ UNIV_INLINE @@ -590,7 +679,9 @@ trx_undo_parse_page_header( } /******************************************************************* -Initializes a cached insert undo log header page for new use. */ +Initializes a cached insert undo log header page for new use. NOTE that this +function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change +the operation of this function! */ static ulint trx_undo_insert_header_reuse( @@ -614,9 +705,11 @@ trx_undo_insert_header_reuse( free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE; + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); + log_hdr = undo_page + free; - new_free = free + TRX_UNDO_LOG_HDR_SIZE; + new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; /* Insert undo data is not needed after commit: we may free all the space on the page */ @@ -636,8 +729,10 @@ trx_undo_insert_header_reuse( mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); - mach_write_to_2(log_hdr + TRX_UNDO_DICT_OPERATION, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); + /* Write the log record MLOG_UNDO_HDR_REUSE */ trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr); return(free); @@ -800,7 +895,6 @@ trx_undo_free_page( list */ ulint space, /* in: space */ ulint hdr_page_no, /* in: header page number */ - ulint hdr_offset, /* in: header offset */ ulint page_no, /* in: page number to free: must not be the header page */ mtr_t* mtr) /* in: mtr which does not have a latch to any @@ -813,7 +907,6 @@ trx_undo_free_page( trx_rsegf_t* rseg_header; ulint hist_size; - UT_NOT_USED(hdr_offset); ut_a(hdr_page_no != page_no); #ifdef UNIV_SYNC_DEBUG ut_ad(!mutex_own(&kernel_mutex)); @@ -870,8 +963,7 @@ trx_undo_free_page_in_rollback( #endif /* UNIV_SYNC_DEBUG */ last_page_no = trx_undo_free_page(undo->rseg, FALSE, undo->space, - undo->hdr_page_no, undo->hdr_offset, - page_no, mtr); + undo->hdr_page_no, page_no, mtr); undo->last_page_no = last_page_no; undo->size--; @@ -1039,7 +1131,7 @@ loop: trx_undo_empty_header_page(space, hdr_page_no, hdr_offset, &mtr); } else { - trx_undo_free_page(rseg, TRUE, space, hdr_page_no, hdr_offset, + trx_undo_free_page(rseg, TRUE, space, hdr_page_no, page_no, &mtr); } @@ -1123,7 +1215,9 @@ trx_undo_mem_create_at_db_start( fil_addr_t last_addr; page_t* last_page; trx_undo_rec_t* rec; - + XID xid; + 
ibool xid_exists = FALSE; + if (id >= TRX_RSEG_N_SLOTS) { fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", (ulong) id); @@ -1145,15 +1239,30 @@ trx_undo_mem_create_at_db_start( undo_header = undo_page + offset; trx_id = mtr_read_dulint(undo_header + TRX_UNDO_TRX_ID, mtr); + + xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS, + MLOG_1BYTE, mtr); + + /* Read X/Open XA transaction identification if it exists, or + set it to NULL. */ + + memset(&xid, 0, sizeof(xid)); + xid.formatID = -1; + + if (xid_exists == TRUE) { + trx_undo_read_xid(undo_header, &xid); + } + mutex_enter(&(rseg->mutex)); - undo = trx_undo_mem_create(rseg, id, type, trx_id, page_no, offset); - + undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid, + page_no, offset); mutex_exit(&(rseg->mutex)); - undo->dict_operation = mtr_read_ulint( - undo_header + TRX_UNDO_DICT_OPERATION, - MLOG_2BYTES, mtr); + undo->dict_operation = mtr_read_ulint( + undo_header + TRX_UNDO_DICT_TRANS, + MLOG_1BYTE, mtr); + undo->table_id = mtr_read_dulint(undo_header + TRX_UNDO_TABLE_ID, mtr); undo->state = state; undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr); @@ -1272,7 +1381,8 @@ trx_undo_mem_create( ulint type, /* in: type of the log: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log - is created */ + is created */ + XID* xid, /* in: X/Open transaction identification */ ulint page_no,/* in: undo log header page number */ ulint offset) /* in: undo log header byte offset on page */ { @@ -1295,6 +1405,7 @@ trx_undo_mem_create( undo->state = TRX_UNDO_ACTIVE; undo->del_marks = FALSE; undo->trx_id = trx_id; + undo->xid = *xid; undo->dict_operation = FALSE; @@ -1322,6 +1433,7 @@ trx_undo_mem_init_for_reuse( trx_undo_t* undo, /* in: undo log to init */ dulint trx_id, /* in: id of the trx for which the undo log is created */ + XID* xid, /* in: X/Open XA transaction identification*/ ulint offset) /* in: undo log header byte offset on page */ { #ifdef UNIV_SYNC_DEBUG @@ -1339,6 +1451,7 @@ trx_undo_mem_init_for_reuse( undo->state = TRX_UNDO_ACTIVE; undo->del_marks = FALSE; undo->trx_id = trx_id; + undo->xid = *xid; undo->dict_operation = FALSE; @@ -1371,11 +1484,13 @@ trx_undo_create( /*============*/ /* out: undo log object, NULL if did not succeed: out of space */ + trx_t* trx, /* in: transaction */ trx_rseg_t* rseg, /* in: rollback segment memory copy */ ulint type, /* in: type of the log: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log is created */ + XID* xid, /* in: X/Open transaction identification*/ mtr_t* mtr) /* in: mtr */ { trx_rsegf_t* rseg_header; @@ -1410,10 +1525,15 @@ trx_undo_create( page_no = buf_frame_get_page_no(undo_page); - offset = trx_undo_header_create(undo_page, trx_id, mtr); + offset = trx_undo_header_create(undo_page, trx_id, mtr); - undo = trx_undo_mem_create(rseg, id, type, trx_id, page_no, offset); - + if (trx->support_xa) { + trx_undo_header_add_space_for_xid(undo_page, + undo_page + offset, mtr); + } + + undo = trx_undo_mem_create(rseg, id, type, trx_id, xid, + page_no, offset); return(undo); } @@ -1427,11 +1547,13 @@ trx_undo_reuse_cached( /*==================*/ /* out: the undo log memory object, NULL if none cached */ + trx_t* trx, /* in: transaction */ trx_rseg_t* rseg, /* in: rollback segment memory object */ ulint type, /* in: type of the log: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log is used */ + XID* xid, /* in: X/Open XA transaction 
identification */ mtr_t* mtr) /* in: mtr */ { trx_undo_t* undo; @@ -1476,15 +1598,25 @@ trx_undo_reuse_cached( if (type == TRX_UNDO_INSERT) { offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid(undo_page, + undo_page + offset, mtr); + } } else { ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); offset = trx_undo_header_create(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid(undo_page, + undo_page + offset, mtr); + } } - trx_undo_mem_init_for_reuse(undo, trx_id, offset); + trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset); return(undo); } @@ -1506,9 +1638,10 @@ trx_undo_mark_as_dict_operation( hdr_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); - mlog_write_ulint(hdr_page + undo->hdr_offset + TRX_UNDO_DICT_OPERATION, - trx->dict_operation, MLOG_2BYTES, mtr); - + mlog_write_ulint(hdr_page + undo->hdr_offset + + TRX_UNDO_DICT_TRANS, + trx->dict_operation, MLOG_1BYTE, mtr); + mlog_write_dulint(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID, trx->table_id, mtr); @@ -1548,11 +1681,11 @@ trx_undo_assign_undo( #endif /* UNIV_SYNC_DEBUG */ mutex_enter(&(rseg->mutex)); - undo = trx_undo_reuse_cached(rseg, type, trx->id, &mtr); - + undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid, + &mtr); if (undo == NULL) { - undo = trx_undo_create(rseg, type, trx->id, &mtr); - + undo = trx_undo_create(trx, rseg, type, trx->id, &trx->xid, + &mtr); if (undo == NULL) { /* Did not succeed */ @@ -1632,6 +1765,57 @@ trx_undo_set_state_at_finish( return(undo_page); } +/********************************************************************** +Sets the state of the undo log segment at a transaction prepare. 
*/ + +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + /* out: undo log segment header page, + x-latched */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + mtr_t* mtr) /* in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + trx_ulogf_t* undo_header; + page_t* undo_page; + ulint offset; + + ut_ad(trx && undo && mtr); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption((byte*)undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + /*------------------------------*/ + undo->state = TRX_UNDO_PREPARED; + undo->xid = trx->xid; + /*------------------------------*/ + + mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state, + MLOG_2BYTES, mtr); + + offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + undo_header = undo_page + offset; + + mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS, + TRUE, MLOG_1BYTE, mtr); + + trx_undo_write_xid(undo_header, &undo->xid, mtr); + + return(undo_page); +} + /************************************************************************** Adds the update undo log header as the first in the history list, and frees the memory object, or puts it to the list of cached update undo log diff --git a/innobase/ut/ut0dbg.c b/innobase/ut/ut0dbg.c index 0f6a27d35d9..e810d8dead7 100644 --- a/innobase/ut/ut0dbg.c +++ b/innobase/ut/ut0dbg.c @@ -8,8 +8,11 @@ Created 1/30/1994 Heikki Tuuri #include "univ.i" +#if defined(__GNUC__) && (__GNUC__ > 2) +#else /* This is used to eliminate compiler warnings */ ulint ut_dbg_zero = 0; +#endif /* If this is set to TRUE all threads will stop into the next assertion and assert */ @@ -19,21 +22,69 @@ ibool panic_shutdown = FALSE; /* This is set to TRUE when on NetWare there happens an InnoDB assertion failure or other fatal error condition that requires an immediate shutdown. */ -#endif +#else /* __NETWARE__ */ /* Null pointer used to generate memory trap */ ulint* ut_dbg_null_ptr = NULL; +#endif /* __NETWARE__ */ + +/***************************************************************** +Report a failed assertion. */ -const char* ut_dbg_msg_assert_fail = -"InnoDB: Assertion failure in thread %lu in file %s line %lu\n"; -const char* ut_dbg_msg_trap = +void +ut_dbg_assertion_failed( +/*====================*/ + const char* expr, /* in: the failed assertion (optional) */ + const char* file, /* in: source file containing the assertion */ + ulint line) /* in: line number of the assertion */ +{ + ut_print_timestamp(stderr); + fprintf(stderr, + "InnoDB: Assertion failure in thread %lu" + " in file %s line %lu\n", + os_thread_pf(os_thread_get_curr_id()), file, line); + if (expr) { + fprintf(stderr, + "InnoDB: Failing assertion: %s\n", expr); + } + + fputs( "InnoDB: We intentionally generate a memory trap.\n" "InnoDB: Submit a detailed bug report to http://bugs.mysql.com.\n" "InnoDB: If you get repeated assertion failures or crashes, even\n" "InnoDB: immediately after the mysqld startup, there may be\n" "InnoDB: corruption in the InnoDB tablespace. 
Please refer to\n" "InnoDB: http://dev.mysql.com/doc/mysql/en/Forcing_recovery.html\n" -"InnoDB: about forcing recovery.\n"; +"InnoDB: about forcing recovery.\n", stderr); + ut_dbg_stop_threads = TRUE; +} + +#ifdef __NETWARE__ +/***************************************************************** +Shut down MySQL/InnoDB after assertion failure. */ + +void +ut_dbg_panic(void) +/*==============*/ +{ + if (!panic_shutdown) { + panic_shutdown = TRUE; + innobase_shutdown_for_mysql(); + } + exit(1); +} +#else /* __NETWARE__ */ +/***************************************************************** +Stop a thread after assertion failure. */ -const char* ut_dbg_msg_stop = -"InnoDB: Thread %lu stopped in file %s line %lu\n"; +void +ut_dbg_stop_thread( +/*===============*/ + const char* file, + ulint line) +{ + fprintf(stderr, "InnoDB: Thread %lu stopped in file %s line %lu\n", + os_thread_pf(os_thread_get_curr_id()), file, line); + os_thread_sleep(1000000000); +} +#endif /* __NETWARE__ */ diff --git a/innobase/ut/ut0mem.c b/innobase/ut/ut0mem.c index 18ee53a354c..3e8fd79a739 100644 --- a/innobase/ut/ut0mem.c +++ b/innobase/ut/ut0mem.c @@ -92,6 +92,8 @@ retry: "InnoDB: ulimits of your operating system.\n" "InnoDB: On FreeBSD check you have compiled the OS with\n" "InnoDB: a big enough maximum process size.\n" + "InnoDB: Note that in most 32-bit computers the process\n" + "InnoDB: memory space is limited to 2 GB or 4 GB.\n" "InnoDB: We keep retrying the allocation for 60 seconds...\n", (ulong) n, (ulong) ut_total_allocated_memory, #ifdef __WIN__ diff --git a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c index 2a2db9442b3..1be5939303a 100644 --- a/innobase/ut/ut0ut.c +++ b/innobase/ut/ut0ut.c @@ -74,6 +74,28 @@ ut_time(void) } /************************************************************** +Returns system time. */ + +void +ut_usectime( +/*========*/ + ulint* sec, /* out: seconds since the Epoch */ + ulint* ms) /* out: microseconds since the Epoch+*sec */ +{ +#ifdef __WIN__ + SYSTEMTIME st; + GetLocalTime(&st); + *sec = (ulint) st.wSecond; + *ms = (ulint) st.wMilliseconds; +#else + struct timeval tv; + gettimeofday(&tv,NULL); + *sec = (ulint) tv.tv_sec; + *ms = (ulint) tv.tv_usec; +#endif +} + +/************************************************************** Returns the difference of two times in seconds. */ double @@ -394,7 +416,11 @@ ut_print_namel( { const char* s = name; const char* e = s + namelen; +#ifdef UNIV_HOTBACKUP + int q = '"'; +#else int q = mysql_get_identifier_quote_char(trx, name, namelen); +#endif if (q == EOF) { fwrite(name, 1, namelen, f); return; |
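The prepare path added in trx0trx.c above reuses the commit path's log flushing policy: depending on srv_flush_log_at_trx_commit, the prepared state is made durable immediately, written but not flushed, or left to the periodic background log flush. The following is a minimal standalone sketch of that decision; write_log() and flush_policy() are hypothetical stand-ins, not the real log_write_up_to() API.

#include <stdio.h>

/* Stand-in for the real call; the patch uses
log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, flush_to_disk). */
static void write_log(int flush_to_disk)
{
	printf("write log buffer to the log files%s\n",
	       flush_to_disk ? " and flush them to disk" : "");
}

/* Mirrors the if/else chain in trx_prepare_off_kernel() and
trx_commit_off_kernel(); nosync models
srv_unix_file_flush_method == SRV_UNIX_NOSYNC. */
static void flush_policy(int flush_log_at_trx_commit, int nosync)
{
	if (flush_log_at_trx_commit == 0) {
		/* Do nothing: durability is left to the periodic
		background log flush. */
	} else if (flush_log_at_trx_commit == 1) {
		write_log(!nosync);
	} else if (flush_log_at_trx_commit == 2) {
		write_log(0);	/* write, but let the OS flush later */
	}
}

int main(void)
{
	flush_policy(1, 0);	/* durable at prepare/commit */
	flush_policy(2, 0);	/* written, flushed by the OS later */
	return 0;
}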
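trx_recover_for_mysql() and trx_get_trx_by_xid() above both scan trx_sys->trx_list: the first collects the XIDs of transactions still in TRX_PREPARED state into the caller's array, the second finds a prepared transaction whose XID matches byte for byte. Below is a self-contained sketch of both rules over plain arrays; the structs only mirror the XID fields the patch uses and are not the trx0xa.h definitions.

#include <string.h>

#define XIDDATASIZE	128		/* size of the XID data array in XA */

typedef struct {
	long	formatID;		/* -1 means "no XID present" */
	long	gtrid_length;
	long	bqual_length;
	char	data[XIDDATASIZE];
} xid_sketch_t;

typedef struct {
	int		prepared;	/* models conc_state == TRX_PREPARED */
	xid_sketch_t	xid;
} trx_sketch_t;

/* Matching rule of trx_get_trx_by_xid(): both length fields must be equal
and the first gtrid_length + bqual_length bytes of data must compare equal. */
static int xid_matches(const xid_sketch_t* a, const xid_sketch_t* b)
{
	return a->gtrid_length == b->gtrid_length
		&& a->bqual_length == b->bqual_length
		&& memcmp(a->data, b->data,
			  (size_t)(a->gtrid_length + a->bqual_length)) == 0;
}

/* Collection loop of trx_recover_for_mysql(): copy the XIDs of prepared
transactions into xid_list, stopping when the list is full. */
static int recover_prepared(const trx_sketch_t* trx, int n_trx,
			    xid_sketch_t* xid_list, int len)
{
	int	count = 0;
	int	i;

	for (i = 0; i < n_trx && count < len; i++) {
		if (trx[i].prepared) {
			xid_list[count++] = trx[i].xid;
		}
	}

	return count;
}

int main(void)
{
	trx_sketch_t	trx[2];
	xid_sketch_t	found[2];
	int		n;

	memset(trx, 0, sizeof(trx));
	trx[1].prepared = 1;
	trx[1].xid.formatID = 1;
	trx[1].xid.gtrid_length = 3;
	memcpy(trx[1].xid.data, "abc", 3);

	n = recover_prepared(trx, 2, found, 2);

	return !(n == 1 && xid_matches(&found[0], &trx[1].xid));
}

The server would then decide, per returned XID, whether to commit or roll back the corresponding transaction; looking that transaction up again by XID is what trx_get_trx_by_xid() provides.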
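trx_undo_write_xid() and trx_undo_read_xid() above persist the XID in the undo log header as three 4-byte integers (format id, gtrid length, bqual length) followed by the raw XID data; that extra room is what trx_undo_header_add_space_for_xid() reserves when it grows the header from the old size to the XA size. The sketch below reproduces the same field layout into a private buffer; the 0/4/8/12 offsets and the helpers are illustrative stand-ins for the TRX_UNDO_XA_* offsets and the mach_write_to_4()/mach_read_from_4() routines, not their actual values.

#include <string.h>

#define XIDDATASIZE	128

typedef struct {
	long	formatID;
	long	gtrid_length;
	long	bqual_length;
	char	data[XIDDATASIZE];
} xid_sketch_t;

/* InnoDB's mach routines store integers most-significant-byte first;
these helpers do the same on a plain byte buffer. */
static void write_4(unsigned char* b, unsigned long n)
{
	b[0] = (unsigned char)(n >> 24);
	b[1] = (unsigned char)(n >> 16);
	b[2] = (unsigned char)(n >> 8);
	b[3] = (unsigned char)(n);
}

static unsigned long read_4(const unsigned char* b)
{
	return ((unsigned long)b[0] << 24) | ((unsigned long)b[1] << 16)
		| ((unsigned long)b[2] << 8) | (unsigned long)b[3];
}

/* Same field order as trx_undo_write_xid(). */
static void xid_write(unsigned char* hdr, const xid_sketch_t* xid)
{
	write_4(hdr + 0, (unsigned long)xid->formatID);
	write_4(hdr + 4, (unsigned long)xid->gtrid_length);
	write_4(hdr + 8, (unsigned long)xid->bqual_length);
	memcpy(hdr + 12, xid->data, XIDDATASIZE);
}

/* Same field order as trx_undo_read_xid(). */
static void xid_read(const unsigned char* hdr, xid_sketch_t* xid)
{
	xid->formatID = (long)read_4(hdr + 0);
	xid->gtrid_length = (long)read_4(hdr + 4);
	xid->bqual_length = (long)read_4(hdr + 8);
	memcpy(xid->data, hdr + 12, XIDDATASIZE);
}

int main(void)
{
	unsigned char	buf[12 + XIDDATASIZE];
	xid_sketch_t	in;
	xid_sketch_t	out;

	memset(&in, 0, sizeof(in));
	memset(&out, 0, sizeof(out));
	in.formatID = 1;
	in.gtrid_length = 4;
	memcpy(in.data, "gtrd", 4);

	xid_write(buf, &in);
	xid_read(buf, &out);

	return memcmp(&in, &out, sizeof(in)) != 0;
}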
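ut_usectime() above returns wall-clock time split into seconds and microseconds (note that the __WIN__ branch reads SYSTEMTIME's wSecond/wMilliseconds fields, which are not Epoch-based seconds or microseconds). A small sketch of the elapsed-time arithmetic such a second/microsecond pair supports, using gettimeofday() directly; the helper name is illustrative only.

#include <stdio.h>
#include <sys/time.h>

/* Microseconds elapsed between two second/microsecond samples. */
static long elapsed_usec(unsigned long sec1, unsigned long usec1,
			 unsigned long sec2, unsigned long usec2)
{
	return (long)(sec2 - sec1) * 1000000L
		+ ((long)usec2 - (long)usec1);
}

int main(void)
{
	struct timeval	start;
	struct timeval	stop;

	gettimeofday(&start, NULL);
	/* ... timed work would go here ... */
	gettimeofday(&stop, NULL);

	printf("elapsed: %ld usec\n",
	       elapsed_usec((unsigned long)start.tv_sec,
			    (unsigned long)start.tv_usec,
			    (unsigned long)stop.tv_sec,
			    (unsigned long)stop.tv_usec));
	return 0;
}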